From 9c760bafb5f98ad07ac5ecb02a86df4c6296351c Mon Sep 17 00:00:00 2001 From: Rodrigo Tobar Date: Tue, 12 Nov 2024 23:37:20 +0800 Subject: [PATCH] Detect and report incorrectly delimited strings Strings in pofiles are always delimited on both ends with double-quotes. The POFile parser in polib didn't check for this, and therefore happily accepted invalid msgstr/msgid/etc, potentially losing some of the contents of the file. In such cases, the first or last character of the string would be lost, as *they* would be considered the string delimiters. This commit adds a check to the POFile parser to ensure strings are always delimited by double quotes on both ends. After adding it, I spotted a couple of offending po file contents in the tests, which have also been fixed. Additionally, a new test has been added to ensure these cases are caught. The new test indeed fails if the new check is removed. This issue was found while investigating an error produced by the "powrap" tool while running it over the po files for the Spanish translation of the CPython documentation. The tool failed to check one of our files because gettext's `msgcat` utility failed to parse the file. Upon closer inspection I realised the error in our pofile, which was caught by gettext but not polib. 
Signed-off-by: Rodrigo Tobar --- polib.py | 8 ++++++++ tests/test_ufeff.po | 3 +-- tests/tests.py | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/polib.py b/polib.py index f45caa2..c07cee6 100644 --- a/polib.py +++ b/polib.py @@ -1376,6 +1376,10 @@ def parse(self): raise IOError('Syntax error in po file %s(line %s): ' 'unescaped double quote found' % (fpath, self.current_line)) + if line[0] != '"' or line[-1] != '"': + raise IOError('Syntax error in po file %s(line %s): ' + 'string not delimited by double quotes' % + (fpath, self.current_line)) self.current_token = line self.process(keywords[tokens[0]]) continue @@ -1394,6 +1398,10 @@ def parse(self): raise IOError('Syntax error in po file %s(line %s): ' 'unescaped double quote found' % (fpath, self.current_line)) + if line[-1] != '"': + raise IOError('Syntax error in po file %s(line %s): ' + 'string not delimited by double quotes' % + (fpath, self.current_line)) self.process('mc') elif line[:7] == 'msgstr[': diff --git a/tests/test_ufeff.po b/tests/test_ufeff.po index e358091..74dbc6d 100644 --- a/tests/test_ufeff.po +++ b/tests/test_ufeff.po @@ -1,8 +1,7 @@ # test for pofile/mofile with ufeff msgid "" msgstr "" -"Project-Id-Version: django -" +"Project-Id-Version: django" msgid "foo" msgstr "bar" diff --git a/tests/tests.py b/tests/tests.py index 1739aea..0f23bc4 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -27,7 +27,7 @@ def test_pofile_and_mofile1(self): data = u('''# test for pofile/mofile with string buffer msgid "" msgstr "" -"Project-Id-Version: django\n" +"Project-Id-Version: django" msgid "foo" msgstr "bar" @@ -116,7 +116,7 @@ def test_ufeff_data_pofile(self): data = u('''\ufeff# test for pofile/mofile with ufeff msgid "" msgstr "" -"Project-Id-Version: django\n" +"Project-Id-Version: django" msgid "foo" msgstr "bar" @@ -258,6 +258,35 @@ def test_unescaped_double_quote4(self): msg = 'Syntax error in po file (line 4): unescaped double quote 
found' self.assertEqual(str(exc), msg) + def test_no_double_quote_delimiters(self): + """ + Test that polib reports an error when a string is not delimited by double quotes. + """ + invalid_msgstr = r''' +msgid "A" +msgstr *B" +''' + invalid_msgid = r''' +msgid "A/ +msgstr "B" +''' + invalid_msgid_plural = r''' +msgid_plural A +msgstr "B" +''' + invalid_msgstr_continuation = r''' +msgid "A" +msgstr "" +"B +''' + for data in (invalid_msgid, invalid_msgid_plural, invalid_msgstr, invalid_msgstr_continuation): + try: + polib.pofile(data) + self.fail("Strings not delimited by double quotes not detected") + except IOError as ex: + msg = 'string not delimited by double quotes' + self.assertIn(msg, str(ex)) + def test_syntax_error1(self): """ Test that syntax error is raised while processing a symbol parsing.