Issue #17156: pygettext.py now uses the encoding of the source file and
correctly writes and escapes non-ASCII characters.

Issue #17156: pygettext.py now uses the encoding of the source file and
correctly writes and escapes non-ASCII characters.
859cd472 · Serhiy Storchaka · 7451a72e · b6ed1734 · 859cd472 · 859cd472
Commit 859cd472 authored Feb 09, 2013 by Serhiy Storchaka
Hide whitespace changes
Inline Side-by-side

Showing with 36 additions and 33 deletions

Misc/NEWS Misc/NEWS +3 -0

Tools/i18n/pygettext.py Tools/i18n/pygettext.py +33 -33

No files found.
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -169,6 +169,9 @@ Core and Builtins
 Library
 -------

+- Issue #17156: pygettext.py now uses an encoding of source file and correctly
+  writes and escapes non-ascii characters.
+
 - Issue #16564: Fixed regression relative to Python2 in the operation of
  email.encoders.encode_noop when used with binary data.


--- a/Tools/i18n/pygettext.py
+++ b/Tools/i18n/pygettext.py
@@ -188,8 +188,8 @@ msgstr ""
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
 "Language-Team: LANGUAGE <LL@li.org>\\n"
 "MIME-Version: 1.0\\n"
-"Content-Type: text/plain; charset=CHARSET\\n"
-"Content-Transfer-Encoding: ENCODING\\n"
+"Content-Type: text/plain; charset=%(charset)s\\n"
+"Content-Transfer-Encoding: %(encoding)s\\n"
 "Generated-By: pygettext.py %(version)s\\n"

 ''')
@@ -203,35 +203,32 @@ def usage(code, msg=''):



-escapes = []
-
-def make_escapes(pass_iso8859):
-    global escapes
-    if pass_iso8859:
-        # Allow iso-8859 characters to pass through so that e.g. 'msgid
+def make_escapes(pass_nonascii):
+    global escapes, escape
+    if pass_nonascii:
+        # Allow non-ascii characters to pass through so that e.g. 'msgid
        # "Höhe"' would not result in 'msgid "H\366he"'.  Otherwise we
        # escape any character outside the 32..126 range.
        mod = 128
+        escape = escape_ascii
    else:
        mod = 256
-    for i in range(256):
-        if 32 <= (i % mod) <= 126:
-            escapes.append(chr(i))
-        else:
-            escapes.append("\\%03o" % i)
-    escapes[ord('\\')] = '\\\\'
-    escapes[ord('\t')] = '\\t'
-    escapes[ord('\r')] = '\\r'
-    escapes[ord('\n')] = '\\n'
-    escapes[ord('\"')] = '\\"'
+        escape = escape_nonascii
+    escapes = [r"\%03o" % i for i in range(mod)]
+    for i in range(32, 127):
+        escapes[i] = chr(i)
+    escapes[ord('\\')] = r'\\'
+    escapes[ord('\t')] = r'\t'
+    escapes[ord('\r')] = r'\r'
+    escapes[ord('\n')] = r'\n'
+    escapes[ord('\"')] = r'\"'
+

+def escape_ascii(s, encoding):
+    return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)

-def escape(s):
-    global escapes
-    s = list(s)
-    for i in range(len(s)):
-        s[i] = escapes[ord(s[i])]
-    return EMPTYSTRING.join(s)
+def escape_nonascii(s, encoding):
+    return ''.join(escapes[b] for b in s.encode(encoding))


 def safe_eval(s):
@@ -239,18 +236,18 @@ def safe_eval(s):
    return eval(s, {'__builtins__':{}}, {})


-def normalize(s):
+def normalize(s, encoding):
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    lines = s.split('\n')
    if len(lines) == 1:
-        s = '"' + escape(s) + '"'
+        s = '"' + escape(s, encoding) + '"'
    else:
        if not lines[-1]:
            del lines[-1]
            lines[-1] = lines[-1] + '\n'
        for i in range(len(lines)):
-            lines[i] = escape(lines[i])
+            lines[i] = escape(lines[i], encoding)
        lineterm = '\\n"\n"'
        s = '""\n"' + lineterm.join(lines) + '"'
    return s
@@ -447,7 +444,10 @@ class TokenEater:
        timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
-        print(pot_header % {'time': timestamp, 'version': __version__}, file=fp)
+        encoding = fp.encoding if fp.encoding else 'UTF-8'
+        print(pot_header % {'time': timestamp, 'version': __version__,
+                            'charset': encoding,
+                            'encoding': '8bit'}, file=fp)
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
@@ -491,7 +491,7 @@ class TokenEater:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
-                print('msgid', normalize(k), file=fp)
+                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)


@@ -587,7 +587,7 @@ def main():
                fp.close()

    # calculate escapes
-    make_escapes(options.escape)
+    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)
@@ -620,17 +620,17 @@ def main():
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
-            fp = sys.stdin
+            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
-            fp = open(filename)
+            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
-                tokens = tokenize.generate_tokens(fp.readline)
+                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e: