Commit b6ed1734 authored by Serhiy Storchaka's avatar Serhiy Storchaka

Issue #17156: pygettext.py now uses an encoding of source file and correctly

writes and escapes non-ascii characters.
parent 041d5533
......@@ -215,6 +215,9 @@ Core and Builtins
Library
-------
- Issue #17156: pygettext.py now uses an encoding of source file and correctly
writes and escapes non-ascii characters.
- Issue #16564: Fixed regression relative to Python2 in the operation of
email.encoders.encode_noop when used with binary data.
......
......@@ -189,8 +189,8 @@ msgstr ""
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"
''')
......@@ -204,35 +204,32 @@ def usage(code, msg=''):
escapes = []
def make_escapes(pass_iso8859):
global escapes
if pass_iso8859:
# Allow iso-8859 characters to pass through so that e.g. 'msgid
def make_escapes(pass_nonascii):
global escapes, escape
if pass_nonascii:
# Allow non-ascii characters to pass through so that e.g. 'msgid
# "Höhe"' would result not result in 'msgid "H\366he"'. Otherwise we
# escape any character outside the 32..126 range.
mod = 128
escape = escape_ascii
else:
mod = 256
for i in range(256):
if 32 <= (i % mod) <= 126:
escapes.append(chr(i))
else:
escapes.append("\\%03o" % i)
escapes[ord('\\')] = '\\\\'
escapes[ord('\t')] = '\\t'
escapes[ord('\r')] = '\\r'
escapes[ord('\n')] = '\\n'
escapes[ord('\"')] = '\\"'
escape = escape_nonascii
escapes = [r"\%03o" % i for i in range(mod)]
for i in range(32, 127):
escapes[i] = chr(i)
escapes[ord('\\')] = r'\\'
escapes[ord('\t')] = r'\t'
escapes[ord('\r')] = r'\r'
escapes[ord('\n')] = r'\n'
escapes[ord('\"')] = r'\"'
def escape_ascii(s, encoding):
return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)
def escape(s):
global escapes
s = list(s)
for i in range(len(s)):
s[i] = escapes[ord(s[i])]
return EMPTYSTRING.join(s)
def escape_nonascii(s, encoding):
return ''.join(escapes[b] for b in s.encode(encoding))
def safe_eval(s):
......@@ -240,18 +237,18 @@ def safe_eval(s):
return eval(s, {'__builtins__':{}}, {})
def normalize(s):
def normalize(s, encoding):
# This converts the various Python string types into a format that is
# appropriate for .po files, namely much closer to C style.
lines = s.split('\n')
if len(lines) == 1:
s = '"' + escape(s) + '"'
s = '"' + escape(s, encoding) + '"'
else:
if not lines[-1]:
del lines[-1]
lines[-1] = lines[-1] + '\n'
for i in range(len(lines)):
lines[i] = escape(lines[i])
lines[i] = escape(lines[i], encoding)
lineterm = '\\n"\n"'
s = '""\n"' + lineterm.join(lines) + '"'
return s
......@@ -448,7 +445,10 @@ class TokenEater:
timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
# The time stamp in the header doesn't have the same format as that
# generated by xgettext...
print(pot_header % {'time': timestamp, 'version': __version__}, file=fp)
encoding = fp.encoding if fp.encoding else 'UTF-8'
print(pot_header % {'time': timestamp, 'version': __version__,
'charset': encoding,
'encoding': '8bit'}, file=fp)
# Sort the entries. First sort each particular entry's keys, then
# sort all the entries by their first item.
reverse = {}
......@@ -492,7 +492,7 @@ class TokenEater:
print(locline, file=fp)
if isdocstring:
print('#, docstring', file=fp)
print('msgid', normalize(k), file=fp)
print('msgid', normalize(k, encoding), file=fp)
print('msgstr ""\n', file=fp)
......@@ -588,7 +588,7 @@ def main():
fp.close()
# calculate escapes
make_escapes(options.escape)
make_escapes(not options.escape)
# calculate all keywords
options.keywords.extend(default_keywords)
......@@ -621,17 +621,17 @@ def main():
if filename == '-':
if options.verbose:
print(_('Reading standard input'))
fp = sys.stdin
fp = sys.stdin.buffer
closep = 0
else:
if options.verbose:
print(_('Working on %s') % filename)
fp = open(filename)
fp = open(filename, 'rb')
closep = 1
try:
eater.set_filename(filename)
try:
tokens = tokenize.generate_tokens(fp.readline)
tokens = tokenize.tokenize(fp.readline)
for _token in tokens:
eater(*_token)
except tokenize.TokenError as e:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment