Commit d3afadaa authored by Benjamin Peterson

normalize latin-1 and utf-8 variant encodings like the builtin tokenizer does

parent ffc08fca
Lib/test/test_tokenize.py

@@ -719,7 +719,7 @@ class TestDetectEncoding(TestCase):
             b'do_something(else)\n'
         )
         encoding, consumed_lines = detect_encoding(self.get_readline(lines))
-        self.assertEquals(encoding, 'latin-1')
+        self.assertEquals(encoding, 'iso-8859-1')
         self.assertEquals(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
 
     def test_matched_bom_and_cookie_first_line(self):
@@ -775,6 +775,34 @@ class TestDetectEncoding(TestCase):
         readline = self.get_readline(lines)
         self.assertRaises(SyntaxError, detect_encoding, readline)
 
+    def test_latin1_normalization(self):
+        # See get_normal_name() in tokenizer.c.
+        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
+                     "iso-8859-1-unix", "iso-latin-1-mac")
+        for encoding in encodings:
+            for rep in ("-", "_"):
+                enc = encoding.replace("-", rep)
+                lines = (b"#!/usr/bin/python\n",
+                         b"# coding: " + enc.encode("ascii") + b"\n",
+                         b"print(things)\n",
+                         b"do_something += 4\n")
+                rl = self.get_readline(lines)
+                found, consumed_lines = detect_encoding(rl)
+                self.assertEquals(found, "iso-8859-1")
+
+    def test_utf8_normalization(self):
+        # See get_normal_name() in tokenizer.c.
+        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
+        for encoding in encodings:
+            for rep in ("-", "_"):
+                enc = encoding.replace("-", rep)
+                lines = (b"#!/usr/bin/python\n",
+                         b"# coding: " + enc.encode("ascii") + b"\n",
+                         b"1 + 3\n")
+                rl = self.get_readline(lines)
+                found, consumed_lines = detect_encoding(rl)
+                self.assertEquals(found, "utf-8")
+
     def test_short_files(self):
         readline = self.get_readline((b'print(something)\n',))
         encoding, consumed_lines = detect_encoding(readline)
...
Lib/tokenize.py

@@ -279,6 +279,17 @@ def untokenize(iterable):
     return out
 
 
+def _get_normal_name(orig_enc):
+    """Imitates get_normal_name in tokenizer.c."""
+    # Only care about the first 12 characters.
+    enc = orig_enc[:12].lower().replace("_", "-")
+    if enc == "utf-8" or enc.startswith("utf-8-"):
+        return "utf-8"
+    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
+       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
+        return "iso-8859-1"
+    return orig_enc
+
 def detect_encoding(readline):
     """
     The detect_encoding() function is used to detect the encoding that should
@@ -313,7 +324,7 @@ def detect_encoding(readline):
         matches = cookie_re.findall(line_string)
         if not matches:
             return None
-        encoding = matches[0]
+        encoding = _get_normal_name(matches[0])
         try:
             codec = lookup(encoding)
         except LookupError:
...
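For illustration, a short sketch of the helper's edge cases. _get_normal_name is a private function, so this demonstrates the logic added above rather than a supported API:

# Illustrative only: exercises the private helper added in this commit.
from tokenize import _get_normal_name

assert _get_normal_name("utf-8-mac") == "utf-8"        # platform suffix folded in
assert _get_normal_name("ISO_8859_1") == "iso-8859-1"  # case and underscores normalized
assert _get_normal_name("ascii") == "ascii"            # unrecognized names pass through
# Only the first 12 characters are examined, mirroring get_normal_name()
# in tokenizer.c:
assert _get_normal_name("iso-8859-1-mac-os") == "iso-8859-1"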
Misc/NEWS

@@ -87,6 +87,9 @@ C-API
 Library
 -------
 
+- Make tokenize.detect_encoding() normalize utf-8 and iso-8859-1 variants like
+  the builtin tokenizer.
+
 - Issue #7048: Force Decimal.logb to round its result when that result
   is too large to fit in the current precision.
...
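A minimal end-to-end sketch of the behavior this entry describes, assuming an interpreter whose tokenize module includes this change:

# Minimal sketch: detect_encoding() now returns canonical names for
# variant spellings in the coding cookie.
from io import BytesIO
from tokenize import detect_encoding

source = b"# -*- coding: LATIN_1 -*-\nprint('hi')\n"
encoding, consumed = detect_encoding(BytesIO(source).readline)
assert encoding == "iso-8859-1"

for cookie in (b"# coding: utf-8-unix\n", b"# coding: UTF_8\n"):
    encoding, _ = detect_encoding(BytesIO(cookie + b"pass\n").readline)
    assert encoding == "utf-8"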