merge 3.2: issue 14629

63c39fe3 · Martin v. Löwis · 7b17a4e1 · 63674f4b · 63c39fe3 · 63c39fe3
Commit 63c39fe3 authored Apr 20, 2012 by Martin v. Löwis
Show whitespace changes
Inline Side-by-side

Showing with 18 additions and 2 deletions

Lib/test/test_tokenize.py Lib/test/test_tokenize.py +10 -0

Lib/tokenize.py Lib/tokenize.py +5 -2

Misc/NEWS Misc/NEWS +3 -0

No files found.
--- a/Lib/test/test_tokenize.py
+++ b/Lib/test/test_tokenize.py
@@ -838,6 +838,16 @@ class TestDetectEncoding(TestCase):
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

+    def test_syntaxerror_latin1(self):
+        # Issue 14629: need to raise SyntaxError if the first
+        # line(s) have non-UTF-8 characters
+        lines = (
+            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
+            )
+        readline = self.get_readline(lines)
+        self.assertRaises(SyntaxError, detect_encoding, readline)
+
+
    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")

--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -364,9 +364,12 @@ def detect_encoding(readline):

    def find_cookie(line):
        try:
-            line_string = line.decode('ascii')
+            # Decode as UTF-8. Either the line is an encoding declaration,
+            # in which case it should be pure ASCII, or it must be valid
+            # UTF-8, which is the default source encoding.
+            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
-            return None
+            raise SyntaxError("invalid or missing encoding declaration")

        matches = cookie_re.findall(line_string)
        if not matches:

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -55,6 +55,9 @@ Core and Builtins
 Library
 -------

+- Issue #14629: Raise SyntaxError in tokenize.detect_encoding if the
+  first two lines have non-UTF-8 characters without an encoding declaration.
+
 - Issue #14308: Fix an exception when a "dummy" thread is in the threading
  module's active list after a fork().