Commit c33f3f23 authored by Brett Cannon

Issue #14629: Mention the filename in SyntaxError exceptions from
tokenize.detect_encoding() (when available).
parent dd9a5695
...@@ -904,6 +904,35 @@ class TestDetectEncoding(TestCase): ...@@ -904,6 +904,35 @@ class TestDetectEncoding(TestCase):
self.assertEqual(fp.encoding, 'utf-8-sig') self.assertEqual(fp.encoding, 'utf-8-sig')
self.assertEqual(fp.mode, 'r') self.assertEqual(fp.mode, 'r')
def test_filename_in_exception(self):
# When possible, include the file name in the exception.
path = 'some_file_path'
lines = (
b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
)
class Bunk:
def __init__(self, lines, path):
self.name = path
self._lines = lines
self._index = 0
def readline(self):
if self._index == len(lines):
raise StopIteration
line = lines[self._index]
self._index += 1
return line
with self.assertRaises(SyntaxError):
ins = Bunk(lines, path)
# Make sure lacking a name isn't an issue.
del ins.name
detect_encoding(ins.readline)
with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
ins = Bunk(lines, path)
detect_encoding(ins.readline)
class TestTokenize(TestCase): class TestTokenize(TestCase):
def test_tokenize(self): def test_tokenize(self):
......
...@@ -353,6 +353,10 @@ def detect_encoding(readline): ...@@ -353,6 +353,10 @@ def detect_encoding(readline):
If no encoding is specified, then the default of 'utf-8' will be returned. If no encoding is specified, then the default of 'utf-8' will be returned.
""" """
try:
filename = readline.__self__.name
except AttributeError:
filename = None
bom_found = False bom_found = False
encoding = None encoding = None
default = 'utf-8' default = 'utf-8'
...@@ -369,7 +373,10 @@ def detect_encoding(readline): ...@@ -369,7 +373,10 @@ def detect_encoding(readline):
# per default encoding. # per default encoding.
line_string = line.decode('utf-8') line_string = line.decode('utf-8')
except UnicodeDecodeError: except UnicodeDecodeError:
raise SyntaxError("invalid or missing encoding declaration") msg = "invalid or missing encoding declaration"
if filename is not None:
msg = '{} for {!r}'.format(msg, filename)
raise SyntaxError(msg)
matches = cookie_re.findall(line_string) matches = cookie_re.findall(line_string)
if not matches: if not matches:
...@@ -379,12 +386,21 @@ def detect_encoding(readline): ...@@ -379,12 +386,21 @@ def detect_encoding(readline):
codec = lookup(encoding) codec = lookup(encoding)
except LookupError: except LookupError:
# This behaviour mimics the Python interpreter # This behaviour mimics the Python interpreter
raise SyntaxError("unknown encoding: " + encoding) if filename is None:
msg = "unknown encoding: " + encoding
else:
msg = "unknown encoding for {!r}: {}".format(filename,
encoding)
raise SyntaxError(msg)
if bom_found: if bom_found:
if codec.name != 'utf-8': if codec.name != 'utf-8':
# This behaviour mimics the Python interpreter # This behaviour mimics the Python interpreter
raise SyntaxError('encoding problem: utf-8') if filename is None:
msg = 'encoding problem: utf-8'
else:
msg = 'encoding problem for {!r}: utf-8'.format(filename)
raise SyntaxError(msg)
encoding += '-sig' encoding += '-sig'
return encoding return encoding
......
...@@ -55,6 +55,9 @@ Core and Builtins ...@@ -55,6 +55,9 @@ Core and Builtins
Library Library
------- -------
- Issue #14629: tokenize.detect_encoding() now includes the filename in the
SyntaxError exception message when it is available as readline.__self__.name.
- Issue #14629: Raise SyntaxError in tokenizer.detect_encoding if the - Issue #14629: Raise SyntaxError in tokenizer.detect_encoding if the
first two lines have non-UTF-8 characters without an encoding declaration. first two lines have non-UTF-8 characters without an encoding declaration.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment