Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
433f32c3
Commit
433f32c3
authored
Dec 12, 2008
by
Benjamin Peterson
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
raise a SyntaxError in detect_encoding() when a codec lookup fails like the builtin parser #4021
parent
e675f08e
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
25 additions
and
13 deletions
+25
-13
Lib/test/test_tokenize.py
Lib/test/test_tokenize.py
+2
-0
Lib/tokenize.py
Lib/tokenize.py
+20
-13
Misc/NEWS
Misc/NEWS
+3
-0
No files found.
Lib/test/test_tokenize.py
View file @
433f32c3
...
...
@@ -795,6 +795,8 @@ class TestDetectEncoding(TestCase):
self
.
assertEquals
(
encoding
,
'utf-8'
)
self
.
assertEquals
(
consumed_lines
,
[])
readline
=
self
.
get_readline
((
b'# coding: bad
\
n
'
,))
self
.
assertRaises
(
SyntaxError
,
detect_encoding
,
readline
)
class
TestTokenize
(
TestCase
):
...
...
Lib/tokenize.py
View file @
433f32c3
...
...
@@ -26,7 +26,7 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
import
re
,
string
,
sys
from
token
import
*
from
codecs
import
lookup
from
codecs
import
lookup
,
BOM_UTF8
from
itertools
import
chain
,
repeat
cookie_re
=
re
.
compile
(
"coding[:=]
\
s*([-
\
w.]+)"
)
...
...
@@ -251,11 +251,11 @@ def detect_encoding(readline):
It detects the encoding from the presence of a utf-8 bom or an encoding
cookie as specified in pep-0263. If both a bom and a cookie are present,
but disagree, a SyntaxError will be raised.
but disagree, a SyntaxError will be raised. If the encoding cookie is an
invalid charset, raise a SyntaxError.
If no encoding is specified, then the default of 'utf-8' will be returned.
"""
utf8_bom
=
b'
\
xef
\
xbb
\
xbf
'
bom_found
=
False
encoding
=
None
def
read_or_stop
():
...
...
@@ -268,18 +268,25 @@ def detect_encoding(readline):
try
:
line_string
=
line
.
decode
(
'ascii'
)
except
UnicodeDecodeError
:
pass
else
:
matches
=
cookie_re
.
findall
(
line_string
)
if
matches
:
encoding
=
matches
[
0
]
if
bom_found
and
lookup
(
encoding
).
name
!=
'utf-8'
:
# This behaviour mimics the Python interpreter
raise
SyntaxError
(
'encoding problem: utf-8'
)
return
encoding
return
None
matches
=
cookie_re
.
findall
(
line_string
)
if
not
matches
:
return
None
encoding
=
matches
[
0
]
try
:
codec
=
lookup
(
encoding
)
except
LookupError
:
# This behaviour mimics the Python interpreter
raise
SyntaxError
(
"unknown encoding: "
+
encoding
)
if
bom_found
and
codec
.
name
!=
'utf-8'
:
# This behaviour mimics the Python interpreter
raise
SyntaxError
(
'encoding problem: utf-8'
)
return
encoding
first
=
read_or_stop
()
if
first
.
startswith
(
utf8_bom
):
if
first
.
startswith
(
BOM_UTF8
):
bom_found
=
True
first
=
first
[
3
:]
if
not
first
:
...
...
Misc/NEWS
View file @
433f32c3
...
...
@@ -45,6 +45,9 @@ Core and Builtins
Library
-------
- Issue #4021: tokenize.detect_encoding() now raises a SyntaxError when the
codec cannot be found. This is for compatibility with the builtin behavior.
- Issue #4084: Fix max, min, max_mag and min_mag Decimal methods to
give correct results in the case where one argument is a quiet NaN
and the other is a finite number that requires rounding.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment