Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
689a5580
Commit
689a5580
authored
Mar 18, 2010
by
Benjamin Peterson
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
in tokenize.detect_encoding(), return utf-8-sig when a BOM is found
parent
8c804273
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
22 additions
and
12 deletions
+22
-12
Doc/library/tokenize.rst
Doc/library/tokenize.rst
+2
-1
Lib/test/test_tokenize.py
Lib/test/test_tokenize.py
+5
-5
Lib/tokenize.py
Lib/tokenize.py
+12
-6
Misc/NEWS
Misc/NEWS
+3
-0
No files found.
Doc/library/tokenize.rst
View file @
689a5580
...
...
@@ -95,7 +95,8 @@ function it uses to do this is available:
It detects the encoding from the presence of a UTF-8 BOM or an encoding
cookie as specified in :pep:`263`. If both a BOM and a cookie are present,
but disagree, a SyntaxError will be raised.
but disagree, a SyntaxError will be raised. Note that if the BOM is found,
``'utf-8-sig'`` will be returned as an encoding.
If no encoding is specified, then the default of ``'utf-8'`` will be returned.
...
...
Lib/test/test_tokenize.py
View file @
689a5580
...
...
@@ -726,7 +726,7 @@ class TestDetectEncoding(TestCase):
b'do_something(else)
\
n
'
)
encoding
,
consumed_lines
=
detect_encoding
(
self
.
get_readline
(
lines
))
self
.
assertEquals
(
encoding
,
'utf-8'
)
self
.
assertEquals
(
encoding
,
'utf-8
-sig
'
)
self
.
assertEquals
(
consumed_lines
,
[
b'# something
\
n
'
,
b'print(something)
\
n
'
])
...
...
@@ -747,7 +747,7 @@ class TestDetectEncoding(TestCase):
b'do_something(else)
\
n
'
)
encoding
,
consumed_lines
=
detect_encoding
(
self
.
get_readline
(
lines
))
self
.
assertEquals
(
encoding
,
'utf-8'
)
self
.
assertEquals
(
encoding
,
'utf-8
-sig
'
)
self
.
assertEquals
(
consumed_lines
,
[
b'# coding=utf-8
\
n
'
])
def
test_mismatched_bom_and_cookie_first_line_raises_syntaxerror
(
self
):
...
...
@@ -779,7 +779,7 @@ class TestDetectEncoding(TestCase):
b'do_something(else)
\
n
'
)
encoding
,
consumed_lines
=
detect_encoding
(
self
.
get_readline
(
lines
))
self
.
assertEquals
(
encoding
,
'utf-8'
)
self
.
assertEquals
(
encoding
,
'utf-8
-sig
'
)
self
.
assertEquals
(
consumed_lines
,
[
b'#! something
\
n
'
,
b'f# coding=utf-8
\
n
'
])
...
...
@@ -833,12 +833,12 @@ class TestDetectEncoding(TestCase):
readline
=
self
.
get_readline
((
b'
\
xef
\
xbb
\
xbf
print(something)
\
n
'
,))
encoding
,
consumed_lines
=
detect_encoding
(
readline
)
self
.
assertEquals
(
encoding
,
'utf-8'
)
self
.
assertEquals
(
encoding
,
'utf-8
-sig
'
)
self
.
assertEquals
(
consumed_lines
,
[
b'print(something)
\
n
'
])
readline
=
self
.
get_readline
((
b'
\
xef
\
xbb
\
xbf
'
,))
encoding
,
consumed_lines
=
detect_encoding
(
readline
)
self
.
assertEquals
(
encoding
,
'utf-8'
)
self
.
assertEquals
(
encoding
,
'utf-8
-sig
'
)
self
.
assertEquals
(
consumed_lines
,
[])
readline
=
self
.
get_readline
((
b'# coding: bad
\
n
'
,))
...
...
Lib/tokenize.py
View file @
689a5580
...
...
@@ -301,14 +301,16 @@ def detect_encoding(readline):
in.
It detects the encoding from the presence of a utf-8 bom or an encoding
cookie as specified in pep-0263. If both a bom and a cookie are present,
but disagree, a SyntaxError will be raised. If the encoding cookie is an
invalid charset, raise a SyntaxError.
cookie as specified in pep-0263. If both a bom and a cookie are present, but
disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
charset, raise a SyntaxError. Note that if a utf-8 bom is found,
'utf-8-sig' is returned.
If no encoding is specified, then the default of 'utf-8' will be returned.
"""
bom_found
=
False
encoding
=
None
default
=
'utf-8'
def
read_or_stop
():
try
:
return
readline
()
...
...
@@ -340,8 +342,9 @@ def detect_encoding(readline):
if
first
.
startswith
(
BOM_UTF8
):
bom_found
=
True
first
=
first
[
3
:]
default
=
'utf-8-sig'
if
not
first
:
return
'utf-8'
,
[]
return
default
,
[]
encoding
=
find_cookie
(
first
)
if
encoding
:
...
...
@@ -349,13 +352,13 @@ def detect_encoding(readline):
second
=
read_or_stop
()
if
not
second
:
return
'utf-8'
,
[
first
]
return
default
,
[
first
]
encoding
=
find_cookie
(
second
)
if
encoding
:
return
encoding
,
[
first
,
second
]
return
'utf-8'
,
[
first
,
second
]
return
default
,
[
first
,
second
]
def
tokenize
(
readline
):
...
...
@@ -394,6 +397,9 @@ def _tokenize(readline, encoding):
indents
=
[
0
]
if
encoding
is
not
None
:
if
encoding
==
"utf-8-sig"
:
# BOM will already have been stripped.
encoding
=
"utf-8"
yield
TokenInfo
(
ENCODING
,
encoding
,
(
0
,
0
),
(
0
,
0
),
''
)
while
True
:
# loop over lines in stream
try
:
...
...
Misc/NEWS
View file @
689a5580
...
...
@@ -283,6 +283,9 @@ C-API
Library
-------
- ``tokenize.detect_encoding`` now returns ``'utf-8-sig'`` when a UTF-8 BOM is
detected.
- Issue #8024: Update the Unicode database to 5.2.
- Issue #6716/2: Backslash-replace error output in compileall.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment