Commit 33856de8 authored by Benjamin Peterson's avatar Benjamin Peterson

handle names starting with non-ascii characters correctly #9712

parent e01de8f2
...@@ -531,6 +531,7 @@ pass the '-ucompiler' option to process the full directory. ...@@ -531,6 +531,7 @@ pass the '-ucompiler' option to process the full directory.
True True
Evil tabs Evil tabs
>>> dump_tokens("def f():\\n\\tif x\\n \\tpass") >>> dump_tokens("def f():\\n\\tif x\\n \\tpass")
ENCODING 'utf-8' (0, 0) (0, 0) ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'def' (1, 0) (1, 3) NAME 'def' (1, 0) (1, 3)
...@@ -547,6 +548,18 @@ Evil tabs ...@@ -547,6 +548,18 @@ Evil tabs
NAME 'pass' (3, 9) (3, 13) NAME 'pass' (3, 9) (3, 13)
DEDENT '' (4, 0) (4, 0) DEDENT '' (4, 0) (4, 0)
DEDENT '' (4, 0) (4, 0) DEDENT '' (4, 0) (4, 0)
Non-ascii identifiers
>>> dump_tokens("Örter = 'places'\\ngrün = 'green'")
ENCODING 'utf-8' (0, 0) (0, 0)
NAME 'Örter' (1, 0) (1, 5)
OP '=' (1, 6) (1, 7)
STRING "'places'" (1, 8) (1, 16)
NEWLINE '\\n' (1, 16) (1, 17)
NAME 'grün' (2, 0) (2, 4)
OP '=' (2, 5) (2, 6)
STRING "'green'" (2, 7) (2, 14)
""" """
from test import support from test import support
......
...@@ -92,7 +92,7 @@ def maybe(*choices): return group(*choices) + '?' ...@@ -92,7 +92,7 @@ def maybe(*choices): return group(*choices) + '?'
Whitespace = r'[ \f\t]*' Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*' Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*' Name = r'\w+'
Hexnumber = r'0[xX][0-9a-fA-F]+' Hexnumber = r'0[xX][0-9a-fA-F]+'
Binnumber = r'0[bB][01]+' Binnumber = r'0[bB][01]+'
...@@ -142,9 +142,12 @@ ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + ...@@ -142,9 +142,12 @@ ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
PseudoExtras = group(r'\\\r?\n', Comment, Triple) PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
def _compile(expr):
return re.compile(expr, re.UNICODE)
tokenprog, pseudoprog, single3prog, double3prog = map( tokenprog, pseudoprog, single3prog, double3prog = map(
re.compile, (Token, PseudoToken, Single3, Double3)) _compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double), endprogs = {"'": _compile(Single), '"': _compile(Double),
"'''": single3prog, '"""': double3prog, "'''": single3prog, '"""': double3prog,
"r'''": single3prog, 'r"""': double3prog, "r'''": single3prog, 'r"""': double3prog,
"b'''": single3prog, 'b"""': double3prog, "b'''": single3prog, 'b"""': double3prog,
...@@ -171,6 +174,8 @@ for t in ("'", '"', ...@@ -171,6 +174,8 @@ for t in ("'", '"',
"bR'", 'bR"', "BR'", 'BR"' ): "bR'", 'bR"', "BR'", 'BR"' ):
single_quoted[t] = t single_quoted[t] = t
del _compile
tabsize = 8 tabsize = 8
class TokenError(Exception): pass class TokenError(Exception): pass
...@@ -393,7 +398,7 @@ def tokenize(readline): ...@@ -393,7 +398,7 @@ def tokenize(readline):
def _tokenize(readline, encoding): def _tokenize(readline, encoding):
lnum = parenlev = continued = 0 lnum = parenlev = continued = 0
namechars, numchars = string.ascii_letters + '_', '0123456789' numchars = '0123456789'
contstr, needcont = '', 0 contstr, needcont = '', 0
contline = None contline = None
indents = [0] indents = [0]
...@@ -520,7 +525,7 @@ def _tokenize(readline, encoding): ...@@ -520,7 +525,7 @@ def _tokenize(readline, encoding):
break break
else: # ordinary string else: # ordinary string
yield TokenInfo(STRING, token, spos, epos, line) yield TokenInfo(STRING, token, spos, epos, line)
elif initial in namechars: # ordinary name elif initial.isidentifier(): # ordinary name
yield TokenInfo(NAME, token, spos, epos, line) yield TokenInfo(NAME, token, spos, epos, line)
elif initial == '\\': # continued stmt elif initial == '\\': # continued stmt
continued = 1 continued = 1
......
...@@ -12,6 +12,8 @@ What's New in Python 3.2 Alpha 2? ...@@ -12,6 +12,8 @@ What's New in Python 3.2 Alpha 2?
Core and Builtins Core and Builtins
----------------- -----------------
- Issue #9712: Fix tokenize on identifiers that start with non-ASCII characters.
- Issue #9688: __basicsize__ and __itemsize__ must be accessed as Py_ssize_t. - Issue #9688: __basicsize__ and __itemsize__ must be accessed as Py_ssize_t.
- Issue #9684: Added a definition for SIZEOF_WCHAR_T to PC/pyconfig.h, - Issue #9684: Added a definition for SIZEOF_WCHAR_T to PC/pyconfig.h,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment