Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
2e4394ee
Commit
2e4394ee
authored
Jul 16, 2014
by
Tal Einat
Browse files
Options
Browse Files
Download
Plain Diff
Issue #21765: Add support for non-ascii identifiers to HyperParser
parents
b5cace89
9b7f9e6c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
213 additions
and
44 deletions
+213
-44
Lib/idlelib/HyperParser.py
Lib/idlelib/HyperParser.py
+75
-18
Lib/idlelib/PyParse.py
Lib/idlelib/PyParse.py
+54
-26
Lib/idlelib/idle_test/test_hyperparser.py
Lib/idlelib/idle_test/test_hyperparser.py
+82
-0
Misc/NEWS
Misc/NEWS
+2
-0
No files found.
Lib/idlelib/HyperParser.py
View file @
2e4394ee
...
...
@@ -6,11 +6,24 @@ the structure of code.
"""
import
string
import
keyword
from
keyword
import
is
keyword
from
idlelib
import
PyParse
class
HyperParser
:
# all ASCII chars that may be in an identifier
_ASCII_ID_CHARS
=
frozenset
(
string
.
ascii_letters
+
string
.
digits
+
"_"
)
# all ASCII chars that may be the first char of an identifier
_ASCII_ID_FIRST_CHARS
=
frozenset
(
string
.
ascii_letters
+
"_"
)
# lookup table for whether 7-bit ASCII chars are valid in a Python identifier
_IS_ASCII_ID_CHAR
=
[(
chr
(
x
)
in
_ASCII_ID_CHARS
)
for
x
in
range
(
128
)]
# lookup table for whether 7-bit ASCII chars are valid as the first
# char in a Python identifier
_IS_ASCII_ID_FIRST_CHAR
=
\
[(
chr
(
x
)
in
_ASCII_ID_FIRST_CHARS
)
for
x
in
range
(
128
)]
class
HyperParser
:
def
__init__
(
self
,
editwin
,
index
):
"To initialize, analyze the surroundings of the given index."
...
...
@@ -143,26 +156,70 @@ class HyperParser:
return
beforeindex
,
afterindex
# Ascii chars that may be in a white space
_whitespace_chars
=
"
\
t
\
n
\
\
"
# Ascii chars that may be in an identifier
_id_chars
=
string
.
ascii_letters
+
string
.
digits
+
"_"
# Ascii chars that may be the first char of an identifier
_id_first_chars
=
string
.
ascii_letters
+
"_"
# Given a string and pos, return the number of chars in the
# identifier which ends at pos, or 0 if there is no such one. Saved
# words are not identifiers.
def
_eat_identifier
(
self
,
str
,
limit
,
pos
):
# the set of built-in identifiers which are also keywords,
# i.e. keyword.iskeyword() returns True for them
_ID_KEYWORDS
=
frozenset
({
"True"
,
"False"
,
"None"
})
@
classmethod
def
_eat_identifier
(
cls
,
str
,
limit
,
pos
):
"""Given a string and pos, return the number of chars in the
identifier which ends at pos, or 0 if there is no such one.
Keywords are not considered identifiers here, except for True, False and None.
"""
is_ascii_id_char
=
_IS_ASCII_ID_CHAR
# Start at the end (pos) and work backwards.
i
=
pos
while
i
>
limit
and
str
[
i
-
1
]
in
self
.
_id_chars
:
# Go backwards as long as the characters are valid ASCII
# identifier characters. This is an optimization, since it
# is faster in the common case where most of the characters
# are ASCII.
while
i
>
limit
and
(
ord
(
str
[
i
-
1
])
<
128
and
is_ascii_id_char
[
ord
(
str
[
i
-
1
])]
):
i
-=
1
if
(
i
<
pos
and
(
str
[
i
]
not
in
self
.
_id_first_chars
or
(
keyword
.
iskeyword
(
str
[
i
:
pos
])
and
str
[
i
:
pos
]
not
in
{
'None'
,
'False'
,
'True'
}))):
i
=
pos
# If the above loop ended due to reaching a non-ASCII
# character, continue going backwards using the most generic
# test for whether a string contains only valid identifier
# characters.
if
i
>
limit
and
ord
(
str
[
i
-
1
])
>=
128
:
while
i
-
4
>=
limit
and
(
'a'
+
str
[
i
-
4
:
pos
]).
isidentifier
():
i
-=
4
if
i
-
2
>=
limit
and
(
'a'
+
str
[
i
-
2
:
pos
]).
isidentifier
():
i
-=
2
if
i
-
1
>=
limit
and
(
'a'
+
str
[
i
-
1
:
pos
]).
isidentifier
():
i
-=
1
# The identifier candidate starts here. If it isn't a valid
# identifier, don't eat anything. At this point that is only
# possible if the first character isn't a valid first
# character for an identifier.
if
not
str
[
i
:
pos
].
isidentifier
():
return
0
elif
i
<
pos
:
# All characters in str[i:pos] are valid ASCII identifier
# characters, so it is enough to check that the first is
# valid as the first character of an identifier.
if
not
_IS_ASCII_ID_FIRST_CHAR
[
ord
(
str
[
i
])]:
return
0
# All keywords are valid identifiers, but should not be
# considered identifiers here, except for True, False and None.
if
i
<
pos
and
(
iskeyword
(
str
[
i
:
pos
])
and
str
[
i
:
pos
]
not
in
cls
.
_ID_KEYWORDS
):
return
0
return
pos
-
i
# This string includes all chars that may be in a white space
_whitespace_chars
=
"
\
t
\
n
\
\
"
def
get_expression
(
self
):
"""Return a string with the Python expression which ends at the
given index, which is empty if there is no real one.
...
...
Lib/idlelib/PyParse.py
View file @
2e4394ee
import
re
import
sys
from
collections
import
Mapping
from
functools
import
partial
# Reason last stmt is continued (or C_NONE if it's not).
(
C_NONE
,
C_BACKSLASH
,
C_STRING_FIRST_LINE
,
...
...
@@ -91,19 +93,48 @@ _chew_ordinaryre = re.compile(r"""
[^[\
](){}#
'"\\]+
"""
,
re
.
VERBOSE
).
match
# Build translation table to map uninteresting chars to "x", open
# brackets to "(", and close brackets to ")".
_tran
=
{}
for
i
in
range
(
256
):
_tran
[
i
]
=
'x'
for
ch
in
"({["
:
_tran
[
ord
(
ch
)]
=
'('
for
ch
in
")}]"
:
_tran
[
ord
(
ch
)]
=
')'
for
ch
in
"
\
"
'
\
\
\
n
#"
:
_tran
[
ord
(
ch
)]
=
ch
del
i
,
ch
class StringTranslatePseudoMapping(Mapping):
    r"""Utility class to be used with str.translate()

    This Mapping class wraps a given dict. When a value for a key is
    requested via __getitem__() or get(), the key is looked up in the
    given dict. If found there, the value from the dict is returned.
    Otherwise, the default value given upon initialization is returned.

    This allows using str.translate() to make some replacements, and to
    replace all characters for which no replacement was specified with
    a given character instead of leaving them as-is.

    Note that len() and iteration only reflect the wrapped dict, and that
    get() ignores its ``default`` argument: the default value given upon
    initialization always takes its place.

    For example, to replace everything except whitespace with 'x':

    >>> whitespace_chars = ' \t\n\r'
    >>> preserve_dict = {ord(c): ord(c) for c in whitespace_chars}
    >>> mapping = StringTranslatePseudoMapping(preserve_dict, ord('x'))
    >>> text = "a + b\tc\nd"
    >>> text.translate(mapping)
    'x x x\tx\nx'
    """

    def __init__(self, non_defaults, default_value):
        """Wrap the dict `non_defaults`; unknown keys map to `default_value`."""
        self._non_defaults = non_defaults
        self._default_value = default_value

        # Bind the dict's get() and the default value as argument defaults
        # so that each lookup avoids attribute access on self.
        def _get(key, _get=non_defaults.get, _default=default_value):
            return _get(key, _default)
        self._get = _get

    def __getitem__(self, item):
        """Return the mapped value for `item`, or the default value."""
        return self._get(item)

    def __len__(self):
        """Return the number of explicitly mapped (non-default) keys."""
        return len(self._non_defaults)

    def __iter__(self):
        """Iterate over the explicitly mapped (non-default) keys."""
        return iter(self._non_defaults)

    def get(self, key, default=None):
        """Return the mapped value for `key`; `default` is ignored."""
        return self._get(key)
class
Parser
:
...
...
@@ -113,19 +144,6 @@ class Parser:
def
set_str
(
self
,
s
):
assert
len
(
s
)
==
0
or
s
[
-
1
]
==
'
\
n
'
if
isinstance
(
s
,
str
):
# The parse functions have no idea what to do with Unicode, so
# replace all Unicode characters with "x". This is "safe"
# so long as the only characters germane to parsing the structure
# of Python are 7-bit ASCII. It's *necessary* because Unicode
# strings don't have a .translate() method that supports
# deletechars.
uniphooey
=
s
s
=
[]
push
=
s
.
append
for
raw
in
map
(
ord
,
uniphooey
):
push
(
raw
<
127
and
chr
(
raw
)
or
"x"
)
s
=
""
.
join
(
s
)
self
.
str
=
s
self
.
study_level
=
0
...
...
@@ -197,6 +215,16 @@ class Parser:
if
lo
>
0
:
self
.
str
=
self
.
str
[
lo
:]
# Build a translation table to map uninteresting chars to 'x', open
# brackets to '(', close brackets to ')' while preserving quotes,
# backslashes, newlines and hashes. This is to be passed to
# str.translate() in _study1().
_tran
=
{}
_tran
.
update
((
ord
(
c
),
ord
(
'('
))
for
c
in
"({["
)
_tran
.
update
((
ord
(
c
),
ord
(
')'
))
for
c
in
")}]"
)
_tran
.
update
((
ord
(
c
),
ord
(
c
))
for
c
in
"
\
"
'
\
\
\
n
#"
)
_tran
=
StringTranslatePseudoMapping
(
_tran
,
default_value
=
ord
(
'x'
))
# As quickly as humanly possible <wink>, find the line numbers (0-
# based) of the non-continuation lines.
# Creates self.{goodlines, continuation}.
...
...
@@ -211,7 +239,7 @@ class Parser:
# uninteresting characters. This can cut the number of chars
# by a factor of 10-40, and so greatly speed the following loop.
str
=
self
.
str
str
=
str
.
translate
(
_tran
)
str
=
str
.
translate
(
self
.
_tran
)
str
=
str
.
replace
(
'xxxxxxxx'
,
'x'
)
str
=
str
.
replace
(
'xxxx'
,
'x'
)
str
=
str
.
replace
(
'xx'
,
'x'
)
...
...
Lib/idlelib/idle_test/test_hyperparser.py
View file @
2e4394ee
...
...
@@ -30,6 +30,7 @@ class HyperParserTest(unittest.TestCase):
"z = ((r'asdf')+('a')))
\
n
"
'[x for x in
\
n
'
'for = False
\
n
'
'cliché = "this is a string with unicode, what a cliché"'
)
@
classmethod
...
...
@@ -93,6 +94,8 @@ class HyperParserTest(unittest.TestCase):
self
.
assertTrue
(
p
.
is_in_string
())
p
=
get
(
'4.6'
)
self
.
assertTrue
(
p
.
is_in_string
())
p
=
get
(
'12.54'
)
self
.
assertTrue
(
p
.
is_in_string
())
def
test_is_in_code
(
self
):
get
=
self
.
get_parser
...
...
@@ -180,12 +183,91 @@ class HyperParserTest(unittest.TestCase):
p
=
get
(
'10.0'
)
self
.
assertEqual
(
p
.
get_expression
(),
''
)
p
=
get
(
'10.6'
)
self
.
assertEqual
(
p
.
get_expression
(),
''
)
p
=
get
(
'10.11'
)
self
.
assertEqual
(
p
.
get_expression
(),
''
)
p
=
get
(
'11.3'
)
self
.
assertEqual
(
p
.
get_expression
(),
''
)
p
=
get
(
'11.11'
)
self
.
assertEqual
(
p
.
get_expression
(),
'False'
)
p
=
get
(
'12.6'
)
self
.
assertEqual
(
p
.
get_expression
(),
'cliché'
)
def test_eat_identifier(self):
    """Check which candidate strings HyperParser._eat_identifier() eats.

    Most cases pass the whole candidate (limit 0, pos at its end) and
    expect it to be eaten either entirely or not at all.
    """
    def is_valid_id(candidate):
        # True if the whole candidate was eaten, False if nothing was;
        # any partial result is unexpected and reported as an error.
        result = HyperParser._eat_identifier(candidate, 0, len(candidate))
        if result == len(candidate):
            return True
        elif result == 0:
            return False
        else:
            err_msg = "Unexpected result: {} (expected 0 or {})".format(
                result, len(candidate)
            )
            raise Exception(err_msg)

    # invalid first character which is valid elsewhere in an identifier
    self.assertFalse(is_valid_id('2notid'))

    # ASCII-only valid identifiers
    self.assertTrue(is_valid_id('valid_id'))
    self.assertTrue(is_valid_id('_valid_id'))
    self.assertTrue(is_valid_id('valid_id_'))
    self.assertTrue(is_valid_id('_2valid_id'))

    # keywords which should be "eaten"
    self.assertTrue(is_valid_id('True'))
    self.assertTrue(is_valid_id('False'))
    self.assertTrue(is_valid_id('None'))

    # keywords which should not be "eaten"
    self.assertFalse(is_valid_id('for'))
    self.assertFalse(is_valid_id('import'))
    self.assertFalse(is_valid_id('return'))

    # valid unicode identifiers
    self.assertTrue(is_valid_id('cliche'))
    self.assertTrue(is_valid_id('cliché'))
    self.assertTrue(is_valid_id('a٢'))

    # invalid unicode identifiers
    self.assertFalse(is_valid_id('2a'))
    self.assertFalse(is_valid_id('٢a'))
    self.assertFalse(is_valid_id('a²'))

    # valid identifier after "punctuation"
    self.assertEqual(HyperParser._eat_identifier('+ var', 0, 5), len('var'))
    self.assertEqual(HyperParser._eat_identifier('+var', 0, 4), len('var'))
    self.assertEqual(HyperParser._eat_identifier('.var', 0, 4), len('var'))

    # invalid identifiers
    self.assertFalse(is_valid_id('+'))
    self.assertFalse(is_valid_id(' '))
    self.assertFalse(is_valid_id(':'))
    self.assertFalse(is_valid_id('?'))
    self.assertFalse(is_valid_id('^'))
    self.assertFalse(is_valid_id('\\'))
    self.assertFalse(is_valid_id('"'))
    self.assertFalse(is_valid_id('"a string"'))
def test_eat_identifier_various_lengths(self):
    """Exercise _eat_identifier() on candidates of every length from 1 to 20.

    Each candidate is passed whole (limit 0, pos at its end); it must be
    eaten entirely when it is an identifier and not at all otherwise.
    """
    eat = HyperParser._eat_identifier
    for size in range(1, 21):
        rest = size - 1
        # Candidates that are identifiers: the full length is eaten.
        eaten_whole = (
            'a' * size,
            'é' * size,
            'a' + '2' * rest,
            'é' + '2' * rest,
            'é' + 'a' * rest,
            'é' * rest + 'a',
        )
        # Candidates that are not identifiers: nothing is eaten.
        not_eaten = (
            '+' * size,
            '2' + 'a' * rest,
            '2' + 'é' * rest,
        )
        for text in eaten_whole:
            self.assertEqual(eat(text, 0, size), size)
        for text in not_eaten:
            self.assertEqual(eat(text, 0, size), 0)
# When run as a script, execute this module's tests with verbose output.
if __name__ == '__main__':
    unittest.main(verbosity=2)
Misc/NEWS
View file @
2e4394ee
...
...
@@ -264,6 +264,8 @@ Library
-
Issue
#
21455
:
Add
a
default
backlog
to
socket
.
listen
().
-
Issue
#
21525
:
Most
Tkinter
methods
which
accepted
tuples
now
accept
lists
too
.
-
Issue
#
21765
:
Add
support
for
non
-
ascii
identifiers
to
HyperParser
.
-
Issue
#
10744
:
Fix
PEP
3118
format
strings
on
ctypes
objects
with
a
nontrivial
shape
.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment