Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
C
cpython
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
cpython
Commits
00c7f852
Commit
00c7f852
authored
Jan 19, 2012
by
Meador Inge
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Issue #2134: Add support for tokenize.TokenInfo.exact_type.
parent
3f67ec1a
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
187 additions
and
3 deletions
+187
-3
Doc/library/tokenize.rst
Doc/library/tokenize.rst
+52
-1
Lib/test/test_tokenize.py
Lib/test/test_tokenize.py
+74
-1
Lib/tokenize.py
Lib/tokenize.py
+58
-1
Misc/NEWS
Misc/NEWS
+3
-0
No files found.
Doc/library/tokenize.rst
View file @
00c7f852
...
...
@@ -15,6 +15,11 @@ implemented in Python. The scanner in this module returns comments as tokens
as well, making it useful for implementing "pretty-printers," including
colorizers for on-screen displays.
To simplify token stream handling, all :ref:`operators` and :ref:`delimiters`
tokens are returned using the generic :data:`token.OP` token type. The exact
type can be determined by checking the ``exact_type`` property on the
:term:`named tuple` returned from :func:`tokenize.tokenize`.
Tokenizing Input
----------------
...
...
@@ -36,9 +41,17 @@ The primary entry point is a :term:`generator`:
returned as a :term:`named tuple` with the field names:
``type string start end line``.
The returned :term:`named tuple` has an additional property named
``exact_type`` that contains the exact operator type for
:data:`token.OP` tokens. For all other token types ``exact_type``
equals the named tuple ``type`` field.
.. versionchanged:: 3.1
Added support for named tuples.
.. versionchanged:: 3.3
Added support for ``exact_type``.
:func:`tokenize` determines the source encoding of the file by looking for a
UTF-8 BOM or encoding cookie, according to :pep:`263`.
...
...
@@ -131,7 +144,19 @@ It is as simple as:
.. code-block:: sh
python -m tokenize [filename.py]
python -m tokenize [-e] [filename.py]
The following options are accepted:
.. program:: tokenize
.. cmdoption:: -h, --help
show this help message and exit
.. cmdoption:: -e, --exact
display token names using the exact type
If :file:`filename.py` is specified, its contents are tokenized to stdout.
Otherwise, tokenization is performed on stdin.
...
...
@@ -215,3 +240,29 @@ the name of the token, and the final column is the value of the token (if any)
4,10-4,11: OP ')'
4,11-4,12: NEWLINE '\n'
5,0-5,0: ENDMARKER ''
The exact token type names can be displayed using the ``-e`` option:
.. code-block:: sh
$ python -m tokenize -e hello.py
0,0-0,0: ENCODING 'utf-8'
1,0-1,3: NAME 'def'
1,4-1,13: NAME 'say_hello'
1,13-1,14: LPAR '('
1,14-1,15: RPAR ')'
1,15-1,16: COLON ':'
1,16-1,17: NEWLINE '\n'
2,0-2,4: INDENT ' '
2,4-2,9: NAME 'print'
2,9-2,10: LPAR '('
2,10-2,25: STRING '"Hello, World!"'
2,25-2,26: RPAR ')'
2,26-2,27: NEWLINE '\n'
3,0-3,1: NL '\n'
4,0-4,0: DEDENT ''
4,0-4,9: NAME 'say_hello'
4,9-4,10: LPAR '('
4,10-4,11: RPAR ')'
4,11-4,12: NEWLINE '\n'
5,0-5,0: ENDMARKER ''
Lib/test/test_tokenize.py
View file @
00c7f852
...
...
@@ -567,11 +567,12 @@ Non-ascii identifiers
from
test
import
support
from
tokenize
import
(
tokenize
,
_tokenize
,
untokenize
,
NUMBER
,
NAME
,
OP
,
STRING
,
ENDMARKER
,
tok_name
,
detect_encoding
,
STRING
,
ENDMARKER
,
ENCODING
,
tok_name
,
detect_encoding
,
open
as
tokenize_open
)
from
io
import
BytesIO
from
unittest
import
TestCase
import
os
,
sys
,
glob
import
token
def
dump_tokens
(
s
):
"""Print out the tokens in s in a table format.
...
...
@@ -922,6 +923,78 @@ class TestTokenize(TestCase):
self
.
assertTrue
(
encoding_used
,
encoding
)
def assertExactTypeEqual(self, opstr, *optypes):
    """Check the ``exact_type`` of every token produced for *opstr*.

    *opstr* is encoded as UTF-8 and run through ``tokenize``.  The
    resulting stream is expected to hold an ENCODING token, then one
    token per entry of *optypes* (in order), then an ENDMARKER token.
    Types are compared through their ``token.tok_name`` names.
    """
    source = BytesIO(opstr.encode('utf-8'))
    tokens = list(tokenize(source.readline))
    num_optypes = len(optypes)
    name_of = token.tok_name

    # Leading ENCODING + trailing ENDMARKER account for the extra two.
    self.assertEqual(len(tokens), 2 + num_optypes)
    self.assertEqual(name_of[tokens[0].exact_type],
                     name_of[ENCODING])
    for position, expected in enumerate(optypes, start=1):
        self.assertEqual(name_of[tokens[position].exact_type],
                         name_of[expected])
    self.assertEqual(name_of[tokens[1 + num_optypes].exact_type],
                     name_of[token.ENDMARKER])
def test_exact_type(self):
    """Verify ``exact_type`` for each operator/delimiter and a few expressions.

    Every call hands a source snippet plus the expected exact token types
    to ``assertExactTypeEqual``.
    """
    # Brackets and single-character delimiters/operators.
    self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
    self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
    self.assertExactTypeEqual(':', token.COLON)
    self.assertExactTypeEqual(',', token.COMMA)
    self.assertExactTypeEqual(';', token.SEMI)
    self.assertExactTypeEqual('+', token.PLUS)
    self.assertExactTypeEqual('-', token.MINUS)
    self.assertExactTypeEqual('*', token.STAR)
    self.assertExactTypeEqual('/', token.SLASH)
    self.assertExactTypeEqual('|', token.VBAR)
    self.assertExactTypeEqual('&', token.AMPER)
    self.assertExactTypeEqual('<', token.LESS)
    self.assertExactTypeEqual('>', token.GREATER)
    self.assertExactTypeEqual('=', token.EQUAL)
    self.assertExactTypeEqual('.', token.DOT)
    self.assertExactTypeEqual('%', token.PERCENT)
    self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)

    # Comparison, bitwise and shift operators.
    self.assertExactTypeEqual('==', token.EQEQUAL)
    self.assertExactTypeEqual('!=', token.NOTEQUAL)
    self.assertExactTypeEqual('<=', token.LESSEQUAL)
    self.assertExactTypeEqual('>=', token.GREATEREQUAL)
    self.assertExactTypeEqual('~', token.TILDE)
    self.assertExactTypeEqual('^', token.CIRCUMFLEX)
    self.assertExactTypeEqual('<<', token.LEFTSHIFT)
    self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
    self.assertExactTypeEqual('**', token.DOUBLESTAR)

    # Augmented assignment operators.
    self.assertExactTypeEqual('+=', token.PLUSEQUAL)
    self.assertExactTypeEqual('-=', token.MINEQUAL)
    self.assertExactTypeEqual('*=', token.STAREQUAL)
    self.assertExactTypeEqual('/=', token.SLASHEQUAL)
    self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
    self.assertExactTypeEqual('&=', token.AMPEREQUAL)
    self.assertExactTypeEqual('|=', token.VBAREQUAL)
    # The original asserted '^=' twice in a row; once is sufficient.
    self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
    self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
    self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
    self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
    self.assertExactTypeEqual('//', token.DOUBLESLASH)
    self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
    self.assertExactTypeEqual('@', token.AT)

    # Whole expressions: non-operator tokens keep their ordinary type.
    self.assertExactTypeEqual('a**2+b**2==c**2',
                              token.NAME, token.DOUBLESTAR, token.NUMBER,
                              token.PLUS,
                              token.NAME, token.DOUBLESTAR, token.NUMBER,
                              token.EQEQUAL,
                              token.NAME, token.DOUBLESTAR, token.NUMBER)
    self.assertExactTypeEqual('{1, 2, 3}',
                              token.LBRACE,
                              token.NUMBER, token.COMMA,
                              token.NUMBER, token.COMMA,
                              token.NUMBER,
                              token.RBRACE)
    self.assertExactTypeEqual('^(x & 0x1)',
                              token.CIRCUMFLEX,
                              token.LPAR,
                              token.NAME, token.AMPER, token.NUMBER,
                              token.RPAR)
__test__
=
{
"doctests"
:
doctests
,
'decistmt'
:
decistmt
}
...
...
Lib/tokenize.py
View file @
00c7f852
...
...
@@ -45,6 +45,51 @@ tok_name[NL] = 'NL'
ENCODING
=
N_TOKENS
+
2
tok_name
[
ENCODING
]
=
'ENCODING'
N_TOKENS
+=
3
# Maps the source text of an operator or delimiter to its specific token
# type.  TokenInfo.exact_type looks OP tokens up in this table and falls
# back to the generic type for any string that is not listed here.
EXACT_TOKEN_TYPES = {
    '(':   LPAR,
    ')':   RPAR,
    '[':   LSQB,
    ']':   RSQB,
    ':':   COLON,
    ',':   COMMA,
    ';':   SEMI,
    '+':   PLUS,
    '-':   MINUS,
    '*':   STAR,
    '/':   SLASH,
    '|':   VBAR,
    '&':   AMPER,
    '<':   LESS,
    '>':   GREATER,
    '=':   EQUAL,
    '.':   DOT,
    '%':   PERCENT,
    '{':   LBRACE,
    '}':   RBRACE,
    '==':  EQEQUAL,
    '!=':  NOTEQUAL,
    '<=':  LESSEQUAL,
    '>=':  GREATEREQUAL,
    '~':   TILDE,
    '^':   CIRCUMFLEX,
    '<<':  LEFTSHIFT,
    '>>':  RIGHTSHIFT,
    '**':  DOUBLESTAR,
    '+=':  PLUSEQUAL,
    '-=':  MINEQUAL,
    '*=':  STAREQUAL,
    '/=':  SLASHEQUAL,
    '%=':  PERCENTEQUAL,
    '&=':  AMPEREQUAL,
    '|=':  VBAREQUAL,
    '^=':  CIRCUMFLEXEQUAL,
    '<<=': LEFTSHIFTEQUAL,
    '>>=': RIGHTSHIFTEQUAL,
    '**=': DOUBLESTAREQUAL,
    '//':  DOUBLESLASH,
    '//=': DOUBLESLASHEQUAL,
    '@':   AT,
    # '->' and '...' have dedicated types in the token module as well;
    # without these entries exact_type would report plain OP for them.
    '->':  RARROW,
    '...': ELLIPSIS,
}
class
TokenInfo
(
collections
.
namedtuple
(
'TokenInfo'
,
'type string start end line'
)):
def
__repr__
(
self
):
...
...
@@ -52,6 +97,13 @@ class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line'
return
(
'TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)'
%
self
.
_replace
(
type
=
annotated_type
))
@property
def exact_type(self):
    """Return the precise token type of this token.

    For an OP token whose text is listed in EXACT_TOKEN_TYPES, the
    mapped operator type is returned.  Any other token (including an OP
    token with unlisted text) yields its ordinary ``type`` field.
    """
    if self.type != OP:
        return self.type
    # Unlisted operator text falls back to the generic type.
    return EXACT_TOKEN_TYPES.get(self.string, self.type)
def group(*choices):
    """Join *choices* with '|' and wrap the result in parentheses.

    Presumably used to build regular-expression alternations; the
    callers are outside this view.
    """
    alternatives = '|'.join(choices)
    return '(%s)' % alternatives
def any(*choices):
    """Return ``group(*choices)`` with a trailing '*' appended."""
    grouped = group(*choices)
    return '%s*' % grouped
def maybe(*choices):
    """Return ``group(*choices)`` with a trailing '?' appended."""
    grouped = group(*choices)
    return '%s?' % grouped
...
...
@@ -549,6 +601,8 @@ def main():
parser
.
add_argument
(
dest
=
'filename'
,
nargs
=
'?'
,
metavar
=
'filename.py'
,
help
=
'the file to tokenize; defaults to stdin'
)
parser
.
add_argument
(
'-e'
,
'--exact'
,
dest
=
'exact'
,
action
=
'store_true'
,
help
=
'display token names using the exact type'
)
args
=
parser
.
parse_args
()
try
:
...
...
@@ -563,9 +617,12 @@ def main():
# Output the tokenization
for
token
in
tokens
:
token_type
=
token
.
type
if
args
.
exact
:
token_type
=
token
.
exact_type
token_range
=
"%d,%d-%d,%d:"
%
(
token
.
start
+
token
.
end
)
print
(
"%-20s%-15s%-15r"
%
(
token_range
,
tok_name
[
token
.
type
],
token
.
string
))
(
token_range
,
tok_name
[
token
_
type
],
token
.
string
))
except
IndentationError
as
err
:
line
,
column
=
err
.
args
[
1
][
1
:
3
]
error
(
err
.
args
[
0
],
filename
,
(
line
,
column
))
...
...
Misc/NEWS
View file @
00c7f852
...
...
@@ -450,6 +450,9 @@ Core and Builtins
Library
-------
-
Issue
#
2134
:
A
new
attribute
that
specifies
the
exact
type
of
token
.
OP
tokens
has
been
added
to
tokenize
.
TokenInfo
.
-
Issue
#
13722
:
Avoid
silencing
ImportErrors
when
initializing
the
codecs
registry
.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment