Commit a48db399, authored Apr 29, 2009 by Raymond Hettinger
Issue #5857: tokenize.tokenize() now returns named tuples.
parent c1edc2d6

Showing 2 changed files with 28 additions and 20 deletions:
    Doc/library/tokenize.rst  (+6, -1)
    Lib/tokenize.py           (+22, -19)
Doc/library/tokenize.rst

@@ -27,7 +27,12 @@ The primary entry point is a :term:`generator`:
    column where the token begins in the source; a 2-tuple ``(erow, ecol)`` of
    ints specifying the row and column where the token ends in the source; and
    the line on which the token was found. The line passed (the last tuple item)
-   is the *logical* line; continuation lines are included.
+   is the *logical* line; continuation lines are included. The 5 tuple is
+   returned as a :term:`named tuple` with the field names:
+   ``type string start end line``.
+
+   .. versionchanged:: 3.1
+      Added support for named tuples.

    :func:`tokenize` determines the source encoding of the file by looking for a
    UTF-8 BOM or encoding cookie, according to :pep:`263`.
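For context, a minimal sketch of the documented behaviour on a toy input (the sample source and variable names below are illustrative, not part of the commit):

    import io
    import tokenize

    source = io.BytesIO(b"x = 1\n")
    for tok in tokenize.tokenize(source.readline):
        # Each 5-tuple is now a named tuple, so fields can be read by name.
        print(tok.type, tok.string, tok.start, tok.end)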
Lib/tokenize.py

@@ -24,6 +24,7 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
                'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
                'Michael Foord')
+import collections
 import re, string, sys
 from token import *
 from codecs import lookup, BOM_UTF8
@@ -31,7 +32,7 @@ cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
 import token
 __all__ = [x for x in dir(token) if x[0] != '_'] + ["COMMENT", "tokenize",
-           "detect_encoding", "NL", "untokenize", "ENCODING"]
+           "detect_encoding", "NL", "untokenize", "ENCODING", "Tokenize"]
 del token
 COMMENT = N_TOKENS
@@ -42,6 +43,8 @@ ENCODING = N_TOKENS + 2
 tok_name[ENCODING] = 'ENCODING'
 N_TOKENS += 3

+TokenInfo = collections.namedtuple('TokenInfo', 'type string start end line')
+
 def group(*choices): return '(' + '|'.join(choices) + ')'
 def any(*choices): return group(*choices) + '*'
 def maybe(*choices): return group(*choices) + '?'
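For readers unfamiliar with collections.namedtuple, a small standalone sketch of what this one-line definition provides (the sample token values are illustrative only):

    import collections

    TokenInfo = collections.namedtuple('TokenInfo', 'type string start end line')

    tok = TokenInfo(type=1, string='x', start=(1, 0), end=(1, 1), line='x = 1\n')
    assert tok.string == 'x'          # new: access by field name
    assert tok[1] == 'x'              # a named tuple is still an ordinary tuple
    t, s, start, end, line = tok      # positional unpacking keeps working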
@@ -346,7 +349,7 @@ def _tokenize(readline, encoding):
     indents = [0]

     if encoding is not None:
-        yield (ENCODING, encoding, (0, 0), (0, 0), '')
+        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
     while True:                                # loop over lines in stream
         try:
             line = readline()
@@ -364,12 +367,12 @@ def _tokenize(readline, encoding):
             endmatch = endprog.match(line)
             if endmatch:
                 pos = end = endmatch.end(0)
-                yield (STRING, contstr + line[:end],
+                yield TokenInfo(STRING, contstr + line[:end],
                        strstart, (lnum, end), contline + line)
                 contstr, needcont = '', 0
                 contline = None
             elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
-                yield (ERRORTOKEN, contstr + line,
+                yield TokenInfo(ERRORTOKEN, contstr + line,
                            strstart, (lnum, len(line)), contline)
                 contstr = ''
                 contline = None
@@ -394,25 +397,25 @@ def _tokenize(readline, encoding):
                 if line[pos] == '#':
                     comment_token = line[pos:].rstrip('\r\n')
                     nl_pos = pos + len(comment_token)
-                    yield (COMMENT, comment_token,
+                    yield TokenInfo(COMMENT, comment_token,
                            (lnum, pos), (lnum, pos + len(comment_token)), line)
-                    yield (NL, line[nl_pos:],
+                    yield TokenInfo(NL, line[nl_pos:],
                            (lnum, nl_pos), (lnum, len(line)), line)
                 else:
-                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
+                    yield TokenInfo((NL, COMMENT)[line[pos] == '#'], line[pos:],
                            (lnum, pos), (lnum, len(line)), line)
                 continue

             if column > indents[-1]:           # count indents or dedents
                 indents.append(column)
-                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
+                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
             while column < indents[-1]:
                 if column not in indents:
                     raise IndentationError(
                         "unindent does not match any outer indentation level",
                         ("<tokenize>", lnum, pos, line))
                 indents = indents[:-1]
-                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)
+                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

         else:                                  # continued statement
             if not line:
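As an aside, the INDENT/DEDENT yields changed above are easy to observe from the public API; a hedged sketch on a made-up two-line input:

    import io
    import tokenize

    src = io.BytesIO(b"if x:\n    y = 1\n")
    for tok in tokenize.tokenize(src.readline):
        if tok.type in (tokenize.INDENT, tokenize.DEDENT):
            # Named fields make the positions self-describing.
            print(tokenize.tok_name[tok.type], tok.start, tok.end)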
@@ -428,20 +431,20 @@ def _tokenize(readline, encoding):
                 if (initial in numchars or     # ordinary number
                     (initial == '.' and token != '.' and token != '...')):
-                    yield (NUMBER, token, spos, epos, line)
+                    yield TokenInfo(NUMBER, token, spos, epos, line)
                 elif initial in '\r\n':
-                    yield (NL if parenlev > 0 else NEWLINE,
+                    yield TokenInfo(NL if parenlev > 0 else NEWLINE,
                            token, spos, epos, line)
                 elif initial == '#':
                     assert not token.endswith("\n")
-                    yield (COMMENT, token, spos, epos, line)
+                    yield TokenInfo(COMMENT, token, spos, epos, line)
                 elif token in triple_quoted:
                     endprog = endprogs[token]
                     endmatch = endprog.match(line, pos)
                     if endmatch:                           # all on one line
                         pos = endmatch.end(0)
                         token = line[start:pos]
-                        yield (STRING, token, spos, (lnum, pos), line)
+                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                     else:
                         strstart = (lnum, start)           # multiple lines
                         contstr = line[start:]
@@ -458,23 +461,23 @@ def _tokenize(readline, encoding):
                         contline = line
                         break
                     else:                                  # ordinary string
-                        yield (STRING, token, spos, epos, line)
+                        yield TokenInfo(STRING, token, spos, epos, line)
                 elif initial in namechars:                 # ordinary name
-                    yield (NAME, token, spos, epos, line)
+                    yield TokenInfo(NAME, token, spos, epos, line)
                 elif initial == '\\':                      # continued stmt
                     continued = 1
                 else:
                     if initial in '([{': parenlev = parenlev + 1
                     elif initial in ')]}': parenlev = parenlev - 1
-                    yield (OP, token, spos, epos, line)
+                    yield TokenInfo(OP, token, spos, epos, line)
             else:
-                yield (ERRORTOKEN, line[pos],
+                yield TokenInfo(ERRORTOKEN, line[pos],
                            (lnum, pos), (lnum, pos+1), line)
                 pos = pos + 1

     for indent in indents[1:]:                 # pop remaining indent levels
-        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
+        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
-    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
+    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')

 # An undocumented, backwards compatible, API for all the places in the standard
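Because the named tuple is still a real tuple, callers written against the old 5-tuple API keep working unchanged; a brief sketch under the same illustrative-input assumption as above:

    import io
    import tokenize

    source = io.BytesIO(b"answer = 42\n")
    for tok in tokenize.tokenize(source.readline):
        toknum, tokval, start, end, line = tok                # pre-3.1 style still works
        print(tokenize.tok_name[tok.type], repr(tok.string))  # new named-field style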