tokenize.py 16.2 KB
Newer Older
1 2
"""Tokenization help for Python programs.

3
generate_tokens(readline) is a generator that breaks a stream of
4
text into Python tokens.  It accepts a readline-like method which is called
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
24

25
__author__ = 'Ka-Ping Yee <ping@lfw.org>'
Benjamin Peterson's avatar
Benjamin Peterson committed
26 27
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')
28

29
import string, re
Guido van Rossum's avatar
Guido van Rossum committed
30
from token import *
Guido van Rossum's avatar
Guido van Rossum committed
31

32
import token
Benjamin Peterson's avatar
Benjamin Peterson committed
33 34
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
35
del x
36 37
del token

38 39
COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
40 41
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
42
N_TOKENS += 2
43

Eric S. Raymond's avatar
Eric S. Raymond committed
44
def group(*choices):
    """Join the alternatives into a single parenthesized regex group."""
    return '(%s)' % '|'.join(choices)

def any(*choices):
    """A group of alternatives repeated zero or more times."""
    return group(*choices) + '*'

def maybe(*choices):
    """An optional (zero-or-one) group of alternatives."""
    return group(*choices) + '?'
Guido van Rossum's avatar
Guido van Rossum committed
47

48 49 50 51
# Regular-expression fragments describing each lexical class.  These are
# composed with group()/any()/maybe() and compiled below.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Optional whitespace, backslash-continuations, then an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

# Integer literals, with the Python 2 optional 'l'/'L' long suffix.
Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
# Floating-point and imaginary literals.
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# Pre-compiled patterns used by generate_tokens().
tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
# Map from a string's opening quote (with prefix) to the regex that matches
# the rest of that string; None entries mark bare prefixes with no quote.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

# Sets (as dicts, for fast membership tests) of every legal triple-quote
# and single-quote opener, in all prefix/case combinations.
triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

# Tab stops every 8 columns, matching the C tokenizer's default.
tabsize = 8
144

145 146 147
class TokenError(Exception):
    """Raised when EOF is hit inside an unterminated multi-line construct."""

class StopTokenizing(Exception):
    """May be raised by a tokeneater callback to end tokenize() early."""
148

149 150 151
def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    # Default tokeneater for tokenize(): prints each token as
    # "srow,scol-erow,ecol: <TYPE> <repr of token text>".
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))
Guido van Rossum's avatar
Guido van Rossum committed
154

155
def tokenize(readline, tokeneater=printtoken):
    """Tokenize a stream, reporting each token through a callback.

    readline must behave like the readline() method of a built-in file
    object: every call returns the next line of input as a string ("" at
    EOF).  tokeneater is called once per token with the five values that
    make up a generate_tokens() tuple.  If the callback raises
    StopTokenizing, tokenization stops silently.
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

173
# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Feed every tuple from generate_tokens(readline) to tokeneater."""
    for tok in generate_tokens(readline):
        tokeneater(*tok)
177

178 179 180 181 182 183 184 185 186
class Untokenizer:
    """Rebuild source text from a token stream.

    Tokens given as 5-tuples are placed back at their recorded positions;
    once a bare (type, string) 2-tuple is seen, the remainder of the stream
    is rendered heuristically by compat().
    """

    def __init__(self):
        self.tokens = []     # accumulated output fragments
        self.prev_row = 1    # row where the previous token ended
        self.prev_col = 0    # column where the previous token ended

    def add_whitespace(self, start):
        # Emit spaces so the next token begins at column start[1].
        row, col = start
        # NOTE(review): the comparison direction looks inverted (one would
        # expect row >= self.prev_row here); it matches the historical
        # stdlib source, so confirm against upstream before changing.
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        # Position-exact mode.  Falls back to compat() for the rest of the
        # stream as soon as a 2-tuple (no position info) is encountered.
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        # Heuristic mode for (type, string) pairs: no positions, so insert
        # single spaces after names/numbers and replay INDENT strings at
        # the start of each line.  `token` is the first 2-tuple already
        # consumed by untokenize(); `iterable` yields the rest.
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        prevstring = False
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)
242 243 244 245 246

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element of iterable must be a token sequence with at least two
    members, a token number and token value.  If only two-element tokens
    are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    return Untokenizer().untokenize(iterable)
263

264
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    # Tokenizer state, carried across physical lines:
    #   lnum      - current line number
    #   parenlev  - nesting depth of (), [], {} (newlines inside are NL)
    #   continued - previous line ended with a backslash continuation
    #   contstr   - accumulated text of an unfinished multi-line string
    #   needcont  - contstr must be continued with a backslash-newline
    #   contline  - accumulated raw lines of the unfinished string
    #   indents   - stack of active indentation column widths
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # Single-quoted string was not continued with a backslash:
                # report what we have as an error token and resynchronize.
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    # Advance to the next multiple-of-tabsize tab stop.
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    # Split the comment text from the trailing newline so
                    # they are reported as separate COMMENT and NL tokens.
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    # Inside brackets a newline is non-logical (NL).
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Pick the end pattern for whichever quote character
                        # opened the string (skipping any prefix letters).
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
Guido van Rossum's avatar
Guido van Rossum committed
420 421 422

if __name__ == '__main__':                     # testing
    import sys
    # Tokenize the file named on the command line, or stdin by default,
    # printing each token via the default printtoken callback.
    stream = open(sys.argv[1]) if len(sys.argv) > 1 else sys.stdin
    tokenize(stream.readline)