Commit c566431b authored by Raymond Hettinger

Improve regex tokenizer example by using re.finditer().

Also, improve variable names and fix column numbers
in the generated output.
parent 4036d87f
...@@ -1333,7 +1333,7 @@ successive matches:: ...@@ -1333,7 +1333,7 @@ successive matches::
Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column']) Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])
def tokenize(s): def tokenize(code):
keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'} keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
token_specification = [ token_specification = [
('NUMBER', r'\d+(\.\d*)?'), # Integer or decimal number ('NUMBER', r'\d+(\.\d*)?'), # Integer or decimal number
...@@ -1343,26 +1343,27 @@ successive matches:: ...@@ -1343,26 +1343,27 @@ successive matches::
('OP', r'[+\-*/]'), # Arithmetic operators ('OP', r'[+\-*/]'), # Arithmetic operators
('NEWLINE', r'\n'), # Line endings ('NEWLINE', r'\n'), # Line endings
('SKIP', r'[ \t]+'), # Skip over spaces and tabs ('SKIP', r'[ \t]+'), # Skip over spaces and tabs
('MISMATCH',r'.'), # Any other character
] ]
tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification) tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
get_token = re.compile(tok_regex).match get_token = re.compile(tok_regex).match
line = 1 line_num = 1
pos = line_start = 0 line_start = 0
mo = get_token(s) for mo in re.finditer(tok_regex, code):
while mo is not None: kind = mo.lastgroup
typ = mo.lastgroup value = mo.group(kind)
if typ == 'NEWLINE': if kind == 'NEWLINE':
line_start = pos line_start = mo.end()
line += 1 line_num += 1
elif typ != 'SKIP': elif kind == 'SKIP':
val = mo.group(typ) pass
if typ == 'ID' and val in keywords: elif kind == 'MISMATCH':
typ = val raise RuntimeError('%r unexpected on line %d' % (value, line_num))
yield Token(typ, val, line, mo.start()-line_start) else:
pos = mo.end() if kind == 'ID' and value in keywords:
mo = get_token(s, pos) kind = value
if pos != len(s): column = mo.start() - line_start
raise RuntimeError('Unexpected character %r on line %d' %(s[pos], line)) yield Token(kind, value, line_num, column)
statements = ''' statements = '''
IF quantity THEN IF quantity THEN
...@@ -1376,22 +1377,22 @@ successive matches:: ...@@ -1376,22 +1377,22 @@ successive matches::
The tokenizer produces the following output:: The tokenizer produces the following output::
Token(typ='IF', value='IF', line=2, column=5) Token(typ='IF', value='IF', line=2, column=4)
Token(typ='ID', value='quantity', line=2, column=8) Token(typ='ID', value='quantity', line=2, column=7)
Token(typ='THEN', value='THEN', line=2, column=17) Token(typ='THEN', value='THEN', line=2, column=16)
Token(typ='ID', value='total', line=3, column=9) Token(typ='ID', value='total', line=3, column=8)
Token(typ='ASSIGN', value=':=', line=3, column=15) Token(typ='ASSIGN', value=':=', line=3, column=14)
Token(typ='ID', value='total', line=3, column=18) Token(typ='ID', value='total', line=3, column=17)
Token(typ='OP', value='+', line=3, column=24) Token(typ='OP', value='+', line=3, column=23)
Token(typ='ID', value='price', line=3, column=26) Token(typ='ID', value='price', line=3, column=25)
Token(typ='OP', value='*', line=3, column=32) Token(typ='OP', value='*', line=3, column=31)
Token(typ='ID', value='quantity', line=3, column=34) Token(typ='ID', value='quantity', line=3, column=33)
Token(typ='END', value=';', line=3, column=42) Token(typ='END', value=';', line=3, column=41)
Token(typ='ID', value='tax', line=4, column=9) Token(typ='ID', value='tax', line=4, column=8)
Token(typ='ASSIGN', value=':=', line=4, column=13) Token(typ='ASSIGN', value=':=', line=4, column=12)
Token(typ='ID', value='price', line=4, column=16) Token(typ='ID', value='price', line=4, column=15)
Token(typ='OP', value='*', line=4, column=22) Token(typ='OP', value='*', line=4, column=21)
Token(typ='NUMBER', value='0.05', line=4, column=24) Token(typ='NUMBER', value='0.05', line=4, column=23)
Token(typ='END', value=';', line=4, column=28) Token(typ='END', value=';', line=4, column=27)
Token(typ='ENDIF', value='ENDIF', line=5, column=5) Token(typ='ENDIF', value='ENDIF', line=5, column=4)
Token(typ='END', value=';', line=5, column=10) Token(typ='END', value=';', line=5, column=9)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment