Improve regex tokenizer example by using re.finditer().

Also, improve variable names and fix column numbers in the generated output.

Improve regex tokenizer example by using re.finditer().
Also, improve variable names and fix column numbers in the generated output.
c566431b · Raymond Hettinger · 4036d87f · c566431b
Commit c566431b authored Aug 03, 2014 by Raymond Hettinger
Hide whitespace changes
Inline Side-by-side

Showing 1 changed file with 38 additions and 37 deletions

Doc/library/re.rst +38 -37

No files found.
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -1333,7 +1333,7 @@ successive matches::
    Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])
-    def tokenize(s):
+    def tokenize(code):
        keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
        token_specification = [
            ('NUMBER',  r'\d+(\.\d*)?'), # Integer or decimal number
@@ -1343,26 +1343,27 @@ successive matches::
            ('OP',      r'[+\-*/]'),     # Arithmetic operators
            ('NEWLINE', r'\n'),          # Line endings
            ('SKIP',    r'[ \t]+'),      # Skip over spaces and tabs
+            ('MISMATCH',r'.'),           # Any other character
        ]
        tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
        get_token = re.compile(tok_regex).match
-        line = 1
+        line_num = 1
-        pos = line_start = 0
+        line_start = 0
-        mo = get_token(s)
+        for mo in re.finditer(tok_regex, code):
-        while mo is not None:
+            kind = mo.lastgroup
-            typ = mo.lastgroup
+            value = mo.group(kind)
-            if typ == 'NEWLINE':
+            if kind == 'NEWLINE':
-                line_start = pos
+                line_start = mo.end()
-                line += 1
+                line_num += 1
-            elif typ != 'SKIP':
+            elif kind == 'SKIP':
-                val = mo.group(typ)
+                pass
-                if typ == 'ID' and val in keywords:
+            elif kind == 'MISMATCH':
-                    typ = val
+                raise RuntimeError('%r unexpected on line %d' % (value, line_num))
-                yield Token(typ, val, line, mo.start()-line_start)
+            else:
-            pos = mo.end()
+                if kind == 'ID' and value in keywords:
-            mo = get_token(s, pos)
+                    kind = value
-        if pos != len(s):
+                column = mo.start() - line_start
-            raise RuntimeError('Unexpected character %r on line %d' %(s[pos], line))
+                yield Token(kind, value, line_num, column)
    statements = '''
        IF quantity THEN
@@ -1376,22 +1377,22 @@ successive matches::
 The tokenizer produces the following output::
-    Token(typ='IF', value='IF', line=2, column=5)
+    Token(typ='IF', value='IF', line=2, column=4)
-    Token(typ='ID', value='quantity', line=2, column=8)
+    Token(typ='ID', value='quantity', line=2, column=7)
-    Token(typ='THEN', value='THEN', line=2, column=17)
+    Token(typ='THEN', value='THEN', line=2, column=16)
-    Token(typ='ID', value='total', line=3, column=9)
+    Token(typ='ID', value='total', line=3, column=8)
-    Token(typ='ASSIGN', value=':=', line=3, column=15)
+    Token(typ='ASSIGN', value=':=', line=3, column=14)
-    Token(typ='ID', value='total', line=3, column=18)
+    Token(typ='ID', value='total', line=3, column=17)
-    Token(typ='OP', value='+', line=3, column=24)
+    Token(typ='OP', value='+', line=3, column=23)
-    Token(typ='ID', value='price', line=3, column=26)
+    Token(typ='ID', value='price', line=3, column=25)
-    Token(typ='OP', value='*', line=3, column=32)
+    Token(typ='OP', value='*', line=3, column=31)
-    Token(typ='ID', value='quantity', line=3, column=34)
+    Token(typ='ID', value='quantity', line=3, column=33)
-    Token(typ='END', value=';', line=3, column=42)
+    Token(typ='END', value=';', line=3, column=41)
-    Token(typ='ID', value='tax', line=4, column=9)
+    Token(typ='ID', value='tax', line=4, column=8)
-    Token(typ='ASSIGN', value=':=', line=4, column=13)
+    Token(typ='ASSIGN', value=':=', line=4, column=12)
-    Token(typ='ID', value='price', line=4, column=16)
+    Token(typ='ID', value='price', line=4, column=15)
-    Token(typ='OP', value='*', line=4, column=22)
+    Token(typ='OP', value='*', line=4, column=21)
-    Token(typ='NUMBER', value='0.05', line=4, column=24)
+    Token(typ='NUMBER', value='0.05', line=4, column=23)
-    Token(typ='END', value=';', line=4, column=28)
+    Token(typ='END', value=';', line=4, column=27)
-    Token(typ='ENDIF', value='ENDIF', line=5, column=5)
+    Token(typ='ENDIF', value='ENDIF', line=5, column=4)
-    Token(typ='END', value=';', line=5, column=10)
+    Token(typ='END', value=';', line=5, column=9)