Improve regex tokenizer example by using re.finditer().

Also, improve variable names and fix column numbers in the generated output.

Improve regex tokenizer example by using re.finditer().
Also, improve variable names and fix column numbers in the generated output.
c566431b · Raymond Hettinger · 4036d87f · c566431b
Commit c566431b authored Aug 03, 2014 by Raymond Hettinger
Hide whitespace changes
Inline Side-by-side

Showing with 38 additions and 37 deletions

Doc/library/re.rst Doc/library/re.rst +38 -37

No files found.
--- a/Doc/library/re.rst
+++ b/Doc/library/re.rst
@@ -1333,7 +1333,7 @@ successive matches::

    Token = collections.namedtuple('Token', ['typ', 'value', 'line', 'column'])

-    def tokenize(s):
+    def tokenize(code):
        keywords = {'IF', 'THEN', 'ENDIF', 'FOR', 'NEXT', 'GOSUB', 'RETURN'}
        token_specification = [
            ('NUMBER',  r'\d+(\.\d*)?'), # Integer or decimal number
@@ -1343,26 +1343,27 @@ successive matches::
            ('OP',      r'[+\-*/]'),     # Arithmetic operators
            ('NEWLINE', r'\n'),          # Line endings
            ('SKIP',    r'[ \t]+'),      # Skip over spaces and tabs
+            ('MISMATCH',r'.'),           # Any other character
        ]
        tok_regex = '|'.join('(?P<%s>%s)' % pair for pair in token_specification)
        get_token = re.compile(tok_regex).match
-        line = 1
-        pos = line_start = 0
-        mo = get_token(s)
-        while mo is not None:
-            typ = mo.lastgroup
-            if typ == 'NEWLINE':
-                line_start = pos
-                line += 1
-            elif typ != 'SKIP':
-                val = mo.group(typ)
-                if typ == 'ID' and val in keywords:
-                    typ = val
-                yield Token(typ, val, line, mo.start()-line_start)
-            pos = mo.end()
-            mo = get_token(s, pos)
-        if pos != len(s):
-            raise RuntimeError('Unexpected character %r on line %d' %(s[pos], line))
+        line_num = 1
+        line_start = 0
+        for mo in re.finditer(tok_regex, code):
+            kind = mo.lastgroup
+            value = mo.group(kind)
+            if kind == 'NEWLINE':
+                line_start = mo.end()
+                line_num += 1
+            elif kind == 'SKIP':
+                pass
+            elif kind == 'MISMATCH':
+                raise RuntimeError('%r unexpected on line %d' % (value, line_num))
+            else:
+                if kind == 'ID' and value in keywords:
+                    kind = value
+                column = mo.start() - line_start
+                yield Token(kind, value, line_num, column)

    statements = '''
        IF quantity THEN
@@ -1376,22 +1377,22 @@ successive matches::

 The tokenizer produces the following output::

-    Token(typ='IF', value='IF', line=2, column=5)
-    Token(typ='ID', value='quantity', line=2, column=8)
-    Token(typ='THEN', value='THEN', line=2, column=17)
-    Token(typ='ID', value='total', line=3, column=9)
-    Token(typ='ASSIGN', value=':=', line=3, column=15)
-    Token(typ='ID', value='total', line=3, column=18)
-    Token(typ='OP', value='+', line=3, column=24)
-    Token(typ='ID', value='price', line=3, column=26)
-    Token(typ='OP', value='*', line=3, column=32)
-    Token(typ='ID', value='quantity', line=3, column=34)
-    Token(typ='END', value=';', line=3, column=42)
-    Token(typ='ID', value='tax', line=4, column=9)
-    Token(typ='ASSIGN', value=':=', line=4, column=13)
-    Token(typ='ID', value='price', line=4, column=16)
-    Token(typ='OP', value='*', line=4, column=22)
-    Token(typ='NUMBER', value='0.05', line=4, column=24)
-    Token(typ='END', value=';', line=4, column=28)
-    Token(typ='ENDIF', value='ENDIF', line=5, column=5)
-    Token(typ='END', value=';', line=5, column=10)
+    Token(typ='IF', value='IF', line=2, column=4)
+    Token(typ='ID', value='quantity', line=2, column=7)
+    Token(typ='THEN', value='THEN', line=2, column=16)
+    Token(typ='ID', value='total', line=3, column=8)
+    Token(typ='ASSIGN', value=':=', line=3, column=14)
+    Token(typ='ID', value='total', line=3, column=17)
+    Token(typ='OP', value='+', line=3, column=23)
+    Token(typ='ID', value='price', line=3, column=25)
+    Token(typ='OP', value='*', line=3, column=31)
+    Token(typ='ID', value='quantity', line=3, column=33)
+    Token(typ='END', value=';', line=3, column=41)
+    Token(typ='ID', value='tax', line=4, column=8)
+    Token(typ='ASSIGN', value=':=', line=4, column=12)
+    Token(typ='ID', value='price', line=4, column=15)
+    Token(typ='OP', value='*', line=4, column=21)
+    Token(typ='NUMBER', value='0.05', line=4, column=23)
+    Token(typ='END', value=';', line=4, column=27)
+    Token(typ='ENDIF', value='ENDIF', line=5, column=4)
+    Token(typ='END', value=';', line=5, column=9)