Commit 8ac65811 authored by Serhiy Storchaka, committed by GitHub

bpo-30455: Generate all token related code and docs from Grammar/Tokens. (GH-10370)

"Include/token.h", "Lib/token.py" (containing now some data moved from
"Lib/tokenize.py") and new files "Parser/token.c" (containing the code
moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included
in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by
"Tools/scripts/generate_token.py". The script overwrites files only if
needed and can be used on a read-only source tree.

"Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py"
instead of being executable itself.

Added new make targets "regen-token" and "regen-symbol", which are now
dependencies of "regen-all".

The documentation now contains strings for operators and punctuation tokens.
parent c1b4b0f6
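
For reference, a minimal sketch (not part of the commit) of how the regenerated "Lib/token.py" exposes this data; it assumes an interpreter built with this change, and the values follow from the Grammar/Tokens ordering shown below:

    import token

    token.LPAR                      # 7 -- position of LPAR in Grammar/Tokens
    token.tok_name[token.LPAR]      # 'LPAR', mirrored by _PyParser_TokenNames in Parser/token.c
    token.EXACT_TOKEN_TYPES["("]    # 7 -- table moved here from Lib/tokenize.py
    token.ISTERMINAL(token.LPAR)    # True, since LPAR < NT_OFFSET (256)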
......@@ -55,3 +55,7 @@ Include/opcode.h linguist-generated=true
Python/opcode_targets.h linguist-generated=true
Objects/typeslots.inc linguist-generated=true
Modules/unicodedata_db.h linguist-generated=true
Doc/library/token-list.inc linguist-generated=true
Include/token.h linguist-generated=true
Lib/token.py linguist-generated=true
Parser/token.c linguist-generated=true
.. Auto-generated by Tools/scripts/generate_token.py
.. data:: ENDMARKER
.. data:: NAME
.. data:: NUMBER
.. data:: STRING
.. data:: NEWLINE
.. data:: INDENT
.. data:: DEDENT
.. data:: LPAR
Token value for ``"("``.
.. data:: RPAR
Token value for ``")"``.
.. data:: LSQB
Token value for ``"["``.
.. data:: RSQB
Token value for ``"]"``.
.. data:: COLON
Token value for ``":"``.
.. data:: COMMA
Token value for ``","``.
.. data:: SEMI
Token value for ``";"``.
.. data:: PLUS
Token value for ``"+"``.
.. data:: MINUS
Token value for ``"-"``.
.. data:: STAR
Token value for ``"*"``.
.. data:: SLASH
Token value for ``"/"``.
.. data:: VBAR
Token value for ``"|"``.
.. data:: AMPER
Token value for ``"&"``.
.. data:: LESS
Token value for ``"<"``.
.. data:: GREATER
Token value for ``">"``.
.. data:: EQUAL
Token value for ``"="``.
.. data:: DOT
Token value for ``"."``.
.. data:: PERCENT
Token value for ``"%"``.
.. data:: LBRACE
Token value for ``"{"``.
.. data:: RBRACE
Token value for ``"}"``.
.. data:: EQEQUAL
Token value for ``"=="``.
.. data:: NOTEQUAL
Token value for ``"!="``.
.. data:: LESSEQUAL
Token value for ``"<="``.
.. data:: GREATEREQUAL
Token value for ``">="``.
.. data:: TILDE
Token value for ``"~"``.
.. data:: CIRCUMFLEX
Token value for ``"^"``.
.. data:: LEFTSHIFT
Token value for ``"<<"``.
.. data:: RIGHTSHIFT
Token value for ``">>"``.
.. data:: DOUBLESTAR
Token value for ``"**"``.
.. data:: PLUSEQUAL
Token value for ``"+="``.
.. data:: MINEQUAL
Token value for ``"-="``.
.. data:: STAREQUAL
Token value for ``"*="``.
.. data:: SLASHEQUAL
Token value for ``"/="``.
.. data:: PERCENTEQUAL
Token value for ``"%="``.
.. data:: AMPEREQUAL
Token value for ``"&="``.
.. data:: VBAREQUAL
Token value for ``"|="``.
.. data:: CIRCUMFLEXEQUAL
Token value for ``"^="``.
.. data:: LEFTSHIFTEQUAL
Token value for ``"<<="``.
.. data:: RIGHTSHIFTEQUAL
Token value for ``">>="``.
.. data:: DOUBLESTAREQUAL
Token value for ``"**="``.
.. data:: DOUBLESLASH
Token value for ``"//"``.
.. data:: DOUBLESLASHEQUAL
Token value for ``"//="``.
.. data:: AT
Token value for ``"@"``.
.. data:: ATEQUAL
Token value for ``"@="``.
.. data:: RARROW
Token value for ``"->"``.
.. data:: ELLIPSIS
Token value for ``"..."``.
.. data:: OP
.. data:: ERRORTOKEN
.. data:: N_TOKENS
.. data:: NT_OFFSET
......@@ -44,64 +44,7 @@ functions. The functions mirror definitions in the Python C header files.
The token constants are:
.. data:: ENDMARKER
NAME
NUMBER
STRING
NEWLINE
INDENT
DEDENT
LPAR
RPAR
LSQB
RSQB
COLON
COMMA
SEMI
PLUS
MINUS
STAR
SLASH
VBAR
AMPER
LESS
GREATER
EQUAL
DOT
PERCENT
LBRACE
RBRACE
EQEQUAL
NOTEQUAL
LESSEQUAL
GREATEREQUAL
TILDE
CIRCUMFLEX
LEFTSHIFT
RIGHTSHIFT
DOUBLESTAR
PLUSEQUAL
MINEQUAL
STAREQUAL
SLASHEQUAL
PERCENTEQUAL
AMPEREQUAL
VBAREQUAL
CIRCUMFLEXEQUAL
LEFTSHIFTEQUAL
RIGHTSHIFTEQUAL
DOUBLESTAREQUAL
DOUBLESLASH
DOUBLESLASHEQUAL
AT
ATEQUAL
RARROW
ELLIPSIS
OP
ERRORTOKEN
N_TOKENS
NT_OFFSET
.. include:: token-list.inc
The following token type values aren't used by the C tokenizer but are needed for
the :mod:`tokenize` module.
......
ENDMARKER
NAME
NUMBER
STRING
NEWLINE
INDENT
DEDENT
LPAR '('
RPAR ')'
LSQB '['
RSQB ']'
COLON ':'
COMMA ','
SEMI ';'
PLUS '+'
MINUS '-'
STAR '*'
SLASH '/'
VBAR '|'
AMPER '&'
LESS '<'
GREATER '>'
EQUAL '='
DOT '.'
PERCENT '%'
LBRACE '{'
RBRACE '}'
EQEQUAL '=='
NOTEQUAL '!='
LESSEQUAL '<='
GREATEREQUAL '>='
TILDE '~'
CIRCUMFLEX '^'
LEFTSHIFT '<<'
RIGHTSHIFT '>>'
DOUBLESTAR '**'
PLUSEQUAL '+='
MINEQUAL '-='
STAREQUAL '*='
SLASHEQUAL '/='
PERCENTEQUAL '%='
AMPEREQUAL '&='
VBAREQUAL '|='
CIRCUMFLEXEQUAL '^='
LEFTSHIFTEQUAL '<<='
RIGHTSHIFTEQUAL '>>='
DOUBLESTAREQUAL '**='
DOUBLESLASH '//'
DOUBLESLASHEQUAL '//='
AT '@'
ATEQUAL '@='
RARROW '->'
ELLIPSIS '...'
OP
ERRORTOKEN
# These aren't used by the C tokenizer but are needed for tokenize.py
COMMENT
NL
ENCODING
/* Auto-generated by Tools/scripts/generate_token.py */
/* Token types */
#ifndef Py_LIMITED_API
......@@ -62,25 +63,19 @@ extern "C" {
#define ATEQUAL 50
#define RARROW 51
#define ELLIPSIS 52
/* Don't forget to update the table _PyParser_TokenNames in tokenizer.c! */
#define OP 53
#define ERRORTOKEN 54
/* These aren't used by the C tokenizer but are needed for tokenize.py */
#define COMMENT 55
#define NL 56
#define ENCODING 57
#define N_TOKENS 58
#define NT_OFFSET 256
/* Special definitions for cooperation with parser */
#define NT_OFFSET 256
#define ISTERMINAL(x) ((x) < NT_OFFSET)
#define ISNONTERMINAL(x) ((x) >= NT_OFFSET)
#define ISEOF(x) ((x) == ENDMARKER)
PyAPI_DATA(const char *) _PyParser_TokenNames[]; /* Token names */
PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) PyToken_OneChar(int);
PyAPI_FUNC(int) PyToken_TwoChars(int, int);
PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);
......
#! /usr/bin/env python3
"""Non-terminal symbols of Python grammar (from "graminit.h")."""
# This file is automatically generated; please don't muck it up!
......@@ -7,7 +5,11 @@
# To update the symbols in this file, 'cd' to the top directory of
# the python source tree after building the interpreter and run:
#
# ./python Lib/symbol.py
# python3 Tools/scripts/generate_symbol_py.py Include/graminit.h Lib/symbol.py
#
# or just
#
# make regen-symbol
#--start constants--
single_input = 256
......@@ -103,14 +105,4 @@ sym_name = {}
for _name, _value in list(globals().items()):
if type(_value) is type(0):
sym_name[_value] = _name
def _main():
import sys
import token
if len(sys.argv) == 1:
sys.argv = sys.argv + ["Include/graminit.h", "Lib/symbol.py"]
token._main()
if __name__ == "__main__":
_main()
del _name, _value
......@@ -6,6 +6,9 @@ import subprocess
SYMBOL_FILE = support.findfile('symbol.py')
GEN_SYMBOL_FILE = os.path.join(os.path.dirname(__file__),
'..', '..', 'Tools', 'scripts',
'generate_symbol_py.py')
GRAMMAR_FILE = os.path.join(os.path.dirname(__file__),
'..', '..', 'Include', 'graminit.h')
TEST_PY_FILE = 'symbol_test.py'
......@@ -22,7 +25,7 @@ class TestSymbolGeneration(unittest.TestCase):
def _generate_symbols(self, grammar_file, target_symbol_py_file):
proc = subprocess.Popen([sys.executable,
SYMBOL_FILE,
GEN_SYMBOL_FILE,
grammar_file,
target_symbol_py_file], stderr=subprocess.PIPE)
stderr = proc.communicate()[1]
......
......@@ -1619,6 +1619,8 @@ class TestRoundtrip(TestCase):
testfiles = random.sample(testfiles, 10)
for testfile in testfiles:
if support.verbose >= 2:
print('tokenize', testfile)
with open(testfile, 'rb') as f:
with self.subTest(file=testfile):
self.check_roundtrip(f)
......
"""Token constants (from "token.h")."""
"""Token constants."""
# Auto-generated by Tools/scripts/generate_token.py
__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
# This file is automatically generated; please don't muck it up!
#
# To update the symbols in this file, 'cd' to the top directory of
# the python source tree after building the interpreter and run:
#
# ./python Lib/token.py
#--start constants--
ENDMARKER = 0
NAME = 1
NUMBER = 2
......@@ -63,23 +56,70 @@ AT = 49
ATEQUAL = 50
RARROW = 51
ELLIPSIS = 52
# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
OP = 53
ERRORTOKEN = 54
# These aren't used by the C tokenizer but are needed for tokenize.py
ERRORTOKEN = 54
COMMENT = 55
NL = 56
ENCODING = 57
N_TOKENS = 58
# Special definitions for cooperation with parser
NT_OFFSET = 256
#--end constants--
tok_name = {value: name
for name, value in globals().items()
if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())
EXACT_TOKEN_TYPES = {
'!=': NOTEQUAL,
'%': PERCENT,
'%=': PERCENTEQUAL,
'&': AMPER,
'&=': AMPEREQUAL,
'(': LPAR,
')': RPAR,
'*': STAR,
'**': DOUBLESTAR,
'**=': DOUBLESTAREQUAL,
'*=': STAREQUAL,
'+': PLUS,
'+=': PLUSEQUAL,
',': COMMA,
'-': MINUS,
'-=': MINEQUAL,
'->': RARROW,
'.': DOT,
'...': ELLIPSIS,
'/': SLASH,
'//': DOUBLESLASH,
'//=': DOUBLESLASHEQUAL,
'/=': SLASHEQUAL,
':': COLON,
';': SEMI,
'<': LESS,
'<<': LEFTSHIFT,
'<<=': LEFTSHIFTEQUAL,
'<=': LESSEQUAL,
'=': EQUAL,
'==': EQEQUAL,
'>': GREATER,
'>=': GREATEREQUAL,
'>>': RIGHTSHIFT,
'>>=': RIGHTSHIFTEQUAL,
'@': AT,
'@=': ATEQUAL,
'[': LSQB,
']': RSQB,
'^': CIRCUMFLEX,
'^=': CIRCUMFLEXEQUAL,
'{': LBRACE,
'|': VBAR,
'|=': VBAREQUAL,
'}': RBRACE,
'~': TILDE,
}
def ISTERMINAL(x):
return x < NT_OFFSET
......@@ -88,73 +128,3 @@ def ISNONTERMINAL(x):
def ISEOF(x):
return x == ENDMARKER
def _main():
import re
import sys
args = sys.argv[1:]
inFileName = args and args[0] or "Include/token.h"
outFileName = "Lib/token.py"
if len(args) > 1:
outFileName = args[1]
try:
fp = open(inFileName)
except OSError as err:
sys.stdout.write("I/O error: %s\n" % str(err))
sys.exit(1)
with fp:
lines = fp.read().split("\n")
prog = re.compile(
r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
re.IGNORECASE)
comment_regex = re.compile(
r"^\s*/\*\s*(.+?)\s*\*/\s*$",
re.IGNORECASE)
tokens = {}
prev_val = None
for line in lines:
match = prog.match(line)
if match:
name, val = match.group(1, 2)
val = int(val)
tokens[val] = {'token': name} # reverse so we can sort them...
prev_val = val
else:
comment_match = comment_regex.match(line)
if comment_match and prev_val is not None:
comment = comment_match.group(1)
tokens[prev_val]['comment'] = comment
keys = sorted(tokens.keys())
# load the output skeleton from the target:
try:
fp = open(outFileName)
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(2)
with fp:
format = fp.read().split("\n")
try:
start = format.index("#--start constants--") + 1
end = format.index("#--end constants--")
except ValueError:
sys.stderr.write("target does not contain format markers")
sys.exit(3)
lines = []
for key in keys:
lines.append("%s = %d" % (tokens[key]["token"], key))
if "comment" in tokens[key]:
lines.append("# %s" % tokens[key]["comment"])
format[start:end] = lines
try:
fp = open(outFileName, 'w')
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(4)
with fp:
fp.write("\n".join(format))
if __name__ == "__main__":
_main()
......@@ -32,6 +32,7 @@ import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
......@@ -41,55 +42,6 @@ __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
"untokenize", "TokenInfo"]
del token
EXACT_TOKEN_TYPES = {
'(': LPAR,
')': RPAR,
'[': LSQB,
']': RSQB,
':': COLON,
',': COMMA,
';': SEMI,
'+': PLUS,
'-': MINUS,
'*': STAR,
'/': SLASH,
'|': VBAR,
'&': AMPER,
'<': LESS,
'>': GREATER,
'=': EQUAL,
'.': DOT,
'%': PERCENT,
'{': LBRACE,
'}': RBRACE,
'==': EQEQUAL,
'!=': NOTEQUAL,
'<=': LESSEQUAL,
'>=': GREATEREQUAL,
'~': TILDE,
'^': CIRCUMFLEX,
'<<': LEFTSHIFT,
'>>': RIGHTSHIFT,
'**': DOUBLESTAR,
'+=': PLUSEQUAL,
'-=': MINEQUAL,
'*=': STAREQUAL,
'/=': SLASHEQUAL,
'%=': PERCENTEQUAL,
'&=': AMPEREQUAL,
'|=': VBAREQUAL,
'^=': CIRCUMFLEXEQUAL,
'<<=': LEFTSHIFTEQUAL,
'>>=': RIGHTSHIFTEQUAL,
'**=': DOUBLESTAREQUAL,
'//': DOUBLESLASH,
'//=': DOUBLESLASHEQUAL,
'...': ELLIPSIS,
'->': RARROW,
'@': AT,
'@=': ATEQUAL,
}
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
def __repr__(self):
annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
......@@ -163,17 +115,11 @@ Triple = group(StringPrefix + "'''", StringPrefix + '"""')
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
r"//=?", r"->",
r"[+\-*/%&@|^=<>]=?",
r"~")
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)
# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
......
......@@ -302,6 +302,7 @@ POBJS= \
Parser/metagrammar.o \
Parser/firstsets.o \
Parser/grammar.o \
Parser/token.o \
Parser/pgen.o
PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/parsetok.o Parser/tokenizer.o
......@@ -559,7 +560,7 @@ coverage-lcov:
@echo
# Force regeneration of parser and importlib
coverage-report: regen-grammar regen-importlib
coverage-report: regen-grammar regen-token regen-importlib
@ # build with coverage info
$(MAKE) coverage
@ # run tests, ignore failures
......@@ -741,7 +742,7 @@ regen-importlib: Programs/_freeze_importlib
# Regenerate all generated files
regen-all: regen-opcode regen-opcode-targets regen-typeslots regen-grammar \
regen-ast regen-importlib clinic
regen-token regen-symbol regen-ast regen-importlib clinic
############################################################################
# Special rules for object files
......@@ -849,6 +850,37 @@ regen-opcode:
$(srcdir)/Include/opcode.h.new
$(UPDATE_FILE) $(srcdir)/Include/opcode.h $(srcdir)/Include/opcode.h.new
.PHONY: regen-token
regen-token:
# Regenerate Doc/library/token-list.inc from Grammar/Tokens
# using Tools/scripts/generate_token.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py rst \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Doc/library/token-list.inc
# Regenerate Include/token.h from Grammar/Tokens
# using Tools/scripts/generate_token.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py h \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Include/token.h
# Regenerate Parser/token.c from Grammar/Tokens
# using Tools/scripts/generate_token.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py c \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Parser/token.c
# Regenerate Lib/token.py from Grammar/Tokens
# using Tools/scripts/generate_token.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py py \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Lib/token.py
.PHONY: regen-symbol
regen-symbol: $(srcdir)/Include/graminit.h
# Regenerate Lib/symbol.py from Include/graminit.h
# using Tools/scripts/generate_symbol_py.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_symbol_py.py \
$(srcdir)/Include/graminit.h \
$(srcdir)/Lib/symbol.py
Python/compile.o Python/symtable.o Python/ast_unparse.o Python/ast.o: $(srcdir)/Include/graminit.h $(srcdir)/Include/Python-ast.h
Python/getplatform.o: $(srcdir)/Python/getplatform.c
......
The C and Python code and the documentation related to tokens are now generated
from a single source file :file:`Grammar/Tokens`.
......@@ -367,6 +367,7 @@
<ClCompile Include="..\Parser\parser.c" />
<ClCompile Include="..\Parser\parsetok.c" />
<ClCompile Include="..\Parser\tokenizer.c" />
<ClCompile Include="..\Parser\token.c" />
<ClCompile Include="..\PC\invalid_parameter_handler.c" />
<ClCompile Include="..\PC\winreg.c" />
<ClCompile Include="..\PC\config.c" />
......
......@@ -866,6 +866,9 @@
<ClCompile Include="..\Parser\tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\token.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\PC\winreg.c">
<Filter>PC</Filter>
</ClCompile>
......
/* Auto-generated by Tools/scripts/generate_token.py */
#include "Python.h"
#include "token.h"
/* Token names */
const char * const _PyParser_TokenNames[] = {
"ENDMARKER",
"NAME",
"NUMBER",
"STRING",
"NEWLINE",
"INDENT",
"DEDENT",
"LPAR",
"RPAR",
"LSQB",
"RSQB",
"COLON",
"COMMA",
"SEMI",
"PLUS",
"MINUS",
"STAR",
"SLASH",
"VBAR",
"AMPER",
"LESS",
"GREATER",
"EQUAL",
"DOT",
"PERCENT",
"LBRACE",
"RBRACE",
"EQEQUAL",
"NOTEQUAL",
"LESSEQUAL",
"GREATEREQUAL",
"TILDE",
"CIRCUMFLEX",
"LEFTSHIFT",
"RIGHTSHIFT",
"DOUBLESTAR",
"PLUSEQUAL",
"MINEQUAL",
"STAREQUAL",
"SLASHEQUAL",
"PERCENTEQUAL",
"AMPEREQUAL",
"VBAREQUAL",
"CIRCUMFLEXEQUAL",
"LEFTSHIFTEQUAL",
"RIGHTSHIFTEQUAL",
"DOUBLESTAREQUAL",
"DOUBLESLASH",
"DOUBLESLASHEQUAL",
"AT",
"ATEQUAL",
"RARROW",
"ELLIPSIS",
"OP",
"<ERRORTOKEN>",
"<COMMENT>",
"<NL>",
"<ENCODING>",
"<N_TOKENS>",
};
/* Return the token corresponding to a single character */
int
PyToken_OneChar(int c1)
{
switch (c1) {
case '%': return PERCENT;
case '&': return AMPER;
case '(': return LPAR;
case ')': return RPAR;
case '*': return STAR;
case '+': return PLUS;
case ',': return COMMA;
case '-': return MINUS;
case '.': return DOT;
case '/': return SLASH;
case ':': return COLON;
case ';': return SEMI;
case '<': return LESS;
case '=': return EQUAL;
case '>': return GREATER;
case '@': return AT;
case '[': return LSQB;
case ']': return RSQB;
case '^': return CIRCUMFLEX;
case '{': return LBRACE;
case '|': return VBAR;
case '}': return RBRACE;
case '~': return TILDE;
}
return OP;
}
int
PyToken_TwoChars(int c1, int c2)
{
switch (c1) {
case '!':
switch (c2) {
case '=': return NOTEQUAL;
}
break;
case '%':
switch (c2) {
case '=': return PERCENTEQUAL;
}
break;
case '&':
switch (c2) {
case '=': return AMPEREQUAL;
}
break;
case '*':
switch (c2) {
case '*': return DOUBLESTAR;
case '=': return STAREQUAL;
}
break;
case '+':
switch (c2) {
case '=': return PLUSEQUAL;
}
break;
case '-':
switch (c2) {
case '=': return MINEQUAL;
case '>': return RARROW;
}
break;
case '/':
switch (c2) {
case '/': return DOUBLESLASH;
case '=': return SLASHEQUAL;
}
break;
case '<':
switch (c2) {
case '<': return LEFTSHIFT;
case '=': return LESSEQUAL;
case '>': return NOTEQUAL;
}
break;
case '=':
switch (c2) {
case '=': return EQEQUAL;
}
break;
case '>':
switch (c2) {
case '=': return GREATEREQUAL;
case '>': return RIGHTSHIFT;
}
break;
case '@':
switch (c2) {
case '=': return ATEQUAL;
}
break;
case '^':
switch (c2) {
case '=': return CIRCUMFLEXEQUAL;
}
break;
case '|':
switch (c2) {
case '=': return VBAREQUAL;
}
break;
}
return OP;
}
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
switch (c1) {
case '*':
switch (c2) {
case '*':
switch (c3) {
case '=': return DOUBLESTAREQUAL;
}
break;
}
break;
case '.':
switch (c2) {
case '.':
switch (c3) {
case '.': return ELLIPSIS;
}
break;
}
break;
case '/':
switch (c2) {
case '/':
switch (c3) {
case '=': return DOUBLESLASHEQUAL;
}
break;
}
break;
case '<':
switch (c2) {
case '<':
switch (c3) {
case '=': return LEFTSHIFTEQUAL;
}
break;
}
break;
case '>':
switch (c2) {
case '>':
switch (c3) {
case '=': return RIGHTSHIFTEQUAL;
}
break;
}
break;
}
return OP;
}
......@@ -48,72 +48,6 @@ static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
/* Token names */
const char *_PyParser_TokenNames[] = {
"ENDMARKER",
"NAME",
"NUMBER",
"STRING",
"NEWLINE",
"INDENT",
"DEDENT",
"LPAR",
"RPAR",
"LSQB",
"RSQB",
"COLON",
"COMMA",
"SEMI",
"PLUS",
"MINUS",
"STAR",
"SLASH",
"VBAR",
"AMPER",
"LESS",
"GREATER",
"EQUAL",
"DOT",
"PERCENT",
"LBRACE",
"RBRACE",
"EQEQUAL",
"NOTEQUAL",
"LESSEQUAL",
"GREATEREQUAL",
"TILDE",
"CIRCUMFLEX",
"LEFTSHIFT",
"RIGHTSHIFT",
"DOUBLESTAR",
"PLUSEQUAL",
"MINEQUAL",
"STAREQUAL",
"SLASHEQUAL",
"PERCENTEQUAL",
"AMPEREQUAL",
"VBAREQUAL",
"CIRCUMFLEXEQUAL",
"LEFTSHIFTEQUAL",
"RIGHTSHIFTEQUAL",
"DOUBLESTAREQUAL",
"DOUBLESLASH",
"DOUBLESLASHEQUAL",
"AT",
"ATEQUAL",
"RARROW",
"ELLIPSIS",
/* This table must match the #defines in token.h! */
"OP",
"<ERRORTOKEN>",
"COMMENT",
"NL",
"ENCODING",
"<N_TOKENS>"
};
/* Create and initialize a new tok_state structure */
static struct tok_state *
......@@ -1114,177 +1048,6 @@ tok_backup(struct tok_state *tok, int c)
}
/* Return the token corresponding to a single character */
int
PyToken_OneChar(int c)
{
switch (c) {
case '(': return LPAR;
case ')': return RPAR;
case '[': return LSQB;
case ']': return RSQB;
case ':': return COLON;
case ',': return COMMA;
case ';': return SEMI;
case '+': return PLUS;
case '-': return MINUS;
case '*': return STAR;
case '/': return SLASH;
case '|': return VBAR;
case '&': return AMPER;
case '<': return LESS;
case '>': return GREATER;
case '=': return EQUAL;
case '.': return DOT;
case '%': return PERCENT;
case '{': return LBRACE;
case '}': return RBRACE;
case '^': return CIRCUMFLEX;
case '~': return TILDE;
case '@': return AT;
default: return OP;
}
}
int
PyToken_TwoChars(int c1, int c2)
{
switch (c1) {
case '=':
switch (c2) {
case '=': return EQEQUAL;
}
break;
case '!':
switch (c2) {
case '=': return NOTEQUAL;
}
break;
case '<':
switch (c2) {
case '>': return NOTEQUAL;
case '=': return LESSEQUAL;
case '<': return LEFTSHIFT;
}
break;
case '>':
switch (c2) {
case '=': return GREATEREQUAL;
case '>': return RIGHTSHIFT;
}
break;
case '+':
switch (c2) {
case '=': return PLUSEQUAL;
}
break;
case '-':
switch (c2) {
case '=': return MINEQUAL;
case '>': return RARROW;
}
break;
case '*':
switch (c2) {
case '*': return DOUBLESTAR;
case '=': return STAREQUAL;
}
break;
case '/':
switch (c2) {
case '/': return DOUBLESLASH;
case '=': return SLASHEQUAL;
}
break;
case '|':
switch (c2) {
case '=': return VBAREQUAL;
}
break;
case '%':
switch (c2) {
case '=': return PERCENTEQUAL;
}
break;
case '&':
switch (c2) {
case '=': return AMPEREQUAL;
}
break;
case '^':
switch (c2) {
case '=': return CIRCUMFLEXEQUAL;
}
break;
case '@':
switch (c2) {
case '=': return ATEQUAL;
}
break;
}
return OP;
}
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
switch (c1) {
case '<':
switch (c2) {
case '<':
switch (c3) {
case '=':
return LEFTSHIFTEQUAL;
}
break;
}
break;
case '>':
switch (c2) {
case '>':
switch (c3) {
case '=':
return RIGHTSHIFTEQUAL;
}
break;
}
break;
case '*':
switch (c2) {
case '*':
switch (c3) {
case '=':
return DOUBLESTAREQUAL;
}
break;
}
break;
case '/':
switch (c2) {
case '/':
switch (c3) {
case '=':
return DOUBLESLASHEQUAL;
}
break;
}
break;
case '.':
switch (c2) {
case '.':
switch (c3) {
case '.':
return ELLIPSIS;
}
break;
}
break;
}
return OP;
}
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
......
#! /usr/bin/env python3
# This script generates the symbol.py source file.
import sys
import re
def main(inFileName="Include/graminit.h", outFileName="Lib/symbol.py"):
try:
fp = open(inFileName)
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(1)
with fp:
lines = fp.read().split("\n")
prog = re.compile(
"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
re.IGNORECASE)
tokens = {}
for line in lines:
match = prog.match(line)
if match:
name, val = match.group(1, 2)
val = int(val)
tokens[val] = name # reverse so we can sort them...
keys = sorted(tokens.keys())
# load the output skeleton from the target:
try:
fp = open(outFileName)
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(2)
with fp:
format = fp.read().split("\n")
try:
start = format.index("#--start constants--") + 1
end = format.index("#--end constants--")
except ValueError:
sys.stderr.write("target does not contain format markers")
sys.exit(3)
lines = []
for val in keys:
lines.append("%s = %d" % (tokens[val], val))
format[start:end] = lines
try:
fp = open(outFileName, 'w')
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(4)
with fp:
fp.write("\n".join(format))
if __name__ == '__main__':
main(*sys.argv[1:])
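
A hedged usage note: besides the new "make regen-symbol" target, the script can be run directly, assuming the current directory is the top of a built CPython source tree (as the regenerated header comment in "Lib/symbol.py" describes). A minimal sketch:

    import subprocess, sys

    # Regenerate Lib/symbol.py from Include/graminit.h -- the same command
    # that the Makefile's regen-symbol target runs.
    subprocess.run(
        [sys.executable, "Tools/scripts/generate_symbol_py.py",
         "Include/graminit.h", "Lib/symbol.py"],
        check=True,
    )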
#! /usr/bin/env python3
# This script generates token related files from Grammar/Tokens:
#
# Doc/library/token-list.inc
# Include/token.h
# Parser/token.c
# Lib/token.py
NT_OFFSET = 256
def load_tokens(path):
tok_names = []
string_to_tok = {}
ERRORTOKEN = None
with open(path) as fp:
for line in fp:
line = line.strip()
# strip comments
i = line.find('#')
if i >= 0:
line = line[:i].strip()
if not line:
continue
fields = line.split()
name = fields[0]
value = len(tok_names)
if name == 'ERRORTOKEN':
ERRORTOKEN = value
string = fields[1] if len(fields) > 1 else None
if string:
string = eval(string)
string_to_tok[string] = value
tok_names.append(name)
return tok_names, ERRORTOKEN, string_to_tok
def update_file(file, content):
try:
with open(file, 'r') as fobj:
if fobj.read() == content:
return False
except (OSError, ValueError):
pass
with open(file, 'w') as fobj:
fobj.write(content)
return True
token_h_template = """\
/* Auto-generated by Tools/scripts/generate_token.py */
/* Token types */
#ifndef Py_LIMITED_API
#ifndef Py_TOKEN_H
#define Py_TOKEN_H
#ifdef __cplusplus
extern "C" {
#endif
#undef TILDE /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */
%s\
#define N_TOKENS %d
#define NT_OFFSET %d
/* Special definitions for cooperation with parser */
#define ISTERMINAL(x) ((x) < NT_OFFSET)
#define ISNONTERMINAL(x) ((x) >= NT_OFFSET)
#define ISEOF(x) ((x) == ENDMARKER)
PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) PyToken_OneChar(int);
PyAPI_FUNC(int) PyToken_TwoChars(int, int);
PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);
#ifdef __cplusplus
}
#endif
#endif /* !Py_TOKEN_H */
#endif /* Py_LIMITED_API */
"""
def make_h(infile, outfile='Include/token.h'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
defines = []
for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
defines.append("#define %-15s %d\n" % (name, value))
if update_file(outfile, token_h_template % (
''.join(defines),
len(tok_names),
NT_OFFSET
)):
print("%s regenerated from %s" % (outfile, infile))
token_c_template = """\
/* Auto-generated by Tools/scripts/generate_token.py */
#include "Python.h"
#include "token.h"
/* Token names */
const char * const _PyParser_TokenNames[] = {
%s\
};
/* Return the token corresponding to a single character */
int
PyToken_OneChar(int c1)
{
%s\
return OP;
}
int
PyToken_TwoChars(int c1, int c2)
{
%s\
return OP;
}
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
%s\
return OP;
}
"""
def generate_chars_to_token(mapping, n=1):
result = []
write = result.append
indent = ' ' * n
write(indent)
write('switch (c%d) {\n' % (n,))
for c in sorted(mapping):
write(indent)
value = mapping[c]
if isinstance(value, dict):
write("case '%s':\n" % (c,))
write(generate_chars_to_token(value, n + 1))
write(indent)
write(' break;\n')
else:
write("case '%s': return %s;\n" % (c, value))
write(indent)
write('}\n')
return ''.join(result)
def make_c(infile, outfile='Parser/token.c'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
string_to_tok['<>'] = string_to_tok['!=']
chars_to_token = {}
for string, value in string_to_tok.items():
assert 1 <= len(string) <= 3
name = tok_names[value]
m = chars_to_token.setdefault(len(string), {})
for c in string[:-1]:
m = m.setdefault(c, {})
m[string[-1]] = name
names = []
for value, name in enumerate(tok_names):
if value >= ERRORTOKEN:
name = '<%s>' % name
names.append(' "%s",\n' % name)
names.append(' "<N_TOKENS>",\n')
if update_file(outfile, token_c_template % (
''.join(names),
generate_chars_to_token(chars_to_token[1]),
generate_chars_to_token(chars_to_token[2]),
generate_chars_to_token(chars_to_token[3])
)):
print("%s regenerated from %s" % (outfile, infile))
token_inc_template = """\
.. Auto-generated by Tools/scripts/generate_token.py
%s
.. data:: N_TOKENS
.. data:: NT_OFFSET
"""
def make_rst(infile, outfile='Doc/library/token-list.inc'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
tok_to_string = {value: s for s, value in string_to_tok.items()}
names = []
for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
names.append('.. data:: %s' % (name,))
if value in tok_to_string:
names.append('')
names.append(' Token value for ``"%s"``.' % tok_to_string[value])
names.append('')
if update_file(outfile, token_inc_template % '\n'.join(names)):
print("%s regenerated from %s" % (outfile, infile))
token_py_template = '''\
"""Token constants."""
# Auto-generated by Tools/scripts/generate_token.py
__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
%s
N_TOKENS = %d
# Special definitions for cooperation with parser
NT_OFFSET = %d
tok_name = {value: name
for name, value in globals().items()
if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())
EXACT_TOKEN_TYPES = {
%s
}
def ISTERMINAL(x):
return x < NT_OFFSET
def ISNONTERMINAL(x):
return x >= NT_OFFSET
def ISEOF(x):
return x == ENDMARKER
'''
def make_py(infile, outfile='Lib/token.py'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
constants = []
for value, name in enumerate(tok_names):
constants.append('%s = %d' % (name, value))
constants.insert(ERRORTOKEN,
"# These aren't used by the C tokenizer but are needed for tokenize.py")
token_types = []
for s, value in sorted(string_to_tok.items()):
token_types.append(' %r: %s,' % (s, tok_names[value]))
if update_file(outfile, token_py_template % (
'\n'.join(constants),
len(tok_names),
NT_OFFSET,
'\n'.join(token_types),
)):
print("%s regenerated from %s" % (outfile, infile))
def main(op, infile='Grammar/Tokens', *args):
make = globals()['make_' + op]
make(infile, *args)
if __name__ == '__main__':
import sys
main(*sys.argv[1:])
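
A minimal sketch, assuming the current directory is the top of the source tree and "Tools/scripts" is added to sys.path, of what load_tokens() extracts from the Grammar/Tokens file listed above; the asserted relations follow directly from that listing:

    import sys
    sys.path.insert(0, "Tools/scripts")
    from generate_token import load_tokens

    # (token names in file order, index of ERRORTOKEN, operator string -> value)
    tok_names, errortoken, string_to_tok = load_tokens("Grammar/Tokens")

    assert tok_names[:3] == ["ENDMARKER", "NAME", "NUMBER"]
    assert errortoken == tok_names.index("ERRORTOKEN")
    assert string_to_tok["("] == tok_names.index("LPAR")
    assert string_to_tok["**="] == tok_names.index("DOUBLESTAREQUAL")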