Commit 8ac65811 authored by Serhiy Storchaka, committed by GitHub

bpo-30455: Generate all token related code and docs from Grammar/Tokens. (GH-10370)

"Include/token.h", "Lib/token.py" (containing now some data moved from
"Lib/tokenize.py") and new files "Parser/token.c" (containing the code
moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included
in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by
"Tools/scripts/generate_token.py". The script overwrites files only if
needed and can be used on a read-only source tree.

"Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py"
instead of being executable itself.

Added new make targets "regen-token" and "regen-symbol", which are now
dependencies of "regen-all".

The documentation now contains strings for operators and punctuation tokens.
parent c1b4b0f6
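
For reference, a minimal sketch (not part of the commit) of how the regenerated "Lib/token.py" exposes this data; it assumes an interpreter built with this change, and the values follow from the Grammar/Tokens ordering shown below:

    import token

    token.LPAR                      # 7 -- position of LPAR in Grammar/Tokens
    token.tok_name[token.LPAR]      # 'LPAR', mirrored by _PyParser_TokenNames in Parser/token.c
    token.EXACT_TOKEN_TYPES["("]    # 7 -- table moved here from Lib/tokenize.py
    token.ISTERMINAL(token.LPAR)    # True, since LPAR < NT_OFFSET (256)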
......@@ -55,3 +55,7 @@ Include/opcode.h linguist-generated=true
Python/opcode_targets.h linguist-generated=true
Objects/typeslots.inc linguist-generated=true
Modules/unicodedata_db.h linguist-generated=true
Doc/library/token-list.inc linguist-generated=true
Include/token.h linguist-generated=true
Lib/token.py linguist-generated=true
Parser/token.c linguist-generated=true
.. Auto-generated by Tools/scripts/generate_token.py
.. data:: ENDMARKER
.. data:: NAME
.. data:: NUMBER
.. data:: STRING
.. data:: NEWLINE
.. data:: INDENT
.. data:: DEDENT
.. data:: LPAR
Token value for ``"("``.
.. data:: RPAR
Token value for ``")"``.
.. data:: LSQB
Token value for ``"["``.
.. data:: RSQB
Token value for ``"]"``.
.. data:: COLON
Token value for ``":"``.
.. data:: COMMA
Token value for ``","``.
.. data:: SEMI
Token value for ``";"``.
.. data:: PLUS
Token value for ``"+"``.
.. data:: MINUS
Token value for ``"-"``.
.. data:: STAR
Token value for ``"*"``.
.. data:: SLASH
Token value for ``"/"``.
.. data:: VBAR
Token value for ``"|"``.
.. data:: AMPER
Token value for ``"&"``.
.. data:: LESS
Token value for ``"<"``.
.. data:: GREATER
Token value for ``">"``.
.. data:: EQUAL
Token value for ``"="``.
.. data:: DOT
Token value for ``"."``.
.. data:: PERCENT
Token value for ``"%"``.
.. data:: LBRACE
Token value for ``"{"``.
.. data:: RBRACE
Token value for ``"}"``.
.. data:: EQEQUAL
Token value for ``"=="``.
.. data:: NOTEQUAL
Token value for ``"!="``.
.. data:: LESSEQUAL
Token value for ``"<="``.
.. data:: GREATEREQUAL
Token value for ``">="``.
.. data:: TILDE
Token value for ``"~"``.
.. data:: CIRCUMFLEX
Token value for ``"^"``.
.. data:: LEFTSHIFT
Token value for ``"<<"``.
.. data:: RIGHTSHIFT
Token value for ``">>"``.
.. data:: DOUBLESTAR
Token value for ``"**"``.
.. data:: PLUSEQUAL
Token value for ``"+="``.
.. data:: MINEQUAL
Token value for ``"-="``.
.. data:: STAREQUAL
Token value for ``"*="``.
.. data:: SLASHEQUAL
Token value for ``"/="``.
.. data:: PERCENTEQUAL
Token value for ``"%="``.
.. data:: AMPEREQUAL
Token value for ``"&="``.
.. data:: VBAREQUAL
Token value for ``"|="``.
.. data:: CIRCUMFLEXEQUAL
Token value for ``"^="``.
.. data:: LEFTSHIFTEQUAL
Token value for ``"<<="``.
.. data:: RIGHTSHIFTEQUAL
Token value for ``">>="``.
.. data:: DOUBLESTAREQUAL
Token value for ``"**="``.
.. data:: DOUBLESLASH
Token value for ``"//"``.
.. data:: DOUBLESLASHEQUAL
Token value for ``"//="``.
.. data:: AT
Token value for ``"@"``.
.. data:: ATEQUAL
Token value for ``"@="``.
.. data:: RARROW
Token value for ``"->"``.
.. data:: ELLIPSIS
Token value for ``"..."``.
.. data:: OP
.. data:: ERRORTOKEN
.. data:: N_TOKENS
.. data:: NT_OFFSET
......@@ -44,64 +44,7 @@ functions. The functions mirror definitions in the Python C header files.
The token constants are:
.. data:: ENDMARKER
NAME
NUMBER
STRING
NEWLINE
INDENT
DEDENT
LPAR
RPAR
LSQB
RSQB
COLON
COMMA
SEMI
PLUS
MINUS
STAR
SLASH
VBAR
AMPER
LESS
GREATER
EQUAL
DOT
PERCENT
LBRACE
RBRACE
EQEQUAL
NOTEQUAL
LESSEQUAL
GREATEREQUAL
TILDE
CIRCUMFLEX
LEFTSHIFT
RIGHTSHIFT
DOUBLESTAR
PLUSEQUAL
MINEQUAL
STAREQUAL
SLASHEQUAL
PERCENTEQUAL
AMPEREQUAL
VBAREQUAL
CIRCUMFLEXEQUAL
LEFTSHIFTEQUAL
RIGHTSHIFTEQUAL
DOUBLESTAREQUAL
DOUBLESLASH
DOUBLESLASHEQUAL
AT
ATEQUAL
RARROW
ELLIPSIS
OP
ERRORTOKEN
N_TOKENS
NT_OFFSET
.. include:: token-list.inc
The following token type values aren't used by the C tokenizer but are needed for
the :mod:`tokenize` module.
......
ENDMARKER
NAME
NUMBER
STRING
NEWLINE
INDENT
DEDENT
LPAR '('
RPAR ')'
LSQB '['
RSQB ']'
COLON ':'
COMMA ','
SEMI ';'
PLUS '+'
MINUS '-'
STAR '*'
SLASH '/'
VBAR '|'
AMPER '&'
LESS '<'
GREATER '>'
EQUAL '='
DOT '.'
PERCENT '%'
LBRACE '{'
RBRACE '}'
EQEQUAL '=='
NOTEQUAL '!='
LESSEQUAL '<='
GREATEREQUAL '>='
TILDE '~'
CIRCUMFLEX '^'
LEFTSHIFT '<<'
RIGHTSHIFT '>>'
DOUBLESTAR '**'
PLUSEQUAL '+='
MINEQUAL '-='
STAREQUAL '*='
SLASHEQUAL '/='
PERCENTEQUAL '%='
AMPEREQUAL '&='
VBAREQUAL '|='
CIRCUMFLEXEQUAL '^='
LEFTSHIFTEQUAL '<<='
RIGHTSHIFTEQUAL '>>='
DOUBLESTAREQUAL '**='
DOUBLESLASH '//'
DOUBLESLASHEQUAL '//='
AT '@'
ATEQUAL '@='
RARROW '->'
ELLIPSIS '...'
OP
ERRORTOKEN
# These aren't used by the C tokenizer but are needed for tokenize.py
COMMENT
NL
ENCODING
/* Auto-generated by Tools/scripts/generate_token.py */
/* Token types */
#ifndef Py_LIMITED_API
......@@ -62,25 +63,19 @@ extern "C" {
#define ATEQUAL 50
#define RARROW 51
#define ELLIPSIS 52
/* Don't forget to update the table _PyParser_TokenNames in tokenizer.c! */
#define OP 53
#define ERRORTOKEN 54
/* These aren't used by the C tokenizer but are needed for tokenize.py */
#define COMMENT 55
#define NL 56
#define ENCODING 57
#define N_TOKENS 58
#define NT_OFFSET 256
/* Special definitions for cooperation with parser */
#define NT_OFFSET 256
#define ISTERMINAL(x) ((x) < NT_OFFSET)
#define ISNONTERMINAL(x) ((x) >= NT_OFFSET)
#define ISEOF(x) ((x) == ENDMARKER)
PyAPI_DATA(const char *) _PyParser_TokenNames[]; /* Token names */
PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) PyToken_OneChar(int);
PyAPI_FUNC(int) PyToken_TwoChars(int, int);
PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);
......
#! /usr/bin/env python3
"""Non-terminal symbols of Python grammar (from "graminit.h")."""
# This file is automatically generated; please don't muck it up!
......@@ -7,7 +5,11 @@
# To update the symbols in this file, 'cd' to the top directory of
# the python source tree after building the interpreter and run:
#
# ./python Lib/symbol.py
# python3 Tools/scripts/generate_symbol_py.py Include/graminit.h Lib/symbol.py
#
# or just
#
# make regen-symbol
#--start constants--
single_input = 256
......@@ -103,14 +105,4 @@ sym_name = {}
for _name, _value in list(globals().items()):
if type(_value) is type(0):
sym_name[_value] = _name
def _main():
import sys
import token
if len(sys.argv) == 1:
sys.argv = sys.argv + ["Include/graminit.h", "Lib/symbol.py"]
token._main()
if __name__ == "__main__":
_main()
del _name, _value
......@@ -6,6 +6,9 @@ import subprocess
SYMBOL_FILE = support.findfile('symbol.py')
GEN_SYMBOL_FILE = os.path.join(os.path.dirname(__file__),
'..', '..', 'Tools', 'scripts',
'generate_symbol_py.py')
GRAMMAR_FILE = os.path.join(os.path.dirname(__file__),
'..', '..', 'Include', 'graminit.h')
TEST_PY_FILE = 'symbol_test.py'
......@@ -22,7 +25,7 @@ class TestSymbolGeneration(unittest.TestCase):
def _generate_symbols(self, grammar_file, target_symbol_py_file):
proc = subprocess.Popen([sys.executable,
SYMBOL_FILE,
GEN_SYMBOL_FILE,
grammar_file,
target_symbol_py_file], stderr=subprocess.PIPE)
stderr = proc.communicate()[1]
......
......@@ -1619,6 +1619,8 @@ class TestRoundtrip(TestCase):
testfiles = random.sample(testfiles, 10)
for testfile in testfiles:
if support.verbose >= 2:
print('tokenize', testfile)
with open(testfile, 'rb') as f:
with self.subTest(file=testfile):
self.check_roundtrip(f)
......
"""Token constants (from "token.h")."""
"""Token constants."""
# Auto-generated by Tools/scripts/generate_token.py
__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
# This file is automatically generated; please don't muck it up!
#
# To update the symbols in this file, 'cd' to the top directory of
# the python source tree after building the interpreter and run:
#
# ./python Lib/token.py
#--start constants--
ENDMARKER = 0
NAME = 1
NUMBER = 2
......@@ -63,23 +56,70 @@ AT = 49
ATEQUAL = 50
RARROW = 51
ELLIPSIS = 52
# Don't forget to update the table _PyParser_TokenNames in tokenizer.c!
OP = 53
ERRORTOKEN = 54
# These aren't used by the C tokenizer but are needed for tokenize.py
ERRORTOKEN = 54
COMMENT = 55
NL = 56
ENCODING = 57
N_TOKENS = 58
# Special definitions for cooperation with parser
NT_OFFSET = 256
#--end constants--
tok_name = {value: name
for name, value in globals().items()
if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())
EXACT_TOKEN_TYPES = {
'!=': NOTEQUAL,
'%': PERCENT,
'%=': PERCENTEQUAL,
'&': AMPER,
'&=': AMPEREQUAL,
'(': LPAR,
')': RPAR,
'*': STAR,
'**': DOUBLESTAR,
'**=': DOUBLESTAREQUAL,
'*=': STAREQUAL,
'+': PLUS,
'+=': PLUSEQUAL,
',': COMMA,
'-': MINUS,
'-=': MINEQUAL,
'->': RARROW,
'.': DOT,
'...': ELLIPSIS,
'/': SLASH,
'//': DOUBLESLASH,
'//=': DOUBLESLASHEQUAL,
'/=': SLASHEQUAL,
':': COLON,
';': SEMI,
'<': LESS,
'<<': LEFTSHIFT,
'<<=': LEFTSHIFTEQUAL,
'<=': LESSEQUAL,
'=': EQUAL,
'==': EQEQUAL,
'>': GREATER,
'>=': GREATEREQUAL,
'>>': RIGHTSHIFT,
'>>=': RIGHTSHIFTEQUAL,
'@': AT,
'@=': ATEQUAL,
'[': LSQB,
']': RSQB,
'^': CIRCUMFLEX,
'^=': CIRCUMFLEXEQUAL,
'{': LBRACE,
'|': VBAR,
'|=': VBAREQUAL,
'}': RBRACE,
'~': TILDE,
}
def ISTERMINAL(x):
return x < NT_OFFSET
......@@ -88,73 +128,3 @@ def ISNONTERMINAL(x):
def ISEOF(x):
return x == ENDMARKER
def _main():
import re
import sys
args = sys.argv[1:]
inFileName = args and args[0] or "Include/token.h"
outFileName = "Lib/token.py"
if len(args) > 1:
outFileName = args[1]
try:
fp = open(inFileName)
except OSError as err:
sys.stdout.write("I/O error: %s\n" % str(err))
sys.exit(1)
with fp:
lines = fp.read().split("\n")
prog = re.compile(
r"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
re.IGNORECASE)
comment_regex = re.compile(
r"^\s*/\*\s*(.+?)\s*\*/\s*$",
re.IGNORECASE)
tokens = {}
prev_val = None
for line in lines:
match = prog.match(line)
if match:
name, val = match.group(1, 2)
val = int(val)
tokens[val] = {'token': name} # reverse so we can sort them...
prev_val = val
else:
comment_match = comment_regex.match(line)
if comment_match and prev_val is not None:
comment = comment_match.group(1)
tokens[prev_val]['comment'] = comment
keys = sorted(tokens.keys())
# load the output skeleton from the target:
try:
fp = open(outFileName)
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(2)
with fp:
format = fp.read().split("\n")
try:
start = format.index("#--start constants--") + 1
end = format.index("#--end constants--")
except ValueError:
sys.stderr.write("target does not contain format markers")
sys.exit(3)
lines = []
for key in keys:
lines.append("%s = %d" % (tokens[key]["token"], key))
if "comment" in tokens[key]:
lines.append("# %s" % tokens[key]["comment"])
format[start:end] = lines
try:
fp = open(outFileName, 'w')
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(4)
with fp:
fp.write("\n".join(format))
if __name__ == "__main__":
_main()
......@@ -32,6 +32,7 @@ import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
......@@ -41,55 +42,6 @@ __all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
"untokenize", "TokenInfo"]
del token
EXACT_TOKEN_TYPES = {
'(': LPAR,
')': RPAR,
'[': LSQB,
']': RSQB,
':': COLON,
',': COMMA,
';': SEMI,
'+': PLUS,
'-': MINUS,
'*': STAR,
'/': SLASH,
'|': VBAR,
'&': AMPER,
'<': LESS,
'>': GREATER,
'=': EQUAL,
'.': DOT,
'%': PERCENT,
'{': LBRACE,
'}': RBRACE,
'==': EQEQUAL,
'!=': NOTEQUAL,
'<=': LESSEQUAL,
'>=': GREATEREQUAL,
'~': TILDE,
'^': CIRCUMFLEX,
'<<': LEFTSHIFT,
'>>': RIGHTSHIFT,
'**': DOUBLESTAR,
'+=': PLUSEQUAL,
'-=': MINEQUAL,
'*=': STAREQUAL,
'/=': SLASHEQUAL,
'%=': PERCENTEQUAL,
'&=': AMPEREQUAL,
'|=': VBAREQUAL,
'^=': CIRCUMFLEXEQUAL,
'<<=': LEFTSHIFTEQUAL,
'>>=': RIGHTSHIFTEQUAL,
'**=': DOUBLESTAREQUAL,
'//': DOUBLESLASH,
'//=': DOUBLESLASHEQUAL,
'...': ELLIPSIS,
'->': RARROW,
'@': AT,
'@=': ATEQUAL,
}
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
def __repr__(self):
annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
......@@ -163,17 +115,11 @@ Triple = group(StringPrefix + "'''", StringPrefix + '"""')
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"!=",
r"//=?", r"->",
r"[+\-*/%&@|^=<>]=?",
r"~")
Bracket = '[][(){}]'
Special = group(r'\r?\n', r'\.\.\.', r'[:;.,@]')
Funny = group(Operator, Bracket, Special)
# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
......
......@@ -302,6 +302,7 @@ POBJS= \
Parser/metagrammar.o \
Parser/firstsets.o \
Parser/grammar.o \
Parser/token.o \
Parser/pgen.o
PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/parsetok.o Parser/tokenizer.o
......@@ -559,7 +560,7 @@ coverage-lcov:
@echo
# Force regeneration of parser and importlib
coverage-report: regen-grammar regen-importlib
coverage-report: regen-grammar regen-token regen-importlib
@ # build with coverage info
$(MAKE) coverage
@ # run tests, ignore failures
......@@ -741,7 +742,7 @@ regen-importlib: Programs/_freeze_importlib
# Regenerate all generated files
regen-all: regen-opcode regen-opcode-targets regen-typeslots regen-grammar \
regen-ast regen-importlib clinic
regen-token regen-symbol regen-ast regen-importlib clinic
############################################################################
# Special rules for object files
......@@ -849,6 +850,37 @@ regen-opcode:
$(srcdir)/Include/opcode.h.new
$(UPDATE_FILE) $(srcdir)/Include/opcode.h $(srcdir)/Include/opcode.h.new
.PHONY: regen-token
regen-token:
# Regenerate Doc/library/token-list.inc from Grammar/Tokens
# using Tools/scripts/generate_token.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py rst \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Doc/library/token-list.inc
# Regenerate Include/token.h from Grammar/Tokens
# using Tools/scripts/generate_token.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py h \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Include/token.h
# Regenerate Parser/token.c from Grammar/Tokens
# using Tools/scripts/generate_token.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py c \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Parser/token.c
# Regenerate Lib/token.py from Grammar/Tokens
# using Tools/scripts/generate_token.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_token.py py \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Lib/token.py
.PHONY: regen-symbol
regen-symbol: $(srcdir)/Include/graminit.h
# Regenerate Lib/symbol.py from Include/graminit.h
# using Tools/scripts/generate_symbol_py.py
$(PYTHON_FOR_REGEN) $(srcdir)/Tools/scripts/generate_symbol_py.py \
$(srcdir)/Include/graminit.h \
$(srcdir)/Lib/symbol.py
Python/compile.o Python/symtable.o Python/ast_unparse.o Python/ast.o: $(srcdir)/Include/graminit.h $(srcdir)/Include/Python-ast.h
Python/getplatform.o: $(srcdir)/Python/getplatform.c
......
The C and Python code and the documentation related to tokens are now generated
from a single source file :file:`Grammar/Tokens`.
......@@ -367,6 +367,7 @@
<ClCompile Include="..\Parser\parser.c" />
<ClCompile Include="..\Parser\parsetok.c" />
<ClCompile Include="..\Parser\tokenizer.c" />
<ClCompile Include="..\Parser\token.c" />
<ClCompile Include="..\PC\invalid_parameter_handler.c" />
<ClCompile Include="..\PC\winreg.c" />
<ClCompile Include="..\PC\config.c" />
......
......@@ -866,6 +866,9 @@
<ClCompile Include="..\Parser\tokenizer.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\token.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\PC\winreg.c">
<Filter>PC</Filter>
</ClCompile>
......
/* Auto-generated by Tools/scripts/generate_token.py */
#include "Python.h"
#include "token.h"
/* Token names */
const char * const _PyParser_TokenNames[] = {
"ENDMARKER",
"NAME",
"NUMBER",
"STRING",
"NEWLINE",
"INDENT",
"DEDENT",
"LPAR",
"RPAR",
"LSQB",
"RSQB",
"COLON",
"COMMA",
"SEMI",
"PLUS",
"MINUS",
"STAR",
"SLASH",
"VBAR",
"AMPER",
"LESS",
"GREATER",
"EQUAL",
"DOT",
"PERCENT",
"LBRACE",
"RBRACE",
"EQEQUAL",
"NOTEQUAL",
"LESSEQUAL",
"GREATEREQUAL",
"TILDE",
"CIRCUMFLEX",
"LEFTSHIFT",
"RIGHTSHIFT",
"DOUBLESTAR",
"PLUSEQUAL",
"MINEQUAL",
"STAREQUAL",
"SLASHEQUAL",
"PERCENTEQUAL",
"AMPEREQUAL",
"VBAREQUAL",
"CIRCUMFLEXEQUAL",
"LEFTSHIFTEQUAL",
"RIGHTSHIFTEQUAL",
"DOUBLESTAREQUAL",
"DOUBLESLASH",
"DOUBLESLASHEQUAL",
"AT",
"ATEQUAL",
"RARROW",
"ELLIPSIS",
"OP",
"<ERRORTOKEN>",
"<COMMENT>",
"<NL>",
"<ENCODING>",
"<N_TOKENS>",
};
/* Return the token corresponding to a single character */
int
PyToken_OneChar(int c1)
{
switch (c1) {
case '%': return PERCENT;
case '&': return AMPER;
case '(': return LPAR;
case ')': return RPAR;
case '*': return STAR;
case '+': return PLUS;
case ',': return COMMA;
case '-': return MINUS;
case '.': return DOT;
case '/': return SLASH;
case ':': return COLON;
case ';': return SEMI;
case '<': return LESS;
case '=': return EQUAL;
case '>': return GREATER;
case '@': return AT;
case '[': return LSQB;
case ']': return RSQB;
case '^': return CIRCUMFLEX;
case '{': return LBRACE;
case '|': return VBAR;
case '}': return RBRACE;
case '~': return TILDE;
}
return OP;
}
int
PyToken_TwoChars(int c1, int c2)
{
switch (c1) {
case '!':
switch (c2) {
case '=': return NOTEQUAL;
}
break;
case '%':
switch (c2) {
case '=': return PERCENTEQUAL;
}
break;
case '&':
switch (c2) {
case '=': return AMPEREQUAL;
}
break;
case '*':
switch (c2) {
case '*': return DOUBLESTAR;
case '=': return STAREQUAL;
}
break;
case '+':
switch (c2) {
case '=': return PLUSEQUAL;
}
break;
case '-':
switch (c2) {
case '=': return MINEQUAL;
case '>': return RARROW;
}
break;
case '/':
switch (c2) {
case '/': return DOUBLESLASH;
case '=': return SLASHEQUAL;
}
break;
case '<':
switch (c2) {
case '<': return LEFTSHIFT;
case '=': return LESSEQUAL;
case '>': return NOTEQUAL;
}
break;
case '=':
switch (c2) {
case '=': return EQEQUAL;
}
break;
case '>':
switch (c2) {
case '=': return GREATEREQUAL;
case '>': return RIGHTSHIFT;
}
break;
case '@':
switch (c2) {
case '=': return ATEQUAL;
}
break;
case '^':
switch (c2) {
case '=': return CIRCUMFLEXEQUAL;
}
break;
case '|':
switch (c2) {
case '=': return VBAREQUAL;
}
break;
}
return OP;
}
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
switch (c1) {
case '*':
switch (c2) {
case '*':
switch (c3) {
case '=': return DOUBLESTAREQUAL;
}
break;
}
break;
case '.':
switch (c2) {
case '.':
switch (c3) {
case '.': return ELLIPSIS;
}
break;
}
break;
case '/':
switch (c2) {
case '/':
switch (c3) {
case '=': return DOUBLESLASHEQUAL;
}
break;
}
break;
case '<':
switch (c2) {
case '<':
switch (c3) {
case '=': return LEFTSHIFTEQUAL;
}
break;
}
break;
case '>':
switch (c2) {
case '>':
switch (c3) {
case '=': return RIGHTSHIFTEQUAL;
}
break;
}
break;
}
return OP;
}
......@@ -48,72 +48,6 @@ static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
/* Token names */
const char *_PyParser_TokenNames[] = {
"ENDMARKER",
"NAME",
"NUMBER",
"STRING",
"NEWLINE",
"INDENT",
"DEDENT",
"LPAR",
"RPAR",
"LSQB",
"RSQB",
"COLON",
"COMMA",
"SEMI",
"PLUS",
"MINUS",
"STAR",
"SLASH",
"VBAR",
"AMPER",
"LESS",
"GREATER",
"EQUAL",
"DOT",
"PERCENT",
"LBRACE",
"RBRACE",
"EQEQUAL",
"NOTEQUAL",
"LESSEQUAL",
"GREATEREQUAL",
"TILDE",
"CIRCUMFLEX",
"LEFTSHIFT",
"RIGHTSHIFT",
"DOUBLESTAR",
"PLUSEQUAL",
"MINEQUAL",
"STAREQUAL",
"SLASHEQUAL",
"PERCENTEQUAL",
"AMPEREQUAL",
"VBAREQUAL",
"CIRCUMFLEXEQUAL",
"LEFTSHIFTEQUAL",
"RIGHTSHIFTEQUAL",
"DOUBLESTAREQUAL",
"DOUBLESLASH",
"DOUBLESLASHEQUAL",
"AT",
"ATEQUAL",
"RARROW",
"ELLIPSIS",
/* This table must match the #defines in token.h! */
"OP",
"<ERRORTOKEN>",
"COMMENT",
"NL",
"ENCODING",
"<N_TOKENS>"
};
/* Create and initialize a new tok_state structure */
static struct tok_state *
......@@ -1114,177 +1048,6 @@ tok_backup(struct tok_state *tok, int c)
}
/* Return the token corresponding to a single character */
int
PyToken_OneChar(int c)
{
switch (c) {
case '(': return LPAR;
case ')': return RPAR;
case '[': return LSQB;
case ']': return RSQB;
case ':': return COLON;
case ',': return COMMA;
case ';': return SEMI;
case '+': return PLUS;
case '-': return MINUS;
case '*': return STAR;
case '/': return SLASH;
case '|': return VBAR;
case '&': return AMPER;
case '<': return LESS;
case '>': return GREATER;
case '=': return EQUAL;
case '.': return DOT;
case '%': return PERCENT;
case '{': return LBRACE;
case '}': return RBRACE;
case '^': return CIRCUMFLEX;
case '~': return TILDE;
case '@': return AT;
default: return OP;
}
}
int
PyToken_TwoChars(int c1, int c2)
{
switch (c1) {
case '=':
switch (c2) {
case '=': return EQEQUAL;
}
break;
case '!':
switch (c2) {
case '=': return NOTEQUAL;
}
break;
case '<':
switch (c2) {
case '>': return NOTEQUAL;
case '=': return LESSEQUAL;
case '<': return LEFTSHIFT;
}
break;
case '>':
switch (c2) {
case '=': return GREATEREQUAL;
case '>': return RIGHTSHIFT;
}
break;
case '+':
switch (c2) {
case '=': return PLUSEQUAL;
}
break;
case '-':
switch (c2) {
case '=': return MINEQUAL;
case '>': return RARROW;
}
break;
case '*':
switch (c2) {
case '*': return DOUBLESTAR;
case '=': return STAREQUAL;
}
break;
case '/':
switch (c2) {
case '/': return DOUBLESLASH;
case '=': return SLASHEQUAL;
}
break;
case '|':
switch (c2) {
case '=': return VBAREQUAL;
}
break;
case '%':
switch (c2) {
case '=': return PERCENTEQUAL;
}
break;
case '&':
switch (c2) {
case '=': return AMPEREQUAL;
}
break;
case '^':
switch (c2) {
case '=': return CIRCUMFLEXEQUAL;
}
break;
case '@':
switch (c2) {
case '=': return ATEQUAL;
}
break;
}
return OP;
}
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
switch (c1) {
case '<':
switch (c2) {
case '<':
switch (c3) {
case '=':
return LEFTSHIFTEQUAL;
}
break;
}
break;
case '>':
switch (c2) {
case '>':
switch (c3) {
case '=':
return RIGHTSHIFTEQUAL;
}
break;
}
break;
case '*':
switch (c2) {
case '*':
switch (c3) {
case '=':
return DOUBLESTAREQUAL;
}
break;
}
break;
case '/':
switch (c2) {
case '/':
switch (c3) {
case '=':
return DOUBLESLASHEQUAL;
}
break;
}
break;
case '.':
switch (c2) {
case '.':
switch (c3) {
case '.':
return ELLIPSIS;
}
break;
}
break;
}
return OP;
}
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
......
#! /usr/bin/env python3
# This script generates the symbol.py source file.
import sys
import re
def main(inFileName="Include/graminit.h", outFileName="Lib/symbol.py"):
try:
fp = open(inFileName)
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(1)
with fp:
lines = fp.read().split("\n")
prog = re.compile(
"#define[ \t][ \t]*([A-Z0-9][A-Z0-9_]*)[ \t][ \t]*([0-9][0-9]*)",
re.IGNORECASE)
tokens = {}
for line in lines:
match = prog.match(line)
if match:
name, val = match.group(1, 2)
val = int(val)
tokens[val] = name # reverse so we can sort them...
keys = sorted(tokens.keys())
# load the output skeleton from the target:
try:
fp = open(outFileName)
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(2)
with fp:
format = fp.read().split("\n")
try:
start = format.index("#--start constants--") + 1
end = format.index("#--end constants--")
except ValueError:
sys.stderr.write("target does not contain format markers")
sys.exit(3)
lines = []
for val in keys:
lines.append("%s = %d" % (tokens[val], val))
format[start:end] = lines
try:
fp = open(outFileName, 'w')
except OSError as err:
sys.stderr.write("I/O error: %s\n" % str(err))
sys.exit(4)
with fp:
fp.write("\n".join(format))
if __name__ == '__main__':
main(*sys.argv[1:])
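
A hedged usage note: besides the new "make regen-symbol" target, the script can be run directly, assuming the current directory is the top of a built CPython source tree (as the regenerated header comment in "Lib/symbol.py" describes). A minimal sketch:

    import subprocess, sys

    # Regenerate Lib/symbol.py from Include/graminit.h -- the same command
    # that the Makefile's regen-symbol target runs.
    subprocess.run(
        [sys.executable, "Tools/scripts/generate_symbol_py.py",
         "Include/graminit.h", "Lib/symbol.py"],
        check=True,
    )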
#! /usr/bin/env python3
# This script generates token related files from Grammar/Tokens:
#
# Doc/library/token-list.inc
# Include/token.h
# Parser/token.c
# Lib/token.py
NT_OFFSET = 256
def load_tokens(path):
tok_names = []
string_to_tok = {}
ERRORTOKEN = None
with open(path) as fp:
for line in fp:
line = line.strip()
# strip comments
i = line.find('#')
if i >= 0:
line = line[:i].strip()
if not line:
continue
fields = line.split()
name = fields[0]
value = len(tok_names)
if name == 'ERRORTOKEN':
ERRORTOKEN = value
string = fields[1] if len(fields) > 1 else None
if string:
string = eval(string)
string_to_tok[string] = value
tok_names.append(name)
return tok_names, ERRORTOKEN, string_to_tok
def update_file(file, content):
try:
with open(file, 'r') as fobj:
if fobj.read() == content:
return False
except (OSError, ValueError):
pass
with open(file, 'w') as fobj:
fobj.write(content)
return True
token_h_template = """\
/* Auto-generated by Tools/scripts/generate_token.py */
/* Token types */
#ifndef Py_LIMITED_API
#ifndef Py_TOKEN_H
#define Py_TOKEN_H
#ifdef __cplusplus
extern "C" {
#endif
#undef TILDE /* Prevent clash of our definition with system macro. Ex AIX, ioctl.h */
%s\
#define N_TOKENS %d
#define NT_OFFSET %d
/* Special definitions for cooperation with parser */
#define ISTERMINAL(x) ((x) < NT_OFFSET)
#define ISNONTERMINAL(x) ((x) >= NT_OFFSET)
#define ISEOF(x) ((x) == ENDMARKER)
PyAPI_DATA(const char * const) _PyParser_TokenNames[]; /* Token names */
PyAPI_FUNC(int) PyToken_OneChar(int);
PyAPI_FUNC(int) PyToken_TwoChars(int, int);
PyAPI_FUNC(int) PyToken_ThreeChars(int, int, int);
#ifdef __cplusplus
}
#endif
#endif /* !Py_TOKEN_H */
#endif /* Py_LIMITED_API */
"""
def make_h(infile, outfile='Include/token.h'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
defines = []
for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
defines.append("#define %-15s %d\n" % (name, value))
if update_file(outfile, token_h_template % (
''.join(defines),
len(tok_names),
NT_OFFSET
)):
print("%s regenerated from %s" % (outfile, infile))
token_c_template = """\
/* Auto-generated by Tools/scripts/generate_token.py */
#include "Python.h"
#include "token.h"
/* Token names */
const char * const _PyParser_TokenNames[] = {
%s\
};
/* Return the token corresponding to a single character */
int
PyToken_OneChar(int c1)
{
%s\
return OP;
}
int
PyToken_TwoChars(int c1, int c2)
{
%s\
return OP;
}
int
PyToken_ThreeChars(int c1, int c2, int c3)
{
%s\
return OP;
}
"""
def generate_chars_to_token(mapping, n=1):
result = []
write = result.append
indent = ' ' * n
write(indent)
write('switch (c%d) {\n' % (n,))
for c in sorted(mapping):
write(indent)
value = mapping[c]
if isinstance(value, dict):
write("case '%s':\n" % (c,))
write(generate_chars_to_token(value, n + 1))
write(indent)
write(' break;\n')
else:
write("case '%s': return %s;\n" % (c, value))
write(indent)
write('}\n')
return ''.join(result)
def make_c(infile, outfile='Parser/token.c'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
string_to_tok['<>'] = string_to_tok['!=']
chars_to_token = {}
for string, value in string_to_tok.items():
assert 1 <= len(string) <= 3
name = tok_names[value]
m = chars_to_token.setdefault(len(string), {})
for c in string[:-1]:
m = m.setdefault(c, {})
m[string[-1]] = name
names = []
for value, name in enumerate(tok_names):
if value >= ERRORTOKEN:
name = '<%s>' % name
names.append(' "%s",\n' % name)
names.append(' "<N_TOKENS>",\n')
if update_file(outfile, token_c_template % (
''.join(names),
generate_chars_to_token(chars_to_token[1]),
generate_chars_to_token(chars_to_token[2]),
generate_chars_to_token(chars_to_token[3])
)):
print("%s regenerated from %s" % (outfile, infile))
token_inc_template = """\
.. Auto-generated by Tools/scripts/generate_token.py
%s
.. data:: N_TOKENS
.. data:: NT_OFFSET
"""
def make_rst(infile, outfile='Doc/library/token-list.inc'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
tok_to_string = {value: s for s, value in string_to_tok.items()}
names = []
for value, name in enumerate(tok_names[:ERRORTOKEN + 1]):
names.append('.. data:: %s' % (name,))
if value in tok_to_string:
names.append('')
names.append(' Token value for ``"%s"``.' % tok_to_string[value])
names.append('')
if update_file(outfile, token_inc_template % '\n'.join(names)):
print("%s regenerated from %s" % (outfile, infile))
token_py_template = '''\
"""Token constants."""
# Auto-generated by Tools/scripts/generate_token.py
__all__ = ['tok_name', 'ISTERMINAL', 'ISNONTERMINAL', 'ISEOF']
%s
N_TOKENS = %d
# Special definitions for cooperation with parser
NT_OFFSET = %d
tok_name = {value: name
for name, value in globals().items()
if isinstance(value, int) and not name.startswith('_')}
__all__.extend(tok_name.values())
EXACT_TOKEN_TYPES = {
%s
}
def ISTERMINAL(x):
return x < NT_OFFSET
def ISNONTERMINAL(x):
return x >= NT_OFFSET
def ISEOF(x):
return x == ENDMARKER
'''
def make_py(infile, outfile='Lib/token.py'):
tok_names, ERRORTOKEN, string_to_tok = load_tokens(infile)
constants = []
for value, name in enumerate(tok_names):
constants.append('%s = %d' % (name, value))
constants.insert(ERRORTOKEN,
"# These aren't used by the C tokenizer but are needed for tokenize.py")
token_types = []
for s, value in sorted(string_to_tok.items()):
token_types.append(' %r: %s,' % (s, tok_names[value]))
if update_file(outfile, token_py_template % (
'\n'.join(constants),
len(tok_names),
NT_OFFSET,
'\n'.join(token_types),
)):
print("%s regenerated from %s" % (outfile, infile))
def main(op, infile='Grammar/Tokens', *args):
make = globals()['make_' + op]
make(infile, *args)
if __name__ == '__main__':
import sys
main(*sys.argv[1:])
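
A minimal sketch, assuming the current directory is the top of the source tree and "Tools/scripts" is added to sys.path, of what load_tokens() extracts from the Grammar/Tokens file listed above; the asserted relations follow directly from that listing:

    import sys
    sys.path.insert(0, "Tools/scripts")
    from generate_token import load_tokens

    # (token names in file order, index of ERRORTOKEN, operator string -> value)
    tok_names, errortoken, string_to_tok = load_tokens("Grammar/Tokens")

    assert tok_names[:3] == ["ENDMARKER", "NAME", "NUMBER"]
    assert errortoken == tok_names.index("ERRORTOKEN")
    assert string_to_tok["("] == tok_names.index("LPAR")
    assert string_to_tok["**="] == tok_names.index("DOUBLESTAREQUAL")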