Commit 1f24a719 authored by Pablo Galindo's avatar Pablo Galindo Committed by GitHub

bpo-35808: Retire pgen and use pgen2 to generate the parser (GH-11814)

Pgen is the oldest piece of technology in the CPython repository, building it requires various #if[n]def PGEN hacks in other parts of the code and it also depends more and more on CPython internals. This commit removes the old pgen C code and replaces it with a new version implemented in pure Python. This is a modified and adapted version of lib2to3/pgen2 that can generate grammar files compatible with the current parser.

This commit also eliminates all the #ifdef and code branches related to pgen, simplifying the code and making it more maintainable. The regen-grammar step now uses $(PYTHON_FOR_REGEN), which can be any version of the interpreter, so the new pgen code maintains compatibility with older versions of the interpreter (this also allows regenerating the grammar with the current CI solution that uses Python 3.5). The new pgen Python module also makes use of the Grammar/Tokens file that holds the token specification, so it is always kept in sync and avoids having to maintain duplicate token definitions.
parent 7eebbbd5
...@@ -73,8 +73,6 @@ PCbuild/arm32/ ...@@ -73,8 +73,6 @@ PCbuild/arm32/
PCbuild/obj/ PCbuild/obj/
PCbuild/win32/ PCbuild/win32/
.purify .purify
Parser/pgen
Parser/pgen.exe
__pycache__ __pycache__
autom4te.cache autom4te.cache
build/ build/
......
#ifndef Py_METAGRAMMAR_H
#define Py_METAGRAMMAR_H
#ifdef __cplusplus
extern "C" {
#endif

/* Non-terminal symbol numbers of the meta-grammar: the grammar in which
   grammar files themselves are written.  Values start at 256 so they can
   never collide with token numbers (tokens stay below 256). */
#define MSTART 256
#define RULE 257
#define RHS 258
#define ALT 259
#define ITEM 260
#define ATOM 261

#ifdef __cplusplus
}
#endif
#endif /* !Py_METAGRAMMAR_H */
...@@ -12,10 +12,7 @@ extern "C" { ...@@ -12,10 +12,7 @@ extern "C" {
typedef struct { typedef struct {
int error; int error;
#ifndef PGEN
/* The filename is useless for pgen, see comment in tok_state structure */
PyObject *filename; PyObject *filename;
#endif
int lineno; int lineno;
int offset; int offset;
char *text; /* UTF-8-encoded string */ char *text; /* UTF-8-encoded string */
......
#ifndef Py_PGEN_H
#define Py_PGEN_H
#ifdef __cplusplus
extern "C" {
#endif

/* Parser generator interface */

/* Return the static grammar used to parse grammar files themselves. */
extern grammar *meta_grammar(void);

struct _node;
/* Build a grammar from the parse tree of a grammar file. */
extern grammar *pgen(struct _node *);

#ifdef __cplusplus
}
#endif
#endif /* !Py_PGEN_H */
...@@ -290,40 +290,21 @@ LIBFFI_INCLUDEDIR= @LIBFFI_INCLUDEDIR@ ...@@ -290,40 +290,21 @@ LIBFFI_INCLUDEDIR= @LIBFFI_INCLUDEDIR@
########################################################################## ##########################################################################
# Parser # Parser
PGEN= Parser/pgen$(EXE)
POBJS= \ POBJS= \
Parser/acceler.o \ Parser/acceler.o \
Parser/grammar1.o \ Parser/grammar1.o \
Parser/listnode.o \ Parser/listnode.o \
Parser/node.o \ Parser/node.o \
Parser/parser.o \ Parser/parser.o \
Parser/bitset.o \
Parser/metagrammar.o \
Parser/firstsets.o \
Parser/grammar.o \
Parser/token.o \ Parser/token.o \
Parser/pgen.o
PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/parsetok.o Parser/tokenizer.o PARSER_OBJS= $(POBJS) Parser/myreadline.o Parser/parsetok.o Parser/tokenizer.o
PGOBJS= \
Objects/obmalloc.o \
Python/dynamic_annotations.o \
Python/mysnprintf.o \
Python/pyctype.o \
Parser/tokenizer_pgen.o \
Parser/printgrammar.o \
Parser/parsetok_pgen.o \
Parser/pgenmain.o
PARSER_HEADERS= \ PARSER_HEADERS= \
$(srcdir)/Parser/parser.h \ $(srcdir)/Parser/parser.h \
$(srcdir)/Include/parsetok.h \ $(srcdir)/Include/parsetok.h \
$(srcdir)/Parser/tokenizer.h $(srcdir)/Parser/tokenizer.h
PGENOBJS= $(POBJS) $(PGOBJS)
########################################################################## ##########################################################################
# Python # Python
...@@ -802,31 +783,18 @@ Python/sysmodule.o: $(srcdir)/Python/sysmodule.c Makefile ...@@ -802,31 +783,18 @@ Python/sysmodule.o: $(srcdir)/Python/sysmodule.c Makefile
$(IO_OBJS): $(IO_H) $(IO_OBJS): $(IO_H)
$(PGEN): $(PGENOBJS)
$(CC) $(OPT) $(PY_CORE_LDFLAGS) $(PGENOBJS) $(LIBS) -o $(PGEN)
.PHONY: regen-grammar .PHONY: regen-grammar
regen-grammar: $(PGEN) regen-grammar: regen-token
# Regenerate Include/graminit.h and Python/graminit.c # Regenerate Include/graminit.h and Python/graminit.c
# from Grammar/Grammar using pgen # from Grammar/Grammar using pgen
@$(MKDIR_P) Include @$(MKDIR_P) Include
$(PGEN) $(srcdir)/Grammar/Grammar \ $(PYTHON_FOR_REGEN) -m Parser.pgen $(srcdir)/Grammar/Grammar \
$(srcdir)/Grammar/Tokens \
$(srcdir)/Include/graminit.h.new \ $(srcdir)/Include/graminit.h.new \
$(srcdir)/Python/graminit.c.new $(srcdir)/Python/graminit.c.new
$(UPDATE_FILE) $(srcdir)/Include/graminit.h $(srcdir)/Include/graminit.h.new $(UPDATE_FILE) $(srcdir)/Include/graminit.h $(srcdir)/Include/graminit.h.new
$(UPDATE_FILE) $(srcdir)/Python/graminit.c $(srcdir)/Python/graminit.c.new $(UPDATE_FILE) $(srcdir)/Python/graminit.c $(srcdir)/Python/graminit.c.new
Parser/grammar.o: $(srcdir)/Parser/grammar.c \
$(srcdir)/Include/token.h \
$(srcdir)/Include/grammar.h
Parser/metagrammar.o: $(srcdir)/Parser/metagrammar.c
Parser/tokenizer_pgen.o: $(srcdir)/Parser/tokenizer.c
Parser/parsetok_pgen.o: $(srcdir)/Parser/parsetok.c
Parser/printgrammar.o: $(srcdir)/Parser/printgrammar.c
Parser/pgenmain.o: $(srcdir)/Include/parsetok.h
.PHONY=regen-ast .PHONY=regen-ast
regen-ast: regen-ast:
# Regenerate Include/Python-ast.h using Parser/asdl_c.py -h # Regenerate Include/Python-ast.h using Parser/asdl_c.py -h
...@@ -1016,7 +984,6 @@ PYTHON_HEADERS= \ ...@@ -1016,7 +984,6 @@ PYTHON_HEADERS= \
$(srcdir)/Include/longobject.h \ $(srcdir)/Include/longobject.h \
$(srcdir)/Include/marshal.h \ $(srcdir)/Include/marshal.h \
$(srcdir)/Include/memoryobject.h \ $(srcdir)/Include/memoryobject.h \
$(srcdir)/Include/metagrammar.h \
$(srcdir)/Include/methodobject.h \ $(srcdir)/Include/methodobject.h \
$(srcdir)/Include/modsupport.h \ $(srcdir)/Include/modsupport.h \
$(srcdir)/Include/moduleobject.h \ $(srcdir)/Include/moduleobject.h \
...@@ -1028,7 +995,6 @@ PYTHON_HEADERS= \ ...@@ -1028,7 +995,6 @@ PYTHON_HEADERS= \
$(srcdir)/Include/osdefs.h \ $(srcdir)/Include/osdefs.h \
$(srcdir)/Include/osmodule.h \ $(srcdir)/Include/osmodule.h \
$(srcdir)/Include/patchlevel.h \ $(srcdir)/Include/patchlevel.h \
$(srcdir)/Include/pgen.h \
$(srcdir)/Include/pgenheaders.h \ $(srcdir)/Include/pgenheaders.h \
$(srcdir)/Include/pyarena.h \ $(srcdir)/Include/pyarena.h \
$(srcdir)/Include/pycapsule.h \ $(srcdir)/Include/pycapsule.h \
...@@ -1771,7 +1737,7 @@ profile-removal: ...@@ -1771,7 +1737,7 @@ profile-removal:
rm -f profile-run-stamp rm -f profile-run-stamp
clobber: clean profile-removal clobber: clean profile-removal
-rm -f $(BUILDPYTHON) $(PGEN) $(LIBRARY) $(LDLIBRARY) $(DLLLIBRARY) \ -rm -f $(BUILDPYTHON) $(LIBRARY) $(LDLIBRARY) $(DLLLIBRARY) \
tags TAGS \ tags TAGS \
config.cache config.log pyconfig.h Modules/config.c config.cache config.log pyconfig.h Modules/config.c
-rm -rf build platform -rm -rf build platform
......
Retire pgen and use a modified version of pgen2 to generate the parser.
Patch by Pablo Galindo.
...@@ -92,14 +92,6 @@ wchar_t *Py_DecodeLocale(const char* arg, size_t *size) ...@@ -92,14 +92,6 @@ wchar_t *Py_DecodeLocale(const char* arg, size_t *size)
return w; return w;
} }
/* Parser/pgenmain.c */
/* Coverity modeling stub: marks the filename argument of getgrammar()
   as a tainted-data sink for static analysis.  The uninitialized
   return value is deliberate -- modeling functions are never executed,
   only interpreted by the analyzer. */
grammar *getgrammar(const char *filename)
{
    grammar *g;
    __coverity_tainted_data_sink__(filename);
    return g;
}
/* Python/marshal.c */ /* Python/marshal.c */
static Py_ssize_t r_string(char *s, Py_ssize_t n, RFILE *p) static Py_ssize_t r_string(char *s, Py_ssize_t n, RFILE *p)
......
...@@ -161,7 +161,6 @@ ...@@ -161,7 +161,6 @@
<ClInclude Include="..\Include\longobject.h" /> <ClInclude Include="..\Include\longobject.h" />
<ClInclude Include="..\Include\marshal.h" /> <ClInclude Include="..\Include\marshal.h" />
<ClInclude Include="..\Include\memoryobject.h" /> <ClInclude Include="..\Include\memoryobject.h" />
<ClInclude Include="..\Include\metagrammar.h" />
<ClInclude Include="..\Include\methodobject.h" /> <ClInclude Include="..\Include\methodobject.h" />
<ClInclude Include="..\Include\modsupport.h" /> <ClInclude Include="..\Include\modsupport.h" />
<ClInclude Include="..\Include\moduleobject.h" /> <ClInclude Include="..\Include\moduleobject.h" />
...@@ -175,7 +174,6 @@ ...@@ -175,7 +174,6 @@
<ClInclude Include="..\Include\osmodule.h" /> <ClInclude Include="..\Include\osmodule.h" />
<ClInclude Include="..\Include\parsetok.h" /> <ClInclude Include="..\Include\parsetok.h" />
<ClInclude Include="..\Include\patchlevel.h" /> <ClInclude Include="..\Include\patchlevel.h" />
<ClInclude Include="..\Include\pgen.h" />
<ClInclude Include="..\Include\pgenheaders.h" /> <ClInclude Include="..\Include\pgenheaders.h" />
<ClInclude Include="..\Include\pyhash.h" /> <ClInclude Include="..\Include\pyhash.h" />
<ClInclude Include="..\Include\py_curses.h" /> <ClInclude Include="..\Include\py_curses.h" />
...@@ -372,12 +370,8 @@ ...@@ -372,12 +370,8 @@
<ClCompile Include="..\Objects\unicodeobject.c" /> <ClCompile Include="..\Objects\unicodeobject.c" />
<ClCompile Include="..\Objects\weakrefobject.c" /> <ClCompile Include="..\Objects\weakrefobject.c" />
<ClCompile Include="..\Parser\acceler.c" /> <ClCompile Include="..\Parser\acceler.c" />
<ClCompile Include="..\Parser\bitset.c" />
<ClCompile Include="..\Parser\firstsets.c" />
<ClCompile Include="..\Parser\grammar.c" />
<ClCompile Include="..\Parser\grammar1.c" /> <ClCompile Include="..\Parser\grammar1.c" />
<ClCompile Include="..\Parser\listnode.c" /> <ClCompile Include="..\Parser\listnode.c" />
<ClCompile Include="..\Parser\metagrammar.c" />
<ClCompile Include="..\Parser\myreadline.c" /> <ClCompile Include="..\Parser\myreadline.c" />
<ClCompile Include="..\Parser\node.c" /> <ClCompile Include="..\Parser\node.c" />
<ClCompile Include="..\Parser\parser.c" /> <ClCompile Include="..\Parser\parser.c" />
......
...@@ -234,9 +234,6 @@ ...@@ -234,9 +234,6 @@
<ClInclude Include="..\Include\memoryobject.h"> <ClInclude Include="..\Include\memoryobject.h">
<Filter>Include</Filter> <Filter>Include</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="..\Include\metagrammar.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\methodobject.h"> <ClInclude Include="..\Include\methodobject.h">
<Filter>Include</Filter> <Filter>Include</Filter>
</ClInclude> </ClInclude>
...@@ -270,9 +267,6 @@ ...@@ -270,9 +267,6 @@
<ClInclude Include="..\Include\patchlevel.h"> <ClInclude Include="..\Include\patchlevel.h">
<Filter>Include</Filter> <Filter>Include</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="..\Include\pgen.h">
<Filter>Include</Filter>
</ClInclude>
<ClInclude Include="..\Include\pgenheaders.h"> <ClInclude Include="..\Include\pgenheaders.h">
<Filter>Include</Filter> <Filter>Include</Filter>
</ClInclude> </ClInclude>
...@@ -836,24 +830,12 @@ ...@@ -836,24 +830,12 @@
<ClCompile Include="..\Parser\acceler.c"> <ClCompile Include="..\Parser\acceler.c">
<Filter>Parser</Filter> <Filter>Parser</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\Parser\bitset.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\firstsets.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\grammar.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\grammar1.c"> <ClCompile Include="..\Parser\grammar1.c">
<Filter>Parser</Filter> <Filter>Parser</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\Parser\listnode.c"> <ClCompile Include="..\Parser\listnode.c">
<Filter>Parser</Filter> <Filter>Parser</Filter>
</ClCompile> </ClCompile>
<ClCompile Include="..\Parser\metagrammar.c">
<Filter>Parser</Filter>
</ClCompile>
<ClCompile Include="..\Parser\myreadline.c"> <ClCompile Include="..\Parser\myreadline.c">
<Filter>Parser</Filter> <Filter>Parser</Filter>
</ClCompile> </ClCompile>
......
/* Bitset primitives used by the parser generator */
#include "pgenheaders.h"
#include "bitset.h"
bitset
newbitset(int nbits)
{
int nbytes = NBYTES(nbits);
bitset ss = (char *)PyObject_MALLOC(sizeof(BYTE) * nbytes);
if (ss == NULL)
Py_FatalError("no mem for bitset");
ss += nbytes;
while (--nbytes >= 0)
*--ss = 0;
return ss;
}
/* Release a bitset previously obtained from newbitset(). */
void
delbitset(bitset ss)
{
    PyObject_FREE(ss);
}
/* Set bit ibit in ss.  Returns 1 if the bit was newly set and 0 if it
   was already set before the call. */
int
addbit(bitset ss, int ibit)
{
    int ibyte = BIT2BYTE(ibit);
    BYTE mask = BIT2MASK(ibit);
    int was_set = (ss[ibyte] & mask) != 0;

    ss[ibyte] |= mask;          /* harmless if already set */
    return !was_set;
}
#if 0 /* Now a macro */
/* Report whether bit ibit is set in ss.  Superseded by the testbit
   macro (see bitset.h); kept here only as reference. */
int
testbit(bitset ss, int ibit)
{
    return (ss[BIT2BYTE(ibit)] & BIT2MASK(ibit)) != 0;
}
#endif
/* Compare two bitsets of nbits bits; return 1 if every byte matches,
   0 at the first mismatch. */
int
samebitset(bitset ss1, bitset ss2, int nbits)
{
    int nbytes = NBYTES(nbits);

    while (nbytes-- > 0) {
        if (ss1[nbytes] != ss2[nbytes])
            return 0;
    }
    return 1;
}
/* In-place union: OR every bit of ss2 into ss1 (both nbits wide). */
void
mergebitset(bitset ss1, bitset ss2, int nbits)
{
    int nbytes = NBYTES(nbits);
    int i;

    for (i = 0; i < nbytes; i++)
        ss1[i] |= ss2[i];
}
/* Computation of FIRST sets */
#include "pgenheaders.h"
#include "grammar.h"
#include "token.h"
extern int Py_DebugFlag;
/* Forward */
static void calcfirstset(grammar *, dfa *);
/* Compute the FIRST set of every rule in the grammar that does not
   already have one; calcfirstset() fills in d_first on demand and
   recurses into referenced rules. */
void
addfirstsets(grammar *g)
{
    int i;

    if (Py_DebugFlag)
        printf("Adding FIRST sets ...\n");
    for (i = 0; i < g->g_ndfas; i++) {
        dfa *d = &g->g_dfa[i];
        if (d->d_first == NULL)
            calcfirstset(g, d);
    }
}
/* Compute the FIRST set of one rule (dfa) and store it in d->d_first
   as a bitset over the grammar's label numbers.  While the set is
   being computed, d_first temporarily points at a shared "dummy"
   bitset so that (left-)recursive references are detected and
   reported instead of looping forever. */
static void
calcfirstset(grammar *g, dfa *d)
{
    int i, j;
    state *s;
    arc *a;
    int nsyms;
    int *sym;
    int nbits;
    static bitset dummy;    /* sentinel: "computation in progress" */
    bitset result;
    int type;
    dfa *d1;
    label *l0;

    if (Py_DebugFlag)
        printf("Calculate FIRST set for '%s'\n", d->d_name);

    if (dummy == NULL)
        dummy = newbitset(1);
    if (d->d_first == dummy) {
        /* Re-entered while this rule's own set is still being built. */
        fprintf(stderr, "Left-recursion for '%s'\n", d->d_name);
        return;
    }
    if (d->d_first != NULL) {
        fprintf(stderr, "Re-calculating FIRST set for '%s' ???\n",
            d->d_name);
    }
    d->d_first = dummy;

    l0 = g->g_ll.ll_label;
    nbits = g->g_ll.ll_nlabels;
    result = newbitset(nbits);

    /* sym[] records the labels already examined so each arc label is
       processed at most once; it starts out holding the rule's own
       label. */
    sym = (int *)PyObject_MALLOC(sizeof(int));
    if (sym == NULL)
        Py_FatalError("no mem for new sym in calcfirstset");
    nsyms = 1;
    sym[0] = findlabel(&g->g_ll, d->d_type, (char *)NULL);

    /* Walk every arc leaving the initial state. */
    s = &d->d_state[d->d_initial];
    for (i = 0; i < s->s_narcs; i++) {
        a = &s->s_arc[i];
        for (j = 0; j < nsyms; j++) {
            if (sym[j] == a->a_lbl)
                break;
        }
        if (j >= nsyms) { /* New label */
            sym = (int *)PyObject_REALLOC(sym,
                                          sizeof(int) * (nsyms + 1));
            if (sym == NULL)
                Py_FatalError(
                    "no mem to resize sym in calcfirstset");
            sym[nsyms++] = a->a_lbl;
            type = l0[a->a_lbl].lb_type;
            if (ISNONTERMINAL(type)) {
                /* Union in the FIRST set of the referenced rule,
                   computing it on demand. */
                d1 = PyGrammar_FindDFA(g, type);
                if (d1->d_first == dummy) {
                    fprintf(stderr,
                            "Left-recursion below '%s'\n",
                            d->d_name);
                }
                else {
                    if (d1->d_first == NULL)
                        calcfirstset(g, d1);
                    mergebitset(result,
                                d1->d_first, nbits);
                }
            }
            else if (ISTERMINAL(type)) {
                addbit(result, a->a_lbl);
            }
        }
    }
    d->d_first = result;
    if (Py_DebugFlag) {
        printf("FIRST set for '%s': {", d->d_name);
        for (i = 0; i < nbits; i++) {
            if (testbit(result, i))
                printf(" %s", PyGrammar_LabelRepr(&l0[i]));
        }
        printf(" }\n");
    }
    PyObject_FREE(sym);
}
/* Grammar implementation */
#include "Python.h"
#include "pgenheaders.h"
#include <ctype.h>
#include "token.h"
#include "grammar.h"
extern int Py_DebugFlag;
/* Allocate a fresh, empty grammar whose start symbol is `start`.
   Aborts through Py_FatalError() on allocation failure, so the result
   is never NULL. */
grammar *
newgrammar(int start)
{
    grammar *g = (grammar *)PyObject_MALLOC(sizeof(grammar));

    if (g == NULL)
        Py_FatalError("no mem for new grammar");
    g->g_ndfas = 0;
    g->g_dfa = NULL;
    g->g_start = start;
    g->g_ll.ll_nlabels = 0;
    g->g_ll.ll_label = NULL;
    g->g_accel = 0;     /* accelerators not yet added */
    return g;
}
/* Free a grammar and everything it owns: every DFA's name, arc lists
   and state array, the DFA array itself, and the label list.  Note the
   mixed allocators: d_name and lb_str come from strdup() and are
   released with free(), everything else was obtained through
   PyObject_MALLOC/REALLOC and is released with PyObject_FREE. */
void
freegrammar(grammar *g)
{
    int i;
    for (i = 0; i < g->g_ndfas; i++) {
        free(g->g_dfa[i].d_name);
        for (int j = 0; j < g->g_dfa[i].d_nstates; j++)
            PyObject_FREE(g->g_dfa[i].d_state[j].s_arc);
        PyObject_FREE(g->g_dfa[i].d_state);
    }
    PyObject_FREE(g->g_dfa);
    for (i = 0; i < g->g_ll.ll_nlabels; i++)
        free(g->g_ll.ll_label[i].lb_str);
    PyObject_FREE(g->g_ll.ll_label);
    PyObject_FREE(g);
}
/* Append a new, empty DFA for non-terminal `type` named `name` to the
   grammar and return a pointer to it.  The returned pointer aims into
   g->g_dfa, which may be reallocated by the next adddfa() call --
   hence "Only use while fresh!". */
dfa *
adddfa(grammar *g, int type, const char *name)
{
    dfa *d;
    g->g_dfa = (dfa *)PyObject_REALLOC(g->g_dfa,
                                       sizeof(dfa) * (g->g_ndfas + 1));
    if (g->g_dfa == NULL)
        Py_FatalError("no mem to resize dfa in adddfa");
    d = &g->g_dfa[g->g_ndfas++];
    d->d_type = type;
    d->d_name = strdup(name);   /* freed with free() in freegrammar() */
    d->d_nstates = 0;
    d->d_state = NULL;
    d->d_initial = -1;          /* no initial state designated yet */
    d->d_first = NULL;          /* FIRST set computed later */
    return d; /* Only use while fresh! */
}
/* Append an empty state to DFA d and return its index within
   d->d_state. */
int
addstate(dfa *d)
{
    state *s;
    d->d_state = (state *)PyObject_REALLOC(d->d_state,
                                           sizeof(state) * (d->d_nstates + 1));
    if (d->d_state == NULL)
        Py_FatalError("no mem to resize state in addstate");
    s = &d->d_state[d->d_nstates++];
    s->s_narcs = 0;
    s->s_arc = NULL;
    /* Accelerator-related fields start out empty; presumably filled in
       by the accelerator pass (see Parser/acceler.c) -- not shown here. */
    s->s_lower = 0;
    s->s_upper = 0;
    s->s_accel = NULL;
    s->s_accept = 0;
    return Py_SAFE_DOWNCAST(s - d->d_state, intptr_t, int);
}
/* Add an arc labelled lbl going from state `from` to state `to` of
   DFA d.  Both states must already exist; allocation failure aborts. */
void
addarc(dfa *d, int from, int to, int lbl)
{
    state *s;

    assert(0 <= from && from < d->d_nstates);
    assert(0 <= to && to < d->d_nstates);

    s = &d->d_state[from];
    s->s_arc = (arc *)PyObject_REALLOC(s->s_arc,
                                       sizeof(arc) * (s->s_narcs + 1));
    if (s->s_arc == NULL)
        Py_FatalError("no mem to resize arc list in addarc");
    s->s_arc[s->s_narcs].a_lbl = lbl;
    s->s_arc[s->s_narcs].a_arrow = to;
    s->s_narcs++;
}
/* Return the index of label (type, str) in ll, appending a new entry
   first if no identical label exists yet (duplicates are shared). */
int
addlabel(labellist *ll, int type, const char *str)
{
    int i;
    label *lb;

    /* Reuse an existing identical label if present. */
    for (i = 0; i < ll->ll_nlabels; i++) {
        if (ll->ll_label[i].lb_type == type &&
            strcmp(ll->ll_label[i].lb_str, str) == 0)
            return i;
    }
    ll->ll_label = (label *)PyObject_REALLOC(ll->ll_label,
                                             sizeof(label) * (ll->ll_nlabels + 1));
    if (ll->ll_label == NULL)
        Py_FatalError("no mem to resize labellist in addlabel");
    lb = &ll->ll_label[ll->ll_nlabels++];
    lb->lb_type = type;
    lb->lb_str = strdup(str);   /* freed with free() in freegrammar() */
    if (Py_DebugFlag)
        printf("Label @ %8p, %d: %s\n", ll, ll->ll_nlabels,
               PyGrammar_LabelRepr(lb));
    return Py_SAFE_DOWNCAST(lb - ll->ll_label, intptr_t, int);
}
/* Same, but rather dies than adds */
/* Look up label (type, str) in ll and return its index; calls
   Py_FatalError() -- which does not return -- if it is absent.
   NOTE(review): the string comparison below is commented out, so the
   match is on lb_type alone; the first label of the requested type
   wins.  Preserved as-is. */
int
findlabel(labellist *ll, int type, const char *str)
{
    int i;
    for (i = 0; i < ll->ll_nlabels; i++) {
        if (ll->ll_label[i].lb_type == type /*&&
            strcmp(ll->ll_label[i].lb_str, str) == 0*/)
            return i;
    }
    fprintf(stderr, "Label %d/'%s' not found\n", type, str);
    Py_FatalError("grammar.c:findlabel()");

    /* Py_FatalError() is declared with __attribute__((__noreturn__)).
       GCC emits a warning without "return 0;" (compiler bug!), but Clang is
       smarter and emits a warning on the return... */
#ifndef __clang__
    return 0; /* Make gcc -Wall happy */
#endif
}
/* Forward */
static void translabel(grammar *, label *);

/* Convert every label in the grammar from its raw NAME/STRING form
   into the concrete terminal or non-terminal number it denotes.  The
   reserved EMPTY label (index 0) is skipped.
   NOTE(review): the banner uses a compile-time #ifdef Py_DEBUG while
   the rest of the file tests Py_DebugFlag at runtime -- inconsistent,
   but preserved as-is. */
void
translatelabels(grammar *g)
{
    int i;

#ifdef Py_DEBUG
    printf("Translating labels ...\n");
#endif
    /* Don't translate EMPTY */
    for (i = EMPTY+1; i < g->g_ll.ll_nlabels; i++)
        translabel(g, &g->g_ll.ll_label[i]);
}
/* Translate a single label in place.
   - NAME labels become either a non-terminal number (if the name
     matches a rule) or a terminal token number (if it matches a token
     name); lb_str is freed and cleared in both cases.
   - STRING labels hold a quoted literal (lb_str[0] is the quote
     character, the content starts at lb_str[1]).  Keyword-like
     literals become NAME labels keeping the keyword text; one-, two-
     and three-character operator literals are mapped to their token
     numbers via PyToken_OneChar/TwoChars/ThreeChars.
   Untranslatable labels are reported on stdout and left unchanged. */
static void
translabel(grammar *g, label *lb)
{
    int i;

    if (Py_DebugFlag)
        printf("Translating label %s ...\n", PyGrammar_LabelRepr(lb));

    if (lb->lb_type == NAME) {
        /* First try the rule names: a match makes it a non-terminal. */
        for (i = 0; i < g->g_ndfas; i++) {
            if (strcmp(lb->lb_str, g->g_dfa[i].d_name) == 0) {
                if (Py_DebugFlag)
                    printf(
                        "Label %s is non-terminal %d.\n",
                        lb->lb_str,
                        g->g_dfa[i].d_type);
                lb->lb_type = g->g_dfa[i].d_type;
                free(lb->lb_str);
                lb->lb_str = NULL;
                return;
            }
        }
        /* Otherwise try the token names: a match makes it a terminal. */
        for (i = 0; i < (int)N_TOKENS; i++) {
            if (strcmp(lb->lb_str, _PyParser_TokenNames[i]) == 0) {
                if (Py_DebugFlag)
                    printf("Label %s is terminal %d.\n",
                        lb->lb_str, i);
                lb->lb_type = i;
                free(lb->lb_str);
                lb->lb_str = NULL;
                return;
            }
        }
        printf("Can't translate NAME label '%s'\n", lb->lb_str);
        return;
    }

    if (lb->lb_type == STRING) {
        if (isalpha(Py_CHARMASK(lb->lb_str[1])) ||
            lb->lb_str[1] == '_') {
            /* Looks like an identifier: this is a keyword.  Copy the
               text between the quotes into a fresh buffer. */
            char *p;
            char *src;
            char *dest;
            size_t name_len;
            if (Py_DebugFlag)
                printf("Label %s is a keyword\n", lb->lb_str);
            lb->lb_type = NAME;
            src = lb->lb_str + 1;
            p = strchr(src, '\'');
            if (p)
                name_len = p - src;
            else
                name_len = strlen(src);
            dest = (char *)malloc(name_len + 1);
            if (!dest) {
                printf("Can't alloc dest '%s'\n", src);
                return;
            }
            strncpy(dest, src, name_len);
            dest[name_len] = '\0';
            free(lb->lb_str);
            lb->lb_str = dest;
        }
        else if (lb->lb_str[2] == lb->lb_str[0]) {
            /* One-character operator, e.g. '+'. */
            int type = (int) PyToken_OneChar(lb->lb_str[1]);
            if (type != OP) {
                lb->lb_type = type;
                free(lb->lb_str);
                lb->lb_str = NULL;
            }
            else
                printf("Unknown OP label %s\n",
                    lb->lb_str);
        }
        else if (lb->lb_str[2] && lb->lb_str[3] == lb->lb_str[0]) {
            /* Two-character operator, e.g. '=='. */
            int type = (int) PyToken_TwoChars(lb->lb_str[1],
                                              lb->lb_str[2]);
            if (type != OP) {
                lb->lb_type = type;
                free(lb->lb_str);
                lb->lb_str = NULL;
            }
            else
                printf("Unknown OP label %s\n",
                    lb->lb_str);
        }
        else if (lb->lb_str[2] && lb->lb_str[3] && lb->lb_str[4] == lb->lb_str[0]) {
            /* Three-character operator, e.g. '**='. */
            int type = (int) PyToken_ThreeChars(lb->lb_str[1],
                                                lb->lb_str[2],
                                                lb->lb_str[3]);
            if (type != OP) {
                lb->lb_type = type;
                free(lb->lb_str);
                lb->lb_str = NULL;
            }
            else
                printf("Unknown OP label %s\n",
                    lb->lb_str);
        }
        else
            printf("Can't translate STRING label %s\n",
                lb->lb_str);
    }
    else
        printf("Can't translate label '%s'\n",
            PyGrammar_LabelRepr(lb));
}
#include "pgenheaders.h"
#include "metagrammar.h"
#include "grammar.h"
#include "pgen.h"
/* Hand-built parse tables for the meta-grammar: the grammar in which
   grammar files themselves are written.  Symbol numbers match the
   #defines in metagrammar.h (MSTART=256 ... ATOM=261).  Each arc is
   {label, destination-state}; an arc of the form {0, n} that points
   at its own state marks an accepting state. */

/* DFA 0: MSTART (symbol 256) */
static arc arcs_0_0[3] = {
    {2, 0},
    {3, 0},
    {4, 1},
};
static arc arcs_0_1[1] = {
    {0, 1},
};
static state states_0[2] = {
    {3, arcs_0_0},
    {1, arcs_0_1},
};

/* DFA 1: RULE (symbol 257) */
static arc arcs_1_0[1] = {
    {5, 1},
};
static arc arcs_1_1[1] = {
    {6, 2},
};
static arc arcs_1_2[1] = {
    {7, 3},
};
static arc arcs_1_3[1] = {
    {3, 4},
};
static arc arcs_1_4[1] = {
    {0, 4},
};
static state states_1[5] = {
    {1, arcs_1_0},
    {1, arcs_1_1},
    {1, arcs_1_2},
    {1, arcs_1_3},
    {1, arcs_1_4},
};

/* DFA 2: RHS (symbol 258) */
static arc arcs_2_0[1] = {
    {8, 1},
};
static arc arcs_2_1[2] = {
    {9, 0},
    {0, 1},
};
static state states_2[2] = {
    {1, arcs_2_0},
    {2, arcs_2_1},
};

/* DFA 3: ALT (symbol 259) */
static arc arcs_3_0[1] = {
    {10, 1},
};
static arc arcs_3_1[2] = {
    {10, 1},
    {0, 1},
};
static state states_3[2] = {
    {1, arcs_3_0},
    {2, arcs_3_1},
};

/* DFA 4: ITEM (symbol 260) */
static arc arcs_4_0[2] = {
    {11, 1},
    {13, 2},
};
static arc arcs_4_1[1] = {
    {7, 3},
};
static arc arcs_4_2[3] = {
    {14, 4},
    {15, 4},
    {0, 2},
};
static arc arcs_4_3[1] = {
    {12, 4},
};
static arc arcs_4_4[1] = {
    {0, 4},
};
static state states_4[5] = {
    {2, arcs_4_0},
    {1, arcs_4_1},
    {3, arcs_4_2},
    {1, arcs_4_3},
    {1, arcs_4_4},
};

/* DFA 5: ATOM (symbol 261) */
static arc arcs_5_0[3] = {
    {5, 1},
    {16, 1},
    {17, 2},
};
static arc arcs_5_1[1] = {
    {0, 1},
};
static arc arcs_5_2[1] = {
    {7, 3},
};
static arc arcs_5_3[1] = {
    {18, 1},
};
static state states_5[4] = {
    {3, arcs_5_0},
    {1, arcs_5_1},
    {1, arcs_5_2},
    {1, arcs_5_3},
};

/* One row per rule: {symbol, name, type, nstates, states, FIRST-set
   bitmap over the label numbers (octal-escaped bytes)}. */
static dfa dfas[6] = {
    {256, "MSTART", 0, 2, states_0,
     "\070\000\000"},
    {257, "RULE", 0, 5, states_1,
     "\040\000\000"},
    {258, "RHS", 0, 2, states_2,
     "\040\010\003"},
    {259, "ALT", 0, 2, states_3,
     "\040\010\003"},
    {260, "ITEM", 0, 5, states_4,
     "\040\010\003"},
    {261, "ATOM", 0, 4, states_5,
     "\040\000\003"},
};

/* {type, string} label pairs; a 0 string means the label has no
   keyword text attached.  Label 0 is the reserved EMPTY label. */
static label labels[19] = {
    {0, "EMPTY"},
    {256, 0},
    {257, 0},
    {4, 0},
    {0, 0},
    {1, 0},
    {11, 0},
    {258, 0},
    {259, 0},
    {18, 0},
    {260, 0},
    {9, 0},
    {10, 0},
    {261, 0},
    {16, 0},
    {14, 0},
    {3, 0},
    {7, 0},
    {8, 0},
};

/* The meta-grammar itself: 6 rules, 19 labels, start symbol MSTART. */
static grammar _PyParser_Grammar = {
    6,
    dfas,
    {19, labels},
    256
};
/* Return the statically initialized meta-grammar defined above. */
grammar *
meta_grammar(void)
{
    return &_PyParser_Grammar;
}

/* Exported alias that simply forwards to meta_grammar(). */
grammar *
Py_meta_grammar(void)
{
    return meta_grammar();
}
...@@ -99,10 +99,8 @@ PyParser_ParseStringObject(const char *s, PyObject *filename, ...@@ -99,10 +99,8 @@ PyParser_ParseStringObject(const char *s, PyObject *filename,
tok->type_comments = 1; tok->type_comments = 1;
} }
#ifndef PGEN
Py_INCREF(err_ret->filename); Py_INCREF(err_ret->filename);
tok->filename = err_ret->filename; tok->filename = err_ret->filename;
#endif
return parsetok(tok, g, start, err_ret, flags); return parsetok(tok, g, start, err_ret, flags);
} }
...@@ -113,7 +111,6 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str, ...@@ -113,7 +111,6 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str,
{ {
node *n; node *n;
PyObject *filename = NULL; PyObject *filename = NULL;
#ifndef PGEN
if (filename_str != NULL) { if (filename_str != NULL) {
filename = PyUnicode_DecodeFSDefault(filename_str); filename = PyUnicode_DecodeFSDefault(filename_str);
if (filename == NULL) { if (filename == NULL) {
...@@ -121,11 +118,8 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str, ...@@ -121,11 +118,8 @@ PyParser_ParseStringFlagsFilenameEx(const char *s, const char *filename_str,
return NULL; return NULL;
} }
} }
#endif
n = PyParser_ParseStringObject(s, filename, g, start, err_ret, flags); n = PyParser_ParseStringObject(s, filename, g, start, err_ret, flags);
#ifndef PGEN
Py_XDECREF(filename); Py_XDECREF(filename);
#endif
return n; return n;
} }
...@@ -169,10 +163,8 @@ PyParser_ParseFileObject(FILE *fp, PyObject *filename, ...@@ -169,10 +163,8 @@ PyParser_ParseFileObject(FILE *fp, PyObject *filename,
if (*flags & PyPARSE_TYPE_COMMENTS) { if (*flags & PyPARSE_TYPE_COMMENTS) {
tok->type_comments = 1; tok->type_comments = 1;
} }
#ifndef PGEN
Py_INCREF(err_ret->filename); Py_INCREF(err_ret->filename);
tok->filename = err_ret->filename; tok->filename = err_ret->filename;
#endif
return parsetok(tok, g, start, err_ret, flags); return parsetok(tok, g, start, err_ret, flags);
} }
...@@ -184,7 +176,6 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename, ...@@ -184,7 +176,6 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
{ {
node *n; node *n;
PyObject *fileobj = NULL; PyObject *fileobj = NULL;
#ifndef PGEN
if (filename != NULL) { if (filename != NULL) {
fileobj = PyUnicode_DecodeFSDefault(filename); fileobj = PyUnicode_DecodeFSDefault(filename);
if (fileobj == NULL) { if (fileobj == NULL) {
...@@ -192,12 +183,9 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename, ...@@ -192,12 +183,9 @@ PyParser_ParseFileFlagsEx(FILE *fp, const char *filename,
return NULL; return NULL;
} }
} }
#endif
n = PyParser_ParseFileObject(fp, fileobj, enc, g, n = PyParser_ParseFileObject(fp, fileobj, enc, g,
start, ps1, ps2, err_ret, flags); start, ps1, ps2, err_ret, flags);
#ifndef PGEN
Py_XDECREF(fileobj); Py_XDECREF(fileobj);
#endif
return n; return n;
} }
...@@ -371,7 +359,6 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, ...@@ -371,7 +359,6 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
} }
} }
#ifndef PGEN
/* Check that the source for a single input statement really /* Check that the source for a single input statement really
is a single statement by looking at what is left in the is a single statement by looking at what is left in the
buffer after parsing. Trailing whitespace and comments buffer after parsing. Trailing whitespace and comments
...@@ -399,7 +386,6 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret, ...@@ -399,7 +386,6 @@ parsetok(struct tok_state *tok, grammar *g, int start, perrdetail *err_ret,
c = *++cur; c = *++cur;
} }
} }
#endif
} }
else else
n = NULL; n = NULL;
...@@ -470,7 +456,6 @@ initerr(perrdetail *err_ret, PyObject *filename) ...@@ -470,7 +456,6 @@ initerr(perrdetail *err_ret, PyObject *filename)
err_ret->text = NULL; err_ret->text = NULL;
err_ret->token = -1; err_ret->token = -1;
err_ret->expected = -1; err_ret->expected = -1;
#ifndef PGEN
if (filename) { if (filename) {
Py_INCREF(filename); Py_INCREF(filename);
err_ret->filename = filename; err_ret->filename = filename;
...@@ -482,6 +467,5 @@ initerr(perrdetail *err_ret, PyObject *filename) ...@@ -482,6 +467,5 @@ initerr(perrdetail *err_ret, PyObject *filename)
return -1; return -1;
} }
} }
#endif
return 0; return 0;
} }
/* Build the pgen flavor of parsetok: defining PGEN before textually
   including the real source selects the #ifdef/#ifndef PGEN branches
   in parsetok.c. */
#define PGEN
#include "parsetok.c"
This diff is collapsed.
import argparse
from .pgen import ParserGenerator
def main():
    """Command-line driver: read a grammar file and a token definition
    file, then write graminit.h and graminit.c for the C parser."""
    cli = argparse.ArgumentParser(description="Parser generator main program.")
    cli.add_argument(
        "grammar", type=str, help="The file with the grammar definition in EBNF format"
    )
    cli.add_argument(
        "tokens", type=str, help="The file with the token definitions"
    )
    cli.add_argument(
        "graminit_h",
        type=argparse.FileType('w'),
        help="The path to write the grammar's non-terminals as #defines",
    )
    cli.add_argument(
        "graminit_c",
        type=argparse.FileType('w'),
        help="The path to write the grammar as initialized data",
    )
    cli.add_argument("--verbose", "-v", action="count")

    options = cli.parse_args()
    generator = ParserGenerator(
        options.grammar, options.tokens, verbose=options.verbose
    )
    tables = generator.make_grammar()
    tables.produce_graminit_h(options.graminit_h.write)
    tables.produce_graminit_c(options.graminit_c.write)


if __name__ == "__main__":
    main()
import collections
class Grammar:
"""Pgen parsing tables conversion class.
Once initialized, this class supplies the grammar tables for the
parsing engine implemented by parse.py. The parsing engine
accesses the instance variables directly. The class here does not
provide initialization of the tables; several subclasses exist to
do this (see the conv and pgen modules).
The load() method reads the tables from a pickle file, which is
much faster than the other ways offered by subclasses. The pickle
file is written by calling dump() (after loading the grammar
tables using a subclass). The report() method prints a readable
representation of the tables to stdout, for debugging.
The instance variables are as follows:
symbol2number -- a dict mapping symbol names to numbers. Symbol
numbers are always 256 or higher, to distinguish
them from token numbers, which are between 0 and
255 (inclusive).
number2symbol -- a dict mapping numbers to symbol names;
these two are each other's inverse.
states -- a list of DFAs, where each DFA is a list of
states, each state is a list of arcs, and each
arc is a (i, j) pair where i is a label and j is
a state number. The DFA number is the index into
this list. (This name is slightly confusing.)
Final states are represented by a special arc of
the form (0, j) where j is its own state number.
dfas -- a dict mapping symbol numbers to (DFA, first)
pairs, where DFA is an item from the states list
above, and first is a set of tokens that can
begin this grammar rule (represented by a dict
whose values are always 1).
labels -- a list of (x, y) pairs where x is either a token
number or a symbol number, and y is either None
or a string; the strings are keywords. The label
number is the index in this list; label numbers
are used to mark state transitions (arcs) in the
DFAs.
start -- the number of the grammar's start symbol.
keywords -- a dict mapping keyword strings to arc labels.
tokens -- a dict mapping token numbers to arc labels.
"""
def __init__(self):
self.symbol2number = collections.OrderedDict()
self.number2symbol = collections.OrderedDict()
self.states = []
self.dfas = collections.OrderedDict()
self.labels = [(0, "EMPTY")]
self.keywords = collections.OrderedDict()
self.tokens = collections.OrderedDict()
self.symbol2label = collections.OrderedDict()
self.start = 256
def produce_graminit_h(self, writer):
writer("/* Generated by Parser/pgen */\n\n")
for number, symbol in self.number2symbol.items():
writer("#define {} {}\n".format(symbol, number))
def produce_graminit_c(self, writer):
writer("/* Generated by Parser/pgen */\n\n")
writer('#include "pgenheaders.h"\n')
writer('#include "grammar.h"\n')
writer("grammar _PyParser_Grammar;\n")
self.print_dfas(writer)
self.print_labels(writer)
writer("grammar _PyParser_Grammar = {\n")
writer(" {n_dfas},\n".format(n_dfas=len(self.dfas)))
writer(" dfas,\n")
writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)))
writer(" {start_number}\n".format(start_number=self.start))
writer("};\n")
def print_labels(self, writer):
writer(
"static label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels))
)
for label, name in self.labels:
if name is None:
writer(" {{{label}, 0}},\n".format(label=label))
else:
writer(
' {{{label}, "{label_name}"}},\n'.format(
label=label, label_name=name
)
)
writer("};\n")
def print_dfas(self, writer):
self.print_states(writer)
writer("static dfa dfas[{}] = {{\n".format(len(self.dfas)))
for dfaindex, dfa_elem in enumerate(self.dfas.items()):
symbol, (dfa, first_sets) = dfa_elem
writer(
' {{{dfa_symbol}, "{symbol_name}", '.format(
dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
)
+ "0, {n_states}, states_{dfa_index},\n".format(
n_states=len(dfa), dfa_index=dfaindex
)
)
writer(' "')
k = [name for label, name in self.labels if label in first_sets]
bitset = bytearray((len(self.labels) >> 3) + 1)
for token in first_sets:
bitset[token >> 3] |= 1 << (token & 7)
for byte in bitset:
writer("\\%03o" % (byte & 0xFF))
writer('"},\n')
writer("};\n")
def print_states(self, write):
    """Write, for every DFA, its arc arrays followed by its static C
    `states_<n>` table mapping each state to its arc array."""
    for dfa_no, dfa in enumerate(self.states):
        # Arc arrays must come first: the state table references them.
        self.print_arcs(write, dfa_no, dfa)
        write(
            "static state states_{dfa_index}[{n_states}] = {{\n".format(
                dfa_index=dfa_no, n_states=len(dfa)
            )
        )
        for state_no, state in enumerate(dfa):
            write(
                " {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(
                    n_arcs=len(state), dfa_index=dfa_no, state_index=state_no
                )
            )
        write("};\n")
def print_arcs(self, write, dfaindex, states):
    """Write one static C `arcs_<dfa>_<state>` array per state, listing
    each (label, target-state) transition of that state."""
    for state_no, state in enumerate(states):
        write(
            "static arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(
                dfa_index=dfaindex, state_index=state_no, n_arcs=len(state)
            )
        )
        for label, target in state:
            write(
                " {{{from_label}, {to_state}}},\n".format(
                    from_label=label, to_state=target
                )
            )
        write("};\n")
This diff is collapsed.
import itertools
def generate_tokens(tokens):
    """Yield (token_name, token_number) pairs from the lines of a
    Grammar/Tokens file, numbering tokens consecutively from 0.

    Blank lines and '#' comment lines are skipped and do not consume a
    number.  Only the first word of each line is taken as the name.
    Two sentinel entries are appended: N_TOKENS (the next free number)
    and NT_OFFSET (fixed at 256, the first non-terminal number).

    Fix: the original stripped each line twice (`line.strip()` followed
    by `line.strip().startswith('#')` on the already-stripped value);
    the redundant call is removed and the two skip tests are merged.
    """
    numbers = itertools.count(0)
    for line in tokens:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        yield (line.split()[0], next(numbers))
    yield ('N_TOKENS', next(numbers))
    yield ('NT_OFFSET', 256)
def generate_opmap(tokens):
    """Yield (operator_string, token_name) pairs for every two-field line
    of a Grammar/Tokens file; blank lines, comments and lines without
    exactly two fields are ignored."""
    for raw in tokens:
        stripped = raw.strip()
        if not stripped:
            continue
        if stripped.startswith('#'):
            continue
        fields = stripped.split()
        if len(fields) == 2:
            name, op = fields
            yield (op.strip("'"), name)
    # Yield independently <>. This is needed so it does not collide
    # with the token generation in "generate_tokens" because if this
    # symbol is included in Grammar/Tokens, it will collide with !=
    # as it has the same name (NOTEQUAL).
    yield ('<>', 'NOTEQUAL')
/* Parser generator main program */
/* This expects a filename containing the grammar as argv[1] (UNIX)
or asks the console for such a file name (THINK C).
It writes its output on two files in the current directory:
- "graminit.c" gets the grammar as a bunch of initialized data
- "graminit.h" gets the grammar's non-terminals as #defines.
Error messages and status info during the generation process are
written to stdout, or sometimes to stderr. */
/* XXX TO DO:
- check for duplicate definitions of names (instead of fatal err)
*/
#define PGEN
#include "Python.h"
#include "pycore_pymem.h"
#include "pycore_pystate.h"
#include "pgenheaders.h"
#include "grammar.h"
#include "node.h"
#include "parsetok.h"
#include "pgen.h"
/* Interpreter flags normally defined by the CPython core; pgen links
   without it, so provide the definitions the linked-in tokenizer/parser
   code references. */
int Py_DebugFlag = 0;
int Py_VerboseFlag = 0;
int Py_IgnoreEnvironmentFlag = 0;
/* Minimal runtime state object -- presumably required by the allocator
   code pulled into the pgen binary (TODO confirm against obmalloc.c). */
_PyRuntimeState _PyRuntime = _PyRuntimeState_INIT;
/* Forward */
grammar *getgrammar(const char *filename);
/* Minimal stand-in for the interpreter's Py_Exit(): terminate the
   process immediately with the given status code. */
void
Py_Exit(int sts)
{
    exit(sts);
}
/* Needed by obmalloc.c */
int
PyGILState_Check(void)
{
    /* pgen is single-threaded: pretend the GIL is always held. */
    return 1;
}

/* Debug-traceback hook referenced by the memory allocator: no-op here. */
void
_PyMem_DumpTraceback(int fd, const void *ptr)
{
}
/* Entry point: pgen <grammar-file> <graminit.h-path> <graminit.c-path>.
   Parses the grammar file and writes the generated parser tables
   (graminit.c) and the non-terminal #defines (graminit.h). */
int
main(int argc, char **argv)
{
    grammar *g;
    FILE *fp;
    char *filename, *graminit_h, *graminit_c;

    if (argc != 4) {
        fprintf(stderr,
            "usage: %s grammar graminit.h graminit.c\n", argv[0]);
        Py_Exit(2);
    }
    filename = argv[1];
    graminit_h = argv[2];
    graminit_c = argv[3];
    /* getgrammar() exits the process on any error, so g is valid here. */
    g = getgrammar(filename);
    fp = fopen(graminit_c, "w");
    if (fp == NULL) {
        perror(graminit_c);
        Py_Exit(1);
    }
    if (Py_DebugFlag)
        printf("Writing %s ...\n", graminit_c);
    printgrammar(g, fp);
    fclose(fp);
    fp = fopen(graminit_h, "w");
    if (fp == NULL) {
        perror(graminit_h);
        Py_Exit(1);
    }
    if (Py_DebugFlag)
        printf("Writing %s ...\n", graminit_h);
    printnonterminals(g, fp);
    fclose(fp);
    freegrammar(g);
    Py_Exit(0);
    return 0; /* Make gcc -Wall happy */
}
/* Parse `filename` with the meta-grammar and run pgen() on the resulting
   parse tree.  Any failure prints a diagnostic and exits the process, so
   the returned grammar pointer is always non-NULL. */
grammar *
getgrammar(const char *filename)
{
    FILE *fp;
    node *n;
    grammar *g0, *g;
    perrdetail err;

    fp = fopen(filename, "r");
    if (fp == NULL) {
        perror(filename);
        Py_Exit(1);
    }
    g0 = meta_grammar();
    n = PyParser_ParseFile(fp, filename, g0, g0->g_start,
                           (char *)NULL, (char *)NULL, &err);
    fclose(fp);
    if (n == NULL) {
        fprintf(stderr, "Parsing error %d, line %d.\n",
                err.error, err.lineno);
        if (err.text != NULL) {
            size_t len;
            int i;
            /* Echo the offending line, ensuring it ends with a newline. */
            fprintf(stderr, "%s", err.text);
            len = strlen(err.text);
            if (len == 0 || err.text[len-1] != '\n')
                fprintf(stderr, "\n");
            /* Print a caret under the error column; copy tabs through so
               the marker lines up with the echoed line above. */
            for (i = 0; i < err.offset; i++) {
                if (err.text[i] == '\t')
                    putc('\t', stderr);
                else
                    putc(' ', stderr);
            }
            fprintf(stderr, "^\n");
            PyObject_FREE(err.text);
        }
        Py_Exit(1);
    }
    g = pgen(n);
    PyNode_Free(n);
    if (g == NULL) {
        printf("Bad grammar.\n");
        Py_Exit(1);
    }
    return g;
}
/* Can't happen in pgen */
/* Stub: pgen never sets a Python exception, so there is never one
   pending.  Returns NULL (idiomatic null pointer, rather than the
   original bare `0`) to signal "no exception". */
PyObject*
PyErr_Occurred()
{
    return NULL;
}
/* Minimal fatal-error handler: report the message on stderr and exit
   with status 1 (no interpreter state to dump in pgen). */
void
Py_FatalError(const char *msg)
{
    fprintf(stderr, "pgen: FATAL ERROR: %s\n", msg);
    Py_Exit(1);
}
/* No-nonsense my_readline() for tokenizer.c */
/* Prompt on stderr, read one line (up to 1000 bytes) from sys_stdin and
   return it in a PyMem-allocated buffer.  On EOF an empty string is
   returned; on allocation failure, NULL.
   Fix: the old code handled a line with no trailing newline by
   overwriting its LAST CHARACTER with '\n', silently dropping one byte
   of input; now the newline is appended instead. */
char *
PyOS_Readline(FILE *sys_stdin, FILE *sys_stdout, const char *prompt)
{
    size_t n = 1000;
    char *p = (char *)PyMem_MALLOC(n);
    char *q;
    if (p == NULL)
        return NULL;
    fprintf(stderr, "%s", prompt);
    q = fgets(p, n, sys_stdin);
    if (q == NULL) {
        /* EOF before any input: return an empty string. */
        *p = '\0';
        return p;
    }
    n = strlen(p);
    if (n > 0 && p[n-1] != '\n') {
        /* Append the missing newline instead of clobbering the last
           input character. */
        q = (char *)PyMem_REALLOC(p, n + 2);
        if (q == NULL)
            return p;   /* out of memory: keep the newline-less line */
        p = q;
        p[n] = '\n';
        p[n + 1] = '\0';
        n++;
    }
    /* Shrink the 1000-byte buffer to the actual line length. */
    return (char *)PyMem_REALLOC(p, n+1);
}
/* No-nonsense fgets */
/* In pgen no newline translation is needed: delegate directly to the
   C library fgets().  The fobj argument is accepted for signature
   compatibility and ignored. */
char *
Py_UniversalNewlineFgets(char *buf, int n, FILE *stream, PyObject *fobj)
{
    char *line = fgets(buf, n, stream);
    return line;
}
#include <stdarg.h>
/* Minimal PySys_WriteStderr(): forward the printf-style arguments
   straight to stderr (pgen has no Python sys module to honour). */
void
PySys_WriteStderr(const char *format, ...)
{
    va_list va;

    va_start(va, format);
    vfprintf(stderr, format, va);
    va_end(va);
}
/* Print a bunch of C initializers that represent a grammar */
#define PGEN
#include "pgenheaders.h"
#include "grammar.h"
/* Forward */
static void printarcs(int, dfa *, FILE *);
static void printstates(grammar *, FILE *);
static void printdfas(grammar *, FILE *);
static void printlabels(grammar *, FILE *);
/* Write the whole grammar as C initializers to fp: a fixed prologue,
   the DFA and label tables, then the _PyParser_Grammar struct that
   references them.  This is the body of graminit.c. */
void
printgrammar(grammar *g, FILE *fp)
{
    fprintf(fp, "/* Generated by Parser/pgen */\n\n");
    fprintf(fp, "#include \"pgenheaders.h\"\n");
    fprintf(fp, "#include \"grammar.h\"\n");
    /* Tentative declaration first so the tables can be emitted before
       the full initializer below. */
    fprintf(fp, "grammar _PyParser_Grammar;\n");
    printdfas(g, fp);
    printlabels(g, fp);
    fprintf(fp, "grammar _PyParser_Grammar = {\n");
    fprintf(fp, " %d,\n", g->g_ndfas);
    fprintf(fp, " dfas,\n");
    fprintf(fp, " {%d, labels},\n", g->g_ll.ll_nlabels);
    fprintf(fp, " %d\n", g->g_start);
    fprintf(fp, "};\n");
}
/* Write one C #define per non-terminal (one per DFA) to fp;
   this is the body of graminit.h. */
void
printnonterminals(grammar *g, FILE *fp)
{
    int i;

    fprintf(fp, "/* Generated by Parser/pgen */\n\n");
    for (i = 0; i < g->g_ndfas; i++) {
        dfa *d = &g->g_dfa[i];
        fprintf(fp, "#define %s %d\n", d->d_name, d->d_type);
    }
}
/* Emit one static `arcs_<i>_<j>` array per state j of DFA number i,
   listing each arc as a {label, target-state} pair. */
static void
printarcs(int i, dfa *d, FILE *fp)
{
    int j;

    for (j = 0; j < d->d_nstates; j++) {
        state *s = &d->d_state[j];
        int k;
        fprintf(fp, "static arc arcs_%d_%d[%d] = {\n",
                i, j, s->s_narcs);
        for (k = 0; k < s->s_narcs; k++) {
            arc *a = &s->s_arc[k];
            fprintf(fp, " {%d, %d},\n", a->a_lbl, a->a_arrow);
        }
        fprintf(fp, "};\n");
    }
}
/* For every DFA, emit its arc arrays followed by its static
   `states_<i>` table mapping each state to its arc array. */
static void
printstates(grammar *g, FILE *fp)
{
    int i, j;

    for (i = 0; i < g->g_ndfas; i++) {
        dfa *d = &g->g_dfa[i];
        /* The arc arrays must precede the state table that names them. */
        printarcs(i, d, fp);
        fprintf(fp, "static state states_%d[%d] = {\n",
                i, d->d_nstates);
        for (j = 0; j < d->d_nstates; j++) {
            state *s = &d->d_state[j];
            fprintf(fp, " {%d, arcs_%d_%d},\n",
                    s->s_narcs, i, j);
        }
        fprintf(fp, "};\n");
    }
}
/* Emit the state tables (via printstates) and then the static `dfas`
   table: one entry per non-terminal, terminated by its first-set as an
   octal-escaped byte string. */
static void
printdfas(grammar *g, FILE *fp)
{
    int i, j, n;

    printstates(g, fp);
    fprintf(fp, "static dfa dfas[%d] = {\n", g->g_ndfas);
    for (i = 0; i < g->g_ndfas; i++) {
        dfa *d = &g->g_dfa[i];
        fprintf(fp, " {%d, \"%s\", %d, %d, states_%d,\n",
                d->d_type, d->d_name, d->d_initial, d->d_nstates, i);
        fprintf(fp, " \"");
        /* One bit per label, packed into NBYTES(...) bytes. */
        n = NBYTES(g->g_ll.ll_nlabels);
        for (j = 0; j < n; j++)
            fprintf(fp, "\\%03o", d->d_first[j] & 0xff);
        fprintf(fp, "\"},\n");
    }
    fprintf(fp, "};\n");
}
/* Emit the static `labels` array: one {type, string} entry per label,
   with a 0 pointer for unnamed labels. */
static void
printlabels(grammar *g, FILE *fp)
{
    int i;

    fprintf(fp, "static label labels[%d] = {\n", g->g_ll.ll_nlabels);
    for (i = 0; i < g->g_ll.ll_nlabels; i++) {
        label *l = &g->g_ll.ll_label[i];
        if (l->lb_str == NULL)
            fprintf(fp, " {%d, 0},\n", l->lb_type);
        else
            fprintf(fp, " {%d, \"%s\"},\n",
                    l->lb_type, l->lb_str);
    }
    fprintf(fp, "};\n");
}
...@@ -10,13 +10,11 @@ ...@@ -10,13 +10,11 @@
#include "tokenizer.h" #include "tokenizer.h"
#include "errcode.h" #include "errcode.h"
#ifndef PGEN
#include "unicodeobject.h" #include "unicodeobject.h"
#include "bytesobject.h" #include "bytesobject.h"
#include "fileobject.h" #include "fileobject.h"
#include "codecs.h" #include "codecs.h"
#include "abstract.h" #include "abstract.h"
#endif /* PGEN */
/* Alternate tab spacing */ /* Alternate tab spacing */
#define ALTTABSIZE 1 #define ALTTABSIZE 1
...@@ -81,11 +79,9 @@ tok_new(void) ...@@ -81,11 +79,9 @@ tok_new(void)
tok->enc = NULL; tok->enc = NULL;
tok->encoding = NULL; tok->encoding = NULL;
tok->cont_line = 0; tok->cont_line = 0;
#ifndef PGEN
tok->filename = NULL; tok->filename = NULL;
tok->decoding_readline = NULL; tok->decoding_readline = NULL;
tok->decoding_buffer = NULL; tok->decoding_buffer = NULL;
#endif
tok->type_comments = 0; tok->type_comments = 0;
return tok; return tok;
...@@ -104,28 +100,6 @@ new_string(const char *s, Py_ssize_t len, struct tok_state *tok) ...@@ -104,28 +100,6 @@ new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
return result; return result;
} }
#ifdef PGEN
static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
return fgets(s, size, tok->fp);
}
static int
decoding_feof(struct tok_state *tok)
{
return feof(tok->fp);
}
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
return new_string(str, strlen(str), tok);
}
#else /* PGEN */
static char * static char *
error_ret(struct tok_state *tok) /* XXX */ error_ret(struct tok_state *tok) /* XXX */
{ {
...@@ -551,7 +525,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok) ...@@ -551,7 +525,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
return error_ret(tok); return error_ret(tok);
} }
} }
#ifndef PGEN
/* The default encoding is UTF-8, so make sure we don't have any /* The default encoding is UTF-8, so make sure we don't have any
non-UTF-8 sequences in it. */ non-UTF-8 sequences in it. */
if (line && !tok->encoding) { if (line && !tok->encoding) {
...@@ -574,7 +547,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok) ...@@ -574,7 +547,6 @@ decoding_fgets(char *s, int size, struct tok_state *tok)
badchar, tok->filename, tok->lineno + 1); badchar, tok->filename, tok->lineno + 1);
return error_ret(tok); return error_ret(tok);
} }
#endif
return line; return line;
} }
...@@ -738,8 +710,6 @@ decode_str(const char *input, int single, struct tok_state *tok) ...@@ -738,8 +710,6 @@ decode_str(const char *input, int single, struct tok_state *tok)
return str; return str;
} }
#endif /* PGEN */
/* Set up tokenizer for string */ /* Set up tokenizer for string */
struct tok_state * struct tok_state *
...@@ -765,9 +735,7 @@ PyTokenizer_FromUTF8(const char *str, int exec_input) ...@@ -765,9 +735,7 @@ PyTokenizer_FromUTF8(const char *str, int exec_input)
struct tok_state *tok = tok_new(); struct tok_state *tok = tok_new();
if (tok == NULL) if (tok == NULL)
return NULL; return NULL;
#ifndef PGEN
tok->input = str = translate_newlines(str, exec_input, tok); tok->input = str = translate_newlines(str, exec_input, tok);
#endif
if (str == NULL) { if (str == NULL) {
PyTokenizer_Free(tok); PyTokenizer_Free(tok);
return NULL; return NULL;
...@@ -828,11 +796,9 @@ PyTokenizer_Free(struct tok_state *tok) ...@@ -828,11 +796,9 @@ PyTokenizer_Free(struct tok_state *tok)
{ {
if (tok->encoding != NULL) if (tok->encoding != NULL)
PyMem_FREE(tok->encoding); PyMem_FREE(tok->encoding);
#ifndef PGEN
Py_XDECREF(tok->decoding_readline); Py_XDECREF(tok->decoding_readline);
Py_XDECREF(tok->decoding_buffer); Py_XDECREF(tok->decoding_buffer);
Py_XDECREF(tok->filename); Py_XDECREF(tok->filename);
#endif
if (tok->fp != NULL && tok->buf != NULL) if (tok->fp != NULL && tok->buf != NULL)
PyMem_FREE(tok->buf); PyMem_FREE(tok->buf);
if (tok->input) if (tok->input)
...@@ -871,7 +837,6 @@ tok_nextc(struct tok_state *tok) ...@@ -871,7 +837,6 @@ tok_nextc(struct tok_state *tok)
} }
if (tok->prompt != NULL) { if (tok->prompt != NULL) {
char *newtok = PyOS_Readline(stdin, stdout, tok->prompt); char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
#ifndef PGEN
if (newtok != NULL) { if (newtok != NULL) {
char *translated = translate_newlines(newtok, 0, tok); char *translated = translate_newlines(newtok, 0, tok);
PyMem_FREE(newtok); PyMem_FREE(newtok);
...@@ -900,7 +865,6 @@ tok_nextc(struct tok_state *tok) ...@@ -900,7 +865,6 @@ tok_nextc(struct tok_state *tok)
strcpy(newtok, buf); strcpy(newtok, buf);
Py_DECREF(u); Py_DECREF(u);
} }
#endif
if (tok->nextprompt != NULL) if (tok->nextprompt != NULL)
tok->prompt = tok->nextprompt; tok->prompt = tok->nextprompt;
if (newtok == NULL) if (newtok == NULL)
...@@ -1056,7 +1020,6 @@ tok_backup(struct tok_state *tok, int c) ...@@ -1056,7 +1020,6 @@ tok_backup(struct tok_state *tok, int c)
static int static int
syntaxerror(struct tok_state *tok, const char *format, ...) syntaxerror(struct tok_state *tok, const char *format, ...)
{ {
#ifndef PGEN
va_list vargs; va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES #ifdef HAVE_STDARG_PROTOTYPES
va_start(vargs, format); va_start(vargs, format);
...@@ -1069,9 +1032,6 @@ syntaxerror(struct tok_state *tok, const char *format, ...) ...@@ -1069,9 +1032,6 @@ syntaxerror(struct tok_state *tok, const char *format, ...)
tok->lineno, tok->lineno,
(int)(tok->cur - tok->line_start)); (int)(tok->cur - tok->line_start));
tok->done = E_ERROR; tok->done = E_ERROR;
#else
tok->done = E_TOKEN;
#endif
return ERRORTOKEN; return ERRORTOKEN;
} }
...@@ -1083,9 +1043,6 @@ indenterror(struct tok_state *tok) ...@@ -1083,9 +1043,6 @@ indenterror(struct tok_state *tok)
return ERRORTOKEN; return ERRORTOKEN;
} }
#ifdef PGEN
#define verify_identifier(tok) 1
#else
/* Verify that the identifier follows PEP 3131. /* Verify that the identifier follows PEP 3131.
All identifier strings are guaranteed to be "ready" unicode objects. All identifier strings are guaranteed to be "ready" unicode objects.
*/ */
...@@ -1112,7 +1069,6 @@ verify_identifier(struct tok_state *tok) ...@@ -1112,7 +1069,6 @@ verify_identifier(struct tok_state *tok)
tok->done = E_IDENTIFIER; tok->done = E_IDENTIFIER;
return result; return result;
} }
#endif
static int static int
tok_decimal_tail(struct tok_state *tok) tok_decimal_tail(struct tok_state *tok)
...@@ -1667,25 +1623,20 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) ...@@ -1667,25 +1623,20 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
case '(': case '(':
case '[': case '[':
case '{': case '{':
#ifndef PGEN
if (tok->level >= MAXLEVEL) { if (tok->level >= MAXLEVEL) {
return syntaxerror(tok, "too many nested parentheses"); return syntaxerror(tok, "too many nested parentheses");
} }
tok->parenstack[tok->level] = c; tok->parenstack[tok->level] = c;
tok->parenlinenostack[tok->level] = tok->lineno; tok->parenlinenostack[tok->level] = tok->lineno;
#endif
tok->level++; tok->level++;
break; break;
case ')': case ')':
case ']': case ']':
case '}': case '}':
#ifndef PGEN
if (!tok->level) { if (!tok->level) {
return syntaxerror(tok, "unmatched '%c'", c); return syntaxerror(tok, "unmatched '%c'", c);
} }
#endif
tok->level--; tok->level--;
#ifndef PGEN
int opening = tok->parenstack[tok->level]; int opening = tok->parenstack[tok->level];
if (!((opening == '(' && c == ')') || if (!((opening == '(' && c == ')') ||
(opening == '[' && c == ']') || (opening == '[' && c == ']') ||
...@@ -1704,7 +1655,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end) ...@@ -1704,7 +1655,6 @@ tok_get(struct tok_state *tok, char **p_start, char **p_end)
c, opening); c, opening);
} }
} }
#endif
break; break;
} }
...@@ -1742,11 +1692,7 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) ...@@ -1742,11 +1692,7 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
FILE *fp; FILE *fp;
char *p_start =NULL , *p_end =NULL , *encoding = NULL; char *p_start =NULL , *p_end =NULL , *encoding = NULL;
#ifndef PGEN
fd = _Py_dup(fd); fd = _Py_dup(fd);
#else
fd = dup(fd);
#endif
if (fd < 0) { if (fd < 0) {
return NULL; return NULL;
} }
...@@ -1760,7 +1706,6 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) ...@@ -1760,7 +1706,6 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
fclose(fp); fclose(fp);
return NULL; return NULL;
} }
#ifndef PGEN
if (filename != NULL) { if (filename != NULL) {
Py_INCREF(filename); Py_INCREF(filename);
tok->filename = filename; tok->filename = filename;
...@@ -1773,7 +1718,6 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename) ...@@ -1773,7 +1718,6 @@ PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
return encoding; return encoding;
} }
} }
#endif
while (tok->lineno < 2 && tok->done == E_OK) { while (tok->lineno < 2 && tok->done == E_OK) {
PyTokenizer_Get(tok, &p_start, &p_end); PyTokenizer_Get(tok, &p_start, &p_end);
} }
......
...@@ -42,15 +42,9 @@ struct tok_state { ...@@ -42,15 +42,9 @@ struct tok_state {
expression (cf. issue 16806) */ expression (cf. issue 16806) */
int level; /* () [] {} Parentheses nesting level */ int level; /* () [] {} Parentheses nesting level */
/* Used to allow free continuations inside them */ /* Used to allow free continuations inside them */
#ifndef PGEN
char parenstack[MAXLEVEL]; char parenstack[MAXLEVEL];
int parenlinenostack[MAXLEVEL]; int parenlinenostack[MAXLEVEL];
/* pgen doesn't have access to Python codecs, it cannot decode the input
filename. The bytes filename might be kept, but it is only used by
indenterror() and it is not really needed: pgen only compiles one file
(Grammar/Grammar). */
PyObject *filename; PyObject *filename;
#endif
/* Stuff for checking on different tab sizes */ /* Stuff for checking on different tab sizes */
int altindstack[MAXINDENT]; /* Stack of alternate indents */ int altindstack[MAXINDENT]; /* Stack of alternate indents */
/* Stuff for PEP 0263 */ /* Stuff for PEP 0263 */
...@@ -63,10 +57,8 @@ struct tok_state { ...@@ -63,10 +57,8 @@ struct tok_state {
const char* multi_line_start; /* pointer to start of first line of const char* multi_line_start; /* pointer to start of first line of
a single line or multi line string a single line or multi line string
expression (cf. issue 16806) */ expression (cf. issue 16806) */
#ifndef PGEN
PyObject *decoding_readline; /* open(...).readline */ PyObject *decoding_readline; /* open(...).readline */
PyObject *decoding_buffer; PyObject *decoding_buffer;
#endif
const char* enc; /* Encoding for the current str. */ const char* enc; /* Encoding for the current str. */
const char* str; const char* str;
const char* input; /* Tokenizer's newline translated copy of the string. */ const char* input; /* Tokenizer's newline translated copy of the string. */
......
#define PGEN
#include "tokenizer.c"
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment