lib2to3.pgen2.driver.load_grammar() now creates a stable cache file

between runs given the same Grammar.txt input regardless of the hash randomization setting.

lib2to3.pgen2.driver.load_grammar() now creates a stable cache file
between runs given the same Grammar.txt input regardless of the hash randomization setting.
dd1c638b · Gregory P. Smith [Google Inc.] · d61910c5 · dd1c638b · dd1c638b · dd1c638b
Commit dd1c638b authored Sep 08, 2016 by Gregory P. Smith [Google Inc.]
6 changed files
--- a/Lib/lib2to3/pgen2/driver.py
+++ b/Lib/lib2to3/pgen2/driver.py
@@ -106,16 +106,19 @@ class Driver(object):
        return self.parse_tokens(tokens, debug)


+def _generate_pickle_name(gt):
+    head, tail = os.path.splitext(gt)
+    if tail == ".txt":
+        tail = ""
+    return head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
+
+
 def load_grammar(gt="Grammar.txt", gp=None,
                 save=True, force=False, logger=None):
    """Load the grammar (maybe from a pickle)."""
    if logger is None:
        logger = logging.getLogger()
-    if gp is None:
-        head, tail = os.path.splitext(gt)
-        if tail == ".txt":
-            tail = ""
-        gp = head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
+    gp = _generate_pickle_name(gt) if gp is None else gp
    if force or not _newer(gp, gt):
        logger.info("Generating grammar tables from %s", gt)
        g = pgen.generate_grammar(gt)
@@ -124,7 +127,7 @@ def load_grammar(gt="Grammar.txt", gp=None,
            try:
                g.dump(gp)
            except OSError as e:
-                logger.info("Writing failed:"+str(e))
+                logger.info("Writing failed: %s", e)
    else:
        g = grammar.Grammar()
        g.load(gp)

--- a/Lib/lib2to3/pgen2/grammar.py
+++ b/Lib/lib2to3/pgen2/grammar.py
@@ -13,6 +13,7 @@ fallback token code OP, but the parser needs the actual token code.
 """

 # Python imports
+import collections
 import pickle

 # Local imports
@@ -85,9 +86,21 @@ class Grammar(object):
        self.start = 256

    def dump(self, filename):
-        """Dump the grammar tables to a pickle file."""
+        """Dump the grammar tables to a pickle file.
+
+        dump() recursively changes all dict to OrderedDict, so the pickled file
+        is not exactly the same as what was passed in to dump(). load() uses the
+        pickled file to create the tables, but only changes OrderedDict to dict
+        at the top level; it does not recursively change OrderedDict to dict.
+        So, the loaded tables are different from the original tables that were
+        passed to dump() in that some of the OrderedDict (from the pickled file)
+        are not changed back to dict. For parsing, this has no effect on
+        performance because OrderedDict uses dict's __getitem__ with nothing in
+        between.
+        """
        with open(filename, "wb") as f:
-            pickle.dump(self.__dict__, f, 2)
+            d = _make_deterministic(self.__dict__)
+            pickle.dump(d, f, 2)

    def load(self, filename):
        """Load the grammar tables from a pickle file."""
@@ -124,6 +137,17 @@ class Grammar(object):
        print("start", self.start)


+def _make_deterministic(top):
+    if isinstance(top, dict):
+      return collections.OrderedDict(
+          sorted(((k, _make_deterministic(v)) for k, v in top.items())))
+    if isinstance(top, list):
+      return [_make_deterministic(e) for e in top]
+    if isinstance(top, tuple):
+      return tuple(_make_deterministic(e) for e in top)
+    return top
+
+
 # Map from operator to number (since tokenize doesn't do this)

 opmap_raw = """

--- a/Lib/lib2to3/pgen2/pgen.py
+++ b/Lib/lib2to3/pgen2/pgen.py
@@ -39,7 +39,7 @@ class ParserGenerator(object):
            states = []
            for state in dfa:
                arcs = []
-                for label, next in state.arcs.items():
+                for label, next in sorted(state.arcs.items()):
                    arcs.append((self.make_label(c, label), dfa.index(next)))
                if state.isfinal:
                    arcs.append((0, dfa.index(state)))
@@ -52,7 +52,7 @@ class ParserGenerator(object):
    def make_first(self, c, name):
        rawfirst = self.first[name]
        first = {}
-        for label in rawfirst:
+        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first[ilabel] = 1
@@ -192,7 +192,7 @@ class ParserGenerator(object):
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, {}))
-            for label, nfaset in arcs.items():
+            for label, nfaset in sorted(arcs.items()):
                for st in states:
                    if st.nfaset == nfaset:
                        break
@@ -222,7 +222,7 @@ class ParserGenerator(object):
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print("  State", i, state.isfinal and "(final)" or "")
-            for label, next in state.arcs.items():
+            for label, next in sorted(state.arcs.items()):
                print("    %s -> %d" % (label, dfa.index(next)))

    def simplify_dfa(self, dfa):

--- a/Lib/lib2to3/tests/support.py
+++ b/Lib/lib2to3/tests/support.py
@@ -11,13 +11,13 @@ from textwrap import dedent

 # Local imports
 from lib2to3 import pytree, refactor
-from lib2to3.pgen2 import driver
+from lib2to3.pgen2 import driver as pgen2_driver

 test_dir = os.path.dirname(__file__)
 proj_dir = os.path.normpath(os.path.join(test_dir, ".."))
 grammar_path = os.path.join(test_dir, "..", "Grammar.txt")
-grammar = driver.load_grammar(grammar_path)
-driver = driver.Driver(grammar, convert=pytree.convert)
+grammar = pgen2_driver.load_grammar(grammar_path)
+driver = pgen2_driver.Driver(grammar, convert=pytree.convert)

 def parse_string(string):
    return driver.parse_string(reformat(string), debug=True)

--- a/Lib/lib2to3/tests/test_parser.py
+++ b/Lib/lib2to3/tests/test_parser.py
@@ -6,8 +6,6 @@ parts of the grammar we've changed, we also make sure we can parse the
 test_grammar.py files from both Python 2 and Python 3.
 """

-from __future__ import with_statement
-
 # Testing imports
 from . import support
 from .support import driver, test_dir
@@ -15,12 +13,15 @@ from test.support import verbose

 # Python imports
 import os
+import shutil
+import subprocess
 import sys
+import tempfile
 import unittest
 import warnings
-import subprocess

 # Local imports
+from lib2to3.pgen2 import driver as pgen2_driver
 from lib2to3.pgen2 import tokenize
 from ..pgen2.parse import ParseError
 from lib2to3.pygram import python_symbols as syms
@@ -35,6 +36,71 @@ class TestDriver(support.TestCase):
        self.assertEqual(t.children[1].children[0].type, syms.print_stmt)


+class TestPgen2Caching(support.TestCase):
+    def test_load_grammar_from_txt_file(self):
+        pgen2_driver.load_grammar(support.grammar_path, save=False, force=True)
+
+    def test_load_grammar_from_pickle(self):
+        # Make a copy of the grammar file in a temp directory we are
+        # guaranteed to be able to write to.
+        tmpdir = tempfile.mkdtemp()
+        try:
+            grammar_copy = os.path.join(
+                    tmpdir, os.path.basename(support.grammar_path))
+            shutil.copy(support.grammar_path, grammar_copy)
+            pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)
+
+            pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
+            self.assertTrue(os.path.exists(pickle_name))
+
+            os.unlink(grammar_copy)  # Only the pickle remains...
+            pgen2_driver.load_grammar(grammar_copy, save=False, force=False)
+        finally:
+            shutil.rmtree(tmpdir)
+
+    @unittest.skipIf(sys.executable is None, 'sys.executable required')
+    def test_load_grammar_from_subprocess(self):
+        tmpdir = tempfile.mkdtemp()
+        tmpsubdir = os.path.join(tmpdir, 'subdir')
+        try:
+            os.mkdir(tmpsubdir)
+            grammar_base = os.path.basename(support.grammar_path)
+            grammar_copy = os.path.join(tmpdir, grammar_base)
+            grammar_sub_copy = os.path.join(tmpsubdir, grammar_base)
+            shutil.copy(support.grammar_path, grammar_copy)
+            shutil.copy(support.grammar_path, grammar_sub_copy)
+            pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)
+            pickle_sub_name = pgen2_driver._generate_pickle_name(
+                     grammar_sub_copy)
+            self.assertNotEqual(pickle_name, pickle_sub_name)
+
+            # Generate a pickle file from this process.
+            pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
+            self.assertTrue(os.path.exists(pickle_name))
+
+            # Generate a new pickle file in a subprocess with a most likely
+            # different hash randomization seed.
+            sub_env = dict(os.environ)
+            sub_env['PYTHONHASHSEED'] = 'random'
+            subprocess.check_call(
+                    [sys.executable, '-c', """
+from lib2to3.pgen2 import driver as pgen2_driver
+pgen2_driver.load_grammar(%r, save=True, force=True)
+                    """ % (grammar_sub_copy,)],
+                    env=sub_env)
+            self.assertTrue(os.path.exists(pickle_sub_name))
+
+            with open(pickle_name, 'rb') as pickle_f_1, \
+                    open(pickle_sub_name, 'rb') as pickle_f_2:
+                self.assertEqual(
+                    pickle_f_1.read(), pickle_f_2.read(),
+                    msg='Grammar caches generated using different hash seeds'
+                    ' were not identical.')
+        finally:
+            shutil.rmtree(tmpdir)
+
+
+
 class GrammarTest(support.TestCase):
    def validate(self, code):
        support.parse_string(code)

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -67,6 +67,10 @@ Core and Builtins
 Library
 -------

+- lib2to3.pgen2.driver.load_grammar() now creates a stable cache file
+  between runs given the same Grammar.txt input regardless of the hash
+  randomization setting.
+
 - Issue #27570: Avoid zero-length memcpy() etc calls with null source
  pointers in the "ctypes" and "array" modules.