lib2to3.pgen2.driver.load_grammar() now creates a stable cache file
between runs given the same Grammar.txt input, regardless of the hash
randomization setting.
parent d61910c5
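
Why the cache was unstable, in brief: the grammar tables are plain dicts keyed by strings, and on interpreters where dict iteration order follows string hashing (CPython before 3.6 with PYTHONHASHSEED), the same Grammar.txt could pickle to different bytes on every run. A minimal sketch of the idea behind the fix, not part of the change itself:

    import collections
    import pickle

    labels = {"if": 1, "else": 2, "while": 3}

    # Pickling the raw dict serializes items in iteration order, which can
    # differ between runs under hash randomization on older CPythons.
    unstable = pickle.dumps(labels, 2)

    # Sorting the items into an OrderedDict fixes one canonical order, so
    # the output bytes are identical across runs and hash seeds.
    stable = pickle.dumps(collections.OrderedDict(sorted(labels.items())), 2)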
@@ -106,16 +106,19 @@ class Driver(object):
         return self.parse_tokens(tokens, debug)
 
 
+def _generate_pickle_name(gt):
+    head, tail = os.path.splitext(gt)
+    if tail == ".txt":
+        tail = ""
+    return head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
+
+
 def load_grammar(gt="Grammar.txt", gp=None,
                  save=True, force=False, logger=None):
     """Load the grammar (maybe from a pickle)."""
     if logger is None:
         logger = logging.getLogger()
-    if gp is None:
-        head, tail = os.path.splitext(gt)
-        if tail == ".txt":
-            tail = ""
-        gp = head + tail + ".".join(map(str, sys.version_info)) + ".pickle"
+    gp = _generate_pickle_name(gt) if gp is None else gp
     if force or not _newer(gp, gt):
         logger.info("Generating grammar tables from %s", gt)
         g = pgen.generate_grammar(gt)
@@ -124,7 +127,7 @@ def load_grammar(gt="Grammar.txt", gp=None,
             try:
                 g.dump(gp)
             except OSError as e:
-                logger.info("Writing failed:"+str(e))
+                logger.info("Writing failed: %s", e)
         else:
             g = grammar.Grammar()
             g.load(gp)
...
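
The naming scheme itself is unchanged, only factored out into _generate_pickle_name(). For illustration, a standalone version of the same logic; the printed value assumes sys.version_info is (3, 6, 0, 'final', 0):

    import os
    import sys

    head, tail = os.path.splitext("Grammar.txt")  # head='Grammar', tail='.txt'
    if tail == ".txt":
        tail = ""
    print(head + tail + ".".join(map(str, sys.version_info)) + ".pickle")
    # e.g. 'Grammar3.6.0.final.0.pickle'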
@@ -13,6 +13,7 @@ fallback token code OP, but the parser needs the actual token code.
 """
 
 # Python imports
+import collections
 import pickle
 
 # Local imports
@@ -85,9 +86,21 @@ class Grammar(object):
         self.start = 256
 
     def dump(self, filename):
-        """Dump the grammar tables to a pickle file."""
+        """Dump the grammar tables to a pickle file.
+
+        dump() recursively changes all dict to OrderedDict, so the pickled
+        file is not exactly the same as what was passed in to dump(). load()
+        uses the pickled file to create the tables, but only changes
+        OrderedDict to dict at the top level; it does not recursively change
+        OrderedDict to dict. So, the loaded tables are different from the
+        original tables that were passed to dump() in that some of the
+        OrderedDict (from the pickled file) are not changed back to dict.
+        For parsing, this has no effect on performance, because OrderedDict
+        delegates __getitem__ straight to dict.
+        """
         with open(filename, "wb") as f:
-            pickle.dump(self.__dict__, f, 2)
+            d = _make_deterministic(self.__dict__)
+            pickle.dump(d, f, 2)
 
     def load(self, filename):
         """Load the grammar tables from a pickle file."""
@@ -124,6 +137,17 @@ class Grammar(object):
         print("start", self.start)
 
 
+def _make_deterministic(top):
+    if isinstance(top, dict):
+        return collections.OrderedDict(
+            sorted(((k, _make_deterministic(v)) for k, v in top.items())))
+    if isinstance(top, list):
+        return [_make_deterministic(e) for e in top]
+    if isinstance(top, tuple):
+        return tuple(_make_deterministic(e) for e in top)
+    return top
+
+
 # Map from operator to number (since tokenize doesn't do this)
 
 opmap_raw = """
...
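
A small usage sketch of _make_deterministic() as defined above; the input dict is a made-up stand-in for the real grammar tables. Note that only dict keys are sorted: lists and tuples keep their element order and are merely rebuilt with normalized contents.

    import collections

    def _make_deterministic(top):
        if isinstance(top, dict):
            return collections.OrderedDict(
                sorted(((k, _make_deterministic(v)) for k, v in top.items())))
        if isinstance(top, list):
            return [_make_deterministic(e) for e in top]
        if isinstance(top, tuple):
            return tuple(_make_deterministic(e) for e in top)
        return top

    tables = {"b": {"y": 2, "x": 1}, "a": [3, 1]}
    print(_make_deterministic(tables))
    # OrderedDict([('a', [3, 1]), ('b', OrderedDict([('x', 1), ('y', 2)]))])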
@@ -39,7 +39,7 @@ class ParserGenerator(object):
             states = []
             for state in dfa:
                 arcs = []
-                for label, next in state.arcs.items():
+                for label, next in sorted(state.arcs.items()):
                     arcs.append((self.make_label(c, label), dfa.index(next)))
                 if state.isfinal:
                     arcs.append((0, dfa.index(state)))
@@ -52,7 +52,7 @@ class ParserGenerator(object):
     def make_first(self, c, name):
         rawfirst = self.first[name]
         first = {}
-        for label in rawfirst:
+        for label in sorted(rawfirst):
             ilabel = self.make_label(c, label)
             ##assert ilabel not in first # XXX failed on <> ... !=
             first[ilabel] = 1
@@ -192,7 +192,7 @@ class ParserGenerator(object):
                 for label, next in nfastate.arcs:
                     if label is not None:
                         addclosure(next, arcs.setdefault(label, {}))
-            for label, nfaset in arcs.items():
+            for label, nfaset in sorted(arcs.items()):
                 for st in states:
                     if st.nfaset == nfaset:
                         break
@@ -222,7 +222,7 @@ class ParserGenerator(object):
         print("Dump of DFA for", name)
         for i, state in enumerate(dfa):
             print("  State", i, state.isfinal and "(final)" or "")
-            for label, next in state.arcs.items():
+            for label, next in sorted(state.arcs.items()):
                 print("    %s -> %d" % (label, dfa.index(next)))
 
     def simplify_dfa(self, dfa):
...
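
All four sorted() calls serve one purpose: pgen numbers labels and states in visitation order, so iterating dicts in an order that depends on hashing would yield differently numbered (though equivalent) tables on each run. The principle in miniature:

    d1 = dict([("else", 2), ("if", 1)])
    d2 = dict([("if", 1), ("else", 2)])
    # Iteration order of d1 and d2 may disagree on hash-randomized builds;
    # sorted() imposes one canonical order either way.
    assert sorted(d1.items()) == sorted(d2.items()) == [("else", 2), ("if", 1)]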
@@ -11,13 +11,13 @@ from textwrap import dedent
 
 # Local imports
 from lib2to3 import pytree, refactor
-from lib2to3.pgen2 import driver
+from lib2to3.pgen2 import driver as pgen2_driver
 
 test_dir = os.path.dirname(__file__)
 proj_dir = os.path.normpath(os.path.join(test_dir, ".."))
 grammar_path = os.path.join(test_dir, "..", "Grammar.txt")
-grammar = driver.load_grammar(grammar_path)
-driver = driver.Driver(grammar, convert=pytree.convert)
+grammar = pgen2_driver.load_grammar(grammar_path)
+driver = pgen2_driver.Driver(grammar, convert=pytree.convert)
 
 def parse_string(string):
     return driver.parse_string(reformat(string), debug=True)
...
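
The alias matters because the module name driver was immediately shadowed by the Driver instance of the same name, leaving the module unreachable for anything needed later (such as _generate_pickle_name). A sketch of the old, now-avoided pattern:

    from lib2to3.pgen2 import driver
    grammar = driver.load_grammar("Grammar.txt")  # 'driver' is the module
    driver = driver.Driver(grammar)               # now it is the instance
    # driver.load_grammar(...) would fail: the instance has no such attribute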
@@ -6,8 +6,6 @@ parts of the grammar we've changed, we also make sure we can parse the
 test_grammar.py files from both Python 2 and Python 3.
 """
 
-from __future__ import with_statement
-
 # Testing imports
 from . import support
 from .support import driver, test_dir
@@ -15,12 +13,15 @@ from test.support import verbose
 
 # Python imports
 import os
+import shutil
+import subprocess
 import sys
+import tempfile
 import unittest
 import warnings
-import subprocess
 
 # Local imports
+from lib2to3.pgen2 import driver as pgen2_driver
 from lib2to3.pgen2 import tokenize
 from ..pgen2.parse import ParseError
 from lib2to3.pygram import python_symbols as syms
@@ -35,6 +36,71 @@ class TestDriver(support.TestCase):
         self.assertEqual(t.children[1].children[0].type, syms.print_stmt)
 
 
+class TestPgen2Caching(support.TestCase):
+    def test_load_grammar_from_txt_file(self):
+        pgen2_driver.load_grammar(support.grammar_path, save=False, force=True)
+
+    def test_load_grammar_from_pickle(self):
+        # Make a copy of the grammar file in a temp directory we are
+        # guaranteed to be able to write to.
+        tmpdir = tempfile.mkdtemp()
+        try:
+            grammar_copy = os.path.join(
+                    tmpdir, os.path.basename(support.grammar_path))
+            shutil.copy(support.grammar_path, grammar_copy)
+            pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)
+
+            pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
+            self.assertTrue(os.path.exists(pickle_name))
+
+            os.unlink(grammar_copy)  # Only the pickle remains...
+            pgen2_driver.load_grammar(grammar_copy, save=False, force=False)
+        finally:
+            shutil.rmtree(tmpdir)
+
+    @unittest.skipIf(sys.executable is None, 'sys.executable required')
+    def test_load_grammar_from_subprocess(self):
+        tmpdir = tempfile.mkdtemp()
+        tmpsubdir = os.path.join(tmpdir, 'subdir')
+        try:
+            os.mkdir(tmpsubdir)
+            grammar_base = os.path.basename(support.grammar_path)
+            grammar_copy = os.path.join(tmpdir, grammar_base)
+            grammar_sub_copy = os.path.join(tmpsubdir, grammar_base)
+            shutil.copy(support.grammar_path, grammar_copy)
+            shutil.copy(support.grammar_path, grammar_sub_copy)
+            pickle_name = pgen2_driver._generate_pickle_name(grammar_copy)
+            pickle_sub_name = pgen2_driver._generate_pickle_name(
+                    grammar_sub_copy)
+            self.assertNotEqual(pickle_name, pickle_sub_name)
+
+            # Generate a pickle file from this process.
+            pgen2_driver.load_grammar(grammar_copy, save=True, force=True)
+            self.assertTrue(os.path.exists(pickle_name))
+
+            # Generate a new pickle file in a subprocess with a most likely
+            # different hash randomization seed.
+            sub_env = dict(os.environ)
+            sub_env['PYTHONHASHSEED'] = 'random'
+            subprocess.check_call(
+                    [sys.executable, '-c', """
+from lib2to3.pgen2 import driver as pgen2_driver
+pgen2_driver.load_grammar(%r, save=True, force=True)
+                    """ % (grammar_sub_copy,)],
+                    env=sub_env)
+            self.assertTrue(os.path.exists(pickle_sub_name))
+
+            with open(pickle_name, 'rb') as pickle_f_1, \
+                    open(pickle_sub_name, 'rb') as pickle_f_2:
+                self.assertEqual(
+                    pickle_f_1.read(), pickle_f_2.read(),
+                    msg='Grammar caches generated using different hash seeds'
+                        ' were not identical.')
+        finally:
+            shutil.rmtree(tmpdir)
+
+
 class GrammarTest(support.TestCase):
     def validate(self, code):
         support.parse_string(code)
...
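
To exercise just the new cases, something like the following should work, assuming the lib2to3 package and its tests are importable:

    import unittest

    suite = unittest.defaultTestLoader.loadTestsFromName(
        "lib2to3.tests.test_parser.TestPgen2Caching")
    unittest.TextTestRunner(verbosity=2).run(suite)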
@@ -67,6 +67,10 @@ Core and Builtins
 
 Library
 -------
 
+- lib2to3.pgen2.driver.load_grammar() now creates a stable cache file
+  between runs given the same Grammar.txt input, regardless of the hash
+  randomization setting.
+
 - Issue #27570: Avoid zero-length memcpy() etc calls with null source
   pointers in the "ctypes" and "array" modules.
...