Commit 05dadcfb authored by Serhiy Storchaka

Issue #19858: pickletools.optimize() now aware of the MEMOIZE opcode, can
produce more compact result and no longer produces invalid output if input
data contains MEMOIZE opcodes together with PUT or BINPUT opcodes.
parent df938694
...@@ -2282,40 +2282,61 @@ def genops(pickle): ...@@ -2282,40 +2282,61 @@ def genops(pickle):
def optimize(p): def optimize(p):
'Optimize a pickle string by removing unused PUT opcodes' 'Optimize a pickle string by removing unused PUT opcodes'
not_a_put = object() put = 'PUT'
gets = { not_a_put } # set of args used by a GET opcode get = 'GET'
opcodes = [] # (startpos, stoppos, putid) oldids = set() # set of all PUT ids
newids = {} # set of ids used by a GET opcode
opcodes = [] # (op, idx) or (pos, end_pos)
proto = 0 proto = 0
protoheader = b''
for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True): for opcode, arg, pos, end_pos in _genops(p, yield_end_pos=True):
if 'PUT' in opcode.name: if 'PUT' in opcode.name:
opcodes.append((pos, end_pos, arg)) oldids.add(arg)
opcodes.append((put, arg))
elif opcode.name == 'MEMOIZE':
idx = len(oldids)
oldids.add(idx)
opcodes.append((put, idx))
elif 'FRAME' in opcode.name: elif 'FRAME' in opcode.name:
pass pass
else: elif 'GET' in opcode.name:
if 'GET' in opcode.name: if opcode.proto > proto:
gets.add(arg) proto = opcode.proto
elif opcode.name == 'PROTO': newids[arg] = None
assert pos == 0, pos opcodes.append((get, arg))
elif opcode.name == 'PROTO':
if arg > proto:
proto = arg proto = arg
opcodes.append((pos, end_pos, not_a_put)) if pos == 0:
prevpos, prevarg = pos, None protoheader = p[pos: end_pos]
else:
opcodes.append((pos, end_pos))
else:
opcodes.append((pos, end_pos))
del oldids
# Copy the opcodes except for PUTS without a corresponding GET # Copy the opcodes except for PUTS without a corresponding GET
out = io.BytesIO() out = io.BytesIO()
opcodes = iter(opcodes) # Write the PROTO header before any framing
if proto >= 2: out.write(protoheader)
# Write the PROTO header before any framing pickler = pickle._Pickler(out, proto)
start, stop, _ = next(opcodes)
out.write(p[start:stop])
buf = pickle._Framer(out.write)
if proto >= 4: if proto >= 4:
buf.start_framing() pickler.framer.start_framing()
for start, stop, putid in opcodes: idx = 0
if putid in gets: for op, arg in opcodes:
buf.commit_frame() if op is put:
buf.write(p[start:stop]) if arg not in newids:
if proto >= 4: continue
buf.end_framing() data = pickler.put(idx)
newids[arg] = idx
idx += 1
elif op is get:
data = pickler.get(newids[arg])
else:
data = p[op:arg]
pickler.framer.commit_frame()
pickler.write(data)
pickler.framer.end_framing()
return out.getvalue() return out.getvalue()
############################################################################## ##############################################################################
......
import struct
import pickle import pickle
import pickletools import pickletools
from test import support from test import support
...@@ -15,6 +16,48 @@ class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests): ...@@ -15,6 +16,48 @@ class OptimizedPickleTests(AbstractPickleTests, AbstractPickleModuleTests):
# Test relies on precise output of dumps() # Test relies on precise output of dumps()
test_pickle_to_2x = None test_pickle_to_2x = None
def test_optimize_long_binget(self):
data = [str(i) for i in range(257)]
data.append(data[-1])
for proto in range(pickle.HIGHEST_PROTOCOL + 1):
pickled = pickle.dumps(data, proto)
unpickled = pickle.loads(pickled)
self.assertEqual(unpickled, data)
self.assertIs(unpickled[-1], unpickled[-2])
pickled2 = pickletools.optimize(pickled)
unpickled2 = pickle.loads(pickled2)
self.assertEqual(unpickled2, data)
self.assertIs(unpickled2[-1], unpickled2[-2])
self.assertNotIn(pickle.LONG_BINGET, pickled2)
self.assertNotIn(pickle.LONG_BINPUT, pickled2)
def test_optimize_binput_and_memoize(self):
pickled = (b'\x80\x04\x95\x15\x00\x00\x00\x00\x00\x00\x00'
b']\x94(\x8c\x04spamq\x01\x8c\x03ham\x94h\x02e.')
# 0: \x80 PROTO 4
# 2: \x95 FRAME 21
# 11: ] EMPTY_LIST
# 12: \x94 MEMOIZE
# 13: ( MARK
# 14: \x8c SHORT_BINUNICODE 'spam'
# 20: q BINPUT 1
# 22: \x8c SHORT_BINUNICODE 'ham'
# 27: \x94 MEMOIZE
# 28: h BINGET 2
# 30: e APPENDS (MARK at 13)
# 31: . STOP
self.assertIn(pickle.BINPUT, pickled)
unpickled = pickle.loads(pickled)
self.assertEqual(unpickled, ['spam', 'ham', 'ham'])
self.assertIs(unpickled[1], unpickled[2])
pickled2 = pickletools.optimize(pickled)
unpickled2 = pickle.loads(pickled2)
self.assertEqual(unpickled2, ['spam', 'ham', 'ham'])
self.assertIs(unpickled2[1], unpickled2[2])
self.assertNotIn(pickle.BINPUT, pickled2)
def test_main(): def test_main():
support.run_unittest(OptimizedPickleTests) support.run_unittest(OptimizedPickleTests)
......
...@@ -41,6 +41,10 @@ Core and Builtins ...@@ -41,6 +41,10 @@ Core and Builtins
Library Library
------- -------
- Issue #19858: pickletools.optimize() now aware of the MEMOIZE opcode, can
produce more compact result and no longer produces invalid output if input
data contains MEMOIZE opcodes together with PUT or BINPUT opcodes.
- Issue #22095: Fixed HTTPConnection.set_tunnel with default port. The port - Issue #22095: Fixed HTTPConnection.set_tunnel with default port. The port
value in the host header was set to "None". Patch by Demian Brecht. value in the host header was set to "None". Patch by Demian Brecht.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment