Commit 5b76bdba authored by Serhiy Storchaka's avatar Serhiy Storchaka Committed by GitHub

bpo-31993: Do not use memoryview when pickle large strings. (#5154)

PyMemoryView_FromMemory() created a memoryview referring to
the internal data of the string.  When the string is destroyed
the memoryview becomes a reference to freed memory.
parent f3031b8a
...@@ -2169,67 +2169,67 @@ class AbstractPickleTests(unittest.TestCase): ...@@ -2169,67 +2169,67 @@ class AbstractPickleTests(unittest.TestCase):
def test_framed_write_sizes_with_delayed_writer(self): def test_framed_write_sizes_with_delayed_writer(self):
class ChunkAccumulator: class ChunkAccumulator:
"""Accumulate pickler output in a list of raw chunks.""" """Accumulate pickler output in a list of raw chunks."""
def __init__(self): def __init__(self):
self.chunks = [] self.chunks = []
def write(self, chunk): def write(self, chunk):
self.chunks.append(chunk) self.chunks.append(chunk)
def concatenate_chunks(self): def concatenate_chunks(self):
# Some chunks can be memoryview instances, we need to convert return b"".join(self.chunks)
# them to bytes to be able to call join
return b"".join([c.tobytes() if hasattr(c, 'tobytes') else c
for c in self.chunks])
small_objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)}) for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
objects = [(str(i).encode('ascii'), i % 42, {'i': str(i)})
for i in range(int(1e4))] for i in range(int(1e4))]
# Add a large unique ASCII string
objects.append('0123456789abcdef' *
(self.FRAME_SIZE_TARGET // 16 + 1))
for proto in range(4, pickle.HIGHEST_PROTOCOL + 1):
# Protocol 4 packs groups of small objects into frames and issues # Protocol 4 packs groups of small objects into frames and issues
# calls to write only once or twice per frame: # calls to write only once or twice per frame:
# The C pickler issues one call to write per-frame (header and # The C pickler issues one call to write per-frame (header and
# contents) while Python pickler issues two calls to write: one for # contents) while Python pickler issues two calls to write: one for
# the frame header and one for the frame binary contents. # the frame header and one for the frame binary contents.
writer = ChunkAccumulator() writer = ChunkAccumulator()
self.pickler(writer, proto).dump(small_objects) self.pickler(writer, proto).dump(objects)
# Actually read the binary content of the chunks after the end # Actually read the binary content of the chunks after the end
# of the call to dump: ant memoryview passed to write should not # of the call to dump: any memoryview passed to write should not
# be released otherwise this delayed access would not be possible. # be released otherwise this delayed access would not be possible.
pickled = writer.concatenate_chunks() pickled = writer.concatenate_chunks()
reconstructed = self.loads(pickled) reconstructed = self.loads(pickled)
self.assertEqual(reconstructed, small_objects) self.assertEqual(reconstructed, objects)
self.assertGreater(len(writer.chunks), 1) self.assertGreater(len(writer.chunks), 1)
n_frames, remainder = divmod(len(pickled), self.FRAME_SIZE_TARGET) # memoryviews should own the memory.
if remainder > 0: del objects
n_frames += 1 support.gc_collect()
self.assertEqual(writer.concatenate_chunks(), pickled)
n_frames = (len(pickled) - 1) // self.FRAME_SIZE_TARGET + 1
# There should be at least one call to write per frame # There should be at least one call to write per frame
self.assertGreaterEqual(len(writer.chunks), n_frames) self.assertGreaterEqual(len(writer.chunks), n_frames)
# but not too many either: there can be one for the proto, # but not too many either: there can be one for the proto,
# one per-frame header and one per frame for the actual contents. # one per-frame header, one per frame for the actual contents,
self.assertGreaterEqual(2 * n_frames + 1, len(writer.chunks)) # and two for the header.
self.assertLessEqual(len(writer.chunks), 2 * n_frames + 3)
chunk_sizes = [len(c) for c in writer.chunks[:-1]] chunk_sizes = [len(c) for c in writer.chunks]
large_sizes = [s for s in chunk_sizes large_sizes = [s for s in chunk_sizes
if s >= self.FRAME_SIZE_TARGET] if s >= self.FRAME_SIZE_TARGET]
small_sizes = [s for s in chunk_sizes medium_sizes = [s for s in chunk_sizes
if s < self.FRAME_SIZE_TARGET] if 9 < s < self.FRAME_SIZE_TARGET]
small_sizes = [s for s in chunk_sizes if s <= 9]
# Large chunks should not be too large: # Large chunks should not be too large:
for chunk_size in large_sizes: for chunk_size in large_sizes:
self.assertGreater(2 * self.FRAME_SIZE_TARGET, chunk_size) self.assertLess(chunk_size, 2 * self.FRAME_SIZE_TARGET,
chunk_sizes)
last_chunk_size = len(writer.chunks[-1]) # There shouldn't be too many small chunks: the protocol header,
self.assertGreater(2 * self.FRAME_SIZE_TARGET, last_chunk_size) # the frame headers and the large string headers are written
# in small chunks.
# Small chunks (if any) should be very small self.assertLessEqual(len(small_sizes),
# (only proto and frame headers) len(large_sizes) + len(medium_sizes) + 3,
for chunk_size in small_sizes: chunk_sizes)
self.assertGreaterEqual(9, chunk_size)
def test_nested_names(self): def test_nested_names(self):
global Nested global Nested
......
...@@ -693,9 +693,9 @@ ctypes._aix.find_library() Patch by: Michael Felt ...@@ -693,9 +693,9 @@ ctypes._aix.find_library() Patch by: Michael Felt
.. nonce: -OMNg8 .. nonce: -OMNg8
.. section: Library .. section: Library
The picklers no longer allocate temporary memory when dumping large The pickler now uses less memory when serializing large bytes and str
``bytes`` and ``str`` objects into a file object. Instead the data is objects into a file. Pickles created with protocol 4 will require less
directly streamed into the underlying file object. memory for unpickling large bytes and str objects.
.. ..
......
...@@ -2184,8 +2184,9 @@ _Pickler_write_bytes(PicklerObject *self, ...@@ -2184,8 +2184,9 @@ _Pickler_write_bytes(PicklerObject *self,
/* Stream write the payload into the file without going through the /* Stream write the payload into the file without going through the
output buffer. */ output buffer. */
if (payload == NULL) { if (payload == NULL) {
payload = mem = PyMemoryView_FromMemory((char *) data, data_size, /* TODO: It would be better to use a memoryview with a linked
PyBUF_READ); original string if this is possible. */
payload = mem = PyBytes_FromStringAndSize(data, data_size);
if (payload == NULL) { if (payload == NULL) {
return -1; return -1;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment