Commit 04248a8d authored by Antoine Pitrou's avatar Antoine Pitrou

Issue #3873: Speed up unpickling from file objects which have a peek()

method.
parent b0182c8c
...@@ -30,6 +30,21 @@ def count_opcode(code, pickle): ...@@ -30,6 +30,21 @@ def count_opcode(code, pickle):
n += 1 n += 1
return n return n
class UnseekableIO(io.BytesIO):
def peek(self, *args):
raise NotImplementedError
def seekable(self):
return False
def seek(self, *args):
raise io.UnsupportedOperation
def tell(self):
raise io.UnsupportedOperation
# We can't very well test the extension registry without putting known stuff # We can't very well test the extension registry without putting known stuff
# in it, but we have to be careful to restore its original state. Code # in it, but we have to be careful to restore its original state. Code
# should do this: # should do this:
...@@ -1072,9 +1087,10 @@ class AbstractPickleTests(unittest.TestCase): ...@@ -1072,9 +1087,10 @@ class AbstractPickleTests(unittest.TestCase):
# Test the correctness of internal buffering routines when handling # Test the correctness of internal buffering routines when handling
# large data. # large data.
for proto in protocols: for proto in protocols:
data = (1, b'x' * (256 * 1024)) data = (1, min, b'xy' * (30 * 1024), len)
dumped = self.dumps(data, proto) dumped = self.dumps(data, proto)
loaded = self.loads(dumped) loaded = self.loads(dumped)
self.assertEqual(len(loaded), len(data))
self.assertEqual(loaded, data) self.assertEqual(loaded, data)
...@@ -1373,6 +1389,31 @@ class AbstractPicklerUnpicklerObjectTests(unittest.TestCase): ...@@ -1373,6 +1389,31 @@ class AbstractPicklerUnpicklerObjectTests(unittest.TestCase):
f.seek(0) f.seek(0)
self.assertEqual(unpickler.load(), data2) self.assertEqual(unpickler.load(), data2)
def _check_multiple_unpicklings(self, ioclass):
for proto in protocols:
data1 = [(x, str(x)) for x in range(2000)] + [b"abcde", len]
f = ioclass()
pickler = self.pickler_class(f, protocol=proto)
pickler.dump(data1)
pickled = f.getvalue()
N = 5
f = ioclass(pickled * N)
unpickler = self.unpickler_class(f)
for i in range(N):
if f.seekable():
pos = f.tell()
self.assertEqual(unpickler.load(), data1)
if f.seekable():
self.assertEqual(f.tell(), pos + len(pickled))
self.assertRaises(EOFError, unpickler.load)
def test_multiple_unpicklings_seekable(self):
self._check_multiple_unpicklings(io.BytesIO)
def test_multiple_unpicklings_unseekable(self):
self._check_multiple_unpicklings(UnseekableIO)
if __name__ == "__main__": if __name__ == "__main__":
# Print some stuff that can be used to rewrite DATA{0,1,2} # Print some stuff that can be used to rewrite DATA{0,1,2}
......
...@@ -13,6 +13,9 @@ Core and Builtins ...@@ -13,6 +13,9 @@ Core and Builtins
Library Library
------- -------
- Issue #3873: Speed up unpickling from file objects which have a peek()
method.
- Issue #10075: Add a session_stats() method to SSLContext objects. - Issue #10075: Add a session_stats() method to SSLContext objects.
- Issue #9948: Fixed problem of losing filename case information. - Issue #9948: Fixed problem of losing filename case information.
......
...@@ -101,6 +101,9 @@ enum { ...@@ -101,6 +101,9 @@ enum {
/* Maximum size of the write buffer of Pickler when pickling to a /* Maximum size of the write buffer of Pickler when pickling to a
stream. This is ignored for in-memory pickling. */ stream. This is ignored for in-memory pickling. */
MAX_WRITE_BUF_SIZE = 64 * 1024, MAX_WRITE_BUF_SIZE = 64 * 1024,
/* Prefetch size when unpickling (disabled on unpeekable streams) */
PREFETCH = 8192 * 16,
}; };
/* Exception classes for pickle. These should override the ones defined in /* Exception classes for pickle. These should override the ones defined in
...@@ -355,8 +358,10 @@ typedef struct UnpicklerObject { ...@@ -355,8 +358,10 @@ typedef struct UnpicklerObject {
char *input_line; char *input_line;
Py_ssize_t input_len; Py_ssize_t input_len;
Py_ssize_t next_read_idx; Py_ssize_t next_read_idx;
Py_ssize_t prefetched_idx; /* index of first prefetched byte */
PyObject *read; /* read() method of the input stream. */ PyObject *read; /* read() method of the input stream. */
PyObject *readline; /* readline() method of the input stream. */ PyObject *readline; /* readline() method of the input stream. */
PyObject *peek; /* peek() method of the input stream, or NULL */
char *encoding; /* Name of the encoding to be used for char *encoding; /* Name of the encoding to be used for
decoding strings pickled using Python decoding strings pickled using Python
...@@ -859,9 +864,28 @@ _Unpickler_SetStringInput(UnpicklerObject *self, PyObject *input) ...@@ -859,9 +864,28 @@ _Unpickler_SetStringInput(UnpicklerObject *self, PyObject *input)
self->input_buffer = self->buffer.buf; self->input_buffer = self->buffer.buf;
self->input_len = self->buffer.len; self->input_len = self->buffer.len;
self->next_read_idx = 0; self->next_read_idx = 0;
self->prefetched_idx = self->input_len;
return self->input_len; return self->input_len;
} }
static int
_Unpickler_SkipConsumed(UnpicklerObject *self)
{
Py_ssize_t consumed = self->next_read_idx - self->prefetched_idx;
if (consumed > 0) {
PyObject *r;
assert(self->peek); /* otherwise we did something wrong */
/* This makes an useless copy... */
r = PyObject_CallFunction(self->read, "n", consumed);
if (r == NULL)
return -1;
Py_DECREF(r);
self->prefetched_idx = self->next_read_idx;
}
return 0;
}
static const Py_ssize_t READ_WHOLE_LINE = -1; static const Py_ssize_t READ_WHOLE_LINE = -1;
/* If reading from a file, we need to only pull the bytes we need, since there /* If reading from a file, we need to only pull the bytes we need, since there
...@@ -882,10 +906,12 @@ static Py_ssize_t ...@@ -882,10 +906,12 @@ static Py_ssize_t
_Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n) _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n)
{ {
PyObject *data; PyObject *data;
Py_ssize_t read_size; Py_ssize_t read_size, prefetched_size = 0;
assert(self->read != NULL); assert(self->read != NULL);
assert(self->next_read_idx == 0);
if (_Unpickler_SkipConsumed(self) < 0)
return -1;
if (n == READ_WHOLE_LINE) if (n == READ_WHOLE_LINE)
data = PyObject_Call(self->readline, empty_tuple, NULL); data = PyObject_Call(self->readline, empty_tuple, NULL);
...@@ -895,13 +921,41 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n) ...@@ -895,13 +921,41 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n)
return -1; return -1;
data = _Unpickler_FastCall(self, self->read, len); data = _Unpickler_FastCall(self, self->read, len);
} }
if (data == NULL) if (data == NULL)
return -1; return -1;
read_size = _Unpickler_SetStringInput(self, data); /* Prefetch some data without advancing the file pointer, if possible */
self->input_len = 0; if (self->peek) {
PyObject *len, *prefetched;
len = PyLong_FromSsize_t(PREFETCH);
if (len == NULL) {
Py_DECREF(data);
return -1;
}
prefetched = _Unpickler_FastCall(self, self->peek, len);
if (prefetched == NULL) {
if (PyErr_ExceptionMatches(PyExc_NotImplementedError)) {
/* peek() is probably not supported by the given file object */
PyErr_Clear();
Py_CLEAR(self->peek);
}
else {
Py_DECREF(data);
return -1;
}
}
else {
assert(PyBytes_Check(prefetched));
prefetched_size = PyBytes_GET_SIZE(prefetched);
PyBytes_ConcatAndDel(&data, prefetched);
if (data == NULL)
return -1;
}
}
read_size = _Unpickler_SetStringInput(self, data) - prefetched_size;
Py_DECREF(data); Py_DECREF(data);
self->prefetched_idx = read_size;
return read_size; return read_size;
} }
...@@ -921,30 +975,31 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n) ...@@ -921,30 +975,31 @@ _Unpickler_ReadFromFile(UnpicklerObject *self, Py_ssize_t n)
static Py_ssize_t static Py_ssize_t
_Unpickler_Read(UnpicklerObject *self, char **s, Py_ssize_t n) _Unpickler_Read(UnpicklerObject *self, char **s, Py_ssize_t n)
{ {
Py_ssize_t num_read;
if (n == 0) { if (n == 0) {
*s = NULL; *s = NULL;
return 0; return 0;
} }
/* This condition will always be true if self->read. */ if (self->next_read_idx + n <= self->input_len) {
if (self->next_read_idx + n > self->input_len) { *s = self->input_buffer + self->next_read_idx;
if (self->read) { self->next_read_idx += n;
Py_ssize_t num_read; return n;
assert(self->next_read_idx == self->input_len); }
num_read = _Unpickler_ReadFromFile(self, n); if (!self->read) {
if (n < 0) PyErr_Format(PyExc_EOFError, "Ran out of input");
return -1; return -1;
if (num_read == n) { }
*s = self->input_buffer; num_read = _Unpickler_ReadFromFile(self, n);
return num_read; if (num_read < 0)
} return -1;
} if (num_read < n) {
PyErr_Format(PyExc_EOFError, "Ran out of input"); PyErr_Format(PyExc_EOFError, "Ran out of input");
return -1; return -1;
} }
assert(self->read == NULL); *s = self->input_buffer;
*s = self->input_buffer + self->next_read_idx; self->next_read_idx = n;
self->next_read_idx += n;
return n; return n;
} }
...@@ -972,9 +1027,7 @@ _Unpickler_Readline(UnpicklerObject *self, char **result) ...@@ -972,9 +1027,7 @@ _Unpickler_Readline(UnpicklerObject *self, char **result)
{ {
Py_ssize_t i, num_read; Py_ssize_t i, num_read;
/* This loop will never be entered if self->read is not NULL. */
for (i = self->next_read_idx; i < self->input_len; i++) { for (i = self->next_read_idx; i < self->input_len; i++) {
assert(self->read == NULL);
if (self->input_buffer[i] == '\n') { if (self->input_buffer[i] == '\n') {
char *line_start = self->input_buffer + self->next_read_idx; char *line_start = self->input_buffer + self->next_read_idx;
num_read = i - self->next_read_idx + 1; num_read = i - self->next_read_idx + 1;
...@@ -983,11 +1036,11 @@ _Unpickler_Readline(UnpicklerObject *self, char **result) ...@@ -983,11 +1036,11 @@ _Unpickler_Readline(UnpicklerObject *self, char **result)
} }
} }
if (self->read) { if (self->read) {
assert(self->next_read_idx == self->input_len);
num_read = _Unpickler_ReadFromFile(self, READ_WHOLE_LINE); num_read = _Unpickler_ReadFromFile(self, READ_WHOLE_LINE);
if (num_read < 0) if (num_read < 0)
return -1; return -1;
*result = self->input_buffer; *result = self->input_buffer;
self->next_read_idx = num_read;
return num_read; return num_read;
} }
...@@ -1106,8 +1159,10 @@ _Unpickler_New(void) ...@@ -1106,8 +1159,10 @@ _Unpickler_New(void)
self->input_line = NULL; self->input_line = NULL;
self->input_len = 0; self->input_len = 0;
self->next_read_idx = 0; self->next_read_idx = 0;
self->prefetched_idx = 0;
self->read = NULL; self->read = NULL;
self->readline = NULL; self->readline = NULL;
self->peek = NULL;
self->encoding = NULL; self->encoding = NULL;
self->errors = NULL; self->errors = NULL;
self->marks = NULL; self->marks = NULL;
...@@ -1124,6 +1179,13 @@ _Unpickler_New(void) ...@@ -1124,6 +1179,13 @@ _Unpickler_New(void)
static int static int
_Unpickler_SetInputStream(UnpicklerObject *self, PyObject *file) _Unpickler_SetInputStream(UnpicklerObject *self, PyObject *file)
{ {
self->peek = PyObject_GetAttrString(file, "peek");
if (self->peek == NULL) {
if (PyErr_ExceptionMatches(PyExc_AttributeError))
PyErr_Clear();
else
return -1;
}
self->read = PyObject_GetAttrString(file, "read"); self->read = PyObject_GetAttrString(file, "read");
self->readline = PyObject_GetAttrString(file, "readline"); self->readline = PyObject_GetAttrString(file, "readline");
if (self->readline == NULL || self->read == NULL) { if (self->readline == NULL || self->read == NULL) {
...@@ -1132,6 +1194,7 @@ _Unpickler_SetInputStream(UnpicklerObject *self, PyObject *file) ...@@ -1132,6 +1194,7 @@ _Unpickler_SetInputStream(UnpicklerObject *self, PyObject *file)
"file must have 'read' and 'readline' attributes"); "file must have 'read' and 'readline' attributes");
Py_CLEAR(self->read); Py_CLEAR(self->read);
Py_CLEAR(self->readline); Py_CLEAR(self->readline);
Py_CLEAR(self->peek);
return -1; return -1;
} }
return 0; return 0;
...@@ -5207,6 +5270,9 @@ load(UnpicklerObject *self) ...@@ -5207,6 +5270,9 @@ load(UnpicklerObject *self)
break; /* and we are done! */ break; /* and we are done! */
} }
if (_Unpickler_SkipConsumed(self) < 0)
return NULL;
/* XXX: It is not clear what this is actually for. */ /* XXX: It is not clear what this is actually for. */
if ((err = PyErr_Occurred())) { if ((err = PyErr_Occurred())) {
if (err == PyExc_EOFError) { if (err == PyExc_EOFError) {
...@@ -5356,6 +5422,7 @@ Unpickler_dealloc(UnpicklerObject *self) ...@@ -5356,6 +5422,7 @@ Unpickler_dealloc(UnpicklerObject *self)
PyObject_GC_UnTrack((PyObject *)self); PyObject_GC_UnTrack((PyObject *)self);
Py_XDECREF(self->readline); Py_XDECREF(self->readline);
Py_XDECREF(self->read); Py_XDECREF(self->read);
Py_XDECREF(self->peek);
Py_XDECREF(self->stack); Py_XDECREF(self->stack);
Py_XDECREF(self->pers_func); Py_XDECREF(self->pers_func);
Py_XDECREF(self->arg); Py_XDECREF(self->arg);
...@@ -5378,6 +5445,7 @@ Unpickler_traverse(UnpicklerObject *self, visitproc visit, void *arg) ...@@ -5378,6 +5445,7 @@ Unpickler_traverse(UnpicklerObject *self, visitproc visit, void *arg)
{ {
Py_VISIT(self->readline); Py_VISIT(self->readline);
Py_VISIT(self->read); Py_VISIT(self->read);
Py_VISIT(self->peek);
Py_VISIT(self->stack); Py_VISIT(self->stack);
Py_VISIT(self->pers_func); Py_VISIT(self->pers_func);
Py_VISIT(self->arg); Py_VISIT(self->arg);
...@@ -5389,6 +5457,7 @@ Unpickler_clear(UnpicklerObject *self) ...@@ -5389,6 +5457,7 @@ Unpickler_clear(UnpicklerObject *self)
{ {
Py_CLEAR(self->readline); Py_CLEAR(self->readline);
Py_CLEAR(self->read); Py_CLEAR(self->read);
Py_CLEAR(self->peek);
Py_CLEAR(self->stack); Py_CLEAR(self->stack);
Py_CLEAR(self->pers_func); Py_CLEAR(self->pers_func);
Py_CLEAR(self->arg); Py_CLEAR(self->arg);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment