Commit fd8a838d authored by Nadeem Vawda's avatar Nadeem Vawda

Issue #14684: Add support for predefined compression dictionaries to the zlib module.

Original patch by Sam Rushing.
parent 50b0a365
......@@ -58,12 +58,19 @@ The available exception and functions in this module are:
exception if any error occurs.
.. function:: compressobj([level])
.. function:: compressobj([level[, method[, wbits[, memlevel[, strategy[, zdict]]]]]])
Returns a compression object, to be used for compressing data streams that won't
fit into memory at once. *level* is an integer from ``1`` to ``9`` controlling
the level of compression; ``1`` is fastest and produces the least compression,
``9`` is slowest and produces the most. The default value is ``6``.
fit into memory at once.
*level* is an integer from ``1`` to ``9`` controlling the level of
compression; ``1`` is fastest and produces the least compression, ``9`` is
slowest and produces the most. The default value is ``6``.
*zdict* is a predefined compression dictionary. This is a sequence of bytes
(such as a :class:`bytes` object) containing subsequences that are expected
to occur frequently in the data that is to be compressed. Those subsequences
that are expected to be most common should come at the end of the dictionary.
.. function:: crc32(data[, value])
......@@ -114,11 +121,21 @@ The available exception and functions in this module are:
to :c:func:`malloc`. The default size is 16384.
.. function:: decompressobj([wbits])
.. function:: decompressobj([wbits[, zdict]])
Returns a decompression object, to be used for decompressing data streams that
won't fit into memory at once. The *wbits* parameter controls the size of the
window buffer.
won't fit into memory at once.
The *wbits* parameter controls the size of the window buffer.
The *zdict* parameter specifies a predefined compression dictionary. If
provided, this must be the same dictionary as was used by the compressor that
produced the data that is to be decompressed.
.. note::
If *zdict* is a mutable object (such as a :class:`bytearray`), you must not
modify its contents between the call to :func:`decompressobj` and the first
call to the decompressor's ``decompress()`` method.
Compression objects support the following methods:
......
......@@ -425,6 +425,36 @@ class CompressObjectTestCase(BaseCompressTestCase, unittest.TestCase):
dco = zlib.decompressobj()
self.assertEqual(dco.flush(), b"") # Returns nothing
def test_dictionary(self):
h = HAMLET_SCENE
# build a simulated dictionary out of the words in HAMLET
words = h.split()
random.shuffle(words)
zdict = b''.join(words)
# use it to compress HAMLET
co = zlib.compressobj(zdict=zdict)
cd = co.compress(h) + co.flush()
# verify that it will decompress with the dictionary
dco = zlib.decompressobj(zdict=zdict)
self.assertEqual(dco.decompress(cd) + dco.flush(), h)
# verify that it fails when not given the dictionary
dco = zlib.decompressobj()
self.assertRaises(zlib.error, dco.decompress, cd)
def test_dictionary_streaming(self):
# this is simulating the needs of SPDY to be able to reuse the same
# stream object (with its compression state) between sets of compressed
# headers.
co = zlib.compressobj(zdict=HAMLET_SCENE)
do = zlib.decompressobj(zdict=HAMLET_SCENE)
piece = HAMLET_SCENE[1000:1500]
d0 = co.compress(piece) + co.flush(zlib.Z_SYNC_FLUSH)
d1 = co.compress(piece[100:]) + co.flush(zlib.Z_SYNC_FLUSH)
d2 = co.compress(piece[:-100]) + co.flush(zlib.Z_SYNC_FLUSH)
self.assertEqual(do.decompress(d0), piece)
self.assertEqual(do.decompress(d1), piece[100:])
self.assertEqual(do.decompress(d2), piece[:-100])
def test_decompress_incomplete_stream(self):
# This is 'foo', deflated
x = b'x\x9cK\xcb\xcf\x07\x00\x02\x82\x01E'
......
......@@ -34,6 +34,9 @@ Core and Builtins
Library
-------
- Issue #14684: zlib.compressobj() and zlib.decompressobj() now support the use
of predefined compression dictionaries. Original patch by Sam Rushing.
- Fix GzipFile's handling of filenames given as bytes objects.
- Issue #14772: Return destination values from some shutil functions.
......
......@@ -45,6 +45,7 @@ typedef struct
PyObject *unconsumed_tail;
char eof;
int is_initialised;
PyObject *zdict;
#ifdef WITH_THREAD
PyThread_type_lock lock;
#endif
......@@ -80,14 +81,21 @@ zlib_error(z_stream zst, int err, char *msg)
}
PyDoc_STRVAR(compressobj__doc__,
"compressobj([level]) -- Return a compressor object.\n"
"compressobj([level[, method[, wbits[, memlevel[, strategy[, zdict]]]]]])\n"
" -- Return a compressor object.\n"
"\n"
"Optional arg level is the compression level, in 1-9.");
"Optional arg level is the compression level, in 1-9.\n"
"\n"
"Optional arg zdict is the predefined compression dictionary - a sequence of\n"
"bytes containing subsequences that are likely to occur in the input data.");
PyDoc_STRVAR(decompressobj__doc__,
"decompressobj([wbits]) -- Return a decompressor object.\n"
"decompressobj([wbits[, zdict]]) -- Return a decompressor object.\n"
"\n"
"Optional arg wbits is the window buffer size.\n"
"\n"
"Optional arg wbits is the window buffer size.");
"Optional arg zdict is the predefined compression dictionary. This must be\n"
"the same dictionary as used by the compressor that produced the input data.");
static compobject *
newcompobject(PyTypeObject *type)
......@@ -98,6 +106,7 @@ newcompobject(PyTypeObject *type)
return NULL;
self->eof = 0;
self->is_initialised = 0;
self->zdict = NULL;
self->unused_data = PyBytes_FromStringAndSize("", 0);
if (self->unused_data == NULL) {
Py_DECREF(self);
......@@ -316,19 +325,24 @@ PyZlib_decompress(PyObject *self, PyObject *args)
}
static PyObject *
PyZlib_compressobj(PyObject *selfptr, PyObject *args)
PyZlib_compressobj(PyObject *selfptr, PyObject *args, PyObject *kwargs)
{
compobject *self;
int level=Z_DEFAULT_COMPRESSION, method=DEFLATED;
int wbits=MAX_WBITS, memLevel=DEF_MEM_LEVEL, strategy=0, err;
if (!PyArg_ParseTuple(args, "|iiiii:compressobj", &level, &method, &wbits,
&memLevel, &strategy))
Py_buffer zdict;
static char *kwlist[] = {"level", "method", "wbits",
"memLevel", "strategy", "zdict", NULL};
zdict.buf = NULL; /* Sentinel, so we can tell whether zdict was supplied. */
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iiiiiy*:compressobj",
kwlist, &level, &method, &wbits,
&memLevel, &strategy, &zdict))
return NULL;
self = newcompobject(&Comptype);
if (self==NULL)
return(NULL);
goto error;
self->zst.zalloc = (alloc_func)NULL;
self->zst.zfree = (free_func)Z_NULL;
self->zst.next_in = NULL;
......@@ -337,30 +351,58 @@ PyZlib_compressobj(PyObject *selfptr, PyObject *args)
switch(err) {
case (Z_OK):
self->is_initialised = 1;
return (PyObject*)self;
if (zdict.buf == NULL) {
goto success;
} else {
err = deflateSetDictionary(&self->zst, zdict.buf, zdict.len);
switch (err) {
case (Z_OK):
goto success;
case (Z_STREAM_ERROR):
PyErr_SetString(PyExc_ValueError, "Invalid dictionary");
goto error;
default:
PyErr_SetString(PyExc_ValueError, "deflateSetDictionary()");
goto error;
}
}
case (Z_MEM_ERROR):
Py_DECREF(self);
PyErr_SetString(PyExc_MemoryError,
"Can't allocate memory for compression object");
return NULL;
goto error;
case(Z_STREAM_ERROR):
Py_DECREF(self);
PyErr_SetString(PyExc_ValueError, "Invalid initialization option");
return NULL;
goto error;
default:
zlib_error(self->zst, err, "while creating compression object");
Py_DECREF(self);
return NULL;
goto error;
}
error:
Py_XDECREF(self);
self = NULL;
success:
if (zdict.buf != NULL)
PyBuffer_Release(&zdict);
return (PyObject*)self;
}
static PyObject *
PyZlib_decompressobj(PyObject *selfptr, PyObject *args)
PyZlib_decompressobj(PyObject *selfptr, PyObject *args, PyObject *kwargs)
{
static char *kwlist[] = {"wbits", "zdict", NULL};
int wbits=DEF_WBITS, err;
compobject *self;
if (!PyArg_ParseTuple(args, "|i:decompressobj", &wbits))
PyObject *zdict=NULL;
if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO:decompressobj",
kwlist, &wbits, &zdict))
return NULL;
if (zdict != NULL && !PyObject_CheckBuffer(zdict)) {
PyErr_SetString(PyExc_TypeError,
"zdict argument must support the buffer protocol");
return NULL;
}
self = newcompobject(&Decomptype);
if (self == NULL)
......@@ -369,6 +411,10 @@ PyZlib_decompressobj(PyObject *selfptr, PyObject *args)
self->zst.zfree = (free_func)Z_NULL;
self->zst.next_in = NULL;
self->zst.avail_in = 0;
if (zdict != NULL) {
Py_INCREF(zdict);
self->zdict = zdict;
}
err = inflateInit2(&self->zst, wbits);
switch(err) {
case (Z_OK):
......@@ -398,6 +444,7 @@ Dealloc(compobject *self)
#endif
Py_XDECREF(self->unused_data);
Py_XDECREF(self->unconsumed_tail);
Py_XDECREF(self->zdict);
PyObject_Del(self);
}
......@@ -557,6 +604,27 @@ PyZlib_objdecompress(compobject *self, PyObject *args)
err = inflate(&(self->zst), Z_SYNC_FLUSH);
Py_END_ALLOW_THREADS
if (err == Z_NEED_DICT && self->zdict != NULL) {
Py_buffer zdict_buf;
if (PyObject_GetBuffer(self->zdict, &zdict_buf, PyBUF_SIMPLE) == -1) {
Py_DECREF(RetVal);
RetVal = NULL;
goto error;
}
err = inflateSetDictionary(&(self->zst), zdict_buf.buf, zdict_buf.len);
PyBuffer_Release(&zdict_buf);
if (err != Z_OK) {
zlib_error(self->zst, err, "while decompressing data");
Py_DECREF(RetVal);
RetVal = NULL;
goto error;
}
/* repeat the call to inflate! */
Py_BEGIN_ALLOW_THREADS
err = inflate(&(self->zst), Z_SYNC_FLUSH);
Py_END_ALLOW_THREADS
}
/* While Z_OK and the output buffer is full, there might be more output.
So extend the output buffer and try again.
*/
......@@ -770,10 +838,13 @@ PyZlib_copy(compobject *self)
}
Py_INCREF(self->unused_data);
Py_INCREF(self->unconsumed_tail);
Py_XINCREF(self->zdict);
Py_XDECREF(retval->unused_data);
Py_XDECREF(retval->unconsumed_tail);
Py_XDECREF(retval->zdict);
retval->unused_data = self->unused_data;
retval->unconsumed_tail = self->unconsumed_tail;
retval->zdict = self->zdict;
retval->eof = self->eof;
/* Mark it as being initialized */
......@@ -822,10 +893,13 @@ PyZlib_uncopy(compobject *self)
Py_INCREF(self->unused_data);
Py_INCREF(self->unconsumed_tail);
Py_XINCREF(self->zdict);
Py_XDECREF(retval->unused_data);
Py_XDECREF(retval->unconsumed_tail);
Py_XDECREF(retval->zdict);
retval->unused_data = self->unused_data;
retval->unconsumed_tail = self->unconsumed_tail;
retval->zdict = self->zdict;
retval->eof = self->eof;
/* Mark it as being initialized */
......@@ -1032,13 +1106,13 @@ static PyMethodDef zlib_methods[] =
adler32__doc__},
{"compress", (PyCFunction)PyZlib_compress, METH_VARARGS,
compress__doc__},
{"compressobj", (PyCFunction)PyZlib_compressobj, METH_VARARGS,
{"compressobj", (PyCFunction)PyZlib_compressobj, METH_VARARGS|METH_KEYWORDS,
compressobj__doc__},
{"crc32", (PyCFunction)PyZlib_crc32, METH_VARARGS,
crc32__doc__},
{"decompress", (PyCFunction)PyZlib_decompress, METH_VARARGS,
decompress__doc__},
{"decompressobj", (PyCFunction)PyZlib_decompressobj, METH_VARARGS,
{"decompressobj", (PyCFunction)PyZlib_decompressobj, METH_VARARGS|METH_KEYWORDS,
decompressobj__doc__},
{NULL, NULL}
};
......@@ -1112,10 +1186,10 @@ PyDoc_STRVAR(zlib_module_documentation,
"\n"
"adler32(string[, start]) -- Compute an Adler-32 checksum.\n"
"compress(string[, level]) -- Compress string, with compression level in 1-9.\n"
"compressobj([level]) -- Return a compressor object.\n"
"compressobj([level[, ...]]) -- Return a compressor object.\n"
"crc32(string[, start]) -- Compute a CRC-32 checksum.\n"
"decompress(string,[wbits],[bufsize]) -- Decompresses a compressed string.\n"
"decompressobj([wbits]) -- Return a decompressor object.\n"
"decompressobj([wbits[, zdict]]]) -- Return a decompressor object.\n"
"\n"
"'wbits' is window buffer size.\n"
"Compressor objects support compress() and flush() methods; decompressor\n"
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment