Issue #14684: Add support for predefined compression dictionaries to the zlib module.

Original patch by Sam Rushing.

Issue #14684: Add support for predefined compression dictionaries to the zlib module.
Original patch by Sam Rushing.
df1efbbb · Nadeem Vawda · 088bfe89 · df1efbbb · df1efbbb · df1efbbb
Commit df1efbbb authored Jun 21, 2012 by Nadeem Vawda
Showing with 153 additions and 29 deletions

Doc/library/zlib.rst Doc/library/zlib.rst +24 -7

Lib/test/test_zlib.py Lib/test/test_zlib.py +30 -0

Misc/NEWS Misc/NEWS +3 -0

Modules/zlibmodule.c Modules/zlibmodule.c +96 -22

No files found.
--- a/Doc/library/zlib.rst
+++ b/Doc/library/zlib.rst
@@ -58,12 +58,19 @@ The available exception and functions in this module are:
   exception if any error occurs.


-.. function:: compressobj([level])
+.. function:: compressobj([level[, method[, wbits[, memlevel[, strategy[, zdict]]]]]])

   Returns a compression object, to be used for compressing data streams that won't
-   fit into memory at once.  *level* is an integer from ``1`` to ``9`` controlling
-   the level of compression; ``1`` is fastest and produces the least compression,
-   ``9`` is slowest and produces the most.  The default value is ``6``.
+   fit into memory at once.
+
+   *level* is an integer from ``1`` to ``9`` controlling the level of
+   compression; ``1`` is fastest and produces the least compression, ``9`` is
+   slowest and produces the most.  The default value is ``6``.
+
+   *zdict* is a predefined compression dictionary. This is a sequence of bytes
+   (such as a :class:`bytes` object) containing subsequences that are expected
+   to occur frequently in the data that is to be compressed. Those subsequences
+   that are expected to be most common should come at the end of the dictionary.


 .. function:: crc32(data[, value])
@@ -114,11 +121,21 @@ The available exception and functions in this module are:
   to :c:func:`malloc`.  The default size is 16384.


-.. function:: decompressobj([wbits])
+.. function:: decompressobj([wbits[, zdict]])

   Returns a decompression object, to be used for decompressing data streams that
-   won't fit into memory at once.  The *wbits* parameter controls the size of the
-   window buffer.
+   won't fit into memory at once.
+
+   The *wbits* parameter controls the size of the window buffer.
+
+   The *zdict* parameter specifies a predefined compression dictionary. If
+   provided, this must be the same dictionary as was used by the compressor that
+   produced the data that is to be decompressed.
+
+.. note::
+   If *zdict* is a mutable object (such as a :class:`bytearray`), you must not
+   modify its contents between the call to :func:`decompressobj` and the first
+   call to the decompressor's ``decompress()`` method.


 Compression objects support the following methods:

--- a/Lib/test/test_zlib.py
+++ b/Lib/test/test_zlib.py
@@ -425,6 +425,36 @@ class CompressObjectTestCase(BaseCompressTestCase, unittest.TestCase):
        dco = zlib.decompressobj()
        self.assertEqual(dco.flush(), b"") # Returns nothing

+    def test_dictionary(self):
+        h = HAMLET_SCENE
+        # build a simulated dictionary out of the words in HAMLET
+        words = h.split()
+        random.shuffle(words)
+        zdict = b''.join(words)
+        # use it to compress HAMLET
+        co = zlib.compressobj(zdict=zdict)
+        cd = co.compress(h) + co.flush()
+        # verify that it will decompress with the dictionary
+        dco = zlib.decompressobj(zdict=zdict)
+        self.assertEqual(dco.decompress(cd) + dco.flush(), h)
+        # verify that it fails when not given the dictionary
+        dco = zlib.decompressobj()
+        self.assertRaises(zlib.error, dco.decompress, cd)
+
+    def test_dictionary_streaming(self):
+        # this is simulating the needs of SPDY to be able to reuse the same
+        #  stream object (with its compression state) between sets of compressed
+        #  headers.
+        co = zlib.compressobj(zdict=HAMLET_SCENE)
+        do = zlib.decompressobj(zdict=HAMLET_SCENE)
+        piece = HAMLET_SCENE[1000:1500]
+        d0 = co.compress(piece) + co.flush(zlib.Z_SYNC_FLUSH)
+        d1 = co.compress(piece[100:]) + co.flush(zlib.Z_SYNC_FLUSH)
+        d2 = co.compress(piece[:-100]) + co.flush(zlib.Z_SYNC_FLUSH)
+        self.assertEqual(do.decompress(d0), piece)
+        self.assertEqual(do.decompress(d1), piece[100:])
+        self.assertEqual(do.decompress(d2), piece[:-100])
+
    def test_decompress_incomplete_stream(self):
        # This is 'foo', deflated
        x = b'x\x9cK\xcb\xcf\x07\x00\x02\x82\x01E'

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -34,6 +34,9 @@ Core and Builtins
 Library
 -------

+- Issue #14684: zlib.compressobj() and zlib.decompressobj() now support the use
+  of predefined compression dictionaries. Original patch by Sam Rushing.
+
 - Fix GzipFile's handling of filenames given as bytes objects.

 - Issue #14772: Return destination values from some shutil functions.

--- a/Modules/zlibmodule.c
+++ b/Modules/zlibmodule.c
@@ -45,6 +45,7 @@ typedef struct
    PyObject *unconsumed_tail;
    char eof;
    int is_initialised;
+    PyObject *zdict;
    #ifdef WITH_THREAD
        PyThread_type_lock lock;
    #endif
@@ -80,14 +81,21 @@ zlib_error(z_stream zst, int err, char *msg)
 }

 PyDoc_STRVAR(compressobj__doc__,
-"compressobj([level]) -- Return a compressor object.\n"
+"compressobj([level[, method[, wbits[, memlevel[, strategy[, zdict]]]]]])\n"
+" -- Return a compressor object.\n"
 "\n"
-"Optional arg level is the compression level, in 1-9.");
+"Optional arg level is the compression level, in 1-9.\n"
+"\n"
+"Optional arg zdict is the predefined compression dictionary - a sequence of\n"
+"bytes containing subsequences that are likely to occur in the input data.");

 PyDoc_STRVAR(decompressobj__doc__,
-"decompressobj([wbits]) -- Return a decompressor object.\n"
+"decompressobj([wbits[, zdict]]) -- Return a decompressor object.\n"
+"\n"
+"Optional arg wbits is the window buffer size.\n"
 "\n"
-"Optional arg wbits is the window buffer size.");
+"Optional arg zdict is the predefined compression dictionary. This must be\n"
+"the same dictionary as used by the compressor that produced the input data.");

 static compobject *
 newcompobject(PyTypeObject *type)
@@ -98,6 +106,7 @@ newcompobject(PyTypeObject *type)
        return NULL;
    self->eof = 0;
    self->is_initialised = 0;
+    self->zdict = NULL;
    self->unused_data = PyBytes_FromStringAndSize("", 0);
    if (self->unused_data == NULL) {
        Py_DECREF(self);
@@ -316,19 +325,24 @@ PyZlib_decompress(PyObject *self, PyObject *args)
 }

 static PyObject *
-PyZlib_compressobj(PyObject *selfptr, PyObject *args)
+PyZlib_compressobj(PyObject *selfptr, PyObject *args, PyObject *kwargs)
 {
    compobject *self;
    int level=Z_DEFAULT_COMPRESSION, method=DEFLATED;
    int wbits=MAX_WBITS, memLevel=DEF_MEM_LEVEL, strategy=0, err;
-
-    if (!PyArg_ParseTuple(args, "|iiiii:compressobj", &level, &method, &wbits,
-                          &memLevel, &strategy))
+    Py_buffer zdict;
+    static char *kwlist[] = {"level", "method", "wbits",
+                             "memLevel", "strategy", "zdict", NULL};
+
+    zdict.buf = NULL; /* Sentinel, so we can tell whether zdict was supplied. */
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iiiiiy*:compressobj",
+                                     kwlist, &level, &method, &wbits,
+                                     &memLevel, &strategy, &zdict))
        return NULL;

    self = newcompobject(&Comptype);
    if (self==NULL)
-        return(NULL);
+        goto error;
    self->zst.zalloc = (alloc_func)NULL;
    self->zst.zfree = (free_func)Z_NULL;
    self->zst.next_in = NULL;
@@ -337,30 +351,58 @@ PyZlib_compressobj(PyObject *selfptr, PyObject *args)
    switch(err) {
    case (Z_OK):
        self->is_initialised = 1;
-        return (PyObject*)self;
+        if (zdict.buf == NULL) {
+            goto success;
+        } else {
+            err = deflateSetDictionary(&self->zst, zdict.buf, zdict.len);
+            switch (err) {
+            case (Z_OK):
+                goto success;
+            case (Z_STREAM_ERROR):
+                PyErr_SetString(PyExc_ValueError, "Invalid dictionary");
+                goto error;
+            default:
+                PyErr_SetString(PyExc_ValueError, "deflateSetDictionary()");
+                goto error;
+            }
+       }
    case (Z_MEM_ERROR):
-        Py_DECREF(self);
        PyErr_SetString(PyExc_MemoryError,
                        "Can't allocate memory for compression object");
-        return NULL;
+        goto error;
    case(Z_STREAM_ERROR):
-        Py_DECREF(self);
        PyErr_SetString(PyExc_ValueError, "Invalid initialization option");
-        return NULL;
+        goto error;
    default:
        zlib_error(self->zst, err, "while creating compression object");
-        Py_DECREF(self);
-        return NULL;
+        goto error;
    }
+
+ error:
+    Py_XDECREF(self);
+    self = NULL;
+ success:
+    if (zdict.buf != NULL)
+        PyBuffer_Release(&zdict);
+    return (PyObject*)self;
 }

 static PyObject *
-PyZlib_decompressobj(PyObject *selfptr, PyObject *args)
+PyZlib_decompressobj(PyObject *selfptr, PyObject *args, PyObject *kwargs)
 {
+    static char *kwlist[] = {"wbits", "zdict", NULL};
    int wbits=DEF_WBITS, err;
    compobject *self;
-    if (!PyArg_ParseTuple(args, "|i:decompressobj", &wbits))
+    PyObject *zdict=NULL;
+
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|iO:decompressobj",
+                                     kwlist, &wbits, &zdict))
+        return NULL;
+    if (zdict != NULL && !PyObject_CheckBuffer(zdict)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "zdict argument must support the buffer protocol");
        return NULL;
+    }

    self = newcompobject(&Decomptype);
    if (self == NULL)
@@ -369,6 +411,10 @@ PyZlib_decompressobj(PyObject *selfptr, PyObject *args)
    self->zst.zfree = (free_func)Z_NULL;
    self->zst.next_in = NULL;
    self->zst.avail_in = 0;
+    if (zdict != NULL) {
+        Py_INCREF(zdict);
+        self->zdict = zdict;
+    }
    err = inflateInit2(&self->zst, wbits);
    switch(err) {
    case (Z_OK):
@@ -398,6 +444,7 @@ Dealloc(compobject *self)
 #endif
    Py_XDECREF(self->unused_data);
    Py_XDECREF(self->unconsumed_tail);
+    Py_XDECREF(self->zdict);
    PyObject_Del(self);
 }

@@ -557,6 +604,27 @@ PyZlib_objdecompress(compobject *self, PyObject *args)
    err = inflate(&(self->zst), Z_SYNC_FLUSH);
    Py_END_ALLOW_THREADS

+    if (err == Z_NEED_DICT && self->zdict != NULL) {
+        Py_buffer zdict_buf;
+        if (PyObject_GetBuffer(self->zdict, &zdict_buf, PyBUF_SIMPLE) == -1) {
+            Py_DECREF(RetVal);
+            RetVal = NULL;
+            goto error;
+        }
+        err = inflateSetDictionary(&(self->zst), zdict_buf.buf, zdict_buf.len);
+        PyBuffer_Release(&zdict_buf);
+        if (err != Z_OK) {
+            zlib_error(self->zst, err, "while decompressing data");
+            Py_DECREF(RetVal);
+            RetVal = NULL;
+            goto error;
+        }
+        /* repeat the call to inflate! */
+        Py_BEGIN_ALLOW_THREADS
+        err = inflate(&(self->zst), Z_SYNC_FLUSH);
+        Py_END_ALLOW_THREADS
+    }
+
    /* While Z_OK and the output buffer is full, there might be more output.
       So extend the output buffer and try again.
    */
@@ -770,10 +838,13 @@ PyZlib_copy(compobject *self)
    }
    Py_INCREF(self->unused_data);
    Py_INCREF(self->unconsumed_tail);
+    Py_XINCREF(self->zdict);
    Py_XDECREF(retval->unused_data);
    Py_XDECREF(retval->unconsumed_tail);
+    Py_XDECREF(retval->zdict);
    retval->unused_data = self->unused_data;
    retval->unconsumed_tail = self->unconsumed_tail;
+    retval->zdict = self->zdict;
    retval->eof = self->eof;

    /* Mark it as being initialized */
@@ -822,10 +893,13 @@ PyZlib_uncopy(compobject *self)

    Py_INCREF(self->unused_data);
    Py_INCREF(self->unconsumed_tail);
+    Py_XINCREF(self->zdict);
    Py_XDECREF(retval->unused_data);
    Py_XDECREF(retval->unconsumed_tail);
+    Py_XDECREF(retval->zdict);
    retval->unused_data = self->unused_data;
    retval->unconsumed_tail = self->unconsumed_tail;
+    retval->zdict = self->zdict;
    retval->eof = self->eof;

    /* Mark it as being initialized */
@@ -1032,13 +1106,13 @@ static PyMethodDef zlib_methods[] =
                adler32__doc__},
    {"compress", (PyCFunction)PyZlib_compress,  METH_VARARGS,
                 compress__doc__},
-    {"compressobj", (PyCFunction)PyZlib_compressobj, METH_VARARGS,
+    {"compressobj", (PyCFunction)PyZlib_compressobj, METH_VARARGS|METH_KEYWORDS,
                    compressobj__doc__},
    {"crc32", (PyCFunction)PyZlib_crc32, METH_VARARGS,
              crc32__doc__},
    {"decompress", (PyCFunction)PyZlib_decompress, METH_VARARGS,
                   decompress__doc__},
-    {"decompressobj", (PyCFunction)PyZlib_decompressobj, METH_VARARGS,
+    {"decompressobj", (PyCFunction)PyZlib_decompressobj, METH_VARARGS|METH_KEYWORDS,
                   decompressobj__doc__},
    {NULL, NULL}
 };
@@ -1112,10 +1186,10 @@ PyDoc_STRVAR(zlib_module_documentation,
 "\n"
 "adler32(string[, start]) -- Compute an Adler-32 checksum.\n"
 "compress(string[, level]) -- Compress string, with compression level in 1-9.\n"
-"compressobj([level]) -- Return a compressor object.\n"
+"compressobj([level[, ...]]) -- Return a compressor object.\n"
 "crc32(string[, start]) -- Compute a CRC-32 checksum.\n"
 "decompress(string,[wbits],[bufsize]) -- Decompresses a compressed string.\n"
-"decompressobj([wbits]) -- Return a decompressor object.\n"
+"decompressobj([wbits[, zdict]]]) -- Return a decompressor object.\n"
 "\n"
 "'wbits' is window buffer size.\n"
 "Compressor objects support compress() and flush() methods; decompressor\n"