Issue #15955: Add an option to limit the output size in bz2.decompress().

Patch by Nikolaus Rath.

Issue #15955: Add an option to limit the output size in bz2.decompress().
Patch by Nikolaus Rath.
e71258a0 · Antoine Pitrou · 87f50158 · e71258a0 · e71258a0 · e71258a0
Commit e71258a0 authored Feb 26, 2015 by Antoine Pitrou
5 changed files
--- a/Doc/library/bz2.rst
+++ b/Doc/library/bz2.rst
@@ -162,15 +162,32 @@ Incremental (de)compression
      you need to decompress a multi-stream input with :class:`BZ2Decompressor`,
      you must use a new decompressor for each stream.

-   .. method:: decompress(data)
+   .. method:: decompress(data, max_length=-1)

-      Provide data to the decompressor object. Returns a chunk of decompressed
-      data if possible, or an empty byte string otherwise.
+      Decompress *data* (a :term:`bytes-like object`), returning
+      uncompressed data as bytes. Some of *data* may be buffered
+      internally, for use in later calls to :meth:`decompress`. The
+      returned data should be concatenated with the output of any
+      previous calls to :meth:`decompress`.

-      Attempting to decompress data after the end of the current stream is
-      reached raises an :exc:`EOFError`. If any data is found after the end of
-      the stream, it is ignored and saved in the :attr:`unused_data` attribute.
+      If *max_length* is nonnegative, returns at most *max_length*
+      bytes of decompressed data. If this limit is reached and further
+      output can be produced, the :attr:`~.needs_input` attribute will
+      be set to ``False``. In this case, the next call to
+      :meth:`~.decompress` may provide *data* as ``b''`` to obtain
+      more of the output.

+      If all of the input data was decompressed and returned (either
+      because this was less than *max_length* bytes, or because
+      *max_length* was negative), the :attr:`~.needs_input` attribute
+      will be set to ``True``.
+
+      Attempting to decompress data after the end of stream is reached
+      raises an `EOFError`.  Any data found after the end of the
+      stream is ignored and saved in the :attr:`~.unused_data` attribute.
+
+      .. versionchanged:: 3.5
+         Added the *max_length* parameter.

   .. attribute:: eof

@@ -186,6 +203,13 @@ Incremental (de)compression
      If this attribute is accessed before the end of the stream has been
      reached, its value will be ``b''``.

+   .. attribute:: needs_input
+
+      ``False`` if the :meth:`.decompress` method can provide more
+      decompressed data before requiring new uncompressed input.
+
+      .. versionadded:: 3.5
+

 One-shot (de)compression
 ------------------------

--- a/Lib/test/test_bz2.py
+++ b/Lib/test/test_bz2.py
@@ -5,6 +5,7 @@ import unittest
 from io import BytesIO
 import os
 import pickle
+import glob
 import random
 import subprocess
 import sys
@@ -51,6 +52,19 @@ class BaseTest(unittest.TestCase):
    EMPTY_DATA = b'BZh9\x17rE8P\x90\x00\x00\x00\x00'
    BAD_DATA = b'this is not a valid bzip2 file'

+    # Some tests need more than one block of uncompressed data. Since one block
+    # is at least 100 kB, we gather some data dynamically and compress it.
+    # Note that this assumes that compression works correctly, so we cannot
+    # simply use the bigger test data for all tests.
+    test_size = 0
+    BIG_TEXT = bytearray(128*1024)
+    for fname in glob.glob(os.path.join(os.path.dirname(__file__), '*.py')):
+        with open(fname, 'rb') as fh:
+            test_size += fh.readinto(memoryview(BIG_TEXT)[test_size:])
+        if test_size > 128*1024:
+            break
+    BIG_DATA = bz2.compress(BIG_TEXT, compresslevel=1)
+
    def setUp(self):
        self.filename = support.TESTFN

@@ -707,6 +721,95 @@ class BZ2DecompressorTest(BaseTest):
            with self.assertRaises(TypeError):
                pickle.dumps(BZ2Decompressor(), proto)

+    def testDecompressorChunksMaxsize(self):
+        bzd = BZ2Decompressor()
+        max_length = 100
+        out = []
+
+        # Feed some input
+        len_ = len(self.BIG_DATA) - 64
+        out.append(bzd.decompress(self.BIG_DATA[:len_],
+                                  max_length=max_length))
+        self.assertFalse(bzd.needs_input)
+        self.assertEqual(len(out[-1]), max_length)
+
+        # Retrieve more data without providing more input
+        out.append(bzd.decompress(b'', max_length=max_length))
+        self.assertFalse(bzd.needs_input)
+        self.assertEqual(len(out[-1]), max_length)
+
+        # Retrieve more data while providing more input
+        out.append(bzd.decompress(self.BIG_DATA[len_:],
+                                  max_length=max_length))
+        self.assertLessEqual(len(out[-1]), max_length)
+
+        # Retrieve remaining uncompressed data
+        while not bzd.eof:
+            out.append(bzd.decompress(b'', max_length=max_length))
+            self.assertLessEqual(len(out[-1]), max_length)
+
+        out = b"".join(out)
+        self.assertEqual(out, self.BIG_TEXT)
+        self.assertEqual(bzd.unused_data, b"")
+
+    def test_decompressor_inputbuf_1(self):
+        # Test reusing input buffer after moving existing
+        # contents to beginning
+        bzd = BZ2Decompressor()
+        out = []
+
+        # Create input buffer and fill it
+        self.assertEqual(bzd.decompress(self.DATA[:100],
+                                        max_length=0), b'')
+
+        # Retrieve some results, freeing capacity at beginning
+        # of input buffer
+        out.append(bzd.decompress(b'', 2))
+
+        # Add more data that fits into input buffer after
+        # moving existing data to beginning
+        out.append(bzd.decompress(self.DATA[100:105], 15))
+
+        # Decompress rest of data
+        out.append(bzd.decompress(self.DATA[105:]))
+        self.assertEqual(b''.join(out), self.TEXT)
+
+    def test_decompressor_inputbuf_2(self):
+        # Test reusing input buffer by appending data at the
+        # end right away
+        bzd = BZ2Decompressor()
+        out = []
+
+        # Create input buffer and empty it
+        self.assertEqual(bzd.decompress(self.DATA[:200],
+                                        max_length=0), b'')
+        out.append(bzd.decompress(b''))
+
+        # Fill buffer with new data
+        out.append(bzd.decompress(self.DATA[200:280], 2))
+
+        # Append some more data, not enough to require resize
+        out.append(bzd.decompress(self.DATA[280:300], 2))
+
+        # Decompress rest of data
+        out.append(bzd.decompress(self.DATA[300:]))
+        self.assertEqual(b''.join(out), self.TEXT)
+
+    def test_decompressor_inputbuf_3(self):
+        # Test reusing input buffer after extending it
+
+        bzd = BZ2Decompressor()
+        out = []
+
+        # Create almost full input buffer
+        out.append(bzd.decompress(self.DATA[:200], 5))
+
+        # Add even more data to it, requiring resize
+        out.append(bzd.decompress(self.DATA[200:300], 5))
+
+        # Decompress rest of data
+        out.append(bzd.decompress(self.DATA[300:]))
+        self.assertEqual(b''.join(out), self.TEXT)

 class CompressDecompressTest(BaseTest):
    def testCompress(self):

--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -13,6 +13,9 @@ Core and Builtins
 Library
 -------

+- Issue #15955: Add an option to limit the output size in bz2.decompress().
+  Patch by Nikolaus Rath.
+
 - Issue #6639: Module-level turtle functions no longer raise TclError after
  closing the window.


--- a/Modules/_bz2module.c
+++ b/Modules/_bz2module.c
--- a/Modules/clinic/_bz2module.c.h
+++ b/Modules/clinic/_bz2module.c.h
@@ -95,34 +95,43 @@ exit:
 }

 PyDoc_STRVAR(_bz2_BZ2Decompressor_decompress__doc__,
-"decompress($self, data, /)\n"
+"decompress($self, /, data, max_length=-1)\n"
 "--\n"
 "\n"
-"Provide data to the decompressor object.\n"
+"Decompress *data*, returning uncompressed data as bytes.\n"
 "\n"
-"Returns a chunk of decompressed data if possible, or b\'\' otherwise.\n"
+"If *max_length* is nonnegative, returns at most *max_length* bytes of\n"
+"decompressed data. If this limit is reached and further output can be\n"
+"produced, *self.needs_input* will be set to ``False``. In this case, the next\n"
+"call to *decompress()* may provide *data* as b\'\' to obtain more of the output.\n"
 "\n"
-"Attempting to decompress data after the end of stream is reached\n"
-"raises an EOFError.  Any data found after the end of the stream\n"
-"is ignored and saved in the unused_data attribute.");
+"If all of the input data was decompressed and returned (either because this\n"
+"was less than *max_length* bytes, or because *max_length* was negative),\n"
+"*self.needs_input* will be set to True.\n"
+"\n"
+"Attempting to decompress data after the end of stream is reached raises an\n"
+"EOFError.  Any data found after the end of the stream is ignored and saved in\n"
+"the unused_data attribute.");

 #define _BZ2_BZ2DECOMPRESSOR_DECOMPRESS_METHODDEF    \
-    {"decompress", (PyCFunction)_bz2_BZ2Decompressor_decompress, METH_VARARGS, _bz2_BZ2Decompressor_decompress__doc__},
+    {"decompress", (PyCFunction)_bz2_BZ2Decompressor_decompress, METH_VARARGS|METH_KEYWORDS, _bz2_BZ2Decompressor_decompress__doc__},

 static PyObject *
-_bz2_BZ2Decompressor_decompress_impl(BZ2Decompressor *self, Py_buffer *data);
+_bz2_BZ2Decompressor_decompress_impl(BZ2Decompressor *self, Py_buffer *data, Py_ssize_t max_length);

 static PyObject *
-_bz2_BZ2Decompressor_decompress(BZ2Decompressor *self, PyObject *args)
+_bz2_BZ2Decompressor_decompress(BZ2Decompressor *self, PyObject *args, PyObject *kwargs)
 {
    PyObject *return_value = NULL;
+    static char *_keywords[] = {"data", "max_length", NULL};
    Py_buffer data = {NULL, NULL};
+    Py_ssize_t max_length = -1;

-    if (!PyArg_ParseTuple(args,
-        "y*:decompress",
-        &data))
+    if (!PyArg_ParseTupleAndKeywords(args, kwargs,
+        "y*|n:decompress", _keywords,
+        &data, &max_length))
        goto exit;
-    return_value = _bz2_BZ2Decompressor_decompress_impl(self, &data);
+    return_value = _bz2_BZ2Decompressor_decompress_impl(self, &data, max_length);

 exit:
    /* Cleanup for data */
@@ -159,4 +168,4 @@ _bz2_BZ2Decompressor___init__(PyObject *self, PyObject *args, PyObject *kwargs)
 exit:
    return return_value;
 }
-/*[clinic end generated code: output=21ca4405519a0931 input=a9049054013a1b77]*/
+/*[clinic end generated code: output=8e65e3953430bc3d input=a9049054013a1b77]*/