Commit 2dbc6e6b authored by Antoine Pitrou

Issue #23529: Limit the size of decompressed data when reading from
GzipFile, BZ2File or LZMAFile.  This defeats denial of service attacks
using compressed bombs (i.e. compressed payloads which decompress to a huge
size).

Patch by Martin Panter and Nikolaus Rath.
parent 2ce11d29
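To make the attack concrete before the diffs: a tiny compressed payload can expand to megabytes, so a small read() must not inflate the whole stream at once. A hedged sketch (not part of the patch; public bz2 API only, on Python 3.5 with this change applied):

import bz2
import io

# 2 MB of zeros compresses to a payload of well under a kilobyte.
bomb = bz2.compress(bytes(2 * 10**6), compresslevel=9)

# After this change, reading one byte decompresses only a bounded
# amount internally instead of buffering the full 2 MB.
with bz2.BZ2File(io.BytesIO(bomb)) as f:
    assert f.read(1) == b"\x00"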
Doc/library/bz2.rst
@@ -120,6 +120,10 @@ All of the classes in this module may safely be accessed from multiple threads.
    .. versionchanged:: 3.4
       The ``'x'`` (exclusive creation) mode was added.
 
+   .. versionchanged:: 3.5
+      The :meth:`~io.BufferedIOBase.read` method now accepts an argument of
+      ``None``.
+
 Incremental (de)compression
 ---------------------------
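The ``read(None)`` behaviour added in the hunk above can be exercised directly. A minimal sketch (not part of the patch; public bz2 API only):

import bz2
import io

payload = bz2.compress(b"sample data")
with bz2.BZ2File(io.BytesIO(payload)) as f:
    # size=None is now accepted and means "read to EOF",
    # matching the io.BufferedIOBase interface.
    assert f.read(None) == b"sample data"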
Doc/library/gzip.rst
@@ -90,13 +90,9 @@ The module defines the following items:
    is no compression. The default is ``9``.
 
    The *mtime* argument is an optional numeric timestamp to be written to
-   the stream when compressing. All :program:`gzip` compressed streams are
-   required to contain a timestamp. If omitted or ``None``, the current
-   time is used. This module ignores the timestamp when decompressing;
-   however, some programs, such as :program:`gunzip`\ , make use of it.
-   The format of the timestamp is the same as that of the return value of
-   ``time.time()`` and of the ``st_mtime`` attribute of the object returned
-   by ``os.stat()``.
+   the last modification time field in the stream when compressing. It
+   should only be provided in compression mode. If omitted or ``None``, the
+   current time is used. See the :attr:`mtime` attribute for more details.
 
    Calling a :class:`GzipFile` object's :meth:`close` method does not close
    *fileobj*, since you might wish to append more material after the compressed
@@ -108,9 +104,9 @@ The module defines the following items:
    including iteration and the :keyword:`with` statement. Only the
    :meth:`truncate` method isn't implemented.
 
-   :class:`GzipFile` also provides the following method:
+   :class:`GzipFile` also provides the following method and attribute:
 
-   .. method:: peek([n])
+   .. method:: peek(n)
 
       Read *n* uncompressed bytes without advancing the file position.
      At most one single read on the compressed stream is done to satisfy
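A brief illustration of :meth:`peek` (not part of the patch): since at most one raw read is performed, the result may contain more or fewer than *n* bytes.

import gzip
import io

buf = io.BytesIO(gzip.compress(b"abcdef"))
with gzip.GzipFile(fileobj=buf) as f:
    head = f.peek(3)           # does not advance the file position
    assert head[:3] == b"abc"  # may hold more (or fewer) than 3 bytes
    assert f.read(3) == b"abc"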
@@ -124,9 +120,21 @@ The module defines the following items:
 
       .. versionadded:: 3.2
 
+   .. attribute:: mtime
+
+      When decompressing, the value of the last modification time field in
+      the most recently read header may be read from this attribute, as an
+      integer. The initial value before reading any headers is ``None``.
+
+      All :program:`gzip` compressed streams are required to contain this
+      timestamp field. Some programs, such as :program:`gunzip`\ , make use
+      of the timestamp. The format is the same as the return value of
+      :func:`time.time` and the :attr:`~os.stat_result.st_mtime` attribute of
+      the object returned by :func:`os.stat`.
+
    .. versionchanged:: 3.1
       Support for the :keyword:`with` statement was added, along with the
-      *mtime* argument.
+      *mtime* constructor argument and :attr:`mtime` attribute.
 
    .. versionchanged:: 3.2
       Support for zero-padded and unseekable files was added.
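The new :attr:`mtime` attribute documented above can be observed as follows; a small sketch (not part of the patch; the timestamp value is arbitrary):

import gzip
import io

buf = io.BytesIO()
with gzip.GzipFile(fileobj=buf, mode="wb", mtime=123456789) as f:
    f.write(b"data")

with gzip.GzipFile(fileobj=io.BytesIO(buf.getvalue())) as f:
    assert f.mtime is None       # no header has been read yet
    assert f.read() == b"data"
    assert f.mtime == 123456789  # taken from the gzip header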
@@ -140,6 +148,8 @@ The module defines the following items:
    .. versionchanged:: 3.5
       Added support for writing arbitrary
       :term:`bytes-like objects <bytes-like object>`.
+      The :meth:`~io.BufferedIOBase.read` method now accepts an argument of
+      ``None``.
 
 .. function:: compress(data, compresslevel=9)
Doc/library/lzma.rst
@@ -110,6 +110,10 @@ Reading and writing compressed files
    .. versionchanged:: 3.4
       Added support for the ``"x"`` and ``"xb"`` modes.
 
+   .. versionchanged:: 3.5
+      The :meth:`~io.BufferedIOBase.read` method now accepts an argument of
+      ``None``.
+
 Compressing and decompressing data in memory
 --------------------------------------------
"""Internal classes used by the gzip, lzma and bz2 modules"""
import io
BUFFER_SIZE = io.DEFAULT_BUFFER_SIZE # Compressed data read chunk size
class BaseStream(io.BufferedIOBase):
"""Mode-checking helper functions."""
def _check_not_closed(self):
if self.closed:
raise ValueError("I/O operation on closed file")
def _check_can_read(self):
if not self.readable():
raise io.UnsupportedOperation("File not open for reading")
def _check_can_write(self):
if not self.writable():
raise io.UnsupportedOperation("File not open for writing")
def _check_can_seek(self):
if not self.readable():
raise io.UnsupportedOperation("Seeking is only supported "
"on files open for reading")
if not self.seekable():
raise io.UnsupportedOperation("The underlying file object "
"does not support seeking")
class DecompressReader(io.RawIOBase):
"""Adapts the decompressor API to a RawIOBase reader API"""
def readable(self):
return True
def __init__(self, fp, decomp_factory, trailing_error=(), **decomp_args):
self._fp = fp
self._eof = False
self._pos = 0 # Current offset in decompressed stream
# Set to size of decompressed stream once it is known, for SEEK_END
self._size = -1
# Save the decompressor factory and arguments.
# If the file contains multiple compressed streams, each
# stream will need a separate decompressor object. A new decompressor
# object is also needed when implementing a backwards seek().
self._decomp_factory = decomp_factory
self._decomp_args = decomp_args
self._decompressor = self._decomp_factory(**self._decomp_args)
# Exception class to catch from decompressor signifying invalid
# trailing data to ignore
self._trailing_error = trailing_error
def close(self):
self._decompressor = None
return super().close()
def seekable(self):
return self._fp.seekable()
def readinto(self, b):
with memoryview(b) as view, view.cast("B") as byte_view:
data = self.read(len(byte_view))
byte_view[:len(data)] = data
return len(data)
def read(self, size=-1):
if size < 0:
return self.readall()
if not size or self._eof:
return b""
data = None # Default if EOF is encountered
# Depending on the input data, our call to the decompressor may not
# return any data. In this case, try again after reading another block.
while True:
if self._decompressor.eof:
rawblock = (self._decompressor.unused_data or
self._fp.read(BUFFER_SIZE))
if not rawblock:
break
# Continue to next stream.
self._decompressor = self._decomp_factory(
**self._decomp_args)
try:
data = self._decompressor.decompress(rawblock, size)
except self._trailing_error:
# Trailing data isn't a valid compressed stream; ignore it.
break
else:
if self._decompressor.needs_input:
rawblock = self._fp.read(BUFFER_SIZE)
if not rawblock:
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")
else:
rawblock = b""
data = self._decompressor.decompress(rawblock, size)
if data:
break
if not data:
self._eof = True
self._size = self._pos
return b""
self._pos += len(data)
return data
# Rewind the file to the beginning of the data stream.
def _rewind(self):
self._fp.seek(0)
self._eof = False
self._pos = 0
self._decompressor = self._decomp_factory(**self._decomp_args)
def seek(self, offset, whence=io.SEEK_SET):
# Recalculate offset as an absolute file position.
if whence == io.SEEK_SET:
pass
elif whence == io.SEEK_CUR:
offset = self._pos + offset
elif whence == io.SEEK_END:
# Seeking relative to EOF - we need to know the file's size.
if self._size < 0:
while self.read(io.DEFAULT_BUFFER_SIZE):
pass
offset = self._size + offset
else:
raise ValueError("Invalid value for whence: {}".format(whence))
# Make it so that offset is the number of bytes to skip forward.
if offset < self._pos:
self._rewind()
else:
offset -= self._pos
# Read and discard data until we reach the desired position.
while offset > 0:
data = self.read(min(io.DEFAULT_BUFFER_SIZE, offset))
if not data:
break
offset -= len(data)
return self._pos
def tell(self):
"""Return the current file position."""
return self._pos
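For orientation, a sketch of how the compression modules compose this class, modelled on the BZ2File wiring in this commit (the particular payload and constants here are illustrative assumptions): DecompressReader adapts a decompressor object to a raw stream, and io.BufferedReader on top supplies buffering, peek() and seek() while capping how much is decompressed per call.

import bz2
import io
import _compression

compressed = bz2.compress(b"hello world" * 100)
raw = _compression.DecompressReader(
    io.BytesIO(compressed),
    bz2.BZ2Decompressor,       # factory: a fresh instance per stream
    trailing_error=OSError)    # invalid trailing data raises OSError
f = io.BufferedReader(raw)
assert f.read(5) == b"hello"
f.seek(0)   # backwards seeks rewind and re-decompress when needed
assert f.read(11) == b"hello world"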
Lib/test/test_bz2.py
@@ -2,7 +2,7 @@ from test import support
 from test.support import bigmemtest, _4G
 import unittest
-from io import BytesIO
+from io import BytesIO, DEFAULT_BUFFER_SIZE
 import os
 import pickle
 import glob
@@ -10,6 +10,7 @@ import random
 import subprocess
 import sys
 from test.support import unlink
+import _compression
 
 try:
     import threading
@@ -110,7 +111,7 @@ class BZ2FileTest(BaseTest):
     def testRead(self):
         self.createTempFile()
         with BZ2File(self.filename) as bz2f:
-            self.assertRaises(TypeError, bz2f.read, None)
+            self.assertRaises(TypeError, bz2f.read, float())
             self.assertEqual(bz2f.read(), self.TEXT)
 
     def testReadBadFile(self):
@@ -121,21 +122,21 @@ class BZ2FileTest(BaseTest):
     def testReadMultiStream(self):
         self.createTempFile(streams=5)
         with BZ2File(self.filename) as bz2f:
-            self.assertRaises(TypeError, bz2f.read, None)
+            self.assertRaises(TypeError, bz2f.read, float())
             self.assertEqual(bz2f.read(), self.TEXT * 5)
 
     def testReadMonkeyMultiStream(self):
         # Test BZ2File.read() on a multi-stream archive where a stream
         # boundary coincides with the end of the raw read buffer.
-        buffer_size = bz2._BUFFER_SIZE
-        bz2._BUFFER_SIZE = len(self.DATA)
+        buffer_size = _compression.BUFFER_SIZE
+        _compression.BUFFER_SIZE = len(self.DATA)
         try:
             self.createTempFile(streams=5)
             with BZ2File(self.filename) as bz2f:
-                self.assertRaises(TypeError, bz2f.read, None)
+                self.assertRaises(TypeError, bz2f.read, float())
                 self.assertEqual(bz2f.read(), self.TEXT * 5)
         finally:
-            bz2._BUFFER_SIZE = buffer_size
+            _compression.BUFFER_SIZE = buffer_size
 
     def testReadTrailingJunk(self):
         self.createTempFile(suffix=self.BAD_DATA)
@@ -150,7 +151,7 @@ class BZ2FileTest(BaseTest):
     def testRead0(self):
         self.createTempFile()
         with BZ2File(self.filename) as bz2f:
-            self.assertRaises(TypeError, bz2f.read, None)
+            self.assertRaises(TypeError, bz2f.read, float())
             self.assertEqual(bz2f.read(0), b"")
 
     def testReadChunk10(self):
@@ -559,13 +560,24 @@ class BZ2FileTest(BaseTest):
         with BZ2File(str_filename, "rb") as f:
             self.assertEqual(f.read(), self.DATA)
 
+    def testDecompressLimited(self):
+        """Decompressed data buffering should be limited"""
+        bomb = bz2.compress(bytes(int(2e6)), compresslevel=9)
+        self.assertLess(len(bomb), _compression.BUFFER_SIZE)
+
+        decomp = BZ2File(BytesIO(bomb))
+        self.assertEqual(bytes(1), decomp.read(1))
+
+        max_decomp = 1 + DEFAULT_BUFFER_SIZE
+        self.assertLessEqual(decomp._buffer.raw.tell(), max_decomp,
+            "Excessive amount of data was decompressed")
+
     # Tests for a BZ2File wrapping another file object:
 
     def testReadBytesIO(self):
         with BytesIO(self.DATA) as bio:
             with BZ2File(bio) as bz2f:
-                self.assertRaises(TypeError, bz2f.read, None)
+                self.assertRaises(TypeError, bz2f.read, float())
                 self.assertEqual(bz2f.read(), self.TEXT)
             self.assertFalse(bio.closed)
Lib/test/test_gzip.py
@@ -123,7 +123,10 @@ class TestGzip(BaseTest):
         # Write to a file, open it for reading, then close it.
         self.test_write()
         f = gzip.GzipFile(self.filename, 'r')
+        fileobj = f.fileobj
+        self.assertFalse(fileobj.closed)
         f.close()
+        self.assertTrue(fileobj.closed)
         with self.assertRaises(ValueError):
             f.read(1)
         with self.assertRaises(ValueError):
@@ -132,7 +135,10 @@ class TestGzip(BaseTest):
             f.tell()
         # Open the file for writing, then close it.
         f = gzip.GzipFile(self.filename, 'w')
+        fileobj = f.fileobj
+        self.assertFalse(fileobj.closed)
         f.close()
+        self.assertTrue(fileobj.closed)
         with self.assertRaises(ValueError):
             f.write(b'')
         with self.assertRaises(ValueError):
@@ -271,9 +277,10 @@ class TestGzip(BaseTest):
         with gzip.GzipFile(self.filename, 'w', mtime = mtime) as fWrite:
             fWrite.write(data1)
         with gzip.GzipFile(self.filename) as fRead:
+            self.assertTrue(hasattr(fRead, 'mtime'))
+            self.assertIsNone(fRead.mtime)
             dataRead = fRead.read()
             self.assertEqual(dataRead, data1)
-            self.assertTrue(hasattr(fRead, 'mtime'))
             self.assertEqual(fRead.mtime, mtime)
 
     def test_metadata(self):
@@ -416,6 +423,18 @@ class TestGzip(BaseTest):
         with gzip.GzipFile(str_filename, "rb") as f:
             self.assertEqual(f.read(), data1 * 50)
 
+    def test_decompress_limited(self):
+        """Decompressed data buffering should be limited"""
+        bomb = gzip.compress(bytes(int(2e6)), compresslevel=9)
+        self.assertLess(len(bomb), io.DEFAULT_BUFFER_SIZE)
+
+        bomb = io.BytesIO(bomb)
+        decomp = gzip.GzipFile(fileobj=bomb)
+        self.assertEqual(bytes(1), decomp.read(1))
+
+        max_decomp = 1 + io.DEFAULT_BUFFER_SIZE
+        self.assertLessEqual(decomp._buffer.raw.tell(), max_decomp,
+            "Excessive amount of data was decompressed")
+
     # Testing compress/decompress shortcut functions
 
     def test_compress(self):
@@ -463,7 +482,7 @@ class TestGzip(BaseTest):
         with gzip.open(self.filename, "wb") as f:
             f.write(data1)
         with gzip.open(self.filename, "rb") as f:
-            f.fileobj.prepend()
+            f._buffer.raw._fp.prepend()
 
 class TestOpen(BaseTest):
     def test_binary_modes(self):
Lib/test/test_lzma.py
-from io import BytesIO, UnsupportedOperation
+import _compression
+from io import BytesIO, UnsupportedOperation, DEFAULT_BUFFER_SIZE
 import os
 import pickle
 import random
@@ -772,13 +773,13 @@ class FileTestCase(unittest.TestCase):
     def test_read_multistream_buffer_size_aligned(self):
         # Test the case where a stream boundary coincides with the end
         # of the raw read buffer.
-        saved_buffer_size = lzma._BUFFER_SIZE
-        lzma._BUFFER_SIZE = len(COMPRESSED_XZ)
+        saved_buffer_size = _compression.BUFFER_SIZE
+        _compression.BUFFER_SIZE = len(COMPRESSED_XZ)
         try:
             with LZMAFile(BytesIO(COMPRESSED_XZ * 5)) as f:
                 self.assertEqual(f.read(), INPUT * 5)
         finally:
-            lzma._BUFFER_SIZE = saved_buffer_size
+            _compression.BUFFER_SIZE = saved_buffer_size
 
     def test_read_trailing_junk(self):
         with LZMAFile(BytesIO(COMPRESSED_XZ + COMPRESSED_BOGUS)) as f:
@@ -829,7 +830,7 @@ class FileTestCase(unittest.TestCase):
         with LZMAFile(BytesIO(), "w") as f:
             self.assertRaises(ValueError, f.read)
         with LZMAFile(BytesIO(COMPRESSED_XZ)) as f:
-            self.assertRaises(TypeError, f.read, None)
+            self.assertRaises(TypeError, f.read, float())
 
     def test_read_bad_data(self):
         with LZMAFile(BytesIO(COMPRESSED_BOGUS)) as f:
@@ -925,6 +926,17 @@ class FileTestCase(unittest.TestCase):
         with LZMAFile(BytesIO(COMPRESSED_XZ)) as f:
             self.assertListEqual(f.readlines(), lines)
 
+    def test_decompress_limited(self):
+        """Decompressed data buffering should be limited"""
+        bomb = lzma.compress(bytes(int(2e6)), preset=6)
+        self.assertLess(len(bomb), _compression.BUFFER_SIZE)
+
+        decomp = LZMAFile(BytesIO(bomb))
+        self.assertEqual(bytes(1), decomp.read(1))
+
+        max_decomp = 1 + DEFAULT_BUFFER_SIZE
+        self.assertLessEqual(decomp._buffer.raw.tell(), max_decomp,
+            "Excessive amount of data was decompressed")
+
     def test_write(self):
         with BytesIO() as dst:
             with LZMAFile(dst, "w") as f:
@@ -1090,7 +1102,8 @@ class FileTestCase(unittest.TestCase):
             self.assertRaises(ValueError, f.seek, 0)
         with LZMAFile(BytesIO(COMPRESSED_XZ)) as f:
             self.assertRaises(ValueError, f.seek, 0, 3)
-            self.assertRaises(ValueError, f.seek, 9, ())
+            # io.BufferedReader raises TypeError instead of ValueError
+            self.assertRaises((TypeError, ValueError), f.seek, 9, ())
             self.assertRaises(TypeError, f.seek, None)
             self.assertRaises(TypeError, f.seek, b"derp")
Misc/NEWS
@@ -19,6 +19,11 @@ Core and Builtins
 Library
 -------
 
+- Issue #23529: Limit the size of decompressed data when reading from
+  GzipFile, BZ2File or LZMAFile. This defeats denial of service attacks
+  using compressed bombs (i.e. compressed payloads which decompress to a huge
+  size). Patch by Martin Panter and Nikolaus Rath.
+
 - Issue #21859: Added Python implementation of io.FileIO.
 
 - Issue #23865: close() methods in multiple modules now are idempotent and more