Commit 57f9b7a1 authored by Serhiy Storchaka

Issue #1159051: GzipFile now raises EOFError when reading a corrupted file with truncated header or footer.
Added tests for reading truncated gzip, bzip2, and lzma files.
parents a80f761a 7c3922f4
...@@ -65,9 +65,6 @@ def write32u(output, value): ...@@ -65,9 +65,6 @@ def write32u(output, value):
# or unsigned. # or unsigned.
output.write(struct.pack("<L", value)) output.write(struct.pack("<L", value))
def read32(input):
    """Read 4 bytes from *input* and decode them as an unsigned
    little-endian 32-bit integer.

    *input* only needs a ``read(size)`` method returning bytes.
    ``struct.error`` propagates when fewer than 4 bytes come back.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
class _PaddedFile: class _PaddedFile:
"""Minimal read-only file object that prepends a string to the contents """Minimal read-only file object that prepends a string to the contents
of an actual file. Shouldn't be used outside of gzip.py, as it lacks of an actual file. Shouldn't be used outside of gzip.py, as it lacks
...@@ -281,27 +278,31 @@ class GzipFile(io.BufferedIOBase): ...@@ -281,27 +278,31 @@ class GzipFile(io.BufferedIOBase):
self.crc = zlib.crc32(b"") & 0xffffffff self.crc = zlib.crc32(b"") & 0xffffffff
self.size = 0 self.size = 0
def _read_exact(self, n):
data = self.fileobj.read(n)
while len(data) < n:
b = self.fileobj.read(n - len(data))
if not b:
raise EOFError("Compressed file ended before the "
"end-of-stream marker was reached")
data += b
return data
def _read_gzip_header(self): def _read_gzip_header(self):
magic = self.fileobj.read(2) magic = self.fileobj.read(2)
if magic == b'': if magic == b'':
raise EOFError("Reached EOF") return False
if magic != b'\037\213': if magic != b'\037\213':
raise IOError('Not a gzipped file') raise IOError('Not a gzipped file')
method = ord( self.fileobj.read(1) )
method, flag, self.mtime = struct.unpack("<BBIxx", self._read_exact(8))
if method != 8: if method != 8:
raise IOError('Unknown compression method') raise IOError('Unknown compression method')
flag = ord( self.fileobj.read(1) )
self.mtime = read32(self.fileobj)
# extraflag = self.fileobj.read(1)
# os = self.fileobj.read(1)
self.fileobj.read(2)
if flag & FEXTRA: if flag & FEXTRA:
# Read & discard the extra field, if present # Read & discard the extra field, if present
xlen = ord(self.fileobj.read(1)) self._read_exact(struct.unpack("<H", self._read_exact(2)))
xlen = xlen + 256*ord(self.fileobj.read(1))
self.fileobj.read(xlen)
if flag & FNAME: if flag & FNAME:
# Read and discard a null-terminated string containing the filename # Read and discard a null-terminated string containing the filename
while True: while True:
...@@ -315,12 +316,13 @@ class GzipFile(io.BufferedIOBase): ...@@ -315,12 +316,13 @@ class GzipFile(io.BufferedIOBase):
if not s or s==b'\000': if not s or s==b'\000':
break break
if flag & FHCRC: if flag & FHCRC:
self.fileobj.read(2) # Read & discard the 16-bit header CRC self._read_exact(2) # Read & discard the 16-bit header CRC
unused = self.fileobj.unused() unused = self.fileobj.unused()
if unused: if unused:
uncompress = self.decompress.decompress(unused) uncompress = self.decompress.decompress(unused)
self._add_read_data(uncompress) self._add_read_data(uncompress)
return True
def write(self,data): def write(self,data):
self._check_closed() self._check_closed()
...@@ -354,20 +356,16 @@ class GzipFile(io.BufferedIOBase): ...@@ -354,20 +356,16 @@ class GzipFile(io.BufferedIOBase):
readsize = 1024 readsize = 1024
if size < 0: # get the whole thing if size < 0: # get the whole thing
try: while self._read(readsize):
while True: readsize = min(self.max_read_chunk, readsize * 2)
self._read(readsize) size = self.extrasize
readsize = min(self.max_read_chunk, readsize * 2)
except EOFError:
size = self.extrasize
else: # just get some more of it else: # just get some more of it
try: while size > self.extrasize:
while size > self.extrasize: if not self._read(readsize):
self._read(readsize) if size > self.extrasize:
readsize = min(self.max_read_chunk, readsize * 2) size = self.extrasize
except EOFError: break
if size > self.extrasize: readsize = min(self.max_read_chunk, readsize * 2)
size = self.extrasize
offset = self.offset - self.extrastart offset = self.offset - self.extrastart
chunk = self.extrabuf[offset: offset + size] chunk = self.extrabuf[offset: offset + size]
...@@ -385,12 +383,9 @@ class GzipFile(io.BufferedIOBase): ...@@ -385,12 +383,9 @@ class GzipFile(io.BufferedIOBase):
if self.extrasize <= 0 and self.fileobj is None: if self.extrasize <= 0 and self.fileobj is None:
return b'' return b''
try: # For certain input data, a single call to _read() may not return
# For certain input data, a single call to _read() may not return # any data. In this case, retry until we get some data or reach EOF.
# any data. In this case, retry until we get some data or reach EOF. while self.extrasize <= 0 and self._read():
while self.extrasize <= 0:
self._read()
except EOFError:
pass pass
if size < 0 or size > self.extrasize: if size < 0 or size > self.extrasize:
size = self.extrasize size = self.extrasize
...@@ -413,12 +408,9 @@ class GzipFile(io.BufferedIOBase): ...@@ -413,12 +408,9 @@ class GzipFile(io.BufferedIOBase):
if self.extrasize == 0: if self.extrasize == 0:
if self.fileobj is None: if self.fileobj is None:
return b'' return b''
try: # Ensure that we don't return b"" if we haven't reached EOF.
# Ensure that we don't return b"" if we haven't reached EOF. # 1024 is the same buffering heuristic used in read()
while self.extrasize == 0: while self.extrasize == 0 and self._read(max(n, 1024)):
# 1024 is the same buffering heuristic used in read()
self._read(max(n, 1024))
except EOFError:
pass pass
offset = self.offset - self.extrastart offset = self.offset - self.extrastart
remaining = self.extrasize remaining = self.extrasize
...@@ -431,13 +423,14 @@ class GzipFile(io.BufferedIOBase): ...@@ -431,13 +423,14 @@ class GzipFile(io.BufferedIOBase):
def _read(self, size=1024): def _read(self, size=1024):
if self.fileobj is None: if self.fileobj is None:
raise EOFError("Reached EOF") return False
if self._new_member: if self._new_member:
# If the _new_member flag is set, we have to # If the _new_member flag is set, we have to
# jump to the next member, if there is one. # jump to the next member, if there is one.
self._init_read() self._init_read()
self._read_gzip_header() if not self._read_gzip_header():
return False
self.decompress = zlib.decompressobj(-zlib.MAX_WBITS) self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
self._new_member = False self._new_member = False
...@@ -454,7 +447,7 @@ class GzipFile(io.BufferedIOBase): ...@@ -454,7 +447,7 @@ class GzipFile(io.BufferedIOBase):
self.fileobj.prepend(self.decompress.unused_data, True) self.fileobj.prepend(self.decompress.unused_data, True)
self._read_eof() self._read_eof()
self._add_read_data( uncompress ) self._add_read_data( uncompress )
raise EOFError('Reached EOF') return False
uncompress = self.decompress.decompress(buf) uncompress = self.decompress.decompress(buf)
self._add_read_data( uncompress ) self._add_read_data( uncompress )
...@@ -470,6 +463,7 @@ class GzipFile(io.BufferedIOBase): ...@@ -470,6 +463,7 @@ class GzipFile(io.BufferedIOBase):
# a new member on the next call # a new member on the next call
self._read_eof() self._read_eof()
self._new_member = True self._new_member = True
return True
def _add_read_data(self, data): def _add_read_data(self, data):
self.crc = zlib.crc32(data, self.crc) & 0xffffffff self.crc = zlib.crc32(data, self.crc) & 0xffffffff
...@@ -484,8 +478,7 @@ class GzipFile(io.BufferedIOBase): ...@@ -484,8 +478,7 @@ class GzipFile(io.BufferedIOBase):
# We check the that the computed CRC and size of the # We check the that the computed CRC and size of the
# uncompressed data matches the stored values. Note that the size # uncompressed data matches the stored values. Note that the size
# stored is the true file size mod 2**32. # stored is the true file size mod 2**32.
crc32 = read32(self.fileobj) crc32, isize = struct.unpack("<II", self._read_exact(8))
isize = read32(self.fileobj) # may exceed 2GB
if crc32 != self.crc: if crc32 != self.crc:
raise IOError("CRC check failed %s != %s" % (hex(crc32), raise IOError("CRC check failed %s != %s" % (hex(crc32),
hex(self.crc))) hex(self.crc)))
......
...@@ -577,6 +577,20 @@ class BZ2FileTest(BaseTest): ...@@ -577,6 +577,20 @@ class BZ2FileTest(BaseTest):
bz2f.seek(-150, 1) bz2f.seek(-150, 1)
self.assertEqual(bz2f.read(), self.TEXT[500-150:]) self.assertEqual(bz2f.read(), self.TEXT[500-150:])
def test_read_truncated(self):
    # Cut off the trailing eos_magic field (6 bytes) and CRC (4 bytes).
    clipped = self.DATA[:-10]
    # An unbounded read runs into the missing trailer.
    with BZ2File(BytesIO(clipped)) as bz2f:
        with self.assertRaises(EOFError):
            bz2f.read()
    # The whole text can still be read; only the read after it fails.
    with BZ2File(BytesIO(clipped)) as bz2f:
        self.assertEqual(bz2f.read(len(self.TEXT)), self.TEXT)
        with self.assertRaises(EOFError):
            bz2f.read(1)
    # Incomplete 4-byte file header, and block header of at least 146 bits.
    for cut in range(22):
        with BZ2File(BytesIO(clipped[:cut])) as bz2f:
            with self.assertRaises(EOFError):
                bz2f.read(1)
class BZ2CompressorTest(BaseTest): class BZ2CompressorTest(BaseTest):
def testCompress(self): def testCompress(self):
bz2c = BZ2Compressor() bz2c = BZ2Compressor()
......
...@@ -389,6 +389,20 @@ class TestGzip(BaseTest): ...@@ -389,6 +389,20 @@ class TestGzip(BaseTest):
datac = gzip.compress(data) datac = gzip.compress(data)
self.assertEqual(gzip.decompress(datac), data) self.assertEqual(gzip.decompress(datac), data)
def test_read_truncated(self):
    payload = data1*50
    # Cut off the trailer: CRC (4 bytes) and file size (4 bytes).
    clipped = gzip.compress(payload)[:-8]
    # An unbounded read runs into the missing trailer.
    with gzip.GzipFile(fileobj=io.BytesIO(clipped)) as gz:
        with self.assertRaises(EOFError):
            gz.read()
    # The whole payload can still be read; only the read after it fails.
    with gzip.GzipFile(fileobj=io.BytesIO(clipped)) as gz:
        self.assertEqual(gz.read(len(payload)), payload)
        with self.assertRaises(EOFError):
            gz.read(1)
    # Incomplete 10-byte header.
    for cut in range(2, 10):
        with gzip.GzipFile(fileobj=io.BytesIO(clipped[:cut])) as gz:
            with self.assertRaises(EOFError):
                gz.read(1)
class TestOpen(BaseTest): class TestOpen(BaseTest):
def test_binary_modes(self): def test_binary_modes(self):
......
...@@ -669,6 +669,20 @@ class FileTestCase(unittest.TestCase): ...@@ -669,6 +669,20 @@ class FileTestCase(unittest.TestCase):
with LZMAFile(BytesIO(COMPRESSED_XZ[:128])) as f: with LZMAFile(BytesIO(COMPRESSED_XZ[:128])) as f:
self.assertRaises(EOFError, f.read) self.assertRaises(EOFError, f.read)
def test_read_truncated(self):
    # Cut off the stream footer: CRC (4 bytes), index size (4 bytes),
    # flags (2 bytes) and magic number (2 bytes).
    clipped = COMPRESSED_XZ[:-12]
    # An unbounded read runs into the missing footer.
    with LZMAFile(BytesIO(clipped)) as xz:
        with self.assertRaises(EOFError):
            xz.read()
    # The whole input can still be read; only the read after it fails.
    with LZMAFile(BytesIO(clipped)) as xz:
        self.assertEqual(xz.read(len(INPUT)), INPUT)
        with self.assertRaises(EOFError):
            xz.read(1)
    # Incomplete 12-byte header.
    for cut in range(12):
        with LZMAFile(BytesIO(clipped[:cut])) as xz:
            with self.assertRaises(EOFError):
                xz.read(1)
def test_read_bad_args(self): def test_read_bad_args(self):
f = LZMAFile(BytesIO(COMPRESSED_XZ)) f = LZMAFile(BytesIO(COMPRESSED_XZ))
f.close() f.close()
......
...@@ -150,6 +150,9 @@ Core and Builtins ...@@ -150,6 +150,9 @@ Core and Builtins
Library Library
------- -------
- Issue #1159051: GzipFile now raises EOFError when reading a corrupted file
with truncated header or footer.
- Issue #16993: shutil.which() now preserves the case of the path and extension - Issue #16993: shutil.which() now preserves the case of the path and extension
on Windows. on Windows.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment