Commit 94c33ebf authored by Antoine Pitrou

Issue #7610: Reworked implementation of the internal
:class:`zipfile.ZipExtFile` class used to represent files stored inside
an archive.  The new implementation is significantly faster and can
be wrapped in an :class:`io.BufferedReader` object for further speedups.
It also fixes an issue where interleaved calls to `read()` and
`readline()` returned wrong results.  Patch by Nir Aides.
parent 7b4e02c8
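
For context, a minimal usage sketch of the two behaviours described above (the archive and member names are placeholders, not part of the patch): wrapping the file-like object returned by `ZipFile.open()` in an `io.BufferedReader`, and interleaving `read()` with `readline()` on the same member.

    import io
    import zipfile

    zf = zipfile.ZipFile('archive.zip', 'r')              # hypothetical archive name

    # The reworked ZipExtFile subclasses io.BufferedIOBase, so it can be
    # wrapped in io.BufferedReader for extra buffering on top of its own
    # 4k reads.
    buffered = io.BufferedReader(zf.open('member.txt'))   # hypothetical member name
    header = buffered.read(100)

    # Interleaving readline() and read() on the same member now returns
    # consistent data (the bug fixed by issue #7610).
    member = zf.open('member.txt')
    first_line = member.readline()
    next_chunk = member.read(100)

    zf.close()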
@@ -172,6 +172,45 @@ class TestsWithSourceFile(unittest.TestCase):
         for f in (TESTFN2, TemporaryFile(), StringIO()):
             self.zip_random_open_test(f, zipfile.ZIP_STORED)
 
+    def test_univeral_readaheads(self):
+        f = StringIO()
+
+        data = 'a\r\n' * 16 * 1024
+        zipfp = zipfile.ZipFile(f, 'w', zipfile.ZIP_STORED)
+        zipfp.writestr(TESTFN, data)
+        zipfp.close()
+
+        data2 = ''
+        zipfp = zipfile.ZipFile(f, 'r')
+        zipopen = zipfp.open(TESTFN, 'rU')
+        for line in zipopen:
+            data2 += line
+        zipfp.close()
+
+        self.assertEqual(data, data2.replace('\n', '\r\n'))
+
+    def zip_readline_read_test(self, f, compression):
+        self.make_test_archive(f, compression)
+
+        # Read the ZIP archive
+        zipfp = zipfile.ZipFile(f, "r")
+        zipopen = zipfp.open(TESTFN)
+
+        data = ''
+        while True:
+            read = zipopen.readline()
+            if not read:
+                break
+            data += read
+
+            read = zipopen.read(100)
+            if not read:
+                break
+            data += read
+
+        self.assertEqual(data, self.data)
+        zipfp.close()
+
     def zip_readline_test(self, f, compression):
         self.make_test_archive(f, compression)
 
@@ -199,6 +238,11 @@ class TestsWithSourceFile(unittest.TestCase):
         for line, zipline in zip(self.line_gen, zipfp.open(TESTFN)):
             self.assertEqual(zipline, line + '\n')
 
+    def test_readline_read_stored(self):
+        # Issue #7610: calls to readline() interleaved with calls to read().
+        for f in (TESTFN2, TemporaryFile(), StringIO()):
+            self.zip_readline_read_test(f, zipfile.ZIP_STORED)
+
     def test_readline_stored(self):
         for f in (TESTFN2, TemporaryFile(), StringIO()):
             self.zip_readline_test(f, zipfile.ZIP_STORED)
@@ -226,6 +270,12 @@ class TestsWithSourceFile(unittest.TestCase):
         for f in (TESTFN2, TemporaryFile(), StringIO()):
             self.zip_random_open_test(f, zipfile.ZIP_DEFLATED)
 
+    @skipUnless(zlib, "requires zlib")
+    def test_readline_read_deflated(self):
+        # Issue #7610: calls to readline() interleaved with calls to read().
+        for f in (TESTFN2, TemporaryFile(), StringIO()):
+            self.zip_readline_read_test(f, zipfile.ZIP_DEFLATED)
+
     @skipUnless(zlib, "requires zlib")
     def test_readline_deflated(self):
         for f in (TESTFN2, TemporaryFile(), StringIO()):
@@ -1058,6 +1108,29 @@ class UniversalNewlineTests(unittest.TestCase):
             zipdata = zipfp.open(fn, "rU").read()
             self.assertEqual(self.arcdata[sep], zipdata)
 
+    def readline_read_test(self, f, compression):
+        self.make_test_archive(f, compression)
+
+        # Read the ZIP archive
+        zipfp = zipfile.ZipFile(f, "r")
+        for sep, fn in self.arcfiles.items():
+            zipopen = zipfp.open(fn, "rU")
+
+            data = ''
+            while True:
+                read = zipopen.readline()
+                if not read:
+                    break
+                data += read
+
+                read = zipopen.read(5)
+                if not read:
+                    break
+                data += read
+
+            self.assertEqual(data, self.arcdata['\n'])
+
+        zipfp.close()
+
     def readline_test(self, f, compression):
         self.make_test_archive(f, compression)
 
@@ -1092,6 +1165,11 @@ class UniversalNewlineTests(unittest.TestCase):
         for f in (TESTFN2, TemporaryFile(), StringIO()):
             self.read_test(f, zipfile.ZIP_STORED)
 
+    def test_readline_read_stored(self):
+        # Issue #7610: calls to readline() interleaved with calls to read().
+        for f in (TESTFN2, TemporaryFile(), StringIO()):
+            self.readline_read_test(f, zipfile.ZIP_STORED)
+
     def test_readline_stored(self):
         for f in (TESTFN2, TemporaryFile(), StringIO()):
             self.readline_test(f, zipfile.ZIP_STORED)
@@ -1109,6 +1187,12 @@ class UniversalNewlineTests(unittest.TestCase):
         for f in (TESTFN2, TemporaryFile(), StringIO()):
             self.read_test(f, zipfile.ZIP_DEFLATED)
 
+    @skipUnless(zlib, "requires zlib")
+    def test_readline_read_deflated(self):
+        # Issue #7610: calls to readline() interleaved with calls to read().
+        for f in (TESTFN2, TemporaryFile(), StringIO()):
+            self.readline_read_test(f, zipfile.ZIP_DEFLATED)
+
     @skipUnless(zlib, "requires zlib")
     def test_readline_deflated(self):
         for f in (TESTFN2, TemporaryFile(), StringIO()):
......
@@ -3,6 +3,8 @@ Read and write ZIP files.
 """
 import struct, os, time, sys, shutil
 import binascii, cStringIO, stat
+import io
+import re
 
 try:
     import zlib # We may need its compression method
@@ -451,198 +453,170 @@ class _ZipDecrypter:
         self._UpdateKeys(c)
         return c
 
 
-class ZipExtFile:
+class ZipExtFile(io.BufferedIOBase):
     """File-like object for reading an archive member.
        Is returned by ZipFile.open().
     """
-    def __init__(self, fileobj, zipinfo, decrypt=None):
-        self.fileobj = fileobj
-        self.decrypter = decrypt
-        self.bytes_read = 0L
-        self.rawbuffer = ''
-        self.readbuffer = ''
-        self.linebuffer = ''
-        self.eof = False
-        self.univ_newlines = False
-        self.nlSeps = ("\n", )
-        self.lastdiscard = ''
-
-        self.compress_type = zipinfo.compress_type
-        self.compress_size = zipinfo.compress_size
-
-        self.closed = False
-        self.mode = "r"
-        self.name = zipinfo.filename
-
-        # read from compressed files in 64k blocks
-        self.compreadsize = 64*1024
-        if self.compress_type == ZIP_DEFLATED:
-            self.dc = zlib.decompressobj(-15)
-
-    def set_univ_newlines(self, univ_newlines):
-        self.univ_newlines = univ_newlines
-
-        # pick line separator char(s) based on universal newlines flag
-        self.nlSeps = ("\n", )
-        if self.univ_newlines:
-            self.nlSeps = ("\r\n", "\r", "\n")
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-        nextline = self.readline()
-        if not nextline:
-            raise StopIteration()
-
-        return nextline
-
-    def close(self):
-        self.closed = True
-
-    def _checkfornewline(self):
-        nl, nllen = -1, -1
-        if self.linebuffer:
-            # ugly check for cases where half of an \r\n pair was
-            # read on the last pass, and the \r was discarded.  In this
-            # case we just throw away the \n at the start of the buffer.
-            if (self.lastdiscard, self.linebuffer[0]) == ('\r','\n'):
-                self.linebuffer = self.linebuffer[1:]
-
-            for sep in self.nlSeps:
-                nl = self.linebuffer.find(sep)
-                if nl >= 0:
-                    nllen = len(sep)
-                    return nl, nllen
-
-        return nl, nllen
-
-    def readline(self, size = -1):
-        """Read a line with approx. size. If size is negative,
-           read a whole line.
-        """
-        if size < 0:
-            size = sys.maxint
-        elif size == 0:
-            return ''
-
-        # check for a newline already in buffer
-        nl, nllen = self._checkfornewline()
-
-        if nl >= 0:
-            # the next line was already in the buffer
-            nl = min(nl, size)
-        else:
-            # no line break in buffer - try to read more
-            size -= len(self.linebuffer)
-            while nl < 0 and size > 0:
-                buf = self.read(min(size, 100))
-                if not buf:
-                    break
-                self.linebuffer += buf
-                size -= len(buf)
-
-                # check for a newline in buffer
-                nl, nllen = self._checkfornewline()
-
-            # we either ran out of bytes in the file, or
-            # met the specified size limit without finding a newline,
-            # so return current buffer
-            if nl < 0:
-                s = self.linebuffer
-                self.linebuffer = ''
-                return s
-
-        buf = self.linebuffer[:nl]
-        self.lastdiscard = self.linebuffer[nl:nl + nllen]
-        self.linebuffer = self.linebuffer[nl + nllen:]
-
-        # line is always returned with \n as newline char (except possibly
-        # for a final incomplete line in the file, which is handled above).
-        return buf + "\n"
-
-    def readlines(self, sizehint = -1):
-        """Return a list with all (following) lines. The sizehint parameter
-        is ignored in this implementation.
-        """
-        result = []
-        while True:
-            line = self.readline()
-            if not line: break
-            result.append(line)
-        return result
-
-    def read(self, size = None):
-        # act like file() obj and return empty string if size is 0
-        if size == 0:
-            return ''
-
-        # determine read size
-        bytesToRead = self.compress_size - self.bytes_read
-
-        # adjust read size for encrypted files since the first 12 bytes
-        # are for the encryption/password information
-        if self.decrypter is not None:
-            bytesToRead -= 12
-
-        if size is not None and size >= 0:
-            if self.compress_type == ZIP_STORED:
-                lr = len(self.readbuffer)
-                bytesToRead = min(bytesToRead, size - lr)
-            elif self.compress_type == ZIP_DEFLATED:
-                if len(self.readbuffer) > size:
-                    # the user has requested fewer bytes than we've already
-                    # pulled through the decompressor; don't read any more
-                    bytesToRead = 0
-                else:
-                    # user will use up the buffer, so read some more
-                    lr = len(self.rawbuffer)
-                    bytesToRead = min(bytesToRead, self.compreadsize - lr)
-
-        # avoid reading past end of file contents
-        if bytesToRead + self.bytes_read > self.compress_size:
-            bytesToRead = self.compress_size - self.bytes_read
-
-        # try to read from file (if necessary)
-        if bytesToRead > 0:
-            bytes = self.fileobj.read(bytesToRead)
-            self.bytes_read += len(bytes)
-            self.rawbuffer += bytes
-
-            # handle contents of raw buffer
-            if self.rawbuffer:
-                newdata = self.rawbuffer
-                self.rawbuffer = ''
-
-                # decrypt new data if we were given an object to handle that
-                if newdata and self.decrypter is not None:
-                    newdata = ''.join(map(self.decrypter, newdata))
-
-                # decompress newly read data if necessary
-                if newdata and self.compress_type == ZIP_DEFLATED:
-                    newdata = self.dc.decompress(newdata)
-                    self.rawbuffer = self.dc.unconsumed_tail
-                    if self.eof and len(self.rawbuffer) == 0:
-                        # we're out of raw bytes (both from the file and
-                        # the local buffer); flush just to make sure the
-                        # decompressor is done
-                        newdata += self.dc.flush()
-                        # prevent decompressor from being used again
-                        self.dc = None
-
-                self.readbuffer += newdata
-
-        # return what the user asked for
-        if size is None or len(self.readbuffer) <= size:
-            bytes = self.readbuffer
-            self.readbuffer = ''
-        else:
-            bytes = self.readbuffer[:size]
-            self.readbuffer = self.readbuffer[size:]
-
-        return bytes
+    # Max size supported by decompressor.
+    MAX_N = 1 << 31 - 1
+
+    # Read from compressed files in 4k blocks.
+    MIN_READ_SIZE = 4096
+
+    # Search for universal newlines or line chunks.
+    PATTERN = re.compile(r'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)')
+
+    def __init__(self, fileobj, mode, zipinfo, decrypter=None):
+        self._fileobj = fileobj
+        self._decrypter = decrypter
+
+        self._decompressor = zlib.decompressobj(-15)
+        self._unconsumed = ''
+
+        self._readbuffer = ''
+        self._offset = 0
+
+        self._universal = 'U' in mode
+        self.newlines = None
+
+        self._compress_type = zipinfo.compress_type
+        self._compress_size = zipinfo.compress_size
+        self._compress_left = zipinfo.compress_size
+
+        # Adjust read size for encrypted files since the first 12 bytes
+        # are for the encryption/password information.
+        if self._decrypter is not None:
+            self._compress_left -= 12
+
+        self.mode = mode
+        self.name = zipinfo.filename
+
+    def readline(self, limit=-1):
+        """Read and return a line from the stream.
+
+        If limit is specified, at most limit bytes will be read.
+        """
+
+        if not self._universal and limit < 0:
+            # Shortcut common case - newline found in buffer.
+            i = self._readbuffer.find('\n', self._offset) + 1
+            if i > 0:
+                line = self._readbuffer[self._offset: i]
+                self._offset = i
+                return line
+
+        if not self._universal:
+            return io.BufferedIOBase.readline(self, limit)
+
+        line = ''
+        while limit < 0 or len(line) < limit:
+            readahead = self.peek(2)
+            if readahead == '':
+                return line
+
+            #
+            # Search for universal newlines or line chunks.
+            #
+            # The pattern returns either a line chunk or a newline, but not
+            # both. Combined with peek(2), we are assured that the sequence
+            # '\r\n' is always retrieved completely and never split into
+            # separate newlines - '\r', '\n' due to coincidental readaheads.
+            #
+            match = self.PATTERN.search(readahead)
+            newline = match.group('newline')
+            if newline is not None:
+                if self.newlines is None:
+                    self.newlines = []
+                if newline not in self.newlines:
+                    self.newlines.append(newline)
+                self._offset += len(newline)
+                return line + '\n'
+
+            chunk = match.group('chunk')
+            if limit >= 0:
+                chunk = chunk[: limit - len(line)]
+
+            self._offset += len(chunk)
+            line += chunk
+
+        return line
+
+    def peek(self, n=1):
+        """Returns buffered bytes without advancing the position."""
+        if n > len(self._readbuffer) - self._offset:
+            chunk = self.read(n)
+            self._offset -= len(chunk)
+
+        # Return up to 512 bytes to reduce allocation overhead for tight loops.
+        return self._readbuffer[self._offset: self._offset + 512]
+
+    def readable(self):
+        return True
+
+    def read(self, n=-1):
+        """Read and return up to n bytes.
+        If the argument is omitted, None, or negative, data is read and returned until EOF is reached..
+        """
+
+        buf = ''
+        while n < 0 or n is None or n > len(buf):
+            data = self.read1(n)
+            if len(data) == 0:
+                return buf
+
+            buf += data
+
+        return buf
+
+    def read1(self, n):
+        """Read up to n bytes with at most one read() system call."""
+
+        # Simplify algorithm (branching) by transforming negative n to large n.
+        if n < 0 or n is None:
+            n = self.MAX_N
+
+        # Bytes available in read buffer.
+        len_readbuffer = len(self._readbuffer) - self._offset
+
+        # Read from file.
+        if self._compress_left > 0 and n > len_readbuffer + len(self._unconsumed):
+            nbytes = n - len_readbuffer - len(self._unconsumed)
+            nbytes = max(nbytes, self.MIN_READ_SIZE)
+            nbytes = min(nbytes, self._compress_left)
+
+            data = self._fileobj.read(nbytes)
+            self._compress_left -= len(data)
+
+            if data and self._decrypter is not None:
+                data = ''.join(map(self._decrypter, data))
+
+            if self._compress_type == ZIP_STORED:
+                self._readbuffer = self._readbuffer[self._offset:] + data
+                self._offset = 0
+            else:
+                # Prepare deflated bytes for decompression.
+                self._unconsumed += data
+
+        # Handle unconsumed data.
+        if len(self._unconsumed) > 0 and n > len_readbuffer:
+            data = self._decompressor.decompress(
+                self._unconsumed,
+                max(n - len_readbuffer, self.MIN_READ_SIZE)
+            )
+
+            self._unconsumed = self._decompressor.unconsumed_tail
+            if len(self._unconsumed) == 0 and self._compress_left == 0:
+                data += self._decompressor.flush()
+
+            self._readbuffer = self._readbuffer[self._offset:] + data
+            self._offset = 0
+
+        # Read from buffer.
+        data = self._readbuffer[self._offset: self._offset + n]
+        self._offset += len(data)
+        return data
+
 
 class ZipFile:
@@ -918,16 +892,7 @@ class ZipFile:
             if ord(h[11]) != check_byte:
                 raise RuntimeError("Bad password for file", name)
 
-        # build and return a ZipExtFile
-        if zd is None:
-            zef = ZipExtFile(zef_file, zinfo)
-        else:
-            zef = ZipExtFile(zef_file, zinfo, zd)
-
-        # set universal newlines on ZipExtFile if necessary
-        if "U" in mode:
-            zef.set_univ_newlines(True)
-        return zef
+        return ZipExtFile(zef_file, mode, zinfo, zd)
 
     def extract(self, member, path=None, pwd=None):
         """Extract a member from the archive to the current working directory,
......
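
The universal-newline handling in the new `readline()` relies on `PATTERN` matching either a line chunk or a single newline per call, never both, while `peek(2)` ensures a `'\r\n'` pair is never split across readaheads. A small standalone snippet (illustrative only, not part of the patch) showing how the pattern classifies a readahead:

    import re

    PATTERN = re.compile(r'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)')

    # Readahead starts with text: only the 'chunk' group matches.
    m = PATTERN.search('spam\r\neggs')
    assert m.group('chunk') == 'spam' and m.group('newline') is None

    # Readahead starts with a newline: '\r\n' is consumed as one newline.
    m = PATTERN.search('\r\nspam')
    assert m.group('newline') == '\r\n' and m.group('chunk') is None

    # A lone '\r' (old Mac convention) is also recognized as a newline.
    m = PATTERN.search('\rspam')
    assert m.group('newline') == '\r'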
@@ -47,6 +47,13 @@ Core and Builtins
 Library
 -------
 
+- Issue #7610: Reworked implementation of the internal
+  :class:`zipfile.ZipExtFile` class used to represent files stored inside
+  an archive.  The new implementation is significantly faster and can
+  be wrapped in a :class:`io.BufferedReader` object for more speedups.
+  It also solves an issue where interleaved calls to `read()` and
+  `readline()` give wrong results.  Patch by Nir Aides.
+
 - Issue #7792: Registering non-classes to ABCs raised an obscure error.
 
 - Removed the functions 'verify' and 'vereq' from Lib/test/test_support.py.
......