Commit a32f9a24 authored by Antoine Pitrou's avatar Antoine Pitrou

Merged revisions 77798 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r77798 | antoine.pitrou | 2010-01-27 21:59:50 +0100 (mer., 27 janv. 2010) | 8 lines

  Issue #7610: Reworked implementation of the internal
  :class:`zipfile.ZipExtFile` class used to represent files stored inside
  an archive.  The new implementation is significantly faster and can
  be wrapped in a :class:`io.BufferedReader` object for more speedups.
  It also solves an issue where interleaved calls to `read()` and
  `readline()` give wrong results.  Patch by Nir Aides.
........
parent 176d6c40
...@@ -168,6 +168,45 @@ class TestsWithSourceFile(unittest.TestCase): ...@@ -168,6 +168,45 @@ class TestsWithSourceFile(unittest.TestCase):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_random_open_test(f, zipfile.ZIP_STORED) self.zip_random_open_test(f, zipfile.ZIP_STORED)
def test_univeral_readaheads(self):
f = io.BytesIO()
data = b'a\r\n' * 16 * 1024
zipfp = zipfile.ZipFile(f, 'w', zipfile.ZIP_STORED)
zipfp.writestr(TESTFN, data)
zipfp.close()
data2 = b''
zipfp = zipfile.ZipFile(f, 'r')
zipopen = zipfp.open(TESTFN, 'rU')
for line in zipopen:
data2 += line
zipfp.close()
self.assertEqual(data, data2.replace(b'\n', b'\r\n'))
def zip_readline_read_test(self, f, compression):
self.make_test_archive(f, compression)
# Read the ZIP archive
zipfp = zipfile.ZipFile(f, "r")
zipopen = zipfp.open(TESTFN)
data = b''
while True:
read = zipopen.readline()
if not read:
break
data += read
read = zipopen.read(100)
if not read:
break
data += read
self.assertEqual(data, self.data)
zipfp.close()
def zip_readline_test(self, f, compression): def zip_readline_test(self, f, compression):
self.make_test_archive(f, compression) self.make_test_archive(f, compression)
...@@ -195,6 +234,11 @@ class TestsWithSourceFile(unittest.TestCase): ...@@ -195,6 +234,11 @@ class TestsWithSourceFile(unittest.TestCase):
for line, zipline in zip(self.line_gen, zipfp.open(TESTFN)): for line, zipline in zip(self.line_gen, zipfp.open(TESTFN)):
self.assertEqual(zipline, line + '\n') self.assertEqual(zipline, line + '\n')
def test_readline_read_stored(self):
# Issue #7610: calls to readline() interleaved with calls to read().
for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_readline_read_test(f, zipfile.ZIP_STORED)
def test_readline_stored(self): def test_readline_stored(self):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_readline_test(f, zipfile.ZIP_STORED) self.zip_readline_test(f, zipfile.ZIP_STORED)
...@@ -223,6 +267,12 @@ class TestsWithSourceFile(unittest.TestCase): ...@@ -223,6 +267,12 @@ class TestsWithSourceFile(unittest.TestCase):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_random_open_test(f, zipfile.ZIP_DEFLATED) self.zip_random_open_test(f, zipfile.ZIP_DEFLATED)
@skipUnless(zlib, "requires zlib")
def test_readline_read_deflated(self):
# Issue #7610: calls to readline() interleaved with calls to read().
for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_readline_read_test(f, zipfile.ZIP_DEFLATED)
@skipUnless(zlib, "requires zlib") @skipUnless(zlib, "requires zlib")
def test_readline_deflated(self): def test_readline_deflated(self):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
...@@ -1067,6 +1117,29 @@ class UniversalNewlineTests(unittest.TestCase): ...@@ -1067,6 +1117,29 @@ class UniversalNewlineTests(unittest.TestCase):
zipdata = zipfp.open(fn, "rU").read() zipdata = zipfp.open(fn, "rU").read()
self.assertEqual(self.arcdata[sep], zipdata) self.assertEqual(self.arcdata[sep], zipdata)
def readline_read_test(self, f, compression):
self.make_test_archive(f, compression)
# Read the ZIP archive
zipfp = zipfile.ZipFile(f, "r")
for sep, fn in self.arcfiles.items():
zipopen = zipfp.open(fn, "rU")
data = b''
while True:
read = zipopen.readline()
if not read:
break
data += read
read = zipopen.read(5)
if not read:
break
data += read
self.assertEqual(data, self.arcdata['\n'])
zipfp.close()
def readline_test(self, f, compression): def readline_test(self, f, compression):
self.make_test_archive(f, compression) self.make_test_archive(f, compression)
...@@ -1101,6 +1174,11 @@ class UniversalNewlineTests(unittest.TestCase): ...@@ -1101,6 +1174,11 @@ class UniversalNewlineTests(unittest.TestCase):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.read_test(f, zipfile.ZIP_STORED) self.read_test(f, zipfile.ZIP_STORED)
def test_readline_read_stored(self):
# Issue #7610: calls to readline() interleaved with calls to read().
for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.readline_read_test(f, zipfile.ZIP_STORED)
def test_readline_stored(self): def test_readline_stored(self):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.readline_test(f, zipfile.ZIP_STORED) self.readline_test(f, zipfile.ZIP_STORED)
...@@ -1118,6 +1196,12 @@ class UniversalNewlineTests(unittest.TestCase): ...@@ -1118,6 +1196,12 @@ class UniversalNewlineTests(unittest.TestCase):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.read_test(f, zipfile.ZIP_DEFLATED) self.read_test(f, zipfile.ZIP_DEFLATED)
@skipUnless(zlib, "requires zlib")
def test_readline_read_deflated(self):
# Issue #7610: calls to readline() interleaved with calls to read().
for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.readline_read_test(f, zipfile.ZIP_DEFLATED)
@skipUnless(zlib, "requires zlib") @skipUnless(zlib, "requires zlib")
def test_readline_deflated(self): def test_readline_deflated(self):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
......
...@@ -5,6 +5,8 @@ XXX references to utf-8 need further investigation. ...@@ -5,6 +5,8 @@ XXX references to utf-8 need further investigation.
""" """
import struct, os, time, sys, shutil import struct, os, time, sys, shutil
import binascii, io, stat import binascii, io, stat
import io
import re
try: try:
import zlib # We may need its compression method import zlib # We may need its compression method
...@@ -443,205 +445,172 @@ class _ZipDecrypter: ...@@ -443,205 +445,172 @@ class _ZipDecrypter:
self._UpdateKeys(c) self._UpdateKeys(c)
return c return c
class ZipExtFile: class ZipExtFile(io.BufferedIOBase):
"""File-like object for reading an archive member. """File-like object for reading an archive member.
Is returned by ZipFile.open(). Is returned by ZipFile.open().
""" """
def __init__(self, fileobj, zipinfo, decrypt=None): # Max size supported by decompressor.
self.fileobj = fileobj MAX_N = 1 << 31 - 1
self.decrypter = decrypt
self.bytes_read = 0
self.rawbuffer = b''
self.readbuffer = b''
self.linebuffer = b''
self.eof = False
self.univ_newlines = False
self.nlSeps = (b"\n", )
self.lastdiscard = b''
self.compress_type = zipinfo.compress_type
self.compress_size = zipinfo.compress_size
self.closed = False
self.mode = "r"
self.name = zipinfo.filename
# read from compressed files in 64k blocks # Read from compressed files in 4k blocks.
self.compreadsize = 64*1024 MIN_READ_SIZE = 4096
if self.compress_type == ZIP_DEFLATED:
self.dc = zlib.decompressobj(-15)
def set_univ_newlines(self, univ_newlines): # Search for universal newlines or line chunks.
self.univ_newlines = univ_newlines PATTERN = re.compile(br'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)')
# pick line separator char(s) based on universal newlines flag def __init__(self, fileobj, mode, zipinfo, decrypter=None):
self.nlSeps = (b"\n", ) self._fileobj = fileobj
if self.univ_newlines: self._decrypter = decrypter
self.nlSeps = (b"\r\n", b"\r", b"\n")
def __iter__(self): self._decompressor = zlib.decompressobj(-15)
return self self._unconsumed = b''
def __next__(self): self._readbuffer = b''
nextline = self.readline() self._offset = 0
if not nextline:
raise StopIteration()
return nextline self._universal = 'U' in mode
self.newlines = None
def close(self): self._compress_type = zipinfo.compress_type
self.closed = True self._compress_size = zipinfo.compress_size
self._compress_left = zipinfo.compress_size
def _checkfornewline(self):
nl, nllen = -1, -1
if self.linebuffer:
# ugly check for cases where half of an \r\n pair was
# read on the last pass, and the \r was discarded. In this
# case we just throw away the \n at the start of the buffer.
if (self.lastdiscard, self.linebuffer[:1]) == (b'\r', b'\n'):
self.linebuffer = self.linebuffer[1:]
for sep in self.nlSeps:
nl = self.linebuffer.find(sep)
if nl >= 0:
nllen = len(sep)
return nl, nllen
return nl, nllen
def readline(self, size = -1):
"""Read a line with approx. size. If size is negative,
read a whole line.
"""
if size < 0:
size = sys.maxsize
elif size == 0:
return b''
# check for a newline already in buffer # Adjust read size for encrypted files since the first 12 bytes
nl, nllen = self._checkfornewline() # are for the encryption/password information.
if self._decrypter is not None:
self._compress_left -= 12
if nl >= 0: self.mode = mode
# the next line was already in the buffer self.name = zipinfo.filename
nl = min(nl, size)
else: def readline(self, limit=-1):
# no line break in buffer - try to read more """Read and return a line from the stream.
size -= len(self.linebuffer)
while nl < 0 and size > 0: If limit is specified, at most limit bytes will be read.
buf = self.read(min(size, 100))
if not buf:
break
self.linebuffer += buf
size -= len(buf)
# check for a newline in buffer
nl, nllen = self._checkfornewline()
# we either ran out of bytes in the file, or
# met the specified size limit without finding a newline,
# so return current buffer
if nl < 0:
s = self.linebuffer
self.linebuffer = b''
return s
buf = self.linebuffer[:nl]
self.lastdiscard = self.linebuffer[nl:nl + nllen]
self.linebuffer = self.linebuffer[nl + nllen:]
# line is always returned with \n as newline char (except possibly
# for a final incomplete line in the file, which is handled above).
return buf + b"\n"
def readlines(self, sizehint = -1):
"""Return a list with all (following) lines. The sizehint parameter
is ignored in this implementation.
""" """
result = []
while True:
line = self.readline()
if not line: break
result.append(line)
return result
def read(self, size = None):
# act like file obj and return empty string if size is 0
if size == 0:
return b''
# determine read size
bytesToRead = self.compress_size - self.bytes_read
# adjust read size for encrypted files since the first 12 bytes
# are for the encryption/password information
if self.decrypter is not None:
bytesToRead -= 12
if size is not None and size >= 0:
if self.compress_type == ZIP_STORED:
lr = len(self.readbuffer)
bytesToRead = min(bytesToRead, size - lr)
elif self.compress_type == ZIP_DEFLATED:
if len(self.readbuffer) > size:
# the user has requested fewer bytes than we've already
# pulled through the decompressor; don't read any more
bytesToRead = 0
else:
# user will use up the buffer, so read some more
lr = len(self.rawbuffer)
bytesToRead = min(bytesToRead, self.compreadsize - lr)
# avoid reading past end of file contents
if bytesToRead + self.bytes_read > self.compress_size:
bytesToRead = self.compress_size - self.bytes_read
# try to read from file (if necessary)
if bytesToRead > 0:
data = self.fileobj.read(bytesToRead)
self.bytes_read += len(data)
try:
self.rawbuffer += data
except:
print(repr(self.fileobj), repr(self.rawbuffer),
repr(data))
raise
# handle contents of raw buffer
if self.rawbuffer:
newdata = self.rawbuffer
self.rawbuffer = b''
# decrypt new data if we were given an object to handle that
if newdata and self.decrypter is not None:
newdata = bytes(map(self.decrypter, newdata))
# decompress newly read data if necessary
if newdata and self.compress_type == ZIP_DEFLATED:
newdata = self.dc.decompress(newdata)
self.rawbuffer = self.dc.unconsumed_tail
if self.eof and len(self.rawbuffer) == 0:
# we're out of raw bytes (both from the file and
# the local buffer); flush just to make sure the
# decompressor is done
newdata += self.dc.flush()
# prevent decompressor from being used again
self.dc = None
self.readbuffer += newdata
# return what the user asked for
if size is None or len(self.readbuffer) <= size:
data = self.readbuffer
self.readbuffer = b''
else:
data = self.readbuffer[:size]
self.readbuffer = self.readbuffer[size:]
if not self._universal and limit < 0:
# Shortcut common case - newline found in buffer.
i = self._readbuffer.find(b'\n', self._offset) + 1
if i > 0:
line = self._readbuffer[self._offset: i]
self._offset = i
return line
if not self._universal:
return io.BufferedIOBase.readline(self, limit)
line = b''
while limit < 0 or len(line) < limit:
readahead = self.peek(2)
if readahead == b'':
return line
#
# Search for universal newlines or line chunks.
#
# The pattern returns either a line chunk or a newline, but not
# both. Combined with peek(2), we are assured that the sequence
# '\r\n' is always retrieved completely and never split into
# separate newlines - '\r', '\n' due to coincidental readaheads.
#
match = self.PATTERN.search(readahead)
newline = match.group('newline')
if newline is not None:
if self.newlines is None:
self.newlines = []
if newline not in self.newlines:
self.newlines.append(newline)
self._offset += len(newline)
return line + b'\n'
chunk = match.group('chunk')
if limit >= 0:
chunk = chunk[: limit - len(line)]
self._offset += len(chunk)
line += chunk
return line
def peek(self, n=1):
"""Returns buffered bytes without advancing the position."""
if n > len(self._readbuffer) - self._offset:
chunk = self.read(n)
self._offset -= len(chunk)
# Return up to 512 bytes to reduce allocation overhead for tight loops.
return self._readbuffer[self._offset: self._offset + 512]
def readable(self):
return True
def read(self, n=-1):
"""Read and return up to n bytes.
If the argument is omitted, None, or negative, data is read and returned until EOF is reached..
"""
buf = b''
while n < 0 or n is None or n > len(buf):
data = self.read1(n)
if len(data) == 0:
return buf
buf += data
return buf
def read1(self, n):
"""Read up to n bytes with at most one read() system call."""
# Simplify algorithm (branching) by transforming negative n to large n.
if n < 0 or n is None:
n = self.MAX_N
# Bytes available in read buffer.
len_readbuffer = len(self._readbuffer) - self._offset
# Read from file.
if self._compress_left > 0 and n > len_readbuffer + len(self._unconsumed):
nbytes = n - len_readbuffer - len(self._unconsumed)
nbytes = max(nbytes, self.MIN_READ_SIZE)
nbytes = min(nbytes, self._compress_left)
data = self._fileobj.read(nbytes)
self._compress_left -= len(data)
if data and self._decrypter is not None:
data = bytes(map(self._decrypter, data))
if self._compress_type == ZIP_STORED:
self._readbuffer = self._readbuffer[self._offset:] + data
self._offset = 0
else:
# Prepare deflated bytes for decompression.
self._unconsumed += data
# Handle unconsumed data.
if len(self._unconsumed) > 0 and n > len_readbuffer:
data = self._decompressor.decompress(
self._unconsumed,
max(n - len_readbuffer, self.MIN_READ_SIZE)
)
self._unconsumed = self._decompressor.unconsumed_tail
if len(self._unconsumed) == 0 and self._compress_left == 0:
data += self._decompressor.flush()
self._readbuffer = self._readbuffer[self._offset:] + data
self._offset = 0
# Read from buffer.
data = self._readbuffer[self._offset: self._offset + n]
self._offset += len(data)
return data return data
class ZipFile: class ZipFile:
""" Class with methods to open, read, write, close, list zip files. """ Class with methods to open, read, write, close, list zip files.
...@@ -925,16 +894,7 @@ class ZipFile: ...@@ -925,16 +894,7 @@ class ZipFile:
if h[11] != check_byte: if h[11] != check_byte:
raise RuntimeError("Bad password for file", name) raise RuntimeError("Bad password for file", name)
# build and return a ZipExtFile return ZipExtFile(zef_file, mode, zinfo, zd)
if zd is None:
zef = ZipExtFile(zef_file, zinfo)
else:
zef = ZipExtFile(zef_file, zinfo, zd)
# set universal newlines on ZipExtFile if necessary
if "U" in mode:
zef.set_univ_newlines(True)
return zef
def extract(self, member, path=None, pwd=None): def extract(self, member, path=None, pwd=None):
"""Extract a member from the archive to the current working directory, """Extract a member from the archive to the current working directory,
......
...@@ -225,7 +225,6 @@ C-API ...@@ -225,7 +225,6 @@ C-API
- Issue #1419652: Change the first argument to PyImport_AppendInittab() to - Issue #1419652: Change the first argument to PyImport_AppendInittab() to
``const char *`` as the string is stored beyond the call. ``const char *`` as the string is stored beyond the call.
- Issue #2422: When compiled with the ``--with-valgrind`` option, the - Issue #2422: When compiled with the ``--with-valgrind`` option, the
pymalloc allocator will be automatically disabled when running under pymalloc allocator will be automatically disabled when running under
Valgrind. This gives improved memory leak detection when running Valgrind. This gives improved memory leak detection when running
...@@ -234,6 +233,13 @@ C-API ...@@ -234,6 +233,13 @@ C-API
Library Library
------- -------
- Issue #7610: Reworked implementation of the internal
:class:`zipfile.ZipExtFile` class used to represent files stored inside
an archive. The new implementation is significantly faster and can
be wrapped in a :class:`io.BufferedReader` object for more speedups.
It also solves an issue where interleaved calls to `read()` and
`readline()` give wrong results. Patch by Nir Aides.
- Issue #6963: Added "maxtasksperchild" argument to multiprocessing.Pool, - Issue #6963: Added "maxtasksperchild" argument to multiprocessing.Pool,
allowing for a maximum number of tasks within the pool to be completed by allowing for a maximum number of tasks within the pool to be completed by
the worker before that worker is terminated, and a new one created to the worker before that worker is terminated, and a new one created to
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment