Commit a32f9a24 authored by Antoine Pitrou's avatar Antoine Pitrou

Merged revisions 77798 via svnmerge from

svn+ssh://pythondev@svn.python.org/python/trunk

........
  r77798 | antoine.pitrou | 2010-01-27 21:59:50 +0100 (mer., 27 janv. 2010) | 8 lines

  Issue #7610: Reworked implementation of the internal
  :class:`zipfile.ZipExtFile` class used to represent files stored inside
  an archive.  The new implementation is significantly faster and can
  be wrapped in a :class:`io.BufferedReader` object for more speedups.
  It also solves an issue where interleaved calls to `read()` and
  `readline()` give wrong results.  Patch by Nir Aides.
........
parent 176d6c40
...@@ -168,6 +168,45 @@ class TestsWithSourceFile(unittest.TestCase): ...@@ -168,6 +168,45 @@ class TestsWithSourceFile(unittest.TestCase):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_random_open_test(f, zipfile.ZIP_STORED) self.zip_random_open_test(f, zipfile.ZIP_STORED)
def test_universal_readaheads(self):
    """Regression test for issue #7610: universal-newline translation
    must survive the readahead buffering performed by line iteration.

    The archive member contains CRLF-terminated lines; iterating the
    member opened in 'rU' mode must yield the same payload with each
    b'\r\n' translated to b'\n'.
    """
    # NOTE(review): fixed typo in the method name ("univeral" ->
    # "universal"); unittest discovers it by the "test_" prefix, so
    # the rename is safe.
    f = io.BytesIO()

    # 48 KiB of short CRLF lines -- enough to span several internal
    # read buffers, which is what triggered the original bug.
    data = b'a\r\n' * 16 * 1024

    zipfp = zipfile.ZipFile(f, 'w', zipfile.ZIP_STORED)
    zipfp.writestr(TESTFN, data)
    zipfp.close()

    data2 = b''
    zipfp = zipfile.ZipFile(f, 'r')
    zipopen = zipfp.open(TESTFN, 'rU')
    for line in zipopen:
        data2 += line
    zipfp.close()

    # 'rU' mode converted b'\r\n' to b'\n'; undo that before comparing.
    self.assertEqual(data, data2.replace(b'\n', b'\r\n'))
def zip_readline_read_test(self, f, compression):
    """Interleave readline() and read() calls on one archive member and
    verify the concatenated result equals the original payload."""
    self.make_test_archive(f, compression)

    # Re-open the archive and read the member back, alternating the
    # two read styles on every iteration.
    zipfp = zipfile.ZipFile(f, "r")
    zipopen = zipfp.open(TESTFN)

    chunks = []
    while True:
        chunk = zipopen.readline()
        if not chunk:
            break
        chunks.append(chunk)

        chunk = zipopen.read(100)
        if not chunk:
            break
        chunks.append(chunk)

    self.assertEqual(b''.join(chunks), self.data)
    zipfp.close()
def zip_readline_test(self, f, compression): def zip_readline_test(self, f, compression):
self.make_test_archive(f, compression) self.make_test_archive(f, compression)
...@@ -195,6 +234,11 @@ class TestsWithSourceFile(unittest.TestCase): ...@@ -195,6 +234,11 @@ class TestsWithSourceFile(unittest.TestCase):
for line, zipline in zip(self.line_gen, zipfp.open(TESTFN)): for line, zipline in zip(self.line_gen, zipfp.open(TESTFN)):
self.assertEqual(zipline, line + '\n') self.assertEqual(zipline, line + '\n')
def test_readline_read_stored(self):
    # Issue #7610: calls to readline() interleaved with calls to read().
    targets = (TESTFN2, TemporaryFile(), io.BytesIO())
    for f in targets:
        self.zip_readline_read_test(f, zipfile.ZIP_STORED)
def test_readline_stored(self): def test_readline_stored(self):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_readline_test(f, zipfile.ZIP_STORED) self.zip_readline_test(f, zipfile.ZIP_STORED)
...@@ -223,6 +267,12 @@ class TestsWithSourceFile(unittest.TestCase): ...@@ -223,6 +267,12 @@ class TestsWithSourceFile(unittest.TestCase):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.zip_random_open_test(f, zipfile.ZIP_DEFLATED) self.zip_random_open_test(f, zipfile.ZIP_DEFLATED)
@skipUnless(zlib, "requires zlib")
def test_readline_read_deflated(self):
    # Issue #7610: calls to readline() interleaved with calls to read().
    targets = (TESTFN2, TemporaryFile(), io.BytesIO())
    for f in targets:
        self.zip_readline_read_test(f, zipfile.ZIP_DEFLATED)
@skipUnless(zlib, "requires zlib") @skipUnless(zlib, "requires zlib")
def test_readline_deflated(self): def test_readline_deflated(self):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
...@@ -1067,6 +1117,29 @@ class UniversalNewlineTests(unittest.TestCase): ...@@ -1067,6 +1117,29 @@ class UniversalNewlineTests(unittest.TestCase):
zipdata = zipfp.open(fn, "rU").read() zipdata = zipfp.open(fn, "rU").read()
self.assertEqual(self.arcdata[sep], zipdata) self.assertEqual(self.arcdata[sep], zipdata)
def readline_read_test(self, f, compression):
    """Interleave readline() and read() on members opened in 'rU' mode
    and verify each member decodes to the '\n'-normalized payload."""
    self.make_test_archive(f, compression)

    # Read the ZIP archive back; the separator key is irrelevant here,
    # only the member names are needed.
    zipfp = zipfile.ZipFile(f, "r")
    for fn in self.arcfiles.values():
        zipopen = zipfp.open(fn, "rU")

        chunks = []
        while True:
            chunk = zipopen.readline()
            if not chunk:
                break
            chunks.append(chunk)

            chunk = zipopen.read(5)
            if not chunk:
                break
            chunks.append(chunk)

        self.assertEqual(b''.join(chunks), self.arcdata['\n'])
    zipfp.close()
def readline_test(self, f, compression): def readline_test(self, f, compression):
self.make_test_archive(f, compression) self.make_test_archive(f, compression)
...@@ -1101,6 +1174,11 @@ class UniversalNewlineTests(unittest.TestCase): ...@@ -1101,6 +1174,11 @@ class UniversalNewlineTests(unittest.TestCase):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.read_test(f, zipfile.ZIP_STORED) self.read_test(f, zipfile.ZIP_STORED)
def test_readline_read_stored(self):
    # Issue #7610: calls to readline() interleaved with calls to read().
    targets = (TESTFN2, TemporaryFile(), io.BytesIO())
    for f in targets:
        self.readline_read_test(f, zipfile.ZIP_STORED)
def test_readline_stored(self): def test_readline_stored(self):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.readline_test(f, zipfile.ZIP_STORED) self.readline_test(f, zipfile.ZIP_STORED)
...@@ -1118,6 +1196,12 @@ class UniversalNewlineTests(unittest.TestCase): ...@@ -1118,6 +1196,12 @@ class UniversalNewlineTests(unittest.TestCase):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
self.read_test(f, zipfile.ZIP_DEFLATED) self.read_test(f, zipfile.ZIP_DEFLATED)
@skipUnless(zlib, "requires zlib")
def test_readline_read_deflated(self):
    # Issue #7610: calls to readline() interleaved with calls to read().
    targets = (TESTFN2, TemporaryFile(), io.BytesIO())
    for f in targets:
        self.readline_read_test(f, zipfile.ZIP_DEFLATED)
@skipUnless(zlib, "requires zlib") @skipUnless(zlib, "requires zlib")
def test_readline_deflated(self): def test_readline_deflated(self):
for f in (TESTFN2, TemporaryFile(), io.BytesIO()): for f in (TESTFN2, TemporaryFile(), io.BytesIO()):
......
...@@ -5,6 +5,8 @@ XXX references to utf-8 need further investigation. ...@@ -5,6 +5,8 @@ XXX references to utf-8 need further investigation.
""" """
import struct, os, time, sys, shutil import struct, os, time, sys, shutil
import binascii, io, stat import binascii, io, stat
import io
import re
try: try:
import zlib # We may need its compression method import zlib # We may need its compression method
...@@ -443,205 +445,172 @@ class _ZipDecrypter: ...@@ -443,205 +445,172 @@ class _ZipDecrypter:
self._UpdateKeys(c) self._UpdateKeys(c)
return c return c
class ZipExtFile: class ZipExtFile(io.BufferedIOBase):
"""File-like object for reading an archive member. """File-like object for reading an archive member.
Is returned by ZipFile.open(). Is returned by ZipFile.open().
""" """
def __init__(self, fileobj, zipinfo, decrypt=None): # Max size supported by decompressor.
self.fileobj = fileobj MAX_N = 1 << 31 - 1
self.decrypter = decrypt
self.bytes_read = 0
self.rawbuffer = b''
self.readbuffer = b''
self.linebuffer = b''
self.eof = False
self.univ_newlines = False
self.nlSeps = (b"\n", )
self.lastdiscard = b''
self.compress_type = zipinfo.compress_type
self.compress_size = zipinfo.compress_size
self.closed = False
self.mode = "r"
self.name = zipinfo.filename
# read from compressed files in 64k blocks # Read from compressed files in 4k blocks.
self.compreadsize = 64*1024 MIN_READ_SIZE = 4096
if self.compress_type == ZIP_DEFLATED:
self.dc = zlib.decompressobj(-15)
def set_univ_newlines(self, univ_newlines): # Search for universal newlines or line chunks.
self.univ_newlines = univ_newlines PATTERN = re.compile(br'^(?P<chunk>[^\r\n]+)|(?P<newline>\n|\r\n?)')
# pick line separator char(s) based on universal newlines flag def __init__(self, fileobj, mode, zipinfo, decrypter=None):
self.nlSeps = (b"\n", ) self._fileobj = fileobj
if self.univ_newlines: self._decrypter = decrypter
self.nlSeps = (b"\r\n", b"\r", b"\n")
def __iter__(self): self._decompressor = zlib.decompressobj(-15)
return self self._unconsumed = b''
def __next__(self): self._readbuffer = b''
nextline = self.readline() self._offset = 0
if not nextline:
raise StopIteration()
return nextline self._universal = 'U' in mode
self.newlines = None
def close(self): self._compress_type = zipinfo.compress_type
self.closed = True self._compress_size = zipinfo.compress_size
self._compress_left = zipinfo.compress_size
def _checkfornewline(self):
nl, nllen = -1, -1
if self.linebuffer:
# ugly check for cases where half of an \r\n pair was
# read on the last pass, and the \r was discarded. In this
# case we just throw away the \n at the start of the buffer.
if (self.lastdiscard, self.linebuffer[:1]) == (b'\r', b'\n'):
self.linebuffer = self.linebuffer[1:]
for sep in self.nlSeps:
nl = self.linebuffer.find(sep)
if nl >= 0:
nllen = len(sep)
return nl, nllen
return nl, nllen
def readline(self, size = -1):
"""Read a line with approx. size. If size is negative,
read a whole line.
"""
if size < 0:
size = sys.maxsize
elif size == 0:
return b''
# check for a newline already in buffer # Adjust read size for encrypted files since the first 12 bytes
nl, nllen = self._checkfornewline() # are for the encryption/password information.
if self._decrypter is not None:
self._compress_left -= 12
if nl >= 0: self.mode = mode
# the next line was already in the buffer self.name = zipinfo.filename
nl = min(nl, size)
else: def readline(self, limit=-1):
# no line break in buffer - try to read more """Read and return a line from the stream.
size -= len(self.linebuffer)
while nl < 0 and size > 0: If limit is specified, at most limit bytes will be read.
buf = self.read(min(size, 100))
if not buf:
break
self.linebuffer += buf
size -= len(buf)
# check for a newline in buffer
nl, nllen = self._checkfornewline()
# we either ran out of bytes in the file, or
# met the specified size limit without finding a newline,
# so return current buffer
if nl < 0:
s = self.linebuffer
self.linebuffer = b''
return s
buf = self.linebuffer[:nl]
self.lastdiscard = self.linebuffer[nl:nl + nllen]
self.linebuffer = self.linebuffer[nl + nllen:]
# line is always returned with \n as newline char (except possibly
# for a final incomplete line in the file, which is handled above).
return buf + b"\n"
def readlines(self, sizehint = -1):
"""Return a list with all (following) lines. The sizehint parameter
is ignored in this implementation.
""" """
result = []
while True:
line = self.readline()
if not line: break
result.append(line)
return result
def read(self, size = None): if not self._universal and limit < 0:
# act like file obj and return empty string if size is 0 # Shortcut common case - newline found in buffer.
if size == 0: i = self._readbuffer.find(b'\n', self._offset) + 1
return b'' if i > 0:
line = self._readbuffer[self._offset: i]
# determine read size self._offset = i
bytesToRead = self.compress_size - self.bytes_read return line
# adjust read size for encrypted files since the first 12 bytes if not self._universal:
# are for the encryption/password information return io.BufferedIOBase.readline(self, limit)
if self.decrypter is not None:
bytesToRead -= 12 line = b''
while limit < 0 or len(line) < limit:
if size is not None and size >= 0: readahead = self.peek(2)
if self.compress_type == ZIP_STORED: if readahead == b'':
lr = len(self.readbuffer) return line
bytesToRead = min(bytesToRead, size - lr)
elif self.compress_type == ZIP_DEFLATED: #
if len(self.readbuffer) > size: # Search for universal newlines or line chunks.
# the user has requested fewer bytes than we've already #
# pulled through the decompressor; don't read any more # The pattern returns either a line chunk or a newline, but not
bytesToRead = 0 # both. Combined with peek(2), we are assured that the sequence
else: # '\r\n' is always retrieved completely and never split into
# user will use up the buffer, so read some more # separate newlines - '\r', '\n' due to coincidental readaheads.
lr = len(self.rawbuffer) #
bytesToRead = min(bytesToRead, self.compreadsize - lr) match = self.PATTERN.search(readahead)
newline = match.group('newline')
# avoid reading past end of file contents if newline is not None:
if bytesToRead + self.bytes_read > self.compress_size: if self.newlines is None:
bytesToRead = self.compress_size - self.bytes_read self.newlines = []
if newline not in self.newlines:
# try to read from file (if necessary) self.newlines.append(newline)
if bytesToRead > 0: self._offset += len(newline)
data = self.fileobj.read(bytesToRead) return line + b'\n'
self.bytes_read += len(data)
try: chunk = match.group('chunk')
self.rawbuffer += data if limit >= 0:
except: chunk = chunk[: limit - len(line)]
print(repr(self.fileobj), repr(self.rawbuffer),
repr(data)) self._offset += len(chunk)
raise line += chunk
return line
def peek(self, n=1):
"""Returns buffered bytes without advancing the position."""
if n > len(self._readbuffer) - self._offset:
chunk = self.read(n)
self._offset -= len(chunk)
# Return up to 512 bytes to reduce allocation overhead for tight loops.
return self._readbuffer[self._offset: self._offset + 512]
def readable(self):
return True
def read(self, n=-1):
"""Read and return up to n bytes.
If the argument is omitted, None, or negative, data is read and returned until EOF is reached.
"""
# handle contents of raw buffer buf = b''
if self.rawbuffer: while n < 0 or n is None or n > len(buf):
newdata = self.rawbuffer data = self.read1(n)
self.rawbuffer = b'' if len(data) == 0:
return buf
# decrypt new data if we were given an object to handle that
if newdata and self.decrypter is not None: buf += data
newdata = bytes(map(self.decrypter, newdata))
return buf
# decompress newly read data if necessary
if newdata and self.compress_type == ZIP_DEFLATED: def read1(self, n):
newdata = self.dc.decompress(newdata) """Read up to n bytes with at most one read() system call."""
self.rawbuffer = self.dc.unconsumed_tail
if self.eof and len(self.rawbuffer) == 0: # Simplify algorithm (branching) by transforming negative n to large n.
# we're out of raw bytes (both from the file and if n < 0 or n is None:
# the local buffer); flush just to make sure the n = self.MAX_N
# decompressor is done
newdata += self.dc.flush() # Bytes available in read buffer.
# prevent decompressor from being used again len_readbuffer = len(self._readbuffer) - self._offset
self.dc = None
# Read from file.
self.readbuffer += newdata if self._compress_left > 0 and n > len_readbuffer + len(self._unconsumed):
nbytes = n - len_readbuffer - len(self._unconsumed)
nbytes = max(nbytes, self.MIN_READ_SIZE)
# return what the user asked for nbytes = min(nbytes, self._compress_left)
if size is None or len(self.readbuffer) <= size:
data = self.readbuffer data = self._fileobj.read(nbytes)
self.readbuffer = b'' self._compress_left -= len(data)
if data and self._decrypter is not None:
data = bytes(map(self._decrypter, data))
if self._compress_type == ZIP_STORED:
self._readbuffer = self._readbuffer[self._offset:] + data
self._offset = 0
else: else:
data = self.readbuffer[:size] # Prepare deflated bytes for decompression.
self.readbuffer = self.readbuffer[size:] self._unconsumed += data
# Handle unconsumed data.
if len(self._unconsumed) > 0 and n > len_readbuffer:
data = self._decompressor.decompress(
self._unconsumed,
max(n - len_readbuffer, self.MIN_READ_SIZE)
)
self._unconsumed = self._decompressor.unconsumed_tail
if len(self._unconsumed) == 0 and self._compress_left == 0:
data += self._decompressor.flush()
self._readbuffer = self._readbuffer[self._offset:] + data
self._offset = 0
# Read from buffer.
data = self._readbuffer[self._offset: self._offset + n]
self._offset += len(data)
return data return data
class ZipFile: class ZipFile:
""" Class with methods to open, read, write, close, list zip files. """ Class with methods to open, read, write, close, list zip files.
...@@ -925,16 +894,7 @@ class ZipFile: ...@@ -925,16 +894,7 @@ class ZipFile:
if h[11] != check_byte: if h[11] != check_byte:
raise RuntimeError("Bad password for file", name) raise RuntimeError("Bad password for file", name)
# build and return a ZipExtFile return ZipExtFile(zef_file, mode, zinfo, zd)
if zd is None:
zef = ZipExtFile(zef_file, zinfo)
else:
zef = ZipExtFile(zef_file, zinfo, zd)
# set universal newlines on ZipExtFile if necessary
if "U" in mode:
zef.set_univ_newlines(True)
return zef
def extract(self, member, path=None, pwd=None): def extract(self, member, path=None, pwd=None):
"""Extract a member from the archive to the current working directory, """Extract a member from the archive to the current working directory,
......
...@@ -225,7 +225,6 @@ C-API ...@@ -225,7 +225,6 @@ C-API
- Issue #1419652: Change the first argument to PyImport_AppendInittab() to - Issue #1419652: Change the first argument to PyImport_AppendInittab() to
``const char *`` as the string is stored beyond the call. ``const char *`` as the string is stored beyond the call.
- Issue #2422: When compiled with the ``--with-valgrind`` option, the - Issue #2422: When compiled with the ``--with-valgrind`` option, the
pymalloc allocator will be automatically disabled when running under pymalloc allocator will be automatically disabled when running under
Valgrind. This gives improved memory leak detection when running Valgrind. This gives improved memory leak detection when running
...@@ -234,6 +233,13 @@ C-API ...@@ -234,6 +233,13 @@ C-API
Library Library
------- -------
- Issue #7610: Reworked implementation of the internal
:class:`zipfile.ZipExtFile` class used to represent files stored inside
an archive. The new implementation is significantly faster and can
be wrapped in a :class:`io.BufferedReader` object for more speedups.
It also solves an issue where interleaved calls to `read()` and
`readline()` give wrong results. Patch by Nir Aides.
- Issue #6963: Added "maxtasksperchild" argument to multiprocessing.Pool, - Issue #6963: Added "maxtasksperchild" argument to multiprocessing.Pool,
allowing for a maximum number of tasks within the pool to be completed by allowing for a maximum number of tasks within the pool to be completed by
the worker before that worker is terminated, and a new one created to the worker before that worker is terminated, and a new one created to
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment