bpo-34010: Fix tarfile read performance regression (GH-8020)

During a buffered read, collect the chunks in a list and join them once at the end instead of repeatedly extending a bytes object. This is how it was done before the code was changed in commit b506dc32.

bpo-34010: Fix tarfile read performance regression (GH-8020)
During a buffered read, collect the chunks in a list and join them once at the end instead of repeatedly extending a bytes object. This is how it was done before the code was changed in commit b506dc32.
12a08c47 · hajoscher · INADA Naoki · 97ae32c9 · 12a08c47 · 12a08c47
Commit 12a08c47 authored Jul 04, 2018 by hajoscher Committed by INADA Naoki Jul 04, 2018
Hide whitespace changes
Inline Side-by-side

Showing with 13 additions and 9 deletions

Lib/tarfile.py Lib/tarfile.py +11 -9

Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst ...S.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst +2 -0

No files found.
--- a/Lib/tarfile.py
+++ b/Lib/tarfile.py
@@ -525,7 +525,7 @@ class _Stream:
                if not buf:
                    break
                t.append(buf)
-            buf = "".join(t)
+            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
@@ -538,6 +538,7 @@ class _Stream:
            return self.__read(size)

        c = len(self.dbuf)
+        t = [self.dbuf]
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
@@ -546,26 +547,27 @@ class _Stream:
                buf = self.cmp.decompress(buf)
            except self.exception:
                raise ReadError("invalid compressed data")
-            self.dbuf += buf
+            t.append(buf)
            c += len(buf)
-        buf = self.dbuf[:size]
-        self.dbuf = self.dbuf[size:]
-        return buf
+        t = b"".join(t)
+        self.dbuf = t[size:]
+        return t[:size]

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
+        t = [self.buf]
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
-            self.buf += buf
+            t.append(buf)
            c += len(buf)
-        buf = self.buf[:size]
-        self.buf = self.buf[size:]
-        return buf
+        t = b"".join(t)
+        self.buf = t[size:]
+        return t[:size]
 # class _Stream

 class _StreamProxy(object):

--- a/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst
+++ b/Misc/NEWS.d/next/Library/2018-07-04-07-36-53.bpo-34010.VNDkde.rst
+Fixed a performance regression for reading streams with tarfile. The
+buffered read should use a list, instead of appending to a bytes object.