Commit 9cbdd75e authored by Lars Gustäbel

Add read support for all missing variants of the GNU sparse extensions.

Thus, in addition to GNUTYPE_SPARSE headers, sparse
information in pax headers created by GNU tar can now be decoded.
All three formats 0.0, 0.1 and 1.0 are supported.
On filesystems that support this, holes in files are now restored
whenever a sparse member is extracted.
parent 3122ce3e
...@@ -20,7 +20,8 @@ Some facts and figures: ...@@ -20,7 +20,8 @@ Some facts and figures:
* read/write support for the POSIX.1-1988 (ustar) format. * read/write support for the POSIX.1-1988 (ustar) format.
* read/write support for the GNU tar format including *longname* and *longlink* * read/write support for the GNU tar format including *longname* and *longlink*
extensions, read-only support for the *sparse* extension. extensions, read-only support for all variants of the *sparse* extension
including restoration of sparse files.
* read/write support for the POSIX.1-2001 (pax) format. * read/write support for the POSIX.1-2001 (pax) format.
......
...@@ -701,13 +701,29 @@ class _FileInFile(object): ...@@ -701,13 +701,29 @@ class _FileInFile(object):
object. object.
""" """
def __init__(self, fileobj, offset, size, sparse=None): def __init__(self, fileobj, offset, size, blockinfo=None):
self.fileobj = fileobj self.fileobj = fileobj
self.offset = offset self.offset = offset
self.size = size self.size = size
self.sparse = sparse
self.position = 0 self.position = 0
if blockinfo is None:
blockinfo = [(0, size)]
# Construct a map with data and zero blocks.
self.map_index = 0
self.map = []
lastpos = 0
realpos = self.offset
for offset, size in blockinfo:
if offset > lastpos:
self.map.append((False, lastpos, offset, None))
self.map.append((True, offset, offset + size, realpos))
realpos += size
lastpos = offset + size
if lastpos < self.size:
self.map.append((False, lastpos, self.size, None))
def seekable(self): def seekable(self):
if not hasattr(self.fileobj, "seekable"): if not hasattr(self.fileobj, "seekable"):
# XXX gzip.GzipFile and bz2.BZ2File # XXX gzip.GzipFile and bz2.BZ2File
...@@ -732,48 +748,26 @@ class _FileInFile(object): ...@@ -732,48 +748,26 @@ class _FileInFile(object):
else: else:
size = min(size, self.size - self.position) size = min(size, self.size - self.position)
if self.sparse is None: buf = b""
return self.readnormal(size)
else:
return self.readsparse(size)
def readnormal(self, size):
"""Read operation for regular files.
"""
self.fileobj.seek(self.offset + self.position)
self.position += size
return self.fileobj.read(size)
def readsparse(self, size):
"""Read operation for sparse files.
"""
data = b""
while size > 0: while size > 0:
buf = self.readsparsesection(size) while True:
if not buf: data, start, stop, offset = self.map[self.map_index]
if start <= self.position < stop:
break break
size -= len(buf)
data += buf
return data
def readsparsesection(self, size):
"""Read a single section of a sparse file.
"""
section = self.sparse.find(self.position)
if section is None:
return b""
size = min(size, section.offset + section.size - self.position)
if isinstance(section, _data):
realpos = section.realpos + self.position - section.offset
self.fileobj.seek(self.offset + realpos)
self.position += size
return self.fileobj.read(size)
else: else:
self.position += size self.map_index += 1
return NUL * size if self.map_index == len(self.map):
self.map_index = 0
length = min(size, stop - self.position)
if data:
self.fileobj.seek(offset)
block = self.fileobj.read(stop - start)
buf += block[self.position - start:self.position + length]
else:
buf += NUL * length
size -= length
self.position += length
return buf
#class _FileInFile #class _FileInFile
...@@ -1367,28 +1361,15 @@ class TarInfo(object): ...@@ -1367,28 +1361,15 @@ class TarInfo(object):
numbytes = nti(buf[pos + 12:pos + 24]) numbytes = nti(buf[pos + 12:pos + 24])
except ValueError: except ValueError:
break break
if offset and numbytes:
structs.append((offset, numbytes)) structs.append((offset, numbytes))
pos += 24 pos += 24
isextended = bool(buf[504]) isextended = bool(buf[504])
self.sparse = structs
# Transform the sparse structures to something we can use
# in ExFileObject.
self.sparse = _ringbuffer()
lastpos = 0
realpos = 0
for offset, numbytes in structs:
if offset > lastpos:
self.sparse.append(_hole(lastpos, offset - lastpos))
self.sparse.append(_data(offset, numbytes, realpos))
realpos += numbytes
lastpos = offset + numbytes
if lastpos < origsize:
self.sparse.append(_hole(lastpos, origsize - lastpos))
self.offset_data = tarfile.fileobj.tell() self.offset_data = tarfile.fileobj.tell()
tarfile.offset = self.offset_data + self._block(self.size) tarfile.offset = self.offset_data + self._block(self.size)
self.size = origsize self.size = origsize
return self return self
def _proc_pax(self, tarfile): def _proc_pax(self, tarfile):
...@@ -1464,6 +1445,19 @@ class TarInfo(object): ...@@ -1464,6 +1445,19 @@ class TarInfo(object):
except HeaderError: except HeaderError:
raise SubsequentHeaderError("missing or bad subsequent header") raise SubsequentHeaderError("missing or bad subsequent header")
# Process GNU sparse information.
if "GNU.sparse.map" in pax_headers:
# GNU extended sparse format version 0.1.
self._proc_gnusparse_01(next, pax_headers)
elif "GNU.sparse.size" in pax_headers:
# GNU extended sparse format version 0.0.
self._proc_gnusparse_00(next, pax_headers, buf)
elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
# GNU extended sparse format version 1.0.
self._proc_gnusparse_10(next, pax_headers, tarfile)
if self.type in (XHDTYPE, SOLARIS_XHDTYPE): if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
# Patch the TarInfo object with the extended header info. # Patch the TarInfo object with the extended header info.
next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors) next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
...@@ -1480,23 +1474,58 @@ class TarInfo(object): ...@@ -1480,23 +1474,58 @@ class TarInfo(object):
return next return next
def _proc_gnusparse_00(self, next, pax_headers, buf):
    """Process a GNU tar extended sparse header, version 0.0.

    The bytes buffer `buf` is searched for ``GNU.sparse.offset`` and
    ``GNU.sparse.numbytes`` records.  The values found are paired up in
    order of appearance and stored as a list of ``(offset, numbytes)``
    tuples in ``next.sparse``.

    `pax_headers` is accepted but not consulted by this method.
    """
    # The dots are escaped so that only the literal keywords match;
    # an unescaped "." would accept any character in that position.
    offsets = [
        int(match.group(1))
        for match in re.finditer(br"\d+ GNU\.sparse\.offset=(\d+)\n", buf)
    ]
    numbytes = [
        int(match.group(1))
        for match in re.finditer(br"\d+ GNU\.sparse\.numbytes=(\d+)\n", buf)
    ]
    next.sparse = list(zip(offsets, numbytes))
def _proc_gnusparse_01(self, next, pax_headers):
    """Process a GNU tar extended sparse header, version 0.1.

    The value of the ``GNU.sparse.map`` pax record is a comma separated
    list of numbers that is read as alternating offset and size values.
    The resulting list of ``(offset, numbytes)`` tuples is stored in
    ``next.sparse``.

    Raises KeyError if `pax_headers` has no ``GNU.sparse.map`` entry and
    ValueError if one of the map fields is not a number.
    """
    sparse_map = pax_headers["GNU.sparse.map"]
    # "".split(",") yields [""], which int() rejects.  Treat an empty map
    # as "no data blocks" instead of failing.
    if sparse_map.strip():
        numbers = [int(x) for x in sparse_map.split(",")]
    else:
        numbers = []
    next.sparse = list(zip(numbers[::2], numbers[1::2]))
def _proc_gnusparse_10(self, next, pax_headers, tarfile):
    """Process a GNU tar extended sparse header, version 1.0.

    The sparse map is read from ``tarfile.fileobj`` in chunks of
    BLOCKSIZE bytes.  It consists of newline terminated decimal numbers:
    first the number of map entries, then for every entry two numbers
    that are read as offset and size.

    On return ``next.offset_data`` is set to the file position after the
    last chunk that was read and ``next.sparse`` holds the list of
    ``(offset, numbytes)`` tuples.  `pax_headers` is accepted but not
    consulted by this method.

    Raises ValueError if the map is truncated or contains a field that
    is not a number.
    """
    def read_number(buf):
        # Refill chunk-wise until a complete line is buffered, so that a
        # number is never split at a chunk boundary.
        while b"\n" not in buf:
            chunk = tarfile.fileobj.read(BLOCKSIZE)
            if not chunk:
                raise ValueError("truncated GNU sparse 1.0 map")
            buf += chunk
        number, buf = buf.split(b"\n", 1)
        return int(number), buf

    fields, buf = read_number(tarfile.fileobj.read(BLOCKSIZE))
    sparse = []
    # Every map entry contributes two numbers.
    while len(sparse) < fields * 2:
        number, buf = read_number(buf)
        sparse.append(number)
    next.offset_data = tarfile.fileobj.tell()
    next.sparse = list(zip(sparse[::2], sparse[1::2]))
def _apply_pax_info(self, pax_headers, encoding, errors): def _apply_pax_info(self, pax_headers, encoding, errors):
"""Replace fields with supplemental information from a previous """Replace fields with supplemental information from a previous
pax extended or global header. pax extended or global header.
""" """
for keyword, value in pax_headers.items(): for keyword, value in pax_headers.items():
if keyword not in PAX_FIELDS: if keyword == "GNU.sparse.name":
continue setattr(self, "path", value)
elif keyword == "GNU.sparse.size":
if keyword == "path": setattr(self, "size", int(value))
value = value.rstrip("/") elif keyword == "GNU.sparse.realsize":
setattr(self, "size", int(value))
elif keyword in PAX_FIELDS:
if keyword in PAX_NUMBER_FIELDS: if keyword in PAX_NUMBER_FIELDS:
try: try:
value = PAX_NUMBER_FIELDS[keyword](value) value = PAX_NUMBER_FIELDS[keyword](value)
except ValueError: except ValueError:
value = 0 value = 0
if keyword == "path":
value = value.rstrip("/")
setattr(self, keyword, value) setattr(self, keyword, value)
self.pax_headers = pax_headers.copy() self.pax_headers = pax_headers.copy()
...@@ -1535,7 +1564,7 @@ class TarInfo(object): ...@@ -1535,7 +1564,7 @@ class TarInfo(object):
def isfifo(self): def isfifo(self):
return self.type == FIFOTYPE return self.type == FIFOTYPE
def issparse(self): def issparse(self):
return self.type == GNUTYPE_SPARSE return self.sparse is not None
def isdev(self): def isdev(self):
return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE) return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
# class TarInfo # class TarInfo
...@@ -2255,10 +2284,17 @@ class TarFile(object): ...@@ -2255,10 +2284,17 @@ class TarFile(object):
def makefile(self, tarinfo, targetpath): def makefile(self, tarinfo, targetpath):
"""Make a file called targetpath. """Make a file called targetpath.
""" """
source = self.extractfile(tarinfo) source = self.fileobj
source.seek(tarinfo.offset_data)
target = bltn_open(targetpath, "wb") target = bltn_open(targetpath, "wb")
copyfileobj(source, target) if tarinfo.sparse is not None:
source.close() for offset, size in tarinfo.sparse:
target.seek(offset)
copyfileobj(source, target, size)
else:
copyfileobj(source, target, tarinfo.size)
target.seek(tarinfo.size)
target.truncate()
target.close() target.close()
def makeunknown(self, tarinfo, targetpath): def makeunknown(self, tarinfo, targetpath):
...@@ -2544,49 +2580,6 @@ class TarIter: ...@@ -2544,49 +2580,6 @@ class TarIter:
self.index += 1 self.index += 1
return tarinfo return tarinfo
# Helper classes for sparse file support
class _section:
    """Common base of _data and _hole: a range described by its start
       offset and its size.
    """
    def __init__(self, offset, size):
        self.offset, self.size = offset, size

    def __contains__(self, offset):
        # The range is half-open: its end position is not part of it.
        first = self.offset
        end = first + self.size
        return first <= offset and offset < end
class _data(_section):
    """A section of a sparse file that carries data. In addition to
       offset and size it records a `realpos` value.
    """
    def __init__(self, offset, size, realpos):
        self.realpos = realpos
        _section.__init__(self, offset, size)
class _hole(_section):
    """A section of a sparse file that is a hole. It adds nothing to
       _section and only serves to tell holes apart from data.
    """
class _ringbuffer(list):
    """List of sections that remembers the index of the last hit, so
       that a lookup starts where the previous one succeeded instead of
       at the beginning of the list.
    """
    def __init__(self):
        super().__init__()
        # Index of the item returned by the last successful find().
        self.idx = 0

    def find(self, offset):
        """Return the first item that contains `offset`, searching from
           the last hit and wrapping around at the end of the list.
           Return None if no item matches or the list is empty.
        """
        count = len(self)
        # range(0) is empty, so an empty ring yields None instead of
        # raising IndexError.
        for step in range(count):
            idx = (self.idx + step) % count
            item = self[idx]
            if offset in item:
                self.idx = idx
                return item
        # End of File
        return None
#-------------------- #--------------------
# exported functions # exported functions
#-------------------- #--------------------
......
...@@ -526,6 +526,22 @@ class MemberReadTest(ReadTest): ...@@ -526,6 +526,22 @@ class MemberReadTest(ReadTest):
tarinfo = self.tar.getmember("ustar/sparse") tarinfo = self.tar.getmember("ustar/sparse")
self._test_member(tarinfo, size=86016, chksum=md5_sparse) self._test_member(tarinfo, size=86016, chksum=md5_sparse)
def test_find_gnusparse(self):
    """Look up "gnu/sparse" and hand it to _test_member together with
       the expected size and the md5_sparse checksum.
    """
    member = self.tar.getmember("gnu/sparse")
    self._test_member(member, size=86016, chksum=md5_sparse)
def test_find_gnusparse_00(self):
    """Look up "gnu/sparse-0.0" and hand it to _test_member together
       with the expected size and the md5_sparse checksum.
    """
    member = self.tar.getmember("gnu/sparse-0.0")
    self._test_member(member, size=86016, chksum=md5_sparse)
def test_find_gnusparse_01(self):
    """Look up "gnu/sparse-0.1" and hand it to _test_member together
       with the expected size and the md5_sparse checksum.
    """
    member = self.tar.getmember("gnu/sparse-0.1")
    self._test_member(member, size=86016, chksum=md5_sparse)
def test_find_gnusparse_10(self):
    """Look up "gnu/sparse-1.0" and hand it to _test_member together
       with the expected size and the md5_sparse checksum.
    """
    member = self.tar.getmember("gnu/sparse-1.0")
    self._test_member(member, size=86016, chksum=md5_sparse)
def test_find_umlauts(self): def test_find_umlauts(self):
tarinfo = self.tar.getmember("ustar/umlauts-\xc4\xd6\xdc\xe4\xf6\xfc\xdf") tarinfo = self.tar.getmember("ustar/umlauts-\xc4\xd6\xdc\xe4\xf6\xfc\xdf")
self._test_member(tarinfo, size=7011, chksum=md5_regtype) self._test_member(tarinfo, size=7011, chksum=md5_regtype)
...@@ -589,13 +605,53 @@ class GNUReadTest(LongnameTest): ...@@ -589,13 +605,53 @@ class GNUReadTest(LongnameTest):
subdir = "gnu" subdir = "gnu"
longnametype = tarfile.GNUTYPE_LONGNAME longnametype = tarfile.GNUTYPE_LONGNAME
def test_sparse_file(self): # Since 3.2 tarfile is supposed to accurately restore sparse members and
tarinfo1 = self.tar.getmember("ustar/sparse") # produce files with holes. This is what we actually want to test here.
fobj1 = self.tar.extractfile(tarinfo1) # Unfortunately, not all platforms/filesystems support sparse files, and
tarinfo2 = self.tar.getmember("gnu/sparse") # even on platforms that do it is non-trivial to make reliable assertions
fobj2 = self.tar.extractfile(tarinfo2) # about holes in files. Therefore, we first do one basic test which works
self.assertEqual(fobj1.read(), fobj2.read(), # on all platforms, and after that a test that will work only on
"sparse file extraction failed") # platforms/filesystems that prove to support sparse files.
def _test_sparse_file(self, name):
    """Extract member `name` below TEMPDIR and compare the md5sum of the
       extracted file with md5_sparse. If _fs_supports_holes() is true,
       also require that the file occupies fewer 512 byte blocks than
       its size would need.
    """
    self.tar.extract(name, TEMPDIR)
    path = os.path.join(TEMPDIR, name)
    with open(path, "rb") as extracted:
        digest = md5sum(extracted.read())
    self.assertEqual(digest, md5_sparse, "wrong md5sum for %s" % name)

    if not self._fs_supports_holes():
        return
    info = os.stat(path)
    self.assertTrue(info.st_blocks * 512 < info.st_size)
def test_sparse_file_old(self):
self._test_sparse_file("gnu/sparse")
def test_sparse_file_00(self):
self._test_sparse_file("gnu/sparse-0.0")
def test_sparse_file_01(self):
self._test_sparse_file("gnu/sparse-0.1")
def test_sparse_file_10(self):
self._test_sparse_file("gnu/sparse-1.0")
@staticmethod
def _fs_supports_holes():
    # Return True if the platform knows the st_blocks stat attribute and
    # uses st_blocks units of 512 bytes, and if the filesystem is able to
    # store holes in files.
    #
    # The probe creates a 4096 byte file below TEMPDIR that consists of
    # nothing but a hole and checks that no blocks were allocated for it.
    #
    # sys.platform is "linux2" only on old Python versions; since
    # Python 3.3 it is plain "linux", so test for the common prefix.
    if not sys.platform.startswith("linux"):
        return False

    # Linux evidently has 512 byte st_blocks units.
    name = os.path.join(TEMPDIR, "sparse-test")
    with open(name, "wb") as fobj:
        fobj.seek(4096)
        fobj.truncate()
    try:
        s = os.stat(name)
    finally:
        # Do not leave the probe file behind, even if stat() fails.
        os.remove(name)
    return s.st_blocks == 0
class PaxReadTest(LongnameTest): class PaxReadTest(LongnameTest):
......
This diff was suppressed by a .gitattributes entry.
...@@ -54,6 +54,9 @@ Core and Builtins ...@@ -54,6 +54,9 @@ Core and Builtins
Library Library
------- -------
- tarfile.py: Add support for all missing variants of the GNU sparse
extensions and create files with holes when extracting sparse members.
- Issue #10218: Return timeout status from ``Condition.wait`` in threading. - Issue #10218: Return timeout status from ``Condition.wait`` in threading.
- Issue #7351: Add ``zipfile.BadZipFile`` spelling of the exception name - Issue #7351: Add ``zipfile.BadZipFile`` spelling of the exception name
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment