#!/usr/bin/env python3
#-------------------------------------------------------------------
# tarfile.py
#-------------------------------------------------------------------
# Copyright (C) 2002 Lars Gustaebel <lars@gustaebel.de>
# All rights reserved.
#
# Permission  is  hereby granted,  free  of charge,  to  any person
# obtaining a  copy of  this software  and associated documentation
# files  (the  "Software"),  to   deal  in  the  Software   without
# restriction,  including  without limitation  the  rights to  use,
# copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies  of  the  Software,  and to  permit  persons  to  whom the
# Software  is  furnished  to  do  so,  subject  to  the  following
# conditions:
#
# The above copyright  notice and this  permission notice shall  be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS  IS", WITHOUT WARRANTY OF ANY  KIND,
# EXPRESS OR IMPLIED, INCLUDING  BUT NOT LIMITED TO  THE WARRANTIES
# OF  MERCHANTABILITY,  FITNESS   FOR  A  PARTICULAR   PURPOSE  AND
# NONINFRINGEMENT.  IN  NO  EVENT SHALL  THE  AUTHORS  OR COPYRIGHT
# HOLDERS  BE LIABLE  FOR ANY  CLAIM, DAMAGES  OR OTHER  LIABILITY,
# WHETHER  IN AN  ACTION OF  CONTRACT, TORT  OR OTHERWISE,  ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
# OTHER DEALINGS IN THE SOFTWARE.
#
"""Read from and write to tar format archives.
"""

# Module metadata (kept verbatim from the original distribution).
version     = "0.9.0"
__author__  = "Lars Gust\u00e4bel (lars@gustaebel.de)"
__date__    = "$Date: 2011-02-25 17:42:01 +0200 (Fri, 25 Feb 2011) $"
__cvsid__   = "$Id: tarfile.py 88586 2011-02-25 15:42:01Z marc-andre.lemburg $"
__credits__ = "Gustavo Niemeyer, Niels Gust\u00e4bel, Richard Townsend."

#---------
# Imports
#---------
import sys
import os
import io
import shutil
import stat
import time
import struct
import copy
import re

# grp/pwd provide owner-name lookups on POSIX; they do not exist on
# Windows, where we simply fall back to numeric ids.
try:
    import grp
    import pwd
except ImportError:
    grp = pwd = None

# os.symlink on Windows prior to 6.0 raises NotImplementedError
symlink_exception = (AttributeError, NotImplementedError)
try:
    # WindowsError (1314) will be raised if the caller does not hold the
    # SeCreateSymbolicLinkPrivilege privilege
    symlink_exception += (WindowsError,)
except NameError:
    # Not on Windows: WindowsError is not defined.
    pass

# from tarfile import *
__all__ = ["TarFile", "TarInfo", "is_tarfile", "TarError"]

68
from builtins import open as _open # Since 'open' is TarFile.open
69

#---------------------------------------------------------
# tar constants
#---------------------------------------------------------
NUL = b"\0"                     # the null character
BLOCKSIZE = 512                 # length of processing blocks
RECORDSIZE = BLOCKSIZE * 20     # length of records
GNU_MAGIC = b"ustar  \0"        # magic gnu tar string
POSIX_MAGIC = b"ustar\x0000"    # magic posix tar string

LENGTH_NAME = 100               # maximum length of a filename
LENGTH_LINK = 100               # maximum length of a linkname
LENGTH_PREFIX = 155             # maximum length of the prefix field

REGTYPE = b"0"                  # regular file
AREGTYPE = b"\0"                # regular file
LNKTYPE = b"1"                  # link (inside tarfile)
SYMTYPE = b"2"                  # symbolic link
CHRTYPE = b"3"                  # character special device
BLKTYPE = b"4"                  # block special device
DIRTYPE = b"5"                  # directory
FIFOTYPE = b"6"                 # fifo special device
CONTTYPE = b"7"                 # contiguous file

GNUTYPE_LONGNAME = b"L"         # GNU tar longname
GNUTYPE_LONGLINK = b"K"         # GNU tar longlink
GNUTYPE_SPARSE = b"S"           # GNU tar sparse file

XHDTYPE = b"x"                  # POSIX.1-2001 extended header
XGLTYPE = b"g"                  # POSIX.1-2001 global header
SOLARIS_XHDTYPE = b"X"          # Solaris extended header

USTAR_FORMAT = 0                # POSIX.1-1988 (ustar) format
GNU_FORMAT = 1                  # GNU tar format
PAX_FORMAT = 2                  # POSIX.1-2001 (pax) format
DEFAULT_FORMAT = GNU_FORMAT

#---------------------------------------------------------
# tarfile constants
#---------------------------------------------------------
# File types that tarfile supports:
SUPPORTED_TYPES = (REGTYPE, AREGTYPE, LNKTYPE,
                   SYMTYPE, DIRTYPE, FIFOTYPE,
                   CONTTYPE, CHRTYPE, BLKTYPE,
                   GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
                   GNUTYPE_SPARSE)

# File types that will be treated as a regular file.
REGULAR_TYPES = (REGTYPE, AREGTYPE,
                 CONTTYPE, GNUTYPE_SPARSE)

# File types that are part of the GNU tar format.
GNU_TYPES = (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK,
             GNUTYPE_SPARSE)

# Fields from a pax header that override a TarInfo attribute.
PAX_FIELDS = ("path", "linkpath", "size", "mtime",
              "uid", "gid", "uname", "gname")

# Fields from a pax header that are affected by hdrcharset.
PAX_NAME_FIELDS = {"path", "linkpath", "uname", "gname"}

# Fields in a pax header that are numbers, all other fields
# are treated as strings.
PAX_NUMBER_FIELDS = {
    "atime": float,
    "ctime": float,
    "mtime": float,
    "uid": int,
    "gid": int,
    "size": int
}

#---------------------------------------------------------
# Bits used in the mode field, values in octal.
#---------------------------------------------------------
S_IFLNK = 0o120000        # symbolic link
S_IFREG = 0o100000        # regular file
S_IFBLK = 0o060000        # block device
S_IFDIR = 0o040000        # directory
S_IFCHR = 0o020000        # character device
S_IFIFO = 0o010000        # fifo

TSUID   = 0o4000          # set UID on execution
TSGID   = 0o2000          # set GID on execution
TSVTX   = 0o1000          # reserved

TUREAD  = 0o400           # read by owner
TUWRITE = 0o200           # write by owner
TUEXEC  = 0o100           # execute/search by owner
TGREAD  = 0o040           # read by group
TGWRITE = 0o020           # write by group
TGEXEC  = 0o010           # execute/search by group
TOREAD  = 0o004           # read by other
TOWRITE = 0o002           # write by other
TOEXEC  = 0o001           # execute/search by other

#---------------------------------------------------------
# initialization
#---------------------------------------------------------
# On Windows archives are written with UTF-8 names; elsewhere the
# filesystem encoding is used.
if os.name in ("nt", "ce"):
    ENCODING = "utf-8"
else:
    ENCODING = sys.getfilesystemencoding()

#---------------------------------------------------------
# Some useful functions
#---------------------------------------------------------

def stn(s, length, encoding, errors):
    """Encode string s into a NUL-padded bytes field of exactly
       length bytes (truncating when the encoded form is too long).
    """
    encoded = s.encode(encoding, errors)
    return encoded[:length].ljust(length, b"\0")
def nts(s, encoding, errors):
    """Convert a null-terminated bytes object to a string.
    """
    # Take everything up to the first NUL byte (the whole field if
    # there is no terminator at all) and decode it.
    return s.split(b"\0", 1)[0].decode(encoding, errors)
def nti(s):
    """Convert a tar number field to a Python number.

       Two encodings exist (see itn() below): the GNU base-256 form,
       flagged by a leading 0o200 (positive) or 0o377 (negative) byte,
       and the plain octal ASCII form.
    """
    if s[0] in (0o200, 0o377):
        # GNU binary form: the remaining bytes are a big-endian
        # base-256 number.
        n = int.from_bytes(s[1:], "big")
        if s[0] == 0o377:
            # A 0o377 marker means the value is negative.
            n -= 256 ** (len(s) - 1)
        return n
    try:
        return int(nts(s, "ascii", "strict") or "0", 8)
    except ValueError:
        raise InvalidHeaderError("invalid header")

def itn(n, digits=8, format=DEFAULT_FORMAT):
    """Convert a python number to a number field.
    """
    # POSIX 1003.1-1988 requires numbers to be encoded as a string of
    # octal digits followed by a null-byte, this allows values up to
    # (8**(digits-1))-1. GNU tar allows storing numbers greater than
    # that if necessary. A leading 0o200 or 0o377 byte indicate this
    # particular encoding, the following digits-1 bytes are a big-endian
    # base-256 representation. This allows values up to (256**(digits-1))-1.
    # A 0o200 byte indicates a positive number, a 0o377 byte a negative
    # number.
    if 0 <= n < 8 ** (digits - 1):
        # Plain octal ASCII, NUL-terminated.
        return bytes("%0*o" % (digits - 1, n), "ascii") + NUL

    if format == GNU_FORMAT and -256 ** (digits - 1) <= n < 256 ** (digits - 1):
        marker = 0o200 if n >= 0 else 0o377
        field = bytearray([marker])
        # Store the low digits-1 bytes big-endian; for negative n this
        # yields the same two's-complement bytes the octal loop would.
        field += (n % 256 ** (digits - 1)).to_bytes(digits - 1, "big")
        return field

    raise ValueError("overflow in number field")

def calc_chksums(buf):
    """Calculate the checksum for a member's header by summing up all
       characters except for the chksum field which is treated as if
       it was filled with spaces. According to the GNU tar sources,
       some tars (Sun and NeXT) calculate chksum with signed char,
       which will be different if there are chars in the buffer with
       the high bit set. So we calculate two checksums, unsigned and
       signed.
    """
    # The chksum field occupies bytes 148-155 and counts as 8 spaces:
    # 8 * 0x20 == 256, hence the constant below.
    unsigned_chksum, signed_chksum = (
        256 + sum(struct.unpack_from(fmt, buf))
        for fmt in ("148B8x356B", "148b8x356b")
    )
    return unsigned_chksum, signed_chksum

def copyfileobj(src, dst, length=None):
    """Copy length bytes from fileobj src to fileobj dst.
       If length is None, copy the entire content.

       Raises IOError when src is exhausted before length bytes
       could be copied.
    """
    if length == 0:
        return
    if length is None:
        shutil.copyfileobj(src, dst)
        return

    bufsize = 16 * 1024
    full_blocks, leftover = divmod(length, bufsize)
    # Copy whole 16 KiB buffers first, then the trailing partial one.
    chunk_sizes = [bufsize] * full_blocks
    if leftover:
        chunk_sizes.append(leftover)
    for want in chunk_sizes:
        chunk = src.read(want)
        if len(chunk) < want:
            raise IOError("end of file reached")
        dst.write(chunk)

def filemode(mode):
    """Deprecated in this location; use stat.filemode."""
    # Kept only for backward compatibility with old callers.
    import warnings
    warnings.warn("deprecated in favor of stat.filemode", DeprecationWarning, 2)
    return stat.filemode(mode)


class TarError(Exception):
    """Base exception."""

class ExtractError(TarError):
    """General exception for extract errors."""

class ReadError(TarError):
    """Exception for unreadable tar archives."""

class CompressionError(TarError):
    """Exception for unavailable compression methods."""

class StreamError(TarError):
    """Exception for unsupported operations on stream-like TarFiles."""

class HeaderError(TarError):
    """Base exception for header errors."""

class EmptyHeaderError(HeaderError):
    """Exception for empty headers."""

class TruncatedHeaderError(HeaderError):
    """Exception for truncated headers."""

class EOFHeaderError(HeaderError):
    """Exception for end of file headers."""

class InvalidHeaderError(HeaderError):
    """Exception for invalid headers."""

class SubsequentHeaderError(HeaderError):
    """Exception for missing and invalid extended headers."""

#---------------------------
# internal stream interface
#---------------------------
class _LowLevelFile:
    """Low-level file object. Supports reading and writing.
       It is used instead of a regular file object for streaming
       access.
    """

    def __init__(self, name, mode):
        # Translate the one-letter mode into os.open() flags; an
        # unknown mode raises KeyError, as before.
        flags = {
            "r": os.O_RDONLY,
            "w": os.O_WRONLY | os.O_CREAT | os.O_TRUNC,
        }[mode]
        # O_BINARY exists only on Windows; elsewhere it contributes 0.
        flags |= getattr(os, "O_BINARY", 0)
        self.fd = os.open(name, flags, 0o666)

    def close(self):
        os.close(self.fd)

    def read(self, size):
        return os.read(self.fd, size)

    def write(self, s):
        os.write(self.fd, s)

class _Stream:
    """Class that serves as an adapter between TarFile and
       a stream-like object.  The stream-like object only
       needs to have a read() or write() method and is accessed
       blockwise.  Use of gzip or bzip2 compression is possible.
       A stream-like object could be for example: sys.stdin,
       sys.stdout, a socket, a tape device etc.

       _Stream is intended to be used only internally.
    """

    def __init__(self, name, mode, comptype, fileobj, bufsize):
        """Construct a _Stream object.

           name: archive name (may be empty), mode: "r" or "w",
           comptype: "tar", "gz", "bz2", "xz" or "*" (auto-detect),
           fileobj: optional underlying file object, bufsize: block size.
        """
        self._extfileobj = True
        if fileobj is None:
            fileobj = _LowLevelFile(name, mode)
            self._extfileobj = False

        if comptype == '*':
            # Enable transparent compression detection for the
            # stream interface
            fileobj = _StreamProxy(fileobj)
            comptype = fileobj.getcomptype()

        self.name     = name or ""
        self.mode     = mode
        self.comptype = comptype
        self.fileobj  = fileobj
        self.bufsize  = bufsize
        self.buf      = b""     # raw (compressed) buffer
        self.pos      = 0       # logical (uncompressed) position
        self.closed   = False

        try:
            if comptype == "gz":
                try:
                    import zlib
                except ImportError:
                    raise CompressionError("zlib module is not available")
                self.zlib = zlib
                self.crc = zlib.crc32(b"")
                if mode == "r":
                    self._init_read_gz()
                    self.exception = zlib.error
                else:
                    self._init_write_gz()

            elif comptype == "bz2":
                try:
                    import bz2
                except ImportError:
                    raise CompressionError("bz2 module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = bz2.BZ2Decompressor()
                    self.exception = IOError
                else:
                    self.cmp = bz2.BZ2Compressor()

            elif comptype == "xz":
                try:
                    import lzma
                except ImportError:
                    raise CompressionError("lzma module is not available")
                if mode == "r":
                    self.dbuf = b""
                    self.cmp = lzma.LZMADecompressor()
                    self.exception = lzma.LZMAError
                else:
                    self.cmp = lzma.LZMACompressor()

            elif comptype != "tar":
                raise CompressionError("unknown compression type %r" % comptype)

        except:
            # Do not leak an fd we opened ourselves.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

    def __del__(self):
        if hasattr(self, "closed") and not self.closed:
            self.close()

    def _init_write_gz(self):
        """Initialize for writing with gzip compression.
        """
        self.cmp = self.zlib.compressobj(9, self.zlib.DEFLATED,
                                            -self.zlib.MAX_WBITS,
                                            self.zlib.DEF_MEM_LEVEL,
                                            0)
        timestamp = struct.pack("<L", int(time.time()))
        # Raw gzip header: magic, deflate, FNAME flag set, mtime, xfl, OS.
        self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
        if self.name.endswith(".gz"):
            self.name = self.name[:-3]
        # RFC1952 says we must use ISO-8859-1 for the FNAME field.
        self.__write(self.name.encode("iso-8859-1", "replace") + NUL)

    def write(self, s):
        """Write string s to the stream.
        """
        if self.comptype == "gz":
            self.crc = self.zlib.crc32(s, self.crc)
        self.pos += len(s)
        if self.comptype != "tar":
            s = self.cmp.compress(s)
        self.__write(s)

    def __write(self, s):
        """Write string s to the stream if a whole new block
           is ready to be written.
        """
        self.buf += s
        while len(self.buf) > self.bufsize:
            self.fileobj.write(self.buf[:self.bufsize])
            self.buf = self.buf[self.bufsize:]

    def close(self):
        """Close the _Stream object. No operation should be
           done on it afterwards.
        """
        if self.closed:
            return

        if self.mode == "w" and self.comptype != "tar":
            self.buf += self.cmp.flush()

        if self.mode == "w" and self.buf:
            self.fileobj.write(self.buf)
            self.buf = b""
            if self.comptype == "gz":
                # The native zlib crc is an unsigned 32-bit integer, but
                # the Python wrapper implicitly casts that to a signed C
                # long.  So, on a 32-bit box self.crc may "look negative",
                # while the same crc on a 64-bit box may "look positive".
                # To avoid irksome warnings from the `struct` module, force
                # it to look positive on all boxes.
                self.fileobj.write(struct.pack("<L", self.crc & 0xffffffff))
                self.fileobj.write(struct.pack("<L", self.pos & 0xffffFFFF))

        if not self._extfileobj:
            self.fileobj.close()

        self.closed = True

    def _init_read_gz(self):
        """Initialize for reading a gzip compressed fileobj.
        """
        self.cmp = self.zlib.decompressobj(-self.zlib.MAX_WBITS)
        self.dbuf = b""

        # taken from gzip.GzipFile with some alterations
        if self.__read(2) != b"\037\213":
            raise ReadError("not a gzip file")
        if self.__read(1) != b"\010":
            raise CompressionError("unsupported compression method")

        flag = ord(self.__read(1))
        self.__read(6)

        # Skip the optional FEXTRA, FNAME, FCOMMENT and FHCRC fields.
        if flag & 4:
            xlen = ord(self.__read(1)) + 256 * ord(self.__read(1))
            self.read(xlen)
        if flag & 8:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 16:
            while True:
                s = self.__read(1)
                if not s or s == NUL:
                    break
        if flag & 2:
            self.__read(2)

    def tell(self):
        """Return the stream's file pointer position.
        """
        return self.pos

    def seek(self, pos=0):
        """Set the stream's file pointer to pos. Negative seeking
           is forbidden.
        """
        if pos - self.pos >= 0:
            blocks, remainder = divmod(pos - self.pos, self.bufsize)
            for i in range(blocks):
                self.read(self.bufsize)
            self.read(remainder)
        else:
            raise StreamError("seeking backwards is not allowed")
        return self.pos

    def read(self, size=None):
        """Return the next size number of bytes from the stream.
           If size is not defined, return all bytes of the stream
           up to EOF.
        """
        if size is None:
            t = []
            while True:
                buf = self._read(self.bufsize)
                if not buf:
                    break
                t.append(buf)
            # BUGFIX: the chunks are bytes, so they must be joined with a
            # bytes separator -- "".join(t) raised TypeError here.
            buf = b"".join(t)
        else:
            buf = self._read(size)
        self.pos += len(buf)
        return buf

    def _read(self, size):
        """Return size bytes from the stream.
        """
        if self.comptype == "tar":
            return self.__read(size)

        c = len(self.dbuf)
        while c < size:
            buf = self.__read(self.bufsize)
            if not buf:
                break
            try:
                buf = self.cmp.decompress(buf)
            except self.exception:
                raise ReadError("invalid compressed data")
            self.dbuf += buf
            c += len(buf)
        buf = self.dbuf[:size]
        self.dbuf = self.dbuf[size:]
        return buf

    def __read(self, size):
        """Return size bytes from stream. If internal buffer is empty,
           read another block from the stream.
        """
        c = len(self.buf)
        while c < size:
            buf = self.fileobj.read(self.bufsize)
            if not buf:
                break
            self.buf += buf
            c += len(buf)
        buf = self.buf[:size]
        self.buf = self.buf[size:]
        return buf
# class _Stream

class _StreamProxy(object):
    """Small proxy class that enables transparent compression
       detection for the Stream interface (mode 'r|*').
    """

    def __init__(self, fileobj):
        self.fileobj = fileobj
        # Buffer the first block so its magic bytes can be inspected.
        self.buf = self.fileobj.read(BLOCKSIZE)

    def read(self, size):
        # The buffered block is handed out exactly once; afterwards all
        # reads are delegated straight to the underlying file object.
        self.read = self.fileobj.read
        return self.buf

    def getcomptype(self):
        """Guess the compression type from the buffered first block."""
        head = self.buf
        if head.startswith(b"\x1f\x8b\x08"):
            return "gz"
        if head[0:3] == b"BZh" and head[4:10] == b"1AY&SY":
            return "bz2"
        if head.startswith((b"\x5d\x00\x00\x80", b"\xfd7zXZ")):
            return "xz"
        return "tar"

    def close(self):
        self.fileobj.close()
# class StreamProxy

#------------------------
# Extraction file object
#------------------------
class _FileInFile(object):
    """A thin wrapper around an existing file object that
       provides a part of its data as an individual file
       object.
    """

    def __init__(self, fileobj, offset, size, blockinfo=None):
        # fileobj: underlying seekable file; offset: absolute position
        # of this member's data; size: logical size of the member;
        # blockinfo: list of (offset, size) data blocks for sparse
        # members, or None for an ordinary contiguous member.
        self.fileobj = fileobj
        self.offset = offset
        self.size = size
        self.position = 0       # logical read position within the member
        self.name = getattr(fileobj, "name", None)
        self.closed = False

        if blockinfo is None:
            # Non-sparse member: a single contiguous data block.
            blockinfo = [(0, size)]

        # Construct a map with data and zero blocks.
        # Each entry is (is_data, logical_start, logical_stop, real_offset);
        # real_offset is None for the zero-filled holes of sparse members.
        self.map_index = 0
        self.map = []
        lastpos = 0
        realpos = self.offset
        for offset, size in blockinfo:
            if offset > lastpos:
                self.map.append((False, lastpos, offset, None))
            self.map.append((True, offset, offset + size, realpos))
            realpos += size
            lastpos = offset + size
        if lastpos < self.size:
            self.map.append((False, lastpos, self.size, None))

    def flush(self):
        pass

    def readable(self):
        return True

    def writable(self):
        return False

    def seekable(self):
        return self.fileobj.seekable()

    def tell(self):
        """Return the current file position.
        """
        return self.position

    def seek(self, position, whence=io.SEEK_SET):
        """Seek to a position in the file.

           The position is clamped to [0, size]; an unknown whence
           raises ValueError.
        """
        if whence == io.SEEK_SET:
            self.position = min(max(position, 0), self.size)
        elif whence == io.SEEK_CUR:
            if position < 0:
                self.position = max(self.position + position, 0)
            else:
                self.position = min(self.position + position, self.size)
        elif whence == io.SEEK_END:
            self.position = max(min(self.size + position, self.size), 0)
        else:
            raise ValueError("Invalid argument")
        return self.position

    def read(self, size=None):
        """Read data from the file.

           Reads at most size bytes (the rest of the member when size
           is None), assembling data blocks from the underlying file
           and NUL bytes for sparse holes.
        """
        if size is None:
            size = self.size - self.position
        else:
            size = min(size, self.size - self.position)

        buf = b""
        while size > 0:
            # Find the map entry covering the current position; the
            # index wraps around, so seeks backwards work as well.
            while True:
                data, start, stop, offset = self.map[self.map_index]
                if start <= self.position < stop:
                    break
                else:
                    self.map_index += 1
                    if self.map_index == len(self.map):
                        self.map_index = 0
            length = min(size, stop - self.position)
            if data:
                # Data block: read from the underlying file object.
                self.fileobj.seek(offset + (self.position - start))
                buf += self.fileobj.read(length)
            else:
                # Hole in a sparse member: synthesize NUL bytes.
                buf += NUL * length
            size -= length
            self.position += length
        return buf

    def readinto(self, b):
        # Fill the writable buffer b and return the number of bytes read.
        buf = self.read(len(b))
        b[:len(buf)] = buf
        return len(buf)

    def close(self):
        # Only marks the wrapper closed; the underlying file is shared
        # with the TarFile and must stay open.
        self.closed = True
#class _FileInFile

class ExFileObject(io.BufferedReader):
    """Buffered, read-only file object for an archive member.

       Wraps a _FileInFile view of the member's data region (including
       its sparse map, if any) in an io.BufferedReader.
    """

    def __init__(self, tarfile, tarinfo):
        fileobj = _FileInFile(tarfile.fileobj, tarinfo.offset_data,
                tarinfo.size, tarinfo.sparse)
        super().__init__(fileobj)
#class ExFileObject

#------------------
# Exported Classes
#------------------
class TarInfo(object):
    """Informational class which holds the details about an
       archive member given by a tar header block.
       TarInfo objects are returned by TarFile.getmember(),
       TarFile.getmembers() and TarFile.gettarinfo() and are
       usually created internally.
    """

    # Restrict instances to the known header fields plus internal
    # bookkeeping slots; saves memory for archives with many members.
    __slots__ = ("name", "mode", "uid", "gid", "size", "mtime",
                 "chksum", "type", "linkname", "uname", "gname",
                 "devmajor", "devminor",
                 "offset", "offset_data", "pax_headers", "sparse",
                 "tarfile", "_sparse_structs", "_link_target")

    def __init__(self, name=""):
        """Construct a TarInfo object. name is the optional name
           of the member.
        """
        self.name = name        # member name
        self.mode = 0o644       # file permissions
        self.uid = 0            # user id
        self.gid = 0            # group id
        self.size = 0           # file size
        self.mtime = 0          # modification time
        self.chksum = 0         # header checksum
        self.type = REGTYPE     # member type
        self.linkname = ""      # link name
        self.uname = ""         # user name
        self.gname = ""         # group name
        self.devmajor = 0       # device major number
        self.devminor = 0       # device minor number

        self.offset = 0         # the tar header starts here
        self.offset_data = 0    # the file's data starts here

        self.sparse = None      # sparse member information
        self.pax_headers = {}   # pax header information

    # In pax headers the "name" and "linkname" field are called
    # "path" and "linkpath".
    def _getpath(self):
        return self.name
    def _setpath(self, name):
        self.name = name
    # `path' is the pax spelling of `name'.
    path = property(_getpath, _setpath)

    def _getlinkpath(self):
        return self.linkname
    def _setlinkpath(self, linkname):
        self.linkname = linkname
    # `linkpath' is the pax spelling of `linkname'.
    linkpath = property(_getlinkpath, _setlinkpath)

790 791 792
    def __repr__(self):
        """Unambiguous representation: class name, member name and id."""
        return "<{} {!r} at {:#x}>".format(
            self.__class__.__name__, self.name, id(self))

793
    def get_info(self):
        """Return the TarInfo's attributes as a dictionary.
        """
        info = dict(
            name=self.name,
            mode=self.mode & 0o7777,
            uid=self.uid,
            gid=self.gid,
            size=self.size,
            mtime=self.mtime,
            chksum=self.chksum,
            type=self.type,
            linkname=self.linkname,
            uname=self.uname,
            gname=self.gname,
            devmajor=self.devmajor,
            devminor=self.devminor,
        )

        # Directory entries conventionally carry a trailing slash.
        if info["type"] == DIRTYPE and not info["name"].endswith("/"):
            info["name"] += "/"

        return info

817
    def tobuf(self, format=DEFAULT_FORMAT, encoding=ENCODING, errors="surrogateescape"):
        """Return a tar header as a string of 512 byte blocks.
        """
        info = self.get_info()

        # Dispatch on the requested archive format.
        if format == USTAR_FORMAT:
            return self.create_ustar_header(info, encoding, errors)
        if format == GNU_FORMAT:
            return self.create_gnu_header(info, encoding, errors)
        if format == PAX_FORMAT:
            return self.create_pax_header(info, encoding)
        raise ValueError("invalid format")

831
    def create_ustar_header(self, info, encoding, errors):
        """Return the object as a ustar header block.
        """
        info["magic"] = POSIX_MAGIC

        # ustar cannot represent overlong link names at all; an overlong
        # member name may be split into a prefix/name pair.
        if len(info["linkname"]) > LENGTH_LINK:
            raise ValueError("linkname is too long")
        if len(info["name"]) > LENGTH_NAME:
            info["prefix"], info["name"] = self._posix_split_name(info["name"])

        return self._create_header(info, USTAR_FORMAT, encoding, errors)

844
    def create_gnu_header(self, info, encoding, errors):
        """Return the object as a GNU header block sequence.
        """
        info["magic"] = GNU_MAGIC

        # Overlong names are stored as extra LONGLINK/LONGNAME pseudo
        # members that precede the real header block.
        buf = b""
        if len(info["linkname"]) > LENGTH_LINK:
            buf += self._create_gnu_long_header(
                info["linkname"], GNUTYPE_LONGLINK, encoding, errors)
        if len(info["name"]) > LENGTH_NAME:
            buf += self._create_gnu_long_header(
                info["name"], GNUTYPE_LONGNAME, encoding, errors)

        return buf + self._create_header(info, GNU_FORMAT, encoding, errors)

858
    def create_pax_header(self, info, encoding):
        """Return the object as a ustar header block. If it cannot be
           represented this way, prepend a pax extended header sequence
           with supplement information.
        """
        info["magic"] = POSIX_MAGIC
        pax_headers = self.pax_headers.copy()

        # Test string fields for values that exceed the field length or cannot
        # be represented in ASCII encoding.
        for name, hname, length in (
                ("name", "path", LENGTH_NAME), ("linkname", "linkpath", LENGTH_LINK),
                ("uname", "uname", 32), ("gname", "gname", 32)):

            if hname in pax_headers:
                # The pax header has priority.
                continue

            # Try to encode the string as ASCII; if that fails it must be
            # carried in a pax record instead of the ustar field.
            try:
                info[name].encode("ascii", "strict")
            except UnicodeEncodeError:
                pax_headers[hname] = info[name]
                continue

            # ASCII but too long for the fixed-size ustar field.
            if len(info[name]) > length:
                pax_headers[hname] = info[name]

        # Test number fields for values that exceed the field limit or values
        # that like to be stored as float.
        for name, digits in (("uid", 8), ("gid", 8), ("size", 12), ("mtime", 12)):
            if name in pax_headers:
                # The pax header has priority. Avoid overflow.
                info[name] = 0
                continue

            val = info[name]
            if not 0 <= val < 8 ** (digits - 1) or isinstance(val, float):
                pax_headers[name] = str(val)
                info[name] = 0

        # Create a pax extended header if necessary.
        if pax_headers:
            buf = self._create_pax_generic_header(pax_headers, XHDTYPE, encoding)
        else:
            buf = b""

        return buf + self._create_header(info, USTAR_FORMAT, "ascii", "replace")

    @classmethod
    def create_pax_global_header(cls, pax_headers):
        """Return the object as a pax global header block sequence.

           Global records apply to every member that follows them.
        """
        return cls._create_pax_generic_header(pax_headers, XGLTYPE, "utf-8")

    def _posix_split_name(self, name):
        """Split a name longer than 100 chars into a prefix
           and a name part.
        """
        # Take the longest candidate prefix, then back up to the last '/'
        # so the split lands on a path component boundary.
        prefix = name[:LENGTH_PREFIX + 1]
        while prefix and prefix[-1] != "/":
            prefix = prefix[:-1]

        name = name[len(prefix):]
        prefix = prefix[:-1]    # drop the trailing '/'

        # If no slash was found, or the remainder is still too long,
        # the name cannot be stored in ustar format at all.
        if not prefix or len(name) > LENGTH_NAME:
            raise ValueError("name is too long")
        return prefix, name

    @staticmethod
    def _create_header(info, format, encoding, errors):
        """Return a header block. info is a dictionary with file
           information, format must be one of the *_FORMAT constants.
        """
        # Field order and widths follow the ustar header layout.
        parts = [
            stn(info.get("name", ""), 100, encoding, errors),
            itn(info.get("mode", 0) & 0o7777, 8, format),
            itn(info.get("uid", 0), 8, format),
            itn(info.get("gid", 0), 8, format),
            itn(info.get("size", 0), 12, format),
            itn(info.get("mtime", 0), 12, format),
            b"        ",            # checksum field, blanked for calculation
            info.get("type", REGTYPE),
            stn(info.get("linkname", ""), 100, encoding, errors),
            info.get("magic", POSIX_MAGIC),
            stn(info.get("uname", ""), 32, encoding, errors),
            stn(info.get("gname", ""), 32, encoding, errors),
            itn(info.get("devmajor", 0), 8, format),
            itn(info.get("devminor", 0), 8, format),
            stn(info.get("prefix", ""), 155, encoding, errors)
        ]

        # Pad to a full block, then splice the computed checksum into
        # bytes 148..155 (i.e. BLOCKSIZE-364 .. BLOCKSIZE-357).
        buf = struct.pack("%ds" % BLOCKSIZE, b"".join(parts))
        chksum = calc_chksums(buf[-BLOCKSIZE:])[0]
        buf = buf[:-364] + bytes("%06o\0" % chksum, "ascii") + buf[-357:]
        return buf

    @staticmethod
    def _create_payload(payload):
        """Return the string payload filled with zero bytes
           up to the next 512 byte border.
        """
        # -len(payload) % BLOCKSIZE is the pad needed to reach the next
        # block boundary (0 when already aligned).
        return payload + NUL * (-len(payload) % BLOCKSIZE)

    @classmethod
    def _create_gnu_long_header(cls, name, type, encoding, errors):
        """Return a GNUTYPE_LONGNAME or GNUTYPE_LONGLINK sequence
           for name.
        """
        name = name.encode(encoding, errors) + NUL

        # The long name travels as the data of a pseudo member with the
        # conventional "././@LongLink" name.
        info = {
            "name": "././@LongLink",
            "type": type,
            "size": len(name),
            "magic": GNU_MAGIC,
        }

        # create extended header + name blocks.
        return cls._create_header(info, USTAR_FORMAT, encoding, errors) + \
                cls._create_payload(name)

    @classmethod
    def _create_pax_generic_header(cls, pax_headers, type, encoding):
        """Return a POSIX.1-2008 extended or global header sequence
           that contains a list of keyword, value pairs. The values
           must be strings.
        """
        # Check if one of the fields contains surrogate characters and thereby
        # forces hdrcharset=BINARY, see _proc_pax() for more information.
        binary = False
        for keyword, value in pax_headers.items():
            try:
                value.encode("utf-8", "strict")
            except UnicodeEncodeError:
                binary = True
                break

        records = b""
        if binary:
            # Put the hdrcharset field at the beginning of the header.
            records += b"21 hdrcharset=BINARY\n"

        for keyword, value in pax_headers.items():
            keyword = keyword.encode("utf-8")
            if binary:
                # Try to restore the original byte representation of `value'.
                # Needless to say, that the encoding must match the string.
                value = value.encode(encoding, "surrogateescape")
            else:
                value = value.encode("utf-8")

            # Each record is "<length> <keyword>=<value>\n" where <length>
            # counts the whole record including its own digits; iterate
            # until the digit count stabilizes.
            l = len(keyword) + len(value) + 3   # ' ' + '=' + '\n'
            n = p = 0
            while True:
                n = l + len(str(p))
                if n == p:
                    break
                p = n
            records += bytes(str(p), "ascii") + b" " + keyword + b"=" + value + b"\n"

        # We use a hardcoded "././@PaxHeader" name like star does
        # instead of the one that POSIX recommends.
        info = {}
        info["name"] = "././@PaxHeader"
        info["type"] = type
        info["size"] = len(records)
        info["magic"] = POSIX_MAGIC

        # Create pax header + record blocks.
        return cls._create_header(info, USTAR_FORMAT, "ascii", "replace") + \
                cls._create_payload(records)

    @classmethod
    def frombuf(cls, buf, encoding, errors):
        """Construct a TarInfo object from a 512 byte bytes object.
        """
        # Distinguish the malformed-header conditions so callers can
        # react differently to each of them.
        if len(buf) == 0:
            raise EmptyHeaderError("empty header")
        if len(buf) != BLOCKSIZE:
            raise TruncatedHeaderError("truncated header")
        if buf.count(NUL) == BLOCKSIZE:
            raise EOFHeaderError("end of file header")

        chksum = nti(buf[148:156])
        if chksum not in calc_chksums(buf):
            raise InvalidHeaderError("bad checksum")

        # Unpack the fixed-offset ustar fields.
        obj = cls()
        obj.name = nts(buf[0:100], encoding, errors)
        obj.mode = nti(buf[100:108])
        obj.uid = nti(buf[108:116])
        obj.gid = nti(buf[116:124])
        obj.size = nti(buf[124:136])
        obj.mtime = nti(buf[136:148])
        obj.chksum = chksum
        obj.type = buf[156:157]
        obj.linkname = nts(buf[157:257], encoding, errors)
        obj.uname = nts(buf[265:297], encoding, errors)
        obj.gname = nts(buf[297:329], encoding, errors)
        obj.devmajor = nti(buf[329:337])
        obj.devminor = nti(buf[337:345])
        prefix = nts(buf[345:500], encoding, errors)

        # Old V7 tar format represents a directory as a regular
        # file with a trailing slash.
        if obj.type == AREGTYPE and obj.name.endswith("/"):
            obj.type = DIRTYPE

        # The old GNU sparse format occupies some of the unused
        # space in the buffer for up to 4 sparse structures.
        # Save them for later processing in _proc_sparse().
        if obj.type == GNUTYPE_SPARSE:
            pos = 386
            structs = []
            for i in range(4):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                structs.append((offset, numbytes))
                pos += 24
            isextended = bool(buf[482])
            origsize = nti(buf[483:495])
            obj._sparse_structs = (structs, isextended, origsize)

        # Remove redundant slashes from directories.
        if obj.isdir():
            obj.name = obj.name.rstrip("/")

        # Reconstruct a ustar longname.
        if prefix and obj.type not in GNU_TYPES:
            obj.name = prefix + "/" + obj.name
        return obj

1097 1098 1099 1100 1101 1102
    @classmethod
    def fromtarfile(cls, tarfile):
        """Return the next TarInfo object from TarFile object
           tarfile.
        """
        buf = tarfile.fileobj.read(BLOCKSIZE)
        obj = cls.frombuf(buf, tarfile.encoding, tarfile.errors)
        # The header just consumed started one block back from here.
        obj.offset = tarfile.fileobj.tell() - BLOCKSIZE
        return obj._proc_member(tarfile)

1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127
    #--------------------------------------------------------------------------
    # The following are methods that are called depending on the type of a
    # member. The entry point is _proc_member() which can be overridden in a
    # subclass to add custom _proc_*() methods. A _proc_*() method MUST
    # implement the following
    # operations:
    # 1. Set self.offset_data to the position where the data blocks begin,
    #    if there is data that follows.
    # 2. Set tarfile.offset to the position where the next member's header will
    #    begin.
    # 3. Return self or another valid TarInfo object.
    def _proc_member(self, tarfile):
        """Choose the right processing method depending on
           the type and call it.
        """
        if self.type in (GNUTYPE_LONGNAME, GNUTYPE_LONGLINK):
            return self._proc_gnulong(tarfile)
        if self.type == GNUTYPE_SPARSE:
            return self._proc_sparse(tarfile)
        if self.type in (XHDTYPE, XGLTYPE, SOLARIS_XHDTYPE):
            return self._proc_pax(tarfile)
        # Everything else is handled as a builtin/regular member.
        return self._proc_builtin(tarfile)

1131 1132 1133 1134 1135 1136 1137 1138 1139 1140
    def _proc_builtin(self, tarfile):
        """Process a builtin type or an unknown type which
           will be treated as a regular file.
        """
        self.offset_data = tarfile.fileobj.tell()
        offset = self.offset_data
        if self.isreg() or self.type not in SUPPORTED_TYPES:
            # Skip the following data blocks.
            offset += self._block(self.size)
        tarfile.offset = offset

        # Patch the TarInfo object with saved global
        # header information.
        self._apply_pax_info(tarfile.pax_headers, tarfile.encoding, tarfile.errors)

        return self

1148 1149 1150 1151 1152
    def _proc_gnulong(self, tarfile):
        """Process the blocks that hold a GNU longname
           or longlink member.
        """
        buf = tarfile.fileobj.read(self._block(self.size))

        # Fetch the next header and process it.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Patch the TarInfo object from the next header with
        # the longname information.
        next.offset = self.offset
        if self.type == GNUTYPE_LONGNAME:
            next.name = nts(buf, tarfile.encoding, tarfile.errors)
        elif self.type == GNUTYPE_LONGLINK:
            next.linkname = nts(buf, tarfile.encoding, tarfile.errors)

        return next

1170 1171 1172
    def _proc_sparse(self, tarfile):
        """Process a GNU sparse header plus extra headers.
        """
        # We already collected some sparse structures in frombuf().
        structs, isextended, origsize = self._sparse_structs
        del self._sparse_structs

        # Collect sparse structures from extended header blocks.
        while isextended:
            buf = tarfile.fileobj.read(BLOCKSIZE)
            pos = 0
            # Each extension block holds up to 21 (offset, numbytes) pairs.
            for _ in range(21):
                try:
                    offset = nti(buf[pos:pos + 12])
                    numbytes = nti(buf[pos + 12:pos + 24])
                except ValueError:
                    break
                if offset and numbytes:
                    structs.append((offset, numbytes))
                pos += 24
            # Byte 504 flags whether another extension block follows.
            isextended = bool(buf[504])
        self.sparse = structs

        self.offset_data = tarfile.fileobj.tell()
        tarfile.offset = self.offset_data + self._block(self.size)
        # The header size field holds the compacted size; expose the
        # original (expanded) size to the caller instead.
        self.size = origsize
        return self

    def _proc_pax(self, tarfile):
        """Process an extended or global header as described in
           POSIX.1-2008.

           Raises InvalidHeaderError for a malformed pax record and
           SubsequentHeaderError when the following real header is bad.
        """
        # Read the header information.
        buf = tarfile.fileobj.read(self._block(self.size))

        # A pax header stores supplemental information for either
        # the following file (extended) or all following files
        # (global).
        if self.type == XGLTYPE:
            pax_headers = tarfile.pax_headers
        else:
            pax_headers = tarfile.pax_headers.copy()

        # Check if the pax header contains a hdrcharset field. This tells us
        # the encoding of the path, linkpath, uname and gname fields. Normally,
        # these fields are UTF-8 encoded but since POSIX.1-2008 tar
        # implementations are allowed to store them as raw binary strings if
        # the translation to UTF-8 fails.
        match = re.search(br"\d+ hdrcharset=([^\n]+)\n", buf)
        if match is not None:
            pax_headers["hdrcharset"] = match.group(1).decode("utf-8")

        # For the time being, we don't care about anything other than "BINARY".
        # The only other value that is currently allowed by the standard is
        # "ISO-IR 10646 2000 UTF-8" in other words UTF-8.
        hdrcharset = pax_headers.get("hdrcharset")
        if hdrcharset == "BINARY":
            encoding = tarfile.encoding
        else:
            encoding = "utf-8"

        # Parse pax header information. A record looks like that:
        # "%d %s=%s\n" % (length, keyword, value). length is the size
        # of the complete record including the length field itself and
        # the newline. keyword and value are both UTF-8 encoded strings.
        regex = re.compile(br"(\d+) ([^=]+)=")
        pos = 0
        while True:
            match = regex.match(buf, pos)
            if not match:
                break

            length, keyword = match.groups()
            length = int(length)
            if length == 0:
                # A zero length would keep `pos' from advancing, so a
                # crafted archive could loop this parser forever
                # (CVE-2019-20907 / bpo-39017). Reject it outright.
                raise InvalidHeaderError("invalid header")
            value = buf[match.end(2) + 1:match.start(1) + length - 1]

            # Normally, we could just use "utf-8" as the encoding and "strict"
            # as the error handler, but we better not take the risk. For
            # example, GNU tar <= 1.23 is known to store filenames it cannot
            # translate to UTF-8 as raw strings (unfortunately without a
            # hdrcharset=BINARY header).
            # We first try the strict standard encoding, and if that fails we
            # fall back on the user's encoding and error handler.
            keyword = self._decode_pax_field(keyword, "utf-8", "utf-8",
                    tarfile.errors)
            if keyword in PAX_NAME_FIELDS:
                value = self._decode_pax_field(value, encoding, tarfile.encoding,
                        tarfile.errors)
            else:
                value = self._decode_pax_field(value, "utf-8", "utf-8",
                        tarfile.errors)

            pax_headers[keyword] = value
            pos += length

        # Fetch the next header.
        try:
            next = self.fromtarfile(tarfile)
        except HeaderError:
            raise SubsequentHeaderError("missing or bad subsequent header")

        # Process GNU sparse information.
        if "GNU.sparse.map" in pax_headers:
            # GNU extended sparse format version 0.1.
            self._proc_gnusparse_01(next, pax_headers)

        elif "GNU.sparse.size" in pax_headers:
            # GNU extended sparse format version 0.0.
            self._proc_gnusparse_00(next, pax_headers, buf)

        elif pax_headers.get("GNU.sparse.major") == "1" and pax_headers.get("GNU.sparse.minor") == "0":
            # GNU extended sparse format version 1.0.
            self._proc_gnusparse_10(next, pax_headers, tarfile)

        if self.type in (XHDTYPE, SOLARIS_XHDTYPE):
            # Patch the TarInfo object with the extended header info.
            next._apply_pax_info(pax_headers, tarfile.encoding, tarfile.errors)
            next.offset = self.offset

            if "size" in pax_headers:
                # If the extended header replaces the size field,
                # we need to recalculate the offset where the next
                # header starts.
                offset = next.offset_data
                if next.isreg() or next.type not in SUPPORTED_TYPES:
                    offset += next._block(next.size)
                tarfile.offset = offset

        return next

1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332
    def _proc_gnusparse_00(self, next, pax_headers, buf):
        """Process a GNU tar extended sparse header, version 0.0.

           Offsets and sizes are stored as repeated records in the raw
           pax buffer rather than in the parsed header dictionary.
        """
        offsets = [int(x) for x in
                   re.findall(br"\d+ GNU.sparse.offset=(\d+)\n", buf)]
        numbytes = [int(x) for x in
                    re.findall(br"\d+ GNU.sparse.numbytes=(\d+)\n", buf)]
        next.sparse = list(zip(offsets, numbytes))

    def _proc_gnusparse_01(self, next, pax_headers):
        """Process a GNU tar extended sparse header, version 0.1.

           The map is one comma-separated list of alternating
           offset/numbytes decimal values.
        """
        numbers = [int(x) for x in pax_headers["GNU.sparse.map"].split(",")]
        next.sparse = list(zip(numbers[0::2], numbers[1::2]))

    def _proc_gnusparse_10(self, next, pax_headers, tarfile):
        """Process a GNU tar extended sparse header, version 1.0.

           The sparse map precedes the file data as newline-separated
           decimal numbers: a count, then offset/numbytes pairs.
        """
        sparse = []
        buf = tarfile.fileobj.read(BLOCKSIZE)
        fields, buf = buf.split(b"\n", 1)
        fields = int(fields)
        while len(sparse) < fields * 2:
            # Refill from the archive whenever a complete number is not
            # yet available in the working buffer.
            if b"\n" not in buf:
                buf += tarfile.fileobj.read(BLOCKSIZE)
            number, buf = buf.split(b"\n", 1)
            sparse.append(int(number))
        next.offset_data = tarfile.fileobj.tell()
        next.sparse = list(zip(sparse[::2], sparse[1::2]))

1333 1334 1335 1336 1337
    def _apply_pax_info(self, pax_headers, encoding, errors):
        """Replace fields with supplemental information from a previous
           pax extended or global header.
        """
        for keyword, value in pax_headers.items():
            if keyword == "GNU.sparse.name":
                self.path = value
            elif keyword in ("GNU.sparse.size", "GNU.sparse.realsize"):
                self.size = int(value)
            elif keyword in PAX_FIELDS:
                if keyword in PAX_NUMBER_FIELDS:
                    # Numeric pax fields fall back to 0 when unparsable.
                    try:
                        value = PAX_NUMBER_FIELDS[keyword](value)
                    except ValueError:
                        value = 0
                if keyword == "path":
                    value = value.rstrip("/")
                setattr(self, keyword, value)

        # Keep a private copy of the headers for this member.
        self.pax_headers = pax_headers.copy()

1356 1357 1358 1359 1360 1361 1362 1363
    def _decode_pax_field(self, value, encoding, fallback_encoding, fallback_errors):
        """Decode a single field from a pax record.
        """
        # Try the expected encoding strictly first; if the bytes are not
        # valid in it, decode with the caller-supplied fallback instead.
        try:
            return value.decode(encoding, "strict")
        except UnicodeDecodeError:
            return value.decode(fallback_encoding, fallback_errors)

1364 1365 1366 1367 1368 1369 1370 1371
    def _block(self, count):
        """Round up a byte count by BLOCKSIZE and return it,
           e.g. _block(834) => 1024.
        """
        # Ceiling division written as negated floor division.
        return -(-count // BLOCKSIZE) * BLOCKSIZE

1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389
    def isreg(self):
        return self.type in REGULAR_TYPES
    def isfile(self):
        return self.isreg()
    def isdir(self):
        return self.type == DIRTYPE
    def issym(self):
        return self.type == SYMTYPE
    def islnk(self):
        return self.type == LNKTYPE
    def ischr(self):
        return self.type == CHRTYPE
    def isblk(self):
        return self.type == BLKTYPE
    def isfifo(self):
        return self.type == FIFOTYPE
    def issparse(self):
1390
        return self.sparse is not None
1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406
    def isdev(self):
        return self.type in (CHRTYPE, BLKTYPE, FIFOTYPE)
# class TarInfo

class TarFile(object):
    """The TarFile Class provides an interface to tar archives.
    """

    debug = 0                   # May be set from 0 (no msgs) to 3 (all msgs)

    dereference = False         # If true, add content of linked file to the
                                # tar file, else the link.

    ignore_zeros = False        # If true, skips empty or invalid blocks and
                                # continues processing.

    errorlevel = 1              # If 0, fatal errors only appear in debug
                                # messages (if debug >= 0). If > 0, errors
                                # are passed to the caller as exceptions.

    format = DEFAULT_FORMAT     # The format to use when creating an archive.

    encoding = ENCODING         # Encoding for 8-bit character strings.

    errors = None               # Error handler for unicode conversion.

    tarinfo = TarInfo           # The default TarInfo class to use.

    fileobject = ExFileObject   # The file-object for extractfile().

    def __init__(self, name=None, mode="r", fileobj=None, format=None,
            tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
            errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
        """Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
           read from an existing archive, 'a' to append data to an existing
           file or 'w' to create a new file overwriting an existing one. `mode'
           defaults to 'r'.
           If `fileobj' is given, it is used for reading or writing data. If it
           can be determined, `mode' is overridden by `fileobj's mode.
           `fileobj' is not closed, when TarFile is closed.
        """
        if len(mode) > 1 or mode not in "raw":
            raise ValueError("mode must be 'r', 'a' or 'w'")
        self.mode = mode
        self._mode = {"r": "rb", "a": "r+b", "w": "wb"}[mode]

        if not fileobj:
            if self.mode == "a" and not os.path.exists(name):
                # Create nonexistent files in append mode.
                self.mode = "w"
                self._mode = "wb"
            fileobj = bltn_open(name, self._mode)
            self._extfileobj = False
        else:
            if name is None and hasattr(fileobj, "name"):
                name = fileobj.name
            if hasattr(fileobj, "mode"):
                self._mode = fileobj.mode
            self._extfileobj = True
        self.name = os.path.abspath(name) if name else None
        self.fileobj = fileobj

        # Init attributes (keyword arguments override the class defaults).
        if format is not None:
            self.format = format
        if tarinfo is not None:
            self.tarinfo = tarinfo
        if dereference is not None:
            self.dereference = dereference
        if ignore_zeros is not None:
            self.ignore_zeros = ignore_zeros
        if encoding is not None:
            self.encoding = encoding
        self.errors = errors

        # Global pax headers only make sense for PAX_FORMAT archives.
        if pax_headers is not None and self.format == PAX_FORMAT:
            self.pax_headers = pax_headers
        else:
            self.pax_headers = {}

        if debug is not None:
            self.debug = debug
        if errorlevel is not None:
            self.errorlevel = errorlevel

        # Init datastructures.
        self.closed = False
        self.members = []       # list of members as TarInfo objects
        self._loaded = False    # flag if all members have been read
        self.offset = self.fileobj.tell()
                                # current position in the archive file
        self.inodes = {}        # dictionary caching the inodes of
                                # archive members already added

        try:
            if self.mode == "r":
                self.firstmember = None
                self.firstmember = self.next()

            if self.mode == "a":
                # Move to the end of the archive,
                # before the first empty block.
                while True:
                    self.fileobj.seek(self.offset)
                    try:
                        tarinfo = self.tarinfo.fromtarfile(self)
                        self.members.append(tarinfo)
                    except EOFHeaderError:
                        self.fileobj.seek(self.offset)
                        break
                    except HeaderError as e:
                        raise ReadError(str(e))

            if self.mode in "aw":
                self._loaded = True

                if self.pax_headers:
                    buf = self.tarinfo.create_pax_global_header(self.pax_headers.copy())
                    self.fileobj.write(buf)
                    self.offset += len(buf)
        except:
            # Construction failed: close the file only if we opened it
            # ourselves, then re-raise for the caller.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
            raise

1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527
    #--------------------------------------------------------------------------
    # Below are the classmethods which act as alternate constructors to the
    # TarFile class. The open() method is the only one that is needed for
    # public use; it is the "super"-constructor and is able to select an
    # adequate "sub"-constructor for a particular compression using the mapping
    # from OPEN_METH.
    #
    # This concept allows one to subclass TarFile without losing the comfort of
    # the super-constructor. A sub-constructor is registered and made available
    # by adding it to the mapping in OPEN_METH.

Guido van Rossum's avatar
Guido van Rossum committed
1528
    @classmethod
1529
    def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
1530 1531 1532 1533
        """Open a tar archive for reading, writing or appending. Return
           an appropriate TarFile class.

           mode:
1534
           'r' or 'r:*' open for reading with transparent compression
1535 1536 1537
           'r:'         open for reading exclusively uncompressed
           'r:gz'       open for reading with gzip compression
           'r:bz2'      open for reading with bzip2 compression
1538
           'r:xz'       open for reading with lzma compression
1539
           'a' or 'a:'  open for appending, creating the file if necessary
1540 1541 1542
           'w' or 'w:'  open for writing without compression
           'w:gz'       open for writing with gzip compression
           'w:bz2'      open for writing with bzip2 compression
1543
           'w:xz'       open for writing with lzma compression
1544 1545

           'r|*'        open a stream of tar blocks with transparent compression
1546 1547 1548
           'r|'         open an uncompressed stream of tar blocks for reading
           'r|gz'       open a gzip compressed stream of tar blocks
           'r|bz2'      open a bzip2 compressed stream of tar blocks
1549
           'r|xz'       open an lzma compressed stream of tar blocks
1550 1551 1552
           'w|'         open an uncompressed stream for writing
           'w|gz'       open a gzip compressed stream for writing
           'w|bz2'      open a bzip2 compressed stream for writing
1553
           'w|xz'       open an lzma compressed stream for writing
1554 1555 1556
        """

        if not name and not fileobj:
1557
            raise ValueError("nothing to open")
1558

1559 1560 1561 1562
        if mode in ("r", "r:*"):
            # Find out which *open() is appropriate for opening the file.
            for comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
1563 1564
                if fileobj is not None:
                    saved_pos = fileobj.tell()
1565
                try:
1566 1567
                    return func(name, "r", fileobj, **kwargs)
                except (ReadError, CompressionError) as e:
1568 1569
                    if fileobj is not None:
                        fileobj.seek(saved_pos)
1570
                    continue
1571
            raise ReadError("file could not be opened successfully")
1572 1573

        elif ":" in mode:
1574 1575 1576 1577 1578 1579 1580 1581 1582
            filemode, comptype = mode.split(":", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            # Select the *open() function according to
            # given compression.
            if comptype in cls.OPEN_METH:
                func = getattr(cls, cls.OPEN_METH[comptype])
            else:
1583
                raise CompressionError("unknown compression type %r" % comptype)
1584
            return func(name, filemode, fileobj, **kwargs)
1585 1586 1587 1588 1589 1590 1591

        elif "|" in mode:
            filemode, comptype = mode.split("|", 1)
            filemode = filemode or "r"
            comptype = comptype or "tar"

            if filemode not in "rw":
1592
                raise ValueError("mode must be 'r' or 'w'")
1593

1594 1595 1596 1597 1598 1599
            stream = _Stream(name, filemode, comptype, fileobj, bufsize)
            try:
                t = cls(name, filemode, stream, **kwargs)
            except:
                stream.close()
                raise
1600 1601 1602 1603
            t._extfileobj = False
            return t

        elif mode in "aw":
1604
            return cls.taropen(name, mode, fileobj, **kwargs)
1605

1606
        raise ValueError("undiscernible mode")
1607

Guido van Rossum's avatar
Guido van Rossum committed
1608
    @classmethod
1609
    def taropen(cls, name, mode="r", fileobj=None, **kwargs):
1610 1611 1612
        """Open uncompressed tar archive name for reading or writing.
        """
        if len(mode) > 1 or mode not in "raw":
1613
            raise ValueError("mode must be 'r', 'a' or 'w'")
1614
        return cls(name, mode, fileobj, **kwargs)
1615

Guido van Rossum's avatar
Guido van Rossum committed
1616
    @classmethod
1617
    def gzopen(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1618 1619 1620 1621
        """Open gzip compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
1622
            raise ValueError("mode must be 'r' or 'w'")
1623 1624 1625

        try:
            import gzip
1626 1627
            gzip.GzipFile
        except (ImportError, AttributeError):
1628
            raise CompressionError("gzip module is not available")
1629

1630
        extfileobj = fileobj is not None
1631
        try:
1632 1633
            fileobj = gzip.GzipFile(name, mode + "b", compresslevel, fileobj)
            t = cls.taropen(name, mode, fileobj, **kwargs)
1634
        except IOError:
1635
            if not extfileobj and fileobj is not None:
1636
                fileobj.close()
1637 1638
            if fileobj is None:
                raise
1639
            raise ReadError("not a gzip file")
1640
        except:
1641
            if not extfileobj and fileobj is not None:
1642 1643
                fileobj.close()
            raise
1644
        t._extfileobj = extfileobj
1645 1646
        return t

Guido van Rossum's avatar
Guido van Rossum committed
1647
    @classmethod
1648
    def bz2open(cls, name, mode="r", fileobj=None, compresslevel=9, **kwargs):
1649 1650 1651 1652
        """Open bzip2 compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if len(mode) > 1 or mode not in "rw":
1653
            raise ValueError("mode must be 'r' or 'w'.")
1654 1655 1656 1657

        try:
            import bz2
        except ImportError:
1658
            raise CompressionError("bz2 module is not available")
1659

1660 1661
        fileobj = bz2.BZ2File(fileobj or name, mode,
                              compresslevel=compresslevel)
1662 1663

        try:
1664
            t = cls.taropen(name, mode, fileobj, **kwargs)
1665
        except (IOError, EOFError):
1666
            fileobj.close()
1667
            raise ReadError("not a bzip2 file")
1668 1669 1670
        t._extfileobj = False
        return t

1671
    @classmethod
1672
    def xzopen(cls, name, mode="r", fileobj=None, preset=None, **kwargs):
1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683
        """Open lzma compressed tar archive name for reading or writing.
           Appending is not allowed.
        """
        if mode not in ("r", "w"):
            raise ValueError("mode must be 'r' or 'w'")

        try:
            import lzma
        except ImportError:
            raise CompressionError("lzma module is not available")

1684
        fileobj = lzma.LZMAFile(fileobj or name, mode, preset=preset)
1685 1686 1687 1688 1689 1690 1691 1692 1693

        try:
            t = cls.taropen(name, mode, fileobj, **kwargs)
        except (lzma.LZMAError, EOFError):
            fileobj.close()
            raise ReadError("not an lzma file")
        t._extfileobj = False
        return t

1694 1695 1696 1697
    # All *open() methods are registered here.
    OPEN_METH = {
        "tar": "taropen",   # uncompressed tar
        "gz":  "gzopen",    # gzip compressed tar
1698 1699
        "bz2": "bz2open",   # bzip2 compressed tar
        "xz":  "xzopen"     # lzma compressed tar
1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711
    }

    #--------------------------------------------------------------------------
    # The public methods which TarFile provides:

    def close(self):
        """Close the TarFile. In write-mode, two finishing zero blocks are
           appended to the archive.
        """
        if self.closed:
            return

        if self.mode in "aw":
            self.fileobj.write(NUL * (BLOCKSIZE * 2))
            self.offset += (BLOCKSIZE * 2)
            # fill up the end with zero-blocks
            # (like option -b20 for tar does)
            blocks, remainder = divmod(self.offset, RECORDSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (RECORDSIZE - remainder))

        if not self._extfileobj:
            self.fileobj.close()
        self.closed = True

    def getmember(self, name):
        """Return a TarInfo object for member `name'. If `name' can not be
           found in the archive, KeyError is raised. If a member occurs more
           than once in the archive, its last occurrence is assumed to be the
           most up-to-date version.
        """
        tarinfo = self._getmember(name)
        if tarinfo is None:
            raise KeyError("filename %r not found" % name)
        return tarinfo

    def getmembers(self):
        """Return all archive members as a list of TarInfo objects,
           in the same order as they appear in the archive.
        """
        self._check()
        # A complete listing requires the whole archive to have been
        # scanned at least once; do that lazily on first request.
        if not self._loaded:
            self._load()
        return self.members

    def getnames(self):
        """Return the members of the archive as a list of their names. It has
           the same order as the list returned by getmembers().
        """
        return [tarinfo.name for tarinfo in self.getmembers()]

    def gettarinfo(self, name=None, arcname=None, fileobj=None):
        """Create a TarInfo object for either the file `name' or the file
           object `fileobj' (using os.fstat on its file descriptor). You can
           modify some of the TarInfo's attributes before you add it using
           addfile(). If given, `arcname' specifies an alternative name for the
           file in the archive.
        """
        self._check("aw")

        # When fileobj is given, replace name by
        # fileobj's real name.
        if fileobj is not None:
            name = fileobj.name

        # Building the name of the member in the archive.
        # Backward slashes are converted to forward slashes,
        # Absolute paths are turned to relative paths.
        if arcname is None:
            arcname = name
        drv, arcname = os.path.splitdrive(arcname)
        arcname = arcname.replace(os.sep, "/")
        arcname = arcname.lstrip("/")

        # Now, fill the TarInfo object with
        # information specific for the file.
        tarinfo = self.tarinfo()
        tarinfo.tarfile = self

        # Use os.stat or os.lstat, depending on platform
        # and if symlinks shall be resolved.
        if fileobj is None:
            if hasattr(os, "lstat") and not self.dereference:
                statres = os.lstat(name)
            else:
                statres = os.stat(name)
        else:
            statres = os.fstat(fileobj.fileno())
        linkname = ""

        stmd = statres.st_mode
        if stat.S_ISREG(stmd):
            inode = (statres.st_ino, statres.st_dev)
            if not self.dereference and statres.st_nlink > 1 and \
                    inode in self.inodes and arcname != self.inodes[inode]:
                # Is it a hardlink to an already
                # archived file?
                type = LNKTYPE
                linkname = self.inodes[inode]
            else:
                # The inode is added only if its valid.
                # For win32 it is always 0.
                type = REGTYPE
                if inode[0]:
                    self.inodes[inode] = arcname
        elif stat.S_ISDIR(stmd):
            type = DIRTYPE
        elif stat.S_ISFIFO(stmd):
            type = FIFOTYPE
        elif stat.S_ISLNK(stmd):
            type = SYMTYPE
            linkname = os.readlink(name)
        elif stat.S_ISCHR(stmd):
            type = CHRTYPE
        elif stat.S_ISBLK(stmd):
            type = BLKTYPE
        else:
            # Sockets and other exotic file types are not archivable.
            return None

        # Fill the TarInfo object with all
        # information we can get.
        tarinfo.name = arcname
        tarinfo.mode = stmd
        tarinfo.uid = statres.st_uid
        tarinfo.gid = statres.st_gid
        if type == REGTYPE:
            tarinfo.size = statres.st_size
        else:
            tarinfo.size = 0
        tarinfo.mtime = statres.st_mtime
        tarinfo.type = type
        tarinfo.linkname = linkname
        if pwd:
            try:
                tarinfo.uname = pwd.getpwuid(tarinfo.uid)[0]
            except KeyError:
                pass
        if grp:
            try:
                tarinfo.gname = grp.getgrgid(tarinfo.gid)[0]
            except KeyError:
                pass

        if type in (CHRTYPE, BLKTYPE):
            if hasattr(os, "major") and hasattr(os, "minor"):
                tarinfo.devmajor = os.major(statres.st_rdev)
                tarinfo.devminor = os.minor(statres.st_rdev)
        return tarinfo

    def list(self, verbose=True):
        """Print a table of contents to sys.stdout. If `verbose' is False, only
           the names of the members are printed. If it is True, an `ls -l'-like
           output is produced.
        """
        self._check()

        for tarinfo in self:
            if verbose:
                print(stat.filemode(tarinfo.mode), end=' ')
                print("%s/%s" % (tarinfo.uname or tarinfo.uid,
                                 tarinfo.gname or tarinfo.gid), end=' ')
                if tarinfo.ischr() or tarinfo.isblk():
                    # Devices show "major,minor" instead of a size.
                    print("%10s" % ("%d,%d" \
                                    % (tarinfo.devmajor, tarinfo.devminor)), end=' ')
                else:
                    print("%10d" % tarinfo.size, end=' ')
                print("%d-%02d-%02d %02d:%02d:%02d" \
                      % time.localtime(tarinfo.mtime)[:6], end=' ')

            print(tarinfo.name + ("/" if tarinfo.isdir() else ""), end=' ')

            if verbose:
                if tarinfo.issym():
                    print("->", tarinfo.linkname, end=' ')
                if tarinfo.islnk():
                    print("link to", tarinfo.linkname, end=' ')
            print()

1879
    def add(self, name, arcname=None, recursive=True, exclude=None, *, filter=None):
1880 1881 1882 1883
        """Add the file `name' to the archive. `name' may be any type of file
           (directory, fifo, symbolic link, etc.). If given, `arcname'
           specifies an alternative name for the file in the archive.
           Directories are added recursively by default. This can be avoided by
1884
           setting `recursive' to False. `exclude' is a function that should
1885 1886 1887 1888
           return True for each filename to be excluded. `filter' is a function
           that expects a TarInfo object argument and returns the changed
           TarInfo object, if it returns None the TarInfo object will be
           excluded from the archive.
1889 1890 1891 1892 1893 1894
        """
        self._check("aw")

        if arcname is None:
            arcname = name

1895
        # Exclude pathnames.
1896 1897 1898 1899 1900 1901 1902
        if exclude is not None:
            import warnings
            warnings.warn("use the filter argument instead",
                    DeprecationWarning, 2)
            if exclude(name):
                self._dbg(2, "tarfile: Excluded %r" % name)
                return
1903

1904
        # Skip if somebody tries to archive the archive...
1905
        if self.name is not None and os.path.abspath(name) == self.name:
1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917
            self._dbg(2, "tarfile: Skipped %r" % name)
            return

        self._dbg(1, name)

        # Create a TarInfo object from the file.
        tarinfo = self.gettarinfo(name, arcname)

        if tarinfo is None:
            self._dbg(1, "tarfile: Unsupported type %r" % name)
            return

1918 1919 1920 1921 1922 1923 1924
        # Change or exclude the TarInfo object.
        if filter is not None:
            tarinfo = filter(tarinfo)
            if tarinfo is None:
                self._dbg(2, "tarfile: Excluded %r" % name)
                return

1925 1926
        # Append the tar header and data to the archive.
        if tarinfo.isreg():
1927 1928
            with bltn_open(name, "rb") as f:
                self.addfile(tarinfo, f)
1929

1930
        elif tarinfo.isdir():
1931 1932 1933
            self.addfile(tarinfo)
            if recursive:
                for f in os.listdir(name):
1934
                    self.add(os.path.join(name, f), os.path.join(arcname, f),
1935
                            recursive, exclude, filter=filter)
1936

1937 1938 1939
        else:
            self.addfile(tarinfo)

1940 1941 1942 1943 1944 1945 1946 1947 1948
    def addfile(self, tarinfo, fileobj=None):
        """Add the TarInfo object `tarinfo' to the archive. If `fileobj' is
           given, tarinfo.size bytes are read from it and added to the archive.
           You can create TarInfo objects using gettarinfo().
           On Windows platforms, `fileobj' should always be opened with mode
           'rb' to avoid irritation about the file size.
        """
        self._check("aw")

1949
        tarinfo = copy.copy(tarinfo)
1950

1951
        buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
1952 1953
        self.fileobj.write(buf)
        self.offset += len(buf)
1954 1955 1956 1957 1958 1959 1960 1961 1962 1963

        # If there's data to follow, append it.
        if fileobj is not None:
            copyfileobj(fileobj, self.fileobj, tarinfo.size)
            blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
            if remainder > 0:
                self.fileobj.write(NUL * (BLOCKSIZE - remainder))
                blocks += 1
            self.offset += blocks * BLOCKSIZE

1964
        self.members.append(tarinfo)
1965

1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979
    def extractall(self, path=".", members=None):
        """Extract all members from the archive to the current working
           directory and set owner, modification time and permissions on
           directories afterwards. `path' specifies a different directory
           to extract to. `members' is optional and must be a subset of the
           list returned by getmembers().
        """
        directories = []

        if members is None:
            members = self

        for tarinfo in members:
            if tarinfo.isdir():
Christian Heimes's avatar
Christian Heimes committed
1980
                # Extract directories with a safe mode.
1981
                directories.append(tarinfo)
Christian Heimes's avatar
Christian Heimes committed
1982 1983
                tarinfo = copy.copy(tarinfo)
                tarinfo.mode = 0o700
1984 1985
            # Do not set_attrs directories, as we will do that further down
            self.extract(tarinfo, path, set_attrs=not tarinfo.isdir())
1986 1987

        # Reverse sort directories.
1988
        directories.sort(key=lambda a: a.name)
1989 1990 1991 1992
        directories.reverse()

        # Set correct owner, mtime and filemode on directories.
        for tarinfo in directories:
1993
            dirpath = os.path.join(path, tarinfo.name)
1994
            try:
1995 1996 1997
                self.chown(tarinfo, dirpath)
                self.utime(tarinfo, dirpath)
                self.chmod(tarinfo, dirpath)
1998
            except ExtractError as e:
1999 2000 2001 2002 2003
                if self.errorlevel > 1:
                    raise
                else:
                    self._dbg(1, "tarfile: %s" % e)

2004
    def extract(self, member, path="", set_attrs=True):
2005 2006 2007
        """Extract a member from the archive to the current working directory,
           using its full name. Its file information is extracted as accurately
           as possible. `member' may be a filename or a TarInfo object. You can
2008 2009
           specify a different directory using `path'. File attributes (owner,
           mtime, mode) are set unless `set_attrs' is False.
2010 2011 2012
        """
        self._check("r")

2013
        if isinstance(member, str):
2014
            tarinfo = self.getmember(member)
2015 2016
        else:
            tarinfo = member
2017

2018 2019 2020 2021
        # Prepare the link target for makelink().
        if tarinfo.islnk():
            tarinfo._link_target = os.path.join(path, tarinfo.linkname)

2022
        try:
2023 2024
            self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
                                 set_attrs=set_attrs)
2025
        except EnvironmentError as e:
2026 2027 2028 2029 2030 2031 2032
            if self.errorlevel > 0:
                raise
            else:
                if e.filename is None:
                    self._dbg(1, "tarfile: %s" % e.strerror)
                else:
                    self._dbg(1, "tarfile: %s %r" % (e.strerror, e.filename))
2033
        except ExtractError as e:
2034 2035 2036 2037 2038 2039 2040
            if self.errorlevel > 1:
                raise
            else:
                self._dbg(1, "tarfile: %s" % e)

    def extractfile(self, member):
        """Extract a member from the archive as a file object. `member' may be
           a filename or a TarInfo object. If `member' is a regular file or a
           link, an io.BufferedReader object is returned. Otherwise, None is
           returned.
        """
        self._check("r")

        if isinstance(member, str):
            tarinfo = self.getmember(member)
        else:
            tarinfo = member

        if tarinfo.isreg() or tarinfo.type not in SUPPORTED_TYPES:
            # Members with unknown types are treated as regular files.
            return self.fileobject(self, tarinfo)

        elif tarinfo.islnk() or tarinfo.issym():
            if isinstance(self.fileobj, _Stream):
                # A small but ugly workaround for the case that someone tries
                # to extract a (sym)link as a file-object from a non-seekable
                # stream of tar blocks.
                raise StreamError("cannot extract (sym)link as file object")
            else:
                # A (sym)link's file object is its target's file object.
                return self.extractfile(self._find_link_target(tarinfo))
        else:
            # If there's no data associated with the member (directory, chrdev,
            # blkdev, etc.), return None instead of a file object.
            return None

2070
    def _extract_member(self, tarinfo, targetpath, set_attrs=True):
2071 2072 2073 2074 2075 2076
        """Extract the TarInfo object tarinfo to a physical
           file called targetpath.
        """
        # Fetch the TarInfo object for the given name
        # and build the destination pathname, replacing
        # forward slashes to platform specific separators.
2077 2078
        targetpath = targetpath.rstrip("/")
        targetpath = targetpath.replace("/", os.sep)
2079 2080 2081 2082

        # Create all upper directories.
        upperdirs = os.path.dirname(targetpath)
        if upperdirs and not os.path.exists(upperdirs):
Christian Heimes's avatar
Christian Heimes committed
2083 2084
            # Create directories that are not part of the archive with
            # default permissions.
2085
            os.makedirs(upperdirs)
2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106

        if tarinfo.islnk() or tarinfo.issym():
            self._dbg(1, "%s -> %s" % (tarinfo.name, tarinfo.linkname))
        else:
            self._dbg(1, tarinfo.name)

        if tarinfo.isreg():
            self.makefile(tarinfo, targetpath)
        elif tarinfo.isdir():
            self.makedir(tarinfo, targetpath)
        elif tarinfo.isfifo():
            self.makefifo(tarinfo, targetpath)
        elif tarinfo.ischr() or tarinfo.isblk():
            self.makedev(tarinfo, targetpath)
        elif tarinfo.islnk() or tarinfo.issym():
            self.makelink(tarinfo, targetpath)
        elif tarinfo.type not in SUPPORTED_TYPES:
            self.makeunknown(tarinfo, targetpath)
        else:
            self.makefile(tarinfo, targetpath)

2107 2108 2109 2110 2111
        if set_attrs:
            self.chown(tarinfo, targetpath)
            if not tarinfo.issym():
                self.chmod(tarinfo, targetpath)
                self.utime(tarinfo, targetpath)
2112 2113 2114 2115 2116 2117 2118 2119 2120 2121

    #--------------------------------------------------------------------------
    # Below are the different file methods. They are called via
    # _extract_member() when extract() is called. They can be replaced in a
    # subclass to implement other functionality.

    def makedir(self, tarinfo, targetpath):
        """Make a directory called targetpath.
        """
        try:
            # Create the directory with a deliberately restrictive
            # placeholder mode; the archive's real permission bits are
            # applied later by _extract_member().
            os.mkdir(targetpath, 0o700)
        except FileExistsError:
            # The directory is already on disk -- nothing to do.
            pass
2127 2128 2129 2130

    def makefile(self, tarinfo, targetpath):
        """Make a file called targetpath.
        """
        source = self.fileobj
        # Position the archive stream at the start of this member's data.
        source.seek(tarinfo.offset_data)
        with bltn_open(targetpath, "wb") as target:
            if tarinfo.sparse is None:
                copyfileobj(source, target, tarinfo.size)
            else:
                # Sparse member: write only the recorded data runs and
                # seek over the holes.
                for offset, size in tarinfo.sparse:
                    target.seek(offset)
                    copyfileobj(source, target, size)
            # Grow or trim the file to its advertised size (covers a
            # trailing hole in sparse members).
            target.seek(tarinfo.size)
            target.truncate()
2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156

    def makeunknown(self, tarinfo, targetpath):
        """Make a file from a TarInfo object with an unknown type
           at targetpath.
        """
        # Best effort: extract the member's payload as an ordinary file
        # and record a debug notice about the unknown type code.
        self.makefile(tarinfo, targetpath)
        self._dbg(1, "tarfile: Unknown file type %r, "
                     "extracted as regular file." % tarinfo.type)

    def makefifo(self, tarinfo, targetpath):
        """Make a fifo called targetpath.
        """
        # os.mkfifo is missing on platforms without FIFO support.
        if not hasattr(os, "mkfifo"):
            raise ExtractError("fifo not supported by system")
        os.mkfifo(targetpath)
2158 2159 2160 2161 2162

    def makedev(self, tarinfo, targetpath):
        """Make a character or block device called targetpath.
        """
        # Both primitives are needed to create a device node.
        if not (hasattr(os, "mknod") and hasattr(os, "makedev")):
            raise ExtractError("special devices not supported by system")

        # Merge the device-type flag into the member's permission bits.
        kind = stat.S_IFBLK if tarinfo.isblk() else stat.S_IFCHR
        os.mknod(targetpath, tarinfo.mode | kind,
                 os.makedev(tarinfo.devmajor, tarinfo.devminor))

    def makelink(self, tarinfo, targetpath):
        """Make a (symbolic) link called targetpath. If it cannot be created
          (platform limitation), we try to make a copy of the referenced file
          instead of a link.
        """
        try:
            if tarinfo.issym():
                # Symlink: reproduce the stored (possibly relative) name.
                os.symlink(tarinfo.linkname, targetpath)
            elif os.path.exists(tarinfo._link_target):
                # Hard link against the already extracted target file.
                # See extract().
                os.link(tarinfo._link_target, targetpath)
            else:
                # Target not on disk yet: extract the archived target
                # member directly to targetpath.
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
        except symlink_exception:
            # The platform cannot create links at all; fall back to a
            # plain copy of the link's target member.
            try:
                self._extract_member(self._find_link_target(tarinfo),
                                     targetpath)
            except KeyError:
                raise ExtractError("unable to resolve link inside archive")
2196 2197 2198 2199 2200 2201 2202 2203 2204

    def chown(self, tarinfo, targetpath):
        """Set owner of targetpath according to tarinfo.
        """
        # Only root may change ownership; also requires the pwd module
        # and os.geteuid() to be available on this platform.
        if not (pwd and hasattr(os, "geteuid") and os.geteuid() == 0):
            return
        # Prefer the symbolic names stored in the archive, falling back
        # to the numeric ids when they are unknown on this system.
        try:
            gid = grp.getgrnam(tarinfo.gname)[2]
        except KeyError:
            gid = tarinfo.gid
        try:
            uid = pwd.getpwnam(tarinfo.uname)[2]
        except KeyError:
            uid = tarinfo.uid
        try:
            if tarinfo.issym() and hasattr(os, "lchown"):
                # Change the link itself, not its target.
                os.lchown(targetpath, uid, gid)
            elif sys.platform != "os2emx":
                os.chown(targetpath, uid, gid)
        except EnvironmentError:
            raise ExtractError("could not change owner")
2218 2219 2220 2221

    def chmod(self, tarinfo, targetpath):
        """Set file permissions of targetpath according to tarinfo.
        """
        # Some platforms lack os.chmod entirely; silently skip there.
        if not hasattr(os, 'chmod'):
            return
        try:
            os.chmod(targetpath, tarinfo.mode)
        except EnvironmentError:
            raise ExtractError("could not change mode")
2227 2228 2229 2230

    def utime(self, tarinfo, targetpath):
        """Set modification time of targetpath according to tarinfo.
        """
        # Silently skip on platforms without os.utime.
        if hasattr(os, 'utime'):
            try:
                # atime and mtime are both set to the archived mtime.
                os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
            except EnvironmentError:
                raise ExtractError("could not change modification time")
2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250

    #--------------------------------------------------------------------------
    def next(self):
        """Return the next member of the archive as a TarInfo object, when
           TarFile is opened for reading. Return None if there is no more
           available.
        """
        self._check("ra")
        if self.firstmember is not None:
            # A member was already read ahead (e.g. while opening the
            # archive); hand it out instead of re-reading the stream.
            m = self.firstmember
            self.firstmember = None
            return m

        # Read the next block.
        self.fileobj.seek(self.offset)
        tarinfo = None
        while True:
            try:
                # Parse one header block (plus any extended headers) at
                # the current offset; fromtarfile advances self.offset.
                tarinfo = self.tarinfo.fromtarfile(self)
            except EOFHeaderError as e:
                if self.ignore_zeros:
                    # Treat the all-zero block as garbage: skip one
                    # block and keep scanning for the next header.
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
            except InvalidHeaderError as e:
                if self.ignore_zeros:
                    self._dbg(2, "0x%X: %s" % (self.offset, e))
                    self.offset += BLOCKSIZE
                    continue
                elif self.offset == 0:
                    # A bad header at offset 0 means this is not a tar
                    # file at all.
                    raise ReadError(str(e))
            except EmptyHeaderError:
                if self.offset == 0:
                    raise ReadError("empty file")
            except TruncatedHeaderError as e:
                if self.offset == 0:
                    raise ReadError(str(e))
            except SubsequentHeaderError as e:
                # A corrupt header after valid members is always fatal.
                raise ReadError(str(e))
            # Either a member was read or a non-fatal error ended the
            # scan; fall through with tarinfo possibly still None.
            break

        if tarinfo is not None:
            self.members.append(tarinfo)
        else:
            # No further members: the archive is now fully scanned.
            self._loaded = True

        return tarinfo
2284 2285 2286 2287

    #--------------------------------------------------------------------------
    # Little helper methods:

2288
    def _getmember(self, name, tarinfo=None, normalize=False):
2289 2290 2291
        """Find an archive member by name from bottom to top.
           If tarinfo is given, it is used as the starting point.
        """
2292 2293 2294
        # Ensure that all members have been loaded.
        members = self.getmembers()

2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306
        # Limit the member search list up to tarinfo.
        if tarinfo is not None:
            members = members[:members.index(tarinfo)]

        if normalize:
            name = os.path.normpath(name)

        for member in reversed(members):
            if normalize:
                member_name = os.path.normpath(member.name)
            else:
                member_name = member.name
2307

2308 2309
            if name == member_name:
                return member
Andrew M. Kuchling's avatar
Andrew M. Kuchling committed
2310

2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325
    def _load(self):
        """Read through the entire archive file and look for readable
           members.
        """
        while True:
            tarinfo = self.next()
            if tarinfo is None:
                break
        self._loaded = True

    def _check(self, mode=None):
        """Check if TarFile is still open, and if the operation's mode
           corresponds to TarFile's mode.
        """
        if self.closed:
2326
            raise IOError("%s is closed" % self.__class__.__name__)
2327 2328
        if mode is not None and self.mode not in mode:
            raise IOError("bad operation for mode %r" % self.mode)
2329

2330 2331 2332 2333 2334 2335
    def _find_link_target(self, tarinfo):
        """Find the target member of a symlink or hardlink member in the
           archive.
        """
        if tarinfo.issym():
            # Always search the entire archive.
2336
            linkname = "/".join(filter(None, (os.path.dirname(tarinfo.name), tarinfo.linkname)))
2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348
            limit = None
        else:
            # Search the archive before the link, because a hard link is
            # just a reference to an already archived file.
            linkname = tarinfo.linkname
            limit = tarinfo

        member = self._getmember(linkname, tarinfo=limit, normalize=True)
        if member is None:
            raise KeyError("linkname %r not found" % linkname)
        return member

2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360
    def __iter__(self):
        """Provide an iterator object.
        """
        if self._loaded:
            return iter(self.members)
        else:
            return TarIter(self)

    def _dbg(self, level, msg):
        """Write debugging output to sys.stderr.
        """
        if level <= self.debug:
2361
            print(msg, file=sys.stderr)
2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375

    def __enter__(self):
        self._check()
        return self

    def __exit__(self, type, value, traceback):
        if type is None:
            self.close()
        else:
            # An exception occurred. We must not call close() because
            # it would try to write end-of-archive blocks and padding.
            if not self._extfileobj:
                self.fileobj.close()
            self.closed = True
2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388
# class TarFile

class TarIter:
    """Iterator Class.

       for tarinfo in TarFile(...):
           suite...
    """

    def __init__(self, tarfile):
        """Construct a TarIter object.
        """
        self.tarfile = tarfile
        # Position within tarfile.members reached so far.
        self.index = 0

    def __iter__(self):
        """Return iterator object.
        """
        return self

    def __next__(self):
        """Return the next item using TarFile's next() method.
           When all members have been read, set TarFile as _loaded.
        """
        tf = self.tarfile
        # Fix for SF #1100429: getmembers() may be called during
        # iteration and fill tf.members behind our back, which would
        # otherwise cause TarIter to stop prematurely -- so prefer the
        # cached list whenever our index is still inside it.
        if self.index == 0 and tf.firstmember is not None:
            member = tf.next()
        elif self.index < len(tf.members):
            member = tf.members[self.index]
        elif not tf._loaded:
            member = tf.next()
            if not member:
                tf._loaded = True
                raise StopIteration
        else:
            raise StopIteration

        self.index += 1
        return member

#--------------------
# exported functions
#--------------------
def is_tarfile(name):
    """Return True if name points to a tar archive that we
       are able to handle, else return False.
    """
    # Opening probes the archive; any TarError means "not ours".
    try:
        archive = open(name)
        archive.close()
    except TarError:
        return False
    else:
        return True

2430
bltn_open = open
2431
open = TarFile.open