Commit 3bece2fa authored by Joe Tsai's avatar Joe Tsai Committed by Joe Tsai

archive/tar: refactor Reader support for sparse files

This CL is the first step (of two) for adding sparse file support
to the Writer. This CL only refactors the logic of sparse-file handling
in the Reader so that common logic can be easily shared by the Writer.

As a result of this CL, there are some new publicly visible API changes:
	type SparseEntry struct { Offset, Length int64 }
	type Header struct { ...; SparseHoles []SparseEntry }

A new type is defined to represent a sparse fragment and a new field
Header.SparseHoles is added to represent the sparse holes in a file.
The API intentionally represent sparse files using hole fragments,
rather than data fragments so that the zero value of SparseHoles
naturally represents a normal file (i.e., a file without any holes).
The Reader now populates SparseHoles for sparse files.

It is necessary to export the sparse hole information, otherwise it would
be impossible for the Writer to specify that it is trying to encode
a sparse file, and what it looks like.

Some unexported helper functions were added to common.go:
	func validateSparseEntries(sp []SparseEntry, size int64) bool
	func alignSparseEntries(src []SparseEntry, size int64) []SparseEntry
	func invertSparseEntries(src []SparseEntry, size int64) []SparseEntry

The validation logic that used to be in newSparseFileReader is now moved
to validateSparseEntries so that the Writer can use it in the future.
alignSparseEntries is currently unused by the Reader, but will be used
by the Writer in the future. Since TAR represents sparse files by
only recording the data fragments, we add the invertSparseEntries
function to convert a list of data fragments to a normalized list
of hole fragments (and vice-versa).

Some other high-level changes:
* skipUnread is deleted, where most of it's logic is moved to the
Discard methods on regFileReader and sparseFileReader.
* readGNUSparsePAXHeaders was rewritten to be simpler.
* regFileReader and sparseFileReader were completely rewritten
in simpler and easier to understand logic.
* A bug was fixed in sparseFileReader.Read where it failed to
report an error if the logical size of the file ends before
consuming all of the underlying data.
* The tests for sparse-file support was completely rewritten.

Updates #13548

Change-Id: Ic1233ae5daf3b3f4278fe1115d34a90c4aeaf0c2
Reviewed-on: https://go-review.googlesource.com/56771
Run-TryBot: Joe Tsai <thebrokentoaster@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: default avatarIan Lance Taylor <iant@golang.org>
parent b2174a16
......@@ -15,6 +15,7 @@ package tar
import (
"errors"
"fmt"
"math"
"os"
"path"
"strconv"
......@@ -30,6 +31,8 @@ var (
ErrWriteTooLong = errors.New("tar: write too long")
ErrFieldTooLong = errors.New("tar: header field too long")
ErrWriteAfterClose = errors.New("tar: write after close")
errMissData = errors.New("tar: sparse file references non-existent data")
errUnrefData = errors.New("tar: sparse file contains unreferenced data")
)
// Header type flags.
......@@ -68,6 +71,131 @@ type Header struct {
AccessTime time.Time // access time
ChangeTime time.Time // status change time
Xattrs map[string]string
// SparseHoles represents a sequence of holes in a sparse file.
//
// The regions must be sorted in ascending order, not overlap with
// each other, and not extend past the specified Size.
// The file is sparse if either len(SparseHoles) > 0 or
// the Typeflag is set to TypeGNUSparse.
SparseHoles []SparseEntry
}
// SparseEntry represents a Length-sized fragment at Offset in the file.
type SparseEntry struct{ Offset, Length int64 }
func (s SparseEntry) endOffset() int64 { return s.Offset + s.Length }
// A sparse file can be represented as either a sparseDatas or a sparseHoles.
// As long as the total size is known, they are equivalent and one can be
// converted to the other form and back. The various tar formats with sparse
// file support represent sparse files in the sparseDatas form. That is, they
// specify the fragments in the file that has data, and treat everything else as
// having zero bytes. As such, the encoding and decoding logic in this package
// deals with sparseDatas.
//
// However, the external API uses sparseHoles instead of sparseDatas because the
// zero value of sparseHoles logically represents a normal file (i.e., there are
// no holes in it). On the other hand, the zero value of sparseDatas implies
// that the file has no data in it, which is rather odd.
//
// As an example, if the underlying raw file contains the 10-byte data:
// var compactFile = "abcdefgh"
//
// And the sparse map has the following entries:
// var spd sparseDatas = []sparseEntry{
// {Offset: 2, Length: 5}, // Data fragment for 2..6
// {Offset: 18, Length: 3}, // Data fragment for 18..20
// }
// var sph sparseHoles = []SparseEntry{
// {Offset: 0, Length: 2}, // Hole fragment for 0..1
// {Offset: 7, Length: 11}, // Hole fragment for 7..17
// {Offset: 21, Length: 4}, // Hole fragment for 21..24
// }
//
// Then the content of the resulting sparse file with a Header.Size of 25 is:
// var sparseFile = "\x00"*2 + "abcde" + "\x00"*11 + "fgh" + "\x00"*4
type (
sparseDatas []SparseEntry
sparseHoles []SparseEntry
)
// validateSparseEntries reports whether sp is a valid sparse map.
// It does not matter whether sp represents data fragments or hole fragments.
func validateSparseEntries(sp []SparseEntry, size int64) bool {
// Validate all sparse entries. These are the same checks as performed by
// the BSD tar utility.
if size < 0 {
return false
}
var pre SparseEntry
for _, cur := range sp {
switch {
case cur.Offset < 0 || cur.Length < 0:
return false // Negative values are never okay
case cur.Offset > math.MaxInt64-cur.Length:
return false // Integer overflow with large length
case cur.endOffset() > size:
return false // Region extends beyond the actual size
case pre.endOffset() > cur.Offset:
return false // Regions cannot overlap and must be in order
}
pre = cur
}
return true
}
// alignSparseEntries mutates src and returns dst where each fragment's
// starting offset is aligned up to the nearest block edge, and each
// ending offset is aligned down to the nearest block edge.
//
// Even though the Go tar Reader and the BSD tar utility can handle entries
// with arbitrary offsets and lengths, the GNU tar utility can only handle
// offsets and lengths that are multiples of blockSize.
func alignSparseEntries(src []SparseEntry, size int64) []SparseEntry {
dst := src[:0]
for _, s := range src {
pos, end := s.Offset, s.endOffset()
pos += blockPadding(+pos) // Round-up to nearest blockSize
if end != size {
end -= blockPadding(-end) // Round-down to nearest blockSize
}
if pos < end {
dst = append(dst, SparseEntry{Offset: pos, Length: end - pos})
}
}
return dst
}
// invertSparseEntries converts a sparse map from one form to the other.
// If the input is sparseHoles, then it will output sparseDatas and vice-versa.
// The input must have been already validated.
//
// This function mutates src and returns a normalized map where:
// * adjacent fragments are coalesced together
// * only the last fragment may be empty
// * the endOffset of the last fragment is the total size
func invertSparseEntries(src []SparseEntry, size int64) []SparseEntry {
dst := src[:0]
var pre SparseEntry
for _, cur := range src {
if cur.Length == 0 {
continue // Skip empty fragments
}
pre.Length = cur.Offset - pre.Offset
if pre.Length > 0 {
dst = append(dst, pre) // Only add non-empty fragments
}
pre.Offset = cur.endOffset()
}
pre.Length = size - pre.Offset // Possibly the only empty fragment
return append(dst, pre)
}
type fileState interface {
// Remaining reports the number of remaining bytes in the current file.
// This count includes any sparse holes that may exist.
Remaining() int64
}
// FileInfo returns an os.FileInfo for the Header.
......@@ -300,6 +428,17 @@ const (
paxUname = "uname"
paxXattr = "SCHILY.xattr."
paxNone = ""
// Keywords for GNU sparse files in a PAX extended header.
paxGNUSparseNumBlocks = "GNU.sparse.numblocks"
paxGNUSparseOffset = "GNU.sparse.offset"
paxGNUSparseNumBytes = "GNU.sparse.numbytes"
paxGNUSparseMap = "GNU.sparse.map"
paxGNUSparseName = "GNU.sparse.name"
paxGNUSparseMajor = "GNU.sparse.major"
paxGNUSparseMinor = "GNU.sparse.minor"
paxGNUSparseSize = "GNU.sparse.size"
paxGNUSparseRealSize = "GNU.sparse.realsize"
)
// FileInfoHeader creates a partially-populated Header from fi.
......@@ -373,6 +512,9 @@ func FileInfoHeader(fi os.FileInfo, link string) (*Header, error) {
h.Size = 0
h.Linkname = sys.Linkname
}
if sys.SparseHoles != nil {
h.SparseHoles = append([]SparseEntry{}, sys.SparseHoles...)
}
}
if sysStat != nil {
return h, sysStat(fi, h)
......@@ -390,3 +532,10 @@ func isHeaderOnlyType(flag byte) bool {
return false
}
}
func min(a, b int64) int64 {
if a < b {
return a
}
return b
}
......@@ -50,6 +50,12 @@ const (
prefixSize = 155 // Max length of the prefix field in USTAR format
)
// blockPadding computes the number of bytes needed to pad offset up to the
// nearest block edge where 0 <= n < blockSize.
func blockPadding(offset int64) (n int64) {
return -offset & (blockSize - 1)
}
var zeroBlock block
type block [blockSize]byte
......@@ -192,11 +198,11 @@ func (h *headerUSTAR) Prefix() []byte { return h[345:][:155] }
type sparseArray []byte
func (s sparseArray) Entry(i int) sparseNode { return (sparseNode)(s[i*24:]) }
func (s sparseArray) Entry(i int) sparseElem { return (sparseElem)(s[i*24:]) }
func (s sparseArray) IsExtended() []byte { return s[24*s.MaxEntries():][:1] }
func (s sparseArray) MaxEntries() int { return len(s) / 24 }
type sparseNode []byte
type sparseElem []byte
func (s sparseNode) Offset() []byte { return s[00:][:12] }
func (s sparseNode) NumBytes() []byte { return s[12:][:12] }
func (s sparseElem) Offset() []byte { return s[00:][:12] }
func (s sparseElem) Length() []byte { return s[12:][:12] }
This diff is collapsed.
This diff is collapsed.
......@@ -19,6 +19,116 @@ import (
"time"
)
func equalSparseEntries(x, y []SparseEntry) bool {
return (len(x) == 0 && len(y) == 0) || reflect.DeepEqual(x, y)
}
func TestSparseEntries(t *testing.T) {
vectors := []struct {
in []SparseEntry
size int64
wantValid bool // Result of validateSparseEntries
wantAligned []SparseEntry // Result of alignSparseEntries
wantInverted []SparseEntry // Result of invertSparseEntries
}{{
in: []SparseEntry{}, size: 0,
wantValid: true,
wantInverted: []SparseEntry{{0, 0}},
}, {
in: []SparseEntry{}, size: 5000,
wantValid: true,
wantInverted: []SparseEntry{{0, 5000}},
}, {
in: []SparseEntry{{0, 5000}}, size: 5000,
wantValid: true,
wantAligned: []SparseEntry{{0, 5000}},
wantInverted: []SparseEntry{{5000, 0}},
}, {
in: []SparseEntry{{1000, 4000}}, size: 5000,
wantValid: true,
wantAligned: []SparseEntry{{1024, 3976}},
wantInverted: []SparseEntry{{0, 1000}, {5000, 0}},
}, {
in: []SparseEntry{{0, 3000}}, size: 5000,
wantValid: true,
wantAligned: []SparseEntry{{0, 2560}},
wantInverted: []SparseEntry{{3000, 2000}},
}, {
in: []SparseEntry{{3000, 2000}}, size: 5000,
wantValid: true,
wantAligned: []SparseEntry{{3072, 1928}},
wantInverted: []SparseEntry{{0, 3000}, {5000, 0}},
}, {
in: []SparseEntry{{2000, 2000}}, size: 5000,
wantValid: true,
wantAligned: []SparseEntry{{2048, 1536}},
wantInverted: []SparseEntry{{0, 2000}, {4000, 1000}},
}, {
in: []SparseEntry{{0, 2000}, {8000, 2000}}, size: 10000,
wantValid: true,
wantAligned: []SparseEntry{{0, 1536}, {8192, 1808}},
wantInverted: []SparseEntry{{2000, 6000}, {10000, 0}},
}, {
in: []SparseEntry{{0, 2000}, {2000, 2000}, {4000, 0}, {4000, 3000}, {7000, 1000}, {8000, 0}, {8000, 2000}}, size: 10000,
wantValid: true,
wantAligned: []SparseEntry{{0, 1536}, {2048, 1536}, {4096, 2560}, {7168, 512}, {8192, 1808}},
wantInverted: []SparseEntry{{10000, 0}},
}, {
in: []SparseEntry{{0, 0}, {1000, 0}, {2000, 0}, {3000, 0}, {4000, 0}, {5000, 0}}, size: 5000,
wantValid: true,
wantInverted: []SparseEntry{{0, 5000}},
}, {
in: []SparseEntry{{1, 0}}, size: 0,
wantValid: false,
}, {
in: []SparseEntry{{-1, 0}}, size: 100,
wantValid: false,
}, {
in: []SparseEntry{{0, -1}}, size: 100,
wantValid: false,
}, {
in: []SparseEntry{{0, 0}}, size: -100,
wantValid: false,
}, {
in: []SparseEntry{{math.MaxInt64, 3}, {6, -5}}, size: 35,
wantValid: false,
}, {
in: []SparseEntry{{1, 3}, {6, -5}}, size: 35,
wantValid: false,
}, {
in: []SparseEntry{{math.MaxInt64, math.MaxInt64}}, size: math.MaxInt64,
wantValid: false,
}, {
in: []SparseEntry{{3, 3}}, size: 5,
wantValid: false,
}, {
in: []SparseEntry{{2, 0}, {1, 0}, {0, 0}}, size: 3,
wantValid: false,
}, {
in: []SparseEntry{{1, 3}, {2, 2}}, size: 10,
wantValid: false,
}}
for i, v := range vectors {
gotValid := validateSparseEntries(v.in, v.size)
if gotValid != v.wantValid {
t.Errorf("test %d, validateSparseEntries() = %v, want %v", i, gotValid, v.wantValid)
}
if !v.wantValid {
continue
}
gotAligned := alignSparseEntries(append([]SparseEntry{}, v.in...), v.size)
if !equalSparseEntries(gotAligned, v.wantAligned) {
t.Errorf("test %d, alignSparseEntries():\ngot %v\nwant %v", i, gotAligned, v.wantAligned)
}
gotInverted := invertSparseEntries(append([]SparseEntry{}, v.in...), v.size)
if !equalSparseEntries(gotInverted, v.wantInverted) {
t.Errorf("test %d, inverseSparseEntries():\ngot %v\nwant %v", i, gotInverted, v.wantInverted)
}
}
}
func TestFileInfoHeader(t *testing.T) {
fi, err := os.Stat("testdata/small.txt")
if err != nil {
......
......@@ -250,7 +250,7 @@ func (tw *Writer) writeRawHeader(blk *block, size int64, flag byte) error {
size = 0
}
tw.nb = size
tw.pad = -size & (blockSize - 1) // blockSize is a power of two
tw.pad = blockPadding(size)
return nil
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment