Commit b5e4e424 authored by Kirill Smelkov's avatar Kirill Smelkov

Merge branch 't2' into t

* t2: (44 commits)
  .
  X wcfs: hook in δFtail.Forget
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  ...
parents c5341182 2ffa7d57
......@@ -183,6 +183,7 @@
// (*) see "Wcfs locking organization" in wcfs.go
// (%) see related comment in Conn.__pin1 for details.
// Handling of fork
//
// When a process calls fork, OS copies its memory and creates child process
......
......@@ -160,7 +160,7 @@ cdef extern from "<fcntl.h>" nogil:
int posix_fadvise(int fd, off_t offset, off_t len, int advice);
enum: POSIX_FADV_DONTNEED
# fadvise_dontneed teels the kernel that file<fd>[offset +len) is not needed.
# fadvise_dontneed tells the kernel that file<fd>[offset +len) is not needed.
#
# see fadvise(2) for details.
def fadvise_dontneed(int fd, off_t offset, off_t len):
......
......@@ -118,6 +118,6 @@ func TestPPTreeSubSetOps(t *testing.T) {
assert1("difference", tt.A, tt.A, Daa, S{})
assert1("difference", tt.B, tt.B, Dbb, S{})
// XXX also verify U/D properties like (A+B)\B + (A+B)\A + (A^B) == (A+B) ?
// TODO also verify U/D properties like (A+B)\B + (A+B)\A + (A^B) == (A+B) ?
}
}
......@@ -194,6 +194,8 @@ type _ΔFileTail struct {
vδE []_ΔFileEpoch // epochs (changes to ZBigFile object itself) ; nil if not yet rebuilt
rebuildJob *_RebuildJob // !nil if vδE rebuild is currently in-progress
btrackReqSet setI64 // set of blocks explicitly requested to be tracked in this file
}
// _ΔFileEpoch represent a change to ZBigFile object.
......@@ -260,7 +262,8 @@ func (δFtail *ΔFtail) Tail() zodb.Tid { return δFtail.δBtail.Tail() }
// One root can be associated with several files (each provided on different Track calls).
//
// zblk can be nil, which represents a hole.
// if zblk is nil -> blk is ignored and can be arbitrary.
// blk can be < 0, which requests not to establish file[blk] -> zblk
// association. zblk must be nil in this case.
//
// Objects in path and zblk must be with .PJar().At() == .head
func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, blkcov btree.LKeyRange, zblk ZBlk) {
......@@ -299,7 +302,11 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, bl
δftail, ok := δFtail.byFile[foid]
if !ok {
δftail = &_ΔFileTail{root: root, vδE: nil /*will need to be rebuilt to past till tail*/}
δftail = &_ΔFileTail{
root: root,
vδE: nil /*will need to be rebuilt to past till tail*/,
btrackReqSet: setI64{},
}
δFtail.byFile[foid] = δftail
δFtail.ftrackNew.Add(foid)
}
......@@ -307,10 +314,16 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, bl
// .root can change during epochs, but in between them it must be stable
panicf("BUG: zfile<%s> root mutated from %s -> %s", foid, δftail.root, root)
}
if blk >= 0 {
δftail.btrackReqSet.Add(blk)
}
// associate zblk with root, if it was not hole
if zblk != nil {
if blk < 0 {
panicf("BUG: zfile<%s>: blk=%d, but zblk != nil", foid, blk)
}
zoid := zblk.POid()
inroot, ok := δFtail.ztrackInRoot[zoid]
......@@ -333,15 +346,18 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, bl
//
// It builds vδE for that file if there is such need.
// The only case when vδE actually needs to be built is when the file just started to be tracked.
func (δFtail *ΔFtail) vδEForFile(foid zodb.Oid) (vδE []_ΔFileEpoch, headRoot zodb.Oid, err error) {
//
// It also returns δftail for convenience.
// NOTE access to returned δftail must be protected via δFtail.mu.
func (δFtail *ΔFtail) vδEForFile(foid zodb.Oid) (vδE []_ΔFileEpoch, headRoot zodb.Oid, δftail *_ΔFileTail, err error) {
δFtail.mu.Lock() // TODO verify that there is no in-progress writers
defer δFtail.mu.Unlock()
δftail := δFtail.byFile[foid]
δftail = δFtail.byFile[foid]
root := δftail.root
vδE = δftail.vδE
if vδE != nil {
return vδE, root, nil
return vδE, root, δftail, nil
}
// vδE needs to be built
......@@ -355,7 +371,7 @@ func (δFtail *ΔFtail) vδEForFile(foid zodb.Oid) (vδE []_ΔFileEpoch, headRoo
δFtail.mu.Lock()
vδE = δftail.vδE
}
return vδE, root, job.err
return vδE, root, δftail, job.err
}
// we become responsible to build vδE
......@@ -379,7 +395,7 @@ func (δFtail *ΔFtail) vδEForFile(foid zodb.Oid) (vδE []_ΔFileEpoch, headRoo
job.err = err
close(job.ready)
return vδE, root, err
return vδE, root, δftail, err
}
// _rebuildAll rebuilds vδE for all files from ftrackNew requests.
......@@ -473,6 +489,7 @@ func (δFtail *ΔFtail) Update(δZ *zodb.EventCommit) (_ ΔF, err error) {
// NOTE no need to clone vδE: we are writer, vδE is never returned to
// outside, append does not invalidate previous vδE retrievals.
δftail.vδE = append(δftail.vδE, δE)
δftail.btrackReqSet = setI64{}
}
}
......@@ -674,16 +691,32 @@ type _ZinblkOverlay struct {
//
// Note: contrary to regular go slicing, low is exclusive while high is inclusive.
func (δFtail *ΔFtail) SliceByFileRev(zfile *ZBigFile, lo, hi zodb.Tid) (/*readonly*/[]*ΔFile, error) {
return δFtail.SliceByFileRevEx(zfile, lo, hi, QueryOptions{})
}
// SliceByFileRevEx is extended version of SliceByFileRev with options.
func (δFtail *ΔFtail) SliceByFileRevEx(zfile *ZBigFile, lo, hi zodb.Tid, opt QueryOptions) (/*readonly*/[]*ΔFile, error) {
foid := zfile.POid()
//fmt.Printf("\nslice f<%s> (@%s,@%s]\n", foid, lo, hi)
vδf, err := δFtail._SliceByFileRev(foid, lo, hi)
vδf, err := δFtail._SliceByFileRev(foid, lo, hi, opt)
if err != nil {
err = fmt.Errorf("slice f<%s> (@%s,@%s]: %e", foid, lo, hi, err)
}
return vδf, err
}
func (δFtail *ΔFtail) _SliceByFileRev(foid zodb.Oid, lo, hi zodb.Tid) (/*readonly*/[]*ΔFile, error) {
// QueryOptions represents options for SliceBy* queries.
type QueryOptions struct {
// OnlyExplicitlyTracked requests that only blocks, that were
// explicitly tracked, are included into result.
//
// By default SliceBy* return information about both blocks that
// were explicitly tracked, and blocks that became tracked due to being
// adjacent to a tracked block in BTree bucket.
OnlyExplicitlyTracked bool
}
func (δFtail *ΔFtail) _SliceByFileRev(foid zodb.Oid, lo, hi zodb.Tid, opt QueryOptions) (/*readonly*/[]*ΔFile, error) {
xtail.AssertSlice(δFtail, lo, hi)
// query .δBtail.SliceByRootRev(file.blktab, lo, hi) +
......@@ -703,7 +736,7 @@ func (δFtail *ΔFtail) _SliceByFileRev(foid zodb.Oid, lo, hi zodb.Tid) (/*reado
// δFile ────────o───────o──────x─────x────────────────────────
vδE, headRoot, err := δFtail.vδEForFile(foid)
vδE, headRoot, δftail, err := δFtail.vδEForFile(foid)
if err != nil {
return nil, err
}
......@@ -926,6 +959,40 @@ func (δFtail *ΔFtail) _SliceByFileRev(foid zodb.Oid, lo, hi zodb.Tid) (/*reado
vδf[i], vδf[j] = vδf[j], vδf[i]
}
// take opt.OnlyExplicitlyTracked into account
// XXX epochs not handled (currently ok as epochs are rejected by wcfs)
if opt.OnlyExplicitlyTracked {
δblk := setI64{}
for _, δf := range vδf {
δblk.Update(δf.Blocks)
}
δFtail.mu.Lock()
for blk := range δblk {
if !δftail.btrackReqSet.Has(blk) {
δblk.Del(blk)
}
}
δFtail.mu.Unlock()
for i := len(vδf)-1; i >= 0; i-- {
δf := vδf[i]
if δf.Epoch {
continue
}
for blk := range δf.Blocks {
if !δblk.Has(blk) {
δf.Blocks.Del(blk)
}
}
if len(δf.Blocks) == 0 {
// delete @i
copy(vδf[i:], vδf[i+1:])
vδf = vδf[:len(vδf)-1]
}
}
}
return vδf, nil
}
......@@ -1017,7 +1084,7 @@ func (δFtail *ΔFtail) BlkRevAt(ctx context.Context, zfile *ZBigFile, blk int64
panicf("zconn.at out of bounds: zconn.at: @%s, (tail, head] = (@%s, @%s]", zconnAt, tail, head)
}
vδE, headRoot, err := δFtail.vδEForFile(foid)
vδE, headRoot, _, err := δFtail.vδEForFile(foid)
if err != nil {
return zodb.InvalidTid, false, err
}
......
......@@ -470,7 +470,7 @@ func testΔFtail(t_ *testing.T, testq chan ΔFTestEntry) {
trackZinroot := map[string]setOid{}
for zoid, inroot := range δFtail.ztrackInRoot {
zblki := commit.ZBlkTab[zoid]
trackZinroot[zblki.Name] = inroot.Clone() // XXX clone needed?
trackZinroot[zblki.Name] = inroot
}
Zinroot := map[string]setOid{}
for zblk := range Zinblk {
......@@ -494,7 +494,7 @@ func testΔFtail(t_ *testing.T, testq chan ΔFTestEntry) {
} else {
for zoid, inblk := range rt.ztrackInBlk {
zblki := commit.ZBlkTab[zoid]
trackZinblk[zblki.Name] = inblk.Clone() // XXX clone needed?
trackZinblk[zblki.Name] = inblk
}
}
......@@ -609,6 +609,8 @@ func testΔFtail(t_ *testing.T, testq chan ΔFTestEntry) {
// SliceByFileRev returns all changes to that untracked block. In other words
// we verify that no change to untracked block is missed, if any change to that
// block is ever present in returned slice.
//
// This test also verifies handling of OnlyExplicitlyTracked query option.
func TestΔFtailSliceUntrackedUniform(t_ *testing.T) {
t := newT(t_)
X := exc.Raiseif
......@@ -651,37 +653,48 @@ func TestΔFtailSliceUntrackedUniform(t_ *testing.T) {
// blktab[2] remains unnoticed because it is not changed past at1.
xtrackBlk(0)
// (at1, at4] -> changes to both 0 and 1, because they both are changed in the same bucket @at2
lo := t1.At
hi := t4.At
vδf, err := δFtail.SliceByFileRev(zfile, lo, hi); X(err)
vδf_ok := []*ΔFile{
&ΔFile{Rev: t2.At, Blocks: b(0,1), Size: true},
&ΔFile{Rev: t3.At, Blocks: b(0,1), Size: false},
&ΔFile{Rev: t4.At, Blocks: b( 1), Size: false},
}
// assertSliceByFileRev verifies result of SliceByFileRev and SliceByFileRevEx(OnlyExplicitlyTracked=y).
assertSliceByFileRev := func(lo, hi zodb.Tid, vδf_ok, vδfT_ok []*ΔFile) {
t.Helper()
Tonly := QueryOptions{OnlyExplicitlyTracked: true}
vδf, err := δFtail.SliceByFileRev (zfile, lo, hi); X(err)
vδfT, err := δFtail.SliceByFileRevEx(zfile, lo, hi, Tonly); X(err)
if !reflect.DeepEqual(vδf, vδf_ok) {
t.Errorf("slice (@%s,@%s]:\nhave: %v\nwant: %v", t.AtSymb(lo), t.AtSymb(hi), t.vδfstr(vδf), t.vδfstr(vδf_ok))
}
if !reflect.DeepEqual(vδfT, vδfT_ok) {
t.Errorf("sliceT (@%s,@%s]:\nhave: %v\nwant: %v", t.AtSymb(lo), t.AtSymb(hi), t.vδfstr(vδfT), t.vδfstr(vδfT_ok))
}
}
// (at1, at4] -> changes to both 0 and 1, because they both are changed in the same bucket @at2
assertSliceByFileRev(t1.At, t4.At,
/*vδf*/ []*ΔFile{
&ΔFile{Rev: t2.At, Blocks: b(0,1), Size: true},
&ΔFile{Rev: t3.At, Blocks: b(0,1), Size: false},
&ΔFile{Rev: t4.At, Blocks: b( 1), Size: false},
},
/*vδfT*/ []*ΔFile{
&ΔFile{Rev: t2.At, Blocks: b(0 ), Size: true},
&ΔFile{Rev: t3.At, Blocks: b(0 ), Size: false},
// no change @at4
})
// (at2, at4] -> changes to only 0, because there is no change to 2 via blktab
lo = t2.At
vδf, err = δFtail.SliceByFileRev(zfile, lo, hi); X(err)
vδf_ok = []*ΔFile{
assertSliceByFileRev(t2.At, t4.At,
/*vδf*/ []*ΔFile{
&ΔFile{Rev: t3.At, Blocks: b(0), Size: false},
}
if !reflect.DeepEqual(vδf, vδf_ok) {
t.Errorf("slice (@%s,@%s]:\nhave: %v\nwant: %v", t.AtSymb(lo), t.AtSymb(hi), t.vδfstr(vδf), t.vδfstr(vδf_ok))
}
},
/*vδfT*/ []*ΔFile{
&ΔFile{Rev: t3.At, Blocks: b(0), Size: false},
})
// (at3, at4] -> changes to only 0, ----/----
lo = t3.At
vδf, err = δFtail.SliceByFileRev(zfile, lo, hi); X(err)
vδf_ok = []*ΔFile(nil)
if !reflect.DeepEqual(vδf, vδf_ok) {
t.Errorf("slice (@%s,@%s]:\nhave: %v\nwant: %v", t.AtSymb(lo), t.AtSymb(hi), t.vδfstr(vδf), t.vδfstr(vδf_ok))
}
assertSliceByFileRev(t3.At, t4.At,
/*vδf*/ []*ΔFile(nil),
/*vδfT*/ []*ΔFile(nil))
}
......
......@@ -57,10 +57,13 @@ Trees _and_ Buckets nodes - would be required.
-> we took the approach where we send invalidation to client about a block
lazily only when the block is actually accessed.
XXX building δFtail lazily along serving fuse reads during scope of one
transaction is not trivial and creates concurrency bottlenecks if simple
Rejected alternative:
Building δFtail lazily along serving FUSE reads during scope of one
transaction is not trivial and would create concurrency bottlenecks if simple
locking scheme is used. With the main difficulty being to populate tracking set
of δBtree lazily. However as the first approach we can still build complete
of δBtree lazily. However as the first approach we could still build complete
tracking set for a BTree at the time of file open: we need to scan through all
trees but _not_ buckets: this way we'll know oid of all tree nodes: trees _and_
buckets, while avoiding loading buckets makes this approach practical: with
......@@ -69,9 +72,11 @@ require ~ 20 trees to cover 1TB of data. And we can scan those trees very
quickly even if doing so serially. For 1PB of data it will require to scan ~
10⁴ trees. If RTT to load 1 object is ~1ms this will become 10 seconds if done
serially. However if we load all those tree objects in parallel it will be
much less. Still the number of trees to scan is linear to the amount of data
and it would be good to address the shortcoming of doing whole file index scan
later.
much less. Still the number of trees to scan is linear to the amount of data.
-> rejected: ΔFtail and ΔBtail were instead fixed to allow several Track and
queries requests to run in parallel. See "Concurrency" section in ΔFtail/ΔBtail
organization overview.
Changing mmapping while under pagefault is possible
......@@ -107,7 +112,7 @@ We can change a mapping while a page from it is under pagefault:
* https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/filemap.c?id=v4.20-rc3-83-g06e68fed3282#n2457
* https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/filemap.c?id=v4.20-rc3-83-g06e68fed3282#n1301
- the filesystem server upon receiving the read request can manipulate
- the filesystem server, upon receiving the read request, can manipulate
client's address space. This requires to write-lock client->mm->mmap_sem,
but we can be sure it won't deadlock because the kernel releases it
before waiting (see previous point).
......
......@@ -42,7 +42,7 @@ digraph {
zobj2file -> zblk2file;
zobj2file -> zbtree2file;
zbtree2file -> δBTree [color=grey];
zbtree2file -> δBTree;
// wcfs_simple -> Btree_read;
// wcfs_simple -> ZBlk_read;
......@@ -75,8 +75,8 @@ digraph {
wcfsInvProcess [label="process\nZODB invalidations", style=filled fillcolor=grey95]
zconnCacheGet [label="zonn.\n.Cache.Get", style=filled fillcolor=lightyellow]
zobj2file [label="Z* → file/[]#blk", style=filled fillcolor=grey95]
zblk2file [label="ZBlk*\n↓\nfile/[]#blk", style=filled fillcolor=lightyellow]
zbtree2file [label="BTree/Bucket\n↓\nfile/[]#blk"]
zblk2file [label="ZBlk*\n↓\nfile/[]#blk", style=filled fillcolor=grey95]
zbtree2file [label="BTree/Bucket\n↓\nfile/[]#blk", style=filled fillcolor=grey95]
δBTree [label="δ(BTree)", style=filled fillcolor=grey95]
fuseRetrieveCache [label="FUSE:\nretrieve cache", style=filled fillcolor=lightyellow]
......
......@@ -316,7 +316,7 @@
<!-- zblk2file -->
<g id="node22" class="node">
<title>zblk2file</title>
<ellipse fill="lightyellow" stroke="black" cx="62.23" cy="-109.48" rx="62.45" ry="37.45"/>
<ellipse fill="#f2f2f2" stroke="black" cx="62.23" cy="-109.48" rx="62.45" ry="37.45"/>
<text text-anchor="middle" x="62.23" y="-120.78" font-family="Times,serif" font-size="14.00">ZBlk*</text>
<text text-anchor="middle" x="62.23" y="-105.78" font-family="Times,serif" font-size="14.00"></text>
<text text-anchor="middle" x="62.23" y="-90.78" font-family="Times,serif" font-size="14.00">file/[]#blk</text>
......@@ -330,7 +330,7 @@
<!-- zbtree2file -->
<g id="node23" class="node">
<title>zbtree2file</title>
<ellipse fill="none" stroke="black" cx="222.23" cy="-109.48" rx="79.81" ry="37.45"/>
<ellipse fill="#f2f2f2" stroke="black" cx="222.23" cy="-109.48" rx="79.81" ry="37.45"/>
<text text-anchor="middle" x="222.23" y="-120.78" font-family="Times,serif" font-size="14.00">BTree/Bucket</text>
<text text-anchor="middle" x="222.23" y="-105.78" font-family="Times,serif" font-size="14.00"></text>
<text text-anchor="middle" x="222.23" y="-90.78" font-family="Times,serif" font-size="14.00">file/[]#blk</text>
......@@ -368,8 +368,8 @@
<!-- zbtree2file&#45;&gt;δBTree -->
<g id="edge24" class="edge">
<title>zbtree2file&#45;&gt;δBTree</title>
<path fill="none" stroke="grey" d="M222.23,-71.82C222.23,-63.33 222.23,-54.43 222.23,-46.42"/>
<polygon fill="grey" stroke="grey" points="225.73,-46.15 222.23,-36.15 218.73,-46.15 225.73,-46.15"/>
<path fill="none" stroke="black" d="M222.23,-71.82C222.23,-63.33 222.23,-54.43 222.23,-46.42"/>
<polygon fill="black" stroke="black" points="225.73,-46.15 222.23,-36.15 218.73,-46.15 225.73,-46.15"/>
</g>
<!-- clientInvHandle&#45;&gt;headWatch -->
<g id="edge29" class="edge">
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment