Commit b5e4e424 authored by Kirill Smelkov's avatar Kirill Smelkov

Merge branch 't2' into t

* t2: (44 commits)
  .
  X wcfs: hook in δFtail.Forget
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  .
  ...
parents c5341182 2ffa7d57
...@@ -183,6 +183,7 @@ ...@@ -183,6 +183,7 @@
// (*) see "Wcfs locking organization" in wcfs.go // (*) see "Wcfs locking organization" in wcfs.go
// (%) see related comment in Conn.__pin1 for details. // (%) see related comment in Conn.__pin1 for details.
// Handling of fork // Handling of fork
// //
// When a process calls fork, OS copies its memory and creates child process // When a process calls fork, OS copies its memory and creates child process
......
...@@ -160,7 +160,7 @@ cdef extern from "<fcntl.h>" nogil: ...@@ -160,7 +160,7 @@ cdef extern from "<fcntl.h>" nogil:
int posix_fadvise(int fd, off_t offset, off_t len, int advice); int posix_fadvise(int fd, off_t offset, off_t len, int advice);
enum: POSIX_FADV_DONTNEED enum: POSIX_FADV_DONTNEED
# fadvise_dontneed teels the kernel that file<fd>[offset +len) is not needed. # fadvise_dontneed tells the kernel that file<fd>[offset +len) is not needed.
# #
# see fadvise(2) for details. # see fadvise(2) for details.
def fadvise_dontneed(int fd, off_t offset, off_t len): def fadvise_dontneed(int fd, off_t offset, off_t len):
......
...@@ -118,6 +118,6 @@ func TestPPTreeSubSetOps(t *testing.T) { ...@@ -118,6 +118,6 @@ func TestPPTreeSubSetOps(t *testing.T) {
assert1("difference", tt.A, tt.A, Daa, S{}) assert1("difference", tt.A, tt.A, Daa, S{})
assert1("difference", tt.B, tt.B, Dbb, S{}) assert1("difference", tt.B, tt.B, Dbb, S{})
// XXX also verify U/D properties like (A+B)\B + (A+B)\A + (A^B) == (A+B) ? // TODO also verify U/D properties like (A+B)\B + (A+B)\A + (A^B) == (A+B) ?
} }
} }
...@@ -194,6 +194,8 @@ type _ΔFileTail struct { ...@@ -194,6 +194,8 @@ type _ΔFileTail struct {
vδE []_ΔFileEpoch // epochs (changes to ZBigFile object itself) ; nil if not yet rebuilt vδE []_ΔFileEpoch // epochs (changes to ZBigFile object itself) ; nil if not yet rebuilt
rebuildJob *_RebuildJob // !nil if vδE rebuild is currently in-progress rebuildJob *_RebuildJob // !nil if vδE rebuild is currently in-progress
btrackReqSet setI64 // set of blocks explicitly requested to be tracked in this file
} }
// _ΔFileEpoch represents a change to ZBigFile object. // _ΔFileEpoch represents a change to ZBigFile object.
...@@ -260,7 +262,8 @@ func (δFtail *ΔFtail) Tail() zodb.Tid { return δFtail.δBtail.Tail() } ...@@ -260,7 +262,8 @@ func (δFtail *ΔFtail) Tail() zodb.Tid { return δFtail.δBtail.Tail() }
// One root can be associated with several files (each provided on different Track calls). // One root can be associated with several files (each provided on different Track calls).
// //
// zblk can be nil, which represents a hole. // zblk can be nil, which represents a hole.
// if zblk is nil -> blk is ignored and can be arbitrary. // blk can be < 0, which requests not to establish file[blk] -> zblk
// association. zblk must be nil in this case.
// //
// Objects in path and zblk must be with .PJar().At() == .head // Objects in path and zblk must be with .PJar().At() == .head
func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, blkcov btree.LKeyRange, zblk ZBlk) { func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, blkcov btree.LKeyRange, zblk ZBlk) {
...@@ -299,7 +302,11 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, bl ...@@ -299,7 +302,11 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, bl
δftail, ok := δFtail.byFile[foid] δftail, ok := δFtail.byFile[foid]
if !ok { if !ok {
δftail = &_ΔFileTail{root: root, vδE: nil /*will need to be rebuilt to past till tail*/} δftail = &_ΔFileTail{
root: root,
vδE: nil /*will need to be rebuilt to past till tail*/,
btrackReqSet: setI64{},
}
δFtail.byFile[foid] = δftail δFtail.byFile[foid] = δftail
δFtail.ftrackNew.Add(foid) δFtail.ftrackNew.Add(foid)
} }
...@@ -307,10 +314,16 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, bl ...@@ -307,10 +314,16 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, bl
// .root can change during epochs, but in between them it must be stable // .root can change during epochs, but in between them it must be stable
panicf("BUG: zfile<%s> root mutated from %s -> %s", foid, δftail.root, root) panicf("BUG: zfile<%s> root mutated from %s -> %s", foid, δftail.root, root)
} }
if blk >= 0 {
δftail.btrackReqSet.Add(blk)
}
// associate zblk with root, if it was not hole // associate zblk with root, if it was not hole
if zblk != nil { if zblk != nil {
if blk < 0 {
panicf("BUG: zfile<%s>: blk=%d, but zblk != nil", foid, blk)
}
zoid := zblk.POid() zoid := zblk.POid()
inroot, ok := δFtail.ztrackInRoot[zoid] inroot, ok := δFtail.ztrackInRoot[zoid]
...@@ -333,15 +346,18 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, bl ...@@ -333,15 +346,18 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, bl
// //
// It builds vδE for that file if there is such need. // It builds vδE for that file if there is such need.
// The only case when vδE actually needs to be built is when the file just started to be tracked. // The only case when vδE actually needs to be built is when the file just started to be tracked.
func (δFtail *ΔFtail) vδEForFile(foid zodb.Oid) (vδE []_ΔFileEpoch, headRoot zodb.Oid, err error) { //
// It also returns δftail for convenience.
// NOTE access to returned δftail must be protected via δFtail.mu.
func (δFtail *ΔFtail) vδEForFile(foid zodb.Oid) (vδE []_ΔFileEpoch, headRoot zodb.Oid, δftail *_ΔFileTail, err error) {
δFtail.mu.Lock() // TODO verify that there is no in-progress writers δFtail.mu.Lock() // TODO verify that there is no in-progress writers
defer δFtail.mu.Unlock() defer δFtail.mu.Unlock()
δftail := δFtail.byFile[foid] δftail = δFtail.byFile[foid]
root := δftail.root root := δftail.root
vδE = δftail.vδE vδE = δftail.vδE
if vδE != nil { if vδE != nil {
return vδE, root, nil return vδE, root, δftail, nil
} }
// vδE needs to be built // vδE needs to be built
...@@ -355,7 +371,7 @@ func (δFtail *ΔFtail) vδEForFile(foid zodb.Oid) (vδE []_ΔFileEpoch, headRoo ...@@ -355,7 +371,7 @@ func (δFtail *ΔFtail) vδEForFile(foid zodb.Oid) (vδE []_ΔFileEpoch, headRoo
δFtail.mu.Lock() δFtail.mu.Lock()
vδE = δftail.vδE vδE = δftail.vδE
} }
return vδE, root, job.err return vδE, root, δftail, job.err
} }
// we become responsible to build vδE // we become responsible to build vδE
...@@ -379,7 +395,7 @@ func (δFtail *ΔFtail) vδEForFile(foid zodb.Oid) (vδE []_ΔFileEpoch, headRoo ...@@ -379,7 +395,7 @@ func (δFtail *ΔFtail) vδEForFile(foid zodb.Oid) (vδE []_ΔFileEpoch, headRoo
job.err = err job.err = err
close(job.ready) close(job.ready)
return vδE, root, err return vδE, root, δftail, err
} }
// _rebuildAll rebuilds vδE for all files from ftrackNew requests. // _rebuildAll rebuilds vδE for all files from ftrackNew requests.
...@@ -473,6 +489,7 @@ func (δFtail *ΔFtail) Update(δZ *zodb.EventCommit) (_ ΔF, err error) { ...@@ -473,6 +489,7 @@ func (δFtail *ΔFtail) Update(δZ *zodb.EventCommit) (_ ΔF, err error) {
// NOTE no need to clone vδE: we are writer, vδE is never returned to // NOTE no need to clone vδE: we are writer, vδE is never returned to
// outside, append does not invalidate previous vδE retrievals. // outside, append does not invalidate previous vδE retrievals.
δftail.vδE = append(δftail.vδE, δE) δftail.vδE = append(δftail.vδE, δE)
δftail.btrackReqSet = setI64{}
} }
} }
...@@ -674,16 +691,32 @@ type _ZinblkOverlay struct { ...@@ -674,16 +691,32 @@ type _ZinblkOverlay struct {
// //
// Note: contrary to regular go slicing, low is exclusive while high is inclusive. // Note: contrary to regular go slicing, low is exclusive while high is inclusive.
func (δFtail *ΔFtail) SliceByFileRev(zfile *ZBigFile, lo, hi zodb.Tid) (/*readonly*/[]*ΔFile, error) { func (δFtail *ΔFtail) SliceByFileRev(zfile *ZBigFile, lo, hi zodb.Tid) (/*readonly*/[]*ΔFile, error) {
return δFtail.SliceByFileRevEx(zfile, lo, hi, QueryOptions{})
}
// SliceByFileRevEx is extended version of SliceByFileRev with options.
func (δFtail *ΔFtail) SliceByFileRevEx(zfile *ZBigFile, lo, hi zodb.Tid, opt QueryOptions) (/*readonly*/[]*ΔFile, error) {
foid := zfile.POid() foid := zfile.POid()
//fmt.Printf("\nslice f<%s> (@%s,@%s]\n", foid, lo, hi) //fmt.Printf("\nslice f<%s> (@%s,@%s]\n", foid, lo, hi)
vδf, err := δFtail._SliceByFileRev(foid, lo, hi) vδf, err := δFtail._SliceByFileRev(foid, lo, hi, opt)
if err != nil { if err != nil {
err = fmt.Errorf("slice f<%s> (@%s,@%s]: %e", foid, lo, hi, err) err = fmt.Errorf("slice f<%s> (@%s,@%s]: %e", foid, lo, hi, err)
} }
return vδf, err return vδf, err
} }
func (δFtail *ΔFtail) _SliceByFileRev(foid zodb.Oid, lo, hi zodb.Tid) (/*readonly*/[]*ΔFile, error) { // QueryOptions represents options for SliceBy* queries.
type QueryOptions struct {
	// OnlyExplicitlyTracked, when set, restricts the result of a query to
	// blocks whose tracking was requested explicitly.
	//
	// When unset (the default), SliceBy* queries report changes both for
	// explicitly tracked blocks and for blocks that became tracked only
	// because they are adjacent to a tracked block in a BTree bucket.
	OnlyExplicitlyTracked bool
}
func (δFtail *ΔFtail) _SliceByFileRev(foid zodb.Oid, lo, hi zodb.Tid, opt QueryOptions) (/*readonly*/[]*ΔFile, error) {
xtail.AssertSlice(δFtail, lo, hi) xtail.AssertSlice(δFtail, lo, hi)
// query .δBtail.SliceByRootRev(file.blktab, lo, hi) + // query .δBtail.SliceByRootRev(file.blktab, lo, hi) +
...@@ -703,7 +736,7 @@ func (δFtail *ΔFtail) _SliceByFileRev(foid zodb.Oid, lo, hi zodb.Tid) (/*reado ...@@ -703,7 +736,7 @@ func (δFtail *ΔFtail) _SliceByFileRev(foid zodb.Oid, lo, hi zodb.Tid) (/*reado
// δFile ────────o───────o──────x─────x──────────────────────── // δFile ────────o───────o──────x─────x────────────────────────
vδE, headRoot, err := δFtail.vδEForFile(foid) vδE, headRoot, δftail, err := δFtail.vδEForFile(foid)
if err != nil { if err != nil {
return nil, err return nil, err
} }
...@@ -926,6 +959,40 @@ func (δFtail *ΔFtail) _SliceByFileRev(foid zodb.Oid, lo, hi zodb.Tid) (/*reado ...@@ -926,6 +959,40 @@ func (δFtail *ΔFtail) _SliceByFileRev(foid zodb.Oid, lo, hi zodb.Tid) (/*reado
vδf[i], vδf[j] = vδf[j], vδf[i] vδf[i], vδf[j] = vδf[j], vδf[i]
} }
// take opt.OnlyExplicitlyTracked into account
// XXX epochs not handled (currently ok as epochs are rejected by wcfs)
if opt.OnlyExplicitlyTracked {
δblk := setI64{}
for _, δf := range vδf {
δblk.Update(δf.Blocks)
}
δFtail.mu.Lock()
for blk := range δblk {
if !δftail.btrackReqSet.Has(blk) {
δblk.Del(blk)
}
}
δFtail.mu.Unlock()
for i := len(vδf)-1; i >= 0; i-- {
δf := vδf[i]
if δf.Epoch {
continue
}
for blk := range δf.Blocks {
if !δblk.Has(blk) {
δf.Blocks.Del(blk)
}
}
if len(δf.Blocks) == 0 {
// delete @i
copy(vδf[i:], vδf[i+1:])
vδf = vδf[:len(vδf)-1]
}
}
}
return vδf, nil return vδf, nil
} }
...@@ -1017,7 +1084,7 @@ func (δFtail *ΔFtail) BlkRevAt(ctx context.Context, zfile *ZBigFile, blk int64 ...@@ -1017,7 +1084,7 @@ func (δFtail *ΔFtail) BlkRevAt(ctx context.Context, zfile *ZBigFile, blk int64
panicf("zconn.at out of bounds: zconn.at: @%s, (tail, head] = (@%s, @%s]", zconnAt, tail, head) panicf("zconn.at out of bounds: zconn.at: @%s, (tail, head] = (@%s, @%s]", zconnAt, tail, head)
} }
vδE, headRoot, err := δFtail.vδEForFile(foid) vδE, headRoot, _, err := δFtail.vδEForFile(foid)
if err != nil { if err != nil {
return zodb.InvalidTid, false, err return zodb.InvalidTid, false, err
} }
......
...@@ -470,7 +470,7 @@ func testΔFtail(t_ *testing.T, testq chan ΔFTestEntry) { ...@@ -470,7 +470,7 @@ func testΔFtail(t_ *testing.T, testq chan ΔFTestEntry) {
trackZinroot := map[string]setOid{} trackZinroot := map[string]setOid{}
for zoid, inroot := range δFtail.ztrackInRoot { for zoid, inroot := range δFtail.ztrackInRoot {
zblki := commit.ZBlkTab[zoid] zblki := commit.ZBlkTab[zoid]
trackZinroot[zblki.Name] = inroot.Clone() // XXX clone needed? trackZinroot[zblki.Name] = inroot
} }
Zinroot := map[string]setOid{} Zinroot := map[string]setOid{}
for zblk := range Zinblk { for zblk := range Zinblk {
...@@ -494,7 +494,7 @@ func testΔFtail(t_ *testing.T, testq chan ΔFTestEntry) { ...@@ -494,7 +494,7 @@ func testΔFtail(t_ *testing.T, testq chan ΔFTestEntry) {
} else { } else {
for zoid, inblk := range rt.ztrackInBlk { for zoid, inblk := range rt.ztrackInBlk {
zblki := commit.ZBlkTab[zoid] zblki := commit.ZBlkTab[zoid]
trackZinblk[zblki.Name] = inblk.Clone() // XXX clone needed? trackZinblk[zblki.Name] = inblk
} }
} }
...@@ -609,6 +609,8 @@ func testΔFtail(t_ *testing.T, testq chan ΔFTestEntry) { ...@@ -609,6 +609,8 @@ func testΔFtail(t_ *testing.T, testq chan ΔFTestEntry) {
// SliceByFileRev returns all changes to that untracked block. In other words // SliceByFileRev returns all changes to that untracked block. In other words
// we verify that no change to untracked block is missed, if any change to that // we verify that no change to untracked block is missed, if any change to that
// block is ever present in returned slice. // block is ever present in returned slice.
//
// This test also verifies handling of OnlyExplicitlyTracked query option.
func TestΔFtailSliceUntrackedUniform(t_ *testing.T) { func TestΔFtailSliceUntrackedUniform(t_ *testing.T) {
t := newT(t_) t := newT(t_)
X := exc.Raiseif X := exc.Raiseif
...@@ -651,37 +653,48 @@ func TestΔFtailSliceUntrackedUniform(t_ *testing.T) { ...@@ -651,37 +653,48 @@ func TestΔFtailSliceUntrackedUniform(t_ *testing.T) {
// blktab[2] remains unnoticed because it is not changed past at1. // blktab[2] remains unnoticed because it is not changed past at1.
xtrackBlk(0) xtrackBlk(0)
// (at1, at4] -> changes to both 0 and 1, because they both are changed in the same bucket @at2 // assertSliceByFileRev verifies result of SliceByFileRev and SliceByFileRevEx(OnlyExplicitlyTracked=y).
lo := t1.At assertSliceByFileRev := func(lo, hi zodb.Tid, vδf_ok, vδfT_ok []*ΔFile) {
hi := t4.At t.Helper()
vδf, err := δFtail.SliceByFileRev(zfile, lo, hi); X(err)
vδf_ok := []*ΔFile{ Tonly := QueryOptions{OnlyExplicitlyTracked: true}
&ΔFile{Rev: t2.At, Blocks: b(0,1), Size: true}, vδf, err := δFtail.SliceByFileRev (zfile, lo, hi); X(err)
&ΔFile{Rev: t3.At, Blocks: b(0,1), Size: false}, vδfT, err := δFtail.SliceByFileRevEx(zfile, lo, hi, Tonly); X(err)
&ΔFile{Rev: t4.At, Blocks: b( 1), Size: false},
} if !reflect.DeepEqual(vδf, vδf_ok) {
if !reflect.DeepEqual(vδf, vδf_ok) { t.Errorf("slice (@%s,@%s]:\nhave: %v\nwant: %v", t.AtSymb(lo), t.AtSymb(hi), t.vδfstr(vδf), t.vδfstr(vδf_ok))
t.Errorf("slice (@%s,@%s]:\nhave: %v\nwant: %v", t.AtSymb(lo), t.AtSymb(hi), t.vδfstr(vδf), t.vδfstr(vδf_ok)) }
if !reflect.DeepEqual(vδfT, vδfT_ok) {
t.Errorf("sliceT (@%s,@%s]:\nhave: %v\nwant: %v", t.AtSymb(lo), t.AtSymb(hi), t.vδfstr(vδfT), t.vδfstr(vδfT_ok))
}
} }
// (at1, at4] -> changes to both 0 and 1, because they both are changed in the same bucket @at2
assertSliceByFileRev(t1.At, t4.At,
/*vδf*/ []*ΔFile{
&ΔFile{Rev: t2.At, Blocks: b(0,1), Size: true},
&ΔFile{Rev: t3.At, Blocks: b(0,1), Size: false},
&ΔFile{Rev: t4.At, Blocks: b( 1), Size: false},
},
/*vδfT*/ []*ΔFile{
&ΔFile{Rev: t2.At, Blocks: b(0 ), Size: true},
&ΔFile{Rev: t3.At, Blocks: b(0 ), Size: false},
// no change @at4
})
// (at2, at4] -> changes to only 0, because there is no change to 2 via blktab // (at2, at4] -> changes to only 0, because there is no change to 2 via blktab
lo = t2.At assertSliceByFileRev(t2.At, t4.At,
vδf, err = δFtail.SliceByFileRev(zfile, lo, hi); X(err) /*vδf*/ []*ΔFile{
vδf_ok = []*ΔFile{ &ΔFile{Rev: t3.At, Blocks: b(0), Size: false},
&ΔFile{Rev: t3.At, Blocks: b(0), Size: false}, },
} /*vδfT*/ []*ΔFile{
if !reflect.DeepEqual(vδf, vδf_ok) { &ΔFile{Rev: t3.At, Blocks: b(0), Size: false},
t.Errorf("slice (@%s,@%s]:\nhave: %v\nwant: %v", t.AtSymb(lo), t.AtSymb(hi), t.vδfstr(vδf), t.vδfstr(vδf_ok)) })
}
// (at3, at4] -> changes to only 0, ----/---- // (at3, at4] -> changes to only 0, ----/----
lo = t3.At assertSliceByFileRev(t3.At, t4.At,
vδf, err = δFtail.SliceByFileRev(zfile, lo, hi); X(err) /*vδf*/ []*ΔFile(nil),
vδf_ok = []*ΔFile(nil) /*vδfT*/ []*ΔFile(nil))
if !reflect.DeepEqual(vδf, vδf_ok) {
t.Errorf("slice (@%s,@%s]:\nhave: %v\nwant: %v", t.AtSymb(lo), t.AtSymb(hi), t.vδfstr(vδf), t.vδfstr(vδf_ok))
}
} }
......
...@@ -57,10 +57,13 @@ Trees _and_ Buckets nodes - would be required. ...@@ -57,10 +57,13 @@ Trees _and_ Buckets nodes - would be required.
-> we took the approach where we send invalidation to client about a block -> we took the approach where we send invalidation to client about a block
lazily only when the block is actually accessed. lazily only when the block is actually accessed.
XXX building δFtail lazily along serving fuse reads during scope of one
transaction is not trivial and creates concurrency bottlenecks if simple Rejected alternative:
Building δFtail lazily along serving FUSE reads during scope of one
transaction is not trivial and would create concurrency bottlenecks if simple
locking scheme is used. With the main difficulty being to populate tracking set locking scheme is used. With the main difficulty being to populate tracking set
of δBtree lazily. However as the first approach we can still build complete of δBtree lazily. However as the first approach we could still build complete
tracking set for a BTree at the time of file open: we need to scan through all tracking set for a BTree at the time of file open: we need to scan through all
trees but _not_ buckets: this way we'll know oid of all tree nodes: trees _and_ trees but _not_ buckets: this way we'll know oid of all tree nodes: trees _and_
buckets, while avoiding loading buckets makes this approach practical: with buckets, while avoiding loading buckets makes this approach practical: with
...@@ -69,9 +72,11 @@ require ~ 20 trees to cover 1TB of data. And we can scan those trees very ...@@ -69,9 +72,11 @@ require ~ 20 trees to cover 1TB of data. And we can scan those trees very
quickly even if doing so serially. For 1PB of data it will require to scan ~ quickly even if doing so serially. For 1PB of data it will require to scan ~
10⁴ trees. If RTT to load 1 object is ~1ms this will become 10 seconds if done 10⁴ trees. If RTT to load 1 object is ~1ms this will become 10 seconds if done
serially. However if we load all those tree objects in parallel it will be serially. However if we load all those tree objects in parallel it will be
much less. Still the number of trees to scan is linear to the amount of data much less. Still the number of trees to scan is linear to the amount of data.
and it would be good to address the shortcoming of doing whole file index scan
later. -> rejected: ΔFtail and ΔBtail were instead fixed to allow several Track and
query requests to run in parallel. See "Concurrency" section in ΔFtail/ΔBtail
organization overview.
Changing mmapping while under pagefault is possible Changing mmapping while under pagefault is possible
...@@ -107,7 +112,7 @@ We can change a mapping while a page from it is under pagefault: ...@@ -107,7 +112,7 @@ We can change a mapping while a page from it is under pagefault:
* https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/filemap.c?id=v4.20-rc3-83-g06e68fed3282#n2457 * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/filemap.c?id=v4.20-rc3-83-g06e68fed3282#n2457
* https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/filemap.c?id=v4.20-rc3-83-g06e68fed3282#n1301 * https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/filemap.c?id=v4.20-rc3-83-g06e68fed3282#n1301
- the filesystem server upon receiving the read request can manipulate - the filesystem server, upon receiving the read request, can manipulate
client's address space. This requires to write-lock client->mm->mmap_sem, client's address space. This requires to write-lock client->mm->mmap_sem,
but we can be sure it won't deadlock because the kernel releases it but we can be sure it won't deadlock because the kernel releases it
before waiting (see previous point). before waiting (see previous point).
......
...@@ -42,7 +42,7 @@ digraph { ...@@ -42,7 +42,7 @@ digraph {
zobj2file -> zblk2file; zobj2file -> zblk2file;
zobj2file -> zbtree2file; zobj2file -> zbtree2file;
zbtree2file -> δBTree [color=grey]; zbtree2file -> δBTree;
// wcfs_simple -> Btree_read; // wcfs_simple -> Btree_read;
// wcfs_simple -> ZBlk_read; // wcfs_simple -> ZBlk_read;
...@@ -75,8 +75,8 @@ digraph { ...@@ -75,8 +75,8 @@ digraph {
wcfsInvProcess [label="process\nZODB invalidations", style=filled fillcolor=grey95] wcfsInvProcess [label="process\nZODB invalidations", style=filled fillcolor=grey95]
zconnCacheGet [label="zonn.\n.Cache.Get", style=filled fillcolor=lightyellow] zconnCacheGet [label="zonn.\n.Cache.Get", style=filled fillcolor=lightyellow]
zobj2file [label="Z* → file/[]#blk", style=filled fillcolor=grey95] zobj2file [label="Z* → file/[]#blk", style=filled fillcolor=grey95]
zblk2file [label="ZBlk*\n↓\nfile/[]#blk", style=filled fillcolor=lightyellow] zblk2file [label="ZBlk*\n↓\nfile/[]#blk", style=filled fillcolor=grey95]
zbtree2file [label="BTree/Bucket\n↓\nfile/[]#blk"] zbtree2file [label="BTree/Bucket\n↓\nfile/[]#blk", style=filled fillcolor=grey95]
δBTree [label="δ(BTree)", style=filled fillcolor=grey95] δBTree [label="δ(BTree)", style=filled fillcolor=grey95]
fuseRetrieveCache [label="FUSE:\nretrieve cache", style=filled fillcolor=lightyellow] fuseRetrieveCache [label="FUSE:\nretrieve cache", style=filled fillcolor=lightyellow]
......
...@@ -316,7 +316,7 @@ ...@@ -316,7 +316,7 @@
<!-- zblk2file --> <!-- zblk2file -->
<g id="node22" class="node"> <g id="node22" class="node">
<title>zblk2file</title> <title>zblk2file</title>
<ellipse fill="lightyellow" stroke="black" cx="62.23" cy="-109.48" rx="62.45" ry="37.45"/> <ellipse fill="#f2f2f2" stroke="black" cx="62.23" cy="-109.48" rx="62.45" ry="37.45"/>
<text text-anchor="middle" x="62.23" y="-120.78" font-family="Times,serif" font-size="14.00">ZBlk*</text> <text text-anchor="middle" x="62.23" y="-120.78" font-family="Times,serif" font-size="14.00">ZBlk*</text>
<text text-anchor="middle" x="62.23" y="-105.78" font-family="Times,serif" font-size="14.00"></text> <text text-anchor="middle" x="62.23" y="-105.78" font-family="Times,serif" font-size="14.00"></text>
<text text-anchor="middle" x="62.23" y="-90.78" font-family="Times,serif" font-size="14.00">file/[]#blk</text> <text text-anchor="middle" x="62.23" y="-90.78" font-family="Times,serif" font-size="14.00">file/[]#blk</text>
...@@ -330,7 +330,7 @@ ...@@ -330,7 +330,7 @@
<!-- zbtree2file --> <!-- zbtree2file -->
<g id="node23" class="node"> <g id="node23" class="node">
<title>zbtree2file</title> <title>zbtree2file</title>
<ellipse fill="none" stroke="black" cx="222.23" cy="-109.48" rx="79.81" ry="37.45"/> <ellipse fill="#f2f2f2" stroke="black" cx="222.23" cy="-109.48" rx="79.81" ry="37.45"/>
<text text-anchor="middle" x="222.23" y="-120.78" font-family="Times,serif" font-size="14.00">BTree/Bucket</text> <text text-anchor="middle" x="222.23" y="-120.78" font-family="Times,serif" font-size="14.00">BTree/Bucket</text>
<text text-anchor="middle" x="222.23" y="-105.78" font-family="Times,serif" font-size="14.00"></text> <text text-anchor="middle" x="222.23" y="-105.78" font-family="Times,serif" font-size="14.00"></text>
<text text-anchor="middle" x="222.23" y="-90.78" font-family="Times,serif" font-size="14.00">file/[]#blk</text> <text text-anchor="middle" x="222.23" y="-90.78" font-family="Times,serif" font-size="14.00">file/[]#blk</text>
...@@ -368,8 +368,8 @@ ...@@ -368,8 +368,8 @@
<!-- zbtree2file&#45;&gt;δBTree --> <!-- zbtree2file&#45;&gt;δBTree -->
<g id="edge24" class="edge"> <g id="edge24" class="edge">
<title>zbtree2file&#45;&gt;δBTree</title> <title>zbtree2file&#45;&gt;δBTree</title>
<path fill="none" stroke="grey" d="M222.23,-71.82C222.23,-63.33 222.23,-54.43 222.23,-46.42"/> <path fill="none" stroke="black" d="M222.23,-71.82C222.23,-63.33 222.23,-54.43 222.23,-46.42"/>
<polygon fill="grey" stroke="grey" points="225.73,-46.15 222.23,-36.15 218.73,-46.15 225.73,-46.15"/> <polygon fill="black" stroke="black" points="225.73,-46.15 222.23,-36.15 218.73,-46.15 225.73,-46.15"/>
</g> </g>
<!-- clientInvHandle&#45;&gt;headWatch --> <!-- clientInvHandle&#45;&gt;headWatch -->
<g id="edge29" class="edge"> <g id="edge29" class="edge">
......
...@@ -79,7 +79,6 @@ ...@@ -79,7 +79,6 @@
// @<revX>/ has the following structure: // @<revX>/ has the following structure:
// //
// @<revX>/ // @<revX>/
// at
// bigfile/ ; bigfiles' data as of revision <revX> // bigfile/ ; bigfiles' data as of revision <revX>
// <oid(ZBigFile1)> // <oid(ZBigFile1)>
// <oid(ZBigFile2)> // <oid(ZBigFile2)>
...@@ -114,11 +113,9 @@ ...@@ -114,11 +113,9 @@
// The server sends "ok" reply only after head/at is ≥ requested <at>, and only // The server sends "ok" reply only after head/at is ≥ requested <at>, and only
// after all initial pin/unpin messages are fully acknowledged by the client. // after all initial pin/unpin messages are fully acknowledged by the client.
// The client can start to use mmapped data after it gets "ok". // The client can start to use mmapped data after it gets "ok".
// The server sends "error" reply if requested <at> is too far away back from // The server sends "error" reply e.g. if requested <at> is too far away back
// head/at. // from head/at, or on any other error.
// XXX other errors are possible (e.g. "no such file", or error handling pin). // TODO specify watch state after error.
// XXX error handling pin -> then client is killed?
// XXX if not - specify that watch state is lost after error.
// //
// Upon watch request, either initially, or after sending "ok", the server will be notifying the // Upon watch request, either initially, or after sending "ok", the server will be notifying the
// client about file blocks that client needs to pin in order to observe file's // client about file blocks that client needs to pin in order to observe file's
...@@ -181,6 +178,10 @@ ...@@ -181,6 +178,10 @@
// files to @<revX>/bigfile/<bigfileX> drops to zero, and automatically // files to @<revX>/bigfile/<bigfileX> drops to zero, and automatically
// destroys @<revX>/bigfile/<bigfileX> after reasonable timeout. // destroys @<revX>/bigfile/<bigfileX> after reasonable timeout.
// //
// The client should send "bye" before closing head/watch file:
//
// C: <2·k+1> bye
//
// //
// Protection against slow or faulty clients // Protection against slow or faulty clients
// //
...@@ -261,7 +262,7 @@ package main ...@@ -261,7 +262,7 @@ package main
// //
// The invariant helps on invalidation: when δFtail (see below) sees a // The invariant helps on invalidation: when δFtail (see below) sees a
// changed oid, it is guaranteed that if the change affects block that was // changed oid, it is guaranteed that if the change affects block that was
// ever provided to OS, δFtail will detect that this block has changed. XXX review // ever provided to OS, δFtail will detect that this block has changed.
// And if oid relates to a file block but is not in δFtail's tracking set - // And if oid relates to a file block but is not in δFtail's tracking set -
// we know that block is not cached and will trigger ZODB load on a future // we know that block is not cached and will trigger ZODB load on a future
// file read. // file read.
...@@ -274,42 +275,26 @@ package main ...@@ -274,42 +275,26 @@ package main
// 4) when we receive an invalidation message from ZODB - we process it and // 4) when we receive an invalidation message from ZODB - we process it and
// propagate invalidations to OS file cache of head/bigfile/*: // propagate invalidations to OS file cache of head/bigfile/*:
// //
// invalidation message: (tid↑, []oid) // invalidation message: δZ = (tid↑, []oid)
//
// 4.1) zhead.cache.lookup(oid) XXX -> δFtail
// 4.2) ø: nothing to do - see invariant ^^^.
// 4.3) obj found:
// //
// - ZBlk* -> [] of file/[]#blk // 4.1) δF = δFtail.Update(δZ)
// - BTree/Bucket -> δ(BTree) -> file/[]#blk
// //
// in the end after processing all []oid from invalidation message we have // δFtail (see below) converts ZODB-level changes into information about
// which blocks of which files were modified and need to be invalidated:
// //
// [] of file/[]#blk // δF = (tid↑, {} file -> []#blk)
// //
// that describes which file(s) parts needs to be invalidated. // Note that δF might not be complete and reflects only changes to files and
// blocks that were requested to be tracked. However because of the invariant
// δF covers in full what needs to be invalidated in the OS file cache.
// //
// FIXME no - we can build it but not in full - since we consider only zobj in live cache. // 4.2) for all file/blk to invalidate we do:
// FIXME and even if we consider all δ'ed zobj, building complete set of
// file.δtail requires to first do complete scan of file.blktab
// which is prohibitively expensive.
// XXX -> we'll do the scan, but only Trees _without_ Buckets. This
// makes the scan practical until 1PB while helping to build
// initial tracking set for δFtail.
// Eager invalidation would require full scan - Trees _and_
// Buckets, which makes it prohibitively expensive - see (+).
//
// FIXME all ^^^ is outdated -> XXX δFtail
//
// 4.4) for all file/blk to invalidate we do:
// //
// - try to retrieve head/bigfile/file[blk] from OS file cache(*); // - try to retrieve head/bigfile/file[blk] from OS file cache(*);
// - if retrieved successfully -> store retrieved data back into OS file // - if retrieved successfully -> store retrieved data back into OS file
// cache for @<rev>/bigfile/file[blk], where // cache for @<rev>/bigfile/file[blk], where
// //
// # see below about file.δtail // rev = δFtail.BlkRevAt(file, #blk, zhead.at)
// # XXX -> file.BlkRevAt(#blk, zhead.at)
// rev = max(file.δtail.by(#blk)) || min(rev ∈ file.δtail) || zhead.at
// //
// - invalidate head/bigfile/file[blk] in OS file cache. // - invalidate head/bigfile/file[blk] in OS file cache.
// //
...@@ -318,30 +303,41 @@ package main ...@@ -318,30 +303,41 @@ package main
// won't be served from OS file cache and instead will trigger a FUSE read // won't be served from OS file cache and instead will trigger a FUSE read
// request to wcfs. // request to wcfs.
// //
// 4.5) no invalidation messages are sent to wcfs clients at this point(+). // 4.3) no invalidation messages are sent to wcfs clients at this point(+).
// //
// 4.6) processing ZODB invalidations and serving file reads (see 7) are // 4.4) processing ZODB invalidations and serving file reads (see 7) are
// organized to be mutually exclusive. // organized to be mutually exclusive.
// //
// 4.5) similarly, processing ZODB invalidations and setting up watches (see
// 7.2) are organized to be mutually exclusive.
//
// 5) after OS file cache was invalidated, we resync zhead to new database // 5) after OS file cache was invalidated, we resync zhead to new database
// view corresponding to tid. // view corresponding to tid.
// //
// 6) for every file δtail invalidation info about head/data is maintained: XXX -> δFtail // 6) a ZBigFile-level history tail is maintained in δFtail.
// //
// - tailv: [](rev↑, []#blk) // δFtail translates ZODB object-level changes into information about which
// - by: {} #blk -> []rev↑ in tail // blocks of which ZBigFile were modified, and provides service to query
// that information.
// //
// δtail.tail describes invalidations to file we learned from ZODB invalidation. // It semantically consists of
// δtail.by allows to quickly lookup information by #blk.
// //
// min(rev) in δtail is min(@at) at which head/bigfile/file is currently watched (see below). // []δF
// //
// XXX δtail can miss ... // where δF represents a change in files space
//
// δF:
// .rev↑
// {} file -> {}blk
//
// min(rev) in δFtail is min(@at) at which head/bigfile/file is currently watched (see below).
// //
// to support initial openings with @at being slightly in the past, we also // to support initial openings with @at being slightly in the past, we also
// make sure that min(rev) is enough to cover last 10 minutes of history // make sure that min(rev) is enough to cover last 1 minute of history
// from head/at. // from head/at.
// //
// See ΔFtail documentation in internal/zdata/δftail.go for more details.
//
// 7) when we receive a FUSE read(#blk) request to a head/bigfile/file, we process it as follows: // 7) when we receive a FUSE read(#blk) request to a head/bigfile/file, we process it as follows:
// //
// 7.1) load blkdata for head/bigfile/file[blk] @zhead.at . // 7.1) load blkdata for head/bigfile/file[blk] @zhead.at .
...@@ -354,21 +350,16 @@ package main ...@@ -354,21 +350,16 @@ package main
// it is not exact because BTree/Bucket can change (e.g. rebalance) // it is not exact because BTree/Bucket can change (e.g. rebalance)
// but still point to the same k->ZBlk. // but still point to the same k->ZBlk.
// //
// we also use file.δtail to find either exact blk revision: XXX δFtail // we also use δFtail to find either exact blk revision or another upper
// // bound if file[blk] has no change during δFtail coverage:
// rev(blk) = max(file.δtail.by(#blk) -> []rev↑)
//
// or another upper bound if #blk ∉ δtail:
//
// rev(blk) ≤ min(rev ∈ δtail) ; #blk ∉ δtail
// //
// rev(blk) = δFtail.BlkRevAt(file, #blk, zhead.at)
// //
// below rev'(blk) is min(of the estimates found): // below rev'(blk) is min(of the estimates found):
// //
// rev(blk) ≤ rev'(blk) rev'(blk) = min(^^^) // rev(blk) ≤ rev'(blk) rev'(blk) = min(^^^)
// //
// // Note: we delay recomputing δFtail.BlkRevAt(file, #blk, head) because
// XXX we delay recomputing δFtail.BlkRevAt(file, #blk, head) because
// using just cheap revmax estimate can frequently result in all watches // using just cheap revmax estimate can frequently result in all watches
// being skipped. // being skipped.
// //
...@@ -377,7 +368,7 @@ package main ...@@ -377,7 +368,7 @@ package main
// - rev'(blk) ≤ at: -> do nothing // - rev'(blk) ≤ at: -> do nothing
// - rev'(blk) > at: // - rev'(blk) > at:
// - if blk ∈ watch.pinned -> do nothing // - if blk ∈ watch.pinned -> do nothing
// - rev = max(δtail.by(#blk) : _ ≤ at) || min(rev ∈ δtail : rev ≤ at) || at // - rev = δFtail.BlkRevAt(file, #blk, at)
// - watch.pin(file, #blk, @rev) // - watch.pin(file, #blk, @rev)
// - watch.pinned += blk // - watch.pinned += blk
// //
...@@ -416,7 +407,7 @@ package main ...@@ -416,7 +407,7 @@ package main
// transaction is maintained. For zhead, every time it is resynced (see "5") // transaction is maintained. For zhead, every time it is resynced (see "5")
// the transaction associated with zhead is renewed. // the transaction associated with zhead is renewed.
// //
// XXX 10) gc @rev/ and @rev/bigfile/<bigfileX> automatically on atime timeout // TODO 10) gc @rev/ and @rev/bigfile/<bigfileX> automatically on atime timeout
// //
// //
// (*) see notes.txt -> "Notes on OS pagecache control" // (*) see notes.txt -> "Notes on OS pagecache control"
...@@ -425,6 +416,7 @@ package main ...@@ -425,6 +416,7 @@ package main
// (^) see notes.txt -> "Client cannot be ptraced while under pagefault" // (^) see notes.txt -> "Client cannot be ptraced while under pagefault"
// (%) no need to keep track of ZData - ZBlk1 is always marked as changed on blk data change. // (%) no need to keep track of ZData - ZBlk1 is always marked as changed on blk data change.
// Wcfs locking organization // Wcfs locking organization
// //
// As it was said processing ZODB invalidations (see "4") and serving file // As it was said processing ZODB invalidations (see "4") and serving file
...@@ -463,13 +455,14 @@ package main ...@@ -463,13 +455,14 @@ package main
// WatchLink.byfileMu > BigFileDir.fileMu // WatchLink.byfileMu > BigFileDir.fileMu
// WatchLink.byfileMu > Watch.atMu // WatchLink.byfileMu > Watch.atMu
// Notation used // Notation used
// //
// δZ - change in ZODB space // δZ - change in ZODB space
// δB - change in BTree*s* space // δB - change in BTree*s* space
// δT - change in BTree(1) space // δT - change in BTree(1) space
// δF - change in File*s* space // δF - change in File*s* space
// δfile - change in File(1) space XXX -> δf ? // δfile - change in File(1) space
// //
// f - BigFile // f - BigFile
// bfdir - BigFileDir // bfdir - BigFileDir
...@@ -491,7 +484,7 @@ import ( ...@@ -491,7 +484,7 @@ import (
"sync" "sync"
"sync/atomic" "sync/atomic"
"syscall" "syscall"
// "time" "time"
log "github.com/golang/glog" log "github.com/golang/glog"
...@@ -511,23 +504,20 @@ import ( ...@@ -511,23 +504,20 @@ import (
"github.com/hanwen/go-fuse/v2/fuse/nodefs" "github.com/hanwen/go-fuse/v2/fuse/nodefs"
"github.com/pkg/errors" "github.com/pkg/errors"
"lab.nexedi.com/nexedi/wendelin.core/wcfs/internal/set"
"lab.nexedi.com/nexedi/wendelin.core/wcfs/internal/xzodb" "lab.nexedi.com/nexedi/wendelin.core/wcfs/internal/xzodb"
"lab.nexedi.com/nexedi/wendelin.core/wcfs/internal/zdata" "lab.nexedi.com/nexedi/wendelin.core/wcfs/internal/zdata"
) )
// ZBigFile-related types // shorthands for ZBigFile and ZBlk*
// XXX place
type ( type (
ZBlk = zdata.ZBlk ZBlk = zdata.ZBlk
ZBlk0 = zdata.ZBlk0 ZBlk0 = zdata.ZBlk0
ZBlk1 = zdata.ZBlk1 ZBlk1 = zdata.ZBlk1
ZData = zdata.ZData ZData = zdata.ZData
ZBigFile = zdata.ZBigFile ZBigFile = zdata.ZBigFile
setI64 = set.I64
) )
// Root represents root of wcfs filesystem. // Root represents root of wcfs filesystem.
type Root struct { type Root struct {
fsNode fsNode
...@@ -570,6 +560,8 @@ type Head struct { ...@@ -570,6 +560,8 @@ type Head struct {
// it is also kept rlocked by OS cache uploaders (see BigFile.uploadBlk) // it is also kept rlocked by OS cache uploaders (see BigFile.uploadBlk)
// with additional locking protocol to avoid deadlocks (see below for // with additional locking protocol to avoid deadlocks (see below for
// pauseOSCacheUpload + ...). // pauseOSCacheUpload + ...).
//
// TODO head.zheadMu -> special mutex with Lock(ctx) so that Lock wait could be canceled
zheadMu sync.RWMutex zheadMu sync.RWMutex
zconn *xzodb.ZConn // for head/ zwatcher resyncs head.zconn; others only read zconn objects. zconn *xzodb.ZConn // for head/ zwatcher resyncs head.zconn; others only read zconn objects.
...@@ -600,8 +592,7 @@ type BigFileDir struct { ...@@ -600,8 +592,7 @@ type BigFileDir struct {
// δ tail of tracked BTree nodes of all BigFiles + -> which file // δ tail of tracked BTree nodes of all BigFiles + -> which file
// (used only for head/, not revX/) // (used only for head/, not revX/)
δFmu sync.RWMutex // zheadMu.W | zheadMu.R + δFmu.X δFtail *zdata.ΔFtail // read/write access protected by zheadMu.{R,W}
δFtail *zdata.ΔFtail
} }
// /(head|<rev>)/bigfile/<bigfileX> - served by BigFile. // /(head|<rev>)/bigfile/<bigfileX> - served by BigFile.
...@@ -617,23 +608,12 @@ type BigFile struct { ...@@ -617,23 +608,12 @@ type BigFile struct {
// things read/computed from .zfile; constant during lifetime of current transaction. // things read/computed from .zfile; constant during lifetime of current transaction.
// i.e. changed under zhead.W // i.e. changed under zhead.W
blksize int64 // zfile.blksize blksize int64 // zfile.blksize
size int64 // zfile.Size() size int64 // zfile.Size()
rev zodb.Tid // last revision that modified zfile data revApprox zodb.Tid // approx last revision that modified zfile data
// XXX we can't know rev fully as some later blocks could be learnt only // ( we can't know rev fully as some later blocks could be learnt only
// while populating δFtail lazily // while populating δFtail lazily. For simplicity we don't delve into
// XXX or then it is not "constant during lifetime of current txn" // updating revApprox during lifetime of current transaction )
// // tail change history of this file.
// //
// // XXX computationally expensive to start - see "Invalidations to wcfs
// // clients are delayed ..." in notes.txt
// δtail *ΔTailI64 // [](rev↑, []#blk)
// blocks that were ever read-accessed (head/ only) XXX locking by bfdir.δFmu ?
// XXX = δFtail.Tracked(f) ?
// XXX goes away if δFtail query returns only tracked blocks
accessed setI64
// inflight loadings of ZBigFile from ZODB. // inflight loadings of ZBigFile from ZODB.
// successful load results are kept here until blkdata is put into OS pagecache. // successful load results are kept here until blkdata is put into OS pagecache.
...@@ -754,16 +734,6 @@ func (_ *zodbCacheControl) PCacheClassify(obj zodb.IPersistent) zodb.PCachePolic ...@@ -754,16 +734,6 @@ func (_ *zodbCacheControl) PCacheClassify(obj zodb.IPersistent) zodb.PCachePolic
return 0 return 0
} }
/*
// -------- zhead lock/wait --------
// XXX needed?
// TODO head.zheadMu -> special mutex with Lock(ctx) so that Lock wait could be canceled
func (head *Head) zheadRLock() { head.zheadMu.RLock() }
func (head *Head) zheadRUnlock() { head.zheadMu.RUnlock() }
func (head *Head) zheadLock() { head.zheadMu.Lock() }
func (head *Head) zheadUnlock() { head.zheadMu.Unlock() }
*/
// -------- 4) ZODB invalidation -> OS cache -------- // -------- 4) ZODB invalidation -> OS cache --------
...@@ -800,7 +770,7 @@ func (root *Root) zwatcher(ctx context.Context, zwatchq chan zodb.Event) (err er ...@@ -800,7 +770,7 @@ func (root *Root) zwatcher(ctx context.Context, zwatchq chan zodb.Event) (err er
case zevent, ok = <-zwatchq: case zevent, ok = <-zwatchq:
if !ok { if !ok {
traceZWatch("zwatchq closed") traceZWatch("zwatchq closed")
return nil // closed XXX ok? return nil // closed
} }
} }
...@@ -830,7 +800,7 @@ func (root *Root) handleδZ(ctx context.Context, δZ *zodb.EventCommit) (err err ...@@ -830,7 +800,7 @@ func (root *Root) handleδZ(ctx context.Context, δZ *zodb.EventCommit) (err err
head := root.head head := root.head
// while we are invalidating OS cache, make sure that nothing, that // while we are invalidating OS cache, make sure that nothing, that
// even reads /head/bigfile/*, is running (see 4.6). // even reads /head/bigfile/*, is running (see 4.4).
// //
// also make sure that cache uploaders we spawned (uploadBlk) are all // also make sure that cache uploaders we spawned (uploadBlk) are all
// paused, or else they could overwrite OS cache with stale data. // paused, or else they could overwrite OS cache with stale data.
...@@ -839,7 +809,7 @@ func (root *Root) handleδZ(ctx context.Context, δZ *zodb.EventCommit) (err err ...@@ -839,7 +809,7 @@ func (root *Root) handleδZ(ctx context.Context, δZ *zodb.EventCommit) (err err
continueOSCacheUpload := make(chan struct{}) continueOSCacheUpload := make(chan struct{})
retry: retry:
for { for {
// XXX ctx cancel // TODO ctx cancel
head.zheadMu.Lock() head.zheadMu.Lock()
head.pauseOSCacheUpload = true head.pauseOSCacheUpload = true
head.continueOSCacheUpload = continueOSCacheUpload head.continueOSCacheUpload = continueOSCacheUpload
...@@ -867,9 +837,7 @@ retry: ...@@ -867,9 +837,7 @@ retry:
bfdir := head.bfdir bfdir := head.bfdir
// invalidate kernel cache for data in changed files // invalidate kernel cache for data in changed files
// NOTE no δFmu lock needed because zhead is WLocked
// δF, err := bfdir.δFtail.Update(δZ, zhead) // δF <- δZ |tracked
δF, err := bfdir.δFtail.Update(δZ) // δF <- δZ |tracked δF, err := bfdir.δFtail.Update(δZ) // δF <- δZ |tracked
if err != nil { if err != nil {
return err return err
...@@ -944,11 +912,10 @@ retry: ...@@ -944,11 +912,10 @@ retry:
} }
// resync .zhead to δZ.tid // resync .zhead to δZ.tid
// XXX -> Head.Resync() ?
// 1. abort old and resync to new txn/at // 1. abort old and resync to new txn/at
transaction.Current(zhead.TxnCtx).Abort() transaction.Current(zhead.TxnCtx).Abort()
_, ctx = transaction.New(context.Background()) // XXX bg ok? _, ctx = transaction.New(context.Background())
err = zhead.Resync(ctx, δZ.Tid) err = zhead.Resync(ctx, δZ.Tid)
if err != nil { if err != nil {
return err return err
...@@ -957,37 +924,53 @@ retry: ...@@ -957,37 +924,53 @@ retry:
// 2. restat invalidated ZBigFile // 2. restat invalidated ZBigFile
// NOTE no lock needed since .blksize and .size are constant during lifetime of one txn. // NOTE no lock needed since .blksize and .size are constant during lifetime of one txn.
// XXX -> parallel // TODO -> parallel
for foid := range δF.ByFile { for foid, δfile := range δF.ByFile {
file := bfdir.fileTab[foid] // must be present file := bfdir.fileTab[foid] // must be present
zfile := file.zfile zfile := file.zfile
// XXX need to do only if δfile.Size changed if δfile.Size {
size, sizePath, blkCov, err := zfile.Size(ctx) size, sizePath, blkCov, err := zfile.Size(ctx)
if err != nil { if err != nil {
return err return err
} }
file.size = size file.size = size
// see "3) for */head/data the following invariant is maintained..." // see "3) for */head/data the following invariant is maintained..."
bfdir.δFtail.Track(zfile, -1, sizePath, blkCov, nil) bfdir.δFtail.Track(zfile, -1, sizePath, blkCov, nil)
}
// XXX we can miss a change to file if δblk is not yet tracked // NOTE we can miss a change to file if δblk is not yet tracked
// -> need to update file.rev at read time -> locking=XXX // that's why revision is only approximated
file.rev = zhead.At() file.revApprox = zhead.At()
} }
// notify .wcfs/zhead // notify .wcfs/zhead
for sk := range gdebug.zheadSockTab { for sk := range gdebug.zheadSockTab {
_, err := fmt.Fprintf(xio.BindCtxW(sk, ctx), "%s\n", δZ.Tid) _, err := fmt.Fprintf(xio.BindCtxW(sk, ctx), "%s\n", δZ.Tid)
if err != nil { if err != nil {
log.Errorf("%s", err) // XXX errctx + file, handle, reader pid log.Errorf("zhead: %s: write: %s (detaching reader)", sk, err)
sk.Close() sk.Close()
delete(gdebug.zheadSockTab, sk) delete(gdebug.zheadSockTab, sk)
} }
} }
// XXX δFtail.ForgetPast(...) // shrink δFtail not to grow indefinitely.
// cover history for at least 1 minute, but including all watches.
// No need to lock anything because we are holding zheadMu and
// setupWatch too runs with zheadMu locked.
//
// TODO shrink δFtail only once in a while - there is no need to compute
// revCut and cut δFtail on every transaction.
revCut := zodb.TidFromTime(zhead.At().Time().Add(-1*time.Minute))
for wlink := range head.wlinkTab {
for _, w := range wlink.byfile {
if w.at < revCut {
revCut = w.at
}
}
}
bfdir.δFtail.ForgetPast(revCut)
// notify zhead.At waiters // notify zhead.At waiters
for hw := range head.hwait { for hw := range head.hwait {
...@@ -1016,7 +999,7 @@ func (head *Head) zheadWait(ctx context.Context, at zodb.Tid) (err error) { ...@@ -1016,7 +999,7 @@ func (head *Head) zheadWait(ctx context.Context, at zodb.Tid) (err error) {
panic("must be called only for head/, not @revX/") panic("must be called only for head/, not @revX/")
} }
// XXX check wcfs.down // TODO check wcfs.down
// check if zhead is already ≥ at // check if zhead is already ≥ at
head.zheadMu.RLock() head.zheadMu.RLock()
...@@ -1044,7 +1027,7 @@ func (head *Head) zheadWait(ctx context.Context, at zodb.Tid) (err error) { ...@@ -1044,7 +1027,7 @@ func (head *Head) zheadWait(ctx context.Context, at zodb.Tid) (err error) {
// invalidateBlk invalidates 1 file block in kernel cache. // invalidateBlk invalidates 1 file block in kernel cache.
// //
// see "4.4) for all file/blk to invalidate we do" // see "4.2) for all file/blk to invalidate we do"
// called with zheadMu wlocked. // called with zheadMu wlocked.
func (f *BigFile) invalidateBlk(ctx context.Context, blk int64) (err error) { func (f *BigFile) invalidateBlk(ctx context.Context, blk int64) (err error) {
defer xerr.Contextf(&err, "%s: invalidate blk #%d:", f.path(), blk) defer xerr.Contextf(&err, "%s: invalidate blk #%d:", f.path(), blk)
...@@ -1073,7 +1056,7 @@ func (f *BigFile) invalidateBlk(ctx context.Context, blk int64) (err error) { ...@@ -1073,7 +1056,7 @@ func (f *BigFile) invalidateBlk(ctx context.Context, blk int64) (err error) {
blkdata = make([]byte, blksize) blkdata = make([]byte, blksize)
n, st := fsconn.FileRetrieveCache(f.Inode(), off, blkdata) n, st := fsconn.FileRetrieveCache(f.Inode(), off, blkdata)
if st != fuse.OK { if st != fuse.OK {
// XXX warn log.Errorf("%s: retrieve blk #%d from cache: %s (ignoring, but reading @revX/bigfile will be slow)", f.path(), blk, st)
} }
blkdata = blkdata[:n] blkdata = blkdata[:n]
} }
...@@ -1168,27 +1151,7 @@ func (root *Root) lockRevFile(rev zodb.Tid, fid zodb.Oid) (_ *BigFile, unlock fu ...@@ -1168,27 +1151,7 @@ func (root *Root) lockRevFile(rev zodb.Tid, fid zodb.Oid) (_ *BigFile, unlock fu
frevpath := fmt.Sprintf("@%s/bigfile/%s", rev, fid) // relative to fs root for now frevpath := fmt.Sprintf("@%s/bigfile/%s", rev, fid) // relative to fs root for now
defer xerr.Contextf(&err, "/: lockRevFile %s", frevpath) defer xerr.Contextf(&err, "/: lockRevFile %s", frevpath)
// FIXME checking for "node{0}" is fragile: // open through kernel
// XXX the node could be still forgotten since we are not holding open on it
// XXX -> always os.open unconditionally for now
// or is it ok since it is just a cache?
// -> no, not ok: if inode ID is forgotten, the same ID could be
// reallocated to another file and then we'll corrupt in-kernel
// cache by wrongly storing data of one file into cache of
// another file.
// -> to avoid this we need to always lock the inode ID with real open.
// XXX (also disabled for now due to race-detector)
/*
// first check without going through kernel, whether the inode maybe known already
xfrev := fsconn.LookupNode(root.Inode(), frevpath)
if xfrev != nil {
if xfrev.String() != "node{0}" {
return xfrev.Node().(*BigFile), func(){}, nil
}
}
*/
// we have to ping the kernel
frevospath := gmntpt + "/" + frevpath // now starting from OS / frevospath := gmntpt + "/" + frevpath // now starting from OS /
f, err := os.Open(frevospath) f, err := os.Open(frevospath)
if err != nil { if err != nil {
...@@ -1204,7 +1167,7 @@ func (root *Root) lockRevFile(rev zodb.Tid, fid zodb.Oid) (_ *BigFile, unlock fu ...@@ -1204,7 +1167,7 @@ func (root *Root) lockRevFile(rev zodb.Tid, fid zodb.Oid) (_ *BigFile, unlock fu
// /(head|<rev>)/bigfile/<bigfileX> -> Read serves reading bigfile data. // /(head|<rev>)/bigfile/<bigfileX> -> Read serves reading bigfile data.
func (f *BigFile) Read(_ nodefs.File, dest []byte, off int64, fctx *fuse.Context) (fuse.ReadResult, fuse.Status) { func (f *BigFile) Read(_ nodefs.File, dest []byte, off int64, fctx *fuse.Context) (fuse.ReadResult, fuse.Status) {
f.head.zheadMu.RLock() // XXX +fctx to cancel f.head.zheadMu.RLock() // TODO +fctx to cancel
defer f.head.zheadMu.RUnlock() defer f.head.zheadMu.RUnlock()
// cap read request to file size // cap read request to file size
...@@ -1228,10 +1191,10 @@ func (f *BigFile) Read(_ nodefs.File, dest []byte, off int64, fctx *fuse.Context ...@@ -1228,10 +1191,10 @@ func (f *BigFile) Read(_ nodefs.File, dest []byte, off int64, fctx *fuse.Context
if re := end % f.blksize; re != 0 { if re := end % f.blksize; re != 0 {
aend += f.blksize - re aend += f.blksize - re
} }
// XXX use original dest if it can fit the data // TODO use original dest if it can fit the data
dest = make([]byte, aend - aoff) // ~> [aoff:aend) in file dest = make([]byte, aend - aoff) // ~> [aoff:aend) in file
// XXX better ctx = transaction.PutIntoContext(ctx, txn) // TODO better ctx = transaction.PutIntoContext(ctx, txn)
ctx, cancel := xcontext.Merge(fctx, f.head.zconn.TxnCtx) ctx, cancel := xcontext.Merge(fctx, f.head.zconn.TxnCtx)
defer cancel() defer cancel()
...@@ -1283,7 +1246,7 @@ func (f *BigFile) readBlk(ctx context.Context, blk int64, dest []byte) (err erro ...@@ -1283,7 +1246,7 @@ func (f *BigFile) readBlk(ctx context.Context, blk int64, dest []byte) (err erro
case <-loading.ready: case <-loading.ready:
if loading.err == nil { if loading.err == nil {
copy(dest, loading.blkdata) // XXX copy copy(dest, loading.blkdata) // TODO avoid copy
} }
return loading.err return loading.err
} }
...@@ -1291,6 +1254,22 @@ func (f *BigFile) readBlk(ctx context.Context, blk int64, dest []byte) (err erro ...@@ -1291,6 +1254,22 @@ func (f *BigFile) readBlk(ctx context.Context, blk int64, dest []byte) (err erro
// no one was loading - we became responsible to load this block // no one was loading - we became responsible to load this block
blkdata, treepath, blkcov, zblk, blkrevMax, err := f.zfile.LoadBlk(ctx, blk) blkdata, treepath, blkcov, zblk, blkrevMax, err := f.zfile.LoadBlk(ctx, blk)
// head/ - update δFtail + pin watchers
if f.head.rev == 0 && err == nil {
// update δFtail index
// see "3) for */head/data the following invariant is maintained..."
δFtail := f.head.bfdir.δFtail
δFtail.Track(f.zfile, blk, treepath, blkcov, zblk)
// we have the data - it can be used after watchers are updated
// XXX should we use ctx here? (see readPinWatchers comments)
err = f.readPinWatchers(ctx, blk, blkrevMax)
if err != nil {
blkdata = nil
}
}
loading.blkdata = blkdata loading.blkdata = blkdata
loading.err = err loading.err = err
...@@ -1303,13 +1282,9 @@ func (f *BigFile) readBlk(ctx context.Context, blk int64, dest []byte) (err erro ...@@ -1303,13 +1282,9 @@ func (f *BigFile) readBlk(ctx context.Context, blk int64, dest []byte) (err erro
return err return err
} }
// we have the data - it can be used after watchers are updated
// XXX should we use ctx here? (see readPinWatchers comments)
f.readPinWatchers(ctx, blk, treepath, blkcov, zblk, blkrevMax)
// data can be used now // data can be used now
close(loading.ready) close(loading.ready)
copy(dest, blkdata) // XXX copy copy(dest, blkdata) // TODO avoid copy
// store to kernel pagecache whole block that we've just loaded from database. // store to kernel pagecache whole block that we've just loaded from database.
// This way, even if the user currently requested to read only small portion from it, // This way, even if the user currently requested to read only small portion from it,
...@@ -1423,7 +1398,7 @@ func traceIso(format string, argv ...interface{}) { ...@@ -1423,7 +1398,7 @@ func traceIso(format string, argv ...interface{}) {
// //
// must be called with atMu rlocked. // must be called with atMu rlocked.
// //
// XXX error - when? or close watch on any error? // TODO close watch on any error
func (w *Watch) pin(ctx context.Context, blk int64, rev zodb.Tid) (err error) { func (w *Watch) pin(ctx context.Context, blk int64, rev zodb.Tid) (err error) {
defer xerr.Contextf(&err, "wlink%d: f<%s>", w.link.id, w.file.zfile.POid()) defer xerr.Contextf(&err, "wlink%d: f<%s>", w.link.id, w.file.zfile.POid())
return w._pin(ctx, blk, rev) return w._pin(ctx, blk, rev)
...@@ -1453,7 +1428,7 @@ func (w *Watch) _pin(ctx context.Context, blk int64, rev zodb.Tid) (err error) { ...@@ -1453,7 +1428,7 @@ func (w *Watch) _pin(ctx context.Context, blk int64, rev zodb.Tid) (err error) {
} }
w.pinnedMu.Unlock() w.pinnedMu.Unlock()
<-blkpin.ready // XXX + ctx ? (or just keep ready ?) <-blkpin.ready // TODO +ctx cancel
if blkpin.rev == rev { if blkpin.rev == rev {
// already pinned // already pinned
...@@ -1462,7 +1437,7 @@ func (w *Watch) _pin(ctx context.Context, blk int64, rev zodb.Tid) (err error) { ...@@ -1462,7 +1437,7 @@ func (w *Watch) _pin(ctx context.Context, blk int64, rev zodb.Tid) (err error) {
} }
// relock the watch and check that w.pinned[blk] is the same. Retry if it is not. // relock the watch and check that w.pinned[blk] is the same. Retry if it is not.
// ( w.pinned[blk] could have changed while w.mu was not held e.g. by XXX recheck // ( w.pinned[blk] could have changed while w.mu was not held e.g. by
// simultaneous setupWatch if we were called by readPinWatchers ) // simultaneous setupWatch if we were called by readPinWatchers )
w.pinnedMu.Lock() w.pinnedMu.Lock()
if blkpin == w.pinned[blk] { if blkpin == w.pinned[blk] {
...@@ -1518,29 +1493,21 @@ func (w *Watch) _pin(ctx context.Context, blk int64, rev zodb.Tid) (err error) { ...@@ -1518,29 +1493,21 @@ func (w *Watch) _pin(ctx context.Context, blk int64, rev zodb.Tid) (err error) {
// //
// See "7.2) for all registered client@at watchers ..." // See "7.2) for all registered client@at watchers ..."
// //
// Called with f.head.zheadMu rlocked. // Must be called only for f under head/
// Must be called with f.head.zheadMu rlocked.
// //
// XXX do we really need to use/propagate caller context here? ideally update // XXX do we really need to use/propagate caller context here? ideally update
// watchers should be synchronous, and in practice we just use 30s timeout. // watchers should be synchronous, and in practice we just use 30s timeout (TODO).
// Should a READ interrupt cause watch update failure? -> probably no // Should a READ interrupt cause watch update failure? -> probably no
func (f *BigFile) readPinWatchers(ctx context.Context, blk int64, treepath []btree.LONode, blkcov btree.LKeyRange, zblk ZBlk, blkrevMax zodb.Tid) { func (f *BigFile) readPinWatchers(ctx context.Context, blk int64, blkrevMax zodb.Tid) (err error) {
defer xerr.Context(&err, "pin watchers") // f.path and blk is already put into context by readBlk
// only head/ is being watched for // only head/ is being watched for
if f.head.rev != 0 { if f.head.rev != 0 {
return panic("BUG: readPinWatchers: called for file under @revX/")
} }
// fmt.Printf("S: read #%d -> pin watchers (#%d)\n", blk, len(f.watchTab)) //fmt.Printf("S: read #%d -> pin watchers (#%d)\n", blk, len(f.watchTab))
// update δFtail index XXX -> move upper into readBlk ?
// (δFtail is just for δZ -> δF invalidation handling and is needed without isolation protocol)
// XXX ^^^ no - also need to query to send pins
// see "3) for */head/data the following invariant is maintained..."
bfdir := f.head.bfdir
δFtail := bfdir.δFtail
bfdir.δFmu.Lock() // XXX locking correct? XXX -> better push down?
δFtail.Track(f.zfile, blk, treepath, blkcov, zblk) // XXX pass in zblk.rev here?
f.accessed.Add(blk)
bfdir.δFmu.Unlock()
// make sure that file[blk] on clients side stays as of @w.at state. // make sure that file[blk] on clients side stays as of @w.at state.
...@@ -1548,6 +1515,7 @@ func (f *BigFile) readPinWatchers(ctx context.Context, blk int64, treepath []btr ...@@ -1548,6 +1515,7 @@ func (f *BigFile) readPinWatchers(ctx context.Context, blk int64, treepath []btr
// This is likely to be the case, since most watchers should be usually close to head. // This is likely to be the case, since most watchers should be usually close to head.
// If using blkrevMax only turns out to be not sufficient, we'll // If using blkrevMax only turns out to be not sufficient, we'll
// consult δFtail, which might involve recomputing it. // consult δFtail, which might involve recomputing it.
δFtail := f.head.bfdir.δFtail
blkrev := blkrevMax blkrev := blkrevMax
blkrevRough := true blkrevRough := true
...@@ -1576,7 +1544,7 @@ func (f *BigFile) readPinWatchers(ctx context.Context, blk int64, treepath []btr ...@@ -1576,7 +1544,7 @@ func (f *BigFile) readPinWatchers(ctx context.Context, blk int64, treepath []btr
var err error var err error
blkrev, _, err = δFtail.BlkRevAt(ctx, f.zfile, blk, f.head.zconn.At()) blkrev, _, err = δFtail.BlkRevAt(ctx, f.zfile, blk, f.head.zconn.At())
if err != nil { if err != nil {
panic(err) // XXX return err
} }
blkrevRough = false blkrevRough = false
...@@ -1593,32 +1561,25 @@ func (f *BigFile) readPinWatchers(ctx context.Context, blk int64, treepath []btr ...@@ -1593,32 +1561,25 @@ func (f *BigFile) readPinWatchers(ctx context.Context, blk int64, treepath []btr
// and most of them would be on different w.at - cache of the file will // and most of them would be on different w.at - cache of the file will
// be lost. Via pinning to particular block revision, we make sure the // be lost. Via pinning to particular block revision, we make sure the
// revision to pin is the same on all clients, and so file cache is shared. // revision to pin is the same on all clients, and so file cache is shared.
pinrev, _, err := δFtail.BlkRevAt(ctx, w.file.zfile, blk, w.at) // XXX move into go?
// XXX ^^^ w.file vs f ?
if err != nil {
panic(err) // XXX
}
//fmt.Printf("S: read #%d: watch @%s: pin -> @%s\n", blk, w.at, pinrev)
wg.Go(func(ctx context.Context) error { wg.Go(func(ctx context.Context) error {
defer w.atMu.RUnlock() defer w.atMu.RUnlock()
// XXX close watcher on any error pinrev, _, err := δFtail.BlkRevAt(ctx, f.zfile, blk, w.at)
if err != nil {
return err
}
//fmt.Printf("S: read #%d: watch @%s: pin -> @%s\n", blk, w.at, pinrev)
// TODO close watcher on any error
return w.pin(ctx, blk, pinrev) return w.pin(ctx, blk, pinrev)
}) })
} }
f.watchMu.RUnlock() f.watchMu.RUnlock()
err := wg.Wait() return wg.Wait()
if err != nil {
panic(err) // XXX
}
} }
// setupWatch sets up or updates a Watch when client sends `watch <file> @<at>` request. // setupWatch sets up or updates a Watch when client sends `watch <file> @<at>` request.
// //
// XXX sends "pin" notifications; final "ok" must be sent by caller. // It sends "pin" notifications; final "ok" or "error" must be sent by caller.
//
// XXX called synchronously - only 1 setupWatch call at a time?
func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.Tid) (err error) { func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.Tid) (err error) {
defer xerr.Contextf(&err, "setup watch f<%s> @%s", foid, at) defer xerr.Contextf(&err, "setup watch f<%s> @%s", foid, at)
head := wlink.head head := wlink.head
...@@ -1638,7 +1599,6 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T ...@@ -1638,7 +1599,6 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T
defer head.zheadMu.RUnlock() defer head.zheadMu.RUnlock()
headAt := head.zconn.At() headAt := head.zconn.At()
// XXX δFtail locking? (or ForgetPast is called only with zheadMu.W ?)
if at != zodb.InvalidTid && at < bfdir.δFtail.Tail() { if at != zodb.InvalidTid && at < bfdir.δFtail.Tail() {
return fmt.Errorf("too far away back from head/at (@%s); δt = %s", return fmt.Errorf("too far away back from head/at (@%s); δt = %s",
headAt, headAt.Time().Sub(at.Time().Time)) headAt, headAt.Time().Sub(at.Time().Time))
...@@ -1686,7 +1646,7 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T ...@@ -1686,7 +1646,7 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T
w.atMu.Lock() w.atMu.Lock()
// check at >= w.at // check at >= w.at
// XXX we might want to allow going back in history if we need it. // TODO(?) we might want to allow going back in history if we need it.
if !(at >= w.at) { if !(at >= w.at) {
w.atMu.Unlock() w.atMu.Unlock()
f.watchMu.Unlock() f.watchMu.Unlock()
...@@ -1697,7 +1657,7 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T ...@@ -1697,7 +1657,7 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T
// register w to f early, so that READs going in parallel to us // register w to f early, so that READs going in parallel to us
// preparing and processing initial pins, also send pins to w for read // preparing and processing initial pins, also send pins to w for read
// blocks. If we don't, we can miss to send pin to w for a freshly read // blocks. If we don't, we can miss to send pin to w for a freshly read
// block which could have revision > w.at: XXX test // block which could have revision > w.at:
// //
// 1 3 2 4 // 1 3 2 4
// ─────.────x───o────x───x──────]────────── // ─────.────x───o────x───x──────]──────────
...@@ -1736,15 +1696,24 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T ...@@ -1736,15 +1696,24 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T
f.watchMu.Unlock() f.watchMu.Unlock()
wlink.byfileMu.Unlock() wlink.byfileMu.Unlock()
// XXX defer -> unregister watch if error? // TODO defer -> unregister watch if error
// pin all tracked file blocks that were changed in (at, head] range. // pin all tracked file blocks that were changed in (at, head] range.
toPin := map[int64]zodb.Tid{} // blk -> @rev toPin := map[int64]zodb.Tid{} // blk -> @rev
δFtail := bfdir.δFtail δFtail := bfdir.δFtail
vδf, err := δFtail.SliceByFileRev(f.zfile, at, headAt) // XXX locking δFtail vδf, err := δFtail.SliceByFileRevEx(f.zfile, at, headAt, zdata.QueryOptions{
// blk might be in δFtail because it is adjacent in
// ZBigFile.blktab to another blk that was explicitly tracked.
// We do not want to get those to avoid unnecessarily pinning
// potentially more blocks than needed.
//
// wcfs tests also verify that only blocks that were previously
// explicitly accessed are included into watch setup pins.
OnlyExplicitlyTracked: true,
})
if err != nil { if err != nil {
panic(err) // XXX return err
} }
for _, δfile := range vδf { for _, δfile := range vδf {
if δfile.Epoch { if δfile.Epoch {
...@@ -1767,24 +1736,10 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T ...@@ -1767,24 +1736,10 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T
if already { if already {
continue continue
} }
// blk might be in δFtail because it is adjacent in
// ZBigFile.blktab to another blk that was explicitly
// tracked. However wcfs tests expect that only blocks
// that were previously explicitly accessed are
// included into watch setup pins.
//
// XXX adjust wcfs tests to not require only accessed
// blocks to be in setup pins? But that would mean that
// potentially more blocks would be potentially
// _unnecessarily_ pinned if they are not going to be
// accessed at all.
if !f.accessed.Has(blk) {
continue
}
toPin[blk], _, err = δFtail.BlkRevAt(ctx, f.zfile, blk, at) toPin[blk], _, err = δFtail.BlkRevAt(ctx, f.zfile, blk, at)
if err != nil { if err != nil {
panic(err) // XXX return err
} }
} }
} }
...@@ -1792,7 +1747,6 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T ...@@ -1792,7 +1747,6 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T
// if a block was previously pinned, but ∉ δ(at, head] -> unpin it to head. // if a block was previously pinned, but ∉ δ(at, head] -> unpin it to head.
for blk, pinPrev := range w.pinned { for blk, pinPrev := range w.pinned {
// only 1 setupWatch can be run simultaneously for one file // only 1 setupWatch can be run simultaneously for one file
// XXX assert pinPrev.rev != zodb.TidMax
pinNew, pinning := toPin[blk] pinNew, pinning := toPin[blk]
if !pinning { if !pinning {
...@@ -1808,7 +1762,7 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T ...@@ -1808,7 +1762,7 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T
} }
// downgrade atMu.W -> atMu.R to let other clients to access the file. // downgrade atMu.W -> atMu.R to let other clients to access the file.
// XXX there is no primitive to do Wlock->Rlock atomically, but we are // NOTE there is no primitive to do Wlock->Rlock atomically, but we are
// ok with that since we prepared everything to handle simultaneous pins // ok with that since we prepared everything to handle simultaneous pins
// from other reads. // from other reads.
w.atMu.Unlock() w.atMu.Unlock()
...@@ -1833,7 +1787,7 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T ...@@ -1833,7 +1787,7 @@ func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.T
// Open serves /head/watch opens. // Open serves /head/watch opens.
func (wnode *WatchNode) Open(flags uint32, fctx *fuse.Context) (nodefs.File, fuse.Status) { func (wnode *WatchNode) Open(flags uint32, fctx *fuse.Context) (nodefs.File, fuse.Status) {
// XXX check flags? // TODO(?) check flags
head := wnode.head head := wnode.head
wlink := &WatchLink{ wlink := &WatchLink{
...@@ -1857,7 +1811,6 @@ func (wnode *WatchNode) Open(flags uint32, fctx *fuse.Context) (nodefs.File, fus ...@@ -1857,7 +1811,6 @@ func (wnode *WatchNode) Open(flags uint32, fctx *fuse.Context) (nodefs.File, fus
// wcfs initiated pin requests. // wcfs initiated pin requests.
func (wlink *WatchLink) serve() { func (wlink *WatchLink) serve() {
err := wlink._serve() err := wlink._serve()
// XXX log error if !(close || EOF)
if err != nil { if err != nil {
log.Error(err) log.Error(err)
} }
...@@ -1871,7 +1824,7 @@ func (wlink *WatchLink) serve() { ...@@ -1871,7 +1824,7 @@ func (wlink *WatchLink) serve() {
func (wlink *WatchLink) _serve() (err error) { func (wlink *WatchLink) _serve() (err error) {
defer xerr.Contextf(&err, "wlink %d: serve rx", wlink.id) defer xerr.Contextf(&err, "wlink %d: serve rx", wlink.id)
ctx0 := context.TODO() // XXX ctx = ? -> merge(ctx of wcfs running, ctx of wlink timeout) ctx0 := context.TODO() // TODO ctx = merge(ctx of wcfs running, ctx of wlink timeout)
ctx, cancel := context.WithCancel(ctx0) ctx, cancel := context.WithCancel(ctx0)
wg := xsync.NewWorkGroup(ctx) wg := xsync.NewWorkGroup(ctx)
...@@ -1934,10 +1887,8 @@ func (wlink *WatchLink) _serve() (err error) { ...@@ -1934,10 +1887,8 @@ func (wlink *WatchLink) _serve() (err error) {
return e return e
}) })
// XXX recheck that it is safe to handle multiple simultaneous watch requests.
for { for {
l, err := r.ReadString('\n') // XXX limit accepted line len to prevent DOS l, err := r.ReadString('\n') // TODO limit accepted line len to prevent DOS
if err != nil { if err != nil {
// r.Read is woken up by sk.CloseRead when serve decides to exit // r.Read is woken up by sk.CloseRead when serve decides to exit
if err == io.ErrClosedPipe || err == io.EOF { if err == io.ErrClosedPipe || err == io.EOF {
...@@ -1969,7 +1920,7 @@ func (wlink *WatchLink) _serve() (err error) { ...@@ -1969,7 +1920,7 @@ func (wlink *WatchLink) _serve() (err error) {
// client-initiated request // client-initiated request
// bye TODO document in "Isolation protocol" // bye
if msg == "bye" { if msg == "bye" {
return nil // deferred sk.Close will wake-up rx on client side return nil // deferred sk.Close will wake-up rx on client side
} }
...@@ -2011,16 +1962,37 @@ func (wlink *WatchLink) _handleWatch(ctx context.Context, msg string) error { ...@@ -2011,16 +1962,37 @@ func (wlink *WatchLink) _handleWatch(ctx context.Context, msg string) error {
// sendReq sends wcfs-originated request to client and returns client response. // sendReq sends wcfs-originated request to client and returns client response.
func (wlink *WatchLink) sendReq(ctx context.Context, req string) (reply string, err error) { func (wlink *WatchLink) sendReq(ctx context.Context, req string) (reply string, err error) {
// XXX err ctx defer xerr.Context(&err, "sendReq") // wlink is already put into ctx by caller
var stream uint64 var stream uint64
for stream == 0 { for stream == 0 {
stream = atomic.AddUint64(&wlink.reqNext, +2) stream = atomic.AddUint64(&wlink.reqNext, +2)
} }
rxq := make(chan string) // XXX cap=1? (so that if we return canceled we do not block client) rxq := make(chan string, 1)
wlink.rxMu.Lock() wlink.rxMu.Lock()
wlink.rxTab[stream] = rxq // XXX assert .stream is not there? _, already := wlink.rxTab[stream]
if !already {
wlink.rxTab[stream] = rxq
}
wlink.rxMu.Unlock() wlink.rxMu.Unlock()
if already {
panic("BUG: to-be-sent stream is present in rxtab")
}
defer func() {
if err != nil {
// remove rxq from rxTab
// ( _serve could have already deleted it if unexpected
// reply came to the stream, but no other rxq should
// have registered on the [stream] slot )
wlink.rxMu.Lock()
delete(wlink.rxTab, stream)
wlink.rxMu.Unlock()
// no need to drain rxq - it was created with cap=1
}
}()
err = wlink.send(ctx, stream, req) err = wlink.send(ctx, stream, req)
if err != nil { if err != nil {
...@@ -2029,7 +2001,6 @@ func (wlink *WatchLink) sendReq(ctx context.Context, req string) (reply string, ...@@ -2029,7 +2001,6 @@ func (wlink *WatchLink) sendReq(ctx context.Context, req string) (reply string,
select { select {
case <-ctx.Done(): case <-ctx.Done():
// XXX del rxTab[stream] ?
return "", ctx.Err() return "", ctx.Err()
case reply = <-rxq: case reply = <-rxq:
...@@ -2040,16 +2011,20 @@ func (wlink *WatchLink) sendReq(ctx context.Context, req string) (reply string, ...@@ -2040,16 +2011,20 @@ func (wlink *WatchLink) sendReq(ctx context.Context, req string) (reply string,
// send sends a message to client over specified stream ID. // send sends a message to client over specified stream ID.
// //
// Multiple send can be called simultaneously; send serializes writes. // Multiple send can be called simultaneously; send serializes writes.
func (wlink *WatchLink) send(ctx context.Context, stream uint64, msg string) error { func (wlink *WatchLink) send(ctx context.Context, stream uint64, msg string) (err error) {
// XXX err ctx defer xerr.Contextf(&err, "send .%d", stream) // wlink is already put into ctx by caller
// XXX assert '\n' not in msg
// assert '\n' not in msg
if strings.ContainsRune(msg, '\n') {
panicf("BUG: msg contains \\n ; msg: %q", msg)
}
wlink.txMu.Lock() wlink.txMu.Lock()
defer wlink.txMu.Unlock() defer wlink.txMu.Unlock()
pkt := []byte(fmt.Sprintf("%d %s\n", stream, msg)) pkt := []byte(fmt.Sprintf("%d %s\n", stream, msg))
traceIso("S: wlink%d: tx: %q\n", wlink.id, pkt) traceIso("S: wlink%d: tx: %q\n", wlink.id, pkt)
_, err := wlink.sk.Write(ctx, pkt) _, err = wlink.sk.Write(ctx, pkt)
if err != nil { if err != nil {
return err return err
} }
...@@ -2079,7 +2054,7 @@ func (bfdir *BigFileDir) lookup(out *fuse.Attr, name string, fctx *fuse.Context) ...@@ -2079,7 +2054,7 @@ func (bfdir *BigFileDir) lookup(out *fuse.Attr, name string, fctx *fuse.Context)
return nil, eINVALf("not oid") return nil, eINVALf("not oid")
} }
bfdir.head.zheadMu.RLock() // XXX +fctx -> cancel bfdir.head.zheadMu.RLock() // TODO +fctx -> cancel
defer bfdir.head.zheadMu.RUnlock() defer bfdir.head.zheadMu.RUnlock()
defer func() { defer func() {
...@@ -2098,7 +2073,7 @@ func (bfdir *BigFileDir) lookup(out *fuse.Attr, name string, fctx *fuse.Context) ...@@ -2098,7 +2073,7 @@ func (bfdir *BigFileDir) lookup(out *fuse.Attr, name string, fctx *fuse.Context)
} }
// not there - without bfdir lock proceed to open BigFile from ZODB // not there - without bfdir lock proceed to open BigFile from ZODB
f, err = bfdir.head.bigopen(fctx, oid) f, err = bfdir.head.bigfopen(fctx, oid)
if err != nil { if err != nil {
return nil, err return nil, err
} }
...@@ -2109,7 +2084,7 @@ func (bfdir *BigFileDir) lookup(out *fuse.Attr, name string, fctx *fuse.Context) ...@@ -2109,7 +2084,7 @@ func (bfdir *BigFileDir) lookup(out *fuse.Attr, name string, fctx *fuse.Context)
f2, already := bfdir.fileTab[oid] f2, already := bfdir.fileTab[oid]
if already { if already {
bfdir.fileMu.Unlock() bfdir.fileMu.Unlock()
f.Close() // f.Close() not needed - BigFile is all just garbage-collected
return f2, nil return f2, nil
} }
...@@ -2158,7 +2133,6 @@ func (root *Root) lookup(name string, fctx *fuse.Context) (_ *Head, err error) { ...@@ -2158,7 +2133,6 @@ func (root *Root) lookup(name string, fctx *fuse.Context) (_ *Head, err error) {
} }
// not there - without revMu lock proceed to open @rev view of ZODB // not there - without revMu lock proceed to open @rev view of ZODB
// zconnRev, err := root.zopenAt(fctx, rev)
zconnRev, err := xzodb.ZOpen(fctx, root.zdb, &zodb.ConnOptions{At: rev}) zconnRev, err := xzodb.ZOpen(fctx, root.zdb, &zodb.ConnOptions{At: rev})
if err != nil { if err != nil {
return nil, err return nil, err
...@@ -2177,17 +2151,17 @@ func (root *Root) lookup(name string, fctx *fuse.Context) (_ *Head, err error) { ...@@ -2177,17 +2151,17 @@ func (root *Root) lookup(name string, fctx *fuse.Context) (_ *Head, err error) {
} }
revDir = &Head{ revDir = &Head{
// XXX how to test forgets: // TODO how to test forgets:
// echo 2 >/proc/sys/vm/drop_caches (root) // echo 2 >/proc/sys/vm/drop_caches (root)
// mount -i -oremount $mntpt (root ?) (shrinks dcache) // mount -i -oremount $mntpt (root ?) (shrinks dcache)
// notify invalidate dentry from inside fs // notify invalidate dentry from inside fs
fsNode: newFSNode(&fsOptions{Sticky: false}), // XXX + Head.OnForget() -> del root.revTab[] fsNode: newFSNode(&fsOptions{Sticky: false}), // TODO + Head.OnForget() -> del root.revTab[]
rev: rev, rev: rev,
zconn: zconnRev, // XXX + Head.OnForget() -> release zconn (= abort zconn.TxnCtx) zconn: zconnRev, // TODO + Head.OnForget() -> release zconn (= abort zconn.TxnCtx)
} }
bfdir := &BigFileDir{ bfdir := &BigFileDir{
fsNode: newFSNode(&fsOptions{Sticky: false}), // XXX + BigFileDir.OnForget() fsNode: newFSNode(&fsOptions{Sticky: false}), // TODO + BigFileDir.OnForget()
head: revDir, head: revDir,
fileTab: make(map[zodb.Oid]*BigFile), fileTab: make(map[zodb.Oid]*BigFile),
δFtail: nil, // δFtail not needed/used for @revX/ δFtail: nil, // δFtail not needed/used for @revX/
...@@ -2200,22 +2174,21 @@ func (root *Root) lookup(name string, fctx *fuse.Context) (_ *Head, err error) { ...@@ -2200,22 +2174,21 @@ func (root *Root) lookup(name string, fctx *fuse.Context) (_ *Head, err error) {
// mkdir takes filesystem treeLock - do it outside revMu. // mkdir takes filesystem treeLock - do it outside revMu.
mkdir(root, name, revDir) mkdir(root, name, revDir)
mkdir(revDir, "bigfile", bfdir) mkdir(revDir, "bigfile", bfdir)
// XXX + "at"
return revDir, nil return revDir, nil
} }
// bigopen opens BigFile corresponding to oid on head.zconn. // bigfopen opens BigFile corresponding to oid on head.zconn.
// //
// A ZBigFile corresponding to oid is activated and statted. // A ZBigFile corresponding to oid is activated and statted.
// //
// head.zconn must be locked. // head.zheadMu must be locked.
func (head *Head) bigopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err error) { func (head *Head) bigfopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err error) {
zconn := head.zconn zconn := head.zconn
defer xerr.Contextf(&err, "bigopen %s @%s", oid, zconn.At()) defer xerr.Contextf(&err, "bigfopen %s @%s", oid, zconn.At())
// XXX better ctx = transaction.PutIntoContext(ctx, txn) // TODO better ctx = transaction.PutIntoContext(ctx, txn)
ctx, cancel := xcontext.Merge(ctx, zconn.TxnCtx) ctx, cancel := xcontext.Merge(ctx, zconn.TxnCtx)
defer cancel() defer cancel()
...@@ -2225,7 +2198,7 @@ func (head *Head) bigopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err er ...@@ -2225,7 +2198,7 @@ func (head *Head) bigopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err er
case *zodb.NoObjectError: case *zodb.NoObjectError:
return nil, eINVAL(err) return nil, eINVAL(err)
case *zodb.NoDataError: case *zodb.NoDataError:
return nil, eINVAL(err) // XXX what to do if it was existing and got deleted? return nil, eINVAL(err)
default: default:
return nil, err return nil, err
} }
...@@ -2242,12 +2215,10 @@ func (head *Head) bigopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err er ...@@ -2242,12 +2215,10 @@ func (head *Head) bigopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err er
return nil, err return nil, err
} }
blksize := zfile.BlkSize() blksize := zfile.BlkSize()
// XXX it should be revision of both ZBigFile and its data. But we // NOTE file revision should be revision of both ZBigFile and its data. But we
// cannot get data revision without expensive scan of all ZBigFile's objects. // cannot get data revision without expensive scan of all ZBigFile's objects.
// -> approximate mtime initially with ZBigFile object mtime. // -> approximate mtime initially with ZBigFile object mtime.
// revApprox := zfile.PSerial()
// XXX for @rev/... we can know initial mtime more exactly?
rev := zfile.PSerial()
zfile.PDeactivate() zfile.PDeactivate()
size, sizePath, blkCov, err := zfile.Size(ctx) size, sizePath, blkCov, err := zfile.Size(ctx)
...@@ -2256,50 +2227,32 @@ func (head *Head) bigopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err er ...@@ -2256,50 +2227,32 @@ func (head *Head) bigopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err er
} }
f := &BigFile{ f := &BigFile{
fsNode: newFSNode(&fsOptions{Sticky: false}), // XXX + BigFile.OnForget -> del .head.bfdir.fileTab[] fsNode: newFSNode(&fsOptions{Sticky: false}), // TODO + BigFile.OnForget -> del .head.bfdir.fileTab[]
head: head, head: head,
zfile: zfile, zfile: zfile,
blksize: blksize, blksize: blksize,
size: size, size: size,
rev: rev, revApprox: revApprox,
loading: make(map[int64]*blkLoadState), loading: make(map[int64]*blkLoadState),
} }
// only head/ needs δFtail, f.δtail and watches. // only head/ needs δFtail and watches.
if head.rev == 0 { if head.rev == 0 {
// see "3) for */head/data the following invariant is maintained..." // see "3) for */head/data the following invariant is maintained..."
head.bfdir.δFmu.Lock() // XXX locking ok?
head.bfdir.δFtail.Track(f.zfile, -1, sizePath, blkCov, nil) head.bfdir.δFtail.Track(f.zfile, -1, sizePath, blkCov, nil)
head.bfdir.δFmu.Unlock()
// FIXME: scan zfile.blktab - so that we can detect all btree changes
// see "XXX building δFtail lazily ..." in notes.txt
f.accessed = make(setI64)
f.watchTab = make(map[*Watch]struct{}) f.watchTab = make(map[*Watch]struct{})
} }
return f, nil return f, nil
} }
// Close release all resources of BigFile. XXX needed?
func (f *BigFile) Close() error {
// XXX locking?
f.zfile = nil
// f.zconn.Release()
// f.zconn = nil
f.head = nil
return nil
}
// ---- misc --- // ---- misc ---
// /(head|<rev>)/at -> readAt serves read. // /(head|<rev>)/at -> readAt serves read.
func (h *Head) readAt(fctx *fuse.Context) ([]byte, error) { func (h *Head) readAt(fctx *fuse.Context) ([]byte, error) {
// XXX cancel on fctx cancel h.zheadMu.RLock() // TODO +fctx -> cancel
h.zheadMu.RLock()
defer h.zheadMu.RUnlock() defer h.zheadMu.RUnlock()
return []byte(h.zconn.At().String()), nil return []byte(h.zconn.At().String()), nil
...@@ -2309,7 +2262,7 @@ func (h *Head) readAt(fctx *fuse.Context) ([]byte, error) { ...@@ -2309,7 +2262,7 @@ func (h *Head) readAt(fctx *fuse.Context) ([]byte, error) {
func (head *Head) GetAttr(out *fuse.Attr, _ nodefs.File, fctx *fuse.Context) fuse.Status { func (head *Head) GetAttr(out *fuse.Attr, _ nodefs.File, fctx *fuse.Context) fuse.Status {
at := head.rev at := head.rev
if at == 0 { if at == 0 {
head.zheadMu.RLock() // XXX +fctx -> cancel head.zheadMu.RLock() // TODO +fctx -> cancel
at = head.zconn.At() at = head.zconn.At()
head.zheadMu.RUnlock() head.zheadMu.RUnlock()
} }
...@@ -2322,7 +2275,7 @@ func (head *Head) GetAttr(out *fuse.Attr, _ nodefs.File, fctx *fuse.Context) fus ...@@ -2322,7 +2275,7 @@ func (head *Head) GetAttr(out *fuse.Attr, _ nodefs.File, fctx *fuse.Context) fus
// /(head|<rev>)/bigfile/<bigfileX> -> Getattr serves stat. // /(head|<rev>)/bigfile/<bigfileX> -> Getattr serves stat.
func (f *BigFile) GetAttr(out *fuse.Attr, _ nodefs.File, fctx *fuse.Context) fuse.Status { func (f *BigFile) GetAttr(out *fuse.Attr, _ nodefs.File, fctx *fuse.Context) fuse.Status {
f.head.zheadMu.RLock() // XXX +fctx -> cancel f.head.zheadMu.RLock() // TODO +fctx -> cancel
defer f.head.zheadMu.RUnlock() defer f.head.zheadMu.RUnlock()
f.getattr(out) f.getattr(out)
...@@ -2332,10 +2285,10 @@ func (f *BigFile) GetAttr(out *fuse.Attr, _ nodefs.File, fctx *fuse.Context) fus ...@@ -2332,10 +2285,10 @@ func (f *BigFile) GetAttr(out *fuse.Attr, _ nodefs.File, fctx *fuse.Context) fus
func (f *BigFile) getattr(out *fuse.Attr) { func (f *BigFile) getattr(out *fuse.Attr) {
out.Mode = fuse.S_IFREG | 0444 out.Mode = fuse.S_IFREG | 0444
out.Size = uint64(f.size) out.Size = uint64(f.size)
out.Blksize = uint32(f.blksize) // XXX 64 -> 32 out.Blksize = uint32(f.blksize) // NOTE truncating 64 -> 32
// .Blocks // .Blocks
mtime := f.rev.Time().Time mtime := f.revApprox.Time().Time
out.SetTimes(/*atime=*/nil, /*mtime=*/&mtime, /*ctime=*/&mtime) out.SetTimes(/*atime=*/nil, /*mtime=*/&mtime, /*ctime=*/&mtime)
} }
...@@ -2377,14 +2330,14 @@ type _wcfs_Zhead struct { ...@@ -2377,14 +2330,14 @@ type _wcfs_Zhead struct {
} }
func (zh *_wcfs_Zhead) Open(flags uint32, fctx *fuse.Context) (nodefs.File, fuse.Status) { func (zh *_wcfs_Zhead) Open(flags uint32, fctx *fuse.Context) (nodefs.File, fuse.Status) {
// XXX check flags? // TODO(?) check flags
sk := NewFileSock() sk := NewFileSock()
sk.CloseRead() sk.CloseRead()
groot.head.zheadMu.Lock() // XXX +fctx -> cancel groot.head.zheadMu.Lock() // TODO +fctx -> cancel
defer groot.head.zheadMu.Unlock() defer groot.head.zheadMu.Unlock()
// XXX del zheadSockTab[sk] on sk.File.Release (= client drops opened handle) // TODO del zheadSockTab[sk] on sk.File.Release (= client drops opened handle)
gdebug.zheadSockTab[sk] = struct{}{} gdebug.zheadSockTab[sk] = struct{}{}
return sk.File(), fuse.OK return sk.File(), fuse.OK
} }
...@@ -2392,8 +2345,8 @@ func (zh *_wcfs_Zhead) Open(flags uint32, fctx *fuse.Context) (nodefs.File, fuse ...@@ -2392,8 +2345,8 @@ func (zh *_wcfs_Zhead) Open(flags uint32, fctx *fuse.Context) (nodefs.File, fuse
// TODO -> enable/disable fuse debugging dynamically (by write to .wcfs/debug ?) // TODO -> enable/disable fuse debugging dynamically (by write to .wcfs/debug ?)
func main() { func main() {
stdlog.SetPrefix("wcfs: ") //stdlog.SetPrefix("wcfs: ") NOTE conflicts with log.CopyStandardLogTo
//log.CopyStandardLogTo("WARNING") // XXX -> "DEBUG" if -d ? log.CopyStandardLogTo("WARNING")
defer log.Flush() defer log.Flush()
err := _main() err := _main()
...@@ -2405,8 +2358,6 @@ func main() { ...@@ -2405,8 +2358,6 @@ func main() {
func _main() (err error) { func _main() (err error) {
debug := flag.Bool("d", false, "debug") debug := flag.Bool("d", false, "debug")
autoexit := flag.Bool("autoexit", false, "automatically stop service when there is no client activity") autoexit := flag.Bool("autoexit", false, "automatically stop service when there is no client activity")
// XXX option to prevent starting if wcfs was already started/mounted on mntpt ?
// XXX do the check unconditionally?
flag.Parse() flag.Parse()
if len(flag.Args()) != 2 { if len(flag.Args()) != 2 {
...@@ -2420,7 +2371,7 @@ func _main() (err error) { ...@@ -2420,7 +2371,7 @@ func _main() (err error) {
err = xerr.First(err, c.Close()) err = xerr.First(err, c.Close())
} }
// debug -> precise t, no dates (XXX -> always precise t?) // debug -> precise t, no dates (TODO(?) -> always precise t?)
if *debug { if *debug {
stdlog.SetFlags(stdlog.Lmicroseconds) stdlog.SetFlags(stdlog.Lmicroseconds)
} }
...@@ -2434,7 +2385,7 @@ func _main() (err error) { ...@@ -2434,7 +2385,7 @@ func _main() (err error) {
log.Info(gover) log.Info(gover)
// open zodb storage/watch/db/connection // open zodb storage/watch/db/connection
ctx := context.Background() // XXX + timeout? ctx := context.Background() // TODO(?) + timeout?
zstor, err := zodb.Open(ctx, zurl, &zodb.OpenOptions{ zstor, err := zodb.Open(ctx, zurl, &zodb.OpenOptions{
ReadOnly: true, ReadOnly: true,
}) })
...@@ -2505,7 +2456,7 @@ func _main() (err error) { ...@@ -2505,7 +2456,7 @@ func _main() (err error) {
// TODO -> teach go-fuse to handle Init.MaxPages (Linux 4.20+). // TODO -> teach go-fuse to handle Init.MaxPages (Linux 4.20+).
MaxWrite: 2*1024*1024, MaxWrite: 2*1024*1024,
// XXX tune MaxReadAhead? MaxBackground? // TODO(?) tune MaxReadAhead? MaxBackground?
// OS cache that we populate with bigfile data is precious; // OS cache that we populate with bigfile data is precious;
// we explicitly propagate ZODB invalidations into file invalidations. // we explicitly propagate ZODB invalidations into file invalidations.
...@@ -2519,8 +2470,8 @@ func _main() (err error) { ...@@ -2519,8 +2470,8 @@ func _main() (err error) {
if err != nil { if err != nil {
return err return err
} }
groot = root // FIXME temp workaround (see ^^^) groot = root // FIXME temp workaround (see ^^^)
gfsconn = fsconn // FIXME ----//---- gfsconn = fsconn // FIXME ----//----
gmntpt = mntpt gmntpt = mntpt
// we require proper pagecache control (added to Linux 2.6.36 in 2010) // we require proper pagecache control (added to Linux 2.6.36 in 2010)
...@@ -2544,7 +2495,7 @@ func _main() (err error) { ...@@ -2544,7 +2495,7 @@ func _main() (err error) {
// add entries to / // add entries to /
mkdir(root, "head", head) mkdir(root, "head", head)
mkdir(head, "bigfile", bfdir) mkdir(head, "bigfile", bfdir)
mkfile(head, "at", NewSmallFile(head.readAt)) // TODO mtime(at) = tidtime(at) mkfile(head, "at", NewSmallFile(head.readAt)) // TODO mtime(at) = tidtime(at)
mkfile(head, "watch", wnode) mkfile(head, "watch", wnode)
// for debugging/testing // for debugging/testing
...@@ -2594,12 +2545,13 @@ func _main() (err error) { ...@@ -2594,12 +2545,13 @@ func _main() (err error) {
if errors.Cause(err) != context.Canceled { if errors.Cause(err) != context.Canceled {
log.Error(err) log.Error(err)
log.Errorf("zwatcher failed -> switching filesystem to EIO mode (TODO)") log.Errorf("zwatcher failed -> switching filesystem to EIO mode (TODO)")
//panic("TODO: switch fs to EIO mode") // XXX // TODO: switch fs to EIO mode
} }
// wait for unmount // wait for unmount
// XXX the kernel does not send FORGETs on unmount - release left node resources ourselves? // NOTE the kernel does not send FORGETs on unmount - but we don't need
// to release left node resources ourselves, because it is just memory.
<-serveCtx.Done() <-serveCtx.Done()
log.Infof("stop %q %q", mntpt, zurl) log.Infof("stop %q %q", mntpt, zurl)
return nil // XXX serveErr | zwatchErr ? return nil
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment