Commit bf9a7405 authored by Kirill Smelkov

X No longer rely on ZODB cache invariant for invalidations

Move all aspects of what is tracked into ΔFtail.
It is more robust if ΔFtail does not rely on properties of the outside
zconn.Cache in order to work correctly.
It also potentially allows moving ΔFtail out of the zdata package.

And it is even more memory efficient, as before it was

	Connection.cache[zoid] -> ZBlk(ghost)->inΔFtail

and now it is

	ΔFtail.trackSetZBlk[zoid] -> zblkTrack

because the Persistent embedded in ZBlk, even in ghost state, alone
occupies ~ 16 words.

Now we keep only what actually needs to be kept for tracking.
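
To make the layout change concrete, here is a condensed sketch of both
variants, assembled from the diff below (not verbatim source; SetOid and
SetI64 are the set types used throughout wcfs):

    // before: tracking state was embedded into every ZBlk*, so keeping it
    // alive meant keeping the whole ZBlk object (with its Persistent header
    // and a mutex) pinned in the ZODB live cache, even for ghosts
    type zblkInΔFtail struct {
        mu     sync.Mutex          // to support multiple concurrent loaders
        infile map[zodb.Oid]SetI64 // {} foid -> set(#blk)
    }

    // after: only the tracking payload is kept, owned by ΔFtail and keyed by oid
    type zblkTrack struct {
        infile map[zodb.Oid]SetI64 // {} foid -> {}blk
    }

    type ΔFtail struct {
        ...
        trackSetZFile SetOid                  // set of tracked ZBigFiles as of @head
        trackSetZBlk  map[zodb.Oid]*zblkTrack // zblk -> {} root -> {}blk as of @head
    }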
parent ef74aebc
@@ -60,7 +60,6 @@ import (
 // ZBlk is the interface that every ZBlk* block implements.
 type ZBlk interface {
     zodb.IPersistent
-    _ZBlkInΔFtail

     // LoadBlkData loads from database and returns data block stored by this ZBlk.
     //
@@ -79,7 +78,6 @@ var _ ZBlk = (*ZBlk1)(nil)
 // ZBlk0 mimics ZBlk0 from python.
 type ZBlk0 struct {
     zodb.Persistent
-    zblkInΔFtail

     // NOTE py source uses bytes(buf) but on python2 it still results in str
     blkdata string
@@ -157,7 +155,6 @@ func (zd *zDataState) PySetState(pystate interface{}) error {
 // ZBlk1 mimics ZBlk1 from python.
 type ZBlk1 struct {
     zodb.Persistent
-    zblkInΔFtail

     chunktab *btree.IOBTree // {} offset -> ZData(chunk)
 }
...
@@ -22,8 +22,6 @@ package zdata
 import (
     "context"
     "fmt"
-    "runtime"
-    "sync"

     "lab.nexedi.com/kirr/go123/xerr"
     "lab.nexedi.com/kirr/neo/go/zodb"
@@ -79,6 +77,8 @@ type ΔFtail struct {
     δBtail  *xbtree.ΔBtail
     fileIdx map[zodb.Oid]SetOid // tree-root -> {} ZBigFile<oid> as of @head

+    trackSetZFile SetOid                  // set of tracked ZBigFiles as of @head
+    trackSetZBlk  map[zodb.Oid]*zblkTrack // zblk -> {} root -> {}blk as of @head
     // XXX kill
     ///*
     // XXX don't need vδF - everything is reconstructed at runtime from .δBtail.vδT
@@ -89,10 +89,16 @@ type ΔFtail struct {
     // tracked ZBlk that are not yet taken into account in current vδF.
     // grows on new track requests; flushes on queries and update.
-    trackNew map[zodb.Oid]map[zodb.Oid]*zblkInΔFtail // {} foid -> {} zoid -> zblk
+    trackNew map[zodb.Oid]map[zodb.Oid]*zblkTrack // {} foid -> {} zoid -> zblk
     //*/
 }

+// zblkTrack keeps information in which root/blocks ZBlk is present as of @head.
+type zblkTrack struct {
+    // inroot map[zodb.Oid]SetI64 // {} root -> {}blk  XXX later switch to this
+    infile map[zodb.Oid]SetI64 // {} foid -> {}blk
+}
+
 // ΔF represents a change in files space.
 type ΔF struct {
     Rev zodb.Tid
@@ -106,24 +112,6 @@ type ΔFile struct {
     Size bool // whether file size changed  XXX -> ΔSize?
 }

-// zblkInΔFtail is ΔFtail-related volatile data embedded into ZBlk*.
-//
-// The data is preserved even when ZBlk comes to ghost state, but is lost if
-// ZBlk is garbage collected. The data is transient - it is _not_ included into
-// persistent state.
-type zblkInΔFtail struct {
-    mu sync.Mutex // to support multiple concurrent loaders
-
-    // XXX change vvv to intree_parent? {} Bucket -> set(#blk)
-    // (this is uniform with building in-RAM reverse child->parents relation for
-    //  tree nodes and for tree_root->file)
-
-    // with which files/blocks this ZBlk is associated with as of @head state
-    infile map[zodb.Oid]SetI64 // {} foid -> set(#blk)
-}
-
-type _ZBlkInΔFtail interface { inΔFtail() *zblkInΔFtail }
-
-func (z *zblkInΔFtail) inΔFtail() *zblkInΔFtail { return z }

 // NewΔFtail creates new ΔFtail object.
 //
@@ -135,8 +123,10 @@ func (z *zblkInΔFtail) inΔFtail() *zblkInΔFtail { return z }
 func NewΔFtail(at0 zodb.Tid, db *zodb.DB) *ΔFtail {
     return &ΔFtail{
         δBtail:   xbtree.NewΔBtail(at0, db),
-        fileIdx:  make(map[zodb.Oid]SetOid),
-        trackNew: make(map[zodb.Oid]map[zodb.Oid]*zblkInΔFtail),
+        fileIdx:       map[zodb.Oid]SetOid{},
+        trackSetZFile: SetOid{},
+        trackSetZBlk:  map[zodb.Oid]*zblkTrack{},
+        trackNew:      map[zodb.Oid]map[zodb.Oid]*zblkTrack{},
     }
 }
@@ -158,6 +148,8 @@ func (δFtail *ΔFtail) Tail() zodb.Tid { return δFtail.δBtail.Tail() }
 //
 // A root can be associated with several files (each provided on different Track call).
 func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, zblk ZBlk) {
+    // XXX locking
+
     foid := file.POid()
     if blk == -1 {
         // XXX blk = ∞ from beginning ?
@@ -167,6 +159,7 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, zb
     if err != nil {
         panic(err) // XXX -> error? errctx
     }
+
     root := path[0].(*btree.LOBTree)
     files, ok := δFtail.fileIdx[root.POid()]
     if !ok {
@@ -175,30 +168,35 @@ func (δFtail *ΔFtail) Track(file *ZBigFile, blk int64, path []btree.LONode, zb
     }
     files.Add(foid)
+    δFtail.trackSetZFile.Add(foid)
+
     // associate zblk with file, if it was not hole
     if zblk != nil {
-        z := zblk.inΔFtail()
-        z.mu.Lock()
-        blocks, ok := z.infile[foid]
+        zoid := zblk.POid()
+        zt, ok := δFtail.trackSetZBlk[zoid]
+        if !ok {
+            zt = &zblkTrack{}
+            δFtail.trackSetZBlk[zoid] = zt
+        }
+
+        blocks, ok := zt.infile[foid]
         if !ok {
             blocks = make(SetI64, 1)
-            if z.infile == nil {
-                z.infile = make(map[zodb.Oid]SetI64)
+            if zt.infile == nil {
+                zt.infile = make(map[zodb.Oid]SetI64)
             }
-            z.infile[foid] = blocks
+            zt.infile[foid] = blocks
         }
         blocks.Add(blk)
-        z.mu.Unlock()

-        // XXX locking
         if !ok {
             // zblk was not associated with this file
-            zt := δFtail.trackNew[foid]
-            if zt == nil {
-                zt = make(map[zodb.Oid]*zblkInΔFtail, 1)
-                δFtail.trackNew[foid] = zt
+            ft := δFtail.trackNew[foid]
+            if ft == nil {
+                ft = make(map[zodb.Oid]*zblkTrack, 1)
+                δFtail.trackNew[foid] = ft
             }
-            zt[zblk.POid()] = z
+            ft[zoid] = zt
         }
     }
@@ -227,6 +225,7 @@ func (δFtail *ΔFtail) Update(δZ *zodb.EventCommit, zhead *xzodb.ZConn) (_ ΔF
     // XXX δFtail.update() first?
     // XXX verify zhead.At() == δFtail.Head()
+    // XXX locking

     δB, err := δFtail.δBtail.Update(δZ)
     if err != nil {
@@ -263,42 +262,29 @@ func (δFtail *ΔFtail) Update(δZ *zodb.EventCommit, zhead *xzodb.ZConn) (_ ΔF
     // take zblk changes into account
     for _, oid := range δZ.Changev {
-        // XXX cache lock/unlock
-        obj := zhead.Cache().Get(oid)
-        if obj == nil {
-            //fmt.Printf("%s: not in cache\n", oid)
-            continue // nothing to do - see invariant
-        }
-
-        //fmt.Printf("%s: in cache (%s)\n", oid, typeOf(obj))
-
-        switch obj := obj.(type) {
-        case ZBlk: // ZBlk*
-            // z.infile locking: since we write-locked head.zheadMu
-            //  - no other fuse reads are running, and thus no one
-            //  is mutating z.infile. XXX recheck
-            z := obj.inΔFtail()
-            for file, blocks := range z.infile {
-                δfile, ok := δF.ByFile[file]
-                if !ok {
-                    δfile = &ΔFile{Rev: δF.Rev, Blocks: make(SetI64)}
-                    δF.ByFile[file] = δfile
-                }
-                δfile.Blocks.Update(blocks)
-            }
-
-            // XXX update z.infile according to btree changes
-
-        case *ZBigFile:
+        if δFtail.trackSetZFile.Has(oid) {
             // TODO check that .blksize and .blktab (it is only
             // persistent reference) do not change.

             return ΔF{}, fmt.Errorf("ZBigFile<%s> changed @%s", oid, δZ.Tid)
         }

-        // make sure obj won't be garbage-collected until we finish handling it.
-        runtime.KeepAlive(obj)
+        zt, ok := δFtail.trackSetZBlk[oid]
+        if !ok {
+            continue // not tracked
+        }
+
+        for foid, blocks := range zt.infile {
+            δfile, ok := δF.ByFile[foid]
+            if !ok {
+                δfile = &ΔFile{Rev: δF.Rev, Blocks: make(SetI64)}
+                δF.ByFile[foid] = δfile
+            }
+            δfile.Blocks.Update(blocks)
+        }
+
+        // XXX update zt.infile according to btree changes
     }

     δFtail.vδF = append(δFtail.vδF, δF)
...
@@ -257,21 +257,19 @@ package main
 // 2) head/bigfile/* of all bigfiles represent state as of zhead.At .
 // 3) for head/bigfile/* the following invariant is maintained:
 //
-//    #blk ∈ OS file cache  =>  ZBlk(#blk) + all BTree/Bucket that lead to it ∈ zhead live cache(%)
-//                              (ZBlk* in ghost state)
-//
-//    => all BTree/Bucket that lead to blk are tracked (XXX)
+//    #blk ∈ OS file cache  =>  all BTree/Bucket/ZBlk that lead to blk are tracked(%)
 //
-//    The invariant helps on invalidation: if we see a changed oid, and
-//    zhead.cache.lookup(oid) = ø -> we know we don't have to invalidate OS
-//    cache for any part of any file (even if oid relates to a file block - that
-//    block is not cached and will trigger ZODB load on file read).
+//    The invariant helps on invalidation: when δFtail (see below) sees a
+//    changed oid, it is guaranteed that if the change affects block that was
+//    ever provided to OS, δFtail will detect that this block has changed. XXX review
+//    And if oid relates to a file block but is not in δFtail's tracking set -
+//    we know that block is not cached and will trigger ZODB load on a future
+//    file read.
 //
-//    XXX explain why tracked
-//
-//    Currently we maintain this invariant by simply never evicting ZBlk/LOBTree/LOBucket
-//    objects from ZODB Connection cache. In the future we may want to try to
-//    synchronize to kernel freeing its pagecache pages.
+//    Currently we maintain this invariant by adding ZBlk/LOBTree/LOBucket
+//    objects to δFtail on every access, and never shrinking that tracking set.
+//    In the future we may want to try to synchronize to kernel freeing its
+//    pagecache pages.
 //
 // 4) when we receive an invalidation message from ZODB - we process it and
 //    propagate invalidations to OS file cache of head/bigfile/*:
@@ -301,6 +299,8 @@ package main
 //        Eager invalidation would require full scan - Trees _and_
 //        Buckets, which makes it prohibitively expensive - see (+).
 //
+// FIXME all ^^^ is outdated -> XXX δFtail
+//
 // 4.4) for all file/blk to invalidate we do:
 //
 //    - try to retrieve head/bigfile/file[blk] from OS file cache(*);
@@ -718,7 +718,7 @@ type blkPinState struct {
     err error
 }

-// -------- 3) Cache invariant --------
+// -------- ZODB cache control --------

 // zodbCacheControl implements zodb.LiveCacheControl to tune ZODB to never evict
 // LOBTree/LOBucket from live cache. We want to keep LOBTree/LOBucket always alive
@@ -726,34 +726,28 @@ type blkPinState struct {
 //
 // For the data itself - we put it to kernel pagecache and always deactivate
 // from ZODB right after that.
+//
+// See "3) for */head/data the following invariant is maintained..."
 type zodbCacheControl struct {}

 func (_ *zodbCacheControl) PCacheClassify(obj zodb.IPersistent) zodb.PCachePolicy {
     switch obj.(type) {
-    // ZBlk* should be in cache but without data
+    // don't let ZBlk*/ZData to pollute the cache
     case *ZBlk0:
-        return zodb.PCachePinObject | zodb.PCacheDropState
+        return zodb.PCacheDropObject | zodb.PCacheDropState
     case *ZBlk1:
-        return zodb.PCachePinObject | zodb.PCacheDropState
-
-    // ZBigFile btree index should be in cache with data
-    case *btree.LOBTree:
-        return zodb.PCachePinObject | zodb.PCacheKeepState
-    case *btree.LOBucket:
-        return zodb.PCachePinObject | zodb.PCacheKeepState
-
-    // don't let ZData to pollute the cache
+        return zodb.PCacheDropObject | zodb.PCacheDropState
     case *ZData:
         return zodb.PCacheDropObject | zodb.PCacheDropState

-    // for performance reason we also keep ZBigFile in cache.
+    // keep ZBigFile and its btree index in cache to speedup file data access.
     //
     // ZBigFile is top-level object that is used on every block load, and
     // it would be a waste to evict ZBigFile from cache.
     case *ZBigFile:
         return zodb.PCachePinObject | zodb.PCacheKeepState
+    case *btree.LOBTree:
+        return zodb.PCachePinObject | zodb.PCacheKeepState
+    case *btree.LOBucket:
+        return zodb.PCachePinObject | zodb.PCacheKeepState
     }

     return 0
@@ -960,6 +954,7 @@ retry:
         }
         file.size = size
+        // see "3) for */head/data the following invariant is maintained..."
         bfdir.δFtail.Track(zfile, -1, sizePath, nil)

         // XXX we can miss a change to file if δblk is not yet tracked
@@ -1503,6 +1498,7 @@ func (f *BigFile) readPinWatchers(ctx context.Context, blk int64, treepath []btr
     // update δFtail index  XXX -> move upper into readBlk ?
     // (δFtail is just for δZ -> δF invalidation handling and is needed without isolation protocol)
     // XXX ^^^ no - also need to query to send pins
+    // see "3) for */head/data the following invariant is maintained..."
     bfdir := f.head.bfdir
     δFtail := bfdir.δFtail
     bfdir.δFmu.Lock() // XXX locking correct? XXX -> better push down?
@@ -2206,6 +2202,7 @@ func (head *Head) bigopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err er
     // only head/ needs δFtail, f.δtail and watches.
     if head.rev == 0 {
+        // see "3) for */head/data the following invariant is maintained..."
         head.bfdir.δFmu.Lock() // XXX locking ok?
         head.bfdir.δFtail.Track(f.zfile, -1, sizePath, nil)
         head.bfdir.δFmu.Unlock()
@@ -2393,8 +2390,8 @@ func _main() (err error) {
     zhead, err := xzodb.ZOpen(ctx, zdb, &zodb.ConnOptions{
         At: at0,

-        // we need zhead.cache to be maintained across several transactions.
-        // see "3) for head/bigfile/* the following invariant is maintained ..."
+        // preserve zhead.cache across several transactions.
+        // see "ZODB cache control"
         NoPool: true,
     })
     if err != nil {
...