.

7829816b · Kirill Smelkov · 46329e84 · 7829816b · 7829816b
Commit 7829816b authored Oct 21, 2021 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 46 additions and 54 deletions

wcfs/notes.txt wcfs/notes.txt +11 -6

wcfs/wcfs.go wcfs/wcfs.go +35 -48

No files found.
--- a/wcfs/notes.txt
+++ b/wcfs/notes.txt
@@ -57,10 +57,13 @@ Trees _and_ Buckets nodes - would be required.
 -> we took the approach where we send invalidation to client about a block
 lazily only when the block is actually accessed.

-XXX building δFtail lazily along serving fuse reads during scope of one         XXX kill - fixed
-transaction is not trivial and creates concurrency bottlenecks if simple
+
+Rejected alternative:
+
+Building δFtail lazily while serving FUSE reads within the scope of one
+transaction is not trivial and would create concurrency bottlenecks if simple
 locking scheme is used. With the main difficulty being to populate tracking set
-of δBtree lazily. However as the first approach we can still build complete
+of δBtree lazily. However as the first approach we could still build complete
 tracking set for a BTree at the time of file open: we need to scan through all
 trees but _not_ buckets: this way we'll know oid of all tree nodes: trees _and_
 buckets, while avoiding loading buckets makes this approach practical: with
@@ -69,9 +72,11 @@ require ~ 20 trees to cover 1TB of data. And we can scan those trees very
 quickly even if doing so serially. For 1PB of data it will require to scan ~
 10⁴ trees. If RTT to load 1 object is ~1ms this will become 10 seconds if done
 serially. However if we load all those tree objects in parallel it will be
-much less. Still the number of trees to scan is linear to the amount of data
-and it would be good to address the shortcoming of doing whole file index scan
-later.
+much less. Still the number of trees to scan is linear in the amount of data.
+
+-> rejected: ΔFtail and ΔBtail were instead fixed to allow several Track and
+query requests to run in parallel. See "Concurrency" section in ΔFtail/ΔBtail
+organization overview.


 Changing mmapping while under pagefault is possible

--- a/wcfs/wcfs.go
+++ b/wcfs/wcfs.go
@@ -274,42 +274,26 @@ package main
 // 4) when we receive an invalidation message from ZODB - we process it and
 //    propagate invalidations to OS file cache of head/bigfile/*:
 //
-//	invalidation message: (tid↑, []oid)
+//	invalidation message:  δZ = (tid↑, []oid)
 //
-//    4.1) zhead.cache.lookup(oid)			XXX -> δFtail
-//    4.2) ø: nothing to do - see invariant ^^^.
-//    4.3) obj found:
+//    4.1) δF = δFtail.Update(δZ)
 //
-//	- ZBlk*		-> [] of file/[]#blk
-//	- BTree/Bucket	-> δ(BTree)  -> file/[]#blk
+//	δFtail (see below) converts ZODB-level changes into information about
+//	which blocks of which files were modified and need to be invalidated:
 //
-//	in the end after processing all []oid from invalidation message we have
+//	  δF = (tid↑, {} file -> []#blk)
 //
-//	  [] of file/[]#blk
+//	Note that δF might not be complete: it reflects only changes to files and
+//	blocks that were requested to be tracked. However, because of the invariant,
+//	δF covers in full what needs to be invalidated in the OS file cache.
 //
-//	that describes which file(s) parts needs to be invalidated.
-//
-//	FIXME no - we can build it but not in full - since we consider only zobj in live cache.
-//	FIXME and even if we consider all δ'ed zobj, building complete set of
-//	      file.δtail requires to first do complete scan of file.blktab
-//	      which is prohibitively expensive.
-//	XXX -> we'll do the scan, but only Trees _without_ Buckets. This
-//	       makes the scan practical until 1PB while helping to build
-//	       initial tracking set for δFtail.
-//	       Eager invalidation would require full scan - Trees _and_
-//	       Buckets, which makes it prohibitively expensive - see (+).
-//
-//	FIXME all ^^^ is outdated ->  XXX δFtail
-//
-//    4.4) for all file/blk to invalidate we do:
+//    4.2) for all file/blk to invalidate we do:
 //
 //	- try to retrieve head/bigfile/file[blk] from OS file cache(*);
 //	- if retrieved successfully -> store retrieved data back into OS file
 //	  cache for @<rev>/bigfile/file[blk], where
 //
-//	    # see below about file.δtail
-//	    # XXX -> δFtail.BlkRevAt(file, #blk, zhead.at)
-//	    rev = max(file.δtail.by(#blk)) || min(rev ∈ file.δtail) || zhead.at
+//	    rev = δFtail.BlkRevAt(file, #blk, zhead.at)
 //
 //	- invalidate head/bigfile/file[blk] in OS file cache.
 //
@@ -318,30 +302,38 @@ package main
 //	won't be served from OS file cache and instead will trigger a FUSE read
 //	request to wcfs.
 //
-//    4.5) no invalidation messages are sent to wcfs clients at this point(+).
+//    4.3) no invalidation messages are sent to wcfs clients at this point(+).
 //
-//    4.6) processing ZODB invalidations and serving file reads (see 7) are
+//    4.4) processing ZODB invalidations and serving file reads (see 7) are
 //      organized to be mutually exclusive.
 //
 // 5) after OS file cache was invalidated, we resync zhead to new database
 //    view corresponding to tid.
 //
-// 6) for every file δtail invalidation info about head/data is maintained:	XXX -> δFtail
+// 6) a ZBigFile-level history tail is maintained in δFtail.
+//
+//    δFtail translates ZODB object-level changes into information about which
+//    blocks of which ZBigFile were modified, and provides a service to query
+//    that information.
+//
+//    It semantically consists of
 //
-//	- tailv: [](rev↑, []#blk)
-//	- by:    {} #blk -> []rev↑ in tail
+//	[]δF
 //
-//    δtail.tail describes invalidations to file we learned from ZODB invalidation.
-//    δtail.by   allows to quickly lookup information by #blk.
+//    where δF represents a change in files space
 //
-//    min(rev) in δtail is min(@at) at which head/bigfile/file is currently watched (see below).
+//	δF:
+//		.rev↑
+//		{} file ->  {}blk
 //
-//    XXX δtail can miss ...
+//    min(rev) in δFtail is min(@at) at which head/bigfile/file is currently watched (see below).
 //
 //    to support initial openings with @at being slightly in the past, we also
-//    make sure that min(rev) is enough to cover last 10 minutes of history	XXX 10m -> 1m ?
+//    make sure that min(rev) is enough to cover the last 1 minute of history
 //    from head/at.
 //
+//    See ΔFtail documentation in internal/zdata/δftail.go for more details.
+//
 // 7) when we receive a FUSE read(#blk) request to a head/bigfile/file, we process it as follows:
 //
 //   7.1) load blkdata for head/bigfile/file[blk] @zhead.at .
@@ -354,20 +346,15 @@ package main
 //	it is not exact because BTree/Bucket can change (e.g. rebalance)
 //	but still point to the same k->ZBlk.
 //
-//	we also use file.δtail to find either exact blk revision:	XXX δFtail
-//
-//	  rev(blk) = max(file.δtail.by(#blk) -> []rev↑)
-//
-//	or another upper bound if #blk ∉ δtail:
-//
-//	  rev(blk) ≤ min(rev ∈ δtail)		; #blk ∉ δtail
+//	we also use δFtail to find either exact blk revision or another upper
+//	bound if file[blk] has no change during δFtail coverage:
 //
+//	  rev(blk) = δFtail.BlkRevAt(file, #blk, zhead.at)
 //
 //	below rev'(blk) is min(of the estimates found):
 //
 //	  rev(blk) ≤ rev'(blk)		rev'(blk) = min(^^^)
 //
-//
 //	Note: we delay recomputing δFtail.BlkRevAt(file, #blk, head) because
 //	using just cheap revmax estimate can frequently result in all watches
 //	being skipped.
@@ -377,7 +364,7 @@ package main
 //	- rev'(blk) ≤ at: -> do nothing
 //	- rev'(blk) > at:
 //	  - if blk ∈ watch.pinned -> do nothing
-//	  - rev = max(δtail.by(#blk) : _ ≤ at)	|| min(rev ∈ δtail : rev ≤ at)	|| at
+//	  - rev = δFtail.BlkRevAt(file, #blk, at)
 //	  - watch.pin(file, #blk, @rev)
 //	  - watch.pinned += blk
 //
@@ -820,7 +807,7 @@ func (root *Root) handleδZ(ctx context.Context, δZ *zodb.EventCommit) (err err
 	head := root.head

 	// while we are invalidating OS cache, make sure that nothing, that
-	// even reads /head/bigfile/*, is running (see 4.6).
+	// even reads /head/bigfile/*, is running (see 4.4).
 	//
 	// also make sure that cache uploaders we spawned (uploadBlk) are all
 	// paused, or else they could overwrite OS cache with stale data.
@@ -1032,7 +1019,7 @@ func (head *Head) zheadWait(ctx context.Context, at zodb.Tid) (err error) {

 // invalidateBlk invalidates 1 file block in kernel cache.
 //
-// see "4.4) for all file/blk to in invalidate we do"
+// see "4.2) for all file/blk to invalidate we do"
 // called with zheadMu wlocked.
 func (f *BigFile) invalidateBlk(ctx context.Context, blk int64) (err error) {
 	defer xerr.Contextf(&err, "%s: invalidate blk #%d:", f.path(), blk)
@@ -2243,7 +2230,7 @@ func (head *Head) bigfopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err e
 		loading: make(map[int64]*blkLoadState),
 	}

-	// only head/ needs δFtail, f.δtail and watches.
+	// only head/ needs δFtail and watches.
 	if head.rev == 0 {
 		// see "3) for */head/data the following invariant is maintained..."
 		head.bfdir.δFtail.Track(f.zfile, -1, sizePath, blkCov, nil)