Kirill Smelkov / wendelin.core

Commit 06ed10ee authored Oct 19, 2018 by Kirill Smelkov

.

parent 9b4a42a3

Showing 1 changed file with 115 additions and 37 deletions

wcfs/wcfs.go  +115  -37
@@ -221,70 +221,97 @@
 // data directly into the file.
 package main
-// wcfs organization
+// Wcfs organization
 //
-// * 1 ZODB connection for "latest data" for whole filesystem (zconn).
-//
-// * XXX read-only transaction for head data.
-//
-// * data/head of all bigfiles represent state as of zconn.at .
-//
-// * the following invariant is maintained:
+// Wcfs is a ZODB client that translates ZODB objects into OS files as would
+// non-wcfs wendelin.core do for a ZBigFile. It is organized as follows:
+//
+// - 1 ZODB connection for "latest data" for whole filesystem (zconn).
+// - head/data of all bigfiles represent state as of zconn.At .
+// - for */head/data the following invariant is maintained:
 //
 //   #blk ∈ file cache  =>  ZBlk(#blk) + all BTree/Bucket that lead to it ∈ zconn cache
 //                          (ZBlk* in ghost state)
 //
-// The invariant helps on invalidations: if we see a changed oid, and
+// The invariant helps on invalidation: if we see a changed oid, and
 // zconn.cache.lookup(oid) = ø -> we know we don't have to invalidate OS
-// cache for any part of any file (even if oid relates to a file block - it
-// is uncached and will trigger ZODB load on file read).
+// cache for any part of any file (even if oid relates to a file block - that
+// block is not cached and will trigger ZODB load on file read).
 //
-// Currently we maintain this invariant by simply never evicting LOBTree/LOBucket
-// objects from ZODB Connection cache (LOBucket keeps references to ZBlk* and
-// so ZBlk* also stay in cache in ghost form). In the future we may want to
-// try to synchronize to kernel freeing its pagecache pages.
+// Currently we maintain this invariant by simply never evicting ZBlk* and
+// LOBTree/LOBucket objects from ZODB Connection cache. In the future we may
+// want to try to synchronize to kernel freeing its pagecache pages.
 //
-// * when we receive an invalidation message from zstor - we process it and
-//   propagate invalidations to OS file cache:
+// - when we receive an invalidation message from zstor - we process it and
+//   propagate invalidations to OS file cache of */head/data:
 //
 //   invalidation message: (tid↑, []oid)
 //
-//   - zconn.cache.lookup(oid)
-//   - ø: nothing to do - see invariant ^^^.
-//   - obj found:
+//   1. zconn.cache.lookup(oid)
+//   2. ø: nothing to do - see invariant ^^^.
+//   3. obj found:
 //
 //      - ZBlk*        -> file/#blk
-//      - BTree/Bucket -> file/δ(BTree) -> file/[]#blk
+//      - BTree/Bucket -> δ(BTree) -> file/[]#blk
 //
-//   in the end for all found objects we have
+//   in the end after processing all []oid from invalidation message we have
 //
 //     [] of file/[]#blk
 //
 //   that describes which file(s) parts needs to be invalidated.
 //
-//   - for all file/blk to invalidate we do:
+//   4. for all file/blk to invalidate we do:
 //
-//      - try to retrieve file's head/data[blk] from OS file cache;
+//      - try to retrieve file/head/data[blk] from OS file cache;
 //      - if retrieved successfully -> store retrieved data into OS file cache
-//        for @<rev>/data[blk];
-//      - invalidate head/data[blk] in OS file cache.
+//        for file/@<rev>/data[blk];   XXX @rev = what? (ideally exact previous rev of blk)
+//      - invalidate file/head/data[blk] in OS file cache.
 //
-//      This preserves @<rev> data in OS file cache in case it will be needed,
-//      and makes sure file read of head/data[blk] won't be served from OS file
-//      cache and will trigger a FUSE read request to wcfs.
+//      This preserves previous data in OS file cache in case it will be needed
+//      by not-yet-uptodate clients, and makes sure file read of head/data[blk]
+//      won't be served from OS file cache and instead will trigger a FUSE read
+//      request to wcfs.
 //
-// * XXX δZ tail of invalidation info is maintained.
+// - XXX δZtail of invalidation info is maintained.
+//
+//   - tail of [](tid↑, []oid)
+//   - {} oid -> []tid↑ in tail
+//
+//   min(tid) in δZtail is min(@at) at which */head/data is currently mmapped.
 //
-// * when we receive a FUSE read(#blk) request to a file's head/data we process it as follows:
+// - when we receive a FUSE read(#blk) request to a file/head/data we process it as follows:
 //
-//   - first for all clients that have file's head/data mmaped with older @rev:
-//
-//     client.remmap(blk, @rev)
-//
-//     remmapping is done synchronously via ptrace.
-//     XXX via running wcfs-trusted code wcfs injects into clients.
-//
-//   XXX δZ is consulted to find out which client needs such update.
-//   XXX table of which blocks were already remmaped.
-//
-//   - load blkdata for head/data[blk] @zconn.at and return it to kernel.
+//   1. load blkdata for head/data[blk] @zconn.at .
+//      this also gives upper bound estimate of when the block was last changed:
+//
+//      rev(blk) ≤ max(_.serial for _ in (ZBlk(#blk), all BTree/Bucket that lead to ZBlk))
+//
+//      XXX it is not exact because BTree/Bucket can change (e.g. rebalance)
+//      but still point to the same k->ZBlk.
+//      XXX if we maintain δBTree tail we can maybe get rev(blk) as exact?
+//
+//   2. for all client/addr@at mmappings of file/head/data:
+//
+//      - rev(blk) ≤ at: -> do nothing
+//      - rev(blk) > at:
+//        - if blk ∉ mmapping.pinned -> do nothing
+//        - client.remmap(addr[blk], file/@at/data)   XXX @at -> @revprev(blk) better?
+//                                                    XXX @at -> @prevrev(file) even more better?
+//        - mmapping.pinned += blk
+//
+//      remmapping is done synchronously via ptrace.
+//      XXX via running wcfs-trusted code wcfs injects into clients.
+//
+//      in order to support remmapping for each file/head/data
+//
+//        [] of mmapping{client/addr/@at↑, pinned}
+//
+//      is maintained.
+//
+//      XXX δZ is consulted to find out which client needs such update?
+//
+//   3. blkdata is returned to kernel.
 //
 // Thus a client that wants latest data on pagefault will get latest data,
 // and a client that wants @rev data will get @rev data, even if it was this
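As a reading aid for the numbered invalidation steps in the added text above, the flow can be sketched in Go roughly as follows. This is an illustration only; processInvalidation, cacheLookup, δBTree, invalidateBlk and the zBlk/zTreeNode types are hypothetical stand-ins, not code from this commit.

package invsketch

// Hypothetical stand-ins for the objects the comment talks about; the real
// wcfs/ZODB types (ZBlk*, LOBTree/LOBucket, zconn live cache) are not modeled here.
type Oid uint64
type Tid uint64

type zObject interface{}                   // whatever zconn.cache.lookup returns
type zBlk struct{ file string; blk int64 } // ZBlk*        -> file/#blk
type zTreeNode struct{ file string }       // BTree/Bucket -> δ(BTree) -> file/[]#blk

type invalidationMsg struct {
	tid  Tid   // tid↑
	oids []Oid // []oid
}

// processInvalidation mirrors steps 1-4: look every oid up in the zconn live
// cache, skip misses (the */head/data invariant guarantees nothing is cached
// for them), translate hits to file/[]#blk, and finally invalidate those
// blocks in the OS cache of file/head/data.
func processInvalidation(
	msg invalidationMsg,
	cacheLookup func(Oid) zObject,               // 1. zconn.cache.lookup(oid)
	δBTree func(*zTreeNode, Tid) []int64,        //    δ(BTree) for changed tree nodes
	invalidateBlk func(file string, blk int64),  // 4. per file/blk invalidation
) {
	toInvalidate := map[string]map[int64]struct{}{} // [] of file/[]#blk

	add := func(file string, blk int64) {
		if toInvalidate[file] == nil {
			toInvalidate[file] = map[int64]struct{}{}
		}
		toInvalidate[file][blk] = struct{}{}
	}

	for _, oid := range msg.oids {
		switch obj := cacheLookup(oid).(type) {
		case nil:
			// 2. ø: not in zconn cache -> by the invariant it is not in the
			//    OS file cache either, nothing to do.
		case *zBlk:
			// 3. ZBlk* -> file/#blk
			add(obj.file, obj.blk)
		case *zTreeNode:
			// 3. BTree/Bucket -> δ(BTree) -> file/[]#blk
			for _, blk := range δBTree(obj, msg.tid) {
				add(obj.file, blk)
			}
		}
	}

	// 4. for all file/blk to invalidate: save current head/data[blk] under
	//    file/@<rev>/data[blk] and drop it from the OS cache of head/data
	//    (both delegated to invalidateBlk in this sketch).
	for file, blks := range toInvalidate {
		for blk := range blks {
			invalidateBlk(file, blk)
		}
	}
}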
@@ -293,9 +320,58 @@ package main
 //
 //
 //
+// δ(BTree) notes
+//
+//
+// input: BTree, (@new, []oid) -> find out δ(BTree) i.e. {-k(v), +k'(v'), ...}
+//
+// - oid ∈ Bucket
+// - oid ∈ BTree
+//
+// Bucket:
+//
+//   old = {k -> v}
+//   new = {k' -> v'}
+//
+//   Δ = -k(v), +k(v), ...
+//
+//   => for all buckets
+//
+//   Δ accumulates to []δk(v)[n+,n-]  n+ ∈ {0,1}, n- ∈ {0,1}, if n+=n- - cancel
+//
+//
+// BTree:
+//
+//   old = {k -> B} or {k -> T}
+//   new = {k' -> B'} or {k' -> T'}
+//
+//   Δ = -k(B), +k(B), -k(T), +K(T), ...
+//
+//   we translate (in top-down order):
+//
+//     k(B) -> {} of k(v)
+//     k(T) -> {} of k(B) -> {} of k(v)
+//
+//   which gives
+//
+//     Δ = k(v), +k(v), ...
+//
+//   i.e. exactly as for buckets and it accumulates to global Δ.
+//
+//   The globally-accumulated Δ is the answer for δ(BTree, (@new, []oid))
+//
+//   XXX -> internal/btreediff ?
+//
+// δ(BTree) in wcfs context:
+//
+//   . -k(blk) -> invalidata #blk
+//   . +k(blk) -> ignore (no need to invalidate)
+//
+//
+//
 //
 // XXX zconn(s) for historical state
 // XXX serving read from @<rev>/data
 //
 //
 //
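To make the bucket-level part of these notes concrete, here is a small Go sketch of diffing two bucket states {k -> v} and accumulating ±k(v) entries into a global Δ, with cancellation when the same entry shows up with both signs. Names and types (δEntry, diffBucket, accumulate) are made up for illustration; this is not the internal/btreediff mentioned above.

package btreesketch

// δEntry is one element of Δ: -k(v) when add is false, +k(v) when add is true.
type δEntry struct {
	add bool
	k   int64
	v   string
}

// diffBucket computes Δ(old, new) for one bucket: -k(v) for keys that vanished
// or changed, +k(v) for keys that appeared or changed.
func diffBucket(bOld, bNew map[int64]string) []δEntry {
	var δ []δEntry
	for k, v := range bOld {
		if v2, ok := bNew[k]; !ok || v2 != v {
			δ = append(δ, δEntry{add: false, k: k, v: v})
		}
	}
	for k, v := range bNew {
		if v1, ok := bOld[k]; !ok || v1 != v {
			δ = append(δ, δEntry{add: true, k: k, v: v})
		}
	}
	return δ
}

type δKey struct {
	k int64
	v string
}

// accumulate folds a per-bucket Δ into the global one; when the same k(v)
// appears with both + and - the entries cancel (the n+ = n- case above).
func accumulate(global map[δKey]int, δ []δEntry) {
	for _, e := range δ {
		sign := -1
		if e.add {
			sign = +1
		}
		key := δKey{k: e.k, v: e.v}
		global[key] += sign
		if global[key] == 0 {
			delete(global, key)
		}
	}
}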
@@ -841,13 +917,15 @@ func (bf *BigFile) readAt() []byte {
-// zodbCacheControl implements LiveCacheControl to tune ZODB to never evict
+// zodbCacheControl implements zodb.LiveCacheControl to tune ZODB to never evict
 // LOBTree/LOBucket from live cache. We want to keep LOBTree/LOBucket always alive
 // becuse it is essentially the index where to find ZBigFile data.
 //
 // For the data itself - we put it to kernel pagecache and always deactivate
 // from ZODB right after that.
 //
+// See "*/head/data invariant" in "wcfs organization" overview.
+//
 // TODO set it to Connection.CacheControl
 type zodbCacheControl struct {}
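For illustration, the policy this comment describes, keeping the LOBTree/LOBucket index always alive in the live cache while the data itself lives in the kernel pagecache, could be expressed per object roughly as below. The interface and flag names here (CacheClassify, CacheKeepObject, CacheDropState) are assumptions made for the sketch; they are not the actual zodb.LiveCacheControl API, which has changed over time.

package cachesketch

// Assumed stand-ins; the real zodb.IPersistent / LiveCacheControl types differ.
type IPersistent interface{}
type CachePolicy int

const (
	CacheKeepObject CachePolicy = 1 << iota // never evict the object from live cache
	CacheDropState                          // do not retain object state in live cache
)

// Placeholders for the BTree/data classes the comment refers to.
type LOBTree struct{}
type LOBucket struct{}
type ZBlk struct{}

type zodbCacheControl struct{}

// CacheClassify is a hypothetical per-object hook that a Connection.CacheControl
// could call to decide caching policy, mirroring the intent of the comment.
func (cc *zodbCacheControl) CacheClassify(obj IPersistent) CachePolicy {
	switch obj.(type) {
	case *LOBTree, *LOBucket:
		// the index where to find ZBigFile data - keep it always alive.
		return CacheKeepObject
	case *ZBlk:
		// file data goes to the kernel pagecache; deactivate it from ZODB
		// right after it is used.
		return CacheDropState
	}
	return 0
}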