X invalidation design draftly settled

9b4a42a3 · Kirill Smelkov · 69c94fbc · 9b4a42a3
Commit 9b4a42a3 authored Oct 18, 2018 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 83 additions and 2 deletions

wcfs/wcfs.go wcfs/wcfs.go +83 -2

No files found.
--- a/wcfs/wcfs.go
+++ b/wcfs/wcfs.go
@@ -95,6 +95,9 @@
 //
 // Invalidation protocol
 //
+// XXX invalidations will be done via ptrace because we need them to be
+// synchronous (see "wcfs organization")
+//
 // In order to support isolation wcfs implements invalidation protocol that
 // must be cooperatively followed by both wcfs and client.
 //
@@ -220,9 +223,85 @@ package main

 // wcfs organization
 //
-// TODO
+// * 1 ZODB connection for "latest data" for whole filesystem (zconn).
+//
+// * XXX read-only transaction for head data.
+//
+// * head/data of every bigfile represents state as of zconn.at .
+//
+// * the following invariant is maintained:
+//
+//	#blk ∈ file cache    =>    ZBlk(#blk) + all BTree/Bucket that lead to it  ∈ zconn cache
+//	                           (ZBlk* in ghost state)
+//
+//   The invariant helps with invalidations: if we see a changed oid, and
+//   zconn.cache.lookup(oid) = ø -> we know we don't have to invalidate OS
+//   cache for any part of any file (even if oid relates to a file block - it
+//   is uncached and will trigger ZODB load on file read).
+//
+//   Currently we maintain this invariant by simply never evicting ZBlk* and
+//   LOBTree/LOBucket objects from ZODB Connection cache. In the future we may
+//   want to try to synchronize to kernel freeing its pagecache pages.
+//
+// * when we receive an invalidation message from zstor - we process it and
+//   propagate invalidations to OS file cache:
+//
+//   - zconn.cache.lookup(oid)
+//   - ø: nothing to do - see invariant ^^^.
+//   - obj found:
+//
+//	- ZBlk*		-> file/#blk
+//	- BTree/Bucket	-> file/δ(BTree)  -> file/[]#blk
+//
+//     in the end for all found objects we have
+//
+//	  [] of file/[]#blk
+//
+//     that describes which parts of which file(s) need to be invalidated.
+//
+//   - for all file/blk to invalidate we do:
+//
+//	- try to retrieve file's head/data[blk] from OS file cache;
+//	- if retrieved successfully -> store retrieved data into OS file cache
+//	  for @<rev>/data[blk];
+//	- invalidate head/data[blk] in OS file cache.
+//
+//	This preserves @<rev> data in OS file cache in case it will be needed,
+//	and makes sure file read of head/data[blk] won't be served from OS file
+//	cache and will trigger a FUSE read request to wcfs.
+//
+// * XXX δZ tail of invalidation info is maintained.
 //
-// - 1 ZODB connection per 1 bigfile (each bigfile can be at its different @at,
+// * when we receive a FUSE read(#blk) request to a file's head/data we process it as follows:
+//
+//   - first for all clients that have file's head/data mmapped with older @rev:
+//
+//       client.remmap(blk, @rev)
+//
+//     remmapping is done synchronously via ptrace.
+//     XXX via running wcfs-trusted code wcfs injects into clients.
+//
+//     XXX δZ is consulted to find out which client needs such update.
+//     XXX table of which blocks were already remmapped.
+//
+//   - load blkdata for head/data[blk] @zconn.at and return it to kernel.
+//
+//   Thus a client that wants latest data on pagefault will get latest data,
+//   and a client that wants @rev data will get @rev data, even if it was this
+//   "old" client that triggered the pagefault.	XXX verify whether we can
+//   change a mapping while it is under pagefault.
+//
+//
+//
+//
+//
+// XXX zconn(s) for historical state
+//
+//
+//
+//
+//
+// - XXX(kill) 1 ZODB connection per 1 bigfile (each bigfile can be at its different @at,
 //   because invalidations for different bigfiles can be processed with different
 //   timings depending on clients). No harm here as different bigfiles use
 //   completely different ZODB BTree and data objects.
@@ -780,6 +859,8 @@ func (cc *zodbCacheControl) WantEvict(obj zodb.IPersistent) bool {
 	case *btree.LOBTree:
 	case *btree.LOBucket:

+	// ZBlk* are kept referenced by a LOBucket, so they don't go away from Connection.objtab
+
 	// XXX + ZBigFile ?
 	}