.

f38caef7 · Kirill Smelkov · 77ccb352 · f38caef7 · f38caef7
Commit f38caef7 authored Dec 25, 2018 by Kirill Smelkov
Hide whitespace changes
Inline Side-by-side

Showing with 26 additions and 31 deletions

wcfs/notes.txt wcfs/notes.txt +3 -3

wcfs/wcfs.go wcfs/wcfs.go +23 -28

No files found.
--- a/wcfs/notes.txt
+++ b/wcfs/notes.txt
@@ -6,8 +6,8 @@ This file contains notes additional to usage documentation and internal
 organization overview in wcfs.go .


-Invalidations to wcfs clients are delayed until they read
-=========================================================
+Invalidations to wcfs clients are delayed until block access
+============================================================

 Initially it was planned that wcfs would send invalidation messages to its
 clients right after receiving invalidation message from ZODB at transaction
@@ -18,7 +18,7 @@ Emitting whole δR right at transaction-boundary time requires to keep whole
 ZBigFile.blktab index in RAM. Even though from space point of view it is
 somewhat acceptable (~ 0.01% of whole-file data size, i.e. ~ 128MB of index for
 ~ 1TB of data), it is not good from time overhead point of view - initial open
-of a file this way would be potentially very slow.
+of a file this way would be potentially slow.

 -> we took the approach where we invalidate a block lazily only when it is
 actually accessed.

--- a/wcfs/wcfs.go
+++ b/wcfs/wcfs.go
@@ -238,11 +238,11 @@ package main
 //
 // Wcfs is a ZODB client that translates ZODB objects into OS files as would
 // non-wcfs wendelin.core do for a ZBigFile. Contrary to non-wcfs wendelin.core,
-// it keeps bigfile data in shared cache efficiently. It is organized as follows:
+// it keeps bigfile data in shared OS cache efficiently. It is organized as follows:
 //
 // 1) 1 ZODB connection for "latest data" for whole filesystem (zhead).
-// 2) head/data of all bigfiles represent state as of zhead.At .
-// 3) for */head/data the following invariant is maintained:
+// 2) head/bigfile/* of all bigfiles represent state as of zhead.At .
+// 3) for head/bigfile/* the following invariant is maintained:
 //
 //	#blk ∈ file cache    =>    ZBlk(#blk) + all BTree/Bucket that lead to it  ∈ zhead cache
 //	                           (ZBlk* in ghost state)
@@ -258,7 +258,7 @@ package main
 //    try to synchronize to kernel freeing its pagecache pages.
 //
 // 4) when we receive an invalidation message from ZODB - we process it and
-//    propagate invalidations to OS file cache of */head/data:
+//    propagate invalidations to OS file cache of head/bigfile/*:
 //
 //	invalidation message: (tid↑, []oid)
 //
@@ -277,22 +277,23 @@ package main
 //
 //    4.4) for all file/blk to invalidate we do:
 //
-//	- try to retrieve file/head/data[blk] from OS file cache;
+//	- try to retrieve head/bigfile/file[blk] from OS file cache;
 //	- if retrieved successfully -> store retrieved data back into OS file
-//	  cache for file/@<rev>/data[blk], where
+//	  cache for @<rev>/bigfile/file[blk], where
 //
 //	    rev = max(δFtail.by(#blk)) || min(rev ∈ δFtail) || zhead.at	; see below about δFtail
 //
-//	- invalidate file/head/data[blk] in OS file cache.
+//	- invalidate head/bigfile/file[blk] in OS file cache.
 //
 //	This preserves previous data in OS file cache in case it will be needed
-//	by not-yet-uptodate clients, and makes sure file read of head/data[blk]
+//	by not-yet-uptodate clients, and makes sure file read of head/bigfile/file[blk]
 //	won't be served from OS file cache and instead will trigger a FUSE read
 //	request to wcfs.
 //
 //    4.5) no invalidation messages are sent to wcfs clients at this point(*).
 //
-//    XXX processing ZODB invalidations and serving reads are mutually exclusive.
+//    4.6) processing ZODB invalidations and serving file reads (see 7) are
+//      organized to be mutually exclusive.
 //
 // 5) after OS file cache was invalidated, we resync zhead to new database
 //    view corresponding to tid.
@@ -305,12 +306,15 @@ package main
 //    δFtail.tail describes invalidations to file we learned from ZODB invalidation.
 //    δFtail.by   allows to quickly lookup information by #blk.
 //
-//    min(rev) in δFtail is min(@at) at which head/data is currently mmapped (see below).
-//    XXX min(10 minutes) of history to support initial openings
+//    min(rev) in δFtail is min(@at) at which head/bigfile/file is currently mmapped (see below).
 //
-// 7) when we receive a FUSE read(#blk) request to a file/head/data we process it as follows:
+//    to support initial openings with @at being slightly in the past, we also
+//    make sure that min(rev) is old enough to cover the last 10 minutes of
+//    history from head/at.
 //
-//   7.1) load blkdata for head/data[blk] @zhead.at .
+// 7) when we receive a FUSE read(#blk) request to a head/bigfile/file we process it as follows:
+//
+//   7.1) load blkdata for head/bigfile/file[blk] @zhead.at .
 //
 //	while loading this also gives upper bound estimate of when the block
 //	was last changed:
@@ -334,13 +338,13 @@ package main
 //	  rev(blk) ≤ rev'(blk)		rev'(blk) = min(^^^)
 //
 //
-//   7.2) for all client@at mmappings of file/head/data:
+//   7.2) for all client@at mmappings of head/bigfile/file:
 //
 //	- rev'(blk) ≤ at: -> do nothing
 //	- rev'(blk) > at:
 //	  - if blk ∈ mmapping.pinned -> do nothing
 //	  - rev = max(δFtail.by(#blk) : _ ≤ at)	|| min(rev ∈ δFtail : rev ≤ at)	|| at
-//	  - client.remmap(file, #blk, @rev/data)
+//	  - client.remmap(file, #blk, @rev/bigfile/file)
 //	  - mmapping.pinned += blk
 //
 //	remmapping is done via "invalidation protocol" exchange with client.
@@ -348,7 +352,7 @@ package main
 //	  wcfs-trusted code via ptrace that wcfs injects into clients, but ptrace
 //	  won't work when client thread is blocked under pagefault or syscall(~) )
 //
-//	in order to support remmapping for each file/head/data
+//	in order to support remmapping for each head/bigfile/file
 //
 //	  [] of mmapping{client@at↑, pinned}
 //
@@ -360,15 +364,14 @@ package main
 //   and a client that wants @rev data will get @rev data, even if it was this
 //   "old" client that triggered the pagefault(+).
 //
-// (*) see "Invalidations to wcfs clients are delayed until they read" in notes.txt
+// (*) see "Invalidations to wcfs clients are delayed until block access" in notes.txt
 // (+) see "Changing mmapping while under pagefault is possible" in notes.txt
 // (~) see "Client cannot be ptraced while under pagefault" in notes.txt
 //
 //
-// XXX mmap(@at) open
-//
 // XXX 8) serving read from @<rev>/bigfile/file + zconn(s) for historical state
 //
+// XXX For every ZODB connection a dedicated read-only transaction is maintained.
 //
 // XXX(integrate place=?) ZData - no need to keep track -> ZBlk1 is always
 // marked as changed on blk data change.
@@ -419,20 +422,12 @@ package main
 //
 // δ(BTree) in wcfs context:
 //
-// . -k(blk) -> invalidata #blk
+// . -k(blk) -> invalidate #blk
 // . +k(blk) -> invalidate #blk (e.g. if blk was previously read as hole)
 //
 //
 // ----------------------------------------
 //
-// - XXX(kill) 1 ZODB connection per 1 bigfile (each bigfile can be at its different @at,
-//   because invalidations for different bigfiles can be processed with different
-//   timings depending on clients). No harm here as different bigfiles use
-//   completely different ZODB BTree and data objects.
-//
-//   For every ZODB connection a dedicated read-only transaction is maintained.
-//
-//
 // Notes on OS pagecache control:
 //
 // the cache of snapshotted bigfile can be pre-made hot, if invalidated region