wcfs.go 37.5 KB
Newer Older
Kirill Smelkov's avatar
Kirill Smelkov committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
// Copyright (C) 2018  Nexedi SA and Contributors.
//                     Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
20
// Program wcfs provides filesystem server with file data backed by wendelin.core arrays.
Kirill Smelkov's avatar
Kirill Smelkov committed
21 22 23 24 25
//
// Intro
//
// Each wendelin.core array (ZBigArray) is actually a linear file (ZBigFile)
// and array metadata like dtype, shape and strides associated with it. This
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
26
// program exposes as files only ZBigFile data and leaves rest of
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
27
// array-specific handling to clients. Every ZBigFile is exposed as one separate
Kirill Smelkov's avatar
Kirill Smelkov committed
28 29 30
// file that represents whole ZBigFile's data.
//
// For a client, the primary way to access a bigfile should be to mmap
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
31
// head/bigfile/<bigfileX> which represents always latest bigfile data.
Kirill Smelkov's avatar
Kirill Smelkov committed
32 33 34 35 36 37 38 39 40 41 42 43 44
// Clients that want to get isolation guarantee should subscribe for
// invalidations and re-mmap invalidated regions to file with pinned bigfile revision for
// the duration of their transaction. See "Invalidation protocol" for details.
//
// In the usual situation when bigfiles are big, and there are O(1)/δt updates,
// there should be no need for any cache besides shared kernel cache of latest
// bigfile data.
//
//
// Filesystem organization
//
// Top-level structure of provided filesystem is as follows:
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
45
//	head/			; latest database view
Kirill Smelkov's avatar
Kirill Smelkov committed
46
//		...
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
47 48 49
//	@<rev1>/		; database view as of revision <rev1>
//		...
//	@<rev2>/
Kirill Smelkov's avatar
Kirill Smelkov committed
50
//		...
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
51
//	...
Kirill Smelkov's avatar
Kirill Smelkov committed
52
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
53
// where head/ represents latest data as stored in upstream ZODB, and
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
54
// @<revX>/ represents data as of database revision <revX>.
Kirill Smelkov's avatar
Kirill Smelkov committed
55 56 57
//
// head/ has the following structure:
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
58 59 60 61 62 63 64
//	head/
//		at			; data inside head/ is as of this ZODB transaction
//		watch			; channel for bigfile invalidations
//		bigfile/		; bigfiles' data
//			<oid(bigfile1)>
//			<oid(bigfile2)>
//			...
Kirill Smelkov's avatar
Kirill Smelkov committed
65
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
66 67 68 69 70 71
// where /bigfile/<bigfileX> represents latest bigfile data as stored in
// upstream ZODB. As there can be some lag receiving updates from the database,
// /at describes precisely ZODB state for which bigfile data is currently
// exposed. Whenever bigfile data is changed in upstream ZODB, information
// about the changes is first propagated to /watch, and only after that
// /bigfile/<bigfileX> is updated. See "Invalidation protocol" for details.
Kirill Smelkov's avatar
Kirill Smelkov committed
72
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
73
// @<revX>/ has the following structure:
Kirill Smelkov's avatar
Kirill Smelkov committed
74
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
75 76 77 78 79 80
//	@<revX>/
//		at
//		bigfile/		; bigfiles' data as of revision <revX>
//			<oid(bigfile1)>
//			<oid(bigfile2)>
//			...
Kirill Smelkov's avatar
Kirill Smelkov committed
81
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
82
// where /bigfile/<bigfileX> represents bigfile data as of revision <revX>.
Kirill Smelkov's avatar
Kirill Smelkov committed
83
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
84 85
// Unless accessed, {head,@<revX>}/bigfile/<bigfileX> are not automatically visible in
// wcfs filesystem. Similarly @<revX>/ should be explicitly created by client via mkdir.
Kirill Smelkov's avatar
Kirill Smelkov committed
86 87 88 89
//
//
// Invalidation protocol
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
90
// In order to support isolation, wcfs implements invalidation protocol that
Kirill Smelkov's avatar
Kirill Smelkov committed
91 92
// must be cooperatively followed by both wcfs and client.
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
93
// First, client mmaps latest bigfile, but does not access it
Kirill Smelkov's avatar
Kirill Smelkov committed
94
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
95
//	mmap(head/bigfile/<bigfileX>)
Kirill Smelkov's avatar
Kirill Smelkov committed
96
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
97 98
// Then client opens head/watch and tells wcfs through it for which ZODB state
// it wants to get bigfile's view.
Kirill Smelkov's avatar
Kirill Smelkov committed
99
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
100
//	C: 1 watch <bigfileX> @<at>
Kirill Smelkov's avatar
Kirill Smelkov committed
101
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
102 103
// The server then, after potentially sending initial pin messages (see below),
// reports either success or failure:
Kirill Smelkov's avatar
Kirill Smelkov committed
104
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
105 106
//	S: 1 ok
//	S: 1 error ...		; if <at> is too far away back from head/at
Kirill Smelkov's avatar
Kirill Smelkov committed
107
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
108 109 110 111 112
// The server sends "ok" reply only after head/at is ≥ requested <at>, and
// only after all initial pin messages are fully acknowledged by the client.
// The client can start to use mmapped data after it gets "ok".
// The server sends "error" reply if requested <at> is too far away back from
// head/at.
Kirill Smelkov's avatar
Kirill Smelkov committed
113
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
114 115 116
// Upon watch request, either initially, or after sending "ok", the server will be notifying the
// client about file blocks that client needs to pin in order to observe file's
// data as of <at> revision:
Kirill Smelkov's avatar
Kirill Smelkov committed
117
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
118 119 120 121 122 123
// The filesystem server itself receives information about changed data from
// ZODB server through regular ZODB invalidation channel (as it is ZODB client
// itself). Then, separately for each changed file block, before actually
// updating head/bigfile/<bigfileX> content, it notifies through head/watch to
// clients, that had requested it (separately to each client), about the
// changes:
Kirill Smelkov's avatar
Kirill Smelkov committed
124
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
125
//	S: 2 pin <bigfileX> #<blk> @<rev_max>
Kirill Smelkov's avatar
Kirill Smelkov committed
126
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
127 128
// and waits until all clients confirm that changed file block can be updated
// in global OS cache.
Kirill Smelkov's avatar
Kirill Smelkov committed
129
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
130
// The client in turn should now re-mmap the block requested to be pinned to bigfile@<rev_max>
Kirill Smelkov's avatar
Kirill Smelkov committed
131
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
132 133
//	# mmapped at address corresponding to #blk
//	mmap(@<rev_max>/bigfile/<bigfileX>, #blk, MAP_FIXED)
Kirill Smelkov's avatar
Kirill Smelkov committed
134 135 136
//
// and must send ack back to the server when it is done:
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152
//	C: 2 ack
//
// The server sends pin notifications only for file blocks, that are known to
// be potentially changed after client's <at>, and <rev_max> describes the
// upper bound for the block revision:
//
//	<at>	<  <rev_max>
//
// The server maintains short history tail of file changes to be able to
// support openings with <at> being slightly in the past compared to current
// head/at. The server might reject a watch request if <at> is too far away in
// the past from head/at. The client is advised to restart its transaction with
// more uptodate database view if it gets watch setup error.
//
// A later request from the client for the same <bigfileX> but with different
// <at>, overrides previous watch request for that file. A client can use "-"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
153
// instead of "@<at>" to stop watching a file.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
154 155 156 157 158
//
// A single client can send several watch requests through single head/watch
// open, as well as it can use several head/watch opens simultaneously.
// The server sends pin notifications for all files requested to be watched via
// every head/watch open.
Kirill Smelkov's avatar
Kirill Smelkov committed
159
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
160 161 162 163 164 165
// Note: a client could use a single watch to manage its several views for the same
// file but with different <at>. This could be achieved via watching with
// @<at_min>, and then deciding internally which views need to be adjusted and
// which views need not. Wcfs does not oblige clients to do so though, and a
// client is free to use as many head/watch openings as it needs to.
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
166
// When clients are done with @<revX>/bigfile/<bigfileX> (i.e. client's
Kirill Smelkov's avatar
Kirill Smelkov committed
167
// transaction ends and array is unmapped), the server sees number of opened
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
168 169
// files to @<revX>/bigfile/<bigfileX> drops to zero, and automatically
// destroys @<revX>/bigfile/<bigfileX> after reasonable timeout.
Kirill Smelkov's avatar
Kirill Smelkov committed
170 171 172 173
//
//
// Protection against slow or faulty clients
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
174 175 176 177
// If a client, on purpose or due to a bug or being stopped, is slow to respond
// with ack to file invalidation notification, it creates a problem because the
// server will become blocked waiting for pin acknowledgments, and thus all
// other clients, that try to work with the same file, will get stuck.
Kirill Smelkov's avatar
Kirill Smelkov committed
178
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
179 180
// The problem could be avoided, if wcfs would reside inside OS kernel and this
// way could be able to manipulate clients' address space directly (then
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
181 182 183 184
// invalidation protocol won't be needed). It is also possible to imagine
// mechanism, where wcfs would synchronously change clients' address space via
// injecting trusted code and running it on client side via ptrace to adjust
// file mappings.
Kirill Smelkov's avatar
Kirill Smelkov committed
185
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
186 187 188 189 190
// However ptrace does not work when client thread is blocked under pagefault,
// and that is exactly what wcfs would need to do to process invalidations
// lazily, because eager invalidation processing results in prohibitively slow
// file opens. See internal wcfs overview for details about why ptrace
// cannot be used and why lazy invalidation processing is required.
Kirill Smelkov's avatar
Kirill Smelkov committed
191
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
192 193 194
// Lacking OS primitives to change address space of another process and not
// being able to work it around with ptrace in userspace, wcfs takes approach
// to kill a slow client on 30 seconds timeout by default.
Kirill Smelkov's avatar
Kirill Smelkov committed
195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232
//
//
// Writes
//
// As each bigfile is represented by 1 synthetic file, there can be several
// write schemes:
//
// 1. mmap(MAP_PRIVATE) + writeout by client
//
// In this scheme bigfile data is mmapped in MAP_PRIVATE mode, so that local
// user changes are not automatically propagated back to the file. When there
// is a need to commit, client investigates via some OS mechanism, e.g.
// /proc/self/pagemap or something similar, which pages of this mapping it
// modified. Knowing this it knows which data it dirtied and so can write this
// data back to ZODB itself, without filesystem server providing write support.
//
// 2. mmap(MAP_SHARED, PROT_READ) + write-tracking & writeout by client
//
// In this scheme bigfile data is mmapped in MAP_SHARED mode with read-only pages
// protection. Then whenever write fault occurs, client allocates RAM from
// shmfs, copies faulted page to it, and then mmaps RAM page with RW protection
// in place of original bigfile page. Writeout implementation should be similar
// to "1", only here client already knows the pages it dirtied, and this way
// there is no need to consult /proc/self/pagemap.
//
// The advantage of this scheme over mmap(MAP_PRIVATE) is that in case
// there are several in-process mappings of the same bigfile with overlapping
// in-file ranges, changes in one mapping will be visible in another mapping.
// Contrary: whenever a MAP_PRIVATE mapping is modified, the kernel COWs
// faulted page into a page completely private to this mapping, so that other
// MAP_PRIVATE mappings of this file, including ones created from the same
// process, do not see changes made to the first mapping.
//
// Since wendelin.core needs to provide coherency in between different slices
// of the same array, this is the mode wendelin.core actually uses.
//
// 3. write to wcfs
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
233
// TODO we later could implement "write-directly" mode where clients would write
Kirill Smelkov's avatar
Kirill Smelkov committed
234
// data directly into the file.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
235
package main
Kirill Smelkov's avatar
Kirill Smelkov committed
236

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
237
// Wcfs organization
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
238
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
239
// Wcfs is a ZODB client that translates ZODB objects into OS files as would
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
240
// non-wcfs wendelin.core do for a ZBigFile. Contrary to non-wcfs wendelin.core,
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
241
// it keeps bigfile data in shared cache efficiently. It is organized as follows:
242
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
243 244
// 1) 1 ZODB connection for "latest data" for whole filesystem (zhead).
// 2) head/data of all bigfiles represent state as of zhead.At .
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
245
// 3) for */head/data the following invariant is maintained:
246
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
247
//	#blk ∈ file cache    =>    ZBlk(#blk) + all BTree/Bucket that lead to it  ∈ zhead cache
248 249
//	                           (ZBlk* in ghost state)
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
250
//    The invariant helps on invalidation: if we see a changed oid, and
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
251
//    zhead.cache.lookup(oid) = ø -> we know we don't have to invalidate OS
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
252 253
//    cache for any part of any file (even if oid relates to a file block - that
//    block is not cached and will trigger ZODB load on file read).
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
254
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
255 256 257 258
//    Currently we maintain this invariant by simply never evicting LOBTree/LOBucket
//    objects from ZODB Connection cache (LOBucket keeps references to ZBlk* and
//    so ZBlk* also stay in cache in ghost form). In the future we may want to
//    try to synchronize to kernel freeing its pagecache pages.
259
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
260
// 4) when we receive an invalidation message from ZODB - we process it and
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
261
//    propagate invalidations to OS file cache of */head/data:
262
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
263
//	invalidation message: (tid↑, []oid)
264
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
265
//    4.1) zhead.cache.lookup(oid)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
266 267
//    4.2) ø: nothing to do - see invariant ^^^.
//    4.3) obj found:
268 269
//
//	- ZBlk*		-> file/#blk
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
270
//	- BTree/Bucket	-> δ(BTree)  -> file/[]#blk
271
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
272
//	in the end after processing all []oid from invalidation message we have
273 274 275
//
//	  [] of file/[]#blk
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
276
//	that describes which file(s) parts need to be invalidated.
277
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
278
//    4.4) for all file/blk to invalidate we do:
279
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
280
//	- try to retrieve file/head/data[blk] from OS file cache;
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
281 282
//	- if retrieved successfully -> store retrieved data back into OS file
//	  cache for file/@<rev>/data[blk], where
Kirill Smelkov's avatar
Kirill Smelkov committed
283
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
284
//	    rev = max(δFtail.by(#blk)) || min(rev ∈ δFtail) || zhead.at	; see below about δFtail
Kirill Smelkov's avatar
Kirill Smelkov committed
285
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
286 287 288 289 290 291 292
//	- invalidate file/head/data[blk] in OS file cache.
//
//	This preserves previous data in OS file cache in case it will be needed
//	by not-yet-uptodate clients, and makes sure file read of head/data[blk]
//	won't be served from OS file cache and instead will trigger a FUSE read
//	request to wcfs.
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
293 294
//    4.5) no invalidation messages are sent to wcfs clients at this point(*).
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
295 296
//    XXX processing ZODB invalidations and serving reads are mutually exclusive.
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
297 298 299 300
// 5) after OS file cache was invalidated, we resync zhead to new database
//    view corresponding to tid.
//
// 6) for every file δFtail invalidation info about head/data is maintained:
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
301
//
Kirill Smelkov's avatar
Kirill Smelkov committed
302 303
//	- tailv: [](rev↑, []#blk)
//	- by:    {} #blk -> []rev↑ in tail
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
304
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
305 306
//    δFtail.tailv describes invalidations to file we learned from ZODB invalidation.
//    δFtail.by    allows quick lookup of information by #blk.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
307
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
308
//    min(rev) in δFtail is min(@at) at which head/data is currently mmapped (see below).
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
309
//    XXX min(10 minutes) of history to support initial openings
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
310
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
311
// 7) when we receive a FUSE read(#blk) request to a file/head/data we process it as follows:
312
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
313
//   7.1) load blkdata for head/data[blk] @zhead.at .
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
314 315 316
//
//	while loading this also gives upper bound estimate of when the block
//	was last changed:
317
//
Kirill Smelkov's avatar
Kirill Smelkov committed
318 319 320
//	  rev(blk) ≤ max(_.serial for _ in (ZBlk(#blk), all BTree/Bucket that lead to ZBlk))
//
//	it is not exact because BTree/Bucket can change (e.g. rebalance)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
321
//	but still point to the same k->ZBlk.
Kirill Smelkov's avatar
Kirill Smelkov committed
322 323 324 325 326 327 328
//
//	we also use file.δFtail to find either exact blk revision:
//
//	  rev(blk) = max(file.δFtail.by(#blk) -> []rev↑)
//
//	or another upper bound if #blk ∉ δFtail:
//
Kirill Smelkov's avatar
Kirill Smelkov committed
329
//	  rev(blk) ≤ min(rev ∈ δFtail)		; #blk ∉ δFtail
Kirill Smelkov's avatar
Kirill Smelkov committed
330 331
//
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
332
//	below rev'(blk) is min(of the estimates found):
Kirill Smelkov's avatar
Kirill Smelkov committed
333 334 335
//
//	  rev(blk) ≤ rev'(blk)		rev'(blk) = min(^^^)
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
336
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
337
//   7.2) for all client@at mmappings of file/head/data:
338
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
339
//	- rev'(blk) ≤ at: -> do nothing
Kirill Smelkov's avatar
Kirill Smelkov committed
340 341
//	- rev'(blk) > at:
//	  - if blk ∈ mmapping.pinned -> do nothing
Kirill Smelkov's avatar
Kirill Smelkov committed
342
//	  - rev = max(δFtail.by(#blk) : _ ≤ at)	|| min(rev ∈ δFtail : rev ≤ at)	|| at
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
343
//	  - client.remmap(file, #blk, @rev/data)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
344
//	  - mmapping.pinned += blk
345
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
346 347 348 349
//	remmapping is done via "invalidation protocol" exchange with client.
//	( one could imagine adjusting mappings synchronously via running
//	  wcfs-trusted code via ptrace that wcfs injects into clients, but ptrace
//	  won't work when client thread is blocked under pagefault or syscall(~) )
350
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
351
//	in order to support remmapping for each file/head/data
352
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
353
//	  [] of mmapping{client@at↑, pinned}
354
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
355 356
//	is maintained.
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
357
//   7.3) blkdata is returned to kernel.
358 359 360
//
//   Thus a client that wants latest data on pagefault will get latest data,
//   and a client that wants @rev data will get @rev data, even if it was this
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
361
//   "old" client that triggered the pagefault(+).
362
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
363 364 365
// (*) see "Invalidations to wcfs clients are delayed until they read" in notes.txt
// (+) see "Changing mmapping while under pagefault is possible" in notes.txt
// (~) see "Client cannot be ptraced while under pagefault" in notes.txt
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
366 367
//
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
368 369
// XXX mmap(@at) open
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
370
// XXX 8) serving read from @<rev>/data + zconn(s) for historical state
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
371
//
372
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
373
// XXX(integrate place=?) ZData - no need to keep track -> ZBlk1 is always
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
374 375 376
// marked as changed on blk data change.
//
// ----------------------------------------
377
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422
// δ(BTree) notes
//
//
// input: BTree, (@new, []oid)  ->  find out δ(BTree) i.e. {-k(v), +k'(v'), ...}
//
// - oid ∈ Bucket
// - oid ∈ BTree
//
// Bucket:
//
//	old = {k  -> v}
//	new = {k' -> v'}
//
//	Δ = -k(v), +k(v), ...
//
// => for all buckets
//
//	Δ accumulates to []δk(v)[n+,n-]  n+ ∈ {0,1}, n- ∈ {0,1}, if n+=n- - cancel
//
//
// BTree:
//
//	old = {k  -> B}   or {k  -> T}
//	new = {k' -> B'}  or {k' -> T'}
//
//	Δ = -k(B), +k(B), -k(T), +k(T), ...
//
// we translate (in top-down order):
//
//	k(B) -> {} of k(v)
//	k(T) -> {} of k(B) -> {} of k(v)
//
// which gives
//
//	Δ = -k(v), +k(v), ...
//
// i.e. exactly as for buckets and it accumulates to global Δ.
//
// The globally-accumulated Δ is the answer for δ(BTree, (@new, []oid))
//
// XXX -> internal/btreediff ?
//
// δ(BTree) in wcfs context:
//
// . -k(blk) -> invalidate #blk
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
423
// . +k(blk) -> invalidate #blk (e.g. if blk was previously read as hole)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
424 425
//
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
426
// ----------------------------------------
427 428
//
// - XXX(kill) 1 ZODB connection per 1 bigfile (each bigfile can be at its different @at,
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
429 430 431 432
//   because invalidations for different bigfiles can be processed with different
//   timings depending on clients). No harm here as different bigfiles use
//   completely different ZODB BTree and data objects.
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
433 434
//   For every ZODB connection a dedicated read-only transaction is maintained.
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
435
//
Kirill Smelkov's avatar
Kirill Smelkov committed
436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456
// Notes on OS pagecache control:
//
// the cache of snapshotted bigfile can be pre-made hot, if invalidated region
// was already in pagecache of head/data:
//
// - we can retrieve a region from pagecache of head/data with FUSE_NOTIFY_RETRIEVE.
// - we can store that retrieved data into pagecache region of @<tidX>/ with FUSE_NOTIFY_STORE.
// - we can invalidate a region from pagecache of head/data with FUSE_NOTIFY_INVAL_INODE.
//
// we have to disable FUSE_AUTO_INVAL_DATA to tell the kernel we are fully
// responsible for invalidating pagecache. If we don't, the kernel will be
// clearing whole cache of head/data on e.g. its mtime change.
//
// XXX disabling FUSE_AUTO_INVAL_DATA does not fully prevent kernel from automatically
// invalidating pagecache - e.g. it will invalidate whole cache on file size changes:
//
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/fuse/inode.c?id=e0bc833d10#n233
//
// we can currently work around it by using writeback mode (see !is_wb in the
// link above), but better we have proper FUSE flag for filesystem server to
// tell the kernel it is fully responsible for invalidating pagecache.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
457

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
458
import (
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
459
	"context"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
460
	"flag"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
461
	stdlog "log"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
462
	"os"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
463
	"strings"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
464 465
	"sync"
	"syscall"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
466

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
467
	log "github.com/golang/glog"
Kirill Smelkov's avatar
Kirill Smelkov committed
468 469
	"golang.org/x/sync/errgroup"

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
470
	"lab.nexedi.com/kirr/go123/xcontext"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
471
	"lab.nexedi.com/kirr/go123/xerr"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
472

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
473
	"lab.nexedi.com/kirr/neo/go/zodb"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
474
	"lab.nexedi.com/kirr/neo/go/zodb/btree"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
475
	_ "lab.nexedi.com/kirr/neo/go/zodb/wks"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
476 477 478

	"github.com/hanwen/go-fuse/fuse"
	"github.com/hanwen/go-fuse/fuse/nodefs"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
479
	"github.com/pkg/errors"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
480 481
)

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
482 483 484 485 486 487 488 489 490 491
// Root represents root of wcfs filesystem.
//
// Besides the embedded nodefs.Node it carries the ZODB-side state of the
// filesystem: the storage, the DB handle for it, the connection used for
// head/ and the table of connections used for @<rev>/ .
type Root struct {
	nodefs.Node

	// ZODB storage we work with
	zstor zodb.IStorage

	// ZODB DB handle for zstor.
	// keeps cache of connections for both head/ and @<rev>/ accesses.
	//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
492
	// only one connection is used for head/ and only one for each @<rev>.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
493 494 495
	zdb *zodb.DB

	// ZODB connection for head/
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
496 497
	zheadMu sync.RWMutex // protects access to zhead & live _objects_ associated with it
	zhead   *ZConn       // zwatcher resyncs zhead; others only read zhead objects.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
498 499 500 501 502 503

	// ZODB connections for @<rev>/
	// (zrevMu presumably guards zrevTab, and zrevTab is presumably keyed by
	// the <rev> of the corresponding @<rev>/ - TODO confirm)
	zrevMu  sync.Mutex
	zrevTab map[zodb.Tid]*ZConn
}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
504
// /bigfile/	- served by BigFileRoot.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
505 506 507
// BigFileRoot is the node type for /bigfile/ .
//
// It embeds nodefs.Node and keeps a table that maps a ZODB oid to the
// BigFileDir node of the corresponding <bigfileX>/ .
type BigFileRoot struct {
	nodefs.Node

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
508
	// {} oid -> <bigfileX>/
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
509
	mu  sync.Mutex // presumably guards tab - TODO confirm
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
510
	tab map[zodb.Oid]*BigFileDir
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
511
}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
512

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
513
// /bigfile/<bigfileX>/	- served by BigFileDir.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
514
// BigFileDir is the node type for /bigfile/<bigfileX>/ .
//
// It remembers the oid of the ZBigFile it stands for and keeps a table of
// @<rev>/ snapshots of that bigfile.
//
// NOTE(review): the package overview describes the layout as
// {head,@<revX>}/bigfile/<bigfileX>, while the comments here use
// /bigfile/<bigfileX>/(head|<rev>)/ - confirm which layout is current.
type BigFileDir struct {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
515
	nodefs.Node
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
516 517
	oid zodb.Oid // oid of ZBigFile

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
518
	// head/ is implicitly linked to by fs
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
519 520

	// {} rev -> @<rev>/ bigfile snapshot
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
521
	mu     sync.Mutex // presumably guards revTab - TODO confirm
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
522
	revTab map[zodb.Tid]*BigFileRev
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
523 524
}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
525 526
// /bigfile/<bigfileX>/(head|<rev>)/	- served by BigFileRev.
type BigFileRev struct {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
527
	nodefs.Node
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
528
	// data, at, invalidations, etc - all implicitly linked to by fs
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
529 530
}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
531
// /bigfile/<bigfileX>/(head|<rev>)/*	- internally served by BigFile.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
532
type BigFile struct {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
533
	// this BigFile views ZODB via zconn
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
534
	zconn	*ZConn
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
535

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
536
	// ZBigFile top-level object. Kept activated during lifetime of current transaction.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
537
	zbf	*ZBigFile
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
538

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
539
	// zbf.Size(). It is constant during liftime of current transaction.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
540 541
	zbfSize int64

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
542 543 544
	// change history of this file.
	δFtail *ΔTailI64 // [](rev, []#blk)

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
545
	// TODO -> δFtail
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
546
	// lastChange	zodb.Tid // last change to whole bigfile as of .zconn.At view
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
547 548
}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
549
// /bigfile/<bigfileX>/(head|<rev>)/data	- served by BigFileData.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
550 551 552 553
type BigFileData struct {
	nodefs.Node

	bigfile *BigFile
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
554

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
555
	// inflight loadings of ZBigFile from ZODB.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
556
	// successfull load results are kept here until blkdata is put into OS pagecache.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
557 558
	//
	// XXX -> BigFile ?
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
559
	loadMu  sync.Mutex
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
560
	loading map[int64]*blkLoadState // #blk -> {... blkdata}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
561 562 563

	// XXX mappings where client(s) requested isolation guarantee
	//mappings ...
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
564 565 566 567 568 569 570 571 572 573 574
}

// blkLoadState represents a ZBlk load state/result.
//
// when !ready the loading is in progress.
// when ready the loading has been completed.
//
// "ready" is signalled by closing the ready channel; blkdata and err are
// written by the loader before that close and must be read only after it.
type blkLoadState struct {
	ready chan struct{}

	blkdata []byte
	err     error
}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
577 578
// ----------------------------------------

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611
// zodbCacheControl implements zodb.LiveCacheControl to tune ZODB to never evict
// LOBTree/LOBucket from live cache. We want to keep LOBTree/LOBucket always alive
// because it is essentially the index where to find ZBigFile data.
//
// For the data itself - we put it to kernel pagecache and always deactivate
// from ZODB right after that.
//
// See "3) for */head/data the following invariant is maintained..."
type zodbCacheControl struct{}

// WantEvict reports whether obj may be evicted from the live cache.
//
// It returns false for *btree.LOBTree and *btree.LOBucket, and true for every
// other object type.
func (cc *zodbCacheControl) WantEvict(obj zodb.IPersistent) bool {
	switch obj.(type) {
	case *btree.LOBTree, *btree.LOBucket:
		// index nodes - never evict
		return false

	// ZBlk* are kept referenced by a LOBucket, so they don't go away from Connection.cache.objtab

	// we also keep ZBigFile alive because we want to make sure .blksize
	// and (p. ref) .blktab do not change.
	// XXX do we really need to keep ZBigFile alive for that?
	//case *ZBigFile:
	}

	return true
}

// zwatcher watches for ZODB changes.
// see "4) when we receive an invalidation message from ZODB ..."
func (r *Root) zwatcher(ctx context.Context) (err error) {
	defer xerr.Contextf(&err, "zwatch")	// XXX more in context?
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
612 613
	// XXX unmount on error? -> always EIO?

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
614
	zwatch := r.zstor.Watch() // XXX -> create before zwatcher is started
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
615 616

	for {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
617
		zevent, err := zwatch.Read(ctx)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
618 619 620 621
		if err != nil {
			return err
		}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
622
		r.zhandle1(zevent)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
623 624 625 626
	}
}

// zhandle1 handles 1 event from ZODB notification.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
627 628 629 630 631 632
// (called with .zheadMu wlocked)
func (r *Root) zhandle1(zevent zodb.WatchEvent) {
	// XXX locking correct? XXX too coarse? -> lock only around "resync .zhead ..." ?
	r.zheadMu.Lock()
	defer r.zheadMu.Unlock()

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
633 634
	//toinvalidate := map[*ZBigFile]SetI64{} // {} zfile -> set(#blk)
	toinvalidate := map[*BigFileData]SetI64{} // {} zfile -> set(#blk)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
635 636

	// zevent = (tid^, []oid)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
637 638
	for _, oid := range zevent.Changev {
		obj := r.zhead.Cache().Get(oid)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
639 640 641 642 643 644
		if obj == nil {
			continue // nothing to do - see invariant
		}

		switch obj := obj.(type) {
		default:
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
645
			continue // object not related to any bigfile
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
646

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
647
		case *btree.LOBTree:
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
648 649
			// XXX -> δBTree

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
650
		case *btree.LOBucket:
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
651 652
			// XXX -> δBTree

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
653 654 655 656 657 658 659 660 661 662
		case zBlk:	// ZBlk*
			// XXX locking ?
			for zfile, objWhere := range obj.inzfile {
				blkmap, ok := toinvalidate[zfile]
				if !ok {
					blkmap = SetI64{}
					toinvalidate[zfile] = blkmap
				}
				blkmap.Update(objWhere)
			}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
663

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
664 665 666 667 668
		case *ZBigFile:
			// XXX check that .blksize and .blktab (it is only
			// persistent reference) do not change.

			// XXX shutdown fs with ^^^ message.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
669 670 671
		}
	}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
672
	//wg = ...
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
673
	ctx := context.TODO()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
674 675
	for file, blkmap := range toinvalidate {
		for blk := range blkmap {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
676
			go file.invalidateBlk(ctx, blk)	// XXX -> wg.Go
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
677
		}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
678
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
679 680

	// XXX resync .zhead to zevent.tid
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
681 682 683
}

// invalidateBlk invalidates 1 file block.	XXX
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
684
// XXX see "4.4) for all file/blk to in invalidate we do"
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
685
func (f *BigFileData) invalidateBlk(ctx context.Context, blk int64) error {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
686
	fsconn := f.root().fsconn
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
687 688 689
	off := blk*blksize

	// try retrieve cache of current head/data[blk]
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
690 691 692 693 694 695 696
	//
	// if less than blksize was cached - probably the kernel had to evict
	// some data from its cache already. In such case we don't try to
	// preserve the rest and drop what was read, to avoid keeping the
	// system overloaded.
	//
	// XXX st != OK -> warn?
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
697
	blkdata, st := fsconn.FileRetrieveCache(f.Inode(), off, blksize)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
698 699 700
	if len(blkdata) == blksize {
		// XXX -> go
		// store retrieved data back to OS cache for file @<rev>/data[blk]
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
701
		frev, _ := f.bigfile.δFtail.LastRevOf(blk, at)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
702 703 704 705 706
		st = fsconn.FileNotifyStoreCache(frev.Inode(), off, blkdata)
		if st != fuse.OK {
			// XXX log	- dup wrt readBlk -> common func.
		}
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
707

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
708
	// invalidate file/head/data[blk] in OS file cache.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
709 710
	st = fsconn.FileNotify(f.Inode(), off, blksize)
	// XXX st != ok (fatal here)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
711 712

	panic("TODO")
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
713
}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
714

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
715 716
// ----------------------------------------

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
717 718 719
// /bigfile -> Mkdir receives client request to create /bigfile/<bigfileX>.
//
// It creates <bigfileX>/head/* along the way.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
720 721
func (bfroot *BigFileRoot) Mkdir(name string, mode uint32, fctx *fuse.Context) (*nodefs.Inode, fuse.Status) {
	inode, err := bfroot.mkdir(name, fctx) // XXX ok to ignore mode?
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742
	return inode, err2LogStatus(err)

}

func (bfroot *BigFileRoot) mkdir(name string, fctx *fuse.Context) (_ *nodefs.Inode, err error) {
	defer xerr.Contextf(&err, "/bigfile: mkdir %q", name)

	oid, err := zodb.ParseOid(name)
	if err != nil {
		return nil, eINVALf("not oid")
	}

	// check to see if dir(oid) is already there
	bfroot.mu.Lock()
	_, already := bfroot.tab[oid]
	bfroot.mu.Unlock()

	if already {
		return nil, syscall.EEXIST
	}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
743
	// not there - without bfroot lock proceed to open BigFile from ZODB
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
744
	bf, err := bigopen(asctx(fctx), groot.zhead, oid)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
745 746 747 748 749 750 751 752
	if err != nil {
		return nil, err
	}
	defer func() {
		if err != nil {
			bf.Close()
		}
	}()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
753

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
754 755 756 757 758 759
	// relock bfroot and either mkdir or EEXIST if the directory was maybe
	// simultanously created while we were not holding bfroot.mu
	bfroot.mu.Lock()
	_, already = bfroot.tab[oid]
	if already {
		bfroot.mu.Unlock()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
760
		return nil, syscall.EEXIST
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
761
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
762

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
763
	bfdir := &BigFileDir{
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
764 765 766
		Node:   nodefs.NewDefaultNode(),
		oid:    oid,
		revTab: make(map[zodb.Tid]*BigFileRev),
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
767
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
768

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
769
	bfhead := &BigFileRev{
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
770 771 772
		Node: nodefs.NewDefaultNode(),
	}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
773 774 775
	bfdata := &BigFileData{
		Node:    nodefs.NewDefaultNode(),
		bigfile: bf,
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
776
		loading: make(map[int64]*blkLoadState),
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
777 778
	}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
779 780
	bfroot.tab[oid] = bfdir
	bfroot.mu.Unlock()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
781

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
782 783 784 785
	// mkdir takes filesystem treeLock - do it outside bfroot.mu
	mkdir(bfroot, name, bfdir)
	mkdir(bfdir, "head", bfhead)
	mkfile(bfhead, "data", bfdata)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
786
	mkfile(bfhead, "at", NewSmallFile(bf.readAt))	// TODO mtime(at) = tidtime(at)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
787
	// XXX mkfile(bh, "invalidations", bh.inv)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
788

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
789
	return bfdir.Inode(), nil
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
790 791
}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
792 793
// XXX do we need to support rmdir? (probably no)

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
794
// /bigfile/<bigfileX> -> Mkdir receives client request to create @<tid>/.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
795
func (bfdir *BigFileDir) Mkdir(name string, mode uint32, fctx *fuse.Context) (*nodefs.Inode, fuse.Status) {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
796
	inode, err := bfdir.mkdir(name, fctx) // XXX ok to ignore mode?
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
797 798 799 800 801 802
	return inode, err2LogStatus(err)
}

func (bfdir *BigFileDir) mkdir(name string, fctx *fuse.Context) (_ *nodefs.Inode, err error) {
	defer xerr.Contextf(&err, "/bigfile/%s: mkdir %q", bfdir.oid, name)

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
803 804 805 806 807 808 809 810
	var tid zodb.Tid
	ok := false

	if strings.HasPrefix(name, "@") {
		tid, err = zodb.ParseTid(name[1:])
		ok = (err == nil)
	}
	if !ok {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
811
		return nil, eINVALf("not @tid")
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
812 813
	}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
814
	// check to see if dir(tid) is already there
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
815 816 817 818 819
	bfdir.mu.Lock()
	_, already := bfdir.revTab[tid]
	bfdir.mu.Unlock()

	if already {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
820
		return nil, syscall.EEXIST
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
821 822
	}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
823
	// not there - without bfdir lock proceed to open BigFile @tid view of ZODB
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
824 825 826 827 828 829 830 831
	ctx := asctx(fctx)
	zconnRev, err := groot.zopenAt(ctx, tid)
	if err != nil {
		return nil, err
	}
	defer zconnRev.Release()

	bf, err := bigopen(ctx, zconnRev, bfdir.oid)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
832 833 834
	if err != nil {
		return nil, err
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
835
	defer func() {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
836 837
		if err != nil {
			bf.Close()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
838
		}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
839
	}()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
840

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
841 842 843 844 845 846 847 848
	// relock bfdir and either mkdir or EEXIST if the directory was maybe
	// simultanously created while we were not holding bfroot.mu
	bfdir.mu.Lock()
	_, already = bfdir.revTab[tid]
	if already {
		bfdir.mu.Unlock()
		return nil, syscall.EEXIST
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
849

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
850 851 852
	bfrev := &BigFileRev{
		Node: nodefs.NewDefaultNode(),
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
853

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
854 855 856 857 858
	revdata := &BigFileData{
		Node:    nodefs.NewDefaultNode(),
		bigfile: bf,
		loading: make(map[int64]*blkLoadState),
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
859

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
860 861
	bfdir.revTab[tid] = bfrev
	bfdir.mu.Unlock()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
862

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
863 864
	// mkdir takes filesystem treeLock - do it outside bfroot.mu
	mkdir(bfdir, name, bfrev)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
865 866
	mkfile(bfrev, "data", revdata)

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
867
	return bfrev.Inode(), nil
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
868 869 870
}


Kirill Smelkov's avatar
.  
Kirill Smelkov committed
871
// bigopen opens BigFile corresponding to oid on zconn.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
872 873 874 875
//
// A ZBigFile corresponding to oid is activated and statted.
//
// The whole result is returned as BigFile.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
876 877
func bigopen(ctx context.Context, zconn *ZConn, oid zodb.Oid) (_ *BigFile, err error) {
	defer xerr.Contextf(&err, "bigopen %s @%s", oid, zconn.At())
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
878 879

	// XXX better ctx = transaction.PutIntoContext(ctx, txn)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
880
	ctx, cancel := xcontext.Merge(ctx, zconn.txnCtx)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
881 882
	defer cancel()

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
883
	xzbf, err := zconn.Get(ctx, oid)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
884
	if err != nil {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
885 886 887 888 889 890 891 892
		switch errors.Cause(err).(type) {
		case *zodb.NoObjectError:
			return nil, eINVAL(err)
		case *zodb.NoDataError:
			return nil, eINVAL(err) // XXX what to do if it was existing and got deleted?
		default:
			return nil, err
		}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
893
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915

	zbf, ok := xzbf.(*ZBigFile)
	if !ok {
		return nil, eINVALf("%s is not a ZBigFile", typeOf(xzbf))
	}

	// activate ZBigFile and keep it this way
	err = zbf.PActivate(ctx)
	if err != nil {
		return nil, err
	}
	defer func() {
		if err != nil {
			zbf.PDeactivate()
		}
	}()

	zbfSize, err := zbf.Size(ctx)
	if err != nil {
		return nil, err
	}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
916
	zconn.Incref()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
917 918 919 920
	return &BigFile{
		zconn:   zconn,
		zbf:     zbf,
		zbfSize: zbfSize,
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
921 922 923

		// XXX this is needed only for head/
		δFtail:  NewΔTailI64(),	// XXX indicate we have coverage starting from zconn.at?
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
924
	}, nil
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
925 926
}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
927 928 929 930 931
// Close release all resources of BigFile.
func (bf *BigFile) Close() error {
	bf.zbf.PDeactivate()
	bf.zbf = nil

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
932
	bf.zconn.Release()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
933 934 935 936
	bf.zconn = nil

	return nil
}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
937

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
938 939
// /bigfile/<bigfileX>/head/data -> Getattr serves stat.
func (bfdata *BigFileData) GetAttr(out *fuse.Attr, _ nodefs.File, fctx *fuse.Context) fuse.Status {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
940 941
	// XXX locking

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
942 943
	bf := bfdata.bigfile

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
944
	out.Mode = fuse.S_IFREG | 0444
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
945
	out.Size = uint64(bf.zbfSize)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
946
	// .Blocks
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
947
	// .Blksize
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
948 949 950

	// FIXME lastChange should cover all bigfile data, not only ZBigFile itself
	//mtime := &bfdata.lastChange.Time().Time
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
951
	lastChange := bf.zbf.PSerial()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
952 953 954 955 956 957 958 959

	mtime := lastChange.Time().Time
	out.SetTimes(/*atime=*/nil, /*mtime=*/&mtime, /*ctime=*/&mtime)

	return fuse.OK
}


Kirill Smelkov's avatar
.  
Kirill Smelkov committed
960 961
// /bigfile/<bigfileX>/head/data -> Read serves reading bigfile data.
func (bfdata *BigFileData) Read(_ nodefs.File, dest []byte, off int64, fctx *fuse.Context) (fuse.ReadResult, fuse.Status) {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
962 963
	// XXX locking

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
964
	bf := bfdata.bigfile
Kirill Smelkov's avatar
Kirill Smelkov committed
965
	zbf := bf.zbf
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
966

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
967 968 969 970 971 972 973 974 975
	// cap read request to file size
	end := off + int64(len(dest))		// XXX overflow?
	if end > bf.zbfSize {
		end = bf.zbfSize
	}
	if end <= off {
		// XXX off >= size -> EINVAL? (but when size=0 kernel issues e.g. [0 +4K) read)
		return fuse.ReadResultData(nil), fuse.OK
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
976

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
977
	// widen read request to be aligned with blksize granularity
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
978
	// (we can load only whole ZBlk* blocks)
Kirill Smelkov's avatar
Kirill Smelkov committed
979
	aoff := off - (off % zbf.blksize)
980 981 982 983
	aend := end
	if re := end % zbf.blksize; re != 0 {
		aend += zbf.blksize - re
	}
984
	dest = make([]byte, aend - aoff) // ~> [aoff:aend) in file
Kirill Smelkov's avatar
Kirill Smelkov committed
985

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
986
	// XXX better ctx = transaction.PutIntoContext(ctx, txn)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
987
	ctx, cancel := xcontext.Merge(asctx(fctx), bf.zconn.txnCtx)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
988 989
	defer cancel()

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
990
	// read/load all block(s) in parallel
Kirill Smelkov's avatar
Kirill Smelkov committed
991 992 993 994 995
	wg, ctx := errgroup.WithContext(ctx)
	for blkoff := aoff; blkoff < aend; blkoff += zbf.blksize {
		blkoff := blkoff
		blk := blkoff / zbf.blksize
		wg.Go(func() error {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
996
			δ := blkoff-aoff // blk position in dest
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
997
			//log.Infof("readBlk #%d dest[%d:+%d]", blk, δ, zbf.blksize)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
998
			return bfdata.readBlk(ctx, blk, dest[δ:δ+zbf.blksize])
Kirill Smelkov's avatar
Kirill Smelkov committed
999 1000 1001 1002 1003
		})
	}

	err := wg.Wait()
	if err != nil {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1004
		log.Errorf("%s", err)	// XXX + /bigfile/XXX: read [a,b): -> ...
Kirill Smelkov's avatar
Kirill Smelkov committed
1005 1006 1007
		return nil, fuse.EIO
	}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1008
	return fuse.ReadResultData(dest[off-aoff:end-aoff]), fuse.OK
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1009 1010
}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1011
// readBlk serves Read to read 1 ZBlk #blk into destination buffer.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1012
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1013 1014
// see "6) when we receive a FUSE read(#blk) request ..." in overview.
//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1015
// len(dest) == blksize.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1016
func (bfdata *BigFileData) readBlk(ctx context.Context, blk int64, dest []byte) error {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1017
	// XXX errctx?
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1018
	// XXX locking
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1019

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1020 1021 1022 1023 1024 1025 1026
	// check if someone else is already loading this block
	bfdata.loadMu.Lock()
	loading, already := bfdata.loading[blk]
	if !already {
		loading = &blkLoadState{
			ready:   make(chan struct{}),
		}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1027
		bfdata.loading[blk] = loading
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1028 1029 1030
	}
	bfdata.loadMu.Unlock()

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1031
	// if it is already loading - just wait for it
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1032 1033 1034 1035 1036 1037
	if already {
		select {
		case <-ctx.Done():
			return ctx.Err()

		case <-loading.ready:
1038
			if loading.err == nil {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1039 1040 1041 1042 1043 1044 1045
				copy(dest, loading.blkdata)
			}
			return loading.err
		}
	}

	// noone was loading - we became reponsible to load this block
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1046 1047

	zbf := bfdata.bigfile.zbf
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1048
	blkdata, err := zbf.LoadBlk(ctx, blk)	// XXX -> +blkrevmax1
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1049 1050 1051 1052
	loading.blkdata = blkdata
	loading.err = err
	close(loading.ready)

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1053
	// XXX before loading.ready?
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1054 1055 1056 1057
	blkrevmax2, _ := bfdata.bigfile.δFtail.LastRevOf(blk, zbf.PJar().At())
	//revmax := min(blkrevmax1, blkrevmax2)
	revmax := blkrevmax2
	_ = revmax
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1058

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1059
/*
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082
	// XXX remmapping
	// XXX -> own func?
	// XXX locking
	for _, mapping := range bfdata.mappings {
		if revmax <= mapping.at || !mapping.blkrange.in(blk) {
			continue // do nothing
		}

		if mapping.pinned.Contains(blk) {
			continue // do nothing
		}

		rev = max(δFtail.by(blk) : _ <= mapping.at)

		// XXX vvv -> go
		client.remmap(mapping.addr[blk], file/@<rev>/data)
		mapping.pinned.Add(blk)


	}
*/


Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1083 1084 1085
	// data loaded with error - cleanup .loading
	if loading.err != nil {
		bfdata.loadMu.Lock()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1086
		delete(bfdata.loading, blk)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1087 1088 1089 1090 1091 1092 1093 1094 1095 1096
		bfdata.loadMu.Unlock()
		return err
	}

	// data loaded ok
	copy(dest, blkdata)

	// store to kernel pagecache whole block that we've just loaded from database.
	// This way, even if the user currently requested to read only small portion from it,
	// it will prevent next e.g. consecutive user read request to again hit
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1097
	// the DB, and instead will be served by kernel from its pagecache.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1098 1099 1100
	//
	// We cannot do this directly from reading goroutine - while reading
	// kernel FUSE is holding corresponging page in pagecache locked, and if
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1101
	// we would try to update that same page in pagecache it would result
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1102 1103
	// in deadlock inside kernel.
	//
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1104
	// .loading cleanup is done once we are finished with putting the data into OS pagecache.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1105 1106 1107
	// If we do it earlier - a simultaneous read covered by the same block could result
	// into missing both kernel pagecache (if not yet updated) and empty .loading[blk],
	// and thus would trigger DB access again.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1108 1109
	go func() {
		// XXX locking - invalidation must make sure this workers are finished.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1110 1111

		// XXX if direct-io: don't touch pagecache
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1112
		st := gfsconn.FileNotifyStoreCache(bfdata.Inode(), blk*zbf.blksize, blkdata)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1113 1114

		bfdata.loadMu.Lock()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1115
		delete(bfdata.loading, blk)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1116 1117
		bfdata.loadMu.Unlock()

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1118 1119
		if st == fuse.OK {
			return
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1120
		}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1121 1122 1123 1124 1125

		// pagecache update failed, but it must not (we verified on startup that
		// pagecache control is supported by kernel). We can correctly live on
		// with the error, but data access will be likely very slow. Tell user
		// about the problem.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1126
		log.Errorf("BUG: bigfile %s: blk %d: -> pagecache: %s  (ignoring, but reading from bigfile will be very slow)", zbf.POid(), blk, st)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1127
	}()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1128 1129 1130 1131

	return nil
}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1132 1133

// /bigfile/<bigfileX>/head/at -> readAt serves read.
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1134 1135
func (bf *BigFile) readAt() []byte {
	// XXX locking
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1136 1137
	// XXX zbf.PJar() not good if we want to share objects between connections?
	return []byte(bf.zbf.PJar().At().String())
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1138 1139
}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1140

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1141 1142 1143



Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1144
// FIXME groot/gfsconn is tmp workaround for lack of way to retrieve FileSystemConnector from nodefs.Inode
Kirill Smelkov's avatar
Kirill Smelkov committed
1145 1146 1147 1148 1149
// TODO:
//	- Inode += .Mount() -> nodefs.Mount
//	- Mount:
//		.Root()		-> root Inode of the fs
//		.Connector()	-> FileSystemConnector through which fs is mounted
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1150
var groot   *Root
Kirill Smelkov's avatar
Kirill Smelkov committed
1151 1152
var gfsconn *nodefs.FileSystemConnector

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1153
func main() {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1154
	stdlog.SetPrefix("wcfs: ")
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1155
	//log.CopyStandardLogTo("WARNING") // XXX -> "DEBUG" if -d ?
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1156
	defer log.Flush()
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1157

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1158
	debug := flag.Bool("d", false, "debug")
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1159
	autoexit := flag.Bool("autoexit", false, "automatically stop service when there is no client activity")
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1160 1161
	// XXX option to prevent starting if wcfs was already started ?

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1162 1163
	flag.Parse()
	if len(flag.Args()) != 2 {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1164
		log.Fatalf("Usage: %s [OPTIONS] zurl mntpt", os.Args[0])
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1165 1166 1167 1168
	}
	zurl := flag.Args()[0]
	mntpt := flag.Args()[1]

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1169 1170 1171 1172 1173
	// debug -> precise t, no dates	(XXX -> always precise t?)
	if *debug {
		stdlog.SetFlags(stdlog.Lmicroseconds)
	}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1174
	// open zodb storage/db/connection
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1175 1176 1177 1178 1179 1180 1181
	ctx := context.Background()	// XXX + timeout?
	zstor, err := zodb.OpenStorage(ctx, zurl, &zodb.OpenOptions{ReadOnly: true})
	if err != nil {
		log.Fatal(err)
	}
	defer zstor.Close()

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1182
	zdb := zodb.NewDB(zstor)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1183
	zhead, err := zopen(ctx, zdb, &zodb.ConnOptions{})
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1184 1185 1186
	if err != nil {
		log.Fatal(err)
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1187
	zhead.Cache().SetControl(&zodbCacheControl{})	// XXX +locking?
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1188

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1189
	// mount root
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1190 1191 1192 1193
	root := &Root{
		Node:    nodefs.NewDefaultNode(),
		zstor:   zstor,
		zdb:     zdb,
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1194
		zhead:   zhead,
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1195 1196 1197
		zrevTab: make(map[zodb.Tid]*ZConn),
	}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1198
	opts := &fuse.MountOptions{
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1199 1200
		FsName: zurl,
		Name:   "wcfs",
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1201 1202

		DisableXAttrs: true, // we don't use
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1203
		Debug:         *debug,
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1204
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1205

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1206
	fssrv, fsconn, err := mount(mntpt, root, opts)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1207
	if err != nil {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1208
		log.Fatal(err)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1209
	}
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1210 1211
	groot   = root		// FIXME temp workaround (see ^^^)
	gfsconn = fsconn	// FIXME ----//----
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1212

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1213
	// we require proper pagecache control (added to Linux 2.6.36 in 2010)
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1214
	supports := fssrv.KernelSettings().SupportsNotify
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1215
	if !(supports(fuse.NOTIFY_STORE_CACHE) && supports(fuse.NOTIFY_RETRIEVE_CACHE)) {
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1216
		log.Fatalf("kernel FUSE does not support pagecache control")
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1217 1218
	}

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1219 1220
	// add entries to /
	mkfile(root, ".wcfs", NewStaticFile([]byte(zurl)))
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1221 1222 1223 1224
	mkdir(root, "bigfile", &BigFileRoot{
		Node:  nodefs.NewDefaultNode(),
		tab:   make(map[zodb.Oid]*BigFileDir),
	})
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1225

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1226 1227 1228
	// TODO handle autoexit
	_ = autoexit

Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1229
	// serve client requests
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1230
	fssrv.Serve()	// XXX Serve returns no error
Kirill Smelkov's avatar
.  
Kirill Smelkov committed
1231
}