Commit fe7efb94 authored by Kirill Smelkov's avatar Kirill Smelkov

X start of wcfs

parent 60e8f859
package main
import (
	"flag"
	"fmt"
	"log"
	"os"
	"sync/atomic"
	"time"

	"github.com/hanwen/go-fuse/fuse"
	"github.com/hanwen/go-fuse/fuse/nodefs"
)
// dir represents a directory in the filesystem.
//
// It carries no state of its own beyond the embedded node. main uses a dir as
// the filesystem root, and mkdir/mkfile attach further children under it.
type dir struct {
// embedded node implementation; instances are built with nodefs.NewDefaultNode()
nodefs.Node
}
// file represents a file in the filesystem.
//
// A file has no stored content: Open generates the data to serve for each
// opening separately.
type file struct {
// embedded node implementation; instances are built with nodefs.NewDefaultNode()
nodefs.Node
}
// fileHandle represents opened file.
//
// One fileHandle is created by file.Open for every opening.
type fileHandle struct {
// embedded file implementation; built with nodefs.NewDefaultFile()
nodefs.File
// data served by Read for this particular opening; set once in Open
content []byte
}
// Lookup resolves name among the children already attached to d's inode.
//
// It returns ENOENT when there is no such child. Otherwise it fills out with
// the child's attributes, as reported by the child's own GetAttr, and returns
// the child inode. If GetAttr fails, its status is returned and no inode.
//
// XXX recheck whether Lookup is needed
func (d *dir) Lookup(out *fuse.Attr, name string, ctx *fuse.Context) (*nodefs.Inode, fuse.Status) {
	ientry := d.Inode().GetChild(name)
	if ientry == nil {
		return nil, fuse.ENOENT
	}

	// A successful lookup has to report the entry's attributes; without this
	// the caller would be handed a zeroed fuse.Attr.
	if st := ientry.Node().GetAttr(out, nil, ctx); st != fuse.OK {
		return nil, st
	}
	return ientry, fuse.OK
}
// nopen counts how many times any file has been opened so far.
//
// It must only be accessed through sync/atomic, since FUSE requests may be
// served from several goroutines at once.
var nopen int64

// Open creates a new handle for f whose content is unique to this opening:
// the running open counter followed by the file name.
//
// The returned file is wrapped with FOPEN_DIRECT_IO. flags is ignored.
func (f *file) Open(flags uint32, _ *fuse.Context) (nodefs.File, fuse.Status) {
	_, name := f.Inode().Parent()

	// Take the counter value from the atomic add itself, so that concurrent
	// opens neither race nor observe the same number.
	n := atomic.AddInt64(&nopen, 1)
	data := fmt.Sprintf("%04d %s\n", n, name)
	h := &fileHandle{File: nodefs.NewDefaultFile(), content: []byte(data)}

	// force direct-io to disable pagecache: we always return different data
	// and st_size=0 (like in /proc).
	return &nodefs.WithFlags{
		File:      h,
		FuseFlags: fuse.FOPEN_DIRECT_IO,
	}, fuse.OK
}
// Read serves data from the content fixed at Open time.
//
// At most one byte is returned per call, after a one second delay, no matter
// how large dest is. A read starting at or past the end of the content yields
// an empty result (EOF). A negative offset is rejected with EINVAL.
func (fh *fileHandle) Read(dest []byte, off int64) (fuse.ReadResult, fuse.Status) {
	if off < 0 {
		return nil, fuse.EINVAL
	}

	l := int64(len(dest))

	// XXX demonstrate we can indeed serve different content to different openings.
	if l >= 1 {
		l = 1
		time.Sleep(1 * time.Second)
	}

	// Clamp the start first: slicing content[off:end] with off beyond the
	// data would panic, and off+l cannot overflow once off <= len(content).
	ldata := int64(len(fh.content))
	if off > ldata {
		off = ldata
	}
	end := off + l
	if end > ldata {
		end = ldata
	}

	res := fh.content[off:end]
	fmt.Printf("read [%d:%d] -> %q\n", off, end, res)
	return fuse.ReadResultData(res), fuse.OK
}
// mkdir attaches a new directory node called name under d and returns it.
func (d *dir) mkdir(name string) *dir {
	const isDir = true

	sub := new(dir)
	sub.Node = nodefs.NewDefaultNode()
	d.Inode().NewChild(name, isDir, sub)
	return sub
}
// mkfile attaches a new regular-file node called name under d and returns it.
func (d *dir) mkfile(name string) *file {
	const isDir = false

	leaf := new(file)
	leaf.Node = nodefs.NewDefaultNode()
	d.Inode().NewChild(name, isDir, leaf)
	return leaf
}
// main mounts the demo filesystem on the mountpoint given as the only
// positional argument, populates it with one directory and one file, and then
// serves FUSE requests.
//
// Flags:
//
//	-d    enable go-fuse debug output (default true)
func main() {
	debug := flag.Bool("d", true, "debug")
	flag.Parse()
	if flag.NArg() != 1 {
		log.Fatalf("Usage: %s mntpt", os.Args[0])
	}
	mntpt := flag.Arg(0)

	root := &dir{Node: nodefs.NewDefaultNode()}
	opts := nodefs.NewOptions()
	if *debug {
		opts.Debug = true
	}

	server, _, err := nodefs.MountRoot(mntpt, root, opts)
	if err != nil {
		// say what was being mounted, not only why it failed
		log.Fatalf("mount %s: %v", mntpt, err)
	}

	// NOTE cannot make entries before mount because Inode.AddChild does
	// not work before that (panics on nil deref to mountRootXXX)
	root.mkdir("aaa")
	root.mkfile("hello.txt")

	server.Serve() // XXX error?
}
// Overview graph of wcfs-related items; every node gets its display label in
// the second half of the file.
// NOTE(review): edges presumably mean "consists of / depends on" — confirm.
digraph {
// items hanging off the wcfs server
wcfs -> wcfs_simple;
wcfs -> ZODB_go_inv;
wcfs -> Sinvtree;
wcfs -> δR;
wcfs_simple -> Btree_read;
wcfs_simple -> ZBlk_read;
// items hanging off the client; δR is shared with wcfs
client -> wcfs_spawn;
client -> δR;
client -> nowcfs;
// node labels
wcfs [label="wcfs"]
wcfs_simple [label="wcfs no\ninvalidations"]
client [label="client"]
wcfs_spawn [label="spawn wcfs"]
nowcfs [label="!wcfs mode"]
ZODB_go_inv [label="ZODB/go\ninvalidations"]
Btree_read [label="BTree read"]
ZBlk_read [label="ZBigFile / ZBlk* read"]
Sinvtree [label="server: inv. tree"]
δR [label="δR encoding"]
// declared without any edge, so it is rendered as a standalone node
test [label="? tests"]
}
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 2.40.1 (20161225.0304)
-->
<!-- Title: %3 Pages: 1 -->
<svg width="864pt" height="206pt"
viewBox="0.00 0.00 863.69 205.74" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 201.7401)">
<title>%3</title>
<polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-201.7401 859.6909,-201.7401 859.6909,4 -4,4"/>
<!-- wcfs -->
<g id="node1" class="node">
<title>wcfs</title>
<ellipse fill="none" stroke="#000000" cx="342.0456" cy="-179.7401" rx="27.0966" ry="18"/>
<text text-anchor="middle" x="342.0456" y="-176.0401" font-family="Times,serif" font-size="14.00" fill="#000000">wcfs</text>
</g>
<!-- wcfs_simple -->
<g id="node2" class="node">
<title>wcfs_simple</title>
<ellipse fill="none" stroke="#000000" cx="129.0456" cy="-98.8701" rx="60.623" ry="26.7407"/>
<text text-anchor="middle" x="129.0456" y="-102.6701" font-family="Times,serif" font-size="14.00" fill="#000000">wcfs no</text>
<text text-anchor="middle" x="129.0456" y="-87.6701" font-family="Times,serif" font-size="14.00" fill="#000000">invalidations</text>
</g>
<!-- wcfs&#45;&gt;wcfs_simple -->
<g id="edge1" class="edge">
<title>wcfs&#45;&gt;wcfs_simple</title>
<path fill="none" stroke="#000000" d="M318.2464,-170.7982C289.8538,-160.1201 240.9589,-141.6981 199.0456,-125.7401 194.301,-123.9337 189.3775,-122.0541 184.4396,-120.1654"/>
<polygon fill="#000000" stroke="#000000" points="185.5598,-116.8466 174.9694,-116.5393 183.0567,-123.3838 185.5598,-116.8466"/>
</g>
<!-- ZODB_go_inv -->
<g id="node3" class="node">
<title>ZODB_go_inv</title>
<ellipse fill="none" stroke="#000000" cx="269.0456" cy="-98.8701" rx="60.623" ry="26.7407"/>
<text text-anchor="middle" x="269.0456" y="-102.6701" font-family="Times,serif" font-size="14.00" fill="#000000">ZODB/go</text>
<text text-anchor="middle" x="269.0456" y="-87.6701" font-family="Times,serif" font-size="14.00" fill="#000000">invalidations</text>
</g>
<!-- wcfs&#45;&gt;ZODB_go_inv -->
<g id="edge2" class="edge">
<title>wcfs&#45;&gt;ZODB_go_inv</title>
<path fill="none" stroke="#000000" d="M327.9704,-164.1476C319.622,-154.8991 308.7181,-142.8196 298.5632,-131.5699"/>
<polygon fill="#000000" stroke="#000000" points="300.9919,-129.0371 291.6932,-123.9593 295.7958,-133.7276 300.9919,-129.0371"/>
</g>
<!-- Sinvtree -->
<g id="node4" class="node">
<title>Sinvtree</title>
<ellipse fill="none" stroke="#000000" cx="414.0456" cy="-98.8701" rx="66.0889" ry="18"/>
<text text-anchor="middle" x="414.0456" y="-95.1701" font-family="Times,serif" font-size="14.00" fill="#000000">server: inv. tree</text>
</g>
<!-- wcfs&#45;&gt;Sinvtree -->
<g id="edge3" class="edge">
<title>wcfs&#45;&gt;Sinvtree</title>
<path fill="none" stroke="#000000" d="M355.9279,-164.1476C366.0124,-152.8207 379.8773,-137.2477 391.5356,-124.1531"/>
<polygon fill="#000000" stroke="#000000" points="394.4204,-126.1765 398.4559,-116.3803 389.1922,-121.5217 394.4204,-126.1765"/>
</g>
<!-- δR -->
<g id="node5" class="node">
<title>δR</title>
<ellipse fill="none" stroke="#000000" cx="554.0456" cy="-98.8701" rx="55.7903" ry="18"/>
<text text-anchor="middle" x="554.0456" y="-95.1701" font-family="Times,serif" font-size="14.00" fill="#000000">δR encoding</text>
</g>
<!-- wcfs&#45;&gt;δR -->
<g id="edge4" class="edge">
<title>wcfs&#45;&gt;δR</title>
<path fill="none" stroke="#000000" d="M366.3483,-171.1484C395.5899,-160.7334 446.1138,-142.4915 489.0456,-125.7401 496.006,-123.0242 503.3525,-120.0716 510.5028,-117.1505"/>
<polygon fill="#000000" stroke="#000000" points="512.1604,-120.2533 520.077,-113.212 509.4973,-113.7796 512.1604,-120.2533"/>
</g>
<!-- Btree_read -->
<g id="node6" class="node">
<title>Btree_read</title>
<ellipse fill="none" stroke="#000000" cx="50.0456" cy="-18" rx="50.0912" ry="18"/>
<text text-anchor="middle" x="50.0456" y="-14.3" font-family="Times,serif" font-size="14.00" fill="#000000">BTree read</text>
</g>
<!-- wcfs_simple&#45;&gt;Btree_read -->
<g id="edge5" class="edge">
<title>wcfs_simple&#45;&gt;Btree_read</title>
<path fill="none" stroke="#000000" d="M104.9137,-74.167C95.0785,-64.0989 83.7372,-52.4892 73.9278,-42.4475"/>
<polygon fill="#000000" stroke="#000000" points="76.2668,-39.8333 66.7753,-35.1257 71.2595,-44.7248 76.2668,-39.8333"/>
</g>
<!-- ZBlk_read -->
<g id="node7" class="node">
<title>ZBlk_read</title>
<ellipse fill="none" stroke="#000000" cx="208.0456" cy="-18" rx="89.8845" ry="18"/>
<text text-anchor="middle" x="208.0456" y="-14.3" font-family="Times,serif" font-size="14.00" fill="#000000">ZBigFile / ZBlk* read</text>
</g>
<!-- wcfs_simple&#45;&gt;ZBlk_read -->
<g id="edge6" class="edge">
<title>wcfs_simple&#45;&gt;ZBlk_read</title>
<path fill="none" stroke="#000000" d="M153.1774,-74.167C162.7957,-64.321 173.8543,-53.0007 183.512,-43.1143"/>
<polygon fill="#000000" stroke="#000000" points="186.093,-45.4809 190.5772,-35.8818 181.0856,-40.5894 186.093,-45.4809"/>
</g>
<!-- client -->
<g id="node8" class="node">
<title>client</title>
<ellipse fill="none" stroke="#000000" cx="680.0456" cy="-179.7401" rx="30.5947" ry="18"/>
<text text-anchor="middle" x="680.0456" y="-176.0401" font-family="Times,serif" font-size="14.00" fill="#000000">client</text>
</g>
<!-- client&#45;&gt;δR -->
<g id="edge8" class="edge">
<title>client&#45;&gt;δR</title>
<path fill="none" stroke="#000000" d="M659.2392,-166.3861C639.8634,-153.9502 610.6331,-135.1894 587.8985,-120.5978"/>
<polygon fill="#000000" stroke="#000000" points="589.5328,-117.4878 579.2265,-115.0319 585.7518,-123.3789 589.5328,-117.4878"/>
</g>
<!-- wcfs_spawn -->
<g id="node9" class="node">
<title>wcfs_spawn</title>
<ellipse fill="none" stroke="#000000" cx="680.0456" cy="-98.8701" rx="51.9908" ry="18"/>
<text text-anchor="middle" x="680.0456" y="-95.1701" font-family="Times,serif" font-size="14.00" fill="#000000">spawn wcfs</text>
</g>
<!-- client&#45;&gt;wcfs_spawn -->
<g id="edge7" class="edge">
<title>client&#45;&gt;wcfs_spawn</title>
<path fill="none" stroke="#000000" d="M680.0456,-161.3894C680.0456,-151.3599 680.0456,-138.7154 680.0456,-127.4265"/>
<polygon fill="#000000" stroke="#000000" points="683.5457,-127.1481 680.0456,-117.1481 676.5457,-127.1481 683.5457,-127.1481"/>
</g>
<!-- nowcfs -->
<g id="node10" class="node">
<title>nowcfs</title>
<ellipse fill="none" stroke="#000000" cx="803.0456" cy="-98.8701" rx="52.7911" ry="18"/>
<text text-anchor="middle" x="803.0456" y="-95.1701" font-family="Times,serif" font-size="14.00" fill="#000000">!wcfs mode</text>
</g>
<!-- client&#45;&gt;nowcfs -->
<g id="edge9" class="edge">
<title>client&#45;&gt;nowcfs</title>
<path fill="none" stroke="#000000" d="M700.633,-166.2043C719.5906,-153.74 748.0404,-135.0349 770.1469,-120.5002"/>
<polygon fill="#000000" stroke="#000000" points="772.1452,-123.3752 778.5781,-114.9569 768.2995,-117.5262 772.1452,-123.3752"/>
</g>
<!-- test -->
<g id="node11" class="node">
<title>test</title>
<ellipse fill="none" stroke="#000000" cx="761.0456" cy="-179.7401" rx="32.4942" ry="18"/>
<text text-anchor="middle" x="761.0456" y="-176.0401" font-family="Times,serif" font-size="14.00" fill="#000000">? tests</text>
</g>
</g>
</svg>
// Copyright (C) 2018 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// Package wcfs provides filesystem server with file data backed by wendelin.core arrays.
//
// Intro
//
// Each wendelin.core array (ZBigArray) is actually a linear file (ZBigFile)
// and array metadata like dtype, shape and strides associated with it. This
// package exposes as files only ZBigFile data and leaves rest of
// array-specific handling to client. Every ZBigFile is exposed as one separate
// file that represents whole ZBigFile's data.
//
// For a client, the primary way to access a bigfile should be to mmap
// bigfile/<bigfileX>/head/data which represents always latest bigfile data.
// Clients that want to get isolation guarantee should subscribe for
// invalidations and re-mmap invalidated regions to file with pinned bigfile revision for
// the duration of their transaction. See "Invalidation protocol" for details.
//
// In the usual situation when bigfiles are big, and there are O(1)/δt updates,
// there should be no need for any cache besides shared kernel cache of latest
// bigfile data.
//
//
// Filesystem organization
//
// Top-level structure of provided filesystem is as follows:
//
// bigfile/
// <oid(bigfile1)>/
// ...
// <oid(bigfile2)>/
// ...
// ...
//
// where for a bigfileX there is bigfile/<oid(bigfileX)>/ directory, with
// oid(bigfileX) being ZODB object-id of corresponding ZBigFile object formatted with %016x.
//
// Each bigfileX/ has the following structure:
//
// bigfile/<bigfileX>/
// head/ ; latest bigfile revision
// ...
// @<tid1>/ ; bigfile revision as of transaction <tidX>
// ...
// @<tid2>/
// ...
// ...
//
// where head/ represents latest bigfile as stored in upstream ZODB, and
// @<tidX>/ represents bigfile as of transaction <tidX>.
//
// head/ has the following structure:
//
// bigfile/<bigfileX>/head/
// data ; latest bigfile data
// at ; data is bigfile view as of this ZODB transaction
// invalidations ; channel that describes invalidated data regions
//
// where /data represents latest bigfile data as stored in upstream ZODB. As
// there can be some lag receiving updates from the database, /at describes
// precisely ZODB state for which bigfile data is currently exposed. Whenever
// bigfile data is changed in upstream ZODB, information about the changes is
// first propagated to /invalidations, and only after that /data is
// updated. See "Invalidation protocol" for details.
//
// @<tidX>/ has the following structure:
//
// bigfile/<bigfileX>/@<tidX>/
// data ; bigfile data as of transaction <tidX>
//
// where /data represents bigfile data as of transaction <tidX>.
//
// bigfile/<bigfileX>/ should be created by client via mkdir. Unless explicitly
// created bigfile/<bigfileX>/ are not automatically visible in wcfs
// filesystem. Similarly bigfile/<bigfileX>/@<tidX>/ should also be created by
// the client.
//
//
// Invalidation protocol
//
// In order to support isolation wcfs implements invalidation protocol that
// must be cooperatively followed by both wcfs and client.
//
// First, before client wants to mmap bigfile, it opens
// bigfile/<bigfileX>/head/invalidations and tells wcfs through it for which
// ZODB state it wants to get bigfile view. The server in turn reports for
// which ZODB state head/data is current, δ describing changed bigfile region
// between those revisions, or "wait" flag if server state is earlier compared
// to what client wants:
//
// C: want <Cat>
// S: have <Sat>, wait ; Sat < Cat
// S: have <Sat>, δR(Cat,Sat) ; Sat ≥ Cat
//
// If server reply was "wait" the client does nothing and waits for next server
// message which must come without "wait" flag set. When client receives have
// message with δR(Cat,Sat) it has the guarantee from wcfs that head/data
// content is for Sat ZODB revision and won't change until client sends ack
// back to the server. The client in turn now can mmap head/data and
// @<Cat>/data to get bigfile view as of Cat:
//
// mmap(bigfile/<bigfileX>/head/data)
// mmap(bigfile/<bigfileX>/@<Cat>/data, δR(Cat,Sat), MAP_FIXED) # mmapped at addresses corresponding to δR(Cat,Sat)
//
// When client completes its initial mmapping it sends ack back to the server:
//
// C: ack
//
// From now on the server will be processing updates to bigfile coming from
// ZODB as follows:
//
//
// The filesystem server itself receives information about changed data
// from ZODB server through regular ZODB invalidation channel (as it is ZODB
// client itself). Then, before actually updating bigfile/<bigfileX>/head/data
// content in changed part, it notifies through bigfile/<bigfileX>/head/invalidations
// to clients that had opened this file (separately to each client) about the changes:
//
// S: have <Sat>, δR(Sat_prev, Sat)
//
// where Sat_prev is ZODB revision last reported to client for this bigfile,
// and waits until they all confirm that changed file part can be updated in
// global OS cache.
//
// The client in turn can now re-mmap invalidated regions to bigfile@Cat
//
// # mmapped at addresses corresponding to δR(Sat_prev, Sat)
// mmap(bigfile/<bigfileX>/@<Cat>/data, δR(Sat_prev, Sat), MAP_FIXED)
//
// and must send ack back to the server when it is done:
//
// C: ack
//
// When clients are done with bigfile/<bigfileX>/@<Cat>/data (i.e. Cat
// transaction ends and array is unmapped), the server sees number of opened
// files to bigfile/<bigfileX>/@<Cat>/data drops to zero, and automatically
// destroys bigfile/<bigfileX>/@<Cat>/ directory after reasonable timeout.
//
//
// Protection against slow or faulty clients
//
// If a client, on purpose or due to a bug or being stopped, is slow to
// respond with ack to invalidation notification, it creates a problem because
// head/data updates will be blocked and thus all other clients that try to
// work with current data will get stuck.
//
// To avoid this problem it should be possible for wcfs to stop a client with
// ptrace and change its address space in a style similar to e.g.
// VirtualAllocEx on Windows. Here is hacky example how this could be done on Linux:
//
// https://gist.github.com/rofl0r/1073739/63f0f788a4923e26fcf743dd9a8411d4916f0ac0
//
// This way there should be no possibility for a client to block wcfs
// indefinitely waiting for client's ack.
//
// Similarly for initial mmappings client could first mmap head/data, then open
// head/invalidations and tell the server that it wants Cat revision, with
// the server then remmaping blocks to get to Cat state via ptrace.
//
// However for simplicity the plan is to go first without ptrace and just kill
// a slow client on, say 30 seconds, timeout.
//
//
// Writes
//
// As each bigfile is represented by 1 synthetic file, there can be several
// write schemes:
//
// 1. mmap(MAP_PRIVATE) + writeout by client
//
// In this scheme bigfile data is mmapped in MAP_PRIVATE mode, so that local
// user changes are not automatically propagated back to the file. When there
// is a need to commit, client investigates via some OS mechanism, e.g.
// /proc/self/pagemap or something similar, which pages of this mapping it
// modified. Knowing this it knows which data it dirtied and so can write this
// data back to ZODB itself, without filesystem server providing write support.
//
// 2. mmap(MAP_SHARED, PROT_READ) + write-tracking & writeout by client
//
// In this scheme bigfile data is mmapped in MAP_SHARED mode with read-only pages
// protection. Then whenever write fault occurs, client allocates RAM from
// shmfs, copies faulted page to it, and then mmaps RAM page with RW protection
// in place of original bigfile page. Writeout implementation should be similar
// to "1", only here client already knows the pages it dirtied, and this way
// there is no need to consult /proc/self/pagemap.
//
// The advantage of this scheme over mmap(MAP_PRIVATE) is that in case
// there are several in-process mappings of the same bigfile with overlapping
// in-file ranges, changes in one mapping will be visible in another mapping.
// Contrary: whenever a MAP_PRIVATE mapping is modified, the kernel COWs
// faulted page into a page completely private to this mapping, so that other
// MAP_PRIVATE mappings of this file, including ones created from the same
// process, do not see changes made to the first mapping.
//
// Since wendelin.core needs to provide coherency in between different slices
// of the same array, this is the mode wendelin.core actually uses.
//
// 3. write to wcfs
//
// XXX we later could implement "write-directly" mode where clients would write
// data directly into the file.
package wcfs
// Notes on OS pagecache control:
//
// the cache of snapshotted bigfile can be pre-made hot, if invalidated region
// was already in pagecache of head/data:
//
// - we can retrieve a region from pagecache of head/data with FUSE_NOTIFY_RETRIEVE.
// - we can store that retrieved data into pagecache region of @<tidX>/ with FUSE_NOTIFY_STORE.
// - we can invalidate a region from pagecache of head/data with FUSE_NOTIFY_INVAL_INODE.
//
// we have to disable FUSE_AUTO_INVAL_DATA to tell the kernel we are fully
// responsible for invalidating pagecache. If we don't, the kernel will be
// clearing whole cache of head/data on e.g. its mtime change.
//
// XXX disabling FUSE_AUTO_INVAL_DATA does not fully prevent the kernel from automatically
// invalidating pagecache - e.g. it will invalidate whole cache on file size changes:
//
// https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/fuse/inode.c?id=e0bc833d10#n233
//
// we can currently workaround it with using writeback mode (see !is_wb in the
// link above), but better we have proper FUSE flag for filesystem server to
// tell the kernel it is fully responsible for invalidating pagecache.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment