Commit 6f0cdaff authored by Kirill Smelkov's avatar Kirill Smelkov

wcfs: Provide isolation to clients

Via custom isolation protocol that both server and clients must cooperatively
follow. This is the core change that enables file cache to be practically
shared while each client can still be provided with isolated view of the database.

This patch brings only server changes, tests + the minimum client bits to support the tests.
The client library, that will implement isolation protocol on client side, will come next.

This patch is organized as follows:

- wcfs.go brings in description of the protocol, overview of how server
  implements that protocol and the implementation itself.
  See also notes.txt

- wcfs_test.py brings in tests for server implementation.
  tWCFS._abort_ontimeout had to be moved into nogil mode into wcfs_test.pyx
  to avoid deadlock on the GIL (see comments in wcfs_test.pyx for details).

- files added in wcfs/client/ are needed to provide client-side
  implementation of WatchLink - the message exchange protocol over
  opened head/watch file - for tests. Client-side watchlink implementation
  lives in wcfs/client/wcfs_watchlink.{h,cpp}. The other additions in
  wcfs/client/ are to support that and to expose the WatchLink to Python.

  Client-side bits are done right in C++ because upcoming WCFS client
  library will be implemented in C++ to work in nogil mode in order to
  avoid deadlock on the GIL because client-side pinner thread might be
  woken-up synchronously by WCFS server at any moment, including when
  another client thread already holds the GIL and is paused by WCFS.

Some preliminary history:

kirr/wendelin.core@9b4a42a3    X invalidation design draftly settled
kirr/wendelin.core@27d91d47    X δFtail settled
kirr/wendelin.core@c27c1940    X mmap over under pagefault to this mmapping works
kirr/wendelin.core@d36b171f    X ptrace when client is under pagefault or syscall won't work
kirr/wendelin.core@c1f5bb19    X notes on why lazy-invalidate approach was taken
kirr/wendelin.core@4fbdd270    X Proof that that it is possible to change mmapping while under pagefault to it
kirr/wendelin.core@33e0dfce    X ΔTail draftly done
kirr/wendelin.core@12628943    X make sure "bye" is always processed immediately - even if a handleWatch is currently blocked
kirr/wendelin.core@af0a64cb    X test for "bye" canceling blocked handlers
kirr/wendelin.core@996dc6a8    X Fix race in test
kirr/wendelin.core@43915fe9    X wcfs: Don't forbid simultaneous watch requests
kirr/wendelin.core@941dc54b    X wcfs: threading.Lock -> sync.Mutex
kirr/wendelin.core@d75b2304    X wcfs: Move _abort_ontimeout to pyx/nogil
kirr/wendelin.core@79234659    X Notes on why eagier invalidation was rejected
kirr/wendelin.core@f05271b1    X Test that sysread(/head/watch) can be interrupted
kirr/wendelin.core@5ba816da    X restore test_wcfs_watch_robust after f05271b1.
kirr/wendelin.core@4bd88564    X "Invalidation protocol" -> "Isolation protocol"
kirr/wendelin.core@f7b54ca4    X avoid fmt::vsprintf  (now compils again with latest pygolang@master)
kirr/wendelin.core@0a8fcd9d    X wcfs/client: Move EOF -> pygolang
kirr/wendelin.core@153e02e6    X test_wcfs_watch_setup and test_wcfs_watch_setup_ahead work again
kirr/wendelin.core@17f98edc    X wcfs: client: os: Factor syserr -> string into _sysErrString
kirr/wendelin.core@7b0c301c    X wcfs: tests: Fix tFile.assertBlk not to segfault on a test failure
kirr/wendelin.core@b74dda09    X Start switching Track from Track(key) to Track(keycov)
kirr/wendelin.core@8b5d8523    X Move tracking of which blocks were accessed from wcfs to ΔFtail
parent 4430de41
/* Wendelin.bigfile | virtual memory tests
* Copyright (C) 2014-2019 Nexedi SA and Contributors.
* Copyright (C) 2014-2021 Nexedi SA and Contributors.
* Kirill Smelkov <kirr@nexedi.com>
*
* This program is free software: you can Use, Study, Modify and Redistribute
......@@ -338,7 +338,8 @@ void test_file_access_synthetic(void)
size_t PS, PSb;
int err;
/* MUST_FAULT(code) - checks that code faults */
/* MUST_FAULT(code) - checks that code faults */
/* somewhat dup in wcfs/internal/wcfs_test.pyx */
sigjmp_buf fault_jmp;
volatile int fault_expected = 0;
void sigfault_handler(int sig) {
......
......@@ -282,6 +282,12 @@ libvirtmem_h = [
'include/wendelin/utils.h',
]
libwcfs_h = [
'wcfs/client/wcfs.h',
'wcfs/client/wcfs_misc.h',
'wcfs/client/wcfs_watchlink.h',
]
setup(
name = 'wendelin.core',
version = '0.13',
......@@ -306,7 +312,13 @@ setup(
'lib/utils.c'],
depends = libvirtmem_h,
define_macros = [('_GNU_SOURCE',None)],
language = 'c')],
language = 'c'),
DSO('wendelin.wcfs.client.libwcfs',
['wcfs/client/wcfs.cpp',
'wcfs/client/wcfs_watchlink.cpp',
'wcfs/client/wcfs_misc.cpp'],
depends = libwcfs_h)],
ext_modules = [
PyGoExt('wendelin.bigfile._bigfile',
......@@ -319,6 +331,11 @@ setup(
language = 'c',
dsos = ['wendelin.bigfile.libvirtmem']),
PyGoExt('wendelin.wcfs.client._wcfs',
['wcfs/client/_wcfs.pyx'],
depends = libwcfs_h,
dsos = ['wendelin.wcfs.client.libwcfs']),
PyGoExt('wendelin.wcfs.internal.wcfs_test',
['wcfs/internal/wcfs_test.pyx']),
......
......@@ -60,6 +60,10 @@ from persistent import Persistent
from zodbtools.util import ashex as h
from six.moves.urllib.parse import urlsplit, urlunsplit
from .client._wcfs import \
PyWCFS as _WCFS, \
PyWatchLink as WatchLink \
# Server represents running wcfs server.
#
......@@ -79,7 +83,7 @@ class Server:
# Raw files on wcfs can be accessed with ._path/._read/._stat/._open .
#
# WCFS logically mirrors ZODB.DB .
class WCFS:
class WCFS(_WCFS):
# .mountpoint path to wcfs mountpoint
# ._fwcfs /.wcfs/zurl opened to keep the server from going away (at least cleanly)
# ._njoin this connection was returned for so many joins
......
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2021 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
# cython: language_level=2
# distutils: language=c++
# Package _wcfs provides Python-wrappers for C++ wcfs client package.
#
# It wraps WCFS and WatchLink.
from golang cimport chan, structZ, string, error, refptr
from golang cimport context
from libc.stdint cimport int64_t, uint64_t
from libcpp.utility cimport pair
from libcpp.vector cimport vector
cdef extern from "wcfs/client/wcfs_misc.h" namespace "zodb" nogil:
ctypedef uint64_t Tid
ctypedef uint64_t Oid
cdef extern from "wcfs/client/wcfs_misc.h" namespace "wcfs" nogil:
const Tid TidHead
# pyx/nogil description for C++ classes
cdef extern from "wcfs/client/wcfs_watchlink.h" namespace "wcfs" nogil:
cppclass _WatchLink:
error close()
error closeWrite()
pair[string, error] sendReq(context.Context ctx, const string &req)
error recvReq(context.Context ctx, PinReq *prx)
error replyReq(context.Context ctx, const PinReq *req, const string& reply);
vector[string] fatalv
chan[structZ] rx_eof
cppclass WatchLink (refptr[_WatchLink]):
# WatchLink.X = WatchLink->X in C++
error close "_ptr()->close" ()
error closeWrite "_ptr()->closeWrite"()
pair[string, error] sendReq "_ptr()->sendReq" (context.Context ctx, const string &req)
error recvReq "_ptr()->recvReq" (context.Context ctx, PinReq *prx)
error replyReq "_ptr()->replyReq" (context.Context ctx, const PinReq *req, const string& reply);
vector[string] fatalv "_ptr()->fatalv"
chan[structZ] rx_eof "_ptr()->rx_eof"
cppclass PinReq:
Oid foid
int64_t blk
Tid at
string msg
error _twlinkwrite(WatchLink wlink, const string& pkt)
cdef extern from "wcfs/client/wcfs.h" namespace "wcfs" nogil:
cppclass WCFS:
string mountpoint
pair[WatchLink, error] _openwatch()
# ---- python bits ----
cdef class PyWCFS:
cdef WCFS wc
cdef class PyWatchLink:
cdef WatchLink wlink
cdef class PyPinReq:
cdef PinReq pinreq
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2021 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
# cython: language_level=2
# cython: auto_pickle=False
# distutils: language=c++
# Package _wcfs provides Python-wrappers for C++ wcfs client package.
# See _wcfs.pxd for package overview.
from golang cimport pychan, pyerror, nil
from golang cimport io
from ZODB.utils import p64
cdef class PyWCFS:
property mountpoint:
def __get__(PyWCFS pywc):
return pywc.wc.mountpoint
def __set__(PyWCFS pywc, string v):
pywc.wc.mountpoint = v
cdef class PyWatchLink:
def __init__(PyWatchLink pywlink, PyWCFS pywc):
with nogil:
_ = wcfs_openwatch_pyexc(&pywc.wc)
pywlink.wlink = _.first
err = _.second
if err != nil:
raise pyerr(err)
def __dealloc__(PyWatchLink pywlink):
pywlink.wlink = nil
def close(PyWatchLink pywlink):
with nogil:
err = wlink_close_pyexc(pywlink.wlink)
if err != nil:
raise pyerr(err)
def closeWrite(PyWatchLink pywlink):
with nogil:
err = wlink_closeWrite_pyexc(pywlink.wlink)
if err != nil:
raise pyerr(err)
def sendReq(PyWatchLink pywlink, context.PyContext pyctx, string req): # -> reply(string)
with nogil:
_ = wlink_sendReq_pyexc(pywlink.wlink, pyctx.ctx, req)
reply = _.first
err = _.second
if err != nil:
raise pyerr(err)
return reply
def recvReq(PyWatchLink pywlink, context.PyContext pyctx): # -> PinReq | None when EOF
cdef PyPinReq pyreq = PyPinReq.__new__(PyPinReq)
with nogil:
err = wlink_recvReq_pyexc(pywlink.wlink, pyctx.ctx, &pyreq.pinreq)
if err.eq(io.EOF):
return None
if err != nil:
raise pyerr(err)
return pyreq
def replyReq(PyWatchLink pywlink, context.PyContext pyctx, PyPinReq pyreq, string reply):
with nogil:
err = wlink_replyReq_pyexc(pywlink.wlink, pyctx.ctx, &pyreq.pinreq, reply)
if err != nil:
raise pyerr(err)
return
# XXX for tests
property fatalv:
def __get__(PyWatchLink pywlink):
return pywlink.wlink.fatalv
property rx_eof:
def __get__(PyWatchLink pywlink):
return pychan.from_chan_structZ(pywlink.wlink.rx_eof)
cdef class PyPinReq:
property foid:
def __get__(PyPinReq pypin):
return p64(pypin.pinreq.foid)
property blk:
def __get__(PyPinReq pypin):
return pypin.pinreq.blk
property at:
def __get__(PyPinReq pypin):
at = pypin.pinreq.at
if at == TidHead:
return None
return p64(at)
# wcfs_test.py uses req.msg in several places
property msg:
def __get__(PyPinReq pypin):
return pypin.pinreq.msg
def _tpywlinkwrite(PyWatchLink pywlink, bytes pypkt):
cdef string pkt = pypkt
with nogil:
err = _twlinkwrite_pyexc(pywlink.wlink, pkt)
if err != nil:
raise pyerr(err)
# ---- misc ----
# pyerr converts error into python error.
cdef object pyerr(error err):
return pyerror.from_error(err)
from golang cimport topyexc
cdef nogil:
pair[WatchLink, error] wcfs_openwatch_pyexc(WCFS *wcfs) except +topyexc:
return wcfs._openwatch()
error wlink_close_pyexc(WatchLink wlink) except +topyexc:
return wlink.close()
error wlink_closeWrite_pyexc(WatchLink wlink) except +topyexc:
return wlink.closeWrite()
pair[string, error] wlink_sendReq_pyexc(WatchLink wlink, context.Context ctx, const string &req) except +topyexc:
return wlink.sendReq(ctx, req)
error wlink_recvReq_pyexc(WatchLink wlink, context.Context ctx, PinReq *prx) except +topyexc:
return wlink.recvReq(ctx, prx)
error wlink_replyReq_pyexc(WatchLink wlink, context.Context ctx, const PinReq *req, const string& reply) except +topyexc:
return wlink.replyReq(ctx, req, reply)
error _twlinkwrite_pyexc(WatchLink wlink, const string& pkt) except +topyexc:
return _twlinkwrite(wlink, pkt)
// Copyright (C) 2018-2021 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// Package wcfs provides WCFS client.
#include "wcfs_misc.h"
#include "wcfs.h"
#include "wcfs_watchlink.h"
#include <golang/errors.h>
#include <golang/fmt.h>
// wcfs::
namespace wcfs {
// ---- WCFS raw file access ----
// _path returns path for object on wcfs.
// - str: wcfs root + obj;
string WCFS::_path(const string &obj) {
WCFS& wc = *this;
return wc.mountpoint + "/" + obj;
}
tuple<os::File, error> WCFS::_open(const string &path, int flags) {
WCFS& wc = *this;
string path_ = wc._path(path);
return os::open(path_, flags);
}
// ---- misc ----
string WCFS::String() const {
const WCFS& wc = *this;
return fmt::sprintf("wcfs %s", v(wc.mountpoint));
}
} // wcfs::
// Copyright (C) 2018-2021 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// Package wcfs provides WCFS client.
#ifndef _NXD_WCFS_H_
#define _NXD_WCFS_H_
#include <golang/libgolang.h>
#include <tuple>
#include "wcfs_misc.h"
// wcfs::
namespace wcfs {
using namespace golang;
using std::tuple;
using std::pair;
typedef refptr<struct _WatchLink> WatchLink;
struct PinReq;
// WCFS represents filesystem-level connection to wcfs server.
//
// Use wcfs.join in Python API to create it.
//
// WCFS logically mirrors ZODB.DB .
// It is safe to use WCFS from multiple threads simultaneously.
struct WCFS {
string mountpoint;
pair<WatchLink, error> _openwatch();
string String() const;
// at OS-level, on-WCFS raw files can be accessed via ._path and ._open.
string _path(const string &obj);
tuple<os::File, error> _open(const string &path, int flags=O_RDONLY);
};
} // wcfs::
#endif
// Copyright (C) 2019-2021 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
#include "wcfs_misc.h"
#include <golang/libgolang.h>
#include <golang/errors.h>
#include <golang/fmt.h>
#include <golang/io.h>
using namespace golang;
#include <inttypes.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <algorithm>
#include <memory>
// golang::
namespace golang {
// os::
namespace os {
// TODO -> os.PathError + err=syscall.Errno
static error _pathError(const char *op, const string &path, int syserr);
static string _sysErrString(int syserr);
int _File::fd() const { return _fd; }
string _File::name() const { return _path; }
_File::_File() {}
_File::~_File() {}
void _File::decref() {
if (__decref())
delete this;
}
tuple<File, error> open(const string &path, int flags, mode_t mode) {
int fd = ::open(path.c_str(), flags, mode);
if (fd == -1)
return make_tuple(nil, _pathError("open", path, errno));
File f = adoptref(new _File);
f->_path = path;
f->_fd = fd;
return make_tuple(f, nil);
}
error _File::close() {
_File& f = *this;
int err = ::close(f._fd);
if (err != 0)
return f._errno("close");
f._fd = -1;
return nil;
}
tuple<int, error> _File::read(void *buf, size_t count) {
_File& f = *this;
int n;
n = ::read(f._fd, buf, count);
if (n == 0)
return make_tuple(n, io::EOF_);
if (n < 0)
return make_tuple(0, f._errno("read"));
return make_tuple(n, nil);
}
tuple <int, error> _File::write(const void *buf, size_t count) {
_File& f = *this;
int n, wrote=0;
// NOTE contrary to write(2) we have to write all data as io.Writer requires.
while (count != 0) {
n = ::write(f._fd, buf, count);
if (n < 0)
return make_tuple(wrote, f._errno("write"));
wrote += n;
buf = ((const char *)buf) + n;
count -= n;
}
return make_tuple(wrote, nil);
}
error _File::stat(struct stat *st) {
_File& f = *this;
int err = fstat(f._fd, st);
if (err != 0)
return f._errno("stat");
return nil;
}
// _errno returns error corresponding to op(file) and errno.
error _File::_errno(const char *op) {
_File& f = *this;
return _pathError(op, f._path, errno);
}
// _pathError returns os.PathError-like for op/path and system error
// indicated by syserr.
static error _pathError(const char *op, const string &path, int syserr) {
// TODO v(_sysErrString(syserr)) -> v(syscall.Errno(syserr))
return fmt::errorf("%s %s: %s", op, v(path), v(_sysErrString(syserr)));
}
// _sysErrString returns string corresponding to system error syserr.
static string _sysErrString(int syserr) {
char ebuf[128];
char *estr = strerror_r(syserr, ebuf, sizeof(ebuf));
return string(estr);
}
} // os::
// xstrconv:: (strconv-like)
namespace xstrconv {
// parseHex64 decodes 16-character-wide hex-encoded string into uint64.
tuple<uint64_t, error> parseHex64(const string& s) {
if (s.size() != 16)
return make_tuple(0, fmt::errorf("hex64 %s invalid", v(s)));
uint64_t v;
int n = sscanf(s.c_str(), "%16" SCNx64, &v);
if (n != 1)
return make_tuple(0, fmt::errorf("hex64 %s invalid", v(s)));
return make_tuple(v, nil);
}
// parseInt decodes string s as signed decimal integer.
tuple<int64_t, error> parseInt(const string& s) {
int64_t v;
int n = sscanf(s.c_str(), "%" SCNi64, &v);
if (!(n == 1 && std::to_string(v) == s))
return make_tuple(0, fmt::errorf("int %s invalid", v(s)));
return make_tuple(v, nil);
}
// parseUint decodes string s as unsigned decimal integer.
tuple<uint64_t, error> parseUint(const string& s) {
uint64_t v;
int n = sscanf(s.c_str(), "%" SCNu64, &v);
if (!(n == 1 && std::to_string(v) == s))
return make_tuple(0, fmt::errorf("uint %s invalid", v(s)));
return make_tuple(v, nil);
}
} // xstrconv::
} // golang::
// xerr::
namespace xerr {
// XXX don't require fmt::vsprintf
#if 0
Contextf::Contextf(const char *format, ...) {
Contextf& c = *this;
va_list argp;
va_start(argp, format);
c.errctx = fmt::sprintfv(format, argp);
va_end(argp);
}
#endif
error Contextf::operator() (error err) const {
const Contextf& c = *this;
if (err == nil)
return nil;
return fmt::errorf("%s: %w", v(c.errctx), err);
}
} // xerr::
#include <golang/time.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/syscall.h>
// golang::log::
namespace golang {
namespace log {
void __Logf(const char *file, int line, char level, const char *format, ...) {
double t = time::now();
time_t t_int = time_t(t);
struct tm tm_loc;
localtime_r(&t_int, &tm_loc);
char t_buf[32];
strftime(t_buf, sizeof(t_buf), "%m%d %H:%M:%S", &tm_loc);
int t_us = int((t-t_int)*1E6);
pid_t tid = syscall(SYS_gettid);
string prefix = fmt::sprintf("%c%s.%06d % 7d %s:%d] ", level, t_buf, t_us, tid, file, line);
// TODO better to emit prefix and msg in one go.
flockfile(stderr);
fprintf(stderr, "%s", v(prefix));
va_list argp;
va_start(argp, format);
vfprintf(stderr, format, argp);
va_end(argp);