Commit 0ec8ce51 authored by Kirill Smelkov's avatar Kirill Smelkov

.

parent e728f5db
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2020 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
# cython: language_level=2
# distutils: language=c++
# Package _wcfs provides Python-wrappers for C++ wcfs client package.
#
# It wraps WCFS/Conn/FileH/Mapping and WatchLink to help client_test.py unit-test
# WCFS base-layer mmap functionality. At functional level WCFS client (and especially
# pinner) is verified when running wendelin.core array tests in wcfs mode.
from golang cimport chan, structZ, string, error, refptr
from golang cimport context, cxx
from libc.stdint cimport int64_t, uint64_t, uint8_t
from libcpp.utility cimport pair
from libcpp.vector cimport vector
# Identifier types declared by wcfs_misc.h in C++ namespace zodb.
# Both are plain unsigned 64-bit integers. Presumably Tid is a ZODB
# transaction id and Oid a ZODB object id - confirm against wcfs_misc.h.
# _wcfs.pyx converts them from/to python-level values with u64/p64.
cdef extern from "wcfs/client/wcfs_misc.h" namespace "zodb" nogil:
ctypedef uint64_t Tid
ctypedef uint64_t Oid
cdef extern from "wcfs/client/wcfs_misc.h" namespace "wcfs" nogil:
# TidHead is a special Tid constant from C++ namespace wcfs.
# PyPinReq.at (in _wcfs.pyx) reports a pin whose .at equals TidHead as None.
const Tid TidHead
# pyx/nogil description for C++ classes
cdef extern from "wcfs/client/wcfs_watchlink.h" namespace "wcfs" nogil:
# _WatchLink is the C++ watch-link object itself.
# Everything here is declared nogil, so it can be called without the GIL.
cppclass _WatchLink:
error close()
error closeWrite()
# sendReq returns (reply, error) pair.
pair[string, error] sendReq(context.Context ctx, const string &req)
# recvReq fills *prx with the received request.
# _wcfs.pyx treats io.EOF returned from here as "no more requests".
error recvReq(context.Context ctx, PinReq *prx)
error replyReq(context.Context ctx, const PinReq *req, const string& reply);
# fatalv and rx_eof are exposed to python for tests (see PyWatchLink in _wcfs.pyx).
vector[string] fatalv
chan[structZ] rx_eof
# WatchLink is refptr handle to _WatchLink. Every member below carries an
# explicit C name of the form "_ptr()->X" so that cython-level `wlink.X`
# compiles into C++ `wlink._ptr()->X`.
cppclass WatchLink (refptr[_WatchLink]):
# WatchLink.X = WatchLink->X in C++
error close "_ptr()->close" ()
error closeWrite "_ptr()->closeWrite"()
pair[string, error] sendReq "_ptr()->sendReq" (context.Context ctx, const string &req)
error recvReq "_ptr()->recvReq" (context.Context ctx, PinReq *prx)
error replyReq "_ptr()->replyReq" (context.Context ctx, const PinReq *req, const string& reply);
vector[string] fatalv "_ptr()->fatalv"
chan[structZ] rx_eof "_ptr()->rx_eof"
# PinReq is request filled by recvReq; its fields are exposed to python
# read-only via PyPinReq properties (foid, blk, at, msg).
cppclass PinReq:
Oid foid
int64_t blk
Tid at
string msg
# _twlinkwrite is test-only entry point: _wcfs.pyx wraps it as _tpywlinkwrite.
# Presumably it writes raw packet pkt to wlink - confirm in wcfs_watchlink.h.
error _twlinkwrite(WatchLink wlink, const string& pkt)
cdef extern from "wcfs/client/wcfs.h" namespace "wcfs" nogil:
# WCFS is held by value inside PyWCFS; python sets .mountpoint and then
# calls connect / opens a watch link through it.
cppclass WCFS:
string mountpoint
# both calls return (result, error) pair.
pair[WatchLink, error] _openwatch()
pair[Conn, error] connect(Tid at)
# _Conn is the C++ connection object; Conn below is refptr handle to it.
cppclass _Conn:
pair[FileH, error] open(Oid foid)
error close()
error resync(Tid at)
# Members of the refptr handles below carry explicit C names "_ptr()->X",
# so that cython-level `h.X` compiles into C++ `h._ptr()->X`.
cppclass Conn (refptr[_Conn]):
# Conn.X = Conn->X in C++
pair[FileH, error] open "_ptr()->open" (Oid foid)
error close "_ptr()->close" ()
error resync "_ptr()->resync" (Tid at)
# _FileH is the C++ file handle object; FileH below is refptr handle to it.
cppclass _FileH:
size_t blksize
error close()
pair[Mapping, error] mmap(int64_t blk_start, int64_t blk_len) # `VMA *vma=nil` not exposed
cppclass FileH (refptr[_FileH]):
# FileH.X = FileH->X in C++
size_t blksize "_ptr()->blksize"
error close "_ptr()->close" ()
pair[Mapping, error] mmap "_ptr()->mmap" (int64_t blk_start, int64_t blk_len)
# _Mapping is the C++ mapping object; Mapping below is refptr handle to it.
# mem_start/mem_stop delimit the mapped memory: PyMapping.__getbuffer__
# exports [mem_start, mem_stop) as read-only buffer.
cppclass _Mapping:
FileH fileh
int64_t blk_start
int64_t blk_stop() const
uint8_t *mem_start
uint8_t *mem_stop
error unmap()
cppclass Mapping (refptr[_Mapping]):
# Mapping.X = Mapping->X in C++
FileH fileh "_ptr()->fileh"
int64_t blk_start "_ptr()->blk_start"
int64_t blk_stop "_ptr()->blk_stop" () const
uint8_t *mem_start "_ptr()->mem_start"
uint8_t *mem_stop "_ptr()->mem_stop"
error unmap "_ptr()->unmap" ()
# _tfileh_pinned is test-only accessor: PyFileH.pinned converts its result
# into python dict {blk -> rev}.
cxx.dict[int64_t, Tid] _tfileh_pinned(FileH wfileh)
# ---- python bits ----
# PyWCFS is python wrapper for C++ WCFS.
cdef class PyWCFS:
# the wrapped WCFS is stored by value, not via refptr
cdef WCFS wc
# PyConn is python wrapper for C++ Conn.
cdef class PyConn:
# refptr handle to the wrapped connection; reset to nil in __dealloc__
cdef Conn wconn
# PyFileH is python wrapper for C++ FileH.
cdef class PyFileH:
# refptr handle to the wrapped file handle; reset to nil in __dealloc__
cdef FileH wfileh
# PyMapping is python wrapper for C++ Mapping.
cdef class PyMapping:
# refptr handle to the wrapped mapping; unmapped and reset to nil in __dealloc__
cdef Mapping wmmap
# python-level file handle this mapping was created from (set by PyFileH.mmap);
# readonly: visible to python code as .fileh, but not assignable from there
cdef readonly PyFileH fileh
# PyWatchLink is python wrapper for C++ WatchLink.
cdef class PyWatchLink:
# refptr handle to the wrapped watch link; set in __init__, reset to nil in __dealloc__
cdef WatchLink wlink
# PyPinReq is python wrapper for C++ PinReq.
cdef class PyPinReq:
# the wrapped request is stored by value; PyWatchLink.recvReq fills it in place
cdef PinReq pinreq
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2020 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
# cython: language_level=2
# cython: auto_pickle=False
# distutils: language=c++
# Package _wcfs provides Python-wrappers for C++ wcfs client package.
# See _wcfs.pxd for package overview.
from golang cimport pychan, pyerror, nil
from golang cimport io
cdef extern from *:
ctypedef bint cbool "bool"
from ZODB.utils import p64, u64
from cpython cimport PyBuffer_FillInfo
from libcpp.unordered_map cimport unordered_map
# PyWCFS exposes C++ WCFS to python: .mountpoint attribute and .connect().
cdef class PyWCFS:

    # mountpoint is read/write view of wc.mountpoint.
    property mountpoint:
        def __get__(PyWCFS pywc):
            return pywc.wc.mountpoint
        def __set__(PyWCFS pywc, string v):
            pywc.wc.mountpoint = v

    # connect creates connection to wcfs with database view as of pyat.
    #
    # pyat is converted to Tid via u64. The C++ call is done without the GIL.
    # If the call returns an error it is raised as python exception.
    def connect(PyWCFS pywc, pyat): # -> PyConn
        cdef Tid                at = u64(pyat)
        cdef pair[Conn, error]  ret
        cdef Conn               wconn
        cdef error              err
        cdef PyConn             pywconn

        with nogil:
            ret   = wcfs_connect_pyexc(&pywc.wc, at)
            wconn = ret.first
            err   = ret.second

        if err != nil:
            raise pyerr(err)

        # wrap resulting C++ connection without going through __init__
        pywconn = PyConn.__new__(PyConn)
        pywconn.wconn = wconn
        return pywconn
# PyConn exposes C++ Conn to python: .close(), .open() and .resync().
#
# All C++ calls are done without the GIL; a non-nil error returned from C++
# is raised as python exception.
cdef class PyConn:

    def __dealloc__(PyConn pywconn):
        # release our handle to the C++ connection
        pywconn.wconn = nil

    def close(PyConn pywconn):
        cdef error err
        with nogil:
            err = wconn_close_pyexc(pywconn.wconn)
        if err != nil:
            raise pyerr(err)

    # open opens file handle for bigfile with oid pyfoid (converted via u64).
    def open(PyConn pywconn, pyfoid): # -> FileH
        cdef Oid                 foid = u64(pyfoid)
        cdef pair[FileH, error]  ret
        cdef FileH               wfileh
        cdef error               err
        cdef PyFileH             pywfileh

        with nogil:
            ret    = wconn_open_pyexc(pywconn.wconn, foid)
            wfileh = ret.first
            err    = ret.second

        if err != nil:
            raise pyerr(err)

        # wrap resulting C++ file handle without going through __init__
        pywfileh = PyFileH.__new__(PyFileH)
        pywfileh.wfileh = wfileh
        return pywfileh

    # resync asks the connection to resync to pyat (converted via u64).
    def resync(PyConn pywconn, pyat):
        cdef Tid   at = u64(pyat)
        cdef error err
        with nogil:
            err = wconn_resync_pyexc(pywconn.wconn, at)
        if err != nil:
            raise pyerr(err)
# PyFileH exposes C++ FileH to python: .close(), .mmap(), .blksize and,
# for tests, .pinned .
#
# All C++ calls are done without the GIL; a non-nil error returned from C++
# is raised as python exception.
cdef class PyFileH:

    def __dealloc__(PyFileH pywfileh):
        # release our handle to the C++ file handle
        pywfileh.wfileh = nil

    def close(PyFileH pywfileh):
        cdef error err
        with nogil:
            err = wfileh_close_pyexc(pywfileh.wfileh)
        if err != nil:
            raise pyerr(err)

    # mmap creates mapping for blk_len blocks starting from blk_start.
    #
    # The returned PyMapping keeps reference to this PyFileH in its .fileh .
    def mmap(PyFileH pywfileh, int64_t blk_start, int64_t blk_len):
        cdef pair[Mapping, error]  ret
        cdef Mapping               wmmap
        cdef error                 err
        cdef PyMapping             pywmmap

        with nogil:
            ret   = wfileh_mmap_pyexc(pywfileh.wfileh, blk_start, blk_len)
            wmmap = ret.first
            err   = ret.second

        if err != nil:
            raise pyerr(err)

        # the C++ mapping must point back to exactly our file handle
        assert wmmap.fileh .eq (pywfileh.wfileh)

        pywmmap = PyMapping.__new__(PyMapping)
        pywmmap.wmmap = wmmap
        pywmmap.fileh = pywfileh
        return pywmmap

    property blksize:
        def __get__(PyFileH pywfileh):
            return pywfileh.wfileh.blksize

    # XXX for tests
    # pinned returns {} blk -> rev with rev converted via p64.
    property pinned:
        def __get__(PyFileH pywfileh):
            # XXX cast: needed for cython to automatically convert to py dict
            cdef dict pinned = <unordered_map[int64_t, Tid]> _tfileh_pinned(pywfileh.wfileh)
            # rev(int64) -> rev(bytes)
            return {blk: p64(rev)  for blk, rev in pinned.items()}
# PyMapping exposes C++ Mapping to python: .blk_start, .blk_stop, .mem and
# .unmap(). The mapped memory is exported read-only via buffer protocol.
cdef class PyMapping:

    def __dealloc__(PyMapping pywmmap):
        cdef error err

        # unmap just in case (double unmap is ok)
        with nogil:
            err = wmmap_unmap_pyexc(pywmmap.wmmap)
        pywmmap.wmmap = nil

        if err != nil:
            raise pyerr(err)

    property blk_start:
        def __get__(PyMapping pywmmap):
            return pywmmap.wmmap.blk_start

    property blk_stop:
        def __get__(PyMapping pywmmap):
            return pywmmap.wmmap.blk_stop()

    # buffer protocol: export [mem_start, mem_stop) as read-only memory
    # owned by this PyMapping.
    def __getbuffer__(PyMapping pywmmap, Py_buffer *view, int flags):
        PyBuffer_FillInfo(
            view, pywmmap,
            pywmmap.wmmap.mem_start,
            pywmmap.wmmap.mem_stop - pywmmap.wmmap.mem_start,
            readonly=1, flags=flags)

    # mem is memoryview of the whole mapping.
    property mem:
        def __get__(PyMapping pywmmap) -> memoryview:
            return memoryview(pywmmap)

    def unmap(PyMapping pywmmap):
        cdef error err
        with nogil:
            err = wmmap_unmap_pyexc(pywmmap.wmmap)
        if err != nil:
            raise pyerr(err)
# ----------------------------------------
# PyWatchLink exposes C++ WatchLink to python.
#
# All C++ calls are done without the GIL; a non-nil error returned from C++
# is raised as python exception.
cdef class PyWatchLink:

    # PyWatchLink(pywc) opens new watch link on pywc.
    def __init__(PyWatchLink pywlink, PyWCFS pywc):
        cdef pair[WatchLink, error] ret
        cdef error                  err

        with nogil:
            ret = wcfs_openwatch_pyexc(&pywc.wc)
            pywlink.wlink = ret.first
            err           = ret.second

        if err != nil:
            raise pyerr(err)

    def __dealloc__(PyWatchLink pywlink):
        # release our handle to the C++ watch link
        pywlink.wlink = nil

    def close(PyWatchLink pywlink):
        cdef error err
        with nogil:
            err = wlink_close_pyexc(pywlink.wlink)
        if err != nil:
            raise pyerr(err)

    def closeWrite(PyWatchLink pywlink):
        cdef error err
        with nogil:
            err = wlink_closeWrite_pyexc(pywlink.wlink)
        if err != nil:
            raise pyerr(err)

    # sendReq sends req over the link and returns the reply.
    def sendReq(PyWatchLink pywlink, context.PyContext pyctx, string req): # -> reply(string)
        cdef pair[string, error] ret
        cdef string              reply
        cdef error               err

        with nogil:
            ret   = wlink_sendReq_pyexc(pywlink.wlink, pyctx.ctx, req)
            reply = ret.first
            err   = ret.second

        if err != nil:
            raise pyerr(err)
        return reply

    # recvReq receives next request from the link.
    # io.EOF from C++ is reported to python as None instead of exception.
    def recvReq(PyWatchLink pywlink, context.PyContext pyctx): # -> PinReq | None when EOF
        cdef PyPinReq pyreq = PyPinReq.__new__(PyPinReq)
        cdef error    err

        # the request is received directly into pyreq.pinreq
        with nogil:
            err = wlink_recvReq_pyexc(pywlink.wlink, pyctx.ctx, &pyreq.pinreq)

        if err.eq(io.EOF):
            return None
        if err != nil:
            raise pyerr(err)
        return pyreq

    # replyReq sends reply to previously received request pyreq.
    def replyReq(PyWatchLink pywlink, context.PyContext pyctx, PyPinReq pyreq, string reply):
        cdef error err
        with nogil:
            err = wlink_replyReq_pyexc(pywlink.wlink, pyctx.ctx, &pyreq.pinreq, reply)
        if err != nil:
            raise pyerr(err)

    # XXX for tests
    property fatalv:
        def __get__(PyWatchLink pywlink):
            return pywlink.wlink.fatalv

    # rx_eof is wlink.rx_eof wrapped into pychan.
    property rx_eof:
        def __get__(PyWatchLink pywlink):
            return pychan.from_chan_structZ(pywlink.wlink.rx_eof)
# PyPinReq exposes fields of C++ PinReq to python as read-only properties.
cdef class PyPinReq:

    # foid is pinreq.foid converted via p64.
    property foid:
        def __get__(PyPinReq pypin):
            return p64(pypin.pinreq.foid)

    property blk:
        def __get__(PyPinReq pypin):
            return pypin.pinreq.blk

    # at is pinreq.at converted via p64, or None if pinreq.at is TidHead.
    property at:
        def __get__(PyPinReq pypin):
            at = pypin.pinreq.at
            return None if at == TidHead else p64(at)

    # wcfs_test.py uses req.msg in several places
    property msg:
        def __get__(PyPinReq pypin):
            return pypin.pinreq.msg
# _tpywlinkwrite is python wrapper for test-only C++ _twlinkwrite: it passes
# pypkt to _twlinkwrite(pywlink.wlink, ·) without the GIL and raises returned
# error, if any, as python exception.
def _tpywlinkwrite(PyWatchLink pywlink, bytes pypkt):
    cdef string pkt = pypkt
    cdef error  err

    with nogil:
        err = _twlinkwrite_pyexc(pywlink.wlink, pkt)

    if err != nil:
        raise pyerr(err)
# ---- misc ----
# pyerr wraps C-level error err into python-level object via pyerror.from_error.
# Callers in this file do `raise pyerr(err)` after checking `err != nil`.
cdef object pyerr(error err):
    pyerr_ = pyerror.from_error(err)
    return pyerr_
from golang cimport topyexc
# X_pyexc wrappers: each one only forwards to corresponding C++ call, but is
# declared `except +topyexc`, so that a C++ exception escaping the call is
# handed to topyexc (from golang) instead of propagating through cython code.
# The wrappers are nogil: python-level methods above call them from inside
# `with nogil` blocks.
cdef nogil:
# ---- WCFS ----
pair[WatchLink, error] wcfs_openwatch_pyexc(WCFS *wcfs) except +topyexc:
return wcfs._openwatch()
pair[Conn, error] wcfs_connect_pyexc(WCFS *wcfs, Tid at) except +topyexc:
return wcfs.connect(at)
# ---- Conn ----
error wconn_close_pyexc(Conn wconn) except +topyexc:
return wconn.close()
pair[FileH, error] wconn_open_pyexc(Conn wconn, Oid foid) except +topyexc:
return wconn.open(foid)
error wconn_resync_pyexc(Conn wconn, Tid at) except +topyexc:
return wconn.resync(at)
# ---- FileH ----
error wfileh_close_pyexc(FileH wfileh) except +topyexc:
return wfileh.close()
pair[Mapping, error] wfileh_mmap_pyexc(FileH wfileh, int64_t blk_start, int64_t blk_len) except +topyexc:
return wfileh.mmap(blk_start, blk_len)
# ---- Mapping ----
error wmmap_unmap_pyexc(Mapping wmmap) except +topyexc:
return wmmap.unmap()
# ---- WatchLink ----
error wlink_close_pyexc(WatchLink wlink) except +topyexc:
return wlink.close()
error wlink_closeWrite_pyexc(WatchLink wlink) except +topyexc:
return wlink.closeWrite()
pair[string, error] wlink_sendReq_pyexc(WatchLink wlink, context.Context ctx, const string &req) except +topyexc:
return wlink.sendReq(ctx, req)
error wlink_recvReq_pyexc(WatchLink wlink, context.Context ctx, PinReq *prx) except +topyexc:
return wlink.recvReq(ctx, prx)
error wlink_replyReq_pyexc(WatchLink wlink, context.Context ctx, const PinReq *req, const string& reply) except +topyexc:
return wlink.replyReq(ctx, req, reply)
# ---- test-only ----
error _twlinkwrite_pyexc(WatchLink wlink, const string& pkt) except +topyexc:
return _twlinkwrite(wlink, pkt)
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2020 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
# it under the terms of the GNU General Public License version 3, or (at your
# option) any later version, as published by the Free Software Foundation.
#
# You can also Link and Combine this program with other software covered by
# the terms of any of the Free Software licenses or any of the Open Source
# Initiative approved licenses and Convey the resulting work. Corresponding
# source of such a combination shall include the source code for all other
# software used.
#
# This program is distributed WITHOUT ANY WARRANTY; without even the implied
# warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See COPYING file for full licensing terms.
# See https://www.nexedi.com/licensing for rationale and options.
"""client_test.py unit-tests virtmem layer provided by wcfs client.
WCFS filesystem itself is unit-tested by wcfs/wcfs_test.py .
At functional level, the whole wendelin.core test suite is used to verify
wcfs.py/wcfs.go while running tox tests in wcfs mode.
"""
from __future__ import print_function, absolute_import
from golang import func, defer, error, b
from wendelin.bigfile.file_zodb import ZBigFile
from wendelin.wcfs.wcfs_test import tDB, tAt
from wendelin.wcfs import wcfs_test
from wendelin.wcfs.internal.wcfs_test import read_mustfault
from wendelin.wcfs.internal import mm
from pytest import raises
from golang.golang_test import panics
# so that e.g. testdb is set up + ...
def setup_module():
    # delegate to wcfs_test: module-level setup is shared with it
    wcfs_test.setup_module()
def teardown_module():
    # delegate to wcfs_test: module-level teardown is shared with it
    wcfs_test.teardown_module()
def setup_function(f):
    # delegate to wcfs_test: per-test setup is shared with it
    wcfs_test.setup_function(f)
def teardown_function(f):
    # delegate to wcfs_test: per-test teardown is shared with it
    wcfs_test.teardown_function(f)
# tMapping wraps a Mapping together with its test database, and provides
# assertions about what reading the mapping memory gives.
class tMapping(object):

    def __init__(t, tdb, mmap):
        t.tdb  = tdb
        t.mmap = mmap

    # _blkview returns view of mmap memory corresponding to block blk.
    # blk must be already checked by the caller to be inside the mapping.
    def _blkview(t, blk):
        blksize = t.mmap.fileh.blksize
        offset  = (blk - t.mmap.blk_start) * blksize
        return t.mmap.mem[offset:][:blksize]

    # assertBlk asserts that mmap[·] with · corresponding to blk reads as dataok.
    # pinnedOK: {} blk -> rev of t.mmap.fileh.pinned after access.
    #
    # dataok may be shorter than one block - it is then compared as if it was
    # padded with trailing zeros up to block size.
    #
    # see also: tFile.assertBlk .
    # NOTE contrary to tFile, pinnedOK represents full fh.pinned state, not
    # only pins that wcfs sent to client after tested access.
    def assertBlk(t, blk, dataok, pinnedOK):
        assert t.mmap.blk_start <= blk < t.mmap.blk_stop

        fh      = t.mmap.fileh
        blksize = fh.blksize

        dataok = b(dataok)
        assert len(dataok) <= blksize
        dataok += b'\0'*(blksize - len(dataok))     # trailing zeros

        blkview = t._blkview(blk)

        # NOTE access to memory goes _with_ GIL: this verifies that wcfs pinner
        # is implemented in fully nogil mode because if that was not the case,
        # the pinner would deadlock trying to acquire GIL in its thread while
        # user thread that triggered the access is already holding the GIL.
        #
        #      - - - - - -
        #     |           |
        #        pinner <------.
        #     |           |   wcfs
        #        client -------^
        #     |           |
        #      - - - - - -
        #     client process
        #
        byte0 = blkview[0]
        assert byte0 == dataok[0]
        assert blkview.tobytes() == dataok

        assert fhpinned(t.tdb, fh) == pinnedOK

    # assertBlkFaults asserts that mmap[·] with · corresponding to blk raises
    # SIGSEGV on read access.
    #
    # One byte at the start of every page of the block is probed.
    def assertBlkFaults(t, blk):
        assert t.mmap.blk_start <= blk < t.mmap.blk_stop

        blkview = t._blkview(blk)
        for pageoff in range(0, len(blkview), mm.PAGE_SIZE):
            read_mustfault(blkview[pageoff:][:1])
# fhpinned(t, fh) returns new dict built from fh.pinned with every rev wrapped
# into tAt(t, rev).
# XXX better wrap FileH into tFileH and do this automatically in .pinned ?
def fhpinned(t, fh):
    return {blk: tAt(t, rev)  for blk, rev in fh.pinned.items()}
# test_wcfs_client unit-tests virtmem layer of wcfs client.
#
# It opens one connection and file handle, mmaps blocks [2,5) of the file and
# then verifies what the mapping reads as, and what fh.pinned is, while the
# database is committed to and the connection is resynced forward.
@func
def test_wcfs_client():
t = tDB(); zf = t.zfile; at0=t.at0
defer(t.close)
# pinned(fh) is fh.pinned with revisions wrapped into tAt, so that it can be
# compared with {blk: at} dicts below.
pinned = lambda fh: fhpinned(t, fh)
# two commits are made before connecting; the connection is then opened @at1,
# i.e. not at the latest committed state.
at1 = t.commit(zf, {2:'c1', 3:'d1'})
at2 = t.commit(zf, {2:'c2'})
wconn = t.wc.connect(at1)
defer(wconn.close)
fh = wconn.open(zf._p_oid)
defer(fh.close)
# create mmap with 1 block beyond file size
m1 = fh.mmap(2, 3)
defer(m1.unmap)
assert m1.blk_start == 2
assert m1.blk_stop == 5
assert len(m1.mem) == 3*zf.blksize
tm1 = tMapping(t, m1)
# nothing is pinned before the first access
assert pinned(fh) == {}
# verify initial data reads
tm1.assertBlk(2, 'c1', {2:at1})
tm1.assertBlk(3, 'd1', {2:at1})
tm1.assertBlk(4, '', {2:at1})
# commit with growing file size -> verify data read as the same, #3 pinned.
# (#4 is not yet pinned because it was not accessed)
at3 = t.commit(zf, {3:'d3', 4:'e3'})
assert pinned(fh) == {2:at1}
tm1.assertBlk(2, 'c1', {2:at1})
tm1.assertBlk(3, 'd1', {2:at1, 3:at1})
tm1.assertBlk(4, '', {2:at1, 3:at1})
# resync at1 -> at2: #2 must unpin to @head; #4 must stay as zero
wconn.resync(at2)
assert pinned(fh) == {3:at1}
tm1.assertBlk(2, 'c2', { 3:at1})
tm1.assertBlk(3, 'd1', { 3:at1})
tm1.assertBlk(4, '', { 3:at1, 4:at0}) # XXX at0->ø ?
# resync at2 -> at3: #3 must unpin to @head; #4 - start to read with data
wconn.resync(at3)
assert pinned(fh) == {}
tm1.assertBlk(2, 'c2', {})
tm1.assertBlk(3, 'd3', {})
tm1.assertBlk(4, 'e3', {})
# mmap after .size completely (start > size)
m2 = fh.mmap(5, 2); defer(m2.unmap); tm2 = tMapping(t, m2)
tm2.assertBlk(5, '', {})
tm2.assertBlk(6, '', {})
# open same fh twice, close once - fh2 continue to work ok
fh2 = wconn.open(zf._p_oid)
defer(fh2.close)
mfh2 = fh2.mmap(2, 3); defer(mfh2.unmap); tmfh2 = tMapping(t, mfh2)
tm1.assertBlk(2, 'c2', {}); tmfh2.assertBlk(2, 'c2', {})
tm1.assertBlk(3, 'd3', {}); tmfh2.assertBlk(3, 'd3', {})
tm1.assertBlk(4, 'e3', {}); tmfh2.assertBlk(4, 'e3', {})
fh2.close()
# after fh2.close() both the mapping made via fh and the one made via fh2
# must still read ok
tm1.assertBlk(2, 'c2', {}); tmfh2.assertBlk(2, 'c2', {})
tm1.assertBlk(3, 'd3', {}); tmfh2.assertBlk(3, 'd3', {})
tm1.assertBlk(4, 'e3', {}); tmfh2.assertBlk(4, 'e3', {})
# creating new mapping via fh also continues to work
m3 = fh.mmap(2, 1); defer(m3.unmap); tm3 = tMapping(t, m3)
tm3.assertBlk(2, 'c2', {})
# resync ↓ -> "forbidden" (reject is from server) -> wconn is down.
with raises(error, match=": going back in history is forbidden"): wconn.resync(at2)
with raises(error, match=".*: connection closed"): wconn.open(zf._p_oid)
# verify that on Conn/FileH down/closed -> Mappings switch to EFAULT on access.
#
# Two files (zf1, zf2) are opened via one connection, with two overlapping
# mappings created for each. Then fh1 is closed, zf1 is reopened, and finally
# the whole connection is closed; after every step all mappings are rechecked
# to either read ok or to fault.
@func
def test_wcfs_client_down_efault():
t = tDB(); zf1 = t.zfile; at0=t.at0
defer(t.close)
at1 = t.commit(zf1, {2:'c1', 3:'d1'})
# zf2: second bigfile with the same blksize as zf1
zf2 = t.root['zfile2'] = ZBigFile(zf1.blksize)
at2 = t.commit()
at3 = t.commit(zf2, {1:'β3', 2:'γ3'})
# the connection is opened @at3 - the last commit made above; all reads
# below are expected with nothing pinned
wconn = t.wc.connect(at3)
defer(wconn.close)
fh1 = wconn.open(zf1._p_oid); defer(fh1.close)
fh2 = wconn.open(zf2._p_oid); defer(fh2.close)
# mXY is mapping #Y of file #X; tmXY is its tMapping
m11 = fh1.mmap(1, 4); defer(m11.unmap); tm11 = tMapping(t, m11)
m12 = fh1.mmap(3, 3); defer(m12.unmap); tm12 = tMapping(t, m12)
m21 = fh2.mmap(0, 4); defer(m21.unmap); tm21 = tMapping(t, m21)
m22 = fh2.mmap(2, 3); defer(m22.unmap); tm22 = tMapping(t, m22)
# initially fh1 and fh2 mmaps read ok.
tm11.assertBlk(1, '', {})
tm11.assertBlk(2, 'c1', {})
tm11.assertBlk(3, 'd1', {}); tm12.assertBlk(3, 'd1', {})
tm11.assertBlk(4, '', {}); tm12.assertBlk(4, '', {})
pass; tm12.assertBlk(5, '', {})
tm21.assertBlk(0, '', {})
tm21.assertBlk(1, 'β3', {})
tm21.assertBlk(2, 'γ3', {}); tm22.assertBlk(2, 'γ3', {})
tm21.assertBlk(3, '', {}); tm22.assertBlk(3, '', {})
pass; tm22.assertBlk(4, '', {})
# close fh1 -> all fh1 mmaps must turn into efaulting memory; fh2 mmaps continue to work ok.
fh1.close()
tm11.assertBlkFaults(1)
tm11.assertBlkFaults(2)
tm11.assertBlkFaults(3); tm12.assertBlkFaults(3)
tm11.assertBlkFaults(4); tm12.assertBlkFaults(4)
pass; tm12.assertBlkFaults(5)
tm21.assertBlk(0, '', {})
tm21.assertBlk(1, 'β3', {})
tm21.assertBlk(2, 'γ3', {}); tm22.assertBlk(2, 'γ3', {})
tm21.assertBlk(3, '', {}); tm22.assertBlk(3, '', {})
pass; tm22.assertBlk(4, '', {})
# open f1 again - mapping created via old fh1 continue to efault; new mappings work ok.
fh1_ = wconn.open(zf1._p_oid); defer(fh1_.close)
m11_ = fh1_.mmap(1, 4); defer(m11_.unmap); tm11_ = tMapping(t, m11_)
tm11.assertBlkFaults(1); tm11_.assertBlk(1, '', {})
tm11.assertBlkFaults(2); tm11_.assertBlk(2, 'c1', {})
tm11.assertBlkFaults(3); tm11_.assertBlk(3, 'd1', {}); tm12.assertBlkFaults(3)
tm11.assertBlkFaults(4); tm11_.assertBlk(4, '', {}); tm12.assertBlkFaults(4)
pass; tm12.assertBlkFaults(5)
tm21.assertBlk(0, '', {})
tm21.assertBlk(1, 'β3', {})
tm21.assertBlk(2, 'γ3', {}); tm22.assertBlk(2, 'γ3', {})
tm21.assertBlk(3, '', {}); tm22.assertBlk(3, '', {})
pass; tm22.assertBlk(4, '', {})
# close wconn -> fh2 and fh1_ mmaps must turn into efaulting too.
wconn.close()
tm11.assertBlkFaults(1); tm11_.assertBlkFaults(1)
tm11.assertBlkFaults(2); tm11_.assertBlkFaults(2)
tm11.assertBlkFaults(3); tm11_.assertBlkFaults(3); tm12.assertBlkFaults(3)
tm11.assertBlkFaults(4); tm11_.assertBlkFaults(4); tm12.assertBlkFaults(4)
pass; tm12.assertBlkFaults(5)
tm21.assertBlkFaults(0)
tm21.assertBlkFaults(1)
tm21.assertBlkFaults(2); tm22.assertBlkFaults(2)
tm21.assertBlkFaults(3); tm22.assertBlkFaults(3)
pass; tm22.assertBlkFaults(4)
# XXX vvv -> separate test?
# verify that after wconn.close()
# wconn.open(), wconn.resync(), fh.mmap() -> error
with raises(error, match=".*: connection closed"): wconn.open(zf1._p_oid)
with raises(error, match=".*: connection closed"): wconn.resync(at3)
with raises(error, match=".*: file already closed"): fh2.mmap(2, 3) # NOTE we did not close fh2 yet
# ----//---- after fileh.close
with raises(error, match=".*: file already closed"): fh1.mmap(2, 3) # fh1 was explicitly closed ^^^
# TODO try to unit test at wcfs client level wcfs.Mapping with dirty RW page -
# that it stays in sync with DB after dirty discard.
# verify that read_mustfault works as expected: it must panic with
# "not faulted" when the read succeeds, and return normally when the memory
# is protected with PROT_NONE.
def test_read_mustfault():
    page = mm.map_zero_ro(mm.PAGE_SIZE)

    # freshly created mapping: read_mustfault must panic
    with panics("not faulted"):
        read_mustfault(page[:1])

    # PROT_NONE: read_mustfault must return normally
    mm.protect(page, mm.PROT_NONE)
    read_mustfault(page[:1])

    # back to PROT_READ: read_mustfault must panic again
    mm.protect(page, mm.PROT_READ)
    with panics("not faulted"):
        read_mustfault(page[:1])
// Copyright (C) 2018-2020 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// Package wcfs provides WCFS client integrated with user-space virtual memory manager.
// See wcfs.h for package overview.
// Wcfs client organization
//
// Wcfs client provides to its users isolated bigfile views backed by data on
// WCFS filesystem. In the absence of Isolation property, wcfs client would
// reduce to just directly using OS-level file wcfs/head/f for a bigfile f. On
// the other hand there is a simple, but inefficient, way to support isolation:
// for @at database view of bigfile f - directly use OS-level file wcfs/@at/f.
// The latter works, but is very inefficient because OS-cache for f data is not
// shared in between two connections with @at1 and @at2 views. The cache is
// also lost when connection view of the database is resynced on transaction
// boundary. To support isolation efficiently, wcfs client uses wcfs/head/f
// most of the time, but injects wcfs/@revX/f parts into mappings to maintain
// f@at view driven by pin messages that wcfs server sends to client in
// accordance to WCFS isolation protocol(*).
//
// Wcfs server sends pin messages synchronously triggered by access to mmaped
// memory. That means that a client thread, that is accessing wcfs/head/f mmap,
// is completely blocked while wcfs server sends pins and waits to receive acks
// from all clients. In other words on-client handling of pins has to be done
// in separate thread, because wcfs server can also send pins to client that
// triggered the access.
//
// Wcfs client implements pins handling in so-called "pinner" thread(+). The
// pinner thread receives pin requests from wcfs server via watchlink handle
// opened through wcfs/head/watch. For every pin request the pinner finds
// corresponding Mappings and injects wcfs/@revX/f parts via Mapping._remmapblk
// appropriately.
//
// The same watchlink handle is used to send client-originated requests to wcfs
// server. The requests are sent to tell wcfs that client wants to observe a
// particular bigfile as of particular revision, or to stop watching for it.
// Such requests originate from regular client threads - not pinner - via entry
// points like Conn.open, Conn.resync and FileH.close.
//
// Every FileH maintains fileh._pinned {} with currently pinned blk -> rev. This
// dict is updated by pinner driven by pin messages, and is used when either
// new fileh Mapping is created (FileH.mmap) or refreshed due to request from
// virtmem (Mapping.remmap_blk, see below).
//
// In wendelin.core a bigfile has semantic that it is infinite in size and
// reads as all zeros beyond region initialized with data. Memory-mapping of
// OS-level files can also go beyond file size, however accessing memory
// corresponding to file region after file.size triggers SIGBUS. To preserve
// wendelin.core semantic wcfs client mmaps-in zeros for Mapping regions after
// wcfs/head/f.size. For simplicity it is assumed that bigfiles only grow and
// never shrink. It is indeed currently so, but will have to be revisited
// if/when wendelin.core adds bigfile truncation. Wcfs client restats
// wcfs/head/f at every transaction boundary (Conn.resync) and remembers f.size
// in FileH._headfsize for use during one transaction(%).
//
//
// Integration with wendelin.core virtmem layer
//
// Wcfs client integrates with virtmem layer so that virtmem can handle
// dirtying of pages of the read-only base-layer that wcfs client provides via
// isolated Mapping. For wcfs-backed bigfiles every virtmem VMA is interlinked
// with Mapping:
//
// VMA -> BigFileH -> ZBigFile -----> Z
// ↑↓ O
// Mapping -> FileH -> wcfs server --> DB
//
// When a page is write-accessed, virtmem mmaps in a page of RAM in place of
// accessed virtual memory, copies base-layer content provided by Mapping into
// there, and marks that page as read-write.
//
// Upon receiving pin message, the pinner consults virtmem, whether
// corresponding page was already dirtied in virtmem's BigFileH (call to
// __fileh_page_isdirty), and if it was, the pinner does not remmap Mapping
// part to wcfs/@revX/f and just leaves dirty page in its place, remembering
// pin information in fileh._pinned.
//
// Once dirty pages are no longer needed (either after discard/abort or
// writeout/commit), virtmem asks wcfs client to remmap corresponding regions
// of Mapping in its place again via calls to Mapping.remmap_blk for previously
// dirtied blocks.
//
// The scheme outlined above does not need to split Mapping upon dirtying an
// inner page.
//
// See bigfile_ops interface (wendelin/bigfile/file.h) that explains base-layer
// and overlaying from virtmem point of view. For wcfs this interface is
// provided by small wcfs client wrapper in bigfile/file_zodb.cpp.
//
// --------
//
// (*) see wcfs.go documentation for WCFS isolation protocol overview and details.
// (+) currently, for simplicity, there is one pinner thread for each connection.
// In the future, for efficiency, it might be reworked to be one pinner thread
// that serves all connections simultaneously.
// (%) see _headWait comments on how this has to be reworked.
// Wcfs client locking organization
//
// Wcfs client needs to synchronize regular user threads vs each other and vs
// pinner. A major lock Conn.atMu protects changes to Conn's view of
// the database. Whenever atMu.W is taken - Conn.at is changing (Conn.resync),
// and conversely whenever atMu.R is taken - Conn.at is stable (roughly speaking
// Conn.resync is not running).
//
// Similarly to wcfs.go(*) several locks that protect internal data structures
// are minor to Conn.atMu - they need to be taken only under atMu.R (to
// synchronize e.g. multiple fileh open running simultaneously), but do not
// need to be taken at all if atMu.W is taken. In data structures such locks
// are noted as follows
//
// sync::Mutex xMu; // atMu.W | atMu.R + xMu
//
// After atMu, Conn.filehMu protects registry of opened file handles
// (Conn._filehTab), and FileH.mmapMu protects registry of created Mappings
// (FileH.mmaps) and FileH.pinned.
//
// Several locks are RWMutex instead of just Mutex not only to allow more
// concurrency, but, in the first place for correctness: pinner thread being
// core element in handling WCFS isolation protocol, is effectively invoked
// synchronously from other threads via messages coming through wcfs server.
// For example Conn.resync sends watch request to wcfs server and waits for the
// answer. Wcfs server, in turn, might send corresponding pin messages to the
// pinner and _wait_ for the answer before answering to resync:
//
// - - - - - -
// | .···|·····. ----> = request
// pinner <------.↓ <···· = response
// | | wcfs
// resync -------^↓
// | `····|·····
// - - - - - -
// client process
//
// This creates the necessity to use RWMutex for locks that pinner and other
// parts of the code could be using at the same time in synchronous scenarios
// similar to the above. These locks are:
//
// - Conn.atMu
// - Conn.filehMu
//
// Note that FileH.mmapMu is regular - not RW - mutex, since nothing in wcfs
// client calls into wcfs server via watchlink with mmapMu held.
//
// To synchronize with virtmem layer, wcfs client takes and releases big
// virtmem lock around places that touch virtmem (calls to virt_lock and
// virt_unlock). Also virtmem calls several wcfs client entrypoints with
// virtmem lock already taken. Thus, to avoid AB-BA style deadlocks, wcfs
// client needs to take virtmem lock as the first lock, whenever it needs to
// take both virtmem lock, and another lock - e.g. atMu(%).
//
// The ordering of locks is:
//
// virt_lock > Conn.atMu > Conn.filehMu > FileH.mmapMu
//
// The pinner takes the following locks:
//
// - virt_lock
// - wconn.atMu.R
// - wconn.filehMu.R
// - fileh.mmapMu (to read .mmaps + write .pinned)
//
//
// (*) see "Wcfs locking organization" in wcfs.go
// (%) see related comment in Conn.__pin1 for details.
#include "wcfs_misc.h"
#include "wcfs.h"
#include "wcfs_watchlink.h"
#include <wendelin/bigfile/virtmem.h>
#include <wendelin/bigfile/ram.h>
#include <golang/errors.h>
#include <golang/fmt.h>
#include <golang/io.h>
#include <golang/time.h>
#include <algorithm>
#include <string>
#include <vector>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
using std::min;
using std::max;
using std::vector;
namespace ioutil = io::ioutil;
// TRACE is compile-time switch for debug tracing in this file:
//
//   - with TRACE != 0 trace(format, ...) forwards to log::Debugf;
//   - with TRACE == 0 trace(...) expands to empty statement, so its arguments
//     are not even evaluated.
#define TRACE 0
#if TRACE
#  define trace(format, ...) log::Debugf(format, ##__VA_ARGS__)
#else
#  define trace(format, ...) do {} while (0)
#endif
// trace with op prefix taken from E.
//
// The message is built with fmt::errorf and passed through E before being
// printed, so etrace can be used only where E is in scope - e.g. after
// `xerr::Contextf E(...)` at the beginning of a function.
#define etrace(format, ...) trace("%s", v(E(fmt::errorf(format, ##__VA_ARGS__))))
// wcfs::
namespace wcfs {
// Forward declarations of file-local mmap helpers. Their definitions are not
// visible in this part of the file; judging by the names only:
//
//   - mmap_zero_into_ro presumably maps read-only zeros into [addr, addr+size);
//   - mmap_efault_into  presumably makes [addr, addr+size) fault on access;
//   - mmap_ro           presumably maps f[offset, offset+size) read-only and
//                       returns (address, error);
//   - mmap_into_ro      presumably maps f[offset, offset+size) read-only into
//                       [addr, addr+size).
//
// TODO confirm against the definitions.
static error mmap_zero_into_ro(void *addr, size_t size);
static error mmap_efault_into(void *addr, size_t size);
static tuple<uint8_t*, error> mmap_ro(os::File f, off_t offset, size_t size);
static error mmap_into_ro(void *addr, size_t size, os::File f, off_t offset);
// _headWait waits till wcfs/head/at becomes ≥ at.
//
// It polls: head/at is reread and parsed as hex number once per millisecond
// until the value is ≥ at. An error from reading or parsing head/at is
// returned immediately.
//
// _headWait is currently needed, because client stats wcfs/head/f to get f
// size assuming that f size only ↑. The assumption is not generally valid
// (e.g. f might be truncated = hole punched for block at tail), but holds true
// for now. However to get correct results wcfs/head/f has to be statt'ed
// _after_ wcfs view of the database becomes ≥ wconn.at.
//
// TODO extend isolation protocol to report f size as of @at database state at
// watch init/update(*). This way there won't be need for headWait as correct
// file size @at will be returned by wcfs itself, which will also work if
// wcfs/head/f size is changed arbitrarily.
//
// (*) equivalent might be to send something like "pin #<bsize>.. Z" (pin
//     blocks bsize till ∞ to zeros).
error WCFS::_headWait(zodb::Tid at) {
    WCFS& wc = *this;
    xerr::Contextf E("%s: headWait @%s", v(wc), v(at));
    etrace("");

    // XXX dumb implementation, because _headWait should go away.
    for (;;) {
        string    headAtStr;
        zodb::Tid headAt;
        error     err;

        tie(headAtStr, err) = ioutil::ReadFile(wc._path("head/at"));
        if (err != nil)
            return E(err);

        tie(headAt, err) = xstrconv::parseHex64(headAtStr);
        if (err != nil)
            return E(fmt::errorf("head/at: %w", err));

        if (headAt >= at)
            return nil;     // wcfs/head/at caught up with at

        time::sleep(1*time::millisecond);
    }
}
// connect creates new Conn viewing WCFS state as of @at.
//
// It waits for wcfs head to reach at, opens a watch link and spawns the
// pinner for the new connection. On failure (nil, error) is returned.
pair<Conn, error> WCFS::connect(zodb::Tid at) {
    WCFS *self = this;
    xerr::Contextf E("%s: connect @%s", v(self), v(at));
    etrace("");

    // TODO support !isolated mode

    // Conn.open stats head/f to get f.headfsize, so `wcfs/head/at ≥ at` has
    // to hold before the connection is created.
    error err = self->_headWait(at);
    if (err != nil)
        return make_pair(nil, E(err));

    WatchLink link;
    tie(link, err) = self->_openwatch();
    if (err != nil)
        return make_pair(nil, E(err));

    Conn conn = adoptref(new _Conn());
    conn->_wc    = self;
    conn->at     = at;
    conn->_wlink = link;

    // run pinner in its own workgroup under a context that Conn.close can cancel.
    context::Context ctxPin;
    tie(ctxPin, conn->_pinCancel) = context::with_cancel(context::background());
    conn->_pinWG = sync::NewWorkGroup(ctxPin);
    conn->_pinWG->go([conn](context::Context ctx) -> error {
        return conn->_pinner(ctx);
    });

    return make_pair(conn, nil);
}
// errConnClosed is the error Conn.close stores into Conn._downErr to mark the
// connection as closed.
static global<error> errConnClosed = errors::New("connection closed");
// close releases resources associated with wconn.
//
// opened fileh and mappings become invalid to use except close and unmap.
//
// close first marks the connection as closed, then force-closes every fileh
// that is still registered in wconn._filehTab, and only after that closes the
// watch link and stops the pinner. The first error encountered on the way is
// returned; closing an already-closed connection returns nil.
error _Conn::close() {
_Conn& wconn = *this;
// lock virtmem early. TODO more granular virtmem locking (see __pin1 for
// details and why virt_lock currently goes first)
virt_lock();
// virtUnlocked tracks whether virt_lock was already released explicitly
// below, so that the deferred cleanup does not unlock it a second time.
bool virtUnlocked = false;
defer([&]() {
if (!virtUnlocked)
virt_unlock();
});
wconn._atMu.RLock();
defer([&]() {
wconn._atMu.RUnlock();
});
xerr::Contextf E("%s: close", v(wconn));
etrace("");
error err, eret;
// reterr1 remembers only the first non-nil error in eret.
auto reterr1 = [&eret](error err) {
if (eret == nil && err != nil)
eret = err;
};
// mark wconn as closed, so that no new wconn.open might be spawned.
bool alreadyClosed = false;
wconn._filehMu.Lock();
alreadyClosed = (wconn._downErr == errConnClosed);
wconn._downErr = errConnClosed;
wconn._filehMu.Unlock();
if (alreadyClosed)
return nil;
// close all files - both that have no mappings and that still have opened
// mappings. We have to close files before shutting down pinner, because
// wcfs might send pin messages due to file access by other clients. So to
// avoid being killed we have to unwatch all files before stopping the
// pinner.
//
// NOTE after file is closed, its mappings could continue to survive, but
// we can no longer maintain consistent view. For this reason we change
// mappings to give EFAULT on access.
while (1) {
FileH f = nil;
// opening is assigned only when a fileh was picked up; it is read only in that case.
bool opening;
// pick up any fileh
wconn._filehMu.Lock();
if (!wconn._filehTab.empty()) {
f = wconn._filehTab.begin()->second;
opening = (f->_state < _FileHOpened);
}
wconn._filehMu.Unlock();
if (f == nil)
break; // all closed
// if fileh was "opening" - wait for the open to complete before calling close.
if (opening) {
f->_openReady.recv();
if (f->_openErr != nil)
continue; // failed open; f should be removed from wconn._filehTab by Conn.open itself
}
// force fileh close.
// - virt_lock
// - wconn.atMu.R
// - wconn.filehMu unlocked
err = f->_closeLocked(/*force=*/true);
if (err != nil)
reterr1(err);
// wait for f close to complete, as it might be that f.close was called
// simultaneously to us or just before. f is removed from
// wconn.filehTab only after close is complete.
f->_closedq.recv();
}
// close wlink and signal to pinner to stop.
// we have to release virt_lock, to avoid deadlocking with pinner.
virtUnlocked = true;
virt_unlock();
err = wconn._wlink->close();
if (err != nil)
reterr1(err);
wconn._pinCancel();
err = wconn._pinWG->wait();
if (!errors::Is(err, context::canceled)) // canceled - ok
reterr1(err);
return E(eret);
}
// _pinner receives pin messages from wcfs and adjusts wconn file mappings.
//
// It runs __pinner and returns its error. If the pinner stopped for a reason
// other than cancellation, the failure is logged, printed to stderr, and the
// connection is closed from a separate goroutine.
error _Conn::_pinner(context::Context ctx) {
    Conn self = newref(this); // newref for go
    error err = self->__pinner(ctx);

    // canceled = .close asks pinner to stop
    bool stoppedOk = (err == nil || errors::Is(err, context::canceled));
    if (stoppedOk)
        return err;

    // if pinner fails, wcfs will kill us.
    // log pinner error so that the error is not hidden.
    // print to stderr as well as by default log does not print to there.
    // TODO also catch panic/exc ?
    log::Fatalf("CRITICAL: %s", v(err));
    log::Fatalf("CRITICAL: wcfs server will likely kill us soon.");
    fprintf(stderr, "CRITICAL: %s\n", v(err));
    fprintf(stderr, "CRITICAL: wcfs server will likely kill us soon.\n");

    // mark the connection non-operational if pinner fails.
    //
    // XXX go because wconn.close might deadlock wrt Conn.resync on
    // wconn._filehMu, because Conn.resync sends "watch" updates under
    // wconn._filehMu (however Conn.open and FileH.close send "watch"
    // _without_ wconn._filehMu). If pinner fails - we already have serious
    // problems... TODO try to resolve the deadlock.
    go([self]() {
        self->close();
    });

    return err;
}
// __pinner is the pinner main loop: it receives pin requests from the watch
// link and handles them one by one via _pin1 until receive or handling fails.
//
// EOF on the link is reported as nil if the connection was closed by us, and
// as io::ErrUnexpectedEOF otherwise.
error _Conn::__pinner(context::Context ctx) {
    _Conn& self = *this;
    xerr::Contextf E("pinner"); // NOTE pinner error goes to Conn::close who has its own context
    etrace("");

    PinReq req;
    for (;;) {
        error err = self._wlink->recvReq(ctx, &req);
        if (err == nil) {
            // we received request to pin/unpin file block. handle it
            err = self._pin1(&req);
            if (err != nil)
                return E(err);
            continue;
        }

        // it is ok if we receive EOF due to us (client) closing the connection
        if (err == io::EOF_) {
            self._filehMu.RLock();
            err = (self._downErr == errConnClosed) ? nil : io::ErrUnexpectedEOF;
            self._filehMu.RUnlock();
        }
        return E(err);
    }
}
// pin1 handles one pin request received from wcfs.
//
// The request is processed by __pin1 and then answered with "ack", or with
// "nak: <error>" if processing failed. The processing error takes priority
// over the reply error in the returned value.
error _Conn::_pin1(PinReq *req) {
    _Conn& self = *this;
    xerr::Contextf E("pin f<%s> #%ld @%s", v(req->foid), req->blk, v(req->at));
    etrace("");

    error pinErr = self.__pin1(req);

    // reply either ack or nak on error
    string reply;
    if (pinErr == nil)
        reply = "ack";
    else
        reply = fmt::sprintf("nak: %s", v(pinErr));

    // NOTE ctx=bg to always send reply even if we are canceled
    error replyErr = self._wlink->replyReq(context::background(), req, reply);

    return E(pinErr != nil ? pinErr : replyErr);
}
// __pin1 applies one pin request to all mappings of the requested file.
//
// Every mapping of file req->foid that covers block req->blk is remmapped to
// view the block as of req->at (TidHead means "unpin to head"), unless virtmem
// already has the corresponding page dirtied. On success f._pinned is updated
// accordingly. An error is returned if the file is not watched by this
// connection or if remmapping fails.
error _Conn::__pin1(PinReq *req) {
    _Conn& wconn = *this;
    FileH f;
    bool  ok;

    // lock virtmem first.
    //
    // The reason we do it here instead of closely around call to
    // mmap->_remmapblk() is to avoid deadlocks: virtmem calls FileH.mmap,
    // Mapping.remmap_blk and Mapping.unmap under virt_lock locked. In those
    // functions the order of locks is
    //
    //      virt_lock, wconn.atMu.R, fileh.mmapMu
    //
    // So if we take virt_lock right around mmap._remmapblk(), the order of
    // locks in pinner would be
    //
    //      wconn.atMu.R, wconn.filehMu.R, fileh.mmapMu, virt_lock
    //
    // which means there is AB-BA deadlock possibility.
    //
    // TODO try to take virt_lock only around virtmem-associated VMAs and with
    // better granularity. NOTE it is possible to teach virtmem to call
    // FileH.mmap and Mapping.unmap without virtmem locked. However reworking
    // virtmem to call Mapping.remmap_blk without virt_lock is not so easy.
    virt_lock();
    defer([&]() {
        virt_unlock();
    });

    wconn._atMu.RLock();
    defer([&]() {
        wconn._atMu.RUnlock();
    });

    // lock wconn.filehMu.R to lookup fileh in wconn.filehTab.
    //
    // keep wconn.filehMu.R locked during whole __pin1 run to make sure that
    // e.g. simultaneous FileH.close does not remove f from wconn.filehTab.
    // TODO keeping filehMu.R during whole pin1 is not needed and locking can be made more granular.
    //
    // NOTE no deadlock wrt Conn.resync, Conn.open, FileH.close - they all send
    // "watch" requests to wcfs server outside of wconn.filehMu.
    wconn._filehMu.RLock();
    defer([&]() {
        wconn._filehMu.RUnlock();
    });

    tie(f, ok) = wconn._filehTab.get_(req->foid);
    if (!ok) {
        // why wcfs sent us this update?
        return fmt::errorf("unexpected pin: f<%s> not watched", v(req->foid));
    }

    // NOTE no need to check f._state as we need to go only through f.mmaps, and
    // wcfs server can send us pins at any state, including "opening" - to pin
    // our view to requested @at, and "closing" - due to other clients
    // accessing wcfs/head/f simultaneously.

    f->_mmapMu.lock();
    defer([&]() {
        f->_mmapMu.unlock();
    });

    for (auto mmap : f->_mmaps) {   // TODO use ↑blk_start for binary search
        if (!(mmap->blk_start <= req->blk && req->blk < mmap->blk_stop()))
            continue;   // blk ∉ mmap

        // req->blk is int64 -> print it with %ld as everywhere else in this file.
        trace("\tremmapblk %ld @%s", req->blk, (req->at == TidHead ? "head" : v(req->at)));

        // pin only if virtmem has not already dirtied the page corresponding to this block.
        // if virtmem dirtied the page - it will ask us to remmap it again after commit or abort.
        bool do_pin = true;
        error err;
        if (mmap->vma != nil) {
            mmap->_assertVMAOk();

            // see ^^^ about deadlock
            //virt_lock();

            BigFileH *virt_fileh = mmap->vma->fileh;
            TODO (mmap->fileh->blksize != virt_fileh->ramh->ram->pagesize);
            do_pin = !__fileh_page_isdirty(virt_fileh, req->blk);
        }

        if (do_pin)
            err = mmap->_remmapblk(req->blk, req->at);

        // see ^^^ about deadlock
        //if (mmap->vma != nil)
        //    virt_unlock();

        // on error don't need to continue with other mappings - all fileh and
        // all mappings become marked invalid on pinner failure.
        if (err != nil)
            return err;

        trace("\t-> remmaped");
    }

    // update f._pinned
    if (req->at == TidHead) {
        f->_pinned.erase(req->blk);     // unpin to @head
    }
    else {
        f->_pinned[req->blk] = req->at;
    }

    return nil;
}
// resync resyncs connection and its file mappings onto different database view.
//
// bigfile/_file_zodb.pyx arranges to call Conn.resync at transaction boundaries
// to keep Conn view in sync with updated zconn database view.
//
// at is the database state the connection should be switched to.
//
// resync works in two parts: under wconn.atMu.W it sets wconn.at and replaces
// zero regions of mappings, that became covered by grown head/f, with head/f
// data; then, under wconn.atMu.R, it sends watch updates for all opened files
// to wcfs. If resync fails after the initial checks, the connection is closed.
error _Conn::resync(zodb::Tid at) {
    _Conn& wconn = *this;
    error err;

    // E is created under atMu.R because Conn.String must be called with atMu locked.
    wconn._atMu.RLock();
    xerr::Contextf E("%s: resync -> @%s", v(wconn), v(at));
    etrace("");

    wconn._filehMu.RLock();
    err = wconn._downErr;
    wconn._filehMu.RUnlock();
    wconn._atMu.RUnlock();

    if (err != nil)
        return E(err);

    // wait for wcfs/head to be >= at.
    // we need this e.g. to be sure that head/f.size is at least as big that it will be @at state.
    err = wconn._wc->_headWait(at);
    if (err != nil)
        return E(err);

    // bring wconn + fileh + mmaps down on error
    bool retok = false;
    defer([&]() {
        if (!retok)
            wconn.close(); // ignore error
    });

    // lock wconn._atMu.W . This excludes everything else, and in
    // particular _pinner_, from running and mutating files and mappings.
    //
    // NOTE we'll relock atMu as R in the second part of resync, so we prelock
    // wconn._filehMu.R as well while under atMu.W, to be sure that set of opened
    // files and their states stay the same during whole resync.
    bool atMuWLocked = true;
    wconn._atMu.Lock();
    wconn._filehMu.RLock();
    defer([&]() {
        wconn._filehMu.RUnlock();
        if (atMuWLocked)
            wconn._atMu.Unlock();
        else
            wconn._atMu.RUnlock();
    });

    err = wconn._downErr;
    if (err != nil)
        return E(err);

    // set new wconn.at early, so that e.g. Conn.open running simultaneously
    // to second part of resync (see below) uses new at.
    wconn.at = at;

    // go through all files opened under wconn and pre-adjust their mappings
    // for viewing data as of new @at state.
    //
    // We are still holding atMu.W, so we are the only mutators of mappings,
    // because, in particular, pinner is not running.
    //
    // Don't send watch updates for opened files to wcfs yet - without running
    // pinner those updates will get stuck.
    for (auto fit : wconn._filehTab) {
        //zodb::Oid  foid = fit.first;
        FileH        f    = fit.second;

        // TODO if file has no mappings and was not used during whole prev
        // cycle - forget and stop watching it?

        // "opening" or "closing" fileh - their setup/teardown is currently
        // handled by Conn.open and FileH.close correspondingly.
        if (f->_state != _FileHOpened)
            continue;

        // update f._headfsize and remmap to head/f zero regions that are now covered by head/f
        struct stat st;
        err = f->_headf->stat(&st);
        if (err != nil)
            return E(err);

        if ((size_t)st.st_blksize != f->blksize)    // blksize must not change
            return E(fmt::errorf("wcfs bug: blksize changed: %zd -> %ld", f->blksize, st.st_blksize));
        auto headfsize = st.st_size;
        if (!(f->_headfsize <= headfsize))          // head/file size ↑=
            return E(fmt::errorf("wcfs bug: head/file size not ↑="));
        if (!(headfsize % f->blksize == 0))
            return E(fmt::errorf("wcfs bug: head/file size %% blksize != 0"));

        // replace zero regions in f mappings in accordance to adjusted f._headfsize.
        // NOTE it is ok to access f._mmaps without locking f._mmapMu because we hold wconn.atMu.W
        for (auto mmap : f->_mmaps) {
            //trace("  resync -> %s: unzero [%lu:%lu)", v(at), f->_headfsize/f->blksize, headfsize/f->blksize);

            // [mmap_fstart, mmap_fstop) is the file byte range covered by the mapping.
            int64_t mmap_fstart = mmap->blk_start * (int64_t)f->blksize;
            int64_t mmap_fstop  = mmap_fstart + (int64_t)(mmap->mem_stop - mmap->mem_start);

            // [unzero_fstart, unzero_fstop) is the part of [old headfsize, new headfsize)
            // that falls inside the mapping. Clip in file coordinates on both sides:
            // a mapping that starts beyond old head/f size must be unzeroed from
            // its own start, not from an address before mmap->mem_start.
            int64_t unzero_fstart = max((int64_t)f->_headfsize, mmap_fstart);
            int64_t unzero_fstop  = min((int64_t)headfsize,     mmap_fstop);

            if (unzero_fstop > unzero_fstart) {
                uint8_t *mem_unzero_start = mmap->mem_start + (unzero_fstart - mmap_fstart);
                err = mmap_into_ro(mem_unzero_start, unzero_fstop - unzero_fstart, f->_headf, unzero_fstart);
                if (err != nil)
                    return E(err);
            }
        }

        f->_headfsize = headfsize;
    }

    // atomically downgrade atMu.W to atMu.R before issuing watch updates to wcfs.
    // - we need atMu to be not Wlocked, because under atMu.W pinner cannot run simultaneously to us.
    // - we need to hold atMu.R to avoid race wrt e.g. other resync which changes at.
    // - we cannot just do regular `atMu.Unlock + atMu.RLock()` because then
    //   there is e.g. a race window in between Unlock and RLock where wconn.at can be changed.
    //   Also if we Unlock and Rlock, it will produce deadlock, because locking
    //   order will change to reverse: wconn._filehMu.R + wconn._atMu.R
    //
    // Now other calls, e.g. Conn.open, can be running simultaneously to us,
    // but since we already set wconn.at to new value it is ok. For example
    // Conn.open, for not-yet-opened file, will use new at to send "watch".
    //
    // NOTE we are still holding wconn._filehMu.R, so wconn._filehTab and fileh
    // states are the same as in previous pass above.
    wconn._atMu.UnlockToRLock();
    atMuWLocked = false;

    // send watch updates to wcfs.
    // the pinner is now running and will be able to serve pin requests triggered by our watch.
    //
    // update only fileh in "opened" state - for fileh in "opening" and
    // "closing" states, watch setup/teardown is currently in-progress and
    // performed by Conn.open and FileH.close correspondingly.
    for (auto fit : wconn._filehTab) {
        zodb::Oid  foid = fit.first;
        FileH      f    = fit.second;

        if (f->_state != _FileHOpened)
            continue;

        string ack;
        tie(ack, err) = wconn._wlink->sendReq(context::background(),
                            fmt::sprintf("watch %s @%s", v(foid), v(at)));
        if (err != nil)
            return E(err);
        if (ack != "ok")
            return E(fmt::errorf("%s", v(ack)));
    }

    retok = true;
    return nil;
}
// open opens FileH corresponding to ZBigFile foid.
//
// If fileh for foid is already present in wconn._filehTab, its open count is
// incremented and the same fileh is returned after its open completes; if the
// fileh is being closed, open waits for the close to complete and retries.
// Otherwise new fileh is registered in "opening" state and the actual open is
// performed by FileH._open with wconn._filehMu released.
//
// Returns (fileh, nil) on success, or (nil, error) if the connection is down
// or the open fails.
pair<FileH, error> _Conn::open(zodb::Oid foid) {
_Conn& wconn = *this;
error err;
wconn._atMu.RLock();
defer([&]() {
wconn._atMu.RUnlock();
});
xerr::Contextf E("%s: open f<%s>", v(wconn), v(foid));
etrace("");
retry:
wconn._filehMu.Lock();
if (wconn._downErr != nil) {
err = wconn._downErr;
wconn._filehMu.Unlock();
return make_pair(nil, E(err));
}
// TODO ensure f<foid>@ wconn.at exists - else we get pins to non-existing
// state from wcfs, pinner replies nak, wcfs sends SIGBUS.
// TODO -> better teach wcfs to reject "watch <foid> @at" for @at where f did not exist.
// (see test_wcfs_watch_before_create)
FileH f; bool ok;
tie(f, ok) = wconn._filehTab.get_(foid);
if (ok) {
bool closing;
if (f->_state <= _FileHOpened) {
f->_nopen++;
closing = false;
} else {
closing = true;
}
wconn._filehMu.Unlock();
// if the file was closing|closed, we should wait for the close to
// complete and retry the open.
if (closing) {
f->_closedq.recv();
goto retry;
}
// the file was opening|opened. wait for open to complete and return the result.
// we can be sure there won't be last close simultaneous to us as we did ._nopen++
f->_openReady.recv();
if (f->_openErr != nil) {
// don't care about f->_nopen-- since f is not returned anywhere
return make_pair(nil, E(f->_openErr));
}
return make_pair(f, nil);
}
// create "opening" FileH entry and perform open with wconn._filehMu released.
// NOTE wconn._atMu.R is still held because FileH._open relies on wconn.at being stable.
f = adoptref(new _FileH());
f->wconn = newref(&wconn);
f->foid = foid;
f->_openReady = makechan<structZ>();
f->_closedq = makechan<structZ>();
f->_openErr = nil;
f->_headf = nil;
f->blksize = 0;
f->_headfsize = 0;
f->_state = _FileHOpening;
f->_nopen = 1;
bool retok = false;
wconn._filehTab[foid] = f;
wconn._filehMu.Unlock();
// on return: finalize the "opening" entry - remove it from filehTab on
// failure or switch it to "opened" on success - and wake up everyone who
// waits on f._openReady.
defer([&]() {
wconn._filehMu.Lock();
if (wconn._filehTab.get(foid) != f) {
wconn._filehMu.Unlock();
panic("BUG: wconn.open: wconn.filehTab[foid] mutated while file open was in progress");
}
if (!retok) {
// don't care about f->_nopen-- since f is not returned anywhere
wconn._filehTab.erase(foid);
} else {
f->_state = _FileHOpened;
}
wconn._filehMu.Unlock();
f->_openReady.close();
});
// do the actual open.
// we hold only wconn.atMu.R, but neither wconn.filehMu, nor f.mmapMu .
f->_openErr = f->_open();
if (f->_openErr != nil)
return make_pair(nil, E(f->_openErr));
// NOTE no need to recheck that wconn was not closed while the open was in
// progress: we'll return "success" but Conn.close will close the fileh.
// However it is indistinguishable from the following scenario:
//
// T1 T2
//
// f = wconn.open()
// # completes ok
// wconn.close()
//
// # use f -> error
retok = true;
return make_pair(f, nil);
}
// _open performs actual open of FileH marked as "in-flight-open" in wconn.filehTab.
//
// It opens head/bigfile/<foid> on wcfs, initializes f.blksize and
// f._headfsize from its stat, and asks wcfs to start watching the file @wconn.at.
// If anything fails after head file was opened, the head file is closed again.
//
// Called with:
// - wconn.atMu     held
// - wconn.filehMu  not locked
// - f.mmapMu       not locked
error _FileH::_open() {
    _FileH& f = *this;
    Conn    wconn = f.wconn;
    error err;

    tie(f._headf, err)
        = wconn->_wc->_open(fmt::sprintf("head/bigfile/%s", v(foid)));
    if (err != nil)
        return err;

    bool retok = false;
    defer([&]() {
        if (!retok)
            f._headf->close();
    });

    struct stat st;
    err = f._headf->stat(&st);
    if (err != nil)
        return err;
    f.blksize    = st.st_blksize;
    f._headfsize = st.st_size;

    // guard against modulo by zero below if stat reported no block size.
    if (f.blksize == 0)
        return fmt::errorf("wcfs bug: %s: blksize = 0", v(f._headf->name()));

    // size and blksize are 64-bit wide -> print them with matching length
    // modifiers instead of %d.
    if (!(f._headfsize % f.blksize == 0))
        return fmt::errorf("wcfs bug: %s size (%lld) %% blksize (%zu) != 0",
                        v(f._headf->name()), (long long)f._headfsize, (size_t)f.blksize);

    // start watching f
    // NOTE we are _not_ holding wconn.filehMu nor f.mmapMu - only wconn.atMu to rely on wconn.at being stable.
    // NOTE wcfs will reply "ok" only after wcfs/head/at ≥ wconn.at
    string ack;
    tie(ack, err) = wconn->_wlink->sendReq(context::background(),
                        fmt::sprintf("watch %s @%s", v(foid), v(wconn->at)));
    if (err != nil)
        return err;
    if (ack != "ok")
        return fmt::errorf("watch: %s", v(ack));

    retok = true;
    return nil;
}
// close releases resources associated with FileH.
//
// Left fileh mappings become invalid to use except unmap.
//
// It takes virt_lock and wconn.atMu.R - the locks _closeLocked requires - and
// delegates to _closeLocked in non-forced mode.
error _FileH::close() {
    _FileH& self = *this;
    Conn    conn = self.wconn;

    // virtmem lock goes first. TODO more granular virtmem locking (see __pin1
    // for details and why virt_lock currently goes first)
    virt_lock();
    defer([&]() {
        virt_unlock();
    });

    conn->_atMu.RLock();
    defer([&]() {
        conn->_atMu.RUnlock();
    });

    const bool force = false;
    return self._closeLocked(force);
}
// _closeLocked serves FileH.close and Conn.close.
//
// It decrements fileh open count and performs real close when the count drops
// to zero, or unconditionally if force is true. Real close stops watching the
// file on wcfs, removes fileh from wconn._filehTab, closes head file and
// switches all fileh mappings to efault. The first error encountered is returned.
//
// Must be called with the following locks held by caller:
// - virt_lock
// - wconn.atMu
error _FileH::_closeLocked(bool force) {
_FileH& fileh = *this;
Conn wconn = fileh.wconn;
wconn->_filehMu.Lock();
defer([&]() {
wconn->_filehMu.Unlock();
});
// fileh.close can be called several times. just return nil for second close.
if (fileh._state >= _FileHClosing)
return nil;
// decref open count; do real close only when last open goes away.
if (fileh._nopen <= 0)
panic("BUG: fileh.close: fileh._nopen <= 0");
fileh._nopen--;
if (fileh._nopen > 0 && !force)
return nil;
// last open went away - real close.
xerr::Contextf E("%s: %s: close", v(wconn), v(fileh));
etrace("");
ASSERT(fileh._state == _FileHOpened); // there can be no open-in-progress, because
fileh._state = _FileHClosing; // .close() can be called only on "opened" fileh
// unlock wconn._filehMu to stop watching the file outside of this lock.
// we'll relock wconn._filehMu again before updating wconn.filehTab.
wconn->_filehMu.Unlock();
error err, eret;
// reterr1 remembers only the first non-nil error in eret.
auto reterr1 = [&eret](error err) {
if (eret == nil && err != nil)
eret = err;
};
// stop watching f
string ack;
tie(ack, err) = wconn->_wlink->sendReq(context::background(),
fmt::sprintf("watch %s -", v(foid)));
if (err != nil)
reterr1(err);
else if (ack != "ok")
reterr1(fmt::errorf("unwatch: %s", v(ack)));
// relock wconn._filehMu again and remove fileh from wconn._filehTab
// (the deferred Unlock at the top releases this second lock on return)
wconn->_filehMu.Lock();
if (wconn->_filehTab.get(fileh.foid)._ptr() != &fileh)
panic("BUG: fileh.close: wconn.filehTab[fileh.foid] != fileh");
wconn->_filehTab.erase(fileh.foid);
reterr1(fileh._headf->close());
// change all fileh.mmaps to cause EFAULT on any access after fileh.close
fileh._mmapMu.lock();
defer([&]() {
fileh._mmapMu.unlock();
});
for (auto mmap : fileh._mmaps) {
err = mmap->__remmapAsEfault();
if (err != nil)
reterr1(err);
}
// fileh close complete
fileh._state = _FileHClosed;
fileh._closedq.close();
return E(eret);
}
// mmap creates file mapping representing file[blk_start +blk_len) data as of wconn.at database state.
//
// If vma != nil, created mapping is associated with that vma of user-space virtual memory manager:
// virtmem calls FileH::mmap under virtmem lock when virtmem fileh is mmapped into vma.
//
// blk_start and blk_len are in units of f.blksize; negative values and
// int64 overflow of the resulting byte range cause panic.
//
// Returns (mapping, nil) on success. On error (nil, error) is returned and
// memory that was already mapped is unmapped again.
pair<Mapping, error> _FileH::mmap(int64_t blk_start, int64_t blk_len, VMA *vma) {
_FileH& f = *this;
// NOTE virtmem lock is held by virtmem caller
f.wconn->_atMu.RLock(); // e.g. f._headfsize
f.wconn->_filehMu.RLock(); // f._state TODO -> finer grained (currently too coarse)
f._mmapMu.lock(); // f._pinned, f._mmaps
defer([&]() {
f._mmapMu.unlock();
f.wconn->_filehMu.RUnlock();
f.wconn->_atMu.RUnlock();
});
xerr::Contextf E("%s: %s: mmap [#%ld +%ld)", v(f.wconn), v(f), blk_start, blk_len);
etrace("");
if (f._state >= _FileHClosing)
return make_pair(nil, E(os::ErrClosed));
error err;
if (blk_start < 0)
panic("blk_start < 0");
if (blk_len < 0)
panic("blk_len < 0");
int64_t blk_stop; // = blk_start + blk_len
if (__builtin_add_overflow(blk_start, blk_len, &blk_stop))
panic("blk_start + blk_len overflow int64");
int64_t stop;// = blk_stop *f.blksize;
if (__builtin_mul_overflow(blk_stop, f.blksize, &stop))
panic("(blk_start + blk_len)*f.blksize overflow int64");
// start cannot overflow: blk_start ≤ blk_stop and blk_stop*f.blksize was checked above.
int64_t start = blk_start*f.blksize;
// create memory with head/f mapping and applied pins
// mmap-in zeros after f.size (else access to memory after file.size will raise SIGBUS)
uint8_t *mem_start, *mem_stop;
tie(mem_start, err) = mmap_ro(f._headf, start, blk_len*f.blksize);
if (err != nil)
return make_pair(nil, E(err));
mem_stop = mem_start + blk_len*f.blksize;
// release the memory if we fail somewhere below.
bool retok = false;
defer([&]() {
if (!retok)
mm::unmap(mem_start, mem_stop - mem_start); // ignore error
});
// part of mmapped region is beyond file size - mmap that with zeros - else
// access to memory after file.size will raise SIGBUS. (assumes head/f size ↑=)
if (stop > f._headfsize) {
uint8_t *zmem_start = mem_start + (max(f._headfsize, start) - start);
err = mmap_zero_into_ro(zmem_start, mem_stop - zmem_start);
if (err != nil)
return make_pair(nil, E(err));
}
Mapping mmap = adoptref(new _Mapping());
mmap->fileh = newref(&f);
mmap->blk_start = blk_start;
mmap->mem_start = mem_start;
mmap->mem_stop = mem_stop;
mmap->vma = vma;
mmap->efaulted = false;
// apply pins that are already known for blocks covered by the new mapping.
for (auto _ : f._pinned) { // TODO keep f._pinned ↑blk and use binary search
int64_t blk = _.first;
zodb::Tid rev = _.second;
if (!(blk_start <= blk && blk < blk_stop))
continue; // blk ∉ this mapping
err = mmap->_remmapblk(blk, rev);
if (err != nil)
return make_pair(nil, E(err));
}
// link the mapping and vma to each other.
if (vma != nil) {
if (vma->mmap_overlay_server != nil)
panic("vma is already associated with overlay server");
if (!(vma->addr_start == 0 && vma->addr_stop == 0))
panic("vma already covers !nil virtual memory area");
mmap->incref(); // vma->mmap_overlay_server is keeping ref to mmap
vma->mmap_overlay_server = mmap._ptr();
vma->addr_start = (uintptr_t)mmap->mem_start;
vma->addr_stop = (uintptr_t)mmap->mem_stop;
mmap->_assertVMAOk(); // just in case
}
f._mmaps.push_back(mmap); // TODO keep f._mmaps ↑blk_start
retok = true;
return make_pair(mmap, nil);
}
// __remmapAsEfault remmaps Mapping memory to cause SIGSEGV on access.
//
// It is used on FileH shutdown to turn all fileh mappings into incorrect ones,
// because after fileh is down, it is not possible to continue to provide
// correct f@at data view.
//
// The mapping is marked as efaulted regardless of whether remmapping
// succeeded; the remmap error, if any, is returned.
//
// Must be called with the following locks held by caller:
// - virt_lock
// - fileh.mmapMu
error _Mapping::__remmapAsEfault() {
    _Mapping& mmap = *this;

    // errctx: no need for wconn and f: __remmapAsEfault is called only from
    // FileH._closeLocked who adds them.
    xerr::Contextf E("%s: remmap as efault", v(mmap));
    etrace("");

    error err = mmap_efault_into(mmap.mem_start, mmap.mem_stop - mmap.mem_start);
    mmap.efaulted = true;
    return E(err);
}
// __remmapBlkAsEfault is the single-block variant of __remmapAsEfault: only
// memory of block blk is switched to efault.
// blk must be in mapped range.
error _Mapping::__remmapBlkAsEfault(int64_t blk) {
    _Mapping& self  = *this;
    FileH     fileh = self.fileh;
    xerr::Contextf E("%s: remmapblk #%ld as efault", v(self), blk);
    etrace("");

    ASSERT(self.blk_start <= blk && blk < self.blk_stop());

    // locate memory of the block inside the mapping
    int64_t  blkoff = blk - self.blk_start;
    uint8_t *blkmem = self.mem_start + blkoff*fileh->blksize;

    return E(mmap_efault_into(blkmem, 1*fileh->blksize));
}
// unmap releases mapping memory from address space.
//
// After call to unmap the mapping must no longer be used.
// The association in between mapping and linked virtmem VMA is reset.
// Calling unmap on already unmapped mapping is a noop.
//
// Virtmem calls Mapping.unmap under virtmem lock when VMA is unmapped.
error _Mapping::unmap() {
    Mapping self  = newref(this); // newref for std::remove
    FileH   fileh = self->fileh;

    // NOTE virtmem lock is held by virtmem caller
    fileh->wconn->_atMu.RLock();
    fileh->_mmapMu.lock();
    defer([&]() {
        fileh->_mmapMu.unlock();
        fileh->wconn->_atMu.RUnlock();
    });

    xerr::Contextf E("%s: %s: %s: unmap", v(fileh->wconn), v(fileh), v(self));
    etrace("");

    // double unmap = ok
    if (self->mem_start == nil)
        return nil;

    // break the mapping <-> vma link, if there is one
    VMA *vma = self->vma;
    if (vma != nil) {
        self->_assertVMAOk();
        vma->mmap_overlay_server = nil;
        self->decref(); // vma->mmap_overlay_server was holding a ref to mmap
        vma->addr_start = 0;
        vma->addr_stop  = 0;
        self->vma = nil;
    }

    error err = mm::unmap(self->mem_start, self->mem_stop - self->mem_start);
    self->mem_start = nil;
    self->mem_stop  = nil;

    // forget the mapping in fileh
    auto& mmaps = fileh->_mmaps;
    mmaps.erase(std::remove(mmaps.begin(), mmaps.end(), self), mmaps.end());

    return E(err);
}
// _remmapblk remmaps mapping memory for file[blk] to be viewing database as of @at state.
//
// at=TidHead means unpin to head/ .
// NOTE this does not check whether virtmem already mapped blk as RW.
//
// If the block lies beyond the size of the file it is taken from, zeros are
// mapped in instead of file data.
//
// _remmapblk must not be called after Mapping is switched to efault.
//
// The following locks must be held by caller:
// - f.wconn.atMu
// - f._mmapMu
error _Mapping::_remmapblk(int64_t blk, zodb::Tid at) {
    _Mapping& self  = *this;
    FileH     fileh = self.fileh;
    xerr::Contextf E("_remmapblk #%ld @%s", blk, v(at));
    etrace("");

    ASSERT(self.blk_start <= blk && blk < self.blk_stop());
    // a mmapping is efaulted only for closed files, i.e. fileh is removed from wconn._filehTab
    // -> pinner should not see the fileh and so should not see this mapping.
    ASSERT(!self.efaulted);

    uint8_t *blkmem = self.mem_start + (blk - self.blk_start)*fileh->blksize;

    // src is the file to take block data from: head/f, or @at/f opened just for this call.
    error    err;
    os::File src;
    bool     ownSrc = false;
    if (at != TidHead) {
        // TODO share @rev fd until wconn is resynced?
        tie(src, err) = fileh->wconn->_wc->_open(
                fmt::sprintf("@%s/bigfile/%s", v(at), v(fileh->foid)));
        if (err != nil)
            return E(err);
        ownSrc = true;
    }
    else {
        src = fileh->_headf;
    }
    defer([&]() {
        if (ownSrc)
            src->close();
    });

    struct stat st;
    err = src->stat(&st);
    if (err != nil)
        return E(err);
    if ((size_t)st.st_blksize != fileh->blksize)
        return E(fmt::errorf("wcfs bug: blksize changed: %zd -> %ld", fileh->blksize, st.st_blksize));

    // block is beyond file size - mmap with zeros - else access to memory
    // after file.size will raise SIGBUS. (assumes head/f size ↑=)
    // block is inside file - mmap in file data.
    bool beyondEOF = ((blk+1)*fileh->blksize > (size_t)st.st_size);
    if (beyondEOF)
        err = mmap_zero_into_ro(blkmem, 1*fileh->blksize);
    else
        err = mmap_into_ro(blkmem, 1*fileh->blksize, src, blk*fileh->blksize);
    if (err != nil)
        return E(err);

    return nil;
}
// remmap_blk remmaps file[blk] in its place again.
//
// The block is remmapped as of the revision it is pinned to in f._pinned, or
// as of head if it is not pinned. blk outside of mapping range causes panic.
//
// Virtmem calls Mapping.remmap_blk under virtmem lock to remmap a block after
// RW dirty page was e.g. discarded or committed.
error _Mapping::remmap_blk(int64_t blk) {
    _Mapping& self  = *this;
    FileH     fileh = self.fileh;

    // NOTE virtmem lock is held by virtmem caller
    fileh->wconn->_atMu.RLock();
    fileh->_mmapMu.lock();
    defer([&]() {
        fileh->_mmapMu.unlock();
        fileh->wconn->_atMu.RUnlock();
    });

    xerr::Contextf E("%s: %s: %s: remmapblk #%ld", v(fileh->wconn), v(fileh), v(self), blk);
    etrace("");

    bool inRange = (self.blk_start <= blk && blk < self.blk_stop());
    if (!inRange)
        panic("remmap_blk: blk out of Mapping range");

    // it should not happen, but if, for a efaulted mapping, virtmem asks us to
    // remmap base-layer blk memory in its place again, we reinject efault into it.
    if (self.efaulted) {
        log::Warnf("%s: remmapblk called for already-efaulted mapping", v(self));
        return E(self.__remmapBlkAsEfault(blk));
    }

    // rev = pinned revision | @head
    zodb::Tid rev;
    bool      pinned;
    tie(rev, pinned) = fileh->_pinned.get_(blk);
    if (!pinned)
        rev = TidHead;

    error err = self._remmapblk(blk, rev);
    if (err != nil)
        return E(err);

    return nil;
}
// ---- WCFS raw file access ----
// _path returns path for object on wcfs.
// - str:    wcfs root + "/" + obj;
string WCFS::_path(const string &obj) {
    string path = this->mountpoint;
    path += "/";
    path += obj;
    return path;
}
// _open opens file at path, which is taken relative to wcfs root, with given flags.
// The result of os::open is returned as is.
tuple<os::File, error> WCFS::_open(const string &path, int flags) {
    return os::open(this->_path(path), flags);
}
// ---- misc ----
// mmap_zero_into serves mmap_zero_into_ro and mmap_efault_into.
//
// It maps /dev/zero into [addr +size) with protection prot.
static error mmap_zero_into(void *addr, size_t size, int prot) {
    xerr::Contextf E("mmap zero");
    etrace("");

    // mmap /dev/zero with MAP_NORESERVE and MAP_SHARED
    // this way the mapping will be able to be read, but no memory will be allocated to keep it.
    os::File devzero;
    error    err;
    tie(devzero, err) = os::open("/dev/zero");
    if (err != nil)
        return E(err);
    defer([&]() {
        devzero->close();
    });

    const int flags = MAP_SHARED | MAP_NORESERVE;
    err = mm::map_into(addr, size, prot, flags, devzero, 0);
    if (err != nil)
        return E(err);

    return nil;
}
// mmap_zero_into_ro mmaps read-only zeros into [addr +size) so that region is all zeros.
// created mapping, even after it is accessed, does not consume memory.
static error mmap_zero_into_ro(void *addr, size_t size) {
    const int prot = PROT_READ;
    return mmap_zero_into(addr, size, prot);
}
// mmap_efault_into changes [addr +size) region to generate SIGSEGV on read/write access.
// Any previous mapping residing in that virtual address range is released.
static error mmap_efault_into(void *addr, size_t size) {
    xerr::Contextf E("mmap efault");
    etrace("");

    // mmaping /dev/zero with PROT_NONE gives what we need.
    error err = mmap_zero_into(addr, size, PROT_NONE);
    return E(err);
}
// mmap_ro mmaps read-only fd[offset +size).
// The mapping is created with MAP_SHARED.
static tuple<uint8_t*, error> mmap_ro(os::File f, off_t offset, size_t size) {
    const int prot  = PROT_READ;
    const int flags = MAP_SHARED;
    return mm::map(prot, flags, f, offset, size);
}
// mmap_into_ro mmaps read-only fd[offset +size) into [addr +size).
// The mapping is created with MAP_SHARED.
static error mmap_into_ro(void *addr, size_t size, os::File f, off_t offset) {
    const int prot  = PROT_READ;
    const int flags = MAP_SHARED;
    return mm::map_into(addr, size, prot, flags, f, offset);
}
// _assertVMAOk() verifies that mmap and vma are related to each other and cover
// exactly the same virtual memory range.
//
// It panics if mmap and vma do not exactly relate to each other or cover
// different virtual memory range.
void _Mapping::_assertVMAOk() {
_Mapping* mmap = this;
VMA *vma = mmap->vma;
if (!(vma->mmap_overlay_server == static_cast<void*>(mmap)))
panic("BUG: mmap and vma do not link to each other");
if (!(vma->addr_start == uintptr_t(mmap->mem_start) &&
vma->addr_stop == uintptr_t(mmap->mem_stop)))
panic("BUG: mmap and vma cover different virtual memory ranges");
// verified ok
}
// String returns "wcfs <mountpoint>".
string WCFS::String() const {
    return fmt::sprintf("wcfs %s", v(this->mountpoint));
}
// String returns "<wcfs>: conn<watch-link fd> @<at>".
//
// NOTE String must be called with Conn.atMu locked.
string _Conn::String() const {
    // XXX don't include wcfs as prefix here?
    // (e.g. to use Conn.String in tracing without wcfs prefix)
    // (if yes -> go and correct all xerr::Contextf calls)
    return fmt::sprintf("%s: conn%d @%s",
                v(this->_wc), this->_wlink->fd(), v(this->at));
}
// String returns "f<foid>".
string _FileH::String() const {
    return fmt::sprintf("f<%s>", v(this->foid));
}
// String returns "m[#<blk_start> +<number of blocks>) v[<mem_start> +<size in bytes, hex>)".
string _Mapping::String() const {
    const _Mapping& m = *this;

    auto nblk  = m.blk_stop() - m.blk_start;    // mapping length in blocks
    auto nbyte = m.mem_stop - m.mem_start;      // mapping length in bytes

    return fmt::sprintf("m[#%ld +%ld) v[%p +%lx)",
                m.blk_start, nblk,
                m.mem_start, nbyte);
}
// _Conn constructor and destructor do nothing by themselves.
_Conn::_Conn()  {}
_Conn::~_Conn() {}

// decref drops one reference and deletes the object once __decref() reports true.
void _Conn::decref() {
    if (!__decref())
        return;
    delete this;
}
// _FileH constructor and destructor do nothing by themselves.
_FileH::_FileH()  {}
_FileH::~_FileH() {}

// decref drops one reference and deletes the object once __decref() reports true.
void _FileH::decref() {
    if (!__decref())
        return;
    delete this;
}
// _Mapping constructor and destructor do nothing by themselves.
_Mapping::_Mapping()  {}
_Mapping::~_Mapping() {}

// decref drops one reference and deletes the object once __decref() reports true.
void _Mapping::decref() {
    if (!__decref())
        return;
    delete this;
}
// _tfileh_pinned returns copy of fileh._pinned ({} blk -> rev). For testing.
dict<int64_t, zodb::Tid> _tfileh_pinned(FileH fileh) {
    dict<int64_t, zodb::Tid> pinned = fileh->_pinned;
    return pinned;
}
} // wcfs::
// Copyright (C) 2018-2020 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// Package wcfs provides WCFS client integrated with user-space virtual memory manager.
//
// This client package takes care about WCFS isolation protocol details and
// provides to clients simple interface to isolated view of bigfile data on
// WCFS similar to regular files: given a particular revision of database @at,
// it provides synthetic read-only bigfile memory mappings with data
// corresponding to @at state, but using /head/bigfile/* most of the time to
// build and maintain the mappings.
//
// For its data a mapping to bigfile X mostly reuses kernel cache for
// /head/bigfile/X with amount of data not associated with kernel cache for
// /head/bigfile/X being proportional to δ(bigfile/X, at..head). In the usual
// case where many client workers simultaneously serve requests, their database
// views are a bit outdated, but close to head, which means that in practice
// the kernel cache for /head/bigfile/* is being used almost 100% of the time.
//
// A mapping for bigfile X@at is built from OS-level memory mappings of
// on-WCFS files as follows:
//
// ___ /@revA/bigfile/X
// __ /@revB/bigfile/X
// _ /@revC/bigfile/X
// + ...
// ─── ───── ────────────────────────── ───── /head/bigfile/X
//
// where @revR mmaps are being dynamically added/removed by this client package
// to maintain X@at data view according to WCFS isolation protocol(*).
//
//
// Integration with wendelin.core virtmem layer
//
// This client package can be used standalone, but additionally provides
// integration with wendelin.core userspace virtual memory manager: when a
// Mapping is created, it can be associated as serving base layer for a
// particular virtmem VMA via FileH.mmap(vma=...). In that case, since virtmem
// itself adds another layer of dirty pages over read-only base provided by
// Mapping(+)
//
// ┌──┐ ┌──┐
// │RW│ │RW│ ← virtmem VMA dirty pages
// └──┘ └──┘
// +
// VMA base = X@at view provided by Mapping:
//
// ___ /@revA/bigfile/X
// __ /@revB/bigfile/X
// _ /@revC/bigfile/X
// + ...
// ─── ───── ────────────────────────── ───── /head/bigfile/X
//
// the Mapping will interact with virtmem layer to coordinate
// updates to mapping virtual memory.
//
//
// API overview
//
// - `WCFS` represents filesystem-level connection to wcfs server.
// - `Conn` represents logical connection that provides view of data on wcfs
// filesystem as of particular database state.
// - `FileH` represents isolated file view under Conn.
// - `Mapping` represents one memory mapping of FileH.
//
// A path from WCFS to Mapping is as follows:
//
// WCFS.connect(at) -> Conn
// Conn.open(foid) -> FileH
// FileH.mmap([blk_start +blk_len)) -> Mapping
//
// A connection can be resynced to another database view via Conn.resync(at').
//
// Documentation for classes provides more thorough overview and API details.
//
// --------
//
// (*) see wcfs.go documentation for WCFS isolation protocol overview and details.
// (+) see bigfile_ops interface (wendelin/bigfile/file.h) that gives virtmem
// point of view on layering.
#ifndef _NXD_WCFS_H_
#define _NXD_WCFS_H_
#include <golang/libgolang.h>
#include <golang/cxx.h>
#include <golang/sync.h>
#include <tuple>
#include <utility>
#include "wcfs_misc.h"
#include <wendelin/bug.h>
// from wendelin/bigfile/virtmem.h
extern "C" {
struct VMA;
}
// wcfs::
namespace wcfs {
using namespace golang;
using cxx::dict;
using cxx::set;
using std::tuple;
using std::pair;
typedef refptr<struct _Conn> Conn;
typedef refptr<struct _Mapping> Mapping;
typedef refptr<struct _FileH> FileH;
typedef refptr<struct _WatchLink> WatchLink;
struct PinReq;
// WCFS represents filesystem-level connection to wcfs server.
//
// Use wcfs.join in Python API to create it.
//
// The primary way to access wcfs is to open logical connection viewing on-wcfs
// data as of particular database state, and use that logical connection to
// create base-layer mappings. See .connect and Conn for details.
//
// WCFS logically mirrors ZODB.DB .
// It is safe to use WCFS from multiple threads simultaneously.
struct WCFS {
// mountpoint is what String() reports as "wcfs <mountpoint>".
// presumably the filesystem path where wcfs is mounted — confirm against wcfs.join.
string mountpoint;
// connect opens logical connection (Conn) viewing on-wcfs data as of database state @at.
pair<Conn, error> connect(zodb::Tid at);
// _openwatch opens new watch link on wcfs (head/watch).
pair<WatchLink, error> _openwatch();
// String returns human-readable representation of this WCFS.
string String() const;
// NOTE(review): judging by the name, _headWait waits for wcfs head to reach at;
// its body is not visible here — confirm.
error _headWait(zodb::Tid at);
// at OS-level, on-WCFS raw files can be accessed via ._path and ._open.
string _path(const string &obj);
tuple<os::File, error> _open(const string &path, int flags=O_RDONLY);
};
// Conn represents logical connection that provides view of data on wcfs
// filesystem as of particular database state.
//
// It uses /head/bigfile/* and notifications received from /head/watch to
// maintain isolated database view while at the same time sharing most of data
// cache in OS pagecache of /head/bigfile/*.
//
// Use WCFS.connect(at) to create Conn.
// Use .open to create new FileH.
// Use .resync to resync Conn onto different database view.
//
// Conn logically mirrors ZODB.Connection .
// It is safe to use Conn from multiple threads simultaneously.
typedef refptr<struct _Conn> Conn;
struct _Conn : object {
// wcfs connection this Conn was created under (shown as prefix by String).
WCFS *_wc;
WatchLink _wlink; // watch/receive pins for mappings created under this conn
// atMu protects .at.
// While it is rlocked, .at is guaranteed to stay unchanged and Conn
// viewing the database at particular state. .resync write-locks this and
// knows no one is using the connection for reading simultaneously.
sync::RWMutex _atMu;
zodb::Tid at;
sync::RWMutex _filehMu; // _atMu.W | _atMu.R + _filehMu
error _downErr; // !nil if connection is closed or no longer operational
dict<zodb::Oid, FileH> _filehTab; // {} foid -> fileh
sync::WorkGroup _pinWG; // pin/unpin messages from wcfs are served by _pinner
func<void()> _pinCancel; // spawned under _pinWG.
// don't new - create via WCFS.connect
private:
_Conn();
~_Conn();
friend pair<Conn, error> WCFS::connect(zodb::Tid at);
public:
// decref drops one reference; the object is deleted when the last one is gone.
void decref();
public:
// open returns FileH for file with ZBigFile root object ID foid.
pair<FileH, error> open(zodb::Oid foid);
error close();
// resync resyncs the connection onto different database view @at.
error resync(zodb::Tid at);
// String returns human-readable representation; must be called with atMu locked.
string String() const;
private:
// pinner and its helpers: serve pin/unpin messages from wcfs (see _pinWG above).
error _pinner(context::Context ctx);
error __pinner(context::Context ctx);
error _pin1(PinReq *req);
error __pin1(PinReq *req);
};
// FileH represents isolated file view under Conn.
//
// The file view is maintained to be as of @Conn.at database state even in the
// presence of simultaneous database changes. The file view uses
// /head/<file>/data primarily and /@revX/<file>/data pin overrides.
//
// Use .mmap to map file view into memory.
//
// It is safe to use FileH from multiple threads simultaneously.
// _FileHState enumerates lifecycle states of a FileH.
// The numeric values grow in lifecycle order: opening < opened < closing < closed.
enum _FileHState {
// NOTE order of states is semantically important
_FileHOpening = 0, // FileH open is in progress
_FileHOpened = 1, // FileH is opened and can be used
_FileHClosing = 2, // FileH close is in progress
_FileHClosed = 3, // FileH is closed
};
typedef refptr<struct _FileH> FileH;
struct _FileH : object {
// connection this file handle was opened under.
Conn wconn;
zodb::Oid foid; // ZBigFile root object ID (does not change after fileh open)
// protected by wconn._filehMu
_FileHState _state; // opening/opened/closing/closed
int _nopen; // number of times Conn.open returned this fileh
chan<structZ> _openReady; // in-flight open completed
error _openErr; // error result from open
chan<structZ> _closedq; // in-flight close completed
os::File _headf; // file object of head/file
size_t blksize; // block size of this file (does not change after fileh open)
// head/file size is known to be at least headfsize (size ↑=)
// protected by .wconn._atMu
off_t _headfsize;
sync::Mutex _mmapMu; // atMu.W | atMu.R + _mmapMu
dict<int64_t, zodb::Tid> _pinned; // {} blk -> rev that wcfs already sent us for this file
vector<Mapping> _mmaps; // []Mapping ↑blk_start mappings of this file
// don't new - create via Conn.open
private:
_FileH();
~_FileH();
friend pair<FileH, error> _Conn::open(zodb::Oid foid);
public:
// decref drops one reference; the object is deleted when the last one is gone.
void decref();
public:
error close();
// mmap creates Mapping for file blocks [blk_start +blk_len).
// If vma != nil, the mapping is created under that virtmem VMA;
// with vma = nil it is created standalone from virtmem.
pair<Mapping, error> mmap(int64_t blk_start, int64_t blk_len, VMA *vma=nil);
// String returns "f<foid>".
string String() const;
// NOTE(review): _open and _closeLocked bodies are not visible here; judging by
// the names they are internals of Conn.open and .close — confirm.
error _open();
error _closeLocked(bool force);
};
// Mapping represents one memory mapping of FileH.
//
// The mapped memory is [.mem_start, .mem_stop)
// Use .unmap to release virtual memory resources used by mapping.
//
// Except unmap, it is safe to use Mapping from multiple threads simultaneously.
typedef refptr<struct _Mapping> Mapping;
struct _Mapping : object {
// file handle this mapping was created from.
FileH fileh;
int64_t blk_start; // offset of this mapping in file
// protected by fileh._mmapMu
uint8_t *mem_start; // mmapped memory [mem_start, mem_stop)
uint8_t *mem_stop;
VMA *vma; // mmapped under this virtmem VMA | nil if created standalone from virtmem
bool efaulted; // y after mapping was switched to be invalid (gives SIGSEGV on access)
// blk_stop returns block number just past the end of the mapping:
// blk_start + number of fileh.blksize-sized blocks in [mem_start, mem_stop).
// It asserts that the mapped memory size is a whole number of blocks.
int64_t blk_stop() const {
ASSERT((mem_stop - mem_start) % fileh->blksize == 0);
return blk_start + (mem_stop - mem_start) / fileh->blksize;
}
error remmap_blk(int64_t blk); // for virtmem-only
// unmap releases virtual memory resources used by the mapping.
// Contrary to other methods it is not safe to call from multiple threads simultaneously.
error unmap();
// _assertVMAOk panics if .vma and the mapping are not linked to each other
// or cover different virtual memory ranges.
void _assertVMAOk();
// NOTE(review): bodies of the remmap helpers below are not visible here — see wcfs.cpp.
error _remmapblk(int64_t blk, zodb::Tid at);
error __remmapAsEfault();
error __remmapBlkAsEfault(int64_t blk);
// don't new - create via FileH.mmap
private:
_Mapping();
~_Mapping();
friend pair<Mapping, error> _FileH::mmap(int64_t blk_start, int64_t blk_len, VMA *vma);
public:
// decref drops one reference; the object is deleted when the last one is gone.
void decref();
// String returns "m[#<blk_start> +<nblk>) v[<mem_start> +<size>)".
string String() const;
};
// for testing
dict<int64_t, zodb::Tid> _tfileh_pinned(FileH fileh);
} // wcfs::
#endif
// Copyright (C) 2019-2020 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
#include "wcfs_misc.h"
#include <golang/libgolang.h>
#include <golang/errors.h>
#include <golang/fmt.h>
#include <golang/io.h>
using namespace golang;
#include <inttypes.h>
#include <stdarg.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <memory>
// golang::
namespace golang {
// os::
namespace os {
// ErrClosed is the sentinel error with text "file already closed".
// presumably it mirrors os.ErrClosed from Go — confirm.
global<error> ErrClosed = errors::New("file already closed");
// TODO -> os.PathError + err=syscall.Errno
static error _pathError(const char *op, const string &path, int syserr);
int _File::fd() const { return _fd; }
string _File::name() const { return _path; }
// _File constructor and destructor do nothing by themselves;
// in particular the destructor does not close the file descriptor.
_File::_File()  {}
_File::~_File() {}

// decref drops one reference and deletes the object once __decref() reports true.
void _File::decref() {
    if (!__decref())
        return;
    delete this;
}
// open opens file @path via open(2) with given flags and mode.
//
// On failure it returns nil file and error "open <path>: <system error>".
tuple<File, error> open(const string &path, int flags, mode_t mode) {
    int fd = ::open(path.c_str(), flags, mode);
    if (fd == -1) {
        int syserr = errno;
        return make_tuple(nil, _pathError("open", path, syserr));
    }

    File f = adoptref(new _File);
    f->_fd   = fd;
    f->_path = path;
    return make_tuple(f, nil);
}
// close closes the file.
//
// Closing an already closed file returns "close <path>: file already closed"
// (wrapping ErrClosed) instead of calling close(2) on an invalid descriptor.
//
// The descriptor is forgotten even if close(2) itself fails: on Linux the
// descriptor is released regardless of the error, and retrying close could
// close an unrelated descriptor that was meanwhile reused by another thread.
error _File::close() {
    _File& f = *this;

    if (f._fd == -1) {
        error eclosed = ErrClosed;
        return fmt::errorf("close %s: %w", v(f._path), eclosed);
    }

    int fd = f._fd;
    f._fd = -1;     // never reuse fd, even if ::close fails
    if (::close(fd) != 0)
        return f._errno("close");   // errno is still the one set by ::close
    return nil;
}
// read reads up to count bytes into buf via read(2).
//
// It returns number of bytes read. When read(2) returns 0 the result is
// (0, io::EOF_); when it fails the result is (0, "read <path>: <system error>").
tuple<int, error> _File::read(void *buf, size_t count) {
    _File& f = *this;

    int n = ::read(f._fd, buf, count);
    if (n > 0)
        return make_tuple(n, nil);
    if (n == 0)
        return make_tuple(n, io::EOF_);
    return make_tuple(0, f._errno("read"));
}
// write writes all count bytes from buf, calling write(2) as many times as needed.
//
// It returns the number of bytes written; on failure that number is how much
// was written before the error.
tuple <int, error> _File::write(const void *buf, size_t count) {
    _File& f = *this;

    // NOTE contrary to write(2) we have to write all data as io.Writer requires.
    const char *p = (const char *)buf;
    int total = 0;
    for (size_t left = count; left != 0; ) {
        int n = ::write(f._fd, p, left);
        if (n < 0)
            return make_tuple(total, f._errno("write"));

        total += n;
        p     += n;
        left  -= n;
    }

    return make_tuple(total, nil);
}
// stat fills *st via fstat(2) on the file descriptor.
error _File::stat(struct stat *st) {
    _File& f = *this;

    if (fstat(f._fd, st) != 0)
        return f._errno("stat");
    return nil;
}
// _errno returns error for operation op on this file, taking the cause from
// the current value of errno.
error _File::_errno(const char *op) {
    const int syserr = errno;
    return _pathError(op, this->_path, syserr);
}
// _pathError returns os.PathError-like error "<op> <path>: <description of syserr>".
// syserr is errno-style system error code.
static error _pathError(const char *op, const string &path, int syserr) {
    char buf[128];
    char *msg = strerror_r(syserr, buf, sizeof(buf));   // may or may not point into buf

    return fmt::errorf("%s %s: %s", op, v(path), msg); // TODO estr -> syscall.Errno
}
} // os::
// mm::
namespace mm {
// map memory-maps f.fd[offset +size) at an address chosen by the kernel.
//
// prot is PROT_* from mmap(2).
// flags is MAP_* from mmap(2); passing MAP_FIXED panics - use map_into for that.
tuple<uint8_t*, error> map(int prot, int flags, os::File f, off_t offset, size_t size) {
    if (flags & MAP_FIXED)
        panic("MAP_FIXED not allowed for map - use map_into");

    void *mem = ::mmap(nil, size, prot, flags, f->fd(), offset);
    if (mem == MAP_FAILED)
        return make_tuple(nil, os::_pathError("mmap", f->name(), errno));

    return make_tuple((uint8_t*)mem, nil);
}
// map_into memory-maps f.fd[offset +size) exactly at [addr +size).
//
// prot is PROT_* from mmap(2).
// flags is MAP_* from mmap(2); MAP_FIXED is added automatically.
// It panics if mmap succeeds but reports an address different from addr.
error map_into(void *addr, size_t size, int prot, int flags, os::File f, off_t offset) {
    void *got = ::mmap(addr, size, prot, MAP_FIXED | flags, f->fd(), offset);
    if (got == MAP_FAILED)
        return os::_pathError("mmap", f->name(), errno);

    if (got != addr)
        panic("mmap(addr, MAP_FIXED): returned !addr");

    return nil;
}
// unmap releases [addr +size) memory previously mapped with map & co.
// On failure the error reads "munmap <memory>: <system error>".
error unmap(void *addr, size_t size) {
    if (::munmap(addr, size) == 0)
        return nil;
    return os::_pathError("munmap", "<memory>", errno);
}
} // mm::
// io::ioutil::
namespace io {
namespace ioutil {
// ReadFile reads whole content of file @path.
//
// On any error - from open, read or close - it returns empty data together
// with that error; reaching end of file is not an error.
tuple<string, error> ReadFile(const string& path) {
    // errctx is ok as returned by all calls.
    os::File f;
    error err;
    tie(f, err) = os::open(path);
    if (err != nil)
        return make_tuple("", err);

    string data;
    vector<char> chunk(4096);
    for (;;) {
        int n;
        tie(n, err) = f->read(chunk.data(), chunk.size());
        data.append(chunk.data(), n);
        if (err == nil)
            continue;

        // EOF is the normal way to finish
        if (err == io::EOF_)
            err = nil;
        break;
    }

    // read error, if any, takes priority over close error
    error errClose = f->close();
    if (err == nil)
        err = errClose;

    if (err != nil)
        data.clear();
    return make_tuple(data, err);
}
}} // io::ioutil::
// xstrconv:: (strconv-like)
namespace xstrconv {
// parseHex64 decodes 16-character-wide hex-encoded string into uint64.
//
// The string must consist of exactly 16 hexadecimal digits (either case).
// Anything else - wrong length, sign, whitespace, "0x" prefix, or a non-hex
// character anywhere - is rejected with "hex64 <s> invalid" error.
//
// NOTE decoding is done by hand instead of sscanf("%x"): sscanf skips leading
// whitespace, accepts sign and "0x" prefix, and reports success after parsing
// only a prefix of the input, which would let malformed strings through.
tuple<uint64_t, error> parseHex64(const string& s) {
    if (s.size() != 16)
        return make_tuple(0, fmt::errorf("hex64 %s invalid", v(s)));

    uint64_t x = 0;
    for (char c : s) {
        unsigned digit;
        if      ('0' <= c && c <= '9')  digit = c - '0';
        else if ('a' <= c && c <= 'f')  digit = c - 'a' + 10;
        else if ('A' <= c && c <= 'F')  digit = c - 'A' + 10;
        else
            return make_tuple(0, fmt::errorf("hex64 %s invalid", v(s)));

        x = (x << 4) | digit;
    }

    return make_tuple(x, nil);
}
// parseInt decodes string s as signed decimal integer.
//
// s is accepted only if it is exactly the canonical decimal representation
// of the parsed value; otherwise "int <s> invalid" error is returned.
tuple<int64_t, error> parseInt(const string& s) {
    int64_t x;
    int nparsed = sscanf(s.c_str(), "%" SCNi64, &x);

    // x is looked at only if sscanf did parse it
    bool ok = (nparsed == 1) && (std::to_string(x) == s);
    if (ok)
        return make_tuple(x, nil);

    return make_tuple(0, fmt::errorf("int %s invalid", v(s)));
}
// parseUint decodes string s as unsigned decimal integer.
//
// s is accepted only if it is exactly the canonical decimal representation
// of the parsed value; otherwise "uint <s> invalid" error is returned.
tuple<uint64_t, error> parseUint(const string& s) {
    uint64_t x;
    int nparsed = sscanf(s.c_str(), "%" SCNu64, &x);

    // x is looked at only if sscanf did parse it
    bool ok = (nparsed == 1) && (std::to_string(x) == s);
    if (ok)
        return make_tuple(x, nil);

    return make_tuple(0, fmt::errorf("uint %s invalid", v(s)));
}
} // xstrconv::
} // golang::
// xerr::
namespace xerr {
// XXX don't require fmt::vsprintf
#if 0
Contextf::Contextf(const char *format, ...) {
Contextf& c = *this;
va_list argp;
va_start(argp, format);
c.errctx = fmt::sprintfv(format, argp);
va_end(argp);
}
#endif
// operator() prefixes err with the error context: "<errctx>: <err>".
// The original error is wrapped via %w. nil stays nil.
error Contextf::operator() (error err) const {
    if (err != nil)
        return fmt::errorf("%s: %w", v(this->errctx), err);
    return nil;
}
} // xerr::
#include <golang/time.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/syscall.h>
// golang::log::
namespace golang {
namespace log {
// __Logf emits one log line to stderr:
//
//      <level><mmdd HH:MM:SS>.<usec> <tid> <file>:<line>] <message>
//
// where message is format applied printf-style to the remaining arguments.
// stderr is locked for the duration of the emission so that the prefix and
// the message of one entry are not interleaved with other stderr output.
void __Logf(const char *file, int line, char level, const char *format, ...) {
    // timestamp: local time + microseconds
    double now    = time::now();
    time_t now_s  = time_t(now);
    int    now_us = int((now-now_s)*1E6);

    struct tm tm_loc;
    localtime_r(&now_s, &tm_loc);
    char tstamp[32];
    strftime(tstamp, sizeof(tstamp), "%m%d %H:%M:%S", &tm_loc);

    pid_t tid = syscall(SYS_gettid);

    string prefix = fmt::sprintf("%c%s.%06d % 7d %s:%d] ", level, tstamp, now_us, tid, file, line);

    // TODO better to emit prefix and msg in one go.
    flockfile(stderr);
    fprintf(stderr, "%s", v(prefix));

    va_list argp;
    va_start(argp, format);
    vfprintf(stderr, format, argp);
    va_end(argp);

    fprintf(stderr, "\n");
    funlockfile(stderr);
}
}} // golang::log::
// wcfs::
namespace wcfs {
// v_ for error: text of the error, or "nil" for nil error.
template<> string v_(error err) {
    if (err == nil)
        return "nil";
    return err->Error();
}
// h016 formats x as 16-character-wide zero-padded lowercase hex.
//
// PRIx64 is used so that the conversion matches uint64_t on every platform
// ("%lx" is correct only where long is 64-bit).
static string h016(uint64_t x) {
    return fmt::sprintf("%016" PRIx64, x);
}
// v_ for Tid: 16-character-wide hex as produced by h016.
template<> string v_(const zodb::Tid& tid) {
    return h016(tid);
}
//template<> string v_(zodb::Oid oid) { return h016(oid); }
// XXX Tid and Oid are typedefs for uint64_t and C++ reduces template
// specializations to the underlying type. This providing specialization for
// both Tid and Oid results in "multiple definition" error.
} // wcfs::
// Copyright (C) 2019-2020 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// wcfs_misc.{h,cpp} provide miscellaneous utilities for other wcfs_* files.
#ifndef _NXD_WCFS_MISC_H_
#define _NXD_WCFS_MISC_H_
// XXX hack: C++ does not have __builtin_types_compatible_p, but CCAN configure
// thinks it does because CCAN is configured via C, not C++.
#include <config.h>
#undef HAVE_BUILTIN_TYPES_COMPATIBLE_P
#define HAVE_BUILTIN_TYPES_COMPATIBLE_P 0
#include <ccan/array_size/array_size.h>
#include <stddef.h>
#include <stdint.h>
#include <golang/libgolang.h>
using namespace golang;
#include <string>
using std::string;
#include <utility>
using std::pair;
using std::make_pair;
#include <tuple>
using std::tuple;
using std::make_tuple;
using std::tie;
#include <vector>
using std::vector;
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
// golang::
namespace golang {
// os::
namespace os {
extern global<error> ErrClosed;
// os::File mimics os.File from Go.
// its operations return error with full file context.
typedef refptr<class _File> File;
class _File : public object {
// OS-level file descriptor and the path the file was opened with;
// both are set by open.
int _fd;
string _path;
// don't new - create via open
private:
_File();
~_File();
friend tuple<File, error> open(const string &path, int flags, mode_t mode);
public:
// decref drops one reference; the object is deleted when the last one is gone.
void decref();
public:
// fd returns underlying OS-level file descriptor.
int fd() const;
// name returns the path the file was opened with.
string name() const;
// close closes the file via close(2).
error close();
// read implements io.Reader from Go: it reads into buf up-to count bytes.
// XXX buf -> slice<byte> ?
tuple<int, error> read(void *buf, size_t count);
// write implements io.Writer from Go: it writes all data from buf.
//
// NOTE write behaves like io.Writer in Go - it tries to write as many
// bytes as requested, and if it could write only fewer - it returns an error.
// XXX buf -> slice<byte> ?
tuple<int, error> write(const void *buf, size_t count);
// stat fills *st via fstat(2).
error stat(struct stat *st);
private:
// _errno returns error corresponding to op(file) and current errno.
error _errno(const char *op);
};
// open opens file @path.
tuple<File, error> open(const string &path, int flags = O_RDONLY,
mode_t mode = S_IRUSR | S_IWUSR | S_IXUSR |
S_IRGRP | S_IWGRP | S_IXGRP |
S_IROTH | S_IWOTH | S_IXOTH);
} // os::
// mm::
namespace mm {
tuple<uint8_t*, error> map(int prot, int flags, os::File f, off_t offset, size_t size);
error map_into(void *addr, size_t size, int prot, int flags, os::File f, off_t offset);
error unmap(void *addr, size_t size);
} // mm::
// io::ioutil::
namespace io {
namespace ioutil {
tuple<string, error> ReadFile(const string& path);
}} // io::ioutil::
// ---- misc ----
// xstrconv::
namespace xstrconv {
tuple<uint64_t, error> parseHex64(const string& s);
tuple<int64_t, error> parseInt(const string& s);
tuple<uint64_t, error> parseUint(const string& s);
} // xstrconv::
// log::
namespace log {
// Debugf, Infof, Warnf, Errorf and Fatalf log printf-style formatted message
// tagged with severity letter D/I/W/E/F and with file:line of the call site.
//
// NOTE every macro, Fatalf included, only expands to a __Logf call.
#define Debugf(format, ...) __Logf(__FILE__, __LINE__, 'D', format, ##__VA_ARGS__)
#define Infof(format, ...) __Logf(__FILE__, __LINE__, 'I', format, ##__VA_ARGS__)
#define Warnf(format, ...) __Logf(__FILE__, __LINE__, 'W', format, ##__VA_ARGS__)
#define Errorf(format, ...) __Logf(__FILE__, __LINE__, 'E', format, ##__VA_ARGS__)
#define Fatalf(format, ...) __Logf(__FILE__, __LINE__, 'F', format, ##__VA_ARGS__)
// __Logf is the implementation behind the macros above; it is not meant to be
// called directly.
void __Logf(const char *file, int line, char level, const char *format, ...);
} // log::
} // golang::
// zodb::
namespace zodb {
typedef uint64_t Tid;
typedef uint64_t Oid;
} // zodb::
#include <golang/fmt.h>
// xerr::
namespace xerr {
// xerr::Contextf mimics xerr.Contextf from Go.
//
// Usage is a bit different(*) compared to Go:
//
// func doSomething(arg) {
// xerr.Contextf E("doing something %s", v(arg));
// ...
// return E(err);
// }
//
// (*) because C++ does not allow to modify returned value on the fly.
class Contextf {
// errctx is the formatted error context; it is built once at construction.
string errctx;
public:
// Contextf(format, argv...) formats the error context printf-style.
template<typename ...Argv>
inline Contextf(const char *format, Argv... argv) {
// XXX string() to avoid "error: format not a string literal" given by -Werror=format-security
errctx = fmt::sprintf(string(format), argv...);
}
// E(err) returns err prefixed with the context, or nil if err is nil.
error operator() (error) const;
};
} // xerr::
// wcfs::
namespace wcfs {
// TidHead is invalid Tid which is largest Tid value and means @head.
const zodb::Tid TidHead = -1ULL;
// v mimics %v for T to be used in printf & friends.
//
// NOTE returned char* pointer is guaranteed to stay valid only till end of
// current expression. For example
//
// printf("hello %s", v(obj))
//
// is valid, while
//
// x = v(obj);
// use(x);
//
// is not valid.
#define v(obj) (wcfs::v_(obj).c_str())
// generic v_: objects, pointers to objects and refptr<T> are formatted via
// their .String() method.
template<typename T> string v_(T* obj) { return obj->String(); }
template<typename T> string v_(const T* obj) { return obj->String(); }
template<typename T> string v_(const T& obj) { return obj.String(); }
template<typename T> string v_(refptr<T> obj) { return obj->String(); }
// string is returned as is.
template<> inline string v_(const string& s) { return s; }
// specializations implemented in wcfs_misc.cpp.
// NOTE Tid and Oid are both typedefs for uint64_t, so the two declarations
// below name the same specialization.
template<> string v_(error);
template<> string v_(const zodb::Tid&);
template<> string v_(const zodb::Oid&);
} // wcfs::
#endif
// Copyright (C) 2018-2020 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
#include "wcfs_misc.h"
#include "wcfs.h"
#include "wcfs_watchlink.h"
#include <golang/errors.h>
#include <golang/fmt.h>
#include <golang/io.h>
#include <golang/strings.h>
#include <string.h>
#define TRACE 0
#if TRACE
# define trace(format, ...) log::Debugf(format, ##__VA_ARGS__)
#else
# define trace(format, ...) do {} while (0)
#endif
// wcfs::
namespace wcfs {
// ErrLinkDown is the error indicating that WCFS watch link is no-longer operational.
global<error> ErrLinkDown = errors::New("link is down");
// _openwatch opens new watch link on wcfs.
//
// It opens head/watch in read-write mode, initializes link state and spawns
// _serveRX under the link's work group. The work group context is cancelled
// via link._serveCancel.
pair<WatchLink, error> WCFS::_openwatch() {
    WCFS *wc = this;
    xerr::Contextf E("%s: openwatch", v(wc));

    // head/watch handle.
    os::File fwatch;
    error err;
    tie(fwatch, err) = wc->_open("head/watch", O_RDWR);
    if (err != nil)
        return make_pair(nil, E(err));

    WatchLink link = adoptref(new(_WatchLink));
    link->_wc       = wc;
    link->_f        = fwatch;
    link->_acceptq  = makechan<rxPkt>();
    link->rx_eof    = makechan<structZ>();
    link->_down     = false;
    link->_rxeof    = false;
    link->_req_next = 1;

    // receiver is run under its own cancellable context
    context::Context serveCtx;
    tie(serveCtx, link->_serveCancel) = context::with_cancel(context::background());
    link->_serveWG = sync::NewWorkGroup(serveCtx);
    link->_serveWG->go([link](context::Context ctx) -> error {
        return link->_serveRX(ctx);
    });

    return make_pair(link, nil);
}
// close closes the link.
//
// It closes the send half, cancels and waits for the receiver, and then closes
// the underlying file. The first error among those steps is returned;
// receiver finishing due to cancel, EOF or ErrLinkDown is not an error.
error _WatchLink::close() {
    _WatchLink& wlink = *this;
    xerr::Contextf E("%s: close", v(wlink));

    error errTx = wlink.closeWrite();
    wlink._serveCancel();

    // NOTE we can get stuck here if wcfs does not behave correctly by closing
    // its side in reply to our "bye" message.
    //
    // TODO -> better pthread_kill(SIGINT) instead of relying on wcfs proper behaviour?
    error errServe = wlink._serveWG->wait();
    bool expected = errors::Is(errServe, context::canceled) ||  // we canceled _serveWG
                    errors::Is(errServe, io::EOF_)          ||  // EOF received from WCFS
                    errors::Is(errServe, ErrLinkDown);          // link shutdown due to logic error; details logged
    if (expected)
        errServe = nil;

    error errFile = wlink._f->close();

    // first error wins
    error err = errTx;
    if (err == nil)
        err = errServe;
    if (err == nil)
        err = errFile;

    return E(err);
}
// closeWrite closes send half of the link.
//
// The closing is done by sending "bye" message to wcfs. It is performed at
// most once (guarded by ._txclose1); errors of sending "bye" are ignored and
// closeWrite always returns nil.
error _WatchLink::closeWrite() {
_WatchLink& wlink = *this;
xerr::Contextf E("%s: closeWrite", v(wlink));
wlink._txclose1.do_([&]() {
// ask wcfs to close its tx & rx sides; wcfs.close(tx) wakes up
// _serveRX on client (= on us). The connection can be already closed
// by wcfs - so ignore errors when sending bye.
(void)wlink._send(wlink._nextReqID(), "bye");
// NOTE vvv should be ~ shutdown(wlink._f, SHUT_WR), however shutdown does
// not work for non-socket file descriptors. And even if we dup link
// fd, and close only one used for TX, peer's RX will still be blocked
// as fds are referring to one file object which stays in opened
// state. So just use ^^^ "bye" as "TX closed" message.
// wlink._wtx.close();
});
return nil;
}
// _serveRX receives messages from ._f and dispatches them according to
// streamID either to .recvReq, or to .sendReq waiting for reply.
//
// Packets on stream 0 are logged and remembered in .fatalv. Packets on odd
// streams are treated as replies to our requests and are delivered to the
// channel registered in ._rxtab. Packets on other streams are treated as
// requests originated by wcfs and are delivered to ._acceptq.
//
// _serveRX runs until read/parse error, protocol violation, or ctx cancel,
// and always returns non-nil error. On exit it marks the link as down, closes
// all channels registered in ._rxtab and closes ._acceptq.
error _WatchLink::_serveRX(context::Context ctx) {
_WatchLink& wlink = *this;
xerr::Contextf E("%s: serve rx", v(wlink));
bool rxeof = false;
// when finishing - wakeup everyone waiting for rx
defer([&]() {
wlink._rxmu.lock();
wlink._rxeof = rxeof;
wlink._down = true; // don't allow new rxtab registers; mark the link as down
for (auto _ : wlink._rxtab) {
auto rxq = _.second;
rxq.close();
}
wlink._rxmu.unlock();
wlink._acceptq.close();
});
string l;
error err;
rxPkt pkt;
while (1) {
// NOTE: .close() makes sure ._f.read*() will wake up
tie(l, err) = wlink._readline();
if (err != nil) {
// peer closed its tx
if (err == io::EOF_) {
rxeof = true;
wlink.rx_eof.close();
}
return E(err);
}
trace("C: watch : rx: \"%s\"", v(l));
err = pkt.from_string(l);
if (err != nil)
return E(err);
if (pkt.stream == 0) { // control/fatal message from wcfs
log::Errorf("%s: rx fatal: %s\n", v(wlink), v(l));
wlink.fatalv.push_back(pkt.to_string()); // TODO -> wlink.errorq
continue; // wcfs should close link after error
}
// odd stream -> reply to our request; even stream -> request from wcfs
bool reply = (pkt.stream % 2 != 0);
// wcfs replies to our request
if (reply) {
chan<rxPkt> rxq;
bool ok;
wlink._rxmu.lock();
tie(rxq, ok) = wlink._rxtab.pop_(pkt.stream);
wlink._rxmu.unlock();
if (!ok) {
// wcfs sent reply on unexpected stream -> shutdown wlink.
log::Errorf("%s: .%lu: wcfs sent reply on unexpected stream", v(wlink), pkt.stream);
return E(ErrLinkDown);
}
// deliver the reply, but don't get stuck if we are cancelled
int _ = select({
ctx->done().recvs(), // 0
rxq.sends(&pkt), // 1
});
if (_ == 0)
return E(ctx->err());
}
// wcfs originated request
else {
wlink._rxmu.lock();
if (wlink._accepted.has(pkt.stream)) {
wlink._rxmu.unlock();
// wcfs request on already used stream
log::Errorf("%s: .%lu: wcfs sent request on already used stream", v(wlink), pkt.stream);
return E(ErrLinkDown);
}
wlink._accepted.insert(pkt.stream);
wlink._rxmu.unlock();
// hand the request over to recvReq, but don't get stuck if we are cancelled
int _ = select({
ctx->done().recvs(), // 0
wlink._acceptq.sends(&pkt), // 1
});
if (_ == 0)
return E(ctx->err());
}
}
}
static error _parsePinReq(PinReq *pin, const rxPkt *pkt);

// recvReq receives client <- server request.
//
// The request is taken from ._acceptq and parsed into *prx.
// It returns EOF (unwrapped) when server closes the link, and ErrLinkDown
// if the link went down for another reason.
error _WatchLink::recvReq(context::Context ctx, PinReq *prx) {
    _WatchLink& wlink = *this;
    xerr::Contextf E("%s: recvReq", v(wlink));

    rxPkt pkt;
    bool ok;
    int sel = select({
        ctx->done().recvs(),                // 0
        wlink._acceptq.recvs(&pkt, &ok),    // 1
    });
    if (sel == 0)
        return E(ctx->err());

    if (ok)
        return E(_parsePinReq(prx, &pkt));

    // ._acceptq was closed: tell EOF from other reasons of the link going down
    wlink._rxmu.lock();
    bool rxeof = wlink._rxeof;
    wlink._rxmu.unlock();

    if (rxeof)
        return io::EOF_; // NOTE EOF goes without E
    return E(ErrLinkDown);
}
// replyReq sends reply to client <- server request received via recvReq.
//
// It panics if req.stream is not among accepted streams. It returns
// ErrLinkDown if the link is already down. After the answer is sent - even if
// sending failed - the stream is removed from the set of accepted streams.
//
// NOTE(review): stream is formatted with %d here while _serveRX formats
// pkt.stream with %lu — confirm against StreamID type.
error _WatchLink::replyReq(context::Context ctx, const PinReq *req, const string& answer) {
_WatchLink& wlink = *this;
xerr::Contextf E("%s: replyReq .%d", v(wlink), req->stream);
// snapshot accepted/down state under the lock; act on it after unlocking
wlink._rxmu.lock();
bool ok = wlink._accepted.has(req->stream);
bool down = wlink._down;
wlink._rxmu.unlock();
if (!ok)
panic("reply to not accepted stream");
if (down)
return E(ErrLinkDown);
error err = wlink._send(req->stream, answer);
wlink._rxmu.lock();
ok = wlink._accepted.has(req->stream);
if (ok)
wlink._accepted.erase(req->stream);
wlink._rxmu.unlock();
if (!ok)
panic("BUG: stream vanished from wlink._accepted while reply was in progress");
// TODO also track as answered for some time and don't accept new requests with the same ID?
return E(err);
}
// sendReq sends client -> server request and returns server reply.
//
// A new stream ID is taken from _nextReqID, the request is sent via _sendReq,
// and then sendReq waits for either the reply to arrive on the returned rxq,
// or for ctx to be done. On error the returned reply is "".
pair</*reply*/string, error> _WatchLink::sendReq(context::Context ctx, const string &req) {
    _WatchLink& wlink = *this;

    StreamID stream = wlink._nextReqID();
    // StreamID is uint64_t -> format it with %lu, as the rest of the file does.
    xerr::Contextf E("%s: sendReq .%lu", v(wlink), stream);

    rxPkt rx; bool ok;
    chan<rxPkt> rxq;
    error err;
    tie(rxq, err) = wlink._sendReq(ctx, stream, req);
    if (err != nil)
        return make_pair("", E(err));

    // wait for reply
    E = xerr::Contextf("%s: sendReq .%lu: recvReply", v(wlink), stream);

    int _ = select({
        ctx->done().recvs(),    // 0
        rxq.recvs(&rx, &ok),    // 1
    });
    if (_ == 0)
        return make_pair("", E(ctx->err()));

    // receive completed with !ok: distinguish "link is down" from other cases
    if (!ok) {
        wlink._rxmu.lock();
        bool down = wlink._down;
        wlink._rxmu.unlock();

        return make_pair("", E(down ? ErrLinkDown : io::ErrUnexpectedEOF));
    }

    string reply = rx.to_string();
    return make_pair(reply, nil);
}
tuple</*rxq*/chan<rxPkt>, error> _WatchLink::_sendReq(context::Context ctx, StreamID stream, const string &req) {
    _WatchLink& wlink = *this;

    chan<rxPkt> rxq = makechan<rxPkt>(1);

    // under ._rxmu: register rxq in rxtab, unless the link is down or the
    // stream is already registered.
    wlink._rxmu.lock();
    bool down = wlink._down;
    bool dup  = (!down && wlink._rxtab.has(stream));
    if (!down && !dup)
        wlink._rxtab[stream] = rxq;
    wlink._rxmu.unlock();

    if (down)
        return make_tuple(nil, ErrLinkDown);
    if (dup)
        panic("BUG: to-be-sent stream is present in rxtab");

    error err = wlink._send(stream, req);
    if (err == nil)
        return make_tuple(rxq, err);

    // send failed -> remove rxq from rxtab
    wlink._rxmu.lock();
    wlink._rxtab.erase(stream);
    wlink._rxmu.unlock();

    // no need to drain rxq - it was created with cap=1
    rxq = nil;
    return make_tuple(rxq, err);
}
// _send sends raw message via specified stream.
//
// The packet written to the link is "<stream> <msg>\n".
//
// multiple _send can be called in parallel - _send serializes writes.
// msg must not include \n (else _send panics).
error _WatchLink::_send(StreamID stream, const string &msg) {
    _WatchLink& wlink = *this;
    // StreamID is uint64_t -> format it with %lu, same as for the packet below.
    xerr::Contextf E("%s: send .%lu", v(wlink), stream);

    if (msg.find('\n') != string::npos)
        panic("msg has \\n");

    string pkt = fmt::sprintf("%lu %s\n", stream, v(msg));
    return E(wlink._write(pkt));
}
// _twlinkwrite writes raw pkt to wlink via _WatchLink::_write, bypassing the
// "<stream> <msg>\n" framing and the no-\n check done by _send.
//
// It is declared in the header under "for testing".
error _twlinkwrite(WatchLink wlink, const string &pkt) {
return wlink->_write(pkt);
}
error _WatchLink::_write(const string &pkt) {
    _WatchLink& wlink = *this;
    // no errctx

    // ._txmu is held for the whole duration of the write
    wlink._txmu.lock();
    defer([&]() { wlink._txmu.unlock(); });

    trace("C: watch : tx: \"%s\"", v(pkt));

    // only the error is reported to the caller; the byte count is dropped
    int   nwritten;
    error err;
    tie(nwritten, err) = wlink._f->write(pkt.c_str(), pkt.size());
    return err;
}
// _parsePinReq parses message into PinReq according to wcfs isolation protocol.
//
// The message must have the form
//
//      pin <foid> #<blk> @<at>
//
// where <foid> is parsed with parseHex64, <blk> with parseInt, and <at> is
// either "head" (-> TidHead) or is parsed with parseHex64.
//
// pin->stream and pin->msg are always set from pkt, even if parsing fails.
static error _parsePinReq(PinReq *pin, const rxPkt *pkt) {
    pin->stream = pkt->stream;
    string msg = pkt->to_string();
    pin->msg = msg;

    xerr::Contextf E("bad pin: '%s'", v(msg));

    // pin <foid> #<blk> @<at>
    if (!strings::has_prefix(msg, "pin ")) {
        return E(fmt::errorf("not a pin request"));
    }

    auto argv = strings::split(msg.substr(4), ' ');
    if (argv.size() != 3)
        return E(fmt::errorf("expected 3 arguments, got %zu", argv.size()));   // size_t -> %zu

    // <foid>
    error err;
    tie(pin->foid, err) = xstrconv::parseHex64(argv[0]);
    if (err != nil)
        return E(fmt::errorf("invalid foid"));

    // #<blk>
    if (!strings::has_prefix(argv[1], '#'))
        return E(fmt::errorf("invalid blk"));
    tie(pin->blk, err) = xstrconv::parseInt(argv[1].substr(1));
    if (err != nil)
        return E(fmt::errorf("invalid blk"));

    // @<at>
    if (!strings::has_prefix(argv[2], '@'))
        return E(fmt::errorf("invalid at"));
    auto at = argv[2].substr(1);
    if (at == "head") {
        pin->at = TidHead;
    } else {
        tie(pin->at, err) = xstrconv::parseHex64(at);
        if (err != nil)
            return E(fmt::errorf("invalid at"));
    }

    return nil;
}
// _readline reads next raw line sent from wcfs.
// On success the returned line includes its trailing '\n'. Data that was read
// past that '\n' stays in ._rxbuf for the next call.
//
// If the read returns EOF while ._rxbuf holds a partial line, the error is
// converted to io::ErrUnexpectedEOF.
tuple<string, error> _WatchLink::_readline() {
_WatchLink& wlink = *this;
// buf is the chunk used for each read from ._f
char buf[128];
// ._rxbuf[:nl_searchfrom] was already scanned and contains no '\n'
size_t nl_searchfrom = 0;
while (1) {
// first try to serve the line from already buffered data
auto nl = wlink._rxbuf.find('\n', nl_searchfrom);
if (nl != string::npos) {
auto line = wlink._rxbuf.substr(0, nl+1);
wlink._rxbuf = wlink._rxbuf.substr(nl+1);
return make_tuple(line, nil);
}
nl_searchfrom = wlink._rxbuf.length();
// limit line length to avoid DoS
// NOTE(review): the check runs before the next read, and each read can add
// up to sizeof(buf) bytes, so an accepted line can be longer than 128 bytes.
if (wlink._rxbuf.length() > 128)
return make_tuple("", fmt::errorf("input line is too long"));
// no full line buffered yet -> read more
int n;
error err;
tie(n, err) = wlink._f->read(buf, sizeof(buf));
if (n > 0) {
// NOTE err, if any, is not looked at when some data was read
wlink._rxbuf += string(buf, n);
continue;
}
if (err == nil)
panic("read returned (0, nil)");
if (err == io::EOF_ && wlink._rxbuf.length() != 0)
err = io::ErrUnexpectedEOF;
return make_tuple("", err);
}
}
// from_string parses string into rxPkt.
error rxPkt::from_string(const string &rx) {
    rxPkt& pkt = *this;
    xerr::Contextf E("invalid pkt");

    // <stream> ... \n
    size_t sp = rx.find(' ');
    if (sp == string::npos)
        return E(fmt::errorf("no SP"));
    if (!strings::has_suffix(rx, '\n'))
        return E(fmt::errorf("no LF"));

    // stream ID is everything before the first SP
    error err;
    tie(pkt.stream, err) = xstrconv::parseUint(rx.substr(0, sp));
    if (err != nil)
        return E(fmt::errorf("invalid stream ID"));

    // message is the rest of the line without trailing LF; it must fit into .data
    string body = strings::trim_suffix(rx.substr(sp+1), '\n');
    size_t bodylen = body.length();
    if (bodylen > ARRAY_SIZE(pkt.data))
        return E(fmt::errorf("len(msg) > %zu", ARRAY_SIZE(pkt.data)));

    memcpy(pkt.data, body.c_str(), bodylen);
    pkt.datalen = bodylen;
    return nil;
}
// to_string converts rxPkt data into string.
string rxPkt::to_string() const {
    // exactly .datalen bytes of .data make up the message
    return string(this->data, this->datalen);
}
// Constructor and destructor have empty bodies.
// NOTE(review): the constructor does not set ._down, ._rxeof or ._req_next;
// presumably WCFS::_openwatch (declared friend in the header) initializes them - confirm.
_WatchLink::_WatchLink() {}
_WatchLink::~_WatchLink() {}
void _WatchLink::decref() {
    // delete the object only when __decref() says so
    bool destroy = __decref();
    if (!destroy)
        return;
    delete this;
}
string _WatchLink::String() const {
    // XXX don't include wcfs as prefix here? (see Conn.String for details)
    // result is "<wcfs>: wlink<fd>"
    return fmt::sprintf("%s: wlink%d", v(this->_wc), this->_f->fd());
}
int _WatchLink::fd() const {
const _WatchLink& wlink = *this;
return wlink._f->fd();
}
// _nextReqID returns stream ID for next client-originating request to be made.
StreamID _WatchLink::_nextReqID() {
    _WatchLink& wlink = *this;

    // TODO ._req_next -> atomic (currently uses arbitrary lock)
    wlink._txmu.lock();
    StreamID stream = wlink._req_next;
    wlink._req_next += 2;   // wraparound at uint64 max
    wlink._txmu.unlock();

    return stream;
}
} // wcfs::
// Copyright (C) 2018-2020 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// wcfs_watchlink provides WatchLink class that implements message exchange
// over /head/watch on wcfs.
#ifndef _NXD_WCFS_WATCHLINK_H_
#define _NXD_WCFS_WATCHLINK_H_
#include <golang/libgolang.h>
#include <golang/context.h>
#include <golang/cxx.h>
#include <golang/sync.h>
using namespace golang;
using cxx::dict;
using cxx::set;
#include "wcfs.h"
#include "wcfs_misc.h"
// wcfs::
namespace wcfs {
struct PinReq;
// StreamID stands for ID of a stream multiplexed over WatchLink.
typedef uint64_t StreamID;
// rxPkt internally represents data of one message received over WatchLink.
struct rxPkt {
// stream over which the data was received
StreamID stream;
// raw data received/to-be-sent.
// XXX not e.g. string, as chan<T> currently does not support types with
// non-trivial copy. Note: we anyway need to limit rx line length to
// avoid DoS, but just for DoS the limit would be higher.
//
// datalen is the number of valid bytes in data. data is not NUL-terminated:
// from_string copies exactly datalen bytes and to_string reads exactly datalen bytes.
uint16_t datalen;
char data[256 - sizeof(StreamID) - sizeof(uint16_t)];
// from_string parses raw "<stream> <msg>\n" line into the packet.
// It returns an error if the line is malformed or <msg> does not fit into data.
error from_string(const string& rx);
// to_string returns the message part of the packet (without stream ID) as string.
string to_string() const;
};
static_assert(sizeof(rxPkt) == 256, "rxPkt miscompiled"); // NOTE 128 is too low for long error message
// WatchLink represents /head/watch link opened on wcfs.
//
// It is created by WCFS._openwatch().
//
// .sendReq()/.recvReq() provide raw IO in terms of wcfs isolation protocol messages.
// .close() closes the link.
//
// It is safe to use WatchLink from multiple threads simultaneously.
typedef refptr<class _WatchLink> WatchLink;
class _WatchLink : public object {
WCFS *_wc;
os::File _f; // head/watch file handle
string _rxbuf; // buffer for data already read from _f
// iso.protocol message IO
chan<rxPkt> _acceptq; // server originated messages go here
sync::Mutex _rxmu;
bool _down; // y when the link is no-longer operational
bool _rxeof; // y if EOF was received from server
dict<StreamID, chan<rxPkt>>
_rxtab; // {} stream -> rxq server replies go via here
set<StreamID> _accepted; // streams we accepted but did not reply to yet
StreamID _req_next; // stream ID for next client-originated request TODO -> atomic
sync::Mutex _txmu; // serializes writes
sync::Once _txclose1;
sync::WorkGroup _serveWG; // _serveRX is running under _serveWG
func<void()> _serveCancel;
// XXX for tests
public:
vector<string> fatalv; // ad-hoc, racy. TODO rework to send messages to control channel
chan<structZ> rx_eof; // becomes ready when wcfs closes its tx side
// don't new - create only via WCFS._openwatch()
private:
_WatchLink();
~_WatchLink();
friend pair<WatchLink, error> WCFS::_openwatch();
public:
void decref();
public:
error close();
error closeWrite();
// sendReq sends client -> server request and returns server reply.
pair<string, error> sendReq(context::Context ctx, const string &req);
// recvReq receives client <- server request; it returns EOF when server closes the link.
error recvReq(context::Context ctx, PinReq *rx_into);
// replyReq sends reply to client <- server request received via recvReq.
error replyReq(context::Context ctx, const PinReq *req, const string& reply);
string String() const;
int fd() const;
private:
error _serveRX(context::Context ctx);
// _readline reads next raw line sent from wcfs.
tuple<string, error> _readline();
// _send sends raw message via specified stream; msg must not include \n.
error _send(StreamID stream, const string &msg);
// _write writes pkt to _f under _txmu.
error _write(const string &pkt);
// _nextReqID returns stream ID for next client-originating request to be made.
StreamID _nextReqID();
tuple<chan<rxPkt>, error> _sendReq(context::Context ctx, StreamID stream, const string &req);
friend error _twlinkwrite(WatchLink wlink, const string &pkt);
};
// PinReq represents 1 server-initiated wcfs pin request received over /head/watch link.
// On the wire a pin request has the form "pin <foid> #<blk> @<at>",
// where <at> can be "head".
struct PinReq {
StreamID stream; // request was received with this stream ID
zodb::Oid foid; // request is about this file
int64_t blk; // ----//---- about this block
zodb::Tid at; // pin to this at; TidHead means unpin to head
string msg; // XXX raw message for tests (TODO kill)
};
// for testing
error _twlinkwrite(WatchLink wlink, const string &pkt);
} // wcfs::
#endif
#!/usr/bin/env -S bpftrace
// Copyright (C) 2019-2020 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// fusetrace - trace FUSE protocol exchange that goes over /dev/fuse.
#include <linux/fuse.h>
#include <linux/uio.h>
// BEGIN remembers the start timestamp and fills the lookup tables used by the
// probes below when printing:
//
//   @tstart  - nsecs at start; probes print time relative to it
//   @opcode  - {} FUSE opcode      -> name
//   @notify  - {} FUSE notify code -> name
BEGIN {
@tstart = nsecs;
// XXX vvv commented to save space - else BPF for BEGIN might be rejected as too large
//@nread = {}; // {} tid -> nread
//@nwrite = {}; // {} tid -> nwrite
//@rpending[0] = ustack;
//@wpending[0] = ustack;
//clear(@rpending); // {} (tid,#nr) -> ustack
//clear(@wpending); // {} (tid,#nw) -> ustack
@opcode [FUSE_LOOKUP] = "LOOKUP";
@opcode [FUSE_FORGET] = "FORGET";
@opcode [FUSE_GETATTR] = "GETATTR";
@opcode [FUSE_SETATTR] = "SETATTR";
@opcode [FUSE_READLINK] = "READLINK";
@opcode [FUSE_SYMLINK] = "SYMLINK";
@opcode [FUSE_MKNOD] = "MKNOD";
@opcode [FUSE_MKDIR] = "MKDIR";
@opcode [FUSE_UNLINK] = "UNLINK";
@opcode [FUSE_RMDIR] = "RMDIR";
@opcode [FUSE_RENAME] = "RENAME";
@opcode [FUSE_LINK] = "LINK";
@opcode [FUSE_OPEN] = "OPEN";
@opcode [FUSE_READ] = "READ";
@opcode [FUSE_WRITE] = "WRITE";
@opcode [FUSE_STATFS] = "STATFS";
@opcode [FUSE_RELEASE] = "RELEASE";
@opcode [FUSE_FSYNC] = "FSYNC";
@opcode [FUSE_SETXATTR] = "SETXATTR";
@opcode [FUSE_GETXATTR] = "GETXATTR";
@opcode [FUSE_LISTXATTR] = "LISTXATTR";
@opcode [FUSE_REMOVEXATTR] = "REMOVEXATTR";
@opcode [FUSE_FLUSH] = "FLUSH";
@opcode [FUSE_INIT] = "INIT";
@opcode [FUSE_OPENDIR] = "OPENDIR";
@opcode [FUSE_READDIR] = "READDIR";
@opcode [FUSE_RELEASEDIR] = "RELEASEDIR";
@opcode [FUSE_FSYNCDIR] = "FSYNCDIR";
@opcode [FUSE_GETLK] = "GETLK";
@opcode [FUSE_SETLK] = "SETLK";
@opcode [FUSE_SETLKW] = "SETLKW";
@opcode [FUSE_ACCESS] = "ACCESS";
@opcode [FUSE_CREATE] = "CREATE";
@opcode [FUSE_INTERRUPT] = "INTERRUPT";
@opcode [FUSE_BMAP] = "BMAP";
@opcode [FUSE_DESTROY] = "DESTROY";
@opcode [FUSE_IOCTL] = "IOCTL";
@opcode [FUSE_POLL] = "POLL";
@opcode [FUSE_NOTIFY_REPLY] = "NOTIFY_REPLY";
@opcode [FUSE_BATCH_FORGET] = "BATCH_FORGET";
@opcode [FUSE_FALLOCATE] = "FALLOCATE";
@opcode [FUSE_READDIRPLUS] = "READDIRPLUS";
@opcode [FUSE_RENAME2] = "RENAME2";
@opcode [FUSE_LSEEK] = "LSEEK";
// XXX >= Linux x
//@opcode[FUSE_COPY_FILE_RANGE] = "COPY_FILE_RANGE";
@notify [FUSE_NOTIFY_POLL] = "NOTIFY_POLL";
@notify [FUSE_NOTIFY_INVAL_INODE] = "NOTIFY_INVAL_INODE";
@notify [FUSE_NOTIFY_INVAL_ENTRY] = "NOTIFY_INVAL_ENTRY";
@notify [FUSE_NOTIFY_STORE] = "NOTIFY_STORE";
@notify [FUSE_NOTIFY_RETRIEVE] = "NOTIFY_RETRIEVE";
@notify [FUSE_NOTIFY_DELETE] = "NOTIFY_DELETE";
}
// END clears the bookkeeping and lookup maps, so that they are not printed,
// and emits the header for what is left: @rpending / @wpending, i.e. the
// reads/writes that were entered but did not return.
END {
// don't print:
clear(@nread);
clear(@nwrite);
clear(@opcode);
clear(@notify);
clear(@rbuf);
// print @{r,w}pending
printf("\npending read/write:\n");
}
// reader enqueues
// On fuse_dev_read entry: register the read as pending under (tid, #nr),
// remember the destination buffer for the kretprobe, and print the event
// together with the user stack.
kprobe::fuse_dev_read {
$tr = (nsecs - @tstart) / 1000; // μs
// $nr is the current per-thread read counter; it is incremented in the kretprobe
$nr = @nread[tid];
@rpending[tid,$nr] = ustack;
// fuse_dev_do_read advances `to` - fetch/remember buffer pointer before.
$to = (struct iov_iter *)arg1;
$buf = $to->iov->iov_base;
@rbuf[tid] = $buf;
printf("P%d %d.%d /dev/fuse <- qread %s/%d_%d_r:\n", cpu, $tr/1000000, $tr%1000000, comm, tid, $nr);
printf("%s\n", ustack);
}
// read ready
// On fuse_dev_read return: advance the per-thread read counter, drop the
// pending entry, and print the header of the request that was read, taken
// from the buffer remembered on entry.
kretprobe::fuse_dev_read {
$trr = (nsecs - @tstart) / 1000; // μs
$rr = @nread[tid];
@nread[tid] = $rr + 1;
delete(@rpending[tid,$rr]);
// the buffer pointer was saved by the kprobe; interpret it as fuse_in_header
$h = (struct fuse_in_header *)@rbuf[tid];
delete(@rbuf[tid]);
$op = @opcode[$h->opcode];
printf("P%d %d.%d /dev/fuse -> read %s/%d_%d_r:\n", cpu, $trr/1000000, $trr%1000000, comm, tid, $rr);
// XXX ret=... ^^^
printf("\t.%d %s i%d ...\t\t(ret=%d)\n\n", $h->unique, $op, $h->nodeid, retval);
}
// write request
// On fuse_dev_write entry: register the write as pending under (tid, #nw)
// and print what is being written, decoding the buffer as fuse_out_header:
// unique == 0 is printed as a notification (name looked up in @notify by the
// error field), anything else as ".<unique> (<error>)".
kprobe::fuse_dev_write {
$tw = (nsecs - @tstart) / 1000; // μs
// $nw is the current per-thread write counter; it is incremented in the kretprobe
$nw = @nwrite[tid];
@wpending[tid,$nw] = ustack;
$from = (struct iov_iter *)arg1;
$wbuf = $from->iov->iov_base;
$wh = (struct fuse_out_header *)$wbuf;
printf("P%d %d.%d /dev/fuse <- write %s/%d_%d_w:\n", cpu, $tw/1000000, $tw%1000000, comm, tid, $nw);
$u = $wh->unique;
$e = $wh->error;
if ($u == 0) {
$nop = @notify[$e];
//if ($nop != "") { // XXX https://github.com/iovisor/bpftrace/issues/402#issuecomment-461752005
printf("\t%s ...\n", $nop);
//} else {
// printf("\t?notify(%d) ...\n", $e);
//}
} else {
printf("\t.%d (%d) ...\n", $u, $e);
}
printf("%s\n", ustack);
}
// write ack
// On fuse_dev_write return: advance the per-thread write counter, drop the
// pending entry, and print the acknowledgement with the return value.
kretprobe::fuse_dev_write {
$tww = (nsecs - @tstart) / 1000; // μs
$ww = @nwrite[tid];
@nwrite[tid] = $ww + 1;
delete(@wpending[tid,$ww]);
printf("P%d %d.%d /dev/fuse -> write_ack %s/%d_%d_w", cpu, $tww/1000000, $tww%1000000, comm, tid, $ww);
printf(" (ret=%d)\n\n", retval);
}
// XXX splice stubs (so that splice IO is not missed)
// stub: only reports that a splice read happened; nothing is decoded
kprobe::fuse_dev_splice_read {
printf("/dev/fuse -> splice read\n")
}
// stub: only reports that a splice write happened, together with the kernel
// stack; nothing is decoded
kprobe::fuse_dev_splice_write {
printf("/dev/fuse <- splice write:%s\n", kstack)
}
...@@ -563,25 +563,6 @@ func parseWatch(msg string) (oid zodb.Oid, at zodb.Tid, err error) { ...@@ -563,25 +563,6 @@ func parseWatch(msg string) (oid zodb.Oid, at zodb.Tid, err error) {
// ---- make df happy (else it complains "function not supported") ---- // ---- make df happy (else it complains "function not supported") ----
// StatFs returns fixed filesystem statistics: all sizes and file counts are
// reported as 0, block and fragment sizes as 2MiB, and max name length as 255.
func (root *Root) StatFs() *fuse.StatfsOut {
return &fuse.StatfsOut{
// filesystem sizes (don't try to estimate)
Blocks: 0,
Bfree: 0,
Bavail: 0,
// do we need to count files?
Files: 0,
Ffree: 0,
// block size
Bsize: 2*1024*1024, // "optimal transfer block size" XXX better get from root?
Frsize: 2*1024*1024, // "fragment size"
NameLen: 255, // XXX ok? /proc uses the same
}
}
// ---- misc ---- // ---- misc ----
func panicf(format string, argv ...interface{}) { func panicf(format string, argv ...interface{}) {
......
// Copyright (C) 2018-2020 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
// Program wcfs provides filesystem server with file data backed by wendelin.core arrays.
//
// Intro
//
// Each wendelin.core array (ZBigArray) is actually a linear file (ZBigFile)
// and array metadata like dtype, shape and strides associated with it. This
// program exposes as files only ZBigFile data and leaves rest of
// array-specific handling to clients. Every ZBigFile is exposed as one separate
// file that represents whole ZBigFile's data.
//
// For a client, the primary way to access a bigfile should be to mmap
// head/bigfile/<bigfileX> which represents always latest bigfile data.
// Clients that want to get isolation guarantee should subscribe for
// invalidations and re-mmap invalidated regions to file with pinned bigfile revision for
// the duration of their transaction. See "Isolation protocol" for details(*).
//
// In the usual situation when bigfiles are big, and there are O(1)/δt updates,
// there should be no need for any cache besides shared kernel cache of latest
// bigfile data.
//
// --------
//
// (*) wcfs server comes accompanied by Python and C++ client packages that
// take care about isolation protocol details and provide to clients simple
// interface similar to regular files.
//
//
// Filesystem organization
//
// Top-level structure of provided filesystem is as follows:
//
// head/ ; latest database view
// ...
// @<rev1>/ ; database view as of revision <revX>
// ...
// @<rev2>/
// ...
// ...
//
// where head/ represents latest data as stored in upstream ZODB, and
// @<revX>/ represents data as of database revision <revX>.
//
// head/ has the following structure:
//
// head/
// at ; data inside head/ is as of this ZODB transaction
// watch ; channel for bigfile invalidations
// bigfile/ ; bigfiles' data
// <oid(ZBigFile1)>
// <oid(ZBigFile2)>
// ...
//
// where /bigfile/<bigfileX> represents latest bigfile data as stored in
// upstream ZODB. As there can be some lag receiving updates from the database,
// /at describes precisely ZODB state for which bigfile data is currently
// exposed. Whenever bigfile data is changed in upstream ZODB, information
// about the changes is first propagated to /watch, and only after that
// /bigfile/<bigfileX> is updated. See "Isolation protocol" for details.
//
// @<revX>/ has the following structure:
//
// @<revX>/
// at
// bigfile/ ; bigfiles' data as of revision <revX>
// <oid(ZBigFile1)>
// <oid(ZBigFile2)>
// ...
//
// where /bigfile/<bigfileX> represents bigfile data as of revision <revX>.
//
// Unless accessed {head,@<revX>}/bigfile/<bigfileX> are not automatically visible in
// wcfs filesystem. Similarly @<revX>/ become visible only after access.
//
//
// Isolation protocol
//
// In order to support isolation, wcfs implements isolation protocol that
// must be cooperatively followed by both wcfs and client.
//
// First, client mmaps latest bigfile, but does not access it
//
// mmap(head/bigfile/<bigfileX>)
//
// Then client opens head/watch and tells wcfs through it for which ZODB state
// it wants to get bigfile's view.
//
// C: 1 watch <bigfileX> @<at>
//
// The server then, after potentially sending initial pin and unpin messages
// (see below), reports either success or failure:
//
// S: 1 ok
// S: 1 error ... ; if <at> is too far away back from head/at
//
// The server sends "ok" reply only after head/at is ≥ requested <at>, and only
// after all initial pin/unpin messages are fully acknowledged by the client.
// The client can start to use mmapped data after it gets "ok".
// The server sends "error" reply if requested <at> is too far away back from
// head/at.
// XXX other errors are possible (e.g. "no such file", or error handling pin).
// XXX error handling pin -> then client is killed?
// XXX if not - specify that watch state is lost after error.
//
// Upon watch request, either initially, or after sending "ok", the server will be notifying the
// client about file blocks that client needs to pin in order to observe file's
// data as of <at> revision:
//
// The filesystem server itself receives information about changed data from
// ZODB server through regular ZODB invalidation channel (as it is ZODB client
// itself). Then, separately for each changed file block, before actually
// updating head/bigfile/<bigfileX> content, it notifies through opened
// head/watch links to clients, that had requested it (separately to each
// client), about the changes:
//
// S: <2·k> pin <bigfileX> #<blk> @<rev_max> ; @head means unpin
//
// and waits until all clients confirm that changed file block can be updated
// in global OS cache.
//
// The client in turn should now re-mmap requested to be pinned block to bigfile@<rev_max>
//
// # mmapped at address corresponding to #blk
// mmap(@<rev_max>/bigfile/<bigfileX>, #blk, MAP_FIXED)
//
// or, if given @head as @<rev_max>, to bigfile@head
//
// mmap(head/bigfile/<bigfileX>, #blk, MAP_FIXED)
//
// and must send ack back to the server when it is done:
//
// C: <2·k> ack
//
// The server sends pin notifications only for file blocks, that are known to
// be potentially changed after client's <at>, and <rev_max> describes the
// upper bound for the block revision as of <at> database view:
//
// <rev_max> ≤ <at> ; block stays unchanged in (<rev_max>, <at>] range
//
// The server maintains short history tail of file changes to be able to
// support openings with <at> being slightly in the past compared to current
// head/at. The server might reject a watch request if <at> is too far away in
// the past from head/at. The client is advised to restart its transaction with
// more uptodate database view if it gets watch setup error.
//
// A later request from the client for the same <bigfileX> but with different
// <at>, overrides previous watch request for that file. A client can use "-"
// instead of "@<at>" to stop watching a file.
//
// A single client can send several watch requests through single head/watch
// open, as well as it can use several head/watch opens simultaneously.
// The server sends pin notifications for all files requested to be watched via
// every head/watch open.
//
// Note: a client could use a single watch to manage its several views for the same
// file but with different <at>. This could be achieved via watching with
// @<at_min>, and then deciding internally which views need to be adjusted and
// which views need not. Wcfs does not oblige clients to do so though, and a
// client is free to use as many head/watch openings as it needs to.
//
// When clients are done with @<revX>/bigfile/<bigfileX> (i.e. client's
// transaction ends and array is unmapped), the server sees number of opened
// files to @<revX>/bigfile/<bigfileX> drops to zero, and automatically
// destroys @<revX>/bigfile/<bigfileX> after reasonable timeout.
//
//
// Protection against slow or faulty clients
//
// If a client, on purpose or due to a bug or being stopped, is slow to respond
// with ack to file invalidation notification, it creates a problem because the
// server will become blocked waiting for pin acknowledgments, and thus all
// other clients, that try to work with the same file, will get stuck.
//
// The problem could be avoided, if wcfs would reside inside OS kernel and this
// way could be able to manipulate clients address space directly (then
// isolation protocol won't be needed). It is also possible to imagine
// mechanism, where wcfs would synchronously change clients' address space via
// injecting trusted code and running it on client side via ptrace to adjust
// file mappings.
//
// However ptrace does not work when client thread is blocked under pagefault,
// and that is exactly what wcfs would need to do to process invalidations
// lazily, because eager invalidation processing results in prohibitively slow
// file opens. See internal wcfs overview for details about why ptrace
// cannot be used and why lazy invalidation processing is required.
//
// Lacking OS primitives to change address space of another process and not
// being able to work it around with ptrace in userspace, wcfs takes approach
// to kill a slow client on 30 seconds timeout by default.
//
//
// Writes
//
// As each bigfile is represented by 1 synthetic file, there can be several
// write schemes:
//
// 1. mmap(MAP_PRIVATE) + writeout by client
//
// In this scheme bigfile data is mmapped in MAP_PRIVATE mode, so that local
// user changes are not automatically propagated back to the file. When there
// is a need to commit, client investigates via some OS mechanism, e.g.
// /proc/self/pagemap or something similar, which pages of this mapping it
// modified. Knowing this it knows which data it dirtied and so can write this
// data back to ZODB itself, without filesystem server providing write support.
//
// 2. mmap(MAP_SHARED, PROT_READ) + write-tracking & writeout by client
//
// In this scheme bigfile data is mmapped in MAP_SHARED mode with read-only pages
// protection. Then whenever write fault occurs, client allocates RAM from
// shmfs, copies faulted page to it, and then mmaps RAM page with RW protection
// in place of original bigfile page. Writeout implementation should be similar
// to "1", only here client already knows the pages it dirtied, and this way
// there is no need to consult /proc/self/pagemap.
//
// The advantage of this scheme over mmap(MAP_PRIVATE) is that in case
// there are several in-process mappings of the same bigfile with overlapping
// in-file ranges, changes in one mapping will be visible in another mapping.
// Contrary: whenever a MAP_PRIVATE mapping is modified, the kernel COWs
// faulted page into a page completely private to this mapping, so that other
// MAP_PRIVATE mappings of this file, including ones created from the same
// process, do not see changes made to the first mapping.
//
// Since wendelin.core needs to provide coherency in between different slices
// of the same array, this is the mode wendelin.core actually uses.
//
// 3. write to wcfs
//
// TODO we later could implement "write-directly" mode where clients would write
// data directly into the file.
package main
// Wcfs organization
//
// Wcfs is a ZODB client that translates ZODB objects into OS files as would
// non-wcfs wendelin.core do for a ZBigFile. Contrary to non-wcfs wendelin.core,
// it keeps bigfile data in shared OS cache efficiently. It is organized as follows:
//
// 1) 1 ZODB connection for "latest data" for whole filesystem (zhead).
// 2) head/bigfile/* of all bigfiles represent state as of zhead.At .
// 3) for head/bigfile/* the following invariant is maintained:
//
// #blk ∈ OS file cache => ZBlk(#blk) + all BTree/Bucket that lead to it ∈ zhead live cache(%)
// (ZBlk* in ghost state)
//
// => all BTree/Bucket that lead to blk are tracked (XXX)
//
// The invariant helps on invalidation: if we see a changed oid, and
// zhead.cache.lookup(oid) = ø -> we know we don't have to invalidate OS
// cache for any part of any file (even if oid relates to a file block - that
// block is not cached and will trigger ZODB load on file read).
//
// XXX explain why tracked
//
// Currently we maintain this invariant by simply never evicting ZBlk/LOBTree/LOBucket
// objects from ZODB Connection cache. In the future we may want to try to
// synchronize to kernel freeing its pagecache pages.
//
// 4) when we receive an invalidation message from ZODB - we process it and
// propagate invalidations to OS file cache of head/bigfile/*:
//
// invalidation message: (tid↑, []oid)
//
// 4.1) zhead.cache.lookup(oid) XXX -> δFtail
// 4.2) ø: nothing to do - see invariant ^^^.
// 4.3) obj found:
//
// - ZBlk* -> [] of file/[]#blk
// - BTree/Bucket -> δ(BTree) -> file/[]#blk
//
// in the end after processing all []oid from invalidation message we have
//
// [] of file/[]#blk
//
// that describes which parts of which file(s) need to be invalidated.
//
// FIXME no - we can build it but not in full - since we consider only zobj in live cache.
// FIXME and even if we consider all δ'ed zobj, building complete set of
// file.δtail requires to first do complete scan of file.blktab
// which is prohibitively expensive.
// XXX -> we'll do the scan, but only Trees _without_ Buckets. This
// makes the scan practical until 1PB while helping to build
// initial tracking set for δFtail.
// Eager invalidation would require full scan - Trees _and_
// Buckets, which makes it prohibitively expensive - see (+).
//
// 4.4) for all file/blk to invalidate we do:
//
// - try to retrieve head/bigfile/file[blk] from OS file cache(*);
// - if retrieved successfully -> store retrieved data back into OS file
// cache for @<rev>/bigfile/file[blk], where
//
// # see below about file.δtail
// # XXX -> file.BlkRevAt(#blk, zhead.at)
// rev = max(file.δtail.by(#blk)) || min(rev ∈ file.δtail) || zhead.at
//
// - invalidate head/bigfile/file[blk] in OS file cache.
//
// This preserves previous data in OS file cache in case it will be needed
// by not-yet-uptodate clients, and makes sure file read of head/bigfile/file[blk]
// won't be served from OS file cache and instead will trigger a FUSE read
// request to wcfs.
//
// 4.5) no invalidation messages are sent to wcfs clients at this point(+).
//
// 4.6) processing ZODB invalidations and serving file reads (see 7) are
// organized to be mutually exclusive.
//
// 5) after OS file cache was invalidated, we resync zhead to new database
// view corresponding to tid.
//
// 6) for every file δtail invalidation info about head/data is maintained: XXX -> δFtail
//
// - tailv: [](rev↑, []#blk)
// - by: {} #blk -> []rev↑ in tail
//
// δtail.tailv describes invalidations to file we learned from ZODB invalidations.
// δtail.by allows to quickly lookup information by #blk.
//
// min(rev) in δtail is min(@at) at which head/bigfile/file is currently watched (see below).
//
// XXX δtail can miss ...
//
// to support initial openings with @at being slightly in the past, we also
// make sure that min(rev) is enough to cover last 10 minutes of history
// from head/at.
//
// 7) when we receive a FUSE read(#blk) request to a head/bigfile/file, we process it as follows:
//
// 7.1) load blkdata for head/bigfile/file[blk] @zhead.at .
//
// while loading this also gives upper bound estimate of when the block
// was last changed:
//
// rev(blk) ≤ max(_.serial for _ in (ZBlk(#blk), all BTree/Bucket that lead to ZBlk))
//
// it is not exact because BTree/Bucket can change (e.g. rebalance)
// but still point to the same k->ZBlk.
//
// we also use file.δtail to find either exact blk revision: XXX δFtail
//
// rev(blk) = max(file.δtail.by(#blk) -> []rev↑)
//
// or another upper bound if #blk ∉ δtail:
//
// rev(blk) ≤ min(rev ∈ δtail) ; #blk ∉ δtail
//
//
// below rev'(blk) is min(of the estimates found):
//
// rev(blk) ≤ rev'(blk) rev'(blk) = min(^^^)
//
//
// XXX we delay recomputing δFtail.LastBlkRev(file, #blk, head) because
// using just cheap revmax estimate can frequently result in all watches
// being skipped.
//
// 7.2) for all registered client@at watches of head/bigfile/file:
//
// - rev'(blk) ≤ at: -> do nothing
// - rev'(blk) > at:
// - if blk ∈ watch.pinned -> do nothing
// - rev = max(δtail.by(#blk) : _ ≤ at) || min(rev ∈ δtail : rev ≤ at) || at
// - watch.pin(file, #blk, @rev)
// - watch.pinned += blk
//
// where
//
// watch.pin(file, #blk, @rev)
//
// sends pin message according to "Isolation protocol", and is assumed
// to cause
//
// remmap(file, #blk, @rev/bigfile/file)
//
// on client.
//
// ( one could imagine adjusting mappings synchronously via running
// wcfs-trusted code via ptrace that wcfs injects into clients, but ptrace
// won't work when client thread is blocked under pagefault or syscall(^) )
//
// in order to support watching for each head/bigfile/file
//
// [] of watch{client@at↑, pinned}
//
// is maintained.
//
// 7.3) blkdata is returned to kernel.
//
// Thus a client that wants latest data on pagefault will get latest data,
// and a client that wants @rev data will get @rev data, even if it was this
// "old" client that triggered the pagefault(~).
//
// 8) serving FUSE reads from @<rev>/bigfile/file is organized similarly to
// serving reads from head/bigfile/file, but with using dedicated per-<rev>
// ZODB connection and without notifying any watches.
//
// 9) for every ZODB connection (zhead + one per @<rev>) a dedicated read-only
// transaction is maintained. For zhead, every time it is resynced (see "5")
// the transaction associated with zhead is renewed.
//
// XXX 10) gc @rev/ and @rev/bigfile/<bigfileX> automatically on atime timeout
//
//
// (*) see notes.txt -> "Notes on OS pagecache control"
// (+) see notes.txt -> "Invalidations to wcfs clients are delayed until block access"
// (~) see notes.txt -> "Changing mmapping while under pagefault is possible"
// (^) see notes.txt -> "Client cannot be ptraced while under pagefault"
// (%) no need to keep track of ZData - ZBlk1 is always marked as changed on blk data change.
// Wcfs locking organization
//
// As was said above, processing ZODB invalidations (see "4") and serving file
// reads (see "7") are organized to be mutually exclusive. To do so a major RW
// lock - zheadMu - is used. Whenever ZODB invalidations are processed and
// zhead.at is updated - zheadMu.W is taken. Conversely, whenever a file read is
// served, and in other situations which need zhead to keep viewing the
// database at the same state - zheadMu.R is taken.
//
// Several locks that protect internal data structures are minor to zheadMu -
// they need to be taken only under zheadMu.R (to protect e.g. multiple readers
// running simultaneously to each other), but do not need to be taken at all if
// zheadMu.W is taken. In data structures such locks are noted as follows
//
// xMu sync.Mutex // zheadMu.W | zheadMu.R + xMu
//
// If a lock is not minor to zheadMu, it is still ok to lock it under zheadMu.R
// as zheadMu, being the most major lock in wcfs, always comes locked first, if
// it needs to be locked.
//
// For watches, similarly to zhead, watch.at is protected by major-for-watch
// per-watch RW lock watch.atMu . When watch.at is updated during watch
// setup/upgrade time - watch.atMu.W is taken. Conversely, whenever a watch is
// notified with pin messages - watch.atMu.R is taken to make sure watch.at
// stays unchanged while pins are prepared and processed.
//
// For watches, similarly to zheadMu, there are several minor-to-atMu locks
// that protect internal data structures. Such locks are noted similarly to
// zheadMu enslavement.
//
// In addition to what is written above there are other ordering rules that are
// followed consistently to avoid hitting deadlock:
//
// BigFile.watchMu > Watch.atMu
// WatchLink.byfileMu > BigFile.watchMu
// WatchLink.byfileMu > BigFileDir.fileMu
// WatchLink.byfileMu > Watch.atMu
// Notation used
//
// δZ - change in ZODB space
// δB - change in BTree*s* space
// δT - change in BTree(1) space
// δF - change in File*s* space
// δfile - change in File(1) space XXX -> δf ?
//
// f - BigFile
// bfdir - BigFileDir
// wlink - WatchLink
// w - Watch
import (
"bufio"
"context"
"flag"
"fmt"
"io"
stdlog "log"
"math"
"os"
"runtime"
"sort"
"strings"
"sync"
"sync/atomic"
"syscall"
// "time"
log "github.com/golang/glog"
"lab.nexedi.com/kirr/go123/xcontext"
"lab.nexedi.com/kirr/go123/xerr"
"lab.nexedi.com/kirr/go123/xio"
"lab.nexedi.com/kirr/go123/xruntime/race"
"lab.nexedi.com/kirr/go123/xsync"
"lab.nexedi.com/kirr/neo/go/transaction"
"lab.nexedi.com/kirr/neo/go/zodb"
"lab.nexedi.com/kirr/neo/go/zodb/btree"
_ "lab.nexedi.com/kirr/neo/go/zodb/wks"
"github.com/johncgriffin/overflow"
"github.com/hanwen/go-fuse/v2/fuse"
"github.com/hanwen/go-fuse/v2/fuse/nodefs"
"github.com/pkg/errors"
)
// Root represents root of wcfs filesystem.
//
// It holds the ZODB storage and DB handles together with the directories
// that serve head/ and @<rev>/ .
type Root struct {
	fsNode

	// ZODB storage we work with
	zstor zodb.IStorage

	// ZODB DB handle for zstor.
	// keeps cache of connections for @<rev>/ accesses.
	// only one connection is used for each @<rev>.
	zdb *zodb.DB

	// directory + ZODB connection for head/
	// (zhead is Resync'ed and is kept outside zdb pool)
	head *Head

	// directories + ZODB connections for @<rev>/
	revMu sync.Mutex // NOTE(review): presumably protects revTab - confirm against revTab users
	revTab map[zodb.Tid]*Head // {} rev -> Head
}
// /(head|<rev>)/ - served by Head.
//
// Head keeps the ZODB connection through which everything under this
// directory views the database, plus - for head/ - the state used to
// coordinate zwatcher with OS cache uploaders, and the registries of opened
// watch links and of zhead.At waiters.
type Head struct {
	fsNode

	rev zodb.Tid // 0 for head/, !0 for @<rev>/
	bfdir *BigFileDir // bigfile/
	// at - served by .readAt
	// watch - implicitly linked to by fs

	// ZODB connection for everything under this head

	// zheadMu protects zconn.At & live _objects_ associated with it.
	// while it is rlocked zconn is guaranteed to stay viewing database at
	// particular view.
	//
	// zwatcher write-locks this and knows no one is using ZODB objects and
	// no one mutates OS file cache while zwatcher is running.
	//
	// it is also kept rlocked by OS cache uploaders (see BigFile.uploadBlk)
	// with additional locking protocol to avoid deadlocks (see below for
	// pauseOSCacheUpload + ...).
	zheadMu sync.RWMutex
	zconn *ZConn // for head/ zwatcher resyncs head.zconn; others only read zconn objects.

	// zwatcher signals to uploadBlk to pause/continue uploads to OS cache to avoid deadlocks.
	// see notes.txt -> "Kernel locks page on read/cache store/..." for details.
	//
	// handleδZ sets pauseOSCacheUpload=true and installs continueOSCacheUpload
	// under zheadMu.W; when it is done it resets both and closes the channel.
	// uploadBlk reads both under zheadMu.R and, if paused, waits on the channel.
	pauseOSCacheUpload bool
	continueOSCacheUpload chan struct{}

	// uploadBlk signals to zwatcher that there are so many inflight OS cache uploads currently.
	// accessed atomically, because uploadBlk updates it without holding zheadMu.
	inflightOSCacheUploads int32

	// head/watch opens
	wlinkMu sync.Mutex
	wlinkTab map[*WatchLink]struct{}

	// waiters for zhead.At to become ≥ their at.
	// entries are added by zheadWait and are removed + signalled by handleδZ.
	hwaitMu sync.Mutex // zheadMu.W | zheadMu.R + hwaitMu
	hwait map[hwaiter]struct{} // set{(at, ready)}
}
// /(head|<rev>)/bigfile/ - served by BigFileDir.
//
// It keeps the table of BigFile nodes under this directory and, for head/,
// the δFtail that is used to turn ZODB changes into file changes.
type BigFileDir struct {
	fsNode
	head *Head // parent head/ or @<rev>/

	// {} oid -> <bigfileX>
	fileMu sync.Mutex // zheadMu.W | zheadMu.R + fileMu
	fileTab map[zodb.Oid]*BigFile

	// δ tail of tracked BTree nodes of all BigFiles + -> which file
	// (used only for head/, not revX/)
	δFmu sync.RWMutex // zheadMu.W | zheadMu.R + δFmu.X
	δFtail *ΔFtail
}
// /(head|<rev>)/bigfile/<bigfileX> - served by BigFile.
//
// Besides being the filesystem node, BigFile keeps per-file state: values
// read from the ZBigFile object (blksize, size, rev), blocks that are
// currently being loaded / staged for OS cache, and watches attached to
// the file.
type BigFile struct {
	fsNode

	// this BigFile is under .head/bigfile/; it views ZODB via .head.zconn
	// parent's BigFileDir.head is the same.
	head *Head

	// ZBigFile top-level object
	zfile *ZBigFile

	// things read/computed from .zfile; constant during lifetime of current transaction.
	// i.e. changed under zhead.W
	blksize int64 // zfile.blksize
	size int64 // zfile.Size()
	rev zodb.Tid // last revision that modified zfile data
	// XXX we can't know rev fully as some later blocks could be learnt only
	// while populating δFtail lazily
	// XXX or then it is not "constant during lifetime of current txn"

	// // tail change history of this file.
	// //
	// // XXX computationally expensive to start - see "Invalidations to wcfs
	// // clients are delayed ..." in notes.txt
	// //go:generate ./gen-δtail I64 int64 zδtail_i64.go
	// δtail *ΔTailI64 // [](rev↑, []#blk)

	// blocks that were ever read-accessed (head/ only) XXX locking by bfdir.δFmu ?
	accessed SetI64

	// inflight loadings of ZBigFile from ZODB.
	// successful load results are kept here until blkdata is put into OS pagecache.
	//
	// Being a staging area for data to enter OS cache, loading has to be
	// consulted/invalidated whenever wcfs logic needs to consult/invalidate OS cache.
	loadMu sync.Mutex // zheadMu.W | zheadMu.R + loadMu
	loading map[int64]*blkLoadState // #blk -> {... blkdata}

	// watches attached to this file.
	//
	// both watches in already "established" state (i.e. initial watch
	// request was completed and answered with "ok"), and watches in
	// progress of being established are kept here.
	watchMu sync.RWMutex
	watchTab map[*Watch]struct{}
}
// blkLoadState represents a ZBlk load state/result.
//
// when !ready the loading is in progress.
// when ready the loading has been completed.
//
// blkdata and err are set by the loader (see BigFile.readBlk) before ready
// is closed; other readers wait on ready before looking at them.
type blkLoadState struct {
	ready chan struct{} // closed when loading is complete

	blkdata []byte // loaded block data; valid if err == nil
	err error // load error, if any
}
// /head/watch - served by WatchNode.
//
// Every open of /head/watch is served by its own WatchLink.
type WatchNode struct {
	fsNode

	head *Head // parent head/
	idNext int32 // ID for next opened WatchLink
}
// /head/watch open - served by WatchLink.
//
// A WatchLink is the server side of one client connection over which the
// isolation protocol is spoken: it keeps the socket to the client, the
// watches the client has set up (one per file), and bookkeeping for
// wcfs-originated requests and their replies.
type WatchLink struct {
	sk *FileSock // IO channel to client
	id int32 // ID of this /head/watch handle (for debug log)
	head *Head

	// watches associated with this watch link.
	//
	// both already established, and watches being initialized in-progress are registered here.
	// (see setupWatch)
	byfileMu sync.Mutex
	byfile map[zodb.Oid]*Watch // {} foid -> Watch

	// IO
	reqNext uint64 // stream ID for next wcfs-originated request; 0 is reserved for control messages
	txMu sync.Mutex // NOTE(review): presumably serializes writes to sk - confirm
	rxMu sync.Mutex // NOTE(review): presumably protects rxTab - confirm
	rxTab map[/*stream*/uint64]chan string // client replies go via here
}
// Watch represents watching for changes to 1 BigFile over particular watch link.
//
// It records the revision the client asked to watch the file at, and which
// blocks were already pinned for that client.
type Watch struct {
	link *WatchLink // link to client
	file *BigFile // watching this file

	// atMu, similarly to zheadMu, protects watch.at and pins associated with Watch.
	// atMu.R guarantees that watch.at is not changing, but multiple
	// simultaneous pins could be running (used e.g. by readPinWatchers).
	// atMu.W guarantees that only one user has watch.at write access and
	// that no pins are running (used by setupWatch).
	atMu sync.RWMutex
	at zodb.Tid // requested to be watched @at

	pinnedMu sync.Mutex // atMu.W | atMu.R + pinnedMu
	pinned map[int64]*blkPinState // {} blk -> {... rev} blocks that are already pinned to be ≤ at
}
// blkPinState represents state/result of pinning one block.
//
// when !ready the pinning is in progress.
// when ready the pinning has been completed.
//
// err is set by the pinner (see Watch._pin) before ready is closed.
type blkPinState struct {
	rev zodb.Tid // revision to which the block is being or has been pinned

	ready chan struct{} // closed when pinning is complete
	err error // pin error, if any
}
// -------- 3) Cache invariant --------
// zodbCacheControl implements zodb.LiveCacheControl to tune ZODB to never evict
// LOBTree/LOBucket from live cache. We want to keep LOBTree/LOBucket always alive
// because it is essentially the index where to find ZBigFile data.
//
// For the data itself - we put it to kernel pagecache and always deactivate
// from ZODB right after that.
//
// See "3) for */head/data the following invariant is maintained..."
type zodbCacheControl struct{}

// PCacheClassify tells ZODB live cache how to treat obj depending on its type.
//
// Objects of types not listed below get the zero (default) policy.
func (_ *zodbCacheControl) PCacheClassify(obj zodb.IPersistent) zodb.PCachePolicy {
	switch obj.(type) {
	// ZBlk* should be in cache but without data
	case *ZBlk0, *ZBlk1:
		return zodb.PCachePinObject | zodb.PCacheDropState

	// ZBigFile btree index should be in cache with data.
	//
	// for performance reason we also keep ZBigFile itself in cache:
	// it is the top-level object that is used on every block load, and
	// it would be a waste to evict it.
	case *btree.LOBTree, *btree.LOBucket, *ZBigFile:
		return zodb.PCachePinObject | zodb.PCacheKeepState

	// don't let ZData to pollute the cache
	case *ZData:
		return zodb.PCacheDropObject | zodb.PCacheDropState
	}

	return 0
}
/*
// -------- zhead lock/wait --------
// XXX needed?
// TODO head.zheadMu -> special mutex with Lock(ctx) so that Lock wait could be canceled
func (head *Head) zheadRLock() { head.zheadMu.RLock() }
func (head *Head) zheadRUnlock() { head.zheadMu.RUnlock() }
func (head *Head) zheadLock() { head.zheadMu.Lock() }
func (head *Head) zheadUnlock() { head.zheadMu.Unlock() }
*/
// -------- 4) ZODB invalidation -> OS cache --------
// traceZWatch logs a "zwatcher: "-prefixed message on behalf of its caller.
//
// Nothing is logged unless verbosity is at least 1.
func traceZWatch(format string, argv ...interface{}) {
	if log.V(1) { // XXX -> 2?
		msg := fmt.Sprintf("zwatcher: "+format, argv...)
		log.InfoDepth(1, msg)
	}
}
// zwatcher watches for ZODB changes.
//
// It receives events from zwatchq and hands every commit event to handleδZ.
// It returns when ctx is canceled, when zwatchq is closed (nil error), on
// an error event, on an event of unexpected type, or when handleδZ fails.
//
// see "4) when we receive an invalidation message from ZODB ..."
func (root *Root) zwatcher(ctx context.Context, zwatchq chan zodb.Event) (err error) {
	defer xerr.Contextf(&err, "zwatch %s", root.zstor.URL())
	traceZWatch(">>>")

	for {
		var (
			ev   zodb.Event
			live bool
		)

		traceZWatch("select ...")
		select {
		case ev, live = <-zwatchq:
			if !live {
				traceZWatch("zwatchq closed")
				return nil // closed XXX ok?
			}

		case <-ctx.Done():
			traceZWatch("cancel")
			return ctx.Err()
		}

		traceZWatch("zevent: %s", ev)

		switch e := ev.(type) {
		case *zodb.EventCommit:
			if err = root.handleδZ(ctx, e); err != nil {
				return err
			}

		case *zodb.EventError:
			return e.Err

		default:
			return fmt.Errorf("unexpected event: %T", e)
		}
	}
}
// handleδZ handles 1 change event from ZODB notification.
//
// It runs with head.zheadMu write-locked and with OS cache uploaders paused,
// and in that state:
//
//   - computes δF - the change to tracked files - from δZ,
//   - invalidates changed blocks and, if file size changed, file attributes
//     in OS cache,
//   - resyncs zhead to δZ.Tid under a new transaction,
//   - restats changed files,
//   - notifies .wcfs/zhead readers and zhead.At waiters.
//
// The first error encountered is returned; processing stops at that point.
func (root *Root) handleδZ(ctx context.Context, δZ *zodb.EventCommit) (err error) {
	defer xerr.Contextf(&err, "handleδZ @%s", δZ.Tid)

	head := root.head

	// while we are invalidating OS cache, make sure that nothing, that
	// even reads /head/bigfile/*, is running (see 4.6).
	//
	// also make sure that cache uploaders we spawned (uploadBlk) are all
	// paused, or else they could overwrite OS cache with stale data.
	// see notes.txt -> "Kernel locks page on read/cache store/..." for
	// details on how to do this without deadlocks.
	continueOSCacheUpload := make(chan struct{})
retry:
	for {
		// XXX ctx cancel
		head.zheadMu.Lock()
		head.pauseOSCacheUpload = true
		head.continueOSCacheUpload = continueOSCacheUpload

		// NOTE need atomic load, since inflightOSCacheUploads
		// decrement is done not under zheadMu.
		if atomic.LoadInt32(&head.inflightOSCacheUploads) != 0 {
			// an upload is still in progress: release the lock and try again.
			// pauseOSCacheUpload stays set, so uploaders that did not yet
			// start will wait on continueOSCacheUpload.
			head.zheadMu.Unlock()
			continue retry
		}

		break
	}

	// on return: let uploaders continue. The channel is closed only after
	// the pause request is cleared and zheadMu is released.
	defer func() {
		head.pauseOSCacheUpload = false
		head.continueOSCacheUpload = nil
		head.zheadMu.Unlock()
		close(continueOSCacheUpload)
	}()

	// zheadMu.W taken and all cache uploaders are paused

	zhead := head.zconn
	bfdir := head.bfdir

	// invalidate kernel cache for data in changed files
	// NOTE no δFmu lock needed because zhead is WLocked
	δF, err := bfdir.δFtail.Update(δZ, zhead) // δF <- δZ |tracked
	if err != nil {
		return err
	}

	if false { // XXX -> V(2) ?
		// debug dump δF
		fmt.Printf("\n\nS: handleδZ: δF (#%d):\n", len(δF.ByFile))
		for file, δfile := range δF.ByFile {
			blkv := δfile.Blocks.Elements()
			sort.Slice(blkv, func(i, j int) bool {
				return blkv[i] < blkv[j]
			})
			size := " "
			if δfile.Size {
				size = "S"
			}
			fmt.Printf("S: \t- %s\t%s %v\n", file.zfile.POid(), size, blkv)
		}
		fmt.Printf("\n\n")
	}

	// all changed blocks of all changed files are invalidated in parallel.
	wg := xsync.NewWorkGroup(ctx)
	for file, δfile := range δF.ByFile {
		// // XXX needed?
		// // XXX even though δBtail is complete, not all ZBlk are present here
		// file.δtail.Append(δF.Rev, δfile.Blocks.Elements())

		file := file
		for blk := range δfile.Blocks {
			blk := blk
			wg.Go(func(ctx context.Context) error {
				return file.invalidateBlk(ctx, blk)
			})
		}
	}
	err = wg.Wait()
	if err != nil {
		return err
	}

	// invalidate kernel cache for attributes
	// we need to do it only if we see topology (i.e. btree) change
	//
	// do it after completing data invalidations.
	wg = xsync.NewWorkGroup(ctx)
	for file, δfile := range δF.ByFile {
		if !δfile.Size {
			continue
		}
		file := file
		wg.Go(func(ctx context.Context) error {
			return file.invalidateAttr() // NOTE does not accept ctx
		})
	}
	err = wg.Wait()
	if err != nil {
		return err
	}

	// resync .zhead to δZ.tid
	// XXX -> Head.Resync() ?

	// 1. abort old and resync to new txn/at
	// NOTE from here on ctx is the context of the new transaction, which
	// is derived from context.Background() and not from the ctx argument.
	transaction.Current(zhead.txnCtx).Abort()
	_, ctx = transaction.New(context.Background()) // XXX bg ok?
	err = zhead.Resync(ctx, δZ.Tid)
	if err != nil {
		return err
	}
	zhead.txnCtx = ctx

	// 2. restat invalidated ZBigFile
	// NOTE no lock needed since .blksize and .size are constant during lifetime of one txn.
	// XXX -> parallel
	for file := range δF.ByFile {
		size, sizePath, err := file.zfile.Size(ctx)
		if err != nil {
			return err
		}

		file.size = size
		bfdir.δFtail.Track(file, -1, sizePath, nil)

		// XXX we can miss a change to file if δblk is not yet tracked
		// -> need to update file.rev at read time -> locking=XXX
		file.rev = zhead.At()
	}

	// notify .wcfs/zhead
	// a socket to which the write fails is logged, closed and forgotten.
	for sk := range gdebug.zheadSockTab {
		_, err := fmt.Fprintf(xio.BindCtxW(sk, ctx), "%s\n", δZ.Tid)
		if err != nil {
			log.Errorf("%s", err) // XXX errctx + file, handle, reader pid
			sk.Close()
			delete(gdebug.zheadSockTab, sk)
		}
	}

	// XXX δFtail.ForgetPast(...)
	// XXX for f in δF: f.δtail.ForgetPast(...)

	// notify zhead.At waiters
	// (head.hwait is accessed without hwaitMu because zheadMu is wlocked)
	for hw := range head.hwait {
		if hw.at <= δZ.Tid {
			delete(head.hwait, hw)
			close(hw.ready)
		}
	}

	return nil
}
// hwaiter represents someone waiting for zhead to become ≥ at.
//
// hwaiters are registered in Head.hwait by zheadWait; handleδZ closes ready
// and unregisters the waiter once at ≤ the committed tid.
type hwaiter struct {
	at zodb.Tid // wait for zhead.At ≥ at
	ready chan struct{} // closed when the wait is over
}
// zheadWait waits till head.zconn.At becomes ≥ at.
//
// If zhead is already there it returns immediately. Otherwise a waiter is
// registered in head.hwait and the call blocks until the waiter is signalled
// or ctx is canceled.
//
// It returns error either if wcfs is down or ctx is canceled.
// It must be called only for head/ and panics when called for @revX/ .
func (head *Head) zheadWait(ctx context.Context, at zodb.Tid) (err error) {
	defer xerr.Contextf(&err, "wait zhead ≥ %s", at)

	if head.rev != 0 {
		panic("must be called only for head/, not @revX/")
	}

	// XXX check wcfs.down

	// under zheadMu.R: either see that zhead is already ≥ at, or register
	// ourselves as a waiter before zhead has a chance to advance.
	var ready chan struct{}
	head.zheadMu.RLock()
	caughtUp := head.zconn.At() >= at
	if !caughtUp {
		ready = make(chan struct{})
		head.hwaitMu.Lock()
		head.hwait[hwaiter{at, ready}] = struct{}{}
		head.hwaitMu.Unlock()
	}
	head.zheadMu.RUnlock()

	if caughtUp {
		return nil
	}

	// no - we have to wait for it
	select {
	case <-ready:
		return nil // ok - zhead.At went ≥ at

	case <-ctx.Done():
		return ctx.Err()
	}
}
// invalidateBlk invalidates 1 file block in kernel cache.
//
// Before the invalidation, current data of the block - if it can be obtained
// in full either from f.loading or from OS cache - is stored into OS cache of
// @<rev>/bigfile/file[blk], so that it stays available for clients that are
// not yet up to date. Failures on this preservation path are only logged.
//
// The returned error is non-nil only if invalidating head/bigfile/file[blk]
// itself failed.
//
// see "4.4) for all file/blk to invalidate we do"
// called with zheadMu wlocked.
func (f *BigFile) invalidateBlk(ctx context.Context, blk int64) (err error) {
	defer xerr.Contextf(&err, "%s: invalidate blk #%d:", f.path(), blk)

	fsconn := gfsconn
	blksize := f.blksize
	off := blk*blksize

	var blkdata []byte = nil

	// first try to retrieve f.loading[blk];
	// make sure f.loading[blk] is invalidated.
	//
	// we are running with zheadMu wlocked - no need to lock f.loadMu
	loading, ok := f.loading[blk]
	if ok {
		if loading.err == nil {
			blkdata = loading.blkdata
		}
		delete(f.loading, blk)
	}

	// TODO skip retrieve/store if len(f.watchTab) == 0
	// try to retrieve cache of current head/data[blk], if we got nothing from f.loading
	if blkdata == nil {
		blkdata = make([]byte, blksize)
		n, st := fsconn.FileRetrieveCache(f.Inode(), off, blkdata)
		if st != fuse.OK {
			// not fatal: we only lose the chance to preserve old data
			// under @revX/ . Still tell the user, instead of silently
			// ignoring the failure.
			log.Warningf("%s: invalidate blk #%d: retrieve cache: %s (ignoring, but reading @revX/bigfile will be slow)", f.path(), blk, st)
		}
		blkdata = blkdata[:n]
	}

	// if less than blksize was cached - probably the kernel had to evict
	// some data from its cache already. In such case we don't try to
	// preserve the rest and drop what was read, to avoid keeping the
	// system overloaded.
	//
	// if we have the data - preserve it under @revX/bigfile/file[blk].
	if int64(len(blkdata)) == blksize {
		func() {
			// store retrieved data back to OS cache for file @<rev>/file[blk]
			δFtail := f.head.bfdir.δFtail
			blkrev, _ := δFtail.LastBlkRev(ctx, f, blk, f.head.zconn.At())
			frev, funlock, err := groot.lockRevFile(blkrev, f.zfile.POid())
			if err != nil {
				log.Errorf("BUG: %s: invalidate blk #%d: %s (ignoring, but reading @revX/bigfile will be slow)", f.path(), blk, err)
				return
			}
			defer funlock()

			st := fsconn.FileNotifyStoreCache(frev.Inode(), off, blkdata)
			if st != fuse.OK {
				log.Errorf("BUG: %s: invalidate blk #%d: %s: store cache: %s (ignoring, but reading @revX/bigfile will be slow)", f.path(), blk, frev.path(), st)
			}
		}()
	}

	// invalidate file/head/data[blk] in OS file cache.
	st := fsconn.FileNotify(f.Inode(), off, blksize)
	if st != fuse.OK {
		return syscall.Errno(st)
	}

	return nil
}
// invalidateAttr invalidates file attributes in kernel cache.
//
// It complements invalidateBlk and is used to invalidate file size.
// A FUSE notification failure is returned as syscall.Errno.
//
// called with zheadMu wlocked.
func (f *BigFile) invalidateAttr() (err error) {
	defer xerr.Contextf(&err, "%s: invalidate attr", f.path())

	// (-1, -1) -> metadata only
	if st := gfsconn.FileNotify(f.Inode(), -1, -1); st != fuse.OK {
		return syscall.Errno(st)
	}
	return nil
}
// lockRevFile makes sure that inode ID of /@<rev>/bigfile/<fid> is known to
// the kernel and won't change until unlock.
//
// We need the node ID to be known to the kernel when we need to store data
// into file's kernel cache - if the kernel doesn't have the node ID for the
// file in question, FileNotifyStoreCache will just fail.
//
// For the kernel to know the inode, lockRevFile opens the file via its OS
// path, which issues a regular filesystem lookup request that goes to the
// kernel and should come back to wcfs. It is thus not safe to use lockRevFile
// from under a FUSE request handler as doing so might deadlock.
//
// Caller must call unlock when inode ID is no longer required to be present.
// It is safe to simultaneously call multiple lockRevFile with the same arguments.
func (root *Root) lockRevFile(rev zodb.Tid, fid zodb.Oid) (_ *BigFile, unlock func(), err error) {
	relpath := fmt.Sprintf("@%s/bigfile/%s", rev, fid) // relative to fs root for now
	defer xerr.Contextf(&err, "/: lockRevFile %s", relpath)

	// FIXME checking for "node{0}" is fragile:
	// XXX the node could be still forgotten since we are not holding open on it
	// XXX -> always os.open unconditionally for now
	//        or is it ok since it is just a cache?
	//        -> no, not ok: if inode ID is forgotten, the same ID could be
	//           reallocated to another file and then we'll corrupt in-kernel
	//           cache by wrongly storing data of one file into cache of
	//           another file.
	//        -> to avoid this we need to always lock the inode ID with real open.
	// XXX (also disabled for now due to race-detector)
	/*
	// first check without going through kernel, whether the inode maybe known already
	xfrev := fsconn.LookupNode(root.Inode(), frevpath)
	if xfrev != nil {
		if xfrev.String() != "node{0}" {
			return xfrev.Node().(*BigFile), func(){}, nil
		}
	}
	*/

	// we have to ping the kernel: open the file by its path starting from OS /
	fh, err := os.Open(gmntpt + "/" + relpath)
	if err != nil {
		return nil, nil, err
	}

	// the inode must be !nil as open succeeded
	inode := gfsconn.LookupNode(root.Inode(), relpath)
	frev := inode.Node().(*BigFile)

	// keeping the file opened is what keeps the inode ID locked
	unlock = func() { fh.Close() }
	return frev, unlock, nil
}
// -------- 7) FUSE read(#blk) --------
// /(head|<rev>)/bigfile/<bigfileX> -> Read serves reading bigfile data.
//
// The request [off, off+len(dest)) is capped to file size and then widened
// to block boundaries. All covered blocks are read via readBlk in parallel
// into a freshly allocated buffer, and the originally requested part of that
// buffer is returned.
//
// The whole read runs with head.zheadMu rlocked.
func (f *BigFile) Read(_ nodefs.File, dest []byte, off int64, fctx *fuse.Context) (fuse.ReadResult, fuse.Status) {
	head := f.head
	head.zheadMu.RLock() // XXX +fctx to cancel
	defer head.zheadMu.RUnlock()

	// cap read request to file size
	end, ok := overflow.Add64(off, int64(len(dest)))
	if !ok {
		end = math.MaxInt64 // cap read request till max possible file size
	}
	if end > f.size {
		end = f.size
	}
	if end <= off {
		// the kernel issues e.g. [0 +4K) read for f.size=0 and expects to get (0, ok)
		// POSIX also says to return 0 if off >= f.size
		return fuse.ReadResultData(nil), fuse.OK
	}

	// widen read request to be aligned with blksize granularity
	// (we can load only whole ZBlk* blocks)
	blksize := f.blksize
	aoff := off - off%blksize
	aend := end
	if tail := end % blksize; tail != 0 {
		aend += blksize - tail
	}

	// XXX use original dest if it can fit the data
	buf := make([]byte, aend-aoff) // ~> [aoff:aend) in file

	// XXX better ctx = transaction.PutIntoContext(ctx, txn)
	ctx, cancel := xcontext.Merge(fctx, head.zconn.txnCtx)
	defer cancel()

	// read/load all block(s) in parallel
	wg := xsync.NewWorkGroup(ctx)
	for pos := aoff; pos < aend; pos += blksize {
		blk := pos / blksize
		chunk := buf[pos-aoff : pos-aoff+blksize] // where blk goes in buf
		wg.Go(func(ctx context.Context) error {
			return f.readBlk(ctx, blk, chunk)
		})
	}

	if err := wg.Wait(); err != nil {
		return nil, err2LogStatus(err)
	}

	return fuse.ReadResultData(buf[off-aoff : end-aoff]), fuse.OK
}
// readBlk serves Read to read 1 ZBlk #blk into destination buffer.
//
// If the block is already being loaded by someone else, readBlk waits for
// that load and copies its result. Otherwise it loads the block from ZODB
// itself, updates watchers (readPinWatchers), publishes the result to other
// waiters, and spawns uploadBlk to put the data into OS pagecache.
//
// see "7) when we receive a FUSE read(#blk) request ..." in overview.
//
// len(dest) == blksize.
// called with head.zheadMu rlocked.
func (f *BigFile) readBlk(ctx context.Context, blk int64, dest []byte) (err error) {
	defer xerr.Contextf(&err, "%s: readblk #%d", f.path(), blk)

	// check if someone else is already loading this block
	f.loadMu.Lock()
	loading, already := f.loading[blk]
	if !already {
		loading = &blkLoadState{
			ready: make(chan struct{}),
		}
		f.loading[blk] = loading
	}
	f.loadMu.Unlock()

	// if it is already loading - just wait for it
	if already {
		select {
		case <-ctx.Done():
			return ctx.Err()

		case <-loading.ready:
			if loading.err == nil {
				copy(dest, loading.blkdata) // XXX copy
			}
			return loading.err
		}
	}

	// noone was loading - we became responsible to load this block
	blkdata, treepath, zblk, blkrevMax, err := f.zfile.LoadBlk(ctx, blk)
	loading.blkdata = blkdata
	loading.err = err

	// data loaded with error - cleanup .loading
	// (ready is closed first, so that waiters are woken up and see loading.err)
	if loading.err != nil {
		close(loading.ready)

		f.loadMu.Lock()
		delete(f.loading, blk)
		f.loadMu.Unlock()

		return err
	}

	// we have the data - it can be used after watchers are updated
	// XXX should we use ctx here? (see readPinWatchers comments)
	f.readPinWatchers(ctx, blk, treepath, zblk, blkrevMax)

	// data can be used now
	close(loading.ready)
	copy(dest, blkdata) // XXX copy

	// store to kernel pagecache whole block that we've just loaded from database.
	// This way, even if the user currently requested to read only small portion from it,
	// it will prevent next e.g. consecutive user read request to again hit
	// the DB, and instead will be served by kernel from its pagecache.
	//
	// We cannot do this directly from reading goroutine - while reading
	// kernel FUSE is holding corresponding page in pagecache locked, and if
	// we would try to update that same page in pagecache it would result
	// in deadlock inside kernel.
	//
	// .loading cleanup is done once we are finished with putting the data into OS pagecache.
	// If we do it earlier - a simultaneous read covered by the same block could result
	// into missing both kernel pagecache (if not yet updated) and empty .loading[blk],
	// and thus would trigger DB access again.
	//
	// XXX if direct-io: don't touch pagecache
	// XXX upload parts only not covered by current read (not to e.g. wait for page lock)
	// XXX skip upload completely if read is wide to cover whole blksize
	go f.uploadBlk(blk, loading)

	return nil
}
// uploadBlk complements readBlk: it uploads loaded blkdata into OS cache.
//
// It is spawned by readBlk in its own goroutine after the block was loaded.
// The upload is skipped if f.loading[blk] is no longer the loading state we
// were given, i.e. if the block was invalidated in the meantime. After the
// upload the f.loading[blk] entry is removed.
//
// A failed upload is only logged. uploadBlk panics if f.loading[blk] was
// changed by someone else while the upload was in progress.
func (f *BigFile) uploadBlk(blk int64, loading *blkLoadState) {
	head := f.head

	// rlock zheadMu and make sure zwatcher is not asking us to pause.
	// if it does - wait for a safer time not to deadlock.
	// see notes.txt -> "Kernel locks page on read/cache store/..." for details.
retry:
	for {
		head.zheadMu.RLock()
		// help zwatcher if it asks us to pause uploadings, so it can
		// take zheadMu wlocked without deadlocks.
		if head.pauseOSCacheUpload {
			// the channel has to be read under the lock; it is waited on
			// with the lock released.
			ready := head.continueOSCacheUpload
			head.zheadMu.RUnlock()
			<-ready
			continue retry
		}

		break
	}

	// zheadMu rlocked.
	// zwatcher is not currently trying to pause OS cache uploads.

	// check if this block was already invalidated by zwatcher.
	// if so don't upload the block into OS cache.
	f.loadMu.Lock()
	loading_ := f.loading[blk]
	f.loadMu.Unlock()
	if loading != loading_ {
		head.zheadMu.RUnlock()
		return
	}

	oid := f.zfile.POid()

	// signal to zwatcher not to run while we are performing the upload.
	// upload with released zheadMu so that zwatcher can lock it even if to
	// check inflightOSCacheUploads status.
	atomic.AddInt32(&head.inflightOSCacheUploads, +1)
	head.zheadMu.RUnlock()

	st := gfsconn.FileNotifyStoreCache(f.Inode(), blk*f.blksize, loading.blkdata)

	// the upload is finished - drop the staging entry, but only if it is
	// still ours.
	f.loadMu.Lock()
	bug := (loading != f.loading[blk])
	if !bug {
		delete(f.loading, blk)
	}
	f.loadMu.Unlock()

	// signal to zwatcher that we are done and it can continue.
	atomic.AddInt32(&head.inflightOSCacheUploads, -1)

	if bug {
		panicf("BUG: bigfile %s: blk %d: f.loading mutated while uploading data to pagecache", oid, blk)
	}

	if st == fuse.OK {
		return
	}

	// pagecache update failed, but it must not (we verified on startup that
	// pagecache control is supported by kernel). We can correctly live on
	// with the error, but data access will be likely very slow. Tell user
	// about the problem.
	log.Errorf("BUG: bigfile %s: blk %d: -> pagecache: %s (ignoring, but reading from bigfile will be very slow)", oid, blk, st)
}
// -------- isolation protocol notification/serving --------
//
// (see "7.2) for all registered client@at watchers ...")
// _traceIso is the compile-time switch for traceIso.
const _traceIso = false

// traceIso logs a message on behalf of its caller if _traceIso is enabled.
func traceIso(format string, argv ...interface{}) {
	if _traceIso {
		msg := fmt.Sprintf(format, argv...)
		log.InfoDepth(1, msg)
	}
}
// pin makes sure that file[blk] on client side is the same as of @rev state.
//
// rev = zodb.TidMax means @head; otherwise rev must be ≤ w.at and there must
// be no rev_next changing file[blk]: rev < rev_next ≤ w.at.
//
// pin only adds watch link / file error context; the work is done by _pin.
//
// must be called with atMu rlocked.
//
// XXX error - when? or close watch on any error?
func (w *Watch) pin(ctx context.Context, blk int64, rev zodb.Tid) (err error) {
	defer xerr.Contextf(&err, "wlink%d: f<%s>", w.link.id, w.file.zfile.POid())

	err = w._pin(ctx, blk, rev)
	return err
}
// _pin implements pin without the watch link / file error context.
//
// It first waits for a previous or simultaneous pin of the same block, if
// any, to complete. If that pin was to the same rev, its result is returned
// and nothing is sent. Otherwise a "pin <foid> #<blk> @<rev>" request is sent
// over the watch link and an "ack" reply is expected.
//
// For rev = zodb.TidMax the request is sent with @head and the block is
// removed from w.pinned afterwards.
//
// It panics if rev > w.at (and rev != zodb.TidMax), or if w.pinned[blk] is
// found to be in an inconsistent state.
func (w *Watch) _pin(ctx context.Context, blk int64, rev zodb.Tid) (err error) {
	foid := w.file.zfile.POid()
	revstr := rev.String()
	if rev == zodb.TidMax {
		revstr = "head"
	}
	defer xerr.Contextf(&err, "pin #%d @%s", blk, revstr)

	if !(rev == zodb.TidMax || rev <= w.at) {
		panicf("f<%s>: wlink%d: pin #%d @%s: watch.at (%s) < rev",
			foid, w.link.id, blk, rev, w.at)
	}

	w.pinnedMu.Lock()

	// check/wait for previous/simultaneous pin.
	// (pin could be called simultaneously e.g. by setupWatch and readPinWatchers)
	for {
		blkpin := w.pinned[blk]
		if blkpin == nil {
			break
		}

		w.pinnedMu.Unlock()
		<-blkpin.ready // XXX + ctx ? (or just keep ready ?)

		if blkpin.rev == rev {
			// already pinned
			// (e.g. os cache for block was evicted and read called the second time)
			return blkpin.err
		}

		// relock the watch and check that w.pinned[blk] is the same. Retry if it is not.
		// ( w.pinned[blk] could have changed while w.pinnedMu was not held e.g. by XXX recheck
		// simultaneous setupWatch if we were called by readPinWatchers )
		w.pinnedMu.Lock()
		if blkpin == w.pinned[blk] {
			// a completed pin to @head must have been removed from
			// w.pinned by whoever performed it.
			if blkpin.rev == zodb.TidMax {
				w.pinnedMu.Unlock()
				panicf("f<%s>: wlink%d: pinned[#%d] = @head", foid, w.link.id, blk)
			}
			break
		}
	}

	// w.pinnedMu locked & previous pin is either nil or completed and its .rev != rev
	// -> setup new pin state
	blkpin := &blkPinState{rev: rev, ready: make(chan struct{})}
	w.pinned[blk] = blkpin

	// perform IO without w.pinnedMu
	w.pinnedMu.Unlock()
	ack, err := w.link.sendReq(ctx, fmt.Sprintf("pin %s #%d @%s", foid, blk, revstr))
	w.pinnedMu.Lock()

	// check IO reply & verify/signal blkpin is ready
	// (blkpin.err, if any, is set by the code below before this defer runs
	// and closes blkpin.ready)
	defer func() {
		if rev == zodb.TidMax {
			delete(w.pinned, blk)
		}

		w.pinnedMu.Unlock()
		close(blkpin.ready)
	}()

	if err != nil {
		blkpin.err = err
		return err
	}

	if ack != "ack" {
		blkpin.err = fmt.Errorf("expect %q; got %q", "ack", ack)
		return blkpin.err
	}

	if blkpin != w.pinned[blk] {
		blkpin.err = fmt.Errorf("BUG: pinned[#%d] mutated while doing IO", blk)
		panicf("f<%s>: wlink%d: %s", foid, w.link.id, blkpin.err)
	}

	return nil
}
// readPinWatchers complements readBlk: it sends `pin blk` for watchers of the file
// after a block was loaded from ZODB but before block data is returned to kernel.
//
// See "7.2) for all registered client@at watchers ..."
//
// Called with f.head.zheadMu rlocked.
//
// It is a no-op for files that are not under head/ (f.head.rev != 0).
// Otherwise the block is registered in δFtail and in f.accessed, and every
// watcher whose .at is older than the block revision gets pinned, in
// parallel, to the block revision as of that watcher's .at. blkrevMax is
// used only as a cheap upper bound for the block revision; the exact
// revision is queried from δFtail only when the bound is not conclusive.
//
// NOTE errors returned by δFtail.LastBlkRev are currently ignored, and a
// failed pin results in panic.
//
// XXX do we really need to use/propagate caller context here? ideally update
// watchers should be synchronous, and in practice we just use 30s timeout.
// Should a READ interrupt cause watch update failure? -> probably no
func (f *BigFile) readPinWatchers(ctx context.Context, blk int64, treepath []btree.LONode, zblk zBlk, blkrevMax zodb.Tid) {
// only head/ is being watched for
if f.head.rev != 0 {
return
}
// fmt.Printf("S: read #%d -> pin watchers (#%d)\n", blk, len(f.watchTab))
// update δFtail index XXX -> move upper into readBlk ?
// (δFtail is just for δZ -> δF invalidation handling and is needed without isolation protocol)
// XXX ^^^ no - also need to query to send pins
bfdir := f.head.bfdir
δFtail := bfdir.δFtail
bfdir.δFmu.Lock() // XXX locking correct? XXX -> better push down?
δFtail.Track(f, blk, treepath, zblk) // XXX pass in zblk.rev here?
f.accessed.Add(blk)
bfdir.δFmu.Unlock()
// make sure that file[blk] on clients side stays as of @w.at state.
// try to use blkrevMax only as the first cheap criteria to skip updating watchers.
// This is likely to be the case, since most watchers should be usually close to head.
// If using blkrevMax only turns out to be not sufficient, we'll
// consult δFtail, which might involve recomputing it.
blkrev := blkrevMax
blkrevRough := true
wg := xsync.NewWorkGroup(ctx)
f.watchMu.RLock()
for w := range f.watchTab {
// per-iteration copy of w for the goroutine spawned below
w := w
// make sure w.at stays unchanged while we prepare and pin the block
w.atMu.RLock()
// the block is already covered by @w.at database view
if blkrev <= w.at {
w.atMu.RUnlock()
continue
}
// if blkrev is rough estimation and that upper bound is > w.at
// we have to recompute ~exact file[blk] revision @head.
// (done at most once: the exact value is reused for the remaining watchers)
if blkrevRough {
// unlock atMu while we are (re-)calculating blkrev
// we'll relock atMu again and recheck blkrev vs w.at after.
w.atMu.RUnlock()
blkrev, _ = δFtail.LastBlkRev(ctx, f, blk, f.head.zconn.At())
blkrevRough = false
w.atMu.RLock()
if blkrev <= w.at {
w.atMu.RUnlock()
continue
}
}
// the block is newer - find out its revision as of @w.at and pin to that.
//
// We don't pin to w.at since if we would do so for several clients,
// and most of them would be on different w.at - cache of the file will
// be lost. Via pinning to particular block revision, we make sure the
// revision to pin is the same on all clients, and so file cache is shared.
pinrev, _ := δFtail.LastBlkRev(ctx, w.file, blk, w.at) // XXX move into go?
// XXX ^^^ w.file vs f ?
//fmt.Printf("S: read #%d: watch @%s: pin -> @%s\n", blk, w.at, pinrev)
// w.atMu.R taken above is handed over to the goroutine, which
// releases it only after the pin completes.
wg.Go(func(ctx context.Context) error {
defer w.atMu.RUnlock()
// XXX close watcher on any error
return w.pin(ctx, blk, pinrev)
})
}
f.watchMu.RUnlock()
// wait for all spawned pins to complete
err := wg.Wait()
if err != nil {
panic(err) // XXX
}
}
// setupWatch sets up or updates a Watch when client sends `watch <file> @<at>` request.
//
// at = zodb.InvalidTid means "remove the watch". Otherwise the call waits
// for zhead to reach at, registers (or updates) the watch for file foid on
// this link, and then sends pins for all previously accessed blocks that were
// changed in (at, head]; blocks that were pinned before but are no longer
// changed in that range are unpinned to @head.
//
// An error is returned if at is before δFtail coverage, if the file is not
// known to wcfs, if at is earlier than the current watch.at, or if sending
// pins fails.
//
// XXX sends "pin" notifications; final "ok" must be sent by caller.
//
// XXX called synchronously - only 1 setupWatch call at a time?
func (wlink *WatchLink) setupWatch(ctx context.Context, foid zodb.Oid, at zodb.Tid) (err error) {
defer xerr.Contextf(&err, "setup watch f<%s> @%s", foid, at)
head := wlink.head
bfdir := head.bfdir
// wait for zhead.At ≥ at
if at != zodb.InvalidTid {
err = head.zheadWait(ctx, at)
if err != nil {
return err
}
}
// make sure zhead.At stays unchanged while we are preparing the watch
// (see vvv e.g. about unpin to @head for why it is needed)
head.zheadMu.RLock()
defer head.zheadMu.RUnlock()
headAt := head.zconn.At()
// XXX δFtail locking? (or ForgetPast is called only with zheadMu.W ?)
// the requested state must still be covered by δFtail
if at != zodb.InvalidTid && at < bfdir.δFtail.Tail() {
return fmt.Errorf("too far away back from head/at (@%s); δt = %s",
headAt, headAt.Time().Sub(at.Time().Time))
}
wlink.byfileMu.Lock()
// if watch was already established - we need to update it
w := wlink.byfile[foid]
if w == nil {
// watch was not previously established - set it up anew
bfdir.fileMu.Lock()
f := bfdir.fileTab[foid]
bfdir.fileMu.Unlock()
if f == nil {
wlink.byfileMu.Unlock()
// by "isolation protocol" watch is setup after data file was opened
return fmt.Errorf("file not yet known to wcfs or is not a ZBigFile")
}
// NOTE the new watch is not yet registered anywhere - that
// happens below, after the checks pass.
w = &Watch{
link: wlink,
file: f,
at: at,
pinned: make(map[int64]*blkPinState),
}
}
f := w.file
f.watchMu.Lock()
// at="-" (InvalidTid) means "remove the watch"
if at == zodb.InvalidTid {
delete(wlink.byfile, foid)
delete(f.watchTab, w)
f.watchMu.Unlock()
wlink.byfileMu.Unlock()
return nil
}
// request exclusive access to the watch to change .at and compute pins.
// The lock will be downgraded from W to R after pins computation is done.
// Pins will be executed with atMu.R only - with the idea not to block
// other clients that read-access the file simultaneously to setupWatch.
w.atMu.Lock()
// check at >= w.at
// XXX we might want to allow going back in history if we need it.
if !(at >= w.at) {
w.atMu.Unlock()
f.watchMu.Unlock()
wlink.byfileMu.Unlock()
return fmt.Errorf("going back in history is forbidden")
}
// register w to f early, so that READs going in parallel to us
// preparing and processing initial pins, also send pins to w for read
// blocks. If we don't, we can miss to send pin to w for a freshly read
// block which could have revision > w.at: XXX test
//
// 1 3 2 4
// ─────.────x───o────x───x──────]──────────
// ↑ ↑
// w.at head
//
// Here blocks #1, #2 and #4 were previously accessed, are thus tracked
// by δFtail and are changed after w.at - they will be returned by vvv
// δFtail query and pin-sent to w. Block #3 was not yet accessed but
// was also changed after w.at . As head/file[#3] might be accessed
// simultaneously to watch setup, and f.readBlk will be checking
// f.watchTab; if w ∉ f.watchTab at that moment, w will miss to receive
// pin for #3.
//
// NOTE for `unpin blk` to -> @head we can be sure there won't be
// simultaneous `pin blk` request, because:
//
// - unpin means blk was previously pinned,
// - blk was pinned means it is tracked by δFtail,
// - if blk is tracked and δFtail says there is no δblk ∈ (at, head],
// there is indeed no blk change in that region,
// - which means that δblk with rev > w.at might be only > head,
// - but such δblk are processed with zhead wlocked and we keep zhead
// rlocked during pin setup.
//
// δ δ
// ----x----.------------]----x----
// ↑ ↑
// w.at head
//
// - also: there won't be simultaneous READs that would need to be
// unpinned, because we update w.at to requested at early.
w.at = at
f.watchTab[w] = struct{}{}
wlink.byfile[foid] = w
f.watchMu.Unlock()
wlink.byfileMu.Unlock()
// XXX defer -> unregister watch if error?
// pin all tracked file blocks that were changed in (at, head] range.
toPin := map[int64]zodb.Tid{} // blk -> @rev
δFtail := bfdir.δFtail
for _, δfile := range δFtail.SliceByFileRev(f, at, headAt) { // XXX locking δFtail
for blk := range δfile.Blocks {
// the pin revision depends only on (blk, at) - compute it once
_, already := toPin[blk]
if already {
continue
}
// blk might be in δFtail because it is adjacent in
// ZBigFile.blktab to another blk that was explicitly
// tracked. However wcfs tests expect that only blocks
// that were previously explicitly accessed are
// included into watch setup pins.
//
// XXX adjust wcfs tests to not require only accessed
// blocks to be in setup pins? But that would mean that
// potentially more blocks would be potentially
// _unnecessarily_ pinned if they are not going to be
// accessed at all.
if !f.accessed.Has(blk) {
continue
}
toPin[blk], _ = δFtail.LastBlkRev(ctx, f, blk, at) // XXX err
}
}
// if a block was previously pinned, but ∉ δ(at, head] -> unpin it to head.
for blk, pinPrev := range w.pinned {
// only 1 setupWatch can be run simultaneously for one file
// XXX assert pinPrev.rev != zodb.TidMax
pinNew, pinning := toPin[blk]
if !pinning {
toPin[blk] = zodb.TidMax // @head
}
// TODO don't bother to spawn .pin goroutines if pin revision is the same ?
// if pinNew == pinPrev.rev && ready(pinPrev.ready) && pinPrev.err == nil {
// delete(toPin, blk)
// }
// (pinPrev and pinNew are referenced only by the TODO above for now)
_ = pinPrev
_ = pinNew
}
// downgrade atMu.W -> atMu.R to let other clients to access the file.
// XXX there is no primitive to do Wlock->Rlock atomically, but we are
// ok with that since we prepared everything to handle simultaneous pins
// from other reads.
w.atMu.Unlock()
w.atMu.RLock()
defer w.atMu.RUnlock()
// send all pins in parallel and wait for them to complete
wg := xsync.NewWorkGroup(ctx)
for blk, rev := range toPin {
// per-iteration copies for the goroutine below
blk := blk
rev := rev
wg.Go(func(ctx context.Context) error {
return w._pin(ctx, blk, rev)
})
}
err = wg.Wait()
if err != nil {
return err
}
return nil
}
// Open serves /head/watch opens.
//
// Every open creates a new WatchLink with its own socket and id, registers
// the link in head.wlinkTab and spawns a goroutine to serve it. The file
// returned to the kernel is the client side of the link socket.
func (wnode *WatchNode) Open(flags uint32, fctx *fuse.Context) (nodefs.File, fuse.Status) {
	// XXX check flags?
	head := wnode.head

	sk := NewFileSock()
	id := atomic.AddInt32(&wnode.idNext, +1)
	wlink := &WatchLink{
		sk:     sk,
		id:     id,
		head:   head,
		byfile: make(map[zodb.Oid]*Watch),
		rxTab:  make(map[uint64]chan string),
	}

	// XXX del wlinkTab[w] on w.sk.File.Release
	head.wlinkMu.Lock()
	head.wlinkTab[wlink] = struct{}{}
	head.wlinkMu.Unlock()

	go wlink.serve()
	return wlink.sk.File(), fuse.OK
}
// serve serves client initiated watch requests and routes client replies to
// wcfs initiated pin requests.
//
// It runs _serve to completion, logs the error it returned, if any, and then
// removes the link from head.wlinkTab.
func (wlink *WatchLink) serve() {
	// XXX log error if !(close || EOF)
	if err := wlink._serve(); err != nil {
		log.Error(err)
	}

	h := wlink.head
	h.wlinkMu.Lock()
	delete(h.wlinkTab, wlink)
	h.wlinkMu.Unlock()
}
// _serve is the body of serve: it reads frames from the client line by line
// until the link is closed, "bye" is received, or an error occurs.
//
// A frame on an even stream is treated as a reply to wcfs-originated request
// and is delivered to the waiter registered in wlink.rxTab. On other streams
// "bye" terminates serving; any other message is handled by handleWatch in
// its own goroutine.
//
// On return all handlers are canceled and waited for, all watches created on
// this link are unregistered, the error (if any) is reported to the client
// on stream 0, and the tx side of the socket is closed.
func (wlink *WatchLink) _serve() (err error) {
defer xerr.Contextf(&err, "wlink %d: serve rx", wlink.id)
ctx0 := context.TODO() // XXX ctx = ? -> merge(ctx of wcfs running, ctx of wlink timeout)
ctx, cancel := context.WithCancel(ctx0)
wg := xsync.NewWorkGroup(ctx)
r := bufio.NewReader(xio.BindCtxR(wlink.sk, ctx))
defer func() {
// cancel all handlers on both error and ok return.
// ( ok return is e.g. when we received "bye", so if client
// sends "bye" and some pin handlers are in progress - they
// anyway don't need to wait for client replies anymore )
cancel()
err2 := wg.Wait()
if err == nil {
err = err2
}
// unregister all watches created on this wlink
wlink.byfileMu.Lock()
for _, w := range wlink.byfile {
w.file.watchMu.Lock()
delete(w.file.watchTab, w)
w.file.watchMu.Unlock()
}
wlink.byfile = nil
wlink.byfileMu.Unlock()
// write to peer if it was logical error on client side
// (ctx0 is used because ctx is already canceled at this point)
if err != nil {
_ = wlink.send(ctx0, 0, fmt.Sprintf("error: %s", err))
}
// close .sk.tx : this wakes up rx on client side.
err2 = wlink.sk.CloseWrite()
if err == nil {
err = err2
}
}()
// close .sk.rx on error/wcfs stopping or return: this wakes up read(sk).
// (retq is closed on return from _serve, before the cleanup defer above
// runs - deferred calls run in reverse order)
retq := make(chan struct{})
defer close(retq)
wg.Go(func(ctx context.Context) error {
// monitor is always canceled - either at parent ctx cancel, or
// upon return from serve (see "cancel all handlers ..." ^^^).
// If it was return - report returned error to wg.Wait, not "canceled".
<-ctx.Done()
e := ctx.Err()
select {
default:
case <-retq:
e = err // returned error
}
e2 := wlink.sk.CloseRead()
if e == nil {
e = e2
}
return e
})
// XXX recheck that it is safe to handle multiple simultaneous watch requests.
for {
// NOTE err is shadowed inside the loop; `return err` below still
// assigns it to the named result.
l, err := r.ReadString('\n') // XXX limit accepted line len to prevent DOS
if err != nil {
// r.Read is woken up by sk.CloseRead when serve decides to exit
if err == io.ErrClosedPipe || err == io.EOF {
err = nil
}
return err
}
traceIso("S: wlink%d: rx: %q\n", wlink.id, l)
stream, msg, err := parseWatchFrame(l)
if err != nil {
return err
}
// reply from client to wcfs
reply := (stream % 2 == 0)
if reply {
// hand the reply over to whoever registered for this stream;
// the registration is consumed.
wlink.rxMu.Lock()
rxq := wlink.rxTab[stream]
delete(wlink.rxTab, stream)
wlink.rxMu.Unlock()
if rxq == nil {
return fmt.Errorf("%d: reply on unexpected stream", stream)
}
rxq <- msg
continue
}
// client-initiated request
// bye TODO document in "Isolation protocol"
if msg == "bye" {
return nil // deferred sk.Close will wake-up rx on client side
}
// watch ...
wg.Go(func(ctx context.Context) error {
return wlink.handleWatch(ctx, stream, msg)
})
}
}
// handleWatch handles watch request from client.
//
// The outcome of the request is always reported back to the client on the
// same stream: "ok" on success, or "error <text>" if the request failed. A
// failed request is a logical error only - the watch link remains live. The
// error returned from handleWatch is thus only the error of sending the reply.
//
// returned error comes without full error prefix.
func (wlink *WatchLink) handleWatch(ctx context.Context, stream uint64, msg string) (err error) {
	defer xerr.Contextf(&err, "%d", stream)

	reply := "ok"
	if werr := wlink._handleWatch(ctx, msg); werr != nil {
		// logical error is reported back to client, but watch link remains live
		reply = fmt.Sprintf("error %s", werr)
	}

	return wlink.send(ctx, stream, reply)
}
// _handleWatch parses watch request msg and, if it is well-formed, sets up
// the requested watch. It returns parse or setup error, whichever comes first.
func (wlink *WatchLink) _handleWatch(ctx context.Context, msg string) error {
	foid, at, err := parseWatch(msg)
	if err == nil {
		err = wlink.setupWatch(ctx, foid, at)
	}
	return err
}
// sendReq sends wcfs-originated request to client and returns client response.
//
// A new stream id is allocated for the request (ids advance in steps of 2 and
// 0 is skipped), a reply queue is registered for it in wlink.rxTab, and the
// request is sent. sendReq then waits for either the reply, which is
// delivered by the serve loop, or ctx cancellation.
//
// It returns the reply, or an error if sending failed or ctx was canceled
// before the reply arrived. On error the reply queue is unregistered, so a
// reply that arrives afterwards is seen by the serve loop as a reply on an
// unexpected stream.
func (wlink *WatchLink) sendReq(ctx context.Context, req string) (reply string, err error) {
	// XXX err ctx

	var stream uint64
	for stream == 0 {
		stream = atomic.AddUint64(&wlink.reqNext, +2)
	}

	// cap=1 so that the serve loop never blocks on delivering the reply,
	// even if it took rxq out of rxTab right before we gave up waiting.
	rxq := make(chan string, 1)
	wlink.rxMu.Lock()
	wlink.rxTab[stream] = rxq // XXX assert .stream is not there?
	wlink.rxMu.Unlock()

	defer func() {
		if err == nil {
			return // the serve loop already removed rxq when delivering the reply
		}
		// nobody is going to receive from rxq anymore - do not leave it
		// registered. (no need to drain rxq: it is buffered)
		wlink.rxMu.Lock()
		delete(wlink.rxTab, stream)
		wlink.rxMu.Unlock()
	}()

	err = wlink.send(ctx, stream, req)
	if err != nil {
		return "", err
	}

	select {
	case <-ctx.Done():
		return "", ctx.Err()

	case reply = <-rxq:
		return reply, nil
	}
}
// send sends a message to client over specified stream ID.
//
// The message is framed as "<stream> <msg>\n" and written to the link socket.
//
// Multiple send can be called simultaneously; send serializes writes.
func (wlink *WatchLink) send(ctx context.Context, stream uint64, msg string) error {
	// XXX err ctx
	// XXX assert '\n' not in msg
	pkt := []byte(fmt.Sprintf("%d %s\n", stream, msg))

	// only one writer at a time
	wlink.txMu.Lock()
	defer wlink.txMu.Unlock()

	traceIso("S: wlink%d: tx: %q\n", wlink.id, pkt)
	if _, err := wlink.sk.Write(ctx, pkt); err != nil {
		return err
	}
	return nil
}
// ---- Lookup ----
// /(head|<rev>)/bigfile/ -> Lookup receives client request to create /(head|<rev>)/bigfile/<bigfileX>.
//
// It is a thin adapter around lookup: the found file, if any, is returned as
// inode, and the error is converted to (and logged as) fuse status.
func (bfdir *BigFileDir) Lookup(out *fuse.Attr, name string, fctx *fuse.Context) (*nodefs.Inode, fuse.Status) {
	f, err := bfdir.lookup(out, name, fctx)
	if f == nil {
		return nil, err2LogStatus(err)
	}

	inode := f.Inode()
	return inode, err2LogStatus(err)
}
// lookup returns BigFile corresponding to name under bfdir, opening it from
// ZODB and registering it in bfdir.fileTab if it is not there yet.
//
// name must be an oid; otherwise an EINVAL-style error is returned.
// On success attributes of the file are stored into out.
//
// The whole lookup runs with head.zheadMu read-locked.
func (bfdir *BigFileDir) lookup(out *fuse.Attr, name string, fctx *fuse.Context) (f *BigFile, err error) {
defer xerr.Contextf(&err, "%s: lookup %q", bfdir.path(), name)
oid, err := zodb.ParseOid(name)
if err != nil {
return nil, eINVALf("not oid")
}
bfdir.head.zheadMu.RLock() // XXX +fctx -> cancel
defer bfdir.head.zheadMu.RUnlock()
// fill-in out for whatever file is being returned
// (runs before zheadMu is released - deferred calls run in reverse order)
defer func() {
if f != nil {
f.getattr(out)
}
}()
// check to see if dir(oid) is already there
bfdir.fileMu.Lock()
f, already := bfdir.fileTab[oid]
bfdir.fileMu.Unlock()
if already {
return f, nil
}
// not there - without bfdir lock proceed to open BigFile from ZODB
f, err = bfdir.head.bigopen(fctx, oid)
if err != nil {
return nil, err
}
// relock bfdir and either register f or, if the file was maybe
// simultaneously created while we were not holding bfdir.fileMu, return that.
bfdir.fileMu.Lock()
f2, already := bfdir.fileTab[oid]
if already {
bfdir.fileMu.Unlock()
// somebody else was faster - drop our instance and use theirs
f.Close()
return f2, nil
}
bfdir.fileTab[oid] = f
bfdir.fileMu.Unlock()
// mkfile takes filesystem treeLock - do it outside bfdir.fileMu
mkfile(bfdir, name, f)
return f, nil
}
// / -> Lookup receives client request to create @<rev>/.
//
// It is a thin adapter around lookup: the found revision directory, if any,
// is returned as inode with its attributes stored into out, and the error is
// converted to (and logged as) fuse status.
func (root *Root) Lookup(out *fuse.Attr, name string, fctx *fuse.Context) (*nodefs.Inode, fuse.Status) {
	revd, err := root.lookup(name, fctx)
	if revd == nil {
		return nil, err2LogStatus(err)
	}

	inode := revd.Inode()
	_ = revd.GetAttr(out, nil, fctx) // always ok
	return inode, err2LogStatus(err)
}
// lookup returns Head corresponding to "@<rev>" name under root.
//
// If the directory for rev is not yet registered in root.revTab, a ZODB
// connection is opened at rev, and the @<rev>/ and @<rev>/bigfile/ nodes
// are created and registered.
//
// A name that is not of "@<rev>" form results in an EINVAL-style error.
func (root *Root) lookup(name string, fctx *fuse.Context) (_ *Head, err error) {
defer xerr.Contextf(&err, "/: lookup %q", name)
// name must be "@" followed by a valid tid
var rev zodb.Tid
ok := false
if strings.HasPrefix(name, "@") {
rev, err = zodb.ParseTid(name[1:])
ok = (err == nil)
}
if !ok {
return nil, eINVALf("not @rev")
}
// check to see if dir(rev) is already there
root.revMu.Lock()
revDir, already := root.revTab[rev]
root.revMu.Unlock()
if already {
// XXX race wrt simultaneous "FORGET @<rev>" ?
return revDir, nil
}
// not there - without revMu lock proceed to open @rev view of ZODB
// zconnRev, err := root.zopenAt(fctx, rev)
zconnRev, err := zopen(fctx, root.zdb, &zodb.ConnOptions{At: rev})
if err != nil {
return nil, err
}
// relock root and either register new revX/ directory or, if the
// directory was maybe simultaneously created while we were not holding
// revMu, return that.
root.revMu.Lock()
revDir, already = root.revTab[rev]
if already {
root.revMu.Unlock()
// somebody else was faster - the connection we have just opened
// is not needed: abort its transaction.
// zconnRev.Release()
transaction.Current(zconnRev.txnCtx).Abort()
return revDir, nil
}
revDir = &Head{
// XXX how to test forgets:
// echo 2 >/proc/sys/vm/drop_caches (root)
// mount -i -oremount $mntpt (root ?) (shrinks dcache)
// notify invalidate dentry from inside fs
fsNode: newFSNode(&fsOptions{Sticky: false}), // XXX + Head.OnForget() -> del root.revTab[]
rev: rev,
zconn: zconnRev, // XXX + Head.OnForget() -> release zconn (= abort zconn.txnCtx)
}
bfdir := &BigFileDir{
fsNode: newFSNode(&fsOptions{Sticky: false}), // XXX + BigFileDir.OnForget()
head: revDir,
fileTab: make(map[zodb.Oid]*BigFile),
δFtail: nil, // δFtail not needed/used for @revX/
}
revDir.bfdir = bfdir
root.revTab[rev] = revDir
root.revMu.Unlock()
// mkdir takes filesystem treeLock - do it outside revMu.
mkdir(root, name, revDir)
mkdir(revDir, "bigfile", bfdir)
// XXX + "at"
return revDir, nil
}
// bigopen opens BigFile corresponding to oid on head.zconn.
//
// A ZBigFile corresponding to oid is activated and statted.
//
// An EINVAL-style error is returned if there is no such object, no data for
// it, or the object is not a ZBigFile.
//
// For head/ (head.rev == 0) the file is additionally registered in δFtail
// and gets its accessed set and watch table initialized.
//
// head.zconn must be locked.
func (head *Head) bigopen(ctx context.Context, oid zodb.Oid) (_ *BigFile, err error) {
zconn := head.zconn
defer xerr.Contextf(&err, "bigopen %s @%s", oid, zconn.At())
// XXX better ctx = transaction.PutIntoContext(ctx, txn)
ctx, cancel := xcontext.Merge(ctx, zconn.txnCtx)
defer cancel()
xzfile, err := zconn.Get(ctx, oid)
if err != nil {
// "no such object"/"no data" are reported as invalid argument;
// everything else is passed through as is.
switch errors.Cause(err).(type) {
case *zodb.NoObjectError:
return nil, eINVAL(err)
case *zodb.NoDataError:
return nil, eINVAL(err) // XXX what to do if it was existing and got deleted?
default:
return nil, err
}
}
zfile, ok := xzfile.(*ZBigFile)
if !ok {
return nil, eINVALf("%s is not a ZBigFile", typeOf(xzfile))
}
// extract blksize, size and initial approximation for file revision
err = zfile.PActivate(ctx)
if err != nil {
return nil, err
}
blksize := zfile.blksize
// XXX it should be revision of both ZBigFile and its data. But we
// cannot get data revision without expensive scan of all ZBigFile's objects.
// -> approximate mtime initially with ZBigFile object mtime.
//
// XXX for @rev/... we can know initial mtime more exactly?
rev := zfile.PSerial()
zfile.PDeactivate()
// sizePath is passed to δFtail.Track below
size, sizePath, err := zfile.Size(ctx)
if err != nil {
return nil, err
}
f := &BigFile{
fsNode: newFSNode(&fsOptions{Sticky: false}), // XXX + BigFile.OnForget -> del .head.bfdir.fileTab[]
head: head,
zfile: zfile,
blksize: blksize,
size: size,
rev: rev,
loading: make(map[int64]*blkLoadState),
}
// only head/ needs δFtail, f.δtail and watches.
if head.rev == 0 {
head.bfdir.δFmu.Lock() // XXX locking ok?
head.bfdir.δFtail.Track(f, -1, sizePath, nil)
head.bfdir.δFmu.Unlock()
// FIXME: scan zfile.blktab - so that we can detect all btree changes
// see "XXX building δFtail lazily ..." in notes.txt
f.accessed = make(SetI64)
f.watchTab = make(map[*Watch]struct{})
}
return f, nil
}
// Close releases all resources of BigFile. XXX needed?
//
// Currently it only drops references to the ZBigFile and to the head the
// file belongs to. It always returns nil.
func (f *BigFile) Close() error {
	// XXX locking?
	f.zfile, f.head = nil, nil

	// f.zconn.Release()
	// f.zconn = nil

	return nil
}
// ---- misc ---
// /(head|<rev>)/at -> readAt serves read.
//
// The content is the textual form of the revision the head's ZODB connection
// is currently viewing; it is taken with zheadMu read-locked.
func (h *Head) readAt(fctx *fuse.Context) ([]byte, error) {
	// XXX cancel on fctx cancel
	h.zheadMu.RLock()
	defer h.zheadMu.RUnlock()

	at := h.zconn.At()
	return []byte(at.String()), nil
}
// /(head|<rev>)/ -> Getattr serves stat.
//
// The directory is reported as read-only, with mtime and ctime set to the
// time of its revision. For head/ (rev = 0) the revision is the one the
// head's ZODB connection is currently viewing.
func (head *Head) GetAttr(out *fuse.Attr, _ nodefs.File, fctx *fuse.Context) fuse.Status {
	var at zodb.Tid
	if head.rev != 0 {
		at = head.rev
	} else {
		head.zheadMu.RLock() // XXX +fctx -> cancel
		at = head.zconn.At()
		head.zheadMu.RUnlock()
	}

	mtime := at.Time().Time

	out.Mode = fuse.S_IFDIR | 0555
	out.SetTimes(/*atime=*/nil, /*mtime=*/&mtime, /*ctime=*/&mtime)
	return fuse.OK
}
// /(head|<rev>)/bigfile/<bigfileX> -> Getattr serves stat.
//
// Attributes are filled in by getattr with head.zheadMu read-locked.
func (f *BigFile) GetAttr(out *fuse.Attr, _ nodefs.File, fctx *fuse.Context) fuse.Status {
	head := f.head
	head.zheadMu.RLock() // XXX +fctx -> cancel
	defer head.zheadMu.RUnlock()

	f.getattr(out)
	return fuse.OK
}
// getattr fills out with attributes of the file: a read-only regular file
// with size and block size taken from f, and with mtime and ctime set to the
// time of f.rev.
func (f *BigFile) getattr(out *fuse.Attr) {
	mtime := f.rev.Time().Time

	out.Mode = fuse.S_IFREG | 0444
	out.Size = uint64(f.size)
	out.Blksize = uint32(f.blksize) // XXX 64 -> 32
	// .Blocks

	out.SetTimes(/*atime=*/nil, /*mtime=*/&mtime, /*ctime=*/&mtime)
}
// FIXME groot/gfsconn is tmp workaround for lack of way to retrieve FileSystemConnector from nodefs.Inode
// TODO:
// - Inode += .Mount() -> nodefs.Mount
// - Mount:
// .Root() -> root Inode of the fs
// .Connector() -> FileSystemConnector through which fs is mounted
//
// Both are assigned in _main after the filesystem has been mounted.
var groot *Root
var gfsconn *nodefs.FileSystemConnector
// root of the filesystem is mounted here.
//
// we need to talk to kernel and lookup @<rev>/bigfile/<fid> before uploading
// data to kernel cache there. Referencing root of the filesystem via path is
// vulnerable to bugs wrt e.g. `mount --move` and/or mounting something else
// over wcfs. However keeping opened root fd will prevent wcfs to be unmounted,
// so we still have to reference the root via path.
//
// Assigned in _main together with groot and gfsconn.
var gmntpt string
// gdebug holds state that is used only for debugging (protected by zhead.W).
var gdebug = struct {
	// .wcfs/zhead opens
	// protected by groot.head.zheadMu
	zheadSockTab map[*FileSock]struct{}
}{
	// the table starts empty and ready to be added to
	zheadSockTab: make(map[*FileSock]struct{}),
}
// _wcfs_Zhead serves .wcfs/zhead opens.
//
// It is a filesystem node; every open of it creates a new socket that is
// registered in gdebug.zheadSockTab.
type _wcfs_Zhead struct {
fsNode
}
// Open serves .wcfs/zhead opens.
//
// A new socket is created with its read side closed right away and is
// registered in gdebug.zheadSockTab under head.zheadMu write lock. The file
// side of the socket is returned to the kernel.
func (zh *_wcfs_Zhead) Open(flags uint32, fctx *fuse.Context) (nodefs.File, fuse.Status) {
	// XXX check flags?
	sk := NewFileSock()
	sk.CloseRead()

	head := groot.head
	head.zheadMu.Lock() // XXX +fctx -> cancel
	defer head.zheadMu.Unlock()

	// XXX del zheadSockTab[sk] on sk.File.Release (= client drops opened handle)
	gdebug.zheadSockTab[sk] = struct{}{}
	return sk.File(), fuse.OK
}
// TODO -> enable/disable fuse debugging dynamically (by write to .wcfs/debug ?)
// main is the program entry point: it configures logging, runs _main and
// terminates the process via log.Fatal if _main returned an error.
func main() {
	stdlog.SetPrefix("wcfs: ")
	//log.CopyStandardLogTo("WARNING") // XXX -> "DEBUG" if -d ?
	defer log.Flush()

	if err := _main(); err != nil {
		log.Fatal(err)
	}
}
// _main is the body of main: it parses the command line, opens ZODB storage
// at zurl, mounts the filesystem on mntpt, populates the root directory and
// serves the filesystem until it is unmounted.
//
// Usage: wcfs [OPTIONS] zurl mntpt
//
// The returned error describes why wcfs could not be started; the process
// exits with status 2 on command line usage error.
func _main() (err error) {
debug := flag.Bool("d", false, "debug")
autoexit := flag.Bool("autoexit", false, "automatically stop service when there is no client activity")
// XXX option to prevent starting if wcfs was already started/mounted on mntpt ?
// XXX do the check unconditionally?
flag.Parse()
if len(flag.Args()) != 2 {
fmt.Fprintf(os.Stderr, "Usage: %s [OPTIONS] zurl mntpt\n", os.Args[0])
os.Exit(2)
}
zurl := flag.Args()[0]
mntpt := flag.Args()[1]
// xclose closes c and merges close error, if any, into returned err.
xclose := func(c io.Closer) {
err = xerr.First(err, c.Close())
}
// debug -> precise t, no dates (XXX -> always precise t?)
if *debug {
stdlog.SetFlags(stdlog.Lmicroseconds)
}
log.Infof("start %q %q", mntpt, zurl)
gover := "(built with " + runtime.Version()
if race.Enabled {
gover += " -race"
}
gover += ")"
log.Info(gover)
// open zodb storage/watch/db/connection
ctx := context.Background() // XXX + timeout?
zstor, err := zodb.Open(ctx, zurl, &zodb.OpenOptions{
ReadOnly: true,
})
if err != nil {
return err
}
defer xclose(zstor)
// subscribe to storage events; at0 is used as the initial zhead.At below
zwatchq := make(chan zodb.Event)
at0 := zstor.AddWatch(zwatchq)
defer zstor.DelWatch(zwatchq)
zdb := zodb.NewDB(zstor)
defer xclose(zdb)
zhead, err := zopen(ctx, zdb, &zodb.ConnOptions{
At: at0,
// we need zhead.cache to be maintained across several transactions.
// see "3) for head/bigfile/* the following invariant is maintained ..."
NoPool: true,
})
if err != nil {
return err
}
zhead.Cache().Lock()
zhead.Cache().SetControl(&zodbCacheControl{})
zhead.Cache().Unlock()
// mount root + head/
head := &Head{
fsNode: newFSNode(fSticky),
rev: 0,
zconn: zhead,
wlinkTab: make(map[*WatchLink]struct{}),
hwait: make(map[hwaiter]struct{}),
}
wnode := &WatchNode{
fsNode: newFSNode(fSticky),
head: head,
}
bfdir := &BigFileDir{
fsNode: newFSNode(fSticky),
head: head,
fileTab: make(map[zodb.Oid]*BigFile),
δFtail: NewΔFtail(zhead.At(), zdb),
}
head.bfdir = bfdir
root := &Root{
fsNode: newFSNode(fSticky),
zstor: zstor,
zdb: zdb,
head: head,
revTab: make(map[zodb.Tid]*Head),
}
opts := &fuse.MountOptions{
FsName: zurl,
Name: "wcfs",
// We retrieve kernel cache in ZBlk.blksize chunks, which are 2MB in size.
// XXX currently go-fuse caps MaxWrite to 128KB.
// TODO -> teach go-fuse to handle Init.MaxPages (Linux 4.20+).
MaxWrite: 2*1024*1024,
// XXX tune MaxReadAhead? MaxBackground?
// OS cache that we populate with bigfile data is precious;
// we explicitly propagate ZODB invalidations into file invalidations.
ExplicitDataCacheControl: true,
DisableXAttrs: true, // we don't use
Debug: *debug,
}
fssrv, fsconn, err := mount(mntpt, root, opts)
if err != nil {
return err
}
groot = root // FIXME temp workaround (see ^^^)
gfsconn = fsconn // FIXME ----//----
gmntpt = mntpt
// we require proper pagecache control (added to Linux 2.6.36 in 2010)
kinit := fssrv.KernelSettings()
kfuse := fmt.Sprintf("kernel FUSE (API %d.%d)", kinit.Major, kinit.Minor)
supports := kinit.SupportsNotify
if !(supports(fuse.NOTIFY_STORE_CACHE) && supports(fuse.NOTIFY_RETRIEVE_CACHE)) {
return fmt.Errorf("%s does not support pagecache control", kfuse)
}
// make a bold warning if kernel does not support explicit cache invalidation
// (patch is in Linux 5.2+; see notes.txt -> "Notes on OS pagecache control")
if kinit.Flags & fuse.CAP_EXPLICIT_INVAL_DATA == 0 {
w1 := fmt.Sprintf("%s does not support explicit data cache invalidation", kfuse)
w2 := "-> performance will be AWFUL."
w3 := "-> you need kernel which includes git.kernel.org/linus/ad2ba64dd489."
w4 := "-> (Linux 5.2+, or nxd-fuse-dkms package installed from navytux.spb.ru/pkg)"
// the warning goes both to the log and to stderr
log.Error(w1); log.Error(w2); log.Error(w3); log.Error(w4)
fmt.Fprintf(os.Stderr, "W: wcfs: %s\nW: wcfs: %s\nW: wcfs: %s\nW: wcfs: %s\n", w1, w2, w3, w4)
}
// add entries to /
mkdir(root, "head", head)
mkdir(head, "bigfile", bfdir)
mkfile(head, "at", NewSmallFile(head.readAt)) // TODO mtime(at) = tidtime(at)
mkfile(head, "watch", wnode)
// for debugging/testing
_wcfs := newFSNode(fSticky)
mkdir(root, ".wcfs", &_wcfs)
mkfile(&_wcfs, "zurl", NewStaticFile([]byte(zurl)))
// .wcfs/zhead - special file channel that sends zhead.at.
//
// If a user opens it, it will start to get the tids through which
// zhead.at passed, starting from the time when .wcfs/zhead was opened.
// There can be multiple openers. Once opened, the file must be read,
// as wcfs blocks waiting for data to be read when processing
// invalidations.
mkfile(&_wcfs, "zhead", &_wcfs_Zhead{
fsNode: newFSNode(fSticky),
})
// TODO handle autoexit
// (exit when kernel forgets all our inodes - wcfs.py keeps .wcfs/zurl
// opened, so when all inodes has been forgotten - we know all wcfs.py clients exited)
_ = autoexit
// NOTE registered only here: errors returned above come without "serve ..." prefix
defer xerr.Contextf(&err, "serve %s %s", mntpt, zurl)
// spawn filesystem server.
//
// use `go serve` + `waitMount` not just `serve` - because waitMount
// cares to disable OS calling poll on us.
// ( if we don't disable polling - fs serving can get stuck - see
// https://github.com/hanwen/go-fuse/commit/4f10e248eb for details )
// serveCtx is canceled when fssrv.Serve returns
serveCtx, serveCancel := context.WithCancel(context.Background())
go func () {
defer serveCancel()
fssrv.Serve()
}()
err = fssrv.WaitMount()
if err != nil {
return err
}
// filesystem server is serving requests.
// run zwatcher and wait for it to complete.
// zwatcher completes either normally - due to filesystem unmount, or fails.
// if zwatcher fails - switch filesystem to return EIO instead of stale data.
err = root.zwatcher(serveCtx, zwatchq)
if errors.Cause(err) != context.Canceled {
log.Error(err)
log.Errorf("zwatcher failed -> switching filesystem to EIO mode")
// XXX switch fs to EIO mode
}
// wait for unmount
// XXX the kernel does not send FORGETs on unmount - release left node resources ourselves?
<-serveCtx.Done()
log.Infof("stop %q %q", mntpt, zurl)
return nil // XXX serveErr | zwatchErr ?
}
...@@ -68,7 +68,7 @@ type zBlk interface { ...@@ -68,7 +68,7 @@ type zBlk interface {
loadBlkData(ctx context.Context) (data []byte, rev zodb.Tid, _ error) loadBlkData(ctx context.Context) (data []byte, rev zodb.Tid, _ error)
// inΔFtail returns pointer to struct zblkInΔFtail embedded into this ZBlk. // inΔFtail returns pointer to struct zblkInΔFtail embedded into this ZBlk.
inΔFtail() *zblkInΔFtail // inΔFtail() *zblkInΔFtail
// XXX kill - in favour of inΔFtail // XXX kill - in favour of inΔFtail
/* /*
...@@ -140,7 +140,7 @@ func (zb *zBlkBase) blkBoundTo() map[*BigFile]SetI64 { ...@@ -140,7 +140,7 @@ func (zb *zBlkBase) blkBoundTo() map[*BigFile]SetI64 {
// ZBlk0 mimics ZBlk0 from python. // ZBlk0 mimics ZBlk0 from python.
type ZBlk0 struct { type ZBlk0 struct {
zblkInΔFtail // zblkInΔFtail
zodb.Persistent zodb.Persistent
// NOTE py source uses bytes(buf) but on python2 it still results in str // NOTE py source uses bytes(buf) but on python2 it still results in str
...@@ -211,7 +211,7 @@ func (zd *zDataState) PySetState(pystate interface{}) error { ...@@ -211,7 +211,7 @@ func (zd *zDataState) PySetState(pystate interface{}) error {
// ZBlk1 mimics ZBlk1 from python. // ZBlk1 mimics ZBlk1 from python.
type ZBlk1 struct { type ZBlk1 struct {
zblkInΔFtail // zblkInΔFtail
zodb.Persistent zodb.Persistent
chunktab *btree.IOBTree // {} offset -> ZData(chunk) chunktab *btree.IOBTree // {} offset -> ZData(chunk)
......
// Code generated by gen-set BigFile *BigFile; DO NOT EDIT.
// Copyright (C) 2015-2020 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
package main
// SetBigFile is a set of *BigFile.
//
// NOTE this file is generated by gen-set; lasting changes belong to the
// generator template.
type SetBigFile map[*BigFile]struct{}

// Add adds v to the set.
func (s SetBigFile) Add(v *BigFile) {
	s[v] = struct{}{}
}

// Del removes v from the set.
// it is noop if v was not in the set.
func (s SetBigFile) Del(v *BigFile) {
	delete(s, v)
}

// Has checks whether the set contains v.
func (s SetBigFile) Has(v *BigFile) bool {
	_, present := s[v]
	return present
}

// Update adds t values to s.
func (s SetBigFile) Update(t SetBigFile) {
	for v := range t {
		s[v] = struct{}{}
	}
}

// Elements returns all elements of set as slice.
// The order of elements in the result is unspecified.
func (s SetBigFile) Elements() []*BigFile {
	ev := make([]*BigFile, 0, len(s))
	for e := range s {
		ev = append(ev, e)
	}
	return ev
}
// Copyright (C) 2019-2020 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
// it under the terms of the GNU General Public License version 3, or (at your
// option) any later version, as published by the Free Software Foundation.
//
// You can also Link and Combine this program with other software covered by
// the terms of any of the Free Software licenses or any of the Open Source
// Initiative approved licenses and Convey the resulting work. Corresponding
// source of such a combination shall include the source code for all other
// software used.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the implied
// warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
// See COPYING file for full licensing terms.
// See https://www.nexedi.com/licensing for rationale and options.
package main
//go:generate ./gen-set main I64 int64 zset_i64.go
//go:generate ./gen-set main BigFile *BigFile zset_bigfile.go
import (
"context"
"runtime"
"sync"
"lab.nexedi.com/kirr/go123/xerr"
"lab.nexedi.com/kirr/neo/go/zodb"
"lab.nexedi.com/kirr/neo/go/zodb/btree"
)
// ΔFtail is the tail of revisional changes to files.
//
// Semantically it is
//
//	[]δF ; rev ∈ (tail, head]
//
// where every δF describes one change in files space:
//
//	δF:
//		.rev↑
//		{} file -> {}blk
//
// Presence is guaranteed only for files and blocks that were explicitly
// requested to be tracked. In particular a block that was never requested to
// be tracked may be absent from δF even if it was changed in δZ.
//
// Operations provided by ΔFtail:
//
//	.Track(file, blk, path, zblk)	- add file and block reached via BTree path to tracked set.
//
//	.Update(δZ) -> δF		- update files δ tail given raw ZODB changes
//	.ForgetPast(revCut)		- forget changes past revCut
//	.SliceByRev(lo, hi) -> []δF	- query for all files changes with rev ∈ (lo, hi]
//	.SliceByFileRev(file, lo, hi) -> []δfile - query for changes of file with rev ∈ (lo, hi]
//	.LastBlkRev(file, #blk, at)	- query for what is last revision that changed
//					  file[#blk] as of @at database state.
//
// XXX δfile:
//
//	.rev↑
//	[]blk
//
// XXX concurrent use
//
// See also zodb.ΔTail
type ΔFtail struct {
	// ΔFtail merges btree.ΔTail with history of ZBlk
	δBtail *ΔBtail

	// which files are served by which BTree root.
	fileIdx map[zodb.Oid]SetBigFile // tree-root -> {} BigFile  XXX as of @head?

	// accumulated δF changes. It is accurate only for the part of the
	// tracked set that has already been taken into account.
	vδF []ΔF

	// tracked ZBlk that are not yet reflected in current vδF.
	// Entries are added by new track requests and are flushed by queries
	// and by update.
	trackNew map[*BigFile]map[zodb.Oid]*zblkInΔFtail // {} file -> {} oid -> zblk
}
// ΔF is one change in files space: the set of per-file changes that belong
// to a single revision.
type ΔF struct {
	Rev    zodb.Tid            // revision of the change
	ByFile map[*BigFile]*ΔFile // file -> δfile
}
// ΔFile is a change to a single file made by one revision.
type ΔFile struct {
	Rev    zodb.Tid // revision of the change
	Blocks SetI64   // changed blocks
	Size   bool     // whether file size changed
}
// zblkInΔFtail is the ΔFtail-owned state that is embedded into ZBlk*.
//
// This state is transient: it is _not_ part of the persistent state of the
// object.
type zblkInΔFtail struct {
	// mu is used only for binding, to support multiple loaders.
	mu sync.Mutex

	// infile records which files/blocks this ZBlk is associated with.
	// XXX as of @head state?
	infile map[*BigFile]SetI64 // {} file -> set(#blk)
}
// inΔFtail returns z itself.
//
// NOTE(review): presumably this lets a ZBlk* type that embeds zblkInΔFtail
// provide the inΔFtail() accessor used by ΔFtail - confirm against the zBlk
// definition.
func (z *zblkInΔFtail) inΔFtail() *zblkInΔFtail {
	return z
}
// NewΔFtail returns a fresh ΔFtail.
//
// The new object tracks nothing yet, and its coverage is (at₀, at₀].
// at0 and db are handed over to NewΔBtail as is.
//
// XXX db
func NewΔFtail(at0 zodb.Tid, db *zodb.DB) *ΔFtail {
	δFtail := new(ΔFtail)
	δFtail.δBtail = NewΔBtail(at0, db)
	δFtail.fileIdx = map[zodb.Oid]SetBigFile{}
	δFtail.trackNew = map[*BigFile]map[zodb.Oid]*zblkInΔFtail{}
	return δFtail
}
// Head returns the upper, inclusive, bound of the (tail, head] coverage.
//
// The value is taken from the underlying δBtail.
func (δFtail *ΔFtail) Head() zodb.Tid {
	return δFtail.δBtail.Head()
}
// Tail returns the lower, exclusive, bound of the (tail, head] coverage.
//
// The value is taken from the underlying δBtail.
func (δFtail *ΔFtail) Tail() zodb.Tid {
	return δFtail.δBtail.Tail()
}
// Track binds file[blk] to the BTree path it was reached through and to the
// zblk object found there.
//
// A nil zblk denotes a hole.
// XXX blk=-1 is used for tracking after Size (no zblk is accessed at all).
//
// XXX Track adds tree path to tracked set and associates path root with file.
//
// XXX text
//
// One root may be associated with several files - each of them supplied by
// its own Track call.
//
// Track panics if the underlying δBtail.Track returns an error.
func (δFtail *ΔFtail) Track(file *BigFile, blk int64, path []btree.LONode, zblk zBlk) {
	// XXX blk = ∞ from beginning ?
	if blk == -1 {
		blk = KeyMax
	}

	if err := δFtail.δBtail.Track(blk, zblk != nil, path); err != nil {
		panic(err) // XXX -> error? errctx
	}

	// remember that file is served by the tree rooted at path[0]
	rootOid := path[0].(*btree.LOBTree).POid()
	rootFiles, known := δFtail.fileIdx[rootOid]
	if !known {
		rootFiles = SetBigFile{}
		δFtail.fileIdx[rootOid] = rootFiles
	}
	rootFiles.Add(file)

	// a hole has no zblk to associate with file
	if zblk == nil {
		return
	}

	// record file[blk] in zblk; bound tells whether zblk already knew file
	z := zblk.inΔFtail()
	z.mu.Lock()
	blkSet, bound := z.infile[file]
	if !bound {
		if z.infile == nil {
			z.infile = make(map[*BigFile]SetI64)
		}
		blkSet = make(SetI64, 1)
		z.infile[file] = blkSet
	}
	blkSet.Add(blk)
	z.mu.Unlock()

	if bound {
		return
	}

	// zblk was not associated with this file before: queue it, so that
	// vδF is brought up to date on the next update.
	// XXX locking
	pending := δFtail.trackNew[file]
	if pending == nil {
		pending = make(map[zodb.Oid]*zblkInΔFtail, 1)
		δFtail.trackNew[file] = pending
	}
	pending[zblk.POid()] = z

	// XXX mark something dirty so that LastBlkRev and Slice* know what to rebuild?

	// XXX debug
	/*
		leaf := path[len(path)-1].(*btree.LOBucket)
		for _, e := range leaf.Entryv() { // XXX activate
			δFtail.tracked.Add(e.Key())
		}
	*/
}
// Update feeds raw ZODB changes δZ into δFtail.
//
// The returned ΔF is the change in files space that corresponds to δZ; it is
// also appended to the accumulated history.
//
// δZ should include all objects changed by ZODB transaction.
//
// Zhead must be active connection at δFtail.Head() database state.
// Objects in Zhead must not be modified.
// During call to Update zhead must not be otherwise used - even for reading.
//
// Update panics if a changed BTree root has no associated file, or if a
// ZBigFile object itself was changed.
func (δFtail *ΔFtail) Update(δZ *zodb.EventCommit, zhead *ZConn) (_ ΔF, err error) {
	defer xerr.Contextf(&err, "ΔFtail update %s -> %s", δFtail.Head(), δZ.Tid)

	// XXX δFtail.update() first?
	// XXX verify zhead.At() == δFtail.Head()

	δB, err := δFtail.δBtail.Update(δZ)
	if err != nil {
		return ΔF{}, err
	}

	δF := ΔF{Rev: δB.Rev, ByFile: make(map[*BigFile]*ΔFile)}

	// δfileOf returns δF.ByFile[file], creating an empty entry on first use.
	δfileOf := func(file *BigFile) *ΔFile {
		δfile, have := δF.ByFile[file]
		if !have {
			δfile = &ΔFile{Rev: δF.Rev, Blocks: make(SetI64)}
			δF.ByFile[file] = δfile
		}
		return δfile
	}

	// 1) btree changes
	for root, δt := range δB.ByRoot {
		files := δFtail.fileIdx[root]
		if len(files) == 0 {
			panicf("ΔFtail: root<%s> -> ø file", root)
		}

		for file := range files {
			δfile := δfileOf(file)

			// FIXME stub - need to take both keys and zblk changes into account
			// XXX document, and in particular how to include atTail
			for blk /*, zblk*/ := range δt {
				δfile.Blocks.Add(blk)
			}

			// TODO invalidate .size only if key >= maxkey was changed (size increase),
			// or if on the other hand maxkey was deleted (size decrease).
			//
			// XXX currently we invalidate size on any topology change.
			δfile.Size = true
		}
	}

	// 2) zblk changes
	for _, oid := range δZ.Changev {
		// XXX cache lock/unlock
		obj := zhead.Cache().Get(oid)
		if obj == nil {
			//fmt.Printf("%s: not in cache\n", oid)
			continue // nothing to do - see invariant
		}
		//fmt.Printf("%s: in cache (%s)\n", oid, typeOf(obj))

		if zblk, isZBlk := obj.(zBlk); isZBlk { // ZBlk*
			// z.infile locking: since we write-locked head.zheadMu
			// - no other fuse reads are running, and thus no one
			// is mutating z.infile. XXX recheck
			z := zblk.inΔFtail()
			for file, blocks := range z.infile {
				δfileOf(file).Blocks.Update(blocks)
			}
			// XXX update z.infile according to btree changes
		} else if _, isZBigFile := obj.(*ZBigFile); isZBigFile {
			// XXX check that .blksize and .blktab (it is only
			// persistent reference) do not change.
			// XXX shutdown fs with ^^^ message.
			panic("ZBigFile changed")
		}

		// make sure obj won't be garbage-collected until we finish handling it.
		runtime.KeepAlive(obj)
	}

	δFtail.vδF = append(δFtail.vδF, δF)
	return δF, nil
}
// update processes new track requests and updates vδF.
//
// If file != nil only track requests related to file are processed.
// Otherwise all track requests are processed (TODO: not implemented yet -
// update panics with "TODO" when file is nil).
//
// The pending requests for file are removed from trackNew. Then, for every
// revision in δZtail that changed one of the pending ZBlk, the blocks that
// the ZBlk is bound to for file (z.infile[file]) are merged into that
// revision's δfile, which is created if it was absent.
//
// vδF[i] and δZtail.Data()[i] must describe the same revision. update
// verifies this and panics with a descriptive message when the two
// histories are out of sync, instead of failing with a bare index error or
// silently attributing block changes to a wrong revision.
func (δFtail *ΔFtail) update(file *BigFile) {
	if file == nil {
		panic("TODO")
	}

	// let's see if we need to rebuild .vδF due to not-yet processed track requests

	// XXX locking
	// XXX dumb
	zt, dirty := δFtail.trackNew[file]
	if !dirty {
		return
	}
	delete(δFtail.trackNew, file)
	// XXX unlock here

	vδF := δFtail.vδF
	for i, δZ := range δFtail.δBtail.δZtail.Data() {
		// vδF and δZtail are walked in lockstep: check that they really
		// are aligned before touching vδF[i].
		if i >= len(vδF) {
			panic("ΔFtail.update: vδF is shorter than δZtail")
		}
		δF := vδF[i]
		if δF.Rev != δZ.Rev {
			panic("ΔFtail.update: vδF and δZtail revisions are out of sync")
		}

		for _, oid := range δZ.Changev {
			z, ok := zt[oid]
			if !ok {
				continue // this object is not among pending ZBlk of file
			}

			// XXX locking
			// XXX -> func δF.δfile(file) ?
			δfile, ok := δF.ByFile[file]
			if !ok {
				δfile = &ΔFile{Rev: δF.Rev, Blocks: make(SetI64)}
				δF.ByFile[file] = δfile // δF is a copy, but ByFile map is shared with vδF[i]
			}

			δfile.Blocks.Update(z.infile[file])
		}
	}
}
// ForgetPast is meant to drop every δFtail entry whose rev ≤ revCut.
//
// TODO: not implemented yet - currently it always panics with "TODO".
func (δFtail *ΔFtail) ForgetPast(revCut zodb.Tid) {
	panic("TODO")
}
// SliceByRev is meant to return all files changes with rev ∈ (lo, hi].
//
// The (lo, hi] request is first passed to δassertSlice (presumably to check
// it against the (tail, head] coverage - confirm against δassertSlice).
//
// TODO: not implemented yet - after that check it always panics with "TODO".
//
// XXX
func (δFtail *ΔFtail) SliceByRev(lo, hi zodb.Tid) /*readonly*/ []ΔF {
	δassertSlice(δFtail, lo, hi)
	panic("TODO")
}
// SliceByFileRev returns the history of changes to file with rev ∈ (lo, hi].
//
// The following condition must hold on call:
//
//	tail ≤ lo ≤ hi ≤ head
//
// The returned slice is read-only for the caller. It is nil when there is
// no change to file in the requested range.
//
// Pending track requests for file are processed (via update) before the
// history is scanned.
//
// Note: contrary to regular go slicing, low is exclusive while high is inclusive.
func (δFtail *ΔFtail) SliceByFileRev(file *BigFile, lo, hi zodb.Tid) /*readonly*/ []*ΔFile {
	δassertSlice(δFtail, lo, hi)

	// XXX locking?
	δFtail.update(file)

	history := δFtail.vδF
	if len(history) == 0 {
		return nil
	}

	// last = max index with .Rev ≤ hi	XXX linear scan -> binary search
	last := len(history) - 1
	for last >= 0 && history[last].Rev > hi {
		last--
	}
	if last < 0 {
		return nil // ø
	}

	// first = min index, at or below last+1, such that everything in
	// [first, last] has .Rev > lo	XXX linear scan -> binary search
	first := last + 1
	for first > 0 && history[first-1].Rev > lo {
		first--
	}

	// keep only the file-related bits of the found changes
	var vδfile []*ΔFile
	for k := first; k <= last; k++ {
		if δfile, ok := history[k].ByFile[file]; ok {
			vδfile = append(vδfile, δfile)
		}
	}

	// XXX merge into vδF zblk from not yet handled tracked part

	return vδfile

	// Design notes (not implemented):
	//
	// merging tree (δT) and Zblk (δZblk) histories into file history (δFile):

	// δT	────────·──────────────·─────────────────·────────────
	//		        │              │
	//		        ↓              │
	// δZblk₁ ────────────────o───────────────────o─────────────────
	//		                       |
	//		                       ↓
	// δZblk₂ ────────────x────────────────x────────────────────────
	//
	//
	// δFile  ────────o───────o──────x─────x────────────────────────

	/*
		vδZ := δFtail.δBtail.δZtail.SliceByRev(lo, hi)

		// XXX stub that takes only ZBlk changes into account
		// XXX dumb
		for _, δZ := range vδZ {
		}
	*/

	/*
		// XXX activate zfile?
		vδT := δFtail.δBtail.SliceByRootRev(file.zfile.blktab, lo, hi)

		// state of `{} blk -> zblk` as we are scanning ↓
		δblktab := map[int64]struct {
			zblk   zodb.Oid // blk points to this zblk
			lo, hi zodb.Tid // blk points to zblk during [lo, hi)
		}{}

		iz := len(vδZ) - 1
		it := len(vδT) - 1
		for (iz >= 0 && it >= 0) { // XXX -> ||
			δZ := vδZ[iz]
			δT := vδT[it]
			if δZ.Rev >= δT.Rev {
				for _, oid := range δZ.Changev {
					// XXX oid -> tracked ZBlk?
					// ZBlk -> bound to {}blk @head
					for blk := range boundToAtHead {
						if !δblktab.Has(blk) {
							δblktab[blk] = oid
						}
					}
				}
			}

			if δT.Rev >= δZ.Rev {
				...
			}
		}
	*/
}
// XXX rename -> BlkRevAt

// LastBlkRev returns the last revision that changed file[blk] as of @at
// database state.
//
// When exact is false the returned revision is only an upper bound for the
// last block revision.
//
// Requirements:
//
//   - f must be from head/
//   - at must ∈ (tail, head]	XXX [tail ?
//   - blk must be tracked
//
// LastBlkRev panics if the underlying δBtail.Get returns an error.
//
// XXX +ctx, error	rebuild []δF here
func (δFtail *ΔFtail) LastBlkRev(ctx context.Context, f *BigFile, blk int64, at zodb.Tid) (_ zodb.Tid, exact bool) {
	//defer xerr.Contextf(&err, "")	// XXX text

	// XXX assert δFtail == f.head.bfdir.δFtail ?

	// XXX tabRev -> treeRev ?
	// XXX activate zfile?
	zblkOid, found, tabRev, tabRevExact, err := δFtail.δBtail.Get(ctx, f.zfile.blktab, blk, at)
	if err != nil {
		panic(err)
	}

	// start from what the tree history says. This is also the final
	// answer when the block was removed.
	// XXX or not in tracked set?
	rev, revExact := tabRev, tabRevExact

	if found {
		// blktab[blk] was changed to point to a zblk @rev.
		// blk revision is max of that rev and of when zblk changed
		// last in (rev, at] range.
		//
		// XXX need to use full δZ, not only connected to tracked subset?
		zblkRev, zblkRevExact := δFtail.δBtail.δZtail.LastRevOf(zblkOid, at)
		if zblkRev > tabRev {
			rev, revExact = zblkRev, zblkRevExact
		}
	}

	return rev, revExact
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment