Commit ebfe164e authored by Kirill Smelkov's avatar Kirill Smelkov

wcfs: Switch filesystem to EIO mode on zwatcher failure

Currently a zwatcher failure leads to wcfs serving stale data instead of
up-to-date data. Fix that by detecting zwatcher failures and explicitly
switching the filesystem to a mode where any access to anything returns
"input/output error".

Zwatcher can fail, for example, when it cannot retrieve transactions from
the ZODB storage, or because of any other error. With this patch we make
sure such a failure does not go unnoticed.
parent 323be34a
// Copyright (C) 2018-2021 Nexedi SA and Contributors.
// Copyright (C) 2018-2024 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com>
//
// This program is free software: you can Use, Study, Modify and Redistribute
......@@ -428,6 +428,13 @@ func (f *skFile) Release() {
}
// fatalEIO switches filesystem into EIO mode and terminates the program.
//
// It takes no arguments and never returns to the caller: the "switch" is
// implemented by logging a fatal message, which ends the wcfs process.
// Note that clients of the was-mounted filesystem then observe ENOTCONN,
// not a literal EIO errno.
func fatalEIO() {
// log.Fatal terminates the program and so any attempt to access
// was-mounted filesystem starts to return ENOTCONN
log.Fatal("switching filesystem to EIO mode")
}
// ---- parsing ----
// parseWatchFrame parses line going through /head/watch into (stream, msg)
......
......@@ -2560,8 +2560,8 @@ func _main() (err error) {
err = root.zwatcher(serveCtx, zwatchq)
if errors.Cause(err) != context.Canceled {
log.Error(err)
log.Errorf("zwatcher failed -> switching filesystem to EIO mode (TODO)")
// TODO: switch fs to EIO mode
log.Error("zwatcher failed")
fatalEIO()
}
// wait for unmount
......
# -*- coding: utf-8 -*-
# Copyright (C) 2018-2022 Nexedi SA and Contributors.
# Copyright (C) 2018-2024 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com>
#
# This program is free software: you can Use, Study, Modify and Redistribute
......@@ -54,7 +54,7 @@ from pytest import raises, fail
from wendelin.wcfs.internal import io, mm
from wendelin.wcfs.internal.wcfs_test import _tWCFS, read_exfault_nogil, SegmentationFault, install_sigbus_trap, fadvise_dontneed
from wendelin.wcfs.client._wcfs import _tpywlinkwrite as _twlinkwrite
from wendelin.wcfs import _is_mountpoint as is_mountpoint, _procwait as procwait, _ready as ready, _rmdir_ifexists as rmdir_ifexists
from wendelin.wcfs import _is_mountpoint as is_mountpoint, _procwait as procwait, _waitfor as waitfor, _ready as ready, _rmdir_ifexists as rmdir_ifexists
# setup:
......@@ -1824,6 +1824,68 @@ def test_wcfs_watch_2files():
# ----------------------------------------
# verify that wcfs switches to EIO mode after zwatcher failure.
# in EIO mode accessing anything on the filesystem returns ENOTCONN error.
#
# capfd is the pytest fixture that captures output written to file
# descriptors 1 and 2; it is used to inspect what wcfs logged to stderr.
@func
def test_wcfs_eio_after_zwatcher_fail(capfd):
    # we will use low-level tWCFS instead of tDB for precise control of access
    # to the filesystem. For example tDB keeps open connection to .wcfs/zhead
    # and inspects it during commit which can break in various ways on switch
    # to EIO mode. Do all needed actions by hand to avoid unneeded uncertainty.

    # create ZBigFile
    root = testdb.dbopen()
    def _():
        dbclose(root)
    defer(_)
    root['zfile'] = zf = ZBigFile(blksize)
    transaction.commit()

    # start wcfs
    t = tWCFS()
    def _():
        # after the switch to EIO mode closing tWCFS is itself expected to
        # fail with ENOTCONN.
        with raises(IOError) as exc:
            t.close()
        assert exc.value.errno == ENOTCONN
    defer(_)
    t.wc._stat("head/bigfile/%s" % h(zf._p_oid))  # wcfs starts to track zf

    # instead of simulating e.g. ZODB server failure we utilize the fact that
    # currently zwatcher fails when there is ZBigFile epoch
    zf.blksize += 1
    transaction.commit()

    # on new transaction with ZBigFile epoch wcfs should switch to EIO when it
    # learns about that transaction
    def _stat_failed():
        # report True as soon as stat on the filesystem starts to fail.
        # Catch Exception, not everything: a bare except would also swallow
        # KeyboardInterrupt/SystemExit and misreport them as the EIO switch.
        try:
            t.wc._stat("head")
        except Exception:
            return True
        return False
    waitfor(timeout(), _stat_failed)

    # verify that it was indeed the switch to EIO
    captured = capfd.readouterr()
    assert not ready(t._wcfuseaborted)  # wcfs might have been killed on overall test timeout
    assert "test timed out" not in captured.err
    assert "aborting wcfs fuse connection to unblock" not in captured.err
    assert "zwatcher failed" in captured.err
    assert "switching filesystem to EIO mode" in captured.err

    # verify that accessing any file returns ENOTCONN after the switch
    def checkeio(path):
        with raises(IOError) as exc:
            t.wc._read(path)
        assert exc.value.errno == ENOTCONN

    checkeio(".wcfs/zurl")
    checkeio("head/at")
    checkeio("head/bigfile/%s" % h(zf._p_oid))
    checkeio("anything")
# verify that wcfs does not panic with "no current transaction" / "at out of
# bounds" on read/invalidate/watch codepaths.
@func
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment