Commit e64f0e0b authored by Kirill Smelkov

wcfs: Switch filesystem to EIO mode on zwatcher failure

Currently a zwatcher failure leads to wcfs serving stale data instead of
up-to-date data. Fix that by detecting zwatcher failures and explicitly
switching the filesystem to a mode where any access to anything returns
"input/output error".

Zwatcher can fail on e.g. failure to retrieve transactions from ZODB
storage or any other failure. With this patch we make sure this does not
go unnoticed.
parent 323be34a
// Copyright (C) 2018-2021 Nexedi SA and Contributors. // Copyright (C) 2018-2024 Nexedi SA and Contributors.
// Kirill Smelkov <kirr@nexedi.com> // Kirill Smelkov <kirr@nexedi.com>
// //
// This program is free software: you can Use, Study, Modify and Redistribute // This program is free software: you can Use, Study, Modify and Redistribute
...@@ -428,6 +428,13 @@ func (f *skFile) Release() { ...@@ -428,6 +428,13 @@ func (f *skFile) Release() {
} }
// fatalEIO switches filesystem into EIO mode and terminates the program.
//
// It does not return to the caller.
func fatalEIO() {
	// The switch is implemented by exiting: log.Fatal terminates the
	// program, and after that any attempt to access the was-mounted
	// filesystem starts to return ENOTCONN.
	const msg = "switching filesystem to EIO mode"
	log.Fatal(msg)
}
// ---- parsing ---- // ---- parsing ----
// parseWatchFrame parses line going through /head/watch into (stream, msg) // parseWatchFrame parses line going through /head/watch into (stream, msg)
......
...@@ -2560,8 +2560,8 @@ func _main() (err error) { ...@@ -2560,8 +2560,8 @@ func _main() (err error) {
err = root.zwatcher(serveCtx, zwatchq) err = root.zwatcher(serveCtx, zwatchq)
if errors.Cause(err) != context.Canceled { if errors.Cause(err) != context.Canceled {
log.Error(err) log.Error(err)
log.Errorf("zwatcher failed -> switching filesystem to EIO mode (TODO)") log.Error("zwatcher failed")
// TODO: switch fs to EIO mode fatalEIO()
} }
// wait for unmount // wait for unmount
......
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
# Copyright (C) 2018-2022 Nexedi SA and Contributors. # Copyright (C) 2018-2024 Nexedi SA and Contributors.
# Kirill Smelkov <kirr@nexedi.com> # Kirill Smelkov <kirr@nexedi.com>
# #
# This program is free software: you can Use, Study, Modify and Redistribute # This program is free software: you can Use, Study, Modify and Redistribute
...@@ -44,7 +44,7 @@ import sys, os, os.path, subprocess ...@@ -44,7 +44,7 @@ import sys, os, os.path, subprocess
import six import six
from six.moves._thread import get_ident as gettid from six.moves._thread import get_ident as gettid
from time import gmtime from time import gmtime
from errno import EINVAL, ENOTCONN from errno import EINVAL, ENOTCONN, ECONNABORTED
from resource import setrlimit, getrlimit, RLIMIT_MEMLOCK from resource import setrlimit, getrlimit, RLIMIT_MEMLOCK
from golang import go, chan, select, func, defer, error, b from golang import go, chan, select, func, defer, error, b
from golang import context, errors, sync, time from golang import context, errors, sync, time
...@@ -1824,6 +1824,46 @@ def test_wcfs_watch_2files(): ...@@ -1824,6 +1824,46 @@ def test_wcfs_watch_2files():
# ---------------------------------------- # ----------------------------------------
# verify that wcfs switches to EIO mode after zwatcher failure.
# in EIO mode accessing anything on the filesystem returns ENOTCONN error.
@func
def test_wcfs_eio_after_zwatcher_fail(capfd):
    t = tDB(); zf = t.zfile

    # on exit: closing the test database is itself expected to fail with
    # ENOTCONN. The errno check sits after the `with` block because
    # exc.value is only available once raises() has caught the exception.
    def _():
        with raises(IOError) as exc:
            t.close()
        assert exc.value.errno == ENOTCONN
    defer(_)

    # instead of simulating e.g. ZODB server failure we utilize the fact that
    # currently zwatcher fails when there is ZBigFile epoch
    t.wc._stat("head/bigfile/%s" % h(zf._p_oid))  # wcfs starts to track zf
    zf.blksize += 1
    with raises(IOError) as exc:
        t.commit()
    # transaction.commit goes ok, but reading from already opened .wcfs/zhead
    # after commit returns ECONNABORTED
    assert exc.value.errno == ECONNABORTED

    # inspect what was captured on stderr. A separate name is used so that
    # the deferred helper `_` above is not rebound.
    out = capfd.readouterr()
    assert not ready(t._wcfuseaborted)  # wcfs might have been killed on overall test timeout
    assert "test timed out" not in out.err
    assert "aborting wcfs fuse connection to unblock" not in out.err
    assert "zwatcher failed" in out.err
    assert "switching filesystem to EIO mode" in out.err

    # verify that accessing any file returns ENOTCONN after the switch
    def checkeio(path):
        with raises(IOError) as exc:
            t.wc._read(path)
        assert exc.value.errno == ENOTCONN

    checkeio(".wcfs/zurl")
    checkeio("head/at")
    checkeio("head/bigfile/%s" % h(zf._p_oid))
    checkeio("anything")
# verify that wcfs does not panic with "no current transaction" / "at out of # verify that wcfs does not panic with "no current transaction" / "at out of
# bounds" on read/invalidate/watch codepaths. # bounds" on read/invalidate/watch codepaths.
@func @func
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment