wcfs_test: Ensure faulty client is killed @readPinWatchers

This patch extends the test scope of 'test_wcfs_pintimeout_kill'. Before this patch, the test only ensured that a client that does not respond to pin requests during the initial watch request [1] is killed. Now it also tests that a faulty client is killed when a block is invalidated. Since there are no other situations where the WCFS server sends pin requests to a client, the tests now cover all situations where a faulty client might not respond. This patch therefore aims to increase the security that WCFS is not blocked by a faulty client. [1] See nexedi/wendelin.core!18

wcfs_test: Ensure faulty client is killed @readPinWatchers
This patch extends the test scope of 'test_wcfs_pintimeout_kill'. Before this patch, the test only ensured that a client that does not respond to pin requests during the initial watch request [1] is killed. Now it also tests that a faulty client is killed when a block is invalidated. Since there are no other situations where the WCFS server sends pin requests to a client, the tests now cover all situations where a faulty client might not respond. This patch therefore aims to increase the security that WCFS is not blocked by a faulty client. [1] See nexedi/wendelin.core!18
9d42efff · Levin Zimmermann · 0f4bc2d4 · 9d42efff
Commit 9d42efff authored Aug 29, 2024 by Levin Zimmermann
Show whitespace changes
Inline Side-by-side

Showing with 149 additions and 69 deletions

wcfs/wcfs_test.py wcfs/wcfs_test.py +149 -69

No files found.
--- a/wcfs/wcfs_test.py
+++ b/wcfs/wcfs_test.py
@@ -773,8 +773,34 @@ class tFile:
        blkview = t._blk(blk)
        assert t.cached()[blk] == cached
-        def _(ctx, ev):
+        ev = doCheckingPin(lambda ctx, ev: t._triggerPin(ctx, ev, blk, cached), pinokByWLink, pinfunc)
+        # XXX hack - wlinks are notified and emit events simultaneously - we
+        # check only that events begin and end with read pre/post and that pins
+        # are inside (i.e. read is stuck until pins are acknowledged).
+        # Better do explicit check in tracetest style.
+        assert ev[0]  == 'read pre', ev
+        assert ev[-1] == 'read ' + dataok[0], ev
+        ev = ev[1:-1]
+        if not shouldPin:
+            assert ev == []
+        else:
+            assert 'pin rx' in ev
+            assert 'pin ack pre' in ev
+        assert t.cached()[blk] > 0
+        # verify full data of the block
+        # TODO(?) assert individually for every block's page? (easier debugging?)
+        assert blkview.tobytes() == dataok
+        # we just accessed the block in full - it has to be in OS cache completely
+        assert t.cached()[blk] == 1
+    # 'triggerPin' triggers server pin requests by reading 'blk'.
+    def _triggerPin(t, ctx, ev, blk, cached):
        assert t.cached()[blk] == cached
+        blkview = t._blk(blk)
        ev.append('read pre')
        # access data with released GIL so that the thread that reads data from
@@ -808,30 +834,6 @@ class tFile:
        b = _rx
        ev.append('read ' + b)
-        ev = doCheckingPin(_, pinokByWLink, pinfunc)
-        # XXX hack - wlinks are notified and emit events simultaneously - we
-        # check only that events begin and end with read pre/post and that pins
-        # are inside (i.e. read is stuck until pins are acknowledged).
-        # Better do explicit check in tracetest style.
-        assert ev[0]  == 'read pre', ev
-        assert ev[-1] == 'read ' + dataok[0], ev
-        ev = ev[1:-1]
-        if not shouldPin:
-            assert ev == []
-        else:
-            assert 'pin rx' in ev
-            assert 'pin ack pre' in ev
-        assert t.cached()[blk] > 0
-        # verify full data of the block
-        # TODO(?) assert individually for every block's page? (easier debugging?)
-        assert blkview.tobytes() == dataok
-        # we just accessed the block in full - it has to be in OS cache completely
-        assert t.cached()[blk] == 1
    # assertData asserts that file has data blocks as specified.
    #
@@ -1453,59 +1455,137 @@ def test_wcfs_watch_going_back():
 # TODO extend tests to also cover situation that a non-faulty
 # client continues to be served ok
-# TODO explicitly cover readPinWatchers behaviour with tests
 # verify that wcfs kills slow/faulty client who does not reply to pin in time.
 @func
 def test_wcfs_pintimeout_kill():
-    # adjusted wcfs timeout to kill client who is stuck not providing pin reply
+    # WCFS server sends pin requests in two different situations:
-    # timeout until killing is 30 seconds, we add 2 seconds delay for the actual
+    # (1) during the initial watch request of the client. If the client wants
-    # killing/process shutdown.
+    #     to watch blocks older than head, the server sends pin requests for
-    tkill = 32*time.second
+    #     these blocks.
-    t = tDB(timeout=tkill*2); zf = t.zfile
+    _test_wcfs_pintimeout_kill_at_initial_watch_request()
-    defer(t.close)
+    # (2) After a block has been updated and a client still watches at
+    #     a pre-update state, the server needs to send pin requests.
+    _test_wcfs_pintimeout_kill_at_read_pin_watchers()
-    at1 = t.commit(zf, {2:'c1'})
-    at2 = t.commit(zf, {2:'c2'})
+@func
-    f = t.open(zf)
+def _test_wcfs_pintimeout_kill_at_initial_watch_request():
+    t = _tPinTimeout(); defer(t.close)
+    at1 = t.DB.commit(t.zf, {2:'c1'})
+    at2 = t.DB.commit(t.zf, {2:'c2'})
+    f = t.DB.open(t.zf)
    f.assertData(['','','c2'])
-    # move into subprocess to avoid killing testing process, in
+    @t
-    # case test is successful
    def test(err):
-        try:
+        ctx, _ = context.with_timeout(context.background(), 2*t.kill)
-            _test(err)
+        wl = t.DB.openwatch()
-        except Exception:
-            if not err.value:
-                err.value = "unexpected error in test code"
-    def _test(err):
-        ctx, _ = context.with_timeout(context.background(), 2*tkill)
-        wl = t.openwatch()
        wg = sync.WorkGroup(ctx)
        def _(ctx):
            # send watch. The pin handler won't be replying -> we should never get reply here.
-            wl.sendReq(ctx, b"watch %s @%s" % (h(zf._p_oid), h(at1)))
+            wl.sendReq(ctx, b"watch %s @%s" % (h(t.zf._p_oid), h(at1)))
-            err.value= "watch request completed (should not as pin handler is stuck)"
+            err.value = "watch request completed (should not as pin handler is stuck)"
        wg.go(_)
        def _(ctx):
+            # server must reply with pin request, as we want to watch OID@at1,
+            # but head is already @at2.
            req = wl.recvReq(ctx)
            assert req is not None
-            assert req.msg == b"pin %s #%d @%s" % (h(zf._p_oid), 2, h(at1))
+            assert req.msg == b"pin %s #%d @%s" % (h(t.zf._p_oid), 2, h(at1))
+            # sleep > wcfs pin timeout - wcfs must kill us
+            _, _rx = select(
+                ctx.done().recv,         # 0
+                time.after(t.kill).recv, # 1
+            )
+            if _ == 0:
+                raise ctx.err()
+            err.value = "wcfs did not kill stuck client"
+        wg.go(_)
+        wg.wait()
+@func
+def _test_wcfs_pintimeout_kill_at_read_pin_watchers():
+    t = _tPinTimeout(); defer(t.close)
+    blk = 2
+    at1 = t.DB.commit(t.zf, {blk:'c1'})
+    f = t.DB.open(t.zf)
+    f.assertData(['','','c1'])
+    @t
+    def test(err):
+        ctx, _ = context.with_timeout(context.background(), 2*t.kill)
+        wl = t.DB.openwatch()
+        wg = sync.WorkGroup(ctx)
+        # Initial watch request:
+        # Client sends OID watch request @at1. As there isn't any newer
+        # data than 'at1' for OID, the client can watch @head & server doesn't
+        # need to pin anything to client
+        def _(ctx):
+            wl.sendReq(ctx, b"watch %s @%s" % (h(t.zf._p_oid), h(at1)))
+        wg.go(_)
+        wg.wait()
+        # Trigger readPinWatchers by overridding blk => client needs to
+        # pin blk to at1, because head is now @at2.
+        at2 = t.DB.commit(t.zf, {blk:'c2'})
+        blkview = f._blk(blk)
+        catched = f.cached()[blk]
+        wg = sync.WorkGroup(ctx)
+        wg.go(lambda ctx: f._triggerPin(ctx, [], blk, catched))
+        def _(ctx):
+            # Client receives now pin request by server, but doesn't respond in order
+            # to trigger server to kill client.
+            req = wl.recvReq(ctx)
+            assert req is not None
+            assert req.msg == b"pin %s #%d @%s" % (h(t.zf._p_oid), 2, h(at1))
            # sleep > wcfs pin timeout - wcfs must kill us
            _, _rx = select(
                ctx.done().recv,         # 0
-                time.after(tkill).recv, # 1
+                time.after(t.kill).recv, # 1
            )
            if _ == 0:
                raise ctx.err()
-            err.value= "wcfs did not kill stuck client"
+            err.value = "wcfs did not kill stuck client"
        wg.go(_)
        wg.wait()
+# '_tPinTimeout' provides common utilities for pin timeout tests
+class _tPinTimeout(object):
+    def __init__(t):
+        # adjusted wcfs timeout to kill client who is stuck not providing pin reply
+        # timeout until killing is 30 seconds, we add 2 seconds delay for the actual
+        # killing/process shutdown.
+        t.kill = 32*time.second
+        t.DB = tDB(timeout=t.kill*2)
+        t.zf = t.DB.zfile
+    def close(t):
+        t.DB.close()
+    # Executes test code in subprocess, to avoid killing testing
+    # process, in case test is successful & faulty client is killed.
+    def __call__(t, test):
+        def _(err):
+            try:
+                test(err)
+            except Exception:
+                if not err.value:
+                    err.value = "unexpected error in test code"
        err = Value((ctypes.c_char_p if six.PY2 else ctypes.c_wchar_p), "")
-    p = Process(target=test, args=(err,))
+        p = Process(target=_, args=(err,))
        p.start()
        p.join()
        if err.value: