From 0a2cd4a5fcbfefcad52d7a4fad7d1ec92e01d4d5 Mon Sep 17 00:00:00 2001
From: Kirill Smelkov <kirr@nexedi.com>
Date: Mon, 16 Sep 2024 13:26:26 +0300
Subject: [PATCH] wcfs: tests: Add test to exercies faulty client that does not
 reply to pin triggered by readPinWatchers

Levin writes:

    This patch extends the test scope of 'test_wcfs_pintimeout_kill'. Before
    this patch, the test only ensured that a client that does not
    respond to pin requests during the initial watch request [1] is
    killed. Now it also tests that a faulty client is killed when a block
    is invalidated. Since there are no other situations where the WCFS
    server sends pin requests to a client, the tests now cover all situations
    where a faulty client might not respond. This patch therefore aims to
    increase the security that WCFS is not blocked by a faulty client.

    [1] See https://lab.nexedi.com/nexedi/wendelin.core/-/merge_requests/18

Preliminary history:

    https://lab.nexedi.com/levin.zimmermann/wendelin.core/-/commit/9d42efff

Co-authored-by: Levin Zimmermann <levin.zimmermann@nexedi.com>
---
 wcfs/wcfs_faultyprot_test.py | 66 ++++++++++++++++++++++++++++++++++--
 1 file changed, 64 insertions(+), 2 deletions(-)

diff --git a/wcfs/wcfs_faultyprot_test.py b/wcfs/wcfs_faultyprot_test.py
index c30c0aff..b104c890 100644
--- a/wcfs/wcfs_faultyprot_test.py
+++ b/wcfs/wcfs_faultyprot_test.py
@@ -214,7 +214,10 @@ class tFaultyClient:
 # ---- tests ----
 
 
-# verify that wcfs kills slow/faulty client who does not reply to pin in time.
+# verify that wcfs kills slow/faulty client who does not reply to pin
+# notifications in time during watch setup.
+#
+# This verifies setupWatch codepath.
 
 @func
 def _bad_watch_no_pin_reply(ctx, f, at):
@@ -242,7 +245,7 @@ def _bad_watch_no_pin_reply(ctx, f, at):
 
 @xfail  # protection against faulty/slow clients
 @func
-def test_wcfs_pintimeout_kill(with_prompt_pintimeout):
+def test_wcfs_pintimeout_kill_on_watch(with_prompt_pintimeout):
     t = tDB(multiproc=True); zf = t.zfile
     defer(t.close)
 
@@ -268,6 +271,65 @@ def test_wcfs_pintimeout_kill(with_prompt_pintimeout):
     assert p.exitcode is not None
 
 
+# verify that wcfs kills slow/faulty client who does not reply to pin
+# notifications in time caused by asynchronous read access.
+#
+# This verifies readPinWatchers codepath.
+
+@func
+def _bad_pinh_no_pin_reply(ctx, f, at):
+    wl = wcfs.WatchLink(f.wc)   ; defer(wl.close)
+
+    # initial watch setup goes ok
+    _ = wl.sendReq(ctx, b"watch %s @%s" % (h(f.zfile_oid), h(at)))
+    assert _ == b"ok",  _
+    f.cout.send("f: watch setup ok")
+
+    # wait for "pin ..." due to read access in the parent
+    req = wl.recvReq(ctx)
+    assert req is not None
+    f.cout.send(req.msg)
+
+    # sleep > wcfs pin timeout - wcfs must kill us
+    f.assertKilled(ctx, "wcfs did not kill stuck client")
+
+@xfail  # protection against faulty/slow clients
+@func
+def test_wcfs_pintimeout_kill_on_access(with_prompt_pintimeout):
+    t = tDB(multiproc=True); zf = t.zfile; at0=t.at0
+    defer(t.close)
+
+    at1 = t.commit(zf, {2:'c1'})
+    at2 = t.commit(zf, {2:'c2'})
+    f = t.open(zf)
+    f.assertData(['','','c2'])
+
+    # issue our watch request - it should be served well
+    wl = t.openwatch()
+    wl.watch(zf, at1, {2:at1})
+
+    # spawn faulty client and wait until it setups its watch
+    p = tFaultySubProcess(t, _bad_pinh_no_pin_reply, at=at2)
+    defer(p.close)
+    assert p.recv(t.ctx) == "f: watch setup ok"
+
+    # commit new transaction and issue read access to modified block
+    # our read should be served well even though faulty client is stuck.
+    # As the result the faulty client should be killed by wcfs.
+    at3 = t.commit(zf, {1:'b3'})
+    wg = sync.WorkGroup(t.ctx)
+    def _(ctx):
+        f.assertBlk(1, 'b3', {wl: {1:at0}}, timeout=2*t.pintimeout)
+    wg.go(_)
+    def _(ctx):
+        assert p.recv(ctx) == b"pin %s #%d @%s" % (h(zf._p_oid), 1, h(at0))
+    wg.go(_)
+    wg.wait()
+
+    p.join(t.ctx)
+    assert p.exitcode is not None
+
+
 # assertKilled assert that the current process becomes killed after time goes after pintimeout.
 @func(tFaultyClient)
 def assertKilled(f, ctx, failmsg):
-- 
2.30.9