Commit 7b0c301c authored by Kirill Smelkov

X wcfs: tests: Fix tFile.assertBlk not to segfault on a test failure

See added comments to wcfs_test.py for details on how that can happen.
Fixes test segmentation faults like the following:

    $ WENDELIN_CORE_TEST_DB="<zeo>" python -m pytest -vs -k test_wcfs_watch_vs_access

    wcfs_test.py::test_wcfs_watch_vs_access
    ------------------------------- live log setup --------------------------------
    INFO     ZEO.ClientStorage:ClientStorage.py:263 ('localhost', 20106) ClientStorage (pid=36942) created RW/normal for storage: '1'
    INFO     ZEO.cache:cache.py:217 created temporary cache file '<fdopen>'
    INFO     ZEO.ClientStorage:ClientStorage.py:574 ('localhost', 20106) Testing connection <ManagedClientConnection ('127.0.0.1', 20106)>
    INFO     ZEO.zrpc.Connection('C'):connection.py:365 (127.0.0.1:20106) received handshake 'Z4'
    INFO     ZEO.ClientStorage:ClientStorage.py:580 ('localhost', 20106) Server authentication protocol None
    INFO     ZEO.ClientStorage:ClientStorage.py:640 ('localhost', 20106) Connected to storage: ('localhost', 20106)
    INFO     ZEO.ClientStorage:ClientStorage.py:1326 ('localhost', 20106) No verification necessary -- empty cache
    INFO     ZEO.ClientStorage:ClientStorage.py:728 ('localhost', 20106) Disconnected from storage: "('localhost', 20106)"
    -------------------------------- live log call --------------------------------
    INFO     ZEO.ClientStorage:ClientStorage.py:263 ('localhost', 20106) ClientStorage (pid=36942) created RW/normal for storage: '1'
    INFO     ZEO.cache:cache.py:217 created temporary cache file '<fdopen>'
    INFO     ZEO.ClientStorage:ClientStorage.py:574 ('localhost', 20106) Testing connection <ManagedClientConnection ('127.0.0.1', 20106)>
    INFO     ZEO.zrpc.Connection('C'):connection.py:365 (127.0.0.1:20106) received handshake 'Z4'
    INFO     ZEO.ClientStorage:ClientStorage.py:580 ('localhost', 20106) Server authentication protocol None
    INFO     ZEO.ClientStorage:ClientStorage.py:640 ('localhost', 20106) Connected to storage: ('localhost', 20106)
    INFO     ZEO.ClientStorage:ClientStorage.py:1326 ('localhost', 20106) No verification necessary -- empty cache
    INFO     root:__init__.py:294 wcfs: starting for zeo://localhost:20106 ...
    wcfs: 2021/08/13 02:27:40 zodb: FIXME: open zeo://localhost:20106: raw cache is not ready for invalidations -> NoCache forced
    INFO     root:__init__.py:335 wcfs: started pid37431 @ /dev/shm/wcfs/e7630c831aeed36692d06459de5a25a745eb9d76

    M: commit -> @at0 (03e2107fabf002ee)

    M: commit -> @at1 (03e2107fac097466)
    M:      f<0000000000000002>     [2]

    M: commit -> @at2 (03e2107fac3df2aa)
    M:      f<0000000000000002>     [2, 3, 5]

    M: commit -> @at3 (03e2107fac5ef011)
    M:      f<0000000000000002>     [2, 5]

    C: setup watch f<0000000000000002> @at3 (03e2107fac5ef011)
    #  pinok: {}

    C: setup watch f<0000000000000002> @at3 (03e2107fac5ef011)
    #  pinok: {}

    C: setup watch f<0000000000000002> @at2 (03e2107fac3df2aa)
    #  pinok: {2: @at2 (03e2107fac3df2aa)}

    M: commit -> @at4 (03e2107face33c77)
    M:      f<0000000000000002>     [2, 5, 6]

    >>> Change history by file:

    f<0000000000000002>:
                                    0 1 2 3 4 5 6 7
                                    a b c d e f g h
            @at0 (03e2107fabf002ee)
            @at1 (03e2107fac097466)     2
            @at2 (03e2107fac3df2aa)     2 3   5
            @at3 (03e2107fac5ef011)     2     5
            @at4 (03e2107face33c77)     2     5 6

    INFO     ZEO.ClientStorage:ClientStorage.py:728 ('localhost', 20106) Disconnected from storage: "('localhost', 20106)"
    INFO     root:__init__.py:401 wcfs: unmount/stop wcfs pid37431 @ /dev/shm/wcfs/e7630c831aeed36692d06459de5a25a745eb9d76
    WARNING  root:__init__.py:548 fuse_unmount /dev/shm/wcfs/e7630c831aeed36692d06459de5a25a745eb9d76: failed: fusermount: failed to unmount /dev/shm/wcfs/e7630c831aeed36692d06459de5a25a745eb9d76: Device or resource busy
    WARNING  root:__init__.py:533 # lsof /dev/shm/wcfs/e7630c831aeed36692d06459de5a25a745eb9d76
    WARNING  root:__init__.py:541
    WARNING  root:__init__.py:543 (lsof failed)
    WARNING  root:__init__.py:461 -> kill -TERM wcfs.go ...
    WARNING  root:__init__.py:464 -> abort FUSE connection ...
    Segmentation fault: read @00007f6e36bfe000
    /srv/slapgrid/slappart91/srv/runner/software/3335682bae677c2d474f9244e578f64b/parts/wendelin.core/wcfs/client/./../../bigfile/liblibvirtmem.so(dump_traceback+0x1b)[0x7f6f80844e4b]
    /srv/slapgrid/slappart91/srv/runner/software/3335682bae677c2d474f9244e578f64b/parts/wendelin.core/wcfs/client/./../../bigfile/liblibvirtmem.so(+0x3956)[0x7f6f80841956]
    /lib/x86_64-linux-gnu/libpthread.so.0(+0x12730)[0x7f6f83117730]
    /srv/slapgrid/slappart91/srv/runner/software/3335682bae677c2d474f9244e578f64b/parts/wendelin.core/wcfs/internal/wcfs_test.so(+0x10860)[0x7f6e3e2eb860]
    /srv/slapgrid/slappart91/srv//runner//shared/python2.7/93d57ff089fd75f374514794469a0538/bin/python2.7(PyEval_EvalFrameEx+0x7b5)[0x4d2dc5]
    /srv/slapgrid/slappart91/srv//runner//shared/python2.7/93d57ff089fd75f374514794469a0538/bin/python2.7(PyEval_EvalCodeEx+0x2cc)[0x4d1abc]
    /srv/slapgrid/slappart91/srv//runner//shared/python2.7/93d57ff089fd75f374514794469a0538/bin/python2.7[0x51b92e]
    /srv/slapgrid/slappart91/srv/runner/software/3335682bae677c2d474f9244e578f64b/develop-eggs/pygolang-0.0.8-py2.7-linux-x86_64.egg/golang/_golang.so(+0xc8b0)[0x7f6f8182b8b0]
    /srv/slapgrid/slappart91/srv/runner/software/3335682bae677c2d474f9244e578f64b/develop-eggs/pygolang-0.0.8-py2.7-linux-x86_64.egg/golang/_golang.so(+0x14ab4)[0x7f6f81833ab4]
    /srv/slapgrid/slappart91/srv//runner//shared/python2.7/93d57ff089fd75f374514794469a0538/bin/python2.7[0x54bbb4]
    /lib/x86_64-linux-gnu/libpthread.so.0(+0x7fa3)[0x7f6f8310cfa3]
    /lib/x86_64-linux-gnu/libc.so.6(clone+0x3f)[0x7f6f82eae4cf]
    Segmentation fault (core dumped)

Under gdb the crash looks as follows:

    #0  on_pagefault (sig=<optimized out>, si=0x7f6dde7fb570, _uc=<optimized out>) at bigfile/pagefault.c:171
    #1  <signal handler called>
    #2  __pyx_pf_8wendelin_4wcfs_8internal_9wcfs_test_read_nogil (__pyx_self=<optimized out>, __pyx_v_mem=...) at wcfs/internal/wcfs_test.cpp:3103
    #3  __pyx_pw_8wendelin_4wcfs_8internal_9wcfs_test_1read_nogil (__pyx_self=<optimized out>, __pyx_arg_mem=<optimized out>) at wcfs/internal/wcfs_test.cpp:3029
    #4  0x00000000004d2dc5 in call_function (oparg=<optimized out>, pp_stack=0x7f6dde7fbc88) at Python/ceval.c:4364
    #5  PyEval_EvalFrameEx (f=<optimized out>, throwflag=<optimized out>) at Python/ceval.c:3013
    #6  0x00000000004d1abc in PyEval_EvalCodeEx (co=0x7f6f8094cbb0, globals=<optimized out>, locals=locals@entry=0x0, args=args@entry=0x7f6f82d72068, argcount=<optimized out>, kws=kws@entry=0x7f6f82d72068, kwcount=0, defs=0x0, defcount=0,
        closure=0x7f6e3c6e7110) at Python/ceval.c:3608
    #7  0x000000000051b92e in function_call (func=0x7f6e3c711150, arg=0x7f6f82d72050, kw=0x7f6e3c710b90) at Objects/funcobject.c:523
    #8  0x00007f6f8182b8b0 in __Pyx_PyObject_Call (func=0x7f6e3c711150, arg=<optimized out>, kw=<optimized out>) at golang/_golang.cpp:15660
    #9  0x00007f6f81833ab4 in __pyx_f_6golang_7_golang___goviac (__pyx_v_arg=0x7f6e3c70f5f0) at golang/_golang.cpp:3466
    #10 __pyx_f_6golang_7_golang__goviac (__pyx_v_arg=__pyx_v_arg@entry=0x7f6e3c70f5f0) at golang/_golang.cpp:3350
    #11 0x000000000054bbb4 in pythread_wrapper (arg=<optimized out>) at Python/thread_pthread.h:178
    #12 0x00007f6f8310cfa3 in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0
    #13 0x00007f6f82eae4cf in clone () from /lib/x86_64-linux-gnu/libc.so.6
parent b808b669
@@ -339,8 +339,8 @@ def test_wcfs_client_afterfork():
 # verify that read_mustfault works as expected.
 def test_read_mustfault():
     mem = mm.map_zero_ro(mm.PAGE_SIZE)
-    with panics("not faulted"): read_mustfault(mem[:1])
+    with raises(AssertionError, match="not faulted"): read_mustfault(mem[:1])
     mm.protect(mem, mm.PROT_NONE)
     read_mustfault(mem[:1])
     mm.protect(mem, mm.PROT_READ)
-    with panics("not faulted"): read_mustfault(mem[:1])
+    with raises(AssertionError, match="not faulted"): read_mustfault(mem[:1])
@@ -77,41 +77,48 @@ cdef class _tWCFS:
             xwrite(fdabort, b"1\n")
         t._wcfuseaborted.chan_structZ().close()
 
-# read_nogil reads mem with GIL released and returns its content.
-def read_nogil(const unsigned char[::1] mem not None) -> bytes:
-    assert len(mem) == 1, "read_nogil: only [1] mem is supported for now"
-    cdef unsigned char b
-    with nogil:
-        b = mem[0]
-    return bytes(bytearray([b]))
-
-# read_mustfault verifies that read-access to mem causes SIGSEGV.
-cdef sync.Mutex mustfaultMu # one at a time as sigaction is per-process
-cdef sigjmp_buf mustfaultJmp
-cdef cbool faultExpected = False
-cdef cbool faultedOk = False
-
-cdef extern from * nogil:
-    """
-    volatile unsigned char mustfaultG; // global var for compiler not to optimize-out p[0] access
-    """
-    unsigned char mustfaultG
-
-cdef void mustfaultSighand(int sig) nogil:
-    global faultedOk
-
-    if not faultExpected:
-        panic("unexpected fault")
-
-    # just return from sighandler to proper place
-    faultedOk = True
-    siglongjmp(mustfaultJmp, 1)
-
-cdef void _read_mustfault(const unsigned char *p) nogil except +topyexc:
-    global faultExpected, faultedOk, mustfaultG
+# read_exfault_nogil reads mem with GIL released and returns its content.
+#
+# If reading hits segmentation fault, it is converted to SegmentationFault exception.
+class SegmentationFault(Exception): pass
+cdef sync.Mutex exfaultMu # one at a time as sigaction is per-process
+cdef sigjmp_buf exfaultJmp
+cdef cbool faulted
+def read_exfault_nogil(const unsigned char[::1] mem not None) -> bytes:
+    assert len(mem) == 1, "read_exfault_nogil: only [1] mem is supported for now"
+    cdef unsigned char b
+    global faulted
+    cdef cbool faulted_
+
+    # somewhat dup of MUST_FAULT in test_virtmem.c
+    with nogil:
+        exfaultMu.lock()
+
+    faulted = False
+    try:
+        with nogil:
+            b = _read_exfault(&mem[0])
+    finally:
+        faulted_ = faulted
+        with nogil:
+            exfaultMu.unlock()
+
+    if faulted_:
+        raise SegmentationFault()
+    return bytes(bytearray([b]))
+
+cdef void exfaultSighand(int sig) nogil:
+    # return from sighandler to proper place with faulted=True
+    global faulted
+    faulted = True
+    siglongjmp(exfaultJmp, 1)
+
+cdef unsigned char _read_exfault(const unsigned char *p) nogil except +topyexc:
+    global faulted
 
     cdef sigaction_t act, saveact
-    act.sa_handler = mustfaultSighand
+    act.sa_handler = exfaultSighand
     act.sa_flags = 0
 
     err = sigemptyset(&act.sa_mask)
@@ -119,37 +126,32 @@ cdef void _read_mustfault(const unsigned char *p) nogil except +topyexc:
         panic("sigemptyset: failed")
 
     err = sigaction(SIGSEGV, &act, &saveact)
     if err != 0:
-        panic("sigaction SIGSEGV -> mustfaultSighand: failed")
+        panic("sigaction SIGSEGV -> exfaultSighand: failed")
 
-    faultExpected = True
-    faultedOk = False
-
-    if sigsetjmp(mustfaultJmp, 1) == 0:
-        mustfaultG = p[0]   # should pagefault -> sighandler does longjmp
-        panic("not faulted")
+    b = 0xff
+    if sigsetjmp(exfaultJmp, 1) == 0:
+        b = p[0]            # should pagefault -> sighandler does longjmp
     else:
         # faulted
-        if not faultedOk:
-            panic("faulted, but !faultedOk")
-
-    faultExpected = False
+        if not faulted:
+            panic("faulted, but !faulted")
 
     err = sigaction(SIGSEGV, &saveact, NULL)
     if err != 0:
         panic("sigaction SIGSEGV <- restore: failed")
 
-def read_mustfault(const unsigned char[::1] mem not None):
-    assert len(mem) == 1, "read_mustfault: only [1] mem is supported for now"
-
-    # somewhat dup of MUST_FAULT in test_virtmem.c
-    with nogil:
-        mustfaultMu.lock()
-
-    try:
-        with nogil:
-            _read_mustfault(&mem[0])
-    finally:
-        with nogil:
-            mustfaultMu.unlock()
+    return b
+
+def read_mustfault(const unsigned char[::1] mem not None):
+    try:
+        read_exfault_nogil(mem)
+    except SegmentationFault:
+        # ok
+        pass
+    else:
+        raise AssertionError("not faulted")
 
 # --------
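The contract of the new helper can be exercised on its own. Below is a minimal usage sketch, assuming a built wendelin.core tree so that the mm helpers and the read_exfault_nogil/SegmentationFault pair added above are importable; it mirrors test_read_mustfault rather than being part of this change:

    # usage sketch for read_exfault_nogil (illustrative, not part of the diff)
    from wendelin.wcfs.internal import mm
    from wendelin.wcfs.internal.wcfs_test import read_exfault_nogil, SegmentationFault

    mem = mm.map_zero_ro(mm.PAGE_SIZE)              # readable page of zeros
    assert read_exfault_nogil(mem[:1]) == b'\x00'   # normal read returns the byte

    mm.protect(mem, mm.PROT_NONE)                   # make the page inaccessible
    try:
        read_exfault_nogil(mem[:1])
    except SegmentationFault:
        pass                                        # SIGSEGV became a Python exception

    mm.protect(mem, mm.PROT_READ)                   # restore access
    assert read_exfault_nogil(mem[:1]) == b'\x00'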
@@ -51,7 +51,7 @@ from zodbtools.util import ashex as h, fromhex
 import pytest; xfail = pytest.mark.xfail
 from pytest import raises, fail
 from wendelin.wcfs.internal import io, mm
-from wendelin.wcfs.internal.wcfs_test import _tWCFS, read_nogil, install_sigbus_trap, fadvise_dontneed
+from wendelin.wcfs.internal.wcfs_test import _tWCFS, read_exfault_nogil, SegmentationFault, install_sigbus_trap, fadvise_dontneed
 from wendelin.wcfs.client._wcfs import _tpywlinkwrite as _twlinkwrite
 from wendelin.wcfs import _is_mountpoint as is_mountpoint, _procwait as procwait, _ready as ready
@@ -777,11 +777,23 @@ class tFile:
         # so that on error in another worker we don't get stuck and the
         # error can be propagated to wait and reported.
         #
+        # we handle cancellation by spawning read in another thread and
+        # waiting for either ctx cancel, or read thread to complete. This
+        # way on ctx cancel (e.g. assertion failure in another worker), the
+        # read thread can remain running even after _assertBlk returns, and
+        # in particular till the point where the whole test is marked as
+        # failed and shut down. But on test shutdown .fmmap is unmapped for
+        # all opened tFiles, and so read will hit SIGSEGV. Prepare to catch
+        # that SIGSEGV here.
+        #
         # XXX after WatchLink is moved to pyx/nogil, do we still need to do
         # here with nogil?
         have_read = chan(1)
         def _():
-            b = read_nogil(blkview[0:1])
+            try:
+                b = read_exfault_nogil(blkview[0:1])
+            except SegmentationFault:
+                b = 'FAULT'
             t._blkaccess(blk)
             have_read.send(b)
         go(_)
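The scenario described in the comment above can also be reproduced outside the test harness. A self-contained sketch, assuming pygolang and a built wendelin.core (the protected mapping stands in for an unmapped .fmmap, and the goroutine mirrors the reader spawned by _assertBlk):

    # sketch: a reader that outlives its caller no longer kills the process
    from golang import chan, go
    from wendelin.wcfs.internal import mm
    from wendelin.wcfs.internal.wcfs_test import read_exfault_nogil, SegmentationFault

    mem = mm.map_zero_ro(mm.PAGE_SIZE)   # stands in for one block of .fmmap
    mm.protect(mem, mm.PROT_NONE)        # emulate test shutdown unmapping the file

    have_read = chan(1)
    def _():
        try:
            b = read_exfault_nogil(mem[:1])
        except SegmentationFault:
            b = 'FAULT'                  # report the fault instead of dying on SIGSEGV
        have_read.send(b)
    go(_)

    print(have_read.recv())              # -> 'FAULT'; the test process stays alive

With the fault reduced to a sentinel value the reader still completes and sends a result, so the error from the originally failing worker can be propagated and reported normally instead of the whole test process dying with SIGSEGV.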