Commit 661b871f authored by Kirill Smelkov's avatar Kirill Smelkov

X make sure tests don't get stuck even if wcfs gets killed -9 ...

... and is still stuck in kernel waiting for FUSE request it sent to
self (mkrevfile) to complete. For example:

	==================
	WARNING: DATA RACE
	Write at 0x00c00009a2a0 by goroutine 17:
	  github.com/hanwen/go-fuse/fuse/nodefs.(*portableHandleMap).Register()
	      /home/kirr/src/neo/src/github.com/hanwen/go-fuse/fuse/nodefs/handle.go:87 +0x130
	  github.com/hanwen/go-fuse/fuse/nodefs.(*FileSystemConnector).lookupUpdate()
	      /home/kirr/src/neo/src/github.com/hanwen/go-fuse/fuse/nodefs/fsconnector.go:126 +0x65
	  github.com/hanwen/go-fuse/fuse/nodefs.(*rawBridge).Lookup()
	      /home/kirr/src/neo/src/github.com/hanwen/go-fuse/fuse/nodefs/fsops.go:119 +0x203
	  github.com/hanwen/go-fuse/fuse.doLookup()
	      /home/kirr/src/neo/src/github.com/hanwen/go-fuse/fuse/opcode.go:335 +0x16e
	  github.com/hanwen/go-fuse/fuse.(*Server).handleRequest()
	      /home/kirr/src/neo/src/github.com/hanwen/go-fuse/fuse/server.go:470 +0x868
	  github.com/hanwen/go-fuse/fuse.(*Server).loop()
	      /home/kirr/src/neo/src/github.com/hanwen/go-fuse/fuse/server.go:437 +0x1de

	Previous read at 0x00c00009a2a0 by goroutine 44:
	  main.(*Root).mkrevfile()
	      /home/kirr/src/neo/src/github.com/hanwen/go-fuse/fuse/nodefs/inode.go:66 +0x476
	  main.(*BigFile).invalidateBlk.func1()
	      /home/kirr/src/wendelin/wendelin.core/wcfs/wcfs.go:980 +0x1b0
	  main.(*BigFile).invalidateBlk()
	      /home/kirr/src/wendelin/wendelin.core/wcfs/wcfs.go:990 +0x33f
	  main.(*Root).handleδZ.func3()
	      /home/kirr/src/wendelin/wendelin.core/wcfs/wcfs.go:811 +0x8a
	  golang.org/x/sync/errgroup.(*Group).Go.func1()
	      /home/kirr/src/tools/go/g.env/src/golang.org/x/sync/errgroup/errgroup.go:57 +0x64

	Goroutine 17 (running) created at:
	  github.com/hanwen/go-fuse/fuse.(*Server).readRequest()
	      /home/kirr/src/neo/src/github.com/hanwen/go-fuse/fuse/server.go:304 +0x5d3
	  github.com/hanwen/go-fuse/fuse.(*Server).loop()
	      /home/kirr/src/neo/src/github.com/hanwen/go-fuse/fuse/server.go:415 +0x78
	  github.com/hanwen/go-fuse/fuse.(*Server).Serve()
	      /home/kirr/src/neo/src/github.com/hanwen/go-fuse/fuse/server.go:360 +0x7d
	  main._main.func2()
	      /home/kirr/src/wendelin/wendelin.core/wcfs/wcfs.go:2292 +0x52

	Goroutine 44 (running) created at:
	  golang.org/x/sync/errgroup.(*Group).Go()
	      /home/kirr/src/tools/go/g.env/src/golang.org/x/sync/errgroup/errgroup.go:54 +0x73
	  main.(*Root).handleδZ()
	      /home/kirr/src/wendelin/wendelin.core/wcfs/wcfs.go:810 +0x47d
	  main.(*Root).zwatcher()
	      /home/kirr/src/wendelin/wendelin.core/wcfs/wcfs.go:731 +0x32b
	  main._main()
	      /home/kirr/src/wendelin/wendelin.core/wcfs/wcfs.go:2303 +0x21d1
	  main.main()
	      /home/kirr/src/wendelin/wendelin.core/wcfs/wcfs.go:2114 +0x89
	==================

	C: test timed out after 7.0s -> aborting wcfs fuse connection to unblock
	Traceback (most recent call last):
	  File "/home/kirr/src/tools/go/pygolang/golang/__init__.py", line 122, in _
	    return f(*argv, **kw)
	  File "/home/kirr/src/wendelin/wendelin.core/wcfs/wcfs_test.py", line 289, in close
	    tf.close()
	  File "/home/kirr/src/wendelin/wendelin.core/wcfs/wcfs_test.py", line 505, in close
	    t.f.close()
	IOError: [Errno 107] Transport endpoint is not connected
	Traceback (most recent call last):
	  File "/home/kirr/src/tools/go/pygolang/golang/__init__.py", line 122, in _
	    return f(*argv, **kw)
	  File "/home/kirr/src/wendelin/wendelin.core/wcfs/wcfs_test.py", line 1284, in test_wcfs_basic
	    at2 = t.commit(zf, {2:'c2', 3:'d2', 5:'f2'})
	  File "/home/kirr/src/wendelin/wendelin.core/wcfs/wcfs_test.py", line 376, in commit
	    t._wcsync()
	  File "/home/kirr/src/wendelin/wendelin.core/wcfs/wcfs_test.py", line 382, in _wcsync
	    l = t._wc_zheadfh.readline()
	IOError: [Errno 103] Software caused connection abort

before the patch after data race reporting on wcfs.go side and wcfs.go
aborting (we set GORACE="halt_on_error") the test was still stuck and
one wcfs thread was still alive:

	├─py.test,58 /home/kirr/src/wendelin/venv/z-dev/bin/py.test -p no:cacheprovider -vsx -k test_wcfs_basic
	│   └─(wcfs,67)
	│       └─{wcfs},82

with 67 being already killed - zombie - and 82 received the same signal
but being alive waiting in kernel for answer to a FUSE request:

	# cat /proc/82/stack
	[<0>] request_wait_answer+0x206/0x23d
	[<0>] __fuse_request_send+0x5d/0x80
	[<0>] fuse_simple_request+0xc8/0x180
	[<0>] fuse_lookup_name+0xfa/0x200
	[<0>] fuse_lookup+0x4a/0x110
	[<0>] fuse_atomic_open+0xa8/0x100
	[<0>] path_openat+0xbe2/0x12a0
	[<0>] do_filp_open+0x8e/0x100
	[<0>] do_sys_open+0x181/0x220
	[<0>] do_syscall_64+0x59/0x35b
	[<0>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
	[<0>] 0xffffffffffffffff

and manual `kill -9 82` not making any effect, because after
git.kernel.org/linus/a131de0a482a request_wait_answer even after,
receiving fatal signal, waits for request to be answered.

It was explicitly said in that kernel commit that the way to unblock
such deadlocks is /sys/fs/fuse/connection/<X>/abort - so we do it this
way on timeout to make sure a test is never stuck forever.
parent 61b2a2f5
......@@ -17,6 +17,7 @@ fi
# executed under qemu-runlinux
#export WENDELIN_CORE_WCFS_OPTIONS="-d -alsologtostderr -v=1"
#screen -Logfile /tmp/x.0 -L \
py.test \
`# disable cache & warnings that it cannot be updated since fs under qrun is read-only` \
`# https://stackoverflow.com/a/47893653/9456786` \
......
......@@ -199,6 +199,16 @@ class tDB:
assert os.path.exists(testmntpt)
assert 0 == subprocess.call(["mountpoint", "-q", testmntpt])
# force-unmount wcfs on timeout to unstuck current test and let it fail.
# Force-unmount can be done reliably only by writing into
# /sys/fs/fuse/connections/<X>/abort. For everything else there are
# cases, when wcfs, even after receiving `kill -9`, will be stuck in kernel.
# ( git.kernel.org/linus/a131de0a482a makes in-kernel FUSE client to
# still wait for request completion even after fatal signal )
t._closed = chan()
t._wcfuseconn = os.stat(testmntpt).st_dev
go(t._abort_ontimeout, 7*time.second) # NOTE must be >> with_timeout
# ZBigFile(s) scheduled for commit
t._changed = {} # ZBigFile -> {} blk -> data
......@@ -232,10 +242,26 @@ class tDB:
def head(t):
return t.dFtail[-1].rev
def _abort_ontimeout(t, dt):
_, _rx = select(
time.after(dt).recv, # 0
t._closed.recv, # 1
)
if _ == 1:
return # tDB closed = testcase completed
# timeout -> force-umount wcfs
print("\nC: test timed out after %.1fs -> aborting wcfs fuse connection"
" to unblock" % (dt / time.second), file=sys.stderr)
writefile("/sys/fs/fuse/connections/%d/abort" % t._wcfuseconn, b"1\n")
# close closes test database as well as all tracked files, watch links and wcfs.
# it also prints change history to help developer overview current testcase.
@func
def close(t):
defer(t._closed.close)
# unmount and wait for wcfs to exit
def _():
assert 0 == subprocess.call(["mountpoint", "-q", testmntpt])
......@@ -1860,6 +1886,11 @@ def readfile(path):
with open(path) as f:
return f.read()
# writefile writes data to file @ path.
def writefile(path, data):
with open(path, "w") as f:
f.write(data)
# tidtime converts tid to transaction commit time.
def tidtime(tid):
t = TimeStamp(tid).timeTime()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment