Commit 8eac771c authored by Kirill Smelkov's avatar Kirill Smelkov

X neo/connection: Fix race between link.shutdown() and conn.lightClose()

The scenario could be: nl.shutdown sets nl.connTab=nil and
then iterates connTab snapshot taken under nl.connMu lock. If so
this activity (which calls e.g. Conn.shutdown) could be running with
conn.lightClose -> conn.release() in parallel.

Bug triggers under neotest in zhash.go +prefetch128:

	crc32:bf9deea9   ; oid=0..8499  nread=34134938  t=9.010696705s (1.060081ms / object)  x=zhash.go +prefetch128
	==================
	WARNING: DATA RACE
	Write at 0x00c424b55ed0 by goroutine 147:
	  lab.nexedi.com/kirr/neo/go/neo.(*Conn).reinit()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:258 +0x102
	  lab.nexedi.com/kirr/neo/go/neo.(*Conn).release()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:240 +0x38
	  lab.nexedi.com/kirr/neo/go/neo.(*Conn).lightClose()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:1633 +0xa4
	  lab.nexedi.com/kirr/neo/go/neo.(*NodeLink).Ask1()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:1719 +0xde
	  lab.nexedi.com/kirr/neo/go/neo/client.(*Client)._Load()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/client/client.go:466 +0x5d3
	  lab.nexedi.com/kirr/neo/go/neo/client.(*Client).Load()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/client/client.go:392 +0xa8
	  lab.nexedi.com/kirr/neo/go/zodb/storage.(*Cache).loadRCE()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/zodb/storage/cache.go:297 +0xfb

	Previous write at 0x00c424b55ed0 by goroutine 75:
	  sync/atomic.AddInt32()
	      /home/kirr/src/tools/go/go/src/runtime/race_amd64.s:269 +0xb
	  sync.(*Mutex).Unlock()
	      /home/kirr/src/tools/go/go/src/sync/mutex.go:182 +0x54
	  sync.(*Once).Do()
	      /home/kirr/src/tools/go/go/src/sync/once.go:46 +0x95
	  lab.nexedi.com/kirr/neo/go/neo.(*Conn).shutdownRX()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:471 +0xb1
	  lab.nexedi.com/kirr/neo/go/neo.(*Conn).shutdown()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:442 +0x62
	  lab.nexedi.com/kirr/neo/go/neo.(*NodeLink).shutdown.func1.1()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:408 +0x18f

	Goroutine 147 (running) created at:
	  lab.nexedi.com/kirr/neo/go/zodb/storage.(*Cache).Prefetch()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/zodb/storage/cache.go:198 +0xbb
	  main.zhash.func2()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/t/zhash.go:134 +0x86
	  main.zhash.func4()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/t/zhash.go:157 +0x8f
	  main.zhash()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/t/zhash.go:193 +0xc7b
	  main.main()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/t/zhash.go:107 +0x48d

	Goroutine 75 (finished) created at:
	  lab.nexedi.com/kirr/neo/go/neo.(*NodeLink).shutdown.func1()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:391 +0x116
	  sync.(*Once).Do()
	      /home/kirr/src/tools/go/go/src/sync/once.go:44 +0xe1
	  lab.nexedi.com/kirr/neo/go/neo.(*NodeLink).shutdown()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:383 +0x99
	  lab.nexedi.com/kirr/neo/go/neo.(*NodeLink).serveRecv()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:748 +0x651
	==================

and without -race zhash.go instrumentation it was showing itself as
"panic: unlock of unlocked mutex" in

	  sync.(*Mutex).Unlock()
	      /home/kirr/src/tools/go/go/src/sync/mutex.go:182 +0x54
	  sync.(*Once).Do()
	      /home/kirr/src/tools/go/go/src/sync/once.go:46 +0x95
	  lab.nexedi.com/kirr/neo/go/neo.(*Conn).shutdownRX()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:471 +0xb1
	  lab.nexedi.com/kirr/neo/go/neo.(*Conn).shutdown()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:442 +0x62
	  lab.nexedi.com/kirr/neo/go/neo.(*NodeLink).shutdown.func1.1()
	      /home/kirr/src/neo/src/lab.nexedi.com/kirr/neo/go/neo/connection.go:408 +0x18f
parent e4a100de
......@@ -1606,15 +1606,25 @@ func (c *Conn) Ask(req Msg, resp Msg) error {
// XXX must be called only once.
func (c *Conn) lightClose() {
nl := c.link
releaseok := false
nl.connMu.Lock()
if nl.connTab != nil {
// XXX find way to keep initiated by us conn as closed for some time (see Conn.Close)
// but timer costs too much...
delete(nl.connTab, c.connId)
releaseok = true
}
nl.connMu.Unlock()
c.release()
// we can release c only if we removed it from connTab.
//
// if not - the scenario could be: nl.shutdown sets nl.connTab=nil and
// then iterates connTab snapshot taken under nl.connMu lock. If so
// this activity (which calls e.g. Conn.shutdown) could be running with
// us in parallel.
if releaseok {
c.release()
}
}
// Request is a message received from the link + connection handle to make a reply.
......
......@@ -842,6 +842,29 @@ func TestRecv1Mode(t *testing.T) {
// TODO link.Close vs Recv1
}
// test possible race-condition between link.shutdown and conn.lightClose:
//
// link.shutdown sets link.connTab=nil and under link.connMu and then iterates
// read connTab without link.connMu held. If conn.lightClose does
// conn.release() in parallel to link.shutdown() iterating connTab they can be
// both writing/using e.g. conn.rxdownOnce.
//
// bug triggers under -race
func TestLightCloseVsLinkShutdown(t *testing.T) {
nl1, nl2 := nodeLinkPipe()
wg := &xsync.WorkGroup{}
c := xnewconn(nl1)
nl1.shutdown()
wg.Gox(func() {
c.lightClose()
})
xwait(wg)
xclose(nl1)
xclose(nl2)
}
// ---- benchmarks ----
// rtt over chan - for comparision as base
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment