Commit 5e7f34d2 authored by Julien Muchembled

New algorithm for deadlock avoidance

The time complexity of the previous one was far too high. With several
tens of concurrent transactions, we saw commits take minutes to complete
and the whole application looked frozen.

This new algorithm is much simpler. Instead of asking the oldest
transaction to partially restart (we used the term "rebase" because the
concept was similar to what git-rebase does), the storage node gives it
priority and asks the newest transaction to relock the contended object
(the request is ignored if the vote already happened, which means there
was actually no deadlock).
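
In outline: when an older transaction (smaller TTID) requests a write-lock
held by a younger one, the storage node keeps the lock where it is, notifies
the younger client (at most once per oid) and delays the older store; the
younger client relocks only if its vote has not happened yet, which releases
the lock and lets the older transaction proceed. Below is a minimal sketch of
that decision logic; it is illustrative only (the dict-based lock table and
the notify_deadlock/delay/ask_relock callbacks are stand-ins for the storage
TransactionManager and the client StorageEventHandler):

    # Illustrative sketch only -- not the NEO API. Storage-side write-lock
    # table: oid -> ttid of the transaction currently holding the lock.
    lock_table = {}
    # (holder_ttid, oid) pairs already notified, so the younger client is
    # asked to relock at most once per oid (cf. Transaction.deadlock).
    notified = set()

    def lock_object(ttid, oid, notify_deadlock, delay):
        # notify_deadlock(holder_ttid, oid) stands for sending NotifyDeadlock
        # to the client of the younger transaction; delay(ttid, oid) stands
        # for queueing the store until the lock is released.
        locked = lock_table.get(oid)
        if locked is None:
            lock_table[oid] = ttid      # lock is free: take it
        elif locked < ttid:
            delay(ttid, oid)            # holder is older: we simply wait
        elif ttid < locked:
            # Holder is younger: possible deadlock. It keeps the lock for now
            # (it may still manage to vote), but its client is asked to relock
            # and the older store is delayed.
            if (locked, oid) not in notified:
                notified.add((locked, oid))
                notify_deadlock(locked, oid)
            delay(ttid, oid)
        # locked == ttid: we already hold the lock, nothing to do

    def on_notify_deadlock(txn, ask_relock, oid):
        # Client side: ignore the notification if the vote already happened
        # (there was actually no deadlock), otherwise ask the storage to
        # release and re-acquire the write-lock for this oid.
        if not txn.voted:
            ask_relock(txn.ttid, oid)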

testLocklessWriteDuringConflictResolution was initially more complex
because Transaction.written (client) ignored KeyError, which is no longer
the case since commit 8ef1ddba.
parent d98b576c
...@@ -524,32 +524,11 @@ class Application(ThreadedApplication): ...@@ -524,32 +524,11 @@ class Application(ThreadedApplication):
while 1: while 1:
# We iterate over conflict_dict, and clear it, # We iterate over conflict_dict, and clear it,
# because new items may be added by calls to _store. # because new items may be added by calls to _store.
# This is also done atomically, to avoid race conditions
# with PrimaryNotificationsHandler.notifyDeadlock
try: try:
oid, serial = pop_conflict() oid, serial = pop_conflict()
except KeyError: except KeyError:
return return
try:
data, old_serial, _ = data_dict.pop(oid) data, old_serial, _ = data_dict.pop(oid)
except KeyError:
assert oid is None, (oid, serial)
# Storage refused us from taking object lock, to avoid a
# possible deadlock. TID is actually used for some kind of
# "locking priority": when a higher value has the lock,
# this means we stored objects "too late", and we would
# otherwise cause a deadlock.
# To recover, we must ask storages to release locks we
# hold (to let possibly-competing transactions acquire
# them), and requeue our already-sent store requests.
ttid = txn_context.ttid
logging.info('Deadlock avoidance triggered for TXN %s'
' with new locking TID %s', dump(ttid), dump(serial))
txn_context.locking_tid = serial
packet = Packets.AskRebaseTransaction(ttid, serial)
for uuid in txn_context.conn_dict:
self._askStorageForWrite(txn_context, uuid, packet)
else:
if data is CHECKED_SERIAL: if data is CHECKED_SERIAL:
raise ReadConflictError(oid=oid, raise ReadConflictError(oid=oid,
serials=(serial, old_serial)) serials=(serial, old_serial))
...@@ -569,7 +548,6 @@ class Application(ThreadedApplication): ...@@ -569,7 +548,6 @@ class Application(ThreadedApplication):
# in this case. # in this case.
raise ConflictError(oid=oid, serials=(serial, old_serial), raise ConflictError(oid=oid, serials=(serial, old_serial),
data=data or None) data=data or None)
else:
logging.info( logging.info(
'Conflict resolution succeeded for %s@%s with %s', 'Conflict resolution succeeded for %s@%s with %s',
dump(oid), dump(old_serial), dump(serial)) dump(oid), dump(old_serial), dump(serial))
...@@ -609,6 +587,7 @@ class Application(ThreadedApplication): ...@@ -609,6 +587,7 @@ class Application(ThreadedApplication):
"""Store current transaction.""" """Store current transaction."""
txn_context = self._txn_container.get(transaction) txn_context = self._txn_container.get(transaction)
self.waitStoreResponses(txn_context) self.waitStoreResponses(txn_context)
txn_context.stored = True
ttid = txn_context.ttid ttid = txn_context.ttid
ext = transaction._extension ext = transaction._extension
ext = dumps(ext, _protocol) if ext else '' ext = dumps(ext, _protocol) if ext else ''
...@@ -640,10 +619,8 @@ class Application(ThreadedApplication): ...@@ -640,10 +619,8 @@ class Application(ThreadedApplication):
if not (uuid in failed or uuid in uuid_set): if not (uuid in failed or uuid in uuid_set):
break break
else: else:
# Very unlikely. Instead of raising, we could recover # Very unlikely. Trying to recover
# the transaction by doing something similar to # is not worth the complexity.
# deadlock avoidance; that would be done before voting.
# But it's not worth the complexity.
raise NEOStorageError( raise NEOStorageError(
'partition %s not fully write-locked' % offset) 'partition %s not fully write-locked' % offset)
failed = [uuid for uuid, running in failed.iteritems() if running] failed = [uuid for uuid, running in failed.iteritems() if running]
......
...@@ -149,13 +149,6 @@ class PrimaryNotificationsHandler(MTEventHandler): ...@@ -149,13 +149,6 @@ class PrimaryNotificationsHandler(MTEventHandler):
if node and node.isConnected(): if node and node.isConnected():
node.getConnection().close() node.getConnection().close()
def notifyDeadlock(self, conn, ttid, locking_tid):
for txn_context in self.app.txn_contexts():
if txn_context.ttid == ttid:
txn_context.conflict_dict[None] = locking_tid
txn_context.wakeup(conn)
break
class PrimaryAnswersHandler(AnswerBaseHandler): class PrimaryAnswersHandler(AnswerBaseHandler):
""" Handle that process expected packets from the primary master """ """ Handle that process expected packets from the primary master """
......
...@@ -28,11 +28,24 @@ from ..transactions import Transaction ...@@ -28,11 +28,24 @@ from ..transactions import Transaction
from ..exception import NEOStorageError, NEOStorageNotFoundError from ..exception import NEOStorageError, NEOStorageNotFoundError
from ..exception import NEOStorageReadRetry, NEOStorageDoesNotExistError from ..exception import NEOStorageReadRetry, NEOStorageDoesNotExistError
@apply
class _DeadlockPacket(object):
handler_method_name = 'notifyDeadlock'
_args = ()
getId = int
class StorageEventHandler(MTEventHandler): class StorageEventHandler(MTEventHandler):
def _acceptIdentification(*args): def _acceptIdentification(*args):
pass pass
def notifyDeadlock(self, conn, ttid, oid):
for txn_context in self.app.txn_contexts():
if txn_context.ttid == ttid:
txn_context.queue.put((conn, _DeadlockPacket, {'oid': oid}))
break
class StorageBootstrapHandler(AnswerBaseHandler): class StorageBootstrapHandler(AnswerBaseHandler):
""" Handler used when connecting to a storage node """ """ Handler used when connecting to a storage node """
...@@ -72,22 +85,28 @@ class StorageAnswersHandler(AnswerBaseHandler): ...@@ -72,22 +85,28 @@ class StorageAnswersHandler(AnswerBaseHandler):
answerCheckCurrentSerial = answerStoreObject answerCheckCurrentSerial = answerStoreObject
def answerRebaseTransaction(self, conn, oid_list): def notifyDeadlock(self, conn, oid):
# To avoid a possible deadlock, storage nodes are waiting for our
# lock to be cancelled, so that a transaction that started earlier
# can complete. This is done by acquiring the lock again.
txn_context = self.app.getHandlerData() txn_context = self.app.getHandlerData()
if txn_context.stored:
return
ttid = txn_context.ttid ttid = txn_context.ttid
queue = txn_context.queue logging.info('Deadlock avoidance triggered for %s:%s',
dump(oid), dump(ttid))
assert conn.getUUID() not in txn_context.data_dict.get(oid, ((),))[-1]
try: try:
for oid in oid_list:
# We could have an extra parameter to tell the storage if we # We could have an extra parameter to tell the storage if we
# still have the data, and in this case revert what was done # still have the data, and in this case revert what was done
# in Transaction.written. This would save bandwidth in case of # in Transaction.written. This would save bandwidth in case of
# conflict. # conflict.
conn.ask(Packets.AskRebaseObject(ttid, oid), conn.ask(Packets.AskRelockObject(ttid, oid),
queue=queue, oid=oid) queue=txn_context.queue, oid=oid)
except ConnectionClosed: except ConnectionClosed:
txn_context.conn_dict[conn.getUUID()] = None txn_context.conn_dict[conn.getUUID()] = None
def answerRebaseObject(self, conn, conflict, oid): def answerRelockObject(self, conn, conflict, oid):
if conflict: if conflict:
txn_context = self.app.getHandlerData() txn_context = self.app.getHandlerData()
serial, conflict, data = conflict serial, conflict, data = conflict
...@@ -105,7 +124,7 @@ class StorageAnswersHandler(AnswerBaseHandler): ...@@ -105,7 +124,7 @@ class StorageAnswersHandler(AnswerBaseHandler):
assert oid in txn_context.data_dict assert oid in txn_context.data_dict
if serial <= txn_context.conflict_dict.get(oid, ''): if serial <= txn_context.conflict_dict.get(oid, ''):
# Another node already reported this conflict or a newer, # Another node already reported this conflict or a newer,
# by answering to this rebase or to the previous store. # by answering to this relock or to the previous store.
return return
# A node has not answered yet to a previous store. Do not wait # A node has not answered yet to a previous store. Do not wait
# it to report the conflict because it may fail before. # it to report the conflict because it may fail before.
...@@ -119,8 +138,8 @@ class StorageAnswersHandler(AnswerBaseHandler): ...@@ -119,8 +138,8 @@ class StorageAnswersHandler(AnswerBaseHandler):
compression, checksum, data = data compression, checksum, data = data
if checksum != makeChecksum(data): if checksum != makeChecksum(data):
raise NEOStorageError( raise NEOStorageError(
'wrong checksum while getting back data for' 'wrong checksum while getting back data for %s:%s'
' object %s during rebase of transaction %s' ' (deadlock avoidance)'
% (dump(oid), dump(txn_context.ttid))) % (dump(oid), dump(txn_context.ttid)))
data = decompress_list[compression](data) data = decompress_list[compression](data)
size = len(data) size = len(data)
......
...@@ -22,19 +22,12 @@ from neo.lib.protocol import Packets ...@@ -22,19 +22,12 @@ from neo.lib.protocol import Packets
from neo.lib.util import dump from neo.lib.util import dump
from .exception import NEOStorageError from .exception import NEOStorageError
@apply
class _WakeupPacket(object):
handler_method_name = 'pong'
_args = ()
getId = int
class Transaction(object): class Transaction(object):
cache_size = 0 # size of data in cache_dict cache_size = 0 # size of data in cache_dict
data_size = 0 # size of data in data_dict data_size = 0 # size of data in data_dict
error = None error = None
locking_tid = None stored = False
voted = False voted = False
ttid = None # XXX: useless, except for testBackupReadOnlyAccess ttid = None # XXX: useless, except for testBackupReadOnlyAccess
lockless_dict = None # {partition: {uuid}} lockless_dict = None # {partition: {uuid}}
...@@ -55,16 +48,13 @@ class Transaction(object): ...@@ -55,16 +48,13 @@ class Transaction(object):
def __repr__(self): def __repr__(self):
error = self.error error = self.error
return ("<%s ttid=%s locking_tid=%s voted=%u" return ("<%s ttid=%s voted=%u"
" #queue=%s #writing=%s #written=%s%s>") % ( " #queue=%s #writing=%s #written=%s%s>") % (
self.__class__.__name__, self.__class__.__name__,
dump(self.ttid), dump(self.locking_tid), self.voted, dump(self.ttid), self.voted,
len(self.queue._queue), len(self.data_dict), len(self.cache_dict), len(self.queue._queue), len(self.data_dict), len(self.cache_dict),
' error=%r' % error if error else '') ' error=%r' % error if error else '')
def wakeup(self, conn):
self.queue.put((conn, _WakeupPacket, {}))
def write(self, app, packet, object_id, **kw): def write(self, app, packet, object_id, **kw):
uuid_list = [] uuid_list = []
pt = app.pt pt = app.pt
...@@ -78,14 +68,6 @@ class Transaction(object): ...@@ -78,14 +68,6 @@ class Transaction(object):
conn = conn_dict[uuid] conn = conn_dict[uuid]
except KeyError: except KeyError:
conn = conn_dict[uuid] = app.getStorageConnection(node) conn = conn_dict[uuid] = app.getStorageConnection(node)
if self.locking_tid and 'oid' in kw:
# A deadlock happened but this node is not aware of it.
# Tell it to write-lock with the same locking tid as
# for the other nodes. The condition on kw is to
# distinguish whether we're writing an oid or
# transaction metadata.
conn.ask(Packets.AskRebaseTransaction(
self.ttid, self.locking_tid), queue=self.queue)
conn.ask(packet, queue=self.queue, **kw) conn.ask(packet, queue=self.queue, **kw)
uuid_list.append(uuid) uuid_list.append(uuid)
except AttributeError: except AttributeError:
...@@ -128,8 +110,8 @@ class Transaction(object): ...@@ -128,8 +110,8 @@ class Transaction(object):
lockless = self.lockless_dict = defaultdict(set) lockless = self.lockless_dict = defaultdict(set)
lockless[app.pt.getPartition(oid)].add(uuid) lockless[app.pt.getPartition(oid)].add(uuid)
if oid in self.conflict_dict: if oid in self.conflict_dict:
# In the case of a rebase, uuid_list may not contain the id # In case of deadlock avoidance, uuid_list may not contain the
# of the node reporting a conflict. # id of the node reporting a conflict.
return return
if uuid_list: if uuid_list:
return return
......
...@@ -322,8 +322,8 @@ class EventQueue(object): ...@@ -322,8 +322,8 @@ class EventQueue(object):
# Stable sort when 2 keys are equal. # Stable sort when 2 keys are equal.
# XXX: Is it really useful to keep events with same key ordered # XXX: Is it really useful to keep events with same key ordered
# chronologically ? The caller could use more specific keys. For # chronologically ? The caller could use more specific keys.
# write-locks (by the storage node), the locking tid seems enough. # For write-locks (by the storage node), the ttid seems enough.
sortQueuedEvents = (lambda key=itemgetter(0): lambda self: sortQueuedEvents = (lambda key=itemgetter(0): lambda self:
self._event_queue.sort(key=key))() self._event_queue.sort(key=key))()
...@@ -334,14 +334,6 @@ class EventQueue(object): ...@@ -334,14 +334,6 @@ class EventQueue(object):
if key is not None: if key is not None:
self.sortQueuedEvents() self.sortQueuedEvents()
def sortAndExecuteQueuedEvents(self):
if self._executing_event < 0:
self.sortQueuedEvents()
self.executeQueuedEvents()
else:
# We can't sort events when they're being processed.
self._executing_event = 1
def executeQueuedEvents(self): def executeQueuedEvents(self):
# Not reentrant. When processing a queued event, calling this method # Not reentrant. When processing a queued event, calling this method
# only tells the caller to retry all events from the beginning, because # only tells the caller to retry all events from the beginning, because
...@@ -369,10 +361,6 @@ class EventQueue(object): ...@@ -369,10 +361,6 @@ class EventQueue(object):
while done: while done:
del queue[done.pop()] del queue[done.pop()]
self._executing_event = 0 self._executing_event = 0
# What sortAndExecuteQueuedEvents could not do immediately
# is done here:
if event[0] is not None:
self.sortQueuedEvents()
self._executing_event = -1 self._executing_event = -1
def logQueuedEvents(self): def logQueuedEvents(self):
......
...@@ -20,7 +20,7 @@ from msgpack import packb ...@@ -20,7 +20,7 @@ from msgpack import packb
# The protocol version must be increased whenever upgrading a node may require # The protocol version must be increased whenever upgrading a node may require
# to upgrade other nodes. # to upgrade other nodes.
PROTOCOL_VERSION = 1 PROTOCOL_VERSION = 2
# By encoding the handshake packet with msgpack, the whole NEO stream can be # By encoding the handshake packet with msgpack, the whole NEO stream can be
# decoded with msgpack. The first byte is 0x92, which is different from TLS # decoded with msgpack. The first byte is 0x92, which is different from TLS
# Handshake (0x16). # Handshake (0x16).
...@@ -532,28 +532,17 @@ class Packets(dict): ...@@ -532,28 +532,17 @@ class Packets(dict):
""") """)
NotifyDeadlock = notify(""" NotifyDeadlock = notify("""
Ask master to generate a new TTID that will be used by the client to A client acquired a write-lock before another one that has a smaller
solve a deadlock by rebasing the transaction on top of concurrent TTID, leading to a possible deadlock. In order to solve it, this asks
changes. the client with the greatest TTID to lock again if it can't vote.
:nodes: S -> M -> C :nodes: S -> C
""") """)
AskRebaseTransaction, AnswerRebaseTransaction = request(""" AskRelockObject, AnswerRelockObject = request("""
Rebase a transaction to solve a deadlock. Relock an object change to solve a deadlock.
:nodes: C -> S :nodes: C -> S
""")
AskRebaseObject, AnswerRebaseObject = request("""
Rebase an object change to solve a deadlock.
:nodes: C -> S
XXX: It is a request packet to simplify the implementation. For more
efficiency, this should be turned into a notification, and the
RebaseTransaction should answered once all objects are rebased
(so that the client can still wait on something).
""", data_path=(1, 0, 2, 0)) """, data_path=(1, 0, 2, 0))
AskStoreObject, AnswerStoreObject = request(""" AskStoreObject, AnswerStoreObject = request("""
......
...@@ -68,9 +68,6 @@ class StorageServiceHandler(BaseServiceHandler): ...@@ -68,9 +68,6 @@ class StorageServiceHandler(BaseServiceHandler):
conn.answer(p) conn.answer(p)
app.pt.updatable(conn.getUUID(), offset_list) app.pt.updatable(conn.getUUID(), offset_list)
def notifyDeadlock(self, conn, *args):
self.app.tm.deadlock(conn.getUUID(), *args)
def answerInformationLocked(self, conn, ttid): def answerInformationLocked(self, conn, ttid):
self.app.tm.lock(ttid, conn.getUUID()) self.app.tm.lock(ttid, conn.getUUID())
......
...@@ -19,15 +19,13 @@ from time import time ...@@ -19,15 +19,13 @@ from time import time
from struct import pack, unpack from struct import pack, unpack
from neo.lib import logging from neo.lib import logging
from neo.lib.handler import DelayEvent, EventQueue from neo.lib.handler import DelayEvent, EventQueue
from neo.lib.protocol import Packets, ProtocolError, uuid_str, \ from neo.lib.protocol import ProtocolError, uuid_str, ZERO_OID, ZERO_TID
ZERO_OID, ZERO_TID
from neo.lib.util import dump, u64, addTID, tidFromTime from neo.lib.util import dump, u64, addTID, tidFromTime
class Transaction(object): class Transaction(object):
""" """
A pending transaction A pending transaction
""" """
locking_tid = ZERO_TID
_tid = None _tid = None
_msg_id = None _msg_id = None
_oid_list = None _oid_list = None
...@@ -292,19 +290,6 @@ class TransactionManager(EventQueue): ...@@ -292,19 +290,6 @@ class TransactionManager(EventQueue):
logging.debug('Begin %s', txn) logging.debug('Begin %s', txn)
return tid return tid
def deadlock(self, storage_id, ttid, locking_tid):
try:
txn = self._ttid_dict[ttid]
except KeyError:
return
if txn.locking_tid <= locking_tid:
client = txn.getNode()
txn.locking_tid = locking_tid = self._nextTID()
logging.info('Deadlock avoidance triggered by %s for %s:'
' new locking tid for TXN %s is %s', uuid_str(storage_id),
uuid_str(client.getUUID()), dump(ttid), dump(locking_tid))
client.send(Packets.NotifyDeadlock(ttid, locking_tid))
def vote(self, app, ttid, uuid_list): def vote(self, app, ttid, uuid_list):
""" """
Check that the transaction can be voted Check that the transaction can be voted
......
...@@ -133,25 +133,23 @@ class ClientOperationHandler(BaseHandler): ...@@ -133,25 +133,23 @@ class ClientOperationHandler(BaseHandler):
compression, checksum, data, data_serial, ttid, time.time()), compression, checksum, data, data_serial, ttid, time.time()),
*e.args) *e.args)
def askRebaseTransaction(self, conn, *args): def askRelockObject(self, conn, ttid, oid):
conn.answer(Packets.AnswerRebaseTransaction(
self.app.tm.rebase(conn, *args)))
def askRebaseObject(self, conn, ttid, oid):
try: try:
self._askRebaseObject(conn, ttid, oid, None) self.app.tm.relockObject(ttid, oid, True)
except DelayEvent, e: except DelayEvent, e:
# locked by a previous transaction, retry later # locked by a previous transaction, retry later
self.app.tm.queueEvent(self._askRebaseObject, self.app.tm.queueEvent(self._askRelockObject,
conn, (ttid, oid, time.time()), *e.args) conn, (ttid, oid, time.time()), *e.args)
else:
conn.answer(Packets.AnswerRelockObject(None))
def _askRebaseObject(self, conn, ttid, oid, request_time): def _askRelockObject(self, conn, ttid, oid, request_time):
conflict = self.app.tm.rebaseObject(ttid, oid) conflict = self.app.tm.relockObject(ttid, oid, False)
if request_time and SLOW_STORE is not None: if request_time and SLOW_STORE is not None:
duration = time.time() - request_time duration = time.time() - request_time
if duration > SLOW_STORE: if duration > SLOW_STORE:
logging.info('RebaseObject delay: %.02fs', duration) logging.info('RelockObject delay: %.02fs', duration)
conn.answer(Packets.AnswerRebaseObject(conflict)) conn.answer(Packets.AnswerRelockObject(conflict))
def askTIDsFrom(self, conn, min_tid, max_tid, length, partition): def askTIDsFrom(self, conn, min_tid, max_tid, length, partition):
conn.answer(Packets.AnswerTIDsFrom(self.app.dm.getReplicationTIDList( conn.answer(Packets.AnswerTIDsFrom(self.app.dm.getReplicationTIDList(
...@@ -251,8 +249,7 @@ class ClientReadOnlyOperationHandler(ClientOperationHandler): ...@@ -251,8 +249,7 @@ class ClientReadOnlyOperationHandler(ClientOperationHandler):
askVoteTransaction = _readOnly askVoteTransaction = _readOnly
askStoreObject = _readOnly askStoreObject = _readOnly
askFinalTID = _readOnly askFinalTID = _readOnly
askRebaseObject = _readOnly askRelockObject = _readOnly
askRebaseTransaction = _readOnly
# takes write lock & is only used when going to commit # takes write lock & is only used when going to commit
askCheckCurrentSerial = _readOnly askCheckCurrentSerial = _readOnly
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
from time import time from time import time
from neo.lib import logging from neo.lib import logging
from neo.lib.handler import DelayEvent, EventQueue from neo.lib.handler import DelayEvent, EventQueue
from neo.lib.util import dump from neo.lib.util import cached_property, dump
from neo.lib.protocol import Packets, ProtocolError, NonReadableCell, \ from neo.lib.protocol import Packets, ProtocolError, NonReadableCell, \
uuid_str, MAX_TID, ZERO_TID uuid_str, MAX_TID, ZERO_TID
...@@ -47,7 +47,6 @@ class Transaction(object): ...@@ -47,7 +47,6 @@ class Transaction(object):
def __init__(self, uuid, ttid): def __init__(self, uuid, ttid):
self._birth = time() self._birth = time()
self.locking_tid = ttid
self.uuid = uuid self.uuid = uuid
self.serial_dict = {} self.serial_dict = {}
self.store_dict = {} self.store_dict = {}
...@@ -56,17 +55,21 @@ class Transaction(object): ...@@ -56,17 +55,21 @@ class Transaction(object):
# the replication is finished. # the replication is finished.
self.lockless = set() self.lockless = set()
@cached_property
def deadlock(self):
# Remember the oids for which we sent a deadlock notification.
return set()
def __repr__(self): def __repr__(self):
return "<%s(%s, locking_tid=%s, tid=%s, age=%.2fs) at 0x%x>" % ( return "<%s(%s, tid=%s, age=%.2fs) at 0x%x>" % (
self.__class__.__name__, self.__class__.__name__,
uuid_str(self.uuid), uuid_str(self.uuid),
dump(self.locking_tid),
dump(self.tid), dump(self.tid),
time() - self._birth, time() - self._birth,
id(self)) id(self))
def __lt__(self, other): def __lt__(self, other):
return self.locking_tid < other.locking_tid raise AssertionError
def logDelay(self, ttid, locked, oid_serial): def logDelay(self, ttid, locked, oid_serial):
if self._delayed.get(oid_serial) != locked: if self._delayed.get(oid_serial) != locked:
...@@ -161,8 +164,7 @@ class TransactionManager(EventQueue): ...@@ -161,8 +164,7 @@ class TransactionManager(EventQueue):
# We sort transactions so that in case of multiple stores/checks # We sort transactions so that in case of multiple stores/checks
# for the same oid, the lock is taken by the highest locking ttid, # for the same oid, the lock is taken by the highest locking ttid,
# which will delay new transactions. # which will delay new transactions.
for txn, ttid in sorted((txn, ttid) for ttid, txn in for ttid, txn in sorted(self._transaction_dict.iteritems()):
self._transaction_dict.iteritems()):
assert txn.lockless.issubset(txn.serial_dict), ( assert txn.lockless.issubset(txn.serial_dict), (
ttid, txn.lockless, txn.serial_dict) ttid, txn.lockless, txn.serial_dict)
for oid in txn.lockless: for oid in txn.lockless:
...@@ -217,84 +219,6 @@ class TransactionManager(EventQueue): ...@@ -217,84 +219,6 @@ class TransactionManager(EventQueue):
except KeyError: except KeyError:
return None return None
def _rebase(self, transaction, ttid, locking_tid=MAX_TID):
# With the default value of locking_tid, this marks the transaction as
# being rebased, in case that the current lock is released (the other
# transaction is aborted or committed) before the client sends us a new
# locking tid: in lockObject, 'locked' will be None but we'll still
# have to delay the store.
transaction.locking_tid = locking_tid
if ttid:
# Remove store locks we have.
# In order to keep all locking data consistent, this must be done
# when the locking tid changes, i.e. from both 'lockObject' (for
# the node that triggered the deadlock) and 'rebase' (for other
# nodes).
for oid, locked in self._store_lock_dict.items():
# If this oid is locked by several transactions (all lockless),
# the following condition is true if we have the highest ttid,
# but in either case, _notifyReplicated will be called below,
# fixing the store lock.
if locked == ttid:
del self._store_lock_dict[oid]
# However here, we don't try to remember lockless writes: later,
# we may give write-locks to oids that would normally conflict.
# Readable cells prevent such scenario to go wrong.
lockless = transaction.lockless
if locking_tid == MAX_TID:
if lockless:
lockless.clear()
self._notifyReplicated()
else:
# There's nothing to rebase for lockless stores to replicating
# partitions because there's no lock taken yet. In other words,
# rebasing such stores would do nothing. Other lockless stores
# become normal ones: this transaction does not block anymore
# replicated partitions from being marked as UP_TO_DATE.
oid = [oid
for oid in lockless
if self.getPartition(oid) not in self._replicating]
if oid:
lockless.difference_update(oid)
self._notifyReplicated()
# Some locks were released, some pending locks may now succeed.
# We may even have delayed stores for this transaction, like the one
# that triggered the deadlock. They must also be sorted again because
# our locking tid has changed.
self.sortAndExecuteQueuedEvents()
def rebase(self, conn, ttid, locking_tid):
self.register(conn, ttid)
transaction = self._transaction_dict[ttid]
if transaction.voted:
raise ProtocolError("TXN %s already voted" % dump(ttid))
# First, get a set copy of serial_dict before _rebase locks oids.
lock_set = set(transaction.serial_dict)
self._rebase(transaction, transaction.locking_tid != MAX_TID and ttid,
locking_tid)
if transaction.locking_tid == MAX_TID:
# New deadlock. There's no point rebasing objects now.
return ()
# We return all oids that can't be relocked trivially
# (the client will use RebaseObject for these oids).
lock_set -= transaction.lockless # see comment in _rebase
recheck_set = lock_set.intersection(self._store_lock_dict)
lock_set -= recheck_set
for oid in lock_set:
try:
serial = transaction.serial_dict[oid]
except KeyError:
# An oid was already being rebased and delayed,
# and it got a conflict during the above call to _rebase.
continue
try:
self.lockObject(ttid, serial, oid)
except ConflictError:
recheck_set.add(oid)
except (DelayEvent, NonReadableCell), e: # pragma: no cover
raise AssertionError(e)
return recheck_set
def vote(self, ttid, txn_info=None): def vote(self, ttid, txn_info=None):
""" """
Store transaction information received from client node Store transaction information received from client node
...@@ -378,15 +302,12 @@ class TransactionManager(EventQueue): ...@@ -378,15 +302,12 @@ class TransactionManager(EventQueue):
# replicate, and we're expected to store it in full. # replicate, and we're expected to store it in full.
# Since there's at least 1 other (readable) cell that will do this # Since there's at least 1 other (readable) cell that will do this
# check, we accept this store/check without taking a lock. # check, we accept this store/check without taking a lock.
if transaction.locking_tid == MAX_TID:
# Deadlock avoidance. Still no new locking_tid from the client.
raise DelayEvent(transaction)
transaction.lockless.add(oid) transaction.lockless.add(oid)
return return
locked = self._store_lock_dict.get(oid) locked = self._store_lock_dict.get(oid)
if locked: if locked:
other = self._transaction_dict[locked] other = self._transaction_dict[locked]
if other < transaction or other.voted: if locked < ttid or other.voted:
# We have a bigger "TTID" than locking transaction, so we are # We have a bigger "TTID" than locking transaction, so we are
# younger: enter waiting queue so we are handled when lock gets # younger: enter waiting queue so we are handled when lock gets
# released. We also want to delay (instead of conflict) if the # released. We also want to delay (instead of conflict) if the
...@@ -399,7 +320,7 @@ class TransactionManager(EventQueue): ...@@ -399,7 +320,7 @@ class TransactionManager(EventQueue):
# but this is not a problem. EventQueue processes them in order # but this is not a problem. EventQueue processes them in order
# and only the last one will not result in conflicts (that are # and only the last one will not result in conflicts (that are
# already resolved). # already resolved).
raise DelayEvent(transaction) raise DelayEvent(ttid)
if oid in transaction.lockless: if oid in transaction.lockless:
# This is a consequence of not having taken a lock during # This is a consequence of not having taken a lock during
# replication. After a ConflictError, we may be asked to "lock" # replication. After a ConflictError, we may be asked to "lock"
...@@ -407,37 +328,34 @@ class TransactionManager(EventQueue): ...@@ -407,37 +328,34 @@ class TransactionManager(EventQueue):
# new transactions. # new transactions.
# For the cluster, we're still out-of-date and like above, # For the cluster, we're still out-of-date and like above,
# at least 1 other (readable) cell checks for conflicts. # at least 1 other (readable) cell checks for conflicts.
# IDEA: If the shared-lock is assigned to us, consider
# reassigning it (hoping it won't be shared anymore),
# and delay (unstoring the previous store may be tricky).
return return
if other is not transaction: if other is not transaction:
# We have a smaller "TTID" than locking transaction, so we are # We have a smaller "TTID" than locking transaction, so we are
# older: this is a possible deadlock case, as we might already # older: this is a possible deadlock case, as we might already
# hold locks the younger transaction is waiting upon. # hold locks the younger transaction is waiting upon.
if oid in other.lockless:
transaction.lockless.add(oid)
return
# Do not notify whenever the older transaction
# retries to lock (executeQueuedEvents).
if oid not in other.deadlock:
other.deadlock.add(oid)
logging.info('Deadlock on %s:%s with %s', logging.info('Deadlock on %s:%s with %s',
dump(oid), dump(ttid), dump(locked)) dump(oid), dump(ttid), dump(locked))
# Ask master to give the client a new locking tid, which will # Ask the older transaction to lock again.
# be used to ask all involved storage nodes to rebase the # But keep the lock for the moment in case it can vote.
# already locked oids for this transaction. self._app.nm.getByUUID(other.uuid).send(
self._app.master_conn.send(Packets.NotifyDeadlock( Packets.NotifyDeadlock(locked, oid))
ttid, transaction.locking_tid)) raise DelayEvent(ttid)
self._rebase(transaction, ttid)
raise DelayEvent(transaction)
# If previous store was an undo, next store must be based on # If previous store was an undo, next store must be based on
# undo target. # undo target.
try:
previous_serial = transaction.store_dict[oid][2] previous_serial = transaction.store_dict[oid][2]
except KeyError:
# Similarly to below for store, cascaded deadlock for
# checkCurrentSerial is possible because rebase() may return
# oids for which previous rebaseObject are delayed, or being
# received, and the client will bindly resend them.
assert oid in transaction.serial_dict, transaction
logging.info('Transaction %s checking %s more than once',
dump(ttid), dump(oid))
return True
if previous_serial is None: if previous_serial is None:
# 2 valid cases: # The only valid case is when the previous undo resulted in a
# - the previous undo resulted in a resolved conflict # resolved conflict.
# - cascaded deadlock resolution
# Otherwise, this should not happen. For example, when being # Otherwise, this should not happen. For example, when being
# disconnected by the master because we missed a transaction, # disconnected by the master because we missed a transaction,
# a conflict may happen after a first store to us, but the # a conflict may happen after a first store to us, but the
...@@ -447,9 +365,6 @@ class TransactionManager(EventQueue): ...@@ -447,9 +365,6 @@ class TransactionManager(EventQueue):
logging.info('Transaction %s storing %s more than once', logging.info('Transaction %s storing %s more than once',
dump(ttid), dump(oid)) dump(ttid), dump(oid))
return True return True
elif transaction.locking_tid == MAX_TID:
# Deadlock avoidance. Still no new locking_tid from the client.
raise DelayEvent(transaction)
else: else:
try: try:
previous_serial = self._app.dm.getLastObjectTID(oid) previous_serial = self._app.dm.getLastObjectTID(oid)
...@@ -517,24 +432,29 @@ class TransactionManager(EventQueue): ...@@ -517,24 +432,29 @@ class TransactionManager(EventQueue):
if not locked: if not locked:
return ZERO_TID return ZERO_TID
def rebaseObject(self, ttid, oid): def relockObject(self, ttid, oid, unlock):
try: try:
transaction = self._transaction_dict[ttid] transaction = self._transaction_dict[ttid]
except KeyError: except KeyError:
logging.info('Forget rebase of %s by %s delayed by %s', logging.info('Forget relock of %s by %s delayed by %s',
dump(oid), dump(ttid), dump(self.getLockingTID(oid))) dump(oid), dump(ttid), dump(self.getLockingTID(oid)))
return return
try: try:
serial = transaction.serial_dict[oid] serial = transaction.serial_dict[oid]
except KeyError: except KeyError:
# There was a previous rebase for this oid, it was still delayed # This can happen when a partition is dropped.
# during the second RebaseTransaction, and then a conflict was logging.info("no oid %s to relock for transaction %s",
# reported when another transaction was committed.
# This can also happen when a partition is dropped.
logging.info("no oid %s to rebase for transaction %s",
dump(oid), dump(ttid)) dump(oid), dump(ttid))
return return
assert oid not in transaction.lockless, (oid, transaction.lockless) assert oid not in transaction.lockless, (oid, transaction.lockless)
if unlock:
transaction.deadlock.remove(oid)
locked = self._store_lock_dict.pop(oid)
assert locked == ttid, (oid, locked, ttid)
# Now that the lock is released, the younger transaction that triggered
# this deadlock avoidance should be able to lock.
self.executeQueuedEvents()
# And we'll likely be delayed.
try: try:
self.lockObject(ttid, serial, oid) self.lockObject(ttid, serial, oid)
except ConflictError, e: except ConflictError, e:
...@@ -591,8 +511,7 @@ class TransactionManager(EventQueue): ...@@ -591,8 +511,7 @@ class TransactionManager(EventQueue):
try: try:
write_locking_tid = self._store_lock_dict[oid] write_locking_tid = self._store_lock_dict[oid]
except KeyError: except KeyError:
# Lockless store (we are replicating this partition), # Lockless store (we are replicating this partition).
# or unresolved deadlock.
continue continue
if ttid == write_locking_tid: if ttid == write_locking_tid:
del self._store_lock_dict[oid] del self._store_lock_dict[oid]
...@@ -604,11 +523,11 @@ class TransactionManager(EventQueue): ...@@ -604,11 +523,11 @@ class TransactionManager(EventQueue):
if oid in transaction.lockless: if oid in transaction.lockless:
# Several lockless stores for this oid and among them, # Several lockless stores for this oid and among them,
# a higher ttid is still pending. # a higher ttid is still pending.
assert transaction < other, x assert ttid < write_locking_tid, x
# There may remain a single lockless store so we'll need # There may remain a single lockless store so we'll need
# this partition to be checked in _notifyReplicated. # this partition to be checked in _notifyReplicated.
assert self._replicated.get(self.getPartition(oid)), x assert self._replicated.get(self.getPartition(oid)), x
else: # unresolved deadlock else: # delayed relock
assert not locked, x assert not locked, x
# remove the transaction # remove the transaction
del self._transaction_dict[ttid] del self._transaction_dict[ttid]
......
...@@ -26,9 +26,8 @@ AnswerPack(bool) ...@@ -26,9 +26,8 @@ AnswerPack(bool)
AnswerPartitionList(int,int,[[(int,CellStates)]]) AnswerPartitionList(int,int,[[(int,CellStates)]])
AnswerPartitionTable(int,int,[[(int,CellStates)]]) AnswerPartitionTable(int,int,[[(int,CellStates)]])
AnswerPrimary(int) AnswerPrimary(int)
AnswerRebaseObject(?(p64,p64,?(int,bin,bin)))
AnswerRebaseTransaction([p64])
AnswerRecovery(?int,?p64,?p64) AnswerRecovery(?int,?p64,?p64)
AnswerRelockObject(?(p64,p64,?(int,bin,bin)))
AnswerStoreObject(?p64) AnswerStoreObject(?p64)
AnswerStoreTransaction() AnswerStoreTransaction()
AnswerTIDs([p64]) AnswerTIDs([p64])
...@@ -61,9 +60,8 @@ AskPack(p64) ...@@ -61,9 +60,8 @@ AskPack(p64)
AskPartitionList(int,int,?) AskPartitionList(int,int,?)
AskPartitionTable() AskPartitionTable()
AskPrimary() AskPrimary()
AskRebaseObject(p64,p64)
AskRebaseTransaction(p64,p64)
AskRecovery() AskRecovery()
AskRelockObject(p64,p64)
AskStoreObject(p64,p64,int,bin,bin,?p64,?p64) AskStoreObject(p64,p64,int,bin,bin,?p64,?p64)
AskStoreTransaction(p64,bin,bin,bin,[p64]) AskStoreTransaction(p64,bin,bin,bin,[p64])
AskTIDs(int,int,int) AskTIDs(int,int,int)
......
...@@ -36,11 +36,12 @@ from neo.lib.handler import DelayEvent, EventHandler ...@@ -36,11 +36,12 @@ from neo.lib.handler import DelayEvent, EventHandler
from neo.lib import logging from neo.lib import logging
from neo.lib.protocol import (CellStates, ClusterStates, NodeStates, NodeTypes, from neo.lib.protocol import (CellStates, ClusterStates, NodeStates, NodeTypes,
Packets, Packet, uuid_str, ZERO_OID, ZERO_TID, MAX_TID) Packets, Packet, uuid_str, ZERO_OID, ZERO_TID, MAX_TID)
from .. import unpickle_state, Patch, TransactionalResource from .. import Patch, TransactionalResource
from . import ClientApplication, ConnectionFilter, LockLock, NEOCluster, \ from . import ClientApplication, ConnectionFilter, LockLock, NEOCluster, \
NEOThreadedTest, RandomConflictDict, Serialized, ThreadId, with_cluster NEOThreadedTest, RandomConflictDict, Serialized, ThreadId, with_cluster
from neo.lib.util import add64, makeChecksum, p64, u64 from neo.lib.util import add64, makeChecksum, p64, u64
from neo.client.exception import NEOPrimaryMasterLost, NEOStorageError from neo.client.exception import NEOPrimaryMasterLost, NEOStorageError
from neo.client.handlers.storage import _DeadlockPacket
from neo.client.transactions import Transaction from neo.client.transactions import Transaction
from neo.master.handlers.client import ClientServiceHandler from neo.master.handlers.client import ClientServiceHandler
from neo.master.pt import PartitionTable from neo.master.pt import PartitionTable
...@@ -48,7 +49,6 @@ from neo.storage.database import DatabaseFailure ...@@ -48,7 +49,6 @@ from neo.storage.database import DatabaseFailure
from neo.storage.handlers.client import ClientOperationHandler from neo.storage.handlers.client import ClientOperationHandler
from neo.storage.handlers.identification import IdentificationHandler from neo.storage.handlers.identification import IdentificationHandler
from neo.storage.handlers.initialization import InitializationHandler from neo.storage.handlers.initialization import InitializationHandler
from neo.storage.replicator import Replicator
class PCounter(Persistent): class PCounter(Persistent):
value = 0 value = 0
...@@ -279,7 +279,7 @@ class Test(NEOThreadedTest): ...@@ -279,7 +279,7 @@ class Test(NEOThreadedTest):
self.assertEqual(except_list, [DelayEvent]) self.assertEqual(except_list, [DelayEvent])
@with_cluster(storage_count=2, replicas=1) @with_cluster(storage_count=2, replicas=1)
def _testDeadlockAvoidance(self, cluster, scenario): def _testDeadlockAvoidance(self, cluster, scenario, delayed_conflict=None):
except_list = [] except_list = []
delay = threading.Event(), threading.Event() delay = threading.Event(), threading.Event()
ident = get_ident() ident = get_ident()
...@@ -304,6 +304,9 @@ class Test(NEOThreadedTest): ...@@ -304,6 +304,9 @@ class Test(NEOThreadedTest):
delay[c2].wait() delay[c2].wait()
scenario[0] -= 1 scenario[0] -= 1
switch = not scenario[0] switch = not scenario[0]
if c2 and delayed:
f.remove(delayed.pop())
self.assertFalse(delayed)
try: try:
return orig(conn, packet, *args, **kw) return orig(conn, packet, *args, **kw)
finally: finally:
...@@ -324,7 +327,14 @@ class Test(NEOThreadedTest): ...@@ -324,7 +327,14 @@ class Test(NEOThreadedTest):
o1.value += 1 o1.value += 1
o2.value += 2 o2.value += 2
with Patch(TransactionManager, storeObject=onStoreObject), \ delayed = []
if delayed_conflict:
def finish1(*_):
d = f.byPacket(delayed_conflict, lambda _: delayed.append(d))
TransactionalResource(t1, 0, tpc_finish=finish1)
with ConnectionFilter() as f, \
Patch(TransactionManager, storeObject=onStoreObject), \
Patch(MTClientConnection, ask=onAsk): Patch(MTClientConnection, ask=onAsk):
t = self.newThread(t1.commit) t = self.newThread(t1.commit)
t2.commit() t2.commit()
...@@ -343,14 +353,24 @@ class Test(NEOThreadedTest): ...@@ -343,14 +353,24 @@ class Test(NEOThreadedTest):
self.assertEqual(self._testDeadlockAvoidance([2, 4]), self.assertEqual(self._testDeadlockAvoidance([2, 4]),
[DelayEvent, DelayEvent, ConflictError, ConflictError]) [DelayEvent, DelayEvent, ConflictError, ConflictError])
def testDeadlockAvoidance(self): def testDeadlockAvoidance(self, **kw):
# 0: C1 -> S1 # 0: C1 -> S1
# 1: C2 -> S1, S2 (delayed) # 1: C2 -> S1, S2 (delayed)
# 2: C1 -> S2 (deadlock) # 2: C1 -> S2 (deadlock)
# 3: C2 commits # 3: C2 -> S2 (delayed)
# 4: C1 resolves conflict # 4: C1 commits
self.assertEqual(self._testDeadlockAvoidance([1, 3]), # 5: C2 resolves conflict
[DelayEvent, DelayEvent, DelayEvent, ConflictError]) self.assertEqual(self._testDeadlockAvoidance([1, 3], **kw),
[DelayEvent, DelayEvent, ConflictError])
# The following 2 tests cover extra paths when processing
# AnswerRelockObject.
def testDeadlockAvoidanceConflictVsLateRelock(self):
self.testDeadlockAvoidance(delayed_conflict=Packets.AnswerRelockObject)
def testDeadlockAvoidanceRelockVsLateConflict(self):
self.testDeadlockAvoidance(delayed_conflict=Packets.AnswerStoreObject)
@with_cluster() @with_cluster()
def testConflictResolutionTriggered2(self, cluster): def testConflictResolutionTriggered2(self, cluster):
...@@ -2093,7 +2113,11 @@ class Test(NEOThreadedTest): ...@@ -2093,7 +2113,11 @@ class Test(NEOThreadedTest):
self.assertEqual(x.value, 6) self.assertEqual(x.value, 6)
@contextmanager @contextmanager
def thread_switcher(self, threads, order, expected): def thread_switcher(self, threads, order, expected,
# XXX: sched() can be recursive while handling conflicts. In some
# cases, when the test is ending, it seems to work provided
# we ignore a few exceptions.
allow_recurse=False):
self.assertGreaterEqual(len(order), len(expected)) self.assertGreaterEqual(len(order), len(expected))
thread_id = ThreadId() thread_id = ThreadId()
l = [threading.Lock() for l in xrange(len(threads)+1)] l = [threading.Lock() for l in xrange(len(threads)+1)]
...@@ -2103,31 +2127,34 @@ class Test(NEOThreadedTest): ...@@ -2103,31 +2127,34 @@ class Test(NEOThreadedTest):
expected = iter(expected) expected = iter(expected)
def sched(orig, *args, **kw): def sched(orig, *args, **kw):
i = thread_id() i = thread_id()
logging.info('%s: %s%r', i, orig.__name__, args) e = orig.__name__
logging.info('%s: %s%r', i, e, args)
try: try:
x = u64(kw['oid']) e = u64((args[3] if e == '_handlePacket' else kw)['oid'])
except KeyError: except KeyError:
for x in args: for x in args:
if isinstance(x, Packet): if isinstance(x, Packet):
x = type(x).__name__ e = type(x).__name__
break break
else:
x = orig.__name__
try: try:
j = next(order) j = next(order)
except StopIteration: except StopIteration:
end[i].append(x) end[i].append(e)
j = None j = None
try: try:
while 1: while 1:
try:
l.pop().release() l.pop().release()
except threading.ThreadError:
if not allow_recurse:
raise
except IndexError: except IndexError:
pass pass
else: else:
try: try:
self.assertEqual(next(expected), x) self.assertEqual(next(expected), e)
except StopIteration: except StopIteration:
end[i].append(x) end[i].append(e)
try: try:
if callable(j): if callable(j):
with contextmanager(j)(*args, **kw) as j: with contextmanager(j)(*args, **kw) as j:
...@@ -2140,14 +2167,19 @@ class Test(NEOThreadedTest): ...@@ -2140,14 +2167,19 @@ class Test(NEOThreadedTest):
l[j].release() l[j].release()
except threading.ThreadError: except threading.ThreadError:
l[j].acquire() l[j].acquire()
try:
threads[j-1].start() threads[j-1].start()
if x != 'AskStoreTransaction': except RuntimeError:
if not allow_recurse:
raise
l[j].release()
if e != 'AskStoreTransaction':
try: try:
l[i].acquire() l[i].acquire()
except IndexError: except IndexError:
pass pass
def _handlePacket(orig, *args): def _handlePacket(orig, *args):
if isinstance(args[2], Packets.AnswerRebaseTransaction): if args[2] is _DeadlockPacket:
return sched(orig, *args) return sched(orig, *args)
return orig(*args) return orig(*args)
with RandomConflictDict, \ with RandomConflictDict, \
...@@ -2167,184 +2199,71 @@ class Test(NEOThreadedTest): ...@@ -2167,184 +2199,71 @@ class Test(NEOThreadedTest):
t1, c1 = cluster.getTransaction() t1, c1 = cluster.getTransaction()
r = c1.root() r = c1.root()
oids = [] oids = []
for x in 'abcd': for x in 'abc':
r[x] = PCounterWithResolution() r[x] = PCounterWithResolution()
t1.commit() t1.commit()
oids.append(u64(r[x]._p_oid)) oids.append(u64(r[x]._p_oid))
# The test relies on the implementation-defined behavior that ZODB # The test relies on the implementation-defined behavior that ZODB
# processes oids by order of registration. It's also simpler with # processes oids by order of registration. It's also simpler with
# oids from a=1 to d=4. # oids from a=1 to c=3.
self.assertEqual(oids, range(1, 5)) self.assertEqual(oids, [1, 2, 3])
t2, c2 = cluster.getTransaction() t2, c2 = cluster.getTransaction()
t3, c3 = cluster.getTransaction() t3, c3 = cluster.getTransaction()
changes(r, c2.root(), c3.root()) changes(r, c2.root(), c3.root())
threads = map(self.newPausedThread, (t2.commit, t3.commit)) threads = map(self.newPausedThread, (t2.commit, t3.commit))
with self.thread_switcher(threads, order, expected_packets) as end: with self.thread_switcher(threads, order, expected_packets) as end:
t1.commit() t1.commit()
if except2 is None:
threads[0].join() threads[0].join()
else: if except2 is None:
self.assertRaises(except2, threads[0].join)
threads[1].join() threads[1].join()
else:
self.assertRaises(except2, threads[1].join)
t3.begin() t3.begin()
r = c3.root() r = c3.root()
self.assertEqual(expected_values, [r[x].value for x in 'abcd']) self.assertEqual(expected_values, [r[x].value for x in 'abc'])
return dict(end) return dict(end)
def testCascadedDeadlockAvoidanceWithOneStorage1(self): def _testConflictDuringDeadlockAvoidance(self, change):
"""
locking tids: t1 < t2 < t3
1. A2 (t2 stores A)
2. B1, c2 (t2 checks C)
3. A3 (delayed), B3 (delayed), D3 (delayed)
4. C1 -> deadlock: B3
5. d2 -> deadlock: A3
locking tids: t3 < t1 < t2
6. t3 commits
7. t2 rebase: conflicts on A and D
8. t1 rebase: new deadlock on C
9. t2 aborts (D non current)
all locks released for t1, which rebases and resolves conflicts
"""
def changes(r1, r2, r3): def changes(r1, r2, r3):
r1['b'].value += 1 r1['a'].value = 1
r1['c'].value += 2 self.readCurrent(r1['c'])
r2['a'].value += 3 r2['b'].value = 2
self.readCurrent(r2['c']) r2['c'].value = 3
self.readCurrent(r2['d']) r3['b'].value = 4
r3['a'].value += 4 change(r3['a'])
r3['b'].value += 5
r3['d'].value += 6
x = self._testComplexDeadlockAvoidanceWithOneStorage(changes,
(1, 1, 0, 1, 2, 2, 2, 2, 0, 1, 2, 1, 0, 0, 1, 0, 0, 1),
('tpc_begin', 'tpc_begin', 1, 2, 3, 'tpc_begin', 1, 2, 4, 3, 4,
'AskStoreTransaction', 'AskRebaseTransaction',
'AskRebaseTransaction', 'AnswerRebaseTransaction',
'AnswerRebaseTransaction', 'AskRebaseTransaction',
'AnswerRebaseTransaction'),
[4, 6, 2, 6])
try:
x[1].remove(1)
except ValueError:
pass
self.assertEqual(x, {0: [2, 'AskStoreTransaction'], 1: ['tpc_abort']})
def testCascadedDeadlockAvoidanceWithOneStorage2(self):
def changes(r1, r2, r3):
r1['a'].value += 1
r1['b'].value += 2
r1['c'].value += 3
r2['a'].value += 4
r3['b'].value += 5
r3['c'].value += 6
self.readCurrent(r2['c'])
self.readCurrent(r2['d'])
self.readCurrent(r3['d'])
def unlock(orig, *args):
f.remove(rebase)
return orig(*args)
rebase = f.delayAskRebaseTransaction(
Patch(TransactionManager, unlock=unlock))
with ConnectionFilter() as f:
x = self._testComplexDeadlockAvoidanceWithOneStorage(changes,
(0, 1, 1, 0, 1, 2, 2, 2, 2, 0, 1, 2, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1),
('tpc_begin', 1, 'tpc_begin', 1, 2, 3, 'tpc_begin',
2, 3, 4, 3, 4, 'AskStoreTransaction', 'AskRebaseTransaction',
'AskRebaseTransaction', 'AnswerRebaseTransaction'),
[1, 7, 9, 0])
x[0].sort(key=str)
try:
x[1].remove(1)
except ValueError:
pass
self.assertEqual(x, {
0: [2, 3, 'AnswerRebaseTransaction',
'AskRebaseTransaction', 'AskStoreTransaction'],
1: ['AnswerRebaseTransaction','AskRebaseTransaction',
'AnswerRebaseTransaction', 'tpc_abort'],
})
def testCascadedDeadlockAvoidanceOnCheckCurrent(self):
"""Transaction checking an oid more than once
1. t1 < t2
2. t1 deadlocks, t2 gets all locks
3. t2 < t1 < t3
4. t2 finishes: conflict on A, t1 locks C, t3 locks B
5. t1 rebases B -> second deadlock
6. t1 resolves A
7. t3 resolves A -> deadlock, and t1 locks B
8. t1 rebases B whereas it was already locked
"""
def changes(*r):
for r in r:
r['a'].value += 1
self.readCurrent(r['b'])
self.readCurrent(r['c'])
t = []
def vote_t2(*args, **kw):
yield 0
t.append(threading.currentThread())
def tic_t1(*args, **kw):
# Make sure t2 finishes before rebasing B,
# so that B is locked by a newer transaction (t3).
t[0].join()
yield 0
end = self._testComplexDeadlockAvoidanceWithOneStorage(changes, end = self._testComplexDeadlockAvoidanceWithOneStorage(changes,
(0, 1, 1, 0, 1, 1, 0, 0, 2, 2, 2, 2, 1, vote_t2, tic_t1), (1, 2, 2, 2, 1, 1, 2, 0, 0, 1, 2, 0, 2, 1),
('tpc_begin', 1) * 2, [3, 0, 0, 0], None) ('tpc_begin', 'tpc_begin', 'tpc_begin', 2, 1, 2, 3, 2, 1, 3, 3, 1,
self.assertLessEqual(2, end[0].count('AskRebaseTransaction')) 'AskStoreTransaction', 'tpc_abort'),
[1, 2, 3], POSException.ConflictError)
self.assertEqual(end, {1: ['AskStoreTransaction']})
def testFailedConflictOnBigValueDuringDeadlockAvoidance(self): def testFailedConflictOnBigValueDuringDeadlockAvoidance(self):
def changes(r1, r2, r3): @self._testConflictDuringDeadlockAvoidance
r1['b'].value = 1 def change(ob):
r1['d'].value = 2 ob.value = '*' * ob._p_jar.db().storage._cache.max_size
r2['a'].value = '*' * r2._p_jar.db().storage._cache.max_size
r2['b'].value = 3 def testFailedCheckCurrentDuringDeadlockAvoidance(self):
r2['c'].value = 4 self._testConflictDuringDeadlockAvoidance(self.readCurrent)
r3['a'].value = 5
self.readCurrent(r3['c']) @with_cluster(replicas=1, partitions=2)
self.readCurrent(r3['d'])
with ConnectionFilter() as f:
x = self._testComplexDeadlockAvoidanceWithOneStorage(changes,
(1, 1, 1, 2, 2, 2, 1, 2, 2, 0, 0, 1, 1, 1, 0),
('tpc_begin', 'tpc_begin', 1, 2, 'tpc_begin', 1, 3, 3, 4,
'AskStoreTransaction', 2, 4, 'AskRebaseTransaction',
'AnswerRebaseTransaction', 'tpc_abort'),
[5, 1, 0, 2], POSException.ConflictError)
self.assertEqual(x, {0: ['AskStoreTransaction']})
@with_cluster(replicas=1, partitions=4)
def testNotifyReplicated(self, cluster): def testNotifyReplicated(self, cluster):
""" """
Check replication while several concurrent transactions leads to Check replication while several concurrent transactions leads to
conflict resolutions and deadlock avoidances, and in particular the conflict resolutions and deadlock avoidances, and in particular the
handling of write-locks when the storage node is about to notify the handling of write-locks when the storage node is about to notify the
master that partitions are replicated. master that partitions are replicated.
Transactions are committed in the following order:
- t2 About releasing shared write-locks:
- t4, conflict on 'd' - The test was originally written with only t1 & t3, covering only the
- t1, deadlock on 'a' case of not releasing a write-lock that is owned by a higher TTID.
- t3, deadlock on 'b', and 2 conflicts on 'a' - t4 was then added to test the opposite case ('a' on s1 by t3 & t4,
Special care is also taken for the change done by t3 on 'a', to check and t4 finishes first): the write-lock is released but given
that the client resolves conflicts with correct oldSerial: immediately after while checking for replicated partitions.
1. The initial store (a=8) is first delayed by t2.
2. It is then kept aside by the deadlock. At last, t2 was added to trigger a deadlock when t3 is about to vote:
3. On s1, deadlock avoidance happens after t1 stores a=7 and the store - the notification from s0 is ignored
is delayed again. However, it's the contrary on s0, and a conflict - s1 does not notify because 'a' is still lockless
is reported to the client.
4. Second store (a=12) based on t2.
5. t1 finishes and s1 reports the conflict for first store (with t1).
At that point, the base serial of this store is meaningless:
the client only has data for last store (based on t2), and it's its
base serial that must be used. t3 write 15 (and not 19 !).
6. Conflicts for the second store are with t2 and they're ignored
because they're already resolved.
Note that this test method lacks code to enforce some events to happen
in the expected order. Sometimes, the above scenario is not reproduced
entirely, but it's so rare that there's no point in making the code
further complicated.
""" """
s0, s1 = cluster.storage_list s0, s1 = cluster.storage_list
s1.stop() s1.stop()
...@@ -2352,96 +2271,73 @@ class Test(NEOThreadedTest): ...@@ -2352,96 +2271,73 @@ class Test(NEOThreadedTest):
s1.resetNode() s1.resetNode()
t1, c1 = cluster.getTransaction() t1, c1 = cluster.getTransaction()
r = c1.root() r = c1.root()
for x in 'abcd': for x in 'abc':
r[x] = PCounterWithResolution() r[x] = PCounterWithResolution()
t1.commit() t1.commit()
t3, c3 = cluster.getTransaction() t3, c3 = cluster.getTransaction()
r['c'].value += 1 r['a'].value += 1
t1.commit() t1.commit()
r['b'].value += 2 r['a'].value += 2
r['a'].value += 3 r['c'].value += 3
t2, c2 = cluster.getTransaction() t2, c2 = cluster.getTransaction()
r = c2.root() r = c2.root()
r['a'].value += 4 r['a'].value += 4
r['c'].value += 5
r['d'].value += 6
r = c3.root() r = c3.root()
r['c'].value += 7 r['b'].value += 5
r['a'].value += 8 r['c'].value += 6
r['b'].value += 9 r['a'].value += 7
t4, c4 = cluster.getTransaction() t4, c4 = cluster.getTransaction()
r = c4.root() r = c4.root()
r['d'].value += 10 r['a'].value += 8
threads = map(self.newPausedThread, (t2.commit, t3.commit, t4.commit)) threads = map(self.newPausedThread, (t2.commit, t3.commit, t4.commit))
def t3_c(*args, **kw): deadlocks = [(3, False)] # by t1
yield 1 def t3_relock(*args, **kw):
# We want to resolve the conflict before storing A. self.assertPartitionTable(cluster, 'UO|UO')
self.tic()
def t3_resolve(*args, **kw):
self.assertPartitionTable(cluster, 'UO|UO|UO|UO')
f.remove(delay) f.remove(delay)
self.tic() self.tic()
self.assertPartitionTable(cluster, 'UO|UO|UU|UO') self.assertPartitionTable(cluster, 'UU|UO')
yield yield 0
def t1_rebase(*args, **kw): def t2_store(*args, **kw):
self.tic() self.assertFalse(deadlocks)
self.assertPartitionTable(cluster, 'UO|UU|UU|UO') deadlocks.append((1, True))
yield yield 3
def t3_b(*args, **kw): def t4_vote(*args, **kw):
yield 1
self.tic()
self.assertPartitionTable(cluster, 'UO|UU|UU|UU')
def t4_d(*args, **kw):
self.tic()
self.assertPartitionTable(cluster, 'UU|UU|UU|UU')
yield 2 yield 2
# Delay the conflict for the second store of 'a' by t3. f.remove(delayDeadlock)
delay_conflict = {s0.uuid: [1], s1.uuid: [1,0]} self.assertFalse(deadlocks)
def delayConflict(conn, packet): def delayDeadlock(conn, packet):
app = self.getConnectionApp(conn) if isinstance(packet, Packets.NotifyDeadlock):
if (isinstance(packet, Packets.AnswerStoreObject) self.assertEqual(self.getConnectionApp(conn).uuid, s0.uuid)
and packet._args[0]): oid, delay = deadlocks.pop()
conn, = cluster.client.getConnectionList(app) self.assertEqual(u64(packet._args[1]), oid)
kw = conn._handlers._pending[0][0][packet._id][1] return delay
return 1 == u64(kw['oid']) and delay_conflict[app.uuid].pop()
def writeA(orig, txn_context, oid, serial, data):
if u64(oid) == 1:
value = unpickle_state(data)['value']
if value > 12:
f.remove(delayConflict)
elif value == 12:
f.add(delayConflict)
return orig(txn_context, oid, serial, data)
###
with ConnectionFilter() as f, \ with ConnectionFilter() as f, \
Patch(cluster.client, _store=writeA), \
self.thread_switcher(threads, self.thread_switcher(threads,
(1, 2, 3, 0, 1, 0, 2, t3_c, 1, 3, 2, t3_resolve, 0, 0, 0, (1, 2, 3, 2, 2, 2, 0, 3, 0, 2, t3_relock, 3,
t1_rebase, 2, t3_b, 3, t4_d, 0, 2, 2), 1, t2_store, t4_vote, 2, 2),
('tpc_begin', 'tpc_begin', 'tpc_begin', 'tpc_begin', ('tpc_begin', 'tpc_begin', 'tpc_begin', 'tpc_begin',
2, 1, 1, 3, 3, 4, 4, 3, 1, 2, 3, 1, 1, 1, 3, 3, 'AskStoreTransaction',
'AskRebaseTransaction', 'AskRebaseTransaction', 1, 1, 'AskStoreTransaction'), allow_recurse=True) as end:
'AnswerRebaseTransaction', 'AnswerRebaseTransaction', 2
)) as end:
delay = f.delayAskFetchTransactions() delay = f.delayAskFetchTransactions()
f.add(delayDeadlock)
s1.start() s1.start()
self.tic() self.tic()
t1.commit() t1.commit()
for t in threads: for t in threads:
t.join() t.join()
t4.begin() t4.begin()
self.assertEqual([15, 11, 13, 16], [r[x].value for x in 'abcd']) self.assertPartitionTable(cluster, 'UU|UU')
self.assertEqual([2, 2], map(end.pop(2).count, self.assertEqual([22, 5, 9], [r[x].value for x in 'abc'])
['AskRebaseTransaction', 'AnswerRebaseTransaction'])) self.assertEqual(end.pop(3), [1])
self.assertEqual(end, { self.assertEqual(sorted(end), [1, 2])
0: [1, 'AskStoreTransaction'],
1: ['AskStoreTransaction'],
3: [4, 'AskStoreTransaction'],
})
self.assertFalse(s1.dm.getOrphanList()) self.assertFalse(s1.dm.getOrphanList())
@with_cluster(replicas=1) @with_cluster(replicas=1)
def testNotifyReplicated2(self, cluster): def testNotifyReplicated2(self, cluster):
"""
See the comment about using 'discard' instead of 'remove'
in TransactionManager._notifyReplicated of the storage node.
"""
s0, s1 = cluster.storage_list s0, s1 = cluster.storage_list
s1.stop() s1.stop()
cluster.join((s1,)) cluster.join((s1,))
...@@ -2452,10 +2348,9 @@ class Test(NEOThreadedTest): ...@@ -2452,10 +2348,9 @@ class Test(NEOThreadedTest):
r[x] = PCounterWithResolution() r[x] = PCounterWithResolution()
t1.commit() t1.commit()
r['a'].value += 1 r['a'].value += 1
r['b'].value += 2
t2, c2 = cluster.getTransaction() t2, c2 = cluster.getTransaction()
r = c2.root() r = c2.root()
r['a'].value += 3 r['a'].value += 2
r['b'].value += 4 r['b'].value += 4
thread = self.newPausedThread(t2.commit) thread = self.newPausedThread(t2.commit)
def t2_b(*args, **kw): def t2_b(*args, **kw):
...@@ -2464,17 +2359,15 @@ class Test(NEOThreadedTest): ...@@ -2464,17 +2359,15 @@ class Test(NEOThreadedTest):
self.tic() self.tic()
self.assertPartitionTable(cluster, 'UO') self.assertPartitionTable(cluster, 'UO')
yield 0 yield 0
def t2_vote(*args, **kw): def t1_vote(*args, **kw):
self.tic() self.tic()
self.assertPartitionTable(cluster, 'UU') self.assertPartitionTable(cluster, 'UO')
yield 0 yield 1
with ConnectionFilter() as f, \ with ConnectionFilter() as f, \
self.thread_switcher((thread,), self.thread_switcher((thread,),
(1, 0, 1, 1, t2_b, 0, 0, 1, t2_vote, 0, 0), (1, 0, 1, 1, t2_b, t1_vote, 1, 1),
('tpc_begin', 'tpc_begin', 1, 1, 2, 2, ('tpc_begin', 'tpc_begin', 1, 1, 2, 'AskStoreTransaction',
'AskRebaseTransaction', 'AskRebaseTransaction', 1, 'AskStoreTransaction',
'AskStoreTransaction',
'AnswerRebaseTransaction', 'AnswerRebaseTransaction',
)) as end: )) as end:
delay = f.delayAskFetchTransactions() delay = f.delayAskFetchTransactions()
s1.start() s1.start()
...@@ -2482,164 +2375,39 @@ class Test(NEOThreadedTest): ...@@ -2482,164 +2375,39 @@ class Test(NEOThreadedTest):
t1.commit() t1.commit()
thread.join() thread.join()
t2.begin() t2.begin()
self.assertEqual([4, 6], [r[x].value for x in 'ab']) self.assertPartitionTable(cluster, 'UU')
self.assertEqual([3, 4], [r[x].value for x in 'ab'])
self.assertFalse(end)
@with_cluster(replicas=1, partitions=2) @with_cluster(replicas=1)
def testNotifyReplicated3(self, cluster): def testLateLocklessWrite(self, cluster):
s0, s1 = cluster.storage_list s0, s1 = cluster.storage_list
t1, c1 = cluster.getTransaction()
r = c1.root()
r[''] = PCounter()
t1.commit()
s1.stop() s1.stop()
cluster.join((s1,)) cluster.join((s1,))
s1.resetNode() s1.resetNode()
nonlocal_ = [0]
class Abort(Exception):
pass
with cluster.newClient(1) as db, Patch(Replicator,
_nextPartitionSortKey=lambda orig, self, offset: offset):
t3, c3 = cluster.getTransaction(db)
with ConnectionFilter() as f, self.noConnection(c3, s0):
@f.delayAnswerFetchObjects
def delay(_):
if nonlocal_:
return nonlocal_.pop()
s1.start()
self.tic()
r[''].value += 1
r._p_changed = 1
t2, c2 = cluster.getTransaction()
c2.root()._p_changed = 1
def t1_rebase(*args, **kw):
self.tic()
f.remove(delay)
yield 0
@self.newPausedThread
def commit23():
t2.commit()
c3.root()[''].value += 3
with self.assertRaises(Abort) as cm:
t3.commit()
self.assertTrue(*cm.exception.args)
def t3_commit(txn):
# Check that the storage hasn't answered the store yet,
# which means that a lock is still held for r[''] by t1.
self.tic()
try:
txn = txn.data(c3)
except (AttributeError, KeyError): # BBB: ZODB < 5
pass
txn_context = db.storage.app._txn_container.get(txn)
raise Abort(txn_context.queue.empty())
TransactionalResource(t3, 1, commit=t3_commit)
with self.thread_switcher((commit23,),
(1, 1, 0, 0, t1_rebase, 0, 0, 0, 1, 1, 1, 1, 0),
('tpc_begin', 'tpc_begin', 0, 1, 0,
'AskRebaseTransaction', 'AskRebaseTransaction',
'AnswerRebaseTransaction', 'AnswerRebaseTransaction',
'AskStoreTransaction', 'tpc_begin', 1, 'tpc_abort',
)) as end:
self.assertRaises(POSException.ConflictError, t1.commit)
commit23.join()
self.assertEqual(end, {0: ['tpc_abort']})
self.assertPartitionTable(cluster, 'UU|UU')
@with_cluster(partitions=2, storage_count=2)
def testConflictAfterLocklessWrite(self, cluster):
"""
Show that in case of a write to an outdated cell, the client must
discard the answer if it comes after a resolved conflict, as the client
would not have the data anymore to solve a second conflict (deadlock
avoidance). This test reproduces a case where the storage node can't
easily return the correct data back to the client.
The scenario focuses on object A (oid 1) and node s0, which is
initially replicating partition 1:
1. t1 writes A: s1 conflicts and the answer from s0 is delayed
2. t1 writes B: a deadlock is triggered by s0 and internally, the write
of A is not considered lockless anymore
3. replication of partition 1 finishes: A is marked as locked normally
(which can be considered wrong, but discarding the write at that
point is not trivial and anyway another write is coming)
4. t1 resolves A: s1 is not yet notified of the deadlock and accepts
5. t1 receives the answer for the first write of A to s1: if it didn't
discard it, it would mark the write of A as completed on all nodes
6. t1 starts resolving the deadlock, s0 conflicts for the second store
and replies that A needs to be rebased (like in 3, questionable)
7. the answer of s0 for the rebasing of A contains data from the first
write and it is processed first: this is not an issue if the client
still has the data (i.e. not moved to transaction cache, or
discarded because the cache is too small)
"""
t1, c1 = cluster.getTransaction() t1, c1 = cluster.getTransaction()
r = c1.root() ob = c1.root()[''] = PCounterWithResolution()
for x in 'ab':
r[x] = PCounterWithResolution()
t1.commit() t1.commit()
cluster.neoctl.setNumReplicas(1) ob.value += 1
self.tic() t2, c2 = cluster.getTransaction()
s0, s1 = cluster.sortStorageList() ob = c2.root()['']
t1, c1 = cluster.getTransaction() ob.value += 2
r = c1.root()
r['a'].value += 1
r['b'].value += 2
with cluster.newClient(1) as db, ConnectionFilter() as f:
delayReplication = f.delayAnswerFetchObjects()
delayStore = f.delayAnswerStoreObject(lambda conn:
conn.uuid == cluster.client.uuid and
self.getConnectionApp(conn) is s0)
delayDeadlock = f.delayNotifyDeadlock()
delayRebase = f.delayAnswerRebaseObject(lambda conn:
# to first process the one from s0
self.getConnectionApp(conn) is s1)
cluster.neoctl.tweakPartitionTable()
self.tic()
t2, c2 = cluster.getTransaction(db)
r = c2.root()
r['a'].value += 3 # for a first conflict on t1/s1
t2.commit() t2.commit()
r['b'].value += 4 # for a deadlock on t1/s0 def resolve(orig, *args):
r['a'].value += 5 # for a second conflict on t1/s0 f.remove(d)
def t1_b(*args, **kw): return orig(*args)
self.tic() # make sure t2/b will be processed before t1/b with ConnectionFilter() as f, \
yield 0 Patch(PCounterWithResolution, _p_resolveConflict=resolve):
def t1_resolve(*args, **kw): f.delayAskFetchTransactions()
f.remove(delayReplication) d = f.delayAnswerStoreObject(lambda conn:
self.tic() self.getConnectionApp(conn) is s1)
yield 1 s1.start()
f.remove(delayStore)
self.tic() self.tic()
f.remove(delayDeadlock)
def t2_vote(*args, **kw):
yield 0
# From now on, prefer reading from s0,
# in case packets from s1 are blocked by the filter.
no_read.append(s1.uuid)
def t1_end(*args, **kw):
yield 0
f.remove(delayRebase)
commit2 = self.newPausedThread(t2.commit)
no_read = []
with cluster.client.extraCellSortKey(
lambda cell: cell.getUUID() in no_read), \
self.thread_switcher((commit2,),
(1, 1, 0, 0, t1_b, t1_resolve, 0, 0, 0, 0, 1, t2_vote, t1_end),
('tpc_begin', 'tpc_begin', 2, 1, 2, 1, 1,
'AskRebaseTransaction', 'AskRebaseTransaction',
'AnswerRebaseTransaction', 'AnswerRebaseTransaction',
'AskStoreTransaction')) as end:
t1.commit() t1.commit()
commit2.join() t2.begin()
t1.begin() self.tic()
r = c1.root() self.assertEqual(ob.value, 3)
self.assertEqual(r['a'].value, 9) self.assertPartitionTable(cluster, 'UU')
self.assertEqual(r['b'].value, 6)
t1 = end.pop(0)
self.assertEqual(t1.pop(), 'AskStoreTransaction')
self.assertEqual(sorted(t1), [1, 2])
self.assertFalse(end)
self.assertPartitionTable(cluster, 'UU|UU')
@with_cluster(start_cluster=0, storage_count=2, replicas=1) @with_cluster(start_cluster=0, storage_count=2, replicas=1)
def testLocklessWriteDuringConflictResolution(self, cluster): def testLocklessWriteDuringConflictResolution(self, cluster):
...@@ -2651,39 +2419,35 @@ class Test(NEOThreadedTest): ...@@ -2651,39 +2419,35 @@ class Test(NEOThreadedTest):
work in itself, the code was hazardous and the client can't easily work in itself, the code was hazardous and the client can't easily
discard such "positive" answers, or process them early enough. discard such "positive" answers, or process them early enough.
The scenario focuses on transaction t1 (storing oid 1) The scenario focuses on transaction t2 (storing oid 1)
and node s1 (initially replicating the only partition): and node s1 (initially replicating the only partition):
1. t1 stores: conflict on s0, lockless write on s1 1. t2 stores: conflict on s0, lockless write on s1
2. t2 stores: locked on s0, lockless write on s1 2. t1 stores: locked on s0, lockless write on s1
3. t1 resolves: deadlock on s0, packet to s1 delayed 3. t2 resolves: waiting for write-lock on s0, packet to s1 delayed
4. t2 commits: on s1, a single lockless write remains and the storage 4. Partition 0 replicated. Multiple lockless writes block notification
node notifies the master that it is UP_TO_DATE to master.
5. s1 receives the second store from t1: answer delayed 5. t1 commits: t2 conflicts on s0, a single lockless write remains
6. t2 begins a new transaction on s1, and s1 notifies the master that it is UP_TO_DATE
7. t1 resolves the deadlock: conflict on s0, s1 asks to rebase 6. s1 receives the second store from t2: answer delayed
8. t2 stores and vote 7. while t2 is resolving the conflict: answer from s1 processed, s0 down
9. s0 down 8. conflict on s1
10. while t2 finishes, t1 starts solving the conflict and due to the 9. t2 resolves a last time and votes
way packets are processed, it proceeds as follows:
a. answer in step 5 is received but not processed
b. data asked, but not for the last serial
(so there will be yet another conflict to solve)
c. new data is stored: client waiting for s1
d. answer in step 5 is processed
What happened before: What happened before:
4. t1 still has the lock 5. t1 still has the lock
5. locked ("Transaction storing more than once") 6. locked ("Transaction storing more than once")
10d. store considered successful, and the data won't be there anymore 9. t2 already processed a successful store from s1 and crashes
for the actual conflict (KeyError in Transaction.written)
-> assertion failure
Now that the storage nodes discard lockless writes that actually Now that the storage nodes discard lockless writes that actually
conflict: conflict:
4. t1 does not have the lock anymore 5. t1 does not have the lock anymore
5. conflict 6. conflict
10d. ignored (conflict already resolved) 9. just after vote, t2 aborts because the only storage node that's
-> transaction aborted normally RUNNING from the beginning got lockless writes (note that the client
currently tracks them as a list of partitions: if it was a list
of oids, it could finish the transaction because oid 1 ended up
being stored normally)
""" """
s0, s1 = cluster.storage_list s0, s1 = cluster.storage_list
cluster.start(storage_list=(s0,)) cluster.start(storage_list=(s0,))
...@@ -2692,13 +2456,14 @@ class Test(NEOThreadedTest): ...@@ -2692,13 +2456,14 @@ class Test(NEOThreadedTest):
t1.commit() t1.commit()
x1.value += 1 x1.value += 1
with cluster.newClient(1) as db, ConnectionFilter() as f: with cluster.newClient(1) as db, ConnectionFilter() as f:
client = db.storage.app
delayReplication = f.delayAnswerFetchObjects() delayReplication = f.delayAnswerFetchObjects()
delayed = [] delayed = []
delayStore = f.delayAskStoreObject(lambda conn: delayStore = f.delayAskStoreObject(lambda conn:
conn.uuid in delayed and conn.uuid in delayed and
self.getConnectionApp(conn) is cluster.client) self.getConnectionApp(conn) is client)
delayStored = f.delayAnswerStoreObject(lambda conn: delayStored = f.delayAnswerStoreObject(lambda conn:
conn.uuid == cluster.client.uuid and conn.uuid == client.uuid and
self.getConnectionApp(conn).uuid in delayed) self.getConnectionApp(conn).uuid in delayed)
def load(orig, oid, at_tid, before_tid): def load(orig, oid, at_tid, before_tid):
if delayed: if delayed:
...@@ -2716,38 +2481,36 @@ class Test(NEOThreadedTest): ...@@ -2716,38 +2481,36 @@ class Test(NEOThreadedTest):
t2, c2 = cluster.getTransaction(db) t2, c2 = cluster.getTransaction(db)
x2 = c2.root()[''] x2 = c2.root()['']
x2.value += 2 x2.value += 2
t2.commit() t1.commit()
x2.value += 4 x1.value += 4
def tic1(*args, **kw): def tic2(*args, **kw):
yield 1 yield 0
self.tic() self.tic()
def t1_resolve(*args, **kw): def t2_resolve(*args, **kw):
delayed.append(s1.uuid) delayed.append(s1.uuid)
f.remove(delayReplication) f.remove(delayReplication)
self.tic() self.tic()
yield 1
self.tic()
def t2_begin(*args, **kw):
f.remove(delayStore)
yield 0 yield 0
@self.newPausedThread self.tic()
def commit2(): commit2 = self.newPausedThread(t2.commit)
t2.commit() with Patch(client, _loadFromStorage=load) as p, \
x2.value += 8
t2.commit()
with Patch(cluster.client, _loadFromStorage=load) as p, \
self.thread_switcher((commit2,), self.thread_switcher((commit2,),
(1, 0, tic1, 0, t1_resolve, 1, t2_begin, 0, 1, 1, 0), (1, 1, tic2, 1, t2_resolve, 1, 1, 1),
('tpc_begin', 'tpc_begin', 1, 1, 1, 'AskStoreTransaction', ('tpc_begin', 'tpc_begin', 1, 1, 1, 'AskStoreTransaction',
'tpc_begin', 'AskRebaseTransaction', 'AskRebaseTransaction',
1, 'AskStoreTransaction')) as end: 1, 'AskStoreTransaction')) as end:
t1.commit()
f.remove(delayStore)
self.assertRaisesRegexp(NEOStorageError, self.assertRaisesRegexp(NEOStorageError,
'^partition 0 not fully write-locked$', '^partition 0 not fully write-locked$',
t1.commit) commit2.join)
commit2.join() t2.begin()
t1.begin() self.assertEqual(x2.value, 5)
self.assertEqual(x1.value, 14)
self.assertPartitionTable(cluster, 'OU') self.assertPartitionTable(cluster, 'OU')
self.assertEqual(end, {1: [
'AskVoteTransaction', # no such packet sent in reality
# (the fact that it appears here is only an artefact
# of how thread_switcher interacts with an implementation detail)
'tpc_abort']})
@with_cluster(partitions=2, storage_count=2) @with_cluster(partitions=2, storage_count=2)
def testUnstore(self, cluster): def testUnstore(self, cluster):
...@@ -2803,132 +2566,6 @@ class Test(NEOThreadedTest): ...@@ -2803,132 +2566,6 @@ class Test(NEOThreadedTest):
self.assertEqual(end, {0: [2, 2, 'AskStoreTransaction']}) self.assertEqual(end, {0: [2, 2, 'AskStoreTransaction']})
self.assertFalse(s1.dm.getOrphanList()) self.assertFalse(s1.dm.getOrphanList())
@with_cluster(storage_count=2, partitions=2)
def testDeadlockAvoidanceBeforeInvolvingAnotherNode(self, cluster):
t1, c1 = cluster.getTransaction()
r = c1.root()
for x in 'abc':
r[x] = PCounterWithResolution()
t1.commit()
r['a'].value += 1
r['c'].value += 2
r['b'].value += 3
t2, c2 = cluster.getTransaction()
r = c2.root()
r['c'].value += 4
r['a'].value += 5
r['b'].value += 6
t = self.newPausedThread(t2.commit)
def t1_b(*args, **kw):
yield 1
self.tic()
with self.thread_switcher((t,), (1, 0, 1, 0, t1_b, 0, 0, 0, 1),
('tpc_begin', 'tpc_begin', 1, 3, 3, 1, 'AskRebaseTransaction',
2, 'AnswerRebaseTransaction')) as end:
t1.commit()
t.join()
t2.begin()
self.assertEqual([6, 9, 6], [r[x].value for x in 'abc'])
self.assertEqual([2, 2], map(end.pop(1).count,
['AskRebaseTransaction', 'AnswerRebaseTransaction']))
# Rarely, there's an extra deadlock for t1:
# 0: ['AnswerRebaseTransaction', 'AskRebaseTransaction',
# 'AskRebaseTransaction', 'AnswerRebaseTransaction',
# 'AnswerRebaseTransaction', 2, 3, 1,
# 'AskStoreTransaction', 'VoteTransaction']
self.assertEqual(end.pop(0)[0], 'AnswerRebaseTransaction')
self.assertFalse(end)
@with_cluster()
def testDelayedStoreOrdering(self, cluster):
"""
By processing delayed stores (EventQueue) in the order of their locking
tid, we minimize the number of deadlocks. Here, we trigger a first
deadlock, so that the delayed check for t1 is reordered after that of
t3.
"""
t1, c1 = cluster.getTransaction()
r = c1.root()
for x in 'abcd':
r[x] = PCounter()
t1.commit()
r['a'].value += 1
self.readCurrent(r['d'])
t2, c2 = cluster.getTransaction()
r = c2.root()
r['b'].value += 1
self.readCurrent(r['d'])
t3, c3 = cluster.getTransaction()
r = c3.root()
r['c'].value += 1
self.readCurrent(r['d'])
threads = map(self.newPausedThread, (t2.commit, t3.commit))
with self.thread_switcher(threads, (1, 2, 0, 1, 2, 1, 0, 2, 0, 1, 2),
('tpc_begin', 'tpc_begin', 'tpc_begin', 1, 2, 3, 4, 4, 4,
'AskRebaseTransaction', 'AskStoreTransaction')) as end:
t1.commit()
for t in threads:
t.join()
self.assertEqual(end, {
0: ['AnswerRebaseTransaction', 'AskStoreTransaction'],
2: ['AskStoreTransaction']})
@with_cluster(replicas=1)
def testConflictAfterDeadlockWithSlowReplica1(self, cluster,
slow_rebase=False):
t1, c1 = cluster.getTransaction()
r = c1.root()
for x in 'ab':
r[x] = PCounterWithResolution()
t1.commit()
r['a'].value += 1
r['b'].value += 2
s1 = cluster.storage_list[1]
with cluster.newClient(1) as db, \
(s1.filterConnection(cluster.client) if slow_rebase else
cluster.client.filterConnection(s1)) as f, \
cluster.client.extraCellSortKey(lambda cell:
cell.getUUID() == s1.uuid):
t2, c2 = cluster.getTransaction(db)
r = c2.root()
r['a'].value += 3
self.readCurrent(r['b'])
t = self.newPausedThread(t2.commit)
def tic_t1(*args, **kw):
yield 0
self.tic()
def tic_t2(*args, **kw):
yield 1
self.tic()
def load(orig, *args, **kw):
f.remove(delayStore)
return orig(*args, **kw)
order = [tic_t2, 0, tic_t2, 1, tic_t1, 0, 0, 0, 1, tic_t1, 0]
def t1_resolve(*args, **kw):
yield
f.remove(delay)
if slow_rebase:
order.append(t1_resolve)
delay = f.delayAnswerRebaseObject()
else:
order[-1] = t1_resolve
delay = f.delayAskStoreObject()
with self.thread_switcher((t,), order,
('tpc_begin', 'tpc_begin', 1, 1, 2, 2, 'AskRebaseTransaction',
'AskRebaseTransaction', 'AnswerRebaseTransaction',
'AskStoreTransaction')) as end:
t1.commit()
t.join()
self.assertNotIn(delay, f)
t2.begin()
end[0].sort(key=str)
self.assertEqual(end, {0: [1, 'AnswerRebaseTransaction',
'AskStoreTransaction']})
self.assertEqual([4, 2], [r[x].value for x in 'ab'])
def testConflictAfterDeadlockWithSlowReplica2(self):
self.testConflictAfterDeadlockWithSlowReplica1(True)
@with_cluster(start_cluster=0, master_count=3) @with_cluster(start_cluster=0, master_count=3)
def testElection(self, cluster): def testElection(self, cluster):
m0, m1, m2 = cluster.master_list m0, m1, m2 = cluster.master_list
......