Lockless stores/checks during replication

7af948cf · Julien Muchembled · b7a5bc99 · 7af948cf · 7af948cf · 7af948cf
Commit 7af948cf authored Jan 04, 2017 by Julien Muchembled
11 changed files
--- a/TODO
+++ b/TODO
@@ -61,7 +61,9 @@
      partitions. Currently, reads succeed because feeding nodes don't delete
      anything while the cluster is operational, for performance reasons:
      deletion of dropped partitions must be reimplemented in a scalable way.
-      (HIGH AVAILABILITY)
+      The same thing happens for writes: storage nodes must discard
+      stores/checks of dropped partitions (in lockObject, that can be done by
+      raising ConflictError(None)). (HIGH AVAILABILITY)

    Storage
    - Use libmysqld instead of a stand-alone MySQL server.

--- a/neo/client/app.py
+++ b/neo/client/app.py
@@ -410,6 +410,8 @@ class Application(ThreadedApplication):
    def store(self, oid, serial, data, version, transaction):
        """Store object."""
        logging.debug('storing oid %s serial %s', dump(oid), dump(serial))
+        if not serial: # BBB
+            serial = ZERO_TID
        self._store(self._txn_container.get(transaction), oid, serial, data)

    def _store(self, txn_context, oid, serial, data, data_serial=None):
@@ -472,7 +474,7 @@ class Application(ThreadedApplication):
                oid, (serial, conflict_serial) = pop_conflict()
            except KeyError:
                return
-            if conflict_serial == ZERO_TID:
+            if conflict_serial == MAX_TID:
              if 1:
                # XXX: disable deadlock avoidance code until it is fixed
                logging.info('Deadlock avoidance on %r:%r',

--- a/neo/client/handlers/storage.py
+++ b/neo/client/handlers/storage.py
@@ -17,7 +17,7 @@
 from ZODB.TimeStamp import TimeStamp

 from neo.lib import logging
-from neo.lib.protocol import ZERO_TID
+from neo.lib.protocol import MAX_TID
 from neo.lib.util import dump
 from neo.lib.exception import NodeNotReady
 from neo.lib.handler import MTEventHandler
@@ -62,10 +62,13 @@ class StorageAnswersHandler(AnswerBaseHandler):
        self.app.setHandlerData(args)

    def answerStoreObject(self, conn, conflict, oid, serial):
+        if not conflict:
+            # Ignore if not locked on storage side.
+            return
        txn_context = self.app.getHandlerData()
        object_stored_counter_dict = txn_context[
            'object_stored_counter_dict'][oid]
-        if conflict:
+        if conflict != serial:
            # Conflicts can not be resolved now because 'conn' is locked.
            # We must postpone the resolution (by queuing the conflict in
            # 'conflict_dict') to avoid any deadlock with another thread that
@@ -76,10 +79,10 @@ class StorageAnswersHandler(AnswerBaseHandler):
            # receive the conflict answer from the first store on S2.
            logging.info('%r report a conflict for %r with %r',
                         conn, dump(oid), dump(conflict))
-            if conflict != ZERO_TID:
+            if conflict != MAX_TID:
                # If this conflict is not already resolved, mark it for
                # resolution.
-                if conflict <= txn_context['resolved_dict'].get(oid, ZERO_TID):
+                if conflict <= txn_context['resolved_dict'].get(oid, ''):
                    return
                if conflict in object_stored_counter_dict:
                    raise NEOStorageError('Storages %s accepted object %s'

--- a/neo/lib/protocol.py
+++ b/neo/lib/protocol.py
@@ -940,10 +940,12 @@ class StoreObject(Packet):
    """
    Ask to store an object. Send an OID, an original serial, a current
    transaction ID, and data. C -> S.
-    Answer if an object has been stored. If an object is in conflict,
-    a serial of the conflicting transaction is returned. In this case,
-    if this serial is newer than the current transaction ID, a client
-    node must not try to resolve the conflict. S -> C.
+    As for IStorage, 'serial' is ZERO_TID for new objects.
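+    The answered 'conflict' value means:
+    - None: stored lockless (the object is not locked on the storage side)
+    - serial (same value as the one sent): stored and locked, no conflict
+    - MAX_TID: deadlock
+    - any other value: conflict (serial of the conflicting transaction)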
    """
    _fmt = PStruct('ask_store_object',
        POID('oid'),

--- a/neo/storage/app.py
+++ b/neo/storage/app.py
@@ -38,13 +38,14 @@ from neo.lib.debug import register as registerLiveDebugger
 class Application(BaseApplication):
    """The storage node application."""

+    tm = None
+
    def __init__(self, config):
        super(Application, self).__init__(
            config.getSSL(), config.getDynamicMasterList())
        # set the cluster name
        self.name = config.getCluster()

-        self.tm = TransactionManager(self)
        self.dm = buildDatabaseManager(config.getAdapter(),
            (config.getDatabase(), config.getEngine(), config.getWait()),
        )
@@ -93,7 +94,8 @@ class Application(BaseApplication):
    def log(self):
        self.em.log()
        self.nm.log()
-        self.tm.log()
+        if self.tm:
+            self.tm.log()
        if self.pt is not None:
            self.pt.log()

@@ -184,6 +186,7 @@ class Application(BaseApplication):
            for conn in self.em.getConnectionList():
                if conn not in (self.listening_conn, self.master_conn):
                    conn.close()
+            self.tm = TransactionManager(self)
            try:
                self.initialize()
                self.doOperation()
@@ -194,6 +197,7 @@ class Application(BaseApplication):
                logging.error('primary master is down: %s', msg)
            finally:
                self.checker = Checker(self)
+            del self.tm

    def connectToPrimary(self):
        """Find a primary master node, and connect to it.
@@ -256,7 +260,6 @@ class Application(BaseApplication):

        # Forget all unfinished data.
        self.dm.dropUnfinishedData()
-        self.tm.reset()

        self.task_queue = task_queue = deque()
        try:

--- a/neo/storage/handlers/client.py
+++ b/neo/storage/handlers/client.py
@@ -72,7 +72,7 @@ class ClientOperationHandler(EventHandler):
    def _askStoreObject(self, conn, oid, serial, compression, checksum, data,
            data_serial, ttid, request_time):
        try:
-            self.app.tm.storeObject(ttid, serial, oid, compression,
+            locked = self.app.tm.storeObject(ttid, serial, oid, compression,
                    checksum, data, data_serial)
        except ConflictError, err:
            # resolvable or not
@@ -93,7 +93,7 @@ class ClientOperationHandler(EventHandler):
                duration = time.time() - request_time
                if duration > SLOW_STORE:
                    logging.info('StoreObject delay: %.02fs', duration)
-            conn.answer(Packets.AnswerStoreObject(None))
+            conn.answer(Packets.AnswerStoreObject(locked))

    def askStoreObject(self, conn, oid, serial,
            compression, checksum, data, data_serial, ttid):
@@ -171,7 +171,7 @@ class ClientOperationHandler(EventHandler):

    def _askCheckCurrentSerial(self, conn, ttid, serial, oid, request_time):
        try:
-            self.app.tm.checkCurrentSerial(ttid, serial, oid)
+            locked = self.app.tm.checkCurrentSerial(ttid, serial, oid)
        except ConflictError, err:
            # resolvable or not
            conn.answer(Packets.AnswerCheckCurrentSerial(err.tid))
@@ -191,7 +191,7 @@ class ClientOperationHandler(EventHandler):
                duration = time.time() - request_time
                if duration > SLOW_STORE:
                    logging.info('CheckCurrentSerial delay: %.02fs', duration)
-            conn.answer(Packets.AnswerCheckCurrentSerial(None))
+            conn.answer(Packets.AnswerCheckCurrentSerial(locked))


 # like ClientOperationHandler but read-only & only for tid <= backup_tid

--- a/neo/storage/handlers/master.py
+++ b/neo/storage/handlers/master.py
@@ -31,8 +31,8 @@ class MasterOperationHandler(BaseMasterHandler):
            dm._setBackupTID(dm.getLastIDs()[0] or ZERO_TID)
            dm.commit()

-    def notifyTransactionFinished(self, conn, *args, **kw):
-        self.app.replicator.transactionFinished(*args, **kw)
+    def notifyTransactionFinished(self, conn, *args):
+        self.app.replicator.transactionFinished(*args)

    def notifyPartitionChanges(self, conn, ptid, cell_list):
        """This is very similar to Send Partition Table, except that

--- a/neo/storage/replicator.py
+++ b/neo/storage/replicator.py
@@ -136,7 +136,7 @@ class Replicator(object):
        app = self.app
        pt = app.pt
        uuid = app.uuid
-        self.partition_dict = p = {}
+        self.partition_dict = {}
        self.replicate_dict = {}
        self.source_dict = {}
        self.ttid_set = set()
@@ -160,8 +160,7 @@ class Replicator(object):
                        p.next_trans = p.next_obj = next_tid
                        p.max_ttid = None
        if outdated_list:
-            self.app.master_conn.ask(Packets.AskUnfinishedTransactions(),
-                                     offset_list=outdated_list)
+            self.app.tm.replicating(outdated_list)

    def notifyPartitionChanges(self, cell_list):
        """This is a callback from MasterOperationHandler."""
@@ -190,8 +189,7 @@ class Replicator(object):
                    p.max_ttid = INVALID_TID
                    added_list.append(offset)
        if added_list:
-            self.app.master_conn.ask(Packets.AskUnfinishedTransactions(),
-                                     offset_list=added_list)
+            self.app.tm.replicating(added_list)
        if abort:
            self.abort()

@@ -326,8 +324,7 @@ class Replicator(object):
        p.next_obj = add64(tid, 1)
        self.updateBackupTID()
        if not p.max_ttid:
-            p = Packets.NotifyReplicationDone(offset, tid)
-            self.app.master_conn.notify(p)
+            self.app.tm.replicated(offset, tid)
        logging.debug("partition %u replicated up to %s from %r",
                      offset, dump(tid), self.current_node)
        self.getCurrentConnection().setReconnectionNoDelay()

--- a/neo/storage/transactions.py
+++ b/neo/storage/transactions.py
--- a/neo/tests/storage/testTransactions.py
+++ b/neo/tests/storage/testTransactions.py
@@ -28,7 +28,7 @@ class TransactionManagerTests(NeoUnitTestBase):
        self.app = Mock()
        # no history
        self.app.dm = Mock({'getObjectHistory': []})
-        self.app.pt = Mock({'isAssigned': True})
+        self.app.pt = Mock({'isAssigned': True, 'getPartitions': 2})
        self.app.em = Mock({'setTimeout': None})
        self.manager = TransactionManager(self.app)


--- a/neo/tests/threaded/test.py
+++ b/neo/tests/threaded/test.py
@@ -33,7 +33,7 @@ from neo.lib.exception import DatabaseFailure, StoppedOperation
 from neo.lib.protocol import CellStates, ClusterStates, NodeStates, Packets, \
    ZERO_OID, ZERO_TID
 from .. import expectedFailure, Patch
-from . import LockLock, NEOThreadedTest, with_cluster
+from . import ConnectionFilter, LockLock, NEOThreadedTest, with_cluster
 from neo.lib.util import add64, makeChecksum, p64, u64
 from neo.client.exception import NEOPrimaryMasterLost, NEOStorageError
 from neo.client.pool import CELL_CONNECTED, CELL_GOOD
@@ -1351,11 +1351,11 @@ class Test(NEOThreadedTest):
        reports a conflict after this conflict was fully resolved with
        another node.
        """
-        def answerStoreObject(orig, conn, conflict, **kw):
-            if not conflict:
+        def answerStoreObject(orig, conn, conflict, oid, serial):
+            if conflict == serial:
                p.revert()
                ll()
-            orig(conn, conflict, **kw)
+            orig(conn, conflict, oid, serial)
        if 1:
            s0, s1 = cluster.storage_list
            t1, c1 = cluster.getTransaction()
@@ -1389,6 +1389,36 @@ class Test(NEOThreadedTest):
        storage.store(oid, None, '*' * storage._cache._max_size, '', txn)
        self.assertRaises(POSException.ConflictError, storage.tpc_vote, txn)

+    @with_cluster(replicas=1)
+    def testConflictWithOutOfDateCell(self, cluster):
+        """
+        C1         S1         S0         C2
+        begin      down                  begin
+                              U <------- commit
+                   up (remaining out-of-date due to suspended replication)
+        store ---> O (stored lockless)
+             `--------------> conflict
+        resolve -> stored lockless
+               `------------> locked
+        committed
+        """
+        s0, s1 = cluster.storage_list
+        t1, c1 = cluster.getTransaction()
+        c1.root()['x'] = x = PCounterWithResolution()
+        t1.commit()
+        s1.stop()
+        cluster.join((s1,))
+        x.value += 1
+        t2, c2 = cluster.getTransaction()
+        c2.root()['x'].value += 2
+        t2.commit()
+        with ConnectionFilter() as f:
+            f.delayAskFetchTransactions()
+            s1.resetNode()
+            s1.start()
+            self.tic()
+            t1.commit()
+

 if __name__ == "__main__":
    unittest.main()