Commit 3e3eab5b authored by Julien Muchembled

Perform DB truncation during recovery, send PT to storages before verification

Currently, the database can only be truncated when leaving backup mode, but
the issue will be the same once neoctl gains a new command to truncate at an
arbitrary tid: we want to be sure that all nodes are truncated before
anything else happens.

Therefore, we no longer send Truncate orders before stopping operation,
because nodes could fail or exit before actually processing them. Truncation
must also happen before asking nodes for their last IDs.

With this commit, if a truncation is requested (see the sketch after this list):
- this is always the first thing done when a storage node connects to the
  primary master during the RECOVERING phase,
- and the cluster does not start automatically if there are missing nodes,
  unless an admin forces it.
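
A minimal sketch of the behaviour described above, assuming hypothetical
stand-ins (RecordingConn, MasterSketch, tuples instead of real NEO packets):

    class RecordingConn(object):
        """Toy connection that records every packet pushed to it."""
        def __init__(self):
            self.sent = []
        def notify(self, packet):
            self.sent.append(packet)
        ask = notify  # answers are irrelevant for this sketch

    class MasterSketch(object):
        def __init__(self, truncate_tid=None):
            self.truncate_tid = truncate_tid
            # A pending truncation disables automatic startup: all storage
            # nodes must be present and truncated before operation resumes.
            self._startup_allowed = truncate_tid is None

        def storageConnected(self, conn):
            # RECOVERING phase: a Truncate order is always the first packet.
            if self.truncate_tid:
                conn.notify(('Truncate', self.truncate_tid))
            conn.ask(('AskLastIDs',))

    conn = RecordingConn()
    MasterSketch(truncate_tid=0x0123456789abcdef).storageConnected(conn)
    assert conn.sent[0][0] == 'Truncate'  # truncation precedes AskLastIDs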

Other changes:
- Connections to storage nodes don't need to be aborted anymore when leaving
  backup mode.
- The master always initiates communication when a storage node identifies
  itself, which simplifies the code and reduces the number of exchanged
  packets (see the sketch below).
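
The sketch below illustrates that last point; ServiceHandlerSketch and the
tuple "packets" are invented for the example and are not NEO's real API:

    class RecordingConn(object):
        def __init__(self):
            self.sent = []
        def notify(self, packet):
            self.sent.append(packet)

    class ServiceHandlerSketch(object):
        """On identification, the master pushes what the storage needs."""
        def __init__(self, node_list, ptid, row_list):
            self.node_list = node_list
            self.ptid = ptid
            self.row_list = row_list

        def connectionCompleted(self, conn, new):
            if new:
                # Instead of answering AskNodeInformation and
                # AskPartitionTable requests from the storage, send both
                # unsolicited: two fewer request/answer round trips.
                conn.notify(('NotifyNodeInformation', self.node_list))
                conn.notify(('SendPartitionTable', self.ptid, self.row_list))

    conn = RecordingConn()
    ServiceHandlerSketch(['S1'], 3, [(0, ())]).connectionCompleted(conn, True)
    assert [p[0] for p in conn.sent] == ['NotifyNodeInformation',
                                         'SendPartitionTable']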
parent 2485f151
@@ -331,6 +331,7 @@ class Application(BaseApplication):
         # machines but must not start automatically: otherwise, each storage
         # node would diverge.
         self._startup_allowed = False
+        self.truncate_tid = None
         try:
             while True:
                 self.runManager(RecoveryManager)
@@ -345,12 +346,10 @@ class Application(BaseApplication):
                 if self.backup_app is None:
                     raise RuntimeError("No upstream cluster to backup"
                                        " defined in configuration")
-                self.backup_app.provideService()
-                # All connections to storages are aborted when leaving
-                # backup mode so restart loop completely (recovery).
-                continue
+                self.truncate_tid = self.backup_app.provideService()
         except OperationFailure:
             logging.critical('No longer operational')
+            self.truncate_tid = None
         node_list = []
         for node in self.nm.getIdentifiedList():
             if node.isStorage() or node.isClient():
@@ -442,7 +441,7 @@ class Application(BaseApplication):
                 continue # keep handler
             if type(handler) is not type(conn.getLastHandler()):
                 conn.setHandler(handler)
-                handler.connectionCompleted(conn)
+                handler.connectionCompleted(conn, new=False)
         self.cluster_state = state

     def getNewUUID(self, uuid, address, node_type):
...
@@ -152,24 +152,18 @@ class BackupApplication(object):
                     assert tid != ZERO_TID
                     logging.warning("Truncating at %s (last_tid was %s)",
                         dump(app.backup_tid), dump(last_tid))
-                    # XXX: We want to go through a recovery phase in order to
-                    #      initialize the transaction manager, but this is only
-                    #      possible if storages already know that we left backup
-                    #      mode. To that purpose, we always send a Truncate packet,
-                    #      even if there's nothing to truncate.
-                    p = Packets.Truncate(tid)
-                    for node in app.nm.getStorageList(only_identified=True):
-                        conn = node.getConnection()
-                        conn.setHandler(handler)
-                        node.setState(NodeStates.TEMPORARILY_DOWN)
-                        # Packets will be sent at the beginning of the recovery
-                        # phase.
-                        conn.notify(p)
-                        conn.abort()
+                    # We will really truncate so do not start automatically
+                    # if there's any missing storage.
+                    app._startup_allowed = False
                 # If any error happened before reaching this line, we'd go back
                 # to backup mode, which is the right mode to recover.
                 del app.backup_tid
-                break
+                # We will go through a recovery phase in order to reset the
+                # transaction manager and this is only possible if storages
+                # already know that we left backup mode. To that purpose, we
+                # always stop operation with a tid, even if there's nothing to
+                # truncate.
+                return tid
         finally:
             del self.primary_partition_dict, self.tid_list
             pt.clearReplicating()
...
@@ -24,6 +24,10 @@ from neo.lib.protocol import (uuid_str, NodeTypes, NodeStates, Packets,
 class MasterHandler(EventHandler):
     """This class implements a generic part of the event handlers."""

+    def connectionCompleted(self, conn, new=None):
+        if new is None:
+            super(MasterHandler, self).connectionCompleted(conn)
+
     def requestIdentification(self, conn, node_type, uuid, address, name):
         self.checkClusterName(name)
         app = self.app

@@ -74,13 +78,16 @@ class MasterHandler(EventHandler):
         conn.answer(Packets.AnswerLastTransaction(
             self.app.getLastTransaction()))

-    def askNodeInformation(self, conn):
+    def _notifyNodeInformation(self, conn):
         nm = self.app.nm
         node_list = []
         node_list.extend(n.asTuple() for n in nm.getMasterList())
         node_list.extend(n.asTuple() for n in nm.getClientList())
         node_list.extend(n.asTuple() for n in nm.getStorageList())
         conn.notify(Packets.NotifyNodeInformation(node_list))
+
+    def askNodeInformation(self, conn):
+        self._notifyNodeInformation(conn)
         conn.answer(Packets.AnswerNodeInformation())

     def askPartitionTable(self, conn):

@@ -95,6 +102,11 @@ DISCONNECTED_STATE_DICT = {
 class BaseServiceHandler(MasterHandler):
     """This class deals with events for a service phase."""

+    def connectionCompleted(self, conn, new):
+        self._notifyNodeInformation(conn)
+        pt = self.app.pt
+        conn.notify(Packets.SendPartitionTable(pt.getID(), pt.getRowList()))
+
     def connectionLost(self, conn, new_state):
         app = self.app
         node = app.nm.getByUUID(conn.getUUID())
...
@@ -20,9 +20,6 @@ from . import MasterHandler
 class ClientServiceHandler(MasterHandler):
     """ Handler dedicated to client during service state """

-    def connectionCompleted(self, conn):
-        pass
-
     def connectionLost(self, conn, new_state):
         # cancel its transactions and forgot the node
         app = self.app
...
@@ -26,7 +26,7 @@ class IdentificationHandler(MasterHandler):
             **kw)
         handler = conn.getHandler()
         assert not isinstance(handler, IdentificationHandler), handler
-        handler.connectionCompleted(conn)
+        handler.connectionCompleted(conn, True)

     def _setupNode(self, conn, node_type, uuid, address, node):
         app = self.app
...
@@ -24,12 +24,13 @@ from . import BaseServiceHandler
 class StorageServiceHandler(BaseServiceHandler):
     """ Handler dedicated to storages during service state """

-    def connectionCompleted(self, conn):
-        # TODO: unit test
+    def connectionCompleted(self, conn, new):
         app = self.app
         uuid = conn.getUUID()
         node = app.nm.getByUUID(uuid)
         app.setStorageNotReady(uuid)
+        if new:
+            super(StorageServiceHandler, self).connectionCompleted(conn, new)
         # XXX: what other values could happen ?
         if node.isRunning():
             conn.notify(Packets.StartOperation(bool(app.backup_tid)))
...
@@ -128,7 +128,10 @@ class RecoveryManager(MasterHandler):
             # broadcast to all so that admin nodes gets informed
             self.app.broadcastNodesInformation([node])

-    def connectionCompleted(self, conn):
+    def connectionCompleted(self, conn, new):
+        tid = self.app.truncate_tid
+        if tid:
+            conn.notify(Packets.Truncate(tid))
         # ask the last IDs to perform the recovery
         conn.ask(Packets.AskLastIDs())
...
@@ -137,9 +137,6 @@ class VerificationManager(BaseServiceHandler):
         self._uuid_set.remove(conn.getUUID())
         self._tid = tid

-    def connectionCompleted(self, conn):
-        pass
-
     def connectionLost(self, conn, new_state):
         self._uuid_set.discard(conn.getUUID())
         super(VerificationManager, self).connectionLost(conn, new_state)
...
@@ -30,7 +30,7 @@ from neo.lib.bootstrap import BootstrapManager
 from .checker import Checker
 from .database import buildDatabaseManager
 from .exception import AlreadyPendingError
-from .handlers import identification, verification, initialization
+from .handlers import identification, initialization
 from .handlers import master, hidden
 from .replicator import Replicator
 from .transactions import TransactionManager
@@ -193,14 +193,11 @@ class Application(BaseApplication):
             self.event_queue = deque()
             self.event_queue_dict = {}
             try:
-                self.verifyData()
                 self.initialize()
                 self.doOperation()
                 raise RuntimeError, 'should not reach here'
             except OperationFailure, msg:
                 logging.error('operation stopped: %s', msg)
-                if self.cluster_state == ClusterStates.STOPPING_BACKUP:
-                    self.dm.setBackupTID(None)
             except PrimaryFailure, msg:
                 logging.error('primary master is down: %s', msg)
             finally:
@@ -247,30 +244,11 @@ class Application(BaseApplication):
         self.pt = PartitionTable(num_partitions, num_replicas)
         self.loadPartitionTable()

-    def verifyData(self):
-        """Verify data under the control by a primary master node.
-        Connections from client nodes may not be accepted at this stage."""
-        logging.info('verifying data')
-        handler = verification.VerificationHandler(self)
-        self.master_conn.setHandler(handler)
-        _poll = self._poll
-        while not self.operational:
-            _poll()
-
     def initialize(self):
-        """ Retreive partition table and node informations from the primary """
         logging.debug('initializing...')
         _poll = self._poll
-        handler = initialization.InitializationHandler(self)
-        self.master_conn.setHandler(handler)
-        # ask node list and partition table
-        self.pt.clear()
-        self.master_conn.ask(Packets.AskNodeInformation())
-        self.master_conn.ask(Packets.AskPartitionTable())
-        while self.master_conn.isPending():
+        self.master_conn.setHandler(initialization.InitializationHandler(self))
+        while not self.operational:
             _poll()
         self.ready = True
         self.replicator.populate()
...
@@ -504,11 +504,9 @@ class DatabaseManager(object):

     def truncate(self, tid):
         assert tid not in (None, ZERO_TID), tid
-        assert self.getBackupTID()
-        self.setBackupTID(None) # XXX
         for partition in xrange(self.getNumPartitions()):
             self._deleteRange(partition, tid)
-        self.commit()
+        self.setBackupTID(None) # this also commits

     def getTransaction(self, tid, all = False):
         """Return a tuple of the list of OIDs, user information,
...
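
The hunk above makes clearing the backup TID the commit point of a
truncation: the per-partition deletions and the marker update become durable
together. A toy SQLite backend illustrating that contract (schema and
semantics invented for the example, not NEO's actual backend):

    import sqlite3

    class ToyBackend(object):
        def __init__(self, num_partitions=4):
            self.conn = sqlite3.connect(':memory:')
            self.conn.execute(
                'CREATE TABLE obj (partition INTEGER, tid INTEGER)')
            self.conn.execute(
                'CREATE TABLE config (name TEXT PRIMARY KEY, value)')
            self.num_partitions = num_partitions

        def getNumPartitions(self):
            return self.num_partitions

        def _deleteRange(self, partition, tid):
            # Drop everything in this partition strictly after `tid`.
            self.conn.execute(
                'DELETE FROM obj WHERE partition=? AND tid>?',
                (partition, tid))

        def setBackupTID(self, tid):
            self.conn.execute(
                'INSERT OR REPLACE INTO config VALUES (?,?)',
                ('backup_tid', tid))
            # "this also commits": deletions issued by truncate() become
            # durable in the same transaction as the marker change.
            self.conn.commit()

        def truncate(self, tid):
            assert tid, tid
            for partition in range(self.getNumPartitions()):
                self._deleteRange(partition, tid)
            self.setBackupTID(None)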
@@ -17,7 +17,7 @@

 from neo.lib import logging
 from neo.lib.handler import EventHandler
 from neo.lib.exception import PrimaryFailure, OperationFailure
-from neo.lib.protocol import uuid_str, NodeStates, NodeTypes
+from neo.lib.protocol import uuid_str, NodeStates, NodeTypes, Packets

 class BaseMasterHandler(EventHandler):
...
@@ -15,24 +15,23 @@
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.

 from . import BaseMasterHandler
-from neo.lib import logging, protocol
+from neo.lib import logging
+from neo.lib.protocol import Packets, ProtocolError, ZERO_TID

 class InitializationHandler(BaseMasterHandler):

     def answerNodeInformation(self, conn):
         pass

-    def answerPartitionTable(self, conn, ptid, row_list):
+    def sendPartitionTable(self, conn, ptid, row_list):
         app = self.app
         pt = app.pt
         pt.load(ptid, row_list, self.app.nm)
         if not pt.filled():
-            raise protocol.ProtocolError('Partial partition table received')
-        logging.debug('Got the partition table:')
-        self.app.pt.log()
+            raise ProtocolError('Partial partition table received')
         # Install the partition table into the database for persistency.
         cell_list = []
-        num_partitions = app.pt.getPartitions()
+        num_partitions = pt.getPartitions()
         unassigned_set = set(xrange(num_partitions))
         for offset in xrange(num_partitions):
             for cell in pt.getCellList(offset):

@@ -46,12 +45,39 @@ class InitializationHandler(BaseMasterHandler):
         app.dm.changePartitionTable(ptid, cell_list, reset=True)

-    def notifyPartitionChanges(self, conn, ptid, cell_list):
-        # XXX: This is safe to ignore those notifications because all of the
-        #      following applies:
-        #      - we first ask for node information, and *then* partition
-        #        table content, so it is possible to get notifyPartitionChanges
-        #        packets in between (or even before asking for node information).
-        #      - this handler will be changed after receiving answerPartitionTable
-        #        and before handling the next packet
-        logging.debug('ignoring notifyPartitionChanges during initialization')
+    def truncate(self, conn, tid):
+        self.app.dm.truncate(tid)
+
+    def askLastIDs(self, conn):
+        app = self.app
+        ltid, _, _, loid = app.dm.getLastIDs()
+        conn.answer(Packets.AnswerLastIDs(
+            loid,
+            ltid,
+            app.pt.getID(),
+            app.dm.getBackupTID()))
+
+    def askPartitionTable(self, conn):
+        pt = self.app.pt
+        conn.answer(Packets.AnswerPartitionTable(pt.getID(), pt.getRowList()))
+
+    def askLockedTransactions(self, conn):
+        conn.answer(Packets.AnswerLockedTransactions(
+            self.app.dm.getUnfinishedTIDDict()))
+
+    def validateTransaction(self, conn, ttid, tid):
+        dm = self.app.dm
+        dm.lockTransaction(tid, ttid)
+        dm.unlockTransaction(tid, ttid)
+
+    def startOperation(self, conn, backup):
+        self.app.operational = True
+        # XXX: see comment in protocol
+        dm = self.app.dm
+        if backup:
+            if dm.getBackupTID():
+                return
+            tid = dm.getLastIDs()[0] or ZERO_TID
+        else:
+            tid = None
+        dm.setBackupTID(tid)
@@ -67,10 +67,5 @@ class MasterOperationHandler(BaseMasterHandler):
             self.app.replicator.backup(tid, {p: a and (a, upstream_name)
                                              for p, a in source_dict.iteritems()})

-    def truncate(self, conn, tid):
-        self.app.replicator.cancel()
-        self.app.dm.truncate(tid)
-        conn.close()
-
     def checkPartition(self, conn, *args):
         self.app.checker(*args)
deleted file: neo/storage/handlers/verification.py
-#
-# Copyright (C) 2006-2015 Nexedi SA
-#
-# This program is free software; you can redistribute it and/or
-# modify it under the terms of the GNU General Public License
-# as published by the Free Software Foundation; either version 2
-# of the License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-from . import BaseMasterHandler
-from neo.lib import logging
-from neo.lib.protocol import Packets, ZERO_TID
-from neo.lib.exception import OperationFailure
-
-class VerificationHandler(BaseMasterHandler):
-    """This class deals with events for a verification phase."""
-
-    def askLastIDs(self, conn):
-        app = self.app
-        ltid, _, _, loid = app.dm.getLastIDs()
-        conn.answer(Packets.AnswerLastIDs(
-            loid,
-            ltid,
-            app.pt.getID(),
-            app.dm.getBackupTID()))
-
-    def askPartitionTable(self, conn):
-        pt = self.app.pt
-        conn.answer(Packets.AnswerPartitionTable(pt.getID(), pt.getRowList()))
-
-    def notifyPartitionChanges(self, conn, ptid, cell_list):
-        """This is very similar to Send Partition Table, except that
-        the information is only about changes from the previous."""
-        app = self.app
-        if ptid <= app.pt.getID():
-            # Ignore this packet.
-            logging.debug('ignoring older partition changes')
-            return
-        # update partition table in memory and the database
-        app.pt.update(ptid, cell_list, app.nm)
-        app.dm.changePartitionTable(ptid, cell_list)
-
-    def startOperation(self, conn, backup):
-        self.app.operational = True
-        # XXX: see comment in protocol
-        dm = self.app.dm
-        if backup:
-            if dm.getBackupTID():
-                return
-            tid = dm.getLastIDs()[0] or ZERO_TID
-        else:
-            tid = None
-        dm.setBackupTID(tid)
-
-    def stopOperation(self, conn):
-        raise OperationFailure('operation stopped')
-
-    def askLockedTransactions(self, conn):
-        conn.answer(Packets.AnswerLockedTransactions(
-            self.app.dm.getUnfinishedTIDDict()))
-
-    def askFinalTID(self, conn, ttid):
-        conn.answer(Packets.AnswerFinalTID(self.app.dm.getFinalTID(ttid)))
-
-    def validateTransaction(self, conn, ttid, tid):
-        dm = self.app.dm
-        dm.lockTransaction(tid, ttid)
-        dm.unlockTransaction(tid, ttid)
@@ -76,7 +76,7 @@ class StorageInitializationHandlerTests(NeoUnitTestBase):
                     (2, ((node_2, CellStates.UP_TO_DATE), (node_3, CellStates.UP_TO_DATE)))]
         self.assertFalse(self.app.pt.filled())
         # send a complete new table and ack
-        self.verification.answerPartitionTable(conn, 2, row_list)
+        self.verification.sendPartitionTable(conn, 2, row_list)
         self.assertTrue(self.app.pt.filled())
         self.assertEqual(self.app.pt.getID(), 2)
         self.assertTrue(list(self.app.dm.getPartitionTable()))
...
@@ -34,7 +34,7 @@ from . import NEOCluster, NEOThreadedTest
 from neo.lib.util import add64, makeChecksum, p64, u64
 from neo.client.exception import NEOStorageError
 from neo.client.pool import CELL_CONNECTED, CELL_GOOD
-from neo.storage.handlers.verification import VerificationHandler
+from neo.storage.handlers.initialization import InitializationHandler

 class PCounter(Persistent):
     value = 0

@@ -1051,8 +1051,9 @@ class Test(NEOThreadedTest):
             p.revert()
             conn.close()
         try:
-            with Patch(cluster.master.pt, make=make), Patch(VerificationHandler,
-                    askPartitionTable=askPartitionTable) as p:
+            with Patch(cluster.master.pt, make=make), \
+                 Patch(InitializationHandler,
+                       askPartitionTable=askPartitionTable) as p:
                 cluster.start()
                 self.assertFalse(p.applied)
         finally:
...