nexedi / neoppod · Commits · 80038d2f

Commit 80038d2f, authored Jan 10, 2012 by Julien Muchembled
Fix update of the partition table when the cluster is not operational due to a lost storage node
parent 6aa372d9
Showing 5 changed files with 36 additions and 24 deletions.
neo/master/app.py                       +0  -4
neo/master/handlers/storage.py          +5  -7
neo/master/pt.py                        +18 -6
neo/tests/functional/testStorage.py     +13 -6
neo/tests/master/testStorageHandler.py  +0  -1
neo/master/app.py

@@ -258,10 +258,6 @@ class Application(object):
             if selector(node):
                 node.notify(packet)
 
-    def outdateAndBroadcastPartition(self):
-        """ Outdate cell of non-working nodes and broadcast changes """
-        self.broadcastPartitionChanges(self.pt.outdate())
-
     def broadcastLastOID(self):
         oid = self.tm.getLastOID()
         neo.lib.logging.debug(
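The removed method was only a thin wrapper around the partition table. Its caller in the storage handler (next file) now passes the lost node straight to pt.outdate() and broadcasts the result itself; roughly, the inlined pattern is:

    # Inlined replacement for the removed outdateAndBroadcastPartition():
    # outdate the cells of non-working nodes and broadcast the changes.
    app.broadcastPartitionChanges(app.pt.outdate(node))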
neo/master/handlers/storage.py

@@ -40,14 +40,12 @@ class StorageServiceHandler(BaseServiceHandler):
     def nodeLost(self, conn, node):
         neo.lib.logging.info('storage node lost')
         assert not node.isRunning(), node.getState()
-        if not self.app.pt.operational():
+        app = self.app
+        app.broadcastPartitionChanges(app.pt.outdate(node))
+        if not app.pt.operational():
             raise OperationFailure, 'cannot continue operation'
-        # this is intentionaly placed after the raise because the last cell in a
-        # partition must not oudated to allows a cluster restart.
-        self.app.outdateAndBroadcastPartition()
-        self.app.tm.forget(conn.getUUID())
-        if self.app.packing is not None:
+        app.tm.forget(conn.getUUID())
+        if app.packing is not None:
             self.answerPack(conn, False)
 
     def askLastIDs(self, conn):
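The important change is the order of operations: the partition table is now updated and broadcast before the operational check, so the outdated cells reach the other nodes even when losing this storage makes the cluster non-operational, and only then is OperationFailure raised. A minimal sketch of that control flow, with stand-in stubs (StubPT and StubApp are illustrative, not NEO's real classes):

    class OperationFailure(Exception):
        """Stand-in for neo's OperationFailure."""

    class StubPT(object):
        # Pretend the lost storage was required: one cell becomes outdated
        # and the table is no longer operational afterwards.
        def outdate(self, lost_node):
            return [(0, lost_node, 'OUT_OF_DATE')]
        def operational(self):
            return False

    class StubApp(object):
        pt = StubPT()
        def broadcastPartitionChanges(self, cell_list):
            print('broadcasting %r' % (cell_list,))

    def nodeLost(app, node):
        # 1. Always outdate and broadcast first, even if the cluster is
        #    about to leave the running state...
        app.broadcastPartitionChanges(app.pt.outdate(node))
        # 2. ...and only then interrupt operation if required.
        if not app.pt.operational():
            raise OperationFailure('cannot continue operation')

    # nodeLost(StubApp(), 'S1')  # would broadcast the change, then raise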
neo/master/pt.py

@@ -291,16 +291,28 @@ class PartitionTable(PartitionTable):
         self.log()
         return changed_cell_list
 
-    def outdate(self):
-        """Outdate all non-working nodes."""
-        cell_list = []
+    def outdate(self, lost_node=None):
+        """Outdate all non-working nodes
+
+        Do not outdate cells of 'lost_node' for partitions it was the last node
+        to serve. This allows a cluster restart.
+        """
+        change_list = []
         for offset, row in enumerate(self.partition_list):
+            lost = lost_node
+            cell_list = []
             for cell in row:
-                if not cell.getNode().isRunning() and not cell.isOutOfDate():
+                if cell.isUpToDate() or cell.isFeeding():
+                    if cell.getNode().isRunning():
+                        lost = None
+                    else:
+                        cell_list.append(cell)
+            for cell in cell_list:
+                if cell.getNode() is not lost:
                     cell.setState(CellStates.OUT_OF_DATE)
-                    cell_list.append((offset, cell.getUUID(),
+                    change_list.append((offset, cell.getUUID(),
                         CellStates.OUT_OF_DATE))
-        return cell_list
+        return change_list
 
     def getUpToDateCellNodeSet(self):
         """
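The two-pass loop above is the heart of the fix. For each partition, the first pass collects the up-to-date or feeding cells whose node is not running, and resets lost as soon as one such cell belongs to a running node; the second pass outdates the collected cells, except the cell of lost_node when it turns out to be the last node serving the partition, which is left untouched so the cluster can restart from it. A self-contained sketch of the same rule over plain data (Node and the tuple-based rows are simplified stand-ins, not NEO's real Cell and Node classes):

    class Node(object):
        def __init__(self, name, running):
            self.name, self.running = name, running

    def outdate(partitions, lost_node=None):
        # partitions: one row per offset, each row a list of (node, state)
        # cells; returns (offset, node_name, new_state) changes, mimicking
        # PartitionTable.outdate(lost_node).
        change_list = []
        for offset, row in enumerate(partitions):
            lost = lost_node
            cell_list = []
            for node, state in row:
                if state in ('UP_TO_DATE', 'FEEDING'):
                    if node.running:
                        lost = None          # partition still served elsewhere
                    else:
                        cell_list.append(node)
            for node in cell_list:
                if node is not lost:         # spare the last cell of lost_node
                    change_list.append((offset, node.name, 'OUT_OF_DATE'))
        return change_list

    # 3 partitions, 1 replica, 3 storages; s1 already down, s2 just lost.
    s1, s2, s3 = Node('s1', False), Node('s2', False), Node('s3', True)
    rows = [
        [(s1, 'OUT_OF_DATE'), (s2, 'UP_TO_DATE')],  # s2 is the last serving node
        [(s2, 'UP_TO_DATE'), (s3, 'UP_TO_DATE')],
        [(s3, 'UP_TO_DATE'), (s1, 'OUT_OF_DATE')],
    ]
    print(outdate(rows, lost_node=s2))
    # -> [(1, 's2', 'OUT_OF_DATE')]: partition 0 keeps its last up-to-date
    #    cell, so a later cluster restart remains possible.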
neo/tests/functional/testStorage.py

@@ -189,19 +189,26 @@ class StorageTests(NEOFunctionalTest):
         down, the cluster remains up since there is a replica """
         # populate the two storages
-        (started, _) = self.__setup(storage_number=2, replicas=1)
+        started, _ = self.__setup(partitions=3, replicas=1, storage_number=3)
         self.neo.expectRunning(started[0])
         self.neo.expectRunning(started[1])
+        self.neo.expectRunning(started[2])
         self.neo.expectOudatedCells(number=0)
 
         self.__populate()
         self.__checkReplicationDone()
         self.neo.expectClusterRunning()
 
         # stop one storage and check outdated cells
         started[0].stop()
-        self.neo.expectOudatedCells(number=10)
+        # Cluster still operational. All cells of first storage should be
+        # outdated.
+        self.neo.expectUnavailable(started[0])
+        self.neo.expectOudatedCells(2)
         self.neo.expectClusterRunning()
+
+        started[1].stop()
+        # Cluster not operational anymore. Only cells of second storage that
+        # were shared with the third one should become outdated.
+        self.neo.expectUnavailable(started[1])
+        self.neo.expectClusterVerifying()
+        self.neo.expectOudatedCells(3)
 
     def testVerificationTriggered(self):
         """ Check that the verification stage is executed when a storage node
         required to be operationnal is lost, and the cluster come back in
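The expected counts follow from this layout if the 3 partitions with 1 replica each are spread evenly over the 3 storages, i.e. 2 cells per storage (an assumption about the allocation, not something the test asserts): stopping started[0] outdates its 2 cells and the cluster keeps running; stopping started[1] outdates only the cell it shares with the still-running started[2], while its other cell is the last up-to-date copy of its partition and is spared so that a restart stays possible, giving 2 + 1 = 3 outdated cells and a cluster back in the verification state.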
neo/tests/master/testStorageHandler.py

@@ -192,7 +192,6 @@ class MasterStorageHandlerTests(NeoUnitTestBase):
         # Some shortcuts to simplify test code
         self.app.pt = Mock({'operational': True})
-        self.app.outdateAndBroadcastPartition = lambda: None
         # Register some transactions
         tm = self.app.tm