Commit e6193579 authored by Grégory Wisniewski

Add master's recovery manager.

git-svn-id: https://svn.erp5.org/repos/neo/trunk@1592 71dcc9de-d417-0410-9af5-da40c76e7ee4
parent 8ffcab59
@@ -27,17 +27,16 @@ from neo.node import NodeManager
from neo.event import EventManager
from neo.connection import ListeningConnection, ClientConnection
from neo.exception import ElectionFailure, PrimaryFailure, OperationFailure
-from neo.master.handlers import election, identification, secondary, recovery
+from neo.master.handlers import election, identification, secondary
from neo.master.handlers import storage, client, shutdown
from neo.master.handlers import administration
from neo.master.pt import PartitionTable
from neo.master.transactions import TransactionManager
from neo.master.verification import VerificationManager
+from neo.master.recovery import RecoveryManager
from neo.util import dump
from neo.connector import getConnectorHandler

-REQUIRED_NODE_NUMBER = 1

class Application(object):
    """The master node application."""

@@ -74,6 +73,7 @@ class Application(object):
        self.primary = None
        self.primary_master_node = None
        self.cluster_state = None
+        self._startup_allowed = False

        # Generate an UUID for self
        uuid = config.getUUID()

@@ -83,15 +83,12 @@ class Application(object):
        # The last OID.
        self.loid = None

-        # The target node's uuid to request next.
-        self.target_uuid = None

        # election related data
        self.unconnected_master_node_set = set()
        self.negotiating_master_node_set = set()

        self._current_manager = None
-        self._startup_allowed = False

    def run(self):
        """Make sure that the status is sane and start a loop."""

@@ -328,61 +325,6 @@ class Application(object):
            if node is not None and node.isStorage():
                conn.notify(packet)
-    def buildFromScratch(self):
-        nm, em, pt = self.nm, self.em, self.pt
-        logging.debug('creating a new partition table, wait for a storage node')
-        # wait for some empty storage nodes, they are accepted
-        while len(nm.getStorageList()) < REQUIRED_NODE_NUMBER:
-            em.poll(1)
-        # take the first node available
-        node_list = nm.getStorageList()[:REQUIRED_NODE_NUMBER]
-        for node in node_list:
-            node.setRunning()
-        self.broadcastNodesInformation(node_list)
-        # reset ID generators
-        self.loid = '\0' * 8
-        # build the partition with this node
-        pt.setID(pack('!Q', 1))
-        pt.make(node_list)
-
-    def recoverStatus(self):
-        """
-        Recover the status about the cluster. Obtain the last OID, the last
-        TID, and the last Partition Table ID from storage nodes, then get
-        back the latest partition table or make a new table from scratch,
-        if this is the first time.
-        """
-        logging.info('begin the recovery of the status')
-        self.changeClusterState(ClusterStates.RECOVERING)
-        em = self.em
-        self.loid = None
-        self.pt.setID(None)
-        self.target_uuid = None
-
-        # collect the last partition table available
-        while not self._startup_allowed:
-            em.poll(1)
-        logging.info('startup allowed')
-
-        # build a new partition table
-        if self.pt.getID() is None:
-            self.buildFromScratch()
-
-        # collect nodes that are connected but not in the selected partition
-        # table and set them in pending state
-        allowed_node_set = set(self.pt.getNodeList())
-        refused_node_set = set(self.nm.getStorageList()) - allowed_node_set
-        for node in refused_node_set:
-            node.setPending()
-        self.broadcastNodesInformation(refused_node_set)
-
-        logging.debug('cluster starts with loid=%s and this partition table:',
-            dump(self.loid))
-        self.pt.log()

    def provideService(self):
        """
        This is the normal mode for a primary master node. Handle transactions

@@ -438,7 +380,6 @@ class Application(object):
                assert node.isMaster()
                conn.setHandler(handler)

        # If I know any storage node, make sure that they are not in the
        # running state, because they are not connected at this stage.
        for node in nm.getStorageList():

@@ -446,8 +387,7 @@ class Application(object):
                node.setTemporarilyDown()

        # recover the cluster status at startup
-        self.recoverStatus()
+        self.runManager(RecoveryManager)
        while True:
            self.runManager(VerificationManager)
            self.provideService()

@@ -509,9 +449,7 @@ class Application(object):
        # select the storage handler
        client_handler = client.ClientServiceHandler(self)
-        if state == ClusterStates.RECOVERING:
-            storage_handler = recovery.RecoveryHandler(self)
-        elif state == ClusterStates.RUNNING:
+        if state == ClusterStates.RUNNING:
            storage_handler = storage.StorageServiceHandler(self)
        elif self._current_manager is not None:
            storage_handler = self._current_manager.getHandler()

@@ -606,21 +544,7 @@ class Application(object):
    def identifyStorageNode(self, uuid, node):
        state = NodeStates.RUNNING
        handler = None
-        if self.cluster_state == ClusterStates.RECOVERING:
-            # accept storage nodes when recovery is over
-            if uuid is None and self._startup_allowed:
-                logging.info('reject empty storage node')
-                raise protocol.NotReadyError
-            handler = recovery.RecoveryHandler(self)
-        elif self.cluster_state == ClusterStates.VERIFYING:
-            if uuid is None or node is None:
-                # if node is unknown, it has been forgotten when the current
-                # partition table was validated by the admin
-                # Here the uuid is not cleared to allow looking up pending nodes
-                # by uuid from the test framework. It's safe since nodes with a
-                # conflicting UUID are rejected in the identification handler.
-                state = NodeStates.PENDING
-        elif self.cluster_state == ClusterStates.RUNNING:
+        if self.cluster_state == ClusterStates.RUNNING:
            if uuid is None or node is None:
                # same as for verification
                state = NodeStates.PENDING
...
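In the Application class above, the inline recoverStatus() call is replaced by self.runManager(RecoveryManager), the same mechanism already used for VerificationManager, and changeClusterState() now falls back to self._current_manager.getHandler() when selecting the storage handler. The runManager() helper itself is not shown in this diff; the following is only a rough sketch of the pattern the calls above suggest, and its body is an assumption, not the actual implementation:

    # Hypothetical sketch of the runManager() pattern used above (not part of
    # this diff): the manager provides the storage handler while its phase
    # runs, then is cleared so the regular service handlers take over again.
    def runManager(self, manager_klass):
        self._current_manager = manager_klass(self)  # e.g. RecoveryManager(self)
        self._current_manager.run()                  # polls until the phase ends
        self._current_manager = None

The second half of the diff, below, adds the neo.master.recovery module that provides the RecoveryManager class itself.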
@@ -15,14 +15,91 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

+from struct import pack
from neo import logging
-from neo.protocol import Packets, ProtocolError
-from neo.master.handlers import MasterHandler
from neo.util import dump
+from neo.protocol import Packets, ProtocolError, ClusterStates, NodeStates
+from neo.protocol import NotReadyError
+from neo.master.handlers import MasterHandler
+
+REQUIRED_NODE_NUMBER = 1
+
+class RecoveryManager(MasterHandler):
+    """
+    Manage the cluster recovery
+    """
+
+    def __init__(self, app):
+        super(RecoveryManager, self).__init__(app)
+        # The target node's uuid to request next.
+        self.target_uuid = None
+
+    def getHandler(self):
+        return self
+
+    def identifyStorageNode(self, uuid, node):
+        """
+        Returns the handler for storage nodes
+        """
+        if uuid is None and self.app._startup_allowed:
+            logging.info('reject empty storage node')
+            raise NotReadyError
+        return (uuid, NodeStates.RUNNING, self)
+
+    def run(self):
+        """
+        Recover the status about the cluster. Obtain the last OID, the last
+        TID, and the last Partition Table ID from storage nodes, then get
+        back the latest partition table or make a new table from scratch,
+        if this is the first time.
+        """
+        logging.info('begin the recovery of the status')
+        self.app.changeClusterState(ClusterStates.RECOVERING)
+        em = self.app.em
+
+        self.app.loid = None
+        self.app.pt.setID(None)
+
+        # collect the last partition table available
+        while not self.app._startup_allowed:
+            em.poll(1)
+        logging.info('startup allowed')
+
+        # build a new partition table
+        if self.app.pt.getID() is None:
+            self.buildFromScratch()
+
+        # collect nodes that are connected but not in the selected partition
+        # table and set them in pending state
+        allowed_node_set = set(self.app.pt.getNodeList())
+        refused_node_set = set(self.app.nm.getStorageList()) - allowed_node_set
+        for node in refused_node_set:
+            node.setPending()
+        self.app.broadcastNodesInformation(refused_node_set)
+
+        logging.debug('cluster starts with loid=%s and this partition table:',
+            dump(self.app.loid))
+        self.app.pt.log()
-class RecoveryHandler(MasterHandler):
-    """This class deals with events for a recovery phase."""
+    def buildFromScratch(self):
+        nm, em, pt = self.app.nm, self.app.em, self.app.pt
+        logging.debug('creating a new partition table, wait for a storage node')
+        # wait for some empty storage nodes, they are accepted
+        while len(nm.getStorageList()) < REQUIRED_NODE_NUMBER:
+            em.poll(1)
+        # take the first node available
+        node_list = nm.getStorageList()[:REQUIRED_NODE_NUMBER]
+        for node in node_list:
+            node.setRunning()
+        self.app.broadcastNodesInformation(node_list)
+        # reset ID generators
+        self.app.loid = '\0' * 8
+        # build the partition with this node
+        pt.setID(pack('!Q', 1))
+        pt.make(node_list)

    def connectionLost(self, conn, new_state):
        node = self.app.nm.getByUUID(conn.getUUID())

@@ -46,17 +123,17 @@ class RecoveryHandler(MasterHandler):
        self.app.tm.setLastTID(ltid)
        if lptid > pt.getID():
            # something newer
-            app.target_uuid = conn.getUUID()
+            self.target_uuid = conn.getUUID()
            app.pt.setID(lptid)
            conn.ask(Packets.AskPartitionTable([]))

    def answerPartitionTable(self, conn, ptid, row_list):
        uuid = conn.getUUID()
        app = self.app
-        if uuid != app.target_uuid:
+        if uuid != self.target_uuid:
            # If this is not from a target node, ignore it.
            logging.warn('got answer partition table from %s while waiting ' \
-                'for %s', dump(uuid), dump(app.target_uuid))
+                'for %s', dump(uuid), dump(self.target_uuid))
            return
        # load unknown storage nodes
        for offset, row in row_list:
...
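Taken together, answerLastIDs() and answerPartitionTable() above implement a simple selection rule: whichever storage node reports the highest partition table ID becomes the target node, and partition table answers from any other node are ignored. The helper below is only a standalone illustration of that rule; pick_recovery_target() is hypothetical and not part of NEO:

def pick_recovery_target(answers):
    """Return the (uuid, ptid) pair of the node reporting the newest partition
    table, mimicking how answerLastIDs() keeps updating target_uuid.

    answers is an iterable of (uuid, last_ptid) pairs, one per storage node."""
    target_uuid, best_ptid = None, None
    for uuid, ptid in answers:
        if best_ptid is None or ptid > best_ptid:
            target_uuid, best_ptid = uuid, ptid
    return target_uuid, best_ptid

# Example: the node reporting ptid 7 wins and will be asked for its table.
assert pick_recovery_target([('a', 3), ('b', 7), ('c', 5)]) == ('b', 7)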