Commit efb4da42 authored by Grégory Wisniewski

Remove dropNode() calls where it's not safe to change the partition table

without giving an admin the choice to discard the node's content. Now a node
can be dropped only when requested by the admin node or when a storage has
changed its address (this case should be checked and discussed).


git-svn-id: https://svn.erp5.org/repos/neo/branches/prototype3@732 71dcc9de-d417-0410-9af5-da40c76e7ee4
parent 2cc58316
...@@ -646,11 +646,15 @@ class Application(object): ...@@ -646,11 +646,15 @@ class Application(object):
for node in nm.getStorageNodeList(): for node in nm.getStorageNodeList():
if node.getState() == TEMPORARILY_DOWN_STATE \ if node.getState() == TEMPORARILY_DOWN_STATE \
and node.getLastStateChange() + expiration < current_time: and node.getLastStateChange() + expiration < current_time:
logging.info('%s is down' % (node, )) logging.warning('%s is down, have to notify the admin' % (node, ))
node.setState(DOWN_STATE) # XXX: here we should notify the administrator that
self.broadcastNodeInformation(node) # a node seems dead and should be dropped frop the
cell_list = self.pt.dropNode(node) # partition table. This should not be done
self.broadcastPartitionChanges(self.pt.setNextID(), cell_list) # automaticaly to avoid data lost.
#node.setState(DOWN_STATE)
#self.broadcastNodeInformation(node)
#cell_list = self.pt.dropNode(node)
#self.broadcastPartitionChanges(self.pt.setNextID(), cell_list)
if not self.pt.operational(): if not self.pt.operational():
# Catastrophic. # Catastrophic.
raise OperationFailure, 'cannot continue operation' raise OperationFailure, 'cannot continue operation'
......
...@@ -42,10 +42,10 @@ class IdentificationEventHandler(MasterEventHandler): ...@@ -42,10 +42,10 @@ class IdentificationEventHandler(MasterEventHandler):
node_by_addr = nm.getNodeByServer(server) node_by_addr = nm.getNodeByServer(server)
def changeNodeAddress(node, server): def changeNodeAddress(node, server):
from copy import copy
if node_type == protocol.STORAGE_NODE_TYPE: if node_type == protocol.STORAGE_NODE_TYPE:
args = (node.getServer(), server) args = (node.getServer(), server)
# remove storage from partition table # remove storage from partition table
# XXX: this should be safe but need to be checked
cell_list = app.pt.dropNode(node) cell_list = app.pt.dropNode(node)
if cell_list: if cell_list:
ptid = app.pt.setNextID() ptid = app.pt.setNextID()
...@@ -70,11 +70,15 @@ class IdentificationEventHandler(MasterEventHandler): ...@@ -70,11 +70,15 @@ class IdentificationEventHandler(MasterEventHandler):
if node.getState() == protocol.RUNNING_STATE: if node.getState() == protocol.RUNNING_STATE:
# still running, reject this new node # still running, reject this new node
raise protocol.ProtocolError('invalid server address') raise protocol.ProtocolError('invalid server address')
# this node has changed its address
node = changeNodeAddress(node, server) node = changeNodeAddress(node, server)
if node_by_uuid is None and node_by_addr is not None: if node_by_uuid is None and node_by_addr is not None:
if node.getState() == protocol.RUNNING_STATE: if node.getState() == protocol.RUNNING_STATE:
# still running, reject this new node # still running, reject this new node
raise protocol.ProtocolError('invalid server address') raise protocol.ProtocolError('invalid server address')
# FIXME: here the node was known with a different uuid but with the
# same address, is it safe to forgot the old, even if he's not
# running ?
node = changeNodeAddress(node, server) node = changeNodeAddress(node, server)
# ask the app the node identification, if refused, an exception is raised # ask the app the node identification, if refused, an exception is raised
......
...@@ -31,7 +31,7 @@ class RecoveryEventHandler(MasterEventHandler): ...@@ -31,7 +31,7 @@ class RecoveryEventHandler(MasterEventHandler):
"""This class deals with events for a recovery phase.""" """This class deals with events for a recovery phase."""
def connectionCompleted(self, conn): def connectionCompleted(self, conn):
# ask the last IDs to perform the the recovery # ask the last IDs to perform the recovery
conn.ask(protocol.askLastIDs()) conn.ask(protocol.askLastIDs())
def connectionClosed(self, conn): def connectionClosed(self, conn):
......
...@@ -99,14 +99,7 @@ class ServiceEventHandler(MasterEventHandler): ...@@ -99,14 +99,7 @@ class ServiceEventHandler(MasterEventHandler):
app.broadcastNodeInformation(node) app.broadcastNodeInformation(node)
if node.getNodeType() == STORAGE_NODE_TYPE: if node.getNodeType() == STORAGE_NODE_TYPE:
if state in (DOWN_STATE, BROKEN_STATE): if state == TEMPORARILY_DOWN_STATE:
# XXX still required to change here ??? who can send
# this kind of message with these status except admin node
cell_list = app.pt.dropNode(node)
if len(cell_list) != 0:
ptid = app.pt.setNextID()
app.broadcastPartitionChanges(ptid, cell_list)
elif state == TEMPORARILY_DOWN_STATE:
cell_list = app.pt.outdate() cell_list = app.pt.outdate()
if len(cell_list) != 0: if len(cell_list) != 0:
ptid = app.pt.setNextID() ptid = app.pt.setNextID()
......
...@@ -103,14 +103,7 @@ class ShutdownEventHandler(ServiceEventHandler): ...@@ -103,14 +103,7 @@ class ShutdownEventHandler(ServiceEventHandler):
logging.debug('broadcasting node information') logging.debug('broadcasting node information')
app.broadcastNodeInformation(node) app.broadcastNodeInformation(node)
if node.getNodeType() == STORAGE_NODE_TYPE: if node.getNodeType() == STORAGE_NODE_TYPE:
if state in (DOWN_STATE, BROKEN_STATE): if state == TEMPORARILY_DOWN_STATE:
# XXX still required to change here ??? who can send
# this kind of message with these status except admin node
cell_list = app.pt.dropNode(node)
if len(cell_list) != 0:
ptid = app.pt.setNextID()
app.broadcastPartitionChanges(ptid, cell_list)
elif state == TEMPORARILY_DOWN_STATE:
cell_list = app.pt.outdate() cell_list = app.pt.outdate()
if len(cell_list) != 0: if len(cell_list) != 0:
ptid = app.pt.setNextID() ptid = app.pt.setNextID()
......
...@@ -33,43 +33,33 @@ class VerificationEventHandler(MasterEventHandler): ...@@ -33,43 +33,33 @@ class VerificationEventHandler(MasterEventHandler):
def connectionCompleted(self, conn): def connectionCompleted(self, conn):
pass pass
def connectionClosed(self, conn): def _dropIt(self, conn, node, new_state):
app = self.app app = self.app
uuid = conn.getUUID() node.setState(new_state)
node = app.nm.getNodeByUUID(uuid) app.broadcastNodeInformation(node)
if not app.pt.operational():
raise VerificationFailure, 'cannot continue verification'
def connectionClosed(self, conn):
node = self.app.nm.getNodeByUUID(conn.getUUID())
if node.getState() == RUNNING_STATE: if node.getState() == RUNNING_STATE:
node.setState(TEMPORARILY_DOWN_STATE) self._dropIt(conn, node, TEMPORARILY_DOWN_STATE)
app.broadcastNodeInformation(node)
if not app.pt.operational():
# Catastrophic.
raise VerificationFailure, 'cannot continue verification'
MasterEventHandler.connectionClosed(self, conn) MasterEventHandler.connectionClosed(self, conn)
def timeoutExpired(self, conn): def timeoutExpired(self, conn):
app = self.app node = self.app.nm.getNodeByUUID(conn.getUUID())
uuid = conn.getUUID()
node = app.nm.getNodeByUUID(uuid)
if node.getState() == RUNNING_STATE: if node.getState() == RUNNING_STATE:
node.setState(TEMPORARILY_DOWN_STATE) self._dropIt(conn, node, TEMPORARILY_DOWN_STATE)
app.broadcastNodeInformation(node)
if not app.pt.operational():
# Catastrophic.
raise VerificationFailure, 'cannot continue verification'
MasterEventHandler.timeoutExpired(self, conn) MasterEventHandler.timeoutExpired(self, conn)
def peerBroken(self, conn): def peerBroken(self, conn):
app = self.app node = self.app.nm.getNodeByUUID(conn.getUUID())
uuid = conn.getUUID()
node = app.nm.getNodeByUUID(uuid)
if node.getState() != BROKEN_STATE: if node.getState() != BROKEN_STATE:
node.setState(BROKEN_STATE) self._dropIt(conn, node, BROKEN_STATE)
app.broadcastNodeInformation(node) # here the node is no more dropped from the partition table anymore
cell_list = app.pt.dropNode(node) # because it's under the responsability of an administrator to
ptid = app.pt.setNextID() # restore the node, backup the node content or drop it definitely
app.broadcastPartitionChanges(ptid, cell_list) # and loose all it's content.
if not app.pt.operational():
# Catastrophic.
raise VerificationFailure, 'cannot continue verification'
MasterEventHandler.peerBroken(self, conn) MasterEventHandler.peerBroken(self, conn)
def handleNotifyNodeInformation(self, conn, packet, node_list): def handleNotifyNodeInformation(self, conn, packet, node_list):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment