Commit 80038d2f authored by Julien Muchembled

Fix update of partition when cluster is not operational due to a lost storage node

parent 6aa372d9
...@@ -258,10 +258,6 @@ class Application(object): ...@@ -258,10 +258,6 @@ class Application(object):
if selector(node): if selector(node):
node.notify(packet) node.notify(packet)
def outdateAndBroadcastPartition(self):
""" Outdate cell of non-working nodes and broadcast changes """
self.broadcastPartitionChanges(self.pt.outdate())
def broadcastLastOID(self): def broadcastLastOID(self):
oid = self.tm.getLastOID() oid = self.tm.getLastOID()
neo.lib.logging.debug( neo.lib.logging.debug(
......
...@@ -40,14 +40,12 @@ class StorageServiceHandler(BaseServiceHandler): ...@@ -40,14 +40,12 @@ class StorageServiceHandler(BaseServiceHandler):
def nodeLost(self, conn, node): def nodeLost(self, conn, node):
neo.lib.logging.info('storage node lost') neo.lib.logging.info('storage node lost')
assert not node.isRunning(), node.getState() assert not node.isRunning(), node.getState()
app = self.app
if not self.app.pt.operational(): app.broadcastPartitionChanges(app.pt.outdate(node))
if not app.pt.operational():
raise OperationFailure, 'cannot continue operation' raise OperationFailure, 'cannot continue operation'
# this is intentionaly placed after the raise because the last cell in a app.tm.forget(conn.getUUID())
# partition must not oudated to allows a cluster restart. if app.packing is not None:
self.app.outdateAndBroadcastPartition()
self.app.tm.forget(conn.getUUID())
if self.app.packing is not None:
self.answerPack(conn, False) self.answerPack(conn, False)
def askLastIDs(self, conn): def askLastIDs(self, conn):
......
...@@ -291,16 +291,28 @@ class PartitionTable(PartitionTable): ...@@ -291,16 +291,28 @@ class PartitionTable(PartitionTable):
self.log() self.log()
return changed_cell_list return changed_cell_list
def outdate(self): def outdate(self, lost_node=None):
"""Outdate all non-working nodes.""" """Outdate all non-working nodes
cell_list = []
Do not outdate cells of 'lost_node' for partitions it was the last node
to serve. This allows a cluster restart.
"""
change_list = []
for offset, row in enumerate(self.partition_list): for offset, row in enumerate(self.partition_list):
lost = lost_node
cell_list = []
for cell in row: for cell in row:
if not cell.getNode().isRunning() and not cell.isOutOfDate(): if cell.isUpToDate() or cell.isFeeding():
if cell.getNode().isRunning():
lost = None
else :
cell_list.append(cell)
for cell in cell_list:
if cell.getNode() is not lost:
cell.setState(CellStates.OUT_OF_DATE) cell.setState(CellStates.OUT_OF_DATE)
cell_list.append((offset, cell.getUUID(), change_list.append((offset, cell.getUUID(),
CellStates.OUT_OF_DATE)) CellStates.OUT_OF_DATE))
return cell_list return change_list
def getUpToDateCellNodeSet(self): def getUpToDateCellNodeSet(self):
""" """
......
...@@ -189,19 +189,26 @@ class StorageTests(NEOFunctionalTest): ...@@ -189,19 +189,26 @@ class StorageTests(NEOFunctionalTest):
down, the cluster remains up since there is a replica """ down, the cluster remains up since there is a replica """
# populate the two storages # populate the two storages
(started, _) = self.__setup(storage_number=2, replicas=1) started, _ = self.__setup(partitions=3, replicas=1, storage_number=3)
self.neo.expectRunning(started[0]) self.neo.expectRunning(started[0])
self.neo.expectRunning(started[1]) self.neo.expectRunning(started[1])
self.neo.expectRunning(started[2])
self.neo.expectOudatedCells(number=0) self.neo.expectOudatedCells(number=0)
self.__populate()
self.__checkReplicationDone()
self.neo.expectClusterRunning()
# stop one storage and check outdated cells
started[0].stop() started[0].stop()
self.neo.expectOudatedCells(number=10) # Cluster still operational. All cells of first storage should be
# outdated.
self.neo.expectUnavailable(started[0])
self.neo.expectOudatedCells(2)
self.neo.expectClusterRunning() self.neo.expectClusterRunning()
started[1].stop()
# Cluster not operational anymore. Only cells of second storage that
# were shared with the third one should become outdated.
self.neo.expectUnavailable(started[1])
self.neo.expectClusterVerifying()
self.neo.expectOudatedCells(3)
def testVerificationTriggered(self): def testVerificationTriggered(self):
""" Check that the verification stage is executed when a storage node """ Check that the verification stage is executed when a storage node
required to be operationnal is lost, and the cluster come back in required to be operationnal is lost, and the cluster come back in
......
...@@ -192,7 +192,6 @@ class MasterStorageHandlerTests(NeoUnitTestBase): ...@@ -192,7 +192,6 @@ class MasterStorageHandlerTests(NeoUnitTestBase):
# Some shortcuts to simplify test code # Some shortcuts to simplify test code
self.app.pt = Mock({'operational': True}) self.app.pt = Mock({'operational': True})
self.app.outdateAndBroadcastPartition = lambda: None
# Register some transactions # Register some transactions
tm = self.app.tm tm = self.app.tm
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment