Commit 80038d2f authored by Julien Muchembled

Fix update of partition when cluster is not operational due to a lost storage

parent 6aa372d9
......@@ -258,10 +258,6 @@ class Application(object):
if selector(node):
node.notify(packet)
def outdateAndBroadcastPartition(self):
""" Outdate cell of non-working nodes and broadcast changes """
self.broadcastPartitionChanges(self.pt.outdate())
def broadcastLastOID(self):
oid = self.tm.getLastOID()
neo.lib.logging.debug(
......
......@@ -40,14 +40,12 @@ class StorageServiceHandler(BaseServiceHandler):
def nodeLost(self, conn, node):
neo.lib.logging.info('storage node lost')
assert not node.isRunning(), node.getState()
if not self.app.pt.operational():
app = self.app
app.broadcastPartitionChanges(app.pt.outdate(node))
if not app.pt.operational():
raise OperationFailure, 'cannot continue operation'
# this is intentionaly placed after the raise because the last cell in a
# partition must not oudated to allows a cluster restart.
self.app.outdateAndBroadcastPartition()
self.app.tm.forget(conn.getUUID())
if self.app.packing is not None:
app.tm.forget(conn.getUUID())
if app.packing is not None:
self.answerPack(conn, False)
def askLastIDs(self, conn):
......
......@@ -291,16 +291,28 @@ class PartitionTable(PartitionTable):
self.log()
return changed_cell_list
def outdate(self):
"""Outdate all non-working nodes."""
cell_list = []
def outdate(self, lost_node=None):
"""Outdate all non-working nodes
Do not outdate cells of 'lost_node' for partitions it was the last node
to serve. This allows a cluster restart.
"""
change_list = []
for offset, row in enumerate(self.partition_list):
lost = lost_node
cell_list = []
for cell in row:
if not cell.getNode().isRunning() and not cell.isOutOfDate():
if cell.isUpToDate() or cell.isFeeding():
if cell.getNode().isRunning():
lost = None
else :
cell_list.append(cell)
for cell in cell_list:
if cell.getNode() is not lost:
cell.setState(CellStates.OUT_OF_DATE)
cell_list.append((offset, cell.getUUID(),
change_list.append((offset, cell.getUUID(),
CellStates.OUT_OF_DATE))
return cell_list
return change_list
def getUpToDateCellNodeSet(self):
"""
......
......@@ -189,19 +189,26 @@ class StorageTests(NEOFunctionalTest):
down, the cluster remains up since there is a replica """
# populate the two storages
(started, _) = self.__setup(storage_number=2, replicas=1)
started, _ = self.__setup(partitions=3, replicas=1, storage_number=3)
self.neo.expectRunning(started[0])
self.neo.expectRunning(started[1])
self.neo.expectRunning(started[2])
self.neo.expectOudatedCells(number=0)
self.__populate()
self.__checkReplicationDone()
self.neo.expectClusterRunning()
# stop one storage and check outdated cells
started[0].stop()
self.neo.expectOudatedCells(number=10)
# Cluster still operational. All cells of first storage should be
# outdated.
self.neo.expectUnavailable(started[0])
self.neo.expectOudatedCells(2)
self.neo.expectClusterRunning()
started[1].stop()
# Cluster not operational anymore. Only cells of second storage that
# were shared with the third one should become outdated.
self.neo.expectUnavailable(started[1])
self.neo.expectClusterVerifying()
self.neo.expectOudatedCells(3)
def testVerificationTriggered(self):
""" Check that the verification stage is executed when a storage node
required to be operationnal is lost, and the cluster come back in
......
......@@ -192,7 +192,6 @@ class MasterStorageHandlerTests(NeoUnitTestBase):
# Some shortcuts to simplify test code
self.app.pt = Mock({'operational': True})
self.app.outdateAndBroadcastPartition = lambda: None
# Register some transactions
tm = self.app.tm
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment