Commit 17af3b47 authored by Julien Muchembled

master: fix possibly wrong knowledge of cells' backup_tid when resuming backup

The issue happened when there were commits while the backup cluster was down.
In this case, the master thought that these commits were already replicated,
and reported a wrong backup_tid to neoctl. The issue resolved itself once:
- there were new commits triggering replication for all partitions;
- all storage nodes had really replicated.

This also resulted in an inconsistent database when leaving backup mode during
this period.
parent c95c6c39
......@@ -194,9 +194,8 @@ class BackupApplication(object):
for node in trigger_set:
self.triggerBackup(node)
def invalidatePartitions(self, tid, partition_set):
def invalidatePartitions(self, tid, prev_tid, partition_set):
app = self.app
prev_tid = app.getLastTransaction()
app.setLastTransaction(tid)
pt = app.pt
trigger_set = set()
......
......@@ -34,10 +34,18 @@ class BackupHandler(EventHandler):
def answerLastTransaction(self, conn, tid):
app = self.app
if tid != ZERO_TID:
app.invalidatePartitions(tid, set(xrange(app.pt.getPartitions())))
else: # upstream DB is empty
assert app.app.getLastTransaction() == tid
prev_tid = app.app.getLastTransaction()
if prev_tid < tid:
# Since we don't know which partitions were modified during our
# absence, we must force replication on all storages. As long as
# they haven't done this first check, our backup tid will remain
# inferior to this 'tid'. We don't know the real prev_tid, which is:
# >= app.app.getLastTransaction()
# < tid
# but passing 'tid' is good enough.
app.invalidatePartitions(tid, tid, xrange(app.pt.getPartitions()))
elif prev_tid != tid:
raise RuntimeError("upstream DB truncated")
app.ignore_invalidations = False
def invalidateObjects(self, conn, tid, oid_list):
......@@ -47,4 +55,5 @@ class BackupHandler(EventHandler):
getPartition = app.app.pt.getPartition
partition_set = set(map(getPartition, oid_list))
partition_set.add(getPartition(tid))
app.invalidatePartitions(tid, partition_set)
prev_tid = app.app.getLastTransaction()
app.invalidatePartitions(tid, prev_tid, partition_set)
......@@ -333,6 +333,29 @@ class ReplicationTests(NEOThreadedTest):
finally:
upstream.stop()
@backup_test()
def testBackupTid(self, backup):
"""
Check that the backup cluster does not claim it has all the data just
after it came back whereas new transactions were committed during its
absence.
"""
# Step 1: import into the upstream cluster, let things settle, and check that
# the backup cluster reports the same tid as upstream's last tid.
# NOTE(review): the meaning of the argument `1` to importZODB is not visible
# here -- confirm against the helper's definition.
importZODB = backup.upstream.importZODB()
importZODB(1)
self.tic()
last_tid = backup.upstream.last_tid
self.assertEqual(last_tid, backup.backup_tid)
# Step 2: stop the backup cluster and import again, so that (per the
# docstring) upstream receives transactions during the backup's absence.
backup.stop()
importZODB(1)
backup.reset()
# Step 3: restart the backup cluster with a connection filter that matches
# AskFetchTransactions packets (presumably holding back replication of
# transactions -- confirm ConnectionFilter semantics). The backup must still
# report the tid recorded before it went down, not claim the newer data.
with ConnectionFilter() as f:
f.add(lambda conn, packet:
isinstance(packet, Packets.AskFetchTransactions))
backup.start()
self.assertEqual(last_tid, backup.backup_tid)
# Step 4: process pending events and verify the backup.
# NOTE(review): indentation is lost in this view; presumably these two lines
# run after the `with` block (filter removed) -- confirm. The meaning of the
# expected value 1 from checkBackup is not visible here.
self.tic()
self.assertEqual(1, self.checkBackup(backup))
def testSafeTweak(self):
"""
Check that tweak always tries to keep a minimum of (replicas + 1)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment