Commit 08742377 authored by Julien Muchembled's avatar Julien Muchembled

More bugfixes to backup mode

- catch OperationFailure
- reset transaction manager when leaving backup mode
- send appropriate target tid to a storage that updates an outdated cell
- clean up partition table when leaving BACKINGUP state unexpectedly
- make sure all readable cells of a partition have the same 'backup_tid'
  if they have the same data, so that we know when internal replication is
  finished when leaving backup mode
- fix storage not having finished internal replication when leaving backup mode
parent 92a828dc
......@@ -23,6 +23,9 @@ RC - Review output of pylint (CODE)
Consider the need to implement a keep-alive system (packets sent
automatically when there is no activity on the connection for a period
of time).
- When all cells are OUT_OF_DATE in backup mode, the one with most data
could become UP_TO_DATE with appropriate backup_tid, so that the cluster
stays operational (FEATURE).
- Finish renaming UUID into NID everywhere (CODE)
- Consider using multicast for cluster-wide notifications. (BANDWIDTH)
Currently, multi-receivers notifications are sent in unicast to each
......
......@@ -26,7 +26,7 @@ except ImportError:
pass
# The protocol version (major, minor).
PROTOCOL_VERSION = (13, 1)
PROTOCOL_VERSION = (14, 1)
# Size restrictions.
MIN_PACKET_SIZE = 10
......@@ -1497,12 +1497,12 @@ class ReplicationDone(Packet):
class Truncate(Packet):
"""
XXX: Used for both make storage consistent and leave backup mode
M -> S
"""
_fmt = PStruct('ask_truncate',
_fmt = PStruct('truncate',
PTID('tid'),
)
_answer = PFEmpty
StaticRegistry = {}
......@@ -1723,7 +1723,7 @@ class Packets(dict):
AddTransaction)
AddObject = register(
AddObject)
AskTruncate, AnswerTruncate = register(
Truncate = register(
Truncate)
def Errors():
......
......@@ -279,10 +279,6 @@ class Application(object):
try:
while True:
poll(1)
except OperationFailure:
# If not operational, send Stop Operation packets to storage
# nodes and client nodes. Abort connections to client nodes.
logging.critical('No longer operational')
except StateChangedException, e:
if e.args[0] != ClusterStates.STARTING_BACKUP:
raise
......@@ -337,13 +333,20 @@ class Application(object):
self.runManager(RecoveryManager)
while True:
self.runManager(VerificationManager)
try:
if self.backup_tid:
if self.backup_app is None:
raise RuntimeError("No upstream cluster to backup"
" defined in configuration")
self.backup_app.provideService()
else:
# Reset connection with storages (and go through a
# recovery phase) when leaving backup mode in order
# to get correct last oid/tid.
self.runManager(RecoveryManager)
continue
self.provideService()
except OperationFailure:
logging.critical('No longer operational')
for node in self.nm.getIdentifiedList():
if node.isStorage() or node.isClient():
node.notify(Packets.StopOperation())
......@@ -463,8 +466,6 @@ class Application(object):
while self.tm.hasPending():
self.em.poll(1)
except OperationFailure:
# If not operational, send Stop Operation packets to storage
# nodes and client nodes. Abort connections to client nodes.
logging.critical('No longer operational')
logging.info("asking remaining nodes to shutdown")
......
......@@ -21,9 +21,10 @@ from neo.lib import logging
from neo.lib.bootstrap import BootstrapManager
from neo.lib.connector import getConnectorHandler
from neo.lib.exception import PrimaryFailure
from neo.lib.handler import EventHandler
from neo.lib.node import NodeManager
from neo.lib.protocol import CellStates, ClusterStates, NodeTypes, Packets
from neo.lib.protocol import uuid_str, INVALID_TID, ZERO_TID
from neo.lib.protocol import CellStates, ClusterStates, \
NodeStates, NodeTypes, Packets, uuid_str, INVALID_TID, ZERO_TID
from neo.lib.util import add64, dump
from .app import StateChangedException
from .pt import PartitionTable
......@@ -144,26 +145,32 @@ class BackupApplication(object):
while pt.getCheckTid(xrange(pt.getPartitions())) < tid:
poll(1)
last_tid = app.getLastTransaction()
handler = EventHandler(app)
if tid < last_tid:
assert tid != ZERO_TID
logging.warning("Truncating at %s (last_tid was %s)",
dump(app.backup_tid), dump(last_tid))
p = Packets.AskTruncate(tid)
connection_list = []
# XXX: We want to go through a recovery phase in order to
# initialize the transaction manager, but this is only
# possible if storages already know that we left backup
# mode. To that purpose, we always send a Truncate packet,
# even if there's nothing to truncate.
p = Packets.Truncate(tid)
for node in app.nm.getStorageList(only_identified=True):
conn = node.getConnection()
conn.ask(p)
connection_list.append(conn)
for conn in connection_list:
while conn.isPending():
poll(1)
app.setLastTransaction(tid)
conn.setHandler(handler)
node.setState(NodeStates.TEMPORARILY_DOWN)
# Packets will be sent at the beginning of the recovery
# phase.
conn.notify(p)
conn.abort()
# If any error happened before reaching this line, we'd go back
# to backup mode, which is the right mode to recover.
del app.backup_tid
break
finally:
del self.primary_partition_dict, self.tid_list
pt.clearReplicating()
def nodeLost(self, node):
getCellList = self.app.pt.getCellList
......@@ -205,9 +212,25 @@ class BackupApplication(object):
node_list = []
for cell in pt.getCellList(offset, readable=True):
node = cell.getNode()
assert node.isConnected()
assert cell.backup_tid < last_max_tid or \
cell.backup_tid == prev_tid
assert node.isConnected(), node
if cell.backup_tid == prev_tid:
# Given 4 TIDs t0, t1, t2, t3: if a cell is only
# modified by t0 & t3 and has all data for t0, 4 values
# are possible for its 'backup_tid' until it replicates
# up to t3: t0, t1, t2 or t3 - 1
# Choosing the smallest one (t0) is easier to implement
# but when leaving backup mode, we would always lose
# data if the last full transaction does not modify
# all partitions. t1 is wrong for the same reason.
# So we have chosen the highest one (t3 - 1).
# t2 should also work but maybe harder to implement.
cell.backup_tid = add64(tid, -1)
  • @jm I was verifying backup tests and noticed this: by definition backup_tid is the tid up to which something in backup cluster has whole data. Something can be storage, cell, cluster, etc. Here we are updating cell.backup_tid before storages actually pulled the data - before master even told them to do so. So to me this way it creates false promise of backup cluster to have the data, and thus is not correct.

    And as storages from backup cluster (Sb) anyway notify backup's master (Mb) about when they complete replication Mb told them to do (via ReplicationDone), Mb correctly adjusts cell.backup_tid (https://lab.nexedi.com/nexedi/neoppod/blob/d9dd39f0/neo/master/backup_app.py#L312) I wonder why at all this pre-setting of cell.backup_tid is needed. You mention

                         # Choosing the smallest one (t0) is easier to implement
                         # but when leaving backup mode, we would always lose
                         # data if the last full transaction does not modify
                         # all partitions. t1 is wrong for the same reason.

    but for transactions which do not modify all partitions, we anyway scan cells of not touched partitions and adjust their backup_tid fast-forward straight without asking storages to do anything: https://lab.nexedi.com/nexedi/neoppod/blob/d9dd39f0/neo/master/backup_app.py#L245

    Anyway pre-setting .backup_tid in advance before actually pulling data is not correct to me and has to be fixed.

    Your feedback, maybe clarifying something for the rationale or codebase limitations, is welcome.

    Thanks,
    Kirill

    /cc @Tyagov, @vpelletier

  • I don't remember everything, and in particular what the using t0 would be significantly easier to implement.

    If there's no new data for a cell between 2 tids (t0 and t3 in the comment), then we can claim what we want between t0 and t3-1 inclusive. I don't see where there would be a false positive.

    For the overall database, see PartitionTable.getBackupTid

    Thanks for feedback. You are right, it is correct to update backup_tid to something <tid if we know there are no transactions modifying this cell in between; sorry for the confusion here. And now I see we want to pre-update because for the whole cluster (thanks for the link) we compute backup_tid as the minimum of cells' .backup_tid, so if we don't pre-update and for some reason storages don't fetch the associated data and then backup is stopped, the backup cluster will be truncated at cluster.backup_tid — which, taking the minimum of cells' .backup_tid, would use t0 as the cut-off and this way potentially throw away already-fetched data in other cells which we might correctly be keeping.

    I was fooled by t1 & t2; to me there are only t0 and t3 there (and everything that comes in between but does not change objects in this cell is handled in the second branch, plainly fast-forwarding cell.backup_tid to the new tid). So if it is only t0 and t3, it is safe to update to t3-1 and that's all.

    Thanks for feedback and sorry for the confusion once again,
    Kirill

  • In fact, there exists a case where the cluster backup_tid is higher than in reality, and I must check whether it can be an issue other than giving false impression that the cluster has the data.

    It's when starting a backup cluster that is already late. As long as there's one partition for which synchronization hasn't started, neoctl should not report that it has data newer than the previous stop of the backup cluster.

Please register or sign in to reply
logging.debug(
"partition %u: updating backup_tid of %r to %s",
offset, cell, dump(cell.backup_tid))
else:
assert cell.backup_tid < last_max_tid, (
cell.backup_tid, last_max_tid, prev_tid, tid)
if app.isStorageReady(node.getUUID()):
node_list.append(node)
assert node_list
......
......@@ -16,7 +16,7 @@
from neo.lib.exception import PrimaryFailure
from neo.lib.handler import EventHandler
from neo.lib.protocol import CellStates
from neo.lib.protocol import CellStates, ZERO_TID
class BackupHandler(EventHandler):
"""Handler dedicated to upstream master during BACKINGUP state"""
......@@ -39,7 +39,10 @@ class BackupHandler(EventHandler):
def answerLastTransaction(self, conn, tid):
app = self.app
if tid != ZERO_TID:
app.invalidatePartitions(tid, set(xrange(app.pt.getPartitions())))
else: # upstream DB is empty
assert app.app.getLastTransaction() == tid
def invalidateObjects(self, conn, tid, oid_list):
app = self.app
......
......@@ -48,6 +48,8 @@ class IdentificationHandler(MasterHandler):
handler = app.client_service_handler
human_readable_node_type = ' client '
elif node_type == NodeTypes.STORAGE:
if app.cluster_state == ClusterStates.STOPPING_BACKUP:
raise NotReadyError
node_ctor = app.nm.createStorage
manager = app._current_manager
if manager is None:
......
......@@ -58,9 +58,13 @@ class StorageServiceHandler(BaseServiceHandler):
app.backup_tid))
def askUnfinishedTransactions(self, conn):
tm = self.app.tm
pending_list = tm.registerForNotification(conn.getUUID())
last_tid = tm.getLastTID()
app = self.app
if app.backup_tid:
last_tid = app.pt.getBackupTid()
pending_list = ()
else:
last_tid = app.tm.getLastTID()
pending_list = app.tm.registerForNotification(conn.getUUID())
p = Packets.AnswerUnfinishedTransactions(last_tid, pending_list)
conn.answer(p)
......@@ -99,9 +103,6 @@ class StorageServiceHandler(BaseServiceHandler):
logging.debug("%s is up for offset %s", node, offset)
self.app.broadcastPartitionChanges(cell_list)
def answerTruncate(self, conn):
pass
def answerPack(self, conn, status):
app = self.app
if app.packing is not None:
......
......@@ -311,9 +311,18 @@ class PartitionTable(neo.lib.pt.PartitionTable):
for cell in row
if cell.isReadable())
def clearReplicating(self):
    """Drop the transient 'replicating' marker from every cell.

    Cells that never had the attribute set are simply left untouched.
    Called when leaving BACKINGUP state so that no stale replication
    bookkeeping survives on the partition table.
    """
    for partition in self.partition_list:
        for cell in partition:
            # Equivalent to try/del/except AttributeError: only remove
            # the attribute when it is actually present.
            if hasattr(cell, 'replicating'):
                del cell.replicating
def setBackupTidDict(self, backup_tid_dict):
    """Initialize backup_tid on every readable cell.

    backup_tid_dict maps a storage UUID to the tid up to which that
    storage holds complete data; readable cells whose node is absent
    from the mapping fall back to ZERO_TID.
    """
    lookup = backup_tid_dict.get
    for partition in self.partition_list:
        for cell in partition:
            if not cell.isReadable():
                continue
            cell.backup_tid = lookup(cell.getUUID(), ZERO_TID)
......
......@@ -312,6 +312,9 @@ class Application(object):
_poll()
finally:
del self.task_queue
# XXX: Although no handled exception should happen between
# replicator.populate() and the beginning of this 'try'
# clause, the replicator should be reset in a safer place.
self.replicator = Replicator(self)
# Abort any replication, whether we are feeding or out-of-date.
for node in self.nm.getStorageList(only_identified=True):
......
......@@ -436,7 +436,7 @@ class DatabaseManager(object):
def truncate(self, tid):
assert tid not in (None, ZERO_TID), tid
assert self.getBackupTID()
self.setBackupTID(tid)
self.setBackupTID(None) # XXX
for partition in xrange(self.getNumPartitions()):
self._deleteRange(partition, tid)
self.commit()
......
......@@ -68,10 +68,10 @@ class MasterOperationHandler(BaseMasterHandler):
dict((p, a and (a, upstream_name))
for p, a in source_dict.iteritems()))
def askTruncate(self, conn, tid):
def truncate(self, conn, tid):
self.app.replicator.cancel()
self.app.dm.truncate(tid)
conn.answer(Packets.AnswerTruncate())
conn.close()
def checkPartition(self, conn, *args):
self.app.checker(*args)
......@@ -354,16 +354,17 @@ class Replicator(object):
self.getCurrentConnection().close()
def stop(self):
d = None, None
# Close any open connection to an upstream storage,
# possibly aborting current replication.
node = self.current_node
if node is not None is node.getUUID():
self.cancel()
# Cancel all replication orders from upstream cluster.
for offset in self.replicate_dict.keys():
addr, name = self.source_dict.get(offset, d)
addr, name = self.source_dict.get(offset, (None, None))
if name:
tid = self.replicate_dict.pop(offset)
logging.info('cancel replication of partition %u from %r'
' up to %s', offset, addr, dump(tid))
# Close any open connection to an upstream storage,
# possibly aborting current replication.
node = self.current_node
if node is not None is node.getUUID():
self.cancel()
# Make UP_TO_DATE cells really UP_TO_DATE
self._nextPartition()
......@@ -20,6 +20,7 @@ import time
import threading
import transaction
import unittest
from collections import defaultdict
from functools import wraps
from neo.lib import logging
from neo.storage.checker import CHECK_COUNT
......@@ -200,6 +201,10 @@ class ReplicationTests(NEOThreadedTest):
try:
upstream.start()
importZODB = upstream.importZODB(random=random)
# Do not start with an empty DB so that 'primary_dict' below is not
# empty on the first iteration.
importZODB(1)
upstream.client.setPoll(0)
backup = NEOCluster(partitions=np, replicas=2, storage_count=4,
upstream=upstream)
try:
......@@ -214,11 +219,10 @@ class ReplicationTests(NEOThreadedTest):
p = Patch(upstream.master.tm,
_on_commit=onTransactionCommitted)
else:
primary_dict = {}
primary_dict = defaultdict(list)
for k, v in sorted(backup.master.backup_app
.primary_partition_dict.iteritems()):
primary_dict.setdefault(storage_list.index(v._uuid),
[]).append(k)
primary_dict[storage_list.index(v._uuid)].append(k)
if event % 2:
storage = slave(primary_dict).pop()
else:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment