Commit d3c8b76d authored by Julien Muchembled

Safer DB truncation, new 'truncate' ctl command

With the previous commit, the request to truncate the DB was not stored
persistently, which means that this operation was still vulnerable to the case
where the master is restarted after some nodes, but not all, have already
truncated. The master didn't have the information to fix this and the result
was a partially truncated DB.

-> On a Truncate packet, a storage node now only stores the tid persistently,
   to send it back to the master, which stays in RECOVERING state as long as
   any node reports a different value than that of the node with the latest
   partition table.

We also want to make sure that there is no unfinished data, because a user may
truncate at a tid higher than a locked one.

-> Truncation is now effective at the end of the VERIFYING phase, just before
   returning the last ids to the master.
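
   To make the sequencing concrete, here is a minimal, self-contained sketch
   of the storage-side behaviour described above (an in-memory dict stands in
   for the persistent configuration table used by the real DatabaseManager;
   all names here are illustrative, not the actual API):

     class StorageSketch(object):
         def __init__(self):
             self.config = {}  # stands in for the persistent config table
             self.data = {}    # tid -> transaction content

         def on_truncate(self, tid):
             # Nothing is deleted yet: the request is only recorded, so a
             # restart cannot lose it and recovery can report it back.
             self.config['backup_tid'] = None
             self.config['truncate_tid'] = tid

         def apply_truncation(self):
             # Run at the end of VERIFYING, once unfinished transactions have
             # been validated or dropped, just before answering the last
             # oid/tid to the master.
             tid = self.config.get('truncate_tid')
             if tid:
                 for t in [t for t in self.data if t > tid]:
                     del self.data[t]
                 self.config['truncate_tid'] = None
             return max(self.data) if self.data else None  # new last tid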

Lastly, all nodes should be truncated, to prevent an offline node from coming
back with a different history. Currently, this would not be an issue since
replication always restarts from the beginning, but later we'd like nodes to
remember where they stopped replicating.

-> If a truncation is requested, the master waits for all nodes to be pending,
   even if the cluster was previously started (the user can still force the
   cluster to start with neoctl). Any node lost during verification also
   causes the master to go back to recovery.
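
   As a rough illustration of the master-side rule (the function and argument
   names are invented for this example; the real logic lives in
   RecoveryManager below and keys truncate_dict by node uuid):

     def recovery_step(truncate_tid, truncate_dict, pending_uuids):
         # truncate_dict: node uuid -> truncate_tid reported in AnswerRecovery
         if not truncate_tid:
             return 'may start'
         if set(truncate_dict) - pending_uuids:
             return 'wait: not all nodes are pending'
         # (Re)send Truncate to nodes that never stored the request or that
         # stored a later tid, then loop through recovery again.
         stale = [uuid for uuid, tid in truncate_dict.items()
                       if not tid or truncate_tid < tid]
         return ('notify: %r' % stale) if stale else 'may start'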

Obviously, the protocol has been changed to split the LastIDs packet and
introduce a new Recovery packet, since it no longer makes sense to ask last
ids during recovery.
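
The new command is reachable both from the command line and from the NeoCTL
API added below. A hedged example (the admin address and tid value are
placeholders, and the NeoCTL constructor arguments are assumed from its
existing conventions, so they may differ):

    # Python, using the client-side API from this commit.
    from neo.lib.util import p64
    from neo.neoctl.neoctl import NeoCTL

    neoctl = NeoCTL(('127.0.0.1', 9999))   # assumed constructor signature
    ptid, backup_tid, truncate_tid = neoctl.getRecovery()
    neoctl.truncate(p64(0x123456789abcdef))  # tid as an 8-byte string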
parent 3e3eab5b
@@ -141,9 +141,7 @@
 Admin
 - Make admin node able to monitor multiple clusters simultaneously
 - Send notifications (ie: mail) when a storage or master node is lost
-- Add ctl command to truncate DB at arbitrary TID. 'Truncate' message
-  can be reused. There should also be a way to list last transactions,
-  like fstail for FileStorage.
+- Add ctl command to list last transactions, like fstail for FileStorage.
 Tests
 - Use another mock library: Python 3.3+ has unittest.mock, which is
...
@@ -65,10 +65,12 @@ class AdminEventHandler(EventHandler):
     askLastIDs = forward_ask(Packets.AskLastIDs)
     askLastTransaction = forward_ask(Packets.AskLastTransaction)
     addPendingNodes = forward_ask(Packets.AddPendingNodes)
+    askRecovery = forward_ask(Packets.AskRecovery)
     tweakPartitionTable = forward_ask(Packets.TweakPartitionTable)
     setClusterState = forward_ask(Packets.SetClusterState)
     setNodeState = forward_ask(Packets.SetNodeState)
     checkReplicas = forward_ask(Packets.CheckReplicas)
+    truncate = forward_ask(Packets.Truncate)
 class MasterEventHandler(EventHandler):
...
@@ -102,11 +102,17 @@ class PrimaryNotificationsHandler(MTEventHandler):
             if app.master_conn is None:
                 app._cache_lock_acquire()
                 try:
-                    oid_list = app._cache.clear_current()
                     db = app.getDB()
-                    if db is not None:
-                        db.invalidate(app.last_tid and
-                            add64(app.last_tid, 1), oid_list)
+                    if app.last_tid < ltid:
+                        oid_list = app._cache.clear_current()
+                        db is None or db.invalidate(
+                            app.last_tid and add64(app.last_tid, 1),
+                            oid_list)
+                    else:
+                        # The DB was truncated. It happens so
+                        # rarely that we don't need to optimize.
+                        app._cache.clear()
+                        db is None or db.invalidateCache()
                 finally:
                     app._cache_lock_release()
                 app.last_tid = ltid
...
@@ -23,7 +23,7 @@ class ElectionFailure(NeoException):
 class PrimaryFailure(NeoException):
     pass
-class OperationFailure(NeoException):
+class StoppedOperation(NeoException):
     pass
 class DatabaseFailure(NeoException):
...
@@ -722,16 +722,24 @@ class ReelectPrimary(Packet):
     Force a re-election of a primary master node. M -> M.
     """
+class Recovery(Packet):
+    """
+    Ask all data needed by master to recover. PM -> S, S -> PM.
+    """
+    _answer = PStruct('answer_recovery',
+        PPTID('ptid'),
+        PTID('backup_tid'),
+        PTID('truncate_tid'),
+    )
 class LastIDs(Packet):
     """
-    Ask the last OID, the last TID and the last Partition Table ID so that
-    a master recover. PM -> S, S -> PM.
+    Ask the last OID/TID so that a master can initialize its TransactionManager.
+    PM -> S, S -> PM.
     """
     _answer = PStruct('answer_last_ids',
         POID('last_oid'),
         PTID('last_tid'),
-        PPTID('last_ptid'),
-        PTID('backup_tid'),
     )
 class PartitionTable(Packet):
@@ -1470,13 +1478,14 @@ class ReplicationDone(Packet):
 class Truncate(Packet):
     """
-    XXX: Used for both make storage consistent and leave backup mode
+    Request DB to be truncated. Also used to leave backup mode.
+    M -> S
     """
     _fmt = PStruct('truncate',
         PTID('tid'),
     )
+    _answer = Error
 StaticRegistry = {}
 def register(request, ignore_when_closed=None):
@@ -1594,6 +1603,8 @@ class Packets(dict):
         ReelectPrimary)
     NotifyNodeInformation = register(
         NotifyNodeInformation)
+    AskRecovery, AnswerRecovery = register(
+        Recovery)
     AskLastIDs, AnswerLastIDs = register(
         LastIDs)
     AskPartitionTable, AnswerPartitionTable = register(
...
@@ -24,7 +24,7 @@ from neo.lib.protocol import uuid_str, UUID_NAMESPACES, ZERO_TID
 from neo.lib.protocol import ClusterStates, NodeStates, NodeTypes, Packets
 from neo.lib.handler import EventHandler
 from neo.lib.connection import ListeningConnection, ClientConnection
-from neo.lib.exception import ElectionFailure, PrimaryFailure, OperationFailure
+from neo.lib.exception import ElectionFailure, PrimaryFailure, StoppedOperation
 class StateChangedException(Exception): pass
@@ -45,6 +45,7 @@ class Application(BaseApplication):
     backup_tid = None
     backup_app = None
     uuid = None
+    truncate_tid = None
     def __init__(self, config):
         super(Application, self).__init__(
@@ -331,12 +332,9 @@ class Application(BaseApplication):
         # machines but must not start automatically: otherwise, each storage
         # node would diverge.
         self._startup_allowed = False
-        self.truncate_tid = None
         try:
             while True:
                 self.runManager(RecoveryManager)
-                # Automatic restart if we become non-operational.
-                self._startup_allowed = True
                 try:
                     self.runManager(VerificationManager)
                     if not self.backup_tid:
@@ -346,10 +344,13 @@ class Application(BaseApplication):
                     if self.backup_app is None:
                         raise RuntimeError("No upstream cluster to backup"
                                            " defined in configuration")
-                    self.truncate_tid = self.backup_app.provideService()
-                except OperationFailure:
+                    truncate = Packets.Truncate(
+                        self.backup_app.provideService())
+                except StoppedOperation, e:
                     logging.critical('No longer operational')
-                    self.truncate_tid = None
+                    truncate = Packets.Truncate(*e.args) if e.args else None
+                # Automatic restart except if we truncate or retry to.
+                self._startup_allowed = not (self.truncate_tid or truncate)
                 node_list = []
                 for node in self.nm.getIdentifiedList():
                     if node.isStorage() or node.isClient():
@@ -357,7 +358,10 @@ class Application(BaseApplication):
                         conn.notify(Packets.StopOperation())
                         if node.isClient():
                             conn.abort()
-                        elif node.isRunning():
+                            continue
+                        if truncate:
+                            conn.notify(truncate)
+                        if node.isRunning():
                             node.setPending()
                             node_list.append(node)
                 self.broadcastNodesInformation(node_list)
@@ -475,7 +479,7 @@ class Application(BaseApplication):
             # wait for all transaction to be finished
             while self.tm.hasPending():
                 self.em.poll(1)
-        except OperationFailure:
+        except StoppedOperation:
             logging.critical('No longer operational')
         logging.info("asking remaining nodes to shutdown")
...
...@@ -152,17 +152,19 @@ class BackupApplication(object): ...@@ -152,17 +152,19 @@ class BackupApplication(object):
assert tid != ZERO_TID assert tid != ZERO_TID
logging.warning("Truncating at %s (last_tid was %s)", logging.warning("Truncating at %s (last_tid was %s)",
dump(app.backup_tid), dump(last_tid)) dump(app.backup_tid), dump(last_tid))
# We will really truncate so do not start automatically else:
# if there's any missing storage. # We will do a dummy truncation, just to leave backup mode,
app._startup_allowed = False # so it's fine to start automatically if there's any
# missing storage.
# XXX: Consider using another method to leave backup mode,
# at least when there's nothing to truncate. Because
# in case of StoppedOperation during VERIFYING state,
# this flag will be wrongly set to False.
app._startup_allowed = True
# If any error happened before reaching this line, we'd go back # If any error happened before reaching this line, we'd go back
# to backup mode, which is the right mode to recover. # to backup mode, which is the right mode to recover.
del app.backup_tid del app.backup_tid
# We will go through a recovery phase in order to reset the # Now back to RECOVERY...
# transaction manager and this is only possible if storages
# already know that we left backup mode. To that purpose, we
# always stop operation with a tid, even if there's nothing to
# truncate.
return tid return tid
finally: finally:
del self.primary_partition_dict, self.tid_list del self.primary_partition_dict, self.tid_list
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
from neo.lib import logging from neo.lib import logging
from neo.lib.exception import OperationFailure from neo.lib.exception import StoppedOperation
from neo.lib.handler import EventHandler from neo.lib.handler import EventHandler
from neo.lib.protocol import (uuid_str, NodeTypes, NodeStates, Packets, from neo.lib.protocol import (uuid_str, NodeTypes, NodeStates, Packets,
BrokenNodeDisallowedError, BrokenNodeDisallowedError,
...@@ -66,13 +66,16 @@ class MasterHandler(EventHandler): ...@@ -66,13 +66,16 @@ class MasterHandler(EventHandler):
state = self.app.getClusterState() state = self.app.getClusterState()
conn.answer(Packets.AnswerClusterState(state)) conn.answer(Packets.AnswerClusterState(state))
def askLastIDs(self, conn): def askRecovery(self, conn):
app = self.app app = self.app
conn.answer(Packets.AnswerLastIDs( conn.answer(Packets.AnswerRecovery(
app.tm.getLastOID(),
app.tm.getLastTID(),
app.pt.getID(), app.pt.getID(),
app.backup_tid)) app.backup_tid and app.pt.getBackupTid(),
app.truncate_tid))
def askLastIDs(self, conn):
tm = self.app.tm
conn.answer(Packets.AnswerLastIDs(tm.getLastOID(), tm.getLastTID()))
def askLastTransaction(self, conn): def askLastTransaction(self, conn):
conn.answer(Packets.AnswerLastTransaction( conn.answer(Packets.AnswerLastTransaction(
...@@ -130,9 +133,11 @@ class BaseServiceHandler(MasterHandler): ...@@ -130,9 +133,11 @@ class BaseServiceHandler(MasterHandler):
logging.info('drop a pending node from the node manager') logging.info('drop a pending node from the node manager')
app.nm.remove(node) app.nm.remove(node)
app.broadcastNodesInformation([node]) app.broadcastNodesInformation([node])
if app.truncate_tid:
raise StoppedOperation
app.broadcastPartitionChanges(app.pt.outdate(node)) app.broadcastPartitionChanges(app.pt.outdate(node))
if not app.pt.operational(): if not app.pt.operational():
raise OperationFailure("cannot continue operation") raise StoppedOperation
def notifyReady(self, conn): def notifyReady(self, conn):
self.app.setStorageReady(conn.getUUID()) self.app.setStorageReady(conn.getUUID())
......
...@@ -19,6 +19,7 @@ import random ...@@ -19,6 +19,7 @@ import random
from . import MasterHandler from . import MasterHandler
from ..app import StateChangedException from ..app import StateChangedException
from neo.lib import logging from neo.lib import logging
from neo.lib.exception import StoppedOperation
from neo.lib.pt import PartitionTableException from neo.lib.pt import PartitionTableException
from neo.lib.protocol import ClusterStates, Errors, \ from neo.lib.protocol import ClusterStates, Errors, \
NodeStates, NodeTypes, Packets, ProtocolError, uuid_str NodeStates, NodeTypes, Packets, ProtocolError, uuid_str
...@@ -159,6 +160,13 @@ class AdministrationHandler(MasterHandler): ...@@ -159,6 +160,13 @@ class AdministrationHandler(MasterHandler):
map(app.nm.getByUUID, uuid_list))) map(app.nm.getByUUID, uuid_list)))
conn.answer(Errors.Ack('')) conn.answer(Errors.Ack(''))
def truncate(self, conn, tid):
app = self.app
if app.cluster_state != ClusterStates.RUNNING:
raise ProtocolError('Can not truncate in this state')
conn.answer(Errors.Ack(''))
raise StoppedOperation(tid)
def checkReplicas(self, conn, partition_dict, min_tid, max_tid): def checkReplicas(self, conn, partition_dict, min_tid, max_tid):
app = self.app app = self.app
pt = app.pt pt = app.pt
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
from neo.lib import logging from neo.lib import logging
from neo.lib.protocol import CellStates, ClusterStates, Packets, ProtocolError from neo.lib.protocol import CellStates, ClusterStates, Packets, ProtocolError
from neo.lib.exception import OperationFailure from neo.lib.exception import StoppedOperation
from neo.lib.pt import PartitionTableException from neo.lib.pt import PartitionTableException
from . import BaseServiceHandler from . import BaseServiceHandler
...@@ -76,7 +76,7 @@ class StorageServiceHandler(BaseServiceHandler): ...@@ -76,7 +76,7 @@ class StorageServiceHandler(BaseServiceHandler):
CellStates.CORRUPTED)) CellStates.CORRUPTED))
self.app.broadcastPartitionChanges(change_list) self.app.broadcastPartitionChanges(change_list)
if not self.app.pt.operational(): if not self.app.pt.operational():
raise OperationFailure('cannot continue operation') raise StoppedOperation
def notifyReplicationDone(self, conn, offset, tid): def notifyReplicationDone(self, conn, offset, tid):
app = self.app app = self.app
......
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>. # along with this program. If not, see <http://www.gnu.org/licenses/>.
from neo.lib import logging from neo.lib import logging
from neo.lib.util import dump
from neo.lib.protocol import Packets, ProtocolError, ClusterStates, NodeStates from neo.lib.protocol import Packets, ProtocolError, ClusterStates, NodeStates
from .handlers import MasterHandler from .handlers import MasterHandler
...@@ -30,6 +29,7 @@ class RecoveryManager(MasterHandler): ...@@ -30,6 +29,7 @@ class RecoveryManager(MasterHandler):
self.target_ptid = None self.target_ptid = None
self.ask_pt = [] self.ask_pt = []
self.backup_tid_dict = {} self.backup_tid_dict = {}
self.truncate_dict = {}
def getHandler(self): def getHandler(self):
return self return self
...@@ -49,7 +49,6 @@ class RecoveryManager(MasterHandler): ...@@ -49,7 +49,6 @@ class RecoveryManager(MasterHandler):
""" """
logging.info('begin the recovery of the status') logging.info('begin the recovery of the status')
app = self.app app = self.app
app.tm.reset()
pt = app.pt pt = app.pt
app.changeClusterState(ClusterStates.RECOVERING) app.changeClusterState(ClusterStates.RECOVERING)
pt.clear() pt.clear()
...@@ -64,8 +63,12 @@ class RecoveryManager(MasterHandler): ...@@ -64,8 +63,12 @@ class RecoveryManager(MasterHandler):
node_list = pt.getOperationalNodeSet() node_list = pt.getOperationalNodeSet()
if app._startup_allowed: if app._startup_allowed:
node_list = [node for node in node_list if node.isPending()] node_list = [node for node in node_list if node.isPending()]
elif not all(node.isPending() for node in node_list): elif node_list:
continue # we want all nodes to be there if we're going to truncate
if app.truncate_tid:
node_list = pt.getNodeSet()
if not all(node.isPending() for node in node_list):
continue
elif app._startup_allowed or app.autostart: elif app._startup_allowed or app.autostart:
# No partition table and admin allowed startup, we are # No partition table and admin allowed startup, we are
# creating a new cluster out of all pending nodes. # creating a new cluster out of all pending nodes.
...@@ -77,6 +80,17 @@ class RecoveryManager(MasterHandler): ...@@ -77,6 +80,17 @@ class RecoveryManager(MasterHandler):
if node_list and not any(node.getConnection().isPending() if node_list and not any(node.getConnection().isPending()
for node in node_list): for node in node_list):
if pt.filled(): if pt.filled():
if app.truncate_tid:
node_list = app.nm.getIdentifiedList(pool_set={uuid
for uuid, tid in self.truncate_dict.iteritems()
if not tid or app.truncate_tid < tid})
if node_list:
truncate = Packets.Truncate(app.truncate_tid)
for node in node_list:
conn = node.getConnection()
conn.notify(truncate)
self.connectionCompleted(conn, False)
continue
node_list = pt.getConnectedNodeList() node_list = pt.getConnectedNodeList()
break break
...@@ -101,12 +115,13 @@ class RecoveryManager(MasterHandler): ...@@ -101,12 +115,13 @@ class RecoveryManager(MasterHandler):
pt.setBackupTidDict(self.backup_tid_dict) pt.setBackupTidDict(self.backup_tid_dict)
app.backup_tid = pt.getBackupTid() app.backup_tid = pt.getBackupTid()
logging.debug('cluster starts with loid=%s and this partition table :', logging.debug('cluster starts this partition table:')
dump(app.tm.getLastOID()))
pt.log() pt.log()
def connectionLost(self, conn, new_state): def connectionLost(self, conn, new_state):
uuid = conn.getUUID() uuid = conn.getUUID()
self.backup_tid_dict.pop(uuid, None)
self.truncate_dict.pop(uuid, None)
node = self.app.nm.getByUUID(uuid) node = self.app.nm.getByUUID(uuid)
try: try:
i = self.ask_pt.index(uuid) i = self.ask_pt.index(uuid)
...@@ -129,40 +144,38 @@ class RecoveryManager(MasterHandler): ...@@ -129,40 +144,38 @@ class RecoveryManager(MasterHandler):
self.app.broadcastNodesInformation([node]) self.app.broadcastNodesInformation([node])
def connectionCompleted(self, conn, new): def connectionCompleted(self, conn, new):
tid = self.app.truncate_tid
if tid:
conn.notify(Packets.Truncate(tid))
# ask the last IDs to perform the recovery # ask the last IDs to perform the recovery
conn.ask(Packets.AskLastIDs()) conn.ask(Packets.AskRecovery())
def answerLastIDs(self, conn, loid, ltid, lptid, backup_tid): def answerRecovery(self, conn, ptid, backup_tid, truncate_tid):
tm = self.app.tm
tm.setLastOID(loid)
tm.setLastTID(ltid)
uuid = conn.getUUID() uuid = conn.getUUID()
if self.target_ptid <= lptid: if self.target_ptid <= ptid:
# Maybe a newer partition table. # Maybe a newer partition table.
if self.target_ptid == lptid and self.ask_pt: if self.target_ptid == ptid and self.ask_pt:
# Another node is already asked. # Another node is already asked.
self.ask_pt.append(uuid) self.ask_pt.append(uuid)
elif self.target_ptid < lptid or self.ask_pt is not (): elif self.target_ptid < ptid or self.ask_pt is not ():
# No node asked yet for the newest partition table. # No node asked yet for the newest partition table.
self.target_ptid = lptid self.target_ptid = ptid
self.ask_pt = [uuid] self.ask_pt = [uuid]
conn.ask(Packets.AskPartitionTable()) conn.ask(Packets.AskPartitionTable())
self.backup_tid_dict[uuid] = backup_tid self.backup_tid_dict[uuid] = backup_tid
self.truncate_dict[uuid] = truncate_tid
def answerPartitionTable(self, conn, ptid, row_list): def answerPartitionTable(self, conn, ptid, row_list):
# If this is not from a target node, ignore it. # If this is not from a target node, ignore it.
if ptid == self.target_ptid: if ptid == self.target_ptid:
app = self.app
try: try:
new_nodes = self.app.pt.load(ptid, row_list, self.app.nm) new_nodes = app.pt.load(ptid, row_list, app.nm)
except IndexError: except IndexError:
raise ProtocolError('Invalid offset') raise ProtocolError('Invalid offset')
self._notifyAdmins(Packets.NotifyNodeInformation(new_nodes), self._notifyAdmins(Packets.NotifyNodeInformation(new_nodes),
Packets.SendPartitionTable(ptid, row_list)) Packets.SendPartitionTable(ptid, row_list))
self.ask_pt = () self.ask_pt = ()
self.app.backup_tid = self.backup_tid_dict[conn.getUUID()] uuid = conn.getUUID()
app.backup_tid = self.backup_tid_dict[uuid]
app.truncate_tid = self.truncate_dict[uuid]
def _notifyAdmins(self, *packets): def _notifyAdmins(self, *packets):
for node in self.app.nm.getAdminList(only_identified=True): for node in self.app.nm.getAdminList(only_identified=True):
......
...@@ -59,9 +59,18 @@ class VerificationManager(BaseServiceHandler): ...@@ -59,9 +59,18 @@ class VerificationManager(BaseServiceHandler):
def run(self): def run(self):
app = self.app app = self.app
app.changeClusterState(ClusterStates.VERIFYING) app.changeClusterState(ClusterStates.VERIFYING)
app.tm.reset()
if not app.backup_tid: if not app.backup_tid:
self.verifyData() self.verifyData()
# This is where storages truncate if requested:
# - we make sure all nodes are running with a truncate_tid value saved
# - there's no unfinished data
# - just before they return the last tid/oid
self._askStorageNodesAndWait(Packets.AskLastIDs(),
[x for x in app.nm.getIdentifiedList() if x.isStorage()])
app.setLastTransaction(app.tm.getLastTID()) app.setLastTransaction(app.tm.getLastTID())
# Just to not return meaningless information in AnswerRecovery.
app.truncate_tid = None
def verifyData(self): def verifyData(self):
app = self.app app = self.app
...@@ -97,33 +106,18 @@ class VerificationManager(BaseServiceHandler): ...@@ -97,33 +106,18 @@ class VerificationManager(BaseServiceHandler):
# Finish all transactions for which we know that tpc_finish was called # Finish all transactions for which we know that tpc_finish was called
# but not fully processed. This may include replicas with transactions # but not fully processed. This may include replicas with transactions
# that were not even locked. # that were not even locked.
all_set = set()
for ttid, tid in self._locked_dict.iteritems(): for ttid, tid in self._locked_dict.iteritems():
uuid_set = self._voted_dict.get(ttid) uuid_set = self._voted_dict.get(ttid)
if uuid_set: if uuid_set:
all_set |= uuid_set
packet = Packets.ValidateTransaction(ttid, tid) packet = Packets.ValidateTransaction(ttid, tid)
for node in getIdentifiedList(pool_set=uuid_set): for node in getIdentifiedList(pool_set=uuid_set):
node.notify(packet) node.notify(packet)
# Ask last oid/tid again for nodes that recovers locked transactions. def answerLastIDs(self, conn, loid, ltid):
# In fact, this is mainly for the last oid since the last tid can be
# deduced from max(self._locked_dict.values()).
# If getLastIDs is not always instantaneous for some backends, we
# should split AskLastIDs to not ask the last oid/tid at the end of
# recovery phase (and instead ask all nodes once, here).
# With this request, we also prefer to make sure all nodes validate
# successfully before switching to RUNNING state.
self._askStorageNodesAndWait(Packets.AskLastIDs(),
getIdentifiedList(all_set))
def answerLastIDs(self, conn, loid, ltid, lptid, backup_tid):
self._uuid_set.remove(conn.getUUID()) self._uuid_set.remove(conn.getUUID())
tm = self.app.tm tm = self.app.tm
tm.setLastOID(loid) tm.setLastOID(loid)
tm.setLastTID(ltid) tm.setLastTID(ltid)
ptid = self.app.pt.getID()
assert lptid < ptid if None != lptid != ptid else not backup_tid
def answerLockedTransactions(self, conn, tid_dict): def answerLockedTransactions(self, conn, tid_dict):
uuid = conn.getUUID() uuid = conn.getUUID()
......
...@@ -37,6 +37,7 @@ action_dict = { ...@@ -37,6 +37,7 @@ action_dict = {
'tweak': 'tweakPartitionTable', 'tweak': 'tweakPartitionTable',
'drop': 'dropNode', 'drop': 'dropNode',
'kill': 'killNode', 'kill': 'killNode',
'truncate': 'truncate',
} }
uuid_int = (lambda ns: lambda uuid: uuid_int = (lambda ns: lambda uuid:
...@@ -85,11 +86,14 @@ class TerminalNeoCTL(object): ...@@ -85,11 +86,14 @@ class TerminalNeoCTL(object):
Get last ids. Get last ids.
""" """
assert not params assert not params
r = self.neoctl.getLastIds() ptid, backup_tid, truncate_tid = self.neoctl.getRecovery()
if r[3]: if backup_tid:
return "last_tid = 0x%x" % u64(self.neoctl.getLastTransaction()) ltid = self.neoctl.getLastTransaction()
return "last_oid = 0x%x\nlast_tid = 0x%x\nlast_ptid = %u" % ( r = "backup_tid = 0x%x" % u64(backup_tid)
u64(r[0]), u64(r[1]), r[2]) else:
loid, ltid = self.neoctl.getLastIds()
r = "last_oid = 0x%x" % u64(loid)
return r + "\nlast_tid = 0x%x\nlast_ptid = %u" % (u64(ltid), ptid)
def getPartitionRowList(self, params): def getPartitionRowList(self, params):
""" """
...@@ -193,6 +197,19 @@ class TerminalNeoCTL(object): ...@@ -193,6 +197,19 @@ class TerminalNeoCTL(object):
""" """
return uuid_str(self.neoctl.getPrimary()) return uuid_str(self.neoctl.getPrimary())
def truncate(self, params):
"""
Truncate the database at the given tid.
The cluster must be in RUNNING state, without any pending transaction.
This causes the cluster to go back in RECOVERING state, waiting all
nodes to be pending (do not use 'start' command unless you're sure
the missing nodes don't need to be truncated).
Parameters: tid
"""
self.neoctl.truncate(self.asTID(*params))
def checkReplicas(self, params): def checkReplicas(self, params):
""" """
Test whether partitions have corrupted metadata Test whether partitions have corrupted metadata
......
...@@ -61,3 +61,4 @@ class CommandEventHandler(EventHandler): ...@@ -61,3 +61,4 @@ class CommandEventHandler(EventHandler):
answerPrimary = __answer(Packets.AnswerPrimary) answerPrimary = __answer(Packets.AnswerPrimary)
answerLastIDs = __answer(Packets.AnswerLastIDs) answerLastIDs = __answer(Packets.AnswerLastIDs)
answerLastTransaction = __answer(Packets.AnswerLastTransaction) answerLastTransaction = __answer(Packets.AnswerLastTransaction)
answerRecovery = __answer(Packets.AnswerRecovery)
...@@ -120,6 +120,12 @@ class NeoCTL(BaseApplication): ...@@ -120,6 +120,12 @@ class NeoCTL(BaseApplication):
raise RuntimeError(response) raise RuntimeError(response)
return response[1] return response[1]
def getRecovery(self):
response = self.__ask(Packets.AskRecovery())
if response[0] != Packets.AnswerRecovery:
raise RuntimeError(response)
return response[1:]
def getNodeList(self, node_type=None): def getNodeList(self, node_type=None):
""" """
Get a list of nodes, filtering with given type. Get a list of nodes, filtering with given type.
...@@ -163,6 +169,12 @@ class NeoCTL(BaseApplication): ...@@ -163,6 +169,12 @@ class NeoCTL(BaseApplication):
raise RuntimeError(response) raise RuntimeError(response)
return response[1] return response[1]
def truncate(self, tid):
response = self.__ask(Packets.Truncate(tid))
if response[0] != Packets.Error or response[1] != ErrorCodes.ACK:
raise RuntimeError(response)
return response[2]
def checkReplicas(self, *args): def checkReplicas(self, *args):
response = self.__ask(Packets.CheckReplicas(*args)) response = self.__ask(Packets.CheckReplicas(*args))
if response[0] != Packets.Error or response[1] != ErrorCodes.ACK: if response[0] != Packets.Error or response[1] != ErrorCodes.ACK:
......
...@@ -23,7 +23,7 @@ from neo.lib.protocol import uuid_str, \ ...@@ -23,7 +23,7 @@ from neo.lib.protocol import uuid_str, \
CellStates, ClusterStates, NodeTypes, Packets CellStates, ClusterStates, NodeTypes, Packets
from neo.lib.node import NodeManager from neo.lib.node import NodeManager
from neo.lib.connection import ListeningConnection from neo.lib.connection import ListeningConnection
from neo.lib.exception import OperationFailure, PrimaryFailure from neo.lib.exception import StoppedOperation, PrimaryFailure
from neo.lib.pt import PartitionTable from neo.lib.pt import PartitionTable
from neo.lib.util import dump from neo.lib.util import dump
from neo.lib.bootstrap import BootstrapManager from neo.lib.bootstrap import BootstrapManager
...@@ -196,7 +196,7 @@ class Application(BaseApplication): ...@@ -196,7 +196,7 @@ class Application(BaseApplication):
self.initialize() self.initialize()
self.doOperation() self.doOperation()
raise RuntimeError, 'should not reach here' raise RuntimeError, 'should not reach here'
except OperationFailure, msg: except StoppedOperation, msg:
logging.error('operation stopped: %s', msg) logging.error('operation stopped: %s', msg)
except PrimaryFailure, msg: except PrimaryFailure, msg:
logging.error('primary master is down: %s', msg) logging.error('primary master is down: %s', msg)
......
...@@ -194,10 +194,18 @@ class DatabaseManager(object): ...@@ -194,10 +194,18 @@ class DatabaseManager(object):
def getBackupTID(self): def getBackupTID(self):
return util.bin(self.getConfiguration('backup_tid')) return util.bin(self.getConfiguration('backup_tid'))
def setBackupTID(self, backup_tid): def _setBackupTID(self, tid):
tid = util.dump(backup_tid) tid = util.dump(tid)
logging.debug('backup_tid = %s', tid) logging.debug('backup_tid = %s', tid)
return self.setConfiguration('backup_tid', tid) return self._setConfiguration('backup_tid', tid)
def getTruncateTID(self):
return util.bin(self.getConfiguration('truncate_tid'))
def _setTruncateTID(self, tid):
tid = util.dump(tid)
logging.debug('truncate_tid = %s', tid)
return self._setConfiguration('truncate_tid', tid)
def _setPackTID(self, tid): def _setPackTID(self, tid):
self._setConfiguration('_pack_tid', tid) self._setConfiguration('_pack_tid', tid)
...@@ -502,11 +510,14 @@ class DatabaseManager(object): ...@@ -502,11 +510,14 @@ class DatabaseManager(object):
and max_tid (included)""" and max_tid (included)"""
raise NotImplementedError raise NotImplementedError
def truncate(self, tid): def truncate(self):
assert tid not in (None, ZERO_TID), tid tid = self.getTruncateTID()
for partition in xrange(self.getNumPartitions()): if tid:
self._deleteRange(partition, tid) assert tid != ZERO_TID, tid
self.setBackupTID(None) # this also commits for partition in xrange(self.getNumPartitions()):
self._deleteRange(partition, tid)
self._setTruncateTID(None)
self.commit()
def getTransaction(self, tid, all = False): def getTransaction(self, tid, all = False):
"""Return a tuple of the list of OIDs, user information, """Return a tuple of the list of OIDs, user information,
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
from neo.lib import logging from neo.lib import logging
from neo.lib.handler import EventHandler from neo.lib.handler import EventHandler
from neo.lib.exception import PrimaryFailure, OperationFailure from neo.lib.exception import PrimaryFailure, StoppedOperation
from neo.lib.protocol import uuid_str, NodeStates, NodeTypes, Packets from neo.lib.protocol import uuid_str, NodeStates, NodeTypes, Packets
class BaseMasterHandler(EventHandler): class BaseMasterHandler(EventHandler):
...@@ -27,7 +27,7 @@ class BaseMasterHandler(EventHandler): ...@@ -27,7 +27,7 @@ class BaseMasterHandler(EventHandler):
raise PrimaryFailure('connection lost') raise PrimaryFailure('connection lost')
def stopOperation(self, conn): def stopOperation(self, conn):
raise OperationFailure('operation stopped') raise StoppedOperation
def reelectPrimary(self, conn): def reelectPrimary(self, conn):
raise PrimaryFailure('re-election occurs') raise PrimaryFailure('re-election occurs')
...@@ -48,7 +48,7 @@ class BaseMasterHandler(EventHandler): ...@@ -48,7 +48,7 @@ class BaseMasterHandler(EventHandler):
erase = state == NodeStates.DOWN erase = state == NodeStates.DOWN
self.app.shutdown(erase=erase) self.app.shutdown(erase=erase)
elif state == NodeStates.HIDDEN: elif state == NodeStates.HIDDEN:
raise OperationFailure raise StoppedOperation
elif node_type == NodeTypes.CLIENT and state != NodeStates.RUNNING: elif node_type == NodeTypes.CLIENT and state != NodeStates.RUNNING:
logging.info('Notified of non-running client, abort (%s)', logging.info('Notified of non-running client, abort (%s)',
uuid_str(uuid)) uuid_str(uuid))
......
...@@ -46,16 +46,23 @@ class InitializationHandler(BaseMasterHandler): ...@@ -46,16 +46,23 @@ class InitializationHandler(BaseMasterHandler):
app.dm.changePartitionTable(ptid, cell_list, reset=True) app.dm.changePartitionTable(ptid, cell_list, reset=True)
def truncate(self, conn, tid): def truncate(self, conn, tid):
self.app.dm.truncate(tid) dm = self.app.dm
dm._setBackupTID(None)
dm._setTruncateTID(tid)
dm.commit()
def askLastIDs(self, conn): def askRecovery(self, conn):
app = self.app app = self.app
ltid, _, _, loid = app.dm.getLastIDs() conn.answer(Packets.AnswerRecovery(
conn.answer(Packets.AnswerLastIDs(
loid,
ltid,
app.pt.getID(), app.pt.getID(),
app.dm.getBackupTID())) app.dm.getBackupTID(),
app.dm.getTruncateTID()))
def askLastIDs(self, conn):
dm = self.app.dm
dm.truncate()
ltid, _, _, loid = dm.getLastIDs()
conn.answer(Packets.AnswerLastIDs(loid, ltid))
def askPartitionTable(self, conn): def askPartitionTable(self, conn):
pt = self.app.pt pt = self.app.pt
...@@ -80,4 +87,5 @@ class InitializationHandler(BaseMasterHandler): ...@@ -80,4 +87,5 @@ class InitializationHandler(BaseMasterHandler):
tid = dm.getLastIDs()[0] or ZERO_TID tid = dm.getLastIDs()[0] or ZERO_TID
else: else:
tid = None tid = None
dm.setBackupTID(tid) dm._setBackupTID(tid)
dm.commit()
...@@ -28,7 +28,8 @@ class MasterOperationHandler(BaseMasterHandler): ...@@ -28,7 +28,8 @@ class MasterOperationHandler(BaseMasterHandler):
assert self.app.operational and backup assert self.app.operational and backup
dm = self.app.dm dm = self.app.dm
if not dm.getBackupTID(): if not dm.getBackupTID():
dm.setBackupTID(dm.getLastIDs()[0] or ZERO_TID) dm._setBackupTID(dm.getLastIDs()[0] or ZERO_TID)
dm.commit()
def notifyTransactionFinished(self, conn, *args, **kw): def notifyTransactionFinished(self, conn, *args, **kw):
self.app.replicator.transactionFinished(*args, **kw) self.app.replicator.transactionFinished(*args, **kw)
......
...@@ -128,7 +128,8 @@ class Replicator(object): ...@@ -128,7 +128,8 @@ class Replicator(object):
if tid: if tid:
new_tid = self.getBackupTID() new_tid = self.getBackupTID()
if tid != new_tid: if tid != new_tid:
dm.setBackupTID(new_tid) dm._setBackupTID(new_tid)
dm.commit()
def populate(self): def populate(self):
app = self.app app = self.app
......
...@@ -67,29 +67,6 @@ class MasterRecoveryTests(NeoUnitTestBase): ...@@ -67,29 +67,6 @@ class MasterRecoveryTests(NeoUnitTestBase):
self.assertEqual(self.app.nm.getByAddress(conn.getAddress()).getState(), self.assertEqual(self.app.nm.getByAddress(conn.getAddress()).getState(),
NodeStates.TEMPORARILY_DOWN) NodeStates.TEMPORARILY_DOWN)
def test_09_answerLastIDs(self):
recovery = self.recovery
uuid = self.identifyToMasterNode()
oid1 = self.getOID(1)
oid2 = self.getOID(2)
tid1 = self.getNextTID()
tid2 = self.getNextTID(tid1)
ptid1 = self.getPTID(1)
ptid2 = self.getPTID(2)
self.app.tm.setLastOID(oid1)
self.app.tm.setLastTID(tid1)
self.app.pt.setID(ptid1)
# send information which are later to what PMN knows, this must update target node
conn = self.getFakeConnection(uuid, self.storage_port)
self.assertTrue(ptid2 > self.app.pt.getID())
self.assertTrue(oid2 > self.app.tm.getLastOID())
self.assertTrue(tid2 > self.app.tm.getLastTID())
recovery.answerLastIDs(conn, oid2, tid2, ptid2, None)
self.assertEqual(oid2, self.app.tm.getLastOID())
self.assertEqual(tid2, self.app.tm.getLastTID())
self.assertEqual(ptid2, recovery.target_ptid)
def test_10_answerPartitionTable(self): def test_10_answerPartitionTable(self):
recovery = self.recovery recovery = self.recovery
uuid = self.identifyToMasterNode(NodeTypes.MASTER, port=self.master_port) uuid = self.identifyToMasterNode(NodeTypes.MASTER, port=self.master_port)
......
...@@ -21,7 +21,7 @@ from neo.lib.protocol import NodeTypes, NodeStates, Packets ...@@ -21,7 +21,7 @@ from neo.lib.protocol import NodeTypes, NodeStates, Packets
from neo.master.handlers.storage import StorageServiceHandler from neo.master.handlers.storage import StorageServiceHandler
from neo.master.handlers.client import ClientServiceHandler from neo.master.handlers.client import ClientServiceHandler
from neo.master.app import Application from neo.master.app import Application
from neo.lib.exception import OperationFailure from neo.lib.exception import StoppedOperation
class MasterStorageHandlerTests(NeoUnitTestBase): class MasterStorageHandlerTests(NeoUnitTestBase):
...@@ -114,24 +114,6 @@ class MasterStorageHandlerTests(NeoUnitTestBase): ...@@ -114,24 +114,6 @@ class MasterStorageHandlerTests(NeoUnitTestBase):
self.checkNotifyUnlockInformation(storage_conn_1) self.checkNotifyUnlockInformation(storage_conn_1)
self.checkNotifyUnlockInformation(storage_conn_2) self.checkNotifyUnlockInformation(storage_conn_2)
def test_12_askLastIDs(self):
service = self.service
node, conn = self.identifyToMasterNode()
# give a uuid
conn = self.getFakeConnection(node.getUUID(), self.storage_address)
ptid = self.app.pt.getID()
oid = self.getOID(1)
tid = self.getNextTID()
self.app.tm.setLastOID(oid)
self.app.tm.setLastTID(tid)
service.askLastIDs(conn)
packet = self.checkAnswerLastIDs(conn)
loid, ltid, lptid, backup_tid = packet.decode()
self.assertEqual(loid, oid)
self.assertEqual(ltid, tid)
self.assertEqual(lptid, ptid)
self.assertEqual(backup_tid, None)
def test_13_askUnfinishedTransactions(self): def test_13_askUnfinishedTransactions(self):
service = self.service service = self.service
node, conn = self.identifyToMasterNode() node, conn = self.identifyToMasterNode()
...@@ -173,7 +155,7 @@ class MasterStorageHandlerTests(NeoUnitTestBase): ...@@ -173,7 +155,7 @@ class MasterStorageHandlerTests(NeoUnitTestBase):
# drop the second, no storage node left # drop the second, no storage node left
lptid = self.app.pt.getID() lptid = self.app.pt.getID()
self.assertEqual(node2.getState(), NodeStates.RUNNING) self.assertEqual(node2.getState(), NodeStates.RUNNING)
self.assertRaises(OperationFailure, method, conn2) self.assertRaises(StoppedOperation, method, conn2)
self.assertEqual(node2.getState(), state) self.assertEqual(node2.getState(), state)
self.assertEqual(lptid, self.app.pt.getID()) self.assertEqual(lptid, self.app.pt.getID())
......
...@@ -20,7 +20,7 @@ from collections import deque ...@@ -20,7 +20,7 @@ from collections import deque
from .. import NeoUnitTestBase from .. import NeoUnitTestBase
from neo.storage.app import Application from neo.storage.app import Application
from neo.storage.handlers.master import MasterOperationHandler from neo.storage.handlers.master import MasterOperationHandler
from neo.lib.exception import PrimaryFailure, OperationFailure from neo.lib.exception import PrimaryFailure
from neo.lib.pt import PartitionTable from neo.lib.pt import PartitionTable
from neo.lib.protocol import CellStates, ProtocolError, Packets from neo.lib.protocol import CellStates, ProtocolError, Packets
...@@ -104,11 +104,6 @@ class StorageMasterHandlerTests(NeoUnitTestBase): ...@@ -104,11 +104,6 @@ class StorageMasterHandlerTests(NeoUnitTestBase):
self.assertEqual(len(calls), 1) self.assertEqual(len(calls), 1)
calls[0].checkArgs(ptid2, cells) calls[0].checkArgs(ptid2, cells)
def test_16_stopOperation1(self):
# OperationFailure
conn = self.getFakeConnection(is_server=False)
self.assertRaises(OperationFailure, self.operation.stopOperation, conn)
def _getConnection(self): def _getConnection(self):
return self.getFakeConnection() return self.getFakeConnection()
......
...@@ -26,7 +26,7 @@ from ZODB import DB, POSException ...@@ -26,7 +26,7 @@ from ZODB import DB, POSException
from neo.storage.transactions import TransactionManager, \ from neo.storage.transactions import TransactionManager, \
DelayedError, ConflictError DelayedError, ConflictError
from neo.lib.connection import ConnectionClosed, MTClientConnection from neo.lib.connection import ConnectionClosed, MTClientConnection
from neo.lib.exception import OperationFailure from neo.lib.exception import StoppedOperation
from neo.lib.protocol import CellStates, ClusterStates, NodeStates, Packets, \ from neo.lib.protocol import CellStates, ClusterStates, NodeStates, Packets, \
ZERO_TID ZERO_TID
from .. import expectedFailure, _ExpectedFailure, _UnexpectedSuccess, Patch from .. import expectedFailure, _ExpectedFailure, _UnexpectedSuccess, Patch
...@@ -933,7 +933,7 @@ class Test(NEOThreadedTest): ...@@ -933,7 +933,7 @@ class Test(NEOThreadedTest):
def testStorageFailureDuringTpcFinish(self): def testStorageFailureDuringTpcFinish(self):
def answerTransactionFinished(conn, packet): def answerTransactionFinished(conn, packet):
if isinstance(packet, Packets.AnswerTransactionFinished): if isinstance(packet, Packets.AnswerTransactionFinished):
raise OperationFailure raise StoppedOperation
cluster = NEOCluster() cluster = NEOCluster()
try: try:
cluster.start() cluster.start()
...@@ -1059,6 +1059,64 @@ class Test(NEOThreadedTest): ...@@ -1059,6 +1059,64 @@ class Test(NEOThreadedTest):
finally: finally:
cluster.stop() cluster.stop()
def testTruncate(self):
calls = [0, 0]
def dieFirst(i):
def f(orig, *args, **kw):
calls[i] += 1
if calls[i] == 1:
sys.exit()
return orig(*args, **kw)
return f
cluster = NEOCluster(replicas=1)
try:
cluster.start()
t, c = cluster.getTransaction()
r = c.root()
tids = []
for x in xrange(4):
r[x] = None
t.commit()
tids.append(r._p_serial)
truncate_tid = tids[2]
r['x'] = PCounter()
s0, s1 = cluster.storage_list
with Patch(s0.tm, unlock=dieFirst(0)), \
Patch(s1.dm, truncate=dieFirst(1)):
t.commit()
cluster.neoctl.truncate(truncate_tid)
self.tic()
getClusterState = cluster.neoctl.getClusterState
# Unless forced, the cluster waits all nodes to be up,
# so that all nodes are truncated.
self.assertEqual(getClusterState(), ClusterStates.RECOVERING)
self.assertEqual(calls, [1, 0])
s0.resetNode()
s0.start()
# s0 died with unfinished data, and before processing the
# Truncate packet from the master.
self.assertFalse(s0.dm.getTruncateTID())
self.assertEqual(s1.dm.getTruncateTID(), truncate_tid)
self.tic()
self.assertEqual(calls, [1, 1])
self.assertEqual(getClusterState(), ClusterStates.RECOVERING)
s1.resetNode()
with Patch(s1.dm, truncate=dieFirst(1)):
s1.start()
self.assertEqual(s0.dm.getLastIDs()[0], truncate_tid)
self.assertEqual(s1.dm.getLastIDs()[0], r._p_serial)
self.tic()
self.assertEqual(calls, [1, 2])
self.assertEqual(getClusterState(), ClusterStates.RUNNING)
t.begin()
self.assertEqual(r, dict.fromkeys(xrange(3)))
self.assertEqual(r._p_serial, truncate_tid)
self.assertEqual(1, u64(c._storage.new_oid()))
for s in cluster.storage_list:
self.assertEqual(s.dm.getLastIDs()[0], truncate_tid)
finally:
cluster.stop()
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()