Commit d3c8b76d authored by Julien Muchembled

Safer DB truncation, new 'truncate' ctl command

With the previous commit, the request to truncate the DB was not stored
persistently, which means that the operation was still vulnerable to the case
where the master is restarted after some nodes, but not all, have already
truncated. The master did not have the information to fix this, and the result
was a partially truncated DB.

-> On a Truncate packet, a storage node now only records the tid persistently
   (nothing is deleted yet) and sends it back to the master, which stays in the
   RECOVERING state as long as any node reports a value different from that of
   the node with the latest partition table.
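
A minimal standalone sketch of the storage-side behaviour (hypothetical names,
not the actual NEO classes): the requested tid is only persisted, and is
reported back to the master during recovery.

    class StorageStub(object):
        """Toy model of a storage node's persistent configuration."""
        def __init__(self):
            self.config = {}  # stand-in for the DB 'config' table

        def truncate(self, tid):
            # Nothing is deleted here: the request is only remembered,
            # so that it survives a restart of the node.
            self.config['truncate_tid'] = tid

        def answerRecovery(self):
            # Reported to the master, which stays in RECOVERING as long as
            # the nodes do not all agree on these values.
            return (self.config.get('ptid'),
                    self.config.get('backup_tid'),
                    self.config.get('truncate_tid'))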

We also want to make sure that there is no unfinished data, because a user may
truncate at a tid higher than a locked one.

-> Truncation is now made effective at the end of the VERIFYING phase, just
   before returning the last ids to the master.
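
The sketch below (simplified, with placeholder methods on `dm`) shows when the
deletion actually happens: only while answering AskLastIDs at the end of
VERIFYING, once unfinished transactions have been validated.

    def answer_last_ids(dm):
        """Storage side: truncate lazily, then answer the last oid/tid."""
        tid = dm.getTruncateTID()          # value stored on the Truncate packet
        if tid:
            for partition in range(dm.getNumPartitions()):
                dm.deleteRange(partition, tid)  # drop all data above tid
            dm.setTruncateTID(None)             # cleared before answering
        ltid, _, _, loid = dm.getLastIDs()
        return loid, ltid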

Lastly, all nodes should be truncated, so that an offline node cannot come back
with a different history. Currently, this would not be an issue since
replication always restarts from the beginning, but later we'd like nodes to
remember where they stopped replicating.

-> If a truncation is requested, the master waits for all nodes to be pending,
   even if the cluster was previously started (the user can still force the
   cluster to start with neoctl). Any node lost during verification also causes
   the master to go back to recovery.
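
A simplified model of the corresponding check in the master's recovery loop
(helper written for illustration only):

    def can_leave_recovery(truncate_tid, operational_node_set, all_node_set):
        """Return True when the master may proceed to VERIFYING."""
        if truncate_tid:
            # A truncation is pending: wait for every known storage node,
            # not only for those needed to be operational, so that no node
            # can come back later with an untruncated history.
            node_set = all_node_set
        else:
            node_set = operational_node_set
        return bool(node_set) and all(node.isPending() for node in node_set)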

Obviously, the protocol has been changed to split the LastIDs packet and
introduce a new Recovery packet, since it no longer makes sense to ask the last
ids during recovery.
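
For illustration, the payloads of the two answers after the split (field names
as in the protocol definitions below, shown here as plain tuples):

    # AnswerRecovery: everything the master needs while in RECOVERING.
    answer_recovery = ('ptid', 'backup_tid', 'truncate_tid')
    # AnswerLastIDs: only what the TransactionManager needs, asked at the
    # end of VERIFYING (which is also when truncation becomes effective).
    answer_last_ids = ('last_oid', 'last_tid')
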
parent 3e3eab5b
......@@ -141,9 +141,7 @@
Admin
- Make admin node able to monitor multiple clusters simultaneously
- Send notifications (ie: mail) when a storage or master node is lost
- Add ctl command to truncate DB at arbitrary TID. 'Truncate' message
can be reused. There should also be a way to list last transactions,
like fstail for FileStorage.
- Add ctl command to list last transactions, like fstail for FileStorage.
Tests
- Use another mock library: Python 3.3+ has unittest.mock, which is
......
......@@ -65,10 +65,12 @@ class AdminEventHandler(EventHandler):
askLastIDs = forward_ask(Packets.AskLastIDs)
askLastTransaction = forward_ask(Packets.AskLastTransaction)
addPendingNodes = forward_ask(Packets.AddPendingNodes)
askRecovery = forward_ask(Packets.AskRecovery)
tweakPartitionTable = forward_ask(Packets.TweakPartitionTable)
setClusterState = forward_ask(Packets.SetClusterState)
setNodeState = forward_ask(Packets.SetNodeState)
checkReplicas = forward_ask(Packets.CheckReplicas)
truncate = forward_ask(Packets.Truncate)
class MasterEventHandler(EventHandler):
......
......@@ -102,11 +102,17 @@ class PrimaryNotificationsHandler(MTEventHandler):
if app.master_conn is None:
app._cache_lock_acquire()
try:
oid_list = app._cache.clear_current()
db = app.getDB()
if db is not None:
db.invalidate(app.last_tid and
add64(app.last_tid, 1), oid_list)
if app.last_tid < ltid:
oid_list = app._cache.clear_current()
db is None or db.invalidate(
app.last_tid and add64(app.last_tid, 1),
oid_list)
else:
# The DB was truncated. It happens so
# rarely that we don't need to optimize.
app._cache.clear()
db is None or db.invalidateCache()
finally:
app._cache_lock_release()
app.last_tid = ltid
......
......@@ -23,7 +23,7 @@ class ElectionFailure(NeoException):
class PrimaryFailure(NeoException):
pass
class OperationFailure(NeoException):
class StoppedOperation(NeoException):
pass
class DatabaseFailure(NeoException):
......
......@@ -722,16 +722,24 @@ class ReelectPrimary(Packet):
Force a re-election of a primary master node. M -> M.
"""
class Recovery(Packet):
"""
Ask all data needed by master to recover. PM -> S, S -> PM.
"""
_answer = PStruct('answer_recovery',
PPTID('ptid'),
PTID('backup_tid'),
PTID('truncate_tid'),
)
class LastIDs(Packet):
"""
Ask the last OID, the last TID and the last Partition Table ID so that
a master recover. PM -> S, S -> PM.
Ask the last OID/TID so that a master can initialize its TransactionManager.
PM -> S, S -> PM.
"""
_answer = PStruct('answer_last_ids',
POID('last_oid'),
PTID('last_tid'),
PPTID('last_ptid'),
PTID('backup_tid'),
)
class PartitionTable(Packet):
......@@ -1470,13 +1478,14 @@ class ReplicationDone(Packet):
class Truncate(Packet):
"""
XXX: Used for both make storage consistent and leave backup mode
M -> S
Request DB to be truncated. Also used to leave backup mode.
"""
_fmt = PStruct('truncate',
PTID('tid'),
)
_answer = Error
StaticRegistry = {}
def register(request, ignore_when_closed=None):
......@@ -1594,6 +1603,8 @@ class Packets(dict):
ReelectPrimary)
NotifyNodeInformation = register(
NotifyNodeInformation)
AskRecovery, AnswerRecovery = register(
Recovery)
AskLastIDs, AnswerLastIDs = register(
LastIDs)
AskPartitionTable, AnswerPartitionTable = register(
......
......@@ -24,7 +24,7 @@ from neo.lib.protocol import uuid_str, UUID_NAMESPACES, ZERO_TID
from neo.lib.protocol import ClusterStates, NodeStates, NodeTypes, Packets
from neo.lib.handler import EventHandler
from neo.lib.connection import ListeningConnection, ClientConnection
from neo.lib.exception import ElectionFailure, PrimaryFailure, OperationFailure
from neo.lib.exception import ElectionFailure, PrimaryFailure, StoppedOperation
class StateChangedException(Exception): pass
......@@ -45,6 +45,7 @@ class Application(BaseApplication):
backup_tid = None
backup_app = None
uuid = None
truncate_tid = None
def __init__(self, config):
super(Application, self).__init__(
......@@ -331,12 +332,9 @@ class Application(BaseApplication):
# machines but must not start automatically: otherwise, each storage
# node would diverge.
self._startup_allowed = False
self.truncate_tid = None
try:
while True:
self.runManager(RecoveryManager)
# Automatic restart if we become non-operational.
self._startup_allowed = True
try:
self.runManager(VerificationManager)
if not self.backup_tid:
......@@ -346,10 +344,13 @@ class Application(BaseApplication):
if self.backup_app is None:
raise RuntimeError("No upstream cluster to backup"
" defined in configuration")
self.truncate_tid = self.backup_app.provideService()
except OperationFailure:
truncate = Packets.Truncate(
self.backup_app.provideService())
except StoppedOperation, e:
logging.critical('No longer operational')
self.truncate_tid = None
truncate = Packets.Truncate(*e.args) if e.args else None
# Automatic restart except if we truncate or retry to.
self._startup_allowed = not (self.truncate_tid or truncate)
node_list = []
for node in self.nm.getIdentifiedList():
if node.isStorage() or node.isClient():
......@@ -357,7 +358,10 @@ class Application(BaseApplication):
conn.notify(Packets.StopOperation())
if node.isClient():
conn.abort()
elif node.isRunning():
continue
if truncate:
conn.notify(truncate)
if node.isRunning():
node.setPending()
node_list.append(node)
self.broadcastNodesInformation(node_list)
......@@ -475,7 +479,7 @@ class Application(BaseApplication):
# wait for all transaction to be finished
while self.tm.hasPending():
self.em.poll(1)
except OperationFailure:
except StoppedOperation:
logging.critical('No longer operational')
logging.info("asking remaining nodes to shutdown")
......
......@@ -152,17 +152,19 @@ class BackupApplication(object):
assert tid != ZERO_TID
logging.warning("Truncating at %s (last_tid was %s)",
dump(app.backup_tid), dump(last_tid))
# We will really truncate so do not start automatically
# if there's any missing storage.
app._startup_allowed = False
else:
# We will do a dummy truncation, just to leave backup mode,
# so it's fine to start automatically if there's any
# missing storage.
# XXX: Consider using another method to leave backup mode,
# at least when there's nothing to truncate. Because
# in case of StoppedOperation during VERIFYING state,
# this flag will be wrongly set to False.
app._startup_allowed = True
# If any error happened before reaching this line, we'd go back
# to backup mode, which is the right mode to recover.
del app.backup_tid
# We will go through a recovery phase in order to reset the
# transaction manager and this is only possible if storages
# already know that we left backup mode. To that purpose, we
# always stop operation with a tid, even if there's nothing to
# truncate.
# Now back to RECOVERY...
return tid
finally:
del self.primary_partition_dict, self.tid_list
......
......@@ -15,7 +15,7 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from neo.lib import logging
from neo.lib.exception import OperationFailure
from neo.lib.exception import StoppedOperation
from neo.lib.handler import EventHandler
from neo.lib.protocol import (uuid_str, NodeTypes, NodeStates, Packets,
BrokenNodeDisallowedError,
......@@ -66,13 +66,16 @@ class MasterHandler(EventHandler):
state = self.app.getClusterState()
conn.answer(Packets.AnswerClusterState(state))
def askLastIDs(self, conn):
def askRecovery(self, conn):
app = self.app
conn.answer(Packets.AnswerLastIDs(
app.tm.getLastOID(),
app.tm.getLastTID(),
conn.answer(Packets.AnswerRecovery(
app.pt.getID(),
app.backup_tid))
app.backup_tid and app.pt.getBackupTid(),
app.truncate_tid))
def askLastIDs(self, conn):
tm = self.app.tm
conn.answer(Packets.AnswerLastIDs(tm.getLastOID(), tm.getLastTID()))
def askLastTransaction(self, conn):
conn.answer(Packets.AnswerLastTransaction(
......@@ -130,9 +133,11 @@ class BaseServiceHandler(MasterHandler):
logging.info('drop a pending node from the node manager')
app.nm.remove(node)
app.broadcastNodesInformation([node])
if app.truncate_tid:
raise StoppedOperation
app.broadcastPartitionChanges(app.pt.outdate(node))
if not app.pt.operational():
raise OperationFailure("cannot continue operation")
raise StoppedOperation
def notifyReady(self, conn):
self.app.setStorageReady(conn.getUUID())
......
......@@ -19,6 +19,7 @@ import random
from . import MasterHandler
from ..app import StateChangedException
from neo.lib import logging
from neo.lib.exception import StoppedOperation
from neo.lib.pt import PartitionTableException
from neo.lib.protocol import ClusterStates, Errors, \
NodeStates, NodeTypes, Packets, ProtocolError, uuid_str
......@@ -159,6 +160,13 @@ class AdministrationHandler(MasterHandler):
map(app.nm.getByUUID, uuid_list)))
conn.answer(Errors.Ack(''))
def truncate(self, conn, tid):
app = self.app
if app.cluster_state != ClusterStates.RUNNING:
raise ProtocolError('Can not truncate in this state')
conn.answer(Errors.Ack(''))
raise StoppedOperation(tid)
def checkReplicas(self, conn, partition_dict, min_tid, max_tid):
app = self.app
pt = app.pt
......
......@@ -16,7 +16,7 @@
from neo.lib import logging
from neo.lib.protocol import CellStates, ClusterStates, Packets, ProtocolError
from neo.lib.exception import OperationFailure
from neo.lib.exception import StoppedOperation
from neo.lib.pt import PartitionTableException
from . import BaseServiceHandler
......@@ -76,7 +76,7 @@ class StorageServiceHandler(BaseServiceHandler):
CellStates.CORRUPTED))
self.app.broadcastPartitionChanges(change_list)
if not self.app.pt.operational():
raise OperationFailure('cannot continue operation')
raise StoppedOperation
def notifyReplicationDone(self, conn, offset, tid):
app = self.app
......
......@@ -15,7 +15,6 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from neo.lib import logging
from neo.lib.util import dump
from neo.lib.protocol import Packets, ProtocolError, ClusterStates, NodeStates
from .handlers import MasterHandler
......@@ -30,6 +29,7 @@ class RecoveryManager(MasterHandler):
self.target_ptid = None
self.ask_pt = []
self.backup_tid_dict = {}
self.truncate_dict = {}
def getHandler(self):
return self
......@@ -49,7 +49,6 @@ class RecoveryManager(MasterHandler):
"""
logging.info('begin the recovery of the status')
app = self.app
app.tm.reset()
pt = app.pt
app.changeClusterState(ClusterStates.RECOVERING)
pt.clear()
......@@ -64,7 +63,11 @@ class RecoveryManager(MasterHandler):
node_list = pt.getOperationalNodeSet()
if app._startup_allowed:
node_list = [node for node in node_list if node.isPending()]
elif not all(node.isPending() for node in node_list):
elif node_list:
# we want all nodes to be there if we're going to truncate
if app.truncate_tid:
node_list = pt.getNodeSet()
if not all(node.isPending() for node in node_list):
continue
elif app._startup_allowed or app.autostart:
# No partition table and admin allowed startup, we are
......@@ -77,6 +80,17 @@ class RecoveryManager(MasterHandler):
if node_list and not any(node.getConnection().isPending()
for node in node_list):
if pt.filled():
if app.truncate_tid:
node_list = app.nm.getIdentifiedList(pool_set={uuid
for uuid, tid in self.truncate_dict.iteritems()
if not tid or app.truncate_tid < tid})
if node_list:
truncate = Packets.Truncate(app.truncate_tid)
for node in node_list:
conn = node.getConnection()
conn.notify(truncate)
self.connectionCompleted(conn, False)
continue
node_list = pt.getConnectedNodeList()
break
......@@ -101,12 +115,13 @@ class RecoveryManager(MasterHandler):
pt.setBackupTidDict(self.backup_tid_dict)
app.backup_tid = pt.getBackupTid()
logging.debug('cluster starts with loid=%s and this partition table :',
dump(app.tm.getLastOID()))
logging.debug('cluster starts this partition table:')
pt.log()
def connectionLost(self, conn, new_state):
uuid = conn.getUUID()
self.backup_tid_dict.pop(uuid, None)
self.truncate_dict.pop(uuid, None)
node = self.app.nm.getByUUID(uuid)
try:
i = self.ask_pt.index(uuid)
......@@ -129,40 +144,38 @@ class RecoveryManager(MasterHandler):
self.app.broadcastNodesInformation([node])
def connectionCompleted(self, conn, new):
tid = self.app.truncate_tid
if tid:
conn.notify(Packets.Truncate(tid))
# ask the last IDs to perform the recovery
conn.ask(Packets.AskLastIDs())
conn.ask(Packets.AskRecovery())
def answerLastIDs(self, conn, loid, ltid, lptid, backup_tid):
tm = self.app.tm
tm.setLastOID(loid)
tm.setLastTID(ltid)
def answerRecovery(self, conn, ptid, backup_tid, truncate_tid):
uuid = conn.getUUID()
if self.target_ptid <= lptid:
if self.target_ptid <= ptid:
# Maybe a newer partition table.
if self.target_ptid == lptid and self.ask_pt:
if self.target_ptid == ptid and self.ask_pt:
# Another node is already asked.
self.ask_pt.append(uuid)
elif self.target_ptid < lptid or self.ask_pt is not ():
elif self.target_ptid < ptid or self.ask_pt is not ():
# No node asked yet for the newest partition table.
self.target_ptid = lptid
self.target_ptid = ptid
self.ask_pt = [uuid]
conn.ask(Packets.AskPartitionTable())
self.backup_tid_dict[uuid] = backup_tid
self.truncate_dict[uuid] = truncate_tid
def answerPartitionTable(self, conn, ptid, row_list):
# If this is not from a target node, ignore it.
if ptid == self.target_ptid:
app = self.app
try:
new_nodes = self.app.pt.load(ptid, row_list, self.app.nm)
new_nodes = app.pt.load(ptid, row_list, app.nm)
except IndexError:
raise ProtocolError('Invalid offset')
self._notifyAdmins(Packets.NotifyNodeInformation(new_nodes),
Packets.SendPartitionTable(ptid, row_list))
self.ask_pt = ()
self.app.backup_tid = self.backup_tid_dict[conn.getUUID()]
uuid = conn.getUUID()
app.backup_tid = self.backup_tid_dict[uuid]
app.truncate_tid = self.truncate_dict[uuid]
def _notifyAdmins(self, *packets):
for node in self.app.nm.getAdminList(only_identified=True):
......
......@@ -59,9 +59,18 @@ class VerificationManager(BaseServiceHandler):
def run(self):
app = self.app
app.changeClusterState(ClusterStates.VERIFYING)
app.tm.reset()
if not app.backup_tid:
self.verifyData()
# This is where storages truncate if requested:
# - we make sure all nodes are running with a truncate_tid value saved
# - there's no unfinished data
# - just before they return the last tid/oid
self._askStorageNodesAndWait(Packets.AskLastIDs(),
[x for x in app.nm.getIdentifiedList() if x.isStorage()])
app.setLastTransaction(app.tm.getLastTID())
# Just to not return meaningless information in AnswerRecovery.
app.truncate_tid = None
def verifyData(self):
app = self.app
......@@ -97,33 +106,18 @@ class VerificationManager(BaseServiceHandler):
# Finish all transactions for which we know that tpc_finish was called
# but not fully processed. This may include replicas with transactions
# that were not even locked.
all_set = set()
for ttid, tid in self._locked_dict.iteritems():
uuid_set = self._voted_dict.get(ttid)
if uuid_set:
all_set |= uuid_set
packet = Packets.ValidateTransaction(ttid, tid)
for node in getIdentifiedList(pool_set=uuid_set):
node.notify(packet)
# Ask last oid/tid again for nodes that recovers locked transactions.
# In fact, this is mainly for the last oid since the last tid can be
# deduced from max(self._locked_dict.values()).
# If getLastIDs is not always instantaneous for some backends, we
# should split AskLastIDs to not ask the last oid/tid at the end of
# recovery phase (and instead ask all nodes once, here).
# With this request, we also prefer to make sure all nodes validate
# successfully before switching to RUNNING state.
self._askStorageNodesAndWait(Packets.AskLastIDs(),
getIdentifiedList(all_set))
def answerLastIDs(self, conn, loid, ltid, lptid, backup_tid):
def answerLastIDs(self, conn, loid, ltid):
self._uuid_set.remove(conn.getUUID())
tm = self.app.tm
tm.setLastOID(loid)
tm.setLastTID(ltid)
ptid = self.app.pt.getID()
assert lptid < ptid if None != lptid != ptid else not backup_tid
def answerLockedTransactions(self, conn, tid_dict):
uuid = conn.getUUID()
......
......@@ -37,6 +37,7 @@ action_dict = {
'tweak': 'tweakPartitionTable',
'drop': 'dropNode',
'kill': 'killNode',
'truncate': 'truncate',
}
uuid_int = (lambda ns: lambda uuid:
......@@ -85,11 +86,14 @@ class TerminalNeoCTL(object):
Get last ids.
"""
assert not params
r = self.neoctl.getLastIds()
if r[3]:
return "last_tid = 0x%x" % u64(self.neoctl.getLastTransaction())
return "last_oid = 0x%x\nlast_tid = 0x%x\nlast_ptid = %u" % (
u64(r[0]), u64(r[1]), r[2])
ptid, backup_tid, truncate_tid = self.neoctl.getRecovery()
if backup_tid:
ltid = self.neoctl.getLastTransaction()
r = "backup_tid = 0x%x" % u64(backup_tid)
else:
loid, ltid = self.neoctl.getLastIds()
r = "last_oid = 0x%x" % u64(loid)
return r + "\nlast_tid = 0x%x\nlast_ptid = %u" % (u64(ltid), ptid)
def getPartitionRowList(self, params):
"""
......@@ -193,6 +197,19 @@ class TerminalNeoCTL(object):
"""
return uuid_str(self.neoctl.getPrimary())
def truncate(self, params):
"""
Truncate the database at the given tid.
The cluster must be in RUNNING state, without any pending transaction.
This causes the cluster to go back in RECOVERING state, waiting all
nodes to be pending (do not use 'start' command unless you're sure
the missing nodes don't need to be truncated).
Parameters: tid
"""
self.neoctl.truncate(self.asTID(*params))
def checkReplicas(self, params):
"""
Test whether partitions have corrupted metadata
......
......@@ -61,3 +61,4 @@ class CommandEventHandler(EventHandler):
answerPrimary = __answer(Packets.AnswerPrimary)
answerLastIDs = __answer(Packets.AnswerLastIDs)
answerLastTransaction = __answer(Packets.AnswerLastTransaction)
answerRecovery = __answer(Packets.AnswerRecovery)
......@@ -120,6 +120,12 @@ class NeoCTL(BaseApplication):
raise RuntimeError(response)
return response[1]
def getRecovery(self):
response = self.__ask(Packets.AskRecovery())
if response[0] != Packets.AnswerRecovery:
raise RuntimeError(response)
return response[1:]
def getNodeList(self, node_type=None):
"""
Get a list of nodes, filtering with given type.
......@@ -163,6 +169,12 @@ class NeoCTL(BaseApplication):
raise RuntimeError(response)
return response[1]
def truncate(self, tid):
response = self.__ask(Packets.Truncate(tid))
if response[0] != Packets.Error or response[1] != ErrorCodes.ACK:
raise RuntimeError(response)
return response[2]
def checkReplicas(self, *args):
response = self.__ask(Packets.CheckReplicas(*args))
if response[0] != Packets.Error or response[1] != ErrorCodes.ACK:
......
......@@ -23,7 +23,7 @@ from neo.lib.protocol import uuid_str, \
CellStates, ClusterStates, NodeTypes, Packets
from neo.lib.node import NodeManager
from neo.lib.connection import ListeningConnection
from neo.lib.exception import OperationFailure, PrimaryFailure
from neo.lib.exception import StoppedOperation, PrimaryFailure
from neo.lib.pt import PartitionTable
from neo.lib.util import dump
from neo.lib.bootstrap import BootstrapManager
......@@ -196,7 +196,7 @@ class Application(BaseApplication):
self.initialize()
self.doOperation()
raise RuntimeError, 'should not reach here'
except OperationFailure, msg:
except StoppedOperation, msg:
logging.error('operation stopped: %s', msg)
except PrimaryFailure, msg:
logging.error('primary master is down: %s', msg)
......
......@@ -194,10 +194,18 @@ class DatabaseManager(object):
def getBackupTID(self):
return util.bin(self.getConfiguration('backup_tid'))
def setBackupTID(self, backup_tid):
tid = util.dump(backup_tid)
def _setBackupTID(self, tid):
tid = util.dump(tid)
logging.debug('backup_tid = %s', tid)
return self.setConfiguration('backup_tid', tid)
return self._setConfiguration('backup_tid', tid)
def getTruncateTID(self):
return util.bin(self.getConfiguration('truncate_tid'))
def _setTruncateTID(self, tid):
tid = util.dump(tid)
logging.debug('truncate_tid = %s', tid)
return self._setConfiguration('truncate_tid', tid)
def _setPackTID(self, tid):
self._setConfiguration('_pack_tid', tid)
......@@ -502,11 +510,14 @@ class DatabaseManager(object):
and max_tid (included)"""
raise NotImplementedError
def truncate(self, tid):
assert tid not in (None, ZERO_TID), tid
def truncate(self):
tid = self.getTruncateTID()
if tid:
assert tid != ZERO_TID, tid
for partition in xrange(self.getNumPartitions()):
self._deleteRange(partition, tid)
self.setBackupTID(None) # this also commits
self._setTruncateTID(None)
self.commit()
def getTransaction(self, tid, all = False):
"""Return a tuple of the list of OIDs, user information,
......
......@@ -16,7 +16,7 @@
from neo.lib import logging
from neo.lib.handler import EventHandler
from neo.lib.exception import PrimaryFailure, OperationFailure
from neo.lib.exception import PrimaryFailure, StoppedOperation
from neo.lib.protocol import uuid_str, NodeStates, NodeTypes, Packets
class BaseMasterHandler(EventHandler):
......@@ -27,7 +27,7 @@ class BaseMasterHandler(EventHandler):
raise PrimaryFailure('connection lost')
def stopOperation(self, conn):
raise OperationFailure('operation stopped')
raise StoppedOperation
def reelectPrimary(self, conn):
raise PrimaryFailure('re-election occurs')
......@@ -48,7 +48,7 @@ class BaseMasterHandler(EventHandler):
erase = state == NodeStates.DOWN
self.app.shutdown(erase=erase)
elif state == NodeStates.HIDDEN:
raise OperationFailure
raise StoppedOperation
elif node_type == NodeTypes.CLIENT and state != NodeStates.RUNNING:
logging.info('Notified of non-running client, abort (%s)',
uuid_str(uuid))
......
......@@ -46,16 +46,23 @@ class InitializationHandler(BaseMasterHandler):
app.dm.changePartitionTable(ptid, cell_list, reset=True)
def truncate(self, conn, tid):
self.app.dm.truncate(tid)
dm = self.app.dm
dm._setBackupTID(None)
dm._setTruncateTID(tid)
dm.commit()
def askLastIDs(self, conn):
def askRecovery(self, conn):
app = self.app
ltid, _, _, loid = app.dm.getLastIDs()
conn.answer(Packets.AnswerLastIDs(
loid,
ltid,
conn.answer(Packets.AnswerRecovery(
app.pt.getID(),
app.dm.getBackupTID()))
app.dm.getBackupTID(),
app.dm.getTruncateTID()))
def askLastIDs(self, conn):
dm = self.app.dm
dm.truncate()
ltid, _, _, loid = dm.getLastIDs()
conn.answer(Packets.AnswerLastIDs(loid, ltid))
def askPartitionTable(self, conn):
pt = self.app.pt
......@@ -80,4 +87,5 @@ class InitializationHandler(BaseMasterHandler):
tid = dm.getLastIDs()[0] or ZERO_TID
else:
tid = None
dm.setBackupTID(tid)
dm._setBackupTID(tid)
dm.commit()
......@@ -28,7 +28,8 @@ class MasterOperationHandler(BaseMasterHandler):
assert self.app.operational and backup
dm = self.app.dm
if not dm.getBackupTID():
dm.setBackupTID(dm.getLastIDs()[0] or ZERO_TID)
dm._setBackupTID(dm.getLastIDs()[0] or ZERO_TID)
dm.commit()
def notifyTransactionFinished(self, conn, *args, **kw):
self.app.replicator.transactionFinished(*args, **kw)
......
......@@ -128,7 +128,8 @@ class Replicator(object):
if tid:
new_tid = self.getBackupTID()
if tid != new_tid:
dm.setBackupTID(new_tid)
dm._setBackupTID(new_tid)
dm.commit()
def populate(self):
app = self.app
......
......@@ -67,29 +67,6 @@ class MasterRecoveryTests(NeoUnitTestBase):
self.assertEqual(self.app.nm.getByAddress(conn.getAddress()).getState(),
NodeStates.TEMPORARILY_DOWN)
def test_09_answerLastIDs(self):
recovery = self.recovery
uuid = self.identifyToMasterNode()
oid1 = self.getOID(1)
oid2 = self.getOID(2)
tid1 = self.getNextTID()
tid2 = self.getNextTID(tid1)
ptid1 = self.getPTID(1)
ptid2 = self.getPTID(2)
self.app.tm.setLastOID(oid1)
self.app.tm.setLastTID(tid1)
self.app.pt.setID(ptid1)
# send information which are later to what PMN knows, this must update target node
conn = self.getFakeConnection(uuid, self.storage_port)
self.assertTrue(ptid2 > self.app.pt.getID())
self.assertTrue(oid2 > self.app.tm.getLastOID())
self.assertTrue(tid2 > self.app.tm.getLastTID())
recovery.answerLastIDs(conn, oid2, tid2, ptid2, None)
self.assertEqual(oid2, self.app.tm.getLastOID())
self.assertEqual(tid2, self.app.tm.getLastTID())
self.assertEqual(ptid2, recovery.target_ptid)
def test_10_answerPartitionTable(self):
recovery = self.recovery
uuid = self.identifyToMasterNode(NodeTypes.MASTER, port=self.master_port)
......
......@@ -21,7 +21,7 @@ from neo.lib.protocol import NodeTypes, NodeStates, Packets
from neo.master.handlers.storage import StorageServiceHandler
from neo.master.handlers.client import ClientServiceHandler
from neo.master.app import Application
from neo.lib.exception import OperationFailure
from neo.lib.exception import StoppedOperation
class MasterStorageHandlerTests(NeoUnitTestBase):
......@@ -114,24 +114,6 @@ class MasterStorageHandlerTests(NeoUnitTestBase):
self.checkNotifyUnlockInformation(storage_conn_1)
self.checkNotifyUnlockInformation(storage_conn_2)
def test_12_askLastIDs(self):
service = self.service
node, conn = self.identifyToMasterNode()
# give a uuid
conn = self.getFakeConnection(node.getUUID(), self.storage_address)
ptid = self.app.pt.getID()
oid = self.getOID(1)
tid = self.getNextTID()
self.app.tm.setLastOID(oid)
self.app.tm.setLastTID(tid)
service.askLastIDs(conn)
packet = self.checkAnswerLastIDs(conn)
loid, ltid, lptid, backup_tid = packet.decode()
self.assertEqual(loid, oid)
self.assertEqual(ltid, tid)
self.assertEqual(lptid, ptid)
self.assertEqual(backup_tid, None)
def test_13_askUnfinishedTransactions(self):
service = self.service
node, conn = self.identifyToMasterNode()
......@@ -173,7 +155,7 @@ class MasterStorageHandlerTests(NeoUnitTestBase):
# drop the second, no storage node left
lptid = self.app.pt.getID()
self.assertEqual(node2.getState(), NodeStates.RUNNING)
self.assertRaises(OperationFailure, method, conn2)
self.assertRaises(StoppedOperation, method, conn2)
self.assertEqual(node2.getState(), state)
self.assertEqual(lptid, self.app.pt.getID())
......
......@@ -20,7 +20,7 @@ from collections import deque
from .. import NeoUnitTestBase
from neo.storage.app import Application
from neo.storage.handlers.master import MasterOperationHandler
from neo.lib.exception import PrimaryFailure, OperationFailure
from neo.lib.exception import PrimaryFailure
from neo.lib.pt import PartitionTable
from neo.lib.protocol import CellStates, ProtocolError, Packets
......@@ -104,11 +104,6 @@ class StorageMasterHandlerTests(NeoUnitTestBase):
self.assertEqual(len(calls), 1)
calls[0].checkArgs(ptid2, cells)
def test_16_stopOperation1(self):
# OperationFailure
conn = self.getFakeConnection(is_server=False)
self.assertRaises(OperationFailure, self.operation.stopOperation, conn)
def _getConnection(self):
return self.getFakeConnection()
......
......@@ -26,7 +26,7 @@ from ZODB import DB, POSException
from neo.storage.transactions import TransactionManager, \
DelayedError, ConflictError
from neo.lib.connection import ConnectionClosed, MTClientConnection
from neo.lib.exception import OperationFailure
from neo.lib.exception import StoppedOperation
from neo.lib.protocol import CellStates, ClusterStates, NodeStates, Packets, \
ZERO_TID
from .. import expectedFailure, _ExpectedFailure, _UnexpectedSuccess, Patch
......@@ -933,7 +933,7 @@ class Test(NEOThreadedTest):
def testStorageFailureDuringTpcFinish(self):
def answerTransactionFinished(conn, packet):
if isinstance(packet, Packets.AnswerTransactionFinished):
raise OperationFailure
raise StoppedOperation
cluster = NEOCluster()
try:
cluster.start()
......@@ -1059,6 +1059,64 @@ class Test(NEOThreadedTest):
finally:
cluster.stop()
def testTruncate(self):
calls = [0, 0]
def dieFirst(i):
def f(orig, *args, **kw):
calls[i] += 1
if calls[i] == 1:
sys.exit()
return orig(*args, **kw)
return f
cluster = NEOCluster(replicas=1)
try:
cluster.start()
t, c = cluster.getTransaction()
r = c.root()
tids = []
for x in xrange(4):
r[x] = None
t.commit()
tids.append(r._p_serial)
truncate_tid = tids[2]
r['x'] = PCounter()
s0, s1 = cluster.storage_list
with Patch(s0.tm, unlock=dieFirst(0)), \
Patch(s1.dm, truncate=dieFirst(1)):
t.commit()
cluster.neoctl.truncate(truncate_tid)
self.tic()
getClusterState = cluster.neoctl.getClusterState
# Unless forced, the cluster waits all nodes to be up,
# so that all nodes are truncated.
self.assertEqual(getClusterState(), ClusterStates.RECOVERING)
self.assertEqual(calls, [1, 0])
s0.resetNode()
s0.start()
# s0 died with unfinished data, and before processing the
# Truncate packet from the master.
self.assertFalse(s0.dm.getTruncateTID())
self.assertEqual(s1.dm.getTruncateTID(), truncate_tid)
self.tic()
self.assertEqual(calls, [1, 1])
self.assertEqual(getClusterState(), ClusterStates.RECOVERING)
s1.resetNode()
with Patch(s1.dm, truncate=dieFirst(1)):
s1.start()
self.assertEqual(s0.dm.getLastIDs()[0], truncate_tid)
self.assertEqual(s1.dm.getLastIDs()[0], r._p_serial)
self.tic()
self.assertEqual(calls, [1, 2])
self.assertEqual(getClusterState(), ClusterStates.RUNNING)
t.begin()
self.assertEqual(r, dict.fromkeys(xrange(3)))
self.assertEqual(r._p_serial, truncate_tid)
self.assertEqual(1, u64(c._storage.new_oid()))
for s in cluster.storage_list:
self.assertEqual(s.dm.getLastIDs()[0], truncate_tid)
finally:
cluster.stop()
if __name__ == "__main__":
unittest.main()