Commit fd4cfaa9 authored by Julien Muchembled's avatar Julien Muchembled

Add test showing that clients may be stuck on an old snapshot in case of failure during tpc_finish

If anything goes wrong after a transaction is locked and before the end of
onTransactionCommitted, the recovery phase should be run again, so that the
master gets the correct last tid.

The following patch by Vincent is an attempt to fix this:

--- a/neo/master/app.py
+++ b/neo/master/app.py
@@ -329,8 +329,8 @@ def playPrimaryRole(self):

         # recover the cluster status at startup
         try:
-            self.runManager(RecoveryManager)
             while True:
+                self.runManager(RecoveryManager)
                 self.runManager(VerificationManager)
                 try:
                     if self.backup_tid:
@@ -338,10 +338,6 @@ def playPrimaryRole(self):
                             raise RuntimeError("No upstream cluster to backup"
                                                " defined in configuration")
                         self.backup_app.provideService()
-                        # Reset connection with storages (and go through a
-                        # recovery phase) when leaving backup mode in order
-                        # to get correct last oid/tid.
-                        self.runManager(RecoveryManager)
                         continue
                     self.provideService()
                 except OperationFailure:
parent ffa5f6fc
......@@ -151,6 +151,7 @@ RC - Review output of pylint (CODE)
legitimately think transaction is not committed, and might decide to
retry. To solve this, client can know if its TTID got successfully
committed by looking at currently unused '(t)trans.ttid' column.
See neo.threaded.test.Test.testStorageFailureDuringTpcFinish
- Fix and reenable deadlock avoidance (SPEED). This is required for
neo.threaded.test.Test.testDeadlockAvoidance
......
......@@ -23,7 +23,7 @@ from persistent import Persistent
from ZODB import POSException
from neo.storage.transactions import TransactionManager, \
DelayedError, ConflictError
from neo.lib.connection import MTClientConnection
from neo.lib.connection import ConnectionClosed, MTClientConnection
from neo.lib.protocol import CellStates, ClusterStates, NodeStates, Packets, \
ZERO_TID
from . import ClientApplication, NEOCluster, NEOThreadedTest, Patch
......@@ -702,6 +702,26 @@ class Test(NEOThreadedTest):
finally:
cluster.stop()
def testStorageFailureDuringTpcFinish(self):
    """Commit while the storage loses its master connection in tpc_finish.

    A filter registered through the master's filterConnection() for the
    client aborts the storage's connection to the master as soon as an
    AnswerTransactionFinished packet is seen. The commit is expected to
    raise ConnectionClosed; a new transaction is then begun and the 'x'
    entry of the root is accessed again.
    """
    def answerTransactionFinished(conn, packet):
        # Only the answer to tpc_finish triggers the failure; every
        # packet still gets the implicit None return value.
        if not isinstance(packet, Packets.AnswerTransactionFinished):
            return
        # Unpacking requires exactly one storage connection for the master.
        [storage_conn] = cluster.storage.getConnectionList(cluster.master)
        storage_conn.abort()

    cluster = NEOCluster()
    try:
        cluster.start()
        txn, connection = cluster.getTransaction()
        connection.root()['x'] = PCounter()
        # presumably filters master -> client traffic (original name: m2c)
        with cluster.master.filterConnection(cluster.client) as m2c_filter:
            m2c_filter.add(answerTransactionFinished)
            # XXX: This is an expected failure. A ttid column was added to
            #      'trans' table to permit recovery, by checking that the
            #      transaction was really committed.
            self.assertRaises(ConnectionClosed, txn.commit)
        txn.begin()
        connection.root()['x']
    finally:
        # Always shut the cluster down, even if an assertion failed.
        cluster.stop()
# Run the tests of this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment