...
 
Commits (7)
  • Julien Muchembled's avatar
    Merge v1.12 · 2c823e2e
    Julien Muchembled authored
    2c823e2e
  • Julien Muchembled's avatar
    mysql: workaround for MDEV-20693 · 70387981
    Julien Muchembled authored
    70387981
  • Julien Muchembled's avatar
    master: fix crash in STARTING_BACKUP when connecting to an upstream secondary master · 011eba12
    Julien Muchembled authored
    This fixes the following assertion:
    
      Traceback (most recent call last):
        File "neo/master/app.py", line 172, in run
          self._run()
        File "neo/master/app.py", line 182, in _run
          self.playPrimaryRole()
        File "neo/master/app.py", line 302, in playPrimaryRole
          self.backup_app.provideService())
        File "neo/master/backup_app.py", line 114, in provideService
          node, conn = bootstrap.getPrimaryConnection()
        File "neo/lib/bootstrap.py", line 74, in getPrimaryConnection
          poll(1)
        File "neo/lib/event.py", line 160, in poll
          to_process.process()
        File "neo/lib/connection.py", line 504, in process
          self._handlers.handle(self, self._queue.pop(0))
        File "neo/lib/connection.py", line 92, in handle
          self._handle(connection, packet)
        File "neo/lib/connection.py", line 107, in _handle
          pending[0][1].packetReceived(connection, packet)
        File "neo/lib/handler.py", line 125, in packetReceived
          self.dispatch(*args)
        File "neo/lib/handler.py", line 75, in dispatch
          method(conn, *args, **kw)
        File "neo/lib/handler.py", line 159, in notPrimaryMaster
          assert primary != self.app.server
      AttributeError: 'BackupApplication' object has no attribute 'server'
    
    (cherry picked from commit dba07e72)
    011eba12
  • Julien Muchembled's avatar
    Code clean-up, comment fixes · fa7fbad6
    Julien Muchembled authored
    (cherry picked from commit 43029be2)
    fa7fbad6
  • Julien Muchembled's avatar
    client: fix race with invalidations when starting a new transaction on ZODB 5 · 96a5c01f
    Julien Muchembled authored
    This requires ZODB >= 5.6.0
    
    (cherry picked from commit a7d101ec)
    96a5c01f
  • Julien Muchembled's avatar
    qa: skip broken ZODB test · a1418c9d
    Julien Muchembled authored
    ======================================================================
    FAIL: check_tid_ordering_w_commit (neo.tests.zodb.testBasic.BasicTests)
    ----------------------------------------------------------------------
    Traceback (most recent call last):
      File "ZODB/tests/BasicStorage.py", line 397, in check_tid_ordering_w_commit
        self.assertEqual(results.pop('lastTransaction'), tids[1])
      File "neo/tests/__init__.py", line 301, in assertEqual
        return super(NeoTestBase, self).assertEqual(first, second, msg=msg)
    failureException: '\x03\xd8\x85H\xbffp\xbb' != '\x03\xd8\x85H\xbfs\x0b\xdd'
    
    (cherry picked from commit f4cb59d2)
    a1418c9d
  • Julien Muchembled's avatar
    f2ea4be2
......@@ -17,7 +17,7 @@
import heapq
import random
import time
from collections import defaultdict
try:
from ZODB._compat import dumps, loads, _protocol
except ImportError:
......@@ -79,7 +79,7 @@ class Application(ThreadedApplication):
# no self-assigned NID, primary master will supply us one
self._cache = ClientCache() if cache_size is None else \
ClientCache(max_size=cache_size)
self._loading_oid = None
self._loading = defaultdict(lambda: (Lock(), []))
self.new_oid_list = ()
self.last_oid = '\0' * 8
self.storage_event_handler = storage.StorageEventHandler(self)
......@@ -90,19 +90,13 @@ class Application(ThreadedApplication):
self.notifications_handler = master.PrimaryNotificationsHandler( self)
self._txn_container = TransactionContainer()
# Lock definition :
# _load_lock is used to make loading and storing atomic
lock = Lock()
self._load_lock_acquire = lock.acquire
self._load_lock_release = lock.release
# _oid_lock is used to prevent several oid generations from
# running at the same time
lock = Lock()
self._oid_lock_acquire = lock.acquire
self._oid_lock_release = lock.release
lock = Lock()
# _cache_lock is used for the client cache
self._cache_lock_acquire = lock.acquire
self._cache_lock_release = lock.release
self._cache_lock = Lock()
# _connecting_to_master_node is used to prevent simultaneous master
# node connection attempts
self._connecting_to_master_node = Lock()
......@@ -397,21 +391,32 @@ class Application(ThreadedApplication):
"""
# TODO:
# - rename parameters (here? and in handlers & packet definitions)
acquire = self._cache_lock_acquire
release = self._cache_lock_release
# XXX: Consider using a more fine-grained lock.
self._load_lock_acquire()
acquired = False
lock = self._cache_lock
try:
acquire()
try:
result = self._loadFromCache(oid, tid, before_tid)
if result:
return result
self._loading_oid = oid
self._loading_invalidated = []
finally:
release()
while 1:
with lock:
if tid:
result = self._cache.load(oid, tid + '*')
assert not result or result[1] == tid
else:
result = self._cache.load(oid, before_tid)
if result:
return result
load_lock = self._loading[oid][0]
acquired = load_lock.acquire(0)
# Several concurrent cache misses for the same oid are probably
# for the same tid, so we use a per-oid lock to avoid requesting
# the same data from the storage node more than once.
if acquired:
# The first thread does load from storage,
# and fills cache with the response.
break
# The other threads wait for the first one to complete and
# loop, possibly resulting in a new cache miss if a different
# tid is actually wanted or if the data was too big.
with load_lock:
pass
# While the cache lock is released, an arbitrary number of
# invalidations may be processed, for this oid or not. And at this
# precise moment, if both tid and before_tid are None (which is
......@@ -427,20 +432,24 @@ class Application(ThreadedApplication):
# we got from master.
before_tid = p64(u64(self.last_tid) + 1)
data, tid, next_tid, _ = self._loadFromStorage(oid, tid, before_tid)
acquire()
try:
if self._loading_oid:
with lock:
loading = self._loading.pop(oid, None)
if loading:
assert loading[0] is load_lock
if not next_tid:
for t in self._loading_invalidated:
for t in loading[1]:
if tid < t:
next_tid = t
break
self._cache.store(oid, data, tid, next_tid)
# Else, we just reconnected to the master.
finally:
release()
finally:
self._load_lock_release()
load_lock.release()
except:
if acquired:
with lock:
self._loading.pop(oid, None)
load_lock.release()
raise
return data, tid, next_tid
def _loadFromStorage(self, oid, at_tid, before_tid):
......@@ -459,16 +468,6 @@ class Application(ThreadedApplication):
Packets.AskObject(oid, at_tid, before_tid),
askStorage)
def _loadFromCache(self, oid, at_tid=None, before_tid=None):
"""
Load from local cache, return None if not found.
"""
if at_tid:
result = self._cache.load(oid, at_tid + '*')
assert not result or result[1] == at_tid
return result
return self._cache.load(oid, before_tid)
def tpc_begin(self, storage, transaction, tid=None, status=' '):
"""Begin a new transaction."""
# First get a transaction, only one is allowed at a time
......@@ -670,7 +669,7 @@ class Application(ThreadedApplication):
txn_context = self._txn_container.pop(transaction)
if txn_context is None:
return
# We want that the involved nodes abort a transaction after any
# We want the involved nodes to abort a transaction after any
# other packet sent by the client for this transaction. IOW, if we
# already have a connection with a storage node, potentially with
# a pending write, aborting only via the master may lead to a race
......@@ -699,9 +698,8 @@ class Application(ThreadedApplication):
txn_context.conn_dict))
except ConnectionClosed:
pass
# We don't need to flush queue, as it won't be reused by future
# transactions (deleted on next line & indexed by transaction object
# instance).
# No need to flush queue, as it will be destroyed on return,
# along with txn_context.
self.dispatcher.forget_queue(txn_context.queue, flush_queue=False)
def tpc_finish(self, transaction, f=None):
......@@ -724,28 +722,22 @@ class Application(ThreadedApplication):
txn_container = self._txn_container
if not txn_container.get(transaction).voted:
self.tpc_vote(transaction)
checked_list = []
self._load_lock_acquire()
txn_context = txn_container.pop(transaction)
cache_dict = txn_context.cache_dict
checked_list = [oid for oid, data in cache_dict.iteritems()
if data is CHECKED_SERIAL]
for oid in checked_list:
del cache_dict[oid]
ttid = txn_context.ttid
p = Packets.AskFinishTransaction(ttid, cache_dict, checked_list)
try:
# Call finish on master
txn_context = txn_container.pop(transaction)
cache_dict = txn_context.cache_dict
checked_list = [oid for oid, data in cache_dict.iteritems()
if data is CHECKED_SERIAL]
for oid in checked_list:
del cache_dict[oid]
ttid = txn_context.ttid
p = Packets.AskFinishTransaction(ttid, cache_dict, checked_list)
try:
tid = self._askPrimary(p, cache_dict=cache_dict, callback=f)
assert tid
except ConnectionClosed:
tid = self._getFinalTID(ttid)
if not tid:
raise
return tid
finally:
self._load_lock_release()
tid = self._askPrimary(p, cache_dict=cache_dict, callback=f)
assert tid
except ConnectionClosed:
tid = self._getFinalTID(ttid)
if not tid:
raise
return tid
def _getFinalTID(self, ttid):
try:
......@@ -991,11 +983,8 @@ class Application(ThreadedApplication):
# It should not be otherwise required (clients should be free to load
# old data as long as it is available in cache, even if it was pruned
# by a pack), so don't bother invalidating on other clients.
self._cache_lock_acquire()
try:
with self._cache_lock:
self._cache.clear()
finally:
self._cache_lock_release()
def getLastTID(self, oid):
return self.load(oid)[1]
......
# -*- coding: utf-8 -*-
#
# Copyright (C) 2006-2019 Nexedi SA
#
......@@ -45,8 +46,7 @@ class PrimaryNotificationsHandler(MTEventHandler):
# Either we're connecting or we already know the last tid
# via invalidations.
assert app.master_conn is None, app.master_conn
app._cache_lock_acquire()
try:
with app._cache_lock:
if app_last_tid < ltid:
app._cache.clear_current()
# In the past, we tried not to invalidate the
......@@ -60,9 +60,7 @@ class PrimaryNotificationsHandler(MTEventHandler):
app._cache.clear()
# Make sure a parallel load won't refill the cache
# with garbage.
app._loading_oid = app._loading_invalidated = None
finally:
app._cache_lock_release()
app._loading.clear()
db = app.getDB()
db is None or db.invalidateCache()
app.last_tid = ltid
......@@ -70,21 +68,22 @@ class PrimaryNotificationsHandler(MTEventHandler):
def answerTransactionFinished(self, conn, _, tid, callback, cache_dict):
app = self.app
app.last_tid = tid
# Update cache
cache = app._cache
app._cache_lock_acquire()
try:
invalidate = cache.invalidate
loading_get = app._loading.get
with app._cache_lock:
for oid, data in cache_dict.iteritems():
# Update ex-latest value in cache
cache.invalidate(oid, tid)
invalidate(oid, tid)
loading = loading_get(oid)
if loading:
loading[1].append(tid)
if data is not None:
# Store in cache with no next_tid
cache.store(oid, data, tid, None)
if callback is not None:
callback(tid)
finally:
app._cache_lock_release()
app.last_tid = tid # see comment in invalidateObjects
def connectionClosed(self, conn):
app = self.app
......@@ -112,20 +111,24 @@ class PrimaryNotificationsHandler(MTEventHandler):
app = self.app
if app.ignore_invalidations:
return
app.last_tid = tid
app._cache_lock_acquire()
try:
with app._cache_lock:
invalidate = app._cache.invalidate
loading = app._loading_oid
loading_get = app._loading.get
for oid in oid_list:
invalidate(oid, tid)
if oid == loading:
app._loading_invalidated.append(tid)
loading = loading_get(oid)
if loading:
loading[1].append(tid)
db = app.getDB()
if db is not None:
db.invalidate(tid, oid_list)
finally:
app._cache_lock_release()
# ZODB<5: Update before releasing the lock so that app.load
# asks the last serial (with respect to already processed
# invalidations by Connection._setstate).
# ZODB≥5: Update after db.invalidate because the MVCC
# adapter starts at the greatest TID between
# IStorage.lastTransaction and processed invalidations.
app.last_tid = tid
def sendPartitionTable(self, conn, ptid, num_replicas, row_list):
pt = self.app.pt = object.__new__(PartitionTable)
......
......@@ -50,7 +50,7 @@ class Transaction(object):
self.conflict_dict = {} # {oid: serial}
# resolved conflicts
self.resolved_dict = {} # {oid: serial}
# involved storage nodes; connection is None is connection was lost
# involved storage nodes; connection is None if connection was lost
self.conn_dict = {} # {node_id: connection}
def __repr__(self):
......
......@@ -197,8 +197,7 @@ elif IF == 'trace-cache':
@defer
def profile(app):
app._cache_lock_acquire()
try:
with app._cache_lock:
cache = app._cache
if type(cache) is ClientCache:
app._cache = CacheTracer(cache, '%s-%s.neo-cache-trace' %
......@@ -206,5 +205,3 @@ elif IF == 'trace-cache':
app._cache.clear()
else:
app._cache = cache.close()
finally:
app._cache_lock_release()
......@@ -587,8 +587,8 @@ class Application(BaseApplication):
node.send(Packets.StartOperation(self.backup_tid))
uuid = node.getUUID()
assert uuid not in self.storage_starting_set
if uuid not in self.storage_ready_dict:
self.storage_starting_set.add(uuid)
assert uuid not in self.storage_ready_dict
self.storage_starting_set.add(uuid)
def setStorageReady(self, uuid):
self.storage_starting_set.remove(uuid)
......
......@@ -65,6 +65,7 @@ There is no conflict of node id between the 2 clusters:
class BackupApplication(object):
pt = None
server = None # like in BaseApplication
uuid = None
def __init__(self, app, name, master_addresses):
......
......@@ -781,11 +781,19 @@ class MySQLDatabaseManager(DatabaseManager):
if max_tid is not None:
sql += " AND tid <= %d" % max_tid
q = self.query
q("DELETE FROM trans" + sql)
if q("SELECT 1 FROM trans%s LIMIT 1" % sql):
q("DELETE FROM trans" + sql)
else:
logging.info("Nothing to truncate in trans for partition %s",
partition)
sql = " FROM obj" + sql
data_id_list = [x for x, in q(
"SELECT DISTINCT data_id%s AND data_id IS NOT NULL" % sql)]
q("DELETE" + sql)
if q("SELECT 1%s LIMIT 1" % sql):
q("DELETE" + sql)
else:
logging.info("Nothing to truncate in obj for partition %s",
partition)
self._pruneData(data_id_list)
def getTransaction(self, tid, all = False):
......
......@@ -34,7 +34,7 @@ class ClientOperationHandler(BaseHandler):
app = self.app
if app.operational:
# Even if in most cases, abortFor is called from both this method
# and BaseMasterHandler.notifyPartitionChanges (especially since
# and BaseMasterHandler.notifyNodeInformation (especially since
# storage nodes disconnect unknown clients on their own), these 2
# handlers also cover distinct scenarios, so neither of them is
# redundant:
......
......@@ -139,10 +139,11 @@ class TransactionManager(EventQueue):
def replicating(self, offset_list):
self._replicating.update(offset_list)
isdisjoint = set(offset_list).isdisjoint
assert isdisjoint(self._replicated), (offset_list, self._replicated)
assert isdisjoint(map(self.getPartition, self._store_lock_dict)), (
offset_list, self._store_lock_dict)
if __debug__:
isdisjoint = set(offset_list).isdisjoint
assert isdisjoint(self._replicated), (offset_list, self._replicated)
assert isdisjoint(map(self.getPartition, self._store_lock_dict)), (
offset_list, self._store_lock_dict)
p = Packets.AskUnfinishedTransactions(offset_list)
self._app.master_conn.ask(p, offset_list=offset_list)
......
......@@ -1084,8 +1084,7 @@ class NEOThreadedTest(NeoTestBase):
def run(self):
try:
apply(*self.__target)
self.__exc_info = None
self.__result = apply(*self.__target)
except:
self.__exc_info = sys.exc_info()
if self.__exc_info[0] is NEOThreadedTest.failureException:
......@@ -1093,10 +1092,13 @@ class NEOThreadedTest(NeoTestBase):
def join(self, timeout=None):
threading.Thread.join(self, timeout)
if not self.is_alive() and self.__exc_info:
etype, value, tb = self.__exc_info
del self.__exc_info
raise etype, value, tb
if not self.is_alive():
try:
return self.__result
except AttributeError:
etype, value, tb = self.__exc_info
del self.__exc_info
raise etype, value, tb
class newThread(newPausedThread):
......
This diff is collapsed.
......@@ -349,6 +349,22 @@ class ReplicationTests(NEOThreadedTest):
self.tic()
self.assertTrue(backup.master.is_alive())
@with_cluster(master_count=2)
def testBackupFromUpstreamWithSecondaryMaster(self, upstream):
"""
Check that the backup master reacts correctly when connecting first
to a secondary master of the upstream cluster.
"""
with NEOCluster(upstream=upstream) as backup:
primary = upstream.primary_master
# Select the single non-primary upstream master; the one-element
# unpacking fails unless exactly one such master exists.
m, = (m for m in upstream.master_list if m is not primary)
# Give the backup master only that secondary's address as upstream
# (presumably so that its first connection reaches a non-primary
# master -- confirm against resetNode).
backup.master.resetNode(upstream_masters=[m.server])
backup.start()
backup.neoctl.setClusterState(ClusterStates.STARTING_BACKUP)
self.tic()
# The backup cluster is expected to have reached BACKINGUP.
self.assertEqual(backup.neoctl.getClusterState(),
ClusterStates.BACKINGUP)
@backup_test()
def testCreationUndone(self, backup):
"""
......
......@@ -39,6 +39,14 @@ class BasicTests(ZODBTestCase, StorageTestBase, BasicStorage):
with Patch(threaded, TIC_LOOP=TIC_LOOP()):
super(BasicTests, self).check_checkCurrentSerialInTransaction()
# The test expects that both load & lastTransaction would be blocked
# as long as the tpc_finish callback has not finished, taking more
# than .1 second. ZODB 5.6.0 clarified that lastTransaction() can
# return immediately with the previous last TID rather than blocking
# until it is allowed to return the new last TID.
check_tid_ordering_w_commit = unittest.skip("ZODB PR #316")(
BasicStorage.check_tid_ordering_w_commit)
if __name__ == "__main__":
suite = unittest.makeSuite(BasicTests, 'check')
unittest.main(defaultTest='suite')
......