Commit c05e65ac by Julien Muchembled

Prevent nodes from reconnecting too fast

This happened between storage nodes of different clusters, because they are
not informed about each other's state, e.g. about a dead upstream storage
node.

In any case, the logs were flooded and CPU usage stayed at 100%.
1 parent c321971f
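Editor's note: in outline, the fix below serializes connection-failure handling
behind one module-level timestamp (connect_limit), so that across all
connections a failed connection attempt is fully processed at most once per
second; anything faster is deferred. A minimal sketch of that idea, independent
of NEO's event loop (names here are hypothetical, not NEO's API):

    import time

    connect_limit = 0  # wall-clock time before which failures are deferred

    def on_connection_failure(close):
        # close() stands in for Connection._closure(); returns True if the
        # failure was processed now, False if the caller must retry later.
        global connect_limit
        t = time.time()
        if t < connect_limit:
            return False            # too soon after the previous failure
        connect_limit = t + 1       # reserve a 1-second quiet window
        close()
        return True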
@@ -28,6 +28,8 @@ from .util import ReadBuffer
 CRITICAL_TIMEOUT = 30
 
+connect_limit = 0
+
 class ConnectionClosed(Exception):
     pass
@@ -548,6 +550,18 @@ class Connection(BaseConnection):
             self._handlers.handle(self, self._queue.pop(0))
         self.close()
 
+    def _delayed_closure(self):
+        # Wait at least 1 second between connection failures.
+        global connect_limit
+        t = time()
+        if t < connect_limit:
+            self.checkTimeout = lambda t: t < connect_limit or \
+                self._delayed_closure()
+            self.readable = self.writable = lambda: None
+        else:
+            connect_limit = t + 1
+            self._closure()
+
     def _recv(self):
         """Receive data from a connector."""
         try:
@@ -556,7 +570,7 @@ class Connection(BaseConnection):
             pass
         except ConnectorConnectionRefusedException:
             assert self.connecting
-            self._closure()
+            self._delayed_closure()
         except ConnectorConnectionClosedException:
             # connection resetted by peer, according to the man, this error
             # should not occurs but it seems it's false
@@ -671,7 +685,7 @@ class ClientConnection(Connection):
             else:
                 self._connectionCompleted()
         except ConnectorConnectionRefusedException:
-            self._closure()
+            self._delayed_closure()
         except ConnectorException:
             # unhandled connector exception
             self._closure()
@@ -680,7 +694,7 @@ class ClientConnection(Connection):
     def writable(self):
         """Called when self is writable."""
         if self.connector.getError():
-            self._closure()
+            self._delayed_closure()
         else:
             self._connectionCompleted()
             self.writable()
......
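Editor's note: the throttled branch of _delayed_closure works by shadowing
methods on the instance. checkTimeout is replaced by a lambda that is truthy
while time() is still below connect_limit (so the event loop leaves the
connection alone) and re-runs _delayed_closure once the window has passed,
while readable and writable become no-ops so the dead socket can no longer
spin the poll loop at 100% CPU. A small self-contained sketch of that
instance-shadowing idiom (hypothetical names):

    class Conn(object):
        def checkTimeout(self, t):
            pass  # normal periodic processing

    fired = []
    deadline = 10.0
    conn = Conn()
    # Shadow the class method on this one instance: while t < deadline the
    # lambda short-circuits to True and nothing happens; afterwards the
    # deferred action runs instead.
    conn.checkTimeout = lambda t: t < deadline or fired.append(t)
    conn.readable = conn.writable = lambda: None  # silence I/O callbacks

    conn.checkTimeout(9.0)    # still muted
    conn.checkTimeout(11.0)   # deadline passed: deferred action fires
    assert fired == [11.0]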
@@ -190,12 +190,12 @@ class Application(object):
             while (self.unconnected_master_node_set or
                     self.negotiating_master_node_set):
                 for addr in self.unconnected_master_node_set:
+                    self.negotiating_master_node_set.add(addr)
                     ClientConnection(self.em, client_handler,
                         # XXX: Ugly, but the whole election code will be
                         #      replaced soon
                         node=getByAddress(addr),
                         connector=self.connector_handler())
-                    self.negotiating_master_node_set.add(addr)
                 self.unconnected_master_node_set.clear()
                 self.em.poll(1)
         except ElectionFailure, m:
......
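Editor's note: moving the add() above the ClientConnection construction
matters because, on an immediate connection refusal, the failure handler can
run synchronously inside the constructor and discard the address from
negotiating_master_node_set; if the address is only added afterwards, the
discard is a no-op and the stale entry keeps the election loop waiting
forever. A reduced sketch of the two orderings (stand-in names, assuming a
synchronous failure path):

    negotiating = set()

    def connection_failed(addr):
        # what ClientElectionHandler.connectionFailed does to the set
        negotiating.discard(addr)

    def make_connection(addr):
        # stand-in for ClientConnection(): a refused connection may invoke
        # the failure handler before the constructor even returns
        connection_failed(addr)

    addr = ('127.0.0.1', 10000)

    # old order: discard() runs before add(), leaving a stale entry behind
    make_connection(addr)
    negotiating.add(addr)
    assert addr in negotiating        # election loop would never terminate

    # new order: the entry exists by the time the failure discards it
    negotiating.clear()
    negotiating.add(addr)
    make_connection(addr)
    assert addr not in negotiating    # correctly cleaned up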
@@ -57,9 +57,7 @@ class ClientElectionHandler(BaseElectionHandler):
         addr = conn.getAddress()
         node = self.app.nm.getByAddress(addr)
         assert node is not None, (uuid_str(self.app.uuid), addr)
-        assert node.isUnknown(), (uuid_str(self.app.uuid), node.whoSetState(),
-            node)
-        # connection never success, node is still in unknown state
+        # node may still be in unknown state
         self.app.negotiating_master_node_set.discard(addr)
         super(ClientElectionHandler, self).connectionFailed(conn)
......
@@ -17,6 +17,7 @@
 import unittest
 from time import time
 from mock import Mock
+from neo.lib import connection
 from neo.lib.connection import ListeningConnection, Connection, \
     ClientConnection, ServerConnection, MTClientConnection, \
     HandlerSwitcher, CRITICAL_TIMEOUT
@@ -37,6 +38,7 @@ class ConnectionTests(NeoUnitTestBase):
         self.handler = Mock({'__repr__': 'Fake Handler'})
         self.address = ("127.0.0.7", 93413)
         self.node = Mock({'getAddress': self.address})
+        connection.connect_limit = 0
 
     def _makeListeningConnection(self, addr):
         # create instance after monkey patches
......
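Editor's note: resetting connection.connect_limit in setUp matters because it
is module-level state shared by all Connection instances; a test that trips
the 1-second throttle would otherwise leave failures muted for the tests that
run right after it. A tiny illustration with a stand-in for the real module:

    import time

    class fake_connection_module(object):
        connect_limit = 0   # mirrors neo.lib.connection.connect_limit

    # one test trips the throttle...
    fake_connection_module.connect_limit = time.time() + 1
    assert time.time() < fake_connection_module.connect_limit

    # ...so the next test must start from a clean slate, as setUp now does:
    fake_connection_module.connect_limit = 0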
@@ -21,11 +21,12 @@ from collections import defaultdict
 from functools import wraps
 from neo.lib import logging
 from neo.storage.checker import CHECK_COUNT
+from neo.lib.connection import ClientConnection
 from neo.lib.protocol import CellStates, ClusterStates, Packets, \
     ZERO_OID, ZERO_TID, MAX_TID, uuid_str
 from neo.lib.util import p64
 from . import ConnectionFilter, NEOCluster, NEOThreadedTest, Patch, \
-    predictable_random
+    predictable_random, Serialized
 
 def backup_test(partitions=1, upstream_kw={}, backup_kw={}):
@@ -262,6 +263,31 @@ class ReplicationTests(NEOThreadedTest):
         self.assertFalse(new_conn is conn)
 
+    @backup_test()
+    def testBackupUpstreamStorageDead(self, backup):
+        upstream = backup.upstream
+        with ConnectionFilter() as f:
+            f.add(lambda conn, packet:
+                isinstance(packet, Packets.InvalidateObjects))
+            upstream.importZODB()(1)
+        upstream.client.setPoll(0)
+        count = [0]
+        def __init__(orig, *args, **kw):
+            count[0] += 1
+            orig(*args, **kw)
+        p = Patch(ClientConnection, __init__=__init__)
+        try:
+            upstream.storage.listening_conn.close()
+            Serialized.tic(); self.assertEqual(count[0], 0)
+            Serialized.tic(); count[0] or Serialized.tic()
+            Serialized.tic(); self.assertEqual(count[0], 2)
+            Serialized.tic(); self.assertEqual(count[0], 2)
+            time.sleep(1.1)
+            Serialized.tic(); self.assertEqual(count[0], 3)
+            Serialized.tic(); self.assertEqual(count[0], 3)
+        finally:
+            del p
+
     @backup_test()
     def testBackupDelayedUnlockTransaction(self, backup):
         """
         Check that a backup storage node is put on hold by upstream if
......
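Editor's note: in the new test, ConnectionFilter holds back InvalidateObjects
packets so the backup cluster lags behind, the Patch on
ClientConnection.__init__ counts reconnection attempts, and closing the
upstream storage's listening connection forces the backup storage to
reconnect. The assertions then check that only two attempts happen
immediately, that further tics within the same second are throttled by
connect_limit, and that exactly one more attempt goes through after sleeping
past the 1-second window. The counting idiom itself can be reproduced without
NEO's Patch helper; a minimal sketch:

    import functools

    def counted(func, counter):
        # Wrap func so each call bumps counter[0] before delegating.
        @functools.wraps(func)
        def wrapper(*args, **kw):
            counter[0] += 1
            return func(*args, **kw)
        return wrapper

    class Client(object):
        def __init__(self):
            pass

    calls = [0]
    Client.__init__ = counted(Client.__init__, calls)
    Client(); Client()
    assert calls[0] == 2    # two construction attempts observed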