Sort cell list after randomising it.

There are 2 objectives: - Prevent randomly trying to connect to an unresponsive storage node, which impairs performance a lot. Note that this happens only when the master didn't notice the disconnection, so the node is still in running state in the node manager. - Increase connection reuse, saving the cost of establishing a new connection and a slot in the connection pool. Randomisation is kept to even out storage node use: the list is still shuffled first and, because the sort is stable, cells with the same sort key stay in their shuffled order. git-svn-id: https://svn.erp5.org/repos/neo/trunk@2173 71dcc9de-d417-0410-9af5-da40c76e7ee4

Sort cell list after randomising it.
There are 2 objectives: - Prevent randomly trying to connect to an unresponsive storage node, which impairs performance a lot. Note that this happens only when the master didn't notice the disconnection, so the node is still in running state in the node manager. - Increase connection reuse, saving the cost of establishing a new connection and a slot in the connection pool. Randomisation is kept to even out storage node use: the list is still shuffled first and, because the sort is stable, cells with the same sort key stay in their shuffled order. git-svn-id: https://svn.erp5.org/repos/neo/trunk@2173 71dcc9de-d417-0410-9af5-da40c76e7ee4
6d330027 · Vincent Pelletier · a29756a3 · 6d330027 · 6d330027 · 6d330027
Commit 6d330027 authored Jun 17, 2010 by Vincent Pelletier
4 changed files
--- a/neo/client/app.py
+++ b/neo/client/app.py
@@ -447,6 +447,7 @@ class Application(object):
            raise NEOStorageNotFoundError()

        shuffle(cell_list)
+        cell_list.sort(key=self.cp.getCellSortKey)
        self.local_var.asked_object = 0
        for cell in cell_list:
            logging.debug('trying to load %s from %s',
@@ -859,6 +860,7 @@ class Application(object):
        assert len(cell_list), 'No cell found for transaction %s' % (
            dump(undone_tid), )
        shuffle(cell_list)
+        cell_list.sort(key=self.cp.getCellSortKey)
        for cell in cell_list:
            conn = self.cp.getConnForCell(cell)
            if conn is None:
@@ -976,6 +978,7 @@ class Application(object):
        for tid in ordered_tids:
            cell_list = self._getCellListForTID(tid, readable=True)
            shuffle(cell_list)
+            cell_list.sort(key=self.cp.getCellSortKey)
            for cell in cell_list:
                conn = self.cp.getConnForCell(cell)
                if conn is not None:
@@ -1021,7 +1024,7 @@ class Application(object):
        # Get history informations for object first
        cell_list = self._getCellListForOID(oid, readable=True)
        shuffle(cell_list)
-
+        cell_list.sort(key=self.cp.getCellSortKey)
        for cell in cell_list:
            conn = self.cp.getConnForCell(cell)
            if conn is None:
@@ -1060,7 +1063,7 @@ class Application(object):
        for serial, size in self.local_var.history[1]:
            self._getCellListForTID(serial, readable=True)
            shuffle(cell_list)
-
+            cell_list.sort(key=self.cp.getCellSortKey)
            for cell in cell_list:
                conn = self.cp.getConnForCell(cell)
                if conn is None:

--- a/neo/client/pool.py
+++ b/neo/client/pool.py
@@ -22,6 +22,19 @@ from neo.protocol import NodeTypes, Packets
 from neo.connection import MTClientConnection
 from neo.client.exception import ConnectionClosed
 from neo.profiling import profiler_decorator
+import time
+
+# How long before we might retry a connection to a node to which connection
+# failed in the past.
+MAX_FAILURE_AGE = 600
+
+# Cell list sort keys
+#   We are connected to storage node hosting cell, high priority
+CELL_CONNECTED = -1
+#   normal priority
+CELL_GOOD = 0
+#   Storage node hosting cell failed recently, low priority
+CELL_FAILED = 1

 class ConnectionPool(object):
    """This class manages a pool of connections to storage nodes."""
@@ -36,6 +49,7 @@ class ConnectionPool(object):
        l = RLock()
        self.connection_lock_acquire = l.acquire
        self.connection_lock_release = l.release
+        self.node_failure_dict = {}

    @profiler_decorator
    def _initNodeConnection(self, node):
@@ -59,6 +73,7 @@ class ConnectionPool(object):
                if conn.getConnector() is None:
                    # This happens, if a connection could not be established.
                    logging.error('Connection to %r failed', node)
+                    self.notifyFailure(node)
                    return None

                p = Packets.RequestIdentification(NodeTypes.CLIENT,
@@ -72,6 +87,7 @@ class ConnectionPool(object):
                        handler=app.storage_bootstrap_handler)
            except ConnectionClosed:
                logging.error('Connection to %r failed', node)
+                self.notifyFailure(node)
                return None

            if app.isNodeReady():
@@ -79,6 +95,7 @@ class ConnectionPool(object):
                return conn
            else:
                logging.info('%r not ready', node)
+                self.notifyFailure(node)
                return None

    @profiler_decorator
@@ -111,6 +128,28 @@ class ConnectionPool(object):
            self.connection_dict[node.getUUID()] = conn
        return conn

+    @profiler_decorator
+    def notifyFailure(self, node):
+        self._notifyFailure(node.getUUID(), time.time() + MAX_FAILURE_AGE)
+
+    def _notifyFailure(self, uuid, at):
+        self.node_failure_dict[uuid] = at
+
+    @profiler_decorator
+    def getCellSortKey(self, cell):
+        return self._getCellSortKey(cell.getUUID(), time.time())
+
+    def _getCellSortKey(self, uuid, now):
+        if uuid in self.connection_dict:
+            result = CELL_CONNECTED
+        else:
+            failure = self.node_failure_dict.get(uuid)
+            if failure is None or failure < now:
+                result = CELL_GOOD
+            else:
+                result = CELL_FAILED
+        return result
+
    @profiler_decorator
    def getConnForCell(self, cell):
        return self.getConnForNode(cell.getNode())

--- a/neo/tests/client/testClientApp.py
+++ b/neo/tests/client/testClientApp.py
@@ -195,21 +195,21 @@ class ClientApplicationTests(NeoTestBase):
        tid = self.makeTID()
        # cache cleared
        self.assertTrue(oid not in mq)
-        app.pt = Mock({ 'getCellListForOID': (), })
+        app.pt = Mock({ 'getCellListForOID': [], })
        app.local_var.history = (oid, [(tid, 0)])
        # If object len is 0, this object doesn't exist anymore because its
        # creation has been undone.
        self.assertRaises(KeyError, app.getSerial, oid)
        self.assertEquals(len(app.pt.mockGetNamedCalls('getCellListForOID')), 1)
        # Otherwise, result from ZODB
-        app.pt = Mock({ 'getCellListForOID': (), })
+        app.pt = Mock({ 'getCellListForOID': [], })
        app.local_var.history = (oid, [(tid, 1)])
        self.assertEquals(app.getSerial(oid), tid)
        self.assertEquals(len(app.pt.mockGetNamedCalls('getCellListForOID')), 1)
        # fill the cache -> hit
        mq.store(oid, (tid, ' '))
        self.assertTrue(oid in mq)
-        app.pt = Mock({ 'getCellListForOID': (), })
+        app.pt = Mock({ 'getCellListForOID': [], })
        app.getSerial(oid)
        self.assertEquals(app.getSerial(oid), tid)
        self.assertEquals(len(app.pt.mockGetNamedCalls('getCellListForOID')), 0)
@@ -231,7 +231,7 @@ class ClientApplicationTests(NeoTestBase):
                     'fakeReceived': packet,
                     })
        app.local_var.queue = Mock({'get' : (conn, None)})
-        app.pt = Mock({ 'getCellListForOID': (cell, ), })
+        app.pt = Mock({ 'getCellListForOID': [cell, ], })
        app.cp = Mock({ 'getConnForCell' : conn})
        app.local_var.asked_object = -1
        Application._waitMessage = self._waitMessage
@@ -247,7 +247,7 @@ class ClientApplicationTests(NeoTestBase):
            'getAddress': ('127.0.0.1', 0),
            'fakeReceived': packet,
        })
-        app.pt = Mock({ 'getCellListForOID': (cell, ), })
+        app.pt = Mock({ 'getCellListForOID': [cell, ], })
        app.cp = Mock({ 'getConnForCell' : conn})
        app.local_var.asked_object = -1
        self.assertRaises(NEOStorageNotFoundError, app.load, oid)
@@ -289,7 +289,7 @@ class ClientApplicationTests(NeoTestBase):
            'getAddress': ('127.0.0.1', 0),
            'fakeReceived': packet,
        })
-        app.pt = Mock({ 'getCellListForOID': (cell, ), })
+        app.pt = Mock({ 'getCellListForOID': [cell, ], })
        app.cp = Mock({ 'getConnForCell' : conn})
        app.local_var.asked_object = -1
        self.assertRaises(NEOStorageNotFoundError, app.loadSerial, oid, tid2)
@@ -329,7 +329,7 @@ class ClientApplicationTests(NeoTestBase):
            'getAddress': ('127.0.0.1', 0),
            'fakeReceived': packet,
        })
-        app.pt = Mock({ 'getCellListForOID': (cell, ), })
+        app.pt = Mock({ 'getCellListForOID': [cell, ], })
        app.cp = Mock({ 'getConnForCell' : conn})
        app.local_var.asked_object = -1
        self.assertRaises(NEOStorageNotFoundError, app.loadBefore, oid, tid2)
@@ -772,8 +772,8 @@ class ClientApplicationTests(NeoTestBase):
            'getState': 'FakeState',
        })
        app.pt = Mock({
-            'getCellListForTID': (cell, ),
-            'getCellListForOID': (cell, ),
+            'getCellListForTID': [cell, ],
+            'getCellListForOID': [cell, ],
        })
        app.cp = Mock({'getConnForCell': conn, 'getConnForNode': conn})
        def tryToResolveConflict(oid, conflict_serial, serial, data,

--- a/neo/tests/client/testConnectionPool.py
+++ b/neo/tests/client/testConnectionPool.py
@@ -48,6 +48,26 @@ class ConnectionPoolTests(NeoTestBase):

    # TODO: test getConnForNode (requires splitting complex functionalities)

+    def test_CellSortKey(self):
+        pool = ConnectionPool(None)
+        node_uuid_1 = self.getNewUUID()
+        node_uuid_2 = self.getNewUUID()
+        node_uuid_3 = self.getNewUUID()
+        # We are connected to node 1
+        pool.connection_dict[node_uuid_1] = None
+        # A connection to node 3 failed, will be forgotten at 5
+        pool._notifyFailure(node_uuid_3, 5)
+        getCellSortKey = pool._getCellSortKey
+
+        # At 0, key values are not ambiguous
+        self.assertTrue(getCellSortKey(node_uuid_1, 0) < getCellSortKey(
+            node_uuid_2, 0) < getCellSortKey(node_uuid_3, 0))
+        # At 10, nodes 2 and 3 have the same key value
+        self.assertTrue(getCellSortKey(node_uuid_1, 10) < getCellSortKey(
+            node_uuid_2, 10))
+        self.assertEqual(getCellSortKey(node_uuid_2, 10), getCellSortKey(
+            node_uuid_3, 10))
+
 if __name__ == '__main__':
    unittest.main()