importer: speed up txn lookup when migrating from FileStorage

This speeds up 2 operations that were horribly slow (linear scan of the source database, from the beginning or from the end): - read access to not yet migrated data - resumption (restart of a storage node while the import was not finished). Testing will tell if this patch is enough, or if more efficient solutions are required, like: - building a preliminary {tid->file_pos} index while the source DB is still in use (in this case, NEO may have to deal with a small gap at the end) - disabling ZODB features requiring data_serial (undo)

importer: speed up txn lookup when migrating from FileStorage
This speeds up 2 operations that were horribly slow (linear scan of the source database, from the beginning or from the end): - read access to not yet migrated data - resumption (restart of a storage node while the import was not finished). Testing will tell if this patch is enough, or if more efficient solutions are required, like: - building a preliminary {tid->file_pos} index while the source DB is still in use (in this case, NEO may have to deal with a small gap at the end) - disabling ZODB features requiring data_serial (undo)
cd33de9f · Julien Muchembled · e582696c · cd33de9f · cd33de9f
Commit cd33de9f authored Apr 24, 2015 by Julien Muchembled
Hide whitespace changes
Inline Side-by-side

Showing with 74 additions and 4 deletions

neo/lib/patch.py neo/lib/patch.py +67 -0

neo/storage/database/importer.py neo/storage/database/importer.py +7 -4

No files found.
--- a/neo/lib/patch.py
+++ b/neo/lib/patch.py
+#
+# Copyright (C) 2015 Nexedi SA
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+def speedupFileStorageTxnLookup():
+    """Speed up lookup of start position when instantiating an iterator
+
+    FileStorage does not index the file positions of transactions.
+    With this patch, we use the existing {oid->file_pos} index to bisect
+    the closest file position to start iterating.
+
+    Calling this function replaces FileStorage.iterator for the whole
+    process. The sorted list of file positions is built the first time the
+    patched iterator is called with a true 'start', and it is then cached
+    on the storage instance as '_tidindex': this code never rebuilds it.
+    """
+    from array import array
+    from bisect import bisect
+    from collections import defaultdict
+    from ZODB.FileStorage.FileStorage import FileStorage, FileIterator
+
+    # 'I' is only guaranteed to be 2 bytes wide: fall back to 'L' when it
+    # can not hold a 32-bit word.
+    typecode = 'L' if array('I').itemsize < 4 else 'I'
+
+    class Start(object):
+        """Key given to bisect() to search an array of low order words
+
+        bisect() compares this key to the items with '<'. Each item is the
+        low order 32-bit word of a file position, so the comparison is
+        actually done between the searched tid and the tid read from the
+        data header at that position.
+        """
+
+        def __init__(self, read_data_header, h, tid):
+            self.read_data_header = read_data_header
+            # High order word, shifted once for all comparisons.
+            self.h = h << 32
+            self.tid = tid
+
+        def __lt__(self, l):
+            return self.tid < self.read_data_header(self.h | l).tid
+
+    def iterator(self, start=None, stop=None):
+        """Replacement for FileStorage.iterator
+
+        When 'start' is true, the file position given to FileIterator is
+        looked up via the oid index. Otherwise, or if no suitable position
+        is found, FileIterator is created without explicit position.
+        """
+        if start:
+            try:
+                index = self._tidindex
+            except AttributeError:
+                # Cache a sorted list of all the file pos from oid index.
+                # To reduce memory usage, the list is split in arrays of
+                # low order 32-bit words.
+                tindex = defaultdict(lambda: array(typecode))
+                for pos in self._index.itervalues():
+                    tindex[pos >> 32].append(pos & 0xffffffff)
+                index = self._tidindex = []
+                for high, low_words in sorted(tindex.iteritems()):
+                    # Use the same typecode as above: a hard-coded 'I' may
+                    # be too narrow to hold 32-bit words.
+                    low_words = array(typecode, sorted(low_words))
+                    header = self._read_data_header(high << 32 | low_words[0])
+                    # Each entry is keyed by the tid of its lowest position.
+                    index.append((header.tid, high, low_words))
+            # (start,) sorts before any (start, high, low_words) entry, so
+            # this selects the last array whose first tid is lower than
+            # 'start'.
+            i = bisect(index, (start,)) - 1
+            if i >= 0:
+                _, high, low_words = index[i]
+                read_data_header = self._read_data_header
+                # Within this array, select the last position whose tid is
+                # lower than or equal to 'start'. There is at least one,
+                # since the first tid of the array is lower than 'start'.
+                i = bisect(low_words, Start(read_data_header, high, start)) - 1
+                header = read_data_header(high << 32 | low_words[i])
+                # Iterate from the 'tloc' of that data header (presumably
+                # the position of the transaction containing the record).
+                return FileIterator(self._file_name, start, stop, header.tloc)
+        return FileIterator(self._file_name, start, stop)
+
+    FileStorage.iterator = iterator
--- a/neo/storage/database/importer.py
+++ b/neo/storage/database/importer.py
@@ -23,10 +23,11 @@ from ZODB.config import storageFromString
 from ZODB.POSException import POSKeyError

 from . import buildDatabaseManager, DatabaseManager
-from neo.lib import logging, util
+from neo.lib import logging, patch, util
 from neo.lib.exception import DatabaseFailure
 from neo.lib.protocol import CellStates, ZERO_OID, ZERO_TID, ZERO_HASH, MAX_TID

+patch.speedupFileStorageTxnLookup()

 class Reference(object):

@@ -239,7 +240,7 @@ class ZODB(object):

    def getDataTid(self, oid, tid):
        try:
-            return self.data_tid[tid][oid]
+            return self.data_tid[tid].get(oid)
        except KeyError:
            assert tid not in self.data_tid, (oid, tid)
            p_tid = util.p64(tid)
@@ -247,8 +248,10 @@ class ZODB(object):
            if txn.tid != p_tid:
                raise
        u64 = util.u64
-        txn = self.data_tid[tid] = {u64(x.oid): x.data_txn for x in txn}
-        return txn[oid]
+        txn = self.data_tid[tid] = {
+            u64(x.oid): x.data_txn
+            for x in txn if x.data_txn}
+        return txn.get(oid)


 class ZODBIterator(object):