Reindex docs, touching as few docid->w(docid, score) maps as possible.

86fc53ee · Tim Peters · bad257b8 · 86fc53ee · 86fc53ee · 86fc53ee
Commit 86fc53ee authored May 17, 2002 by Tim Peters
3 changed files
--- a/lib/python/Products/ZCTextIndex/BaseIndex.py
+++ b/lib/python/Products/ZCTextIndex/BaseIndex.py
@@ -19,6 +19,7 @@ import math
 from BTrees.IOBTree import IOBTree
 from BTrees.IIBTree import IIBTree, IIBucket, IITreeSet
+from BTrees.IIBTree import intersection, difference
 from Products.ZCTextIndex.IIndex import IIndex
 from Products.ZCTextIndex import WidCode
@@ -91,8 +92,7 @@ class BaseIndex(Persistent):
    # A subclass may wish to extend or override this.
    def index_doc(self, docid, text):
        if self._docwords.has_key(docid):
-            # XXX Do something smarter than this.
+            return self._reindex_doc(docid, text)
-            self.unindex_doc(docid)
        wids = self._lexicon.sourceToWordIds(text)
        wid2weight, docweight = self._get_frequencies(wids)
        for wid, weight in wid2weight.items():
@@ -101,6 +101,45 @@ class BaseIndex(Persistent):
        self._docwords[docid] = WidCode.encode(wids)
        return len(wids)
+    # A subclass may wish to extend or override this.  This is for adjusting
+    # to a new version of a doc that already exists.  The goal is to be
+    # faster than simply unindexing the old version in its entirety and then
+    # adding the new version in its entirety.
+    def _reindex_doc(self, docid, text):
+        # Touch as few docid->w(docid, score) maps in ._wordinfo as possible.
+        old_wids = self.get_words(docid)
+        old_wid2w, old_docw = self._get_frequencies(old_wids)
+        new_wids = self._lexicon.sourceToWordIds(text)
+        new_wid2w, new_docw = self._get_frequencies(new_wids)
+        old_widset = IITreeSet(old_wid2w.keys())
+        new_widset = IITreeSet(new_wid2w.keys())
+        in_both_widset = intersection(old_widset, new_widset)
+        only_old_widset = difference(old_widset, in_both_widset)
+        only_new_widset = difference(new_widset, in_both_widset)
+        del old_widset, new_widset
+        for wid in only_old_widset.keys():
+            self._del_wordinfo(wid, docid)
+        for wid in only_new_widset.keys():
+            self._add_wordinfo(wid, new_wid2w[wid], docid)
+        for wid in in_both_widset.keys():
+            # For the Okapi indexer, the "if" will trigger only for words
+            # whose counts have changed.  For the cosine indexer, the "if"
+            # may trigger for every wid, since W(d) probably changed and
+            # W(d) is divided into every score.
+            newscore = new_wid2w[wid]
+            if old_wid2w[wid] != newscore:
+                self._add_wordinfo(wid, newscore, docid)
+        self._docweight[docid] = new_docw
+        self._docwords[docid] = WidCode.encode(new_wids)
+        return len(new_wids)
    # Subclass must override.
    def _get_frequencies(self, wids):
        # Compute term frequencies and a doc weight, whatever those mean

--- a/lib/python/Products/ZCTextIndex/OkapiIndex.py
+++ b/lib/python/Products/ZCTextIndex/OkapiIndex.py
@@ -54,6 +54,11 @@ class OkapiIndex(BaseIndex):
    def index_doc(self, docid, text):
        count = BaseIndex.index_doc(self, docid, text)
        self._totaldoclen += count
+        return count
+    def _reindex_doc(self, docid, text):
+        self._totaldoclen -= self._docweight[docid]
+        return BaseIndex._reindex_doc(self, docid, text)
    def unindex_doc(self, docid):
        self._totaldoclen -= self._docweight[docid]

--- a/lib/python/Products/ZCTextIndex/tests/testZCTextIndex.py
+++ b/lib/python/Products/ZCTextIndex/tests/testZCTextIndex.py
@@ -142,18 +142,29 @@ class CosineIndexTests(ZCIndexTestsBase, testIndex.CosineIndexTest):
    def testRanking(self):
        self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
                      "pease", "porridge", "pot"]
+        self.docs = ["Pease porridge hot, pease porridge cold,",
+                     "Pease porridge in the pot,",
+                     "Nine days old.",
+                     "In the pot cold, in the pot hot,",
+                     "Pease porridge, pease porridge,",
+                     "Eat the lot."]
        self._ranking_index()
        self._ranking_tf()
        self._ranking_idf()
        self._ranking_queries()
+        # A digression to exercise re-indexing.  The last variant restores
+        # the original text, so things should end up exactly as they were.
+        docs = self.docs
+        for variant in ("hot cold porridge python", "pease hot pithy ",
+                        docs[-1]):
+            self.zc_index.index_object(len(docs), Indexable(variant))
+        self._ranking_tf()
+        self._ranking_idf()
+        self._ranking_queries()
    def _ranking_index(self):
-        docs = ["Pease porridge hot, pease porridge cold,",
+        docs = self.docs
-                "Pease porridge in the pot,",
-                "Nine days old.",
-                "In the pot cold, in the pot hot,",
-                "Pease porridge, pease porridge,",
-                "Eat the lot."]
        for i in range(len(docs)):
            self.zc_index.index_object(i + 1, Indexable(docs[i]))
@@ -220,6 +231,12 @@ class OkapiIndexTests(ZCIndexTestsBase, testIndex.OkapiIndexTest):
                "one two three"]
        for i in range(len(docs)):
            self.zc_index.index_object(i + 1, Indexable(docs[i]))
+        # A brief digression to exercise re-indexing.  The last variant restores
+        # the original text, so things should end up exactly as they were.
+        for variant in "one xyz", "xyz two three", "abc def", docs[-1]:
+            self.zc_index.index_object(len(docs), Indexable(variant))
        self.assertEqual(self.index._totaldoclen, 6)
        # So the mean doc length is 2.  We use that later.