Refactor/combine _docweight/_doclen.

38ba01b6 · Tim Peters · 455af8ce · 38ba01b6 · 38ba01b6 · 38ba01b6
Commit 38ba01b6 authored May 17, 2002 by Tim Peters
4 changed files
--- a/lib/python/Products/ZCTextIndex/BaseIndex.py
+++ b/lib/python/Products/ZCTextIndex/BaseIndex.py
@@ -53,7 +53,7 @@ class BaseIndex(Persistent):

        # wid -> {docid -> weight}; t -> D -> w(D, t)
        # Different indexers have different notions of term weight, but we
-        # expect all indexers to use ._wordinfo to map wids to its notion
+        # expect each indexer to use ._wordinfo to map wids to its notion
        # of a docid-to-weight map.
        # There are two kinds of OOV words:  wid 0 is explicitly OOV,
        # and it's possible that the lexicon will return a non-zero wid
@@ -64,6 +64,12 @@ class BaseIndex(Persistent):
        # wid 0 must not be a key in _wordinfo.
        self._wordinfo = IOBTree()

+        # docid -> weight
+        # Different indexers have different notions of doc weight, but we
+        # expect each indexer to use ._docweight to map docids to its
+        # notion of what a doc weight is.
+        self._docweight = IIBTree()
+
        # docid -> WidCode'd list of wids
        # Used for un-indexing, and for phrase search.
        self._docwords = IOBTree()

--- a/lib/python/Products/ZCTextIndex/CosineIndex.py
+++ b/lib/python/Products/ZCTextIndex/CosineIndex.py
@@ -54,8 +54,8 @@ class CosineIndex(BaseIndex):
        # ._wordinfo for cosine is wid -> {docid -> weight};
        # t -> D -> w(D, t)/W(D)

+        # ._docweight for cosine is
        # docid -> W(docid)
-        self._docweight = IIBTree()

    # Most of the computation for computing a relevance score for the
    # document occurs in the search() method.  The code currently

--- a/lib/python/Products/ZCTextIndex/OkapiIndex.py
+++ b/lib/python/Products/ZCTextIndex/OkapiIndex.py
@@ -63,20 +63,20 @@ class OkapiIndex(BaseIndex):
        # ._wordinfo for Okapi is
        # wid -> {docid -> frequency}; t -> D -> f(D, t)

+        # ._docweight for Okapi is
        # docid -> # of words in the doc
        # This is just len(self._docwords[docid]), but _docwords is stored
        # in compressed form, so uncompressing it just to count the list
        # length would be ridiculously expensive.
-        self._doclen = IIBTree()

-        # sum(self._doclen.values()), the total # of words in all docs
+        # sum(self._docweight.values()), the total # of words in all docs
        # This is a long for "better safe than sorry" reasons.  It isn't
        # used often enough that speed should matter.
        self._totaldoclen = 0L

    def index_doc(self, docid, text):
        wids = self._lexicon.sourceToWordIds(text)
-        self._doclen[docid] = len(wids)
+        self._docweight[docid] = len(wids)
        self._totaldoclen += len(wids)

        wid2count = self._get_frequencies(wids)
@@ -92,8 +92,8 @@ class OkapiIndex(BaseIndex):

        del self._docwords[docid]

-        count = self._doclen[docid]
-        del self._doclen[docid]
+        count = self._docweight[docid]
+        del self._docweight[docid]
        self._totaldoclen -= count

    # The workhorse.  Return a list of (IIBucket, weight) pairs, one pair
@@ -105,7 +105,7 @@ class OkapiIndex(BaseIndex):
    def _search_wids(self, wids):
        if not wids:
            return []
-        N = float(len(self._doclen))  # total # of docs
+        N = float(len(self._docweight))  # total # of docs
        meandoclen = self._totaldoclen / N
        K1 = self.K1
        B = self.B
@@ -117,7 +117,7 @@ class OkapiIndex(BaseIndex):
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
-        docid2len = self._doclen
+        docid2len = self._docweight
        for t in wids:
            assert self._wordinfo.has_key(t)  # caller responsible for OOV
            d2f = self._wordinfo[t] # map {docid -> f(docid, t)}

--- a/lib/python/Products/ZCTextIndex/tests/testIndex.py
+++ b/lib/python/Products/ZCTextIndex/tests/testIndex.py
@@ -18,25 +18,11 @@ from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
 from Products.ZCTextIndex.CosineIndex import CosineIndex
 from Products.ZCTextIndex.OkapiIndex import OkapiIndex

-# The cosine and Okapi indices have the same public interfaces, but these
-# tests access internal attributes, and those aren't identical.
-# The IndexTest class is abstract, and subclasses must implement the
-# check_docid_known and num_docs_known methods.  CosineIndexTest (later in
-# this file) does those in terms of ._docweight, while OkapiIndexTest
-# (later in this file) does them in terms of ._doclen.
+# Subclasses must set a class variable IndexFactory to the appropriate
+# index object constructor.

 class IndexTest(TestCase):

-    # Subclasses must implement these methods, and set a class variable
-    # IndexFactory to the appropriate index object constructor.
-
-    def check_docid_known(self, DOCID):
-        raise NotImplementedError
-
-    def num_docs_known(self):
-        raise NotImplementedError
-
-
    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = self.IndexFactory(self.lexicon)
@@ -44,8 +30,8 @@ class IndexTest(TestCase):
    def test_index_document(self, DOCID=1):
        doc = "simple document contains five words"
        self.index.index_doc(DOCID, doc)
-        self.check_docid_known(DOCID)
-        self.assertEqual(self.num_docs_known(), 1)
+        self.assert_(self.index._docweight[DOCID])
+        self.assertEqual(len(self.index._docweight), 1)
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 5)
@@ -57,7 +43,7 @@ class IndexTest(TestCase):
        DOCID = 1
        self.test_index_document(DOCID)
        self.index.unindex_doc(DOCID)
-        self.assertEqual(self.num_docs_known(), 0)
+        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)

@@ -66,8 +52,8 @@ class IndexTest(TestCase):
        doc = "another document just four"
        DOCID = 2
        self.index.index_doc(DOCID, doc)
-        self.check_docid_known(DOCID)
-        self.assertEqual(self.num_docs_known(), 2)
+        self.assert_(self.index._docweight[DOCID])
+        self.assertEqual(len(self.index._docweight), 2)
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
@@ -87,8 +73,8 @@ class IndexTest(TestCase):
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        DOCID = 2
-        self.assertEqual(self.num_docs_known(), 1)
-        self.check_docid_known(DOCID)
+        self.assertEqual(len(self.index._docweight), 1)
+        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 4)
@@ -99,7 +85,7 @@ class IndexTest(TestCase):
    def test_index_duplicated_words(self, DOCID=1):
        doc = "very simple repeat repeat repeat document test"
        self.index.index_doc(DOCID, doc)
-        self.check_docid_known(DOCID)
+        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index.get_words(DOCID)), 7)
@@ -144,23 +130,9 @@ class IndexTest(TestCase):
 class CosineIndexTest(IndexTest):
    IndexFactory = CosineIndex

-    def check_docid_known(self, docid):
-        self.assert_(self.index._docweight.has_key(docid))
-        self.assert_(self.index._docweight[docid] > 0)
-
-    def num_docs_known(self):
-        return len(self.index._docweight)
-
 class OkapiIndexTest(IndexTest):
    IndexFactory = OkapiIndex

-    def check_docid_known(self, docid):
-        self.assert_(self.index._doclen.has_key(docid))
-        self.assert_(self.index._doclen[docid] > 0)
-
-    def num_docs_known(self):
-        return len(self.index._doclen)
-
 def test_suite():
    return TestSuite((makeSuite(CosineIndexTest),
                      makeSuite(OkapiIndexTest),