Commit 38ba01b6 authored by Tim Peters

Refactor/combine _docweight/_doclen.

parent 455af8ce
......@@ -53,7 +53,7 @@ class BaseIndex(Persistent):
# wid -> {docid -> weight}; t -> D -> w(D, t)
# Different indexers have different notions of term weight, but we
# expect all indexers to use ._wordinfo to map wids to its notion
# expect each indexer to use ._wordinfo to map wids to its notion
# of a docid-to-weight map.
# There are two kinds of OOV words: wid 0 is explicitly OOV,
# and it's possible that the lexicon will return a non-zero wid
......@@ -64,6 +64,12 @@ class BaseIndex(Persistent):
# wid 0 must not be a key in _wordinfo.
self._wordinfo = IOBTree()
# docid -> weight
# Different indexers have different notions of doc weight, but we
# expect each indexer to use ._docweight to map docids to its
# notion of what a doc weight is.
self._docweight = IIBTree()
# docid -> WidCode'd list of wids
# Used for un-indexing, and for phrase search.
self._docwords = IOBTree()
......
......@@ -54,8 +54,8 @@ class CosineIndex(BaseIndex):
# ._wordinfo for cosine is wid -> {docid -> weight};
# t -> D -> w(d, t)/W(d)
# ._docweight for cosine is
# docid -> W(docid)
self._docweight = IIBTree()
# Most of the computation for computing a relevance score for the
# document occurs in the search() method. The code currently
......
......@@ -63,20 +63,20 @@ class OkapiIndex(BaseIndex):
# ._wordinfo for Okapi is
# wid -> {docid -> frequency}; t -> D -> f(D, t)
# ._docweight for Okapi is
# docid -> # of words in the doc
# This is just len(self._docwords[docid]), but _docwords is stored
# in compressed form, so uncompressing it just to count the list
# length would be ridiculously expensive.
self._doclen = IIBTree()
# sum(self._doclen.values()), the total # of words in all docs
# sum(self._docweight.values()), the total # of words in all docs
# This is a long for "better safe than sorry" reasons. It isn't
# used often enough that speed should matter.
self._totaldoclen = 0L
def index_doc(self, docid, text):
wids = self._lexicon.sourceToWordIds(text)
self._doclen[docid] = len(wids)
self._docweight[docid] = len(wids)
self._totaldoclen += len(wids)
wid2count = self._get_frequencies(wids)
......@@ -92,8 +92,8 @@ class OkapiIndex(BaseIndex):
del self._docwords[docid]
count = self._doclen[docid]
del self._doclen[docid]
count = self._docweight[docid]
del self._docweight[docid]
self._totaldoclen -= count
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
......@@ -105,7 +105,7 @@ class OkapiIndex(BaseIndex):
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._doclen)) # total # of docs
N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N
K1 = self.K1
B = self.B
......@@ -117,7 +117,7 @@ class OkapiIndex(BaseIndex):
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
L = []
docid2len = self._doclen
docid2len = self._docweight
for t in wids:
assert self._wordinfo.has_key(t) # caller responsible for OOV
d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
......
......@@ -18,25 +18,11 @@ from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex
# The cosine and Okapi indices have the same public interfaces, but these
# tests access internal attributes, and those aren't identical.
# The IndexTest class is abstract, and subclasses must implement the
# check_docid_known and num_docs_known methods. CosineIndexTest (later in
# this file) does those in terms of ._docweight, while OkapiIndexTest
# (later in this file) does them in terms of ._doclen.
# Subclasses must set a class variable IndexFactory to the appropriate
# index object constructor.
class IndexTest(TestCase):
# Subclasses must implement these methods, and set a class variable
# IndexFactory to the appropriate index object constructor.
def check_docid_known(self, DOCID):
raise NotImplementedError
def num_docs_known(self):
raise NotImplementedError
def setUp(self):
self.lexicon = Lexicon(Splitter())
self.index = self.IndexFactory(self.lexicon)
......@@ -44,8 +30,8 @@ class IndexTest(TestCase):
def test_index_document(self, DOCID=1):
doc = "simple document contains five words"
self.index.index_doc(DOCID, doc)
self.check_docid_known(DOCID)
self.assertEqual(self.num_docs_known(), 1)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._docweight), 1)
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 5)
......@@ -57,7 +43,7 @@ class IndexTest(TestCase):
DOCID = 1
self.test_index_document(DOCID)
self.index.unindex_doc(DOCID)
self.assertEqual(self.num_docs_known(), 0)
self.assertEqual(len(self.index._docweight), 0)
self.assertEqual(len(self.index._wordinfo), 0)
self.assertEqual(len(self.index._docwords), 0)
......@@ -66,8 +52,8 @@ class IndexTest(TestCase):
doc = "another document just four"
DOCID = 2
self.index.index_doc(DOCID, doc)
self.check_docid_known(DOCID)
self.assertEqual(self.num_docs_known(), 2)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._docweight), 2)
self.assertEqual(len(self.index._wordinfo), 8)
self.assertEqual(len(self.index._docwords), 2)
self.assertEqual(len(self.index.get_words(DOCID)), 4)
......@@ -87,8 +73,8 @@ class IndexTest(TestCase):
self.test_index_two_documents()
self.index.unindex_doc(1)
DOCID = 2
self.assertEqual(self.num_docs_known(), 1)
self.check_docid_known(DOCID)
self.assertEqual(len(self.index._docweight), 1)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 4)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 4)
......@@ -99,7 +85,7 @@ class IndexTest(TestCase):
def test_index_duplicated_words(self, DOCID=1):
doc = "very simple repeat repeat repeat document test"
self.index.index_doc(DOCID, doc)
self.check_docid_known(DOCID)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 7)
......@@ -144,23 +130,9 @@ class IndexTest(TestCase):
class CosineIndexTest(IndexTest):
IndexFactory = CosineIndex
def check_docid_known(self, docid):
self.assert_(self.index._docweight.has_key(docid))
self.assert_(self.index._docweight[docid] > 0)
def num_docs_known(self):
return len(self.index._docweight)
class OkapiIndexTest(IndexTest):
IndexFactory = OkapiIndex
def check_docid_known(self, docid):
self.assert_(self.index._doclen.has_key(docid))
self.assert_(self.index._doclen[docid] > 0)
def num_docs_known(self):
return len(self.index._doclen)
def test_suite():
return TestSuite((makeSuite(CosineIndexTest),
makeSuite(OkapiIndexTest),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment