Commit 38ba01b6 authored by Tim Peters

Refactor/combine _docweight/_doclen.

parent 455af8ce
...@@ -53,7 +53,7 @@ class BaseIndex(Persistent): ...@@ -53,7 +53,7 @@ class BaseIndex(Persistent):
# wid -> {docid -> weight}; t -> D -> w(D, t) # wid -> {docid -> weight}; t -> D -> w(D, t)
# Different indexers have different notions of term weight, but we # Different indexers have different notions of term weight, but we
# expect all indexers to use ._wordinfo to map wids to its notion # expect each indexer to use ._wordinfo to map wids to its notion
# of a docid-to-weight map. # of a docid-to-weight map.
# There are two kinds of OOV words: wid 0 is explicitly OOV, # There are two kinds of OOV words: wid 0 is explicitly OOV,
# and it's possible that the lexicon will return a non-zero wid # and it's possible that the lexicon will return a non-zero wid
...@@ -64,6 +64,12 @@ class BaseIndex(Persistent): ...@@ -64,6 +64,12 @@ class BaseIndex(Persistent):
# wid 0 must not be a key in _wordinfo. # wid 0 must not be a key in _wordinfo.
self._wordinfo = IOBTree() self._wordinfo = IOBTree()
# docid -> weight
# Different indexers have different notions of doc weight, but we
# expect each indexer to use ._docweight to map docids to its
# notion of what a doc weight is.
self._docweight = IIBTree()
# docid -> WidCode'd list of wids # docid -> WidCode'd list of wids
# Used for un-indexing, and for phrase search. # Used for un-indexing, and for phrase search.
self._docwords = IOBTree() self._docwords = IOBTree()
......
...@@ -54,8 +54,8 @@ class CosineIndex(BaseIndex): ...@@ -54,8 +54,8 @@ class CosineIndex(BaseIndex):
# ._wordinfo for cosine is wid -> {docid -> weight}; # ._wordinfo for cosine is wid -> {docid -> weight};
# t -> D -> w(d, t)/W(d) # t -> D -> w(d, t)/W(d)
# ._docweight for cosine is
# docid -> W(docid) # docid -> W(docid)
self._docweight = IIBTree()
# Most of the computation for computing a relevance score for the # Most of the computation for computing a relevance score for the
# document occurs in the search() method. The code currently # document occurs in the search() method. The code currently
......
...@@ -63,20 +63,20 @@ class OkapiIndex(BaseIndex): ...@@ -63,20 +63,20 @@ class OkapiIndex(BaseIndex):
# ._wordinfo for Okapi is # ._wordinfo for Okapi is
# wid -> {docid -> frequency}; t -> D -> f(D, t) # wid -> {docid -> frequency}; t -> D -> f(D, t)
# ._docweight for Okapi is
# docid -> # of words in the doc # docid -> # of words in the doc
# This is just len(self._docwords[docid]), but _docwords is stored # This is just len(self._docwords[docid]), but _docwords is stored
# in compressed form, so uncompressing it just to count the list # in compressed form, so uncompressing it just to count the list
# length would be ridiculously expensive. # length would be ridiculously expensive.
self._doclen = IIBTree()
# sum(self._doclen.values()), the total # of words in all docs # sum(self._docweight.values()), the total # of words in all docs
# This is a long for "better safe than sorry" reasons. It isn't # This is a long for "better safe than sorry" reasons. It isn't
# used often enough that speed should matter. # used often enough that speed should matter.
self._totaldoclen = 0L self._totaldoclen = 0L
def index_doc(self, docid, text): def index_doc(self, docid, text):
wids = self._lexicon.sourceToWordIds(text) wids = self._lexicon.sourceToWordIds(text)
self._doclen[docid] = len(wids) self._docweight[docid] = len(wids)
self._totaldoclen += len(wids) self._totaldoclen += len(wids)
wid2count = self._get_frequencies(wids) wid2count = self._get_frequencies(wids)
...@@ -92,8 +92,8 @@ class OkapiIndex(BaseIndex): ...@@ -92,8 +92,8 @@ class OkapiIndex(BaseIndex):
del self._docwords[docid] del self._docwords[docid]
count = self._doclen[docid] count = self._docweight[docid]
del self._doclen[docid] del self._docweight[docid]
self._totaldoclen -= count self._totaldoclen -= count
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair # The workhorse. Return a list of (IIBucket, weight) pairs, one pair
...@@ -105,7 +105,7 @@ class OkapiIndex(BaseIndex): ...@@ -105,7 +105,7 @@ class OkapiIndex(BaseIndex):
def _search_wids(self, wids): def _search_wids(self, wids):
if not wids: if not wids:
return [] return []
N = float(len(self._doclen)) # total # of docs N = float(len(self._docweight)) # total # of docs
meandoclen = self._totaldoclen / N meandoclen = self._totaldoclen / N
K1 = self.K1 K1 = self.K1
B = self.B B = self.B
...@@ -117,7 +117,7 @@ class OkapiIndex(BaseIndex): ...@@ -117,7 +117,7 @@ class OkapiIndex(BaseIndex):
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D))) # f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
L = [] L = []
docid2len = self._doclen docid2len = self._docweight
for t in wids: for t in wids:
assert self._wordinfo.has_key(t) # caller responsible for OOV assert self._wordinfo.has_key(t) # caller responsible for OOV
d2f = self._wordinfo[t] # map {docid -> f(docid, t)} d2f = self._wordinfo[t] # map {docid -> f(docid, t)}
......
...@@ -18,25 +18,11 @@ from Products.ZCTextIndex.Lexicon import Lexicon, Splitter ...@@ -18,25 +18,11 @@ from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.CosineIndex import CosineIndex from Products.ZCTextIndex.CosineIndex import CosineIndex
from Products.ZCTextIndex.OkapiIndex import OkapiIndex from Products.ZCTextIndex.OkapiIndex import OkapiIndex
# The cosine and Okapi indices have the same public interfaces, but these # Subclasses must set a class variable IndexFactory to the appropriate
# tests access internal attributes, and those aren't identical. # index object constructor.
# The IndexTest class is abstract, and subclasses must implement the
# check_docid_known and num_docs_known methods. CosineIndexTest (later in
# this file) does those in terms of ._docweight, while OkapiIndexTest
# (later in this file) does them in terms of ._doclen.
class IndexTest(TestCase): class IndexTest(TestCase):
# Subclasses must implement these methods, and set a class variable
# IndexFactory to the appropriate index object constructor.
def check_docid_known(self, DOCID):
raise NotImplementedError
def num_docs_known(self):
raise NotImplementedError
def setUp(self): def setUp(self):
self.lexicon = Lexicon(Splitter()) self.lexicon = Lexicon(Splitter())
self.index = self.IndexFactory(self.lexicon) self.index = self.IndexFactory(self.lexicon)
...@@ -44,8 +30,8 @@ class IndexTest(TestCase): ...@@ -44,8 +30,8 @@ class IndexTest(TestCase):
def test_index_document(self, DOCID=1): def test_index_document(self, DOCID=1):
doc = "simple document contains five words" doc = "simple document contains five words"
self.index.index_doc(DOCID, doc) self.index.index_doc(DOCID, doc)
self.check_docid_known(DOCID) self.assert_(self.index._docweight[DOCID])
self.assertEqual(self.num_docs_known(), 1) self.assertEqual(len(self.index._docweight), 1)
self.assertEqual(len(self.index._wordinfo), 5) self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1) self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 5) self.assertEqual(len(self.index.get_words(DOCID)), 5)
...@@ -57,7 +43,7 @@ class IndexTest(TestCase): ...@@ -57,7 +43,7 @@ class IndexTest(TestCase):
DOCID = 1 DOCID = 1
self.test_index_document(DOCID) self.test_index_document(DOCID)
self.index.unindex_doc(DOCID) self.index.unindex_doc(DOCID)
self.assertEqual(self.num_docs_known(), 0) self.assertEqual(len(self.index._docweight), 0)
self.assertEqual(len(self.index._wordinfo), 0) self.assertEqual(len(self.index._wordinfo), 0)
self.assertEqual(len(self.index._docwords), 0) self.assertEqual(len(self.index._docwords), 0)
...@@ -66,8 +52,8 @@ class IndexTest(TestCase): ...@@ -66,8 +52,8 @@ class IndexTest(TestCase):
doc = "another document just four" doc = "another document just four"
DOCID = 2 DOCID = 2
self.index.index_doc(DOCID, doc) self.index.index_doc(DOCID, doc)
self.check_docid_known(DOCID) self.assert_(self.index._docweight[DOCID])
self.assertEqual(self.num_docs_known(), 2) self.assertEqual(len(self.index._docweight), 2)
self.assertEqual(len(self.index._wordinfo), 8) self.assertEqual(len(self.index._wordinfo), 8)
self.assertEqual(len(self.index._docwords), 2) self.assertEqual(len(self.index._docwords), 2)
self.assertEqual(len(self.index.get_words(DOCID)), 4) self.assertEqual(len(self.index.get_words(DOCID)), 4)
...@@ -87,8 +73,8 @@ class IndexTest(TestCase): ...@@ -87,8 +73,8 @@ class IndexTest(TestCase):
self.test_index_two_documents() self.test_index_two_documents()
self.index.unindex_doc(1) self.index.unindex_doc(1)
DOCID = 2 DOCID = 2
self.assertEqual(self.num_docs_known(), 1) self.assertEqual(len(self.index._docweight), 1)
self.check_docid_known(DOCID) self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 4) self.assertEqual(len(self.index._wordinfo), 4)
self.assertEqual(len(self.index._docwords), 1) self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 4) self.assertEqual(len(self.index.get_words(DOCID)), 4)
...@@ -99,7 +85,7 @@ class IndexTest(TestCase): ...@@ -99,7 +85,7 @@ class IndexTest(TestCase):
def test_index_duplicated_words(self, DOCID=1): def test_index_duplicated_words(self, DOCID=1):
doc = "very simple repeat repeat repeat document test" doc = "very simple repeat repeat repeat document test"
self.index.index_doc(DOCID, doc) self.index.index_doc(DOCID, doc)
self.check_docid_known(DOCID) self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 5) self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1) self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index.get_words(DOCID)), 7) self.assertEqual(len(self.index.get_words(DOCID)), 7)
...@@ -144,23 +130,9 @@ class IndexTest(TestCase): ...@@ -144,23 +130,9 @@ class IndexTest(TestCase):
class CosineIndexTest(IndexTest): class CosineIndexTest(IndexTest):
IndexFactory = CosineIndex IndexFactory = CosineIndex
def check_docid_known(self, docid):
self.assert_(self.index._docweight.has_key(docid))
self.assert_(self.index._docweight[docid] > 0)
def num_docs_known(self):
return len(self.index._docweight)
class OkapiIndexTest(IndexTest): class OkapiIndexTest(IndexTest):
IndexFactory = OkapiIndex IndexFactory = OkapiIndex
def check_docid_known(self, docid):
self.assert_(self.index._doclen.has_key(docid))
self.assert_(self.index._doclen[docid] > 0)
def num_docs_known(self):
return len(self.index._doclen)
def test_suite(): def test_suite():
return TestSuite((makeSuite(CosineIndexTest), return TestSuite((makeSuite(CosineIndexTest),
makeSuite(OkapiIndexTest), makeSuite(OkapiIndexTest),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment