Commit 85236fcc authored by Tim Peters's avatar Tim Peters

Compute inverse doc frequency the same way everywhere.

parent b690a6e3
...@@ -21,7 +21,7 @@ from BTrees.IIBTree import IIBTree, IIBucket ...@@ -21,7 +21,7 @@ from BTrees.IIBTree import IIBTree, IIBucket
from Products.ZCTextIndex.IIndex import IIndex from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.BaseIndex import BaseIndex from Products.ZCTextIndex.BaseIndex import BaseIndex, inverse_doc_frequency
from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \ from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion mass_weightedUnion
...@@ -77,7 +77,7 @@ class CosineIndex(BaseIndex): ...@@ -77,7 +77,7 @@ class CosineIndex(BaseIndex):
# self._wordinfo[t] is a map from d to w(d, t). # self._wordinfo[t] is a map from d to w(d, t).
# #
# w(q, t) = log(1 + N/f(t)) # w(q, t) = log(1 + N/f(t))
# computed by query_term_weight() # computed by inverse_doc_frequency()
# #
# W(d) = sqrt(sum(for t in d: w(d, t) ** 2)) # W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
# computed by _get_frequencies(), and remembered in # computed by _get_frequencies(), and remembered in
...@@ -110,7 +110,7 @@ class CosineIndex(BaseIndex): ...@@ -110,7 +110,7 @@ class CosineIndex(BaseIndex):
for wid in wids: for wid in wids:
assert self._wordinfo.has_key(wid) # caller responsible for OOV assert self._wordinfo.has_key(wid) # caller responsible for OOV
d2w = self._wordinfo[wid] # maps docid to w(docid, wid) d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
idf = query_term_weight(len(d2w), N) # this is an unscaled float idf = inverse_doc_frequency(len(d2w), N) # this is an unscaled float
#print "idf = %.3f" % idf #print "idf = %.3f" % idf
if isinstance(d2w, DictType): if isinstance(d2w, DictType):
d2w = IIBucket(d2w) d2w = IIBucket(d2w)
...@@ -237,12 +237,3 @@ def doc_term_weight(count): ...@@ -237,12 +237,3 @@ def doc_term_weight(count):
"""Return the doc-term weight for a term that appears count times.""" """Return the doc-term weight for a term that appears count times."""
# implements w(d, t) = 1 + log f(d, t) # implements w(d, t) = 1 + log f(d, t)
return 1.0 + math.log(count) return 1.0 + math.log(count)
def query_term_weight(term_count, num_items):
    """Return the query-term weight for a term.

    term_count is the number of items the term appears in; num_items is
    the total number of items in the collection.
    """
    # w(q, t) = log(1 + N/f(t)); float() keeps the division exact
    # even when both counts are ints.
    ratio = float(num_items) / term_count
    return math.log(1.0 + ratio)
...@@ -24,7 +24,7 @@ from BTrees.IIBTree import IIBTree, IIBucket ...@@ -24,7 +24,7 @@ from BTrees.IIBTree import IIBTree, IIBucket
from Products.ZCTextIndex.IIndex import IIndex from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.BaseIndex import BaseIndex from Products.ZCTextIndex.BaseIndex import BaseIndex, inverse_doc_frequency
from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \ from Products.ZCTextIndex.SetOps import mass_weightedIntersection, \
mass_weightedUnion mass_weightedUnion
...@@ -212,15 +212,6 @@ class OkapiIndex(BaseIndex): ...@@ -212,15 +212,6 @@ class OkapiIndex(BaseIndex):
map = new map = new
self._wordinfo[wid] = map # Not redundant, because of Persistency! self._wordinfo[wid] = map # Not redundant, because of Persistency!
def inverse_doc_frequency(term_count, num_items):
    """Return the inverse doc frequency for a term.

    term_count is the number of items the term appears in; num_items is
    the total number of items in the collection.
    """
    # IDF(q, t) = log(1 + N/f(t)); coerce to float so integer
    # counts still divide exactly.
    return math.log(1.0 + num_items / float(term_count))
""" """
"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks. "Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
It's based on probability arguments about how words are distributed in It's based on probability arguments about how words are distributed in
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment