Kirill Smelkov / Zope / Commits

Commit 72ed10fe authored May 17, 2002 by Tim Peters
Pushed the subclassing far enough to be useful. More is needed, but
I need a break.
parent 597b6934
Showing 3 changed files with 24 additions and 307 deletions
lib/python/Products/ZCTextIndex/BaseIndex.py    +21 -240
lib/python/Products/ZCTextIndex/CosineIndex.py  +2 -29
lib/python/Products/ZCTextIndex/OkapiIndex.py   +1 -38
lib/python/Products/ZCTextIndex/BaseIndex.py
@@ -51,6 +51,19 @@ class BaseIndex(Persistent):
    def __init__(self, lexicon):
        self._lexicon = lexicon
        # wid -> {docid -> weight}; t -> D -> w(D, t)
        # Different indexers have different notions of term weight, but we
        # expect all indexers to use ._wordinfo to map wids to its notion
        # of a docid-to-weight map.
        # There are two kinds of OOV words: wid 0 is explicitly OOV,
        # and it's possible that the lexicon will return a non-zero wid
        # for a word *we've* never seen (e.g., lexicons can be shared
        # across indices, and a query can contain a word some other
        # index knows about but we don't). A word is in-vocabulary for
        # this index if and only if _wordinfo.has_key(wid). Note that
        # wid 0 must not be a key in _wordinfo.
        self._wordinfo = IOBTree()
        # docid -> WidCode'd list of wids
        # Used for un-indexing, and for phrase search.
        self._docwords = IOBTree()
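To make the shapes of those two mappings concrete, here is a small illustrative sketch (not part of the commit) using plain dicts and lists in place of IOBTree and the WidCode-compressed wid strings; the wids and docids are invented:

    # Hypothetical data for two indexed documents; plain containers stand in
    # for IOBTree and WidCode.
    _wordinfo = {          # wid -> {docid -> weight}; t -> D -> w(D, t)
        7:  {1: 2, 3: 1},  # wid 7: weight 2 in doc 1 (it occurs twice), 1 in doc 3
        12: {3: 4},        # wid 12 appears only in doc 3
    }
    _docwords = {          # docid -> wids in document order (unindexing, phrases)
        1: [7, 7],
        3: [12, 7, 12, 12, 12],
    }

    # A word is in-vocabulary for this index iff its wid is a key of _wordinfo;
    # wid 0 (the explicit OOV marker) must never be such a key.
    assert 0 not in _wordinfo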
@@ -63,27 +76,13 @@ class BaseIndex(Persistent):
        """Returns the wordids for a given docid"""
        return WidCode.decode(self._docwords[docid])

    # Subclass must override.
    def index_doc(self, docid, text):
        wids = self._lexicon.sourceToWordIds(text)
        self._doclen[docid] = len(wids)
        self._totaldoclen += len(wids)
        wid2count = self._get_frequencies(wids)
        for wid, count in wid2count.items():
            self._add_wordinfo(wid, count, docid)
        self._docwords[docid] = WidCode.encode(wids)
        return len(wids)
        raise NotImplementedError

    # Subclass must override.
    def unindex_doc(self, docid):
        for wid in WidCode.decode(self._docwords[docid]):
            self._del_wordinfo(wid, docid)
        del self._docwords[docid]
        count = self._doclen[docid]
        del self._doclen[docid]
        self._totaldoclen -= count
        raise NotImplementedError

    def search(self, term):
        wids = self._lexicon.termToWordIds(term)
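After this change, index_doc and unindex_doc are abstract here and each concrete index supplies its own version. As a rough, purely illustrative sketch of what such an override does, here is a toy version along the lines of the deleted bodies, with plain dicts in place of the BTrees and no WidCode compression (only the method signatures come from the diff):

    from collections import Counter

    class ToySubclass:
        """Illustrative only: roughly what a BaseIndex subclass's overrides of
        index_doc/unindex_doc do, minus BTrees, WidCode, and persistence."""

        def __init__(self, lexicon):
            self._lexicon = lexicon
            self._wordinfo = {}     # wid -> {docid -> count}
            self._docwords = {}     # docid -> list of wids
            self._doclen = {}       # docid -> number of words in the doc
            self._totaldoclen = 0

        def index_doc(self, docid, text):
            wids = self._lexicon.sourceToWordIds(text)
            self._doclen[docid] = len(wids)
            self._totaldoclen += len(wids)
            for wid, count in Counter(wids).items():
                self._wordinfo.setdefault(wid, {})[docid] = count
            self._docwords[docid] = list(wids)
            return len(wids)

        def unindex_doc(self, docid):
            for wid in set(self._docwords.pop(docid)):
                docid2count = self._wordinfo[wid]
                del docid2count[docid]
                if not docid2count:
                    del self._wordinfo[wid]   # wid no longer in any doc
            self._totaldoclen -= self._doclen.pop(docid)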
@@ -117,52 +116,13 @@ class BaseIndex(Persistent):
    def _remove_oov_wids(self, wids):
        return filter(self._wordinfo.has_key, wids)

    # Subclass must override.
    # The workhorse. Return a list of (IIBucket, weight) pairs, one pair
    # for each wid t in wids. The IIBucket, times the weight, maps D to
    # TF(D,t) * IDF(t) for every docid D containing t.
    # As currently written, the weights are always 1, and the IIBucket maps
    # D to TF(D,t)*IDF(t) directly, where the product is computed as a float
    # but stored as a scaled_int.
    # TF(D,t) * IDF(t) for every docid D containing t. wids must not
    # contain any OOV words.
    def _search_wids(self, wids):
        if not wids:
            return []
        N = float(len(self._doclen))  # total # of docs
        meandoclen = self._totaldoclen / N
        K1 = self.K1
        B = self.B
        K1_plus1 = K1 + 1.0
        B_from1 = 1.0 - B

        #                            f(D, t) * (k1 + 1)
        #   TF(D, t) =  -------------------------------------------
        #               f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))

        L = []
        docid2len = self._doclen
        for t in wids:
            assert self._wordinfo.has_key(t)  # caller responsible for OOV
            d2f = self._wordinfo[t]  # map {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)  # an unscaled float
            result = IIBucket()
            for docid, f in d2f.items():
                lenweight = B_from1 + B * docid2len[docid] / meandoclen
                tf = f * K1_plus1 / (f + K1 * lenweight)
                result[docid] = scaled_int(tf * idf)
            L.append((result, 1))
        return L

    # Note about the above: the result is tf * idf. tf is small -- it
    # can't be larger than k1+1 = 2.2. idf is formally unbounded, but
    # is less than 14 for a term that appears in only 1 of a million
    # documents. So the product is probably less than 32, or 5 bits
    # before the radix point. If we did the scaled-int business on
    # both of them, we'd be up to 25 bits. Add 64 of those and we'd
    # be in overflow territory. That's pretty unlikely, so we *could*
    # just store scaled_int(tf) in result[docid], and use scaled_int(idf)
    # as an invariant weight across the whole result. But besides
    # skating near the edge, it's not a speed cure, since the computation
    # of tf would still be done at Python speed, and it's a lot more
    # work than just multiplying by idf.
        raise NotImplementedError

    def query_weight(self, terms):
        # This method was inherited from the cosine measure, and doesn't
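_search_wids likewise becomes abstract here. For readers without the BTrees types at hand, the following is a rough pure-Python rendering of what the deleted loop computes; plain dicts and floats replace IIBucket and scaled_int, and K1=1.2, B=0.75 are the defaults discussed in the module docstring further down:

    import math

    def inverse_doc_frequency(term_count, num_items):
        # IDF(t) = log(1 + N/f(t)), matching the helper shown in the next hunk
        return math.log(1.0 + float(num_items) / term_count)

    def search_wids_sketch(wordinfo, doclen, totaldoclen, wids, K1=1.2, B=0.75):
        """Return a list of ({docid: TF*IDF}, weight) pairs, one per wid,
        with the weight always 1, mirroring the comments above."""
        if not wids:
            return []
        N = float(len(doclen))            # total number of documents
        meandoclen = totaldoclen / N      # E(len(D))
        L = []
        for t in wids:
            d2f = wordinfo[t]             # {docid -> f(docid, t)}
            idf = inverse_doc_frequency(len(d2f), N)
            result = {}
            for docid, f in d2f.items():
                lenweight = (1.0 - B) + B * doclen[docid] / meandoclen
                tf = f * (K1 + 1.0) / (f + K1 * lenweight)
                result[docid] = tf * idf  # kept as a float; no scaled_int
            L.append((result, 1))
        return L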
@@ -246,182 +206,3 @@ def inverse_doc_frequency(term_count, num_items):
    """
    # implements IDF(q, t) = log(1 + N/f(t))
    return math.log(1.0 + float(num_items) / term_count)
"""
"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
It's based on probability arguments about how words are distributed in
documents, not on an abstract vector space model. A long paper by its
principal inventors gives an excellent overview of how it was derived:
A probabilistic model of information retrieval: development and status
K. Sparck Jones, S. Walker, S.E. Robertson
http://citeseer.nj.nec.com/jones98probabilistic.html
Spellings that ignore relevance information (which we don't have) are of this
high-level form:
score(D, Q) = sum(for t in D&Q: TF(D, t) * IDF(Q, t))
where
D a specific document
Q a specific query
t a term (word, atomic phrase, whatever)
D&Q the terms common to D and Q
TF(D, t) a measure of t's importance in D -- a kind of term frequency
weight
IDF(Q, t) a measure of t's importance in the query and in the set of
documents as a whole -- a kind of inverse document frequency
weight
The IDF(Q, t) here is identical to the one used for our cosine measure.
Since queries are expected to be short, it ignores Q entirely:
IDF(Q, t) = log(1.0 + N / f(t))
where
N the total number of documents
f(t) the number of documents in which t appears
Most Okapi literature seems to use log(N/f(t)) instead. We don't, because
that becomes 0 for a term that's in every document, and, e.g., if someone
is searching for "documentation" on python.org (a term that may well show
up on every page, due to the top navigation bar), we still want to find the
pages that use the word a lot (which is TF's job to find, not IDF's -- we
just want to stop IDF from considering this t to be irrelevant).
The TF(D, t) spellings are more interesting. With lots of variations, the
most basic spelling is of the form
                        f(D, t)
    TF(D, t) = ---------------
                f(D, t) + K(D)
where
f(D, t) the number of times t appears in D
K(D) a measure of the length of D, normalized to mean doc length
The functional *form* f/(f+K) is clever. It's a gross approximation to a
mixture of two distinct Poisson distributions, based on the idea that t
probably appears in D for one of two reasons:
1. More or less at random.
2. Because it's important to D's purpose in life ("eliteness" in papers).
Note that f/(f+K) is always between 0 and 1. If f is very large compared to
K, it approaches 1. If K is very large compared to f, it approaches 0. If
t appears in D more or less "for random reasons", f is likely to be small,
and so K will dominate unless it's a very small doc, and the ratio will be
small. OTOH, if t appears a lot in D, f will dominate unless it's a very
large doc, and the ratio will be close to 1.
We use a variation on that simple theme, a simplification of what's called
BM25 in the literature (it was the 25th stab at a Best Match function from
the Okapi group; "a simplification" means we're setting some of BM25's more
esoteric free parameters to 0):
                 f(D, t) * (k1 + 1)
    TF(D, t) = --------------------
                f(D, t) + k1 * K(D)
where
k1 a "tuning factor", typically between 1.0 and 2.0. We use 1.2,
the usual default value. This constant adjusts the curve to
look more like a theoretical 2-Poisson curve.
Note that as f(D, t) increases, TF(D, t) increases monotonically, approaching
an asymptote of k1+1 from below.
Finally, we use
K(D) = (1-b) + b * len(D)/E(len(D))
where
b is another free parameter, discussed below. We use 0.75.
len(D) the length of D in words
E(len(D)) the expected value of len(D) across the whole document set;
or, IOW, the average document length
b is a free parameter between 0.0 and 1.0, and adjusts for the expected effect
of the "Verbosity Hypothesis". Suppose b is 1, and some word t appears
10 times as often in document d2 as in document d1. If document d2 is
also 10 times as long as d1, TF(d1, t) and TF(d2, t) are identical:
                           f(d2, t) * (k1 + 1)
    TF(d2, t) = --------------------------------- =
                 f(d2, t) + k1 * len(d2)/E(len(D))

                   10 * f(d1, t) * (k1 + 1)
    ----------------------------------------------- = TF(d1, t)
     10 * f(d1, t) + k1 * (10 * len(d1))/E(len(D))
because the 10's cancel out. This is appropriate if we believe that a word
appearing 10x more often in a doc 10x as long simply reflects that the
longer doc is more verbose. If we do believe that, the longer doc and the
shorter doc are probably equally relevant. OTOH, it *could* be that the
longer doc is talking about t in greater depth too, in which case it's
probably more relevant than the shorter doc.
At the other extreme, if we set b to 0, the len(D)/E(len(D)) term vanishes
completely, and a doc scores higher for having more occurrences of a word
regardless of the doc's length.
Reality is between these extremes, and probably varies by document and word
too. Reports in the literature suggest that b=0.75 is a good compromise "in
general", favoring the "verbosity hypothesis" end of the scale.
Putting it all together, the final TF function is
                            f(D, t) * (k1 + 1)
    TF(D, t) = --------------------------------------------
                f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
with k1=1.2 and b=0.75.
Query Term Weighting
--------------------
I'm ignoring the query adjustment part of Okapi BM25 because I expect our
queries are very short. Full BM25 takes them into account by adding the
following to every score(D, Q); it depends on the lengths of D and Q, but
not on the specific words in Q, or even on whether they appear in D(!):
                   E(len(D)) - len(D)
    k2 * len(Q) * -------------------
                   E(len(D)) + len(D)
Here k2 is another "tuning constant", len(Q) is the number of words in Q, and
len(D) & E(len(D)) were defined above. The Okapi group set k2 to 0 in TREC-9,
so it apparently doesn't do much good (or may even hurt).
Full BM25 *also* multiplies the following factor into IDF(Q, t):
    f(Q, t) * (k3 + 1)
    ------------------
       f(Q, t) + k3
where k3 is yet another free parameter, and f(Q,t) is the number of times t
appears in Q. Since we're using short "web style" queries, I expect f(Q,t)
to always be 1, and then that quotient is
    1 * (k3 + 1)
    ------------ = 1
       1 + k3
regardless of k3's value. So, in a trivial sense, we are incorporating
this measure (and optimizing it by not bothering to multiply by 1 <wink>).
"""
lib/python/Products/ZCTextIndex/CosineIndex.py
@@ -51,8 +51,8 @@ class CosineIndex(BaseIndex):
    def __init__(self, lexicon):
        BaseIndex.__init__(self, lexicon)
        # wid -> { docid -> frequency }
        self._wordinfo = IOBTree()
        # ._wordinfo for cosine is wid -> {docid -> weight};
        # t -> D -> w(d, t)/W(d)
        # docid -> W(docid)
        self._docweight = IIBTree()
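The comments above describe the cosine bookkeeping: _docweight holds a per-document quantity W(d), and _wordinfo stores each term weight already divided by it. A minimal sketch of that normalization, assuming w(d, t) is some per-document term weight and W(d) is the Euclidean length of d's weight vector, the usual cosine convention (the actual weight function and the scaled-integer storage are not shown in this hunk):

    import math

    def cosine_entries(doc_term_weights):
        """doc_term_weights: {wid -> w(d, t)} for one document d.
        Returns (W(d), {wid -> w(d, t)/W(d)}), the shapes described above."""
        W = math.sqrt(sum(w * w for w in doc_term_weights.values()))
        return W, {wid: w / W for wid, w in doc_term_weights.items()}

    # e.g. wid 7 with weight 3.0 and wid 12 with weight 4.0:
    W, normalized = cosine_entries({7: 3.0, 12: 4.0})
    # W == 5.0, normalized == {7: 0.6, 12: 0.8}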
@@ -102,33 +102,6 @@ class CosineIndex(BaseIndex):
        del self._docwords[docid]
        del self._docweight[docid]

    def search(self, term):
        wids = self._lexicon.termToWordIds(term)
        if not wids:
            return None  # All docs match
        if 0 in wids:
            wids = filter(None, wids)
        return mass_weightedUnion(self._search_wids(wids))

    def search_glob(self, pattern):
        wids = self._lexicon.globToWordIds(pattern)
        return mass_weightedUnion(self._search_wids(wids))

    def search_phrase(self, phrase):
        wids = self._lexicon.termToWordIds(phrase)
        if 0 in wids:
            return IIBTree()
        hits = mass_weightedIntersection(self._search_wids(wids))
        if not hits:
            return hits
        code = WidCode.encode(wids)
        result = IIBTree()
        for docid, weight in hits.items():
            docwords = self._docwords[docid]
            if docwords.find(code) >= 0:
                result[docid] = weight
        return result

    def _search_wids(self, wids):
        if not wids:
            return []
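The docwords.find(code) test in search_phrase above relies on WidCode packing a document's wid sequence into a single string, so that a phrase hit amounts to a contiguous-subsequence match. A list-based stand-in for that test (illustrative only; the wids are invented):

    def contains_phrase(doc_wids, phrase_wids):
        """True iff phrase_wids occurs as a contiguous run inside doc_wids;
        plays the role of WidCode.encode(...) plus str.find(...) above."""
        n, m = len(doc_wids), len(phrase_wids)
        if m == 0:
            return True
        return any(doc_wids[i:i + m] == phrase_wids for i in range(n - m + 1))

    # doc wids [3, 8, 5, 9] for "the quick brown fox":
    assert contains_phrase([3, 8, 5, 9], [8, 5])      # "quick brown" matches
    assert not contains_phrase([3, 8, 5, 9], [5, 8])  # order matters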
lib/python/Products/ZCTextIndex/OkapiIndex.py
@@ -60,13 +60,8 @@ class OkapiIndex(BaseIndex):
    def __init__(self, lexicon):
        BaseIndex.__init__(self, lexicon)
        # ._wordinfo for Okapi is
        # wid -> {docid -> frequency}; t -> D -> f(D, t)
        # There are two kinds of OOV words: wid 0 is explicitly OOV,
        # and it's possible that the lexicon will return a non-zero wid
        # for a word *we've* never seen (e.g., lexicons can be shared
        # across indices, and a query can contain a word some other
        # index knows about but we don't).
        self._wordinfo = IOBTree()

        # docid -> # of words in the doc
        # This is just len(self._docwords[docid]), but _docwords is stored
@@ -101,38 +96,6 @@ class OkapiIndex(BaseIndex):
        del self._doclen[docid]
        self._totaldoclen -= count

    def search(self, term):
        wids = self._lexicon.termToWordIds(term)
        if not wids:
            return None  # All docs match
        wids = self._remove_oov_wids(wids)
        return mass_weightedUnion(self._search_wids(wids))

    def search_glob(self, pattern):
        wids = self._lexicon.globToWordIds(pattern)
        return mass_weightedUnion(self._search_wids(wids))

    def search_phrase(self, phrase):
        wids = self._lexicon.termToWordIds(phrase)
        cleaned_wids = self._remove_oov_wids(wids)
        if len(wids) != len(cleaned_wids):
            # At least one wid was OOV: can't possibly find it.
            return IIBTree()
        scores = self._search_wids(cleaned_wids)
        hits = mass_weightedIntersection(scores)
        if not hits:
            return hits
        code = WidCode.encode(wids)
        result = IIBTree()
        for docid, weight in hits.items():
            docwords = self._docwords[docid]
            if docwords.find(code) >= 0:
                result[docid] = weight
        return result

    def _remove_oov_wids(self, wids):
        return filter(self._wordinfo.has_key, wids)

    # The workhorse. Return a list of (IIBucket, weight) pairs, one pair
    # for each wid t in wids. The IIBucket, times the weight, maps D to
    # TF(D,t) * IDF(t) for every docid D containing t.
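Whether they live in OkapiIndex or, after this commit, in BaseIndex, these methods make up the index's query surface. A hypothetical usage sketch, assuming lexicon is an object providing the sourceToWordIds/termToWordIds/globToWordIds methods called in the diffs (its construction is not part of this commit):

    # Illustrative only; `lexicon` is assumed to exist and behave as described above.
    from Products.ZCTextIndex.OkapiIndex import OkapiIndex

    index = OkapiIndex(lexicon)

    index.index_doc(1, "the quick brown fox")
    index.index_doc(2, "the lazy brown dog")

    hits = index.search("brown")               # docid -> scaled score; None means
                                               # the query had no wids (all docs match)
    wild = index.search_glob("qui*")           # wildcard query via globToWordIds
    phrase = index.search_phrase("brown fox")  # only docs with the exact word sequence

    index.unindex_doc(2)                       # drop doc 2 from every mapping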