Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Z
Zope
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
Zope
Commits
72ed10fe
Commit
72ed10fe
authored
May 17, 2002
by
Tim Peters
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Pushed the subclassing far enough to be useful. More is needed, but
I need a break.
parent
597b6934
Changes
3
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
24 additions
and
307 deletions
+24
-307
lib/python/Products/ZCTextIndex/BaseIndex.py
lib/python/Products/ZCTextIndex/BaseIndex.py
+21
-240
lib/python/Products/ZCTextIndex/CosineIndex.py
lib/python/Products/ZCTextIndex/CosineIndex.py
+2
-29
lib/python/Products/ZCTextIndex/OkapiIndex.py
lib/python/Products/ZCTextIndex/OkapiIndex.py
+1
-38
No files found.
lib/python/Products/ZCTextIndex/BaseIndex.py
View file @
72ed10fe
This diff is collapsed.
Click to expand it.
lib/python/Products/ZCTextIndex/CosineIndex.py
View file @
72ed10fe
...
...
@@ -51,8 +51,8 @@ class CosineIndex(BaseIndex):
def
__init__
(
self
,
lexicon
):
BaseIndex
.
__init__
(
self
,
lexicon
)
#
wid -> { docid -> frequency }
self
.
_wordinfo
=
IOBTree
(
)
#
._wordinfo for cosine is wid -> {docid -> weight};
# t -> D -> w(d, t)/W(d
)
# docid -> W(docid)
self
.
_docweight
=
IIBTree
()
...
...
@@ -102,33 +102,6 @@ class CosineIndex(BaseIndex):
del
self
.
_docwords
[
docid
]
del
self
.
_docweight
[
docid
]
def
search
(
self
,
term
):
wids
=
self
.
_lexicon
.
termToWordIds
(
term
)
if
not
wids
:
return
None
# All docs match
if
0
in
wids
:
wids
=
filter
(
None
,
wids
)
return
mass_weightedUnion
(
self
.
_search_wids
(
wids
))
def
search_glob
(
self
,
pattern
):
wids
=
self
.
_lexicon
.
globToWordIds
(
pattern
)
return
mass_weightedUnion
(
self
.
_search_wids
(
wids
))
def
search_phrase
(
self
,
phrase
):
wids
=
self
.
_lexicon
.
termToWordIds
(
phrase
)
if
0
in
wids
:
return
IIBTree
()
hits
=
mass_weightedIntersection
(
self
.
_search_wids
(
wids
))
if
not
hits
:
return
hits
code
=
WidCode
.
encode
(
wids
)
result
=
IIBTree
()
for
docid
,
weight
in
hits
.
items
():
docwords
=
self
.
_docwords
[
docid
]
if
docwords
.
find
(
code
)
>=
0
:
result
[
docid
]
=
weight
return
result
def
_search_wids
(
self
,
wids
):
if
not
wids
:
return
[]
...
...
lib/python/Products/ZCTextIndex/OkapiIndex.py
View file @
72ed10fe
...
...
@@ -60,13 +60,8 @@ class OkapiIndex(BaseIndex):
def
__init__
(
self
,
lexicon
):
BaseIndex
.
__init__
(
self
,
lexicon
)
# ._wordinfo for Okapi is
# wid -> {docid -> frequency}; t -> D -> f(D, t)
# There are two kinds of OOV words: wid 0 is explicitly OOV,
# and it's possible that the lexicon will return a non-zero wid
# for a word *we've* never seen (e.g., lexicons can be shared
# across indices, and a query can contain a word some other
# index knows about but we don't).
self
.
_wordinfo
=
IOBTree
()
# docid -> # of words in the doc
# This is just len(self._docwords[docid]), but _docwords is stored
...
...
@@ -101,38 +96,6 @@ class OkapiIndex(BaseIndex):
del
self
.
_doclen
[
docid
]
self
.
_totaldoclen
-=
count
def
search
(
self
,
term
):
wids
=
self
.
_lexicon
.
termToWordIds
(
term
)
if
not
wids
:
return
None
# All docs match
wids
=
self
.
_remove_oov_wids
(
wids
)
return
mass_weightedUnion
(
self
.
_search_wids
(
wids
))
def
search_glob
(
self
,
pattern
):
wids
=
self
.
_lexicon
.
globToWordIds
(
pattern
)
return
mass_weightedUnion
(
self
.
_search_wids
(
wids
))
def
search_phrase
(
self
,
phrase
):
wids
=
self
.
_lexicon
.
termToWordIds
(
phrase
)
cleaned_wids
=
self
.
_remove_oov_wids
(
wids
)
if
len
(
wids
)
!=
len
(
cleaned_wids
):
# At least one wid was OOV: can't possibly find it.
return
IIBTree
()
scores
=
self
.
_search_wids
(
cleaned_wids
)
hits
=
mass_weightedIntersection
(
scores
)
if
not
hits
:
return
hits
code
=
WidCode
.
encode
(
wids
)
result
=
IIBTree
()
for
docid
,
weight
in
hits
.
items
():
docwords
=
self
.
_docwords
[
docid
]
if
docwords
.
find
(
code
)
>=
0
:
result
[
docid
]
=
weight
return
result
def
_remove_oov_wids
(
self
,
wids
):
return
filter
(
self
.
_wordinfo
.
has_key
,
wids
)
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
# for each wid t in wids. The IIBucket, times the weight, maps D to
# TF(D,t) * IDF(t) for every docid D containing t.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment