Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Z
Zope
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
Zope
Commits
38ba01b6
Commit
38ba01b6
authored
May 17, 2002
by
Tim Peters
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Refactor/combine _docweight/_doclen.
parent
455af8ce
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
25 additions
and
47 deletions
+25
-47
lib/python/Products/ZCTextIndex/BaseIndex.py
lib/python/Products/ZCTextIndex/BaseIndex.py
+7
-1
lib/python/Products/ZCTextIndex/CosineIndex.py
lib/python/Products/ZCTextIndex/CosineIndex.py
+1
-1
lib/python/Products/ZCTextIndex/OkapiIndex.py
lib/python/Products/ZCTextIndex/OkapiIndex.py
+7
-7
lib/python/Products/ZCTextIndex/tests/testIndex.py
lib/python/Products/ZCTextIndex/tests/testIndex.py
+10
-38
No files found.
lib/python/Products/ZCTextIndex/BaseIndex.py
View file @
38ba01b6
...
...
@@ -53,7 +53,7 @@ class BaseIndex(Persistent):
# wid -> {docid -> weight}; t -> D -> w(D, t)
# Different indexers have different notions of term weight, but we
# expect
all indexers
to use ._wordinfo to map wids to its notion
# expect
each indexer
to use ._wordinfo to map wids to its notion
# of a docid-to-weight map.
# There are two kinds of OOV words: wid 0 is explicitly OOV,
# and it's possible that the lexicon will return a non-zero wid
...
...
@@ -64,6 +64,12 @@ class BaseIndex(Persistent):
# wid 0 must not be a key in _wordinfo.
self
.
_wordinfo
=
IOBTree
()
# docid -> weight
# Different indexers have different notions of doc weight, but we
# expect each indexer to use ._docweight to map docids to its
# notion of what a doc weight is.
self
.
_docweight
=
IIBTree
()
# docid -> WidCode'd list of wids
# Used for un-indexing, and for phrase search.
self
.
_docwords
=
IOBTree
()
...
...
lib/python/Products/ZCTextIndex/CosineIndex.py
View file @
38ba01b6
...
...
@@ -54,8 +54,8 @@ class CosineIndex(BaseIndex):
# ._wordinfo for cosine is wid -> {docid -> weight};
# t -> D -> w(d, t)/W(d)
# ._docweight for cosine is
# docid -> W(docid)
self
.
_docweight
=
IIBTree
()
# Most of the computation for computing a relevance score for the
# document occurs in the search() method. The code currently
...
...
lib/python/Products/ZCTextIndex/OkapiIndex.py
View file @
38ba01b6
...
...
@@ -63,20 +63,20 @@ class OkapiIndex(BaseIndex):
# ._wordinfo for Okapi is
# wid -> {docid -> frequency}; t -> D -> f(D, t)
# ._docweight for Okapi is
# docid -> # of words in the doc
# This is just len(self._docwords[docid]), but _docwords is stored
# in compressed form, so uncompressing it just to count the list
# length would be ridiculously expensive.
self
.
_doclen
=
IIBTree
()
# sum(self._doc
len
.values()), the total # of words in all docs
# sum(self._doc
weight
.values()), the total # of words in all docs
# This is a long for "better safe than sorry" reasons. It isn't
# used often enough that speed should matter.
self
.
_totaldoclen
=
0L
def
index_doc
(
self
,
docid
,
text
):
wids
=
self
.
_lexicon
.
sourceToWordIds
(
text
)
self
.
_doc
len
[
docid
]
=
len
(
wids
)
self
.
_doc
weight
[
docid
]
=
len
(
wids
)
self
.
_totaldoclen
+=
len
(
wids
)
wid2count
=
self
.
_get_frequencies
(
wids
)
...
...
@@ -92,8 +92,8 @@ class OkapiIndex(BaseIndex):
del
self
.
_docwords
[
docid
]
count
=
self
.
_doc
len
[
docid
]
del
self
.
_doc
len
[
docid
]
count
=
self
.
_doc
weight
[
docid
]
del
self
.
_doc
weight
[
docid
]
self
.
_totaldoclen
-=
count
# The workhorse. Return a list of (IIBucket, weight) pairs, one pair
...
...
@@ -105,7 +105,7 @@ class OkapiIndex(BaseIndex):
def
_search_wids
(
self
,
wids
):
if
not
wids
:
return
[]
N
=
float
(
len
(
self
.
_doc
len
))
# total # of docs
N
=
float
(
len
(
self
.
_doc
weight
))
# total # of docs
meandoclen
=
self
.
_totaldoclen
/
N
K1
=
self
.
K1
B
=
self
.
B
...
...
@@ -117,7 +117,7 @@ class OkapiIndex(BaseIndex):
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
L
=
[]
docid2len
=
self
.
_doc
len
docid2len
=
self
.
_doc
weight
for
t
in
wids
:
assert
self
.
_wordinfo
.
has_key
(
t
)
# caller responsible for OOV
d2f
=
self
.
_wordinfo
[
t
]
# map {docid -> f(docid, t)}
...
...
lib/python/Products/ZCTextIndex/tests/testIndex.py
View file @
38ba01b6
...
...
@@ -18,25 +18,11 @@ from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from
Products.ZCTextIndex.CosineIndex
import
CosineIndex
from
Products.ZCTextIndex.OkapiIndex
import
OkapiIndex
# The cosine and Okapi indices have the same public interfaces, but these
# tests access internal attributes, and those aren't identical.
# The IndexTest class is abstract, and subclasses must implement the
# check_docid_known and num_docs_known methods. CosineIndexTest (later in
# this file) does those in terms of ._docweight, while OkapiIndexTest
# (later in this file) does them in terms of ._doclen.
# Subclasses must set a class variable IndexFactory to the appropriate
# index object constructor.
class
IndexTest
(
TestCase
):
# Subclasses must implement these methods, and set a class variable
# IndexFactory to the appropriate index object constructor.
def
check_docid_known
(
self
,
DOCID
):
raise
NotImplementedError
def
num_docs_known
(
self
):
raise
NotImplementedError
def
setUp
(
self
):
self
.
lexicon
=
Lexicon
(
Splitter
())
self
.
index
=
self
.
IndexFactory
(
self
.
lexicon
)
...
...
@@ -44,8 +30,8 @@ class IndexTest(TestCase):
def
test_index_document
(
self
,
DOCID
=
1
):
doc
=
"simple document contains five words"
self
.
index
.
index_doc
(
DOCID
,
doc
)
self
.
check_docid_known
(
DOCID
)
self
.
assertEqual
(
self
.
num_docs_known
(
),
1
)
self
.
assert_
(
self
.
index
.
_docweight
[
DOCID
]
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
5
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
get_words
(
DOCID
)),
5
)
...
...
@@ -57,7 +43,7 @@ class IndexTest(TestCase):
DOCID
=
1
self
.
test_index_document
(
DOCID
)
self
.
index
.
unindex_doc
(
DOCID
)
self
.
assertEqual
(
self
.
num_docs_known
(
),
0
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
0
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
0
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
0
)
...
...
@@ -66,8 +52,8 @@ class IndexTest(TestCase):
doc
=
"another document just four"
DOCID
=
2
self
.
index
.
index_doc
(
DOCID
,
doc
)
self
.
check_docid_known
(
DOCID
)
self
.
assertEqual
(
self
.
num_docs_known
(
),
2
)
self
.
assert_
(
self
.
index
.
_docweight
[
DOCID
]
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
2
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
8
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
2
)
self
.
assertEqual
(
len
(
self
.
index
.
get_words
(
DOCID
)),
4
)
...
...
@@ -87,8 +73,8 @@ class IndexTest(TestCase):
self
.
test_index_two_documents
()
self
.
index
.
unindex_doc
(
1
)
DOCID
=
2
self
.
assertEqual
(
self
.
num_docs_known
(
),
1
)
self
.
check_docid_known
(
DOCID
)
self
.
assertEqual
(
len
(
self
.
index
.
_docweight
),
1
)
self
.
assert_
(
self
.
index
.
_docweight
[
DOCID
]
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
4
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
get_words
(
DOCID
)),
4
)
...
...
@@ -99,7 +85,7 @@ class IndexTest(TestCase):
def
test_index_duplicated_words
(
self
,
DOCID
=
1
):
doc
=
"very simple repeat repeat repeat document test"
self
.
index
.
index_doc
(
DOCID
,
doc
)
self
.
check_docid_known
(
DOCID
)
self
.
assert_
(
self
.
index
.
_docweight
[
DOCID
]
)
self
.
assertEqual
(
len
(
self
.
index
.
_wordinfo
),
5
)
self
.
assertEqual
(
len
(
self
.
index
.
_docwords
),
1
)
self
.
assertEqual
(
len
(
self
.
index
.
get_words
(
DOCID
)),
7
)
...
...
@@ -144,23 +130,9 @@ class IndexTest(TestCase):
class
CosineIndexTest
(
IndexTest
):
IndexFactory
=
CosineIndex
def
check_docid_known
(
self
,
docid
):
self
.
assert_
(
self
.
index
.
_docweight
.
has_key
(
docid
))
self
.
assert_
(
self
.
index
.
_docweight
[
docid
]
>
0
)
def
num_docs_known
(
self
):
return
len
(
self
.
index
.
_docweight
)
class
OkapiIndexTest
(
IndexTest
):
IndexFactory
=
OkapiIndex
def
check_docid_known
(
self
,
docid
):
self
.
assert_
(
self
.
index
.
_doclen
.
has_key
(
docid
))
self
.
assert_
(
self
.
index
.
_doclen
[
docid
]
>
0
)
def
num_docs_known
(
self
):
return
len
(
self
.
index
.
_doclen
)
def
test_suite
():
return
TestSuite
((
makeSuite
(
CosineIndexTest
),
makeSuite
(
OkapiIndexTest
),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment