Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Z
Zope
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
Zope
Commits
86fc53ee
Commit
86fc53ee
authored
May 17, 2002
by
Tim Peters
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Reindex docs touching as few docid->w(docid, w) maps as possible.
parent
bad257b8
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
69 additions
and
8 deletions
+69
-8
lib/python/Products/ZCTextIndex/BaseIndex.py
lib/python/Products/ZCTextIndex/BaseIndex.py
+41
-2
lib/python/Products/ZCTextIndex/OkapiIndex.py
lib/python/Products/ZCTextIndex/OkapiIndex.py
+5
-0
lib/python/Products/ZCTextIndex/tests/testZCTextIndex.py
lib/python/Products/ZCTextIndex/tests/testZCTextIndex.py
+23
-6
No files found.
lib/python/Products/ZCTextIndex/BaseIndex.py
View file @
86fc53ee
...
@@ -19,6 +19,7 @@ import math
...
@@ -19,6 +19,7 @@ import math
from
BTrees.IOBTree
import
IOBTree
from
BTrees.IOBTree
import
IOBTree
from
BTrees.IIBTree
import
IIBTree
,
IIBucket
,
IITreeSet
from
BTrees.IIBTree
import
IIBTree
,
IIBucket
,
IITreeSet
from
BTrees.IIBTree
import
intersection
,
difference
from
Products.ZCTextIndex.IIndex
import
IIndex
from
Products.ZCTextIndex.IIndex
import
IIndex
from
Products.ZCTextIndex
import
WidCode
from
Products.ZCTextIndex
import
WidCode
...
@@ -91,8 +92,7 @@ class BaseIndex(Persistent):
...
@@ -91,8 +92,7 @@ class BaseIndex(Persistent):
# A subclass may wish to extend or override this.
# A subclass may wish to extend or override this.
def
index_doc
(
self
,
docid
,
text
):
def
index_doc
(
self
,
docid
,
text
):
if
self
.
_docwords
.
has_key
(
docid
):
if
self
.
_docwords
.
has_key
(
docid
):
# XXX Do something smarter than this.
return
self
.
_reindex_doc
(
docid
,
text
)
self
.
unindex_doc
(
docid
)
wids
=
self
.
_lexicon
.
sourceToWordIds
(
text
)
wids
=
self
.
_lexicon
.
sourceToWordIds
(
text
)
wid2weight
,
docweight
=
self
.
_get_frequencies
(
wids
)
wid2weight
,
docweight
=
self
.
_get_frequencies
(
wids
)
for
wid
,
weight
in
wid2weight
.
items
():
for
wid
,
weight
in
wid2weight
.
items
():
...
@@ -101,6 +101,45 @@ class BaseIndex(Persistent):
...
@@ -101,6 +101,45 @@ class BaseIndex(Persistent):
self
.
_docwords
[
docid
]
=
WidCode
.
encode
(
wids
)
self
.
_docwords
[
docid
]
=
WidCode
.
encode
(
wids
)
return
len
(
wids
)
return
len
(
wids
)
# A subclass may wish to extend or override this. This is for adjusting
# to a new version of a doc that already exists. The goal is to be
# faster than simply unindexing the old version in its entirety and then
# adding the new version in its entirety.
def _reindex_doc(self, docid, text):
    """Re-index `docid`, which is already indexed, against new `text`.

    Rather than unindexing the stored version wholesale and indexing the
    new one from scratch, compare the per-word weights of both versions
    and update only the ._wordinfo entries that actually differ.

    Returns the number of word ids produced from `text`.
    """
    # Weights for the version currently stored.  The old doc weight is
    # not needed here, so it is discarded.
    prev_weights, _ = self._get_frequencies(self.get_words(docid))

    # Weights for the replacement text.
    fresh_wids = self._lexicon.sourceToWordIds(text)
    fresh_weights, fresh_docweight = self._get_frequencies(fresh_wids)

    # Partition the word ids into: shared by both versions, present only
    # in the old version, present only in the new version.
    prev_set = IITreeSet(prev_weights.keys())
    fresh_set = IITreeSet(fresh_weights.keys())
    shared = intersection(prev_set, fresh_set)
    dropped = difference(prev_set, shared)
    added = difference(fresh_set, shared)
    del prev_set, fresh_set

    # Words that vanished from the doc.
    for wid in dropped.keys():
        self._del_wordinfo(wid, docid)

    # Words that are new to the doc.
    for wid in added.keys():
        self._add_wordinfo(wid, fresh_weights[wid], docid)

    # Words in both versions: rewrite the entry only when the weight moved.
    # For the Okapi indexer, that happens only for words whose counts have
    # changed.  For the cosine indexer, it may happen for every wid, since
    # W(d) probably changed and W(d) is divided into every score.
    for wid in shared.keys():
        score = fresh_weights[wid]
        if score == prev_weights[wid]:
            continue
        self._add_wordinfo(wid, score, docid)

    # Record the new doc weight and the new encoded word-id sequence.
    self._docweight[docid] = fresh_docweight
    self._docwords[docid] = WidCode.encode(fresh_wids)
    return len(fresh_wids)
# Subclass must override.
# Subclass must override.
def
_get_frequencies
(
self
,
wids
):
def
_get_frequencies
(
self
,
wids
):
# Compute term frequencies and a doc weight, whatever those mean
# Compute term frequencies and a doc weight, whatever those mean
...
...
lib/python/Products/ZCTextIndex/OkapiIndex.py
View file @
86fc53ee
...
@@ -54,6 +54,11 @@ class OkapiIndex(BaseIndex):
...
@@ -54,6 +54,11 @@ class OkapiIndex(BaseIndex):
def
index_doc
(
self
,
docid
,
text
):
def
index_doc
(
self
,
docid
,
text
):
count
=
BaseIndex
.
index_doc
(
self
,
docid
,
text
)
count
=
BaseIndex
.
index_doc
(
self
,
docid
,
text
)
self
.
_totaldoclen
+=
count
self
.
_totaldoclen
+=
count
return
count
def _reindex_doc(self, docid, text):
    """Re-index an already-indexed doc, keeping ._totaldoclen in step.

    The weight currently stored for `docid` is backed out of
    ._totaldoclen before delegating to BaseIndex._reindex_doc, whose
    return value is passed through unchanged.  This method does not add
    the new length itself; index_doc adds the count it gets back.
    """
    # NOTE(review): presumably ._docweight[docid] holds the doc's length
    # for this index (unindex_doc subtracts the same value) -- confirm.
    stale_weight = self._docweight[docid]
    self._totaldoclen -= stale_weight
    return BaseIndex._reindex_doc(self, docid, text)
def
unindex_doc
(
self
,
docid
):
def
unindex_doc
(
self
,
docid
):
self
.
_totaldoclen
-=
self
.
_docweight
[
docid
]
self
.
_totaldoclen
-=
self
.
_docweight
[
docid
]
...
...
lib/python/Products/ZCTextIndex/tests/testZCTextIndex.py
View file @
86fc53ee
...
@@ -142,18 +142,29 @@ class CosineIndexTests(ZCIndexTestsBase, testIndex.CosineIndexTest):
...
@@ -142,18 +142,29 @@ class CosineIndexTests(ZCIndexTestsBase, testIndex.CosineIndexTest):
def testRanking(self):
def testRanking(self):
self.words = ["
cold
", "
days
", "
eat
", "
hot
", "
lot
", "
nine
", "
old
",
self.words = ["
cold
", "
days
", "
eat
", "
hot
", "
lot
", "
nine
", "
old
",
"
pease
", "
porridge
", "
pot
"]
"
pease
", "
porridge
", "
pot
"]
self.docs = ["
Pease
porridge
hot
,
pease
porridge
cold
,
",
"
Pease
porridge
in
the
pot
,
",
"
Nine
days
old
.
",
"
In
the
pot
cold
,
in
the
pot
hot
,
",
"
Pease
porridge
,
pease
porridge
,
",
"
Eat
the
lot
.
"]
self._ranking_index()
self._ranking_index()
self._ranking_tf()
self._ranking_tf()
self._ranking_idf()
self._ranking_idf()
self._ranking_queries()
self._ranking_queries()
# A digression to exercise re-indexing. This should leave
# things exactly as they were.
docs = self.docs
for variant in ("
hot
cold
porridge
python
", "
pease
hot
pithy
",
docs[-1]):
self.zc_index.index_object(len(docs), Indexable(variant))
self._ranking_tf()
self._ranking_idf()
self._ranking_queries()
def _ranking_index(self):
def _ranking_index(self):
docs = ["
Pease
porridge
hot
,
pease
porridge
cold
,
",
docs = self.docs
"
Pease
porridge
in
the
pot
,
",
"
Nine
days
old
.
",
"
In
the
pot
cold
,
in
the
pot
hot
,
",
"
Pease
porridge
,
pease
porridge
,
",
"
Eat
the
lot
.
"]
for i in range(len(docs)):
for i in range(len(docs)):
self.zc_index.index_object(i + 1, Indexable(docs[i]))
self.zc_index.index_object(i + 1, Indexable(docs[i]))
...
@@ -220,6 +231,12 @@ class OkapiIndexTests(ZCIndexTestsBase, testIndex.OkapiIndexTest):
...
@@ -220,6 +231,12 @@ class OkapiIndexTests(ZCIndexTestsBase, testIndex.OkapiIndexTest):
"
one
two
three
"]
"
one
two
three
"]
for i in range(len(docs)):
for i in range(len(docs)):
self.zc_index.index_object(i + 1, Indexable(docs[i]))
self.zc_index.index_object(i + 1, Indexable(docs[i]))
# A brief digression to exercise re-indexing. This should leave
# things exactly as they were.
for variant in "
one
xyz
", "
xyz
two
three
", "
abc
def
", docs[-1]:
self.zc_index.index_object(len(docs), Indexable(variant))
self.assertEqual(self.index._totaldoclen, 6)
self.assertEqual(self.index._totaldoclen, 6)
# So the mean doc length is 2. We use that later.
# So the mean doc length is 2. We use that later.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment