Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Z
Zope
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
Zope
Commits
61e89f2f
Commit
61e89f2f
authored
May 14, 2002
by
Guido van Rossum
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Merged TextIndexDS9-branch into trunk.
parent
a340cb9d
Changes
35
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
35 changed files
with
4054 additions
and
0 deletions
+4054
-0
lib/python/Products/ZCTextIndex/HTMLSplitter.py
lib/python/Products/ZCTextIndex/HTMLSplitter.py
+41
-0
lib/python/Products/ZCTextIndex/IIndex.py
lib/python/Products/ZCTextIndex/IIndex.py
+58
-0
lib/python/Products/ZCTextIndex/ILexicon.py
lib/python/Products/ZCTextIndex/ILexicon.py
+51
-0
lib/python/Products/ZCTextIndex/INBest.py
lib/python/Products/ZCTextIndex/INBest.py
+73
-0
lib/python/Products/ZCTextIndex/IPipelineElement.py
lib/python/Products/ZCTextIndex/IPipelineElement.py
+23
-0
lib/python/Products/ZCTextIndex/IQueryParser.py
lib/python/Products/ZCTextIndex/IQueryParser.py
+63
-0
lib/python/Products/ZCTextIndex/ISplitter.py
lib/python/Products/ZCTextIndex/ISplitter.py
+21
-0
lib/python/Products/ZCTextIndex/Index.py
lib/python/Products/ZCTextIndex/Index.py
+286
-0
lib/python/Products/ZCTextIndex/Lexicon.py
lib/python/Products/ZCTextIndex/Lexicon.py
+139
-0
lib/python/Products/ZCTextIndex/NBest.py
lib/python/Products/ZCTextIndex/NBest.py
+76
-0
lib/python/Products/ZCTextIndex/OkapiIndex.py
lib/python/Products/ZCTextIndex/OkapiIndex.py
+431
-0
lib/python/Products/ZCTextIndex/ParseTree.py
lib/python/Products/ZCTextIndex/ParseTree.py
+128
-0
lib/python/Products/ZCTextIndex/QueryParser.py
lib/python/Products/ZCTextIndex/QueryParser.py
+198
-0
lib/python/Products/ZCTextIndex/RiceCode.py
lib/python/Products/ZCTextIndex/RiceCode.py
+194
-0
lib/python/Products/ZCTextIndex/Setup
lib/python/Products/ZCTextIndex/Setup
+2
-0
lib/python/Products/ZCTextIndex/StopDict.py
lib/python/Products/ZCTextIndex/StopDict.py
+22
-0
lib/python/Products/ZCTextIndex/WidCode.py
lib/python/Products/ZCTextIndex/WidCode.py
+114
-0
lib/python/Products/ZCTextIndex/ZCTextIndex.py
lib/python/Products/ZCTextIndex/ZCTextIndex.py
+107
-0
lib/python/Products/ZCTextIndex/__init__.py
lib/python/Products/ZCTextIndex/__init__.py
+28
-0
lib/python/Products/ZCTextIndex/dtml/addLexicon.dtml
lib/python/Products/ZCTextIndex/dtml/addLexicon.dtml
+74
-0
lib/python/Products/ZCTextIndex/dtml/addZCTextIndex.dtml
lib/python/Products/ZCTextIndex/dtml/addZCTextIndex.dtml
+97
-0
lib/python/Products/ZCTextIndex/dtml/manageZCTextIndex.dtml
lib/python/Products/ZCTextIndex/dtml/manageZCTextIndex.dtml
+9
-0
lib/python/Products/ZCTextIndex/stopper.c
lib/python/Products/ZCTextIndex/stopper.c
+182
-0
lib/python/Products/ZCTextIndex/tests/__init__.py
lib/python/Products/ZCTextIndex/tests/__init__.py
+18
-0
lib/python/Products/ZCTextIndex/tests/hs-tool.py
lib/python/Products/ZCTextIndex/tests/hs-tool.py
+126
-0
lib/python/Products/ZCTextIndex/tests/indexhtml.py
lib/python/Products/ZCTextIndex/tests/indexhtml.py
+95
-0
lib/python/Products/ZCTextIndex/tests/mailtest.py
lib/python/Products/ZCTextIndex/tests/mailtest.py
+232
-0
lib/python/Products/ZCTextIndex/tests/mhindex.py
lib/python/Products/ZCTextIndex/tests/mhindex.py
+411
-0
lib/python/Products/ZCTextIndex/tests/testIndex.py
lib/python/Products/ZCTextIndex/tests/testIndex.py
+127
-0
lib/python/Products/ZCTextIndex/tests/testLexicon.py
lib/python/Products/ZCTextIndex/tests/testLexicon.py
+121
-0
lib/python/Products/ZCTextIndex/tests/testNBest.py
lib/python/Products/ZCTextIndex/tests/testNBest.py
+89
-0
lib/python/Products/ZCTextIndex/tests/testQueryEngine.py
lib/python/Products/ZCTextIndex/tests/testQueryEngine.py
+70
-0
lib/python/Products/ZCTextIndex/tests/testQueryParser.py
lib/python/Products/ZCTextIndex/tests/testQueryParser.py
+131
-0
lib/python/Products/ZCTextIndex/tests/testZCTextIndex.py
lib/python/Products/ZCTextIndex/tests/testZCTextIndex.py
+172
-0
lib/python/Products/ZCTextIndex/tests/wordstats.py
lib/python/Products/ZCTextIndex/tests/wordstats.py
+45
-0
No files found.
lib/python/Products/ZCTextIndex/HTMLSplitter.py
0 → 100644
View file @
61e89f2f
from
Products.ZCTextIndex.ISplitter
import
ISplitter
import
re
class HTMLSplitter:
    """Splitter that strips SGML/HTML tags before whitespace-splitting."""

    __implements__ = ISplitter

    def process(self, text):
        # Replace every tag (anything between '<' and '>') with a space,
        # then break the remaining text on whitespace.
        stripped = re.sub('<[^>]*>', ' ', text)
        return stripped.split()
class HTMLWordSplitter:
    """Splitter that drops markup and keeps only plausible words."""

    __implements__ = ISplitter

    def process(self, text):
        # 'text' is a sequence of strings; gather the words of each chunk.
        words = []
        for chunk in text:
            words.extend(self.split(chunk))
        return words

    def split(self, text):
        text = text.lower()
        # Blank out tags, character entities, and runs of non-word chars.
        for pattern in ("<[^>]*>", "&[A-Za-z]+;", "\\W+"):
            text = re.sub(pattern, " ", text)
        has_letter = re.compile("[A-Za-z]").search
        # Keep only words of at least two characters containing a letter.
        return [word for word in text.split()
                if len(word) > 1 and has_letter(word)]
if __name__ == "__main__":
    import sys
    # Ad-hoc driver: split each file named on the command line and
    # dump the resulting word list.
    splitter = HTMLWordSplitter()
    for path in sys.argv[1:]:
        f = open(path, "rb")
        buf = f.read()
        f.close()
        print(path)
        print(splitter.process([buf]))
lib/python/Products/ZCTextIndex/IIndex.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Index Interface."""
import
Interface
class IIndex(Interface.Base):
    """Interface for an Index."""

    def search(term):
        """Execute a search on a single term given as a string.

        Return an IIBucket.
        """

    def search_phrase(phrase):
        """Execute a search on a phrase given as a string.

        Return an IIBucket.
        """

    def search_glob(pattern):
        """Execute a pattern search.

        The pattern represents a set of words by using * and ?.  For
        example, "foo*" represents the set of all words in the lexicon
        starting with "foo".

        NOTE: Currently only a single trailing * is supported.

        Return an IIBucket.
        """

    def query_weight(terms):
        """Return the weight for a set of query terms.

        'terms' is a sequence of all terms included in the query,
        although not terms with a not.  If a term appears more than
        once in a query, it should appear more than once in terms.
        """

    def index_doc(docid, text):
        """Add the document with integer id 'docid' to the index.

        NOTE(review): contract was only "XXX" in the original; the
        exact semantics of 'text' should be confirmed against the
        Index implementations.
        """

    def unindex_doc(docid):
        """Remove the document with integer id 'docid' from the index.

        NOTE(review): contract was only "XXX" in the original; confirm
        behavior for unknown docids against the Index implementations.
        """
lib/python/Products/ZCTextIndex/ILexicon.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from
Interface
import
Base
as
Interface
class ILexicon(Interface):
    """Object responsible for converting text to word identifiers."""

    def termToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parses the text as if they are search terms, and skips words that
        aren't in the lexicon.
        """

    def sourceToWordIds(text):
        """Return a sequence of ids of the words parsed from the text.

        The input text may be either a string or a list of strings.

        Parses the text as if they come from a source document, and creates
        new word ids for words that aren't (yet) in the lexicon.
        """

    def globToWordIds(pattern):
        """Return a sequence of ids of words matching the pattern.

        The argument should be a single word using globbing syntax,
        e.g. 'foo*' meaning anything starting with 'foo'.

        NOTE: Currently only a single trailing * is supported.

        Returns the wids for all words in the lexicon that match the
        pattern.
        """

    def length():
        """Return the number of unique terms in the lexicon."""
lib/python/Products/ZCTextIndex/INBest.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""NBest Interface.
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
import
Interface
class INBest(Interface.Base):
    """Interface for an N-Best chooser."""

    def add(item, score):
        """Record that item 'item' has score 'score'.  No return value.

        The N best-scoring items are remembered, where N was passed to
        the constructor.  'item' can be anything.  'score' should be
        a number, and larger numbers are considered better.
        """

    def addmany(sequence):
        """Like "for item, score in sequence: self.add(item, score)".

        This is simply faster than calling add() len(seq) times.
        """

    def getbest():
        """Return the (at most) N best-scoring items as a sequence.

        The return value is a sequence of 2-tuples, (item, score), with
        the largest score first.  If .add() has been called fewer than
        N times, this sequence will contain fewer than N pairs.
        """

    def pop_smallest():
        """Return and remove the (item, score) pair with lowest score.

        If len(self) is 0, raise IndexError.

        To be clearer, this is the lowest score among the N best-scoring
        seen so far.  This is most useful if the capacity of the NBest
        object is never exceeded, in which case pop_smallest() allows
        using the object as an ordinary smallest-in-first-out priority
        queue.
        """

    def __len__():
        """Return the number of (item, score) pairs currently known.

        This is N (the value passed to the constructor), unless .add()
        has been called fewer than N times.
        """

    def capacity():
        """Return the maximum number of (item, score) pairs.

        This is N (the value passed to the constructor).
        """
lib/python/Products/ZCTextIndex/IPipelineElement.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from
Interface
import
Base
as
Interface
class IPipelineElement(Interface):
    """Interface for one stage of a lexicon's text-processing pipeline."""

    def process(source):
        """Provide a text processing step.

        Process a source sequence of words into a result sequence.
        """
lib/python/Products/ZCTextIndex/IQueryParser.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser Interface."""
import
Interface
class IQueryParser(Interface.Base):
    """Interface for Query Parsers."""

    def parseQuery(query):
        """Parse a query string.

        Return a parse tree (which implements IQueryParseTree).

        May raise ParseTree.ParseError.
        """
class IQueryParseTree(Interface.Base):
    """Interface for parse trees returned by parseQuery()."""

    def nodeType():
        """Return the node type.

        This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
        """

    def getValue():
        """Return a node-type specific value.

        For node type:  Return:
        'AND'           a list of parse trees
        'OR'            a list of parse trees
        'NOT'           a parse tree
        'ATOM'          a string (representing a single search term)
        'PHRASE'        a string (representing a search phrase)
        'GLOB'          a string (representing a pattern, e.g. "foo*")
        """

    def terms():
        """Return a list of all terms in this node, excluding NOT subtrees."""

    def executeQuery(index):
        """Execute the query represented by this node against the index.

        The index argument must implement the IIndex interface.

        Return an IIBucket or IIBTree mapping document ids to scores
        (higher scores mean better results).

        May raise ParseTree.QueryError.
        """
lib/python/Products/ZCTextIndex/ISplitter.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from
Interface
import
Base
as
Interface
class ISplitter(Interface):
    """A splitter."""

    def process(text):
        """Run the splitter over the input text, returning a list of terms."""
lib/python/Products/ZCTextIndex/Index.py
0 → 100644
View file @
61e89f2f
This diff is collapsed.
Click to expand it.
lib/python/Products/ZCTextIndex/Lexicon.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
import
re
from
BTrees.IOBTree
import
IOBTree
from
BTrees.OIBTree
import
OIBTree
from
Products.ZCTextIndex.ILexicon
import
ILexicon
from
Products.ZCTextIndex.StopDict
import
get_stopdict
class Lexicon:
    """Map words to integer word ids (wids) and back.

    Text passed in is pushed through the pipeline elements given to the
    constructor before ids are looked up or assigned.
    """

    __implements__ = ILexicon

    def __init__(self, *pipeline):
        self.__wids = OIBTree()   # word -> wid
        self.__words = IOBTree()  # wid -> word
        # XXX we're reserving wid 0, but that might be yagni
        self.__nextwid = 1
        self.__pipeline = pipeline

    def length(self):
        """Return the number of unique terms in the lexicon."""
        # wids are assigned densely starting at 1, so the next free wid
        # also counts the words seen so far.
        return self.__nextwid - 1

    def words(self):
        return self.__wids.keys()

    def wids(self):
        return self.__words.keys()

    def items(self):
        return self.__wids.items()

    def sourceToWordIds(self, text):
        """Return wids for source text, creating new wids as needed."""
        last = _text2list(text)
        for element in self.__pipeline:
            last = element.process(last)
        return map(self._getWordIdCreate, last)

    def termToWordIds(self, text):
        """Return wids for search terms; words not in the lexicon are skipped."""
        last = _text2list(text)
        for element in self.__pipeline:
            last = element.process(last)
        wids = []
        for word in last:
            wid = self.__wids.get(word)
            if wid is not None:
                wids.append(wid)
        return wids

    def globToWordIds(self, pattern):
        """Return wids of all words starting with the 'foo*' pattern's prefix."""
        if not re.match(r"^\w+\*$", pattern):
            return []
        pattern = pattern.lower()
        assert pattern.endswith("*")
        prefix = pattern[:-1]
        assert prefix and not prefix.endswith("*")
        keys = self.__wids.keys(prefix)  # Keys starting at prefix
        wids = []
        for key in keys:
            if not key.startswith(prefix):
                break
            wids.append(self.__wids[key])
        return wids

    def _getWordIdCreate(self, word):
        # Look up 'word', assigning a fresh wid on first sight.
        wid = self.__wids.get(word)
        if wid is None:
            wid = self.__new_wid()
            self.__wids[word] = wid
            self.__words[wid] = word
        return wid

    def __new_wid(self):
        wid = self.__nextwid
        self.__nextwid += 1
        return wid
def
_text2list
(
text
):
# Helper: splitter input may be a string or a list of strings
try
:
text
+
""
except
:
return
text
else
:
return
[
text
]
# Sample pipeline elements
class Splitter:
    """Sample pipeline element: break each string into \\w+ word runs."""

    import re
    rx = re.compile(r"\w+")

    def process(self, lst):
        words = []
        for chunk in lst:
            words.extend(self.rx.findall(chunk))
        return words
class CaseNormalizer:
    """Sample pipeline element: fold every word to lower case."""

    def process(self, lst):
        return [word.lower() for word in lst]
class StopWordRemover:
    """Sample pipeline element: drop stop words and single characters."""

    # NOTE: 'dict' intentionally shadows the builtin -- the name is part
    # of the public interface (the C-stopper fallback below reads it).
    dict = get_stopdict().copy()
    for c in range(255):
        dict[chr(c)] = None

    def process(self, lst):
        # 'in' replaces the Python-2-only dict.has_key; same behavior,
        # and it also works on Python 3.
        stop = self.dict
        return [w for w in lst if w not in stop]
try:
    # Prefer the C implementation of the stop-word remover if built.
    from Products.ZCTextIndex import stopper as _stopper
except ImportError:
    pass
else:
    # Shadow the Python StopWordRemover class with a factory returning
    # the C version, primed with the same stop-word dictionary.
    _stopwords = StopWordRemover.dict
    def StopWordRemover():
        swr = _stopper.new()
        swr.dict.update(_stopwords)
        return swr
lib/python/Products/ZCTextIndex/NBest.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""NBest
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
from
bisect
import
bisect
from
Products.ZCTextIndex.INBest
import
INBest
class NBest:
    """Remember the N best-scoring items ever added, in sorted order."""

    __implements__ = INBest

    def __init__(self, N):
        "Build an NBest object to remember the N best-scoring objects."
        if N < 1:
            raise ValueError("NBest() argument must be at least 1")
        self._capacity = N
        # This does a very simple thing with sorted lists.  For large
        # N, a min-heap can be unboundedly better in terms of data
        # movement time.
        self.scores = []   # ascending scores
        self.items = []    # items, parallel to self.scores

    def __len__(self):
        return len(self.scores)

    def capacity(self):
        return self._capacity

    def add(self, item, score):
        self.addmany([(item, score)])

    def addmany(self, sequence):
        scores, items, capacity = self.scores, self.items, self._capacity
        n = len(scores)
        for item, score in sequence:
            # When we're in steady-state, the usual case is that we're filled
            # to capacity, and that an incoming item is worse than any of
            # the best-seen so far.
            if n >= capacity and score <= scores[0]:
                continue
            where = bisect(scores, score)
            scores.insert(where, score)
            items.insert(where, item)
            if n == capacity:
                # Evict the current worst to stay at capacity.
                del items[0], scores[0]
            else:
                n += 1
        assert n == len(scores)

    def getbest(self):
        """Return (item, score) pairs, best (largest score) first."""
        pairs = zip(self.items, self.scores)
        pairs.reverse()
        return pairs

    def pop_smallest(self):
        """Remove and return the worst (item, score) pair currently held."""
        if self.scores:
            return self.items.pop(0), self.scores.pop(0)
        raise IndexError("pop_smallest() called on empty NBest object")
lib/python/Products/ZCTextIndex/OkapiIndex.py
0 → 100644
View file @
61e89f2f
This diff is collapsed.
Click to expand it.
lib/python/Products/ZCTextIndex/ParseTree.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Generic parser support: exception and parse tree nodes."""
from
BTrees.IIBTree
import
difference
,
weightedIntersection
,
weightedUnion
from
Products.ZCTextIndex.NBest
import
NBest
class QueryError(Exception):
    """Error raised while executing a parsed query against an index."""


class ParseError(Exception):
    """Error raised while parsing a query string."""
class ParseTreeNode:
    """Base class for query parse-tree nodes."""

    _nodeType = None

    def __init__(self, value):
        self._value = value

    def nodeType(self):
        return self._nodeType

    def getValue(self):
        return self._value

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self.getValue())

    def terms(self):
        # Default: collect the terms of every child node.
        collected = []
        for child in self.getValue():
            collected.extend(child.terms())
        return collected

    def executeQuery(self, index):
        # Concrete node classes must override this.
        raise NotImplementedError
class NotNode(ParseTreeNode):
    """Parse-tree node for a NOT subquery."""

    _nodeType = "NOT"

    def terms(self):
        # A NOT subtree contributes no positive terms.
        return []

    def executeQuery(self, index):
        # NOT is only usable as a filter inside an AND; AndNode handles
        # it directly and never calls this.  Call-form raise replaces the
        # Python-2-only "raise X, msg" (matches NBest.py's style and is
        # valid on both Python 2 and 3).
        raise QueryError("NOT operator must occur right after AND")
class AndNode(ParseTreeNode):
    """Parse-tree node for an AND of subqueries (with optional NOTs)."""

    _nodeType = "AND"

    def executeQuery(self, index):
        # Separate positive subqueries from NOT subqueries.
        positives = []
        negatives = []
        for subnode in self.getValue():
            if subnode.nodeType() == "NOT":
                negatives.append(subnode.getValue().executeQuery(index))
            else:
                positives.append(subnode.executeQuery(index))
        assert positives
        # Intersect smallest-first to keep intermediate results small.
        positives.sort(lambda x, y: cmp(len(x), len(y)))
        result = positives[0]
        for bucket in positives[1:]:
            dummy, result = weightedIntersection(result, bucket)
        if negatives:
            # Union the NOT results smallest-first, then subtract once.
            negatives.sort(lambda x, y: cmp(len(x), len(y)))
            excluded = negatives[0]
            for bucket in negatives[1:]:
                dummy, excluded = weightedUnion(excluded, bucket)
            result = difference(result, excluded)
        return result
class OrNode(ParseTreeNode):
    """Parse-tree node for an OR of subqueries."""

    _nodeType = "OR"

    def executeQuery(self, index):
        # Balance unions as closely as possible, smallest to largest.
        subnodes = self.getValue()
        merge = NBest(len(subnodes))
        for subnode in subnodes:
            bucket = subnode.executeQuery(index)
            merge.add(bucket, len(bucket))
        while len(merge) > 1:
            # Merge the two smallest so far, and add back to the queue.
            x, ignored = merge.pop_smallest()
            y, ignored = merge.pop_smallest()
            ignored, merged = weightedUnion(x, y)
            merge.add(merged, len(merged))
        final, ignored = merge.pop_smallest()
        return final
class AtomNode(ParseTreeNode):
    """Parse-tree node for a single search word."""

    _nodeType = "ATOM"

    def terms(self):
        # A single word is its own (only) term.
        return [self.getValue()]

    def executeQuery(self, index):
        return index.search(self.getValue())
class PhraseNode(AtomNode):
    """Parse-tree node for a multi-word phrase search."""

    _nodeType = "PHRASE"

    def executeQuery(self, index):
        return index.search_phrase(self.getValue())
class GlobNode(AtomNode):
    """Parse-tree node for a prefix ('foo*') pattern search."""

    _nodeType = "GLOB"

    def executeQuery(self, index):
        return index.search_glob(self.getValue())
lib/python/Products/ZCTextIndex/QueryParser.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser.
This particular parser recognizes the following syntax:
Start = OrExpr
OrExpr = AndExpr ('OR' AndExpr)*
AndExpr = Term ('AND' NotExpr)*
NotExpr = ['NOT'] Term
Term = '(' OrExpr ')' | ATOM+
The key words (AND, OR, NOT) are recognized in any mixture of case.
An ATOM is either:
+ A sequence of characters not containing whitespace or parentheses or
double quotes, and not equal to one of the key words 'AND', 'OR', 'NOT'; or
+ A non-empty string enclosed in double quotes. The interior of the string
can contain whitespace, parentheses and key words.
In addition, an ATOM may optionally be preceded by a hyphen, meaning
that it must not be present.
An unquoted ATOM may also end in a star. This is a primitive
"globbing" function, meaning to search for any word with a given
prefix.
When multiple consecutive ATOMs are found at the leaf level, they are
connected by an implied AND operator, and an unquoted leading hyphen
is interpreted as a NOT operator.
Summarizing the default operator rules:
- a sequence of words without operators implies AND, e.g. ``foo bar''
- double-quoted text implies phrase search, e.g. ``"foo bar"''
- words connected by punctuation implies phrase search, e.g. ``foo-bar''
- a leading hyphen implies NOT, e.g. ``foo -bar''
- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
- a trailing * means globbing (i.e. prefix search), e.g. ``foo*''
"""
import
re
import
ParseTree
# relative import
# Create unique symbols for token types.
_AND
=
intern
(
"AND"
)
_OR
=
intern
(
"OR"
)
_NOT
=
intern
(
"NOT"
)
_LPAREN
=
intern
(
"("
)
_RPAREN
=
intern
(
")"
)
_ATOM
=
intern
(
"ATOM"
)
_EOF
=
intern
(
"EOF"
)
# Map keyword string to token type.
_keywords
=
{
_AND
:
_AND
,
_OR
:
_OR
,
_NOT
:
_NOT
,
_LPAREN
:
_LPAREN
,
_RPAREN
:
_RPAREN
,
}
# Regular expression to tokenize.
_tokenizer_regex
=
re
.
compile
(
r"""
# a paren
[()]
# or an optional hyphen
| -?
# followed by
(?:
# a string
" [^"]* "
# or a non-empty stretch w/o whitespace, parens or double quotes
| [^()\
s
"]+
)
"""
,
re
.
VERBOSE
)
class QueryParser:
    """Recursive-descent parser for the query grammar in the module docstring."""

    def __init__(self):
        pass # This parser has no persistent state

    def parseQuery(self, query):
        """Parse 'query' and return a parse tree; raises ParseTree.ParseError."""
        # Lexical analysis.
        tokens = _tokenizer_regex.findall(query)
        self.__tokens = tokens
        # classify tokens
        self.__tokentypes = [_keywords.get(token.upper(), _ATOM)
                             for token in tokens]
        # add _EOF
        self.__tokens.append(_EOF)
        self.__tokentypes.append(_EOF)
        self.__index = 0

        # Syntactical analysis.
        tree = self._parseOrExpr()
        self._require(_EOF)
        return tree

    # Recursive descent parser

    def _require(self, tokentype):
        # Consume a token of the given type or fail.  Call-form raise
        # replaces the Python-2-only "raise X, msg" (consistent with
        # NBest.py's raise style; valid on Python 2 and 3).
        if not self._check(tokentype):
            t = self.__tokens[self.__index]
            msg = "Token %r required, %r found" % (tokentype, t)
            raise ParseTree.ParseError(msg)

    def _check(self, tokentype):
        # Consume the next token if it has the given type; return 1/0.
        if self.__tokentypes[self.__index] is tokentype:
            self.__index += 1
            return 1
        else:
            return 0

    def _peek(self, tokentype):
        # Report (without consuming) whether the next token has this type.
        return self.__tokentypes[self.__index] is tokentype

    def _get(self, tokentype):
        # Return the next token's text, requiring the given type.
        t = self.__tokens[self.__index]
        self._require(tokentype)
        return t

    def _parseOrExpr(self):
        # OrExpr = AndExpr ('OR' AndExpr)*
        L = []
        L.append(self._parseAndExpr())
        while self._check(_OR):
            L.append(self._parseAndExpr())
        if len(L) == 1:
            return L[0]
        else:
            return ParseTree.OrNode(L)

    def _parseAndExpr(self):
        # AndExpr = Term ('AND' NotExpr)*
        L = []
        L.append(self._parseTerm())
        while self._check(_AND):
            L.append(self._parseNotExpr())
        if len(L) == 1:
            return L[0]
        else:
            return ParseTree.AndNode(L)

    def _parseNotExpr(self):
        # NotExpr = ['NOT'] Term
        if self._check(_NOT):
            return ParseTree.NotNode(self._parseTerm())
        else:
            return self._parseTerm()

    def _parseTerm(self):
        # Term = '(' OrExpr ')' | ATOM+
        if self._check(_LPAREN):
            tree = self._parseOrExpr()
            self._require(_RPAREN)
        else:
            atoms = [self._get(_ATOM)]
            while self._peek(_ATOM):
                atoms.append(self._get(_ATOM))
            nodes = []
            nots = []
            for a in atoms:
                # Words within one atom (e.g. "foo-bar") become a phrase;
                # a trailing * makes a glob; a leading - makes a NOT.
                words = re.findall(r"\w+\*?", a)
                if not words:
                    continue
                if len(words) > 1:
                    n = ParseTree.PhraseNode(" ".join(words))
                elif words[0].endswith("*"):
                    n = ParseTree.GlobNode(words[0])
                else:
                    n = ParseTree.AtomNode(words[0])
                if a[0] == "-":
                    n = ParseTree.NotNode(n)
                    nots.append(n)
                else:
                    nodes.append(n)
            if not nodes:
                text = " ".join(atoms)
                msg = "At least one positive term required: %r" % text
                raise ParseTree.ParseError(msg)
            nodes.extend(nots)
            if len(nodes) == 1:
                tree = nodes[0]
            else:
                tree = ParseTree.AndNode(nodes)
        return tree
lib/python/Products/ZCTextIndex/RiceCode.py
0 → 100644
View file @
61e89f2f
"""Rice coding (a variant of Golomb coding)
Based on a Java implementation by Glen McCluskey described in a Usenix
;login: article at
http://www.usenix.org/publications/login/2000-4/features/java.html
McCluskey's article explains the approach as follows. The encoding
for a value x is represented as a unary part and a binary part. The
unary part is a sequence of 1 bits followed by a 0 bit. The binary
part encodes some of the lower bits of x-1.
The encoding is parameterized by a value m that describes how many
bits to store in the binary part. If most of the values are smaller
than 2**m then they can be stored in only m+1 bits.
Compute the length of the unary part, q, where
q = math.floor((x-1)/ 2 ** m)
Emit q 1 bits followed by a 0 bit.
Emit the lower m bits of x-1, treating x-1 as a binary value.
"""
import
array
class BitArray:
    """Growable sequence of bits backed by an array of unsigned bytes.

    Bits are addressed low-order-first within each byte.
    """

    def __init__(self, buf=None):
        # NOTE(review): 'buf' is accepted but never used -- kept for
        # interface compatibility; confirm against callers.
        self.bytes = array.array('B')
        self.nbits = 0      # total number of bits appended so far
        self.bitsleft = 0   # unused bit slots remaining in the last byte
        self._bind_tostring()

    def _bind_tostring(self):
        # array.tostring() was removed in Python 3.9; tobytes() (added
        # in 3.2) has identical behavior.  Bind whichever exists.
        try:
            self.tostring = self.bytes.tobytes
        except AttributeError:
            self.tostring = self.bytes.tostring

    def __getitem__(self, i):
        byte, offset = divmod(i, 8)
        mask = 2 ** offset
        if self.bytes[byte] & mask:
            return 1
        else:
            return 0

    def __setitem__(self, i, val):
        byte, offset = divmod(i, 8)
        mask = 2 ** offset
        if val:
            self.bytes[byte] |= mask
        else:
            self.bytes[byte] &= ~mask

    def __len__(self):
        return self.nbits

    def append(self, bit):
        """Append a 1 bit if 'bit' is true, else a 0 bit.

        (Fixes the original docstring, which said "1" for both cases.)
        """
        if self.bitsleft == 0:
            self.bytes.append(0)
            self.bitsleft = 8
        self.__setitem__(self.nbits, bit)
        self.nbits += 1
        self.bitsleft -= 1

    def __getstate__(self):
        return self.nbits, self.bitsleft, self.tostring()

    def __setstate__(self, state):
        # Tuple-parameter unpacking in the signature was Python-2-only
        # syntax; unpack inside the body instead (same pickle format).
        nbits, bitsleft, s = state
        self.bytes = array.array('B', s)
        self.nbits = nbits
        self.bitsleft = bitsleft
        # Bug fix: unpickling bypasses __init__, so the original left
        # self.tostring unset (and bound to a stale array after manual
        # __setstate__ calls).  Rebind it against the new array.
        self._bind_tostring()
class RiceCode:
    """Rice-encode a list of positive integers into a bit stream.

    'm' is the number of bits in the binary (low-order) part of each code.
    """

    def __init__(self, m):
        """Construct a RiceCode for m-bit values."""
        if not (0 <= m <= 16):
            # Call-form raise replaces Python-2-only "raise X, msg".
            raise ValueError("m must be between 0 and 16")
        self.init(m)
        self.bits = BitArray()
        self.len = 0

    def init(self, m):
        # Derived constants; shared with __setstate__.
        self.m = m
        self.lower = (1 << m) - 1    # mask of the m low-order bits
        self.mask = 1 << (m - 1)     # high bit of the binary part

    def append(self, val):
        """Append an item to the list; val must be >= 1."""
        if val < 1:
            # repr() replaces the Python-2-only backtick syntax.
            raise ValueError("value >= 1 expected, got %s" % repr(val))
        val -= 1
        # emit the unary part of the code
        q = val >> self.m
        for i in range(q):
            self.bits.append(1)
        self.bits.append(0)
        # emit the binary part
        r = val & self.lower
        mask = self.mask
        while mask:
            self.bits.append(r & mask)
            mask >>= 1
        self.len += 1

    def __len__(self):
        return self.len

    def tolist(self):
        """Return the items as a list."""
        l = []
        i = 0  # bit offset
        binary_range = range(self.m)
        for j in range(self.len):
            # Count the unary prefix (1 bits up to the terminating 0).
            unary = 0
            while self.bits[i] == 1:
                unary += 1
                i += 1
            assert self.bits[i] == 0
            i += 1
            # Read the m-bit binary part, high bit first.
            binary = 0
            for k in binary_range:
                binary = (binary << 1) | self.bits[i]
                i += 1
            l.append((unary << self.m) + (binary + 1))
        return l

    def tostring(self):
        """Return a binary string containing the encoded data.

        The binary string may contain some extra zeros at the end.
        """
        return self.bits.tostring()

    def __getstate__(self):
        return self.m, self.bits

    def __setstate__(self, state):
        # Python-2-only tuple-parameter signature replaced by an
        # explicit unpack (same pickle format).
        m, bits = state
        self.init(m)
        self.bits = bits
def encode(m, l):
    """Rice-encode every item of l (ints >= 1) with an m-bit binary part."""
    coder = RiceCode(m)
    for value in l:
        coder.append(value)
    # Sanity check: decoding must reproduce the input exactly.
    assert coder.tolist() == l
    return coder
def encode_deltas(l):
    """Delta-encode a list of ints.

    Returns (first_value, RiceCode of successive differences).  Note
    that a one-element list yields (value, []) -- a plain list, not a
    RiceCode.
    """
    if len(l) == 1:
        return l[0], []
    code = RiceCode(6)
    # The original appended l[1] - l[0] separately and then looped
    # from 2; a single loop from 1 emits exactly the same deltas.
    for i in range(1, len(l)):
        code.append(l[i] - l[i - 1])
    return l[0], code
def decode_deltas(start, enc_deltas):
    """Invert encode_deltas: rebuild the original list.

    start      -- first value of the original list
    enc_deltas -- object with a tolist() method returning the
                  successive differences (e.g. a RiceCode)

    Bug fix: the old loop added deltas[i] to l[i-1] for i in
    range(1, len(deltas)) and then appended l[-1] + deltas[-1], so it
    skipped deltas[0] and applied deltas[-1] twice.  Any non-uniform
    delta sequence decoded incorrectly (the module's own test only
    passed because it used consecutive ints, i.e. all deltas equal).
    Each delta is now added exactly once.
    """
    if not enc_deltas:
        # Matches encode_deltas' (value, []) result for one-element input.
        return [start]
    deltas = enc_deltas.tolist()
    l = [start]
    for d in deltas:
        l.append(l[-1] + d)
    return l
def test():
    """Smoke test: random round trips through encode and encode_deltas."""
    import random
    for size in [10, 20, 50, 100, 200]:
        l = [random.randint(1, size) for i in range(50)]
        c = encode(random.randint(1, 16), l)
        assert c.tolist() == l
    for size in [10, 20, 50, 100, 200]:
        l = range(random.randint(1, size), size + random.randint(1, size))
        t = encode_deltas(l)
        l2 = decode_deltas(*t)
        if l != l2:
            # Bug fix: dump both lists *before* failing; the old code
            # asserted first, so these diagnostics were unreachable.
            print(l)
            print(l2)
        assert l == l2
def pickle_efficiency():
    # Compare the pickle size of raw lists of random ints against the
    # pickle size of their Rice-coded form over a grid of code
    # parameters (m), list lengths, and value ranges, printing
    # "win" when the encoded pickle is smaller and "lose" otherwise.
    import pickle
    import random
    for m in [4, 8, 12]:
        for size in [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
            for elt_range in [10, 20, 50, 100, 200, 500, 1000]:
                l = [random.randint(1, elt_range) for i in range(size)]
                # Protocol 1 (binary) pickles for both representations.
                raw = pickle.dumps(l, 1)
                enc = pickle.dumps(encode(m, l), 1)
                print "m=%2d size=%4d range=%4d" % (m, size, elt_range),
                print "%5d %5d" % (len(raw), len(enc)),
                if len(raw) > len(enc):
                    print "win"
                else:
                    print "lose"
if
__name__
==
"__main__"
:
test
()
lib/python/Products/ZCTextIndex/Setup
0 → 100644
View file @
61e89f2f
*shared*
stopper stopper.c
lib/python/Products/ZCTextIndex/StopDict.py
0 → 100644
View file @
61e89f2f
"""Provide a default list of stop words for the index.
The specific splitter and lexicon are customizable, but the default
ZCTextIndex should do something useful.
"""
def get_stopdict():
    """Return the shared stopword dictionary.

    Keys are the lowercase English stopwords listed in _words; every
    value is None.  The same dict object is returned on every call,
    so callers should treat it as read-only.
    """
    return _dict
# This list of English stopwords comes from Lucene
_words = [
    "a", "and", "are", "as", "at", "be", "but", "by",
    "for", "if", "in", "into", "is", "it",
    "no", "not", "of", "on", "or", "such",
    "that", "the", "their", "then", "there", "these",
    "they", "this", "to", "was", "will", "with",
]

# Map every stopword to None; get_stopdict() hands this dict out.
_dict = {}
for _w in _words:
    _dict[_w] = None
lib/python/Products/ZCTextIndex/WidCode.py
0 → 100644
View file @
61e89f2f
# A byte-aligned encoding for lists of non-negative ints, using fewer bytes
# for smaller ints. This is intended for lists of word ids (wids). The
# ordinary string .find() method can be used to find the encoded form of a
# desired wid-string in an encoded wid-string. As in UTF-8, the initial byte
# of an encoding can't appear in the interior of an encoding, so find() can't
# be fooled into starting a match "in the middle" of an encoding.
# Details:
#
# + Only the first byte of an encoding has the sign bit set.
#
# + The number of bytes in the encoding is encoded in unary at the start of
# the first byte (i.e., an encoding with n bytes begins with n 1-bits
# followed by a 0 bit).
#
# + Bytes beyond the first in an encoding have the sign bit clear, followed
# by 7 bits of data.
#
# + The number of data bits in the first byte of an encoding varies.
#
# The int to be encoded can contain no more than 24 bits.
# XXX this could certainly be increased
#
# If it contains no more than 6 bits, 00abcdef, the encoding is
# 10abcdef
#
# If it contains 7 thru 12 bits,
# 0000abcd efghijkL
# the encoding is
# 110abcde 0fghijkL
#
# Static tables _encoding and _decoding capture all encodes and decodes for
# 12 or fewer bits.
#
# If it contains 13 thru 18 bits,
# 000000ab cdefghij kLmnopqr
# the encoding is
# 1110abcd 0efghijk 0Lmnopqr
#
# If it contains 19 thru 24 bits,
# abcdefgh ijkLmnop qrstuvwx
# the encoding is
# 11110abc 0defghij 0kLmnopq 0rstuvwx
import
re
def encode(wids):
    """Encode a list of word ids as one byte-aligned string."""
    table = _encoding
    cached = len(table)
    # Wids below len(_encoding) (0x1000) come from the precomputed
    # table; larger ones are encoded on the fly by _encode().
    parts = []
    for w in wids:
        if w < cached:
            parts.append(table[w])
        else:
            parts.append(_encode(w))
    return "".join(parts)
_encoding
=
[
None
]
*
0x1000
# Filled later, and converted to a tuple
def _encode(w):
    """Encode a single wid >= 0x1000 (the 3- or 4-byte forms).

    Smaller wids are handled by the precomputed _encoding table.
    """
    assert 0x1000 <= w < 0x1000000
    rest, low = divmod(w, 0x80)
    top, mid = divmod(rest, 0x80)
    tail = chr(mid) + chr(low)
    if top < 0x10:
        # Fits in 18 data bits: 3-byte form 1110abcd 0....... 0.......
        return chr(top + 0xE0) + tail
    lead, top = divmod(top, 0x80)
    # More than 24 data bits would not fit the 4-byte form.
    assert lead < 0x4, (w, lead, top, tail)
    return chr(lead + 0xF0) + chr(top) + tail
# A code group is one sign-bit-set byte followed by any number of
# sign-bit-clear continuation bytes.
_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")

def decode(code):
    """Decode an encoded string into the list of wids it contains."""
    lookup = _decoding.get
    wids = []
    for group in _prog.findall(code):
        # _decoding maps '\x80' to 0, which is falsy, so the "or"
        # deliberately falls through to _decode('\x80') -- which also
        # returns 0, keeping the trick harmless.
        wids.append(lookup(group) or _decode(group))
    return wids
_decoding
=
{}
# Filled later
def _decode(s):
    """Decode one encoded group (3- or 4-byte form) back to its wid.

    The 1- and 2-byte forms are resolved by the _decoding table, with
    one exception: '\\x80' (wid 0) is routed here by decode() because
    its table value 0 is falsy.
    """
    if s == '\x80':
        # See comment in decode().  This is here to allow a trick to work.
        return 0
    if len(s) == 3:
        b0, b1, b2 = ord(s[0]), ord(s[1]), ord(s[2])
        assert b0 & 0xF0 == 0xE0 and not b1 & 0x80 and not b2 & 0x80
        return ((b0 & 0xF) << 14) | (b1 << 7) | b2
    assert len(s) == 4, repr(s)
    b0, b1, b2, b3 = [ord(ch) for ch in s]
    assert b0 & 0xF8 == 0xF0 and not b1 & 0x80 and not b2 & 0x80 \
           and not b3 & 0x80
    return ((b0 & 0x7) << 21) | (b1 << 14) | (b2 << 7) | b3
def _fill():
    """Populate the _encoding table and _decoding map for wids < 0x1000."""
    global _encoding
    # One-byte form 10abcdef for wids below 0x40.
    for wid in range(0x40):
        code = chr(wid + 0x80)
        _encoding[wid] = code
        _decoding[code] = wid
    # Two-byte form 110abcde 0fghijkL for wids 0x40 .. 0xFFF.
    for wid in range(0x40, 0x1000):
        first, second = divmod(wid, 0x80)
        code = chr(first + 0xC0) + chr(second)
        _encoding[wid] = code
        _decoding[code] = wid
    # Freeze the table; encode() only ever indexes it.
    _encoding = tuple(_encoding)

_fill()
def test():
    # Exhaustive round-trip check of single-wid encode/decode for the
    # first 2**20 wids, printing progress every 1000 wids.
    for i in range(2**20):
        if i % 1000 == 0:
            print i
        wids = [i]
        code = encode(wids)
        assert decode(code) == wids, (wids, code, decode(code))
if
__name__
==
"__main__"
:
test
()
lib/python/Products/ZCTextIndex/ZCTextIndex.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Plug in text index for ZCatalog with relevance ranking."""
import
ZODB
from
Persistence
import
Persistent
import
Acquisition
from
OFS.SimpleItem
import
SimpleItem
from
Products.PluginIndexes.common.PluggableIndex
\
import
PluggableIndexInterface
from
Products.ZCTextIndex.Index
import
Index
from
Products.ZCTextIndex.ILexicon
import
ILexicon
from
Products.ZCTextIndex.NBest
import
NBest
from
Products.ZCTextIndex.QueryParser
import
QueryParser
from
Globals
import
DTMLFile
from
Interface
import
verify_class_implementation
class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
    """Persistent ZCatalog text-index plugin with relevance ranking.

    Wraps a Products.ZCTextIndex.Index plus a QueryParser, reading the
    indexable text from the object attribute named by extra.doc_attr.
    """

    __implements__ = PluggableIndexInterface

    meta_type = 'ZCTextIndex'

    manage_options = (
        {'label': 'Settings', 'action': 'manage_main'},
    )

    def __init__(self, id, extra, caller):
        # extra.doc_attr names the attribute (or callable) on indexed
        # objects that yields their text; extra.lexicon_id names the
        # lexicon object looked up on ``caller``.
        self.id = id
        self._fieldname = extra.doc_attr
        lexicon = getattr(caller, extra.lexicon_id, None)
        if lexicon is None:
            raise LookupError, 'Lexicon "%s" not found' % extra.lexicon_id
        # Fail early if the configured lexicon does not implement ILexicon.
        verify_class_implementation(ILexicon, lexicon.__class__)
        self.lexicon = lexicon
        self.index = Index(self.lexicon)
        self.parser = QueryParser()

    def index_object(self, docid, obj):
        """Index the text of obj under docid."""
        self.index.index_doc(docid, self._get_object_text(obj))
        self._p_changed = 1 # XXX

    def unindex_object(self, docid):
        """Remove docid from the index."""
        self.index.unindex_doc(docid)
        self._p_changed = 1 # XXX

    def _apply_index(self, req):
        # PluggableIndex hook; not implemented yet.
        pass # XXX

    def query(self, query, nbest=10):
        """Parse and run ``query``; return the nbest best-scoring hits.

        mailtest.py iterates the result as (docid, score) pairs.
        """
        # returns a mapping from docids to scores
        tree = self.parser.parseQuery(query)
        results = tree.executeQuery(self.index)
        chooser = NBest(nbest)
        chooser.addmany(results.items())
        return chooser.getbest()

    def _get_object_text(self, obj):
        # The configured attribute may be a method (call it for the
        # text) or a plain data attribute.
        x = getattr(obj, self._fieldname)
        if callable(x):
            return x()
        else:
            return x

    ## User Interface Methods ##

    manage_main = DTMLFile('dtml/manageZCTextIndex', globals())
def manage_addZCTextIndex(self, id, extra=None, REQUEST=None,
                          RESPONSE=None):
    """Add a text index"""
    # NOTE(review): REQUEST defaults to None but REQUEST.URL3 is read
    # unconditionally, so calling this without a request would raise
    # AttributeError -- confirm whether a None REQUEST is ever expected.
    return self.manage_addIndex(id, 'ZCTextIndex', extra, REQUEST,
                                RESPONSE, REQUEST.URL3)
manage_addZCTextIndexForm
=
DTMLFile
(
'dtml/addZCTextIndex'
,
globals
())
manage_addLexiconForm
=
DTMLFile
(
'dtml/addLexicon'
,
globals
())
def manage_addLexicon(self, id, title, splitter=None, normalizer=None,
                      stopword=None, REQUEST=None):
    """Add a Lexicon object configured from the add-form checkboxes.

    splitter, normalizer and stopword are truthy when the matching
    checkbox in dtml/addLexicon was ticked; each enables the
    corresponding pipeline element.
    """
    elements = []
    if splitter:
        elements.append(Lexicon.Splitter())
    if normalizer:
        elements.append(CaseNormalizer())
    # Bug fix: this tested the undefined name ``stopwords`` (trailing
    # 's'), raising NameError instead of reading the parameter.
    if stopword:
        elements.append(StopWordRemover())
    # NOTE(review): Lexicon, CaseNormalizer and StopWordRemover are not
    # in this module's visible import block -- confirm they are
    # imported elsewhere in the file.
    lexicon = Lexicon(*elements)
    self._setObject(id, lexicon)
    if REQUEST is not None:
        return self.manage_main(self, REQUEST, update_menu=1)
lib/python/Products/ZCTextIndex/__init__.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""ZCatalog Text Index
Experimental plugin text index for ZCatalog.
"""
def initialize(context):
    """Zope product initializer: register the ZCTextIndex class.

    Registers ZCTextIndex as a pluggable index with its add form and
    constructor; the import happens inside the function body.
    """
    from Products.ZCTextIndex import ZCTextIndex

    context.registerClass(
        ZCTextIndex.ZCTextIndex,
        permission='Add Pluggable Index',
        constructors=(ZCTextIndex.manage_addZCTextIndexForm,
                      ZCTextIndex.manage_addZCTextIndex),
        # visibility=None keeps the class out of the add list UI.
        visibility=None,
    )
lib/python/Products/ZCTextIndex/dtml/addLexicon.dtml
0 → 100644
View file @
61e89f2f
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add Lexicon',
)">
<FORM ACTION="manage_addLexicon" METHOD="POST">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Title
</div>
</td>
<td align="left" valign="top">
<input type="text" name="title" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
splitter?
</td>
<td align="left" valign="top">
<input type="checkbox" name="splitter" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
case normalizer?
</td>
<td align="left" valign="top">
<input type="checkbox" name="normalizer" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
remove stop words?
</td>
<td align="left" valign="top">
<input type="checkbox" name="stopword" />
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
lib/python/Products/ZCTextIndex/dtml/addZCTextIndex.dtml
0 → 100644
View file @
61e89f2f
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add ZCTextIndex',
)">
<p class="form-help">
<strong>Text Indexes</strong> break text up into individual words, and
are often referred to as full-text indexes. Text indexes
sort results by score, meaning they return hits in order
from the most relevant to the least relevant.
</p>
<form action="manage_addZCTextIndex" method="post"
enctype="multipart/form-data">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Vocabulary
</div>
</td>
<td>
<select name="extra.vocabulary:record">
<dtml-in "this().aq_parent.objectItems('Vocabulary')">
<option value="&dtml-sequence-key;">&dtml-sequence-key; (<dtml-var "_['sequence-item'].title">)
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Field name
</div></td>
<td align="left" valign="top">
<input type="text" name="extra.doc_attr:record" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Lexicon
</div></td>
<td>
<select name="extra.lexicon_id:record">
<dtml-in "this().aq_parent.objectItems('Lexicon')">
<option value="&dtml-sequence-key;">&dtml-sequence-key; (<dtml-var "_['sequence-item'].title">)
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Type
</div>
</td>
<td align="left" valign="top">
ZCTextIndex
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
lib/python/Products/ZCTextIndex/dtml/manageZCTextIndex.dtml
0 → 100644
View file @
61e89f2f
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
There is nothing to manage here. Move along.
</p>
<dtml-var manage_page_footer>
lib/python/Products/ZCTextIndex/stopper.c
0 → 100644
View file @
61e89f2f
/* stopper.c
*
* Fast version of the StopWordRemover object.
*/
#include "Python.h"
#include "structmember.h"
typedef
struct
{
PyObject_HEAD
PyObject
*
swr_dict
;
}
StopWordRemover
;
static
PyObject
*
swr_process
(
StopWordRemover
*
self
,
PyObject
*
args
)
{
PyObject
*
result
=
NULL
;
PyObject
*
seq
;
int
len
,
i
;
if
(
!
PyArg_ParseTuple
(
args
,
"O:process"
,
&
seq
))
return
NULL
;
seq
=
PySequence_Fast
(
seq
,
"process() requires a sequence as the argument"
);
if
(
seq
==
NULL
)
return
NULL
;
result
=
PyList_New
(
0
);
if
(
result
==
NULL
)
goto
finally
;
#if PY_VERSION_HEX >= 0x02020000
/* Only available in Python 2.2 and newer. */
len
=
PySequence_Fast_GET_SIZE
(
seq
);
#else
len
=
PyObject_Length
(
seq
);
#endif
for
(
i
=
0
;
i
<
len
;
++
i
)
{
PyObject
*
s
=
PySequence_Fast_GET_ITEM
(
seq
,
i
);
/*
* PyDict_GetItem() returns NULL if there isn't a matching
* item, but without setting an exception, so this does what
* we want.
*/
if
(
PyDict_GetItem
(
self
->
swr_dict
,
s
)
==
NULL
)
if
(
PyList_Append
(
result
,
s
)
<
0
)
{
Py_DECREF
(
result
);
result
=
NULL
;
goto
finally
;
}
}
finally:
Py_XDECREF
(
seq
);
return
result
;
}
static
struct
memberlist
swr_members
[]
=
{
{
"dict"
,
T_OBJECT
,
offsetof
(
StopWordRemover
,
swr_dict
),
READONLY
},
{
NULL
}
};
static
PyMethodDef
swr_methods
[]
=
{
{
"process"
,
(
PyCFunction
)
swr_process
,
METH_VARARGS
,
"process([str, ...]) --> [str, ...]
\n
"
"Remove stop words from the input list of strings to create a new list."
},
{
NULL
}
};
static
PyObject
*
swr_getattr
(
PyObject
*
self
,
char
*
name
)
{
PyObject
*
res
;
res
=
Py_FindMethod
(
swr_methods
,
self
,
name
);
if
(
res
!=
NULL
)
return
res
;
PyErr_Clear
();
return
PyMember_Get
((
char
*
)
self
,
swr_members
,
name
);
}
static
void
swr_dealloc
(
StopWordRemover
*
self
)
{
Py_XDECREF
(
self
->
swr_dict
);
PyObject_Del
(
self
);
}
static
PyTypeObject
StopWordRemover_Type
=
{
PyObject_HEAD_INIT
(
NULL
)
/* ob_type */
0
,
/* ob_size */
"stopper.StopWordRemover"
,
/* tp_name */
sizeof
(
StopWordRemover
),
/* tp_basicsize */
0
,
/* tp_itemsize */
(
destructor
)
swr_dealloc
,
/* tp_dealloc */
0
,
/* tp_print */
(
getattrfunc
)
swr_getattr
,
/* tp_getattr */
0
,
/* tp_setattr */
};
static
PyObject
*
swr_new
(
PyObject
*
notused
,
PyObject
*
args
)
{
StopWordRemover
*
swr
=
NULL
;
PyObject
*
dict
=
NULL
;
if
(
PyArg_ParseTuple
(
args
,
"|O!:new"
,
&
PyDict_Type
,
&
dict
))
{
swr
=
PyObject_New
(
StopWordRemover
,
&
StopWordRemover_Type
);
if
(
swr
!=
NULL
)
{
if
(
dict
!=
NULL
)
{
Py_INCREF
(
dict
);
swr
->
swr_dict
=
dict
;
}
else
{
swr
->
swr_dict
=
PyDict_New
();
if
(
swr
->
swr_dict
==
NULL
)
{
Py_DECREF
(
swr
);
swr
=
NULL
;
}
}
}
}
return
(
PyObject
*
)
swr
;
}
static
PyObject
*
pickle_constructor
=
NULL
;
PyObject
*
swr_pickler
(
PyObject
*
unused
,
PyObject
*
args
)
{
StopWordRemover
*
swr
;
PyObject
*
result
=
NULL
;
if
(
PyArg_ParseTuple
(
args
,
"O!:_pickler"
,
&
StopWordRemover_Type
,
&
swr
))
{
result
=
Py_BuildValue
(
"O(O)"
,
pickle_constructor
,
swr
->
swr_dict
);
}
return
result
;
}
static
PyMethodDef
stopper_functions
[]
=
{
{
"new"
,
swr_new
,
METH_VARARGS
,
"new() -> StopWordRemover instance
\n
"
"Create & return a new stop-word remover."
},
{
"_pickler"
,
swr_pickler
,
METH_VARARGS
,
"_pickler(StopWordRemover instance) -> pickle magic
\n
"
"Internal magic used to make stop-word removers picklable."
},
{
NULL
}
};
void
initstopper
(
void
)
{
PyObject
*
m
,
*
copy_reg
;
StopWordRemover_Type
.
ob_type
=
&
PyType_Type
;
m
=
Py_InitModule3
(
"stopper"
,
stopper_functions
,
"Fast StopWordRemover implementation."
);
if
(
m
==
NULL
)
return
;
if
(
PyObject_SetAttrString
(
m
,
"StopWordRemoverType"
,
(
PyObject
*
)
&
StopWordRemover_Type
)
<
0
)
return
;
/* register to support pickling */
copy_reg
=
PyImport_ImportModule
(
"copy_reg"
);
if
(
copy_reg
!=
NULL
)
{
PyObject
*
pickler
;
if
(
pickle_constructor
==
NULL
)
{
pickle_constructor
=
PyObject_GetAttrString
(
m
,
"new"
);
Py_XINCREF
(
pickle_constructor
);
}
pickler
=
PyObject_GetAttrString
(
m
,
"_pickler"
);
if
((
pickle_constructor
!=
NULL
)
&&
(
pickler
!=
NULL
))
{
PyObject
*
res
;
res
=
PyObject_CallMethod
(
copy_reg
,
"pickle"
,
"OOO"
,
&
StopWordRemover_Type
,
pickler
,
pickle_constructor
);
Py_XDECREF
(
res
);
}
Py_DECREF
(
copy_reg
);
}
}
lib/python/Products/ZCTextIndex/tests/__init__.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""
Revision information:
$Id: __init__.py,v 1.2 2002/05/14 15:12:34 gvanrossum Exp $
"""
lib/python/Products/ZCTextIndex/tests/hs-tool.py
0 → 100755
View file @
61e89f2f
#! /usr/bin/env python
import
cPickle
import
os.path
import
sys
from
hotshot.log
import
LogReader
def load_line_info(log):
    """Accumulate (total_time, hit_count) per location from a hotshot log.

    Each positive time delta is charged to the previously seen
    location, not the current one.
    """
    totals = {}
    last = None
    for event, location, delta in log:
        if delta > 0:
            prior_time, prior_hits = totals.get(last, (0, 0))
            totals[last] = (delta + prior_time), (prior_hits + 1)
        last = location
    return totals
def basename(path, cache={}):
    """Return the final path component, memoized in the shared cache.

    The mutable default argument is intentional: it persists across
    calls as a process-wide memo table.
    """
    result = cache.get(path)
    if result is None:
        result = os.path.split(path)[1]
        cache[path] = result
    return result
def
print_results
(
results
):
for
info
,
place
in
results
:
if
not
place
:
print
'Bad unpack:'
,
info
,
place
continue
filename
,
line
,
funcname
=
place
print
'%8d %8d'
%
info
,
basename
(
filename
),
line
def
annotate_results
(
results
):
files
=
{}
for
stats
,
place
in
results
:
if
not
place
:
continue
time
,
hits
=
stats
file
,
line
,
func
=
place
l
=
files
.
get
(
file
)
if
l
is
None
:
l
=
files
[
file
]
=
[]
l
.
append
((
line
,
hits
,
time
))
order
=
files
.
keys
()
order
.
sort
()
for
k
in
order
:
if
os
.
path
.
exists
(
k
):
v
=
files
[
k
]
v
.
sort
()
annotate
(
k
,
v
)
def
annotate
(
file
,
lines
):
print
"-"
*
60
print
file
print
"-"
*
60
f
=
open
(
file
)
i
=
1
match
=
lines
[
0
][
0
]
for
line
in
f
:
if
match
==
i
:
print
"%6d %8d "
%
lines
[
0
][
1
:],
line
,
del
lines
[
0
]
if
lines
:
match
=
lines
[
0
][
0
]
else
:
match
=
None
else
:
print
" "
*
16
,
line
,
i
+=
1
print
def get_cache_name(filename):
    """Return (cache_dir, cache_file) for a profile data file.

    The cache lives in a '.hs-tool' subdirectory next to the file,
    under the file's own name.
    """
    directory, name = os.path.split(filename)
    cache_dir = os.path.join(directory, '.hs-tool')
    return cache_dir, os.path.join(cache_dir, name)
def
cache_results
(
filename
,
results
):
cache_dir
,
cache_file
=
get_cache_name
(
filename
)
if
not
os
.
path
.
exists
(
cache_dir
):
os
.
mkdir
(
cache_dir
)
fp
=
open
(
cache_file
,
'wb'
)
try
:
cPickle
.
dump
(
results
,
fp
,
1
)
finally
:
fp
.
close
()
def
main
(
filename
,
annotate
):
cache_dir
,
cache_file
=
get_cache_name
(
filename
)
if
(
os
.
path
.
isfile
(
cache_file
)
and
os
.
path
.
getmtime
(
cache_file
)
>
os
.
path
.
getmtime
(
filename
)):
# cached data is up-to-date:
fp
=
open
(
cache_file
,
'rb'
)
results
=
cPickle
.
load
(
fp
)
fp
.
close
()
else
:
log
=
LogReader
(
filename
)
byline
=
load_line_info
(
log
)
# Sort
results
=
[(
v
,
k
)
for
k
,
v
in
byline
.
items
()]
results
.
sort
()
cache_results
(
filename
,
results
)
if
annotate
:
annotate_results
(
results
)
else
:
print_results
(
results
)
if
__name__
==
"__main__"
:
import
getopt
annotate_p
=
0
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
'A'
)
for
o
,
v
in
opts
:
if
o
==
'-A'
:
annotate_p
=
1
if
args
:
filename
,
=
args
else
:
filename
=
"profile.dat"
main
(
filename
,
annotate_p
)
lib/python/Products/ZCTextIndex/tests/indexhtml.py
0 → 100644
View file @
61e89f2f
#! /usr/bin/env python
"""Index a collection of HTML files on the filesystem.
usage: indexhtml.py [options] dir
Will create an index of all files in dir or its subdirectories.
options:
-f data.fs -- the path to the filestorage datafile
"""
import
os
import
ZODB
from
ZODB.FileStorage
import
FileStorage
from
BTrees.IOBTree
import
IOBTree
from
Products.ZCTextIndex.ZCTextIndex
import
ZCTextIndex
from
Products.ZCTextIndex.HTMLSplitter
import
HTMLWordSplitter
from
Products.ZCTextIndex.Lexicon
import
Lexicon
,
StopWordRemover
def make_index():
    """Build a ZCTextIndex wired for indexing file-like HTML objects.

    ZCTextIndex pulls its text via extra.doc_attr ("read", i.e. the
    object's read() method) and finds its lexicon via extra.lexicon_id
    on the caller object.
    """
    # there's an elaborate dance necessary to construct an index
    class Struct:
        pass
    extra = Struct()
    extra.doc_attr = "read"
    extra.lexicon_id = "lexicon"
    caller = Struct()
    caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
    # Bug fix: ZCTextIndex.__init__ takes (id, extra, caller); the id
    # argument was missing, shifting every argument by one (mailtest.py
    # passes "index" in the same position).
    return ZCTextIndex("index", extra, caller)
def
main
(
db
,
root
,
dir
):
rt
[
"index"
]
=
index
=
make_index
()
rt
[
"files"
]
=
paths
=
IOBTree
()
get_transaction
().
commit
()
files
=
[
os
.
path
.
join
(
dir
,
file
)
for
file
in
os
.
listdir
(
dir
)]
docid
=
0
for
file
in
files
:
if
os
.
path
.
isdir
(
file
):
files
+=
[
os
.
path
.
join
(
file
,
sub
)
for
sub
in
os
.
listdir
(
file
)]
else
:
if
not
file
.
endswith
(
".html"
):
continue
docid
+=
1
print
"%5d"
%
docid
,
file
f
=
open
(
file
,
"rb"
)
paths
[
docid
]
=
file
index
.
index_object
(
docid
,
f
)
f
.
close
()
if
docid
%
TXN_INTERVAL
==
0
:
get_transaction
().
commit
()
if
docid
%
PACK_INTERVAL
==
0
:
db
.
pack
()
get_transaction
().
commit
()
if
__name__
==
"__main__"
:
import
sys
import
getopt
VERBOSE
=
0
FSPATH
=
"Data.fs"
TXN_INTERVAL
=
100
PACK_INTERVAL
=
500
try
:
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
'vf:'
)
except
getopt
.
error
,
msg
:
print
msg
print
__doc__
sys
.
exit
(
2
)
for
o
,
v
in
opts
:
if
o
==
'-v'
:
VERBOSE
+=
1
if
o
==
'-f'
:
FSPATH
=
v
if
len
(
args
)
!=
1
:
print
"Expected on argument"
print
__doc__
sys
.
exit
(
2
)
dir
=
args
[
0
]
fs
=
FileStorage
(
FSPATH
)
db
=
ZODB
.
DB
(
fs
)
cn
=
db
.
open
()
rt
=
cn
.
root
()
dir
=
os
.
path
.
join
(
os
.
getcwd
(),
dir
)
print
dir
main
(
db
,
rt
,
dir
)
cn
.
close
()
fs
.
close
()
lib/python/Products/ZCTextIndex/tests/mailtest.py
0 → 100644
View file @
61e89f2f
"""Test an index with a Unix mailbox file.
usage: python mailtest.py [options] <data.fs>
options:
-v -- verbose
-n NNN -- max number of messages to read from mailbox
-q query
-i mailbox
-p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
-p 0 -- don't pack at all
-b NNN -- return the NNN best matches (default: 10)
-x -- exclude the message text from the data.fs
-t NNN -- commit a transaction every NNN messages (default: 1)
The script either indexes or queries depending on whether -q or -i is
passed as an option.
For -i mailbox, the script reads mail messages from the mailbox and
indexes them. It indexes one message at a time, then commits the
transaction.
For -q query, it performs a query on an existing index.
If both are specified, the index is performed first.
You can also interact with the index after it is completed. Load the
index from the database:
import ZODB
from ZODB.FileStorage import FileStorage
fs = FileStorage(<data.fs>)
db = ZODB.DB(fs)
index = cn.open().root()["index"]
index.search("python AND unicode")
"""
import
ZODB
import
ZODB.FileStorage
from
Products.ZCTextIndex.Lexicon
import
Lexicon
,
\
CaseNormalizer
,
Splitter
,
StopWordRemover
from
Products.ZCTextIndex.ZCTextIndex
import
ZCTextIndex
from
BTrees.IOBTree
import
IOBTree
import
sys
import
mailbox
import
time
def
usage
(
msg
):
print
msg
print
__doc__
sys
.
exit
(
2
)
class Message:
    """One mail message prepared for indexing.

    self.text is a one-line summary (subject, plus author when
    present) followed by the raw message body.
    """

    # Running total of indexed text size, accumulated across ALL
    # instances (a class attribute, deliberately shared).
    total_bytes = 0

    def __init__(self, msg):
        # msg is an rfc822-style message as yielded by
        # mailbox.UnixMailbox: getheader() for headers, msg.fp for the
        # body stream.
        subject = msg.getheader('subject', '')
        author = msg.getheader('from', '')
        if author:
            summary = "%s (%s)\n" % (subject, author)
        else:
            summary = "%s\n" % subject
        self.text = summary + msg.fp.read()
        Message.total_bytes += len(self.text)
class
Extra
:
pass
def
index
(
rt
,
mboxfile
,
db
):
global
NUM
idx_time
=
0
pack_time
=
0
lexicon
=
Lexicon
(
Splitter
(),
CaseNormalizer
(),
StopWordRemover
())
extra
=
Extra
()
extra
.
lexicon_id
=
'lexicon'
extra
.
doc_attr
=
'text'
caller
=
Extra
()
caller
.
lexicon
=
lexicon
rt
[
"index"
]
=
idx
=
ZCTextIndex
(
"index"
,
extra
,
caller
)
if
not
EXCLUDE_TEXT
:
rt
[
"documents"
]
=
docs
=
IOBTree
()
get_transaction
().
commit
()
mbox
=
mailbox
.
UnixMailbox
(
open
(
mboxfile
))
if
VERBOSE
:
print
"opened"
,
mboxfile
if
not
NUM
:
NUM
=
sys
.
maxint
i
=
0
while
i
<
NUM
:
_msg
=
mbox
.
next
()
if
_msg
is
None
:
break
i
+=
1
msg
=
Message
(
_msg
)
if
VERBOSE
>=
2
:
print
"indexing msg"
,
i
i0
=
time
.
clock
()
idx
.
index_object
(
i
,
msg
)
if
not
EXCLUDE_TEXT
:
docs
[
i
]
=
msg
if
i
%
TXN_SIZE
==
0
:
get_transaction
().
commit
()
i1
=
time
.
clock
()
idx_time
+=
i1
-
i0
if
VERBOSE
and
i
%
50
==
0
:
print
i
,
"messages indexed"
print
"cache size"
,
db
.
cacheSize
()
if
PACK_INTERVAL
and
i
%
PACK_INTERVAL
==
0
:
if
VERBOSE
>=
2
:
print
"packing..."
p0
=
time
.
clock
()
db
.
pack
(
time
.
time
())
p1
=
time
.
clock
()
if
VERBOSE
:
print
"pack took %s sec"
%
(
p1
-
p0
)
pack_time
+=
p1
-
p0
get_transaction
().
commit
()
if
PACK_INTERVAL
and
i
%
PACK_INTERVAL
!=
0
:
if
VERBOSE
>=
2
:
print
"packing one last time..."
p0
=
time
.
clock
()
db
.
pack
(
time
.
time
())
p1
=
time
.
clock
()
if
VERBOSE
:
print
"pack took %s sec"
%
(
p1
-
p0
)
pack_time
+=
p1
-
p0
if
VERBOSE
:
print
"Index time"
,
idx_time
print
"Index bytes"
,
Message
.
total_bytes
rate
=
(
Message
.
total_bytes
/
idx_time
)
/
1024
print
"Index rate %d KB/sec"
%
int
(
rate
)
def query(rt, query_str):
    # Run query_str against the index stored in the database root,
    # printing the BEST top-scoring docids; with -v, also print the
    # leading lines of each matching message.
    idx = rt["index"]
    # NOTE(review): rt["documents"] is read unconditionally, but the
    # -x (EXCLUDE_TEXT) indexing mode never creates it -- confirm that
    # querying a -x database is expected to fail here.
    docs = rt["documents"]
    results = idx.query(query_str, BEST)
    print "query:", query_str
    print "# results:", len(results)
    for docid, score in results:
        print "docid %4d score %2d" % (docid, score)
        if VERBOSE:
            msg = docs[docid]
            # print 3 lines of context
            # NOTE(review): CONTEXT is 5, so up to five leading lines
            # are shown -- the comment above appears stale.
            CONTEXT = 5
            ctx = msg.text.split("\n", CONTEXT)
            del ctx[-1]
            print "-" * 60
            print "message:"
            for l in ctx:
                print l
            print "-" * 60
def main(fs_path, mbox_path, query_str):
    # Open the FileStorage, optionally index a mailbox into it
    # (mbox_path from -i), optionally run a query (query_str from -q),
    # then close the connection, database and storage in order.
    f = ZODB.FileStorage.FileStorage(fs_path)
    db = ZODB.DB(f, cache_size=CACHE_SIZE)
    cn = db.open()
    rt = cn.root()

    if mbox_path is not None:
        index(rt, mbox_path, db)
    if query_str is not None:
        query(rt, query_str)

    cn.close()
    db.close()
    f.close()
if
__name__
==
"__main__"
:
import
getopt
NUM
=
0
BEST
=
10
VERBOSE
=
0
PACK_INTERVAL
=
500
EXCLUDE_TEXT
=
0
CACHE_SIZE
=
10000
TXN_SIZE
=
1
query_str
=
None
mbox_path
=
None
profile
=
None
old_profile
=
None
try
:
opts
,
args
=
getopt
.
getopt
(
sys
.
argv
[
1
:],
'vn:p:i:q:b:xt:'
,
[
'profile='
,
'old-profile='
])
except
getopt
.
error
,
msg
:
usage
(
msg
)
if
len
(
args
)
!=
1
:
usage
(
"exactly 1 filename argument required"
)
for
o
,
v
in
opts
:
if
o
==
'-n'
:
NUM
=
int
(
v
)
elif
o
==
'-v'
:
VERBOSE
+=
1
elif
o
==
'-p'
:
PACK_INTERVAL
=
int
(
v
)
elif
o
==
'-q'
:
query_str
=
v
elif
o
==
'-i'
:
mbox_path
=
v
elif
o
==
'-b'
:
BEST
=
int
(
v
)
elif
o
==
'-x'
:
EXCLUDE_TEXT
=
1
elif
o
==
'-t'
:
TXN_SIZE
=
int
(
v
)
elif
o
==
'--profile'
:
profile
=
v
elif
o
==
'--old-profile'
:
old_profile
=
v
fs_path
,
=
args
if
profile
:
import
hotshot
profiler
=
hotshot
.
Profile
(
profile
,
lineevents
=
1
,
linetimings
=
1
)
profiler
.
runcall
(
main
,
fs_path
,
mbox_path
,
query_str
)
profiler
.
close
()
elif
old_profile
:
import
profile
,
pstats
profiler
=
profile
.
Profile
()
profiler
.
runcall
(
main
,
fs_path
,
mbox_path
,
query_str
)
profiler
.
dump_stats
(
old_profile
)
stats
=
pstats
.
Stats
(
old_profile
)
stats
.
strip_dirs
().
sort_stats
(
'time'
).
print_stats
(
20
)
else
:
main
(
fs_path
,
mbox_path
,
query_str
)
lib/python/Products/ZCTextIndex/tests/mhindex.py
0 → 100644
View file @
61e89f2f
This diff is collapsed.
Click to expand it.
lib/python/Products/ZCTextIndex/tests/testIndex.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from
unittest
import
TestCase
,
TestSuite
,
main
,
makeSuite
from
Products.ZCTextIndex.Index
import
Index
from
Products.ZCTextIndex.Lexicon
import
Lexicon
,
Splitter
class IndexTest(TestCase):
    """Unit tests for the ZCTextIndex Index class.

    Exercises index_doc/unindex_doc and the search, search_phrase and
    search_glob query methods, checking the index's internal maps
    (_docweight, _wordinfo, _docwords) directly.
    """

    def setUp(self):
        self.lexicon = Lexicon(Splitter())
        self.index = Index(self.lexicon)

    def test_index_document(self, DOCID=1):
        # Index one five-word document and verify every internal map.
        doc = "simple document contains five words"
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index._get_undoinfo(DOCID)), 5)
        # Every word occurs in exactly one document: DOCID.
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_unindex_document(self):
        DOCID = 1
        self.test_index_document(DOCID)
        self.index.unindex_doc(DOCID)
        # Removing the only document must empty all internal maps.
        self.assertEqual(len(self.index._docweight), 0)
        self.assertEqual(len(self.index._wordinfo), 0)
        self.assertEqual(len(self.index._docwords), 0)

    def test_index_two_documents(self):
        self.test_index_document()
        doc = "another document just four"
        DOCID = 2
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        # 5 words from doc 1 + 4 from doc 2, with "document" shared: 8.
        self.assertEqual(len(self.index._wordinfo), 8)
        self.assertEqual(len(self.index._docwords), 2)
        self.assertEqual(len(self.index._get_undoinfo(DOCID)), 4)
        wids = self.lexicon.termToWordIds("document")
        self.assertEqual(len(wids), 1)
        document_wid = wids[0]
        # Only "document" appears in both documents.
        for wid, map in self.index._wordinfo.items():
            if wid == document_wid:
                self.assertEqual(len(map), 2)
                self.assert_(map.has_key(1))
                self.assert_(map.has_key(DOCID))
            else:
                self.assertEqual(len(map), 1)

    def test_index_two_unindex_one(self):
        # index two documents, unindex one, and test the results
        self.test_index_two_documents()
        self.index.unindex_doc(1)
        DOCID = 2
        self.assertEqual(len(self.index._docweight), 1)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 4)
        self.assertEqual(len(self.index._docwords), 1)
        self.assertEqual(len(self.index._get_undoinfo(DOCID)), 4)
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_index_duplicated_words(self, DOCID=1):
        # "repeat" appears three times but yields a single word entry.
        doc = "very simple repeat repeat repeat document test"
        self.index.index_doc(DOCID, doc)
        self.assert_(self.index._docweight[DOCID])
        self.assertEqual(len(self.index._wordinfo), 5)
        self.assertEqual(len(self.index._docwords), 1)
        ## self.assertEqual(len(self.index._get_undoinfo(DOCID)), 5)
        wids = self.lexicon.termToWordIds("repeat")
        self.assertEqual(len(wids), 1)
        # NOTE: the original also bound wids[0] to an unused (and
        # misspelled) local "repititive_wid"; dropped as dead code.
        for map in self.index._wordinfo.values():
            self.assertEqual(len(map), 1)
            self.assert_(map.has_key(DOCID))

    def test_simple_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_simple_query_noresults(self):
        self.index.index_doc(1, 'not the same document')
        results = self.index.search("frobnicate")
        self.assertEqual(list(results.keys()), [])

    def test_query_oneresult(self):
        self.index.index_doc(1, 'not the same document')
        self.index.index_doc(2, 'something about something else')
        results = self.index.search("document")
        self.assertEqual(list(results.keys()), [1])

    def test_search_phrase(self):
        # Only doc 1 has the words in the exact phrase order.
        self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
        self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
        results = self.index.search_phrase("quick brown fox")
        self.assertEqual(list(results.keys()), [1])

    def test_search_glob(self):
        self.index.index_doc(1, "how now brown cow")
        self.index.index_doc(2, "hough nough browne cough")
        self.index.index_doc(3, "bar brawl")
        results = self.index.search_glob("bro*")
        self.assertEqual(list(results.keys()), [1, 2])
        results = self.index.search_glob("b*")
        self.assertEqual(list(results.keys()), [1, 2, 3])
def test_suite():
    """Collect this module's IndexTest case into a suite."""
    return makeSuite(IndexTest)


if __name__ == '__main__':
    main(defaultTest='test_suite')
lib/python/Products/ZCTextIndex/tests/testLexicon.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from
unittest
import
TestCase
,
TestSuite
,
main
,
makeSuite
from
Products.ZCTextIndex.Lexicon
import
Lexicon
from
Products.ZCTextIndex.Lexicon
import
Splitter
,
CaseNormalizer
class StupidPipelineElement:
    """Pipeline element that rewrites one fixed word into another."""

    def __init__(self, fromword, toword):
        self.__fromword = fromword
        self.__toword = toword

    def process(self, seq):
        """Return seq with every occurrence of fromword replaced by toword."""
        result = []
        emit = result.append
        for word in seq:
            if word == self.__fromword:
                emit(self.__toword)
            else:
                emit(word)
        return result
class WackyReversePipelineElement:
    """Pipeline element that reverses the characters of one fixed word."""

    def __init__(self, revword):
        self.__revword = revword

    def process(self, seq):
        """Return seq with the target word replaced by its reversal."""
        result = []
        for word in seq:
            if word == self.__revword:
                # Rebuild the word back-to-front, character by character.
                backwards = [word[k] for k in range(len(word) - 1, -1, -1)]
                result.append(''.join(backwards))
            else:
                result.append(word)
        return result
class StopWordPipelineElement:
    """Pipeline element that drops terms listed in a stop-word dict.

    ``stopdict`` maps stop words to true values; any term whose lookup
    yields a true value is removed from the stream.
    """

    def __init__(self, stopdict=None):
        # The original used a mutable default argument (stopdict={}),
        # which is shared across all instances created without an
        # argument; default to None and create a fresh dict instead.
        if stopdict is None:
            stopdict = {}
        self.__stopdict = stopdict

    def process(self, seq):
        """Return seq with all stop words removed."""
        res = []
        for term in seq:
            if self.__stopdict.get(term):
                continue
            res.append(term)
        return res
class Test(TestCase):
    """Tests for Lexicon word-id assignment through various pipelines."""

    def testSourceToWordIds(self):
        lex = Lexicon(Splitter())
        ids = lex.sourceToWordIds('cats and dogs')
        self.assertEqual(ids, [1, 2, 3])

    def testTermToWordIds(self):
        lex = Lexicon(Splitter())
        lex.sourceToWordIds('cats and dogs')
        ids = lex.termToWordIds('dogs')
        self.assertEqual(ids, [3])

    def testMissingTermToWordIds(self):
        # A term never seen during indexing maps to no word ids.
        lex = Lexicon(Splitter())
        lex.sourceToWordIds('cats and dogs')
        ids = lex.termToWordIds('boxes')
        self.assertEqual(ids, [])

    def testOnePipelineElement(self):
        # 'dogs' is rewritten to 'fish' before id assignment.
        lex = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
        lex.sourceToWordIds('cats and dogs')
        ids = lex.termToWordIds('fish')
        self.assertEqual(ids, [3])

    def testSplitterAdaptorFold(self):
        # Case normalization folds 'CATS' and 'cats' together.
        lex = Lexicon(Splitter(), CaseNormalizer())
        lex.sourceToWordIds('CATS and dogs')
        ids = lex.termToWordIds('cats and dogs')
        self.assertEqual(ids, [1, 2, 3])

    def testSplitterAdaptorNofold(self):
        # Without normalization 'cats' does not match indexed 'CATS'.
        lex = Lexicon(Splitter())
        lex.sourceToWordIds('CATS and dogs')
        ids = lex.termToWordIds('cats and dogs')
        self.assertEqual(ids, [2, 3])

    def testTwoElementPipeline(self):
        # 'cats' -> 'fish' -> reversed to 'hsif'.
        lex = Lexicon(Splitter(),
                      StupidPipelineElement('cats', 'fish'),
                      WackyReversePipelineElement('fish'))
        lex.sourceToWordIds('cats and dogs')
        ids = lex.termToWordIds('hsif')
        self.assertEqual(ids, [1])

    def testThreeElementPipeline(self):
        # Stop word removal shifts ids: 'and' is dropped before
        # 'dogs' -> 'fish' -> 'hsif'.
        lex = Lexicon(Splitter(),
                      StopWordPipelineElement({'and': 1}),
                      StupidPipelineElement('dogs', 'fish'),
                      WackyReversePipelineElement('fish'))
        lex.sourceToWordIds('cats and dogs')
        ids = lex.termToWordIds('hsif')
        self.assertEqual(ids, [2])
def test_suite():
    """Collect this module's Test case into a suite."""
    return makeSuite(Test)


if __name__ == '__main__':
    main(defaultTest='test_suite')
lib/python/Products/ZCTextIndex/tests/testNBest.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from
unittest
import
TestCase
,
TestSuite
,
main
,
makeSuite
from
Products.ZCTextIndex.NBest
import
NBest
class NBestTest(TestCase):
    """Tests for the NBest container (keeps the N highest-scored items)."""

    def testConstructor(self):
        # Capacity must be a positive integer.
        self.assertRaises(ValueError, NBest, 0)
        self.assertRaises(ValueError, NBest, -1)
        for n in range(1, 11):
            nb = NBest(n)
            self.assertEqual(len(nb), 0)
            self.assertEqual(nb.capacity(), n)

    def testOne(self):
        # A capacity-1 NBest keeps only the single best-scored item.
        nb = NBest(1)
        nb.add('a', 0)
        self.assertEqual(nb.getbest(), [('a', 0)])
        # A better score displaces the current item.
        nb.add('b', 1)
        self.assertEqual(len(nb), 1)
        self.assertEqual(nb.capacity(), 1)
        self.assertEqual(nb.getbest(), [('b', 1)])
        # A worse score is ignored.
        nb.add('c', -1)
        self.assertEqual(len(nb), 1)
        self.assertEqual(nb.capacity(), 1)
        self.assertEqual(nb.getbest(), [('b', 1)])
        # addmany keeps the overall best of the batch.
        nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
        self.assertEqual(len(nb), 1)
        self.assertEqual(nb.capacity(), 1)
        self.assertEqual(nb.getbest(), [('f', 5)])

    def testMany(self):
        import random
        # Items are (-i, i): item -i carries score i, so higher i wins.
        inputs = [(-i, i) for i in range(50)]

        reversed_inputs = inputs[:]
        reversed_inputs.reverse()

        # Test the N-best for a variety of n (1, 6, 11, ... 50).
        for n in range(1, len(inputs) + 1, 5):
            # Expected result: the n highest-scored items, best first.
            expected = inputs[-n:]
            expected.reverse()

            random_inputs = inputs[:]
            random.shuffle(random_inputs)

            # Feed order must not matter: ascending, descending, random.
            for source in inputs, reversed_inputs, random_inputs:
                # Try feeding them one at a time.
                nb = NBest(n)
                for item, score in source:
                    nb.add(item, score)
                self.assertEqual(len(nb), n)
                self.assertEqual(nb.capacity(), n)
                self.assertEqual(nb.getbest(), expected)

                # And again in one gulp.
                nb = NBest(n)
                nb.addmany(source)
                self.assertEqual(len(nb), n)
                self.assertEqual(nb.capacity(), n)
                self.assertEqual(nb.getbest(), expected)

                # pop_smallest drains items worst-first, then raises.
                for i in range(1, n + 1):
                    self.assertEqual(nb.pop_smallest(), expected[-i])
                self.assertRaises(IndexError, nb.pop_smallest)
def test_suite():
    """Collect this module's NBestTest case into a suite."""
    return makeSuite(NBestTest)


if __name__ == '__main__':
    main(defaultTest='test_suite')
lib/python/Products/ZCTextIndex/tests/testQueryEngine.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from
unittest
import
TestCase
,
TestSuite
,
main
,
makeSuite
from
BTrees.IIBTree
import
IIBucket
from
Products.ZCTextIndex.QueryParser
import
QueryParser
from
Products.ZCTextIndex.ParseTree
import
ParseError
,
QueryError
class FauxIndex:
    """Canned index: maps a few fixed terms to fixed document-id sets."""

    # Docids each known term occurs in; unknown terms hit no documents.
    _hits = {
        "foo": (1, 3),
        "bar": (1, 2),
        "ham": (1, 2, 3, 4),
    }

    def search(self, term):
        """Return an IIBucket mapping each matching docid to score 1."""
        b = IIBucket()
        for docid in self._hits.get(term, ()):
            b[docid] = 1
        return b
class TestQueryEngine(TestCase):
    """Evaluate parsed boolean queries against the canned FauxIndex."""

    def setUp(self):
        self.parser = QueryParser()
        self.index = FauxIndex()

    def compareSet(self, set, dict):
        """Assert the result set equals dict (copied to a plain dict first)."""
        actual = {}
        for key, value in set.items():
            actual[key] = value
        self.assertEqual(actual, dict)

    def compareQuery(self, query, dict):
        """Parse query, run it against the index, and compare to dict."""
        parse_tree = self.parser.parseQuery(query)
        result = parse_tree.executeQuery(self.index)
        self.compareSet(result, dict)

    def testExecuteQuery(self):
        # Expected values are docid -> combined score sums.
        self.compareQuery("foo AND bar", {1: 2})
        self.compareQuery("foo OR bar", {1: 2, 2: 1, 3: 1})
        self.compareQuery("foo AND NOT bar", {3: 1})
        self.compareQuery("foo AND foo AND foo", {1: 3, 3: 3})
        self.compareQuery("foo OR foo OR foo", {1: 3, 3: 3})
        self.compareQuery("ham AND NOT foo AND NOT bar", {4: 1})
        self.compareQuery("ham OR foo OR bar", {1: 3, 2: 2, 3: 2, 4: 1})
        self.compareQuery("ham AND foo AND bar", {1: 3})

    def testInvalidQuery(self):
        # A bare NOT at the top level cannot be executed.
        from Products.ZCTextIndex.ParseTree import NotNode, AtomNode
        tree = NotNode(AtomNode("foo"))
        self.assertRaises(QueryError, tree.executeQuery, self.index)
def test_suite():
    """Collect this module's TestQueryEngine case into a suite."""
    return makeSuite(TestQueryEngine)


if __name__ == '__main__':
    main(defaultTest='test_suite')
lib/python/Products/ZCTextIndex/tests/testQueryParser.py
0 → 100644
View file @
61e89f2f
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from
unittest
import
TestCase
,
TestSuite
,
main
,
makeSuite
from
Products.ZCTextIndex.QueryParser
import
QueryParser
from
Products.ZCTextIndex.ParseTree
import
ParseError
,
ParseTreeNode
from
Products.ZCTextIndex.ParseTree
import
OrNode
,
AndNode
,
NotNode
from
Products.ZCTextIndex.ParseTree
import
AtomNode
,
PhraseNode
,
GlobNode
class TestQueryParser(TestCase):
    """Tests for QueryParser: accepted grammars and rejected inputs."""

    def compareParseTrees(self, got, expected):
        """Recursively assert that two parse trees are structurally equal."""
        self.assertEqual(isinstance(got, ParseTreeNode), 1)
        self.assertEqual(got.__class__, expected.__class__)
        # Leaf nodes: compare node type tag and payload value.
        if isinstance(got, PhraseNode):
            self.assertEqual(got.nodeType(), "PHRASE")
            self.assertEqual(got.getValue(), expected.getValue())
        elif isinstance(got, GlobNode):
            self.assertEqual(got.nodeType(), "GLOB")
            self.assertEqual(got.getValue(), expected.getValue())
        elif isinstance(got, AtomNode):
            self.assertEqual(got.nodeType(), "ATOM")
            self.assertEqual(got.getValue(), expected.getValue())
        # NOT wraps a single child tree.
        elif isinstance(got, NotNode):
            self.assertEqual(got.nodeType(), "NOT")
            self.compareParseTrees(got.getValue(), expected.getValue())
        # AND/OR hold a list of child trees; compare pairwise.
        elif isinstance(got, AndNode) or isinstance(got, OrNode):
            self.assertEqual(got.nodeType(),
                             isinstance(got, AndNode) and "AND" or "OR")
            list1 = got.getValue()
            list2 = expected.getValue()
            self.assertEqual(len(list1), len(list2))
            for i in range(len(list1)):
                self.compareParseTrees(list1[i], list2[i])

    def expect(self, input, output):
        """Assert that parsing `input` produces the tree `output`."""
        tree = self.p.parseQuery(input)
        self.compareParseTrees(tree, output)

    def failure(self, input):
        """Assert that parsing `input` raises ParseError."""
        self.assertRaises(ParseError, self.p.parseQuery, input)

    def setUp(self):
        self.p = QueryParser()

    def testParseQuery(self):
        self.expect("foo", AtomNode("foo"))
        # "note" starts with "not" but is a plain atom, not a NOT.
        self.expect("note", AtomNode("note"))
        # Operators are case-insensitive.
        self.expect("a and b AND c",
                    AndNode([AtomNode("a"), AtomNode("b"), AtomNode("c")]))
        self.expect("a OR b or c",
                    OrNode([AtomNode("a"), AtomNode("b"), AtomNode("c")]))
        # AND binds tighter than OR.
        self.expect("a AND b OR c AnD d",
                    OrNode([AndNode([AtomNode("a"), AtomNode("b")]),
                            AndNode([AtomNode("c"), AtomNode("d")])]))
        self.expect("(a OR b) AND (c OR d)",
                    AndNode([OrNode([AtomNode("a"), AtomNode("b")]),
                             OrNode([AtomNode("c"), AtomNode("d")])]))
        self.expect("a AND not b",
                    AndNode([AtomNode("a"), NotNode(AtomNode("b"))]))
        # Quoted text becomes a phrase; bare juxtaposition means AND.
        self.expect('"foo bar"', PhraseNode("foo bar"))
        self.expect("foo bar",
                    AndNode([AtomNode("foo"), AtomNode("bar")]))
        # NOTE: the stray trailing quote in the input below is deliberate.
        self.expect('(("foo bar"))"', PhraseNode("foo bar"))
        self.expect("((foo bar))",
                    AndNode([AtomNode("foo"), AtomNode("bar")]))
        # A trailing non-word character neutralizes the operator keyword.
        self.expect('and/', AtomNode("and"))
        # Hyphenated words parse as phrases; a leading "-" means NOT.
        self.expect("foo-bar", PhraseNode("foo bar"))
        self.expect("foo -bar",
                    AndNode([AtomNode("foo"), NotNode(AtomNode("bar"))]))
        self.expect("-foo bar",
                    AndNode([AtomNode("bar"), NotNode(AtomNode("foo"))]))
        self.expect("booh -foo-bar",
                    AndNode([AtomNode("booh"),
                             NotNode(PhraseNode("foo bar"))]))
        self.expect('booh -"foo bar"',
                    AndNode([AtomNode("booh"),
                             NotNode(PhraseNode("foo bar"))]))
        # Adjacent quoted/unquoted text splits into ANDed atoms.
        self.expect('foo"bar"',
                    AndNode([AtomNode("foo"), AtomNode("bar")]))
        self.expect('"foo"bar',
                    AndNode([AtomNode("foo"), AtomNode("bar")]))
        self.expect('foo"bar"blech',
                    AndNode([AtomNode("foo"), AtomNode("bar"),
                             AtomNode("blech")]))
        # Trailing "*" makes a glob node.
        self.expect("foo*", GlobNode("foo*"))
        self.expect("foo* bar",
                    AndNode([GlobNode("foo*"), AtomNode("bar")]))

    def testParseFailures(self):
        # Empty input, dangling operators and bare negations must all
        # raise ParseError.
        self.failure("")
        self.failure("not")
        self.failure("OR")
        self.failure("AND")
        self.failure("not foo")
        self.failure(")")
        self.failure("(")
        self.failure("foo OR")
        self.failure("foo AND")
        self.failure("OR foo")
        self.failure("and foo")
        self.failure("(foo) bar")
        self.failure("(foo OR)")
        self.failure("(foo AND)")
        self.failure("(NOT foo)")
        self.failure("-foo")
        self.failure("-foo -bar")
        self.failure('""')
def test_suite():
    """Collect this module's TestQueryParser case into a suite."""
    return makeSuite(TestQueryParser)


if __name__ == "__main__":
    main(defaultTest='test_suite')
lib/python/Products/ZCTextIndex/tests/testZCTextIndex.py
0 → 100644
View file @
61e89f2f
from
Products.ZCTextIndex.ZCTextIndex
import
ZCTextIndex
from
Products.ZCTextIndex.tests
\
import
testIndex
,
testQueryEngine
,
testQueryParser
from
Products.ZCTextIndex.Index
import
scaled_int
,
SCALE_FACTOR
from
Products.ZCTextIndex.Lexicon
import
Lexicon
,
Splitter
from
Products.ZCTextIndex.Lexicon
import
CaseNormalizer
,
StopWordRemover
import
unittest
class Indexable:
    """Minimal document object exposing a ``text`` attribute to index."""

    def __init__(self, text):
        self.text = text
class LexiconHolder:
    """Container exposing a ``lexicon`` attribute, as ZCTextIndex expects."""

    def __init__(self, lexicon):
        self.lexicon = lexicon
class Extra:
    """Bare attribute bag standing in for the 'extra' record ZCTextIndex reads."""
    pass
# The tests classes below create a ZCTextIndex(). Then they create
# instance variables that point to the internal components used by
# ZCTextIndex. These tests run the individual module unit tests with
# the fully integrated ZCTextIndex.
def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
    """Assert two scaled-int values are within epsilon of each other.

    Raises AssertionError with both values when they differ by more
    than epsilon.
    """
    if abs(scaled1 - scaled2) > epsilon:
        # The original used the Python-2-only tuple form
        # ``raise AssertionError, msg``; the call form below is
        # equivalent and valid in both Python 2 and 3.
        raise AssertionError("%s != %s" % (scaled1, scaled2))
class IndexTests(testIndex.IndexTest):
    """Run the Index unit tests against a fully assembled ZCTextIndex,
    plus ranking checks based on the worked example in Managing
    Gigabytes, pp. 180-188."""

    def setUp(self):
        # Build a real ZCTextIndex wired to a lexicon with case folding
        # and stop-word removal, then expose its internals so the
        # inherited tests can poke at them.
        extra = Extra()
        extra.doc_attr = 'text'
        extra.lexicon_id = 'lexicon'
        caller = LexiconHolder(Lexicon(Splitter(), CaseNormalizer(),
                               StopWordRemover()))
        self.zc_index = ZCTextIndex('name', extra, caller)
        self.index = self.zc_index.index
        self.lexicon = self.zc_index.lexicon

    def testStopWords(self):
        # the only non-stopword is question
        text = ("to be or not to be "
                "that is the question")
        doc = Indexable(text)
        self.zc_index.index_object(1, doc)
        # Every stop word must have been dropped before id assignment.
        for word in text.split():
            if word != "question":
                wids = self.lexicon.termToWordIds(word)
                self.assertEqual(wids, [])
        self.assertEqual(len(self.index._get_undoinfo(1)), 1)

    def testRanking(self):
        # A fairly involved test of the ranking calculations based on
        # an example set of documents in queries in Managing
        # Gigabytes, pp. 180-188.
        self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
                      "pease", "porridge", "pot"]
        self._ranking_index()
        self._ranking_tf()
        self._ranking_idf()
        self._ranking_queries()

    def _ranking_index(self):
        # Index the six example documents as docids 1..6.
        docs = ["Pease porridge hot, pease porridge cold,",
                "Pease porridge in the pot,",
                "Nine days old.",
                "In the pot cold, in the pot hot,",
                "Pease porridge, pease porridge,",
                "Eat the lot."]
        for i in range(len(docs)):
            self.zc_index.index_object(i + 1, Indexable(docs[i]))

    def _ranking_tf(self):
        # matrix of term weights for the rows are docids
        # and the columns are indexes into this list:
        # (self.words, set up in testRanking)
        l_wdt = [(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
                 (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
                 (0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
                 (1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7),
                 (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
                 (0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
        # Expected per-document weight (vector length) for docids 1..6.
        l_Wd = [2.78, 1.73, 1.73, 2.21, 2.39, 1.41]

        for i in range(len(l_Wd)):
            docid = i + 1
            scaled_Wd = scaled_int(l_Wd[i])
            eq(scaled_Wd, self.index._get_Wd(docid))
            wdts = [scaled_int(t) for t in l_wdt[i]]
            for j in range(len(wdts)):
                wdt = self.index._get_wdt(docid, self.words[j])
                eq(wdts[j], wdt)

    def _ranking_idf(self):
        # Expected document frequency and inverse-document-frequency
        # weight for each word in self.words.
        word_freqs = [2, 1, 1, 2, 1, 1, 1, 3, 3, 2]
        idfs = [1.39, 1.95, 1.95, 1.39, 1.95, 1.95, 1.95, 1.10, 1.10, 1.39]
        for i in range(len(self.words)):
            word = self.words[i]
            eq(word_freqs[i], self.index._get_ft(word))
            eq(scaled_int(idfs[i]), self.index._get_wt(word))

    def _ranking_queries(self):
        # Expected query weights and (docid, cosine score) results for
        # each sample query.
        queries = ["eat", "porridge", "hot OR porridge",
                   "eat OR nine OR day OR old OR porridge"]
        wqs = [1.95, 1.10, 1.77, 3.55]
        results = [[(6, 0.71)],
                   [(1, 0.61), (2, 0.58), (5, 0.71)],
                   [(1, 0.66), (2, 0.36), (4, 0.36), (5, 0.44)],
                   [(1, 0.19), (2, 0.18), (3, 0.63), (5, 0.22), (6, 0.39)]]
        for i in range(len(queries)):
            raw = queries[i]
            q = self.zc_index.parser.parseQuery(raw)
            wq = self.index.query_weight(q.terms())
            eq(wq, scaled_int(wqs[i]))
            r = self.zc_index.query(raw)
            self.assertEqual(len(r), len(results[i]))
            # convert the results to a dict for each checking
            d = {}
            for doc, score in results[i]:
                d[doc] = scaled_int(score)
            # Normalize each returned score by the query weight before
            # comparing, and check it lies in [0, SCALE_FACTOR].
            for doc, score in r:
                score = scaled_int(float(score / SCALE_FACTOR) / wq)
                self.assert_(0 <= score <= SCALE_FACTOR)
                eq(d[doc], score)
class QueryTests(testQueryEngine.TestQueryEngine,
                 testQueryParser.TestQueryParser):
    """Run the query-engine and query-parser tests against a real
    ZCTextIndex populated with documents matching the FauxIndex."""

    # The FauxIndex in testQueryEngine contains four documents.
    # docid 1: foo, bar, ham
    # docid 2: bar, ham
    # docid 3: foo, ham
    # docid 4: ham
    docs = ["foo bar ham", "bar ham", "foo ham", "ham"]

    def setUp(self):
        extra = Extra()
        extra.doc_attr = 'text'
        extra.lexicon_id = 'lexicon'
        caller = LexiconHolder(Lexicon(Splitter(), CaseNormalizer(),
                               StopWordRemover()))
        self.zc_index = ZCTextIndex('name', extra, caller)
        self.p = self.parser = self.zc_index.parser
        self.index = self.zc_index.index
        self.add_docs()

    def add_docs(self):
        """Index each entry of self.docs as docid i + 1."""
        for position in range(len(self.docs)):
            body = self.docs[position]
            self.zc_index.index_object(position + 1, Indexable(body))

    def compareSet(self, set, dict):
        # XXX The FauxIndex and the real Index score documents very
        # differently. The set comparison can't actually compare the
        # items, but it can compare the keys. That will have to do for now.
        actual = {}
        for key, value in set.items():
            actual[key] = value
        self.assertEqual(actual.keys(), dict.keys())
def test_suite():
    """Aggregate the index and query test cases into one suite."""
    suite = unittest.TestSuite()
    for case in IndexTests, QueryTests:
        suite.addTest(unittest.makeSuite(case))
    return suite


if __name__ == '__main__':
    unittest.main(defaultTest='test_suite')
lib/python/Products/ZCTextIndex/tests/wordstats.py
0 → 100644
View file @
61e89f2f
#! /usr/bin/env python
"""Dump statistics about each word in the index.
usage: wordstats.py data.fs [index key]
"""
import
ZODB
from
ZODB.FileStorage
import
FileStorage
def main(fspath, key):
    """Print word and per-document score statistics for the text index
    stored at ``root()[key]`` in the FileStorage at ``fspath``.

    Assumes the stored object exposes ``lexicon`` and ``index``
    attributes (as ZCTextIndex does) -- TODO confirm for other indexes.
    """
    # Open the storage read-only; we never commit.
    fs = FileStorage(fspath, read_only=1)
    db = ZODB.DB(fs)
    rt = db.open().root()
    index = rt[key]

    lex = index.lexicon
    idx = index.index
    print "Words", lex.length()
    print "Documents", idx.length()
    print "Word frequencies: count, word, wid"
    for word, wid in lex.items():
        docs = idx._wordinfo[wid]
        print len(docs), word, wid

    print "Per-doc scores: wid, (doc, score,)+"
    for wid in lex.wids():
        # Trailing commas keep each wid's scores on one output line
        # (Python 2 print statement semantics).
        print wid,
        docs = idx._wordinfo[wid]
        for docid, score in docs.items():
            print docid, score,
        print
if
__name__
==
"__main__"
:
import
sys
args
=
sys
.
argv
[
1
:]
index_key
=
"index"
if
len
(
args
)
==
1
:
fspath
=
args
[
0
]
elif
len
(
args
)
==
2
:
fspath
,
index_key
=
args
else
:
print
"Expected 1 or 2 args, got"
,
len
(
args
)
main
(
fspath
,
index_key
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment