Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Z
Zope
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
Zope
Commits
3095fce7
Commit
3095fce7
authored
Apr 14, 2004
by
Andreas Jung
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
removed 'SearchIndex' package
parent
2dc887a3
Changes
16
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
2 additions
and
3076 deletions
+2
-3076
doc/CHANGES.txt
doc/CHANGES.txt
+2
-0
lib/python/SearchIndex/.testinfo
lib/python/SearchIndex/.testinfo
+0
-1
lib/python/SearchIndex/GlobbingLexicon.py
lib/python/SearchIndex/GlobbingLexicon.py
+0
-245
lib/python/SearchIndex/Index.py
lib/python/SearchIndex/Index.py
+0
-248
lib/python/SearchIndex/Lexicon.py
lib/python/SearchIndex/Lexicon.py
+0
-202
lib/python/SearchIndex/PluggableIndex.py
lib/python/SearchIndex/PluggableIndex.py
+0
-74
lib/python/SearchIndex/README.txt
lib/python/SearchIndex/README.txt
+0
-2
lib/python/SearchIndex/ResultList.py
lib/python/SearchIndex/ResultList.py
+0
-94
lib/python/SearchIndex/Setup
lib/python/SearchIndex/Setup
+0
-2
lib/python/SearchIndex/Splitter.c
lib/python/SearchIndex/Splitter.c
+0
-427
lib/python/SearchIndex/TextIndex.py
lib/python/SearchIndex/TextIndex.py
+0
-563
lib/python/SearchIndex/UnIndex.py
lib/python/SearchIndex/UnIndex.py
+0
-393
lib/python/SearchIndex/UnKeywordIndex.py
lib/python/SearchIndex/UnKeywordIndex.py
+0
-92
lib/python/SearchIndex/UnTextIndex.py
lib/python/SearchIndex/UnTextIndex.py
+0
-689
lib/python/SearchIndex/__init__.py
lib/python/SearchIndex/__init__.py
+0
-25
lib/python/SearchIndex/randid.py
lib/python/SearchIndex/randid.py
+0
-19
No files found.
doc/CHANGES.txt
View file @
3095fce7
...
@@ -24,6 +24,8 @@ Zope Changes
...
@@ -24,6 +24,8 @@ Zope Changes
Features added
Features added
- The obsolete 'SearchIndex' package has been removed
- Traversal now supports a "post traversal hook" that gets run
- Traversal now supports a "post traversal hook" that gets run
after traversal finished and the security context is established.
after traversal finished and the security context is established.
...
...
lib/python/SearchIndex/.testinfo
deleted
100644 → 0
View file @
2dc887a3
# Nothing to see here (deprecated module).
lib/python/SearchIndex/GlobbingLexicon.py
deleted
100644 → 0
View file @
2dc887a3
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
#############################################################################
from
Lexicon
import
Lexicon
from
Splitter
import
Splitter
from
UnTextIndex
import
Or
import
re
,
string
from
BTrees.IIBTree
import
IISet
,
union
,
IITreeSet
from
BTrees.OIBTree
import
OIBTree
from
BTrees.IOBTree
import
IOBTree
from
BTrees.OOBTree
import
OOBTree
from
randid
import
randid
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching. They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => set of word_ids

    Before going further, it is necessary to understand what a digram is,
    as it is a core component of the structure of this lexicon.  A digram
    is a two-letter sequence in a word.  For example, the word 'zope'
    would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where the '$' is a word marker.  It is used at the beginning and end
    of the words.  Those digrams are significant.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'

    def __init__(self):
        self.clear()

    def clear(self):
        """Reset the lexicon to three empty mappings."""
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

    def _convertBTrees(self, threshold=200):
        """Convert old-style mappings to BTrees.

        Delegates the word mappings to 'Lexicon._convertBTrees' and then
        converts '_digrams' unless it already is an OOBTree.
        """
        Lexicon._convertBTrees(self, threshold)
        if type(self._digrams) is OOBTree:
            return

        from BTrees.convert import convert

        old_digrams = self._digrams
        self._digrams = OOBTree()
        self._digrams._p_jar = self._p_jar
        convert(old_digrams, self._digrams, threshold, IITreeSet)

    def createDigrams(self, word):
        """Returns a list with the set of digrams in the word."""
        digrams = list(word)
        digrams.append(self.eow)
        last = self.eow

        # Each slot becomes "previous character + current character"; the
        # first slot is prefixed with the word marker.
        for i in range(len(digrams)):
            last, digrams[i] = digrams[i], last + digrams[i]

        return digrams

    def getWordId(self, word):
        """Provided 'word', return the matching integer word id.

        A word that is not in the lexicon yet is assigned a new id.
        """
        if self._lexicon.has_key(word):
            return self._lexicon[word]
        return self.assignWordId(word)

    set = getWordId         # Kludge for old code

    def getWord(self, wid):
        """Return the word stored for 'wid', or None if there is none."""
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word, and return it."""
        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        # Get word id. BBB Backward compat pain.
        inverse = self._inverseLex
        if hasattr(inverse, 'insert'):
            # we have a "new" IOBTree object: draw random ids until one
            # is unused (insert() returns false on a collision).
            wid = randid()
            while not inverse.insert(wid, word):
                wid = randid()
        else:
            # we have an "old" BTree object
            if inverse:
                wid = inverse.keys()[-1] + 1
            else:
                # Rebind the local name as well, so that the word is
                # stored in the new tree and not in the discarded one.
                inverse = self._inverseLex = IOBTree()
                wid = 1
            inverse[wid] = word

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.
        for digram in self.createDigrams(word):
            wids = self._digrams.get(digram, None)
            if wids is None:
                self._digrams[digram] = wids = IISet()
            wids.insert(wid)

        return wid

    def get(self, pattern):
        """Query the lexicon for words matching a pattern.

        Returns '()' when nothing matches, a one-element tuple '(wid,)'
        for a pattern without wildcards, and an IISet of word ids for a
        wildcard pattern.  A pattern made only of wildcards produces no
        digrams and therefore returns '()'.
        """
        wc_set = [self.multi_wc, self.single_wc]

        digrams = []
        globbing = 0
        for i in range(len(pattern)):
            if pattern[i] in wc_set:
                globbing = 1
                continue

            if i == 0:
                digrams.insert(i, self.eow + pattern[i])
                # A one-character pattern has no following character;
                # indexing pattern[1] used to raise IndexError here.
                if len(pattern) > 1:
                    digrams.append(pattern[i] + pattern[i + 1])
            else:
                try:
                    if pattern[i + 1] not in wc_set:
                        digrams.append(pattern[i] + pattern[i + 1])
                except IndexError:
                    # Last character of the pattern: close the word.
                    digrams.append(pattern[i] + self.eow)

        if not globbing:
            result = self._lexicon.get(pattern, None)
            if result is None:
                return ()
            return (result, )

        ## now get all of the intsets that contain the result digrams
        result = None
        for digram in digrams:
            result = union(result, self._digrams.get(digram, None))

        if not result:
            return ()

        ## now we have narrowed the list of possible candidates
        ## down to those words which contain digrams.  However,
        ## some words may have been returned that match digrams,
        ## but do not match 'pattern'.  This is because some words
        ## may contain all matching digrams, but in the wrong
        ## order.
        expr = re.compile(self.createRegex(pattern))
        hits = IISet()
        for x in result:
            if expr.match(self._inverseLex[x]):
                hits.insert(x)
        return hits

    def __getitem__(self, word):
        """Same as 'get(word)'."""
        return self.get(word)

    def query_hook(self, q):
        """expand wildcards

        Walks the query list 'q' from the end, recursing into nested
        lists, and replaces each element containing a wildcard with a
        list of matching word ids separated by 'Or'.  'q' is modified
        in place and returned.
        """
        ListType = type([])
        i = len(q) - 1
        while i >= 0:
            e = q[i]
            if isinstance(e, ListType):
                self.query_hook(e)
            elif (self.multi_wc in e) or (self.single_wc in e):
                words = []
                for wid in self.get(e):
                    if words:
                        words.append(Or)
                    words.append(wid)
                if not words:
                    # if words is empty, return something that will make
                    # textindex's __getitem__ return an empty result list
                    words.append('')
                q[i] = words
            i = i - 1

        return q

    def Splitter(self, astring, words=None):
        """ wrap the splitter """
        ## don't do anything, less efficient but there's not much
        ## sense in stemming a globbing lexicon.
        return Splitter(astring)

    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """
        # Remove characters that are meaningful in a regex
        # NOTE(review): '[', ']' and '+' are not in the deletion set, so
        # they reach re.compile unchanged -- confirm whether intended.
        transTable = string.maketrans("", "")
        result = string.translate(pat, transTable,
                                  r'()&|!@#$%^{}\<>.')

        # First, deal with multi-character globbing
        result = string.replace(result, '*', '.*')

        # Next, we need to deal with single-character globbing
        result = string.replace(result, '?', '.')

        return "%s$" % result
lib/python/SearchIndex/Index.py
deleted
100644 → 0
View file @
2dc887a3
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Simple column indices"""
# Slice the bare revision number ('1.31') out of the RCS keyword string.
__version__ = '$Revision: 1.31 $'[11:-2]
from
Persistence
import
Persistent
from
BTrees.OOBTree
import
OOBTree
from
BTrees.IIBTree
import
IITreeSet
import
operator
from
Missing
import
MV
import
string
# Type objects used for exact type checks in this module.
ListType = list
StringType = str


def nonEmpty(s):
    """Return 0 for an empty plain string and 1 for anything else.

    Only objects whose exact type is 'StringType' are tested for
    emptiness; every other object counts as non-empty.
    """
    if type(s) is StringType and not s:
        return 0
    return 1
class Index(Persistent):
    """Index object interface"""

    isDeprecatedIndex = 1

    def __init__(self, data=None, schema=None, id=None,
                 ignore_ex=None, call_methods=None):
        """Create an index

        The arguments are:

          'data' -- a mapping from integer object ids to objects or
          records,

          'schema' -- a mapping from item name to index into data
          records.  If 'data' is a mapping to objects, then schema
          should be 'None'.

          'id' -- the name of the item attribute to index.  This is
          either an attribute name or a record key.

          'ignore_ex', 'call_methods' -- stored on the instance;
          'call_methods' makes 'index_item' and 'unindex_item' call
          the looked-up value.
        """
        # For b/w compatibility, __init__ may be called with zero
        # arguments; in that case nothing is set up at all.
        if not data == schema == id == ignore_ex == call_methods == None:
            self._data = data
            self._schema = schema
            self.id = id
            self.ignore_ex = ignore_ex
            self.call_methods = call_methods
            self._index = OOBTree()
            self._reindex()

    # for b/w compatibility
    _init = __init__

    def dpHasUniqueValuesFor(self, name):
        ' has unique values for column NAME '
        if name == self.id:
            return 1
        return 0

    def dpUniqueValues(self, name=None, withLengths=0):
        """Return the unique values for 'name' as a tuple.

        'name' defaults to this index's id; any other name returns [].
        If 'withLengths' is true, returns a sequence of tuples of
        (value, length).  Empty-string keys are left out.
        """
        if name is None:
            name = self.id
        elif name != self.id:
            return []

        if not withLengths:
            return tuple(filter(nonEmpty, self._index.keys()))

        rl = []
        for key in self._index.keys():
            if nonEmpty(key):
                rl.append((key, len(self._index[key])))
        return tuple(rl)

    def clear(self):
        """Drop all index data."""
        self._index = OOBTree()

    def _reindex(self, start=0):
        """Recompute index data for data with ids >= start."""
        index = self._index
        get = index.get

        if not start:
            index.clear()

        name = self.id
        if self._schema is None:
            f = getattr
        else:
            # Records are sequences: translate the name into a position.
            f = operator.__getitem__
            name = self._schema[name]

        for i, row in self._data.items(start):
            k = f(row, name)

            if k is None or k == MV:
                continue

            docids = get(k)
            if docids is None:
                index[k] = docids = IITreeSet()
            docids.insert(i)

    def index_item(self, i, obj=None):
        """Add item 'i' to the index.

        The value is read from 'obj' when given, otherwise from
        'self._data[i]'.  Nothing is indexed when the value cannot be
        read, or when it is None or the missing value 'MV'.
        """
        index = self._index

        name = self.id
        if (self._schema is None) or (obj is not None):
            f = getattr
        else:
            f = operator.__getitem__
            name = self._schema[name]

        if obj is None:
            obj = self._data[i]

        # Deliberately best-effort: any failure to read the value means
        # the item is silently left out of the index.
        try:
            k = f(obj, name)
        except:
            return

        if self.call_methods:
            k = k()

        if k is None or k == MV:
            return

        docids = index.get(k)
        if docids is None:
            index[k] = docids = IITreeSet()
        docids.insert(i)

    def unindex_item(self, i, obj=None):
        """Remove item 'i' from the index.

        The value is read from 'obj' when given, otherwise from
        'self._data[i]'.  Nothing happens when the value cannot be
        read, is None or 'MV', or has no entry in the index.
        """
        index = self._index

        name = self.id
        if self._schema is None:
            f = getattr
        else:
            f = operator.__getitem__
            name = self._schema[name]

        if obj is None:
            obj = self._data[i]

        # Deliberately best-effort, see index_item.
        try:
            k = f(obj, name)
        except:
            return

        if self.call_methods:
            k = k()

        if k is None or k == MV:
            return

        docids = index.get(k)
        if docids is not None:
            docids.remove(i)

    def _apply_index(self, request, cid=''):
        """Apply the index to query parameters given in the argument,
        request

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then
        None is returned.

        If the request contains a parameter with the name of the
        column + '_usage', it is sniffed for information on how to
        handle applying the index.

        Otherwise two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.
        """
        # These two names are used below but are not imported at module
        # level; import them here so the method works when called.
        from BTrees.IIBTree import IISet, weightedUnion

        name = self.id              # name of the column

        cidid = "%s/%s" % (cid, name)
        has_key = request.has_key
        if has_key(cidid):
            keys = request[cidid]
        elif has_key(name):
            keys = request[name]
        else:
            return None

        if type(keys) is not ListType:
            keys = [keys]
        index = self._index
        r = None
        anyTrue = 0
        opr = None

        if request.has_key(name + '_usage'):
            # see if any usage params are sent to field
            opr = string.split(string.lower(request[name + "_usage"]), ':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":
            if 'min' in opr_args:
                lo = min(keys)
            else:
                lo = None
            if 'max' in opr_args:
                hi = max(keys)
            else:
                hi = None

            anyTrue = 1
            try:
                # Test against None, not truthiness, so that an upper
                # bound such as 0 is still honoured.
                if hi is not None:
                    setlist = index.items(lo, hi)
                else:
                    setlist = index.items(lo)

                for k, docids in setlist:
                    w, r = weightedUnion(r, docids)
            except KeyError:
                pass
        else:
            # not a range
            get = index.get
            for key in keys:
                if key:
                    anyTrue = 1
                docids = get(key)
                if docids is not None:
                    w, r = weightedUnion(r, docids)

        if r is None:
            if anyTrue:
                r = IISet()
            else:
                return None

        return r, (name,)
lib/python/SearchIndex/Lexicon.py
deleted
100644 → 0
View file @
2dc887a3
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
# Module documentation, bound explicitly to __doc__.
__doc__ = """ Module breaks out Zope specific methods and behavior. In
addition, provides the Lexicon class which defines a word to integer
mapping.
"""
from
Splitter
import
Splitter
from
Persistence
import
Persistent
from
Acquisition
import
Implicit
from
BTrees.OIBTree
import
OIBTree
from
BTrees.IOBTree
import
IOBTree
from
BTrees.IIBTree
import
IISet
,
IITreeSet
from
randid
import
randid
class Lexicon(Persistent, Implicit):
    """Maps words to word ids and then some

    The Lexicon object is an attempt to abstract vocabularies out of
    Text indexes.  This abstraction is not totally cooked yet, this
    module still includes the parser for the 'Text Index Query
    Language' and a few other hacks.
    """

    # default for older objects
    stop_syn = {}

    def __init__(self, stop_syn=None):
        self.clear()
        if stop_syn is None:
            self.stop_syn = {}
        else:
            self.stop_syn = stop_syn

    def clear(self):
        """Reset both word mappings to empty BTrees."""
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()

    def _convertBTrees(self, threshold=200):
        """Convert old-style word mappings to OIBTree / IOBTree.

        Does nothing when both mappings already have the new types.
        """
        if (type(self._lexicon) is OIBTree and
            type(getattr(self, '_inverseLex', None)) is IOBTree):
            return

        from BTrees.convert import convert

        lexicon = self._lexicon
        self._lexicon = OIBTree()
        self._lexicon._p_jar = self._p_jar
        convert(lexicon, self._lexicon, threshold)

        try:
            inverseLex = self._inverseLex
            self._inverseLex = IOBTree()
        except AttributeError:
            # older lexicons didn't have an inverse lexicon
            # NOTE(review): source and target are the same empty tree
            # here, so the inverse mapping stays empty -- confirm.
            self._inverseLex = IOBTree()
            inverseLex = self._inverseLex

        self._inverseLex._p_jar = self._p_jar
        convert(inverseLex, self._inverseLex, threshold)

    def set_stop_syn(self, stop_syn):
        """ pass in a mapping of stopwords and synonyms.  Format is:

        {'word' : [syn1, syn2, ..., synx]}

        Vocabularies do not necessarily need to implement this if their
        splitters do not support stemming or stopping.
        """
        self.stop_syn = stop_syn

    def getWordId(self, word):
        """ return the word id of 'word', assigning one if needed """
        wid = self._lexicon.get(word, None)
        if wid is None:
            wid = self.assignWordId(word)
        return wid

    set = getWordId

    def getWord(self, wid):
        """ post-2.3.1b2 method, will not work with unconverted lexicons """
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word and returns it."""
        # First make sure it's not already in there
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        try:
            inverse = self._inverseLex
        except AttributeError:
            # whoops, old lexicon without wids: rebuild the inverse map.
            # The loop uses its own names; reusing 'word' here would
            # clobber the argument and register the new id under the
            # wrong word.
            inverse = self._inverseLex = IOBTree()
            for known_word, known_wid in self._lexicon.items():
                inverse[known_wid] = known_word

        # Draw random ids until one is unused (insert() returns false
        # on a collision).
        wid = randid()
        while not inverse.insert(wid, word):
            wid = randid()

        self._lexicon[intern(word)] = wid
        return wid

    def get(self, key, default=None):
        """Return an IISet holding the word id stored for 'key'.

        The set is empty when the lookup yields None; a non-None
        'default' is inserted into the set when 'key' is unknown.
        """
        r = IISet()
        wid = self._lexicon.get(key, default)
        if wid is not None:
            r.insert(wid)
        return r

    def __getitem__(self, key):
        return self.get(key)

    def __len__(self):
        return len(self._lexicon)

    def Splitter(self, astring, words=None):
        """ wrap the splitter; 'words' defaults to 'self.stop_syn' """
        if words is None:
            words = self.stop_syn
        return Splitter(astring, words)

    def query_hook(self, q):
        """ we don't want to modify the query cuz we're dumb """
        return q
# Words that carry no search value.  The sequence is kept exactly as it
# was written, including the repeated entries.
stop_words = (
    'am', 'ii', 'iii', 'per', 'po', 're',
    'a', 'about', 'above', 'across', 'after', 'afterwards', 'again',
    'against', 'all', 'almost', 'alone', 'along', 'already', 'also',
    'although', 'always',
    'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and',
    'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway',
    'anywhere', 'are', 'around', 'as', 'at',
    'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming',
    'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside',
    'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by',
    'can', 'cannot', 'cant', 'con', 'could', 'couldnt', 'cry',
    'describe', 'detail', 'do', 'done', 'down', 'due', 'during',
    'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere',
    'empty', 'enough', 'even', 'ever', 'every', 'everyone', 'everything',
    'everywhere', 'except',
    'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five',
    'for', 'former', 'formerly', 'forty', 'found', 'four', 'from',
    'front', 'full', 'further',
    'get', 'give', 'go',
    'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her', 'here',
    'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself',
    'him', 'himself', 'his', 'how', 'however', 'hundred',
    'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is',
    'it', 'its', 'itself',
    'keep', 'last', 'latter', 'latterly', 'least', 'less',
    'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine',
    'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my',
    'myself',
    'name', 'namely', 'neither', 'never', 'nevertheless', 'next', 'nine',
    'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now',
    'nowhere',
    'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or',
    'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out',
    'over', 'own',
    'per', 'perhaps', 'please', 'pre', 'put', 'rather', 're',
    'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious',
    'several', 'she', 'should', 'show', 'side', 'since', 'sincere',
    'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something',
    'sometime', 'sometimes', 'somewhere', 'still', 'such',
    'take', 'ten', 'than', 'that', 'the', 'their', 'them', 'themselves',
    'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore',
    'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third',
    'this', 'those', 'though', 'three', 'through', 'throughout', 'thru',
    'thus', 'to', 'together', 'too', 'toward', 'towards', 'twelve',
    'twenty', 'two',
    'un', 'under', 'until', 'up', 'upon', 'us',
    'very', 'via',
    'was', 'we', 'well', 'were', 'what', 'whatever', 'when', 'whence',
    'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein',
    'whereupon', 'wherever', 'whether', 'which', 'while', 'whither',
    'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with',
    'within', 'without', 'would',
    'yet', 'you', 'your', 'yours', 'yourself', 'yourselves',
    )

# The same words as dictionary keys, each mapped to None.
stop_word_dict = {}
for word in stop_words:
    stop_word_dict.setdefault(word)
lib/python/SearchIndex/PluggableIndex.py
deleted
100644 → 0
View file @
2dc887a3
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Pluggable Index Base Class """
# Slice the bare revision number ('1.4') out of the RCS keyword string.
__version__ = '$Revision: 1.4 $'[11:-2]
import
Interface
class PluggableIndex:
    """Base pluggable index class.

    Every method here has an empty body and returns None; the
    docstrings describe what an index is expected to do.
    """

    def getEntryForObject(self, documentId, default=None):
        """Get all information contained for a specific object by
        documentId."""

    def index_object(self, documentId, obj, threshold=None):
        """Index an object.

        'documentId' is the integer ID of the document.

        'obj' is the object to be indexed.

        'threshold' is the number of words to process between committing
        subtransactions.  If None, subtransactions are disabled.
        """

    def unindex_object(self, documentId):
        """Remove the documentId from the index."""

    def uniqueValues(self, name=None, withLengths=0):
        """Return the unique values for name.

        If 'withLengths' is true, return a sequence of tuples of
        (value, length).
        """

    def _apply_index(self, request, cid=''):
        """Apply the index to query parameters given in the argument,
        request.

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then
        None is returned.

        If the request contains a parameter with the name of the column
        + "_usage", it is sniffed for information on how to handle
        applying the index.

        Otherwise two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching records.
        The second object is a tuple containing the names of all data
        fields used.
        """
# Derive an interface object from the class itself, then record on the
# class that it implements that interface.
PluggableIndexInterface = Interface.impliedInterface(PluggableIndex)
PluggableIndex.__implements__ = PluggableIndexInterface
lib/python/SearchIndex/README.txt
deleted
100644 → 0
View file @
2dc887a3
The SearchIndex package has been deprecated since Zope 2.4.
Use the refactored modules in Products/PluginIndexes instead.
lib/python/SearchIndex/ResultList.py
deleted
100644 → 0
View file @
2dc887a3
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from
BTrees.IIBTree
import
IIBucket
from
BTrees.IIBTree
import
weightedIntersection
,
weightedUnion
,
difference
from
BTrees.OOBTree
import
OOSet
,
union
class ResultList:
    """Scored search result supporting and / or / and_not / near.

    Wraps an IIBucket of scores ('_dict'), the OOSet of words that
    produced it ('_words') and the index it came from ('_index').
    """

    def __init__(self, d, words, index, TupleType=type(())):
        """Build a result list.

        'd' may be an IIBucket, a single tuple (wrapped into a one-item
        bucket) or anything else IIBucket accepts.  'words' is turned
        into an OOSet unless it already is one.
        """
        self._index = index

        if type(words) is not OOSet:
            words = OOSet(words)
        self._words = words

        if type(d) is TupleType:
            d = IIBucket((d,))
        elif type(d) is not IIBucket:
            d = IIBucket(d)

        self._dict = d
        self.__getitem__ = d.__getitem__
        # Use the bucket's own truth test when it provides one; the
        # class-level __nonzero__ below is the fallback.
        try:
            self.__nonzero__ = d.__nonzero__
        except AttributeError:
            pass
        self.get = d.get

    def __nonzero__(self):
        return not not self._dict

    def bucket(self):
        """Return the underlying bucket."""
        return self._dict

    def keys(self):
        return self._dict.keys()

    def has_key(self, key):
        return self._dict.has_key(key)

    def items(self):
        return self._dict.items()

    def __and__(self, x):
        """Weighted intersection of both results; words are merged."""
        return self.__class__(
            weightedIntersection(self._dict, x._dict)[1],
            union(self._words, x._words),
            self._index,
            )

    def and_not(self, x):
        """Entries of this result that are not in 'x'."""
        return self.__class__(
            difference(self._dict, x._dict),
            self._words,
            self._index,
            )

    def __or__(self, x):
        """Weighted union of both results; words are merged."""
        return self.__class__(
            weightedUnion(self._dict, x._dict)[1],
            union(self._words, x._words),
            self._index,
            )

    def near(self, x):
        """Combine two results by how close their words are.

        Only ids present in both results are kept.  For each one the
        smallest distance between a position of this result's words and
        a position of x's words is computed and the summed score is
        divided by it.  When no distance is found, the lower of the two
        scores is used.
        """
        result = IIBucket()
        xdict = x._dict
        xhas = xdict.has_key
        positions = self._index.positions

        for docid, score in self._dict.items():
            if not xhas(docid):
                continue

            # Tag every position with its origin: 0 = self, 1 = x.
            p = ([(pos, 0) for pos in positions(docid, self._words)] +
                 [(pos, 1) for pos in positions(docid, x._words)])
            p.sort()

            d = lp = 9999
            li = None
            lsrc = None
            for i, src in p:
                # Compare by value.  Identity tests on integers are
                # unreliable and could let equal positions through,
                # giving a distance of 0 and a division by zero below.
                if li is not None and i != li and src != lsrc:
                    d = min(d, i - li)
                li = i
                lsrc = src

            if d == lp:
                score = min(score, xdict[docid])  # synonyms
            else:
                score = (score + xdict[docid]) / d
            result[docid] = score

        return self.__class__(
            result, union(self._words, x._words), self._index)
lib/python/SearchIndex/Setup
deleted
100644 → 0
View file @
2dc887a3
*shared*
Splitter Splitter.c
lib/python/SearchIndex/Splitter.c
deleted
100644 → 0
View file @
2dc887a3
/*****************************************************************************
Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/
#include "Python.h"
#include <ctype.h>
#define ASSIGN(V,E) {PyObject *__e; __e=(E); Py_XDECREF(V); (V)=__e;}
#define UNLESS(E) if(!(E))
#define UNLESS_ASSIGN(V,E) ASSIGN(V,E) UNLESS(V)
/* Instance layout of a Splitter object. */
typedef struct
{
    PyObject_HEAD
    PyObject *text,      /* string being split (see Splitter_reset) */
             *synstop;   /* synonym / stop-word mapping; may be NULL */
    char *here,          /* current scan position inside the text */
         *end;           /* scan limit used by next_word */
    int index;           /* index of the last word returned; -1 after reset */
}
Splitter;
/* Forward declaration: Splitter_length calls next_word before the
   function is defined further down. */
static PyObject *next_word(Splitter *, char **, char **);
/* Rewind the splitter: no word has been returned yet and scanning
   starts again at the first character of the text. */
static void
Splitter_reset(Splitter *self)
{
    self->index = -1;
    self->here = PyString_AsString(self->text);
}
/* Destructor: release the two object references (Py_XDECREF accepts
   NULL members) and free the object itself. */
static void
Splitter_dealloc(Splitter *self)
{
    Py_XDECREF(self->text);
    Py_XDECREF(self->synstop);
    PyMem_DEL(self);
}
/* Count the words in the text by rewinding and consuming every word.
   Returns the word count, or -1 when next_word fails. */
static int
Splitter_length(Splitter *self)
{
    PyObject *word = NULL;

    Splitter_reset(self);

    for (;;) {
        PyObject *next = next_word(self, NULL, NULL);

        /* Drop the previous word before keeping the new one. */
        Py_XDECREF(word);
        word = next;

        if (word == NULL)
            return -1;

        /* A non-string result marks the end of the text. */
        if (!PyString_Check(word)) {
            Py_DECREF(word);
            break;
        }
    }

    return self->index + 1;
}
/* Concatenation is not supported: always raises TypeError. */
static PyObject *
Splitter_concat(Splitter *self, PyObject *other)
{
    PyErr_SetString(PyExc_TypeError, "Cannot concatenate Splitters.");
    return NULL;
}
/* Repetition is not supported: always raises TypeError. */
static PyObject *
Splitter_repeat(Splitter *self, long n)
{
    PyErr_SetString(PyExc_TypeError, "Cannot repeat Splitters.");
    return NULL;
}
/*
  Map an input word to an output word by applying standard
  filtering/mapping words, including synonyms/stop words.

  Input is a word.

  Output is a new reference to one of:

     None -- The word is a stop word

     sometext -- A replacement for the word (or the word itself)
 */
static PyObject *
check_synstop(Splitter *self, PyObject *word)
{
    PyObject *value;
    char *cword;
    int len;

    cword = PyString_AsString(word);
    len = PyString_Size(word);

    if (len < 2) {
        /* Single-letter words are stop words! */
        Py_INCREF(Py_None);
        return Py_None;
    }

    /* Test whether the word has any letters: scan backwards until a
       letter is found or the word is exhausted. */
    for (; --len >= 0 && !isalpha((unsigned char) cword[len]); )
        ;
    if (len < 0) {
        /* If no letters, treat it as a stop word. */
        Py_INCREF(Py_None);
        return Py_None;
    }

    Py_INCREF(word);

    if (self->synstop == NULL)
        return word;

    /* Follow the synonym chain for as long as the lookup yields
       strings.  'len' is reused as the iteration guard. */
    while ((value = PyObject_GetItem(self->synstop, word)) &&
           PyString_Check(value)) {
        ASSIGN(word, value);
        if (len++ > 100)
            break;              /* Avoid infinite recursion */
    }

    if (value == NULL) {
        /* The word has no entry: it maps to itself. */
        PyErr_Clear();
        return word;
    }

    /* Either the lookup ended in a non-string (the stop-word marker)
       or the guard fired and value is the same object as word.  In the
       first case our reference to word must be released; it used to
       be leaked here. */
    if (value != word)
        Py_DECREF(word);

    return value;
}
#define MAX_WORD 64		/* Words longer than MAX_WORD are stemmed */

/*
 * Scan forward from self->here and return a new reference to the next
 * indexable word (lower-cased, truncated to MAX_WORD characters and passed
 * through check_synstop), or None when the end of the text is reached or
 * the trailing word is a stop word.  Returns NULL on error.
 *
 * Word characters are alphanumerics, '/' and '_'.  A '-' inside a word,
 * together with any whitespace after it, is skipped so that hyphenated
 * words are joined.  Stop words are silently dropped.
 *
 * When startpos/endpos are non-NULL they receive the position of the first
 * character of the word and the position just after its last character.
 * self->index is incremented for every word returned.
 */
static PyObject *
next_word(Splitter *self, char **startpos, char **endpos)
{
    char wbuf[MAX_WORD];
    char *end, *here, *b;
    int i = 0, c;
    PyObject *pyword, *res;

    here = self->here;
    end = self->end;
    b = wbuf;

    while (here < end) {
        /* skip hyphens */
        if ((i > 0) && (*here == '-')) {
            here++;
            /* Test the bound before dereferencing so we never read past
               the end of the text. */
            while ((here < end) && isspace((unsigned char) *here))
                here++;
            continue;
        }

        c = tolower((unsigned char) *here);

        /* Check to see if this character is part of a word */
        if (isalnum((unsigned char) c) || c == '/' || c == '_') {
            /* Found a word character */
            if (startpos && i == 0)
                *startpos = here;
            if (i++ < MAX_WORD)
                *b++ = c;
        }
        else if (i != 0) {
            /* We've found the end of a word */
            if (i >= MAX_WORD)
                i = MAX_WORD;   /* "stem" the long word */

            pyword = PyString_FromStringAndSize(wbuf, i);
            if (pyword == NULL) {
                self->here = here;
                return NULL;
            }

            res = check_synstop(self, pyword);
            if (res == NULL) {
                self->here = here;
                Py_DECREF(pyword);
                return NULL;
            }

            if (res != Py_None) {
                if (endpos)
                    *endpos = here;
                self->here = here;
                Py_DECREF(pyword);
                self->index++;
                return res;
            }

            /* The word is a stopword, so ignore it */
            Py_DECREF(res);
            Py_DECREF(pyword);
            i = 0;
            b = wbuf;
        }

        here++;
    }

    self->here = here;

    /* We've reached the end of the string */
    if (i >= MAX_WORD)
        i = MAX_WORD;           /* "stem" the long word */

    if (i == 0) {
        /* No words */
        Py_INCREF(Py_None);
        return Py_None;
    }

    pyword = PyString_FromStringAndSize(wbuf, i);
    if (pyword == NULL)
        return NULL;

    if (endpos)
        *endpos = here;

    res = check_synstop(self, pyword);
    Py_DECREF(pyword);
    /* Guard against a NULL result before inspecting its type. */
    if (res != NULL && PyString_Check(res))
        self->index++;

    return res;
}
/*
 * sq_item slot: return a new reference to the i-th word of the text.
 *
 * The splitter only scans forward, so asking for a word at or before the
 * current position rewinds and rescans from the start.  Raises IndexError
 * when i is out of range.
 */
static PyObject *
Splitter_item(Splitter *self, int i)
{
    PyObject *word = NULL;

    /* Without this check a negative index skipped the loop below and
       returned NULL with no exception set. */
    if (i < 0) {
        PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
        return NULL;
    }

    if (i <= self->index)
        Splitter_reset(self);

    while (self->index < i) {
        Py_XDECREF(word);

        word = next_word(self, NULL, NULL);
        if (word == NULL)
            return NULL;

        if (word == Py_None) {
            /* Ran off the end of the text. */
            Py_DECREF(word);
            PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
            return NULL;
        }
    }

    return word;
}
/* sq_slice slot: slicing is not supported; always raises TypeError. */
static PyObject *
Splitter_slice(Splitter *self, int i, int j)
{
    PyErr_SetString(PyExc_TypeError, "Cannot slice Splitters.");
    return NULL;
}
/* Sequence protocol table for Splitter objects: length and item access are
   implemented; concat, repeat and slice only raise TypeError; item and
   slice assignment are not provided. */
static PySequenceMethods Splitter_as_sequence = {
    (inquiry)Splitter_length,        /*sq_length*/
    (binaryfunc)Splitter_concat,     /*sq_concat*/
    (intargfunc)Splitter_repeat,     /*sq_repeat*/
    (intargfunc)Splitter_item,       /*sq_item*/
    (intintargfunc)Splitter_slice,   /*sq_slice*/
    (intobjargproc)0,                /*sq_ass_item*/
    (intintobjargproc)0,             /*sq_ass_slice*/
};
/*
 * pos(index) -- return a (start, end) tuple giving the offsets, within the
 * source text, of the first character of the index-th word and of the
 * position just after it.  Raises IndexError when index is out of range.
 */
static PyObject *
Splitter_pos(Splitter *self, PyObject *args)
{
    char *start = NULL, *end = NULL, *ctext;
    PyObject *res;
    int i;

    if (!PyArg_Parse(args, "i", &i))
        return NULL;

    /* A negative index would skip the scan below and leave start/end
       unset. */
    if (i < 0) {
        PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
        return NULL;
    }

    /* Rescan from the beginning; this also covers i == self->index so that
       start/end are always filled in by the loop. */
    if (i <= self->index)
        Splitter_reset(self);

    while (self->index < i) {
        res = next_word(self, &start, &end);
        if (res == NULL)
            return NULL;

        if (PyString_Check(res)) {
            /* next_word has already advanced self->index; incrementing it
               again here made pos() report the wrong word. */
            Py_DECREF(res);
            continue;
        }

        Py_DECREF(res);
        PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
        return NULL;
    }

    ctext = PyString_AsString(self->text);
    /* "i" format units consume ints, so narrow the pointer differences
       explicitly. */
    return Py_BuildValue("(ii)", (int) (start - ctext), (int) (end - ctext));
}
/*
 * indexes(word) -- return a list with the word positions at which 'word'
 * (after synonym/stop-word mapping) occurs in the text.
 */
static PyObject *
Splitter_indexes(Splitter *self, PyObject *args)
{
    PyObject *word, *target = NULL, *r, *w = NULL, *index = NULL;
    PyObject *next;
    int i = 0;

    if (!PyArg_ParseTuple(args, "O", &word))
        return NULL;

    r = PyList_New(0);
    if (r == NULL)
        return NULL;

    /* check_synstop returns a new reference that must be released on every
       exit path. */
    target = check_synstop(self, word);
    if (target == NULL)
        goto err;

    Splitter_reset(self);

    for (;;) {
        next = next_word(self, NULL, NULL);
        Py_XDECREF(w);
        w = next;
        if (w == NULL)
            goto err;

        /* A non-string marks the end of the text. */
        if (!PyString_Check(w))
            break;

        if (PyObject_Compare(target, w) == 0) {
            Py_XDECREF(index);
            index = PyInt_FromLong(i);
            if (index == NULL)
                goto err;
            if (PyList_Append(r, index) < 0)
                goto err;
        }
        i++;
    }

    Py_XDECREF(w);
    Py_XDECREF(index);
    Py_DECREF(target);
    return r;

err:
    Py_DECREF(r);
    Py_XDECREF(w);
    Py_XDECREF(index);
    Py_XDECREF(target);
    return NULL;
}
/* Methods of Splitter instances, looked up by Splitter_getattr.  "pos" is
   registered with flags 0 and parses its argument with PyArg_Parse, while
   "indexes" takes an argument tuple (METH_VARARGS). */
static struct PyMethodDef Splitter_methods[] = {
    { "pos", (PyCFunction)Splitter_pos, 0,
      "pos(index) -- Return the starting and ending position of a token" },
    { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
      "indexes(word) -- Return al list of the indexes of word in the sequence",
    },
    { NULL, NULL }		/* sentinel */
};
/* tp_getattr slot: resolve attribute names against Splitter_methods. */
static PyObject *
Splitter_getattr(Splitter *self, char *name)
{
    return Py_FindMethod(Splitter_methods, (PyObject *)self, name);
}
/* Documentation string of the Splitter type (empty). */
static char SplitterType__doc__[] = "";

/* Type object for Splitter instances.  Only deallocation, attribute lookup
   and the sequence protocol are provided; every other slot is empty. */
static PyTypeObject SplitterType = {
    PyObject_HEAD_INIT(NULL)
    0,                                /*ob_size*/
    "Splitter",                       /*tp_name*/
    sizeof(Splitter),                 /*tp_basicsize*/
    0,                                /*tp_itemsize*/
    /* methods */
    (destructor)Splitter_dealloc,     /*tp_dealloc*/
    (printfunc)0,                     /*tp_print*/
    (getattrfunc)Splitter_getattr,    /*tp_getattr*/
    (setattrfunc)0,                   /*tp_setattr*/
    (cmpfunc)0,                       /*tp_compare*/
    (reprfunc)0,                      /*tp_repr*/
    0,                                /*tp_as_number*/
    &Splitter_as_sequence,            /*tp_as_sequence*/
    0,                                /*tp_as_mapping*/
    (hashfunc)0,                      /*tp_hash*/
    (ternaryfunc)0,                   /*tp_call*/
    (reprfunc)0,                      /*tp_str*/

    /* Space for future expansion */
    0L,0L,0L,0L,
    SplitterType__doc__ /* Documentation string */
};
/*
 * Splitter(doc[, synstop]) -- module-level factory.
 *
 * Builds a Splitter over str(doc).  The optional synstop mapping is stored
 * for synonym/stop-word lookups.  Returns NULL with an exception set on
 * failure.
 */
static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args)
{
    Splitter *splitter;
    PyObject *doc;
    PyObject *synstop = NULL;

    if (!PyArg_ParseTuple(args, "O|O", &doc, &synstop))
        return NULL;

    splitter = PyObject_NEW(Splitter, &SplitterType);
    if (splitter == NULL)
        return NULL;

    /* Keep our own reference to the mapping when one was supplied. */
    Py_XINCREF(synstop);
    splitter->synstop = synstop;

    splitter->text = PyObject_Str(doc);
    if (splitter->text == NULL)
        goto err;

    splitter->here = PyString_AsString(splitter->text);
    if (splitter->here == NULL)
        goto err;

    splitter->end = splitter->here + PyString_Size(splitter->text);
    splitter->index = -1;       /* no word consumed yet */

    return (PyObject *) splitter;

err:
    Py_DECREF(splitter);
    return NULL;
}
/* Module-level functions: only the Splitter factory is exported. */
static struct PyMethodDef Splitter_module_methods[] = {
    { "Splitter", (PyCFunction)get_Splitter, METH_VARARGS,
      "Splitter(doc[,synstop]) -- Return a word splitter" },
    { NULL, NULL }
};
/* Docstring installed on the Splitter module by initSplitter. */
static char Splitter_module_documentation[] =
    "Parse source strings into sequences of words\n"
    "\n"
    "for use in an inverted index\n"
    "\n"
    "$Id: Splitter.c,v 1.19 2002/03/21 15:48:55 htrd Exp $\n"
    ;
void
initSplitter
(
void
)
{
PyObject
*
m
,
*
d
;
/* Create the module and add the functions */
m
=
Py_InitModule4
(
"Splitter"
,
Splitter_module_methods
,
Splitter_module_documentation
,
(
PyObject
*
)
NULL
,
PYTHON_API_VERSION
);
/* Add some symbolic constants to the module */
d
=
PyModule_GetDict
(
m
);
if
(
PyErr_Occurred
())
Py_FatalError
(
"can't initialize module Splitter"
);
}
lib/python/SearchIndex/TextIndex.py
deleted
100644 → 0
View file @
2dc887a3
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Text Index
Notes on a new text index design
The current inverted index algorithm works well enough for our needs.
Speed of the algorithm does not seem to be a problem, however, data
management *is* a significant problem. In particular:
- Process size grows unacceptably *during mass indexing*.
- Data load and store seems to take too long. For example,
clearing an inverted index and committing takes a significant
amount of time.
- The current trie data structure contributes significantly to the
number of objects in the system.
- Removal/update of documents is especially problematic. We have
to either:
- Unindex old version of an object before updating it. This is
a real hassle for apps like sws.
- Tool through entire index looking for object references. This
is *totally* impractical.
Some observations of competition:
- Xerox system can index "5-million word document in 256k". What
does this mean?
- Does the system save word positions as we do?
- What is the index indexing?
- What was the vocabulary of the system?
Let\'s see. Assume a 10,000 word vocabulary. Then we use
25-bytes per entry. Hm.....
- Verity has some sense of indexing in phases and packing index.
Verity keeps the index in multiple chunks and a search may
operate on multiple chunks. This means that we can add data
without updating large records.
This may be especially handy for mass updates, like we do in
cv3. In a sense we do this in cv3 and sws. We index a large
batch of documents to a temporary index and then merge changes
in.
If "temporary" index was integral to system, then maybe merger
could be done as a background task....
Tree issues
Tree structures benefit small updates, because an update to an
entry does not cause update of entire tree, however, each node in
tree introduces overhead.
Trie structure currently introduces an excessive number of nodes.
Typically, a node per two or three words. Trie has potential to
reduce storage because key storage is shared between words.
Maybe an alternative to a Trie is some sort of nested BTree. Or
maybe a Trie with some kind of binary-search-based indexing.
Suppose that:
- database objects were at leaves of tree
- vocabulary was finite
- we don\'t remove a leaf when it becomes empty
Then:
- After some point, tree objects no longer change
If this is the case, then it doesn\'t make sense to optimize tree for
change.
Additional notes
Stemming reduces the number of words substantially.
Proposal -- new TextIndex
TextIndex -- word -> textSearchResult
Implemented with:
InvertedIndex -- word -> idSet
ResultIndex -- id -> docData
where:
word -- is a token, typically a word, but could be a name or a
number
textSearchResult -- id -> (score, positions)
id -- integer, say 4-byte.
positions -- sequence of integers.
score -- numeric measure of relevance, f(numberOfWords, positions)
numberOfWords -- number of words in source document.
idSet -- set of ids
docData -- numberOfWords, word->positions
Note that ids and positions are ints. We will build C
extensions for efficiently storing and pickling structures
with lots of ints. This should significantly improve space
overhead and storage/retrieval times, as well as storage
space.
"""
__version__
=
'$Revision: 1.32 $'
[
11
:
-
2
]
#XXX I strongly suspect that this is broken, but I'm not going to fix it. :(
from
Globals
import
Persistent
from
BTrees.OOBTree
import
OOBTree
from
BTrees.IIBTree
import
IISet
,
IIBucket
import
operator
from
Splitter
import
Splitter
from
string
import
strip
import
string
,
re
from
Lexicon
import
Lexicon
,
stop_word_dict
from
ResultList
import
ResultList
class TextIndex(Persistent):

    # self._index maps each word produced by Splitter to one of:
    #   - a (docid, score) tuple while a single document contains the word,
    #   - a plain dictionary {docid: score} for a few documents,
    #   - an IIBucket once that dictionary has grown past four entries.
    #
    # NOTE: no class docstring is added on purpose, so the set of objects
    # carrying a docstring stays exactly as it was.

    isDeprecatedIndex = 1

    def __init__(self, data=None, schema=None, id=None,
                 ignore_ex=None, call_methods=None):
        """Create an index

        The arguments are:

          'data' -- a mapping from integer object ids to objects or
          records,

          'schema' -- a mapping from item name to index into data
          records.  If 'data' is a mapping to objects, then schema
          should be 'None'.

          'id' -- the name of the item attribute to index.  This is
          either an attribute name or a record key.

          'ignore_ex' -- Tells the indexer to ignore exceptions that
          are raised when indexing an object.

          'call_methods' -- Tells the indexer to call methods instead
          of getattr or getitem to get an attribute.

        """
        ######################################################################
        # For b/w compatibility, have to allow __init__ calls with zero args;
        # in that case nothing is set up here.
        if not data==schema==id==ignore_ex==call_methods==None:
            self._data = data
            self._schema = schema
            self.id = id
            self.ignore_ex = ignore_ex
            self.call_methods = call_methods
            self._index = OOBTree() #XXX Is this really an IOBTree?
            self._syn = stop_word_dict
            self._reindex()
        else:
            pass

    # for backwards compatibility
    _init = __init__

    def clear(self):
        # Throw away all indexed words.
        self._index = OOBTree()

    def positions(self, docid, words):
        """Return the positions in the document for the given document
        id of the word, word."""
        id = self.id

        if self._schema is None:
            f = getattr
        else:
            f = operator.__getitem__
            id = self._schema[id]

        row = self._data[docid]

        if self.call_methods:
            doc = str(f(row, id)())
        else:
            doc = str(f(row, id))

        r = []
        for word in words:
            r = r + Splitter(doc, self._syn).indexes(word)
        return r

    def index_item(self, i, obj=None, un=0):
        """Index the item with id 'i' (or unindex it when 'un' is true).

        if 'obj' is passed in, it is indexed instead of _data[i]"""

        id = self.id
        if (self._schema is None) or (obj is not None):
            f = getattr
        else:
            f = operator.__getitem__
            id = self._schema[id]

        if obj is None:
            obj = self._data[i]

        # Indexing is best effort: any failure to fetch or index the value
        # is ignored, whatever the value of 'ignore_ex'.
        try:
            if self.call_methods:
                k = str(f(obj, id)())
            else:
                k = str(f(obj, id))
            self._index_document(k, i, un)
        except:
            pass

    def unindex_item(self, i, obj=None):
        # Remove the words of item 'i' from the index.
        return self.index_item(i, obj, 1)

    def _reindex(self, start=0):
        """Recompute index data for data with ids >= start."""
        for i in self._data.keys(start):
            self.index_item(i)

    def _index_document(self, document_text, id, un=0,
                        tupleType=type(()),
                        dictType=type({}),
                        ):
        # Add the words of 'document_text' to the index under document id
        # 'id', or remove them when 'un' is true.

        # Count the words of the document.  This used to duplicate the loop
        # of _subindex and called the nonexistent method 'subindex'.
        d = {}
        self._subindex(document_text, d, d.has_key, None)

        index = self._index
        get = index.get

        if un:
            for word, score in d.items():
                r = get(word)
                if r is None:
                    continue
                if type(r) is tupleType:
                    # A tuple is the entry of exactly one document; only
                    # drop it when it belongs to the document being removed.
                    if r[0] == id:
                        del index[word]
                    continue
                if r.has_key(id):
                    del r[id]
                if type(r) is dictType:
                    if len(r) < 2:
                        if r:
                            # Collapse a one-entry dictionary to a tuple.
                            for k, v in r.items():
                                index[word] = k, v
                        else:
                            del index[word]
                    else:
                        # Store again so the BTree notices the change.
                        index[word] = r
        else:
            for word, score in d.items():
                r = get(word)
                if r is not None:
                    r = index[word]
                    if type(r) is tupleType:
                        r = {r[0]: r[1]}
                        r[id] = score
                        index[word] = r
                    elif type(r) is dictType:
                        if len(r) > 4:
                            # Large dictionaries are moved into a bucket.
                            b = IIBucket()
                            for k, v in r.items():
                                b[k] = v
                            r = b
                        r[id] = score
                        index[word] = r
                    else:
                        r[id] = score
                else:
                    index[word] = id, score

    def _subindex(self, isrc, d, old, last):
        # Count the words of 'isrc' into dictionary 'd'.  'old' is the
        # has_key method of 'd'.  Returns 'last', which is only changed by
        # the quoted-token branch.
        src = Splitter(isrc, self._syn)

        for s in src:
            if s[0] == '\"':
                last = self._subindex(s[1:-1], d, old, last)
            else:
                if old(s):
                    if s != last:
                        d[s] = d[s] + 1
                else:
                    d[s] = 1

        return last

    def __getitem__(self, word):
        """Return an InvertedIndex-style result "list"
        """
        src = tuple(Splitter(word, self._syn))
        if not src:
            return ResultList({}, (word,), self)

        if len(src) == 1:
            src = src[0]
            if src[:1] == '"' and src[-1:] == '"':
                return self[src]
            # The index is keyed by the tokens the Splitter produces, so
            # look up the normalised token rather than the raw word.
            r = self._index.get(src, None)
            if r is None:
                r = {}
            return ResultList(r, (word,), self)

        r = None
        for word in src:
            rr = self[word]
            if r is None:
                r = rr
            else:
                r = r.near(rr)

        return r

    def _apply_index(self, request, cid='', ListType=[]):
        """ Apply the index to query parameters given in the argument,
        request

        The argument should be a mapping object.

        If the request does not contain the needed parameters, then
        None is returned.

        Otherwise two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.
        """
        # IISet objects have no intersection() method; the set operation
        # is a module-level function of BTrees.IIBTree.
        from BTrees.IIBTree import intersection

        id = self.id

        cidid = "%s/%s" % (cid, id)
        has_key = request.has_key
        if has_key(cidid):
            keys = request[cidid]
        elif has_key(id):
            keys = request[id]
        else:
            return None

        if type(keys) is type(''):
            if not keys or not strip(keys):
                return None
            keys = [keys]

        r = None
        for key in keys:
            key = strip(key)
            if not key:
                continue

            rr = IISet()
            try:
                for i, score in query(key, self).items():
                    if score:
                        rr.insert(i)
            except KeyError:
                pass

            if r is None:
                r = rr
            else:
                # Note that we *and*/*narrow* multiple search terms.
                r = intersection(r, rr)

        if r is not None:
            return r, (id,)
        return IISet(), (id,)
# Operator tokens used by the query parser and evaluator below.
AndNot = 'andnot'
And = 'and'
Or = 'or'
Near = '...'
# String exception raised by the query functions for malformed queries.
QueryError = 'TextIndex.QueryError'
def query(s, index, default_operator=Or,
          ws=(string.whitespace,)):
    # Parse the query string 's' and evaluate it against 'index'.
    # 'default_operator' is spliced in between operands that have no
    # explicit operator.

    # First replace any occurrences of " and not " with " andnot "
    and_not_pattern = '[%s]+and[%s]+not[%s]+' % (ws * 3)
    normalized = re.sub(and_not_pattern, ' andnot ', s)

    parsed = parse2(parse(normalized), default_operator)
    return evaluate(parsed, index)
def parse(s):
    '''Parse parentheses and quotes'''
    tokens = []
    rest = string.lower(s)

    while 1:
        span = parens(rest)

        if span is None:
            # No parentheses found.  Look for quotes then exit.
            return tokens + quotes(rest)

        inner_start, inner_stop = span[0], span[1]

        # Look for quotes in the section of the string before
        # the parentheses, then parse the string inside the parens
        tokens = tokens + quotes(rest[:inner_start - 1])
        tokens.append(parse(rest[inner_start:inner_stop]))

        # continue looking through the rest of the string
        rest = rest[inner_stop + 1:]
def parse2(q, default_operator,
           operator_dict={AndNot: AndNot, And: And, Or: Or, Near: Near},
           ListType=type([]),
           ):
    '''Find operators and operands'''
    pos = 0
    while pos < len(q):
        # Nested lists are sub-queries: process them recursively.
        if type(q[pos]) is ListType:
            q[pos] = parse2(q[pos], default_operator)

        # every other item, starting with the first, should be an operand,
        # so odd positions should hold an operator.
        if pos % 2:
            token = q[pos]
            if type(token) is ListType or not operator_dict.has_key(token):
                # Not an operator: splice in the default operator.
                q[pos:pos] = [default_operator]
            else:
                q[pos] = operator_dict[token]

        pos = pos + 1

    return q
def
parens
(
s
,
parens_re
=
re
.
compile
(
r'(\
|)
').search):
index=open_index=paren_count = 0
while 1:
index = parens_re(s, index)
if index is None : break
if s[index] == '
(
':
paren_count = paren_count + 1
if open_index == 0 : open_index = index + 1
else:
paren_count = paren_count - 1
if paren_count == 0:
return open_index, index
else:
index = index + 1
if paren_count == 0: # No parentheses Found
return None
else:
raise QueryError, "Mismatched parentheses"
def quotes(s, ws = (string.whitespace,)):
# split up quoted regions
splitted = re.split( '
[
%
s
]
*
\
"[%s]*' % (ws * 2),s)
split=string.split
if (len(splitted) > 1):
if ((len(splitted) % 2) == 0): raise QueryError, "
Mismatched
quotes
"
for i in range(1,len(splitted),2):
# split the quoted region into words
splitted[i] = filter(None, split(splitted[i]))
# put the Proxmity operator in between quoted words
for j in range(1, len(splitted[i])):
splitted[i][j : j] = [ Near ]
for i in range(len(splitted)-1,-1,-2):
# split the non-quoted region into words
splitted[i:i+1] = filter(None, split(splitted[i]))
splitted = filter(None, splitted)
else:
# No quotes, so just split the string into words
splitted = filter(None, split(s))
return splitted
def get_operands(q, i, index, ListType=type([]), StringType=type('')):
'''Evaluate and return the left and right operands for an operator'''
try:
left = q[i - 1]
right = q[i + 1]
except IndexError: raise QueryError, "
Malformed
query
"
t=type(left)
if t is ListType: left = evaluate(left, index)
elif t is StringType: left=index[left]
t=type(right)
if t is ListType: right = evaluate(right, index)
elif t is StringType: right=index[right]
return (left, right)
def evaluate(q, index, ListType=type([])):
'''Evaluate a parsed query'''
## import pdb
## pdb.set_trace()
if (len(q) == 1):
if (type(q[0]) is ListType):
return evaluate(q[0], index)
return index[q[0]]
i = 0
while (i < len(q)):
if q[i] is AndNot:
left, right = get_operands(q, i, index)
val = left.and_not(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is And:
left, right = get_operands(q, i, index)
val = left & right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Or:
left, right = get_operands(q, i, index)
val = left | right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Near:
left, right = get_operands(q, i, index)
val = left.near(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
if (len(q) != 1): raise QueryError, "
Malformed
query
"
return q[0]
lib/python/SearchIndex/UnIndex.py
deleted
100644 → 0
View file @
2dc887a3
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Simple column indices"""
__version__
=
'$Revision: 1.35 $'
[
11
:
-
2
]
from
Globals
import
Persistent
from
Acquisition
import
Implicit
import
string
from
zLOG
import
LOG
,
ERROR
from
types
import
StringType
,
ListType
,
IntType
,
TupleType
from
BTrees.OOBTree
import
OOBTree
,
OOSet
from
BTrees.IOBTree
import
IOBTree
from
BTrees.IIBTree
import
IITreeSet
,
IISet
,
union
import
BTrees.Length
import
sys
_marker
=
[]
class UnIndex(Persistent, Implicit):
    """UnIndex object interface"""

    meta_type = 'Field Index'

    isDeprecatedIndex = 1

    def __init__(self, id, ignore_ex=None, call_methods=None):
        """Create an unindex

        UnIndexes are indexes that contain two index components, the
        forward index (like plain index objects) and an inverted
        index.  The inverted index is so that objects can be unindexed
        even when the old value of the object is not known.

        e.g.

        self._index = {datum:[documentId1, documentId2]}
        self._unindex = {documentId:datum}

        If any item in self._index has a length-one value, the value is an
        integer, and not a set.  There are special cases in the code to deal
        with this.

        The arguments are:

          'id' -- the name of the item attribute to index.  This is
          either an attribute name or a record key.

          'ignore_ex' -- should be set to true if you want the index
          to ignore exceptions raised while indexing instead of
          propagating them.

          'call_methods' -- should be set to true if you want the index
          to call the attribute 'id' (note: 'id' should be callable!)
          You will also need to pass in an object in the index and
          unindex methods for this to work.

        """
        self.id = id
        self.ignore_ex = ignore_ex        # currently unimplemented
        self.call_methods = call_methods

        # Note that it was unfortunate to use __len__ as the attribute
        # name here. New-style classes cache slot methods in C slot
        # pointers. The result is that instances can't override slots.
        # This is not easy to change on account of old objects with
        # __len__ attr.

        self.__len__ = BTrees.Length.Length()
        self.clear()

    def __len__(self):
        # Number of distinct values in the forward index, taken from the
        # Length object stored in the instance dictionary.
        try:
            return self.__dict__['__len__']()
        except KeyError:
            # Fallback for really old indexes
            return len(self._unindex)

    def clear(self):
        # Empty both the forward and the reverse index.
        # inplace opportunistic conversion from old-style to new style BTrees
        try:
            self.__len__.set(0)
        except AttributeError:
            self.__len__ = BTrees.Length.Length()
        self._index = OOBTree()
        self._unindex = IOBTree()

    def _convertBTrees(self, threshold=200):
        # Convert old-style index structures to OOBTree/IOBTree; does
        # nothing when the forward index is already an OOBTree.
        if type(self._index) is OOBTree:
            return

        from BTrees.convert import convert

        _index = self._index
        self._index = OOBTree()

        def convertSet(s,
                       IITreeSet=IITreeSet, IntType=type(0),
                       type=type, len=len,
                       doneTypes=(IntType, IITreeSet)):
            if type(s) in doneTypes:
                return s

            if len(s) == 1:
                try:
                    return s[0]  # convert to int
                except:
                    pass # This is just an optimization.

            return IITreeSet(s)

        convert(_index, self._index, threshold, convertSet)

        _unindex = self._unindex
        self._unindex = IOBTree()
        convert(_unindex, self._unindex, threshold)

        self.__len__ = BTrees.Length.Length(len(_index))

    def __nonzero__(self):
        # The index is true when at least one document is indexed.
        return not not self._unindex

    def histogram(self):
        """Return a mapping which provides a histogram of the number of
        elements found at each point in the index."""

        histogram = {}
        # items() yields (key, value) pairs; it is the value that may be a
        # bare integer standing for a one-element set.
        for key, value in self._index.items():
            if type(value) is IntType:
                entry = 1 # "set" length is 1
            else:
                entry = len(value)
            histogram[entry] = histogram.get(entry, 0) + 1

        return histogram

    def referencedObjects(self):
        """Generate a list of IDs for which we have referenced objects."""
        return self._unindex.keys()

    def getEntryForObject(self, documentId, default=_marker):
        """Takes a document ID and returns all the information we have
        on that specific object."""
        if default is _marker:
            return self._unindex.get(documentId)
        else:
            return self._unindex.get(documentId, default)

    def removeForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and remove any reference to documentId
        in its entry in the index."""
        global _marker
        indexRow = self._index.get(entry, _marker)
        if indexRow is not _marker:
            try:
                indexRow.remove(documentId)
                if not indexRow:
                    del self._index[entry]
                    try:
                        self.__len__.change(-1)
                    except AttributeError:
                        pass # pre-BTrees-module instance
            except AttributeError:
                # index row is an int
                del self._index[entry]
                try:
                    self.__len__.change(-1)
                except AttributeError:
                    pass # pre-BTrees-module instance
            except:
                LOG(self.__class__.__name__, ERROR,
                    ('unindex_object could not remove '
                     'documentId %s from index %s. This '
                     'should not happen.'
                     % (str(documentId), str(self.id))), '',
                    sys.exc_info())
        else:
            LOG(self.__class__.__name__, ERROR,
                ('unindex_object tried to retrieve set %s '
                 'from index %s but couldn\'t. This '
                 'should not happen.' % (repr(entry), str(self.id))))

    def insertForwardIndexEntry(self, entry, documentId):
        """Take the entry provided and put it in the correct place
        in the forward index.

        This will also deal with creating the entire row if necessary."""
        global _marker
        indexRow = self._index.get(entry, _marker)

        # Make sure there's actually a row there already.  If not, store
        # the bare document id; it is promoted to a set on the second
        # insert.
        if indexRow is _marker:
            self._index[entry] = documentId
            try:
                self.__len__.change(1)
            except AttributeError:
                pass # pre-BTrees-module instance
        else:
            try:
                indexRow.insert(documentId)
            except AttributeError:
                # index row is an int
                indexRow = IITreeSet((indexRow, documentId))
                self._index[entry] = indexRow

    def index_object(self, documentId, obj, threshold=None):
        """ index and object 'obj' with integer id 'documentId'"""
        global _marker
        returnStatus = 0

        # First we need to see if there's anything interesting to look at
        # self.id is the name of the index, which is also the name of the
        # attribute we're interested in.  If the attribute is callable,
        # we'll do so.
        try:
            datum = getattr(obj, self.id)
            if callable(datum):
                datum = datum()
        except AttributeError:
            datum = _marker

        # We don't want to do anything that we don't have to here, so we'll
        # check to see if the new and existing information is the same.
        oldDatum = self._unindex.get(documentId, _marker)
        if datum != oldDatum:
            if oldDatum is not _marker:
                self.removeForwardIndexEntry(oldDatum, documentId)

            if datum is not _marker:
                self.insertForwardIndexEntry(datum, documentId)
                self._unindex[documentId] = datum

            returnStatus = 1

        return returnStatus

    def unindex_object(self, documentId):
        """ Unindex the object with integer id 'documentId' and don't
        raise an exception if we fail """
        global _marker
        unindexRecord = self._unindex.get(documentId, _marker)
        if unindexRecord is _marker:
            return None

        self.removeForwardIndexEntry(unindexRecord, documentId)

        try:
            del self._unindex[documentId]
        except:
            LOG('UnIndex', ERROR, 'Attempt to unindex nonexistent document'
                ' with id %s' % documentId)

    def _apply_index(self, request, cid='', type=type):
        """Apply the index to query parameters given in the request arg.

        The request argument should be a mapping object.

        If the request does not have a key which matches the "id" of
        the index instance, then None is returned.

        If the request *does* have a key which matches the "id" of
        the index instance, one of a few things can happen:

          - if the value is a blank string, None is returned (in
            order to support requests from web forms where
            you can't tell a blank string from empty).

          - if the value is a nonblank string, turn the value into
            a single-element sequence, and proceed.

          - if the value is a sequence, return a union search.

        If the request contains a parameter with the name of the
        column + '_usage', it is sniffed for information on how to
        handle applying the index.

        If None is not returned as a result of the abovementioned
        constraints, two objects are returned.  The first object is a
        ResultSet containing the record numbers of the matching
        records.  The second object is a tuple containing the names of
        all data fields used.

        FAQ answer:  to search a Field Index for documents that
        have a blank string as their value, wrap the request value
        up in a tuple ala: request = {'id':('',)}

        """
        id = self.id              #name of the column

        cidid = "%s/%s" % (cid, id)

        # i have no f'ing clue what this cdid stuff is for - chrism
        if request.has_key(cidid):
            keys = request[cidid]
        elif request.has_key(id):
            keys = request[id]
        else:
            return None

        if type(keys) not in (ListType, TupleType):
            if keys == '':
                return None
            else:
                keys = [keys]

        index = self._index
        r = None
        opr = None

        if request.has_key(id + '_usage'):
            # see if any usage params are sent to field
            opr = string.split(string.lower(request[id + "_usage"]), ':')
            opr, opr_args = opr[0], opr[1:]

        if opr == "range":   # range search
            if 'min' in opr_args:
                lo = min(keys)
            else:
                lo = None
            if 'max' in opr_args:
                hi = max(keys)
            else:
                hi = None
            # Compare against None: a falsy upper bound such as 0 or ''
            # is still a real bound.
            if hi is not None:
                setlist = index.items(lo, hi)
            else:
                setlist = index.items(lo)

            for k, docids in setlist:
                if type(docids) is IntType:
                    docids = IISet((docids,))
                r = union(r, docids)
        else: # not a range search
            for key in keys:
                docids = index.get(key, None)
                if docids is not None:
                    if type(docids) is IntType:
                        docids = IISet((docids,))
                    r = union(r, docids)

        if type(r) is IntType:
            r = IISet((r,))
        if r is None:
            return IISet(), (id,)
        else:
            return r, (id,)

    def hasUniqueValuesFor(self, name):
        ' has unique values for column NAME '
        if name == self.id:
            return 1
        else:
            return 0

    def uniqueValues(self, name=None, withLengths=0):
        """\
        returns the unique values for name

        if withLengths is true, returns a sequence of
        tuples of (value, length)
        """
        if name is None:
            name = self.id
        elif name != self.id:
            return []

        if not withLengths:
            return tuple(self._index.keys())
        else:
            rl = []
            for i in self._index.keys():
                row = self._index[i]
                if type(row) is IntType:
                    l = 1
                else:
                    l = len(row)
                rl.append((i, l))
            return tuple(rl)

    def keyForDocument(self, id):
        # Return the indexed value recorded for document 'id'; raises
        # KeyError when the document is not indexed.
        return self._unindex[id]

    def items(self):
        # Return a list of (value, document-id set) pairs, with bare
        # integer rows wrapped in an IISet.
        items = []
        for k, v in self._index.items():
            if type(v) is IntType:
                v = IISet((v,))
            items.append((k, v))
        return items
lib/python/SearchIndex/UnKeywordIndex.py
deleted
100644 → 0
View file @
2dc887a3
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from
UnIndex
import
UnIndex
from
zLOG
import
LOG
,
ERROR
from
types
import
StringType
from
BTrees.OOBTree
import
OOSet
,
difference
class UnKeywordIndex(UnIndex):

    meta_type = 'Keyword Index'

    # NOTE(review): the string below follows an assignment, so Python
    # treats it as a plain expression statement, not as the class
    # docstring.  It is kept in place so that __doc__ stays unchanged.
    """Like an UnIndex only it indexes sequences of items
    Searches match any keyword.
    This should have an _apply_index that returns a relevance score
    """

    def index_object(self, documentId, obj, threshold=None):
        """Index the keywords of 'obj' under the id 'documentId'.

        The keywords are read from the attribute of 'obj' named by
        self.id (an empty tuple if it is missing); a callable attribute
        is called first.  A plain string counts as one single keyword.

        Returns 0 when a TypeError is raised while indexing a document
        that has no reverse-index entry yet, and 1 otherwise.
        'threshold' is accepted but not used here.
        """
        keywords = getattr(obj, self.id, ())
        if callable(keywords):
            keywords = keywords()
        if type(keywords) is StringType:
            # do not iterate over the characters of a lone string
            keywords = (keywords,)

        previous = self._unindex.get(documentId, None)

        if previous is None:
            # Unknown document: every keyword is new.
            try:
                for keyword in keywords:
                    self.insertForwardIndexEntry(keyword, documentId)
                self._unindex[documentId] = list(keywords)
            except TypeError:
                return 0
            return 1

        # Known document: work out which keywords went away and which
        # ones were added, and only touch those.
        if type(previous) is not OOSet:
            previous = OOSet(previous)
        keywords = OOSet(keywords)
        removed = difference(previous, keywords)
        added = difference(keywords, previous)
        if not (removed or added):
            return 1

        self._unindex[documentId] = list(keywords)
        if removed:
            self.unindex_objectKeywords(documentId, removed)
        if added:
            for keyword in added:
                self.insertForwardIndexEntry(keyword, documentId)
        return 1

    def unindex_objectKeywords(self, documentId, keywords):
        """Drop the forward-index entries of 'documentId' for 'keywords'.

        A 'keywords' value of None is accepted and means nothing to do.
        """
        if keywords is None:
            return
        for keyword in keywords:
            self.removeForwardIndexEntry(keyword, documentId)

    def unindex_object(self, documentId):
        """Remove 'documentId' from the forward and the reverse index.

        A document without a reverse-index entry is not an error for
        the caller: the KeyError is caught and logged instead.
        """
        known = self._unindex.get(documentId, None)
        self.unindex_objectKeywords(documentId, known)
        try:
            del self._unindex[documentId]
        except KeyError:
            LOG('UnKeywordIndex', ERROR,
                'Attempt to unindex nonexistent'
                ' document id %s' % documentId)
lib/python/SearchIndex/UnTextIndex.py
deleted
100644 → 0
View file @
2dc887a3
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Text Index
The UnTextIndex falls under the 'I didnt have a better name for it'
excuse. It is an 'Un' Text index because it stores a little bit of
undo information so that objects can be unindexed when the old value
is no longer known.
"""
# Revision number sliced out of the CVS keyword string; the slice
# [11:-2] of the literal below is '1.54'.
__version__ = '$Revision: 1.54 $'[11:-2]
import
string
,
re
import
operator
from
Globals
import
Persistent
from
Acquisition
import
Implicit
from
Splitter
import
Splitter
from
zLOG
import
LOG
,
ERROR
from
Lexicon
import
Lexicon
from
ResultList
import
ResultList
from
types
import
*
from
BTrees.IOBTree
import
IOBTree
from
BTrees.OIBTree
import
OIBTree
from
BTrees.IIBTree
import
IIBTree
,
IIBucket
,
IISet
,
IITreeSet
from
BTrees.IIBTree
import
difference
,
weightedIntersection
# Query operator tokens.  evaluate() recognises operators by identity
# ('is'), and parse2() replaces equal strings by these exact objects.
AndNot = 'andnot'
And = 'and'
Or = 'or'
Near = '...'

# Raised as a string exception ('raise QueryError, msg') by the query
# parsing and evaluation code below.
QueryError = 'TextIndex.QueryError'
class
UnTextIndex
(
Persistent
,
Implicit
):
"""Full-text index.
There is a ZCatalog UML model that sheds some light on what is
going on here. '_index' is a BTree which maps word ids to mapping
from document id to score. Something like:
{'bob' : {1 : 5, 2 : 3, 42 : 9}}
{'uncle' : {1 : 1}}
The '_unindex' attribute is a mapping from document id to word
ids. This mapping allows the catalog to unindex an object:
{42 : ('bob', 'is', 'your', 'uncle')
This isn't exactly how things are represented in memory, many
optimizations happen along the way."""
isDeprecatedIndex
=
1
meta_type
=
'Text Index'
def __init__(self, id, ignore_ex=None, call_methods=None, lexicon=None):
    """Create an index

    The arguments are:

      'id' -- the name of the item attribute to index.  This is
      either an attribute name or a record key.

      'ignore_ex' -- Tells the indexer to ignore exceptions that
      are raised when indexing an object.

      'call_methods' -- Tells the indexer to call methods instead
      of getattr or getitem to get an attribute.

      'lexicon' -- the lexicon object to use, or something that
      getLexicon() can resolve to one.  If None, the index creates a
      private Lexicon.
    """
    self.id = id
    self.ignore_ex = ignore_ex
    self.call_methods = call_methods

    # start out with empty forward and reverse indexes
    self.clear()

    if lexicon is not None:
        # We need to hold a reference to the lexicon, since we can't
        # really change lexicons.
        self._lexicon = self.getLexicon(lexicon)
    else:
        # no lexicon was provided, create a default one
        self._lexicon = Lexicon()
def getLexicon(self, vocab_id):
    """Return the Lexicon in use.

    Anything that is not a plain string is taken to be the lexicon
    itself and returned unchanged.  A string is looked up as an
    attribute of this index, and the 'lexicon' attribute of whatever
    is found gets returned.

    Bit of a hack, indexes have been made acquirers so that they
    can acquire a vocabulary object from the object system in
    Zope.  I don't think indexes were ever intended to participate
    in this way, but I don't see too much of a problem with it.
    """
    if type(vocab_id) is StringType:
        vocabulary = getattr(self, vocab_id)
        return vocabulary.lexicon
    # we already have the lexicon
    return vocab_id
def __nonzero__(self):
    """Truth value of the index: true when '_unindex' is non-empty."""
    return bool(self._unindex)
# Too expensive
#def __len__(self):
# """Return the number of objects indexed."""
# return len(self._unindex)
def clear(self):
    """Reinitialize the text index.

    Both the forward index ('_index') and the reverse index
    ('_unindex') are replaced by fresh, empty IOBTrees.
    """
    for attribute in ('_index', '_unindex'):
        setattr(self, attribute, IOBTree())
def _convertBTrees(self, threshold=200):
    """Convert the index data into IOBTree based structures.

    A lexicon that is still stored by name (a plain string) is first
    resolved into a hard reference.  If '_index' already is an IOBTree
    nothing else happens.  Otherwise '_index' and '_unindex' are
    replaced by new IOBTrees that are filled through
    BTrees.convert.convert, to which 'threshold' is passed on.
    """
    if type(self._lexicon) is type(''):
        # Turn the name reference into a hard reference.
        self._lexicon = self.getLexicon(self._lexicon)

    if type(self._index) is IOBTree:
        return

    from BTrees.convert import convert

    _index = self._index
    self._index = IOBTree()

    def convertScores(scores,
                      type=type, TupleType=TupleType, IIBTree=IIBTree):
        # Rows stored as a tuple or already as an IIBTree are kept
        # as they are; every other row is rebuilt as an IIBTree.
        # The type has to be compared with the IIBTree class itself:
        # comparing with an instance (IIBTree()) can never match.
        if type(scores) is not TupleType and type(scores) is not IIBTree:
            scores = IIBTree(scores)
        return scores

    convert(_index, self._index, threshold, convertScores)

    _unindex = self._unindex
    self._unindex = IOBTree()
    convert(_unindex, self._unindex, threshold)
def histogram(self, type=type, TupleType=type(())):
    """Return a mapping which provides a histogram of the number of
    elements found at each point in the index.

    The result is an IIBucket mapping a row size to the number of
    forward-index rows of that size.  A row stored as a tuple counts
    as size 1, any other row as len(row).
    """
    counts = IIBucket()
    for (wid, row) in self._index.items():
        if type(row) is TupleType:
            size = 1
        else:
            size = len(row)
        counts[size] = counts.get(size, 0) + 1
    return counts
def getEntryForObject(self, rid, default=None):
    """Get all information contained for a specific object.

    'rid' is the record id of the object.  Returns a tuple with the
    words of the object, obtained by passing each entry stored for
    'rid' in the reverse index to the lexicon's getWord().  Returns
    'default' when the reverse index has no entry for 'rid'.
    """
    # The former unused 'wordMap' local, which built the full item
    # list of the lexicon on every call, has been dropped.
    results = self._unindex.get(rid, None)
    if results is None:
        return default

    getWord = self.getLexicon(self._lexicon).getWord
    return tuple(map(getWord, results))
def insertForwardIndexEntry(self, entry, documentId, score=1):
    """Record 'score' for 'documentId' in the forward-index row 'entry'.

    The row representation depends on the number of documents in it,
    as implemented below:

      1     a (documentId, score) tuple
      2-3   a dictionary
      4+    an IIBTree (a dictionary is promoted once it holds more
            than three entries)

    Rows are only written back when something actually changed.
    """
    index = self._index
    row = index.get(entry, None)

    if row is None:
        # We don't have any information at this point, so we'll
        # put our first entry in, and use a tuple to save space
        index[entry] = (documentId, score)
        return

    if type(row) is TupleType:
        # Tuples are only used for rows which have only a single
        # entry.
        if row[0] == documentId:
            # Same document: just refresh the score if necessary.
            if row[1] != score:
                index[entry] = (documentId, score)
        else:
            # A second document: promote the row to a dictionary.
            index[entry] = {row[0]: row[1], documentId: score}
        return

    if row.get(documentId, -1) != score:
        # score changed (or new entry)
        row[documentId] = score
        if type(row) is DictType:
            if len(row) > 3:
                # Big enough to give it its own database record
                row = IIBTree(row)
            # plain dictionaries have to be stored again explicitly
            index[entry] = row
def index_object(self, documentId, obj, threshold=None):
    """Index an object.

    'documentId' is the integer id of the document

    'obj' is the object to be indexed; the text is taken from its
    attribute named by self.id, which is called first if callable

    'threshold' is documented as the number of words to process
    between committing subtransactions; it is not used in this method.

    Returns 0 if the attribute cannot be read or converted to a
    string, otherwise the number of distinct word ids of the document.
    """
    try:
        source = getattr(obj, self.id)
        if callable(source):
            source = source()
        source = str(source)
    except (AttributeError, TypeError):
        return 0

    lexicon = self.getLexicon(self._lexicon)
    splitter = lexicon.Splitter

    # Run through the words and score them.  _subindex() performs
    # exactly this walk (including the quoted multi-word entries).
    wordScores = OIBTree()
    self._subindex(source, wordScores, None, splitter)

    # Convert scores to use wids:
    widScores = IIBucket()
    getWid = lexicon.getWordId
    for word, score in wordScores.items():
        widScores[getWid(word)] = score
    del wordScores

    currentWids = IISet(self._unindex.get(documentId, []))

    # Get rid of document words that are no longer indexed
    stale = difference(currentWids, widScores)
    self.unindex_objectWids(documentId, stale)

    # Now index the words. Note that the new xIBTrees are clever
    # enough to do nothing when there isn't a change. Woo hoo.
    insert = self.insertForwardIndexEntry
    for wid, score in widScores.items():
        insert(wid, documentId, score)

    # Save the unindexing info if it's changed:
    wids = widScores.keys()
    if wids != currentWids.keys():
        self._unindex[documentId] = wids

    return len(wids)
def _subindex(self, source, wordScores, last, splitter):
    """Recursively handle multi-word synonyms

    Every word produced by splitter(source) gets its count in
    'wordScores' incremented, except for a word equal to the one
    handled just before it ('last').  A word starting with a double
    quote is stripped of its first and last character and processed
    recursively.  Returns the last word that was counted.
    """
    for token in splitter(source):
        if token[0] == '\"':
            last = self._subindex(token[1:-1], wordScores, last, splitter)
            continue
        if token == last:
            # immediate repetition, counted only once
            continue
        last = token
        wordScores[token] = wordScores.get(token, 0) + 1
    return last
def unindex_object(self, i):
    """Carefully unindex the document with integer id 'i' from the text
    index and do not fail if it does not exist.

    The word ids recorded for 'i' in the reverse index are handed to
    unindex_objectWids(), then the reverse-index entry is deleted.
    A document without a reverse-index entry is silently ignored.
    """
    # (the forward index is not needed here; the unused local that
    # used to fetch it has been removed)
    unindex = self._unindex
    wids = unindex.get(i, None)
    if wids is None:
        return
    self.unindex_objectWids(i, wids)
    del unindex[i]
def unindex_objectWids(self, i, wids):
    """Remove document 'i' from the forward-index rows of 'wids'.

    Missing rows and missing documents are not fatal: they are
    reported through LOG at ERROR level and processing goes on with
    the next word id.
    """
    index = self._index
    get = index.get
    for wid in wids:
        widScores = get(wid, None)
        if widScores is None:
            # no row at all for this word id
            LOG('UnTextIndex', ERROR,
                'unindex_object tried to unindex nonexistent'
                ' document, wid %s, %s' % (i, wid))
            continue
        if type(widScores) is TupleType:
            # Single-entry row: the whole row goes away.
            # NOTE(review): the tuple's document id is not compared
            # with 'i' before deleting -- confirm this is intended.
            del index[wid]
        else:
            try:
                del widScores[i]
                if widScores:
                    if type(widScores) is DictType:
                        if len(widScores) == 1:
                            # convert to tuple
                            widScores = widScores.items()[0]
                        # dictionaries are stored back explicitly
                        index[wid] = widScores
                else:
                    # the row became empty
                    del index[wid]
            except (KeyError, IndexError, TypeError):
                LOG('UnTextIndex', ERROR,
                    'unindex_object tried to unindex nonexistent'
                    ' document %s' % str(i))
def __getitem__(self, word):
    """Return an InvertedIndex-style result "list"

    Note that this differentiates between being passed an Integer
    and a String.  Strings are looked up in the lexicon, whereas
    Integers are assumed to be resolved word ids.

    A string is run through the lexicon's Splitter first.  If that
    yields several words, their individual results are combined with
    near(), left to right.
    """
    if isinstance(word, IntType):
        # We have a word ID
        return ResultList(self._index.get(word, {}), (word,), self)

    tokens = tuple(self.getLexicon(self._lexicon).Splitter(word))

    if not tokens:
        return ResultList({}, (word,), self)

    if len(tokens) == 1:
        token = tokens[0]
        if token[:1] == token[-1:] == '"':
            return self[token]

        wids = self.getLexicon(self._lexicon).get(token)
        row = None
        if wids:
            row = self._index.get(wids[0], None)
        if row is None:
            # unknown word, or word without a row
            row = {}
        return ResultList(row, (token,), self)

    combined = None
    for token in tokens:
        hits = self[token]
        if combined is None:
            combined = hits
        else:
            combined = combined.near(hits)
    return combined
def _apply_index(self, request, cid=''):
    """Apply the index to query parameters given in the argument,
    request

    The argument should be a mapping object.

    If the request does not contain the needed parameters (no key
    named like this index, or a blank query string), then None is
    returned.

    Otherwise two objects are returned.  The first object is the
    result set containing the record numbers of the matching records
    (an empty IIBucket if nothing was evaluated).  The second object
    is a tuple containing the names of all data fields used.

    The optional request key 'textindex_operator' selects the
    operator used between search terms; it defaults to 'or', also
    when the given value is not a known operator.
    """
    if not request.has_key(self.id):
        return None
    keys = request[self.id]

    operators = {'andnot': AndNot, 'and': And, 'near': Near, 'or': Or}

    # We default to 'or' if we aren't passed an operator in the request
    # or if we can't make sense of the passed-in operator
    query_operator = Or
    if request.has_key('textindex_operator'):
        requested = str(request['textindex_operator']).lower()
        query_operator = operators.get(requested, query_operator)

    if type(keys) is StringType:
        if not keys or not keys.strip():
            return None
        keys = [keys]

    result = None
    for key in keys:
        key = key.strip()
        if not key:
            continue
        bucket = self.query(key, query_operator).bucket()
        weight, result = weightedIntersection(result, bucket)

    if result is None:
        return (IIBucket(), (self.id,))
    return result, (self.id,)
def positions(self, docid, words,
              # This was never tested: obj
              ):
    """Return the positions in the document for the given document
    id of the word, word.

    Placeholder: always returns [1], whatever 'docid' and 'words' are.
    """
    # The code that used to follow the return statement could never
    # run and has been removed.  It re-split the document text and
    # collected Splitter(doc).indexes(word) for every word, but relied
    # on an 'obj' argument that this signature does not have and on
    # 'self._schema'; its own comment said it "requires an API change
    # to fix".
    return [1]
def query(self, s, default_operator=Or):
    """Evaluate a query string.

    Convert the query string into a data structure of nested lists
    and strings, based on the grouping of whitespace-separated
    strings by parentheses and quotes.  The 'Near' operator is
    inserted between the strings of a quoted group.

    The Lexicon is given the opportunity to transform the
    data structure.  Stemming, wildcards, and translation are
    possible Lexicon services.

    Finally, the query list is normalized so that it and every
    sub-list consist of non-operator strings or lists separated
    by operators.  This list is evaluated.
    """
    # First replace any occurrences of " and not " with " andnot "
    normalized = re.sub(r'(?i)\s+and\s*not\s+', ' andnot ', s)

    # Parse parentheses and quotes
    tree = parse(normalized)

    # Allow the Lexicon to process the query
    lexicon = self.getLexicon(self._lexicon)
    tree = lexicon.query_hook(tree)

    # Insert the default operator between any two search terms not
    # already joined by an operator.
    tree = parse2(tree, default_operator)

    # evaluate the final 'expression'
    return self.evaluate(tree)
def get_operands(self, q, i):
"""Evaluate and return the left and right operands for an operator"""
try:
left = q[i - 1]
right = q[i + 1]
except IndexError:
raise QueryError, "Malformed query"
operandType = type(left)
if operandType is IntType:
left = self[left]
elif operandType is StringType:
left = self[left]
elif operandType is ListType:
left = self.evaluate(left)
operandType = type(right)
if operandType is IntType:
right = self[right]
elif operandType is StringType:
right = self[right]
elif operandType is ListType:
right = self.evaluate(right)
return (left, right)
def evaluate(self, query):
"""Evaluate a parsed query"""
# Strip off meaningless layers
while isinstance(query, ListType) and len(query) == 1:
query = query[0]
# If it'
s
not
a
list
,
assume
a
string
or
number
if
not
isinstance
(
query
,
ListType
):
return
self
[
query
]
# Now we need to loop through the query and reduce
# operators. They are currently evaluated in the following
# order: AndNot -> And -> Or -> Near
i
=
0
while
(
i
<
len
(
query
)):
if
query
[
i
]
is
AndNot
:
left
,
right
=
self
.
get_operands
(
query
,
i
)
val
=
left
.
and_not
(
right
)
query
[(
i
-
1
)
:
(
i
+
2
)]
=
[
val
]
else
:
i
=
i
+
1
i
=
0
while
(
i
<
len
(
query
)):
if
query
[
i
]
is
And
:
left
,
right
=
self
.
get_operands
(
query
,
i
)
val
=
left
&
right
query
[(
i
-
1
)
:
(
i
+
2
)]
=
[
val
]
else
:
i
=
i
+
1
i
=
0
while
(
i
<
len
(
query
)):
if
query
[
i
]
is
Or
:
left
,
right
=
self
.
get_operands
(
query
,
i
)
val
=
left
|
right
query
[(
i
-
1
)
:
(
i
+
2
)]
=
[
val
]
else
:
i
=
i
+
1
i
=
0
while
(
i
<
len
(
query
)):
if
query
[
i
]
is
Near
:
left
,
right
=
self
.
get_operands
(
query
,
i
)
val
=
left
.
near
(
right
)
query
[(
i
-
1
)
:
(
i
+
2
)]
=
[
val
]
else
:
i
=
i
+
1
if
(
len
(
query
)
!=
1
):
raise
QueryError
,
"Malformed query"
return
query
[
0
]
def parse(s):
    """Parse parentheses and quotes

    The lower-cased query string is turned into a list: text outside
    of parentheses is broken up by quotes(), and every parenthesized
    group becomes a nested list produced by a recursive parse().
    """
    result = []
    rest = s.lower()

    group = parens(rest)
    while group is not None:
        before, inside, rest = group[0], group[1], group[2]
        # Look for quotes in the section of the string before
        # the parentheses, then parse the string inside the parens
        result = result + quotes(before)
        result.append(parse(inside))
        # continue looking through the rest of the string
        group = parens(rest)

    return result + quotes(rest)
def parse2(q, default_operator,
           operator_dict={AndNot: AndNot, And: And, Or: Or, Near: Near}):
    """Find operators and operands

    Normalizes the list 'q' in place (and returns it) so that operands
    sit at even and operators at odd positions: where an operator is
    expected but missing, 'default_operator' is inserted.  Operator
    strings found at odd positions are replaced by the identical
    objects from 'operator_dict'.  Nested lists are normalized
    recursively.

    NOTE(review): the recursive call does not forward 'operator_dict',
    so nested lists always use the default table.
    """
    pos = 0
    while pos < len(q):
        element = q[pos]
        odd = pos % 2
        if isinstance(element, ListType):
            q[pos] = parse2(element, default_operator)
            missing_operator = odd
        elif odd:
            # This element should be an operator
            if operator_dict.has_key(element):
                # Ensure that it is identical, not merely equal.
                q[pos] = operator_dict[element]
                missing_operator = 0
            else:
                missing_operator = 1
        else:
            missing_operator = 0

        if missing_operator:
            # Insert the default operator and step over it.
            q.insert(pos, default_operator)
            pos = pos + 1
        pos = pos + 1

    return q
def
parens
(
s
,
parens_re
=
re
.
compile
(
'[()]'
).
search
):
mo
=
parens_re
(
s
)
if
mo
is
None
:
return
open_index
=
mo
.
start
(
0
)
+
1
paren_count
=
0
while
mo
is
not
None
:
index
=
mo
.
start
(
0
)
if
s
[
index
]
==
'('
:
paren_count
=
paren_count
+
1
else
:
paren_count
=
paren_count
-
1
if
paren_count
==
0
:
return
(
s
[:
open_index
-
1
],
s
[
open_index
:
index
],
s
[
index
+
1
:])
if
paren_count
<
0
:
break
mo
=
parens_re
(
s
,
index
+
1
)
raise
QueryError
,
"Mismatched parentheses"
def
quotes
(
s
):
split
=
string
.
split
if
'"'
not
in
s
:
return
split
(
s
)
# split up quoted regions
splitted
=
re
.
split
(
'
\
s*
\
"
\
s*
'
, s)
if (len(splitted) % 2) == 0: raise QueryError, "Mismatched quotes"
for i in range(1,len(splitted),2):
# split the quoted region into words
words = splitted[i] = split(splitted[i])
# put the Proxmity operator in between quoted words
j = len(words) - 1
while j > 0:
words.insert(j, Near)
j = j - 1
i = len(splitted) - 1
while i >= 0:
# split the non-quoted region into words
splitted[i:i+1] = split(splitted[i])
i = i - 2
return filter(None, splitted)
lib/python/SearchIndex/__init__.py
deleted
100644 → 0
View file @
2dc887a3
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
__doc__
=
'''Collected utilities to support database indexing.
$Id: __init__.py,v 1.10 2002/08/14 21:46:24 mj Exp $'''
__version__
=
'$Revision: 1.10 $'
[
11
:
-
2
]
import
warnings
warnings
.
warn
(
"The usage of the SearchIndex package is deprecated since
\
Zope 2.4.
\
n
\
This package is only kept for backwards compatibility for a while
\
n
\
and will go away in a future release.
\
n
\
\
n
\
Please use instead the re-factored modules in Products/PluginIndexes.
\
n
\
"
,
DeprecationWarning
)
lib/python/SearchIndex/randid.py
deleted
100644 → 0
View file @
2dc887a3
##############################################################################
#
# Copyright (c) 2001 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
#############################################################################
# 'whrandom' is a long-deprecated module that later Python releases
# no longer ship; 'random' offers the same randint() and choice().
import random


def randid(randint=random.randint, choice=random.choice, signs=(-1, 1)):
    """Return a random non-zero integer id.

    The magnitude is drawn with randint(1, 2000000000) and the sign is
    picked with choice(signs).  The three parameters only exist to
    bind the helpers at definition time; callers normally pass
    nothing.
    """
    return choice(signs) * randint(1, 2000000000)


# The helpers are bound as default arguments above, so the module
# name itself does not have to stay in this namespace.
del random
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment