Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Z
Zope
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
0
Merge Requests
0
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Kirill Smelkov
Zope
Commits
cb45cbcc
Commit
cb45cbcc
authored
Jul 03, 2009
by
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
- removed deprecated TextIndex
parent
adc7e054
Changes
33
Show whitespace changes
Inline
Side-by-side
Showing
33 changed files
with
4 additions
and
4008 deletions
+4
-4008
doc/CHANGES.rst
doc/CHANGES.rst
+2
-0
setup.py
setup.py
+0
-15
src/Products/PluginIndexes/README.txt
src/Products/PluginIndexes/README.txt
+0
-26
src/Products/PluginIndexes/TextIndex/GlobbingLexicon.py
src/Products/PluginIndexes/TextIndex/GlobbingLexicon.py
+0
-269
src/Products/PluginIndexes/TextIndex/Lexicon.py
src/Products/PluginIndexes/TextIndex/Lexicon.py
+0
-220
src/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/SETUP.cfg
...nIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/SETUP.cfg
+0
-3
src/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/__init__.py
...ndexes/TextIndex/Splitter/ISO_8859_1_Splitter/__init__.py
+0
-4
src/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/src/ISO_8859_1_Splitter.c
...ex/Splitter/ISO_8859_1_Splitter/src/ISO_8859_1_Splitter.c
+0
-593
src/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/SETUP.cfg
...luginIndexes/TextIndex/Splitter/UnicodeSplitter/SETUP.cfg
+0
-3
src/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/__init__.py
...ginIndexes/TextIndex/Splitter/UnicodeSplitter/__init__.py
+0
-1
src/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c
.../TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c
+0
-429
src/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/tests/__init__.py
...exes/TextIndex/Splitter/UnicodeSplitter/tests/__init__.py
+0
-1
src/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/tests/testUnicodeSplitter.py
...dex/Splitter/UnicodeSplitter/tests/testUnicodeSplitter.py
+0
-75
src/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/SETUP.cfg
...s/PluginIndexes/TextIndex/Splitter/ZopeSplitter/SETUP.cfg
+0
-3
src/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/__init__.py
...PluginIndexes/TextIndex/Splitter/ZopeSplitter/__init__.py
+0
-4
src/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c
...ndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c
+0
-529
src/Products/PluginIndexes/TextIndex/Splitter/__init__.py
src/Products/PluginIndexes/TextIndex/Splitter/__init__.py
+0
-33
src/Products/PluginIndexes/TextIndex/Splitter/setup.py
src/Products/PluginIndexes/TextIndex/Splitter/setup.py
+0
-23
src/Products/PluginIndexes/TextIndex/TextIndex.py
src/Products/PluginIndexes/TextIndex/TextIndex.py
+0
-760
src/Products/PluginIndexes/TextIndex/Vocabulary.py
src/Products/PluginIndexes/TextIndex/Vocabulary.py
+0
-138
src/Products/PluginIndexes/TextIndex/__init__.py
src/Products/PluginIndexes/TextIndex/__init__.py
+0
-7
src/Products/PluginIndexes/TextIndex/dtml/addTextIndex.dtml
src/Products/PluginIndexes/TextIndex/dtml/addTextIndex.dtml
+0
-86
src/Products/PluginIndexes/TextIndex/dtml/addVocabulary.dtml
src/Products/PluginIndexes/TextIndex/dtml/addVocabulary.dtml
+0
-112
src/Products/PluginIndexes/TextIndex/dtml/manageTextIndex.dtml
...roducts/PluginIndexes/TextIndex/dtml/manageTextIndex.dtml
+0
-79
src/Products/PluginIndexes/TextIndex/dtml/manageVocabulary.dtml
...oducts/PluginIndexes/TextIndex/dtml/manageVocabulary.dtml
+0
-17
src/Products/PluginIndexes/TextIndex/dtml/manage_vocab.dtml
src/Products/PluginIndexes/TextIndex/dtml/manage_vocab.dtml
+0
-117
src/Products/PluginIndexes/TextIndex/dtml/vocab_query.dtml
src/Products/PluginIndexes/TextIndex/dtml/vocab_query.dtml
+0
-12
src/Products/PluginIndexes/TextIndex/help/TextIndex_searchResults.stx
.../PluginIndexes/TextIndex/help/TextIndex_searchResults.stx
+0
-23
src/Products/PluginIndexes/TextIndex/tests/__init__.py
src/Products/PluginIndexes/TextIndex/tests/__init__.py
+0
-15
src/Products/PluginIndexes/TextIndex/tests/testSplitter.py
src/Products/PluginIndexes/TextIndex/tests/testSplitter.py
+0
-89
src/Products/PluginIndexes/TextIndex/tests/testTextIndex.py
src/Products/PluginIndexes/TextIndex/tests/testTextIndex.py
+0
-277
src/Products/PluginIndexes/__init__.py
src/Products/PluginIndexes/__init__.py
+1
-14
src/Products/PluginIndexes/interfaces.py
src/Products/PluginIndexes/interfaces.py
+1
-31
No files found.
doc/CHANGES.rst
View file @
cb45cbcc
...
@@ -19,6 +19,8 @@ Features Added
...
@@ -19,6 +19,8 @@ Features Added
Restructuring
Restructuring
+++++++++++++
+++++++++++++
- PluginIndexes: Removed deprecated TextIndex.
- HelpSys now uses ZCTextIndex instead of the deprecated TextIndex. Please
- HelpSys now uses ZCTextIndex instead of the deprecated TextIndex. Please
update your Zope databases by deleting the Product registrations in the
update your Zope databases by deleting the Product registrations in the
Control Panel and restarting Zope.
Control Panel and restarting Zope.
...
...
setup.py
View file @
cb45cbcc
...
@@ -75,21 +75,6 @@ params = dict(name='Zope2',
...
@@ -75,21 +75,6 @@ params = dict(name='Zope2',
sources
=
[
'src/initgroups/_initgroups.c'
]),
sources
=
[
'src/initgroups/_initgroups.c'
]),
# indexes
# indexes
Extension
(
name
=
'Products.PluginIndexes.TextIndex.Splitter.'
'ZopeSplitter.ZopeSplitter'
,
sources
=
[
'src/Products/PluginIndexes/TextIndex/Splitter/'
'ZopeSplitter/src/ZopeSplitter.c'
]),
Extension
(
name
=
'Products.PluginIndexes.TextIndex.Splitter.'
'ISO_8859_1_Splitter.ISO_8859_1_Splitter'
,
sources
=
[
'src/Products/PluginIndexes/TextIndex/Splitter/'
'ISO_8859_1_Splitter/src/ISO_8859_1_Splitter.c'
]),
Extension
(
name
=
'Products.PluginIndexes.TextIndex.Splitter.'
'UnicodeSplitter.UnicodeSplitter'
,
sources
=
[
'src/Products/PluginIndexes/TextIndex/Splitter/'
'UnicodeSplitter/src/UnicodeSplitter.c'
]),
Extension
(
Extension
(
name
=
'Products.ZCTextIndex.stopper'
,
name
=
'Products.ZCTextIndex.stopper'
,
sources
=
[
'src/Products/ZCTextIndex/stopper.c'
]),
sources
=
[
'src/Products/ZCTextIndex/stopper.c'
]),
...
...
src/Products/PluginIndexes/README.txt
View file @
cb45cbcc
...
@@ -47,34 +47,8 @@ Changes to Indexes:
...
@@ -47,34 +47,8 @@ Changes to Indexes:
- new index type
- new index type
Changes to TextIndex:
- The ZMI allows selecting a different vocabulary. To use a vocabulary different
from the ZCatalogs default vocabulary 'Vocabulary' you must create a new
Vocabulary through the ZMI of the ZCatalog. After creating the vocabulary you
can choose the vocabulary on the ZMI management screen for the text index.
- the default operator might be overridden by specifying a new one
as 'operator' (see below)
- removed direct dependency from Splitter module. Splitter is now
acquired from used vocabulary
- usage of the 'textindex_operator' is deprecated
- lots of internal rework
Changes to Vocabulary:
- added Splitter selection to the add form
Changes to ZCatalog
Changes to ZCatalog
- Vocabulary.py moved to Products/PluginIndexes/TextIndex. A wrapper
for backward compatibility is in place
- added ZCatalogIndexes.py to provide access to indexes with pluggable
- added ZCatalogIndexes.py to provide access to indexes with pluggable
index interface
index interface
...
...
src/Products/PluginIndexes/TextIndex/GlobbingLexicon.py
deleted
100644 → 0
View file @
adc7e054
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
#############################################################################
import
re
import
string
from
BTrees.IIBTree
import
IISet
,
union
,
IITreeSet
from
BTrees.OIBTree
import
OIBTree
from
BTrees.IOBTree
import
IOBTree
from
BTrees.OOBTree
import
OOBTree
from
Products.PluginIndexes.common.randid
import
randid
from
Products.PluginIndexes.TextIndex.TextIndex
import
Op
from
Products.PluginIndexes.TextIndex.TextIndex
import
Or
from
Products.PluginIndexes.TextIndex.Lexicon
import
Lexicon
from
Products.PluginIndexes.TextIndex.Splitter
import
getSplitter
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching. They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => word_id

    Before going further, it is necessary to understand what a digram is,
    as it is a core component of the structure of this lexicon.  A digram
    is a two-letter sequence in a word.  For example, the word 'zope'
    would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where the '$' is a word marker.  It is used at the beginning and end
    of the words.  Those digrams are significant.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'

    def __init__(self, useSplitter=None, extra=None):
        """Create an empty lexicon using the splitter named `useSplitter`.

        `extra` is stored as `splitterParams`; `Splitter()` reads the
        splitter options from its attributes.
        """
        self.clear()
        self.useSplitter = useSplitter
        self.splitterParams = extra
        self.SplitterFunc = getSplitter(self.useSplitter)

    def clear(self):
        """Reset the word, inverse and digram mappings to empty BTrees."""
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

    def _convertBTrees(self, threshold=200):
        """Convert the base lexicon, then `_digrams`, to current BTrees."""
        Lexicon._convertBTrees(self, threshold)
        if type(self._digrams) is OOBTree:
            return

        from BTrees.convert import convert

        old_digrams = self._digrams
        self._digrams = OOBTree()
        self._digrams._p_jar = self._p_jar
        convert(old_digrams, self._digrams, threshold, IITreeSet)

    def createDigrams(self, word):
        """Returns a list with the set of digrams in the word."""
        marked = '$' + word + '$'
        return [marked[i:i + 2] for i in range(len(marked) - 1)]

    def getWordId(self, word):
        """Provided 'word', return the matching integer word id.

        A new id is assigned when the word is not in the lexicon yet.
        """
        wid = self._lexicon.get(word, None)
        if wid is None:
            wid = self.assignWordId(word)
        return wid

    set = getWordId         # Kludge for old code

    def getWord(self, wid):
        """Return the word stored for `wid`, or None if there is none."""
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word, and return it."""
        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        known = self._lexicon.get(word, None)
        if known is not None:
            return known

        # Get word id. BBB Backward compat pain.
        inverse = self._inverseLex
        try:
            inverse.insert
        except AttributeError:
            # we have an "old" BTree object
            if inverse:
                wid = inverse.keys()[-1] + 1
            else:
                # Rebind the local name as well: the original code stored
                # the word in the discarded old object, so the wid => word
                # entry never reached the new IOBTree.
                inverse = self._inverseLex = IOBTree()
                wid = 1
            inverse[wid] = word
        else:
            # we have a "new" IOBTree object; insert() returns a false
            # value when the id is already taken, so retry with a new one.
            wid = randid()
            while not inverse.insert(wid, word):
                wid = randid()

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.
        for digram in self.createDigrams(word):
            wids = self._digrams.get(digram, None)
            if wids is None:
                self._digrams[digram] = wids = IISet()
            wids.insert(wid)

        return wid

    def get(self, pattern):
        """ Query the lexicon for words matching a pattern.

        Returns an empty tuple when nothing matches, a 1-tuple with the
        word id for a pattern without wildcards, and an IISet of word ids
        for a globbing pattern.
        """
        # single word pattern produce a slicing problem below.
        # Because the splitter throws away single characters we can
        # return an empty tuple here.
        if len(pattern) == 1:
            return ()

        wildcards = (self.multi_wc, self.single_wc)
        digrams = []
        globbing = False
        last = len(pattern) - 1

        for i, char in enumerate(pattern):
            if char in wildcards:
                globbing = True
                continue

            if i == 0:
                digrams.insert(0, self.eow + char)
                digrams.append(char + pattern[1])
            elif i == last:
                digrams.append(char + self.eow)
            elif pattern[i + 1] not in wildcards:
                digrams.append(char + pattern[i + 1])

        if not globbing:
            wid = self._lexicon.get(pattern, None)
            if wid is None:
                return ()
            return (wid,)

        ## now get all of the intsets that contain the result digrams
        candidates = None
        for digram in digrams:
            candidates = union(candidates, self._digrams.get(digram, None))

        if not candidates:
            return ()

        ## now we have narrowed the list of possible candidates
        ## down to those words which contain digrams.  However,
        ## some words may have been returned that match digrams,
        ## but do not match 'pattern'.  This is because some words
        ## may contain all matching digrams, but in the wrong
        ## order.
        expr = re.compile(self.createRegex(pattern))
        hits = IISet()
        for wid in candidates:
            if expr.match(self._inverseLex[wid]):
                hits.insert(wid)
        return hits

    def __getitem__(self, word):
        """Same as get(word)."""
        return self.get(word)

    def query_hook(self, q):
        """expand wildcards

        Walks the query list `q` from the end, recursing into nested
        lists, and replaces each wildcard term in place with a list of
        matching word ids joined by `Or`.  Returns `q`.
        """
        for i in range(len(q) - 1, -1, -1):
            term = q[i]
            if isinstance(term, list):
                self.query_hook(term)
            elif isinstance(term, Op):
                pass
            elif (self.multi_wc in term) or (self.single_wc in term):
                expanded = []
                for wid in self.get(term):
                    if expanded:
                        expanded.append(Or)
                    expanded.append(wid)
                if not expanded:
                    # if words is empty, return something that will make
                    # textindex's __getitem__ return an empty result list
                    expanded.append('')
                q[i] = expanded
        return q

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter """
        ## don't do anything, less efficient but there's not much
        ## sense in stemming a globbing lexicon.
        try:
            params = self.splitterParams
            return self.SplitterFunc(
                    astring,
                    words,
                    encoding=encoding,
                    singlechar=params.splitterSingleChars,
                    indexnumbers=params.splitterIndexNumbers,
                    casefolding=params.splitterCasefolding)
        except Exception:
            # Best-effort fallback (e.g. splitterParams is None or lacks
            # the option attributes): call the splitter with positional
            # arguments only.
            return self.SplitterFunc(astring, words)

    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """
        # Remove characters that are meaningful in a regex
        meta = r'()&|!@#$%^{}\<>.'
        if not isinstance(pat, unicode):
            identity = string.maketrans("", "")
            result = string.translate(pat, identity, meta)
        else:
            drop = {}
            for ch in meta:
                drop[ord(ch)] = None
            result = pat.translate(drop)

        # First, deal with multi-character globbing
        result = result.replace('*', '.*')

        # Next, we need to deal with single-character globbing
        result = result.replace('?', '.')

        return "%s$" % result
src/Products/PluginIndexes/TextIndex/Lexicon.py
deleted
100644 → 0
View file @
adc7e054
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
__doc__
=
""" Module breaks out Zope specific methods and behavior. In
addition, provides the Lexicon class which defines a word to integer
mapping.
"""
from
Acquisition
import
Implicit
from
BTrees.OIBTree
import
OIBTree
from
BTrees.IOBTree
import
IOBTree
from
BTrees.IIBTree
import
IISet
from
BTrees.IIBTree
import
IITreeSet
from
Persistence
import
Persistent
from
Products.PluginIndexes.common.randid
import
randid
from
Products.PluginIndexes.TextIndex.Splitter
import
getSplitter
from
Products.PluginIndexes.TextIndex.Splitter
import
splitterNames
from
types
import
StringType
class Lexicon(Persistent, Implicit):
    """Maps words to word ids and then some

    The Lexicon object is an attempt to abstract vocabularies out of
    Text indexes.  This abstraction is not totally cooked yet, this
    module still includes the parser for the 'Text Index Query
    Language' and a few other hacks.
    """

    # default for older objects
    stop_syn = {}

    def __init__(self, stop_syn=None, useSplitter=None, extra=None):
        """Create an empty lexicon.

        `stop_syn` is the stop word / synonym mapping (a fresh dict when
        None).  `useSplitter` names the splitter; the first entry of
        `splitterNames` is used when it is false.  `extra` is stored as
        `splitterParams` and read by `Splitter()`.
        """
        self.clear()
        if stop_syn is None:
            self.stop_syn = {}
        else:
            self.stop_syn = stop_syn

        self.useSplitter = useSplitter or splitterNames[0]
        self.splitterParams = extra
        self.SplitterFunc = getSplitter(self.useSplitter)

    def clear(self):
        """Reset the word and inverse mappings to empty BTrees."""
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()

    def _convertBTrees(self, threshold=200):
        """Convert `_lexicon` and `_inverseLex` to OIBTree / IOBTree.

        Does nothing when both already have exactly those types.
        """
        if (type(self._lexicon) is OIBTree and
            type(getattr(self, '_inverseLex', None)) is IOBTree):
            return

        from BTrees.convert import convert

        old_lexicon = self._lexicon
        self._lexicon = OIBTree()
        self._lexicon._p_jar = self._p_jar
        convert(old_lexicon, self._lexicon, threshold)

        try:
            old_inverse = self._inverseLex
            self._inverseLex = IOBTree()
        except AttributeError:
            # older lexicons didn't have an inverse lexicon
            self._inverseLex = IOBTree()
            old_inverse = self._inverseLex

        self._inverseLex._p_jar = self._p_jar
        convert(old_inverse, self._inverseLex, threshold)

    def set_stop_syn(self, stop_syn):
        """ pass in a mapping of stopwords and synonyms.  Format is:

        {'word' : [syn1, syn2, ..., synx]}

        Vocabularies do not necessarily need to implement this if their
        splitters do not support stemming or stopping.
        """
        self.stop_syn = stop_syn

    def getWordId(self, word):
        """ return the word id of 'word' """
        wid = self._lexicon.get(word, None)
        if wid is None:
            wid = self.assignWordId(word)
        return wid

    set = getWordId

    def getWord(self, wid):
        """ post-2.3.1b2 method, will not work with unconverted lexicons """
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word and returns it."""
        # First make sure it's not already in there
        known = self._lexicon.get(word, None)
        if known is not None:
            return known

        try:
            inverse = self._inverseLex
        except AttributeError:
            # whoops, old lexicon without wids: rebuild the inverse map.
            # The loop uses its own names; the original reused `word` here
            # and so registered the new id under the wrong word.
            inverse = self._inverseLex = IOBTree()
            for known_word, known_wid in self._lexicon.items():
                inverse[known_wid] = known_word

        # insert() returns a false value when the id is already taken.
        wid = randid()
        while not inverse.insert(wid, word):
            wid = randid()

        if isinstance(word, str):
            self._lexicon[intern(word)] = wid
        else:
            self._lexicon[word] = wid

        return wid

    def get(self, key, default=None):
        """Return the matched word against the key.

        The result is an IISet holding the word id of `key`, or
        `default` if the key is unknown and `default` is not None;
        otherwise the set is empty.
        """
        matches = IISet()
        wid = self._lexicon.get(key, default)
        if wid is not None:
            matches.insert(wid)
        return matches

    def __getitem__(self, key):
        """Same as get(key)."""
        return self.get(key)

    def __len__(self):
        """Return the number of words in the lexicon."""
        return len(self._lexicon)

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter """
        if words is None:
            words = self.stop_syn

        try:
            params = self.splitterParams
            return self.SplitterFunc(
                    astring,
                    words,
                    encoding=encoding,
                    singlechar=params.splitterSingleChars,
                    indexnumbers=params.splitterIndexNumbers,
                    casefolding=params.splitterCasefolding)
        except Exception:
            # Best-effort fallback (e.g. splitterParams is None or lacks
            # the option attributes): call the splitter with positional
            # arguments only.
            return self.SplitterFunc(astring, words)

    def query_hook(self, q):
        """ we don't want to modify the query cuz we're dumb """
        return q
# Default English stop words.  A few entries ('am', 'per', 're') appear
# twice; the tuple is kept exactly as it was.
stop_words = (
    'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across',
    'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone',
    'along', 'already', 'also', 'although', 'always', 'am', 'among',
    'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any',
    'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around',
    'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes',
    'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
    'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
    'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could',
    'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due',
    'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else',
    'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone',
    'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty',
    'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly',
    'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get',
    'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her',
    'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
    'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i',
    'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it',
    'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least',
    'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill',
    'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must',
    'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless',
    'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not',
    'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
    'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our',
    'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps',
    'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem',
    'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should',
    'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some',
    'somehow', 'someone', 'something', 'sometime', 'sometimes',
    'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the',
    'their', 'them', 'themselves', 'then', 'thence', 'there',
    'thereafter', 'thereby', 'therefore', 'therein', 'thereupon',
    'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 'though',
    'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together',
    'too', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under',
    'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well',
    'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
    'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
    'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
    'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
    'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves',
    )

# Mapping form of the list above: every stop word maps to None.
stop_word_dict = dict.fromkeys(stop_words)
src/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/SETUP.cfg
deleted
100644 → 0
View file @
adc7e054
<extension ISO_8859_1_Splitter>
source src/ISO_8859_1_Splitter.c
</extension>
src/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/__init__.py
deleted
100644 → 0
View file @
adc7e054
from
ISO_8859_1_Splitter
import
ISO_8859_1_Splitter
def Splitter(txt, stopwords=None, encoding='latin1'):
    """Return an ISO_8859_1_Splitter for `txt` and `stopwords`.

    `encoding` is accepted but not passed on to the splitter.
    """
    splitter = ISO_8859_1_Splitter(txt, stopwords)
    return splitter
src/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/src/ISO_8859_1_Splitter.c
deleted
100644 → 0
View file @
adc7e054
/*****************************************************************************
Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/
#include "Python.h"
#include <ctype.h>
#define ASSIGN(V,E) {PyObject *__e; __e=(E); Py_XDECREF(V); (V)=__e;}
#define UNLESS(E) if(!(E))
#define UNLESS_ASSIGN(V,E) ASSIGN(V,E) UNLESS(V)
#define UPPERCASE "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#define LOWERCASE "abcdefghijklmnopqrstuvwxyz"
#define DIGITSETC "0123456789-"
static
unsigned
char
letdig
[
256
];
static
unsigned
char
trtolower
[
256
];
/* Per-object state of a Splitter. */
typedef struct
{
    PyObject_HEAD
    /* text: the string being split (Splitter_reset() reads its buffer);
     * synstop: mapping consulted by check_synstop(), may be NULL. */
    PyObject *text, *synstop;
    /* here: scan cursor, advanced by next_word();
     * end: scan limit read by next_word() (set outside this excerpt). */
    char *here, *end;
    /* Index of the last word returned; Splitter_reset() sets it to -1 and
     * next_word() increments it for every word it returns. */
    int index;
    /* When zero, check_synstop() treats words shorter than two characters
     * as stop words. */
    int allow_single_chars;
    /* When zero, check_synstop() treats words without any alphabetic
     * character as stop words. */
    int index_numbers;
    /* next_word() keeps at most this many characters of a word. */
    int max_len;
    /* When nonzero, next_word() folds characters through mytolower(). */
    int casefolding;
}
Splitter;
static
PyObject
*
next_word
(
Splitter
*
,
char
**
,
char
**
);
/* Return nonzero when the low byte of `c` is marked as a word character
 * in the letdig table. */
static int
myisalnum(int c)
{
    const unsigned char byte = (unsigned char) c;

    return letdig[byte];
}
/* Return the trtolower table entry for the low byte of `c`. */
static int
mytolower(int c)
{
    const unsigned char byte = (unsigned char) c;

    return trtolower[byte];
}
/* Return nonzero when `c` is whitespace and not a word character.
 *
 * Callers pass plain `char` values, which may be negative; isspace() is
 * only defined for EOF and values representable as unsigned char, so the
 * argument is converted first.
 */
static int
myisspace(int c)
{
    if (myisalnum(c))
        return 0;

    return isspace((unsigned char) c);
}
/* Fill the letdig and trtolower tables.  Only the first call does any
 * work; later calls return immediately.
 *
 * After the call, trtolower maps each UPPERCASE letter to the LOWERCASE
 * letter at the same position and every other byte to itself, and letdig
 * is 1 for the bytes in UPPERCASE, LOWERCASE and DIGITSETC, 0 elsewhere.
 */
static void
initSplitterTrtabs(void)
{
    static int initialized = 0;
    size_t n;
    int i;

    if (initialized)
        return;
    initialized = 1;

    for (i = 0; i < 256; i++) {
        letdig[i] = 0;
        trtolower[i] = (unsigned char) i;
    }

    /* sizeof() of a string literal counts the terminating NUL; stop one
     * short so that byte 0 is not marked as a word character. */
    for (n = 0; n < sizeof(UPPERCASE) - 1; n++) {
        trtolower[(unsigned char) UPPERCASE[n]] = LOWERCASE[n];
        letdig[(unsigned char) LOWERCASE[n]] = 1;
        letdig[(unsigned char) UPPERCASE[n]] = 1;
    }

    for (n = 0; n < sizeof(DIGITSETC) - 1; n++) {
        letdig[(unsigned char) DIGITSETC[n]] = 1;
    }
}
/* Rewind the splitter: no word returned yet, cursor at the start of the
 * text buffer. */
static void
Splitter_reset(Splitter *self)
{
    char *text_start = PyString_AsString(self->text);

    self->here = text_start;
    self->index = -1;
}
/* Destructor: drop the references to the text and the synonym/stop
 * mapping (either may be NULL), then free the object. */
static void
Splitter_dealloc(Splitter *self)
{
    PyObject *text = self->text;
    PyObject *synstop = self->synstop;

    Py_XDECREF(text);
    Py_XDECREF(synstop);
    PyObject_DEL(self);
}
/* sq_length: rescan the whole text and return the number of words,
 * or -1 when next_word() fails. */
static int
Splitter_length(Splitter *self)
{
    PyObject *current = NULL;

    Splitter_reset(self);

    for (;;) {
        PyObject *next = next_word(self, NULL, NULL);

        /* Release the previous word only after the next one was fetched. */
        Py_XDECREF(current);
        current = next;
        if (current == NULL)
            return -1;

        if (!PyString_Check(current)) {
            /* A non-string result ends the scan. */
            Py_DECREF(current);
            break;
        }
    }

    return self->index + 1;
}
/* split(): rescan the text and return a new list with every word.
 * Returns NULL with an exception set on failure. */
static PyObject *
Splitter_split(Splitter *self)
{
    PyObject *list, *word;

    list = PyList_New(0);
    if (list == NULL)
        return NULL;

    Splitter_reset(self);

    for (;;) {
        word = next_word(self, NULL, NULL);
        if (word == NULL)
            goto err;

        if (word == Py_None) {
            /* End of text; next_word() handed us a reference to None. */
            Py_DECREF(word);
            return list;
        }

        /* PyList_Append() takes its own reference to the item. */
        if (PyList_Append(list, word) < 0) {
            Py_DECREF(word);
            goto err;
        }
        Py_DECREF(word);
    }

err:
    Py_DECREF(list);
    return NULL;
}
/* sq_concat: not supported; always raises TypeError. */
static PyObject *
Splitter_concat(Splitter *self, PyObject *other)
{
    static const char message[] = "Cannot concatenate Splitters.";

    PyErr_SetString(PyExc_TypeError, message);
    return NULL;
}
/* sq_repeat: not supported; always raises TypeError. */
static PyObject *
Splitter_repeat(Splitter *self, long n)
{
    static const char message[] = "Cannot repeat Splitters.";

    PyErr_SetString(PyExc_TypeError, message);
    return NULL;
}
/*
  Map an input word to an output word by applying standard
  filtering/mapping words, including synonyms/stop words.

  Input is a word.

  Output is:

     None -- The word is a stop word

     sometext -- A replacement for the word

  The result is a new reference.  NULL (with an exception set) is
  returned only when `word` is not a string.
 */
static PyObject *
check_synstop(Splitter *self, PyObject *word)
{
    PyObject *value;
    char *cword;
    int len;

    cword = PyString_AsString(word);
    if (cword == NULL)
        return NULL;            /* not a string; TypeError is set */
    len = (int) PyString_Size(word);

    if (len < 2 && !self->allow_single_chars) {
        /* Single-letter words are stop words! */
        Py_INCREF(Py_None);
        return Py_None;
    }

    /* Test whether the word has any letters: scan backwards until an
     * alphabetic character is found; len ends below 0 if there is none. */
    for (; --len >= 0 && !isalpha((unsigned char) cword[len]); )
        ;
    if (len < 0 && !self->index_numbers) {
        /* No letters: treat it as a stop word. */
        Py_INCREF(Py_None);
        return Py_None;
    }

    Py_INCREF(word);

    if (self->synstop == NULL)
        return word;

    /* Follow the synonym chain while the mapping yields strings. */
    while ((value = PyObject_GetItem(self->synstop, word)) &&
           PyString_Check(value)) {
        Py_DECREF(word);
        word = value;           /* takes over the reference from GetItem */

        /* Avoid infinite recursion.  NOTE: len still holds the position
         * left by the letter scan above, so the cut-off is not a fixed
         * number of steps. */
        if (len++ > 100)
            break;
    }

    if (value == NULL) {
        /* Lookup failed: the word has no (further) mapping. */
        PyErr_Clear();
        return word;
    }

    /* value is a non-string result (which must be None!), or the word
     * itself when the chain was cut off.  In the first case our own
     * reference to word is no longer needed. */
    if (value != word)
        Py_DECREF(word);

    return value;
}
/* Scan forward from self->here and return the next word that is not a
 * stop word.
 *
 * Returns a new reference: the word (after check_synstop() mapping), or
 * None when the end of the text is reached without a word.  Returns NULL
 * with an exception set on failure.  self->index is incremented for each
 * word returned and self->here is left where scanning stopped.  When
 * startpos / endpos are not NULL they receive the positions of the first
 * character of the word and of the character that ended it.
 */
static PyObject *
next_word(Splitter *self, char **startpos, char **endpos)
{
    char wbuf[256];
    char *end, *here, *b;
    int i = 0, c;
    int limit;
    PyObject *pyword, *res;

    here = self->here;
    end = self->end;
    b = wbuf;

    /* Never store more characters than wbuf can hold, whatever max_len
     * is (whether max_len is validated elsewhere is not visible here). */
    limit = self->max_len;
    if (limit > (int) sizeof(wbuf))
        limit = (int) sizeof(wbuf);

    while (here < end) {
        /* skip hyphens */
        if ((i > 0) && (*here == '-')) {
            here++;
            /* Check the bound before reading the character. */
            while ((here < end) && myisspace(*here))
                here++;
            continue;
        }

        if (self->casefolding)
            c = mytolower(*here);
        else
            c = (*here);

        /* Check to see if this character is part of a word */
        if (myisalnum((unsigned char) c) || c == '/') {
            /* Found a word character */
            if (startpos && i == 0)
                *startpos = here;
            if (i < limit)
                *b++ = c;
            i++;
        }
        else if (i != 0) {
            /* We've found the end of a word */
            if (i >= limit)
                i = limit;      /* "stem" the long word */

            pyword = PyString_FromStringAndSize(wbuf, i);
            if (pyword == NULL) {
                self->here = here;
                return NULL;
            }

            res = check_synstop(self, pyword);
            if (res == NULL) {
                self->here = here;
                Py_DECREF(pyword);
                return NULL;
            }

            if (res != Py_None) {
                if (endpos)
                    *endpos = here;
                self->here = here;
                Py_DECREF(pyword);
                self->index++;
                return res;
            }

            /* The word is a stopword, so ignore it */
            Py_DECREF(res);
            Py_DECREF(pyword);
            i = 0;
            b = wbuf;
        }

        here++;
    }

    self->here = here;

    /* We've reached the end of the string */
    if (i >= limit)
        i = limit;              /* "stem" the long word */

    if (i == 0) {
        /* No words */
        Py_INCREF(Py_None);
        return Py_None;
    }

    pyword = PyString_FromStringAndSize(wbuf, i);
    if (pyword == NULL)
        return NULL;

    if (endpos)
        *endpos = here;

    res = check_synstop(self, pyword);
    Py_DECREF(pyword);
    if (res == NULL)
        return NULL;
    if (PyString_Check(res))
        self->index++;

    return res;
}
/* sq_item: return word number `i` as a new reference.
 *
 * Raises IndexError when `i` is negative or past the last word.  The
 * scan restarts from the beginning when `i` is not ahead of the current
 * position.
 */
static PyObject *
Splitter_item(Splitter *self, int i)
{
    PyObject *word = NULL;

    if (i < 0) {
        /* Without this check the loop below would not run and NULL would
         * be returned with no exception set. */
        PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
        return NULL;
    }

    if (i <= self->index)
        Splitter_reset(self);

    while (self->index < i) {
        Py_XDECREF(word);

        word = next_word(self, NULL, NULL);
        if (word == NULL)
            return NULL;

        if (word == Py_None) {
            /* Ran out of words before reaching index i. */
            Py_DECREF(word);
            PyErr_SetString(PyExc_IndexError,
                            "Splitter index out of range");
            return NULL;
        }
    }

    return word;
}
/* sq_slice: not supported; always raises TypeError. */
static PyObject *
Splitter_slice(Splitter *self, int i, int j)
{
    static const char message[] = "Cannot slice Splitters.";

    PyErr_SetString(PyExc_TypeError, message);
    return NULL;
}
/* Sequence protocol table for the Splitter type.  Length and item access
 * are implemented; concat, repeat and slice raise TypeError; item and
 * slice assignment are not provided. */
static PySequenceMethods Splitter_as_sequence = {
    (inquiry)Splitter_length,        /*sq_length*/
    (binaryfunc)Splitter_concat,     /*sq_concat*/
    (intargfunc)Splitter_repeat,     /*sq_repeat*/
    (intargfunc)Splitter_item,       /*sq_item*/
    (intintargfunc)Splitter_slice,   /*sq_slice*/
    (intobjargproc)0,                /*sq_ass_item*/
    (intintobjargproc)0,             /*sq_ass_slice*/
};
/* pos(index): return a (start, end) tuple with the offsets, relative to
 * the beginning of the text, of word number `index`.
 *
 * Raises IndexError when `index` is negative or past the last word.
 */
static PyObject *
Splitter_pos(Splitter *self, PyObject *args)
{
    char *start = NULL, *end = NULL, *ctext;
    PyObject *res;
    int i;

    if (!PyArg_Parse(args, "i", &i))
        return NULL;

    if (i < 0) {
        /* The loop below would not run and start/end would stay unset. */
        PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
        return NULL;
    }

    if (i <= self->index)
        Splitter_reset(self);

    while (self->index < i) {
        res = next_word(self, &start, &end);
        if (res == NULL)
            return NULL;

        if (!PyString_Check(res)) {
            Py_DECREF(res);
            PyErr_SetString(PyExc_IndexError,
                            "Splitter index out of range");
            return NULL;
        }

        /* next_word() has already advanced self->index for this word;
         * incrementing it again here made pos() skip every other word. */
        Py_DECREF(res);
    }

    ctext = PyString_AsString(self->text);
    if (ctext == NULL)
        return NULL;

    /* The "i" format expects int arguments, not ptrdiff_t. */
    return Py_BuildValue("(ii)", (int) (start - ctext), (int) (end - ctext));
}
/* indexes(word): return a new list with the positions (0-based word
 * numbers) at which `word` occurs in the text.
 *
 * The argument is first mapped through check_synstop(), so it is compared
 * in the same form as the words produced by next_word().
 */
static PyObject *
Splitter_indexes(Splitter *self, PyObject *args)
{
    PyObject *word, *key = NULL, *r, *w = NULL, *index = NULL;
    int i = 0;

    if (!PyArg_ParseTuple(args, "O", &word))
        return NULL;

    r = PyList_New(0);
    if (r == NULL)
        return NULL;

    /* key is a new reference that has to be released on every path. */
    key = check_synstop(self, word);
    if (key == NULL)
        goto err;

    Splitter_reset(self);

    for (;;) {
        PyObject *next = next_word(self, NULL, NULL);

        Py_XDECREF(w);
        w = next;
        if (w == NULL)
            goto err;

        if (!PyString_Check(w))
            break;              /* end of text */

        if (PyObject_Compare(key, w) == 0) {
            PyObject *number = PyInt_FromLong(i);

            Py_XDECREF(index);
            index = number;
            if (index == NULL)
                goto err;

            if (PyList_Append(r, index) < 0)
                goto err;
        }
        i++;
    }

    Py_XDECREF(w);
    Py_XDECREF(index);
    Py_DECREF(key);
    return r;

err:
    Py_DECREF(r);
    Py_XDECREF(w);
    Py_XDECREF(index);
    Py_XDECREF(key);
    return NULL;
}
/* Method table looked up by Splitter_getattr().  "split" and "pos" use
   old-style argument passing (flags 0); "indexes" takes an argument tuple. */
static struct PyMethodDef Splitter_methods[] = {
    { "split", (PyCFunction)Splitter_split, 0,
      "split() -- Split the string in one run"
    },
    { "pos", (PyCFunction)Splitter_pos, 0,
      "pos(index) -- Return the starting and ending position of a token"
    },
    { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
      /* docstring typo fixed: "al list" -> "a list" */
      "indexes(word) -- Return a list of the indexes of word in the sequence",
    },
    { NULL, NULL }              /* sentinel */
};
/* tp_getattr slot: resolve `name` against the Splitter_methods table. */
static PyObject *
Splitter_getattr(Splitter *self, char *name)
{
    PyObject *owner = (PyObject *)self;

    return Py_FindMethod(Splitter_methods, owner, name);
}
/* The type has no documentation text of its own. */
static char SplitterType__doc__[] = "";

/* Type object for Splitter instances.  Only deallocation, attribute lookup
   (methods via Splitter_getattr) and the sequence protocol are provided;
   every other slot is empty. */
static PyTypeObject SplitterType = {
    PyObject_HEAD_INIT(NULL)
    0,                                  /*ob_size*/
    "Splitter",                         /*tp_name*/
    sizeof(Splitter),                   /*tp_basicsize*/
    0,                                  /*tp_itemsize*/
    /* methods */
    (destructor)Splitter_dealloc,       /*tp_dealloc*/
    (printfunc)0,                       /*tp_print*/
    (getattrfunc)Splitter_getattr,      /*tp_getattr*/
    (setattrfunc)0,                     /*tp_setattr*/
    (cmpfunc)0,                         /*tp_compare*/
    (reprfunc)0,                        /*tp_repr*/
    0,                                  /*tp_as_number*/
    &Splitter_as_sequence,              /*tp_as_sequence*/
    0,                                  /*tp_as_mapping*/
    (hashfunc)0,                        /*tp_hash*/
    (ternaryfunc)0,                     /*tp_call*/
    (reprfunc)0,                        /*tp_str*/

    /* Space for future expansion */
    0L, 0L, 0L, 0L,
    SplitterType__doc__                 /* Documentation string */
};
/* Keyword names accepted by ISO_8859_1_Splitter(), in positional order. */
static char *splitter_args[] = {
    "doc", "synstop", "encoding", "singlechar",
    "indexnumbers", "maxlen", "casefolding", NULL
};

/*
 * Module-level factory: build a Splitter over str(doc).
 *
 * Optional arguments: synstop (mapping, kept by reference), encoding
 * (parsed but not used here), singlechar/indexnumbers/casefolding (0 or 1)
 * and maxlen (1..128).  Raises ValueError for out-of-range options.
 */
static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args, PyObject *keywds)
{
    Splitter *self;
    PyObject *doc, *synstop = NULL;
    char *encoding = "latin1";
    int single_char = 0;
    int index_numbers = 0;
    int max_len = 64;
    int casefolding = 1;
    const char *complaint = NULL;

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|Osiiii", splitter_args,
                                     &doc, &synstop, &encoding, &single_char,
                                     &index_numbers, &max_len, &casefolding))
        return NULL;

    /* Report the first option that is out of range. */
    if (index_numbers < 0 || index_numbers > 1)
        complaint = "indexnumbers must be 0 or 1";
    else if (casefolding < 0 || casefolding > 1)
        complaint = "casefolding must be 0 or 1";
    else if (single_char < 0 || single_char > 1)
        complaint = "singlechar must be 0 or 1";
    else if (max_len < 1 || max_len > 128)
        complaint = "maxlen must be between 1 and 128";

    if (complaint != NULL) {
        PyErr_SetString(PyExc_ValueError, complaint);
        return NULL;
    }

    self = PyObject_NEW(Splitter, &SplitterType);
    if (self == NULL)
        return NULL;

    /* Keep our own reference to the optional synonym/stop-word mapping. */
    Py_XINCREF(synstop);
    self->synstop = synstop;

    self->text = PyObject_Str(doc);
    if (self->text == NULL)
        goto err;

    self->here = PyString_AsString(self->text);
    if (self->here == NULL)
        goto err;

    self->end = self->here + PyString_Size(self->text);

    self->allow_single_chars = single_char;
    self->index_numbers = index_numbers;
    self->max_len = max_len;
    self->casefolding = casefolding;
    self->index = -1;           /* no word consumed yet */

    return (PyObject *)self;

err:
    Py_DECREF(self);
    return NULL;
}
/* Functions exported by the module: only the splitter factory. */
static struct PyMethodDef Splitter_module_methods[] = {
    { "ISO_8859_1_Splitter", (PyCFunction)get_Splitter,
      METH_VARARGS|METH_KEYWORDS,
      "ISO_8859_1_Splitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen][,casefolding]) -- Return a word splitter"
    },
    { NULL, NULL }              /* sentinel */
};

/* Module docstring passed to Py_InitModule4(). */
static char Splitter_module_documentation[] =
    "Parse source strings into sequences of words\n"
    "\n"
    "for use in an inverted index\n"
    "\n"
    "$Id$\n"
;
/*
 * Module initialisation entry point: prepare the translation tables via
 * initSplitterTrtabs() and register the module functions.
 */
void
initISO_8859_1_Splitter(void)
{
    /* Must run before any splitter is created. */
    initSplitterTrtabs();

    /* Create the module and add the functions.  The module object was
       previously stored in an otherwise unused local. */
    Py_InitModule4("ISO_8859_1_Splitter", Splitter_module_methods,
                   Splitter_module_documentation,
                   (PyObject *)NULL, PYTHON_API_VERSION);
}
src/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/SETUP.cfg
deleted
100644 → 0
View file @
adc7e054
<extension UnicodeSplitter>
source src/UnicodeSplitter.c
</extension>
src/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/__init__.py
deleted
100644 → 0
View file @
adc7e054
from
UnicodeSplitter
import
UnicodeSplitter
as
Splitter
src/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c
deleted
100644 → 0
View file @
adc7e054
/*****************************************************************************
Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/
#include "Python.h"
#ifndef min
#define min(a,b) ((a)<(b)?(a):(b))
#endif
/* Instance layout of a UnicodeSplitter object. */
typedef struct {
    PyObject_HEAD
    PyObject *list;             /* list of words built by splitUnicodeString() */
    PyObject *synstop;          /* optional synonym/stop-word mapping, or NULL */
    int max_len;                /* words are truncated to this many characters */
    int allow_single_chars;     /* if 0, one-character words are dropped */
    int index_numbers;          /* if non-zero, a word may start with a digit */
    int casefolding;            /* if non-zero, text is lower-cased first */
} Splitter;
static
PyUnicodeObject
*
prepareString
(
Splitter
*
self
,
PyUnicodeObject
*
o
);
/*
 * Map `word` through the single-character filter and the synstop mapping.
 *
 * Always returns a BORROWED reference:
 *   Py_None     -- the word must be skipped (single character while those
 *                  are disallowed, or mapped to None in synstop)
 *   other value -- the synstop entry for the word
 *   word itself -- no entry found (or no synstop configured)
 */
static PyObject *
checkSynword(Splitter *self, PyObject *word)
{
    PyObject *value;

    if (PyUnicode_GetSize(word) == 1 && !self->allow_single_chars) {
        /* No Py_INCREF here: callers treat the result as borrowed and
           never release it, so the old increment leaked a reference to
           None for every rejected word. */
        return Py_None;
    }

    if (self->synstop) {
        /* PyDict_GetItem returns a borrowed reference, or NULL without
           setting an exception when the key is missing. */
        value = PyDict_GetItem(self->synstop, word);
        if (value != NULL) {
            return value;
        }
    }
    return word;
}
/* tp_dealloc slot: drop the references held by the instance, then free it. */
static void
Splitter_dealloc(Splitter *self)
{
    PyObject *words = self->list;
    PyObject *mapping = self->synstop;

    Py_XDECREF(words);
    Py_XDECREF(mapping);
    PyObject_DEL(self);
}
/* sq_length slot: number of words produced for the document. */
static int
Splitter_length(Splitter *self)
{
    PyObject *words = self->list;

    return PyList_Size(words);
}
/* sq_concat slot: concatenation is not supported, so always raise TypeError. */
static PyObject *
Splitter_concat(Splitter *self, PyObject *other)
{
    static const char message[] = "Cannot concatenate Splitters.";

    PyErr_SetString(PyExc_TypeError, message);
    return NULL;
}
/*
 * sq_repeat slot: repetition is not supported, so always raise TypeError.
 *
 * The count is declared `int` to match the `intargfunc` type this function
 * is cast to in Splitter_as_sequence; it was declared `long`, so the call
 * went through an incompatible function pointer type.
 */
static PyObject *
Splitter_repeat(Splitter *self, int n)
{
    PyErr_SetString(PyExc_TypeError, "Cannot repeat Splitters.");
    return NULL;
}
/*
 * sq_item slot: return word `i` as a new reference.  On a bad index
 * PyList_GetItem() has already set IndexError and NULL is passed through.
 */
static PyObject *
Splitter_item(Splitter *self, int i)
{
    PyObject *entry = PyList_GetItem(self->list, i);    /* borrowed */

    if (entry == NULL)
        return NULL;

    /* Promote the borrowed reference to an owned one for the caller. */
    Py_INCREF(entry);
    return entry;
}
/* split() -- return the internal word list itself (new reference, not a copy). */
static PyObject *
Splitter_split(Splitter *self)
{
    PyObject *words = self->list;

    Py_INCREF(words);
    return words;
}
/*
 * indexes(word) -- return a new list with every position in the word list
 * whose entry compares equal to `word`.
 *
 * Returns NULL with an exception set if the comparison or list
 * construction fails.
 */
static PyObject *
Splitter_indexes(Splitter *self, PyObject *args)
{
    int i, size, cmp, status;
    PyObject *word = NULL, *item, *r, *index;

    if (!PyArg_ParseTuple(args, "O", &word))
        return NULL;
    if (!(r = PyList_New(0)))
        return NULL;

    size = PyList_Size(self->list);
    for (i = 0; i < size; i++) {
        item = PyList_GET_ITEM(self->list, i);          /* borrowed */

        cmp = PyUnicode_Compare(word, item);
        if (cmp == -1 && PyErr_Occurred())
            goto err;           /* e.g. `word` cannot be coerced to unicode */
        if (cmp != 0)
            continue;

        index = PyInt_FromLong(i);
        if (index == NULL)
            goto err;           /* previously leaked `r` */

        /* PyList_Append takes its own reference, so ours must be dropped
           whether or not the append succeeds (it used to leak). */
        status = PyList_Append(r, index);
        Py_DECREF(index);
        if (status < 0)
            goto err;
    }
    return r;

err:
    Py_DECREF(r);
    return NULL;
}
/* sq_slice slot: slicing is not supported, so always raise TypeError. */
static PyObject *
Splitter_slice(Splitter *self, int i, int j)
{
    static const char message[] = "Cannot slice Splitters.";

    PyErr_SetString(PyExc_TypeError, message);
    return NULL;
}
/* Sequence-protocol slot table installed on SplitterType (tp_as_sequence).
   Length and item access read the prebuilt word list; concat, repeat and
   slice always raise TypeError; assignment slots are unimplemented. */
static PySequenceMethods Splitter_as_sequence = {
    (inquiry)Splitter_length,           /*sq_length*/
    (binaryfunc)Splitter_concat,        /*sq_concat*/
    (intargfunc)Splitter_repeat,        /*sq_repeat*/
    (intargfunc)Splitter_item,          /*sq_item*/
    (intintargfunc)Splitter_slice,      /*sq_slice*/
    (intobjargproc)0,                   /*sq_ass_item*/
    (intintobjargproc)0,                /*sq_ass_slice*/
};
/* Method table looked up by Splitter_getattr().  "split" uses old-style
   argument passing (flags 0); "indexes" takes an argument tuple. */
static struct PyMethodDef Splitter_methods[] = {
    { "split", (PyCFunction)Splitter_split, 0,
      "split() -- Split string in one run"
    },
    { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
      "indexes(word) -- Return a list of the indexes of word in the sequence",
    },
    { NULL, NULL }              /* sentinel */
};
/* tp_getattr slot: resolve `name` against the Splitter_methods table. */
static PyObject *
Splitter_getattr(Splitter *self, char *name)
{
    PyObject *owner = (PyObject *)self;

    return Py_FindMethod(Splitter_methods, owner, name);
}
/* The type has no documentation text of its own. */
static char SplitterType__doc__[] = "";

/* Type object for UnicodeSplitter instances.  Only deallocation, attribute
   lookup (methods via Splitter_getattr) and the sequence protocol are
   provided; every other slot is empty. */
static PyTypeObject SplitterType = {
    PyObject_HEAD_INIT(NULL)
    0,                                  /*ob_size*/
    "Splitter",                         /*tp_name*/
    sizeof(Splitter),                   /*tp_basicsize*/
    0,                                  /*tp_itemsize*/
    /* methods */
    (destructor)Splitter_dealloc,       /*tp_dealloc*/
    (printfunc)0,                       /*tp_print*/
    (getattrfunc)Splitter_getattr,      /*tp_getattr*/
    (setattrfunc)0,                     /*tp_setattr*/
    (cmpfunc)0,                         /*tp_compare*/
    (reprfunc)0,                        /*tp_repr*/
    0,                                  /*tp_as_number*/
    &Splitter_as_sequence,              /*tp_as_sequence*/
    0,                                  /*tp_as_mapping*/
    (hashfunc)0,                        /*tp_hash*/
    (ternaryfunc)0,                     /*tp_call*/
    (reprfunc)0,                        /*tp_str*/

    /* Space for future expansion */
    0L, 0L, 0L, 0L,
    SplitterType__doc__                 /* Documentation string */
};
static
int
splitUnicodeString
(
Splitter
*
self
,
PyUnicodeObject
*
doc
)
{
PyObject
*
word
,
*
synword
;
PyUnicodeObject
*
doc1
;
Py_UNICODE
*
s
;
int
len
=
doc
->
length
;
int
inside_word
=
0
;
int
i
=
0
;
int
start
=
0
;
doc1
=
prepareString
(
self
,
doc
);
if
(
doc1
==
NULL
)
return
-
1
;
s
=
doc1
->
str
;
self
->
list
=
PyList_New
(
0
);
for
(
i
=
0
;
i
<
len
;
s
++
,
i
++
)
{
register
Py_UNICODE
ch
;
ch
=
*
s
;
if
(
!
inside_word
)
{
if
(
self
->
index_numbers
)
{
if
(
Py_UNICODE_ISALNUM
(
ch
))
{
inside_word
=
1
;
start
=
i
;
}
}
else
{
if
(
Py_UNICODE_ISALPHA
(
ch
))
{
inside_word
=
1
;
start
=
i
;
}
}
}
else
{
if
(
!
(
Py_UNICODE_ISALNUM
(
ch
)
||
ch
==
'/'
||
ch
==
'_'
||
ch
==
'-'
))
{
inside_word
=
0
;
word
=
PySequence_GetSlice
((
PyObject
*
)
doc1
,
start
,
min
(
i
,
start
+
self
->
max_len
));
if
(
word
==
NULL
)
goto
err
;
synword
=
checkSynword
(
self
,
word
);
if
(
synword
!=
Py_None
)
{
PyList_Append
(
self
->
list
,
synword
);
}
start
=
0
;
#ifdef DEBUG
PyObject_Print
(
word
,
stdout
,
0
);
fflush
(
stdout
);
#endif
Py_DECREF
(
word
);
}
}
}
if
(
inside_word
)
{
word
=
PySequence_GetSlice
((
PyObject
*
)
doc1
,
start
,
min
(
len
,
start
+
self
->
max_len
));
if
(
word
==
NULL
)
goto
err
;
synword
=
checkSynword
(
self
,
word
);
if
(
synword
!=
Py_None
)
{
PyList_Append
(
self
->
list
,
synword
);
}
Py_DECREF
(
word
);
}
#ifdef DEBUG
PyObject_Print
(
self
->
list
,
stdout
,
0
);
fflush
(
stdout
);
#endif
Py_DECREF
(
doc1
);
return
1
;
err:
Py_DECREF
(
doc1
);
return
-
1
;
}
/* Lower-case the characters of `self` in place, writing only those that
   actually change. */
static void
fixlower(PyUnicodeObject *self)
{
    Py_UNICODE *cursor = self->str;
    Py_UNICODE *stop = cursor + (self->length > 0 ? self->length : 0);

    for (; cursor < stop; cursor++) {
        Py_UNICODE lowered = Py_UNICODE_TOLOWER(*cursor);

        if (lowered != *cursor)
            *cursor = lowered;
    }
}
/*
 * Return a private copy of `o` (new reference), lower-cased in place when
 * self->casefolding is set.  Returns NULL if the copy cannot be created.
 */
static PyUnicodeObject *
prepareString(Splitter *self, PyUnicodeObject *o)
{
    PyUnicodeObject *copy;

    copy = (PyUnicodeObject *)PyUnicode_FromUnicode(o->str, o->length);
    if (copy == NULL)
        return NULL;

    if (self->casefolding)
        fixlower(copy);

    return copy;
}
/* Keyword names accepted by UnicodeSplitter(), in positional order. */
static char *splitter_args[] = {
    "doc", "synstop", "encoding", "indexnumbers",
    "singlechar", "maxlen", "casefolding", NULL
};

/*
 * Module-level factory: build a Splitter and split `doc` immediately.
 *
 * `doc` must be a unicode object, or a byte string that is decoded with
 * `encoding` (default "latin1").  synstop is an optional mapping kept by
 * reference; indexnumbers/singlechar/casefolding must be 0 or 1 and maxlen
 * must lie in 1..128.
 *
 * Raises ValueError for out-of-range options, TypeError for an unsupported
 * `doc` type and UnicodeError when decoding fails.
 */
static PyObject *
newSplitter(PyObject *modinfo, PyObject *args, PyObject *keywds)
{
    Splitter *self = NULL;
    PyObject *doc = NULL, *unicodedoc = NULL, *synstop = NULL;
    char *encoding = "latin1";
    int index_numbers = 0;
    int max_len = 64;
    int single_char = 0;
    int casefolding = 1;

    if (!(PyArg_ParseTupleAndKeywords(args, keywds, "O|Osiiii", splitter_args,
                                      &doc, &synstop, &encoding, &index_numbers,
                                      &single_char, &max_len, &casefolding)))
        return NULL;

#ifdef DEBUG
    puts("got text");
    PyObject_Print(doc, stdout, 0);
    fflush(stdout);
#endif

    if (index_numbers < 0 || index_numbers > 1) {
        PyErr_SetString(PyExc_ValueError, "indexnumbers must be 0 or 1");
        return NULL;
    }

    if (casefolding < 0 || casefolding > 1) {
        PyErr_SetString(PyExc_ValueError, "casefolding must be 0 or 1");
        return NULL;
    }

    if (single_char < 0 || single_char > 1) {
        PyErr_SetString(PyExc_ValueError, "singlechar must be 0 or 1");
        return NULL;
    }

    if (max_len < 1 || max_len > 128) {
        PyErr_SetString(PyExc_ValueError, "maxlen must be between 1 and 128");
        return NULL;
    }

    /* Obtain an owned unicode version of the document. */
    if (PyString_Check(doc)) {
        unicodedoc = PyUnicode_FromEncodedObject(doc, encoding, "strict");
        if (unicodedoc == NULL) {
            PyErr_SetString(PyExc_UnicodeError,
                            "Problem converting encoded string");
            return NULL;
        }
    } else if (PyUnicode_Check(doc)) {
        unicodedoc = doc;
        Py_INCREF(unicodedoc);
    } else {
        PyErr_SetString(PyExc_TypeError,
                        "first argument is neither string nor unicode.");
        return NULL;
    }

    if (!(self = PyObject_NEW(Splitter, &SplitterType))) {
        Py_DECREF(unicodedoc);          /* previously leaked */
        return NULL;
    }

    /* PyObject_NEW does not clear the memory; Splitter_dealloc() releases
       self->list, so it must be defined before any failure path runs. */
    self->list = NULL;

    if (synstop) {
        self->synstop = synstop;
        Py_INCREF(synstop);
    } else
        self->synstop = NULL;

    self->index_numbers = index_numbers;
    self->max_len = max_len;
    self->allow_single_chars = single_char;
    self->casefolding = casefolding;

    if ((splitUnicodeString(self, (PyUnicodeObject *)unicodedoc)) < 0)
        goto err;

    Py_DECREF(unicodedoc);
    return (PyObject *)self;

err:
    Py_DECREF(self);
    Py_DECREF(unicodedoc);

    return NULL;
}
/* Functions exported by the module: only the splitter factory.  The
   signature in the docstring now lists the optional arguments in the order
   given by splitter_args (singlechar comes before maxlen). */
static struct PyMethodDef Splitter_module_methods[] = {
    { "UnicodeSplitter", (PyCFunction)newSplitter,
      METH_VARARGS|METH_KEYWORDS,
      "UnicodeSplitter(doc[,synstop][,encoding='latin1'][,indexnumbers][,singlechar][,maxlen][,casefolding]) "
      "-- Return a word splitter"
    },
    { NULL, NULL }              /* sentinel */
};

/* Module docstring passed to Py_InitModule4(). */
static char Splitter_module_documentation[] =
    "Parse source (unicode) string into sequences of words\n"
    "\n"
    "for use in an inverted index\n"
    "\n"
    "$Id$\n"
;
/*
 * Module initialisation entry point: register the module functions and
 * publish `__version__`, taken from the RCS revision keyword.  Any error
 * during initialisation is fatal.
 */
void
initUnicodeSplitter(void)
{
    PyObject *m, *d, *version;
    char *rev = "$Revision: 1.16 $";

    /* Create the module and add the functions */
    m = Py_InitModule4("UnicodeSplitter", Splitter_module_methods,
                       Splitter_module_documentation,
                       (PyObject *)NULL, PYTHON_API_VERSION);

    if (m != NULL) {
        /* Add some symbolic constants to the module */
        d = PyModule_GetDict(m);

        /* Skip the 11-character "$Revision: " prefix and the trailing " $". */
        version = PyString_FromStringAndSize(rev + 11, strlen(rev + 11) - 2);
        if (version != NULL) {
            /* PyDict_SetItemString does not steal the reference, so drop
               ours afterwards (it used to leak). */
            PyDict_SetItemString(d, "__version__", version);
            Py_DECREF(version);
        }
    }

    if (PyErr_Occurred())
        Py_FatalError("can't initialize module Splitter");
}
src/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/tests/__init__.py
deleted
100644 → 0
View file @
adc7e054
# Nothing to see here.
src/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/tests/testUnicodeSplitter.py
deleted
100644 → 0
View file @
adc7e054
# -*- coding: ISO-8859-1 -*-
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
import
os
,
sys
,
unittest
from
Products.PluginIndexes.TextIndex.Splitter.UnicodeSplitter.UnicodeSplitter
\
import
UnicodeSplitter
class UnicodeSplitterTests(unittest.TestCase):
    """Exercise UnicodeSplitter word splitting and stop-word handling."""

    def setUp(self):
        # NOTE(review): the second sample looks like German text whose
        # non-ASCII letters were lost somewhere; the literals are kept
        # exactly as found -- confirm against the original file.
        texts = (
            'The quick brown fox jumps over the lazy dog',
            'Bei den dreitgigen Angriffen seien auch bis'
            ' auf einen alle Flugpltze der Taliban zerstrt worden',
        )

        # Each entry pairs a raw text with the lower-cased unicode words
        # the splitter is expected to produce for it.
        self.testdata = []

        for t in texts:
            uniLst = [unicode(x, 'latin1') for x in t.lower().split(' ')]
            self.testdata.append((t, uniLst))

    def testSimpleSplit(self):
        """ testing splitter functionality """
        for t, expected in self.testdata:
            fields = list(UnicodeSplitter(t))
            # assertEqual instead of a bare assert: still checked under -O
            # and reported as a failure rather than an error.
            self.assertEqual(fields, expected,
                             "%s vs %s" % (fields, expected))

    def testStopwords(self):
        """ testing splitter with stopwords """
        text = 'The quick brown fox jumps over The lazy dog'
        expected = [u'quick', u'brown', u'fox', u'jumps', u'over',
                    u'lazy', u'cat']

        # 'the' maps to None (stop word), 'dog' maps to the synonym 'cat'.
        sw_dict = {'the': None, 'dog': 'cat'}
        splitter = UnicodeSplitter(text, sw_dict)
        fields = list(splitter)
        self.assertEqual(fields, expected)
        self.assertEqual(splitter.indexes('jumps'), [3])
def test_suite():
    """Return a suite holding every test method of UnicodeSplitterTests."""
    suite = unittest.makeSuite(UnicodeSplitterTests)
    return suite
def debug():
    """Run the suite in debug mode, so errors propagate to the caller."""
    suite = test_suite()
    return suite.debug()
def pdebug():
    """Run debug() under the control of the pdb debugger."""
    import pdb

    statement = 'debug()'
    pdb.run(statement)
def main():
    """Run the suite with the plain text test runner."""
    runner = unittest.TextTestRunner()
    runner.run(test_suite())
if __name__ == '__main__':
    # An optional first command-line argument names the module-level
    # function to run (for example debug or pdebug); without one the
    # suite is run normally.
    if len(sys.argv) <= 1:
        main()
    else:
        entry_point = globals()[sys.argv[1]]
        entry_point()
src/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/SETUP.cfg
deleted
100644 → 0
View file @
adc7e054
<extension ZopeSplitter>
source src/ZopeSplitter.c
</extension>
src/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/__init__.py
deleted
100644 → 0
View file @
adc7e054
from
ZopeSplitter
import
ZopeSplitter
def Splitter(txt, stopwords=None, encoding="latin1"):
    """Return a ZopeSplitter over *txt*.

    *stopwords* is the synonym/stop-word mapping handed to ZopeSplitter;
    when omitted a fresh empty dict is used, instead of one dict object
    shared by every call.  *encoding* is accepted for signature
    compatibility with the other splitters and is not used here.
    """
    if stopwords is None:
        stopwords = {}
    return ZopeSplitter(txt, stopwords)
src/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c
deleted
100644 → 0
View file @
adc7e054
/*****************************************************************************
Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/
#include "Python.h"
#include <ctype.h>
#define ASSIGN(V,E) {PyObject *__e; __e=(E); Py_XDECREF(V); (V)=__e;}
#define UNLESS(E) if(!(E))
#define UNLESS_ASSIGN(V,E) ASSIGN(V,E) UNLESS(V)
/* Instance layout of a ZopeSplitter object. */
typedef struct {
    PyObject_HEAD
    PyObject *text, *synstop;   /* str(doc) being split; optional synonym/stop-word mapping or NULL */
    char *here, *end;           /* current scan position in text's buffer; one past its last byte */
    int index;                  /* index of the last word returned by next_word(); -1 before the first */
    int allow_single_chars;     /* if 0, words shorter than two characters are stop words */
    int index_numbers;          /* if 0, words without any letter are stop words */
    int max_len;                /* words are truncated to this many characters */
    int casefolding;            /* if non-zero, characters are lower-cased while scanning */
} Splitter;
static
PyObject
*
next_word
(
Splitter
*
,
char
**
,
char
**
);
/* Rewind the scanner to the start of the text; no word has been read yet. */
static void
Splitter_reset(Splitter *self)
{
    char *beginning = PyString_AsString(self->text);

    self->here = beginning;
    self->index = -1;
}
/* tp_dealloc slot: drop the references held by the instance, then free it. */
static void
Splitter_dealloc(Splitter *self)
{
    PyObject *document = self->text;
    PyObject *mapping = self->synstop;

    Py_XDECREF(document);
    Py_XDECREF(mapping);
    PyObject_DEL(self);
}
/*
 * sq_length slot: count the words by rescanning the whole text.
 *
 * Returns the number of words, or -1 with an exception set when
 * next_word() fails.  The scanner is left at the end of the text.
 */
static int
Splitter_length(Splitter *self)
{
    PyObject *token;

    Splitter_reset(self);

    for (;;) {
        token = next_word(self, NULL, NULL);
        if (token == NULL)
            return -1;

        if (!PyString_Check(token)) {
            /* A non-string result marks the end of the text. */
            Py_DECREF(token);
            break;
        }
        Py_DECREF(token);
    }

    /* self->index is the zero-based index of the last word read. */
    return self->index + 1;
}
/* sq_concat slot: concatenation is not supported, so always raise TypeError. */
static PyObject *
Splitter_concat(Splitter *self, PyObject *other)
{
    static const char message[] = "Cannot concatenate Splitters.";

    PyErr_SetString(PyExc_TypeError, message);
    return NULL;
}
/*
 * sq_repeat slot: repetition is not supported, so always raise TypeError.
 *
 * The count is declared `int` to match the `intargfunc` type this function
 * is cast to in Splitter_as_sequence; it was declared `long`, so the call
 * went through an incompatible function pointer type.
 */
static PyObject *
Splitter_repeat(Splitter *self, int n)
{
    PyErr_SetString(PyExc_TypeError, "Cannot repeat Splitters.");
    return NULL;
}
/*
  Map an input word to an output word by applying the standard filters
  and the synonym/stop-word mapping.

  Input is a string object (it is read with the unchecked PyString macros,
  so callers must guarantee the type).

  Output is a NEW reference to one of:

     None      -- the word is a stop word
     sometext  -- the word itself or its replacement

  Lookup errors in the mapping (typically KeyError) are cleared and mean
  "no replacement".
*/
static PyObject *
check_synstop(Splitter *self, PyObject *word)
{
    PyObject *value;
    char *cword;
    int len;

    cword = PyString_AS_STRING(word);
    len = PyString_GET_SIZE(word);

    if (len < 2 && !self->allow_single_chars) {
        /* Single-letter words are stop words! */
        Py_INCREF(Py_None);
        return Py_None;
    }

    /* Scan backwards for a letter; len ends up < 0 if there is none. */
    while (--len >= 0) {
        if (isalpha((unsigned char)cword[len]))
            break;
    }

    if (len < 0 && !self->index_numbers) {
        /* No letters at all: treat it as a stop word. */
        Py_INCREF(Py_None);
        return Py_None;
    }

    Py_INCREF(word);

    if (self->synstop == NULL)
        return word;

    /* Follow synonym chains: as long as the mapping yields another string,
       look that one up as well.  `len` is reused as the hop counter. */
    len = 0;
    for (;;) {
        value = PyObject_GetItem(self->synstop, word);

        if (value == NULL) {
            /* Not in the mapping: the current word is the result. */
            PyErr_Clear();
            return word;
        }

        if (!PyString_Check(value)) {
            /* Normally None, marking a stop word.  Release our reference
               to the word first; it used to leak on this path. */
            Py_DECREF(word);
            return value;
        }

        Py_DECREF(word);
        word = value;

        if (len++ > 100)
            return word;        /* Avoid infinite recursion */
    }
}
/*
 * Scan forward from self->here and return the next indexable word.
 *
 * Word characters are alphanumerics, '/' and '_'.  A '-' inside a word,
 * together with any whitespace after it, is skipped so that hyphenated
 * words are joined.  Characters are lower-cased when self->casefolding is
 * set, and at most self->max_len characters of a word are kept.  Each
 * candidate is passed through check_synstop(); stop words are skipped.
 *
 * Returns a new reference: the word (self->index is advanced), or None
 * when the text is exhausted or ends in a stop word.  Returns NULL with an
 * exception set on failure.  When given, *startpos and *endpos receive
 * the position of the word's first character and of the character just
 * past the word.
 */
static PyObject *
next_word(Splitter *self, char **startpos, char **endpos)
{
    char wbuf[256];             /* max_len is limited to 128 by get_Splitter() */
    char *end, *here, *b;
    int i = 0, c;               /* i counts all characters of the current word */
    PyObject *pyword, *res;

    here = self->here;
    end = self->end;
    b = wbuf;

    while (here < end) {
        /* skip hyphens */
        if ((i > 0) && (*here == '-')) {
            here++;
            /* Test the bound before dereferencing (the order was reversed). */
            while ((here < end) && isspace((unsigned char)*here))
                here++;
            continue;
        }

        if (self->casefolding)
            c = tolower((unsigned char)*here);
        else
            c = (unsigned char)*here;

        /* Check to see if this character is part of a word */
        if (isalnum((unsigned char)c) || c == '/' || c == '_') {
            /* Found a word character */
            if (startpos && i == 0)
                *startpos = here;
            if (i++ < self->max_len)
                *b++ = c;       /* characters beyond max_len are dropped */
        } else if (i != 0) {
            /* We've found the end of a word */
            if (i >= self->max_len)
                i = self->max_len;      /* "stem" the long word */

            UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
                self->here = here;
                return NULL;
            }

            UNLESS(res = check_synstop(self, pyword)) {
                self->here = here;
                Py_DECREF(pyword);
                return NULL;
            }

            if (res != Py_None) {
                if (endpos)
                    *endpos = here;
                self->here = here;
                Py_DECREF(pyword);
                self->index++;
                return res;
            }

            /* The word is a stopword, so ignore it */
            Py_DECREF(res);
            Py_DECREF(pyword);
            i = 0;
            b = wbuf;
        }

        here++;
    }

    self->here = here;

    /* We've reached the end of the string */
    if (i >= self->max_len)
        i = self->max_len;      /* "stem" the long word */

    if (i == 0) {
        /* No words */
        self->here = here;
        Py_INCREF(Py_None);
        return Py_None;
    }

    UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) return NULL;

    if (endpos)
        *endpos = here;

    res = check_synstop(self, pyword);
    Py_DECREF(pyword);

    /* Guard against NULL before inspecting the type, as the branch inside
       the loop already does. */
    if (res != NULL && PyString_Check(res))
        self->index++;

    return res;
}
/*
 * sq_item slot: return word number `i` of the document as a new reference.
 *
 * Raises IndexError when `i` is negative or past the last word.  Returns
 * NULL with the exception from next_word() if scanning fails.
 */
static PyObject *
Splitter_item(Splitter *self, int i)
{
    PyObject *word = NULL;

    /* A negative index would skip the loop below and return NULL without
       an exception set, which the interpreter reports as a SystemError. */
    if (i < 0) {
        PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
        return NULL;
    }

    /* The scan only moves forward: rewind when the requested word is at or
       before the current position. */
    if (i <= self->index)
        Splitter_reset(self);

    /* next_word() advances self->index for every word it returns. */
    while (self->index < i) {
        Py_XDECREF(word);

        UNLESS(word = next_word(self, NULL, NULL)) return NULL;

        if (word == Py_None) {
            /* End of text reached before word `i`. */
            Py_DECREF(word);
            PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
            return NULL;
        }
    }

    return word;
}
/*
 * split() -- rescan the whole text and return all words as a new list.
 *
 * Returns NULL with an exception set if scanning or list construction
 * fails.
 */
static PyObject *
Splitter_split(Splitter *self)
{
    PyObject *list = NULL, *word = NULL;

    UNLESS(list = PyList_New(0)) return NULL;

    Splitter_reset(self);

    while (1) {
        UNLESS(word = next_word(self, NULL, NULL)) goto err;

        if (word == Py_None) {
            /* End of text: release the None marker (it used to leak). */
            Py_DECREF(word);
            return list;
        }

        /* PyList_Append takes its own reference; drop ours either way. */
        if (PyList_Append(list, word) < 0) {
            Py_DECREF(word);
            goto err;
        }
        Py_DECREF(word);
    }

err:
    /* The partially built list was leaked on failure before. */
    Py_DECREF(list);
    return NULL;
}
/* sq_slice slot: slicing is not supported, so always raise TypeError. */
static PyObject *
Splitter_slice(Splitter *self, int i, int j)
{
    static const char message[] = "Cannot slice Splitters.";

    PyErr_SetString(PyExc_TypeError, message);
    return NULL;
}
/* Sequence-protocol slot table installed on SplitterType (tp_as_sequence).
   Length and item access rescan the text on demand; concat, repeat and
   slice always raise TypeError; assignment slots are unimplemented. */
static PySequenceMethods Splitter_as_sequence = {
    (inquiry)Splitter_length,           /*sq_length*/
    (binaryfunc)Splitter_concat,        /*sq_concat*/
    (intargfunc)Splitter_repeat,        /*sq_repeat*/
    (intargfunc)Splitter_item,          /*sq_item*/
    (intintargfunc)Splitter_slice,      /*sq_slice*/
    (intobjargproc)0,                   /*sq_ass_item*/
    (intintobjargproc)0,                /*sq_ass_slice*/
};
/*
 * pos(index) -- return a (start, end) tuple of byte offsets into the text
 * for word number `index`.
 *
 * Registered with old-style argument passing, hence PyArg_Parse().
 * Raises IndexError for a negative index or when the text is exhausted.
 */
static PyObject *
Splitter_pos(Splitter *self, PyObject *args)
{
    char *start = NULL, *end = NULL, *ctext;
    PyObject *res;
    int i;

    UNLESS(PyArg_Parse(args, "i", &i)) return NULL;

    /* With a negative index the loop below never runs and `start`/`end`
       would be read uninitialised; reject it explicitly. */
    if (i < 0) {
        PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
        return NULL;
    }

    if (i <= self->index)
        Splitter_reset(self);

    while (self->index < i) {
        UNLESS(res = next_word(self, &start, &end)) return NULL;

        if (!PyString_Check(res)) {
            /* Anything that is not a string marks the end of the text. */
            Py_DECREF(res);
            PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
            return NULL;
        }

        /* next_word() has already advanced self->index for this word.  The
           extra increment that used to be here counted every word twice,
           so pos(i) reported the position of the wrong word. */
        Py_DECREF(res);
    }

    ctext = PyString_AsString(self->text);

    /* "i" expects an int; a raw pointer difference is ptrdiff_t. */
    return Py_BuildValue("(ii)", (int)(start - ctext), (int)(end - ctext));
}
/*
 * indexes(word) -- return a new list with the positions at which `word`
 * (after synonym/stop-word mapping) occurs in the word sequence.
 *
 * Raises TypeError when `word` is not a string.  Returns NULL with an
 * exception set on failure.
 */
static PyObject *
Splitter_indexes(Splitter *self, PyObject *args)
{
    PyObject *word, *target = 0, *r, *w = 0, *index = 0;
    int i = 0;

    UNLESS(PyArg_ParseTuple(args, "O", &word)) return NULL;

    /* check_synstop() reads its argument with the unchecked PyString
       macros, so anything but a string must be rejected here. */
    UNLESS(PyString_Check(word)) {
        PyErr_SetString(PyExc_TypeError,
                        "indexes() argument must be a string");
        return NULL;
    }

    UNLESS(r = PyList_New(0)) return NULL;

    /* check_synstop() returns a new reference; keep it in its own variable
       so it can be released (it used to leak). */
    UNLESS(target = check_synstop(self, word)) goto err;

    /* Always rescan from the beginning of the text. */
    Splitter_reset(self);

    while (1) {
        /* UNLESS_ASSIGN releases the previous `w` before storing the new one. */
        UNLESS_ASSIGN(w, next_word(self, NULL, NULL)) goto err;

        /* A non-string result marks the end of the text. */
        UNLESS(PyString_Check(w)) break;

        if (PyObject_Compare(target, w) == 0) {
            UNLESS_ASSIGN(index, PyInt_FromLong(i)) goto err;
            if (PyList_Append(r, index) < 0) goto err;
        }
        i++;
    }

    Py_XDECREF(w);
    Py_XDECREF(index);
    Py_DECREF(target);
    return r;

err:
    Py_DECREF(r);
    Py_XDECREF(w);      /* previously leaked when an append/alloc failed */
    Py_XDECREF(index);
    Py_XDECREF(target);
    return NULL;
}
/* Method table looked up by Splitter_getattr().  "split" and "pos" use
   old-style argument passing (flags 0); "indexes" takes an argument tuple. */
static struct PyMethodDef Splitter_methods[] = {
    { "split", (PyCFunction)Splitter_split, 0,
      "split() -- Split complete string in one run"
    },
    { "pos", (PyCFunction)Splitter_pos, 0,
      "pos(index) -- Return the starting and ending position of a token"
    },
    { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
      "indexes(word) -- Return a list of the indexes of word in the sequence",
    },
    { NULL, NULL }              /* sentinel */
};
/* tp_getattr slot: resolve `name` against the Splitter_methods table. */
static PyObject *
Splitter_getattr(Splitter *self, char *name)
{
    PyObject *owner = (PyObject *)self;

    return Py_FindMethod(Splitter_methods, owner, name);
}
/* The type has no documentation text of its own. */
static char SplitterType__doc__[] = "";

/* Type object for ZopeSplitter instances.  Only deallocation, attribute
   lookup (methods via Splitter_getattr) and the sequence protocol are
   provided; every other slot is empty. */
static PyTypeObject SplitterType = {
    PyObject_HEAD_INIT(NULL)
    0,                                  /*ob_size*/
    "Splitter",                         /*tp_name*/
    sizeof(Splitter),                   /*tp_basicsize*/
    0,                                  /*tp_itemsize*/
    /* methods */
    (destructor)Splitter_dealloc,       /*tp_dealloc*/
    (printfunc)0,                       /*tp_print*/
    (getattrfunc)Splitter_getattr,      /*tp_getattr*/
    (setattrfunc)0,                     /*tp_setattr*/
    (cmpfunc)0,                         /*tp_compare*/
    (reprfunc)0,                        /*tp_repr*/
    0,                                  /*tp_as_number*/
    &Splitter_as_sequence,              /*tp_as_sequence*/
    0,                                  /*tp_as_mapping*/
    (hashfunc)0,                        /*tp_hash*/
    (ternaryfunc)0,                     /*tp_call*/
    (reprfunc)0,                        /*tp_str*/

    /* Space for future expansion */
    0L, 0L, 0L, 0L,
    SplitterType__doc__                 /* Documentation string */
};
/* Keyword names accepted by ZopeSplitter(), in positional order. */
static char *splitter_args[] = {
    "doc", "synstop", "encoding", "singlechar",
    "indexnumbers", "maxlen", "casefolding", NULL
};

/*
 * Module-level factory: build a Splitter over str(doc).
 *
 * Optional arguments: synstop (mapping, kept by reference), encoding
 * (parsed but not used here), singlechar/indexnumbers/casefolding (0 or 1)
 * and maxlen (1..128).  Raises ValueError for out-of-range options.
 */
static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args, PyObject *keywds)
{
    Splitter *self;
    PyObject *doc, *synstop = NULL;
    char *encoding = "latin1";
    int single_char = 0;
    int index_numbers = 0;
    int max_len = 64;
    int casefolding = 1;
    const char *complaint = NULL;

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|Osiiii", splitter_args,
                                     &doc, &synstop, &encoding, &single_char,
                                     &index_numbers, &max_len, &casefolding))
        return NULL;

    /* Report the first option that is out of range. */
    if (index_numbers < 0 || index_numbers > 1)
        complaint = "indexnumbers must be 0 or 1";
    else if (casefolding < 0 || casefolding > 1)
        complaint = "casefolding must be 0 or 1";
    else if (single_char < 0 || single_char > 1)
        complaint = "singlechar must be 0 or 1";
    else if (max_len < 1 || max_len > 128)
        complaint = "maxlen must be between 1 and 128";

    if (complaint != NULL) {
        PyErr_SetString(PyExc_ValueError, complaint);
        return NULL;
    }

    self = PyObject_NEW(Splitter, &SplitterType);
    if (self == NULL)
        return NULL;

    /* Keep our own reference to the optional synonym/stop-word mapping. */
    Py_XINCREF(synstop);
    self->synstop = synstop;

    self->text = PyObject_Str(doc);
    if (self->text == NULL)
        goto err;

    self->here = PyString_AS_STRING(self->text);
    if (self->here == NULL)
        goto err;

    self->end = self->here + PyString_GET_SIZE(self->text);
    self->index = -1;           /* no word consumed yet */

    self->allow_single_chars = single_char;
    self->index_numbers = index_numbers;
    self->max_len = max_len;
    self->casefolding = casefolding;

    return (PyObject *)self;

err:
    /* Splitter_dealloc() releases whatever has been stored so far. */
    Py_DECREF(self);
    return NULL;
}
/* Functions exported by the module: only the splitter factory. */
static struct PyMethodDef Splitter_module_methods[] = {
    { "ZopeSplitter", (PyCFunction)get_Splitter,
      METH_VARARGS|METH_KEYWORDS,
      "ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen][,casefolding]) -- Return a word splitter"
    },
    { NULL, NULL }              /* sentinel */
};

/* Module docstring passed to Py_InitModule4(). */
static char Splitter_module_documentation[] =
    "Parse source strings into sequences of words\n"
    "\n"
    "for use in an inverted index\n"
    "\n"
    "$Id$\n"
;
/* Module initialisation entry point: register the module functions and
   the module docstring under the name "ZopeSplitter". */
void
initZopeSplitter(void)
{
    Py_InitModule4(
        "ZopeSplitter",
        Splitter_module_methods,
        Splitter_module_documentation,
        NULL,
        PYTHON_API_VERSION
    );
}
src/Products/PluginIndexes/TextIndex/Splitter/__init__.py
deleted
100644 → 0
View file @
adc7e054
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
#############################################################################
import
os
,
sys
,
exceptions
# (module name, human-readable description) for every bundled splitter.
# The first entry is the default used by getSplitter().
availableSplitters = (
    ("ZopeSplitter", "Zope Default Splitter"),
    ("ISO_8859_1_Splitter", "Werner Strobls ISO-8859-1 Splitter"),
    ("UnicodeSplitter", "Unicode-aware splitter"),
)

# A real list rather than map(): getSplitter() indexes into it, which a
# lazy map object would not support.
splitterNames = [name for name, _description in availableSplitters]
def getSplitter(name=None):
    """Return the splitter factory called *name*.

    *name* must be one of ``splitterNames``; when it is empty or None the
    first registered splitter is used.  The factory is imported on demand
    from the module ``<name>.<name>``.

    Raises RuntimeError for an unknown splitter name.
    """
    if name and name not in splitterNames:
        raise RuntimeError("No such splitter '%s'" % name)

    if not name:
        name = splitterNames[0]

    # Same import as the former exec("from X.X import X"), without building
    # and executing source text.  A non-empty fromlist makes __import__
    # return the submodule itself rather than the top-level package.
    module = __import__("%s.%s" % (name, name), globals(), locals(), [name])
    return getattr(module, name)
src/Products/PluginIndexes/TextIndex/Splitter/setup.py
deleted
100644 → 0
View file @
adc7e054
#!/usr/bin/env python
from
distutils.core
import
setup
,
Extension
import
os
,
exceptions
,
commands
,
sys
# Extra build flags; all empty by default.
CFLAGS = []
LFLAGS = []
LIBS = []

# One C extension module per splitter implementation.
setup(
    name="Splitter",
    version="1.0",
    description="Splitters for Zope 2.5",
    author="Andreas Jung",
    author_email="andreas@zope.com",
    url="http://www.zope.org/...",
    ext_modules=[
        Extension("ZopeSplitter",
                  ['ZopeSplitter/src/ZopeSplitter.c']),
        Extension("ISO_8859_1_Splitter",
                  ['ISO_8859_1_Splitter/src/ISO_8859_1_Splitter.c']),
        Extension("UnicodeSplitter",
                  ['UnicodeSplitter/src/UnicodeSplitter.c']),
    ],
)
src/Products/PluginIndexes/TextIndex/TextIndex.py
deleted
100644 → 0
View file @
adc7e054
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Deprecated text index. Please use ZCTextIndex instead.
$Id$
"""
from
cgi
import
escape
from
logging
import
getLogger
import
operator
import
re
import
warnings
from
types
import
*
from
Acquisition
import
Implicit
from
App.special_dtml
import
DTMLFile
from
OFS.SimpleItem
import
SimpleItem
from
BTrees.IIBTree
import
difference
from
BTrees.IIBTree
import
IIBTree
from
BTrees.IIBTree
import
IIBucket
from
BTrees.IIBTree
import
IISet
from
BTrees.IIBTree
import
weightedIntersection
from
BTrees.IOBTree
import
IOBTree
from
BTrees.OIBTree
import
OIBTree
from
Persistence
import
Persistent
from
zope.interface
import
implements
from
Products.PluginIndexes.common
import
safe_callable
from
Products.PluginIndexes.common.ResultList
import
ResultList
from
Products.PluginIndexes.common.util
import
parseIndexRequest
from
Products.PluginIndexes.interfaces
import
IPluggableIndex
from
Products.PluginIndexes.interfaces
import
ITextIndex
from
Products.PluginIndexes.TextIndex.Lexicon
import
Lexicon
LOG
=
getLogger
(
'TextIndex'
)
class Op:
    """Marker object for a query operator; it prints as its name."""

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return self.name

    def __str__(self):
        # Same text as repr().
        return self.name
# One marker object per supported query operator.
AndNot = Op('andnot')
And = Op('and')
Or = Op('or')
Near = Op('...')

# String used as the query error identifier.
QueryError = 'TextIndex.QueryError'

# Lookup table from operator spelling to marker.  'near' is an alias of
# '...', and every marker also maps to itself so values that are already
# operators pass through unchanged.
operator_dict = {
    'andnot': AndNot,
    'and': And,
    'or': Or,
    '...': Near,
    'near': Near,
}
for _op in (AndNot, And, Or, Near):
    operator_dict[_op] = _op
del _op
class TextIndex(Persistent, Implicit, SimpleItem):
    """Full-text index.

    There is a ZCatalog UML model that sheds some light on what is
    going on here.  '_index' is a BTree which maps word ids to mapping
    from document id to score.  Something like:

      {'bob' : {1 : 5, 2 : 3, 42 : 9}}
      {'uncle' : {1 : 1}}

    The '_unindex' attribute is a mapping from document id to word
    ids.  This mapping allows the catalog to unindex an object:

      {42 : ('bob', 'is', 'your', 'uncle')}

    This isn't exactly how things are represented in memory, many
    optimizations happen along the way.
    """

    implements(ITextIndex, IPluggableIndex)

    meta_type = 'TextIndex'

    # Keys looked up in a query record by _apply_index().
    query_options = ('query', 'operator')

    # Management tabs of the index.
    manage_options = (
        {'label': 'Settings',
         'action': 'manage_main',
         'help': ('TextIndex', 'TextIndex_Settings.stx')},
    )
def __init__(self, id, ignore_ex=None, call_methods=None, lexicon=None,
             caller=None, extra=None):
    """Create an index.

    The arguments are:

      'id' -- the name of the item attribute to index.  This is
      either an attribute name or a record key.

      'ignore_ex' -- Tells the indexer to ignore exceptions that
      are raised when indexing an object.

      'call_methods' -- Tells the indexer to call methods instead
      of getattr or getitem to get an attribute.

      'lexicon' -- the lexicon object to use; if None, the lexicon is
      looked up (or created) lazily by getLexicon().

      'caller' -- instance that created the index (maybe None)

      'extra' -- Record to keep additional parameters; only its
      'vocabulary' attribute is read here.
    """
    self.id = id
    self.ignore_ex = ignore_ex
    self.call_methods = call_methods
    self.catalog = caller

    # Default text index operator (should be visible to ZMI)
    self.useOperator = 'or'

    vocabulary_id = "Vocabulary"
    if extra:
        vocabulary_id = extra.vocabulary
    self.vocabulary_id = vocabulary_id

    self._lexicon = None
    self.clear()

    if lexicon is None:
        return

    # We need to hold a reference to the lexicon, since we can't
    # really change lexicons.
    self._lexicon = lexicon
    self.vocabulary_id = '__userdefined__'
def getId(self):
    # Accessor for the index id given to the constructor.
    # NOTE(review): kept without a docstring on purpose -- Zope 2 publishes
    # only objects that have one; confirm before adding a docstring here.
    index_id = self.id
    return index_id
def getLexicon(self, vocab_id=None):
    """Get the Lexicon in use.

    The lexicon is resolved on first use and then cached in '_lexicon':
    the attribute named by 'vocabulary_id' is fetched from the catalog
    (the acquisition parent when no catalog was recorded) and asked for
    its lexicon.  If that fails for any reason a private Lexicon is
    created and 'vocabulary_id' is set to '__intern__'.

    'vocab_id' is accepted for compatibility but not used.
    """
    if self._lexicon is None:
        ## if no lexicon is provided, create a default one
        try:
            if self.catalog is None:
                self.catalog = self.aq_inner.aq_parent.aq_base
            vocabulary = getattr(self.catalog, self.vocabulary_id)
            self._lexicon = vocabulary.getLexicon()
        except Exception:
            # Best effort on purpose, but unlike the former bare
            # 'except:' this no longer swallows KeyboardInterrupt and
            # SystemExit.
            self._lexicon = Lexicon()
            self.vocabulary_id = '__intern__'
    return self._lexicon
def __nonzero__(self):
    # The index is true as soon as at least one document is recorded in
    # the reverse index.
    return bool(self._unindex)
def clear(self):
    """Reinitialize the text index."""
    self._index = IOBTree()
    self._unindex = IOBTree()

    # getLexicon() may itself change 'vocabulary_id', so the id is only
    # compared after the lexicon has been resolved.
    lexicon = self.getLexicon()
    if lexicon and self.vocabulary_id == '__userdefined__':
        lexicon.clear()

    # Drop the cached reference in every case.
    self._lexicon = None
def _convertBTrees(self, threshold=200):
    # Migrate '_index' and '_unindex' to IOBTrees using BTrees.convert.
    # Nothing is converted when '_index' already is an IOBTree.
    # 'threshold' is passed through to convert() unchanged.
    if type(self._lexicon) is type(''):
        # Turn the name reference into a hard reference.
        # NOTE(review): getLexicon() returns '_lexicon' as-is whenever it
        # is not None, so this looks like a no-op -- confirm.
        self._lexicon = self.getLexicon()

    if type(self._index) is IOBTree:
        return

    from BTrees.convert import convert

    old_index = self._index
    self._index = IOBTree()

    def convertScores(scores):
        # Single-entry tuples and rows that already are IIBTrees are kept.
        # The old test compared against an IIBTree *instance*, which was
        # always true, so existing IIBTrees were copied for nothing.
        if type(scores) is not tuple and type(scores) is not IIBTree:
            scores = IIBTree(scores)
        return scores

    convert(old_index, self._index, threshold, convertScores)

    old_unindex = self._unindex
    self._unindex = IOBTree()
    convert(old_unindex, self._unindex, threshold)
def histogram(self, type=type):
    """Return a mapping which provides a histogram of the number of
    elements found at each point in the index.

    The result maps a row size to the number of words whose row has that
    size; a row stored as a tuple counts as size 1.
    """
    counts = IIBucket()
    for row in self._index.values():
        if type(row) is tuple:
            size = 1
        else:
            size = len(row)
        counts[size] = counts.get(size, 0) + 1
    return counts
def getEntryForObject(self, rid, default=None):
    """Get all information contained for a specific object.

    This takes the object's record ID as its main argument and returns
    a tuple of the words recorded for it, or 'default' when the record
    is not indexed.
    """
    wids = self._unindex.get(rid, None)
    if wids is None:
        return default
    get_word = self.getLexicon().getWord
    return tuple([get_word(wid) for wid in wids])
def insertForwardIndexEntry(self, entry, documentId, score=1):
    """Uses the information provided to update the indexes.

    The basic logic for choice of data structure is based on
    the number of entries as follows:

        1      tuple
        2-3    dictionary
        4+     bucket.
    """
    index = self._index
    row = index.get(entry, None)

    if row is None:
        # We don't have any information at this point, so we'll
        # put our first entry in, and use a tuple to save space
        index[entry] = (documentId, score)
        return

    if type(row) is tuple:
        # Tuples hold exactly one (document id, score) pair.
        known_id, known_score = row[0], row[1]
        if known_id != documentId:
            # A second document: promote the row to a dictionary.
            index[entry] = {known_id: known_score, documentId: score}
        elif known_score != score:
            index[entry] = (documentId, score)
        return

    if row.get(documentId, -1) == score:
        # Nothing changed.
        return

    # score changed (or new entry)
    row[documentId] = score
    if type(row) is dict:
        if len(row) > 3:
            # Big enough to give it its own database record
            row = IIBTree(row)
        index[entry] = row
def index_object(self, documentId, obj, threshold=None):
    """ Index an object:

    'documentId' is the integer id of the document

    'obj' is the object to be indexed

    'threshold' is the number of words to process between
    committing subtransactions.  If 'None' subtransactions are
    disabled.  (The argument is not referenced in this method.)

    Returns the number of distinct word ids recorded for the document,
    or 0 when the source attribute cannot be read. """

    # sniff the object for our 'id', the 'document source' of the
    # index is this attribute.  If it smells callable, call it.
    try:
        source = getattr(obj, self.id)
        if safe_callable(source):
            source = source()
        if not isinstance(source, unicode):
            source = str(source)
    except (AttributeError, TypeError):
        return 0

    # sniff the object for 'id'+'_encoding'
    try:
        encoding = getattr(obj, self.id + '_encoding')
        if safe_callable(encoding):
            encoding = str(encoding())
        else:
            encoding = str(encoding)
    except (AttributeError, TypeError):
        # no encoding hint on the object: fall back to latin1
        encoding = 'latin1'

    lexicon = self.getLexicon()
    splitter = lexicon.Splitter

    wordScores = OIBTree()
    last = None

    # Run through the words and score them
    for word in list(splitter(source, encoding=encoding)):
        if word[0] == '\"':
            # quoted entry: its inner words are scored by _subindex()
            last = self._subindex(word[1:-1], wordScores, last, splitter)
        else:
            # a word directly repeating the previous one is counted once
            if word == last:
                continue
            last = word
            wordScores[word] = wordScores.get(word, 0) + 1

    # Convert scores to use wids:
    widScores = IIBucket()
    getWid = lexicon.getWordId
    for word, score in wordScores.items():
        widScores[getWid(word)] = score

    del wordScores

    currentWids = IISet(self._unindex.get(documentId, []))

    # Get rid of document words that are no longer indexed
    self.unindex_objectWids(documentId, difference(currentWids, widScores))

    # Now index the words. Note that the new xIBTrees are clever
    # enough to do nothing when there isn't a change. Woo hoo.
    insert = self.insertForwardIndexEntry
    for wid, score in widScores.items():
        insert(wid, documentId, score)

    # Save the unindexing info if it's changed:
    wids = widScores.keys()
    if wids != currentWids.keys():
        self._unindex[documentId] = wids

    return len(wids)
def _subindex(self, source, wordScores, last, splitter):
    """Recursively handle multi-word synonyms.

    Splits 'source', adds one to the score of every word in 'wordScores'
    (a word directly repeating the previous one is skipped) and returns
    the last word seen so the caller can continue the same rule.
    """
    for token in splitter(source):
        if token[0] == '"':
            # Quoted token: score the words inside the quotes.
            last = self._subindex(token[1:-1], wordScores, last, splitter)
            continue
        if token == last:
            continue
        last = token
        wordScores[token] = wordScores.get(token, 0) + 1
    return last
def unindex_object(self, i):
    """ carefully unindex document with integer id 'i' from the text
    index and do not fail if it does not exist """
    reverse = self._unindex
    wids = reverse.get(i, None)
    if wids is None:
        # Unknown document: nothing to do.
        return
    self.unindex_objectWids(i, wids)
    del reverse[i]
def unindex_objectWids(self, i, wids):
    """ carefully remove document 'i' from the rows of the given word
    ids and do not fail if it does not exist

    Rows are shrunk the same way they grow: an empty row is removed and
    a dictionary left with a single entry is turned back into a tuple.
    Inconsistencies are logged, never raised. """
    index = self._index
    for wid in wids:
        row = index.get(wid, None)
        if row is None:
            LOG.error('unindex_object tried to unindex nonexistent'
                      ' document, wid %s, %s' % (i, wid))
            continue

        if type(row) is tuple:
            # A tuple row holds a single (document id, score) pair.  Only
            # drop it when it really belongs to this document; the row
            # used to be deleted unconditionally, which removed another
            # document's entry when the two indexes disagreed.
            if row and row[0] == i:
                del index[wid]
            else:
                LOG.error('unindex_object tried to unindex nonexistent'
                          ' document %s' % str(i))
            continue

        try:
            del row[i]
        except (KeyError, IndexError, TypeError):
            LOG.error('unindex_object tried to unindex nonexistent'
                      ' document %s' % str(i))
            continue

        if not row:
            del index[wid]
        elif type(row) is dict:
            if len(row) == 1:
                # convert to tuple
                row = list(row.items())[0]
            # plain dicts must be stored again to be persisted
            index[wid] = row
def __getitem__(self, word):
    """Return an InvertedIndex-style result "list"

    Note that this differentiates between being passed an Integer
    and a String.  Strings are looked up in the lexicon, whereas
    Integers are assumed to be resolved word ids. """
    if type(word) is int:
        # We have a word ID
        return ResultList(self._index.get(word, {}), (word,), self)

    terms = tuple(self.getLexicon().Splitter(word))
    if not terms:
        return ResultList({}, (word,), self)

    if len(terms) > 1:
        # Several words: look each one up and chain the results with
        # the proximity operation.
        combined = None
        for term in terms:
            partial = self[term]
            if combined is None:
                combined = partial
            else:
                combined = combined.near(partial)
        return combined

    term = terms[0]
    if term[:1] == '"' and term[-1:] == '"':
        return self[term]

    wids = self.getLexicon().get(term)
    row = None
    if wids:
        row = self._index.get(wids[0], None)
    if row is None:
        row = {}
    return ResultList(row, (term,), self)
def _apply_index(self, request):
    """ Apply the index to query parameters given in the argument,
    request

    The argument should be a mapping object.

    If the request does not contain the needed parameters, then
    None is returned.

    Otherwise two objects are returned.  The first object is a
    ResultSet containing the record numbers of the matching
    records.  The second object is a tuple containing the names of
    all data fields used.

    Raises RuntimeError when the requested operator is unknown.
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    # Changed for 2.4
    # We use the default operator that can be managed via the ZMI
    qop = record.get('operator', self.useOperator)

    # We keep this for pre-2.4 compatibility
    # This stinking code should go away somewhere. A global
    # textindex_operator makes no sense when using multiple
    # text indexes inside a catalog. An index operator should
    # be specified on a per-index base
    if request.has_key('textindex_operator'):
        qop = request['textindex_operator']
        warnings.warn("The usage of the 'textindex_operator' "
                      "is no longer recommended.\n"
                      "Please use a mapping object and the "
                      "'operator' key to specify the operator.")

    query_operator = operator_dict.get(qop)
    if query_operator is None:
        # This used to read 'exceptions.RuntimeError', but no module
        # named 'exceptions' is imported here, so an invalid operator
        # ended in a NameError instead of the intended error.
        raise RuntimeError("Invalid operator '%s' "
                           "for a TextIndex" % escape(qop))

    r = None
    for key in record.keys:
        key = key.strip()
        if not key:
            continue
        b = self.query(key, query_operator).bucket()
        # combine the hits of this key with the result so far
        w, r = weightedIntersection(r, b)

    if r is not None:
        return r, (self.id,)
    return (IIBucket(), (self.id,))
def positions(self, docid, words):
    """Return the positions in the document for the given document
    id of the word, word.

    Position lookup is not implemented: the constant [1] is returned
    whatever the arguments are.  The code that used to follow the
    return statement could never run (and referred to an 'obj' argument
    that does not exist), so it has been removed.
    """
    return [1]
def query(self, s, default_operator=Or):
    """ Evaluate a query string.

    Convert the query string into a data structure of nested lists
    and strings, based on the grouping of whitespace-separated
    strings by parentheses and quotes.  The 'Near' operator is
    inserted between the strings of a quoted group.

    The Lexicon is given the opportunity to transform the
    data structure.  Stemming, wildcards, and translation are
    possible Lexicon services.

    Finally, the query list is normalized so that it and every
    sub-list consist of non-operator strings or lists separated
    by operators. This list is evaluated.
    """
    # " and not " is a single operator: fold it into " andnot " first
    normalized = re.sub(r'(?i)\s+and\s*not\s+', ' andnot ', s)

    # parentheses and quotes become nested lists
    tree = parse(normalized)

    # the Lexicon may rewrite the query
    tree = self.getLexicon().query_hook(tree)

    # make sure every pair of search terms is joined by an operator
    tree = parse2(tree, default_operator)

    return self.evaluate(tree)
def get_operands(self, q, i):
    """Evaluate and return the left and right operands for an operator.

    'q' is the query list and 'i' the position of the operator in it.
    Integers and strings are looked up in the index, nested lists are
    evaluated, anything else is passed through unchanged.

    Raises QueryError when the operator has no operand on either side.
    """
    # Check the bounds explicitly: q[i - 1] with i == 0 would silently
    # pick the *last* element instead of failing.
    if i < 1 or i + 1 >= len(q):
        raise QueryError("Malformed query")

    def resolve(operand):
        kind = type(operand)
        if kind is list:
            return self.evaluate(operand)
        if (kind is int or isinstance(operand, str)
                or isinstance(operand, unicode)):
            return self[operand]
        return operand

    left = resolve(q[i - 1])
    right = resolve(q[i + 1])
    return (left, right)
def evaluate(self, query):
    """Evaluate a parsed query.

    'query' is a string, a word id or a (nested) list of operands
    separated by operator markers.  Operators are reduced in the order
    AndNot -> And -> Or -> Near; the list is modified in place.

    Raises QueryError when the list does not reduce to a single result.
    """
    # Strip off meaningless layers
    while isinstance(query, list) and len(query) == 1:
        query = query[0]

    # If it's not a list, assume a string or number
    if not isinstance(query, list):
        return self[query]

    # One pass per operator, in precedence order.  This replaces four
    # copies of the same loop.
    reducers = (
        (AndNot, lambda left, right: left.and_not(right)),
        (And, lambda left, right: left & right),
        (Or, lambda left, right: left | right),
        (Near, lambda left, right: left.near(right)),
    )
    for marker, combine in reducers:
        i = 0
        while i < len(query):
            if query[i] is marker:
                left, right = self.get_operands(query, i)
                # 'left OP right' collapses into its value; 'i' then
                # already points at the element after that value.
                query[i - 1:i + 2] = [combine(left, right)]
            else:
                i = i + 1

    if len(query) != 1:
        raise QueryError("Malformed query")
    return query[0]
def getIndexSourceNames(self):
    """ return name of indexed attributes """
    # A text index reads a single attribute: the one named by its id.
    source_name = self.id
    return (source_name,)
def numObjects(self):
    """ return the number of entries in the forward index '_index' """
    forward_index = self._index
    return len(forward_index)
def manage_setPreferences(self, vocabulary, REQUEST=None, RESPONSE=None,
                          URL2=None):
    """ preferences of TextIndex """
    vocabulary_changed = self.vocabulary_id != vocabulary
    if vocabulary_changed:
        # The index is emptied before the new vocabulary id is stored.
        self.clear()
        self.vocabulary_id = vocabulary

    if not RESPONSE:
        return
    target = URL2 + '/manage_main?manage_tabs_message=Preferences%20saved'
    RESPONSE.redirect(target)
# Management pages.  'manage' and 'manage_main' are bound to the same
# DTMLFile object, which is then given the name 'manage_main'.
manage = manage_main = DTMLFile("dtml/manageTextIndex", globals())
manage_main._setName('manage_main')
manage_vocabulary = DTMLFile("dtml/manageVocabulary", globals())
def parse(s):
    """Parse parentheses and quotes.

    The lower-cased string is turned into a list of words; every
    parenthesized group becomes a nested list and quoted regions are
    handled by quotes().
    """
    tokens = []
    rest = s.lower()
    while True:
        group = parens(rest)
        if group is None:
            break
        before, inside, rest = group
        # quotes are only looked for outside the parentheses here; the
        # text inside them is parsed recursively
        tokens.extend(quotes(before))
        tokens.append(parse(inside))
    return tokens + quotes(rest)
def parse2(q, default_operator, operator_dict=operator_dict):
    """Find operators and operands.

    Normalizes the nested list 'q' in place so that every odd position
    holds an operator marker: operator spellings found there are replaced
    by their marker from 'operator_dict', and 'default_operator' is
    inserted wherever two operands follow each other.  Returns 'q'.
    """
    i = 0
    while i < len(q):
        e = q[i]
        if isinstance(e, list):
            # Hand 'operator_dict' down as well; nested groups used to
            # fall back to the module default whatever the caller passed.
            q[i] = parse2(e, default_operator, operator_dict)
            if i % 2:
                q.insert(i, default_operator)
                i = i + 1
        elif i % 2:
            # This element should be an operator
            if e in operator_dict:
                # Ensure that it is identical, not merely equal.
                q[i] = operator_dict[e]
            else:
                # Insert the default operator.
                q.insert(i, default_operator)
                i = i + 1
        i = i + 1
    return q
def parens(s, parens_re=re.compile('[()]').search):
    """Split 's' around its first balanced parenthesized group.

    Returns None when 's' contains no parenthesis, otherwise the tuple
    (text before the group, text inside the outer parentheses, text after
    the group).

    Raises QueryError when the parentheses do not match.
    """
    mo = parens_re(s)
    if mo is None:
        return None

    open_index = mo.start(0) + 1
    depth = 0
    while mo is not None:
        index = mo.start(0)
        if s[index] == '(':
            depth = depth + 1
        else:
            depth = depth - 1
            if depth == 0:
                return (s[:open_index - 1], s[open_index:index],
                        s[index + 1:])
            if depth < 0:
                # a closing parenthesis before any opening one
                break
        mo = parens_re(s, index + 1)

    raise QueryError("Mismatched parentheses")
def quotes(s):
    """Split 's' into words, grouping quoted regions.

    Unquoted text is split on whitespace.  Every quoted region becomes a
    nested list of its words with the Near operator between them.  Empty
    pieces are dropped.  The result is always a list.

    Raises QueryError when the number of double quotes is odd.
    """
    if '"' not in s:
        return s.split()

    # split up quoted regions; odd positions are the quoted ones
    splitted = re.split(r'\s*"\s*', s)
    if len(splitted) % 2 == 0:
        raise QueryError("Mismatched quotes")

    for i in range(1, len(splitted), 2):
        # split the quoted region into words
        words = splitted[i] = splitted[i].split()
        # put the proximity operator in between quoted words; going
        # backwards keeps the remaining insert positions valid
        for j in range(len(words) - 1, 0, -1):
            words.insert(j, Near)

    # splice the words of the unquoted regions into the list, from the
    # end so that earlier positions do not move
    for i in range(len(splitted) - 1, -1, -2):
        splitted[i:i + 1] = splitted[i].split()

    return [part for part in splitted if part]
# Add form for a TextIndex, rendered from dtml/addTextIndex.
manage_addTextIndexForm = DTMLFile('dtml/addTextIndex', globals())
def manage_addTextIndex(self, id, extra=None, REQUEST=None, RESPONSE=None,
                        URL3=None):
    """Add a text index"""
    # Delegates to the container's generic manage_addIndex().
    add_index = self.manage_addIndex
    return add_index(id, 'TextIndex', extra, REQUEST, RESPONSE, URL3)
src/Products/PluginIndexes/TextIndex/Vocabulary.py
deleted
100644 → 0
View file @
adc7e054
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Vocabulary for deprecated text index.
$Id$
"""
from
AccessControl.Permissions
import
manage_vocabulary
from
AccessControl.Permissions
import
query_vocabulary
from
AccessControl.Role
import
RoleManager
from
AccessControl.SecurityInfo
import
ClassSecurityInfo
from
Acquisition
import
Implicit
from
App.class_init
import
InitializeClass
from
App.Dialogs
import
MessageDialog
from
App.special_dtml
import
DTMLFile
from
Persistence
import
Persistent
from
OFS.SimpleItem
import
Item
from
zope.interface
import
implements
from
Products.PluginIndexes.interfaces
import
IVocabulary
from
Products.PluginIndexes.TextIndex
import
Lexicon
,
GlobbingLexicon
from
Products.PluginIndexes.TextIndex.Lexicon
import
stop_word_dict
from
Products.PluginIndexes.TextIndex
import
Splitter
# Add form for a Vocabulary, rendered from dtml/addVocabulary.
manage_addVocabularyForm = DTMLFile('dtml/addVocabulary', globals())
def manage_addVocabulary(self, id, title, globbing=None, extra=None,
                         splitter='', REQUEST=None):
    """Add a Vocabulary object
    """
    id = str(id)
    title = str(title)
    # any true value is normalised to 1
    if globbing:
        globbing = 1

    vocabulary = Vocabulary(id, title, globbing, splitter, extra)
    self._setObject(id, vocabulary)

    if REQUEST is None:
        return None
    return self.manage_main(self, REQUEST, update_menu=1)
# Empty attribute holder; Vocabulary.__init__ fills one with default
# splitter settings when no 'extra' record is passed in.
class _extra:
    pass
class Vocabulary(Item, Persistent, Implicit, RoleManager):
    """A Vocabulary is a user-manageable realization of a Lexicon object.
    """

    implements(IVocabulary)

    security = ClassSecurityInfo()
    security.setPermissionDefault(manage_vocabulary, ('Manager',))
    security.setPermissionDefault(query_vocabulary, ('Anonymous', 'Manager',))

    meta_type = "Vocabulary"
    _isAVocabulary = 1

    # Own tabs first, then those inherited from Item and RoleManager.
    manage_options = (
        (
            {'label': 'Vocabulary',
             'action': 'manage_main',
             'help': ('ZCatalog', 'Vocabulary_Vocabulary.stx')},
            {'label': 'Query',
             'action': 'manage_query',
             'help': ('ZCatalog', 'Vocabulary_Query.stx')},
        )
        + Item.manage_options
        + RoleManager.manage_options
    )

    security.declareProtected(manage_vocabulary, 'manage_main')
    manage_main = DTMLFile('dtml/manage_vocab', globals())

    security.declareProtected(manage_vocabulary, 'manage_query')
    manage_query = DTMLFile('dtml/vocab_query', globals())

    def __init__(self, id, title='', globbing=None, splitter=None, extra=None):
        """ create the lexicon to manage...

        'globbing' selects a GlobbingLexicon instead of a plain Lexicon.
        'splitter' names the splitter to use; the first entry of
        Splitter.splitterNames is used when it is empty.
        'extra' carries the splitter settings; defaults are filled in
        when it is missing.
        """
        self.id = id
        self.title = title
        self.globbing = not not globbing

        self.useSplitter = Splitter.splitterNames[0]
        if splitter:
            self.useSplitter = splitter

        if not extra:
            # default splitter settings
            extra = _extra()
            extra.splitterIndexNumbers = 0
            extra.splitterSingleChars = 0
            extra.splitterCasefolding = 1

        if globbing:
            self.lexicon = GlobbingLexicon.GlobbingLexicon(
                useSplitter=self.useSplitter, extra=extra)
        else:
            self.lexicon = Lexicon.Lexicon(stop_word_dict,
                                           useSplitter=self.useSplitter,
                                           extra=extra)

    # Returns the managed lexicon object.
    # NOTE(review): this and the other docstring-less methods below are
    # left without docstrings on purpose -- Zope 2 publishes only objects
    # that have one; confirm before adding any.
    def getLexicon(self):
        return self.lexicon

    security.declareProtected(query_vocabulary, 'query')
    def query(self, pattern):
        """ Return str() of the list of matches for 'pattern'.

        With globbing each match is the word stored for the returned id;
        without globbing 'pattern' itself is listed once per match.
        """
        result = []
        for x in self.lexicon.get(pattern):
            if self.globbing:
                result.append(self.lexicon._inverseLex[x])
            else:
                result.append(pattern)
        return str(result)

    def manage_insert(self, word='', URL1=None, RESPONSE=None):
        """ Insert 'word' into the lexicon and, when a RESPONSE is given,
        redirect back to the management page. """
        self.insert(word)
        if RESPONSE:
            RESPONSE.redirect(URL1 + '/manage_main')

    # Placeholder: accepts its arguments and does nothing.
    def manage_stop_syn(self, stop_syn, REQUEST=None):
        pass

    # Adds 'word' to the lexicon.
    def insert(self, word=''):
        self.lexicon.set(word)

    # Returns the items of the lexicon's internal '_lexicon' mapping.
    def words(self):
        return self.lexicon._lexicon.items()

InitializeClass(Vocabulary)
src/Products/PluginIndexes/TextIndex/__init__.py
deleted
100644 → 0
View file @
adc7e054
# empty comment for winzip and friends

import warnings
# Emitted when the package is imported; stacklevel=2 attributes the
# warning to the importing code instead of this module.
warnings.warn('Using TextIndex is deprecated (will be removed in Zope '
              '2.12). Use ZCTextIndex instead.',
              DeprecationWarning,
              stacklevel=2)
src/Products/PluginIndexes/TextIndex/dtml/addTextIndex.dtml
deleted
100644 → 0
View file @
adc7e054
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add TextIndex',
)">
<p class="form-help">
<strong>Note:</strong>
TextIndex is deprecated. It has been replaced by ZCTextIndex. Consider
using ZCTextIndex instead.
</p>
<p class="form-help">
<strong>Text Indexes</strong> break text up into individual words, and
are often referred to as full-text indexes. Text indexes
sort results by score meaning they return hits in order
from the most relevant to the least relevant.
</p>
<form action="manage_addTextIndex" method="post" enctype="multipart/form-data">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Vocabulary
</div>
</td>
<td>
<dtml-let vocabs="superValues('Vocabulary')">
<dtml-if vocabs>
<select name="extra.vocabulary:record">
<dtml-in expr="superValues('Vocabulary')">
<option value="&dtml-id;">
&dtml-id; <dtml-var title fmt="(%s)" null html_quote>
</option>
</dtml-in>
</select>
<dtml-else>
<em class="std-text">Create a Vocabulary object first.</em>
</dtml-if>
</dtml-let>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Type
</div>
</td>
<td align="left" valign="top">
TextIndex
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
src/Products/PluginIndexes/TextIndex/dtml/addVocabulary.dtml
deleted
100644 → 0
View file @
adc7e054
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add Vocabulary',
)">
<FORM ACTION="manage_addVocabulary" METHOD="POST">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Title
</div>
</td>
<td align="left" valign="top">
<input type="text" name="title" size="40" />
</td>
</tr>
<dtml-if availableSplitters>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Splitter
</div>
</td>
<td align="left" valign="top">
<select name="splitter">
<dtml-in availableSplitters>
<option value="&dtml-sequence-key;">&dtml-sequence-item;
</dtml-in>
</select>
</td>
</tr>
</dtml-if>
<tr>
<td align="left" valign="top">
<div class="form-label">
Index numbers
</div>
</td>
<td align="left" valign="top">
<select name="extra.splitterIndexNumbers:record:int">
<option value="0" selected>no
<option value="1">yes
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Index single characters
</div>
</td>
<td align="left" valign="top">
<select name="extra.splitterSingleChars:record:int" >
<option value="0" selected>no
<option value="1">yes
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Case-insensitive
</div>
</td>
<td align="left" valign="top">
<select name="extra.splitterCasefolding:record:int">
<option value="0" >no
<option value="1" selected>yes
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
globbing?
</div>
</td>
<td align="left" valign="top">
<input type="checkbox" name="globbing" />
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
src/Products/PluginIndexes/TextIndex/dtml/manageTextIndex.dtml
deleted
100644 → 0
View file @
adc7e054
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
<form method="post" action="manage_setPreferences">
<table border="0" cellspacing="2" cellpadding="2">
<tr>
<th align="left" width="20%">Vocabulary to use</th>
<td align="left">
<select name="vocabulary">
<dtml-in "superValues('Vocabulary')">
<dtml-if "getId()==vocabulary_id">
<option value="&dtml-id;" selected>
&dtml-id; <dtml-var title fmt="(%s)" null html_quote>
</option>
<dtml-else>
<option value="&dtml-id;">
&dtml-id; <dtml-var title fmt="(%s)" null html_quote>
</option>
</dtml-if>
</dtml-in>
</select>
</td>
<td>
<em>Warning:</em> changing the vocabulary only makes sense after
creating the index and before indexing any objects. The index will be cleared
if you change the vocabulary after indexing objects.
</td>
</tr>
<dtml-comment>
<tr>
<th align="left">Splitter</th>
<td>
<select name="splitter">
<dtml-in availableSplitters>
<dtml-if "_.getitem('sequence-key')==useSplitter">
<option value="&dtml-sequence-key;" selected>&dtml-sequence-item;
<dtml-else>
<option value="&dtml-sequence-key;">&dtml-sequence-item;
</dtml-if>
</dtml-in>
</select>
</td>
</tr>
<tr>
<th align="left">Default text operator</th>
<td>
<select name="text_operator">
<dtml-in "operators.keys()">
<dtml-if "_.getitem('sequence-item')==useOperator">
<option value="&dtml-sequence-item;" selected>&dtml-sequence-item;
<dtml-else>
<option value="&dtml-sequence-item;">&dtml-sequence-item;
</dtml-if>
</dtml-in>
</select>
</td>
</tr>
</dtml-comment>
<tr>
<td colspan="3">
<input type="submit" value="Save changes">
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
src/Products/PluginIndexes/TextIndex/dtml/manageVocabulary.dtml
deleted
100644 → 0
View file @
adc7e054
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Manage vocabulary of text index',
help_topic='addIndex.stx'
)">
<dtml-var "getLexicon('Vocabulary')">
<form action="manage_addTextIndex" method="post" enctype="multipart/form-data">
<table cellspacing="0" cellpadding="2" border="0">
</table>
</form>
<dtml-var manage_page_footer>
src/Products/PluginIndexes/TextIndex/dtml/manage_vocab.dtml
deleted
100644 → 0
View file @
adc7e054
<dtml-call "RESPONSE.setHeader('content-type','text/html; charset=utf-8')">
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-text">
<dtml-let lexicon="getLexicon()">
<dtml-try>
<dtml-let x="lexicon.multi_wc"></dtml-let>
Globbing is <em>enabled</em>
<dtml-except>
Globbing is <em>disabled</em>
</dtml-try>
<dtml-if useSplitter>
, Splitter is <em>&dtml-useSplitter;</em>
</dtml-if>
<dtml-try>
, Index number=<dtml-var "lexicon.splitterParams.splitterIndexNumbers">
, Case-insensitive=<dtml-var "lexicon.splitterParams.splitterCasefolding">
, Index single characters=<dtml-var "lexicon.splitterParams.splitterSingleChars">
<dtml-except>
</dtml-try>
</dtml-let>
</p>
<dtml-if words>
<p class="form-text">
&dtml-id; contains <em><dtml-var words fmt=collection-length thousands_commas></em>
word(s).
</p>
<dtml-in words previous size=20 start=query_start >
<span class="list-nav">
<a href="&dtml-URL;?query_start=&dtml-previous-sequence-start-number;">
[Previous <dtml-var previous-sequence-size> entries]
</a>
</span>
</dtml-in>
<dtml-in words next size=20 start=query_start >
<span class="list-nav">
<a href="&dtml-URL;?query_start=&dtml-next-sequence-start-number;">
[Next <dtml-var next-sequence-size> entries]
</a>
</span>
</dtml-in>
<table width="100%" cellspacing="0" cellpadding="2" border="0">
<dtml-in words size=20 start=query_start >
<dtml-if name="sequence-start">
<tr class="list-header">
<td width="80%" align="left" valign="top">
<div class="list-item">Word</div></td>
<td width="20%" align="left" valign="top">
<div class="list-item">Word ID</div></td>
</tr>
</dtml-if>
<dtml-if name="sequence-odd"><tr class="row-normal">
<dtml-else><tr class="row-hilite"></dtml-if>
<td valign="top" align="left">
<div class="form-text">
<dtml-if "_.same_type(_['sequence-key'], 'x')">
&dtml-sequence-key;
<dtml-else>
<dtml-var "_['sequence-key'].encode('utf-8')" html_quote>
</dtml-if>
</div>
</td>
<td valign="top" align="left">
<div class="form-text">&dtml-sequence-item;</div>
</td>
</tr>
</dtml-in>
</table>
<dtml-in words previous size=20 start=query_start >
<div class="list-nav">
<a href="&dtml-URL;?query_start=&dtml-previous-sequence-start-number;">
[Previous <dtml-var previous-sequence-size> entries]
</a>
</div>
</dtml-in>
<dtml-in words next size=20 start=query_start >
<div class="list-nav">
<a href="&dtml-URL;?query_start=&dtml-next-sequence-start-number;">
[Next <dtml-var next-sequence-size> entries]
</a>
</div>
</dtml-in>
<dtml-else>
<p class="form-text">
There are no words in the Vocabulary.
</p>
</dtml-if>
<dtml-var manage_page_footer>
src/Products/PluginIndexes/TextIndex/dtml/vocab_query.dtml
deleted
100644 → 0
View file @
adc7e054
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<form action="query" method=POST>
<input type="text" name="pattern" size="20">
<div class="form-element">
<input class="form-element" type="submit" name="submit" value="Query">
</div>
</form>
<dtml-var manage_page_footer>
src/Products/PluginIndexes/TextIndex/help/TextIndex_searchResults.stx
deleted
100644 → 0
View file @
adc7e054
ZCatalog - searchResults: specifying parameters for a search query
The searchResults() method of the ZCatalog accepts parameters that
define a query to be made on that catalog. A query can be passed
as keyword arguments to searchResults(), as a mapping, or as
part of a Zope REQUEST object, typically from an HTML form.
The index of the catalog to query is either the name of the
keyword argument, a key in a mapping, or an attribute of a record
object.
Attributes of record objects
'query' -- either a sequence of objects or a single value to be
passed as query to the index (mandatory)
'operator' -- specifies the combination of search results when
query is a sequence of values. (optional, default: 'or').
Allowed values:
'and', 'or', 'andnot', 'near'
src/Products/PluginIndexes/TextIndex/tests/__init__.py
deleted
100644 → 0
View file @
adc7e054
##############################################################################
#
# Copyright (c) 2003 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
# This file is needed to make this a package.
src/Products/PluginIndexes/TextIndex/tests/testSplitter.py
deleted
100644 → 0
View file @
adc7e054
# -*- coding: ISO-8859-1 -*-
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
import
os
,
sys
import
unittest
,
locale
from
Products.PluginIndexes.TextIndex
import
Splitter
class TestCase(unittest.TestCase):
    """
       Test our Splitters
    """

    def setUp(self):
        # Pairs of (input text, expected list of words after splitting).
        # NOTE(review): the second sample looks like German text whose
        # non-ASCII characters were lost somewhere along the way; the
        # literals are reproduced exactly as found -- confirm against the
        # ISO-8859-1 original before relying on them.
        self.testdata = (
            ('The quick brown fox jumps over the lazy dog',
             ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the',
              'lazy', 'dog']),
            ('fters sterreichische herber berfall da rger verrgert',
             ['fters', 'sterreichische', 'herber', 'berfall', 'da', 'rger',
              'verrgert'])
        )

    def tearDown(self):
        """
        """

    def testAvailableSplitters(self):
        "Test available splitters"
        # unittest assertions are used instead of bare ``assert`` so the
        # checks survive ``python -O`` and report the compared values.
        self.assertTrue(len(Splitter.availableSplitters) > 0)
        self.assertTrue(len(Splitter.splitterNames) > 0)
        self.assertEqual(len(Splitter.availableSplitters),
                         len(Splitter.splitterNames))

    def _test(self, sp_name, text, splitted):
        # Split *text* with the splitter registered as *sp_name* and
        # compare the resulting word list with *splitted*.
        splitter = Splitter.getSplitter(sp_name)
        result = list(splitter(text))
        self.assertEqual(result, splitted,
                         "%s: %s vs %s" % (sp_name, result, splitted))

    # Disabled test, kept for reference:
    # def testZopeSplitter(self):
    #     """test ZopeSplitter (this test is known to fail because it does not support ISO stuff) """
    #
    #     for text,splitted in self.testdata:
    #         self._test("ZopeSplitter",text,splitted)

    def testISOSplitter(self):
        """test ISOSplitter"""
        for text, splitted in self.testdata:
            self._test("ISO_8859_1_Splitter", text, splitted)
def test_suite():
    """Return a suite containing every ``test*`` method of ``TestCase``.

    Uses the default test loader rather than ``unittest.makeSuite``,
    which is deprecated and no longer available in recent Python versions;
    the loader call builds the same suite.
    """
    return unittest.defaultTestLoader.loadTestsFromTestCase(TestCase)
def debug():
    """Build the suite and return the result of its ``debug()`` method."""
    suite = test_suite()
    return suite.debug()
def pdebug():
    """Execute the statement ``debug()`` under the ``pdb`` debugger."""
    from pdb import run
    run('debug()')
def main():
    """Run the module's test suite with a ``TextTestRunner``."""
    runner = unittest.TextTestRunner()
    runner.run(test_suite())
if __name__ == '__main__':
    # With a command-line argument, call the module-level function of that
    # name (e.g. ``debug`` or ``pdebug``); otherwise run the suite.
    requested = sys.argv[1:]
    if requested:
        globals()[requested[0]]()
    else:
        main()
src/Products/PluginIndexes/TextIndex/tests/testTextIndex.py
deleted
100644 → 0
View file @
adc7e054
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""TextIndex unit tests.
$Id$
"""
import
unittest
import
Testing
import
Zope2
Zope2
.
startup
()
import
ZODB
from
ZODB.MappingStorage
import
MappingStorage
import
transaction
from
Products.PluginIndexes.TextIndex
import
TextIndex
from
Products.PluginIndexes.TextIndex
import
GlobbingLexicon
class Dummy:
    """Minimal stand-in document that wraps a piece of text."""

    def __init__(self, text):
        # Kept on a private attribute; exposed through text() and the
        # string representations below.
        self._text = text

    def text(self):
        """Return the text supplied at construction time."""
        return self._text

    def __str__(self):
        """Render the instance as ``<Dummy: ...>`` around the stored text."""
        wrapped = self._text
        return '<Dummy: %s>' % wrapped

    # repr() and str() share a single implementation.
    __repr__ = __str__
class Tests(unittest.TestCase):
    """Tests for ``TextIndex``: plain indexing, persistence and glob queries.

    Compared with the previous version the checks use ``unittest``
    assertion methods instead of bare ``assert`` statements (which are
    stripped under ``python -O``), and Python-2-only constructs
    (``raise X, msg`` and ``has_key``) are replaced by forms that are valid
    on both Python 2 and Python 3.
    """

    # Database and connection are created lazily by dbopen().
    db = None
    jar = None

    def setUp(self):
        self.index = TextIndex.TextIndex('text')
        self.doc = Dummy(text='this is the time, when all good zopes')

    def dbopen(self):
        """Open a connection and return the index stored under 'index'.

        The database is created on first use and an index is stored in the
        root if none is present yet.  Raises RuntimeError when a connection
        is already open.
        """
        if self.db is None:
            s = MappingStorage()
            self.db = ZODB.DB(s)
        db = self.db
        if self.jar is not None:
            raise RuntimeError('test needs to dbclose() before dbopen()')
        jar = db.open()
        self.jar = jar
        if 'index' not in jar.root():
            jar.root()['index'] = TextIndex.TextIndex('text')
            transaction.commit()
        return jar.root()['index']

    def dbclose(self):
        """Close the connection opened by dbopen()."""
        self.jar.close()
        self.jar = None

    def tearDown(self):
        transaction.abort()
        if self.jar is not None:
            self.dbclose()
        if self.db is not None:
            self.db.close()
            self.db = None

    def test_z3interfaces(self):
        from Products.PluginIndexes.interfaces import IPluggableIndex
        from Products.PluginIndexes.interfaces import ITextIndex
        from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
        from zope.interface.verify import verifyClass

        verifyClass(IPluggableIndex, TextIndex)
        verifyClass(ITextIndex, TextIndex)

    def test_SimpleAddDelete(self):
        self.index.index_object(0, self.doc)
        self.index.index_object(1, self.doc)
        # Assigning to ``text`` replaces the Dummy.text method with a
        # plain string on this instance.
        self.doc.text = 'spam is good, spam is fine, span span span'
        self.index.index_object(0, self.doc)
        self.index.unindex_object(0)

    def test_PersistentUpdate1(self):
        # Check simple persistent indexing
        index = self.dbopen()

        self.doc.text = 'this is the time, when all good zopes'
        index.index_object(0, self.doc)
        transaction.commit()

        self.doc.text = 'time waits for no one'
        index.index_object(1, self.doc)
        transaction.commit()
        self.dbclose()

        index = self.dbopen()

        r = index._apply_index({})
        self.assertTrue(r is None, r)

        r = index._apply_index({'text': 'python'})
        self.assertEqual(len(r), 2, 'incorrectly not used')
        self.assertEqual(r[1], ('text',), 'incorrectly not used')
        self.assertFalse(r[0], "should have no results")

        r = index._apply_index({'text': 'time'})
        r = list(r[0].keys())
        self.assertEqual(r, [0, 1])

    def test_PersistentUpdate2(self):
        # Check less simple persistent indexing
        index = self.dbopen()

        self.doc.text = 'this is the time, when all good zopes'
        index.index_object(0, self.doc)
        transaction.commit()

        self.doc.text = 'time waits for no one'
        index.index_object(1, self.doc)
        transaction.commit()

        self.doc.text = 'the next task is to test'
        index.index_object(3, self.doc)
        transaction.commit()

        self.doc.text = 'time time'
        index.index_object(2, self.doc)
        transaction.commit()
        self.dbclose()

        index = self.dbopen()

        r = index._apply_index({})
        self.assertTrue(r is None, r)

        r = index._apply_index({'text': 'python'})
        self.assertEqual(len(r), 2, 'incorrectly not used')
        self.assertEqual(r[1], ('text',), 'incorrectly not used')
        self.assertFalse(r[0], "should have no results")

        r = index._apply_index({'text': 'time'})
        r = list(r[0].keys())
        self.assertEqual(r, [0, 1, 2])

    # Documents indexed by globTest(); the position in the list is used as
    # the document id.
    sample_texts = [
        "This is the time for all good men to come to\n"
        "the aid of their country",
        "ask not what your country can do for you,\n"
        "ask what you can do for your country",
        """Man, I can't wait to get to Montross!""",
        """Zope Public License (ZPL) Version 1.0""",
        """Copyright (c) Digital Creations.  All rights reserved.""",
        """This license has been certified as Open Source(tm).""",
        """I hope I get to work on time""",
    ]

    def globTest(self, qmap, rlist):
        "Check a glob query"
        # Index every sample text using a globbing lexicon, reopen the
        # database, run *qmap* and compare the matching document ids with
        # *rlist*.  The bound ``_apply_index`` is returned so callers can
        # issue follow-up queries against the same index.
        index = self.dbopen()
        index._lexicon = GlobbingLexicon.GlobbingLexicon()

        for i, sample in enumerate(self.sample_texts):
            self.doc.text = sample
            index.index_object(i, self.doc)
            transaction.commit()

        self.dbclose()

        index = self.dbopen()

        r = list(index._apply_index(qmap)[0].keys())
        self.assertEqual(r, rlist)

        return index._apply_index

    def test_StarQuery(self):
        self.globTest({'text': 'm*n'}, [0, 2])

    def test_AndQuery(self):
        self.globTest({'text': 'time and country'}, [0, ])

    def test_OrQuery(self):
        self.globTest({'text': 'time or country'}, [0, 1, 6])

    def test_DefaultOrQuery(self):
        self.globTest({'text': 'time country'}, [0, 1, 6])

    def test_NearQuery(self):
        # Check a NEAR query.. (NOTE:ACTUALLY AN 'AND' TEST!!)
        # NEAR never worked, so Zopes post-2.3.1b3 define near to mean AND
        self.globTest({'text': 'time ... country'}, [0, ])

    def test_QuotesQuery(self):
        ai = self.globTest({'text': '"This is the time"'}, [0, ])

        r = list(ai({'text': '"now is the time"'})[0].keys())
        self.assertEqual(r, [])

    def test_AndNotQuery(self):
        self.globTest({'text': 'time and not country'}, [6, ])

    def test_ParenMatchingQuery(self):
        ai = self.globTest({'text': '(time and country) men'}, [0, ])

        r = list(ai({'text': '(time and not country) or men'})[0].keys())
        self.assertEqual(r, [0, 6])

    def test_TextIndexOperatorQuery(self):
        self.globTest({'text': {'query': 'time men', 'operator': 'and'}},
                      [0, ])

    def test_NonExistentWord(self):
        self.globTest({'text': 'zop'}, [])

    def test_ComplexQuery1(self):
        self.globTest({'text': '((?ount* or get) and not wait) '
                               '"been *ert*"'}, [0, 1, 5, 6])

    # same tests, unicode strings
    def test_StarQueryUnicode(self):
        self.globTest({'text': u'm*n'}, [0, 2])

    def test_AndQueryUnicode(self):
        self.globTest({'text': u'time and country'}, [0, ])

    def test_OrQueryUnicode(self):
        self.globTest({'text': u'time or country'}, [0, 1, 6])

    def test_DefaultOrQueryUnicode(self):
        self.globTest({'text': u'time country'}, [0, 1, 6])

    def test_NearQueryUnicode(self):
        # Check a NEAR query.. (NOTE:ACTUALLY AN 'AND' TEST!!) (unicode)
        # NEAR never worked, so Zopes post-2.3.1b3 define near to mean AND
        self.globTest({'text': u'time ... country'}, [0, ])

    def test_QuotesQueryUnicode(self):
        ai = self.globTest({'text': u'"This is the time"'}, [0, ])

        # NOTE(review): the follow-up query is a plain (non-unicode)
        # literal, unlike the other *Unicode tests -- confirm whether that
        # is intended.
        r = list(ai({'text': '"now is the time"'})[0].keys())
        self.assertEqual(r, [])

    def test_AndNotQueryUnicode(self):
        self.globTest({'text': u'time and not country'}, [6, ])

    def test_ParenMatchingQueryUnicode(self):
        ai = self.globTest({'text': u'(time and country) men'}, [0, ])

        r = list(ai({'text': u'(time and not country) or men'})[0].keys())
        self.assertEqual(r, [0, 6])

    def test_TextIndexOperatorQueryUnicode(self):
        self.globTest({'text': {u'query': u'time men', 'operator': 'and'}},
                      [0, ])

    def test_NonExistentWordUnicode(self):
        self.globTest({'text': u'zop'}, [])

    def test_ComplexQuery1Unicode(self):
        self.globTest({'text': u'((?ount* or get) and not wait) '
                               '"been *ert*"'}, [0, 1, 5, 6])
def test_suite():
    """Return a suite containing every ``test*`` method of ``Tests``.

    Uses the default test loader rather than ``unittest.makeSuite``,
    which is deprecated and no longer available in recent Python versions;
    the loader call builds the same suite.
    """
    return unittest.defaultTestLoader.loadTestsFromTestCase(Tests)
# When executed directly as a script, run the suite built by test_suite().
if __name__ == '__main__':
    unittest.main(defaultTest='test_suite')
src/Products/PluginIndexes/__init__.py
View file @
cb45cbcc
...
@@ -21,20 +21,7 @@ import DateRangeIndex.DateRangeIndex
...
@@ -21,20 +21,7 @@ import DateRangeIndex.DateRangeIndex
from
Products.PluginIndexes.common
import
ResultList
from
Products.PluginIndexes.common
import
ResultList
from
Products.PluginIndexes.common
import
UnIndex
from
Products.PluginIndexes.common
import
UnIndex
# BBB: TextIndex is deprecated but we don't want the warning to appear here
_indexes
=
(
'KeywordIndex'
,
import
warnings
warnings
.
filterwarnings
(
'ignore'
,
message
=
'^Using TextIndex'
,
append
=
1
)
try
:
import
TextIndex.TextIndex
finally
:
del
warnings
.
filters
[
-
1
]
try
:
del
__warningregistry__
except
NameError
:
pass
_indexes
=
(
'TextIndex'
,
'KeywordIndex'
,
'FieldIndex'
,
'FieldIndex'
,
'PathIndex'
,
'PathIndex'
,
'TopicIndex'
,
'TopicIndex'
,
...
...
src/Products/PluginIndexes/interfaces.py
View file @
cb45cbcc
...
@@ -160,38 +160,8 @@ class IPathIndex(Interface):
...
@@ -160,38 +160,8 @@ class IPathIndex(Interface):
"""
"""
class IVocabulary(Interface):

    """A Vocabulary is a user-manageable realization of a Lexicon object.
    """
class ITextIndex(Interface):

    """Full-text index.

    There is a ZCatalog UML model that sheds some light on what is
    going on here.  '_index' is a BTree which maps word ids to a mapping
    from document id to score.  Something like:

      {'bob' : {1 : 5, 2 : 3, 42 : 9}}
      {'uncle' : {1 : 1}}

    The '_unindex' attribute is a mapping from document id to word
    ids.  This mapping allows the catalog to unindex an object:

      {42 : ('bob', 'is', 'your', 'uncle')}

    This isn't exactly how things are represented in memory; many
    optimizations happen along the way.
    """

    def getLexicon(vocab_id=None):
        """Get the Lexicon in use.
        """
class
IFilteredSet
(
Interface
):
class
IFilteredSet
(
Interface
):
"""A pre-calculated result list based on an expression.
"""A pre-calculated result list based on an expression.
"""
"""
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment