Commit cb45cbcc authored by 's avatar

- removed deprecated TextIndex

parent adc7e054
......@@ -19,6 +19,8 @@ Features Added
Restructuring
+++++++++++++
- PluginIndexes: Removed deprecated TextIndex.
- HelpSys now uses ZCTextIndex instead of the deprecated TextIndex. Please
update your Zope databases by deleting the Product registrations in the
Control Panel and restarting Zope.
......
......@@ -75,21 +75,6 @@ params = dict(name='Zope2',
sources=['src/initgroups/_initgroups.c']),
# indexes
Extension(
name='Products.PluginIndexes.TextIndex.Splitter.'
'ZopeSplitter.ZopeSplitter',
sources=['src/Products/PluginIndexes/TextIndex/Splitter/'
'ZopeSplitter/src/ZopeSplitter.c']),
Extension(
name='Products.PluginIndexes.TextIndex.Splitter.'
'ISO_8859_1_Splitter.ISO_8859_1_Splitter',
sources=['src/Products/PluginIndexes/TextIndex/Splitter/'
'ISO_8859_1_Splitter/src/ISO_8859_1_Splitter.c']),
Extension(
name='Products.PluginIndexes.TextIndex.Splitter.'
'UnicodeSplitter.UnicodeSplitter',
sources=['src/Products/PluginIndexes/TextIndex/Splitter/'
'UnicodeSplitter/src/UnicodeSplitter.c']),
Extension(
name='Products.ZCTextIndex.stopper',
sources=['src/Products/ZCTextIndex/stopper.c']),
......
......@@ -47,34 +47,8 @@ Changes to Indexes:
- new index type
Changes to TextIndex:
- The ZMI allows selecting a different vocabulary. To use a vocabulary other
than the ZCatalog's default vocabulary 'Vocabulary', you must create a new
Vocabulary through the ZMI of the ZCatalog. After creating the vocabulary, you
can choose it on the ZMI management screen for the text index.
- the default operator might be overridden by specifying a new one
as 'operator' (see below)
- removed the direct dependency on the Splitter module. The Splitter is now
acquired from the vocabulary in use
- usage of the 'textindex_operator' is deprecated
- lots of internal rework
Changes to Vocabulary:
- added Splitter selection to the add form
Changes to ZCatalog
- Vocabulary.py moved to Products/PluginIndexes/TextIndex. A wrapper
for backward compatibility is in place
- added ZCatalogIndexes.py to provide access to indexes with pluggable
index interface
......
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
#############################################################################
import re
import string
from BTrees.IIBTree import IISet, union, IITreeSet
from BTrees.OIBTree import OIBTree
from BTrees.IOBTree import IOBTree
from BTrees.OOBTree import OOBTree
from Products.PluginIndexes.common.randid import randid
from Products.PluginIndexes.TextIndex.TextIndex import Op
from Products.PluginIndexes.TextIndex.TextIndex import Or
from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
from Products.PluginIndexes.TextIndex.Splitter import getSplitter
class GlobbingLexicon(Lexicon):
    """Lexicon which supports basic globbing function ('*' and '?').

    This lexicon keeps several data structures around that are useful
    for searching.  They are:

      '_lexicon' -- Contains the mapping from word => word_id

      '_inverseLex' -- Contains the mapping from word_id => word

      '_digrams' -- Contains a mapping from digram => word_id

    Before going further, it is necessary to understand what a digram is,
    as it is a core component of the structure of this lexicon.  A digram
    is a two-letter sequence in a word.  For example, the word 'zope'
    would be converted into the digrams::

      ['$z', 'zo', 'op', 'pe', 'e$']

    where the '$' is a word marker.  It is used at the beginning and end
    of the words.  Those digrams are significant.
    """

    multi_wc = '*'
    single_wc = '?'
    eow = '$'

    def __init__(self, useSplitter=None, extra=None):
        """Create an empty lexicon.

        'useSplitter' is handed to getSplitter() to obtain the splitter
        factory; 'extra' is stored as 'splitterParams' and read by
        Splitter() for the optional splitter keyword arguments.
        """
        self.clear()
        self.useSplitter = useSplitter
        self.splitterParams = extra
        self.SplitterFunc = getSplitter(self.useSplitter)

    def clear(self):
        """Replace all three mappings with fresh, empty BTrees."""
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()
        self._digrams = OOBTree()

    def _convertBTrees(self, threshold=200):
        """Convert old-style mappings (including '_digrams') to BTrees."""
        Lexicon._convertBTrees(self, threshold)
        if type(self._digrams) is OOBTree:
            return

        from BTrees.convert import convert

        old_digrams = self._digrams
        self._digrams = OOBTree()
        self._digrams._p_jar = self._p_jar
        convert(old_digrams, self._digrams, threshold, IITreeSet)

    def createDigrams(self, word):
        """Returns a list with the set of digrams in the word."""
        word = '$' + word + '$'
        return [word[i:i + 2] for i in range(len(word) - 1)]

    def getWordId(self, word):
        """Provided 'word', return the matching integer word id."""
        if self._lexicon.has_key(word):
            return self._lexicon[word]
        else:
            return self.assignWordId(word)

    set = getWordId         # Kludge for old code

    def getWord(self, wid):
        """Return the word stored for 'wid', or None when it is unknown."""
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word, and return it."""
        # Double check it's not in the lexicon already, and if it is, just
        # return it.
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        # Get word id. BBB Backward compat pain.
        inverse = self._inverseLex
        if not hasattr(inverse, 'insert'):
            # we have an "old" BTree object
            if inverse:
                wid = inverse.keys()[-1] + 1
            else:
                # Rebind the local name as well: the word must be recorded
                # in the mapping that replaces the old one, not in the
                # object that is being discarded.
                inverse = self._inverseLex = IOBTree()
                wid = 1
            inverse[wid] = word
        else:
            # we have a "new" IOBTree object; insert() returns a false
            # value when the random id is already taken, so retry.
            wid = randid()
            while not inverse.insert(wid, word):
                wid = randid()

        self._lexicon[word] = wid

        # Now take all the digrams and insert them into the digram map.
        for digram in self.createDigrams(word):
            wids = self._digrams.get(digram, None)
            if wids is None:
                self._digrams[digram] = wids = IISet()
            wids.insert(wid)

        return wid

    def get(self, pattern):
        """ Query the lexicon for words matching a pattern.

        Returns an empty tuple when nothing matches, a one-element tuple
        holding the word id for a pattern without wildcards, and an IISet
        of word ids for a wildcard pattern.
        """
        # single word pattern produce a slicing problem below.
        # Because the splitter throws away single characters we can
        # return an empty tuple here.
        if len(pattern) == 1:
            return ()

        wc_set = [self.multi_wc, self.single_wc]

        digrams = []
        globbing = 0
        for i in range(len(pattern)):
            if pattern[i] in wc_set:
                globbing = 1
                continue

            if i == 0:
                digrams.insert(i, (self.eow + pattern[i]))
                digrams.append((pattern[i] + pattern[i + 1]))
            else:
                try:
                    if pattern[i + 1] not in wc_set:
                        digrams.append(pattern[i] + pattern[i + 1])
                except IndexError:
                    # last character of the pattern: pair it with the
                    # end-of-word marker
                    digrams.append((pattern[i] + self.eow))

        if not globbing:
            result = self._lexicon.get(pattern, None)
            if result is None:
                return ()
            return (result, )

        ## now get all of the intsets that contain the result digrams
        result = None
        for digram in digrams:
            result = union(result, self._digrams.get(digram, None))

        if not result:
            return ()

        ## now we have narrowed the list of possible candidates
        ## down to those words which contain digrams.  However,
        ## some words may have been returned that match digrams,
        ## but do not match 'pattern'.  This is because some words
        ## may contain all matching digrams, but in the wrong
        ## order.
        expr = re.compile(self.createRegex(pattern))
        hits = IISet()
        for x in result:
            if expr.match(self._inverseLex[x]):
                hits.insert(x)
        return hits

    def __getitem__(self, word):
        """Mapping-style access; same result as get(word)."""
        return self.get(word)

    def query_hook(self, q):
        """expand wildcards

        Walks the query list 'q' in place from the end: nested lists are
        processed recursively, operator objects are left alone, and every
        term containing a wildcard is replaced by a list of matching word
        ids separated by 'Or'.  Returns 'q'.
        """
        ListType = type([])
        i = len(q) - 1
        while i >= 0:
            e = q[i]
            if isinstance(e, ListType):
                self.query_hook(e)
            elif isinstance(e, Op):
                pass
            elif ((self.multi_wc in e) or
                  (self.single_wc in e)):
                wids = self.get(e)
                words = []
                for wid in wids:
                    if words:
                        words.append(Or)
                    words.append(wid)
                if not words:
                    # if words is empty, return something that will make
                    # textindex's __getitem__ return an empty result list
                    words.append('')
                q[i] = words
            i = i - 1

        return q

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter

        The splitter is first called with the keyword options taken from
        'splitterParams'; if that call fails for any ordinary reason
        (for example 'splitterParams' is None, or the splitter does not
        accept the keywords) it is retried with positional arguments only.
        """
        ## don't do anything, less efficient but there's not much
        ## sense in stemming a globbing lexicon.
        try:
            return self.SplitterFunc(
                astring,
                words,
                encoding=encoding,
                singlechar=self.splitterParams.splitterSingleChars,
                indexnumbers=self.splitterParams.splitterIndexNumbers,
                casefolding=self.splitterParams.splitterCasefolding
                )
        except Exception:
            # Best-effort fallback, but let KeyboardInterrupt/SystemExit
            # propagate instead of silently retrying.
            return self.SplitterFunc(astring, words)

    def createRegex(self, pat):
        """Translate a PATTERN to a regular expression.

        There is no way to quote meta-characters.
        """
        # Remove characters that are meaningful in a regex
        # NOTE(review): '+', '[' and ']' are not in this list, so a pattern
        # containing them reaches re.compile() unchanged -- confirm whether
        # that is intended.
        if not isinstance(pat, unicode):
            transTable = string.maketrans("", "")
            result = string.translate(pat, transTable,
                                      r'()&|!@#$%^{}\<>.')
        else:
            transTable = {}
            for ch in r'()&|!@#$%^{}\<>.':
                transTable[ord(ch)] = None
            result = pat.translate(transTable)

        # First, deal with multi-character globbing
        result = result.replace('*', '.*')

        # Next, we need to deal with single-character globbing
        result = result.replace('?', '.')

        return "%s$" % result
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
__doc__=""" Module breaks out Zope specific methods and behavior. In
addition, provides the Lexicon class which defines a word to integer
mapping.
"""
from Acquisition import Implicit
from BTrees.OIBTree import OIBTree
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IISet
from BTrees.IIBTree import IITreeSet
from Persistence import Persistent
from Products.PluginIndexes.common.randid import randid
from Products.PluginIndexes.TextIndex.Splitter import getSplitter
from Products.PluginIndexes.TextIndex.Splitter import splitterNames
from types import StringType
class Lexicon(Persistent, Implicit):
    """Maps words to word ids and then some

    The Lexicon object is an attempt to abstract vocabularies out of
    Text indexes.  This abstraction is not totally cooked yet, this
    module still includes the parser for the 'Text Index Query
    Language' and a few other hacks.
    """

    # default for older objects
    stop_syn = {}

    def __init__(self, stop_syn=None, useSplitter=None, extra=None):
        """Create an empty lexicon.

        'stop_syn' is the stop word / synonym mapping (a new dict when
        omitted).  'useSplitter' names the splitter to use; when it is
        false the first entry of 'splitterNames' is used.  'extra' is
        stored as 'splitterParams' and read by Splitter().
        """
        self.clear()
        if stop_syn is None:
            self.stop_syn = {}
        else:
            self.stop_syn = stop_syn

        self.useSplitter = splitterNames[0]
        if useSplitter:
            self.useSplitter = useSplitter
        self.splitterParams = extra
        self.SplitterFunc = getSplitter(self.useSplitter)

    def clear(self):
        """Replace both mappings with fresh, empty BTrees."""
        self._lexicon = OIBTree()
        self._inverseLex = IOBTree()

    def _convertBTrees(self, threshold=200):
        """Convert '_lexicon' and '_inverseLex' to OIBTree / IOBTree."""
        if (type(self._lexicon) is OIBTree and
            type(getattr(self, '_inverseLex', None)) is IOBTree):
            return

        from BTrees.convert import convert

        lexicon = self._lexicon
        self._lexicon = OIBTree()
        self._lexicon._p_jar = self._p_jar
        convert(lexicon, self._lexicon, threshold)

        try:
            inverseLex = self._inverseLex
            self._inverseLex = IOBTree()
        except AttributeError:
            # older lexicons didn't have an inverse lexicon
            self._inverseLex = IOBTree()
            inverseLex = self._inverseLex

        self._inverseLex._p_jar = self._p_jar
        convert(inverseLex, self._inverseLex, threshold)

    def set_stop_syn(self, stop_syn):
        """ pass in a mapping of stopwords and synonyms.  Format is:

        {'word' : [syn1, syn2, ..., synx]}

        Vocabularies do not necessarily need to implement this if their
        splitters do not support stemming or stopping.
        """
        self.stop_syn = stop_syn

    def getWordId(self, word):
        """ return the word id of 'word' """
        wid = self._lexicon.get(word, None)
        if wid is None:
            wid = self.assignWordId(word)
        return wid

    set = getWordId

    def getWord(self, wid):
        """ post-2.3.1b2 method, will not work with unconverted lexicons """
        return self._inverseLex.get(wid, None)

    def assignWordId(self, word):
        """Assigns a new word id to the provided word and returns it."""
        # First make sure it's not already in there
        if self._lexicon.has_key(word):
            return self._lexicon[word]

        try:
            inverse = self._inverseLex
        except AttributeError:
            # Old lexicon without an inverse mapping: rebuild it from the
            # forward mapping.  The loop uses its own names so that the
            # 'word' being assigned is not overwritten.
            inverse = self._inverseLex = IOBTree()
            for known_word, known_wid in self._lexicon.items():
                inverse[known_wid] = known_word

        # insert() returns a false value when the id is already taken.
        wid = randid()
        while not inverse.insert(wid, word):
            wid = randid()

        if isinstance(word, str):
            self._lexicon[intern(word)] = wid
        else:
            self._lexicon[word] = wid

        return wid

    def get(self, key, default=None):
        """Return the matched word against the key.

        The result is an IISet containing the word id found for 'key'
        (or 'default' when the key is unknown and 'default' is not None).
        """
        r = IISet()
        wid = self._lexicon.get(key, default)
        if wid is not None:
            r.insert(wid)
        return r

    def __getitem__(self, key):
        """Mapping-style access; same result as get(key)."""
        return self.get(key)

    def __len__(self):
        """Number of words stored in the lexicon."""
        return len(self._lexicon)

    def Splitter(self, astring, words=None, encoding="latin1"):
        """ wrap the splitter

        'words' defaults to the lexicon's stop word / synonym mapping.
        The splitter is first called with the keyword options taken from
        'splitterParams'; if that call fails for any ordinary reason it
        is retried with positional arguments only.
        """
        if words is None:
            words = self.stop_syn

        try:
            return self.SplitterFunc(
                astring,
                words,
                encoding=encoding,
                singlechar=self.splitterParams.splitterSingleChars,
                indexnumbers=self.splitterParams.splitterIndexNumbers,
                casefolding=self.splitterParams.splitterCasefolding
                )
        except Exception:
            # Best-effort fallback, but let KeyboardInterrupt/SystemExit
            # propagate instead of silently retrying.
            return self.SplitterFunc(astring, words)

    def query_hook(self, q):
        """ we don't want to modify the query cuz we're dumb """
        return q
# Default English stop words.  The tuple is kept exactly as shipped,
# including the entries that occur more than once.
stop_words = (
    'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across',
    'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone',
    'along', 'already', 'also', 'although', 'always', 'am', 'among',
    'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any',
    'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around',
    'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes',
    'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
    'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
    'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could',
    'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due',
    'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else',
    'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone',
    'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty',
    'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly',
    'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get',
    'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her',
    'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
    'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i',
    'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it',
    'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least',
    'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill',
    'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must',
    'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless',
    'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not',
    'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
    'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our',
    'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps',
    'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem',
    'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should',
    'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some',
    'somehow', 'someone', 'something', 'sometime', 'sometimes',
    'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the',
    'their', 'them', 'themselves', 'then', 'thence', 'there',
    'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these',
    'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three',
    'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too',
    'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under',
    'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well',
    'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
    'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
    'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
    'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
    'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves',
    )

# Mapping form of the list above: every stop word maps to None.
stop_word_dict = {}
for word in stop_words:
    stop_word_dict[word] = None
<extension ISO_8859_1_Splitter>
source src/ISO_8859_1_Splitter.c
</extension>
from ISO_8859_1_Splitter import ISO_8859_1_Splitter
def Splitter(txt, stopwords=None, encoding='latin1'):
    """Return an ISO_8859_1_Splitter built from 'txt' and 'stopwords'.

    'encoding' is accepted but is not passed on to the C splitter.
    """
    splitter = ISO_8859_1_Splitter(txt, stopwords)
    return splitter
/*****************************************************************************
Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/
#include "Python.h"
#include <ctype.h>
/*
 * Reference-handling helpers:
 *   ASSIGN(V,E)        evaluates E, releases the old value of V, then
 *                      stores the result of E in V.
 *   UNLESS(E)          reads as "if not E".
 *   UNLESS_ASSIGN(V,E) performs ASSIGN(V,E) and then opens an UNLESS(V)
 *                      test, so the statement that follows runs when the
 *                      new value is NULL.
 */
#define ASSIGN(V,E) {PyObject *__e; __e=(E); Py_XDECREF(V); (V)=__e;}
#define UNLESS(E) if(!(E))
#define UNLESS_ASSIGN(V,E) ASSIGN(V,E) UNLESS(V)
/*
 * Character sets used by initSplitterTrtabs() to fill the tables below.
 * UPPERCASE and LOWERCASE are paired position by position.
 * NOTE(review): only ASCII letters appear here although this is the
 * ISO-8859-1 splitter -- confirm that no accented letters were lost.
 */
#define UPPERCASE "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
#define LOWERCASE "abcdefghijklmnopqrstuvwxyz"
#define DIGITSETC "0123456789-"
/* letdig[c] is non-zero when byte c counts as a word character. */
static unsigned char letdig[256];
/* trtolower[c] is the case-folded replacement for byte c. */
static unsigned char trtolower[256];
/* State of one splitter object iterating over a string. */
typedef struct
{
PyObject_HEAD
/* text: string form of the document; synstop: optional stop word /
   synonym mapping (may be NULL). */
PyObject *text, *synstop;
/* here: scan position inside text's buffer; end: one past its last byte. */
char *here, *end;
/* Index of the word most recently returned by next_word(); -1 after a
   reset. */
int index;
/* When zero, words shorter than two characters are treated as stop words. */
int allow_single_chars;
/* When zero, words without any letter are treated as stop words. */
int index_numbers;
/* Maximum number of characters kept per word; longer words are cut. */
int max_len;
/* When non-zero, characters are mapped through mytolower() while scanning. */
int casefolding;
}
Splitter;
/* Forward declaration; the definition follows check_synstop(). */
static PyObject *next_word(Splitter *,char **,char **);
/* Return non-zero when the low byte of `c` is flagged in the letdig table. */
static int myisalnum(int c)
{
    const unsigned char slot = (unsigned char)c;

    return letdig[slot];
}
/* Return the trtolower table entry for the low byte of `c`. */
static int mytolower(int c)
{
    const unsigned char slot = (unsigned char)c;

    return trtolower[slot];
}
/*
 * Return non-zero when `c` is whitespace and is not a word character.
 *
 * Callers pass plain `char` values, which may be negative for bytes above
 * 0x7f.  isspace() is only defined for EOF and values representable as
 * unsigned char, so the argument is converted before the call.
 */
static int myisspace(int c)
{
    if (myisalnum(c))
        return 0;
    return isspace((unsigned char)c);
}
/*
 * Fill the letdig and trtolower tables.  Runs its body only once; later
 * calls return immediately.
 */
static void initSplitterTrtabs(void)
{
    static int initialized = 0;
    size_t i;

    if (initialized)
        return;
    initialized = 1;

    /* Start from "no word characters" and an identity case mapping. */
    for (i = 0; i < 256; i++) {
        letdig[i] = 0;
        trtolower[i] = (unsigned char)i;
    }

    /*
     * sizeof() of a string literal counts the terminating NUL, so stop
     * one element early; otherwise byte 0 would be flagged as a word
     * character.
     */
    for (i = 0; i < sizeof(UPPERCASE) - 1; i++) {
        trtolower[(unsigned char)UPPERCASE[i]] = LOWERCASE[i];
        letdig[(unsigned char)LOWERCASE[i]] = 1;
        letdig[(unsigned char)UPPERCASE[i]] = 1;
    }
    for (i = 0; i < sizeof(DIGITSETC) - 1; i++) {
        letdig[(unsigned char)DIGITSETC[i]] = 1;
    }
}
/* Rewind the splitter: scan position back to the start of the text and
   word index back to -1. */
static void
Splitter_reset(Splitter *self)
{
    char *text_start = PyString_AsString(self->text);

    self->here = text_start;
    self->index = -1;
}
/* tp_dealloc: release the text and the optional synstop mapping, then
   free the object itself. */
static void
Splitter_dealloc(Splitter *self)
{
    PyObject *text = self->text;
    PyObject *synstop = self->synstop;

    Py_XDECREF(text);
    Py_XDECREF(synstop);
    PyObject_DEL(self);
}
/*
 * sq_length: count the words by rescanning the whole text.
 * Returns the number of words, or -1 when next_word() fails.
 */
static int
Splitter_length(Splitter *self)
{
    PyObject *current = NULL;

    Splitter_reset(self);
    for (;;) {
        PyObject *next = next_word(self, NULL, NULL);

        /* drop the previous word before keeping the new one */
        Py_XDECREF(current);
        current = next;
        if (current == NULL)
            return -1;
        if (!PyString_Check(current)) {
            /* anything that is not a string marks the end of the text */
            Py_DECREF(current);
            break;
        }
    }
    return self->index + 1;
}
/*
 * split() -- return a new list holding every word of the text.
 *
 * Returns NULL with an exception set when scanning or appending fails;
 * the partially built list is released in that case.
 */
static PyObject *
Splitter_split(Splitter *self)
{
    PyObject *list;
    PyObject *word;

    list = PyList_New(0);
    if (list == NULL)
        return NULL;

    Splitter_reset(self);
    for (;;) {
        word = next_word(self, NULL, NULL);
        if (word == NULL)
            goto err;
        if (word == Py_None) {
            /* end of text: drop the reference to the None sentinel */
            Py_DECREF(word);
            return list;
        }
        if (PyList_Append(list, word) < 0) {
            Py_DECREF(word);
            goto err;
        }
        /* the list holds its own reference now */
        Py_DECREF(word);
    }

err:
    Py_DECREF(list);
    return NULL;
}
/* sq_concat: concatenation is not supported; always raises TypeError. */
static PyObject *
Splitter_concat(Splitter *self, PyObject *other)
{
    PyObject *const failure = NULL;

    PyErr_SetString(PyExc_TypeError, "Cannot concatenate Splitters.");
    return failure;
}
/* sq_repeat: repetition is not supported; always raises TypeError. */
static PyObject *
Splitter_repeat(Splitter *self, long n)
{
    PyObject *const failure = NULL;

    PyErr_SetString(PyExc_TypeError, "Cannot repeat Splitters.");
    return failure;
}
/*
  Map an input word to an output word by applying standard
  filtering/mapping words, including synonyms/stop words.

  Input is a word.

  Output (always a new reference) is:

     None -- The word is a stop word

     sometext -- A replacement for the word

  NULL is returned, with an exception set, when `word` is not a string.
*/
static PyObject *
check_synstop(Splitter *self, PyObject *word)
{
    PyObject *value;
    char *cword;
    int len;
    int hops = 0;

    cword = PyString_AsString(word);
    if (cword == NULL)
        return NULL;            /* not a string: TypeError is already set */
    len = PyString_Size(word);

    /* Single-letter words are stop words unless explicitly allowed. */
    if (len < 2 && !self->allow_single_chars) {
        Py_INCREF(Py_None);
        return Py_None;
    }

    /* Scan backwards for a letter; len ends below zero when there is none. */
    for (; --len >= 0 && !isalpha((unsigned char)cword[len]); )
        ;
    /* A word without letters is a stop word unless numbers are indexed. */
    if (len < 0 && !self->index_numbers) {
        Py_INCREF(Py_None);
        return Py_None;
    }

    Py_INCREF(word);

    if (self->synstop == NULL)
        return word;

    /* Follow the synonym chain: string -> string -> ... */
    for (;;) {
        value = PyObject_GetItem(self->synstop, word);
        if (value == NULL) {
            /* no (further) entry: the current word is the answer */
            PyErr_Clear();
            return word;
        }
        if (!PyString_Check(value)) {
            /* stop word marker; release the word we were holding */
            Py_DECREF(word);
            return value;       /* Which must be None! */
        }
        Py_DECREF(word);
        word = value;
        if (++hops > 100)
            return word;        /* Avoid infinite recursion */
    }
}
/*
 * Scan forward from self->here and return the next indexable word.
 *
 * Returns a new reference to the word (after check_synstop() mapping),
 * a new reference to None when the end of the text is reached, or NULL
 * with an exception set on failure.  self->index is advanced for every
 * word returned.  When `startpos` / `endpos` are not NULL they receive
 * the position of the first character of the word and the position just
 * past it.
 */
static PyObject *
next_word(Splitter *self, char **startpos, char **endpos)
{
    char wbuf[256];
    char *end, *here, *b;
    int i = 0, c;
    PyObject *pyword, *res;

    here = self->here;
    end = self->end;
    b = wbuf;
    while (here < end) {
        /* skip hyphens inside a word, together with whitespace after them */
        if ((i > 0) && (*here == '-')) {
            here++;
            /* test the bound first so *here is only read inside the text */
            while ((here < end) && myisspace(*here))
                here++;
            continue;
        }

        if (self->casefolding)
            c = mytolower(*here);
        else
            c = (*here);

        /* Check to see if this character is part of a word */
        if (myisalnum((unsigned char)c) || c == '/') { /* Found a word character */
            if (startpos && i == 0)
                *startpos = here;
            /* characters beyond max_len are counted but not stored */
            if (i++ < self->max_len)
                *b++ = c;
        } else if (i != 0) { /* We've found the end of a word */
            if (i >= self->max_len)
                i = self->max_len; /* "stem" the long word */

            pyword = PyString_FromStringAndSize(wbuf, i);
            if (pyword == NULL) {
                self->here = here;
                return NULL;
            }

            res = check_synstop(self, pyword);
            if (res == NULL) {
                self->here = here;
                Py_DECREF(pyword);
                return NULL;
            }

            if (res != Py_None) {
                if (endpos)
                    *endpos = here;
                self->here = here;
                Py_DECREF(pyword);
                self->index++;
                return res;
            }

            /* The word is a stopword, so ignore it */
            Py_DECREF(res);
            Py_DECREF(pyword);
            i = 0;
            b = wbuf;
        }
        here++;
    }

    self->here = here;

    /* We've reached the end of the string */
    if (i >= self->max_len)
        i = self->max_len; /* "stem" the long word */
    if (i == 0) {
        /* No words */
        self->here = here;
        Py_INCREF(Py_None);
        return Py_None;
    }

    pyword = PyString_FromStringAndSize(wbuf, i);
    if (pyword == NULL)
        return NULL;

    if (endpos)
        *endpos = here;

    res = check_synstop(self, pyword);
    Py_DECREF(pyword);
    /* res may be NULL on failure; never type-check a NULL pointer */
    if (res != NULL && PyString_Check(res))
        self->index++;
    return res;
}
/*
 * sq_item: return the word at position `i` (new reference).
 *
 * Raises IndexError when `i` is negative or past the last word.  The
 * text is rescanned from the start when `i` is not ahead of the current
 * position.
 */
static PyObject *
Splitter_item(Splitter *self, int i)
{
    PyObject *word = NULL;

    /* Without this check the loop below would not run at all and NULL
       would be returned with no exception set. */
    if (i < 0) {
        PyErr_SetString(PyExc_IndexError,
                        "Splitter index out of range");
        return NULL;
    }

    if (i <= self->index)
        Splitter_reset(self);

    while (self->index < i) {
        Py_XDECREF(word);

        word = next_word(self, NULL, NULL);
        if (word == NULL)
            return NULL;
        if (word == Py_None) {
            Py_DECREF(word);
            PyErr_SetString(PyExc_IndexError,
                            "Splitter index out of range");
            return NULL;
        }
    }

    return word;
}
/* sq_slice: slicing is not supported; always raises TypeError. */
static PyObject *
Splitter_slice(Splitter *self, int i, int j)
{
    PyObject *const failure = NULL;

    PyErr_SetString(PyExc_TypeError, "Cannot slice Splitters.");
    return failure;
}
/*
 * Sequence protocol table for Splitter objects: length and item access
 * are implemented; concat, repeat and slice always raise TypeError; item
 * and slice assignment are not provided.
 */
static PySequenceMethods Splitter_as_sequence = {
(inquiry)Splitter_length, /*sq_length*/
(binaryfunc)Splitter_concat, /*sq_concat*/
(intargfunc)Splitter_repeat, /*sq_repeat*/
(intargfunc)Splitter_item, /*sq_item*/
(intintargfunc)Splitter_slice, /*sq_slice*/
(intobjargproc)0, /*sq_ass_item*/
(intintobjargproc)0, /*sq_ass_slice*/
};
/*
 * pos(index) -- return a (start, end) tuple with the offsets of word
 * number `index` inside the text.
 *
 * Raises IndexError when `index` is negative or past the last word.
 */
static PyObject *
Splitter_pos(Splitter *self, PyObject *args)
{
    char *start = NULL, *end = NULL, *ctext;
    PyObject *res;
    int i;

    if (!PyArg_Parse(args, "i", &i))
        return NULL;

    /* A negative index would skip the loop and leave start/end unset. */
    if (i < 0) {
        PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
        return NULL;
    }

    if (i <= self->index)
        Splitter_reset(self);

    while (self->index < i) {
        res = next_word(self, &start, &end);
        if (res == NULL)
            return NULL;
        if (PyString_Check(res)) {
            /* next_word() has already advanced self->index for this
               word; advancing it again here would skip positions. */
            Py_DECREF(res);
            continue;
        }
        Py_DECREF(res);
        PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
        return NULL;
    }

    ctext = PyString_AsString(self->text);
    /* "i" expects int arguments, so narrow the pointer differences. */
    return Py_BuildValue("(ii)", (int)(start - ctext), (int)(end - ctext));
}
/*
 * indexes(word) -- return a new list with the positions at which `word`
 * (after stop word / synonym mapping) occurs in the text.
 *
 * Returns NULL with an exception set on failure.
 */
static PyObject *
Splitter_indexes(Splitter *self, PyObject *args)
{
    PyObject *word, *r, *w = NULL, *index;
    int i = 0;

    if (!PyArg_ParseTuple(args, "O", &word))
        return NULL;
    r = PyList_New(0);
    if (r == NULL)
        return NULL;

    /* check_synstop() hands back a new reference that we must release. */
    word = check_synstop(self, word);
    if (word == NULL) {
        Py_DECREF(r);
        return NULL;
    }

    Splitter_reset(self);
    for (;;) {
        Py_XDECREF(w);
        w = next_word(self, NULL, NULL);
        if (w == NULL)
            goto err;
        if (!PyString_Check(w))
            break;              /* end of text */
        if (PyObject_Compare(word, w) == 0) {
            index = PyInt_FromLong(i);
            if (index == NULL)
                goto err;
            if (PyList_Append(r, index) < 0) {
                Py_DECREF(index);
                goto err;
            }
            Py_DECREF(index);   /* the list keeps its own reference */
        }
        i++;
    }
    Py_DECREF(w);
    Py_DECREF(word);
    return r;

err:
    Py_XDECREF(w);
    Py_DECREF(word);
    Py_DECREF(r);
    return NULL;
}
/*
 * Methods exposed on Splitter objects.  "split" and "pos" are registered
 * with flags 0, "indexes" with METH_VARARGS (it parses an argument tuple).
 */
static struct PyMethodDef Splitter_methods[] =
{
{ "split", (PyCFunction)Splitter_split, 0,
"split() -- Split the string in one run"
},
{ "pos", (PyCFunction)Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token"
},
{ "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
"indexes(word) -- Return al list of the indexes of word in the sequence",
},
{ NULL, NULL } /* sentinel */
};
/* tp_getattr: resolve `name` against the Splitter_methods table. */
static PyObject *
Splitter_getattr(Splitter *self, char *name)
{
    PyObject *owner = (PyObject *)self;

    return Py_FindMethod(Splitter_methods, owner, name);
}
/* Type documentation string (empty). */
static char SplitterType__doc__[] = "";
/*
 * Type object for Splitter instances.  Only deallocation, attribute
 * lookup and the sequence protocol are provided; all other slots are 0.
 * NOTE(review): the type pointer is initialised to NULL here and is not
 * assigned in initISO_8859_1_Splitter() -- confirm that this is intended.
 */
static PyTypeObject SplitterType = {
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"Splitter", /*tp_name*/
sizeof(Splitter), /*tp_basicsize*/
0, /*tp_itemsize*/
/* methods */
(destructor)Splitter_dealloc, /*tp_dealloc*/
(printfunc)0, /*tp_print*/
(getattrfunc)Splitter_getattr, /*tp_getattr*/
(setattrfunc)0, /*tp_setattr*/
(cmpfunc)0, /*tp_compare*/
(reprfunc)0, /*tp_repr*/
0, /*tp_as_number*/
&Splitter_as_sequence, /*tp_as_sequence*/
0, /*tp_as_mapping*/
(hashfunc)0, /*tp_hash*/
(ternaryfunc)0, /*tp_call*/
(reprfunc)0, /*tp_str*/
/* Space for future expansion */
0L,0L,0L,0L,
SplitterType__doc__ /* Documentation string */
};
/* Keyword names accepted by the factory, in positional order. */
static char *splitter_args[] = {
    "doc", "synstop", "encoding", "singlechar",
    "indexnumbers", "maxlen", "casefolding", NULL
};

/*
 * Module-level factory: build a Splitter over str(doc).
 *
 * The flag arguments must be 0 or 1 and maxlen must lie in 1..128,
 * otherwise ValueError is raised.  `encoding` is parsed but not used.
 * Returns a new Splitter, or NULL with an exception set.
 */
static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args, PyObject *keywds)
{
    Splitter *self;
    PyObject *doc;
    PyObject *synstop = NULL;
    char *encoding = "latin1";
    int single_char = 0;
    int index_numbers = 0;
    int max_len = 64;
    int casefolding = 1;
    const char *complaint = NULL;

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|Osiiii", splitter_args,
                                     &doc, &synstop, &encoding, &single_char,
                                     &index_numbers, &max_len, &casefolding))
        return NULL;

    /* Validate in a fixed order; the first violation is the one reported. */
    if (index_numbers < 0 || index_numbers > 1)
        complaint = "indexnumbers must be 0 or 1";
    else if (casefolding < 0 || casefolding > 1)
        complaint = "casefolding must be 0 or 1";
    else if (single_char < 0 || single_char > 1)
        complaint = "singlechar must be 0 or 1";
    else if (max_len < 1 || max_len > 128)
        complaint = "maxlen must be between 1 and 128";

    if (complaint != NULL) {
        PyErr_SetString(PyExc_ValueError, complaint);
        return NULL;
    }

    self = PyObject_NEW(Splitter, &SplitterType);
    if (self == NULL)
        return NULL;

    /* Both object fields are set before any error exit so that the
       deallocator only ever sees valid pointers or NULL. */
    Py_XINCREF(synstop);
    self->synstop = synstop;

    self->text = PyObject_Str(doc);
    if (self->text == NULL)
        goto err;

    self->here = PyString_AsString(self->text);
    if (self->here == NULL)
        goto err;

    self->end = self->here + PyString_Size(self->text);
    self->allow_single_chars = single_char;
    self->index_numbers = index_numbers;
    self->max_len = max_len;
    self->casefolding = casefolding;
    self->index = -1;

    return (PyObject *)self;

err:
    Py_DECREF(self);
    return NULL;
}
/* Functions exported by the module: only the Splitter factory, which
   accepts positional and keyword arguments. */
static struct PyMethodDef Splitter_module_methods[] =
{
{ "ISO_8859_1_Splitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ISO_8859_1_Splitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen][,casefolding]) -- Return a word splitter"
},
{ NULL, NULL }
};
/* Module docstring handed to Py_InitModule4(). */
static char Splitter_module_documentation[] =
"Parse source strings into sequences of words\n"
"\n"
"for use in an inverted index\n"
"\n"
"$Id$\n"
;
/* Module entry point: fill the character tables, then register the module. */
void
initISO_8859_1_Splitter(void)
{
    /* The tables must be ready before any Splitter scans text. */
    initSplitterTrtabs();

    /* Create the module and add the functions; the module handle is not
       needed afterwards. */
    (void)Py_InitModule4("ISO_8859_1_Splitter", Splitter_module_methods,
                         Splitter_module_documentation,
                         (PyObject *)NULL, PYTHON_API_VERSION);
}
<extension UnicodeSplitter>
source src/UnicodeSplitter.c
</extension>
/*****************************************************************************
Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/
#include "Python.h"
/* Smaller of two values; both arguments may be evaluated twice. */
#ifndef min
#define min(a,b) ((a)<(b)?(a):(b))
#endif
/* State of one Unicode splitter object. */
typedef struct
{
PyObject_HEAD
/* List of words, filled by splitUnicodeString(). */
PyObject *list;
/* Optional stop word / synonym dictionary (may be NULL). */
PyObject *synstop;
/* Maximum number of characters kept per word. */
int max_len;
/* When zero, one-character words are dropped. */
int allow_single_chars;
/* When non-zero, a word may start with a digit as well as a letter. */
int index_numbers;
/* When non-zero, the text is lower-cased before splitting. */
int casefolding;
}
Splitter;
/* Forward declaration; the definition follows fixlower(). */
static
PyUnicodeObject *prepareString(Splitter *self, PyUnicodeObject *o);
/*
 * Map `word` through the single-character filter and the synstop
 * dictionary.
 *
 * Always returns a borrowed reference: Py_None when the word is to be
 * dropped, the dictionary value when synstop has an entry for the word,
 * and `word` itself otherwise.
 */
static PyObject *checkSynword(Splitter *self, PyObject *word)
{
    PyObject *value;

    if (PyUnicode_GetSize(word) == 1 && !self->allow_single_chars) {
        /* Borrowed like every other result: callers only compare the
           result against Py_None and never release it, so taking a
           reference here would leak it. */
        return Py_None;
    }

    if (self->synstop) {
        value = PyDict_GetItem(self->synstop, word);
        if (value != NULL) {
            return value;
        }
    }
    return word;
}
/* tp_dealloc: release the word list and the optional synstop mapping,
   then free the object itself. */
static void
Splitter_dealloc(Splitter *self)
{
    PyObject *words = self->list;
    PyObject *synstop = self->synstop;

    Py_XDECREF(words);
    Py_XDECREF(synstop);
    PyObject_DEL(self);
}
/* sq_length: number of entries in the word list. */
static int
Splitter_length(Splitter *self)
{
    int count = PyList_Size(self->list);

    return count;
}
/* sq_concat: concatenation is not supported; always raises TypeError. */
static PyObject *
Splitter_concat(Splitter *self, PyObject *other)
{
    PyObject *const failure = NULL;

    PyErr_SetString(PyExc_TypeError, "Cannot concatenate Splitters.");
    return failure;
}
/* sq_repeat: repetition is not supported; always raises TypeError. */
static PyObject *
Splitter_repeat(Splitter *self, long n)
{
    PyObject *const failure = NULL;

    PyErr_SetString(PyExc_TypeError, "Cannot repeat Splitters.");
    return failure;
}
/* sq_item: return a new reference to entry `i` of the word list, or NULL
   with the exception raised by PyList_GetItem(). */
static PyObject *
Splitter_item(Splitter *self, int i)
{
    PyObject *entry = PyList_GetItem(self->list, i);

    /* PyList_GetItem() lends its result; take our own reference on success */
    if (entry != NULL)
        Py_INCREF(entry);
    return entry;
}
/* split() -- return the splitter's own word list (new reference to the
   same list object, not a copy). */
static PyObject *
Splitter_split(Splitter *self)
{
    PyObject *words = self->list;

    Py_INCREF(words);
    return words;
}
/*
 * indexes(word) -- return a new list with the positions at which `word`
 * occurs in the word list.
 *
 * Returns NULL with an exception set on failure.
 */
static PyObject *
Splitter_indexes(Splitter *self, PyObject *args)
{
    int i, size, cmp, status;
    PyObject *word = NULL, *item, *r, *index;

    if (!PyArg_ParseTuple(args, "O", &word))
        return NULL;
    r = PyList_New(0);
    if (r == NULL)
        return NULL;

    size = PyList_Size(self->list);
    for (i = 0; i < size; i++) {
        item = PyList_GET_ITEM(self->list, i);    /* borrowed */

        cmp = PyUnicode_Compare(word, item);
        if (cmp == -1 && PyErr_Occurred())
            goto err;           /* the comparison itself failed */
        if (cmp != 0)
            continue;

        index = PyInt_FromLong(i);
        if (index == NULL)
            goto err;
        status = PyList_Append(r, index);
        Py_DECREF(index);       /* the list keeps its own reference */
        if (status < 0)
            goto err;
    }
    return r;

err:
    Py_DECREF(r);
    return NULL;
}
/* sq_slice: slicing is not supported; always raises TypeError. */
static PyObject *
Splitter_slice(Splitter *self, int i, int j)
{
    PyObject *const failure = NULL;

    PyErr_SetString(PyExc_TypeError, "Cannot slice Splitters.");
    return failure;
}
/*
 * Sequence protocol table for Splitter objects: length and item access
 * are implemented; concat, repeat and slice always raise TypeError; item
 * and slice assignment are not provided.
 */
static PySequenceMethods Splitter_as_sequence = {
(inquiry)Splitter_length, /*sq_length*/
(binaryfunc)Splitter_concat, /*sq_concat*/
(intargfunc)Splitter_repeat, /*sq_repeat*/
(intargfunc)Splitter_item, /*sq_item*/
(intintargfunc)Splitter_slice, /*sq_slice*/
(intobjargproc)0, /*sq_ass_item*/
(intintobjargproc)0, /*sq_ass_slice*/
};
/*
 * Methods exposed on Splitter objects.  "split" is registered with
 * flags 0, "indexes" with METH_VARARGS (it parses an argument tuple).
 */
static struct PyMethodDef Splitter_methods[] =
{
{ "split", (PyCFunction) Splitter_split, 0,
"split() -- Split string in one run" },
{ "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
"indexes(word) -- Return a list of the indexes of word in the sequence",
},
{ NULL, NULL } /* sentinel */
};
/* tp_getattr: resolve `name` against the Splitter_methods table. */
static PyObject *
Splitter_getattr(Splitter *self, char *name)
{
    PyObject *owner = (PyObject *)self;

    return Py_FindMethod(Splitter_methods, owner, name);
}
/* Type documentation string (empty). */
static char SplitterType__doc__[] = "";
/*
 * Type object for Splitter instances.  Only deallocation, attribute
 * lookup and the sequence protocol are provided; all other slots are 0.
 */
static PyTypeObject SplitterType = {
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"Splitter", /*tp_name*/
sizeof(Splitter), /*tp_basicsize*/
0, /*tp_itemsize*/
/* methods */
(destructor)Splitter_dealloc, /*tp_dealloc*/
(printfunc)0, /*tp_print*/
(getattrfunc)Splitter_getattr, /*tp_getattr*/
(setattrfunc)0, /*tp_setattr*/
(cmpfunc)0, /*tp_compare*/
(reprfunc)0, /*tp_repr*/
0, /*tp_as_number*/
&Splitter_as_sequence, /*tp_as_sequence*/
0, /*tp_as_mapping*/
(hashfunc)0, /*tp_hash*/
(ternaryfunc)0, /*tp_call*/
(reprfunc)0, /*tp_str*/
/* Space for future expansion */
0L,0L,0L,0L,
SplitterType__doc__ /* Documentation string */
};
/*
 * Split `doc` into words and store them in a new list at self->list.
 *
 * A word starts at a letter (or at any alphanumeric character when
 * index_numbers is set) and continues over alphanumerics, '/', '_' and
 * '-'.  Each word is cut to max_len characters and passed through
 * checkSynword(); results equal to Py_None are dropped.
 *
 * Returns 1 on success and -1 on failure.
 */
static int splitUnicodeString(Splitter *self, PyUnicodeObject *doc)
{
    PyObject *word, *synword;
    PyUnicodeObject *doc1;
    Py_UNICODE *s;

    int len = doc->length;
    int inside_word = 0;
    int i = 0;
    int start = 0;

    /* Work on a (possibly lower-cased) copy of the document. */
    doc1 = prepareString(self, doc);
    if (doc1 == NULL)
        return -1;

    s = doc1->str;

    self->list = PyList_New(0);
    if (self->list == NULL)
        goto err;

    for (i = 0; i < len; s++, i++) {
        register Py_UNICODE ch;

        ch = *s;

        if (!inside_word) {
            if (self->index_numbers) {
                if (Py_UNICODE_ISALNUM(ch)) {
                    inside_word = 1;
                    start = i;
                }
            } else {
                if (Py_UNICODE_ISALPHA(ch)) {
                    inside_word = 1;
                    start = i;
                }
            }
        } else {
            if (!(Py_UNICODE_ISALNUM(ch) || ch == '/' || ch == '_' || ch == '-')) {
                inside_word = 0;

                word = PySequence_GetSlice((PyObject *)doc1, start,
                                           min(i, start + self->max_len));
                if (word == NULL)
                    goto err;

                /* synword is borrowed; the list takes its own reference */
                synword = checkSynword(self, word);
                if (synword != Py_None &&
                    PyList_Append(self->list, synword) < 0) {
                    Py_DECREF(word);
                    goto err;
                }

                start = 0;
#ifdef DEBUG
                PyObject_Print(word, stdout, 0);
                fflush(stdout);
#endif
                Py_DECREF(word);
            }
        }
    }

    /* The text ended in the middle of a word: emit that word too. */
    if (inside_word) {
        word = PySequence_GetSlice((PyObject *)doc1, start,
                                   min(len, start + self->max_len));
        if (word == NULL)
            goto err;

        synword = checkSynword(self, word);
        if (synword != Py_None &&
            PyList_Append(self->list, synword) < 0) {
            Py_DECREF(word);
            goto err;
        }

        Py_DECREF(word);
    }

#ifdef DEBUG
    PyObject_Print(self->list, stdout, 0);
    fflush(stdout);
#endif

    Py_DECREF(doc1);
    return 1;

err:
    Py_DECREF(doc1);
    return -1;
}
/* Lower-case every character of `self` in place. */
static
void fixlower(PyUnicodeObject *self)
{
    Py_UNICODE *chars = self->str;
    int count = self->length;
    int k;

    for (k = 0; k < count; k++) {
        Py_UNICODE lowered = Py_UNICODE_TOLOWER(chars[k]);

        /* Only write when the character actually changes. */
        if (lowered != chars[k])
            chars[k] = lowered;
    }
}
/*
 * Return a new reference to a copy of `o`, lower-cased when
 * self->casefolding is set.  Returns NULL (exception set) on failure.
 *
 * PyUnicode_FromUnicode(buffer, n) may hand back a shared object (for
 * example a cached one-character string), which must never be modified.
 * For case folding we therefore allocate with a NULL buffer -- the only
 * case in which the result may be written to -- copy the characters and
 * fold the private copy.
 */
static
PyUnicodeObject *prepareString(Splitter *self, PyUnicodeObject *o)
{
    PyUnicodeObject *u;
    int len = o->length;
    int k;

    if (!self->casefolding) {
        /* Result is only read, so a possibly shared object is fine. */
        return (PyUnicodeObject *) PyUnicode_FromUnicode(o->str, len);
    }

    u = (PyUnicodeObject *) PyUnicode_FromUnicode(NULL, len);
    if (u == NULL)
        return NULL;

    for (k = 0; k < len; k++)
        u->str[k] = o->str[k];

    fixlower(u);
    return u;
}
/* Keyword names accepted by UnicodeSplitter(), in positional order. */
static char *splitter_args[] = {
    "doc", "synstop", "encoding", "indexnumbers", "singlechar",
    "maxlen", "casefolding", NULL
};

/*
 * Module-level constructor: UnicodeSplitter(doc, ...).
 *
 * `doc` must be a byte string (decoded with `encoding`, default "latin1")
 * or a unicode object.  The flag arguments must be 0 or 1 and maxlen must
 * lie in 1..128, otherwise ValueError is raised.  The document is split
 * immediately; the new Splitter is returned, or NULL with an exception set.
 */
static PyObject *
newSplitter(PyObject *modinfo, PyObject *args, PyObject *keywds)
{
    Splitter *self = NULL;
    PyObject *doc = NULL, *unicodedoc = NULL, *synstop = NULL;
    char *encoding = "latin1";
    int index_numbers = 0;
    int max_len = 64;
    int single_char = 0;
    int casefolding = 1;

    if (! (PyArg_ParseTupleAndKeywords(args, keywds, "O|Osiiii", splitter_args,
                                       &doc, &synstop, &encoding,
                                       &index_numbers, &single_char,
                                       &max_len, &casefolding)))
        return NULL;

#ifdef DEBUG
    puts("got text");
    PyObject_Print(doc, stdout, 0);
    fflush(stdout);
#endif

    if (index_numbers < 0 || index_numbers > 1) {
        PyErr_SetString(PyExc_ValueError, "indexnumbers must be 0 or 1");
        return NULL;
    }

    if (casefolding < 0 || casefolding > 1) {
        PyErr_SetString(PyExc_ValueError, "casefolding must be 0 or 1");
        return NULL;
    }

    if (single_char < 0 || single_char > 1) {
        PyErr_SetString(PyExc_ValueError, "singlechar must be 0 or 1");
        return NULL;
    }

    if (max_len < 1 || max_len > 128) {
        PyErr_SetString(PyExc_ValueError, "maxlen must be between 1 and 128");
        return NULL;
    }

    /* Obtain an owned unicode version of the document. */
    if (PyString_Check(doc)) {
        unicodedoc = PyUnicode_FromEncodedObject(doc, encoding, "strict");
        if (unicodedoc == NULL) {
            PyErr_SetString(PyExc_UnicodeError, "Problem converting encoded string");
            return NULL;
        }
    } else if (PyUnicode_Check(doc)) {
        unicodedoc = doc;
        Py_INCREF(unicodedoc);
    } else {
        PyErr_SetString(PyExc_TypeError, "first argument is neither string nor unicode.");
        return NULL;
    }

    if (! (self = PyObject_NEW(Splitter, &SplitterType))) {
        Py_DECREF(unicodedoc);   /* was leaked on allocation failure */
        return NULL;
    }

    /* splitUnicodeString() can fail before it assigns self->list, so make
       sure the field never holds an uninitialised pointer. */
    self->list = NULL;

    if (synstop) {
        self->synstop = synstop;
        Py_INCREF(synstop);
    } else
        self->synstop = NULL;

    self->index_numbers = index_numbers;
    self->max_len = max_len;
    self->allow_single_chars = single_char;
    self->casefolding = casefolding;

    if ((splitUnicodeString(self, (PyUnicodeObject *)unicodedoc)) < 0)
        goto err;

    Py_DECREF(unicodedoc);
    return (PyObject *)self;

err:
    Py_DECREF(self);
    Py_DECREF(unicodedoc);
    return NULL;
}
/* Functions exported by the UnicodeSplitter module.  The usage string
   lists the optional arguments in the positional order defined by
   splitter_args (singlechar comes before maxlen). */
static struct PyMethodDef Splitter_module_methods[] =
{
    { "UnicodeSplitter", (PyCFunction)newSplitter,
      METH_VARARGS|METH_KEYWORDS,
      "UnicodeSplitter(doc[,synstop][,encoding='latin1'][,indexnumbers][,singlechar][,maxlen][,casefolding]) "
      "-- Return a word splitter"
    },
    { NULL, NULL }
};
/* Module docstring passed to Py_InitModule4() in initUnicodeSplitter(). */
static char Splitter_module_documentation[] =
"Parse source (unicode) string into sequences of words\n"
"\n"
"for use in an inverted index\n"
"\n"
"$Id$\n"
;
/*
 * Module initialisation: register the module functions and publish
 * "__version__", taken from the text between "$Revision: " and " $".
 */
void
initUnicodeSplitter(void)
{
    PyObject *m, *d, *version;
    char *rev = "$Revision: 1.16 $";

    /* Create the module and add the functions */
    m = Py_InitModule4("UnicodeSplitter", Splitter_module_methods,
                       Splitter_module_documentation,
                       (PyObject*)NULL, PYTHON_API_VERSION);
    if (m == NULL)
        return;     /* exception already set; do not dereference NULL */

    /* Add some symbolic constants to the module */
    d = PyModule_GetDict(m);

    /* PyDict_SetItemString() does not take over our reference, so it
       has to be released here (it was leaked before). */
    version = PyString_FromStringAndSize(rev+11, strlen(rev+11)-2);
    if (version != NULL) {
        PyDict_SetItemString(d, "__version__", version);
        Py_DECREF(version);
    }

    if (PyErr_Occurred()) Py_FatalError("can't initialize module Splitter");
}
# -*- coding: ISO-8859-1 -*-
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
import os,sys,unittest
from Products.PluginIndexes.TextIndex.Splitter.UnicodeSplitter.UnicodeSplitter \
import UnicodeSplitter
class UnicodeSplitterTests(unittest.TestCase):
    # Exercises UnicodeSplitter on plain sentences and with a
    # stop-word/synonym mapping.

    def setUp(self):
        samples = (
            'The quick brown fox jumps over the lazy dog',
            'Bei den dreitgigen Angriffen seien auch bis'
            ' auf einen alle Flugpltze der Taliban zerstrt worden',
        )

        # Pair every sample with the lower-cased unicode words expected
        # from a simple split on single spaces.
        self.testdata = []
        for sample in samples:
            words = []
            for piece in sample.lower().split(' '):
                words.append(unicode(piece, 'latin1'))
            self.testdata.append((sample, words))

    def testSimpleSplit(self):
        """ testing splitter functionality """
        for sample, expected in self.testdata:
            fields = list(UnicodeSplitter(sample))
            assert fields == expected, "%s vs %s" % (fields,expected)

        return 0

    def testStopwords(self):
        """ testing splitter with stopwords """
        # 'the' maps to None (dropped); 'dog' is replaced by 'cat'.
        sw_dict = {'the':None,'dog':'cat'}
        text = 'The quick brown fox jumps over The lazy dog'
        expected = [u'quick', u'brown', u'fox', u'jumps', u'over',
                    u'lazy', u'cat']

        splitter = UnicodeSplitter(text, sw_dict)
        self.assertEquals(list(splitter), expected)
        self.assertEquals(splitter.indexes('jumps'), [3])
def test_suite():
    """Return a suite holding all UnicodeSplitterTests test methods."""
    suite = unittest.makeSuite(UnicodeSplitterTests)
    return suite
def debug():
    """Run the suite through its debug() method and return the result."""
    suite = test_suite()
    return suite.debug()
def pdebug():
    """Run debug() under the pdb debugger."""
    from pdb import run
    run('debug()')
def main():
    """Run the suite with a plain text test runner."""
    runner = unittest.TextTestRunner()
    runner.run(test_suite())
if __name__ == '__main__':
    # An optional first command-line argument names the module-level
    # function to call (e.g. "debug" or "pdebug"); default is main().
    if len(sys.argv) <= 1:
        main()
    else:
        globals()[sys.argv[1]]()
<extension ZopeSplitter>
source src/ZopeSplitter.c
</extension>
from ZopeSplitter import ZopeSplitter
def Splitter(txt,stopwords={},encoding="latin1"):
    """Return a ZopeSplitter over ``txt`` using ``stopwords``.

    ``encoding`` is accepted but is not passed on to ZopeSplitter.
    """
    splitter = ZopeSplitter(txt, stopwords)
    return splitter
/*****************************************************************************
Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
This software is subject to the provisions of the Zope Public License,
Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
FOR A PARTICULAR PURPOSE
****************************************************************************/
#include "Python.h"
#include <ctype.h>
/* ASSIGN(V,E): evaluate E, release the object V currently refers to
   (if any) and store the new value in V. */
#define ASSIGN(V,E) {PyObject *__e; __e=(E); Py_XDECREF(V); (V)=__e;}
/* UNLESS(E) stmt: run stmt when E is false. */
#define UNLESS(E) if(!(E))
/* UNLESS_ASSIGN(V,E) stmt: ASSIGN(V,E), then run stmt when V is NULL. */
#define UNLESS_ASSIGN(V,E) ASSIGN(V,E) UNLESS(V)
/* State of one ZopeSplitter object. */
typedef struct
{
    PyObject_HEAD
    PyObject *text;           /* string form of the document being split */
    PyObject *synstop;        /* optional synonym/stop-word mapping, or NULL */
    char *here;               /* current scan position inside text */
    char *end;                /* one past the last character of text */
    int index;                /* index of the last word returned; -1 after a reset */
    int allow_single_chars;   /* keep words shorter than two characters */
    int index_numbers;        /* keep words that contain no letters */
    int max_len;              /* words are cut to this many characters */
    int casefolding;          /* lower-case characters while scanning */
}
Splitter;
static PyObject *next_word(Splitter *, char **, char **);
/* Rewind the splitter: scanning restarts at the beginning of the text
   and no word has been returned yet (index -1). */
static void
Splitter_reset(Splitter *self)
{
    self->index = -1;
    self->here = PyString_AsString(self->text);
}
/* tp_dealloc slot: drop the references held by the splitter (either may
   be NULL) and free the object itself. */
static void
Splitter_dealloc(Splitter *self)
{
    PyObject *text = self->text;
    PyObject *synstop = self->synstop;

    Py_XDECREF(text);
    Py_XDECREF(synstop);
    PyObject_DEL(self);
}
/* sq_length slot: count the words by scanning the whole text from the
   start.  Returns -1 when next_word() fails. */
static int
Splitter_length(Splitter *self)
{
    PyObject *token = NULL;

    Splitter_reset(self);

    for (;;) {
        PyObject *next = next_word(self, NULL, NULL);

        /* Release the previous word before looking at the new result. */
        Py_XDECREF(token);
        token = next;
        if (token == NULL)
            return -1;

        /* Anything that is not a string marks the end of the text. */
        if (!PyString_Check(token)) {
            Py_DECREF(token);
            break;
        }
    }

    return self->index + 1;
}
/* sq_concat slot: concatenation is unsupported; always raises TypeError. */
static PyObject *
Splitter_concat(Splitter *self, PyObject *other)
{
    (void)self;    /* arguments are intentionally unused */
    (void)other;

    PyErr_SetString(PyExc_TypeError, "Cannot concatenate Splitters.");
    return NULL;
}
/* sq_repeat slot: repetition is unsupported; always raises TypeError. */
static PyObject *
Splitter_repeat(Splitter *self, long n)
{
    (void)self;    /* arguments are intentionally unused */
    (void)n;

    PyErr_SetString(PyExc_TypeError, "Cannot repeat Splitters.");
    return NULL;
}
/*
 * Map an input word to an output word by applying standard
 * filtering/mapping words, including synonyms/stop words.
 *
 * Input is a word (a string object).
 *
 * Output is a new reference to:
 *
 *   None     -- the word is a stop word
 *   sometext -- the word itself or its replacement
 */
static PyObject *
check_synstop(Splitter *self, PyObject *word)
{
    PyObject *value;
    char *cword;
    int len;

    cword = PyString_AS_STRING(word);
    len = PyString_GET_SIZE(word);

    /* Single-letter words are stop words unless explicitly allowed. */
    if (len < 2 && !self->allow_single_chars) {
        Py_INCREF(Py_None);
        return Py_None;
    }

    /* Test whether the word has any letters: scan backwards until a
       letter is found; len < 0 afterwards means there was none. */
    for (; --len >= 0 && !isalpha((unsigned char)cword[len]); )
        ;
    /* If no letters, treat it as a stop word (unless numbers are indexed). */
    if (len < 0 && !self->index_numbers) {
        Py_INCREF(Py_None);
        return Py_None;
    }

    Py_INCREF(word);

    if (self->synstop == NULL)
        return word;

    /* Follow chains of string replacements; `word` always holds one
       owned reference to the current candidate. */
    len = 0;
    while ((value = PyObject_GetItem(self->synstop, word)) &&
           PyString_Check(value)) {
        Py_DECREF(word);
        word = value;

        if (len++ > 100)
            break;      /* Avoid infinite recursion */
    }

    if (value == NULL) {
        /* No (further) mapping for this word: keep it as it is. */
        PyErr_Clear();
        return word;
    }

    /* Either the chain limit was hit (value and word are the same
       object, holding a single reference) or the lookup produced a
       non-string such as None.  In the latter case the reference still
       held on `word` has to be dropped; it was leaked before. */
    if (value != word)
        Py_DECREF(word);

    return value;
}
/*
 * Scan forward from self->here and return the next word that survives
 * check_synstop().
 *
 * Word characters are alphanumerics, '/' and '_'.  A '-' inside a word is
 * skipped together with any whitespace that follows it.  Characters
 * beyond self->max_len are dropped.  When `startpos`/`endpos` are not
 * NULL they receive the start and end of the word inside the text buffer.
 *
 * Returns a new reference: the word (self->index is advanced), or None
 * when the end of the text has been reached.  Returns NULL with an
 * exception set on failure.
 */
static PyObject *
next_word(Splitter *self, char **startpos, char **endpos)
{
    char wbuf[256];
    char *end, *here, *b;
    int i = 0, c;
    PyObject *pyword, *res;

    here = self->here;
    end = self->end;
    b = wbuf;

    while (here < end) {
        /* skip hyphens */
        if ((i > 0) && (*here == '-')) {
            here++;
            /* Check the bound before looking at the character. */
            while ((here < end) && isspace((unsigned char) *here))
                here++;
            continue;
        }

        if (self->casefolding)
            c = tolower((unsigned char) *here);
        else
            c = (unsigned char) *here;

        /* Check to see if this character is part of a word */
        if (isalnum((unsigned char)c) || c == '/' || c == '_') {
            /* Found a word character */
            if (startpos && i == 0)
                *startpos = here;

            /* i keeps counting past max_len; only the first max_len
               characters are stored. */
            if (i++ < self->max_len)
                *b++ = c;
        } else if (i != 0) {
            /* We've found the end of a word */
            if (i >= self->max_len)
                i = self->max_len;      /* "stem" the long word */

            UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
                self->here = here;
                return NULL;
            }

            UNLESS(res = check_synstop(self, pyword)) {
                self->here = here;
                Py_DECREF(pyword);
                return NULL;
            }

            if (res != Py_None) {
                if (endpos)
                    *endpos = here;
                self->here = here;
                Py_DECREF(pyword);
                self->index++;
                return res;
            }

            /* The word is a stopword, so ignore it */
            Py_DECREF(res);
            Py_DECREF(pyword);
            i = 0;
            b = wbuf;
        }

        here++;
    }

    self->here = here;

    /* We've reached the end of the string */
    if (i >= self->max_len)
        i = self->max_len;      /* "stem" the long word */

    if (i == 0) {
        /* No words */
        Py_INCREF(Py_None);
        return Py_None;
    }

    UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) return NULL;

    if (endpos)
        *endpos = here;

    res = check_synstop(self, pyword);
    Py_DECREF(pyword);

    if (res != NULL && PyString_Check(res))
        self->index++;

    return res;
}
/*
 * sq_item slot: return word number `i` as a new reference.
 *
 * Scanning only moves forward, so asking for a word at or before the
 * current position restarts from the beginning.  Raises IndexError when
 * `i` is negative or beyond the last word.
 */
static PyObject *
Splitter_item(Splitter *self, int i)
{
    PyObject *word = NULL;

    /* Without this check a negative index skipped the loop below and
       returned NULL with no exception set. */
    if (i < 0) {
        PyErr_SetString(PyExc_IndexError,
                        "Splitter index out of range");
        return NULL;
    }

    if (i <= self->index)
        Splitter_reset(self);

    while (self->index < i) {
        Py_XDECREF(word);

        UNLESS(word = next_word(self, NULL, NULL)) return NULL;

        if (word == Py_None) {
            Py_DECREF(word);
            PyErr_SetString(PyExc_IndexError,
                            "Splitter index out of range");
            return NULL;
        }
    }

    return word;
}
/*
 * split(): return a new list with all words of the text, scanning from
 * the beginning.  Returns NULL with an exception set on failure.
 */
static PyObject *
Splitter_split(Splitter *self)
{
    PyObject *list, *word;

    UNLESS(list = PyList_New(0)) return NULL;

    Splitter_reset(self);

    for (;;) {
        UNLESS(word = next_word(self, NULL, NULL)) goto err;

        if (word == Py_None) {
            /* End of text: release the None we were handed. */
            Py_DECREF(word);
            return list;
        }

        /* PyList_Append() takes its own reference to the word. */
        if (PyList_Append(list, word) < 0) {
            Py_DECREF(word);
            goto err;
        }
        Py_DECREF(word);
    }

err:
    /* The partially built list was leaked on this path before. */
    Py_DECREF(list);
    return NULL;
}
/* sq_slice slot: slicing is unsupported; always raises TypeError. */
static PyObject *
Splitter_slice(Splitter *self, int i, int j)
{
    (void)self;   /* arguments are intentionally unused */
    (void)i;
    (void)j;

    PyErr_SetString(PyExc_TypeError, "Cannot slice Splitters.");
    return NULL;
}
/* Sequence protocol table for ZopeSplitter objects.  Entries are
   positional.  Length and item access do the real work; concat, repeat
   and slice always raise TypeError; item and slice assignment are left
   empty (0). */
static PySequenceMethods Splitter_as_sequence = {
(inquiry)Splitter_length, /*sq_length*/
(binaryfunc)Splitter_concat, /*sq_concat*/
(intargfunc)Splitter_repeat, /*sq_repeat*/
(intargfunc)Splitter_item, /*sq_item*/
(intintargfunc)Splitter_slice, /*sq_slice*/
(intobjargproc)0, /*sq_ass_item*/
(intintobjargproc)0, /*sq_ass_slice*/
};
/*
 * pos(index): return (start, end) offsets of word number `index` inside
 * the text.  Raises IndexError when the index is negative or beyond the
 * last word.
 */
static PyObject *
Splitter_pos(Splitter *self, PyObject *args)
{
    char *start = NULL, *end = NULL, *ctext;
    PyObject *res;
    int i;

    UNLESS(PyArg_Parse(args, "i", &i)) return NULL;

    /* A negative index would skip the loop and leave start/end unset. */
    if (i < 0) {
        PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
        return NULL;
    }

    /* Because i >= 0 and the scan restarts whenever i <= index, the loop
       below always runs at least once and so always sets start and end. */
    if (i <= self->index)
        Splitter_reset(self);

    while (self->index < i) {
        UNLESS(res = next_word(self, &start, &end)) return NULL;

        if (PyString_Check(res)) {
            /* next_word() has already advanced self->index; advancing it
               again here made pos() skip every other word. */
            Py_DECREF(res);
            continue;
        }

        Py_DECREF(res);
        PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
        return NULL;
    }

    ctext = PyString_AsString(self->text);

    /* The "i" format expects int arguments. */
    return Py_BuildValue("(ii)", (int)(start - ctext), (int)(end - ctext));
}
/*
 * indexes(word): return a list with the positions at which `word` occurs
 * in the word sequence.  The word is first passed through
 * check_synstop(), i.e. it is compared in its filtered/replaced form.
 * Raises TypeError when `word` is not a string.
 */
static PyObject *
Splitter_indexes(Splitter *self, PyObject *args)
{
    PyObject *word, *r, *w = 0, *index = 0;
    int i = 0;

    UNLESS(PyArg_ParseTuple(args, "O", &word)) return NULL;

    /* check_synstop() reads the argument as a string without checking. */
    UNLESS(PyString_Check(word)) {
        PyErr_SetString(PyExc_TypeError,
                        "indexes() argument must be a string");
        return NULL;
    }

    UNLESS(r = PyList_New(0)) return NULL;

    /* From here on `word` is an owned reference that must be released
       on every exit path (it was leaked before). */
    UNLESS(word = check_synstop(self, word)) {
        Py_DECREF(r);
        return NULL;
    }

    Splitter_reset(self);

    while (1) {
        UNLESS_ASSIGN(w, next_word(self, NULL, NULL)) goto err;
        UNLESS(PyString_Check(w)) break;

        if (PyObject_Compare(word, w) == 0) {
            UNLESS_ASSIGN(index, PyInt_FromLong(i)) goto err;
            if (PyList_Append(r, index) < 0)
                goto err;
        }

        i++;
    }

    Py_XDECREF(w);
    Py_XDECREF(index);
    Py_DECREF(word);
    return r;

err:
    Py_XDECREF(w);
    Py_XDECREF(index);
    Py_DECREF(word);
    Py_DECREF(r);
    return NULL;
}
/* Methods exposed on ZopeSplitter instances; looked up by name from
   Splitter_getattr().  "split" and "pos" are registered with flags 0
   (Splitter_pos parses its argument with PyArg_Parse), "indexes" with
   METH_VARARGS.  The table ends with a NULL sentinel. */
static struct PyMethodDef Splitter_methods[] =
{
{ "split", (PyCFunction)Splitter_split, 0,
"split() -- Split complete string in one run"
},
{ "pos", (PyCFunction)Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token"
},
{ "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
"indexes(word) -- Return a list of the indexes of word in the sequence",
},
{ NULL, NULL } /* sentinel */
};
/* tp_getattr slot: resolve `name` against the Splitter method table. */
static PyObject *
Splitter_getattr(Splitter *self, char *name)
{
    PyObject *owner = (PyObject *)self;

    return Py_FindMethod(Splitter_methods, owner, name);
}
/* Docstring of the Splitter type (intentionally empty). */
static char SplitterType__doc__[] = "";
/* Type object for ZopeSplitter instances.  Entries are positional.  Only
   deallocation, attribute lookup (tp_getattr) and the sequence protocol
   are provided; every other slot is left empty. */
static PyTypeObject SplitterType = {
PyObject_HEAD_INIT(NULL)
0, /*ob_size*/
"Splitter", /*tp_name*/
sizeof(Splitter), /*tp_basicsize*/
0, /*tp_itemsize*/
/* methods */
(destructor)Splitter_dealloc, /*tp_dealloc*/
(printfunc)0, /*tp_print*/
(getattrfunc)Splitter_getattr, /*tp_getattr*/
(setattrfunc)0, /*tp_setattr*/
(cmpfunc)0, /*tp_compare*/
(reprfunc)0, /*tp_repr*/
0, /*tp_as_number*/
&Splitter_as_sequence, /*tp_as_sequence*/
0, /*tp_as_mapping*/
(hashfunc)0, /*tp_hash*/
(ternaryfunc)0, /*tp_call*/
(reprfunc)0, /*tp_str*/
/* Space for future expansion */
0L,0L,0L,0L,
SplitterType__doc__ /* Documentation string */
};
/* Keyword names accepted by ZopeSplitter(), in positional order. */
static char *splitter_args[] = {
    "doc", "synstop", "encoding", "singlechar", "indexnumbers",
    "maxlen", "casefolding", NULL
};

/*
 * Module-level constructor: ZopeSplitter(doc, ...).
 *
 * The flag arguments must be 0 or 1 and maxlen must lie in 1..128,
 * otherwise ValueError is raised.  The document is converted with
 * PyObject_Str(); `encoding` is parsed but not used here.  Returns the
 * new Splitter, or NULL with an exception set.
 */
static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args, PyObject *keywds)
{
    Splitter *self;
    PyObject *doc;
    PyObject *synstop = NULL;
    char *encoding = "latin1";
    int single_char = 0;
    int index_numbers = 0;
    int max_len = 64;
    int casefolding = 1;

    if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|Osiiii", splitter_args,
                                     &doc, &synstop, &encoding,
                                     &single_char, &index_numbers,
                                     &max_len, &casefolding))
        return NULL;

    /* Range checks; the first violated one determines the error. */
    if (index_numbers < 0 || index_numbers > 1) {
        PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
        return NULL;
    }
    if (casefolding < 0 || casefolding > 1) {
        PyErr_SetString(PyExc_ValueError,"casefolding must be 0 or 1");
        return NULL;
    }
    if (single_char < 0 || single_char > 1) {
        PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
        return NULL;
    }
    if (max_len < 1 || max_len > 128) {
        PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
        return NULL;
    }

    self = PyObject_NEW(Splitter, &SplitterType);
    if (self == NULL)
        return NULL;

    /* Keep our own reference to the mapping when one was supplied. */
    self->synstop = synstop;
    Py_XINCREF(synstop);

    self->text = PyObject_Str(doc);
    if (self->text == NULL)
        goto err;

    self->here = PyString_AS_STRING(self->text);
    if (self->here == NULL)
        goto err;

    self->end = self->here + PyString_GET_SIZE(self->text);
    self->index = -1;
    self->allow_single_chars = single_char;
    self->index_numbers = index_numbers;
    self->max_len = max_len;
    self->casefolding = casefolding;

    return (PyObject *)self;

err:
    Py_DECREF(self);
    return NULL;
}
/* Functions exported by the ZopeSplitter module: only the constructor,
   which accepts positional and keyword arguments.  The optional
   arguments in the usage string follow the order of splitter_args. */
static struct PyMethodDef Splitter_module_methods[] =
{
{ "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen][,casefolding]) -- Return a word splitter"
},
{ NULL, NULL }
};
/* Module docstring passed to Py_InitModule4() in initZopeSplitter(). */
static char Splitter_module_documentation[] =
"Parse source strings into sequences of words\n"
"\n"
"for use in an inverted index\n"
"\n"
"$Id$\n"
;
/* Module initialisation: register the "ZopeSplitter" module with its
   function table and docstring. */
void
initZopeSplitter(void)
{
    (void)Py_InitModule4("ZopeSplitter",
                         Splitter_module_methods,
                         Splitter_module_documentation,
                         NULL,
                         PYTHON_API_VERSION);
}
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
#############################################################################
import os,sys,exceptions
# (module name, human readable description) for every bundled splitter.
# The first entry is the default used by getSplitter().
availableSplitters = (
    ("ZopeSplitter", "Zope Default Splitter"),
    ("ISO_8859_1_Splitter", "Werner Strobls ISO-8859-1 Splitter"),
    ("UnicodeSplitter", "Unicode-aware splitter"),
)

# Just the module names, in the same order.
splitterNames = list(entry[0] for entry in availableSplitters)
def getSplitter(name=None):
    """Return the splitter named ``name``.

    A false ``name`` selects the first entry of ``splitterNames``.  An
    unknown name raises RuntimeError.  The splitter is imported on demand
    from the module ``<name>.<name>``.
    """
    if name and name not in splitterNames:
        raise exceptions.RuntimeError("No such splitter '%s'" % name)

    if not name:
        name = splitterNames[0]

    # The import is executed in this function's namespace and the result
    # is then read back from vars().
    if name not in vars():
        exec( "from %s.%s import %s" % (name,name,name))

    return vars()[name]
#!/usr/bin/env python
from distutils.core import setup,Extension
import os,exceptions,commands,sys
# Build settings; they are defined here but not passed to setup() below.
CFLAGS = []
LFLAGS = []
LIBS = []

# One C extension per splitter, each built from a single source file.
setup(
    name = "Splitter",
    version = "1.0",
    description = "Splitters for Zope 2.5",
    author = "Andreas Jung",
    author_email = "andreas@zope.com",
    url = "http://www.zope.org/...",
    ext_modules = [
        Extension("ZopeSplitter",
                  ['ZopeSplitter/src/ZopeSplitter.c']),
        Extension("ISO_8859_1_Splitter",
                  ['ISO_8859_1_Splitter/src/ISO_8859_1_Splitter.c']),
        Extension("UnicodeSplitter",
                  ['UnicodeSplitter/src/UnicodeSplitter.c']),
    ],
)
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Deprecated text index. Please use ZCTextIndex instead.
$Id$
"""
from cgi import escape
from logging import getLogger
import operator
import re
import warnings
from types import *
from Acquisition import Implicit
from App.special_dtml import DTMLFile
from OFS.SimpleItem import SimpleItem
from BTrees.IIBTree import difference
from BTrees.IIBTree import IIBTree
from BTrees.IIBTree import IIBucket
from BTrees.IIBTree import IISet
from BTrees.IIBTree import weightedIntersection
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from Persistence import Persistent
from zope.interface import implements
from Products.PluginIndexes.common import safe_callable
from Products.PluginIndexes.common.ResultList import ResultList
from Products.PluginIndexes.common.util import parseIndexRequest
from Products.PluginIndexes.interfaces import IPluggableIndex
from Products.PluginIndexes.interfaces import ITextIndex
from Products.PluginIndexes.TextIndex.Lexicon import Lexicon
# Module-level logger named 'TextIndex'.
LOG = getLogger('TextIndex')
class Op:
    # Marker object for a query operator; it prints as its name.

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        label = self.name
        return label

    # str() and repr() give the same text.
    __str__ = __repr__
# One marker object per supported query operator.
AndNot = Op('andnot')
And = Op('and')
Or = Op('or')
Near = Op('...')

QueryError = 'TextIndex.QueryError'

# Maps operator names, and the marker objects themselves, to the marker
# objects.  Both '...' and 'near' select the Near operator.
operator_dict = {
    'andnot': AndNot,
    'and': And,
    'or': Or,
    '...': Near,
    'near': Near,
    AndNot: AndNot,
    And: And,
    Or: Or,
    Near: Near,
}
class TextIndex(Persistent, Implicit, SimpleItem):
"""Full-text index.
There is a ZCatalog UML model that sheds some light on what is
going on here. '_index' is a BTree which maps word ids to mapping
from document id to score. Something like:
{'bob' : {1 : 5, 2 : 3, 42 : 9}}
{'uncle' : {1 : 1}}
The '_unindex' attribute is a mapping from document id to word
ids. This mapping allows the catalog to unindex an object:
{42 : ('bob', 'is', 'your', 'uncle')}
This isn't exactly how things are represented in memory, many
optimizations happen along the way.
"""
# Declare the interfaces provided by this index.
implements(ITextIndex, IPluggableIndex)
# Type name of this index class.
meta_type='TextIndex'
# Keys passed to parseIndexRequest() in _apply_index().
query_options = ('query', 'operator')
# A single "Settings" entry whose action is manage_main.
manage_options= (
{'label': 'Settings',
'action': 'manage_main',
'help': ('TextIndex','TextIndex_Settings.stx')},
)
def __init__(self, id, ignore_ex=None, call_methods=None, lexicon=None,
             caller=None, extra=None):
    """Create an index

    The arguments are:

    'id' -- the name of the item attribute to index.  This is
    either an attribute name or a record key.

    'ignore_ex' -- Tells the indexer to ignore exceptions that
    are raised when indexing an object.

    'call_methods' -- Tells the indexer to call methods instead
    of getattr or getitem to get an attribute.

    'lexicon' -- the lexicon object to use; if None, the index
    will use a private lexicon.

    'caller' -- instance that created the index (may be None)

    'extra' -- Record to keep additional parameters
    """
    self.id = id
    self.ignore_ex = ignore_ex
    self.call_methods = call_methods
    self.catalog = caller

    # Default text index operator (should be visible to ZMI)
    self.useOperator = 'or'

    # The vocabulary name comes from 'extra' when that is given.
    if extra:
        self.vocabulary_id = extra.vocabulary
    else:
        self.vocabulary_id = "Vocabulary"

    self._lexicon = None
    self.clear()

    if lexicon is not None:
        # We need to hold a reference to the lexicon, since we can't
        # really change lexicons.
        self._lexicon = lexicon
        self.vocabulary_id = '__userdefined__'
def getId(self):
    # Name of the indexed attribute, as passed to __init__.
    index_id = self.id
    return index_id
def getLexicon(self, vocab_id=None):
    """Get the Lexicon in use.

    The lexicon is looked up once and then cached on the index.  It is
    taken from the vocabulary named by 'vocabulary_id' on the catalog;
    when that lookup fails for any reason, a private Lexicon is created
    and 'vocabulary_id' is set to '__intern__'.  'vocab_id' is not used.
    """
    if self._lexicon is None:
        ## if no lexicon is provided, create a default one
        try:
            if self.catalog is None:
                self.catalog = self.aq_inner.aq_parent.aq_base

            self._lexicon = getattr(self.catalog,self.vocabulary_id).getLexicon()
        except Exception:
            # Best-effort fallback.  Unlike a bare "except:", this lets
            # KeyboardInterrupt and SystemExit propagate.
            self._lexicon = Lexicon()
            self.vocabulary_id = '__intern__'

    return self._lexicon
def __nonzero__(self):
    # The index is true when at least one document has been indexed.
    return bool(self._unindex)
def clear(self):
"""Reinitialize the text index."""
# Start over with empty word-id and document-id mappings.
self._index = IOBTree()
self._unindex = IOBTree()
# Only a user-supplied lexicon ('__userdefined__') is emptied as well.
if self.getLexicon() and self.vocabulary_id=='__userdefined__':
self.getLexicon().clear()
# NOTE(review): indentation is lost in this view -- confirm whether the
# reset below belongs inside the "if" or runs unconditionally.
self._lexicon = None
def _convertBTrees(self, threshold=200):
    # Rebuild _index and _unindex as IOBTrees using BTrees.convert.
    # Nothing is rebuilt when _index already is an IOBTree.
    if type(self._lexicon) is type(''):
        # Turn the name reference into a hard reference.
        self._lexicon = self.getLexicon()

    if type(self._index) is IOBTree:
        return

    from BTrees.convert import convert

    _index = self._index
    self._index = IOBTree()

    def convertScores(scores, type=type, IIBTree=IIBTree):
        # Tuples and existing IIBTrees are kept; anything else is wrapped.
        # (The old test compared against an IIBTree *instance*, which is
        # never identical to a type, so every non-tuple was re-wrapped.)
        if type(scores) is not tuple and type(scores) is not IIBTree:
            scores = IIBTree(scores)
        return scores

    convert(_index, self._index, threshold, convertScores)

    _unindex = self._unindex
    self._unindex = IOBTree()
    convert(_unindex, self._unindex, threshold)
def histogram(self, type=type):
    """Return a mapping which provides a histogram of the number of
    elements found at each point in the index."""
    counts = IIBucket()

    for (key, value) in self._index.items():
        # A tuple row stands for exactly one document.
        size = 1
        if type(value) is not tuple:
            size = len(value)

        counts[size] = counts.get(size, 0) + 1

    return counts
def getEntryForObject(self, rid, default=None):
    """Get all information contained for a specific object.

    This takes the object's record ID as its main argument and returns
    the indexed words as a tuple, or 'default' for an unknown record."""
    wids = self._unindex.get(rid, None)
    if wids is None:
        return default

    # Translate the stored word ids back into words.
    getWord = self.getLexicon().getWord
    return tuple([getWord(wid) for wid in wids])
def insertForwardIndexEntry(self, entry, documentId, score=1):
    """Uses the information provided to update the indexes.

    The basic logic for choice of data structure is based on
    the number of entries as follows:

        1      tuple
        2-3    dictionary
        4+     bucket.
    """
    index = self._index
    row = index.get(entry, None)

    if row is None:
        # Nothing stored for this entry yet: start with a space-saving
        # (documentId, score) tuple.
        index[entry] = (documentId, score)
        return

    if type(row) is tuple:
        # A tuple holds exactly one document.
        if row[0] != documentId:
            # A second document arrives: promote to a dictionary.
            index[entry] = {
                row[0]: row[1],
                documentId: score,
            }
        elif row[1] != score:
            # Same document, new score.
            index[entry] = (documentId, score)
        return

    if row.get(documentId, -1) == score:
        # Already stored with this score.
        return

    # score changed (or new entry)
    if type(row) is dict:
        row[documentId] = score
        if len(row) > 3:
            # Big enough to give it its own database record
            row = IIBTree(row)
        index[entry] = row
    else:
        row[documentId] = score
def index_object(self, documentId, obj, threshold=None):
    """ Index an object:

    'documentId' is the integer id of the document

    'obj' is the object to be indexed

    'threshold' is the number of words to process between
    committing subtransactions.  If 'None' subtransactions are
    disabled.

    Returns the number of distinct words indexed, or 0 when the
    source attribute cannot be read. """
    # sniff the object for our 'id', the 'document source' of the
    # index is this attribute.  If it smells callable, call it.
    try:
        source = getattr(obj, self.id)
        if safe_callable(source):
            source = source()

        if not isinstance(source, unicode):
            source = str(source)
    except (AttributeError, TypeError):
        return 0

    # sniff the object for 'id'+'_encoding'
    try:
        encoding = getattr(obj, self.id+'_encoding')
        if safe_callable(encoding):
            encoding = encoding()
        encoding = str(encoding)
    except (AttributeError, TypeError):
        encoding = 'latin1'

    lexicon = self.getLexicon()
    splitter = lexicon.Splitter

    # Run through the words and score them; a word that merely repeats
    # the previous one is not counted again.
    wordScores = OIBTree()
    last = None
    for word in list(splitter(source,encoding=encoding)):
        if word[0] == '\"':
            last = self._subindex(word[1:-1], wordScores, last, splitter)
        elif word != last:
            last = word
            wordScores[word] = wordScores.get(word, 0) + 1

    # Convert scores to use wids:
    widScores = IIBucket()
    getWid = lexicon.getWordId
    for word, score in wordScores.items():
        widScores[getWid(word)] = score

    del wordScores

    currentWids = IISet(self._unindex.get(documentId, []))

    # Get rid of document words that are no longer indexed
    stale = difference(currentWids, widScores)
    self.unindex_objectWids(documentId, stale)

    # Now index the words.  Note that the new xIBTrees are clever
    # enough to do nothing when there isn't a change.
    insert = self.insertForwardIndexEntry
    for wid, score in widScores.items():
        insert(wid, documentId, score)

    # Save the unindexing info if it's changed:
    wids = widScores.keys()
    if wids != currentWids.keys():
        self._unindex[documentId] = wids

    return len(wids)
def _subindex(self, source, wordScores, last, splitter):
    """Recursively handle multi-word synonyms"""
    for word in splitter(source):
        if word[0] == '\"':
            # A quoted group: score its content recursively.
            last = self._subindex(word[1:-1], wordScores, last, splitter)
        elif word != last:
            # Count the word unless it merely repeats the previous one.
            last = word
            wordScores[word] = wordScores.get(word, 0) + 1

    return last
def unindex_object(self, i):
    """ carefully unindex document with integer id 'i' from the text
    index and do not fail if it does not exist """
    unindex = self._unindex

    wids = unindex.get(i, None)
    if wids is None:
        # Unknown document: nothing to remove.
        return

    self.unindex_objectWids(i, wids)
    del unindex[i]
def unindex_objectWids(self, i, wids):
    """ carefully remove document 'i' from the rows of the given word
    ids and do not fail if an entry does not exist """
    index = self._index

    for wid in wids:
        row = index.get(wid, None)

        if row is None:
            LOG.error('unindex_object tried to unindex nonexistent'
                      ' document, wid %s, %s' % (i,wid))
            continue

        if type(row) is tuple:
            # A tuple row holds a single document: drop the whole row.
            del index[wid]
            continue

        try:
            del row[i]
            if not row:
                # No documents left for this word.
                del index[wid]
            elif type(row) is dict:
                if len(row) == 1:
                    # convert to tuple
                    row = list(row.items())[0]
                index[wid] = row
        except (KeyError, IndexError, TypeError):
            LOG.error('unindex_object tried to unindex nonexistent'
                      ' document %s' % str(i))
def __getitem__(self, word):
    """Return an InvertedIndex-style result "list"

    Note that this differentiates between being passed an Integer
    and a String.  Strings are looked up in the lexicon, whereas
    Integers are assumed to be resolved word ids. """
    if type(word) is IntType:
        # We have a word ID
        return ResultList(self._index.get(word, {}), (word,), self)

    tokens = tuple(self.getLexicon().Splitter(word))
    if not tokens:
        return ResultList({}, (word,), self)

    if len(tokens) == 1:
        token = tokens[0]

        # A token that is still quoted is looked up again as a whole.
        if token[:1] == '"' and token[-1:] == '"':
            return self[token]

        wids = self.getLexicon().get(token)
        row = None
        if wids:
            row = self._index.get(wids[0], None)
        if row is None:
            row = {}

        return ResultList(row, (token,), self)

    # Several words: combine the individual results with near().
    combined = None
    for token in tokens:
        partial = self[token]
        if combined is None:
            combined = partial
        else:
            combined = combined.near(partial)

    return combined
def _apply_index(self, request):
    """ Apply the index to query parameters given in the argument,
    request

    The argument should be a mapping object.

    If the request does not contain the needed parameters, then
    None is returned.

    Otherwise two objects are returned.  The first object is a
    ResultSet containing the record numbers of the matching
    records.  The second object is a tuple containing the names of
    all data fields used.

    Raises RuntimeError when an unknown operator is requested.
    """
    record = parseIndexRequest(request, self.id, self.query_options)
    if record.keys is None:
        return None

    # Changed for 2.4
    # We use the default operator that can be managed via the ZMI
    qop = record.get('operator', self.useOperator)

    # We keep this for pre-2.4 compatibility
    # This stinking code should go away somewhere.  A global
    # textindex_operator makes no sense when using multiple
    # text indexes inside a catalog.  An index operator should
    # be specified on a per-index base
    if request.has_key('textindex_operator'):
        qop = request['textindex_operator']
        warnings.warn("The usage of the 'textindex_operator' "
                      "is no longer recommended.\n"
                      "Please use a mapping object and the "
                      "'operator' key to specify the operator.")

    query_operator = operator_dict.get(qop)
    if query_operator is None:
        # The 'exceptions' module is not imported in this file, so the
        # former "exceptions.RuntimeError" failed with a NameError.
        raise RuntimeError("Invalid operator '%s' "
                           "for a TextIndex" % escape(qop))

    # Intersect the results of all non-empty query keys.
    r = None
    for key in record.keys:
        key = key.strip()
        if not key:
            continue

        b = self.query(key, query_operator).bucket()
        w, r = weightedIntersection(r, b)

    if r is not None:
        return r, (self.id,)

    return (IIBucket(), (self.id,))
def positions(self, docid, words,
              # This was never tested: obj
              ):
    """Return the positions in the document for the given document
    id of the word, word.

    Position lookup is not implemented: the result is always [1]."""
    # NOTE: a real implementation needs the indexed object itself, which
    # requires an API change.  The unreachable draft that used to follow
    # the return statement referenced names that do not exist here
    # ('obj', 'self._schema') and has been removed.
    return [1]
def query(self, s, default_operator=Or):
    """ Evaluate a query string.

    Convert the query string into a data structure of nested lists
    and strings, based on the grouping of whitespace-separated
    strings by parentheses and quotes.  The 'Near' operator is
    inserted between the strings of a quoted group.

    The Lexicon is given the opportunity to transform the
    data structure.  Stemming, wildcards, and translation are
    possible Lexicon services.

    Finally, the query list is normalized so that it and every
    sub-list consist of non-operator strings or lists separated
    by operators.  This list is evaluated.

    s -- the query string
    default_operator -- operator inserted between two adjacent search
                        terms that are not already joined by one

    Returns whatever self.evaluate() produces for the parsed query.
    """
    # First replace any occurrences of " and not " with " andnot "
    # (case-insensitive).  The pattern is a raw string so that the
    # backslash escapes reach the regex engine untouched.
    s = re.sub(r'(?i)\s+and\s*not\s+', ' andnot ', s)

    # Parse parentheses and quotes
    q = parse(s)

    # Allow the Lexicon to process the query
    q = self.getLexicon().query_hook(q)

    # Insert the default operator between any two search terms not
    # already joined by an operator.
    q = parse2(q, default_operator)

    # Evaluate the final 'expression'
    return self.evaluate(q)
def get_operands(self, q, i):
    """Evaluate and return the left and right operands for an operator.

    q -- the (partially reduced) query list
    i -- index of the operator inside q

    Integer and string operands are resolved through self[...]; operands
    that are plain lists are evaluated recursively with self.evaluate();
    anything else is passed through unchanged.

    Returns a (left, right) tuple.
    Raises QueryError when the operator lacks an operand on either side.
    """
    # An operator in first position has no left operand.  Without this
    # guard q[i - 1] would silently wrap around to the last element
    # instead of raising IndexError.
    if i < 1:
        raise QueryError("Malformed query")
    try:
        left = q[i - 1]
        right = q[i + 1]
    except IndexError:
        raise QueryError("Malformed query")

    resolved = []
    for operand in (left, right):
        operand_type = type(operand)
        if operand_type is IntType or isinstance(operand, (str, unicode)):
            operand = self[operand]
        elif operand_type is list:
            operand = self.evaluate(operand)
        resolved.append(operand)
    return tuple(resolved)
def evaluate(self, query):
    """Reduce a parsed query to a single result.

    Single-element lists are unwrapped first.  A query that is not a list
    is looked up directly through self[...].  Otherwise the list is
    reduced in place, one operator kind at a time, in the order
    AndNot -> And -> Or -> Near.

    Raises QueryError if the list cannot be reduced to one element.
    """
    # Strip off meaningless layers
    while isinstance(query, list) and len(query) == 1:
        query = query[0]

    # Anything that is not a list is assumed to be a string or number
    if not isinstance(query, list):
        return self[query]

    # (operator, combining function) pairs in evaluation order
    reductions = (
        (AndNot, lambda left, right: left.and_not(right)),
        (And, lambda left, right: left & right),
        (Or, lambda left, right: left | right),
        (Near, lambda left, right: left.near(right)),
    )

    for op, combine in reductions:
        pos = 0
        while pos < len(query):
            if query[pos] is not op:
                pos = pos + 1
                continue
            left, right = self.get_operands(query, pos)
            # Replace "left op right" by its value; pos now points at the
            # element that followed the right operand, so it is not advanced.
            query[pos - 1:pos + 2] = [combine(left, right)]

    if len(query) != 1:
        raise QueryError("Malformed query")

    return query[0]
def getIndexSourceNames(self):
    """Return the names of the indexed attributes: a 1-tuple with self.id."""
    source_name = self.id
    return (source_name,)
def numObjects(self):
    """Return the number of entries held in self._index."""
    entries = self._index
    return len(entries)
def manage_setPreferences(self, vocabulary,
                          REQUEST=None, RESPONSE=None, URL2=None):
    """Store the vocabulary selected for this TextIndex.

    When the vocabulary id actually changes, the index is cleared before
    the new id is recorded.  If a RESPONSE is supplied, the browser is
    redirected to the management screen below URL2.
    """
    vocabulary_changed = self.vocabulary_id != vocabulary
    if vocabulary_changed:
        self.clear()
        self.vocabulary_id = vocabulary

    if not RESPONSE:
        return
    target = URL2 + '/manage_main?manage_tabs_message=Preferences%20saved'
    RESPONSE.redirect(target)
# Management screens built from the DTML templates in the dtml/ directory.
# 'manage' and 'manage_main' are bound to the same template object, which is
# then explicitly named 'manage_main'.
manage = manage_main = DTMLFile("dtml/manageTextIndex",globals())
manage_main._setName('manage_main')
manage_vocabulary = DTMLFile("dtml/manageVocabulary",globals())
def parse(s):
    """Parse parentheses and quotes.

    The lower-cased query string is turned into a list: text outside
    parentheses is tokenized by quotes(), and each parenthesized group
    becomes a nested list produced by a recursive call.
    """
    tokens = []
    remainder = s.lower()
    group = parens(remainder)
    while group is not None:
        before, inside, remainder = group
        # Tokenize what precedes the group, then recurse into the group
        tokens.extend(quotes(before))
        tokens.append(parse(inside))
        # Keep scanning the text that follows the closing parenthesis
        group = parens(remainder)
    tokens.extend(quotes(remainder))
    return tokens
def parse2(q, default_operator, operator_dict=operator_dict):
    """Find operators and operands.

    Normalizes the list q in place so that operands sit at even
    positions and operators at odd positions, inserting default_operator
    wherever two operands are adjacent.  Nested lists are normalized
    recursively.

    q -- list of strings and nested lists
    default_operator -- operator inserted between adjacent operands
    operator_dict -- mapping from operator spellings to operator objects

    Returns q.
    """
    i = 0
    while i < len(q):
        e = q[i]
        if isinstance(e, list):
            # Pass operator_dict along so nested groups honour a
            # caller-supplied mapping as well.
            q[i] = parse2(e, default_operator, operator_dict)
            if i % 2:
                q.insert(i, default_operator)
                i = i + 1
        elif i % 2:
            # This element should be an operator
            if e in operator_dict:
                # Ensure that it is identical, not merely equal.
                q[i] = operator_dict[e]
            else:
                # Insert the default operator.
                q.insert(i, default_operator)
                i = i + 1
        i = i + 1

    return q
def parens(s, parens_re=re.compile('[()]').search):
    """Locate the first balanced parenthesized group in s.

    Returns None when s contains no parenthesis, otherwise the tuple
    (text before the group, text inside the group, text after the group).
    Raises QueryError when the parentheses do not balance.
    """
    match = parens_re(s)
    if match is None:
        return None

    first = match.start(0)
    depth = 0
    while match is not None:
        pos = match.start(0)
        if s[pos] == '(':
            depth += 1
        else:
            depth -= 1
            if depth == 0:
                return (s[:first], s[first + 1:pos], s[pos + 1:])
            if depth < 0:
                # A closing parenthesis came before any opening one
                break
        match = parens_re(s, pos + 1)

    raise QueryError("Mismatched parentheses")
def quotes(s):
    """Tokenize s, turning double-quoted regions into phrase lists.

    Text outside quotes is split on whitespace into plain words.  Each
    quoted region becomes a nested list of its words with the Near
    operator placed between consecutive words; empty quoted regions are
    dropped.

    Returns a list of words and phrase lists.
    Raises QueryError when the quotes are unbalanced.
    """
    if '"' not in s:
        return s.split()

    # Split up quoted regions: even pieces are unquoted text, odd pieces
    # are the contents of a quoted region.
    pieces = re.split(r'\s*"\s*', s)
    if len(pieces) % 2 == 0:
        raise QueryError("Mismatched quotes")

    result = []
    for idx, piece in enumerate(pieces):
        words = piece.split()
        if idx % 2 == 0:
            # Unquoted region: its words are added individually
            result.extend(words)
            continue
        # Quoted region: put the proximity operator between the words
        phrase = []
        for word in words:
            if phrase:
                phrase.append(Near)
            phrase.append(word)
        if phrase:
            result.append(phrase)
    return result
# Add-form for a TextIndex, built from the dtml/addTextIndex template.
manage_addTextIndexForm = DTMLFile('dtml/addTextIndex', globals())
def manage_addTextIndex(self, id, extra=None, REQUEST=None, RESPONSE=None, URL3=None):
    """Add a text index by delegating to manage_addIndex with type 'TextIndex'."""
    index_type = 'TextIndex'
    return self.manage_addIndex(id, index_type, extra,
                                REQUEST, RESPONSE, URL3)
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Vocabulary for deprecated text index.
$Id$
"""
from AccessControl.Permissions import manage_vocabulary
from AccessControl.Permissions import query_vocabulary
from AccessControl.Role import RoleManager
from AccessControl.SecurityInfo import ClassSecurityInfo
from Acquisition import Implicit
from App.class_init import InitializeClass
from App.Dialogs import MessageDialog
from App.special_dtml import DTMLFile
from Persistence import Persistent
from OFS.SimpleItem import Item
from zope.interface import implements
from Products.PluginIndexes.interfaces import IVocabulary
from Products.PluginIndexes.TextIndex import Lexicon, GlobbingLexicon
from Products.PluginIndexes.TextIndex.Lexicon import stop_word_dict
from Products.PluginIndexes.TextIndex import Splitter
# Add-form for a Vocabulary, built from the dtml/addVocabulary template.
manage_addVocabularyForm=DTMLFile('dtml/addVocabulary',globals())
def manage_addVocabulary(self, id, title, globbing=None, extra=None,
                         splitter='', REQUEST=None):
    """Create a Vocabulary and store it in this container under id.

    id and title are coerced to strings and a true globbing value is
    normalized to 1.  When a REQUEST is given, the container's
    management screen is rendered and returned.
    """
    id = str(id)
    title = str(title)
    if globbing:
        globbing = 1

    vocabulary = Vocabulary(id, title, globbing, splitter, extra)
    self._setObject(id, vocabulary)

    if REQUEST is None:
        return None
    return self.manage_main(self, REQUEST, update_menu=1)
# Bare attribute holder.  Vocabulary.__init__ instantiates it and sets the
# default splitter options on the instance when no 'extra' record is given.
class _extra: pass
class Vocabulary(Item, Persistent, Implicit, RoleManager):
    """A Vocabulary is a user-manageable realization of a Lexicon object.

    The instance wraps either a GlobbingLexicon or a plain Lexicon (chosen
    at construction time) and exposes management and query screens for it.
    """

    implements(IVocabulary)

    security = ClassSecurityInfo()
    # Managers may manage the vocabulary; anonymous users may query it.
    security.setPermissionDefault(manage_vocabulary, ('Manager',))
    security.setPermissionDefault(query_vocabulary, ('Anonymous', 'Manager',))

    meta_type = "Vocabulary"
    _isAVocabulary = 1

    # Management tabs: the two vocabulary screens followed by the tabs
    # inherited from Item and RoleManager.
    manage_options=(
        (
        {'label': 'Vocabulary', 'action': 'manage_main',
         'help' : ('ZCatalog', 'Vocabulary_Vocabulary.stx')},
        {'label': 'Query', 'action': 'manage_query',
         'help': ('ZCatalog', 'Vocabulary_Query.stx')},
        )
        + Item.manage_options
        + RoleManager.manage_options
        )

    security.declareProtected(manage_vocabulary, 'manage_main')
    manage_main = DTMLFile('dtml/manage_vocab', globals())

    security.declareProtected(manage_vocabulary, 'manage_query')
    manage_query = DTMLFile('dtml/vocab_query', globals())

    def __init__(self, id, title='', globbing=None,splitter=None,extra=None):
        """Create the lexicon to manage.

        id -- id of the vocabulary object
        title -- optional title
        globbing -- true to use a GlobbingLexicon instead of a Lexicon
        splitter -- splitter name; falls back to the first entry of
                    Splitter.splitterNames when empty
        extra -- object carrying the splitter options; when missing, an
                 _extra instance with default options is created
        """

        self.id = id
        self.title = title
        # Normalize to a real boolean
        self.globbing = not not globbing

        self.useSplitter = Splitter.splitterNames[0]
        if splitter:
            self.useSplitter = splitter

        if not extra:
            # Default splitter options: no numbers, no single characters,
            # case folding switched on.
            extra = _extra()
            extra.splitterIndexNumbers = 0
            extra.splitterSingleChars = 0
            extra.splitterCasefolding = 1

        if globbing:
            self.lexicon = GlobbingLexicon.GlobbingLexicon(
                useSplitter=self.useSplitter,extra=extra)
        else:
            self.lexicon = Lexicon.Lexicon(stop_word_dict,
                useSplitter=self.useSplitter,extra=extra)

    # Returns the wrapped lexicon object.
    def getLexicon(self):
        return self.lexicon

    security.declareProtected(query_vocabulary, 'query')
    def query(self, pattern):
        """Look up pattern in the lexicon and return str() of the result list.

        For a globbing vocabulary the list holds the lexicon's
        _inverseLex entry for every match; otherwise it holds pattern
        once per match.
        """
        result = []
        for x in self.lexicon.get(pattern):
            if self.globbing:
                result.append(self.lexicon._inverseLex[x])
            else:
                result.append(pattern)
        return str(result)

    def manage_insert(self, word='', URL1=None, RESPONSE=None):
        """Insert word into the vocabulary, then redirect to manage_main
        below URL1 if a RESPONSE is given."""
        self.insert(word)
        if RESPONSE:
            RESPONSE.redirect(URL1 + '/manage_main')

    # No-op placeholder; the arguments are ignored.
    def manage_stop_syn(self, stop_syn, REQUEST=None):
        pass

    # Adds word to the wrapped lexicon via its set() method.
    def insert(self, word=''):
        self.lexicon.set(word)

    # Returns the items of the lexicon's internal _lexicon mapping.
    def words(self):
        return self.lexicon._lexicon.items()

InitializeClass(Vocabulary)
# empty comment for winzip and friends
import warnings
# Module-level statement, so the deprecation warning is issued when this
# module is imported.  stacklevel=2 attributes the warning to the caller's
# frame rather than to this line.
warnings.warn('Using TextIndex is deprecated (will be removed in Zope '
              '2.12). Use ZCTextIndex instead.',
              DeprecationWarning,
              stacklevel=2)
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add TextIndex',
)">
<p class="form-help">
<strong>Note:</strong>
TextIndex is deprecated. It has been replaced by ZCTextIndex. Consider
using ZCTextIndex instead.
</p>
<p class="form-help">
<strong>Text Indexes</strong> break text up into individual words, and
are often referred to as full-text indexes. Text indexes
sort results by score, meaning they return hits in order
from the most relevant to the least relevant.
</p>
<form action="manage_addTextIndex" method="post" enctype="multipart/form-data">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Vocabulary
</div>
</td>
<td>
<dtml-let vocabs="superValues('Vocabulary')">
<dtml-if vocabs>
<select name="extra.vocabulary:record">
<dtml-in expr="superValues('Vocabulary')">
<option value="&dtml-id;">
&dtml-id; <dtml-var title fmt="(%s)" null html_quote>
</option>
</dtml-in>
</select>
<dtml-else>
<em class="std-text">Create a Vocabulary object first.</em>
</dtml-if>
</dtml-let>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Type
</div>
</td>
<td align="left" valign="top">
TextIndex
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add Vocabulary',
)">
<FORM ACTION="manage_addVocabulary" METHOD="POST">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Title
</div>
</td>
<td align="left" valign="top">
<input type="text" name="title" size="40" />
</td>
</tr>
<dtml-if availableSplitters>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Splitter
</div>
</td>
<td align="left" valign="top">
<select name="splitter">
<dtml-in availableSplitters>
<option value="&dtml-sequence-key;">&dtml-sequence-item;
</dtml-in>
</select>
</td>
</tr>
</dtml-if>
<tr>
<td align="left" valign="top">
<div class="form-label">
Index numbers
</div>
</td>
<td align="left" valign="top">
<select name="extra.splitterIndexNumbers:record:int">
<option value="0" selected>no
<option value="1">yes
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Index single characters
</div>
</td>
<td align="left" valign="top">
<select name="extra.splitterSingleChars:record:int" >
<option value="0" selected>no
<option value="1">yes
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Case-insensitive
</div>
</td>
<td align="left" valign="top">
<select name="extra.splitterCasefolding:record:int">
<option value="0" >no
<option value="1" selected>yes
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
globbing?
</div>
</td>
<td align="left" valign="top">
<input type="checkbox" name="globbing" />
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
<form method="post" action="manage_setPreferences">
<table border="0" cellspacing="2" cellpadding="2">
<tr>
<th align="left" width="20%">Vocabulary to use</th>
<td align="left">
<select name="vocabulary">
<dtml-in "superValues('Vocabulary')">
<dtml-if "getId()==vocabulary_id">
<option value="&dtml-id;" selected>
&dtml-id; <dtml-var title fmt="(%s)" null html_quote>
</option>
<dtml-else>
<option value="&dtml-id;">
&dtml-id; <dtml-var title fmt="(%s)" null html_quote>
</option>
</dtml-if>
</dtml-in>
</select>
</td>
<td>
<em>Warning:</em> changing the vocabulary only makes sense after
creating the index and before indexing any objects. The index will be cleared
if you change the vocabulary after objects have been indexed.
</td>
</tr>
<dtml-comment>
<tr>
<th align="left">Splitter</th>
<td>
<select name="splitter">
<dtml-in availableSplitters>
<dtml-if "_.getitem('sequence-key')==useSplitter">
<option value="&dtml-sequence-key;" selected>&dtml-sequence-item;
<dtml-else>
<option value="&dtml-sequence-key;">&dtml-sequence-item;
</dtml-if>
</dtml-in>
</select>
</td>
</tr>
<tr>
<th align="left">Default text operator</th>
<td>
<select name="text_operator">
<dtml-in "operators.keys()">
<dtml-if "_.getitem('sequence-item')==useOperator">
<option value="&dtml-sequence-item;" selected>&dtml-sequence-item;
<dtml-else>
<option value="&dtml-sequence-item;">&dtml-sequence-item;
</dtml-if>
</dtml-in>
</select>
</td>
</tr>
</dtml-comment>
<tr>
<td colspan="3">
<input type="submit" value="Save changes">
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Manage vocabulary of text index',
help_topic='addIndex.stx'
)">
<dtml-var "getLexicon('Vocabulary')">
<form action="manage_addTextIndex" method="post" enctype="multipart/form-data">
<table cellspacing="0" cellpadding="2" border="0">
</table>
</form>
<dtml-var manage_page_footer>
<dtml-call "RESPONSE.setHeader('content-type','text/html; charset=utf-8')">
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-text">
<dtml-let lexicon="getLexicon()">
<dtml-try>
<dtml-let x="lexicon.multi_wc"></dtml-let>
Globbing is <em>enabled</em>
<dtml-except>
Globbing is <em>disabled</em>
</dtml-try>
<dtml-if useSplitter>
, Splitter is <em>&dtml-useSplitter;</em>
</dtml-if>
<dtml-try>
, Index number=<dtml-var "lexicon.splitterParams.splitterIndexNumbers">
, Case-insensitive=<dtml-var "lexicon.splitterParams.splitterCasefolding">
, Index single characters=<dtml-var "lexicon.splitterParams.splitterSingleChars">
<dtml-except>
</dtml-try>
</dtml-let>
</p>
<dtml-if words>
<p class="form-text">
&dtml-id; contains <em><dtml-var words fmt=collection-length thousands_commas></em>
word(s).
</p>
<dtml-in words previous size=20 start=query_start >
<span class="list-nav">
<a href="&dtml-URL;?query_start=&dtml-previous-sequence-start-number;">
[Previous <dtml-var previous-sequence-size> entries]
</a>
</span>
</dtml-in>
<dtml-in words next size=20 start=query_start >
<span class="list-nav">
<a href="&dtml-URL;?query_start=&dtml-next-sequence-start-number;">
[Next <dtml-var next-sequence-size> entries]
</a>
</span>
</dtml-in>
<table width="100%" cellspacing="0" cellpadding="2" border="0">
<dtml-in words size=20 start=query_start >
<dtml-if name="sequence-start">
<tr class="list-header">
<td width="80%" align="left" valign="top">
<div class="list-item">Word</div></td>
<td width="20%" align="left" valign="top">
<div class="list-item">Word ID</div></td>
</tr>
</dtml-if>
<dtml-if name="sequence-odd"><tr class="row-normal">
<dtml-else><tr class="row-hilite"></dtml-if>
<td valign="top" align="left">
<div class="form-text">
<dtml-if "_.same_type(_['sequence-key'], 'x')">
&dtml-sequence-key;
<dtml-else>
<dtml-var "_['sequence-key'].encode('utf-8')" html_quote>
</dtml-if>
</div>
</td>
<td valign="top" align="left">
<div class="form-text">&dtml-sequence-item;</div>
</td>
</tr>
</dtml-in>
</table>
<dtml-in words previous size=20 start=query_start >
<div class="list-nav">
<a href="&dtml-URL;?query_start=&dtml-previous-sequence-start-number;">
[Previous <dtml-var previous-sequence-size> entries]
</a>
</div>
</dtml-in>
<dtml-in words next size=20 start=query_start >
<div class="list-nav">
<a href="&dtml-URL;?query_start=&dtml-next-sequence-start-number;">
[Next <dtml-var next-sequence-size> entries]
</a>
</div>
</dtml-in>
<dtml-else>
<p class="form-text">
There are no words in the Vocabulary.
</p>
</dtml-if>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<form action="query" method=POST>
<input type="text" name="pattern" size="20">
<div class="form-element">
<input class="form-element" type="submit" name="submit" value="Query">
</div>
</form>
<dtml-var manage_page_footer>
ZCatalog - searchResults: specifying parameters for a search query
The searchResults() method of the ZCatalog accepts parameters that
define a query to be made on that catalog. A query can either be
passed as keyword argument to searchResults(), as a mapping, or as
part of a Zope REQUEST object, typically from HTML forms.
The index of the catalog to query is either the name of the
keyword argument, a key in a mapping, or an attribute of a record
object.
Attributes of record objects
'query' -- either a sequence of objects or a single value to be
passed as query to the index (mandatory)
'operator' -- specifies the combination of search results when
query is a sequence of values. (optional, default: 'or').
Allowed values:
'and', 'or', 'andnot', 'near'
##############################################################################
#
# Copyright (c) 2003 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
# This file is needed to make this a package.
# -*- coding: ISO-8859-1 -*-
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
import os,sys
import unittest,locale
from Products.PluginIndexes.TextIndex import Splitter
class TestCase( unittest.TestCase ):
    """
    Test our Splitters
    """

    def setUp( self ):
        # Pairs of (input text, expected list of split words).
        # NOTE(review): the second sample looks like it has lost its
        # non-ASCII (ISO-8859-1) characters somewhere along the way; as it
        # stands it only exercises plain ASCII input -- confirm against the
        # original file.
        self.testdata = (
            ('The quick brown fox jumps over the lazy dog',
             ['the','quick','brown','fox','jumps','over','the','lazy','dog']),
            ( 'fters sterreichische herber berfall da rger verrgert',
              ['fters','sterreichische','herber','berfall','da','rger','verrgert'])
        )

    def tearDown( self ):
        """
        """

    def testAvailableSplitters( self ):
        "Test available splitters"
        # unittest assertion methods are used instead of bare asserts so
        # the checks are not stripped when Python runs with -O.
        self.assertTrue(len(Splitter.availableSplitters) > 0)
        self.assertTrue(len(Splitter.splitterNames) > 0)
        self.assertEqual(len(Splitter.availableSplitters),
                         len(Splitter.splitterNames))

    def _test(self,sp_name,text,splitted):
        # Split text with the named splitter and compare with the expectation
        splitter = Splitter.getSplitter(sp_name)
        result = list(splitter(text))
        self.assertEqual(result, splitted,
                         "%s: %s vs %s" % (sp_name,result,splitted))

#    def testZopeSplitter(self):
#        """test ZopeSplitter (this test is known to fail because it does not support ISO stuff) """
#
#        for text,splitted in self.testdata:
#            self._test("ZopeSplitter",text,splitted)

    def testISOSplitter(self):
        """test ISOSplitter"""
        for text,splitted in self.testdata:
            self._test("ISO_8859_1_Splitter",text,splitted)
def test_suite():
    """Return a suite holding every test method of TestCase."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(TestCase)
def debug():
    """Run the suite through its debug() method and return the result."""
    suite = test_suite()
    return suite.debug()
def pdebug():
    """Run debug() under the control of the pdb debugger."""
    from pdb import run as run_under_pdb
    run_under_pdb('debug()')
def main():
    """Run the suite with a text test runner."""
    runner = unittest.TextTestRunner()
    runner.run(test_suite())
# Script entry point: an optional first command-line argument names a
# module-level callable (looked up in globals()) to run instead of main().
if __name__ == '__main__':
    if len(sys.argv) > 1:
        globals()[sys.argv[1]]()
    else:
        main()
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors. All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.1 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""TextIndex unit tests.
$Id$
"""
import unittest
import Testing
import Zope2
Zope2.startup()
import ZODB
from ZODB.MappingStorage import MappingStorage
import transaction
from Products.PluginIndexes.TextIndex import TextIndex
from Products.PluginIndexes.TextIndex import GlobbingLexicon
class Dummy:
    """Minimal stand-in document whose content is exposed through text()."""

    def __init__(self, text):
        self._text = text

    def text(self):
        return self._text

    def __repr__(self):
        return '<Dummy: %s>' % self._text

    # str() and repr() share one rendering
    __str__ = __repr__
class Tests(unittest.TestCase):
    """TextIndex tests: indexing, persistence and query parsing.

    Checks use the unittest assertion methods rather than bare asserts so
    they are not stripped when Python runs with -O.
    """

    db = None
    jar = None

    def setUp(self):
        self.index=TextIndex.TextIndex('text')
        self.doc=Dummy(text='this is the time, when all good zopes')

    def dbopen(self):
        # Open a connection to an in-memory database (created on first
        # use) and return the TextIndex stored under the 'index' root key,
        # creating and committing it if it does not exist yet.
        if self.db is None:
            s = MappingStorage()
            self.db = ZODB.DB(s)
        db = self.db
        if self.jar is not None:
            raise RuntimeError('test needs to dbclose() before dbopen()')
        jar = db.open()
        self.jar = jar
        if 'index' not in jar.root():
            jar.root()['index'] = TextIndex.TextIndex('text')
            transaction.commit()
        return jar.root()['index']

    def dbclose(self):
        self.jar.close()
        self.jar = None

    def tearDown(self):
        transaction.abort()
        if self.jar is not None:
            self.dbclose()
        if self.db is not None:
            self.db.close()
            self.db = None

    def test_z3interfaces(self):
        from Products.PluginIndexes.interfaces import IPluggableIndex
        from Products.PluginIndexes.interfaces import ITextIndex
        from Products.PluginIndexes.TextIndex.TextIndex import TextIndex
        from zope.interface.verify import verifyClass

        verifyClass(IPluggableIndex, TextIndex)
        verifyClass(ITextIndex, TextIndex)

    def test_SimpleAddDelete(self):
        # Smoke test: only checks that (re)indexing and unindexing run
        # without raising.
        self.index.index_object(0, self.doc)
        self.index.index_object(1, self.doc)
        self.doc.text='spam is good, spam is fine, span span span'
        self.index.index_object(0, self.doc)
        self.index.unindex_object(0)

    def test_PersistentUpdate1(self):
        # Check simple persistent indexing
        index=self.dbopen()

        self.doc.text='this is the time, when all good zopes'
        index.index_object(0, self.doc)
        transaction.commit()

        self.doc.text='time waits for no one'
        index.index_object(1, self.doc)
        transaction.commit()
        self.dbclose()

        index=self.dbopen()

        r = index._apply_index({})
        self.assertEqual(r, None)

        r = index._apply_index({'text': 'python'})
        self.assertTrue(len(r) == 2 and r[1] == ('text',),
                        'incorrectly not used')
        self.assertFalse(r[0], "should have no results")

        r = index._apply_index({'text': 'time'})
        r=list(r[0].keys())
        self.assertEqual(r, [0,1])

    def test_PersistentUpdate2(self):
        # Check less simple persistent indexing
        index=self.dbopen()

        self.doc.text='this is the time, when all good zopes'
        index.index_object(0, self.doc)
        transaction.commit()

        self.doc.text='time waits for no one'
        index.index_object(1, self.doc)
        transaction.commit()

        self.doc.text='the next task is to test'
        index.index_object(3, self.doc)
        transaction.commit()

        self.doc.text='time time'
        index.index_object(2, self.doc)
        transaction.commit()
        self.dbclose()

        index=self.dbopen()

        r = index._apply_index({})
        self.assertEqual(r, None)

        r = index._apply_index({'text': 'python'})
        self.assertTrue(len(r) == 2 and r[1] == ('text',),
                        'incorrectly not used')
        self.assertFalse(r[0], "should have no results")

        r = index._apply_index({'text': 'time'})
        r=list(r[0].keys())
        self.assertEqual(r, [0,1,2])

    sample_texts = [
        """This is the time for all good men to come to
        the aid of their country""",

        """ask not what your country can do for you,
        ask what you can do for your country""",

        """Man, I can't wait to get to Montross!""",

        """Zope Public License (ZPL) Version 1.0""",
        """Copyright (c) Digital Creations.  All rights reserved.""",
        """This license has been certified as Open Source(tm).""",
        """I hope I get to work on time""",
        ]

    def globTest(self, qmap, rlist):
        "Check a glob query"
        # Index the sample texts with a globbing lexicon, reopen the
        # database, run the query and compare the matching document ids.
        # The bound _apply_index is returned for follow-up queries.
        index=self.dbopen()
        index._lexicon = GlobbingLexicon.GlobbingLexicon()

        for i in range(len(self.sample_texts)):
            self.doc.text=self.sample_texts[i]
            index.index_object(i, self.doc)
            transaction.commit()

        self.dbclose()

        index=self.dbopen()

        r = list(index._apply_index(qmap)[0].keys())
        self.assertEqual(r, rlist)
        return index._apply_index

    def test_StarQuery(self):
        self.globTest({'text':'m*n'}, [0,2])

    def test_AndQuery(self):
        self.globTest({'text':'time and country'}, [0,])

    def test_OrQuery(self):
        self.globTest({'text':'time or country'}, [0,1,6])

    def test_DefaultOrQuery(self):
        self.globTest({'text':'time country'}, [0,1,6])

    def test_NearQuery(self):
        # Check a NEAR query.. (NOTE:ACTUALLY AN 'AND' TEST!!)
        # NEAR never worked, so Zopes post-2.3.1b3 define near to mean AND
        self.globTest({'text':'time ... country'}, [0,])

    def test_QuotesQuery(self):
        ai = self.globTest({'text':'"This is the time"'}, [0,])

        r = list(ai({'text':'"now is the time"'})[0].keys())
        self.assertEqual(r, [])

    def test_AndNotQuery(self):
        self.globTest({'text':'time and not country'}, [6,])

    def test_ParenMatchingQuery(self):
        ai = self.globTest({'text':'(time and country) men'}, [0,])

        r = list(ai({'text':'(time and not country) or men'})[0].keys())
        self.assertEqual(r, [0, 6])

    def test_TextIndexOperatorQuery(self):
        self.globTest({'text': {'query': 'time men', 'operator':'and'}}, [0,])

    def test_NonExistentWord(self):
        self.globTest({'text':'zop'}, [])

    def test_ComplexQuery1(self):
        self.globTest({'text':'((?ount* or get) and not wait) '
                       '"been *ert*"'}, [0, 1, 5, 6])

    # same tests, unicode strings
    def test_StarQueryUnicode(self):
        self.globTest({'text':u'm*n'}, [0,2])

    def test_AndQueryUnicode(self):
        self.globTest({'text':u'time and country'}, [0,])

    def test_OrQueryUnicode(self):
        self.globTest({'text':u'time or country'}, [0,1,6])

    def test_DefaultOrQueryUnicode(self):
        self.globTest({'text':u'time country'}, [0,1,6])

    def test_NearQueryUnicode(self):
        # Check a NEAR query.. (NOTE:ACTUALLY AN 'AND' TEST!!) (unicode)
        # NEAR never worked, so Zopes post-2.3.1b3 define near to mean AND
        self.globTest({'text':u'time ... country'}, [0,])

    def test_QuotesQueryUnicode(self):
        ai = self.globTest({'text':u'"This is the time"'}, [0,])

        r = list(ai({'text':'"now is the time"'})[0].keys())
        self.assertEqual(r, [])

    def test_AndNotQueryUnicode(self):
        self.globTest({'text':u'time and not country'}, [6,])

    def test_ParenMatchingQueryUnicode(self):
        ai = self.globTest({'text':u'(time and country) men'}, [0,])

        r = list(ai({'text':u'(time and not country) or men'})[0].keys())
        self.assertEqual(r, [0, 6])

    def test_TextIndexOperatorQueryUnicode(self):
        self.globTest({'text': {u'query': u'time men', 'operator':'and'}},
                      [0,])

    def test_NonExistentWordUnicode(self):
        self.globTest({'text':u'zop'}, [])

    def test_ComplexQuery1Unicode(self):
        self.globTest({'text':u'((?ount* or get) and not wait) '
                       '"been *ert*"'}, [0, 1, 5, 6])
def test_suite():
    """Return a suite holding every test method of Tests."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(Tests)
# Script entry point: run the suite returned by test_suite().
if __name__=='__main__':
    unittest.main(defaultTest='test_suite')
......@@ -21,20 +21,7 @@ import DateRangeIndex.DateRangeIndex
from Products.PluginIndexes.common import ResultList
from Products.PluginIndexes.common import UnIndex
# BBB: TextIndex is deprecated but we don't want the warning to appear here
import warnings
warnings.filterwarnings('ignore', message='^Using TextIndex', append=1)
try:
import TextIndex.TextIndex
finally:
del warnings.filters[-1]
try:
del __warningregistry__
except NameError:
pass
_indexes = ('TextIndex',
'KeywordIndex',
_indexes = ('KeywordIndex',
'FieldIndex',
'PathIndex',
'TopicIndex',
......
......@@ -160,38 +160,8 @@ class IPathIndex(Interface):
"""
class IVocabulary(Interface):
    """A Vocabulary is a user-manageable realization of a Lexicon object.
    """
class ITextIndex(Interface):
    """Full-text index.

    There is a ZCatalog UML model that sheds some light on what is
    going on here.  '_index' is a BTree which maps word ids to a mapping
    from document id to score.  Something like:

      {'bob' : {1 : 5, 2 : 3, 42 : 9}}
      {'uncle' : {1 : 1}}

    The '_unindex' attribute is a mapping from document id to word
    ids.  This mapping allows the catalog to unindex an object:

      {42 : ('bob', 'is', 'your', 'uncle')}

    This isn't exactly how things are represented in memory; many
    optimizations happen along the way.
    """

    def getLexicon(vocab_id=None):
        """Get the Lexicon in use.
        """
class IFilteredSet(Interface):
"""A pre-calculated result list based on an expression.
"""
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment