Commit 3d88c027 authored by Evan Simpson's avatar Evan Simpson

Merge TextIndex fixes from 2.4 branch

parent 233671d4
......@@ -85,7 +85,7 @@
from Lexicon import Lexicon
import Splitter
from Products.PluginIndexes.TextIndex.TextIndex import Or
from TextIndex import Or, Op
import re, string
......@@ -147,14 +147,12 @@ class GlobbingLexicon(Lexicon):
def createDigrams(self, word):
    """Returns a list with the set of digrams in the word.

    Each letter is paired with its predecessor; self.eow marks the
    word boundary, so 'ab' yields [eow+'a', 'ab', 'b'+eow].
    """
    # Merge fix: only the merged (2.4-branch) implementation is kept;
    # the stale pre-merge loop that was interleaved here is removed.
    digrams = list(word)
    digrams.append(self.eow)        # Mark the end
    last = self.eow                 # Mark the beginning
    for i in range(len(digrams)):
        # Pair each character with the one before it.
        last, digrams[i] = digrams[i], last + digrams[i]
    return digrams
......@@ -269,21 +267,30 @@ class GlobbingLexicon(Lexicon):
def query_hook(self, q):
    """Expand wildcard terms in the parsed query list, in place.

    Lists are recursed into; Op instances are left alone; any term
    containing a wildcard is replaced by the Or-joined list of
    matching word ids.  Returns q.
    """
    # Merge fix: keep only the merged in-place rewrite; the stale
    # pre-merge accumulate-into-words loop is removed.
    ListType = type([])
    i = len(q) - 1
    while i >= 0:
        e = q[i]
        if isinstance(e, ListType):
            # Recurse into sub-queries (parenthesized groups).
            self.query_hook(e)
        elif isinstance(e, Op):
            # Operators are not words; leave them untouched.
            pass
        elif ( (self.multi_wc in e) or
               (self.single_wc in e) ):
            wids = self.get(e)
            words = []
            for wid in wids:
                if words:
                    words.append(Or)
                words.append(wid)
            if not words:
                # if words is empty, return something that will make
                # textindex's __getitem__ return an empty result list
                words.append('')
            q[i] = words
        i = i - 1
    return q
def Splitter(self, astring, words=None):
""" wrap the splitter """
......@@ -300,18 +307,16 @@ class GlobbingLexicon(Lexicon):
There is no way to quote meta-characters.
"""
# Remove characters that are meaningful in a regex
transTable = string.maketrans("", "")
result = string.translate(pat, transTable,
r'()&|!@#$%^{}\<>.')
# First, deal with mutli-character globbing
result = string.replace(pat, '*', '.*')
# First, deal with multi-character globbing
result = string.replace(result, '*', '.*')
# Next, we need to deal with single-character globbing
result = string.replace(result, '?', '.?')
# Now, we need to remove all of the characters that
# are forbidden.
result = string.translate(result, transTable,
r'()&|!@#$%^{}\<>')
result = string.replace(result, '?', '.')
return "%s$" % result
......
......@@ -85,13 +85,9 @@
"""Text Index
The TextIndex falls under the 'I didnt have a better name for it'
excuse. It is an 'Un' Text index because it stores a little bit of
undo information so that objects can be unindexed when the old value
is no longer known.
"""
__version__ = '$Revision: 1.9 $'[11:-2]
__version__ = '$Revision: 1.10 $'[11:-2]
import string, re
......@@ -113,12 +109,21 @@ from Lexicon import Lexicon
from types import *
AndNot = 'andnot'
And = 'and'
Or = 'or'
Near = '...'
class Op:
    """A query-operator token.

    Instances are compared by identity in the evaluator, so each
    operator ('and', 'or', ...) is a module-level singleton.  The
    string form of an operator is simply its name.
    """

    def __init__(self, name):
        self.name = name

    def __repr__(self):
        return self.name

    def __str__(self):
        return self.name
AndNot = Op('andnot')
And = Op('and')
Or = Op('or')
Near = Op('...')
QueryError = 'TextIndex.QueryError'
operator_dict = {'andnot': AndNot, 'and': And, 'or': Or,
'...': Near, 'near': Near,
AndNot: AndNot, And: And, Or: Or, Near: Near}
class TextIndex(PluggableIndex.PluggableIndex, Persistent,
Implicit, SimpleItem):
......@@ -176,8 +181,6 @@ class TextIndex(PluggableIndex.PluggableIndex, Persistent,
# Default text index operator (should be visible to ZMI)
self.operators = { 'andnot':AndNot, 'and':And,
'near':Near, 'or':Or }
self.useOperator = 'or'
self.clear()
......@@ -508,10 +511,7 @@ class TextIndex(PluggableIndex.PluggableIndex, Persistent,
# Changed for 2.4
# We use the default operator that can me managed via the ZMI
query_operator = record.get('operator',self.useOperator)
if not query_operator in self.operators.keys():
raise exceptions.RuntimeError,"Invalid operator '%s' for a TextIndex"\
% query_operator
qop = record.get('operator', self.useOperator)
# We keep this for pre-2.4 compatibility
# This stinking code should go away somewhere. A global
......@@ -520,10 +520,16 @@ class TextIndex(PluggableIndex.PluggableIndex, Persistent,
# should be specified on a per-index base
if request.has_key('textindex_operator'):
query_operator = request['textindex_operator']
warnings.warn("The usage of the 'textindex_operator' is no longer recommended.\n"\
"Please use a mapping object and the 'operator' to specify the operator")
qop = request['textindex_operator']
warnings.warn("The usage of the 'textindex_operator' "
"is no longer recommended.\n"
"Please use a mapping object and the "
"'operator' key to specify the operator.")
query_operator = operator_dict.get(qop)
if query_operator is None:
raise exceptions.RuntimeError, ("Invalid operator '%s' "
"for a TextIndex" % qop)
r = None
for key in record.keys:
......@@ -572,29 +578,37 @@ class TextIndex(PluggableIndex.PluggableIndex, Persistent,
def query(self, s, default_operator=Or):
    """ Evaluate a query string.

    Convert the query string into a data structure of nested lists
    and strings, based on the grouping of whitespace-separated
    strings by parentheses and quotes.  The 'Near' operator is
    inserted between the strings of a quoted group.

    The Lexicon is given the opportunity to transform the
    data structure.  Stemming, wildcards, and translation are
    possible Lexicon services.

    Finally, the query list is normalized so that it and every
    sub-list consist of non-operator strings or lists separated
    by operators.  This list is evaluated.
    """
    # First replace any occurrences of " and not " with " andnot "
    s = re.sub('(?i)\s+and\s*not\s+', ' andnot ', s)

    # Parse parentheses and quotes
    q = parse(s)

    # Allow the Lexicon to process the query
    q = self.getLexicon().query_hook(q)

    # Insert the default operator between any two search terms not
    # already joined by an operator.
    q = parse2(q, default_operator)

    # Evaluate the final 'expression'
    return self.evaluate(q)
......@@ -629,22 +643,20 @@ class TextIndex(PluggableIndex.PluggableIndex, Persistent,
def evaluate(self, query):
"""Evaluate a parsed query"""
# There are two options if the query passed in is only one
# item. It means either it's an embedded query, in which case
# we'll recursively evaluate, other wise it's nothing for us
# to evaluate, and we just get the results and return them.
if (len(query) == 1):
if (type(query[0]) is ListType):
return self.evaluate(query[0])
# Strip off meaningless layers
while isinstance(query, ListType) and len(query) == 1:
query = query[0]
return self[query[0]] # __getitem__
# If it's not a list, assume a string or number
if not isinstance(query, ListType):
return self[query]
# Now we need to loop through the query and expand out
# Now we need to loop through the query and reduce
# operators. They are currently evaluated in the following
# order: AndNote -> And -> Or -> Near
# order: AndNot -> And -> Or -> Near
i = 0
while (i < len(query)):
if query[i] == AndNot:
if query[i] is AndNot:
left, right = self.get_operands(query, i)
val = left.and_not(right)
query[(i - 1) : (i + 2)] = [ val ]
......@@ -652,7 +664,7 @@ class TextIndex(PluggableIndex.PluggableIndex, Persistent,
i = 0
while (i < len(query)):
if query[i] == And:
if query[i] is And:
left, right = self.get_operands(query, i)
val = left & right
query[(i - 1) : (i + 2)] = [ val ]
......@@ -660,7 +672,7 @@ class TextIndex(PluggableIndex.PluggableIndex, Persistent,
i = 0
while (i < len(query)):
if query[i] == Or:
if query[i] is Or:
left, right = self.get_operands(query, i)
val = left | right
query[(i - 1) : (i + 2)] = [ val ]
......@@ -668,14 +680,15 @@ class TextIndex(PluggableIndex.PluggableIndex, Persistent,
i = 0
while (i < len(query)):
if query[i] == Near:
if query[i] is Near:
left, right = self.get_operands(query, i)
val = left.near(right)
query[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
if (len(query) != 1): raise QueryError, "Malformed query"
if (len(query) != 1):
import pdb; pdb.set_trace()
raise QueryError, "Malformed query"
return query[0]
......@@ -706,101 +719,93 @@ def parse(s):
def parse(s):
    """Parse a query string into a nested list structure.

    The string is lowercased; parenthesized groups become nested
    lists (via recursion) and quoted groups are handled by quotes().
    """
    # Merge fix: keep only the merged implementation, which uses the
    # (before, inside, after) tuple returned by the new parens().
    l = []
    tmp = string.lower(s)
    p = parens(tmp)
    while p is not None:
        # Look for quotes in the section of the string before
        # the parentheses, then parse the string inside the parens
        l = l + quotes(p[0])
        l.append(parse(p[1]))
        # continue looking through the rest of the string
        tmp = p[2]
        p = parens(tmp)
    return l + quotes(tmp)
def parse2(q, default_operator, operator_dict=operator_dict):
    """Find operators and operands.

    Normalize q (in place) so that operands occupy the even positions
    and operators the odd ones, inserting default_operator wherever
    two operands are adjacent.  Operator strings are canonicalized to
    the module's Op singletons via operator_dict.
    """
    isop = operator_dict.has_key
    # Walk backwards so insertions do not disturb unvisited indices.
    i = len(q) - 1
    while i >= 0:
        e = q[i]
        if isinstance(e, ListType):
            q[i] = parse2(e, default_operator)
            if i % 2:
                # A sub-query in an operator slot: splice in the default.
                q.insert(i, default_operator)
        elif i % 2:
            # This element should be an operator
            if isop(e):
                # Ensure that it is identical, not merely equal.
                q[i] = operator_dict[e]
            else:
                # Insert the default operator.
                q.insert(i, default_operator)
        i = i - 1
    return q
def parens(s, parens_re=re.compile('[\(\)]').search):
index = open_index = paren_count = 0
while 1:
mo = parens_re(s, index)
if mo is None : break
def parens(s, parens_re=re.compile('[()]').search):
mo = parens_re(s)
if mo is None:
return
open_index = mo.start(0) + 1
paren_count = 0
while mo is not None:
index = mo.start(0)
if s[index] == '(':
paren_count = paren_count + 1
if open_index == 0 : open_index = index + 1
else:
paren_count = paren_count - 1
if paren_count == 0:
return (s[:open_index - 1], s[open_index:index],
s[index + 1:])
if paren_count < 0:
break
mo = parens_re(s, index + 1)
if paren_count == 0:
return open_index, index
else:
index = index + 1
if paren_count == 0: # No parentheses Found
return None
else:
raise QueryError, "Mismatched parentheses"
raise QueryError, "Mismatched parentheses"
def quotes(s, ws=(string.whitespace,)):
# split up quoted regions
splitted = re.split( '[%s]*\"[%s]*' % (ws * 2),s)
split=string.split
if (len(splitted) > 1):
if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"
def quotes(s):
split=string.split
if '"' not in s:
return split(s)
for i in range(1,len(splitted),2):
# split the quoted region into words
splitted[i] = filter(None, split(splitted[i]))
# put the Proxmity operator in between quoted words
for j in range(1, len(splitted[i])):
splitted[i][j : j] = [ Near ]
for i in range(len(splitted)-1,-1,-2):
# split the non-quoted region into words
splitted[i:i+1] = filter(None, split(splitted[i]))
splitted = filter(None, splitted)
else:
# No quotes, so just split the string into words
splitted = filter(None, split(s))
return splitted
# split up quoted regions
splitted = re.split('\s*\"\s*', s)
if (len(splitted) % 2) == 0: raise QueryError, "Mismatched quotes"
for i in range(1,len(splitted),2):
# split the quoted region into words
words = splitted[i] = split(splitted[i])
# put the Proxmity operator in between quoted words
j = len(words) - 1
while j > 0:
words.insert(j, Near)
j = j - 1
i = len(splitted) - 1
while i >= 0:
# split the non-quoted region into words
splitted[i:i+1] = split(splitted[i])
i = i - 2
return filter(None, splitted)
manage_addTextIndexForm = DTMLFile('dtml/addTextIndex', globals())
......
##############################################################################
#
# Zope Public License (ZPL) Version 1.0
# -------------------------------------
#
# Copyright (c) Digital Creations. All rights reserved.
#
# This license has been certified as Open Source(tm).
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions in source code must retain the above copyright
# notice, this list of conditions, and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions, and the following disclaimer in
# the documentation and/or other materials provided with the
# distribution.
#
# 3. Digital Creations requests that attribution be given to Zope
# in any manner possible. Zope includes a "Powered by Zope"
# button that is installed by default. While it is not a license
# violation to remove this button, it is requested that the
# attribution remain. A significant investment has been put
# into Zope, and this effort will continue if the Zope community
# continues to grow. This is one way to assure that growth.
#
# 4. All advertising materials and documentation mentioning
# features derived from or use of this software must display
# the following acknowledgement:
#
# "This product includes software developed by Digital Creations
# for use in the Z Object Publishing Environment
# (http://www.zope.org/)."
#
# In the event that the product being advertised includes an
# intact Zope distribution (with copyright and license included)
# then this clause is waived.
#
# 5. Names associated with Zope or Digital Creations must not be used to
# endorse or promote products derived from this software without
# prior written permission from Digital Creations.
#
# 6. Modified redistributions of any form whatsoever must retain
# the following acknowledgment:
#
# "This product includes software developed by Digital Creations
# for use in the Z Object Publishing Environment
# (http://www.zope.org/)."
#
# Intact (re-)distributions of any official Zope release do not
# require an external acknowledgement.
#
# 7. Modifications are encouraged but must be packaged separately as
# patches to official Zope releases. Distributions that do not
# clearly separate the patches from the original work must be clearly
# labeled as unofficial distributions. Modifications which do not
# carry the name Zope may be packaged in any form, as long as they
# conform to all of the clauses above.
#
#
# Disclaimer
#
# THIS SOFTWARE IS PROVIDED BY DIGITAL CREATIONS ``AS IS'' AND ANY
# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL DIGITAL CREATIONS OR ITS
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
# USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
# OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
#
# This software consists of contributions made by Digital Creations and
# many individuals on behalf of Digital Creations. Specific
# attributions are listed in the accompanying credits file.
#
##############################################################################
import sys, os
sys.path.insert(0, os.path.join(sys.path[0],'..'))
sys.path.insert(0, os.getcwd())
try: import unittest
except:
sys.path[0]=os.path.join(sys.path[0],'..','..', '..')
import unittest
print sys.path
class Dummy:
    """Test helper: a bare object whose attributes are supplied as
    keyword arguments, e.g. Dummy(text='...')."""

    def __init__(self, **kw):
        for attr_name, attr_value in kw.items():
            setattr(self, attr_name, attr_value)
import zLOG
# Replace zLOG's log hook so that any log entry at PROBLEM severity or
# worse turns into a test failure instead of passing silently.
def log_write(subsystem, severity, summary, detail, error):
    # Lower-severity messages are ignored; serious ones abort the test.
    if severity >= zLOG.PROBLEM:
        assert 0, "%s(%s): %s" % (subsystem, severity, summary)
zLOG.log_write=log_write
import ZODB, ZODB.DemoStorage, ZODB.FileStorage
import TextIndex
import GlobbingLexicon
class Tests(unittest.TestCase):
    """Tests for TextIndex, both transient and ZODB-persistent, and for
    glob/boolean query parsing via a GlobbingLexicon."""

    def setUp(self):
        # Fresh, unpersisted index plus a sample document for each test.
        self.index=TextIndex.TextIndex('text')
        self.doc=Dummy(text='this is the time, when all good zopes')

    def dbopen(self):
        # Open (creating on first use) a FileStorage-backed database and
        # return the persistent TextIndex stored under root()['index'].
        n = 'fs_tmp__%s' % os.getpid()
        s = ZODB.FileStorage.FileStorage(n)
        db=self.db=ZODB.DB(s)
        self.jar=db.open()
        if not self.jar.root().has_key('index'):
            self.jar.root()['index']=TextIndex.TextIndex('text')
            get_transaction().commit()
        return self.jar.root()['index']

    def dbclose(self):
        # Close the connection and database opened by dbopen().
        self.jar.close()
        self.db.close()
        del self.jar
        del self.db

    def tearDown(self):
        get_transaction().abort()
        if hasattr(self, 'jar'):
            self.dbclose()
        # Remove the temporary FileStorage files created by dbopen().
        os.system('rm -f fs_tmp__*')

    def checkSimpleAddDelete(self):
        "Check that we can add and delete an object without error"
        self.index.index_object(0, self.doc)
        self.index.index_object(1, self.doc)
        self.doc.text='spam is good, spam is fine, span span span'
        self.index.index_object(0, self.doc)
        self.index.unindex_object(0)

    def checkPersistentUpdate1(self):
        "Check simple persistent indexing"
        index=self.dbopen()

        self.doc.text='this is the time, when all good zopes'
        index.index_object(0, self.doc)
        get_transaction().commit()

        self.doc.text='time waits for no one'
        index.index_object(1, self.doc)
        get_transaction().commit()
        self.dbclose()

        # Reopen to make sure the indexed data actually persisted.
        index=self.dbopen()

        # An empty request must leave the index unused.
        r = index._apply_index({})
        assert r==None

        r = index._apply_index({'text': 'python'})
        assert len(r) == 2 and r[1]==('text',), 'incorrectly not used'
        assert not r[0], "should have no results"

        r = index._apply_index({'text': 'time'})
        r=list(r[0].keys())
        assert r == [0,1], r

    def checkPersistentUpdate2(self):
        "Check less simple persistent indexing"
        index=self.dbopen()

        self.doc.text='this is the time, when all good zopes'
        index.index_object(0, self.doc)
        get_transaction().commit()

        self.doc.text='time waits for no one'
        index.index_object(1, self.doc)
        get_transaction().commit()

        self.doc.text='the next task is to test'
        index.index_object(3, self.doc)
        get_transaction().commit()

        self.doc.text='time time'
        index.index_object(2, self.doc)
        get_transaction().commit()
        self.dbclose()

        # Reopen to make sure the indexed data actually persisted.
        index=self.dbopen()

        r = index._apply_index({})
        assert r==None

        r = index._apply_index({'text': 'python'})
        assert len(r) == 2 and r[1]==('text',), 'incorrectly not used'
        assert not r[0], "should have no results"

        r = index._apply_index({'text': 'time'})
        r=list(r[0].keys())
        assert r == [0,1,2], r

    # Fixture documents; list index == document id used by globTest.
    sample_texts = [
        """This is the time for all good men to come to
        the aid of their country""",
        """ask not what your country can do for you,
        ask what you can do for your country""",
        """Man, I can't wait to get to Montross!""",
        """Zope Public License (ZPL) Version 1.0""",
        """Copyright (c) Digital Creations. All rights reserved.""",
        """This license has been certified as Open Source(tm).""",
        """I hope I get to work on time""",
        ]

    def globTest(self, qmap, rlist):
        "Check a glob query"
        # Index the sample texts under a GlobbingLexicon, reopen the
        # database, and check that qmap returns exactly rlist.
        index=self.dbopen()
        index._lexicon = GlobbingLexicon.GlobbingLexicon()

        for i in range(len(self.sample_texts)):
            self.doc.text=self.sample_texts[i]
            index.index_object(i, self.doc)
            get_transaction().commit()

        self.dbclose()

        index=self.dbopen()

        r = list(index._apply_index(qmap)[0].keys())
        assert r == rlist, r

        # Returned so callers can issue further queries on the same index.
        return index._apply_index

    def checkStarQuery(self):
        "Check a star query"
        self.globTest({'text':'m*n'}, [0,2])

    def checkAndQuery(self):
        "Check an AND query"
        self.globTest({'text':'time and country'}, [0,])

    def checkOrQuery(self):
        "Check an OR query"
        self.globTest({'text':'time or country'}, [0,1,6])

    def checkDefOrQuery(self):
        "Check a default OR query"
        self.globTest({'text':'time country'}, [0,1,6])

    def checkNearQuery(self):
        """Check a NEAR query.. (NOTE:ACTUALLY AN 'AND' TEST!!)"""
        # NEAR never worked, so Zopes post-2.3.1b3 define near to mean AND
        self.globTest({'text':'time ... country'}, [0,])

    def checkQuotesQuery(self):
        """Check a quoted query"""
        ai = self.globTest({'text':'"This is the time"'}, [0,])

        r = list(ai({'text':'"now is the time"'})[0].keys())
        assert r == [], r

    def checkAndNotQuery(self):
        "Check an ANDNOT query"
        self.globTest({'text':'time and not country'}, [6,])

    def checkParenMatchingQuery(self):
        "Check a query with parens"
        ai = self.globTest({'text':'(time and country) men'}, [0,])

        r = list(ai({'text':'(time and not country) or men'})[0].keys())
        assert r == [0, 6], r

    def checkTextIndexOperatorQuery(self):
        "Check a query with 'operator' in the request"
        self.globTest({'text': {'query': 'time men', 'operator':'and'}}, [0,])

    def checkNonExistentWord(self):
        """ Check for nonexistent word """
        self.globTest({'text':'zop'}, [])

    def checkComplexQuery1(self):
        """ Check complex query 1 """
        self.globTest({'text':'((?ount* or get) and not wait) '
                       '"been *ert*"'}, [0, 1, 5, 6])
def test_suite():
    # Collect every Tests method whose name starts with 'check'.
    return unittest.makeSuite(Tests, 'check')

def main():
    # Run the suite with the plain text runner.
    unittest.TextTestRunner().run(test_suite())

def debug():
    # Run the suite without catching exceptions (for post-mortem work).
    test_suite().debug()

def pdebug():
    # Run debug() under the pdb debugger.
    import pdb
    pdb.run('debug()')
if __name__=='__main__':
    # Allow choosing an entry point from the command line:
    # e.g. "python testTextIndex.py debug"; default is main().
    if len(sys.argv) > 1:
        globals()[sys.argv[1]]()
    else:
        main()
......@@ -267,21 +267,28 @@ class GlobbingLexicon(Lexicon):
def query_hook(self, q):
    """Expand wildcard terms in the parsed query list, in place.

    Lists are recursed into; any term containing a wildcard is
    replaced by the Or-joined list of matching word ids.  Returns q.
    """
    # Merge fix: keep only the merged in-place rewrite; the stale
    # pre-merge accumulate-into-words loop is removed.
    ListType = type([])
    i = len(q) - 1
    while i >= 0:
        e = q[i]
        if isinstance(e, ListType):
            # Recurse into sub-queries (parenthesized groups).
            self.query_hook(e)
        elif ( (self.multi_wc in e) or
               (self.single_wc in e) ):
            wids = self.get(e)
            words = []
            for wid in wids:
                if words:
                    words.append(Or)
                words.append(wid)
            if not words:
                # if words is empty, return something that will make
                # textindex's __getitem__ return an empty result list
                words.append('')
            q[i] = words
        i = i - 1
    return q
def Splitter(self, astring, words=None):
""" wrap the splitter """
......@@ -298,19 +305,16 @@ class GlobbingLexicon(Lexicon):
There is no way to quote meta-characters.
"""
# Remove characters that are meaningful in a regex
transTable = string.maketrans("", "")
result = string.translate(pat, transTable,
r'()&|!@#$%^{}\<>.')
# First, deal with mutli-character globbing
result = string.replace(pat, '*', '.*')
# First, deal with multi-character globbing
result = string.replace(result, '*', '.*')
# Next, we need to deal with single-character globbing
result = string.replace(result, '?', '.?')
# Now, we need to remove all of the characters that
# are forbidden.
result = string.translate(result, transTable,
r'()&|!@#$%^{}\<>')
result = string.replace(result, '?', '.')
return "%s$" % result
......@@ -91,7 +91,7 @@ undo information so that objects can be unindexed when the old value
is no longer known.
"""
__version__ = '$Revision: 1.49 $'[11:-2]
__version__ = '$Revision: 1.50 $'[11:-2]
import string, re
......@@ -428,7 +428,7 @@ class UnTextIndex(Persistent, Implicit):
and a String. Strings are looked up in the lexicon, whereas
Integers are assumed to be resolved word ids. """
if type(word) is IntType:
if isinstance(word, IntType):
# We have a word ID
result = self._index.get(word, {})
return ResultList(result, (word,), self)
......@@ -440,7 +440,7 @@ class UnTextIndex(Persistent, Implicit):
if len(splitSource) == 1:
splitSource = splitSource[0]
if splitSource[:1] == '"' and splitSource[-1:] == '"':
if splitSource[:1] == splitSource[-1:] == '"':
return self[splitSource]
wids=self.getLexicon(self._lexicon).get(splitSource)
......@@ -551,28 +551,37 @@ class UnTextIndex(Persistent, Implicit):
def query(self, s, default_operator=Or):
    """ Evaluate a query string.

    Convert the query string into a data structure of nested lists
    and strings, based on the grouping of whitespace-separated
    strings by parentheses and quotes.  The 'Near' operator is
    inserted between the strings of a quoted group.

    The Lexicon is given the opportunity to transform the
    data structure.  Stemming, wildcards, and translation are
    possible Lexicon services.

    Finally, the query list is normalized so that it and every
    sub-list consist of non-operator strings or lists separated
    by operators.  This list is evaluated.
    """
    # First replace any occurrences of " and not " with " andnot "
    s = re.sub('(?i)\s+and\s*not\s+', ' andnot ', s)

    # Parse parentheses and quotes
    q = parse(s)

    # Allow the Lexicon to process the query
    q = self.getLexicon(self._lexicon).query_hook(q)

    # Insert the default operator between any two search terms not
    # already joined by an operator.
    q = parse2(q, default_operator)

    # Evaluate the final 'expression'
    return self.evaluate(q)
......@@ -605,19 +614,17 @@ class UnTextIndex(Persistent, Implicit):
def evaluate(self, query):
"""Evaluate a parsed query"""
# There are two options if the query passed in is only one
# item. It means either it's an embedded query, in which case
# we'll recursively evaluate, other wise it's nothing for us
# to evaluate, and we just get the results and return them.
if (len(query) == 1):
if (type(query[0]) is ListType):
return self.evaluate(query[0])
# Strip off meaningless layers
while isinstance(query, ListType) and len(query) == 1:
query = query[0]
return self[query[0]] # __getitem__
# If it's not a list, assume a string or number
if not isinstance(query, ListType):
return self[query]
# Now we need to loop through the query and expand out
# Now we need to loop through the query and reduce
# operators. They are currently evaluated in the following
# order: AndNote -> And -> Or -> Near
# order: AndNot -> And -> Or -> Near
i = 0
while (i < len(query)):
if query[i] is AndNot:
......@@ -660,98 +667,91 @@ def parse(s):
def parse(s):
    """Parse a query string into a nested list structure.

    The string is lowercased; parenthesized groups become nested
    lists (via recursion) and quoted groups are handled by quotes().
    """
    # Merge fix: keep only the merged implementation, which uses the
    # (before, inside, after) tuple returned by the new parens().
    l = []
    tmp = string.lower(s)
    p = parens(tmp)
    while p is not None:
        # Look for quotes in the section of the string before
        # the parentheses, then parse the string inside the parens
        l = l + quotes(p[0])
        l.append(parse(p[1]))
        # continue looking through the rest of the string
        tmp = p[2]
        p = parens(tmp)
    return l + quotes(tmp)
def parse2(q, default_operator,
           operator_dict={AndNot: AndNot, And: And, Or: Or, Near: Near}):
    """Find operators and operands.

    Normalize q (in place) so that operands occupy the even positions
    and operators the odd ones, inserting default_operator wherever
    two operands are adjacent.  Recognized operators are canonicalized
    through operator_dict (read-only default; identity mapping here).
    """
    isop = operator_dict.has_key
    # Walk backwards so insertions do not disturb unvisited indices.
    i = len(q) - 1
    while i >= 0:
        e = q[i]
        if isinstance(e, ListType):
            q[i] = parse2(e, default_operator)
            if i % 2:
                # A sub-query in an operator slot: splice in the default.
                q.insert(i, default_operator)
        elif i % 2:
            # This element should be an operator
            if isop(e):
                # Ensure that it is identical, not merely equal.
                q[i] = operator_dict[e]
            else:
                # Insert the default operator.
                q.insert(i, default_operator)
        i = i - 1
    return q
def parens(s, parens_re=re.compile('[\(\)]').search):
index = open_index = paren_count = 0
while 1:
mo = parens_re(s, index)
if mo is None : break
def parens(s, parens_re=re.compile('[()]').search):
mo = parens_re(s)
if mo is None:
return
open_index = mo.start(0) + 1
paren_count = 0
while mo is not None:
index = mo.start(0)
if s[index] == '(':
paren_count = paren_count + 1
if open_index == 0 : open_index = index + 1
else:
paren_count = paren_count - 1
if paren_count == 0:
return (s[:open_index - 1], s[open_index:index],
s[index + 1:])
if paren_count < 0:
break
mo = parens_re(s, index + 1)
if paren_count == 0:
return open_index, index
else:
index = index + 1
if paren_count == 0: # No parentheses Found
return None
else:
raise QueryError, "Mismatched parentheses"
raise QueryError, "Mismatched parentheses"
def quotes(s, ws=(string.whitespace,)):
# split up quoted regions
splitted = re.split( '[%s]*\"[%s]*' % (ws * 2),s)
split=string.split
if (len(splitted) > 1):
if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"
def quotes(s):
split=string.split
if '"' not in s:
return split(s)
for i in range(1,len(splitted),2):
# split the quoted region into words
splitted[i] = filter(None, split(splitted[i]))
# put the Proxmity operator in between quoted words
for j in range(1, len(splitted[i])):
splitted[i][j : j] = [ Near ]
for i in range(len(splitted)-1,-1,-2):
# split the non-quoted region into words
splitted[i:i+1] = filter(None, split(splitted[i]))
splitted = filter(None, splitted)
else:
# No quotes, so just split the string into words
splitted = filter(None, split(s))
return splitted
# split up quoted regions
splitted = re.split('\s*\"\s*', s)
if (len(splitted) % 2) == 0: raise QueryError, "Mismatched quotes"
for i in range(1,len(splitted),2):
# split the quoted region into words
words = splitted[i] = split(splitted[i])
# put the Proxmity operator in between quoted words
j = len(words) - 1
while j > 0:
words.insert(j, Near)
j = j - 1
i = len(splitted) - 1
while i >= 0:
# split the non-quoted region into words
splitted[i:i+1] = split(splitted[i])
i = i - 2
return filter(None, splitted)
......@@ -217,8 +217,8 @@ class Tests(unittest.TestCase):
"""This license has been certified as Open Source(tm).""",
"""I hope I get to work on time""",
]
def checkGlobQuery(self):
def globTest(self, qmap, rlist):
"Check a glob query"
index=self.dbopen()
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
......@@ -232,162 +232,61 @@ class Tests(unittest.TestCase):
index=self.dbopen()
r = index._apply_index({'text':'m*n'})
r=list(r[0].keys())
assert r == [0,2], r
r = list(index._apply_index(qmap)[0].keys())
assert r == rlist, r
return index._apply_index
def checkStarQuery(self):
"Check a star query"
self.globTest({'text':'m*n'}, [0,2])
def checkAndQuery(self):
"Check an AND query"
index=self.dbopen()
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen()
r = index._apply_index({'text':'time and country'})
r=list(r[0].keys())
assert r == [0,], r
self.globTest({'text':'time and country'}, [0,])
def checkOrQuery(self):
"Check an OR query"
index=self.dbopen()
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
self.globTest({'text':'time or country'}, [0,1,6])
index=self.dbopen()
r = index._apply_index({'text':'time or country'})
r=list(r[0].keys())
assert r == [0,1,6], r
def checkDefOrQuery(self):
"Check a default OR query"
self.globTest({'text':'time country'}, [0,1,6])
def checkNearQuery(self):
"""Check a NEAR query.. (NOTE:ACTUALLY AN 'OR' TEST!!)"""
# NEAR never worked, so Zopes post-2.3.1b3 define near to mean OR
index=self.dbopen()
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
"""Check a NEAR query.. (NOTE:ACTUALLY AN 'AND' TEST!!)"""
# NEAR never worked, so Zopes post-2.3.1b3 define near to mean AND
self.globTest({'text':'time ... country'}, [0,])
self.dbclose()
index=self.dbopen()
def checkQuotesQuery(self):
"""Check a quoted query"""
ai = self.globTest({'text':'"This is the time"'}, [0,])
r = index._apply_index({'text':'time near country'})
r=list(r[0].keys())
assert r == [0,1,6], r
r = list(ai({'text':'"now is the time"'})[0].keys())
assert r == [], r
def checkAndNotQuery(self):
"Check an ANDNOT query"
index=self.dbopen()
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen()
r = index._apply_index({'text':'time and not country'})
r=list(r[0].keys())
assert r == [6], r
self.globTest({'text':'time and not country'}, [6,])
def checkParenMatchingQuery(self):
"Check a query with parens"
index=self.dbopen()
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
ai = self.globTest({'text':'(time and country) men'}, [0,])
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen()
r = index._apply_index({'text':'(time and country) men'})
r=list(r[0].keys())
assert r == [0], r
r = index._apply_index({'text':'(time and not country) or men'})
r=list(r[0].keys())
r = list(ai({'text':'(time and not country) or men'})[0].keys())
assert r == [0, 6], r
def checkQuoteMatchingQuery(self):
"Check a query with quotes.. this is known to fail under 2.3.1b3-"
index=self.dbopen()
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen()
r = index._apply_index({'text':'"This is the time"'})
r=list(r[0].keys())
assert r == [0], r
r = index._apply_index({'text':'"now is the time"'})
r=list(r[0].keys())
assert r == [], r
def checkTextIndexOperatorQuery(self):
"Check a query with 'textindex_operator' in the request"
index=self.dbopen()
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen()
r = index._apply_index({'text':'time men','textindex_operator':'and'})
r=list(r[0].keys())
assert r == [0], r
self.globTest({'text':'time men', 'textindex_operator':'and'}, [0,])
def checkNonExistentWord(self):
""" Check for nonexistent word """
index=self.dbopen()
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen()
r = index._apply_index({'text':'zop'})
r=list(r[0].keys())
assert r == [], r
self.globTest({'text':'zop'}, [])
def checkComplexQuery1(self):
""" Check complex query 1 """
self.globTest({'text':'((?ount* or get) and not wait) '
'"been *ert*"'}, [0, 1, 5, 6])
def test_suite():
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment