Commit 3d88c027 authored by Evan Simpson

Merge TextIndex fixes from 2.4 branch

parent 233671d4
...@@ -85,7 +85,7 @@ ...@@ -85,7 +85,7 @@
from Lexicon import Lexicon from Lexicon import Lexicon
import Splitter import Splitter
from Products.PluginIndexes.TextIndex.TextIndex import Or from TextIndex import Or, Op
import re, string import re, string
...@@ -147,14 +147,12 @@ class GlobbingLexicon(Lexicon): ...@@ -147,14 +147,12 @@ class GlobbingLexicon(Lexicon):
def createDigrams(self, word): def createDigrams(self, word):
"""Returns a list with the set of digrams in the word.""" """Returns a list with the set of digrams in the word."""
digrams = [] digrams = list(word)
digrams.append(self.eow)
digrams.append(self.eow + word[0]) # Mark the beginning last = self.eow
for i in range(1,len(word)):
digrams.append(word[i-1:i+1])
digrams[-1] = digrams[-1] + self.eow # Mark the end for i in range(len(digrams)):
last, digrams[i] = digrams[i], last + digrams[i]
return digrams return digrams
...@@ -269,21 +267,30 @@ class GlobbingLexicon(Lexicon): ...@@ -269,21 +267,30 @@ class GlobbingLexicon(Lexicon):
def query_hook(self, q): def query_hook(self, q):
"""expand wildcards""" """expand wildcards"""
ListType = type([])
i = len(q) - 1
while i >= 0:
e = q[i]
if isinstance(e, ListType):
self.query_hook(e)
elif isinstance(e, Op):
pass
elif ( (self.multi_wc in e) or
(self.single_wc in e) ):
wids = self.get(e)
words = [] words = []
for w in q:
if ( (self.multi_wc in w) or
(self.single_wc in w) ):
wids = self.get(w)
for wid in wids: for wid in wids:
if words: if words:
words.append(Or) words.append(Or)
words.append(wid) words.append(wid)
else: if not words:
words.append(w) # if words is empty, return something that will make
# textindex's __getitem__ return an empty result list
words.append('')
q[i] = words
i = i - 1
# if words is empty, return something that will make textindex's return q
# __getitem__ return an empty result list
return words or ['']
def Splitter(self, astring, words=None): def Splitter(self, astring, words=None):
""" wrap the splitter """ """ wrap the splitter """
...@@ -300,18 +307,16 @@ class GlobbingLexicon(Lexicon): ...@@ -300,18 +307,16 @@ class GlobbingLexicon(Lexicon):
There is no way to quote meta-characters. There is no way to quote meta-characters.
""" """
# Remove characters that are meaningful in a regex
transTable = string.maketrans("", "") transTable = string.maketrans("", "")
result = string.translate(pat, transTable,
r'()&|!@#$%^{}\<>.')
# First, deal with mutli-character globbing # First, deal with multi-character globbing
result = string.replace(pat, '*', '.*') result = string.replace(result, '*', '.*')
# Next, we need to deal with single-character globbing # Next, we need to deal with single-character globbing
result = string.replace(result, '?', '.?') result = string.replace(result, '?', '.')
# Now, we need to remove all of the characters that
# are forbidden.
result = string.translate(result, transTable,
r'()&|!@#$%^{}\<>')
return "%s$" % result return "%s$" % result
......
...@@ -267,21 +267,28 @@ class GlobbingLexicon(Lexicon): ...@@ -267,21 +267,28 @@ class GlobbingLexicon(Lexicon):
def query_hook(self, q): def query_hook(self, q):
"""expand wildcards""" """expand wildcards"""
ListType = type([])
i = len(q) - 1
while i >= 0:
e = q[i]
if isinstance(e, ListType):
self.query_hook(e)
elif ( (self.multi_wc in e) or
(self.single_wc in e) ):
wids = self.get(e)
words = [] words = []
for w in q:
if ( (self.multi_wc in w) or
(self.single_wc in w) ):
wids = self.get(w)
for wid in wids: for wid in wids:
if words: if words:
words.append(Or) words.append(Or)
words.append(wid) words.append(wid)
else: if not words:
words.append(w) # if words is empty, return something that will make
# textindex's __getitem__ return an empty result list
words.append('')
q[i] = words
i = i - 1
# if words is empty, return something that will make textindex's return q
# __getitem__ return an empty result list
return words or ['']
def Splitter(self, astring, words=None): def Splitter(self, astring, words=None):
""" wrap the splitter """ """ wrap the splitter """
...@@ -298,19 +305,16 @@ class GlobbingLexicon(Lexicon): ...@@ -298,19 +305,16 @@ class GlobbingLexicon(Lexicon):
There is no way to quote meta-characters. There is no way to quote meta-characters.
""" """
# Remove characters that are meaningful in a regex
transTable = string.maketrans("", "") transTable = string.maketrans("", "")
result = string.translate(pat, transTable,
r'()&|!@#$%^{}\<>.')
# First, deal with mutli-character globbing # First, deal with multi-character globbing
result = string.replace(pat, '*', '.*') result = string.replace(result, '*', '.*')
# Next, we need to deal with single-character globbing # Next, we need to deal with single-character globbing
result = string.replace(result, '?', '.?') result = string.replace(result, '?', '.')
# Now, we need to remove all of the characters that
# are forbidden.
result = string.translate(result, transTable,
r'()&|!@#$%^{}\<>')
return "%s$" % result return "%s$" % result
...@@ -91,7 +91,7 @@ undo information so that objects can be unindexed when the old value ...@@ -91,7 +91,7 @@ undo information so that objects can be unindexed when the old value
is no longer known. is no longer known.
""" """
__version__ = '$Revision: 1.49 $'[11:-2] __version__ = '$Revision: 1.50 $'[11:-2]
import string, re import string, re
...@@ -428,7 +428,7 @@ class UnTextIndex(Persistent, Implicit): ...@@ -428,7 +428,7 @@ class UnTextIndex(Persistent, Implicit):
and a String. Strings are looked up in the lexicon, whereas and a String. Strings are looked up in the lexicon, whereas
Integers are assumed to be resolved word ids. """ Integers are assumed to be resolved word ids. """
if type(word) is IntType: if isinstance(word, IntType):
# We have a word ID # We have a word ID
result = self._index.get(word, {}) result = self._index.get(word, {})
return ResultList(result, (word,), self) return ResultList(result, (word,), self)
...@@ -440,7 +440,7 @@ class UnTextIndex(Persistent, Implicit): ...@@ -440,7 +440,7 @@ class UnTextIndex(Persistent, Implicit):
if len(splitSource) == 1: if len(splitSource) == 1:
splitSource = splitSource[0] splitSource = splitSource[0]
if splitSource[:1] == '"' and splitSource[-1:] == '"': if splitSource[:1] == splitSource[-1:] == '"':
return self[splitSource] return self[splitSource]
wids=self.getLexicon(self._lexicon).get(splitSource) wids=self.getLexicon(self._lexicon).get(splitSource)
...@@ -551,28 +551,37 @@ class UnTextIndex(Persistent, Implicit): ...@@ -551,28 +551,37 @@ class UnTextIndex(Persistent, Implicit):
def query(self, s, default_operator=Or, ws=(string.whitespace,)): def query(self, s, default_operator=Or):
""" This is called by TextIndexes. A 'query term' which is a """ Evaluate a query string.
string 's' is passed in, along with an index object. s is
parsed, then the wildcards are parsed, then something is Convert the query string into a data structure of nested lists
parsed again, then the whole thing is 'evaluated'. """ and strings, based on the grouping of whitespace-separated
strings by parentheses and quotes. The 'Near' operator is
inserted between the strings of a quoted group.
The Lexicon is given the opportunity to transform the
data structure. Stemming, wildcards, and translation are
possible Lexicon services.
Finally, the query list is normalized so that it and every
sub-list consist of non-operator strings or lists separated
by operators. This list is evaluated.
"""
# First replace any occurrences of " and not " with " andnot " # First replace any occurrences of " and not " with " andnot "
s = re.sub( s = re.sub('(?i)\s+and\s*not\s+', ' andnot ', s)
'[%s]+[aA][nN][dD][%s]*[nN][oO][tT][%s]+' % (ws * 3),
' andnot ', s)
# do some parsing # Parse parentheses and quotes
q = parse(s) q = parse(s)
## here, we give lexicons a chance to transform the query. # Allow the Lexicon to process the query
## For example, substitute wildcards, or translate words into
## various languages.
q = self.getLexicon(self._lexicon).query_hook(q) q = self.getLexicon(self._lexicon).query_hook(q)
# do some more parsing
# Insert the default operator between any two search terms not
# already joined by an operator.
q = parse2(q, default_operator) q = parse2(q, default_operator)
## evaluate the final 'expression' # evaluate the final 'expression'
return self.evaluate(q) return self.evaluate(q)
...@@ -605,19 +614,17 @@ class UnTextIndex(Persistent, Implicit): ...@@ -605,19 +614,17 @@ class UnTextIndex(Persistent, Implicit):
def evaluate(self, query): def evaluate(self, query):
"""Evaluate a parsed query""" """Evaluate a parsed query"""
# There are two options if the query passed in is only one # Strip off meaningless layers
# item. It means either it's an embedded query, in which case while isinstance(query, ListType) and len(query) == 1:
# we'll recursively evaluate, other wise it's nothing for us query = query[0]
# to evaluate, and we just get the results and return them.
if (len(query) == 1):
if (type(query[0]) is ListType):
return self.evaluate(query[0])
return self[query[0]] # __getitem__ # If it's not a list, assume a string or number
if not isinstance(query, ListType):
return self[query]
# Now we need to loop through the query and expand out # Now we need to loop through the query and reduce
# operators. They are currently evaluated in the following # operators. They are currently evaluated in the following
# order: AndNote -> And -> Or -> Near # order: AndNot -> And -> Or -> Near
i = 0 i = 0
while (i < len(query)): while (i < len(query)):
if query[i] is AndNot: if query[i] is AndNot:
...@@ -660,98 +667,91 @@ def parse(s): ...@@ -660,98 +667,91 @@ def parse(s):
l = [] l = []
tmp = string.lower(s) tmp = string.lower(s)
while (1):
p = parens(tmp) p = parens(tmp)
while p is not None:
if (p is None):
# No parentheses found. Look for quotes then exit.
l = l + quotes(tmp)
break
else:
# Look for quotes in the section of the string before # Look for quotes in the section of the string before
# the parentheses, then parse the string inside the parens # the parentheses, then parse the string inside the parens
l = l + quotes(tmp[:(p[0] - 1)]) l = l + quotes(p[0])
l.append(parse(tmp[p[0] : p[1]])) l.append(parse(p[1]))
# continue looking through the rest of the string # continue looking through the rest of the string
tmp = tmp[(p[1] + 1):] tmp = p[2]
p = parens(tmp)
return l return l + quotes(tmp)
def parse2(q, default_operator, def parse2(q, default_operator,
operator_dict={AndNot: AndNot, And: And, Or: Or, Near: Near}): operator_dict={AndNot: AndNot, And: And, Or: Or, Near: Near}):
"""Find operators and operands""" """Find operators and operands"""
i = 0
isop = operator_dict.has_key isop = operator_dict.has_key
while (i < len(q)): i = len(q) - 1
if (type(q[i]) is ListType): q[i] = parse2(q[i], default_operator) while i >= 0:
e = q[i]
# every other item, starting with the first, should be an operand if isinstance(e, ListType):
if ((i % 2) != 0): q[i] = parse2(e, default_operator)
# This word should be an operator; if it is not, splice in if i % 2:
# the default operator. q.insert(i, default_operator)
elif i % 2:
if type(q[i]) is not ListType and isop(q[i]): # This element should be an operator
q[i] = operator_dict[q[i]] if isop(e):
else: q[i : i] = [ default_operator ] # Ensure that it is identical, not merely equal.
q[i] = operator_dict[e]
i = i + 1 else:
# Insert the default operator.
q.insert(i, default_operator)
i = i - 1
return q return q
def parens(s, parens_re=re.compile('[\(\)]').search): def parens(s, parens_re=re.compile('[()]').search):
mo = parens_re(s)
index = open_index = paren_count = 0 if mo is None:
return
while 1:
mo = parens_re(s, index)
if mo is None : break
open_index = mo.start(0) + 1
paren_count = 0
while mo is not None:
index = mo.start(0) index = mo.start(0)
if s[index] == '(': if s[index] == '(':
paren_count = paren_count + 1 paren_count = paren_count + 1
if open_index == 0 : open_index = index + 1
else: else:
paren_count = paren_count - 1 paren_count = paren_count - 1
if paren_count == 0: if paren_count == 0:
return open_index, index return (s[:open_index - 1], s[open_index:index],
else: s[index + 1:])
index = index + 1 if paren_count < 0:
break
mo = parens_re(s, index + 1)
if paren_count == 0: # No parentheses Found
return None
else:
raise QueryError, "Mismatched parentheses" raise QueryError, "Mismatched parentheses"
def quotes(s, ws=(string.whitespace,)): def quotes(s):
# split up quoted regions
splitted = re.split( '[%s]*\"[%s]*' % (ws * 2),s)
split=string.split split=string.split
if '"' not in s:
return split(s)
# split up quoted regions
splitted = re.split('\s*\"\s*', s)
if (len(splitted) > 1): if (len(splitted) % 2) == 0: raise QueryError, "Mismatched quotes"
if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"
for i in range(1,len(splitted),2): for i in range(1,len(splitted),2):
# split the quoted region into words # split the quoted region into words
splitted[i] = filter(None, split(splitted[i])) words = splitted[i] = split(splitted[i])
# put the Proximity operator in between quoted words # put the Proximity operator in between quoted words
for j in range(1, len(splitted[i])): j = len(words) - 1
splitted[i][j : j] = [ Near ] while j > 0:
words.insert(j, Near)
j = j - 1
for i in range(len(splitted)-1,-1,-2): i = len(splitted) - 1
while i >= 0:
# split the non-quoted region into words # split the non-quoted region into words
splitted[i:i+1] = filter(None, split(splitted[i])) splitted[i:i+1] = split(splitted[i])
i = i - 2
splitted = filter(None, splitted)
else:
# No quotes, so just split the string into words
splitted = filter(None, split(s))
return splitted
return filter(None, splitted)
...@@ -218,7 +218,7 @@ class Tests(unittest.TestCase): ...@@ -218,7 +218,7 @@ class Tests(unittest.TestCase):
"""I hope I get to work on time""", """I hope I get to work on time""",
] ]
def checkGlobQuery(self): def globTest(self, qmap, rlist):
"Check a glob query" "Check a glob query"
index=self.dbopen() index=self.dbopen()
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon() index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
...@@ -232,162 +232,61 @@ class Tests(unittest.TestCase): ...@@ -232,162 +232,61 @@ class Tests(unittest.TestCase):
index=self.dbopen() index=self.dbopen()
r = index._apply_index({'text':'m*n'}) r = list(index._apply_index(qmap)[0].keys())
r=list(r[0].keys()) assert r == rlist, r
assert r == [0,2], r return index._apply_index
def checkStarQuery(self):
"Check a star query"
self.globTest({'text':'m*n'}, [0,2])
def checkAndQuery(self): def checkAndQuery(self):
"Check an AND query" "Check an AND query"
index=self.dbopen() self.globTest({'text':'time and country'}, [0,])
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen()
r = index._apply_index({'text':'time and country'})
r=list(r[0].keys())
assert r == [0,], r
def checkOrQuery(self): def checkOrQuery(self):
"Check an OR query" "Check an OR query"
index=self.dbopen() self.globTest({'text':'time or country'}, [0,1,6])
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen() def checkDefOrQuery(self):
"Check a default OR query"
r = index._apply_index({'text':'time or country'}) self.globTest({'text':'time country'}, [0,1,6])
r=list(r[0].keys())
assert r == [0,1,6], r
def checkNearQuery(self): def checkNearQuery(self):
"""Check a NEAR query.. (NOTE:ACTUALLY AN 'OR' TEST!!)""" """Check a NEAR query.. (NOTE:ACTUALLY AN 'AND' TEST!!)"""
# NEAR never worked, so Zopes post-2.3.1b3 define near to mean OR # NEAR never worked, so Zopes post-2.3.1b3 define near to mean AND
index=self.dbopen() self.globTest({'text':'time ... country'}, [0,])
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen() def checkQuotesQuery(self):
"""Check a quoted query"""
ai = self.globTest({'text':'"This is the time"'}, [0,])
r = index._apply_index({'text':'time near country'}) r = list(ai({'text':'"now is the time"'})[0].keys())
r=list(r[0].keys()) assert r == [], r
assert r == [0,1,6], r
def checkAndNotQuery(self): def checkAndNotQuery(self):
"Check an ANDNOT query" "Check an ANDNOT query"
index=self.dbopen() self.globTest({'text':'time and not country'}, [6,])
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen()
r = index._apply_index({'text':'time and not country'})
r=list(r[0].keys())
assert r == [6], r
def checkParenMatchingQuery(self): def checkParenMatchingQuery(self):
"Check a query with parens" "Check a query with parens"
index=self.dbopen() ai = self.globTest({'text':'(time and country) men'}, [0,])
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)): r = list(ai({'text':'(time and not country) or men'})[0].keys())
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen()
r = index._apply_index({'text':'(time and country) men'})
r=list(r[0].keys())
assert r == [0], r
r = index._apply_index({'text':'(time and not country) or men'})
r=list(r[0].keys())
assert r == [0, 6], r assert r == [0, 6], r
def checkQuoteMatchingQuery(self):
"Check a query with quotes.. this is known to fail under 2.3.1b3-"
index=self.dbopen()
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen()
r = index._apply_index({'text':'"This is the time"'})
r=list(r[0].keys())
assert r == [0], r
r = index._apply_index({'text':'"now is the time"'})
r=list(r[0].keys())
assert r == [], r
def checkTextIndexOperatorQuery(self): def checkTextIndexOperatorQuery(self):
"Check a query with 'textindex_operator' in the request" "Check a query with 'textindex_operator' in the request"
index=self.dbopen() self.globTest({'text':'time men', 'textindex_operator':'and'}, [0,])
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose()
index=self.dbopen()
r = index._apply_index({'text':'time men','textindex_operator':'and'})
r=list(r[0].keys())
assert r == [0], r
def checkNonExistentWord(self): def checkNonExistentWord(self):
""" Check for nonexistent word """ """ Check for nonexistent word """
index=self.dbopen() self.globTest({'text':'zop'}, [])
index._lexicon = SearchIndex.GlobbingLexicon.GlobbingLexicon()
for i in range(len(self.sample_texts)):
self.doc.text=self.sample_texts[i]
index.index_object(i, self.doc)
get_transaction().commit()
self.dbclose() def checkComplexQuery1(self):
""" Check complex query 1 """
index=self.dbopen() self.globTest({'text':'((?ount* or get) and not wait) '
'"been *ert*"'}, [0, 1, 5, 6])
r = index._apply_index({'text':'zop'})
r=list(r[0].keys())
assert r == [], r
def test_suite(): def test_suite():
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment