Commit 8e6e5acb authored by Michel Pelletier's avatar Michel Pelletier

Unscrewed globbing dependencies in the text index and fixed non-globbing vocabularies. Redid the Lexicon interface.
parent ea93883d
......@@ -533,6 +533,8 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
type(''): Query.String,
}, **kw):
# Get search arguments:
if REQUEST is None and not kw:
try: REQUEST=self.REQUEST
......
......@@ -156,8 +156,11 @@ class Vocabulary(Item, Persistent, Implicit):
def query(self, pattern):
""" """
result = []
for x in self.lexicon.query(pattern):
for x in self.lexicon.get(pattern):
if self.globbing:
result.append(self.lexicon._inverseLex[x])
else:
result.append(pattern)
return result
......
......@@ -103,6 +103,7 @@ OOBTree=BTree.BTree
IOBTree=IOBTree.BTree
import re
from UnTextIndex import Or
class GlobbingLexicon(Lexicon):
"""
......@@ -160,7 +161,7 @@ class GlobbingLexicon(Lexicon):
return self.counter
def query(self, pattern):
def get(self, pattern):
""" Query the lexicon for words matching a pattern.
"""
......@@ -218,7 +219,26 @@ class GlobbingLexicon(Lexicon):
def __getitem__(self, word):
""" """
return self.query(word)
return self.get(word)
def query_hook(self, q):
    """Expand wildcard terms of a parsed query.

    Every element of q that contains the multi-character or the
    single-character wildcard is replaced by the lexicon words whose
    ids self.get() returns for it.  An Or operator is put in front of
    each such word unless the result list is still empty.  Elements
    without a wildcard are copied over unchanged.

    Returns a new list; q itself is not modified.
    """
    expanded = []
    for term in q:
        has_wildcard = (self.multi_wc in term) or (self.single_wc in term)
        if not has_wildcard:
            expanded.append(term)
            continue
        for word_id in self.get(term):
            if expanded:
                expanded.append(Or)
            expanded.append(self._inverseLex[word_id])
    return expanded
def translate(self, pat):
"""Translate a PATTERN to a regular expression.
......
......@@ -134,7 +134,10 @@ class Lexicon(Persistent, Implicit):
def get(self, key):
""" """
return list(self._lexicon[key])
return [self._lexicon[key]]
def __getitem__(self, key):
    """Subscript access: return whatever self.get(key) returns."""
    return self.get(key)
def __len__(self):
return len(self._lexicon)
......@@ -157,203 +160,12 @@ class Lexicon(Persistent, Implicit):
hits.append(x)
return hits
# Operator tokens recognised in parsed queries.  The evaluator compares
# query elements against these objects with "is".
AndNot = 'andnot'
And = 'and'
Or = 'or'
Near = '...'
# String exception raised for malformed queries (mismatched quotes or
# parentheses, missing operands).
QueryError='TextIndex.QueryError'
def query(s, index, default_operator = Or,
          ws = (string.whitespace,)):
    '''Parse the query string s and evaluate it against index.

    " and not " (with any amount of whitespace) is first collapsed into
    the single token " andnot ".  The string is then split by parse(),
    wildcards are expanded by parse_wc(), operators are located by
    parse2() (default_operator is spliced in where one is missing) and
    the result is handed to evaluate().
    '''
    andnot_pattern = '[%s]+and[%s]*not[%s]+' % (ws * 3)
    normalized = ts_regex.gsub(andnot_pattern, ' andnot ', s)
    tree = parse_wc(parse(normalized), index)
    return evaluate(parse2(tree, default_operator), index)
def parse_wc(q, index):
    '''Expand wildcard terms of the parsed query q.

    The lexicon is obtained from index.getLexicon(index._lexicon).
    Every element of q containing one of the lexicon's wildcard
    characters is replaced by the words whose ids lex.query() returns,
    each preceded by Or unless the result list is still empty.  Other
    elements are copied unchanged.  Returns a new list.
    '''
    lex = index.getLexicon(index._lexicon)
    expanded = []
    for term in q:
        has_wildcard = (lex.multi_wc in term) or (lex.single_wc in term)
        if not has_wildcard:
            expanded.append(term)
            continue
        for word_id in lex.query(term):
            if expanded:
                expanded.append(Or)
            expanded.append(lex._inverseLex[word_id])
    return expanded
def parse(s):
    '''Split a query string into a (possibly nested) list of tokens.

    The string is lower-cased.  While parens() finds a parenthesised
    group, the text before it is tokenised with quotes(), the text
    inside it is parsed recursively and appended as a sub-list, and
    scanning continues after the closing parenthesis.  The remaining
    text is tokenised with quotes().
    '''
    tokens = []
    rest = string.lower(s)
    span = parens(rest)
    while span is not None:
        inner_start, inner_end = span
        # inner_start is one past the '(' itself, hence the "- 1"
        tokens = tokens + quotes(rest[:inner_start - 1])
        tokens.append(parse(rest[inner_start:inner_end]))
        rest = rest[inner_end + 1:]
        span = parens(rest)
    return tokens + quotes(rest)
def parse2(q, default_operator,
operator_dict = {AndNot: AndNot, And: And, Or: Or, Near: Near},
ListType=type([]),
):
'''Find operators and operands'''
i = 0
isop=operator_dict.has_key
while (i < len(q)):
if (type(q[i]) is ListType): q[i] = parse2(q[i], default_operator)
# every other item, starting with the first, should be an operand
if ((i % 2) != 0):
# This word should be an operator; if it is not, splice in
# the default operator.
if type(q[i]) is not ListType and isop(q[i]):
q[i] = operator_dict[q[i]]
else: q[i : i] = [ default_operator ]
i = i + 1
def query_hook(self, q):
    """Return the parsed query q unchanged; this lexicon does no rewriting."""
    return q
def parens(s, parens_re = regex.compile('(\|)').search):
index=open_index=paren_count = 0
while 1:
index = parens_re(s, index)
if index < 0 : break
if s[index] == '(':
paren_count = paren_count + 1
if open_index == 0 : open_index = index + 1
else:
paren_count = paren_count - 1
if paren_count == 0:
return open_index, index
else:
index = index + 1
if paren_count == 0: # No parentheses Found
return None
else:
raise QueryError, "Mismatched parentheses"
def quotes(s, ws = (string.whitespace,)):
# split up quoted regions
splitted = ts_regex.split(s, '[%s]*\"[%s]*' % (ws * 2))
split=string.split
if (len(splitted) > 1):
if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"
for i in range(1,len(splitted),2):
# split the quoted region into words
splitted[i] = filter(None, split(splitted[i]))
# put the Proxmity operator in between quoted words
for j in range(1, len(splitted[i])):
splitted[i][j : j] = [ Near ]
for i in range(len(splitted)-1,-1,-2):
# split the non-quoted region into words
splitted[i:i+1] = filter(None, split(splitted[i]))
splitted = filter(None, splitted)
else:
# No quotes, so just split the string into words
splitted = filter(None, split(s))
return splitted
def get_operands(q, i, index, ListType=type([]), StringType=type('')):
'''Evaluate and return the left and right operands for an operator'''
try:
left = q[i - 1]
right = q[i + 1]
except IndexError: raise QueryError, "Malformed query"
t=type(left)
if t is ListType: left = evaluate(left, index)
elif t is StringType: left=index[left]
t=type(right)
if t is ListType: right = evaluate(right, index)
elif t is StringType: right=index[right]
return (left, right)
def evaluate(q, index, ListType=type([])):
'''Evaluate a parsed query'''
## import pdb
## pdb.set_trace()
if (len(q) == 1):
if (type(q[0]) is ListType):
return evaluate(q[0], index)
return index[q[0]]
i = 0
while (i < len(q)):
if q[i] is AndNot:
left, right = get_operands(q, i, index)
val = left.and_not(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is And:
left, right = get_operands(q, i, index)
val = left & right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Or:
left, right = get_operands(q, i, index)
val = left | right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Near:
left, right = get_operands(q, i, index)
val = left.near(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
if (len(q) != 1): raise QueryError, "Malformed query"
return q[0]
stop_words=(
......
......@@ -92,7 +92,7 @@ is no longer known.
"""
__version__='$Revision: 1.20 $'[11:-2]
__version__='$Revision: 1.21 $'[11:-2]
from Globals import Persistent
import BTree, IIBTree, IOBTree, OIBTree
......@@ -105,12 +105,22 @@ from intSet import intSet
import operator
from Splitter import Splitter
from string import strip
import string, regex, regsub, pdb
import string, regex, regsub, ts_regex
from Lexicon import Lexicon, query, stop_word_dict
from Lexicon import Lexicon, stop_word_dict
from ResultList import ResultList
# Operator tokens recognised in parsed queries.  The evaluator compares
# query elements against these objects with "is".
AndNot = 'andnot'
And = 'and'
Or = 'or'
Near = '...'
# String exception raised for malformed queries (mismatched quotes or
# parentheses, missing operands).
QueryError='TextIndex.QueryError'
class UnTextIndex(Persistent, Implicit):
def __init__(self, id=None, ignore_ex=None,
......@@ -160,6 +170,8 @@ class UnTextIndex(Persistent, Implicit):
pass
if lexicon is None:
## if no lexicon is provided, create a dumb one
self._lexicon=Lexicon()
else:
self._lexicon = lexicon
......@@ -365,7 +377,7 @@ class UnTextIndex(Persistent, Implicit):
rr = IIBucket()
try:
for i, score in query(key,self).items():
for i, score in self.query(key).items():
if score:
rr[i] = score
except KeyError:
......@@ -406,7 +418,7 @@ class UnTextIndex(Persistent, Implicit):
def _subindex(self, isrc, d, old, last):
src = self.getLexicon.Splitter(isrc, self._syn)
src = self.getLexicon(self._lexicon).Splitter(isrc, self._syn)
for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
......@@ -417,3 +429,197 @@ class UnTextIndex(Persistent, Implicit):
return last
def query(self, s, default_operator = Or, ws = (string.whitespace,)):
    """Evaluate the query string s against this index.

    The steps are:

      1. " and not " (with any amount of whitespace) is collapsed
         into the single token " andnot ";
      2. parse() splits the string into a nested token list;
      3. the lexicon's query_hook() may transform that list, for
         example to substitute wildcards;
      4. parse2() locates the operators, splicing in
         default_operator where one is missing;
      5. self.evaluate() computes the result.
    """
    andnot_pattern = '[%s]+and[%s]*not[%s]+' % (ws * 3)
    normalized = ts_regex.gsub(andnot_pattern, ' andnot ', s)

    tokens = parse(normalized)
    tokens = self.getLexicon(self._lexicon).query_hook(tokens)

    return self.evaluate(parse2(tokens, default_operator))
def get_operands(self, q, i, ListType=type([]), StringType=type('')):
'''Evaluate and return the left and right operands for an operator'''
try:
left = q[i - 1]
right = q[i + 1]
except IndexError: raise QueryError, "Malformed query"
t=type(left)
if t is ListType: left = evaluate(left, self)
elif t is StringType: left=self[left]
t=type(right)
if t is ListType: right = evaluate(right, self)
elif t is StringType: right=self[right]
return (left, right)
def evaluate(self, q, ListType=type([])):
'''Evaluate a parsed query'''
## import pdb
## pdb.set_trace()
if (len(q) == 1):
if (type(q[0]) is ListType):
return evaluate(q[0], self)
return self[q[0]]
i = 0
while (i < len(q)):
if q[i] is AndNot:
left, right = self.get_operands(q, i)
val = left.and_not(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is And:
left, right = self.get_operands(q, i)
val = left & right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Or:
left, right = self.get_operands(q, i)
val = left | right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Near:
left, right = self.get_operands(q, i)
val = left.near(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
if (len(q) != 1): raise QueryError, "Malformed query"
return q[0]
def parse(s):
    '''Split a query string into a (possibly nested) list of tokens.

    The string is lower-cased.  While parens() finds a parenthesised
    group, the text before it is tokenised with quotes(), the text
    inside it is parsed recursively and appended as a sub-list, and
    scanning continues after the closing parenthesis.  The remaining
    text is tokenised with quotes().
    '''
    tokens = []
    rest = string.lower(s)
    span = parens(rest)
    while span is not None:
        inner_start, inner_end = span
        # inner_start is one past the '(' itself, hence the "- 1"
        tokens = tokens + quotes(rest[:inner_start - 1])
        tokens.append(parse(rest[inner_start:inner_end]))
        rest = rest[inner_end + 1:]
        span = parens(rest)
    return tokens + quotes(rest)
def parse2(q, default_operator,
           operator_dict = {AndNot: AndNot, And: And, Or: Or, Near: Near},
           ListType=type([]),
           ):
    '''Make operands and operators alternate in the token list q.

    Sub-lists are processed recursively.  Elements at odd positions
    are expected to be operators: a non-list element that is a key of
    operator_dict is replaced by the value stored there; otherwise
    default_operator is inserted in front of it.  q is modified in
    place and returned.
    '''
    pos = 0
    while pos < len(q):
        item = q[pos]
        is_sublist = type(item) is ListType
        if is_sublist:
            item = q[pos] = parse2(item, default_operator)

        # even positions hold operands, odd positions hold operators
        if pos % 2:
            if (not is_sublist) and operator_dict.has_key(item):
                q[pos] = operator_dict[item]
            else:
                q[pos:pos] = [ default_operator ]
        pos = pos + 1

    return q
def parens(s, parens_re = regex.compile('(\|)').search):
index=open_index=paren_count = 0
while 1:
index = parens_re(s, index)
if index < 0 : break
if s[index] == '(':
paren_count = paren_count + 1
if open_index == 0 : open_index = index + 1
else:
paren_count = paren_count - 1
if paren_count == 0:
return open_index, index
else:
index = index + 1
if paren_count == 0: # No parentheses Found
return None
else:
raise QueryError, "Mismatched parentheses"
def quotes(s, ws = (string.whitespace,)):
# split up quoted regions
splitted = ts_regex.split(s, '[%s]*\"[%s]*' % (ws * 2))
split=string.split
if (len(splitted) > 1):
if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"
for i in range(1,len(splitted),2):
# split the quoted region into words
splitted[i] = filter(None, split(splitted[i]))
# put the Proxmity operator in between quoted words
for j in range(1, len(splitted[i])):
splitted[i][j : j] = [ Near ]
for i in range(len(splitted)-1,-1,-2):
# split the non-quoted region into words
splitted[i:i+1] = filter(None, split(splitted[i]))
splitted = filter(None, splitted)
else:
# No quotes, so just split the string into words
splitted = filter(None, split(s))
return splitted
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment