Commit 8e6e5acb authored by Michel Pelletier's avatar Michel Pelletier

Unscrewed Globbing dependencies in text index, fixed non-globbing

vocabularies. Redid the Lexicon interface.
parent ea93883d
...@@ -533,6 +533,8 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base): ...@@ -533,6 +533,8 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
type(''): Query.String, type(''): Query.String,
}, **kw): }, **kw):
# Get search arguments: # Get search arguments:
if REQUEST is None and not kw: if REQUEST is None and not kw:
try: REQUEST=self.REQUEST try: REQUEST=self.REQUEST
......
...@@ -156,8 +156,11 @@ class Vocabulary(Item, Persistent, Implicit): ...@@ -156,8 +156,11 @@ class Vocabulary(Item, Persistent, Implicit):
def query(self, pattern): def query(self, pattern):
""" """ """ """
result = [] result = []
for x in self.lexicon.query(pattern): for x in self.lexicon.get(pattern):
result.append(self.lexicon._inverseLex[x]) if self.globbing:
result.append(self.lexicon._inverseLex[x])
else:
result.append(pattern)
return result return result
......
...@@ -471,7 +471,7 @@ class ZCatalog(Folder, Persistent, Implicit): ...@@ -471,7 +471,7 @@ class ZCatalog(Folder, Persistent, Implicit):
Search terms can be passed in the REQUEST or as keyword Search terms can be passed in the REQUEST or as keyword
arguments. arguments.
""" """
return apply(self._catalog.searchResults, return apply(self._catalog.searchResults,
(REQUEST,used, query_map), kw) (REQUEST,used, query_map), kw)
......
...@@ -103,6 +103,7 @@ OOBTree=BTree.BTree ...@@ -103,6 +103,7 @@ OOBTree=BTree.BTree
IOBTree=IOBTree.BTree IOBTree=IOBTree.BTree
import re import re
from UnTextIndex import Or
class GlobbingLexicon(Lexicon): class GlobbingLexicon(Lexicon):
""" """
...@@ -160,7 +161,7 @@ class GlobbingLexicon(Lexicon): ...@@ -160,7 +161,7 @@ class GlobbingLexicon(Lexicon):
return self.counter return self.counter
def query(self, pattern): def get(self, pattern):
""" Query the lexicon for words matching a pattern. """ Query the lexicon for words matching a pattern.
""" """
...@@ -218,26 +219,45 @@ class GlobbingLexicon(Lexicon): ...@@ -218,26 +219,45 @@ class GlobbingLexicon(Lexicon):
def __getitem__(self, word): def __getitem__(self, word):
""" """ """ """
return self.query(word) return self.get(word)
def query_hook(self, q):
    """Expand glob wildcards in a parsed query.

    Each term containing a wildcard character is replaced by the
    Or-joined list of lexicon words matching that pattern; terms
    without wildcards pass through unchanged.
    """
    expanded = []
    for term in q:
        has_wildcard = (self.multi_wc in term) or (self.single_wc in term)
        if not has_wildcard:
            expanded.append(term)
            continue
        # Splice every matching word in, joined by explicit Or tokens.
        for wid in self.get(term):
            if expanded:
                expanded.append(Or)
            expanded.append(self._inverseLex[wid])
    return expanded
def translate(self, pat): def translate(self, pat):
"""Translate a PATTERN to a regular expression. """Translate a PATTERN to a regular expression.
There is no way to quote meta-characters. There is no way to quote meta-characters.
""" """
i, n = 0, len(pat) i, n = 0, len(pat)
res = '' res = ''
while i < n: while i < n:
c = pat[i] c = pat[i]
i = i+1 i = i+1
if c == self.multi_wc: if c == self.multi_wc:
res = res + '.*' res = res + '.*'
elif c == self.single_wc: elif c == self.single_wc:
res = res + '.' res = res + '.'
else: else:
res = res + re.escape(c) res = res + re.escape(c)
return res + "$" return res + "$"
......
...@@ -134,7 +134,10 @@ class Lexicon(Persistent, Implicit): ...@@ -134,7 +134,10 @@ class Lexicon(Persistent, Implicit):
def get(self, key): def get(self, key):
""" """ """ """
return list(self._lexicon[key]) return [self._lexicon[key]]
def __getitem__(self, key):
    # Mapping-style access: lex[word] is an alias for lex.get(word).
    return self.get(key)
def __len__(self): def __len__(self):
return len(self._lexicon) return len(self._lexicon)
...@@ -157,203 +160,12 @@ class Lexicon(Persistent, Implicit): ...@@ -157,203 +160,12 @@ class Lexicon(Persistent, Implicit):
hits.append(x) hits.append(x)
return hits return hits
def query_hook(self, q):
    """Hook letting a lexicon rewrite a parsed query.

    The plain (non-globbing) Lexicon performs no transformation,
    so the query is handed back untouched.
    """
    return q
# Query-language operator tokens.  These are module-level string
# singletons so that parsed queries can compare them with 'is'.
AndNot = 'andnot'
And = 'and'
Or = 'or'
Near = '...'   # proximity operator, inserted between quoted words

# Old-style (string) exception raised for malformed queries.
QueryError='TextIndex.QueryError'
def query(s, index, default_operator = Or,
          ws = (string.whitespace,)):
    """Parse the query string `s` and evaluate it against `index`."""
    # First replace any occurences of " and not " with " andnot "
    # so the two-word form is tokenised as the single AndNot operator.
    s = ts_regex.gsub('[%s]+and[%s]*not[%s]+' % (ws * 3), ' andnot ', s)
    q = parse(s)                       # split on parens/quotes into nested lists
    q = parse_wc(q, index)             # expand glob wildcards via the lexicon
    q = parse2(q, default_operator)    # make operands/operators alternate
    return evaluate(q, index)          # reduce to a single result set
def parse_wc(q, index):
    '''expand wildcards

    Replace each word in the parsed query `q` that contains one of the
    lexicon's wildcard characters with the Or-joined list of matching
    words; other items pass through unchanged.
    '''
    lex = index.getLexicon(index._lexicon)
    words = []
    for w in q:
        # w is a glob pattern if it holds either wildcard character
        if ( (lex.multi_wc in w) or
             (lex.single_wc in w) ):
            wids = lex.query(w)   # word ids matching the pattern
            for wid in wids:
                if words:
                    # join successive expansions with an explicit Or
                    words.append(Or)
                words.append(lex._inverseLex[wid])
        else:
            # NOTE(review): nested sub-queries (lists produced by
            # parse()) fall through here without expansion -- confirm
            # that is intended.
            words.append(w)
    return words
def parse(s):
    '''Parse parentheses and quotes

    Returns a list of words and operator tokens; a parenthesised
    sub-query becomes a nested list.  Matching is case-insensitive.
    '''
    l = []
    tmp = string.lower(s)   # queries are case-insensitive
    while (1):
        p = parens(tmp)
        if (p is None):
            # No parentheses found. Look for quotes then exit.
            l = l + quotes(tmp)
            break
        else:
            # Look for quotes in the section of the string before
            # the parentheses, then parse the string inside the parens
            # (recursively, yielding a nested list).
            l = l + quotes(tmp[:(p[0] - 1)])
            l.append(parse(tmp[p[0] : p[1]]))

            # continue looking through the rest of the string
            tmp = tmp[(p[1] + 1):]
    return l
def parse2(q, default_operator,
           operator_dict = {AndNot: AndNot, And: And, Or: Or, Near: Near},
           ListType=type([]),
           ):
    '''Find operators and operands

    Rewrites `q` in place so that operands (even positions) and
    operator tokens (odd positions) strictly alternate, splicing in
    `default_operator` wherever two operands are adjacent.  Nested
    lists are processed recursively.
    '''
    i = 0
    isop=operator_dict.has_key
    while (i < len(q)):
        if (type(q[i]) is ListType): q[i] = parse2(q[i], default_operator)

        # every other item, starting with the first, should be an operand
        if ((i % 2) != 0):
            # This word should be an operator; if it is not, splice in
            # the default operator.
            if type(q[i]) is not ListType and isop(q[i]):
                q[i] = operator_dict[q[i]]
            else: q[i : i] = [ default_operator ]
        i = i + 1
    return q
def parens(s, parens_re = regex.compile('(\|)').search):
    # Locate the first balanced parenthesised region of s.  Returns
    # (open_index, close_index) such that s[open_index:close_index] is
    # the text inside the parens, or None when s has no parentheses.
    # Raises QueryError on unbalanced parens.  Under the old
    # emacs-style `regex` module, '(\|)' matches a literal '(' or ')'.
    index=open_index=paren_count = 0

    while 1:
        index = parens_re(s, index)
        if index < 0 : break   # no more parens in s

        if s[index] == '(':
            paren_count = paren_count + 1
            if open_index == 0 : open_index = index + 1
            # NOTE(review): `index` is not advanced in this branch; if
            # parens_re(s, pos) can match at `pos` itself this loops
            # forever on an opening paren -- confirm against the old
            # `regex` module's search semantics.
        else:
            paren_count = paren_count - 1
            if paren_count == 0:
                return open_index, index
            else:
                index = index + 1

    if paren_count == 0: # No parentheses Found
        return None
    else:
        raise QueryError, "Mismatched parentheses"
def quotes(s, ws = (string.whitespace,)):
    # Split s into words, turning each double-quoted phrase into a
    # word sequence joined by the Near (proximity) operator.
    # split up quoted regions
    splitted = ts_regex.split(s, '[%s]*\"[%s]*' % (ws * 2))
    split=string.split

    if (len(splitted) > 1):
        # text alternates unquoted/quoted; an even count means a
        # dangling quote character
        if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"

        for i in range(1,len(splitted),2):
            # split the quoted region into words
            splitted[i] = filter(None, split(splitted[i]))

            # put the Proxmity operator in between quoted words
            for j in range(1, len(splitted[i])):
                splitted[i][j : j] = [ Near ]

        for i in range(len(splitted)-1,-1,-2):
            # split the non-quoted region into words
            splitted[i:i+1] = filter(None, split(splitted[i]))

        splitted = filter(None, splitted)
    else:
        # No quotes, so just split the string into words
        splitted = filter(None, split(s))

    return splitted
def get_operands(q, i, index, ListType=type([]), StringType=type('')):
    '''Evaluate and return the left and right operands for an operator

    q[i] is an operator token; q[i-1] and q[i+1] are its operands.  A
    list operand is a parenthesised sub-query and is evaluated
    recursively; a string operand is a word, looked up via index[word].
    '''
    try:
        left = q[i - 1]
        right = q[i + 1]
    except IndexError: raise QueryError, "Malformed query"

    t=type(left)
    if t is ListType: left = evaluate(left, index)
    elif t is StringType: left=index[left]

    t=type(right)
    if t is ListType: right = evaluate(right, index)
    elif t is StringType: right=index[right]

    return (left, right)
def evaluate(q, index, ListType=type([])):
    '''Evaluate a parsed query

    `q` alternates operands and operator tokens (see parse2).  The
    operators are applied in precedence order AndNot, And, Or, Near;
    each pass collapses an operator and its two operands into a single
    result-set value, until one value remains.
    '''
    # A single-item query is either a nested sub-query or a bare word.
    if (len(q) == 1):
        if (type(q[0]) is ListType):
            return evaluate(q[0], index)
        return index[q[0]]

    # AndNot pass (highest precedence)
    i = 0
    while (i < len(q)):
        if q[i] is AndNot:
            left, right = get_operands(q, i, index)
            val = left.and_not(right)
            q[(i - 1) : (i + 2)] = [ val ]
        else: i = i + 1

    # And pass
    i = 0
    while (i < len(q)):
        if q[i] is And:
            left, right = get_operands(q, i, index)
            val = left & right
            q[(i - 1) : (i + 2)] = [ val ]
        else: i = i + 1

    # Or pass
    i = 0
    while (i < len(q)):
        if q[i] is Or:
            left, right = get_operands(q, i, index)
            val = left | right
            q[(i - 1) : (i + 2)] = [ val ]
        else: i = i + 1

    # Near (proximity) pass (lowest precedence)
    i = 0
    while (i < len(q)):
        if q[i] is Near:
            left, right = get_operands(q, i, index)
            val = left.near(right)
            q[(i - 1) : (i + 2)] = [ val ]
        else: i = i + 1

    if (len(q) != 1): raise QueryError, "Malformed query"
    return q[0]
stop_words=( stop_words=(
......
...@@ -92,7 +92,7 @@ is no longer known. ...@@ -92,7 +92,7 @@ is no longer known.
""" """
__version__='$Revision: 1.20 $'[11:-2] __version__='$Revision: 1.21 $'[11:-2]
from Globals import Persistent from Globals import Persistent
import BTree, IIBTree, IOBTree, OIBTree import BTree, IIBTree, IOBTree, OIBTree
...@@ -105,12 +105,22 @@ from intSet import intSet ...@@ -105,12 +105,22 @@ from intSet import intSet
import operator import operator
from Splitter import Splitter from Splitter import Splitter
from string import strip from string import strip
import string, regex, regsub, pdb import string, regex, regsub, ts_regex
from Lexicon import Lexicon, query, stop_word_dict
from Lexicon import Lexicon, stop_word_dict
from ResultList import ResultList from ResultList import ResultList
# Query-language operator tokens.  These are module-level string
# singletons so that parsed queries can compare them with 'is'.
AndNot = 'andnot'
And = 'and'
Or = 'or'
Near = '...'   # proximity operator, inserted between quoted words

# Old-style (string) exception raised for malformed queries.
QueryError='TextIndex.QueryError'
class UnTextIndex(Persistent, Implicit): class UnTextIndex(Persistent, Implicit):
def __init__(self, id=None, ignore_ex=None, def __init__(self, id=None, ignore_ex=None,
...@@ -160,6 +170,8 @@ class UnTextIndex(Persistent, Implicit): ...@@ -160,6 +170,8 @@ class UnTextIndex(Persistent, Implicit):
pass pass
if lexicon is None: if lexicon is None:
## if no lexicon is provided, create a dumb one
self._lexicon=Lexicon() self._lexicon=Lexicon()
else: else:
self._lexicon = lexicon self._lexicon = lexicon
...@@ -365,7 +377,7 @@ class UnTextIndex(Persistent, Implicit): ...@@ -365,7 +377,7 @@ class UnTextIndex(Persistent, Implicit):
rr = IIBucket() rr = IIBucket()
try: try:
for i, score in query(key,self).items(): for i, score in self.query(key).items():
if score: if score:
rr[i] = score rr[i] = score
except KeyError: except KeyError:
...@@ -406,7 +418,7 @@ class UnTextIndex(Persistent, Implicit): ...@@ -406,7 +418,7 @@ class UnTextIndex(Persistent, Implicit):
def _subindex(self, isrc, d, old, last): def _subindex(self, isrc, d, old, last):
src = self.getLexicon.Splitter(isrc, self._syn) src = self.getLexicon(self._lexicon).Splitter(isrc, self._syn)
for s in src: for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last) if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
...@@ -417,3 +429,197 @@ class UnTextIndex(Persistent, Implicit): ...@@ -417,3 +429,197 @@ class UnTextIndex(Persistent, Implicit):
return last return last
def query(self, s, default_operator = Or, ws = (string.whitespace,)):
    """Parse and evaluate the query string `s` against this index.

    Called by TextIndexes.  The string is split on parentheses and
    quotes, handed to the lexicon's query_hook for transformation
    (e.g. wildcard expansion), normalised into alternating
    operand/operator form, and finally evaluated to a result set.
    """
    # First replace any occurences of " and not " with " andnot "
    # so the two-word form is tokenised as the single AndNot operator.
    s = ts_regex.gsub('[%s]+and[%s]*not[%s]+' % (ws * 3), ' andnot ', s)

    # do some parsing: split on parens/quotes into a (possibly
    # nested) list of words and operator tokens
    q = parse(s)

    ## here, we give lexicons a chance to transform the query.
    ## For example, substitute wildcards, or translate words into
    ## various languages.
    q = self.getLexicon(self._lexicon).query_hook(q)

    # do some more parsing: make operands and operators strictly
    # alternate, splicing in default_operator where needed
    q = parse2(q, default_operator)

    ## evalute the final 'expression'
    return self.evaluate(q)
def get_operands(self, q, i, ListType=type([]), StringType=type('')):
'''Evaluate and return the left and right operands for an operator'''
try:
left = q[i - 1]
right = q[i + 1]
except IndexError: raise QueryError, "Malformed query"
t=type(left)
if t is ListType: left = evaluate(left, self)
elif t is StringType: left=self[left]
t=type(right)
if t is ListType: right = evaluate(right, self)
elif t is StringType: right=self[right]
return (left, right)
def evaluate(self, q, ListType=type([])):
'''Evaluate a parsed query'''
## import pdb
## pdb.set_trace()
if (len(q) == 1):
if (type(q[0]) is ListType):
return evaluate(q[0], self)
return self[q[0]]
i = 0
while (i < len(q)):
if q[i] is AndNot:
left, right = self.get_operands(q, i)
val = left.and_not(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is And:
left, right = self.get_operands(q, i)
val = left & right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Or:
left, right = self.get_operands(q, i)
val = left | right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Near:
left, right = self.get_operands(q, i)
val = left.near(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
if (len(q) != 1): raise QueryError, "Malformed query"
return q[0]
def parse(s):
    '''Parse parentheses and quotes

    Returns a list of words and operator tokens; a parenthesised
    sub-query becomes a nested list.  Matching is case-insensitive.
    '''
    l = []
    tmp = string.lower(s)   # queries are case-insensitive
    while (1):
        p = parens(tmp)
        if (p is None):
            # No parentheses found. Look for quotes then exit.
            l = l + quotes(tmp)
            break
        else:
            # Look for quotes in the section of the string before
            # the parentheses, then parse the string inside the parens
            # (recursively, yielding a nested list).
            l = l + quotes(tmp[:(p[0] - 1)])
            l.append(parse(tmp[p[0] : p[1]]))

            # continue looking through the rest of the string
            tmp = tmp[(p[1] + 1):]
    return l
def parse2(q, default_operator,
           operator_dict = {AndNot: AndNot, And: And, Or: Or, Near: Near},
           ListType=type([]),
           ):
    '''Find operators and operands

    Rewrites `q` in place so that operands (even positions) and
    operator tokens (odd positions) strictly alternate, splicing in
    `default_operator` wherever two operands are adjacent.  Nested
    lists are processed recursively.
    '''
    i = 0
    isop=operator_dict.has_key
    while (i < len(q)):
        if (type(q[i]) is ListType): q[i] = parse2(q[i], default_operator)

        # every other item, starting with the first, should be an operand
        if ((i % 2) != 0):
            # This word should be an operator; if it is not, splice in
            # the default operator.
            if type(q[i]) is not ListType and isop(q[i]):
                q[i] = operator_dict[q[i]]
            else: q[i : i] = [ default_operator ]
        i = i + 1
    return q
def parens(s, parens_re = regex.compile('(\|)').search):
    # Locate the first balanced parenthesised region of s.  Returns
    # (open_index, close_index) such that s[open_index:close_index] is
    # the text inside the parens, or None when s has no parentheses.
    # Raises QueryError on unbalanced parens.  Under the old
    # emacs-style `regex` module, '(\|)' matches a literal '(' or ')'.
    index=open_index=paren_count = 0

    while 1:
        index = parens_re(s, index)
        if index < 0 : break   # no more parens in s

        if s[index] == '(':
            paren_count = paren_count + 1
            if open_index == 0 : open_index = index + 1
            # NOTE(review): `index` is not advanced in this branch; if
            # parens_re(s, pos) can match at `pos` itself this loops
            # forever on an opening paren -- confirm against the old
            # `regex` module's search semantics.
        else:
            paren_count = paren_count - 1
            if paren_count == 0:
                return open_index, index
            else:
                index = index + 1

    if paren_count == 0: # No parentheses Found
        return None
    else:
        raise QueryError, "Mismatched parentheses"
def quotes(s, ws = (string.whitespace,)):
    # Split s into words, turning each double-quoted phrase into a
    # word sequence joined by the Near (proximity) operator.
    # split up quoted regions
    splitted = ts_regex.split(s, '[%s]*\"[%s]*' % (ws * 2))
    split=string.split

    if (len(splitted) > 1):
        # text alternates unquoted/quoted; an even count means a
        # dangling quote character
        if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"

        for i in range(1,len(splitted),2):
            # split the quoted region into words
            splitted[i] = filter(None, split(splitted[i]))

            # put the Proxmity operator in between quoted words
            for j in range(1, len(splitted[i])):
                splitted[i][j : j] = [ Near ]

        for i in range(len(splitted)-1,-1,-2):
            # split the non-quoted region into words
            splitted[i:i+1] = filter(None, split(splitted[i]))

        splitted = filter(None, splitted)
    else:
        # No quotes, so just split the string into words
        splitted = filter(None, split(s))

    return splitted
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment