Commit d4237698 authored by Michel Pelletier's avatar Michel Pelletier

Lexicon object now provides the splitter, parsing the query language

is still a hack but is much cleaner (i.e., in its own module).
parent a15641e0
...@@ -88,243 +88,28 @@ import regsub ...@@ -88,243 +88,28 @@ import regsub
##AndNot = 'andnot'
##And = 'and'
##Or = 'or'
##Near = '...'
##QueryError='TextIndex.QueryError'
##def query(s, index, default_operator = Or,
## ws = (string.whitespace,)):
## # First replace any occurrences of " and not " with " andnot "
## s = regsub.gsub('[%s]+and[%s]*not[%s]+' % (ws * 3), ' andnot ', s)
## q = parse(s)
## q = parse2(q, default_operator)
## return evaluate(q, index)
##def parse(s):
## '''Parse parentheses and quotes'''
## l = []
## tmp = string.lower(s)
## while (1):
## p = parens(tmp)
## if (p is None):
## # No parentheses found. Look for quotes then exit.
## l = l + quotes(tmp)
## break
## else:
## # Look for quotes in the section of the string before
## # the parentheses, then parse the string inside the parens
## l = l + quotes(tmp[:(p[0] - 1)])
## l.append(parse(tmp[p[0] : p[1]]))
## # continue looking through the rest of the string
## tmp = tmp[(p[1] + 1):]
## return l
##def parse2(q, default_operator,
## operator_dict = {AndNot: AndNot, And: And, Or: Or, Near: Near},
## ListType=type([]),
## ):
## '''Find operators and operands'''
## i = 0
## isop=operator_dict.has_key
## while (i < len(q)):
## if (type(q[i]) is ListType): q[i] = parse2(q[i], default_operator)
## # every other item, starting with the first, should be an operand
## if ((i % 2) != 0):
## # This word should be an operator; if it is not, splice in
## # the default operator.
## if type(q[i]) is not ListType and isop(q[i]):
## q[i] = operator_dict[q[i]]
## else: q[i : i] = [ default_operator ]
## i = i + 1
## return q
##def parens(s, parens_regex = regex.compile("(\|)")):
## '''Find the beginning and end of the first set of parentheses'''
## if (parens_regex.search(s) < 0): return None
## if (parens_regex.group(0) == ")"):
## raise QueryError, "Mismatched parentheses"
## open = parens_regex.regs[0][0] + 1
## start = parens_regex.regs[0][1]
## p = 1
## while (parens_regex.search(s, start) >= 0):
## if (parens_regex.group(0) == ")"): p = p - 1
## else: p = p + 1
## start = parens_regex.regs[0][1]
## if (p == 0): return (open, parens_regex.regs[0][0])
## raise QueryError, "Mismatched parentheses"
##def quotes(s, ws = (string.whitespace,)):
## # split up quoted regions
## splitted = regsub.split(s, '[%s]*\"[%s]*' % (ws * 2))
## split=string.split
## if (len(splitted) > 1):
## if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"
## for i in range(1,len(splitted),2):
## # split the quoted region into words
## splitted[i] = filter(None, split(splitted[i]))
## # put the Proximity operator in between quoted words
## for j in range(1, len(splitted[i])):
## splitted[i][j : j] = [ Near ]
## for i in range(len(splitted)-1,-1,-2):
## # split the non-quoted region into words
## splitted[i:i+1] = filter(None, split(splitted[i]))
## splitted = filter(None, splitted)
## else:
## # No quotes, so just split the string into words
## splitted = filter(None, split(s))
## return splitted
##def get_operands(q, i, index, ListType=type([]), StringType=type('')):
## '''Evaluate and return the left and right operands for an operator'''
## try:
## left = q[i - 1]
## right = q[i + 1]
## except IndexError: raise QueryError, "Malformed query"
## t=type(left)
## if t is ListType: left = evaluate(left, index)
## elif t is StringType: left=index[left]
## t=type(right)
## if t is ListType: right = evaluate(right, index)
## elif t is StringType: right=index[right]
## return (left, right)
##def evaluate(q, index,ListType=type([])):
## '''Evaluate a parsed query'''
## if (len(q) == 1):
## if (type(q[0]) is ListType):
## return evaluate(q[0], index)
## return index[q[0]]
## i = 0
## while (i < len(q)):
## if q[i] is AndNot:
## left, right = get_operands(q, i, index)
## val = left.and_not(right)
## q[(i - 1) : (i + 2)] = [ val ]
## else: i = i + 1
## i = 0
## while (i < len(q)):
## if q[i] is And:
## left, right = get_operands(q, i, index)
## val = left & right
## q[(i - 1) : (i + 2)] = [ val ]
## else: i = i + 1
## i = 0
## while (i < len(q)):
## if q[i] is Or:
## left, right = get_operands(q, i, index)
## val = left | right
## q[(i - 1) : (i + 2)] = [ val ]
## else: i = i + 1
## i = 0
## while (i < len(q)):
## if q[i] is Near:
## left, right = get_operands(q, i, index)
## val = left.near(right)
## q[(i - 1) : (i + 2)] = [ val ]
## else: i = i + 1
## if (len(q) != 1): raise QueryError, "Malformed query"
## return q[0]
##stop_words=(
## 'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across',
## 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone',
## 'along', 'already', 'also', 'although', 'always', 'am', 'among',
## 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any',
## 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around',
## 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes',
## 'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
## 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
## 'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could',
## 'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due',
## 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else',
## 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone',
## 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty',
## 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly',
## 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get',
## 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her',
## 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
## 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i',
## 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it',
## 'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least',
## 'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill',
## 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must',
## 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless',
## 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not',
## 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
## 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our',
## 'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps',
## 'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem',
## 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should',
## 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some',
## 'somehow', 'someone', 'something', 'sometime', 'sometimes',
## 'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the',
## 'their', 'them', 'themselves', 'then', 'thence', 'there',
## 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these',
## 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three',
## 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too',
## 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under',
## 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well',
## 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
## 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
## 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
## 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
## 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves',
## )
##stop_word_dict={}
##for word in stop_words: stop_word_dict[word]=None
__doc__=""" Module breaks out Zope specific methods and behavior. In __doc__=""" Module breaks out Zope specific methods and behavior. In
addition, provides the Lexicon class which defines a word to integer addition, provides the Lexicon class which defines a word to integer
mapping. mapping.
""" """
from Splitter import Splitter
from Persistence import Persistent from Persistence import Persistent
from Acquisition import Implicit from Acquisition import Implicit
import OIBTree import OIBTree
OIBTree=OIBTree.BTree OIBTree=OIBTree.BTree
class Lexicon(Persistent, Implicit): class Lexicon(Persistent, Implicit):
""" maps words to word ids """ """ maps words to word ids and then some
The Lexicon object is an attempt to abstract vocabularies out of
Text indexes. This abstraction is not totally cooked yet, this
module still includes the parser for the 'Text Index Query
Language' and a few other hacks.
"""
def __init__(self): def __init__(self):
self._lexicon = OIBTree() self._lexicon = OIBTree()
...@@ -344,8 +129,14 @@ class Lexicon(Persistent, Implicit): ...@@ -344,8 +129,14 @@ class Lexicon(Persistent, Implicit):
self._lexicon[intern(word)] = self.counter self._lexicon[intern(word)] = self.counter
self.counter = self.counter + 1 self.counter = self.counter + 1
return self.counter return self.counter
def Splitter(self, astring, words):
    """Wrap the module-level ``Splitter`` so callers can go through the lexicon.

    Both arguments are forwarded unchanged, in order, and the resulting
    splitter object is returned as-is.
    """
    # Inside a method body the bare name ``Splitter`` resolves to the
    # module-level import, not to this method, so this does not recurse.
    splitter = Splitter(astring, words)
    return splitter
AndNot = 'andnot' AndNot = 'andnot'
And = 'and' And = 'and'
Or = 'or' Or = 'or'
...@@ -570,3 +361,7 @@ stop_words=( ...@@ -570,3 +361,7 @@ stop_words=(
) )
stop_word_dict={} stop_word_dict={}
for word in stop_words: stop_word_dict[word]=None for word in stop_words: stop_word_dict[word]=None
...@@ -92,7 +92,7 @@ is no longer known. ...@@ -92,7 +92,7 @@ is no longer known.
""" """
__version__='$Revision: 1.13 $'[11:-2] __version__='$Revision: 1.14 $'[11:-2]
from Globals import Persistent from Globals import Persistent
import BTree, IIBTree, IOBTree, OIBTree import BTree, IIBTree, IOBTree, OIBTree
...@@ -207,7 +207,7 @@ class UnTextIndex(Persistent): ...@@ -207,7 +207,7 @@ class UnTextIndex(Persistent):
## The Splitter should now be european compliant at least. ## The Splitter should now be european compliant at least.
## Someone should test this. ## Someone should test this.
src = Splitter(k, self._syn) src = self._lexicon.Splitter(k, self._syn)
## This returns a tuple of stemmed words. Stopwords have been ## This returns a tuple of stemmed words. Stopwords have been
## stripped. ## stripped.
...@@ -291,7 +291,7 @@ class UnTextIndex(Persistent): ...@@ -291,7 +291,7 @@ class UnTextIndex(Persistent):
def __getitem__(self, word): def __getitem__(self, word):
"""Return an InvertedIndex-style result "list" """Return an InvertedIndex-style result "list"
""" """
src = tuple(Splitter(word, self._syn)) src = tuple(self._lexicon.Splitter(word, self._syn))
if not src: if not src:
return ResultList({}, (word,), self) return ResultList({}, (word,), self)
...@@ -387,13 +387,13 @@ class UnTextIndex(Persistent): ...@@ -387,13 +387,13 @@ class UnTextIndex(Persistent):
r = [] r = []
for word in words: for word in words:
r = r+Splitter(doc, self._syn).indexes(word) r = r+self._lexicon.Splitter(doc, self._syn).indexes(word)
return r return r
def _subindex(self, isrc, d, old, last): def _subindex(self, isrc, d, old, last):
src = Splitter(isrc, self._syn) src = self._lexicon.Splitter(isrc, self._syn)
for s in src: for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last) if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment