Commit d4237698 authored by Michel Pelletier

Lexicon object now provides the splitter

Parsing the query language is still a hack, but it is much cleaner now
(i.e., in its own module).
parent a15641e0
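
In short: a text index now asks its lexicon for a word splitter instead of
importing one directly. A minimal sketch of that delegation, using simplified
stand-ins (SimpleSplitter and SimpleLexicon are illustrative, not the actual
Zope classes; the sketch treats the second argument as a plain skip list,
which is a simplification of what the real Splitter does with self._syn):

# Minimal sketch of the delegation this commit introduces. These are
# simplified stand-ins, not the actual Zope classes.

class SimpleSplitter:
    """Break a string into lowercase words, skipping listed words."""
    def __init__(self, astring, words=()):
        self._words = [w for w in astring.lower().split() if w not in words]

    def __iter__(self):
        return iter(self._words)

class SimpleLexicon:
    """Owns the vocabulary, and now also hands out the splitter."""
    def Splitter(self, astring, words):
        # Mirrors Lexicon.Splitter in the diff below: a thin wrapper,
        # so the lexicon controls how its words are tokenized.
        return SimpleSplitter(astring, words)

lex = SimpleLexicon()
print(list(lex.Splitter("The Lexicon now provides the splitter", ("the",))))
# -> ['lexicon', 'now', 'provides', 'splitter']
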
@@ -88,243 +88,28 @@ import regsub
##AndNot = 'andnot'
##And = 'and'
##Or = 'or'
##Near = '...'
##QueryError='TextIndex.QueryError'
##def query(s, index, default_operator = Or,
## ws = (string.whitespace,)):
## # First replace any occurrences of " and not " with " andnot "
## s = regsub.gsub('[%s]+and[%s]*not[%s]+' % (ws * 3), ' andnot ', s)
## q = parse(s)
## q = parse2(q, default_operator)
## return evaluate(q, index)
##def parse(s):
## '''Parse parentheses and quotes'''
## l = []
## tmp = string.lower(s)
## while (1):
## p = parens(tmp)
## if (p is None):
## # No parentheses found. Look for quotes then exit.
## l = l + quotes(tmp)
## break
## else:
## # Look for quotes in the section of the string before
## # the parentheses, then parse the string inside the parens
## l = l + quotes(tmp[:(p[0] - 1)])
## l.append(parse(tmp[p[0] : p[1]]))
## # continue looking through the rest of the string
## tmp = tmp[(p[1] + 1):]
## return l
##def parse2(q, default_operator,
## operator_dict = {AndNot: AndNot, And: And, Or: Or, Near: Near},
## ListType=type([]),
## ):
## '''Find operators and operands'''
## i = 0
## isop=operator_dict.has_key
## while (i < len(q)):
## if (type(q[i]) is ListType): q[i] = parse2(q[i], default_operator)
## # every other item, starting with the first, should be an operand
## if ((i % 2) != 0):
## # This word should be an operator; if it is not, splice in
## # the default operator.
## if type(q[i]) is not ListType and isop(q[i]):
## q[i] = operator_dict[q[i]]
## else: q[i : i] = [ default_operator ]
## i = i + 1
## return q
##def parens(s, parens_regex = regex.compile("(\|)")):
## '''Find the beginning and end of the first set of parentheses'''
## if (parens_regex.search(s) < 0): return None
## if (parens_regex.group(0) == ")"):
## raise QueryError, "Mismatched parentheses"
## open = parens_regex.regs[0][0] + 1
## start = parens_regex.regs[0][1]
## p = 1
## while (parens_regex.search(s, start) >= 0):
## if (parens_regex.group(0) == ")"): p = p - 1
## else: p = p + 1
## start = parens_regex.regs[0][1]
## if (p == 0): return (open, parens_regex.regs[0][0])
## raise QueryError, "Mismatched parentheses"
##def quotes(s, ws = (string.whitespace,)):
## # split up quoted regions
## splitted = regsub.split(s, '[%s]*\"[%s]*' % (ws * 2))
## split=string.split
## if (len(splitted) > 1):
## if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"
## for i in range(1,len(splitted),2):
## # split the quoted region into words
## splitted[i] = filter(None, split(splitted[i]))
## # put the Proximity operator between quoted words
## for j in range(1, len(splitted[i])):
## splitted[i][j : j] = [ Near ]
## for i in range(len(splitted)-1,-1,-2):
## # split the non-quoted region into words
## splitted[i:i+1] = filter(None, split(splitted[i]))
## splitted = filter(None, splitted)
## else:
## # No quotes, so just split the string into words
## splitted = filter(None, split(s))
## return splitted
##def get_operands(q, i, index, ListType=type([]), StringType=type('')):
## '''Evaluate and return the left and right operands for an operator'''
## try:
## left = q[i - 1]
## right = q[i + 1]
## except IndexError: raise QueryError, "Malformed query"
## t=type(left)
## if t is ListType: left = evaluate(left, index)
## elif t is StringType: left=index[left]
## t=type(right)
## if t is ListType: right = evaluate(right, index)
## elif t is StringType: right=index[right]
## return (left, right)
##def evaluate(q, index,ListType=type([])):
## '''Evaluate a parsed query'''
## if (len(q) == 1):
## if (type(q[0]) is ListType):
## return evaluate(q[0], index)
## return index[q[0]]
## i = 0
## while (i < len(q)):
## if q[i] is AndNot:
## left, right = get_operands(q, i, index)
## val = left.and_not(right)
## q[(i - 1) : (i + 2)] = [ val ]
## else: i = i + 1
## i = 0
## while (i < len(q)):
## if q[i] is And:
## left, right = get_operands(q, i, index)
## val = left & right
## q[(i - 1) : (i + 2)] = [ val ]
## else: i = i + 1
## i = 0
## while (i < len(q)):
## if q[i] is Or:
## left, right = get_operands(q, i, index)
## val = left | right
## q[(i - 1) : (i + 2)] = [ val ]
## else: i = i + 1
## i = 0
## while (i < len(q)):
## if q[i] is Near:
## left, right = get_operands(q, i, index)
## val = left.near(right)
## q[(i - 1) : (i + 2)] = [ val ]
## else: i = i + 1
## if (len(q) != 1): raise QueryError, "Malformed query"
## return q[0]
##stop_words=(
## 'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across',
## 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone',
## 'along', 'already', 'also', 'although', 'always', 'am', 'among',
## 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any',
## 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around',
## 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes',
## 'becoming', 'been', 'before', 'beforehand', 'behind', 'being',
## 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
## 'bottom', 'but', 'by', 'can', 'cannot', 'cant', 'con', 'could',
## 'couldnt', 'cry', 'describe', 'detail', 'do', 'done', 'down', 'due',
## 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else',
## 'elsewhere', 'empty', 'enough', 'even', 'ever', 'every', 'everyone',
## 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty',
## 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly',
## 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get',
## 'give', 'go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her',
## 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers',
## 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i',
## 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'it',
## 'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least',
## 'less', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill',
## 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must',
## 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless',
## 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not',
## 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
## 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our',
## 'ours', 'ourselves', 'out', 'over', 'own', 'per', 'perhaps',
## 'please', 'pre', 'put', 'rather', 're', 'same', 'see', 'seem',
## 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should',
## 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some',
## 'somehow', 'someone', 'something', 'sometime', 'sometimes',
## 'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the',
## 'their', 'them', 'themselves', 'then', 'thence', 'there',
## 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these',
## 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three',
## 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too',
## 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under',
## 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we', 'well',
## 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where',
## 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon',
## 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever',
## 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without',
## 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves',
## )
##stop_word_dict={}
##for word in stop_words: stop_word_dict[word]=None
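
The deleted evaluate above collapses one operator per pass over the token
list, and each pass finishes before the next begins, so the pass order
(AndNot, And, Or, Near) is also the operator precedence. A condensed sketch
of that multi-pass scheme, with plain Python sets standing in for the index's
result lists and near() approximated as intersection:

# Condensed sketch of the deleted parser's multi-pass evaluation.
# Plain sets stand in for the index's result lists.
def evaluate_flat(tokens):
    ops = {
        'andnot': lambda l, r: l - r,   # l.and_not(r) in the original
        'and':    lambda l, r: l & r,
        'or':     lambda l, r: l | r,
        '...':    lambda l, r: l & r,   # stand-in for l.near(r)
    }
    for op in ('andnot', 'and', 'or', '...'):  # pass order == precedence
        i = 0
        while i < len(tokens):
            if tokens[i] == op:
                # splice operand-operator-operand down to a single value
                tokens[i - 1:i + 2] = [ops[op](tokens[i - 1], tokens[i + 1])]
            else:
                i = i + 1
    return tokens[0]

# 'and' runs in an earlier pass than 'or', so it binds tighter:
print(evaluate_flat([{1, 2}, 'or', {2, 3}, 'and', {3}]))  # {1, 2, 3}
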
__doc__=""" Module breaks out Zope specific methods and behavior. In
addition, provides the Lexicon class which defines a word to integer
mapping.
"""
from Splitter import Splitter
from Persistence import Persistent
from Acquisition import Implicit
import OIBTree
OIBTree=OIBTree.BTree
class Lexicon(Persistent, Implicit):
""" maps words to word ids """
""" maps words to word ids and then some
The Lexicon object is an attempt to abstract vocabularies out of
Text indexes. This abstraction is not totally cooked yet; this
module still includes the parser for the 'Text Index Query
Language' and a few other hacks.
"""
def __init__(self):
self._lexicon = OIBTree()
@@ -344,8 +129,14 @@ class Lexicon(Persistent, Implicit):
self._lexicon[intern(word)] = self.counter
self.counter = self.counter + 1
return self.counter
def Splitter(self, astring, words):
""" wrap the splitter """
return Splitter(astring, words)
AndNot = 'andnot'
And = 'and'
Or = 'or'
@@ -570,3 +361,7 @@ stop_words=(
)
stop_word_dict={}
for word in stop_words: stop_word_dict[word]=None
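
For context, the hunk above hands out integer ids sequentially as new words
enter the lexicon. A minimal sketch of that mapping: a plain dict stands in
for the persistent OIBTree, and word_id is a hypothetical accessor name,
since the enclosing method's name is elided in the hunk:

# Minimal sketch of the word -> word-id mapping a Lexicon maintains.
class SketchLexicon:
    def __init__(self):
        self._lexicon = {}   # word -> integer id
        self.counter = 0     # next id to hand out

    def word_id(self, word):
        # Return the existing id, assigning a fresh one the first
        # time a word is seen.
        if word not in self._lexicon:
            self._lexicon[word] = self.counter
            self.counter = self.counter + 1
        return self._lexicon[word]

lex = SketchLexicon()
assert lex.word_id("zope") == lex.word_id("zope")    # ids are stable
assert lex.word_id("zope") != lex.word_id("index")   # and distinct
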
@@ -92,7 +92,7 @@ is no longer known.
"""
__version__='$Revision: 1.13 $'[11:-2]
__version__='$Revision: 1.14 $'[11:-2]
from Globals import Persistent
import BTree, IIBTree, IOBTree, OIBTree
@@ -207,7 +207,7 @@ class UnTextIndex(Persistent):
## The Splitter should now be at least European-compliant.
## Someone should test this.
src = Splitter(k, self._syn)
src = self._lexicon.Splitter(k, self._syn)
## This returns a tuple of stemmed words. Stopwords have been
## stripped.
@@ -291,7 +291,7 @@ class UnTextIndex(Persistent):
def __getitem__(self, word):
"""Return an InvertedIndex-style result "list"
"""
src = tuple(Splitter(word, self._syn))
src = tuple(self._lexicon.Splitter(word, self._syn))
if not src:
return ResultList({}, (word,), self)
@@ -387,13 +387,13 @@ class UnTextIndex(Persistent):
r = []
for word in words:
r = r+Splitter(doc, self._syn).indexes(word)
r = r+self._lexicon.Splitter(doc, self._syn).indexes(word)
return r
def _subindex(self, isrc, d, old, last):
src = Splitter(isrc, self._syn)
src = self._lexicon.Splitter(isrc, self._syn)
for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
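
The pattern across these hunks is uniform: every direct Splitter(...) call
becomes self._lexicon.Splitter(...). A hypothetical illustration of what the
indirection buys (class names here are illustrative, not the Zope API): an
index can be retargeted to a different tokenization policy just by handing it
a different lexicon.

# Hypothetical illustration (not the Zope API): because the index asks
# its lexicon for the splitter, swapping the lexicon swaps the tokenizer
# without touching any index code.

class WhitespaceLexicon:
    def Splitter(self, astring, words=()):
        return [w for w in astring.split() if w not in words]

class CommaLexicon(WhitespaceLexicon):
    def Splitter(self, astring, words=()):
        return [w.strip() for w in astring.split(",") if w.strip() not in words]

class TinyIndex:
    def __init__(self, lexicon):
        self._lexicon = lexicon

    def tokens(self, doc):
        # Same shape as src = self._lexicon.Splitter(k, self._syn) above.
        return self._lexicon.Splitter(doc)

print(TinyIndex(WhitespaceLexicon()).tokens("a b,c"))  # ['a', 'b,c']
print(TinyIndex(CommaLexicon()).tokens("a b,c"))       # ['a b', 'c']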