Commit 8e6e5acb authored by Michel Pelletier's avatar Michel Pelletier

Unscrewed Globbing dependencies in text index, fixed non-globbing
vocabularies. Redid Lexicon interface.
parent ea93883d
......@@ -533,6 +533,8 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
type(''): Query.String,
}, **kw):
# Get search arguments:
if REQUEST is None and not kw:
try: REQUEST=self.REQUEST
......
......@@ -156,8 +156,11 @@ class Vocabulary(Item, Persistent, Implicit):
def query(self, pattern):
""" """
result = []
for x in self.lexicon.query(pattern):
result.append(self.lexicon._inverseLex[x])
for x in self.lexicon.get(pattern):
if self.globbing:
result.append(self.lexicon._inverseLex[x])
else:
result.append(pattern)
return result
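For illustration, a rough sketch of the behaviour this change aims for, assuming a hypothetical globbing vocabulary whose lexicon matches only 'fox' and 'form' against the pattern 'fo*', next to a plain non-globbing vocabulary (variable names are invented):

globbing_vocab.query('fo*')    # -> ['fox', 'form']   (pattern expanded to matching words)
plain_vocab.query('fox')       # -> ['fox']           (pattern handed back unchanged)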
......
......@@ -471,7 +471,7 @@ class ZCatalog(Folder, Persistent, Implicit):
Search terms can be passed in the REQUEST or as keyword
arguments.
"""
return apply(self._catalog.searchResults,
(REQUEST, used, query_map), kw)
......
......@@ -103,6 +103,7 @@ OOBTree=BTree.BTree
IOBTree=IOBTree.BTree
import re
from UnTextIndex import Or
class GlobbingLexicon(Lexicon):
"""
......@@ -160,7 +161,7 @@ class GlobbingLexicon(Lexicon):
return self.counter
def query(self, pattern):
def get(self, pattern):
""" Query the lexicon for words matching a pattern.
"""
......@@ -218,26 +219,45 @@ class GlobbingLexicon(Lexicon):
def __getitem__(self, word):
""" """
return self.query(word)
return self.get(word)
def query_hook(self, q):
"""expand wildcards
"""
words = []
for w in q:
if ( (self.multi_wc in w) or
(self.single_wc in w) ):
wids = self.get(w)
for wid in wids:
if words:
words.append(Or)
words.append(self._inverseLex[wid])
else:
words.append(w)
return words
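A hedged sketch of the expansion, assuming a GlobbingLexicon instance lex in which 'fox' and 'form' are the only words matching 'fo*' and '*' is the multi-character wildcard:

lex.query_hook(['fo*'])      # -> ['fox', 'or', 'form']   (expanded words joined with Or)
lex.query_hook(['quick'])    # -> ['quick']               (no wildcard: passed through untouched)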
def translate(self, pat):
"""Translate a PATTERN to a regular expression.
There is no way to quote meta-characters.
"""
i, n = 0, len(pat)
res = ''
while i < n:
c = pat[i]
i = i+1
if c == self.multi_wc:
res = res + '.*'
elif c == self.single_wc:
res = res + '.'
else:
res = res + re.escape(c)
return res + "$"
"""Translate a PATTERN to a regular expression.
There is no way to quote meta-characters.
"""
i, n = 0, len(pat)
res = ''
while i < n:
c = pat[i]
i = i+1
if c == self.multi_wc:
res = res + '.*'
elif c == self.single_wc:
res = res + '.'
else:
res = res + re.escape(c)
return res + "$"
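For reference, a self-contained mirror of the same translation, assuming the usual GlobbingLexicon wildcards multi_wc='*' and single_wc='?' (the real attribute values are defined elsewhere in the class):

import re

def glob_to_regex(pat, multi_wc='*', single_wc='?'):
    # mirror of GlobbingLexicon.translate(): '*' -> '.*', '?' -> '.',
    # every other character is escaped, and the result is anchored at the end
    res = ''
    for c in pat:
        if c == multi_wc:
            res = res + '.*'
        elif c == single_wc:
            res = res + '.'
        else:
            res = res + re.escape(c)
    return res + '$'

# glob_to_regex('wom?n*')  ->  'wom.n.*$'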
......
......@@ -134,7 +134,10 @@ class Lexicon(Persistent, Implicit):
def get(self, key):
""" """
return list(self._lexicon[key])
return [self._lexicon[key]]
def __getitem__(self, key):
return self.get(key)
def __len__(self):
return len(self._lexicon)
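A hedged sketch of the reworked mapping interface, assuming _lexicon maps each word to a single word id (the wid_* names are placeholders); both lexicon flavours now answer indexing with a list of word ids, so callers can treat them uniformly:

plain_lexicon['fox']       # -> [wid_fox]             (one-element list)
globbing_lexicon['fo*']    # -> [wid_fox, wid_form]   (ids of every matching word)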
......@@ -157,203 +160,12 @@ class Lexicon(Persistent, Implicit):
hits.append(x)
return hits
def query_hook(self, q):
""" we don't want to modify the query cuz we're dumb """
return q
AndNot = 'andnot'
And = 'and'
Or = 'or'
Near = '...'
QueryError='TextIndex.QueryError'
def query(s, index, default_operator = Or,
ws = (string.whitespace,)):
# First replace any occurrences of " and not " with " andnot "
s = ts_regex.gsub('[%s]+and[%s]*not[%s]+' % (ws * 3), ' andnot ', s)
q = parse(s)
q = parse_wc(q, index)
q = parse2(q, default_operator)
return evaluate(q, index)
def parse_wc(q, index):
'''expand wildcards'''
lex = index.getLexicon(index._lexicon)
words = []
for w in q:
if ( (lex.multi_wc in w) or
(lex.single_wc in w) ):
wids = lex.query(w)
for wid in wids:
if words:
words.append(Or)
words.append(lex._inverseLex[wid])
else:
words.append(w)
return words
def parse(s):
'''Parse parentheses and quotes'''
l = []
tmp = string.lower(s)
while (1):
p = parens(tmp)
if (p is None):
# No parentheses found. Look for quotes then exit.
l = l + quotes(tmp)
break
else:
# Look for quotes in the section of the string before
# the parentheses, then parse the string inside the parens
l = l + quotes(tmp[:(p[0] - 1)])
l.append(parse(tmp[p[0] : p[1]]))
# continue looking through the rest of the string
tmp = tmp[(p[1] + 1):]
return l
def parse2(q, default_operator,
operator_dict = {AndNot: AndNot, And: And, Or: Or, Near: Near},
ListType=type([]),
):
'''Find operators and operands'''
i = 0
isop=operator_dict.has_key
while (i < len(q)):
if (type(q[i]) is ListType): q[i] = parse2(q[i], default_operator)
# every other item, starting with the first, should be an operand
if ((i % 2) != 0):
# This word should be an operator; if it is not, splice in
# the default operator.
if type(q[i]) is not ListType and isop(q[i]):
q[i] = operator_dict[q[i]]
else: q[i : i] = [ default_operator ]
i = i + 1
return q
def parens(s, parens_re = regex.compile('(\|)').search):
index=open_index=paren_count = 0
while 1:
index = parens_re(s, index)
if index < 0 : break
if s[index] == '(':
paren_count = paren_count + 1
if open_index == 0 : open_index = index + 1
else:
paren_count = paren_count - 1
if paren_count == 0:
return open_index, index
else:
index = index + 1
if paren_count == 0: # No parentheses Found
return None
else:
raise QueryError, "Mismatched parentheses"
def quotes(s, ws = (string.whitespace,)):
# split up quoted regions
splitted = ts_regex.split(s, '[%s]*\"[%s]*' % (ws * 2))
split=string.split
if (len(splitted) > 1):
if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"
for i in range(1,len(splitted),2):
# split the quoted region into words
splitted[i] = filter(None, split(splitted[i]))
# put the Proximity operator in between quoted words
for j in range(1, len(splitted[i])):
splitted[i][j : j] = [ Near ]
for i in range(len(splitted)-1,-1,-2):
# split the non-quoted region into words
splitted[i:i+1] = filter(None, split(splitted[i]))
splitted = filter(None, splitted)
else:
# No quotes, so just split the string into words
splitted = filter(None, split(s))
return splitted
def get_operands(q, i, index, ListType=type([]), StringType=type('')):
'''Evaluate and return the left and right operands for an operator'''
try:
left = q[i - 1]
right = q[i + 1]
except IndexError: raise QueryError, "Malformed query"
t=type(left)
if t is ListType: left = evaluate(left, index)
elif t is StringType: left=index[left]
t=type(right)
if t is ListType: right = evaluate(right, index)
elif t is StringType: right=index[right]
return (left, right)
def evaluate(q, index, ListType=type([])):
'''Evaluate a parsed query'''
## import pdb
## pdb.set_trace()
if (len(q) == 1):
if (type(q[0]) is ListType):
return evaluate(q[0], index)
return index[q[0]]
i = 0
while (i < len(q)):
if q[i] is AndNot:
left, right = get_operands(q, i, index)
val = left.and_not(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is And:
left, right = get_operands(q, i, index)
val = left & right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Or:
left, right = get_operands(q, i, index)
val = left | right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Near:
left, right = get_operands(q, i, index)
val = left.near(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
if (len(q) != 1): raise QueryError, "Malformed query"
return q[0]
stop_words=(
......
......@@ -92,7 +92,7 @@ is no longer known.
"""
__version__='$Revision: 1.20 $'[11:-2]
__version__='$Revision: 1.21 $'[11:-2]
from Globals import Persistent
import BTree, IIBTree, IOBTree, OIBTree
......@@ -105,12 +105,22 @@ from intSet import intSet
import operator
from Splitter import Splitter
from string import strip
import string, regex, regsub, pdb
import string, regex, regsub, ts_regex
from Lexicon import Lexicon, query, stop_word_dict
from Lexicon import Lexicon, stop_word_dict
from ResultList import ResultList
AndNot = 'andnot'
And = 'and'
Or = 'or'
Near = '...'
QueryError='TextIndex.QueryError'
class UnTextIndex(Persistent, Implicit):
def __init__(self, id=None, ignore_ex=None,
......@@ -160,6 +170,8 @@ class UnTextIndex(Persistent, Implicit):
pass
if lexicon is None:
## if no lexicon is provided, create a dumb one
self._lexicon=Lexicon()
else:
self._lexicon = lexicon
......@@ -365,7 +377,7 @@ class UnTextIndex(Persistent, Implicit):
rr = IIBucket()
try:
for i, score in query(key,self).items():
for i, score in self.query(key).items():
if score:
rr[i] = score
except KeyError:
......@@ -406,7 +418,7 @@ class UnTextIndex(Persistent, Implicit):
def _subindex(self, isrc, d, old, last):
src = self.getLexicon.Splitter(isrc, self._syn)
src = self.getLexicon(self._lexicon).Splitter(isrc, self._syn)
for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
......@@ -417,3 +429,197 @@ class UnTextIndex(Persistent, Implicit):
return last
def query(self, s, default_operator = Or, ws = (string.whitespace,)):
"""
This is called by TextIndexes. A 'query term' which is a string
's' is passed in, along with an index object. s is parsed, then
the wildcards are parsed, then something is parsed again, then the
whole thing is 'evaluated'
"""
# First replace any occurrences of " and not " with " andnot "
s = ts_regex.gsub('[%s]+and[%s]*not[%s]+' % (ws * 3), ' andnot ', s)
# do some parsing
q = parse(s)
## here, we give lexicons a chance to transform the query.
## For example, substitute wildcards, or translate words into
## various languages.
q = self.getLexicon(self._lexicon).query_hook(q)
# do some more parsing
q = parse2(q, default_operator)
## evaluate the final 'expression'
return self.evaluate(q)
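A hedged sketch of the stages a small query passes through, assuming an UnTextIndex instance named index backed by a GlobbingLexicon in which 'fox' and 'form' are the only words matching 'fo*' (all names illustrative):

s = 'fo*'
q = parse(s)                                          # -> ['fo*']
q = index.getLexicon(index._lexicon).query_hook(q)    # -> ['fox', 'or', 'form']
q = parse2(q, Or)                                     # the 'or' tokens become the Or operator
hits = index.evaluate(q)                              # documents containing 'fox' or 'form'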
def get_operands(self, q, i, ListType=type([]), StringType=type('')):
'''Evaluate and return the left and right operands for an operator'''
try:
left = q[i - 1]
right = q[i + 1]
except IndexError: raise QueryError, "Malformed query"
t=type(left)
if t is ListType: left = self.evaluate(left)
elif t is StringType: left=self[left]
t=type(right)
if t is ListType: right = self.evaluate(right)
elif t is StringType: right=self[right]
return (left, right)
def evaluate(self, q, ListType=type([])):
'''Evaluate a parsed query'''
## import pdb
## pdb.set_trace()
if (len(q) == 1):
if (type(q[0]) is ListType):
return self.evaluate(q[0])
return self[q[0]]
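## operators are reduced in precedence order: AndNot, then And, then Or, then Near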
i = 0
while (i < len(q)):
if q[i] is AndNot:
left, right = self.get_operands(q, i)
val = left.and_not(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is And:
left, right = self.get_operands(q, i)
val = left & right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Or:
left, right = self.get_operands(q, i)
val = left | right
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Near:
left, right = self.get_operands(q, i)
val = left.near(right)
q[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
if (len(q) != 1): raise QueryError, "Malformed query"
return q[0]
def parse(s):
'''Parse parentheses and quotes'''
l = []
tmp = string.lower(s)
while (1):
p = parens(tmp)
if (p is None):
# No parentheses found. Look for quotes then exit.
l = l + quotes(tmp)
break
else:
# Look for quotes in the section of the string before
# the parentheses, then parse the string inside the parens
l = l + quotes(tmp[:(p[0] - 1)])
l.append(parse(tmp[p[0] : p[1]]))
# continue looking through the rest of the string
tmp = tmp[(p[1] + 1):]
return l
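A hedged illustration of the nesting parse() is intended to produce: quoted phrases become sublists joined by the Near ('...') operator (see quotes() below), and parenthesized regions become nested sublists:

parse('cat (dog or "big bird")')
# -> ['cat', ['dog', 'or', ['big', '...', 'bird']]]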
def parse2(q, default_operator,
operator_dict = {AndNot: AndNot, And: And, Or: Or, Near: Near},
ListType=type([]),
):
'''Find operators and operands'''
i = 0
isop=operator_dict.has_key
while (i < len(q)):
if (type(q[i]) is ListType): q[i] = parse2(q[i], default_operator)
# every other item, starting with the first, should be an operand
if ((i % 2) != 0):
# This word should be an operator; if it is not, splice in
# the default operator.
if type(q[i]) is not ListType and isop(q[i]):
q[i] = operator_dict[q[i]]
else: q[i : i] = [ default_operator ]
i = i + 1
return q
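A small sketch of the operator splicing, with Or as the default operator:

parse2(['cat', 'dog'], Or)           # -> ['cat', 'or', 'dog']   (missing operator spliced in)
parse2(['cat', 'and', 'dog'], Or)    # -> ['cat', 'and', 'dog']  (explicit operator kept)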
def parens(s, parens_re = regex.compile('(\|)').search):
index=open_index=paren_count = 0
while 1:
index = parens_re(s, index)
if index < 0 : break
if s[index] == '(':
paren_count = paren_count + 1
if open_index == 0 : open_index = index + 1
else:
paren_count = paren_count - 1
if paren_count == 0:
return open_index, index
else:
index = index + 1
if paren_count == 0: # No parentheses Found
return None
else:
raise QueryError, "Mismatched parentheses"
def quotes(s, ws = (string.whitespace,)):
# split up quoted regions
splitted = ts_regex.split(s, '[%s]*\"[%s]*' % (ws * 2))
split=string.split
if (len(splitted) > 1):
if ((len(splitted) % 2) == 0): raise QueryError, "Mismatched quotes"
for i in range(1,len(splitted),2):
# split the quoted region into words
splitted[i] = filter(None, split(splitted[i]))
# put the Proximity operator in between quoted words
for j in range(1, len(splitted[i])):
splitted[i][j : j] = [ Near ]
for i in range(len(splitted)-1,-1,-2):
# split the non-quoted region into words
splitted[i:i+1] = filter(None, split(splitted[i]))
splitted = filter(None, splitted)
else:
# No quotes, so just split the string into words
splitted = filter(None, split(s))
return splitted
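Finally, a hedged sketch of quotes(): a quoted phrase becomes a nested list with the Near ('...') operator between its words, while unquoted text is simply split on whitespace:

quotes('"foo bar" baz')    # -> [['foo', '...', 'bar'], 'baz']
quotes('foo bar')          # -> ['foo', 'bar']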