Commit 81625f82 authored by Christopher Petrilli's avatar Christopher Petrilli

Merging in Catalog changes for the lexicon.

parent 381d6e48
......@@ -19,6 +19,15 @@ Zope changes
hook to create PythonScripts (for MIMEtype 'text/x-python')
and DTMLMethods (for other 'text' MIMEtypes) (Collector #998).
Bugs Fixed
- Mechanisms in the underbelly of the Catalog and Globbing
Lexicon (which is the default for all new Catalogs) have been
overhauled, giving substantial performance increases. On
simple queries, performance should double (or more) in many
situations, whereas with globbed queries it may increase by
substantially more.
Zope 2.3.0 beta 1
Features Added
......
......@@ -101,9 +101,9 @@ from Catalog import Catalog, orify
from SearchIndex import UnIndex, UnTextIndex
from Vocabulary import Vocabulary
import IOBTree
from Shared.DC.ZRDB.TM import TM
from AccessControl import getSecurityManager
manage_addZCatalogForm=DTMLFile('dtml/addZCatalog',globals())
def manage_addZCatalog(self, id, title, vocab_id=None, REQUEST=None):
......@@ -217,6 +217,7 @@ class ZCatalog(Folder, Persistent, Implicit):
threshold=10000
_v_total=0
_v_transaction = None
def __init__(self, id, title='', vocab_id=None, container=None):
self.id=id
......@@ -401,14 +402,31 @@ class ZCatalog(Folder, Persistent, Implicit):
def catalog_object(self, obj, uid):
""" wrapper around catalog """
self._v_total = (self._v_total +
self._catalog.catalogObject(obj, uid, self.threshold))
self._catalog.catalogObject(obj, uid, None)
# None passed in to catalogObject as third argument indicates
# that we shouldn't try to commit subtransactions within any
# indexing code. We throw away the result of the call to
# catalogObject (which is a word count), because it's
# worthless to us here.
if self.threshold is not None:
# figure out whether or not to commit a subtransaction.
t = id(get_transaction())
if t != self._v_transaction:
self._v_total = 0
self._v_transaction = t
self._v_total = self._v_total + 1
# increment the _v_total counter for this thread only and get
# a reference to the current transaction.
# the _v_total counter is zeroed if we notice that we're in
# a different transaction than the last one that came by.
# self.threshold represents the number of times that
# catalog_object needs to be called in order for the catalog
# to commit a subtransaction. The semantics here mean that
# we should commit a subtransaction if our threshhold is
# exceeded within the boundaries of the current transaction.
if self._v_total > self.threshold:
# commit a subtransaction
get_transaction().commit(1)
# kick the chache, this may be overkill but ya never know
self._p_jar.cacheFullSweep(1)
self._v_total = 0
......
......@@ -83,28 +83,22 @@
#
##############################################################################
import string, regex, ts_regex
import regsub
from Lexicon import Lexicon
__doc__=""" Lexicon object that supports
"""
from Lexicon import Lexicon
from Splitter import Splitter
from Persistence import Persistent
from Acquisition import Implicit
import OIBTree, BTree, IOBTree
from intSet import intSet
OIBTree=OIBTree.BTree
OOBTree=BTree.BTree
IOBTree=IOBTree.BTree
import re
from UnTextIndex import Or
import re, time
import OIBTree, BTree, IOBTree, IIBTree
OIBTree = OIBTree.BTree # Object -> Integer
OOBTree = BTree.BTree # Object -> Object
IOBTree = IOBTree.BTree # Integer -> Object
IIBucket = IIBTree.Bucket # Integer -> Integer
import pdb
class GlobbingLexicon(Lexicon):
"""
......@@ -155,7 +149,6 @@ class GlobbingLexicon(Lexicon):
set.insert(self.counter)
self._digrams = _digrams
counter = self.counter
self.counter = self.counter + 1
return counter
......@@ -163,14 +156,14 @@ class GlobbingLexicon(Lexicon):
def get(self, pattern):
""" Query the lexicon for words matching a pattern.
"""
wc_set = [self.multi_wc, self.single_wc]
digrams = []
globbing = 0
for i in range(len(pattern)):
if pattern[i] in wc_set:
globbing = 1
continue
if i == 0:
......@@ -184,21 +177,19 @@ class GlobbingLexicon(Lexicon):
except IndexError:
digrams.append( (pattern[i] + self.eow) )
if not globbing:
result = self._lexicon.get(pattern, ())
return (result, )
## now get all of the intsets that contain the result digrams
result = None
result = IIBucket()
for digram in digrams:
if self._digrams.has_key(digram):
set = self._digrams[digram]
if set is not None:
if result is None:
result = set
else:
result.intersection(set)
matchSet = self._digrams[digram]
if matchSet is not None:
result = IIBucket().union(matchSet)
if result is None:
if len(result) == 0:
return ()
else:
## now we have narrowed the list of possible candidates
......@@ -211,10 +202,9 @@ class GlobbingLexicon(Lexicon):
expr = re.compile(self.translate(pattern))
words = []
hits = []
for x in result:
if expr.search(self._inverseLex[x]):
for x in result.keys():
if expr.match(self._inverseLex[x]):
hits.append(x)
return hits
def __getitem__(self, word):
......@@ -226,6 +216,7 @@ class GlobbingLexicon(Lexicon):
"""
words = []
wids = []
for w in q:
if ( (self.multi_wc in w) or
(self.single_wc in w) ):
......@@ -233,7 +224,7 @@ class GlobbingLexicon(Lexicon):
for wid in wids:
if words:
words.append(Or)
words.append(self._inverseLex[wid])
words.append(wid)
else:
words.append(w)
......@@ -262,19 +253,7 @@ class GlobbingLexicon(Lexicon):
if c == self.multi_wc:
res = res + '.*'
elif c == self.single_wc:
res = res + '.'
res = res + '.?'
else:
res = res + re.escape(c)
return res + "$"
return res + '$'
......@@ -92,7 +92,7 @@ is no longer known.
"""
__version__='$Revision: 1.33 $'[11:-2]
__version__='$Revision: 1.34 $'[11:-2]
from Globals import Persistent
......@@ -368,22 +368,42 @@ class UnTextIndex(Persistent, Implicit):
def __getitem__(self, word):
"""Return an InvertedIndex-style result "list"
"""
src = tuple(self.getLexicon(self._lexicon).Splitter(word))
if not src: return ResultList({}, (word,), self)
if len(src) == 1:
src=src[0]
if src[:1]=='"' and src[-1:]=='"': return self[src]
r = self._index.get(self.getLexicon(self._lexicon).get(src)[0],
Note that this differentiates between being passed an Integer
and a String. Strings are looked up in the lexicon, whereas
Integers are assumed to be resolved word ids. """
if type(word) is IntType:
# We have a word ID
result = self._index.get(word, {})
return ResultList(result, (word,), self)
else:
splitSource = tuple(self.getLexicon(self._lexicon).Splitter(word))
if not splitSource:
return ResultList({}, (word,), self)
if len(splitSource) == 1:
splitSource = splitSource[0]
if splitSource[:1]=='"' and splitSource[-1:]=='"':
return self[splitSource]
r = self._index.get(
self.getLexicon(self._lexicon).get(splitSource)[0],
None)
if r is None: r = {}
return ResultList(r, (src,), self)
if r is None:
r = {}
return ResultList(r, (splitSource,), self)
r = None
for word in src:
for word in splitSource:
rr = self[word]
if r is None: r = rr
else: r = r.near(rr)
if r is None:
r = rr
else:
r = r.near(rr)
return r
......@@ -482,13 +502,11 @@ class UnTextIndex(Persistent, Implicit):
whole thing is 'evaluated'
"""
# First replace any occurences of " and not " with " andnot "
s = ts_regex.gsub(
'[%s]+[aA][nN][dD][%s]*[nN][oO][tT][%s]+' % (ws * 3),
' andnot ', s)
# do some parsing
q = parse(s)
......@@ -509,65 +527,78 @@ class UnTextIndex(Persistent, Implicit):
try:
left = q[i - 1]
right = q[i + 1]
except IndexError: raise QueryError, "Malformed query"
t=type(left)
if t is ListType: left = evaluate(left, self)
elif t is StringType: left=self[left]
t=type(right)
if t is ListType: right = evaluate(right, self)
elif t is StringType: right=self[right]
except IndexError:
raise QueryError, "Malformed query"
operandType = type(left)
if operandType is IntType:
left = self[left]
elif operandType is StringType:
left = self[left]
elif operandType is ListType:
left = evaluate(left, self)
operandType = type(right)
if operandType is IntType:
right = self[right]
elif operandType is StringType:
right = self[right]
elif operandType is ListType:
right = evaluate(right, self)
return (left, right)
def evaluate(self, q):
def evaluate(self, query):
'''Evaluate a parsed query'''
## import pdb
## pdb.set_trace()
if (len(q) == 1):
if (type(q[0]) is ListType):
return evaluate(q[0], self)
return self[q[0]]
# There are two options if the query passed in is only one
# item. It means either it's an embedded query, in which case
# we'll recursively evaluate, other wise it's nothing for us
# to evaluate, and we just get the results and return them.
if (len(query) == 1):
if (type(query[0]) is ListType):
return evaluate(query[0], self)
return self[query[0]] # __getitem__
# Now we need to loop through the query and expand out
# operators. They are currently evaluated in the following
# order: AndNote -> And -> Or -> Near
i = 0
while (i < len(q)):
if q[i] is AndNot:
left, right = self.get_operands(q, i)
while (i < len(query)):
if query[i] is AndNot:
left, right = self.get_operands(query, i)
val = left.and_not(right)
q[(i - 1) : (i + 2)] = [ val ]
query[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is And:
left, right = self.get_operands(q, i)
while (i < len(query)):
if query[i] is And:
left, right = self.get_operands(query, i)
val = left & right
q[(i - 1) : (i + 2)] = [ val ]
query[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Or:
left, right = self.get_operands(q, i)
while (i < len(query)):
if query[i] is Or:
left, right = self.get_operands(query, i)
val = left | right
q[(i - 1) : (i + 2)] = [ val ]
query[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Near:
left, right = self.get_operands(q, i)
while (i < len(query)):
if query[i] is Near:
left, right = self.get_operands(query, i)
val = left.near(right)
q[(i - 1) : (i + 2)] = [ val ]
query[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
if (len(q) != 1): raise QueryError, "Malformed query"
if (len(query) != 1): raise QueryError, "Malformed query"
return q[0]
return query[0]
def parse(s):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment