Commit 5dd63b09 authored by Christopher Petrilli's avatar Christopher Petrilli

Merge of cleanup of the indexing code. This isn't fully complete yet,

but it does cover field and keyword indexes, with some minor fixups
to the Text indexing code.
parent 4b959522
......@@ -85,7 +85,7 @@
"""Simple column indices"""
__version__='$Revision: 1.18 $'[11:-2]
__version__='$Revision: 1.19 $'[11:-2]
from Globals import Persistent
......@@ -154,70 +154,81 @@ class UnIndex(Persistent, Implicit):
def __len__(self):
    # Number of documents currently indexed: _unindex maps
    # documentId -> indexed value, one entry per indexed document.
    return len(self._unindex)
def removeForwardIndexEntry(self, entry, documentId):
    """Take the entry provided and remove any reference to documentId
    in its entry in the index.

    Failures are logged rather than raised so that unindexing stays
    best-effort."""
    indexRow = self._index.get(entry, MV)
    if indexRow is not MV:
        try:
            indexRow.remove(documentId)
        except:
            # Fix: the log arguments previously interpolated 'i' and
            # 'k', which are not defined in this method and would
            # themselves raise NameError.
            LOG(self.__class__.__name__, ERROR,
                ('unindex_object could not remove '
                 'integer id %s from index %s. This '
                 'should not happen.'
                 % (str(documentId), str(entry))))
    else:
        # Fix: previously interpolated the builtin 'set' and the
        # undefined 'k'; report the entry and documentId instead.
        LOG(self.__class__.__name__, ERROR,
            ('unindex_object tried to retrieve set %s '
             'from index %s but couldn\'t. This '
             'should not happen.' % (repr(entry), str(documentId))))
def insertForwardIndexEntry(self, entry, documentId):
    """Take the entry provided and put it in the correct place
    in the forward index.

    This will also deal with creating the entire row if necessary."""
    # Removed dead local 'id = self.id' -- it was never used and
    # shadowed the builtin 'id'.
    indexRow = self._index.get(entry, MV)

    # Make sure there's actually a row there already. If not, create
    # an IntSet and stuff it in first.
    if indexRow is MV:
        self._index[entry] = intSet()
        indexRow = self._index[entry]
    indexRow.insert(documentId)
def index_object(self, documentId, obj, threshold=None):
    """Index the object 'obj' under integer id 'documentId'.

    Returns 1 if the index was changed, 0 otherwise."""
    returnStatus = 0

    # First we need to see if there's anything interesting to look at.
    # self.id is the name of the index, which is also the name of the
    # attribute we're interested in.  If the attribute is callable,
    # we'll do so.
    try:
        datum = getattr(obj, self.id)
        if callable(datum):
            datum = datum()
    except:
        datum = MV

    # We don't want to do anything that we don't have to here, so
    # check whether the new and existing information are the same.
    oldDatum = self._unindex.get(documentId, MV)
    if datum != oldDatum:
        if oldDatum is not MV:
            # Fix: drop the stale forward entry first, otherwise the
            # old value keeps pointing at this document forever.
            self.removeForwardIndexEntry(oldDatum, documentId)
        self.insertForwardIndexEntry(datum, documentId)
        self._unindex[documentId] = datum
        returnStatus = 1
        self._p_changed = 1     # Tickle the transaction

    return returnStatus
def unindex_object(self, documentId):
    """Unindex the object with integer id 'documentId' and don't
    raise an exception if we fail."""
    unindexRecord = self._unindex.get(documentId, None)
    if unindexRecord is None:
        return None

    self.removeForwardIndexEntry(unindexRecord, documentId)
    # Fix: previously 'del self._unindex[i]' -- 'i' is undefined
    # under the renamed signature and raised NameError.
    del self._unindex[documentId]
def _apply_index(self, request, cid=''):
"""Apply the index to query parameters given in the argument,
......@@ -252,9 +263,6 @@ class UnIndex(Persistent, Implicit):
if type(keys) not in (ListType, TupleType):
keys = [keys]
print "XXX,"
print keys
index = self._index
r = None
anyTrue = 0
......
......@@ -85,6 +85,7 @@
from UnIndex import UnIndex, MV, intSet
from zLOG import LOG, ERROR
from Missing import MV
from types import *
class UnKeywordIndex(UnIndex):
......@@ -98,7 +99,7 @@ class UnKeywordIndex(UnIndex):
This should have an _apply_index that returns a relevance score
"""
def index_object(self, documentId, obj, threshold=None):
    """Index an object 'obj' with integer id 'documentId'.

    Ideally we've been passed a sequence of keywords; a plain string
    is treated as a single keyword.  Returns 1 on success, 0 if the
    object could not be (re)indexed."""
    # self.id is the name of the index, which is also the name of the
    # attribute we're interested in.  If the attribute is callable,
    # we'll do so.
    try:
        newKeywords = getattr(obj, self.id)
        if callable(newKeywords):
            newKeywords = newKeywords()
    except:
        newKeywords = MV

    # If we've been handed a string, tuplize it into one keyword.
    # Fix: previously '(keywords, )' referenced an undefined name and
    # raised NameError.
    if type(newKeywords) is StringType:
        newKeywords = (newKeywords, )

    # Figure out what, if anything, changed from the previous record.
    oldKeywords = self._unindex.get(documentId, MV)
    if newKeywords is MV:
        self.unindex_object(documentId)
        return 0
    elif oldKeywords is MV:
        # Nothing indexed before; insert every keyword.  A TypeError
        # means we weren't handed a sequence after all.
        try:
            for kw in newKeywords:
                self.insertForwardIndexEntry(kw, documentId)
        except TypeError:
            return 0
    else:
        # We need the old keywords as a mapping so we can manipulate
        # them more easily.
        tmp = {}
        try:
            for kw in oldKeywords:
                tmp[kw] = None
            oldKeywords = tmp
            # Add new keywords that aren't already indexed; keywords
            # present in both are removed from the old set.
            for kw in newKeywords:
                if oldKeywords.has_key(kw):
                    del oldKeywords[kw]
                else:
                    self.insertForwardIndexEntry(kw, documentId)
            # Whatever is left in oldKeywords is no longer present
            # and must be removed from the index.
            for kw in oldKeywords.keys():
                self.removeForwardIndexEntry(kw, documentId)
        except TypeError:
            return 0

    self._unindex[documentId] = newKeywords
    return 1
def unindex_object(self, documentId):
    """Carefully unindex the object with integer id 'documentId'
    and do not fail if it does not exist."""
    keywords = self._unindex.get(documentId, MV)
    if keywords is MV:
        return None
    # Remove each keyword's forward entry, then drop the reverse
    # record.  (The old version referenced undefined locals 'unindex',
    # 'index' and 'i' -- merge residue removed here.)
    for kw in keywords:
        self.removeForwardIndexEntry(kw, documentId)
    del self._unindex[documentId]
......@@ -92,7 +92,7 @@ is no longer known.
"""
__version__='$Revision: 1.30 $'[11:-2]
__version__='$Revision: 1.31 $'[11:-2]
from Globals import Persistent
......@@ -172,18 +172,12 @@ class UnTextIndex(Persistent, Implicit):
pass
if lexicon is None:
## if no lexicon is provided, create a dumb one
## if no lexicon is provided, create a default one
self._lexicon=Lexicon()
else:
self._lexicon = lexicon
def __setstate(self, state):
    # NOTE(review): the name lacks trailing underscores, so this is
    # never invoked as the pickling hook __setstate__ -- confirm
    # whether disabling it was intentional.
    Persistent.__setstate__(self, state)
    # Drop the legacy '_syn' attribute from older pickles, if present.
    if hasattr(self, '_syn'):
        del self._syn
def getLexicon(self, vocab_id):
""" bit of a hack, indexes have been made acquirers so that
......@@ -201,123 +195,102 @@ class UnTextIndex(Persistent, Implicit):
def __len__(self):
    # Number of documents currently indexed (one _unindex entry per
    # indexed document).
    return len(self._unindex)
## def __setstate__(self, state):
## Persistent.__setstate__(self, state)
## if not hasattr(self, '_lexicon'):
## self._lexicon = Lexicon()
def clear(self):
    # Discard all indexed data by replacing both BTrees:
    # _index (word id -> postings) and _unindex (document id -> word ids).
    self._index = IOBTree()
    self._unindex = IOBTree()
def index_object(self, i, obj, threshold=None):
def index_object(self, documentId, obj, threshold=None):
""" Index an object:
'i' is the integer id of the document
'obj' is the objects to be indexed
'documentId' is the integer id of the document
'obj' is the objects to be indexed
'threshold' is the number of words to process between
commiting subtransactions. If 'None' subtransactions are
not used.
'threshold' is the number of words to process between
commiting subtransactions. If 'None' subtransactions are
disabled. """
the next four arguments are default optimizations.
"""
# Before we do anything, unindex the object we've been handed, as
# we can't depend on the user to do the right thing.
self.unindex_object(i)
id = self.id
# sniff the object for our 'id', the 'document source' of the
# index is this attribute. If it smells callable, call it.
try:
## sniff the object for our 'id', the 'document source' of
## the index is this attribute. If it smells callable,
## call it.
k = getattr(obj, id)
if callable(k):
k = str(k())
source = getattr(obj, self.id)
if callable(source):
source = str(source())
else:
k = str(k)
source = str(source)
except:
return 0
d = OIBTree()
old = d.has_key
last = None
## The Splitter should now be european compliant at least.
## Someone should test this.
sourceWords = self.getLexicon(self._lexicon).Splitter(source)
## import pdb
## pdb.set_trace()
src = self.getLexicon(self._lexicon).Splitter(k)
## This returns a tuple of stemmed words. Stopwords have been
## stripped.
wordList = OIBTree()
last = None
for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1], d, old, last)
# Run through the words and score them
for word in sourceWords:
if word[0] == '\"':
last = self.subindex(word[1:-1], wordList,
wordList.has_key, last) # XXX
else:
if old(s):
if s != last: d[s] = d[s]+1
else: d[s] = 1
if wordList.has_key(word):
if word != last:
wordList[word] = wordList[word]+1
else:
wordList[word] = 1
index = self._index
unindex = self._unindex
lexicon = self.getLexicon(self._lexicon)
get = index.get
unindex[i] = []
times = 0
unindex[documentId] = [] # XXX this should be more intellegent
wordCount = 0
for word, score in d.items():
for word, score in wordList.items():
if threshold is not None:
if times > threshold:
if ((wordCount % threshold) == 0) and not (wordCount == 0):
# commit a subtransaction hack
get_transaction().commit(1)
# kick the cache
self._p_jar.cacheFullSweep(1)
times = 0
word_id = lexicon.set(word)
wordId = lexicon.set(word)
r = get(word_id)
if r is not None:
r = index[word_id]
if type(r) is TupleType:
r = {r[0]:r[1]}
r[i] = score
index[word_id] = r
unindex[i].append(word_id)
indexRow = index.get(wordId)
if indexRow is not None:
indexRow = index[wordId] # Duplicate?
if type(indexRow) is TupleType:
indexRow = {indexRow[0]:indexRow[1]}
indexRow[documentId] = score
index[wordId] = indexRow
unindex[documentId].append(wordId)
elif type(r) is DictType:
if len(r) > 4:
elif type(indexRow) is DictType:
if len(indexRow) > 4:
b = IIBucket()
for k, v in r.items(): b[k] = v
r = b
r[i] = score
for k, v in indexRow.items():
b[k] = v
indexRow = b
indexRow[documentId] = score
index[word_id] = r
unindex[i].append(word_id)
index[wordId] = indexRow
unindex[documentId].append(wordId)
else:
r[i] = score
unindex[i].append(word_id)
indexRow[documentId] = score
unindex[documentId].append(wordId)
else:
index[word_id] = i, score
unindex[i].append(word_id)
times = times + 1
index[wordId] = documentId, score
unindex[documentId].append(wordId)
wordCount = wordCount + 1
unindex[i] = tuple(unindex[i])
l = len(unindex[i])
unindex[documentId] = tuple(unindex[documentId])
self._index = index
self._unindex = unindex
## return the number of words you indexed
return times
return wordCount
def unindex_object(self, i):
""" carefully unindex document with integer id 'i' from the text
......@@ -338,8 +311,6 @@ class UnTextIndex(Persistent, Implicit):
'unindex_object tried to unindex nonexistent'
' document %s' % str(i))
del unindex[i]
self._index = index
self._unindex = unindex
def __getitem__(self, word):
"""Return an InvertedIndex-style result "list"
......@@ -378,10 +349,8 @@ class UnTextIndex(Persistent, Implicit):
all data fields used.
"""
id = self.id
if request.has_key(id):
keys = request[id]
if request.has_key(self.id):
keys = request[self.id]
else:
return None
......@@ -410,26 +379,25 @@ class UnTextIndex(Persistent, Implicit):
r = r.intersection(rr)
if r is not None:
return r, (id,)
return IIBucket(), (id,)
return r, (self.id,)
return (IIBucket(), (self.id,))
def positions(self, docid, words, obj):
"""Return the positions in the document for the given document
id of the word, word."""
id = self.id
if self._schema is None:
f = getattr
else:
f = operator.__getitem__
id = self._schema[id]
id = self._schema[self.id]
if self.call_methods:
doc = str(f(obj, id)())
doc = str(f(obj, self.id)())
else:
doc = str(f(obj, id))
doc = str(f(obj, self.id))
r = []
for word in words:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment