partial searching

63651bb3 · Michel Pelletier · d03616e0 · 63651bb3 · 63651bb3 · 63651bb3
Commit 63651bb3 authored Jan 31, 2000 by Michel Pelletier
5 changed files
--- a/lib/python/Products/ZCatalog/ZCatalog.py
+++ b/lib/python/Products/ZCatalog/ZCatalog.py
@@ -99,17 +99,46 @@ from DocumentTemplate.DT_Util import Eval, expr_globals
 from AccessControl.Permission import name_trans
 from Catalog import Catalog, orify
 from SearchIndex import UnIndex, UnTextIndex
+from Vocabulary import Vocabulary
 import IOBTree

 manage_addZCatalogForm=HTMLFile('addZCatalog',globals())

-def manage_addZCatalog(self,id,title,REQUEST=None):
+def manage_addZCatalog(self, id, title, vocab='', vocab_id='', REQUEST=None):
    """Add a ZCatalog object
    """
-    c=ZCatalog(id,title)
-    self._setObject(id,c)
+    c=ZCatalog(id, title, vocab, vocab_id, self)
+    self._setObject(id, c)
    if REQUEST is not None:
-        return self.manage_main(self,REQUEST)
+        return self.manage_main(self, REQUEST)
+
+
+def VocabularyIDs(self):
+    """Return a sorted list of (id, id) pairs for acquirable vocabularies.
+
+    Starting at 'self' and following 'aq_parent' links upwards, every
+    object returned by objectValues() that has a true '_isAVocabulary'
+    attribute and an 'id' is collected.  The first object seen for a
+    given id wins.  Adapted from ZSQLMethods.
+    """
+
+    ids={}
+    have_id=ids.has_key
+    StringType=type('')
+
+    while self is not None:
+        if hasattr(self, 'objectValues'):
+            for o in self.objectValues():
+                if (hasattr(o,'_isAVocabulary') and o._isAVocabulary
+                    and hasattr(o,'id')):
+                    id=o.id
+                    # 'id' may be a method rather than a string; call it then.
+                    if type(id) is not StringType: id=id()
+                    if not have_id(id):
+                        # NOTE(review): 'o' is rebound to a display label
+                        # here but is never used afterwards; the dict
+                        # stores the id as both key and value.
+                        if hasattr(o,'title_and_id'): o=o.title_and_id()
+                        else: o=id
+                        ids[id]=id
+        # Climb to the containing object, or stop when there is none.
+        if hasattr(self, 'aq_parent'): self=self.aq_parent
+        else: self=None
+
+    # Turn the mapping into (value, key) tuples and sort them.
+    ids=map(lambda item: (item[1], item[0]), ids.items())
+    ids.sort()
+    return ids
+


 class ZCatalog(Folder, Persistent, Implicit):
@@ -191,13 +220,22 @@ class ZCatalog(Folder, Persistent, Implicit):
    threshold=10000
    _v_total=0

-
-    def __init__(self,id,title=''):
+    def __init__(self, id, title='', vocab=0, vocab_id='', container=None):
        self.id=id
        self.title=title
+        self.vocab_id = vocab_id
+        
        self.threshold = 10000
        self._v_total = 0
-        self._catalog = Catalog()
+
+        if not vocab:
+            v = Vocabulary('Vocabulary', 'Vocabulary', globbing=1)
+            self._setObject('Vocabulary', v)
+            v = 'Vocabulary'
+        else:
+            v = vocab_id
+
+        self._catalog = Catalog(vocabulary=v)

        self._catalog.addColumn('id')
        self._catalog.addIndex('id', 'FieldIndex')
@@ -213,7 +251,12 @@ class ZCatalog(Folder, Persistent, Implicit):

        self._catalog.addColumn('summary')
        self._catalog.addIndex('PrincipiaSearchSource', 'TextIndex')
-        
+
+
+    def getVocabulary(self):
+        """Return the vocabulary object named by 'self.vocab_id'.
+
+        The name is looked up as an attribute of this catalog.
+        """
+        vocabulary_name = self.vocab_id
+        return getattr(self, vocabulary_name)
+

    def manage_edit(self, RESPONSE, URL1, threshold=1000, REQUEST=None):
        """ edit the catalog """
@@ -359,7 +402,7 @@ class ZCatalog(Folder, Persistent, Implicit):
            if self._v_total > self.threshold:
                # commit a subtransaction
                get_transaction().commit(1)
-                # kick the chache
+                # kick the cache; this may be overkill, but you never know
                self._p_jar.cacheFullSweep(1)
                self._v_total = 0

@@ -545,10 +588,7 @@ class ZCatalog(Folder, Persistent, Implicit):
                )
                ):
                if apply_func:
-                    if apply_path:
-                        apply_func(ob, (apply_path+'/'+p))
-                    else:
-                        apply_func(ob, p)
+                    apply_func(ob, (apply_path+'/'+p))
                else:
                    add_result((p, ob))
                    dflag=0

--- a/lib/python/SearchIndex/GlobbingLexicon.py
+++ b/lib/python/SearchIndex/GlobbingLexicon.py
+##############################################################################
+# 
+# Zope Public License (ZPL) Version 1.0
+# -------------------------------------
+# 
+# Copyright (c) Digital Creations.  All rights reserved.
+# 
+# This license has been certified as Open Source(tm).
+# 
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+# 
+# 1. Redistributions in source code must retain the above copyright
+#    notice, this list of conditions, and the following disclaimer.
+# 
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions, and the following disclaimer in
+#    the documentation and/or other materials provided with the
+#    distribution.
+# 
+# 3. Digital Creations requests that attribution be given to Zope
+#    in any manner possible. Zope includes a "Powered by Zope"
+#    button that is installed by default. While it is not a license
+#    violation to remove this button, it is requested that the
+#    attribution remain. A significant investment has been put
+#    into Zope, and this effort will continue if the Zope community
+#    continues to grow. This is one way to assure that growth.
+# 
+# 4. All advertising materials and documentation mentioning
+#    features derived from or use of this software must display
+#    the following acknowledgement:
+# 
+#      "This product includes software developed by Digital Creations
+#      for use in the Z Object Publishing Environment
+#      (http://www.zope.org/)."
+# 
+#    In the event that the product being advertised includes an
+#    intact Zope distribution (with copyright and license included)
+#    then this clause is waived.
+# 
+# 5. Names associated with Zope or Digital Creations must not be used to
+#    endorse or promote products derived from this software without
+#    prior written permission from Digital Creations.
+# 
+# 6. Modified redistributions of any form whatsoever must retain
+#    the following acknowledgment:
+# 
+#      "This product includes software developed by Digital Creations
+#      for use in the Z Object Publishing Environment
+#      (http://www.zope.org/)."
+# 
+#    Intact (re-)distributions of any official Zope release do not
+#    require an external acknowledgement.
+# 
+# 7. Modifications are encouraged but must be packaged separately as
+#    patches to official Zope releases.  Distributions that do not
+#    clearly separate the patches from the original work must be clearly
+#    labeled as unofficial distributions.  Modifications which do not
+#    carry the name Zope may be packaged in any form, as long as they
+#    conform to all of the clauses above.
+# 
+# 
+# Disclaimer
+# 
+#   THIS SOFTWARE IS PROVIDED BY DIGITAL CREATIONS ``AS IS'' AND ANY
+#   EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+#   IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+#   PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL DIGITAL CREATIONS OR ITS
+#   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
+#   USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+#   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+#   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+#   OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+#   SUCH DAMAGE.
+# 
+# 
+# This software consists of contributions made by Digital Creations and
+# many individuals on behalf of Digital Creations.  Specific
+# attributions are listed in the accompanying credits file.
+# 
+##############################################################################
+
+import string, regex, ts_regex
+import regsub
+
+from Lexicon import Lexicon
+
+
+__doc__=""" Lexicon object that supports wildcard (globbing) queries.
+
+"""
+
+from Splitter import Splitter
+from Persistence import Persistent
+from Acquisition import Implicit
+import OIBTree, BTree, IOBTree
+from intSet import intSet
+OIBTree=OIBTree.BTree
+OOBTree=BTree.BTree
+IOBTree=IOBTree.BTree
+import re
+
+
+class GlobbingLexicon(Lexicon):
+    """Lexicon that supports wildcard ("globbing") word queries.
+
+    Every word is assigned an integer word id and is also indexed by
+    its digrams (adjacent character pairs); the first and the last
+    character are additionally paired with the 'eow' marker.  A pattern
+    query uses the digrams to select candidate words and then confirms
+    each candidate against a regular expression built from the pattern.
+
+    'multi_wc' matches any run of characters and 'single_wc' matches
+    exactly one character.
+    """
+
+    multi_wc = '*'
+    single_wc = '?'
+    eow = '$'
+
+    def __init__(self):
+
+        self.counter = 0                # next word id to hand out
+        self._lexicon = OIBTree()       # word -> word id
+        self._inverseLex = IOBTree()    # word id -> word
+        self._digrams = OOBTree()       # digram -> intSet of word ids
+
+    def set(self, word):
+        """Return the word id of 'word', adding the word if it is new.
+
+        The same id is returned whether the word was just added or was
+        already present.
+        """
+
+        if self._lexicon.has_key(word):
+            return self._lexicon[word]
+
+        word = intern(word)
+        wid = self.counter
+        self._lexicon[word] = wid
+        self._inverseLex[wid] = word
+
+        ## now, split the word into digrams and insert references
+        ## to 'word' into the digram object.  The first and last
+        ## digrams in the list are specially marked with $ to
+        ## indicate the beginning and end of the word
+
+        digrams = [self.eow + word[0]]        # mark the beginning
+        for i in range(len(word)):
+            digrams.append(word[i:i+2])
+        digrams[-1] = digrams[-1] + self.eow  # mark the end
+
+        _digrams = self._digrams
+
+        for digram in digrams:
+            wids = _digrams.get(digram)
+            if wids is None:
+                _digrams[digram] = wids = intSet()
+
+            wids.insert(wid)
+
+        ## re-assigned as before; presumably this flags the persistent
+        ## object as changed -- TODO confirm
+        self._digrams = _digrams
+
+        self.counter = wid + 1
+        ## return the id that was actually assigned, not the next free
+        ## one, so that it agrees with later lookups of the same word
+        return wid
+
+    def _getDigrams(self, pattern):
+        """Return the digrams every word matching 'pattern' must contain.
+
+        Wildcard characters never take part in a digram.  The digrams
+        are built the same way set() builds them for a stored word.
+        """
+
+        wc_set = [self.multi_wc, self.single_wc]
+        last = len(pattern) - 1
+        digrams = []
+        for i in range(len(pattern)):
+            c = pattern[i]
+            if c in wc_set:
+                continue
+
+            if i == 0:
+                digrams.append(self.eow + c)      # word starts with c
+
+            if i == last:
+                digrams.append(c + self.eow)      # word ends with c
+            elif pattern[i+1] not in wc_set:
+                digrams.append(c + pattern[i+1])
+
+        return digrams
+
+    def query(self, pattern):
+        """ Query the lexicon for words matching a pattern.
+
+        Returns a list of the word ids whose words match 'pattern' as
+        a whole.  An empty tuple is returned when the pattern yields
+        no digram at all (for example '*' or '*a*') or when one of its
+        digrams does not occur in any stored word.
+        """
+
+        ## now get all of the intsets that contain the result digrams
+
+        result = None
+        for digram in self._getDigrams(pattern):
+            wids = self._digrams.get(digram)
+            if wids is None:
+                ## no stored word contains this digram, so no word
+                ## can match the pattern
+                return ()
+            if result is None:
+                result = wids
+            else:
+                ## intersection() hands back a new set; keep it
+                result = result.intersection(wids)
+
+        if result is None:
+            return ()
+
+        ## now we have narrowed the list of possible candidates
+        ## down to those words which contain digrams.  However,
+        ## some words may have been returned that match digrams,
+        ## but do not match 'pattern'.  This is because some words
+        ## may contain all matching digrams, but in the wrong
+        ## order.
+
+        expr = re.compile(self.translate(pattern))
+        hits = []
+        for x in result:
+            ## match() anchors at the start of the word and translate()
+            ## anchors the end, so the whole word has to fit
+            if expr.match(self._inverseLex[x]):
+                hits.append(x)
+
+        return hits
+
+    def __getitem__(self, word):
+        """ Same as query(word). """
+        return self.query(word)
+
+    def translate(self, pat):
+        """Translate a PATTERN to a regular expression.
+
+        'multi_wc' becomes '.*', 'single_wc' becomes '.', and every
+        other character is escaped so that it matches literally.  The
+        expression is terminated with '$'.
+
+        There is no way to quote meta-characters.
+        """
+
+        res = ''
+        for c in pat:
+            if c == self.multi_wc:
+                res = res + '.*'
+            elif c == self.single_wc:
+                res = res + '.'
+            else:
+                res = res + re.escape(c)
+        return res + "$"
+
+
+
+
+
+
+
+
+
+
+
+
--- a/lib/python/SearchIndex/Lexicon.py
+++ b/lib/python/SearchIndex/Lexicon.py
@@ -113,15 +113,8 @@ class Lexicon(Persistent, Implicit):

    """

-    def __init__(self, globbish=None):
+    def __init__(self):
        self._lexicon = OIBTree()
-        if globbish:
-            self._ngrams = OOBTree()
-        self.counter = 0
-
-    def __getitem__(self, key):
-        """ overload mapping behavior """
-        return self._lexicon[key]

    def set(self, word):
        """ return the word id of 'word' """
@@ -134,19 +127,23 @@ class Lexicon(Persistent, Implicit):
            self.counter = self.counter + 1
            return self.counter

+    def get(self, key):
+        """Return the entry stored for 'key' in the word lexicon.
+
+        This is a plain item lookup on self._lexicon, so a missing key
+        raises whatever error that mapping raises.
+        """
+        word_ids = self._lexicon
+        return word_ids[key]
+
    def __len__(self):
        return len(self._lexicon)

-
    def Splitter(self, astring, words):
        """ wrap the splitter """
        return Splitter(astring, words)
-        

    def grep(self, query):
        """
        regular expression search through the lexicon
        he he.
+
+        Do not use unless you know what you're doing!!!
        """
        expr = re.compile(query)
        hits = []
@@ -155,6 +152,12 @@ class Lexicon(Persistent, Implicit):
                hits.append(x)
        return hits

+
+
+
+
+
+
 AndNot    = 'andnot'
 And       = 'and'
 Or        = 'or'
@@ -166,9 +169,27 @@ def query(s, index, default_operator = Or,
    # First replace any occurences of " and not " with " andnot "
    s = ts_regex.gsub('[%s]+and[%s]*not[%s]+' % (ws * 3), ' andnot ', s)
    q = parse(s)
+    q = parse_wc(q, index)
    q = parse2(q, default_operator)
    return evaluate(q, index)

+def parse_wc(q, index):
+    '''Expand wildcard terms of a parsed query into concrete words.
+
+    Each element of 'q' that contains one of the lexicon's wildcard
+    characters is replaced by the words the lexicon returns for it,
+    separated by Or.  All other elements are copied unchanged.  A
+    wildcard term with no matching word contributes nothing.
+    '''
+    lexicon = index.getLexicon(index._lexicon)
+    wildcards = (lexicon.multi_wc, lexicon.single_wc)
+    expanded = []
+    for term in q:
+        has_wildcard = 0
+        for wc in wildcards:
+            if wc in term:
+                has_wildcard = 1
+                break
+
+        if not has_wildcard:
+            expanded.append(term)
+            continue
+
+        for wid in lexicon.query(term):
+            # NOTE(review): Or is added whenever anything precedes the
+            # word, so it also joins the first expansion to an earlier
+            # term of the query -- confirm that this is intended.
+            if expanded:
+                expanded.append(Or)
+            expanded.append(lexicon._inverseLex[wid])
+
+    return expanded
+            
 def parse(s):
    '''Parse parentheses and quotes'''
    l = []

--- a/lib/python/SearchIndex/UnIndex.py
+++ b/lib/python/SearchIndex/UnIndex.py
@@ -84,9 +84,10 @@
 ##############################################################################

 """Simple column indices"""
-__version__='$Revision: 1.9 $'[11:-2]
+__version__='$Revision: 1.10 $'[11:-2]

 from Globals import Persistent
+from Acquisition import Implicit
 import BTree
 import IOBTree
 from intSet import intSet
@@ -107,7 +108,7 @@ def nonEmpty(s):
        return 1


-class UnIndex(Persistent):
+class UnIndex(Persistent, Implicit):
    """UnIndex object interface"""

    def __init__(self, id=None, ignore_ex=None, call_methods=None):

--- a/lib/python/SearchIndex/UnTextIndex.py
+++ b/lib/python/SearchIndex/UnTextIndex.py
@@ -92,10 +92,11 @@ is no longer known.


 """
-__version__='$Revision: 1.18 $'[11:-2]
+__version__='$Revision: 1.19 $'[11:-2]

 from Globals import Persistent
 import BTree, IIBTree, IOBTree, OIBTree
+from Acquisition import Implicit
 BTree=BTree.BTree
 IOBTree=IOBTree.BTree
 IIBucket=IIBTree.Bucket
@@ -110,7 +111,7 @@ import string, regex, regsub, pdb
 from Lexicon import Lexicon, query, stop_word_dict
 from ResultList import ResultList

-class UnTextIndex(Persistent):
+class UnTextIndex(Persistent, Implicit):

    def __init__(self, id=None, ignore_ex=None,
                 call_methods=None, lexicon=None):
@@ -161,9 +162,20 @@ class UnTextIndex(Persistent):
        if lexicon is None:
            self._lexicon=Lexicon()
        else:
-            self._lexicon=lexicon
+            self._lexicon = lexicon


+    def getLexicon(self, vocab_id):
+        """Return the lexicon of the vocabulary named 'vocab_id'.
+
+        The vocabulary is looked up as an attribute of this index and
+        its 'lexicon' attribute is returned.  Indexes were made
+        acquirers so that they can acquire a vocabulary object from
+        the Zope object system; this is a bit of a hack, since indexes
+        were probably never intended to participate in this way.
+        """
+        return getattr(self, vocab_id).lexicon
+        
+
    def __len__(self):
        return len(self._unindex)

@@ -213,7 +225,7 @@ class UnTextIndex(Persistent):

        ## The Splitter should now be european compliant at least.
        ## Someone should test this.
-        src = self._lexicon.Splitter(k, self._syn)
+        src = self.getLexicon(self._lexicon).Splitter(k, self._syn)
        ## This returns a tuple of stemmed words.  Stopwords have been 
        ## stripped.
        
@@ -226,7 +238,7 @@ class UnTextIndex(Persistent):

        index = self._index
        unindex = self._unindex
-        lexicon = self._lexicon
+        lexicon = self.getLexicon(self._lexicon)
        get = index.get
        unindex[i] = []
        times = 0
@@ -297,28 +309,20 @@ class UnTextIndex(Persistent):
    def __getitem__(self, word):
        """Return an InvertedIndex-style result "list"
        """
-        src = tuple(self._lexicon.Splitter(word, self._syn))
-        if not src:
-            return ResultList({}, (word,), self)
-
+        src = tuple(self.getLexicon(self._lexicon).Splitter(word, self._syn))
+        if not src: return ResultList({}, (word,), self)
        if len(src) == 1:
            src=src[0]
-            if src[:1]=='"' and src[-1:]=='"':
-                return self[src]
-
-            r = self._index.get(self._lexicon[word], None)
-            if r is None:
-                r = {}
+            if src[:1]=='"' and src[-1:]=='"': return self[src]
+            r = self._index.get(self.getLexicon(self._lexicon)[word][0],None)
+            if r is None: r = {}
            return ResultList(r, (word,), self)
            
        r = None
        for word in src:
            rr = self[word]
-
-            if r is None:
-                r = rr
-            else:
-                r = r.near(rr)
+            if r is None: r = rr
+            else: r = r.near(rr)

        return r

@@ -393,13 +397,13 @@ class UnTextIndex(Persistent):

        r = []
        for word in words:
-            r = r+self._lexicon.Splitter(doc, self._syn).indexes(word)
+            r = r+self.getLexicon(self._lexicon).Splitter(doc, self._syn).indexes(word)
        return r


    def _subindex(self, isrc, d, old, last):

-        src = self._lexicon.Splitter(isrc, self._syn)  
+        src = self.getLexicon.Splitter(isrc, self._syn)  

        for s in src:
            if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)