Commit 61e89f2f authored by Guido van Rossum

Merged TextIndexDS9-branch into trunk.

parent a340cb9d
from Products.ZCTextIndex.ISplitter import ISplitter
import re
class HTMLSplitter:
__implements__ = ISplitter
def process(self, text):
return re.sub('<[^>]*>', ' ', text).split()
class HTMLWordSplitter:
__implements__ = ISplitter
def process(self, text):
splat = []
for t in text:
splat += self.split(t)
return splat
def split(self, text):
text = text.lower()
remove = ["<[^>]*>",
"&[A-Za-z]+;",
"\W+"]
for pat in remove:
text = re.sub(pat, " ", text)
rx = re.compile("[A-Za-z]")
return [word for word in text.split()
if len(word) > 1 and rx.search(word)]
if __name__ == "__main__":
import sys
splitter = HTMLWordSplitter()
for path in sys.argv[1:]:
f = open(path, "rb")
buf = f.read()
f.close()
print path
print splitter.process([buf])
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Index Interface."""
import Interface
class IIndex(Interface.Base):
"""Interface for an Index."""
def search(term):
"""Execute a search on a single term given as a string.
Return an IIBucket.
"""
def search_phrase(phrase):
"""Execute a search on a phrase given as a string.
Return an IIBucket.
"""
def search_glob(pattern):
"""Execute a pattern search.
The pattern represents a set of words by using * and ?. For
example, "foo*" represents the set of all words in the lexicon
starting with "foo".
NOTE: Currently only a single trailing * is supported.
Return an IIBucket.
"""
def query_weight(terms):
"""Return the weight for a set of query terms.
'terms' is a sequence of all terms included in the query,
excluding terms that occur inside a NOT. If a term appears more than
once in a query, it should appear more than once in terms.
"""
def index_doc(docid, text):
"XXX"
def unindex_doc(docid):
"XXX"
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Base as Interface
class ILexicon(Interface):
"""Object responsible for converting text to word identifiers."""
def termToWordIds(text):
"""Return a sequence of ids of the words parsed from the text.
The input text may be either a string or a list of strings.
The text is parsed as search terms; words that aren't in the
lexicon are skipped.
"""
def sourceToWordIds(text):
"""Return a sequence of ids of the words parsed from the text.
The input text may be either a string or a list of strings.
The text is parsed as coming from a source document; new word ids
are created for words that aren't (yet) in the lexicon.
"""
def globToWordIds(pattern):
"""Return a sequence of ids of words matching the pattern.
The argument should be a single word using globbing syntax,
e.g. 'foo*' meaning anything starting with 'foo'.
NOTE: Currently only a single trailing * is supported.
Returns the wids for all words in the lexicon that match the
pattern.
"""
def length():
"""Return the number of unique term in the lexicon."""
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""NBest Interface.
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
import Interface
class INBest(Interface.Base):
"""Interface for an N-Best chooser."""
def add(item, score):
"""Record that item 'item' has score 'score'. No return value.
The N best-scoring items are remembered, where N was passed to
the constructor. 'item' can be anything. 'score' should be
a number, and larger numbers are considered better.
"""
def addmany(sequence):
"""Like "for item, score in sequence: self.add(item, score)".
This is simply faster than calling add() len(sequence) times.
"""
def getbest():
"""Return the (at most) N best-scoring items as a sequence.
The return value is a sequence of 2-tuples, (item, score), with
the largest score first. If .add() has been called fewer than
N times, this sequence will contain fewer than N pairs.
"""
def pop_smallest():
"""Return and remove the (item, score) pair with lowest score.
If len(self) is 0, raise IndexError.
To be clear, this is the lowest score among the N best-scoring
seen so far. This is most useful if the capacity of the NBest
object is never exceeded, in which case pop_smallest() allows
using the object as an ordinary smallest-in-first-out priority
queue.
"""
def __len__():
"""Return the number of (item, score) pairs currently known.
This is N (the value passed to the constructor), unless .add()
has been called fewer than N times.
"""
def capacity():
"""Return the maximum number of (item, score) pairs.
This is N (the value passed to the constructor).
"""
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Base as Interface
class IPipelineElement(Interface):
def process(source):
"""Provide a text processing step.
Process a source sequence of words into a result sequence.
"""
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser Interface."""
import Interface
class IQueryParser(Interface.Base):
"""Interface for Query Parsers."""
def parseQuery(query):
"""Parse a query string.
Return a parse tree (which implements IQueryParseTree).
May raise ParseTree.ParseError.
"""
class IQueryParseTree(Interface.Base):
"""Interface for parse trees returned by parseQuery()."""
def nodeType():
"""Return the node type.
This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
"""
def getValue():
"""Return a node-type specific value.
For node type: Return:
'AND' a list of parse trees
'OR' a list of parse trees
'NOT' a parse tree
'ATOM' a string (representing a single search term)
'PHRASE' a string (representing a search phrase)
'GLOB' a string (representing a pattern, e.g. "foo*")
"""
def terms():
"""Return a list of all terms in this node, excluding NOT subtrees."""
def executeQuery(index):
"""Execute the query represented by this node against the index.
The index argument must implement the IIndex interface.
Return an IIBucket or IIBTree mapping document ids to scores
(higher scores mean better results).
May raise ParseTree.QueryError.
"""
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Base as Interface
class ISplitter(Interface):
"""A splitter."""
def process(text):
"""Run the splitter over the input text, returning a list of terms."""
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking."""
import math
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IISet
from BTrees.IIBTree import weightedIntersection, weightedUnion
from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
# Instead of storing floats, we generally store scaled ints. Binary pickles
# can store those more efficiently. The default SCALE_FACTOR of 1024
# is large enough to get about 3 decimal digits of fractional info, and
# small enough so that scaled values should almost always fit in a signed
# 16-bit int (we're generally storing logs, so a few bits before the radix
# point goes a long way; on the flip side, for reasonably small numbers x
# most of the info in log(x) is in the fractional bits, so we do want to
# save a lot of those).
SCALE_FACTOR = 1024.0
def scaled_int(f, scale=SCALE_FACTOR):
# We expect only positive inputs, so "add a half and chop" is the
same as round(). Surprisingly, calling round() is significantly more
# expensive.
return int(f * scale + 0.5)
class Index:
__implements__ = IIndex
def __init__(self, lexicon):
self._lexicon = lexicon
# wid -> { docid -> frequency }
self._wordinfo = IOBTree()
# docid -> W(docid)
self._docweight = IIBTree()
# docid -> [ wid ]
# used for un-indexing
self._docwords = IOBTree()
def length(self):
"""Return the number of documents in the index."""
return len(self._docwords)
# Most of the computation for computing a relevance score for the
# document occurs in the search() method. The code currently
# implements the cosine similarity function described in Managing
# Gigabytes, eq. 4.3, p. 187. The index_object() method
# precomputes some values that are independent of the particular
# query.
# The equation is
#
# sum(for t in I(d,q): w(d,t) * w(q,t))
# cosine(d, q) = -------------------------------------
# W(d) * W(q)
#
# where
# I(d, q) = the intersection of the terms in d and q.
#
# w(d, t) = 1 + log f(d, t)
# computed by doc_term_weight(); for a given word t,
# self._wordinfo[t] is a map from d to w(d, t).
#
# w(q, t) = log(1 + N/f(t))
# computed by query_term_weight()
#
# W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
# computed by _get_frequencies(), and remembered in
# self._docweight[d]
#
# W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
# computed by self.query_weight()
def index_doc(self, docid, text):
wids = self._lexicon.sourceToWordIds(text)
uniqwids, freqs, docweight = self._get_frequencies(wids)
for i in range(len(uniqwids)):
self._add_wordinfo(uniqwids[i], freqs[i], docid)
self._docweight[docid] = docweight
self._add_undoinfo(docid, wids)
def unindex_doc(self, docid):
for wid in self._get_undoinfo(docid):
self._del_wordinfo(wid, docid)
del self._docwords[docid]
del self._docweight[docid]
def search(self, term):
wids = self._lexicon.termToWordIds(term)
return self._union(self._search_wids(wids))
def search_glob(self, pattern):
wids = self._lexicon.globToWordIds(pattern)
return self._union(self._search_wids(wids))
def search_phrase(self, phrase):
wids = self._lexicon.termToWordIds(phrase)
hits = self._intersection(self._search_wids(wids))
if not hits:
return hits
code = WidCode.encode(wids)
result = IIBTree()
for docid, weight in hits.items():
docwords = self._docwords[docid]
if docwords.find(code) >= 0:
result[docid] = weight
return result
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._docweight))
L = []
DictType = type({})
for wid in wids:
d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
idf = query_term_weight(len(d2w), N) # this is an unscaled float
#print "idf = %.3f" % idf
if isinstance(d2w, DictType):
d2w = IIBucket(d2w)
L.append((d2w, scaled_int(idf)))
L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
return L
def _intersection(self, L):
if not L:
return IIBTree()
d2w, weight = L[0]
dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
for d2w, weight in L[1:]:
dummy, result = weightedIntersection(result, d2w, 1, weight)
return result
def _union(self, L):
# XXX This can be optimized, see OkapiIndex
result = IIBTree()
for d2w, weight in L:
dummy, result = weightedUnion(result, d2w, 1, weight)
return result
def query_weight(self, terms):
wids = []
for term in terms:
wids += self._lexicon.termToWordIds(term)
N = float(len(self._docweight))
sum = 0.0
for wid in wids:
wt = math.log(1.0 + N / len(self._wordinfo[wid]))
sum += wt ** 2.0
return scaled_int(math.sqrt(sum))
def _get_frequencies(self, wids):
"""Return individual doc-term weights and docweight."""
# Computes w(d, t) for each term, and W(d).
# Return triple:
# [wid0, wid1, ...],
# [w(d, wid0)/W(d), w(d, wid1)/W(d), ...],
# W(d)
# The second list and W(d) are scaled_ints.
d = {}
for wid in wids:
d[wid] = d.get(wid, 0) + 1
Wsquares = 0.0
weights = []
push = weights.append
for count in d.values():
w = doc_term_weight(count)
Wsquares += w * w
push(w)
W = math.sqrt(Wsquares)
#print "W = %.3f" % W
for i in xrange(len(weights)):
#print i, ":", "%.3f" % weights[i],
weights[i] = scaled_int(weights[i] / W)
#print "->", weights[i]
return d.keys(), weights, scaled_int(W)
DICT_CUTOFF = 10
def _add_wordinfo(self, wid, f, docid):
# Store a wordinfo in a dict as long as there are less than
# DICT_CUTOFF docids in the dict. Otherwise use an IIBTree.
# The pickle of a dict is smaller than the pickle of an
# IIBTree, substantially so for small mappings. Thus, we use
# a dictionary until the mapping reaches DICT_CUTOFF elements.
# The cutoff is chosen based on the implementation
# characteristics of Python dictionaries. The dict hashtable
# always has 2**N slots and is resized whenever it is 2/3s
# full. A pickled dict with 10 elts is half the size of an
# IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4. So
# choose 10 as the cutoff for now.
# The IIBTree has a smaller in-memory representation than a
# dictionary, so pickle size isn't the only consideration when
# choosing the threshold. The pickle of a 500-elt dict is 92%
# of the size of the same IIBTree, but the dict uses more
# space when it is live in memory. An IIBTree stores two C
# arrays of ints, one for the keys and one for the values. It
# holds up to 120 key-value pairs in a single bucket.
try:
map = self._wordinfo[wid]
except KeyError:
map = {}
else:
# _add_wordinfo() is called for each update. If the map
# size exceeds the DICT_CUTOFF, convert to an IIBTree.
if len(map) == self.DICT_CUTOFF:
map = IIBTree(map)
map[docid] = f
self._wordinfo[wid] = map # Not redundant, because of Persistency!
def _del_wordinfo(self, wid, docid):
try:
map = self._wordinfo[wid]
del map[docid]
except KeyError:
return
if len(map) == 0:
del self._wordinfo[wid]
return
if len(map) == self.DICT_CUTOFF:
new = {}
for k, v in map.items():
new[k] = v
map = new
self._wordinfo[wid] = map # Not redundant, because of Persistency!
def _add_undoinfo(self, docid, wids):
self._docwords[docid] = WidCode.encode(wids)
def _get_undoinfo(self, docid):
return WidCode.decode(self._docwords[docid])
# The rest are helper methods to support unit tests
def _get_wdt(self, d, t):
wid, = self._lexicon.termToWordIds(t)
map = self._wordinfo[wid]
return map.get(d, 0) * self._docweight[d] / SCALE_FACTOR
def _get_Wd(self, d):
return self._docweight[d]
def _get_ft(self, t):
wid, = self._lexicon.termToWordIds(t)
return len(self._wordinfo[wid])
def _get_wt(self, t):
wid, = self._lexicon.termToWordIds(t)
map = self._wordinfo[wid]
return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))
def doc_term_weight(count):
"""Return the doc-term weight for a term that appears count times."""
# implements w(d, t) = 1 + log f(d, t)
return 1.0 + math.log(count)
def query_term_weight(term_count, num_items):
"""Return the query-term weight for a term,
that appears in term_count items in a collection with num_items
total items.
"""
# implements w(q, t) = log(1 + N/f(t))
return math.log(1.0 + float(num_items) / term_count)
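# A small illustrative sketch of how the weighting helpers above combine.
# The numbers and the _demo_ helper are made up: suppose a document
# contains the term 3 times, and 10 of the 100 documents contain it.
def _demo_cosine_weights():
    f_dt = 3     # f(d, t): occurrences of the term in the document
    N = 100      # total number of documents in the collection
    f_t = 10     # number of documents containing the term
    w_dt = doc_term_weight(f_dt)        # w(d, t) = 1 + log f(d, t)
    w_qt = query_term_weight(f_t, N)    # w(q, t) = log(1 + N/f(t))
    # The index stores these as scaled ints (see SCALE_FACTOR above).
    print("w(d,t) = %.3f, scaled %d" % (w_dt, scaled_int(w_dt)))
    print("w(q,t) = %.3f, scaled %d" % (w_qt, scaled_int(w_qt)))

if __name__ == "__main__":
    _demo_cosine_weights()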
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
import re
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.StopDict import get_stopdict
class Lexicon:
__implements__ = ILexicon
def __init__(self, *pipeline):
self.__wids = OIBTree()
self.__words = IOBTree()
# XXX we're reserving wid 0, but that might be yagni
self.__nextwid = 1
self.__pipeline = pipeline
def length(self):
"""Return the number of unique terms in the lexicon."""
return self.__nextwid - 1
def words(self):
return self.__wids.keys()
def wids(self):
return self.__words.keys()
def items(self):
return self.__wids.items()
def sourceToWordIds(self, text):
last = _text2list(text)
for element in self.__pipeline:
last = element.process(last)
return map(self._getWordIdCreate, last)
def termToWordIds(self, text):
last = _text2list(text)
for element in self.__pipeline:
last = element.process(last)
wids = []
for word in last:
wid = self.__wids.get(word)
if wid is not None:
wids.append(wid)
return wids
def globToWordIds(self, pattern):
if not re.match("^\w+\*$", pattern):
return []
pattern = pattern.lower()
assert pattern.endswith("*")
prefix = pattern[:-1]
assert prefix and not prefix.endswith("*")
keys = self.__wids.keys(prefix) # Keys starting at prefix
wids = []
words = []
for key in keys:
if not key.startswith(prefix):
break
wids.append(self.__wids[key])
words.append(key)
return wids
def _getWordIdCreate(self, word):
wid = self.__wids.get(word)
if wid is None:
wid = self.__new_wid()
self.__wids[word] = wid
self.__words[wid] = word
return wid
def __new_wid(self):
wid = self.__nextwid
self.__nextwid += 1
return wid
def _text2list(text):
# Helper: splitter input may be a string or a list of strings
try:
text + ""
except:
return text
else:
return [text]
# Sample pipeline elements
class Splitter:
import re
rx = re.compile(r"\w+")
def process(self, lst):
result = []
for s in lst:
result += self.rx.findall(s)
return result
class CaseNormalizer:
def process(self, lst):
return [w.lower() for w in lst]
class StopWordRemover:
dict = get_stopdict().copy()
for c in range(255):
dict[chr(c)] = None
def process(self, lst):
has_key = self.dict.has_key
return [w for w in lst if not has_key(w)]
try:
from Products.ZCTextIndex import stopper as _stopper
except ImportError:
pass
else:
_stopwords = StopWordRemover.dict
def StopWordRemover():
swr = _stopper.new()
swr.dict.update(_stopwords)
return swr
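# Illustrative sketch: wire the sample pipeline elements above into a
# Lexicon. The input strings and the _demo_ helper are made up; only code
# defined in this module is used.
def _demo_lexicon():
    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    # Indexing a source document creates new word ids as needed.
    print(lexicon.sourceToWordIds("The quick brown fox jumps"))
    # Query terms reuse existing ids; unknown words are silently dropped.
    print(lexicon.termToWordIds("QUICK unknownword"))
    print(lexicon.length())    # number of unique terms seen so far

if __name__ == "__main__":
    _demo_lexicon()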
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""NBest
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
from bisect import bisect
from Products.ZCTextIndex.INBest import INBest
class NBest:
__implements__ = INBest
def __init__(self, N):
"Build an NBest object to remember the N best-scoring objects."
if N < 1:
raise ValueError("NBest() argument must be at least 1")
self._capacity = N
# This does a very simple thing with sorted lists. For large
# N, a min-heap can be unboundedly better in terms of data
# movement time.
self.scores = []
self.items = []
def __len__(self):
return len(self.scores)
def capacity(self):
return self._capacity
def add(self, item, score):
self.addmany([(item, score)])
def addmany(self, sequence):
scores, items, capacity = self.scores, self.items, self._capacity
n = len(scores)
for item, score in sequence:
# When we're in steady-state, the usual case is that we're filled
# to capacity, and that an incoming item is worse than any of
# the best-seen so far.
if n >= capacity and score <= scores[0]:
continue
i = bisect(scores, score)
scores.insert(i, score)
items.insert(i, item)
if n == capacity:
del items[0], scores[0]
else:
n += 1
assert n == len(scores)
def getbest(self):
result = zip(self.items, self.scores)
result.reverse()
return result
def pop_smallest(self):
if self.scores:
return self.items.pop(0), self.scores.pop(0)
raise IndexError("pop_smallest() called on empty NBest object")
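# Quick usage sketch: keep the three best-scoring items from a stream of
# (item, score) pairs. The data and the _demo_ helper are made up.
def _demo_nbest():
    best = NBest(3)
    best.addmany([("a", 10), ("b", 5), ("c", 7), ("d", 12), ("e", 1)])
    print(best.getbest())       # [('d', 12), ('a', 10), ('c', 7)]
    print(best.pop_smallest())  # ('c', 7), the worst of the three kept

if __name__ == "__main__":
    _demo_nbest()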
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking, using an Okapi BM25 rank."""
# Lots of comments are at the bottom of this file. Read them to
# understand what's going on.
import math
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IISet
from BTrees.IIBTree import weightedIntersection, weightedUnion
from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.NBest import NBest
# Instead of storing floats, we generally store scaled ints. Binary pickles
# can store those more efficiently. The default SCALE_FACTOR of 1024
# is large enough to get about 3 decimal digits of fractional info, and
# small enough so that scaled values should almost always fit in a signed
# 16-bit int (we're generally storing logs, so a few bits before the radix
# point goes a long way; on the flip side, for reasonably small numbers x
# most of the info in log(x) is in the fractional bits, so we do want to
# save a lot of those).
SCALE_FACTOR = 1024.0
def scaled_int(f, scale=SCALE_FACTOR):
# We expect only positive inputs, so "add a half and chop" is the
same as round(). Surprisingly, calling round() is significantly more
# expensive.
return int(f * scale + 0.5)
class Index:
__implements__ = IIndex
# BM25 free parameters.
K1 = 1.2
B = 0.75
assert K1 >= 0.0
assert 0.0 <= B <= 1.0
def __init__(self, lexicon):
self._lexicon = lexicon
# wid -> { docid -> frequency }; t -> D -> f(D, t)
self._wordinfo = IOBTree()
# docid -> # of words in the doc
# XXX this is just len(self._docwords[docid]), but if _docwords
# XXX is stored in compressed form then uncompressing just to count
# XXX the list length would be ridiculously expensive.
self._doclen = IIBTree()
# docid -> [ wid ]
# used for un-indexing
self._docwords = IOBTree()
# sum(self._doclen.values()), the total # of words in all docs
self._totaldoclen = 0L
def length(self):
"""Return the number of documents in the index."""
return len(self._docwords)
# Most of the computation for computing a relevance score for the
# document occurs in the search() method.
def index_doc(self, docid, text):
wids = self._lexicon.sourceToWordIds(text)
self._doclen[docid] = len(wids)
self._totaldoclen += len(wids)
wid2count = self._get_frequencies(wids)
for wid, count in wid2count.items():
self._add_wordinfo(wid, count, docid)
self._add_undoinfo(docid, wids)
def unindex_doc(self, docid):
for wid in self._get_undoinfo(docid):
self._del_wordinfo(wid, docid)
del self._docwords[docid]
count = self._doclen[docid]
del self._doclen[docid]
self._totaldoclen -= count
def search(self, term):
wids = self._lexicon.termToWordIds(term)
return self._union(self._search_wids(wids))
def search_glob(self, pattern):
wids = self._lexicon.globToWordIds(pattern)
return self._union(self._search_wids(wids))
def search_phrase(self, phrase):
wids = self._lexicon.termToWordIds(phrase)
hits = self._intersection(self._search_wids(wids))
if not hits:
return hits
code = WidCode.encode(wids)
result = IIBTree()
for docid, weight in hits.items():
docwords = self._docwords[docid]
if docwords.find(code) >= 0:
result[docid] = weight
return result
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._doclen))
L = []
K1 = self.K1
B = self.B
K1_plus1 = K1 + 1.0
B_from1 = 1.0 - B
meandoclen = self._totaldoclen / N
# f(D, t) * (k1 + 1)
# TF(D, t) = -------------------------------------------
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
for wid in wids:
d2f = self._wordinfo[wid] # map {docid -> f(docid, wid)}
idf = inverse_doc_frequency(len(d2f), N) # this is an unscaled float
result = IIBucket()
for docid, f in d2f.items():
lenweight = B_from1 + B * self._doclen[docid] / meandoclen
tf = f * K1_plus1 / (f + K1 * lenweight)
result[docid] = scaled_int(tf * idf)
L.append((result, 1))
return L
# Note about the above: the result is tf * idf. tf is small -- it
# can't be larger than k1+1 = 2.2. idf is formally unbounded, but
# is less than 14 for a term that appears in only 1 of a million
# documents. So the product is probably less than 32, or 5 bits
# before the radix point. If we did the scaled-int business on
# both of them, we'd be up to 25 bits. Add 64 of those and we'd
# be in overflow territory. That's pretty unlikely, so we *could*
# just store scaled_int(tf) in result[docid], and use scaled_int(idf)
# as an invariant weight across the whole result. But besides
# skating near the edge, it's not a speed cure, since the computation
# of tf would still be done at Python speed, and it's a lot more
# work than just multiplying by idf.
def _intersection(self, L):
if not L:
return IIBTree()
# Intersect with smallest first.
L = L[:] # don't mutate the caller's L
L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
d2w, weight = L[0]
dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
for d2w, weight in L[1:]:
dummy, result = weightedIntersection(result, d2w, 1, weight)
return result
def _union(self, L):
if not L:
return IIBTree()
# Balance unions as closely as possible, smallest to largest.
merge = NBest(len(L))
for x, weight in L:
merge.add((x, weight), len(x))
while len(merge) > 1:
# Merge the two smallest so far, and add back to the queue.
x, wx = merge.pop_smallest()
y, wy = merge.pop_smallest()
dummy, z = weightedUnion(x, y, wx, wy)
merge.add((z, 1), len(z))
(result, weight), score = merge.pop_smallest()
return result
def query_weight(self, terms):
# XXX I have no idea what to put here
return 10
def _get_frequencies(self, wids):
"""Return individual term frequencies."""
# Computes f(d, t) for each term.
# Returns a dict mapping wid to the number of times wid appeared
# in wids, {t: f(d, t)}
d = {}
dget = d.get
for wid in wids:
d[wid] = dget(wid, 0) + 1
return d
DICT_CUTOFF = 10
def _add_wordinfo(self, wid, f, docid):
# Store a wordinfo in a dict as long as there are less than
# DICT_CUTOFF docids in the dict. Otherwise use an IIBTree.
# The pickle of a dict is smaller than the pickle of an
# IIBTree, substantially so for small mappings. Thus, we use
# a dictionary until the mapping reaches DICT_CUTOFF elements.
# The cutoff is chosen based on the implementation
# characteristics of Python dictionaries. The dict hashtable
# always has 2**N slots and is resized whenever it is 2/3s
# full. A pickled dict with 10 elts is half the size of an
# IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4. So
# choose 10 as the cutoff for now.
# The IIBTree has a smaller in-memory representation than a
# dictionary, so pickle size isn't the only consideration when
# choosing the threshold. The pickle of a 500-elt dict is 92%
# of the size of the same IIBTree, but the dict uses more
# space when it is live in memory. An IIBTree stores two C
# arrays of ints, one for the keys and one for the values. It
# holds up to 120 key-value pairs in a single bucket.
try:
map = self._wordinfo[wid]
except KeyError:
map = {}
else:
# _add_wordinfo() is called for each update. If the map
# size exceeds the DICT_CUTOFF, convert to an IIBTree.
if len(map) == self.DICT_CUTOFF:
map = IIBTree(map)
map[docid] = f
self._wordinfo[wid] = map # Not redundant, because of Persistency!
def _del_wordinfo(self, wid, docid):
try:
map = self._wordinfo[wid]
del map[docid]
except KeyError:
return
if len(map) == 0:
del self._wordinfo[wid]
return
if len(map) == self.DICT_CUTOFF:
new = {}
for k, v in map.items():
new[k] = v
map = new
self._wordinfo[wid] = map # Not redundant, because of Persistency!
def _add_undoinfo(self, docid, wids):
self._docwords[docid] = WidCode.encode(wids)
def _get_undoinfo(self, docid):
return WidCode.decode(self._docwords[docid])
# The rest are helper methods to support unit tests
# XXX These don't work for Okapi, I assume
def _get_wdt(self, d, t):
wid, = self._lexicon.termToWordIds(t)
map = self._wordinfo[wid]
return map.get(d, 0) * self._doclen[d] / SCALE_FACTOR
def _get_Wd(self, d):
return self._doclen[d]
def _get_ft(self, t):
wid, = self._lexicon.termToWordIds(t)
return len(self._wordinfo[wid])
def _get_wt(self, t):
wid, = self._lexicon.termToWordIds(t)
map = self._wordinfo[wid]
return scaled_int(math.log(1 + len(self._doclen) / float(len(map))))
def inverse_doc_frequency(term_count, num_items):
"""Return the inverse doc frequency for a term,
that appears in term_count items in a collection with num_items
total items.
"""
# implements IDF(q, t) = log(1 + N/f(t))
return math.log(1.0 + float(num_items) / term_count)
"""
"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
It's based on probability arguments about how words are distributed in
documents, not on an abstract vector space model. A long paper by its
principal inventors gives an excellent overview of how it was derived:
A probabilistic model of information retrieval: development and status
K. Sparck Jones, S. Walker, S.E. Robertson
http://citeseer.nj.nec.com/jones98probabilistic.html
Spellings that ignore relevance information (which we don't have) are of this
high-level form:
score(D, Q) = sum(for t in D&Q: TF(D, t) * IDF(Q, t))
where
D a specific document
Q a specific query
t a term (word, atomic phrase, whatever)
D&Q the terms common to D and Q
TF(D, t) a measure of t's importance in D -- a kind of term frequency
weight
IDF(Q, t) a measure of t's importance in the query and in the set of
documents as a whole -- a kind of inverse document frequency
weight
The IDF(Q, t) here is identical to the one used for our cosine measure.
Since queries are expected to be short, it ignores Q entirely:
IDF(Q, t) = log(1.0 + N / f(t))
where
N the total number of documents
f(t) the number of documents in which t appears
Most Okapi literature seems to use log(N/f(t)) instead. We don't, because
that becomes 0 for a term that's in every document, and, e.g., if someone
is searching for "documentation" on python.org (a term that may well show
up on every page, due to the top navigation bar), we still want to find the
pages that use the word a lot (which is TF's job to find, not IDF's -- we
just want to stop IDF from considering this t to be irrelevant).
The TF(D, t) spellings are more interesting. With lots of variations, the
most basic spelling is of the form
f(D, t)
TF(D, t) = ---------------
f(D, t) + K(D)
where
f(D, t) the number of times t appears in D
K(D) a measure of the length of D, normalized to mean doc length
The functional *form* f/(f+K) is clever. It's a gross approximation to a
mixture of two distinct Poisson distributions, based on the idea that t
probably appears in D for one of two reasons:
1. More or less at random.
2. Because it's important to D's purpose in life ("eliteness" in papers).
Note that f/(f+K) is always between 0 and 1. If f is very large compared to
K, it approaches 1. If K is very large compared to f, it approaches 0. If
t appears in D more or less "for random reasons", f is likely to be small,
and so K will dominate unless it's a very small doc, and the ratio will be
small. OTOH, if t appears a lot in D, f will dominate unless it's a very
large doc, and the ratio will be close to 1.
We use a variation on that simple theme, a simplification of what's called
BM25 in the literature (it was the 25th stab at a Best Match function from
the Okapi group; "a simplification" means we're setting some of BM25's more
esoteric free parameters to 0):
f(D, t) * (k1 + 1)
TF(D, t) = --------------------
f(D, t) + k1 * K(D)
where
k1 a "tuning factor", typically between 1.0 and 2.0. We use 1.2,
the usual default value. This constant adjusts the curve to
look more like a theoretical 2-Poisson curve.
Note that as f(D, t) increases, TF(D, t) increases monotonically, approaching
an asymptote of k1+1 from below.
Finally, we use
K(D) = (1-b) + b * len(D)/E(len(D))
where
b is another free parameter, discussed below. We use 0.75.
len(D) the length of D in words
E(len(D)) the expected value of len(D) across the whole document set;
or, IOW, the average document length
b is a free parameter between 0.0 and 1.0, and adjusts for the expected effect
of the "Verbosity Hypothesis". Suppose b is 1, and some word t appears
10 times as often in document d2 as in document d1. If document d2 is
also 10 times as long as d1, TF(d1, t) and TF(d2, t) are identical:
f(d2, t) * (k1 + 1)
TF(d2, t) = --------------------------------- =
f(d2, t) + k1 * len(d2)/E(len(D))
10 * f(d1, t) * (k1 + 1)
----------------------------------------------- = TF(d1, t)
10 * f(d1, t) + k1 * (10 * len(d1))/E(len(D))
because the 10's cancel out. This is appropriate if we believe that a word
appearing 10x more often in a doc 10x as long is simply because the
longer doc is more verbose. If we do believe that, the longer doc and the
shorter doc are probably equally relevant. OTOH, it *could* be that the
longer doc is talking about t in greater depth too, in which case it's
probably more relevant than the shorter doc.
At the other extreme, if we set b to 0, the len(D)/E(len(D)) term vanishes
completely, and a doc scores higher for having more occurrences of a word
regardless of the doc's length.
Reality is between these extremes, and probably varies by document and word
too. Reports in the literature suggest that b=0.75 is a good compromise "in
general", favoring the "verbosity hypothesis" end of the scale.
Putting it all together, the final TF function is
f(D, t) * (k1 + 1)
TF(D, t) = --------------------------------------------
f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
with k1=1.2 and b=0.75.
"""
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Generic parser support: exception and parse tree nodes."""
from BTrees.IIBTree import difference, weightedIntersection, weightedUnion
from Products.ZCTextIndex.NBest import NBest
class QueryError(Exception):
pass
class ParseError(Exception):
pass
class ParseTreeNode:
_nodeType = None
def __init__(self, value):
self._value = value
def nodeType(self):
return self._nodeType
def getValue(self):
return self._value
def __repr__(self):
return "%s(%r)" % (self.__class__.__name__, self.getValue())
def terms(self):
t = []
for v in self.getValue():
t.extend(v.terms())
return t
def executeQuery(self, index):
raise NotImplementedError
class NotNode(ParseTreeNode):
_nodeType = "NOT"
def terms(self):
return []
def executeQuery(self, index):
raise QueryError, "NOT operator must occur right after AND"
class AndNode(ParseTreeNode):
_nodeType = "AND"
def executeQuery(self, index):
L = []
Nots = []
for subnode in self.getValue():
if subnode.nodeType() == "NOT":
Nots.append(subnode.getValue().executeQuery(index))
else:
L.append(subnode.executeQuery(index))
assert L
L.sort(lambda x, y: cmp(len(x), len(y)))
set = L[0]
for x in L[1:]:
dummy, set = weightedIntersection(set, x)
if Nots:
Nots.sort(lambda x, y: cmp(len(x), len(y)))
notset = Nots[0]
for x in Nots[1:]:
dummy, notset = weightedUnion(notset, x)
set = difference(set, notset)
return set
class OrNode(ParseTreeNode):
_nodeType = "OR"
def executeQuery(self, index):
# Balance unions as closely as possible, smallest to largest.
allofem = self.getValue()
merge = NBest(len(allofem))
for subnode in allofem:
result = subnode.executeQuery(index)
merge.add(result, len(result))
while len(merge) > 1:
# Merge the two smallest so far, and add back to the queue.
x, dummy = merge.pop_smallest()
y, dummy = merge.pop_smallest()
dummy, z = weightedUnion(x, y)
merge.add(z, len(z))
result, dummy = merge.pop_smallest()
return result
class AtomNode(ParseTreeNode):
_nodeType = "ATOM"
def terms(self):
return [self.getValue()]
def executeQuery(self, index):
return index.search(self.getValue())
class PhraseNode(AtomNode):
_nodeType = "PHRASE"
def executeQuery(self, index):
return index.search_phrase(self.getValue())
class GlobNode(AtomNode):
_nodeType = "GLOB"
def executeQuery(self, index):
return index.search_glob(self.getValue())
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser.
This particular parser recognizes the following syntax:
Start = OrExpr
OrExpr = AndExpr ('OR' AndExpr)*
AndExpr = Term ('AND' NotExpr)*
NotExpr = ['NOT'] Term
Term = '(' OrExpr ')' | ATOM+
The key words (AND, OR, NOT) are recognized in any mixture of case.
An ATOM is either:
+ A sequence of characters not containing whitespace or parentheses or
double quotes, and not equal to one of the key words 'AND', 'OR', 'NOT'; or
+ A non-empty string enclosed in double quotes. The interior of the string
can contain whitespace, parentheses and key words.
In addition, an ATOM may optionally be preceded by a hyphen, meaning
that it must not be present.
An unquoted ATOM may also end in a star. This is a primitive
"globbing" function, meaning to search for any word with a given
prefix.
When multiple consecutive ATOMs are found at the leaf level, they are
connected by an implied AND operator, and an unquoted leading hyphen
is interpreted as a NOT operator.
Summarizing the default operator rules:
- a sequence of words without operators implies AND, e.g. ``foo bar''
- double-quoted text implies phrase search, e.g. ``"foo bar"''
- words connected by punctuation implies phrase search, e.g. ``foo-bar''
- a leading hyphen implies NOT, e.g. ``foo -bar''
- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
- a trailing * means globbing (i.e. prefix search), e.g. ``foo*''
"""
import re
import ParseTree # relative import
# Create unique symbols for token types.
_AND = intern("AND")
_OR = intern("OR")
_NOT = intern("NOT")
_LPAREN = intern("(")
_RPAREN = intern(")")
_ATOM = intern("ATOM")
_EOF = intern("EOF")
# Map keyword string to token type.
_keywords = {
_AND: _AND,
_OR: _OR,
_NOT: _NOT,
_LPAREN: _LPAREN,
_RPAREN: _RPAREN,
}
# Regular expression to tokenize.
_tokenizer_regex = re.compile(r"""
# a paren
[()]
# or an optional hyphen
| -?
# followed by
(?:
# a string
" [^"]* "
# or a non-empty stretch w/o whitespace, parens or double quotes
| [^()\s"]+
)
""", re.VERBOSE)
class QueryParser:
def __init__(self):
pass # This parser has no persistent state
def parseQuery(self, query):
# Lexical analysis.
tokens = _tokenizer_regex.findall(query)
self.__tokens = tokens
# classify tokens
self.__tokentypes = [_keywords.get(token.upper(), _ATOM)
for token in tokens]
# add _EOF
self.__tokens.append(_EOF)
self.__tokentypes.append(_EOF)
self.__index = 0
# Syntactical analysis.
tree = self._parseOrExpr()
self._require(_EOF)
return tree
# Recursive descent parser
def _require(self, tokentype):
if not self._check(tokentype):
t = self.__tokens[self.__index]
msg = "Token %r required, %r found" % (tokentype, t)
raise ParseTree.ParseError, msg
def _check(self, tokentype):
if self.__tokentypes[self.__index] is tokentype:
self.__index += 1
return 1
else:
return 0
def _peek(self, tokentype):
return self.__tokentypes[self.__index] is tokentype
def _get(self, tokentype):
t = self.__tokens[self.__index]
self._require(tokentype)
return t
def _parseOrExpr(self):
L = []
L.append(self._parseAndExpr())
while self._check(_OR):
L.append(self._parseAndExpr())
if len(L) == 1:
return L[0]
else:
return ParseTree.OrNode(L)
def _parseAndExpr(self):
L = []
L.append(self._parseTerm())
while self._check(_AND):
L.append(self._parseNotExpr())
if len(L) == 1:
return L[0]
else:
return ParseTree.AndNode(L)
def _parseNotExpr(self):
if self._check(_NOT):
return ParseTree.NotNode(self._parseTerm())
else:
return self._parseTerm()
def _parseTerm(self):
if self._check(_LPAREN):
tree = self._parseOrExpr()
self._require(_RPAREN)
else:
atoms = [self._get(_ATOM)]
while self._peek(_ATOM):
atoms.append(self._get(_ATOM))
nodes = []
nots = []
for a in atoms:
words = re.findall(r"\w+\*?", a)
if not words:
continue
if len(words) > 1:
n = ParseTree.PhraseNode(" ".join(words))
elif words[0].endswith("*"):
n = ParseTree.GlobNode(words[0])
else:
n = ParseTree.AtomNode(words[0])
if a[0] == "-":
n = ParseTree.NotNode(n)
nots.append(n)
else:
nodes.append(n)
if not nodes:
text = " ".join(atoms)
msg = "At least one positive term required: %r" % text
raise ParseTree.ParseError, msg
nodes.extend(nots)
if len(nodes) == 1:
tree = nodes[0]
else:
tree = ParseTree.AndNode(nodes)
return tree
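# Usage sketch: parse a made-up query string and inspect the resulting
# tree. executeQuery() would additionally need an object implementing
# IIndex, so only parsing is shown here; the _demo_ helper is illustrative.
def _demo_parser():
    parser = QueryParser()
    tree = parser.parseQuery('foo bar OR "ham eggs" -spam*')
    print(tree)           # nested OR/AND/NOT/ATOM/PHRASE/GLOB nodes
    print(tree.terms())   # positive terms only; NOT subtrees are excluded

if __name__ == "__main__":
    _demo_parser()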
"""Rice coding (a varaitn of Golomb coding)
Based on a Java implementation by Glen McCluskey described in a Usenix
;login: article at
http://www.usenix.org/publications/login/2000-4/features/java.html
McCluskey's article explains the approach as follows. The encoding
for a value x is represented as a unary part and a binary part. The
unary part is a sequence of 1 bits followed by a 0 bit. The binary
part encodes some of the lower bits of x-1.
The encoding is parameterized by a value m that describes how many
bits to store in the binary part. If most of the values are smaller
than 2**m then they can be stored in only m+1 bits.
Compute the length of the unary part, q, where
q = math.floor((x-1) / 2**m)
Emit q 1 bits followed by a 0 bit.
Emit the lower m bits of x-1, treating x-1 as a binary value.
"""
import array
class BitArray:
def __init__(self, buf=None):
self.bytes = array.array('B')
self.nbits = 0
self.bitsleft = 0
self.tostring = self.bytes.tostring
def __getitem__(self, i):
byte, offset = divmod(i, 8)
mask = 2 ** offset
if self.bytes[byte] & mask:
return 1
else:
return 0
def __setitem__(self, i, val):
byte, offset = divmod(i, 8)
mask = 2 ** offset
if val:
self.bytes[byte] |= mask
else:
self.bytes[byte] &= ~mask
def __len__(self):
return self.nbits
def append(self, bit):
"""Append a 1 if bit is true or 1 if it is false."""
if self.bitsleft == 0:
self.bytes.append(0)
self.bitsleft = 8
self.__setitem__(self.nbits, bit)
self.nbits += 1
self.bitsleft -= 1
def __getstate__(self):
return self.nbits, self.bitsleft, self.tostring()
def __setstate__(self, (nbits, bitsleft, s)):
self.bytes = array.array('B', s)
self.nbits = nbits
self.bitsleft = bitsleft
class RiceCode:
def __init__(self, m):
"""Constructor a RiceCode for m-bit values."""
if not (0 <= m <= 16):
raise ValueError, "m must be between 0 and 16"
self.init(m)
self.bits = BitArray()
self.len = 0
def init(self, m):
self.m = m
self.lower = (1 << m) - 1
self.mask = 1 << (m - 1)
def append(self, val):
"""Append an item to the list."""
if val < 1:
raise ValueError, "value >= 1 expected, got %s" % `val`
val -= 1
# emit the unary part of the code
q = val >> self.m
for i in range(q):
self.bits.append(1)
self.bits.append(0)
# emit the binary part
r = val & self.lower
mask = self.mask
while mask:
self.bits.append(r & mask)
mask >>= 1
self.len += 1
def __len__(self):
return self.len
def tolist(self):
"""Return the items as a list."""
l = []
i = 0 # bit offset
binary_range = range(self.m)
for j in range(self.len):
unary = 0
while self.bits[i] == 1:
unary += 1
i += 1
assert self.bits[i] == 0
i += 1
binary = 0
for k in binary_range:
binary = (binary << 1) | self.bits[i]
i += 1
l.append((unary << self.m) + (binary + 1))
return l
def tostring(self):
"""Return a binary string containing the encoded data.
The binary string may contain some extra zeros at the end.
"""
return self.bits.tostring()
def __getstate__(self):
return self.m, self.bits
def __setstate__(self, (m, bits)):
self.init(m)
self.bits = bits
def encode(m, l):
c = RiceCode(m)
for elt in l:
c.append(elt)
assert c.tolist() == l
return c
def encode_deltas(l):
if len(l) == 1:
return l[0], []
deltas = RiceCode(6)
deltas.append(l[1] - l[0])
for i in range(2, len(l)):
deltas.append(l[i] - l[i - 1])
return l[0], deltas
def decode_deltas(start, enc_deltas):
deltas = enc_deltas.tolist()
l = [start]
# deltas[i] is l[i+1] - l[i], so each delta extends the running total.
for d in deltas:
l.append(l[-1] + d)
return l
def test():
import random
for size in [10, 20, 50, 100, 200]:
l = [random.randint(1, size) for i in range(50)]
c = encode(random.randint(1, 16), l)
assert c.tolist() == l
for size in [10, 20, 50, 100, 200]:
l = range(random.randint(1, size), size + random.randint(1, size))
t = encode_deltas(l)
l2 = decode_deltas(*t)
assert l == l2
if l != l2:
print l
print l2
def pickle_efficiency():
import pickle
import random
for m in [4, 8, 12]:
for size in [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
for elt_range in [10, 20, 50, 100, 200, 500, 1000]:
l = [random.randint(1, elt_range) for i in range(size)]
raw = pickle.dumps(l, 1)
enc = pickle.dumps(encode(m, l), 1)
print "m=%2d size=%4d range=%4d" % (m, size, elt_range),
print "%5d %5d" % (len(raw), len(enc)),
if len(raw) > len(enc):
print "win"
else:
print "lose"
if __name__ == "__main__":
test()
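# Usage sketch with made-up data: Rice-encode a short list of positive
# ints, then delta-encode a sorted list the way a wid posting might be.
# The _demo_ helper is illustrative only.
def _demo_ricecode():
    values = [1, 3, 7, 7, 20]
    c = encode(4, values)                 # 4-bit binary part
    print(len(c.tostring()))              # encoded size in bytes
    print(c.tolist() == values)           # True: round-trips exactly
    first, deltas = encode_deltas([10, 12, 15, 21])
    print(decode_deltas(first, deltas))   # [10, 12, 15, 21]

if __name__ == "__main__":
    _demo_ricecode()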
*shared*
stopper stopper.c
"""Provide a default list of stop words for the index.
The specific splitter and lexicon are customizable, but the default
ZCTextIndex should do something useful.
"""
def get_stopdict():
"""Return a dictionary of stopwords."""
return _dict
# This list of English stopwords comes from Lucene
_words = [
"a", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such",
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
]
_dict = {}
for w in _words:
_dict[w] = None
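# Tiny usage sketch: filter a made-up token list against the stop-word
# dictionary defined above.
if __name__ == "__main__":
    stop = get_stopdict()
    tokens = ["the", "zope", "catalog", "is", "fast"]
    print([t for t in tokens if not stop.has_key(t)])   # ['zope', 'catalog', 'fast']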
# A byte-aligned encoding for lists of non-negative ints, using fewer bytes
# for smaller ints. This is intended for lists of word ids (wids). The
# ordinary string .find() method can be used to find the encoded form of a
# desired wid-string in an encoded wid-string. As in UTF-8, the initial byte
# of an encoding can't appear in the interior of an encoding, so find() can't
# be fooled into starting a match "in the middle" of an encoding.
# Details:
#
# + Only the first byte of an encoding has the sign bit set.
#
# + The number of bytes in the encoding is encoded in unary at the start of
# the first byte (i.e., an encoding with n bytes begins with n 1-bits
# followed by a 0 bit).
#
# + Bytes beyond the first in an encoding have the sign bit clear, followed
# by 7 bits of data.
#
# + The number of data bits in the first byte of an encoding varies.
#
# The int to be encoded can contain no more than 24 bits.
# XXX this could certainly be increased
#
# If it contains no more than 6 bits, 00abcdef, the encoding is
# 10abcdef
#
# If it contains 7 thru 12 bits,
# 0000abcd efghijkL
# the encoding is
# 110abcde 0fghijkL
#
# Static tables _encoding and _decoding capture all encodes and decodes for
# 12 or fewer bits.
#
# If it contains 13 thru 18 bits,
# 000000ab cdefghij kLmnopqr
# the encoding is
# 1110abcd 0efghijk 0Lmnopqr
#
# If it contains 19 thru 24 bits,
# abcdefgh ijkLmnop qrstuvwx
# the encoding is
# 11110abc 0defghij 0kLmnopq 0rstuvwx
import re
def encode(wids):
# Encode a list of wids as a string.
wid2enc = _encoding
n = len(wid2enc)
return "".join([w < n and wid2enc[w] or _encode(w) for w in wids])
_encoding = [None] * 0x1000 # Filled later, and converted to a tuple
def _encode(w):
assert 0x1000 <= w < 0x1000000
b, c = divmod(w, 0x80)
a, b = divmod(b, 0x80)
s = chr(b) + chr(c)
if a < 0x10: # no more than 18 data bits
return chr(a + 0xE0) + s
a, b = divmod(a, 0x80)
assert a < 0x4, (w, a, b, s) # else more than 24 data bits
return (chr(a + 0xF0) + chr(b)) + s
_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")
def decode(code):
# Decode a string into a list of wids.
get = _decoding.get
# Obscure: while _decoding does have the key '\x80', its value is 0,
# so the "or" here calls _decode('\x80') anyway.
return [get(p) or _decode(p) for p in _prog.findall(code)]
_decoding = {} # Filled later
def _decode(s):
if s == '\x80':
# See comment in decode(). This is here to allow a trick to work.
return 0
if len(s) == 3:
a, b, c = map(ord, s)
assert a & 0xF0 == 0xE0 and not b & 0x80 and not c & 0x80
return ((a & 0xF) << 14) | (b << 7) | c
assert len(s) == 4, `s`
a, b, c, d = map(ord, s)
assert a & 0xF8 == 0xF0 and not b & 0x80 and not c & 0x80 and not d & 0x80
return ((a & 0x7) << 21) | (b << 14) | (c << 7) | d
def _fill():
global _encoding
for i in range(0x40):
s = chr(i + 0x80)
_encoding[i] = s
_decoding[s] = i
for i in range(0x40, 0x1000):
hi, lo = divmod(i, 0x80)
s = chr(hi + 0xC0) + chr(lo)
_encoding[i] = s
_decoding[s] = i
_encoding = tuple(_encoding)
_fill()
def test():
for i in range(2**20):
if i % 1000 == 0: print i
wids = [i]
code = encode(wids)
assert decode(code) == wids, (wids, code, decode(code))
if __name__ == "__main__":
test()
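# Usage sketch: encode two made-up wid lists and use ordinary string
# .find() to test phrase containment, which is how search_phrase() in the
# index modules above uses these encodings. The _demo_ helper is
# illustrative only.
def _demo_widcode():
    doc = encode([5, 1000, 70000, 5, 12])
    phrase = encode([1000, 70000])
    print(doc.find(phrase) >= 0)   # True: the pair occurs contiguously
    print(decode(doc))             # round-trips to the original wid list

if __name__ == "__main__":
    _demo_widcode()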
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Plug in text index for ZCatalog with relevance ranking."""
import ZODB
from Persistence import Persistent
import Acquisition
from OFS.SimpleItem import SimpleItem
from Products.PluginIndexes.common.PluggableIndex \
import PluggableIndexInterface
from Products.ZCTextIndex.Index import Index
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser
from Globals import DTMLFile
from Interface import verify_class_implementation
class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
__implements__ = PluggableIndexInterface
meta_type = 'ZCTextIndex'
manage_options= (
{'label': 'Settings', 'action': 'manage_main'},
)
def __init__(self, id, extra, caller):
self.id = id
self._fieldname = extra.doc_attr
lexicon = getattr(caller, extra.lexicon_id, None)
if lexicon is None:
raise LookupError, 'Lexicon "%s" not found' % extra.lexicon_id
verify_class_implementation(ILexicon, lexicon.__class__)
self.lexicon = lexicon
self.index = Index(self.lexicon)
self.parser = QueryParser()
def index_object(self, docid, obj):
self.index.index_doc(docid, self._get_object_text(obj))
self._p_changed = 1 # XXX
def unindex_object(self, docid):
self.index.unindex_doc(docid)
self._p_changed = 1 # XXX
def _apply_index(self, req):
pass # XXX
def query(self, query, nbest=10):
# returns a mapping from docids to scores
tree = self.parser.parseQuery(query)
results = tree.executeQuery(self.index)
chooser = NBest(nbest)
chooser.addmany(results.items())
return chooser.getbest()
def _get_object_text(self, obj):
x = getattr(obj, self._fieldname)
if callable(x):
return x()
else:
return x
## User Interface Methods ##
manage_main = DTMLFile('dtml/manageZCTextIndex', globals())
def manage_addZCTextIndex(self, id, extra=None, REQUEST=None,
RESPONSE=None):
"""Add a text index"""
return self.manage_addIndex(id, 'ZCTextIndex', extra,
REQUEST, RESPONSE, REQUEST.URL3)
manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())
manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())
def manage_addLexicon(self, id, title, splitter=None, normalizer=None,
stopword=None, REQUEST=None):
elements = []
if splitter:
elements.append(Splitter())
if normalizer:
elements.append(CaseNormalizer())
if stopword:
elements.append(StopWordRemover())
lexicon = Lexicon(*elements)
self._setObject(id, lexicon)
if REQUEST is not None:
return self.manage_main(self, REQUEST, update_menu=1)
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""ZCatalog Text Index
Experimental plugin text index for ZCatalog.
"""
def initialize(context):
from Products.ZCTextIndex import ZCTextIndex
context.registerClass(
ZCTextIndex.ZCTextIndex,
permission='Add Pluggable Index',
constructors=(ZCTextIndex.manage_addZCTextIndexForm,
ZCTextIndex.manage_addZCTextIndex),
visibility=None
)
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add Lexicon',
)">
<form action="manage_addLexicon" method="post">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Title
</div>
</td>
<td align="left" valign="top">
<input type="text" name="title" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
splitter?
</div>
</td>
<td align="left" valign="top">
<input type="checkbox" name="splitter" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
case normalizer?
</div>
</td>
<td align="left" valign="top">
<input type="checkbox" name="normalizer" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
remove stop words?
</div>
</td>
<td align="left" valign="top">
<input type="checkbox" name="stopword" />
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add ZCTextIndex',
)">
<p class="form-help">
<strong>Text Indexes</strong> break text up into individual words, and
are often referred to as full-text indexes. Text indexes
sort results by score, meaning they return hits in order
from the most relevant to the least relevant.
</p>
<form action="manage_addZCTextIndex" method="post"
enctype="multipart/form-data">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Vocabulary
</div>
</td>
<td>
<select name="extra.vocabulary:record">
<dtml-in "this().aq_parent.objectItems('Vocabulary')">
<option value="&dtml-sequence-key;">&dtml-sequence-key; (<dtml-var "_['sequence-item'].title">)
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Field name
</div></td>
<td align="left" valign="top">
<input type="text" name="extra.doc_attr:record" size="40" />
</td>
</tr>
<tr>
<td align="left" valign"top">
<div class="form-label">
Lexicon
</div></td>
<td>
<select name="extra.lexicon_id:record">
<dtml-in "this().aq_parent.objectItems('Lexicon')">
<option value="&dtml-sequence-key;">&dtml-sequence-key; (<dtml-var "_['sequence-item'].title">)
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Type
</div>
</td>
<td align="left" valign="top">
ZCTextIndex
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
There is nothing to manage here. Move along.
</p>
<dtml-var manage_page_footer>
/* stopper.c
*
* Fast version of the StopWordRemover object.
*/
#include "Python.h"
#include "structmember.h"
typedef struct {
PyObject_HEAD
PyObject *swr_dict;
} StopWordRemover;
static PyObject *
swr_process(StopWordRemover *self, PyObject *args)
{
PyObject *result = NULL;
PyObject *seq;
int len, i;
if (!PyArg_ParseTuple(args, "O:process", &seq))
return NULL;
seq = PySequence_Fast(seq,
"process() requires a sequence as the argument");
if (seq == NULL)
return NULL;
result = PyList_New(0);
if (result == NULL)
goto finally;
#if PY_VERSION_HEX >= 0x02020000
/* Only available in Python 2.2 and newer. */
len = PySequence_Fast_GET_SIZE(seq);
#else
len = PyObject_Length(seq);
#endif
for (i = 0; i < len; ++i) {
PyObject *s = PySequence_Fast_GET_ITEM(seq, i);
/*
* PyDict_GetItem() returns NULL if there isn't a matching
* item, but without setting an exception, so this does what
* we want.
*/
if (PyDict_GetItem(self->swr_dict, s) == NULL)
if (PyList_Append(result, s) < 0) {
Py_DECREF(result);
result = NULL;
goto finally;
}
}
finally:
Py_XDECREF(seq);
return result;
}
static struct memberlist swr_members[] = {
{"dict", T_OBJECT, offsetof(StopWordRemover, swr_dict), READONLY},
{NULL}
};
static PyMethodDef swr_methods[] = {
{"process", (PyCFunction)swr_process, METH_VARARGS,
"process([str, ...]) --> [str, ...]\n"
"Remove stop words from the input list of strings to create a new list."},
{NULL}
};
static PyObject *
swr_getattr(PyObject *self, char *name)
{
PyObject *res;
res = Py_FindMethod(swr_methods, self, name);
if (res != NULL)
return res;
PyErr_Clear();
return PyMember_Get((char *)self, swr_members, name);
}
static void
swr_dealloc(StopWordRemover *self)
{
Py_XDECREF(self->swr_dict);
PyObject_Del(self);
}
static PyTypeObject StopWordRemover_Type = {
PyObject_HEAD_INIT(NULL) /* ob_type */
0, /* ob_size */
"stopper.StopWordRemover", /* tp_name */
sizeof(StopWordRemover), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)swr_dealloc, /* tp_dealloc */
0, /* tp_print */
(getattrfunc)swr_getattr, /* tp_getattr */
0, /* tp_setattr */
};
static PyObject *
swr_new(PyObject *notused, PyObject *args)
{
StopWordRemover *swr = NULL;
PyObject *dict = NULL;
if (PyArg_ParseTuple(args, "|O!:new", &PyDict_Type, &dict)) {
swr = PyObject_New(StopWordRemover, &StopWordRemover_Type);
if (swr != NULL) {
if (dict != NULL) {
Py_INCREF(dict);
swr->swr_dict = dict;
}
else {
swr->swr_dict = PyDict_New();
if (swr->swr_dict == NULL) {
Py_DECREF(swr);
swr = NULL;
}
}
}
}
return (PyObject *) swr;
}
static PyObject*
pickle_constructor = NULL;
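/*
 * Pickle support: copy_reg registers _pickler (below) for the
 * StopWordRemover type, so an instance is reduced to the pair
 * (stopper.new, (dict,)) and unpickling simply calls stopper.new(dict)
 * to rebuild an equivalent remover.
 */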
PyObject *
swr_pickler(PyObject *unused, PyObject *args)
{
StopWordRemover *swr;
PyObject *result = NULL;
if (PyArg_ParseTuple(args, "O!:_pickler", &StopWordRemover_Type, &swr)) {
result = Py_BuildValue("O(O)", pickle_constructor, swr->swr_dict);
}
return result;
}
static PyMethodDef stopper_functions[] = {
{"new", swr_new, METH_VARARGS,
"new() -> StopWordRemover instance\n"
"Create & return a new stop-word remover."},
{"_pickler", swr_pickler, METH_VARARGS,
"_pickler(StopWordRemover instance) -> pickle magic\n"
"Internal magic used to make stop-word removers picklable."},
{NULL}
};
void
initstopper(void)
{
PyObject *m, *copy_reg;
StopWordRemover_Type.ob_type = &PyType_Type;
m = Py_InitModule3("stopper", stopper_functions,
"Fast StopWordRemover implementation.");
if (m == NULL)
return;
if (PyObject_SetAttrString(m, "StopWordRemoverType",
(PyObject *) &StopWordRemover_Type) < 0)
return;
/* register to support pickling */
copy_reg = PyImport_ImportModule("copy_reg");
if (copy_reg != NULL) {
PyObject *pickler;
if (pickle_constructor == NULL) {
pickle_constructor = PyObject_GetAttrString(m, "new");
Py_XINCREF(pickle_constructor);
}
pickler = PyObject_GetAttrString(m, "_pickler");
if ((pickle_constructor != NULL) && (pickler != NULL)) {
PyObject *res;
res = PyObject_CallMethod(
copy_reg, "pickle", "OOO", &StopWordRemover_Type,
pickler, pickle_constructor);
Py_XDECREF(res);
}
Py_DECREF(copy_reg);
}
}
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""
Revision information:
$Id: __init__.py,v 1.2 2002/05/14 15:12:34 gvanrossum Exp $
"""
#! /usr/bin/env python
import cPickle
import os.path
import sys
from hotshot.log import LogReader
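# The hotshot log yields (what, (filename, lineno, funcname), tdelta) events.
# load_line_info() charges each positive time delta to the previously seen
# location, accumulating a (total time, hit count) pair per source line.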
def load_line_info(log):
byline = {}
prevloc = None
for what, place, tdelta in log:
if tdelta > 0:
t, nhits = byline.get(prevloc, (0, 0))
byline[prevloc] = (tdelta + t), (nhits + 1)
prevloc = place
return byline
def basename(path, cache={}):
try:
return cache[path]
except KeyError:
fn = os.path.split(path)[1]
cache[path] = fn
return fn
def print_results(results):
for info, place in results:
if not place:
print 'Bad unpack:', info, place
continue
filename, line, funcname = place
print '%8d %8d' % info, basename(filename), line
def annotate_results(results):
files = {}
for stats, place in results:
if not place:
continue
time, hits = stats
file, line, func = place
l = files.get(file)
if l is None:
l = files[file] = []
l.append((line, hits, time))
order = files.keys()
order.sort()
for k in order:
if os.path.exists(k):
v = files[k]
v.sort()
annotate(k, v)
def annotate(file, lines):
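# 'lines' is a list of (lineno, hits, time) tuples sorted by line number;
# entries are consumed as the matching source line is reached, so every line
# of the file is printed with its per-line figures (or blank padding) in front.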
print "-" * 60
print file
print "-" * 60
f = open(file)
i = 1
match = lines[0][0]
for line in f:
if match == i:
print "%6d %8d " % lines[0][1:], line,
del lines[0]
if lines:
match = lines[0][0]
else:
match = None
else:
print " " * 16, line,
i += 1
print
def get_cache_name(filename):
d, fn = os.path.split(filename)
cache_dir = os.path.join(d, '.hs-tool')
cache_file = os.path.join(cache_dir, fn)
return cache_dir, cache_file
def cache_results(filename, results):
cache_dir, cache_file = get_cache_name(filename)
if not os.path.exists(cache_dir):
os.mkdir(cache_dir)
fp = open(cache_file, 'wb')
try:
cPickle.dump(results, fp, 1)
finally:
fp.close()
def main(filename, annotate):
cache_dir, cache_file = get_cache_name(filename)
if ( os.path.isfile(cache_file)
and os.path.getmtime(cache_file) > os.path.getmtime(filename)):
# cached data is up-to-date:
fp = open(cache_file, 'rb')
results = cPickle.load(fp)
fp.close()
else:
log = LogReader(filename)
byline = load_line_info(log)
# Sort
results = [(v, k) for k, v in byline.items()]
results.sort()
cache_results(filename, results)
if annotate:
annotate_results(results)
else:
print_results(results)
if __name__ == "__main__":
import getopt
annotate_p = 0
opts, args = getopt.getopt(sys.argv[1:], 'A')
for o, v in opts:
if o == '-A':
annotate_p = 1
if args:
filename, = args
else:
filename = "profile.dat"
main(filename, annotate_p)
#! /usr/bin/env python
"""Index a collection of HTML files on the filesystem.
usage: indexhtml.py [options] dir
Will create an index of all files in dir or its subdirectories.
options:
-f data.fs -- the path to the filestorage datafile
"""
import os
import ZODB
from ZODB.FileStorage import FileStorage
from BTrees.IOBTree import IOBTree
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
from Products.ZCTextIndex.Lexicon import Lexicon, StopWordRemover
def make_index():
# there's an elaborate dance necessary to construct an index
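# The Struct instances below stand in for what Zope normally supplies:
# 'extra' mimics the record from the add-index form (doc_attr, lexicon_id),
# and 'caller' plays the acquisition context from which ZCTextIndex looks
# up the lexicon by the attribute named in lexicon_id.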
class Struct:
pass
extra = Struct()
extra.doc_attr = "read"
extra.lexicon_id = "lexicon"
caller = Struct()
caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
return ZCTextIndex("read", extra, caller)
def main(db, rt, dir):
rt["index"] = index = make_index()
rt["files"] = paths = IOBTree()
get_transaction().commit()
files = [os.path.join(dir, file) for file in os.listdir(dir)]
docid = 0
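# Walk the directory tree by appending subdirectory entries to the list
# being iterated over; only files ending in ".html" are indexed.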
for file in files:
if os.path.isdir(file):
files += [os.path.join(file, sub) for sub in os.listdir(file)]
else:
if not file.endswith(".html"):
continue
docid += 1
print "%5d" % docid, file
f = open(file, "rb")
paths[docid] = file
index.index_object(docid, f)
f.close()
if docid % TXN_INTERVAL == 0:
get_transaction().commit()
if docid % PACK_INTERVAL == 0:
db.pack()
get_transaction().commit()
if __name__ == "__main__":
import sys
import getopt
VERBOSE = 0
FSPATH = "Data.fs"
TXN_INTERVAL = 100
PACK_INTERVAL = 500
try:
opts, args = getopt.getopt(sys.argv[1:], 'vf:')
except getopt.error, msg:
print msg
print __doc__
sys.exit(2)
for o, v in opts:
if o == '-v':
VERBOSE += 1
if o == '-f':
FSPATH = v
if len(args) != 1:
print "Expected on argument"
print __doc__
sys.exit(2)
dir = args[0]
fs = FileStorage(FSPATH)
db = ZODB.DB(fs)
cn = db.open()
rt = cn.root()
dir = os.path.join(os.getcwd(), dir)
print dir
main(db, rt, dir)
cn.close()
fs.close()
"""Test an index with a Unix mailbox file.
usage: python mailtest.py [options] <data.fs>
options:
-v -- verbose
-n NNN -- max number of messages to read from mailbox
-q query
-i mailbox
-p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
-p 0 -- don't pack at all
-b NNN -- return the NNN best matches (default: 10)
-x -- exclude the message text from the data.fs
-t NNN -- commit a transaction every NNN messages (default: 1)
The script either indexes or queries depending on whether -q or -i is
passed as an option.
For -i mailbox, the script reads mail messages from the mailbox and
indexes them. It indexes one message at a time, then commits the
transaction.
For -q query, it performs a query on an existing index.
If both are specified, the indexing is performed first.
You can also interact with the index after it is completed. Load the
index from the database:
import ZODB
from ZODB.FileStorage import FileStorage
fs = FileStorage(<data.fs>)
db = ZODB.DB(fs)
index = db.open().root()["index"]
index.query("python AND unicode")
"""
import ZODB
import ZODB.FileStorage
from Products.ZCTextIndex.Lexicon import Lexicon, \
CaseNormalizer, Splitter, StopWordRemover
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from BTrees.IOBTree import IOBTree
import sys
import mailbox
import time
def usage(msg):
print msg
print __doc__
sys.exit(2)
class Message:
total_bytes = 0
def __init__(self, msg):
subject = msg.getheader('subject', '')
author = msg.getheader('from', '')
if author:
summary = "%s (%s)\n" % (subject, author)
else:
summary = "%s\n" % subject
self.text = summary + msg.fp.read()
Message.total_bytes += len(self.text)
class Extra:
pass
def index(rt, mboxfile, db):
global NUM
idx_time = 0
pack_time = 0
lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
extra = Extra()
extra.lexicon_id = 'lexicon'
extra.doc_attr = 'text'
caller = Extra()
caller.lexicon = lexicon
rt["index"] = idx = ZCTextIndex("index", extra, caller)
if not EXCLUDE_TEXT:
rt["documents"] = docs = IOBTree()
get_transaction().commit()
mbox = mailbox.UnixMailbox(open(mboxfile))
if VERBOSE:
print "opened", mboxfile
if not NUM:
NUM = sys.maxint
i = 0
while i < NUM:
_msg = mbox.next()
if _msg is None:
break
i += 1
msg = Message(_msg)
if VERBOSE >= 2:
print "indexing msg", i
i0 = time.clock()
idx.index_object(i, msg)
if not EXCLUDE_TEXT:
docs[i] = msg
if i % TXN_SIZE == 0:
get_transaction().commit()
i1 = time.clock()
idx_time += i1 - i0
if VERBOSE and i % 50 == 0:
print i, "messages indexed"
print "cache size", db.cacheSize()
if PACK_INTERVAL and i % PACK_INTERVAL == 0:
if VERBOSE >= 2:
print "packing..."
p0 = time.clock()
db.pack(time.time())
p1 = time.clock()
if VERBOSE:
print "pack took %s sec" % (p1 - p0)
pack_time += p1 - p0
get_transaction().commit()
if PACK_INTERVAL and i % PACK_INTERVAL != 0:
if VERBOSE >= 2:
print "packing one last time..."
p0 = time.clock()
db.pack(time.time())
p1 = time.clock()
if VERBOSE:
print "pack took %s sec" % (p1 - p0)
pack_time += p1 - p0
if VERBOSE:
print "Index time", idx_time
print "Index bytes", Message.total_bytes
rate = (Message.total_bytes / idx_time) / 1024
print "Index rate %d KB/sec" % int(rate)
def query(rt, query_str):
idx = rt["index"]
docs = rt["documents"]
results = idx.query(query_str, BEST)
print "query:", query_str
print "# results:", len(results)
for docid, score in results:
print "docid %4d score %2d" % (docid, score)
if VERBOSE:
msg = docs[docid]
# print the first CONTEXT lines of the message as context
CONTEXT = 5
ctx = msg.text.split("\n", CONTEXT)
del ctx[-1]
print "-" * 60
print "message:"
for l in ctx:
print l
print "-" * 60
def main(fs_path, mbox_path, query_str):
f = ZODB.FileStorage.FileStorage(fs_path)
db = ZODB.DB(f, cache_size=CACHE_SIZE)
cn = db.open()
rt = cn.root()
if mbox_path is not None:
index(rt, mbox_path, db)
if query_str is not None:
query(rt, query_str)
cn.close()
db.close()
f.close()
if __name__ == "__main__":
import getopt
NUM = 0
BEST = 10
VERBOSE = 0
PACK_INTERVAL = 500
EXCLUDE_TEXT = 0
CACHE_SIZE = 10000
TXN_SIZE = 1
query_str = None
mbox_path = None
profile = None
old_profile = None
try:
opts, args = getopt.getopt(sys.argv[1:], 'vn:p:i:q:b:xt:',
['profile=', 'old-profile='])
except getopt.error, msg:
usage(msg)
if len(args) != 1:
usage("exactly 1 filename argument required")
for o, v in opts:
if o == '-n':
NUM = int(v)
elif o == '-v':
VERBOSE += 1
elif o == '-p':
PACK_INTERVAL = int(v)
elif o == '-q':
query_str = v
elif o == '-i':
mbox_path = v
elif o == '-b':
BEST = int(v)
elif o == '-x':
EXCLUDE_TEXT = 1
elif o == '-t':
TXN_SIZE = int(v)
elif o == '--profile':
profile = v
elif o == '--old-profile':
old_profile = v
fs_path, = args
if profile:
import hotshot
profiler = hotshot.Profile(profile, lineevents=1, linetimings=1)
profiler.runcall(main, fs_path, mbox_path, query_str)
profiler.close()
elif old_profile:
import profile, pstats
profiler = profile.Profile()
profiler.runcall(main, fs_path, mbox_path, query_str)
profiler.dump_stats(old_profile)
stats = pstats.Stats(old_profile)
stats.strip_dirs().sort_stats('time').print_stats(20)
else:
main(fs_path, mbox_path, query_str)
#! /usr/bin/env python2.1
"""MH mail indexer."""
import re
import sys
import time
import mhlib
import getopt
import traceback
from StringIO import StringIO
DATAFS = "/home/guido/.Data.fs"
ZOPECODE = "/home/guido/projects/ds9/lib/python"
sys.path.append(ZOPECODE)
from ZODB import DB
from ZODB.FileStorage import FileStorage
from Persistence import Persistent
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.OkapiIndex import Index
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.StopDict import get_stopdict
NBEST = 3
MAXLINES = 3
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], "bd:m:n:Opu")
except getopt.error, msg:
print msg
sys.exit(2)
update = 0
bulk = 0
optimize = 0
nbest = NBEST
maxlines = MAXLINES
datafs = DATAFS
pack = 0
for o, a in opts:
if o == "-b":
bulk = 1
if o == "-d":
datafs = a
if o == "-m":
maxlines = int(a)
if o == "-n":
nbest = int(a)
if o == "-O":
optimize = 1
if o == "-p":
pack = 1
if o == "-u":
update = 1
ix = Indexer(datafs, update or bulk)
if bulk:
if optimize:
ix.optimize(args)
ix.bulkupdate(args)
elif update:
ix.update(args)
if pack:
ix.pack()
elif args:
for i in range(len(args)):
a = args[i]
if " " in a:
if a[0] == "-":
args[i] = '-"' + a[1:] + '"'
else:
args[i] = '"' + a + '"'
ix.query(" ".join(args), nbest, maxlines)
else:
ix.interact(nbest)
class Indexer:
filestorage = database = connection = root = None
def __init__(self, datafs, writable=0):
self.stopdict = get_stopdict()
self.mh = mhlib.MH()
self.filestorage = FileStorage(datafs, read_only=(not writable))
self.database = DB(self.filestorage)
self.connection = self.database.open()
self.root = self.connection.root()
try:
self.index = self.root["index"]
except KeyError:
self.index = self.root["index"] = TextIndex()
try:
self.docpaths = self.root["docpaths"]
except KeyError:
self.docpaths = self.root["docpaths"] = IOBTree()
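# path2docid is an in-memory reverse map (path -> docid); it is rebuilt
# from the persistent docpaths mapping on every run rather than stored
# in the database.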
self.path2docid = OIBTree()
for docid in self.docpaths.keys():
path = self.docpaths[docid]
self.path2docid[path] = docid
try:
self.maxdocid = max(self.docpaths.keys())
except ValueError:
self.maxdocid = 0
print len(self.docpaths), "Document ids"
print len(self.path2docid), "Pathnames"
def close(self):
self.root = None
if self.connection is not None:
self.connection.close()
self.connection = None
if self.database is not None:
self.database.close()
self.database = None
if self.filestorage is not None:
self.filestorage.close()
self.filestorage = None
def interact(self, nbest=NBEST, maxlines=MAXLINES):
try:
import readline
except ImportError:
pass
text = ""
top = 0
while 1:
try:
line = raw_input("Query: ")
except EOFError:
print "\nBye."
break
line = line.strip()
if line:
text = line
top = 0
else:
if not text:
continue
try:
n, results = self.timequery(text, top + nbest)
except:
reportexc()
text = ""
top = 0
continue
if len(results) <= top:
if not n:
print "No hits for %r." % text
else:
print "No more hits for %r." % text
text = ""
top = 0
continue
print "[Results %d-%d from %d" % (top+1, min(n, top+nbest), n),
print "for query %s]" % repr(text)
self.formatresults(text, results, maxlines, top, top+nbest)
top += nbest
def query(self, text, nbest=NBEST, maxlines=MAXLINES):
n, results = self.timequery(text, nbest)
if not n:
print "No hits for %r." % text
return
print "[Results 1-%d from %d]" % (len(results), n)
self.formatresults(text, results, maxlines)
def timequery(self, text, nbest):
t0 = time.time()
c0 = time.clock()
n, results = self.index.query(text, nbest)
t1 = time.time()
c1 = time.clock()
print "[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0)
return n, results
def formatresults(self, text, results, maxlines=MAXLINES,
lo=0, hi=sys.maxint):
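# Build a case-insensitive regex from the (non-stop) query words, with a
# trailing * expanded to .*, and use it to pick the lines shown as context
# for each hit.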
stop = self.stopdict.has_key
words = [w for w in re.findall(r"\w+\*?", text.lower()) if not stop(w)]
pattern = r"\b(" + "|".join(words) + r")\b"
pattern = pattern.replace("*", ".*") # glob -> re syntax
prog = re.compile(pattern, re.IGNORECASE)
print '='*70
rank = lo
qw = max(1, self.index.query_weight(text))
factor = 100.0 / qw / 1024
for docid, score in results[lo:hi]:
rank += 1
path = self.docpaths[docid]
score = min(100, int(score * factor))
print "Rank: %d Score: %d%% File: %s" % (rank, score, path)
fp = open(path)
msg = mhlib.Message("<folder>", 0, fp)
for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
h = msg.getheader(header)
if h:
print "%-8s %s" % (header+":", h)
text = self.getmessagetext(msg)
if text:
print
nleft = maxlines
for part in text:
for line in part.splitlines():
if prog.search(line):
print line
nleft -= 1
if nleft <= 0:
break
if nleft <= 0:
break
print '-'*70
def update(self, args):
folder = None
seqs = []
for arg in args:
if arg.startswith("+"):
if folder is None:
folder = arg[1:]
else:
print "only one folder at a time"
return
else:
seqs.append(arg)
if not folder:
folder = self.mh.getcontext()
if not seqs:
seqs = ['all']
try:
f = self.mh.openfolder(folder)
except mhlib.Error, msg:
print msg
return
dict = {}
for seq in seqs:
try:
nums = f.parsesequence(seq)
except mhlib.Error, msg:
print msg or "unparsable message sequence: %s" % `seq`
return
for n in nums:
dict[n] = n
msgs = dict.keys()
msgs.sort()
self.updatefolder(f, msgs)
def optimize(self, args):
uniqwords = {}
for folder in args:
if folder.startswith("+"):
folder = folder[1:]
print "\nOPTIMIZE FOLDER", folder
try:
f = self.mh.openfolder(folder)
except mhlib.Error, msg:
print msg
continue
self.prescan(f, f.listmessages(), uniqwords)
L = [(uniqwords[word], word) for word in uniqwords.keys()]
L.sort()
L.reverse()
for i in range(100):
print "%3d. %6d %s" % ((i+1,) + L[i])
self.index.lexicon.sourceToWordIds([word for (count, word) in L])
def prescan(self, f, msgs, uniqwords):
pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
for n in msgs:
print "prescanning", n
m = f.openmessage(n)
text = self.getmessagetext(m)
for p in pipeline:
text = p.process(text)
for word in text:
uniqwords[word] = uniqwords.get(word, 0) + 1
def bulkupdate(self, args):
chunk = 5000
target = len(self.docpaths) + chunk
for folder in args:
if len(self.docpaths) >= target:
self.pack()
target = len(self.docpaths) + chunk
if folder.startswith("+"):
folder = folder[1:]
print "\nFOLDER", folder
try:
f = self.mh.openfolder(folder)
except mhlib.Error, msg:
print msg
continue
self.updatefolder(f, f.listmessages())
print "Total", len(self.docpaths)
self.pack()
def updatefolder(self, f, msgs):
done = 0
new = 0
for n in msgs:
print "indexing", n
m = f.openmessage(n)
text = self.getmessagetext(m)
path = f.getmessagefilename(n)
self.unindexpath(path)
if not text:
continue
docid = self.newdocid(path)
self.index.index_text(docid, text)
done += 1
new = 1
if done%500 == 0:
self.commit()
new = 0
if new:
self.commit()
print "done."
def unindexpath(self, path):
if self.path2docid.has_key(path):
docid = self.path2docid[path]
print "unindexing", docid, path
del self.docpaths[docid]
del self.path2docid[path]
try:
self.index.unindex(docid)
except KeyError, msg:
print "KeyError", msg
def getmessagetext(self, m):
L = []
try:
self.getmsgparts(m, L, 0)
except:
print "(getmsgparts failed:)"
reportexc()
return L
def getmsgparts(self, m, L, level):
ctype = m.gettype()
if level or ctype != "text/plain":
print ". "*level + str(ctype)
if ctype == "text/plain":
L.append(m.getbodytext())
elif ctype in ("multipart/alternative", "multipart/mixed"):
for part in m.getbodyparts():
self.getmsgparts(part, L, level+1)
elif ctype == "message/rfc822":
f = StringIO(m.getbodytext())
m = mhlib.Message("<folder>", 0, f)
self.getmsgparts(m, L, level+1)
def newdocid(self, path):
docid = self.maxdocid + 1
self.maxdocid = docid
self.docpaths[docid] = path
self.path2docid[path] = docid
return docid
def commit(self):
print "committing..."
get_transaction().commit()
def pack(self):
print "packing..."
self.database.pack()
class TextIndex(Persistent):
def __init__(self):
self.lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
self.index = Index(self.lexicon)
def index_text(self, docid, text):
self.index.index_doc(docid, text)
self._p_changed = 1 # XXX
def unindex(self, docid):
self.index.unindex_doc(docid)
self._p_changed = 1 # XXX
def query(self, query, nbest=10):
# returns a total hit count and a list of (docid, score) pairs, best first
parser = QueryParser()
tree = parser.parseQuery(query)
results = tree.executeQuery(self.index)
chooser = NBest(nbest)
chooser.addmany(results.items())
return len(results), chooser.getbest()
def query_weight(self, query):
parser = QueryParser()
tree = parser.parseQuery(query)
terms = tree.terms()
return self.index.query_weight(terms)
def reportexc():
traceback.print_exc()
if __name__ == "__main__":
main()
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.Index import Index
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
class IndexTest(TestCase):
def setUp(self):
self.lexicon = Lexicon(Splitter())
self.index = Index(self.lexicon)
def test_index_document(self, DOCID=1):
doc = "simple document contains five words"
self.index.index_doc(DOCID, doc)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index._get_undoinfo(DOCID)), 5)
for map in self.index._wordinfo.values():
self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID))
def test_unindex_document(self):
DOCID = 1
self.test_index_document(DOCID)
self.index.unindex_doc(DOCID)
self.assertEqual(len(self.index._docweight), 0)
self.assertEqual(len(self.index._wordinfo), 0)
self.assertEqual(len(self.index._docwords), 0)
def test_index_two_documents(self):
self.test_index_document()
doc = "another document just four"
DOCID = 2
self.index.index_doc(DOCID, doc)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 8)
self.assertEqual(len(self.index._docwords), 2)
self.assertEqual(len(self.index._get_undoinfo(DOCID)), 4)
wids = self.lexicon.termToWordIds("document")
self.assertEqual(len(wids), 1)
document_wid = wids[0]
for wid, map in self.index._wordinfo.items():
if wid == document_wid:
self.assertEqual(len(map), 2)
self.assert_(map.has_key(1))
self.assert_(map.has_key(DOCID))
else:
self.assertEqual(len(map), 1)
def test_index_two_unindex_one(self):
# index two documents, unindex one, and test the results
self.test_index_two_documents()
self.index.unindex_doc(1)
DOCID = 2
self.assertEqual(len(self.index._docweight), 1)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 4)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index._get_undoinfo(DOCID)), 4)
for map in self.index._wordinfo.values():
self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID))
def test_index_duplicated_words(self, DOCID=1):
doc = "very simple repeat repeat repeat document test"
self.index.index_doc(DOCID, doc)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
## self.assertEqual(len(self.index._get_undoinfo(DOCID)), 5)
wids = self.lexicon.termToWordIds("repeat")
self.assertEqual(len(wids), 1)
repetitive_wid = wids[0]
for wid, map in self.index._wordinfo.items():
self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID))
def test_simple_query_oneresult(self):
self.index.index_doc(1, 'not the same document')
results = self.index.search("document")
self.assertEqual(list(results.keys()), [1])
def test_simple_query_noresults(self):
self.index.index_doc(1, 'not the same document')
results = self.index.search("frobnicate")
self.assertEqual(list(results.keys()), [])
def test_query_oneresult(self):
self.index.index_doc(1, 'not the same document')
self.index.index_doc(2, 'something about something else')
results = self.index.search("document")
self.assertEqual(list(results.keys()), [1])
def test_search_phrase(self):
self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
results = self.index.search_phrase("quick brown fox")
self.assertEqual(list(results.keys()), [1])
def test_search_glob(self):
self.index.index_doc(1, "how now brown cow")
self.index.index_doc(2, "hough nough browne cough")
self.index.index_doc(3, "bar brawl")
results = self.index.search_glob("bro*")
self.assertEqual(list(results.keys()), [1, 2])
results = self.index.search_glob("b*")
self.assertEqual(list(results.keys()), [1, 2, 3])
def test_suite():
return makeSuite(IndexTest)
if __name__=='__main__':
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.Lexicon import Lexicon
from Products.ZCTextIndex.Lexicon import Splitter, CaseNormalizer
class StupidPipelineElement:
def __init__(self, fromword, toword):
self.__fromword = fromword
self.__toword = toword
def process(self, seq):
res = []
for term in seq:
if term == self.__fromword:
res.append(self.__toword)
else:
res.append(term)
return res
class WackyReversePipelineElement:
def __init__(self, revword):
self.__revword = revword
def process(self, seq):
res = []
for term in seq:
if term == self.__revword:
x = list(term)
x.reverse()
res.append(''.join(x))
else:
res.append(term)
return res
class StopWordPipelineElement:
def __init__(self, stopdict={}):
self.__stopdict = stopdict
def process(self, seq):
res = []
for term in seq:
if self.__stopdict.get(term):
continue
else:
res.append(term)
return res
class Test(TestCase):
def testSourceToWordIds(self):
lexicon = Lexicon(Splitter())
wids = lexicon.sourceToWordIds('cats and dogs')
self.assertEqual(wids, [1, 2, 3])
def testTermToWordIds(self):
lexicon = Lexicon(Splitter())
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('dogs')
self.assertEqual(wids, [3])
def testMissingTermToWordIds(self):
lexicon = Lexicon(Splitter())
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('boxes')
self.assertEqual(wids, [])
def testOnePipelineElement(self):
lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('fish')
self.assertEqual(wids, [3])
def testSplitterAdaptorFold(self):
lexicon = Lexicon(Splitter(), CaseNormalizer())
wids = lexicon.sourceToWordIds('CATS and dogs')
wids = lexicon.termToWordIds('cats and dogs')
self.assertEqual(wids, [1, 2, 3])
def testSplitterAdaptorNofold(self):
lexicon = Lexicon(Splitter())
wids = lexicon.sourceToWordIds('CATS and dogs')
wids = lexicon.termToWordIds('cats and dogs')
self.assertEqual(wids, [2, 3])
def testTwoElementPipeline(self):
lexicon = Lexicon(Splitter(),
StupidPipelineElement('cats', 'fish'),
WackyReversePipelineElement('fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('hsif')
self.assertEqual(wids, [1])
def testThreeElementPipeline(self):
lexicon = Lexicon(Splitter(),
StopWordPipelineElement({'and':1}),
StupidPipelineElement('dogs', 'fish'),
WackyReversePipelineElement('fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('hsif')
self.assertEqual(wids, [2])
def test_suite():
return makeSuite(Test)
if __name__=='__main__':
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.NBest import NBest
class NBestTest(TestCase):
def testConstructor(self):
self.assertRaises(ValueError, NBest, 0)
self.assertRaises(ValueError, NBest, -1)
for n in range(1, 11):
nb = NBest(n)
self.assertEqual(len(nb), 0)
self.assertEqual(nb.capacity(), n)
def testOne(self):
nb = NBest(1)
nb.add('a', 0)
self.assertEqual(nb.getbest(), [('a', 0)])
nb.add('b', 1)
self.assertEqual(len(nb), 1)
self.assertEqual(nb.capacity(), 1)
self.assertEqual(nb.getbest(), [('b', 1)])
nb.add('c', -1)
self.assertEqual(len(nb), 1)
self.assertEqual(nb.capacity(), 1)
self.assertEqual(nb.getbest(), [('b', 1)])
nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
self.assertEqual(len(nb), 1)
self.assertEqual(nb.capacity(), 1)
self.assertEqual(nb.getbest(), [('f', 5)])
def testMany(self):
import random
inputs = [(-i, i) for i in range(50)]
reversed_inputs = inputs[:]
reversed_inputs.reverse()
# Test the N-best for a variety of n (1, 6, 11, ... 50).
for n in range(1, len(inputs)+1, 5):
expected = inputs[-n:]
expected.reverse()
random_inputs = inputs[:]
random.shuffle(random_inputs)
for source in inputs, reversed_inputs, random_inputs:
# Try feeding them one at a time.
nb = NBest(n)
for item, score in source:
nb.add(item, score)
self.assertEqual(len(nb), n)
self.assertEqual(nb.capacity(), n)
self.assertEqual(nb.getbest(), expected)
# And again in one gulp.
nb = NBest(n)
nb.addmany(source)
self.assertEqual(len(nb), n)
self.assertEqual(nb.capacity(), n)
self.assertEqual(nb.getbest(), expected)
for i in range(1, n+1):
self.assertEqual(nb.pop_smallest(), expected[-i])
self.assertRaises(IndexError, nb.pop_smallest)
def test_suite():
return makeSuite(NBestTest)
if __name__=='__main__':
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from BTrees.IIBTree import IIBucket
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.ParseTree import ParseError, QueryError
class FauxIndex:
def search(self, term):
b = IIBucket()
if term == "foo":
b[1] = b[3] = 1
elif term == "bar":
b[1] = b[2] = 1
elif term == "ham":
b[1] = b[2] = b[3] = b[4] = 1
return b
class TestQueryEngine(TestCase):
def setUp(self):
self.parser = QueryParser()
self.index = FauxIndex()
def compareSet(self, set, dict):
d = {}
for k, v in set.items():
d[k] = v
self.assertEqual(d, dict)
def compareQuery(self, query, dict):
tree = self.parser.parseQuery(query)
set = tree.executeQuery(self.index)
self.compareSet(set, dict)
def testExecuteQuery(self):
self.compareQuery("foo AND bar", {1: 2})
self.compareQuery("foo OR bar", {1: 2, 2: 1, 3:1})
self.compareQuery("foo AND NOT bar", {3: 1})
self.compareQuery("foo AND foo AND foo", {1: 3, 3: 3})
self.compareQuery("foo OR foo OR foo", {1: 3, 3: 3})
self.compareQuery("ham AND NOT foo AND NOT bar", {4: 1})
self.compareQuery("ham OR foo OR bar", {1: 3, 2: 2, 3: 2, 4: 1})
self.compareQuery("ham AND foo AND bar", {1: 3})
def testInvalidQuery(self):
from Products.ZCTextIndex.ParseTree import NotNode, AtomNode
tree = NotNode(AtomNode("foo"))
self.assertRaises(QueryError, tree.executeQuery, self.index)
def test_suite():
return makeSuite(TestQueryEngine)
if __name__=='__main__':
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.ParseTree import ParseError, ParseTreeNode
from Products.ZCTextIndex.ParseTree import OrNode, AndNode, NotNode
from Products.ZCTextIndex.ParseTree import AtomNode, PhraseNode, GlobNode
class TestQueryParser(TestCase):
def compareParseTrees(self, got, expected):
self.assertEqual(isinstance(got, ParseTreeNode), 1)
self.assertEqual(got.__class__, expected.__class__)
if isinstance(got, PhraseNode):
self.assertEqual(got.nodeType(), "PHRASE")
self.assertEqual(got.getValue(), expected.getValue())
elif isinstance(got, GlobNode):
self.assertEqual(got.nodeType(), "GLOB")
self.assertEqual(got.getValue(), expected.getValue())
elif isinstance(got, AtomNode):
self.assertEqual(got.nodeType(), "ATOM")
self.assertEqual(got.getValue(), expected.getValue())
elif isinstance(got, NotNode):
self.assertEqual(got.nodeType(), "NOT")
self.compareParseTrees(got.getValue(), expected.getValue())
elif isinstance(got, AndNode) or isinstance(got, OrNode):
self.assertEqual(got.nodeType(),
isinstance(got, AndNode) and "AND" or "OR")
list1 = got.getValue()
list2 = expected.getValue()
self.assertEqual(len(list1), len(list2))
for i in range(len(list1)):
self.compareParseTrees(list1[i], list2[i])
def expect(self, input, output):
tree = self.p.parseQuery(input)
self.compareParseTrees(tree, output)
def failure(self, input):
self.assertRaises(ParseError, self.p.parseQuery, input)
def setUp(self):
self.p = QueryParser()
def testParseQuery(self):
self.expect("foo", AtomNode("foo"))
self.expect("note", AtomNode("note"))
self.expect("a and b AND c",
AndNode([AtomNode("a"), AtomNode("b"), AtomNode("c")]))
self.expect("a OR b or c",
OrNode([AtomNode("a"), AtomNode("b"), AtomNode("c")]))
self.expect("a AND b OR c AnD d",
OrNode([AndNode([AtomNode("a"), AtomNode("b")]),
AndNode([AtomNode("c"), AtomNode("d")])]))
self.expect("(a OR b) AND (c OR d)",
AndNode([OrNode([AtomNode("a"), AtomNode("b")]),
OrNode([AtomNode("c"), AtomNode("d")])]))
self.expect("a AND not b",
AndNode([AtomNode("a"), NotNode(AtomNode("b"))]))
self.expect('"foo bar"', PhraseNode("foo bar"))
self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
self.expect('(("foo bar"))"', PhraseNode("foo bar"))
self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))
self.expect('and/', AtomNode("and"))
self.expect("foo-bar", PhraseNode("foo bar"))
self.expect("foo -bar", AndNode([AtomNode("foo"),
NotNode(AtomNode("bar"))]))
self.expect("-foo bar", AndNode([AtomNode("bar"),
NotNode(AtomNode("foo"))]))
self.expect("booh -foo-bar",
AndNode([AtomNode("booh"),
NotNode(PhraseNode("foo bar"))]))
self.expect('booh -"foo bar"',
AndNode([AtomNode("booh"),
NotNode(PhraseNode("foo bar"))]))
self.expect('foo"bar"',
AndNode([AtomNode("foo"), AtomNode("bar")]))
self.expect('"foo"bar',
AndNode([AtomNode("foo"), AtomNode("bar")]))
self.expect('foo"bar"blech',
AndNode([AtomNode("foo"), AtomNode("bar"),
AtomNode("blech")]))
self.expect("foo*", GlobNode("foo*"))
self.expect("foo* bar", AndNode([GlobNode("foo*"),
AtomNode("bar")]))
def testParseFailures(self):
self.failure("")
self.failure("not")
self.failure("OR")
self.failure("AND")
self.failure("not foo")
self.failure(")")
self.failure("(")
self.failure("foo OR")
self.failure("foo AND")
self.failure("OR foo")
self.failure("and foo")
self.failure("(foo) bar")
self.failure("(foo OR)")
self.failure("(foo AND)")
self.failure("(NOT foo)")
self.failure("-foo")
self.failure("-foo -bar")
self.failure('""')
def test_suite():
return makeSuite(TestQueryParser)
if __name__=="__main__":
main(defaultTest='test_suite')
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from Products.ZCTextIndex.tests \
import testIndex, testQueryEngine, testQueryParser
from Products.ZCTextIndex.Index import scaled_int, SCALE_FACTOR
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
import unittest
class Indexable:
def __init__(self, text):
self.text = text
class LexiconHolder:
def __init__(self, lexicon):
self.lexicon = lexicon
class Extra:
pass
# The test classes below create a ZCTextIndex(). Then they create
# instance variables that point to the internal components used by
# ZCTextIndex. These tests run the individual module unit tests with
# the fully integrated ZCTextIndex.
def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
if abs(scaled1 - scaled2) > epsilon:
raise AssertionError, "%s != %s" % (scaled1, scaled2)
class IndexTests(testIndex.IndexTest):
def setUp(self):
extra = Extra()
extra.doc_attr = 'text'
extra.lexicon_id = 'lexicon'
caller = LexiconHolder(Lexicon(Splitter(), CaseNormalizer(),
StopWordRemover()))
self.zc_index = ZCTextIndex('name', extra, caller)
self.index = self.zc_index.index
self.lexicon = self.zc_index.lexicon
def testStopWords(self):
# the only non-stopword is question
text = ("to be or not to be "
"that is the question")
doc = Indexable(text)
self.zc_index.index_object(1, doc)
for word in text.split():
if word != "question":
wids = self.lexicon.termToWordIds(word)
self.assertEqual(wids, [])
self.assertEqual(len(self.index._get_undoinfo(1)), 1)
def testRanking(self):
# A fairly involved test of the ranking calculations, based on
# the example set of documents and queries in Managing
# Gigabytes, pp. 180-188.
self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
"pease", "porridge", "pot"]
self._ranking_index()
self._ranking_tf()
self._ranking_idf()
self._ranking_queries()
def _ranking_index(self):
docs = ["Pease porridge hot, pease porridge cold,",
"Pease porridge in the pot,",
"Nine days old.",
"In the pot cold, in the pot hot,",
"Pease porridge, pease porridge,",
"Eat the lot."]
for i in range(len(docs)):
self.zc_index.index_object(i + 1, Indexable(docs[i]))
def _ranking_tf(self):
# Matrix of per-document term weights: each row corresponds to a
# docid (1-6) and each column to the matching entry in self.words.
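# The 1.0 entries are terms that appear once and the 1.7 entries terms
# that appear twice, which matches the Managing Gigabytes weighting
# w(d,t) = 1 + ln(f(d,t)); l_Wd holds the corresponding document weights
# (the Euclidean length of each row).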
l_wdt = [(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
(0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
l_Wd = [2.78, 1.73, 1.73, 2.21, 2.39, 1.41]
for i in range(len(l_Wd)):
docid = i + 1
scaled_Wd = scaled_int(l_Wd[i])
eq(scaled_Wd, self.index._get_Wd(docid))
wdts = [scaled_int(t) for t in l_wdt[i]]
for j in range(len(wdts)):
wdt = self.index._get_wdt(docid, self.words[j])
eq(wdts[j], wdt)
def _ranking_idf(self):
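# word_freqs holds f(t), the number of documents containing each word;
# the idfs appear to follow the Managing Gigabytes weight
# w(t) = ln(1 + N/f(t)) with N = 6 documents (e.g. ln(1 + 6/2) ~= 1.39).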
word_freqs = [2, 1, 1, 2, 1, 1, 1, 3, 3, 2]
idfs = [1.39, 1.95, 1.95, 1.39, 1.95, 1.95, 1.95, 1.10, 1.10, 1.39]
for i in range(len(self.words)):
word = self.words[i]
eq(word_freqs[i], self.index._get_ft(word))
eq(scaled_int(idfs[i]), self.index._get_wt(word))
def _ranking_queries(self):
queries = ["eat", "porridge", "hot OR porridge",
"eat OR nine OR day OR old OR porridge"]
wqs = [1.95, 1.10, 1.77, 3.55]
results = [[(6, 0.71)],
[(1, 0.61), (2, 0.58), (5, 0.71)],
[(1, 0.66), (2, 0.36), (4, 0.36), (5, 0.44)],
[(1, 0.19), (2, 0.18), (3, 0.63), (5, 0.22), (6, 0.39)]]
for i in range(len(queries)):
raw = queries[i]
q = self.zc_index.parser.parseQuery(raw)
wq = self.index.query_weight(q.terms())
eq(wq, scaled_int(wqs[i]))
r = self.zc_index.query(raw)
self.assertEqual(len(r), len(results[i]))
# convert the expected results to a dict for easier checking
d = {}
for doc, score in results[i]:
d[doc] = scaled_int(score)
for doc, score in r:
score = scaled_int(float(score / SCALE_FACTOR) / wq)
self.assert_(0 <= score <= SCALE_FACTOR)
eq(d[doc], score)
class QueryTests(testQueryEngine.TestQueryEngine,
testQueryParser.TestQueryParser):
# The FauxIndex in testQueryEngine contains four documents.
# docid 1: foo, bar, ham
# docid 2: bar, ham
# docid 3: foo, ham
# docid 4: ham
docs = ["foo bar ham", "bar ham", "foo ham", "ham"]
def setUp(self):
extra = Extra()
extra.doc_attr = 'text'
extra.lexicon_id = 'lexicon'
caller = LexiconHolder(Lexicon(Splitter(), CaseNormalizer(),
StopWordRemover()))
self.zc_index = ZCTextIndex('name', extra, caller)
self.p = self.parser = self.zc_index.parser
self.index = self.zc_index.index
self.add_docs()
def add_docs(self):
for i in range(len(self.docs)):
text = self.docs[i]
obj = Indexable(text)
self.zc_index.index_object(i + 1, obj)
def compareSet(self, set, dict):
# XXX The FauxIndex and the real Index score documents very
# differently. The set comparison can't actually compare the
# items, but it can compare the keys. That will have to do for now.
d = {}
for k, v in set.items():
d[k] = v
self.assertEqual(d.keys(), dict.keys())
def test_suite():
s = unittest.TestSuite()
for klass in IndexTests, QueryTests:
s.addTest(unittest.makeSuite(klass))
return s
if __name__=='__main__':
unittest.main(defaultTest='test_suite')
#! /usr/bin/env python
"""Dump statistics about each word in the index.
usage: wordstats.py data.fs [index key]
"""
import ZODB
from ZODB.FileStorage import FileStorage
def main(fspath, key):
fs = FileStorage(fspath, read_only=1)
db = ZODB.DB(fs)
rt = db.open().root()
index = rt[key]
lex = index.lexicon
idx = index.index
print "Words", lex.length()
print "Documents", idx.length()
print "Word frequencies: count, word, wid"
for word, wid in lex.items():
docs = idx._wordinfo[wid]
print len(docs), word, wid
print "Per-doc scores: wid, (doc, score,)+"
for wid in lex.wids():
print wid,
docs = idx._wordinfo[wid]
for docid, score in docs.items():
print docid, score,
print
if __name__ == "__main__":
import sys
args = sys.argv[1:]
index_key = "index"
if len(args) == 1:
fspath = args[0]
elif len(args) == 2:
fspath, index_key = args
else:
print "Expected 1 or 2 args, got", len(args)
main(fspath, index_key)