Commit 61e89f2f authored by Guido van Rossum

Merged TextIndexDS9-branch into trunk.

parent a340cb9d
from Products.ZCTextIndex.ISplitter import ISplitter
import re
class HTMLSplitter:
__implements__ = ISplitter
def process(self, text):
return re.sub('<[^>]*>', ' ', text).split()
class HTMLWordSplitter:
__implements__ = ISplitter
def process(self, text):
splat = []
for t in text:
splat += self.split(t)
return splat
def split(self, text):
text = text.lower()
remove = ["<[^>]*>",
"&[A-Za-z]+;",
"\W+"]
for pat in remove:
text = re.sub(pat, " ", text)
rx = re.compile("[A-Za-z]")
return [word for word in text.split()
if len(word) > 1 and rx.search(word)]
if __name__ == "__main__":
import sys
splitter = HTMLWordSplitter()
for path in sys.argv[1:]:
f = open(path, "rb")
buf = f.read()
f.close()
print path
print splitter.process([buf])
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Index Interface."""
import Interface
class IIndex(Interface.Base):
"""Interface for an Index."""
def search(term):
"""Execute a search on a single term given as a string.
Return an IIBucket.
"""
def search_phrase(phrase):
"""Execute a search on a phrase given as a string.
Return an IIBucket.
"""
def search_glob(pattern):
"""Execute a pattern search.
The pattern represents a set of words by using * and ?. For
example, "foo*" represents the set of all words in the lexicon
starting with "foo".
NOTE: Currently only a single trailing * is supported.
Return an IIBucket.
"""
def query_weight(terms):
"""Return the weight for a set of query terms.
'terms' is a sequence of all terms included in the query,
excluding terms that occur inside a NOT. If a term appears more than
once in a query, it should appear more than once in terms.
"""
def index_doc(docid, text):
"XXX"
def unindex_doc(docid):
"XXX"
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Base as Interface
class ILexicon(Interface):
"""Object responsible for converting text to word identifiers."""
def termToWordIds(text):
"""Return a sequence of ids of the words parsed from the text.
The input text may be either a string or a list of strings.
The text is parsed as search terms; words that aren't in the
lexicon are skipped.
"""
def sourceToWordIds(text):
"""Return a sequence of ids of the words parsed from the text.
The input text may be either a string or a list of strings.
The text is parsed as coming from a source document; new word ids
are created for words that aren't (yet) in the lexicon.
"""
def globToWordIds(pattern):
"""Return a sequence of ids of words matching the pattern.
The argument should be a single word using globbing syntax,
e.g. 'foo*' meaning anything starting with 'foo'.
NOTE: Currently only a single trailing * is supported.
Returns the wids for all words in the lexicon that match the
pattern.
"""
def length():
"""Return the number of unique term in the lexicon."""
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""NBest Interface.
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
import Interface
class INBest(Interface.Base):
"""Interface for an N-Best chooser."""
def add(item, score):
"""Record that item 'item' has score 'score'. No return value.
The N best-scoring items are remembered, where N was passed to
the constructor. 'item' can be anything. 'score' should be
a number, and larger numbers are considered better.
"""
def addmany(sequence):
"""Like "for item, score in sequence: self.add(item, score)".
This is simply faster than calling add() len(sequence) times.
"""
def getbest():
"""Return the (at most) N best-scoring items as a sequence.
The return value is a sequence of 2-tuples, (item, score), with
the largest score first. If .add() has been called fewer than
N times, this sequence will contain fewer than N pairs.
"""
def pop_smallest():
"""Return and remove the (item, score) pair with lowest score.
If len(self) is 0, raise IndexError.
To be clear, this is the lowest score among the N best-scoring
seen so far. This is most useful if the capacity of the NBest
object is never exceeded, in which case pop_smallest() allows
using the object as an ordinary smallest-in-first-out priority
queue.
"""
def __len__():
"""Return the number of (item, score) pairs currently known.
This is N (the value passed to the constructor), unless .add()
has been called fewer than N times.
"""
def capacity():
"""Return the maximum number of (item, score) pairs.
This is N (the value passed to the constructor).
"""
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Base as Interface
class IPipelineElement(Interface):
def process(source):
"""Provide a text processing step.
Process a source sequence of words into a result sequence.
"""
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser Interface."""
import Interface
class IQueryParser(Interface.Base):
"""Interface for Query Parsers."""
def parseQuery(query):
"""Parse a query string.
Return a parse tree (which implements IQueryParseTree).
May raise ParseTree.ParseError.
"""
class IQueryParseTree(Interface.Base):
"""Interface for parse trees returned by parseQuery()."""
def nodeType():
"""Return the node type.
This is one of 'AND', 'OR', 'NOT', 'ATOM', 'PHRASE' or 'GLOB'.
"""
def getValue():
"""Return a node-type specific value.
For node type: Return:
'AND' a list of parse trees
'OR' a list of parse trees
'NOT' a parse tree
'ATOM' a string (representing a single search term)
'PHRASE' a string (representing a search phrase)
'GLOB' a string (representing a pattern, e.g. "foo*")
"""
def terms():
"""Return a list of all terms in this node, excluding NOT subtrees."""
def executeQuery(index):
"""Execute the query represented by this node against the index.
The index argument must implement the IIndex interface.
Return an IIBucket or IIBTree mapping document ids to scores
(higher scores mean better results).
May raise ParseTree.QueryError.
"""
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from Interface import Base as Interface
class ISplitter(Interface):
"""A splitter."""
def process(text):
"""Run the splitter over the input text, returning a list of terms."""
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking."""
import math
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IISet
from BTrees.IIBTree import weightedIntersection, weightedUnion
from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
# Instead of storing floats, we generally store scaled ints. Binary pickles
# can store those more efficiently. The default SCALE_FACTOR of 1024
# is large enough to get about 3 decimal digits of fractional info, and
# small enough so that scaled values should almost always fit in a signed
# 16-bit int (we're generally storing logs, so a few bits before the radix
# point goes a long way; on the flip side, for reasonably small numbers x
# most of the info in log(x) is in the fractional bits, so we do want to
# save a lot of those).
SCALE_FACTOR = 1024.0
def scaled_int(f, scale=SCALE_FACTOR):
# We expect only positive inputs, so "add a half and chop" is the
same as round(). Surprisingly, calling round() is significantly more
# expensive.
return int(f * scale + 0.5)
class Index:
__implements__ = IIndex
def __init__(self, lexicon):
self._lexicon = lexicon
# wid -> { docid -> frequency }
self._wordinfo = IOBTree()
# docid -> W(docid)
self._docweight = IIBTree()
# docid -> [ wid ]
# used for un-indexing
self._docwords = IOBTree()
def length(self):
"""Return the number of documents in the index."""
return len(self._docwords)
# Most of the computation for computing a relevance score for the
# document occurs in the search() method. The code currently
# implements the cosine similarity function described in Managing
# Gigabytes, eq. 4.3, p. 187. The index_object() method
# precomputes some values that are independent of the particular
# query.
# The equation is
#
# sum(for t in I(d,q): w(d,t) * w(q,t))
# cosine(d, q) = -------------------------------------
# W(d) * W(q)
#
# where
# I(d, q) = the intersection of the terms in d and q.
#
# w(d, t) = 1 + log f(d, t)
# computed by doc_term_weight(); for a given word t,
# self._wordinfo[t] is a map from d to w(d, t).
#
# w(q, t) = log(1 + N/f(t))
# computed by query_term_weight()
#
# W(d) = sqrt(sum(for t in d: w(d, t) ** 2))
# computed by _get_frequencies(), and remembered in
# self._docweight[d]
#
# W(q) = sqrt(sum(for t in q: w(q, t) ** 2))
# computed by self.query_weight()
def index_doc(self, docid, text):
wids = self._lexicon.sourceToWordIds(text)
uniqwids, freqs, docweight = self._get_frequencies(wids)
for i in range(len(uniqwids)):
self._add_wordinfo(uniqwids[i], freqs[i], docid)
self._docweight[docid] = docweight
self._add_undoinfo(docid, wids)
def unindex_doc(self, docid):
for wid in self._get_undoinfo(docid):
self._del_wordinfo(wid, docid)
del self._docwords[docid]
del self._docweight[docid]
def search(self, term):
wids = self._lexicon.termToWordIds(term)
return self._union(self._search_wids(wids))
def search_glob(self, pattern):
wids = self._lexicon.globToWordIds(pattern)
return self._union(self._search_wids(wids))
def search_phrase(self, phrase):
wids = self._lexicon.termToWordIds(phrase)
hits = self._intersection(self._search_wids(wids))
if not hits:
return hits
code = WidCode.encode(wids)
result = IIBTree()
for docid, weight in hits.items():
docwords = self._docwords[docid]
if docwords.find(code) >= 0:
result[docid] = weight
return result
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._docweight))
L = []
DictType = type({})
for wid in wids:
d2w = self._wordinfo[wid] # maps docid to w(docid, wid)
idf = query_term_weight(len(d2w), N) # this is an unscaled float
#print "idf = %.3f" % idf
if isinstance(d2w, DictType):
d2w = IIBucket(d2w)
L.append((d2w, scaled_int(idf)))
L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
return L
def _intersection(self, L):
if not L:
return IIBTree()
d2w, weight = L[0]
dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
for d2w, weight in L[1:]:
dummy, result = weightedIntersection(result, d2w, 1, weight)
return result
def _union(self, L):
# XXX This can be optimized, see OkapiIndex
result = IIBTree()
for d2w, weight in L:
dummy, result = weightedUnion(result, d2w, 1, weight)
return result
def query_weight(self, terms):
wids = []
for term in terms:
wids += self._lexicon.termToWordIds(term)
N = float(len(self._docweight))
sum = 0.0
for wid in wids:
wt = math.log(1.0 + N / len(self._wordinfo[wid]))
sum += wt ** 2.0
return scaled_int(math.sqrt(sum))
def _get_frequencies(self, wids):
"""Return individual doc-term weights and docweight."""
# Computes w(d, t) for each term, and W(d).
# Return triple:
# [wid0, wid1, ...],
# [w(d, wid0)/W(d), w(d, wid1)/W(d), ...],
# W(d)
# The second list and W(d) are scaled_ints.
d = {}
for wid in wids:
d[wid] = d.get(wid, 0) + 1
Wsquares = 0.0
weights = []
push = weights.append
for count in d.values():
w = doc_term_weight(count)
Wsquares += w * w
push(w)
W = math.sqrt(Wsquares)
#print "W = %.3f" % W
for i in xrange(len(weights)):
#print i, ":", "%.3f" % weights[i],
weights[i] = scaled_int(weights[i] / W)
#print "->", weights[i]
return d.keys(), weights, scaled_int(W)
DICT_CUTOFF = 10
def _add_wordinfo(self, wid, f, docid):
# Store a wordinfo in a dict as long as there are less than
# DICT_CUTOFF docids in the dict. Otherwise use an IIBTree.
# The pickle of a dict is smaller than the pickle of an
# IIBTree, substantially so for small mappings. Thus, we use
# a dictionary until the mapping reaches DICT_CUTOFF elements.
# The cutoff is chosen based on the implementation
# characteristics of Python dictionaries. The dict hashtable
# always has 2**N slots and is resized whenever it is 2/3s
# full. A pickled dict with 10 elts is half the size of an
# IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4. So
# choose 10 as the cutoff for now.
# The IIBTree has a smaller in-memory representation than a
# dictionary, so pickle size isn't the only consideration when
# choosing the threshold. The pickle of a 500-elt dict is 92%
# of the size of the same IIBTree, but the dict uses more
# space when it is live in memory. An IIBTree stores two C
# arrays of ints, one for the keys and one for the values. It
# holds up to 120 key-value pairs in a single bucket.
try:
map = self._wordinfo[wid]
except KeyError:
map = {}
else:
# _add_wordinfo() is called for each update. If the map
# size exceeds the DICT_CUTOFF, convert to an IIBTree.
if len(map) == self.DICT_CUTOFF:
map = IIBTree(map)
map[docid] = f
self._wordinfo[wid] = map # Not redundant, because of Persistency!
def _del_wordinfo(self, wid, docid):
try:
map = self._wordinfo[wid]
del map[docid]
except KeyError:
return
if len(map) == 0:
del self._wordinfo[wid]
return
if len(map) == self.DICT_CUTOFF:
new = {}
for k, v in map.items():
new[k] = v
map = new
self._wordinfo[wid] = map # Not redundant, because of Persistency!
def _add_undoinfo(self, docid, wids):
self._docwords[docid] = WidCode.encode(wids)
def _get_undoinfo(self, docid):
return WidCode.decode(self._docwords[docid])
# The rest are helper methods to support unit tests
def _get_wdt(self, d, t):
wid, = self._lexicon.termToWordIds(t)
map = self._wordinfo[wid]
return map.get(d, 0) * self._docweight[d] / SCALE_FACTOR
def _get_Wd(self, d):
return self._docweight[d]
def _get_ft(self, t):
wid, = self._lexicon.termToWordIds(t)
return len(self._wordinfo[wid])
def _get_wt(self, t):
wid, = self._lexicon.termToWordIds(t)
map = self._wordinfo[wid]
return scaled_int(math.log(1 + len(self._docweight) / float(len(map))))
def doc_term_weight(count):
"""Return the doc-term weight for a term that appears count times."""
# implements w(d, t) = 1 + log f(d, t)
return 1.0 + math.log(count)
def query_term_weight(term_count, num_items):
"""Return the query-term weight for a term,
that appears in term_count items in a collection with num_items
total items.
"""
# implements w(q, t) = log(1 + N/f(t))
return math.log(1.0 + float(num_items) / term_count)
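# A small illustrative sketch of how the weighting helpers above combine.
# The numbers and the _demo_ helper are made up: suppose a document
# contains the term 3 times, and 10 of the 100 documents contain it.
def _demo_cosine_weights():
    f_dt = 3     # f(d, t): occurrences of the term in the document
    N = 100      # total number of documents in the collection
    f_t = 10     # number of documents containing the term
    w_dt = doc_term_weight(f_dt)        # w(d, t) = 1 + log f(d, t)
    w_qt = query_term_weight(f_t, N)    # w(q, t) = log(1 + N/f(t))
    # The index stores these as scaled ints (see SCALE_FACTOR above).
    print("w(d,t) = %.3f, scaled %d" % (w_dt, scaled_int(w_dt)))
    print("w(q,t) = %.3f, scaled %d" % (w_qt, scaled_int(w_qt)))

if __name__ == "__main__":
    _demo_cosine_weights()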
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
import re
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.StopDict import get_stopdict
class Lexicon:
__implements__ = ILexicon
def __init__(self, *pipeline):
self.__wids = OIBTree()
self.__words = IOBTree()
# XXX we're reserving wid 0, but that might be yagni
self.__nextwid = 1
self.__pipeline = pipeline
def length(self):
"""Return the number of unique terms in the lexicon."""
return self.__nextwid - 1
def words(self):
return self.__wids.keys()
def wids(self):
return self.__words.keys()
def items(self):
return self.__wids.items()
def sourceToWordIds(self, text):
last = _text2list(text)
for element in self.__pipeline:
last = element.process(last)
return map(self._getWordIdCreate, last)
def termToWordIds(self, text):
last = _text2list(text)
for element in self.__pipeline:
last = element.process(last)
wids = []
for word in last:
wid = self.__wids.get(word)
if wid is not None:
wids.append(wid)
return wids
def globToWordIds(self, pattern):
if not re.match("^\w+\*$", pattern):
return []
pattern = pattern.lower()
assert pattern.endswith("*")
prefix = pattern[:-1]
assert prefix and not prefix.endswith("*")
keys = self.__wids.keys(prefix) # Keys starting at prefix
wids = []
words = []
for key in keys:
if not key.startswith(prefix):
break
wids.append(self.__wids[key])
words.append(key)
return wids
def _getWordIdCreate(self, word):
wid = self.__wids.get(word)
if wid is None:
wid = self.__new_wid()
self.__wids[word] = wid
self.__words[wid] = word
return wid
def __new_wid(self):
wid = self.__nextwid
self.__nextwid += 1
return wid
def _text2list(text):
# Helper: splitter input may be a string or a list of strings
try:
text + ""
except:
return text
else:
return [text]
# Sample pipeline elements
class Splitter:
import re
rx = re.compile(r"\w+")
def process(self, lst):
result = []
for s in lst:
result += self.rx.findall(s)
return result
class CaseNormalizer:
def process(self, lst):
return [w.lower() for w in lst]
class StopWordRemover:
dict = get_stopdict().copy()
for c in range(255):
dict[chr(c)] = None
def process(self, lst):
has_key = self.dict.has_key
return [w for w in lst if not has_key(w)]
try:
from Products.ZCTextIndex import stopper as _stopper
except ImportError:
pass
else:
_stopwords = StopWordRemover.dict
def StopWordRemover():
swr = _stopper.new()
swr.dict.update(_stopwords)
return swr
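# Illustrative sketch: wire the sample pipeline elements above into a
# Lexicon. The input strings and the _demo_ helper are made up; only code
# defined in this module is used.
def _demo_lexicon():
    lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
    # Indexing a source document creates new word ids as needed.
    print(lexicon.sourceToWordIds("The quick brown fox jumps"))
    # Query terms reuse existing ids; unknown words are silently dropped.
    print(lexicon.termToWordIds("QUICK unknownword"))
    print(lexicon.length())    # number of unique terms seen so far

if __name__ == "__main__":
    _demo_lexicon()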
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""NBest
An NBest object remembers the N best-scoring items ever passed to its
.add(item, score) method. If .add() is called M times, the worst-case
number of comparisons performed overall is M * log2(N).
"""
from bisect import bisect
from Products.ZCTextIndex.INBest import INBest
class NBest:
__implements__ = INBest
def __init__(self, N):
"Build an NBest object to remember the N best-scoring objects."
if N < 1:
raise ValueError("NBest() argument must be at least 1")
self._capacity = N
# This does a very simple thing with sorted lists. For large
# N, a min-heap can be unboundedly better in terms of data
# movement time.
self.scores = []
self.items = []
def __len__(self):
return len(self.scores)
def capacity(self):
return self._capacity
def add(self, item, score):
self.addmany([(item, score)])
def addmany(self, sequence):
scores, items, capacity = self.scores, self.items, self._capacity
n = len(scores)
for item, score in sequence:
# When we're in steady-state, the usual case is that we're filled
# to capacity, and that an incoming item is worse than any of
# the best-seen so far.
if n >= capacity and score <= scores[0]:
continue
i = bisect(scores, score)
scores.insert(i, score)
items.insert(i, item)
if n == capacity:
del items[0], scores[0]
else:
n += 1
assert n == len(scores)
def getbest(self):
result = zip(self.items, self.scores)
result.reverse()
return result
def pop_smallest(self):
if self.scores:
return self.items.pop(0), self.scores.pop(0)
raise IndexError("pop_smallest() called on empty NBest object")
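# Quick usage sketch: keep the three best-scoring items from a stream of
# (item, score) pairs. The data and the _demo_ helper are made up.
def _demo_nbest():
    best = NBest(3)
    best.addmany([("a", 10), ("b", 5), ("c", 7), ("d", 12), ("e", 1)])
    print(best.getbest())       # [('d', 12), ('a', 10), ('c', 7)]
    print(best.pop_smallest())  # ('c', 7), the worst of the three kept

if __name__ == "__main__":
    _demo_nbest()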
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Full text index with relevance ranking, using an Okapi BM25 rank."""
# Lots of comments are at the bottom of this file. Read them to
# understand what's going on.
import math
from BTrees.IOBTree import IOBTree
from BTrees.IIBTree import IIBTree, IIBucket, IISet
from BTrees.IIBTree import weightedIntersection, weightedUnion
from Products.ZCTextIndex.IIndex import IIndex
from Products.ZCTextIndex import WidCode
from Products.ZCTextIndex.NBest import NBest
# Instead of storing floats, we generally store scaled ints. Binary pickles
# can store those more efficiently. The default SCALE_FACTOR of 1024
# is large enough to get about 3 decimal digits of fractional info, and
# small enough so that scaled values should almost always fit in a signed
# 16-bit int (we're generally storing logs, so a few bits before the radix
# point goes a long way; on the flip side, for reasonably small numbers x
# most of the info in log(x) is in the fractional bits, so we do want to
# save a lot of those).
SCALE_FACTOR = 1024.0
def scaled_int(f, scale=SCALE_FACTOR):
# We expect only positive inputs, so "add a half and chop" is the
same as round(). Surprisingly, calling round() is significantly more
# expensive.
return int(f * scale + 0.5)
class Index:
__implements__ = IIndex
# BM25 free parameters.
K1 = 1.2
B = 0.75
assert K1 >= 0.0
assert 0.0 <= B <= 1.0
def __init__(self, lexicon):
self._lexicon = lexicon
# wid -> { docid -> frequency }; t -> D -> f(D, t)
self._wordinfo = IOBTree()
# docid -> # of words in the doc
# XXX this is just len(self._docwords[docid]), but if _docwords
# XXX is stored in compressed form then uncompressing just to count
# XXX the list length would be ridiculously expensive.
self._doclen = IIBTree()
# docid -> [ wid ]
# used for un-indexing
self._docwords = IOBTree()
# sum(self._doclen.values()), the total # of words in all docs
self._totaldoclen = 0L
def length(self):
"""Return the number of documents in the index."""
return len(self._docwords)
# Most of the computation for computing a relevance score for the
# document occurs in the search() method.
def index_doc(self, docid, text):
wids = self._lexicon.sourceToWordIds(text)
self._doclen[docid] = len(wids)
self._totaldoclen += len(wids)
wid2count = self._get_frequencies(wids)
for wid, count in wid2count.items():
self._add_wordinfo(wid, count, docid)
self._add_undoinfo(docid, wids)
def unindex_doc(self, docid):
for wid in self._get_undoinfo(docid):
self._del_wordinfo(wid, docid)
del self._docwords[docid]
count = self._doclen[docid]
del self._doclen[docid]
self._totaldoclen -= count
def search(self, term):
wids = self._lexicon.termToWordIds(term)
return self._union(self._search_wids(wids))
def search_glob(self, pattern):
wids = self._lexicon.globToWordIds(pattern)
return self._union(self._search_wids(wids))
def search_phrase(self, phrase):
wids = self._lexicon.termToWordIds(phrase)
hits = self._intersection(self._search_wids(wids))
if not hits:
return hits
code = WidCode.encode(wids)
result = IIBTree()
for docid, weight in hits.items():
docwords = self._docwords[docid]
if docwords.find(code) >= 0:
result[docid] = weight
return result
def _search_wids(self, wids):
if not wids:
return []
N = float(len(self._doclen))
L = []
K1 = self.K1
B = self.B
K1_plus1 = K1 + 1.0
B_from1 = 1.0 - B
meandoclen = self._totaldoclen / N
# f(D, t) * (k1 + 1)
# TF(D, t) = -------------------------------------------
# f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
for wid in wids:
d2f = self._wordinfo[wid] # map {docid -> f(docid, wid)}
idf = inverse_doc_frequency(len(d2f), N) # this is an unscaled float
result = IIBucket()
for docid, f in d2f.items():
lenweight = B_from1 + B * self._doclen[docid] / meandoclen
tf = f * K1_plus1 / (f + K1 * lenweight)
result[docid] = scaled_int(tf * idf)
L.append((result, 1))
return L
# Note about the above: the result is tf * idf. tf is small -- it
# can't be larger than k1+1 = 2.2. idf is formally unbounded, but
# is less than 14 for a term that appears in only 1 of a million
# documents. So the product is probably less than 32, or 5 bits
# before the radix point. If we did the scaled-int business on
# both of them, we'd be up to 25 bits. Add 64 of those and we'd
# be in overflow territory. That's pretty unlikely, so we *could*
# just store scaled_int(tf) in result[docid], and use scaled_int(idf)
# as an invariant weight across the whole result. But besides
# skating near the edge, it's not a speed cure, since the computation
# of tf would still be done at Python speed, and it's a lot more
# work than just multiplying by idf.
def _intersection(self, L):
if not L:
return IIBTree()
# Intersect with smallest first.
L = L[:] # don't mutate the caller's L
L.sort(lambda x, y: cmp(len(x[0]), len(y[0])))
d2w, weight = L[0]
dummy, result = weightedUnion(IIBTree(), d2w, 1, weight)
for d2w, weight in L[1:]:
dummy, result = weightedIntersection(result, d2w, 1, weight)
return result
def _union(self, L):
if not L:
return IIBTree()
# Balance unions as closely as possible, smallest to largest.
merge = NBest(len(L))
for x, weight in L:
merge.add((x, weight), len(x))
while len(merge) > 1:
# Merge the two smallest so far, and add back to the queue.
x, wx = merge.pop_smallest()
y, wy = merge.pop_smallest()
dummy, z = weightedUnion(x, y, wx, wy)
merge.add((z, 1), len(z))
(result, weight), score = merge.pop_smallest()
return result
def query_weight(self, terms):
# XXX I have no idea what to put here
return 10
def _get_frequencies(self, wids):
"""Return individual term frequencies."""
# Computes f(d, t) for each term.
# Returns a dict mapping wid to the number of times wid appeared
# in wids, {t: f(d, t)}
d = {}
dget = d.get
for wid in wids:
d[wid] = dget(wid, 0) + 1
return d
DICT_CUTOFF = 10
def _add_wordinfo(self, wid, f, docid):
# Store a wordinfo in a dict as long as there are less than
# DICT_CUTOFF docids in the dict. Otherwise use an IIBTree.
# The pickle of a dict is smaller than the pickle of an
# IIBTree, substantially so for small mappings. Thus, we use
# a dictionary until the mapping reaches DICT_CUTOFF elements.
# The cutoff is chosen based on the implementation
# characteristics of Python dictionaries. The dict hashtable
# always has 2**N slots and is resized whenever it is 2/3s
# full. A pickled dict with 10 elts is half the size of an
# IIBTree with 10 elts, and 10 happens to be 2/3s of 2**4. So
# choose 10 as the cutoff for now.
# The IIBTree has a smaller in-memory representation than a
# dictionary, so pickle size isn't the only consideration when
# choosing the threshold. The pickle of a 500-elt dict is 92%
# of the size of the same IIBTree, but the dict uses more
# space when it is live in memory. An IIBTree stores two C
# arrays of ints, one for the keys and one for the values. It
# holds up to 120 key-value pairs in a single bucket.
try:
map = self._wordinfo[wid]
except KeyError:
map = {}
else:
# _add_wordinfo() is called for each update. If the map
# size exceeds the DICT_CUTOFF, convert to an IIBTree.
if len(map) == self.DICT_CUTOFF:
map = IIBTree(map)
map[docid] = f
self._wordinfo[wid] = map # Not redundant, because of Persistency!
def _del_wordinfo(self, wid, docid):
try:
map = self._wordinfo[wid]
del map[docid]
except KeyError:
return
if len(map) == 0:
del self._wordinfo[wid]
return
if len(map) == self.DICT_CUTOFF:
new = {}
for k, v in map.items():
new[k] = v
map = new
self._wordinfo[wid] = map # Not redundant, because of Persistency!
def _add_undoinfo(self, docid, wids):
self._docwords[docid] = WidCode.encode(wids)
def _get_undoinfo(self, docid):
return WidCode.decode(self._docwords[docid])
# The rest are helper methods to support unit tests
# XXX These don't work for Okapi, I assume
def _get_wdt(self, d, t):
wid, = self._lexicon.termToWordIds(t)
map = self._wordinfo[wid]
return map.get(d, 0) * self._doclen[d] / SCALE_FACTOR
def _get_Wd(self, d):
return self._doclen[d]
def _get_ft(self, t):
wid, = self._lexicon.termToWordIds(t)
return len(self._wordinfo[wid])
def _get_wt(self, t):
wid, = self._lexicon.termToWordIds(t)
map = self._wordinfo[wid]
return scaled_int(math.log(1 + len(self._doclen) / float(len(map))))
def inverse_doc_frequency(term_count, num_items):
"""Return the inverse doc frequency for a term,
that appears in term_count items in a collection with num_items
total items.
"""
# implements IDF(q, t) = log(1 + N/f(t))
return math.log(1.0 + float(num_items) / term_count)
"""
"Okapi" (much like "cosine rule" also) is a large family of scoring gimmicks.
It's based on probability arguments about how words are distributed in
documents, not on an abstract vector space model. A long paper by its
principal inventors gives an excellent overview of how it was derived:
A probabilistic model of information retrieval: development and status
K. Sparck Jones, S. Walker, S.E. Robertson
http://citeseer.nj.nec.com/jones98probabilistic.html
Spellings that ignore relevance information (which we don't have) are of this
high-level form:
score(D, Q) = sum(for t in D&Q: TF(D, t) * IDF(Q, t))
where
D a specific document
Q a specific query
t a term (word, atomic phrase, whatever)
D&Q the terms common to D and Q
TF(D, t) a measure of t's importance in D -- a kind of term frequency
weight
IDF(Q, t) a measure of t's importance in the query and in the set of
documents as a whole -- a kind of inverse document frequency
weight
The IDF(Q, t) here is identical to the one used for our cosine measure.
Since queries are expected to be short, it ignores Q entirely:
IDF(Q, t) = log(1.0 + N / f(t))
where
N the total number of documents
f(t) the number of documents in which t appears
Most Okapi literature seems to use log(N/f(t)) instead. We don't, because
that becomes 0 for a term that's in every document, and, e.g., if someone
is searching for "documentation" on python.org (a term that may well show
up on every page, due to the top navigation bar), we still want to find the
pages that use the word a lot (which is TF's job to find, not IDF's -- we
just want to stop IDF from considering this t to be irrelevant).
The TF(D, t) spellings are more interesting. With lots of variations, the
most basic spelling is of the form
f(D, t)
TF(D, t) = ---------------
f(D, t) + K(D)
where
f(D, t) the number of times t appears in D
K(D) a measure of the length of D, normalized to mean doc length
The functional *form* f/(f+K) is clever. It's a gross approximation to a
mixture of two distinct Poisson distributions, based on the idea that t
probably appears in D for one of two reasons:
1. More or less at random.
2. Because it's important to D's purpose in life ("eliteness" in papers).
Note that f/(f+K) is always between 0 and 1. If f is very large compared to
K, it approaches 1. If K is very large compared to f, it approaches 0. If
t appears in D more or less "for random reasons", f is likely to be small,
and so K will dominate unless it's a very small doc, and the ratio will be
small. OTOH, if t appears a lot in D, f will dominate unless it's a very
large doc, and the ratio will be close to 1.
We use a variation on that simple theme, a simplification of what's called
BM25 in the literature (it was the 25th stab at a Best Match function from
the Okapi group; "a simplification" means we're setting some of BM25's more
esoteric free parameters to 0):
f(D, t) * (k1 + 1)
TF(D, t) = --------------------
f(D, t) + k1 * K(D)
where
k1 a "tuning factor", typically between 1.0 and 2.0. We use 1.2,
the usual default value. This constant adjusts the curve to
look more like a theoretical 2-Poisson curve.
Note that as f(D, t) increases, TF(D, t) increases monotonically, approaching
an asymptote of k1+1 from below.
Finally, we use
K(D) = (1-b) + b * len(D)/E(len(D))
where
b is another free parameter, discussed below. We use 0.75.
len(D) the length of D in words
E(len(D)) the expected value of len(D) across the whole document set;
or, IOW, the average document length
b is a free parameter between 0.0 and 1.0, and adjusts for the expected effect
of the "Verbosity Hypothesis". Suppose b is 1, and some word t appears
10 times as often in document d2 as in document d1. If document d2 is
also 10 times as long as d1, TF(d1, t) and TF(d2, t) are identical:
f(d2, t) * (k1 + 1)
TF(d2, t) = --------------------------------- =
f(d2, t) + k1 * len(d2)/E(len(D))
10 * f(d1, t) * (k1 + 1)
----------------------------------------------- = TF(d1, t)
10 * f(d1, t) + k1 * (10 * len(d1))/E(len(D))
because the 10's cancel out. This is appropriate if we believe that a word
appearing 10x more often in a doc 10x as long is simply because the
longer doc is more verbose. If we do believe that, the longer doc and the
shorter doc are probably equally relevant. OTOH, it *could* be that the
longer doc is talking about t in greater depth too, in which case it's
probably more relevant than the shorter doc.
At the other extreme, if we set b to 0, the len(D)/E(len(D)) term vanishes
completely, and a doc scores higher for having more occurrences of a word
regardless of the doc's length.
Reality is between these extremes, and probably varies by document and word
too. Reports in the literature suggest that b=0.75 is a good compromise "in
general", favoring the "verbosity hypothesis" end of the scale.
Putting it all together, the final TF function is
f(D, t) * (k1 + 1)
TF(D, t) = --------------------------------------------
f(D, t) + k1 * ((1-b) + b*len(D)/E(len(D)))
with k1=1.2 and b=0.75.
"""
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Generic parser support: exception and parse tree nodes."""
from BTrees.IIBTree import difference, weightedIntersection, weightedUnion
from Products.ZCTextIndex.NBest import NBest
class QueryError(Exception):
pass
class ParseError(Exception):
pass
class ParseTreeNode:
_nodeType = None
def __init__(self, value):
self._value = value
def nodeType(self):
return self._nodeType
def getValue(self):
return self._value
def __repr__(self):
return "%s(%r)" % (self.__class__.__name__, self.getValue())
def terms(self):
t = []
for v in self.getValue():
t.extend(v.terms())
return t
def executeQuery(self, index):
raise NotImplementedError
class NotNode(ParseTreeNode):
_nodeType = "NOT"
def terms(self):
return []
def executeQuery(self, index):
raise QueryError, "NOT operator must occur right after AND"
class AndNode(ParseTreeNode):
_nodeType = "AND"
def executeQuery(self, index):
L = []
Nots = []
for subnode in self.getValue():
if subnode.nodeType() == "NOT":
Nots.append(subnode.getValue().executeQuery(index))
else:
L.append(subnode.executeQuery(index))
assert L
L.sort(lambda x, y: cmp(len(x), len(y)))
set = L[0]
for x in L[1:]:
dummy, set = weightedIntersection(set, x)
if Nots:
Nots.sort(lambda x, y: cmp(len(x), len(y)))
notset = Nots[0]
for x in Nots[1:]:
dummy, notset = weightedUnion(notset, x)
set = difference(set, notset)
return set
class OrNode(ParseTreeNode):
_nodeType = "OR"
def executeQuery(self, index):
# Balance unions as closely as possible, smallest to largest.
allofem = self.getValue()
merge = NBest(len(allofem))
for subnode in allofem:
result = subnode.executeQuery(index)
merge.add(result, len(result))
while len(merge) > 1:
# Merge the two smallest so far, and add back to the queue.
x, dummy = merge.pop_smallest()
y, dummy = merge.pop_smallest()
dummy, z = weightedUnion(x, y)
merge.add(z, len(z))
result, dummy = merge.pop_smallest()
return result
class AtomNode(ParseTreeNode):
_nodeType = "ATOM"
def terms(self):
return [self.getValue()]
def executeQuery(self, index):
return index.search(self.getValue())
class PhraseNode(AtomNode):
_nodeType = "PHRASE"
def executeQuery(self, index):
return index.search_phrase(self.getValue())
class GlobNode(AtomNode):
_nodeType = "GLOB"
def executeQuery(self, index):
return index.search_glob(self.getValue())
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
"""Query Parser.
This particular parser recognizes the following syntax:
Start = OrExpr
OrExpr = AndExpr ('OR' AndExpr)*
AndExpr = Term ('AND' NotExpr)*
NotExpr = ['NOT'] Term
Term = '(' OrExpr ')' | ATOM+
The key words (AND, OR, NOT) are recognized in any mixture of case.
An ATOM is either:
+ A sequence of characters not containing whitespace or parentheses or
double quotes, and not equal to one of the key words 'AND', 'OR', 'NOT'; or
+ A non-empty string enclosed in double quotes. The interior of the string
can contain whitespace, parentheses and key words.
In addition, an ATOM may optionally be preceded by a hyphen, meaning
that it must not be present.
An unquoted ATOM may also end in a star. This is a primitive
"globbing" function, meaning to search for any word with a given
prefix.
When multiple consecutive ATOMs are found at the leaf level, they are
connected by an implied AND operator, and an unquoted leading hyphen
is interpreted as a NOT operator.
Summarizing the default operator rules:
- a sequence of words without operators implies AND, e.g. ``foo bar''
- double-quoted text implies phrase search, e.g. ``"foo bar"''
- words connected by punctuation implies phrase search, e.g. ``foo-bar''
- a leading hyphen implies NOT, e.g. ``foo -bar''
- these can be combined, e.g. ``foo -"foo bar"'' or ``foo -foo-bar''
- a trailing * means globbing (i.e. prefix search), e.g. ``foo*''
"""
import re
import ParseTree # relative import
# Create unique symbols for token types.
_AND = intern("AND")
_OR = intern("OR")
_NOT = intern("NOT")
_LPAREN = intern("(")
_RPAREN = intern(")")
_ATOM = intern("ATOM")
_EOF = intern("EOF")
# Map keyword string to token type.
_keywords = {
_AND: _AND,
_OR: _OR,
_NOT: _NOT,
_LPAREN: _LPAREN,
_RPAREN: _RPAREN,
}
# Regular expression to tokenize.
_tokenizer_regex = re.compile(r"""
# a paren
[()]
# or an optional hyphen
| -?
# followed by
(?:
# a string
" [^"]* "
# or a non-empty stretch w/o whitespace, parens or double quotes
| [^()\s"]+
)
""", re.VERBOSE)
class QueryParser:
def __init__(self):
pass # This parser has no persistent state
def parseQuery(self, query):
# Lexical analysis.
tokens = _tokenizer_regex.findall(query)
self.__tokens = tokens
# classify tokens
self.__tokentypes = [_keywords.get(token.upper(), _ATOM)
for token in tokens]
# add _EOF
self.__tokens.append(_EOF)
self.__tokentypes.append(_EOF)
self.__index = 0
# Syntactical analysis.
tree = self._parseOrExpr()
self._require(_EOF)
return tree
# Recursive descent parser
def _require(self, tokentype):
if not self._check(tokentype):
t = self.__tokens[self.__index]
msg = "Token %r required, %r found" % (tokentype, t)
raise ParseTree.ParseError, msg
def _check(self, tokentype):
if self.__tokentypes[self.__index] is tokentype:
self.__index += 1
return 1
else:
return 0
def _peek(self, tokentype):
return self.__tokentypes[self.__index] is tokentype
def _get(self, tokentype):
t = self.__tokens[self.__index]
self._require(tokentype)
return t
def _parseOrExpr(self):
L = []
L.append(self._parseAndExpr())
while self._check(_OR):
L.append(self._parseAndExpr())
if len(L) == 1:
return L[0]
else:
return ParseTree.OrNode(L)
def _parseAndExpr(self):
L = []
L.append(self._parseTerm())
while self._check(_AND):
L.append(self._parseNotExpr())
if len(L) == 1:
return L[0]
else:
return ParseTree.AndNode(L)
def _parseNotExpr(self):
if self._check(_NOT):
return ParseTree.NotNode(self._parseTerm())
else:
return self._parseTerm()
def _parseTerm(self):
if self._check(_LPAREN):
tree = self._parseOrExpr()
self._require(_RPAREN)
else:
atoms = [self._get(_ATOM)]
while self._peek(_ATOM):
atoms.append(self._get(_ATOM))
nodes = []
nots = []
for a in atoms:
words = re.findall(r"\w+\*?", a)
if not words:
continue
if len(words) > 1:
n = ParseTree.PhraseNode(" ".join(words))
elif words[0].endswith("*"):
n = ParseTree.GlobNode(words[0])
else:
n = ParseTree.AtomNode(words[0])
if a[0] == "-":
n = ParseTree.NotNode(n)
nots.append(n)
else:
nodes.append(n)
if not nodes:
text = " ".join(atoms)
msg = "At least one positive term required: %r" % text
raise ParseTree.ParseError, msg
nodes.extend(nots)
if len(nodes) == 1:
tree = nodes[0]
else:
tree = ParseTree.AndNode(nodes)
return tree
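# Usage sketch: parse a made-up query string and inspect the resulting
# tree. executeQuery() would additionally need an object implementing
# IIndex, so only parsing is shown here; the _demo_ helper is illustrative.
def _demo_parser():
    parser = QueryParser()
    tree = parser.parseQuery('foo bar OR "ham eggs" -spam*')
    print(tree)           # nested OR/AND/NOT/ATOM/PHRASE/GLOB nodes
    print(tree.terms())   # positive terms only; NOT subtrees are excluded

if __name__ == "__main__":
    _demo_parser()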
"""Rice coding (a varaitn of Golomb coding)
Based on a Java implementation by Glen McCluskey described in a Usenix
;login: article at
http://www.usenix.org/publications/login/2000-4/features/java.html
McCluskey's article explains the approach as follows. The encoding
for a value x is represented as a unary part and a binary part. The
unary part is a sequence of 1 bits followed by a 0 bit. The binary
part encodes some of the lower bits of x-1.
The encoding is parameterized by a value m that describes how many
bits to store in the binary part. If most of the values are smaller
than 2**m then they can be stored in only m+1 bits.
Compute the length of the unary part, q, where
q = math.floor((x-1) / 2**m)
Emit q 1 bits followed by a 0 bit.
Emit the lower m bits of x-1, treating x-1 as a binary value.
"""
import array
class BitArray:
def __init__(self, buf=None):
self.bytes = array.array('B')
self.nbits = 0
self.bitsleft = 0
self.tostring = self.bytes.tostring
def __getitem__(self, i):
byte, offset = divmod(i, 8)
mask = 2 ** offset
if self.bytes[byte] & mask:
return 1
else:
return 0
def __setitem__(self, i, val):
byte, offset = divmod(i, 8)
mask = 2 ** offset
if val:
self.bytes[byte] |= mask
else:
self.bytes[byte] &= ~mask
def __len__(self):
return self.nbits
def append(self, bit):
"""Append a 1 if bit is true or 1 if it is false."""
if self.bitsleft == 0:
self.bytes.append(0)
self.bitsleft = 8
self.__setitem__(self.nbits, bit)
self.nbits += 1
self.bitsleft -= 1
def __getstate__(self):
return self.nbits, self.bitsleft, self.tostring()
def __setstate__(self, (nbits, bitsleft, s)):
self.bytes = array.array('B', s)
self.nbits = nbits
self.bitsleft = bitsleft
class RiceCode:
def __init__(self, m):
"""Constructor a RiceCode for m-bit values."""
if not (0 <= m <= 16):
raise ValueError, "m must be between 0 and 16"
self.init(m)
self.bits = BitArray()
self.len = 0
def init(self, m):
self.m = m
self.lower = (1 << m) - 1
self.mask = 1 << (m - 1)
def append(self, val):
"""Append an item to the list."""
if val < 1:
raise ValueError, "value >= 1 expected, got %s" % `val`
val -= 1
# emit the unary part of the code
q = val >> self.m
for i in range(q):
self.bits.append(1)
self.bits.append(0)
# emit the binary part
r = val & self.lower
mask = self.mask
while mask:
self.bits.append(r & mask)
mask >>= 1
self.len += 1
def __len__(self):
return self.len
def tolist(self):
"""Return the items as a list."""
l = []
i = 0 # bit offset
binary_range = range(self.m)
for j in range(self.len):
unary = 0
while self.bits[i] == 1:
unary += 1
i += 1
assert self.bits[i] == 0
i += 1
binary = 0
for k in binary_range:
binary = (binary << 1) | self.bits[i]
i += 1
l.append((unary << self.m) + (binary + 1))
return l
def tostring(self):
"""Return a binary string containing the encoded data.
The binary string may contain some extra zeros at the end.
"""
return self.bits.tostring()
def __getstate__(self):
return self.m, self.bits
def __setstate__(self, (m, bits)):
self.init(m)
self.bits = bits
def encode(m, l):
c = RiceCode(m)
for elt in l:
c.append(elt)
assert c.tolist() == l
return c
def encode_deltas(l):
if len(l) == 1:
return l[0], []
deltas = RiceCode(6)
deltas.append(l[1] - l[0])
for i in range(2, len(l)):
deltas.append(l[i] - l[i - 1])
return l[0], deltas
def decode_deltas(start, enc_deltas):
deltas = enc_deltas.tolist()
l = [start]
# deltas[i] is l[i+1] - l[i], so each delta extends the running total.
for d in deltas:
l.append(l[-1] + d)
return l
def test():
import random
for size in [10, 20, 50, 100, 200]:
l = [random.randint(1, size) for i in range(50)]
c = encode(random.randint(1, 16), l)
assert c.tolist() == l
for size in [10, 20, 50, 100, 200]:
l = range(random.randint(1, size), size + random.randint(1, size))
t = encode_deltas(l)
l2 = decode_deltas(*t)
assert l == l2
if l != l2:
print l
print l2
def pickle_efficiency():
import pickle
import random
for m in [4, 8, 12]:
for size in [10, 20, 50, 100, 200, 500, 1000, 2000, 5000]:
for elt_range in [10, 20, 50, 100, 200, 500, 1000]:
l = [random.randint(1, elt_range) for i in range(size)]
raw = pickle.dumps(l, 1)
enc = pickle.dumps(encode(m, l), 1)
print "m=%2d size=%4d range=%4d" % (m, size, elt_range),
print "%5d %5d" % (len(raw), len(enc)),
if len(raw) > len(enc):
print "win"
else:
print "lose"
if __name__ == "__main__":
test()
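# Usage sketch with made-up data: Rice-encode a short list of positive
# ints, then delta-encode a sorted list the way a wid posting might be.
# The _demo_ helper is illustrative only.
def _demo_ricecode():
    values = [1, 3, 7, 7, 20]
    c = encode(4, values)                 # 4-bit binary part
    print(len(c.tostring()))              # encoded size in bytes
    print(c.tolist() == values)           # True: round-trips exactly
    first, deltas = encode_deltas([10, 12, 15, 21])
    print(decode_deltas(first, deltas))   # [10, 12, 15, 21]

if __name__ == "__main__":
    _demo_ricecode()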
*shared*
stopper stopper.c
"""Provide a default list of stop words for the index.
The specific splitter and lexicon are customizable, but the default
ZCTextIndex should do something useful.
"""
def get_stopdict():
"""Return a dictionary of stopwords."""
return _dict
# This list of English stopwords comes from Lucene
_words = [
"a", "and", "are", "as", "at", "be", "but", "by",
"for", "if", "in", "into", "is", "it",
"no", "not", "of", "on", "or", "such",
"that", "the", "their", "then", "there", "these",
"they", "this", "to", "was", "will", "with"
]
_dict = {}
for w in _words:
_dict[w] = None
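# Tiny usage sketch: filter a made-up token list against the stop-word
# dictionary defined above.
if __name__ == "__main__":
    stop = get_stopdict()
    tokens = ["the", "zope", "catalog", "is", "fast"]
    print([t for t in tokens if not stop.has_key(t)])   # ['zope', 'catalog', 'fast']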
# A byte-aligned encoding for lists of non-negative ints, using fewer bytes
# for smaller ints. This is intended for lists of word ids (wids). The
# ordinary string .find() method can be used to find the encoded form of a
# desired wid-string in an encoded wid-string. As in UTF-8, the initial byte
# of an encoding can't appear in the interior of an encoding, so find() can't
# be fooled into starting a match "in the middle" of an encoding.
# Details:
#
# + Only the first byte of an encoding has the sign bit set.
#
# + The number of bytes in the encoding is encoded in unary at the start of
# the first byte (i.e., an encoding with n bytes begins with n 1-bits
# followed by a 0 bit).
#
# + Bytes beyond the first in an encoding have the sign bit clear, followed
# by 7 bits of data.
#
# + The number of data bits in the first byte of an encoding varies.
#
# The int to be encoded can contain no more than 24 bits.
# XXX this could certainly be increased
#
# If it contains no more than 6 bits, 00abcdef, the encoding is
# 10abcdef
#
# If it contains 7 thru 12 bits,
# 0000abcd efghijkL
# the encoding is
# 110abcde 0fghijkL
#
# Static tables _encoding and _decoding capture all encodes and decodes for
# 12 or fewer bits.
#
# If it contains 13 thru 18 bits,
# 000000ab cdefghij kLmnopqr
# the encoding is
# 1110abcd 0efghijk 0Lmnopqr
#
# If it contains 19 thru 24 bits,
# abcdefgh ijkLmnop qrstuvwx
# the encoding is
# 11110abc 0defghij 0kLmnopq 0rstuvwx
import re
def encode(wids):
# Encode a list of wids as a string.
wid2enc = _encoding
n = len(wid2enc)
return "".join([w < n and wid2enc[w] or _encode(w) for w in wids])
_encoding = [None] * 0x1000 # Filled later, and converted to a tuple
def _encode(w):
assert 0x1000 <= w < 0x1000000
b, c = divmod(w, 0x80)
a, b = divmod(b, 0x80)
s = chr(b) + chr(c)
if a < 0x10: # no more than 18 data bits
return chr(a + 0xE0) + s
a, b = divmod(a, 0x80)
assert a < 0x4, (w, a, b, s) # else more than 24 data bits
return (chr(a + 0xF0) + chr(b)) + s
_prog = re.compile(r"[\x80-\xFF][\x00-\x7F]*")
def decode(code):
# Decode a string into a list of wids.
get = _decoding.get
# Obscure: while _decoding does have the key '\x80', its value is 0,
# so the "or" here calls _decode('\x80') anyway.
return [get(p) or _decode(p) for p in _prog.findall(code)]
_decoding = {} # Filled later
def _decode(s):
if s == '\x80':
# See comment in decode(). This is here to allow a trick to work.
return 0
if len(s) == 3:
a, b, c = map(ord, s)
assert a & 0xF0 == 0xE0 and not b & 0x80 and not c & 0x80
return ((a & 0xF) << 14) | (b << 7) | c
assert len(s) == 4, `s`
a, b, c, d = map(ord, s)
assert a & 0xF8 == 0xF0 and not b & 0x80 and not c & 0x80 and not d & 0x80
return ((a & 0x7) << 21) | (b << 14) | (c << 7) | d
def _fill():
global _encoding
for i in range(0x40):
s = chr(i + 0x80)
_encoding[i] = s
_decoding[s] = i
for i in range(0x40, 0x1000):
hi, lo = divmod(i, 0x80)
s = chr(hi + 0xC0) + chr(lo)
_encoding[i] = s
_decoding[s] = i
_encoding = tuple(_encoding)
_fill()
def test():
for i in range(2**20):
if i % 1000 == 0: print i
wids = [i]
code = encode(wids)
assert decode(code) == wids, (wids, code, decode(code))
if __name__ == "__main__":
test()
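# Usage sketch: encode two made-up wid lists and use ordinary string
# .find() to test phrase containment, which is how search_phrase() in the
# index modules above uses these encodings. The _demo_ helper is
# illustrative only.
def _demo_widcode():
    doc = encode([5, 1000, 70000, 5, 12])
    phrase = encode([1000, 70000])
    print(doc.find(phrase) >= 0)   # True: the pair occurs contiguously
    print(decode(doc))             # round-trips to the original wid list

if __name__ == "__main__":
    _demo_widcode()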
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""Plug in text index for ZCatalog with relevance ranking."""
import ZODB
from Persistence import Persistent
import Acquisition
from OFS.SimpleItem import SimpleItem
from Products.PluginIndexes.common.PluggableIndex \
import PluggableIndexInterface
from Products.ZCTextIndex.Index import Index
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.ILexicon import ILexicon
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.QueryParser import QueryParser
from Globals import DTMLFile
from Interface import verify_class_implementation
class ZCTextIndex(Persistent, Acquisition.Implicit, SimpleItem):
__implements__ = PluggableIndexInterface
meta_type = 'ZCTextIndex'
manage_options= (
{'label': 'Settings', 'action': 'manage_main'},
)
def __init__(self, id, extra, caller):
self.id = id
self._fieldname = extra.doc_attr
lexicon = getattr(caller, extra.lexicon_id, None)
if lexicon is None:
raise LookupError, 'Lexicon "%s" not found' % extra.lexicon_id
verify_class_implementation(ILexicon, lexicon.__class__)
self.lexicon = lexicon
self.index = Index(self.lexicon)
self.parser = QueryParser()
def index_object(self, docid, obj):
self.index.index_doc(docid, self._get_object_text(obj))
self._p_changed = 1 # XXX
def unindex_object(self, docid):
self.index.unindex_doc(docid)
self._p_changed = 1 # XXX
def _apply_index(self, req):
pass # XXX
def query(self, query, nbest=10):
# returns a mapping from docids to scores
tree = self.parser.parseQuery(query)
results = tree.executeQuery(self.index)
chooser = NBest(nbest)
chooser.addmany(results.items())
return chooser.getbest()
def _get_object_text(self, obj):
x = getattr(obj, self._fieldname)
if callable(x):
return x()
else:
return x
## User Interface Methods ##
manage_main = DTMLFile('dtml/manageZCTextIndex', globals())
def manage_addZCTextIndex(self, id, extra=None, REQUEST=None,
RESPONSE=None):
"""Add a text index"""
return self.manage_addIndex(id, 'ZCTextIndex', extra,
REQUEST, RESPONSE, REQUEST.URL3)
manage_addZCTextIndexForm = DTMLFile('dtml/addZCTextIndex', globals())
manage_addLexiconForm = DTMLFile('dtml/addLexicon', globals())
def manage_addLexicon(self, id, title, splitter=None, normalizer=None,
stopword=None, REQUEST=None):
elements = []
if splitter:
elements.append(Splitter())
if normalizer:
elements.append(CaseNormalizer())
if stopword:
elements.append(StopWordRemover())
lexicon = Lexicon(*elements)
self._setObject(id, lexicon)
if REQUEST is not None:
return self.manage_main(self, REQUEST, update_menu=1)
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""ZCatalog Text Index
Experimental plugin text index for ZCatalog.
"""
def initialize(context):
from Products.ZCTextIndex import ZCTextIndex
context.registerClass(
ZCTextIndex.ZCTextIndex,
permission='Add Pluggable Index',
constructors=(ZCTextIndex.manage_addZCTextIndexForm,
ZCTextIndex.manage_addZCTextIndex),
visibility=None
)
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add Lexicon',
)">
<form action="manage_addLexicon" method="post">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Title
</div>
</td>
<td align="left" valign="top">
<input type="text" name="title" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
splitter?
</div>
</td>
<td align="left" valign="top">
<input type="checkbox" name="splitter" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
case normalizer?
</div>
</td>
<td align="left" valign="top">
<input type="checkbox" name="normalizer" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
remove stop words?
</div>
</td>
<td align="left" valign="top">
<input type="checkbox" name="stopword" />
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var "manage_form_title(this(), _,
form_title='Add ZCTextIndex',
)">
<p class="form-help">
<strong>Text Indexes</strong> break text up into individual words, and
are often referred to as full-text indexes. Text indexes
sort results by score, meaning they return hits in order
from the most relevant to the least relevant.
</p>
<form action="manage_addZCTextIndex" method="post"
enctype="multipart/form-data">
<table cellspacing="0" cellpadding="2" border="0">
<tr>
<td align="left" valign="top">
<div class="form-label">
Id
</div>
</td>
<td align="left" valign="top">
<input type="text" name="id" size="40" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Vocabulary
</div>
</td>
<td>
<select name="extra.vocabulary:record">
<dtml-in "this().aq_parent.objectItems('Vocabulary')">
<option value="&dtml-sequence-key;">&dtml-sequence-key; (<dtml-var "_['sequence-item'].title">)
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Field name
</div></td>
<td align="left" valign="top">
<input type="text" name="extra.doc_attr:record" size="40" />
</td>
</tr>
<tr>
<td align="left" valign"top">
<div class="form-label">
Lexicon
</div></td>
<td>
<select name="extra.lexicon_id:record">
<dtml-in "this().aq_parent.objectItems('Lexicon')">
<option value="&dtml-sequence-key;">&dtml-sequence-key; (<dtml-var "_['sequence-item'].title">)
</dtml-in>
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-optional">
Type
</div>
</td>
<td align="left" valign="top">
ZCTextIndex
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
<td align="left" valign="top">
<div class="form-element">
<input class="form-element" type="submit" name="submit"
value=" Add " />
</div>
</td>
</tr>
</table>
</form>
<dtml-var manage_page_footer>
<dtml-var manage_page_header>
<dtml-var manage_tabs>
<p class="form-help">
There is nothing to manage here. Move along.
</p>
<dtml-var manage_page_footer>
/* stopper.c
*
* Fast version of the StopWordRemover object.
*/
#include "Python.h"
#include "structmember.h"
typedef struct {
PyObject_HEAD
PyObject *swr_dict;
} StopWordRemover;
static PyObject *
swr_process(StopWordRemover *self, PyObject *args)
{
PyObject *result = NULL;
PyObject *seq;
int len, i;
if (!PyArg_ParseTuple(args, "O:process", &seq))
return NULL;
seq = PySequence_Fast(seq,
"process() requires a sequence as the argument");
if (seq == NULL)
return NULL;
result = PyList_New(0);
if (result == NULL)
goto finally;
#if PY_VERSION_HEX >= 0x02020000
/* Only available in Python 2.2 and newer. */
len = PySequence_Fast_GET_SIZE(seq);
#else
len = PyObject_Length(seq);
#endif
for (i = 0; i < len; ++i) {
PyObject *s = PySequence_Fast_GET_ITEM(seq, i);
/*
* PyDict_GetItem() returns NULL if there isn't a matching
* item, but without setting an exception, so this does what
* we want.
*/
if (PyDict_GetItem(self->swr_dict, s) == NULL)
if (PyList_Append(result, s) < 0) {
Py_DECREF(result);
result = NULL;
goto finally;
}
}
finally:
Py_XDECREF(seq);
return result;
}
static struct memberlist swr_members[] = {
{"dict", T_OBJECT, offsetof(StopWordRemover, swr_dict), READONLY},
{NULL}
};
static PyMethodDef swr_methods[] = {
{"process", (PyCFunction)swr_process, METH_VARARGS,
"process([str, ...]) --> [str, ...]\n"
"Remove stop words from the input list of strings to create a new list."},
{NULL}
};
static PyObject *
swr_getattr(PyObject *self, char *name)
{
PyObject *res;
res = Py_FindMethod(swr_methods, self, name);
if (res != NULL)
return res;
PyErr_Clear();
return PyMember_Get((char *)self, swr_members, name);
}
static void
swr_dealloc(StopWordRemover *self)
{
Py_XDECREF(self->swr_dict);
PyObject_Del(self);
}
static PyTypeObject StopWordRemover_Type = {
PyObject_HEAD_INIT(NULL) /* ob_type */
0, /* ob_size */
"stopper.StopWordRemover", /* tp_name */
sizeof(StopWordRemover), /* tp_basicsize */
0, /* tp_itemsize */
(destructor)swr_dealloc, /* tp_dealloc */
0, /* tp_print */
(getattrfunc)swr_getattr, /* tp_getattr */
0, /* tp_setattr */
};
static PyObject *
swr_new(PyObject *notused, PyObject *args)
{
StopWordRemover *swr = NULL;
PyObject *dict = NULL;
if (PyArg_ParseTuple(args, "|O!:new", &PyDict_Type, &dict)) {
swr = PyObject_New(StopWordRemover, &StopWordRemover_Type);
if (swr != NULL) {
if (dict != NULL) {
Py_INCREF(dict);
swr->swr_dict = dict;
}
else {
swr->swr_dict = PyDict_New();
if (swr->swr_dict == NULL) {
Py_DECREF(swr);
swr = NULL;
}
}
}
}
return (PyObject *) swr;
}
static PyObject*
pickle_constructor = NULL;
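/*
 * Pickle support: copy_reg registers _pickler (below) for the
 * StopWordRemover type, so an instance is reduced to the pair
 * (stopper.new, (dict,)) and unpickling simply calls stopper.new(dict)
 * to rebuild an equivalent remover.
 */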
PyObject *
swr_pickler(PyObject *unused, PyObject *args)
{
StopWordRemover *swr;
PyObject *result = NULL;
if (PyArg_ParseTuple(args, "O!:_pickler", &StopWordRemover_Type, &swr)) {
result = Py_BuildValue("O(O)", pickle_constructor, swr->swr_dict);
}
return result;
}
static PyMethodDef stopper_functions[] = {
{"new", swr_new, METH_VARARGS,
"new() -> StopWordRemover instance\n"
"Create & return a new stop-word remover."},
{"_pickler", swr_pickler, METH_VARARGS,
"_pickler(StopWordRemover instance) -> pickle magic\n"
"Internal magic used to make stop-word removers picklable."},
{NULL}
};
void
initstopper(void)
{
PyObject *m, *copy_reg;
StopWordRemover_Type.ob_type = &PyType_Type;
m = Py_InitModule3("stopper", stopper_functions,
"Fast StopWordRemover implementation.");
if (m == NULL)
return;
if (PyObject_SetAttrString(m, "StopWordRemoverType",
(PyObject *) &StopWordRemover_Type) < 0)
return;
/* register to support pickling */
copy_reg = PyImport_ImportModule("copy_reg");
if (copy_reg != NULL) {
PyObject *pickler;
if (pickle_constructor == NULL) {
pickle_constructor = PyObject_GetAttrString(m, "new");
Py_XINCREF(pickle_constructor);
}
pickler = PyObject_GetAttrString(m, "_pickler");
if ((pickle_constructor != NULL) && (pickler != NULL)) {
PyObject *res;
res = PyObject_CallMethod(
copy_reg, "pickle", "OOO", &StopWordRemover_Type,
pickler, pickle_constructor);
Py_XDECREF(res);
}
Py_DECREF(copy_reg);
}
}
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
"""
Revision information:
$Id: __init__.py,v 1.2 2002/05/14 15:12:34 gvanrossum Exp $
"""
#! /usr/bin/env python
import cPickle
import os.path
import sys
from hotshot.log import LogReader
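# The hotshot log yields (what, (filename, lineno, funcname), tdelta) events.
# load_line_info() charges each positive time delta to the previously seen
# location, accumulating a (total time, hit count) pair per source line.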
def load_line_info(log):
byline = {}
prevloc = None
for what, place, tdelta in log:
if tdelta > 0:
t, nhits = byline.get(prevloc, (0, 0))
byline[prevloc] = (tdelta + t), (nhits + 1)
prevloc = place
return byline
def basename(path, cache={}):
try:
return cache[path]
except KeyError:
fn = os.path.split(path)[1]
cache[path] = fn
return fn
def print_results(results):
for info, place in results:
if not place:
print 'Bad unpack:', info, place
continue
filename, line, funcname = place
print '%8d %8d' % info, basename(filename), line
def annotate_results(results):
files = {}
for stats, place in results:
if not place:
continue
time, hits = stats
file, line, func = place
l = files.get(file)
if l is None:
l = files[file] = []
l.append((line, hits, time))
order = files.keys()
order.sort()
for k in order:
if os.path.exists(k):
v = files[k]
v.sort()
annotate(k, v)
def annotate(file, lines):
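# 'lines' is a list of (lineno, hits, time) tuples sorted by line number;
# entries are consumed as the matching source line is reached, so every line
# of the file is printed with its per-line figures (or blank padding) in front.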
print "-" * 60
print file
print "-" * 60
f = open(file)
i = 1
match = lines[0][0]
for line in f:
if match == i:
print "%6d %8d " % lines[0][1:], line,
del lines[0]
if lines:
match = lines[0][0]
else:
match = None
else:
print " " * 16, line,
i += 1
print
def get_cache_name(filename):
d, fn = os.path.split(filename)
cache_dir = os.path.join(d, '.hs-tool')
cache_file = os.path.join(cache_dir, fn)
return cache_dir, cache_file
def cache_results(filename, results):
cache_dir, cache_file = get_cache_name(filename)
if not os.path.exists(cache_dir):
os.mkdir(cache_dir)
fp = open(cache_file, 'wb')
try:
cPickle.dump(results, fp, 1)
finally:
fp.close()
def main(filename, annotate):
cache_dir, cache_file = get_cache_name(filename)
if ( os.path.isfile(cache_file)
and os.path.getmtime(cache_file) > os.path.getmtime(filename)):
# cached data is up-to-date:
fp = open(cache_file, 'rb')
results = cPickle.load(fp)
fp.close()
else:
log = LogReader(filename)
byline = load_line_info(log)
# Sort
results = [(v, k) for k, v in byline.items()]
results.sort()
cache_results(filename, results)
if annotate:
annotate_results(results)
else:
print_results(results)
if __name__ == "__main__":
import getopt
annotate_p = 0
opts, args = getopt.getopt(sys.argv[1:], 'A')
for o, v in opts:
if o == '-A':
annotate_p = 1
if args:
filename, = args
else:
filename = "profile.dat"
main(filename, annotate_p)
#! /usr/bin/env python
"""Index a collection of HTML files on the filesystem.
usage: indexhtml.py [options] dir
Will create an index of all files in dir or its subdirectories.
options:
-f data.fs -- the path to the filestorage datafile
"""
import os
import ZODB
from ZODB.FileStorage import FileStorage
from BTrees.IOBTree import IOBTree
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from Products.ZCTextIndex.HTMLSplitter import HTMLWordSplitter
from Products.ZCTextIndex.Lexicon import Lexicon, StopWordRemover
def make_index():
# there's an elaborate dance necessary to construct an index
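# The Struct instances below stand in for what Zope normally supplies:
# 'extra' mimics the record from the add-index form (doc_attr, lexicon_id),
# and 'caller' plays the acquisition context from which ZCTextIndex looks
# up the lexicon by the attribute named in lexicon_id.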
class Struct:
pass
extra = Struct()
extra.doc_attr = "read"
extra.lexicon_id = "lexicon"
caller = Struct()
caller.lexicon = Lexicon(HTMLWordSplitter(), StopWordRemover())
return ZCTextIndex("read", extra, caller)
def main(db, rt, dir):
rt["index"] = index = make_index()
rt["files"] = paths = IOBTree()
get_transaction().commit()
files = [os.path.join(dir, file) for file in os.listdir(dir)]
docid = 0
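# Walk the directory tree by appending subdirectory entries to the list
# being iterated over; only files ending in ".html" are indexed.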
for file in files:
if os.path.isdir(file):
files += [os.path.join(file, sub) for sub in os.listdir(file)]
else:
if not file.endswith(".html"):
continue
docid += 1
print "%5d" % docid, file
f = open(file, "rb")
paths[docid] = file
index.index_object(docid, f)
f.close()
if docid % TXN_INTERVAL == 0:
get_transaction().commit()
if docid % PACK_INTERVAL == 0:
db.pack()
get_transaction().commit()
if __name__ == "__main__":
import sys
import getopt
VERBOSE = 0
FSPATH = "Data.fs"
TXN_INTERVAL = 100
PACK_INTERVAL = 500
try:
opts, args = getopt.getopt(sys.argv[1:], 'vf:')
except getopt.error, msg:
print msg
print __doc__
sys.exit(2)
for o, v in opts:
if o == '-v':
VERBOSE += 1
if o == '-f':
FSPATH = v
if len(args) != 1:
print "Expected on argument"
print __doc__
sys.exit(2)
dir = args[0]
fs = FileStorage(FSPATH)
db = ZODB.DB(fs)
cn = db.open()
rt = cn.root()
dir = os.path.join(os.getcwd(), dir)
print dir
main(db, rt, dir)
cn.close()
fs.close()
"""Test an index with a Unix mailbox file.
usage: python mailtest.py [options] <data.fs>
options:
-v -- verbose
-n NNN -- max number of messages to read from mailbox
-q query
-i mailbox
-p NNN -- pack <data.fs> every NNN messages (default: 500), and at end
-p 0 -- don't pack at all
-b NNN -- return the NNN best matches (default: 10)
-x -- exclude the message text from the data.fs
-t NNN -- commit a transaction every NNN messages (default: 1)
The script either indexes or queries depending on whether -q or -i is
passed as an option.
For -i mailbox, the script reads mail messages from the mailbox and
indexes them. It indexes one message at a time, then commits the
transaction.
For -q query, it performs a query on an existing index.
If both are specified, the indexing is performed first.
You can also interact with the index after it is completed. Load the
index from the database:
import ZODB
from ZODB.FileStorage import FileStorage
fs = FileStorage(<data.fs>)
db = ZODB.DB(fs)
index = db.open().root()["index"]
index.query("python AND unicode")
"""
import ZODB
import ZODB.FileStorage
from Products.ZCTextIndex.Lexicon import Lexicon, \
CaseNormalizer, Splitter, StopWordRemover
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from BTrees.IOBTree import IOBTree
import sys
import mailbox
import time
def usage(msg):
print msg
print __doc__
sys.exit(2)
class Message:
total_bytes = 0
def __init__(self, msg):
subject = msg.getheader('subject', '')
author = msg.getheader('from', '')
if author:
summary = "%s (%s)\n" % (subject, author)
else:
summary = "%s\n" % subject
self.text = summary + msg.fp.read()
Message.total_bytes += len(self.text)
class Extra:
pass
def index(rt, mboxfile, db):
global NUM
idx_time = 0
pack_time = 0
lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
extra = Extra()
extra.lexicon_id = 'lexicon'
extra.doc_attr = 'text'
caller = Extra()
caller.lexicon = lexicon
rt["index"] = idx = ZCTextIndex("index", extra, caller)
if not EXCLUDE_TEXT:
rt["documents"] = docs = IOBTree()
get_transaction().commit()
mbox = mailbox.UnixMailbox(open(mboxfile))
if VERBOSE:
print "opened", mboxfile
if not NUM:
NUM = sys.maxint
i = 0
while i < NUM:
_msg = mbox.next()
if _msg is None:
break
i += 1
msg = Message(_msg)
if VERBOSE >= 2:
print "indexing msg", i
i0 = time.clock()
idx.index_object(i, msg)
if not EXCLUDE_TEXT:
docs[i] = msg
if i % TXN_SIZE == 0:
get_transaction().commit()
i1 = time.clock()
idx_time += i1 - i0
if VERBOSE and i % 50 == 0:
print i, "messages indexed"
print "cache size", db.cacheSize()
if PACK_INTERVAL and i % PACK_INTERVAL == 0:
if VERBOSE >= 2:
print "packing..."
p0 = time.clock()
db.pack(time.time())
p1 = time.clock()
if VERBOSE:
print "pack took %s sec" % (p1 - p0)
pack_time += p1 - p0
get_transaction().commit()
if PACK_INTERVAL and i % PACK_INTERVAL != 0:
if VERBOSE >= 2:
print "packing one last time..."
p0 = time.clock()
db.pack(time.time())
p1 = time.clock()
if VERBOSE:
print "pack took %s sec" % (p1 - p0)
pack_time += p1 - p0
if VERBOSE:
print "Index time", idx_time
print "Index bytes", Message.total_bytes
rate = (Message.total_bytes / idx_time) / 1024
print "Index rate %d KB/sec" % int(rate)
def query(rt, query_str):
idx = rt["index"]
docs = rt["documents"]
results = idx.query(query_str, BEST)
print "query:", query_str
print "# results:", len(results)
for docid, score in results:
print "docid %4d score %2d" % (docid, score)
if VERBOSE:
msg = docs[docid]
# print the first CONTEXT lines of the message as context
CONTEXT = 5
ctx = msg.text.split("\n", CONTEXT)
del ctx[-1]
print "-" * 60
print "message:"
for l in ctx:
print l
print "-" * 60
def main(fs_path, mbox_path, query_str):
f = ZODB.FileStorage.FileStorage(fs_path)
db = ZODB.DB(f, cache_size=CACHE_SIZE)
cn = db.open()
rt = cn.root()
if mbox_path is not None:
index(rt, mbox_path, db)
if query_str is not None:
query(rt, query_str)
cn.close()
db.close()
f.close()
if __name__ == "__main__":
import getopt
NUM = 0
BEST = 10
VERBOSE = 0
PACK_INTERVAL = 500
EXCLUDE_TEXT = 0
CACHE_SIZE = 10000
TXN_SIZE = 1
query_str = None
mbox_path = None
profile = None
old_profile = None
try:
opts, args = getopt.getopt(sys.argv[1:], 'vn:p:i:q:b:xt:',
['profile=', 'old-profile='])
except getopt.error, msg:
usage(msg)
if len(args) != 1:
usage("exactly 1 filename argument required")
for o, v in opts:
if o == '-n':
NUM = int(v)
elif o == '-v':
VERBOSE += 1
elif o == '-p':
PACK_INTERVAL = int(v)
elif o == '-q':
query_str = v
elif o == '-i':
mbox_path = v
elif o == '-b':
BEST = int(v)
elif o == '-x':
EXCLUDE_TEXT = 1
elif o == '-t':
TXN_SIZE = int(v)
elif o == '--profile':
profile = v
elif o == '--old-profile':
old_profile = v
fs_path, = args
if profile:
import hotshot
profiler = hotshot.Profile(profile, lineevents=1, linetimings=1)
profiler.runcall(main, fs_path, mbox_path, query_str)
profiler.close()
elif old_profile:
import profile, pstats
profiler = profile.Profile()
profiler.runcall(main, fs_path, mbox_path, query_str)
profiler.dump_stats(old_profile)
stats = pstats.Stats(old_profile)
stats.strip_dirs().sort_stats('time').print_stats(20)
else:
main(fs_path, mbox_path, query_str)
#! /usr/bin/env python2.1
"""MH mail indexer."""
import re
import sys
import time
import mhlib
import getopt
import traceback
from StringIO import StringIO
DATAFS = "/home/guido/.Data.fs"
ZOPECODE = "/home/guido/projects/ds9/lib/python"
sys.path.append(ZOPECODE)
from ZODB import DB
from ZODB.FileStorage import FileStorage
from Persistence import Persistent
from BTrees.IOBTree import IOBTree
from BTrees.OIBTree import OIBTree
from Products.ZCTextIndex.NBest import NBest
from Products.ZCTextIndex.OkapiIndex import Index
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.StopDict import get_stopdict
NBEST = 3
MAXLINES = 3
def main():
try:
opts, args = getopt.getopt(sys.argv[1:], "bd:m:n:Opu")
except getopt.error, msg:
print msg
sys.exit(2)
update = 0
bulk = 0
optimize = 0
nbest = NBEST
maxlines = MAXLINES
datafs = DATAFS
pack = 0
for o, a in opts:
if o == "-b":
bulk = 1
if o == "-d":
datafs = a
if o == "-m":
maxlines = int(a)
if o == "-n":
nbest = int(a)
if o == "-O":
optimize = 1
if o == "-p":
pack = 1
if o == "-u":
update = 1
ix = Indexer(datafs, update or bulk)
if bulk:
if optimize:
ix.optimize(args)
ix.bulkupdate(args)
elif update:
ix.update(args)
if pack:
ix.pack()
elif args:
for i in range(len(args)):
a = args[i]
if " " in a:
if a[0] == "-":
args[i] = '-"' + a[1:] + '"'
else:
args[i] = '"' + a + '"'
ix.query(" ".join(args), nbest, maxlines)
else:
ix.interact(nbest)
class Indexer:
filestorage = database = connection = root = None
def __init__(self, datafs, writable=0):
self.stopdict = get_stopdict()
self.mh = mhlib.MH()
self.filestorage = FileStorage(datafs, read_only=(not writable))
self.database = DB(self.filestorage)
self.connection = self.database.open()
self.root = self.connection.root()
try:
self.index = self.root["index"]
except KeyError:
self.index = self.root["index"] = TextIndex()
try:
self.docpaths = self.root["docpaths"]
except KeyError:
self.docpaths = self.root["docpaths"] = IOBTree()
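# path2docid is an in-memory reverse map (path -> docid); it is rebuilt
# from the persistent docpaths mapping on every run rather than stored
# in the database.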
self.path2docid = OIBTree()
for docid in self.docpaths.keys():
path = self.docpaths[docid]
self.path2docid[path] = docid
try:
self.maxdocid = max(self.docpaths.keys())
except ValueError:
self.maxdocid = 0
print len(self.docpaths), "Document ids"
print len(self.path2docid), "Pathnames"
def close(self):
self.root = None
if self.connection is not None:
self.connection.close()
self.connection = None
if self.database is not None:
self.database.close()
self.database = None
if self.filestorage is not None:
self.filestorage.close()
self.filestorage = None
def interact(self, nbest=NBEST, maxlines=MAXLINES):
try:
import readline
except ImportError:
pass
text = ""
top = 0
while 1:
try:
line = raw_input("Query: ")
except EOFError:
print "\nBye."
break
line = line.strip()
if line:
text = line
top = 0
else:
if not text:
continue
try:
n, results = self.timequery(text, top + nbest)
except:
reportexc()
text = ""
top = 0
continue
if len(results) <= top:
if not n:
print "No hits for %r." % text
else:
print "No more hits for %r." % text
text = ""
top = 0
continue
print "[Results %d-%d from %d" % (top+1, min(n, top+nbest), n),
print "for query %s]" % repr(text)
self.formatresults(text, results, maxlines, top, top+nbest)
top += nbest
def query(self, text, nbest=NBEST, maxlines=MAXLINES):
n, results = self.timequery(text, nbest)
if not n:
print "No hits for %r." % text
return
print "[Results 1-%d from %d]" % (len(results), n)
self.formatresults(text, results, maxlines)
def timequery(self, text, nbest):
t0 = time.time()
c0 = time.clock()
n, results = self.index.query(text, nbest)
t1 = time.time()
c1 = time.clock()
print "[Query time: %.3f real, %.3f user]" % (t1-t0, c1-c0)
return n, results
def formatresults(self, text, results, maxlines=MAXLINES,
lo=0, hi=sys.maxint):
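# Build a case-insensitive regex from the (non-stop) query words, with a
# trailing * expanded to .*, and use it to pick the lines shown as context
# for each hit.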
stop = self.stopdict.has_key
words = [w for w in re.findall(r"\w+\*?", text.lower()) if not stop(w)]
pattern = r"\b(" + "|".join(words) + r")\b"
pattern = pattern.replace("*", ".*") # glob -> re syntax
prog = re.compile(pattern, re.IGNORECASE)
print '='*70
rank = lo
qw = max(1, self.index.query_weight(text))
factor = 100.0 / qw / 1024
for docid, score in results[lo:hi]:
rank += 1
path = self.docpaths[docid]
score = min(100, int(score * factor))
print "Rank: %d Score: %d%% File: %s" % (rank, score, path)
fp = open(path)
msg = mhlib.Message("<folder>", 0, fp)
for header in "From", "To", "Cc", "Bcc", "Subject", "Date":
h = msg.getheader(header)
if h:
print "%-8s %s" % (header+":", h)
text = self.getmessagetext(msg)
if text:
print
nleft = maxlines
for part in text:
for line in part.splitlines():
if prog.search(line):
print line
nleft -= 1
if nleft <= 0:
break
if nleft <= 0:
break
print '-'*70
def update(self, args):
folder = None
seqs = []
for arg in args:
if arg.startswith("+"):
if folder is None:
folder = arg[1:]
else:
print "only one folder at a time"
return
else:
seqs.append(arg)
if not folder:
folder = self.mh.getcontext()
if not seqs:
seqs = ['all']
try:
f = self.mh.openfolder(folder)
except mhlib.Error, msg:
print msg
return
dict = {}
for seq in seqs:
try:
nums = f.parsesequence(seq)
except mhlib.Error, msg:
print msg or "unparsable message sequence: %s" % `seq`
return
for n in nums:
dict[n] = n
msgs = dict.keys()
msgs.sort()
self.updatefolder(f, msgs)
def optimize(self, args):
uniqwords = {}
for folder in args:
if folder.startswith("+"):
folder = folder[1:]
print "\nOPTIMIZE FOLDER", folder
try:
f = self.mh.openfolder(folder)
except mhlib.Error, msg:
print msg
continue
self.prescan(f, f.listmessages(), uniqwords)
L = [(uniqwords[word], word) for word in uniqwords.keys()]
L.sort()
L.reverse()
for i in range(100):
print "%3d. %6d %s" % ((i+1,) + L[i])
self.index.lexicon.sourceToWordIds([word for (count, word) in L])
def prescan(self, f, msgs, uniqwords):
pipeline = [Splitter(), CaseNormalizer(), StopWordRemover()]
for n in msgs:
print "prescanning", n
m = f.openmessage(n)
text = self.getmessagetext(m)
for p in pipeline:
text = p.process(text)
for word in text:
uniqwords[word] = uniqwords.get(word, 0) + 1
def bulkupdate(self, args):
chunk = 5000
target = len(self.docpaths) + chunk
for folder in args:
if len(self.docpaths) >= target:
self.pack()
target = len(self.docpaths) + chunk
if folder.startswith("+"):
folder = folder[1:]
print "\nFOLDER", folder
try:
f = self.mh.openfolder(folder)
except mhlib.Error, msg:
print msg
continue
self.updatefolder(f, f.listmessages())
print "Total", len(self.docpaths)
self.pack()
def updatefolder(self, f, msgs):
done = 0
new = 0
for n in msgs:
print "indexing", n
m = f.openmessage(n)
text = self.getmessagetext(m)
path = f.getmessagefilename(n)
self.unindexpath(path)
if not text:
continue
docid = self.newdocid(path)
self.index.index_text(docid, text)
done += 1
new = 1
if done%500 == 0:
self.commit()
new = 0
if new:
self.commit()
print "done."
def unindexpath(self, path):
if self.path2docid.has_key(path):
docid = self.path2docid[path]
print "unindexing", docid, path
del self.docpaths[docid]
del self.path2docid[path]
try:
self.index.unindex(docid)
except KeyError, msg:
print "KeyError", msg
def getmessagetext(self, m):
L = []
try:
self.getmsgparts(m, L, 0)
except:
print "(getmsgparts failed:)"
reportexc()
return L
def getmsgparts(self, m, L, level):
ctype = m.gettype()
if level or ctype != "text/plain":
print ". "*level + str(ctype)
if ctype == "text/plain":
L.append(m.getbodytext())
elif ctype in ("multipart/alternative", "multipart/mixed"):
for part in m.getbodyparts():
self.getmsgparts(part, L, level+1)
elif ctype == "message/rfc822":
f = StringIO(m.getbodytext())
m = mhlib.Message("<folder>", 0, f)
self.getmsgparts(m, L, level+1)
def newdocid(self, path):
docid = self.maxdocid + 1
self.maxdocid = docid
self.docpaths[docid] = path
self.path2docid[path] = docid
return docid
def commit(self):
print "committing..."
get_transaction().commit()
def pack(self):
print "packing..."
self.database.pack()
class TextIndex(Persistent):
def __init__(self):
self.lexicon = Lexicon(Splitter(), CaseNormalizer(), StopWordRemover())
self.index = Index(self.lexicon)
def index_text(self, docid, text):
self.index.index_doc(docid, text)
self._p_changed = 1 # XXX
def unindex(self, docid):
self.index.unindex_doc(docid)
self._p_changed = 1 # XXX
def query(self, query, nbest=10):
# returns a total hit count and a list of (docid, score) pairs, best first
parser = QueryParser()
tree = parser.parseQuery(query)
results = tree.executeQuery(self.index)
chooser = NBest(nbest)
chooser.addmany(results.items())
return len(results), chooser.getbest()
def query_weight(self, query):
parser = QueryParser()
tree = parser.parseQuery(query)
terms = tree.terms()
return self.index.query_weight(terms)
def reportexc():
traceback.print_exc()
if __name__ == "__main__":
main()
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.Index import Index
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
class IndexTest(TestCase):
def setUp(self):
self.lexicon = Lexicon(Splitter())
self.index = Index(self.lexicon)
def test_index_document(self, DOCID=1):
doc = "simple document contains five words"
self.index.index_doc(DOCID, doc)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index._get_undoinfo(DOCID)), 5)
for map in self.index._wordinfo.values():
self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID))
def test_unindex_document(self):
DOCID = 1
self.test_index_document(DOCID)
self.index.unindex_doc(DOCID)
self.assertEqual(len(self.index._docweight), 0)
self.assertEqual(len(self.index._wordinfo), 0)
self.assertEqual(len(self.index._docwords), 0)
def test_index_two_documents(self):
self.test_index_document()
doc = "another document just four"
DOCID = 2
self.index.index_doc(DOCID, doc)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 8)
self.assertEqual(len(self.index._docwords), 2)
self.assertEqual(len(self.index._get_undoinfo(DOCID)), 4)
wids = self.lexicon.termToWordIds("document")
self.assertEqual(len(wids), 1)
document_wid = wids[0]
for wid, map in self.index._wordinfo.items():
if wid == document_wid:
self.assertEqual(len(map), 2)
self.assert_(map.has_key(1))
self.assert_(map.has_key(DOCID))
else:
self.assertEqual(len(map), 1)
def test_index_two_unindex_one(self):
# index two documents, unindex one, and test the results
self.test_index_two_documents()
self.index.unindex_doc(1)
DOCID = 2
self.assertEqual(len(self.index._docweight), 1)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 4)
self.assertEqual(len(self.index._docwords), 1)
self.assertEqual(len(self.index._get_undoinfo(DOCID)), 4)
for map in self.index._wordinfo.values():
self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID))
def test_index_duplicated_words(self, DOCID=1):
doc = "very simple repeat repeat repeat document test"
self.index.index_doc(DOCID, doc)
self.assert_(self.index._docweight[DOCID])
self.assertEqual(len(self.index._wordinfo), 5)
self.assertEqual(len(self.index._docwords), 1)
## self.assertEqual(len(self.index._get_undoinfo(DOCID)), 5)
wids = self.lexicon.termToWordIds("repeat")
self.assertEqual(len(wids), 1)
repetitive_wid = wids[0]
for wid, map in self.index._wordinfo.items():
self.assertEqual(len(map), 1)
self.assert_(map.has_key(DOCID))
def test_simple_query_oneresult(self):
self.index.index_doc(1, 'not the same document')
results = self.index.search("document")
self.assertEqual(list(results.keys()), [1])
def test_simple_query_noresults(self):
self.index.index_doc(1, 'not the same document')
results = self.index.search("frobnicate")
self.assertEqual(list(results.keys()), [])
def test_query_oneresult(self):
self.index.index_doc(1, 'not the same document')
self.index.index_doc(2, 'something about something else')
results = self.index.search("document")
self.assertEqual(list(results.keys()), [1])
def test_search_phrase(self):
self.index.index_doc(1, "the quick brown fox jumps over the lazy dog")
self.index.index_doc(2, "the quick fox jumps lazy over the brown dog")
results = self.index.search_phrase("quick brown fox")
self.assertEqual(list(results.keys()), [1])
def test_search_glob(self):
self.index.index_doc(1, "how now brown cow")
self.index.index_doc(2, "hough nough browne cough")
self.index.index_doc(3, "bar brawl")
results = self.index.search_glob("bro*")
self.assertEqual(list(results.keys()), [1, 2])
results = self.index.search_glob("b*")
self.assertEqual(list(results.keys()), [1, 2, 3])
def test_suite():
return makeSuite(IndexTest)
if __name__=='__main__':
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.Lexicon import Lexicon
from Products.ZCTextIndex.Lexicon import Splitter, CaseNormalizer
class StupidPipelineElement:
def __init__(self, fromword, toword):
self.__fromword = fromword
self.__toword = toword
def process(self, seq):
res = []
for term in seq:
if term == self.__fromword:
res.append(self.__toword)
else:
res.append(term)
return res
class WackyReversePipelineElement:
def __init__(self, revword):
self.__revword = revword
def process(self, seq):
res = []
for term in seq:
if term == self.__revword:
x = list(term)
x.reverse()
res.append(''.join(x))
else:
res.append(term)
return res
class StopWordPipelineElement:
def __init__(self, stopdict={}):
self.__stopdict = stopdict
def process(self, seq):
res = []
for term in seq:
if self.__stopdict.get(term):
continue
else:
res.append(term)
return res
class Test(TestCase):
def testSourceToWordIds(self):
lexicon = Lexicon(Splitter())
wids = lexicon.sourceToWordIds('cats and dogs')
self.assertEqual(wids, [1, 2, 3])
def testTermToWordIds(self):
lexicon = Lexicon(Splitter())
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('dogs')
self.assertEqual(wids, [3])
def testMissingTermToWordIds(self):
lexicon = Lexicon(Splitter())
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('boxes')
self.assertEqual(wids, [])
def testOnePipelineElement(self):
lexicon = Lexicon(Splitter(), StupidPipelineElement('dogs', 'fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('fish')
self.assertEqual(wids, [3])
def testSplitterAdaptorFold(self):
lexicon = Lexicon(Splitter(), CaseNormalizer())
wids = lexicon.sourceToWordIds('CATS and dogs')
wids = lexicon.termToWordIds('cats and dogs')
self.assertEqual(wids, [1, 2, 3])
def testSplitterAdaptorNofold(self):
lexicon = Lexicon(Splitter())
wids = lexicon.sourceToWordIds('CATS and dogs')
wids = lexicon.termToWordIds('cats and dogs')
self.assertEqual(wids, [2, 3])
def testTwoElementPipeline(self):
lexicon = Lexicon(Splitter(),
StupidPipelineElement('cats', 'fish'),
WackyReversePipelineElement('fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('hsif')
self.assertEqual(wids, [1])
def testThreeElementPipeline(self):
lexicon = Lexicon(Splitter(),
StopWordPipelineElement({'and':1}),
StupidPipelineElement('dogs', 'fish'),
WackyReversePipelineElement('fish'))
wids = lexicon.sourceToWordIds('cats and dogs')
wids = lexicon.termToWordIds('hsif')
self.assertEqual(wids, [2])
def test_suite():
return makeSuite(Test)
if __name__=='__main__':
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.NBest import NBest
class NBestTest(TestCase):
def testConstructor(self):
self.assertRaises(ValueError, NBest, 0)
self.assertRaises(ValueError, NBest, -1)
for n in range(1, 11):
nb = NBest(n)
self.assertEqual(len(nb), 0)
self.assertEqual(nb.capacity(), n)
def testOne(self):
nb = NBest(1)
nb.add('a', 0)
self.assertEqual(nb.getbest(), [('a', 0)])
nb.add('b', 1)
self.assertEqual(len(nb), 1)
self.assertEqual(nb.capacity(), 1)
self.assertEqual(nb.getbest(), [('b', 1)])
nb.add('c', -1)
self.assertEqual(len(nb), 1)
self.assertEqual(nb.capacity(), 1)
self.assertEqual(nb.getbest(), [('b', 1)])
nb.addmany([('d', 3), ('e', -6), ('f', 5), ('g', 4)])
self.assertEqual(len(nb), 1)
self.assertEqual(nb.capacity(), 1)
self.assertEqual(nb.getbest(), [('f', 5)])
def testMany(self):
import random
inputs = [(-i, i) for i in range(50)]
reversed_inputs = inputs[:]
reversed_inputs.reverse()
# Test the N-best for a variety of n (1, 6, 11, ... 50).
for n in range(1, len(inputs)+1, 5):
expected = inputs[-n:]
expected.reverse()
random_inputs = inputs[:]
random.shuffle(random_inputs)
for source in inputs, reversed_inputs, random_inputs:
# Try feeding them one at a time.
nb = NBest(n)
for item, score in source:
nb.add(item, score)
self.assertEqual(len(nb), n)
self.assertEqual(nb.capacity(), n)
self.assertEqual(nb.getbest(), expected)
# And again in one gulp.
nb = NBest(n)
nb.addmany(source)
self.assertEqual(len(nb), n)
self.assertEqual(nb.capacity(), n)
self.assertEqual(nb.getbest(), expected)
for i in range(1, n+1):
self.assertEqual(nb.pop_smallest(), expected[-i])
self.assertRaises(IndexError, nb.pop_smallest)
def test_suite():
return makeSuite(NBestTest)
if __name__=='__main__':
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from BTrees.IIBTree import IIBucket
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.ParseTree import ParseError, QueryError
class FauxIndex:
def search(self, term):
b = IIBucket()
if term == "foo":
b[1] = b[3] = 1
elif term == "bar":
b[1] = b[2] = 1
elif term == "ham":
b[1] = b[2] = b[3] = b[4] = 1
return b
class TestQueryEngine(TestCase):
def setUp(self):
self.parser = QueryParser()
self.index = FauxIndex()
def compareSet(self, set, dict):
d = {}
for k, v in set.items():
d[k] = v
self.assertEqual(d, dict)
def compareQuery(self, query, dict):
tree = self.parser.parseQuery(query)
set = tree.executeQuery(self.index)
self.compareSet(set, dict)
def testExecuteQuery(self):
self.compareQuery("foo AND bar", {1: 2})
self.compareQuery("foo OR bar", {1: 2, 2: 1, 3:1})
self.compareQuery("foo AND NOT bar", {3: 1})
self.compareQuery("foo AND foo AND foo", {1: 3, 3: 3})
self.compareQuery("foo OR foo OR foo", {1: 3, 3: 3})
self.compareQuery("ham AND NOT foo AND NOT bar", {4: 1})
self.compareQuery("ham OR foo OR bar", {1: 3, 2: 2, 3: 2, 4: 1})
self.compareQuery("ham AND foo AND bar", {1: 3})
def testInvalidQuery(self):
from Products.ZCTextIndex.ParseTree import NotNode, AtomNode
tree = NotNode(AtomNode("foo"))
self.assertRaises(QueryError, tree.executeQuery, self.index)
def test_suite():
return makeSuite(TestQueryEngine)
if __name__=='__main__':
main(defaultTest='test_suite')
##############################################################################
#
# Copyright (c) 2001, 2002 Zope Corporation and Contributors.
# All Rights Reserved.
#
# This software is subject to the provisions of the Zope Public License,
# Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
# THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
# WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
# FOR A PARTICULAR PURPOSE.
#
##############################################################################
from unittest import TestCase, TestSuite, main, makeSuite
from Products.ZCTextIndex.QueryParser import QueryParser
from Products.ZCTextIndex.ParseTree import ParseError, ParseTreeNode
from Products.ZCTextIndex.ParseTree import OrNode, AndNode, NotNode
from Products.ZCTextIndex.ParseTree import AtomNode, PhraseNode, GlobNode
class TestQueryParser(TestCase):
def compareParseTrees(self, got, expected):
self.assertEqual(isinstance(got, ParseTreeNode), 1)
self.assertEqual(got.__class__, expected.__class__)
if isinstance(got, PhraseNode):
self.assertEqual(got.nodeType(), "PHRASE")
self.assertEqual(got.getValue(), expected.getValue())
elif isinstance(got, GlobNode):
self.assertEqual(got.nodeType(), "GLOB")
self.assertEqual(got.getValue(), expected.getValue())
elif isinstance(got, AtomNode):
self.assertEqual(got.nodeType(), "ATOM")
self.assertEqual(got.getValue(), expected.getValue())
elif isinstance(got, NotNode):
self.assertEqual(got.nodeType(), "NOT")
self.compareParseTrees(got.getValue(), expected.getValue())
elif isinstance(got, AndNode) or isinstance(got, OrNode):
self.assertEqual(got.nodeType(),
isinstance(got, AndNode) and "AND" or "OR")
list1 = got.getValue()
list2 = expected.getValue()
self.assertEqual(len(list1), len(list2))
for i in range(len(list1)):
self.compareParseTrees(list1[i], list2[i])
def expect(self, input, output):
tree = self.p.parseQuery(input)
self.compareParseTrees(tree, output)
def failure(self, input):
self.assertRaises(ParseError, self.p.parseQuery, input)
def setUp(self):
self.p = QueryParser()
def testParseQuery(self):
self.expect("foo", AtomNode("foo"))
self.expect("note", AtomNode("note"))
self.expect("a and b AND c",
AndNode([AtomNode("a"), AtomNode("b"), AtomNode("c")]))
self.expect("a OR b or c",
OrNode([AtomNode("a"), AtomNode("b"), AtomNode("c")]))
self.expect("a AND b OR c AnD d",
OrNode([AndNode([AtomNode("a"), AtomNode("b")]),
AndNode([AtomNode("c"), AtomNode("d")])]))
self.expect("(a OR b) AND (c OR d)",
AndNode([OrNode([AtomNode("a"), AtomNode("b")]),
OrNode([AtomNode("c"), AtomNode("d")])]))
self.expect("a AND not b",
AndNode([AtomNode("a"), NotNode(AtomNode("b"))]))
self.expect('"foo bar"', PhraseNode("foo bar"))
self.expect("foo bar", AndNode([AtomNode("foo"), AtomNode("bar")]))
self.expect('(("foo bar"))"', PhraseNode("foo bar"))
self.expect("((foo bar))", AndNode([AtomNode("foo"), AtomNode("bar")]))
self.expect('and/', AtomNode("and"))
self.expect("foo-bar", PhraseNode("foo bar"))
self.expect("foo -bar", AndNode([AtomNode("foo"),
NotNode(AtomNode("bar"))]))
self.expect("-foo bar", AndNode([AtomNode("bar"),
NotNode(AtomNode("foo"))]))
self.expect("booh -foo-bar",
AndNode([AtomNode("booh"),
NotNode(PhraseNode("foo bar"))]))
self.expect('booh -"foo bar"',
AndNode([AtomNode("booh"),
NotNode(PhraseNode("foo bar"))]))
self.expect('foo"bar"',
AndNode([AtomNode("foo"), AtomNode("bar")]))
self.expect('"foo"bar',
AndNode([AtomNode("foo"), AtomNode("bar")]))
self.expect('foo"bar"blech',
AndNode([AtomNode("foo"), AtomNode("bar"),
AtomNode("blech")]))
self.expect("foo*", GlobNode("foo*"))
self.expect("foo* bar", AndNode([GlobNode("foo*"),
AtomNode("bar")]))
def testParseFailures(self):
self.failure("")
self.failure("not")
self.failure("OR")
self.failure("AND")
self.failure("not foo")
self.failure(")")
self.failure("(")
self.failure("foo OR")
self.failure("foo AND")
self.failure("OR foo")
self.failure("and foo")
self.failure("(foo) bar")
self.failure("(foo OR)")
self.failure("(foo AND)")
self.failure("(NOT foo)")
self.failure("-foo")
self.failure("-foo -bar")
self.failure('""')
def test_suite():
return makeSuite(TestQueryParser)
if __name__=="__main__":
main(defaultTest='test_suite')
from Products.ZCTextIndex.ZCTextIndex import ZCTextIndex
from Products.ZCTextIndex.tests \
import testIndex, testQueryEngine, testQueryParser
from Products.ZCTextIndex.Index import scaled_int, SCALE_FACTOR
from Products.ZCTextIndex.Lexicon import Lexicon, Splitter
from Products.ZCTextIndex.Lexicon import CaseNormalizer, StopWordRemover
import unittest
class Indexable:
def __init__(self, text):
self.text = text
class LexiconHolder:
def __init__(self, lexicon):
self.lexicon = lexicon
class Extra:
pass
# The test classes below create a ZCTextIndex(). Then they create
# instance variables that point to the internal components used by
# ZCTextIndex. These tests run the individual module unit tests with
# the fully integrated ZCTextIndex.
def eq(scaled1, scaled2, epsilon=scaled_int(0.01)):
if abs(scaled1 - scaled2) > epsilon:
raise AssertionError, "%s != %s" % (scaled1, scaled2)
class IndexTests(testIndex.IndexTest):
def setUp(self):
extra = Extra()
extra.doc_attr = 'text'
extra.lexicon_id = 'lexicon'
caller = LexiconHolder(Lexicon(Splitter(), CaseNormalizer(),
StopWordRemover()))
self.zc_index = ZCTextIndex('name', extra, caller)
self.index = self.zc_index.index
self.lexicon = self.zc_index.lexicon
def testStopWords(self):
# the only non-stopword is question
text = ("to be or not to be "
"that is the question")
doc = Indexable(text)
self.zc_index.index_object(1, doc)
for word in text.split():
if word != "question":
wids = self.lexicon.termToWordIds(word)
self.assertEqual(wids, [])
self.assertEqual(len(self.index._get_undoinfo(1)), 1)
def testRanking(self):
# A fairly involved test of the ranking calculations, based on
# the example set of documents and queries in Managing
# Gigabytes, pp. 180-188.
self.words = ["cold", "days", "eat", "hot", "lot", "nine", "old",
"pease", "porridge", "pot"]
self._ranking_index()
self._ranking_tf()
self._ranking_idf()
self._ranking_queries()
def _ranking_index(self):
docs = ["Pease porridge hot, pease porridge cold,",
"Pease porridge in the pot,",
"Nine days old.",
"In the pot cold, in the pot hot,",
"Pease porridge, pease porridge,",
"Eat the lot."]
for i in range(len(docs)):
self.zc_index.index_object(i + 1, Indexable(docs[i]))
def _ranking_tf(self):
# Matrix of per-document term weights: each row corresponds to a
# docid (1-6) and each column to the matching entry in self.words.
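# The 1.0 entries are terms that appear once and the 1.7 entries terms
# that appear twice, which matches the Managing Gigabytes weighting
# w(d,t) = 1 + ln(f(d,t)); l_Wd holds the corresponding document weights
# (the Euclidean length of each row).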
l_wdt = [(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0),
(0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0),
(1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7),
(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.7, 1.7, 0.0),
(0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0)]
l_Wd = [2.78, 1.73, 1.73, 2.21, 2.39, 1.41]
for i in range(len(l_Wd)):
docid = i + 1
scaled_Wd = scaled_int(l_Wd[i])
eq(scaled_Wd, self.index._get_Wd(docid))
wdts = [scaled_int(t) for t in l_wdt[i]]
for j in range(len(wdts)):
wdt = self.index._get_wdt(docid, self.words[j])
eq(wdts[j], wdt)
def _ranking_idf(self):
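# word_freqs holds f(t), the number of documents containing each word;
# the idfs appear to follow the Managing Gigabytes weight
# w(t) = ln(1 + N/f(t)) with N = 6 documents (e.g. ln(1 + 6/2) ~= 1.39).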
word_freqs = [2, 1, 1, 2, 1, 1, 1, 3, 3, 2]
idfs = [1.39, 1.95, 1.95, 1.39, 1.95, 1.95, 1.95, 1.10, 1.10, 1.39]
for i in range(len(self.words)):
word = self.words[i]
eq(word_freqs[i], self.index._get_ft(word))
eq(scaled_int(idfs[i]), self.index._get_wt(word))
def _ranking_queries(self):
queries = ["eat", "porridge", "hot OR porridge",
"eat OR nine OR day OR old OR porridge"]
wqs = [1.95, 1.10, 1.77, 3.55]
results = [[(6, 0.71)],
[(1, 0.61), (2, 0.58), (5, 0.71)],
[(1, 0.66), (2, 0.36), (4, 0.36), (5, 0.44)],
[(1, 0.19), (2, 0.18), (3, 0.63), (5, 0.22), (6, 0.39)]]
for i in range(len(queries)):
raw = queries[i]
q = self.zc_index.parser.parseQuery(raw)
wq = self.index.query_weight(q.terms())
eq(wq, scaled_int(wqs[i]))
r = self.zc_index.query(raw)
self.assertEqual(len(r), len(results[i]))
# convert the expected results to a dict for easier checking
d = {}
for doc, score in results[i]:
d[doc] = scaled_int(score)
for doc, score in r:
score = scaled_int(float(score / SCALE_FACTOR) / wq)
self.assert_(0 <= score <= SCALE_FACTOR)
eq(d[doc], score)
class QueryTests(testQueryEngine.TestQueryEngine,
testQueryParser.TestQueryParser):
# The FauxIndex in testQueryEngine contains four documents.
# docid 1: foo, bar, ham
# docid 2: bar, ham
# docid 3: foo, ham
# docid 4: ham
docs = ["foo bar ham", "bar ham", "foo ham", "ham"]
def setUp(self):
extra = Extra()
extra.doc_attr = 'text'
extra.lexicon_id = 'lexicon'
caller = LexiconHolder(Lexicon(Splitter(), CaseNormalizer(),
StopWordRemover()))
self.zc_index = ZCTextIndex('name', extra, caller)
self.p = self.parser = self.zc_index.parser
self.index = self.zc_index.index
self.add_docs()
def add_docs(self):
for i in range(len(self.docs)):
text = self.docs[i]
obj = Indexable(text)
self.zc_index.index_object(i + 1, obj)
def compareSet(self, set, dict):
# XXX The FauxIndex and the real Index score documents very
# differently. The set comparison can't actually compare the
# items, but it can compare the keys. That will have to do for now.
d = {}
for k, v in set.items():
d[k] = v
self.assertEqual(d.keys(), dict.keys())
def test_suite():
s = unittest.TestSuite()
for klass in IndexTests, QueryTests:
s.addTest(unittest.makeSuite(klass))
return s
if __name__=='__main__':
unittest.main(defaultTest='test_suite')
#! /usr/bin/env python
"""Dump statistics about each word in the index.
usage: wordstats.py data.fs [index key]
"""
import ZODB
from ZODB.FileStorage import FileStorage
def main(fspath, key):
fs = FileStorage(fspath, read_only=1)
db = ZODB.DB(fs)
rt = db.open().root()
index = rt[key]
lex = index.lexicon
idx = index.index
print "Words", lex.length()
print "Documents", idx.length()
print "Word frequencies: count, word, wid"
for word, wid in lex.items():
docs = idx._wordinfo[wid]
print len(docs), word, wid
print "Per-doc scores: wid, (doc, score,)+"
for wid in lex.wids():
print wid,
docs = idx._wordinfo[wid]
for docid, score in docs.items():
print docid, score,
print
if __name__ == "__main__":
import sys
args = sys.argv[1:]
index_key = "index"
if len(args) == 1:
fspath = args[0]
elif len(args) == 2:
fspath, index_key = args
else:
print "Expected 1 or 2 args, got", len(args)
main(fspath, index_key)