Commit 81625f82 authored by Christopher Petrilli's avatar Christopher Petrilli

Merging in Catalog changes for the lexicon.

parent 381d6e48
......@@ -19,6 +19,15 @@ Zope changes
hook to create PythonScripts (for MIMEtype 'text/x-python')
and DTMLMethods (for other 'text' MIMEtypes) (Collector #998).
Bugs Fixed
- Mechanisms in the underbelly of the Catalog and Globbing
Lexicon (which is the default for all new Catalogs) have been
overhauled, giving substantial performance increases. On
simple queries, performance should double (or more) in many
situations, whereas with globbed queries it may increase by
substantially more.
Zope 2.3.0 beta 1
Features Added
......
......@@ -101,9 +101,9 @@ from Catalog import Catalog, orify
from SearchIndex import UnIndex, UnTextIndex
from Vocabulary import Vocabulary
import IOBTree
from Shared.DC.ZRDB.TM import TM
from AccessControl import getSecurityManager
manage_addZCatalogForm=DTMLFile('dtml/addZCatalog',globals())
def manage_addZCatalog(self, id, title, vocab_id=None, REQUEST=None):
......@@ -217,6 +217,7 @@ class ZCatalog(Folder, Persistent, Implicit):
threshold=10000
_v_total=0
_v_transaction = None
def __init__(self, id, title='', vocab_id=None, container=None):
self.id=id
......@@ -401,14 +402,31 @@ class ZCatalog(Folder, Persistent, Implicit):
def catalog_object(self, obj, uid):
""" wrapper around catalog """
self._v_total = (self._v_total +
self._catalog.catalogObject(obj, uid, self.threshold))
self._catalog.catalogObject(obj, uid, None)
# None passed in to catalogObject as third argument indicates
# that we shouldn't try to commit subtransactions within any
# indexing code. We throw away the result of the call to
# catalogObject (which is a word count), because it's
# worthless to us here.
if self.threshold is not None:
# figure out whether or not to commit a subtransaction.
t = id(get_transaction())
if t != self._v_transaction:
self._v_total = 0
self._v_transaction = t
self._v_total = self._v_total + 1
# increment the _v_total counter for this thread only and get
# a reference to the current transaction.
# the _v_total counter is zeroed if we notice that we're in
# a different transaction than the last one that came by.
# self.threshold represents the number of times that
# catalog_object needs to be called in order for the catalog
# to commit a subtransaction. The semantics here mean that
# we should commit a subtransaction if our threshhold is
# exceeded within the boundaries of the current transaction.
if self._v_total > self.threshold:
# commit a subtransaction
get_transaction().commit(1)
# kick the chache, this may be overkill but ya never know
self._p_jar.cacheFullSweep(1)
self._v_total = 0
......
......@@ -83,28 +83,22 @@
#
##############################################################################
import string, regex, ts_regex
import regsub
from Lexicon import Lexicon
__doc__=""" Lexicon object that supports
"""
from Lexicon import Lexicon
from Splitter import Splitter
from Persistence import Persistent
from Acquisition import Implicit
import OIBTree, BTree, IOBTree
from intSet import intSet
OIBTree=OIBTree.BTree
OOBTree=BTree.BTree
IOBTree=IOBTree.BTree
import re
from UnTextIndex import Or
import re, time
import OIBTree, BTree, IOBTree, IIBTree
OIBTree = OIBTree.BTree # Object -> Integer
OOBTree = BTree.BTree # Object -> Object
IOBTree = IOBTree.BTree # Integer -> Object
IIBucket = IIBTree.Bucket # Integer -> Integer
import pdb
class GlobbingLexicon(Lexicon):
"""
......@@ -155,7 +149,6 @@ class GlobbingLexicon(Lexicon):
set.insert(self.counter)
self._digrams = _digrams
counter = self.counter
self.counter = self.counter + 1
return counter
......@@ -163,14 +156,14 @@ class GlobbingLexicon(Lexicon):
def get(self, pattern):
""" Query the lexicon for words matching a pattern.
"""
wc_set = [self.multi_wc, self.single_wc]
digrams = []
globbing = 0
for i in range(len(pattern)):
if pattern[i] in wc_set:
globbing = 1
continue
if i == 0:
......@@ -184,21 +177,19 @@ class GlobbingLexicon(Lexicon):
except IndexError:
digrams.append( (pattern[i] + self.eow) )
if not globbing:
result = self._lexicon.get(pattern, ())
return (result, )
## now get all of the intsets that contain the result digrams
result = None
result = IIBucket()
for digram in digrams:
if self._digrams.has_key(digram):
set = self._digrams[digram]
if set is not None:
if result is None:
result = set
else:
result.intersection(set)
matchSet = self._digrams[digram]
if matchSet is not None:
result = IIBucket().union(matchSet)
if result is None:
if len(result) == 0:
return ()
else:
## now we have narrowed the list of possible candidates
......@@ -211,10 +202,9 @@ class GlobbingLexicon(Lexicon):
expr = re.compile(self.translate(pattern))
words = []
hits = []
for x in result:
if expr.search(self._inverseLex[x]):
for x in result.keys():
if expr.match(self._inverseLex[x]):
hits.append(x)
return hits
def __getitem__(self, word):
......@@ -226,6 +216,7 @@ class GlobbingLexicon(Lexicon):
"""
words = []
wids = []
for w in q:
if ( (self.multi_wc in w) or
(self.single_wc in w) ):
......@@ -233,7 +224,7 @@ class GlobbingLexicon(Lexicon):
for wid in wids:
if words:
words.append(Or)
words.append(self._inverseLex[wid])
words.append(wid)
else:
words.append(w)
......@@ -262,19 +253,7 @@ class GlobbingLexicon(Lexicon):
if c == self.multi_wc:
res = res + '.*'
elif c == self.single_wc:
res = res + '.'
res = res + '.?'
else:
res = res + re.escape(c)
return res + "$"
return res + '$'
......@@ -92,7 +92,7 @@ is no longer known.
"""
__version__='$Revision: 1.33 $'[11:-2]
__version__='$Revision: 1.34 $'[11:-2]
from Globals import Persistent
......@@ -368,22 +368,42 @@ class UnTextIndex(Persistent, Implicit):
def __getitem__(self, word):
"""Return an InvertedIndex-style result "list"
"""
src = tuple(self.getLexicon(self._lexicon).Splitter(word))
if not src: return ResultList({}, (word,), self)
if len(src) == 1:
src=src[0]
if src[:1]=='"' and src[-1:]=='"': return self[src]
r = self._index.get(self.getLexicon(self._lexicon).get(src)[0],
Note that this differentiates between being passed an Integer
and a String. Strings are looked up in the lexicon, whereas
Integers are assumed to be resolved word ids. """
if type(word) is IntType:
# We have a word ID
result = self._index.get(word, {})
return ResultList(result, (word,), self)
else:
splitSource = tuple(self.getLexicon(self._lexicon).Splitter(word))
if not splitSource:
return ResultList({}, (word,), self)
if len(splitSource) == 1:
splitSource = splitSource[0]
if splitSource[:1]=='"' and splitSource[-1:]=='"':
return self[splitSource]
r = self._index.get(
self.getLexicon(self._lexicon).get(splitSource)[0],
None)
if r is None: r = {}
return ResultList(r, (src,), self)
if r is None:
r = {}
return ResultList(r, (splitSource,), self)
r = None
for word in src:
for word in splitSource:
rr = self[word]
if r is None: r = rr
else: r = r.near(rr)
if r is None:
r = rr
else:
r = r.near(rr)
return r
......@@ -482,13 +502,11 @@ class UnTextIndex(Persistent, Implicit):
whole thing is 'evaluated'
"""
# First replace any occurences of " and not " with " andnot "
s = ts_regex.gsub(
'[%s]+[aA][nN][dD][%s]*[nN][oO][tT][%s]+' % (ws * 3),
' andnot ', s)
# do some parsing
q = parse(s)
......@@ -509,65 +527,78 @@ class UnTextIndex(Persistent, Implicit):
try:
left = q[i - 1]
right = q[i + 1]
except IndexError: raise QueryError, "Malformed query"
t=type(left)
if t is ListType: left = evaluate(left, self)
elif t is StringType: left=self[left]
t=type(right)
if t is ListType: right = evaluate(right, self)
elif t is StringType: right=self[right]
except IndexError:
raise QueryError, "Malformed query"
operandType = type(left)
if operandType is IntType:
left = self[left]
elif operandType is StringType:
left = self[left]
elif operandType is ListType:
left = evaluate(left, self)
operandType = type(right)
if operandType is IntType:
right = self[right]
elif operandType is StringType:
right = self[right]
elif operandType is ListType:
right = evaluate(right, self)
return (left, right)
def evaluate(self, q):
def evaluate(self, query):
'''Evaluate a parsed query'''
## import pdb
## pdb.set_trace()
if (len(q) == 1):
if (type(q[0]) is ListType):
return evaluate(q[0], self)
return self[q[0]]
# There are two options if the query passed in is only one
# item. It means either it's an embedded query, in which case
# we'll recursively evaluate, other wise it's nothing for us
# to evaluate, and we just get the results and return them.
if (len(query) == 1):
if (type(query[0]) is ListType):
return evaluate(query[0], self)
return self[query[0]] # __getitem__
# Now we need to loop through the query and expand out
# operators. They are currently evaluated in the following
# order: AndNote -> And -> Or -> Near
i = 0
while (i < len(q)):
if q[i] is AndNot:
left, right = self.get_operands(q, i)
while (i < len(query)):
if query[i] is AndNot:
left, right = self.get_operands(query, i)
val = left.and_not(right)
q[(i - 1) : (i + 2)] = [ val ]
query[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is And:
left, right = self.get_operands(q, i)
while (i < len(query)):
if query[i] is And:
left, right = self.get_operands(query, i)
val = left & right
q[(i - 1) : (i + 2)] = [ val ]
query[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Or:
left, right = self.get_operands(q, i)
while (i < len(query)):
if query[i] is Or:
left, right = self.get_operands(query, i)
val = left | right
q[(i - 1) : (i + 2)] = [ val ]
query[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
i = 0
while (i < len(q)):
if q[i] is Near:
left, right = self.get_operands(q, i)
while (i < len(query)):
if query[i] is Near:
left, right = self.get_operands(query, i)
val = left.near(right)
q[(i - 1) : (i + 2)] = [ val ]
query[(i - 1) : (i + 2)] = [ val ]
else: i = i + 1
if (len(q) != 1): raise QueryError, "Malformed query"
if (len(query) != 1): raise QueryError, "Malformed query"
return q[0]
return query[0]
def parse(s):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment