Commit d6376fb6 authored by Hanno Schlichting

More catalog work, merge in the non-queryplan and non-btree changes of...

More catalog work, merge in the non-queryplan and non-btree changes of queryplan and optimize more of date range index internals
parent 850cb0a6
...@@ -56,6 +56,17 @@ Restructuring ...@@ -56,6 +56,17 @@ Restructuring
Features Added Features Added
++++++++++++++ ++++++++++++++
- Various optimizations to the indexes' `_apply_index` methods and the
catalog's search method, inspired by experimental.catalogqueryplan.
- Added a new ILimitedResultIndex to Products.PluginIndexes and made most
built-in indexes compatible with it. This allows indexes to consider the
already calculated result set inside their own calculations.
- Changed the internals of the DateRangeIndex to always use IITreeSet and do
an inline migration from IISet. Some values tend to have a large number of
documents, for example when using default floor or ceiling dates.
- Added a new reporting tab to `Products.ZCatalog` instances. You can use this - Added a new reporting tab to `Products.ZCatalog` instances. You can use this
to get an overview of slow catalog queries, as specified by a configurable to get an overview of slow catalog queries, as specified by a configurable
threshold value. The reports are per running Zope process. threshold value. The reports are per running Zope process.
......
...@@ -157,7 +157,7 @@ class DateIndex(UnIndex, PropertyManager): ...@@ -157,7 +157,7 @@ class DateIndex(UnIndex, PropertyManager):
return returnStatus return returnStatus
def _apply_index(self, request): def _apply_index(self, request, resultset=None):
"""Apply the index to query parameters given in the argument """Apply the index to query parameters given in the argument
Normalize the 'query' arguments into integer values at minute Normalize the 'query' arguments into integer values at minute
...@@ -176,7 +176,7 @@ class DateIndex(UnIndex, PropertyManager): ...@@ -176,7 +176,7 @@ class DateIndex(UnIndex, PropertyManager):
#experimental code for specifing the operator #experimental code for specifing the operator
operator = record.get( 'operator', self.useOperator ) operator = record.get( 'operator', self.useOperator )
if not operator in self.operators : if not operator in self.operators :
raise RuntimeError, "operator not valid: %s" % operator raise RuntimeError("operator not valid: %s" % operator)
# depending on the operator we use intersection or union # depending on the operator we use intersection or union
if operator=="or": if operator=="or":
...@@ -223,6 +223,9 @@ class DateIndex(UnIndex, PropertyManager): ...@@ -223,6 +223,9 @@ class DateIndex(UnIndex, PropertyManager):
if set is not None: if set is not None:
if isinstance(set, int): if isinstance(set, int):
set = IISet((set,)) set = IISet((set,))
else:
# set can't be bigger than resultset
set = intersection(set, resultset)
r = set_func(r, set) r = set_func(r, set)
if isinstance(r, int): if isinstance(r, int):
......
...@@ -28,7 +28,6 @@ from BTrees.IIBTree import IISet ...@@ -28,7 +28,6 @@ from BTrees.IIBTree import IISet
from BTrees.IIBTree import IITreeSet from BTrees.IIBTree import IITreeSet
from BTrees.IIBTree import intersection from BTrees.IIBTree import intersection
from BTrees.IIBTree import multiunion from BTrees.IIBTree import multiunion
from BTrees.IIBTree import union
from BTrees.IOBTree import IOBTree from BTrees.IOBTree import IOBTree
from BTrees.Length import Length from BTrees.Length import Length
from DateTime.DateTime import DateTime from DateTime.DateTime import DateTime
...@@ -242,7 +241,7 @@ class DateRangeIndex(UnIndex): ...@@ -242,7 +241,7 @@ class DateRangeIndex(UnIndex):
return tuple( result ) return tuple( result )
def _apply_index(self, request): def _apply_index(self, request, resultset=None):
""" """
Apply the index to query parameters given in 'request', which Apply the index to query parameters given in 'request', which
should be a mapping object. should be a mapping object.
...@@ -265,34 +264,17 @@ class DateRangeIndex(UnIndex): ...@@ -265,34 +264,17 @@ class DateRangeIndex(UnIndex):
# Aggregate sets for each bucket separately, to avoid # Aggregate sets for each bucket separately, to avoid
# large-small union penalties. # large-small union penalties.
# #
#until_only = IISet()
#map( until_only.update, self._until_only.values( term ) )
# XXX use multi-union
until_only = multiunion( self._until_only.values( term ) ) until_only = multiunion( self._until_only.values( term ) )
#since_only = IISet()
#map( since_only.update, self._since_only.values( None, term ) )
# XXX use multi-union
since_only = multiunion( self._since_only.values( None, term ) ) since_only = multiunion( self._since_only.values( None, term ) )
#until = IISet()
#map( until.update, self._until.values( term ) )
# XXX use multi-union
until = multiunion( self._until.values( term ) ) until = multiunion( self._until.values( term ) )
#since = IISet() # Total result is bound by resultset
#map( since.update, self._since.values( None, term ) ) until = intersection(resultset, until)
# XXX use multi-union since = multiunion(self._since.values(None, term))
since = multiunion( self._since.values( None, term ) ) bounded = intersection(until, since)
bounded = intersection( until, since )
# Merge from smallest to largest. # Merge from smallest to largest.
#result = union( self._always, until_only ) result = multiunion([bounded, until_only, since_only, self._always])
result = union( bounded, until_only )
result = union( result, since_only )
#result = union( result, bounded )
result = union( result, self._always )
return result, ( self._since_field, self._until_field ) return result, ( self._since_field, self._until_field )
...@@ -314,8 +296,8 @@ class DateRangeIndex(UnIndex): ...@@ -314,8 +296,8 @@ class DateRangeIndex(UnIndex):
if set is None: if set is None:
self._until_only[ until ] = documentId self._until_only[ until ] = documentId
else: else:
if isinstance(set, int): if isinstance(set, (int, IISet)):
set = self._until_only[ until ] = IISet((set, documentId)) set = self._until_only[until] = IITreeSet((set, documentId))
else: else:
set.insert( documentId ) set.insert( documentId )
elif until is None: elif until is None:
...@@ -324,8 +306,8 @@ class DateRangeIndex(UnIndex): ...@@ -324,8 +306,8 @@ class DateRangeIndex(UnIndex):
if set is None: if set is None:
self._since_only[ since ] = documentId self._since_only[ since ] = documentId
else: else:
if isinstance(set, int): if isinstance(set, (int, IISet)):
set = self._since_only[ since ] = IISet((set, documentId)) set = self._since_only[since] = IITreeSet((set, documentId))
else: else:
set.insert( documentId ) set.insert( documentId )
...@@ -335,8 +317,8 @@ class DateRangeIndex(UnIndex): ...@@ -335,8 +317,8 @@ class DateRangeIndex(UnIndex):
if set is None: if set is None:
self._since[ since ] = documentId self._since[ since ] = documentId
else: else:
if isinstance(set, int): if isinstance(set, (int, IISet)):
set = self._since[ since ] = IISet((set, documentId)) set = self._since[since] = IITreeSet((set, documentId))
else: else:
set.insert( documentId ) set.insert( documentId )
...@@ -344,8 +326,8 @@ class DateRangeIndex(UnIndex): ...@@ -344,8 +326,8 @@ class DateRangeIndex(UnIndex):
if set is None: if set is None:
self._until[ until ] = documentId self._until[ until ] = documentId
else: else:
if isinstance(set, int): if isinstance(set, (int, IISet)):
set = self._until[ until ] = IISet((set, documentId)) set = self._until[until] = IITreeSet((set, documentId))
else: else:
set.insert( documentId ) set.insert( documentId )
......
...@@ -21,7 +21,7 @@ import sys ...@@ -21,7 +21,7 @@ import sys
from BTrees.IIBTree import intersection from BTrees.IIBTree import intersection
from BTrees.IIBTree import IITreeSet from BTrees.IIBTree import IITreeSet
from BTrees.IIBTree import IISet from BTrees.IIBTree import IISet
from BTrees.IIBTree import union from BTrees.IIBTree import multiunion
from BTrees.IOBTree import IOBTree from BTrees.IOBTree import IOBTree
from BTrees.Length import Length from BTrees.Length import Length
from BTrees.OOBTree import OOBTree from BTrees.OOBTree import OOBTree
...@@ -31,7 +31,7 @@ from zope.interface import implements ...@@ -31,7 +31,7 @@ from zope.interface import implements
from Products.PluginIndexes.common import safe_callable from Products.PluginIndexes.common import safe_callable
from Products.PluginIndexes.common.util import parseIndexRequest from Products.PluginIndexes.common.util import parseIndexRequest
from Products.PluginIndexes.interfaces import IPluggableIndex from Products.PluginIndexes.interfaces import ILimitedResultIndex
from Products.PluginIndexes.interfaces import ISortIndex from Products.PluginIndexes.interfaces import ISortIndex
from Products.PluginIndexes.interfaces import IUniqueValueIndex from Products.PluginIndexes.interfaces import IUniqueValueIndex
...@@ -43,7 +43,7 @@ class UnIndex(SimpleItem): ...@@ -43,7 +43,7 @@ class UnIndex(SimpleItem):
"""Simple forward and reverse index. """Simple forward and reverse index.
""" """
implements(IPluggableIndex, IUniqueValueIndex, ISortIndex) implements(ILimitedResultIndex, IUniqueValueIndex, ISortIndex)
def __init__( def __init__(
self, id, ignore_ex=None, call_methods=None, extra=None, caller=None): self, id, ignore_ex=None, call_methods=None, extra=None, caller=None):
...@@ -302,7 +302,7 @@ class UnIndex(SimpleItem): ...@@ -302,7 +302,7 @@ class UnIndex(SimpleItem):
LOG.debug('Attempt to unindex nonexistent document' LOG.debug('Attempt to unindex nonexistent document'
' with id %s' % documentId,exc_info=True) ' with id %s' % documentId,exc_info=True)
def _apply_index(self, request): def _apply_index(self, request, resultset=None):
"""Apply the index to query parameters given in the request arg. """Apply the index to query parameters given in the request arg.
The request argument should be a mapping object. The request argument should be a mapping object.
...@@ -348,11 +348,7 @@ class UnIndex(SimpleItem): ...@@ -348,11 +348,7 @@ class UnIndex(SimpleItem):
# experimental code for specifing the operator # experimental code for specifing the operator
operator = record.get('operator',self.useOperator) operator = record.get('operator',self.useOperator)
if not operator in self.operators : if not operator in self.operators :
raise RuntimeError,"operator not valid: %s" % escape(operator) raise RuntimeError("operator not valid: %s" % escape(operator))
# depending on the operator we use intersection or union
if operator=="or": set_func = union
else: set_func = intersection
# Range parameter # Range parameter
range_parm = record.get('range',None) range_parm = record.get('range',None)
...@@ -375,24 +371,84 @@ class UnIndex(SimpleItem): ...@@ -375,24 +371,84 @@ class UnIndex(SimpleItem):
if 'max' in opr_args: hi = max(record.keys) if 'max' in opr_args: hi = max(record.keys)
else: hi = None else: hi = None
if hi: if hi:
setlist = index.items(lo,hi) setlist = index.values(lo,hi)
else:
setlist = index.values(lo)
# If we only use one key, intersect and return immediately
if len(setlist) == 1:
result = setlist[0]
if isinstance(result, int):
result = IISet((result,))
return result, (self.id,)
if operator == 'or':
tmp = []
for s in setlist:
if isinstance(s, int):
s = IISet((s,))
tmp.append(s)
r = multiunion(tmp)
else: else:
setlist = index.items(lo) # For intersection, sort with smallest data set first
tmp = []
for s in setlist:
if isinstance(s, int):
s = IISet((s,))
tmp.append(s)
if len(tmp) > 2:
setlist = sorted(tmp, key=len)
else:
setlist = tmp
r = resultset
for s in setlist:
# the result is bound by the resultset
r = intersection(r, s)
for k, set in setlist:
if isinstance(set, int):
set = IISet((set,))
r = set_func(r, set)
else: # not a range search else: # not a range search
for key in record.keys: # Filter duplicates
set=index.get(key, None) setlist = []
if set is None: for k in record.keys:
set = IISet(()) s = index.get(k, None)
elif isinstance(set, int): # If None, try to bail early
set = IISet((set,)) if s is None:
r = set_func(r, set) if operator == 'or':
# If union, we can't possibly get a bigger result
if isinstance(r, int): r=IISet((r,)) continue
# If intersection, we can't possibly get a smaller result
return IISet(), (self.id,)
elif isinstance(s, int):
s = IISet((s,))
setlist.append(s)
# If we only use one key return immediately
if len(setlist) == 1:
result = setlist[0]
if isinstance(result, int):
result = IISet((result,))
return result, (self.id,)
if operator == 'or':
# If we already get a small result set passed in, intersecting
# the various indexes with it and doing the union later is
# faster than creating a multiunion first.
if resultset is not None and len(resultset) < 200:
smalllist = []
for s in setlist:
smalllist.append(intersection(resultset, s))
r = multiunion(smalllist)
else:
r = multiunion(setlist)
else:
# For intersection, sort with smallest data set first
if len(setlist) > 2:
setlist = sorted(setlist, key=len)
r = resultset
for s in setlist:
r = intersection(r, s)
if isinstance(r, int):
r = IISet((r, ))
if r is None: if r is None:
return IISet(), (self.id,) return IISet(), (self.id,)
else: else:
......
...@@ -85,6 +85,15 @@ class IPluggableIndex(Interface): ...@@ -85,6 +85,15 @@ class IPluggableIndex(Interface):
"""Empty the index""" """Empty the index"""
class ILimitedResultIndex(IPluggableIndex):
def _apply_index(request, resultset=None):
"""Same as IPluggableIndex' _apply_index method. The additional
resultset argument contains the resultset, as already calculated by
ZCatalog's search method.
"""
class IUniqueValueIndex(IPluggableIndex): class IUniqueValueIndex(IPluggableIndex):
"""An index which can return lists of unique values contained in it""" """An index which can return lists of unique values contained in it"""
......
...@@ -22,6 +22,7 @@ import Acquisition ...@@ -22,6 +22,7 @@ import Acquisition
import ExtensionClass import ExtensionClass
from Missing import MV from Missing import MV
from Persistence import Persistent from Persistence import Persistent
from Products.PluginIndexes.interfaces import ILimitedResultIndex
import BTrees.Length import BTrees.Length
from BTrees.IIBTree import intersection, weightedIntersection, IISet from BTrees.IIBTree import intersection, weightedIntersection, IISet
...@@ -475,6 +476,10 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base): ...@@ -475,6 +476,10 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
query[iid] = value query[iid] = value
return query return query
def _sorted_search_indexes(self, query):
# Simple implementation doing no ordering.
return self.indexes.keys()
def search(self, query, sort_index=None, reverse=0, limit=None, merge=1): def search(self, query, sort_index=None, reverse=0, limit=None, merge=1):
"""Iterate through the indexes, applying the query to each one. If """Iterate through the indexes, applying the query to each one. If
merge is true then return a lazy result set (sorted if appropriate) merge is true then return a lazy result set (sorted if appropriate)
...@@ -497,25 +502,44 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base): ...@@ -497,25 +502,44 @@ class Catalog(Persistent, Acquisition.Implicit, ExtensionClass.Base):
# Canonicalize the request into a sensible query before passing it on # Canonicalize the request into a sensible query before passing it on
query = self.make_query(query) query = self.make_query(query)
query_keys = query.keys()
cr = self.getCatalogReport(query) cr = self.getCatalogReport(query)
cr.start() cr.start()
for i in self.indexes.keys(): for i in self._sorted_search_indexes(query):
if i not in query_keys:
# Do not ask indexes to restrict the result, which aren't
# part of the query
continue
index = self.getIndex(i) index = self.getIndex(i)
_apply_index = getattr(index, "_apply_index", None) _apply_index = getattr(index, "_apply_index", None)
if _apply_index is None: if _apply_index is None:
continue continue
limit_result = False
if ILimitedResultIndex.providedBy(index):
limit_result = True
cr.split(i) cr.split(i)
r = _apply_index(query) if limit_result:
r = _apply_index(query, rs)
else:
r = _apply_index(query)
cr.split(i) cr.split(i)
if r is not None: if r is not None:
r, u = r r, u = r
# Short circuit if empty result
# BBB: We can remove the "r is not None" check in Zope 2.14
# once we don't need to support the "return everything" case
# anymore
if r is not None and not r:
return LazyCat([])
w, rs = weightedIntersection(rs, r) w, rs = weightedIntersection(rs, r)
if not rs: if not rs:
break break
cr.stop() cr.stop()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment