Commit 2167f3f6 authored by Casey Duncan's avatar Casey Duncan

Major refactor of the catalog search engine centered around optimizing sort by...

Major refactor of the catalog search engine centered around optimizing sort by index operations. The resulting code greatly outperforms the previous version and uses less memory.

Exposed a new ZCatalog method "search" which has a better interface for programmatic searches. Updated documentation as well.

Implemented a sort limit option which allows you to inform the catalog that you are only interested in a certain number of results. In the common case this allows the ZCatalog machinery to use a different sorting algorithm (N-Best) which scales much better than a full sort.

Also more tightly integrated the merge option which allows you to tell the catalog that you would like raw and unsorted intermediate results returned rather than sorted and lazified results. This can be used to efficiently merge search results across multiple catalogs.
parent 0f4c1093
This diff is collapsed.
......@@ -12,7 +12,7 @@
#
##############################################################################
"""
$Id: IZCatalog.py,v 1.4 2002/09/05 21:22:39 shane Exp $
$Id: IZCatalog.py,v 1.5 2002/12/05 21:17:05 caseman Exp $
"""
from Interface import Interface
......@@ -161,6 +161,10 @@ class IZCatalog(Interface):
sort_order -- You can specify 'reverse' or 'descending'.
Default behavior is to sort ascending.
sort_limit -- An optimization hint to tell the catalog how many
results you are really interested in. See the limit argument
to the search method for more details.
There are some rules to consider when querying this method:
......@@ -192,5 +196,30 @@ class IZCatalog(Interface):
def __call__(REQUEST=None, **kw):
"""Search the catalog, the same way as 'searchResults'.
"""
def search(query_request, sort_index=None, reverse=0, limit=None, merge=1):
    """Programmatic search interface, use for searching the catalog from
    scripts.

    query_request -- Dictionary containing catalog query. This uses the
    same format as searchResults.

    sort_index -- Name of sort index

    reverse -- Boolean, reverse sort order (defaults to false)

    limit -- Limit sorted result count to the n best records. This is an
    optimization hint used in conjunction with a sort_index. If possible
    ZCatalog will use a different sort algorithm that uses much less memory
    and scales better than a full sort. The actual number of records
    returned is not guaranteed to be <= limit. You still need to apply the
    same batching to the results. Since the len() of the results will no
    longer be the actual result count, you can use the
    "actual_result_count" attribute of the lazy result object instead to
    determine the size of the full result set.

    merge -- Return merged, lazy results (like searchResults) or raw
    results for later merging. This can be used to perform multiple
    queries (even across catalogs) and merge and sort the combined results.
    """
__doc__ = IZCatalog.__doc__ + __doc__
......@@ -10,8 +10,8 @@
# FOR A PARTICULAR PURPOSE
#
##############################################################################
__doc__='''$Id: Lazy.py,v 1.7 2002/08/14 22:25:15 mj Exp $'''
__version__='$Revision: 1.7 $'[11:-2]
__doc__='''$Id: Lazy.py,v 1.8 2002/12/05 21:17:05 caseman Exp $'''
__version__='$Revision: 1.8 $'[11:-2]
class Lazy:
......@@ -238,3 +238,21 @@ class LazyMop(Lazy):
raise IndexError, index
self._eindex=e
return data[i]
class LazyValues(Lazy):
    """Lazy view over a sequence of two-tuples, typically (key, value),
    that behaves as though it were just a list of the values."""

    def __init__(self, seq):
        # Only a reference to the pair sequence is stored.
        self._seq = seq

    def __len__(self):
        pairs = self._seq
        return len(pairs)

    def __getitem__(self, index):
        # The value is the second item of the pair at this position.
        pair = self._seq[index]
        return pair[1]

    def __getslice__(self, start, end):
        # Wrap the sliced pairs in the same class so the result is again a
        # values-only view rather than a list of pairs.
        factory = self.__class__
        return factory(self._seq[start:end])

    # Same operation exposed under a plain method name.
    slice = __getslice__
......@@ -137,7 +137,7 @@ class ZCatalog(Folder, Persistent, Implicit):
['searchResults', '__call__', 'uniqueValuesFor',
'getpath', 'schema', 'indexes', 'index_objects',
'all_meta_types', 'valid_roles', 'resolve_url',
'getobject'],
'getobject', 'search'],
['Anonymous', 'Manager']),
(manage_zcatalog_indexes,
......@@ -610,15 +610,34 @@ class ZCatalog(Folder, Persistent, Implicit):
return r
def searchResults(self, REQUEST=None, used=None, **kw):
"""Search the catalog according to the ZTables search interface.
"""Search the catalog
Search terms can be passed in the REQUEST or as keyword
arguments.
The used argument is now deprecated and ignored
"""
return self._catalog.searchResults(REQUEST, used, **kw)
__call__=searchResults
def search(
        self, query_request, sort_index=None, reverse=0, limit=None,
        merge=1):
    """Programmatic search interface, use for searching the catalog from
    scripts.

    query_request: Dictionary containing catalog query
    sort_index:    Name of sort index
    reverse:       Reverse sort order?
    limit:         Limit sorted result count (optimization hint)
    merge:         Return merged results (like searchResults) or raw
                   results for later merging.
    """
    catalog = self._catalog
    # The underlying catalog is handed the index object itself, so a given
    # index name is looked up in the catalog's indexes mapping first.
    if sort_index is None:
        index_obj = None
    else:
        index_obj = catalog.indexes[sort_index]
    return catalog.search(query_request, index_obj, reverse, limit, merge)
## this stuff is so the find machinery works
......
......@@ -178,6 +178,10 @@ class ZCatalog:
sort_order -- You can specify 'reverse' or 'descending'.
Default behavior is to sort ascending.
sort_limit -- An optimization hint to tell the catalog how many
results you are really interested in. See the limit argument
to the search method for more details.
There are some rules to consider when querying this method:
......@@ -211,3 +215,26 @@ class ZCatalog:
"""
Search the catalog, the same way as 'searchResults'.
"""
def search(query_request, sort_index=None, reverse=0, limit=None, merge=1):
    """Programmatic search interface, use for searching the catalog from
    scripts.

    query_request -- Dictionary containing catalog query. This uses the
    same format as searchResults.

    sort_index -- Name of sort index

    reverse -- Boolean, reverse sort order (defaults to false)

    limit -- Limit sorted result count to the n best records. This is an
    optimization hint used in conjunction with a sort_index. If possible
    ZCatalog will use a different sort algorithm that uses much less memory
    and scales better than a full sort. The actual number of records
    returned is not guaranteed to be <= limit. You still need to apply the
    same batching to the results. Since the len() of the results will no
    longer be the actual result count, you can use the
    "actual_result_count" attribute of the lazy result object instead to
    determine the size of the full result set.

    merge -- Return merged, lazy results (like searchResults) or raw
    results for later merging. This can be used to perform multiple
    queries (even across catalogs) and merge and sort the combined results.
    """
......@@ -157,6 +157,13 @@ class TestZCatalog(unittest.TestCase):
testNum = str(self.upper - 3)
data = self._catalog.getIndexDataForUID(testNum)
assert data['title'][0] == testNum
def testSearch(self):
    # The same three-title query must yield three records through both
    # the classic searchResults entry point and the new search method.
    query = {'title': ['5', '6', '7']}
    for method_name in ('searchResults', 'search'):
        found = getattr(self._catalog, method_name)(query)
        self.assertEqual(len(found), 3)
class TestCatalogObject(unittest.TestCase):
def setUp(self):
......@@ -346,6 +353,11 @@ class TestCatalogObject(unittest.TestCase):
# set is much larger than the sort index.
a = self._catalog(sort_on='att1')
self.assertEqual(len(a), self.upper)
def testSortLimit(self):
    # Query sorted on 'num' with a sort_limit hint of 10.
    results = self._catalog(sort_on='num', sort_limit=10)
    total = self.upper
    # The first record is expected to carry num == upper - 1.
    first = results[0]
    self.assertEqual(first.num, total - 1)
    # The full result-set size is reported via actual_result_count.
    self.assertEqual(results.actual_result_count, total)
class objRS(ExtensionClass.Base):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment