Commit 667c91df authored by Vincent Pelletier's avatar Vincent Pelletier

ZSQLCatalog: Make _reindexObjectList tolerate duplicate documents

When _reindexObjectList receives a document more than once (which may
happen when multiple different-tag indexation activities are spawned for
the same document), it would emit a meaningless error, saying that
document /foo/bar stole the uid of document /foo/bar.
Instead, fix duplicate tracking and skip such dulicates.

Also, simplify & make _reindexObjectList more robust:
- Always check path length.
- Allocate uids before looking for duplicates in catalog (it may not be
  actually needed at this level nowadays).
- Always check both uid-to-path and path-to-uid mappings.
- Reuse existing mappings to detect duplicates among objects being reindexed,
  removing the need for assigned_uid_dict.
- Avoid computing path more than once, as it's expensive.
parent 69aefdff
...@@ -1312,77 +1312,47 @@ class Catalog(Folder, ...@@ -1312,77 +1312,47 @@ class Catalog(Folder,
def _catalogObjectList(self, object_list, method_id_list=None, def _catalogObjectList(self, object_list, method_id_list=None,
disable_cache=0, check_uid=1, idxs=None): disable_cache=0, check_uid=1, idxs=None):
"""This is the real method to catalog objects.""" """This is the real method to catalog objects."""
LOG('SQLCatalog', TRACE, 'catalogging %d objects' % len(object_list))
if idxs not in (None, []): if idxs not in (None, []):
LOG('ZSLQCatalog.SQLCatalog:catalogObjectList', WARNING, LOG('ZSLQCatalog.SQLCatalog:catalogObjectList', WARNING,
'idxs is ignored in this function and is only provided to be compatible with CMFCatalogAware.reindexObject.') 'idxs is ignored in this function and is only provided to be compatible with CMFCatalogAware.reindexObject.')
if not self.getPortalObject().isIndexable(): if not self.getPortalObject().isIndexable():
return return
path_uid_dict = {} object_path_dict = {}
uid_path_dict = {}
if check_uid:
path_list = []
path_list_append = path_list.append
uid_list = [] uid_list = []
uid_list_append = uid_list.append uid_list_append = uid_list.append
for object in object_list: for object in object_list:
if object in object_path_dict:
continue
path = object.getPath() path = object.getPath()
if path is not None: if len(path) > MAX_PATH_LEN:
path_list_append(path) raise ValueError('path too long (%i): %r' % (len(path), path))
uid = object.uid object_path_dict[object] = path
if uid is not None:
uid_list_append(uid)
path_uid_dict = self.getUidDictForPathList(path_list=path_list)
uid_path_dict = self.getPathDictForUidList(uid_list=uid_list)
# This dict will store uids and objects which are verified below.
# The purpose is to prevent multiple objects from having the same
# uid inside object_list.
assigned_uid_dict = {}
for object in object_list:
try: try:
uid = aq_base(object).uid uid = aq_base(object).uid
except AttributeError: except AttributeError:
uid = None uid = None
if uid is None or uid == 0: if uid is None or uid == 0:
try: object.uid = uid = self.newUid()
object.uid = self.newUid() uid_list_append(uid)
except ConflictError: LOG('SQLCatalog', TRACE, 'catalogging %d objects' % len(object_path_dict))
raise if check_uid:
except: path_uid_dict = self.getUidDictForPathList(path_list=object_path_dict.values())
raise RuntimeError, 'could not set missing uid for %r' % (object,) uid_path_dict = self.getPathDictForUidList(uid_list=uid_list)
elif check_uid: for object, path in object_path_dict.iteritems():
if uid in assigned_uid_dict: uid = object.uid
error_message = 'uid of %r is %r and ' \ if path_uid_dict.setdefault(path, uid) != uid:
'is already assigned to %s in catalog !!! This can be fatal.' % \ error_message = 'path %r has uids %r (catalog) and %r (being indexed) ! This can break relations' % (
(object, uid, assigned_uid_dict[uid]) path,
if not self.sql_catalog_raise_error_on_uid_check: path_uid_dict[path],
LOG("SQLCatalog.catalogObjectList", ERROR, error_message) uid,
else: )
raise ValueError(error_message) if self.sql_catalog_raise_error_on_uid_check:
path = object.getPath()
index = path_uid_dict.get(path)
if index is not None:
if index < 0:
raise CatalogError, 'A negative uid %d is used for %s. Your catalog is broken. Recreate your catalog.' % (index, path)
if uid != index or isinstance(uid, int):
error_message = 'uid of %r changed from %r (property) to %r '\
'(catalog, by path) !!! This can be fatal' % (object, uid, index)
if not self.sql_catalog_raise_error_on_uid_check:
LOG("SQLCatalog.catalogObjectList", ERROR, error_message)
else:
raise ValueError(error_message) raise ValueError(error_message)
else: LOG("SQLCatalog._catalogObjectList", ERROR, error_message)
if uid in uid_path_dict: catalog_path = uid_path_dict.setdefault(uid, path)
catalog_path = uid_path_dict.get(uid) if catalog_path != path and catalog_path != 'deleted':
else: # Two possible cases if catalog_path == 'deleted':
catalog_path = self.getPathForUid(uid)
if catalog_path == 'deleted':
# Two possible cases:
# - Reindexed object's path changed (ie, it or at least one of its # - Reindexed object's path changed (ie, it or at least one of its
# parents was renamed) but unindexObject was not called yet. # parents was renamed) but unindexObject was not called yet.
# Reindexing is harmelss: unindexObject and then an # Reindexing is harmelss: unindexObject and then an
...@@ -1397,23 +1367,14 @@ class Catalog(Folder, ...@@ -1397,23 +1367,14 @@ class Catalog(Folder,
# We could also not index it to save the time needed to index, but # We could also not index it to save the time needed to index, but
# this would slow down all regular case to slightly improve an # this would slow down all regular case to slightly improve an
# exceptional case. # exceptional case.
pass error_message = 'uid %r is shared between %r (catalog) and %r (being indexed) ! This can break relations' % (
elif catalog_path is not None: uid,
# An uid conflict happened... Why? uid_path_dict[uid],
# can be due to path length path,
if len(path) > MAX_PATH_LEN: )
LOG('SQLCatalog', ERROR, 'path of object %r is too long for catalog. You should use a shorter path.' %(object,)) if self.sql_catalog_raise_error_on_uid_check:
error_message = 'uid of %r is %r and ' \
'is already assigned to %s in catalog !!! This can be fatal.' \
% (object, uid, catalog_path)
if not self.sql_catalog_raise_error_on_uid_check:
LOG('SQLCatalog', ERROR, error_message)
else:
raise ValueError(error_message) raise ValueError(error_message)
uid = object.uid LOG('SQLCatalog._catalogObjectList', ERROR, error_message)
assigned_uid_dict[uid] = object
if method_id_list is None: if method_id_list is None:
method_id_list = self.getSqlCatalogObjectListList() method_id_list = self.getSqlCatalogObjectListList()
...@@ -1449,7 +1410,7 @@ class Catalog(Folder, ...@@ -1449,7 +1410,7 @@ class Catalog(Folder,
except KeyError: except KeyError:
pass pass
if expression is None: if expression is None:
catalogged_object_list = object_list catalogged_object_list = object_path_dict.keys()
else: else:
text = expression.text text = expression.text
catalogged_object_list = catalogged_object_list_cache.get(text) catalogged_object_list = catalogged_object_list_cache.get(text)
...@@ -1463,7 +1424,7 @@ class Catalog(Folder, ...@@ -1463,7 +1424,7 @@ class Catalog(Folder,
% (method_name, text), DeprecationWarning) % (method_name, text), DeprecationWarning)
expression_cache_key_list = filter.get('expression_cache_key', ()) expression_cache_key_list = filter.get('expression_cache_key', ())
expression_result_cache = {} expression_result_cache = {}
for object in object_list: for object in object_path_dict:
if expression_cache_key_list: if expression_cache_key_list:
# Expressions are slow to evaluate because they are executed # Expressions are slow to evaluate because they are executed
# in restricted environment. So we try to save results of # in restricted environment. So we try to save results of
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment