From 02287fad78020142b011e37a4a4ecbbfb3436dce Mon Sep 17 00:00:00 2001
From: Jean-Paul Smets <jp@nexedi.com>
Date: Mon, 26 Mar 2007 11:55:34 +0000
Subject: [PATCH] Various fixes for portal type discovery. Crawler has some
 methods commented out to prevent excessive ZODB usage (until this is
 optimised).

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@13635 20353a03-c40f-0410-a6d1-a30d3c3de9de
---
 product/ERP5/Tool/ContributionTool.py | 226 ++++++++++++++++++--------
 1 file changed, 162 insertions(+), 64 deletions(-)

diff --git a/product/ERP5/Tool/ContributionTool.py b/product/ERP5/Tool/ContributionTool.py
index 298a2b7b78..b754273432 100644
--- a/product/ERP5/Tool/ContributionTool.py
+++ b/product/ERP5/Tool/ContributionTool.py
@@ -29,6 +29,8 @@
 import cStringIO
 import re
 import string
+import socket
+import md5
 import urllib2, urllib
 
 from AccessControl import ClassSecurityInfo, getSecurityManager
@@ -42,7 +44,18 @@
 from zLOG import LOG
 from DateTime import DateTime
 from Acquisition import aq_base
 
+# Install openers
+import ContributionOpener
+opener = urllib2.build_opener(ContributionOpener.DirectoryFileHandler)
+urllib2.install_opener(opener)
+
+# A temporary hack until urllib2 supports timeout setting - XXX
+socket.setdefaulttimeout(60) # 1 minute timeout
+
+# Global parameters
 TEMP_NEW_OBJECT_KEY = '_v_new_object'
+MAX_REPEAT = 10
 
 _marker = []  # Create a new marker object.
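[NOTE: socket.setdefaulttimeout() is process-wide - it affects every
socket the Zope process opens, not only the crawler's downloads. Once
urllib2.urlopen() gained its optional timeout argument (Python 2.6), a
narrower per-request form became possible; a hedged sketch, not part of
this patch:

    import urllib2
    url = 'http://www.example.com/'
    try:
      url_file = urllib2.urlopen(url, timeout=60) # per-request timeout
      data = url_file.read()
    except urllib2.URLError:
      data = None # handle the timeout or network failure here
]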
@@ -111,46 +124,52 @@ class ContributionTool(BaseTool):
     # types share the same constructor. However, if Memo has
     # same constructor as Text and Memo is not in content_type_registry
     # then it should be considered.
-    valid_portal_type_list = []
+    extra_valid_portal_type_list = []
     content_registry_type_dict = getContentTypeRegistryTypeDict()
     portal_type_tool = self.portal_types
     for pt in portal_type_tool.objectValues():
       if hasattr(pt, 'factory') and pt.factory == portal_type_tool[document.getPortalType()].factory:
         if not content_registry_type_dict.has_key(pt.id):
-          valid_portal_type_list.append(pt.id)
+          extra_valid_portal_type_list.append(pt.id)
+
+    if not extra_valid_portal_type_list:
+      # There is really no ambiguity here
+      # The portal_type set by PUT_factory is appropriate
+      # This is the best case we can get
+      # LOG('findTypeName no ambiguity', 0, document.portal_type)
+      return document.portal_type
+
+    valid_portal_type_list = [document.portal_type] + extra_valid_portal_type_list
 
     # Check if the filename tells which portal_type this is
     portal_type_list = self.getPropertyDictFromFileName(file_name).get('portal_type', [])
+    if isinstance(portal_type_list, str): portal_type_list = [portal_type_list]
+    portal_type_list = filter(lambda x: x in valid_portal_type_list, portal_type_list)
+    if not portal_type_list:
+      portal_type_list = valid_portal_type_list
     if len(portal_type_list) == 1:
       # if we have only one, then this is it
+      # LOG('findTypeName single portal_type_list', 0, portal_type_list[0])
       return portal_type_list[0]
 
     # If it is still None, we need to read the document
     # to check which of the candidates is suitable
-    if portal_type is None:
-      # The document is now responsible of telling all its properties
-      portal_type = document.getPropertyDictFromContent().get('portal_type', None)
-      if portal_type is not None:
-        # we check if it matches the candidate list, if there were any
-        if len(portal_type_list)>1 and portal_type not in portal_type_list:
-          raise TypeError('%s not in the list of %s' % (portal_type, str(portal_type_list)))
-        return portal_type
-      else:
-        # if not found but the candidate list is there, return the first
-        if len(portal_type_list)>0:
-          return portal_type_list[0]
-
-    if portal_type is None:
-      # We can not do anything anymore
-      #return document.portal_type # XXX Wrong or maybe right ?
-      return None
-
-    if portal_type not in valid_portal_type_list:
-      # We will not be able to migrate ob to portal_type
-      #return ob.portal_type
-      return None
-
-    return portal_type
+    # Let us give a chance to getPropertyDictFromContent to
+    # tell us what the portal type of this document is
+    content_portal_type_list = document.getPropertyDictFromContent().get('portal_type', None)
+    if content_portal_type_list:
+      if isinstance(content_portal_type_list, str):
+        content_portal_type_list = [content_portal_type_list]
+      # Filter valid candidates
+      content_portal_type_list = filter(lambda x: x in portal_type_list, content_portal_type_list)
+      if content_portal_type_list:
+        # if at least one candidate remains, return the first one
+        # LOG('findTypeName from content', 0, content_portal_type_list[0])
+        return content_portal_type_list[0]
+
+    # If portal_type_list is not empty, return the first one
+    # LOG('findTypeName from first portal_type_list', 0, portal_type_list[0])
+    return portal_type_list[0]
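[NOTE: the resolution order above is: (1) keep the PUT_factory choice
when no other portal type shares its factory; (2) otherwise narrow the
candidates by file name rules, then (3) by content rules, and (4) fall
back to the first remaining candidate. A toy sketch of the narrowing
pattern (values are hypothetical, not ERP5 API):

    candidates = ['Text', 'Memo']         # same factory, not in registry
    from_file_name = ['Memo', 'Web Page'] # e.g. from file name rules
    narrowed = [t for t in from_file_name if t in candidates] or candidates
    print narrowed[0]                     # -> 'Memo'
]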
 
   security.declareProtected(Permissions.AddPortalContent, 'newContent')
   def newContent(self, id=None, portal_type=None, url=None, container=None,
@@ -209,28 +228,37 @@ class ContributionTool(BaseTool):
         del kw['file_name']
     else:
       # build a new file from the url
-      data = urllib2.urlopen(url).read()
+      url_file = urllib2.urlopen(url)
+      data = url_file.read() # a timeout must be set, or this can block for too long - XXX
       file = cStringIO.StringIO()
       file.write(data)
       file.seek(0)
+      # Create a file name based on the URL and quote it
       file_name = url.split('/')[-1] or url.split('/')[-2]
-      file_name = self._encodeURL(file_name)
-      if hasattr(file, 'headers'):
-        headers = file.headers
+      file_name = urllib.quote(file_name, safe='')
+      file_name = file_name.replace('%', '')
+      # For URLs, we want an id by default equal to the encoded URL
+      if id is None: id = self._encodeURL(url)
+      if hasattr(url_file, 'headers'):
+        headers = url_file.headers
         if hasattr(headers, 'type'):
           mime_type = headers.type
       kw['file'] = file
 
     # If the portal_type was provided, we can go faster
-    if portal_type is not None and portal_type != '':
-      # We know the portal_type, let us find the module
-      module = self.getDefaultModule(portal_type)
+    if portal_type and container is None:
+      # We know the portal_type, let us find the default module
+      # and use it as container
+      container = self.getDefaultModule(portal_type)
 
-      # And return a document
+    if portal_type and container is not None:
+      # We could simplify things here and return a document immediately
       # NOTE: we use the module ID generator rather than the provided ID
-      document = module.newContent(portal_type=portal_type, **kw)
-      if discover_metadata: document.discoverMetadata(file_name=file_name, user_login=user_login)
-      return document
+      #document = module.newContent(portal_type=portal_type, **kw)
+      #if discover_metadata:
+      #  document.activate().discoverMetadata(file_name=file_name, user_login=user_login)
+      #return document
+      pass # XXX - This needs to be implemented once the rest is stable
 
     # From here, there is no hope unless a file was provided
     if file is None:
@@ -239,6 +267,7 @@ class ContributionTool(BaseTool):
       # So we will simulate WebDAV to get an empty object
      # with PUT_factory - we provide the mime_type as
       # parameter
+      # LOG('new content', 0, "%s -- %s" % (file_name, mime_type))
       ob = self.PUT_factory(file_name, mime_type, None)
 
       # Raise an error if we could not guess the portal type
@@ -250,7 +279,6 @@ class ContributionTool(BaseTool):
     document = BaseTool._getOb(self, file_name)
 
     # Then edit the document contents (so that upload can happen)
-    kw.setdefault('source_reference', file_name) # XXX redundant with discoverMetadata
     document._edit(**kw)
     if url: document.fromURL(url)
 
@@ -260,15 +288,15 @@ class ContributionTool(BaseTool):
     # Move the document to where it belongs
     if container_path is not None:
       container = self.getPortalObject().restrictedTraverse(container_path)
-    document = self._setObject(file_name, ob, user_login=user_login, container=container, id=id)
+    document = self._setObject(file_name, ob, user_login=user_login,
+                               container=container, id=id, discover_metadata=discover_metadata)
     document = self._getOb(file_name) # Call _getOb to purge cache
 
     # Notify workflows
-    document.notifyWorkflowCreated()
+    #document.notifyWorkflowCreated()
 
     # Reindex it and return the document
     document.reindexObject()
-    if document.getCrawlingDepth() > 0: document.activate().crawlContent()
     return document
 
   security.declareProtected( Permissions.AddPortalContent, 'newXML' )
@@ -315,7 +343,7 @@ class ContributionTool(BaseTool):
     return property_dict
 
   # WebDAV virtual folder support
-  def _setObject(self, name, ob, user_login=None, container=None, id=None):
+  def _setObject(self, name, ob, user_login=None, container=None, id=None, discover_metadata=1):
     """
       The strategy is to let NullResource.PUT do everything as
       usual and at the last minute put the object in a different
@@ -368,11 +396,27 @@ class ContributionTool(BaseTool):
     else:
       new_id = id
     ob.id = new_id
-    module._setObject(new_id, ob)
-
-    # We can now discover metadata
-    document = module[new_id]
-    document.discoverMetadata(file_name=name, user_login=user_login)
+    existing_document = module.get(new_id, None)
+    if existing_document is None:
+      # There is no preexisting document - we can therefore
+      # set the new object
+      module._setObject(new_id, ob)
+      # We can now discover metadata
+      document = module[new_id]
+      if discover_metadata:
+        # Metadata discovery is done as an activity by default
+        # If we need to discoverMetadata synchronously, it must
+        # be for the user interface and should thus be handled by
+        # ZODB scripts
+        document.activate().discoverMetadata(file_name=name, user_login=user_login)
+    else:
+      document = existing_document
+      if document.isExternalDocument():
+        # If this is an external document, update its content
+        document.activate().updateContentFromURL()
+      else:
+        # This is where we may have to implement revision support
+        raise NotImplementedError
 
     # Keep the document close to us - this is only useful for
     # file upload from webdav
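[NOTE: with discover_metadata, metadata discovery now defaults to an
asynchronous activity. When the result is needed immediately (typically
for the user interface), the comment above suggests a synchronous call
from a Script (Python); a hedged sketch, with the file object and
arguments as placeholders:

    document = context.portal_contributions.newContent(
                          file=uploaded_file, discover_metadata=0)
    # Synchronous call - blocks until the properties are set
    document.discoverMetadata(file_name='Invoice-001-en.pdf',
                              user_login='jdoe')
]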
""" + # Produce an MD5 from the URL + hex_md5 = md5.md5(url).hexdigest() + # Take the first part in the URL which is not empty + # LOG("_encodeURL", 0, url) + url_segment = url.split(':')[1] + url_segment_list = url_segment.split('/') + url_domain = None + for url_part in url_segment_list: + if url_part: + url_domain = url_part + break + # Return encoded url + if url_domain: + url_domain = urllib.quote(url_domain, safe='') + url_domain = url_domain.replace('%', '') + return "%s-%s" % (url_domain, hex_md5) + return hex_md5 url = urllib.quote(url, safe='') url = url.replace('_', '__') url = url.replace('%', '_') return url security.declareProtected(Permissions.AddPortalContent, 'crawlContent') - def crawlContent(self, content): + def crawlContent(self, content, container=None): """ Analyses content and download linked pages @@ -485,12 +546,14 @@ class ContributionTool(BaseTool): base_url = content.getContentBaseURL() url_list = map(lambda url: self._normaliseURL(url, base_url), set(content.getContentURLList())) for url in set(url_list): + # LOG('trying to crawl', 0, url) # Some url protocols should not be crawled if url.split(':')[0] in no_crawl_protocol_list: continue - #if content.getParentValue() - # in place of not ? - container = content.getParentValue() + if container is None: + #if content.getParentValue() + # in place of not ? + container = content.getParentValue() # Calculate the id under which content will be stored id = self._encodeURL(url) # Try to access the document if it already exists @@ -499,6 +562,7 @@ class ContributionTool(BaseTool): # XXX - This call is not working due to missing group_method_id # therefore, multiple call happen in parallel and eventually fail # (the same URL is created multiple times) + # LOG('activate newContentFromURL', 0, url) self.activate(activity="SQLQueue").newContentFromURL(container_path=container.getRelativeUrl(), id=id, url=url, crawling_depth=depth - 1) else: @@ -506,28 +570,35 @@ class ContributionTool(BaseTool): new_depth = max(depth - 1, document.getCrawlingDepth()) document._setCrawlingDepth(new_depth) # And activate updateContentFromURL on existing document - next_date = document.getNextAlarmDate() - document.activate(at_date=next_date).updateContentFromURL() + next_date = document.getNextAlarmDate() # This should prevent doing the update too often + # LOG('activate updateContentFromURL', 0, url) + document.activate(at_date=next_date).updateContentFromURL(crawling_depth=depth - 1) security.declareProtected(Permissions.AddPortalContent, 'updateContentFromURL') - def updateContentFromURL(self, content): + def updateContentFromURL(self, content, repeat=MAX_REPEAT, crawling_depth=0): """ Updates an existing content. """ + # Step 0: update crawling_depth if required + if crawling_depth > content.getCrawlingDepth(): + content._setCrawlingDepth(crawling_depth) # Step 1: download new content - url = content.asURL() - data = urllib2.urlopen(url).read() - file = cStringIO.StringIO() - file.write(data) - file.seek(0) + try: + url = content.asURL() + data = urllib2.urlopen(url).read() + file = cStringIO.StringIO() + file.write(data) + file.seek(0) + except socket.error, msg: # repeat multiple times in case of socket error + content.updateContentFromURL(repeat=repeat - 1) # Step 2: compare and update if necessary (md5) # do here some md5 stuff to compare contents... 
     if 1:
-      content._edit(file=file)
+      # content._edit(file=file) # Commented for testing
       # Step 3: convert to base format
-      content.convertToBaseFormat()
+      # content.convertToBaseFormat() # Commented for testing
       # Step 4: activate populate (unless interaction workflow does it)
-      content.activate().populateContent()
+      # content.activate().populateContent() # Commented for testing
       # Step 5: activate crawlContent
       content.activate().crawlContent()
     else:
@@ -539,7 +610,7 @@ class ContributionTool(BaseTool):
       content.activate(at_date=next_date).updateContentFromURL()
 
   security.declareProtected(Permissions.AddPortalContent, 'newContentFromURL')
-  def newContentFromURL(self, **kw):
+  def newContentFromURL(self, container_path=None, id=None, repeat=MAX_REPEAT, **kw):
     """
       A wrapper method for newContent which provides extra safety
       in case of errors (ie. download, access, conflict, etc.).
 
       NOTE: implementation needs to be done.
     """
-    return self.newContent(**kw)
+    # First of all, make sure we do not try to create an existing document
+    if container_path is not None and id is not None:
+      container = self.restrictedTraverse(container_path)
+      document = container.get(id, None)
+      if document is not None:
+        # Document already exists: no need to keep on crawling
+        return
+    try:
+      document = self.newContent(container_path=container_path, id=id, **kw)
+      if document.getCrawlingDepth() > 0: document.activate().crawlContent()
+      document.activate(at_date=document.getNextAlarmDate()).updateContentFromURL()
+    except urllib2.HTTPError, error:
+      # Catch any HTTP error and retry later, a bounded number of times
+      if repeat > 0:
+        self.activate(at_date=DateTime() + 1).newContentFromURL(
+                        container_path=container_path, id=id,
+                        repeat=repeat - 1, **kw)
+    except urllib2.URLError, error:
+      if error.reason.args[0] == -3:
+        # Temporary failure in name resolution - try again in 1 day
+        if repeat > 0:
+          self.activate(at_date=DateTime() + 1).newContentFromURL(
+                          container_path=container_path, id=id,
+                          repeat=repeat - 1, **kw)
+      else:
+        # Unknown error - to be extended
+        raise
+    except:
+      # Pass exception to Zope (ex. conflict errors)
+      raise
 
 InitializeClass(ContributionTool)
\ No newline at end of file
-- 
2.30.9
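[NOTE: taken together, the crawling pipeline now reads:
newContentFromURL() downloads and stores a page (with bounded retries),
crawlContent() walks its links and activates newContentFromURL() for
each of them, and updateContentFromURL() periodically refreshes
existing documents. A hedged usage sketch - it assumes crawling_depth
is accepted as a document edit parameter, as getCrawlingDepth() and
_setCrawlingDepth() suggest:

    portal.portal_contributions.newContentFromURL(
        url='http://www.example.com/', crawling_depth=2)
    # Each crawled link is created with crawling_depth - 1, so the
    # crawl stops once the depth reaches zero.
]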