# -*- coding: utf-8 -*-
##############################################################################
#
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
#                    Ivan Tyagov <ivan@nexedi.com>
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

from AccessControl import ClassSecurityInfo, getSecurityManager
from Products.ERP5Type import Permissions
from Products.ERP5Type.Utils import convertToUpperCase
from Products.CMFCore.utils import getToolByName
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
import os
import re

try:
  import magic
except ImportError:
  # python-magic is optional; getContentTypeFromContent returns None
  # when it is not installed.
  magic = None

# Only these identifiers may appear in the metadata discovery order list;
# each one maps to a getPropertyDictFrom<UpperCasedKey> method on the mixin.
VALID_ORDER_KEY_LIST = ('user_login', 'content', 'filename', 'input')

# Key ("format") under which the content information is stored in the
# conversion cache.
CONTENT_INFORMATION_FORMAT = '_idiscoverable_content_information'


class DiscoverableMixin(CachedConvertableMixin):
  """
  Implements IDiscoverable

  This class provides methods useful for metadata extraction.
  It inherits from CachedConvertableMixin to access the cache storage API,
  as computed data needs to be stored in the same backend.
  """

  security = ClassSecurityInfo()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromUserLogin')
  def getPropertyDictFromUserLogin(self, user_login=None):
    """
    Based on the user_login, find out as many properties as needed.
    Returns properties which should be set on the document.

    user_login -- login string; when None, the string form of the user
                  returned by the current security manager is used.

    The lookup itself is delegated to the type-based method
    'getPropertyDictFromUserLogin' (fallback script
    'Document_getPropertyDictFromUserLogin').
    """
    if user_login is None:
      user_login = str(getSecurityManager().getUser())
    method = self._getTypeBasedMethod(
      'getPropertyDictFromUserLogin',
      fallback_script_id='Document_getPropertyDictFromUserLogin')
    return method(user_login)

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromContent')
  def getPropertyDictFromContent(self):
    """
    Based on the document content, find out as many properties as needed.
    Returns properties which should be set on the document.

    Returns an empty dict when the document has no content; otherwise
    returns the result of the type-based method
    'getPropertyDictFromContent' (fallback script
    'Document_getPropertyDictFromContent').
    """
    # Access data through convert
    _mime, content = self.convert(None)
    if not content:
      # If the document is empty, we will not find anything in its content
      return {}
    method = self._getTypeBasedMethod(
      'getPropertyDictFromContent',
      fallback_script_id='Document_getPropertyDictFromContent')
    return method()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromFilename')
  def getPropertyDictFromFilename(self, filename):
    """
    Based on the file name, find out as many properties as needed.
    Returns properties which should be set on the document.

    The work is delegated to
    portal_contributions.getPropertyDictFromFilename.
    """
    return self.portal_contributions.getPropertyDictFromFilename(filename)

  # Alias kept under the alternative "FileName" spelling.
  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromFileName')
  getPropertyDictFromFileName = getPropertyDictFromFilename

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getPropertyDictFromInput')
  def getPropertyDictFromInput(self, input_parameter_dict):
    """
    Pass input_parameter_dict to the type-based method
    'getPropertyDictFromInput' and return its result.

    NOTE(review): unlike the sibling methods, no fallback_script_id is
    given here; what _getTypeBasedMethod returns when no type-based
    script exists is not visible from this file -- to be confirmed.
    """
    method = self._getTypeBasedMethod('getPropertyDictFromInput')
    return method(input_parameter_dict)

  ### Metadata discovery and ingestion methods
  security.declareProtected(Permissions.ModifyPortalContent,
                            'discoverMetadata')
  def discoverMetadata(self, filename=None, user_login=None,
                       input_parameter_dict=None):
    """
    This is the main metadata discovery function - controls the process
    of discovering data from various sources. The discovery itself is
    delegated to scripts or uses preference-configurable regexps. The
    method returns either self or the document which has been
    merged in the discovery process.

    filename - this parameter is a file name of the form "AA-BBB-CCC-223-en"

    user_login - this is a login string of a person; can be None if the user
                 is currently logged in, then we'll get him from session

    input_parameter_dict - arguments provided to Create this content by user.

    Raises AttributeError when the preferred order list contains a key
    which is not in VALID_ORDER_KEY_LIST.
    """
    # Preference is made of a sequence of
    # 'user_login', 'content', 'filename', 'input'
    method = self._getTypeBasedMethod(
      'getPreferredDocumentMetadataDiscoveryOrderList')
    order_list = list(method())
    # The list is walked in reverse so that values found by sources listed
    # earlier in the preference overwrite those found by later ones.
    order_list.reverse()
    # Build a dictionary according to the order
    kw = {}
    for order_id in order_list:
      result = None
      if order_id not in VALID_ORDER_KEY_LIST:
        # Prevent security attack or bad preferences
        raise AttributeError(
          "%s is not in valid order key list" % order_id)
      method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
      method = getattr(self, method_id)
      # Sources which need an argument are skipped when it was not provided
      if order_id == 'filename':
        if filename is not None:
          result = method(filename)
      elif order_id == 'user_login':
        if user_login is not None:
          result = method(user_login)
      elif order_id == 'input':
        if input_parameter_dict is not None:
          result = method(input_parameter_dict)
      else:
        result = method()
      if result is not None:
        for key, value in result.items():
          # Empty values must not erase what another source discovered
          if value not in (None, ''):
            kw[key] = value
    # Prepare the content edit parameters
    portal_type = kw.pop('portal_type', None)
    if portal_type and portal_type != self.getPortalType():
      # Reingestion is required to update portal_type
      return self.migratePortalType(portal_type)
    # Try not to invoke an automatic transition here
    self._edit(**kw)
    if not portal_type:
      # If no portal_type was discovered, pass self
      # through to portal_contribution_registry
      # to guess destination portal_type against all properties.
      # If returned portal_type is different, then reingest.
      registry = getToolByName(self.getPortalObject(),
                               'portal_contribution_registry')
      portal_type = registry.findPortalTypeName(context=self)
      if portal_type != self.getPortalType():
        return self.migratePortalType(portal_type)
    # Finish ingestion by calling method
    self.finishIngestion() # XXX - is this really the right place ?
    self.reindexObject() # XXX - is this really the right place ?
    # Revision merge is tightly coupled
    # to metadata discovery - refer to the documentation of mergeRevision method
    merged_doc = self.mergeRevision() # XXX - is this really the right place ?
    merged_doc.reindexObject() # XXX - is this really the right place ?
    return merged_doc # XXX - is this really the right place ?

  security.declareProtected(Permissions.ModifyPortalContent,
                            'finishIngestion')
  def finishIngestion(self):
    """
    Finish the ingestion process by calling the appropriate script. This
    script can for example allocate a reference number automatically if
    no reference was defined.

    The script is the type-based method 'finishIngestion' (fallback
    script 'Document_finishIngestion'); its result is returned.
    """
    method = self._getTypeBasedMethod(
      'finishIngestion',
      fallback_script_id='Document_finishIngestion')
    return method()

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentTypeFromContent')
  def getContentTypeFromContent(self):
    """
    Return content_type read from metadata extraction of content.
    This method is called by portal_contribution_registry

    Returns None when the document has no content or when the optional
    'magic' module could not be imported.
    """
    _mime, content = self.convert(None)
    if not content:
      return None
    if magic is None:
      return None
    # This will be delegated soon to external web service
    # like cloudooo
    # ERP5 will no longer handle data itself.
    mimedetector = magic.Magic(mime=True)
    return mimedetector.from_buffer(content)

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getExtensionFromFilename')
  def getExtensionFromFilename(self, filename=None):
    """
    Return extension read from filename in lower case.

    filename -- file name to inspect; when empty or None, the value of
                getStandardFilename() is used instead.

    The leading dot is removed. When the file name has no extension the
    empty string produced by os.path.splitext is returned.
    """
    if not filename:
      filename = self.getStandardFilename()
    _basename, extension = os.path.splitext(filename)
    if extension:
      extension = extension[1:].lower() # remove first dot
    return extension

  security.declareProtected(Permissions.AccessContentsInformation,
                            'getContentInformation')
  def getContentInformation(self):
    """
    Call private implementation, then store the result in conversion
    cache storage.

    The cached value is returned when present; a KeyError raised by
    getConversion is treated as a cache miss.
    """
    # XXX: how to know whether an instance implements an interface?
    conversion_format = CONTENT_INFORMATION_FORMAT
    try:
      _mime, cached_value = self.getConversion(format=conversion_format)
    except KeyError:
      # Cache miss: compute the value and store it for next time
      value = self._getContentInformation()
      self.setConversion(value, format=conversion_format)
      return value
    return cached_value

  def _getContentInformation(self):
    """
    Returns the content information from the HTML conversion.
    The default implementation tries to build a dictionary
    from the HTML conversion of the document and extract
    the document title.

    Returns an empty dict when there is no HTML conversion or when
    self.title_parser finds no match; otherwise a dict whose 'title' key
    holds the first match.
    """
    result = {}
    html = self.asEntireHTML()
    if not html:
      return result
    title_list = re.findall(self.title_parser, str(html))
    if title_list:
      result['title'] = title_list[0]
    return result