a complete refactoring

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@11921 20353a03-c40f-0410-a6d1-a30d3c3de9de

a complete refactoring
git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@11921 20353a03-c40f-0410-a6d1-a30d3c3de9de
183169f4 · Bartek Górny · c9630ac9 · 183169f4
Commit 183169f4 authored Jan 08, 2007 by Bartek Górny
Show whitespace changes
Inline Side-by-side

Showing with 519 additions and 182 deletions

product/ERP5/Document/Document.py product/ERP5/Document/Document.py +519 -182

No files found.
--- a/product/ERP5/Document/Document.py
+++ b/product/ERP5/Document/Document.py
@@ -26,23 +26,33 @@
 #
 ##############################################################################

-from AccessControl import ClassSecurityInfo
+from DateTime import DateTime
+from operator import add

+from AccessControl import ClassSecurityInfo
 from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
 from Products.ERP5Type.XMLObject import XMLObject
 from Products.ERP5Type.WebDAVSupport import TextContent
-from DateTime import DateTime
+from Products.ERP5Type.Message import Message
+
+_MARKER = []
+VALID_ORDER_KEY_LIST = ('user', 'content', 'file_name', 'input')

 def makeSortedTuple(kw):
  items = kw.items()
  items.sort()
  return tuple(items)

+
 class ConversionCacheMixin:
  """
    This class provides a generic API to store in the ZODB
    various converted versions of a file or of a string.

+    Versions are stored in dictionaries; the class stores also
+    generation time of every format and its mime-type string.
+    Format can be a string or a tuple (e.g. format, resolution).
+
    TODO:
    * Implement ZODB BLOB
  """
@@ -74,16 +84,23 @@ class ConversionCacheMixin:
    """
    return self._cached_data.has_key(makeSortedTuple(format))

+  security.declareProtected(Permissions.View, 'getCacheTime')
  def getCacheTime(self, **format):
    """
      Checks when if ever was the file produced
    """
    return self._cached_time.get(makeSortedTuple(format), 0)

+  security.declareProtected(Permissions.ModifyPortalContent, 'updateConversion')
  def updateConversion(self, **format):
      self._cached_time[makeSortedTuple(format)] = DateTime()

+  security.declareProtected(Permissions.ModifyPortalContent, 'setConversion')
  def setConversion(self, data, mime=None, **format):
+    """
+    Saves a version of the document in a given format; records mime type
+    and conversion time (which is right now).
+    """
    tformat = makeSortedTuple(format)
    if mime is not None:
      self._cached_mime[tformat] = mime
@@ -92,11 +109,17 @@ class ConversionCacheMixin:
      self.updateConversion(**format)
    self._p_changed = 1

+  security.declareProtected(Permissions.View, 'getConversion')
  def getConversion(self, **format):
-    '''
-    we could be much cooler here - pass testing and updating methods to this function
-    so that it does it all by itself; this'd eliminate the need for cacheSet public method
-    '''
+    """
+    Returns version of the document in a given format, if it has it; otherwise
+    returns empty string (the caller should check hasConversion before calling
+    this function.
+
+    (we could be much cooler here - pass testing and updating methods to this function
+    so that it does it all by itself; this'd eliminate the need for setConversion public method)
+    XXX-BG: I'm not sure now what I meant by this...
+    """
    tformat = makeSortedTuple(format)
    return self._cached_mime.get(tformat, ''), self._cached_data.get(tformat, '')

@@ -106,8 +129,6 @@ class ConversionCacheMixin:
    Get cache details as string (for debugging)
    """
    s = 'CACHE INFO:<br/><table><tr><td>format</td><td>size</td><td>time</td><td>is changed</td></tr>'
-    #self.log('getCacheInfo',self.cached_time)
-    #self.log('getCacheInfo',self.cached_data)
    for f in self._cached_time.keys():
      t = self._cached_time[f]
      data = self._cached_data.get(f)
@@ -125,6 +146,7 @@ class ConversionCacheMixin:
    s += '</table>'
    return s

+
 class Document(XMLObject):
  """
      Document is an abstract class with all methods
@@ -132,57 +154,94 @@ class Document(XMLObject):
      searchable text, explicit relations, implicit relations,
      metadata, versions, languages, etc.

-        There currently two types of Document subclasses:
+      There are currently two types of Document subclasses:

      * File for binary file based documents. File
        has subclasses such as Image, OOoDocument,
        PDFDocument, etc. to implement specific conversion
-          methods
+        methods.

      * TextDocument for text based documents. TextDocument
        has subclasses such as Wiki to implement specific
-          methods
+        methods.

      Document classes which implement conversion should use
-        the CachingMixin class so that converted values are
+      the ConversionCacheMixin class so that converted values are
      stored.

-        The Document class behaviour can be extended through scripts.
+      XXX IDEA - ISSUE: generic API for conversion.
+        converted_document = document.convert(...)
+
+      Instances can be created directly, or via portal_contributions tool
+      which manages document ingestion process whereby a file can be uploaded
+      by http or sent in by email or dropped in by webdav or in some other
+      way as yet unknown. The ingestion process has the following steps:
+
+      (1) portal type detection
+      (2) object creation and upload of data
+      (3) metadata discovery (optionally with conversion of data to another format)
+      (4) other possible actions
+
+      This class handles (3) and calls a ZMI script to do (4).
+
+      Metadata can be drawn from various sources:
+
+      input     -   data supplied with http request or set on the object during (2) (e.g.
+                    discovered from email text)
+      file_name -   data which might be encoded in file name
+      user_login-   information about user who is contributing the file
+      content   -   data which might be derived from document content

-        * Document_discoverMetadata (DMS_ingestFile)
-          finds all metadata or uses the metadata which was
-          provided as parameter. Document_discoverMetadata should
-          be overloaded if necessary for some classes
-          (ex. TextDocument_discoverMetadata, Image_discoverMetadata)
-          and should be called through a single API discoverMetadata()
-          Consider using _getTypeBasedMethod for implementation
+      If a certain property is defined in more than one source, it is set according to
+      preference order returned by a script 
+      Document_getPreferredDocumentMetadataDiscoveryOrderList (or type-based version).
+      Methods for discovering metadata are:
+        getPropertyDictFromInput
+        getPropertyDictFromFileName
+        getPropertyDictFromUserLogin
+        getPropertyDictFromContent

-        * Document_ingestFile (Document_uploadFile)
-          is called for http based ingestion and itself calls
-          Document_discoverMetadata. Many parameters may be
-          passed to Document_ingest through an
-          online form.
+      The Document class behaviour can be extended / customized through scripts
+      (which are type-based so can be adjusted per portal type).

-        * Document_ingestEmail is called for email based
-          ingestion and itself calls Document_ingestFile.
-          Document_ingestEmail is in charge of parsing email
-          to extract metadata before calling Document_ingestFile.
+      * Document_getFilenameParsingRegexp - returns a regular expression for extracting
+        properties encoded in file name

-        * PUT is called for DAV/FTP based ingestion directly from the class.
-          It itself calls Document_discoverMetadata.
+      * Document_getReferenceLookupRegexp - returns a regular expression for finding
+        references to documents within document text content

-        Custom scripts for automatic classification:
+      * Document_getPropertyListFromUser - finds a user (by user_login or from session)
+        and returns properties which should be set on the document

-        * Document_findWikiPredecessorList finds a list of documents
-          which are referencing us.
-          Should this be merged with WebSite_getDocumentValue ? XXX
+      * Document_getPropertyListFromContent - analyzes document content and returns
+        properties which should be set on the document
+
+      * Document_findImplicitSuccessor - finds appropriate version of a document
+        based on coordinates (which can be incomplete, depending if a document reference
+        found in text content contained version and/or language)
+
+      * Document_findImplicitPredecessorList - finds document predecessors based on
+        the document coordinates (can use only complete coordinates, or also partial)
+
+      * Document_getPreferredDocumentMetadataDiscoveryOrderList - returns an order
+        in which metadata should be set/overwritten
+
+      * Document_finishIngestion - called by portal_activities after all the ingestion
+        is completed (and after document has been converted, so text_content
+        is available if the document has it)
+
+      * Document_getNewRevisionNumber - calculates revision number which should be set
+        on this document. Implementation depends on revision numbering policy which
+        can be very different. Interaction workflow should call setNewRevision method.

-        * Document_findWikiSuccessor tries to find a document matching with
-          a given regexp.
-          Should this be merged with WebSite_getDocumentValue ? XXX

      Subcontent: documents may include subcontent (files, images, etc.)
      so that publication of rich content can be path independent.
+
+    Consistency checking:
+      Default implementation uses DocumentReferenceConstraint to check if the 
+      reference/language/version triplet is unique. Additional constraints
+      can be added if necessary.
  """

  meta_type = 'ERP5 Document'
@@ -209,101 +268,379 @@ class Document(XMLObject):
  __implements__ = ()

  searchable_property_list = ('title', 'description', 'id', 'reference',
-                                'version', 'short_title', 'keywords',
+                              'version', 'short_title', 'keyword',
                              'subject', 'source_reference', 'source_project_title')
-                                # What is keywords ?
-                                # XXX-JPS This is a plural
-                                # XXX-JPS subject_list would be better than subject in this case
-                                # and the getSearchableText should be able to process lists
-                                # Same for source_reference_list, source_project_title_list
+

  ### Content indexing methods
  security.declareProtected(Permissions.View, 'getSearchableText')
  def getSearchableText(self, md=None):
    """
    Used by the catalog for basic full text indexing.
+    Uses searchable_property_list attribute to put together various properties
+    of the document into one searchable text string.

    XXX-JPS - This method is nice. It should probably be moved to Base class
    searchable_property_list could become a standard class attribute.

    TODO (future): Make this property a per portal type property.
    """
-      searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',self.searchable_property_list))
+    def getPropertyListOrValue(property):
+      """
+      we try to get a list, else we get value and convert to list
+      """
+      val = self.getPropertyList(property)
+      if val is None:
+        val = self.getProperty(property)
+        if val is not None and val != '':
+          val=[val]
+      return val
+        
+    searchable_text = reduce(add, map(lambda x: self.getPropertyListOrValue(x) or ' ',
+                                                self.searchable_property_list))
    return searchable_text

  # Compatibility with CMF Catalog
  SearchableText = getSearchableText # XXX-JPS - Here wa have a security issue - ask seb what to do

-    security.declareProtected(Permissions.ModifyPortalContent, 'setPropertyListFromFilename')
-    def setPropertyListFromFilename(self, fname):
-      """
-        XXX-JPS missing description
-      """
-      rx_src = self.portal_preferences.getPreferredDocumentFilenameRegexp()
-      if rx_src:
-        rx_parse = re.compile()
-        if rx_parse is None:
-          self.setReference(fname) # XXX-JPS please use _setReference to prevent reindexing all the time
-          return
-        m = rx_parse.match(fname)
-        if m is None:
-          self.setReference(fname) # XXX-JPS please use _setReference to prevent reindexing all the time
-          return
-        for k,v in m.groupdict().items():
-          self.setProperty(k,v) # XXX-JPS please use _setProperty to prevent reindexing all the time
-        # XXX-JPS finally call self.reindexObject()
-      else:
-        # If no regexp defined, we use the file name as reference
-        # this is the failover behaviour
-        self.setReference(fname)
-  
-    security.declareProtected(Permissions.View, 'getWikiSuccessorReferenceList')
-    def getWikiSuccessorReferenceList(self):
+  ### Relation getters
+  def _getImplicitSuccessorReferenceList(self):
    """
-        find references in text_content, return matches
+      Private Implementation Method
+      
+      Find references in text_content, return matches
      with this we can then find objects
+      The reference regexp defined in Document_getFilenameParsingRegexp should 
+      contain named groups (usually reference, version, language)
+      which make keys of the dictionary returned by this function
+      This function returns a list of dictionaries.
    """
+    if getattr(self,'getTextContent',_MARKER) is _MARKER:
+      return []
    if self.getTextContent() is None:
      return []
-      rx_search = re.compile(self.portal_preferences.getPreferredDocumentReferenceRegexp()) # XXX-JPS Safe ? Better error required ?
    try:
-        res = rx_search.finditer(self.getTextContent())
-      except AttributeError:
+      method = self._getTypeBasedMethod('getReferenceLookupRegexp', 
+          fallback_script_id = 'Document_getReferenceLookupRegexp')
+      rx_search = method()
+    except TypeError: # no regexp in preference
+      self.log('please set document reference regexp in preferences')
      return []
+    res = rx_search.finditer(self.getTextContent())
    res = [(r.group(),r.groupdict()) for r in res]
    return res

-    security.declareProtected(Permissions.View, 'getWikiSuccessorValueList')
-    def getWikiSuccessorValueList(self):
+  security.declareProtected(Permissions.View, 'getImplicitSuccessorValueList')
+  def getImplicitSuccessorValueList(self):
    """
-        XXX-JPS Put a description then add notes (notes only is not enough)
-        
-        getWikiSuccessorValueList - the way to find objects is on 
-        implementation level
+    Find objects which we are referencing (if our text_content contains
+    references of other documents). The actual search is delegated to
+    Document_findImplicitSuccessor script. We can use only complete coordinate
+    triplets (reference-version-language) or also partial (e.g. reference only).
+    Normally, Document_findImplicitSuccessor would use getLatestVersionValue to
+    return only the most recent/relevant version.
    """
    # XXX results should be cached as volatile attributes
    # XXX-JPS - Please use TransactionCache in ERP5Type for this
    # TransactionCache does all the work for you
    lst = []
-      for ref in self.getWikiSuccessorReferenceList():
+    for ref in self._getImplicitSuccessorReferenceList():
      r = ref[1]
-        res = self.Document_findWikiSuccessor(**r)
+      res = self.Document_findImplicitSuccessor(**r)
      if len(res)>0:
        lst.append(res[0].getObject())
    return lst

-    security.declareProtected(Permissions.View, 'getWikiPredecessorValueList')
-    def getWikiPredecessorValueList(self):
+  security.declareProtected(Permissions.View, 'getImplicitPredecessorValueList')
+  def getImplicitPredecessorValueList(self):
    """
-        XXX-JPS Put a description then add notes (notes only is not enough)
+      This function tries to find document which are referencing us - by reference only, or
+      by reference/language etc.
+      Uses customizeable script Document_findImplicitPredecessorList.
      
-        it is mostly implementation level - depends on what parameters we use to identify
+      It is mostly implementation level - depends on what parameters we use to identify
      document, and on how a doc must reference me to be my predecessor (reference only,
      or with a language, etc
    """
    # XXX results should be cached as volatile attributes
-      lst = self.Document_findWikiPredecessorList()
+    method = self._getTypeBasedMethod('findImplicitPredecessorList', 
+        fallback_script_id = 'Document_findImplicitPredecessorList')
+    lst = method()
    lst = [r.getObject() for r in lst]
    di = dict.fromkeys(lst) # make it unique
    ref = self.getReference()
    return [o for o in di.keys() if o.getReference() != ref] # every object has its own reference in SearchableText
+
+  security.declareProtected(Permissions.View, 'getImplicitSimilarValueList')
+  def getImplicitSimilarValueList(self):
+    """
+      Analyses content of documents to find out by the content which documents
+      are similar. Not implemented yet. 
+
+      No cloud needed because transitive process
+    """
+    return []
+
+  security.declareProtected(Permissions.View, 'getSimilarCloudValueList')
+  def getSimilarCloudValueList(self):
+    """
+      Returns all documents which are similar to us, directly or indirectly, and
+      in both directions. In other words, it is a transitive closure of similar 
+      relation. Every document is returned in the latest version available.
+    """
+    lista = {}
+    depth = int(depth)
+
+    gettername = 'get%sValueList' % upperCase(category)
+    relatedgettername = 'get%sRelatedValueList' % upperCase(category)
+
+    def getRelatedList(self, level=0):
+      level += 1
+      getter = getattr(self, gettername)
+      relatedgetter = getattr(self, relatedgettername)
+      res = getter() + relatedgetter()
+      for r in res:
+        if lista.get(r) is None:
+          lista[r] = True # we use dict keys to ensure uniqueness
+        if level != depth:
+          getRelatedList(r, level)
+
+    getRelatedList(context)
+    lista_latest = {}
+    for o in lista.keys():
+      lista_latest[o.getLatestVersionValue()] = True # get latest versions avoiding duplicates again
+    if lista_latest.has_key(context): lista_latest.pop(context) # remove this document
+    if lista_latest.has_key(context.getLatestVersionValue()): lista_latest.pop(contextLatestVersionValue()) # remove this document
+
+    return lista_latest.keys()
+
+
+  ### Version and language getters
+  security.declareProtected(Permissions.View, 'getLatestVersionValue')
+  def getLatestVersionValue(self, language=None):
+    """
+    Tries to find the latest version with the latest revions
+    of self which the current user is allowed to access.
+
+    If language is provided, return the latest document
+    in the language.
+
+    If language is not provided, return the latest version
+    in any language or in the user language if the version is
+    the same.
+    """
+    # User portal_catalog
+    pass
+
+  security.declareProtected(Permissions.View, 'getVersionValueList')
+  def getVersionValueList(self, version=None, language=None):
+    """
+      Returns a list of documents with same reference, same portal_type
+      but different version and given language or any language if not given.
+    """
+    # User portal_catalog
+    pass
+
+  security.declareProtected(Permissions.View, 'isVersionUnique')
+  def isVersionUnique(self):
+    """
+      Returns true if no other document has the same version and language
+    """
+    # User portal_catalog
+    pass
+
+  security.declareProtected(Permissions.View, 'getLatestRevisionValue')
+  def getLatestRevisionValue(self):
+    """
+      Returns the latest revision of ourselves
+    """
+    # User portal_catalog
+    pass
+
+  security.declareProtected(Permissions.View, 'getRevisionValueList')
+  def getRevisionValueList(self):
+    """
+      Returns a list revision strings for a given reference, version, language
+    """
+    # User portal_catalog
+    pass
+  
+  security.declareProtected(Permissions.ModifyPortalContent, 'setNewRevision')
+  def setNewRevision(self):
+    """
+      Set a new revision number automatically
+      Delegates to ZMI script because revision numbering policies can be different.
+      Should be called by interaction workflow upon appropriate action.
+    """
+    # User portal_catalog without security
+    method = self._getTypeBasedMethod('getNewRevisionNumber', 
+        fallback_script_id = 'Document_getNewRevisionNumber')
+    new_rev = method()
+    self.setRevision(new_rev)
+  
+  security.declareProtected(Permissions.View, 'getLanguageList')
+  def getLanguageList(self, version=None):
+    """
+      Returns a list of languages which this document is available in
+      for the current user.
+    """
+    # User portal_catalog
+    pass
+
+  security.declareProtected(Permissions.View, 'getOriginalLanguage')
+  def getOriginalLanguage(self):
+    """
+      Returns the original language of this document.
+    """
+    # Approach 1: use portal_catalog and creation dates
+    # Approach 2: use workflow analysis (delegate to script if necessary)
+    #             workflow analysis is the only way for multiple orginals
+    # XXX - cache or set?
+    pass
+
+  ### Property getters
+  # Property Getters are document dependent so that we can
+  # handle the weird cases in which needed properties change with the type of document
+  # and the usual cases in which accessing content changes with the meta type
+  security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromUserLogin')
+  def getPropertyDictFromUserLogin(self, user_login):
+    """
+      Based on the user_login, find out as many properties as needed.
+      returns properties which should be set on the document
+    """
+    if user_login is None:
+      user_login = self.portal_something.getUserLogin()
+    return self._getTypeBasedMethod('getPropertyDictFromUserLogin',
+        fallback_script_id='Document_getPropertyDictFromUserLogin')
+
+  security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromContent')
+  def getPropertyDictFromContent(self):
+    """
+      Based on the document content, find out as many properties as needed.
+      returns properties which should be set on the document
+    """
+    return self._getTypeBasedMethod('getPropertyDictFromContent',
+        fallback_script_id='Document_getPropertyDictFromContent')
+
+  security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromFileName')
+  def getPropertyDictFromFileName(self, file_name):
+    """
+      Based on the file name, find out as many properties as needed.
+      returns properties which should be set on the document
+    """
+    return self.portal_contributions.getPropertyDictFromFileName(file_name)
+
+  security.declareProtected(Permissions.ModifyPortalContent,'getPropertyDictFromInput')
+  def getPropertyDictFromInput(self):
+    """
+      Get properties which were supplied explicitly to the ingestion method
+      (discovered or supplied before the document was created).
+    """
+    if hasattr(self, '_backup_input'):
+      return getattr(self, '_backup_input')
+    kw = {}
+    for id in self.propertyIds():
+      # We should not consider file data
+      if id is not 'data' and self.hasProperty(id):
+        kw[id] = self.getProperty(id)
+    self._backup_input = kw # We could use volatile and pass kw in activate
+                            # if we are garanteed that _backup_input does not
+                            # disappear within a given transaction
+    return kw
+
+  ### Metadata disovery and ingestion methods
+  security.declareProtected(Permissions.ModifyPortalContent, 'discoverMetadata')
+  def discoverMetadata(self, file_name=None, user_login=None):
+    """
+    This is the main metadata discovery function - controls the process
+    of discovering data from various sources. The discovery itself is
+    delegated to scripts or uses preferences-configurable regexps.
+
+    file_name - this parameter is a file name of the form "AA-BBB-CCC-223-en"
+
+    user_login - this is a login string of a person; can be None if the user is
+      currently logged in, then we'll get him from session
+    """
+
+    # Get the order
+    # Preference is made of a sequence of 'user_login', 'content', 'file_name', 'input'
+    method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList', 
+        fallback_script_id = 'Document_getPreferredDocumentMetadataDiscoveryOrderList')
+    order_list = method()
+
+    # Start with everything until content
+    content_index = order_list.index('content')
+
+    # XXX should be done in the reverse order
+    # Start with everything until content - build a dictionnary according to the order
+    kw = {}
+    for order_id in order_list[0:content_index-1]:
+      if order_id not in VALID_ORDER_KEY_LIST:
+        # Prevent security attack or bad preferences
+        raise AttributeError, "explain what..."
+      method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
+      method = getattr(self, method_id)
+      if order_id == 'file_name':
+        result = method(file_name)
+      elif order_id == 'user_login':
+        result = method(file_name)
+      else:
+        result = method()
+      kw.update(result)
+      
+    # Edit content
+    self.edit(kw)
+
+    # Finish in second stage
+    self.activate().finishMetadataDiscovery()
+    
+  security.declareProtected(Permissions.ModifyPortalContent, 'finishMetadataDiscovery')
+  def finishMetadataDiscovery(self):
+    """
+    This is called by portal_activities, to leave time-consuming procedures
+    for later. It converts the OOoDocument (later maybe some other formats) and
+    does things that can be done only after it is converted).
+    """
+    # Get the order from preferences
+    # Preference is made of a sequence of 'user_login', 'content', 'file_name', 'input'
+    method = self._getTypeBasedMethod('getPreferredDocumentMetadataDiscoveryOrderList', 
+        fallback_script_id = 'Document_getPreferredDocumentMetadataDiscoveryOrderList')
+    order_list = method()
+
+    # Start with everything until content
+    content_index = order_list.index('content')
+
+    # Start with everything until content - build a dictionnary according to the order
+    kw = {}
+    for order_id in order_list[content_index:]:
+      if order_id not in VALID_ORDER_KEY_LIST:
+        # Prevent security attack or bad preferences
+        raise AttributeError, "explain what..."
+      method_id = 'getPropertyDictFrom%s' % convertToUpperCase(order_id)
+      method = getattr(self, method_id)
+      if order_id == 'file_name':
+        result = method(file_name)
+      elif order_id == 'user_login':
+        result = method(file_name)
+      else:
+        result = method()
+      kw.update(result)
+      
+    # Edit content
+    self.edit(kw)
+
+    # Erase backup attributes
+    delattr(self, '_backup_input')
+
+    # Finish ingestion by calling method
+    self.finishIngestion()
+
+  security.declareProtected(Permissions.ModifyPortalContent, 'finishIngestion')
+  def finishIngestion(self):
+    """
+      Finish the ingestion process by calling the appropriate script
+    """
+    return self._getTypeBasedMethod('finishIngestion',
+        fallback_script_id='Document_finishIngestion')
+
+# vim: filetype=python syntax=python shiftwidth=2