Commit ceca5d61 authored by Mame Coumba Sall's avatar Mame Coumba Sall

-Remove Conversion API in Document.py as it is now in mixin/convertable

-Modified to include methods that return the list of allowed target items
for conversion


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@34092 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 6993cc96
......@@ -56,6 +56,12 @@ from Products.PythonScripts.Utility import allow_class
# Mixin Import
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
from Products.ERP5.mixin.convertable import ConvertableMixin
from Products.ERP5.mixin.text_convertable import TextConvertableMixin
from Products.ERP5.mixin.base_convertable import BaseConvertableMixin
from Products.ERP5.mixin.html_convertable import HTMLConvertableMixin
from Products.ERP5.mixin.metadata_discoverable import MetadataDiscoverableMixin
from Products.ERP5.mixin.document import DocumentMixin
_MARKER = []
VALID_ORDER_KEY_LIST = ('user_login', 'content', 'file_name', 'input')
......@@ -306,7 +312,8 @@ class UpdateMixIn:
return method()
class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, SnapshotMixin, UpdateMixIn):
class Document(PermanentURLMixIn, XMLObject, UrlMixIn, ConvertableMixin, TextConvertableMixin,HTMLConvertableMixin,
DocumentMixin, BaseConvertableMixin, MetadataDiscoverableMixin, CachedConvertableMixin, SnapshotMixin, UpdateMixIn):
"""Document is an abstract class with all methods related to document
management in ERP5. This includes searchable text, explicit relations,
implicit relations, metadata, versions, languages, etc.
......@@ -1077,267 +1084,6 @@ class Document(PermanentURLMixIn, XMLObject, UrlMixIn, CachedConvertableMixin, S
method = self._getTypeBasedMethod('finishIngestion', fallback_script_id='Document_finishIngestion')
return method()
# Conversion methods
security.declareProtected(Permissions.AccessContentsInformation, 'convert')
def convert(self, format, **kw):
  """
  Main content conversion function. Returns the result which
  should be returned and stored in cache.

  format -- the target format specified in the form of an
            extension string (ex. jpeg, html, text, txt, etc.)
  **kw   -- can be various things - e.g. resolution. They are
            not used by this default implementation.

  The default implementation returns a (mime type, data) tuple
  with empty data for html and plain text targets, and raises
  NotImplementedError for any other format.

  TODO:
  - implement guards API so that conversion to certain
    formats require certain permission
  """
  if format in ('text', 'txt'):
    return 'text/plain', '' # XXX - Why ?
  if format == 'html':
    return 'text/html', '' # XXX - Why ?
  raise NotImplementedError
security.declareProtected(Permissions.View, 'asSubjectText')
def asSubjectText(self, **kw):
  """
  Returns the subject of the document as a string.

  When the subject is empty the title is used instead, and an
  empty string is returned if the title is None.
  """
  # XXX not sure if the fallback on title is a good idea.
  subject = self.getSubject() or self.getTitle()
  if subject is None:
    return ''
  return str(subject)
security.declareProtected(Permissions.View, 'asText')
def asText(self, **kw):
  """
  Returns the content of the document as a string, obtained by
  calling convert() with the 'txt' format. Any 'format' passed
  by the caller is overridden.
  """
  conversion_kw = dict(kw, format='txt')
  _, text = self.convert(**conversion_kw)
  return str(text)
security.declareProtected(Permissions.View, 'asEntireHTML')
def asEntireHTML(self, **kw):
"""
Returns a complete HTML representation of the document
(with body tags, etc.). Adds if necessary a base
tag so that the document can be displayed in an iframe
or standalone.
Actual conversion is delegated to _asHTML, which receives
all keyword arguments unchanged.

When getUrlString() returns a true value, the HTML is turned
into a string, a <base href="..."> tag built from
getContentBaseURL() is inserted right after '<head>' if the
HTML does not contain '<base' yet, and the result is stored
with setConversion under the 'base-html' format.

NOTE(review): indentation was lost in this listing; confirm
whether setConversion is called whenever a URL is defined or
only when a base tag has just been inserted.
"""
html = self._asHTML(**kw)
if self.getUrlString():
# If a URL is defined, add the base tag
# if no base tag is defined yet.
html = str(html)
# True when '<base' does not occur anywhere in the HTML
if not html.find('<base') >= 0:
base = '<base href="%s">' % self.getContentBaseURL()
# Only a literal '<head>' (no attributes) is matched here
html = html.replace('<head>', '<head>%s' % base)
self.setConversion(html, mime='text/html', format='base-html')
return html
security.declarePrivate('_asHTML')
def _asHTML(self, **kw):
"""
A private method which converts to HTML. This method
is the one to override in subclasses.
"""
if not self.hasBaseData():
raise ConversionError('This document has not been processed yet.')
try:
# FIXME: no substitution may occur in this case.
mime, data = self.getConversion(format='base-html')
return data
except KeyError:
kw['format'] = 'html'
mime, html = self.convert(**kw)
return html
security.declareProtected(Permissions.View, 'asStrippedHTML')
def asStrippedHTML(self, **kw):
  """
  Returns a stripped HTML representation of the document
  (without html and body tags, etc.) which can be used to inline
  a preview of the document.

  Returns an empty string if the document has no base data.
  Returns the stored 'stripped-html' conversion when there is
  one, otherwise the HTML conversion passed through _stripHTML.
  """
  if not self.hasBaseData():
    return ''
  try:
    # FIXME: no substitution may occur in this case.
    mime, stripped_html = self.getConversion(format='stripped-html')
  except KeyError:
    kw['format'] = 'html'
    mime, entire_html = self.convert(**kw)
    stripped_html = self._stripHTML(str(entire_html))
  return stripped_html
def _guessEncoding(self, string):
"""
Try to guess the encoding for this string.
Returns None if no encoding can be guessed.
"""
try:
import chardet
except ImportError:
return None
return chardet.detect(string).get('encoding', None)
def _stripHTML(self, html, charset=None):
"""
A private method which can be reused by subclasses
to strip HTML content
"""
body_list = re.findall(self.body_parser, str(html))
if len(body_list):
stripped_html = body_list[0]
else:
stripped_html = html
# find charset and convert to utf-8
charset_list = self.charset_parser.findall(str(html)) # XXX - Not efficient if this
# is datastream instance but hard to do better
if charset and not charset_list:
# Use optional parameter is we can not find encoding in HTML
charset_list = [charset]
if charset_list and charset_list[0] not in ('utf-8', 'UTF-8'):
try:
stripped_html = unicode(str(stripped_html),
charset_list[0]).encode('utf-8')
except (UnicodeDecodeError, LookupError):
return str(stripped_html)
return stripped_html
def _safeHTML(self, html, format='text/x-html-safe', charset=None):
  """
  A private method to strip HTML content in safe mode,
  without embedded javascript, forms and any external plugin imports.
  This should be used when we do not trust the user (Anonymous)
  who pushes data into the database.

  - html: content to strip
  - format: destination format
  - charset: charset used to encode string. Takes precedence
    over charset values found in the html string

  Returns the result of portal_transforms.convertToData for the
  requested format.
  """
  portal = self.getPortalObject()
  if charset is None:
    # find charset
    charset_list = self.charset_parser.findall(html)
    if charset_list:
      charset = charset_list[0]
  if charset and charset not in ('utf-8', 'UTF-8'):
    try:
      # Re-encode the content itself: the transform below is told that
      # the data is utf-8, so it must really receive utf-8 data.
      html = html.decode(charset).encode('utf-8')
    except (UnicodeDecodeError, LookupError):
      # Best effort: keep the original data and the declared charset
      pass
    else:
      charset = 'utf-8' # Override charset if conversion succeeds
  transform_tool = getToolByName(portal, 'portal_transforms')
  return transform_tool.convertToData(format, html, encoding=charset)
security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
def getContentInformation(self):
  """
  Returns the content information from the HTML conversion.

  The default implementation builds a dictionary from the result
  of asEntireHTML: it holds a 'title' entry with the first match
  of self.title_parser, or is empty when there is no HTML or no
  match.
  """
  html = self.asEntireHTML()
  if html:
    title_list = re.findall(self.title_parser, str(html))
    if title_list:
      return {'title': title_list[0]}
  return {}
# Base format support
security.declareProtected(Permissions.ModifyPortalContent, 'convertToBaseFormat')
def convertToBaseFormat(self, **kw):
"""
Converts the content of the document to a base format
which is later used for all conversions. This method
is common to all kinds of documents and handles
exceptions in a unified way.
Implementation is delegated to _convertToBaseFormat which
must be overloaded by subclasses of Document which
need a base format.
convertToBaseFormat is called upon file upload, document
ingestion by the processing_status_workflow.
NOTE: the data of the base format conversion should be stored
using the base_data property. Refer to Document.py propertysheet.
Use accessors (getBaseData, setBaseData, hasBaseData, etc.)

Returns None if the document defines hasData() and it is false,
an empty string if _convertToBaseFormat raises
NotImplementedError, and otherwise the message returned by
_convertToBaseFormat (or a default 'Converted to ...' message
when that one is None). The keyword arguments are not used.
"""
# Documents which do not define hasData at all are always processed
if getattr(self, 'hasData', None) is not None and not self.hasData():
# Empty document cannot be converted
return
try:
message = self._convertToBaseFormat() # Call implementation method
self.clearConversionCache() # Conversion cache is now invalid
if message is None:
# XXX Need to translate.
message = 'Converted to %s.' % self.getBaseContentType()
# NOTE(review): indentation was lost in this listing; confirm that
# convertFile is invoked for every successful conversion and not
# only when the default message is used.
self.convertFile(comment=message) # Invoke workflow method
except NotImplementedError:
# No base format implementation: report an empty message
message = ''
return message
def _convertToBaseFormat(self):
"""
"""
raise NotImplementedError
security.declareProtected(Permissions.AccessContentsInformation,
'isSupportBaseDataConversion')
def isSupportBaseDataConversion(self):
  """
  Returns False: the default implementation does not support
  base data conversion.
  """
  return False
def convertFile(self, **kw): # XXX - Is it really useful to explicitly define ?
"""
Workflow transition invoked when conversion occurs.
The method body is intentionally empty and accepts any
keyword argument (convertToBaseFormat passes comment=...).
"""
# Rebind the name to the WorkflowMethod wrapper
# (presumably so that calling it triggers the transition - confirm)
convertFile = WorkflowMethod(convertFile)
security.declareProtected(Permissions.AccessContentsInformation,
'getMetadataMappingDict')
def getMetadataMappingDict(self):
  """
  Return a dict of metadata mapping used to update base metadata of the
  document.

  The mapping is the result of the type based method
  'getMetadataMappingDict'. An empty dict is returned when no such
  method is available (lookup returns None or raises KeyError or
  AttributeError).
  """
  try:
    method = self._getTypeBasedMethod('getMetadataMappingDict')
  except (KeyError, AttributeError):
    # Both exceptions must be listed in a tuple: the former
    # "except KeyError, AttributeError" only caught KeyError and
    # bound the caught exception to the name AttributeError.
    method = None
  if method is None:
    return {}
  return method()
security.declareProtected(Permissions.ModifyPortalContent, 'updateBaseMetadata')
def updateBaseMetadata(self, **kw):
  """
  Hook to update the base format data with the latest properties
  entered by the user. For example, if title is changed in ERP5
  interface, the base format file should be updated accordingly.

  The default implementation does nothing and returns None.
  Refer to OOoDocument class for an example of implementation.
  """
# Transformation API
security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
def populateContent(self):
......
......@@ -53,6 +53,8 @@ except ImportError:
from zLOG import LOG, WARNING
from Products.CMFCore.utils import getToolByName
#Mixin import
from Products.ERP5.mixin.convertable import ConvertableMixin
default_displays_id_list = ('nano', 'micro', 'thumbnail',
'xsmall', 'small', 'medium',
......@@ -60,7 +62,7 @@ default_displays_id_list = ('nano', 'micro', 'thumbnail',
default_formats = ['jpg', 'jpeg', 'png', 'gif', 'pnm', 'ppm']
class Image(File, OFSImage):
class Image(File, OFSImage, ConvertableMixin):
"""
An Image is a File which contains image data. It supports
various conversions of format, size, resolution through
......@@ -322,11 +324,36 @@ class Image(File, OFSImage):
return mime_type, result
# Conversion API
security.declareProtected(Permissions.View, 'getAllowedTargetItemList')
def getAllowedTargetItemList(self):
  """
  Returns the list of target formats reported by ImageMagick, as
  (description, format) tuples parsed from the output of
  "convert -list format".

  The format is the lowercased second column of each matching
  output line and the description is everything after the third
  column. An empty list is returned when the convert executable
  can not be run.
  """
  import re
  import subprocess
  try:
    # The command is passed as an argument list, without any shell and
    # without the document source reference: the listing does not
    # depend on the document, and interpolating a user provided value
    # into a shell command line allowed command injection.
    process = subprocess.Popen(['convert', '-list', 'format'],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT,
                               universal_newlines=True)
    output = process.communicate()[0]
  except OSError:
    # ImageMagick is not available: no target format can be offered
    return []
  # Compiled once, outside of the loop
  pattern = re.compile(r'''([A-z]+[*]?\s+[A-z]+\s+[rw+-]+\s+[A-z]+\s+[A-z]+\D+[A-z]+)''',re.VERBOSE)
  allowed = []
  for line in output.split('\n'):
    line = line.lstrip()
    if pattern.match(line):
      column_list = line.split()
      allowed.append((' '.join(column_list[3:]), column_list[1].lower()))
  return allowed
security.declareProtected(Permissions.AccessContentsInformation, 'convert')
def convert(self, format, display=None, quality=75, resolution=None, frame=None, **kw):
"""
Implementation of conversion for Image files
"""
# Raise an error if the format is not permitted
if not self.isTargetFormatPermitted(format):
raise Unauthorized("User does not have enough permission to access document"
" in %s format" % (format or 'original'))
if format in ('text', 'txt', 'html', 'base_html', 'stripped-html'):
try:
return self.getConversion(format=format)
......@@ -339,7 +366,7 @@ class Image(File, OFSImage):
if (display is not None or resolution is not None or quality != 75 or format != ''\
or frame is not None) and image_size:
kw = dict(display=display, format=format, quality=quality,
resolution=resolution, frame=frame, image_size=image_size)
resolution=resolution, frame=frame, image_size=image_size)
try:
mime, image = self.getConversion(**kw)
except KeyError:
......@@ -369,7 +396,7 @@ class Image(File, OFSImage):
# display may be set from a cookie (?)
image_size = self.getSizeFromImageDisplay(display)
kw = dict(display=display, format=format, quality=quality,
resolution=resolution, frame=frame, image_size=image_size)
resolution=resolution, frame=frame, image_size=image_size)
_setCacheHeaders(_ViewEmulator().__of__(self), kw)
if (display is not None or resolution is not None or quality != 75 or format != ''\
......
......@@ -37,7 +37,13 @@ from Products.ERP5.Document.Image import Image
from Products.ERP5.Document.Document import ConversionError
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin
class PDFDocument(Image, CachedConvertableMixin):
from zLOG import LOG, WARNING
# Mixin import
from Products.ERP5.mixin.convertable import ConvertableMixin
class PDFDocument(Image, ConvertableMixin, CachedConvertableMixin):
"""
PDFDocument is a subclass of Image which is able to
extract text content from a PDF file either as text
......@@ -98,6 +104,11 @@ class PDFDocument(Image, CachedConvertableMixin):
resolution=resolution, frame=frame)
# Conversion API
security.declareProtected(Permissions.View, 'getAllowedTargetItemList')
def getAllowedTargetItemList(self):
  """
  Returns the target items provided by Image, followed by the
  text and HTML targets.
  """
  text_item_list = [('Text', 'txt'),('Plain Text','text'), ('HTML Document', 'html')]
  return Image.getAllowedTargetItemList(self) + text_item_list
security.declareProtected(Permissions.AccessContentsInformation, 'convert')
def convert(self, format, **kw):
"""
......
......@@ -43,9 +43,12 @@ try:
except ImportError:
from Products.ERP5Type.patches.string import Template
# Mixin import
from Products.ERP5.mixin.convertable import ConvertableMixin
DEFAULT_TEXT_FORMAT = 'text/html'
class TextDocument(Document, TextContent):
class TextDocument(Document, TextContent, ConvertableMixin):
"""
A Document contains text which can be formatted using
*Structured Text* or *HTML*. Text can be automatically translated
......@@ -146,6 +149,10 @@ class TextDocument(Document, TextContent):
if format is None:
# The default is to use ERP5 Forms to render the page
return self.view()
# Raise an error if the format is not permitted
if not self.isTargetFormatPermitted(format):
raise Unauthorized("User does not have enough permission to access document"
" in %s format" % (format or 'original'))
mime, data = self.convert(format=format)
RESPONSE.setHeader('Content-Length', len(str(data))) # XXX - Not efficient
# if datastream instance
......@@ -200,12 +207,25 @@ class TextDocument(Document, TextContent):
substitution_method_parameter_dict = {}
return self._substituteTextContent(subject, safe_substitute=safe_substitute,
**substitution_method_parameter_dict)
security.declareProtected(Permissions.View, 'getAllowedTargetItemList')
def getAllowedTargetItemList(self):
  """
  Returns one (extension, name) tuple for each extension known
  to the mimetypes_registry tool, where name is the result of
  name() on the registry entry of that extension.
  """
  registry = getToolByName(self, 'mimetypes_registry')
  # NOTE(review): the extension comes first here, whereas the Image
  # implementation returns (description, format) tuples - confirm
  # which order callers expect.
  return [(extension, registry.extensions[extension].name())
          for extension in registry.extensions]
security.declareProtected(Permissions.AccessContentsInformation, 'convert')
def convert(self, format, substitution_method_parameter_dict=None, safe_substitute=True, **kw):
"""
Convert text using portal_transforms or oood
"""
# Raise an error if the format is not permitted
if not self.isTargetFormatPermitted(format):
raise Unauthorized("User does not have enough permission to access document"
" in %s format" % (format or 'original'))
# Accelerate rendering in Web mode
_setCacheHeaders(_ViewEmulator().__of__(self), {'format' : format})
# Return the raw content
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment