Commit e9821d76 authored by Jean-Paul Smets

Code review and refactoring based on Document API.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@13628 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent ed729538
############################################################################## ##############################################################################
# #
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved. # Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
...@@ -31,23 +30,20 @@ from Products.CMFCore.WorkflowCore import WorkflowMethod ...@@ -31,23 +30,20 @@ from Products.CMFCore.WorkflowCore import WorkflowMethod
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Cache import CachingMethod from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.Image import Image from Products.ERP5.Document.Image import Image
from Products.ERP5.Document.File import File, stripHtml
from Products.ERP5.Document.Document import ConversionCacheMixin from Products.ERP5.Document.Document import ConversionCacheMixin
from Products.CMFCore.utils import getToolByName from Products.CMFCore.utils import getToolByName
from zLOG import LOG from zLOG import LOG
import tempfile, os, glob, zipfile, cStringIO, re import tempfile, os, cStringIO
class PDFDocument(File, ConversionCacheMixin): class PDFDocument(Image, ConversionCacheMixin):
""" """
PdfDocument - same as file, but has its own getSearchableText method PDFDocument is a subclass of Image which is able to
(converts via pdftotext) extract text content from a PDF file either as text
in effect it has two separate caches - from CachingMixin for txt and html or as HTML.
and for image formats from Image
""" """
# CMF Type Definition # CMF Type Definition
meta_type = 'ERP5 PDF' meta_type = 'ERP5 PDF Document'
portal_type = 'PDF' portal_type = 'PDF'
isPortalContent = 1 isPortalContent = 1
isRADContent = 1 isRADContent = 1
...@@ -58,17 +54,20 @@ class PDFDocument(File, ConversionCacheMixin): ...@@ -58,17 +54,20 @@ class PDFDocument(File, ConversionCacheMixin):
# Default Properties # Default Properties
property_sheets = ( PropertySheet.Base property_sheets = ( PropertySheet.Base
, PropertySheet.XMLObject
, PropertySheet.CategoryCore , PropertySheet.CategoryCore
, PropertySheet.DublinCore , PropertySheet.DublinCore
, PropertySheet.Version , PropertySheet.Version
, PropertySheet.Reference , PropertySheet.Reference
, PropertySheet.Document , PropertySheet.Document
, PropertySheet.TextDocument
, PropertySheet.Data , PropertySheet.Data
, PropertySheet.ExternalDocument
, PropertySheet.Url
, PropertySheet.Periodicity
) )
security.declareProtected(Permissions.View, 'index_html')
def index_html(self, REQUEST, RESPONSE, format=None, force=0): def index_html(self, REQUEST, RESPONSE, display=None, format='', quality=75, resolution=None):
""" """
Returns data in the appropriate format (graphical) Returns data in the appropriate format (graphical)
it is always a zip because multi-page pdfs are converted into a zip it is always a zip because multi-page pdfs are converted into a zip
...@@ -77,126 +76,92 @@ class PDFDocument(File, ConversionCacheMixin): ...@@ -77,126 +76,92 @@ class PDFDocument(File, ConversionCacheMixin):
if format is None: if format is None:
RESPONSE.setHeader('Content-Type', 'application/pdf') RESPONSE.setHeader('Content-Type', 'application/pdf')
return self._unpackData(self.data) return self._unpackData(self.data)
if format in ('html', 'txt', 'text'):
mime, data = self.convert(format)
RESPONSE.setHeader('Content-Length', len(data))
RESPONSE.setHeader('Content-Type', '%s;charset=UTF-8' % mime)
RESPONSE.setHeader('Accept-Ranges', 'bytes')
return data
return Image.index_html(self, REQUEST, RESPONSE, display=display,
format=format, quality=quality, resolution=resolution)
# Conversion API
security.declareProtected(Permissions.ModifyPortalContent, 'convert')
def convert(self, format, **kw):
"""
Implementation of conversion for PDF files
"""
if format == 'html': if format == 'html':
RESPONSE.setHeader('Content-Type', 'text/html;charset=UTF-8') if not self.hasConversion(format=format):
return self.getHtmlRepresentation(force) data = self._convertToHTML()
if format == 'txt': self.setConversion(data, mime='text/html', format=format)
RESPONSE.setHeader('Content-Type', 'text/plain;charset=UTF-8') return self.getConversion(format=format)
self._convertToText(force) elif format in ('txt', 'text'):
return self.getTextContent() if not self.hasConversion(format='txt'):
mime = 'image/'+format.lower() data = self._convertToText()
if force or not self.hasConversion(format = format): self.setConversion(data, mime='text/plain', format='txt')
self.setConversion(self._makeFile(format), 'application/zip', format=format) return self.getConversion(format=format)
RESPONSE.setHeader('Content-Type', 'application/zip') else:
return self.getConversion(format = format) return Image.convert(self, format, **kw)
def _makeFile(self,format): security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
tempfile.tempdir = os.path.join(os.getenv('INSTANCE_HOME'), 'tmp') def populateContent(self):
os.putenv('TMPDIR', '/tmp') # because if we run zope as root, we have /root/tmp here and convert goes crazy
if not os.path.exists(tempfile.tempdir):
os.mkdir(tempfile.tempdir, 0775)
fr = tempfile.mktemp(suffix='.pdf')
to = tempfile.mktemp(suffix = '.' + format)
file_fr = open(fr, 'w')
file_fr.write(self._unpackData(self.data))
file_fr.close()
cmd = 'convert %s %s' % (fr, to)
os.system(cmd)
# pack it
f = cStringIO.StringIO()
z = zipfile.ZipFile(f, 'a')
for fname in glob.glob(to.replace('.', '*')):
base = os.path.basename(fname)
pg = re.match('.*?(\d*)\.'+format, base).groups()
if pg:
pg = pg[0]
arcname = '%s/page-%s.%s' % (format, pg, format)
else:
arcname = base
z.write(fname, arcname)
z.close()
f.seek(0)
return f.read()
searchable_property_list = File.searchable_property_list + ('text_content',)
### Content indexing methods
security.declareProtected(Permissions.View, 'getSearchableText')
def getSearchableText(self, md=None, force=0):
""" """
Used by the catalog for basic full text indexing Convert each page to an Image and populate the
conditionally convert pdf to text PDF directory with converted images. May be useful
to provide online PDF reader
""" """
self._convertToText(force) raise NotImplementedError
return File.getSearchableText(self, md)
security.declarePrivate('_convertToText') security.declarePrivate('_convertToText')
def _convertToText(self, force): def _convertToText(self):
""" """
Private implementation method. Convert the PDF text content to text with pdftotext
If we don't have txt cache or we are forced to convert, we try to do it
using system pdftotext utility. We set the result as text_content property.
We mark it in cache as done, even if we fail, so we don't keep trying if it
doesn't work.
""" """
if hasattr(self, 'data') and (force == 1 or not self.hasConversion(format = 'txt')): tmp = tempfile.NamedTemporaryFile()
# XXX-JPS accessing attribute data is bad tmp.write(self._unpackData(self.data))
self.log('PdfDocument', 'regenerating txt') tmp.seek(0)
try: cmd = 'pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name
try: r = os.popen(cmd)
tmp = tempfile.NamedTemporaryFile() h = r.read()
tmp.write(self._unpackData(self.data)) tmp.close()
tmp.seek(0) r.close()
cmd = 'pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name return h
r = os.popen(cmd)
self.setTextContent(r.read().replace('\n', ' ')) security.declarePrivate('_convertToHTML')
tmp.close() def _convertToHTML(self):
r.close() """
except Exception, e: Convert the PDF text content to HTML with pdftohtml
self.log(str(e)) """
msg = 'Conversion to text failed: ' + str(e) tmp = tempfile.NamedTemporaryFile()
else: tmp.write(self._unpackData(self.data))
msg = 'Converted to text' tmp.seek(0)
finally: cmd = 'pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
self.processFile(comment=msg) r = os.popen(cmd)
# we don't need to store it twice, just mark we have it (or rather we already tried) h = r.read()
# we try only once tmp.close()
self.setConversion('empty', format = 'txt') r.close()
h = h.replace('<BODY bgcolor="#A0A0A0"', '<BODY ') # Quick hack to remove bg color - XXX
SearchableText=getSearchableText return h
security.declarePrivate('_convertToBase') security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
def _convertToBase(self): def getContentInformation(self):
self._convertToText(force=1) """
Returns the information about the PDF document with
security.declareProtected(Permissions.View, 'getHtmlRepresentation') pdfinfo.
def getHtmlRepresentation(self, force=0): """
''' tmp = tempfile.NamedTemporaryFile()
get simplified html version to display tmp.write(self._unpackData(self.data))
If we fail to convert, we set workflow message and put error message tmp.seek(0)
as html preview so that the user knows what's going on cmd = 'pdfinfo -meta -box %s' % tmp.name
''' r = os.popen(cmd)
portal_workflow = getToolByName(self, 'portal_workflow') h = r.read()
if not hasattr(self, 'data'): tmp.close()
return 'no data' r.close()
if force==1 or not self.hasConversion(format = 'html'): result = {}
try: for line in h.splitlines():
self.log('PDF', 'regenerating html') item_list = line.split(':')
tmp = tempfile.NamedTemporaryFile() key = item_list[0].strip()
tmp.write(self._unpackData(self.data)) value = ':'.join(item_list[1:]).strip()
tmp.seek(0) result[key] = value
cmd = 'pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name return result
r = os.popen(cmd)
h = r.read()
tmp.close()
r.close()
h = stripHtml(h)
except Exception, e:
msg = 'Could not convert to html: ' + str(e)
h = msg
portal_workflow.doActionFor(self, 'process', comment=msg)
self.setConversion(h, format = 'html')
return self.getConversion(format = 'html')[1]
# vim: syntax=python shiftwidth=2
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment