PDFDocument.py 6.18 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
##############################################################################
#
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
#
##############################################################################

28 29
import tempfile, os, cStringIO

30 31
from AccessControl import ClassSecurityInfo
from Products.CMFCore.WorkflowCore import WorkflowMethod
32 33
from Products.CMFCore.utils import getToolByName

34 35 36 37
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.Image import Image
from Products.ERP5.Document.Document import ConversionCacheMixin
38
from Products.ERP5.Document.File import _unpackData
39

40
from zLOG import LOG
41

42
class PDFDocument(Image, ConversionCacheMixin):
  """
  PDFDocument is a subclass of Image which is able to
  extract text content from a PDF file either as text
  or as HTML.

  Text, HTML and metadata extraction are delegated to the
  pdftotext, pdftohtml and pdfinfo command line tools, which
  are invoked by name through a shell.
  """
  # CMF Type Definition
  meta_type = 'ERP5 PDF Document'
  portal_type = 'PDF'
  isPortalContent = 1
  isRADContent = 1

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.XMLObject
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Reference
                    , PropertySheet.Document
                    , PropertySheet.Data
                    , PropertySheet.ExternalDocument
                    , PropertySheet.Url
                    , PropertySheet.Periodicity
                    )

  security.declareProtected(Permissions.View, 'index_html')
  def index_html(self, REQUEST, RESPONSE, display=None, format='', quality=75, resolution=None):
    """
      Returns data in the appropriate format.

      - format is None: the stored PDF data is returned as
        'application/pdf'. The default value of format is '' (not
        None), so this branch is only taken when None is passed
        explicitly.
      - format is 'html', 'txt' or 'text': the converted text is
        returned, declared as UTF-8.
      - any other format (graphical): the request is delegated to
        Image.index_html. According to the original author's note,
        the result is then always a zip because multi-page pdfs are
        converted into a zip file of many images.
    """
    if format is None:
      RESPONSE.setHeader('Content-Type', 'application/pdf')
      return _unpackData(self.data)
    if format in ('html', 'txt', 'text'):
      mime, data = self.convert(format)
      RESPONSE.setHeader('Content-Length', len(data))
      RESPONSE.setHeader('Content-Type', '%s;charset=UTF-8' % mime)
      RESPONSE.setHeader('Accept-Ranges', 'bytes')
      return data
    return Image.index_html(self, REQUEST, RESPONSE, display=display,
                            format=format, quality=quality, resolution=resolution)

  # Conversion API
  security.declareProtected(Permissions.ModifyPortalContent, 'convert')
  def convert(self, format, **kw):
    """
    Implementation of conversion for PDF files

    'html' and 'txt' / 'text' conversions are computed once and
    stored through the conversion cache API (hasConversion /
    setConversion / getConversion); 'text' is an alias of 'txt'
    and both share the same cache entry. Every other format is
    delegated to Image.convert.

    Returns whatever getConversion (or Image.convert) returns;
    index_html unpacks it as a (mime, data) pair.
    """
    if format == 'html':
      if not self.hasConversion(format=format):
        data = self._convertToHTML()
        self.setConversion(data, mime='text/html', format=format)
      return self.getConversion(format=format)
    elif format in ('txt', 'text'):
      # Both spellings are cached under the single key 'txt'
      if not self.hasConversion(format='txt'):
        data = self._convertToText()
        self.setConversion(data, mime='text/plain', format='txt')
      return self.getConversion(format='txt')
    else:
      return Image.convert(self, format, **kw)

  security.declareProtected(Permissions.ModifyPortalContent, 'populateContent')
  def populateContent(self):
    """
      Convert each page to an Image and populate the
      PDF directory with converted images. May be useful
      to provide online PDF reader

      Not implemented yet: always raises NotImplementedError.
    """
    raise NotImplementedError

  security.declarePrivate('_getPDFCommandOutput')
  def _getPDFCommandOutput(self, command_template):
    """
    Run an external command on the PDF data and return its output.

    The unpacked document data is written to a named temporary
    file, command_template is completed with the name of that file
    (it must contain exactly one '%s' placeholder) and executed
    with os.popen. Everything the command wrote on its standard
    output is returned as a string.

    The temporary file and the pipe are always closed, even if
    writing the data or running the command raises.

    NOTE: XXX check that command exists and was executed
    successfully
    """
    tmp = tempfile.NamedTemporaryFile()
    try:
      tmp.write(_unpackData(self.data))
      # The command opens the file by name, so buffered data must
      # reach the disk before the command is started.
      tmp.flush()
      pipe = os.popen(command_template % tmp.name)
      try:
        return pipe.read()
      finally:
        pipe.close()
    finally:
      # Closing a NamedTemporaryFile also removes it from disk
      tmp.close()

  security.declarePrivate('_convertToText')
  def _convertToText(self):
    """
      Convert the PDF text content to text with pdftotext

      Returns the UTF-8 encoded text produced by pdftotext.
    """
    return self._getPDFCommandOutput(
                      'pdftotext -layout -enc UTF-8 -nopgbrk %s -')

  security.declarePrivate('_convertToHTML')
  def _convertToHTML(self):
    """
    Convert the PDF text content to HTML with pdftohtml

    Returns the UTF-8 encoded HTML produced by pdftohtml, with the
    grey background colour set by pdftohtml removed from the BODY
    tag.

    NOTE: XXX check that command exists and was executed
    successfully
    """
    html = self._getPDFCommandOutput(
                      'pdftohtml -enc UTF-8 -stdout -noframes -i %s')
    # Quick hack to remove bg color - XXX
    return html.replace('<BODY bgcolor="#A0A0A0"', '<BODY ')

  security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
  def getContentInformation(self):
    """
    Returns the information about the PDF document with
    pdfinfo.

    The output of pdfinfo is parsed line by line: the text before
    the first ':' becomes the key and the text after it becomes
    the value, both stripped of surrounding whitespace. A line
    without ':' is stored as a key with an empty value.

    NOTE: XXX check that command exists and was executed
    successfully
    """
    output = self._getPDFCommandOutput('pdfinfo -meta -box %s')
    result = {}
    for line in output.splitlines():
      # Split on the first ':' only, so that values which contain
      # colons themselves (dates, URLs) are kept whole. The padding
      # provides the empty value for lines without any ':'.
      key, value = (line.split(':', 1) + [''])[:2]
      result[key.strip()] = value.strip()
    return result