Commit 88e6dbfe authored by Nicolas Delaby

Use subprocess instead of os.popen for reliability

No need to inherit from CachedConvertableMixin, as Image already inherits from it.



git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@35234 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent a6edfadb
...@@ -34,10 +34,11 @@ from Products.CMFCore.utils import getToolByName, _setCacheHeaders,\ ...@@ -34,10 +34,11 @@ from Products.CMFCore.utils import getToolByName, _setCacheHeaders,\
from Products.ERP5Type import Permissions, PropertySheet from Products.ERP5Type import Permissions, PropertySheet
from Products.ERP5.Document.Image import Image from Products.ERP5.Document.Image import Image
from Products.ERP5.Document.Document import ConversionError from Products.ERP5.Document.Document import ConversionError,\
from Products.ERP5.mixin.cached_convertable import CachedConvertableMixin VALID_TEXT_FORMAT_LIST
from subprocess import Popen, PIPE
class PDFDocument(Image, CachedConvertableMixin): class PDFDocument(Image):
""" """
PDFDocument is a subclass of Image which is able to PDFDocument is a subclass of Image which is able to
extract text content from a PDF file either as text extract text content from a PDF file either as text
...@@ -107,15 +108,14 @@ class PDFDocument(Image, CachedConvertableMixin): ...@@ -107,15 +108,14 @@ class PDFDocument(Image, CachedConvertableMixin):
if not self.data: if not self.data:
return '' return ''
tmp = tempfile.NamedTemporaryFile() tmp = tempfile.NamedTemporaryFile()
tmp.write(str(self.data)) tmp.write(str(self.getData()))
tmp.seek(0) tmp.seek(0)
cmd = 'pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name command_result = Popen(['pdftotext', '-layout', '-enc', 'UTF-8',
r = os.popen(cmd) '-nopgbrk', tmp.name, '-'],
h = r.read() stdout=PIPE).communicate()[0]
h = command_result
tmp.close() tmp.close()
r.close() if h:
if h != '':
return h return h
else: else:
# Try to use OCR # Try to use OCR
...@@ -189,13 +189,17 @@ class PDFDocument(Image, CachedConvertableMixin): ...@@ -189,13 +189,17 @@ class PDFDocument(Image, CachedConvertableMixin):
tmp = tempfile.NamedTemporaryFile() tmp = tempfile.NamedTemporaryFile()
tmp.write(str(self.data)) tmp.write(str(self.data))
tmp.seek(0) tmp.seek(0)
cmd = 'pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name command_result = Popen(['pdftohtml', '-enc', 'UTF-8', '-stdout',
r = os.popen(cmd) '-noframes', '-i', tmp.name], stdout=PIPE)\
h = r.read() .communicate()[0]
h = command_result
tmp.close() tmp.close()
r.close() # Quick hack to remove bg color - XXX
h = h.replace('<BODY bgcolor="#A0A0A0"', '<BODY ') # Quick hack to remove bg color - XXX h = h.replace('<BODY bgcolor="#A0A0A0"', '<BODY ')
h = h.replace('href="%s.html' % tmp.name.split(os.sep)[-1], 'href="asEntireHTML') # Make links relative # Make links relative
h = h.replace('href="%s.html' % tmp.name.split(os.sep)[-1],
'href="asEntireHTML')
return h return h
security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation') security.declareProtected(Permissions.AccessContentsInformation, 'getContentInformation')
...@@ -216,10 +220,9 @@ class PDFDocument(Image, CachedConvertableMixin): ...@@ -216,10 +220,9 @@ class PDFDocument(Image, CachedConvertableMixin):
tmp.seek(0) tmp.seek(0)
try: try:
# First, we use pdfinfo to get standard metadata # First, we use pdfinfo to get standard metadata
cmd = 'pdfinfo -meta -box %s' % tmp.name command_result = Popen(['pdfinfo', '-meta', '-box', tmp.name],
r = os.popen(cmd) stdout=PIPE).communicate()[0]
h = r.read() h = command_result
r.close()
result = {} result = {}
for line in h.splitlines(): for line in h.splitlines():
item_list = line.split(':') item_list = line.split(':')
...@@ -228,10 +231,9 @@ class PDFDocument(Image, CachedConvertableMixin): ...@@ -228,10 +231,9 @@ class PDFDocument(Image, CachedConvertableMixin):
result[key] = value result[key] = value
# Then we use pdftk to get extra metadata # Then we use pdftk to get extra metadata
cmd = 'pdftk %s dump_data output' % tmp.name command_result = Popen(['pdftk', tmp.name, 'dump_data', 'output'],
r = os.popen(cmd) stdout=PIPE).communicate()[0]
h = r.read() h = command_result
r.close()
line_list = (line for line in h.splitlines()) line_list = (line for line in h.splitlines())
while True: while True:
try: try:
...@@ -256,4 +258,4 @@ class PDFDocument(Image, CachedConvertableMixin): ...@@ -256,4 +258,4 @@ class PDFDocument(Image, CachedConvertableMixin):
del self._content_information del self._content_information
except (AttributeError, KeyError): except (AttributeError, KeyError):
pass pass
Image._setFile(self, data, precondition) Image._setFile(self, data, precondition=precondition)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment