Commit 52358c8a authored by Jérome Perrin

PDF: do not fail when getting content information from documents created using Apple tools

Documents created with some Apple tools have a custom info entry 'AAPL:Keywords'
for which pypdf returns a non-picklable instance.
parent 78ef5ad9
......@@ -26,7 +26,7 @@
#
##############################################################################
import tempfile, os
import tempfile, os, pickle
import zope.interface
from AccessControl import ClassSecurityInfo
......@@ -36,7 +36,7 @@ from Products.ERP5.Document.Image import Image
from Products.ERP5.Document.Document import ConversionError,\
VALID_TEXT_FORMAT_LIST
from subprocess import Popen, PIPE
from zLOG import LOG
from zLOG import LOG, INFO, PROBLEM
import errno
from StringIO import StringIO
......@@ -312,14 +312,25 @@ class PDFDocument(Image):
info_key = info_key.lstrip("/")
if isinstance(info_value, unicode):
info_value = info_value.encode("utf-8")
result.setdefault(info_key, info_value)
# Ignore values that cannot be pickled ( such as AAPL:Keywords )
try:
pickle.dumps(info_value)
except pickle.PicklingError, err:
LOG("PDFDocument.getContentInformation", INFO,
"Ignoring non picklable document info on %s: %s (%r)" % (
self.getRelativeUrl(), info_key, info_value))
else:
result.setdefault(info_key, info_value)
except PdfReadError:
LOG("PDFDocument.getContentInformation", 0,
LOG("PDFDocument.getContentInformation", PROBLEM,
"pyPdf is Unable to read PDF, probably corrupted PDF here : %s" % \
(self.getRelativeUrl(),))
finally:
tmp.close()
# Store cache as an instance of document. FIXME: we usually try to avoid this
# pattern and cache the result of methods using content md5 as a cache key.
self._content_information = result
return result.copy()
......
......@@ -1329,6 +1329,21 @@ class TestDocument(TestDocumentMixin):
# empty PDF have no content information
self.assertEquals(dict(), content_information)
def test_apple_PDF_metadata(self):
  # PDFs produced by Apple software carry a special 'AAPL:Keywords' info
  # tag. When pypdf extracts the PDF information, that tag comes back as an
  # IndirectObject instance, which is not picklable.
  document_module = self.portal.document_module
  pdf_document = document_module.newContent(
    portal_type='PDF',
    file=makeFileUpload('apple_metadata.pdf'))
  # The content information returned for such a file must be picklable ...
  info = pdf_document.getContentInformation()
  import pickle
  pickle.dumps(info)
  # ... so that the document can be saved in ZODB.
  self.commit()
  self.tic()
def test_PDF_content_content_type(self):
upload_file = makeFileUpload('REF-en-001.pdf')
document = self.portal.document_module.newContent(portal_type='PDF')
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment