PDF: do not fail getting content information with documents created using Apple tools

Documents created with some Apple tools have a custom info entry 'AAPL:Keywords' for which pypdf returns a non-picklable instance.

PDF: do not fail getting content information with documents created using Apple tools
Documents created with some Apple tools have a custom info entry 'AAPL:Keywords' for which pypdf returns a non-picklable instance.
52358c8a · Jérome Perrin · 78ef5ad9 · 52358c8a · 52358c8a · 52358c8a
Commit 52358c8a authored Aug 12, 2013 by Jérome Perrin
3 changed files
--- a/product/ERP5/Document/PDFDocument.py
+++ b/product/ERP5/Document/PDFDocument.py
@@ -26,7 +26,7 @@
 #
 ##############################################################################

-import tempfile, os
+import tempfile, os, pickle

 import zope.interface
 from AccessControl import ClassSecurityInfo
@@ -36,7 +36,7 @@ from Products.ERP5.Document.Image import Image
 from Products.ERP5.Document.Document import ConversionError,\
                                            VALID_TEXT_FORMAT_LIST
 from subprocess import Popen, PIPE
-from zLOG import LOG
+from zLOG import LOG, INFO, PROBLEM
 import errno
 from StringIO import StringIO

@@ -312,14 +312,25 @@ class PDFDocument(Image):
            info_key = info_key.lstrip("/")
            if isinstance(info_value, unicode):
              info_value = info_value.encode("utf-8")
-            result.setdefault(info_key, info_value)
+
+            # Ignore values that cannot be pickled ( such as AAPL:Keywords )
+            try:
+              pickle.dumps(info_value)
+            except pickle.PicklingError, err:
+              LOG("PDFDocument.getContentInformation", INFO,
+                "Ignoring non picklable document info on %s: %s (%r)" % (
+                self.getRelativeUrl(), info_key, info_value))
+            else:
+              result.setdefault(info_key, info_value)
        except PdfReadError:
-          LOG("PDFDocument.getContentInformation", 0,
+          LOG("PDFDocument.getContentInformation", PROBLEM,
            "pyPdf is Unable to read PDF, probably corrupted PDF here : %s" % \
            (self.getRelativeUrl(),))
    finally:
      tmp.close()

+    # Store cache as an instance of document. FIXME: we usually try to avoid this
+    # pattern and cache the result of methods using content md5 as a cache key.
    self._content_information = result
    return result.copy()


--- a/product/ERP5OOo/tests/testDms.py
+++ b/product/ERP5OOo/tests/testDms.py
@@ -1329,6 +1329,21 @@ class TestDocument(TestDocumentMixin):
    # empty PDF have no content information
    self.assertEquals(dict(), content_information)

+  def test_apple_PDF_metadata(self):
+    # PDF created with Apple software have a special 'AAPL:Keywords' info tag
+    # and when pypdf extracts pdf information, it is returned as an
+    # IndirectObject instance which is not picklable
+    document = self.portal.document_module.newContent(
+      portal_type='PDF',
+      file=makeFileUpload('apple_metadata.pdf'))
+    # content_information is picklable
+    content_information = document.getContentInformation()
+    from pickle import dumps
+    dumps(content_information)
+    # so document can be saved in ZODB
+    self.commit()
+    self.tic()
+
  def test_PDF_content_content_type(self):
    upload_file = makeFileUpload('REF-en-001.pdf')
    document = self.portal.document_module.newContent(portal_type='PDF')

--- a/product/ERP5OOo/tests/test_document/apple_metadata.pdf
+++ b/product/ERP5OOo/tests/test_document/apple_metadata.pdf