Try pdftk to extract metadata, because pdfinfo only returns standard PDF metadata.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@31563 20353a03-c40f-0410-a6d1-a30d3c3de9de

Try pdftk to extract metadata, because pdfinfo only returns standard PDF metadata.
git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@31563 20353a03-c40f-0410-a6d1-a30d3c3de9de
a0e8378a · Jérome Perrin · 26bb8448 · a0e8378a · a0e8378a · a0e8378a
Commit a0e8378a authored Jan 04, 2010 by Jérome Perrin
3 changed files
--- a/product/ERP5/Document/PDFDocument.py
+++ b/product/ERP5/Document/PDFDocument.py
@@ -249,17 +249,40 @@ class PDFDocument(Image, CachedConvertableMixin):
    tmp = tempfile.NamedTemporaryFile()
    tmp.write(str(self.data))
    tmp.seek(0)
-    cmd = 'pdfinfo -meta -box %s' % tmp.name
-    r = os.popen(cmd)
-    h = r.read()
-    tmp.close()
-    r.close()
-    result = {}
-    for line in h.splitlines():
-      item_list = line.split(':')
-      key = item_list[0].strip()
-      value = ':'.join(item_list[1:]).strip()
-      result[key] = value
+    try:
+      # First, we use pdfinfo to get standard metadata
+      cmd = 'pdfinfo -meta -box %s' % tmp.name
+      r = os.popen(cmd)
+      h = r.read()
+      r.close()
+      result = {}
+      for line in h.splitlines():
+        item_list = line.split(':')
+        key = item_list[0].strip()
+        value = ':'.join(item_list[1:]).strip()
+        result[key] = value
+      # Then we use pdftk to get extra metadata
+      cmd = 'pdftk %s dump_data output' % tmp.name
+      r = os.popen(cmd)
+      h = r.read()
+      r.close()
+      line_list = (line for line in h.splitlines())
+      while True:
+        try:
+          line = line_list.next()
+        except StopIteration:
+          break
+        if line.startswith('InfoKey'):
+          key = line[len('InfoKey: '):]
+          line = line_list.next()
+          assert line.startswith('InfoValue: '),\
+              "Wrong format returned by pdftk dump_data"
+          value = line[len('InfoValue: '):]
+          result.setdefault(key, value)
+    finally:
+      tmp.close()
    self._content_information = result
    return result.copy()

--- a/product/ERP5OOo/tests/testDms.py
+++ b/product/ERP5OOo/tests/testDms.py
@@ -792,6 +792,15 @@ class TestDocument(ERP5TypeTestCase, ZopeTestCase.Functional):
    self.assertEquals('title', content_information['Title'])
    self.assertEquals('application/pdf', document.getContentType())
+  def test_PDF_content_information_extra_metadata(self):
+    # Extra metadata, such as that stored by pdftk update_info, is also
+    # available in document.getContentInformation()
+    upload_file = makeFileUpload('metadata.pdf')
+    document = self.portal.portal_contributions.newContent(file=upload_file)
+    self.assertEquals('PDF', document.getPortalType())
+    content_information = document.getContentInformation()
+    self.assertEquals('the value', content_information['NonStandardMetadata'])
  def test_PDF_content_content_type(self):
    upload_file = makeFileUpload('REF-en-001.pdf')
    document = self.portal.document_module.newContent(portal_type='PDF')

--- a/product/ERP5OOo/tests/test_document/metadata.pdf
+++ b/product/ERP5OOo/tests/test_document/metadata.pdf