Use pyPdf instead of pdftk to extract non-standard metadata from PDF

57402c11 · Sebastien Robin · 50e1a93a · 57402c11
Commit 57402c11 authored Feb 25, 2013 by Sebastien Robin
Hide whitespace changes
Inline Side-by-side

Showing with 11 additions and 20 deletions

product/ERP5/Document/PDFDocument.py product/ERP5/Document/PDFDocument.py +11 -20

No files found.
--- a/product/ERP5/Document/PDFDocument.py
+++ b/product/ERP5/Document/PDFDocument.py
@@ -266,28 +266,19 @@ class PDFDocument(Image):
        value = ':'.join(item_list[1:]).strip()
        result[key] = value

-      # Then we use pdftk to get extra metadata
+      # Then we use pyPdf to get extra metadata
      try:
-        command = ['pdftk', tmp.name, 'dump_data', 'output']
-        command_result = Popen(command, stdout=PIPE).communicate()[0]
-      except OSError, e:
-        # if pdftk not found, pass
-        if e.errno != errno.ENOENT:
-          raise
+        from pyPdf import PdfFileReader
+      except ImportError:
+        # if pyPdf not found, pass
+        pass
      else:
-        line_list = (line for line in command_result.splitlines())
-        while True:
-          try:
-            line = line_list.next()
-          except StopIteration:
-            break
-          if line.startswith('InfoKey'):
-            key = line[len('InfoKey: '):]
-            line = line_list.next()
-            assert line.startswith('InfoValue: '),\
-                "Wrong format returned by pdftk dump_data"
-            value = line[len('InfoValue: '):]
-            result.setdefault(key, value)
+        pdf_file = PdfFileReader(tmp)
+        for info_key, info_value in pdf_file.getDocumentInfo().iteritems():
+          info_key = info_key.lstrip("/")
+          if isinstance(info_value, unicode):
+            info_value = info_value.encode("utf-8")
+          result.setdefault(info_key, info_value)
    finally:
      tmp.close()