Commit 57402c11 authored by Sebastien Robin

Use pyPdf instead of pdftk to extract non-standard metadata from PDF

parent 50e1a93a
...@@ -266,28 +266,19 @@ class PDFDocument(Image): ...@@ -266,28 +266,19 @@ class PDFDocument(Image):
value = ':'.join(item_list[1:]).strip() value = ':'.join(item_list[1:]).strip()
result[key] = value result[key] = value
# Then we use pdftk to get extra metadata # Then we use pyPdf to get extra metadata
try: try:
command = ['pdftk', tmp.name, 'dump_data', 'output'] from pyPdf import PdfFileReader
command_result = Popen(command, stdout=PIPE).communicate()[0] except ImportError:
except OSError, e: # if pyPdf not found, pass
# if pdftk not found, pass pass
if e.errno != errno.ENOENT:
raise
else: else:
line_list = (line for line in command_result.splitlines()) pdf_file = PdfFileReader(tmp)
while True: for info_key, info_value in pdf_file.getDocumentInfo().iteritems():
try: info_key = info_key.lstrip("/")
line = line_list.next() if isinstance(info_value, unicode):
except StopIteration: info_value = info_value.encode("utf-8")
break result.setdefault(info_key, info_value)
if line.startswith('InfoKey'):
key = line[len('InfoKey: '):]
line = line_list.next()
assert line.startswith('InfoValue: '),\
"Wrong format returned by pdftk dump_data"
value = line[len('InfoValue: '):]
result.setdefault(key, value)
finally: finally:
tmp.close() tmp.close()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment