Commit 57402c11 authored by Sebastien Robin

Use pyPdf instead of pdftk to extract non-standard metadata from PDF files

parent 50e1a93a
......@@ -266,28 +266,19 @@ class PDFDocument(Image):
value = ':'.join(item_list[1:]).strip()
result[key] = value
# Then we use pdftk to get extra metadata
# Then we use pyPdf to get extra metadata
try:
command = ['pdftk', tmp.name, 'dump_data', 'output']
command_result = Popen(command, stdout=PIPE).communicate()[0]
except OSError, e:
# if pdftk not found, pass
if e.errno != errno.ENOENT:
raise
from pyPdf import PdfFileReader
except ImportError:
# if pyPdf not found, pass
pass
else:
line_list = (line for line in command_result.splitlines())
while True:
try:
line = line_list.next()
except StopIteration:
break
if line.startswith('InfoKey'):
key = line[len('InfoKey: '):]
line = line_list.next()
assert line.startswith('InfoValue: '),\
"Wrong format returned by pdftk dump_data"
value = line[len('InfoValue: '):]
result.setdefault(key, value)
pdf_file = PdfFileReader(tmp)
for info_key, info_value in pdf_file.getDocumentInfo().iteritems():
info_key = info_key.lstrip("/")
if isinstance(info_value, unicode):
info_value = info_value.encode("utf-8")
result.setdefault(info_key, info_value)
finally:
tmp.close()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment