Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Support
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
cloudooo
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Milestones
Merge Requests
1
Merge Requests
1
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Jérome Perrin
cloudooo
Commits
0ff799eb
Commit
0ff799eb
authored
Feb 07, 2018
by
Boris Kocherov
Committed by
Romain Courteaud
Feb 26, 2018
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
handler.pdf: use pyPdf in setMetadata
parent
080f25b8
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
32 additions
and
33 deletions
+32
-33
cloudooo/handler/pdf/handler.py
cloudooo/handler/pdf/handler.py
+30
-32
setup.py
setup.py
+2
-1
No files found.
cloudooo/handler/pdf/handler.py
View file @
0ff799eb
...
...
@@ -25,6 +25,7 @@
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
import
io
from
zope.interface
import
implements
from
cloudooo.interfaces.handler
import
IHandler
...
...
@@ -33,6 +34,8 @@ from cloudooo.util import logger, parseContentType
from
subprocess
import
Popen
,
PIPE
from
tempfile
import
mktemp
from
pyPdf
import
PdfFileWriter
,
PdfFileReader
from
pyPdf.generic
import
NameObject
,
createStringObject
class
Handler
(
object
):
"""PDF Handler is used to handler inputed pdf document."""
...
...
@@ -47,6 +50,7 @@ class Handler(object):
def
convert
(
self
,
destination_format
=
None
,
**
kw
):
""" Convert a pdf document """
# TODO: use pyPdf
logger
.
debug
(
"PDFConvert: %s > %s"
%
(
self
.
document
.
source_format
,
destination_format
))
output_url
=
mktemp
(
suffix
=
".%s"
%
destination_format
,
dir
=
self
.
document
.
directory_name
)
...
...
@@ -66,6 +70,7 @@ class Handler(object):
"""Returns a dictionary with all metadata of document.
along with the metadata.
"""
# TODO: use pyPdf and not use lower()
command
=
[
"pdfinfo"
,
self
.
document
.
getUrl
()]
stdout
,
stderr
=
Popen
(
command
,
stdout
=
PIPE
,
...
...
@@ -75,13 +80,10 @@ class Handler(object):
info_list
=
filter
(
None
,
stdout
.
split
(
"
\
n
"
))
metadata
=
{}
for
info
in
iter
(
info_list
):
if
info
.
count
(
":"
)
==
1
:
info_name
,
info_value
=
info
.
split
(
":"
)
else
:
info_name
,
info_value
=
info
.
split
(
" "
)
info_name
=
info_name
.
replace
(
":"
,
""
)
info_value
=
info_value
.
strip
()
metadata
[
info_name
.
lower
()]
=
info_value
info
=
info
.
split
(
":"
)
info_name
=
info
[
0
].
lower
()
info_value
=
":"
.
join
(
info
[
1
:]).
strip
()
metadata
[
info_name
]
=
info_value
self
.
document
.
trash
()
return
metadata
...
...
@@ -90,31 +92,27 @@ class Handler(object):
Keyword arguments:
metadata -- expected a dictionary with metadata.
"""
text_template
=
"InfoKey: %s
\
n
InfoValue: %s
\
n
"
text_list
=
[
text_template
%
(
key
.
capitalize
(),
value
)
\
for
key
,
value
in
metadata
.
iteritems
()]
metadata_file
=
File
(
self
.
document
.
directory_name
,
""
.
join
(
text_list
),
"txt"
)
output_url
=
mktemp
(
suffix
=
".pdf"
,
dir
=
self
.
document
.
directory_name
)
command
=
[
"pdftk"
,
self
.
document
.
getUrl
(),
"update_info"
,
metadata_file
.
getUrl
(),
"output"
,
output_url
]
stdout
,
stderr
=
Popen
(
command
,
stdout
=
PIPE
,
stderr
=
PIPE
,
close_fds
=
True
,
env
=
self
.
environment
).
communicate
()
self
.
document
.
reload
(
output_url
)
try
:
return
self
.
document
.
getContent
()
finally
:
self
.
document
.
trash
()
# TODO: date as "D:20090401124817-04'00'" ASN.1 for ModDate and CreationDate
input_pdf
=
PdfFileReader
(
open
(
self
.
document
.
getUrl
(),
"rb"
))
output_pdf
=
PdfFileWriter
()
modification_date
=
metadata
.
pop
(
"ModificationDate"
,
None
)
if
modification_date
:
metadata
[
'ModDate'
]
=
modification_date
if
type
(
metadata
.
get
(
'Keywords'
,
None
))
is
list
:
metadata
[
'Keywords'
]
=
metadata
[
'Keywords'
].
join
(
' '
)
args
=
{}
for
key
,
value
in
list
(
metadata
.
items
()):
args
[
NameObject
(
'/'
+
key
.
capitalize
())]
=
createStringObject
(
value
)
output_pdf
.
_info
.
getObject
().
update
(
args
)
for
page_num
in
range
(
input_pdf
.
getNumPages
()):
output_pdf
.
addPage
(
input_pdf
.
getPage
(
page_num
))
output_stream
=
io
.
BytesIO
()
output_pdf
.
write
(
output_stream
)
return
output_stream
.
getvalue
()
@
staticmethod
def
getAllowedConversionFormatList
(
source_mimetype
):
...
...
setup.py
View file @
0ff799eb
from
setuptools
import
setup
,
find_packages
import
sys
version
=
'1.2.
5
-dev'
version
=
'1.2.
6
-dev'
def
read
(
name
):
return
open
(
name
).
read
()
...
...
@@ -13,6 +13,7 @@ install_requires = [
'zope.interface'
,
'PasteDeploy'
,
'PasteScript'
,
'pyPdf'
,
'WSGIUtils'
,
'psutil>=3.0.0'
,
'lxml'
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment