Commit 2e8fd048 authored by Boris Kocherov's avatar Boris Kocherov

x2t: add support for metadata and all other LibreOffice formats

parent 50329148
...@@ -27,9 +27,10 @@ ...@@ -27,9 +27,10 @@
############################################################################## ##############################################################################
from xml.etree import ElementTree from xml.etree import ElementTree
from subprocess import Popen, PIPE from subprocess import Popen, PIPE
from tempfile import NamedTemporaryFile, mktemp
import sys
import os import os
import json
import io
from mimetypes import guess_type
from zope.interface import implements from zope.interface import implements
...@@ -37,6 +38,9 @@ from cloudooo.interfaces.handler import IHandler ...@@ -37,6 +38,9 @@ from cloudooo.interfaces.handler import IHandler
from cloudooo.file import File from cloudooo.file import File
from cloudooo.util import logger, zipTree, unzip, parseContentType from cloudooo.util import logger, zipTree, unzip, parseContentType
from cloudooo.handler.ooo.handler import Handler as OOoHandler from cloudooo.handler.ooo.handler import Handler as OOoHandler
from cloudooo.handler.ooo.handler import bootstrapHandler
from zipfile import ZipFile
AVS_OFFICESTUDIO_FILE_UNKNOWN = "0" AVS_OFFICESTUDIO_FILE_UNKNOWN = "0"
AVS_OFFICESTUDIO_FILE_DOCUMENT_DOCX = "65" AVS_OFFICESTUDIO_FILE_DOCUMENT_DOCX = "65"
...@@ -68,13 +72,25 @@ yformat_map = { ...@@ -68,13 +72,25 @@ yformat_map = {
'ppty': 'pptx', 'ppty': 'pptx',
} }
yformat_service_map = { yformat2opendocument_map = {
'docy': 'com.sun.star.text.TextDocument', 'docy': 'odt',
'xlsy': 'com.sun.star.sheet.SpreadsheetDocument', 'xlsy': 'ods',
'ppty': 'com.sun.star.presentation.PresentationDocument', 'ppty': 'odp',
} }
yformat_tuple = ("docy", "xlsy", "ppty") yformat_tuple = (
"docy", "application/x-asc-text",
"xlsy", "application/x-asc-spreadsheet",
"ppty", "application/x-asc-presentation",
)
# Office Open XML identifiers as a flat tuple: each file extension is
# immediately followed by the MIME type of the same format.
openxml_tuple = (
"docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
"pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation",
)
# Concatenation of the y-format and Office Open XML identifiers above.
supported_formats = yformat_tuple + openxml_tuple
class Handler(object): class Handler(object):
""" """
...@@ -97,33 +113,57 @@ class Handler(object): ...@@ -97,33 +113,57 @@ class Handler(object):
self._data = data self._data = data
self._source_format = source_format self._source_format = source_format
self._init_kw = kw self._init_kw = kw
self.file = File(base_folder_url, data, source_format)
self.environment = kw.get("env", {}) self.environment = kw.get("env", {})
def convert(self, destination_format=None, **kw):
  """Convert the stored document to ``destination_format`` and return it.

  A y-format source is first turned into the format that ``yformat_map``
  associates with it, using ``self._convert``. For a y-format destination,
  the data is brought to the format mapped to that destination (through the
  OOo handler when it is not already in it) and then passed to
  ``self._convert``. Any other destination that differs from the current
  format is delegated to the OOo handler.

  Keyword arguments:
  destination_format -- identifier of the wanted output format.
  kw -- accepted but not used by this method.
  """
  current_format = self._source_format
  logger.debug("x2t convert: %s > %s" % (current_format, destination_format))
  payload = self._data

  def via_ooo(content, from_format, to_format):
    # Delegate one conversion step to the OOo handler.
    return OOoHandler(self.base_folder_url, content, from_format, **self._init_kw)\
      .convert(destination_format=to_format)

  if current_format in yformat_tuple:
    bridge_format = yformat_map[current_format]
    payload = self._convert(payload, current_format, bridge_format)
    current_format = bridge_format

  if destination_format not in yformat_tuple:
    # Plain destination: only convert when the format actually changes.
    if destination_format != current_format:
      payload = via_ooo(payload, current_format, destination_format)
    return payload

  # y-format destination: reach its mapped format first, then run x2t.
  bridge_format = yformat_map[destination_format]
  if bridge_format != current_format:
    payload = via_ooo(payload, current_format, bridge_format)
  return self._convert(payload, bridge_format, destination_format)
def _convert(self, data, source_format, destination_format):
""" Convert the inputed file to output as format that were informed """
self.file = File(self.base_folder_url, data, source_format)
logger.debug("x2t convert: %s > %s" % (source_format, destination_format)) logger.debug("x2t convert: %s > %s" % (source_format, destination_format))
# init vars and xml configuration file # init vars and xml configuration file
in_format = format_code_map[source_format] in_format = format_code_map[source_format]
out_format = format_code_map[destination_format] out_format = format_code_map[destination_format]
root_dir = self.file.directory_name root_dir = self.file.directory_name
input_dir = os.path.join(root_dir, "input"); input_dir = os.path.join(root_dir, "input")
output_dir = os.path.join(root_dir, "output"); output_dir = os.path.join(root_dir, "output")
final_file_name = os.path.join(root_dir, "document.%s" % destination_format) final_file_name = os.path.join(root_dir, "document.%s" % destination_format)
input_file_name = self.file.getUrl() input_file_name = self.file.getUrl()
output_file_name = final_file_name output_file_name = final_file_name
config_file_name = os.path.join(root_dir, "config.xml") config_file_name = os.path.join(root_dir, "config.xml")
metadata = None
output_data = None
if source_format in yformat_tuple: if source_format in yformat_tuple:
if self._data.startswith("PK\x03\x04"): if data.startswith("PK\x03\x04"):
os.mkdir(input_dir) os.mkdir(input_dir)
unzip(self.file.getUrl(), input_dir) unzip(self.file.getUrl(), input_dir)
for _, _, files in os.walk(input_dir): input_file_name = os.path.join(input_dir, "body.txt")
input_file_name, = files metadata_file_name = os.path.join(input_dir, "metadata.json")
break if os.path.isfile(metadata_file_name):
input_file_name = os.path.join(input_dir, input_file_name) with open(metadata_file_name) as metadata_file:
metadata = json.loads(metadata_file.read())
if destination_format in yformat_tuple: if destination_format in yformat_tuple:
os.mkdir(output_dir) os.mkdir(output_dir)
output_file_name = os.path.join(output_dir, "body.txt") output_file_name = os.path.join(output_dir, "body.txt")
...@@ -160,54 +200,74 @@ class Handler(object): ...@@ -160,54 +200,74 @@ class Handler(object):
if p.returncode != 0: if p.returncode != 0:
raise RuntimeError("x2t: exit code %d != 0\n+ %s\n> stdout: %s\n> stderr: %s@ x2t xml:\n%s" % (p.returncode, " ".join(["x2t", config_file.name]), stdout, stderr, " " + open(config_file.name).read().replace("\n", "\n "))) raise RuntimeError("x2t: exit code %d != 0\n+ %s\n> stdout: %s\n> stderr: %s@ x2t xml:\n%s" % (p.returncode, " ".join(["x2t", config_file.name]), stdout, stderr, " " + open(config_file.name).read().replace("\n", "\n ")))
if destination_format in yformat_tuple: self.file.reload(final_file_name)
try:
if source_format in yformat_tuple:
if (metadata):
output_data = OOoHandler(self.base_folder_url, self.file.getContent(), source_format, **self._init_kw)\
.setMetadata(metadata)
else:
output_data = self.file.getContent()
elif destination_format in yformat_tuple:
dir_name = os.path.dirname(output_file_name)
metadata_file_name = os.path.join(dir_name, "metadata.json")
with open(metadata_file_name, 'w') as metadata_file:
metadata = OOoHandler(self.base_folder_url, data, source_format, **self._init_kw).getMetadata()
metadata.pop('MIMEType', None)
metadata.pop('Generator', None)
metadata.pop('AppVersion', None)
metadata.pop('ImplementationName', None)
metadata_file.write(json.dumps(metadata))
zipTree( zipTree(
final_file_name, final_file_name,
(output_file_name, ""), (output_file_name, ""),
(os.path.join(os.path.dirname(output_file_name), "media"), ""), (metadata_file_name, ""),
(os.path.join(dir_name, "media"), ""),
) )
output_data = self.file.getContent()
self.file.reload(final_file_name)
try:
return self.file.getContent()
finally: finally:
self.file.trash() self.file.trash()
return output_data
def _getContentType(self):
  """Return a content type for ``self._source_format``.

  A source format that already contains a "/" is returned unchanged.
  Otherwise it is treated as a file extension and looked up with
  ``mimetypes.guess_type``; when that lookup yields nothing, the source
  format itself is returned.
  """
  source_format = self._source_format
  if "/" in source_format:
    return source_format
  guessed = guess_type('a.' + source_format)[0]
  return source_format if guessed is None else guessed
def getMetadata(self, base_document=False):
  r"""Return a dictionary with the metadata of the document.

  For a zipped y-format document the metadata is read from the
  "metadata.json" member of the archive (an empty dictionary when that
  member is absent) and 'MIMEType' is filled in. With ``base_document``
  set, 'MIMEType' is replaced by the type guessed for the matching
  OpenDocument format and 'Data' holds the document converted to it.
  Every other document is delegated to the OOo handler.

  Keyword arguments:
  base_document -- when true, also return the converted document.
  """
  zipped_yformat = (self._source_format in yformat_tuple
                    and self._data.startswith("PK\x03\x04"))
  if not zipped_yformat:
    return OOoHandler(self.base_folder_url, self._data, self._source_format, **self._init_kw).getMetadata(base_document)

  with io.BytesIO(self._data) as buf, ZipFile(buf) as archive:
    try:
      raw_metadata = archive.read("metadata.json")
    except KeyError:
      # The archive carries no metadata member.
      raw_metadata = '{}'
  info = json.loads(raw_metadata)
  info['MIMEType'] = self._getContentType()
  if base_document:
    odf_format = yformat2opendocument_map[self._source_format]
    info['MIMEType'] = guess_type('a.' + odf_format)[0]
    info['Data'] = self.convert(odf_format)
  return info
def setMetadata(self, metadata=None):
  r"""Return the document with new metadata.

  For a zipped y-format document a new archive is built in memory: every
  member except "metadata.json" is copied unchanged, then ``metadata`` is
  stored as JSON under "metadata.json". Every other document is delegated
  to the OOo handler.

  Keyword arguments:
  metadata -- expected a dictionary with metadata (defaults to empty).
  """
  if metadata is None:
    metadata = {}
  if self._source_format in yformat_tuple and self._data.startswith("PK\x03\x04"):
    # The source archive is only read; a fresh archive is written so the
    # result never holds two "metadata.json" members.
    rebuilt = io.BytesIO()
    with ZipFile(io.BytesIO(self._data)) as archive_in, \
         ZipFile(rebuilt, "w") as archive_out:
      for member in archive_in.infolist():
        if member.filename != "metadata.json":
          # Passing the ZipInfo keeps the member's name and compression.
          archive_out.writestr(member, archive_in.read(member.filename))
      archive_out.writestr("metadata.json", json.dumps(metadata))
    # The archive must be closed (central directory written) before the
    # bytes are taken, and the buffer must still be open at that point.
    return rebuilt.getvalue()
  else:
    return OOoHandler(self.base_folder_url, self._data, self._source_format, **self._init_kw).setMetadata(metadata)
@staticmethod @staticmethod
def getAllowedConversionFormatList(source_mimetype): def getAllowedConversionFormatList(source_mimetype):
...@@ -218,17 +278,26 @@ class Handler(object): ...@@ -218,17 +278,26 @@ class Handler(object):
... ...
] ]
""" """
getFormatList = OOoHandler.getAllowedConversionFormatList
source_mimetype = parseContentType(source_mimetype).gettype() source_mimetype = parseContentType(source_mimetype).gettype()
if source_mimetype in ("docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"):
return [("application/x-asc-text", "OnlyOffice Text Document")]
if source_mimetype in ("docy", "application/x-asc-text"): if source_mimetype in ("docy", "application/x-asc-text"):
return [("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "Word 2007 Document")] return getFormatList("application/vnd.openxmlformats-officedocument.wordprocessingml.document")
if source_mimetype in ("xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"):
return [("application/x-asc-spreadsheet", "OnlyOffice Spreadsheet")]
if source_mimetype in ("xlsy", "application/x-asc-spreadsheet"): if source_mimetype in ("xlsy", "application/x-asc-spreadsheet"):
return [("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "Excel 2007 Spreadsheet")] return getFormatList("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
if source_mimetype in ("pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"):
return [("application/x-asc-presentation", "OnlyOffice Presentation")]
if source_mimetype in ("ppty", "application/x-asc-presentation"): if source_mimetype in ("ppty", "application/x-asc-presentation"):
return [("application/vnd.openxmlformats-officedocument.presentationml.presentation", "PowerPoint 2007 Presentation")] return getFormatList("application/vnd.openxmlformats-officedocument.presentationml.presentation")
return []
format_list = getFormatList(source_mimetype)
format_list_append = format_list.append
for type, _ in format_list:
if type == "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
format_list_append(("application/x-asc-text", "OnlyOffice Text Document"))
break
if type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
format_list_append(("application/x-asc-spreadsheet", "OnlyOffice Spreadsheet"))
break
if type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
format_list_append(("application/x-asc-presentation", "OnlyOffice Presentation"))
break
return format_list
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment