############################################################################## # # Copyright (c) 2003-2005 Nexedi SARL and Contributors. All Rights Reserved. # Kevin DELDYCKE <kevin@nexedi.com> # Guillaume MICHON <guillaume@nexedi.com> # # WARNING: This program as such is intended to be used by professional # programmers who take the whole responsability of assessing all potential # consequences resulting from its eventual inadequacies and bugs # End users who are looking for a ready-to-use solution with commercial # garantees and support are strongly adviced to contract a Free Software # Service Company # # This program is Free Software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # ############################################################################## from Products.PythonScripts.Utility import allow_class from ZPublisher.HTTPRequest import FileUpload from xml.dom.ext.reader import PyExpat from xml.dom import Node from AccessControl import ClassSecurityInfo from Globals import InitializeClass from zipfile import ZipFile from zLOG import LOG import imghdr class CorruptedOOoFile(Exception): pass class OOoParser: """ General purpose tools to parse and handle OpenOffice v1.x documents. """ # Declarative security security = ClassSecurityInfo() security.declarePrivate('__init__') def __init__(self): # Create the PyExpat reader self.reader = PyExpat.Reader() self.oo_content_dom = None self.oo_styles_dom = None self.oo_files = {} self.pictures = {} self.ns = {} security.declarePublic('openFile') def openFile(self, file_raw_data): """ Load all files in the zipped OpenOffice document """ # Try to unzip the Open Office doc LOG('KevLog >>>>>>>>',0,repr(file_raw_data)) try: oo_unzipped = ZipFile(file_raw_data, mode="r") except: raise CorruptedOOoFile # Test the integrity of the file if oo_unzipped.testzip() != None: raise CorruptedOOoFile # Initialize internal variables self.__init__() # List and load the content of the zip file for name in oo_unzipped.namelist(): self.oo_files[name] = oo_unzipped.read(name) # Get the main content and style definitions self.oo_content_dom = self.reader.fromString(self.oo_files["content.xml"]) self.oo_styles_dom = self.reader.fromString(self.oo_files["styles.xml"]) # Create a namespace table doc_ns = self.oo_styles_dom.getElementsByTagName("office:document-styles") for i in range(doc_ns[0].attributes.length): if doc_ns[0].attributes.item(i).nodeType == Node.ATTRIBUTE_NODE: name = doc_ns[0].attributes.item(i).name if name[:5] == "xmlns": self.ns[name[6:]] = doc_ns[0].attributes.item(i).value security.declarePublic('getPictures') def getPictures(self): """ Return a dictionnary of all pictures in the document """ if len(self.pictures) <= 0: for file_name in self.oo_files: raw_data = self.oo_files[file_name] pict_type = imghdr.what(None, raw_data) if pict_type != None: self.pictures[file_name] = raw_data return self.pictures security.declarePublic('getContentAsDom') def getContentAsDom(self): """ Return the DOM tree of the main OpenOffice content """ return self.oo_content_dom security.declarePublic('getEmbeddedSpreadsheetsAsDom') def getEmbeddedSpreadsheetsAsDom(self): """ Return a list of existing embedded spreadsheets in the file as DOM tree """ spreadsheets = [] # List all embedded spreadsheets emb_objects = self.oo_content_dom.getElementsByTagName("draw:object") for embedded in emb_objects: document = embedded.getAttributeNS(self.ns["xlink"], "href") if document: try: object_content = self.reader.fromString(self.oo_files[document[3:] + '/content.xml']) if object_content.getElementsByTagName("table:table"): spreadsheets.append(object_content) except: pass return spreadsheets security.declarePublic('getEmbeddedSpreadsheetsAsTable') def getEmbeddedSpreadsheetsAsTable(self): """ Return a list of existing spreadsheets in the file as table """ tables = [] for spreadsheet in self.getEmbeddedSpreadsheetsAsDom(): new_table = self.getSpreadsheetAsTable(spreadsheet) if new_table != None: tables.append(new_table) return tables security.declarePublic('getSpreadsheetAsTable') def getSpreadsheetAsTable(self, spreadsheet=None): """ This method convert an OpenOffice spreadsheet to a simple table. This code is base on the oo2pt tool (http://cvs.sourceforge.net/viewcvs.py/collective/CMFReportTool/oo2pt). """ if spreadsheet == None: return None # Create the table for table in spreadsheet.getElementsByTagName("table:table"): texts = [] # Store informations on column widths and default styles line_number = 0 for col in table.getElementsByTagName("table:table-column"): repeated = col.getAttributeNS(self.ns["table"],"number-columns-repeated") # Scan table and store usable informations for line in table.getElementsByTagName("table:table-row"): repeated_lines = line.getAttributeNS(self.ns["table"], "number-rows-repeated") if not repeated_lines: repeated_lines = 1 else: repeated_lines = int(repeated_lines) for i in range(repeated_lines): texts_line = {'line':[]} col_number=0 for cell in line.getElementsByTagName("table:table-cell"): repeated_cells = cell.getAttributeNS(self.ns["table"],"number-columns-repeated") if not repeated_cells: repeated_cells = 1 else: repeated_cells = int(repeated_cells) for j in range(repeated_cells): texts_cell = {'texts':[]} textTags = cell.getElementsByTagName("text:p") for text in textTags: for k in range(text.childNodes.length): child = text.childNodes[k] if child.nodeType == Node.TEXT_NODE: texts_cell['texts'].append(child.nodeValue) texts_line['line'].append(texts_cell) col_number += 1 texts.append(texts_line) line_number += 1 # Reduce the table to the minimum text_min_bounds = self._getTableMinimalBounds(texts) self._setTableBounds(texts, width=text_min_bounds['width'], height=text_min_bounds['height']) return texts security.declarePrivate('_getTableMinimalBounds') def _getTableMinimalBounds(self, texts): """ Calcul the minimum size of a text table """ empty_lines = 0 no_more_empty_lines = 0 # Eliminate all empty cells at the ends of lines and columns for line in range(len(texts)-1, -1, -1): empty_cells = 0 line_content = texts[line]['line'] for cell in range(len(line_content)-1, -1, -1): if len(line_content[cell]['texts']) == 0: empty_cells += 1 else: break if (not no_more_empty_lines) and (empty_cells == len(line_content)): empty_lines += 1 else: line_size = len(line_content) - empty_cells texts[line]['line'] = line_content[:line_size] no_more_empty_lines = 1 texts_size = len(texts) - empty_lines texts = texts[:texts_size] # Determine minimum bounds max_cols = 0 for line in range(len(texts)): line_content = texts[line]['line'] if len(line_content) > max_cols: max_cols = len(line_content) return { 'width':max_cols, 'height':len(texts) } security.declarePrivate('_setTableBounds') def _setTableBounds(self, texts, width=0, height=0): """ Enlarge a text table to given bounds """ while height > len(texts): texts.append( {'line':[]} ) for line in range(height): while width > len(texts[line]['line']): texts[line]['line'].append( {'texts':[]} ) InitializeClass(OOoParser) allow_class(OOoParser)