Commit 841b0206 authored by Jérome Perrin

OOoParser: rewrite the way we extract text from cells

Some specific features such as text:s, text:tab or text:line-break are now
supported.
parent aaca1dce
...@@ -428,10 +428,43 @@ class OOoParser(Implicit): ...@@ -428,10 +428,43 @@ class OOoParser(Implicit):
xpath = '@*[local-name()="%s"]' % attribute_type_mapping[value_type] xpath = '@*[local-name()="%s"]' % attribute_type_mapping[value_type]
cell_data = str(cell.xpath(xpath)[0]) cell_data = str(cell.xpath(xpath)[0])
else: # read text nodes else: # read text nodes
text_tags = cell.findall('./{%s}p' % cell.nsmap['text']) # Text nodes can contain multiple <text:p> tags, one for each
if len(text_tags): # line. There are also some tags for special entities, for
cell_data = ''.join([text.xpath('string(.)') # instance <text:s/> for a space (or using <text:s text:c="3"/>
for text in text_tags]) # for multiple spaces) <text:tab/> for a tab and <text:line-break/>
# for new line
text_ns = cell.nsmap['text']
def format_node(node):
    """Return the plain text held by a table cell or one of its text nodes.

    Supported elements:

    * ``table:table-cell``: its children are formatted and joined with a
      new line; children that format to ``None`` are skipped.
    * ``text:p``: its text followed by its formatted children.
    * ``text:span`` and ``text:a``: their text, their formatted children
      and their tail.
    * ``text:s``: as many spaces as the ``text:c`` attribute says (1 when
      the attribute is absent).
    * ``text:tab`` and ``text:line-break``: a tab / a new line.

    The tail of an inline element (the text that follows its closing tag)
    is part of the surrounding paragraph, so it is always kept.

    Any other element (for instance a cell annotation) is ignored and
    ``None`` is returned for it.

    :param node: an lxml-like element exposing ``tag``, ``text``, ``tail``,
      ``nsmap``, ``get()`` and ``iterchildren()``.
    :return: the extracted text, or ``None`` for an ignored element.
    :raises KeyError: if the ``table`` or ``text`` prefix is not declared
      in ``node.nsmap``.
    """
    # Read the namespace map once instead of once per branch.
    nsmap = node.nsmap
    tag = node.tag

    if tag == '{%s}table-cell' % nsmap['table']:
        # One entry per line of the cell. The tail of direct children is
        # not read here, only what each child formats to.
        formatted = [format_node(child) for child in node.iterchildren()]
        return "\n".join(part for part in formatted if part is not None)

    def join_parts(part_list):
        # Missing text / tail are None: drop them before joining.
        return ''.join(part for part in part_list if part)

    def inline_parts(element):
        # Text of `element` followed by the rendering of each child.
        part_list = [element.text]
        for child in element.iterchildren():
            rendered = format_node(child)
            if rendered is None:
                # The child itself is ignored, but the text following it
                # still belongs to `element` and must not be lost.
                rendered = child.tail
            part_list.append(rendered)
        return part_list

    text_prefix = '{%s}' % nsmap['text']

    if tag == text_prefix + 'p':
        # The tail of a paragraph is not part of the cell text.
        return join_parts(inline_parts(node))
    if tag in (text_prefix + 'span', text_prefix + 'a'):
        # A link can wrap its label in spans, so it is walked like a span.
        return join_parts(inline_parts(node) + [node.tail])
    if tag == text_prefix + 's':
        count = int(node.get(text_prefix + 'c', 1))
        return join_parts([node.text, ' ' * count, node.tail])
    if tag == text_prefix + 'tab':
        return join_parts([node.text, '\t', node.tail])
    if tag == text_prefix + 'line-break':
        return join_parts([node.text, '\n', node.tail])
    # Anything else (annotations, comments, ...) is ignored.
    return None
cell_data = format_node(cell)
# Add the cell to the line # Add the cell to the line
table_line.append(cell_data) table_line.append(cell_data)
......
...@@ -106,6 +106,16 @@ class TestOOoParser(unittest.TestCase): ...@@ -106,6 +106,16 @@ class TestOOoParser(unittest.TestCase):
if not_ok: if not_ok:
self.fail('Spreadsheet not read!') self.fail('Spreadsheet not read!')
def test_getSpreadSheetMappingText(self):
    """Check the text read from cells that use special text entities.

    The ``complex_text.ods`` fixture is expected to hold a single sheet,
    ``Feuille1``, whose first four rows contain leading spaces, a tab and
    a line break.
    """
    parser = OOoParser()
    ods_file = open(makeFilePath('complex_text.ods'), 'rb')
    try:
        parser.openFile(ods_file)
        mapping = parser.getSpreadsheetsMapping()
    finally:
        # Do not leak the file handle, even if parsing fails.
        ods_file.close()
    # list() keeps the comparison valid when keys() is not a plain list.
    self.assertEqual(['Feuille1'], list(mapping.keys()))
    # NOTE(review): rows 0 and 1 have the same expected value here; row 1
    # presumably covers several leading spaces -- confirm against the
    # fixture and the original commit.
    self.assertEqual(mapping['Feuille1'][0], [' leading space'])
    self.assertEqual(mapping['Feuille1'][1], [' leading space'])
    self.assertEqual(mapping['Feuille1'][2], ['tab\t'])
    self.assertEqual(mapping['Feuille1'][3], ['New\nLine'])
def test_suite(): def test_suite():
suite = unittest.TestSuite() suite = unittest.TestSuite()
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment