Commit 7b951d3f authored by Hugo H. Maia Vieira

Implement getParagraphItemList and getParagraphItem


git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk/utils@41241 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent c4e63930
1.0.10 (unreleased) 1.0.10 (unreleased)
=================== ===================
- Add getImage for OOGranulate - Add getParagraphItemList and getParagraphItem for OOGranulate
- Add getImageItemList for OOGranulate - Add getImageItemList and getImage for OOGranulate
- Add OdfDocument - Add OdfDocument
- Add granulate interface. - Add granulate interface.
......
...@@ -42,6 +42,17 @@ class OOGranulate(object): ...@@ -42,6 +42,17 @@ class OOGranulate(object):
def __init__(self, file, source_format): def __init__(self, file, source_format):
self.document = OdfDocument(file, source_format) self.document = OdfDocument(file, source_format)
def _relevantParagraphList(self):
    """Return the relevant 'text:p' elements of self.document.parsed_content.

    Paragraphs nested at any depth inside a 'draw:frame' element are
    excluded. The remaining lxml.etree._Element objects are returned as a
    list.
    """
    content = self.document.parsed_content
    # One query with an ancestor test replaces the former pair of whole-tree
    # queries plus a list-membership filter, so the document is traversed
    # once and the cost no longer grows with
    # (number of paragraphs) x (number of framed paragraphs).
    return content.xpath('//text:p[not(ancestor::draw:frame)]',
                         namespaces=content.nsmap)
def getTableItemList(self, file): def getTableItemList(self, file):
"""Returns the list of table IDs in the form of (id, title).""" """Returns the list of table IDs in the form of (id, title)."""
raise NotImplementedError raise NotImplementedError
...@@ -73,14 +84,27 @@ class OOGranulate(object): ...@@ -73,14 +84,27 @@ class OOGranulate(object):
path = 'Pictures/%s' % id path = 'Pictures/%s' % id
return self.document.getFile(path) return self.document.getFile(path)
def getParagraphItemList(self):
    """Returns the list of paragraphs in the form of (id, class) where class
    may have special meaning to define TOC/TOI.

    The id is the zero-based position of the paragraph in the list returned
    by self._relevantParagraphList(). The class is the value of the
    paragraph's 'style-name' attribute in the OpenDocument text namespace,
    or None when the paragraph does not carry that attribute.
    """
    key = '{urn:oasis:names:tc:opendocument:xmlns:text:1.0}style-name'
    # enumerate() supplies the id, so no hand-maintained counter (which also
    # shadowed the builtin id()) is needed. attrib.get() keeps a single
    # unstyled paragraph from aborting the whole listing with a KeyError.
    return [(index, paragraph.attrib.get(key))
            for index, paragraph in enumerate(self._relevantParagraphList())]
def getParagraphItem(self, paragraph_id):
    """Returns the paragraph in the form of (text, class).

    paragraph_id is the zero-based position of the paragraph in the list
    returned by self._relevantParagraphList(). The text is the
    concatenation of every text node found under the paragraph; the class is
    the value of its 'style-name' attribute in the OpenDocument text
    namespace, or None when the attribute is absent.

    Returns None when no paragraph has the given id.
    """
    key = '{urn:oasis:names:tc:opendocument:xmlns:text:1.0}style-name'
    # List indexing accepts negative positions and counts them from the end,
    # which would hand back a real paragraph for an id that does not exist.
    if paragraph_id < 0:
        return None
    # Only the lookup itself can raise IndexError, so only it is guarded.
    try:
        paragraph = self._relevantParagraphList()[paragraph_id]
    except IndexError:
        return None
    text = ''.join(paragraph.xpath('.//text()', namespaces=paragraph.nsmap))
    return (text, paragraph.attrib.get(key))
def getChapterItemList(self, file): def getChapterItemList(self, file):
"""Returns the list of chapters in the form of (id, level).""" """Returns the list of chapters in the form of (id, level)."""
......
...@@ -55,11 +55,11 @@ class IImageGranulator(Interface): ...@@ -55,11 +55,11 @@ class IImageGranulator(Interface):
class ITextGranulator(Interface): class ITextGranulator(Interface):
"""Provides methods to granulate a document into chapters and paragraphs.""" """Provides methods to granulate a document into chapters and paragraphs."""
def getParagraphItemList(file): def getParagraphItemList():
"""Returns the list of paragraphs in the form of (id, class) where class may """Returns the list of paragraphs in the form of (id, class) where class may
have special meaning to define TOC/TOI.""" have special meaning to define TOC/TOI."""
def getParagraphItem(file, paragraph_id): def getParagraphItem(paragraph_id):
"""Returns the paragraph in the form of (text, class).""" """Returns the paragraph in the form of (text, class)."""
def getChapterItemList(file): def getChapterItemList(file):
......
# -*- coding: utf-8 -*-
############################################################################## ##############################################################################
# #
# Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved. # Copyright (c) 2010 Nexedi SA and Contributors. All Rights Reserved.
...@@ -83,17 +84,40 @@ class TestOOGranulate(cloudoooTestCase): ...@@ -83,17 +84,40 @@ class TestOOGranulate(cloudoooTestCase):
obtained_image = self.oogranulate.getImage('anything.png') obtained_image = self.oogranulate.getImage('anything.png')
self.assertEquals('', obtained_image) self.assertEquals('', obtained_image)
def testRelevantParagraphList(self):
    """Test if _relevantParagraphList returns a list with 'p' excluding the 'p'
    inside 'draw:frame'"""
    parsed_content = self.oogranulate.document.parsed_content
    draw_p_list = parsed_content.xpath('//draw:frame//text:p',
                                       namespaces=parsed_content.nsmap)
    relevant_p_list = self.oogranulate._relevantParagraphList()
    # Check every framed paragraph individually. Asserting that the list
    # object draw_p_list is "not in" the result is always true, because the
    # result holds elements and never a list, so it would verify nothing.
    for draw_p in draw_p_list:
        self.assertTrue(draw_p not in relevant_p_list)
def testGetParagraphItemList(self):
    """Test if getParagraphItemList() returns the right paragraphs list, with
    the ids always in the same order"""
    # Build a fresh OOGranulate several times to check the ids are stable
    # from one parse of the same document to the next.
    for attempt in range(5):
        # Read the fixture as bytes so that no text-mode translation is
        # applied to it, and close the handle explicitly instead of leaving
        # it to the garbage collector.
        odt_file = open('./data/granulate_test.odt', 'rb')
        try:
            data = odt_file.read()
        finally:
            odt_file.close()
        oogranulate = OOGranulate(data, 'odt')
        paragraph_list = oogranulate.getParagraphItemList()
        self.assertEquals((0, 'P3'), paragraph_list[0])
        self.assertEquals((1, 'P1'), paragraph_list[1])
        self.assertEquals((2, 'P12'), paragraph_list[2])
        self.assertEquals((8, 'P13'), paragraph_list[8])
        self.assertEquals((19, 'Standard'), paragraph_list[19])
def testGetParagraphItemSuccessfully(self):
    """Check that getParagraphItem() gives back the expected paragraphs."""
    # A short paragraph is compared as a whole (text, class) pair.
    short_paragraph = self.oogranulate.getParagraphItem(8)
    self.assertEquals(('Some images without title', 'P13'), short_paragraph)

    # A long paragraph is checked through its class and both ends of its
    # text.
    text, style_name = self.oogranulate.getParagraphItem(5)
    self.assertEquals('P8', style_name)
    self.assertTrue(text.startswith(u'A prática cotidiana prova'))
    self.assertTrue(text.endswith(u'corresponde às necessidades.'))
def testGetParagraphItemWithoutSuccess(self):
    """Check that getParagraphItem() gives None for an id that does not
    exist."""
    missing_paragraph = self.oogranulate.getParagraphItem(200)
    self.assertEquals(None, missing_paragraph)
def testGetChapterItemList(self): def testGetChapterItemList(self):
"""Test if getChapterItemList() returns the right chapters list""" """Test if getChapterItemList() returns the right chapters list"""
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment