Commit 2e0efbf5 authored by Bartek Górny's avatar Bartek Górny

major refactoring; fixed snapshot generation; plain text extraction from...

major refactoring; fixed snapshot generation; plain text extraction from PdfDocument (req. pdftotext);

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@9403 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent c37a799d
......@@ -43,8 +43,7 @@ class DMSFile(XMLObject,File):
"""
Special base class, different from File only in that it can contain things
(like Role Definition, for example)
Could (perhaps should) be a parent class for OOoDocument
Should probably be located somewhere else
will be merged with File when WebDAV issues are solved
"""
# CMF Type Definition
meta_type = 'ERP5 DMS File'
......@@ -70,20 +69,32 @@ class DMSFile(XMLObject,File):
_edit=File._edit
edit=File.edit
searchable_attrs=('title','description','id','reference','version',
'short_title','keywords','subject','original_filename','source_project_title')
### Content indexing methods
security.declareProtected(Permissions.View, 'getSearchableText')
def getSearchableText(self, md=None):
"""\
"""
Used by the catalog for basic full text indexing
And so we end up with a strange hybrid of File and Document
This is the same as in OOoDocument except that no text_content here
Some people call it 'copy-and-paste programming'
"""
searchable_attrs=('title','description','id','reference','version',
'short_title','keywords','subject','original_filename','source_project_title')
searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',searchable_attrs))
searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',self.searchable_attrs))
return searchable_text
security.declarePrivate('_unpackData')
def _unpackData(self,data):
"""
Unpack Pdata into string
"""
if isinstance(data,str):
return data
else:
data_list=[]
while data is not None:
data_list.append(data.data)
data=data.next
return ''.join(data_list)
SearchableText=getSearchableText
security.declareProtected(Permissions.ModifyPortalContent, 'guessMimeType')
......@@ -93,8 +104,8 @@ class DMSFile(XMLObject,File):
if fname:
content_type,enc=mimetypes.guess_type(fname)
if content_type is not None:
self.content_type=content_type
return content_type
self.content_type=content_type
return content_type
# BG copied from File in case
......
......@@ -35,21 +35,20 @@ from Products.ERP5Type.Message import Message
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.File import File
from Products.ERP5Type.XMLObject import XMLObject
from Products.ERP5OOo.Document.DMSFile import DMSFile
from DateTime import DateTime
import xmlrpclib, base64, mimetypes, re, zipfile, cStringIO
import xmlrpclib, base64, re, zipfile, cStringIO
# to overwrite WebDAV methods
from Products.CMFDefault.File import File as CMFFile
from Products.CMFCore.utils import getToolByName
mimetypes.init()
enc=base64.encodestring
dec=base64.decodestring
class ConvertionError(Exception):pass
#class OOoDocument(File):
class OOoDocument(XMLObject,File):
class OOoDocument(DMSFile):
"""
A file document able to convert OOo compatible files to
any OOo supported format, to capture metadata and to
......@@ -121,20 +120,7 @@ class OOoDocument(XMLObject,File):
rx_strip=re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)
rx_compr=re.compile('\s+')
### Content indexing methods
security.declareProtected(Permissions.View, 'getSearchableText')
def getSearchableText(self, md=None):
"""\
Used by the catalog for basic full text indexing
And so we end up with a strange hybrid of File and Document
"""
searchable_attrs=('title','description','id','text_content','reference','version',
'short_title','keywords','subject','original_filename','source_project_title')
searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',searchable_attrs))
return searchable_text
SearchableText=getSearchableText
searchable_attrs=DMSFile.searchable_attrs+('text_content',)
security.declareProtected(Permissions.ModifyPortalContent,'clearCache')
def clearCache(self):
......@@ -313,20 +299,6 @@ class OOoDocument(XMLObject,File):
data=self.oo_data
return data
security.declarePrivate('_unpackData')
def _unpackData(self,data):
"""
Unpack Pdata into string
"""
if isinstance(data,str):
return data
else:
data_list=[]
while data is not None:
data_list.append(data.data)
data=data.next
return ''.join(data_list)
security.declareProtected(Permissions.View,'hasFile')
def hasFile(self):
"""
......@@ -373,8 +345,15 @@ class OOoDocument(XMLObject,File):
return self.returnMessage('already has a snapshot')
raise ConvertionError('already has a snapshot')
# making snapshot
self.makeFile('pdf')
self.snapshot=Pdata(self._unpackData(self.cached_data['pdf'])) # XXX - use propertysheet accessors
# we have to figure out which pdf format to use
tgts=[x[1] for x in self.getTargetFormatItemList() if x[1].endswith('pdf')]
if len(tgts)>1:
return self.returnMessage('multiple pdf formats found - this shouldnt happen')
if len(tgts)==0:
return self.returnMessage('no pdf format found')
fmt=tgts[0]
self.makeFile(fmt)
self.snapshot=Pdata(self._unpackData(self.cached_data[fmt])) # XXX - use propertysheet accessors
return self.returnMessage('snapshot created')
security.declareProtected(Permissions.View,'getSnapshot')
......@@ -515,19 +494,6 @@ class OOoDocument(XMLObject,File):
s+='</table>'
return s
# this will go out after refactoring (will be inherited from DMS File
# and eventually from File
security.declareProtected(Permissions.ModifyPortalContent, 'guessMimeType')
def guessMimeType(self,fname=''):
'''get mime type from file name'''
if fname=='':fname=self.getOriginalFilename()
if fname:
content_type,enc=mimetypes.guess_type(fname)
if content_type is not None:
self.content_type=content_type
return content_type
# make sure to call the right edit methods
_edit=File._edit
edit=File.edit
......
##############################################################################
#
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsibility of assessing all potential
# consequences resulting from its eventual inadequacies and bugs.
# End users who are looking for a ready-to-use solution with commercial
# guarantees and support are strongly advised to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from AccessControl import ClassSecurityInfo
from Products.CMFCore.WorkflowCore import WorkflowMethod
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5OOo.Document.DMSFile import DMSFile
import tempfile, os
class PdfDocument(DMSFile):
  """
  PdfDocument - same as file, but has its own getSearchableText method
  (converts via pdftotext)

  The plain text extracted from the PDF is stored with setTextContent and
  takes part in full text indexing because 'text_content' is appended to
  the searchable_attrs inherited from DMSFile.
  """
  # CMF Type Definition
  meta_type = 'ERP5 Pdf Document'
  portal_type = 'Pdf Document'
  isPortalContent = 1
  isRADContent = 1

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Reference
                    , PropertySheet.DMSFile
                    , PropertySheet.Document
                    )

  # Same attributes as DMSFile, plus the text extracted from the PDF
  searchable_attrs=DMSFile.searchable_attrs+('text_content',)

  ### Content indexing methods
  security.declareProtected(Permissions.View, 'getSearchableText')
  def getSearchableText(self, md=None, force=0):
    """
    Used by the catalog for basic full text indexing
    we get text content by using pdftotext
    but we have to do it only once

    md    -- passed through unchanged to DMSFile.getSearchableText
    force -- when equal to 1, run pdftotext again even if a text content
             is already stored

    The conversion is attempted only if the object has a 'data' attribute
    and either force is 1 or getTextContent() returns None. Newlines in
    the pdftotext output are replaced by spaces before the result is
    stored with setTextContent.

    Returns whatever DMSFile.getSearchableText returns.
    """
    if hasattr(self,'data') and (force==1 or self.getTextContent() is None):
      tmp=tempfile.NamedTemporaryFile()
      try:
        tmp.write(self._unpackData(self.data))
        # pdftotext opens the file by name, so the bytes must be flushed
        # out of our buffer before the command is started
        tmp.flush()
        cmd='pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name
        r=os.popen(cmd)
        try:
          text=r.read()
        finally:
          # always release the pipe, even if reading failed
          r.close()
      finally:
        # always close the temporary file, even if the conversion failed
        tmp.close()
      self.setTextContent(text.replace('\n',' '))
    return DMSFile.getSearchableText(self,md)

  SearchableText=getSearchableText
# vim: syntax=python shiftwidth=2
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment