Commit 2e0efbf5 authored by Bartek Górny's avatar Bartek Górny

major refactoring; fixed snapshot generation; plain text extraction from...

major refactoring; fixed snapshot generation; plain text extraction from PdfDocument (req. pdftotext);

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@9403 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent c37a799d
...@@ -43,8 +43,7 @@ class DMSFile(XMLObject,File): ...@@ -43,8 +43,7 @@ class DMSFile(XMLObject,File):
""" """
Special base class, different from File only in that it can contain things Special base class, different from File only in that it can contain things
(like Role Definition, for example) (like Role Definition, for example)
Could (perhaps should) be a parent class for OOoDocument will be merged with File when WebDAV issues are solved
Should probably be located somewhere else
""" """
# CMF Type Definition # CMF Type Definition
meta_type = 'ERP5 DMS File' meta_type = 'ERP5 DMS File'
...@@ -70,20 +69,32 @@ class DMSFile(XMLObject,File): ...@@ -70,20 +69,32 @@ class DMSFile(XMLObject,File):
_edit=File._edit _edit=File._edit
edit=File.edit edit=File.edit
searchable_attrs=('title','description','id','reference','version',
'short_title','keywords','subject','original_filename','source_project_title')
### Content indexing methods ### Content indexing methods
security.declareProtected(Permissions.View, 'getSearchableText') security.declareProtected(Permissions.View, 'getSearchableText')
def getSearchableText(self, md=None): def getSearchableText(self, md=None):
"""\ """
Used by the catalog for basic full text indexing Used by the catalog for basic full text indexing
And so we end up with a strange hybrid of File and Document
This is the same as in OOoDocument except that no text_content here
Some people call it 'copy-and-paste programming'
""" """
searchable_attrs=('title','description','id','reference','version', searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',self.searchable_attrs))
'short_title','keywords','subject','original_filename','source_project_title')
searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',searchable_attrs))
return searchable_text return searchable_text
security.declarePrivate('_unpackData')
def _unpackData(self,data):
"""
Unpack Pdata into string
"""
if isinstance(data,str):
return data
else:
data_list=[]
while data is not None:
data_list.append(data.data)
data=data.next
return ''.join(data_list)
SearchableText=getSearchableText SearchableText=getSearchableText
security.declareProtected(Permissions.ModifyPortalContent, 'guessMimeType') security.declareProtected(Permissions.ModifyPortalContent, 'guessMimeType')
...@@ -93,8 +104,8 @@ class DMSFile(XMLObject,File): ...@@ -93,8 +104,8 @@ class DMSFile(XMLObject,File):
if fname: if fname:
content_type,enc=mimetypes.guess_type(fname) content_type,enc=mimetypes.guess_type(fname)
if content_type is not None: if content_type is not None:
self.content_type=content_type self.content_type=content_type
return content_type return content_type
# BG copied from File in case # BG copied from File in case
......
...@@ -35,21 +35,20 @@ from Products.ERP5Type.Message import Message ...@@ -35,21 +35,20 @@ from Products.ERP5Type.Message import Message
from Products.ERP5Type.Cache import CachingMethod from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.File import File from Products.ERP5.Document.File import File
from Products.ERP5Type.XMLObject import XMLObject from Products.ERP5Type.XMLObject import XMLObject
from Products.ERP5OOo.Document.DMSFile import DMSFile
from DateTime import DateTime from DateTime import DateTime
import xmlrpclib, base64, mimetypes, re, zipfile, cStringIO import xmlrpclib, base64, re, zipfile, cStringIO
# to overwrite WebDAV methods # to overwrite WebDAV methods
from Products.CMFDefault.File import File as CMFFile from Products.CMFDefault.File import File as CMFFile
from Products.CMFCore.utils import getToolByName from Products.CMFCore.utils import getToolByName
mimetypes.init()
enc=base64.encodestring enc=base64.encodestring
dec=base64.decodestring dec=base64.decodestring
class ConvertionError(Exception):pass class ConvertionError(Exception):pass
#class OOoDocument(File): #class OOoDocument(File):
class OOoDocument(XMLObject,File): class OOoDocument(DMSFile):
""" """
A file document able to convert OOo compatible files to A file document able to convert OOo compatible files to
any OOo supported format, to capture metadata and to any OOo supported format, to capture metadata and to
...@@ -121,20 +120,7 @@ class OOoDocument(XMLObject,File): ...@@ -121,20 +120,7 @@ class OOoDocument(XMLObject,File):
rx_strip=re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE) rx_strip=re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)
rx_compr=re.compile('\s+') rx_compr=re.compile('\s+')
### Content indexing methods searchable_attrs=DMSFile.searchable_attrs+('text_content',)
security.declareProtected(Permissions.View, 'getSearchableText')
def getSearchableText(self, md=None):
"""\
Used by the catalog for basic full text indexing
And so we end up with a strange hybrid of File and Document
"""
searchable_attrs=('title','description','id','text_content','reference','version',
'short_title','keywords','subject','original_filename','source_project_title')
searchable_text = ' '.join(map(lambda x: self.getProperty(x) or ' ',searchable_attrs))
return searchable_text
SearchableText=getSearchableText
security.declareProtected(Permissions.ModifyPortalContent,'clearCache') security.declareProtected(Permissions.ModifyPortalContent,'clearCache')
def clearCache(self): def clearCache(self):
...@@ -313,20 +299,6 @@ class OOoDocument(XMLObject,File): ...@@ -313,20 +299,6 @@ class OOoDocument(XMLObject,File):
data=self.oo_data data=self.oo_data
return data return data
security.declarePrivate('_unpackData')
def _unpackData(self,data):
"""
Unpack Pdata into string
"""
if isinstance(data,str):
return data
else:
data_list=[]
while data is not None:
data_list.append(data.data)
data=data.next
return ''.join(data_list)
security.declareProtected(Permissions.View,'hasFile') security.declareProtected(Permissions.View,'hasFile')
def hasFile(self): def hasFile(self):
""" """
...@@ -373,8 +345,15 @@ class OOoDocument(XMLObject,File): ...@@ -373,8 +345,15 @@ class OOoDocument(XMLObject,File):
return self.returnMessage('already has a snapshot') return self.returnMessage('already has a snapshot')
raise ConvertionError('already has a snapshot') raise ConvertionError('already has a snapshot')
# making snapshot # making snapshot
self.makeFile('pdf') # we have to figure out which pdf format to use
self.snapshot=Pdata(self._unpackData(self.cached_data['pdf'])) # XXX - use propertysheet accessors tgts=[x[1] for x in self.getTargetFormatItemList() if x[1].endswith('pdf')]
if len(tgts)>1:
return self.returnMessage('multiple pdf formats found - this shouldnt happen')
if len(tgts)==0:
return self.returnMessage('no pdf format found')
fmt=tgts[0]
self.makeFile(fmt)
self.snapshot=Pdata(self._unpackData(self.cached_data[fmt])) # XXX - use propertysheet accessors
return self.returnMessage('snapshot created') return self.returnMessage('snapshot created')
security.declareProtected(Permissions.View,'getSnapshot') security.declareProtected(Permissions.View,'getSnapshot')
...@@ -515,19 +494,6 @@ class OOoDocument(XMLObject,File): ...@@ -515,19 +494,6 @@ class OOoDocument(XMLObject,File):
s+='</table>' s+='</table>'
return s return s
# this will go out after refactoring (will be inherited from DMS File
# and eventually from File
security.declareProtected(Permissions.ModifyPortalContent, 'guessMimeType')
def guessMimeType(self,fname=''):
'''get mime type from file name'''
if fname=='':fname=self.getOriginalFilename()
if fname:
content_type,enc=mimetypes.guess_type(fname)
if content_type is not None:
self.content_type=content_type
return content_type
# make sure to call the right edit methods # make sure to call the right edit methods
_edit=File._edit _edit=File._edit
edit=File.edit edit=File.edit
......
##############################################################################
#
# Copyright (c) 2002-2006 Nexedi SARL and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
from AccessControl import ClassSecurityInfo
from Products.CMFCore.WorkflowCore import WorkflowMethod
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5OOo.Document.DMSFile import DMSFile
import tempfile, os
class PdfDocument(DMSFile):
  """
  PdfDocument - same as DMSFile, but has its own getSearchableText method
  which fills the 'text_content' property with the plain text of the PDF
  (converted by the external ``pdftotext`` program) so that it can be
  indexed for full text search.
  """
  # CMF Type Definition
  meta_type = 'ERP5 Pdf Document'
  portal_type = 'Pdf Document'
  isPortalContent = 1
  isRADContent = 1

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  # Default Properties
  property_sheets = ( PropertySheet.Base
                    , PropertySheet.CategoryCore
                    , PropertySheet.DublinCore
                    , PropertySheet.Version
                    , PropertySheet.Reference
                    , PropertySheet.DMSFile
                    , PropertySheet.Document
                    )

  # Same attributes as DMSFile, plus the text extracted from the PDF.
  searchable_attrs=DMSFile.searchable_attrs+('text_content',)

  ### Content indexing methods
  security.declareProtected(Permissions.View, 'getSearchableText')
  def getSearchableText(self, md=None, force=0):
    """
      Used by the catalog for basic full text indexing.

      The text content is obtained by running pdftotext on the file data.
      This is done only once: the result is stored with setTextContent and
      reused for as long as getTextContent does not return None.

      md -- passed through unchanged to DMSFile.getSearchableText
      force -- if 1, run pdftotext again even if text content is already set

      Returns whatever DMSFile.getSearchableText returns for this object.
    """
    if hasattr(self,'data') and (force==1 or self.getTextContent() is None):
      # pdftotext reads its input by file name, so the data goes through
      # a named temporary file.
      tmp=tempfile.NamedTemporaryFile()
      try:
        tmp.write(self._unpackData(self.data))
        # make sure all bytes are on disk before the child process reads them
        tmp.flush()
        cmd='pdftotext -layout -enc UTF-8 -nopgbrk %s -' % tmp.name
        r=os.popen(cmd)
        try:
          text=r.read()
        finally:
          # always release the pipe, even if reading failed
          r.close()
      finally:
        # closing the NamedTemporaryFile also removes it from disk;
        # do it on every path so that a failure does not leak it
        tmp.close()
      # NOTE(review): if pdftotext is missing or fails, an empty string is
      # stored here - confirm whether that case should be retried later.
      self.setTextContent(text.replace('\n',' '))
    return DMSFile.getSearchableText(self,md)

  SearchableText=getSearchableText
# vim: syntax=python shiftwidth=2
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment