Commit 464bf99d authored by Bartek Górny

caching moved out to mixin class; stripping headers from html representation;...

caching moved out to mixin class; stripping headers from html representation; caching in PdfDocument;

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@9427 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 891e8078
...@@ -35,10 +35,98 @@ from Products.ERP5Type.XMLObject import XMLObject ...@@ -35,10 +35,98 @@ from Products.ERP5Type.XMLObject import XMLObject
# to overwrite WebDAV methods # to overwrite WebDAV methods
from Products.CMFDefault.File import File as CMFFile from Products.CMFDefault.File import File as CMFFile
import mimetypes import mimetypes, re
from DateTime import DateTime
mimetypes.init() mimetypes.init()
# Patterns for the document-level markup that must be removed before an html
# representation can be embedded in another page. They are applied in order.
rs=[]
# comments go first because they may legally contain '>'; non-greedy so that
# text between two comments survives
rs.append(re.compile(r'<!--.*?-->',re.DOTALL))
# doctype and other <!...> declarations; [^>]* stops at the first '>' so a
# document delivered on a single line is not wiped out entirely
rs.append(re.compile(r'<![^>]*>'))
# the whole head element, with or without attributes; \b keeps <header> intact
rs.append(re.compile(r'<HEAD\b[^>]*>.*?</HEAD\s*>',re.DOTALL|re.IGNORECASE))
# opening and closing html/body tags only - their content is kept
rs.append(re.compile(r'</?(HTML|BODY)\b[^>]*>',re.IGNORECASE))

def stripHtml(txt):
  """
    Return txt without comments, declarations, the head element and the
    html/body tags, so that only the body content is left.
    txt is expected to be a string; everything else in it is returned untouched.
  """
  for r in rs:
    txt=r.sub('',txt)
  return txt
class CachingMixin:
  """
    Mixin keeping generated representations of a document, keyed by format
    name: the data itself, its mime type and the time it was produced.

    The three dictionaries below are class-level defaults shared by every
    instance, so they must never be mutated in place; all writes go through
    _cachePut, which stores a fresh dictionary on the instance.
  """
  # time of generation of various formats
  cached_time={}
  # generated files (cache)
  cached_data={}
  # mime types for cached formats XXX to be refactored
  cached_mime={}

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  security.declareProtected(Permissions.ModifyPortalContent,'clearCache')
  def clearCache(self):
    """
      Clear cache (invoked by interaction workflow upon file upload);
      needed here to overwrite class attribute with instance attrs
    """
    self.cached_time={}
    self.cached_data={}
    self.cached_mime={}

  def _cachePut(self,attr,format,value):
    # Copy-on-write update of one of the cache dictionaries.
    # Mutating self.cached_xxx[format] directly would write into the dictionary
    # shared through the class whenever clearCache has not run on this instance
    # yet, leaking the entry to every other document. Assigning a new dictionary
    # avoids that, and attribute assignment is also what lets a persistent host
    # object register the change (presumably the hosts are ZODB persistent
    # through XMLObject - TODO confirm).
    updated=dict(getattr(self,attr))
    updated[format]=value
    setattr(self,attr,updated)

  security.declareProtected(Permissions.View,'hasFileCache')
  def hasFileCache(self,format):
    """
      Checks whether we have a version in this format
    """
    return format in self.cached_data

  def getCacheTime(self,format):
    """
      Checks when if ever was the file produced;
      returns 0 if it has never been produced
    """
    return self.cached_time.get(format,0)

  def cacheUpdate(self,format):
    # Record now as the generation time of the given format.
    # (kept as a comment, not a docstring: the original method had none and
    # ZPublisher only publishes objects that have a docstring)
    self._cachePut('cached_time',format,DateTime())

  def cacheSet(self,format,mime=None,data=None):
    # Store mime type and/or data for the given format; an argument left
    # as None keeps whatever is already cached for it.
    # (comment instead of docstring for the same reason as in cacheUpdate)
    if mime is not None:
      self._cachePut('cached_mime',format,mime)
    if data is not None:
      self._cachePut('cached_data',format,data)

  def cacheGet(self,format):
    '''
      Returns a (mime, data) pair for the format; each element is an empty
      string if nothing is cached for it.
      we could be much cooler here - pass testing and updating methods to this function
      so that it does it all by itself; this'd eliminate the need for cacheSet public method
    '''
    return self.cached_mime.get(format,''),self.cached_data.get(format,'')

  security.declareProtected(Permissions.View,'getCacheInfo')
  def getCacheInfo(self):
    """
      Get cache details as string (for debugging)
    """
    # NOTE(review): isFileChanged is not defined by this mixin; the class it is
    # mixed into has to provide it (OOoDocument does).
    s='CACHE INFO:<br/><table><tr><td>format</td><td>size</td><td>time</td><td>is changed</td></tr>'
    for f,t in self.cached_time.items():
      data=self.cached_data.get(f)
      if data:
        if isinstance(data,str):
          ln=len(data)
        else:
          # not a plain string: walk the chain of chunks linked through
          # .next and add up the size of each chunk's .data
          ln=0
          while data is not None:
            ln+=len(data.data)
            data=data.next
      else:
        ln='no data!!!'
      s+='<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' % (f,str(ln),str(t),str(self.isFileChanged(f)))
    s+='</table>'
    return s
class DMSFile(XMLObject,File): class DMSFile(XMLObject,File):
""" """
Special base class, different from File only in that it can contain things Special base class, different from File only in that it can contain things
......
...@@ -35,7 +35,7 @@ from Products.ERP5Type.Message import Message ...@@ -35,7 +35,7 @@ from Products.ERP5Type.Message import Message
from Products.ERP5Type.Cache import CachingMethod from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.File import File from Products.ERP5.Document.File import File
from Products.ERP5Type.XMLObject import XMLObject from Products.ERP5Type.XMLObject import XMLObject
from Products.ERP5OOo.Document.DMSFile import DMSFile from Products.ERP5OOo.Document.DMSFile import DMSFile, CachingMixin, stripHtml
from DateTime import DateTime from DateTime import DateTime
import xmlrpclib, base64, re, zipfile, cStringIO import xmlrpclib, base64, re, zipfile, cStringIO
# to overwrite WebDAV methods # to overwrite WebDAV methods
...@@ -48,7 +48,7 @@ dec=base64.decodestring ...@@ -48,7 +48,7 @@ dec=base64.decodestring
class ConvertionError(Exception):pass class ConvertionError(Exception):pass
#class OOoDocument(File): #class OOoDocument(File):
class OOoDocument(DMSFile): class OOoDocument(DMSFile, CachingMixin):
""" """
A file document able to convert OOo compatible files to A file document able to convert OOo compatible files to
any OOo supported format, to capture metadata and to any OOo supported format, to capture metadata and to
...@@ -107,31 +107,12 @@ class OOoDocument(DMSFile): ...@@ -107,31 +107,12 @@ class OOoDocument(DMSFile):
, PropertySheet.OOoDocument , PropertySheet.OOoDocument
) )
# time of generation of various formats
cached_time={}
# generated files (cache)
cached_data={}
# mime types for cached formats XXX to be refactored
cached_mime={}
# XXX the above craves for a separate class, but I'm not sure how to handle
# it in ZODB, so for now let it be
# regexps for stripping xml from docs # regexps for stripping xml from docs
rx_strip=re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE) rx_strip=re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)
rx_compr=re.compile('\s+') rx_compr=re.compile('\s+')
searchable_attrs=DMSFile.searchable_attrs+('text_content',) searchable_attrs=DMSFile.searchable_attrs+('text_content',)
security.declareProtected(Permissions.ModifyPortalContent,'clearCache')
def clearCache(self):
"""
Clear cache (invoked by interaction workflow upon file upload
needed here to overwrite class attribute with instance attrs
"""
self.cached_time={}
self.cached_data={}
self.cached_mime={}
def _getServerCoordinates(self): def _getServerCoordinates(self):
""" """
Returns OOo conversion server data from Returns OOo conversion server data from
...@@ -353,7 +334,8 @@ class OOoDocument(DMSFile): ...@@ -353,7 +334,8 @@ class OOoDocument(DMSFile):
return self.returnMessage('no pdf format found') return self.returnMessage('no pdf format found')
fmt=tgts[0] fmt=tgts[0]
self.makeFile(fmt) self.makeFile(fmt)
self.snapshot=Pdata(self._unpackData(self.cached_data[fmt])) # XXX - use propertysheet accessors #self.snapshot=Pdata(self._unpackData(self.cached_data[fmt]))
# read back the format that was just generated (fmt); 'format' is not a variable of this method
self.snapshot=Pdata(self._unpackData(self.cacheGet(fmt)[1]))
return self.returnMessage('snapshot created') return self.returnMessage('snapshot created')
security.declareProtected(Permissions.View,'getSnapshot') security.declareProtected(Permissions.View,'getSnapshot')
...@@ -364,7 +346,7 @@ class OOoDocument(DMSFile): ...@@ -364,7 +346,7 @@ class OOoDocument(DMSFile):
'''getSnapshot''' '''getSnapshot'''
if not self.hasSnapshot(): if not self.hasSnapshot():
self.createSnapshot() self.createSnapshot()
return self.snapshot # XXX - use propertysheet accessors return self.snapshot
security.declareProtected(Permissions.ManagePortal,'deleteSnapshot') security.declareProtected(Permissions.ManagePortal,'deleteSnapshot')
def deleteSnapshot(self): def deleteSnapshot(self):
...@@ -380,7 +362,6 @@ class OOoDocument(DMSFile): ...@@ -380,7 +362,6 @@ class OOoDocument(DMSFile):
''' '''
get simplified html version to display get simplified html version to display
''' '''
# XXX use caching method
# we have to figure out which html format to use # we have to figure out which html format to use
tgts=[x[1] for x in self.getTargetFormatItemList() if x[1].startswith('html')] tgts=[x[1] for x in self.getTargetFormatItemList() if x[1].startswith('html')]
if len(tgts)==0: if len(tgts)==0:
...@@ -398,7 +379,7 @@ class OOoDocument(DMSFile): ...@@ -398,7 +379,7 @@ class OOoDocument(DMSFile):
break break
z.close() z.close()
cs.close() cs.close()
return h return stripHtml(h)
security.declareProtected(Permissions.View,'getTargetFile') security.declareProtected(Permissions.View,'getTargetFile')
def getTargetFile(self,format,REQUEST=None): def getTargetFile(self,format,REQUEST=None):
...@@ -409,7 +390,7 @@ class OOoDocument(DMSFile): ...@@ -409,7 +390,7 @@ class OOoDocument(DMSFile):
return self.returnMessage('can not convert to '+format+' for some reason') return self.returnMessage('can not convert to '+format+' for some reason')
try: try:
self.makeFile(format) self.makeFile(format)
return self.cached_mime[format],self.cached_data[format] return self.cacheGet(format)
except ConvertionError,e: except ConvertionError,e:
return self.returnMessage(str(e)) return self.returnMessage(str(e))
...@@ -421,19 +402,6 @@ class OOoDocument(DMSFile): ...@@ -421,19 +402,6 @@ class OOoDocument(DMSFile):
if not self.hasOOfile():return True if not self.hasOOfile():return True
return self.getLastUploadTime() > self.getLastConvertTime() return self.getLastUploadTime() > self.getLastConvertTime()
security.declareProtected(Permissions.View,'hasFileCache')
def hasFileCache(self,format):
"""
Checks whether we have a version in this format
"""
return self.cached_data.has_key(format)
def getCacheTime(self,format):
"""
Checks when if ever was the file produced
"""
return self.cached_time.get(format,0)
security.declareProtected(Permissions.View,'isFileChanged') security.declareProtected(Permissions.View,'isFileChanged')
def isFileChanged(self,format): def isFileChanged(self,format):
""" """
...@@ -467,14 +435,15 @@ class OOoDocument(DMSFile): ...@@ -467,14 +435,15 @@ class OOoDocument(DMSFile):
raise ConvertionError('needs conversion') raise ConvertionError('needs conversion')
if self.isFileChanged(format): if self.isFileChanged(format):
try: try:
self.cached_mime[format],self.cached_data[format]=self._makeFile(format) mime,data=self._makeFile(format)
self.cacheSet(format,mime,data)
self._p_changed=1 # XXX not sure it is necessary self._p_changed=1 # XXX not sure it is necessary
except xmlrpclib.Fault,e: except xmlrpclib.Fault,e:
if REQUEST is not None: if REQUEST is not None:
return self.returnMessage('Problem: %s' % str(e)) return self.returnMessage('Problem: %s' % str(e))
else: else:
raise ConvertionError(str(e)) raise ConvertionError(str(e))
self.cached_time[format]=DateTime() self.cacheUpdate(format)
if REQUEST is not None: if REQUEST is not None:
return self.returnMessage('%s created' % format) return self.returnMessage('%s created' % format)
else: else:
...@@ -493,31 +462,6 @@ class OOoDocument(DMSFile): ...@@ -493,31 +462,6 @@ class OOoDocument(DMSFile):
#self.log('_makeFile',mime) #self.log('_makeFile',mime)
return kw['mime'],Pdata(dec(kw['data'])) return kw['mime'],Pdata(dec(kw['data']))
security.declareProtected(Permissions.View,'getCacheInfo')
def getCacheInfo(self):
"""
Get cache details as string (for debugging)
"""
s='CACHE INFO:<br/><table><tr><td>format</td><td>size</td><td>time</td><td>is changed</td></tr>'
#self.log('getCacheInfo',self.cached_time)
#self.log('getCacheInfo',self.cached_data)
for f in self.cached_time.keys():
t=self.cached_time[f]
data=self.cached_data.get(f)
if data:
if isinstance(data,str):
ln=len(data)
else:
ln=0
while data is not None:
ln+=len(data.data)
data=data.next
else:
ln='no data!!!'
s+='<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' % (f,str(ln),str(t),str(self.isFileChanged(f)))
s+='</table>'
return s
# make sure to call the right edit methods # make sure to call the right edit methods
_edit=File._edit _edit=File._edit
edit=File.edit edit=File.edit
......
...@@ -30,12 +30,12 @@ from AccessControl import ClassSecurityInfo ...@@ -30,12 +30,12 @@ from AccessControl import ClassSecurityInfo
from Products.CMFCore.WorkflowCore import WorkflowMethod from Products.CMFCore.WorkflowCore import WorkflowMethod
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Cache import CachingMethod from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5OOo.Document.DMSFile import DMSFile from Products.ERP5OOo.Document.DMSFile import DMSFile, CachingMixin, stripHtml
import tempfile, os import tempfile, os
class PdfDocument(DMSFile): class PdfDocument(DMSFile, CachingMixin):
""" """
PdfDocument - same as file, but has its own getSearchableText method PdfDocument - same as file, but has its own getSearchableText method
(converts via pdftotext) (converts via pdftotext)
...@@ -68,9 +68,12 @@ class PdfDocument(DMSFile): ...@@ -68,9 +68,12 @@ class PdfDocument(DMSFile):
""" """
Used by the catalog for basic full text indexing Used by the catalog for basic full text indexing
we get text content by using pdftotext we get text content by using pdftotext
but we have to do it only once but we have to do it only once after uplad
for simplicity we check only modification_date, which means we rebuild txt and html after every edit
but that shouldn't hurt too much
""" """
if hasattr(self,'data') and (force==1 or self.getTextContent() is None): if hasattr(self,'data') and (force==1 or self.getCacheTime('txt')<self.getModificationDate() or self.getTextContent() is None):
self.log('PdfDocument','regenerating txt')
tmp=tempfile.NamedTemporaryFile() tmp=tempfile.NamedTemporaryFile()
tmp.write(self._unpackData(self.data)) tmp.write(self._unpackData(self.data))
tmp.seek(0) tmp.seek(0)
...@@ -79,26 +82,31 @@ class PdfDocument(DMSFile): ...@@ -79,26 +82,31 @@ class PdfDocument(DMSFile):
self.setTextContent(r.read().replace('\n',' ')) self.setTextContent(r.read().replace('\n',' '))
tmp.close() tmp.close()
r.close() r.close()
self.cacheUpdate('txt')
return DMSFile.getSearchableText(self,md) return DMSFile.getSearchableText(self,md)
SearchableText=getSearchableText SearchableText=getSearchableText
def getHtmlRepresentation(self): def getHtmlRepresentation(self, force=0):
''' '''
get simplified html version to display get simplified html version to display
''' '''
# XXX use caching method
if not hasattr(self,'data'): if not hasattr(self,'data'):
return 'no data' return 'no data'
tmp=tempfile.NamedTemporaryFile() if force==1 or self.getCacheTime('html')<self.getModificationDate():
tmp.write(self._unpackData(self.data)) self.log('PdfDocument','regenerating html')
tmp.seek(0) tmp=tempfile.NamedTemporaryFile()
cmd='pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name tmp.write(self._unpackData(self.data))
r=os.popen(cmd) tmp.seek(0)
h=r.read() cmd='pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
tmp.close() r=os.popen(cmd)
r.close() h=r.read()
return h tmp.close()
r.close()
h=stripHtml(h)
self.cacheSet('html',data=h)
self.cacheUpdate('html')
return self.cacheGet('html')[1]
# vim: syntax=python shiftwidth=2 # vim: syntax=python shiftwidth=2
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment