Commit 464bf99d authored by Bartek Górny

caching moved out to mixin class; stripping headers from html representation;...

caching moved out to mixin class; stripping headers from html representation; caching in PdfDocument;

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@9427 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent 891e8078
......@@ -35,10 +35,98 @@ from Products.ERP5Type.XMLObject import XMLObject
# to overwrite WebDAV methods
from Products.CMFDefault.File import File as CMFFile
import mimetypes
import mimetypes, re
from DateTime import DateTime
mimetypes.init()
# Patterns applied, in order, by stripHtml() to turn a complete HTML page
# into a fragment that can be embedded inside another page.
rs=[]
# Comments and SGML declarations (<!-- ... -->, <!DOCTYPE ...>).
# The match must stop at the construct's own terminator: a greedy '.*'
# runs to the LAST '>' on the line and wipes out single-line documents.
rs.append(re.compile(r'<!--.*?-->|<![^>]*>',re.DOTALL))
# The whole HEAD element, with or without attributes on the opening tag;
# non-greedy so that nothing after the first </HEAD> is swallowed.
rs.append(re.compile(r'<HEAD\b[^>]*>.*?</HEAD\s*>',re.DOTALL|re.IGNORECASE))
# Opening and closing HTML / BODY tags only (their content is kept).
# '\b' keeps unrelated tags that merely start with the same letters.
rs.append(re.compile(r'</?(HTML|BODY)\b[^>]*>',re.IGNORECASE))

def stripHtml(txt):
  """
    Return txt with the document-level HTML wrapper removed.

    Every pattern in the module-level list rs is applied in turn and each
    match is replaced by an empty string: comments and <!...> declarations,
    the HEAD element, and the HTML and BODY tags themselves.
    Everything else is returned unchanged.
  """
  for r in rs:
    txt=r.sub('',txt)
  return txt
class CachingMixin:
  """
    Mixin keeping, per target format, a generated file, its mime type and
    the time it was generated.

    The three dictionaries below are class attributes used as empty
    defaults. They must never be modified in place: that would write into
    state shared by every instance which has not yet got its own copy.
    All writers therefore rebind the attribute on the instance
    (see _cacheStore).
  """
  # time of generation of various formats
  cached_time={}
  # generated files (cache)
  cached_data={}
  # mime types for cached formats XXX to be refactored
  cached_mime={}

  # Declarative security
  security = ClassSecurityInfo()
  security.declareObjectProtected(Permissions.AccessContentsInformation)

  security.declareProtected(Permissions.ModifyPortalContent,'clearCache')
  def clearCache(self):
    """
      Clear cache (invoked by interaction workflow upon file upload);
      needed here to overwrite class attributes with instance attributes
    """
    self.cached_time={}
    self.cached_data={}
    self.cached_mime={}

  # Private helper, kept without a docstring like the other mutators.
  # Copy-on-write store: copies the dictionary called `name`, sets
  # `format` -> `value` in the copy and rebinds the copy on the instance.
  # Rebinding (instead of `self.<name>[format]=value`) keeps the shared
  # class-level default untouched and goes through attribute assignment,
  # which is what lets a persistent object notice the change.
  def _cacheStore(self,name,format,value):
    updated=dict(getattr(self,name))
    updated[format]=value
    setattr(self,name,updated)

  security.declareProtected(Permissions.View,'hasFileCache')
  def hasFileCache(self,format):
    """
      Checks whether we have a version in this format
    """
    return format in self.cached_data

  def getCacheTime(self,format):
    """
      Checks when, if ever, the file was produced
      (returns 0 if it never was)
    """
    return self.cached_time.get(format,0)

  # Records "now" as the generation time of `format`.
  # NOTE(review): deliberately left without a docstring, as in the
  # original - presumably so that the publisher does not expose this
  # mutator; confirm before adding one.
  def cacheUpdate(self,format):
    self._cacheStore('cached_time',format,DateTime())

  # Stores the mime type and/or the data for `format`; an argument left
  # as None leaves the corresponding entry untouched.
  # NOTE(review): no docstring on purpose, see cacheUpdate.
  def cacheSet(self,format,mime=None,data=None):
    if mime is not None:
      self._cacheStore('cached_mime',format,mime)
    if data is not None:
      self._cacheStore('cached_data',format,data)

  def cacheGet(self,format):
    '''
      Return a (mime, data) pair for the format; each element is an
      empty string when nothing is cached for it.

      we could be much cooler here - pass testing and updating methods to this function
      so that it does it all by itself; this'd eliminate the need for cacheSet public method
    '''
    return self.cached_mime.get(format,''),self.cached_data.get(format,'')

  security.declareProtected(Permissions.View,'getCacheInfo')
  def getCacheInfo(self):
    """
      Get cache details as string (for debugging)

      Builds an html table with one row per format found in cached_time:
      format, size of cached data, generation time and the result of
      self.isFileChanged(format). isFileChanged is not defined by this
      mixin - the class it is mixed into has to provide it.
    """
    rows=['CACHE INFO:<br/><table><tr><td>format</td><td>size</td><td>time</td><td>is changed</td></tr>']
    for f,t in self.cached_time.items():
      data=self.cached_data.get(f)
      if data:
        if isinstance(data,str):
          ln=len(data)
        else:
          # not a plain string: walk the chain of chunks linked
          # through .next and add up the length of each .data
          ln=0
          while data is not None:
            ln+=len(data.data)
            data=data.next
      else:
        ln='no data!!!'
      rows.append('<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' % (f,str(ln),str(t),str(self.isFileChanged(f))))
    rows.append('</table>')
    return ''.join(rows)

class DMSFile(XMLObject,File):
"""
Special base class, different from File only in that it can contain things
......
......@@ -35,7 +35,7 @@ from Products.ERP5Type.Message import Message
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5.Document.File import File
from Products.ERP5Type.XMLObject import XMLObject
from Products.ERP5OOo.Document.DMSFile import DMSFile
from Products.ERP5OOo.Document.DMSFile import DMSFile, CachingMixin, stripHtml
from DateTime import DateTime
import xmlrpclib, base64, re, zipfile, cStringIO
# to overwrite WebDAV methods
......@@ -48,7 +48,7 @@ dec=base64.decodestring
class ConvertionError(Exception):pass
#class OOoDocument(File):
class OOoDocument(DMSFile):
class OOoDocument(DMSFile, CachingMixin):
"""
A file document able to convert OOo compatible files to
any OOo supported format, to capture metadata and to
......@@ -107,31 +107,12 @@ class OOoDocument(DMSFile):
, PropertySheet.OOoDocument
)
# time of generation of various formats
cached_time={}
# generated files (cache)
cached_data={}
# mime types for cached formats XXX to be refactored
cached_mime={}
# XXX the above craves for a separate class, but I'm not sure how to handle
# it in ZODB, so for now let it be
# regexps for stripping xml from docs
rx_strip=re.compile('<[^>]*?>',re.DOTALL|re.MULTILINE)
rx_compr=re.compile('\s+')
searchable_attrs=DMSFile.searchable_attrs+('text_content',)
security.declareProtected(Permissions.ModifyPortalContent,'clearCache')
def clearCache(self):
"""
Clear cache (invoked by interaction workflow upon file upload
needed here to overwrite class attribute with instance attrs
"""
self.cached_time={}
self.cached_data={}
self.cached_mime={}
def _getServerCoordinates(self):
"""
Returns OOo conversion server data from
......@@ -353,7 +334,8 @@ class OOoDocument(DMSFile):
return self.returnMessage('no pdf format found')
fmt=tgts[0]
self.makeFile(fmt)
self.snapshot=Pdata(self._unpackData(self.cached_data[fmt])) # XXX - use propertysheet accessors
#self.snapshot=Pdata(self._unpackData(self.cached_data[fmt]))
self.snapshot=Pdata(self._unpackData(self.cacheGet(format)[1]))
return self.returnMessage('snapshot created')
security.declareProtected(Permissions.View,'getSnapshot')
......@@ -364,7 +346,7 @@ class OOoDocument(DMSFile):
'''getSnapshot'''
if not self.hasSnapshot():
self.createSnapshot()
return self.snapshot # XXX - use propertysheet accessors
return self.snapshot
security.declareProtected(Permissions.ManagePortal,'deleteSnapshot')
def deleteSnapshot(self):
......@@ -380,7 +362,6 @@ class OOoDocument(DMSFile):
'''
get simplified html version to display
'''
# XXX use caching method
# we have to figure out which html format to use
tgts=[x[1] for x in self.getTargetFormatItemList() if x[1].startswith('html')]
if len(tgts)==0:
......@@ -398,7 +379,7 @@ class OOoDocument(DMSFile):
break
z.close()
cs.close()
return h
return stripHtml(h)
security.declareProtected(Permissions.View,'getTargetFile')
def getTargetFile(self,format,REQUEST=None):
......@@ -409,7 +390,7 @@ class OOoDocument(DMSFile):
return self.returnMessage('can not convert to '+format+' for some reason')
try:
self.makeFile(format)
return self.cached_mime[format],self.cached_data[format]
return self.cacheGet(format)
except ConvertionError,e:
return self.returnMessage(str(e))
......@@ -421,19 +402,6 @@ class OOoDocument(DMSFile):
if not self.hasOOfile():return True
return self.getLastUploadTime() > self.getLastConvertTime()
security.declareProtected(Permissions.View,'hasFileCache')
def hasFileCache(self,format):
"""
Checks whether we have a version in this format
"""
return self.cached_data.has_key(format)
def getCacheTime(self,format):
"""
Checks when if ever was the file produced
"""
return self.cached_time.get(format,0)
security.declareProtected(Permissions.View,'isFileChanged')
def isFileChanged(self,format):
"""
......@@ -467,14 +435,15 @@ class OOoDocument(DMSFile):
raise ConvertionError('needs conversion')
if self.isFileChanged(format):
try:
self.cached_mime[format],self.cached_data[format]=self._makeFile(format)
mime,data=self._makeFile(format)
self.cacheSet(format,mime,data)
self._p_changed=1 # XXX not sure it is necessary
except xmlrpclib.Fault,e:
if REQUEST is not None:
return self.returnMessage('Problem: %s' % str(e))
else:
raise ConvertionError(str(e))
self.cached_time[format]=DateTime()
self.cacheUpdate(format)
if REQUEST is not None:
return self.returnMessage('%s created' % format)
else:
......@@ -493,31 +462,6 @@ class OOoDocument(DMSFile):
#self.log('_makeFile',mime)
return kw['mime'],Pdata(dec(kw['data']))
security.declareProtected(Permissions.View,'getCacheInfo')
def getCacheInfo(self):
"""
Get cache details as string (for debugging)
"""
s='CACHE INFO:<br/><table><tr><td>format</td><td>size</td><td>time</td><td>is changed</td></tr>'
#self.log('getCacheInfo',self.cached_time)
#self.log('getCacheInfo',self.cached_data)
for f in self.cached_time.keys():
t=self.cached_time[f]
data=self.cached_data.get(f)
if data:
if isinstance(data,str):
ln=len(data)
else:
ln=0
while data is not None:
ln+=len(data.data)
data=data.next
else:
ln='no data!!!'
s+='<tr><td>%s</td><td>%s</td><td>%s</td><td>%s</td></tr>' % (f,str(ln),str(t),str(self.isFileChanged(f)))
s+='</table>'
return s
# make sure to call the right edit methods
_edit=File._edit
edit=File.edit
......
......@@ -30,12 +30,12 @@ from AccessControl import ClassSecurityInfo
from Products.CMFCore.WorkflowCore import WorkflowMethod
from Products.ERP5Type import Permissions, PropertySheet, Constraint, Interface
from Products.ERP5Type.Cache import CachingMethod
from Products.ERP5OOo.Document.DMSFile import DMSFile
from Products.ERP5OOo.Document.DMSFile import DMSFile, CachingMixin, stripHtml
import tempfile, os
class PdfDocument(DMSFile):
class PdfDocument(DMSFile, CachingMixin):
"""
PdfDocument - same as file, but has its own getSearchableText method
(converts via pdftotext)
......@@ -68,9 +68,12 @@ class PdfDocument(DMSFile):
"""
Used by the catalog for basic full text indexing
we get text content by using pdftotext
but we have to do it only once
but we have to do it only once after uplad
for simplicity we check only modification_date, which means we rebuild txt and html after every edit
but that shouldn't hurt too much
"""
if hasattr(self,'data') and (force==1 or self.getTextContent() is None):
if hasattr(self,'data') and (force==1 or self.getCacheTime('txt')<self.getModificationDate() or self.getTextContent() is None):
self.log('PdfDocument','regenerating txt')
tmp=tempfile.NamedTemporaryFile()
tmp.write(self._unpackData(self.data))
tmp.seek(0)
......@@ -79,26 +82,31 @@ class PdfDocument(DMSFile):
self.setTextContent(r.read().replace('\n',' '))
tmp.close()
r.close()
self.cacheUpdate('txt')
return DMSFile.getSearchableText(self,md)
SearchableText=getSearchableText
def getHtmlRepresentation(self):
def getHtmlRepresentation(self, force=0):
'''
get simplified html version to display
'''
# XXX use caching method
if not hasattr(self,'data'):
return 'no data'
tmp=tempfile.NamedTemporaryFile()
tmp.write(self._unpackData(self.data))
tmp.seek(0)
cmd='pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
r=os.popen(cmd)
h=r.read()
tmp.close()
r.close()
return h
if force==1 or self.getCacheTime('html')<self.getModificationDate():
self.log('PdfDocument','regenerating html')
tmp=tempfile.NamedTemporaryFile()
tmp.write(self._unpackData(self.data))
tmp.seek(0)
cmd='pdftohtml -enc UTF-8 -stdout -noframes -i %s' % tmp.name
r=os.popen(cmd)
h=r.read()
tmp.close()
r.close()
h=stripHtml(h)
self.cacheSet('html',data=h)
self.cacheUpdate('html')
return self.cacheGet('html')[1]
# vim: syntax=python shiftwidth=2
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment