strip html tags from found text displayed in listbox

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@9072 20353a03-c40f-0410-a6d1-a30d3c3de9de

strip html tags from found text displayed in listbox
git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@9072 20353a03-c40f-0410-a6d1-a30d3c3de9de
dc691a29 · Bartek Górny · d027369d · dc691a29
Commit dc691a29 authored Aug 07, 2006 by Bartek Górny
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 1 deletion

bt5/erp5_dms/ExtensionTemplateItem/cutFound.py bt5/erp5_dms/ExtensionTemplateItem/cutFound.py +10 -1

No files found.
--- a/bt5/erp5_dms/ExtensionTemplateItem/cutFound.py
+++ b/bt5/erp5_dms/ExtensionTemplateItem/cutFound.py
-import string
+import string, re

 # chars we need to strip from a word before we see if it matches
 redundant_chars='"\'.:;,-'
 # string.maketrans only exists on Python 2; on Python 3 the equivalent
 # table builder is str.maketrans.  Pick whichever is available so the
 # module can be imported on both.
 try:
   _maketrans=string.maketrans
 except AttributeError:
   _maketrans=str.maketrans
 # translation table replacing every redundant char with a space
 tr=_maketrans(redundant_chars,' '*len(redundant_chars))
@@ -59,7 +59,16 @@ def generateParts(context,text,sw,tags,trail,maxlines):


 def cutFound(context,txt,sw,tags,trail,maxlines):
   """
   Reduce txt to plain text, tokenize it and return the generated parts.

   txt is treated as possibly being a web page: <script> and <head>
   elements are removed together with their content, every remaining
   tag is removed, and each run of whitespace is collapsed to a single
   space.  The result is split on single spaces and handed to
   generateParts.

   context, sw, trail and maxlines are passed to generateParts unchanged;
   tags is passed as well and is also stored on the FoundWord class
   before the parts are generated.

   Returns a list with everything generateParts yields.
   """
   # initialize class
   FoundWord.tags=tags
   # strip html tags (in case it is a web page - we show result without formatting)
   flags=re.DOTALL|re.IGNORECASE
   # Elements whose content is not displayable text are removed first,
   # content included.  The opening tag may carry attributes
   # (e.g. <script type="text/javascript">); \b keeps <head> from
   # matching <header>.
   for element in ('script','head'):
     pattern=re.compile(r'<%s\b[^>]*>.*?</%s\s*>' % (element,element),flags)
     txt=pattern.sub('',txt)
   # drop every remaining tag, keeping the text between tags
   txt=re.compile(r'<[^>]+>').sub('',txt)
   # collapse whitespace runs (newlines included) into one space
   txt=re.compile(r'\s+').sub(' ',txt)
   # very rough tokenization; no newline is left at this point, so splitting
   # on single spaces is enough (a leading/trailing space still yields an
   # empty token, as before)
   text=txt.split(' ')
   return list(generateParts(context,text,sw,tags,trail,maxlines))