Commit cee5922b authored by Fabien Morin's avatar Fabien Morin

Simplify Base_showFoundText so that it no longer depends on the erp5_dms bt.

Copy the DocumentExtraction extension from erp5_dms here, because it is used by Base_showFoundText.

DocumentExtraction should be rewritten and refactored. It is copied here temporarily so that it can be used until it is rewritten.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@26337 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent fcbe09a2
##############################################################################
#
# Copyright (c) 2006-2007 Nexedi SA and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
import string, re
# Characters that are blanked out before words are compared: they are stripped
# from each word of the text before checking whether it matches, and from the
# search words to eliminate boolean mode chars.
redundant_chars='"\'.:;,-+<>()*~'
# Translation table that maps every character of redundant_chars to a space.
tr=string.maketrans(redundant_chars,' '*len(redundant_chars))
class Done(Exception):
  """Raised by Part.push when a part that holds a match has grown past its limit."""
class Word(str):
  """An ordinary (non-matching) word stored in a Part; behaves as a plain str."""
class FoundWord(str):
  """A word of the text that matched one of the search words.

  Converting it with str() wraps the word in a pair of highlighting tags
  taken from the ``tags`` attribute.
  """

  # (opening, closing) markup put around the word by __str__.
  # The empty default keeps str() usable before any caller has configured the
  # tags; previously the attribute did not exist until getExcerptText assigned
  # it on the class, so str() raised AttributeError.
  tags = ('', '')

  def __str__(self):
    return self.tags[0] + self + self.tags[1]
class Part:
  """One excerpt: a bounded run of consecutive words around the matches.

  Words are stored newest-first in ``chain`` while the part is being built.
  The chain is reversed into reading order when the part is finished, either
  here in push() just before Done is raised, or by the caller for a trailing
  part.

  tags  -- (opening, closing) markup applied to every matching word
  trail -- number of non-matching words kept before a match; every match
           extends the allowed length by trail + 1
  """

  def __init__(self, tags, trail):
    self.chain = []
    self.limit = trail   # current maximum length of chain
    self.trail = trail
    self.has = False     # True once at least one match has been added
    self.tags = tags

  def push(self, w):
    """Append the non-matching word w.

    While the part holds no match, the oldest word is dropped so that only
    the last ``limit`` words are kept. Once it holds a match and grows past
    ``limit``, the chain is put in reading order and Done is raised.
    """
    self.chain.insert(0, Word(w))
    if len(self.chain) > self.limit:
      if self.has:
        self.chain.reverse()
        raise Done()
      self.chain.pop()

  def add(self, w):
    """Append the matching word w and make room for its trailing context."""
    found = FoundWord(w)
    # Bind this part's own tags to the word instead of relying on the
    # class-wide FoundWord.tags, which any later call may overwrite before
    # the part is rendered.
    found.tags = self.tags
    self.chain.insert(0, found)
    self.limit += self.trail + 1
    self.has = True

  def __str__(self):
    return '...%s...' % ' '.join(map(str, self.chain))
def generateParts(context,text,sw,tags,trail,maxlines):
  """Yield Part objects for the stretches of text that contain search words.

  context  -- unused here; kept for interface compatibility
  text     -- iterable of words (already tokenized)
  sw       -- string of search words; characters of redundant_chars are
              blanked out and the comparison is case-insensitive
  tags     -- (opening, closing) markup handed to every Part
  trail    -- number of context words handed to every Part
  maxlines -- stop once this many full parts have been yielded

  A part that still holds a match when the text runs out is yielded last.
  """
  search_words = set(sw.translate(tr).strip().lower().split())

  def matches(word):
    return word.translate(tr).strip().lower() in search_words

  par = Part(tags, trail)
  emitted = 0
  for aw in text:
    if emitted == maxlines:
      # A plain return ends the generator. Raising StopIteration here is
      # turned into a RuntimeError by PEP 479 on recent Python versions.
      return
    if matches(aw):
      par.add(aw)
      continue
    try:
      par.push(aw)
    except Done:
      # The part is full and already in reading order: emit it, start anew.
      emitted += 1
      yield par
      par = Part(tags, trail)
  if par.has:
    # The text ended before the part filled up, so push() never reversed it.
    par.chain.reverse()
    yield par # return the last marked part
# Patterns used by getExcerptText to reduce a (possibly HTML) text to words.
# The script and head patterns also accept tags that carry attributes.
_SCRIPT_ELEMENT = re.compile(r'<script(?:\s[^>]*)?>.*?</script\s*>',
                             re.DOTALL|re.IGNORECASE)
_HEAD_ELEMENT = re.compile(r'<head(?:\s[^>]*)?>.*?</head\s*>',
                           re.DOTALL|re.IGNORECASE)
_ANY_TAG = re.compile(r'<([^>]+)>', re.DOTALL|re.IGNORECASE)
_WHITESPACE = re.compile(r'\s+')

def getExcerptText(context, txt, sw, tags, trail, maxlines):
  """
    Returns an excerpt of text found in the txt string

    txt is converted to str, stripped of html markup and split into words;
    the result is the list of parts produced by generateParts for the search
    words sw, with matches wrapped in tags, trail words of context and at
    most maxlines full parts.
  """
  txt = str(txt)
  # initialize class (kept so that found words without their own tags still
  # render with the tags of the latest call)
  FoundWord.tags = tags
  # strip html tags (in case it is a web page - we show result without formatting)
  # The script pattern used to be compiled and then overwritten before being
  # applied, so script bodies ended up in the excerpt.
  txt = _SCRIPT_ELEMENT.sub('', txt)
  txt = _HEAD_ELEMENT.sub('', txt)
  txt = _ANY_TAG.sub('', txt)
  txt = _WHITESPACE.sub(' ', txt)
  txt = txt.replace('-', ' - ') # to find hyphenated occurrences
  # very rough tokenization; split() without argument avoids the empty
  # tokens that doubled spaces would otherwise add to the parts
  text = txt.split()
  return list(generateParts(context, text, sw, tags, trail, maxlines))
if __name__=='__main__':
  # Manual check: print the excerpts found in the local file offer.txt.
  sw = 'pricing priority right acting proportion'
  source = open('offer.txt')
  try:
    txt = ' '.join([line.strip() for line in source.readlines()])
  finally:
    source.close()
  # configuration
  tags = ('<b>', '</b>')
  trail = 5
  maxlines = 5
  # getExcerptText is the entry point of this module; the former name
  # cutFound is not defined anywhere in it.
  for p in getExcerptText(None, txt, sw, tags, trail, maxlines):
    print(p)
# vim: filetype=python syntax=python shiftwidth=2
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<tuple>
<global name="ExternalMethod" module="Products.ExternalMethod.ExternalMethod"/>
<tuple/>
</tuple>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_function</string> </key>
<value> <string>getExcerptText</string> </value>
</item>
<item>
<key> <string>_module</string> </key>
<value> <string>DocumentExtraction</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Base_getExcerptText</string> </value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string></string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
...@@ -61,67 +61,21 @@ ...@@ -61,67 +61,21 @@
containing searched words as well highlighting the searched \n containing searched words as well highlighting the searched \n
words in the text itself.\n words in the text itself.\n
"""\n """\n
is_gadget_mode = context.REQUEST.get(\'is_gadget_mode\', 0)\n
\n
if is_gadget_mode:\n
# in gadget mode less space is available thus show less text\n
max_text_length = 100\n
max_lines = 1\n
\n
def getRandomDocumentTextExcerpt():\n
# try to get somewhat arbitrary choice of searchable attrs\n
if isinstance(document_text, str) and document_text!=\'\':\n
start = min(len(document_text) - 300, 200)\n
return \'... %s ...\' %document_text[start:start + max_text_length]\n
\n
# get search words from listbox selection\n
argument_names = (\'advanced_search_text\', \n
\'title\',\n
\'reference\',\n
\'searchabletext\', \n
\'searchabletext_any\',\n
\'searchabletext_all\', \n
\'searchabletext_phrase\',)\n
\n
if document_text is None:\n
# convert object to text (if possible)\n
if getattr(context, \'asText\', None) is not None and \\\n
getattr(context, \'hasBaseData\', None) is not None:\n
if context.hasBaseData():\n
# document is successfully converted\n
document_text = context.asText()\n
else:\n
# document not converted (due to a conversion error), return message to user\n
return context.Base_translateString(\'Document is not converted or missing content.\')\n
\n
\n \n
if selection is not None:\n if selection is not None:\n
params = selection.getParams()\n params = selection.getParams()\n
else:\n else:\n
params = context.portal_selections.getSelectionParamsFor(\'web_search_result_selection\')\n params = context.portal_selections.getSelectionParamsFor(\'search_result_selection\')\n
\n search_words = params.get(\'your_search_text\')\n
params = [params.get(name, \'\') for name in argument_names]\n
params = [(hasattr(par, \'sort\') and \'\'.join(par) or par) for par in params]\n
search_string = \' \'.join(params)\n
\n \n
if search_string.strip() == \'\':\n if document_text is None:\n
# listbox uses its own method, not searching\n document_text = context.getSearchableText()\n
return getRandomDocumentTextExcerpt()\n
\n
search_argument_list = context.Base_parseSearchString(search_string)\n
search_words = search_argument_list.get(\'searchabletext\', None)\n
\n
if search_words in (\'\', None,):\n
# the searched words are empty (e.g. because we used only parameters \n
# without pure searchable text)\n
return getRandomDocumentTextExcerpt()\n
\n \n
# get fragments of text containing searched words\n
found_text_fragments = context.Base_getExcerptText(\n found_text_fragments = context.Base_getExcerptText(\n
context, \\\n context, \\\n
document_text, \\\n document_text, \\\n
search_words, \\\n search_words, \\\n
tags = (\'<div style="font-weight:bold;display:inline;">\', \'</div>\'), \\\n tags = (\'<em>\', \'</em>\'), \\\n
trail = 5, \\\n trail = 5, \\\n
maxlines = max_lines)\n maxlines = max_lines)\n
result = \' \'.join(map(str, found_text_fragments))\n result = \' \'.join(map(str, found_text_fragments))\n
...@@ -173,22 +127,10 @@ return result\n ...@@ -173,22 +127,10 @@ return result\n
<string>selection</string> <string>selection</string>
<string>max_lines</string> <string>max_lines</string>
<string>max_text_length</string> <string>max_text_length</string>
<string>_getattr_</string>
<string>context</string>
<string>is_gadget_mode</string>
<string>getRandomDocumentTextExcerpt</string>
<string>argument_names</string>
<string>None</string> <string>None</string>
<string>getattr</string> <string>_getattr_</string>
<string>params</string> <string>params</string>
<string>append</string> <string>context</string>
<string>$append0</string>
<string>_getiter_</string>
<string>name</string>
<string>par</string>
<string>hasattr</string>
<string>search_string</string>
<string>search_argument_list</string>
<string>search_words</string> <string>search_words</string>
<string>found_text_fragments</string> <string>found_text_fragments</string>
<string>map</string> <string>map</string>
......
1146 1148
\ No newline at end of file \ No newline at end of file
StandardSecurity StandardSecurity
DocumentExtraction
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment