Commit cee5922b authored by Fabien Morin's avatar Fabien Morin

Simplify Base_showFoundText so that it no longer depends on the erp5_dms bt.

Copy the DocumentExtraction extension from erp5_dms to here because it is used by Base_showFoundText.

DocumentExtraction should be rewritten and refactored. It is copied here temporarily so that it can be used until it is rewritten.

git-svn-id: https://svn.erp5.org/repos/public/erp5/trunk@26337 20353a03-c40f-0410-a6d1-a30d3c3de9de
parent fcbe09a2
##############################################################################
#
# Copyright (c) 2006-2007 Nexedi SA and Contributors. All Rights Reserved.
#
# WARNING: This program as such is intended to be used by professional
# programmers who take the whole responsability of assessing all potential
# consequences resulting from its eventual inadequacies and bugs
# End users who are looking for a ready-to-use solution with commercial
# garantees and support are strongly adviced to contract a Free Software
# Service Company
#
# This program is Free Software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
##############################################################################
import string, re
# Characters we need to strip from a word before we see if it matches, and
# from the search words to eliminate boolean mode chars.
redundant_chars='"\'.:;,-+<>()*~'
# Translation table that turns every redundant character into a space.
tr=string.maketrans(redundant_chars,' '*len(redundant_chars))
class Done(Exception):
    """Signal raised by Part.push when a fragment holding a match is full."""
class Word(str):
    """Ordinary (non-matching) token of an excerpt; behaves exactly like str."""
class FoundWord(str):
    """
    Token that matched one of the search words.

    str() of an instance returns the word wrapped in the (opening, closing)
    pair stored in the class attribute ``tags``. getExcerptText assigns that
    attribute before it builds the excerpts.
    """

    # Neutral default so that rendering a FoundWord before any caller has
    # assigned FoundWord.tags returns the bare word instead of raising
    # AttributeError.
    tags = ('', '')

    def __str__(self):
        return self.tags[0] + self + self.tags[1]
class Part:
    """
    One excerpt fragment: a bounded window of words around search matches.

    Words are stored newest-first in ``chain``. The chain is reversed into
    reading order when the fragment is finished (here in push, or by the
    code driving the Part for the last fragment).
    """

    def __init__(self,tags,trail):
        self.tags = tags     # highlight tag pair; stored but not read by this class
        self.trail = trail   # number of context words kept around a match
        self.limit = trail   # current maximum chain length, grows on every match
        self.has = False     # turns True once a matching word was added
        self.chain = []

    def push(self,w):
        """
        Record the ordinary word w at the front of the chain.

        On overflow without a match the oldest word is dropped. On overflow
        with a match the chain is reversed and Done is raised.
        """
        self.chain[:0] = [Word(w)]
        if len(self.chain) <= self.limit:
            return
        if not self.has:
            self.chain.pop()
            return
        self.chain.reverse()
        raise Done()

    def add(self,w):
        """Record the matching word w and widen the window by trail + 1."""
        self.chain[:0] = [FoundWord(w)]
        self.limit = self.limit + self.trail + 1
        self.has = True

    def __str__(self):
        rendered = [str(item) for item in self.chain]
        return '...%s...' % ' '.join(rendered)
def generateParts(context,text,sw,tags,trail,maxlines):
    """
    Yield Part fragments of ``text`` that contain at least one search word.

    context  -- not used by this function
    text     -- sequence of tokens (words) to scan
    sw       -- string with the search words, separated by whitespace
    tags     -- highlight tag pair handed to every Part
    trail    -- number of context words handed to every Part
    maxlines -- the scan stops once this many full fragments were yielded

    Words are compared case-insensitively after the characters listed in
    redundant_chars have been blanked out through the ``tr`` table.
    """
    par = Part(tags, trail)
    # Normalise the search words once, the same way each token is normalised
    # below; a frozenset keeps the per-token membership test cheap.
    search_words = frozenset(sw.translate(tr).strip().lower().split())

    def matches(word):
        return word.translate(tr).strip().lower() in search_words

    emitted = 0
    last_index = len(text) - 1
    for counter, aw in enumerate(text):
        if emitted == maxlines:
            # A plain return ends the generator. Raising StopIteration here
            # is turned into RuntimeError on interpreters implementing PEP 479.
            return
        if matches(aw):
            par.add(aw)
        else:
            try:
                par.push(aw)
            except Done:
                # The fragment is full and holds a match: emit it and start
                # collecting a new one.
                emitted += 1
                yield par
                par = Part(tags, trail)
        if counter == last_index and par.has:
            # End of text: push never reversed this chain, so do it here.
            par.chain.reverse()
            yield par # return the last marked part
# Patterns used by getExcerptText to reduce a web page to plain text.
# Script and head blocks are removed together with their content; the
# opening tags may carry attributes.
_SCRIPT_BLOCK_RE = re.compile(r'<script\b[^>]*>.*?</script\s*>', re.DOTALL|re.IGNORECASE)
_HEAD_BLOCK_RE = re.compile(r'<head\b[^>]*>.*?</head\s*>', re.DOTALL|re.IGNORECASE)
_ANY_TAG_RE = re.compile(r'<([^>]+)>', re.DOTALL|re.IGNORECASE)
_WHITESPACE_RE = re.compile(r'\s+')

def getExcerptText(context, txt, sw, tags, trail, maxlines):
    """
    Returns an excerpt of text found in the txt string

    context  -- passed through to generateParts
    txt      -- text (possibly HTML) to search; converted with str()
    sw       -- string with the search words
    tags     -- (opening, closing) pair put around every found word
    trail    -- number of context words passed to generateParts
    maxlines -- maximum number of fragments passed to generateParts

    The result is a list of the Part objects produced by generateParts.
    """
    txt = str(txt)
    # initialize class
    # NOTE: this is a class-level assignment, so it is shared by every
    # FoundWord instance in the process.
    FoundWord.tags = tags
    # strip html tags (in case it is a web page - we show result without formatting)
    # The script pattern used to be compiled and then overwritten without
    # being applied, which left script bodies in the excerpt.
    txt = _SCRIPT_BLOCK_RE.sub('', txt)
    txt = _HEAD_BLOCK_RE.sub('', txt)
    txt = _ANY_TAG_RE.sub('', txt)
    txt = _WHITESPACE_RE.sub(' ', txt)
    txt = txt.replace('-',' - ') # to find hyphenated occurrences
    # very rough tokenization; whitespace runs were collapsed to single
    # spaces above, so no newline is left to handle separately
    text = txt.split(' ')
    return list(generateParts(context,text,sw,tags,trail,maxlines))
if __name__=='__main__':
    # Manual smoke test: print the excerpts found in the local file offer.txt.
    sw = 'pricing priority right acting proportion'
    source = open('offer.txt')
    try:
        txt = ' '.join([l.strip() for l in source.readlines()])
    finally:
        # release the file handle even when reading fails
        source.close()
    # configuration
    tags = ('<b>','</b>')
    trail = 5
    maxlines = 5
    # getExcerptText is the entry point taking exactly these arguments; the
    # previously called name cutFound is not defined in this module.
    for p in getExcerptText(None,txt,sw,tags,trail,maxlines):
        print(p)
# vim: filetype=python syntax=python shiftwidth=2
<?xml version="1.0"?>
<ZopeData>
<record id="1" aka="AAAAAAAAAAE=">
<pickle>
<tuple>
<global name="ExternalMethod" module="Products.ExternalMethod.ExternalMethod"/>
<tuple/>
</tuple>
</pickle>
<pickle>
<dictionary>
<item>
<key> <string>_function</string> </key>
<value> <string>getExcerptText</string> </value>
</item>
<item>
<key> <string>_module</string> </key>
<value> <string>DocumentExtraction</string> </value>
</item>
<item>
<key> <string>id</string> </key>
<value> <string>Base_getExcerptText</string> </value>
</item>
<item>
<key> <string>title</string> </key>
<value> <string></string> </value>
</item>
</dictionary>
</pickle>
</record>
</ZopeData>
......@@ -61,67 +61,21 @@
containing searched words as well highlighting the searched \n
words in the text itself.\n
"""\n
is_gadget_mode = context.REQUEST.get(\'is_gadget_mode\', 0)\n
\n
if is_gadget_mode:\n
# in gadget mode less space is available thus show less text\n
max_text_length = 100\n
max_lines = 1\n
\n
def getRandomDocumentTextExcerpt():\n
# try to get somewhat arbitrary choice of searchable attrs\n
if isinstance(document_text, str) and document_text!=\'\':\n
start = min(len(document_text) - 300, 200)\n
return \'... %s ...\' %document_text[start:start + max_text_length]\n
\n
# get search words from listbox selection\n
argument_names = (\'advanced_search_text\', \n
\'title\',\n
\'reference\',\n
\'searchabletext\', \n
\'searchabletext_any\',\n
\'searchabletext_all\', \n
\'searchabletext_phrase\',)\n
\n
if document_text is None:\n
# convert object to text (if possible)\n
if getattr(context, \'asText\', None) is not None and \\\n
getattr(context, \'hasBaseData\', None) is not None:\n
if context.hasBaseData():\n
# document is successfully converted\n
document_text = context.asText()\n
else:\n
# document not converted (due to a conversion error), return message to user\n
return context.Base_translateString(\'Document is not converted or missing content.\')\n
\n
\n
if selection is not None:\n
params = selection.getParams()\n
else:\n
params = context.portal_selections.getSelectionParamsFor(\'web_search_result_selection\')\n
\n
params = [params.get(name, \'\') for name in argument_names]\n
params = [(hasattr(par, \'sort\') and \'\'.join(par) or par) for par in params]\n
search_string = \' \'.join(params)\n
params = context.portal_selections.getSelectionParamsFor(\'search_result_selection\')\n
search_words = params.get(\'your_search_text\')\n
\n
if search_string.strip() == \'\':\n
# listbox uses its own method, not searching\n
return getRandomDocumentTextExcerpt()\n
\n
search_argument_list = context.Base_parseSearchString(search_string)\n
search_words = search_argument_list.get(\'searchabletext\', None)\n
\n
if search_words in (\'\', None,):\n
# the searched words are empty (e.g. because we used only parameters \n
# without pure searchable text)\n
return getRandomDocumentTextExcerpt()\n
if document_text is None:\n
document_text = context.getSearchableText()\n
\n
# get fragments of text containing searched words\n
found_text_fragments = context.Base_getExcerptText(\n
context, \\\n
document_text, \\\n
search_words, \\\n
tags = (\'<div style="font-weight:bold;display:inline;">\', \'</div>\'), \\\n
tags = (\'<em>\', \'</em>\'), \\\n
trail = 5, \\\n
maxlines = max_lines)\n
result = \' \'.join(map(str, found_text_fragments))\n
......@@ -173,22 +127,10 @@ return result\n
<string>selection</string>
<string>max_lines</string>
<string>max_text_length</string>
<string>_getattr_</string>
<string>context</string>
<string>is_gadget_mode</string>
<string>getRandomDocumentTextExcerpt</string>
<string>argument_names</string>
<string>None</string>
<string>getattr</string>
<string>_getattr_</string>
<string>params</string>
<string>append</string>
<string>$append0</string>
<string>_getiter_</string>
<string>name</string>
<string>par</string>
<string>hasattr</string>
<string>search_string</string>
<string>search_argument_list</string>
<string>context</string>
<string>search_words</string>
<string>found_text_fragments</string>
<string>map</string>
......
1146
\ No newline at end of file
1148
\ No newline at end of file
StandardSecurity
\ No newline at end of file
StandardSecurity
DocumentExtraction
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment