Commit addcfc68 authored by Andreas Jung

- TextIndex: Enhanced splitter functionality now allows the
  TextIndex to index numbers and single characters. It is also
  possible to enable case-sensitive indexing. The new
  configuration options are available through the addForm
  of the Vocabulary object.
parent 700b73e7
...@@ -42,6 +42,13 @@ Zope Changes ...@@ -42,6 +42,13 @@ Zope Changes
- Nicer formatting for the increasingly tall permissions - Nicer formatting for the increasingly tall permissions
table. table.
- TextIndex: Enhanced splitter functionality now allows the
TextIndex to index numbers, single characters. It is also
possible to enable case-sensitive indexing. The new
configuration options are available through the addForm
of the Vocabulary object.
Bugs: Bugs:
- Collector #32: Use difflib instead of ndiff - Collector #32: Use difflib instead of ndiff
......
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
from Lexicon import Lexicon from Lexicon import Lexicon
import Splitter import Splitter
import re, string import re, string
from BTrees.IIBTree import IISet, union, IITreeSet from BTrees.IIBTree import IISet, union, IITreeSet
...@@ -56,9 +55,10 @@ class GlobbingLexicon(Lexicon): ...@@ -56,9 +55,10 @@ class GlobbingLexicon(Lexicon):
eow = '$' eow = '$'
def __init__(self,useSplitter=None): def __init__(self,useSplitter=None,extra=None):
self.clear() self.clear()
self.useSplitter = useSplitter self.useSplitter = useSplitter
self.splitterParams = extra
self.SplitterFunc = Splitter.getSplitter(self.useSplitter) self.SplitterFunc = Splitter.getSplitter(self.useSplitter)
def clear(self): def clear(self):
...@@ -239,9 +239,16 @@ class GlobbingLexicon(Lexicon): ...@@ -239,9 +239,16 @@ class GlobbingLexicon(Lexicon):
## sense in stemming a globbing lexicon. ## sense in stemming a globbing lexicon.
try: try:
return self.SplitterFunc(astring,None,encoding) return self.SplitterFunc(
astring,
words,
encoding=encoding,
singlechar=self.splitterParams.splitterSingleChars,
indexnumbers=self.splitterParams.splitterIndexNumbers,
casefolding=self.splitterParams.splitterCasefolding
)
except: except:
return self.SplitterFunc(astring,None) return self.SplitterFunc(astring, words)
def createRegex(self, pat): def createRegex(self, pat):
...@@ -269,4 +276,3 @@ class GlobbingLexicon(Lexicon): ...@@ -269,4 +276,3 @@ class GlobbingLexicon(Lexicon):
return "%s$" % result return "%s$" % result
...@@ -41,8 +41,7 @@ class Lexicon(Persistent, Implicit): ...@@ -41,8 +41,7 @@ class Lexicon(Persistent, Implicit):
# default for older objects # default for older objects
stop_syn={} stop_syn={}
def __init__(self, stop_syn=None,useSplitter=None): def __init__(self, stop_syn=None,useSplitter=None,extra=None):
self.clear() self.clear()
if stop_syn is None: if stop_syn is None:
...@@ -52,7 +51,7 @@ class Lexicon(Persistent, Implicit): ...@@ -52,7 +51,7 @@ class Lexicon(Persistent, Implicit):
self.useSplitter = Splitter.splitterNames[0] self.useSplitter = Splitter.splitterNames[0]
if useSplitter: self.useSplitter=useSplitter if useSplitter: self.useSplitter=useSplitter
self.splitterParams = extra
self.SplitterFunc = Splitter.getSplitter(self.useSplitter) self.SplitterFunc = Splitter.getSplitter(self.useSplitter)
...@@ -153,10 +152,17 @@ class Lexicon(Persistent, Implicit): ...@@ -153,10 +152,17 @@ class Lexicon(Persistent, Implicit):
def Splitter(self, astring, words=None, encoding="latin1"):
    """Wrap the configured splitter function.

    astring  -- the text to split
    words    -- stop word / synonym mapping handed to the splitter;
                defaults to self.stop_syn when None
    encoding -- encoding name forwarded to the splitter

    Returns whatever self.SplitterFunc returns.

    The splitter options (single characters, numbers, case folding)
    are read from self.splitterParams.  A lexicon constructed without
    ``extra`` stores None there, and older instances may not have the
    attribute at all; in both cases the defaults 0 / 0 / 1 are used
    instead of failing, so ``encoding`` is still passed on.
    """
    if words is None:
        words = self.stop_syn

    # Resolve the options outside the try block: a missing or None
    # splitterParams must not trigger the legacy fallback below, which
    # would silently drop the caller's encoding.
    params = getattr(self, 'splitterParams', None)
    single_chars = getattr(params, 'splitterSingleChars', 0)
    index_numbers = getattr(params, 'splitterIndexNumbers', 0)
    case_folding = getattr(params, 'splitterCasefolding', 1)

    try:
        return self.SplitterFunc(
            astring,
            words,
            encoding=encoding,
            singlechar=single_chars,
            indexnumbers=index_numbers,
            casefolding=case_folding
            )
    except Exception:
        # Best-effort fallback for splitters that only understand the
        # two-argument call.
        return self.SplitterFunc(astring, words)
...@@ -165,10 +171,6 @@ class Lexicon(Persistent, Implicit): ...@@ -165,10 +171,6 @@ class Lexicon(Persistent, Implicit):
""" we don't want to modify the query cuz we're dumb """ """ we don't want to modify the query cuz we're dumb """
return q return q
stop_words=( stop_words=(
'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across', 'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across',
'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone',
...@@ -217,6 +219,3 @@ stop_words=( ...@@ -217,6 +219,3 @@ stop_words=(
stop_word_dict={} stop_word_dict={}
for word in stop_words: stop_word_dict[word]=None for word in stop_words: stop_word_dict[word]=None
from ISO_8859_1_Splitter import ISO_8859_1_Splitter
# Adapter exposing ISO_8859_1_Splitter under the common
# Splitter(txt, stopwords, encoding) call shape.  `encoding` is accepted
# but not forwarded; only txt and stopwords reach the splitter.
def Splitter(txt,stopwords=None,encoding='latin1'):
return ISO_8859_1_Splitter(txt,stopwords)
from ZopeSplitter import ZopeSplitter
# Adapter exposing ZopeSplitter under the common
# Splitter(txt, stopwords, encoding) call shape.  `encoding` is accepted
# but not forwarded; only txt and stopwords reach the splitter.
# NOTE(review): the `{}` default is one dict shared by every call; that is
# only safe if ZopeSplitter never mutates its stopwords argument -- confirm.
def Splitter(txt,stopwords={},encoding="latin1"):
return ZopeSplitter(txt,stopwords)
...@@ -28,8 +28,7 @@ def getSplitter(name=None): ...@@ -28,8 +28,7 @@ def getSplitter(name=None):
if not name: name = splitterNames[0] if not name: name = splitterNames[0]
if not vars().has_key(name): if not vars().has_key(name):
exec( "from %s import Splitter as %s" % (name,name)) exec( "from %s.%s import %s" % (name,name,name))
return vars()[name] return vars()[name]
......
...@@ -23,18 +23,21 @@ from Products.PluginIndexes.TextIndex import Splitter ...@@ -23,18 +23,21 @@ from Products.PluginIndexes.TextIndex import Splitter
manage_addVocabularyForm=DTMLFile('dtml/addVocabulary',globals()) manage_addVocabularyForm=DTMLFile('dtml/addVocabulary',globals())
def manage_addVocabulary(self, id, title, globbing=None, splitter='', REQUEST=None): def manage_addVocabulary(self, id, title, globbing=None, extra=None,
splitter='', REQUEST=None):
"""Add a Vocabulary object """Add a Vocabulary object
""" """
id=str(id) id=str(id)
title=str(title) title=str(title)
if globbing: globbing=1 if globbing: globbing=1
c=Vocabulary(id, title, globbing,splitter) c=Vocabulary(id, title, globbing,splitter,extra)
self._setObject(id, c) self._setObject(id, c)
if REQUEST is not None: if REQUEST is not None:
return self.manage_main(self,REQUEST,update_menu=1) return self.manage_main(self,REQUEST,update_menu=1)
# Bare attribute holder.  Vocabulary.__init__ instantiates it when no
# `extra` record is supplied and sets splitterIndexNumbers,
# splitterSingleChars and splitterCasefolding on the instance.
class _extra: pass
class Vocabulary(Item, Persistent, Implicit, class Vocabulary(Item, Persistent, Implicit,
AccessControl.Role.RoleManager, AccessControl.Role.RoleManager,
...@@ -75,7 +78,7 @@ class Vocabulary(Item, Persistent, Implicit, ...@@ -75,7 +78,7 @@ class Vocabulary(Item, Persistent, Implicit,
manage_main = DTMLFile('dtml/manage_vocab', globals()) manage_main = DTMLFile('dtml/manage_vocab', globals())
manage_query = DTMLFile('dtml/vocab_query', globals()) manage_query = DTMLFile('dtml/vocab_query', globals())
def __init__(self, id, title='', globbing=None,splitter=None): def __init__(self, id, title='', globbing=None,splitter=None,extra=None):
""" create the lexicon to manage... """ """ create the lexicon to manage... """
self.id = id self.id = id
self.title = title self.title = title
...@@ -85,10 +88,18 @@ class Vocabulary(Item, Persistent, Implicit, ...@@ -85,10 +88,18 @@ class Vocabulary(Item, Persistent, Implicit,
if splitter: if splitter:
self.useSplitter = splitter self.useSplitter = splitter
if not extra:
extra = _extra()
extra.splitterIndexNumbers = 0
extra.splitterSingleChars = 0
extra.splitterCasefolding = 1
if globbing: if globbing:
self.lexicon = GlobbingLexicon.GlobbingLexicon(useSplitter=self.useSplitter) self.lexicon = GlobbingLexicon.GlobbingLexicon(
useSplitter=self.useSplitter,extra=extra)
else: else:
self.lexicon = Lexicon.Lexicon(stop_word_dict,useSplitter=self.useSplitter) self.lexicon = Lexicon.Lexicon(stop_word_dict,
useSplitter=self.useSplitter,extra=extra)
def getLexicon(self): def getLexicon(self):
return self.lexicon return self.lexicon
...@@ -115,8 +126,6 @@ class Vocabulary(Item, Persistent, Implicit, ...@@ -115,8 +126,6 @@ class Vocabulary(Item, Persistent, Implicit,
def manage_stop_syn(self, stop_syn, REQUEST=None): def manage_stop_syn(self, stop_syn, REQUEST=None):
pass pass
def insert(self, word=''): def insert(self, word=''):
self.lexicon.set(word) self.lexicon.set(word)
......
...@@ -45,15 +45,56 @@ ...@@ -45,15 +45,56 @@
</td> </td>
</tr> </tr>
</dtml-if> </dtml-if>
<tr>
<dtml-comment>
Form row for the "index numbers" splitter option.  The value is submitted
as the integer field splitterIndexNumbers of the "extra" record; "no" (0)
is preselected.
</dtml-comment>
<tr>
<td align="left" valign="top">
<div class="form-label">
Index numbers
</div>
</td>
<td align="left" valign="top">
<select name="extra.splitterIndexNumbers:record:int">
<option value="0" selected>no
<option value="1">yes
</select>
</td>
</tr>
<tr>
<td align="left" valign="top"> <td align="left" valign="top">
<div class="form-label"> <div class="form-label">
Globbing? Index single characters
</td>
<td align="left" valign="top">
<select name="extra.splitterSingleChars:record:int" >
<option value="0" selected>no
<option value="1">yes
</select>
</td>
</tr>
<dtml-comment>
Form row for the case folding splitter option.  The value is submitted as
the integer field splitterCasefolding of the "extra" record; "yes" (1) is
preselected.
</dtml-comment>
<tr>
<td align="left" valign="top">
<div class="form-label">
Case-insensitive
</div>
</td>
<td align="left" valign="top">
<select name="extra.splitterCasefolding:record:int">
<option value="0">no
<option value="1" selected>yes
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
globbing?
</td> </td>
<td align="left" valign="top"> <td align="left" valign="top">
<input type="checkbox" name="globbing" /> <input type="checkbox" name="globbing" />
</td> </td>
</tr> </tr>
<tr> <tr>
<td align="left" valign="top"> <td align="left" valign="top">
</td> </td>
......
...@@ -4,15 +4,28 @@ ...@@ -4,15 +4,28 @@
<dtml-var manage_tabs> <dtml-var manage_tabs>
<p class="form-text"> <p class="form-text">
<dtml-comment>
Status line for the vocabulary: reports whether globbing is enabled, which
splitter is in use, and the splitter options stored on the lexicon.
"lexicon" is bound once to the object returned by getLexicon() and is then
used by attribute access only; it must not be called again.
</dtml-comment>
<dtml-let lexicon="getLexicon()">
<dtml-try>
<dtml-let x="lexicon.multi_wc"></dtml-let>
Globbing is <em>enabled</em>
<dtml-except>
Globbing is <em>disabled</em>
</dtml-try>
<dtml-if useSplitter>
, Splitter is <em><dtml-var useSplitter></em>
</dtml-if>
<dtml-try>
, Index numbers=<dtml-var "lexicon.splitterParams.splitterIndexNumbers">
, Case-insensitive=<dtml-var "lexicon.splitterParams.splitterCasefolding">
, Index single characters=<dtml-var "lexicon.splitterParams.splitterSingleChars">
<dtml-except>
<dtml-comment>
Presumably lexicons created before these options existed have no usable
splitterParams; in that case the options are simply not shown (confirm).
</dtml-comment>
</dtml-try>
</dtml-let>
</p> </p>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment