Commit addcfc68 authored by Andreas Jung

- TextIndex: Enhanced splitter functionality now allows the
        TextIndex to index numbers and single characters. It is also
        possible to enable case-sensitive indexing. The new
        configuration options are available through the addForm
        of the Vocabulary object.
parent 700b73e7
......@@ -42,6 +42,13 @@ Zope Changes
- Nicer formatting for the increasingly tall permissions
table.
- TextIndex: Enhanced splitter functionality now allows the
TextIndex to index numbers and single characters. It is also
possible to enable case-sensitive indexing. The new
configuration options are available through the addForm
of the Vocabulary object.
Bugs:
- Collector #32: Use difflib instead of ndiff
......
......@@ -14,7 +14,6 @@
from Lexicon import Lexicon
import Splitter
import re, string
from BTrees.IIBTree import IISet, union, IITreeSet
......@@ -56,9 +55,10 @@ class GlobbingLexicon(Lexicon):
eow = '$'
def __init__(self,useSplitter=None):
def __init__(self,useSplitter=None,extra=None):
self.clear()
self.useSplitter = useSplitter
self.splitterParams = extra
self.SplitterFunc = Splitter.getSplitter(self.useSplitter)
def clear(self):
......@@ -239,9 +239,16 @@ class GlobbingLexicon(Lexicon):
## sense in stemming a globbing lexicon.
try:
return self.SplitterFunc(astring,None,encoding)
return self.SplitterFunc(
astring,
words,
encoding=encoding,
singlechar=self.splitterParams.splitterSingleChars,
indexnumbers=self.splitterParams.splitterIndexNumbers,
casefolding=self.splitterParams.splitterCasefolding
)
except:
return self.SplitterFunc(astring,None)
return self.SplitterFunc(astring, words)
def createRegex(self, pat):
......@@ -269,4 +276,3 @@ class GlobbingLexicon(Lexicon):
return "%s$" % result
......@@ -41,8 +41,7 @@ class Lexicon(Persistent, Implicit):
# default for older objects
stop_syn={}
def __init__(self, stop_syn=None,useSplitter=None):
def __init__(self, stop_syn=None,useSplitter=None,extra=None):
self.clear()
if stop_syn is None:
......@@ -52,7 +51,7 @@ class Lexicon(Persistent, Implicit):
self.useSplitter = Splitter.splitterNames[0]
if useSplitter: self.useSplitter=useSplitter
self.splitterParams = extra
self.SplitterFunc = Splitter.getSplitter(self.useSplitter)
......@@ -153,10 +152,17 @@ class Lexicon(Persistent, Implicit):
def Splitter(self, astring, words=None, encoding = "latin1"):
""" wrap the splitter """
if words is None:
words = self.stop_syn
if words is None: words = self.stop_syn
try:
return self.SplitterFunc(astring, words, encoding)
return self.SplitterFunc(
astring,
words,
encoding=encoding,
singlechar=self.splitterParams.splitterSingleChars,
indexnumbers=self.splitterParams.splitterIndexNumbers,
casefolding=self.splitterParams.splitterCasefolding
)
except:
return self.SplitterFunc(astring, words)
......@@ -164,10 +170,6 @@ class Lexicon(Persistent, Implicit):
def query_hook(self, q):
""" we don't want to modify the query cuz we're dumb """
return q
stop_words=(
'am', 'ii', 'iii', 'per', 'po', 're', 'a', 'about', 'above', 'across',
......@@ -217,6 +219,3 @@ stop_words=(
stop_word_dict={}
for word in stop_words: stop_word_dict[word]=None
from ISO_8859_1_Splitter import ISO_8859_1_Splitter
def Splitter(txt,stopwords=None,encoding='latin1'):
return ISO_8859_1_Splitter(txt,stopwords)
from ZopeSplitter import ZopeSplitter
def Splitter(txt,stopwords={},encoding="latin1"):
return ZopeSplitter(txt,stopwords)
......@@ -28,8 +28,7 @@ def getSplitter(name=None):
if not name: name = splitterNames[0]
if not vars().has_key(name):
exec( "from %s import Splitter as %s" % (name,name))
exec( "from %s.%s import %s" % (name,name,name))
return vars()[name]
......
......@@ -23,18 +23,21 @@ from Products.PluginIndexes.TextIndex import Splitter
manage_addVocabularyForm=DTMLFile('dtml/addVocabulary',globals())
def manage_addVocabulary(self, id, title, globbing=None, splitter='', REQUEST=None):
def manage_addVocabulary(self, id, title, globbing=None, extra=None,
splitter='', REQUEST=None):
"""Add a Vocabulary object
"""
id=str(id)
title=str(title)
if globbing: globbing=1
c=Vocabulary(id, title, globbing,splitter)
c=Vocabulary(id, title, globbing,splitter,extra)
self._setObject(id, c)
if REQUEST is not None:
return self.manage_main(self,REQUEST,update_menu=1)
class _extra: pass
class Vocabulary(Item, Persistent, Implicit,
AccessControl.Role.RoleManager,
......@@ -75,20 +78,28 @@ class Vocabulary(Item, Persistent, Implicit,
manage_main = DTMLFile('dtml/manage_vocab', globals())
manage_query = DTMLFile('dtml/vocab_query', globals())
def __init__(self, id, title='', globbing=None,splitter=None):
def __init__(self, id, title='', globbing=None,splitter=None,extra=None):
""" create the lexicon to manage... """
self.id = id
self.title = title
self.globbing = not not globbing
self.useSplitter = Splitter.splitterNames[0]
if splitter:
self.useSplitter = splitter
if not extra:
extra = _extra()
extra.splitterIndexNumbers = 0
extra.splitterSingleChars = 0
extra.splitterCasefolding = 1
if globbing:
self.lexicon = GlobbingLexicon.GlobbingLexicon(useSplitter=self.useSplitter)
self.lexicon = GlobbingLexicon.GlobbingLexicon(
useSplitter=self.useSplitter,extra=extra)
else:
self.lexicon = Lexicon.Lexicon(stop_word_dict,useSplitter=self.useSplitter)
self.lexicon = Lexicon.Lexicon(stop_word_dict,
useSplitter=self.useSplitter,extra=extra)
def getLexicon(self):
return self.lexicon
......@@ -115,8 +126,6 @@ class Vocabulary(Item, Persistent, Implicit,
def manage_stop_syn(self, stop_syn, REQUEST=None):
pass
def insert(self, word=''):
self.lexicon.set(word)
......
......@@ -45,15 +45,56 @@
</td>
</tr>
</dtml-if>
<tr>
<td align="left" valign="top">
<div class="form-label">
Globbing?
</td>
<td align="left" valign="top">
<input type="checkbox" name="globbing" />
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Index numbers
</td>
<td align="left" valign="top">
<select name="extra.splitterIndexNumbers:record:int">
<option value="0" selected>no
<option value="1">yes
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Index single characters
</td>
<td align="left" valign="top">
<select name="extra.splitterSingleChars:record:int" >
<option value="0" selected>no
<option value="1">yes
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
Case-insensitive
</td>
<td align="left" valign="top">
<select name="extra.splitterCasefolding:record:int">
<option value="0" >no
<option value="1"selected>yes
</select>
</td>
</tr>
<tr>
<td align="left" valign="top">
<div class="form-label">
globbing?
</td>
<td align="left" valign="top">
<input type="checkbox" name="globbing" />
</td>
</tr>
<tr>
<td align="left" valign="top">
</td>
......
......@@ -4,15 +4,28 @@
<dtml-var manage_tabs>
<p class="form-text">
<dtml-try>
<dtml-let x="getLexicon().multi_wc"></dtml-let>
Globbing is <em>enabled</em>
<dtml-let lexicon="getLexicon()">
<dtml-try>
<dtml-let x="lexicon().multi_wc"></dtml-let>
Globbing is <em>enabled</em>
<dtml-except>
Globbing is <em>disabled</em>
</dtml-try>
<dtml-if useSplitter>
, Splitter is <em><dtml-var useSplitter></em>
</dtml-if>
<dtml-try>
, Index number=<dtml-var "lexicon.splitterParams.splitterIndexNumbers">
, Case-insensitve=<dtml-var "lexicon.splitterParams.splitterCasefolding">
, Index single characters=<dtml-var "lexicon.splitterParams.splitterSingleChars">
<dtml-except>
Globbing is <em>disabled</em>
</dtml-try>
<dtml-if useSplitter>
, Splitter is <em><dtml-var useSplitter></em>
</dtml-if>
</dtml-try>
</dtml-let>
</p>
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment