Commit b8c3a39b authored by Michel Pelletier's avatar Michel Pelletier

Added stop word interface to Lexicon.

parent 8e6e5acb
......@@ -91,7 +91,7 @@ from Persistence import Persistent
from OFS.SimpleItem import Item
from SearchIndex import Lexicon, GlobbingLexicon
from VocabularyInterface import VocabularyInterface
from SearchIndex.Lexicon import stop_word_dict
manage_addVocabularyForm=HTMLFile('addVocabulary',globals())
......@@ -113,7 +113,6 @@ class Vocabulary(Item, Persistent, Implicit):
meta_type = "Vocabulary"
_isAVocabulary = 1
__extends__=(VocabularyInterface,)
manage_options=(
......@@ -137,6 +136,7 @@ class Vocabulary(Item, Persistent, Implicit):
['Anonymous', 'Manager']),
)
## manage_main = HTMLFile('vocab_manage_main', globals())
manage_vocabulary = HTMLFile('manage_vocab', globals())
......@@ -151,7 +151,7 @@ class Vocabulary(Item, Persistent, Implicit):
if globbing:
self.lexicon = GlobbingLexicon.GlobbingLexicon()
else:
self.lexicon = Lexicon.Lexicon()
self.lexicon = Lexicon.Lexicon(stop_word_dict)
def query(self, pattern):
""" """
......@@ -171,6 +171,11 @@ class Vocabulary(Item, Persistent, Implicit):
if RESPONSE:
RESPONSE.redirect(URL1 + '/manage_vocabulary')
def manage_stop_syn(self, stop_syn, REQUEST=None):
    """Accept a stop-word/synonym mapping from the management UI.

    Currently a stub: it receives the 'stop_syn:lines' field posted
    by the manage_vocab form but does nothing with it yet.
    NOTE(review): presumably this should forward the mapping to
    self.lexicon.set_stop_syn() -- confirm intended behavior.
    """
    pass
def insert(self, word=''):
self.lexicon.set(word)
......
<html>
<head>
<title>Edit <dtml-var title_or_id></title>
</head>
<body bgcolor="#ffffff" link="#000099" vlink="#555555" alink="#77003b">
<dtml-var manage_tabs>
<form action="manage_stop_syn" method="POST">
<textarea name="stop_syn:lines">
</textarea>
</form>
<br>
</body>
</html>
......@@ -239,6 +239,14 @@ class GlobbingLexicon(Lexicon):
return words
def Splitter(self, astring, words=None):
    """Wrap the module-level Splitter.

    The *words* mapping is deliberately ignored: stemming and
    stop-word removal make little sense for a globbing lexicon,
    so the raw splitter is used as-is.
    """
    return Splitter(astring)
def translate(self, pat):
"""Translate a PATTERN to a regular expression.
......
......@@ -113,11 +113,27 @@ class Lexicon(Persistent, Implicit):
"""
counter = 0
def __init__(self, stop_syn=None):
    """Create an empty lexicon.

    stop_syn -- optional mapping of stop words and synonyms, in the
    form {'word': [syn1, syn2, ...]}; an empty mapping is used when
    none is supplied.
    """
    self._lexicon = OIBTree()
    self.counter = 0
    # BUG FIX: the else branch previously assigned {} as well, which
    # silently discarded the caller-supplied mapping (e.g. the
    # stop_word_dict passed in by Vocabulary).
    if stop_syn is None:
        self.stop_syn = {}
    else:
        self.stop_syn = stop_syn
def set_stop_syn(selfb, stop_syn):
""" pass in a mapping of stopwords and synonyms. Format is:
{'word' : [syn1, syn2, ..., synx]}
Vocabularies do not necesarily need to implement this if their
splitters do not support stemming or stoping.
"""
self.stop_syn = stop_syn
def set(self, word):
""" return the word id of 'word' """
......@@ -142,8 +158,11 @@ class Lexicon(Persistent, Implicit):
def __len__(self):
return len(self._lexicon)
def Splitter(self, astring, words=None):
    """Wrap the module-level Splitter.

    When no stop-word/synonym mapping is supplied, fall back to this
    lexicon's own stop_syn mapping so stop words are stripped by
    default.
    """
    # BUG FIX: this previously read 'word = self.stop_syn' (singular),
    # a typo that left 'words' as None and so never applied the
    # lexicon's stop-word mapping.
    if words is None:
        words = self.stop_syn
    return Splitter(astring, words)
def grep(self, query):
......
......@@ -92,7 +92,7 @@ is no longer known.
"""
__version__='$Revision: 1.21 $'[11:-2]
__version__='$Revision: 1.22 $'[11:-2]
from Globals import Persistent
import BTree, IIBTree, IOBTree, OIBTree
......@@ -164,7 +164,6 @@ class UnTextIndex(Persistent, Implicit):
self.call_methods=call_methods
self._index=IOBTree()
self._unindex=IOBTree()
self._syn=stop_word_dict
else:
pass
......@@ -177,6 +176,11 @@ class UnTextIndex(Persistent, Implicit):
self._lexicon = lexicon
def __setstate__(self, state):
    """Unpickling hook: migrate instances stored by older revisions.

    Drops the obsolete _syn attribute that the index used to keep;
    stop words are now held by the lexicon instead.
    """
    # BUG FIX: the method was named '__setstate' (missing trailing
    # underscores), so pickle never invoked it and old instances kept
    # their stale _syn attribute.
    Persistent.__setstate__(self, state)
    if hasattr(self, '_syn'):
        del self._syn
def getLexicon(self, vocab_id):
""" bit of a hack, indexes have been made acquirers so that
......@@ -194,10 +198,10 @@ class UnTextIndex(Persistent, Implicit):
def __len__(self):
return len(self._unindex)
def __setstate__(self, state):
Persistent.__setstate__(self, state)
if not hasattr(self, '_lexicon'):
self._lexicon = Lexicon()
## def __setstate__(self, state):
## Persistent.__setstate__(self, state)
## if not hasattr(self, '_lexicon'):
## self._lexicon = Lexicon()
def clear(self):
......@@ -240,7 +244,11 @@ class UnTextIndex(Persistent, Implicit):
## The Splitter should now be european compliant at least.
## Someone should test this.
src = self.getLexicon(self._lexicon).Splitter(k, self._syn)
## import pdb
## pdb.set_trace()
src = self.getLexicon(self._lexicon).Splitter(k)
## This returns a tuple of stemmed words. Stopwords have been
## stripped.
......@@ -324,7 +332,7 @@ class UnTextIndex(Persistent, Implicit):
def __getitem__(self, word):
"""Return an InvertedIndex-style result "list"
"""
src = tuple(self.getLexicon(self._lexicon).Splitter(word, self._syn))
src = tuple(self.getLexicon(self._lexicon).Splitter(word))
if not src: return ResultList({}, (word,), self)
if len(src) == 1:
src=src[0]
......@@ -412,13 +420,13 @@ class UnTextIndex(Persistent, Implicit):
r = []
for word in words:
r = r+self.getLexicon(self._lexicon).Splitter(doc, self._syn).indexes(word)
r = r+self.getLexicon(self._lexicon).Splitter(doc).indexes(word)
return r
def _subindex(self, isrc, d, old, last):
src = self.getLexicon(self._lexicon).Splitter(isrc, self._syn)
src = self.getLexicon(self._lexicon).Splitter(isrc)
for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1],d,old,last)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment