Commit 5dd63b09 authored by Christopher Petrilli

Merge of the indexing-code cleanup. This isn't fully complete yet, but it
does cover the field and keyword indexes, along with some minor fixups to
the Text indexing code.
parent 4b959522
...@@ -85,7 +85,7 @@ ...@@ -85,7 +85,7 @@
"""Simple column indices""" """Simple column indices"""
__version__='$Revision: 1.18 $'[11:-2] __version__='$Revision: 1.19 $'[11:-2]
from Globals import Persistent from Globals import Persistent
...@@ -154,69 +154,80 @@ class UnIndex(Persistent, Implicit): ...@@ -154,69 +154,80 @@ class UnIndex(Persistent, Implicit):
def __len__(self): def __len__(self):
return len(self._unindex) return len(self._unindex)
def index_object(self, i, obj, threshold=None): def removeForwardIndexEntry(self, entry, documentId):
""" index and object 'obj' with integer id 'i'""" """Take the entry provided and remove any reference to documentId
in its entry in the index."""
# Before we do anything, unindex the object we've been handed, as indexRow = self._index.get(entry, MV)
# we can't depend on the user to do the right thing. if indexRow is not MV:
self.unindex_object(i) try:
indexRow.remove(documentId)
except:
LOG(self.__class__.__name__, ERROR,
('unindex_object could not remove '
'integer id %s from index %s. This '
'should not happen.'
% (str(i), str(k))))
else:
LOG(self.__class__.__name__, ERROR,
('unindex_object tried to retrieve set %s '
'from index %s but couldn\'t. This '
'should not happen.' % (repr(set),str(k))))
index = self._index def insertForwardIndexEntry(self, entry, documentId):
unindex = self._unindex """Take the entry provided and put it in the correct place
in the forward index.
id = self.id This will also deal with creating the entire row if necessary."""
try: indexRow = self._index.get(entry, MV)
k=getattr(obj, id)
if callable(k): # Make sure there's actually a row there already. If not, create
k = k() # an IntSet and stuff it in first.
except: if indexRow is MV:
k = MV self._index[entry] = intSet()
indexRow = self._index[entry]
indexRow.insert(documentId)
## if k is None or k == MV: def index_object(self, documentId, obj, threshold=None):
## return 0 """ index and object 'obj' with integer id 'documentId'"""
set = index.get(k) returnStatus = 0
if set is None:
index[k] = set = intSet()
set.insert(i) # First we need to see if there's anything interesting to look at
unindex[i] = k # self.id is the name of the index, which is also the name of the
# attribute we're interested in. If the attribute is callable,
# we'll do so.
try:
datum = getattr(obj, self.id)
if callable(datum):
datum = datum()
except:
datum = MV
# We don't want to do anything that we don't have to here, so we'll
# check to see if the new and existing information is the same.
if not (datum == self._unindex.get(documentId, MV)):
self.insertForwardIndexEntry(datum, documentId)
self._unindex[documentId] = datum
self._index = index returnStatus = 1
self._unindex = unindex self._p_changed = 1 # Tickle the transaction
return 1 return returnStatus
def unindex_object(self, i): def unindex_object(self, documentId):
""" Unindex the object with integer id 'i' and don't """ Unindex the object with integer id 'documentId' and don't
raise an exception if we fail """ raise an exception if we fail """
index = self._index
unindex = self._unindex
k = unindex.get(i, None) unindexRecord = self._unindex.get(documentId, None)
if k is None: if unindexRecord is None:
return None return None
set = index.get(k, None)
if set is not None:
try:
set.remove(i)
except:
LOG('UnIndex', ERROR, ('unindex_object could not remove '
'integer id %s from index %s. This '
'should not happen.'
% (str(i), str(k))))
else:
LOG('UnIndex', ERROR, ('unindex_object tried to retrieve set %s '
'from index %s but couldn\'t. This '
'should not happen.' % (repr(set),str(k))))
del unindex[i] self.removeForwardIndexEntry(unindexRecord, documentId)
self._index = index del self._unindex[i]
self._unindex = unindex
def _apply_index(self, request, cid=''): def _apply_index(self, request, cid=''):
...@@ -252,9 +263,6 @@ class UnIndex(Persistent, Implicit): ...@@ -252,9 +263,6 @@ class UnIndex(Persistent, Implicit):
if type(keys) not in (ListType, TupleType): if type(keys) not in (ListType, TupleType):
keys = [keys] keys = [keys]
print "XXX,"
print keys
index = self._index index = self._index
r = None r = None
anyTrue = 0 anyTrue = 0
......
...@@ -85,6 +85,7 @@ ...@@ -85,6 +85,7 @@
from UnIndex import UnIndex, MV, intSet from UnIndex import UnIndex, MV, intSet
from zLOG import LOG, ERROR from zLOG import LOG, ERROR
from Missing import MV
from types import * from types import *
class UnKeywordIndex(UnIndex): class UnKeywordIndex(UnIndex):
...@@ -98,7 +99,7 @@ class UnKeywordIndex(UnIndex): ...@@ -98,7 +99,7 @@ class UnKeywordIndex(UnIndex):
This should have an _apply_index that returns a relevance score This should have an _apply_index that returns a relevance score
""" """
def index_object(self, i, obj, threshold=None): def index_object(self, documentId, obj, threshold=None):
""" index an object 'obj' with integer id 'i' """ index an object 'obj' with integer id 'i'
Ideally, we've been passed a sequence of some sort that we Ideally, we've been passed a sequence of some sort that we
...@@ -106,64 +107,73 @@ class UnKeywordIndex(UnIndex): ...@@ -106,64 +107,73 @@ class UnKeywordIndex(UnIndex):
useful with the results. In the case of a string, this means useful with the results. In the case of a string, this means
indexing the entire string as a keyword.""" indexing the entire string as a keyword."""
# Before we do anything, unindex the object we've been handed, as # First we need to see if there's anything interesting to look at
# we can't depend on the user to do the right thing. # self.id is the name of the index, which is also the name of the
self.unindex_object(i) # attribute we're interested in. If the attribute is callable,
# we'll do so.
try:
newKeywords = getattr(obj, self.id)
if callable(newKeywords):
newKeywords = newKeywords()
except:
newKeywords = MV
index = self._index if type(newKeywords) is StringType:
unindex = self._unindex newKeywords = (keywords, )
id = self.id # Now comes the fun part, we need to figure out what's changed
# if anything from the previous record.
oldKeywords = self._unindex.get(documentId, MV)
if newKeywords is MV:
self.unindex_object(documentId)
return 0
elif oldKeywords is MV:
try: try:
kws=getattr(obj, id) for kw in newKeywords:
if callable(kws): self.insertForwardIndexEntry(kw, documentId)
kws = kws() except TypeError:
except:
return 0 return 0
else:
# We need the old keywords to be a mapping so we can manipulate
# them more easily.
tmp = {}
try:
for kw in oldKeywords:
tmp[kw] = None
oldKeywords = tmp
# Now we're going to go through the new keywords,
# and add those that aren't already indexed. If
# they are already indexed, just delete them from
# the list.
for kw in newKeywords:
if oldKeywords.has_key(kw):
del oldKeywords[kw]
else:
self.insertForwardIndexEntry(kw, documentId)
# Check to see if we've been handed a string and if so, tuplize it # Now whatever is left in oldKeywords are keywords
if type(kws) is StringType: # that we no longer have, and need to be removed
kws = tuple(kws) # from the indexes.
for kw in oldKeywords.keys():
self.removeForwardIndexEntry(kw, documentId)
# index each item in the sequence. This also catches things that are
# not sequences.
try:
for kw in kws:
set = index.get(kw)
if set is None:
index[kw] = set = intSet()
set.insert(i)
except TypeError: except TypeError:
return 0 return 0
unindex[i] = kws self._unindex[documentId] = newKeywords
self._index = index
self._unindex = unindex
return 1 return 1
def unindex_object(self, i): def unindex_object(self, documentId):
""" carefully unindex the object with integer id 'i' and do not """ carefully unindex the object with integer id 'documentId'"""
fail if it does not exist """
index = self._index
unindex = self._unindex
kws = unindex.get(i, None) keywords = self._unindex.get(documentId, MV)
if kws is None: if keywords is MV:
return None return None
for kw in kws: for kw in keywords:
set = index.get(kw, None) self.removeForwardIndexEntry(kw, documentId)
if set is not None:
set.remove(i)
else:
LOG('UnKeywordIndex', ERROR, ('unindex_object could not '
'remove %s from set'
% str(i)))
del unindex[i]
self._index = index
self._unindex = unindex
del self._unindex[documentId]
...@@ -92,7 +92,7 @@ is no longer known. ...@@ -92,7 +92,7 @@ is no longer known.
""" """
__version__='$Revision: 1.30 $'[11:-2] __version__='$Revision: 1.31 $'[11:-2]
from Globals import Persistent from Globals import Persistent
...@@ -172,18 +172,12 @@ class UnTextIndex(Persistent, Implicit): ...@@ -172,18 +172,12 @@ class UnTextIndex(Persistent, Implicit):
pass pass
if lexicon is None: if lexicon is None:
## if no lexicon is provided, create a default one
## if no lexicon is provided, create a dumb one
self._lexicon=Lexicon() self._lexicon=Lexicon()
else: else:
self._lexicon = lexicon self._lexicon = lexicon
def __setstate(self, state):
Persistent.__setstate__(self, state)
if hasattr(self, '_syn'):
del self._syn
def getLexicon(self, vocab_id): def getLexicon(self, vocab_id):
""" bit of a hack, indexes have been made acquirers so that """ bit of a hack, indexes have been made acquirers so that
...@@ -201,123 +195,102 @@ class UnTextIndex(Persistent, Implicit): ...@@ -201,123 +195,102 @@ class UnTextIndex(Persistent, Implicit):
def __len__(self): def __len__(self):
return len(self._unindex) return len(self._unindex)
## def __setstate__(self, state):
## Persistent.__setstate__(self, state)
## if not hasattr(self, '_lexicon'):
## self._lexicon = Lexicon()
def clear(self): def clear(self):
self._index = IOBTree() self._index = IOBTree()
self._unindex = IOBTree() self._unindex = IOBTree()
def index_object(self, i, obj, threshold=None): def index_object(self, documentId, obj, threshold=None):
""" Index an object: """ Index an object:
'i' is the integer id of the document 'documentId' is the integer id of the document
'obj' is the objects to be indexed 'obj' is the objects to be indexed
'threshold' is the number of words to process between 'threshold' is the number of words to process between
commiting subtransactions. If 'None' subtransactions are commiting subtransactions. If 'None' subtransactions are
not used. disabled. """
the next four arguments are default optimizations. # sniff the object for our 'id', the 'document source' of the
""" # index is this attribute. If it smells callable, call it.
# Before we do anything, unindex the object we've been handed, as
# we can't depend on the user to do the right thing.
self.unindex_object(i)
id = self.id
try: try:
## sniff the object for our 'id', the 'document source' of source = getattr(obj, self.id)
## the index is this attribute. If it smells callable, if callable(source):
## call it. source = str(source())
k = getattr(obj, id)
if callable(k):
k = str(k())
else: else:
k = str(k) source = str(source)
except: except:
return 0 return 0
d = OIBTree()
old = d.has_key
last = None
## The Splitter should now be european compliant at least.
## Someone should test this.
## import pdb sourceWords = self.getLexicon(self._lexicon).Splitter(source)
## pdb.set_trace()
src = self.getLexicon(self._lexicon).Splitter(k) wordList = OIBTree()
## This returns a tuple of stemmed words. Stopwords have been last = None
## stripped.
for s in src: # Run through the words and score them
if s[0] == '\"': last=self.subindex(s[1:-1], d, old, last) for word in sourceWords:
if word[0] == '\"':
last = self.subindex(word[1:-1], wordList,
wordList.has_key, last) # XXX
else: else:
if old(s): if wordList.has_key(word):
if s != last: d[s] = d[s]+1 if word != last:
else: d[s] = 1 wordList[word] = wordList[word]+1
else:
wordList[word] = 1
index = self._index index = self._index
unindex = self._unindex unindex = self._unindex
lexicon = self.getLexicon(self._lexicon) lexicon = self.getLexicon(self._lexicon)
get = index.get unindex[documentId] = [] # XXX this should be more intellegent
unindex[i] = [] wordCount = 0
times = 0
for word, score in d.items(): for word, score in wordList.items():
if threshold is not None: if threshold is not None:
if times > threshold: if ((wordCount % threshold) == 0) and not (wordCount == 0):
# commit a subtransaction hack # commit a subtransaction hack
get_transaction().commit(1) get_transaction().commit(1)
# kick the cache # kick the cache
self._p_jar.cacheFullSweep(1) self._p_jar.cacheFullSweep(1)
times = 0
word_id = lexicon.set(word) wordId = lexicon.set(word)
r = get(word_id) indexRow = index.get(wordId)
if r is not None: if indexRow is not None:
r = index[word_id] indexRow = index[wordId] # Duplicate?
if type(r) is TupleType: if type(indexRow) is TupleType:
r = {r[0]:r[1]} indexRow = {indexRow[0]:indexRow[1]}
r[i] = score indexRow[documentId] = score
index[word_id] = r index[wordId] = indexRow
unindex[i].append(word_id) unindex[documentId].append(wordId)
elif type(r) is DictType: elif type(indexRow) is DictType:
if len(r) > 4: if len(indexRow) > 4:
b = IIBucket() b = IIBucket()
for k, v in r.items(): b[k] = v for k, v in indexRow.items():
r = b b[k] = v
r[i] = score indexRow = b
indexRow[documentId] = score
index[word_id] = r index[wordId] = indexRow
unindex[i].append(word_id) unindex[documentId].append(wordId)
else: else:
r[i] = score indexRow[documentId] = score
unindex[i].append(word_id) unindex[documentId].append(wordId)
else: else:
index[word_id] = i, score index[wordId] = documentId, score
unindex[i].append(word_id) unindex[documentId].append(wordId)
times = times + 1 wordCount = wordCount + 1
unindex[i] = tuple(unindex[i]) unindex[documentId] = tuple(unindex[documentId])
l = len(unindex[i])
self._index = index
self._unindex = unindex
## return the number of words you indexed ## return the number of words you indexed
return times return wordCount
def unindex_object(self, i): def unindex_object(self, i):
""" carefully unindex document with integer id 'i' from the text """ carefully unindex document with integer id 'i' from the text
...@@ -338,8 +311,6 @@ class UnTextIndex(Persistent, Implicit): ...@@ -338,8 +311,6 @@ class UnTextIndex(Persistent, Implicit):
'unindex_object tried to unindex nonexistent' 'unindex_object tried to unindex nonexistent'
' document %s' % str(i)) ' document %s' % str(i))
del unindex[i] del unindex[i]
self._index = index
self._unindex = unindex
def __getitem__(self, word): def __getitem__(self, word):
"""Return an InvertedIndex-style result "list" """Return an InvertedIndex-style result "list"
...@@ -378,10 +349,8 @@ class UnTextIndex(Persistent, Implicit): ...@@ -378,10 +349,8 @@ class UnTextIndex(Persistent, Implicit):
all data fields used. all data fields used.
""" """
id = self.id if request.has_key(self.id):
keys = request[self.id]
if request.has_key(id):
keys = request[id]
else: else:
return None return None
...@@ -410,26 +379,25 @@ class UnTextIndex(Persistent, Implicit): ...@@ -410,26 +379,25 @@ class UnTextIndex(Persistent, Implicit):
r = r.intersection(rr) r = r.intersection(rr)
if r is not None: if r is not None:
return r, (id,) return r, (self.id,)
return IIBucket(), (id,) return (IIBucket(), (self.id,))
def positions(self, docid, words, obj): def positions(self, docid, words, obj):
"""Return the positions in the document for the given document """Return the positions in the document for the given document
id of the word, word.""" id of the word, word."""
id = self.id
if self._schema is None: if self._schema is None:
f = getattr f = getattr
else: else:
f = operator.__getitem__ f = operator.__getitem__
id = self._schema[id] id = self._schema[self.id]
if self.call_methods: if self.call_methods:
doc = str(f(obj, id)()) doc = str(f(obj, self.id)())
else: else:
doc = str(f(obj, id)) doc = str(f(obj, self.id))
r = [] r = []
for word in words: for word in words:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment