Commit 5dd63b09 authored by Christopher Petrilli's avatar Christopher Petrilli

Merge of cleanup of the indexing code. This isn't fully complete yet,

but it does cover field and keyword indexes, with some minor fixups
to the Text indexing code.
parent 4b959522
......@@ -85,7 +85,7 @@
"""Simple column indices"""
__version__='$Revision: 1.18 $'[11:-2]
__version__='$Revision: 1.19 $'[11:-2]
from Globals import Persistent
......@@ -154,70 +154,81 @@ class UnIndex(Persistent, Implicit):
def __len__(self):
    # Number of documents currently indexed: _unindex maps
    # documentId -> indexed value, one entry per indexed document.
    return len(self._unindex)
def removeForwardIndexEntry(self, entry, documentId):
    """Take the entry provided and remove any reference to documentId
    in its entry in the index.

    Failures are logged rather than raised so that unindexing stays
    best-effort."""
    indexRow = self._index.get(entry, MV)
    if indexRow is not MV:
        try:
            indexRow.remove(documentId)
        except:
            # Fix: the log arguments previously interpolated 'i' and
            # 'k', which are not defined in this method and would
            # themselves raise NameError.
            LOG(self.__class__.__name__, ERROR,
                ('unindex_object could not remove '
                 'integer id %s from index %s. This '
                 'should not happen.'
                 % (str(documentId), str(entry))))
    else:
        # Fix: previously interpolated the builtin 'set' and the
        # undefined 'k'; report the entry and documentId instead.
        LOG(self.__class__.__name__, ERROR,
            ('unindex_object tried to retrieve set %s '
             'from index %s but couldn\'t. This '
             'should not happen.' % (repr(entry), str(documentId))))
def insertForwardIndexEntry(self, entry, documentId):
    """Take the entry provided and put it in the correct place
    in the forward index.

    This will also deal with creating the entire row if necessary."""
    # Removed dead local 'id = self.id' -- it was never used and
    # shadowed the builtin 'id'.
    indexRow = self._index.get(entry, MV)

    # Make sure there's actually a row there already. If not, create
    # an IntSet and stuff it in first.
    if indexRow is MV:
        self._index[entry] = intSet()
        indexRow = self._index[entry]
    indexRow.insert(documentId)
def index_object(self, documentId, obj, threshold=None):
    """Index the object 'obj' under integer id 'documentId'.

    Returns 1 if the index was changed, 0 otherwise."""
    returnStatus = 0

    # First we need to see if there's anything interesting to look at.
    # self.id is the name of the index, which is also the name of the
    # attribute we're interested in.  If the attribute is callable,
    # we'll do so.
    try:
        datum = getattr(obj, self.id)
        if callable(datum):
            datum = datum()
    except:
        datum = MV

    # We don't want to do anything that we don't have to here, so
    # check whether the new and existing information are the same.
    oldDatum = self._unindex.get(documentId, MV)
    if datum != oldDatum:
        if oldDatum is not MV:
            # Fix: drop the stale forward entry first, otherwise the
            # old value keeps pointing at this document forever.
            self.removeForwardIndexEntry(oldDatum, documentId)
        self.insertForwardIndexEntry(datum, documentId)
        self._unindex[documentId] = datum
        returnStatus = 1
        self._p_changed = 1     # Tickle the transaction

    return returnStatus
def unindex_object(self, documentId):
    """Unindex the object with integer id 'documentId' and don't
    raise an exception if we fail."""
    unindexRecord = self._unindex.get(documentId, None)
    if unindexRecord is None:
        return None

    self.removeForwardIndexEntry(unindexRecord, documentId)
    # Fix: previously 'del self._unindex[i]' -- 'i' is undefined
    # under the renamed signature and raised NameError.
    del self._unindex[documentId]
def _apply_index(self, request, cid=''):
"""Apply the index to query parameters given in the argument,
......@@ -252,9 +263,6 @@ class UnIndex(Persistent, Implicit):
if type(keys) not in (ListType, TupleType):
keys = [keys]
print "XXX,"
print keys
index = self._index
r = None
anyTrue = 0
......
......@@ -85,6 +85,7 @@
from UnIndex import UnIndex, MV, intSet
from zLOG import LOG, ERROR
from Missing import MV
from types import *
class UnKeywordIndex(UnIndex):
......@@ -98,7 +99,7 @@ class UnKeywordIndex(UnIndex):
This should have an _apply_index that returns a relevance score
"""
def index_object(self, documentId, obj, threshold=None):
    """Index an object 'obj' with integer id 'documentId'.

    Ideally we've been passed a sequence of keywords; a plain string
    is treated as a single keyword.  Returns 1 on success, 0 if the
    object could not be (re)indexed."""
    # self.id is the name of the index, which is also the name of the
    # attribute we're interested in.  If the attribute is callable,
    # we'll do so.
    try:
        newKeywords = getattr(obj, self.id)
        if callable(newKeywords):
            newKeywords = newKeywords()
    except:
        newKeywords = MV

    # If we've been handed a string, tuplize it into one keyword.
    # Fix: previously '(keywords, )' referenced an undefined name and
    # raised NameError.
    if type(newKeywords) is StringType:
        newKeywords = (newKeywords, )

    # Figure out what, if anything, changed from the previous record.
    oldKeywords = self._unindex.get(documentId, MV)
    if newKeywords is MV:
        self.unindex_object(documentId)
        return 0
    elif oldKeywords is MV:
        # Nothing indexed before; insert every keyword.  A TypeError
        # means we weren't handed a sequence after all.
        try:
            for kw in newKeywords:
                self.insertForwardIndexEntry(kw, documentId)
        except TypeError:
            return 0
    else:
        # We need the old keywords as a mapping so we can manipulate
        # them more easily.
        tmp = {}
        try:
            for kw in oldKeywords:
                tmp[kw] = None
            oldKeywords = tmp
            # Add new keywords that aren't already indexed; keywords
            # present in both are removed from the old set.
            for kw in newKeywords:
                if oldKeywords.has_key(kw):
                    del oldKeywords[kw]
                else:
                    self.insertForwardIndexEntry(kw, documentId)
            # Whatever is left in oldKeywords is no longer present
            # and must be removed from the index.
            for kw in oldKeywords.keys():
                self.removeForwardIndexEntry(kw, documentId)
        except TypeError:
            return 0

    self._unindex[documentId] = newKeywords
    return 1
def unindex_object(self, documentId):
    """Carefully unindex the object with integer id 'documentId'
    and do not fail if it does not exist."""
    keywords = self._unindex.get(documentId, MV)
    if keywords is MV:
        return None
    # Remove each keyword's forward entry, then drop the reverse
    # record.  (The old version referenced undefined locals 'unindex',
    # 'index' and 'i' -- merge residue removed here.)
    for kw in keywords:
        self.removeForwardIndexEntry(kw, documentId)
    del self._unindex[documentId]
......@@ -92,7 +92,7 @@ is no longer known.
"""
__version__='$Revision: 1.30 $'[11:-2]
__version__='$Revision: 1.31 $'[11:-2]
from Globals import Persistent
......@@ -172,18 +172,12 @@ class UnTextIndex(Persistent, Implicit):
pass
if lexicon is None:
## if no lexicon is provided, create a dumb one
## if no lexicon is provided, create a default one
self._lexicon=Lexicon()
else:
self._lexicon = lexicon
def __setstate(self, state):
    # NOTE(review): the name lacks trailing underscores, so this is
    # never invoked as the pickling hook __setstate__ -- confirm
    # whether disabling it was intentional.
    Persistent.__setstate__(self, state)
    # Drop the legacy '_syn' attribute from older pickles, if present.
    if hasattr(self, '_syn'):
        del self._syn
def getLexicon(self, vocab_id):
""" bit of a hack, indexes have been made acquirers so that
......@@ -201,123 +195,102 @@ class UnTextIndex(Persistent, Implicit):
def __len__(self):
    # Number of documents currently indexed (one _unindex entry per
    # indexed document).
    return len(self._unindex)
## def __setstate__(self, state):
## Persistent.__setstate__(self, state)
## if not hasattr(self, '_lexicon'):
## self._lexicon = Lexicon()
def clear(self):
    # Discard all indexed data by replacing both BTrees:
    # _index (word id -> postings) and _unindex (document id -> word ids).
    self._index = IOBTree()
    self._unindex = IOBTree()
def index_object(self, i, obj, threshold=None):
def index_object(self, documentId, obj, threshold=None):
""" Index an object:
'i' is the integer id of the document
'obj' is the objects to be indexed
'documentId' is the integer id of the document
'obj' is the objects to be indexed
'threshold' is the number of words to process between
commiting subtransactions. If 'None' subtransactions are
not used.
'threshold' is the number of words to process between
commiting subtransactions. If 'None' subtransactions are
disabled. """
the next four arguments are default optimizations.
"""
# Before we do anything, unindex the object we've been handed, as
# we can't depend on the user to do the right thing.
self.unindex_object(i)
id = self.id
# sniff the object for our 'id', the 'document source' of the
# index is this attribute. If it smells callable, call it.
try:
## sniff the object for our 'id', the 'document source' of
## the index is this attribute. If it smells callable,
## call it.
k = getattr(obj, id)
if callable(k):
k = str(k())
source = getattr(obj, self.id)
if callable(source):
source = str(source())
else:
k = str(k)
source = str(source)
except:
return 0
d = OIBTree()
old = d.has_key
last = None
## The Splitter should now be european compliant at least.
## Someone should test this.
sourceWords = self.getLexicon(self._lexicon).Splitter(source)
## import pdb
## pdb.set_trace()
src = self.getLexicon(self._lexicon).Splitter(k)
## This returns a tuple of stemmed words. Stopwords have been
## stripped.
wordList = OIBTree()
last = None
for s in src:
if s[0] == '\"': last=self.subindex(s[1:-1], d, old, last)
# Run through the words and score them
for word in sourceWords:
if word[0] == '\"':
last = self.subindex(word[1:-1], wordList,
wordList.has_key, last) # XXX
else:
if old(s):
if s != last: d[s] = d[s]+1
else: d[s] = 1
if wordList.has_key(word):
if word != last:
wordList[word] = wordList[word]+1
else:
wordList[word] = 1
index = self._index
unindex = self._unindex
lexicon = self.getLexicon(self._lexicon)
get = index.get
unindex[i] = []
times = 0
unindex[documentId] = [] # XXX this should be more intellegent
wordCount = 0
for word, score in d.items():
for word, score in wordList.items():
if threshold is not None:
if times > threshold:
if ((wordCount % threshold) == 0) and not (wordCount == 0):
# commit a subtransaction hack
get_transaction().commit(1)
# kick the cache
self._p_jar.cacheFullSweep(1)
times = 0
word_id = lexicon.set(word)
wordId = lexicon.set(word)
r = get(word_id)
if r is not None:
r = index[word_id]
if type(r) is TupleType:
r = {r[0]:r[1]}
r[i] = score
index[word_id] = r
unindex[i].append(word_id)
indexRow = index.get(wordId)
if indexRow is not None:
indexRow = index[wordId] # Duplicate?
if type(indexRow) is TupleType:
indexRow = {indexRow[0]:indexRow[1]}
indexRow[documentId] = score
index[wordId] = indexRow
unindex[documentId].append(wordId)
elif type(r) is DictType:
if len(r) > 4:
elif type(indexRow) is DictType:
if len(indexRow) > 4:
b = IIBucket()
for k, v in r.items(): b[k] = v
r = b
r[i] = score
for k, v in indexRow.items():
b[k] = v
indexRow = b
indexRow[documentId] = score
index[word_id] = r
unindex[i].append(word_id)
index[wordId] = indexRow
unindex[documentId].append(wordId)
else:
r[i] = score
unindex[i].append(word_id)
indexRow[documentId] = score
unindex[documentId].append(wordId)
else:
index[word_id] = i, score
unindex[i].append(word_id)
times = times + 1
index[wordId] = documentId, score
unindex[documentId].append(wordId)
wordCount = wordCount + 1
unindex[i] = tuple(unindex[i])
l = len(unindex[i])
unindex[documentId] = tuple(unindex[documentId])
self._index = index
self._unindex = unindex
## return the number of words you indexed
return times
return wordCount
def unindex_object(self, i):
""" carefully unindex document with integer id 'i' from the text
......@@ -338,8 +311,6 @@ class UnTextIndex(Persistent, Implicit):
'unindex_object tried to unindex nonexistent'
' document %s' % str(i))
del unindex[i]
self._index = index
self._unindex = unindex
def __getitem__(self, word):
"""Return an InvertedIndex-style result "list"
......@@ -378,10 +349,8 @@ class UnTextIndex(Persistent, Implicit):
all data fields used.
"""
id = self.id
if request.has_key(id):
keys = request[id]
if request.has_key(self.id):
keys = request[self.id]
else:
return None
......@@ -410,26 +379,25 @@ class UnTextIndex(Persistent, Implicit):
r = r.intersection(rr)
if r is not None:
return r, (id,)
return IIBucket(), (id,)
return r, (self.id,)
return (IIBucket(), (self.id,))
def positions(self, docid, words, obj):
"""Return the positions in the document for the given document
id of the word, word."""
id = self.id
if self._schema is None:
f = getattr
else:
f = operator.__getitem__
id = self._schema[id]
id = self._schema[self.id]
if self.call_methods:
doc = str(f(obj, id)())
doc = str(f(obj, self.id)())
else:
doc = str(f(obj, id))
doc = str(f(obj, self.id))
r = []
for word in words:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment