added 3 new parameters for all zope splitters

17f69863 · Andreas Jung · fc443b19 · 17f69863 · 17f69863 · 17f69863
Commit 17f69863 authored Jan 09, 2002 by Andreas Jung
4 changed files
--- a/doc/CHANGES.txt
+++ b/doc/CHANGES.txt
@@ -27,6 +27,19 @@ Zope Changes
    Features Added
+      - TextIndex/Splitters: the constructors of all three splitters
+        now accept three new optional parameters:
+        'maxlen'=(1-128) - specifies the maximum length of
+                           split words
+        'singlechar'=(1|0) - allows single characters to be indexed
+        'indexnumbers'=(1|0) - allows numbers to be indexed
+        The default values of all parameters preserve the previous
+        behaviour.
      - Enhancements to utilities/requestprofiler.py:
        Added readstats and writestats features which allow for saves and

--- a/lib/python/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/src/ISO_8859_1_Splitter.c
+++ b/lib/python/Products/PluginIndexes/TextIndex/Splitter/ISO_8859_1_Splitter/src/ISO_8859_1_Splitter.c
@@ -32,6 +32,9 @@ typedef struct
    PyObject *text, *synstop;
    char *here, *end;
    int index;
+    int allow_single_chars;
+    int index_numbers;
+    int max_len;
 }
 Splitter;
@@ -117,6 +120,32 @@ Splitter_length(Splitter *self)
    return self->index+1;
 }
+/*
+ * split() -- tokenize the whole text in one run.
+ *
+ * Rewinds the splitter with Splitter_reset() and then calls next_word()
+ * repeatedly, appending every word to a fresh list until next_word()
+ * yields None.
+ *
+ * Returns a new reference to the list, or NULL if creating the list,
+ * fetching a word or appending to the list fails.  On failure the
+ * partially built list is released instead of being leaked.
+ */
+static PyObject *
+Splitter_split(Splitter *self)
+{
+    PyObject *list, *word;
+    UNLESS(list = PyList_New(0)) return NULL;
+    Splitter_reset(self);
+    for (;;) {
+        UNLESS(word = next_word(self, NULL, NULL)) goto err;
+        if (word == Py_None) {
+            /* The None end marker is an owned reference like any other
+               word returned by next_word(); drop it before returning. */
+            Py_DECREF(word);
+            return list;
+        }
+        /* PyList_Append takes its own reference, so ours is released
+           whether or not the append succeeds. */
+        if (PyList_Append(list, word) < 0) {
+            Py_DECREF(word);
+            goto err;
+        }
+        Py_DECREF(word);
+    }
+err:
+    Py_DECREF(list);
+    return NULL;
+}
 static PyObject *
 Splitter_concat(Splitter *self, PyObject *other)
 {
@@ -155,7 +184,7 @@ check_synstop(Splitter *self, PyObject *word)
    len = PyString_Size(word);
-    if(len < 2)	/* Single-letter words are stop words! */
+    if(len < 2 && ! self->allow_single_chars)	/* Single-letter words are stop words! */
    {
        Py_INCREF(Py_None);
        return Py_None;
@@ -167,7 +196,7 @@ check_synstop(Splitter *self, PyObject *word)
    for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
        ;
-    if (len < 0) {
+    if (len < 0 && ! self->index_numbers) {
        Py_INCREF(Py_None);
        return Py_None;
    }
@@ -197,12 +226,11 @@ check_synstop(Splitter *self, PyObject *word)
    return value;		/* Which must be None! */
 }
-#define MAX_WORD 64		/* Words longer than MAX_WORD are stemmed */
 static PyObject *
 next_word(Splitter *self, char **startpos, char **endpos)
 {
-    char wbuf[MAX_WORD];
+    char wbuf[256];
    char *end, *here, *b;
    int i = 0, c;
    PyObject *pyword, *res;
@@ -232,13 +260,13 @@ next_word(Splitter *self, char **startpos, char **endpos)
            if(startpos && i==0)
                *startpos=here;
-            if(i++ < MAX_WORD)
+            if(i++ < self->max_len)
                *b++ = c;
        } else if (i != 0) { /* We've found the end of a word */
-            if(i >= MAX_WORD)
+            if(i >= self->max_len)
-                i=MAX_WORD; /* "stem" the long word */
+                i=self->max_len; /* "stem" the long word */
            UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
                self->here=here;
@@ -282,8 +310,8 @@ next_word(Splitter *self, char **startpos, char **endpos)
    /* We've reached the end of the string */
-    if(i >= MAX_WORD)
+    if(i >= self->max_len)
-        i=MAX_WORD; /* "stem" the long word */
+        i=self->max_len; /* "stem" the long word */
    if (i == 0) {
        /* No words */
@@ -416,6 +444,9 @@ err:
 static struct PyMethodDef Splitter_methods[] =
    {
+        { "split", (PyCFunction)Splitter_split, 0,
+            "split() -- Split the string in one run"
+        },
        { "pos", (PyCFunction)Splitter_pos, 0,
            "pos(index) -- Return the starting and ending position of a token"
        },
@@ -459,7 +490,7 @@ static PyTypeObject SplitterType = {
                                       SplitterType__doc__ /* Documentation string */
                                   };
-static char *splitter_args[]={"doc","synstop","encoding",NULL};
+static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL}; /* keyword names, in the order consumed by the "O|Osiii" format in get_Splitter */
 static PyObject *
 get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
@@ -467,8 +498,29 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
    Splitter *self;
    PyObject *doc, *synstop = NULL;
    char * encoding="latin1";
+    int single_char = 0;
+    int index_numbers = 0;
+    int max_len=64;
+    UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
+    if (index_numbers<0 || index_numbers>1) {
+        PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
+        return NULL;
+    }
+    if (single_char<0 || single_char>1) {
+        PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
+        return NULL;
+    }
+    if (max_len<1 || max_len>128) {
+        PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
+        return NULL;
+    }
-    UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding)) return NULL;
    UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
@@ -484,6 +536,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
    UNLESS(self->here=PyString_AsString(self->text)) goto err;
    self->end = self->here + PyString_Size(self->text);
+    self->allow_single_chars    = single_char;
+    self->index_numbers         = index_numbers;
+    self->max_len               = max_len;
    self->index = -1;
@@ -498,7 +553,7 @@ err:
 static struct PyMethodDef Splitter_module_methods[] =
    {
        { "ISO_8859_1_Splitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
-            "ISO_8859_1_Splitter(doc[,synstop]) -- Return a word splitter"
+          "ISO_8859_1_Splitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
        },
        { NULL, NULL }
@@ -509,7 +564,7 @@ static char Splitter_module_documentation[] =
    "\n"
    "for use in an inverted index\n"
    "\n"
-    "$Id: ISO_8859_1_Splitter.c,v 1.5 2001/11/28 15:51:04 matt Exp $\n"
+    "$Id: ISO_8859_1_Splitter.c,v 1.6 2002/01/09 15:17:34 andreasjung Exp $\n"
    ;
@@ -518,7 +573,7 @@ void
 initISO_8859_1_Splitter(void)
 {
    PyObject *m, *d;
-    char *rev="$Revision: 1.5 $";
+    char *rev="$Revision: 1.6 $";
    /* Create the module and add the functions */
    initSplitterTrtabs();

--- a/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c
+++ b/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c
@@ -13,8 +13,6 @@
 #include "Python.h"
-#define MAX_WORD 64		/* Words longer than MAX_WORD are stemmed */
 #ifndef min
 #define min(a,b) ((a)<(b)?(a):(b))
 #endif
@@ -24,8 +22,12 @@ typedef struct
    PyObject_HEAD
    PyObject *list;
    PyObject *synstop;
+    int max_len;
+    int allow_single_chars;
+    int index_numbers;
 }
 Splitter;
 static
 PyUnicodeObject *prepareString(PyUnicodeObject *o);
@@ -34,6 +36,9 @@ static PyObject *checkSynword(Splitter *self, PyObject *word)
    /* Always returns a borrowed reference */
    PyObject *value;
+    if (PyUnicode_GetSize(word)==1 && ! self->allow_single_chars)
+        return Py_None;
    if (self->synstop) {
        value = PyDict_GetItem(self->synstop,word);
        if (value != NULL) {
@@ -82,6 +87,14 @@ Splitter_item(Splitter *self, int i)
  return item;
 }
+/*
+ * split() -- return the splitter's word list.
+ *
+ * Hands back self->list itself (not a copy) with its reference count
+ * incremented, so the caller owns one reference to the shared list.
+ */
+static PyObject *
+Splitter_split(Splitter *self)
+{
+    PyObject *words = self->list;
+    Py_INCREF(words);
+    return words;
+}
 static PyObject *
 Splitter_indexes(Splitter *self, PyObject *args)
@@ -133,6 +146,8 @@ Splitter_pos(Splitter *self, PyObject *args)
 static struct PyMethodDef Splitter_methods[] =
    {
+        { "split", (PyCFunction) Splitter_split, 0,
+          "split() -- Split string in one run" },
        { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
          "indexes(word) -- Return a list of the indexes of word in the sequence",
        },
@@ -198,22 +213,27 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
        register Py_UNICODE ch;
        ch = *s;
-#ifdef DEBUG
-        printf("%d %c %d\n",i,ch,ch);
-        fflush(stdout);
-#endif
        if (!inside_word) {
+            if (self->index_numbers) {
+                if (Py_UNICODE_ISALNUM(ch)) {
+                    inside_word=1;
+                    start = i;
+                }
+            } else {
                if (Py_UNICODE_ISALPHA(ch)) {
                    inside_word=1;
                    start = i;
                }
+            }
        } else {
            if (!(Py_UNICODE_ISALNUM(ch) || ch=='/' || ch=='_' || ch=='-')) {
                inside_word = 0;
                word = PySequence_GetSlice((PyObject *)doc1,start,
-                                           min(i, start + MAX_WORD));
+                                           min(i, start + self->max_len));
                if (word==NULL)
                  goto err;
@@ -234,7 +254,7 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
    if (inside_word) {
        word = PySequence_GetSlice((PyObject *)doc1,start,
-                                   min(len, start + MAX_WORD));
+                                   min(len, start + self->max_len));
        if (word==NULL)
          goto err;
@@ -288,7 +308,7 @@ PyUnicodeObject *prepareString(PyUnicodeObject *o)
    return  u;
 }
-static char *splitter_args[]={"doc","synstop","encoding",NULL};
+static char *splitter_args[]={"doc","synstop","encoding","indexnumbers","singlechar","maxlen",NULL}; /* NOTE(review): "indexnumbers" precedes "singlechar" here, whereas the ISO_8859_1 and Zope splitters list "singlechar" first -- positional callers see a different order */
 static PyObject *
@@ -297,9 +317,11 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
    Splitter *self=NULL;
    PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL;
    char *encoding = "latin1";
+    int index_numbers = 0;
+    int max_len=64;
+    int single_char = 0;
-    if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
+    if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len))) return NULL;
-    if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding))) return NULL;
 #ifdef DEBUG
    puts("got text");
@@ -307,6 +329,21 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
    fflush(stdout);
 #endif
+    if (index_numbers<0 || index_numbers>1) {
+        PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
+        return NULL;
+    }
+    if (single_char<0 || single_char>1) {
+        PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
+        return NULL;
+    }
+    if (max_len<1 || max_len>128) {
+        PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
+        return NULL;
+    }
    if (PyString_Check(doc)) {
        unicodedoc = PyUnicode_FromEncodedObject(doc,encoding,"strict");
@@ -324,11 +361,17 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
        return NULL;
    }
+    if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
    if (synstop) {
        self->synstop = synstop;
        Py_INCREF(synstop);
    } else  self->synstop=NULL;
+    self->index_numbers      = index_numbers;
+    self->max_len            = max_len;
+    self->allow_single_chars = single_char;
    if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
      goto err;
@@ -344,11 +387,6 @@ err:
 static struct PyMethodDef Splitter_module_methods[] =
    {
-        { "pos", (PyCFunction) Splitter_pos, 0,
-          "pos(index) -- Return the starting and ending position of a token" },
-        { "indexes", (PyCFunction) Splitter_indexes, METH_VARARGS,
-          "indexes(word) -- Return a list of the indexes of word in sequence" },
        { "UnicodeSplitter", (PyCFunction)newSplitter,
          METH_VARARGS|METH_KEYWORDS,
          "UnicodeSplitter(doc[,synstop][,encoding='latin1']) "
@@ -362,7 +400,7 @@ static char Splitter_module_documentation[] =
    "\n"
    "for use in an inverted index\n"
    "\n"
-    "$Id: UnicodeSplitter.c,v 1.12 2001/11/28 15:51:04 matt Exp $\n"
+    "$Id: UnicodeSplitter.c,v 1.13 2002/01/09 15:17:34 andreasjung Exp $\n"
    ;
@@ -370,7 +408,7 @@ void
 initUnicodeSplitter(void)
 {
    PyObject *m, *d;
-    char *rev="$Revision: 1.12 $";
+    char *rev="$Revision: 1.13 $";
    /* Create the module and add the functions */
    m = Py_InitModule4("UnicodeSplitter", Splitter_module_methods,

--- a/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c
+++ b/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c
@@ -10,6 +10,8 @@
  FOR A PARTICULAR PURPOSE
 ****************************************************************************/
 #include "Python.h"
 #include <ctype.h>
@@ -23,6 +25,9 @@ typedef struct
    PyObject *text, *synstop;
    char *here, *end;
    int index;
+    int allow_single_chars;
+    int index_numbers;
+    int max_len;
 }
 Splitter;
@@ -98,7 +103,7 @@ check_synstop(Splitter *self, PyObject *word)
    cword = PyString_AsString(word);
    len = PyString_Size(word);
-    if(len < 2)	/* Single-letter words are stop words! */
+    if(len < 2 && ! self->allow_single_chars)	/* Single-letter words are stop words! */
    {
        Py_INCREF(Py_None);
        return Py_None;
@@ -110,7 +115,7 @@ check_synstop(Splitter *self, PyObject *word)
    for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
        ;
-    if (len < 0) {
+    if (len < 0 && ! self->index_numbers) {
        Py_INCREF(Py_None);
        return Py_None;
    }
@@ -140,12 +145,11 @@ check_synstop(Splitter *self, PyObject *word)
    return value;		/* Which must be None! */
 }
-#define MAX_WORD 64		/* Words longer than MAX_WORD are stemmed */
 static PyObject *
 next_word(Splitter *self, char **startpos, char **endpos)
 {
-    char wbuf[MAX_WORD];
+    char wbuf[256];
    char *end, *here, *b;
    int i = 0, c;
    PyObject *pyword, *res;
@@ -175,13 +179,13 @@ next_word(Splitter *self, char **startpos, char **endpos)
            if(startpos && i==0)
                *startpos=here;
-            if(i++ < MAX_WORD)
+            if(i++ < self->max_len)
                *b++ = c;
        } else if (i != 0) { /* We've found the end of a word */
-            if(i >= MAX_WORD)
+            if(i >= self->max_len)
-                i=MAX_WORD; /* "stem" the long word */
+                i=self->max_len; /* "stem" the long word */
            UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
                self->here=here;
@@ -225,8 +229,8 @@ next_word(Splitter *self, char **startpos, char **endpos)
    /* We've reached the end of the string */
-    if(i >= MAX_WORD)
+    if(i >= self->max_len)
-        i=MAX_WORD; /* "stem" the long word */
+        i=self->max_len; /* "stem" the long word */
    if (i == 0) {
        /* No words */
@@ -274,6 +278,31 @@ Splitter_item(Splitter *self, int i)
    return word;
 }
+/*
+ * split() -- tokenize the whole text in one run.
+ *
+ * Rewinds the splitter with Splitter_reset() and then calls next_word()
+ * repeatedly, appending every word to a fresh list until next_word()
+ * yields None.
+ *
+ * Returns a new reference to the list, or NULL if creating the list,
+ * fetching a word or appending to the list fails.  On failure the
+ * partially built list is released instead of being leaked.
+ */
+static PyObject *
+Splitter_split(Splitter *self)
+{
+    PyObject *list, *word;
+    UNLESS(list = PyList_New(0)) return NULL;
+    Splitter_reset(self);
+    for (;;) {
+        UNLESS(word = next_word(self, NULL, NULL)) goto err;
+        if (word == Py_None) {
+            /* The None end marker is an owned reference like any other
+               word returned by next_word(); drop it before returning. */
+            Py_DECREF(word);
+            return list;
+        }
+        /* PyList_Append takes its own reference, so ours is released
+           whether or not the append succeeds. */
+        if (PyList_Append(list, word) < 0) {
+            Py_DECREF(word);
+            goto err;
+        }
+        Py_DECREF(word);
+    }
+err:
+    Py_DECREF(list);
+    return NULL;
+}
 static PyObject *
 Splitter_slice(Splitter *self, int i, int j)
 {
@@ -289,7 +318,7 @@ static PySequenceMethods Splitter_as_sequence = {
    (intintargfunc)Splitter_slice,   /*sq_slice*/
    (intobjargproc)0,                    /*sq_ass_item*/
    (intintobjargproc)0,                 /*sq_ass_slice*/
-        };
+};
 static PyObject *
 Splitter_pos(Splitter *self, PyObject *args)
@@ -359,6 +388,10 @@ err:
 static struct PyMethodDef Splitter_methods[] =
    {
+        { "split", (PyCFunction)Splitter_split, 0,
+            "split() -- Split complete string in one run"
+        },
        { "pos", (PyCFunction)Splitter_pos, 0,
          "pos(index) -- Return the starting and ending position of a token"
        },
@@ -400,9 +433,9 @@ static PyTypeObject SplitterType = {
    /* Space for future expansion */
    0L,0L,0L,0L,
    SplitterType__doc__ /* Documentation string */
-                                   };
+};
-static char *splitter_args[]={"doc","synstop","encoding",NULL};
+static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL}; /* keyword names, in the order consumed by the "O|Osiii" format in get_Splitter */
 static PyObject *
@@ -411,8 +444,28 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
    Splitter *self;
    PyObject *doc, *synstop = NULL;
    char *encoding = "latin1";
+    int single_char = 0;
+    int index_numbers = 0;
+    int max_len= 64;
+    UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args, \
+                                       &doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
-    UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args, &doc,&synstop,&encoding)) return NULL;
+    if (index_numbers<0 || index_numbers>1) {
+        PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
+        return NULL;
+    }
+    if (single_char<0 || single_char>1) {
+        PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
+        return NULL;
+    }
+    if (max_len<1 || max_len>128) {
+        PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
+        return NULL;
+    }
    UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
@@ -430,6 +483,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
    self->end = self->here + PyString_Size(self->text);
    self->index = -1;
+    self->allow_single_chars = single_char;
+    self->index_numbers      = index_numbers;
+    self->max_len            = max_len;
    return (PyObject*)self;
@@ -442,7 +498,7 @@ err:
 static struct PyMethodDef Splitter_module_methods[] =
    {
        { "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
-            "ZopeSplitter(doc[,synstop]) -- Return a word splitter"
+            "ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
        },
        { NULL, NULL }
@@ -453,7 +509,7 @@ static char Splitter_module_documentation[] =
    "\n"
    "for use in an inverted index\n"
    "\n"
-    "$Id: ZopeSplitter.c,v 1.5 2001/11/28 15:51:04 matt Exp $\n"
+    "$Id: ZopeSplitter.c,v 1.6 2002/01/09 15:17:34 andreasjung Exp $\n"
    ;
@@ -461,7 +517,7 @@ void
 initZopeSplitter(void)
 {
    PyObject *m, *d;
-    char *rev="$Revision: 1.5 $";
+    char *rev="$Revision: 1.6 $";
    /* Create the module and add the functions */
    m = Py_InitModule4("ZopeSplitter", Splitter_module_methods,