Commit 17f69863 authored by Andreas Jung

added 3 new parameters for all zope splitters

parent fc443b19
...@@ -27,6 +27,19 @@ Zope Changes ...@@ -27,6 +27,19 @@ Zope Changes
Features Added Features Added
- TextIndex/Splitters: the constructors of all three splitters
now accept three new optional parameters:
'maxlen'=(1-128) - specifies the maximum length of
split words (longer words are truncated)
'singlechar'=(1|0) - allows single characters to be indexed
'indexnumbers'=(1|0) - allows numbers to be indexed
The default values of all parameters preserve the previous
behaviour.
- Enhancements to utilites/requestprofiler.py: - Enhancements to utilites/requestprofiler.py:
Added readstats and writestats features which allow for saves and Added readstats and writestats features which allow for saves and
......
...@@ -32,6 +32,9 @@ typedef struct ...@@ -32,6 +32,9 @@ typedef struct
PyObject *text, *synstop; PyObject *text, *synstop;
char *here, *end; char *here, *end;
int index; int index;
int allow_single_chars;
int index_numbers;
int max_len;
} }
Splitter; Splitter;
...@@ -117,6 +120,32 @@ Splitter_length(Splitter *self) ...@@ -117,6 +120,32 @@ Splitter_length(Splitter *self)
return self->index+1; return self->index+1;
} }
/*
 * split() -- run the splitter over the whole text and collect the words.
 *
 * Resets the splitter, then calls next_word() repeatedly and appends each
 * result to a fresh list until next_word() hands back Py_None, which this
 * loop treats as the end-of-input marker.
 *
 * Returns a new reference to the list, or NULL if creating the list,
 * fetching a word, or appending to the list fails.
 *
 * Reference handling: every value returned by next_word() is treated as an
 * owned reference (the previous version of this loop already released each
 * word with Py_XDECREF), so the terminating Py_None is released as well and
 * the partially built list is dropped on every error path.
 */
static PyObject *
Splitter_split(Splitter*self)
{
    PyObject *list = NULL;
    PyObject *word = NULL;

    list = PyList_New(0);
    if (list == NULL)
        return NULL;

    Splitter_reset(self);

    for (;;) {
        word = next_word(self, NULL, NULL);
        if (word == NULL) {
            /* Do not leak the words collected so far. */
            Py_DECREF(list);
            return NULL;
        }

        if (word == Py_None) {
            /* End marker: give back our reference to None. */
            Py_DECREF(word);
            return list;
        }

        /* PyList_Append takes its own reference and can fail. */
        if (PyList_Append(list, word) < 0) {
            Py_DECREF(word);
            Py_DECREF(list);
            return NULL;
        }
        Py_DECREF(word);
    }
}
static PyObject * static PyObject *
Splitter_concat(Splitter *self, PyObject *other) Splitter_concat(Splitter *self, PyObject *other)
{ {
...@@ -155,7 +184,7 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -155,7 +184,7 @@ check_synstop(Splitter *self, PyObject *word)
len = PyString_Size(word); len = PyString_Size(word);
if(len < 2) /* Single-letter words are stop words! */ if(len < 2 && ! self->allow_single_chars) /* Single-letter words are stop words! */
{ {
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
...@@ -167,7 +196,7 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -167,7 +196,7 @@ check_synstop(Splitter *self, PyObject *word)
for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); ) for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
; ;
if (len < 0) { if (len < 0 && ! self->index_numbers) {
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
...@@ -197,12 +226,11 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -197,12 +226,11 @@ check_synstop(Splitter *self, PyObject *word)
return value; /* Which must be None! */ return value; /* Which must be None! */
} }
#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
static PyObject * static PyObject *
next_word(Splitter *self, char **startpos, char **endpos) next_word(Splitter *self, char **startpos, char **endpos)
{ {
char wbuf[MAX_WORD]; char wbuf[256];
char *end, *here, *b; char *end, *here, *b;
int i = 0, c; int i = 0, c;
PyObject *pyword, *res; PyObject *pyword, *res;
...@@ -232,13 +260,13 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -232,13 +260,13 @@ next_word(Splitter *self, char **startpos, char **endpos)
if(startpos && i==0) if(startpos && i==0)
*startpos=here; *startpos=here;
if(i++ < MAX_WORD) if(i++ < self->max_len)
*b++ = c; *b++ = c;
} else if (i != 0) { /* We've found the end of a word */ } else if (i != 0) { /* We've found the end of a word */
if(i >= MAX_WORD) if(i >= self->max_len)
i=MAX_WORD; /* "stem" the long word */ i=self->max_len; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) { UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
self->here=here; self->here=here;
...@@ -282,8 +310,8 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -282,8 +310,8 @@ next_word(Splitter *self, char **startpos, char **endpos)
/* We've reached the end of the string */ /* We've reached the end of the string */
if(i >= MAX_WORD) if(i >= self->max_len)
i=MAX_WORD; /* "stem" the long word */ i=self->max_len; /* "stem" the long word */
if (i == 0) { if (i == 0) {
/* No words */ /* No words */
...@@ -416,6 +444,9 @@ err: ...@@ -416,6 +444,9 @@ err:
static struct PyMethodDef Splitter_methods[] = static struct PyMethodDef Splitter_methods[] =
{ {
{ "split", (PyCFunction)Splitter_split, 0,
"split() -- Split the string in one run"
},
{ "pos", (PyCFunction)Splitter_pos, 0, { "pos", (PyCFunction)Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token" "pos(index) -- Return the starting and ending position of a token"
}, },
...@@ -459,7 +490,7 @@ static PyTypeObject SplitterType = { ...@@ -459,7 +490,7 @@ static PyTypeObject SplitterType = {
SplitterType__doc__ /* Documentation string */ SplitterType__doc__ /* Documentation string */
}; };
static char *splitter_args[]={"doc","synstop","encoding",NULL}; static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL};
static PyObject * static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds) get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
...@@ -467,8 +498,29 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -467,8 +498,29 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
Splitter *self; Splitter *self;
PyObject *doc, *synstop = NULL; PyObject *doc, *synstop = NULL;
char * encoding="latin1"; char * encoding="latin1";
int single_char = 0;
int index_numbers = 0;
int max_len=64;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
if (index_numbers<0 || index_numbers>1) {
PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL;
}
if (max_len<1 || max_len>128) {
PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
return NULL;
}
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding)) return NULL;
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL; UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
...@@ -484,6 +536,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -484,6 +536,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
UNLESS(self->here=PyString_AsString(self->text)) goto err; UNLESS(self->here=PyString_AsString(self->text)) goto err;
self->end = self->here + PyString_Size(self->text); self->end = self->here + PyString_Size(self->text);
self->allow_single_chars = single_char;
self->index_numbers = index_numbers;
self->max_len = max_len;
self->index = -1; self->index = -1;
...@@ -498,7 +553,7 @@ err: ...@@ -498,7 +553,7 @@ err:
static struct PyMethodDef Splitter_module_methods[] = static struct PyMethodDef Splitter_module_methods[] =
{ {
{ "ISO_8859_1_Splitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS, { "ISO_8859_1_Splitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ISO_8859_1_Splitter(doc[,synstop]) -- Return a word splitter" "ISO_8859_1_Splitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
}, },
{ NULL, NULL } { NULL, NULL }
...@@ -509,7 +564,7 @@ static char Splitter_module_documentation[] = ...@@ -509,7 +564,7 @@ static char Splitter_module_documentation[] =
"\n" "\n"
"for use in an inverted index\n" "for use in an inverted index\n"
"\n" "\n"
"$Id: ISO_8859_1_Splitter.c,v 1.5 2001/11/28 15:51:04 matt Exp $\n" "$Id: ISO_8859_1_Splitter.c,v 1.6 2002/01/09 15:17:34 andreasjung Exp $\n"
; ;
...@@ -518,7 +573,7 @@ void ...@@ -518,7 +573,7 @@ void
initISO_8859_1_Splitter(void) initISO_8859_1_Splitter(void)
{ {
PyObject *m, *d; PyObject *m, *d;
char *rev="$Revision: 1.5 $"; char *rev="$Revision: 1.6 $";
/* Create the module and add the functions */ /* Create the module and add the functions */
initSplitterTrtabs(); initSplitterTrtabs();
......
...@@ -13,8 +13,6 @@ ...@@ -13,8 +13,6 @@
#include "Python.h" #include "Python.h"
#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
#ifndef min #ifndef min
#define min(a,b) ((a)<(b)?(a):(b)) #define min(a,b) ((a)<(b)?(a):(b))
#endif #endif
...@@ -24,8 +22,12 @@ typedef struct ...@@ -24,8 +22,12 @@ typedef struct
PyObject_HEAD PyObject_HEAD
PyObject *list; PyObject *list;
PyObject *synstop; PyObject *synstop;
int max_len;
int allow_single_chars;
int index_numbers;
} }
Splitter; Splitter;
static static
PyUnicodeObject *prepareString(PyUnicodeObject *o); PyUnicodeObject *prepareString(PyUnicodeObject *o);
...@@ -34,6 +36,9 @@ static PyObject *checkSynword(Splitter *self, PyObject *word) ...@@ -34,6 +36,9 @@ static PyObject *checkSynword(Splitter *self, PyObject *word)
/* Always returns a borrowed reference */ /* Always returns a borrowed reference */
PyObject *value; PyObject *value;
if (PyUnicode_GetSize(word)==1 && ! self->allow_single_chars)
return Py_None;
if (self->synstop) { if (self->synstop) {
value = PyDict_GetItem(self->synstop,word); value = PyDict_GetItem(self->synstop,word);
if (value != NULL) { if (value != NULL) {
...@@ -82,6 +87,14 @@ Splitter_item(Splitter *self, int i) ...@@ -82,6 +87,14 @@ Splitter_item(Splitter *self, int i)
return item; return item;
} }
/*
 * split() -- hand back the word list stored on the splitter.
 *
 * Returns a new reference to self->list; no copy is made, so the caller
 * receives the same list object the splitter holds.
 */
static PyObject *
Splitter_split(Splitter *self)
{
    PyObject *words = self->list;

    Py_INCREF(words);
    return words;
}
static PyObject * static PyObject *
Splitter_indexes(Splitter *self, PyObject *args) Splitter_indexes(Splitter *self, PyObject *args)
...@@ -133,6 +146,8 @@ Splitter_pos(Splitter *self, PyObject *args) ...@@ -133,6 +146,8 @@ Splitter_pos(Splitter *self, PyObject *args)
static struct PyMethodDef Splitter_methods[] = static struct PyMethodDef Splitter_methods[] =
{ {
{ "split", (PyCFunction) Splitter_split, 0,
"split() -- Split string in one run" },
{ "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS, { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
"indexes(word) -- Return a list of the indexes of word in the sequence", "indexes(word) -- Return a list of the indexes of word in the sequence",
}, },
...@@ -198,22 +213,27 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc) ...@@ -198,22 +213,27 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
register Py_UNICODE ch; register Py_UNICODE ch;
ch = *s; ch = *s;
#ifdef DEBUG
printf("%d %c %d\n",i,ch,ch);
fflush(stdout);
#endif
if (!inside_word) { if (!inside_word) {
if (self->index_numbers) {
if (Py_UNICODE_ISALNUM(ch)) {
inside_word=1;
start = i;
}
} else {
if (Py_UNICODE_ISALPHA(ch)) { if (Py_UNICODE_ISALPHA(ch)) {
inside_word=1; inside_word=1;
start = i; start = i;
} }
}
} else { } else {
if (!(Py_UNICODE_ISALNUM(ch) || ch=='/' || ch=='_' || ch=='-')) { if (!(Py_UNICODE_ISALNUM(ch) || ch=='/' || ch=='_' || ch=='-')) {
inside_word = 0; inside_word = 0;
word = PySequence_GetSlice((PyObject *)doc1,start, word = PySequence_GetSlice((PyObject *)doc1,start,
min(i, start + MAX_WORD)); min(i, start + self->max_len));
if (word==NULL) if (word==NULL)
goto err; goto err;
...@@ -234,7 +254,7 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc) ...@@ -234,7 +254,7 @@ static int splitUnicodeString(Splitter *self,PyUnicodeObject *doc)
if (inside_word) { if (inside_word) {
word = PySequence_GetSlice((PyObject *)doc1,start, word = PySequence_GetSlice((PyObject *)doc1,start,
min(len, start + MAX_WORD)); min(len, start + self->max_len));
if (word==NULL) if (word==NULL)
goto err; goto err;
...@@ -288,7 +308,7 @@ PyUnicodeObject *prepareString(PyUnicodeObject *o) ...@@ -288,7 +308,7 @@ PyUnicodeObject *prepareString(PyUnicodeObject *o)
return u; return u;
} }
static char *splitter_args[]={"doc","synstop","encoding",NULL}; static char *splitter_args[]={"doc","synstop","encoding","indexnumbers","singlechar","maxlen",NULL};
static PyObject * static PyObject *
...@@ -297,9 +317,11 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -297,9 +317,11 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
Splitter *self=NULL; Splitter *self=NULL;
PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL; PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL;
char *encoding = "latin1"; char *encoding = "latin1";
int index_numbers = 0;
int max_len=64;
int single_char = 0;
if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL; if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args,&doc,&synstop,&encoding,&index_numbers,&single_char,&max_len))) return NULL;
if (! (PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args,&doc,&synstop,&encoding))) return NULL;
#ifdef DEBUG #ifdef DEBUG
puts("got text"); puts("got text");
...@@ -307,6 +329,21 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -307,6 +329,21 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
fflush(stdout); fflush(stdout);
#endif #endif
if (index_numbers<0 || index_numbers>1) {
PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL;
}
if (max_len<1 || max_len>128) {
PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
return NULL;
}
if (PyString_Check(doc)) { if (PyString_Check(doc)) {
unicodedoc = PyUnicode_FromEncodedObject(doc,encoding,"strict"); unicodedoc = PyUnicode_FromEncodedObject(doc,encoding,"strict");
...@@ -324,11 +361,17 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds) ...@@ -324,11 +361,17 @@ newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
return NULL; return NULL;
} }
if (! (self = PyObject_NEW(Splitter, &SplitterType))) return NULL;
if (synstop) { if (synstop) {
self->synstop = synstop; self->synstop = synstop;
Py_INCREF(synstop); Py_INCREF(synstop);
} else self->synstop=NULL; } else self->synstop=NULL;
self->index_numbers = index_numbers;
self->max_len = max_len;
self->allow_single_chars = single_char;
if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0) if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
goto err; goto err;
...@@ -344,11 +387,6 @@ err: ...@@ -344,11 +387,6 @@ err:
static struct PyMethodDef Splitter_module_methods[] = static struct PyMethodDef Splitter_module_methods[] =
{ {
{ "pos", (PyCFunction) Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token" },
{ "indexes", (PyCFunction) Splitter_indexes, METH_VARARGS,
"indexes(word) -- Return a list of the indexes of word in sequence" },
{ "UnicodeSplitter", (PyCFunction)newSplitter, { "UnicodeSplitter", (PyCFunction)newSplitter,
METH_VARARGS|METH_KEYWORDS, METH_VARARGS|METH_KEYWORDS,
"UnicodeSplitter(doc[,synstop][,encoding='latin1']) " "UnicodeSplitter(doc[,synstop][,encoding='latin1']) "
...@@ -362,7 +400,7 @@ static char Splitter_module_documentation[] = ...@@ -362,7 +400,7 @@ static char Splitter_module_documentation[] =
"\n" "\n"
"for use in an inverted index\n" "for use in an inverted index\n"
"\n" "\n"
"$Id: UnicodeSplitter.c,v 1.12 2001/11/28 15:51:04 matt Exp $\n" "$Id: UnicodeSplitter.c,v 1.13 2002/01/09 15:17:34 andreasjung Exp $\n"
; ;
...@@ -370,7 +408,7 @@ void ...@@ -370,7 +408,7 @@ void
initUnicodeSplitter(void) initUnicodeSplitter(void)
{ {
PyObject *m, *d; PyObject *m, *d;
char *rev="$Revision: 1.12 $"; char *rev="$Revision: 1.13 $";
/* Create the module and add the functions */ /* Create the module and add the functions */
m = Py_InitModule4("UnicodeSplitter", Splitter_module_methods, m = Py_InitModule4("UnicodeSplitter", Splitter_module_methods,
......
...@@ -10,6 +10,8 @@ ...@@ -10,6 +10,8 @@
FOR A PARTICULAR PURPOSE FOR A PARTICULAR PURPOSE
****************************************************************************/ ****************************************************************************/
#include "Python.h" #include "Python.h"
#include <ctype.h> #include <ctype.h>
...@@ -23,6 +25,9 @@ typedef struct ...@@ -23,6 +25,9 @@ typedef struct
PyObject *text, *synstop; PyObject *text, *synstop;
char *here, *end; char *here, *end;
int index; int index;
int allow_single_chars;
int index_numbers;
int max_len;
} }
Splitter; Splitter;
...@@ -98,7 +103,7 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -98,7 +103,7 @@ check_synstop(Splitter *self, PyObject *word)
cword = PyString_AsString(word); cword = PyString_AsString(word);
len = PyString_Size(word); len = PyString_Size(word);
if(len < 2) /* Single-letter words are stop words! */ if(len < 2 && ! self->allow_single_chars) /* Single-letter words are stop words! */
{ {
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
...@@ -110,7 +115,7 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -110,7 +115,7 @@ check_synstop(Splitter *self, PyObject *word)
for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); ) for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
; ;
if (len < 0) { if (len < 0 && ! self->index_numbers) {
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
...@@ -140,12 +145,11 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -140,12 +145,11 @@ check_synstop(Splitter *self, PyObject *word)
return value; /* Which must be None! */ return value; /* Which must be None! */
} }
#define MAX_WORD 64 /* Words longer than MAX_WORD are stemmed */
static PyObject * static PyObject *
next_word(Splitter *self, char **startpos, char **endpos) next_word(Splitter *self, char **startpos, char **endpos)
{ {
char wbuf[MAX_WORD]; char wbuf[256];
char *end, *here, *b; char *end, *here, *b;
int i = 0, c; int i = 0, c;
PyObject *pyword, *res; PyObject *pyword, *res;
...@@ -175,13 +179,13 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -175,13 +179,13 @@ next_word(Splitter *self, char **startpos, char **endpos)
if(startpos && i==0) if(startpos && i==0)
*startpos=here; *startpos=here;
if(i++ < MAX_WORD) if(i++ < self->max_len)
*b++ = c; *b++ = c;
} else if (i != 0) { /* We've found the end of a word */ } else if (i != 0) { /* We've found the end of a word */
if(i >= MAX_WORD) if(i >= self->max_len)
i=MAX_WORD; /* "stem" the long word */ i=self->max_len; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) { UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
self->here=here; self->here=here;
...@@ -225,8 +229,8 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -225,8 +229,8 @@ next_word(Splitter *self, char **startpos, char **endpos)
/* We've reached the end of the string */ /* We've reached the end of the string */
if(i >= MAX_WORD) if(i >= self->max_len)
i=MAX_WORD; /* "stem" the long word */ i=self->max_len; /* "stem" the long word */
if (i == 0) { if (i == 0) {
/* No words */ /* No words */
...@@ -274,6 +278,31 @@ Splitter_item(Splitter *self, int i) ...@@ -274,6 +278,31 @@ Splitter_item(Splitter *self, int i)
return word; return word;
} }
/*
 * split() -- run the splitter over the whole text and collect the words.
 *
 * Resets the splitter, then calls next_word() repeatedly and appends each
 * result to a fresh list until next_word() hands back Py_None, which this
 * loop treats as the end-of-input marker.
 *
 * Returns a new reference to the list, or NULL if creating the list,
 * fetching a word, or appending to the list fails.
 *
 * Reference handling: every value returned by next_word() is treated as an
 * owned reference (the previous version of this loop already released each
 * word with Py_XDECREF), so the terminating Py_None is released as well and
 * the partially built list is dropped on every error path.
 */
static PyObject *
Splitter_split(Splitter*self)
{
    PyObject *list = NULL;
    PyObject *word = NULL;

    list = PyList_New(0);
    if (list == NULL)
        return NULL;

    Splitter_reset(self);

    for (;;) {
        word = next_word(self, NULL, NULL);
        if (word == NULL) {
            /* Do not leak the words collected so far. */
            Py_DECREF(list);
            return NULL;
        }

        if (word == Py_None) {
            /* End marker: give back our reference to None. */
            Py_DECREF(word);
            return list;
        }

        /* PyList_Append takes its own reference and can fail. */
        if (PyList_Append(list, word) < 0) {
            Py_DECREF(word);
            Py_DECREF(list);
            return NULL;
        }
        Py_DECREF(word);
    }
}
static PyObject * static PyObject *
Splitter_slice(Splitter *self, int i, int j) Splitter_slice(Splitter *self, int i, int j)
{ {
...@@ -289,7 +318,7 @@ static PySequenceMethods Splitter_as_sequence = { ...@@ -289,7 +318,7 @@ static PySequenceMethods Splitter_as_sequence = {
(intintargfunc)Splitter_slice, /*sq_slice*/ (intintargfunc)Splitter_slice, /*sq_slice*/
(intobjargproc)0, /*sq_ass_item*/ (intobjargproc)0, /*sq_ass_item*/
(intintobjargproc)0, /*sq_ass_slice*/ (intintobjargproc)0, /*sq_ass_slice*/
}; };
static PyObject * static PyObject *
Splitter_pos(Splitter *self, PyObject *args) Splitter_pos(Splitter *self, PyObject *args)
...@@ -359,6 +388,10 @@ err: ...@@ -359,6 +388,10 @@ err:
static struct PyMethodDef Splitter_methods[] = static struct PyMethodDef Splitter_methods[] =
{ {
{ "split", (PyCFunction)Splitter_split, 0,
"split() -- Split complete string in one run"
},
{ "pos", (PyCFunction)Splitter_pos, 0, { "pos", (PyCFunction)Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token" "pos(index) -- Return the starting and ending position of a token"
}, },
...@@ -400,9 +433,9 @@ static PyTypeObject SplitterType = { ...@@ -400,9 +433,9 @@ static PyTypeObject SplitterType = {
/* Space for future expansion */ /* Space for future expansion */
0L,0L,0L,0L, 0L,0L,0L,0L,
SplitterType__doc__ /* Documentation string */ SplitterType__doc__ /* Documentation string */
}; };
static char *splitter_args[]={"doc","synstop","encoding",NULL}; static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxlen",NULL};
static PyObject * static PyObject *
...@@ -411,8 +444,28 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds) ...@@ -411,8 +444,28 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
Splitter *self; Splitter *self;
PyObject *doc, *synstop = NULL; PyObject *doc, *synstop = NULL;
char *encoding = "latin1"; char *encoding = "latin1";
int single_char = 0;
int index_numbers = 0;
int max_len= 64;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args, \
&doc,&synstop,&encoding,&single_char,&index_numbers,&max_len)) return NULL;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args, &doc,&synstop,&encoding)) return NULL; if (index_numbers<0 || index_numbers>1) {
PyErr_SetString(PyExc_ValueError,"indexnumbers must be 0 or 1");
return NULL;
}
if (single_char<0 || single_char>1) {
PyErr_SetString(PyExc_ValueError,"singlechar must be 0 or 1");
return NULL;
}
if (max_len<1 || max_len>128) {
PyErr_SetString(PyExc_ValueError,"maxlen must be between 1 and 128");
return NULL;
}
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL; UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
...@@ -430,6 +483,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds) ...@@ -430,6 +483,9 @@ get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
self->end = self->here + PyString_Size(self->text); self->end = self->here + PyString_Size(self->text);
self->index = -1; self->index = -1;
self->allow_single_chars = single_char;
self->index_numbers = index_numbers;
self->max_len = max_len;
return (PyObject*)self; return (PyObject*)self;
...@@ -442,7 +498,7 @@ err: ...@@ -442,7 +498,7 @@ err:
static struct PyMethodDef Splitter_module_methods[] = static struct PyMethodDef Splitter_module_methods[] =
{ {
{ "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS, { "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ZopeSplitter(doc[,synstop]) -- Return a word splitter" "ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers][,maxlen]) -- Return a word splitter"
}, },
{ NULL, NULL } { NULL, NULL }
...@@ -453,7 +509,7 @@ static char Splitter_module_documentation[] = ...@@ -453,7 +509,7 @@ static char Splitter_module_documentation[] =
"\n" "\n"
"for use in an inverted index\n" "for use in an inverted index\n"
"\n" "\n"
"$Id: ZopeSplitter.c,v 1.5 2001/11/28 15:51:04 matt Exp $\n" "$Id: ZopeSplitter.c,v 1.6 2002/01/09 15:17:34 andreasjung Exp $\n"
; ;
...@@ -461,7 +517,7 @@ void ...@@ -461,7 +517,7 @@ void
initZopeSplitter(void) initZopeSplitter(void)
{ {
PyObject *m, *d; PyObject *m, *d;
char *rev="$Revision: 1.5 $"; char *rev="$Revision: 1.6 $";
/* Create the module and add the functions */ /* Create the module and add the functions */
m = Py_InitModule4("ZopeSplitter", Splitter_module_methods, m = Py_InitModule4("ZopeSplitter", Splitter_module_methods,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment