Commit 6e60995a authored by Andreas Jung

- Unified indentation.

- Added an encoding parameter so that pre-2.5 Data.fs installations keep working
  now that the Splitter API takes an encoding parameter.
parent 0709760d
...@@ -95,7 +95,9 @@ typedef struct ...@@ -95,7 +95,9 @@ typedef struct
PyObject *text, *synstop; PyObject *text, *synstop;
char *here, *end; char *here, *end;
int index; int index;
} Splitter; }
Splitter;
static PyObject *next_word(Splitter *, char **, char **); static PyObject *next_word(Splitter *, char **, char **);
...@@ -120,15 +122,15 @@ Splitter_length(Splitter *self) ...@@ -120,15 +122,15 @@ Splitter_length(Splitter *self)
PyObject *res=0; PyObject *res=0;
Splitter_reset(self); Splitter_reset(self);
while(1)
{ while(1) {
UNLESS_ASSIGN(res,next_word(self,NULL,NULL)) return -1; UNLESS_ASSIGN(res,next_word(self,NULL,NULL)) return -1;
UNLESS(PyString_Check(res)) UNLESS(PyString_Check(res)) {
{
Py_DECREF(res); Py_DECREF(res);
break; break;
} }
} }
return self->index+1; return self->index+1;
} }
...@@ -167,6 +169,7 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -167,6 +169,7 @@ check_synstop(Splitter *self, PyObject *word)
cword = PyString_AsString(word); cword = PyString_AsString(word);
len = PyString_Size(word); len = PyString_Size(word);
if(len < 2) /* Single-letter words are stop words! */ if(len < 2) /* Single-letter words are stop words! */
{ {
Py_INCREF(Py_None); Py_INCREF(Py_None);
...@@ -176,29 +179,32 @@ check_synstop(Splitter *self, PyObject *word) ...@@ -176,29 +179,32 @@ check_synstop(Splitter *self, PyObject *word)
/************************************************************* /*************************************************************
Test whether a word has any letters. * Test whether a word has any letters. *
*/ */
for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); ); for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
if (len < 0)
{ ;
if (len < 0) {
Py_INCREF(Py_None); Py_INCREF(Py_None);
return Py_None; return Py_None;
} }
/* /*
* If no letters, treat it as a stop word. * If no letters, treat it as a stop word.
*************************************************************/ *************************************************************/
Py_INCREF(word); Py_INCREF(word);
if (self->synstop == NULL) return word; if (self->synstop == NULL)
return word;
while ((value = PyObject_GetItem(self->synstop, word)) && while ((value = PyObject_GetItem(self->synstop, word)) &&
PyString_Check(value)) PyString_Check(value)) {
{
ASSIGN(word,value); ASSIGN(word,value);
if(len++ > 100) break; /* Avoid infinite recursion */
if(len++ > 100)
break; /* Avoid infinite recursion */
} }
if (value == NULL) if (value == NULL) {
{
PyErr_Clear(); PyErr_Clear();
return word; return word;
} }
...@@ -219,55 +225,68 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -219,55 +225,68 @@ next_word(Splitter *self, char **startpos, char **endpos)
here=self->here; here=self->here;
end=self->end; end=self->end;
b=wbuf; b=wbuf;
while (here < end)
{ while (here < end) {
/* skip hyphens */ /* skip hyphens */
if ((i > 0) && (*here == '-'))
{ if ((i > 0) && (*here == '-')) {
here++;
while (isspace((unsigned char) *here) && (here < end))
here++; here++;
while (isspace((unsigned char) *here) && (here < end)) here++;
continue; continue;
} }
c=tolower((unsigned char) *here); c=tolower((unsigned char) *here);
/* Check to see if this character is part of a word */ /* Check to see if this character is part of a word */
if(isalnum((unsigned char)c) || c=='/' || c=='_')
{ /* Found a word character */
if(startpos && i==0) *startpos=here;
if(i++ < MAX_WORD) *b++ = c;
}
else if (i != 0)
{ /* We've found the end of a word */
if(i >= MAX_WORD) i=MAX_WORD; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) if(isalnum((unsigned char)c) || c=='/' || c=='_') { /* Found a word character */
{
if(startpos && i==0)
*startpos=here;
if(i++ < MAX_WORD)
*b++ = c;
} else if (i != 0) { /* We've found the end of a word */
if(i >= MAX_WORD)
i=MAX_WORD; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
self->here=here; self->here=here;
return NULL; return NULL;
} }
UNLESS(res = check_synstop(self, pyword)) UNLESS(res = check_synstop(self, pyword)) {
{
self->here=here; self->here=here;
Py_DECREF(pyword); Py_DECREF(pyword);
return NULL; return NULL;
} }
if (res != Py_None) if (res != Py_None) {
{ if(endpos)
if(endpos) *endpos=here; *endpos=here;
self->here=here; self->here=here;
Py_DECREF(pyword); Py_DECREF(pyword);
self->index++; self->index++;
return res; return res;
} }
/* The word is a stopword, so ignore it */ /* The word is a stopword, so ignore it */
Py_DECREF(res); Py_DECREF(res);
Py_DECREF(pyword); Py_DECREF(pyword);
i = 0; i = 0;
b=wbuf; b=wbuf;
} }
...@@ -278,9 +297,10 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -278,9 +297,10 @@ next_word(Splitter *self, char **startpos, char **endpos)
/* We've reached the end of the string */ /* We've reached the end of the string */
if(i >= MAX_WORD) i=MAX_WORD; /* "stem" the long word */ if(i >= MAX_WORD)
if (i == 0) i=MAX_WORD; /* "stem" the long word */
{
if (i == 0) {
/* No words */ /* No words */
self->here=here; self->here=here;
Py_INCREF(Py_None); Py_INCREF(Py_None);
...@@ -289,10 +309,16 @@ next_word(Splitter *self, char **startpos, char **endpos) ...@@ -289,10 +309,16 @@ next_word(Splitter *self, char **startpos, char **endpos)
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) return NULL; UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) return NULL;
if(endpos) *endpos=here; if(endpos)
*endpos=here;
res = check_synstop(self, pyword); res = check_synstop(self, pyword);
Py_DECREF(pyword); Py_DECREF(pyword);
if(PyString_Check(res)) self->index++;
if(PyString_Check(res))
self->index++;
return res; return res;
} }
...@@ -301,15 +327,15 @@ Splitter_item(Splitter *self, int i) ...@@ -301,15 +327,15 @@ Splitter_item(Splitter *self, int i)
{ {
PyObject *word = NULL; PyObject *word = NULL;
if (i <= self->index) Splitter_reset(self); if (i <= self->index)
Splitter_reset(self);
while(self->index < i) while(self->index < i) {
{
Py_XDECREF(word); Py_XDECREF(word);
UNLESS(word = next_word(self,NULL,NULL)) return NULL; UNLESS(word = next_word(self,NULL,NULL)) return NULL;
if (word == Py_None)
{ if (word == Py_None) {
Py_DECREF(word); Py_DECREF(word);
PyErr_SetString(PyExc_IndexError, PyErr_SetString(PyExc_IndexError,
"Splitter index out of range"); "Splitter index out of range");
...@@ -335,7 +361,7 @@ static PySequenceMethods Splitter_as_sequence = { ...@@ -335,7 +361,7 @@ static PySequenceMethods Splitter_as_sequence = {
(intintargfunc)Splitter_slice, /*sq_slice*/ (intintargfunc)Splitter_slice, /*sq_slice*/
(intobjargproc)0, /*sq_ass_item*/ (intobjargproc)0, /*sq_ass_item*/
(intintobjargproc)0, /*sq_ass_slice*/ (intintobjargproc)0, /*sq_ass_slice*/
}; };
static PyObject * static PyObject *
Splitter_pos(Splitter *self, PyObject *args) Splitter_pos(Splitter *self, PyObject *args)
...@@ -346,17 +372,18 @@ Splitter_pos(Splitter *self, PyObject *args) ...@@ -346,17 +372,18 @@ Splitter_pos(Splitter *self, PyObject *args)
UNLESS(PyArg_Parse(args, "i", &i)) return NULL; UNLESS(PyArg_Parse(args, "i", &i)) return NULL;
if (i <= self->index) Splitter_reset(self); if (i <= self->index)
Splitter_reset(self);
while(self->index < i) while(self->index < i) {
{
UNLESS(res=next_word(self, &start, &end)) return NULL; UNLESS(res=next_word(self, &start, &end)) return NULL;
if(PyString_Check(res))
{ if(PyString_Check(res)) {
self->index++; self->index++;
Py_DECREF(res); Py_DECREF(res);
continue; continue;
} }
Py_DECREF(res); Py_DECREF(res);
PyErr_SetString(PyExc_IndexError, "Splitter index out of range"); PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
return NULL; return NULL;
...@@ -377,17 +404,21 @@ Splitter_indexes(Splitter *self, PyObject *args) ...@@ -377,17 +404,21 @@ Splitter_indexes(Splitter *self, PyObject *args)
UNLESS(word=check_synstop(self, word)) goto err; UNLESS(word=check_synstop(self, word)) goto err;
Splitter_reset(self); Splitter_reset(self);
while(1)
{ while(1) {
UNLESS_ASSIGN(w,next_word(self, NULL, NULL)) goto err; UNLESS_ASSIGN(w,next_word(self, NULL, NULL)) goto err;
UNLESS(PyString_Check(w)) break; UNLESS(PyString_Check(w)) break;
if(PyObject_Compare(word,w)==0)
{ if(PyObject_Compare(word,w)==0) {
UNLESS_ASSIGN(index,PyInt_FromLong(i)) goto err; UNLESS_ASSIGN(index,PyInt_FromLong(i)) goto err;
if(PyList_Append(r,index) < 0) goto err;
if(PyList_Append(r,index) < 0)
goto err;
} }
i++; i++;
} }
Py_XDECREF(w); Py_XDECREF(w);
Py_XDECREF(index); Py_XDECREF(index);
return r; return r;
...@@ -398,14 +429,17 @@ err: ...@@ -398,14 +429,17 @@ err:
return NULL; return NULL;
} }
static struct PyMethodDef Splitter_methods[] = { static struct PyMethodDef Splitter_methods[] =
{
{ "pos", (PyCFunction)Splitter_pos, 0, { "pos", (PyCFunction)Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token" }, "pos(index) -- Return the starting and ending position of a token"
},
{ "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS, { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
"indexes(word) -- Return a list of the indexes of word in the sequence", "indexes(word) -- Return a list of the indexes of word in the sequence",
}, },
{ NULL, NULL } /* sentinel */ { NULL, NULL } /* sentinel */
}; };
static PyObject * static PyObject *
Splitter_getattr(Splitter *self, char *name) Splitter_getattr(Splitter *self, char *name)
...@@ -438,55 +472,68 @@ static PyTypeObject SplitterType = { ...@@ -438,55 +472,68 @@ static PyTypeObject SplitterType = {
/* Space for future expansion */ /* Space for future expansion */
0L,0L,0L,0L, 0L,0L,0L,0L,
SplitterType__doc__ /* Documentation string */ SplitterType__doc__ /* Documentation string */
}; };
static char *splitter_args[]={"doc","synstop","encoding",NULL};
static PyObject * static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args) get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
{ {
Splitter *self; Splitter *self;
PyObject *doc, *synstop = NULL; PyObject *doc, *synstop = NULL;
char *encoding = "latin1";
UNLESS(PyArg_ParseTuple(args,"O|O",&doc,&synstop)) return NULL; UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args, &doc,&synstop,&encoding)) return NULL;
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL; UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
if(synstop) if(synstop) {
{
self->synstop=synstop; self->synstop=synstop;
Py_INCREF(synstop); Py_INCREF(synstop);
}
else self->synstop=NULL; } else
self->synstop=NULL;
UNLESS(self->text = PyObject_Str(doc)) goto err; UNLESS(self->text = PyObject_Str(doc)) goto err;
UNLESS(self->here=PyString_AsString(self->text)) goto err; UNLESS(self->here=PyString_AsString(self->text)) goto err;
self->end = self->here + PyString_Size(self->text); self->end = self->here + PyString_Size(self->text);
self->index = -1; self->index = -1;
return (PyObject*)self; return (PyObject*)self;
err: err:
Py_DECREF(self); Py_DECREF(self);
return NULL; return NULL;
} }
static struct PyMethodDef Splitter_module_methods[] = { static struct PyMethodDef Splitter_module_methods[] =
{ "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS, {
"ZopeSplitter(doc[,synstop]) -- Return a word splitter" }, { "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ZopeSplitter(doc[,synstop]) -- Return a word splitter"
},
{ NULL, NULL } { NULL, NULL }
}; };
static char Splitter_module_documentation[] = static char Splitter_module_documentation[] =
"Parse source strings into sequences of words\n" "Parse source strings into sequences of words\n"
"\n" "\n"
"for use in an inverted index\n" "for use in an inverted index\n"
"\n" "\n"
"$Id: ZopeSplitter.c,v 1.3 2001/10/29 17:42:51 andreasjung Exp $\n" "$Id: ZopeSplitter.c,v 1.4 2001/11/14 13:45:49 andreasjung Exp $\n"
; ;
void void
initZopeSplitter(void) initZopeSplitter(void)
{ {
PyObject *m, *d; PyObject *m, *d;
char *rev="$Revision: 1.3 $"; char *rev="$Revision: 1.4 $";
/* Create the module and add the functions */ /* Create the module and add the functions */
m = Py_InitModule4("ZopeSplitter", Splitter_module_methods, m = Py_InitModule4("ZopeSplitter", Splitter_module_methods,
...@@ -498,5 +545,6 @@ initZopeSplitter(void) ...@@ -498,5 +545,6 @@ initZopeSplitter(void)
PyDict_SetItemString(d, "__version__", PyDict_SetItemString(d, "__version__",
PyString_FromStringAndSize(rev+11,strlen(rev+11)-2)); PyString_FromStringAndSize(rev+11,strlen(rev+11)-2));
if (PyErr_Occurred()) Py_FatalError("can't initialize module Splitter"); if (PyErr_Occurred())
Py_FatalError("can't initialize module Splitter");
} }
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment