Commit 6e60995a authored by Andreas Jung

- unified indentations

- added encoding parameter to make pre-2.5 Data.fs installations happy because
  of adding the encoding parameter to the Splitter API
parent 0709760d
......@@ -95,7 +95,9 @@ typedef struct
PyObject *text, *synstop;
char *here, *end;
int index;
} Splitter;
}
Splitter;
static PyObject *next_word(Splitter *, char **, char **);
......@@ -120,15 +122,15 @@ Splitter_length(Splitter *self)
PyObject *res=0;
Splitter_reset(self);
while(1)
{
while(1) {
UNLESS_ASSIGN(res,next_word(self,NULL,NULL)) return -1;
UNLESS(PyString_Check(res))
{
UNLESS(PyString_Check(res)) {
Py_DECREF(res);
break;
}
}
return self->index+1;
}
......@@ -167,6 +169,7 @@ check_synstop(Splitter *self, PyObject *word)
cword = PyString_AsString(word);
len = PyString_Size(word);
if(len < 2) /* Single-letter words are stop words! */
{
Py_INCREF(Py_None);
......@@ -176,29 +179,32 @@ check_synstop(Splitter *self, PyObject *word)
/*************************************************************
Test whether a word has any letters. *
*/
for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); );
if (len < 0)
{
for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
;
if (len < 0) {
Py_INCREF(Py_None);
return Py_None;
}
/*
* If no letters, treat it as a stop word.
*************************************************************/
Py_INCREF(word);
if (self->synstop == NULL) return word;
if (self->synstop == NULL)
return word;
while ((value = PyObject_GetItem(self->synstop, word)) &&
PyString_Check(value))
{
PyString_Check(value)) {
ASSIGN(word,value);
if(len++ > 100) break; /* Avoid infinite recursion */
if(len++ > 100)
break; /* Avoid infinite recursion */
}
if (value == NULL)
{
if (value == NULL) {
PyErr_Clear();
return word;
}
......@@ -219,55 +225,68 @@ next_word(Splitter *self, char **startpos, char **endpos)
here=self->here;
end=self->end;
b=wbuf;
while (here < end)
{
while (here < end) {
/* skip hyphens */
if ((i > 0) && (*here == '-'))
{
if ((i > 0) && (*here == '-')) {
here++;
while (isspace((unsigned char) *here) && (here < end))
here++;
while (isspace((unsigned char) *here) && (here < end)) here++;
continue;
}
c=tolower((unsigned char) *here);
/* Check to see if this character is part of a word */
if(isalnum((unsigned char)c) || c=='/' || c=='_')
{ /* Found a word character */
if(startpos && i==0) *startpos=here;
if(i++ < MAX_WORD) *b++ = c;
}
else if (i != 0)
{ /* We've found the end of a word */
if(i >= MAX_WORD) i=MAX_WORD; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i))
{
if(isalnum((unsigned char)c) || c=='/' || c=='_') { /* Found a word character */
if(startpos && i==0)
*startpos=here;
if(i++ < MAX_WORD)
*b++ = c;
} else if (i != 0) { /* We've found the end of a word */
if(i >= MAX_WORD)
i=MAX_WORD; /* "stem" the long word */
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
self->here=here;
return NULL;
}
UNLESS(res = check_synstop(self, pyword))
{
UNLESS(res = check_synstop(self, pyword)) {
self->here=here;
Py_DECREF(pyword);
return NULL;
}
if (res != Py_None)
{
if(endpos) *endpos=here;
if (res != Py_None) {
if(endpos)
*endpos=here;
self->here=here;
Py_DECREF(pyword);
self->index++;
return res;
}
/* The word is a stopword, so ignore it */
Py_DECREF(res);
Py_DECREF(pyword);
i = 0;
b=wbuf;
}
......@@ -278,9 +297,10 @@ next_word(Splitter *self, char **startpos, char **endpos)
/* We've reached the end of the string */
if(i >= MAX_WORD) i=MAX_WORD; /* "stem" the long word */
if (i == 0)
{
if(i >= MAX_WORD)
i=MAX_WORD; /* "stem" the long word */
if (i == 0) {
/* No words */
self->here=here;
Py_INCREF(Py_None);
......@@ -289,10 +309,16 @@ next_word(Splitter *self, char **startpos, char **endpos)
UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) return NULL;
if(endpos) *endpos=here;
if(endpos)
*endpos=here;
res = check_synstop(self, pyword);
Py_DECREF(pyword);
if(PyString_Check(res)) self->index++;
if(PyString_Check(res))
self->index++;
return res;
}
......@@ -301,15 +327,15 @@ Splitter_item(Splitter *self, int i)
{
PyObject *word = NULL;
if (i <= self->index) Splitter_reset(self);
if (i <= self->index)
Splitter_reset(self);
while(self->index < i)
{
while(self->index < i) {
Py_XDECREF(word);
UNLESS(word = next_word(self,NULL,NULL)) return NULL;
if (word == Py_None)
{
if (word == Py_None) {
Py_DECREF(word);
PyErr_SetString(PyExc_IndexError,
"Splitter index out of range");
......@@ -335,7 +361,7 @@ static PySequenceMethods Splitter_as_sequence = {
(intintargfunc)Splitter_slice, /*sq_slice*/
(intobjargproc)0, /*sq_ass_item*/
(intintobjargproc)0, /*sq_ass_slice*/
};
};
static PyObject *
Splitter_pos(Splitter *self, PyObject *args)
......@@ -346,17 +372,18 @@ Splitter_pos(Splitter *self, PyObject *args)
UNLESS(PyArg_Parse(args, "i", &i)) return NULL;
if (i <= self->index) Splitter_reset(self);
if (i <= self->index)
Splitter_reset(self);
while(self->index < i)
{
while(self->index < i) {
UNLESS(res=next_word(self, &start, &end)) return NULL;
if(PyString_Check(res))
{
if(PyString_Check(res)) {
self->index++;
Py_DECREF(res);
continue;
}
Py_DECREF(res);
PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
return NULL;
......@@ -377,17 +404,21 @@ Splitter_indexes(Splitter *self, PyObject *args)
UNLESS(word=check_synstop(self, word)) goto err;
Splitter_reset(self);
while(1)
{
while(1) {
UNLESS_ASSIGN(w,next_word(self, NULL, NULL)) goto err;
UNLESS(PyString_Check(w)) break;
if(PyObject_Compare(word,w)==0)
{
if(PyObject_Compare(word,w)==0) {
UNLESS_ASSIGN(index,PyInt_FromLong(i)) goto err;
if(PyList_Append(r,index) < 0) goto err;
if(PyList_Append(r,index) < 0)
goto err;
}
i++;
}
Py_XDECREF(w);
Py_XDECREF(index);
return r;
......@@ -398,14 +429,17 @@ err:
return NULL;
}
/*
 * Method table for Splitter objects: "pos" and "indexes", terminated by a
 * NULL sentinel entry.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so the
 * pre-change and post-change spellings of several lines appear back to back
 * (two declaration lines, two "pos" docstrings, two closing braces).
 */
static struct PyMethodDef Splitter_methods[] = {
static struct PyMethodDef Splitter_methods[] =
{
/* "pos" is registered with flags 0; Splitter_pos reads its argument with
   PyArg_Parse(args, "i", ...) rather than PyArg_ParseTuple. */
{ "pos", (PyCFunction)Splitter_pos, 0,
"pos(index) -- Return the starting and ending position of a token" },
"pos(index) -- Return the starting and ending position of a token"
},
/* "indexes" is registered with METH_VARARGS. */
{ "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
"indexes(word) -- Return a list of the indexes of word in the sequence",
},
{ NULL, NULL } /* sentinel */
};
};
static PyObject *
Splitter_getattr(Splitter *self, char *name)
......@@ -438,55 +472,68 @@ static PyTypeObject SplitterType = {
/* Space for future expansion */
0L,0L,0L,0L,
SplitterType__doc__ /* Documentation string */
};
};
/* Keyword names accepted by get_Splitter, in positional order. */
static char *splitter_args[]={"doc","synstop","encoding",NULL};
/*
 * get_Splitter -- constructor registered as "ZopeSplitter" in the module
 * method table.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers, so both
 * the pre-change and post-change variants of some lines are present (two
 * signatures, two argument-parsing calls, two forms of the synstop branch).
 *
 * Post-change behaviour as visible here:
 *   - parses doc (required), synstop (optional) and encoding (optional
 *     string, default "latin1") with format "O|Os";
 *   - encoding is parsed but not referenced again in this body;
 *   - allocates a Splitter, stores a new reference to synstop (or NULL),
 *     sets text to PyObject_Str(doc), points here/end at the start and
 *     one-past-the-end of that string's buffer, and sets index to -1.
 *
 * Returns the new Splitter, or NULL on failure.  Failures after allocation
 * go through the err label, which drops the reference to self.
 */
static PyObject *
get_Splitter(PyObject *modinfo, PyObject *args)
get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
{
Splitter *self;
PyObject *doc, *synstop = NULL;
char *encoding = "latin1";
UNLESS(PyArg_ParseTuple(args,"O|O",&doc,&synstop)) return NULL;
UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args, &doc,&synstop,&encoding)) return NULL;
UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
/* Keep our own reference to the synonym/stop-word mapping, if one was given. */
if(synstop)
{
if(synstop) {
self->synstop=synstop;
Py_INCREF(synstop);
}
else self->synstop=NULL;
} else
self->synstop=NULL;
/* The splitter works on the str() of doc; here/end delimit its buffer. */
UNLESS(self->text = PyObject_Str(doc)) goto err;
UNLESS(self->here=PyString_AsString(self->text)) goto err;
self->end = self->here + PyString_Size(self->text);
/* index starts at -1; next_word increments it for each word it returns. */
self->index = -1;
return (PyObject*)self;
err:
Py_DECREF(self);
return NULL;
}
/*
 * Module-level function table: exposes get_Splitter as "ZopeSplitter",
 * terminated by a NULL sentinel entry.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers; the
 * first three lines are the pre-change entry (METH_VARARGS only) and the
 * lines after them are the post-change entry (METH_VARARGS|METH_KEYWORDS).
 *
 * NOTE(review): the help text still reads "ZopeSplitter(doc[,synstop])"
 * although get_Splitter also parses an optional "encoding" keyword.
 */
static struct PyMethodDef Splitter_module_methods[] = {
{ "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS,
"ZopeSplitter(doc[,synstop]) -- Return a word splitter" },
static struct PyMethodDef Splitter_module_methods[] =
{
{ "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
"ZopeSplitter(doc[,synstop]) -- Return a word splitter"
},
{ NULL, NULL }
};
};
/*
 * Module documentation text, including a revision-control $Id$ keyword line.
 *
 * NOTE(review): this listing is a rendered diff without +/- markers; the
 * string literals appear twice, once with revision 1.3 (pre-change) and once
 * with revision 1.4 (post-change).
 */
static char Splitter_module_documentation[] =
"Parse source strings into sequences of words\n"
"\n"
"for use in an inverted index\n"
"\n"
"$Id: ZopeSplitter.c,v 1.3 2001/10/29 17:42:51 andreasjung Exp $\n"
;
"Parse source strings into sequences of words\n"
"\n"
"for use in an inverted index\n"
"\n"
"$Id: ZopeSplitter.c,v 1.4 2001/11/14 13:45:49 andreasjung Exp $\n"
;
void
initZopeSplitter(void)
{
PyObject *m, *d;
char *rev="$Revision: 1.3 $";
char *rev="$Revision: 1.4 $";
/* Create the module and add the functions */
m = Py_InitModule4("ZopeSplitter", Splitter_module_methods,
......@@ -498,5 +545,6 @@ initZopeSplitter(void)
PyDict_SetItemString(d, "__version__",
PyString_FromStringAndSize(rev+11,strlen(rev+11)-2));
if (PyErr_Occurred()) Py_FatalError("can't initialize module Splitter");
if (PyErr_Occurred())
Py_FatalError("can't initialize module Splitter");
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment