- unified indentations

- added encoding parameter to make pre-2.5 Data.fs installations happy because of adding the encoding parameter to the Splitter API

- unified indentations
- added encoding parameter to make pre-2.5 Data.fs installations happy because of adding the encoding parameter to the Splitter API
6e60995a · Andreas Jung · 0709760d · 6e60995a
Commit 6e60995a authored Nov 14, 2001 by Andreas Jung
Hide whitespace changes
Inline Side-by-side

Showing with 269 additions and 221 deletions

lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c ...ndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c +269 -221

No files found.
--- a/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c
+++ b/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c
@@ -89,14 +89,16 @@
 #define UNLESS(E) if(!(E))
 #define UNLESS_ASSIGN(V,E) ASSIGN(V,E) UNLESS(V)

-typedef struct 
+typedef struct
 {
    PyObject_HEAD
    PyObject *text, *synstop;
    char *here, *end;
    int index;
-} Splitter;
- 
+}
+
+Splitter;
+
 static PyObject *next_word(Splitter *, char **, char **);

 static void
@@ -107,7 +109,7 @@ Splitter_reset(Splitter *self)
 }

 static void
-Splitter_dealloc(Splitter *self) 
+Splitter_dealloc(Splitter *self)
 {
    Py_XDECREF(self->text);
    Py_XDECREF(self->synstop);
@@ -120,15 +122,15 @@ Splitter_length(Splitter *self)
    PyObject *res=0;

    Splitter_reset(self);
-    while(1)
-      {
-	UNLESS_ASSIGN(res,next_word(self,NULL,NULL)) return -1;
-	UNLESS(PyString_Check(res))
-	  {
-	    Py_DECREF(res);
-	    break;
-	  }
-      }
+
+    while(1) {
+        UNLESS_ASSIGN(res,next_word(self,NULL,NULL)) return -1;
+        UNLESS(PyString_Check(res)) {
+            Py_DECREF(res);
+            break;
+        }
+    }
+
    return self->index+1;
 }

@@ -149,13 +151,13 @@ Splitter_repeat(Splitter *self, long n)
 /*
  Map an input word to an output word by applying standard
  filtering/mapping words, including synonyms/stop words.
-
+ 
  Input is a word.
  
  Output is:
-
+ 
     None -- The word is a stop word
-
+ 
     sometext -- A replacement for the word
 */
 static PyObject *
@@ -164,136 +166,160 @@ check_synstop(Splitter *self, PyObject *word)
    PyObject *value;
    char *cword;
    int len;
-    
+
    cword = PyString_AsString(word);
    len = PyString_Size(word);
+
    if(len < 2)	/* Single-letter words are stop words! */
    {
-      Py_INCREF(Py_None);
-      return Py_None;
+        Py_INCREF(Py_None);
+        return Py_None;
    }

    /*************************************************************
      Test whether a word has any letters.                       *
-                                                                 */    
-    for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); );
-    if (len < 0)
-    {
+                                                                 */
+    for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
+
+        ;
+    if (len < 0) {
        Py_INCREF(Py_None);
        return Py_None;
    }
+
    /*
     * If no letters, treat it as a stop word.
     *************************************************************/

    Py_INCREF(word);

-    if (self->synstop == NULL) return word;
+    if (self->synstop == NULL)
+        return word;

    while ((value = PyObject_GetItem(self->synstop, word)) &&
-	   PyString_Check(value))
-    {
+            PyString_Check(value)) {
        ASSIGN(word,value);
-	if(len++ > 100) break;	/* Avoid infinite recurssion */
+
+        if(len++ > 100)
+            break;	/* Avoid infinite recurssion */
    }

-    if (value == NULL)
-    {
+    if (value == NULL) {
        PyErr_Clear();
        return word;
    }

    return value;		/* Which must be None! */
 }
- 
+
 #define MAX_WORD 64		/* Words longer than MAX_WORD are stemmed */
-   
+
 static PyObject *
 next_word(Splitter *self, char **startpos, char **endpos)
 {
-  char wbuf[MAX_WORD];
-  char *end, *here, *b;
-  int i = 0, c;
-  PyObject *pyword, *res;
-
-  here=self->here;
-  end=self->end;
-  b=wbuf;
-  while (here < end)
-    {
-      /* skip hyphens */ 
-      if ((i > 0) && (*here == '-'))
-        {
-	  here++;
-	  while (isspace((unsigned char) *here) && (here < end)) here++;
-	  continue;
-	}
-
-      c=tolower((unsigned char) *here);
-      
-      /* Check to see if this character is part of a word */
-      if(isalnum((unsigned char)c) || c=='/' || c=='_')
-        { /* Found a word character */
-	  if(startpos && i==0) *startpos=here;
-	  if(i++ < MAX_WORD) *b++ = c;
+    char wbuf[MAX_WORD];
+    char *end, *here, *b;
+    int i = 0, c;
+    PyObject *pyword, *res;
+
+    here=self->here;
+    end=self->end;
+    b=wbuf;
+
+    while (here < end) {
+        /* skip hyphens */
+
+        if ((i > 0) && (*here == '-')) {
+            here++;
+
+            while (isspace((unsigned char) *here) && (here < end))
+                here++;
+
+            continue;
        }
-      else if (i != 0)
-        { /* We've found the end of a word */
-	  if(i >= MAX_WORD) i=MAX_WORD; /* "stem" the long word */
-
-	  UNLESS(pyword = PyString_FromStringAndSize(wbuf, i))
-            {
-	      self->here=here;
-	      return NULL;
-	    }
-	  
-	  UNLESS(res = check_synstop(self, pyword))
-            {
-	      self->here=here;
-	      Py_DECREF(pyword);
-	      return NULL;
-	    }
-	  
-	  if (res != Py_None)
-            {
-	      if(endpos) *endpos=here;
-	      self->here=here;
-	      Py_DECREF(pyword);
-	      self->index++;
-	      return res;
-	    }
-
-	  /* The word is a stopword, so ignore it */ 
-
-	  Py_DECREF(res);          
-	  Py_DECREF(pyword);
-	  i = 0;
-	  b=wbuf;
+
+        c=tolower((unsigned char) *here);
+
+        /* Check to see if this character is part of a word */
+
+        if(isalnum((unsigned char)c) || c=='/' || c=='_') { /* Found a word character */
+
+            if(startpos && i==0)
+                *startpos=here;
+
+            if(i++ < MAX_WORD)
+                *b++ = c;
+
+        } else if (i != 0) { /* We've found the end of a word */
+
+            if(i >= MAX_WORD)
+                i=MAX_WORD; /* "stem" the long word */
+
+            UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
+                self->here=here;
+                return NULL;
+            }
+
+            UNLESS(res = check_synstop(self, pyword)) {
+                self->here=here;
+                Py_DECREF(pyword);
+                return NULL;
+            }
+
+            if (res != Py_None) {
+                if(endpos)
+                    *endpos=here;
+
+                self->here=here;
+
+                Py_DECREF(pyword);
+
+                self->index++;
+
+                return res;
+            }
+
+            /* The word is a stopword, so ignore it */
+
+            Py_DECREF(res);
+
+            Py_DECREF(pyword);
+
+            i = 0;
+
+            b=wbuf;
        }
-      
-      here++;
+
+        here++;
    }

-  self->here=here;
+    self->here=here;

-  /* We've reached the end of the string */
+    /* We've reached the end of the string */

-  if(i >= MAX_WORD) i=MAX_WORD; /* "stem" the long word */
-  if (i == 0)
-    { 
-      /* No words */
-      self->here=here;
-      Py_INCREF(Py_None);
-      return Py_None;
+    if(i >= MAX_WORD)
+        i=MAX_WORD; /* "stem" the long word */
+
+    if (i == 0) {
+        /* No words */
+        self->here=here;
+        Py_INCREF(Py_None);
+        return Py_None;
    }
-  
-  UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) return NULL;
-  
-  if(endpos) *endpos=here;
-  res = check_synstop(self, pyword);
-  Py_DECREF(pyword);
-  if(PyString_Check(res)) self->index++;
-  return res;
+
+    UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) return NULL;
+
+    if(endpos)
+        *endpos=here;
+
+    res = check_synstop(self, pyword);
+
+    Py_DECREF(pyword);
+
+    if(PyString_Check(res))
+        self->index++;
+
+    return res;
 }

 static PyObject *
@@ -301,18 +327,18 @@ Splitter_item(Splitter *self, int i)
 {
    PyObject *word = NULL;

-    if (i <= self->index) Splitter_reset(self);
+    if (i <= self->index)
+        Splitter_reset(self);

-    while(self->index < i)
-    {
+    while(self->index < i) {
        Py_XDECREF(word);

-        UNLESS(word = next_word(self,NULL,NULL)) return NULL; 
-        if (word == Py_None)
-        {
+        UNLESS(word = next_word(self,NULL,NULL)) return NULL;
+
+        if (word == Py_None) {
            Py_DECREF(word);
            PyErr_SetString(PyExc_IndexError,
-			    "Splitter index out of range");
+                            "Splitter index out of range");
            return NULL;
        }
    }
@@ -328,14 +354,14 @@ Splitter_slice(Splitter *self, int i, int j)
 }

 static PySequenceMethods Splitter_as_sequence = {
-    (inquiry)Splitter_length,        /*sq_length*/
-    (binaryfunc)Splitter_concat,     /*sq_concat*/
-    (intargfunc)Splitter_repeat,     /*sq_repeat*/
-    (intargfunc)Splitter_item,       /*sq_item*/
-    (intintargfunc)Splitter_slice,   /*sq_slice*/
-    (intobjargproc)0,                    /*sq_ass_item*/
-    (intintobjargproc)0,                 /*sq_ass_slice*/
-};
+            (inquiry)Splitter_length,        /*sq_length*/
+            (binaryfunc)Splitter_concat,     /*sq_concat*/
+            (intargfunc)Splitter_repeat,     /*sq_repeat*/
+            (intargfunc)Splitter_item,       /*sq_item*/
+            (intintargfunc)Splitter_slice,   /*sq_slice*/
+            (intobjargproc)0,                    /*sq_ass_item*/
+            (intintobjargproc)0,                 /*sq_ass_slice*/
+        };

 static PyObject *
 Splitter_pos(Splitter *self, PyObject *args)
@@ -346,20 +372,21 @@ Splitter_pos(Splitter *self, PyObject *args)

    UNLESS(PyArg_Parse(args, "i", &i)) return NULL;

-    if (i <= self->index) Splitter_reset(self);
+    if (i <= self->index)
+        Splitter_reset(self);

-    while(self->index < i)
-    {
-	UNLESS(res=next_word(self, &start, &end)) return NULL;
-	if(PyString_Check(res))
-	  {
+    while(self->index < i) {
+        UNLESS(res=next_word(self, &start, &end)) return NULL;
+
+        if(PyString_Check(res)) {
            self->index++;
-	    Py_DECREF(res);
-	    continue;
-	  }
-	Py_DECREF(res);
-	PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
-	return NULL;
+            Py_DECREF(res);
+            continue;
+        }
+
+        Py_DECREF(res);
+        PyErr_SetString(PyExc_IndexError, "Splitter index out of range");
+        return NULL;
    }

    ctext=PyString_AsString(self->text);
@@ -369,46 +396,53 @@ Splitter_pos(Splitter *self, PyObject *args)
 static PyObject *
 Splitter_indexes(Splitter *self, PyObject *args)
 {
-  PyObject *word, *r, *w=0, *index=0;
-  int i=0;
+    PyObject *word, *r, *w=0, *index=0;
+    int i=0;

-  UNLESS(PyArg_ParseTuple(args,"O",&word)) return NULL;
-  UNLESS(r=PyList_New(0)) return NULL;
-  UNLESS(word=check_synstop(self, word)) goto err;
+    UNLESS(PyArg_ParseTuple(args,"O",&word)) return NULL;
+    UNLESS(r=PyList_New(0)) return NULL;
+    UNLESS(word=check_synstop(self, word)) goto err;

-  Splitter_reset(self);
-  while(1)
-    {
-      UNLESS_ASSIGN(w,next_word(self, NULL, NULL)) goto err;
-      UNLESS(PyString_Check(w)) break;
-      if(PyObject_Compare(word,w)==0)
-	{
-	  UNLESS_ASSIGN(index,PyInt_FromLong(i)) goto err;
-	  if(PyList_Append(r,index) < 0) goto err;
-	}
-      i++;
+    Splitter_reset(self);
+
+    while(1) {
+        UNLESS_ASSIGN(w,next_word(self, NULL, NULL)) goto err;
+        UNLESS(PyString_Check(w)) break;
+
+        if(PyObject_Compare(word,w)==0) {
+            UNLESS_ASSIGN(index,PyInt_FromLong(i)) goto err;
+
+            if(PyList_Append(r,index) < 0)
+                goto err;
+        }
+
+        i++;
    }
-  Py_XDECREF(w);
-  Py_XDECREF(index);
-  return r;
+
+    Py_XDECREF(w);
+    Py_XDECREF(index);
+    return r;

 err:
-  Py_DECREF(r);
-  Py_XDECREF(index);
-  return NULL;
+    Py_DECREF(r);
+    Py_XDECREF(index);
+    return NULL;
 }

-static struct PyMethodDef Splitter_methods[] = {
-    { "pos", (PyCFunction)Splitter_pos, 0,
-      "pos(index) -- Return the starting and ending position of a token" },
-    { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
-      "indexes(word) -- Return a list of the indexes of word in the sequence",
-    },
-    { NULL, NULL }		/* sentinel */
-};
+static struct PyMethodDef Splitter_methods[] =
+    {
+        { "pos", (PyCFunction)Splitter_pos, 0,
+            "pos(index) -- Return the starting and ending position of a token"
+        },
+
+        { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
+          "indexes(word) -- Return a list of the indexes of word in the sequence",
+        },
+        { NULL, NULL }		/* sentinel */
+    };

 static PyObject *
-Splitter_getattr(Splitter *self, char *name) 
+Splitter_getattr(Splitter *self, char *name)
 {
    return Py_FindMethod(Splitter_methods, (PyObject *)self, name);
 }
@@ -416,87 +450,101 @@ Splitter_getattr(Splitter *self, char *name)
 static char SplitterType__doc__[] = "";

 static PyTypeObject SplitterType = {
-    PyObject_HEAD_INIT(NULL)
-    0,                                 /*ob_size*/
-    "Splitter",                    /*tp_name*/
-    sizeof(Splitter),              /*tp_basicsize*/
-    0,                                 /*tp_itemsize*/
-    /* methods */
-    (destructor)Splitter_dealloc,  /*tp_dealloc*/
-    (printfunc)0,                      /*tp_print*/
-    (getattrfunc)Splitter_getattr, /*tp_getattr*/
-    (setattrfunc)0,                    /*tp_setattr*/
-    (cmpfunc)0,                        /*tp_compare*/
-    (reprfunc)0,                       /*tp_repr*/
-    0,                                 /*tp_as_number*/
-    &Splitter_as_sequence,         /*tp_as_sequence*/
-    0,                                 /*tp_as_mapping*/
-    (hashfunc)0,                       /*tp_hash*/
-    (ternaryfunc)0,                    /*tp_call*/
-    (reprfunc)0,                       /*tp_str*/
-
-    /* Space for future expansion */
-    0L,0L,0L,0L,
-    SplitterType__doc__ /* Documentation string */
-};
+                                       PyObject_HEAD_INIT(NULL)
+                                       0,                                 /*ob_size*/
+                                       "Splitter",                    /*tp_name*/
+                                       sizeof(Splitter),              /*tp_basicsize*/
+                                       0,                                 /*tp_itemsize*/
+                                       /* methods */
+                                       (destructor)Splitter_dealloc,  /*tp_dealloc*/
+                                       (printfunc)0,                      /*tp_print*/
+                                       (getattrfunc)Splitter_getattr, /*tp_getattr*/
+                                       (setattrfunc)0,                    /*tp_setattr*/
+                                       (cmpfunc)0,                        /*tp_compare*/
+                                       (reprfunc)0,                       /*tp_repr*/
+                                       0,                                 /*tp_as_number*/
+                                       &Splitter_as_sequence,         /*tp_as_sequence*/
+                                       0,                                 /*tp_as_mapping*/
+                                       (hashfunc)0,                       /*tp_hash*/
+                                       (ternaryfunc)0,                    /*tp_call*/
+                                       (reprfunc)0,                       /*tp_str*/
+
+                                       /* Space for future expansion */
+                                       0L,0L,0L,0L,
+                                       SplitterType__doc__ /* Documentation string */
+                                   };
+
+static char *splitter_args[]={"doc","synstop","encoding",NULL};
+

 static PyObject *
-get_Splitter(PyObject *modinfo, PyObject *args)
+get_Splitter(PyObject *modinfo, PyObject *args,PyObject * keywds)
 {
    Splitter *self;
    PyObject *doc, *synstop = NULL;
+    char *encoding = "latin1";

-    UNLESS(PyArg_ParseTuple(args,"O|O",&doc,&synstop)) return NULL;
+    UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args, &doc,&synstop,&encoding)) return NULL;

    UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;

-    if(synstop)
-      {
-	self->synstop=synstop;
-	Py_INCREF(synstop);
-      }
-    else self->synstop=NULL;
+    if(synstop) {
+        self->synstop=synstop;
+        Py_INCREF(synstop);
+
+    } else
+        self->synstop=NULL;

    UNLESS(self->text = PyObject_Str(doc)) goto err;
+
    UNLESS(self->here=PyString_AsString(self->text)) goto err;
+
    self->end = self->here + PyString_Size(self->text);
+
    self->index = -1;
+
    return (PyObject*)self;
+
 err:
    Py_DECREF(self);
+
    return NULL;
 }

-static struct PyMethodDef Splitter_module_methods[] = {
-    { "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS,
-      "ZopeSplitter(doc[,synstop]) -- Return a word splitter" },
-    { NULL, NULL }
-};
+static struct PyMethodDef Splitter_module_methods[] =
+    {
+        { "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
+            "ZopeSplitter(doc[,synstop]) -- Return a word splitter"
+        },
+
+        { NULL, NULL }
+    };

-static char Splitter_module_documentation[] = 
-"Parse source strings into sequences of words\n"
-"\n"
-"for use in an inverted index\n"
-"\n"
-"$Id: ZopeSplitter.c,v 1.3 2001/10/29 17:42:51 andreasjung Exp $\n"
-;
+static char Splitter_module_documentation[] =
+    "Parse source strings into sequences of words\n"
+    "\n"
+    "for use in an inverted index\n"
+    "\n"
+    "$Id: ZopeSplitter.c,v 1.4 2001/11/14 13:45:49 andreasjung Exp $\n"
+    ;


 void
 initZopeSplitter(void)
 {
-  PyObject *m, *d;
-  char *rev="$Revision: 1.3 $";
-  
-  /* Create the module and add the functions */
-  m = Py_InitModule4("ZopeSplitter", Splitter_module_methods,
-                     Splitter_module_documentation,
-                     (PyObject*)NULL,PYTHON_API_VERSION);
-  
-  /* Add some symbolic constants to the module */
-  d = PyModule_GetDict(m);
-  PyDict_SetItemString(d, "__version__",
-		       PyString_FromStringAndSize(rev+11,strlen(rev+11)-2));
+    PyObject *m, *d;
+    char *rev="$Revision: 1.4 $";
+
+    /* Create the module and add the functions */
+    m = Py_InitModule4("ZopeSplitter", Splitter_module_methods,
+                       Splitter_module_documentation,
+                       (PyObject*)NULL,PYTHON_API_VERSION);
+
+    /* Add some symbolic constants to the module */
+    d = PyModule_GetDict(m);
+    PyDict_SetItemString(d, "__version__",
+                         PyString_FromStringAndSize(rev+11,strlen(rev+11)-2));

-  if (PyErr_Occurred()) Py_FatalError("can't initialize module Splitter");
+    if (PyErr_Occurred())
+        Py_FatalError("can't initialize module Splitter");
 }