[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src - ZopeSplitter.c:1.5.10.2

Andreas Jung andreas@zope.com
Tue, 8 Jan 2002 14:09:34 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv16661/src

Modified Files:
      Tag: ajung-textindexng-branch
	ZopeSplitter.c 
Log Message:
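Introducing new constructor parameters:

'singlechar'   -- if true, single-character words are no longer treated as stop words (default 0)
'indexnumbers' -- if true, words containing no alphabetic characters are no longer dropped (default 0)
'maxwords'     -- maximum length of a word in characters; longer words are truncated ("stemmed") to this length (default 64, replacing the former MAX_WORD constant)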


=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c 1.5.10.1 => 1.5.10.2 ===
   
  ****************************************************************************/
+
+
 #include "Python.h"
 #include <ctype.h>
 
@@ -23,6 +25,9 @@
     PyObject *text, *synstop;
     char *here, *end;
     int index;
+    int allow_single_chars;
+    int index_numbers;
+    int max_words;
 }
 
 Splitter;
@@ -98,7 +103,7 @@
     cword = PyString_AsString(word);
     len = PyString_Size(word);
 
-    if(len < 2)	/* Single-letter words are stop words! */
+    if(len < 2 && ! self->allow_single_chars)	/* Single-letter words are stop words! */
     {
         Py_INCREF(Py_None);
         return Py_None;
@@ -110,7 +115,7 @@
     for (; --len >= 0 && ! isalpha((unsigned char)cword[len]); )
 
         ;
-    if (len < 0) {
+    if (len < 0 && ! self->index_numbers) {
         Py_INCREF(Py_None);
         return Py_None;
     }
@@ -140,12 +145,11 @@
     return value;		/* Which must be None! */
 }
 
-#define MAX_WORD 64		/* Words longer than MAX_WORD are stemmed */
 
 static PyObject *
 next_word(Splitter *self, char **startpos, char **endpos)
 {
-    char wbuf[MAX_WORD];
+    char wbuf[256];
     char *end, *here, *b;
     int i = 0, c;
     PyObject *pyword, *res;
@@ -175,13 +179,13 @@
             if(startpos && i==0)
                 *startpos=here;
 
-            if(i++ < MAX_WORD)
+            if(i++ < self->max_words)
                 *b++ = c;
 
         } else if (i != 0) { /* We've found the end of a word */
 
-            if(i >= MAX_WORD)
-                i=MAX_WORD; /* "stem" the long word */
+            if(i >= self->max_words)
+                i=self->max_words; /* "stem" the long word */
 
             UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) {
                 self->here=here;
@@ -225,8 +229,8 @@
 
     /* We've reached the end of the string */
 
-    if(i >= MAX_WORD)
-        i=MAX_WORD; /* "stem" the long word */
+    if(i >= self->max_words)
+        i=self->max_words; /* "stem" the long word */
 
     if (i == 0) {
         /* No words */
@@ -307,14 +311,14 @@
 }
 
 static PySequenceMethods Splitter_as_sequence = {
-            (inquiry)Splitter_length,        /*sq_length*/
-            (binaryfunc)Splitter_concat,     /*sq_concat*/
-            (intargfunc)Splitter_repeat,     /*sq_repeat*/
-            (intargfunc)Splitter_item,       /*sq_item*/
-            (intintargfunc)Splitter_slice,   /*sq_slice*/
-            (intobjargproc)0,                    /*sq_ass_item*/
-            (intintobjargproc)0,                 /*sq_ass_slice*/
-        };
+    (inquiry)Splitter_length,        /*sq_length*/
+    (binaryfunc)Splitter_concat,     /*sq_concat*/
+    (intargfunc)Splitter_repeat,     /*sq_repeat*/
+    (intargfunc)Splitter_item,       /*sq_item*/
+    (intintargfunc)Splitter_slice,   /*sq_slice*/
+    (intobjargproc)0,                    /*sq_ass_item*/
+    (intintobjargproc)0,                 /*sq_ass_slice*/
+};
 
 static PyObject *
 Splitter_pos(Splitter *self, PyObject *args)
@@ -407,31 +411,31 @@
 static char SplitterType__doc__[] = "";
 
 static PyTypeObject SplitterType = {
-                                       PyObject_HEAD_INIT(NULL)
-                                       0,                                 /*ob_size*/
-                                       "Splitter",                    /*tp_name*/
-                                       sizeof(Splitter),              /*tp_basicsize*/
-                                       0,                                 /*tp_itemsize*/
-                                       /* methods */
-                                       (destructor)Splitter_dealloc,  /*tp_dealloc*/
-                                       (printfunc)0,                      /*tp_print*/
-                                       (getattrfunc)Splitter_getattr, /*tp_getattr*/
-                                       (setattrfunc)0,                    /*tp_setattr*/
-                                       (cmpfunc)0,                        /*tp_compare*/
-                                       (reprfunc)0,                       /*tp_repr*/
-                                       0,                                 /*tp_as_number*/
-                                       &Splitter_as_sequence,         /*tp_as_sequence*/
-                                       0,                                 /*tp_as_mapping*/
-                                       (hashfunc)0,                       /*tp_hash*/
-                                       (ternaryfunc)0,                    /*tp_call*/
-                                       (reprfunc)0,                       /*tp_str*/
-
-                                       /* Space for future expansion */
-                                       0L,0L,0L,0L,
-                                       SplitterType__doc__ /* Documentation string */
-                                   };
+    PyObject_HEAD_INIT(NULL)
+    0,                                 /*ob_size*/
+    "Splitter",                    /*tp_name*/
+    sizeof(Splitter),              /*tp_basicsize*/
+    0,                                 /*tp_itemsize*/
+    /* methods */
+    (destructor)Splitter_dealloc,  /*tp_dealloc*/
+    (printfunc)0,                      /*tp_print*/
+    (getattrfunc)Splitter_getattr, /*tp_getattr*/
+    (setattrfunc)0,                    /*tp_setattr*/
+    (cmpfunc)0,                        /*tp_compare*/
+    (reprfunc)0,                       /*tp_repr*/
+    0,                                 /*tp_as_number*/
+    &Splitter_as_sequence,         /*tp_as_sequence*/
+    0,                                 /*tp_as_mapping*/
+    (hashfunc)0,                       /*tp_hash*/
+    (ternaryfunc)0,                    /*tp_call*/
+    (reprfunc)0,                       /*tp_str*/
+
+    /* Space for future expansion */
+    0L,0L,0L,0L,
+    SplitterType__doc__ /* Documentation string */
+};
 
-static char *splitter_args[]={"doc","synstop","encoding",NULL};
+static char *splitter_args[]={"doc","synstop","encoding","singlechar","indexnumbers","maxwords",NULL};
 
 
 static PyObject *
@@ -440,8 +444,12 @@
     Splitter *self;
     PyObject *doc, *synstop = NULL;
     char *encoding = "latin1";
+    int single_char = 0;
+    int index_numbers = 0;
+    int max_words= 64;
 
-    UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Os",splitter_args, &doc,&synstop,&encoding)) return NULL;
+    UNLESS(PyArg_ParseTupleAndKeywords(args,keywds,"O|Osiii",splitter_args, \
+                                       &doc,&synstop,&encoding,&single_char,&index_numbers,&max_words)) return NULL;
 
     UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
 
@@ -459,6 +467,9 @@
     self->end = self->here + PyString_Size(self->text);
 
     self->index = -1;
+    self->allow_single_chars = single_char;
+    self->index_numbers      = index_numbers;
+    self->max_words          = max_words;
 
     return (PyObject*)self;
 
@@ -471,7 +482,7 @@
 static struct PyMethodDef Splitter_module_methods[] =
     {
         { "ZopeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
-            "ZopeSplitter(doc[,synstop]) -- Return a word splitter"
+            "ZopeSplitter(doc[,synstop][,encoding][,singlechar][,indexnumbers]) -- Return a word splitter"
         },
 
         { NULL, NULL }