[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src - UnicodeSplitter.c:1.8

Shane Hathaway shane@digicool.com
Fri, 19 Oct 2001 16:08:06 -0400


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv20701/src

Modified Files:
	UnicodeSplitter.c 
Log Message:
- Fixed some refcount bugs.

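- Implemented stemming (truncating words to at most MAX_WORD characters) in a
  simpler way: the end of the slice is clamped with min() instead of slicing
  the word a second time.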

- Made checkSynword() easier to read.

- Used PyList_GetItem() to do bounds checking in Splitter_item().

- Made Splitter_indexes() slightly faster by keeping a local copy of the list length.

- splitUnicodeString() now returns -1 on error.

- Made splitUnicodeString() easier to read.

- prepareString() now performs its copy the standard way, by passing the
  source buffer directly to PyUnicode_FromUnicode().


=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/UnicodeSplitter/src/UnicodeSplitter.c 1.7 => 1.8 ===
 #include "Python.h"
 
 #define MAX_WORD 64		/* Words longer than MAX_WORD are stemmed */
 
+#ifndef min
+#define min(a,b) ((a)<(b)?(a):(b))
+#endif
+
 typedef struct
 {
     PyObject_HEAD
@@ -12,19 +17,18 @@
 static
 PyUnicodeObject *prepareString(PyUnicodeObject *o);
 
-static PyObject * checkSynword(Splitter *self,PyObject *word)
+static PyObject *checkSynword(Splitter *self, PyObject *word)
 {
+    /* Always returns a borrowed reference */
     PyObject *value;
-    PyObject *res;
 
     if (self->synstop) {
         value = PyDict_GetItem(self->synstop,word);
-        if (value) {
-            res = value;
-        } else res = word;
-    } res = word;
-
-    return res;
+        if (value != NULL) {
+          return value;
+        }
+    }
+    return word;
 }
 
 static void
@@ -60,36 +64,29 @@
 static PyObject *
 Splitter_item(Splitter *self, int i)
 {
-    PyObject *item=NULL;
-
-    if (i >= PyList_Size(self->list)) {
-        PyErr_SetString(PyExc_IndexError,"Splitter index out of range");
-        return NULL;
-    }
-
-    item=PyList_GET_ITEM(self->list , i);
-    Py_INCREF(item);
-
-    return item;
+  PyObject *item;
+  item = PyList_GetItem(self->list, i);
+  Py_XINCREF(item);  /* Promote borrowed ref unless exception */
+  return item;
 }
 
 
 static PyObject *
 Splitter_indexes(Splitter *self, PyObject *args)
 {
-    int i=0;
+    int i=0, size;
     PyObject *word=NULL,*item=NULL,*r=NULL,*index=NULL;
 
     if (! (PyArg_ParseTuple(args,"O",&word))) return NULL;
     if (! (r=PyList_New(0))) return NULL;
 
-    for (i=0;i<PyList_Size(self->list);i++) {
+    size = PyList_Size(self->list);
+    for (i=0;i<size;i++) {
         item=PyList_GET_ITEM(self->list,i);
 
         if (PyUnicode_Compare(word,item)==0) {
             index=PyInt_FromLong(i);
             if(!index) return NULL;
-            Py_INCREF(item);
             PyList_Append(r,index);
         }
     }
@@ -125,11 +122,11 @@
 static struct PyMethodDef Splitter_methods[] =
     {
         { "pos", (PyCFunction)Splitter_pos, 0,
-            "pos(index) -- Return the starting and ending position of a token"
+          "pos(index) -- Return the starting and ending position of a token"
         },
 
         { "indexes", (PyCFunction)Splitter_indexes, METH_VARARGS,
-          "indexes(word) -- Return al list of the indexes of word in the sequence",
+          "indexes(word) -- Return a list of the indexes of word in the sequence",
         },
         { NULL, NULL }		/* sentinel */
     };
@@ -181,16 +178,15 @@
     int i=0;
     int start=0;
 
-    if (! (doc1 = prepareString(doc))) {
-
-        return 0;
-    }
+    doc1 = prepareString(doc);
+    if (doc1 == NULL)
+      return -1;
 
     s=doc1->str;
 
     self->list = PyList_New(0);
 
-    do {
+    for (i = 0; i < len; s++, i++) {
         register Py_UNICODE ch;
 
         ch = *s;
@@ -208,66 +204,38 @@
             if (!(Py_UNICODE_ISALNUM(ch) || ch=='/' || ch=='_' || ch=='-')) {
                 inside_word = 0;
 
-                word = PySequence_GetSlice((PyObject *)doc,start,i);
-                if (word==NULL) {
-                    Py_DECREF(doc1);
-                    return 0;
-                }
-
-                // Stem word
-                if (PyUnicode_GET_SIZE(word)>MAX_WORD) {
-                    PyObject *tmpword=word;
-                    tmpword = PySequence_GetSlice(word,0,MAX_WORD);
-                    if (tmpword==NULL) {
-                        Py_DECREF(doc1);
-                        return 0;
-                    }
-
-                    Py_DECREF(word);
-
-                    word = tmpword;
-                }
+                word = PySequence_GetSlice((PyObject *)doc1,start,
+                                           // Stem word
+                                           min(i, start + MAX_WORD));
+                if (word==NULL)
+                  goto err;
 
                 synword = checkSynword(self,word);
                 if (synword != Py_None) {
-                    PyList_Append(self->list,synword);
+                  PyList_Append(self->list,synword);
                 }
 
-                Py_DECREF(word);
-
                 start =  0;
 #ifdef DEBUG
                 PyObject_Print(word,stdout,0);
                 fflush(stdout);
 #endif
+                Py_DECREF(word);
             }
         }
-
-        s++;
-
-    } while(++i < len);
+    }
 
     if (inside_word) {
-        word = PySequence_GetSlice((PyObject *)doc,start,i);
-        if (word==NULL) {
-            Py_DECREF(doc1);
-            return 0;
-        }
-
-        // Stem word
-        if (PyUnicode_GET_SIZE(word)>MAX_WORD) {
-            word = PySequence_GetSlice(word,0,MAX_WORD);
-            if (word==NULL) {
-                Py_DECREF(doc1);
-                return 0;
-            }
-
-        }
+        word = PySequence_GetSlice((PyObject *)doc1,start,
+                                   // Stem word
+                                   min(len, start + MAX_WORD));
+        if (word==NULL)
+          goto err;
 
         synword = checkSynword(self,word);
         if (synword != Py_None) {
-            PyList_Append(self->list,synword);
-        } else Py_DECREF(synword);
+          PyList_Append(self->list,synword);
+        }
 
         Py_DECREF(word);
     }
@@ -279,6 +247,10 @@
 
     Py_DECREF(doc1);
     return 1;
+
+ err:
+    Py_DECREF(doc1);
+    return -1;
 }
 
 
@@ -304,12 +276,9 @@
 {
     PyUnicodeObject *u;
 
-    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, o->length);
-    if (u == NULL) return NULL;
-
-    Py_UNICODE_COPY(u->str, o->str, o->length);
-    fixlower(u);
-
+    u = (PyUnicodeObject*) PyUnicode_FromUnicode(o->str, o->length);
+    if (u != NULL)
+      fixlower(u);
     return  u;
 }
 
@@ -317,7 +286,7 @@
 
 
 static PyObject *
-get_Splitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
+newSplitter(PyObject *modinfo, PyObject *args,PyObject *keywds)
 {
     Splitter *self=NULL;
     PyObject *doc=NULL, *unicodedoc=NULL,*synstop=NULL;
@@ -349,17 +318,13 @@
         return NULL;
     }
 
-
-
     if (synstop) {
         self->synstop = synstop;
         Py_INCREF(synstop);
     } else  self->synstop=NULL;
 
-    if (! (splitUnicodeString(self,(PyUnicodeObject *)unicodedoc))) {
-        goto err;
-    }
-
+    if ((splitUnicodeString(self,(PyUnicodeObject *)unicodedoc)) < 0)
+      goto err;
 
     Py_DECREF(unicodedoc);
     return (PyObject*)self;
@@ -373,8 +338,10 @@
 
 static struct PyMethodDef Splitter_module_methods[] =
     {
-        { "UnicodeSplitter", (PyCFunction)get_Splitter, METH_VARARGS|METH_KEYWORDS,
-            "UnicodeSplitter(doc[,synstop][,encoding='latin1']) -- Return a word splitter"
+        { "UnicodeSplitter", (PyCFunction)newSplitter,
+          METH_VARARGS|METH_KEYWORDS,
+          "UnicodeSplitter(doc[,synstop][,encoding='latin1']) "
+          "-- Return a word splitter"
         },
         { NULL, NULL }
     };