[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src - ZopeSplitter.c:1.2.10.1

Andreas Jung andreas@zope.com
Thu, 27 Sep 2001 11:37:12 -0400


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src
In directory cvs.zope.org:/tmp/cvs-serv32342

Modified Files:
      Tag: ajung-unicode
	ZopeSplitter.c 
Log Message:
Very rough prototype of a Unicode-aware splitter.


=== Zope/lib/python/Products/PluginIndexes/TextIndex/Splitter/ZopeSplitter/src/ZopeSplitter.c 1.2 => 1.2.10.1 ===
     PyObject_HEAD
     PyObject *text, *synstop;
+    PyObject *list;
     char *here, *end;
     int index;
 } Splitter;
@@ -117,6 +118,10 @@
 static int
 Splitter_length(Splitter *self)
 {
+
+    puts("inside Splitter_length()");
+
+    return PyList_Size(self->list);
     PyObject *res=0;
 
     Splitter_reset(self);
@@ -211,113 +216,29 @@
 static PyObject *
 next_word(Splitter *self, char **startpos, char **endpos)
 {
-  char wbuf[MAX_WORD];
-  char *end, *here, *b;
-  int i = 0, c;
-  PyObject *pyword, *res;
-
-  here=self->here;
-  end=self->end;
-  b=wbuf;
-  while (here < end)
-    {
-      /* skip hyphens */ 
-      if ((i > 0) && (*here == '-'))
-        {
-	  here++;
-	  while (isspace((unsigned char) *here) && (here < end)) here++;
-	  continue;
-	}
+    PyObject *list=NULL, *word=NULL;
 
-      c=tolower((unsigned char) *here);
-      
-      /* Check to see if this character is part of a word */
-      if(isalnum((unsigned char)c) || c=='/' || c=='_')
-        { /* Found a word character */
-	  if(startpos && i==0) *startpos=here;
-	  if(i++ < MAX_WORD) *b++ = c;
-        }
-      else if (i != 0)
-        { /* We've found the end of a word */
-	  if(i >= MAX_WORD) i=MAX_WORD; /* "stem" the long word */
-
-	  UNLESS(pyword = PyString_FromStringAndSize(wbuf, i))
-            {
-	      self->here=here;
-	      return NULL;
-	    }
-	  
-	  UNLESS(res = check_synstop(self, pyword))
-            {
-	      self->here=here;
-	      Py_DECREF(pyword);
-	      return NULL;
-	    }
-	  
-	  if (res != Py_None)
-            {
-	      if(endpos) *endpos=here;
-	      self->here=here;
-	      Py_DECREF(pyword);
-	      self->index++;
-	      return res;
-	    }
-
-	  /* The word is a stopword, so ignore it */ 
-
-	  Py_DECREF(res);          
-	  Py_DECREF(pyword);
-	  i = 0;
-	  b=wbuf;
-        }
-      
-      here++;
+    if (self->text == NULL) {
+        Py_INCREF(Py_None);
+        return Py_None;
     }
 
-  self->here=here;
-
-  /* We've reached the end of the string */
+    list = PyUnicode_Split(self->text,NULL,1);
+    word = PyList_GetItem(list,0);    
 
-  if(i >= MAX_WORD) i=MAX_WORD; /* "stem" the long word */
-  if (i == 0)
-    { 
-      /* No words */
-      self->here=here;
-      Py_INCREF(Py_None);
-      return Py_None;
-    }
-  
-  UNLESS(pyword = PyString_FromStringAndSize(wbuf, i)) return NULL;
-  
-  if(endpos) *endpos=here;
-  res = check_synstop(self, pyword);
-  Py_DECREF(pyword);
-  if(PyString_Check(res)) self->index++;
-  return res;
+    if (PyList_Size(list)>1) self->text= PyList_GetItem(list,1);
+    else                     self->text=NULL;
+    
+    PyObject_Print(word,stdout,0);
+    fflush(stdout);
+        
+    return word;
 }
 
 static PyObject *
 Splitter_item(Splitter *self, int i)
 {
-    PyObject *word = NULL;
-
-    if (i <= self->index) Splitter_reset(self);
-
-    while(self->index < i)
-    {
-        Py_XDECREF(word);
-
-        UNLESS(word = next_word(self,NULL,NULL)) return NULL; 
-        if (word == Py_None)
-        {
-            Py_DECREF(word);
-            PyErr_SetString(PyExc_IndexError,
-			    "Splitter index out of range");
-            return NULL;
-        }
-    }
-
-    return word;
+    return PyList_GetItem(self->list , i);
 }
 
 static PyObject *
@@ -444,11 +365,12 @@
 get_Splitter(PyObject *modinfo, PyObject *args)
 {
     Splitter *self;
-    PyObject *doc, *synstop = NULL;
+    PyObject *synstop = NULL;
+    PyObject *doc=NULL;
 
+    UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
     UNLESS(PyArg_ParseTuple(args,"O|O",&doc,&synstop)) return NULL;
 
-    UNLESS(self = PyObject_NEW(Splitter, &SplitterType)) return NULL;
 
     if(synstop)
       {
@@ -457,11 +379,12 @@
       }
     else self->synstop=NULL;
 
-    UNLESS(self->text = PyObject_Str(doc)) goto err;
-    UNLESS(self->here=PyString_AsString(self->text)) goto err;
-    self->end = self->here + PyString_Size(self->text);
+    UNLESS(self->list = PyUnicode_Split(doc,NULL,-1)) goto err;
+    UNLESS(self->text = doc) goto err;
+
     self->index = -1;
     return (PyObject*)self;
+
 err:
     Py_DECREF(self);
     return NULL;