[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/src - Stemmer.c:1.1.2.1

Andreas Jung andreas@digicool.com
Wed, 13 Feb 2002 11:26:28 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/src
In directory cvs.zope.org:/tmp/cvs-serv30556/PyStemmer/src

Added Files:
      Tag: ajung-textindexng-branch
	Stemmer.c 
Log Message:
added PyStemmer


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/src/Stemmer.c ===
#include "Python.h"

#include "german/stem.h"
#include "french/stem.h"
#include "porter/stem.h"
#include "english/stem.h"
#include "dutch/stem.h"
#include "spanish/stem.h"
#include "portuguese/stem.h"
#include "swedish/stem.h"
#include "italian/stem.h"
#include "russian/stem.h"
#include "danish/stem.h"
#include "norwegian/stem.h"
#include "header.h"

typedef struct
{
    PyObject_HEAD
    PyObject *cache;
    int cache_size;
    struct SN_env *env;
    char * language;
    int (* stem_func)(struct SN_env *);
    
}
Stemmer;


static void
Stemmer_dealloc(Stemmer *self)
{
    if (!strcmp(self->language,"porter")) {
        porter_close_env(self->env);
    } else if (!strcmp(self->language,"german")) {
        german_close_env(self->env);
    } else if (!strcmp(self->language,"french")) {
        french_close_env(self->env);
    } else if (!strcmp(self->language,"dutch")) {
        dutch_close_env(self->env);
    } else if (!strcmp(self->language,"spanish")) {
        spanish_close_env(self->env);
    } else if (!strcmp(self->language,"english")) {
        english_close_env(self->env);
    } else if (!strcmp(self->language,"swedish")) {
        swedish_close_env(self->env);
    } else if (!strcmp(self->language,"italian")) {
        italian_close_env(self->env);
    } else if (!strcmp(self->language,"portuguese")) {
        portuguese_close_env(self->env);
    } else if (!strcmp(self->language,"danish")) {
        danish_close_env(self->env);
    } else if (!strcmp(self->language,"russian")) {
        russian_close_env(self->env);
    } else if (!strcmp(self->language,"norwegian")) {
        norwegian_close_env(self->env);
    }

    free(self->language);

    Py_DECREF(self->cache);

    PyMem_DEL(self);
}

static PyObject *Stemmer_availableStemmers(Stemmer *self,PyObject*args)
{
    PyObject *list;

    list = PyList_New(0);

    PyList_Append(list,PyString_FromString("german"));
    PyList_Append(list,PyString_FromString("french"));
    PyList_Append(list,PyString_FromString("porter"));
    PyList_Append(list,PyString_FromString("english"));
    PyList_Append(list,PyString_FromString("dutch"));
    PyList_Append(list,PyString_FromString("spanish"));
    PyList_Append(list,PyString_FromString("portuguese"));
    PyList_Append(list,PyString_FromString("swedish"));
    PyList_Append(list,PyString_FromString("italian"));
    PyList_Append(list,PyString_FromString("russian"));
    PyList_Append(list,PyString_FromString("danish" ));
    PyList_Append(list,PyString_FromString("norwegian"));
    PyList_Sort(list);

    return list;
}

static PyObject *Stemmer_language(Stemmer *self,PyObject*args)
{
    PyObject *language;

    language = PyString_FromString(self->language);

    return language;
}


static PyObject *Stemmer_getCacheSize(Stemmer *self)
{
    PyObject * size;
        
    size = PyInt_FromLong( (long) self->cache_size);

    return size;
}

static PyObject *Stemmer_setCacheSize(Stemmer *self,PyObject*args)
{
    int size;

    if (! (PyArg_ParseTuple(args,"i",&size)))
        return NULL;

    self->cache_size = size;

    return  Stemmer_getCacheSize(self);
}



static PyObject *stem_word(Stemmer *self,PyObject *pyword) {

    char * word;
    PyObject *stemmed;

    stemmed = PyDict_GetItem(self->cache,pyword);

    if (stemmed==NULL) {

        word = PyString_AsString(pyword);

        SN_set_current(self->env,strlen(word),word);
        self->stem_func(self->env);
        self->env->p[self->env->l] = '\0';

        stemmed = PyString_FromString(self->env->p);

        PyDict_SetItem(self->cache, pyword, stemmed);
        Py_INCREF(stemmed);
    } else {
        Py_INCREF(stemmed);
    }

    return stemmed;
}

static PyObject *Stemmer_stem(Stemmer *self,PyObject *args)
{
    PyObject *stemmed,*data;

    if (self==NULL) {
        PyErr_SetString(PyExc_TypeError, "can not call stem() on unbound method");
        return NULL;
    }

    if (! (PyArg_ParseTuple(args,"O",&data)))
        return NULL;

    if (PyString_Check(data)) {

        stemmed = stem_word(self,data);

        return stemmed;

    } else if (PyList_Check(data)) {

        PyObject * item;
        PyObject * res;
        int i;

        res = PyList_New(0);

        for (i=0; i<PyList_Size(data);i++) {

            item = PyList_GetItem(data,i);
            if (!PyString_Check(item)) {

                PyErr_SetString(PyExc_TypeError, "Unsupported datatype found in list (only strings allowed)");
                return NULL;
            }

            stemmed = stem_word(self,item);

            PyList_Append(res, stemmed);
            Py_DECREF(stemmed);
        }

        return res;

    } else {

        PyErr_SetString(PyExc_TypeError, "Unsupported datatype (must be string or list)");

        return NULL;
    }
}


static struct PyMethodDef Stemmer_methods[] =
    {
        { "language", (PyCFunction)Stemmer_language, METH_VARARGS,
            "language() -- Returns the language of the stemmer object",
        } ,
        { "getCacheSize", (PyCFunction)Stemmer_getCacheSize, METH_VARARGS,
            "getCacheSize() -- Returns the size of the stemmer cache",
        } ,
        { "setCacheSize", (PyCFunction)Stemmer_setCacheSize, METH_VARARGS,
            "setCacheSize(n) -- Set the size of the internal stemmer cache to n",
        } ,
        { "stem", (PyCFunction)Stemmer_stem, METH_VARARGS,
          "stem(word) -- Return stemmed word",
        },
        { NULL, NULL }		/* sentinel */
    };

static  PyObject *
Stemmer_getattr(Stemmer *self, char *name)
{
    return Py_FindMethod(Stemmer_methods, (PyObject *)self, name);
}

static char StemmerType__doc__[] = "Stemmer object";

static PyTypeObject StemmerType = {
                                      PyObject_HEAD_INIT(NULL)
                                      0,                            /*ob_size*/
                                      "Stemmer",                    /*tp_name*/
                                      sizeof(Stemmer),              /*tp_basicsize*/
                                      0,                            /*tp_itemsize*/
                                      /* methods */
                                      (destructor)Stemmer_dealloc,  /*tp_dealloc*/
                                      (printfunc)0,                 /*tp_print*/
                                      (getattrfunc)Stemmer_getattr, /*tp_getattr*/
                                      (setattrfunc)0,               /*tp_setattr*/
                                      (cmpfunc)0,                   /*tp_compare*/
                                      (reprfunc)0,                  /*tp_repr*/
                                      0,                            /*tp_as_number*/
                                      0,                            /*tp_as_sequence*/
                                      0,                            /*tp_as_mapping*/
                                      (hashfunc)0,                  /*tp_hash*/
                                      (ternaryfunc)0,               /*tp_call*/
                                      (reprfunc)0,                  /*tp_str*/

                                      /* Space for future expansion */
                                      0L,0L,0L,0L,
                                      StemmerType__doc__            /* Documentation string */
                                  };



static PyObject *
newStemmer(PyObject *modinfo, PyObject *args)
{
    Stemmer *self=NULL;
    char *language;

    if (! (self = PyObject_NEW(Stemmer, &StemmerType)))
        return NULL;

    self->cache = PyDict_New();
    self->cache_size = 5000;

    if (! (PyArg_ParseTuple(args,"s",&language)))
        return NULL;

    strcpy(self->language=malloc(strlen(language)+1),language);

    if (!strcmp(language,"porter")) {
        self->env = porter_create_env();
        self->stem_func = porter_stem;
    } else if (!strcmp(language,"german")) {
        self->env = german_create_env();
        self->stem_func = german_stem;
    } else if (!strcmp(language,"french")) {
        self->env = french_create_env();
        self->stem_func = french_stem;
    } else if (!strcmp(language,"dutch")) {
        self->env = dutch_create_env();
        self->stem_func = dutch_stem;
    } else if (!strcmp(language,"spanish")) {
        self->env = spanish_create_env();
        self->stem_func = spanish_stem;
    } else if (!strcmp(language,"english")) {
        self->env = english_create_env();
        self->stem_func = english_stem;
    } else if (!strcmp(language,"swedish")) {
        self->env = swedish_create_env();
        self->stem_func = swedish_stem;
    } else if (!strcmp(language,"italian")) {
        self->env = italian_create_env();
        self->stem_func = italian_stem;
    } else if (!strcmp(language,"portuguese")) {
        self->env = portuguese_create_env();
        self->stem_func = portuguese_stem;
    } else if (!strcmp(language,"danish")) {
        self->env = danish_create_env();
        self->stem_func = danish_stem;
    } else if (!strcmp(language,"russian")) {
        self->env = russian_create_env();
        self->stem_func = russian_stem;
    } else if (!strcmp(language,"norwegian")) {
        self->env = norwegian_create_env();
        self->stem_func = norwegian_stem;
    } else {
        char err[255];
        sprintf(err,"PyStemmer: Unsupported language '%s'",language);
        PyErr_SetString(PyExc_TypeError, err);
        goto err;
    }

    return (PyObject*)self;

err:
    Py_DECREF(self);

    return NULL;
}

static struct PyMethodDef Stemmer_module_methods[] =
    {
        { "availableStemmers", (PyCFunction) Stemmer_availableStemmers, METH_VARARGS,
            "availableStemmers() -- Return a list of all available stemmers"
        },
        { "Stemmer", (PyCFunction)newStemmer,
          METH_VARARGS,
          "Stemmer(language) "
          "-- Return a new language specific stemmer"
        },
        { NULL, NULL }
    };

static char Stemmer_module_documentation[] =
    "Stemmer module for eleven different languages.\n"
    "\n"
    "$Id: Stemmer.c,v 1.1.2.1 2002/02/13 16:26:26 andreasjung Exp $\n"
    ;


void
initStemmer(void)
{
    PyObject *m, *d;
    char *rev="$Revision: 1.1.2.1 $";

    /* Create the module and add the functions */
    m = Py_InitModule4("Stemmer", Stemmer_module_methods,
                       Stemmer_module_documentation,
                       (PyObject*)NULL,PYTHON_API_VERSION);

    /* Add some symbolic constants to the module */
    d = PyModule_GetDict(m);
    PyDict_SetItemString(d, "__version__",
                         PyString_FromStringAndSize(rev+11,strlen(rev+11)-2));

    if (PyErr_Occurred())
        Py_FatalError("can't initialize module Stemmer");
}