[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese - output.txt:1.1.2.1 portuguesestem.c:1.1.2.1 stem.h:1.1.2.1 stem.sbl:1.1.2.1 stemmer.html:1.1.2.1 voc.txt:1.1.2.1

Andreas Jung andreas@digicool.com
Wed, 13 Feb 2002 11:26:26 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese
In directory cvs.zope.org:/tmp/cvs-serv30556/PyStemmer/portuguese

Added Files:
      Tag: ajung-textindexng-branch
	output.txt portuguesestem.c stem.h stem.sbl stemmer.html 
	voc.txt 
Log Message:
added PyStemmer


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese/output.txt === (31916/32016 lines abridged)
a
 
…
Æ
aach
aacut
abacax
abad
abaet
abaf
abaf
abaix
abaix
abaix
abaix
abaix
abal
abal
abal
abal
abal
abalro
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abarrot
abarrot
abarrot
abast
abast
abast
abastec
abastec
abastec
abastec
abat
abat
abatedour
abat

[-=- -=- -=- 31916 lines omitted -=- -=- -=-]

zapping
zar
zaragoz
zarin
zaz
z‚
zebr
zebr
zebu
zec
zed
zeferin
zehnd
zelador
zelƒnd
zel
zen
zenild
zenild
zentel
zepellin
zequinh
zer
zerinh
zer
zer
zez
zhiling
zic
zilberman
zimb bu
zinc
zinh
zĄp
zirald
zit
zoar
zodĄac
zol
zoli
zon
zon
zoneament
zonz
zoobor
zooląg
zoomp
zul
zumb
zumb


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese/portuguesestem.c === (553/653 lines abridged)

#include "header.h"

/* Public entry point of the stemmer; the same prototype appears in stem.h. */
extern int portuguese_stem(struct SN_env * z);
/* File-local routines: one declaration per name in the `routines ( ... )`
 * list of stem.sbl, each prefixed with r_. All take the stemmer environment
 * and return int. */
static int r_residual_form(struct SN_env * z);
static int r_residual_suffix(struct SN_env * z);
static int r_verb_suffix(struct SN_env * z);
static int r_standard_suffix(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_R1(struct SN_env * z);
static int r_RV(struct SN_env * z);
static int r_mark_regions(struct SN_env * z);
static int r_postlude(struct SN_env * z);
static int r_prelude(struct SN_env * z);

/* Three-entry among table: the empty string plus the single bytes 0xC6 and
 * 0xE4, which stem.sbl defines as a-tilde and o-tilde. The first field of
 * each entry equals the byte length of its string. The layout of struct among
 * comes from header.h (not shown here).
 * NOTE(review): presumably the table behind the prelude's among in stem.sbl;
 * the code that uses it is not in view -- confirm. */
static struct among a_0[3] =
{
/*  0 */ { 0, (byte *)"", -1, 3, 0},
/*  1 */ { 1, (byte *)"\xC6" "", 0, 1, 0},
/*  2 */ { 1, (byte *)"\xE4" "", 0, 2, 0}
};

/* Three-entry among table: the empty string plus the two-character ASCII
 * sequences "a~" and "o~". The first field of each entry equals the byte
 * length of its string.
 * NOTE(review): presumably the table behind the postlude's among in stem.sbl
 * (which lists 'a~' and 'o~'); the code that uses it is not in view -- confirm. */
static struct among a_1[3] =
{
/*  0 */ { 0, (byte *)"", -1, 3, 0},
/*  1 */ { 2, (byte *)"a~", 0, 1, 0},
/*  2 */ { 2, (byte *)"o~", 0, 2, 0}
};

/* Four-entry among table holding "ic", "ad", "os" and "iv". Only the "iv"
 * entry carries a non-negative fourth field (1); the others have -1.
 * NOTE(review): these are the same four strings as the inner among of the
 * 'amente' case in stem.sbl, where only 'iv' has an attached action; the code
 * that uses this table is not in view -- confirm. */
static struct among a_2[4] =
{
/*  0 */ { 2, (byte *)"ic", -1, -1, 0},
/*  1 */ { 2, (byte *)"ad", -1, -1, 0},
/*  2 */ { 2, (byte *)"os", -1, -1, 0},
/*  3 */ { 2, (byte *)"iv", -1, 1, 0}
};

/* Two-entry among table holding "avel" and 0xA1 followed by "vel" (0xA1 is
 * the byte stem.sbl defines as i-acute). Both entries are 4 bytes long and
 * share the same fourth field (1).
 * NOTE(review): matches the inner among of the 'mente' case in stem.sbl
 * ('avel' and '{i'}vel'); the code that uses this table is not in view --
 * confirm. */
static struct among a_3[2] =
{
/*  0 */ { 4, (byte *)"avel", -1, 1, 0},
/*  1 */ { 4, (byte *)"\xA1" "vel", -1, 1, 0}
};

/* Three-entry among table holding "ic", "abil" and "iv"; all three entries
 * share the same fourth field (1).
 * NOTE(review): matches the inner among of the 'idade'/'idades' case in
 * stem.sbl ('abil', 'ic', 'iv'); the code that uses this table is not in
 * view -- confirm. */
static struct among a_4[3] =
{
/*  0 */ { 2, (byte *)"ic", -1, 1, 0},
/*  1 */ { 4, (byte *)"abil", -1, 1, 0},
/*  2 */ { 2, (byte *)"iv", -1, 1, 0}
};


[-=- -=- -=- 553 lines omitted -=- -=- -=-]


    {   int m = z->l - z->c; /* do, line 203 */
        {   int m = z->l - z->c; /* or, line 207 */
            {   int m = z->l - z->c; /* or, line 204 */
                if (!r_standard_suffix(z)) goto lab6; /* call standard_suffix, line 204 */
                goto lab5;
            lab6:
                z->c = z->l - m;
                if (!r_verb_suffix(z)) goto lab4; /* call verb_suffix, line 204 */
            }
        lab5:
            {   int m = z->l - z->c; /* do, line 205 */
                z->ket = z->c; /* [, line 205 */
                if (!(eq_s_b(z, 1, "i"))) goto lab7;
                z->bra = z->c; /* ], line 205 */
                {   int m_test = z->l - z->c; /* test, line 205 */
                    if (!(eq_s_b(z, 1, "c"))) goto lab7;
                    z->c = z->l - m_test;
                }
                if (!r_RV(z)) goto lab7; /* call RV, line 205 */
                slice_del(z); /* delete, line 205 */
            lab7:
                z->c = z->l - m;
            }
            goto lab3;
        lab4:
            z->c = z->l - m;
            if (!r_residual_suffix(z)) goto lab2; /* call residual_suffix, line 207 */
        }
    lab3:
    lab2:
        z->c = z->l - m;
    }
    {   int m = z->l - z->c; /* do, line 209 */
        if (!r_residual_form(z)) goto lab8; /* call residual_form, line 209 */
    lab8:
        z->c = z->l - m;
    }
    z->c = z->lb;    {   int c = z->c; /* do, line 211 */
        if (!r_postlude(z)) goto lab9; /* call postlude, line 211 */
    lab9:
        z->c = c;
    }
    return 1;
}

/* Create a stemming environment for the Portuguese stemmer.
 * Simply forwards to SN_create_env(0, 3, 0) and hands its result back to the
 * caller.
 * NOTE(review): the 3 presumably corresponds to the three integers (pV, p1,
 * p2) declared in stem.sbl -- confirm against SN_create_env's signature. */
extern struct SN_env * portuguese_create_env(void)
{
    return SN_create_env(0, 3, 0);
}

/* Dispose of a stemming environment by passing z straight to SN_close_env.
 * Nothing is returned. */
extern void portuguese_close_env(struct SN_env * z)
{
    SN_close_env(z);
}



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese/stem.h ===

/* Create a stemming environment; portuguesestem.c implements this as a call
 * to SN_create_env(0, 3, 0). */
extern struct SN_env * portuguese_create_env(void);
/* Dispose of an environment; portuguesestem.c forwards z to SN_close_env. */
extern void portuguese_close_env(struct SN_env * z);

/* Run the Portuguese stemmer (the `stem` external of stem.sbl) on the
 * environment z. The visible end of its definition returns 1. */
extern int portuguese_stem(struct SN_env * z);



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese/stem.sbl ===
// Names of every routine defined in this file.
routines (
           prelude postlude mark_regions
           RV R1 R2
           standard_suffix
           verb_suffix
           residual_suffix
           residual_form
)

// The only routine exported to callers.
externals ( stem )

// Integer marks set by mark_regions and tested by RV, R1 and R2.
integers ( pV p1 p2 )

// Character grouping v, defined below.
groupings ( v )

// Inside string literals, { and } delimit the names introduced by stringdef.
stringescapes {}

/* special characters: single-byte codes for the accented letters.
   NOTE(review): this comment originally said "in ISO Latin", but the values
   below (A0 = a-acute, 87 = c-cedilla, C6 = a-tilde, E4 = o-tilde, ...) are
   the IBM code page 850 (MS-DOS Latin 1) positions, not ISO-8859-1 --
   confirm the intended input encoding. */

stringdef a'   hex 'A0'  // a-acute
stringdef a^   hex '83'  // a-circumflex e.g. 'bota^nico
stringdef e'   hex '82'  // e-acute
stringdef e^   hex '88'  // e-circumflex
stringdef i'   hex 'A1'  // i-acute
stringdef o^   hex '93'  // o-circumflex
stringdef o'   hex 'A2'  // o-acute
stringdef u'   hex 'A3'  // u-acute
stringdef c,   hex '87'  // c-cedilla

stringdef a~   hex 'C6'  // a-tilde
stringdef o~   hex 'E4'  // o-tilde


// Vowel grouping: a e i o u, the acute forms of all five, and the circumflex
// forms of a, e and o. The tilde forms are not members.
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'

// prelude: walk the word repeatedly; wherever the single character a-tilde or
// o-tilde is found, replace it with the two-character sequence 'a~' or 'o~',
// otherwise step over one character.
define prelude as repeat (
    [substring] among(
        '{a~}' (<- 'a~')
        '{o~}' (<- 'o~')
    ) or next
)

// mark_regions: set the integer marks pV, p1 and p2. All three default to
// limit, and keep that value if the corresponding search below fails.
//   pV - word starts with a vowel: if the second letter is a non-vowel, go
//        past the next vowel; if it is a vowel, go past the next non-vowel.
//        Word starts with a non-vowel: if the second letter is also a
//        non-vowel, go past the next vowel; if it is a vowel, step one more
//        character.
//   p1 - go past the first vowel, then past the next non-vowel.
//   p2 - the same search again, continuing from p1.
define mark_regions as (

    $pV = limit
    $p1 = limit
    $p2 = limit  // defaults

    do (
        ( v (non-v gopast v) or (v gopast non-v) )
        or
        ( non-v (non-v gopast v) or (v next) )
        setmark pV
    )
    do (
        gopast v gopast non-v setmark p1
        gopast v gopast non-v setmark p2
    )
)

// postlude: the reverse of prelude. Walk the word repeatedly; wherever the
// two-character sequence 'a~' or 'o~' is found, replace it with the single
// character a-tilde or o-tilde, otherwise step over one character.
define postlude as repeat (
    [substring] among(
        'a~' (<- '{a~}')
        'o~' (<- '{o~}')
    ) or next
)

backwardmode (

    // Region tests: each succeeds when the cursor is at or beyond the
    // corresponding mark (pV, p1, p2) set by mark_regions.
    define RV as $pV <= cursor
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    // standard_suffix: find the longest of the endings listed below at the
    // end of the word and run the action attached to its group. The routine
    // fails if no ending matches or if the group's region test fails.
    define standard_suffix as (
        [substring] among(

            // Plain endings: deleted when they lie in R2.
            'eza' 'ezas'
            'ico' 'ica' 'icos' 'icas'
            'ismo' 'ismos'
            '{a'}vel'
            '{i'}vel'
            'ista' 'istas'
            'oso' 'osa' 'osos' 'osas'
            'amento' 'amentos'
            'imento' 'imentos'

           'adora' 'ador' 'a{c,}a~o'
           'adoras' 'adores' 'a{c,}o~es'  // no -ic test
            (
                R2 delete
            )
            // In R2, replaced by 'log'.
            'log{i'}a'
            'log{i'}as'
            (
                R2 <- 'log'
            )
            // In R2, replaced by 'u'.
            // NOTE(review): 'uci{o'}n' / 'uciones' look like Spanish endings;
            // the Portuguese forms would presumably be 'u{c,}a~o' /
            // 'u{c,}o~es'. Confirm against upstream before changing: the
            // generated C and output.txt depend on this list.
            'uci{o'}n' 'uciones'
            (
                R2 <- 'u'
            )
            // In R2, replaced by 'ente'.
            '{e^}ncia' '{e^}ncias'
            (
                R2 <- 'ente'
            )
            // In R1, deleted; then optionally a preceding 'iv', 'os', 'ic' or
            // 'ad' in R2 is deleted, and after 'iv' a further 'at' in R2.
            'amente'
            (
                R1 delete
                try (
                    [substring] R2 delete among(
                        'iv' (['at'] R2 delete)
                        'os'
                        'ic'
                        'ad'
                    )
                )
            )
            // In R2, deleted; then optionally a preceding 'avel' or
            // '{i'}vel' in R2 is deleted.
            // NOTE(review): 'avel' is unaccented here while the top-level
            // list uses '{a'}vel'; table a_3 in portuguesestem.c has the
            // same unaccented "avel" -- confirm this is intended.
            'mente'
            (
                R2 delete
                try (
                    [substring] among(
                        'avel'
                        '{i'}vel' (R2 delete)
                    )
                )
            )
            // In R2, deleted; then optionally a preceding 'abil', 'ic' or
            // 'iv' in R2 is deleted.
            'idade'
            'idades'
            (
                R2 delete
                try (
                    [substring] among(
                        'abil'
                        'ic'
                        'iv'   (R2 delete)
                    )
                )
            )
            // In R2, deleted; then optionally a preceding 'at' in R2.
            'iva' 'ivo'
            'ivas' 'ivos'
            (
                R2 delete
                try (
                    ['at'] R2 delete // but not a further   ['ic'] R2 delete
                )
            )
            // Replaced by 'ir' when in RV and preceded by 'e'.
            'ira' 'iras'
            (
                RV 'e'  // -eira -eiras usually non-verbal
                <- 'ir'
            )
        )
    )

    // verb_suffix: with the limit temporarily moved to the pV mark, find the
    // longest of the listed verb endings at the end of the word and delete
    // it. The routine fails if none of them matches within that limit.
    define verb_suffix as setlimit tomark pV for (
        [substring] among(
            'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
            'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
            'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
            'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
            'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
            'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
            'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
            'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
            'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
            'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
            '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
            '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
            '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
            'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
            'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
            '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'

            'ira' 'iras'
                (delete)
        )
    )

    // residual_suffix: if the word ends in 'os' or in one of the single
    // vowels a, i, o (plain or acute) and that ending lies in RV, delete it.
    define residual_suffix as (
        [substring] among(
            'os'
            'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
                ( RV delete )
        )
    )

    // residual_form:
    //   - a final e (plain, acute or circumflex) in RV is deleted; then, if
    //     what precedes is 'u' after 'g' or 'i' after 'c', that 'u' / 'i' is
    //     also deleted when it lies in RV (otherwise this case fails);
    //   - a final c-cedilla is replaced by plain 'c'.
    define residual_form as (
        [substring] among(
            'e' '{e'}' '{e^}'
                ( RV delete [('u'] test 'g') or
                             ('i'] test 'c') RV delete )
            '{c,}' (<-'c')
        )
    )
)

// stem: the exported entry point. Each step is wrapped in `do`, so a failing
// step leaves the cursor where it was and the next step still runs.
//   1. prelude, then mark_regions, working forwards.
//   2. Working backwards from the end of the word: try standard_suffix, else
//      verb_suffix; if either succeeded, additionally delete a final 'i'
//      that follows 'c' when it lies in RV. If neither succeeded, try
//      residual_suffix instead.
//   3. Still backwards: residual_form.
//   4. postlude, working forwards again.
define stem as (
    do prelude
    do mark_regions
    backwards (
        do (
            ( standard_suffix or verb_suffix
              do ( ['i'] test 'c' RV delete )
            )
            or residual_suffix
        )
        do residual_form
    )
    do postlude
)


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese/stemmer.html === (478/578 lines abridged)

<HTML>
<HEAD>
<TITLE>Portuguese stemming algorithm</TITLE></HEAD>
<BODY BGCOLOR=WHITE>
<TABLE WIDTH=75% ALIGN=CENTER COLS=1>
<H1 ALIGN=CENTER>Portuguese stemming algorithm</H1>

<TR><TD>
<BR>&nbsp;<H2>Links to resources</H2>

<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="stem.sbl">    The stemmer in Snowball</A>
<TR><TD><A HREF="stem.c">      The ANSI C stemmer</A>
<TR><TD><A HREF="stem.h">      - and its header</A>
<TR><TD><A HREF="voc.txt">     Sample Portuguese vocabulary (ISO Latin codings)</A>
<TR><TD><A HREF="output.txt">  Its stemmed equivalent</A>
<TR><TD><A HREF="diffs.txt">   Vocabulary + stemmed equivalent in pure ASCII</A>
<TR><TD><A HREF="tarball.tgz"> Tar-gzipped file of all of the above</A>
</TABLE></DL>

<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="../texts/romance.html">
                  Romance language stemmers</A>
</TABLE></DL>

</TR>

<TR><TD BGCOLOR="lightpink">

<BR><BR>

Here is a sample of Portuguese vocabulary, with the stemmed forms that will
be generated with this algorithm.

<BR><BR>



<DL><DD><TABLE CELLPADDING=0>
<TR><TD>  <B>word</B> </TD>
 <TD></TD><TD> </TD>
 <TD></TD><TD> <B>stem</B> </TD>
 <TD></TD><TD>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</TD>
 <TD></TD><TD> <B>word</B> </TD>
 <TD></TD><TD> </TD>
 <TD></TD><TD> <B>stem</B> </TD>
</TR>

<TR><TD>

[-=- -=- -=- 478 lines omitted -=- -=- -=-]

            'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
            '{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
            '{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
            '{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
            'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
            'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
            '{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'

            'ira' 'iras'
                (delete)
        )
    )

    define residual_suffix as (
        [substring] among(
            'os'
            'a' 'i' 'o' '{a'}' '{i'}' '{o'}'
                ( RV delete )
        )
    )

    define residual_form as (
        [substring] among(
            'e' '{e'}' '{e^}'
                ( RV delete [('u'] test 'g') or
                             ('i'] test 'c') RV delete )
            '{c,}' (<-'c')
        )
    )
)

define stem as (
    do prelude
    do mark_regions
    backwards (
        do (
            ( standard_suffix or verb_suffix
              do ( ['i'] test 'c' RV delete )
            )
            or residual_suffix
        )
        do residual_form
    )
    do postlude
)
</DL>
</PRE></FONT>
</TABLE>
</BODY>
</HTML>


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/portuguese/voc.txt === (31916/32016 lines abridged)
a
 
…
Æ
aacho
aacute
abacaxi
abade
abaet‚
abafar
abafaram
abaixa
abaixe
abaixei
abaixo
abaixou
abalada
abalado
abalaram
abalos
abalou
abalroado
abandona
abandon 
abandonada
abandonadas
abandonado
abandonados
abandonam
abandonando
abandonar
abandonara
abandonaram
abandonasse
abandono
abandonou
abarrotado
abarrotados
abarrotou
abastada
abastado
abastados
abastecem
abastecer
abastecida
abastecimento
abata
abate
abatedouro
abatem

[-=- -=- -=- 31916 lines omitted -=- -=- -=-]

zapping
zara
zaragoza
zarin
zaz
z‚
zebra
zebras
zebu
zeca
zedias
zeferina
zehnder
zelador
zelƒndia
zelar
zen
zenilda
zenildo
zentel
zepellin
zequinha
zerados
zerinho
zero
zerou
zez‚
zhiling
zico
zilberman
zimb bue
zinco
zinhos
zĄper
ziraldo
zita
zoar
zodĄaco
zola
zolio
zona
zonas
zoneamento
zonzo
zooboros
zoolągico
zoomp
zul
zumbi
zumbido