[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/german - germanstem.c:1.1.2.1 output.txt:1.1.2.1 stem.h:1.1.2.1 stem.sbl:1.1.2.1 stemmer.html:1.1.2.1 voc.txt:1.1.2.1

Andreas Jung andreas@digicool.com
Wed, 13 Feb 2002 11:26:23 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/german
In directory cvs.zope.org:/tmp/cvs-serv30556/PyStemmer/german

Added Files:
      Tag: ajung-textindexng-branch
	germanstem.c output.txt stem.h stem.sbl stemmer.html voc.txt 
Log Message:
added PyStemmer


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/german/germanstem.c ===

#include "header.h"

/* Entry point exported from this file. */
extern int german_stem(struct SN_env * z);

/* File-local routines, declared ahead of use and listed in the order in
 * which they are defined below. */
static int r_prelude(struct SN_env * z);
static int r_mark_regions(struct SN_env * z);
static int r_postlude(struct SN_env * z);
static int r_R1(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_standard_suffix(struct SN_env * z);

/* Strings looked up by r_postlude() through find_among().
 * Per entry: the first field equals the byte length of the string and the
 * fourth field is the case number r_postlude() switches on.  The third field
 * is -1 or the index of another entry (here always entry 0, the empty
 * string); the exact meaning of the third and fifth fields is defined by
 * struct among in header.h, which is not visible here -- TODO confirm. */
static struct among a_0[6] =
{
    { 0, (byte *)"",        -1, 6, 0 },  /* [0] nothing matched: case 6, step over one character */
    { 1, (byte *)"U",        0, 2, 0 },  /* [1] case 2: replaced by "u" */
    { 1, (byte *)"Y",        0, 1, 0 },  /* [2] case 1: replaced by "y" */
    { 1, (byte *)"\x81" "",  0, 5, 0 },  /* [3] byte 0x81 (u" in stem.sbl), case 5: replaced by "u" */
    { 1, (byte *)"\x84" "",  0, 3, 0 },  /* [4] byte 0x84 (a" in stem.sbl), case 3: replaced by "a" */
    { 1, (byte *)"\x94" "",  0, 4, 0 }   /* [5] byte 0x94 (o" in stem.sbl), case 4: replaced by "o" */
};

/* Step 1 suffixes, searched backwards by r_standard_suffix() with
 * find_among_b().  The fourth field is the switch case taken there:
 * 1 = delete unconditionally, 2 = delete only after a valid s-ending.
 * "es" carries 5 in its third field, the index of "s" (its own suffix);
 * all other entries carry -1. */
static struct among a_1[7] =
{
    { 1, (byte *)"e",   -1, 1, 0 },  /* [0] */
    { 2, (byte *)"em",  -1, 1, 0 },  /* [1] */
    { 2, (byte *)"en",  -1, 1, 0 },  /* [2] */
    { 3, (byte *)"ern", -1, 1, 0 },  /* [3] */
    { 2, (byte *)"er",  -1, 1, 0 },  /* [4] */
    { 1, (byte *)"s",   -1, 2, 0 },  /* [5] */
    { 2, (byte *)"es",   5, 1, 0 }   /* [6] */
};

/* Step 2 suffixes, searched backwards by r_standard_suffix() with
 * find_among_b().  Fourth field: 1 = delete unconditionally, 2 = delete only
 * after a valid st-ending that itself has three characters before it.
 * "est" carries 2 in its third field, the index of "st" (its own suffix). */
static struct among a_2[4] =
{
    { 2, (byte *)"en",  -1, 1, 0 },  /* [0] */
    { 2, (byte *)"er",  -1, 1, 0 },  /* [1] */
    { 2, (byte *)"st",  -1, 2, 0 },  /* [2] */
    { 3, (byte *)"est",  2, 1, 0 }   /* [3] */
};

/* Secondary suffixes tried by r_standard_suffix() once "keit" has been
 * removed; both entries select case 1 (delete) of the nested switch. */
static struct among a_3[2] =
{
    { 2, (byte *)"ig",   -1, 1, 0 },  /* [0] */
    { 4, (byte *)"lich", -1, 1, 0 }   /* [1] */
};

/* Step 3 ("d-suffixes"), searched backwards by r_standard_suffix() with
 * find_among_b().  The fourth field picks the switch case there:
 *   1 = "end"/"ung", 2 = "ig"/"ik"/"isch", 3 = "lich"/"heit", 4 = "keit". */
static struct among a_4[8] =
{
    { 3, (byte *)"end",  -1, 1, 0 },  /* [0] */
    { 2, (byte *)"ig",   -1, 2, 0 },  /* [1] */
    { 3, (byte *)"ung",  -1, 1, 0 },  /* [2] */
    { 4, (byte *)"lich", -1, 3, 0 },  /* [3] */
    { 4, (byte *)"isch", -1, 2, 0 },  /* [4] */
    { 2, (byte *)"ik",   -1, 2, 0 },  /* [5] */
    { 4, (byte *)"heit", -1, 3, 0 },  /* [6] */
    { 4, (byte *)"keit", -1, 4, 0 }   /* [7] */
};


/* Character groupings handed to in_grouping()/out_grouping() together with
 * the lowest and highest character code they cover.  The implementation of
 * those helpers is not visible here; the values read as bitmaps with one bit
 * per character code, least significant bit first, starting at the lowest
 * code -- TODO confirm against header.h. */

/* Vowels, used with the range 97..148.  Decoded as described above:
 * a e i o u y plus the bytes 0x81, 0x84 and 0x94. */
static byte g_v[] = { 0x11, 0x41, 0x10, 0x01, 0x09, 0x00, 0x08 };

/* Valid s-endings, used with the range 98..116: b d f g h k l m n r t. */
static byte g_s_ending[] = { 0x75, 0x1E, 0x05 };

/* Valid st-endings, used with the range 98..116: as above without r. */
static byte g_st_ending[] = { 0x75, 0x1E, 0x04 };

/* Prelude of the German stemmer (stem.sbl, "define prelude").
 *
 * Pass 1 walks the whole word and replaces every byte 0xE1 (the {ss} string
 * of stem.sbl) by "ss"; the cursor is put back to where it started.
 * Pass 2 repeatedly looks for a vowel followed by 'u' or 'y' followed by
 * another vowel and rewrites that 'u'/'y' as 'U'/'Y'.
 *
 * Always returns 1. */
static int r_prelude(struct SN_env * z) {
    int entry_pos = z->c;

    /* Pass 1: test repeat ( ['{ss}'] <- 'ss' or next ) */
    for (;;) {
        int here = z->c;

        z->bra = here;
        if (eq_s(z, 1, "\xE1" "")) {
            z->ket = z->c;
            slice_from_s(z, 2, "ss");
            continue;
        }
        /* No match at this position: step over one character, or stop at
         * the end of the word. */
        z->c = here;
        if (z->c >= z->l) break;
        z->c++;
    }
    z->c = entry_pos;   /* "test": discard the cursor movement */

    /* Pass 2: repeat goto ( v [('u'] v <- 'U') or ('y'] v <- 'Y') ) */
    for (;;) {
        int round_start = z->c;
        int replaced = 0;

        /* Scan forward for the first position where the pattern matches. */
        for (;;) {
            int scan_pos = z->c;

            if (in_grouping(z, g_v, 97, 148)) {
                int after_vowel = z->c;

                z->bra = after_vowel;
                if (eq_s(z, 1, "u")) {
                    z->ket = z->c;
                    if (in_grouping(z, g_v, 97, 148)) {
                        slice_from_s(z, 1, "U");
                        replaced = 1;
                    }
                }
                if (!replaced) {
                    /* Second alternative of the "or": retry with 'y'. */
                    z->c = after_vowel;
                    if (eq_s(z, 1, "y")) {
                        z->ket = z->c;
                        if (in_grouping(z, g_v, 97, 148)) {
                            slice_from_s(z, 1, "Y");
                            replaced = 1;
                        }
                    }
                }
            }

            /* Both on success and on failure the cursor goes back to where
             * this attempt began. */
            z->c = scan_pos;
            if (replaced) break;
            if (z->c >= z->l) break;    /* reached the end without a match */
            z->c++;
        }

        if (!replaced) {
            z->c = round_start;
            break;
        }
    }
    return 1;
}

/* Computes the region marks p1 (stored in z->I[0]) and p2 (stored in
 * z->I[1]); see "define mark_regions" in stem.sbl.
 *
 * Both marks start out at the limit z->l.  p1 becomes the position after the
 * first non-vowel that follows a vowel, raised to 3 if it is smaller; p2 is
 * found the same way, scanning on from where the p1 search stopped.
 *
 * Returns 1 when both marks were found.  Returns 0 as soon as one of the
 * scans reaches the end of the word; marks not reached by then keep the
 * value they had at that point. */
static int r_mark_regions(struct SN_env * z) {
    z->I[0] = z->l;
    z->I[1] = z->l;

    /* gopast v */
    while (!in_grouping(z, g_v, 97, 148)) {
        if (z->c >= z->l) return 0;
        z->c++;
    }
    /* gopast non-v */
    while (!out_grouping(z, g_v, 97, 148)) {
        if (z->c >= z->l) return 0;
        z->c++;
    }
    z->I[0] = z->c;                     /* setmark p1 */

    /* try($p1 < 3  $p1 = 3): at least three characters before p1 */
    if (z->I[0] < 3) {
        z->I[0] = 3;
    }

    /* gopast v */
    while (!in_grouping(z, g_v, 97, 148)) {
        if (z->c >= z->l) return 0;
        z->c++;
    }
    /* gopast non-v */
    while (!out_grouping(z, g_v, 97, 148)) {
        if (z->c >= z->l) return 0;
        z->c++;
    }
    z->I[1] = z->c;                     /* setmark p2 */

    return 1;
}

/* Postlude (stem.sbl, "define postlude"): walks forward through the word,
 * turning 'Y' and 'U' back into 'y' and 'u' and replacing the bytes 0x84,
 * 0x94 and 0x81 by 'a', 'o' and 'u'.  Any other character is stepped over.
 * The loop ends when the end of the word is reached (or find_among() reports
 * no match); the cursor is then restored to the start of that last round.
 *
 * Always returns 1. */
static int r_postlude(struct SN_env * z) {
    for (;;) {
        int round_start = z->c;
        int which;
        int keep_going = 1;

        z->bra = z->c;
        which = find_among(z, a_0, 6);
        if (which == 0) {
            keep_going = 0;
        } else {
            z->ket = z->c;
            switch (which) {
                case 1:
                    slice_from_s(z, 1, "y");
                    break;
                case 2:
                case 5:
                    slice_from_s(z, 1, "u");
                    break;
                case 3:
                    slice_from_s(z, 1, "a");
                    break;
                case 4:
                    slice_from_s(z, 1, "o");
                    break;
                case 6:
                    /* Only the empty string matched: advance by one
                     * character, or finish at the end of the word. */
                    if (z->c < z->l) {
                        z->c++;
                    } else {
                        keep_going = 0;
                    }
                    break;
            }
        }

        if (!keep_going) {
            z->c = round_start;
            break;
        }
    }
    return 1;
}

/* R1 test: returns 1 when the cursor is at or beyond the p1 mark held in
 * z->I[0], otherwise 0. */
static int r_R1(struct SN_env * z) {
    return z->I[0] <= z->c;
}

/* R2 test: returns 1 when the cursor is at or beyond the p2 mark held in
 * z->I[1], otherwise 0. */
static int r_R2(struct SN_env * z) {
    return z->I[1] <= z->c;
}

/* Suffix removal, run in backward mode (stem.sbl, "define standard_suffix").
 *
 * Performs the three steps of the algorithm one after the other.  Each step
 * is a Snowball "do": it remembers the cursor as a distance from the limit
 * (m = z->l - z->c), tries its rule, and restores the cursor afterwards
 * whether or not the rule applied.
 *
 *   Step 1: e em en ern er es (delete), s (delete after a valid s-ending),
 *           provided the suffix lies in R1.
 *   Step 2: en er est (delete), st (delete after a valid st-ending that has
 *           three more characters before it), provided the suffix lies in R1.
 *   Step 3: end ung / ig ik isch / lich heit / keit, provided the suffix
 *           lies in R2, with the follow-up deletions noted below.
 *
 * Always returns 1. */
static int r_standard_suffix(struct SN_env * z) {
    int among_var;
    /* ---- Step 1 ---- */
    {   int m = z->l - z->c; /* do, line 72 */
        z->ket = z->c; /* [, line 73 */
        among_var = find_among_b(z, a_1, 7); /* substring, line 73 */
        if (!(among_var)) goto lab0;
        z->bra = z->c; /* ], line 73 */
        if (!r_R1(z)) goto lab0; /* call R1, line 73 */
        switch(among_var) {
            case 0: goto lab0;
            case 1:
                slice_del(z); /* delete, line 75 */
                break;
            case 2:
                /* 's' is only removed after one of b d f g h k l m n r t */
                if (!(in_grouping_b(z, g_s_ending, 98, 116))) goto lab0;
                slice_del(z); /* delete, line 78 */
                break;
        }
    lab0:
        z->c = z->l - m;
    }
    /* ---- Step 2 ---- */
    {   int m = z->l - z->c; /* do, line 82 */
        z->ket = z->c; /* [, line 83 */
        among_var = find_among_b(z, a_2, 4); /* substring, line 83 */
        if (!(among_var)) goto lab1;
        z->bra = z->c; /* ], line 83 */
        if (!r_R1(z)) goto lab1; /* call R1, line 83 */
        switch(among_var) {
            case 0: goto lab1;
            case 1:
                slice_del(z); /* delete, line 85 */
                break;
            case 2:
                /* 'st' needs a valid st-ending before it ... */
                if (!(in_grouping_b(z, g_st_ending, 98, 116))) goto lab1;
                /* ... which in turn needs three characters before it:
                 * hopping back 3 must stay within [z->lb, z->l]. */
                {   int c = z->c - 3;
                    if (z->lb > c || c > z->l) goto lab1;
                    z->c = c; /* hop, line 88 */
                }
                slice_del(z); /* delete, line 88 */
                break;
        }
    lab1:
        z->c = z->l - m;
    }
    /* ---- Step 3 ---- */
    {   int m = z->l - z->c; /* do, line 92 */
        z->ket = z->c; /* [, line 93 */
        among_var = find_among_b(z, a_4, 8); /* substring, line 93 */
        if (!(among_var)) goto lab2;
        z->bra = z->c; /* ], line 93 */
        if (!r_R2(z)) goto lab2; /* call R2, line 93 */
        switch(among_var) {
            case 0: goto lab2;
            case 1:
                /* end, ung: delete; then also delete a preceding 'ig' that
                 * is in R2 and not itself preceded by 'e'. */
                slice_del(z); /* delete, line 95 */
                {   int m = z->l - z->c; /* try, line 96 */
                    z->ket = z->c; /* [, line 96 */
                    if (!(eq_s_b(z, 2, "ig"))) { z->c = z->l - m; goto lab3; }
                    z->bra = z->c; /* ], line 96 */
                    {   int m = z->l - z->c; /* not, line 96 */
                        if (!(eq_s_b(z, 1, "e"))) goto lab4;
                        { z->c = z->l - m; goto lab3; }
                    lab4:
                        z->c = z->l - m;
                    }
                    if (!r_R2(z)) { z->c = z->l - m; goto lab3; } /* call R2, line 96 */
                    slice_del(z); /* delete, line 96 */
                lab3: ; /* empty statement: before C23 a label must be followed by a statement */
                }
                break;
            case 2:
                /* ig, ik, isch: delete unless preceded by 'e'. */
                {   int m = z->l - z->c; /* not, line 99 */
                    if (!(eq_s_b(z, 1, "e"))) goto lab5;
                    goto lab2;
                lab5:
                    z->c = z->l - m;
                }
                slice_del(z); /* delete, line 99 */
                break;
            case 3:
                /* lich, heit: delete; then also delete a preceding 'er' or
                 * 'en' that is in R1. */
                slice_del(z); /* delete, line 102 */
                {   int m = z->l - z->c; /* try, line 103 */
                    z->ket = z->c; /* [, line 104 */
                    {   int m = z->l - z->c; /* or, line 104 */
                        if (!(eq_s_b(z, 2, "er"))) goto lab8;
                        goto lab7;
                    lab8:
                        z->c = z->l - m;
                        if (!(eq_s_b(z, 2, "en"))) { z->c = z->l - m; goto lab6; }
                    }
                lab7:
                    z->bra = z->c; /* ], line 104 */
                    if (!r_R1(z)) { z->c = z->l - m; goto lab6; } /* call R1, line 104 */
                    slice_del(z); /* delete, line 104 */
                lab6: ; /* empty statement: before C23 a label must be followed by a statement */
                }
                break;
            case 4:
                /* keit: delete; then also delete a preceding 'lich' or 'ig'
                 * that is in R2. */
                slice_del(z); /* delete, line 108 */
                {   int m = z->l - z->c; /* try, line 109 */
                    z->ket = z->c; /* [, line 110 */
                    among_var = find_among_b(z, a_3, 2); /* substring, line 110 */
                    if (!(among_var)) { z->c = z->l - m; goto lab9; }
                    z->bra = z->c; /* ], line 110 */
                    if (!r_R2(z)) { z->c = z->l - m; goto lab9; } /* call R2, line 110 */
                    switch(among_var) {
                        case 0: { z->c = z->l - m; goto lab9; }
                        case 1:
                            slice_del(z); /* delete, line 112 */
                            break;
                    }
                lab9: ; /* empty statement: before C23 a label must be followed by a statement */
                }
                break;
        }
    lab2:
        z->c = z->l - m;
    }
    return 1;
}

/* Stems the word held in z (stem.sbl, "define stem").
 *
 * Runs, in order: prelude, mark_regions, standard_suffix (in backward mode)
 * and postlude.  Each is a Snowball "do": its result is ignored and the
 * cursor is restored afterwards.
 *
 * Always returns 1. */
extern int german_stem(struct SN_env * z) {
    int fwd_mark;   /* cursor saved around a forward-mode routine */
    int back_mark;  /* cursor saved as a distance from the limit */

    /* do prelude */
    fwd_mark = z->c;
    (void) r_prelude(z);
    z->c = fwd_mark;

    /* do mark_regions */
    fwd_mark = z->c;
    (void) r_mark_regions(z);
    z->c = fwd_mark;

    /* backwards: the current position becomes the backward limit and the
     * cursor moves to the end of the word. */
    z->lb = z->c;
    z->c = z->l;

    /* do standard_suffix */
    back_mark = z->l - z->c;
    (void) r_standard_suffix(z);
    z->c = z->l - back_mark;

    /* Back to forward mode, starting again from the backward limit. */
    z->c = z->lb;

    /* do postlude */
    fwd_mark = z->c;
    (void) r_postlude(z);
    z->c = fwd_mark;

    return 1;
}

/* Creates the environment used by german_stem() by calling
 * SN_create_env(0, 2, 0).  The 2 presumably reserves the two integer slots
 * z->I[0] and z->I[1] used for p1 and p2 -- TODO confirm against
 * SN_create_env, which is not defined in this file. */
extern struct SN_env * german_create_env(void)
{
    return SN_create_env(0, 2, 0);
}

/* Hands the environment z to SN_close_env(). */
extern void german_close_env(struct SN_env * z)
{
    SN_close_env(z);
}



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/german/output.txt === (34933/35033 lines abridged)
a
aa
aalglatt
aargau
aargau
aas
aasgeruch
aasholl
ab
abaddon
abart
abbeisst
abbild
abbild
abbiss
abbrech
abbruch
abend
abendbrot
abenddammer
abenddunkl
abend
abendess
abendherr
abendhimmel
abendland
abendland
abend
abendlicht
abendluft
abendmahl
abendrot
abend
abendschein
abendschein
abendschoppch
abendsonn
abend
abendtanz
abendwind
abendzeit
abenteu
abenteu
abenteu
abenteu
abenteu
abenteu
abenteu
abenteuerspielplatz
abenteur

[-=- -=- -=- 34933 lines omitted -=- -=- -=-]

zweiundachtz
zweiunddreiss
zweiundfunfz
zweiundsechz
zweiundvierz
zweiundzwanz
zwerg
zwergfink
zwickt
zwieback
zwiegesprach
zwiegsang
zwiespalt
zwiespalt
zwiesprach
zwillich
zwillingsmilchflasch
zwing
zwingend
zwing
zwingt
zwinkert
zwinkert
zwirchkamm
zwirn
zwirnfabr
zwisch
zwischendurch
zwischenraum
zwischenzeit
zwitschermus
zwitsch
zwitschernd
zwitschert
zwoa
zwolf
zwolferstang
zwolfhundert
zwolfjahr
zwolfmal
zwolft
zwung
zylind
zylinderhut
zylinderhut
zylinderhut
zynik
zynisch
zypern
zypress


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/german/stem.h ===

#ifndef PYSTEMMER_GERMAN_STEM_H
#define PYSTEMMER_GERMAN_STEM_H

/* Public interface of the German stemmer.
 *
 * struct SN_env is only used through pointers here, so a forward declaration
 * is enough; declaring the tag explicitly at file scope keeps the prototypes
 * below referring to the same type regardless of their order. */
struct SN_env;

/* Returns a new environment for use with german_stem(). */
extern struct SN_env * german_create_env(void);

/* Releases an environment obtained from german_create_env(). */
extern void german_close_env(struct SN_env * z);

/* Stems the word held in z; the implementation in germanstem.c always
 * returns 1. */
extern int german_stem(struct SN_env * z);

#endif /* PYSTEMMER_GERMAN_STEM_H */



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/german/stem.sbl ===
// German stemmer: names declared here are defined further down.
routines (
           prelude postlude
           mark_regions
           R1 R2
           standard_suffix
)

// The only routine visible to callers.
externals ( stem )

// Region marks set by mark_regions and tested by R1 and R2.
integers ( p1 p2 )

groupings ( v s_ending st_ending )

// Inside string literals, {name} refers to a stringdef below.
stringescapes {}

/* special characters: a-umlaut, o-umlaut, u-umlaut and sharp s.
   The hex values are MS-DOS (code page 437/850) codes, not ISO Latin-1,
   where the same letters would be E4, F6, FC and DF. */

stringdef a"   hex '84'
stringdef o"   hex '94'
stringdef u"   hex '81'
stringdef ss   hex 'E1'

// Vowels, including the three umlauts.
define v 'aeiouy{a"}{o"}{u"}'

// Letters that may precede a removable 's' (step 1) ...
define s_ending  'bdfghklmnrt'
// ... and a removable 'st' (step 2): the same set without 'r'.
define st_ending s_ending - 'r'

// prelude: normalise the word before the suffix steps run.
define prelude as (

    // Replace every sharp s by 'ss'. 'test' puts the cursor back where it
    // started once the whole word has been scanned.
    test repeat (
        (
            ['{ss}'] <- 'ss'
        ) or next
    )

    // Put 'u' and 'y' standing between two vowels into upper case.
    // Upper-case 'U' and 'Y' are not members of grouping v.
    repeat goto (
        v [('u'] v <- 'U') or
           ('y'] v <- 'Y')
    )
)

// mark_regions: set p1 and p2, the start positions of regions R1 and R2.
define mark_regions as (

    // Default: both regions are empty (they start at the end of the word).
    $p1 = limit
    $p2 = limit

    // p1 is the position after the first non-vowel that follows a vowel ...
    gopast v  gopast non-v  setmark p1
    // ... moved right if necessary so that 3 characters precede it.
    try($p1 < 3  $p1 = 3)  // at least 3
    // p2 is found the same way, continuing from where the p1 scan stopped.
    gopast v  gopast non-v  setmark p2

)

// postlude: undo the upper-casing done by prelude and strip the umlaut
// accents, one character at a time; any other character is skipped by 'next'.
define postlude as repeat (

    [substring] among(
        'Y'    (<- 'y')
        'U'    (<- 'u')
        '{a"}' (<- 'a')
        '{o"}' (<- 'o')
        '{u"}' (<- 'u')
    ) or next

)

// Everything in this section works from the end of the word towards its start.
backwardmode (

    // The cursor is inside R1 / R2 when it is at or after p1 / p2.
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    // standard_suffix: the three suffix-removal steps. Each 'do' runs
    // regardless of whether the previous one removed anything.
    define standard_suffix as (
        // Step 1: longest of the suffixes below, deleted if in R1.
        do (
            [substring] R1 among(
                'e' 'em' 'en' 'ern' 'er' 'es'
                (   delete
                )
                // 's' only goes when preceded by a valid s-ending.
                's'
                (   s_ending delete
                )
            )
        )
        // Step 2: longest of the suffixes below, deleted if in R1.
        do (
            [substring] R1 among(
                'en' 'er' 'est'
                (   delete
                )
                // 'st' only goes when preceded by a valid st-ending which is
                // itself preceded by at least 3 characters.
                'st'
                (   st_ending hop 3 delete
                )
            )
        )
        // Step 3 (d-suffixes): longest of the suffixes below, handled if in R2.
        do (
            [substring] R2 among(
                // Delete; then delete a preceding 'ig' too if it is in R2
                // and not preceded by 'e'.
                'end' 'ung'
                (   delete
                    try (['ig'] not 'e' R2 delete)
                )
                // Delete unless preceded by 'e'.
                'ig' 'ik' 'isch'
                (   not 'e' delete
                )
                // Delete; then delete a preceding 'er' or 'en' too if it is
                // in R1.
                'lich' 'heit'
                (   delete
                    try (
                        ['er' or 'en'] R1 delete
                    )
                )
                // Delete; then delete a preceding 'lich' or 'ig' too if it
                // is in R2.
                'keit'
                (   delete
                    try (
                        [substring] R2 among(
                            'lich' 'ig'
                            (   delete
                            )
                        )
                    )
                )
            )
        )
    )
)

// stem: the exported routine. Each stage is wrapped in 'do', so the next
// stage runs whatever the outcome of the previous one; standard_suffix is
// the only stage run backwards from the end of the word.
define stem as (
    do prelude
    do mark_regions
    backwards
        do standard_suffix
    do postlude
)


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/german/stemmer.html ===

<HTML>
<HEAD>
<TITLE>German stemming algorithm</TITLE></HEAD>
<BODY BGCOLOR=WHITE>
<TABLE WIDTH=75% ALIGN=CENTER COLS=1>
<H1 ALIGN=CENTER>German stemming algorithm</H1>

<TR><TD>
<BR>&nbsp;<H2>Links to resources</H2>

<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="stem.sbl">    The stemmer in Snowball</A>
<TR><TD><A HREF="stem.c">      The ANSI C stemmer</A>
<TR><TD><A HREF="stem.h">      - and its header</A>
<TR><TD><A HREF="voc.txt">     Sample German vocabulary (MS-DOS Latin codings)</A>
<TR><TD><A HREF="output.txt">  Its stemmed equivalent</A>
<TR><TD><A HREF="diffs.txt">   Vocabulary + stemmed equivalent in pure ASCII</A>
<TR><TD><A HREF="tarball.tgz"> Tar-gzipped file of all of the above</A>
</TABLE></DL>

<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="../texts/germanic.html">
                  Germanic language stemmers</A>
</TABLE></DL>

</TR>

<TR><TD BGCOLOR="lightpink">

<BR><BR>

Here is a sample of German vocabulary, with the stemmed forms that will
be generated with this algorithm.

<BR><BR>



<DL><DD><TABLE CELLPADDING=0>
<TR><TD>  <B>word</B> </TD>
 <TD></TD><TD> </TD>
 <TD></TD><TD> <B>stem</B> </TD>
 <TD></TD><TD>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</TD>
 <TD></TD><TD> <B>word</B> </TD>
 <TD></TD><TD> </TD>
 <TD></TD><TD> <B>stem</B> </TD>
</TR>

<TR><TD>
aufeinander<BR>
aufeinanderbiss<BR>
aufeinanderfolge<BR>
aufeinanderfolgen<BR>
aufeinanderfolgend<BR>
aufeinanderfolgende<BR>
aufeinanderfolgenden<BR>
aufeinanderfolgender<BR>
aufeinanderfolgt<BR>
aufeinanderfolgten<BR>
aufeinanderschl&uuml;gen<BR>
aufenthalt<BR>
aufenthalten<BR>
aufenthaltes<BR>
auferlegen<BR>
auferlegt<BR>
auferlegten<BR>
auferstand<BR>
auferstanden<BR>
auferstehen<BR>
aufersteht<BR>
auferstehung<BR>
auferst&uuml;nde<BR>
auferwecken<BR>
auferweckt<BR>
auferzogen<BR>
aufessen<BR>
auffa<BR>
auffallen<BR>
auffallend<BR>
auffallenden<BR>
auffallender<BR>
auff&auml;llig<BR>
auff&auml;lligen<BR>
auff&auml;lliges<BR>
auffassen<BR>
auffasst<BR>
auffa&szlig;t<BR>
auffassung<BR>
auffassungsverm&ouml;gen<BR>
</TD>
<TD></TD><TD> &nbsp;<TT><B> => </B></TT>&nbsp; </TD>
<TD></TD><TD>
aufeinand<BR>
aufeinanderbiss<BR>
aufeinanderfolg<BR>
aufeinanderfolg<BR>
aufeinanderfolg<BR>
aufeinanderfolg<BR>
aufeinanderfolg<BR>
aufeinanderfolg<BR>
aufeinanderfolgt<BR>
aufeinanderfolgt<BR>
aufeinanderschlug<BR>
aufenthalt<BR>
aufenthalt<BR>
aufenthalt<BR>
auferleg<BR>
auferlegt<BR>
auferlegt<BR>
auferstand<BR>
auferstand<BR>
aufersteh<BR>
aufersteht<BR>
aufersteh<BR>
auferstund<BR>
auferweck<BR>
auferweckt<BR>
auferzog<BR>
aufess<BR>
auffa<BR>
auffall<BR>
auffall<BR>
auffall<BR>
auffall<BR>
auffall<BR>
auffall<BR>
auffall<BR>
auffass<BR>
auffasst<BR>
auffasst<BR>
auffass<BR>
auffassungsvermog<BR>
</TD>
<TD></TD><TD> </TD>
<TD></TD><TD>
kategorie<BR>
kategorien<BR>
kategorisch<BR>
kategorische<BR>
kategorischen<BR>
kategorischer<BR>
kater<BR>
katerliede<BR>
katern<BR>
katers<BR>
k&auml;thchen<BR>
kathedrale<BR>
kathinka<BR>
katholik<BR>
katholische<BR>
katholischen<BR>
katholischer<BR>
kattun<BR>
kattunhalst&uuml;cher<BR>
katz<BR>
k&auml;tzchen<BR>
k&auml;tzchens<BR>
katze<BR>
katzen<BR>
katzenschmer<BR>
katzensprung<BR>
katzenw&uuml;rde<BR>
k&auml;tzin<BR>
k&auml;tzlein<BR>
katzmann<BR>
kauen<BR>
kauerte<BR>
kauf<BR>
kaufe<BR>
kaufen<BR>
k&auml;ufer<BR>
kauffahrer<BR>
kaufherr<BR>
kaufleute<BR>
k&auml;uflich<BR>
</TD>
<TD></TD><TD> &nbsp;<TT><B> => </B></TT>&nbsp; </TD>
<TD></TD><TD>
kategori<BR>
kategori<BR>
kategor<BR>
kategor<BR>
kategor<BR>
kategor<BR>
kat<BR>
katerlied<BR>
kat<BR>
kat<BR>
kathch<BR>
kathedral<BR>
kathinka<BR>
kathol<BR>
kathol<BR>
kathol<BR>
kathol<BR>
kattun<BR>
kattunhalstuch<BR>
katz<BR>
katzch<BR>
katzch<BR>
katz<BR>
katz<BR>
katzenschm<BR>
katzenspr<BR>
katzenwurd<BR>
katzin<BR>
katzlein<BR>
katzmann<BR>
kau<BR>
kauert<BR>
kauf<BR>
kauf<BR>
kauf<BR>
kauf<BR>
kauffahr<BR>
kaufherr<BR>
kaufleut<BR>
kauflich<BR>
</TD>
</TR>
</TABLE></DL>


</TR>

<TR><TD>

<BR><BR>
<BR>&nbsp;<H2>The stemming algorithm</H2>

German includes the following accented forms,
<DL><DD>
    <B><I>&auml;  &nbsp;  &ouml;  &nbsp;  &uuml;</I></B>
</DL>
and a special letter, <B><I>&szlig;</I></B>, equivalent to double <B><I>s</I></B>.
<BR><BR>
The following letters are vowels:
<DL><DD>
    <B><I>a  &nbsp;  e  &nbsp;  i  &nbsp;  o  &nbsp;  u  &nbsp;  y  &nbsp;  &auml;  &nbsp;  &ouml;  &nbsp;  &uuml;</I></B>
</DL>
First, replace <B><I>&szlig;</I></B> by <B><I>ss</I></B>, and put <B><I>u</I></B> and <B><I>y</I></B> between vowels into upper case.
<I>R</I>1 and <I>R</I>2 are first set up in the standard way
(see the <A HREF="../texts/r1r2.html"> note</A> on <I>R</I>1 and <I>R</I>2),
but then <I>R</I>1 is
adjusted so that the region before it contains at least 3 letters.
<BR><BR>
Define a valid <B><I>s</I></B>-ending as one of <B><I>b</I></B>, <B><I>d</I></B>, <B><I>f</I></B>, <B><I>g</I></B>, <B><I>h</I></B>, <B><I>k</I></B>, <B><I>l</I></B>, <B><I>m</I></B>, <B><I>n</I></B>, <B><I>r</I></B> or <B><I>t</I></B>.
<BR><BR>
Define a valid <B><I>st</I></B>-ending as the same list, excluding letter <B><I>r</I></B>.
<BR><BR>
Do each of steps 1, 2 and 3.
<BR><BR>
Step 1:
<DL><DD>
    Search for the longest among the following suffixes,
<BR><BR><DL><DD>
        (<I>a</I>) <B><I>e  &nbsp;  em  &nbsp;  en  &nbsp;  ern  &nbsp;  er  &nbsp;  es</I></B><BR>
        (<I>b</I>) <B><I>s</I></B> (preceded by a valid <B><I>s</I></B>-ending)
</DL><BR>
    and delete if in <I>R</I>1. (Of course the letter of the valid <B><I>s</I></B>-ending is
    not necessarily in <I>R</I>1)
<BR><BR>
    (For example, <I>&auml;ckern</I> <TT>-&gt;</TT> <I>&auml;ck</I>, <I>ackers</I> <TT>-&gt;</TT> <I>acker</I>, <I>armes</I> <TT>-&gt;</TT> <I>arm</I>)
</DL>
Step 2:
<DL><DD>
    Search for the longest among the following suffixes,
<BR><BR><DL><DD>
        (<I>a</I>) <B><I>en  &nbsp;  er  &nbsp;  est</I></B><BR>
        (<I>b</I>) <B><I>st</I></B> (preceded by a valid <B><I>st</I></B>-ending, itself preceded by at least 3
        letters)
</DL><BR>
    and delete if in <I>R</I>1.
<BR><BR>
    (For example, <I>derbsten</I> <TT>-&gt;</TT> <I>derbst</I> by step 1, and <I>derbst</I> <TT>-&gt;</TT> <I>derb</I> by step
    2, since <B><I>b</I></B> is a valid <B><I>st</I></B>-ending, and is preceded by just 3 letters)
</DL>
Step 3: <I>d</I>-suffixes
<DL><DD>
    Search for the longest among the following suffixes, and perform the
    action indicated.
<BR><BR>
<DL>
    <DT><B><I>end  &nbsp;  ung</I></B>
        <DD>delete if in <I>R</I>2
        <DD>if preceded by <B><I>ig</I></B>, delete if in <I>R</I>2 and not preceded by <B><I>e</I></B>
<BR><BR>
    <DT><B><I>ig  &nbsp;  ik  &nbsp;  isch</I></B>
        <DD>delete if in <I>R</I>2 and not preceded by <B><I>e</I></B>
<BR><BR>
    <DT><B><I>lich  &nbsp;  heit</I></B>
        <DD>delete if in <I>R</I>2
        <DD>if preceded by <B><I>er</I></B> or <B><I>en</I></B>, delete if in <I>R</I>1
<BR><BR>
    <DT><B><I>keit</I></B>
        <DD>delete if in <I>R</I>2
        <DD>if preceded by <B><I>lich</I></B> or <B><I>ig</I></B>, delete if in <I>R</I>2
</DL>
</DL>
Finally,
<DL><DD>
    turn <B><I>U</I></B> and <B><I>Y</I></B> back into lower case, and remove the umlaut accent from <B><I>a</I></B>,
    <B><I>o</I></B> and <B><I>u</I></B>.
</DL>

</TR>

<TR><TD BGCOLOR="lightblue">

<BR>&nbsp;<H2>The same algorithm in Snowball</H2>

<FONT SIZE=-1><PRE>
<DL><DD>
routines (
           prelude postlude
           mark_regions
           R1 R2
           standard_suffix
)

externals ( stem )

integers ( p1 p2 )

groupings ( v s_ending st_ending )

stringescapes {}

/* special characters (in MS-DOS Latin I) */

stringdef a"   hex '84'
stringdef o"   hex '94'
stringdef u"   hex '81'
stringdef ss   hex 'E1'

define v 'aeiouy{a"}{o"}{u"}'

define s_ending  'bdfghklmnrt'
define st_ending s_ending - 'r'

define prelude as (

    test repeat (
        (
            ['{ss}'] <- 'ss'
        ) or next
    )

    repeat goto (
        v [('u'] v <- 'U') or
           ('y'] v <- 'Y')
    )
)

define mark_regions as (

    $p1 = limit
    $p2 = limit

    gopast v  gopast non-v  setmark p1
    try($p1 < 3  $p1 = 3)  // at least 3
    gopast v  gopast non-v  setmark p2

)

define postlude as repeat (

    [substring] among(
        'Y'    (<- 'y')
        'U'    (<- 'u')
        '{a"}' (<- 'a')
        '{o"}' (<- 'o')
        '{u"}' (<- 'u')
    ) or next

)

backwardmode (

    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    define standard_suffix as (
        do (
            [substring] R1 among(
                'e' 'em' 'en' 'ern' 'er' 'es'
                (   delete
                )
                's'
                (   s_ending delete
                )
            )
        )
        do (
            [substring] R1 among(
                'en' 'er' 'est'
                (   delete
                )
                'st'
                (   st_ending hop 3 delete
                )
            )
        )
        do (
            [substring] R2 among(
                'end' 'ung'
                (   delete
                    try (['ig'] not 'e' R2 delete)
                )
                'ig' 'ik' 'isch'
                (   not 'e' delete
                )
                'lich' 'heit'
                (   delete
                    try (
                        ['er' or 'en'] R1 delete
                    )
                )
                'keit'
                (   delete
                    try (
                        [substring] R2 among(
                            'lich' 'ig'
                            (   delete
                            )
                        )
                    )
                )
            )
        )
    )
)

define stem as (
    do prelude
    do mark_regions
    backwards
        do standard_suffix
    do postlude
)
</DL>
</PRE></FONT>
</TABLE>
</BODY>
</HTML>


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/german/voc.txt === (34933/35033 lines abridged)
a
aa
aalglatten
aargau
aargauer
aasen
aasgeruch
aash”llen
ab
abaddon
abarten
abbeisst
abbild
abbildung
abbiss
abbrechen
abbruch
abend
abendbrot
abendd„mmerung
abenddunklen
abende
abendessen
abendherren
abendhimmel
abendl„ndischem
abendl„ndischen
abendlichen
abendlichte
abendluft
abendmahl
abendr”te
abends
abendschein
abendscheine
abendsch”ppchen
abendsonne
abendstern
abendtanz
abendwind
abendzeit
abenteuer
abenteuerlich
abenteuerliche
abenteuerlichem
abenteuerliches
abenteuern
abenteuers
abenteuerspielpl„tzen
abenteurer

[-=- -=- -=- 34933 lines omitted -=- -=- -=-]

zweiundachtzig
zweiunddreissig
zweiundfnfzig
zweiundsechzig
zweiundvierzig
zweiundzwanzig
zwerge
zwergfinken
zwickten
zwieback
zwiegespr„ch
zwiegsang
zwiespalt
zwiespaltes
zwiesprache
zwillich
zwillingsmilchflasche
zwingen
zwingend
zwinger
zwingt
zwinkert
zwinkerte
zwirchkammer
zwirn
zwirnfabrik
zwischen
zwischendurch
zwischenr„umen
zwischenzeitlich
zwitschermusik
zwitschern
zwitschernd
zwitscherte
zwoa
zw”lf
zw”lferstangen
zw”lfhundert
zw”lfj„hrigen
zw”lfmal
zw”lfte
zwungen
zylinder
zylinderhut
zylinderhte
zylinderhten
zyniker
zynischen
zyperns
zypressen