[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french - frenchstem.c:1.1.2.1 output.txt:1.1.2.1 stem.h:1.1.2.1 stem.sbl:1.1.2.1 stemmer.html:1.1.2.1 voc.txt:1.1.2.1

Andreas Jung andreas@digicool.com
Wed, 13 Feb 2002 11:26:20 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french
In directory cvs.zope.org:/tmp/cvs-serv30556/PyStemmer/french

Added Files:
      Tag: ajung-textindexng-branch
	frenchstem.c output.txt stem.h stem.sbl stemmer.html voc.txt 
Log Message:
added PyStemmer


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french/frenchstem.c === (688/788 lines abridged)

#include "header.h"

extern int french_stem(struct SN_env * z);
static int r_un_accent(struct SN_env * z);
static int r_un_double(struct SN_env * z);
static int r_residual_suffix(struct SN_env * z);
static int r_verb_suffix(struct SN_env * z);
static int r_i_verb_suffix(struct SN_env * z);
static int r_standard_suffix(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_R1(struct SN_env * z);
static int r_RV(struct SN_env * z);
static int r_mark_regions(struct SN_env * z);
static int r_postlude(struct SN_env * z);
static int r_prelude(struct SN_env * z);

/*
 * Four-entry string table: the empty string plus the marker letters
 * "I", "U" and "Y".
 *
 * In every row the first field equals the length of the string in the
 * second field.  The fourth field is the per-entry result code; the codes
 * 1, 2, 3 for "I", "U", "Y" line up with the three branches of the among()
 * in the `postlude` routine of stem.sbl, and code 4 on the empty string
 * presumably selects the `next` fallback -- confirm against the omitted
 * routine body.
 * NOTE(review): the third field looks like the index of the longest entry
 * that is a prefix of this one (-1 for none), and the fifth field is unused
 * (0) here -- confirm against struct among in header.h.
 */
static struct among a_0[] = {
    { 0, (byte *)"",  -1, 4, 0 },   /* [0] */
    { 1, (byte *)"I",  0, 1, 0 },   /* [1] */
    { 1, (byte *)"U",  0, 2, 0 },   /* [2] */
    { 1, (byte *)"Y",  0, 3, 0 }    /* [3] */
};

/*
 * Four-entry suffix table: "iqU", "abl", "eus", "iv".
 *
 * Field 1 is the string length, field 2 the string, field 4 the result
 * code.  The strings and codes line up with the nested among() under
 * 'ement'/'ements' in `standard_suffix` of stem.sbl:
 *   1 -> 'iv', 2 -> 'eus', 3 -> 'abl' / 'iqU'.
 * NOTE(review): field 3 (-1 everywhere) presumably means "no shorter
 * entry is a suffix of this one"; field 5 is unused (0) -- confirm against
 * struct among in header.h.
 */
static struct among a_1[] = {
    { 3, (byte *)"iqU", -1, 3, 0 },   /* [0] */
    { 3, (byte *)"abl", -1, 3, 0 },   /* [1] */
    { 3, (byte *)"eus", -1, 2, 0 },   /* [2] */
    { 2, (byte *)"iv",  -1, 1, 0 }    /* [3] */
};

/*
 * Three-entry suffix table: "ic", "abil", "iv".
 *
 * Field 1 is the string length, field 2 the string, field 4 the result
 * code.  The strings and codes line up with the nested among() under the
 * 'it{e'}' / 'it{e'}s' case in `standard_suffix` of stem.sbl:
 *   1 -> 'abil', 2 -> 'ic', 3 -> 'iv'.
 * NOTE(review): field 3 (-1 everywhere) presumably means "no shorter
 * entry is a suffix of this one"; field 5 is unused (0) -- confirm against
 * struct among in header.h.
 */
static struct among a_2[] = {
    { 2, (byte *)"ic",   -1, 2, 0 },   /* [0] */
    { 4, (byte *)"abil", -1, 1, 0 },   /* [1] */
    { 2, (byte *)"iv",   -1, 3, 0 }    /* [2] */
};

static struct among a_3[43] =
{
/*  0 */ { 4, (byte *)"iqUe", -1, 1, 0},
/*  1 */ { 6, (byte *)"atrice", -1, 2, 0},
/*  2 */ { 4, (byte *)"ance", -1, 1, 0},
/*  3 */ { 4, (byte *)"ence", -1, 5, 0},
/*  4 */ { 5, (byte *)"logie", -1, 3, 0},
/*  5 */ { 4, (byte *)"able", -1, 1, 0},
/*  6 */ { 4, (byte *)"isme", -1, 1, 0},
/*  7 */ { 4, (byte *)"euse", -1, 11, 0},

[-=- -=- -=- 688 lines omitted -=- -=- -=-]

            lab5:
                z->c = z->l - m;
                {   int m = z->l - z->c; /* try, line 223 */
                    z->ket = z->c; /* [, line 223 */
                    {   int m = z->l - z->c; /* or, line 223 */
                        if (!(eq_s_b(z, 1, "Y"))) goto lab10;
                        z->bra = z->c; /* ], line 223 */
                        slice_from_s(z, 1, "i"); /* <-, line 223 */
                        goto lab9;
                    lab10:
                        z->c = z->l - m;
                        if (!(eq_s_b(z, 1, "\x87" ""))) { z->c = z->l - m; goto lab8; }
                        z->bra = z->c; /* ], line 224 */
                        slice_from_s(z, 1, "c"); /* <-, line 224 */
                    }
                lab9:
                lab8:
                }
            }
            goto lab3;
        lab4:
            z->c = z->l - m;
            if (!r_residual_suffix(z)) goto lab2; /* call residual_suffix, line 227 */
        }
    lab3:
    lab2:
        z->c = z->l - m;
    }
    {   int m = z->l - z->c; /* do, line 232 */
        if (!r_un_double(z)) goto lab11; /* call un_double, line 232 */
    lab11:
        z->c = z->l - m;
    }
    {   int m = z->l - z->c; /* do, line 233 */
        if (!r_un_accent(z)) goto lab12; /* call un_accent, line 233 */
    lab12:
        z->c = z->l - m;
    }
    z->c = z->lb;    {   int c = z->c; /* do, line 235 */
        if (!r_postlude(z)) goto lab13; /* call postlude, line 235 */
    lab13:
        z->c = c;
    }
    return 1;
}

/*
 * Create a stemmer environment for the French stemmer.
 *
 * Thin wrapper around SN_create_env(0, 3, 0).
 * NOTE(review): the 3 presumably reserves one integer slot for each of the
 * marks pV, p1 and p2 declared in stem.sbl -- confirm against the
 * SN_create_env prototype in header.h.
 *
 * Returns whatever SN_create_env returns; pass it to french_close_env()
 * when finished.
 */
extern struct SN_env *
french_create_env(void)
{
    struct SN_env * env = SN_create_env(0, 3, 0);

    return env;
}

/*
 * Dispose of an environment obtained from french_create_env().
 *
 * Simply forwards `z` to SN_close_env(); nothing else is done here.
 */
extern void
french_close_env(struct SN_env * z)
{
    SN_close_env(z);
}



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french/output.txt === (20307/20407 lines abridged)
a
à
abailard
abaiss
abaiss
abaiss
abaiss
abaissement
abaissent
abaiss
abaiss
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abandon
abasourd
abat
abatt
abatt
abatt
abattr
abbay
abbé
abbé
abbess
abeil
abhorr
abhorr
abhorr
abîm
abîm
abîm
abîm
abject
abjur
ablut
abneg
aboi
aboi
abol
abomin
abomin

[-=- -=- -=- 20307 lines omitted -=- -=- -=-]

xii
xii
xiv
xix
xv
xvi
xvii
xvii
xx
xxi
xxii
xxii
xxiv
xxix
xxv
xxvi
xxvii
xxvii
xxx
xxxi
xxxii
xxxii
xxxiv
xxxv
xxxvi
xxxvii
y
yacht
yacht
yakounin
yanke
yeddo
yert
yet
yeux
yokoham
york
young
zambajon
zeb
zebr
zébus
zel
zel
zénith
zigzag
zingarel
zonder
zoroastr
zurl


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french/stem.h ===

/* Public interface of the generated French stemmer (see frenchstem.c). */

/* Create / dispose of the environment the stemmer operates on. */
struct SN_env * french_create_env(void);
void french_close_env(struct SN_env * z);

/* Run the French stemmer on the environment `z`; returns an int status. */
int french_stem(struct SN_env * z);



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french/stem.sbl ===
// Declarations for the French stemmer: internal routine names, the exported
// entry point, integer marks, character groupings and string-escape syntax.

// Internal routines; each one is defined further down in this file.
routines (
           prelude postlude mark_regions
           RV R1 R2
           standard_suffix
           i_verb_suffix
           verb_suffix
           residual_suffix
           un_double
           un_accent
)

// 'stem' is the only routine made visible outside the generated code.
externals ( stem )

// Position marks set by mark_regions and tested by RV, R1 and R2.
integers ( pV p1 p2 )

// Character sets: v (vowels) and keep_with_s (used by residual_suffix).
groupings ( v keep_with_s )

// Inside string literals, {name} is replaced by the stringdef called 'name'.
stringescapes {}

/* special characters
   NOTE(review): this comment used to say "in ISO Latin", but the hex codes
   below are the MS-DOS code page 437/850 ("Latin I") positions of these
   letters (e.g. 0x82 = e-acute, 0x87 = c-cedilla); in ISO 8859-1 the range
   0x80-0x9F holds control characters. Input words must use the same
   encoding as these codes for the accented rules to match. */

stringdef a^   hex '83'  // a-circumflex
stringdef a`   hex '85'  // a-grave
stringdef c,   hex '87'  // c-cedilla

stringdef e"   hex '89'  // e-diaeresis (rare)
stringdef e'   hex '82'  // e-acute
stringdef e^   hex '88'  // e-circumflex
stringdef e`   hex '8A'  // e-grave
stringdef i"   hex '8B'  // i-diaeresis
stringdef i^   hex '8C'  // i-circumflex
stringdef o^   hex '93'  // o-circumflex
stringdef u^   hex '96'  // u-circumflex
stringdef u`   hex '97'  // u-grave

// Vowel grouping: a e i o u y plus every accented vowel defined above.
// Upper-case I, U and Y (written by prelude) and c-cedilla are not members,
// so they count as non-vowels wherever 'v' / 'non-v' is tested.
define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'

// prelude: scan the word left to right and upper-case certain letters so
// that later rules see them as non-vowels (upper-case letters are not in v):
//   - 'u' or 'i' with a vowel on both sides  -> 'U' / 'I'
//   - 'y' preceded by a vowel                -> 'Y'
//   - 'y' followed by a vowel                -> 'Y'
//   - 'u' immediately after 'q'              -> 'U'
// 'repeat goto' keeps searching forward for the next place where one of
// these patterns matches, until none is left.
define prelude as repeat goto (

    (  v [ ('u' ] v <- 'U') or
           ('i' ] v <- 'I') or
           ('y' ] <- 'Y')
    )
    or
    (  ['y'] v <- 'Y' )
    or
    (  'q' ['u'] <- 'U' )
)

// mark_regions: compute the marks pV, p1 and p2 that delimit the regions
// tested by RV, R1 and R2. All three default to the end of the word and
// keep that value when the patterns below cannot be found.
define mark_regions as (

    $pV = limit
    $p1 = limit
    $p2 = limit  // defaults

    // pV: if the word starts with two vowels, the position after the third
    // letter; otherwise the position just after the first vowel that is
    // not the first letter.
    do (
        ( v v next ) or ( next gopast v )
        setmark pV
    )
    // p1: just after the first non-vowel that follows a vowel.
    // p2: the same search repeated, starting from p1.
    do (
        gopast v gopast non-v setmark p1
        gopast v gopast non-v setmark p2
    )
)

// postlude: walk through the word and turn the marker letters 'I', 'U' and
// 'Y' back into lower case; any other character is skipped with 'next'.
// The loop ends when 'next' fails at the end of the word.
define postlude as repeat (

    [substring] among(
        'I' (<- 'i')
        'U' (<- 'u')
        'Y' (<- 'y')
    ) or next
)

backwardmode (

    // Region tests: each succeeds when the cursor is at or after the
    // corresponding mark set by mark_regions. In backward mode the cursor
    // sits at the left end of the text just matched, so the test says that
    // the matched suffix lies inside the region.
    define RV as $pV <= cursor
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    // standard_suffix: find the longest listed suffix at the end of the
    // word and apply the action attached to its group. Most actions first
    // require the suffix to lie in a region (R2, R1 or RV) and then delete
    // or replace it; some go on to strip a further suffix ('ic', 'at',
    // 'iv', ...) uncovered by the first deletion.
    // The routine fails when no suffix matches, when a region test fails,
    // or when an action ends in fail(...); the caller (stem) then moves on
    // to i_verb_suffix / verb_suffix.
    define standard_suffix as (
        [substring] among(

            'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
            'ances' 'iqUes' 'ismes' 'ables' 'istes'
               ( R2 delete )
            'atrice' 'ateur' 'ation'
            'atrices' 'ateurs' 'ations'
               ( R2 delete
                 try ( ['ic'] (R2 delete) or <-'iqU' )
               )
            'logie'
            'logies'
               ( R2 <- 'log' )
            'usion' 'ution'
            'usions' 'utions'
               ( R2 <- 'u' )
            'ence'
            'ences'
               ( R2 <- 'ent' )
            'ement'
            'ements'
            (
                RV delete
                try (
                    [substring] among(
                        'iv' (R2 delete ['at'] R2 delete)
                        'eus' ((R2 delete) or (R1<-'eux'))
                        'abl' 'iqU' (R2 delete)
                    )
                )
            )
            'it{e'}'
            'it{e'}s'
            (
                R2 delete
                try (
                    [substring] among(
                        'abil' ((R2 delete) or <-'abl')
                        'ic'   ((R2 delete) or <-'iqU')
                        'iv'   (R2 delete)
                    )
                )
            )
            'if' 'ive'
            'ifs' 'ives'
            (
                R2 delete
                try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
            )
            'eaux' (<- 'eau')
            'aux'  (R1 <- 'al')
            'euse'
            'euses'((R2 delete) or (R1<-'eux'))

            'issement'
            'issements'(R1 non-v delete) // verbal

            // fail(...) below performs the edit and then reports failure,
            // which forces entry to verb_suffix. -ment typically follows
            // the past participle, e.g. 'confus{e'}ment'.

            'amment'   (RV fail(<- 'ant'))
            'emment'   (RV fail(<- 'ent'))
            'ment'
            'ments'    (test(v RV) fail(delete))
                       // v is e,i,u,{e'},I or U
        )
    )

    // i_verb_suffix: verb endings beginning with 'i' (or i-circumflex).
    // 'setlimit tomark pV' stops matching from extending left of pV, so the
    // suffix must lie in RV. The longest matching ending is deleted only
    // when the character before it is not a vowel.
    define i_verb_suffix as setlimit tomark pV for (
        [substring] among (
            '{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai'
            'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez'
            'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
            'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
            'issez' 'issiez' 'issions' 'issons' 'it'
                (non-v delete)
        )
    )

    // verb_suffix: remaining verb endings, matched only inside RV (the
    // backward limit is set to pV for the duration). Three groups:
    //   - 'ions'            : deleted only if it also lies in R2
    //   - e/{e'}-type endings: deleted unconditionally
    //   - a-type endings    : deleted, then a preceding 'e' is also
    //                         deleted if present
    define verb_suffix as setlimit tomark pV for (
        [substring] among (
            'ions'
                (R2 delete)

            '{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai'
            'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions'
            'erons' 'eront' 'ez' 'iez'

            // 'ons' //-best omitted

                (delete)

            '{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant'
            'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez'
            'assions'
                (delete
                 try(['e'] delete)
                )
        )
    )

    // Letters after which a final 's' is kept by residual_suffix:
    // a, i, o, u, e-grave and s itself.
    define keep_with_s 'aiou{e`}s'

    // residual_suffix: fallback clean-up, called by stem when the three
    // main suffix routines did not succeed.
    //   1. Drop a final 's' unless the letter before it is in keep_with_s.
    //   2. Then, matching only inside RV:
    //        'ion'  -> deleted if in R2 and preceded by 's' or 't'
    //        'ier', 'i{e`}re', 'Ier', 'I{e`}re' -> replaced by 'i'
    //        'e'    -> deleted
    //        '{e"}' -> deleted if preceded by 'gu'
    define residual_suffix as (
        try(['s'] test non-keep_with_s delete)
        setlimit tomark pV for (
            [substring] among(
                'ion'           (R2 's' or 't' delete)
                'ier' 'i{e`}re'
                'Ier' 'I{e`}re' (<-'i')
                'e'             (delete)
                '{e"}'          ('gu' delete)
            )
        )
    )

    // un_double: if the word ends in 'enn', 'onn', 'ett', 'ell' or 'eill',
    // delete its last letter. 'test' checks the ending without moving the
    // cursor; '[next]' then brackets the single final character.
    define un_double as (
        test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
    )

    // un_accent: working backwards from the end, skip one or more
    // non-vowels; if the letter before them is e-acute or e-grave,
    // replace it with a plain 'e'.
    define un_accent as (
        atleast 1 non-v
        [ '{e'}' or '{e`}' ] <-'e'
    )
)

// stem: exported entry point. Every step is wrapped in 'do', so a failing
// step leaves the word as it was at that point and processing continues.
//   1. prelude       - mark u/i/y used as consonants (upper-case them)
//   2. mark_regions  - compute pV, p1, p2
//   3. backwards, i.e. working from the end of the word:
//        a. try standard_suffix, then i_verb_suffix, then verb_suffix;
//           if one succeeds, go back to the end of the word and turn a
//           final 'Y' into 'i' or a final c-cedilla into 'c';
//           if none succeeds, run residual_suffix instead
//        b. un_double
//        c. un_accent
//   4. postlude      - restore lower case for I, U, Y
define stem as (

    do prelude
    do mark_regions
    backwards (

        do (
            (
                 ( standard_suffix or
                   i_verb_suffix or
                   verb_suffix
                 )
                 and
                 try( [ ('Y'   ] <- 'i' ) or
                        ('{c,}'] <- 'c' )
                 )
            ) or
            residual_suffix
        )

        // try(['ent'] RV delete) // is best omitted

        do un_double
        do un_accent
    )
    do postlude
)


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french/stemmer.html === (590/690 lines abridged)

<HTML>
<HEAD>
<TITLE>French stemming algorithm</TITLE></HEAD>
<BODY BGCOLOR=WHITE>
<TABLE WIDTH=75% ALIGN=CENTER COLS=1>
<H1 ALIGN=CENTER>French stemming algorithm</H1>

<TR><TD>
<BR>&nbsp;<H2>Links to resources</H2>

<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="stem.sbl">    The stemmer in Snowball</A>
<TR><TD><A HREF="stem.c">      The ANSI C stemmer</A>
<TR><TD><A HREF="stem.h">      - and its header</A>
<TR><TD><A HREF="voc.txt">     Sample French vocabulary (ISO Latin codings)</A>
<TR><TD><A HREF="output.txt">  Its stemmed equivalent</A>
<TR><TD><A HREF="diffs.txt">   Vocabulary + stemmed equivalent in pure ASCII</A>
<TR><TD><A HREF="tarball.tgz"> Tar-gzipped file of all of the above</A>
</TABLE></DL>

<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="../texts/romance.html">
                  Romance language stemmers</A>
</TABLE></DL>

</TR>

<TR><TD BGCOLOR="lightpink">

<BR><BR>

Here is a sample of French vocabulary, with the stemmed forms that will
be generated with this algorithm.

<BR><BR>



<DL><DD><TABLE CELLPADDING=0>
<TR><TD>  <B>word</B> </TD>
 <TD></TD><TD> </TD>
 <TD></TD><TD> <B>stem</B> </TD>
 <TD></TD><TD>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</TD>
 <TD></TD><TD> <B>word</B> </TD>
 <TD></TD><TD> </TD>
 <TD></TD><TD> <B>stem</B> </TD>
</TR>

<TR><TD>

[-=- -=- -=- 590 lines omitted -=- -=- -=-]

                'ier' 'i{e`}re'
                'Ier' 'I{e`}re' (<-'i')
                'e'             (delete)
                '{e"}'          ('gu' delete)
            )
        )
    )

    define un_double as (
        test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
    )

    define un_accent as (
        atleast 1 non-v
        [ '{e'}' or '{e`}' ] <-'e'
    )
)

define stem as (

    do prelude
    do mark_regions
    backwards (

        do (
            (
                 ( standard_suffix or
                   i_verb_suffix or
                   verb_suffix
                 )
                 and
                 try( [ ('Y'   ] <- 'i' ) or
                        ('{c,}'] <- 'c' )
                 )
            ) or
            residual_suffix
        )

        // try(['ent'] RV delete) // is best omitted

        do un_double
        do un_accent
    )
    do postlude
)
</DL>
</PRE></FONT>
</TABLE>
</BODY>
</HTML>


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/french/voc.txt === (20307/20407 lines abridged)
a
à
abailard
abaissait
abaissant
abaisse
abaissé
abaissement
abaissent
abaisser
abaisserai
abandon
abandonna
abandonnait
abandonnant
abandonne
abandonné
abandonnée
abandonner
abandonnera
abandonnerait
abandonnés
abandonnez
abasourdi
abat
abattant
abattement
abattit
abattre
abbaye
abbé
abbés
abbesse
abeille
abhorrait
abhorre
abhorré
abîmait
abîme
abîmé
abîmée
abject
abjurant
ablutions
abnégation
aboiements
aboiera
abolir
abominable
abominablement

[-=- -=- -=- 20307 lines omitted -=- -=- -=-]

xii
xiii
xiv
xix
xv
xvi
xvii
xviii
xx
xxi
xxii
xxiii
xxiv
xxix
xxv
xxvi
xxvii
xxviii
xxx
xxxi
xxxii
xxxiii
xxxiv
xxxv
xxxvi
xxxvii
y
yacht
yachts
yakounines
yankee
yeddo
yert
yet
yeux
yokohama
york
young
zambajon
zeb
zébrés
zébus
zèle
zélés
zénith
zigzags
zingarelli
zonders
zoroastre
zurla