[Zope-Checkins] CVS: Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/english - englishstem.c:1.1.2.1 output.txt:1.1.2.1 stem.h:1.1.2.1 stem.sbl:1.1.2.1 stemmer.html:1.1.2.1 voc.txt:1.1.2.1

Andreas Jung andreas@digicool.com
Wed, 13 Feb 2002 11:26:20 -0500


Update of /cvs-repository/Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/english
In directory cvs.zope.org:/tmp/cvs-serv30556/PyStemmer/english

Added Files:
      Tag: ajung-textindexng-branch
	englishstem.c output.txt stem.h stem.sbl stemmer.html voc.txt 
Log Message:
added PyStemmer


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/english/englishstem.c === (615/715 lines abridged)

#include "header.h"

/* Forward declarations.  english_stem is the only routine here with external
 * linkage; every r_* routine is file-local.  The r_* names mirror the routines
 * declared in stem.sbl.
 * NOTE(review): stem.sbl lists Step_5a and Step_5b, while this file declares a
 * single r_Step_5 -- the C may have been generated from a different revision
 * of the Snowball source; confirm. */
extern int english_stem(struct SN_env * z);
static int r_exception(struct SN_env * z);
static int r_Step_5(struct SN_env * z);
static int r_Step_4(struct SN_env * z);
static int r_Step_3(struct SN_env * z);
static int r_Step_2(struct SN_env * z);
static int r_Step_1c(struct SN_env * z);
static int r_Step_1b(struct SN_env * z);
static int r_Step_1a(struct SN_env * z);
static int r_R2(struct SN_env * z);
static int r_R1(struct SN_env * z);
static int r_shortv(struct SN_env * z);
static int r_mark_regions(struct SN_env * z);
static int r_postlude(struct SN_env * z);
static int r_prelude(struct SN_env * z);

/* Among-table holding the single string "gener", the same string used by the
 * among(...) in mark_regions in stem.sbl.
 * NOTE(review): struct among is declared in header.h (not shown here).  The
 * initialiser layout looks like { string length, string, index of a related
 * shorter entry or -1, result code, optional routine } -- confirm against
 * header.h. */
static struct among a_0[1] =
{
/*  0 */ { 5, (byte *)"gener", -1, -1, 0}
};

/* Among-table for the suffixes handled by Step_1a in stem.sbl.
 * The fourth field follows the order of the commands in that among:
 *   1  "sses"        (<-'ss')
 *   2  "ied", "ies"  ((next atlimit <-'ie') or <-'i')
 *   3  "s"           (next gopast v delete)
 *  -1  "ss", "us"    (no command attached in stem.sbl)
 * The third field is 1 (the index of "s") for every longer entry that ends in
 * "s", and -1 otherwise.
 * NOTE(review): exact field meanings are defined by struct among in header.h,
 * which is not shown here -- confirm. */
static struct among a_1[6] =
{
/*  0 */ { 3, (byte *)"ied", -1, 2, 0},
/*  1 */ { 1, (byte *)"s", -1, 3, 0},
/*  2 */ { 3, (byte *)"ies", 1, 2, 0},
/*  3 */ { 4, (byte *)"sses", 1, 1, 0},
/*  4 */ { 2, (byte *)"ss", 1, -1, 0},
/*  5 */ { 2, (byte *)"us", 1, -1, 0}
};

/* Among-table for the inner among of Step_1b in stem.sbl (the endings examined
 * after an 'ed'/'edly'/'ing'/'ingly' suffix has been removed).
 * The fourth field follows the order of the commands in that among:
 *   1  "at", "bl", "iz"          (<+ 'e')
 *   2  the doubled consonants    ([next] delete)
 *   3  "" (empty string)         (atmark p1  test shortv  <+ 'e')
 * The third field is 0 (the index of the empty-string entry) for every
 * non-empty entry.
 * NOTE(review): exact field meanings are defined by struct among in header.h,
 * which is not shown here -- confirm. */
static struct among a_2[13] =
{
/*  0 */ { 0, (byte *)"", -1, 3, 0},
/*  1 */ { 2, (byte *)"bb", 0, 2, 0},
/*  2 */ { 2, (byte *)"dd", 0, 2, 0},
/*  3 */ { 2, (byte *)"ff", 0, 2, 0},
/*  4 */ { 2, (byte *)"gg", 0, 2, 0},
/*  5 */ { 2, (byte *)"bl", 0, 1, 0},
/*  6 */ { 2, (byte *)"mm", 0, 2, 0},
/*  7 */ { 2, (byte *)"nn", 0, 2, 0},
/*  8 */ { 2, (byte *)"pp", 0, 2, 0},
/*  9 */ { 2, (byte *)"rr", 0, 2, 0},
/* 10 */ { 2, (byte *)"at", 0, 1, 0},
/* 11 */ { 2, (byte *)"tt", 0, 2, 0},
/* 12 */ { 2, (byte *)"iz", 0, 1, 0}
};

[-=- -=- -=- 615 lines omitted -=- -=- -=-]


        {   int m = z->l - z->c; /* do, line 196 */
            if (!r_Step_1a(z)) goto lab4; /* call Step_1a, line 196 */
        lab4:
            z->c = z->l - m;
        }
        {   int m = z->l - z->c; /* do, line 197 */
            if (!r_Step_1b(z)) goto lab5; /* call Step_1b, line 197 */
        lab5:
            z->c = z->l - m;
        }
        {   int m = z->l - z->c; /* do, line 198 */
            if (!r_Step_1c(z)) goto lab6; /* call Step_1c, line 198 */
        lab6:
            z->c = z->l - m;
        }
        {   int m = z->l - z->c; /* do, line 200 */
            if (!r_Step_2(z)) goto lab7; /* call Step_2, line 200 */
        lab7:
            z->c = z->l - m;
        }
        {   int m = z->l - z->c; /* do, line 201 */
            if (!r_Step_3(z)) goto lab8; /* call Step_3, line 201 */
        lab8:
            z->c = z->l - m;
        }
        {   int m = z->l - z->c; /* do, line 202 */
            if (!r_Step_4(z)) goto lab9; /* call Step_4, line 202 */
        lab9:
            z->c = z->l - m;
        }
        {   int m = z->l - z->c; /* do, line 204 */
            if (!r_Step_5(z)) goto lab10; /* call Step_5, line 204 */
        lab10:
            z->c = z->l - m;
        }
        z->c = z->lb;        {   int c = z->c; /* do, line 206 */
            if (!r_postlude(z)) goto lab11; /* call postlude, line 206 */
        lab11:
            z->c = c;
        }
    }
lab0:
    return 1;
}

/* Create a stemmer environment for the English stemmer.
 * Returns whatever SN_create_env(0, 2, 1) returns.
 * NOTE(review): the arguments presumably size the environment's string,
 * integer and boolean slots (stem.sbl declares two integers, p1 and p2, and
 * one boolean, Y_found) -- confirm against the Snowball runtime. */
extern struct SN_env * english_create_env(void)
{
    return SN_create_env(0, 2, 1);
}

/* Dispose of an environment obtained from english_create_env by handing it
 * to SN_close_env. */
extern void english_close_env(struct SN_env * z)
{
    SN_close_env(z);
}



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/english/output.txt === (29301/29401 lines abridged)
a
aback
abandon
abandon
abandon
abandon
abandon
abas
abash
abat
abat
abbey
abbott
abbrevi
abdic
abdic
abdomen
abdomin
abe
abear
abe
abel
aberr
abershaw
abet
abettor
abey
abhor
abhorr
abhor
abid
abid
abil
abil
abime
abingdon
abipon
abject
abject
abject
abjur
abjur
abl
abl
ablut
abneg
abnorm
abnorm
abnorm
aboard

[-=- -=- -=- 29301 lines omitted -=- -=- -=-]

youth
youth
youth
youth
yquem
yseult
yucca
yushin
yusupov
z
zag
zaharovitch
zametov
zampl
zaraisk
zaraiski
zarnitsyn
zeal
zealand
zealand
zealand
zealous
zebra
zelandia
zelinda
zenaida
zenith
zest
zeus
zig
zigzag
zigzag
zimmerman
zone
zone
zonotrichia
zoo
zoodl
zook
zoolog
zoolog
zoolog
zoolog
zoophyt
zoophyt
zoophyt
zorillo
zorillo
zossimov
zu


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/english/stem.h ===

/* Public interface of the English stemmer. */

/* Create / dispose of a stemmer environment (thin wrappers around
 * SN_create_env and SN_close_env in englishstem.c). */
extern struct SN_env * english_create_env(void);
extern void english_close_env(struct SN_env * z);

/* Run the stemmer on the environment z.
 * NOTE(review): how the word is placed into and read back from z is defined
 * by the Snowball runtime (header.h), which is not shown here. */
extern int english_stem(struct SN_env * z);



=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/english/stem.sbl ===
// Declarations for the English stemmer.
// p1, p2  : marks set by mark_regions and tested by R1 / R2.
// Y_found : set by prelude when a 'y' has been rewritten as 'Y'; tested by
//           postlude.
integers ( p1 p2 )
booleans ( Y_found )

routines (
    prelude postlude
    mark_regions
    shortv
    R1 R2
    Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b
    exception
)

// stem is the only routine visible outside this program.
externals ( stem )

groupings ( v v_WXY valid_LI )

// v: the vowels, with lower-case 'y' included (upper-case 'Y' is not).
// v_WXY: the vowels plus 'w', 'x' and 'Y'; used by shortv.
define v        'aeiouy'
define v_WXY    v + 'wxY'

// valid_LI: the letters that may precede 'li' for Step_2 to delete it.
define valid_LI 'bcdeghkmnrt'

// prelude: clear Y_found, then rewrite certain 'y's as 'Y' (which is not in
// grouping v) and record that fact in Y_found:
//   - a 'y' at the cursor that is followed by a vowel;
//   - every 'y' that directly follows a vowel.
// Each rewrite is wrapped in 'do', so prelude does not fail when none applies.
define prelude as (
    unset Y_found
    do ( ['y'] v <-'Y' set Y_found)
    do repeat(goto (v ['y']) <-'Y' set Y_found)
)

// mark_regions: compute the marks p1 and p2 tested by R1 and R2.
// Both start at limit.  Inside the 'do':
//   - p1 is set after a leading 'gener', or otherwise after the first
//     non-vowel that follows a vowel;
//   - p2 is set after the next non-vowel that follows a vowel, scanning on
//     from p1.
// If a scan fails, the marks not yet set keep the value limit.
define mark_regions as (
    $p1 = limit
    $p2 = limit
    do(
        among (
            'gener'
            // ... extensions possible here ...
        ) or (gopast v  gopast non-v)
        setmark p1
        gopast v  gopast non-v  setmark p2
    )
)

backwardmode (

    // shortv (backward mode): succeeds when the text ending at the cursor is
    // either
    //   - a non-vowel other than 'w', 'x' or 'Y', preceded by a vowel,
    //     preceded by a non-vowel; or
    //   - a non-vowel preceded by a vowel that is the first letter of the
    //     word (atlimit).
    define shortv as (
        ( non-v_WXY v non-v )
        or
        ( non-v v atlimit )
    )

    // R1 / R2: succeed when the cursor is at or beyond the mark p1 / p2 set
    // by mark_regions.
    define R1 as $p1 <= cursor
    define R2 as $p2 <= cursor

    // Step_1a: handle the longest of these suffixes found at the end of the
    // word:
    //   'sses'        -> 'ss'
    //   'ied', 'ies'  -> 'ie' when exactly one letter precedes the suffix,
    //                    otherwise 'i'
    //   's'           -> deleted when a vowel occurs somewhere before the
    //                    letter that immediately precedes the 's'
    //   'us', 'ss'    -> matched but left unchanged
    define Step_1a as (
        [substring] among (
            'sses' (<-'ss')
            'ied' 'ies'
                   ((next atlimit <-'ie') or <-'i')
            's'    (next gopast v delete)
            'us' 'ss'
        )
    )

    // Step_1b: handle the longest of these suffixes found at the end of the
    // word:
    //   'eed', 'eedly'               -> 'ee', only when the suffix is in R1
    //   'ed', 'edly', 'ing', 'ingly' -> deleted when the preceding part of
    //                                   the word contains a vowel; then the
    //                                   new ending is inspected:
    //       'at', 'bl', 'iz'     : append 'e'
    //       a listed double      : remove the last letter
    //       anything else ('')   : append 'e' when the cursor is at p1 and
    //                              shortv succeeds
    define Step_1b as (
        [substring] among (
            'eed' 'eedly'
                (R1 <-'ee')
            'ed' 'edly' 'ing' 'ingly'
                (
                test gopast v  delete
                test substring among(
                    'at' 'bl' 'iz'
                         (<+ 'e')
                    'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
                    // ignoring double c, h, j, k, q, v, w, and x
                         ([next]  delete)
                    ''   (atmark p1  test shortv  <+ 'e')
                )
            )
        )
    )

    // Step_1c: replace a final 'y' or 'Y' by 'i' when it is preceded by a
    // non-vowel and that non-vowel is not the first letter of the word.
    define Step_1c as (
        ['y' or 'Y']
        non-v not atlimit
        <-'i'
    )

    // Step_2: find the longest listed suffix at the end of the word and, if
    // it lies in R1, apply the replacement shown beside it.  Two entries carry
    // an extra condition:
    //   'ogi' -> 'og' only when preceded by 'l'
    //   'li'  is deleted only when preceded by a letter in valid_LI
    define Step_2 as (
        [substring] R1 among (
            'tional'  (<-'tion')
            'enci'    (<-'ence')
            'anci'    (<-'ance')
            'abli'    (<-'able')
            'entli'   (<-'ent')
            'izer' 'ization'
                      (<-'ize')
            'ational' 'ation' 'ator'
                      (<-'ate')
            'alism' 'aliti' 'alli'
                      (<-'al')
            'fulness' (<-'ful')
            'ousli' 'ousness'
                      (<-'ous')
            'iveness' 'iviti'
                      (<-'ive')
            'biliti' 'bli'
                      (<-'ble')
            'ogi'     ('l' <-'og')
            'fulli'   (<-'ful')
            'lessli'  (<-'less')
            'li'      (valid_LI delete)
        )
    )

    // Step_3: find the longest listed suffix at the end of the word and, if
    // it lies in R1, replace it as shown ('ative', 'ful' and 'ness' are
    // simply deleted).
    define Step_3 as (
        [substring] R1 among (
            'tional'  (<- 'tion')
            'ational' (<- 'ate')
            'alize'   (<-'al')
            'icate' 'iciti' 'ical'
                      (<-'ic')
            'ative' 'ful' 'ness'
                      (delete)
        )
    )

    // Step_4: find the longest listed suffix at the end of the word and, if
    // it lies in R2, delete it.  'ion' is deleted only when preceded by 's'
    // or 't'.
    define Step_4 as (
        [substring] R2 among (
            'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement'
            'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
                      (delete)
            'ion'     ('s' or 't' delete)
        )
    )

    // Step_5a: delete a final 'e' when it is in R2, or when it is in R1 and
    // shortv fails for the text that precedes it.
    define Step_5a as (
        ['e']
        R2 or (R1 not shortv)
        delete
    )

    // Step_5b: delete a final 'l' when it is in R2 and is preceded by another
    // 'l'.
    define Step_5b as (
        ['l']
        R2 'l'
        delete
    )
)

// exception: whole-word special cases, tried by stem before the normal
// steps.  'atlimit' after [substring] requires the match to reach the end of
// the word, so only complete words match.  Entries with a command are replaced
// by the given form; the "invariant forms" carry no command, so the routine
// succeeds and the word is left unchanged.
define exception as (

    [substring] atlimit among(

        /* special changes: */

        'skis'      (<-'ski')
        'skies'     (<-'sky')
        'dying'     (<-'die')
        'lying'     (<-'lie')
        'tying'     (<-'tie')
        'innings'   (<-'inning')
        'outings'   (<-'outing')
        'cannings'  (<-'canning')

        /* special -LY cases */

        'idly'      (<-'idl')
        'gently'    (<-'gentl')
        'ugly'      (<-'ugli')
        'early'     (<-'earli')
        'only'      (<-'onli')
        'singly'    (<-'singl')

        // ... extensions possible here ...

        /* invariant forms: */

        'sky'
        'news'
        'howe'
        'inning' 'outing' 'canning'
        'proceed' 'exceed' 'succeed'

        'atlas' 'cosmos' 'bias' 'andes' // not plural forms

        // ... extensions possible here ...
    )
)

// postlude: when prelude set Y_found, turn every 'Y' back into 'y'.
define postlude as (Y_found  repeat(goto (['Y']) <-'y'))

// stem: the external entry point.
// A word matched by exception is handled there and nothing else runs.
// Otherwise 'test hop 3' requires the word to have at least three characters
// (the cursor is restored afterwards); shorter words make this branch fail
// before any step is applied.  Each following step is wrapped in 'do', so a
// step that does not apply never stops the ones after it.  Steps 1a-5b run in
// backward mode, working from the end of the word.
define stem as (

    exception or (

        test hop 3
        do prelude
        do mark_regions
        backwards (

            do Step_1a
            do Step_1b
            do Step_1c

            do Step_2
            do Step_3
            do Step_4

            do Step_5a
            do Step_5b
        )
        do postlude
    )
)


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/english/stemmer.html === (784/884 lines abridged)

<HTML>
<HEAD>

<TITLE>The English (Porter2) stemming algorithm</TITLE></HEAD>
<BODY BGCOLOR=WHITE>
<TABLE WIDTH=75% ALIGN=CENTER COLS=1>
<H1 ALIGN=CENTER>The English (Porter2) stemming algorithm</H1>

<TR><TD>
<BR>&nbsp;<H2>Links to resources</H2>

<DL><DD><TABLE CELLPADDING=0>
<TR><TD><A HREF="stem.sbl">    The stemmer in Snowball</A>
<TR><TD><A HREF="stem.c">      The ANSI C stemmer</A>
<TR><TD><A HREF="stem.h">      - and its header</A>
<TR><TD><A HREF="voc.txt">     Sample English vocabulary</A>
<TR><TD><A HREF="output.txt">  Its stemmed equivalent</A>
<TR><TD><A HREF="diffs.txt">   Vocabulary + stemmed equivalent</A>
<TR><TD><A HREF="tarball.tgz"> Tar-gzipped file of all of the above</A>
</TABLE></DL>

<BR><BR>



<TR><TD BGCOLOR="lightpink">

<BR><BR>

Here is a sample of vocabulary, with the stemmed forms that will
be generated with the algorithm.

<BR><BR>



<DL><DD><TABLE CELLPADDING=0>
<TR><TD>  <B>word</B> </TD>
 <TD></TD><TD> </TD>
 <TD></TD><TD> <B>stem</B> </TD>
 <TD></TD><TD>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;</TD>
 <TD></TD><TD> <B>word</B> </TD>
 <TD></TD><TD> </TD>
 <TD></TD><TD> <B>stem</B> </TD>
</TR>

<TR><TD>
consign<BR>
consigned<BR>

[-=- -=- -=- 784 lines omitted -=- -=- -=-]

        'only'      (<-'onli')
        'singly'    (<-'singl')

        // ... extensions possible here ...

        /* invariant forms: */

        'sky'
        'news'
        'howe'
        'inning' 'outing' 'canning'
        'proceed' 'exceed' 'succeed'

        'atlas' 'cosmos' 'bias' 'andes' // not plural forms

        // ... extensions possible here ...
    )
)

define postlude as (Y_found  repeat(goto (['Y']) <-'y'))

define stem as (

    exception or (

        test hop 3
        do prelude
        do mark_regions
        backwards (

            do Step_1a
            do Step_1b
            do Step_1c

            do Step_2
            do Step_3
            do Step_4

            do Step_5a
            do Step_5b
        )
        do postlude
    )
)
</DL>
</PRE></FONT>
</TR>
</TABLE>
</BODY>
</HTML>


=== Added File Zope/lib/python/Products/PluginIndexes/TextIndexNG/src/PyStemmer/english/voc.txt === (29301/29401 lines abridged)
a
aback
abandon
abandoned
abandoning
abandonment
abandons
abasement
abashed
abate
abated
abbey
abbott
abbreviation
abdicate
abdicating
abdomen
abdominal
abe
abear
abed
abel
aberration
abershaw
abet
abettor
abeyance
abhorred
abhorrence
abhorring
abide
abiding
abilities
ability
abimee
abingdon
abipones
abject
abjectly
abjectness
abjure
abjured
abl
able
ablutions
abnegation
abnormal
abnormality
abnormally
aboard

[-=- -=- -=- 29301 lines omitted -=- -=- -=-]

youthful
youthfully
youthfulness
youths
yquem
yseulte
yucca
yushin
yusupov
z
zag
zaharovitch
zametov
zample
zaraisk
zaraisky
zarnitsyn
zeal
zealand
zealander
zealanders
zealous
zebras
zelandiae
zelinda
zenaida
zenith
zest
zeus
zig
zigzag
zigzags
zimmerman
zone
zones
zonotrichia
zoo
zoodle
zooks
zoolog
zoological
zoologically
zoology
zoophyt
zoophyte
zoophytes
zorillo
zorillos
zossimov
zu