Commit c825e3d9, authored by Kazuhiko Shiozaki, committed by Kazuhiko

update libstemmer_c.

parent 1c6e8c67
......@@ -11,8 +11,6 @@ src_c/stem_ISO_8859_1_french.c
src_c/stem_ISO_8859_1_french.h
src_c/stem_ISO_8859_1_german.c
src_c/stem_ISO_8859_1_german.h
src_c/stem_ISO_8859_1_hungarian.c
src_c/stem_ISO_8859_1_hungarian.h
src_c/stem_ISO_8859_1_italian.c
src_c/stem_ISO_8859_1_italian.h
src_c/stem_ISO_8859_1_norwegian.c
......@@ -25,6 +23,8 @@ src_c/stem_ISO_8859_1_spanish.c
src_c/stem_ISO_8859_1_spanish.h
src_c/stem_ISO_8859_1_swedish.c
src_c/stem_ISO_8859_1_swedish.h
src_c/stem_ISO_8859_2_hungarian.c
src_c/stem_ISO_8859_2_hungarian.h
src_c/stem_ISO_8859_2_romanian.c
src_c/stem_ISO_8859_2_romanian.h
src_c/stem_KOI8_R_russian.c
......
routines (
mark_regions
main_suffix
consonant_pair
other_suffix
undouble
)
externals ( stem )
strings ( ch )
integers ( p1 x )
groupings ( v s_ending )
stringescapes {}
/* special characters (in ISO Latin I) */
stringdef ae hex 'E6'
stringdef ao hex 'E5'
stringdef o/ hex 'F8'
define v 'aeiouy{ae}{ao}{o/}'
define s_ending 'abcdfghjklmnoprtvyz{ao}'
// mark_regions: compute p1, the start of the region from which suffixes
// may be removed. If the word has fewer than three characters the 'hop 3'
// test fails and the routine fails with p1 still equal to limit.
define mark_regions as (
    $p1 = limit                 // default: region is empty

    test (
        hop 3
        setmark x               // x = position after the third character
    )

    goto v
    gopast non-v
    setmark p1                  // just past the first non-vowel after a vowel

    try (
        $p1 < x
        $p1 = x                 // the region never starts before x
    )
)
backwardmode (
// main_suffix: find the longest listed ending lying inside the p1 region.
// Every ending except a bare 's' is deleted outright. A bare 's' is deleted
// only when the letter before it is in s_ending; that letter is tested
// after the limit has been restored, so it may lie outside the region.
define main_suffix as (
    setlimit tomark p1 for ( [substring] )
    among (
        'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere'
        'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes'
        'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets'
        'erets' 'et' 'eret'
            ( delete )

        's'
            ( s_ending delete )
    )
)
// consonant_pair: if the word ends in 'gd', 'dt', 'gt' or 'kt' inside the
// p1 region, delete the final letter of that pair.
define consonant_pair as (
// The search runs under 'test', so only the cursor is restored afterwards;
// the slice end set by '[' stays where the cursor was on entry.
test (
setlimit tomark p1 for ([substring])
among(
'gd' // significant in the call from other_suffix
'dt' 'gt' 'kt'
)
)
// Step back over one character and close the slice there, so 'delete'
// removes just the last letter.
next] delete
)
// other_suffix: remove a second layer of endings.
define other_suffix as (
// A final 'st' is dropped when 'ig' stands directly before it. This step is
// not wrapped in a setlimit, so it is not confined to the p1 region.
do ( ['st'] 'ig' delete )
// Longest match inside the p1 region decides the action below.
setlimit tomark p1 for ([substring])
among(
'ig' 'lig' 'elig' 'els'
// delete the ending, then try consonant_pair on what is left
(delete do consonant_pair)
'l{o/}st'
// only the final 't' goes: 'l{o/}st' becomes 'l{o/}s'
(<-'l{o/}s')
)
)
// undouble: when the word ends in the same non-vowel twice (the last one
// lying inside the p1 region), delete the last one.
define undouble as (
    setlimit tomark p1 for (
        [ non-v ]               // slice = final non-vowel
        -> ch                   // remember it
    )
    ch                          // the letter before it must be identical
    delete                      // drop the sliced (final) letter
)
)
// stem: external entry point. Marks the p1 region going forwards, then
// applies the four suffix routines from the end of the word. Each call is
// wrapped in 'do', so a routine that fails does not stop the ones after it.
define stem as (
    do mark_regions

    backwards (
        do main_suffix
        do consonant_pair
        do other_suffix
        do undouble
    )
)
routines (
mark_regions
main_suffix
consonant_pair
other_suffix
undouble
)
externals ( stem )
strings ( ch )
integers ( p1 x )
groupings ( v s_ending )
stringescapes {}
/* special characters (in MS-DOS Latin I) */
stringdef ae hex '91'
stringdef ao hex '86'
stringdef o/ hex '9B'
define v 'aeiouy{ae}{ao}{o/}'
define s_ending 'abcdfghjklmnoprtvyz{ao}'
define mark_regions as (
$p1 = limit
test ( hop 3 setmark x )
goto v gopast non-v setmark p1
try ( $p1 < x $p1 = x )
)
backwardmode (
define main_suffix as (
setlimit tomark p1 for ([substring])
among(
'hed' 'ethed' 'ered' 'e' 'erede' 'ende' 'erende' 'ene' 'erne' 'ere'
'en' 'heden' 'eren' 'er' 'heder' 'erer' 'heds' 'es' 'endes'
'erendes' 'enes' 'ernes' 'eres' 'ens' 'hedens' 'erens' 'ers' 'ets'
'erets' 'et' 'eret'
(delete)
's'
(s_ending delete)
)
)
define consonant_pair as (
test (
setlimit tomark p1 for ([substring])
among(
'gd' // significant in the call from other_suffix
'dt' 'gt' 'kt'
)
)
next] delete
)
define other_suffix as (
do ( ['st'] 'ig' delete )
setlimit tomark p1 for ([substring])
among(
'ig' 'lig' 'elig' 'els'
(delete do consonant_pair)
'l{o/}st'
(<-'l{o/}s')
)
)
define undouble as (
setlimit tomark p1 for ([non-v] ->ch)
ch
delete
)
)
define stem as (
do mark_regions
backwards (
do main_suffix
do consonant_pair
do other_suffix
do undouble
)
)
routines (
prelude postlude
e_ending
en_ending
mark_regions
R1 R2
undouble
standard_suffix
)
externals ( stem )
booleans ( e_found )
integers ( p1 p2 )
groupings ( v v_I v_j )
stringescapes {}
/* special characters (in ISO Latin I) */
stringdef a" hex 'E4'
stringdef e" hex 'EB'
stringdef i" hex 'EF'
stringdef o" hex 'F6'
stringdef u" hex 'FC'
stringdef a' hex 'E1'
stringdef e' hex 'E9'
stringdef i' hex 'ED'
stringdef o' hex 'F3'
stringdef u' hex 'FA'
stringdef e` hex 'E8'
define v 'aeiouy{e`}'
define v_I v + 'I'
define v_j v + 'j'
// prelude: normalise the word before the regions are marked.
//  1. strip diaeresis and acute accents from a, e, i, o, u;
//  2. turn a 'y' at the starting cursor position into 'Y';
//  3. after a vowel, turn 'i' followed by a vowel into 'I' and 'y' into 'Y'.
// 'Y' and 'I' are not members of grouping v, so later steps see them as
// non-vowels.
define prelude as (
// 'test' puts the cursor back where it started once the pass is finished.
test repeat (
[substring] among(
'{a"}' '{a'}'
(<- 'a')
'{e"}' '{e'}'
(<- 'e')
'{i"}' '{i'}'
(<- 'i')
'{o"}' '{o'}'
(<- 'o')
'{u"}' '{u'}'
(<- 'u')
// the empty string always matches: just move on one character
'' (next)
) //or next
)
try(['y'] <- 'Y')
repeat goto (
v [('i'] v <- 'I') or
('y'] <- 'Y')
)
)
// mark_regions: set p1 and p2, the starts of regions R1 and R2. Both
// default to limit. p1 is placed just past the first non-vowel that follows
// a vowel and is then raised to at least 3; p2 is found the same way,
// continuing from where the p1 search stopped.
// NOTE(review): the minimum is the literal offset 3, whereas the Danish and
// German mark_regions in this file derive it with 'hop 3'. Confirm the two
// agree when a character can occupy more than one byte (e.g. a -u build).
define mark_regions as (
$p1 = limit
$p2 = limit
gopast v gopast non-v setmark p1
try($p1 < 3 $p1 = 3) // at least 3
gopast v gopast non-v setmark p2
)
// postlude: walk through the word and turn the marker letters 'Y' and 'I'
// back into lower case.
define postlude as repeat (
    [substring] among (
        'Y' ( <- 'y' )
        'I' ( <- 'i' )
        ''  ( next )        // anything else: step over one character
    ) //or next
)
backwardmode (
// R1 / R2: succeed when the cursor is at or beyond mark p1 / p2.
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
// undouble: if the word ends in 'kk', 'dd' or 'tt', delete its last letter.
// The among runs under 'test', so the cursor is back at the end of the word
// when '[next]' slices that single letter.
define undouble as (
test among('kk' 'dd' 'tt') [next] delete
)
// e_ending: delete a final 'e' that lies in R1 and follows a non-vowel,
// record the removal in e_found, then undouble. e_found is cleared first,
// so it stays unset whenever the deletion does not happen.
define e_ending as (
unset e_found
['e'] R1 test non-v delete
set e_found
undouble
)
// en_ending: delete the slice prepared by the caller (this routine sets no
// slice of its own), provided the cursor is in R1 and the text before it is
// a non-vowel but not 'gem'; then undouble. 'and' binds tighter than
// sequencing, so both conditions are tried from the same cursor position.
define en_ending as (
R1 non-v and not 'gem' delete
undouble
)
// standard_suffix: the backward-mode suffix stripping, as five independent
// 'do' steps; a step that fails leaves the word to the next step.
define standard_suffix as (
// Step 1: 'heden' -> 'heid' (in R1); 'en'/'ene' handled by en_ending;
// 's'/'se' deleted in R1 when the preceding letter is not in v_j.
do (
[substring] among(
'heden'
( R1 <- 'heid'
)
'en' 'ene'
( en_ending
)
's' 'se'
( R1 non-v_j delete
)
)
)
// Step 2: final 'e' (sets or clears e_found for the 'bar' rule below).
do e_ending
// Step 3a: 'heid' deleted in R2 unless preceded by 'c'; a following
// 'en' ending is then handed to en_ending.
do ( ['heid'] R2 not 'c' delete
['en'] en_ending
)
// Step 3b: derivational endings.
do (
[substring] among(
'end' 'ing'
// after deleting, either also delete 'ig' (in R2, not after 'e')
// or else undouble
( R2 delete
(['ig'] R2 not 'e' delete) or undouble
)
'ig'
( R2 not 'e' delete
)
'lijk'
( R2 delete e_ending
)
'baar'
( R2 delete
)
'bar'
// deleted only when e_found is currently set
( R2 e_found delete
)
)
)
// Step 4: the word ends in a letter outside v_I, preceded by 'aa', 'ee',
// 'oo' or 'uu', itself preceded by a non-vowel: delete one of the two
// vowels. The 'test' restores the cursor so '[next]' slices one vowel.
do (
non-v_I
test (
among ('aa' 'ee' 'oo' 'uu')
non-v
)
[next] delete
)
)
)
// stem: external entry point. prelude and mark_regions run forwards,
// standard_suffix runs backwards from the end of the word, and postlude
// then restores the marker letters. Each stage is wrapped in 'do', so a
// failing stage does not prevent the following ones.
define stem as (
    do prelude
    do mark_regions

    backwards
        do standard_suffix

    do postlude
)
routines (
prelude postlude
e_ending
en_ending
mark_regions
R1 R2
undouble
standard_suffix
)
externals ( stem )
booleans ( e_found )
integers ( p1 p2 )
groupings ( v v_I v_j )
stringescapes {}
/* special characters (in MS-DOS Latin I) */
stringdef a" hex '84'
stringdef e" hex '89'
stringdef i" hex '8B'
stringdef o" hex '94'
stringdef u" hex '81'
stringdef a' hex 'A0'
stringdef e' hex '82'
stringdef i' hex 'A1'
stringdef o' hex 'A2'
stringdef u' hex 'A3'
stringdef e` hex '8A'
define v 'aeiouy{e`}'
define v_I v + 'I'
define v_j v + 'j'
define prelude as (
test repeat (
[substring] among(
'{a"}' '{a'}'
(<- 'a')
'{e"}' '{e'}'
(<- 'e')
'{i"}' '{i'}'
(<- 'i')
'{o"}' '{o'}'
(<- 'o')
'{u"}' '{u'}'
(<- 'u')
'' (next)
) //or next
)
try(['y'] <- 'Y')
repeat goto (
v [('i'] v <- 'I') or
('y'] <- 'Y')
)
)
define mark_regions as (
$p1 = limit
$p2 = limit
gopast v gopast non-v setmark p1
try($p1 < 3 $p1 = 3) // at least 3
gopast v gopast non-v setmark p2
)
define postlude as repeat (
[substring] among(
'Y' (<- 'y')
'I' (<- 'i')
'' (next)
) //or next
)
backwardmode (
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
define undouble as (
test among('kk' 'dd' 'tt') [next] delete
)
define e_ending as (
unset e_found
['e'] R1 test non-v delete
set e_found
undouble
)
define en_ending as (
R1 non-v and not 'gem' delete
undouble
)
define standard_suffix as (
do (
[substring] among(
'heden'
( R1 <- 'heid'
)
'en' 'ene'
( en_ending
)
's' 'se'
( R1 non-v_j delete
)
)
)
do e_ending
do ( ['heid'] R2 not 'c' delete
['en'] en_ending
)
do (
[substring] among(
'end' 'ing'
( R2 delete
(['ig'] R2 not 'e' delete) or undouble
)
'ig'
( R2 not 'e' delete
)
'lijk'
( R2 delete e_ending
)
'baar'
( R2 delete
)
'bar'
( R2 e_found delete
)
)
)
do (
non-v_I
test (
among ('aa' 'ee' 'oo' 'uu')
non-v
)
[next] delete
)
)
)
define stem as (
do prelude
do mark_regions
backwards
do standard_suffix
do postlude
)
./Snowball stem_ISO_8859_1.sbl -u -eprefix dutch_UTF_8_ -r ../runtime -o stem_UTF_8_dutch
\ No newline at end of file
strings ( ch )
integers ( x p1 p2 )
booleans ( Y_found stemmed /*GE_removed*/ )
routines (
prelude
R1 R2
C V VX
lengthen_V
Step_1 Step_2 Step_3 Step_4 Step_7
Step_6
// Step_1c
// Lose_prefix
// Lose_infix
measure
)
externals ( stem )
groupings ( v v_WX AOU AIOU)
stringescapes {}
stringdef ' hex '27' // yuk
define v 'aeiouy'
define v_WX v + 'wx'
define AOU 'aou'
define AIOU 'aiou'
stringdef a" hex 'E4'
stringdef e" hex 'EB'
stringdef i" hex 'EF'
stringdef o" hex 'F6'
stringdef u" hex 'FC'
stringdef a' hex 'E1'
stringdef e' hex 'E9'
stringdef i' hex 'ED'
stringdef o' hex 'F3'
stringdef u' hex 'FA'
stringdef e` hex 'E8'
//define v_I v + 'I'
//define v_j v + 'j'
define prelude as (
test repeat (
[substring] among(
'{a"}' '{a'}'
(<- 'a')
'{e"}' '{e'}'
(<- 'e')
'{i"}' '{i'}'
(<- 'i')
'{o"}' '{o'}'
(<- 'o')
'{u"}' '{u'}'
(<- 'u')
'' (next)
) //or next
)
try(['y'] <- 'Y')
repeat goto (
v [('i'] v <- 'I') or
('y'] <- 'Y')
)
)
backwardmode (
// Region and character-class tests used by the Step_* routines (all run in
// backward mode, so they look at the text just before the cursor).
// R1 / R2: store the cursor in x and succeed when x is at or beyond p1 / p2.
define R1 as (setmark x $x >= p1)
define R2 as (setmark x $x >= p2)
// V: a vowel or 'ij' stands before the cursor (cursor not moved).
define V as test (v or 'ij')
// VX: as V, but looking one character further back.
define VX as test (next v or 'ij')
// C: the text before the cursor is not 'ij' and is a non-vowel.
define C as test (not 'ij' non-v)
define lengthen_V as do (
non-v_WX [ (AOU] test (non-v or atlimit)) or
('e'] test (non-v or atlimit
not AIOU
not (next AIOU non-v)))
->ch insert ch
)
define Step_1 as
(
[among ( (])
'{'}s' (delete)
's' (R1 not ('t' R1) C delete)
'ies' (R1 <-'ie')
'es'
(('ar' R1 C ] delete lengthen_V) or
('er' R1 C ] delete) or
(R1 C <-'e'))
'aus' (R1 V <-'au')
'alen' (R1 <- 'aal')
'ieen' (R1 <- 'ie')
'ven' (R1 <- 'f')
'en' (('hed' R1 ] <-'heid') or
('nd' delete) or
('d' R1 C ] delete) or
('i' or 'j' V delete) or
(R1 C delete lengthen_V))
'nde' (<-'nd')
)
)
define Step_2 as
(
[among ( (])
'je' (('{'}t' ] delete) or
('et' ] R1 C delete) or
('rnt' ] <-'rn') or
('t' ] R1 VX delete) or
('ink' ] <-'ing') or
('mp' ] <-'m') or
('{'}' ] R1 delete) or
(] R1 C delete))
'ge' (R1 <-'g')
'lijke'(R1 <-'lijk')
'ische'(R1 <-'isch')
'de' (R1 C delete)
'te' (R1 <-'t')
'se' (R1 <-'s')
're' (R1 <-'r')
'le' (R1 delete attach 'l' lengthen_V)
'ene' (R1 C delete attach 'en' lengthen_V)
'ieve' (R1 C <-'ief')
)
)
define Step_3 as
(
[among ( (])
'atie' (R1 <-'eer')
'iteit' (R1 delete lengthen_V)
'heid'
'sel'
'ster' (R1 delete)
'rder' (<-'r')
'ing'
'isme'
'erij' (R1 delete lengthen_V)
'arij' (R1 C <-'aar')
'fie' (R2 delete attach 'f' lengthen_V)
'gie' (R2 delete attach 'g' lengthen_V)
'tst' (R1 C <-'t')
'dst' (R1 C <-'d')
)
)
define Step_4 as
(
( [among ( (])
'ioneel' (R1 <-'ie')
'atief' (R1 <-'eer')
'baar' (R1 delete)
'naar' (R1 V <-'n')
'laar' (R1 V <-'l')
'raar' (R1 V <-'r')
'tant' (R1 <-'teer')
'lijker'
'lijkst' (R1 <-'lijk')
'achtig'
'achtiger'
'achtigst'(R1 delete)
'eriger'
'erigst'
'erig'
'end' (R1 C delete lengthen_V)
)
)
or
( [among ( (])
'iger'
'igst'
'ig' (R1 C delete lengthen_V)
)
)
)
// Step_7: tidy the end of the word by shortening 'iee' and 'eee' by one
// 'e' and dropping the 't' of a final 'kt', 'ft' or 'pt'.
define Step_7 as
(
    [substring] among (
        'iee' ( <- 'ie' )
        'eee' ( <- 'ee' )
        'kt'  ( <- 'k' )
        'ft'  ( <- 'f' )
        'pt'  ( <- 'p' )
    )
)
define Step_6 as
(
[among ( (])
'bb' (<-'b')
'cc' (<-'c')
'dd' (<-'d')
'ff' (<-'f')
'gg' (<-'g')
'hh' (<-'h')
'jj' (<-'j')
'kk' (<-'k')
'll' (<-'l')
'mm' (<-'m')
'nn' (<-'n')
'pp' (<-'p')
'qq' (<-'q')
'rr' (<-'r')
'ss' (<-'s')
'tt' (<-'t')
'vv' (<-'v')
'ww' (<-'w')
'xx' (<-'x')
'zz' (<-'z')
'v' (<-'f')
'z' (<-'s')
)
)
/*
define Step_1c as
(
[among ( (] R1 C)
'd' (not ('n' R1) delete)
't' (not ('h' R1) delete)
)
)
*/
)
/*
define Lose_prefix as (
['ge'] test hop 3 (goto v goto non-v)
set GE_removed
delete
)
define Lose_infix as (
next
gopast (['ge']) test hop 3 (goto v goto non-v)
set GE_removed
delete
)
*/
// measure: set the region marks p1 and p2.
define measure as (
// Default: both marks at the end of the word ('do' restores the cursor).
do (
tolimit
setmark p1
setmark p2
)
// p1: after any leading non-vowels, one or more vowels ('ij' counts as
// one) and a single non-vowel. p2: the same pattern again, starting at p1.
// If the pattern is not found the corresponding mark keeps its default.
do(
repeat non-v atleast 1 ('ij' or v) non-v setmark p1
repeat non-v atleast 1 ('ij' or v) non-v setmark p2
)
)
// stem: external entry point.
// prelude and measure are called without 'do', so stem fails if either
// of them fails.
define stem as (
prelude
unset Y_found
unset stemmed
// Mark 'y' at the cursor position, and 'y' after a vowel, as 'Y';
// Y_found remembers that this happened.
do ( ['y'] <-'Y' set Y_found )
do repeat(goto (v ['y'])<-'Y' set Y_found )
measure
// Steps 1-4; 'stemmed' is set as soon as one of them succeeds.
backwards (
do (Step_1 set stemmed )
do (Step_2 set stemmed )
do (Step_3 set stemmed )
do (Step_4 set stemmed )
)
/*
unset GE_removed
do (Lose_prefix and measure)
backwards (
do (GE_removed Step_1c)
)
unset GE_removed
do (Lose_infix and measure)
backwards (
do (GE_removed Step_1c)
)
*/
// Step_7 always gets a chance; Step_6 runs only when nothing has set
// 'stemmed' up to this point.
backwards (
do (Step_7 set stemmed )
do (stemmed or Step_6)
)
// Restore every 'Y' to 'y', but only if one was introduced above.
do(Y_found repeat(goto (['Y']) <-'y'))
)
integers ( p1 p2 )
booleans ( Y_found )
routines (
prelude postlude
mark_regions
shortv
R1 R2
Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5
exception1
exception2
)
externals ( stem )
groupings ( v v_WXY valid_LI )
stringescapes {}
define v 'aeiouy'
define v_WXY v + 'wxY'
define valid_LI 'cdeghkmnrt'
// prelude: drop an apostrophe at the start of the word and mark every 'y'
// that is word-initial or follows a vowel as 'Y' (which is not in grouping
// v). Y_found records whether any 'y' was marked.
define prelude as (
    unset Y_found

    do (
        [ '{'}' ]
        delete
    )
    do (
        [ 'y' ]
        <- 'Y'
        set Y_found
    )
    do repeat (
        goto ( v [ 'y' ] )
        <- 'Y'
        set Y_found
    )
)
// mark_regions: set p1 and p2 (both default to limit). p1 normally falls
// just past the first non-vowel that follows a vowel, but for words that
// start with one of the listed prefixes it falls directly after that
// prefix. p2 is found by repeating the vowel / non-vowel search from p1.
define mark_regions as (
$p1 = limit
$p2 = limit
do(
among (
'gener'
'commun' // added May 2005
'arsen' // added Nov 2006 (arsenic/arsenal)
// ... extensions possible here ...
) or (gopast v gopast non-v)
setmark p1
gopast v gopast non-v setmark p2
)
)
backwardmode (
// shortv (backward mode, so read right to left from the cursor): succeeds
// when the text before the cursor is either
//   non-vowel, vowel, then a non-vowel that is not w, x or Y; or
//   non-vowel, vowel, with nothing before the non-vowel (start of word).
define shortv as (
( non-v_WXY v non-v )
or
( non-v v atlimit )
)
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
// Step_1a: remove apostrophe endings, then handle plural-like endings.
define Step_1a as (
// Optional: delete a trailing ', 's or 's'.
try (
[substring] among (
'{'}' '{'}s' '{'}s{'}'
(delete)
)
)
[substring] among (
'sses' (<-'ss')
'ied' 'ies'
// 'i' if at least two characters precede the ending, otherwise 'ie'
((hop 2 <-'i') or <-'ie')
// delete 's' if a vowel occurs somewhere before the letter that
// immediately precedes the 's'
's' (next gopast v delete)
// no action: being longer matches than 's', these leave the word as is
'us' 'ss'
)
)
// Step_1b: handle 'eed'/'eedly' and the 'ed', 'edly', 'ing', 'ingly'
// endings, repairing the stem after a deletion.
define Step_1b as (
[substring] among (
'eed' 'eedly'
(R1 <-'ee')
'ed' 'edly' 'ing' 'ingly'
(
// delete only if a vowel occurs somewhere before the ending
test gopast v delete
// then inspect the new ending without moving the cursor
test substring among(
'at' 'bl' 'iz'
(<+ 'e')
'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
// ignoring double c, h, j, k, q, v, w, and x
([next] delete)
// no listed ending: add 'e' when the cursor is exactly at p1 and the
// word ends in a short syllable (shortv)
'' (atmark p1 test shortv <+ 'e')
)
)
)
)
// Step_1c: replace a final 'y' or 'Y' by 'i' when the letter before it is
// a non-vowel and that non-vowel is not the first letter of the word.
define Step_1c as (
    [ 'y' or 'Y' ]
    non-v
    not atlimit
    <- 'i'
)
define Step_2 as (
[substring] R1 among (
'tional' (<-'tion')
'enci' (<-'ence')
'anci' (<-'ance')
'abli' (<-'able')
'entli' (<-'ent')
'izer' 'ization'
(<-'ize')
'ational' 'ation' 'ator'
(<-'ate')
'alism' 'aliti' 'alli'
(<-'al')
'fulness' (<-'ful')
'ousli' 'ousness'
(<-'ous')
'iveness' 'iviti'
(<-'ive')
'biliti' 'bli'
(<-'ble')
'ogi' ('l' <-'og')
'fulli' (<-'ful')
'lessli' (<-'less')
'li' (valid_LI delete)
)
)
define Step_3 as (
[substring] R1 among (
'tional' (<- 'tion')
'ational' (<- 'ate')
'alize' (<-'al')
'icate' 'iciti' 'ical'
(<-'ic')
'ful' 'ness'
(delete)
'ative'
(R2 delete) // 'R2' added Dec 2001
)
)
define Step_4 as (
[substring] R2 among (
'al' 'ance' 'ence' 'er' 'ic' 'able' 'ible' 'ant' 'ement'
'ment' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
(delete)
'ion' ('s' or 't' delete)
)
)
// Step_5: final 'e' and 'l'.
define Step_5 as (
[substring] among (
// delete 'e' if in R2, or if in R1 and not preceded by a short syllable
'e' (R2 or (R1 not shortv) delete)
// delete 'l' if in R2 and preceded by another 'l'
'l' (R2 'l' delete)
)
)
define exception2 as (
[substring] atlimit among(
'inning' 'outing' 'canning' 'herring' 'earring'
'proceed' 'exceed' 'succeed'
// ... extensions possible here ...
)
)
)
define exception1 as (
[substring] atlimit among(
/* special changes: */
'skis' (<-'ski')
'skies' (<-'sky')
'dying' (<-'die')
'lying' (<-'lie')
'tying' (<-'tie')
/* special -LY cases */
'idly' (<-'idl')
'gently' (<-'gentl')
'ugly' (<-'ugli')
'early' (<-'earli')
'only' (<-'onli')
'singly' (<-'singl')
// ... extensions possible here ...
/* invariant forms: */
'sky'
'news'
'howe'
'atlas' 'cosmos' 'bias' 'andes' // not plural forms
// ... extensions possible here ...
)
)
// postlude: if Y_found is set, turn every 'Y' in the word back into 'y'.
define postlude as (Y_found repeat(goto (['Y']) <-'y'))
// stem: external entry point. Three alternatives are tried in order:
//  1. exception1 handles the word completely;
//  2. 'not hop 3' succeeds for words shorter than three characters, which
//     are therefore left untouched;
//  3. otherwise the full algorithm runs.
define stem as (
exception1 or
not hop 3 or (
do prelude
do mark_regions
backwards (
do Step_1a
// a word matching exception2 after Step_1a skips the remaining steps
exception2 or (
do Step_1b
do Step_1c
do Step_2
do Step_3
do Step_4
do Step_5
)
)
do postlude
)
)
/* Finnish stemmer.
Numbers in square brackets refer to the sections in
Fred Karlsson, Finnish: An Essential Grammar. Routledge, 1999
ISBN 0-415-20705-3
*/
routines (
mark_regions
R2
particle_etc possessive
LONG VI
case_ending
i_plural
t_plural
other_endings
tidy
)
externals ( stem )
integers ( p1 p2 )
strings ( x )
booleans ( ending_removed )
groupings ( AEI V1 V2 particle_end )
stringescapes {}
/* special characters (in ISO Latin I) */
stringdef a" hex 'E4'
stringdef o" hex 'F6'
define AEI 'a{a"}ei'
define V1 'aeiouy{a"}{o"}'
define V2 'aeiou{a"}{o"}'
define particle_end V1 + 'nt'
// mark_regions: set p1 and p2 (default: limit). Each mark is placed just
// past the first non-vowel found after the next vowel; the search for p2
// continues from p1.
define mark_regions as (
    $p1 = limit
    $p2 = limit

    goto V1
    gopast non-V1
    setmark p1

    goto V1
    gopast non-V1
    setmark p2
)
backwardmode (
define R2 as $p2 <= cursor
// particle_etc: delete a particle ending or the adverb ending 'sti'. The
// ending itself must lie in the p1 region. A particle is removed only when
// the letter before it is in particle_end; 'sti' only when it lies in R2.
define particle_etc as (
setlimit tomark p1 for ([substring])
among(
'kin'
'kaan' 'k{a"}{a"}n'
'ko' 'k{o"}'
'han' 'h{a"}n'
'pa' 'p{a"}' // Particles [91]
(particle_end)
'sti' // Adverb [87]
(R2)
)
delete
)
define possessive as ( // [36]
setlimit tomark p1 for ([substring])
among(
'si'
(not 'k' delete) // take 'ksi' as the Comitative case
'ni'
(delete ['kse'] <- 'ksi') // kseni = ksi + ni
'nsa' 'ns{a"}'
'mme'
'nne'
(delete)
/* Now for Vn possessives after case endings: [36] */
'an'
(among('ta' 'ssa' 'sta' 'lla' 'lta' 'na') delete)
'{a"}n'
(among('t{a"}' 'ss{a"}' 'st{a"}'
'll{a"}' 'lt{a"}' 'n{a"}') delete)
'en'
(among('lle' 'ine') delete)
)
)
define LONG as
among('aa' 'ee' 'ii' 'oo' 'uu' '{a"}{a"}' '{o"}{o"}')
define VI as ('i' V2)
define case_ending as (
setlimit tomark p1 for ([substring])
among(
'han' ('a') //-.
'hen' ('e') // |
'hin' ('i') // |
'hon' ('o') // |
'h{a"}n' ('{a"}') // Illative [43]
'h{o"}n' ('{o"}') // |
'siin' VI // |
'seen' LONG //-'
'den' VI
'tten' VI // Genitive plurals [34]
()
'n' // Genitive or Illative
( try ( LONG // Illative
or 'ie' // Genitive
and next ]
)
/* otherwise Genitive */
)
'a' '{a"}' //-.
(V1 non-V1) // |
'tta' 'tt{a"}' // Partitive [32]
('e') // |
'ta' 't{a"}' //-'
'ssa' 'ss{a"}' // Inessive [41]
'sta' 'st{a"}' // Elative [42]
'lla' 'll{a"}' // Adessive [44]
'lta' 'lt{a"}' // Ablative [51]
'lle' // Allative [46]
'na' 'n{a"}' // Essive [49]
'ksi' // Translative[50]
'ine' // Comitative [51]
/* Abessive and Instructive are too rare for
inclusion [51] */
)
delete
set ending_removed
)
define other_endings as (
setlimit tomark p2 for ([substring])
among(
'mpi' 'mpa' 'mp{a"}'
'mmi' 'mma' 'mm{a"}' // Comparative forms [85]
(not 'po') //-improves things
'impi' 'impa' 'imp{a"}'
'immi' 'imma' 'imm{a"}' // Superlative forms [86]
'eja' 'ej{a"}' // indicates agent [93.1B]
)
delete
)
// i_plural [26]: delete a final 'i' or 'j' that lies in the p1 region.
define i_plural as (
    setlimit tomark p1 for ( [substring] )
    among ( 'i' 'j' )
    delete
)
// t_plural [26]: delete a final 't' that lies in the p1 region and follows
// a vowel; then, in the p2 region, delete a following 'mma' (unless 'po'
// precedes it) or 'imma'.
define t_plural as ( // [26]
setlimit tomark p1 for (
['t'] test V1
delete
)
setlimit tomark p2 for ([substring])
among(
'mma' (not 'po') //-mmat endings
'imma' //-immat endings
)
delete
)
// tidy: clean up the end of the stem. The four 'do' steps are confined to
// the p1 region; the final consonant undoubling is not.
define tidy as (
setlimit tomark p1 for (
do ( LONG and ([next] delete ) ) // undouble vowel
do ( [AEI] non-V1 delete ) // remove trailing a, a", e, i
// delete a final 'j' preceded by 'o' or 'u'
do ( ['j'] 'o' or 'u' delete )
// delete a final 'o' preceded by 'j'
do ( ['o'] 'j' delete )
)
// Move back to the last non-vowel, copy it into x, and delete it if the
// letter before it is the same.
goto non-V1 [next] -> x x delete // undouble consonant
)
)
// stem: external entry point. Marks the regions, then strips endings
// from the end of the word.
define stem as (
do mark_regions
unset ending_removed
backwards (
do particle_etc
do possessive
do case_ending
do other_endings
// i_plural is tried only when case_ending removed something (it sets
// ending_removed); otherwise t_plural is tried instead.
(ending_removed do i_plural) or do t_plural
do tidy
)
)
routines (
prelude postlude mark_regions
RV R1 R2
standard_suffix
i_verb_suffix
verb_suffix
residual_suffix
un_double
un_accent
)
externals ( stem )
integers ( pV p1 p2 )
groupings ( v keep_with_s )
stringescapes {}
/* special characters (in ISO Latin I) */
stringdef a^ hex 'E2' // a-circumflex
stringdef a` hex 'E0' // a-grave
stringdef c, hex 'E7' // c-cedilla
stringdef e" hex 'EB' // e-diaeresis (rare)
stringdef e' hex 'E9' // e-acute
stringdef e^ hex 'EA' // e-circumflex
stringdef e` hex 'E8' // e-grave
stringdef i" hex 'EF' // i-diaeresis
stringdef i^ hex 'EE' // i-circumflex
stringdef o^ hex 'F4' // o-circumflex
stringdef u^ hex 'FB' // u-circumflex
stringdef u` hex 'F9' // u-grave
define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'
// prelude: mark letters that act as consonants by putting them in upper
// case (upper-case letters are not in grouping v):
//   'u' or 'i' between two vowels      -> 'U' / 'I'
//   'y' after a vowel or before a vowel -> 'Y'
//   'u' after 'q'                       -> 'U'
define prelude as repeat goto (
( v [ ('u' ] v <- 'U') or
('i' ] v <- 'I') or
('y' ] <- 'Y')
)
or
( ['y'] v <- 'Y' )
or
( 'q' ['u'] <- 'U' )
)
// mark_regions: set pV, p1 and p2; each keeps the default 'limit' when its
// search fails.
//  pV: after the third letter if the word starts with two vowels; directly
//      after 'par', 'col' or 'tap' if the word starts with one of those;
//      otherwise just past the first vowel that is not the first letter.
//  p1: just past the first non-vowel that follows a vowel.
//  p2: the same search repeated from p1.
define mark_regions as (
$pV = limit
$p1 = limit
$p2 = limit // defaults
do (
( v v next )
or
among ( // this exception list begun Nov 2006
'par' // paris, parie, pari
'col' // colis
'tap' // tapis
// extensions possible here
)
or
( next gopast v )
setmark pV
)
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)
// postlude: walk through the word and turn the marker letters 'I', 'U'
// and 'Y' back into lower case.
define postlude as repeat (
    [substring] among (
        'I' ( <- 'i' )
        'U' ( <- 'u' )
        'Y' ( <- 'y' )
        ''  ( next )        // anything else: step over one character
    )
)
backwardmode (
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
define standard_suffix as (
[substring] among(
'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
'ances' 'iqUes' 'ismes' 'ables' 'istes'
( R2 delete )
'atrice' 'ateur' 'ation'
'atrices' 'ateurs' 'ations'
( R2 delete
try ( ['ic'] (R2 delete) or <-'iqU' )
)
'logie'
'logies'
( R2 <- 'log' )
'usion' 'ution'
'usions' 'utions'
( R2 <- 'u' )
'ence'
'ences'
( R2 <- 'ent' )
'ement'
'ements'
(
RV delete
try (
[substring] among(
'iv' (R2 delete ['at'] R2 delete)
'eus' ((R2 delete) or (R1<-'eux'))
'abl' 'iqU'
(R2 delete)
'i{e`}r' 'I{e`}r' //)
(RV <-'i') //)--new 2 Sept 02
)
)
)
'it{e'}'
'it{e'}s'
(
R2 delete
try (
[substring] among(
'abil' ((R2 delete) or <-'abl')
'ic' ((R2 delete) or <-'iqU')
'iv' (R2 delete)
)
)
)
'if' 'ive'
'ifs' 'ives'
(
R2 delete
try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
)
'eaux' (<- 'eau')
'aux' (R1 <- 'al')
'euse'
'euses'((R2 delete) or (R1<-'eux'))
'issement'
'issements'(R1 non-v delete) // verbal
// fail(...) below forces entry to verb_suffix. -ment typically
// follows the p.p., e.g 'confus{e'}ment'.
'amment' (RV fail(<- 'ant'))
'emment' (RV fail(<- 'ent'))
'ment'
'ments' (test(v RV) fail(delete))
// v is e,i,u,{e'},I or U
)
)
define i_verb_suffix as setlimit tomark pV for (
[substring] among (
'{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai'
'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez'
'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
'issez' 'issiez' 'issions' 'issons' 'it'
(non-v delete)
)
)
define verb_suffix as setlimit tomark pV for (
[substring] among (
'ions'
(R2 delete)
'{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai'
'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions'
'erons' 'eront' 'ez' 'iez'
// 'ons' //-best omitted
(delete)
'{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant'
'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez'
'assions'
(delete
try(['e'] delete)
)
)
)
// keep_with_s: letters after which a final 's' is kept.
define keep_with_s 'aiou{e`}s'
// residual_suffix: fallback ending removal.
define residual_suffix as (
// Drop a final 's' unless the letter before it is in keep_with_s.
try(['s'] test non-keep_with_s delete)
// The ending matched below must lie in the pV region.
setlimit tomark pV for (
[substring] among(
// 'ion' is deleted when in R2 and preceded by 's' or 't'
'ion' (R2 's' or 't' delete)
'ier' 'i{e`}re'
'Ier' 'I{e`}re' (<-'i')
'e' (delete)
// e-diaeresis is deleted when preceded by 'gu'
'{e"}' ('gu' delete)
)
)
)
// un_double: if the word ends in 'enn', 'onn', 'ett', 'ell' or 'eill',
// delete its last letter. The among runs under 'test', so the cursor is
// back at the end of the word when '[next]' slices that letter.
define un_double as (
test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
)
// un_accent: if the word ends in one or more non-vowels preceded by
// e-acute or e-grave, replace that accented letter by plain 'e'.
define un_accent as (
atleast 1 non-v
[ '{e'}' or '{e`}' ] <-'e'
)
)
// stem: external entry point.
define stem as (
do prelude
do mark_regions
backwards (
do (
(
// The first of the three suffix routines to succeed wins ...
( standard_suffix or
i_verb_suffix or
verb_suffix
)
and
// ... and then a final 'Y' becomes 'i' or a final c-cedilla becomes 'c'.
try( [ ('Y' ] <- 'i' ) or
('{c,}'] <- 'c' )
)
) or
// residual_suffix is the fallback when none of the three succeeded.
residual_suffix
)
// try(['ent'] RV delete) // is best omitted
do un_double
do un_accent
)
do postlude
)
routines (
prelude postlude mark_regions
RV R1 R2
standard_suffix
i_verb_suffix
verb_suffix
residual_suffix
un_double
un_accent
)
externals ( stem )
integers ( pV p1 p2 )
groupings ( v keep_with_s )
stringescapes {}
/* special characters (in MS-DOS Latin I) */
stringdef a^ hex '83' // a-circumflex
stringdef a` hex '85' // a-grave
stringdef c, hex '87' // c-cedilla
stringdef e" hex '89' // e-diaeresis (rare)
stringdef e' hex '82' // e-acute
stringdef e^ hex '88' // e-circumflex
stringdef e` hex '8A' // e-grave
stringdef i" hex '8B' // i-diaeresis
stringdef i^ hex '8C' // i-circumflex
stringdef o^ hex '93' // o-circumflex
stringdef u^ hex '96' // u-circumflex
stringdef u` hex '97' // u-grave
define v 'aeiouy{a^}{a`}{e"}{e'}{e^}{e`}{i"}{i^}{o^}{u^}{u`}'
define prelude as repeat goto (
( v [ ('u' ] v <- 'U') or
('i' ] v <- 'I') or
('y' ] <- 'Y')
)
or
( ['y'] v <- 'Y' )
or
( 'q' ['u'] <- 'U' )
)
// mark_regions: set pV, p1 and p2; each keeps the default 'limit' when its
// search fails.
//  pV: after the third letter if the word starts with two vowels; directly
//      after 'par', 'col' or 'tap' if the word starts with one of those;
//      otherwise just past the first vowel that is not the first letter.
//  p1: just past the first non-vowel that follows a vowel.
//  p2: the same search repeated from p1.
// The 'par'/'col'/'tap' exception list was missing here although the ISO
// Latin I variant of this stemmer has it, so the two encodings produced
// different pV marks (and hence different stems) for the same word.
define mark_regions as (
    $pV = limit
    $p1 = limit
    $p2 = limit // defaults
    do (
        ( v v next )
        or
        among ( // same exception list as the ISO Latin I variant
            'par' // paris, parie, pari
            'col' // colis
            'tap' // tapis
            // extensions possible here
        )
        or
        ( next gopast v )
        setmark pV
    )
    do (
        gopast v gopast non-v setmark p1
        gopast v gopast non-v setmark p2
    )
)
define postlude as repeat (
[substring] among(
'I' (<- 'i')
'U' (<- 'u')
'Y' (<- 'y')
'' (next)
)
)
backwardmode (
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
define standard_suffix as (
[substring] among(
'ance' 'iqUe' 'isme' 'able' 'iste' 'eux'
'ances' 'iqUes' 'ismes' 'ables' 'istes'
( R2 delete )
'atrice' 'ateur' 'ation'
'atrices' 'ateurs' 'ations'
( R2 delete
try ( ['ic'] (R2 delete) or <-'iqU' )
)
'logie'
'logies'
( R2 <- 'log' )
'usion' 'ution'
'usions' 'utions'
( R2 <- 'u' )
'ence'
'ences'
( R2 <- 'ent' )
'ement'
'ements'
(
RV delete
try (
[substring] among(
'iv' (R2 delete ['at'] R2 delete)
'eus' ((R2 delete) or (R1<-'eux'))
'abl' 'iqU'
(R2 delete)
'i{e`}r' 'I{e`}r' //)
(RV <-'i') //)--new 2 Sept 02
)
)
)
'it{e'}'
'it{e'}s'
(
R2 delete
try (
[substring] among(
'abil' ((R2 delete) or <-'abl')
'ic' ((R2 delete) or <-'iqU')
'iv' (R2 delete)
)
)
)
'if' 'ive'
'ifs' 'ives'
(
R2 delete
try ( ['at'] R2 delete ['ic'] (R2 delete) or <-'iqU' )
)
'eaux' (<- 'eau')
'aux' (R1 <- 'al')
'euse'
'euses'((R2 delete) or (R1<-'eux'))
'issement'
'issements'(R1 non-v delete) // verbal
// fail(...) below forces entry to verb_suffix. -ment typically
// follows the p.p., e.g 'confus{e'}ment'.
'amment' (RV fail(<- 'ant'))
'emment' (RV fail(<- 'ent'))
'ment'
'ments' (test(v RV) fail(delete))
// v is e,i,u,{e'},I or U
)
)
define i_verb_suffix as setlimit tomark pV for (
[substring] among (
'{i^}mes' '{i^}t' '{i^}tes' 'i' 'ie' 'ies' 'ir' 'ira' 'irai'
'iraIent' 'irais' 'irait' 'iras' 'irent' 'irez' 'iriez'
'irions' 'irons' 'iront' 'is' 'issaIent' 'issais' 'issait'
'issant' 'issante' 'issantes' 'issants' 'isse' 'issent' 'isses'
'issez' 'issiez' 'issions' 'issons' 'it'
(non-v delete)
)
)
define verb_suffix as setlimit tomark pV for (
[substring] among (
'ions'
(R2 delete)
'{e'}' '{e'}e' '{e'}es' '{e'}s' '{e`}rent' 'er' 'era' 'erai'
'eraIent' 'erais' 'erait' 'eras' 'erez' 'eriez' 'erions'
'erons' 'eront' 'ez' 'iez'
// 'ons' //-best omitted
(delete)
'{a^}mes' '{a^}t' '{a^}tes' 'a' 'ai' 'aIent' 'ais' 'ait' 'ant'
'ante' 'antes' 'ants' 'as' 'asse' 'assent' 'asses' 'assiez'
'assions'
(delete
try(['e'] delete)
)
)
)
define keep_with_s 'aiou{e`}s'
define residual_suffix as (
try(['s'] test non-keep_with_s delete)
setlimit tomark pV for (
[substring] among(
'ion' (R2 's' or 't' delete)
'ier' 'i{e`}re'
'Ier' 'I{e`}re' (<-'i')
'e' (delete)
'{e"}' ('gu' delete)
)
)
)
define un_double as (
test among('enn' 'onn' 'ett' 'ell' 'eill') [next] delete
)
define un_accent as (
atleast 1 non-v
[ '{e'}' or '{e`}' ] <-'e'
)
)
define stem as (
do prelude
do mark_regions
backwards (
do (
(
( standard_suffix or
i_verb_suffix or
verb_suffix
)
and
try( [ ('Y' ] <- 'i' ) or
('{c,}'] <- 'c' )
)
) or
residual_suffix
)
// try(['ent'] RV delete) // is best omitted
do un_double
do un_accent
)
do postlude
)
routines (
prelude postlude
mark_regions
R1 R2
standard_suffix
)
externals ( stem )
integers ( p1 p2 x )
groupings ( v s_ending st_ending )
stringescapes {}
/* special characters (in ISO Latin I) */
stringdef a" hex 'E4'
stringdef o" hex 'F6'
stringdef u" hex 'FC'
stringdef ss hex 'DF'
define v 'aeiouy{a"}{o"}{u"}'
define s_ending 'bdfghklmnrt'
define st_ending s_ending - 'r'
// prelude: replace sharp s by 'ss', then mark 'u' and 'y' standing between
// two vowels as 'U' and 'Y' (upper-case letters are not in grouping v).
define prelude as (
// 'test' puts the cursor back where it started once the pass is finished.
test repeat (
(
['{ss}'] <- 'ss'
) or next
)
repeat goto (
v [('u'] v <- 'U') or
('y'] v <- 'Y')
)
)
// mark_regions: set p1 and p2 (default: limit). p1 falls just past the
// first non-vowel that follows a vowel, but never before x, the position
// after the third character. p2 is found by repeating the search from
// where the p1 search stopped. The routine fails early, leaving both marks
// at limit, when the word has fewer than three characters.
define mark_regions as (
    $p1 = limit
    $p2 = limit

    test (
        hop 3
        setmark x
    )

    gopast v
    gopast non-v
    setmark p1

    try (
        $p1 < x
        $p1 = x
    ) // at least 3

    gopast v
    gopast non-v
    setmark p2
)
define postlude as repeat (
[substring] among(
'Y' (<- 'y')
'U' (<- 'u')
'{a"}' (<- 'a')
'{o"}' (<- 'o')
'{u"}' (<- 'u')
'' (next)
)
)
backwardmode (
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
define standard_suffix as (
do (
[substring] R1 among(
'e' 'em' 'en' 'ern' 'er' 'es'
( delete
)
's'
( s_ending delete
)
)
)
do (
[substring] R1 among(
'en' 'er' 'est'
( delete
)
'st'
( st_ending hop 3 delete
)
)
)
do (
[substring] R2 among(
'end' 'ung'
( delete
try (['ig'] not 'e' R2 delete)
)
'ig' 'ik' 'isch'
( not 'e' delete
)
'lich' 'heit'
( delete
try (
['er' or 'en'] R1 delete
)
)
'keit'
( delete
try (
[substring] R2 among(
'lich' 'ig'
( delete
)
)
)
)
)
)
)
)
// stem: external entry point. prelude and mark_regions run forwards,
// standard_suffix runs backwards from the end of the word, and postlude
// then restores the marker letters and removes the umlauts. Each stage is
// wrapped in 'do', so a failing stage does not stop the following ones.
define stem as (
    do prelude
    do mark_regions

    backwards
        do standard_suffix

    do postlude
)
routines (
prelude postlude
mark_regions
R1 R2
standard_suffix
)
externals ( stem )
integers ( p1 p2 x )
groupings ( v s_ending st_ending )
stringescapes {}
/* special characters (in MS-DOS Latin I) */
stringdef a" hex '84'
stringdef o" hex '94'
stringdef u" hex '81'
stringdef ss hex 'E1'
define v 'aeiouy{a"}{o"}{u"}'
define s_ending 'bdfghklmnrt'
define st_ending s_ending - 'r'
define prelude as (
test repeat (
(
['{ss}'] <- 'ss'
) or next
)
repeat goto (
v [('u'] v <- 'U') or
('y'] v <- 'Y')
)
)
define mark_regions as (
$p1 = limit
$p2 = limit
test(hop 3 setmark x)
gopast v gopast non-v setmark p1
try($p1 < x $p1 = x) // at least 3
gopast v gopast non-v setmark p2
)
define postlude as repeat (
[substring] among(
'Y' (<- 'y')
'U' (<- 'u')
'{a"}' (<- 'a')
'{o"}' (<- 'o')
'{u"}' (<- 'u')
'' (next)
)
)
backwardmode (
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
define standard_suffix as (
do (
[substring] R1 among(
'e' 'em' 'en' 'ern' 'er' 'es'
( delete
)
's'
( s_ending delete
)
)
)
do (
[substring] R1 among(
'en' 'er' 'est'
( delete
)
'st'
( st_ending hop 3 delete
)
)
)
do (
[substring] R2 among(
'end' 'ung'
( delete
try (['ig'] not 'e' R2 delete)
)
'ig' 'ik' 'isch'
( not 'e' delete
)
'lich' 'heit'
( delete
try (
['er' or 'en'] R1 delete
)
)
'keit'
( delete
try (
[substring] R2 among(
'lich' 'ig'
( delete
)
)
)
)
)
)
)
)
define stem as (
do prelude
do mark_regions
backwards
do standard_suffix
do postlude
)
routines (
prelude postlude
mark_regions
R1 R2
standard_suffix
)
externals ( stem )
integers ( p1 p2 x )
groupings ( v s_ending st_ending )
stringescapes {}
/* special characters (in ISO Latin I) */
stringdef a" hex 'E4'
stringdef o" hex 'F6'
stringdef u" hex 'FC'
stringdef ss hex 'DF'
define v 'aeiouy{a"}{o"}{u"}'
define s_ending 'bdfghklmnrt'
define st_ending s_ending - 'r'
// prelude (variant that also accepts 'ae', 'oe', 'ue' spellings):
//  1. mark 'u' and 'y' standing between two vowels as 'U' and 'Y';
//  2. replace sharp s by 'ss', and 'ae', 'oe', 'ue' by the corresponding
//     umlaut, except that the 'u' of 'qu' never starts a 'ue'.
define prelude as (
    // 'test' puts the cursor back where it started after the first pass.
    test repeat goto (
        v [('u'] v <- 'U') or
           ('y'] v <- 'Y')
    )
    repeat (
        [substring] among(
            '{ss}' (<- 'ss')
            'ae'   (<- '{a"}')
            'oe'   (<- '{o"}')
            'ue'   (<- '{u"}')
            // 'qu' is listed only so that its 'u' is consumed and cannot
            // begin a 'ue' match. The cursor is already past 'qu' at this
            // point, so nothing further is skipped. The former 'hop 2'
            // jumped over the next two letters as well, missing e.g. an
            // 'ae' directly after 'qu', and ended the whole pass early when
            // fewer than two letters followed.
            'qu'   ()
            ''     (next)
        )
    )
)
define mark_regions as (
$p1 = limit
$p2 = limit
test(hop 3 setmark x)
gopast v gopast non-v setmark p1
try($p1 < x $p1 = x) // at least 3
gopast v gopast non-v setmark p2
)
define postlude as repeat (
[substring] among(
'Y' (<- 'y')
'U' (<- 'u')
'{a"}' (<- 'a')
'{o"}' (<- 'o')
'{u"}' (<- 'u')
'' (next)
)
)
backwardmode (
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
define standard_suffix as (
do (
[substring] R1 among(
'e' 'em' 'en' 'ern' 'er' 'es'
( delete
)
's'
( s_ending delete
)
)
)
do (
[substring] R1 among(
'en' 'er' 'est'
( delete
)
'st'
( st_ending hop 3 delete
)
)
)
do (
[substring] R2 among(
'end' 'ung'
( delete
try (['ig'] not 'e' R2 delete)
)
'ig' 'ik' 'isch'
( not 'e' delete
)
'lich' 'heit'
( delete
try (
['er' or 'en'] R1 delete
)
)
'keit'
( delete
try (
[substring] R2 among(
'lich' 'ig'
( delete
)
)
)
)
)
)
)
)
define stem as (
do prelude
do mark_regions
backwards
do standard_suffix
do postlude
)
/*
Hungarian Stemmer
Removes noun inflections
*/
routines (
mark_regions
R1
v_ending
case
case_special
case_other
plural
owned
sing_owner
plur_owner
instrum
factive
undouble
double
)
externals ( stem )
integers ( p1 )
groupings ( v )
stringescapes {}
/* special characters (in ISO Latin 2) */
stringdef a' hex 'E1' //a-acute
stringdef e' hex 'E9' //e-acute
stringdef i' hex 'ED' //i-acute
stringdef o' hex 'F3' //o-acute
stringdef o" hex 'F6' //o-umlaut
stringdef oq hex 'F5' //o-double acute
stringdef u' hex 'FA' //u-acute
stringdef u" hex 'FC' //u-umlaut
stringdef uq hex 'FB' //u-double acute
define v 'aeiou{a'}{e'}{i'}{o'}{o"}{oq}{u'}{u"}{uq}'
// mark_regions: set p1 (default: limit).
//  - Word starts with a vowel: move to the first non-vowel, step over it
//    (a listed digraph/trigraph counts as a single step) and mark p1 there.
//  - Word starts with a non-vowel: mark p1 just past the first vowel.
define mark_regions as (
$p1 = limit
(v goto non-v
among('cs' 'gy' 'ly' 'ny' 'sz' 'ty' 'zs' 'dzs') or next
setmark p1)
or
(non-v gopast v setmark p1)
)
backwardmode (
define R1 as $p1 <= cursor
define v_ending as (
[substring] R1 among(
'{a'}' (<- 'a')
'{e'}' (<- 'e')
)
)
// double: succeeds, without moving the cursor, when the text before the
// cursor is one of the listed doubled consonants (including doubled
// digraphs such as 'ccs' or 'ssz').
define double as (
test among('bb' 'cc' 'ccs' 'dd' 'ff' 'gg' 'ggy' 'jj' 'kk' 'll' 'lly' 'mm'
'nn' 'nny' 'pp' 'rr' 'ss' 'ssz' 'tt' 'tty' 'vv' 'zz' 'zzs')
)
// undouble (backward mode): step over the last character, then delete the
// character before it. Removing the second-to-last letter turns e.g. 'bb'
// into 'b' and 'ccs' into 'cs'.
define undouble as (
next [hop 1] delete
)
// instrum: delete a final 'al' or 'el' that lies in R1 and follows a
// doubled consonant, then reduce that doubled consonant with undouble.
define instrum as (
    [substring] R1 among (
        'al' 'el'
            ( double )
    )
    delete
    undouble
)
define case as (
[substring] R1 among(
'ban' 'ben'
'ba' 'be'
'ra' 're'
'nak' 'nek'
'val' 'vel'
't{o'}l' 't{oq}l'
'r{o'}l' 'r{oq}l'
'b{o'}l' 'b{oq}l'
'hoz' 'hez' 'h{o"}z'
'n{a'}l' 'n{e'}l'
'ig'
'at' 'et' 'ot' '{o"}t'
'{e'}rt'
'k{e'}pp' 'k{e'}ppen'
'kor'
'ul' '{u"}l'
'v{a'}' 'v{e'}'
'onk{e'}nt' 'enk{e'}nt' 'ank{e'}nt'
'k{e'}nt'
'en' 'on' 'an' '{o"}n'
'n'
't'
)
delete
v_ending
)
define case_special as(
[substring] R1 among(
'{e'}n' (<- 'e')
'{a'}n' (<- 'a')
'{a'}nk{e'}nt' (<- 'a')
)
)
define case_other as(
[substring] R1 among(
'astul' 'est{u"}l' (delete)
'stul' 'st{u"}l' (delete)
'{a'}stul' (<- 'a')
'{e'}st{u"}l' (<- 'e')
)
)
// factive: delete a final a-acute or e-acute that lies in R1 and follows a
// doubled consonant, then reduce that doubled consonant with undouble.
define factive as (
    [substring] R1 among (
        '{a'}' '{e'}'
            ( double )
    )
    delete
    undouble
)
define plural as (
[substring] R1 among(
'{a'}k' (<- 'a')
'{e'}k' (<- 'e')
'{o"}k' (delete)
'ak' (delete)
'ok' (delete)
'ek' (delete)
'k' (delete)
)
)
define owned as (
[substring] R1 among (
'ok{e'}' '{o"}k{e'}' 'ak{e'}' 'ek{e'}' (delete)
'{e'}k{e'}' (<- 'e')
'{a'}k{e'}' (<- 'a')
'k{e'}' (delete)
'{e'}{e'}i' (<- 'e')
'{a'}{e'}i' (<- 'a')
'{e'}i' (delete)
'{e'}{e'}' (<- 'e')
'{e'}' (delete)
)
)
define sing_owner as (
[substring] R1 among(
'{u"}nk' 'unk' (delete)
'{a'}nk' (<- 'a')
'{e'}nk' (<- 'e')
'nk' (delete)
'{a'}juk' (<- 'a')
'{e'}j{u"}k' (<- 'e')
'juk' 'j{u"}k' (delete)
'uk' '{u"}k' (delete)
'em' 'om' 'am' (delete)
'{a'}m' (<- 'a')
'{e'}m' (<- 'e')
'm' (delete)
'od' 'ed' 'ad' '{o"}d' (delete)
'{a'}d' (<- 'a')
'{e'}d' (<- 'e')
'd' (delete)
'ja' 'je' (delete)
'a' 'e' 'o' (delete)
'{a'}' (<- 'a')
'{e'}' (<- 'e')
)
)
define plur_owner as (
[substring] R1 among(
'jaim' 'jeim' (delete)
'{a'}im' (<- 'a')
'{e'}im' (<- 'e')
'aim' 'eim' (delete)
'im' (delete)
'jaid' 'jeid' (delete)
'{a'}id' (<- 'a')
'{e'}id' (<- 'e')
'aid' 'eid' (delete)
'id' (delete)
'jai' 'jei' (delete)
'{a'}i' (<- 'a')
'{e'}i' (<- 'e')
'ai' 'ei' (delete)
'i' (delete)
'jaink' 'jeink' (delete)
'eink' 'aink' (delete)
'{a'}ink' (<- 'a')
'{e'}ink' (<- 'e')
'ink'
'jaitok' 'jeitek' (delete)
'aitok' 'eitek' (delete)
'{a'}itok' (<- 'a')
'{e'}itek' (<- 'e')
'itek' (delete)
'jeik' 'jaik' (delete)
'aik' 'eik' (delete)
'{a'}ik' (<- 'a')
'{e'}ik' (<- 'e')
'ik' (delete)
)
)
)
// stem: external entry point. Marks p1 going forwards, then applies the
// nine suffix routines in a fixed order from the end of the word. Each call
// is wrapped in 'do', so a routine that fails does not stop the others.
define stem as (
    do mark_regions

    backwards (
        do instrum
        do case
        do case_special
        do case_other
        do factive
        do owned
        do sing_owner
        do plur_owner
        do plural
    )
)
routines (
prelude postlude mark_regions
RV R1 R2
attached_pronoun
standard_suffix
verb_suffix
vowel_suffix
)
externals ( stem )
integers ( pV p1 p2 )
groupings ( v AEIO CG )
stringescapes {}
/* special characters (in ISO Latin I) */
stringdef a' hex 'E1'
stringdef a` hex 'E0'
stringdef e' hex 'E9'
stringdef e` hex 'E8'
stringdef i' hex 'ED'
stringdef i` hex 'EC'
stringdef o' hex 'F3'
stringdef o` hex 'F2'
stringdef u' hex 'FA'
stringdef u` hex 'F9'
define v 'aeiou{a`}{e`}{i`}{o`}{u`}'
// prelude: normalise the word before the regions are marked.
//  1. replace acute accents by grave accents and 'qu' by 'qU';
//  2. mark 'u' and 'i' standing between two vowels as 'U' and 'I'
//     (upper-case letters are not in grouping v).
define prelude as (
// 'test' puts the cursor back where it started once the pass is finished.
test repeat (
[substring] among(
'{a'}' (<- '{a`}')
'{e'}' (<- '{e`}')
'{i'}' (<- '{i`}')
'{o'}' (<- '{o`}')
'{u'}' (<- '{u`}')
'qu' (<- 'qU')
// the empty string always matches: just move on one character
'' (next)
)
)
repeat goto (
v [ ('u' ] v <- 'U') or
('i' ] v <- 'I')
)
)
// mark_regions: set pV, p1 and p2; each keeps the default 'limit' when its
// search fails.
//  pV, by the first two letters of the word:
//    vowel, non-vowel     -> just past the next vowel
//    vowel, vowel         -> just past the next non-vowel
//    non-vowel, non-vowel -> just past the next vowel
//    non-vowel, vowel     -> after the third letter
//  p1: just past the first non-vowel that follows a vowel.
//  p2: the same search repeated from p1.
define mark_regions as (
$pV = limit
$p1 = limit
$p2 = limit // defaults
do (
( v (non-v gopast v) or (v gopast non-v) )
or
( non-v (non-v gopast v) or (v next) )
setmark pV
)
do (
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
)
define postlude as repeat (
[substring] among(
'I' (<- 'i')
'U' (<- 'u')
'' (next)
)
)
backwardmode (
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
// Handles a pronoun attached to the end of the word. The first among
// only recognises the pronoun (it becomes the slice); the second then
// requires one of five endings immediately before it, inside RV, and
// either deletes the pronoun or replaces it with 'e'.
define attached_pronoun as (

    [substring] among(
        // simple forms
        'ci' 'gli' 'la' 'le' 'li' 'lo' 'mi' 'ne' 'si' 'ti' 'vi'

        // compound forms
        'cela'   'cele'   'celi'   'celo'   'cene'
        'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
        'mela'   'mele'   'meli'   'melo'   'mene'
        'sene'
        'tela'   'tele'   'teli'   'telo'   'tene'
        'vela'   'vele'   'veli'   'velo'   'vene'
    )

    among( (RV)
        'endo' 'ando'
            (delete)
        'ir' 'er' 'ar'
            (<- 'e')
    )
)
// Removes or rewrites one derivational suffix. The longest matching
// ending is chosen; each group then applies its own region test (R1, R2
// or RV) and fails, leaving the word untouched, if the test fails.
// Some groups go on to strip a further suffix exposed by the first cut.
define standard_suffix as (
[substring] among(
'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
'atrice' 'atrici'
'ante' 'anti' // Note 1
( R2 delete )
'azione' 'azioni' 'atore' 'atori'
( R2 delete
try ( ['ic'] R2 delete ) // also drop a preceding 'ic' if in R2
)
'logia' 'logie'
( R2 <- 'log' )
'uzione' 'uzioni' 'usione' 'usioni'
( R2 <- 'u' )
'enza' 'enze'
( R2 <- 'ente' )
'amento' 'amenti' 'imento' 'imenti'
( RV delete )
'amente' (
R1 delete
// then optionally a preceding 'iv' (and 'at' before it), 'os', 'ic'
// or 'abil', each only when in R2
try (
[substring] R2 delete among(
'iv' ( ['at'] R2 delete )
'os' 'ic' 'abil'
)
)
)
'it{a`}' (
R2 delete
try (
[substring] among(
'abil' 'ic' 'iv' (R2 delete)
)
)
)
'ivo' 'ivi' 'iva' 'ive' (
R2 delete
// 'at' then 'ic' are removed only as a pair: if either step fails
// the whole try is undone
try ( ['at'] R2 delete ['ic'] R2 delete )
)
)
)
// Deletes the longest verb ending from the list, searching no further
// left than mark pV (the setlimit restricts the match to that region).
// NOTE(review): 'Yamo' contains a capital Y, but the prelude in this
// script only introduces capital I and U -- confirm whether it can match.
define verb_suffix as setlimit tomark pV for (
[substring] among(
'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo'
'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
'ono' 'uta' 'ute' 'uti' 'uto'
'ar' 'ir' // but 'er' is problematical
(delete)
)
)
// Character groupings used only by vowel_suffix.
define CG   'cg'
define AEIO 'aeio{a`}{e`}{i`}{o`}'

// Final clean-up of the stem, in two independent optional steps:
//   1. a final a/e/i/o (plain or grave) in RV is deleted together with
//      an 'i' before it that is also in RV -- both or neither, because
//      they share one `try`;
//   2. a final 'h' preceded by 'c' or 'g' is deleted when that
//      preceding letter is in RV.
define vowel_suffix as (

    try (
        [AEIO] RV delete
        ['i']  RV delete
    )

    try (
        ['h'] CG RV delete
    )
)
)
// Entry point. Forward passes normalise the word and mark its regions;
// the backward pass strips an attached pronoun, then a standard suffix
// (or, only if none was removed, a verb suffix), then a trailing vowel.
// postlude finally restores the letters prelude upper-cased.
define stem as (
    do prelude
    do mark_regions

    backwards (
        do attached_pronoun
        do ( standard_suffix or verb_suffix )
        do vowel_suffix
    )

    do postlude
)
/*
Note 1: additions of 15 Jun 2005
*/
// ---------------------------------------------------------------------
// Start of another stemmer script. The routine list matches the script
// that precedes it; only the character codes below differ (this copy
// uses the MS-DOS Latin I code page, per the comment).
// ---------------------------------------------------------------------
routines (
prelude postlude mark_regions
RV R1 R2
attached_pronoun
standard_suffix
verb_suffix
vowel_suffix
)
externals ( stem )
integers ( pV p1 p2 )
groupings ( v AEIO CG )
stringescapes {}
/* special characters (in MS-DOS Latin I) */
// acute (') and grave (`) forms of each vowel, as single-byte codes
stringdef a' hex 'A0'
stringdef a` hex '85'
stringdef e' hex '82'
stringdef e` hex '8A'
stringdef i' hex 'A1'
stringdef i` hex '8D'
stringdef o' hex 'A2'
stringdef o` hex '95'
stringdef u' hex 'A3'
stringdef u` hex '97'
// vowels: only the grave-accented forms are listed because prelude
// rewrites every acute-accented vowel to its grave form first
define v 'aeiou{a`}{e`}{i`}{o`}{u`}'
// Normalises the word before stemming, in two passes:
//   1. every acute-accented vowel becomes its grave form and 'qu'
//      becomes 'qU' (the cursor is restored afterwards by `test`);
//   2. a 'u' or 'i' standing between two vowels is upper-cased.
define prelude as (

    test repeat (
        [substring] among(
            'qu'    (<- 'qU')
            '{u'}'  (<- '{u`}')
            '{o'}'  (<- '{o`}')
            '{i'}'  (<- '{i`}')
            '{e'}'  (<- '{e`}')
            '{a'}'  (<- '{a`}')
            ''      (next)      // anything else: step over one character
        )
    )

    repeat goto (
        v [ ('u' ] v <- 'U') or
            ('i' ] v <- 'I')
    )
)
// Sets the marks pV, p1 and p2 that bound the regions RV, R1 and R2.
define mark_regions as (

    // All three default to the end of the word, giving empty regions
    // when the searches below fail.
    $p2 = limit
    $p1 = limit
    $pV = limit

    // pV: depends on whether the first two letters are vowels.
    //   vowel, non-vowel      -> after the next vowel
    //   vowel, vowel          -> after the next non-vowel
    //   non-vowel, non-vowel  -> after the next vowel
    //   non-vowel, vowel      -> after the third letter
    do (
        ( v (non-v gopast v) or (v gopast non-v) )
        or
        ( non-v (non-v gopast v) or (v next) )

        setmark pV
    )

    // p1: after the first non-vowel that follows a vowel.
    // p2: the same rule applied again starting from p1.
    do (
        gopast v
        gopast non-v
        setmark p1

        gopast v
        gopast non-v
        setmark p2
    )
)
// Undoes the upper-casing done by prelude: every 'I' and 'U' in the
// word goes back to lower case.
define postlude as repeat (
    [substring] among(
        'U'  (<- 'u')
        'I'  (<- 'i')
        ''   (next)     // any other character is stepped over
    )
)
backwardmode (
// Region tests used while scanning backwards: each succeeds when the
// cursor is at or to the right of the corresponding mark.
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
// Handles a pronoun attached to the end of the word. The first among
// only recognises the pronoun (it becomes the slice); the second then
// requires one of five endings immediately before it, inside RV, and
// either deletes the pronoun or replaces it with 'e'.
define attached_pronoun as (

    [substring] among(
        // simple forms
        'ci' 'gli' 'la' 'le' 'li' 'lo' 'mi' 'ne' 'si' 'ti' 'vi'

        // compound forms
        'cela'   'cele'   'celi'   'celo'   'cene'
        'gliela' 'gliele' 'glieli' 'glielo' 'gliene'
        'mela'   'mele'   'meli'   'melo'   'mene'
        'sene'
        'tela'   'tele'   'teli'   'telo'   'tene'
        'vela'   'vele'   'veli'   'velo'   'vene'
    )

    among( (RV)
        'endo' 'ando'
            (delete)
        'ir' 'er' 'ar'
            (<- 'e')
    )
)
// Removes or rewrites one derivational suffix. The longest matching
// ending is chosen; each group then applies its own region test (R1, R2
// or RV) and fails, leaving the word untouched, if the test fails.
// Some groups go on to strip a further suffix exposed by the first cut.
define standard_suffix as (
[substring] among(
'anza' 'anze' 'ico' 'ici' 'ica' 'ice' 'iche' 'ichi' 'ismo'
'ismi' 'abile' 'abili' 'ibile' 'ibili' 'ista' 'iste' 'isti'
'ist{a`}' 'ist{e`}' 'ist{i`}' 'oso' 'osi' 'osa' 'ose' 'mente'
'atrice' 'atrici'
'ante' 'anti' // Note 1
( R2 delete )
'azione' 'azioni' 'atore' 'atori'
( R2 delete
try ( ['ic'] R2 delete ) // also drop a preceding 'ic' if in R2
)
'logia' 'logie'
( R2 <- 'log' )
'uzione' 'uzioni' 'usione' 'usioni'
( R2 <- 'u' )
'enza' 'enze'
( R2 <- 'ente' )
'amento' 'amenti' 'imento' 'imenti'
( RV delete )
'amente' (
R1 delete
// then optionally a preceding 'iv' (and 'at' before it), 'os', 'ic'
// or 'abil', each only when in R2
try (
[substring] R2 delete among(
'iv' ( ['at'] R2 delete )
'os' 'ic' 'abil'
)
)
)
'it{a`}' (
R2 delete
try (
[substring] among(
'abil' 'ic' 'iv' (R2 delete)
)
)
)
'ivo' 'ivi' 'iva' 'ive' (
R2 delete
// 'at' then 'ic' are removed only as a pair: if either step fails
// the whole try is undone
try ( ['at'] R2 delete ['ic'] R2 delete )
)
)
)
// Deletes the longest verb ending from the list, searching no further
// left than mark pV (the setlimit restricts the match to that region).
// NOTE(review): 'Yamo' contains a capital Y, but the prelude in this
// script only introduces capital I and U -- confirm whether it can match.
define verb_suffix as setlimit tomark pV for (
[substring] among(
'ammo' 'ando' 'ano' 'are' 'arono' 'asse' 'assero' 'assi'
'assimo' 'ata' 'ate' 'ati' 'ato' 'ava' 'avamo' 'avano' 'avate'
'avi' 'avo' 'emmo' 'enda' 'ende' 'endi' 'endo' 'er{a`}' 'erai'
'eranno' 'ere' 'erebbe' 'erebbero' 'erei' 'eremmo' 'eremo'
'ereste' 'eresti' 'erete' 'er{o`}' 'erono' 'essero' 'ete'
'eva' 'evamo' 'evano' 'evate' 'evi' 'evo' 'Yamo' 'iamo' 'immo'
'ir{a`}' 'irai' 'iranno' 'ire' 'irebbe' 'irebbero' 'irei'
'iremmo' 'iremo' 'ireste' 'iresti' 'irete' 'ir{o`}' 'irono'
'isca' 'iscano' 'isce' 'isci' 'isco' 'iscono' 'issero' 'ita'
'ite' 'iti' 'ito' 'iva' 'ivamo' 'ivano' 'ivate' 'ivi' 'ivo'
'ono' 'uta' 'ute' 'uti' 'uto'
'ar' 'ir' // but 'er' is problematical
(delete)
)
)
// Character groupings used only by vowel_suffix.
define CG   'cg'
define AEIO 'aeio{a`}{e`}{i`}{o`}'

// Final clean-up of the stem, in two independent optional steps:
//   1. a final a/e/i/o (plain or grave) in RV is deleted together with
//      an 'i' before it that is also in RV -- both or neither, because
//      they share one `try`;
//   2. a final 'h' preceded by 'c' or 'g' is deleted when that
//      preceding letter is in RV.
define vowel_suffix as (

    try (
        [AEIO] RV delete
        ['i']  RV delete
    )

    try (
        ['h'] CG RV delete
    )
)
)
// Entry point. Forward passes normalise the word and mark its regions;
// the backward pass strips an attached pronoun, then a standard suffix
// (or, only if none was removed, a verb suffix), then a trailing vowel.
// postlude finally restores the letters prelude upper-cased.
define stem as (
    do prelude
    do mark_regions

    backwards (
        do attached_pronoun
        do ( standard_suffix or verb_suffix )
        do vowel_suffix
    )

    do postlude
)
/*
Note 1: additions of 15 Jun 2005
*/
// ---------------------------------------------------------------------
// Start of another stemmer script (the endings handled below, such as
// 'heid', 'lijk' and 'achtig', look Dutch -- NOTE(review): confirm).
// ---------------------------------------------------------------------
// ch: scratch string used by lengthen_V to duplicate a vowel
strings ( ch )
// x: scratch mark for the R1/R2 tests; p1, p2: region marks set by measure
integers ( x p1 p2 )
// Y_found:    a 'y' was upper-cased at the start of stem
// stemmed:    one of the suffix steps succeeded
// GE_removed: Lose_prefix / Lose_infix removed a 'ge'
booleans ( Y_found stemmed GE_removed )
routines (
R1 R2
C V VX
lengthen_V
Step_1 Step_2 Step_3 Step_4 Step_7
Step_6 Step_1c
Lose_prefix
Lose_infix
measure
)
externals ( stem )
groupings ( v v_WX AOU AIOU )
stringescapes {}
// {'} stands for the apostrophe character itself
stringdef ' hex '27' // yuk
define v 'aeiouy'
define v_WX v + 'wx'
define AOU 'aou'
define AIOU 'aiou'
backwardmode (
// R1 / R2: record the cursor in x and succeed when it is at or beyond
// p1 / p2 respectively.
define R1 as (setmark x $x >= p1)
define R2 as (setmark x $x >= p2)
// V:  the cursor is next to a vowel or to 'ij' (cursor not moved).
define V as test (v or 'ij')
// VX: as V, but looking one character further on.
define VX as test (next v or 'ij')
// C:  the cursor is next to a non-vowel that is not part of 'ij'.
define C as test (not 'ij' non-v)
// Doubles a vowel near the end of the word when its context matches:
// after skipping one letter that is not in v_WX, either an a/o/u, or
// an 'e' meeting the extra conditions below, is sliced, copied into ch
// and inserted again. Wrapped in `do`, so it never fails.
define lengthen_V as do (
non-v_WX [ (AOU] test (non-v or atlimit)) or
('e'] test (non-v or atlimit
not AIOU
not (next AIOU non-v)))
->ch insert ch
)
// Step 1: inflectional endings. The longest ending in the among is
// matched and made the slice (the `(])` prefix closes the slice for
// every action); the action may then look further left and move the
// slice end before deleting or replacing.
define Step_1 as
(
[among ( (])
'{'}s' (delete)
's' (R1 not ('t' R1) C delete)
'ies' (R1 <-'ie')
'es'
(('ar' R1 C ] delete lengthen_V) or
('er' R1 C ] delete) or
(R1 C <-'e'))
'aus' (R1 V <-'au')
'en' (('hed' R1 ] <-'heid') or
('nd' delete) or
('d' R1 C ] delete) or
('i' or 'j' V delete) or
(R1 C delete lengthen_V))
'nde' (<-'nd')
)
)
// Step 2: endings in '-e'. For 'je' the alternatives are tried in the
// order written and the first that succeeds wins; most other endings
// are simply shortened when they lie in R1.
define Step_2 as
(
[among ( (])
'je' (('{'}t' ] delete) or
('et' ] R1 C delete) or
('rnt' ] <-'rn') or
('t' ] R1 VX delete) or
('ink' ] <-'ing') or
('mp' ] <-'m') or
('{'}' ] R1 delete) or
(] R1 C delete))
'ge' (R1 <-'g')
'lijke'(R1 <-'lijk')
'ische'(R1 <-'isch')
'de' (R1 C delete)
'te' (R1 <-'t')
'se' (R1 <-'s')
're' (R1 <-'r')
'le' (R1 delete attach 'l' lengthen_V)
'ene' (R1 C delete attach 'en' lengthen_V)
'ieve' (R1 C <-'ief')
)
)
// Step 3: derivational endings. Strings listed without an action of
// their own share the action of the next string that has one
// (e.g. 'heid' and 'sel' use the action written after 'ster').
define Step_3 as
(
[among ( (])
'atie' (R1 <-'eer')
'iteit' (R1 delete lengthen_V)
'heid'
'sel'
'ster' (R1 delete)
'rder' (<-'r')
'ing'
'isme'
'erij' (R1 delete lengthen_V)
'arij' (R1 C <-'aar')
'fie' (R2 delete attach 'f' lengthen_V)
'gie' (R2 delete attach 'g' lengthen_V)
'tst' (R1 C <-'t')
'dst' (R1 C <-'d')
)
)
// Step 4: further derivational endings. The first among is tried; only
// if it fails as a whole is the second ('iger' / 'igst' / 'ig') tried.
// Strings without an action share the action of the next string that
// has one.
define Step_4 as
(
( [among ( (])
'ioneel' (R1 <-'ie')
'atief' (R1 <-'eer')
'baar' (R1 delete)
'naar' (R1 V <-'n')
'laar' (R1 V <-'l')
'raar' (R1 V <-'r')
'tant' (R1 <-'teer')
'lijker'
'lijkst' (R1 <-'lijk')
'achtig'
'achtiger'
'achtigst'(R1 delete)
'eriger'
'erigst'
'erig'
'end' (R1 C delete lengthen_V)
)
)
or
( [among ( (])
'iger'
'igst'
'ig' (R1 C delete lengthen_V)
)
)
)
// Step 7: drops the final 't' of a word ending in 'kt', 'ft' or 'pt'.
define Step_7 as
(
    [among ( (])
        'pt'  (<-'p')
        'ft'  (<-'f')
        'kt'  (<-'k')
    )
)
// Step 6: tidies the end of the stem. A doubled final consonant is
// reduced to a single one, and a final 'v' or 'z' becomes 'f' or 's'.
define Step_6 as
(
    [among ( (])
        // doubled consonant -> single consonant
        'bb' (<-'b')   'cc' (<-'c')   'dd' (<-'d')   'ff' (<-'f')
        'gg' (<-'g')   'hh' (<-'h')   'jj' (<-'j')   'kk' (<-'k')
        'll' (<-'l')   'mm' (<-'m')   'nn' (<-'n')   'pp' (<-'p')
        'qq' (<-'q')   'rr' (<-'r')   'ss' (<-'s')   'tt' (<-'t')
        'vv' (<-'v')   'ww' (<-'w')   'xx' (<-'x')   'zz' (<-'z')

        // single final v / z
        'z'  (<-'s')
        'v'  (<-'f')
    )
)
// Step 1c: removes a final 'd' or 't'. The shared prefix `] R1 C`
// requires the letter to be in R1 and preceded by a consonant; the
// per-letter action then refuses 'nd' / 'ht' when the 'n' / 'h' is
// itself in R1.
define Step_1c as
(
[among ( (] R1 C)
'd' (not ('n' R1) delete)
't' (not ('h' R1) delete)
)
)
)
// Removes a leading 'ge'. The `test` requires at least three more
// characters after it, followed somewhere by a vowel and then a
// non-vowel; only then is GE_removed set and the 'ge' deleted.
define Lose_prefix as (
    ['ge']
    test hop 3
    (goto v goto non-v)

    set GE_removed
    delete
)
// Removes a 'ge' that occurs after the first character of the word
// (`next` skips that character, `gopast` finds the 'ge'). The same
// follow-up checks as in Lose_prefix apply before GE_removed is set and
// the 'ge' is deleted.
define Lose_infix as (
    next
    gopast (['ge'])
    test hop 3
    (goto v goto non-v)

    set GE_removed
    delete
)
// Computes the region marks p1 and p2.
define measure as (

    // Default: both marks at the end of the word. The enclosing `do`
    // puts the cursor back afterwards.
    do (
        tolimit
        setmark p1
        setmark p2
    )

    // p1: skip leading non-vowels, then one or more vowels (with 'ij'
    // counting as a vowel), then one non-vowel. p2: the same pattern
    // again, continuing from p1.
    do(
        repeat non-v
        atleast 1 ('ij' or v)
        non-v
        setmark p1

        repeat non-v
        atleast 1 ('ij' or v)
        non-v
        setmark p2
    )
)
// Entry point.
define stem as (
unset Y_found
unset stemmed
// Upper-case a 'y' at the start of the word or directly after a vowel,
// so it is not treated as a vowel afterwards; remember that we did so.
do ( ['y'] <-'Y' set Y_found )
do repeat(goto (v ['y'])<-'Y' set Y_found )
// Not wrapped in `do`: if measure fails, stem fails.
measure
// Suffix steps; `stemmed` records whether any of them changed the word.
backwards (
do (Step_1 set stemmed )
do (Step_2 set stemmed )
do (Step_3 set stemmed )
do (Step_4 set stemmed )
)
// Try to drop a 'ge' prefix; if that worked, re-measure and then try
// Step_1c on the shortened word.
unset GE_removed
do (Lose_prefix and measure)
backwards (
do (GE_removed Step_1c)
)
// Same again for a 'ge' infix.
unset GE_removed
do (Lose_infix and measure)
backwards (
do (GE_removed Step_1c)
)
// Final clean-up; Step_6 runs only if something was removed earlier.
backwards (
do (Step_7 set stemmed )
do (stemmed or GE_removed Step_6)
)
// Restore any 'Y' to lower case.
do(Y_found repeat(goto (['Y']) <-'y'))
)
// ---------------------------------------------------------------------
// Start of another stemmer script (the Lovins stemmer, per the comments
// further down). A..CC are the per-ending conditions; endings, undouble
// and respell are the three phases run by stem.
// ---------------------------------------------------------------------
stringescapes {}
routines (
A B C D E F G H I J K L M N O P Q R S T U V W X Y Z AA BB CC
endings
undouble respell
)
externals ( stem )
backwardmode (
/* Lovins' conditions A, B ... CC, as given in her Appendix B, where
a test for a two letter prefix ('test hop 2') is implicitly
assumed. Note that 'e' next 'u' corresponds to her u*e because
Snowball is scanning backwards. */
// Each condition is evaluated with the cursor just before the candidate
// ending. `hop n` demands that at least n characters remain (a minimum
// stem length); the string tests that follow inspect the letters
// immediately before the ending.
// A-D: minimum stem length only (2, 3, 4, 5).
define A as ( hop 2 )
define B as ( hop 3 )
define C as ( hop 4 )
define D as ( hop 5 )
// E-CC: minimum stem length plus a constraint on the stem's last letters.
define E as ( test hop 2 not 'e' )
define F as ( test hop 3 not 'e' )
define G as ( test hop 3 'f' )
define H as ( test hop 2 't' or 'll' )
define I as ( test hop 2 not 'o' not 'e' )
define J as ( test hop 2 not 'a' not 'e' )
define K as ( test hop 3 'l' or 'i' or ('e' next 'u') )
define L as ( test hop 2 not 'u' not 'x' not ('s' not 'o') )
define M as ( test hop 2 not 'a' not 'c' not 'e' not 'm' )
define N as ( test hop 3 ( hop 2 not 's' or hop 2 ) )
define O as ( test hop 2 'l' or 'i' )
define P as ( test hop 2 not 'c' )
define Q as ( test hop 2 test hop 3 not 'l' not 'n' )
define R as ( test hop 2 'n' or 'r' )
define S as ( test hop 2 'dr' or ('t' not 't') )
define T as ( test hop 2 's' or ('t' not 'o') )
define U as ( test hop 2 'l' or 'm' or 'n' or 'r' )
define V as ( test hop 2 'c' )
define W as ( test hop 2 not 's' not 'u' )
define X as ( test hop 2 'l' or 'i' or ('e' next 'u') )
define Y as ( test hop 2 'in' )
define Z as ( test hop 2 not 'f' )
define AA as ( test hop 2 among ( 'd' 'f' 'ph' 'th' 'l' 'er' 'or'
'es' 't' ) )
define BB as ( test hop 3 not 'met' not 'ryst' )
define CC as ( test hop 2 'l' )
/* The system of endings, as given in Appendix A. */
// Each ending is followed by the name of the condition routine that
// must succeed for it to match; the single (delete) at the bottom is
// the action shared by every entry. The list is laid out by decreasing
// ending length (11 letters down to 1).
define endings as (
[substring] among(
'alistically' B 'arizability' A 'izationally' B
'antialness' A 'arisations' A 'arizations' A 'entialness' A
'allically' C 'antaneous' A 'antiality' A 'arisation' A
'arization' A 'ationally' B 'ativeness' A 'eableness' E
'entations' A 'entiality' A 'entialize' A 'entiation' A
'ionalness' A 'istically' A 'itousness' A 'izability' A
'izational' A
'ableness' A 'arizable' A 'entation' A 'entially' A
'eousness' A 'ibleness' A 'icalness' A 'ionalism' A
'ionality' A 'ionalize' A 'iousness' A 'izations' A
'lessness' A
'ability' A 'aically' A 'alistic' B 'alities' A
'ariness' E 'aristic' A 'arizing' A 'ateness' A
'atingly' A 'ational' B 'atively' A 'ativism' A
'elihood' E 'encible' A 'entally' A 'entials' A
'entiate' A 'entness' A 'fulness' A 'ibility' A
'icalism' A 'icalist' A 'icality' A 'icalize' A
'ication' G 'icianry' A 'ination' A 'ingness' A
'ionally' A 'isation' A 'ishness' A 'istical' A
'iteness' A 'iveness' A 'ivistic' A 'ivities' A
'ization' F 'izement' A 'oidally' A 'ousness' A
'aceous' A 'acious' B 'action' G 'alness' A
'ancial' A 'ancies' A 'ancing' B 'ariser' A
'arized' A 'arizer' A 'atable' A 'ations' B
'atives' A 'eature' Z 'efully' A 'encies' A
'encing' A 'ential' A 'enting' C 'entist' A
'eously' A 'ialist' A 'iality' A 'ialize' A
'ically' A 'icance' A 'icians' A 'icists' A
'ifully' A 'ionals' A 'ionate' D 'ioning' A
'ionist' A 'iously' A 'istics' A 'izable' E
'lessly' A 'nesses' A 'oidism' A
'acies' A 'acity' A 'aging' B 'aical' A
'alist' A 'alism' B 'ality' A 'alize' A
'allic'BB 'anced' B 'ances' B 'antic' C
'arial' A 'aries' A 'arily' A 'arity' B
'arize' A 'aroid' A 'ately' A 'ating' I
'ation' B 'ative' A 'ators' A 'atory' A
'ature' E 'early' Y 'ehood' A 'eless' A
'elity' A 'ement' A 'enced' A 'ences' A
'eness' E 'ening' E 'ental' A 'ented' C
'ently' A 'fully' A 'ially' A 'icant' A
'ician' A 'icide' A 'icism' A 'icist' A
'icity' A 'idine' I 'iedly' A 'ihood' A
'inate' A 'iness' A 'ingly' B 'inism' J
'inity'CC 'ional' A 'ioned' A 'ished' A
'istic' A 'ities' A 'itous' A 'ively' A
'ivity' A 'izers' F 'izing' F 'oidal' A
'oides' A 'otide' A 'ously' A
'able' A 'ably' A 'ages' B 'ally' B
'ance' B 'ancy' B 'ants' B 'aric' A
'arly' K 'ated' I 'ates' A 'atic' B
'ator' A 'ealy' Y 'edly' E 'eful' A
'eity' A 'ence' A 'ency' A 'ened' E
'enly' E 'eous' A 'hood' A 'ials' A
'ians' A 'ible' A 'ibly' A 'ical' A
'ides' L 'iers' A 'iful' A 'ines' M
'ings' N 'ions' B 'ious' A 'isms' B
'ists' A 'itic' H 'ized' F 'izer' F
'less' A 'lily' A 'ness' A 'ogen' A
'ward' A 'wise' A 'ying' B 'yish' A
'acy' A 'age' B 'aic' A 'als'BB
'ant' B 'ars' O 'ary' F 'ata' A
'ate' A 'eal' Y 'ear' Y 'ely' E
'ene' E 'ent' C 'ery' E 'ese' A
'ful' A 'ial' A 'ian' A 'ics' A
'ide' L 'ied' A 'ier' A 'ies' P
'ily' A 'ine' M 'ing' N 'ion' Q
'ish' C 'ism' B 'ist' A 'ite'AA
'ity' A 'ium' A 'ive' A 'ize' F
'oid' A 'one' R 'ous' A
'ae' A 'al'BB 'ar' X 'as' B
'ed' E 'en' F 'es' E 'ia' A
'ic' A 'is' A 'ly' B 'on' S
'or' T 'um' U 'us' V 'yl' R
'{'}s' A 's{'}' A
'a' A 'e' A 'i' A 'o' A
's' W 'y' B
(delete)
)
)
/* Rule 1 of appendix C: undoubling. */
// If the word ends in one of the listed doubled consonants, the last
// letter is deleted. The `test` only checks for the pair and restores
// the cursor; `[next]` then slices the single final character.
define undouble as (
    test substring among (
        'tt' 'ss' 'rr' 'pp' 'nn'
        'mm' 'll' 'gg' 'dd' 'bb'
    )
    [next] delete
)
/* The other appendix C rules can be done together. */
// Respelling of the stem's final letters. The longest listed ending is
// matched and replaced; where an action starts with `not '<letter>'`,
// the replacement is skipped when that letter comes directly before
// the ending.
define respell as (
[substring] among (
'iev' (<-'ief')
'uct' (<-'uc')
'umpt' (<-'um')
'rpt' (<-'rb')
'urs' (<-'ur')
'istr' (<-'ister')
'metr' (<-'meter')
'olv' (<-'olut')
'ul' (not 'a' not 'i' not 'o' <-'l')
'bex' (<-'bic')
'dex' (<-'dic')
'pex' (<-'pic')
'tex' (<-'tic')
'ax' (<-'ac')
'ex' (<-'ec')
'ix' (<-'ic')
'lux' (<-'luc')
'uad' (<-'uas')
'vad' (<-'vas')
'cid' (<-'cis')
'lid' (<-'lis')
'erid' (<-'eris')
'pand' (<-'pans')
'end' (not 's' <-'ens')
'ond' (<-'ons')
'lud' (<-'lus')
'rud' (<-'rus')
'her' (not 'p' not 't' <-'hes')
'mit' (<-'mis')
'ent' (not 'm' <-'ens')
/* 'ent' was 'end' in the 1968 paper - a typo. */
'ert' (<-'ers')
'et' (not 'n' <-'es')
'yt' (<-'ys')
'yz' (<-'ys')
)
)
)
// Entry point. Everything happens scanning backwards: remove an ending,
// undouble a final doubled consonant, then respell the stem. Each phase
// is optional (`do`), so a failing phase does not block the next.
define stem as (
    backwards (
        do endings
        do undouble
        do respell
    )
)
// ---------------------------------------------------------------------
// Start of another stemmer script (the endings below, such as 'heten'
// and 'hetslov', look Norwegian -- NOTE(review): confirm).
// ---------------------------------------------------------------------
routines (
mark_regions
main_suffix
consonant_pair
other_suffix
)
externals ( stem )
// p1: start of the region suffixes may be removed from; x: scratch mark
integers ( p1 x )
groupings ( v s_ending )
stringescapes {}
/* special characters (in ISO Latin I) */
stringdef ae hex 'E6'
stringdef ao hex 'E5'
stringdef o/ hex 'F8'
// vowels, including the three special characters defined above
define v 'aeiouy{ae}{ao}{o/}'
// letters after which a final 's' may be removed (see main_suffix)
define s_ending 'bcdfghjlmnoprtvyz'
// Sets p1 to the position after the first non-vowel that follows a
// vowel, but never closer than three characters to the start of the
// word. p1 stays at the end of the word if no such position is found;
// the routine fails if the word has fewer than three characters.
define mark_regions as (

    $p1 = limit

    // x = position after the third character
    test (
        hop 3
        setmark x
    )

    goto v
    gopast non-v
    setmark p1

    // enforce the three-character minimum
    try (
        $p1 < x
        $p1 = x
    )
)
backwardmode (
// Handles the main inflectional endings. The longest ending lying at or
// after p1 is matched (the setlimit confines the search), then:
//   - 'erte' / 'ert' become 'er';
//   - 's' is deleted only after a letter in s_ending, or after a 'k'
//     that itself follows a non-vowel;
//   - every other listed ending is deleted.
define main_suffix as (
    setlimit tomark p1 for ([substring])
    among(
        'ert' 'erte'
            (<-'er')

        's'
            (s_ending or ('k' non-v) delete)

        'a' 'e'
        'en' 'ar' 'er' 'as' 'es' 'et'
        'ede' 'ane' 'ene' 'ens' 'ers' 'ets' 'het' 'ast'
        'ande' 'ende' 'edes' 'enes'
        'heten' 'heter' 'endes'
        'hetene' 'hetens'
        'hetenes'
            (delete)
    )
)
// If the word ends in 'dt' or 'vt' lying at or after p1, the final
// letter is deleted. The `test` only detects the pair (opening a slice
// at the end of the word) and restores the cursor; `next]` then closes
// the slice one character in, so only that last letter is removed.
define consonant_pair as (
    test (
        setlimit tomark p1 for ([substring])
        among(
            'vt' 'dt'
        )
    )
    next] delete
)
// Deletes the longest of the listed derivational endings, provided it
// lies at or after p1.
define other_suffix as (
    setlimit tomark p1 for ([substring])
    among(
        'ig'  'eig' 'lig' 'elig'
        'leg' 'eleg'
        'els'
        'lov' 'elov' 'slov' 'hetslov'
            (delete)
    )
)
)
// Entry point: mark the region, then run the three suffix routines in
// order scanning backwards. Each is optional (`do`).
define stem as (
    do mark_regions

    backwards (
        do main_suffix
        do consonant_pair
        do other_suffix
    )
)
// ---------------------------------------------------------------------
// Start of another stemmer script. The routine list matches the script
// that precedes it; only the character codes below differ (this copy
// uses the MS-DOS Latin I code page, per the comment).
// ---------------------------------------------------------------------
routines (
mark_regions
main_suffix
consonant_pair
other_suffix
)
externals ( stem )
// p1: start of the region suffixes may be removed from; x: scratch mark
integers ( p1 x )
groupings ( v s_ending )
stringescapes {}
/* special characters (in MS-DOS Latin I) */
stringdef ae hex '91'
stringdef ao hex '86'
stringdef o/ hex '9B'
// vowels, including the three special characters defined above
define v 'aeiouy{ae}{ao}{o/}'
// letters after which a final 's' may be removed (see main_suffix)
define s_ending 'bcdfghjlmnoprtvyz'
// Sets p1 to the position after the first non-vowel that follows a
// vowel, but never closer than three characters to the start of the
// word. p1 stays at the end of the word if no such position is found;
// the routine fails if the word has fewer than three characters.
define mark_regions as (

    $p1 = limit

    // x = position after the third character
    test (
        hop 3
        setmark x
    )

    goto v
    gopast non-v
    setmark p1

    // enforce the three-character minimum
    try (
        $p1 < x
        $p1 = x
    )
)
backwardmode (
// Handles the main inflectional endings. The longest ending lying at or
// after p1 is matched (the setlimit confines the search), then:
//   - 'erte' / 'ert' become 'er';
//   - 's' is deleted only after a letter in s_ending, or after a 'k'
//     that itself follows a non-vowel;
//   - every other listed ending is deleted.
define main_suffix as (
    setlimit tomark p1 for ([substring])
    among(
        'ert' 'erte'
            (<-'er')

        's'
            (s_ending or ('k' non-v) delete)

        'a' 'e'
        'en' 'ar' 'er' 'as' 'es' 'et'
        'ede' 'ane' 'ene' 'ens' 'ers' 'ets' 'het' 'ast'
        'ande' 'ende' 'edes' 'enes'
        'heten' 'heter' 'endes'
        'hetene' 'hetens'
        'hetenes'
            (delete)
    )
)
// If the word ends in 'dt' or 'vt' lying at or after p1, the final
// letter is deleted. The `test` only detects the pair (opening a slice
// at the end of the word) and restores the cursor; `next]` then closes
// the slice one character in, so only that last letter is removed.
define consonant_pair as (
    test (
        setlimit tomark p1 for ([substring])
        among(
            'vt' 'dt'
        )
    )
    next] delete
)
// Deletes the longest of the listed derivational endings, provided it
// lies at or after p1.
define other_suffix as (
    setlimit tomark p1 for ([substring])
    among(
        'ig'  'eig' 'lig' 'elig'
        'leg' 'eleg'
        'els'
        'lov' 'elov' 'slov' 'hetslov'
            (delete)
    )
)
)
// Entry point: mark the region, then run the three suffix routines in
// order scanning backwards. Each is optional (`do`).
define stem as (
    do mark_regions

    backwards (
        do main_suffix
        do consonant_pair
        do other_suffix
    )
)
// ---------------------------------------------------------------------
// Start of another stemmer script (the step structure and English
// endings below match the Porter stemmer -- NOTE(review): confirm).
// ---------------------------------------------------------------------
// p1, p2: marks bounding regions R1 and R2 (set in stem)
integers ( p1 p2 )
// Y_found: a 'y' was upper-cased at the start of stem
booleans ( Y_found )
routines (
shortv
R1 R2
Step_1a Step_1b Step_1c Step_2 Step_3 Step_4 Step_5a Step_5b
)
externals ( stem )
groupings ( v v_WXY )
define v 'aeiouy'
// the vowels plus w, x and capital Y; used (negated) by shortv
define v_WXY v + 'wxY'
backwardmode (
// shortv: scanning backwards, the word ends in non-vowel, vowel,
// non-vowel, where the last letter is additionally not w, x or Y.
define shortv as ( non-v_WXY v non-v )
// R1 / R2: succeed when the cursor is at or to the right of p1 / p2.
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
// Step 1a: plural endings.
//   'sses' -> 'ss',  'ies' -> 'i',  'ss' left alone,  's' removed.
// The longest match wins, so 'ss' protects words ending in a double s
// from the single-'s' rule.
define Step_1a as (
    [substring] among (
        's'     (delete)
        'ss'    ()
        'ies'   (<-'i')
        'sses'  (<-'ss')
    )
)
// Step 1b: 'eed', 'ed' and 'ing'.
define Step_1b as (
[substring] among (
// 'eed' -> 'ee' when in R1
'eed' (R1 <-'ee')
// 'ed' has no action of its own, so it shares the 'ing' action
'ed'
'ing' (
// delete only if a vowel occurs somewhere before the ending
test gopast v delete
// then repair the new ending (the test restores the cursor)
test substring among(
'at' 'bl' 'iz'
(<+ 'e')
'bb' 'dd' 'ff' 'gg' 'mm' 'nn' 'pp' 'rr' 'tt'
// ignoring double c, h, j, k, q, v, w, and x
([next] delete)
// no special ending: add 'e' if the cursor is at p1 and the word
// ends in a short-vowel pattern
'' (atmark p1 test shortv <+ 'e')
)
)
)
)
// Step 1c: a final 'y' (or upper-cased 'Y') becomes 'i', provided a
// vowel occurs somewhere before it.
define Step_1c as (
    ['y' or 'Y']
    gopast v
    <-'i'
)
// Step 2: maps a double suffix onto a single one. The longest listed
// ending is matched; it must lie in R1 for the replacement to happen.
define Step_2 as (
    [substring] R1 among (
        'eli'                        (<-'e')
        'alli' 'alism' 'aliti'       (<-'al')
        'entli'                      (<-'ent')
        'enci'                       (<-'ence')
        'anci'                       (<-'ance')
        'abli'                       (<-'able')
        'biliti'                     (<-'ble')
        'tional'                     (<-'tion')
        'ational' 'ation' 'ator'     (<-'ate')
        'ization' 'izer'             (<-'ize')
        'iveness' 'iviti'            (<-'ive')
        'ousness' 'ousli'            (<-'ous')
        'fulness'                    (<-'ful')
    )
)
// Step 3: further suffix simplification; the ending must lie in R1.
define Step_3 as (
    [substring] R1 among (
        'ness' 'ful' 'ative'
            (delete)
        'ical' 'iciti' 'icate'
            (<-'ic')
        'alize'
            (<-'al')
    )
)
// Step 4: deletes a residual suffix lying in R2. 'ion' is deleted only
// when the letter before it is 's' or 't'.
define Step_4 as (
    [substring] R2 among (
        'ion'
            ('s' or 't' delete)

        'er' 'ic' 'al' 'ou'
        'ant' 'ent' 'ism' 'ate' 'iti' 'ous' 'ive' 'ize'
        'ance' 'ence' 'able' 'ible' 'ment'
        'ement'
            (delete)
    )
)
// Step 5a: deletes a final 'e' when it lies in R2, or when it lies in
// R1 and is not preceded by a short-vowel pattern (see shortv).
define Step_5a as (
['e']
R2 or (R1 not shortv)
delete
)
// Step 5b: a final 'll' lying in R2 loses its last 'l' (the slice is
// the final 'l'; the second 'l' must directly precede it).
define Step_5b as (
['l']
R2 'l'
delete
)
)
// Entry point.
define stem as (
unset Y_found
// Upper-case a 'y' at the start of the word or directly after a vowel,
// so it is not treated as a vowel afterwards; remember that we did so.
do ( ['y'] <-'Y' set Y_found)
do repeat(goto (v ['y']) <-'Y' set Y_found)
// p1 / p2 default to the end of the word (empty R1 / R2) ...
$p1 = limit
$p2 = limit
// ... and are moved to the position after the first non-vowel that
// follows a vowel (p1), and the same again starting from p1 (p2).
do(
gopast v gopast non-v setmark p1
gopast v gopast non-v setmark p2
)
// The steps run in order scanning backwards; each is optional.
backwards (
do Step_1a
do Step_1b
do Step_1c
do Step_2
do Step_3
do Step_4
do Step_5a
do Step_5b
)
// Restore any 'Y' to lower case.
do(Y_found repeat(goto (['Y']) <-'y'))
)
// ---------------------------------------------------------------------
// Start of another stemmer script (the endings and the a-tilde /
// o-tilde handling below look Portuguese -- NOTE(review): confirm).
// ---------------------------------------------------------------------
routines (
prelude postlude mark_regions
RV R1 R2
standard_suffix
verb_suffix
residual_suffix
residual_form
)
externals ( stem )
integers ( pV p1 p2 )
groupings ( v )
stringescapes {}
/* special characters (in ISO Latin I) */
stringdef a' hex 'E1' // a-acute
stringdef a^ hex 'E2' // a-circumflex e.g. 'bota^nico
stringdef e' hex 'E9' // e-acute
stringdef e^ hex 'EA' // e-circumflex
stringdef i' hex 'ED' // i-acute
stringdef o^ hex 'F4' // o-circumflex
stringdef o' hex 'F3' // o-acute
stringdef u' hex 'FA' // u-acute
stringdef c, hex 'E7' // c-cedilla
stringdef a~ hex 'E3' // a-tilde
stringdef o~ hex 'F5' // o-tilde
// vowels; a-tilde and o-tilde are absent because prelude rewrites them
// as the two-character sequences 'a~' and 'o~'
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
// Rewrites a-tilde and o-tilde as the two-character sequences 'a~' and
// 'o~' throughout the word; postlude reverses this.
define prelude as repeat (
    [substring] among(
        '{o~}'  (<- 'o~')
        '{a~}'  (<- 'a~')
        ''      (next)      // any other character is stepped over
    )
)
// Sets the marks pV, p1 and p2 that bound the regions RV, R1 and R2.
define mark_regions as (

    // All three default to the end of the word, giving empty regions
    // when the searches below fail.
    $p2 = limit
    $p1 = limit
    $pV = limit

    // pV: depends on whether the first two letters are vowels.
    //   vowel, non-vowel      -> after the next vowel
    //   vowel, vowel          -> after the next non-vowel
    //   non-vowel, non-vowel  -> after the next vowel
    //   non-vowel, vowel      -> after the third letter
    do (
        ( v (non-v gopast v) or (v gopast non-v) )
        or
        ( non-v (non-v gopast v) or (v next) )

        setmark pV
    )

    // p1: after the first non-vowel that follows a vowel.
    // p2: the same rule applied again starting from p1.
    do (
        gopast v
        gopast non-v
        setmark p1

        gopast v
        gopast non-v
        setmark p2
    )
)
// Reverses prelude: the two-character sequences 'a~' and 'o~' become
// the single characters a-tilde and o-tilde again.
define postlude as repeat (
    [substring] among(
        'o~'  (<- '{o~}')
        'a~'  (<- '{a~}')
        ''    (next)        // any other character is stepped over
    )
)
backwardmode (
// Region tests used while scanning backwards: each succeeds when the
// cursor is at or to the right of the corresponding mark.
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
// Removes or rewrites one derivational suffix. The longest matching
// ending is chosen; each group then applies its own region test (R1, R2
// or RV) and fails, leaving the word untouched, if the test fails.
// Some groups go on to strip a further suffix exposed by the first cut.
//
// Tilde vowels appear here as 'a~' / 'o~' because prelude has already
// rewritten them.
//
// Fix: the '-logia' and '-u<c-cedilla>a~o' groups previously listed the
// Spanish spellings ('log<i-acute>a', 'uci<o-acute>n', 'uciones'),
// which do not occur in Portuguese text, so those two rules never fired.
define standard_suffix as (
    [substring] among(

        'eza' 'ezas'
        'ico' 'ica' 'icos' 'icas'
        'ismo' 'ismos'
        '{a'}vel'
        '{i'}vel'
        'ista' 'istas'
        'oso' 'osa' 'osos' 'osas'
        'amento' 'amentos'
        'imento' 'imentos'

        'adora' 'ador' 'a{c,}a~o'
        'adoras' 'adores' 'a{c,}o~es'  // no -ic test
        'ante' 'antes' '{a^}ncia' // Note 1
        (
            R2 delete
        )

        // Portuguese spelling has no accent here (e.g. biologia)
        'logia'
        'logias'
        (
            R2 <- 'log'
        )

        // -u<c-cedilla><a-tilde>o / -u<c-cedilla><o-tilde>es
        'u{c,}a~o' 'u{c,}o~es'
        (
            R2 <- 'u'
        )

        '{e^}ncia' '{e^}ncias'
        (
            R2 <- 'ente'
        )

        'amente'
        (
            R1 delete
            // then optionally a preceding 'iv' (and 'at' before it),
            // 'os', 'ic' or 'ad', each only when in R2
            try (
                [substring] R2 delete among(
                    'iv' (['at'] R2 delete)
                    'os'
                    'ic'
                    'ad'
                )
            )
        )

        'mente'
        (
            R2 delete
            try (
                [substring] among(
                    'ante' // Note 1
                    'avel'
                    '{i'}vel' (R2 delete)
                )
            )
        )

        'idade'
        'idades'
        (
            R2 delete
            try (
                [substring] among(
                    'abil'
                    'ic'
                    'iv'   (R2 delete)
                )
            )
        )

        'iva' 'ivo'
        'ivas' 'ivos'
        (
            R2 delete
            try (
                ['at'] R2 delete // but not a further ['ic'] R2 delete
            )
        )

        'ira' 'iras'
        (
            RV 'e'  // -eira -eiras usually non-verbal
            <- 'ir'
        )
    )
)
// Deletes the longest verb ending from the list, searching no further
// left than mark pV (the setlimit restricts the match to that region).
// Endings containing a-tilde are written 'a~' because prelude has
// already rewritten that character.
define verb_suffix as setlimit tomark pV for (
[substring] among(
'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
'{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
'{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
'{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
'{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
'ira' 'iras'
(delete)
)
)
// Deletes a final 'os', or a final a / i / o (plain or acute), when the
// ending lies in RV.
define residual_suffix as (
    [substring] among(
        '{a'}' '{i'}' '{o'}'
        'a' 'i' 'o'
        'os'
            ( RV delete )
    )
)
// Last clean-up of the stem:
//  - a final e (plain, acute or circumflex) in RV is deleted; the step
//    then also requires 'gu' or 'ci' before it, with the 'u' / 'i' in
//    RV, and deletes that 'u' / 'i' as well. If this second part fails
//    the routine fails (the caller wraps it in `do`, which restores the
//    cursor position -- NOTE(review): confirm whether the first
//    deletion is intended to stand in that case).
//  - a final c-cedilla becomes 'c'.
define residual_form as (
[substring] among(
'e' '{e'}' '{e^}'
( RV delete [('u'] test 'g') or
('i'] test 'c') RV delete )
'{c,}' (<-'c')
)
)
)
// Entry point.
define stem as (
// forward passes: encode tilde vowels, then mark RV / R1 / R2
do prelude
do mark_regions
backwards (
do (
// Try a standard suffix, else a verb suffix. If either succeeded,
// additionally drop a final 'i' in RV that follows a 'c'.
( ( standard_suffix or verb_suffix )
and do ( ['i'] test 'c' RV delete )
)
// Only when neither suffix routine succeeded:
or residual_suffix
)
do residual_form
)
// decode the tilde vowels again
do postlude
)
/*
Note 1: additions of 15 Jun 2005
*/
// ---------------------------------------------------------------------
// Start of another stemmer script. The routine list matches the script
// that precedes it; only the character codes below differ (this copy
// uses the MS-DOS Latin I code page, per the comment).
// ---------------------------------------------------------------------
routines (
prelude postlude mark_regions
RV R1 R2
standard_suffix
verb_suffix
residual_suffix
residual_form
)
externals ( stem )
integers ( pV p1 p2 )
groupings ( v )
stringescapes {}
/* special characters (in MS-DOS Latin I) */
stringdef a' hex 'A0' // a-acute
stringdef a^ hex '83' // a-circumflex e.g. 'bota^nico
stringdef e' hex '82' // e-acute
stringdef e^ hex '88' // e-circumflex
stringdef i' hex 'A1' // i-acute
stringdef o^ hex '93' // o-circumflex
stringdef o' hex 'A2' // o-acute
stringdef u' hex 'A3' // u-acute
stringdef c, hex '87' // c-cedilla
stringdef a~ hex 'C6' // a-tilde
stringdef o~ hex 'E4' // o-tilde
// vowels; a-tilde and o-tilde are absent because prelude rewrites them
// as the two-character sequences 'a~' and 'o~'
define v 'aeiou{a'}{e'}{i'}{o'}{u'}{a^}{e^}{o^}'
// Rewrites a-tilde and o-tilde as the two-character sequences 'a~' and
// 'o~' throughout the word; postlude reverses this.
define prelude as repeat (
    [substring] among(
        '{o~}'  (<- 'o~')
        '{a~}'  (<- 'a~')
        ''      (next)      // any other character is stepped over
    )
)
// Sets the marks pV, p1 and p2 that bound the regions RV, R1 and R2.
define mark_regions as (

    // All three default to the end of the word, giving empty regions
    // when the searches below fail.
    $p2 = limit
    $p1 = limit
    $pV = limit

    // pV: depends on whether the first two letters are vowels.
    //   vowel, non-vowel      -> after the next vowel
    //   vowel, vowel          -> after the next non-vowel
    //   non-vowel, non-vowel  -> after the next vowel
    //   non-vowel, vowel      -> after the third letter
    do (
        ( v (non-v gopast v) or (v gopast non-v) )
        or
        ( non-v (non-v gopast v) or (v next) )

        setmark pV
    )

    // p1: after the first non-vowel that follows a vowel.
    // p2: the same rule applied again starting from p1.
    do (
        gopast v
        gopast non-v
        setmark p1

        gopast v
        gopast non-v
        setmark p2
    )
)
// Reverses prelude: the two-character sequences 'a~' and 'o~' become
// the single characters a-tilde and o-tilde again.
define postlude as repeat (
    [substring] among(
        'o~'  (<- '{o~}')
        'a~'  (<- '{a~}')
        ''    (next)        // any other character is stepped over
    )
)
backwardmode (
// Region tests used while scanning backwards: each succeeds when the
// cursor is at or to the right of the corresponding mark.
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
// Removes or rewrites one derivational suffix. The longest matching
// ending is chosen; each group then applies its own region test (R1, R2
// or RV) and fails, leaving the word untouched, if the test fails.
// Some groups go on to strip a further suffix exposed by the first cut.
//
// Tilde vowels appear here as 'a~' / 'o~' because prelude has already
// rewritten them.
//
// Fix: the '-logia' and '-u<c-cedilla>a~o' groups previously listed the
// Spanish spellings ('log<i-acute>a', 'uci<o-acute>n', 'uciones'),
// which do not occur in Portuguese text, so those two rules never fired.
define standard_suffix as (
    [substring] among(

        'eza' 'ezas'
        'ico' 'ica' 'icos' 'icas'
        'ismo' 'ismos'
        '{a'}vel'
        '{i'}vel'
        'ista' 'istas'
        'oso' 'osa' 'osos' 'osas'
        'amento' 'amentos'
        'imento' 'imentos'

        'adora' 'ador' 'a{c,}a~o'
        'adoras' 'adores' 'a{c,}o~es'  // no -ic test
        'ante' 'antes' '{a^}ncia' // Note 1
        (
            R2 delete
        )

        // Portuguese spelling has no accent here (e.g. biologia)
        'logia'
        'logias'
        (
            R2 <- 'log'
        )

        // -u<c-cedilla><a-tilde>o / -u<c-cedilla><o-tilde>es
        'u{c,}a~o' 'u{c,}o~es'
        (
            R2 <- 'u'
        )

        '{e^}ncia' '{e^}ncias'
        (
            R2 <- 'ente'
        )

        'amente'
        (
            R1 delete
            // then optionally a preceding 'iv' (and 'at' before it),
            // 'os', 'ic' or 'ad', each only when in R2
            try (
                [substring] R2 delete among(
                    'iv' (['at'] R2 delete)
                    'os'
                    'ic'
                    'ad'
                )
            )
        )

        'mente'
        (
            R2 delete
            try (
                [substring] among(
                    'ante' // Note 1
                    'avel'
                    '{i'}vel' (R2 delete)
                )
            )
        )

        'idade'
        'idades'
        (
            R2 delete
            try (
                [substring] among(
                    'abil'
                    'ic'
                    'iv'   (R2 delete)
                )
            )
        )

        'iva' 'ivo'
        'ivas' 'ivos'
        (
            R2 delete
            try (
                ['at'] R2 delete // but not a further ['ic'] R2 delete
            )
        )

        'ira' 'iras'
        (
            RV 'e'  // -eira -eiras usually non-verbal
            <- 'ir'
        )
    )
)
// Deletes the longest verb ending from the list, searching no further
// left than mark pV (the setlimit restricts the match to that region).
// Endings containing a-tilde are written 'a~' because prelude has
// already rewritten that character.
define verb_suffix as setlimit tomark pV for (
[substring] among(
'ada' 'ida' 'ia' 'aria' 'eria' 'iria' 'ar{a'}' 'ara' 'er{a'}'
'era' 'ir{a'}' 'ava' 'asse' 'esse' 'isse' 'aste' 'este' 'iste'
'ei' 'arei' 'erei' 'irei' 'am' 'iam' 'ariam' 'eriam' 'iriam'
'aram' 'eram' 'iram' 'avam' 'em' 'arem' 'erem' 'irem' 'assem'
'essem' 'issem' 'ado' 'ido' 'ando' 'endo' 'indo' 'ara~o'
'era~o' 'ira~o' 'ar' 'er' 'ir' 'as' 'adas' 'idas' 'ias'
'arias' 'erias' 'irias' 'ar{a'}s' 'aras' 'er{a'}s' 'eras'
'ir{a'}s' 'avas' 'es' 'ardes' 'erdes' 'irdes' 'ares' 'eres'
'ires' 'asses' 'esses' 'isses' 'astes' 'estes' 'istes' 'is'
'ais' 'eis' '{i'}eis' 'ar{i'}eis' 'er{i'}eis' 'ir{i'}eis'
'{a'}reis' 'areis' '{e'}reis' 'ereis' '{i'}reis' 'ireis'
'{a'}sseis' '{e'}sseis' '{i'}sseis' '{a'}veis' 'ados' 'idos'
'{a'}mos' 'amos' '{i'}amos' 'ar{i'}amos' 'er{i'}amos'
'ir{i'}amos' '{a'}ramos' '{e'}ramos' '{i'}ramos' '{a'}vamos'
'emos' 'aremos' 'eremos' 'iremos' '{a'}ssemos' '{e^}ssemos'
'{i'}ssemos' 'imos' 'armos' 'ermos' 'irmos' 'eu' 'iu' 'ou'
'ira' 'iras'
(delete)
)
)
// Deletes a final 'os', or a final a / i / o (plain or acute), when the
// ending lies in RV.
define residual_suffix as (
    [substring] among(
        '{a'}' '{i'}' '{o'}'
        'a' 'i' 'o'
        'os'
            ( RV delete )
    )
)
// Last clean-up of the stem:
//  - a final e (plain, acute or circumflex) in RV is deleted; the step
//    then also requires 'gu' or 'ci' before it, with the 'u' / 'i' in
//    RV, and deletes that 'u' / 'i' as well. If this second part fails
//    the routine fails (the caller wraps it in `do`, which restores the
//    cursor position -- NOTE(review): confirm whether the first
//    deletion is intended to stand in that case).
//  - a final c-cedilla becomes 'c'.
define residual_form as (
[substring] among(
'e' '{e'}' '{e^}'
( RV delete [('u'] test 'g') or
('i'] test 'c') RV delete )
'{c,}' (<-'c')
)
)
)
// Entry point.
define stem as (
// forward passes: encode tilde vowels, then mark RV / R1 / R2
do prelude
do mark_regions
backwards (
do (
// Try a standard suffix, else a verb suffix. If either succeeded,
// additionally drop a final 'i' in RV that follows a 'c'.
( ( standard_suffix or verb_suffix )
and do ( ['i'] test 'c' RV delete )
)
// Only when neither suffix routine succeeded:
or residual_suffix
)
do residual_form
)
// decode the tilde vowels again
do postlude
)
/*
Note 1: additions of 15 Jun 2005
*/
// ---------------------------------------------------------------------
// Start of another stemmer script (the endings below, such as 'ului',
// 'ilor' and 'itate', look Romanian -- NOTE(review): confirm).
// ---------------------------------------------------------------------
routines (
prelude postlude mark_regions
RV R1 R2
step_0
standard_suffix combo_suffix
verb_suffix
vowel_suffix
)
externals ( stem )
integers ( pV p1 p2 )
groupings ( v )
// set when combo_suffix or standard_suffix changed the word; stem uses
// it to decide whether verb_suffix should still run
booleans ( standard_suffix_removed )
stringescapes {}
/* special characters */
// single-byte codes -- NOTE(review): these look like ISO 8859-2 values;
// confirm the intended encoding
stringdef a^ hex 'E2' // a circumflex
stringdef i^ hex 'EE' // i circumflex
stringdef a+ hex 'E3' // a breve
stringdef s, hex 'BA' // s cedilla
stringdef t, hex 'FE' // t cedilla
define v 'aeiou{a^}{i^}{a+}'
// Upper-cases every 'u' or 'i' that stands between two vowels, so that
// it is not treated as a vowel later; postlude reverses this.
define prelude as (
    repeat goto (
        v [ ('u' ] v <- 'U') or
            ('i' ] v <- 'I')
    )
)
// Sets the marks pV, p1 and p2 that bound the regions RV, R1 and R2.
define mark_regions as (

    // All three default to the end of the word, giving empty regions
    // when the searches below fail.
    $p2 = limit
    $p1 = limit
    $pV = limit

    // pV: depends on whether the first two letters are vowels.
    //   vowel, non-vowel      -> after the next vowel
    //   vowel, vowel          -> after the next non-vowel
    //   non-vowel, non-vowel  -> after the next vowel
    //   non-vowel, vowel      -> after the third letter
    do (
        ( v (non-v gopast v) or (v gopast non-v) )
        or
        ( non-v (non-v gopast v) or (v next) )

        setmark pV
    )

    // p1: after the first non-vowel that follows a vowel.
    // p2: the same rule applied again starting from p1.
    do (
        gopast v
        gopast non-v
        setmark p1

        gopast v
        gopast non-v
        setmark p2
    )
)
// Undoes the upper-casing done by prelude: every 'I' and 'U' in the
// word goes back to lower case.
define postlude as repeat (
    [substring] among(
        'U'  (<- 'u')
        'I'  (<- 'i')
        ''   (next)     // any other character is stepped over
    )
)
backwardmode (
// Region tests used while scanning backwards: each succeeds when the
// cursor is at or to the right of the corresponding mark.
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
// Step 0: removes or shortens an inflectional ending lying in R1.
// 'ile' is left alone when it is preceded by 'ab'.
define step_0 as (
[substring] R1 among(
'ul' 'ului'
( delete )
'aua'
( <-'a' )
'ea' 'ele' 'elor'
( <-'e' )
'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor'
( <-'i')
'ile'
( not 'ab' <- 'i' )
'atei'
( <- 'at' )
'a{t,}ie' 'a{t,}ia'
( <- 'a{t,}i' )
)
)
// Reduces a combined suffix lying in R1 to its first component and sets
// standard_suffix_removed. Wrapped in `test`, so the cursor is restored
// afterwards and the caller can apply it repeatedly.
define combo_suffix as test (
[substring] R1 (
among(
/* 'IST'. alternative: include the following
'alism' 'alisme'
'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' (
<- 'al'
)
*/
'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' (
<- 'abil'
)
'ibilitate' (
<- 'ibil'
)
'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' (
<- 'iv'
)
'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i'
'icator' 'icatori'
'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}'
'ical' 'icala' 'icale' 'icali' 'ical{a+}' (
<- 'ic'
)
'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune'
'atoare' 'ator' 'atori'
'{a+}toare' '{a+}tor' '{a+}tori' (
<- 'at'
)
'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune'
'itoare' 'itor' 'itori' (
<- 'it'
)
)
set standard_suffix_removed
)
)
// Clears standard_suffix_removed, applies combo_suffix for as long as
// it succeeds, then removes or rewrites one ending lying in R2 and sets
// the flag. Because combo_suffix also sets the flag, it can be true
// even when the R2 step below fails.
define standard_suffix as (
unset standard_suffix_removed
repeat combo_suffix
[substring] R2 (
among(
// past participle is treated here, rather than
// as a verb ending:
'at' 'ata' 'at{a+}' 'ati' 'ate'
'ut' 'uta' 'ut{a+}' 'uti' 'ute'
'it' 'ita' 'it{a+}' 'iti' 'ite'
'ic' 'ica' 'ice' 'ici' 'ic{a+}'
'abil' 'abila' 'abile' 'abili' 'abil{a+}'
'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}'
'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i'
'ant' 'anta' 'ante' 'anti' 'ant{a+}'
'ator' 'atori'
'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i'
'iv' 'iva' 'ive' 'ivi' 'iv{a+}' (
delete
)
// requires a t-cedilla before the ending; the slice is extended to
// include it and the whole is replaced by 't'
'iune' 'iuni' (
'{t,}'] <- 't'
)
'ism' 'isme'
'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' (
<- 'ist'
/* 'IST'. alternative: remove with <- '' */
)
)
set standard_suffix_removed
)
)
// Deletes the longest verb ending from the list, searching no further
// left than mark pV. Endings in the first group are deleted only when
// preceded by a non-vowel or by 'u'; endings in the second group are
// deleted unconditionally.
define verb_suffix as setlimit tomark pV for (
[substring] among(
// 'long' infinitive:
'are' 'ere' 'ire' '{a^}re'
// gerund:
'ind' '{a^}nd'
'indu' '{a^}ndu'
'eze'
'easc{a+}'
// present:
'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti'
'e{s,}te'
'{a+}sc' '{a+}{s,}ti'
'{a+}{s,}te'
// imperfect:
'am' 'ai' 'au'
'eam' 'eai' 'ea' 'ea{t,}i' 'eau'
'iam' 'iai' 'ia' 'ia{t,}i' 'iau'
// past: // (not 'ii')
'ui'
'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}'
'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}'
'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}'
'{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}'
// pluperfect:
'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}'
'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}'
'{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i'
'{a^}ser{a+}'
'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}'
( non-v or 'u' delete )
// present:
'{a+}m' 'a{t,}i'
'em' 'e{t,}i'
'im' 'i{t,}i'
'{a^}m' '{a^}{t,}i'
// past:
'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}'
'sei' 'se'
// pluperfect:
'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}'
(delete)
)
)
// Deletes a final vowel ending ('a', 'e', 'i', 'ie' or a-breve) when it
// lies in RV.
define vowel_suffix as (
    [substring] RV among (
        '{a+}' 'ie' 'i' 'e' 'a'
            ( delete )
    )
)
)
// Entry point. Forward passes upper-case intervocalic i/u and mark the
// regions. Scanning backwards: step_0, then standard_suffix; a verb
// suffix is tried only when standard_suffix_removed is still unset;
// finally a trailing vowel is removed. postlude restores the case of
// the letters prelude changed.
define stem as (
    do prelude
    do mark_regions

    backwards (
        do step_0
        do standard_suffix
        do ( standard_suffix_removed or verb_suffix )
        do vowel_suffix
    )

    do postlude
)
// ---------------------------------------------------------------------
// Start of another stemmer script. The routine list matches the script
// that precedes it; only the character codes below differ.
// ---------------------------------------------------------------------
routines (
prelude postlude mark_regions
RV R1 R2
step_0
standard_suffix combo_suffix
verb_suffix
vowel_suffix
)
externals ( stem )
integers ( pV p1 p2 )
groupings ( v )
// set when combo_suffix or standard_suffix changed the word; stem uses
// it to decide whether verb_suffix should still run
booleans ( standard_suffix_removed )
stringescapes {}
/* special characters */
// three-digit values, some above FF, so these cannot be single-byte
// codes -- NOTE(review): presumably Unicode code points; confirm
stringdef a^ hex '0E2' // a circumflex
stringdef i^ hex '0EE' // i circumflex
stringdef a+ hex '103' // a breve
stringdef s, hex '15F' // s cedilla
stringdef t, hex '163' // t cedilla
define v 'aeiou{a^}{i^}{a+}'
// Upper-cases every 'u' or 'i' that stands between two vowels, so that
// it is not treated as a vowel later; postlude reverses this.
define prelude as (
    repeat goto (
        v [ ('u' ] v <- 'U') or
            ('i' ] v <- 'I')
    )
)
// Sets the marks pV, p1 and p2 that bound the regions RV, R1 and R2.
define mark_regions as (

    // All three default to the end of the word, giving empty regions
    // when the searches below fail.
    $p2 = limit
    $p1 = limit
    $pV = limit

    // pV: depends on whether the first two letters are vowels.
    //   vowel, non-vowel      -> after the next vowel
    //   vowel, vowel          -> after the next non-vowel
    //   non-vowel, non-vowel  -> after the next vowel
    //   non-vowel, vowel      -> after the third letter
    do (
        ( v (non-v gopast v) or (v gopast non-v) )
        or
        ( non-v (non-v gopast v) or (v next) )

        setmark pV
    )

    // p1: after the first non-vowel that follows a vowel.
    // p2: the same rule applied again starting from p1.
    do (
        gopast v
        gopast non-v
        setmark p1

        gopast v
        gopast non-v
        setmark p2
    )
)
// Undoes the upper-casing done by prelude: every 'I' and 'U' in the
// word goes back to lower case.
define postlude as repeat (
    [substring] among(
        'U'  (<- 'u')
        'I'  (<- 'i')
        ''   (next)     // any other character is stepped over
    )
)
backwardmode (
// Region tests used while scanning backwards: each succeeds when the
// cursor is at or to the right of the corresponding mark.
define RV as $pV <= cursor
define R1 as $p1 <= cursor
define R2 as $p2 <= cursor
// Step 0: removes or shortens an inflectional ending lying in R1.
// 'ile' is left alone when it is preceded by 'ab'.
define step_0 as (
[substring] R1 among(
'ul' 'ului'
( delete )
'aua'
( <-'a' )
'ea' 'ele' 'elor'
( <-'e' )
'ii' 'iua' 'iei' 'iile' 'iilor' 'ilor'
( <-'i')
'ile'
( not 'ab' <- 'i' )
'atei'
( <- 'at' )
'a{t,}ie' 'a{t,}ia'
( <- 'a{t,}i' )
)
)
// Reduces a combined suffix lying in R1 to its first component and sets
// standard_suffix_removed. Wrapped in `test`, so the cursor is restored
// afterwards and the caller can apply it repeatedly.
define combo_suffix as test (
[substring] R1 (
among(
/* 'IST'. alternative: include the following
'alism' 'alisme'
'alist' 'alista' 'aliste' 'alisti' 'alist{a+}' 'ali{s,}ti' (
<- 'al'
)
*/
'abilitate' 'abilitati' 'abilit{a+}i' 'abilit{a+}{t,}i' (
<- 'abil'
)
'ibilitate' (
<- 'ibil'
)
'ivitate' 'ivitati' 'ivit{a+}i' 'ivit{a+}{t,}i' (
<- 'iv'
)
'icitate' 'icitati' 'icit{a+}i' 'icit{a+}{t,}i'
'icator' 'icatori'
'iciv' 'iciva' 'icive' 'icivi' 'iciv{a+}'
'ical' 'icala' 'icale' 'icali' 'ical{a+}' (
<- 'ic'
)
'ativ' 'ativa' 'ative' 'ativi' 'ativ{a+}' 'a{t,}iune'
'atoare' 'ator' 'atori'
'{a+}toare' '{a+}tor' '{a+}tori' (
<- 'at'
)
'itiv' 'itiva' 'itive' 'itivi' 'itiv{a+}' 'i{t,}iune'
'itoare' 'itor' 'itori' (
<- 'it'
)
)
set standard_suffix_removed
)
)
// Clears standard_suffix_removed, applies combo_suffix for as long as
// it succeeds, then removes or rewrites one ending lying in R2 and sets
// the flag. Because combo_suffix also sets the flag, it can be true
// even when the R2 step below fails.
define standard_suffix as (
unset standard_suffix_removed
repeat combo_suffix
[substring] R2 (
among(
// past participle is treated here, rather than
// as a verb ending:
'at' 'ata' 'at{a+}' 'ati' 'ate'
'ut' 'uta' 'ut{a+}' 'uti' 'ute'
'it' 'ita' 'it{a+}' 'iti' 'ite'
'ic' 'ica' 'ice' 'ici' 'ic{a+}'
'abil' 'abila' 'abile' 'abili' 'abil{a+}'
'ibil' 'ibila' 'ibile' 'ibili' 'ibil{a+}'
'oasa' 'oas{a+}' 'oase' 'os' 'osi' 'o{s,}i'
'ant' 'anta' 'ante' 'anti' 'ant{a+}'
'ator' 'atori'
'itate' 'itati' 'it{a+}i' 'it{a+}{t,}i'
'iv' 'iva' 'ive' 'ivi' 'iv{a+}' (
delete
)
// requires a t-cedilla before the ending; the slice is extended to
// include it and the whole is replaced by 't'
'iune' 'iuni' (
'{t,}'] <- 't'
)
'ism' 'isme'
'ist' 'ista' 'iste' 'isti' 'ist{a+}' 'i{s,}ti' (
<- 'ist'
/* 'IST'. alternative: remove with <- '' */
)
)
set standard_suffix_removed
)
)
// Remove one verb ending. The search is confined by setlimit to the part
// of the word between mark pV and the end. Endings in the first group are
// deleted only when preceded by a non-vowel or by 'u'; endings in the
// second group are deleted unconditionally.
define verb_suffix as setlimit tomark pV for (
[substring] among(
// 'long' infinitive:
'are' 'ere' 'ire' '{a^}re'
// gerund:
'ind' '{a^}nd'
'indu' '{a^}ndu'
'eze'
'easc{a+}'
// present:
'ez' 'ezi' 'eaz{a+}' 'esc' 'e{s,}ti'
'e{s,}te'
'{a+}sc' '{a+}{s,}ti'
'{a+}{s,}te'
// imperfect:
'am' 'ai' 'au'
'eam' 'eai' 'ea' 'ea{t,}i' 'eau'
'iam' 'iai' 'ia' 'ia{t,}i' 'iau'
// past: // (not 'ii')
'ui'
'a{s,}i' 'ar{a+}m' 'ar{a+}{t,}i' 'ar{a+}'
'u{s,}i' 'ur{a+}m' 'ur{a+}{t,}i' 'ur{a+}'
'i{s,}i' 'ir{a+}m' 'ir{a+}{t,}i' 'ir{a+}'
'{a^}i' '{a^}{s,}i' '{a^}r{a+}m' '{a^}r{a+}{t,}i' '{a^}r{a+}'
// pluperfect:
'asem' 'ase{s,}i' 'ase' 'aser{a+}m' 'aser{a+}{t,}i' 'aser{a+}'
'isem' 'ise{s,}i' 'ise' 'iser{a+}m' 'iser{a+}{t,}i' 'iser{a+}'
'{a^}sem' '{a^}se{s,}i' '{a^}se' '{a^}ser{a+}m' '{a^}ser{a+}{t,}i'
'{a^}ser{a+}'
'usem' 'use{s,}i' 'use' 'user{a+}m' 'user{a+}{t,}i' 'user{a+}'
// first group: delete only after a non-vowel or 'u'
( non-v or 'u' delete )
// present:
'{a+}m' 'a{t,}i'
'em' 'e{t,}i'
'im' 'i{t,}i'
'{a^}m' '{a^}{t,}i'
// past:
'se{s,}i' 'ser{a+}m' 'ser{a+}{t,}i' 'ser{a+}'
'sei' 'se'
// pluperfect:
'sesem' 'sese{s,}i' 'sese' 'seser{a+}m' 'seser{a+}{t,}i' 'seser{a+}'
// second group: delete unconditionally
(delete)
)
)
// Delete a final vowel ending (longest match) provided it lies inside RV.
define vowel_suffix as (
    [substring] RV among (
        'ie' 'i' 'e' 'a' '{a+}'     ( delete )
    )
)
)
// Entry point. Every step is wrapped in do, so a step that fails leaves
// the cursor where it was and the remaining steps still run.
define stem as (
    do prelude
    do mark_regions

    backwards (
        do step_0
        do standard_suffix
        // verb_suffix is tried only when standard_suffix_removed is unset
        do ( standard_suffix_removed or verb_suffix )
        do vowel_suffix
    )

    do postlude
)
// Names written between { and } inside string literals refer to the
// stringdefs declared below.
stringescapes {}
/* the 32 Cyrillic letters in the KOI8-R coding scheme, and represented
in Latin characters following the conventions of the standard Library
of Congress transliteration: */
stringdef a hex 'C1'
stringdef b hex 'C2'
stringdef v hex 'D7'
stringdef g hex 'C7'
stringdef d hex 'C4'
stringdef e hex 'C5'
stringdef zh hex 'D6'
stringdef z hex 'DA'
stringdef i hex 'C9'
stringdef i` hex 'CA'
stringdef k hex 'CB'
stringdef l hex 'CC'
stringdef m hex 'CD'
stringdef n hex 'CE'
stringdef o hex 'CF'
stringdef p hex 'D0'
stringdef r hex 'D2'
stringdef s hex 'D3'
stringdef t hex 'D4'
stringdef u hex 'D5'
stringdef f hex 'C6'
stringdef kh hex 'C8'
stringdef ts hex 'C3'
stringdef ch hex 'DE'
stringdef sh hex 'DB'
stringdef shch hex 'DD'
stringdef " hex 'DF'
stringdef y hex 'D9'
stringdef ' hex 'D8'
stringdef e` hex 'DC'
stringdef iu hex 'C0'
stringdef ia hex 'D1'
// Routines defined in this script; only stem is exported.
routines ( mark_regions R2
perfective_gerund
adjective
adjectival
reflexive
verb
noun
derivational
tidy_up
)
externals ( stem )
// Region marks: assigned in mark_regions; pV limits the backward pass in
// stem, p2 is tested by R2.
integers ( pV p2 )
groupings ( v )
// v: the vowel grouping used by mark_regions.
define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'
// Compute the region marks.
//   pV: just past the first vowel of the word
//   p2: reached from pV by moving past a non-vowel, then past a vowel,
//       then past another non-vowel
// A mark that cannot be reached keeps its default value, limit.
define mark_regions as (
    $p2 = limit
    $pV = limit

    do (
        gopast v      setmark pV
        gopast non-v
        gopast v
        gopast non-v  setmark p2
    )
)
backwardmode (
// Region test, used in backward mode: succeeds when the cursor is at or
// to the right of mark p2.
define R2 as $p2 <= cursor
// Remove a perfective gerund ending (longest match).
define perfective_gerund as (
    [substring] among (
        // group 1: deleted only when preceded by {a} or {ia}
        '{v}{sh}{i}{s}{'}'
        '{v}{sh}{i}'
        '{v}'
            ( '{a}' or '{ia}' delete )

        // group 2: deleted unconditionally
        '{i}{v}{sh}{i}{s}{'}'   '{y}{v}{sh}{i}{s}{'}'
        '{i}{v}{sh}{i}'         '{y}{v}{sh}{i}'
        '{i}{v}'                '{y}{v}'
            ( delete )
    )
)
// Remove an adjective ending: the longest string in the list that matches
// at the cursor is deleted. Fails when none of them matches.
define adjective as (
[substring] among (
'{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
'{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
'{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
'{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
'{ia}{ia}'
// and -
'{o}{iu}' // - which is somewhat archaic
'{e}{iu}' // - soft form of {o}{iu}
(delete)
)
)
// Remove an adjective ending and then, optionally, a participle ending in
// front of it. The routine fails only when adjective fails.
define adjectival as (
    adjective

    /* Participle forms: em, vsh, ivsh and yvsh are readily removable.
       nn, {iu}shch, shch and u{iu}shch can be removed with a small
       proportion of errors. im, uem and enn are left alone, because
       removing them creates too many errors.
    */
    try (
        [substring] among (
            // group 1: deleted only when preceded by {a} or {ia}
            '{e}{m}'                    // present passive participle
            '{n}{n}'                    // adjective from past passive participle
            '{v}{sh}'                   // past active participle
            '{shch}' '{iu}{shch}'       // present active participle
                ( '{a}' or '{ia}' delete )

            // group 2: deleted unconditionally
            // (not listed: '{i}{m}' '{u}{e}{m}', present passive participle,
            //  and '{e}{n}{n}', adjective from past passive participle)
            '{y}{v}{sh}' '{i}{v}{sh}'   // past active participle
            '{u}{iu}{shch}'             // present active participle
                ( delete )
        )
    )
)
// Delete a reflexive ending, {s}{ia} or {s}{'}.
define reflexive as (
    [substring] among (
        '{s}{'}' '{s}{ia}'      ( delete )
    )
)
// Remove a verb ending (longest match). Endings in the first group are
// deleted only when preceded by {a} or {ia}; endings in the second group
// are deleted unconditionally.
define verb as (
[substring] among (
'{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
'{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
'{n}{y}' '{t}{'}' '{e}{sh}{'}'
'{n}{n}{o}'
('{a}' or '{ia}' delete)
'{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
'{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
'{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
'{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
'{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
'{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
(delete)
/* note the short passive participle tests:
'{n}{a}' '{n}' '{n}{o}' '{n}{y}'
'{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
*/
)
)
// Remove a noun ending: the longest string in the list that matches at the
// cursor is deleted. Fails when none of them matches.
define noun as (
[substring] among (
'{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
'{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
'{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
'{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
'{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
'{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
(delete)
/* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
'{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{kh}'
omitted - they only occur on 12 words.
*/
)
)
// Delete the derivational ending {o}{s}{t} or {o}{s}{t}{'} when it lies
// inside R2.
define derivational as (
    [substring] R2 among (
        '{o}{s}{t}{'}' '{o}{s}{t}'      ( delete )
    )
)
// Final clean-up, chosen by the longest match at the end of the word.
define tidy_up as (
    [substring] among (
        // superlative forms: delete the ending; what remains must then end
        // in {n}{n}, and one {n} of the pair is deleted
        '{e}{i`}{sh}{e}'
        '{e}{i`}{sh}'
            ( delete
              [ '{n}' ] '{n}' delete
            )

        // e.g. -nno endings: a final {n} is deleted when another {n}
        // precedes it
        '{n}'
            ( '{n}' delete )

        // a final {'} is deleted, with some slight false conflations
        '{'}'
            ( delete )
    )
)
)
// Entry point. After the marks are computed, the word is processed from
// its end, with the backward pass confined to the part after mark pV.
define stem as (
    do mark_regions

    backwards setlimit tomark pV for (
        // either a perfective gerund ending, or an optional reflexive
        // ending followed by an adjectival, verb or noun ending
        do (
            perfective_gerund
            or
            (
                try reflexive
                adjectival or verb or noun
            )
        )

        // because noun ending -i{iu} is being treated as verb ending -{iu}
        try ( [ '{i}' ] delete )

        do derivational
        do tidy_up
    )
)
// Names written between { and } inside string literals refer to the
// stringdefs declared below.
stringescapes {}
/* the 32 Cyrillic letters in Unicode */
// The names are the same Latin transliterations used for each letter in
// the string literals of the routines below.
stringdef a hex '430'
stringdef b hex '431'
stringdef v hex '432'
stringdef g hex '433'
stringdef d hex '434'
stringdef e hex '435'
stringdef zh hex '436'
stringdef z hex '437'
stringdef i hex '438'
stringdef i` hex '439'
stringdef k hex '43A'
stringdef l hex '43B'
stringdef m hex '43C'
stringdef n hex '43D'
stringdef o hex '43E'
stringdef p hex '43F'
stringdef r hex '440'
stringdef s hex '441'
stringdef t hex '442'
stringdef u hex '443'
stringdef f hex '444'
stringdef kh hex '445'
stringdef ts hex '446'
stringdef ch hex '447'
stringdef sh hex '448'
stringdef shch hex '449'
stringdef " hex '44A'
stringdef y hex '44B'
stringdef ' hex '44C'
stringdef e` hex '44D'
stringdef iu hex '44E'
stringdef ia hex '44F'
// Routines defined in this script; only stem is exported.
routines ( mark_regions R2
perfective_gerund
adjective
adjectival
reflexive
verb
noun
derivational
tidy_up
)
externals ( stem )
// Region marks: assigned in mark_regions; pV limits the backward pass in
// stem, p2 is tested by R2.
integers ( pV p2 )
groupings ( v )
// v: the vowel grouping used by mark_regions.
define v '{a}{e}{i}{o}{u}{y}{e`}{iu}{ia}'
// Compute the region marks.
//   pV: just past the first vowel of the word
//   p2: reached from pV by moving past a non-vowel, then past a vowel,
//       then past another non-vowel
// A mark that cannot be reached keeps its default value, limit.
define mark_regions as (
    $p2 = limit
    $pV = limit

    do (
        gopast v      setmark pV
        gopast non-v
        gopast v
        gopast non-v  setmark p2
    )
)
backwardmode (
// Region test, used in backward mode: succeeds when the cursor is at or
// to the right of mark p2.
define R2 as $p2 <= cursor
// Remove a perfective gerund ending (longest match).
define perfective_gerund as (
    [substring] among (
        // group 1: deleted only when preceded by {a} or {ia}
        '{v}{sh}{i}{s}{'}'
        '{v}{sh}{i}'
        '{v}'
            ( '{a}' or '{ia}' delete )

        // group 2: deleted unconditionally
        '{i}{v}{sh}{i}{s}{'}'   '{y}{v}{sh}{i}{s}{'}'
        '{i}{v}{sh}{i}'         '{y}{v}{sh}{i}'
        '{i}{v}'                '{y}{v}'
            ( delete )
    )
)
// Remove an adjective ending: the longest string in the list that matches
// at the cursor is deleted. Fails when none of them matches.
define adjective as (
[substring] among (
'{e}{e}' '{i}{e}' '{y}{e}' '{o}{e}' '{i}{m}{i}' '{y}{m}{i}'
'{e}{i`}' '{i}{i`}' '{y}{i`}' '{o}{i`}' '{e}{m}' '{i}{m}'
'{y}{m}' '{o}{m}' '{e}{g}{o}' '{o}{g}{o}' '{e}{m}{u}'
'{o}{m}{u}' '{i}{kh}' '{y}{kh}' '{u}{iu}' '{iu}{iu}' '{a}{ia}'
'{ia}{ia}'
// and -
'{o}{iu}' // - which is somewhat archaic
'{e}{iu}' // - soft form of {o}{iu}
(delete)
)
)
// Remove an adjective ending and then, optionally, a participle ending in
// front of it. The routine fails only when adjective fails.
define adjectival as (
    adjective

    /* Participle forms: em, vsh, ivsh and yvsh are readily removable.
       nn, {iu}shch, shch and u{iu}shch can be removed with a small
       proportion of errors. im, uem and enn are left alone, because
       removing them creates too many errors.
    */
    try (
        [substring] among (
            // group 1: deleted only when preceded by {a} or {ia}
            '{e}{m}'                    // present passive participle
            '{n}{n}'                    // adjective from past passive participle
            '{v}{sh}'                   // past active participle
            '{shch}' '{iu}{shch}'       // present active participle
                ( '{a}' or '{ia}' delete )

            // group 2: deleted unconditionally
            // (not listed: '{i}{m}' '{u}{e}{m}', present passive participle,
            //  and '{e}{n}{n}', adjective from past passive participle)
            '{y}{v}{sh}' '{i}{v}{sh}'   // past active participle
            '{u}{iu}{shch}'             // present active participle
                ( delete )
        )
    )
)
// Delete a reflexive ending, {s}{ia} or {s}{'}.
define reflexive as (
    [substring] among (
        '{s}{'}' '{s}{ia}'      ( delete )
    )
)
// Remove a verb ending (longest match). Endings in the first group are
// deleted only when preceded by {a} or {ia}; endings in the second group
// are deleted unconditionally.
define verb as (
[substring] among (
'{l}{a}' '{n}{a}' '{e}{t}{e}' '{i`}{t}{e}' '{l}{i}' '{i`}'
'{l}' '{e}{m}' '{n}' '{l}{o}' '{n}{o}' '{e}{t}' '{iu}{t}'
'{n}{y}' '{t}{'}' '{e}{sh}{'}'
'{n}{n}{o}'
('{a}' or '{ia}' delete)
'{i}{l}{a}' '{y}{l}{a}' '{e}{n}{a}' '{e}{i`}{t}{e}'
'{u}{i`}{t}{e}' '{i}{t}{e}' '{i}{l}{i}' '{y}{l}{i}' '{e}{i`}'
'{u}{i`}' '{i}{l}' '{y}{l}' '{i}{m}' '{y}{m}' '{e}{n}'
'{i}{l}{o}' '{y}{l}{o}' '{e}{n}{o}' '{ia}{t}' '{u}{e}{t}'
'{u}{iu}{t}' '{i}{t}' '{y}{t}' '{e}{n}{y}' '{i}{t}{'}'
'{y}{t}{'}' '{i}{sh}{'}' '{u}{iu}' '{iu}'
(delete)
/* note the short passive participle tests:
'{n}{a}' '{n}' '{n}{o}' '{n}{y}'
'{e}{n}{a}' '{e}{n}' '{e}{n}{o}' '{e}{n}{y}'
*/
)
)
// Remove a noun ending: the longest string in the list that matches at the
// cursor is deleted. Fails when none of them matches.
define noun as (
[substring] among (
'{a}' '{e}{v}' '{o}{v}' '{i}{e}' '{'}{e}' '{e}'
'{i}{ia}{m}{i}' '{ia}{m}{i}' '{a}{m}{i}' '{e}{i}' '{i}{i}'
'{i}' '{i}{e}{i`}' '{e}{i`}' '{o}{i`}' '{i}{i`}' '{i`}'
'{i}{ia}{m}' '{ia}{m}' '{i}{e}{m}' '{e}{m}' '{a}{m}' '{o}{m}'
'{o}' '{u}' '{a}{kh}' '{i}{ia}{kh}' '{ia}{kh}' '{y}' '{'}'
'{i}{iu}' '{'}{iu}' '{iu}' '{i}{ia}' '{'}{ia}' '{ia}'
(delete)
/* the small class of neuter forms '{e}{n}{i}' '{e}{n}{e}{m}'
'{e}{n}{a}' '{e}{n}' '{e}{n}{a}{m}' '{e}{n}{a}{m}{i}' '{e}{n}{a}{kh}'
omitted - they only occur on 12 words.
*/
)
)
// Delete the derivational ending {o}{s}{t} or {o}{s}{t}{'} when it lies
// inside R2.
define derivational as (
    [substring] R2 among (
        '{o}{s}{t}{'}' '{o}{s}{t}'      ( delete )
    )
)
// Final clean-up, chosen by the longest match at the end of the word.
define tidy_up as (
    [substring] among (
        // superlative forms: delete the ending; what remains must then end
        // in {n}{n}, and one {n} of the pair is deleted
        '{e}{i`}{sh}{e}'
        '{e}{i`}{sh}'
            ( delete
              [ '{n}' ] '{n}' delete
            )

        // e.g. -nno endings: a final {n} is deleted when another {n}
        // precedes it
        '{n}'
            ( '{n}' delete )

        // a final {'} is deleted, with some slight false conflations
        '{'}'
            ( delete )
    )
)
)
// Entry point. After the marks are computed, the word is processed from
// its end, with the backward pass confined to the part after mark pV.
define stem as (
    do mark_regions

    backwards setlimit tomark pV for (
        // either a perfective gerund ending, or an optional reflexive
        // ending followed by an adjectival, verb or noun ending
        do (
            perfective_gerund
            or
            (
                try reflexive
                adjectival or verb or noun
            )
        )

        // because noun ending -i{iu} is being treated as verb ending -{iu}
        try ( [ '{i}' ] delete )

        do derivational
        do tidy_up
    )
)
This diff is collapsed.
// Routines defined in this script; only stem is exported.
routines (
mark_regions
main_suffix
consonant_pair
other_suffix
)
externals ( stem )
// p1: region mark assigned in mark_regions and used as the limit in every
// suffix routine. x: scratch mark, three characters into the word.
integers ( p1 x )
groupings ( v s_ending )
// Names written between { and } inside string literals refer to the
// stringdefs declared below.
stringescapes {}
/* special characters (in ISO Latin I) */
stringdef a" hex 'E4'
stringdef ao hex 'E5'
stringdef o" hex 'F6'
// v: the vowel grouping used by mark_regions.
define v 'aeiouy{a"}{ao}{o"}'
// s_ending: the letters after which main_suffix deletes a final 's'.
define s_ending 'bcdfghjklmnoprtvy'
// Compute mark p1: just past the first non-vowel that follows a vowel, but
// never fewer than three characters into the word. The statement order
// matters: p1 is given its default before hop 3 gets a chance to fail on a
// short word.
define mark_regions as (
    $p1 = limit

    // x = position three characters in; the cursor itself is not moved
    test ( hop 3  setmark x )

    goto v  gopast non-v  setmark p1

    // raise p1 to x when it falls short of it
    try ( $p1 < x  $p1 = x )
)
backwardmode (
// Find the longest listed ending that lies after mark p1 and delete it.
// A bare final 's' is deleted only when the letter before it belongs to
// s_ending.
define main_suffix as (
    setlimit tomark p1 for ( [substring] )
    among (
        'a' 'arna' 'erna' 'heterna' 'orna'
        'ad'
        'e' 'ade' 'ande' 'arne' 'are' 'aste'
        'en' 'anden' 'aren' 'heten' 'ern'
        'ar' 'er' 'heter' 'or'
        'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes'
        'ens' 'arens' 'hetens' 'erns'
        'at' 'andet' 'het' 'ast'
            ( delete )

        's'
            ( s_ending delete )
    )
)
// When the part of the word after mark p1 ends in one of the listed
// consonant pairs, delete the last letter of the word. The pair is only
// tested ("and" puts the cursor back before running the second command).
define consonant_pair as
    setlimit tomark p1 for (
        among ( 'dd' 'dt' 'gd' 'gt' 'kt' 'nn' 'tt' )
        and
        ( [ next ] delete )
    )
// Handle one further ending (longest match) lying after mark p1: either
// shorten it or delete it outright.
define other_suffix as
    setlimit tomark p1 for (
        [substring] among (
            'fullt'             ( <- 'full' )
            'l{o"}st'           ( <- 'l{o"}s' )
            'els' 'ig' 'lig'    ( delete )
        )
    )
)
// Entry point: compute the region mark, then run the three suffix steps
// from the end of the word. Each step is wrapped in do, so a failing step
// does not stop the ones after it.
define stem as (
    do mark_regions

    backwards (
        do main_suffix
        do consonant_pair
        do other_suffix
    )
)
// Routines defined in this script; only stem is exported.
routines (
mark_regions
main_suffix
consonant_pair
other_suffix
)
externals ( stem )
// p1: region mark assigned in mark_regions and used as the limit in every
// suffix routine. x: scratch mark, three characters into the word.
integers ( p1 x )
groupings ( v s_ending )
// Names written between { and } inside string literals refer to the
// stringdefs declared below.
stringescapes {}
/* special characters (in MS-DOS Latin I) */
stringdef a" hex '84'
stringdef ao hex '86'
stringdef o" hex '94'
// v: the vowel grouping used by mark_regions.
define v 'aeiouy{a"}{ao}{o"}'
// s_ending: the letters after which main_suffix deletes a final 's'.
define s_ending 'bcdfghjklmnoprtvy'
// Compute mark p1: just past the first non-vowel that follows a vowel, but
// never fewer than three characters into the word. The statement order
// matters: p1 is given its default before hop 3 gets a chance to fail on a
// short word.
define mark_regions as (
    $p1 = limit

    // x = position three characters in; the cursor itself is not moved
    test ( hop 3  setmark x )

    goto v  gopast non-v  setmark p1

    // raise p1 to x when it falls short of it
    try ( $p1 < x  $p1 = x )
)
backwardmode (
// Find the longest listed ending that lies after mark p1 and delete it.
// A bare final 's' is deleted only when the letter before it belongs to
// s_ending.
define main_suffix as (
    setlimit tomark p1 for ( [substring] )
    among (
        'a' 'arna' 'erna' 'heterna' 'orna'
        'ad'
        'e' 'ade' 'ande' 'arne' 'are' 'aste'
        'en' 'anden' 'aren' 'heten' 'ern'
        'ar' 'er' 'heter' 'or'
        'as' 'arnas' 'ernas' 'ornas' 'es' 'ades' 'andes'
        'ens' 'arens' 'hetens' 'erns'
        'at' 'andet' 'het' 'ast'
            ( delete )

        's'
            ( s_ending delete )
    )
)
// When the part of the word after mark p1 ends in one of the listed
// consonant pairs, delete the last letter of the word. The pair is only
// tested ("and" puts the cursor back before running the second command).
define consonant_pair as
    setlimit tomark p1 for (
        among ( 'dd' 'dt' 'gd' 'gt' 'kt' 'nn' 'tt' )
        and
        ( [ next ] delete )
    )
// Handle one further ending (longest match) lying after mark p1: either
// shorten it or delete it outright.
define other_suffix as
    setlimit tomark p1 for (
        [substring] among (
            'fullt'             ( <- 'full' )
            'l{o"}st'           ( <- 'l{o"}s' )
            'els' 'ig' 'lig'    ( delete )
        )
    )
)
// Entry point: compute the region mark, then run the three suffix steps
// from the end of the word. Each step is wrapped in do, so a failing step
// does not stop the ones after it.
define stem as (
    do mark_regions

    backwards (
        do main_suffix
        do consonant_pair
        do other_suffix
    )
)
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment