| Lingua-Stem-Ru documentation | Contained in the Lingua-Stem-Ru distribution. |
Lingua::Stem::Ru - Porter's stemming algorithm for Russian (KOI8-R only)
use Lingua::Stem::Ru;
my $stems = Lingua::Stem::Ru::stem({ -words => $word_list_reference,
-locale => 'ru',
-exceptions => $exceptions_hash,
});
my $stem = Lingua::Stem::Ru::stem_word( $word );
This module applies the Porter Stemming Algorithm to its parameters, returning the stemmed words.
The algorithm is implemented exactly as described in:
http://snowball.tartarus.org/russian/stemmer.html
The code is carefully crafted to work in conjunction with the Lingua::Stem module by Benjamin Franz. This stemmer is also based on the work of Aldo Capini, see Lingua::Stem::It.
Stems a list of passed words. Returns an anonymous list reference to the stemmed words.
Example:
my $stemmed_words = Lingua::Stem::Ru::stem({ -words => \@words,
-locale => 'ru',
-exceptions => \%exceptions,
});
Stems a single word and returns the stem directly.
Example:
my $stem = Lingua::Stem::Ru::stem_word( $word );
Sets the level of stem caching.
'0' means 'no caching'. This is the default level.
'1' means 'cache per run'. This caches stemming results during a single call to 'stem'.
'2' means 'cache indefinitely'. This caches stemming results until either the process exits or the 'clear_stem_cache' method is called.
Clears the cache of stemmed words
None by default.
Aleksandr Guidrevitch <pillgrim@mail.ru>
Lingua::Stem
Copyright (C) 2003 by Aldo Calpini <dada@perl.it>
Copyright (C) 2004 by Aleksandr Guidrevitch <pillgrim@mail.ru>
This software may be freely copied and distributed under the same terms and conditions as Perl itself, either Perl version 5.8.3 or, at your option, any later version of Perl 5 you may have available..
| Lingua-Stem-Ru documentation | Contained in the Lingua-Stem-Ru distribution. |
package Lingua::Stem::Ru; use strict; use strict; use Exporter; use Carp; use vars qw (@ISA @EXPORT_OK @EXPORT %EXPORT_TAGS $VERSION); BEGIN { @ISA = qw (Exporter); @EXPORT = (); @EXPORT_OK = qw (stem stem_word clear_stem_cache stem_caching); %EXPORT_TAGS = (); } $VERSION = "0.01"; my $Stem_Caching = 0; my $Stem_Cache = {}; my $VOWEL = qr/ÁÅÉÏÕÙÜÀÑ/; my $PERFECTIVEGROUND = qr/((É×|É×ÛÉ|É×ÛÉÓØ|Ù×|Ù×ÛÉ|Ù×ÛÉÓØ)|((?<=[ÁÑ])(×|×ÛÉ|×ÛÉÓØ)))$/; my $REFLEXIVE = qr/(Ó[ÑØ])$/; my $ADJECTIVE = qr/(ÅÅ|ÉÅ|ÙÅ|ÏÅ|ÉÍÉ|ÙÍÉ|ÅÊ|ÉÊ|ÙÊ|ÏÊ|ÅÍ|ÉÍ|ÙÍ|ÏÍ|ÅÇÏ|ÏÇÏ|ÅÍÕ|ÏÍÕ|ÉÈ|ÙÈ|ÕÀ|ÀÀ|ÁÑ|ÑÑ|ÏÀ|ÅÀ)$/; my $PARTICIPLE = qr/((É×Û|Ù×Û|ÕÀÝ)|((?<=[ÁÑ])(ÅÍ|ÎÎ|×Û|ÀÝ|Ý)))$/; my $VERB = qr/((ÉÌÁ|ÙÌÁ|ÅÎÁ|ÅÊÔÅ|ÕÊÔÅ|ÉÔÅ|ÉÌÉ|ÙÌÉ|ÅÊ|ÕÊ|ÉÌ|ÙÌ|ÉÍ|ÙÍ|ÅÎ|ÉÌÏ|ÙÌÏ|ÅÎÏ|ÑÔ|ÕÅÔ|ÕÀÔ|ÉÔ|ÙÔ|ÅÎÙ|ÉÔØ|ÙÔØ|ÉÛØ|ÕÀ|À)|((?<=[ÁÑ])(ÌÁ|ÎÁ|ÅÔÅ|ÊÔÅ|ÌÉ|Ê|Ì|ÅÍ|Î|ÌÏ|ÎÏ|ÅÔ|ÀÔ|ÎÙ|ÔØ|ÅÛØ|ÎÎÏ)))$/; my $NOUN = qr/(Á|Å×|Ï×|ÉÅ|ØÅ|Å|ÉÑÍÉ|ÑÍÉ|ÁÍÉ|ÅÉ|ÉÉ|É|ÉÅÊ|ÅÊ|ÏÊ|ÉÊ|Ê|ÉÑÍ|ÑÍ|ÉÅÍ|ÅÍ|ÁÍ|ÏÍ|Ï|Õ|ÁÈ|ÉÑÈ|ÑÈ|Ù|Ø|ÉÀ|ØÀ|À|ÉÑ|ØÑ|Ñ)$/; my $RVRE = qr/^(.*?[$VOWEL])(.*)$/; my $DERIVATIONAL = qr/[^$VOWEL][$VOWEL]+[^$VOWEL]+[$VOWEL].*(?<=Ï)ÓÔØ?$/; sub stem { return [] if ($#_ == -1); my $parm_ref; if (ref $_[0]) { $parm_ref = shift; } else { $parm_ref = { @_ }; } my $words = []; my $locale = 'ru'; my $exceptions = {}; foreach (keys %$parm_ref) { my $key = lc ($_); if ($key eq '-words') { @$words = @{$parm_ref->{$key}}; } elsif ($key eq '-exceptions') { $exceptions = $parm_ref->{$key}; } elsif ($key eq '-locale') { $locale = $parm_ref->{$key}; } else { croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n"); } } local( $_ ); foreach (@$words) { # Flatten case $_ = lc $_; # Check against exceptions list if (exists $exceptions->{$_}) { $_ = $exceptions->{$_}; next; } # Check against cache of stemmed words my $original_word = $_; if ($Stem_Caching && exists $Stem_Cache->{$original_word}) { $_ = $Stem_Cache->{$original_word}; next; } $_ = stem_word($_); $Stem_Cache->{$original_word} = $_ if $Stem_Caching; } $Stem_Cache = {} if ($Stem_Caching < 2); return $words; } sub stem_word { my $word = lc shift; # Check against cache of stemmed words if ($Stem_Caching && exists $Stem_Cache->{$word}) { return $Stem_Cache->{$word}; } my ($start, $RV) = $word =~ /$RVRE/; return $word unless $RV; # Step 1 unless ($RV =~ s/$PERFECTIVEGROUND//) { $RV =~ s/$REFLEXIVE//; if ($RV =~ s/$ADJECTIVE//) { $RV =~ s/$PARTICIPLE//; } else { $RV =~ s/$NOUN// unless $RV =~ s/$VERB//; } } # Step 2 $RV =~ s/É$//; # Step 3 $RV =~ s/ÏÓÔØ?$// if $RV =~ /$DERIVATIONAL/; # Step 4 unless ($RV =~ s/Ø$//) { $RV =~ s/ÅÊÛÅ?//; $RV =~ s/ÎÎ$/Î/; } return $start.$RV; } sub stem_caching { my $parm_ref; if (ref $_[0]) { $parm_ref = shift; } else { $parm_ref = { @_ }; } my $caching_level = $parm_ref->{-level}; if (defined $caching_level) { if ($caching_level !~ m/^[012]$/) { croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value"); } $Stem_Caching = $caching_level; } return $Stem_Caching; } sub clear_stem_cache { $Stem_Cache = {}; } 1; __END__