Lingua::Stem::Ru - Porter's stemming algorithm for Russian (KOI8-R only)


Lingua-Stem-Ru documentation Contained in the Lingua-Stem-Ru distribution.

Index


Code Index:

NAME

Top

Lingua::Stem::Ru - Porter's stemming algorithm for Russian (KOI8-R only)

SYNOPSIS

Top

    use Lingua::Stem::Ru;
    my $stems = Lingua::Stem::Ru::stem({ -words => $word_list_reference,
                                         -locale => 'ru',
                                         -exceptions => $exceptions_hash,
                                      });

    my $stem = Lingua::Stem::Ru::stem_word( $word );

DESCRIPTION

Top

This module applies the Porter Stemming Algorithm to its parameters, returning the stemmed words.

The algorithm is implemented exactly as described in:

    http://snowball.tartarus.org/russian/stemmer.html

The code is carefully crafted to work in conjunction with the Lingua::Stem module by Benjamin Franz. This stemmer is also based on the work of Aldo Capini, see Lingua::Stem::It.

METHODS

Top

stem({ -words => \@words, -locale => 'ru', -exceptions => \%exceptions });

Stems a list of passed words. Returns an anonymous list reference to the stemmed words.

Example:

  my $stemmed_words = Lingua::Stem::Ru::stem({ -words => \@words,
                                              -locale => 'ru',
                                          -exceptions => \%exceptions,
                          });

stem_word( $word );

Stems a single word and returns the stem directly.

Example:

  my $stem = Lingua::Stem::Ru::stem_word( $word );

stem_caching({ -level => 0|1|2 });

Sets the level of stem caching.

'0' means 'no caching'. This is the default level.

'1' means 'cache per run'. This caches stemming results during a single call to 'stem'.

'2' means 'cache indefinitely'. This caches stemming results until either the process exits or the 'clear_stem_cache' method is called.

clear_stem_cache;

Clears the cache of stemmed words

EXPORT

None by default.

HISTORY

Top

AUTHOR

Top

Aleksandr Guidrevitch <pillgrim@mail.ru>

SEE ALSO

Top

 Lingua::Stem

COPYRIGHT

Top


Lingua-Stem-Ru documentation Contained in the Lingua-Stem-Ru distribution.
package Lingua::Stem::Ru;

use strict;


use strict;
use Exporter;
use Carp;
use vars qw (@ISA @EXPORT_OK @EXPORT %EXPORT_TAGS $VERSION);
BEGIN {
    @ISA         = qw (Exporter);
    @EXPORT      = ();
    @EXPORT_OK   = qw (stem stem_word clear_stem_cache stem_caching);
    %EXPORT_TAGS = ();
}
$VERSION = "0.01";

my $Stem_Caching  = 0;
my $Stem_Cache    = {};

my $VOWEL        = qr/ÁÅÉÏÕÙÜÀÑ/;
my $PERFECTIVEGROUND = qr/((É×|É×ÛÉ|É×ÛÉÓØ|Ù×|Ù×ÛÉ|Ù×ÛÉÓØ)|((?<=[ÁÑ])(×|×ÛÉ|×ÛÉÓØ)))$/;
my $REFLEXIVE    = qr/(Ó[ÑØ])$/;
my $ADJECTIVE    = qr/(ÅÅ|ÉÅ|ÙÅ|ÏÅ|ÉÍÉ|ÙÍÉ|ÅÊ|ÉÊ|ÙÊ|ÏÊ|ÅÍ|ÉÍ|ÙÍ|ÏÍ|ÅÇÏ|ÏÇÏ|ÅÍÕ|ÏÍÕ|ÉÈ|ÙÈ|ÕÀ|ÀÀ|ÁÑ|ÑÑ|ÏÀ|ÅÀ)$/;
my $PARTICIPLE   = qr/((É×Û|Ù×Û|ÕÀÝ)|((?<=[ÁÑ])(ÅÍ|ÎÎ|×Û|ÀÝ|Ý)))$/;
my $VERB         = qr/((ÉÌÁ|ÙÌÁ|ÅÎÁ|ÅÊÔÅ|ÕÊÔÅ|ÉÔÅ|ÉÌÉ|ÙÌÉ|ÅÊ|ÕÊ|ÉÌ|ÙÌ|ÉÍ|ÙÍ|ÅÎ|ÉÌÏ|ÙÌÏ|ÅÎÏ|ÑÔ|ÕÅÔ|ÕÀÔ|ÉÔ|ÙÔ|ÅÎÙ|ÉÔØ|ÙÔØ|ÉÛØ|ÕÀ|À)|((?<=[ÁÑ])(ÌÁ|ÎÁ|ÅÔÅ|ÊÔÅ|ÌÉ|Ê|Ì|ÅÍ|Î|ÌÏ|ÎÏ|ÅÔ|ÀÔ|ÎÙ|ÔØ|ÅÛØ|ÎÎÏ)))$/;
my $NOUN         = qr/(Á|Å×|Ï×|ÉÅ|ØÅ|Å|ÉÑÍÉ|ÑÍÉ|ÁÍÉ|ÅÉ|ÉÉ|É|ÉÅÊ|ÅÊ|ÏÊ|ÉÊ|Ê|ÉÑÍ|ÑÍ|ÉÅÍ|ÅÍ|ÁÍ|ÏÍ|Ï|Õ|ÁÈ|ÉÑÈ|ÑÈ|Ù|Ø|ÉÀ|ØÀ|À|ÉÑ|ØÑ|Ñ)$/;
my $RVRE         = qr/^(.*?[$VOWEL])(.*)$/;
my $DERIVATIONAL = qr/[^$VOWEL][$VOWEL]+[^$VOWEL]+[$VOWEL].*(?<=Ï)ÓÔØ?$/;

sub stem {
    return [] if ($#_ == -1);
    my $parm_ref;
    if (ref $_[0]) {
        $parm_ref = shift;
    } else {
        $parm_ref = { @_ };
    }

    my $words      = [];
    my $locale     = 'ru';
    my $exceptions = {};
    foreach (keys %$parm_ref) {
        my $key = lc ($_);
        if ($key eq '-words') {
            @$words = @{$parm_ref->{$key}};
        } elsif ($key eq '-exceptions') {
            $exceptions = $parm_ref->{$key};
        } elsif ($key eq '-locale') {
            $locale = $parm_ref->{$key};
        } else {
            croak (__PACKAGE__ . "::stem() - Unknown parameter '$key' with value '$parm_ref->{$key}'\n");
        }
    }

    local( $_ );
    foreach (@$words) {
        # Flatten case
        $_ = lc $_;

        # Check against exceptions list
        if (exists $exceptions->{$_}) {
            $_ = $exceptions->{$_};
            next;
        }

        # Check against cache of stemmed words
        my $original_word = $_;
        if ($Stem_Caching && exists $Stem_Cache->{$original_word}) {
            $_ = $Stem_Cache->{$original_word};
            next;
        }

        $_ = stem_word($_);

        $Stem_Cache->{$original_word} = $_ if $Stem_Caching;
    }
    $Stem_Cache = {} if ($Stem_Caching < 2);

    return $words;
}

sub stem_word {
    my $word = lc shift;

    # Check against cache of stemmed words
    if ($Stem_Caching && exists $Stem_Cache->{$word}) {
        return $Stem_Cache->{$word};
    }

     my ($start, $RV) = $word =~ /$RVRE/;
     return $word unless $RV;

     # Step 1
     unless ($RV =~ s/$PERFECTIVEGROUND//) {
         $RV =~ s/$REFLEXIVE//;

         if ($RV =~ s/$ADJECTIVE//) {
             $RV =~ s/$PARTICIPLE//;
         } else {
             $RV =~ s/$NOUN// unless $RV =~ s/$VERB//;
         }
     }

     # Step 2
     $RV =~ s/É$//;

     # Step 3
     $RV =~ s/ÏÓÔØ?$// if $RV =~ /$DERIVATIONAL/;

     # Step 4
     unless ($RV =~ s/Ø$//) {
         $RV =~ s/ÅÊÛÅ?//;
         $RV =~ s/ÎÎ$/Î/;	
     }

     return $start.$RV;
}

sub stem_caching {
    my $parm_ref;
    if (ref $_[0]) {
        $parm_ref = shift;
    } else {
        $parm_ref = { @_ };
    }
    my $caching_level = $parm_ref->{-level};
    if (defined $caching_level) {
        if ($caching_level !~ m/^[012]$/) {
            croak(__PACKAGE__ . "::stem_caching() - Legal values are '0','1' or '2'. '$caching_level' is not a legal value");
        }
        $Stem_Caching = $caching_level;
    }
    return $Stem_Caching;
}

sub clear_stem_cache {
    $Stem_Cache = {};
}
	

1;
__END__