Plucene::Plugin::Analyzer::SnowballAnalyzer - Stemmed analyzer with Lingua::Stem::Snowball and Lingua::StopWords


Plucene-Plugin-Analyzer-SnowballAnalyzer documentation Contained in the Plucene-Plugin-Analyzer-SnowballAnalyzer distribution.

Index


Code Index:

NAME

Top

Plucene::Plugin::Analyzer::SnowballAnalyzer - Stemmed analyzer with Lingua::Stem::Snowball and Lingua::StopWords

DESCRIPTION

Top

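Filters the output of StandardTokenizer through SnowballFilter (stemming with Lingua::Stem::Snowball) and then through StopFilter, using the stop word list supplied by Lingua::StopWords.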

Change $Plucene::Plugin::Analyzer::SnowballAnalyzer::LANG to the language of your choice (see the Lingua::Stem::Snowball documentation for all available languages).

EXAMPLE

Top

  #!/usr/bin/perl

  use strict;
  use Plucene;
  use Plucene::Index::Writer;
  use Plucene::Plugin::Analyzer::SnowballAnalyzer;
  use Plucene::Search::IndexSearcher;
  use Plucene::QueryParser;

  $Plucene::Plugin::Analyzer::SnowballAnalyzer::LANG = 'fr';

  my $db = "plucene_index";

  my $writer = Plucene::Index::Writer->new($db, Plucene::Plugin::Analyzer::SnowballAnalyzer->new(), 1);

  my $doc = Plucene::Document->new();
  $doc->add(Plucene::Document::Field->Keyword(filename => 'test2.html'));
  $doc->add(Plucene::Document::Field->Text(title => 'Another file title'));
  $doc->add(Plucene::Document::Field->UnStored(content => 'Nothing HERE. la folie...'));
  $writer->add_document($doc);

  $doc = Plucene::Document->new();
  $doc->add(Plucene::Document::Field->Keyword(filename => 'test2.html'));
  $doc->add(Plucene::Document::Field->Text(title => 'Fichier en français'));
  $doc->add(Plucene::Document::Field->UnStored(content => 'Vive le français! Je t\'aime tendrement...'));
  $writer->add_document($doc);

  $writer->optimize;

  my $searcher = Plucene::Search::IndexSearcher->new($db);
  my $parser = Plucene::QueryParser->new({
  	analyzer => Plucene::Plugin::Analyzer::SnowballAnalyzer->new(),
  	default => 'content',
  });
  my $parsed = $parser->parse("la +france +TENDRE title:fichier");

  my @docs;
  my $hc = Plucene::Search::HitCollector->new(
  	collect => sub {
  		my ($self, $doc, $score) = @_;
  		my $res = eval { $searcher->doc($doc); };
  		push @docs, $res if $res;
  	},
  );

  $searcher->search_hc($parsed, $hc);
  my @results = map {{
  	filename => $_->get('filename')->string,
  	title => $_->get('title')->string,
  }} @docs;

  print "Results:\n";
  foreach my $result (@results) {
  	print "\t$result->{title} ($result->{filename})\n";
  }

AUTHOR

Top

Fabien POTENCIER, fabpot@cpan.org

LICENSE

Top

You may distribute this code under the same terms as Plucene itself.


Plucene-Plugin-Analyzer-SnowballAnalyzer documentation Contained in the Plucene-Plugin-Analyzer-SnowballAnalyzer distribution.

# Analyzer plugin for Plucene that combines the standard tokenizer with a
# Snowball stemming filter and a stop-word filter.
package Plucene::Plugin::Analyzer::SnowballAnalyzer;

# Inherit the analyzer interface from Plucene's base analyzer class.
use base 'Plucene::Analysis::Analyzer';
use 5.006;
use strict;
use warnings;
our $VERSION = '1.1';
# Building blocks chained together by tokenstream().
use Plucene::Analysis::Standard::StandardTokenizer;
use Plucene::Plugin::Analyzer::SnowballFilter;
use Plucene::Analysis::StopFilter;
# Source of the per-language stop word lists.
use Lingua::StopWords;

# Language code used to look up the stop word list; defaults to English.
# Callers change it by assigning to this package variable before use.
# NOTE(review): presumably SnowballFilter reads this variable as well to pick
# the stemmer language -- confirm against that module.
our $LANG = 'en';

# Build the token stream for this analyzer.
#
# Every argument after the invocant is passed through unchanged to
# Plucene::Analysis::Standard::StandardTokenizer->new. The tokenizer is wrapped
# in a SnowballFilter, which is in turn wrapped in a StopFilter whose stop list
# is the set of words Lingua::StopWords provides for the language in $LANG.
#
# Returns the result of Plucene::Analysis::StopFilter->new.
sub tokenstream {
	my $self = shift;

	# Lingua::StopWords does not know every language; when it has no list
	# for $LANG, fall back to an empty stop list instead of dying on an
	# undefined hash reference under strict.
	my $stopwords_for = Lingua::StopWords::getStopWords($LANG) || {};
	my @stopwords     = keys %{$stopwords_for};

	# NOTE(review): the stop list is applied to tokens that have already
	# passed through SnowballFilter; confirm that the unmodified stop words
	# still match what that filter emits.
	return Plucene::Analysis::StopFilter->new({
		input => Plucene::Plugin::Analyzer::SnowballFilter->new({
			input => Plucene::Analysis::Standard::StandardTokenizer->new(@_)
		}),
		stoplist => \@stopwords,
	});
}