Plucene::Analysis::Standard::StandardTokenizer - standard tokenizer


Plucene documentation Contained in the Plucene distribution.

Index


Code Index:

NAME

Top

Plucene::Analysis::Standard::StandardTokenizer - standard tokenizer

SYNOPSIS

Top

	# isa Plucene::Analysis::CharTokenizer

DESCRIPTION

Top

This is the standard tokenizer.

This should be a good tokenizer for most European-language documents.

METHODS

Top

token_re

The regular expression for tokenising.

normalize

Normalise a token: strip the possessive 's and remove dots (.).


Plucene documentation Contained in the Plucene distribution.
package Plucene::Analysis::Standard::StandardTokenizer;

use strict;
use warnings;

use base 'Plucene::Analysis::CharTokenizer';

# Don't blame me, blame the Plucene people!
#
# Building blocks of the token grammar. They are private to this file and
# shared by token_re() and normalize(). Every inner group is non-capturing
# so the patterns add no capture groups when embedded in a larger regex.
my $alpha      = qr/\p{IsAlpha}+/;                # a run of letters
my $apostrophe = qr/$alpha(?:'$alpha)+/;          # O'Reilly, dog's
my $acronym    = qr/$alpha\.(?:$alpha\.)+/;       # I.B.M. (note the final dot)
my $company    = qr/$alpha(?:&|\@)$alpha/;        # AT&T, Excite@Home
my $hostname   = qr/\w+(?:\.\w+)+/;               # www.example.com
my $email      = qr/\w+\@$hostname/;              # user@example.com
my $p          = qr/[_\/.,-]/;                    # separators allowed in numbers
my $hasdigit   = qr/\w*\d\w*/;                    # word run with at least one digit

# Number-like tokens: word runs joined by a separator, where runs containing
# a digit alternate with arbitrary runs (serial numbers, dates, decimals...).
# NOTE(review): alternation is ordered, so the two-segment forms listed first
# win whenever they match; the order is kept exactly as in the original.
my $num = qr/
	  \w+ $p $hasdigit
	| $hasdigit $p \w+
	| \w+ (?: $p $hasdigit $p \w+ )+
	| $hasdigit (?: $p \w+ $p $hasdigit )+
	| \w+ $p $hasdigit (?: $p \w+ $p $hasdigit )+
	| $hasdigit $p \w+ (?: $p $hasdigit $p \w+ )+
/x;

# The complete token pattern, compiled once.
#
# Perl tries alternatives left to right and takes the first that matches (it
# does not look for the longest match), so the order matters:
#   * $email is tried before $company, otherwise "foo@bar.com" is cut short
#     as the company-style token "foo@bar";
#   * $acronym is only accepted when no word character follows, otherwise
#     "www.example.com" is cut short as the acronym "www.example.".
my $token_re = qr/
	  $apostrophe
	| $email
	| $acronym (?!\w)
	| $company
	| $hostname
	| $num
	| \w+
/x;

# token_re()
#
# Returns the compiled regular expression that matches a single token.
# Takes no arguments besides the invocant, which is ignored.
sub token_re {
	return $token_re;
}

# normalize($token)
#
# Returns the normalised form of a token:
#   * a token that is entirely an apostrophe word loses a trailing 's / 'S
#     ("dog's" -> "dog"); an 's elsewhere in the word is left alone
#     ("o'sullivan" stays as it is);
#   * a token that is entirely an acronym loses its dots ("I.B.M." -> "IBM").
# Any other token, including undef, is returned unchanged.
sub normalize {
	my ($class, $token) = @_;
	return $token unless defined $token;

	# These are in the StandardFilter in Java, but Perl is not Java.
	# Thankfully.
	if ($token =~ /\A$apostrophe\z/) {
		$token =~ s/'s\z//i;
	}
	elsif ($token =~ /\A$acronym\z/) {
		$token =~ tr/.//d;
	}
	return $token;
}

1;