| Plucene documentation | Contained in the Plucene distribution. |
Plucene::Analysis::Standard::StandardTokenizer - standard tokenizer
# isa Plucene::Analysis::CharTokenizer
This is the standard tokenizer.
This should be a good tokenizer for most European-language documents.
The regular expression for tokenising.
Normalise a token by removing 's and dots (.).
| Plucene documentation | Contained in the Plucene distribution. |
package Plucene::Analysis::Standard::StandardTokenizer;
use strict;
use warnings;

# Inherit from the generic character tokenizer; this package supplies
# token_re and normalize.
use base 'Plucene::Analysis::CharTokenizer';

# Building blocks for the token pattern. They are file-scoped lexicals that
# token_re and normalize below interpolate into their own patterns.
# Don't blame me, blame the Plucene people!

# One or more alphabetic characters (Unicode IsAlpha property).
my $alpha = qr/\p{IsAlpha}+/;

# Letters followed by one or more apostrophe-plus-letters groups,
# e.g. "don't", "Bob's".
my $apostrophe = qr/$alpha('$alpha)+/;

# At least two letter runs, each followed by a dot, e.g. "U.S.A.".
my $acronym = qr/$alpha\.($alpha\.)+/;

# Two letter runs joined by "&" or "@", e.g. "AT&T".
my $company = qr/$alpha(&|\@)$alpha/;

# Two or more runs of word characters joined by dots.
my $hostname = qr/\w+(\.\w+)+/;

# Word characters, "@", then something shaped like $hostname.
my $email = qr/\w+\@$hostname/;

# A single separator character: underscore, slash, dot, comma or hyphen.
my $p = qr/[_\/.,-]/;

# A run of word characters that contains at least one digit.
my $hasdigit = qr/\w*\d\w*/;

# Number-like strings: runs of word characters joined by $p separators, in
# which plain runs (\w+) and digit-bearing runs ($hasdigit) alternate. Each
# alternative is one permitted arrangement; the simplest, \w+ $p $hasdigit,
# matches e.g. "1,000". Whitespace inside the pattern is ignored (/x).
my $num = qr/\w+$p$hasdigit|$hasdigit$p\w+ |\w+($p$hasdigit$p\w+)+ |$hasdigit($p\w+$p$hasdigit)+ |\w+$p$hasdigit($p\w+$p$hasdigit)+ |$hasdigit$p\w+($p$hasdigit$p\w+)+/x;
# Return the compiled regular expression that recognises a single token.
#
# The pattern is an alternation of the file-scoped building blocks, ending
# with a plain run of word characters as the fallback. Perl tries the
# alternatives left to right and takes the first that matches at a given
# position; it does not look for the longest match.
#
# NOTE(review): because $company is tried before $email, an input such as
# "foo@example.com" is matched by $company as "foo@example" and $email is
# never reached for it. Confirm whether this ordering is intended before
# relying on e-mail addresses being kept whole.
#
# Presumably called by the Plucene::Analysis::CharTokenizer base class
# (not visible here) -- verify against that module.
sub token_re {
    my $pattern = qr/ $apostrophe | $acronym | $company | $hostname | $email | $num | \w+ /x;
    return $pattern;
}
# Normalise one token before it is handed on.
#
# In Java Lucene this work is done by StandardFilter; here it is part of
# the tokenizer:
#   * a possessive loses its trailing 's or 'S:  "Bob's"  -> "Bob"
#   * an acronym loses its dots:                 "U.S.A." -> "USA"
# Any other token is returned unchanged.
#
# Arguments: the invocant (unused) and the token text.
# Returns:   the normalised token text (undef is passed straight through).
sub normalize {
    my ($class, $token) = @_;
    return $token if !defined $token;    # nothing to normalise

    # Strip only the possessive ending. The substitution is anchored to the
    # end of the token so that an 's in the middle of a word ("O'sullivan")
    # is left alone.
    if ($token =~ /$apostrophe/) {
        $token =~ s/'s\z//i;
    }

    # Strip the dots from acronyms. The test is on $acronym rather than
    # $company: the company pattern contains no dot, so guarding on it left
    # acronyms untouched. Anchoring both ends means only a token that is an
    # acronym as a whole qualifies, not one that merely contains one.
    if ($token =~ /\A$acronym\z/) {
        $token =~ tr/.//d;
    }

    return $token;
}

1;