| Lingua-EN-Segmenter documentation | Contained in the Lingua-EN-Segmenter distribution. |
Lingua::EN::Splitter - Split text into words, paragraphs, segments, and tiles
use Lingua::EN::Splitter qw(words paragraphs paragraph_breaks
segment_breaks tiles set_tokens_per_tile);
my $text = <<EOT;
Lingua::EN::Splitter is a useful module that allows text to be split up
into words, paragraphs, segments, and tiles.
Paragraphs are by default indicated by blank lines. Known segment breaks are
indicated by a line with only the word "segment_break" in it.
segment_break
This module does not make any attempt to guess segment boundaries. For that,
see L<Lingua::EN::Segmenter::TextTiling>.
EOT
# Set the number of tokens per tile to 20 (the default)
set_tokens_per_tile(20);
my @words = words $text;
my @paragraphs = paragraphs $text;
my @paragraph_breaks = paragraph_breaks $text;
my @segment_breaks = segment_breaks $text;
my @tiles = tile words $text;
print "@words[0..3,5]"; # Prints "lingua en segmenter is useful"
print "@words[43..46,53]"; # Prints "this module does not guess"
print $paragraphs[2]; # Prints the third paragraph of the above text
print $paragraph_breaks[2]; # Prints which tile the 3rd paragraph starts on
print $segment_breaks[1]; # Prints which tile the 2nd segment starts on
print $tiles[1]; # Prints @words[20..39] filtered for stopwords
# and stemmed
# This module can also be used in an object-oriented fashion
my $splitter = new Lingua::EN::Splitter;
@words = $splitter->words $text;
See synopsis.
This module can be used in an object-oriented fashion or the routines can be exported.
David James <splice@cpan.org>
| Lingua-EN-Segmenter documentation | Contained in the Lingua-EN-Segmenter distribution. |
package Lingua::EN::Splitter;
$VERSION = 0.10; @EXPORT_OK = qw( words paragraphs breaks paragraph_breaks segment_breaks set_tokens_per_tile set_paragraph_regexp set_non_word_regexp set_locale set_stop_words ); use Math::HashSum qw(hashsum); use base 'Class::Exporter'; use Lingua::Stem; use Lingua::EN::StopWords qw(%StopWords); use strict; use Carp qw(croak); no warnings; # Create a new instance of this object sub new { my $class = shift; my $stemmer = Lingua::Stem->new; $stemmer->stem_caching({ -level=>2 }); bless { PARAGRAPH_BREAK=>qr/\n\s*(segment_break)?\s*\n/, NON_WORD_CHARACTER=>qr/\W/, TOKENS_PER_TILE=>20, STEMMER=>$stemmer, STOP_WORDS=>\%StopWords, @_ }, $class; } # Split text into words sub words { my $self = shift; my $input = lc shift; $input =~ s/$self->{PARAGRAPH_BREAK}/ /g; return [ split /$self->{NON_WORD_CHARACTER}+/, $input ]; } # Split text into paragraphs sub paragraphs { my ($self, $input) = @_; return [ grep { /\S/ and !/^segment_break$/i } split /$self->{PARAGRAPH_BREAK}/i, $input ]; } # Return a list of paragraph and segment breaks sub breaks { my $self = shift; my $input = lc shift; # Eliminate empty paragraphs at the very end $input =~ s/$self->{PARAGRAPH_BREAK}\s*\Z//; # Convert paragraph breaks to tokens $input =~ s/$self->{PARAGRAPH_BREAK}/ PNO$1 /g; my @words = split /(?:$self->{NON_WORD_CHARACTER})+/, $input; my (@breaks,%segment_breaks,$num_words); foreach (@words) { if (/^PNO(segment_break)?$/) { my $segment_break = $1; $segment_break and $segment_breaks{scalar @breaks}++; push @breaks, $num_words / $self->{TOKENS_PER_TILE}; } else { $num_words++; } } return (\@breaks,\%segment_breaks); } # Return a list of paragraph breaks sub paragraph_breaks { my $self = shift; return ($self->breaks(@_))[0]; } # Return a list of real segment breaks sub segment_breaks { my $self = shift; return ($self->breaks(@_))[1]; } # Convert a list of words into tiles sub tile { my $self = shift; my $words = ref $_[0] ? shift : \@_; my @tiles; while (@$words) { push @tiles, { hashsum map { @{$self->{STEMMER}->stem($_)}, 1 } grep { !exists $self->{STOP_WORDS}->{$_} } splice @$words, 0, $self->{TOKENS_PER_TILE} }; } return \@tiles; } ######################################################### # Mutator methods ######################################################### sub set_tokens_per_tile { my $self = shift; $self->{TOKENS_PER_TILE} = shift; } sub set_paragraph_regexp { my $self = shift; $self->{PARAGRAPH_BREAK} = shift; } sub set_non_word_regexp { my $self = shift; $self->{NON_WORD_CHARACTER} = shift; } sub set_locale { my $self = shift; $self->{STEMMER}->set_locale(shift); } sub set_stop_words { my $self = shift; $self->{STOP_WORDS} = shift; } 1;