Algorithm::NGram - Algorithm::NGram documentation


Algorithm-NGram documentation Contained in the Algorithm-NGram distribution.

Index


Code Index:

NAME

Top

Algorithm::NGram

SYNPOSIS

Top

    use Algorithm::NGram;
    my $ng = Algorithm::NGram->new(ngram_width => 3); # use trigrams

    # feed in text
    $ng->add_text($text1); # analyze $text1
    $ng->add_text($text2); # analyze $text2

    # feed in arbitrary sequence of tokens
    $ng->add_start_token;
    $ng->add_tokens(qw/token1 token2 token3/);
    $ng->add_end_token;

    my $output = $ng->generate_text;

DESCRIPTION

Top

This is a module for analyzing token sequences with n-grams. You can use it to parse a block of text, or feed in your own tokens. It can generate new sequences of tokens from what has been fed in.

EXPORT

Top

None.

METHODS

Top

new

Create a new n-gram analyzer instance.

Options:

ngram_width

This is the "window size" of how many tokens the analyzer will keep track of. A ngram_width of two will make a bigram, a ngram_width of three will make a trigram, etc...

ngram_width

Returns token window size (e.g. the "n" in n-gram)

token_table

Returns n-gram table

add_text

Splits a block of text up by whitespace and processes each word as a token. Automatically calls add_start_token() at the beginning of the text and add_end_token() at the end.

add_tokens

Adds an arbitrary list of tokens.

add_start_token

Adds the "start token." This is useful because you often will want to mark the beginnings and ends of a token sequence so that when generating your output the generator will know what tokens start a sequence and when to end.

add_end_token

Adds the "end token." See add_start_token().

analyze

Generates an n-gram frequency table. Returns a hashref of N => tokens => count, where N is the number of tokens (will be from 2 to ngram_width). You will not normally need to call this unless you want to get the n-gram frequency table.

generate_text

After feeding in text tokens, this will return a new block of text based on whatever text was added.

generate

Generates a new sequence of tokens based on whatever tokens have previously been fed in.

next_tok

Given a list of tokens, will pick a possible token to come next.

token_lookup

Returns a hashref of the counts of tokens that follow a sequence of tokens.

token_key

Serializes a sequence of tokens for use as a key into the n-gram table. You will not normally need to call this.

serialize

Returns the tokens and n-gram (if one has been generated) in a string

deserialize($string)

Deserializes a string and returns an Algorithm::NGram instance

SEE ALSO

Top

Text::Ngram, Text::Ngrams

AUTHOR

Top

Mischa Spiegelmock, <mspiegelmock@gmail.com>

COPYRIGHT AND LICENSE

Top


Algorithm-NGram documentation Contained in the Algorithm-NGram distribution.
package Algorithm::NGram;
use strict;
use warnings;

use Carp qw (croak);
use Class::Accessor::Fast;
use List::Util qw (shuffle);
use Storable;

use base qw/Class::Accessor::Fast/;
__PACKAGE__->mk_accessors(qw/ngram_width token_table tokens dirty/);

use constant {
    START_TOK => ':STARTTOK:',
    END_TOK => ':ENDTOK:',
};

our $VERSION = 0.5;

sub new {
    my ($class, %opts) = @_;

    # trigram by default
    my $ngram_width = delete $opts{ngram_width} || 3;

    my $token_table = delete $opts{token_table} || {};
    my $tokens = delete $opts{tokens} || [];

    my $self = {
        ngram_width => $ngram_width,
        token_table => $token_table,
        tokens => $tokens,
    };

    bless $self, $class;
    $self->dirty(1);

    return $self;
}

# process a block of text, auto-tokenizing it
sub add_text {
    my ($self, $text) = @_;

    $self->add_start_token;

    # tokenize text
    foreach my $tok (split(/ /, $text)) {
        $tok =~ s/ +//g; # remove spaces

        next unless $tok;

        $self->add_token($tok);
    }

    $self->add_end_token;
}

*add_token = \&add_tokens;
sub add_tokens {
    my ($self, @tokens) = @_;
    push @{$self->{tokens}}, @tokens;
    $self->dirty(1);
}

sub add_start_token {
    my ($self) = @_;
    $self->add_token(START_TOK);
}

sub add_end_token {
    my ($self) = @_;
    $self->add_token(END_TOK);
}

sub analyze {
    my $self = shift;

    $self->{token_table} = {};

    my @all_tokens = @{$self->tokens};

    for (my $i = 1; $i <= $self->ngram_width; $i++) {
        for (my $tok_idx = 0; $tok_idx < @all_tokens; $tok_idx++) {
            my $tok_idx_end = $tok_idx + $i - 1;
            $tok_idx_end = @all_tokens if $tok_idx_end > @all_tokens;

            # get tokens
            my @tokens = @all_tokens[$tok_idx ... $tok_idx_end];

            # get the token that follows this ngram
            my $next_tok = $all_tokens[$tok_idx_end + 1];
            next unless $next_tok && @tokens;

            # don't care about what follows END_TOK
            next if $tokens[0] eq END_TOK;

            my $token_key = $self->token_key(@tokens);

            # increment the count of $next_tok after this ngram
            $self->{token_table}->{$i}->{$token_key}->{$next_tok}++;
        }
    }

    $self->dirty(0);

    return $self->{token_table};
}

sub generate_text {
    my ($self, %opts) = @_;

    my @toks = $self->generate(%opts);
    return join(' ', @toks);
}

sub generate {
    my ($self, %opts) = @_;

    # update n-gram if things have changed
    $self->analyze
        if $self->dirty && ! $opts{no_analyze};

    my @ret_toks;
    my $tok = START_TOK;

    my @cur_toks = ();

    do {
        push @ret_toks, $tok if $tok ne START_TOK;

        push @cur_toks, $tok;
        shift @cur_toks while @cur_toks > $self->ngram_width;

        $tok = $self->next_tok(@cur_toks);
    } while $tok && $tok ne END_TOK;

    return @ret_toks;
}

sub next_tok {
    my ($self, @toks) = @_;

    return undef unless %{$self->token_table};

    my $tok_next = $self->token_lookup(@toks);
    croak "No next tokens defined for tokens " . $self->token_key(@toks)
        unless defined $tok_next;

    my @possible_toks;

    while (my ($next_tok, $count) = each %$tok_next) {
        push @possible_toks, $next_tok for 1 .. $count;
    }

    @possible_toks = shuffle @possible_toks;
    my $tok = shift @possible_toks;

    return $tok;
}

sub token_lookup {
    my ($self, @toks) = @_;

    my $tok_count = @toks;
    croak "token_lookup passed more than ngram_width tokens"
        unless $tok_count <= $self->ngram_width;

    my $tok_key = $self->token_key(@toks);

    return $self->{token_table}->{$tok_count}->{$tok_key} || undef;
}

sub token_key {
    my ($self, @toks) = @_;
    return join('-', @toks);
}

sub serialize {
    my ($self) = @_;

    my $save = {
        ngram_width => $self->ngram_width,
        tokens      => $self->tokens,
        ngram       => $self->token_table,
    };

    return Storable::nfreeze($save);
}

sub deserialize {
    my ($class, $str) = @_;
    croak "Empty string passed to deserialize" unless $str;

    my $save = Storable::thaw($str)
        or croak "Invalid serialized data passed to deserialize";

    my $tokens = $save->{tokens} || [];
    my $ngram  = $save->{ngram};
    my $width  = $save->{ngram_width} or croak "No n-gram width saved";

    my $instance = $class->new(
                               ngram_width => $width,
                               token_table => $ngram,
                               tokens => $tokens,
                               );

    $instance->dirty(0) if $ngram;

    return $instance;
}

1;

__END__