Text::Sentence::Alignment - Two Sentence Alignment


Text-Sentence-Alignment documentation Contained in the Text-Sentence-Alignment distribution.

Index


Code Index:

NAME

Top

Text::Sentence::Alignment - Two Sentence Alignment

VERSION

Top

Version 0.12

SYNOPSIS

Top

This Module process two sentences (i.e. terms separated by space) alignment. Now it provide two kind of alignment method, Global and Local Alignment.

    use Text::Sentence::Alignment;

    my $TSA = Text::Sentence::Alignment->new();

    # local alignment
    $TSA->is_local(1);
    my ($result1,$result2) = $TSA->do_alignment($s1,$s2); 

    # global alignment
    $TSA->is_local(0);
    my ($result1,$result2) = $TSA->do_alignment($s1,$s2); 

FUNCTIONS

Top

new

is_local

Set/get if current algorithm is local alignment

delimeter

The delimeter() is used to set delimeter of word/tags.

do_alignment

calculate_matrix

similarity_print

get_align_result

AUTHOR

Top

Cheng-Lung Sung, <clsung@cpan.org>

BUGS

Top

Please report any bugs or feature requests to bug-text-sentence-alignment@rt.cpan.org, or through the web interface at http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Text-Sentence-Alignment. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.

ACKNOWLEDGEMENTS

Top

COPYRIGHT & LICENSE

Top


Text-Sentence-Alignment documentation Contained in the Text-Sentence-Alignment distribution.
package Text::Sentence::Alignment;

use warnings;
use strict;
use List::Util qw(max min);

our $VERSION = '0.12';

sub new {
    my $class = shift;
    my $self = {};
    $self->{IS_LOCAL} = 0; # 0 for global alignment
    $self->{DELIMETER} = '/'; # / for split tags
    $self->{TABLE} = (); # Dynamic Programming Table
    $self->{BEST} = (); # Best path, for local
    $self->{max_len} = 0; # for global
    $self->{SENARR1} = [];
    $self->{SENARR2} = [];
    $self->{has_tag} = 0; # if we need to do hierarchy match, 
			  # e.g. POS, semantic tags
    bless($self, $class);
    return($self);
}

sub is_local {
    my $self = shift;
    if (@_) { $self->{IS_LOCAL} = shift }
    return $self->{IS_LOCAL}
}

sub delimeter {
    my $self = shift;
    if (@_) { $self->{DELIMETER} = shift }
    return $self->{DELIMETER}
}

sub do_alignment {
    my $self = shift;
    my $sen1 = shift;
    my $sen2 = shift;
    @{ $self->{SENARR1} } = split / /,$sen1;
    @{ $self->{SENARR2} } = split / /,$sen2;
    if ($sen1 =~ m/$self->{DELIMETER}/ or 
	$sen2 =~ m/$self->{DELIMETER}/) {
	$self->{has_tag} = 1;
    }
    $self->{TABLE} = ();
    $self->{BEST} = ();
    $self->{BEST}{MAX} = 0;
    $self->{TABLE}{0}{0} = 0;
    calculate_matrix($self);
#    similarity_print();
    return get_align_result($self);
}

sub calculate_matrix {
    my $self = shift;
    my @sa1 = @{ $self->{SENARR1}};
    my @sa2 = @{ $self->{SENARR2}};
    if ($self->{IS_LOCAL}) {
	$self->{max_len}= 0;
    } else {
	$self->{max_len} = scalar(@sa1) > scalar(@sa2) ? scalar(@sa1): scalar(@sa2); # for global
    }
    my ($len_s1, $len_s2) = (0,0); # length of s1/s2

#    print STDERR "max_len is ".$max_len."\n";
    while ($len_s1 <= (scalar @sa1)) {
	while ($len_s2 <= scalar @sa2) {
	    my ($candidate1, $candidate2, $candidate3) = ($self->{max_len},$self->{max_len},$self->{max_len});
	    if ($len_s1 > 0 and $len_s2 > 0) {
		# if match, we add 1 for local, 0 for global
		# else (not matched), we add -1 for local, 1 for global
		$candidate1 = int($self->{TABLE}{$len_s1-1}{$len_s2-1}) + 
		    ( $self->{IS_LOCAL} ? 1: -1) *
			( $self->{has_tag} ? 
			    how_similar($sa1[$len_s1-1], $sa2[$len_s2-1]) :
			    ( ( $sa1[$len_s1-1] eq $sa2[$len_s2-1] ) ? 
				1+(-1+$self->{IS_LOCAL}) : -1 )
			)
		;
	    }
	    if ($len_s1 > 0) {
		$candidate2 = int($self->{TABLE}{$len_s1-1}{$len_s2}) + 
		    ( $self->{IS_LOCAL} ? (-1) : 1);
	    }
	    if ($len_s2 > 0) {
		$candidate3 = int($self->{TABLE}{$len_s1}{$len_s2 - 1})  + 
		    ( $self->{IS_LOCAL} ? (-1) : 1);
	    }
#	    print STDERR "setting ($len_s1,$len_s2)...";
#	    print STDERR "(".$candidate1."\t".$candidate2."\t".$candidate3.")\n";
	    if ($self->{IS_LOCAL}) {
		$self->{TABLE}{$len_s1}{$len_s2} = max (
		    $candidate1, $candidate2, $candidate3, 0
		) if ($len_s1 > 0 or $len_s2 > 0);
		$self->{BEST}{X} = $len_s1 if $self->{BEST}{MAX} <= $self->{TABLE}{$len_s1}{$len_s2};
		$self->{BEST}{Y} = $len_s2 if $self->{BEST}{MAX} <= $self->{TABLE}{$len_s1}{$len_s2};
		$self->{BEST}{MAX} = $self->{TABLE}{$len_s1}{$len_s2} if $self->{BEST}{MAX} <= $self->{TABLE}{$len_s1}{$len_s2};
	    } else { # global
		$self->{TABLE}{$len_s1}{$len_s2} = min (
		    $candidate1, $candidate2, $candidate3
		) if ($len_s1 > 0 or $len_s2 > 0);
	    }
	    $len_s2 +=1;
	}
	$len_s2 = 0;
	$len_s1 +=1;
    }
}

sub similarity_print {
    my $self = shift;
    my @sa1 = @{ $self->{SENARR1}};
    my @sa2 = @{ $self->{SENARR2}};
    print STDERR "\n \t \t".join("\t",@sa2)."\n";
    for my $key (sort {int($a) <=> int($b)}(keys %{$self->{TABLE}})) {
	print STDERR $sa1[$key-1]."\t" if $key > 0;
	print STDERR " \t" unless $key > 0;
	for my $subkey (sort {int($a) <=> int($b)} (keys %{$self->{TABLE}{$key}})) {
	    print STDERR $self->{TABLE}{$key}{$subkey}."\t";
	}
	print STDERR "\n";
    }
};

sub get_align_result {
    my $self = shift;
    my @sa1 = @{ $self->{SENARR1}};
    my @sa2 = @{ $self->{SENARR2}};
    my ($i, $j) = (0, 0);
    my (@as1, @as2);
    my $baseline = 0;
    my $first_round = 0;
    if ($self->{IS_LOCAL}) {
	$i = $self->{BEST}{X};
	$j = $self->{BEST}{Y};
    } else {
	$i = scalar @sa1;
	$j = scalar @sa2;
    }
    my $pi = $i + 2;
    my $pj = $j + 2;
    do { 
	if ($first_round) {
	    push @as1, $sa1[$i];
	    push @as2, $sa2[$j];
	    $first_round = 0;
	} else {
	    if ($i == ($pi-1)) {
		push @as1, $sa1[$i];
	    }
	    elsif ($i == $pi) {
		push @as1, "-";
	    }
	    if ($j == ($pj-1)) {
		push @as2, $sa2[$j];
	    }
	    elsif ($j == $pj) {
		push @as2, "-";
	    }
	}
	$pi = $i;
	$pj = $j;
	if ($self->{IS_LOCAL}) { 
	    $baseline = max($self->{TABLE}{$i-1}{$j-1},$self->{TABLE}{$i-1}{$j},$self->{TABLE}{$i}{$j-1});
	} else {
	    $baseline = min($self->{TABLE}{$i-1}{$j-1},$self->{TABLE}{$i-1}{$j},$self->{TABLE}{$i}{$j-1});
	}
	if ($self->{TABLE}{$i-1}{$j-1} == $baseline) {
	    $i--;
	    $j--;
	} elsif ($self->{TABLE}{$i}{$j-1} == $baseline) {
	    $j--;
	} elsif ($self->{TABLE}{$i-1}{$j} == $baseline) {
	    $i--;
	} else {
	    die $!;
	}
    } while ( $self->{TABLE}{$pi}{$pj} > 0 and $i >0 and $j > 0);
    if (!$self->{IS_LOCAL}) {
	push @as1, $sa1[$i];
	push @as2, $sa2[$j];
    }
    return ( join (" ",reverse @as1)."\t".join (" ",reverse @as2) );
}

1; # End of Text::Sentence::Alignment