| Text-Sentence-Alignment documentation | Contained in the Text-Sentence-Alignment distribution. |
Text::Sentence::Alignment - Two Sentence Alignment
Version 0.12
This Module process two sentences (i.e. terms separated by space) alignment. Now it provide two kind of alignment method, Global and Local Alignment.
use Text::Sentence::Alignment;
my $TSA = Text::Sentence::Alignment->new();
# local alignment
$TSA->is_local(1);
my ($result1,$result2) = $TSA->do_alignment($s1,$s2);
# global alignment
$TSA->is_local(0);
my ($result1,$result2) = $TSA->do_alignment($s1,$s2);
Set/get if current algorithm is local alignment
The delimeter() is used to set delimeter of word/tags.
Cheng-Lung Sung, <clsung@cpan.org>
Please report any bugs or feature requests to
bug-text-sentence-alignment@rt.cpan.org, or through the web interface at
http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Text-Sentence-Alignment.
I will be notified, and then you'll automatically be notified of progress on
your bug as I make changes.
Copyright 2005 - 2007 Cheng-Lung Sung, All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
| Text-Sentence-Alignment documentation | Contained in the Text-Sentence-Alignment distribution. |
package Text::Sentence::Alignment; use warnings; use strict; use List::Util qw(max min);
our $VERSION = '0.12';
sub new { my $class = shift; my $self = {}; $self->{IS_LOCAL} = 0; # 0 for global alignment $self->{DELIMETER} = '/'; # / for split tags $self->{TABLE} = (); # Dynamic Programming Table $self->{BEST} = (); # Best path, for local $self->{max_len} = 0; # for global $self->{SENARR1} = []; $self->{SENARR2} = []; $self->{has_tag} = 0; # if we need to do hierarchy match, # e.g. POS, semantic tags bless($self, $class); return($self); }
sub is_local { my $self = shift; if (@_) { $self->{IS_LOCAL} = shift } return $self->{IS_LOCAL} }
sub delimeter { my $self = shift; if (@_) { $self->{DELIMETER} = shift } return $self->{DELIMETER} }
sub do_alignment { my $self = shift; my $sen1 = shift; my $sen2 = shift; @{ $self->{SENARR1} } = split / /,$sen1; @{ $self->{SENARR2} } = split / /,$sen2; if ($sen1 =~ m/$self->{DELIMETER}/ or $sen2 =~ m/$self->{DELIMETER}/) { $self->{has_tag} = 1; } $self->{TABLE} = (); $self->{BEST} = (); $self->{BEST}{MAX} = 0; $self->{TABLE}{0}{0} = 0; calculate_matrix($self); # similarity_print(); return get_align_result($self); }
sub calculate_matrix { my $self = shift; my @sa1 = @{ $self->{SENARR1}}; my @sa2 = @{ $self->{SENARR2}}; if ($self->{IS_LOCAL}) { $self->{max_len}= 0; } else { $self->{max_len} = scalar(@sa1) > scalar(@sa2) ? scalar(@sa1): scalar(@sa2); # for global } my ($len_s1, $len_s2) = (0,0); # length of s1/s2 # print STDERR "max_len is ".$max_len."\n"; while ($len_s1 <= (scalar @sa1)) { while ($len_s2 <= scalar @sa2) { my ($candidate1, $candidate2, $candidate3) = ($self->{max_len},$self->{max_len},$self->{max_len}); if ($len_s1 > 0 and $len_s2 > 0) { # if match, we add 1 for local, 0 for global # else (not matched), we add -1 for local, 1 for global $candidate1 = int($self->{TABLE}{$len_s1-1}{$len_s2-1}) + ( $self->{IS_LOCAL} ? 1: -1) * ( $self->{has_tag} ? how_similar($sa1[$len_s1-1], $sa2[$len_s2-1]) : ( ( $sa1[$len_s1-1] eq $sa2[$len_s2-1] ) ? 1+(-1+$self->{IS_LOCAL}) : -1 ) ) ; } if ($len_s1 > 0) { $candidate2 = int($self->{TABLE}{$len_s1-1}{$len_s2}) + ( $self->{IS_LOCAL} ? (-1) : 1); } if ($len_s2 > 0) { $candidate3 = int($self->{TABLE}{$len_s1}{$len_s2 - 1}) + ( $self->{IS_LOCAL} ? (-1) : 1); } # print STDERR "setting ($len_s1,$len_s2)..."; # print STDERR "(".$candidate1."\t".$candidate2."\t".$candidate3.")\n"; if ($self->{IS_LOCAL}) { $self->{TABLE}{$len_s1}{$len_s2} = max ( $candidate1, $candidate2, $candidate3, 0 ) if ($len_s1 > 0 or $len_s2 > 0); $self->{BEST}{X} = $len_s1 if $self->{BEST}{MAX} <= $self->{TABLE}{$len_s1}{$len_s2}; $self->{BEST}{Y} = $len_s2 if $self->{BEST}{MAX} <= $self->{TABLE}{$len_s1}{$len_s2}; $self->{BEST}{MAX} = $self->{TABLE}{$len_s1}{$len_s2} if $self->{BEST}{MAX} <= $self->{TABLE}{$len_s1}{$len_s2}; } else { # global $self->{TABLE}{$len_s1}{$len_s2} = min ( $candidate1, $candidate2, $candidate3 ) if ($len_s1 > 0 or $len_s2 > 0); } $len_s2 +=1; } $len_s2 = 0; $len_s1 +=1; } }
sub similarity_print { my $self = shift; my @sa1 = @{ $self->{SENARR1}}; my @sa2 = @{ $self->{SENARR2}}; print STDERR "\n \t \t".join("\t",@sa2)."\n"; for my $key (sort {int($a) <=> int($b)}(keys %{$self->{TABLE}})) { print STDERR $sa1[$key-1]."\t" if $key > 0; print STDERR " \t" unless $key > 0; for my $subkey (sort {int($a) <=> int($b)} (keys %{$self->{TABLE}{$key}})) { print STDERR $self->{TABLE}{$key}{$subkey}."\t"; } print STDERR "\n"; } };
sub get_align_result { my $self = shift; my @sa1 = @{ $self->{SENARR1}}; my @sa2 = @{ $self->{SENARR2}}; my ($i, $j) = (0, 0); my (@as1, @as2); my $baseline = 0; my $first_round = 0; if ($self->{IS_LOCAL}) { $i = $self->{BEST}{X}; $j = $self->{BEST}{Y}; } else { $i = scalar @sa1; $j = scalar @sa2; } my $pi = $i + 2; my $pj = $j + 2; do { if ($first_round) { push @as1, $sa1[$i]; push @as2, $sa2[$j]; $first_round = 0; } else { if ($i == ($pi-1)) { push @as1, $sa1[$i]; } elsif ($i == $pi) { push @as1, "-"; } if ($j == ($pj-1)) { push @as2, $sa2[$j]; } elsif ($j == $pj) { push @as2, "-"; } } $pi = $i; $pj = $j; if ($self->{IS_LOCAL}) { $baseline = max($self->{TABLE}{$i-1}{$j-1},$self->{TABLE}{$i-1}{$j},$self->{TABLE}{$i}{$j-1}); } else { $baseline = min($self->{TABLE}{$i-1}{$j-1},$self->{TABLE}{$i-1}{$j},$self->{TABLE}{$i}{$j-1}); } if ($self->{TABLE}{$i-1}{$j-1} == $baseline) { $i--; $j--; } elsif ($self->{TABLE}{$i}{$j-1} == $baseline) { $j--; } elsif ($self->{TABLE}{$i-1}{$j} == $baseline) { $i--; } else { die $!; } } while ( $self->{TABLE}{$pi}{$pj} > 0 and $i >0 and $j > 0); if (!$self->{IS_LOCAL}) { push @as1, $sa1[$i]; push @as2, $sa2[$j]; } return ( join (" ",reverse @as1)."\t".join (" ",reverse @as2) ); }
1; # End of Text::Sentence::Alignment