/usr/local/CPAN/Lingua-AlignmentSet/Lingua/AlSetLib.pm
package Lingua::AlSetLib;
$VERSION=1.1;
use strict;
use Dumpvalue;
# input: array of integers
# output: array of contiguous integer arrays
sub getContiguousSequences {
my $array=shift;
my $dumper = new Dumpvalue;
my @sortedArray = sort {$a <=> $b} @$array;
# print "sorted ARRay:\n";
# print $dumper->dumpValue(\@sortedArray);
my $sequences=[];
my $numSeq=0;
if (@$array > 0){
my $former = $sortedArray[0];
push @{$sequences->[$numSeq]},$former;
for (my $i=1;$i<@sortedArray;$i++){
if ($sortedArray[$i]== ($former+1)){
push @{$sequences->[$numSeq]},$sortedArray[$i];
}else{
$numSeq++;
push @{$sequences->[$numSeq]},$sortedArray[$i];
}
$former=$sortedArray[$i];
}
}
# print "SEQUENCES (best $longuestSequence):\n";
# print $dumper->dumpValue($sequences);
return $sequences;
}
sub getLonguestContiguousSequence {
my $array=shift;
my $dumper = new Dumpvalue;
my @sortedArray = sort {$a <=> $b} @$array;
# print "sorted ARRay:\n";
# print $dumper->dumpValue(\@sortedArray);
my $sequences=[];
my ($currentLength,$greatestLength,$longuestSequence,$numSeq)=(0,0,0,0);
if (@$array > 0){
my $former = $sortedArray[0];
push @{$sequences->[$numSeq]},$former;
$currentLength++;
$greatestLength=$currentLength;
$longuestSequence = $numSeq;
for (my $i=1;$i<@sortedArray;$i++){
if ($sortedArray[$i]== ($former+1)){
push @{$sequences->[$numSeq]},$sortedArray[$i];
$currentLength++;
if ($currentLength > $greatestLength){
$greatestLength=$currentLength;
$longuestSequence = $numSeq;
}
}else{
$numSeq++;
push @{$sequences->[$numSeq]},$sortedArray[$i];
$currentLength=1;
if ($currentLength > $greatestLength){
$greatestLength=$currentLength;
$longuestSequence = $numSeq;
}
}
$former=$sortedArray[$i];
}
}
# print "SEQUENCES (best $longuestSequence):\n";
# print $dumper->dumpValue($sequences);
return $sequences->[$longuestSequence];
}
sub ibm1Prob {
my ($src,$trg,$pt_s)=@_;
my @wsrc=split(/ /, $src);
my @wtrg=split(/ /, $trg);
my $numWs = scalar(@wsrc);
my $numWt = scalar(@wtrg);
# IBM1 SRC->TRG
if (@wsrc>1){
unshift @wtrg,"NULL";
$numWt++;
}
my $prod=1;
for my $ws (@wsrc){
my $sum=0;
for my $wt (@wtrg){
my $srctrg="$ws ||| $wt";
$sum+=$pt_s->{$srctrg};
#print "\t$srctrg => $pt_s->{$srctrg} ( $sum )\n";
}
$prod*=$sum;
}
#calcul de la probabilitat ibm1
# my $ibm1_st=$prod/($numWt**$numWs);
my $ibm1_st=$prod;
if ($ibm1_st==0){
$ibm1_st=1.0e-40;
}
# print "$src | $trg ==> prod:$prod I:$numWt J:$numWs ibmst: $ibm1_st\n";
return $ibm1_st;
}
# INPUT: ref to array A, ref to array B
# OUTPUT: list of positions of array A where array B begins in A
# EX: A=(4,5,8,3,9,22,8,3) B=(8,3) output=(2,6)
sub findArrayInAnother {
my ($refToSubArray,$refToArray)=@_;
my @startPosi;
my $numSub = scalar(@$refToSubArray);
my $numArray=scalar(@$refToArray);
if ($numArray >= $numSub){
for (my $i=0;$i<$numArray;$i++){
my $failed=0;
my $subi=0;
while (!$failed && $subi<$numSub){
if ($refToSubArray->[$subi] ne $refToArray->[$i+$subi]){$failed=1;}
$subi++;
}
if (!$failed){push @startPosi,$i;}
}
}
return @startPosi;
}
#########################################################
### SUBROUTINES COPIED FROM ALGORITHM::DIFF CPAN MODULE #
#########################################################
sub _withPositionsOfInInterval
{
my $aCollection = shift; # array ref
my $start = shift;
my $end = shift;
my $keyGen = shift;
my %d;
my $index;
for ( $index = $start ; $index <= $end ; $index++ )
{
my $element = $aCollection->[$index];
my $key = &$keyGen( $element, @_ );
if ( exists( $d{$key} ) )
{
unshift ( @{ $d{$key} }, $index );
}
else
{
$d{$key} = [$index];
}
}
return wantarray ? %d : \%d;
}
sub _replaceNextLargerWith
{
my ( $array, $aValue, $high ) = @_;
$high ||= $#$array;
# off the end?
if ( $high == -1 || $aValue > $array->[-1] )
{
push ( @$array, $aValue );
return $high + 1;
}
# binary search for insertion point...
my $low = 0;
my $index;
my $found;
while ( $low <= $high )
{
$index = ( $high + $low ) / 2;
# $index = int(( $high + $low ) / 2); # without 'use integer'
$found = $array->[$index];
if ( $aValue == $found )
{
return undef;
}
elsif ( $aValue > $found )
{
$low = $index + 1;
}
else
{
$high = $index - 1;
}
}
# now insertion point is in $low.
$array->[$low] = $aValue; # overwrite next larger
return $low;
}
sub _longestCommonSubsequence
{
my $a = shift; # array ref or hash ref
my $b = shift; # array ref or hash ref
my $counting = shift; # scalar
my $keyGen = shift; # code ref
my $compare; # code ref
if ( ref($a) eq 'HASH' )
{ # prepared hash must be in $b
my $tmp = $b;
$b = $a;
$a = $tmp;
}
# Check for bogus (non-ref) argument values
if ( !ref($a) || !ref($b) )
{
my @callerInfo = caller(1);
die 'error: must pass array or hash references to ' . $callerInfo[3];
}
# set up code refs
# Note that these are optimized.
if ( !defined($keyGen) ) # optimize for strings
{
$keyGen = sub { $_[0] };
$compare = sub { my ( $a, $b ) = @_; $a eq $b };
}
else
{
$compare = sub {
my $a = shift;
my $b = shift;
&$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
};
}
my ( $aStart, $aFinish, $matchVector ) = ( 0, $#$a, [] );
my ( $prunedCount, $bMatches ) = ( 0, {} );
if ( ref($b) eq 'HASH' ) # was $bMatches prepared for us?
{
$bMatches = $b;
}
else
{
my ( $bStart, $bFinish ) = ( 0, $#$b );
# First we prune off any common elements at the beginning
while ( $aStart <= $aFinish
and $bStart <= $bFinish
and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
{
$matchVector->[ $aStart++ ] = $bStart++;
$prunedCount++;
}
# now the end
while ( $aStart <= $aFinish
and $bStart <= $bFinish
and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
{
$matchVector->[ $aFinish-- ] = $bFinish--;
$prunedCount++;
}
# Now compute the equivalence classes of positions of elements
$bMatches =
_withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
}
my $thresh = [];
my $links = [];
my ( $i, $ai, $j, $k );
for ( $i = $aStart ; $i <= $aFinish ; $i++ )
{
$ai = &$keyGen( $a->[$i], @_ );
if ( exists( $bMatches->{$ai} ) )
{
$k = 0;
for $j ( @{ $bMatches->{$ai} } )
{
# optimization: most of the time this will be true
if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
{
$thresh->[$k] = $j;
}
else
{
$k = _replaceNextLargerWith( $thresh, $j, $k );
}
# oddly, it's faster to always test this (CPU cache?).
if ( defined($k) )
{
$links->[$k] =
[ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
}
}
}
}
if (@$thresh)
{
return $prunedCount + @$thresh if $counting;
for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
{
$matchVector->[ $link->[1] ] = $link->[2];
}
}
elsif ($counting)
{
return $prunedCount;
}
return wantarray ? @$matchVector : $matchVector;
}
sub LCS
{
my $a = shift; # array ref
my $b = shift; # array ref or hash ref
my $matchVector = _longestCommonSubsequence( $a, $b, 0, @_ );
my @retval;
my $i;
for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
{
if ( defined( $matchVector->[$i] ) )
{
push ( @retval, $a->[$i] );
}
}
return wantarray ? @retval : \@retval;
}
sub LCS_ratio
{
my $a = shift; # array ref
my $b = shift; # array ref or hash ref
my $lon=_longestCommonSubsequence( $a, $b, 1, @_ );
# if ($lon>0){print "LCS:$lon\n";}
my $ratio;
#Aquí hago el ratio de la similitud cogiendo como valor total la longitud de la frase que sea más larga
my $m=scalar(@$a);
my $n=scalar(@$b);
if (($m >= $n) && ($m != 0) && ($n != 0)){$ratio=$lon/$m;}
elsif(($m < $n) && ($m != 0) && ($n != 0)){$ratio=$lon/$n;}
else{$ratio=0;}
return ($ratio);
}
sub traverse_sequences
{
my $a = shift; # array ref
my $b = shift; # array ref
my $callbacks = shift || {};
my $keyGen = shift;
my $matchCallback = $callbacks->{'MATCH'} || sub { };
my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
my $finishedACallback = $callbacks->{'A_FINISHED'};
my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
my $finishedBCallback = $callbacks->{'B_FINISHED'};
my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
# Process all the lines in @$matchVector
my $lastA = $#$a;
my $lastB = $#$b;
my $bi = 0;
my $ai;
for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
{
my $bLine = $matchVector->[$ai];
if ( defined($bLine) ) # matched
{
&$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
&$matchCallback( $ai, $bi++, @_ );
}
else
{
&$discardACallback( $ai, $bi, @_ );
}
}
# The last entry (if any) processed was a match.
# $ai and $bi point just past the last matching lines in their sequences.
while ( $ai <= $lastA or $bi <= $lastB )
{
# last A?
if ( $ai == $lastA + 1 and $bi <= $lastB )
{
if ( defined($finishedACallback) )
{
&$finishedACallback( $lastA, @_ );
$finishedACallback = undef;
}
else
{
&$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
}
}
# last B?
if ( $bi == $lastB + 1 and $ai <= $lastA )
{
if ( defined($finishedBCallback) )
{
&$finishedBCallback( $lastB, @_ );
$finishedBCallback = undef;
}
else
{
&$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
}
}
&$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
&$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
}
return 1;
}
#diff computes the smallest set of additions and deletions necessary to turn the first sequence into the second, and returns a description of these changes. The description is a list of hunks; each hunk represents a contiguous section of items which should be added, deleted, or replaced. (Hunks containing unchanged items are not included.)
# EXAMPLES:
# @diffs = diff( \@seq1, \@seq2 );
# $diffs_ref = diff( \@seq1, \@seq2 );
sub diff
{
my $a = shift; # array ref
my $b = shift; # array ref
my $retval = [];
my $hunk = [];
my $discard = sub {
push @$hunk, [ '-', $_[0], $a->[ $_[0] ] ];
};
my $add = sub {
push @$hunk, [ '+', $_[1], $b->[ $_[1] ] ];
};
my $match = sub {
push @$retval, $hunk
if 0 < @$hunk;
$hunk = []
};
traverse_sequences( $a, $b,
{ MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
&$match();
return wantarray ? @$retval : $retval;
}
###### END ALGORITHM::DIFF FUNCTIONS
sub max {
my ($a,$b)=@_;
if ($a > $b) {return $a;}
else {return $b};
}
sub min {
my ($a,$b)=@_;
if ($a < $b) {return $a;}
else {return $b};
}
### copy-paste of Algorithm::MinMax module (to avoid having dependencies)
sub minmax {
my @array = @{ $_[ 1 ] };
my @result;
if( scalar( @array ) == 0 ) {
return @result;
}
if( scalar( @array ) == 1 ) {
$result[ 0 ] = $array[ 0 ];
$result[ 1 ] = $array[ 0 ];
return @result;
}
my @min_cand;
my @max_cand;
my $r = scalar( @array ) - 2;
my $k = 0;
for( my $i = 0; $i <= $r ; $i = $i + 2 ) {
if( $array[ $i ] < $array[ $i + 1 ] ) {
$min_cand[ $k ] = $array[ $i ];
$max_cand[ $k ] = $array[ $i + 1 ];
} else {
$min_cand[ $k ] = $array[ $i + 1 ];
$max_cand[ $k ] = $array[ $i ];
}
++$k;
}
if( scalar( @array ) % 2 != 0 ) {
if( $min_cand[ 0 ] < $array[ $r + 1 ] ) {
$max_cand[ $k ] = $array[ $r + 1 ];
} else {
$min_cand[ $k ] = $array[ $r + 1 ];
}
}
my $m = $min_cand[ 0 ];
for( my $i = 1; $i < scalar( @min_cand ); ++$i ) {
if( $min_cand[ $i ] < $m ) {
$m = $min_cand[ $i ];
}
}
$result[ 0 ] = $m;
$m = $max_cand[ 0 ];
for( my $i = 1; $i < scalar( @max_cand ); ++$i ) {
if( $max_cand[ $i ] > $m ) {
$m = $max_cand[ $i ];
}
}
$result[ 1 ] = $m;
@result;
}
### END Algorithm::MinMax function
sub escapeRegExp {
my $line = shift;
# regExp characters to escape: \ | ( ) [ { ^ $ * + ? .
$line =~ s/\\/\\\\/g;
$line =~ s/\|/\\\|/g;
$line =~ s/\(/\\\(/g;
$line =~ s/\)/\\\)/g;
$line =~ s/\[/\\\[/g;
$line =~ s/\{/\\\}/g;
$line =~ s/\^/\\\^/g;
$line =~ s/\$/\\\$/g;
$line =~ s/\*/\\\*/g;
$line =~ s/\+/\\\+/g;
$line =~ s/\?/\\\?/g;
$line =~ s/\./\\\./g;
return $line;
}
1;