Bio::Search::Tiling::MapTileUtils - utilities for manipulating closed intervals for an HSP tiling algorithm


BioPerl documentation Contained in the BioPerl distribution.

Index


Code Index:

NAME

Top

Bio::Search::Tiling::MapTileUtils - utilities for manipulating closed intervals for an HSP tiling algorithm

SYNOPSIS

Top

Not used directly.

DESCRIPTION

Top

Not used directly.

NOTE

Top

An "interval" in this module is defined as an arrayref [$a0, $a1], where $a0, $a1 are scalar numbers satisfying $a0 <= $a1.

AUTHOR

Top

Mark A. Jensen - maj -at- fortinbras -dot- us

APPENDIX

Top

interval_tiling

 Title   : interval_tiling()
 Usage   : @tiling = interval_tiling( \@array_of_intervals )
 Function: Find minimal set of intervals covering the input set
 Returns : array of arrayrefs of the form
  ( [$interval => [ @indices_of_collapsed_input_intervals ]], ...)
 Args    : arrayref of intervals

decompose_interval

 Title   : decompose_interval
 Usage   : @decomposition = decompose_interval( \@overlappers )
 Function: Calculate the disjoint decomposition of a set of
           overlapping intervals, each annotated with a list of
           covering intervals
 Returns : array of arrayrefs of the form
           ( [[@interval] => [@indices_of_coverers]], ... )
 Args    : arrayref of intervals (arrayrefs like [$a0, $a1], with
 Note    : Each returned interval is associated with a list of indices of the
           original intervals that cover that decomposition component
           (scalar size of this list could be called the 'coverage coefficient')
 Note    : Coverage: each component of the decomp is completely contained
           in the input intervals that overlap it, by construction.
 Caveat  : This routine expects the members of @overlappers to overlap,
           but doesn't check this.

are_disjoint

 Title   : are_disjoint
 Usage   : are_disjoint( [$a0, $a1], [$b0, $b1] )
 Function: Determine if two intervals are disjoint
 Returns : True if the intervals are disjoint, false if they overlap
 Args    : array of two intervals

min_covering_interval

 Title   : min_covering_interval 
 Usage   : $interval = min_covering_interval( [$a0,$a1],[$b0,$b1] )
 Function: Determine the minimal covering interval for two intervals
 Returns : an interval
 Args    : two intervals

get_intervals_from_hsps

 Title   : get_intervals_from_hsps
 Usage   : @intervals = get_intervals_from_hsps($type, @hsp_objects)
 Function: Return array of intervals of the form [ $start, $end ],
           from an array of hsp objects
 Returns : an array of intervals
 Args    : scalar $type, array of HSPI objects; where $type is one of 'hit',
           'subject', 'query'

_allowable_filters

 Title   : _allowable_filters
 Usage   : _allowable_filters($Bio_Search_Hit_HitI, $type)
 Function: Return the HSP filters (strand, frame) allowed, 
           based on the reported algorithm
 Returns : String encoding allowable filters: 
           s = strand, f = frame
           Empty string if no filters allowed
           undef if algorithm unrecognized
 Args    : A Bio::Search::Hit::HitI object,
           scalar $type, one of 'hit', 'subject', 'query';
           default is 'query'

_set_attributes

 Title   : _set_attributes
 Usage   : $tiling->_set_attributes()
 Function: Sets attributes for invocant
           that depend on algorithm name
 Returns : True on success
 Args    : none
 Note    : setting based on the configuration table
           %alg_lookup

containing_hsps()

 Title   : containing_hsps
 Usage   : @hsps = containing_hsps($interval, @hsps_to_search)
 Function: Return a list of hsps whose coordinates completely contain the
           given $interval
 Returns : Array of HSP objects
 Args    : $interval : [$int1, $int2],
           array of HSP objects

covering_groups()

 Title   : covering_groups
 Usage   : 
 Function: divide a list of **ordered,disjoint** intervals (as from a 
           coverage map) into a set of disjoint covering groups
 Returns : array of arrayrefs, each arrayref a covering set of 
           intervals
 Args    : array of intervals

matches_MT

 Title   : matches_MT
 Usage   : $hsp->matches($type, $action, $start, $end)
 Purpose   : Get the total number of identical or conserved matches 
             in the query or sbjct sequence for the given HSP. Optionally can
             report data within a defined interval along the seq.
 Returns   : scalar int 
 Args      : 
 Comments  : Relies on seq_str('match') to get the string of alignment symbols
             between the query and sbjct lines which are used for determining
             the number of identical and conservative matches.
 Note      : Modeled on Bio::Search::HSP::HSPI::matches

get_SeqFeatures

 Title   : get_SeqFeatures
 Usage   :
 Function: Get the feature objects held by this feature holder.

           Features which are not top-level are subfeatures of one or
           more of the returned feature objects, which means that you
           must traverse the subfeature arrays of each top-level
           feature object in order to traverse all features associated
           with this sequence.

           Top-level features can be obtained by tag, specified in 
           the argument.

           Use get_all_SeqFeatures() if you want the feature tree
           flattened into one single array.

 Example :
 Returns : an array of Bio::SeqFeatureI implementing objects
 Args    : [optional] scalar string (feature tag)




feature_count

 Title   : feature_count
 Usage   : $seq->feature_count()
 Function: Return the number of SeqFeatures attached to a sequence
 Returns : integer representing the number of SeqFeatures
 Args    : None




add_SeqFeature

 Title   : add_SeqFeature
 Usage   : $seq->add_SeqFeature($feat);
           $seq->add_SeqFeature(@feat);
 Function: Adds the given feature object (or each of an array of feature
           objects to the feature array of this
           sequence. The object passed is required to implement the
           Bio::SeqFeatureI interface.
 Returns : 1 on success
 Args    : A Bio::SeqFeatureI implementing object, or an array of such objects.




remove_SeqFeatures

 Title   : remove_SeqFeatures
 Usage   : $seq->remove_SeqFeatures();
 Function: Flushes all attached SeqFeatureI objects.

           To remove individual feature objects, delete those from the returned
           array and re-add the rest.
 Example :
 Returns : The array of Bio::SeqFeatureI objects removed from this seq.
 Args    : None





BioPerl documentation Contained in the BioPerl distribution.
#$Id$
package Bio::Search::Tiling::MapTileUtils;
use strict;
use warnings;
use Exporter;

BEGIN {
    our @ISA = qw( Exporter );
    our @EXPORT = qw( get_intervals_from_hsps 
                      interval_tiling 
                      decompose_interval
                      containing_hsps
                      covering_groups
                      _allowable_filters 
                      _set_attributes
                      _mapping_coeff);
}

# tiling trials
# assumed: intervals are [$a0, $a1], with $a0 <= $a1
sub interval_tiling {
    return unless $_[0]; # no input
    my $n = scalar @{$_[0]};
    my %input;
    @input{(0..$n-1)} = @{$_[0]};
    my @active = (0..$n-1);
    my @hold;
    my @tiled_ints;
    my @ret;
    while (@active) {
	my $tgt = $input{my $tgt_i = shift @active};
	push @tiled_ints, $tgt_i;
	my $tgt_not_disjoint = 1;
	while ($tgt_not_disjoint) {
	    $tgt_not_disjoint = 0;
	    while (my $try_i = shift @active) {
		my $try = $input{$try_i};
		if ( !are_disjoint($tgt, $try) ) {
		    $tgt = min_covering_interval($tgt,$try);
		    push @tiled_ints, $try_i;
		    $tgt_not_disjoint = 1;
		}
		else {
		    push @hold, $try_i;
		}
	    }
	    if (!$tgt_not_disjoint) {
		push @ret, [ $tgt => [@tiled_ints] ];
		@tiled_ints = ();
	    }
	    @active = @hold;
	    @hold = ();
	}
    }
    return @ret;
}

### what if the input intervals don't overlap?? They MUST overlap; that's
### what interval_tiling() is for.

sub decompose_interval {
    return unless $_[0]; # no input
    my @ints = @{$_[0]};
    my (%flat,@flat);
    ### this is ok, but need to handle the case where a lh and rh endpoint
    ### coincide...
    # decomposition --
    # flatten:
    # every lh endpoint generates (lh-1, lh)
    # every rh endpoint generates (rh, rh+)
    foreach (@ints) {
	$flat{$$_[0]-1}++;
	$flat{$$_[0]}++;
	$flat{$$_[1]}++;
	$flat{$$_[1]+1}++;
    }
    # sort, create singletons if nec.
    my @a;
    @a = sort {$a<=>$b} keys %flat;
    # throw out first and last (meeting a boundary condition)
    shift @a; pop @a;
    # look for singletons
    @flat = (shift @a, shift @a);
    if ( $flat[1]-$flat[0] == 1 ) {
	@flat = ($flat[0],$flat[0], $flat[1]);
    }
    while (my $a = shift @a) {
	if ($a-$flat[-2]==2) {
	    push @flat, $flat[-1]; # create singleton interval
	}
	push @flat, $a;
    }
    if ($flat[-1]-$flat[-2]==1 and @flat % 2) {
	push @flat, $flat[-1];
    }
    # component intervals are consecutive pairs
    my @decomp;
    while (my $a = shift @flat) {
	push @decomp, [$a, shift @flat];
    }

    # for each component, return a list of the indices of the input intervals
    # that cover the component.
    my @coverage;
    foreach my $i (0..$#decomp) {
	foreach my $j (0..$#ints) {
	    unless (are_disjoint($decomp[$i], $ints[$j])) {
		if (defined $coverage[$i]) {
		    push @{$coverage[$i]}, $j;
		}
		else {
		    $coverage[$i] = [$j];
		}
	    }
	}
    }
    return map { [$decomp[$_] => $coverage[$_]] } (0..$#decomp);
}    

sub are_disjoint {
    my ($int1, $int2) = @_;
    return 1 if ( $$int1[1] < $$int2[0] ) || ( $$int2[1] < $$int1[0]);
    return 0;
}

sub min_covering_interval {
    my ($int1, $int2) = @_;
    my @a = sort {$a <=> $b} (@$int1, @$int2);
    return [$a[0],$a[-1]];
}

sub get_intervals_from_hsps {
    my $type = shift;
    my @hsps;
    if (ref($type) =~ /HSP/) {
	push @hsps, $type;
	$type = 'query';
    }
    elsif ( !grep /^$type$/,qw( hit subject query ) ) {
	die "Unknown HSP type '$type'";
    }
    $type = 'hit' if $type eq 'subject';
    push @hsps, @_;
    my @ret;
    foreach (@hsps) {
#	push @ret, [ $_->start($type), $_->end($type)];
	push @ret, [ $_->range($type) ];
    }
    return @ret;
}

# fast, clear, nasty, brutish and short.
# for _allowable_filters(), _set_mapping()
# covers BLAST, FAST families
# FASTA is ambiguous (nt or aa) based on alg name only

my $alg_lookup = {
    'N'  => { 'mapping' => [1,1],
              'def_context' => ['p_','p_'],
	      'has_strand' => [1, 1],
              'has_frame' => [0, 0]},
    'P'  => { 'mapping' => [1,1],
              'def_context' => ['all','all'],
	      'has_strand' => [0, 0],
	      'has_frame' => [0, 0]},
    'X'  => { 'mapping' => [3, 1],
              'def_context' => [undef,'all'],
	      'has_strand' => [1, 0],
	      'has_frame' => [1, 0]}, 
    'Y'  => { 'mapping' => [3, 1],
              'def_context' => [undef,'all'],
	      'has_strand' => [1, 0],
	      'has_frame' => [1, 0]}, 
    'TA' => { 'mapping' => [1, 3],
              'def_context' => ['all',undef],
	      'has_strand' => [0, 1],
	      'has_frame' => [0, 1]}, 
    'TN' => { 'mapping' => [1, 3],
              'def_context' => ['p_',undef],
	      'has_strand' => [1,1],
	      'has_frame' => [0, 1]}, 
    'TX' => { 'mapping' => [3, 3],
              'def_context' => [undef,undef],
	      'has_strand' => [1, 1],
	      'has_frame' => [1, 1]}, 
    'TY' => { 'mapping' => [3, 3],
              'def_context' => [undef,undef],
	      'has_strand' => [1, 1],
	      'has_frame' => [1, 1]}
};
   
sub _allowable_filters {
    my $hit = shift;
    my $type = shift;
    $type ||= 'q';
    unless (grep /^$type$/, qw( h q s ) ) {
	warn("Unknown type '$type'; returning ''");
	return '';
    }
    $type = 'h' if $type eq 's';
    my $alg = $hit->algorithm;

    # pretreat (i.e., kludge it)
    $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);

    for ($hit->algorithm) {
	/MEGABLAST/i && do {
	    return qr/[s]/;
	};
	/(.?)BLAST(.?)/i && do {
	    return $$alg_lookup{$1.$2}{$type};
	};
	/(.?)FAST(.?)/ && do {
	    return $$alg_lookup{$1.$2}{$type};
	};
	do { # unrecognized
	    last;
	};
    }
    return;
}


sub _set_attributes {
    my $self = shift;
    my $alg = $self->hit->algorithm;

    # pretreat (i.e., kludge it)
    $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
    
    for ($alg) {
	/MEGABLAST/i && do {
	    ($self->{_mapping_query},$self->{_mapping_hit}) = (1,1);
	    ($self->{_def_context_query},$self->{_def_context_hit}) =
		('p_','p_');
	    ($self->{_has_frame_query},$self->{_has_frame_hit}) =
		(0, 0);
	    ($self->{_has_strand_query},$self->{_has_strand_hit}) =
		(1, 1);
	    last;
	};
	/(.?)BLAST(.?)/i && do {
	    ($self->{_mapping_query},$self->{_mapping_hit}) = 
		@{$$alg_lookup{$1.$2}{mapping}};
	    ($self->{_def_context_query},$self->{_def_context_hit}) =
		@{$$alg_lookup{$1.$2}{def_context}};
	    ($self->{_has_frame_query},$self->{_has_frame_hit}) =	    
		@{$$alg_lookup{$1.$2}{has_frame}};
	    ($self->{_has_strand_query},$self->{_has_strand_hit}) =	    
		@{$$alg_lookup{$1.$2}{has_strand}};
	    last;
	};
	/(.?)FAST(.?)/ && do {
	    ($self->{_mapping_query},$self->{_mapping_hit}) = 
		@{$$alg_lookup{$1.$2}{mapping}};
	    ($self->{_def_context_query},$self->{_def_context_hit}) =
		@{$$alg_lookup{$1.$2}{def_context}};
	    ($self->{_has_frame_query},$self->{_has_frame_hit}) =	    
		@{$$alg_lookup{$1.$2}{has_frame}};
	    ($self->{_has_strand_query},$self->{_has_strand_hit}) =	    
		@{$$alg_lookup{$1.$2}{has_strand}};
	    last;
	};
	do { # unrecognized
	    $self->warn("Unrecognized algorithm '$alg'; defaults may not work");
	    ($self->{_mapping_query},$self->{_mapping_hit}) = (1,1);
	    ($self->{_def_context_query},$self->{_def_context_hit}) =
		('all','all');
	    ($self->{_has_frame_query},$self->{_has_frame_hit}) =	    
		(0,0);
	    ($self->{_has_strand_query},$self->{_has_strand_hit}) =	    
		(0,0);
	    return 0;
	    last;
	};
    }
    return 1;
}
           
sub _mapping_coeff {
    my $obj = shift;
    my $type = shift;
    my %type_i = ( 'query' => 0, 'hit' => 1 );
    unless ( ref($obj) && $obj->can('algorithm') ) {
	$obj->warn("Object type unrecognized");
	return undef;
    }
    $type ||= 'query';
    unless ( grep(/^$type$/, qw( query hit subject ) ) ) {
	$obj->warn("Sequence type unrecognized");
	return undef;
    }
    $type = 'hit' if $type eq 'subject';
    my $alg = $obj->algorithm;

    # pretreat (i.e., kludge it)
    $alg =~ /^RPS/ and ($alg) = ($alg =~ /\(([^)]+)\)/);
    
    for ($alg) {
	/MEGABLAST/i && do {
	    return 1;
	};
	/(.?)BLAST(.?)/i && do {
	    return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}];
	};
	/(.?)FAST(.?)/ && do {
	    return $$alg_lookup{$1.$2}{'mapping'}[$type_i{$type}];
	};
	do { # unrecognized
	    last;
	};
    }
    return;
}

# a graphical depiction of a set of intervals
sub _ints_as_text {
    my $ints = shift;
    my @ints = @$ints;
    my %pos;
    for (@ints) {
	$pos{$$_[0]}++;
	$pos{$$_[1]}++;
    }
    
    my @pos = sort {$a<=>$b} keys %pos;
    @pos = map {sprintf("%03d",$_)} @pos;
#header
    my $max=0;
    $max = (length > $max) ? length : $max for (@pos);
    for my $j (0..$max-1) {
	my $i = $max-1-$j; 
	my @line = map { substr($_, $j, 1) || '0' } @pos;
	print join('', @line), "\n";
    }
    print '-' x @pos, "\n";
    undef %pos;
    @pos{map {sprintf("%d",$_)} @pos} = (0..@pos);
    foreach (@ints) {
	print ' ' x $pos{$$_[0]}, '[', ' ' x ($pos{$$_[1]}-$pos{$$_[0]}-1), ']', ' ' x (@pos-$pos{$$_[1]}), "\n";
    }
}



# could be more efficient if hsps are assumed ordered...
sub containing_hsps {
    my $intvl = shift;
    my @hsps = @_;
    my @ret;
    my ($beg, $end) = @$intvl;
    foreach my $hsp (@hsps) {
	my ($start, $stop) = ($hsp->start, $hsp->end);
	push @ret, $hsp if ( $start <= $beg and $end <= $stop );
    }
    return @ret;
}



sub covering_groups {
    my @intervals = @_;
    return unless @intervals;
    my (@groups, $grp);
    push @{$groups[0]}, shift @intervals;
    $grp = $groups[0];
    for (my $intvl = shift @intervals; @intervals; $intvl = shift @intervals) {
	if ( $intvl->[0] - $grp->[-1][1] == 1 ) { # intervals are direcly adjacent
	    push @$grp, $intvl;
	}
	else {
	    $grp = [$intvl];
	    push @groups, $grp;
	}
    }
    return @groups;
}

1;
# need our own subsequencer for hsps. 

package Bio::Search::HSP::HSPI;

use strict;
use warnings;

sub matches_MT {
    my( $self, @args ) = @_;
    my($type, $action, $beg, $end) = $self->_rearrange( [qw(TYPE ACTION START END)], @args);
    my @actions = qw( identities conserved searchutils );
    
    # prep $type
    $self->throw("Type not specified") if !defined $type;
    $self->throw("Type '$type' unrecognized") unless grep(/^$type$/,qw(query hit subject));
    $type = 'hit' if $type eq 'subject';

    # prep $action
    $self->throw("Action not specified") if !defined $action;
    $self->throw("Action '$action' unrecognized") unless grep(/^$action$/, @actions);

    my ($len_id, $len_cons);
    my $c = Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type);
    if ((defined $beg && !defined $end) || (!defined $beg && defined $end)) {
	$self->throw("Both start and end are required");
    }
    elsif ( (!defined($beg) && !defined($end)) || !$self->seq_str('match') ) {
        ## Get data for the whole alignment.
	# the reported values x mapping 
	$self->debug("Sequence data not present in report; returning data for entire HSP") unless $self->seq_str('match');
	($len_id, $len_cons) = map { $c*$_ } ($self->num_identical, $self->num_conserved);
	for ($action) {
	    $_ eq 'identities'  && do {
		return $len_id;
	    };
	    $_ eq 'conserved'   && do {
		return $len_cons;
	    };
	    $_ eq 'searchutils' && do {
		return ($len_id, $len_cons);
	    };
	    do {
		$self->throw("What are YOU doing here?");
	    };
	}
    }
    else {
        ## Get the substring representing the desired sub-section of aln.
        my($start,$stop) = $self->range($type);
	if ( $beg < $start or $stop < $end ) {
	    $self->throw("Start/stop out of range [$start, $stop]");
	}

	# handle gaps
	my $match_str = $self->seq_str('match');
	if ($self->gaps) {
	    # strip the homology string of gap positions relative
	    # to the target type
	    $match_str = $self->seq_str('match');
	    my $tgt   = $self->seq_str($type);
	    my $encode = $match_str ^ $tgt;
	    my $zap = '-'^' ';
	    $encode =~ s/$zap//g;
	    $tgt =~ s/-//g;
	    $match_str = $tgt ^ $encode;
	    # match string is now the correct length for substr'ing below,
	    # given that start and end are gapless coordinates in the 
	    # blast report
	}

        my $seq = "";
	$seq = substr( $match_str, 
		       int( ($beg-$start)/Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type) ),
		       int( 1+($end-$beg)/Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self, $type) ) 
	    );

        if(!CORE::length $seq) {
            $self->throw("Undefined sub-sequence ($beg,$end). Valid range = $start - $stop");
        }

        $seq =~ s/ //g;  # remove space (no info).
        $len_cons = (CORE::length $seq)*(Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self,$type));
        $seq =~ s/\+//g;  # remove '+' characters (conservative substitutions)
        $len_id = (CORE::length $seq)*(Bio::Search::Tiling::MapTileUtils::_mapping_coeff($self,$type));
	for ($action) {
	    $_ eq 'identities' && do {
		return $len_id;
	    };
	    $_ eq 'conserved' && do {
		return $len_cons;
	    };
	    $_ eq 'searchutils' && do {
		return ($len_id, $len_cons);
	    };
	    do {
		$self->throw("What are YOU doing here?");
	    };
	}
    }
}

1;

package Bio::LocatableSeq;
use strict;
use warnings;

# mixin the Bio::FeatureHolderI implementation of 
# Bio::Seq -- for get_tiled_aln

sub get_SeqFeatures{
   my $self = shift;
   my $tag = shift;

   if( !defined $self->{'_as_feat'} ) {
       $self->{'_as_feat'} = [];
   }
   if ($tag) {
       return map { $_->primary_tag eq $tag ? $_ : () } @{$self->{'_as_feat'}};
   }
   else {
       return @{$self->{'_as_feat'}};
   }
}

sub feature_count {
    my ($self) = @_;
    if (defined($self->{'_as_feat'})) {
	return ($#{$self->{'_as_feat'}} + 1);
    } else {
	return 0;
    }
}

sub add_SeqFeature {
   my ($self,@feat) = @_;
   $self->{'_as_feat'} = [] unless $self->{'_as_feat'};
   foreach my $feat ( @feat ) {
       if( !$feat->isa("Bio::SeqFeatureI") ) {
	   $self->throw("$feat is not a SeqFeatureI and that's what we expect...");
       }
       $feat->attach_seq($self);
       push(@{$self->{'_as_feat'}},$feat);
   }
   return 1;
}

sub remove_SeqFeatures {
    my $self = shift;

    return () unless $self->{'_as_feat'};
    my @feats = @{$self->{'_as_feat'}};
    $self->{'_as_feat'} = [];
    return @feats;
}

1;