Bio::Chaos::FeatureUtil - sequence and feature utilities


Bio-Chaos documentation Contained in the Bio-Chaos distribution.

Index


Code Index:

NAME

Top

  Bio::Chaos::FeatureUtil     - sequence and feature utilities

SYNOPSIS

Top

  Bio::Chaos::FeatureUtil->blah('xx');

DESCRIPTION

Top

COORDINATES

Coordinates are either Interbase or <Base-oriented>.

Interbase coordinates count the spaces between bases and the origin is zero

Base-oriented coordinates count the bases and the origin is 1

To convert between these two, add/subtract 1 from the minimum (low) coordinate only

Imagine a sequence TCATGCAA eg 1 2 3 4 5 6 7 8 BASE T C A T G C A A 0 1 2 3 4 5 6 7 8 INTERBASE

In interbase, the ATG codon is [2,5]; in base it is [3,5]

With interbase, length is high-low

With base, length is (high-low)+1

interbase has the advantages of simpler arithmetic, and the ability to represent length-zero features (eg insertions)

Beyond the base/interbase distinction, ranges can also be specified as either min-max-strand triples (directionality explicit) or begin-end pairs (directionality implicit).

bcmm
  (bmin, bmax, strand)

base coordinates with minmax semantics - native to bioperl

ibmm
  (imin, imax, strand)

interbase with minmax semantics - native to chado

ibv
  (nbeg, nend, ?strand)

interbase vector (in the mathematical sense)

so called natural begin and end - this is the native chaos coordinate system


Bio-Chaos documentation Contained in the Bio-Chaos distribution.
# $Id: FeatureUtil.pm,v 1.3 2005/04/27 19:32:45 cmungall Exp $
#
#

package Bio::Chaos::FeatureUtil;

use Exporter;
use Bio::Chaos::Root;
use Data::Stag;
use Carp;
use Bio::Seq;
use FileHandle;
use Digest::MD5;
use strict qw(subs vars refs);
use vars qw(@ISA @EXPORT_OK %EXPORT_TAGS);

@ISA = qw(Bio::Chaos::Root Exporter);


@EXPORT_OK =
  qw(
     bcmm2ibv
     bcmm2ibmm
     ibv2bcmm
     cutseq
     revcomp
     translate
     md5
    );
%EXPORT_TAGS =
  (all=>[@EXPORT_OK]);


sub BLAST_TYPE_TO_DESCRIPTION {
    return
      {
       tblastx => 'translated_nucleotide_match',
       tblastn => 'translated_nucleotide_match',
       blastn => 'nucleotide_to_nucleotide_match',
       blastx => 'nucleotide_to_protein_match',
       blastp => 'protein_to_proteinmatch',
      };
}

sub SELF {
    my $self = shift;
    if (ref($self)) {
	return $self;
    }
    return $self->new;
}

sub unflattener {
    my $self = shift;
    $self->{_unflattener} = shift if @_;
    if (!$self->{_unflattener} ) {
	$self->load_module("Bio::SeqFeature::Tools::Unflattener");
	$self->{_unflattener} =
	  Bio::SeqFeature::Tools::Unflattener->new;
    }
    return $self->{_unflattener};
}

sub typemapper {
    my $self = shift;
    $self->{_typemapper} = shift if @_;
    if (!$self->{_typemapper} ) {
	$self->load_module("Bio::SeqFeature::Tools::TypeMapper");
	$self->{_typemapper} =
	  Bio::SeqFeature::Tools::TypeMapper->new;
    }
    return $self->{_typemapper};
}

sub chaos_from_genbank_file {
    my $self = SELF(shift);
    my $file = shift;
    my $fmt = shift || 'genbank';
    $self->load_module("Bio::SeqIO");
    my $unflattener = $self->unflattener;
    my $tm = $self->typemapper;
    my $seqio =
      Bio::SeqIO->new(-file=> $file,
                      -format => $fmt);
    my @chaoses = ();
    while (my $seq = $seqio->next_seq()) {
	$unflattener->unflatten_seq(-seq=>$seq,
				    -use_magic=>1);
	$tm->map_types_to_SO(-seq=>$seq);
	my $outio = Bio::SeqIO->new( -format => 'chaos');
	$outio->write_seq($seq);
	my $chaos = $outio->handler->stag;
	push(@chaoses, $chaos);
    }
    return $self->merge_chaos_docs(\@chaoses);
}

sub merge_chaos_docs {
    my $self = shift;
    my $chaoses = shift;
    my $chaos = shift @$chaoses;
    foreach my $chaosI (@$chaoses) {
	my @kids = $chaosI->kids;
	foreach my $kid (@kids) {
	    $chaos->add($kid->name, $kid->data)
	      unless $kid->name eq 'chaos_metadata';
	}
    }
    return $chaos;
}


sub bcmm2ibv {
    my ($bmin, $bmax, $strand) = @_;
    $strand ||= 0;
    $strand = 1 if ($strand eq '+');
    $strand = -1 if ($strand eq '-');
    $bmin--;
    ($bmin, $bmax) = ($bmax, $bmin) if $strand < 0;
    return ($bmin, $bmax, $strand); 
}
sub bcmm2ibmm {
    my ($bmin, $bmax, $strand) = @_;
    $bmin--;
    return ($bmin, $bmax, $strand); 
}
sub ibv2bcmm {
    my ($nbeg, $nend, $strand) = @_;
    if ($nbeg != $nend) {
	my $strandP = $nbeg < $nend ? -1 : +1;
	if ($strand &&
	    $strand != $strandP) {
	    confess("$nbeg, $nend, $strand is not valid ibv coordinate triple");
	}
	
    }
}

sub cutseq {
    my $res = shift;
    my $nbeg = shift;
    my $nend = shift;
    if ($nbeg <= $nend) {
        return substr($res, $nbeg, $nend-$nbeg);
    }
    else {
        my $cut = substr($res, $nend, $nbeg-$nend);
        $cut = revcomp($cut);
        return $cut;
    }
}

sub revcomp {
    my $res = shift;
    $res =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
    return scalar(CORE::reverse($res));
}

sub translate {
    my $res = shift;
    my $tt = shift || 0;
    my $seq = Bio::Seq->new(-seq=>$res);
    my $tnres = $seq->translate->seq;
    $tnres =~ s/\*.*//;
    return $tnres;
}

sub md5 {
    my $res = shift;
    my $md5 = Digest::MD5->new;
    $md5->add($res);
    return $md5->hexdigest;
}


1;