| BioPerl documentation | Contained in the BioPerl distribution. |
Bio::AlignIO::phylip - PHYLIP format sequence input/output stream
# Do not use this module directly. Use it via the Bio::AlignIO class.
use Bio::AlignIO;
use Bio::SimpleAlign;
#you can set the name length to something other than the default 10
#if you use a version of phylip (hacked) that accepts ids > 10
my $phylipstream = Bio::AlignIO->new(-format => 'phylip',
-fh => \*STDOUT,
-idlength=>30);
# convert data from one format to another
my $gcgstream = Bio::AlignIO->new(-format => 'msf',
-file => 't/data/cysprot1a.msf');
while( my $aln = $gcgstream->next_aln ) {
$phylipstream->write_aln($aln);
}
# do it again with phylip sequential format format
$phylipstream->interleaved(0);
# can also initialize the object like this
$phylipstream = Bio::AlignIO->new(-interleaved => 0,
-format => 'phylip',
-fh => \*STDOUT,
-idlength=>10);
$gcgstream = Bio::AlignIO->new(-format => 'msf',
-file => 't/data/cysprot1a.msf');
while( my $aln = $gcgstream->next_aln ) {
$phylipstream->write_aln($aln);
}
This object can transform Bio::SimpleAlign objects to and from PHYLIP format. By default it works with the interleaved format. By specifying the flag -interleaved => 0 in the initialization the module can read or write data in sequential format.
Long IDs up to 50 characters are supported by flag -longid => 1. ID strings can be surrounded by single quoted. They are mandatory only if the IDs contain spaces.
Please direct usage questions or support issues to the mailing list:
bioperl-l@bioperl.org
rather than to the module maintainer directly. Many experienced and reponsive experts will be able look at the problem and quickly address it. Please include a thorough description of the problem with code and data examples if at all possible.
Report bugs to the Bioperl bug tracking system to help us keep track the bugs and their resolution. Bug reports can be submitted via the web:
https://redmine.open-bio.org/projects/bioperl/
Email: heikki at ebi.ac.uk Email: jason at bioperl.org
The rest of the documentation details each of the object methods. Internal methods are usually preceded with a _
Title : new
Usage : my $alignio = Bio::AlignIO->new(-format => 'phylip'
-file => '>file',
-idlength => 10,
-idlinebreak => 1);
Function: Initialize a new L<Bio::AlignIO::phylip> reader or writer
Returns : L<Bio::AlignIO> object
Args : [specific for writing of phylip format files]
-idlength => integer - length of the id (will pad w/
spaces if needed)
-interleaved => boolean - whether interleaved
or sequential format required
-line_length => integer of how long a sequence lines should be
-idlinebreak => insert a line break after the sequence id
so that sequence starts on the next line
-flag_SI => whether or not write a "S" or "I" just after
the num.seq. and line len., in the first line
-tag_length => integer of how long the tags have to be in
each line between the space separator. set it
to 0 to have 1 tag only.
-wrap_sequential => boolean for whether or not sequential
format should be broken up or a single line
default is false (single line)
-longid => boolean for allowing arbitrary long IDs (default is false)
Title : next_aln
Usage : $aln = $stream->next_aln()
Function: returns the next alignment in the stream.
Throws an exception if trying to read in PHYLIP
sequential format.
Returns : L<Bio::SimpleAlign> object
Args :
Title : write_aln Usage : $stream->write_aln(@aln) Function: writes the $aln object into the stream in phylip format Returns : 1 for success and 0 for error Args : L<Bio::Align::AlignI> object
Title : interleaved Usage : my $interleaved = $obj->interleaved Function: Get/Set Interleaved status Returns : boolean Args : boolean
Title : flag_SI
Usage : my $flag = $obj->flag_SI
Function: Get/Set if the Sequential/Interleaved flag has to be shown
after the number of sequences and sequence length
Example :
Returns : boolean
Args : boolean
Title : idlength Usage : my $idlength = $obj->idlength Function: Get/Set value of id length Returns : string Args : string
Title : line_length Usage : $obj->line_length($newval) Function: Returns : value of line_length Args : newvalue (optional)
Title : tag_length Usage : $obj->tag_length($newval) Function: Example : my $tag_length = $obj->tag_length Returns : value of the length for each space-separated tag in a line Args : newvalue (optional) - set to zero to have one tag per line
Title : id_linebreak Usage : $obj->id_linebreak($newval) Function: Returns : value of id_linebreak Args : newvalue (optional)
Title : wrap_sequential Usage : $obj->wrap_sequential($newval) Function: Returns : value of wrap_sequential Args : newvalue (optional)
Title : longid Usage : $obj->longid($newval) Function: Returns : value of longid Args : newvalue (optional)
| BioPerl documentation | Contained in the BioPerl distribution. |
# # BioPerl module for Bio::AlignIO::phylip # # Copyright Heikki Lehvaslaiho #
# Let the code begin... package Bio::AlignIO::phylip; use vars qw($DEFAULTIDLENGTH $DEFAULTLINELEN $DEFAULTTAGLEN); use strict; use Bio::SimpleAlign; use POSIX; # for the rounding call use base qw(Bio::AlignIO); BEGIN { $DEFAULTIDLENGTH = 10; $DEFAULTLINELEN = 60; $DEFAULTTAGLEN = 10; }
sub _initialize { my($self,@args) = @_; $self->SUPER::_initialize(@args); my ($interleave,$linelen,$idlinebreak, $idlength, $flag_SI, $tag_length,$ws, $longid) = $self->_rearrange([qw(INTERLEAVED LINE_LENGTH IDLINEBREAK IDLENGTH FLAG_SI TAG_LENGTH WRAP_SEQUENTIAL LONGID)],@args); $self->interleaved($interleave ? 1 : 0) if defined $interleave; $self->idlength($idlength || $DEFAULTIDLENGTH); $self->id_linebreak(1) if( $idlinebreak ); $self->line_length($linelen) if defined $linelen && $linelen > 0; $self->flag_SI(1) if ( $flag_SI ); $self->tag_length($tag_length) if ( $tag_length || $DEFAULTTAGLEN ); $self->wrap_sequential($ws ? 1 : 0); $self->longid($longid ? 1 : 0); 1; }
sub next_aln { my $self = shift; my $entry; my ($seqcount, $residuecount, %hash, $name,$str, @names,$seqname,$start,$end,$count,$seq); my $aln = Bio::SimpleAlign->new(-source => 'phylip'); # skip blank lines until we see header line # if we see a non-blank line that isn't the seqcount and residuecount line # then bail out of next_aln (return) HEADER: while ($entry = $self->_readline) { next if $entry =~ /^\s?$/; if ($entry =~ /\s*(\d+)\s+(\d+)/) { ($seqcount, $residuecount) = ($1, $2); } last HEADER; } return unless $seqcount and $residuecount; # first alignment section my $idlen = $self->idlength; $count = 0; my $iter = 1; my $interleaved = $self->interleaved; while( $entry = $self->_readline) { last if( $entry =~ /^\s?$/ && $interleaved ); # we've hit the next entry. if( $entry =~ /^\s+(\d+)\s+(\d+)\s*$/) { $self->_pushback($entry); last; } if( $self->longid && $entry =~ /\w/ ) { if ($entry =~ /'/) { $entry =~ /^\s*'([^']+)'\s+(.+)$/; $name = $1; $str = $2; } else { $entry =~ /^\s*([^\s]+)\s+(.+)$/; $name = $1; $str = $2; } # $name =~ s/[\s\/]/_/g; # not sure how wise is it to do this $name =~ s/_+$//; # remove any trailing _'s push @names, $name; $str =~ s/\s//g; $count = scalar @names; $hash{$count} = $str; } elsif( $entry =~ /^\s+(.+)$/ ) { $interleaved = 0; $str = $1; $str =~ s/\s//g; $count = scalar @names; $hash{$count} .= $str; } elsif( $entry =~ /^(.{$idlen})\s*(.*)\s$/ || $entry =~ /^(.{$idlen})(\S{$idlen}\s+.+)\s$/ # Handle weirdness when id is too long ) { $name = $1; $str = $2; $name =~ s/[\s\/]/_/g; $name =~ s/_+$//; # remove any trailing _'s push @names, $name; $str =~ s/\s//g; $count = scalar @names; $hash{$count} = $str; } elsif( $interleaved ) { if( $entry =~ /^(\S+)\s+(.+)/ || $entry =~ /^(.{$idlen})(.*)\s$/ ) { $name = $1; $str = $2; $name =~ s/[\s\/]/_/g; $name =~ s/_+$//; # remove any trailing _'s push @names, $name; $str =~ s/\s//g; $count = scalar @names; $hash{$count} = $str; } else { $self->debug("unmatched line: $entry"); } } $self->throw("Not a valid interleaved PHYLIP file!") if $count > $seqcount; } if( $interleaved ) { # interleaved sections $count = 0; while( $entry = $self->_readline) { # finish current entry if($entry =~/\s*\d+\s+\d+/){ $self->_pushback($entry); last; } $count = 0, next if $entry =~ /^\s$/; $entry =~ /\s*(.*)$/ && do { $str = $1; $str =~ s/\s//g; $count++; $hash{$count} .= $str; }; $self->throw("Not a valid interleaved PHYLIP file! [$count,$seqcount] ($entry)") if $count > $seqcount; } } return if scalar @names < 1; # sequence creation $count = 0; foreach $name ( @names ) { $count++; if( $name =~ /(\S+)\/(\d+)-(\d+)/ ) { $seqname = $1; $start = $2; $end = $3; } else { $seqname=$name; $start = 1; $str = $hash{$count}; # $str =~ s/[^A-Za-z]//g; #$end = length($str); } # consistency test $self->throw("Length of sequence [$seqname] is not [$residuecount] it is ".CORE::length($hash{$count})."! ") unless CORE::length($hash{$count}) == $residuecount; $seq = Bio::LocatableSeq->new('-seq' => $hash{$count}, '-display_id' => $seqname, '-start' => $start, (defined $end) ? ('-end' => $end) : (), '-alphabet' => $self->alphabet, ); $aln->add_seq($seq); } return $aln if $aln->num_sequences; return; }
sub write_aln { my ($self,@aln) = @_; my $count = 0; my $wrapped = 0; my $maxname; my $width = $self->line_length(); my ($length,$date,$name,$seq,$miss,$pad, %hash,@arr,$tempcount,$index,$idlength,$flag_SI,$line_length, $tag_length); foreach my $aln (@aln) { if( ! $aln || ! $aln->isa('Bio::Align::AlignI') ) { $self->warn("Must provide a Bio::Align::AlignI object when calling write_aln"); next; } $self->throw("All sequences in the alignment must be the same length") unless $aln->is_flush(1) ; $flag_SI = $self->flag_SI(); $aln->set_displayname_flat(); # plain $length = $aln->length(); if ($flag_SI) { if ($self->interleaved() ) { $self->_print (sprintf(" %s %s I\n", $aln->num_sequences, $aln->length)); } else { $self->_print (sprintf(" %s %s S\n", $aln->num_sequences, $aln->length)); } } else { $self->_print (sprintf(" %s %s\n", $aln->num_sequences, $aln->length)); } $idlength = $self->idlength(); $line_length = $self->line_length(); $tag_length = $self->tag_length(); foreach $seq ( $aln->each_seq() ) { $name = $aln->displayname($seq->get_nse); if ($self->longid) { $self->warn("The lenght of the name is over 50 chars long [$name]") if length($name) > 50; $name = "'$name' " } else { $name = substr($name, 0, $idlength) if length($name) > $idlength; $name = sprintf("%-".$idlength."s",$name); if( $self->interleaved() ) { $name .= ' ' ; } elsif( $self->id_linebreak) { $name .= "\n"; } } #phylip needs dashes not dots my $seq = $seq->seq(); $seq =~ s/\./-/g; $hash{$name} = $seq; push(@arr,$name); } if( $self->interleaved() ) { my $numtags; if ($tag_length <= $line_length) { $numtags = floor($line_length/$tag_length); $line_length = $tag_length*$numtags; } else { $numtags = 1; } while( $count < $length ) { # there is another block to go! foreach $name ( @arr ) { my $dispname = $name; $dispname = '' if $wrapped; $self->_print (sprintf("%".($idlength+3)."s",$dispname)); $tempcount = $count; $index = 0; $self->debug("residue count: $count\n") if ($count%100000 == 0); while( ($tempcount + $tag_length < $length) && ($index < $numtags) ) { $self->_print (sprintf("%s ",substr($hash{$name}, $tempcount, $tag_length))); $tempcount += $tag_length; $index++; } # last if( $index < $numtags) { # space to print! $self->_print (sprintf("%s ",substr($hash{$name}, $tempcount))); $tempcount += $tag_length; } $self->_print ("\n"); } $self->_print ("\n"); $count = $tempcount; $wrapped = 1; } } else { foreach $name ( @arr ) { my $dispname = $name; my $line = sprintf("%s%s\n",$dispname,$hash{$name}); if( $self->wrap_sequential ) { $line =~ s/(.{1,$width})/$1\n/g; } $self->_print ($line); } } } $self->flush if $self->_flush_on_write && defined $self->_fh; return 1; }
sub interleaved { my ($self,$value) = @_; if( defined $value ) { if ($value) {$self->{'_interleaved'} = 1 } else {$self->{'_interleaved'} = 0 } } return 1 unless defined $self->{'_interleaved'}; return $self->{'_interleaved'}; }
sub flag_SI{ my ($self,$value) = @_; my $previous = $self->{'_flag_SI'}; if( defined $value ) { $self->{'_flag_SI'} = $value; } return $previous; }
sub idlength { my($self,$value) = @_; if (defined $value){ $self->{'_idlength'} = $value; } return $self->{'_idlength'}; }
sub line_length{ my ($self,$value) = @_; if( defined $value) { $self->{'_line_length'} = $value; } return $self->{'_line_length'} || $DEFAULTLINELEN; }
sub tag_length{ my ($self,$value) = @_; if( defined $value) { $self->{'_tag_length'} = $value; } return $self->{'_tag_length'} || $DEFAULTTAGLEN; }
sub id_linebreak{ my ($self,$value) = @_; if( defined $value) { $self->{'_id_linebreak'} = $value; } return $self->{'_id_linebreak'} || 0; }
sub wrap_sequential{ my ($self,$value) = @_; if( defined $value) { $self->{'_wrap_sequential'} = $value; } return $self->{'_wrap_sequential'} || 0; }
sub longid{ my ($self,$value) = @_; if( defined $value) { $self->{'_longid'} = $value; } return $self->{'_longid'} || 0; } 1;