| Boulder documentation | Contained in the Boulder distribution. |
Boulder::Blast::NCBI - Parse and read NCBI BLAST files
Not for direct use. Use Boulder::Blast instead.
Specialized parser for NCBI format BLAST output. Loaded automatically by Boulder::Blast.
Boulder, Boulder::GenBank, Boulder::Blast
Lincoln Stein <lstein@cshl.org>.
Copyright (c) 1998 Cold Spring Harbor Laboratory
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. See DISCLAIMER.txt for disclaimers of warranty.
| Boulder documentation | Contained in the Boulder distribution. |
package Boulder::Blast::NCBI; # NCBI BLAST file format parsing
use strict; use File::Basename; use Stone; use Boulder::Stream; use Carp; use vars qw($VERSION @ISA); @ISA = 'Boulder::Blast'; $VERSION = 1.02; sub _read_record { my $self = shift; my $fh = shift; my $stone = shift; # we don't find out about the name of the database or the parameters until we # get to the bottom of the file. Too bad. # loop until we find the query name my $line; do { $line = <$fh>; } until ($line && $line=~/^Query=/); if ($line && $line =~ /^Query=\s+(\S+)/) { $stone->insert(Blast_query => $1); } else { croak "Couldn't find query line!"; } do { $line = <$fh>; } until $line=~/([\d,]+) letters/; croak "Couldn't find query length!" unless $1; (my $len = $1) =~ s/,//g; $stone->insert(Blast_query_length => $len); # Read down to the first hit, if any. If we hit /^Parameters/, then we had no # hits. while (<$fh> ) { last if /^(>|\s+Database)/; } if (/^>/) { # we found some hits my $hits = $self->parse_hits($_); $stone->insert(Blast_hits => $hits); } # At this point, one way or another, we're pointing at the Database # line. # Now we should be pointing at statistics while (<$fh>) { if (/Database: (.*)/) { my $db = $1; $stone->insert(Blast_db => $db); $stone->insert(Blast_db_title => basename($db)); } # in case match title, overwrite previous setting of Blast_db_title $stone->insert(Blast_db_title => $1) if /Title: (.*)/; $stone->insert(Blast_db_date => $1) if /Posted date:\s+(.*)/; last if /Lambda/; } # At this point, one way or another, we're pointing at the Matrix # line. We create a parameter stone to hold the results my $parms = new Stone; chomp ($line = <$fh>); if (my ($lambda,$k,$h) = $line =~ /([\d.-]+)/g) { $parms->insert('Lambda' => {Lambda => $lambda, K => $k, H => $h}); } chomp ($line = <$fh>); chomp ($line = <$fh>); if ($line =~ /^Gapped/) { chomp ($line = <$fh>); if (my ($lambda,$k,$h) = $line =~ /([\d.-]+)/g) { $parms->insert('GappedLambda' => {Lambda => $lambda, K => $k, H => $h}); } } while (<$fh>) { chomp; $parms->insert(GapPenalties => $1) if /^Gap Penalties: (.+)/; $parms->insert(Matrix => $1) if /^Matrix: (.+)/; $parms->insert(HSPLength => $1) if /^effective HSP length: (.+)/; $parms->insert(DBLength => $1) if /^length of database: (.+)/; $parms->insert(T => $1) if /T: (.+)/; $parms->insert(A => $1) if /A: (.+)/; $parms->insert(X1 => $1) if /X1: (\d+)/; $parms->insert(X2 => $1) if /X2: (\d+)/; $parms->insert(S1 => $1) if /S1: (\d+)/; $parms->insert(S2 => $1) if /S2: (\d+)/; last if /^S2/; } $stone->insert(Blast_parms => $parms); # finally done! $stone; } # parse the hits and HSPs sub parse_hits { my $self = shift; $_ = shift; my $fh = $self->_fh; my (@hits,@hsps,$accession,$orientation,$hit,$hsp); my ($qstart,$qend,$tstart,$tend); my ($query,$match,$target,$done,$new_hit,$new_hsp); my $signif = 9999; my $expect = 9999; my $ident = 0; while (!$done) { chomp; next unless length($_); $done = /^\s+Database/; # here's how we get out of the loop $new_hit = /^>(\S+)/; $new_hsp = $accession && /Score\s+=\s+([\d.e+]+)\s+bits\s+\((\S+)\)/; # hit a new HSP section if ( $done || $new_hit || $new_hsp ) { if ($hsp) { croak "base alignment is out of whack" unless length($query) == length($target); $hsp->insert(Query => $query, Subject => $target, Alignment => substr($match,0,length($query)), ); $hsp->insert(Query_start => $qstart, Query_end => $qend, Subject_start => $tstart, Subject_end => $tend, Length => 1 + $qend - $qstart, Orientation => $tend > $tstart ? 'plus' : 'minus', ); push(@hsps,$hsp); undef $hsp; } if ($new_hsp) { $hsp = new Stone; $hsp->insert(Score => $2, Bits => $1); ($qstart,$qend,$tstart,$tend) = (undef,undef,undef,undef); # undef all ($query,$match,$target) = (undef,undef,undef); # these too } } # hit a new subject section if ( $done || $new_hit ) { $accession = $1; if ($hit) { $signif = $expect if $signif == 9999; $hit->insert(Hsps => \@hsps, Signif => $signif, Identity => $ident, Expect => $expect, ) if @hsps; undef @hsps; push(@hits,$hit); ($signif,$expect,$ident) = (9999,9999,0); # reset max values } if ($new_hit) { $hit = new Stone; $hit->insert(Name => $accession); next; } } # hit the length = line if (/Length\s*=\s*([\d,]+)/) { (my $len = $1) =~ s/,//g; $hit->insert(Length => $len); next; } # hit the Plus|Minus Strand line if (/(Plus|Minus) Strand HSPs/) { $orientation = lc $1; next; } # None of the following is relevant unless $hsp is defined next unless $hsp; if (/Expect = ([+e\d\.-]+)/) { $hsp->insert(Expect => $1); $expect = $1 < $expect ? $1 : $expect; } if (/P(?:\(\d+\))? = (\S+)/) { $hsp->insert(Signif => $1); $signif = $1 < $signif ? $1 : $signif; } if (/Identities = \S+ \((\d+)%?\)/) { my $idn = $1; $hsp->insert(Identity => "$idn%"); $ident = $idn > $ident ? $idn : $ident; } $hsp->insert(Positives => $1) if /Positives = \S+ \((\S+)\)/; $hsp->insert(Strand => $1) if /Strand =\s+([^,]+)/; $hsp->insert(Frame => $1) if /Frame =\s+([^,]+)/; # process the query sequence if (/^Query:\s+(\d+)\s*(\S+)\s+(\d+)/) { $qstart ||= $1; $qend = $3; $query .= $2; next; } # process the target sequence if (/^Sbjct:\s+(\d+)\s*(\S+)\s+(\d+)/) { $tstart ||= $1; $tend = $3; $target .= $2; next; } # anything else is going to be the match string # this is REALLY UGLY because we have to extract absolute # positions $match .= substr($_,12,60) if $query; } continue { $_ = <$fh>; } return \@hits; } 1;