Bio::DB::SeqFeature::Store::berkeleydb3 - Storage and retrieval of sequence


BioPerl documentation Contained in the BioPerl distribution.

Index


Code Index:

NAME

Top

Bio::DB::SeqFeature::Store::berkeleydb3 -- Storage and retrieval of sequence annotation data in Berkeleydb files

SYNOPSIS

Top

  # Create a feature database from scratch
  $db     = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
                                             -dsn     => '/var/databases/fly4.3',
                                             -create  => 1);

  # get a feature from somewhere
  my $feature = Bio::SeqFeature::Generic->new(...);

  # store it
  $db->store($feature) or die "Couldn't store!";

DESCRIPTION

Top

This is a faster version of the berkeleydb storage adaptor for Bio::DB::SeqFeature::Store. It is used automatically when you create a new database with the original berkeleydb adaptor. When opening a database created under the original adaptor, the old code is used for backward compatibility.

Please see Bio::DB::SeqFeature::Store::berkeleydb for full usage instructions.

BUGS

Top

This is an early version, so there are certainly some bugs. Please use the BioPerl bug tracking system to report bugs.

SEE ALSO

Top

bioperl, Bio::DB::SeqFeature, Bio::DB::SeqFeature::Store, Bio::DB::SeqFeature::GFF3Loader, Bio::DB::SeqFeature::Segment, Bio::DB::SeqFeature::Store::memory, Bio::DB::SeqFeature::Store::DBI::mysql,

AUTHOR

Top

Lincoln Stein <lincoln.stein@gmail.com>.

Copyright (c) 2009 Ontario Institute for Cancer Research

This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.


BioPerl documentation Contained in the BioPerl distribution.

package Bio::DB::SeqFeature::Store::berkeleydb3;

# $Id: berkeleydb3.pm 15987 2009-08-18 21:08:55Z lstein $
# faster implementation of berkeleydb

use strict;
use base 'Bio::DB::SeqFeature::Store::berkeleydb';
use DB_File;
use Fcntl qw(O_RDWR O_CREAT :flock);
use Bio::DB::GFF::Util::Rearrange 'rearrange';

# can't have more sequence ids than this
use constant MAX_SEQUENCES => 1_000_000_000;

# used to construct the bin key
use constant C1  => 500_000_000; # limits chromosome length to 500 megabases
use constant C2  => 1000*C1;     # at most 1000 chromosomes

use constant BINSIZE       => 10_000;
use constant MININT        => -999_999_999_999;
use constant MAXINT        => 999_999_999_999;
use constant SUMMARY_BIN_SIZE => 1000;

sub version { return 3.0 }

sub open_index_dbs {
    my $self = shift;
    my ($flags,$create) = @_;

    # Create the main index databases; these are DB_BTREE implementations with duplicates allowed.
    $DB_BTREE->{flags}  = R_DUP;

    my $string_cmp          = DB_File::BTREEINFO->new;
    $string_cmp->{flags}    = R_DUP;
    $string_cmp->{compare}  = sub { lc $_[0] cmp lc $_[1] };

    my $numeric_cmp         = DB_File::BTREEINFO->new;
    $numeric_cmp->{flags}   = R_DUP;
    $numeric_cmp->{compare} = sub { $_[0] <=> $_[1] };

    for my $idx ($self->_index_files) {
	my $path    = $self->_qualify("$idx.idx");
	my %db;
	my $dbtype  = $idx eq 'locations' ? $numeric_cmp
	             :$idx eq 'summary'   ? $numeric_cmp
                     :$idx eq 'types'     ? $numeric_cmp
                     :$idx eq 'seqids'    ? $DB_HASH
                     :$idx eq 'typeids'   ? $DB_HASH
                     :$string_cmp;

	tie(%db,'DB_File',$path,$flags,0666,$dbtype)
	    or $self->throw("Couldn't tie $path: $!");
	%db = () if $create;
	$self->index_db($idx=>\%db);
    }

}

sub seqid_db  { shift->index_db('seqids')    }
sub typeid_db { shift->index_db('typeids') }

sub _delete_databases {
    my $self = shift;
    $self->SUPER::_delete_databases;
}

# given a seqid (name), return its denormalized numeric representation
sub seqid_id {
    my $self   = shift;
    my $seqid  = shift;
    my $db     = $self->seqid_db;
    return $db->{lc $seqid};
}

sub add_seqid {
    my $self  = shift;
    my $seqid = shift;

    my $db    = $self->seqid_db;
    my $key   = lc $seqid;
    $db->{$key} = ++$db->{'.nextid'} unless exists $db->{$key};
    die "Maximum number of sequence ids exceeded. This module can handle up to ",
        MAX_SEQUENCES," unique ids" if $db->{$key} > MAX_SEQUENCES;
    return $db->{$key};
}

# given a seqid (name), return its denormalized numeric representation
sub type_id {
    my $self   = shift;
    my $typeid  = shift;
    my $db     = $self->typeid_db;
    return $db->{$typeid};
}

sub add_typeid {
    my $self  = shift;
    my $typeid = shift;

    my $db      = $self->typeid_db;
    my $key     = lc $typeid;
    $db->{$key} = ++$db->{'.nextid'} unless exists $db->{$key};
    return $db->{$key};
}

sub _seq_ids {
    my $self = shift;
    if (my $fa = $self->{fasta_db}) {
	if (my @s = eval {$fa->ids}) {
	    return @s;
	}
    } 
    my $l = $self->seqid_db or return;
    return grep {!/^\./} keys %$l;
}

sub _index_files {
    return (shift->SUPER::_index_files,'seqids','typeids','summary');
}

sub _update_indexes {
    my $self = shift;
    my $obj  = shift;
    defined (my $id   = $obj->primary_id) or return;
    $self->SUPER::_update_indexes($obj);
    $self->_update_seqid_index($obj,$id);
}

sub _update_seqid_index {
    my $self = shift;
    my ($obj,$id,$delete) = @_;
    my $seq_name = $obj->seq_id;
    $self->add_seqid(lc $seq_name);
}

sub _update_type_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;
  my $db = $self->index_db('types')
    or $self->throw("Couldn't find 'types' index file");

  my $key         = $self->_obj_to_type($obj);
  my $typeid      = $self->add_typeid($key);
  $self->update_or_delete($delete,$db,$typeid,$id);
}

sub _obj_to_type {
    my $self = shift;
    my $obj  = shift;
    my $tag         = $obj->primary_tag;
    my $source_tag  = $obj->source_tag || '';
    return unless defined $tag;

    $tag           .= ":$source_tag";
    return lc $tag;
}

sub types {
    my $self = shift;
    eval "require Bio::DB::GFF::Typename" 
	unless Bio::DB::GFF::Typename->can('new');
    my $db   = $self->typeid_db;
    return grep {!/^\./} map {Bio::DB::GFF::Typename->new($_)} keys %$db;
}

sub _id2type {
    my $self = shift;
    my $wanted_id   = shift;

    my $db = $self->typeid_db;
    while (my($key,$id) = each %$db) {
	next if $key =~ /^\./;
	return $key if $id == $wanted_id;
    }
    return;
}

# return a hash of typeids that match a human-readable type
sub _matching_types {
    my $self  = shift;
    my $types = shift;
    my @types = ref $types eq 'ARRAY' ?  @$types : $types;
    my $db   = $self->typeid_db;

    my %result;
    my @all_types;

    for my $type (@types) {
	my ($primary_tag,$source_tag);
	if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
	    $primary_tag = $type->method;
	    $source_tag  = $type->source;
	} else {
	    ($primary_tag,$source_tag) = split ':',$type,2;
	}
	if (defined $source_tag) {
	    my $id = $db->{lc "$primary_tag:$source_tag"};
	    $result{$id}++ if defined $id;
	} else {
	    @all_types  = $self->types unless @all_types;
	    $result{$db->{$_}}++ foreach grep {/^$primary_tag:/} @all_types;
	}
    }
    return \%result;
}

sub _update_location_index {
  my $self = shift;
  my ($obj,$id,$delete) = @_;

  my $db = $self->index_db('locations')
    or $self->throw("Couldn't find 'locations' index file");

  my $seq_id      = $obj->seq_id || '';
  my $start       = $obj->start  || '';
  my $end         = $obj->end    || '';
  my $strand      = $obj->strand;
  my $bin_min     = int $start/BINSIZE;
  my $bin_max     = int $end/BINSIZE;

  my $typeid      = $self->add_typeid($self->_obj_to_type($obj));
  my $seq_no      = $self->add_seqid($seq_id);

  for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) {
    my $key = $seq_no * MAX_SEQUENCES + $bin;
    $self->update_or_delete($delete,$db,$key,pack("i5",$id,$start,$end,$strand,$typeid));
  }

}

sub _features {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,
      $name,$class,$allow_aliases,
      $types,
      $attributes,
      $range_type,
      $iterator
     ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND',
		    'NAME','CLASS','ALIASES',
		    ['TYPES','TYPE','PRIMARY_TAG'],
		    ['ATTRIBUTES','ATTRIBUTE'],
		    'RANGE_TYPE',
		    'ITERATOR',
		   ],@_);

  my (@from,@where,@args,@group);
  $range_type ||= 'overlaps';

  my @result;
  unless (defined $name or defined $seq_id or defined $types or defined $attributes) {
      my $is_indexed = $self->index_db('is_indexed');
      @result = $is_indexed ? grep {$is_indexed->{$_}} keys %{$self->db}
                            : grep { !/^\./ }keys %{$self->db};
  }

  my %found = ();
  my $result = 1;

  if (defined($name)) {
    # hacky backward compatibility workaround
    undef $class if $class && $class eq 'Sequence';
    $name     = "$class:$name" if defined $class && length $class > 0;
    $result &&= $self->filter_by_name($name,$allow_aliases,\%found);
  }

  if (defined $seq_id) { # location with or without types
      my $typelist = defined $types ? $self->_matching_types($types) : undef;
      $result &&= $self->filter_by_type_and_location($seq_id,$start,$end,$strand,$range_type,
						     $typelist, \%found);
  }

  elsif (defined $types) { # types without location
      $result &&= $self->filter_by_type($types,\%found);
  }

  if (defined $attributes) {
    $result &&= $self->filter_by_attribute($attributes,\%found);
  }

  push @result,keys %found if $result;
  return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result)
                   : map {$self->fetch($_)} @result;
}

sub filter_by_type {
  my $self = shift;
  my ($types,$filter) = @_;
  my @types = ref $types eq 'ARRAY' ?  @$types : $types;

  my $index = $self->index_db('types');
  my $db    = tied(%$index);

  my @results;

  for my $type (@types) {
    my ($primary_tag,$source_tag);
    if (ref $type && $type->isa('Bio::DB::GFF::Typename')) {
      $primary_tag = $type->method;
      $source_tag  = $type->source;
    } else {
      ($primary_tag,$source_tag) = split ':',$type,2;
    }
    $source_tag ||= '';
    $primary_tag  = quotemeta($primary_tag);
    $source_tag    = quotemeta($source_tag);
    my $match = length $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:";
    my $key      = lc "$primary_tag:$source_tag";
    my $value;

    # If filter is already provided, then it is usually faster to
    # fetch each object.
    if (%$filter) {  
	for my $id (keys %$filter) {
	    my $obj = $self->_fetch($id) or next;
	    push @results,$id if $obj->type =~ /$match/i;
	}

    }

    else {
	my $types   = $self->typeid_db;
	my @typeids = map {$types->{$_}} grep {/$match/} keys %$types;
	for my $t (@typeids) {
	    my $k = $t;
	    for (my $status = $db->seq($k,$value,R_CURSOR);
		 $status == 0 && $k == $t;
		 $status = $db->seq($k,$value,R_NEXT)) {
		next if %$filter && !$filter->{$value};  # don't even bother
		push @results,$value;
	    }
	}
    }
  }
  $self->update_filter($filter,\@results);
}

sub filter_by_type_and_location {
  my $self = shift;
  my ($seq_id,$start,$end,$strand,$range_type,$typelist,$filter) = @_;
  $strand ||= 0;

  my $index    = $self->index_db('locations');
  my $db       = tied(%$index);

  my $binstart = defined $start ? int $start/BINSIZE : 0;
  my $binend   = defined $end   ? int $end/BINSIZE   : MAX_SEQUENCES-1;

  my %seenit;
  my @results;

  $start = MININT  if !defined $start;
  $end   = MAXINT  if !defined $end;

  my $seq_no = $self->seqid_id($seq_id);
  return unless defined $seq_no;

  if ($range_type eq 'overlaps' or $range_type eq 'contains') {
    my $keystart = $seq_no * MAX_SEQUENCES + $binstart;
    my $keystop  = $seq_no * MAX_SEQUENCES + $binend;
    my $value;

    for (my $status = $db->seq($keystart,$value,R_CURSOR);
	 $status == 0 && $keystart <= $keystop;
	 $status = $db->seq($keystart,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand,$ftype) = unpack("i5",$value);
      next if $seenit{$id}++;
      next if $strand   && $fstrand != $strand;
      next if $typelist && !$typelist->{$ftype};
      if ($range_type eq 'overlaps') {
	next unless $fend >= $start && $fstart <= $end;
      }
      elsif ($range_type eq 'contains') {
	next unless $fstart >= $start && $fend <= $end;
      }
      next if %$filter && !$filter->{$id};  # don't bother
      push @results,$id;
    }
  }

  # for contained in, we look for features originating and terminating outside the specified range
  # this is incredibly inefficient, but fortunately the query is rare (?)
  elsif ($range_type eq 'contained_in') {
    my $keystart = $seq_no * MAX_SEQUENCES;
    my $keystop  = $seq_no * MAX_SEQUENCES + $binstart;
    my $value;

    # do the left part of the range
    for (my $status = $db->seq($keystart,$value,R_CURSOR);
	 $status == 0 && $keystart <= $keystop;
	 $status = $db->seq($keystart,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand,$ftype) = unpack("i5",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next if $typelist && !$typelist->{$ftype};
      next unless $fstart <= $start && $fend >= $end;
      next if %$filter && !$filter->{$id};  # don't bother
      push @results,$id;
    }

    # do the right part of the range
    $keystart = $seq_no*MAX_SEQUENCES+$binend;
    for (my $status = $db->seq($keystart,$value,R_CURSOR);
	 $status == 0;
	 $status = $db->seq($keystart,$value,R_NEXT)) {
      my ($id,$fstart,$fend,$fstrand,$ftype) = unpack("i5",$value);
      next if $seenit{$id}++;
      next if $strand && $fstrand != $strand;
      next unless $fstart <= $start && $fend >= $end;
      next if $typelist && !$typelist->{$ftype};
      next if %$filter && !$filter->{$id};  # don't bother
      push @results,$id;
    }

  }

  $self->update_filter($filter,\@results);
}

sub build_summary_statistics {
    my $self = shift;

    my $insert = $self->index_db('summary');
    %$insert   = ();

    my $current_bin = -1;
    my (%residuals,$last_bin);

    my $le = -t \*STDERR ? "\r" : "\n";

    print STDERR "\n";

    # iterate through all the indexed features
    my $sbs      = SUMMARY_BIN_SIZE;

    # Sadly we have to do this in two steps. In the first step, we sort
    # features by typeid,seqid,start. In the second step, we read through
    # this sorted list. To avoid running out of memory, we use a db_file
    # temporary database
    my $fh   = File::Temp->new() or $self->throw("Couldn't create temporary file for sorting: $!");
    my $name = $fh->filename;
    my %sort;
    my $numeric_cmp         = DB_File::BTREEINFO->new;
    $numeric_cmp->{compare} = sub { $_[0] <=> $_[1] };
    $numeric_cmp->{flags}   = R_DUP;
    my $s = tie %sort,'DB_File',$name,0666,O_CREAT|O_RDWR,$numeric_cmp 
	or $self->throw("Couldn't create temporary file for sorting: $!");

    my $index    = $self->index_db('locations');
    my $db       = tied(%$index);
    my $keystart = 0;
    my ($value,$count);
    my %seenit;

    for (my $status = $db->seq($keystart,$value,R_CURSOR);
	 $status == 0;
	 $status  = $db->seq($keystart,$value,R_NEXT)) {
	my ($id,$start,$end,$strand,$typeid) = unpack('i5',$value);
	next if $seenit{$id}++;

	print STDERR $count," features sorted$le" if ++$count % 1000 == 0;
	my $seqid = int($keystart / MAX_SEQUENCES);
	my $key   = $self->_encode_summary_key($typeid,$seqid,$start-1);
	$sort{$key}=$end;
    }
    print STDERR "COUNT = $count\n";
    
    my ($current_type,$current_seqid,$end);
    my $cum_count = 0;

    $keystart = 0;
    $count    = 0;

    # the second step allows us to iterate through this
    for (my $status = $s->seq($keystart,$end,R_CURSOR);
	 $status == 0;
	 $status  = $s->seq($keystart,$end,R_NEXT)) {

	print STDERR $count," features processed$le" if ++$count % 1000 == 0;
	my ($typeid,$seqid,$start) = $self->_decode_summary_key($keystart);

	my $bin   = int($start/$sbs);
	
	# because the input is sorted by start, no more features will contribute to the 
	# current bin so we can dispose of it
	if ($bin != $current_bin) {
	    if ($seqid != $current_seqid or $typeid != $current_type) {
		# load all bins left over
		$self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid);
		%residuals = () ;
		$cum_count = 0;
	    } else {
		# load all up to current one
		$self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid,$current_bin); 
	    }
	}

	$last_bin = $current_bin;
	($current_seqid,$current_type,$current_bin) = ($seqid,$typeid,$bin);

	# summarize across entire spanned region
	my $last_bin = int(($end-1)/$sbs);
	for (my $b=$bin;$b<=$last_bin;$b++) {
	    $residuals{$b}++;
	}
    }

    # handle tail case
    # load all bins left over	
    $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid);

    undef %sort;
    undef $fh;
}

sub _load_bins {
    my $self = shift;
    my ($insert,$residuals,$cum_count,$typeid,$seqid,$stop_after) = @_;
    for my $b (sort {$a<=>$b} keys %$residuals) {
	last if defined $stop_after and $b > $stop_after;
	$$cum_count += $residuals->{$b};
	my $key         = $self->_encode_summary_key($typeid,$seqid,$b);
	$insert->{$key} = $$cum_count;
	delete $residuals->{$b}; # no longer needed
    }
}

sub coverage_array {
    my $self = shift;
    my ($seq_name,$start,$end,$types,$bins) = 
	rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],
		   ['TYPES','TYPE','PRIMARY_TAG'],'BINS'],@_);

    $bins  ||= 1000;
    $start ||= 1;
    unless ($end) {
	my $segment = $self->segment($seq_name) or $self->throw("unknown seq_id $seq_name");
	$end        = $segment->end;
    }

    my $binsize = ($end-$start+1)/$bins;
    my $seqid   = $self->seqid_id($seq_name) || 0;

    return [] unless $seqid;

    # where each bin starts
    my @his_bin_array = map {$start + $binsize * $_}       (0..$bins);
    my @sum_bin_array = map {int(($_-1)/SUMMARY_BIN_SIZE)} @his_bin_array;

    my $interval_stats_idx = $self->index_db('summary');
    my $db                 = tied(%$interval_stats_idx);
    my $t                  = $self->_matching_types($types);

    my (%bins,$report_tag);
    for my $typeid (sort keys %$t) {
	$report_tag ||= $typeid;

	for (my $i=0;$i<@sum_bin_array;$i++) {
	    my $cum_count;
	    my $bin = $sum_bin_array[$i];
	    my $key = $self->_encode_summary_key($typeid,$seqid,$bin);
	    my $status = $db->seq($key,$cum_count,R_CURSOR);
	    next unless $status == 0;
	    push @{$bins{$typeid}},[$bin,$cum_count];
	}
    }
    
    my @merged_bins;
    my $firstbin = int(($start-1)/$binsize);
    for my $type (keys %bins) {
	my $arry       = $bins{$type};
	my $last_count = $arry->[0][1]-1;
	my $last_bin   = -1;
	my $i          = 0;
	my $delta;
	for my $b (@$arry) {
	    my ($bin,$count) = @$b;
	    $delta              = $count - $last_count if $bin > $last_bin;
	    $merged_bins[$i++]  = $delta;
	    $last_count         = $count;
	    $last_bin           = $bin;
	}
    }
    my $returned_type = $self->_id2type($report_tag);
    return wantarray ? (\@merged_bins,$returned_type) : \@merged_bins;
}


sub _encode_summary_key {
    my $self                 = shift;
    my ($typeid,$seqid,$bin) = @_;
    $self->throw('Cannot index chromosomes larger than '.C1*SUMMARY_BIN_SIZE/1e6.' megabases')
	if $bin > C1;
    return ($typeid-1)*C2 + ($seqid-1)*C1 + $bin;
}

sub _decode_summary_key {
    my $self   = shift;
    my $key    = shift;
    my $typeid   = int($key/C2);
    my $residual =     $key%C2;
    my $seqid    = int($residual/C1);
    my $bin      =     $residual%C1;
    return ($typeid+1,$seqid+1,$bin);
}


1;