| BioPerl documentation | Contained in the BioPerl distribution. |
Bio::DB::SeqFeature::Store::berkeleydb3 -- Storage and retrieval of sequence annotation data in Berkeleydb files
# Create a feature database from scratch
$db = Bio::DB::SeqFeature::Store->new( -adaptor => 'berkeleydb',
-dsn => '/var/databases/fly4.3',
-create => 1);
# get a feature from somewhere
my $feature = Bio::SeqFeature::Generic->new(...);
# store it
$db->store($feature) or die "Couldn't store!";
This is a faster version of the berkeleydb storage adaptor for Bio::DB::SeqFeature::Store. It is used automatically when you create a new database with the original berkeleydb adaptor. When opening a database created under the original adaptor, the old code is used for backward compatibility.
Please see Bio::DB::SeqFeature::Store::berkeleydb for full usage instructions.
This is an early version, so there are certainly some bugs. Please use the BioPerl bug tracking system to report bugs.
bioperl, Bio::DB::SeqFeature, Bio::DB::SeqFeature::Store, Bio::DB::SeqFeature::GFF3Loader, Bio::DB::SeqFeature::Segment, Bio::DB::SeqFeature::Store::memory, Bio::DB::SeqFeature::Store::DBI::mysql,
Lincoln Stein <lincoln.stein@gmail.com>.
Copyright (c) 2009 Ontario Institute for Cancer Research
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
| BioPerl documentation | Contained in the BioPerl distribution. |
package Bio::DB::SeqFeature::Store::berkeleydb3; # $Id: berkeleydb3.pm 15987 2009-08-18 21:08:55Z lstein $ # faster implementation of berkeleydb
use strict; use base 'Bio::DB::SeqFeature::Store::berkeleydb'; use DB_File; use Fcntl qw(O_RDWR O_CREAT :flock); use Bio::DB::GFF::Util::Rearrange 'rearrange'; # can't have more sequence ids than this use constant MAX_SEQUENCES => 1_000_000_000; # used to construct the bin key use constant C1 => 500_000_000; # limits chromosome length to 500 megabases use constant C2 => 1000*C1; # at most 1000 chromosomes use constant BINSIZE => 10_000; use constant MININT => -999_999_999_999; use constant MAXINT => 999_999_999_999; use constant SUMMARY_BIN_SIZE => 1000; sub version { return 3.0 } sub open_index_dbs { my $self = shift; my ($flags,$create) = @_; # Create the main index databases; these are DB_BTREE implementations with duplicates allowed. $DB_BTREE->{flags} = R_DUP; my $string_cmp = DB_File::BTREEINFO->new; $string_cmp->{flags} = R_DUP; $string_cmp->{compare} = sub { lc $_[0] cmp lc $_[1] }; my $numeric_cmp = DB_File::BTREEINFO->new; $numeric_cmp->{flags} = R_DUP; $numeric_cmp->{compare} = sub { $_[0] <=> $_[1] }; for my $idx ($self->_index_files) { my $path = $self->_qualify("$idx.idx"); my %db; my $dbtype = $idx eq 'locations' ? $numeric_cmp :$idx eq 'summary' ? $numeric_cmp :$idx eq 'types' ? $numeric_cmp :$idx eq 'seqids' ? $DB_HASH :$idx eq 'typeids' ? $DB_HASH :$string_cmp; tie(%db,'DB_File',$path,$flags,0666,$dbtype) or $self->throw("Couldn't tie $path: $!"); %db = () if $create; $self->index_db($idx=>\%db); } } sub seqid_db { shift->index_db('seqids') } sub typeid_db { shift->index_db('typeids') } sub _delete_databases { my $self = shift; $self->SUPER::_delete_databases; } # given a seqid (name), return its denormalized numeric representation sub seqid_id { my $self = shift; my $seqid = shift; my $db = $self->seqid_db; return $db->{lc $seqid}; } sub add_seqid { my $self = shift; my $seqid = shift; my $db = $self->seqid_db; my $key = lc $seqid; $db->{$key} = ++$db->{'.nextid'} unless exists $db->{$key}; die "Maximum number of sequence ids exceeded. This module can handle up to ", MAX_SEQUENCES," unique ids" if $db->{$key} > MAX_SEQUENCES; return $db->{$key}; } # given a seqid (name), return its denormalized numeric representation sub type_id { my $self = shift; my $typeid = shift; my $db = $self->typeid_db; return $db->{$typeid}; } sub add_typeid { my $self = shift; my $typeid = shift; my $db = $self->typeid_db; my $key = lc $typeid; $db->{$key} = ++$db->{'.nextid'} unless exists $db->{$key}; return $db->{$key}; } sub _seq_ids { my $self = shift; if (my $fa = $self->{fasta_db}) { if (my @s = eval {$fa->ids}) { return @s; } } my $l = $self->seqid_db or return; return grep {!/^\./} keys %$l; } sub _index_files { return (shift->SUPER::_index_files,'seqids','typeids','summary'); } sub _update_indexes { my $self = shift; my $obj = shift; defined (my $id = $obj->primary_id) or return; $self->SUPER::_update_indexes($obj); $self->_update_seqid_index($obj,$id); } sub _update_seqid_index { my $self = shift; my ($obj,$id,$delete) = @_; my $seq_name = $obj->seq_id; $self->add_seqid(lc $seq_name); } sub _update_type_index { my $self = shift; my ($obj,$id,$delete) = @_; my $db = $self->index_db('types') or $self->throw("Couldn't find 'types' index file"); my $key = $self->_obj_to_type($obj); my $typeid = $self->add_typeid($key); $self->update_or_delete($delete,$db,$typeid,$id); } sub _obj_to_type { my $self = shift; my $obj = shift; my $tag = $obj->primary_tag; my $source_tag = $obj->source_tag || ''; return unless defined $tag; $tag .= ":$source_tag"; return lc $tag; } sub types { my $self = shift; eval "require Bio::DB::GFF::Typename" unless Bio::DB::GFF::Typename->can('new'); my $db = $self->typeid_db; return grep {!/^\./} map {Bio::DB::GFF::Typename->new($_)} keys %$db; } sub _id2type { my $self = shift; my $wanted_id = shift; my $db = $self->typeid_db; while (my($key,$id) = each %$db) { next if $key =~ /^\./; return $key if $id == $wanted_id; } return; } # return a hash of typeids that match a human-readable type sub _matching_types { my $self = shift; my $types = shift; my @types = ref $types eq 'ARRAY' ? @$types : $types; my $db = $self->typeid_db; my %result; my @all_types; for my $type (@types) { my ($primary_tag,$source_tag); if (ref $type && $type->isa('Bio::DB::GFF::Typename')) { $primary_tag = $type->method; $source_tag = $type->source; } else { ($primary_tag,$source_tag) = split ':',$type,2; } if (defined $source_tag) { my $id = $db->{lc "$primary_tag:$source_tag"}; $result{$id}++ if defined $id; } else { @all_types = $self->types unless @all_types; $result{$db->{$_}}++ foreach grep {/^$primary_tag:/} @all_types; } } return \%result; } sub _update_location_index { my $self = shift; my ($obj,$id,$delete) = @_; my $db = $self->index_db('locations') or $self->throw("Couldn't find 'locations' index file"); my $seq_id = $obj->seq_id || ''; my $start = $obj->start || ''; my $end = $obj->end || ''; my $strand = $obj->strand; my $bin_min = int $start/BINSIZE; my $bin_max = int $end/BINSIZE; my $typeid = $self->add_typeid($self->_obj_to_type($obj)); my $seq_no = $self->add_seqid($seq_id); for (my $bin = $bin_min; $bin <= $bin_max; $bin++ ) { my $key = $seq_no * MAX_SEQUENCES + $bin; $self->update_or_delete($delete,$db,$key,pack("i5",$id,$start,$end,$strand,$typeid)); } } sub _features { my $self = shift; my ($seq_id,$start,$end,$strand, $name,$class,$allow_aliases, $types, $attributes, $range_type, $iterator ) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'],'STRAND', 'NAME','CLASS','ALIASES', ['TYPES','TYPE','PRIMARY_TAG'], ['ATTRIBUTES','ATTRIBUTE'], 'RANGE_TYPE', 'ITERATOR', ],@_); my (@from,@where,@args,@group); $range_type ||= 'overlaps'; my @result; unless (defined $name or defined $seq_id or defined $types or defined $attributes) { my $is_indexed = $self->index_db('is_indexed'); @result = $is_indexed ? grep {$is_indexed->{$_}} keys %{$self->db} : grep { !/^\./ }keys %{$self->db}; } my %found = (); my $result = 1; if (defined($name)) { # hacky backward compatibility workaround undef $class if $class && $class eq 'Sequence'; $name = "$class:$name" if defined $class && length $class > 0; $result &&= $self->filter_by_name($name,$allow_aliases,\%found); } if (defined $seq_id) { # location with or without types my $typelist = defined $types ? $self->_matching_types($types) : undef; $result &&= $self->filter_by_type_and_location($seq_id,$start,$end,$strand,$range_type, $typelist, \%found); } elsif (defined $types) { # types without location $result &&= $self->filter_by_type($types,\%found); } if (defined $attributes) { $result &&= $self->filter_by_attribute($attributes,\%found); } push @result,keys %found if $result; return $iterator ? Bio::DB::SeqFeature::Store::berkeleydb::Iterator->new($self,\@result) : map {$self->fetch($_)} @result; } sub filter_by_type { my $self = shift; my ($types,$filter) = @_; my @types = ref $types eq 'ARRAY' ? @$types : $types; my $index = $self->index_db('types'); my $db = tied(%$index); my @results; for my $type (@types) { my ($primary_tag,$source_tag); if (ref $type && $type->isa('Bio::DB::GFF::Typename')) { $primary_tag = $type->method; $source_tag = $type->source; } else { ($primary_tag,$source_tag) = split ':',$type,2; } $source_tag ||= ''; $primary_tag = quotemeta($primary_tag); $source_tag = quotemeta($source_tag); my $match = length $source_tag ? "^$primary_tag:$source_tag\$" : "^$primary_tag:"; my $key = lc "$primary_tag:$source_tag"; my $value; # If filter is already provided, then it is usually faster to # fetch each object. if (%$filter) { for my $id (keys %$filter) { my $obj = $self->_fetch($id) or next; push @results,$id if $obj->type =~ /$match/i; } } else { my $types = $self->typeid_db; my @typeids = map {$types->{$_}} grep {/$match/} keys %$types; for my $t (@typeids) { my $k = $t; for (my $status = $db->seq($k,$value,R_CURSOR); $status == 0 && $k == $t; $status = $db->seq($k,$value,R_NEXT)) { next if %$filter && !$filter->{$value}; # don't even bother push @results,$value; } } } } $self->update_filter($filter,\@results); } sub filter_by_type_and_location { my $self = shift; my ($seq_id,$start,$end,$strand,$range_type,$typelist,$filter) = @_; $strand ||= 0; my $index = $self->index_db('locations'); my $db = tied(%$index); my $binstart = defined $start ? int $start/BINSIZE : 0; my $binend = defined $end ? int $end/BINSIZE : MAX_SEQUENCES-1; my %seenit; my @results; $start = MININT if !defined $start; $end = MAXINT if !defined $end; my $seq_no = $self->seqid_id($seq_id); return unless defined $seq_no; if ($range_type eq 'overlaps' or $range_type eq 'contains') { my $keystart = $seq_no * MAX_SEQUENCES + $binstart; my $keystop = $seq_no * MAX_SEQUENCES + $binend; my $value; for (my $status = $db->seq($keystart,$value,R_CURSOR); $status == 0 && $keystart <= $keystop; $status = $db->seq($keystart,$value,R_NEXT)) { my ($id,$fstart,$fend,$fstrand,$ftype) = unpack("i5",$value); next if $seenit{$id}++; next if $strand && $fstrand != $strand; next if $typelist && !$typelist->{$ftype}; if ($range_type eq 'overlaps') { next unless $fend >= $start && $fstart <= $end; } elsif ($range_type eq 'contains') { next unless $fstart >= $start && $fend <= $end; } next if %$filter && !$filter->{$id}; # don't bother push @results,$id; } } # for contained in, we look for features originating and terminating outside the specified range # this is incredibly inefficient, but fortunately the query is rare (?) elsif ($range_type eq 'contained_in') { my $keystart = $seq_no * MAX_SEQUENCES; my $keystop = $seq_no * MAX_SEQUENCES + $binstart; my $value; # do the left part of the range for (my $status = $db->seq($keystart,$value,R_CURSOR); $status == 0 && $keystart <= $keystop; $status = $db->seq($keystart,$value,R_NEXT)) { my ($id,$fstart,$fend,$fstrand,$ftype) = unpack("i5",$value); next if $seenit{$id}++; next if $strand && $fstrand != $strand; next if $typelist && !$typelist->{$ftype}; next unless $fstart <= $start && $fend >= $end; next if %$filter && !$filter->{$id}; # don't bother push @results,$id; } # do the right part of the range $keystart = $seq_no*MAX_SEQUENCES+$binend; for (my $status = $db->seq($keystart,$value,R_CURSOR); $status == 0; $status = $db->seq($keystart,$value,R_NEXT)) { my ($id,$fstart,$fend,$fstrand,$ftype) = unpack("i5",$value); next if $seenit{$id}++; next if $strand && $fstrand != $strand; next unless $fstart <= $start && $fend >= $end; next if $typelist && !$typelist->{$ftype}; next if %$filter && !$filter->{$id}; # don't bother push @results,$id; } } $self->update_filter($filter,\@results); } sub build_summary_statistics { my $self = shift; my $insert = $self->index_db('summary'); %$insert = (); my $current_bin = -1; my (%residuals,$last_bin); my $le = -t \*STDERR ? "\r" : "\n"; print STDERR "\n"; # iterate through all the indexed features my $sbs = SUMMARY_BIN_SIZE; # Sadly we have to do this in two steps. In the first step, we sort # features by typeid,seqid,start. In the second step, we read through # this sorted list. To avoid running out of memory, we use a db_file # temporary database my $fh = File::Temp->new() or $self->throw("Couldn't create temporary file for sorting: $!"); my $name = $fh->filename; my %sort; my $numeric_cmp = DB_File::BTREEINFO->new; $numeric_cmp->{compare} = sub { $_[0] <=> $_[1] }; $numeric_cmp->{flags} = R_DUP; my $s = tie %sort,'DB_File',$name,0666,O_CREAT|O_RDWR,$numeric_cmp or $self->throw("Couldn't create temporary file for sorting: $!"); my $index = $self->index_db('locations'); my $db = tied(%$index); my $keystart = 0; my ($value,$count); my %seenit; for (my $status = $db->seq($keystart,$value,R_CURSOR); $status == 0; $status = $db->seq($keystart,$value,R_NEXT)) { my ($id,$start,$end,$strand,$typeid) = unpack('i5',$value); next if $seenit{$id}++; print STDERR $count," features sorted$le" if ++$count % 1000 == 0; my $seqid = int($keystart / MAX_SEQUENCES); my $key = $self->_encode_summary_key($typeid,$seqid,$start-1); $sort{$key}=$end; } print STDERR "COUNT = $count\n"; my ($current_type,$current_seqid,$end); my $cum_count = 0; $keystart = 0; $count = 0; # the second step allows us to iterate through this for (my $status = $s->seq($keystart,$end,R_CURSOR); $status == 0; $status = $s->seq($keystart,$end,R_NEXT)) { print STDERR $count," features processed$le" if ++$count % 1000 == 0; my ($typeid,$seqid,$start) = $self->_decode_summary_key($keystart); my $bin = int($start/$sbs); # because the input is sorted by start, no more features will contribute to the # current bin so we can dispose of it if ($bin != $current_bin) { if ($seqid != $current_seqid or $typeid != $current_type) { # load all bins left over $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid); %residuals = () ; $cum_count = 0; } else { # load all up to current one $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid,$current_bin); } } $last_bin = $current_bin; ($current_seqid,$current_type,$current_bin) = ($seqid,$typeid,$bin); # summarize across entire spanned region my $last_bin = int(($end-1)/$sbs); for (my $b=$bin;$b<=$last_bin;$b++) { $residuals{$b}++; } } # handle tail case # load all bins left over $self->_load_bins($insert,\%residuals,\$cum_count,$current_type,$current_seqid); undef %sort; undef $fh; } sub _load_bins { my $self = shift; my ($insert,$residuals,$cum_count,$typeid,$seqid,$stop_after) = @_; for my $b (sort {$a<=>$b} keys %$residuals) { last if defined $stop_after and $b > $stop_after; $$cum_count += $residuals->{$b}; my $key = $self->_encode_summary_key($typeid,$seqid,$b); $insert->{$key} = $$cum_count; delete $residuals->{$b}; # no longer needed } } sub coverage_array { my $self = shift; my ($seq_name,$start,$end,$types,$bins) = rearrange([['SEQID','SEQ_ID','REF'],'START',['STOP','END'], ['TYPES','TYPE','PRIMARY_TAG'],'BINS'],@_); $bins ||= 1000; $start ||= 1; unless ($end) { my $segment = $self->segment($seq_name) or $self->throw("unknown seq_id $seq_name"); $end = $segment->end; } my $binsize = ($end-$start+1)/$bins; my $seqid = $self->seqid_id($seq_name) || 0; return [] unless $seqid; # where each bin starts my @his_bin_array = map {$start + $binsize * $_} (0..$bins); my @sum_bin_array = map {int(($_-1)/SUMMARY_BIN_SIZE)} @his_bin_array; my $interval_stats_idx = $self->index_db('summary'); my $db = tied(%$interval_stats_idx); my $t = $self->_matching_types($types); my (%bins,$report_tag); for my $typeid (sort keys %$t) { $report_tag ||= $typeid; for (my $i=0;$i<@sum_bin_array;$i++) { my $cum_count; my $bin = $sum_bin_array[$i]; my $key = $self->_encode_summary_key($typeid,$seqid,$bin); my $status = $db->seq($key,$cum_count,R_CURSOR); next unless $status == 0; push @{$bins{$typeid}},[$bin,$cum_count]; } } my @merged_bins; my $firstbin = int(($start-1)/$binsize); for my $type (keys %bins) { my $arry = $bins{$type}; my $last_count = $arry->[0][1]-1; my $last_bin = -1; my $i = 0; my $delta; for my $b (@$arry) { my ($bin,$count) = @$b; $delta = $count - $last_count if $bin > $last_bin; $merged_bins[$i++] = $delta; $last_count = $count; $last_bin = $bin; } } my $returned_type = $self->_id2type($report_tag); return wantarray ? (\@merged_bins,$returned_type) : \@merged_bins; } sub _encode_summary_key { my $self = shift; my ($typeid,$seqid,$bin) = @_; $self->throw('Cannot index chromosomes larger than '.C1*SUMMARY_BIN_SIZE/1e6.' megabases') if $bin > C1; return ($typeid-1)*C2 + ($seqid-1)*C1 + $bin; } sub _decode_summary_key { my $self = shift; my $key = shift; my $typeid = int($key/C2); my $residual = $key%C2; my $seqid = int($residual/C1); my $bin = $residual%C1; return ($typeid+1,$seqid+1,$bin); } 1;