Bloom::Filter - Sample Perl Bloom filter implementation


Bloom-Filter documentation Contained in the Bloom-Filter distribution.

Index


Code Index:

NAME

Top

Bloom::Filter - Sample Perl Bloom filter implementation

DESCRIPTION

Top

A Bloom filter is a probabilistic algorithm for doing existence tests in less memory than a full list of keys would require. The tradeoff to using Bloom filters is a certain configurable risk of false positives. This module implements a simple Bloom filter with configurable capacity and false positive rate. Bloom filters were first described in a 1970 paper by Burton Bloom, see http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal.

SYNOPSIS

Top

	use Bloom::Filter

	my $bf = Bloom::Filter->new( capacity => 10, error_rate => .001 );

	$bf->add( @keys );

	while ( <> ) {
		chomp;
		print "Found $_\n" if $bf->check( $_ );
	}

CONSTRUCTORS

Top

new %PARAMS

Create a brand new instance. Allowable params are error_rate, capacity.

init

Calculates the best number of hash functions and optimum filter length, creates some random salts, and generates a blank bit vector. Called automatically by constructor.

ACCESSORS

Top

capacity

Returns the total capacity of the Bloom filter

error_rate

Returns the configured maximum error rate

length

Returns the length of the Bloom filter in bits

key_count

Returns the number of items currently stored in the filter

on_bits

Returns the number of 'on' bits in the filter

salts

Returns the list of salts used to create the hash functions

PUBLIC METHODS

Top

add @KEYS

Adds the list of keys to the filter. Will fail, return undef and complain if the number of keys in the filter exceeds the configured capacity.

check @KEYS

Checks the provided key list against the Bloom filter, and returns a list of equivalent length, with true or false values depending on whether there was a match.

INTERNAL METHODS

Top

_calculate_shortest_filter_length CAPACITY ERR_RATE

Given a desired error rate and maximum capacity, returns the optimum combination of vector length (in bits) and number of hash functions to use in building the filter, where "optimum" means shortest vector length.

_get_cells KEY

Given a key, hashes it using the list of salts and returns an array of cell indexes corresponding to the key.

AUTHOR

Top

Maciej Ceglowski <maciej@ceglowski.com>

CHANGELOG

Top

Feb 2007 big speedup by Dmitriy Ryaboy <dmitriy.ryaboy@ask.com> (thanks!)

COPYRIGHT AND LICENSE

Top


Bloom-Filter documentation Contained in the Bloom-Filter distribution.
package Bloom::Filter;

use strict;
use warnings;
use Carp;
use Digest::SHA1 qw/sha1 sha1_base64/;

our $VERSION = '1.0';

sub new 
{
	my ( $class, %params ) = @_;

	my $self = 
	{  
		 # some defaults
		 error_rate => 0.001, 
		 capacity 	=> 100, 
			 
		 %params,
		 
		 # internal data
		 key_count 		=> 0,
		 filter_length 	=> 0,
		 num_hash_funcs => 0,
		 salts 			=> [],
	};
	bless $self, $class;
	$self->init();
	return $self;
}


sub init 
{
	my ( $self ) = @_;
	
	# some sanity checks
	croak "Capacity must be greater than zero" unless $self->{capacity};
	croak "Error rate must be greater than zero" unless $self->{error_rate};
	croak "Error rate cannot exceed 1" unless $self->{error_rate} < 1;
                                     	
	my ( $length, $num_funcs ) = $self->_calculate_shortest_filter_length
	    ($self->{capacity}, $self->{error_rate} );
	
	$self->{num_hash_funcs} = $num_funcs;
	$self->{filter_length} = $length;
	
	# create some random salts;
	my %collisions;
	while ( scalar keys %collisions < $self->{num_hash_funcs} ) {
		$collisions{rand()}++;
	}
	$self->{salts} = [ keys %collisions ];
	
	# make an empty filter
	$self->{filter} = pack( "b*", '0' x $self->{filter_length} );
	
	# make some blank vectors to use
	$self->{blankvec} = pack( "N", 0 ); 
	
	return 1;
}


sub capacity { $_[0]->{capacity} };

sub error_rate { $_[0]->{error_rate} };

sub length { $_[0]->{filter_length} };

sub key_count { $_[0]->{key_count} };


sub on_bits 
{
	my ( $self ) = @_;
	return unless $self->{filter};
	return unpack( "%32b*",  $self->{filter})
}

sub salts 
{ 
	my ( $self ) = @_;
	return unless exists $self->{salts}
		and ref $self->{salts}
		and ref $self->{salts} eq 'ARRAY';

	return @{ $self->{salts} };
}


sub add 
{
	my ( $self, @keys ) = @_;
	
	return unless @keys;
	# Hash our list of keys into the empty filter
	my @salts = @{ $self->{salts} }
		or croak "No salts found, cannot make bitmask";
	foreach my $key ( @keys ) {
	    if ($self->{key_count} >= $self->{capacity}) {	
			carp "Exceeded filter capacity";
			return;
	    }
	    # flip the appropriate bits on
        vec($self->{filter}, $_, 1) = 1 foreach @{$self->_get_cells($key)};  
	    $self->{key_count}++;
	}
	return 1;
}



sub check 
{	

	my ( $self, @keys ) = @_;
	
	return unless @keys;
	my @result;

	# A match occurs if every bit we check is on
	foreach my $key ( @keys ) {
	    my $match = 1;
	    foreach my $cell (@{$self->_get_cells($key)} ) {
			$match = vec( $self->{filter}, $cell, 1 ) ;
			last unless $match;
	    }
	    push @result, $match;
	}
	return ( wantarray() ? @result : $result[0] );
}




sub _calculate_shortest_filter_length 
{
        my ( $self, $num_keys, $error_rate ) = @_;
        my $lowest_m;
        my $best_k = 1;

        foreach my $k ( 1..100 ) {
                my $m = (-1 * $k * $num_keys) / 
                        ( log( 1 - ($error_rate ** (1/$k))));

                if ( !defined $lowest_m or ($m < $lowest_m) ) {
                        $lowest_m = $m;
                        $best_k   = $k;
                }
        }
        $lowest_m = int( $lowest_m ) + 1;
        return ( $lowest_m, $best_k );
} 



sub _get_cells 
{

	my ( $self, $key ) = @_;

	croak "Filter length is undefined" unless $self->{filter_length};
	my @salts = @{ $self->{salts} }
		or croak "No salts found, cannot make bitmask";

	my @cells;
	foreach my $salt ( @salts ){ 

		my $hash = sha1( $key, $salt );

		# blank 32 bit vector
		my $vec = $self->{blankvec};

		# split the 160-bit hash into five 32-bit ints
		# and XOR the pieces together

		my @pieces =  map {pack( "N", $_ )} unpack("N*", $hash );
		$vec = $_ ^ $vec foreach @pieces;	

		# Calculate bit offset by modding
		my $result = unpack( "N", $vec );
		my $bit_offset = $result % $self->{filter_length};
		push @cells, $bit_offset;
	}
	return \@cells;
}



1;