Digest::ManberHash - a Perl package to calculate Manber Hashes


Digest-ManberHash documentation Contained in the Digest-ManberHash distribution.

Index


Code Index:

NAME

Top

Digest::ManberHash - a Perl package to calculate Manber Hashes

SYNOPSIS

Top

  use Digest::ManberHash;

  $instance = Digest::ManberHash::new($maskbits, $prime, $charcount);

  $hash1 = $instance->DoHash($filename1);
  $hash2 = $instance->DoHash($filename2);

  $similarity = $instance->Compare($hash1, $hash2);

DESCRIPTION

Top

Initialization

Use Digest::ManberHash::new. Parameters:

maskbits

range 1 .. 30, default 11.

prime

range 3 .. 65537, default 7.

charcount

range 8 .. 32768, default 64.

For a detailed description please read http://citeseer.nj.nec.com/manber94finding.html.

Calculating hashes

  $hash = $instance->DoHash($filename);

This gives an object, which has an hash of hash values stored within.

Comparing hashes

  $similarity = $instance->Compare($hash1, $hash2);

This gives an value of 0.0 .. 1.0, depending on the similariness. Help wanted: The calculation could do better than now!!


Digest-ManberHash documentation Contained in the Digest-ManberHash distribution.

package Digest::ManberHash;

require Exporter;
require DynaLoader;

our @ISA = qw(Exporter DynaLoader);
# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.
our @EXPORT = qw(
    HashFile
    new
    Compare
    );
our $VERSION = '0.7';


sub new
{
  my($class, $maskbits, $prime, $charcount)=@_;
  my($x,%a);

  $prime||=7;
  $maskbits||=11;
  $charcount||=64;

  $x=Init($prime,$maskbits,$charcount);
  %a=( "settings" => $x );

  bless \%a;
}

sub DoHash
{
  my($self,$filename)=@_;
  my($e,$f,%a,%b);

  %b=();
  ManberHash($self->{"settings"}, $filename, \%b );
  %a= ( "data" => \%b, "base" => $self);

  while (($e, $f) = each(%b))
  {
    $self->{"max"}{$e}=$f if $self->{"max"}{$e} < $f;
  }
  
  bless \%a;
}

sub Compare
{
  my($self,$file1,$file2)=@_;
  my(%keys,$a,$k,$c,$v,$m);

  #return 0 if (ref($self) !~ /^HASH/);
  die if $self ne $file1->{"base"} ||
$self ne $file2->{"base"};


  %keys=map { $_,1; } (keys %{$file1->{"data"}}, keys %{$file2->{"data"}});  
  $c=$a=$m=0;
  for $k (keys %keys)
  {
    $v = ($file1->{"data"}->{$k} - $file2->{"data"}->{$k});
#    $m += $self->{"max"}{$k} * $self->{"max"}{$k};
    $a += $v*$v;
    $c++;
#    print "$k = ",$self->{$k}," - ",$other->{$k},"($c, $a)\n";
  }

  return 0 if !$c;
#  1 - 6*$a/($c*$c*$c - $c);
#  1-sqrt($a)/$c;
  1/(1.0+$a);
}

bootstrap Digest::ManberHash $VERSION;

# Preloaded methods go here.

# Autoload methods go after __END__, and are processed by the autosplit program.

1;
__END__
#