Search::Indexer::Incremental::MD5 - Incrementally index your files


Search-Indexer-Incremental-MD5 documentation Contained in the Search-Indexer-Incremental-MD5 distribution.

Index


Code Index:

NAME

Top

Search::Indexer::Incremental::MD5 - Incrementally index your files

SYNOPSIS

Top

  use File::Find::Rule ;

  use Readonly ;
  Readonly my $DEFAUT_MAX_FILE_SIZE_INDEXING_THRESHOLD => 300 << 10 ; # 300KB

  my $indexer 
	= Search::Indexer::Incremental::MD5::Indexer->new
		(
		USE_POSITIONS => 1, 
		INDEX_DIRECTORY => 'text_index', 
		get_perl_word_regex_and_stopwords(),
		) ;

  my @files = File::Find::Rule
		->file()
		->name( '*.pm', '*.pod' )
		->size( "<=$DEFAUT_MAX_FILE_SIZE_INDEXING_THRESHOLD" )
		->not_name(qr[auto | unicore | DateTime/TimeZone | DateTime/Locale])
		->in('.') ;

  $indexer->add_files(FILES => \@files) ;
  $indexer->add_files(FILES => \@more_files) ;
  $indexer = undef ;

  my $search_string = 'find_me' ;
  my $searcher = 
	eval 
	{
	Search::Indexer::Incremental::MD5::Searcher->new
		(
		USE_POSITIONS => 1, 
		INDEX_DIRECTORY => 'text_index', 
		get_perl_word_regex_and_stopwords(),
		)
	} or croak "No full text index found! $@\n" ;

  my $results = $searcher->search(SEARCH_STRING => $search_string) ;

  # sort in decreasing score order
  my @indexes = map { $_->[0] }
		    reverse
		        sort { $a->[1] <=> $b->[1] }
			    map { [$_, $results->[$_]{SCORE}] }
			        0 .. $#$results ;

  for (@indexes)
	{
	print {* STDOUT} "$results->[$_]{PATH} [$results->[$_]{SCORE}].\n" ;
	}

  $searcher = undef ;




DESCRIPTION

Top

This module implements an incremental text indexer and searcher based on Search::Indexer.

DOCUMENTATION

Top

Given a list of files, this module will allow you to create an indexed text database that you can later query for matches. You can also use the siim command line application installed with this module.

SUBROUTINES/METHODS

Top

show_database_information($index_directory)

Arguments

* $index_directory - location of the index databases

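Returns - A hash reference with the keys entries (number of indexed documents), size (total size of the index database files) and update_date (last modification time of the metadata database).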

Exceptions - Error opening the indexing database

delete_indexing_databases($index_directory)

Removes all the index databases in the passed directory

Arguments

* $index_directory - location of the index databases

Returns - Nothing

Exceptions - Can't remove index databases.

search_string(\%arguments)

Displays all the files matching the search query.

Arguments

\%arguments -

-
$arguments->{perl_mode} - Boolean - Use Perl specific word regex and stopwords
$arguments->{index_directory} - The location of the index database
$arguments->{use_position} - See Search::Indexer for complete documentation
$arguments->{search} - String - The search query
$arguments->{verbose} - Boolean - Display the document id and score if set

$search_string -

Returns - Nothing

Exceptions - Croaks with "No full text index found!" if the searcher cannot be created for the index directory.

add_files(\%arguments, \@files)

Adds files to the index. Files that are new or have been modified are (re-)indexed and their names are displayed.

Arguments

\%arguments -

$arguments->{perl_mode} - Boolean - Use Perl specific word regex and stopwords
$arguments->{stopwords_file} - Optional- Name of the file containing the stopwords to use (overridden by the perl option)
$arguments->{index_directory} - The location of the index database
$arguments->{use_position} - See Search::Indexer for complete documentation
$arguments->{maximum_document_size} - Integer - Only files smaller than this limit will be added
$arguments->{verbose} - Boolean - Display the document id, state and processing time if set

\@files - Files to be added in the index

Returns - Nothing

Exceptions - Croaks if the indexer reports a file in an unexpected state.

remove_files(\%arguments, \@files)

Remove the passed files from the index

Arguments

\%arguments -

$arguments->{perl_mode} - Boolean - Use Perl specific word regex and stopwords
$arguments->{stopwords_file} - Optional- Name of the file containing the stopwords to use (overridden by the perl option)
$arguments->{index_directory} - The location of the index database
$arguments->{use_position} - See Search::Indexer for complete documentation
$arguments->{verbose} - Boolean - Display the document id, state and processing time if set

\@files - Files to be removed

Returns - Nothing

Exceptions - Croaks if the indexer reports a file in an unexpected state.

check_index(\%arguments)

Checks the files in the index and reports, for each of them, whether it is unchanged, modified or missing.

Arguments

\%arguments -

$arguments->{perl_mode} - Boolean - Use Perl specific word regex and stopwords
$arguments->{stopwords_file} - Optional- Name of the file containing the stopwords to use (overridden by the perl option)
$arguments->{index_directory} - The location of the index database
$arguments->{use_position} - See Search::Indexer for complete documentation
$arguments->{verbose} - Boolean - Display the document id, state and processing time if set

Returns - Nothing

Exceptions - Croaks if the indexer reports a file in an unexpected state.

get_file_MD5($file)

Returns the MD5 of the $file argument.

Arguments

$file - The location of the file to compute an MD5 for

Returns - A string containing the file md5

Exceptions - Croaks if the file can't be opened or closed.

BUGS AND LIMITATIONS

Top

None so far.

AUTHOR

Top

	Nadim ibn hamouda el Khemir
	CPAN ID: NKH
	mailto: nadim@cpan.org

LICENSE AND COPYRIGHT

Top

SUPPORT

Top

You can find documentation for this module with the perldoc command.

    perldoc Search::Indexer::Incremental::MD5

You can also look for information at:

* AnnoCPAN: Annotated CPAN documentation

http://annocpan.org/dist/Search-Indexer-Incremental-MD5

* RT: CPAN's request tracker

Please report any bugs or feature requests to <bug-search-indexer-incremental-md5@rt.cpan.org>.

We will be notified, and then you'll automatically be notified of progress on your bug as we make changes.

* Search CPAN

http://search.cpan.org/dist/Search-Indexer-Incremental-MD5

SEE ALSO

Top

Search::Indexer

Search::Indexer::Incremental::MD5::Indexer and Search::Indexer::Incremental::MD5::Searcher


Search-Indexer-Incremental-MD5 documentation Contained in the Search-Indexer-Incremental-MD5 distribution.
package Search::Indexer::Incremental::MD5 ;

use strict;
use warnings ;
use Carp qw(carp croak confess) ;

# Export configuration and version declaration.
# The whole section is wrapped in BEGIN so that the $VERSION assignment
# below runs at compile time.
BEGIN 
{
# Sub::Exporter builds this package's import routine from the lists below.
use Sub::Exporter -setup => 
	{
	# Subroutines a caller may import by name.
	exports => 
		[
		qw
			(
			delete_indexing_databases
			show_database_information
			add_files
			remove_files
			check_index
			search_string
			)
		],
		
	# Named import groups; 'all' lists the same subroutines as 'exports'.
	# get_file_MD5 is defined in this file but appears in neither list.
	groups  => 
		{
		all  => [
			qw
				(
				delete_indexing_databases
				show_database_information
				add_files
				remove_files
				check_index
				search_string
				)
			],
		}
	};
	
# Package version, declared as a package global.
use vars qw ($VERSION);
$VERSION     = '0.06';
}

#----------------------------------------------------------------------------------------------------------

use File::stat;
use Time::localtime;
use BerkeleyDB;
use List::Util qw/sum/;

use Search::Indexer::Incremental::MD5::Indexer qw() ;
use Search::Indexer::Incremental::MD5::Searcher qw() ;
use Search::Indexer::Incremental::MD5::Language::Perl qw(get_perl_word_regex_and_stopwords) ;

use Digest::MD5 ;
use English qw( -no_match_vars ) ;

use Readonly ;
# Read-only empty string constant; no subroutine in this file refers to it.
Readonly my $EMPTY_STRING => q{} ;

#----------------------------------------------------------------------------------------------------------

#----------------------------------------------------------------------------------------------------------

# Base name of the BerkeleyDB file holding the document metadata; it is
# looked up inside the index directory passed to the subroutines below.
Readonly my $ID_TO_METADATA_FILE => 'id_to_docs_metadata.bdb' ;

#----------------------------------------------------------------------------------------------------------

sub show_database_information
{

# Gathers summary information about the index databases of a directory.
#
# Arguments
#	$index_directory - location of the index databases
#
# Returns - A hash reference with the keys:
#	entries     - number of defined entries in the metadata database
#	size        - cumulated size, in bytes, of the '*.bdb' files of the directory
#	update_date - modification time of the metadata database, formatted by ctime
#
# Exceptions - croaks if the directory is undefined or if the metadata
# database can't be opened.

my ($index_directory) = @_ ;

croak 'Error: index directory not defined!'  unless defined $index_directory ;

Readonly my $ID_TO_METADATA_FILE_AND_PATH => "$index_directory/$ID_TO_METADATA_FILE" ;

# use id_to_docs_metadata.bdb, to store a lookup from the uniq id 
# to the document metadata {$doc_id => "$md5\t$path"}
# Note: DB_CREATE means the metadata database is created if it is missing.
tie my %id_to_metadata, 'BerkeleyDB::Hash', ## no critic (Miscellanea::ProhibitTies)
	-Filename => $ID_TO_METADATA_FILE_AND_PATH, 
	-Flags    => DB_CREATE
		or croak "Error: opening '$ID_TO_METADATA_FILE_AND_PATH': $^E $BerkeleyDB::Error";

# bsd_glob, unlike the core glob, does not split its pattern on whitespace,
# so an index directory containing spaces is handled properly.
require File::Glob ;

# The metadata database matches '*.bdb' too; it is listed explicitly so it is
# always accounted for, and the list is de-duplicated so it is counted once.
my %already_listed ;
my @database_files 
	= grep { ! $already_listed{$_}++ }
		(File::Glob::bsd_glob("$index_directory/*.bdb"), $ID_TO_METADATA_FILE_AND_PATH) ;

return
	{
	entries => scalar(grep {defined $id_to_metadata{$_}} keys %id_to_metadata),
	# -s is false for empty or missing files; count those as zero bytes
	size => sum(map { (-s $_) || 0 } @database_files),
	update_date => ctime(stat($ID_TO_METADATA_FILE_AND_PATH)->mtime),
	} ;
}

#----------------------------------------------------------------------------------------------------------

sub delete_indexing_databases
{

# Deletes the three index database files found in the given directory.
#
# Arguments
#	$index_directory - location of the index databases
#
# Returns - Nothing
#
# Exceptions - croaks if the directory is undefined or as soon as one of
# the database files can't be unlinked.

my ($index_directory) = @_ ;

defined $index_directory
	or croak "Error: Invalid or undefined index directory!\n" ;

# the databases are removed in this order: metadata, ixd, ixw
my @database_files 
	= map { "$index_directory/$_" } ($ID_TO_METADATA_FILE, 'ixd.bdb', 'ixw.bdb') ;

foreach my $database_file (@database_files)
	{
	next if unlink $database_file ;
	
	croak "Error: Can't unlink '$database_file': $!" ;
	}

return ;
}

#----------------------------------------------------------------------------------------------------------

sub search_string
{

# Searches the index and prints the matching files on STDOUT, best score first.
#
# Arguments
#	\%arguments -
#		perl_mode       - Boolean - use the Perl specific word regex and stopwords
#		index_directory - location of the index databases
#		use_position    - passed to the searcher as USE_POSITIONS
#		search          - String - the search query
#		verbose         - Boolean - also display the document id and score
#
# Returns - Nothing
#
# Exceptions - croaks if the searcher can't be created.

my ($arguments) = @_ ;

# perl_mode is honored so the query is analysed with the same word regex
# and stopwords that add_files uses when indexing in Perl mode.
my @perl_extra_arguments  ;
@perl_extra_arguments = get_perl_word_regex_and_stopwords() if($arguments->{perl_mode}) ;

my $searcher 
	= eval 
		{
		Search::Indexer::Incremental::MD5::Searcher->new
			(
			INDEX_DIRECTORY => $arguments->{index_directory}, 
			USE_POSITIONS => $arguments->{use_position}, 
			@perl_extra_arguments,
			);
		} or croak "No full text index found! $@\n" ;

my $results = $searcher->search(SEARCH_STRING => $arguments->{search}) ;

# indexes of the results, sorted in decreasing score order
## no critic (ProhibitDoubleSigils)
my @indexes = map { $_->[0] } 
				reverse
					sort { $a->[1] <=> $b->[1] }
						map { [$_, $results->[$_]{SCORE}] }
							0 .. $#$results ;

for my $index (@indexes)
	{
	my $matching_file = $results->[$index]{PATH} ;
	
	if($arguments->{verbose})
		{
		print {* STDOUT} "'$matching_file' [id:$results->[$index]{ID}] with score $results->[$index]{SCORE}.\n" ;
		}
	else
		{
		print {* STDOUT} "$matching_file\n" ;
		}
	}
	
return ;
}

#----------------------------------------------------------------------------------------------------------

sub add_files
{

# Adds the given files to the index and reports, on STDOUT, what was done
# with each of them.
#
# Arguments
#	\%arguments -
#		perl_mode             - Boolean - use the Perl specific word regex and stopwords
#		stopwords_file        - Optional - passed to the indexer as STOPWORDS
#		index_directory       - location of the index databases
#		use_position          - passed to the indexer as USE_POSITIONS
#		maximum_document_size - passed to the indexer as MAXIMUM_DOCUMENT_SIZE
#		verbose               - Boolean - display the id, state and time of each file
#	\@files - files to add; they are handed to the indexer in sorted order
#
# Returns - Nothing
#
# Exceptions - croaks if the indexer reports an unexpected file state.

my ($arguments, $files) = @_ ;

my @perl_extra_arguments  ;
@perl_extra_arguments = get_perl_word_regex_and_stopwords() if($arguments->{perl_mode}) ;

my @stopwords ;
@stopwords = (STOPWORDS => $arguments->{stopwords_file}) if($arguments->{stopwords_file}) ;

# the perl specific arguments come last in the list; presumably they take
# precedence over the defaults before them -- TODO confirm in the indexer
my $indexer 
	= Search::Indexer::Incremental::MD5::Indexer->new
		(
		INDEX_DIRECTORY => $arguments->{index_directory}, 
		USE_POSITIONS => $arguments->{use_position}, 
		WORD_REGEX => qr/\w+/smx,
		@stopwords,
		@perl_extra_arguments,
		) ;

$indexer->add_files
	(
	FILES => [sort @{$files}],
	MAXIMUM_DOCUMENT_SIZE => $arguments->{maximum_document_size},
	DONE_ONE_FILE_CALLBACK => 
		sub
		{
		my ($file, $description, $file_info) = @_ ;
		
		# The file name and id are passed as printf arguments, never interpolated
		# in the format, so a '%' in a file name can't corrupt the output.
		
		if($file_info->{STATE} == 0)
			{
			# up to date, only reported in verbose mode
			if($arguments->{verbose})
				{
				printf {* STDOUT} "'%s' [id:%s] up to date %.3f s.\n", $file, $file_info->{ID}, $file_info->{TIME} ;
				}
			}
		elsif($file_info->{STATE} == 1)
			{
			if($arguments->{verbose})
				{
				printf {* STDOUT} "'%s' [id:%s] re-indexed in %.3f s.\n", $file, $file_info->{ID}, $file_info->{TIME} ;
				}
			else
				{
				print {* STDOUT} "$file\n" ;
				}
			}
		elsif($file_info->{STATE} == 2)
			{
			if($arguments->{verbose})
				{
				printf {* STDOUT} "'%s' [id:%s] new file %.3f s.\n", $file, $file_info->{ID}, $file_info->{TIME} ;
				}
			else
				{
				print {* STDOUT} "$file\n" ;
				}
			}
		else
			{
			croak "Error: Unexpected file '$file' state!\n" ;
			}
		}
	) ;

return ;
}

#----------------------------------------------------------------------------------------------------------

sub remove_files
{

# Removes the given files from the index and reports, on STDOUT, what was
# done with each of them.
#
# Arguments
#	\%arguments -
#		perl_mode       - Boolean - use the Perl specific word regex and stopwords
#		stopwords_file  - Optional - passed to the indexer as STOPWORDS
#		index_directory - location of the index databases
#		use_position    - passed to the indexer as USE_POSITIONS
#		verbose         - Boolean - display the id, state and time of each file
#	\@files - files to remove
#
# Returns - Nothing
#
# Exceptions - croaks if the indexer reports an unexpected file state.

my ($arguments, $files) = @_ ;

my @perl_extra_arguments  ;
@perl_extra_arguments = get_perl_word_regex_and_stopwords() if($arguments->{perl_mode}) ;

my @stopwords ;
@stopwords = (STOPWORDS => $arguments->{stopwords_file}) if($arguments->{stopwords_file}) ;

my $indexer 
	= Search::Indexer::Incremental::MD5::Indexer->new
		(
		INDEX_DIRECTORY => $arguments->{index_directory}, 
		USE_POSITIONS => $arguments->{use_position}, 
		WORD_REGEX => qr/\w+/smx,
		@stopwords,
		@perl_extra_arguments,
		) ;

$indexer->remove_files
	(
	FILES => $files,
	DONE_ONE_FILE_CALLBACK => 
		sub
		{
		my ($file, $description, $file_info) = @_ ;

		# The file name and id are passed as printf arguments, never interpolated
		# in the format, so a '%' in a file name can't corrupt the output.

		if($file_info->{STATE} == 0)
			{
			if($arguments->{verbose})
				{
				printf {* STDOUT} "'%s' [id:%s] removed in  %.3f s.\n", $file, $file_info->{ID}, $file_info->{TIME} ;
				}
			else
				{
				print {* STDOUT} "$file\n" ;
				}
			}
		elsif($file_info->{STATE} == 1)
			{
			# not found in the index, only reported in verbose mode
			if($arguments->{verbose})
				{
				printf {* STDOUT} "'%s' not found in %.3f s.\n", $file, $file_info->{TIME} ;
				}
			}
		else
			{
			croak "Error: Unexpected file '$file' state!\n" ;
			}
		}
	) ;
	
return ;
}

#----------------------------------------------------------------------------------------------------------

sub check_index
{

# Checks the files known to the index and reports, on STDOUT, the state of
# each of them.
#
# Arguments
#	\%arguments -
#		perl_mode       - Boolean - use the Perl specific word regex and stopwords
#		stopwords_file  - Optional - passed to the indexer as STOPWORDS
#		index_directory - location of the index databases
#		use_position    - passed to the indexer as USE_POSITIONS
#		verbose         - Boolean - display the id, state and time of each file
#
# Returns - Nothing
#
# Exceptions - croaks if the indexer reports an unexpected file state.

my ($arguments) = @_ ;

my @perl_extra_arguments  ;
@perl_extra_arguments = get_perl_word_regex_and_stopwords() if($arguments->{perl_mode}) ;

my @stopwords ;
@stopwords = (STOPWORDS => $arguments->{stopwords_file}) if($arguments->{stopwords_file}) ;

my $indexer 
	= Search::Indexer::Incremental::MD5::Indexer->new
		(
		INDEX_DIRECTORY => $arguments->{index_directory}, 
		USE_POSITIONS => $arguments->{use_position}, 
		WORD_REGEX => qr/\w+/smx,
		@stopwords,
		@perl_extra_arguments,
		) ;

$indexer->check_indexed_files
	(
	DONE_ONE_FILE_CALLBACK => 
		sub
		{
		my ($file, $description,$file_info) = @_ ;

		# The file name and id are passed as printf arguments, never interpolated
		# in the format, so a '%' in a file name can't corrupt the output.
		# Without the verbose option, the file name is displayed for every state.

		if($file_info->{STATE} == 0)
			{
			if($arguments->{verbose})
				{
				printf {* STDOUT} "'%s' [id:%s] found and identical in %.3f s.\n", $file, $file_info->{ID}, $file_info->{TIME} ;
				}
			else
				{
				print {* STDOUT} "$file\n" ;
				}
			}
		elsif($file_info->{STATE} == 1)
			{
			if($arguments->{verbose})
				{
				printf {* STDOUT} "'%s' [id:%s] file found, contents differ %.3f s.\n", $file, $file_info->{ID}, $file_info->{TIME} ;
				}
			else
				{
				print {* STDOUT} "$file\n" ;
				}
			}
		elsif($file_info->{STATE} == 2)
			{
			if($arguments->{verbose})
				{
				printf {* STDOUT} "'%s' [id:%s] not found in %.3f s.\n", $file, $file_info->{ID}, $file_info->{TIME} ;
				}
			else
				{
				print {* STDOUT} "$file\n" ;
				}
			}
		else
			{
			croak "Error: Unexpected file '$file' state!\n" ;
			}
		}
	) ;

return ;
}

#----------------------------------------------------------------------------------------------------------

sub get_file_MD5
{

# Computes the MD5 digest of a file's contents, read in binary mode.
#
# Arguments
#	$file - location of the file to compute an MD5 for
#
# Returns - the digest as a hexadecimal string
#
# Exceptions - croaks if the file can't be opened or closed.

my ($file) = @_ ;

open(my $file_handle, '<', $file) or croak "Error: Can't open '$file' to compute MD5: $!";
binmode($file_handle);

my $digest = Digest::MD5->new() ;
$digest->addfile($file_handle) ;
my $hexadecimal_digest = $digest->hexdigest() ;

close $file_handle or croak 'Error: Can not close file!' ;

return $hexadecimal_digest ;
}

#----------------------------------------------------------------------------------------------------------

1 ;