| MMM-Text-Search documentation | Contained in the MMM-Text-Search distribution. |
MMM::Text::Search - Perl module for indexing and searching text files and web objects
use MMM::Text::Search;
my $srch = new MMM::Text::Search { #for indexing...
#index main file location...
IndexPath => "/tmp/myindex.db",
#local files... (optional)
FileMask => '(?i)(\.txt|\.htm.?)$',
Dirs => [ "/usr/doc", "/tmp" ] ,
FollowSymLinks => 0|1, (default = 0)
#web objects... (optional)
URLs => [ "http://localhost/", ... ],
Level => recursion-level (0=unlimited)
#common options...
IgnoreLimit => 0.3, (default = 2/3)
Verbose => 0|1
};
$srch->start_indexing_session();
$srch->commit_indexing_session();
$srch->index_default_locations();
$srch->index_content( { title => '...',
content=> '...',
id => '...' } );
$srch->makeindex;
(Obsolete.)
my $srch = new MMM::Text::Search ( #for searching....
"/tmp/myindex.db", verbose_flag );
my $hashref = $srch->query("pizza","ciao", "-pasta" );
my $hashref = $srch->advanced_query("(pizza OR ciao) AND NOT pasta");
$srch->errstr() # returns last error
# (only query syntax-errors for the moment being)
$srch->dump_word_stats(\*FH)
When a session is closed the following files will have been created (assuming IndexPath = /path/myindex.db, see constructor):
/path/myindex.db word index database /path/myindex-locations.db filename/URL database /path/myindex-titles.db html title database /path/myindex.stopwords stop-words list /path/myindex.filelist readable list of indexed files/URLs /path/myindex.deadlinks broken http links [... lots of important things missing ... ]
start_indexing_session() starts session.
commit_indexing_session() commits and closes current session.
index_default_locations() indexes all files and URLs specified on construction.
index_content() pushes content into indexing engine. Argument must have the following structure
{ title => '...', content=> '...', id => '...' }
makeindex() is obsolete. Equivalent to: $srch->start_indexing_session(); $srch->index_default_locations(); $srch->commit_indexing_session();
dump_word_stats(\*FH) dumps all words sorted by occurence frequency using FH file handle (or STDOUT if no parameter is specified). Stop-words get a frequency value of 1.
Both query() and advanced_query() return a reference to a hash with the following structure:
( ignored => [ string, string, ... ], # ignored words searched => [ string, string, ... ], # words searched for entries => [ hashref, hashref, ... ] # list of records # found )
The 'entries' element is a reference to an array of hashes, each having the following structure:
( location => string, # file path or URL or anything score => number, # score title => string # HTML title )
Note on implementation: The technique used for indexing is substantially derived from that exposed by Tim Kientzle on Dr. Dobbs magazine.
Many, I guess.
Max Muzi <maxim@comm2000.it>
perl(1).
| MMM-Text-Search documentation | Contained in the MMM-Text-Search distribution. |
package MMM::Text::Search; use File::Copy; #$Id: Search.pm,v 1.50 2004/12/13 18:45:15 maxim Exp $ use strict; use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $verbose_flag ); require Exporter; require AutoLoader; @ISA = qw(Exporter AutoLoader); @EXPORT = qw( ); $VERSION = '0.07'; # # Perl module for indexing and searching text files and web pages. # (Max Muzi, Apr-Sep 1999) # # # Note on implementation: # The technique used for indexing is substantially derived from that # exposed by Tim Kientzle on Dr. Dobbs magazine. (Actually IndexWords() # has been cut'n'pasted from his scripts.) # # use DB_File; use Fcntl; require 5.005; $verbose_flag = 0; my $debug_flag = 0; my $errstr = undef; my $syntax_error = undef; sub errstr { $errstr }; sub new { # constructor! (see the docs for usage [sorry, there're no docs ]) my $pkg = shift; my $arg = shift; my $opt = undef; if (ref($arg) ne "HASH") { if (-f $arg) { $opt->{IndexDB} = $arg; $opt->{Verbose} = shift; } else { die "usage: \$obj = new MMM::Text::Search ( '/index/path' or \$hashref)\n" } } else { $opt = $arg; }; $verbose_flag = $opt->{Debug} || $opt->{Verbose} ; my $indexdbpath = $opt->{IndexDB} || $opt->{IndexPath} ; my $filemask = $opt->{FileMask} ; my $dirs = ( ref($opt->{Dirs}) eq "ARRAY" ) ? $opt->{Dirs} : [ ]; my $followsymlinks = defined $opt->{FollowSymLinks}; my $opturls = $opt->{Urls} || $opt->{URLs}; my $urls = ( ref($opturls) eq "ARRAY" ) ? $opturls : [ ]; my $level = int $opt->{Level}; my $locationsdbpath = $indexdbpath; $locationsdbpath =~ s/(\.db)*$/\-locations.db/; my $titlesdbpath = $indexdbpath; $titlesdbpath =~ s/(\.db)*$/\-titles.db/; my $minwordsize = $opt->{MinWordSize} || 1; my $self = { indexdbpath => $indexdbpath, locationsdbpath => $locationsdbpath, titlesdbpath => $titlesdbpath, filemask => length($filemask) ? qr/$filemask/ : undef, dirs => $dirs, followsymlinks => $followsymlinks, minwordsize => $minwordsize, ignorelimit => $opt->{IgnoreLimit} || (2/3), urls => $urls, level => $level, url_exclude => $opt->{UrlExludeMask} || "(?i).*\.(zip|exe|gz|arj|bin|hqx)", file_reader => $opt->{FileReader}, use_inode => $opt->{UseInodeAsKey}, no_reset => $opt->{UseInodeAsKey} && $opt->{NoReset} }; DEBUG("filemask=$filemask, indexfile=$indexdbpath, ignorelimit=$self->{ignorelimit}\n"); DEBUG("dirs = [", join(",", @$dirs),"], "); DEBUG("urls = [", join(",", @$urls),"] \n"); bless($self, $pkg); return $self; } sub _add_keys_to_match_hash { # extract file-codes from $keys and update corresponding $hash elements (score) my ($keys, $hash) = @_; my $key; foreach $key ( unpack("N*",$keys) ) { # DEBUG($key, " "); # ignored words (stop-words) only include file-id 0 (see FlushCache() below) return 0 if $key == 0 ; $hash->{$key}++ } return 1; } sub _push_words_from_hash { my ($hash,$array, $regexp) = @_; my $w; for $w(keys %$hash) { push @$array,$w if $w =~ $regexp; } } #notes on advanced_query(); # - queries containing stop-words may yields bizzare results.. # - score is not always correct # - error handling should be improved... :-) sub advanced_query { # perform queries such as "( a and ( b or c ) ) and ( d and e) " my $self = shift; my $expr = shift; my $indexdbpath= $self->{indexdbpath}; my $locationsdbpath = $self->{locationsdbpath}; my $titlesdbpath = $self->{titlesdbpath}; my %indexdb; my %locationsdb; my %titlesdb; return undef unless (-f $indexdbpath && -r _); return undef unless (-f $locationsdbpath && -r _); return undef unless (-f $titlesdbpath && -r _); return undef unless tie_hash(\%indexdb,$indexdbpath, O_RDONLY ) && tie_hash(\%locationsdb,$locationsdbpath, O_RDONLY ) && tie_hash(\%titlesdb,$titlesdbpath, O_RDONLY ); my @ignored = (); my @words = (); my $verbose_flag_tmp = $verbose_flag; $verbose_flag = shift; # undocumented debug switch chomp $expr; undef $syntax_error; #reset error DEBUG("********** _match_expression() debug **********\n"); my $match = _match_expression($expr, \%indexdb, \@ignored); DEBUG("********** end debug **********\n"); if ($syntax_error) { $errstr = $syntax_error; $verbose_flag = $verbose_flag_tmp; return undef; } my $result = _make_result_hash($match,\%locationsdb, \%titlesdb, \@words, \@ignored); untie(%indexdb); untie(%locationsdb); untie(%titlesdb); $verbose_flag = $verbose_flag_tmp; return $result; } sub _match_expression { # recursively apply a keyword-search expression to indexdb # $expr may be either a string or a ref to an array of tokens # a ref to a "score" hash is returned (or undef sometimes) my ($expr, $index, $ignored) = @_; my $parsed = _parse_expression($expr); # _parse_expression() returns a reference to an array of three elements: # [ operator, left_expr, right_expr] # if right_expr is not defined then expr was atomic and left_expr is a string, # otherwise both right_expr and left_expr are references to arrays of tokens if ( not $parsed) { DEBUG("Syntax error :-( \n"); return undef; } my ( $op, $left,$right) = @$parsed; if ($left && not $right) { $left =~ s/^\s*\(?\s*|\s*\)?\s*$//g; DEBUG("Looking up >$left<\n"); my %matches = (); my $word = $left; my $rc = 0; my $keys = $index->{lc $word}; # get file-id's from indexdb $rc = _add_keys_to_match_hash($keys,\%matches); # if $rc is false then $word is a stop-word, see _add_keys_to_match_hash() for more info if (not $rc) { DEBUG("$word ignored\n"); push @$ignored, $word; return undef; # what should we do now? gotta think it over... } return \%matches; } DEBUG("Evaluating >$left< --$op-- >$right<\n"); my $left_match = _match_expression($left, $index, $ignored); my $right_match = _match_expression($right, $index, $ignored); return undef if ($syntax_error); my %matches = (); my $file = undef; if ($op eq 'AND' ) { %matches = ( %$left_match ); for $file( keys %matches) { delete $matches{$file} unless $right_match->{$file} } return \%matches; } if ($op eq 'AND NOT') { %matches = ( %$left_match ); for $file( keys %matches) { delete $matches{$file} if $right_match->{$file} } return \%matches; } if ($op eq 'OR') { %matches = ( %$left_match ); for $file( keys %$right_match) { if ($matches{$file}) { $matches{$file} +=$right_match->{$file}; } else { $matches{$file} =$right_match->{$file}; } } return \%matches; } return undef; } sub _parse_expression { my $arg = shift; my $tokens = undef; # this is an arry ref if (ref($arg) ne 'ARRAY') { $tokens = [ $arg =~ m/( \( | \)| \bAND\s+NOT\b | \bAND\b | \bOR\b | \"[^\"]+\" | \b\w+\b) /xig ]; } # important!! "AND NOT" is treated as a single logical operator... # this means that things like "not a and b" aren't well-formed, # while "b and not a" is else { $tokens = $arg; } my $left = undef; # array ref (oppure stringa se è un espressione atomica) my $right = undef; # array ref ! my $op = 'OR'; my $depth = 0; my $pos = 0; my $tok; my $len = int @$tokens; DEBUG("expr = ", join(" + ", @$tokens),"\n"); while (1) { if ($len == 1) { return [ undef, $tokens->[0], undef ]; } DEBUG("$tok : depth=$depth pos=$pos len=$len\n"); if ($depth == 0 && ($pos == $len) ) { if ($tokens->[0] eq '(' && $tokens->[$len-1] eq ')') { # take off outer parentheses... shift @$tokens; pop @$tokens; $len -= 2; $pos = 0; $depth = 0; DEBUG("expr = ", join(" + ", @$tokens),"\n"); } else { # ahhhh... this expression won't be parsed... $syntax_error = "Ill-formed expression (\"".join(' ', @$tokens)."\")"; DEBUG("atom not atomic\n"); return undef; } } elsif ( $pos == $len ) { $syntax_error = "Non-matching parentheses (\"".join(' ', @$tokens)."\")"; DEBUG("non matching parentheses\n"); return undef; } $tok = $tokens->[$pos++]; if ($tok eq '(') { $depth++; next; } if ($tok eq ')') { $depth--; next; } next if $depth; if ($tok =~ /\b(AND\s+NOT|AND|OR)\b/i) { if ($pos == 1 || $pos == $len) { $syntax_error = "Ill-formed expression (\"".join(' ', @$tokens)."\")"; return undef } $op = uc $1; $op =~ s/\s+/ /g; $left = [ @$tokens[0..$pos-2] ]; $right = [ @$tokens[$pos..$len-1] ]; DEBUG("right = ", join(" + ", @$right),"\n"); DEBUG("left = ", join(" + ", @$left),"\n"); return [ $op, $left, $right ]; } } } sub query { # simple query.... altavista +/- prefixes are recognized... # */? globbing also works but # slows query down significantly # globbing implicitly discards +/- prefix (it's a BUG!!!) my $self = shift; my $indexdbpath= $self->{indexdbpath}; my $locationsdbpath = $self->{locationsdbpath}; my $titlesdbpath = $self->{titlesdbpath}; my %indexdb; my %locationsdb; my %titlesdb; return undef unless (-f $indexdbpath && -r _); return undef unless (-f $locationsdbpath && -r _); return undef unless (-f $titlesdbpath && -r _); return undef unless tie_hash(\%indexdb,$indexdbpath, O_RDONLY ) && tie_hash(\%locationsdb,$locationsdbpath, O_RDONLY ) && tie_hash(\%titlesdb,$titlesdbpath, O_RDONLY ); my %matches; my %limit; my %exclude; my @ignored; my $key; my $word; my $mustbe_words = 0; my @words = (); my $glob_regexp = undef; for (@_) { # globbing feature... e.g. uni* passw? if ( /\*|\?/) { s/\*/\.\*/g; s/\?/\./g; $glob_regexp = $glob_regexp ? $glob_regexp."|^$_\$" : "^$_\$" ; } else { push @words, $_; } } if ($glob_regexp) { my $regexp = qr/$glob_regexp/; # collect all words in db matching $glob_regexp and append them to the query _push_words_from_hash(\%indexdb, \@words, $regexp); } DEBUG("looking up ", join(", ", @words ), "\n"); foreach $word (@words) { my $rc = 0; # DEBUG($word); if ($word =~ /^-(.*)/) { my $keys = $indexdb{lc $1}; $rc = _add_keys_to_match_hash($keys,\%exclude); } elsif ($word =~ /^\+(.*)/) { $mustbe_words++; my $keys = $indexdb{lc $1}; $rc = _add_keys_to_match_hash($keys,\%limit); } else { my $keys = $indexdb{lc $word}; $rc = _add_keys_to_match_hash($keys,\%matches); } # DEBUG("\n"); if (not $rc) { push @ignored, $word } } if ($mustbe_words) { for $key(keys %limit) { next unless $limit{$key} >= $mustbe_words; $matches{$key} += $limit{$key} ; } for $key(keys %matches) { delete $matches{$key} unless $limit{$key}; } } for $key(keys %exclude) { delete $matches{$key}; } my $result = _make_result_hash(\%matches,\%locationsdb, \%titlesdb, \@words, \@ignored); untie(%indexdb); untie(%locationsdb); untie(%titlesdb); return $result; } sub _make_result_hash { # hash-ref hash-ref hash-ref array-ref array-ref my ( $match, $locationsdb, $titlesdb, $words, $ignored ) = @_; my $result = { searched => $words, ignored => $ignored, entries => [] }; my $key; foreach $key (keys %$match) { my $ckey = pack("xN",$key); my $name = $locationsdb->{$ckey}; my $title = $titlesdb->{$ckey}; push @{ $result->{entries} }, { location => $name, score => $match->{$key}, title => $title }; DEBUG("$name: $match->{$key}\n"); } return $result; } sub DEBUG (@) { $verbose_flag && print STDERR @_ }; sub tie_hash { my ($hashref, $file ,$perm) = @_; $perm = (O_RDWR|O_CREAT) unless defined $perm; my $rc = tied(%$hashref); return $rc if $rc; $rc = tie(%$hashref,'DB_File',$file, $perm, 0644, $DB_File::DB_BTREE) ; if ($debug_flag) { my $count = int keys %$hashref; DEBUG("tie $hashref ($rc) ($count keys)\n"); } elsif ($verbose_flag) { DEBUG("tie $hashref ($rc)\n"); } return $rc; } sub untie_hash { my ($hashref, $file ) = @_; if ($debug_flag) { my $count = int keys %$hashref; DEBUG("untie $hashref ($count keys)\n") } untie(%$hashref); } 1; #__END__
# #-------------------- the following code is only used when indexing ---------------- # sub dump_word_stats { my $self = shift; my $fh = shift || \*STDOUT; my $indexdbpath= $self->{indexdbpath}; my %indexdb; die unless (-f $indexdbpath && -r _); tie_hash(\%indexdb,$indexdbpath, O_RDONLY ); my %index = ( %indexdb ); my $w; for $w( sort { length($index{$b}) <=> length($index{$a}) } keys %index ) { print $fh $w, "\t", length($index{$w}) / 2, "\n"; } untie_hash(\%indexdb); } sub start_indexing_session { my $self = shift; $self->rollback_indexing_session; my $key = 0; my $indexdbpath = $self->{indexdbpath}; my $locationsdbpath = $self->{locationsdbpath}; my $titlesdbpath = $self->{titlesdbpath}; my $filemask = $self->{filemask}; my $keyref = \$key; my $filelistfile = $indexdbpath; $filelistfile =~ s/(\.db)?$/\.filelist/; open FILELIST, ">".$filelistfile; my $session = { indexdbpath => $indexdbpath, locationsdbpath => $locationsdbpath, titlesdbpath => $titlesdbpath, indexdb => { }, locationsdb => { }, titlesdb => { }, cachedb => { }, filemask => $filemask, current_key => 16, # first 16 values are reserved (0 = word is ignored) bytes => 0, count => 0, filecount => 0, listfh => \*FILELIST, status_THE => 0, followsymlinks => $self->{followsymlinks}, minwordsize => $self->{minwordsize}, ignoreword => {}, autoignore => 1, ignorelimit => $self->{ignorelimit} || (2/3), level => $self->{level}, url_exclude => $self->{url_exclude}, file_reader => $self->{file_reader}, use_inode => $self->{use_inode}, no_reset => $self->{no_reset}, }; unlink $indexdbpath."~"; unlink $locationsdbpath."~"; unlink $titlesdbpath."~"; if( $self->{no_reset} ) { copy( $indexdbpath, $indexdbpath."~" ); copy( $locationsdbpath, $locationsdbpath."~" ); copy( $titlesdbpath, $titlesdbpath."~" ); } tie_hash($session->{indexdb}, $indexdbpath."~" ) or die "$indexdbpath: $!\n"; tie_hash($session->{locationsdb}, $locationsdbpath."~" ) or die $!; tie_hash($session->{titlesdb},$titlesdbpath."~" ) or die $!; my $ignorefile = $indexdbpath; $ignorefile =~ s/(\.db)?$/\.stopwords/; if (-r $ignorefile) { # read *-stopwords.dat file open F, $ignorefile; while (<F>) { chomp; s/^\s+|\s+$//g; $session->{ignoreword}->{$_} = 1; } close F; my $count = int keys %{ $session->{ignoreword} }; DEBUG("using stop-words from $ignorefile ($count words)\n"); $session->{autoignore} = 0; } $session->{ignorefile} = $ignorefile; my $time = time(); $session->{start_time} = $time; $self->{session} = $session; } sub index_default_locations { my $self = shift; my $session = $self->{session}; return unless $session; my $dirs = $self->{dirs}; my $urls = $self->{urls}; my $filecount = 0; DEBUG("Counting files...\n") if int @$dirs; my $dir; for $dir( sort @$dirs) { $filecount += IndexDir($session, $dir, 1); } $session->{filecount} = $filecount; for $dir( sort @$dirs) { IndexDir($session, $dir); } for my $url( sort @$urls) { IndexWeb($session, $url); } } sub index_content { my $self = shift; my $session = $self->{session}; return unless $session; my $info = shift; if( ref($info) ne 'HASH' ) { warn("usage: \$src->index_content( { content=>'...', id=>'...', title=>'...' } )\n"); return undef; } IndexFile( $session, $info->{id}, $info->{content}, $info->{title} ); return 1; } sub rollback_indexing_session { my $self = shift; my $session = $self->{session}; return unless $session; untie_hash($session->{indexdb}); untie_hash($session->{locationsdb}); untie_hash($session->{titlesdb}); my $indexdbpath = $self->{indexdbpath}; my $locationsdbpath = $self->{locationsdbpath}; my $titlesdbpath = $self->{titlesdbpath}; unlink $indexdbpath."~"; unlink $locationsdbpath."~"; unlink $titlesdbpath."~"; $self->{session} = undef; } sub DESTROY { my $self = shift; $self->rollback_indexing_session; } sub commit_indexing_session { my $self = shift; my $session = $self->{session}; return unless $session; FlushCache($session->{cachedb}, $session->{indexdb}, $session); my $time = time()-$session->{start_time}; DEBUG("$session->{bytes} bytes read, $session->{count} files processed in $time seconds\n"); untie_hash($session->{indexdb}); untie_hash($session->{locationsdb}); untie_hash($session->{titlesdb}); my $indexdbpath = $self->{indexdbpath}; my $locationsdbpath = $self->{locationsdbpath}; my $titlesdbpath = $self->{titlesdbpath}; rename $indexdbpath."~", $indexdbpath; rename $locationsdbpath."~", $locationsdbpath ; rename $titlesdbpath."~", $titlesdbpath; close $session->{listfh}; if ( $session->{autoignore} ) { my $ignorefile = $session->{ignorefile}; open F, ">".$ignorefile; #write *-stopwords.dat file print F join( "\n", sort keys %{ $session->{ignoreword} } ); close F; } $self->{session} = undef; } sub makeindex { my $self = shift; $self->start_indexing_session(); $self->index_default_locations(); $self->commit_indexing_session(); } sub IndexDir { my ($session, $dir, $only_recurse) = @_; my $followsymlinks = $session->{followsymlinks}; my $file_reader = $session->{file_reader}; opendir D, $dir; # DEBUG "D $dir\n"; my @files = readdir D; close D; my $e; my $count = 0; my $text; for $e(@files) { next if $e =~ /^\.\.?/; my $path = $dir."/".$e; if (-d $path) { unless ($followsymlinks) { next if -l $path ; } $count += IndexDir($session,$path, $only_recurse); } elsif (-f _ ) { my $filemask = $session->{filemask}; if ($filemask) { next unless $e =~ $filemask; } unless ($only_recurse) { if( $file_reader ) { $text = $file_reader->read( $path ); IndexFile($session,$path,$text); } else { IndexFile($session,$path); } } $count ++; } } return $count; } sub IndexFile { my ($session, $file, $text, $title ) = @_; my $cachedb = $session->{cachedb}; my $locationsdb = $session->{locationsdb}; my $key = $session->{current_key}; if( $session->{use_inode} ) { $key = (stat($file))[1]; } my $no_of_files = $session->{filecount}; if( $session->{no_reset} ) { if( exists $locationsdb->{pack"xN",$key} ) { warn("key $key already in locationsdb. Skipping\n"); return; } } DEBUG $session->{count}+1, "/$no_of_files $file (id=$key)\n"; my $fh = $session->{listfh}; print $fh "$key\t$file\n"; local $/; unless (defined $text) { undef $/; open(FILE, $file); ($text) = <FILE>; # Read entire file close FILE; } my $filesize = length($text); if ($file =~ /\.s?htm.?/i ) { $text =~ /<title[^>]*>([^<]+)<\/title/i ; $title = $1; $title =~ s/\s+/ /g; $text =~ s/<[^>]*>//g; # strip all HTML tags } if( defined $title ) { $session->{titlesdb}->{pack"xN",$key} = $title; # put title in db DEBUG("* \"$title\"\n"); } # index all the words under the current file-id my($wordsIndexed) = &IndexWords($cachedb, $text,$key, $session); $session->{current_key}++; DEBUG "* $wordsIndexed words\n"; # map file-id (key) to this filename $locationsdb->{pack"xN",$key} = $file; # leading null is here for # historical reasons :-) $session->{bytes} += $filesize; $session->{count}++; $session->{_temp_size} += $filesize; if ($session->{_temp_size} > 2000000 ) { my $rc = 0; $rc = FlushCache($cachedb, $session->{indexdb}, $session); if (! $rc ) { tie_hash($session->{indexdb}, $session->{indexdbpath}) or die $!; untie_hash($session->{indexdb}); $rc = FlushCache($cachedb, $session->{indexdb}, $session); die $! if not $rc; } $session->{_temp_size} = 0; $session->{cachedb} = {}; } } sub IndexWords { my ($db, $words, $fileKey, $session) = @_; # hash content file-id options my (%worduniq); # for unique-ifying word list my $minwordsize = $session->{minwordsize}; my (@words) = split( /[^a-zA-Z0-9\xc0-\xff\+\/\_]+/, lc $words); # split into an array of words @words = grep { $worduniq{$_}++ == 0 } # remove duplicates grep { length > $minwordsize } # must be longer than one character grep { s/^[^a-zA-Z0-9\xc0-\xff]+//; $_ } # strip leading punct grep { /[a-zA-Z0-9\xc0-\xff]/ } # must have an alphanumeric @words; # " foreach (sort @words) { " for (@words) { # no need to sort here, my $a = $db->{$_}; # we will sort when cache is flushed $a .= pack "N",$fileKey; # appending packed file-id's $db->{$_} = $a; } return int @words; } sub FlushCache { my ($source, $dest, $session) = @_; # flush source hashe into dest.... # %$dest is supposed to be tied, otherwise the whole # thing doens't make much sense... :-) my $scount = int keys %$source ; my $ucount = 0; my $acount = 0; if ($scount == 0) { die "error: 0 words in cache\n"; } # my $wordcount = int keys %$dest; # if ($wordcount < $session->{wordcount}) { # warn "indexdb has lost entries (now $wordcount, were $session->{wordcount}) \n"; # return undef; # } # $session->{wordcount} = $wordcount; # DEBUG("$wordcount words in database\n"); my $objref = tied %$dest ; DEBUG("flushing $scount words into $dest ($objref)\n"); my $filecount = $session->{count}; my $autoignore = $session->{autoignore}; my $ignorethreshold = int ( $filecount * $session->{ignorelimit} ); my $w; WORD: for $w(sort keys %$source) { my $data = $source->{$w}; if ($session->{ignoreword}->{$w} ) { DEBUG("ignoring '$w' \n"); $data = pack("N*", ( 0 ) ); # id = 0 means $w is a stop-word } elsif (defined $dest->{$w}) { my %uniq = (); my $keys = $dest->{$w} . $data ; my $keycount = length($keys)/2; # dividing by 2 $ucount++; ## my @keys = unpack("n*", $keys) ; ## my $keycount = @keys; ## ## if ($keys[0] == 0 ) { # skip ignored word ## DEBUG("skipping '$w' \n"); ## next WORD; ## } els if ($autoignore && ($filecount > 100) && ($keycount > $ignorethreshold ) ) { DEBUG("word '$w' will be ignored (found in $keycount of $filecount files)\n"); # ignored words are associated to file-id 0 ## @keys = ( 0 ); $keys = pack("N*", 0); $session->{ignoreword}->{$w} = 1; } ## @keys = grep { $uniq{$_}++ == 0} @keys; ## $data = pack("n*", @keys); $data = $keys; ## if ($verbose_flag && ( $w eq "the" ) ) { ## my $len = int(@keys); ## if ($len < $session->{status_THE} ) { ## die "panic: problem with word 'the'"; ## } ## $session->{status_THE} = $len; ## DEBUG("word 'the' found in $len files \n"); ## } } else { $acount++; } $dest->{$w} = $data; # if ($dest->{$w} ne $data) { # warn "unexpected error: \$w=$w\n"; # return undef; # } } DEBUG("$ucount words updated, $acount new words added\n"); if ($debug_flag) { my $wordcount = int keys %$dest; if ($wordcount < $session->{wordcount}) { warn "indexdb has lost entries (now $wordcount, were $session->{wordcount}) \n"; return undef; } $session->{wordcount} = $wordcount; DEBUG("$wordcount words in database\n"); } return 1; } sub IndexWeb { my ($session, $url) = @_; require MMM::Text::Search::Inet; my $req = new HTTPRequest { AutoRedirect => 1 }; my %fetched = (); $req->set_url($url); my $host = $req->host(); $session->{req} = $req; $session->{fetched} = \%fetched; $session->{host} = $host; my $deadlinksfile = $session->{indexdbpath}; $deadlinksfile =~ s/(\.db)?$/\.deadlinks/; open DL, ">".$deadlinksfile; $session->{deadlinksfh} = \*DL; recursive_fetch($session, $url, "", 0); } sub recursive_fetch { my ($session, $URL, $parent, $level) = @_; my $req = $session->{req}; $req->reset(); $req->set_url($URL); my $url = $req->url(); return unless $req->host() eq $session->{host}; return if $session->{fetched}->{$url}; $session->{fetched}->{$url} = 1; return unless $req->get_page(); my $status = $req->status(); DEBUG( ">>> $url ($status)\n"); if ( $status != 200 ) { my $fh = $session->{deadlinksfh}; my $url = $req->url(); print $fh $status, "\t", $url, "(", $req->{_URL},")", "\t", $parent, "\n"; return; }; my $base = $req->base_url(); my $content_ref = $req->content_ref(); my $header = $req->header(); IndexFile($session, $url, $$content_ref); return if ($session->{level} && $level >= $session->{level}); $$content_ref =~ s/<!--.*?-->//gs; #remove comments my @links = $$content_ref =~/href=([^>\s]+)/ig; #extract hyperlinks my $count = 0; my $exclude_re = $session->{url_exclude}; for(@links) { s/\"|\'//g; next if m/^(ftp|mailto|gopher|news):/; next if m/^$exclude_re$/o; my $link = /^http/ ? $_ : join("/",$base,$_); $link =~ s/#.*//; $count++; recursive_fetch($session,$link, $url, $level + 1); } } 1; __END__