| onsearch documentation | Contained in the onsearch distribution. |
OnSearch::Search - Search library module.
OnSearch::Search searches Web site indexes generated by onindex(8) and sends the names of matching documents to OnSearch::Results(3).
The subroutine, search (), is typically called by the application with a OnSearch::CGIQuery object that contains the search word or words, the type of search, whether to match the case of the search term, and whether to match complete or partial words.
OnSearch(3), onindex(8)
| onsearch documentation | Contained in the onsearch distribution. |
package OnSearch::Search;
#$Id: Search.pm,v 1.32 2005/08/22 13:45:51 kiesling Exp $ use strict; use warnings; use POSIX; use Socket; use OnSearch; use OnSearch::AppConfig; use OnSearch::Cache; use OnSearch::StringSearch; use OnSearch::WebLog; my ($VERSION)= ('$Revision: 1.32 $' =~ /:\s+(.*)\s+\$/); require Exporter; require DynaLoader; our (@ISA, @EXPORT); @ISA = qw(Exporter DynaLoader); @EXPORT = (qw/search collate text_string_search save_sid clean_sids clean_results s_write/); my $logfunc = \&OnSearch::WebLog::clf; my ($chldpid, $sid); my $OPENTAG = qr'^<file path="(.*?)">'o; my $CLOSETAG = qr'^</file>'o; sub search { my $q = shift; my $tlds = $_[0]; # Don't re-spawn when parent exits. Then re-assign signal handler # in the child process. $SIG{CHLD} = \&ignore_signal; FORK: if ($chldpid = fork ()) { return $chldpid; } elsif (defined $chldpid) { setpgrp (0,0); FORK2: if ($sid = fork ()) { $q -> {sid} = $sid; } elsif (defined $sid) { ### ### See the comments in WebClient.pm. ### chdir '/' || die "OnSearch: Could not chdir /: $!\n"; close STDIN; close STDOUT; close STDERR; OnSearch::WebLog::clf ('notice', "Search started PID $$."); s_write ($q->{ppid}, '<results>'); if (caching_enabled ()) { my @cachehits = cache_retrieve ($q->{ppid}, $q->{searchterm}, $q->{searchtermlist}, $q->{matchtype}, $q->{matchcase}); ### ### Collate cache hits. ### $q -> {cachehits} = _new_hash_ref (); if ($#cachehits >= 0) { foreach my $cacheref (@cachehits) { my ($cachefn) = ($cacheref =~ $OPENTAG); ${$q -> {cachehits}}{$cachefn} = $cachefn unless exists ${$q->{cachehits}}{$cachefn}; } } } foreach (@{$tlds}) { subdir ($q, $_); # tld); } s_write ($q->{ppid}, '</results>'); ### ### TO DO Make this into an onsearch.cfg directive. ### OnSearch::WebLog::clf ('notice', "Search ended PID $$."); return 0; } elsif ($! =~ /No more processes|Resource temporarily unavailable/) { OnSearch::WebLog::clf ('error', "search () $! PID $$."); sleep 2; redo FORK2; } else { OnSearch::WebLog::clf ('error', "search () $! PID $$."); } } elsif ($! =~ /No more processes|Resource temporarily unavailable/) { OnSearch::WebLog::clf ('error', "search () $! PID $$."); sleep 2; redo FORK; } else { OnSearch::WebLog::clf ('error', "search () $! PID $$."); } if ($?) { OnSearch::WebLog::clf ('error', "search exited $?."); exit $?; } return $$; } my $idxexpr = '.onindex.idx'; my $searchstr = \&OnSearch::StringSearch::_strindex; sub subdir { my $q = shift; my $top_dir = shift; my (@direntries, $path, $ent, $r); opendir DIR, $top_dir || die "$top_dir: $!"; @direntries = readdir DIR; closedir DIR; foreach $ent (@direntries) { chomp $ent; next if ($ent eq '.') || ($ent eq '..'); $path = $top_dir . '/' . $ent; no warnings; if (-d $path) { use warnings; if (! $q -> {excludedirs} || scalar grep ("$path", @{$q->{excludedirs}})) { $r = subdir ($q, $path); } ### ### A string match here would also match ### backup indexes. ### } elsif ($ent eq $idxexpr) { $r = postings ($q, $path); } } # $#direntries = -1; return undef; } ### ### This is where the work of searching actually occurs. ### First we check that the result was not already sent ### after the search of cached results. Then each target ### <file path="..."> section of the index gets parsed. We ### check each <word chars="...">...</word> posting for a ### match against the search regex, compiled in ### Regex.pm. ### ### If the user selects the match type, "any," we simply send all ### of the postings. If the user's match type is, "all," or, ### "exact," the collate function determines that the target ### file contains all of the search terms. Target files of "exact" ### matches then get searched for the exact phrase. The index posting ### of an exact phrase match is rewritten so that we can cache ### the entire phrase. That way, we need to read the target file only ### once. ### ### As many of the search parameters as possible are initialized in ### search.cgi, and the code here is designed fast, without ### attempting optimizations that would add significantly to the ### application's complexity. ### sub postings { my $q = $_[0]; my $idxfile = $_[1]; my ($l, $is, @postlines, $postbuf, $targetfn, $targetfnexp, $r, $cacheref); my (%cachehits, $cachefn); my ($indexfh, @indexlines); my ($s, $str, $im); my $idxref = _new_array_ref (); my $cached_results = caching_enabled (); ### ### Suppress warnings about reopening standard I/O channels. ### no warnings; open $indexfh, "$idxfile" || do { warn ("$idxfile: $!"); return; }; use warnings; $is = $im = 0; while (defined ($str = <$indexfh>)) { chomp $str; if (defined (&$searchstr ('</index>', $str))) { close ($indexfh); return undef; } ### ### If changing this code, remember that $targetfn needs ### to be saved somewhere. ### if (defined (&$searchstr ('<file path=', $str))) { ($targetfn) = ($str =~ $OPENTAG); ### ### Keep going if the result was already cached. ### $postbuf = "$str\n"; next if (exists ${$q->{cachehits}}{$targetfn}); $is = 1; next; } if ($is) { $str = lc ($str) unless $q -> {nmatchcase}; if ($str =~ $q -> {regex}) { $im = 1; $postbuf .= "$str\n"; } } if (defined (&$searchstr ('</file>', $str)) && $is) { $postbuf .= "$str\n"; if ($im) { if (&{$q->{sfptr}} ($q, \$postbuf)) { ### ### Posting should already have matched ### $q->{partword}. ### add_to_cache ($q->{ppid}, posting_to_cache ($postbuf, $q->{searchtermlist},$q->{matchtype},$q->{matchcase})) if caching_enabled (); } } $is = 0; $#postlines = -1; } } close ($indexfh); return undef; } sub _new_array_ref { my @a = (); return \@a; } sub _new_hash_ref { my $h = {}; return $h; } sub text_string_search { my $q = $_[0]; my $postbufref = $_[1]; ### ### Collate () has already filtered documents that don't ### contain all of the words in the search phrase. ### my ($vf, $offset_ref, $path, $buf, $bufnc, $content, $stnc, @l); my ($completewords, $partialwords); $completewords = 0; @l = split /\n/, $$postbufref; ($path) = ($l[0] =~ $OPENTAG); $vf = OnSearch::VFile -> new; return undef unless $vf -> vfopen ($path); $content = ''; while (1) { $buf = $vf -> vfread (1024); if ($q -> {nmatchcase}) { $bufnc = $buf; } else { $bufnc = lc $buf; } $content .= $bufnc; last if length ($buf) < 1024; } $vf -> vfclose; $content =~ s/\n/ /gs; if ($q -> {nmatchcase}) { $stnc = $q -> {searchterm}; } else { $stnc = lc $q -> {searchterm}; } $offset_ref = OnSearch::StringSearch::_search_string ($stnc, $content); if ($#{$offset_ref} < 0) { return undef; } ### ### Check for complete word matches. This workaround is necessary ### because _search_string can match partially; for example, ### "file dialog box," also matches "file dialog boxes." ### if ($q -> {partword} =~ /no/) { foreach my $offset (@{$offset_ref}) { my $beforechar = substr ($content, $offset-1, 1); my $afterchar = substr ($content, $offset + length($q->{searchterm}), 1); if ("$beforechar$afterchar" =~ /\W\W/) { $completewords = 1; last; } } } else { $partialwords = 1; } if (($completewords || $partialwords) && defined ($$postbufref = rewrite_posting ($stnc, $offset_ref, \@l))) { return $postbufref; } else { return undef; } } ### ### Rewrite posting for the complete phrase results of exact ### match searches. ### sub rewrite_posting { my $term = shift; my $offset_ref = shift; my $postrefs = shift; my ($offset_str, $lcterm); return undef if ($#{$offset_ref} < 0); $offset_str = join ',', @$offset_ref; $lcterm = lc ($term); $postrefs -> [1] = qq| <word chars="$lcterm">$offset_str</word>|; $postrefs -> [2] = qq|</file>|; $#{$postrefs} = 2; return join "\n", @$postrefs; } ### ### Collate () determines that a target file contains all of ### the required search terms. Each hash elements is a ### true/(implied) false vector for each search term, and if ### a vector for a search term is missing, the search fails. ### sub collate { my $q = shift; my $postbuf = shift; my ($p1, $m_str, $st, @l, %vec); @l = split /\n/, $postbuf; return 0 unless (($#l - 2) >= $#{$q->{searchtermlist}}); ### ### Loop through the <word chars=... tags. ### foreach $p1 (@l[1..$#l - 1]) { next unless $p1 =~ $q->{collateregex}; $m_str = $1; $m_str = lc $m_str unless $q->{nmatchcase}; $vec{$m_str} = 1 unless $vec{$m_str}; } foreach $st (@{$q -> {searchtermlist}}) { $st = lc $st unless $q -> {nmatchcase}; return 0 unless $vec{$st}; } return 1; } sub save_sid { my $sid = $_[0]; sysopen (S, OnSearch::AppConfig->str('DataDir') . '/' . $sid, O_CREAT | O_WRONLY); close S; } ### ### Retrieve session id before cleaning.... ### sub clean_sids { my (@dirents, $r); opendir (D, OnSearch::AppConfig->str('DataDir')) || browser_die ("clean_sids: " . OnSearch::AppConfig->str('DataDir') . ": $!\n"); @dirents = readdir (D); foreach my $ent (@dirents) { next if ($ent =~ /\.|\.\./); if ($ent =~ /^\d+$/) { if (($r = kill (0, $ent)) == 0) { if (($r = unlink (OnSearch::AppConfig->str ('DataDir') . '/' . $ent)) == 0) { warn "clean_sids unlink $ent: $!"; } } } } closedir D; } sub clean_results { my $exptime = OnSearch::AppConfig->str ('ResultsPersist'); my $rd = OnSearch::AppConfig->str ('DataDir'); opendir (D, $rd) || browser_die ("clean_results: $rd: $!\n"); my @dirents = readdir D; foreach my $ent (@dirents) { if ($ent =~ /session\.\d+/) { if (time - (stat("$rd/$ent"))[9] > $exptime) { OnSearch::WebLog::clf ('notice', "removing out of date $ent" ); unlink "$rd/$ent"; } } } closedir D; } sub s_write { my $session_id = shift; my $buf = shift; my ($name, $clientfh, $serverfh, $r, $buflength); $name = '/tmp/.onsearch.sock.' . $session_id; socket ($serverfh, PF_UNIX, SOCK_STREAM, 0) || die "OnSearch: s_write socket: $!"; if (-S $name && ! unlink ($name)) { &$logfunc ('error', "s_write unlink: $!\n"); } bind ($serverfh, sockaddr_un($name)) || &$logfunc ('notice', "s_write bind: $!."); listen ($serverfh, SOMAXCONN) || &$logfunc ('notice', "s_write listen: $!."); accept ($clientfh, $serverfh) || &$logfunc ('notice', "s_write accept: $!."); if (fileno ($clientfh)) { $buflength = length ($buf); if (($r = syswrite ($clientfh, $buf)) != $buflength) { &$logfunc ('error', "s_write error $r chars of $buflength written: $!."); } close $clientfh; close $serverfh; } return; } __END__ 1;