/usr/local/CPAN/onsearch/OnSearch/Utils.pm
package OnSearch::Utils;
#$Id: Utils.pm,v 1.16 2005/08/16 05:34:03 kiesling Exp $
use strict;
use warnings;
use Carp;
use Config;
use Socket;
my $VERSION='$Revision: 1.16 $';
use OnSearch;
use OnSearch::Base64;
use OnSearch::WebLog;
require Exporter;
require DynaLoader;
our (@ISA, @EXPORT);
@ISA = qw(Exporter DynaLoader);
@EXPORT = (qw/http_unescape hex_to_char filetype document_urls
new_array_ref http_date str_in_list valid_lock
basename sigwrapper run_onindex signumber
client_write/);
my $logfunc = \&OnSearch::WebLog::clf;
sub http_unescape {
my $uri = $_[0];
my ($c, $j, $sp);
$uri =~ s/\+/ /g;
if ($uri =~ /\%/) {
for ($j = 0; $j < length($uri); $j++) {
if (substr ($uri, $j, 1) eq '%') {
$c = substr ($uri, $j+1, 2);
$sp .= hex_to_char ($c);
$j += 2;
} else {
$sp .= substr ($uri, $j, 1);
}
}
} else {
$sp = $uri;
}
return $sp;
}
sub hex_to_char {
my $hexdigit = $_[0];
my $hexchars = { '0A' => ' ', '0D' => ' ',
'20' => ' ', '21' => '!', '22' => '"', '23' => '#',
'24' => '$', '25' => '%', '26' => '&', '27' => '\'',
'28' => '(', '29' => ')', '2A' => '*', '2B' => '+',
'2C' => ',', '2D' => '-', '2E' => '.', '2F' => '/',
'30' => '0', '31' => '1', '32' => '2', '33' => '3',
'34' => '4', '35' => '5', '36' => '6', '37' => '7',
'38' => '8', '39' => '9', '3A' => ':', '3B' => ';',
'3C' => '<', '3D' => '=', '3E' => '>', '3F' => '?',
'40' => '@', '41' => 'A', '42' => 'B', '43' => 'C',
'44' => 'D', '45' => 'E', '46' => 'F', '47' => 'G',
'48' => 'H', '49' => 'I', '4A' => 'J', '4B' => 'K',
'4C' => 'L', '4D' => 'M', '4E' => 'N', '4F' => 'O',
'50' => 'P', '51' => 'Q', '52' => 'R', '53' => 'S',
'54' => 'T', '55' => 'U', '56' => 'V', '57' => 'W',
'58' => 'X', '59' => 'Y', '5A' => 'Z', '5B' => '[',
'5C' => '\\','5D' => ']', '5E' => '^', '5F' => '_',
'60' => '`', '61' => 'a', '62' => 'b', '63' => 'c',
'64' => 'd', '65' => 'e', '66' => 'f', '67' => 'g',
'68' => 'h', '69' => 'i', '6A' => 'j', '6B' => 'k',
'6C' => 'l', '6D' => 'm', '6E' => 'n', '6F' => 'o',
'70' => 'p', '71' => 'q', '72' => 'r', '73' => 's',
'74' => 't', '75' => 'u', '76' => 'v', '77' => 'w',
'78' => 'x', '79' => 'y', '7A' => 'z', '7B' => '{',
'7C' => '|', '7D' => '}', '7E' => '~',
### Note: These are rendered in X Window System fonts.
'C0' => 'À', 'C1' => 'Á', 'C2' => 'Â', 'C3' => 'Ã',
'C4' => 'Ä', 'C5' => 'Å', 'C6' => 'Æ', 'C7' => 'Ç',
'C8' => 'È', 'C9' => 'É', 'CA' => 'Ê', 'CB' => 'Ë',
'CC' => 'Ì', 'CD' => 'Í', 'CE' => 'Î', 'CF' => 'Ï',
'D0' => 'Ð', 'D1' => 'Ñ', 'D2' => 'Ò', 'D3' => 'Ó',
### Don't include multiplication sign.
'D4' => 'Ô', 'D5' => 'Õ', 'D6' => 'Ö',
'D8' => 'Ø', 'D9' => 'Ù', 'DA' => 'Ú', 'DB' => 'Û',
'DC' => 'Ü', 'DD' => 'Ý', 'DE' => 'Þ',
### This is the X Window rendering of small sharp s.
'DF' => 'ß',
'E0' => 'à', 'E1' => 'á', 'E2' => 'â', 'E3' => 'ã',
'E4' => 'â', 'E5' => 'å', 'E6' => 'æ', 'E7' => 'ç',
'E8' => 'è', 'E9' => 'é', 'EA' => 'ê', 'EB' => 'ë',
'EC' => 'ì', 'ED' => 'í', 'EE' => 'î', 'EF' => 'ï',
### Rendering of small eth.
'F0' => 'ð',
'F1' => 'ñ', 'F2' => 'ò', 'F3' => 'ó',
### Don't include division sign.
'F4' => 'ô', 'F5' => 'õ', 'F6' => 'ö',
'F8' => 'ø', 'F9' => 'ù', 'FA' => 'ú', 'FB' => 'û',
'FC' => 'ü', 'FC' => 'ý', 'FE' => 'þ',
'FF' => 'ÿ'};
return $hexchars -> {$hexdigit};
}
sub new_array_ref { my @a; return \@a; }
###
### If adding magic types, also add to OnSearch::VFile::vf_ftype
###
sub filetype {
my $fname = $_[0];
my $type = 'text/plain';
my $buf;
my $size = 1024;
sysopen (F, $fname, 0) or return -1; # O_RDONLY
return undef unless sysread (F, $buf, $size);
if ($buf =~ /\<html/ism) { $type = 'text/html'; }
if ($buf =~ /\<\?xml/ism) { $type = 'text/html'; }
if ($buf =~ /^%!PS-Adobe/) { $type = 'application/postscript'; }
if ($buf =~ /^%PDF-/) { $type = 'application/pdf'; }
if ($buf =~ /^PK\003\004/) { $type = 'application/zip'; }
if ($buf =~ /^\037\213/) { $type = 'application/x-gzip'; }
if ($buf =~ /^GIF8/) { $type = 'image/gif'; }
if ($buf =~ /^\211PNG/) { $type = 'image/png'; }
if ($buf =~ /^\037\235/) { $type = 'application/compress'; }
if ($buf =~ /^\312\376\272\276/) { $type = 'application/java-class'; }
no warnings;
if (substr ($buf, 6, 4) eq 'JFIF') { $type = 'image/jpeg'; }
if (substr ($buf, 24, 22) eq 'outname=install.sfx.$$')
{ $type = 'application/vnd.sun.pkg'; }
use warnings;
close F;
return $type;
}
sub document_urls {
my $doc = $_[0];
my $base_url = $_[1];
my ($subdoc, $url);
my (%links, @sortedlinks, @uniqlinks);
if ($doc =~ /<base\s*href/is) {
($base_url) = ($doc =~ /<base\s*href\s*=\s*"(.*?)"/);
($base_url) =~ s/(\w+:\/\/[^:\/]+).*/$1/;
}
my $regex = qr/href\s*?=\s*?"(.*?)["#] |
<frame\s*?src\s*?=\s*?"(.*?)["#] /imsx;
$subdoc = $doc;
while (length ($subdoc) && ($subdoc =~ $regex)) {
$url = '';
($url) = ($subdoc =~ $regex);
$subdoc = $';
# This leads to lots of warnings and bad matches if included
# in $regex.
next if ($url =~ /^\.+/) || ! length ($url);
$url = "$base_url/$url" unless $url =~ /http\:/;
$links{$url} = '' unless exists $links{$url};
}
@sortedlinks = sort (keys %links);
return @sortedlinks;
}
my @months = (qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/);
my @wdays = (qw/Sun Mon Tue Wed Thu Fri Sat/);
sub http_date {
my $adv_secs = $_[0];
my $now = time();
my $later = $now + $adv_secs;
my @d_array = gmtime ($later);
my $datestr = sprintf ("%s, %02d-%s-%d %02d:%02d:%02d GMT",
$wdays[$d_array[6]], # weekday
$d_array[3], # date
$months[$d_array[4]], # month
$d_array[5] + 1900, # year
$d_array[2], # hour
$d_array[1], # min
$d_array[0]); # sec
return $datestr;
}
sub str_in_list {
my $s = shift;
my $listref = shift;
my $match_case = shift;
$s = lc $s unless $match_case;
foreach my $l (@{$listref}) {
$l = lc $l unless $match_case;
return 1 if $l eq $s;
}
return undef;
}
sub valid_lock {
my $lockfn = shift;
my ($l, $r, $lockfh);
return undef unless (-f $lockfn);
local $!;
###
### Suppress warnings about opening standard I/O channels
###
no warnings;
open $lockfh, "$lockfn" or warn "valid_lock $lockfn: $!";
use warnings;
while (defined ($l = <$lockfh>)) {
chomp $l;
$r = kill 0, $l;
if (!$r) {
OnSearch::WebLog::clf ('notice',
"Removing stale lock $lockfn ID $l");
unlink $l;
}
}
close ($lockfh);
return $r;
}
sub basename {
my $pathname = shift;
my $basename;
if ($pathname =~ /\\/) {
$basename = substr ($pathname, rindex ($pathname, '\\') + 1);
} elsif ($pathname =~ /\//) {
$basename = substr ($pathname, rindex ($pathname, '/') + 1);
} else {
$basename = $pathname;
}
return $basename
}
sub sigwrapper {
my ($signame, $sigsub, $wrapsub, @args) = @_;
my $oldsig = $SIG{$signame} if $SIG{$signame};
$SIG{$signame} = ($sigsub ? $sigsub : 'IGNORE');
&$wrapsub (@args);
$SIG{$signame} = $oldsig if $oldsig;
}
###
### TO DO - this so far is the most reliable way to index
### immediately. Try to clean up this implementation.
###
sub run_onindex {
my $txt = `/usr/local/etc/init.d/onindex index`;
OnSearch::WebLog::clf ('notice', "Onindex: $txt");
}
sub signumber {
my $signame = $_[0];
my (%sigs, $number, $name);
$number = 0;
foreach $name (split (' ', $Config{sig_name})) {
$sigs{$name} = $number++;
}
return $sigs{$signame};
}
sub client_write {
my $session_id = shift;
my $buf = shift;
my ($name, $clientfh, $serverfh);
$name = '/tmp/.onsearch.sock.' . $session_id;
socket ($serverfh, PF_UNIX, SOCK_STREAM, 0) ||
die "OnSearch: client_write socket: $!";
if (-S $name && ! unlink ($name)) {
&$logfunc ('error', "client_write unlink: $!\n");
}
bind ($serverfh, sockaddr_un($name)) ||
warn ("client_write bind: $!.");
listen ($serverfh, SOMAXCONN) ||
warn ("client_write listen: $!.");
accept ($clientfh, $serverfh) ||
warn ("client_write $$ accept: $!.");
if (fileno ($clientfh)) {
syswrite ($clientfh, $buf);
close $clientfh;
close $serverfh;
}
return;
}
1;