/usr/local/CPAN/onsearch/OnSearch/WebClient.pm


package OnSearch::WebClient; 

BEGIN { $ENV{MCarp} = 'verbose'; }

# '$Id: WebClient.pm,v 1.7 2005/08/16 05:34:03 kiesling Exp $'

use strict;
use warnings;
use Carp;
use Socket;

require Exporter;
require DynaLoader;
our (@ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS, $VERSION);
@ISA = qw(Exporter DynaLoader);
@EXPORT = (qw/get_req parse_url $VERSION/);
%EXPORT_TAGS = ( 'all' => [@EXPORT_OK] );

($VERSION) = ('$Revision: 1.7 $' =~ /.*: (\S*)/);

# Instead of importing.
my $CRLF = "\015\012";

sub get_req {
    my $url = $_[0];
    my $lineinput = '';
    my $page = '';
    my $reloc = 0;

    my ($proto_name, $server, $port) = 
	($url =~ m|(\w+):\/\/([^/:]+):?(\d*)|);
    $port = 80 unless $port;
    unless ($server) { warn "Invalid URL: $url"; return undef; }
    

    my $ra = $ENV{REMOTE_ADDR};
    my $sp = $ENV{SERVER_PORT};

my $getrequest = qq|GET $url HTTP/1.1
Host: $ra $sp
User-Agent: OnSearch $VERSION

|;

    my $addr = gethostbyname ("$server");
    if (! $addr ) { 
        ### 
	### Should there be a verbose setting to resource temporarily
	### unavailable errors for unreachable URLs....
	###
	### warn $!; 
	###
	return undef; 
    }
    
    socket (SOCKFH, PF_INET, SOCK_STREAM, getprotobyname ('tcp')) || die $!;
    my $paddr = inet_aton ($server);
    unless ($paddr) { warn $!; return undef; }
    my $sinput = sockaddr_in ($port, $paddr);
    if (!connect (SOCKFH, $sinput)) {
	###
	### See IO::Socket for error handling for concurrent connection
	### handling.
	###
	### warn $!;
	return undef;
    }
    my $deffhprev = select (SOCKFH); $| = 1; select ($deffhprev);
    $getrequest =~ s"\n"$CRLF"gs;
    if (syswrite (SOCKFH, $getrequest, length($getrequest)) 
        != length ($getrequest)) {
        warn "get_req $! PID $$.";
        return undef;
    }

    while (defined ($lineinput = <SOCKFH>)) {
	next if ($lineinput =~ /200 OK/i);
        if ($reloc && $lineinput =~ /^Location:/) {
            $lineinput =~ s"$CRLF|\n""g;
	    $page = $lineinput;
	    last;
	}
	if ($lineinput =~ m|HTTP/1.[01]\s+[45](\d+)|) {
	    $lineinput =~ s"$CRLF"\n"g;
            $lineinput =~ s"$CRLF|\n""g;
	    $page = $lineinput;
	    last;
	}
        # A redirection.  
	if ($lineinput =~ m|HTTP/1.[01]\s+3(\d+)|) {
           $reloc = 1;
	}
	$page .= $lineinput;
    }

    shutdown (SOCKFH, 2);
    $page =~ s"$CRLF"\n"gs;
    return $page;
}

sub parse_url {
    my ($proto_name, $server, $port, $path) =
	($_[0] =~ m|(\w+)://([^/:]+):?(\d*)(/?.*)|);
    $path = '/' unless $path;
    $port = 80 unless $port;
    return ($proto_name, $server, $port, $path);
}

package OnSearch::WebBot;

use OnSearch;
use OnSearch::AppConfig;
use OnSearch::Utils;

my $logfunc = \&OnSearch::WebLog::clf;

sub new {
    my $class = shift || __PACKAGE__;
    my $webcachepath = web_cache_path ();
    my $self = { level => 0,
		 urls => [],
		 cachedurls => [],
		 unavailurls => [],
		 disallowedurls => [],
		 cachedir => $webcachepath,
	     };
    bless ($self, $class);
    return $self;
}

sub siteindex {
    my $self = shift;
    my $url = $_[0];

    my ($chldpid, $gchldpid);

  FORK:
    if ($chldpid = fork ()) {
	$self -> {chldpid} = $chldpid;
	return $chldpid;
    } elsif (defined $chldpid) {
	setpgrp (0,0);
    } elsif ($! =~ /No more processes|Resource temporarily unavailable/) {
	sleep 2;
	redo FORK;
    } else {
	die "siteindex () error PID $chldpid: $!";
    }

    ###
    ###  Completely detach the indexer from the terminal.  The 
    ###  calling script should return as soon as possible, or the user
    ###  faces a blank screen while the server connection is alive but 
    ###  idle.  Even worse, when the Web server closes an idle 
    ###  connection, sends a SIGTERM, and respawns, it causes the
    ###  foreground script to restart.
    ###

  FORK2:
    if ($gchldpid = fork ()) {
	$self -> {gchldpid} = $gchldpid;
	###
	### Indicate that we're returning from the child process,
	### so we don't run the CGI script twice.  Do the same 
	### below also.
	###
	return 0; 
    } elsif (defined $gchldpid) {
	###
        ###  A real daemon would set its euid and egid
        ###  here, but because the Web server is running the
        ###  script, the uid and gid should already be correct.
	###
	chdir '/' || die "OnSearch: Could not chdir /: $!\n";
	close STDIN;
	close STDOUT;
	close STDERR;
    } elsif ($! =~ /No more processes|Resource temporarily unavailable/) {
	sleep 2;
	redo FORK2;
    } else {
	die "siteindex () error PID $chldpid: $!";
    }

    &$logfunc ('notice', "WebIndex started PID $$.");
    $self -> fetch_page_urls ($url);
    sigwrapper (qw/CHLD/, undef, \&run_onindex);

    return 0;
}

##
##  Server responses. Upper/lower case may vary.
##
## Server:
## MIME-version:
## Content-Type:
## Last-modified:
## Content-length:
## Connection:
## Cache-Control:
## Pragma:
## Transfer-Encoding:
## Upgrade:
## Content-Location: 
## Location:
## Via:
## Accept-Ranges:
## Age:
## Proxy-Authenticate:
## Public:
## Retry-After:
## Server:
## Set-Cookie:
## Vary:
## Warning:
## WWW-Authenticate:

sub fetch_page_urls {
    my $self = shift;
    my $url = $_[0];

    my ($cfg, $verbose, $nontargeturls, $r);
    my ($urldefault);
    my (@furls, $fpage);
    my ($fbaseurl, $fbasepath, $fservername);

    $cfg = OnSearch::AppConfig -> new;
    $verbose = $cfg -> str ('VerboseWebIndexer');

    ++$self -> {level};

    my ($fproto_name, $fserver, $fport, $fpath) = 
	OnSearch::WebClient::parse_url ($url);

    unless ($fproto_name && $fserver && $fpath) {
	&$logfunc ('notice', 
	       "WebIndex ".$self -> {level} . ". url $url is unparseable.")
	    if $verbose;
	  --$self -> {level}; 
	return 1;
      } else {
	  ($fbasepath) = ($fpath =~ /(.*)\//);
	  $fbaseurl = "$fproto_name://$fserver:$fport$fbasepath";
	  $fservername = ((length ($fport) || ($fport =~ /80/)) ? 
			     "$fproto_name://$fserver:$fport" : "$fproto_name://$fserver");
      }

    &$logfunc ('notice',"WebIndex ".$self -> {level}.". url $url") if $verbose;

    $fpage = OnSearch::WebClient::get_req ($url);
    if ($!) {
	&$logfunc ('warning', "WebClient get_req ($url): $!");
    }
    if (! $fpage) {
	push @{$self -> {unavailurls}}, ($url);
	--$self -> {level}; 
	return 1;
    }

    ###
    ### Response is a redirection header.
    ###
    if ($fpage =~ /^Location/) {
	my ($flabel, $floc) = split /:\s+/, $fpage;
	&$logfunc ('notice', 
		   "WebIndex ".$self->{level}.". Page %s redirected to %s.", 
		   $url, $floc)
	    if $verbose;
	my $r = $self -> fetch_page_urls ($floc);
    }
    ###
    ### Response is an error. Try retrieving "URL/index.html"
    ### if the URL doesn't specify a HTML page.
    ###
    if ($fpage =~ m|HTTP/1.[01]\s+[45](\d+)|) {
	if ($fpath eq '/') {
	    $urldefault = $url . 'index.html';
	    &$logfunc ('notice', 
       "WebIndex ".$self->{level}.". Page %s: %s. Trying %s.", 
		       $url, $fpage, $urldefault)
		if $verbose;
	    $fpage = $self -> fetch_page_urls ($urldefault);
	    --$self -> {level}; 
	    return 0;
	} else {
	    &$logfunc ('notice', 
       "WebIndex ".$self->{level}.". Page %s: %s.", $url, $fpage)
		if $verbose;
	      --$self -> {level};
	    push @{$self -> {unavailurls}}, ($url);
	    return 1;
	}
    }

    @furls = OnSearch::Utils::document_urls ($fpage, $fbaseurl);
    if ($! || $@) {
	warn  "Error finding URLS in $url: $! $@.";
	undef $!; undef $@;
	return 1;
    }

    $self -> cache_page ($fpage, $url);

    FU: foreach my $fu (@furls) { 
	foreach my $c (@{$self -> {cachedurls}}) {
	    if ($fu eq $c) { 
		next FU;
	    }
	}
	foreach my $c (@{$self -> {unavailurls}}) {
	    if ($fu eq $c) { 
		next FU;
	    }
	}

	unless ($r = $self -> url_disallowed ($fservername, $fu)) {
	    $r = $self -> fetch_page_urls ($fu) 
	} else {
	    &$logfunc ('notice', 
		       "WebIndex ".$self->{level}.". $fu disallowed.");
	    next FU;
	}
	
	if ($!) {
	    &$logfunc ('warning', 
       "Webindex ".$self->{level}. ". fetch_page_urls ($fu): $!.");
	    undef $!;
	    ###
	    ### Not necessary to return here.
	    ###
	}
    }

    --$self -> {level};
    return 0;
}

sub cache_page {
    my $self = shift;
    my $page = $_[0];
    my $url = $_[1];

    my ($cfg, $verbose, $content_location);

    $cfg = OnSearch::AppConfig -> new;
    $verbose = $cfg -> on (qw/VerboseWebIndexer/);

    my ($proto_name, $server, $port, $path) = 
	OnSearch::WebClient::parse_url ($url);

    ###
    ### If the URL ends in "/," determine if there's a 
    ### Content-Location header and use that value as the 
    ### file name. Otherwise, report an error.
    ###
    ### Trailing slashes added above.
    ###
    if ($path =~ /^.*\/$/) {
	($content_location) = ($page =~ /^Content-Location:\s+(.*?)$/ism);
	if ($content_location) {
	    $path .= $content_location;
	    &$logfunc ('notice', 
       "WebIndex ".$self->{level}.". Page %s\'s Content-Location is %s.", 
		       $url, $content_location)
		if $verbose;
	} else {
	    &$logfunc ('error', 
    "WebIndex ".$self->{level}.". Couldn't find Page %s\'s Content-Location.", 
		       $url) 
		if $verbose;
	    return;
	}
    }

    push @{$self -> {cachedurls}}, ($url);

    my $filepath = ($port) ? $self -> {cachedir} . "/$server:$port/$path" : 
	$self -> {cachedir} . "/$server/$path";

    my ($dirpath) = ($filepath =~ /(.*)\//);

    $self -> mkdirtree ($dirpath, 0755);
    if ($!) {
	&$logfunc ('warning', "mkdirtree ($dirpath): $!");
    }

    if (! -d $filepath) {
	eval {
	    open (WEBPAGE, "> $filepath") or do {
		warn "cache_page $filepath: $!.";
		return;
	    };
	    print WEBPAGE $page;
	    close WEBPAGE;
	};
    }
}

sub mkdirtree {
    my $self = shift;
    my ($dir, $mask) = @_;
    $dir =~ s/\/\//\//g;

    my $verbose = 0;

    my ($parent) = 
	($dir =~ /(.*)\/.*$/);

    return unless ($dir && length ($dir));

    if (! -d $parent) {
	$self -> mkdirtree ($parent, $mask);
	&$logfunc ('warning', "mkdirtree parent ($parent): $!") if $verbose;
	undef $!;
    } 

    if (! -d $dir) {
        mkdir ($dir, $mask) || do { 
	    &$logfunc ('error', "Could not make directory $dir: $!\n");
	};

        &$logfunc ('warning', "mkdirtree ($dir): $!")
	    if $verbose;
	undef $!;
    }
}

sub url_disallowed {
    my $self = shift;
    my ($server, $url) = @_;

    ###
    ### URL is not on the target server.
    ###
    if ($url !~ m"$server"i) {
	return 1;
    }

    ###
    ### URL is not a HTTP reference.
    ###
    if ($url !~ /http\:/) {
	return 1;
    }

    return undef;
}

sub web_cache_path { 
    my $cfg = OnSearch::AppConfig -> new;
    my $onsearchdir = $cfg -> str (qw/OnSearchDir/);
    undef $cfg;
    return $ENV{DOCUMENT_ROOT} . "/$onsearchdir/websites"; 
}

1;