/usr/local/CPAN/GetWeb/GetWeb/Fetcher.pm
package GetWeb::Fetcher;
use MailBot::Config;
use MIME::Base64 qw(encode_base64);
use HTML::FormatText;
use HTML::Parse;
use GetWeb::Util;
use LWP::Protocol;
use URI::URL;
use HTML::LinkExtor;
use strict;
&URI::URL::implementor('file','GetWeb::File');
my $pUserAgentCallback = sub {
my ($data,$response) = @_;
if (! $$response{"GETWEB_CHECKED_SIZE"}++)
{
my $config = MailBot::Config::current;
my $maxLength = $config -> getMaxSize;
# may create an 'X-Died' header
if ($response -> content_length >= $maxLength)
{
my $msg = "file size exceeded $maxLength bytes";
$response -> message($msg);
$response -> code(500);
die "$msg\n";
}
}
$response -> add_content($data);
};
sub new
{
my $type = shift;
my $ua = shift;
my $self = {UA => $ua,
REDIRECT_LIST => []};
bless($self,$type);
$self;
}
sub base
{
my $self = shift;
my $urlString = shift;
my $cwd = shift;
$$self{URL} = $urlString;
#$$self{REQUEST} = GetWeb::Util::safeRequest($urlString, $cwd);
$$self{REQUEST} = GetWeb::Util::safeRequest($urlString, " ");
undef;
}
sub authorizeUser
{
my ($self, $user) = @_;
$self -> {USER} = $user;
}
sub authorizePassword
{
my ($self, $password) = @_;
$self -> {PASSWORD} = $password;
}
sub setAuthHead
{
my ($self, $request) = @_;
my $user = $self -> {USER};
my $password = $self -> {PASSWORD};
(defined $user or defined $password) or return;
(defined $user and defined $password) or
die "UNAVAILABLE: must specify both username and password\n";
# next 3 lines taken from LWP::UserAgent::request
my $uidpwd = "$user:$password";
my $scheme = 'Basic';
my $header = "$scheme " . encode_base64($uidpwd, '');
$request->header('Authorization' => $header);
}
# jfj also follow text substrings
# jfj extract links more elegantly
sub follow
{
my $self = shift;
my $follow = shift;
my $paRedirect = $$self{REDIRECT_LIST};
$follow =~ /^\d+$/
or die "SYNTAX ERROR: $follow must be a link number";
my $req = $$self{REQUEST};
$self -> setAuthHead($req);
my $response = $$self{UA} -> request($req,$pUserAgentCallback);
push(@$paRedirect,"following link $follow");
if (! $response -> is_success)
{
$$self{RESPONSE} = $response;
return undef;
}
my $baseURL = eval {$response -> base} || $$self{URL};
my $encoder = new GetWeb::Encoder();
$encoder -> encode($response -> content, $response -> content_type,
$baseURL);
eval
{
$$self{URL} = $response -> base;
};
my $pText = $encoder -> getTextRef();
if ($$pText =~ /^\[$follow\] (\S+:(\\\n|.)+)/m)
{
my $urlString = $1;
$urlString =~ s/\\\n//g;
$$self{URL} = $urlString;
$$self{REQUEST} = GetWeb::Util::safeRequest($urlString);
return $response;
}
push(@$paRedirect,"could not follow link $follow, sorry");
die "UNAVAILABLE: could not follow link $follow, no such reference";
# jfj return a new category of error condition for not following links
return undef;
}
sub getNoteRef
{
shift->{REDIRECT_LIST};
}
sub getLinks
{
my $self = shift;
my $paLinkType = shift; # 'a' or 'img'
my $response = $self -> {RESPONSE};
return [] unless defined $response;
my $content = $response -> content;
my $extor = new HTML::LinkExtor;
$extor -> parse($content);
my @links = $extor -> links;
my $base = $self -> {URL};
$base -> frag(undef);
my %hHREF = ();
my $link;
foreach $link (@links)
{
my $linkType = shift @$link;
next unless grep($linkType eq $_,@$paLinkType);
my %attr = @$link;
my $HREF = $attr{href};
next unless defined $HREF;
my $url = new URI::URL($HREF,$base);
$url -> frag(undef); # ignore fragments
my $urlString = $url -> abs;
# URI::URL::eq is flaky, so also do strcmp:
next if $url -> eq($base);
next if $base -> abs eq $urlString;
my $scheme = $url -> scheme;
next if grep($scheme eq $_, (qw(mailto telnet news)));
# jf avoid duplicates like www.foo.com and WWW.FOO.COM
$hHREF{$urlString} = 1;
}
my @aHREF = keys %hHREF;
\@aHREF;
}
sub fetch
{
my $self = shift;
my $response = $$self{RESPONSE};
return $response if defined $response;
my $req = $$self{REQUEST};
$self -> setAuthHead($req);
#die "about to send request\n";
$response = $$self{UA} -> request($req,$pUserAgentCallback);
$$self{RESPONSE} = $response;
#$self -> getLinks;
$response;
}
1;