WWW::Page::Author - locates the author of a web page


WWW-Page-Author documentation Contained in the WWW-Page-Author distribution.

Index


Code Index:

NAME

Top

WWW::Page::Author - locates the author of a web page

SYNOPSIS

Top

    use WWW::Page::Author;
    my $pa = WWW::Page::Author->new;
    print $pa->get_author('http://www.apple.com/');

DESCRIPTION

Top

The WWW::Page::Author module attempts to determine the author of a web page. It does this by examining the HTTP headers, HTML headers and the body of the HTML document.

METHODS

Top

WWW::Page::Author->new()

Creates a new author seeking object.

$pa->get_author($url)

Returns the author of the web page (or site) or undef. $url can either be an HTTP::Response object, a URI object or just a string URL.

AUTHOR

Top

Iain Truskett <spoon@cpan.org> http://eh.org/~koschei/

Please report any bugs, or post any suggestions, to either the mailing list at <perl-www@dellah.anu.edu.au> (email <perl-www-subscribe@dellah.anu.edu.au> to subscribe) or directly to the author at <spoon@cpan.org>

PLANS

Top

It needs to cater for more weird and unusual ways of putting dates on web pages.

COPYRIGHT

Top

ACKNOWLEDGEMENTS

Top

I would like to thank GRF for having me write this.

SEE ALSO

Top

Um.


WWW-Page-Author documentation Contained in the WWW-Page-Author distribution.
package WWW::Page::Author;


use 5.006;
use strict;
use warnings;

use Carp;
use Data::Dumper;
use URI::URL;
use LWP::UserAgent;
use HTTP::Request::Common qw/GET HEAD/;
use Email::Find;

use constant DEBUG => 0;
use vars qw/$AUTOLOAD/;
our ( $VERSION ) = '$Revision: 1.2 $ ' =~ /\$Revision:\s+([^\s]+)/;
our @ISA = qw//;

# ========================================================================
#                                                                  Methods

sub new
{
    my $class = shift;
    $class = ref($class) || $class;

    my $self = {
	ua	=> undef,
    };

    bless $self, $class;
}

sub get_author
{
    my ($self,$url) = (@_);

    return 0 unless defined $url;
    $url = $self->_get_url_body($url);
    return '[error]' unless defined $url and $url->is_success;

    my $body = $url->content();
    my $emails = [];
    my $num_found = find_emails($body, sub {
	push @$emails, $_[1];
	return $_[1];
    });

    warn Dumper($emails, scalar @$emails) if DEBUG > 0;

    return undef unless $num_found;

    do {
	my @webs = grep /^webmaster\@/, @$emails;
	@$emails = @webs if @webs;
    };

    return $num_found ? $emails->[@$emails-1] : undef;
}

# ========================================================================
#                                                                  Private

sub _get_url_body
{
    my ($self,$url) = (@_);
    warn "Fetching $url\n" if DEBUG > 0;
    if (not ( ref $url and $url->isa('HTTP::Response') ) )
    {
	my $req = GET $url;
	$url = $self->_ua->request($req);
    }
    else
    {
	warn "Already a response object: ".ref($url)."\n" if DEBUG > 0;
    }
    warn Dumper($url) if DEBUG > 2;
    return $url;
}

sub _ua
{
    my $self = shift;
    $self->{ua} = $_[0] if @_;
    unless (defined $self->{ua})
    {
	my $ua = LWP::UserAgent->new;
	my $name = ref($self);
	$ua->agent($name.'/'.$VERSION);
	$ua->env_proxy();
	$self->_ua($ua);
    }
    return $self->{ua};
}

1;
__END__
#
# ========================================================================
#                                                Rest Of The Documentation