WWW::CheckSite - OO interface to an iterator that checks a website


WWW-CheckSite documentation Contained in the WWW-CheckSite distribution.

Index


Code Index:

NAME

Top

WWW::CheckSite - OO interface to an iterator that checks a website

SYNOPSIS

Top

    use WWW::CheckSite;

    my $wcs = WWW::CheckSite->new(
        uri    => 'http://www.test-smoke.org/',
        prefix => 'tsorg',
        save   => 1,
    );

    $wcs->validate;

    $wcs->write_report;

Or using saved data (skip the real validation):

    my $wcs = WWW::CheckSite->load(
        uri    => 'http://www.test-smoke.org/',
        prefix => 'tsorg',
    );

    $wcs->write_report;

DESCRIPTION

Top

This module implents a spider, that checks the pages on a website. For each page the links and images on that page are checked for availability. After that, the page is validated by W3.ORG.

When the spider is done, one can have a report in HTML written.

WARNING: Although the spider respects /robots.txt on the target site, the validator does not! Use this tool only on your own sites.

METHODS

Top

WWW::CheckSite->new( %args )

Initialize a new instance. Options supported:

* uri => the base uri to check [mandatory]
* prefix => the name of the project [mandatory]
* dir => target directory (curdir())
* save => true/false (false)
* strictrules => true/false (false)
* validate => by_none/by_uri/by_upload (by_none)
* ua_class => override the user agent class
* ua_args => hashref with extra options passed to the user agent class
* v => $verbosity, where $verbosity may be

0

Be quiet (default).

1

Report basic information for every visited page (e.g. number of links and images) and total time for checking the site.

2

Additional reporting of page validation details.

WWW::CheckSite->load( %args )

Initialize the object from datafile. Supported options:

* dir => target/source directory
* prefix => the prefix used for this dataset [mandatory]

$wcs->validate

The validate() method collects all the data.

$wcs->write_report

Generate the reports.

$wcs->write_ht_report()

Load, fill the HTML::Template template and write the reports.

$wcs->write_tt_report()

Load, fill the Template Toolkit template and write the reports.

$wcs->_die;

Do a Carp::croak().

NO METHODS

Top

create_report()

Load and fill the HTML::Template.

create_report_data()

Return a hash with all the data needed to fill both the HTML::Template and the Template Toolkit templates.

AUTHOR

Top

Abe Timmerman, <abeltje@cpan.org>

BUGS

Top

Please report any bugs or feature requests to bug-www-checksite@rt.cpan.org, or through the web interface at http://rt.cpan.org. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.

COPYRIGHT & LICENSE

Top


WWW-CheckSite documentation Contained in the WWW-CheckSite distribution.
package WWW::CheckSite;
use strict;
use warnings;

# $Id: CheckSite.pm 648 2007-05-13 21:30:35Z abeltje $
our $VERSION = '0.018';

BEGIN { eval qq{ use Time::HiRes qw( time ); } }
use FindBin;
use Storable qw( nstore retrieve );
use File::Spec::Functions qw( :DEFAULT rel2abs );
use File::Basename;
use File::Path;
use URI::file;

use WWW::CheckSite::Validator;

sub new {
    my $class = shift;

    my %args = @_ ? ref $_[0] ? %{ +shift } : @_ : ();

    exists $args{uri} && length $args{uri} or
        _die( "", "Usage: WWW::CheckSite->new( uri => q<your_uri> )" );

    exists $args{prefix} && length $args{prefix} or
        _die( "", "Usage: WWW::CheckSite->load( prefix => 'xxx' )" );

    # Backward compatibility wrt {uri}
    ref $args{uri} or $args{uri} = [ $args{uri} ];

    bless \%args, $class;
}

sub load {
    my $class = shift;

    my %args = ref $_[0] ? %{ +shift } : @_;

    exists $args{prefix} && length $args{prefix} or
        _die( "", "Usage: WWW::CheckSite->load( prefix => 'xxx' )" );

    my $tmp = bless \%args, $class;
    my $wcsfile = $tmp->_datafile;

    -f $wcsfile && -r _ or _die( "", "Cannot find '$wcsfile': $!" );

    my $self = retrieve $wcsfile;
    $self->{v} and print "Loaded '$wcsfile'\n";
    $self->{ $_ } = $args{ $_ } for qw( dir prefix tt v );

    # Backward compatibility wrt {uri}
    ref $self->{uri} or $self->{uri} = [ $self->{uri} ];

    $self->_set_validator_fmt;

    return $self;
}

sub validate {
    my $self = shift;

    my $wcs = WWW::CheckSite::Validator->new(
        uri         => $self->{uri},
        ua_class    => $self->{ua_class},
        ua_args     => $self->{ua_args},
        exclude     => $self->{exclude},
        validate    => $self->{validate},
        strictrules => $self->{strictrules},
        lang        => $self->{lang},
        myrules     => $self->{myrules},
        v           => $self->{v} > 1,
    );

    my( $cnt, $intref ) = ( 0, 'a' );
    $self->{start_time} = time;
    while ( my $info = $wcs->get_page ) {
        $info->{intref} = $intref++
            if $info->{ret_uri} =~ /^\Q$self->{uri}[0]/;

        push @{ $self->{by_depth}{ $info->{depth} } }, $info->{ret_uri};
        $self->{report}{ $info->{ret_uri} } = $info;

        $self->{v} and printf "%5u %s (%u links; %u images; %s styles; %s)\n",
                              ++$cnt, @{ $info }{qw( ret_uri link_cnt 
                                                     image_cnt style_cnt)},
                              $info->{valid} ? $info->{valid} != -1
                                  ? 'valid' : 'skipped' : 'not valid';
    }
    $self->{spider_time} = time;

    $self->{v} and printf "That took %s\n", $self->_spider_time;

    $self->_set_validator_fmt;

    if ( $self->{save} ) {
        my $dir = $self->_datadir;
        unless ( -d $dir ) {
            mkpath( $dir, $self->{v} ) or
                $self->_die( "Cannot mkdir($dir): $!" );
        }
        nstore $self, $self->_datafile;
    }
}

sub _set_validator_fmt {
    my( $self ) = @_;

    if ( $self->{validate} eq 'by_upload' ) {
        $self->{validator_fmt} = $WWW::CheckSite::Validator::VALIDATOR_FRM .
                                 "/check?uri=%s";
    } else {
        $self->{validator_fmt} = $WWW::CheckSite::Validator::VALIDATOR_URL;
    }
}
 
sub dump_links {
    my( $self, $noskipped ) = @_;

    my %seen;
    for my $url ( keys %{ $self->{report} } ) {
        $seen{ $url }++;
        for my $link ( @{ $self->{report}{ $url }{links} } ) {
            $seen{ $link->{uri} }++
                unless $noskipped && $link->{status} == 999;
        }
    }

    return sort keys %seen;
}

sub _spider_time {
    my $self = shift;
    return time_hhmm( @{ $self }{qw( start_time spider_time )} );
}

sub _report_time {
    my $self = shift;
    return time_hhmm( @{ $self }{qw( rstart rfinish )} );
}

sub _datafile {
    my $self = shift;
    return catfile $self->_datadir, "$self->{prefix}.wcs";
}
 
sub _datadir {
    my $self = shift;
    my $dir = $self->{dir} || curdir();
    return catdir $dir, $self->{prefix};
}
 
sub write_report {
    my $self = shift;

    if ( $self->{tt} ) {
        # first check if we have Template Toolkit
        eval qq{use Template};
        return $self->write_tt_report unless $@;
    }

    # then check for HTML::Template
    eval qq{use HTML::Template};
    return $self->write_ht_report unless $@;

    $self->_die( "No supported template system found" );
}

sub write_ht_report {
    my $self = shift;

    my $dir = $self->_datadir;
    unless ( -d $dir ) {
        mkpath( $dir, $self->{v} ) or $self->_die( "Cannot mkdir($dir): $!" );
    }

    my $mainuri = ref $self->{uri} ? $self->{uri}[0] : $self->{uri};
    for my $type (qw( summ full )) {
        $self->{rstart} = time;
        my $report = create_report( "wcs${type}rpt.tmpl", $mainuri,
                                    @{ $self }{qw( by_depth report
                                                   validator_fmt v )} );
        $self->{rfinish} = time;

        $report->param( 
            spider_time  => $self->_spider_time,
            report_time  => $self->_report_time,
            now_time     => scalar localtime,
            did_validate => ($self->{validate} =~ /by_(?:upload|uri)/ ? 1 : 0),
            strict_rules => $self->{strictrules},
            language     => $self->{lang},
            summlink     => basename( name_outfile( $self->_datadir, 'summ' ) ),
            fulllink     => basename( name_outfile( $self->_datadir, 'full' ) ),
        );

        # write the report
        my $rptname = name_outfile( $self->_datadir, $type );
        open my $fh, "> $rptname" or
            $self->_die( "Cannot create($rptname): $!" );
        print $fh $report->output;
        close $fh or $self->_die( "Write error ($rptname): $!" );

        my $furi = URI::file->new_abs( $rptname );
        $self->{v} and print "Finished writing '$furi'\n";
    }
    return 1;
}

sub write_tt_report {
    my $self = shift;

    my $dir = $self->_datadir;
    unless ( -d $dir ) {
        mkpath( $dir, $self->{v} ) or
            $self->_die( "Cannot mkdir($dir): $!" );
    }

    my $mainuri = ref $self->{uri} ? $self->{uri}[0] : $self->{uri};
    my $data = create_report_data( 'all', $mainuri,
                                   @{ $self }{qw( by_depth report
                                                  validator_fmt v )} );
    for my $type (qw( summ full )) {
        my $tt_name = "wcs${type}rpt.tt";
        $self->{rstart} = time;
        my $report = Template->new({
            INCLUDE_PATH => dirname( find_tmpl( $tt_name, $self->{v} ) ),
            POST_CHOMP => 1,
            EVAL_PERL  => 1,
        });
        $self->{rfinish} = time;

        my $rptname = name_outfile( $self->_datadir, $type );
        $report->process( $tt_name, {
            ( map +( $_ => $self->{ $_ } ) => qw( uri by_depth report v ) ),
            ( map +( $_ => $data->{ $_ } ) => keys %$data ),

            spider_time  => $self->_spider_time,
            report_time  => $self->_report_time,
            now_time     => scalar localtime,
            did_validate => ($self->{validate} =~ /by_(?:upload|uri)/ ? 1 : 0),
            strict_rules => $self->{strictrules},
            language     => $self->{lang},
            summlink     => basename( name_outfile( $self->_datadir, 'summ' ) ),
            fulllink     => basename( name_outfile( $self->_datadir, 'full' ) ),
        },
            $rptname,
        ) || $self->_die( $report->error );

        my $furi = URI::file->new_abs( $rptname );
        $self->{v} and print "Finished writing '$furi'\n";
    }
    return 1;
}

sub _die {
    my $self = shift;
    require Carp;
    Carp::croak( @_ );
}

sub create_report {
    my( $tmplnm, $v ) = @_[ 0, -1 ];

    my $data = create_report_data( @_ );

    my $tmpl = HTML::Template->new(
        filename => find_tmpl( $tmplnm, $v ),
        loop_context_vars => 1,
        die_on_bad_params => 0,
    );
    $tmpl->param( %$data );
    return $tmpl;
}

sub create_report_data {
    my( $tmplnm, $url, $by_depth, $report, $validate_fmt, $v ) = @_;

    my %data = ( url => $url, title => $report->{ $url }{title},
                 valid_cnt => 0, valid_ok => 0, pages => [ ],
                 not_ok_cnt => 0, kw_total => 0 );

    $v > 1 and print "Using validator: '$validate_fmt'\n";
    foreach my $level ( sort { $a <=> $b } keys %$by_depth ) {
        $v > 1 and printf "[$tmplnm]Level %u, %u page(s)\n", $level,
                          scalar @{ $by_depth->{ $level } };

        foreach my $uri ( @{ $by_depth->{ $level } } ) {
            my $pinfo = $report->{ $uri };
            $pinfo->{uri} = $uri;
            $pinfo->{validator_uri} = sprintf $validate_fmt, $uri;
            $pinfo->{status_tx} = status_text( $pinfo->{status} );
            $pinfo->{status_ok} = $pinfo->{status} == 200;

            $pinfo->{all_ok} = $pinfo->{status_ok};
            for my $ikey (qw( links images styles )) {
                $pinfo->{ "all_${ikey}_ok" } = 1;
                $pinfo->{   "${ikey}_sk"   } = 0;
                $pinfo->{ "kw_${ikey}_cnt" } = 0;
            }

            foreach ( @{ $pinfo->{links} },
                      @{ $pinfo->{images} },
                      @{ $pinfo->{styles} } ) {
                $_->{status_tx} = status_text( $_->{status} );

                $_->{status_ok} = $_->{status} == 200;
                $_->{status_sk} = $_->{status} == 999;
                exists $_->{valid} and
                    $_->{status_ok} &&= $_->{valid};

                $pinfo->{all_ok} &&= ( $_->{status_ok} || $_->{status_sk} );

                $_->{text} =~ s/>No text in TAG</&gt;No text in $_->{tag}&lt;/
                    and $_->{no_text} = 1;

                $_->{"type_$_->{tag}"} = 1;
                $pinfo->{all_ok} &&= ! $_->{no_text};
                $_->{link_ok} = ! $_->{no_text} && $_->{status_ok};
                if ( $_->{tag} eq 'link' ) { # this is a style
                    $_->{status_sk} and $pinfo->{styles_sk}++;
                    $pinfo->{all_styles_ok} &&= $_->{link_ok};
                    $_->{valid_tx} = $_->{valid}
                        ? $_->{valid} == 1 ? 'ok' : 'skipped' : 'not ok';
                    $_->{valid_ok} = $_->{valid}
                        ? $_->{valid} == 1 ? 1 : 1 : 0;
                    $_->{link_ok} || $_->{status_sk}
                        and $pinfo->{kw_styles_cnt}++;
                } elsif ( exists $_->{ct} ) { # this is an image
                    $_->{status_sk} and $pinfo->{images_sk}++;
                    $pinfo->{all_images_ok} &&= $_->{link_ok};
                    $_->{link_ok} || $_->{status_sk}
                        and $pinfo->{kw_images_cnt}++;
                } else {
                    $_->{status_sk} and $pinfo->{links_sk}++;
                    $pinfo->{all_links_ok} &&= $_->{link_ok};
                    $_->{link_ok} || $_->{status_sk}
                        and $pinfo->{kw_links_cnt}++;
                }
            }

            $pinfo->{valid_tx} = $pinfo->{valid}
                ? $pinfo->{valid} == 1 ? 'ok' : 'skipped' : 'not ok';
            defined $pinfo->{valid} and $pinfo->{all_ok} &&= $pinfo->{valid};

            $pinfo->{link_cnt} and $pinfo->{link_status_ok} =
                $pinfo->{link_cnt} == $pinfo->{links_ok} + $pinfo->{links_sk}; 
            $pinfo->{link_status} = $pinfo->{link_cnt} 
                ? $pinfo->{link_cnt} == $pinfo->{links_ok} + $pinfo->{links_sk}
                    ? 'ok' : 'not ok' : 'N/A';
            $pinfo->{link_cnt} and 
                $pinfo->{all_ok} &&= $pinfo->{link_status_ok};
            $pinfo->{kw_links_cnt} ||= 0;
            $pinfo->{kw_links} = $pinfo->{link_cnt}
                ? sprintf "%.2f", $pinfo->{kw_links_cnt}/$pinfo->{link_cnt} : 1;
            $pinfo->{status_ok} or $pinfo->{kw_links} = 0;

            $pinfo->{kw_images_cnt} ||= 0;
            $pinfo->{kw_images_cnt} and $pinfo->{image_status_ok} =
                $pinfo->{kw_images_cnt} == $pinfo->{images_ok} +
                                       $pinfo->{images_sk}; 
            $pinfo->{image_status} = $pinfo->{image_cnt} 
                ? $pinfo->{image_cnt} == $pinfo->{images_ok} +
                                         $pinfo->{images_sk}
                    ? 'ok' : 'not ok' : 'N/A';
            $pinfo->{image_cnt} and 
                $pinfo->{all_ok} &&= $pinfo->{image_status_ok};
            $pinfo->{kw_images} = $pinfo->{image_cnt}
                ? sprintf "%.2f", $pinfo->{kw_images_cnt}/$pinfo->{image_cnt}
                : 1;

            $pinfo->{style_cnt} and $pinfo->{style_status_ok} =
                $pinfo->{style_cnt} == $pinfo->{styles_ok} +
                                       $pinfo->{styles_sk}; 
            $pinfo->{style_status} = $pinfo->{style_cnt} 
                ? $pinfo->{style_cnt} == $pinfo->{styles_ok} +
                                         $pinfo->{styles_sk}
                    ? 'ok' : 'not ok' : 'N/A';
            $pinfo->{style_cnt} and 
                $pinfo->{all_ok} &&= $pinfo->{style_status_ok};
            $pinfo->{kw_styles_cnt} ||= 0;
            $pinfo->{kw_styles} = $pinfo->{style_cnt}
                ? sprintf "%.2f", $pinfo->{kw_styles_cnt}/$pinfo->{style_cnt}
                : 1;

            $pinfo->{kw_return} = sprintf "%.2f", $pinfo->{status_ok} ? 1 : 0;
            $pinfo->{kw_title}  = sprintf "%.2f", $pinfo->{title}
                ? $pinfo->{status_ok} ? 1 : 0 : 0;
            $pinfo->{kw_valid}  = sprintf "%.2f", 
                                         $pinfo->{valid_tx} ne 'not ok' ? 1 : 0;
            my @metrics = qw( return title valid links images styles );
            $pinfo->{kw_total} = 0;
            for my $metric (@metrics) {
                $pinfo->{kw_total} += $pinfo->{ "kw_$metric" };
            }
            $pinfo->{kwalitee} = sprintf "%.2f", $pinfo->{kw_total}/@metrics;
            $data{kw_site} += $pinfo->{kwalitee};

            $pinfo->{all_ok} or $data{not_ok_cnt}++;
            $pinfo->{valid} and $data{valid_ok}++;
            $pinfo->{valid_tx} =~ /ok/ and $data{valid_cnt}++;
            push @{ $data{pages} }, $pinfo;
        }
    }

    $data{page_cnt}    = scalar @{ $data{pages} };
    $data{kwalitee}    = sprintf "%.2f", $data{kw_site}/$data{page_cnt};
    $data{copyright}   = '&copy; MMV Abe Timmerman &lt;abeltje@cpan.org&gt;';
    $data{wcs_version} = $VERSION;

    return \%data;
}

sub name_outfile {
    my( $dir, $type ) = @_;
    return catfile $dir, "${type}.html";
}

sub find_tmpl {
    my( $name, $v ) = @_;

    my $from_cur = $name;
    -f $from_cur and do {
        $v > 1 and print "Found: '$from_cur\n";
        return $from_cur;
    };
    $from_cur .= ".tmpl";
    -f $from_cur and do {
        $v > 1 and print "Found: '$from_cur\n";
        return $from_cur;
    };
    my $from_bin = rel2abs( $name, $FindBin::Bin );
    -f $from_bin and do {
        $v > 1 and print "Found: '$from_bin\n";
        return $from_bin;
    };
    $from_bin .= ".tmpl";
    -f $from_bin and do {
        $v > 1 and print "Found: '$from_bin\n";
        return $from_bin;
    };

    # Findout where the module is installed:
    ( my $lib = $INC{ 'WWW/CheckSite.pm' } ) =~ s|CheckSite.pm||;
    my $from_lib = rel2abs( $name, $lib );
    -f $from_lib and do {
        $v > 1 and print "Found: '$from_lib\n";
        return $from_lib;
    };
    $from_lib .= ".tmpl";
    -f $from_lib and do {
        $v > 1 and print "Found: '$from_lib\n";
        return $from_lib;
    };

    return $name;
}

sub status_text {
    local $_ = shift || 'unknown';
    SWITCH: {
        /^200$/ and return "ok";
        /^203$/ and return "non-authoritative information";
        /^204$/ and return "no content";
        /^304$/ and return "not modified";
        /^305$/ and return "use proxy";
        /^400$/ and return "bad request";
        /^401$/ and return "unauthorized";
        /^403$/ and return "forbidden";
        /^404$/ and return "not found";
        /^500$/ and return "internal server error";
        /^501$/ and return "not implemented";
        /^502$/ and return "bad gateway";
        /^503$/ and return "service unavailable";
        /^504$/ and return "gateway timeout";
        /^505$/ and return "http version not supported";
        /^999$/ and return "no robots allowed";
        return $_;
    }
}

sub time_hhmm {
    my( $start, $finish ) = @_;

    my $diff = defined $finish ?  abs( $finish - $start ) : $start;

    my $days = int( $diff / (24*60*60) );
    $diff -= 24*60*60 * $days;
    my $hours = int( $diff / (60*60) );
    $diff -= 60*60 * $hours;
    my $mins = int( $diff / 60 );
    $diff -=  60 * $mins;

    my @parts;
    push @parts, sprintf "$days day%s",    $days  > 1 ? 's' : '' if $days;
    push @parts, sprintf "$hours hour%s",  $hours > 1 ? 's' : '' if $hours;
    push @parts, sprintf "$mins minute%s", $mins  > 1 ? 's' : '' if $mins;
    push @parts, sprintf "%.3f seconds", $diff;

    return join " ", @parts;
}