| WWW-CheckSite documentation | Contained in the WWW-CheckSite distribution. |
WWW::CheckSite::Validator - A spider that assesses 'kwalitee' for a site
use WWW::CheckSite::Validator;
my $wcv = WWW::CheckSite::Validator->new(
uri => 'http://www.test-smoke.org'
);
while ( my $info = $wcv->get_page ) {
# handle the info
}
This is a subclass of WWW::CheckSite::Spider.
WWW::CheckSite::Validator starts its work after the spider has
fetched the page. It will check these things:
All links on the page (<a href>, <area href>, <frame
src>) are checked for availability.
All images on the page (<img src>, <input type=image>)
are checked for availability.
All stylesheets on the page (<link rel=stylesheet type=text/css>) are checked for availability.
The contents of the page are send to http://validator.w3.org for validation.
Extend WWW::CheckSite::Spider->new to check for Image::Info
so we can do a basic check on the images.
This method overrides the WWW::CheckSite::Spider::process_page()
method to check on the availability of links, images and
stylesheets. When specified it will also send the page for
validation by W3.ORG.
On top of the standard information it returns more:
The check_links() method gets information about the links on this
page. If there is no return status, it will HEAD the uri and
update the cache status for this link to prevent multiple HEADing.
NOTE: This method does not respect the exclusion rules, and only
robot-rules with strictrules enabled!
The structure for links:
a/area tagThe check_images() method gets information about the images on the
page. The list comes from the images() method of the mechanize
object. It will only HEAD the uri.
The structure for images:
img/input tagThe check_styles() method checks the validity of stylesheets used in the
page. We check for <link rel="stylesheet" type="text/css"> tags.
The structure for stylesheets:
The validate() method sends the url/contents off to W3.org to validate.
The fallback do-not-validate method.
Sends only the uri to W3.ORG and get the validation result.
Create a temporary file (with File::Temp) from $agent->content,
call the validator with that temporary file and save the result (as a
boolean) in $stats->{validate}.
Use the xmllint(1) program to validate the (X)HTML.
Dispatch the validation to the right method.
The fallback do-not-validate-stylesheet method.
Sends only the uri to JIGSAW.W3.ORG and get the validation result.
Create a temporary file (with File::Temp) from $ua->content,
call the validator with that temporary file and return the result.
This is more like a basic consistency check, that uses Image::Info::image_info().
Check if the content-type is "validatable".
Why?
Abe Timmerman, <abeltje@cpan.org>
Please report any bugs or feature requests to
bug-WWW-CheckSite@rt.cpan.org, or through the web interface at
http://rt.cpan.org. I will be notified, and then you'll automatically
be notified of progress on your bug as I make changes.
Copyright MMV Abe Timmerman, All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
| WWW-CheckSite documentation | Contained in the WWW-CheckSite distribution. |
package WWW::CheckSite::Validator; use strict; use warnings; # $Id: Validator.pm 633 2007-04-30 21:08:56Z abeltje $ use vars qw( $VERSION $VALIDATOR_URL $VALIDATOR_FRM $VALIDATOR_STYLE $XMLLINT ); $VERSION = '0.017';
use WWW::CheckSite::Spider qw( :const ); use base 'WWW::CheckSite::Spider'; BEGIN { $VALIDATOR_URL = 'http://validator.w3.org/check?uri=%s'; $VALIDATOR_FRM = 'http://validator.w3.org/'; $XMLLINT = 'xmllint'; $VALIDATOR_STYLE = 'http://jigsaw.w3.org/css-validator/'; }
sub new { my $class = shift; my $self = $class->SUPER::new( @_ ); $self->{validate} ||= 'by_none'; $self->{novalidate} = $self->{validate} eq 'by_none'; eval qq{use Image::Info qw( image_info )}; $self->{can_val_image} = ! $@; return $self; }
sub process_page { my $self = shift; my $stats = $self->SUPER::process_page( @_ ); $self->check_links( $stats ); $self->check_images( $stats ); $self->check_styles( $stats ); $self->validate( $stats ); return $stats; }
sub check_links { my( $self, $stats ) = @_; my( $stack, $cache, $mech ) = @{ $self }{qw( _stack _cache _agent )}; my @links = $mech->success ? $self->links_filtered : (); my @checked; for my $link ( @links ) { my $check = URI->new_abs( $link->url, $mech->uri ); $self->more_rrules( $check ); my $in_cache = $cache->has( $check ); unless ( $in_cache && defined $in_cache->[1] ) { $self->more_rrules( $check ); if ( ! $self->allowed( $check ) ) { $in_cache->[1] = '999'; $self->{v} and print " HEAD '$check': skipped.\n"; } else { $self->{v} and print " HEAD '$check': "; my $ua = $self->new_agent; eval { $ua->head( $check ) }; $in_cache->[1] = $ua->status; $self->{v} and printf "done(%sok).\n", $ua->success ? '' : 'not '; $ua->success && ! $self->ct_can_validate( $ua ) and $in_cache->[0] = WCS_NOCONTENT; } } push @checked, { link => $link->url, uri => $check->as_string, tag => $link->tag, text => $link->text || ">No text in TAG<", status => $in_cache->[1], depth => $in_cache->[2], action => $self->set_action( $check, $in_cache ), }; } $stats->{link_cnt} = @links; $stats->{links} = \@checked; $stats->{links_ok} = grep $_->{status} == 200 => @checked; return $stats; }
sub check_images { my( $self, $stats ) = @_; my( $stack, $cache, $mech ) = @{ $self }{qw( _stack _cache _agent )}; my @images = $mech->success ? $mech->images : ();; my @checked; for my $img ( @images ) { my $check = URI->new_abs( $img->url, $mech->base ); $self->more_rrules( $check ); my $in_cache = $cache->has( $check ); defined $in_cache or $in_cache = $cache->set( $check => [ WCS_FOLLOWED ] ); unless ( $in_cache && defined $in_cache->[1] ) { $self->more_rrules( $check ); if ( ! $self->allowed( $check ) ) { $in_cache->[1] = '999'; } else { my $ua = $self->new_agent; my $method = $self->{can_val_image} ? 'get' : 'head'; $self->{v} and print " \U$method\E '$check': "; eval { $ua->$method( $check ) }; my $success = $ua->success; $in_cache->[1] = $ua->status; $in_cache->[2] = $ua->ct; my $valid; if ( $method eq 'head' ) { $valid = $success ? -1 : 0; } else { # it's GET $valid = $success ? $self->validate_image( $ua ) : 0; } $in_cache->[3] = $valid; $self->{v} and printf "done(%sok).\n", $ua->success ? '' : 'not '; } } push @checked, { link => $img->url, uri => $check->as_string, tag => 'ALT', text => ( defined( $img->alt ) ? ($img->alt || "") : $self->{novalidate} ? "" : ">No text in TAG<" ), status => $in_cache->[1], ct => $in_cache->[2], valid => $in_cache->[3], }; } $stats->{image_cnt} = @images; $stats->{images} = \@checked; $stats->{images_ok} = grep $_->{status} == 200 && $_->{valid} => @checked; return $stats; }
sub check_styles { my( $self, $stats ) = @_; my( $stack, $cache, $mech ) = @{ $self }{qw( _stack _cache _agent )}; my $content = \( $mech->content ); my $p = HTML::TokeParser->new( $content ); my @styles; while ( my $token = $p->get_tag( 'link' ) ) { ( exists $token->[1]{rel} && $token->[1]{rel} eq 'stylesheet' ) && ( exists $token->[1]{type} && $token->[1]{type} eq 'text/css' ) or next; push @styles, $token->[1]{href}; } my @checked; for my $sheet ( @styles ) { my $check = URI->new_abs( $sheet, $mech->uri ); $self->more_rrules( $check ); my $in_cache = $self->{_cache}->has( $check ); defined $in_cache or $in_cache = $cache->set( $check => [ WCS_FOLLOWED ] ); unless ( $in_cache && defined $in_cache->[1] ) { $self->more_rrules( $check ); if ( ! $self->allowed( $check ) ) { $in_cache->[1] = '999'; $in_cache->[3] = -1; } else { my $ua = $self->new_agent; my $method = $self->{validate} =~ /by_(?:upload|uri)/ ? 'get' : 'head'; $self->{v} and print " \U$method\E '$check': "; eval { $ua->$method( $check ) }; my $success = $ua->success; $self->{v} and printf "done(%sok).\n", $success ? '' : 'not '; $in_cache->[1] = $ua->status; $in_cache->[2] = $ua->ct; $in_cache->[3] = $method eq 'get' && $success ? $self->validate_style( $ua ) : -1; } } push @checked, { link => $sheet, uri => $check->as_string, tag => 'link', text => '', status => $in_cache->[1], ct => $in_cache->[2], valid => $in_cache->[3], }; } $stats->{style_cnt} = @styles; $stats->{styles} = \@checked; $stats->{styles_ok} = grep +($_->{status} == 200) => @checked; $stats->{vstyles_ok} = grep defined( $_->{valid} ) ? ($_->{valid} == 1) ? 1 : 0 : 1 => @checked; return $stats; }
sub validate { my( $self, $stats ) = @_; unless ( $self->current_agent->success ) { $self->{v} and print "Validate @{[$self->current_agent->uri]}: skipped\n"; $stats->{valid} = 0; return $stats; } my $how_to = $self->{validate} || 'by_none'; my $validate = "validate_$how_to"; $self->can( $validate ) or $validate = 'validate_by_none'; $self->$validate( $stats ); }
sub validate_by_none { my( $self, $stats ) = @_; $stats->{valid} = -1; }
sub validate_by_uri { my( $self, $stats ) = @_; my $val_uri = sprintf $VALIDATOR_URL, $self->current_agent->uri; $self->{v} and print "HTML-Validate $val_uri: "; my $ua = $self->new_agent; $self->{lang} and $ua->default_header( 'Accept-Language' => 'en' ); $ua->get( $val_uri ); $stats->{valid} = $ua->success ? $ua->content =~ /This Page Is Valid/ : '-1'; $self->{v} and printf "done(%sok)\n", $stats->{valid} == 1 ? "" : "not "; $self->{lang} and $ua->default_header( 'Accept-Language' => $self->{lang} ); }
sub validate_by_upload { my( $self, $stats ) = @_; eval "use File::Temp"; $@ and $stats->{valid} = 1, return; my( $mech ) = @{ $self }{qw( _agent )}; File::Temp->import( 'tempfile' ); my( $fh, $filename ) = tempfile( 'wcvtempXXXX', SUFFIX => '.html', UNLINK => 1 ); print $fh $mech->content; close $fh; $self->{v} and printf "HTML-Validate_upl(%s): %s ", $filename, $mech->uri; $stats->{validate} = $filename; my $ua = $self->new_agent; $self->{lang} and $ua->default_header( 'Accept-Language' => 'en' ); $ua->get( $VALIDATOR_FRM ); $ua->submit_form( form_number => 2, fields => { uploaded_file => $filename }, ); $stats->{valid} = $ua->success ? $ua->content =~ /This Page Is Valid/ : -1; $self->{v} and printf " done(%sok)\n", $stats->{valid} == 1 ? "" : "not "; $self->{lang} and $ua->default_header( 'Accept-Language' => $self->{lang} ); }
sub validate_by_xmllint { my( $self, $stats ) = @_; my $opts = qq[--postvalid --recover --stream]; eval "use File::Temp"; $@ and $stats->{valid} = 1, return; my( $ua ) = @{ $self }{qw( _agent )}; File::Temp->import( 'tempfile' ); my( $fh, $filename ) = tempfile( 'wcvtempXXXX', SUFFIX => '.html', UNLINK => 1 ); print $fh $ua->content; close $fh; $self->{v} and print "[$XMLLINT $opts $filename 2>\&1]\n"; $self->{v} and printf "xmllint(%s): %s ", $filename, $ua->uri; $stats->{validate} = $filename; my $out = qx[$XMLLINT $opts $filename 2>\&1]; $self->{v} and print $out; $stats->{valid} = defined $out ? $out eq '' : -1; $self->{v} and printf " done(%sok)\n", $stats->{valid} == 1 ? "" : "not "; }
sub validate_style { my( $self, $ua ) = @_; $self->{novalidate} and return -1; my $how_to = $self->{validate} || 'by_none'; my $validate = "style_$how_to"; $self->can( $validate ) or $validate = 'style_by_none'; $self->$validate( $ua ); }
sub style_by_none { return -1; }
sub style_by_uri { my( $self, $ua ) = @_; my $uri = $ua->uri; $self->{v} and print "CSS-Validate $VALIDATOR_STYLE?$uri: "; $self->{lang} and $ua->default_header( 'Accept-Language' => 'en' ); $ua->get( $VALIDATOR_STYLE ); $ua->submit_form( form_number => 1, fields => { uri => $uri }, ); my $valid = $ua->success ? $ua->content =~ /This document validates as / : -1; $self->{v} and printf "done(%sok)\n", $valid == 1 ? "" : "not "; $self->{lang} and $ua->default_header( 'Accept-Language' => $self->{lang} ); return $valid; }
sub style_by_upload { my( $self, $ua ) = @_; eval "use File::Temp"; return if $@; File::Temp->import( 'tempfile' ); my( $fh, $filename ) = tempfile( 'wcvtempXXXX', SUFFIX => '.css', UNLINK => 1 ); print $fh $ua->content; close $fh; $self->{v} and printf "CSS-Validate_upl(%s): %s ", $filename, $ua->uri; $self->{lang} and $ua->default_header( 'Accept-Language' => 'en' ); $ua->get( $VALIDATOR_STYLE ); $ua->submit_form( form_number => 2, fields => { file => $filename }, ); my $valid = $ua->success ? $ua->content !~ m|<h2>Errors</h2>|i : -1; $self->{v} and printf " done(%sok)\n", $valid == 1 ? "" : "not "; $self->{lang} and $ua->default_header( 'Accept-Language' => $self->{lang} ); return $valid; }
sub validate_image { my( $self, $ua ) = @_; my $image = $ua->content; my $iinfo = Image::Info::image_info( \$image ); return ! $iinfo->{error}; }
sub ct_can_validate { my( $self, $ua ) = @_; return $ua->ct =~ m[^\Qtext/html\E] || $ua->ct =~ m[^\Qtext/xhtml\E] || $ua->ct =~ m[^\Qapplication/xhtml+xml\E] || $ua->ct =~ m[^\Qapplication/vnd.wap.xhtml+xml\E]; }
sub set_action { my( $self, $check, $in_cache ) = @_; my $reason = ($in_cache->[0] & WCS_OUTSCOPE) ? $self->{_uri_ok} eq 'scope' ? 'Out of scope' : 'Excluded by pattern' : ''; $reason ||= ($in_cache->[0] & WCS_SPIDERED) ? 'done' : ''; $reason ||= ($in_cache->[0] & WCS_NOCONTENT) ? 'no text/html' : ''; return $reason ? "[c] Skip: ($reason)" : "[c] Spider: $check"; }