WWW::Webrobot::Recur::LinkChecker - check all links you can get.


webrobot documentation Contained in the webrobot distribution.

Index


Code Index:

NAME

Top

WWW::Webrobot::Recur::LinkChecker - check all links you can get.

SYNOPSIS

Top

see WWW::Webrobot::pod::Testplan

DESCRIPTION

Top

This module allows to load an HTML page, extract all contained frames (recursivly), all images, and all links. It then follows these references

METHODS

Top

new ()

Constructor

$obj -> next ($r)

See next in WWW::Webrobot::pod::Recur

$obj -> allowed ($url)

See allowed in WWW::Webrobot::pod::Recur


webrobot documentation Contained in the webrobot distribution.
package WWW::Webrobot::Recur::LinkChecker;
use strict;
use warnings;

# Author: Stefan Trcek
# Copyright(c) 2004 ABAS Software AG


use WWW::Webrobot::HtmlAnalyzer;
use WWW::Webrobot::Tree2Postfix;

my $unary_operator = {
    'not' => sub { ! $_[0] },
};

my $binary_operator = {
    'and' => sub { $_[0] && $_[1] },
    'or'  => sub { $_[0] || $_[1] },
};

my $predicate = {
    url => sub {
        my ($uri, $tree) = @_;
        my $regex = $tree->{value};
        #print "REGEX=$regex URI=$uri\n";
        return $uri =~ /$regex/ ? 1 : 0;
    },
    scheme => sub {
        my ($uri, $tree) = @_;
        my $regex = $tree->{value};
        my $arg = URI -> new($uri) -> scheme();
        #return 0;
        return $arg =~ /$regex/ ? 1 : 0;
    },
    host => sub {
        my ($uri, $tree) = @_;
        my $regex = $tree->{value};
        my $arg = URI -> new($uri) -> host();
        return $arg =~ $regex ? 1 : 0;
    },
    port => sub {
        my ($uri, $tree) = @_;
        my $regex = $tree->{value};
        my $arg = URI -> new($uri) -> port();
        return $arg =~ $regex ? 1 : 0;
    },
    'host:port' => sub {
        my ($uri, $tree) = @_;
        my $regex = $tree->{value};
        my $host = URI -> new($uri) -> host();
        my $port = URI -> new($uri) -> port();
        my $arg = "$host:$port";
        return $arg =~ $regex ? 1 : 0;
    },
};



sub new {
    my $class = shift;
    my $self = bless({}, ref($class) || $class);
    my ($tree) = @_;
    $self->{evaluator} = WWW::Webrobot::Tree2Postfix -> new(
        $unary_operator, $binary_operator, $predicate, "and"
    );
    $self->{evaluator}->tree2postfix($tree);
    $self->{follow_link} = sub {
        my ($result, $error) = $self->{evaluator}->eval_postfix($_[0]);
        return $result ? 1 : 0;
    };
    $self->{url_rejected} = sub {};
    $self->{url_accepted} = sub {};
    $self->{ignore_img} = 0;
    return $self;
}


sub next {
    my $self = shift;
    my ($r) = @_;
    my $in = $r -> {'_content'};
    my $uri = $r -> {_request} -> {_uri};
    $self -> {current_uri} = $uri;
    if ($self -> is_type("text/html", $r->{_headers}->{'content-type'})) {
	# nur in einer HTML-Seite gibt es neue Links
	my ($img, $frame, $a, $refresh) = WWW::Webrobot::HtmlAnalyzer -> get_links($uri, \$in);
        if ($self->{ignore_img}) {
            foreach my $img_url (@$img) {
                $self -> {url_rejected} -> ($img_url);
            }
            $img = [];
        }

	($img, $frame,$a) = $self -> only_allowed($img, $frame,$a);
	push @{$self -> {img}}, @$img;
	push @{$self -> {frame}}, @$frame;
	push @{$self -> {a}}, @$a;
    }
    my $e = $self -> next_link($self->{img}, $self->{frame}, $self->{a});
    $self -> {visited} -> {$e} = 1 if defined $e;
    return (defined $e) ? ($e, $self->{seen}->{$e} || []) : (undef, undef);
}


sub is_type {
    my $self = shift;
    my ($match, $obj) = @_;
    return 0 if !defined $obj;
    $obj = [$obj] if !ref($obj);
    foreach (@$obj) {
	return 1 if m/$match/;
    }
    return 0;
}


sub allowed {
    my ($self, $uri) = @_;
    return $self -> {follow_link} -> ($uri);
}


sub only_allowed {
    my $self = shift;
    my @ret = ();
    foreach my $array (@_) {
	# delete all links that are not allowed
	my @new = ();
	foreach (@$array) {
	    if (!defined($self -> {seen} -> {$_})) { # link unseen yet
		$self -> {seen} -> {$_} = [] if !defined $self -> {seen} -> {$_};
		push @{$self -> {seen} -> {$_}}, $self->{current_uri};
		if ($self -> allowed($_)) {
		    push @new, $_;
		    $self -> {url_accepted} -> ($_);
		}
		else {
		    $self -> {url_rejected} -> ($_);
		}
	    }
	}
        push @ret, \@new;
    }
    return @ret;
}


sub next_link {
    my $self = shift;
    foreach my $array (@_) {
        my $n = shift @$array;
	return $n if defined $n;
    }
    return undef;
}


1;