WWW::Webrobot::HtmlAnalyzer - analyze HTML files for links/images/frames


webrobot documentation Contained in the webrobot distribution.

Index


Code Index:

NAME

Top

WWW::Webrobot::HtmlAnalyzer - analyze HTML files for links/images/frames

SYNOPSIS

Top

 WWW::Webrobot::HtmlAnalyzer -> get_links($scheme, $input)

DESCRIPTION

Top

Analyze an HTML file. Returns a list of images, a list of frames and a list of links.

METHODS

Top

WWW::Webrobot::HtmlAnalyzer -> get_links($scheme, $input)

Extract all links found in an HTML page

Parameters:

 $scheme    base URI of the content; relative links are resolved against it
 $input     content, same form as in HTML::TokeParser->new($input)

 return (\@img, \@frame, \@a, $refresh);
 \@img      list of images
 \@frame    list of frames
 \@a        list of plain links
 $refresh   URL taken from a <meta http-equiv="refresh"> tag, undef if none


webrobot documentation Contained in the webrobot distribution.

package WWW::Webrobot::HtmlAnalyzer;
use strict;

# Author: Stefan Trcek
# Copyright(c) 2004 ABAS Software AG

use HTML::TokeParser;

# get_links($scheme, $in) -- static method
#
# Scan an HTML document and collect the URLs it references.
#
# Parameters:
#   $scheme  base URI of the document; relative URLs are resolved against it
#   $in      document source, handed unchanged to HTML::TokeParser->new
#
# Returns the list (\@img, \@frame, \@a, $refresh):
#   \@img     absolute URLs taken from <img src=...>
#   \@frame   absolute URLs taken from <frame src=...>
#   \@a       absolute URLs taken from <a href=...>
#   $refresh  absolute target URL of the last <meta http-equiv="refresh">
#             tag that names a URL, or undef if there is none
#
# Dies if HTML::TokeParser cannot be constructed for $in.
sub get_links  { # static method
    my ($self, $scheme, $in) = @_;

    # URI is used below but is not loaded at file level; load it on demand so
    # this method does not depend on the caller having loaded it already.
    require URI;

    my $p = HTML::TokeParser -> new($in)
        or die "get_links: cannot create HTML::TokeParser for input: $!\n";

    my (@img, @frame, @a);
    my $refresh;

    # tag name => [ attribute that holds the URL, list that collects it ]
    my %collect = (
        img   => [ 'src',  \@img   ],
        frame => [ 'src',  \@frame ],
        a     => [ 'href', \@a     ],
    );

    while (my $token = $p -> get_tag(qw(img frame a meta))) {
        my ($tag, $attr) = @$token;

        if (my $rule = $collect{$tag}) {
            my ($name, $list) = @$rule;
            my $href = $attr -> {$name};
            # Test definedness/length rather than truth so that a relative
            # link of "0" is kept; tags without the attribute are skipped.
            next unless defined $href && length $href;
            push @$list, URI -> new($href) -> abs($scheme) -> as_string();
            next;
        }

        # Only <meta> tags get here; of those only refresh directives matter.
        # The attribute value is matched case-insensitively ("Refresh" is common).
        next unless lc($attr -> {'http-equiv'} || '') eq 'refresh';

        my $content = $attr -> {'content'};
        next unless defined $content;

        # Expected form: "<seconds>; URL=<target>". Whitespace around the
        # separator is optional and "URL" may be written in any case.
        my ($target) = $content =~ /^\s*\d+\s*[;,]\s*URL\s*=\s*(.*?)\s*$/i;
        next unless defined $target;

        # The target may be wrapped in a pair of matching quotes.
        $target =~ s/^(["'])(.*)\1$/$2/;
        next unless length $target;

        # Assign to the outer $refresh (do not redeclare it here) so the value
        # actually reaches the caller.
        $refresh = URI -> new($target) -> abs($scheme) -> as_string();
    }

    return (\@img, \@frame, \@a, $refresh);
}

1;