| HTML-ResolveLink documentation | Contained in the HTML-ResolveLink distribution. |
HTML::ResolveLink - Resolve relative links in (X)HTML into absolute URI
use HTML::ResolveLink;
my $resolver = HTML::ResolveLink->new(
base => 'http://www.example.com/foo/bar.html',
callback => sub {
my($uri, $old) = @_;
# ...
},
);
$html = $resolver->resolve($html);
HTML::ResolveLink is a module to rewrite relative links in XHTML or HTML into absolute URI.
For example. when you have
<a href="foo.html">foo</a> <img src="/bar.gif" />
and use http://www.example.com/foo/bar as base URL, you'll get:
<a href="http://www.example.com/foo/foo.html">foo</a> <img src="http://www.example.com/bar.gif" />
If the parser encounters <base> tag in HTML, it'll honor that.
my $resolver = HTML::ResolveLink->new(
base => 'http://www.example.com/',
callback => \&callback,
);
base is a required parameter, which is used to resolve the relative
URI found in the document.
callback is an optional parameter, which is a callback subroutine
reference which would take new resolved URI and the original path as
arguments.
Here's an example code to illustrate how to use callback function.
my $count;
my $resolver = HTML::ResolveLink->new(
base => $base,
callback => sub {
my($uri, $old) = @_;
warn "$old is resolved to $uri";
$count++;
},
);
$html = $resolver->resolve($html);
if ($count) {
warn "HTML::ResolveLink resolved $count links";
}
$html = $resolver->resolve($html);
Resolves relative URI found in $html into absolute and returns a
string containing rewritten one.
$count = $resolver->resolved_count;
Returns how many URIs are resolved during the previous resolve method call. This should be called after the resolve, otherwise returns undef.
Tatsuhiko Miyagawa <miyagawa@bulknews.net>
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
| HTML-ResolveLink documentation | Contained in the HTML-ResolveLink distribution. |
package HTML::ResolveLink; use strict; our $VERSION = '0.05'; use base qw(HTML::Parser); use Carp; use HTML::Tagset (); use URI; sub new { my($class, %p) = @_; my $self = $class->SUPER::new( start_h => [ \&_start_tag, "self,tagname,attr,attrseq,text" ], default_h => [ \&_default, "self,tagname,attr,text" ], ); unless ($p{base}) { Carp::croak("HTML::ResolveLink->new: base is a required parameter"); } $p{base} = URI->new($p{base}) unless ref $p{base}; $self->{resolvelink_base} = $p{base}; $self->{resolvelink_callback} = $p{callback} if $p{callback}; $self; } sub _start_tag { my($self, $tagname, $attr, $attrseq, $text) = @_; if ($tagname eq 'base' && defined $attr->{href}) { $self->{resolvelink_base} = $attr->{href}; } my $base = $self->{resolvelink_base}; my $links = $HTML::Tagset::linkElements{$tagname} || []; $links = [$links] unless ref $links; for my $a (@$links) { next unless exists $attr->{$a}; my $link = $attr->{$a}; my $uri = URI->new($link); # relative link: unless (defined $uri->scheme) { my $old = $uri; $uri = $uri->abs($base); $attr->{$a} = $uri->as_string; if ($self->{resolvelink_callback}) { $self->{resolvelink_callback}->($uri, $old); } $self->{resolvelink_count}++; } } $self->{resolvelink_html} .= "<$tagname"; for my $a (@$attrseq) { next if $a eq '/'; $self->{resolvelink_html} .= sprintf qq( %s="%s"), $a, _escape($attr->{$a}); } $self->{resolvelink_html} .= ' /' if $attr->{'/'}; $self->{resolvelink_html} .= '>'; } sub _default { my($self, $tagname, $attr, $text) = @_; $self->{resolvelink_html} .= $text; } my %escape = ( '<' => '<', '>' => '>', '"' => '"', '&' => '&', ); my $esc_re = join '|', keys %escape; sub _escape { my $str = shift; $str =~ s/($esc_re)/$escape{$1}/g; $str; } sub resolve { my($self, $html) = @_; # init $self->{resolvelink_html} = ''; $self->{resolvelink_count} = 0; $self->parse($html); $self->eof; $self->{resolvelink_html}; } sub resolved_count { my $self = shift; $self->{resolvelink_count}; } 1; __END__