HTML::SiteTear::PageFilter - change link pathes in HTML files.


HTML-SiteTear documentation Contained in the HTML-SiteTear distribution.

Index


Code Index:

NAME

Top

HTML::SiteTear::PageFilter - change link pathes in HTML files.

SYMPOSIS

Top

 use HTML::SiteTear::PageFilter;

 # $page must be an instance of L<HTML::SiteTear::Page>.
 $filter = HTML::SiteTear::PageFilter->new($page);
 $fileter->parse_file();

DESCRIPTION

Top

This module is to change link pathes in HTML files. It's a sub class of HTML::Parser. Internal use only.

METHODS

Top

new

    $filter = HTML::SiteTear::PageFilter->new($page);

Make an instance of this moduel. $parent must be an instance of HTML::SiteTear::Root or HTML::SiteTear::Page. This method is called from $parent.

parse_file

    $filter->parse_file;

Parse the HTML file given by $page and change link pathes. The output data are retuned thru the method "write_data".

SEE ALOSO

Top

HTML::SiteTear, HTML::SiteTear::Item, HTML::SiteTear::Root, HTML::SiteTear:Page

AUTHOR

Top

Tetsuro KURITA <tkurita@mac.com>


HTML-SiteTear documentation Contained in the HTML-SiteTear distribution.
package HTML::SiteTear::PageFilter;

use strict;
use warnings;
use File::Basename;
use Encode;
use Encode::Guess;
use URI;
#use Data::dumper;

use HTML::Parser 3.40;
use HTML::HeadParser;
use base qw(HTML::Parser Class::Accessor);
__PACKAGE__->mk_accessors(qw(has_remote_base
                            page));

use HTML::Copy;

our $VERSION = '1.43';
our @htmlSuffix = qw(.html .htm .xhtml);

sub new {
    my ($class, $page) = @_;
    my $parent = $class->SUPER::new();
    my $self = bless $parent, $class;
    $self->page($page);
    $self->{'allow_abs_link'} = $page->source_root->allow_abs_link;
    $self->{'use_abs_link'} = 0;
    $self->has_remote_base(0);
    return $self;
}

sub parse_file {
    my ($self) = @_;
    my $p = HTML::Copy->new($self->page->source_path);
    $self->page->set_binmode($p->io_layer);
    $self->SUPER::parse($p->source_html);
}

##== private methods
sub output {
  my ($self, $data) = @_;
  $self->page->write_data($data);
}

##== overriding methods of HTML::Parser

sub declaration { $_[0]->output("<!$_[1]>")     }
sub process     { $_[0]->output($_[2])          }
sub end         { $_[0]->output($_[2])          }
sub text        { $_[0]->output($_[1])          }

sub comment {
    my ($self, $comment) = @_;

    if ($self->{'allow_abs_link'}) {
        if ($comment =~ /^\s*begin abs_link/) {
            $self->{'use_abs_link'} = 1;
        
        } elsif($comment =~ /^\s*end abs_link/) {
            $self->{'use_abs_link'} = 0;
        }
    }

    $self->output("<!--$comment-->");
}

sub start {
    my ($self, $tag, $attr_dict, $attr_names, $tag_text) = @_; 
    my $page = $self->page;
    my $empty_tag_end = ($tag =~ /\/>$/) ? ' />' : '>';
    
    if ($self->has_remote_base) {
        return $self->output($tag_text);
    }
    
    my $process_link = sub {
        my ($target_attr, $folder_name, $kind) = @_;
        if (my $link = $attr_dict->{$target_attr}) {
            if ($self->{'use_abs_link'}) {
                $attr_dict->{$target_attr} = $page->build_abs_url($link);
            } else {
                unless ($kind) {$kind = $folder_name};
                $attr_dict->{$target_attr} 
                        = $page->change_path($link, $folder_name, $kind);
            }
            return HTML::Copy->build_attributes($attr_dict, $attr_names);
        }
        return ();
    };
    
    #treat image files
    if ($tag eq 'base') {
        my $uri = URI->new($attr_dict->{'href'});
        if (!($uri->scheme) or ($uri->scheme eq 'file')) {
            $page->base_uri($uri->abs($page->base_uri));
            $tag_text  = '';
        } else {
            $self->has_remote_base(1);
        }
        
    } elsif ($tag eq 'img') {
        if (my $tag_attrs = &$process_link('src', $page->resource_folder_name)) {
            $tag_text = "<$tag $tag_attrs".$empty_tag_end;
        }

    } elsif ($tag eq 'body') { #background images
        if (my $tag_attrs = &$process_link('background', $page->resource_folder_name)) {
            $tag_text = "<$tag $tag_attrs>";
        }
    }
    #linked stylesheet
    elsif ($tag eq 'link') {
        my $folder_name = $page->resource_folder_name;
        my $kind = $folder_name;
        my $relation;
        if (defined( $relation = ($attr_dict ->{'rel'}) )){
            $relation = lc $relation;
            if ($relation eq 'stylesheet') {
                $kind = 'css';
            }
        }
        
        if (my $tag_attrs = &$process_link('href', $folder_name, $kind)) {
            $tag_text = "<$tag $tag_attrs".$empty_tag_end;
        }
    }
    #frame
    elsif ($tag eq 'frame') {
        if (my $tag_attrs = &$process_link('src', $page->page_folder_name, 'page')) {
            $tag_text = "<$tag $tag_attrs".$empty_tag_end;
        }    
    }
    #javascript
    elsif ($tag eq 'script') {
        if (my $tag_attrs = &$process_link('src', $page->resource_folder_name)) {
            $tag_text = "<$tag $tag_attrs>";
        }
    }
    #link
    elsif ($tag eq 'a') {
        if ( exists($attr_dict->{'href'}) ) {
            my $href =  $attr_dict->{'href'};
            my $kind = 'page';
            my $folder_name = $page->page_folder_name;
            if ($href !~/(.+)#(.*)/) {
                my @matchedSuffix = grep {$href =~ /\Q$_\E$/} @htmlSuffix;
                unless (@matchedSuffix) {
                    $folder_name = $page->resource_folder_name;
                    $kind = $folder_name;
                }
            }
            if (my $tag_attrs = &$process_link('href', $folder_name, $kind)) {
                $tag_text = "<$tag $tag_attrs>";
            }
        }
    }
    
    $self->output($tag_text);
}

1;