Plagger::Plugin::Filter::HTMLTidy - Filters body HTML using HTML::Tidy


Plagger documentation Contained in the Plagger distribution.

Index


Code Index:

NAME

Top

Plagger::Plugin::Filter::HTMLTidy - Filters body HTML using HTML::Tidy

SYNOPSIS

Top

  - module: Filter::HTMLTidy
    config:
      output-xhtml: yes
      char-encoding: utf-8

DESCRIPTION

Top

This plugin hooks HTML::Tidy in as an entry filter: it runs each entry's HTML body through HTML::Tidy to clean up the markup. It is best used together with Publish plugins such as Planet.

CONFIG

Top

This plugin accepts any config option that can be used in an HTML Tidy config file. See http://tidy.sourceforge.net/docs/quickref.html for details.

AUTHOR

Top

Tatsuhiko Miyagawa

SEE ALSO

Top

Plagger, HTML::Tidy, http://tidy.sourceforge.net/docs/quickref.html


Plagger documentation Contained in the Plagger distribution.

package Plagger::Plugin::Filter::HTMLTidy;
use strict;
use base qw( Plagger::Plugin );

use HTML::Tidy;

# Plugin entry point: attach this plugin's filter() callback to the
# 'update.entry.fixup' hook of the given context.
#
#   $self    - this plugin instance
#   $context - object providing register_hook()
sub register {
    my $self    = shift;
    my $context = shift;

    # Hook name => callback pairs handed to the context as a flat list.
    my %hooks = ( 'update.entry.fixup' => \&filter );

    $context->register_hook($self, %hooks);
}

# Fallback HTML::Tidy options. filter() copies each of these into the
# plugin config whenever the config does not already set that key.
our %defaults = (
    break_before_br => 0,
    doctype         => 'omit',
    input_encoding  => 'utf8',
    output_encoding => 'utf8',
    output_xhtml    => 1,
    tidy_mark       => 0,
    wrap            => 0,
);

# Hook callback for 'update.entry.fixup': run the entry's HTML body through
# HTML::Tidy and store the tidied markup back on the entry.
#
#   $self    - this plugin instance; its conf supplies the tidy options
#   $context - hook context (not used here)
#   $args    - hook arguments; $args->{entry} is the entry to clean
#
# Returns without touching the entry when it has no body, when the body is
# not HTML, or when tidy yields no output.
sub filter {
    my($self, $context, $args) = @_;

    my $body = $args->{entry}->body;
    return unless $body && $body->is_html;

    # Work on a copy so the defaults are not written back into the plugin's
    # own config, and so the merged options are the ones tidy really gets.
    my %tidy_conf = %{ $self->conf || {} };

    # Option names may be spelled with '-' (as in the SYNOPSIS and the tidy
    # quick reference) or with '_' (as in %defaults); HTML::Tidy is expected
    # to treat the two spellings as the same option. Compare normalized
    # names so that a user's "output-xhtml" suppresses the "output_xhtml"
    # default instead of both being passed along.
    my %configured;
    for my $key (keys %tidy_conf) {
        (my $name = $key) =~ tr/_/-/;
        $configured{$name} = 1;
    }
    while (my($key, $value) = each %defaults) {
        (my $name = $key) =~ tr/_/-/;
        $tidy_conf{$key} = $value unless $configured{$name};
    }

    my $tidy = HTML::Tidy->new(\%tidy_conf);
    $tidy->ignore( type => TIDY_WARNING );
    my $new_body = $tidy->clean($body->data); # pass in Unicode string, not UTF-8

    # If tidy produced nothing, keep the original body rather than
    # replacing it with an undefined value.
    return unless defined $new_body;

    # HACK to extract <body /> only
    $new_body =~ s!^.*<body>\s*(.*?)\s*</body>\s*</html>\s*$!$1!s;

    $args->{entry}->body($new_body);
}

1;
__END__