/usr/local/CPAN/HTML-WebMake/HTML/WebMake/HTMLCleaner.pm
#
package HTML::WebMake::HTMLCleaner;
use Carp;
use strict;
use HTML::Parser;
use HTML::WebMake::Main;
use vars qw{
@ISA
@ALLFEATURES $INLINE_TAGS $KEEP_FORMAT_TAGS
$EMPTY_ELEMENT_TAGS $BOOL_ATTR_VALUE
};
@ISA = qw(HTML::Parser);
@ALLFEATURES = qw{
pack nocomments addimgsizes addxmlslashes fixcolors cleanattrs
indent fixhrefs
};
$KEEP_FORMAT_TAGS = qr{(?:xmp|listing|pre|plaintext)};
$INLINE_TAGS = qr{(?:a|b|i|em|q|strong|h\d|code|abbr|acronym|address|big|cite|del|ins|s|small|strike|sub|sup|u|samp|kbd|var|img|span)};
$EMPTY_ELEMENT_TAGS = qr{(?:area|base|basefont|bgsound|br|col|embed|frame|hr|img|input|isindex|keygen|link|meta|param|spacer|wbr)};
###########################################################################
sub new {
my $class = shift;
$class = ref($class) || $class;
my ($main) = @_;
my $self = $class->SUPER::new ( api_version => 2 );
$self->{main} = $main;
$self->clear_features();
$BOOL_ATTR_VALUE = undef;
# this parameter is not supported in earlier versions
if ($HTML::Parser::VERSION >= 3.00) {
my $val = '==BOOL_TRUE==';
eval '
$self->boolean_attribute_value ($val);
$BOOL_ATTR_VALUE = $val;
';
}
bless ($self, $class);
$self;
}
###########################################################################
sub select_features {
my ($self, $feats) = @_;
$self->clear_features();
if ($feats =~ /\ball\b/i) {
foreach my $feat (@ALLFEATURES) {
$self->{$feat} = 1;
}
}
foreach my $feat (split (' ', $feats)) {
if ($feat =~ s/^\-//) {
$self->{$feat} = 0; # turned off
} else {
$self->{$feat} = 1; # turned on
}
}
}
sub clear_features {
my ($self) = @_;
foreach my $feat (@ALLFEATURES) {
$self->{$feat} = 0;
}
}
###########################################################################
sub clean {
my ($self, $txt, $fname) = @_;
$self->{out} = [ ];
$self->{in_pre} = 0;
$self->{indent_level} = 0;
$self->{indent_str} = '';
$self->{indent_depth} = 2;
$self->{last_was_noninline_close_tag} = 0;
$self->{last_text_was_whitespace} = 0;
$self->parse ($$txt); $self->eof();
if ($self->{indent_level} > 0) {
warn "HTML cleaner: unbalanced tags found in $fname\n";
}
join ('', @{$self->{out}});
}
###########################################################################
sub start {
my($self, $tagname, $attr, $attrseq, $origtext) = @_;
if ($tagname =~ /^${KEEP_FORMAT_TAGS}$/) {
$self->{in_pre}++;
}
my $is_inline_tag;
if ($tagname =~ /^${INLINE_TAGS}$/) {
$is_inline_tag = 1;
if ($self->{last_text_was_whitespace}) {
$self->add_text (" ");
}
} else {
$is_inline_tag = 0;
}
if (!$self->{cleanattrs}) {
$self->add_text ($origtext);
} else {
$self->clean_attrs_at_start ($tagname, $attr, $attrseq, $origtext);
}
if (!$is_inline_tag && !$self->{in_pre}) {
$self->add_text ("\n");
if (!$self->{in_pre}) {
if ($tagname !~ /^${EMPTY_ELEMENT_TAGS}$/) {
$self->open_indent();
} else {
$self->add_current_indent();
}
}
}
$self->{last_was_noninline_close_tag} = 0;
$self->{last_text_was_whitespace} = 0;
}
sub clean_attrs_at_start {
my ($self, $tagname, $attr, $attrseq, $origtext) = @_;
my $attrs = '';
my $imgsrc = '';
foreach my $name (@{$attrseq})
{
my $val = $attr->{$name};
if ($self->{fixcolors} && $name =~ /colou?r$/) {
if ($val =~ /^[\da-f]{6}$/i) {
$val = "#".$val; # color=004000 -> color="#004000"
}
}
if ($tagname eq 'img' && $name eq 'src') {
$imgsrc = $val;
}
if ($self->{fixhrefs} && ($name eq 'src' || $name eq 'href')) {
if ($val !~ /^[a-z0-9A-Z]+:/) {
$val = HTML::WebMake::Main::canon_path ($val);
$val =~ s,\\,/,g;
}
}
if (defined $BOOL_ATTR_VALUE && $val eq $BOOL_ATTR_VALUE) {
$attrs .= " ".$name;
} elsif (!defined $BOOL_ATTR_VALUE && $val eq $name) {
$attrs .= " ".$name;
} elsif ($val =~ /\"/) {
$val =~ s/\'/'/g;
$attrs .= " ".$name ."=\'".$val."\'";
} else {
$attrs .= " ".$name ."=\"".$val."\"";
}
}
my $tagend = '';
if ($self->{addxmlslashes} && $attrs !~ /\/\s*$/ &&
$tagname =~ /^${EMPTY_ELEMENT_TAGS}$/)
{
$tagend = " />"; # HTML-4, XHTML, XML style
} else {
$tagend = ">";
}
if ($self->{last_was_noninline_close_tag}) {
$self->add_current_indent();
}
if ($tagname eq 'img' && $self->{addimgsizes} &&
$attrs !~ /(height|width)/i &&
$imgsrc !~ /^(?:[a-z0-9A-Z]+:|\/)/)
{
$self->add_text ($self->{main}->fileless_subst
("(html-cleaner)", "<$tagname".$attrs.' ${IMGSIZE}>'));
} else {
$self->add_text ("<".$tagname, $attrs, $tagend);
}
}
# --------------------------------------------------------------------------
sub end {
my($self, $tagname, $origtext) = @_;
my $exiting_pre = ($tagname =~ /^${KEEP_FORMAT_TAGS}$/);
if ($exiting_pre) { $self->{in_pre}--; }
if ($tagname !~ /^${INLINE_TAGS}$/ && !$exiting_pre) {
if (!$self->{last_was_noninline_close_tag}) {
$self->add_text ("\n");
}
$self->close_indent();
$self->add_text ("</$tagname>\n");
$self->{last_was_noninline_close_tag} = 1;
} else {
$self->add_text ("</$tagname>");
$self->{last_was_noninline_close_tag} = 0;
}
$self->{last_text_was_whitespace} = 0;
}
# --------------------------------------------------------------------------
sub text {
my($self, $origtext, $is_cdata) = @_;
if ($self->{in_pre} > 0) {
$self->{last_was_noninline_close_tag} = 0;
$self->{last_text_was_whitespace} = 0;
$self->add_text ($origtext);
return;
} elsif ($origtext =~ /^\s*$/s) {
$self->{last_text_was_whitespace} = 1;
return;
}
$self->{last_was_noninline_close_tag} = 0;
$self->{last_text_was_whitespace} = 0;
$self->pack_text (\$origtext);
# or, to tidy up whitespace:
$self->add_text ($origtext);
}
# --------------------------------------------------------------------------
sub process {
my ($self, $origtext) = @_;
$self->add_text ("<?$origtext>\n");
$self->add_current_indent();
}
# --------------------------------------------------------------------------
sub comment {
my ($self, $origtext) = @_;
if (!$self->{nocomments}) {
$self->pack_text (\$origtext);
$self->add_text ("<!--$origtext-->\n");
$self->add_current_indent();
}
}
# --------------------------------------------------------------------------
sub declaration {
my ($self, $origtext) = @_;
$self->add_text ("<!$origtext>\n");
$self->add_current_indent();
}
###########################################################################
sub pack_text {
my($self, $txt) = @_;
if ($self->{pack} && !($self->{in_pre} > 0)) {
$$txt =~ s/\n\n+/\n/gm;
$$txt =~ s/[ \t]+/ /gm;
$$txt =~ s/^ / /gm;
$$txt =~ s/ $/ /gm;
my $indent = $self->get_current_indent();
$$txt =~ s/\n/\n${indent}/gs;
}
}
###########################################################################
sub add_text {
my $self = shift;
push (@{$self->{out}}, @_);
# $self->{last_was_indent} = 0;
}
sub open_indent {
my ($self) = @_;
if (!$self->{indent}) { return; }
$self->{indent_level} += $self->{indent_depth};
$self->{indent_str} = (' ' x $self->{indent_level});
# return if ($self->{last_was_indent});
push (@{$self->{out}}, $self->{indent_str});
# $self->{last_was_indent} = 1;
}
sub close_indent {
my ($self) = @_;
if (!$self->{indent}) { return; }
$self->{indent_level} -= $self->{indent_depth};
if ($self->{indent_level} < 0) { $self->{indent_level} = 0; }
$self->{indent_str} = (' ' x $self->{indent_level});
# return if ($self->{last_was_indent});
push (@{$self->{out}}, $self->{indent_str});
# $self->{last_was_indent} = 1;
}
sub add_current_indent {
my ($self) = @_;
if (!$self->{indent}) { return; }
# return if ($self->{last_was_indent});
push (@{$self->{out}}, $self->{indent_str});
# $self->{last_was_indent} = 1;
}
sub get_current_indent {
my ($self) = @_;
$self->{indent_str};
}
###########################################################################
1;