XML::Driver::HTML - SAX Driver for non wellformed HTML.


XML-Driver-HTML documentation Contained in the XML-Driver-HTML distribution.

Index


Code Index:

NAME

Top

XML::Driver::HTML - SAX Driver for non wellformed HTML.

SYNOPSIS

Top

  use XML::Driver::HTML;

  $driver = new XML::Driver::HTML(
  	'Handler' => $some_sax_filter_or_handler,
	'Source' => $some_PerlSAX_like_hash
	);

  $driver->parse();

or

  use XML::Driver::HTML;

  $driver = new XML::Driver::HTML();

  $driver->parse(
  	'Handler' => $some_sax_filter_or_handler,
	'Source' => $some_PerlSAX_like_hash
	);

  $driver->parse(
  	'Handler' => $some_other_sax_filter_or_handler,
	'Source' => $some_other_source
	);

DESCRIPTION

Top

XML::Driver::HTML is a SAX Driver for HTML. There is no need for the HTML input to be weel formed, as XML::Driver::HTML is generating its SAX events by walking a HTML::TreeBuilder object. The simplest kind of use, is a filter from HTML to XHTML using XML::Handler::YAWriter as a SAX Handler.

    my $ya = new XML::Handler::YAWriter( 
	'Output' => new IO::File ( ">-" ),
	'Pretty' => {
	    'NoWhiteSpace'=>1,
	    'NoComments'=>1,
	    'AddHiddenNewline'=>1,
	    'AddHiddenAttrTab'=>1,
	    }
	);

    my $html = new XML::Driver::HTML(
	'Handler' => $ya,
	'Source' => { 'ByteStream' => new IO::File ( "<-" ) }
	);

    $html->parse();

METHODS

new

Creates a new XML::Driver::HTML object. Default options for parsing, described below, are passed as key-value pairs or as a single hash. Options may be changed directly in the object.

parse

Parses a document. Options, described below, are passed as key-value pairs or as a single hash. Options passed to parse() override the default options in the parser object for the duration of the parse.

OPTIONS

The following options are supported by XML::Driver::HTML :

Handler

Default SAX Handler to receive events

Source

Hash containing the input source for parsing. The `Source' hash may contain the following parameters:

ByteStream

The raw byte stream (file handle) containing the document.

String

A string containing the document.

SystemId

The system identifier (URL) of the document.

Encoding

A string describing the character encoding.

If more than one of `ByteStream', `String', or `SystemId', then preference is given first to `ByteStream', then `String', then `SystemId'.

NOTES

Top

XML::Driver::HTML requires Perl 5.6 to convert from ISO-8859-1 to UTF-8.

BUGS

Top

not yet implemented:

    Interpretation of SystemId as being an URI
    XHTML document type

other bugs:

    HTML::Parser and HTML::TreeBuilder bugs concerning DOCTYPE and CSS.
    Perl handling of UFT8 is compatible between different versions. So
    you need exactly Perl 5.6.0, not lower not higher.

AUTHOR

Top

  Michael Koehne, Kraehe@Copyleft.De
  (c) 2001 GNU General Public License

SEE ALSO

Top

XML::Parser::PerlSAX and HTML::TreeBuilder


XML-Driver-HTML documentation Contained in the XML-Driver-HTML distribution.

# XML::Driver::HTML
#
# Copyright (c) 2000 Michael Koehne <kraehe@copyleft.de>
#
# XML::Driver::HTML is free software. You can use and redistribute
# this copy under terms of the GNU General Public License.

use 5.006;
no warnings 'utf8' ;

package XML::Driver::HTML;

use HTML::TreeBuilder;
use strict;
use vars qw($VERSION $METHODS);

$VERSION = '0.06';
$METHODS = {
	start_document => 1,
	end_document => 1,
	start_element => 1,
	end_element => 1,
	characters => 1,
	comment => 1
	};

$HTML::TreeBuilder::Debug = 0; # default debug level

sub new {
    my $proto = shift;
    my $self  = ($#_ == 0) ? { %{ (shift) } } : { @_ };
    my $class = ref($proto) || $proto;

    bless($self, $class);
}

sub parse {
    my $self = shift;
    my $args = ($#_ == 0) ? { %{ (shift) } } : { @_ };
    my $file;
    my $result = undef;

    $self->{'Source'} = $args->{'Source'} if $args->{'Source'};
    $self->{'Handler'} = $args->{'Handler'} if $args->{'Handler'};

    die "no Source defined" unless $self->{'Source'};
    die "no Handler defined" unless $self->{'Handler'};

    my $h = HTML::TreeBuilder->new;
    $h->ignore_unknown(1);
    $h->warn(0);
    $h->{'_store_comments'}=1;
    $h->{'_store_declarations'}=1;

    if ($self->{'Source'}{'ByteStream'}) {
    	$h->parse_file($self->{'Source'}{'ByteStream'});
    } elsif ($self->{'Source'}{'String'}) {
    	$h->parse($self->{'Source'}{'String'});
	$h->eof();
    } elsif ($self->{'Source'}{'SystemId'}) {
    	$h->parse_file($self->{'Source'}{'SystemId'});
    } else {
    	die "no Source defined";
    }

    $self->{'Methods'} = {};
    foreach (keys %$METHODS) {
	$self->{'Methods'}{$_} = 1 if $self->{'Handler'}->can($_);
    }

    delete $self->{'Recode'};
    $self->{'Recode'} = 1 if lc($self->{'Source'}{'Encoding'}) eq "iso-8859-1";
    $self->{'Recode'} = 1 if lc($self->{'Source'}{'Encoding'}) eq "iso88591";
    $self->{'Recode'} = 1 if lc($self->{'Source'}{'Encoding'}) eq "latin1";

    $self->{'Handler'}->start_document()
        if $self->{'Methods'}{'start_document'};
    $self->dumptree($h);
    $result = $self->{'Handler'}->end_document()
        if $self->{'Methods'}{'end_document'};

    $h = $h->delete(); # nuke it!

    return $result;
}

sub dumptree {
    my ($self,$element) = @_;

    my $tag = $element->{'_tag'};
    my $cont = $element->{'_content'};
    my $attr = {};
    my $value;

    return if $tag eq "style";

    if ($tag) {
    	if ($self->{'Recode'}) {
            foreach (keys %$element) {
                if ($_ !~ '^_') {
                    $value = $element->{$_};
                    $value =~
                        s/([\x80-\xFF])/chr(0xC0|ord($1)>>6).chr(0x80|ord($1)&0x3F)/eg;
                    $attr->{$_} = $value;
                }
            }
	} else {
	    foreach (keys %$element) {
	        $attr->{$_} = $element->{$_}
			if $_ !~ '^_';
	    }
	}

	if ($tag !~ /^\~/) {
	    $self->{'Handler'}->start_element( {
		    'Name' => $tag, 'Attributes' => $attr
		    } )
		if $self->{'Methods'}{'start_element'};

	    foreach (@$cont) {
		if (ref $_ eq 'HTML::Element') {
		    $self->dumptree($_)
		} else {
                    s/([\x80-\xFF])/chr(0xC0|ord($1)>>6).chr(0x80|ord($1)&0x3F)/eg
                        if $self->{'Recode'};
		    $self->{'Handler'}->characters( { 'Data' => $_ } )
			if ($self->{'Methods'}{'characters'} && (lc($_) !~ "^[ \t]*<!"));
		}
	    }

	    $self->{'Handler'}->end_element( { 'Name' => $tag } )
		if $self->{'Methods'}{'end_element'};
	}
	if ($tag eq "~comment") {
	    $self->{'Handler'}->comment( { 'Data' => $attr->{'text'} } )
	    	if $self->{'Methods'}{'comment'};
	}
    }
}

1;
__END__