| HTML-TreeBuilder-LibXML documentation | Contained in the HTML-TreeBuilder-LibXML distribution. |
HTML::TreeBuilder::LibXML - HTML::TreeBuilder and XPath compatible interface with libxml
use HTML::TreeBuilder::LibXML;
my $tree = HTML::TreeBuilder::LibXML->new;
$tree->parse($html);
$tree->eof;
# $tree and $node compatible to HTML::Element
my @nodes = $tree->findnodes($xpath);
for my $node (@nodes) {
print $node->tag;
my %attr = $node->all_external_attr;
}
HTML::TreeBuilder::LibXML->replace_original(); # replace HTML::TreeBuilder::XPath->new
HTML::TreeBuilder::LibXML is a libxml-based interface that is compatible with HTML::TreeBuilder, which can be slow for large documents.
HTML::TreeBuilder::LibXML is a drop-in replacement for HTML::TreeBuilder::XPath.
This module doesn't implement all of the HTML::TreeBuilder and HTML::Element APIs, but enough methods are defined that modules like Web::Scraper work.
This is a benchmark result produced by tools/benchmark.pl:
Web::Scraper: 0.26
HTML::TreeBuilder::XPath: 0.09
HTML::TreeBuilder::LibXML: 0.01_01
Rate no_libxml use_libxml
no_libxml 5.45/s -- -94%
use_libxml 94.3/s 1632% --
Tokuhiro Matsuno <tokuhirom slkjfd gmail.com>
Tatsuhiko Miyagawa <miyagawa@cpan.org>
Masahiro Chiba
woremacx++ http://d.hatena.ne.jp/woremacx/20080202/1201927162
id:dailyflower
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
| HTML-TreeBuilder-LibXML documentation | Contained in the HTML-TreeBuilder-LibXML distribution. |
package HTML::TreeBuilder::LibXML;
use strict;
use warnings;
our $VERSION = '0.12';
use Carp ();
use base 'HTML::TreeBuilder::LibXML::Node';
use XML::LibXML;

# Create an empty tree object. Content is supplied afterwards through
# parse()/eof() or parse_file().
sub new {
    my $class = shift;
    return bless {}, $class;
}

# Build a tree from one or more chunks of HTML passed as arguments.
# The chunks are concatenated in order and parsed as a single document.
# Returns the new tree object.
sub new_from_content {
    my $class = shift;
    my $self  = $class->new;
    for my $content (@_) {
        $self->parse($content);
    }
    $self->eof;
    return $self;
}

# Build a tree from a file; the arguments are forwarded unchanged to
# parse_file(). Returns the new tree object.
sub new_from_file {
    my $class = shift;
    my $self  = $class->new;
    $self->parse_file(@_);
    return $self;
}

# One XML::LibXML parser instance is created lazily and then shared by
# every tree object in the process.
my $PARSER;

sub _parser {
    unless ($PARSER) {
        $PARSER = XML::LibXML->new();
        $PARSER->recover(1);             # keep going on malformed markup ...
        $PARSER->recover_silently(1);    # ... without reporting the errors
        $PARSER->keep_blanks(0);
        $PARSER->expand_entities(1);
        $PARSER->no_network(1);          # never fetch remote resources
    }
    return $PARSER;
}

# Append a chunk of HTML to the internal buffer. Nothing is parsed until
# eof() is called. Returns the accumulated buffer.
sub parse {
    my ( $self, $html ) = @_;
    return $self->{_content} .= $html;
}

# Parse an HTML file (arguments are passed straight to XML::LibXML's
# parse_html_file) and store its root element in $self->{node}.
# Returns that root element.
sub parse_file {
    my $self = shift;
    my $doc  = $self->_parser->parse_html_file(@_);
    return $self->{node} = $self->_documentElement($doc);
}

# Parse everything buffered by parse() and store the resulting root
# element in $self->{node}. Returns that root element.
sub eof {
    my ($self) = @_;

    # HACK: parse_html_string() refuses an empty string, so substitute a
    # single blank. The same applies when parse() was never called and the
    # buffer is still undefined (e.g. new_from_content() with no arguments);
    # previously only the defined-but-empty case was covered.
    if ( !defined $self->{_content} || $self->{_content} eq '' ) {
        $self->{_content} = ' ';
    }

    my $doc = $self->_parser->parse_html_string( $self->{_content} );
    return $self->{node} = $self->_documentElement($doc);
}

# Return the document's root element. When the document has none, a
# minimal <html><body/></html> element is created and returned instead.
sub _documentElement {
    my ( $self, $doc ) = @_;
    return $doc->documentElement || do {
        my $elem = $doc->createElement("html");
        $elem->appendChild( $doc->createElement("body") );
        $elem;
    };
}

# Replace HTML::TreeBuilder::XPath->new so that it returns
# HTML::TreeBuilder::LibXML objects.
#
# In non-void context a guard object is returned; when the guard is
# destroyed the original constructor is put back. In void context nothing
# is returned and no restoration is arranged.
sub replace_original {
    require HTML::TreeBuilder::XPath;

    my $orig = HTML::TreeBuilder::XPath->can('new');

    # Covers both the override below and the restore inside the guard.
    no warnings 'redefine';
    *HTML::TreeBuilder::XPath::new = sub {
        HTML::TreeBuilder::LibXML->new();
    };

    if ( defined wantarray ) {
        return HTML::TreeBuilder::LibXML::Destructor->new(
            sub { *HTML::TreeBuilder::XPath::new = $orig } );
    }
    return;
}

# Minimal scope guard: runs the stored callback when the object is destroyed.
package    # hide from cpan
    HTML::TreeBuilder::LibXML::Destructor;

# $callback is a code reference invoked (with no arguments) from DESTROY.
sub new {
    my ( $class, $callback ) = @_;
    return bless { cb => $callback }, $class;
}

sub DESTROY {
    my $self = shift;
    $self->{cb}->();
}

1;
__END__