HTML::TableExtractor - Do stuff with the layout of HTML tables.


HTML-TableExtractor documentation Contained in the HTML-TableExtractor distribution.

Index


Code Index:

NAME

Top

HTML::TableExtractor - Do stuff with the layout of HTML tables.

SYNOPSIS

Top

  use HTML::TableExtractor;
  $p = HTML::TableExtractor->new();
  $p->parse($html, table => sub { ... }, tr => sub { ... });

DESCRIPTION

Top

Parses HTML looking for table-related elements (table, tr, td and th as of version 0.1).

Three callbacks can be registered for each element. These callbacks, described below, are executed whenever an element of a particular type is encountered.

  o  start_${tagname}  Called whenever $tagname is opened.
  o  ${tagname}        Called immediately after start_${tagname}, and
                       immediately before end_${tagname}.
  o  end_${tagname}    Called whenever a closing $tagname is encountered.




EXAMPLE

  use HTML::TableExtractor;
  $p = HTML::TableExtractor->new();
  $p->parse($html,
      start_table => sub {
        my ($attr, $origtext) = @_;
        print "Table border is $attr->{border}\n";
      },
      tr => sub { print "Row opened or closed.\n" },
      );

	


METHODS

Top

start($parser, $tag, $attr, $attrseq, $origtext);

Called whenever a particular start tag has been recognised. This module recognises these tags: <table>, <tr>, <td> & <th>.

This method will be called by the parser and is not intended to be called from an application.

end($parser, $tag, $origtext);

Called whenever a particular end tag is encountered.

This method will be called by the parser and is not intended to be called from an application.

$p->parse($html, tag_type => \&coderef, ...);

This is the only method you really need to call. Pass it the HTML to parse followed by a callback for each tag type you are interested in. The callbacks will be executed as described above.

EXPORTS

CAVEATS, BUGS, and TODO

o parse() should handle other data sources, such as streams, file handles, etc.

SEE ALSO

HTML::Parser, HTML::TableContentParser

AUTHOR

Top

Simon Drabble <simon@thebigmachine.org>

(C) 2002 Simon Drabble

This software is released under the same terms as perl.


HTML-TableExtractor documentation Contained in the HTML-TableExtractor distribution.

#  $Id: TableExtractor.pm,v 1.2 2002/06/11 15:52:25 simon Exp $


package HTML::TableExtractor;

# Strictures come first so that every declaration in this file, including
# @ISA, is checked.
use strict;
use warnings;

use HTML::Parser;

# Inherit from HTML::Parser. The start() and end() methods defined in this
# package are the ones the parser calls for opening and closing tags.
our @ISA = qw(HTML::Parser);


our $VERSION = 0.11;

# The tags we're interested in. start() and end() ignore every other tag,
# and parse() only registers callbacks for these names.
my @tag_names = qw(table tr td th);



# start($tag, $attr, $attrseq, $origtext)
#
# Called by the parser for each start tag; not intended to be called from
# an application. Tags other than those listed in @tag_names are ignored.
#
# For a recognised tag, up to two callbacks stored on the object are run,
# in this order, each with the arguments ($attr, $origtext):
#
#   "<tag>_start_callback"   registered through parse() as start_<tag>
#   "<tag>_callback"         registered through parse() as <tag>
#
# A slot that does not hold a code reference is skipped silently.
# Returns nothing.
sub start
{
	my ($self, $tag, $attr, $attrseq, $origtext) = @_;

	# Normalise the tag name once. The callback slots are always keyed by
	# the lower-case names in @tag_names, so a tag delivered in upper or
	# mixed case (e.g. when the parser's case_sensitive option is on) must
	# be lower-cased for the lookup as well as for the filter.
	$tag = lc $tag;

	return unless grep { $_ eq $tag } @tag_names;

	for my $slot ("${tag}_start_callback", "${tag}_callback") {
		my $cb = $self->{$slot};
		$cb->($attr, $origtext) if ref($cb) eq 'CODE';
	}

	return;
}




# end($tag, $origtext)
#
# Called by the parser for each end tag; not intended to be called from
# an application. Tags other than those listed in @tag_names are ignored.
#
# For a recognised tag, up to two callbacks stored on the object are run,
# in this order, each with the single argument ($origtext):
#
#   "<tag>_callback"         registered through parse() as <tag>
#   "<tag>_end_callback"     registered through parse() as end_<tag>
#
# Note that the "<tag>_callback" receives only $origtext here, whereas
# start() passes it ($attr, $origtext).
#
# A slot that does not hold a code reference is skipped silently.
# Returns nothing.
sub end
{
	my ($self, $tag, $origtext) = @_;

	# Normalise the tag name once. The callback slots are always keyed by
	# the lower-case names in @tag_names, so a tag delivered in upper or
	# mixed case (e.g. when the parser's case_sensitive option is on) must
	# be lower-cased for the lookup as well as for the filter.
	$tag = lc $tag;

	return unless grep { $_ eq $tag } @tag_names;

	for my $slot ("${tag}_callback", "${tag}_end_callback") {
		my $cb = $self->{$slot};
		$cb->($origtext) if ref($cb) eq 'CODE';
	}

	return;
}



# parse($data, name => \&callback, ...)
#
# Registers the supplied callbacks on the object and then hands $data to
# the parent class's parse(). For every tag in @tag_names, three names
# are recognised in the argument list:
#
#   <tag>         stored as "<tag>_callback"
#   start_<tag>   stored as "<tag>_start_callback"
#   end_<tag>     stored as "<tag>_end_callback"
#
# Only names actually present in the argument list are stored; slots set
# by an earlier call are left untouched otherwise. Any other names are
# ignored. Returns whatever the parent class's parse() returns.
sub parse
{
	my ($self, $data, @types) = @_;
	my %handler_for = @types;

	foreach my $tag (@tag_names) {
		# Maps each accepted argument name to the slot it is stored in.
		my %slot_for = (
			$tag         => "${tag}_callback",
			"start_$tag" => "${tag}_start_callback",
			"end_$tag"   => "${tag}_end_callback",
		);

		while (my ($name, $slot) = each %slot_for) {
			next unless exists $handler_for{$name};
			$self->{$slot} = $handler_for{$name};
		}
	}

	return $self->SUPER::parse($data);
}




1;

__END__