| HTML-TokeParser-Simple documentation | Contained in the HTML-TokeParser-Simple distribution. |
HTML::TokeParser::Simple::Token::Tag::Start - Token.pm "start tag" class.
use HTML::TokeParser::Simple;
my $p = HTML::TokeParser::Simple->new( $somefile );
while ( my $token = $p->get_token ) {
# This prints all text in an HTML doc (i.e., it strips the HTML)
next unless $token->is_text;
print $token->as_is;
}
This class does most of the heavy lifting for HTML::TokeParser::Simple. See
the HTML::TokeParser::Simple docs for details.
| HTML-TokeParser-Simple documentation | Contained in the HTML-TokeParser-Simple distribution. |
package HTML::TokeParser::Simple::Token::Tag::Start; use strict; use vars qw/ $VERSION $REVISION /; $REVISION = '$Id: Start.pm,v 1.2 2005/10/08 19:45:55 ovid Exp $'; $VERSION = '1.0'; use base 'HTML::TokeParser::Simple::Token::Tag'; use HTML::Entities qw/encode_entities/; my %TOKEN = ( tag => 1, attr => 2, attrseq => 3, text => 4 ); my %INSTANCE; sub _init { my $self = shift; if ('S' eq $self->[0]) { $INSTANCE{$self}{offset} = 0; $INSTANCE{$self}{tag} = $self->[1]; } else { $INSTANCE{$self}{offset} = -1; my $tag = $self->[0]; $tag =~ s/^\///; $INSTANCE{$self}{tag} = $tag; } return $self; } sub _get_offset { return $INSTANCE{+shift}{offset} } sub _get_text { return shift->[-1] } sub _get_tag { my $self = shift; return $INSTANCE{$self}{tag}; } sub _get_attrseq { my $self = shift; my $index = $TOKEN{attrseq} + $self->_get_offset; return $self->[$index]; } sub _get_attr { my $self = shift; my $index = $TOKEN{attr} + $self->_get_offset; return $self->[$index]; } sub DESTROY { delete $INSTANCE{+shift} } sub return_attr { goto &get_attr } sub return_attrseq { goto &get_attrseq } sub return_tag { goto &get_tag } # attribute munging methods sub set_attr { my ($self, $name, $value) = @_; return 'HASH' eq ref $name ? $self->_set_attr_from_hashref($name) : $self->_set_attr_from_string($name, $value); } sub _set_attr_from_string { my ($self, $name, $value) = @_; $name = lc $name; my $attr = $self->get_attr; my $attrseq = $self->get_attrseq; unless (exists $attr->{$name}) { push @$attrseq => $name; } $attr->{$name} = $value; $self->rewrite_tag; } sub _set_attr_from_hashref { my ($self, $attr_hash) = @_; while (my ($attr, $value) = each %$attr_hash) { $self->set_attr($attr, $value); } return $self; } sub rewrite_tag { my $self = shift; my $attr = $self->get_attr; my $attrseq = $self->get_attrseq; # capture the final slash if the tag is self-closing my ($self_closing) = $self->_get_text =~ m{(\s?/)>$}; $self_closing ||= ''; my $tag = ''; foreach ( @$attrseq ) { next if $_ eq '/'; # is this a bug in HTML::TokeParser? $tag .= sprintf qq{ %s="%s"} => $_, encode_entities($attr->{$_}); } my $first = $self->is_end_tag ? '/' : ''; $tag = sprintf '<%s%s%s%s>', $first, $self->get_tag, $tag, $self_closing; $self->_set_text($tag); return $self; } sub delete_attr { my ($self,$name) = @_; $name = lc $name; my $attr = $self->get_attr; return unless exists $attr->{$name}; delete $attr->{$name}; my $attrseq = $self->get_attrseq; @$attrseq = grep { $_ ne $name } @$attrseq; $self->rewrite_tag; } # get_foo methods sub return_text { require Carp; Carp::carp('return_text() is deprecated. Use as_is() instead'); goto &as_is; } sub as_is { return shift->_get_text; } sub get_tag { return shift->_get_tag; } sub get_token0 { return ''; } sub get_attr { my $self = shift; my $attributes = $self->_get_attr; return @_ ? $attributes->{lc shift} : $attributes; } sub get_attrseq { my $self = shift; $self->_get_attrseq; } # is_foo methods sub is_tag { my $self = shift; return $self->is_start_tag( @_ ); } sub is_start_tag { my ($self, $tag) = @_; return $tag ? $self->_match_tag($tag) : 1; } sub _match_tag { my ($self, $tag) = @_; return 'Regexp' eq ref $tag ? $self->_get_tag =~ $tag : $self->_get_tag eq lc $tag; } 1; __END__