| PerlIO-via-StripHTML documentation | Contained in the PerlIO-via-StripHTML distribution. |
PerlIO::via::StripHTML - PerlIO layer to strip HTML tags from an input file
use PerlIO::via::StripHTML;
open my $file, '<:via(StripHTML)', 'foo.html'
or die "Can't open foo.html: $!\n";
This package implements a PerlIO layer, for reading files only. It strips HTML tags from the input, leaving only plain text. This can be useful, for example, to find something in the text of an HTML page.
This is only a preliminary version.
PerlIO::via
Copyright (c) 2002 Rafael Garcia-Suarez. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
The HTML stripping code was borrowed from the eg/htext script in the
HTML-Parser distribution.
| PerlIO-via-StripHTML documentation | Contained in the PerlIO-via-StripHTML distribution. |
package PerlIO::via::StripHTML;

# Read-only PerlIO layer: every line read from the underlying handle is run
# through an HTML::Parser object and only the text outside of tags (and
# outside of <script>/<style> elements) is handed back to the reader.

require 5.008;
use strict;
use warnings;
use HTML::Parser 3.00;

our $VERSION = 0.04;

# PUSHED($class, $mode)
#
# Constructor invoked when the layer is pushed onto a handle.
#
# Returns -1 when $mode is anything other than 'r'. Otherwise returns an
# object (blessed hash) with two entries:
#   buffer - lvalue closure giving read/write access to the text collected
#            by the parser handlers
#   parser - the HTML::Parser instance whose handlers fill that text
sub PUSHED {
    my ($class, $mode) = @_;
    return -1 if $mode ne 'r';

    # The following variables are updated / accessed via the closures below
    my $buffer = '';    # internal buffer for this layer
    my %inside = ();    # count of currently open elements, keyed by tag name

    my $parser = HTML::Parser->new(
        api_version     => 3,
        marked_sections => 1,
        start_h         => [
            sub {
                my ($tagname) = @_;

                # <br>/<tr> become a line break, <p> a blank line.
                $buffer .= "\n"   if $tagname =~ /^[bt]r$/;
                $buffer .= "\n\n" if $tagname eq 'p';
                ++$inside{$tagname};
            },
            'tagname',
        ],
        end_h => [
            sub {
                my ($tagname) = @_;

                # Only close what was actually opened. Letting the count go
                # negative on a stray </script> or </style> would make
                # $inside{...} true in text_h and discard all later text.
                --$inside{$tagname} if $inside{$tagname};
            },
            'tagname',
        ],
        text_h => [
            sub {
                my ($text) = @_;
                $buffer .= $text unless $inside{script} || $inside{style};
            },
            'dtext',
        ],
    );

    return bless {
        buffer => sub : lvalue { $buffer },
        parser => $parser,
    }, $class;
}

# FILL($self, $fh)
#
# Reads one line from $fh, parses it, and returns the text the handlers
# collected for that line (possibly the empty string).
#
# Returns undef when $fh has no more lines, or when the parser's parse()
# call returns a false value.
sub FILL {
    my ($self, $fh) = @_;

    my $line = <$fh>;
    return undef unless defined $line;

    # Start each call with an empty buffer so only this line's text is returned.
    $self->{buffer}->() = '';
    $self->{parser}->parse($line) or return undef;

    # NOTE(review): eof is called after every line, presumably so that text
    # still held back by the parser reaches text_h before the buffer is
    # returned. This looks like it also means markup split across two input
    # lines is parsed as two independent fragments; confirm against the
    # HTML::Parser documentation.
    $self->{parser}->eof;

    return $self->{buffer}->();
}

1;
__END__