File::Extract::HTML - Extract Text From HTML Files


File-Extract documentation. Contained in the File-Extract distribution.

Index


Code Index:

NAME

Top

File::Extract::HTML - Extract Text From HTML Files

SEE ALSO

Top

File::Extract, File::Extract::Base, HTML::TreeBuilder


File-Extract documentation. Contained in the File-Extract distribution.

# $Id: /mirror/perl/File-Extract/trunk/lib/File/Extract/HTML.pm 4210 2007-10-27T13:43:07.499967Z daisuke  $
#
# Copyright (c) 2005 Daisuke Maki <dmaki@cpan.org>
# All rights reserved.

package File::Extract::HTML;
use strict;
use base qw(File::Extract::Base);
use HTML::TreeBuilder;

sub mime_type { 'text/html' }
sub extract
{
    my $self = shift;
    my $file = shift;

    my $text;
    my $tree = HTML::TreeBuilder->new;
    $tree->parse_file($file);

    $text = $tree->as_text;
    $tree->delete;

    my $r = File::Extract::Result->new(
        text      => eval { $self->recode($text) } || $text,
        filename  => $file,
        mime_type => $self->mime_type,
    );
    return $r;
}

1;

__END__