# /usr/local/CPAN/WAIT/WAIT/Parse/HTML.pm
#!/usr/bin/perl
# -*- Mode: Perl -*-
# $Basename: HTML.pm $
# $Revision: 1.2 $
# Author : Ulrich Pfeifer with Andreas König
# Created On : Sat Nov 1 1997
# Last Modified By: Ulrich Pfeifer
# Last Modified On: Wed Nov 5 16:48:17 1997
# Language : CPerl
# Update Count : 1
# Status : Unknown, Use with caution!
#
# (C) Copyright 1997, Ulrich Pfeifer, all rights reserved.
#
#
# HTML support for WAIT::Parse: this package defines split() and tag().
package WAIT::Parse::HTML;
use vars qw(@ISA);
# Loaded with require, so nothing is imported from these two modules; the
# subs in this package call HTML::Parse::parse_html and
# HTML::FormatText->new by their full names.
require HTML::Parse;
require HTML::FormatText;
# decode_entities is the only function imported into this package.
use HTML::Entities qw(decode_entities);
# NOTE(review): WAIT::Parse::Base is named as the parent class but is not
# loaded in the lines visible here -- confirm that it is loaded elsewhere
# before methods are resolved through @ISA.
@ISA = qw(WAIT::Parse::Base);
# split($self, $html_source)
#
# Render an HTML document as plain text, field by field.
#
# Parameters:
#   $self        - invocant; not used by this method.
#   $html_source - string holding the HTML document.  An undefined value
#                  is treated as an empty document.
#
# Returns a reference to a hash with two keys:
#   'text'  - the whole document as formatted by HTML::FormatText
#   'title' - the content of the first <title>...</title> element,
#             parsed and formatted the same way; when the document has no
#             title element an empty string is parsed instead of undef
sub split {
    my ($self, $html_source) = @_;
    $html_source = '' unless defined $html_source;

    # Pull the raw title out with a regex (case-insensitive, may span
    # lines); it is parsed as its own small document further down.
    my ($title) = $html_source =~ /<title\s*>(.*?)<\/title\s*>/si;
    $title = '' unless defined $title;

    my $formatter = HTML::FormatText->new;

    # parse_html returns an element tree.  Such trees are documented as
    # needing an explicit delete to be freed, so each tree is dropped as
    # soon as it has been formatted instead of being left to leak.
    my $html = HTML::Parse::parse_html($html_source);
    my $text = $formatter->format($html);
    $html->delete;

    my $title_tree = HTML::Parse::parse_html($title);
    my $title_text = $formatter->format($title_tree);
    $title_tree->delete;

    return {
        'text'  => $text,
        'title' => $title_text,
    };
}
# tag($self, $html_source)
#
# Cut an HTML document into three consecutive pieces and label each one.
#
# Parameters:
#   $self        - invocant; not used by this method.
#   $html_source - string holding the HTML document.  An undefined value
#                  is treated as an empty document.  The caller's string
#                  is not modified; only the local copy is.
#
# Returns a flat list of six elements, three (tag-hash, string) pairs:
#   {'text'  => 1}, everything up to and including the opening <title>
#   {'title' => 1}, the title content
#   {'text'  => 1}, everything from the closing </title> to the end
# Each string has been passed through decode_entities.  The markup itself
# is left in the strings.
#
# When the document has no <title>...</title> element the whole document
# is returned as the first 'text' piece and the other two strings are
# empty, so that the document text is not lost.
sub tag {
    my ($self, $html_source) = @_;
    $html_source = '' unless defined $html_source;

    # Turn every carriage return into a newline.
    $html_source =~ tr/\r/\n/;

    # The trailing piece uses .* (not .+) so that a document ending right
    # at </title> still matches.
    my ($pre, $title, $body)
        = $html_source =~ /^(.*?<title\s*>)(.*?)(<\/title\s*>.*)/si;

    # No title element: keep the six-element shape but hand the complete
    # document back as text rather than three undefined strings.
    ($pre, $title, $body) = ($html_source, '', '') unless defined $pre;

    return (
        {'text'  => 1}, decode_entities($pre),
        {'title' => 1}, decode_entities($title),
        {'text'  => 1}, decode_entities($body),
    );
}