/usr/local/CPAN/HTML-Content-Extractor/HTML/Content/TokeParserTokenizer.pm
package HTML::Content::TokeParserTokenizer;
use strict;
use warnings;
use Carp;
use HTML::TokeParser;
use HTML::Content::HTMLTokenizer;
use vars qw(@ISA);
@ISA = qw(HTML::Content::HTMLTokenizer);
# new - constructs a TokeParserTokenizer object
# - preconditions: every argument is forwarded unchanged to
#   HTML::Content::HTMLTokenizer->new; per the original notes the
#   1st arg is the string used to indicate a tag and the
#   2nd arg is the string used to indicate a word
# - postconditions: the object built by the parent constructor is
#   returned, re-blessed into the invocant's class
sub new
{
    my $invocant = shift;

    # Accept both Class->new(...) and $object->new(...).
    my $class = ref($invocant) || $invocant;

    # Plain method call instead of the indirect object form
    # ("new Class(...)"), whose parse depends on which subs are in scope.
    # The list assignment keeps the parent constructor in list context,
    # exactly as it was called before.
    my ($self) = HTML::Content::HTMLTokenizer->new(@_);

    return bless($self, $class);
}
# Tokenize - turns an HTML document into a flat tag/word token sequence
# - preconditions: 1st arg is the HTML document as a string
#   (undef is treated as an empty document)
# - postconditions: returns a list of four references, where the three
#   arrays are parallel and hold one entry per token:
#     \@N      - number of words seen up to and including that token
#     \@T      - number of tags seen up to and including that token
#     \@seq    - $self->{TAGMARKER} or $self->{WORDMARKER} for that token
#     \%tokens - 1-based token position => word, for word tokens only
# - dies if HTML::TokeParser cannot be constructed
# NOTE(review): TAGMARKER / WORDMARKER are presumably set by the parent
# constructor - confirm against HTML::Content::HTMLTokenizer.
sub Tokenize
{
    my $self = shift;
    my $doc  = shift;

    # An undefined document tokenizes to nothing instead of tripping
    # "uninitialized value" warnings inside the parser.
    $doc = '' unless defined $doc;

    my @seq;
    my @N;
    my @T;
    my %tokens;

    my $parse = HTML::TokeParser->new(\$doc)
        or die "HTML::TokeParser can't open: $!";

    my $i       = 0;    # position of the current token (tags and words)
    my $tagcnt  = 0;
    my $wordcnt = 0;

    while (my $token = $parse->get_token)
    {
        # Anything that is not a text token (start/end tags, but also
        # comments, declarations, ...) is counted as a tag.
        if ($token->[0] ne 'T')
        {
            $i++;
            $tagcnt++;
            push(@seq, $self->{TAGMARKER});
            push(@N, $wordcnt);
            push(@T, $tagcnt);
            next;
        }

        # get_token hands text back as written in the document, i.e.
        # with character entities still encoded, so they are handled here.
        my $text = $token->[1];

        # Remove carriage returns and newlines
        $text =~ s/[\n\r]+/ /g;

        # Map the entities that have a sensible plain-text equivalent
        # before the generic stripping below turns them into blanks.
        $text =~ s/&nbsp;/ /g;
        $text =~ s/&quot;/"/g;
        # Also accept an em dash already present as raw UTF-8 bytes.
        $text =~ s/&mdash;|\xE2\x80\x94/-/g;

        # Remove any remaining character references. Only well-formed
        # references are matched, so a bare '&' followed later by a ';'
        # (e.g. "AT&T rocks; yes") no longer swallows the text between.
        $text =~ s/&(?:#[0-9]+|#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);/ /g;

        # split ' ' skips leading whitespace; split /\s+/ would yield an
        # empty first field that was then counted as a word.
        foreach my $word (split(' ', $text))
        {
            $wordcnt++;
            $i++;
            $tokens{$i} = $word;
            push(@seq, $self->{WORDMARKER});
            push(@N, $wordcnt);
            push(@T, $tagcnt);
        }
    }

    return (\@N, \@T, \@seq, \%tokens);
}
1;