HTML::WordTagRatio::SmoothedRatio - Default module for determining the ratio of words to tags in a range of tokens in an HTML document.


HTML-Content-Extractor documentation Contained in the HTML-Content-Extractor distribution.

Index


Code Index:

NAME

Top

HTML::WordTagRatio::SmoothedRatio - Default module for determining the ratio of words to tags in a range of tokens in an HTML document.

SYNOPSIS

Top

  use HTML::WordTagRatio::SmoothedRatio;
  use HTML::Content::HTMLTokenizer;
  use HTML::Content::ContentExtractor;

  my $tokenizer = new HTML::Content::HTMLTokenizer('TAG','WORD');

  open(HTML,"index.html");
  my $doc = join("",<HTML>);
  close(HTML);

  my ($word_count_arr_ref,$tag_count_arr_ref,$token_type_arr_ref,$token_hash_ref) = $tokenizer->Tokenize($doc);

  my $ratio = new HTML::WordTagRatio::SmoothedRatio();

  my $value = $ratio->RangeValue(0, $#{$word_count_arr_ref},
  				$word_count_arr_ref, $tag_count_arr_ref);

DESCRIPTION

Top

HTML::WordTagRatio::SmoothedRatio computes a ratio of Words to Tags for a given range. In pseudo code, the ratio is

(Words/TotalWords) / ((Tags + 1)/(TotalTags + 1))

Methods

* my $ratio = new HTML::WordTagRatio::SmoothedRatio()

Initializes HTML::WordTagRatio::SmoothedRatio

* my $value = $ratio->RangeValue($start, $end, \@WordCount, \@TagCount)

$value is computed as follows:

	(($WordCount[$end] - $WordCount[$start])/$WordCount[$#WordCount]) / (($TagCount[$end] - $TagCount[$start] + 1)/($TagCount[$#TagCount] + 1))

This is the fraction of the document's words that fall in the range, divided by the smoothed fraction of the document's tags that fall in the range (the number of tags in range plus one, over the total number of tags plus one). The plus ones compensate for ranges with no tags. If $end is not greater than $start, -1 is returned instead. $WordCount[$i] is the number of word tokens before or at the ith token in the input HTML document. $TagCount[$i] is the number of tag tokens before or at the ith token in the input HTML document.

AUTHOR

Top

Jean Tavernier (jj.tavernier@gmail.com)

COPYRIGHT

Top

SEE ALSO

Top

ContentExtractorDriver.pl (1), HTML::Content::ContentExtractor (3), HTML::Content::HTMLTokenizer (3), HTML::WordTagRatio::Ratio (3), HTML::WordTagRatio::WeightedRatio (3), HTML::WordTagRatio::RelativeRatio (3), HTML::WordTagRatio::ExponentialRatio (3), HTML::WordTagRatio::NormalizedRatio (3).


HTML-Content-Extractor documentation Contained in the HTML-Content-Extractor distribution.

package HTML::WordTagRatio::SmoothedRatio;
use strict;
use warnings;
use Carp;
use HTML::WordTagRatio::Ratio;
use vars qw(@ISA);
@ISA = qw(HTML::WordTagRatio::Ratio);

# new - constructs SmoothedRatio object
# - preconditions: 	none; may be invoked on the class name or on an
#			existing object (the object's class is then reused)
# - postconditions: 	an object created by HTML::WordTagRatio::Ratio's
#			constructor is re-blessed into the invocant's class
#			and returned
sub new
{
	my $invocant = shift;

	# Support both Class->new() and $object->new().
	my $class = ref($invocant) || $invocant;

	# Direct method call rather than indirect object syntax
	# ("new Class()"), whose parsing depends on compile-time heuristics.
	my ($self) = HTML::WordTagRatio::Ratio->new();

	return bless($self, $class);
}
# RangeValue - returns the smoothed word/tag ratio of a range of tokens
# - preconditions: 	1st arg is an integer >= 0 and < length of @{3rd argument}
#			2nd arg is an integer > 1st arg and < length of @{3rd argument}
#			3rd arg is an array ref which points to an array of monotonically
#				increasing integers, indicating the number of words found
#				in the HTML document before or at the i_th token (i being an
#				index into the array)
#			4th arg is an array ref which points to an array of monotonically
#				increasing integers, indicating the number of tags found
#				in the HTML document before or at the i_th token (i being an
#				index into the array)
# - postconditions: 	floating point value returned indicating the value of the range:
#				(words in range / total words) /
#				((tags in range + 1) / (total tags + 1))
#			-1 is returned when the 2nd arg is not greater than the 1st
#			0 is returned when the document contains no words at all
#			the input arrays are not modified
sub RangeValue
{
	my ($self, $i, $j, $tN, $tT) = @_;

	# An empty or inverted range has no value.
	return -1 if $j <= $i;

	# Work through the references directly; copying both arrays on every
	# call would cost O(length) for what is a constant-time computation.
	my $totalWords = $tN->[-1];
	my $totalTags  = $tT->[-1];

	# With no words in the document every range holds zero words, so the
	# ratio is 0; returning it here avoids an "Illegal division by zero".
	return 0 unless $totalWords;

	my $wordsInRange = $tN->[$j] - $tN->[$i];
	my $tagsInRange  = $tT->[$j] - $tT->[$i];

	# The "+ 1" terms smooth the tag counts so tag-free ranges stay finite.
	return ($wordsInRange * ($totalTags + 1))
		/ (($tagsInRange + 1) * $totalWords);
}
1;