HTML::ContentExtractor - extract the main content from a web page by analysising the DOM tree!


HTML-ContentExtractor documentation Contained in the HTML-ContentExtractor distribution.

Index


Code Index:

NAME

Top

HTML::ContentExtractor - extract the main content from a web page by analysising the DOM tree!

VERSION

Top

Version 0.03

SYNOPSIS

Top

    use HTML::ContentExtractor;
    my $extractor = HTML::ContentExtractor->new();
    my $agent=LWP::UserAgent->new;

    my $url='http://sports.sina.com.cn/g/2007-03-23/16572821174.shtml';
    my $res=$agent->get($url);
    my $HTML = $res->decoded_content();

    $extractor->extract($url,$HTML);
    print $extractor->as_html();
    print $extractor->as_text();

DESCRIPTION

Top

Web pages often contain clutter (such as ads, unnecessary images and extraneous links) around the body of an article that distracts a user from actual content. This module is used to reduce the noise content in web pages and thus identify the content rich regions.

A web page is first parsed by an HTML parser, which corrects the markup and creates a DOM (Document Object Model) tree. By using a depth-first traversal to navigate the DOM tree, noise nodes are identified and removed, thus the main content is extracted. Some useless nodes (script, style, etc.) are removed; the container nodes (table, div, etc.) which have high link/text ratio (higher than threshold) are removed; (link/text ratio is the ratio of the number of links and non-linked words.) The nodes contain any string in the predefined spam string list are removed.

Please notice the input HTML should be encoded in utf-8 format( so do the spam words), thus the module can handle web pages in any language (I've used it to process English, Chinese, and Japanese web pages).

$e = HTML::ContentExtractor->new(%options);

Constructs a new HTML::ContentExtractor object. The optional %options hash can be used to set the options list below.

$e->table_tags();
$e->table_tags(@tags);
$e->table_tags(\@tags);

This is used to get/set the table tags array. The tags are used as the container tags.

$e->ignore_tags();
$e->ignore_tags(@tags);
$e->ignore_tags(\@tags);

This is used to get/set the ignore tags array. The elements of such tags will be removed.

$e->spam_words();
$e->spam_words(@strings);
$e->spam_words(\@strings);

This is used to get/set the spam words list. The elements have such string will be removed.

This is used to get/set the link/text ratio, default is 0.05.

$e->min_text_len();
$e->min_text_len($len);

This is used to get/set the min text length, default is 20. If length of the text of an elment is less than this value, this element will be removed.

$e->extract($url,$HTML);

This is used to perform the extraction process. Please notice the input $HTML must be encoded in UTF-8.

$e->as_html();

Return the extraction result in HTML format.

$e->as_text();

Return the extraction result in text format.

AUTHOR

Top

Zhang Jun, <jzhang533 at gmail.com>

COPYRIGHT & LICENSE

Top


HTML-ContentExtractor documentation Contained in the HTML-ContentExtractor distribution.
package HTML::ContentExtractor;

our $VERSION = '0.03';

use strict;
use warnings;
use HTML::TreeBuilder;

sub new {
    my $proto  = shift;
    my $class  = ref($proto) || $proto;

    my $self = {};
    bless($self, $class);

    return $self->_init(@_);
}

sub _init{
    my $self = shift;

    $self->{table_tags} = [qw(table form div td tr tbody thead tfoot th col colgroup span iframe center ul h1 h2 h3 p)];
    $self->{ignore_tags} = [qw(script noscript style form button meta input select iframe embed hr img)];
    $self->{spam_words} = ['All rights reserved'];
    $self->{link_text_ratio} = 0.05;
    $self->{min_text_len} = 20;
    
    if (@_ != 0) {
        if (ref $_[0] eq 'HASH') {
            my $hash=$_[0];
            foreach my $key (keys %$hash) {
                $self->{lc($key)}=$hash->{$key};
            }
        }else{ 
            my %args = @_;
            foreach my $key (keys %args) {
                $self->{lc($key)}=$args{$key};
            }
        }
    }

    $self->table_tags($self->{table_tags});
    $self->ignore_tags($self->{ignore_tags});
    return $self;
}

sub min_text_len{
    my $self=shift;
    return $self->{min_text_len} if (@_ == 0);

    $self->{min_text_len}=shift;
}

sub link_text_ratio{
    my $self=shift;
    return $self->{link_text_ratio} if (@_ == 0);

    $self->{link_text_ratio}=shift;
}

sub spam_words{
    my $self = shift;

    if(@_ == 0){
        return @{$self->{spam_words}};
    }

    if(ref $_[0] eq 'ARRAY'){
        $self->{spam_words} = $_[0];
    }else{
        my @array = @_;
        $self->{spam_words} = \@array;
    }
}

sub ignore_tags{
    my $self = shift;

    if(@_ == 0){
        return keys %{$self->{ignore_tags}};
    }

    my $array;
    if(ref $_[0] eq 'ARRAY'){
        $array = $_[0];
    }else{
        $array = \@_;
    }

    my $h={};
    grep {$h->{$_}=1;} @$array;
    $self->{ignore_tags} = $h;
}

sub table_tags{
    my $self = shift;
    if(@_ == 0){
        return keys %{$self->{table_tags}};
    }
    
    my $array;
    if(ref $_[0] eq 'ARRAY'){
        $array = $_[0];
    }else{
        $array = \@_;
    }

    my $h={};
    grep {$h->{$_}=1;} @$array;
    $self->{table_tags} = $h;
}

#the input should be utf8 encoded html content
sub extract{
    my $self=shift;
    my $url=shift;
    my $HTML=shift;

    $self->{tree}->delete if($self->{tree});
    
    $HTML=_PreprocessForFragmentIdentifiedPage($url,$HTML);             
    _remove_crap($HTML);
    
    $self->{url}=$url;
    $self->{tree} = HTML::TreeBuilder->new();
    $self->{tree} ->parse($HTML);
    $self->{link_count} = _how_many_links($self->{tree});
    $self->{is_index}= _check_if_index($self->{tree});
    $self->_Heuristic_Remove($self->{tree});
    $self->_Table_Remove($self->{tree});
}

sub _is_index{
    return $_[0]->{is_index};
}

sub DESTROY{
    my $self = shift;
    $self->{tree}->delete if($self->{tree});
}

#also the output are in utf8 format
sub as_html{
    my $self=shift;
    my $HTML = $self->{tree}->as_HTML('<>&',"\t");
    return $HTML;
}

sub as_text{
    my $self=shift;
    my $output = _to_text($self->{tree});
    $output =~ s/[\n\r] +/\n/sg;
    $output =~ s/[\n\r]+/\n/sg;
    $output =~ s/ +/ /sg;
    $output =~ s/\n /\n/sg;
    $output =~ s/^\s+//;
    return $output;
}

sub _link_count{
    return $_[0]->{link_count};
}

sub _check_if_index{
    my $node=shift;
    
    my $num_links=_how_many_links($node);
    my $txt=_nonlink_words($node);
        
    my $num_words = _count_words_num($txt);
        
    my $ratio=1;
    $ratio = $num_links/$num_words unless $num_words==0;
    if($ratio>0.3 || $num_links>400){
        return 1;
    }else{
        return 0;
    }
}

sub _remove_crap{
    $_[0] =~ s/&nbsp;/ /isg;
}

sub _Table_Remove{
    my $self=shift;
    my $node=shift;
    return if not ref $node;             # not an element node

    my $tag=$node->tag;

    my @nodes = $node->content_list(); # depth first recursive travesel
    foreach my $child (@nodes){
        $self->_Table_Remove( $child );
    }
    
    if($self->{table_tags}->{$tag}){
        
        my $num_links=_how_many_links($node);
        my $txt=_nonlink_words($node);
        
        my $num_words = _count_words_num($txt);
        
        my $ratio=1;
        $ratio = $num_links/$num_words unless $num_words==0;

        if ($num_words < $self->{min_text_len} and 
            $node->tag ne 'h1' and
            $node->tag ne 'h2' and
            $node->tag ne 'h3' and
            $node->tag ne 'p'){
            $node->delete; return;
        }
        
        if ($ratio > $self->{link_text_ratio}){
            $node->delete; return;
        }
        
        $txt = lc $txt;
        
        foreach(@{ $self->{spam_words} }){
            if(index($txt,$_) != -1){
                $node->delete;
                return;
            }
        }
    }
}

sub _how_many_links{
    my $node=shift;
    my $links_r = $node->extract_links();
    my $num_links = scalar(@$links_r);
    return $num_links;
}

sub _nonlink_words{
    my $node=shift;
    if(not ref $node){
        my $text = $node;
        return $text;
    }
    return '' if($node->tag eq 'a'
                 or $node->tag eq 'style'
                 or $node->tag eq 'script'
                 or $node->tag eq 'option'
                 or $node->tag eq 'noscript'
                 or $node->tag eq 'hr'
                 or $node->tag eq 'input'
                 );
                 
    my @nodes = $node->content_list(); # breadth first travesel
    my $sum_text="";
    foreach $node (@nodes){
        $sum_text .= _nonlink_words( $node );
    }
    return $sum_text;
}

sub _Heuristic_Remove{
    my $self=shift;
    my $node=shift;
    return if not ref $node;             # not an element node
    
    my @nodes = $node->content_list();   # depth first recursive travesel
    foreach my $child (@nodes){
        $self->_Heuristic_Remove( $child );
    }
    
    if($self->{ignore_tags}->{$node->tag} ){       # ignore the tags defined in ignore_tags
        $node->delete;
        return;
    }
    
    if($node->tag eq 'a' and $node->parent->tag eq 'body'){
        $node->delete;
    }
}

sub _to_text{
    my $node = shift;
    if(not ref $node){
        return $node;
    }
    return '' if($node->tag eq 'head');
    my @nodes = $node->content_list();  #breadth firth travesel
		my $text = "";
		foreach my $child (@nodes) {
        if ( ref $child and $child->can('tag') and $child->tag() eq 'table' ) {
            my $avail = eval { require HTML::TableExtract };
            unless ($avail) {
                $text .= _to_text($child) . "\n";
                next;
            }
            my $table   = 'HTML::TableExtract'->new();
            my $content = $child->as_HTML;
            $table->parse($content);
            foreach my $ts ( $table->tables ) {
                foreach my $row ( $ts->rows ) {
                    defined and do { s/\s+$//, s/^\s+// }
                    for @$row;
                    $text .= join( ', ', grep { defined } @$row ) . "\n";
                }
            }
        }else {
            $text .= _to_text($child) . "\n";
        }
		}
    return $text;
}

sub _count_words_num{
    my $text = shift;

    $text =~ s/([\x21-\x7e]+)/ $1 /g;
    $text =~ s/([^\x20-\x7e])/ $1 /g;
    $text =~ s/^ +//;
    my @tokens=split(/\s+/,$text);
    
    return scalar(@tokens);
}

# input is the url and HTML
# output is the processed HTML
sub _PreprocessForFragmentIdentifiedPage{
    my $url=shift;
    my $HTML=shift;
    if($url!~/\#/){        
        return $HTML;
    }
    
    my ($fragment_id)= $url=~/\#(.+)$/;
    $fragment_id=~s/\///;
  
    if($HTML=~/(<a id=\"$fragment_id\".*?)<a id/s){
        $HTML=$1;
    }
    return $HTML;
}

1;