HTML::ExtractMain - Extract the main content of a web page


HTML-ExtractMain documentation Contained in the HTML-ExtractMain distribution.

Index


Code Index:

NAME

Top

HTML::ExtractMain - Extract the main content of a web page

VERSION

Top

Version 0.62

SYNOPSIS

Top

    use HTML::ExtractMain qw( extract_main_html );

    my $html = <<'END';
    <div id="header">Header</div>
    <div id="nav"><a href="/">Home</a></div>
    <div id="body">
        <p>Foo</p>
        <p>Baz</p>
    </div>
    <div id="footer">Footer</div>
    END

    my $main_html = extract_main_html($html);
    if (defined $main_html) {
	# do something with $main_html here
        # $main_html is '<div id="body"><p>Foo</p><p>Baz</p></div>'
    }

EXPORT

Top

extract_main_html is optionally exported

FUNCTIONS

Top

extract_main_html

extract_main_html takes HTML content, and uses the Readability algorithm to detect the main body of the page, usually skipping headers, footers, navigation, etc.

It takes a single argument, either an HTML string, or an HTML::TreeBuilder tree. (If passed a tree, the tree will be modified and destroyed.)

If the HTML's main content is found, it's returned as an XHTML snippet. The returned HTML will not look like what you put in. (Source formatting, e.g. indentation, will be removed, and you may get back XHTML when you put in HTML.)

If a most relevant block of content is not found, extract_main_html returns undef.

AUTHOR

Top

Anirvan Chatterjee, <anirvan at cpan.org>

BUGS

Top

Please report any bugs or feature requests to bug-html-extractmain at rt.cpan.org, or through the web interface at http://rt.cpan.org/NoAuth/ReportBug.html?Queue=HTML-ExtractMain. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.

SUPPORT

Top

You can find documentation for this module with the perldoc command.

    perldoc HTML::ExtractMain

You can also look for information at:

* RT: CPAN's request tracker

http://rt.cpan.org/NoAuth/Bugs.html?Dist=HTML-ExtractMain

* AnnoCPAN: Annotated CPAN documentation

http://annocpan.org/dist/HTML-ExtractMain

* CPAN Ratings

http://cpanratings.perl.org/d/HTML-ExtractMain

* Search CPAN

http://search.cpan.org/dist/HTML-ExtractMain/

SEE ALSO

Top

* HTML::Feature
* HTML::ExtractContent

ACKNOWLEDGEMENTS

Top

The Readability algorithm is ported from Arc90's JavaScript original, built as part of the excellent Readability application, online at http://lab.arc90.com/experiments/readability/, repository at http://code.google.com/p/arc90labs-readability/.

COPYRIGHT & LICENSE

Top


HTML-ExtractMain documentation Contained in the HTML-ExtractMain distribution.
#!perl

package HTML::ExtractMain;
use Carp qw( carp );
use HTML::TreeBuilder;
use Object::Destroyer 2.0;
use Scalar::Util qw( refaddr );
use base qw( Exporter );
use strict;
use warnings;

our @EXPORT_OK = qw( extract_main_html );

sub extract_main_html {
    my $arg = shift;

    unless ( defined $arg ) {
        carp 'extract_main_html requires HTML content as an argument';
        return;
    }

    my $tree;
    if ( ref $arg and blessed $arg and $arg->isa('HTML::TreeBuilder') ) {
        $tree = $arg;
    } else {
        my $raw_html = $arg;

        $tree = eval { HTML::TreeBuilder->new_from_content($raw_html) };
        if ( !$tree ) {
            carp 'check HTML input, could not create new HTML::TreeBuilder';
            return;
        }
    }

    # Remove any lingering circular references. Details at:
    # http://www.perl.com/pub/2007/06/07/better-code-through-destruction.html
    my $sentry = Object::Destroyer->new( $tree, 'delete' );

    # Use the Readability algorithm, inspired by:
    # http://lab.arc90.com/experiments/readability/js/readability.js

    # Study all the paragraphs and find the chunk that has the best score.
    # A score is determined by things like: Number of <p>'s, commas,
    #  class names, etc.

    my %parents;
    foreach my $p ( $tree->find_by_tag_name('p') ) {
        my $parent    = $p->parent;
        my $parent_id = refaddr($parent);

        if ( !defined $parents{$parent_id} ) {
            $parents{$parent_id}->{element}     = $parent;
            $parents{$parent_id}->{readability} = 0;

            my $text_to_scan = join q{ },
                grep {defined}
                ( $parent->attr('class'), $parent->attr('id') );

            if ( $text_to_scan =~ m/\b(?:comment|meta|footer|footnote)\b/ ) {
                $parents{$parent_id}->{readability} -= 50;
            } elsif ( $text_to_scan
                =~ m/\b(post|hentry|entry[-]?(content|text|body)?|article[-]?(content|text|body)?)\b/
                ) {
                $parents{$parent_id}->{readability} += 25;
            }
        }

        # add point for each para found
        $parents{$parent_id}->{readability}++;

        # add a point for each comma found in the paragraph
        foreach my $text_ref ( $p->content_refs_list ) {
            my $num_commas = ( ${$text_ref} =~ m/,/g );
            $parents{$parent_id}->{readability} += $num_commas;
        }
    }

    my $best_parent;
    foreach my $id ( keys %parents ) {
        if (   !$best_parent
             || $parents{$id}->{readability} > $best_parent->{readability} ) {
            $best_parent = $parents{$id};
        }
    }

    if ($best_parent) {
        my $best_parent_element = $best_parent->{element};
        $best_parent_element->detach;
        my $html = $best_parent_element->as_XML;
        $html =~ s{^<body>(.*)</body>\s*$}{$1}s;    # kill wrapping <body>
        $best_parent_element->delete;
        return $html;
    } else {
        return;
    }
}

our $VERSION = '0.62';

1;    # End of HTML::ExtractMain

# Local Variables:
# mode: perltidy
# End: