Plagger::Plugin::Filter::EntryFullText - Upgrade your feeds to fulltext class


Plagger documentation Contained in the Plagger distribution.

Index


Code Index:

NAME

Top

Plagger::Plugin::Filter::EntryFullText - Upgrade your feeds to fulltext class

SYNOPSIS

Top

  - module: Filter::EntryFullText

DESCRIPTION

Top

This plugin lets you fetch the full text of an entry by doing an HTTP GET on its permalink and applying a regexp to the returned HTML. It's just like upgrading your flight ticket from economy class to business class!

You can write a custom fulltext handler by putting .pl or .yaml files under the plugin's assets directory.

CONFIG

Top

store_html_on_failure

Even if the fulltext handlers fail to extract the content body from the HTML, this option makes the plugin store the whole HTML document as the entry body. This is useful together with search engines such as Gmail and the Search:: plugins. Defaults to 0.

force_upgrade

Even if entry body already contains HTML, this config forces the plugin to upgrade the body. Defaults to 0.

WRITING CUSTOM FULLTEXT HANDLER

Top

(To be documented)

AUTHOR

Top

Tatsuhiko Miyagawa

SEE ALSO

Top

Plagger


Plagger documentation Contained in the Plagger distribution.

package Plagger::Plugin::Filter::EntryFullText;
use strict;
use base qw( Plagger::Plugin );

use DirHandle;
use Encode;
use File::Spec;
use List::Util qw(first);
use HTML::ResolveLink;
use Plagger::Date; # for metadata in plugins
use Plagger::Util qw( decode_content );
use Plagger::Plugin::CustomFeed::Simple;
use Plagger::UserAgent;

# Returns the hook name 'update.entry.fixup'.
# NOTE(review): presumably consulted by Plagger's rule handling -- the caller
# is not visible in this file.
sub rule_hook {
    return 'update.entry.fixup';
}

# Subscribe this plugin to the hooks it serves:
#   customfeed.handle  -> handle()
#   update.entry.fixup -> filter()
# The result of $context->register_hook is passed back unchanged.
sub register {
    my ($self, $context) = @_;

    my @callbacks = (
        'customfeed.handle'  => \&handle,
        'update.entry.fixup' => \&filter,
    );

    $context->register_hook($self, @callbacks);
}

# Set the plugin up: run the parent class's init with the remaining
# arguments, load the site handlers (see load_plugins), and create the
# Plagger::UserAgent that filter() fetches pages with.
sub init {
    my $self = shift @_;

    $self->SUPER::init(@_);
    $self->load_plugins;

    my $agent = Plagger::UserAgent->new;
    $self->{ua} = $agent;
}

# Load every site handler shipped as an asset of this plugin.
#
# YAML handlers are loaded first, then Perl ones; each matching file is
# passed to load_plugin_yaml / load_plugin_perl with the arguments that
# $self->load_assets hands to its callback.
sub load_plugins {
    my $self = shift;

    $self->load_assets('*.yaml', sub { $self->load_plugin_yaml(@_) });
    $self->load_assets('*.pl',   sub { $self->load_plugin_perl(@_) });
}

# Compile one site handler written in Perl and register an instance of it.
#
#   $file - path of the .pl asset
#   $base - its file name; with the trailing ".pl" removed this becomes the
#           handler's site name and the last component of its class name
#
# A file that does not begin with a "package" statement is wrapped in a
# generated package deriving from Plagger::Plugin::Filter::EntryFullText::Site.
# Failures to open or compile the file are reported through
# Plagger->context->error.
#
# If the handler class already exists the file is not compiled again, but a
# fresh instance is still added to $self->{plugins}; previously that instance
# was returned without being registered, so the handler was lost.
#
# Returns the handler instance.
sub load_plugin_perl {
    my($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load plugin $file");

    (my $pkg = $base) =~ s/\.pl$//;
    my $plugin_class = "Plagger::Plugin::Filter::EntryFullText::Site::$pkg";

    if ($plugin_class->can('new')) {
        Plagger->context->log(warn => "$plugin_class is already defined. skip compiling code");
    } else {
        # Only touch the file when it actually has to be compiled.
        open my $fh, '<', $file or Plagger->context->error("$file: $!");
        my $code = join '', <$fh>;
        close $fh;

        unless ($code =~ /^\s*package/s) {
            $code = join "\n",
                ( "package $plugin_class;",
                  "use strict;",
                  "use base qw( Plagger::Plugin::Filter::EntryFullText::Site );",
                  "sub site_name { '$pkg' }",
                  $code,
                  "1;" );
        }

        # NOTE(review): string eval of an asset file is how handlers are
        # loaded by design; the assets directory must only hold trusted code.
        eval $code;
        Plagger->context->error($@) if $@;
    }

    my $plugin = $plugin_class->new;
    push @{ $self->{plugins} }, $plugin;
    return $plugin;
}

# Read a YAML asset and register one YAML-backed handler per document in it.
#
#   $file - path of the .yaml asset
#   $base - its file name, used as the handler's site name
#
# NOTE(review): YAML is not loaded in this file; presumably it is already
# loaded elsewhere in Plagger -- confirm.
sub load_plugin_yaml {
    my ($self, $file, $base) = @_;

    Plagger->context->log(debug => "Load YAML $file");

    my @handlers;
    for my $document (YAML::LoadFile($file)) {
        push @handlers,
            Plagger::Plugin::Filter::EntryFullText::YAML->new($document, $base);
    }

    push @{ $self->{plugins} }, @handlers;
}

# Hook callback for 'customfeed.handle'.
#
# Picks the first loaded handler whose custom_feed_handle accepts $args.
# When one is found, its follow-link and follow-xpath settings are copied
# into $args->{match} and $args->{xpath}, and the aggregation is delegated to
# Plagger::Plugin::CustomFeed::Simple::aggregate, whose result is returned.
sub handle {
    my ($self, $context, $args) = @_;

    my $site = first { $_->custom_feed_handle($args) } @{ $self->{plugins} };
    if ($site) {
        my $follow_link  = $site->custom_feed_follow_link;
        my $follow_xpath = $site->custom_feed_follow_xpath;

        $args->{match} = $follow_link;
        $args->{xpath} = $follow_xpath;

        return $self->Plagger::Plugin::CustomFeed::Simple::aggregate($context, $args);
    }
}

# Hook callback for 'update.entry.fixup' (wired up in register()): try to
# replace the entry's body with full text extracted from its permalink page.
#
#   $self    - this plugin; reads $self->{plugins}, $self->{ua} and $self->conf
#   $context - object used for logging
#   $args    - hash ref; $args->{entry} is the entry being fixed up.  The
#              decoded page is stored in $args->{content} before any handler
#              is asked to extract from it.
#
# Returns 1 when the body was replaced, either by a handler or by the whole
# page when store_html_on_failure is set.  The "Skipped" and fetch-failure
# exits use a bare return; the final failure path ends with the warn log call.
sub filter {
    my($self, $context, $args) = @_;

    # A handler whose handle_force accepts this entry bypasses the
    # "body is already HTML" shortcut and is the only handler tried below.
    my $handler = first { $_->handle_force($args) } @{ $self->{plugins} };
    if ( !$handler && $args->{entry}->body && $args->{entry}->body->is_html && !$self->conf->{force_upgrade} ) {
        $self->log(debug => $args->{entry}->link . " already contains body. Skipped");
        return;
    }

    # Nothing can be fetched without a permalink.
    if (! $args->{entry}->permalink) {
        $self->log(debug => "Entry " . $args->{entry}->title . " doesn't have permalink. Skipped");
        return;
    }

    # NoNetwork: don't connect for 3 hours
    # NOTE(review): the meaning of status/is_error comes from the response
    # object returned by Plagger::UserAgent, which is not visible here.
    my $res = $self->{ua}->fetch( $args->{entry}->permalink, $self, { NoNetwork => 60 * 60 * 3 } );
    if (!$res->status && $res->is_error) {
        $self->log(debug => "Fetch " . $args->{entry}->permalink . " failed");
        return;
    }

    # Handlers read the page from $args->{content}.
    $args->{content} = decode_content($res);

    # if the request was redirected, set it as permalink
    if ($res->http_response) {
        my $base = $res->http_response->request->uri;
        if ( $base ne $args->{entry}->permalink ) {
            $context->log(info => "rewrite permalink to $base");
            $args->{entry}->permalink($base);
        }
    }

    # use Last-Modified to populate entry date, even if handler doesn't find one
    # TODO: make this a separate plugin
    if ($res->last_modified && !$args->{entry}->date) {
        $args->{entry}->date( Plagger::Date->from_epoch($res->last_modified) );
    }

    # With a forcing handler only that one is tried; otherwise every loaded
    # handler is a candidate, in load order.
    my @plugins = $handler ? ($handler) : @{ $self->{plugins} };

    my $upgraded;
    for my $plugin (@plugins) {
        # A forcing handler has already accepted the entry via handle_force,
        # so its handle() method is not consulted.
        if ( $handler || $plugin->handle($args) ) {
            $context->log(debug => $args->{entry}->permalink . " handled by " . $plugin->site_name);
            my $data = $plugin->extract($args);
               # a handler may return a plain string: treat it as the body
               $data = { body => $data } if $data && !ref $data;
            if ($data) {
                $context->log(info => "Extract content succeeded on " . $args->{entry}->permalink);
                # links in the extracted body are resolved against the permalink
                my $resolver = HTML::ResolveLink->new( base => $args->{entry}->permalink );

                # if body was already there, set that to summary
                if ($args->{entry}->body) {
                    $args->{entry}->summary($args->{entry}->body);
                }

                $data->{body} = $resolver->resolve( $data->{body} );
                $args->{entry}->body($data->{body});
                # optional fields only overwrite the entry when the handler found them;
                # an extracted summary replaces the old-body summary set above
                $args->{entry}->title($data->{title}) if $data->{title};
                $args->{entry}->author($data->{author}) if $data->{author};
                $args->{entry}->icon({ url => $data->{icon} }) if $data->{icon};
                $args->{entry}->summary($data->{summary}) if $data->{summary};

                # extract date using found one
                if ($data->{date}) {
                    $args->{entry}->date($data->{date});
                }

                # the first handler that yields data wins
                $upgraded++;
                last;
            }
        }
    }

    # extract TITLE tag if title is not set yet
    # TODO: make this a separate plugin
    # NOTE(review): HTML::Entities is not loaded in this file; presumably it
    # is pulled in by another module -- confirm.
    if (!$args->{entry}->title
        and $args->{content} =~ m!<title>\s*(.*?)\s*</title>!is ) {
        $args->{entry}->title( HTML::Entities::decode($1) );
    }

    return 1 if $upgraded;

    # failed to extract: store whole HTML if the config is on
    if ($self->conf->{store_html_on_failure}) {
        $args->{entry}->body($args->{content});
        return 1;
    }

    $context->log(warn => "Extract content failed on " . $args->{entry}->permalink);
}


package Plagger::Plugin::Filter::EntryFullText::Site;

# Base class for site handlers written in Perl.  Every predicate declines by
# default and the follow-link / follow-xpath settings are empty; a concrete
# handler overrides what it needs.

# Build an empty handler object.
sub new {
    my $class = shift;
    return bless {}, $class;
}

# By default a handler claims no custom feed ...
sub custom_feed_handle { return 0 }

# ... and supplies no follow-link pattern or XPath.
sub custom_feed_follow_link  { return }
sub custom_feed_follow_xpath { return }

# By default a handler neither forces itself on an entry nor accepts one.
sub handle_force { return 0 }
sub handle       { return 0 }

package Plagger::Plugin::Filter::EntryFullText::YAML;
use Encode;
use List::Util qw(first);

# Build a handler from one YAML document.
#
#   $data - hash ref holding the handler's rules
#   $base - name of the file the document came from; stored under the
#           'base' key and reported by site_name
#
# The rules are copied before being normalised, so the caller's hash is left
# exactly as it was passed in.  Returns the new handler object.
sub new {
    my($class, $data, $base) = @_;

    my %rules = %$data;

    # add ^ if handle method starts with http://
    for my $key ( qw(custom_feed_handle handle handle_force) ) {
        next unless defined $rules{$key};
        $rules{$key} = "^$rules{$key}" if $rules{$key} =~ m!^https?://!;
    }

    # decode as UTF-8; a rule may be a single string or an array of strings
    for my $key ( qw(extract extract_date_format) ) {
        next unless defined $rules{$key};
        if (ref $rules{$key} eq 'ARRAY') {
            $rules{$key} = [ map { decode("UTF-8", $_) } @{ $rules{$key} } ];
        } else {
            $rules{$key} = decode("UTF-8", $rules{$key});
        }
    }

    # 'base' always wins over a key of the same name in the document
    $rules{base} = $base;

    return bless \%rules, $class;
}

# Name of this handler: the 'base' value stored by new().
sub site_name {
    return $_[0]->{base};
}

# Does this handler claim the feed in $args->{feed}?
# Yields 0 when no custom_feed_handle pattern is configured; otherwise the
# result of matching the feed URL against that pattern.
sub custom_feed_handle {
    my ($self, $args) = @_;

    my $pattern = $self->{custom_feed_handle};
    return 0 unless $pattern;

    return $args->{feed}->url =~ /$pattern/;
}

# The configured custom_feed_follow_link value, if any.
sub custom_feed_follow_link {
    my $self = $_[0];
    return $self->{custom_feed_follow_link};
}

# The configured custom_feed_follow_xpath value, if any.
sub custom_feed_follow_xpath {
    my $self = $_[0];
    return $self->{custom_feed_follow_xpath};
}

# Should this handler be applied to the entry unconditionally?
# Yields 0 when no handle_force pattern is configured; otherwise the result
# of matching the entry's permalink against that pattern.
sub handle_force {
    my ($self, $args) = @_;

    my $pattern = $self->{handle_force};
    return 0 unless $pattern;

    return $args->{entry}->permalink =~ /$pattern/;
}

# Does this handler accept the entry in $args->{entry}?
# Yields 0 when no handle pattern is configured; otherwise the result of
# matching the entry's permalink against that pattern.
sub handle {
    my ($self, $args) = @_;

    my $pattern = $self->{handle};
    return 0 unless $pattern;

    return $args->{entry}->permalink =~ /$pattern/;
}

# Replace each argument, in place, with the result of
# Plagger::Util::encode_xml.  extract() installs this temporarily as
# HTML::Element::_xml_escape while serialising matched nodes.
sub xml_escape {
    for my $index (0 .. $#_) {
        # assigning through @_ updates the caller's variable
        $_[$index] = Plagger::Util::encode_xml($_[$index]);
    }
}

# Pull entry fields out of the fetched page using this handler's rules.
#
#   $args->{content} - the page text to search
#
# Rules read from $self:
#   extract               - regexp applied with /s; its captures fill the
#                           fields named, whitespace-separated, in
#                           extract_capture (default: body)
#   extract_xpath         - hash of field name => XPath expression; the first
#                           matching node supplies the field
#   extract_after_hook    - Perl source evaluated once something was found
#   extract_date_format   - strptime pattern, or array of patterns, used to
#                           parse an extracted 'date' field
#   extract_date_timezone - time zone set on a date parsed that way
#
# Returns a hash ref of extracted fields, or nothing when no rule is
# configured, HTML::TreeBuilder::XPath cannot be loaded, or nothing matched.
sub extract {
    my($self, $args) = @_;
    my $data;

    unless ($self->{extract} || $self->{extract_xpath}) {
        Plagger->context->log(error => "YAML doesn't have either 'extract' nor 'extract_xpath'");
        return;
    }

    if ($self->{extract}) {
        if (my @match = $args->{content} =~ /$self->{extract}/s) {
            my @capture = defined $self->{extract_capture}
                ? split(/\s+/, $self->{extract_capture})
                : ();
            @capture = ('body') unless @capture;
            @{$data}{@capture} = @match;
        }
    }

    if ($self->{extract_xpath}) {
        eval { require HTML::TreeBuilder::XPath };
        if ($@) {
            Plagger->context->log(error => "HTML::TreeBuilder::XPath is required. $@");
            return;
        }

        my $tree = HTML::TreeBuilder::XPath->new;
        $tree->parse($args->{content});
        $tree->eof;

        for my $capture (keys %{$self->{extract_xpath}}) {
            my @children = $tree->findnodes($self->{extract_xpath}->{$capture});
            if (@children) {
                # serialise with Plagger's XML escaping instead of HTML::Element's
                no warnings 'redefine';
                local *HTML::Element::_xml_escape = \&xml_escape;
                $data->{$capture} = $children[0]->isElementNode
                    ? $children[0]->as_XML
                    : $children[0]->getValue;
            } else {
                Plagger->context->log(error => "Can't find node matching $self->{extract_xpath}->{$capture}");
            }
        }

        # Only strings were copied out of the tree, so it can be released.
        # Older HTML::Tree releases need an explicit delete to break the
        # parent/child reference cycles.
        $tree->delete;
    }

    return unless $data;

    if ($self->{extract_after_hook}) {
        # NOTE(review): string eval of configuration text; the hook code sees
        # the lexicals $self, $args and $data, so these names must stay as
        # they are.  Handler files must come from a trusted source.
        eval $self->{extract_after_hook};
        Plagger->context->error($@) if $@;
    }

    if ($data->{date}) {
        if (my $format = $self->{extract_date_format}) {
            $format = [ $format ] unless ref $format;

            # Use the first pattern that parses the date.  This works whether
            # a failed strptime yields undef or an empty list, and skips the
            # remaining patterns once one succeeds.
            my $raw_date = $data->{date};
            my $parsed;
            for my $pattern (@$format) {
                ($parsed) = Plagger::Date->strptime($pattern, $raw_date);
                last if defined $parsed;
            }
            $data->{date} = $parsed;

            if ($data->{date} && $self->{extract_date_timezone}) {
                $data->{date}->set_time_zone($self->{extract_date_timezone});
            }
        } else {
            $data->{date} = Plagger::Date->parse_dwim($data->{date});
        }
    }

    return $data;
}

1;

__END__