SoggyOnion::Plugin::ImageScraper - get images from a page


SoggyOnion documentation Contained in the SoggyOnion distribution.

Index


Code Index:

NAME

Top

SoggyOnion::Plugin::ImageScraper - get images from a page

SYNOPSIS

Top

In config.yaml:

    layout:
      - title: Comic Strips
        name:  comics.html
        items:
          - images: http://www.myfavoritestrip.com/
            id:  myfavoritestrip
            offset: 4
            limit: 1

DESCRIPTION

Top

This is a plugin for SoggyOnion that grabs a series of <IMG> tags from a URI and adds them to the SoggyOnion output page.

Item Options

* images - the URI of the page to scrape the image(s) from
* id - the item ID that appears in the HTML <DIV> tag
* offset - (optional) the index of the first image, default is 0 (first image)
* limit - (optional) how many images to show past the offset, default is 1
* prefix - (optional) a string to prepend to the SRC of every image. If no prefix is set, each IMG SRC is checked instead, and relative ones are prepended with the URI specified with images.

SEE ALSO

Top

SoggyOnion

AUTHOR

Top

Ian Langworth, <ian@cpan.org>

COPYRIGHT AND LICENSE

Top


SoggyOnion documentation Contained in the SoggyOnion distribution.

package SoggyOnion::Plugin::ImageScraper;
use warnings;
use strict;
use base qw( SoggyOnion::Plugin );

our $VERSION = '0.04';

use Template;

# name of the template file handed to Template's process() in content()
use constant TEMPLATE_FILE => 'imagescraper.tt2';

use LWP::Simple qw(get head $ua);

# index into the list returned by head(); mod_time() uses it to pick out
# the document's modification time
use constant MOD_TIME => 2;

use HTML::TokeParser;

# indexes into the array refs returned by the parser's get_token(), as used
# in content(): the token type, the tag name and the attribute hash
use constant { TYPE => 0, TAG => 1, ATTR => 2 };

# be a polite client: have the user agent object imported from LWP::Simple
# identify itself with whatever SoggyOnion->useragent provides
sub init {
    return $ua->agent( SoggyOnion->useragent() );
}

# Try getting the modification time of the scraped page from the web server.
# If we can't (the HEAD request fails or the server reports no modification
# time), return the current time to make sure the item is processed.
#
# Returns the reported modification time if it is a true value, otherwise
# the current time.
sub mod_time {
    my $self = shift;

    # This plugin's URI lives in the 'images' option. head() in list context
    # returns an empty list on failure, which leaves $mtime undefined.
    my @headers = head( $self->{images} );
    my $mtime   = $headers[MOD_TIME];

    return $mtime || time;    # in case no modification time is available
}

# Fetch the page named by the 'images' option, collect the SRC of up to
# 'limit' <IMG> tags starting with the 'offset'-th one, turn them into
# absolute URIs and render them through the TEMPLATE_FILE template.
#
# Options read from $self:
#   images - (required) URI of the page to scrape
#   offset - index of the first image to use, defaults to 0
#   limit  - maximum number of images to collect, defaults to 1
#   prefix - if present, prepended verbatim to every collected SRC
#
# Returns the rendered template output. Dies if 'images' is missing, if the
# page can't be fetched, or if the template can't be created or processed.
sub content {
    my $self = shift;

    # error checking for required options
    die "'images' attribute is required\n"
        unless ( exists $self->{images} );

    # setup defaults for other options
    $self->{offset} ||= 0;
    $self->{limit}  ||= 1;

    # get the URL
    my $document = get( $self->{images} );
    die "couldn't get document" unless defined $document;

    # cheap way of getting title! FIXME
    # URI::Title doesn't do much more anyway.
    # The list assignment leaves $title undefined when the page has no
    # <title>, rather than picking up a stale $1 from an earlier match.
    my ($title) = $document =~ m#<title>(.+?)</title>#si;

    # process links
    my $parser = HTML::TokeParser->new( \$document ) or die $!;
    my $seen   = 0;
    my @links  = ();
    while ( my $token = $parser->get_token ) {
        next unless ref $token eq 'ARRAY';
        next unless $token->[TYPE] eq 'S' && $token->[TAG] eq 'img';

        # every <img> counts towards the offset, even one we can't use
        next unless $seen++ >= $self->{offset};

        # an <img> without a usable SRC gives us nothing to link to
        my $src = $token->[ATTR]->{src};
        next unless defined $src && length $src;

        push @links, $src;
        last if @links >= $self->{limit};
    }

    # did we specify a prefix in the config? if so, prefix all links
    if ( exists $self->{prefix} ) {
        @links = map { $self->{prefix} . $_ } @links;
    }

    # no prefix in the conf? go through and make sure that all our links are
    # absolute. if they're relative, prepend the source URL
    else {

        # determine protocol -- used if double-slash shorthand is used
        my ($protocol) = $self->{images} =~ m/^(\w+):/;
        $protocol = 'http' unless defined $protocol;

        # strip the connecting slash from a copy, so the configured URI
        # stored in the object is left untouched
        ( my $base = $self->{images} ) =~ s#/+$##;

        for my $link (@links) {

            # valid but uncommon URI shorthand
            $link = $protocol . ':' . $link if $link =~ m#^//[^/]#;

            # already absolute (or an inline data: URI): leave it alone
            next if $link =~ m{^(?:\w+://|data:)}i;

            # strip connecting slashes
            $link =~ s#^/+##;

            # prepend relative URIs with our source URI
            # NOTE(review): root-relative paths ("/img/a.png") are appended
            # to the full source URI rather than resolved against the host;
            # proper resolution would need a URI-handling module.
            $link = $base . '/' . $link;
        }
    }

    # run it through our template
    my $tt
        = Template->new( INCLUDE_PATH => SoggyOnion->options->{templatedir} )
        or die "couldn't create Template object\n";
    my $output;
    $tt->process( TEMPLATE_FILE,
        { links => \@links, src => $self->{images}, title => $title },
        \$output )
        or die $tt->error;
    return $output;
}

1;

__END__