WWW::Google::Images - Google Images Agent


WWW-Google-Images documentation Contained in the WWW-Google-Images distribution.

Index


Code Index:

NAME

Top

WWW::Google::Images - Google Images Agent

VERSION

Top

Version 0.6.5

DESCRIPTION

Top

This module may be used to search for images on Google. Its interface is heavily inspired by WWW::Google::Groups.

SYNOPSIS

Top

    use WWW::Google::Images;

    $agent = WWW::Google::Images->new(
        server => 'images.google.com',
        proxy  => 'my.proxy.server:port',
    );

    $result = $agent->search('flowers', limit => 10);

    while ($image = $result->next()) {
        $count++;
        print $image->content_url();
        print $image->context_url();
        print $image->save_content(base => 'image' . $count);
        print $image->save_context(base => 'page' . $count);
    }

Constructor

Top

new(%args)

Creates and returns a new WWW::Google::Images object.

Optional parameters:

server => $server

use $server as server.

proxy => $proxy:$port

use $proxy as proxy on port $port.

$agent->search($query, %args);

Perform a search for $query, and return a WWW::Google::Images::SearchResult object.

Optional parameters:

limit => $limit

limit the maximum number of results returned to $limit.

min_width => $width

limit the minimum width of result returned to $width pixels.

min_height => $height

limit the minimum height of results returned to $height pixels.

min_size => $size

limit the minimum size of results returned to $size kilobytes.

max_width => $width

limit the maximum width of result returned to $width pixels.

max_height => $height

limit the maximum height of results returned to $height pixels.

max_size => $size

limit the maximum size of results returned to $size kilobytes.

ratio => $ratio

limit the width/height ratio of result returned to $ratio (+/- tolerance).

ratio_delta => $ratio_delta

set the tolerance limit for the ratio limit to $ratio_delta (default: 1.0).

regex => $regex

limit the result returned to those whose filename matches case-sensitive $regex regular expression.

iregex => $regex

limit the result returned to those whose filename matches case-insensitive $regex regular expression.

COPYRIGHT AND LICENSE

Top

AUTHOR

Top

Guillaume Rousse <grousse@cpan.org>


WWW-Google-Images documentation Contained in the WWW-Google-Images distribution.
# $Id: Images.pm,v 1.31 2007/12/29 01:09:12 rousse Exp $
package WWW::Google::Images;

use WWW::Mechanize;
use WWW::Google::Images::SearchResult;
use HTML::Parser;
use strict;
use warnings;
our $VERSION = '0.6.5';

# Build a new search agent.
#
# Optional named arguments:
#   server - address of the image search server
#   proxy  - address of an HTTP proxy, as host:port
#
# Either value may be given with or without a scheme; 'http://' is
# prepended when none is present. Without a server argument the agent
# falls back to 'http://images.google.com/'.
#
# Returns the new object, blessed into $class.
sub new {
    my ($class, %arg) = @_;

    # Normalise the two address options so both always carry a scheme.
    for my $option ('server', 'proxy') {
        my $value = $arg{$option};
        next unless $value;
        next if $value =~ m,^\w+?://,o;
        $arg{$option} = 'http://' . $value;
    }

    # The user agent is created with both of its onwarn/onerror hooks unset.
    my $mech = WWW::Mechanize->new(onwarn => undef, onerror => undef);
    if ($arg{proxy}) {
        $mech->proxy(['http'], $arg{proxy});
    }

    my %state = (
        _server => ($arg{server} || 'http://images.google.com/'),
        _proxy  => $arg{proxy},
        _agent  => $mech,
    );

    return bless \%state, $class;
}

# Run an image search and collect the matching results.
#
# Arguments:
#   $query - the search terms; must be defined and non-empty
#   %arg   - optional settings. 'limit' caps the number of results
#            (default 10; a false value such as 0 disables the cap). Every
#            setting is also passed on to _extract_images() for filtering.
#
# Returns a WWW::Google::Images::SearchResult built from the agent and the
# collected images. When no query is given, emits a warning and returns
# nothing.
sub search {
    my ($self, $query, %arg) = @_;

    # Test definedness and length rather than truth, so that the valid
    # query "0" is not mistaken for a missing one.
    unless (defined $query && length $query) {
        warn "No query given, aborting";
        return;
    }

    $arg{limit} = 10 unless defined $arg{limit};

    # Load the search page and submit the query through its first form.
    $self->{_agent}->get($self->{_server});
    $self->{_agent}->submit_form(
        form_number => 1,
        fields      => {
            q => $query
        }
    );

    my @images;
    my $page = 1;

    # Harvest the current page, then follow the link labelled with the next
    # page number until the limit is reached or no such link can be followed.
    PAGE: while (1) {
        my $remaining = $arg{limit} ? $arg{limit} - @images : 0;
        push(@images, $self->_extract_images($remaining, %arg));
        last PAGE if $arg{limit} && @images >= $arg{limit};
        last PAGE unless $self->_next_page(++$page);
    }

    return WWW::Google::Images::SearchResult->new($self->{_agent}, @images);
}

# Move the agent to another result page by following the link whose text is
# the given page number. Returns whatever the agent's follow_link() returns.
sub _next_page {
    my $self        = shift;
    my $page_number = shift;

    my $agent = $self->{_agent};
    return $agent->follow_link(text => $page_number);
}

# Extract the image results from the page currently loaded in the agent.
#
# Arguments:
#   $limit - maximum number of images to return for this page; a false
#            value (0 or undef) means no maximum
#   %arg   - filtering options: min_size, max_size, min_width, max_width,
#            min_height, max_height, ratio, ratio_delta, regex, iregex
#
# Returns a list of hash references, each with a 'content' key (value of the
# link's imgurl parameter) and a 'context' key (value of its imgrefurl
# parameter).
#
# Links that do not carry both parameters are skipped. When a dimension
# filter is requested, links for which no dimension caption could be parsed
# are skipped as well, since the filter cannot be evaluated for them.
sub _extract_images {
    my ($self, $limit, %arg) = @_;

    my @images;
    my @data;

    my @links = $self->{_agent}->find_all_links( url_regex => qr/imgurl/ );

    my $filter_dimensions =
        $arg{min_size}   ||
        $arg{max_size}   ||
        $arg{min_width}  ||
        $arg{max_width}  ||
        $arg{min_height} ||
        $arg{max_height} ||
        $arg{ratio};

    if ($filter_dimensions) {
        # Dimensions are read from text captions of the form
        # "<width> x <height> - <size>k"; captions are collected in document
        # order and paired with the links by position below.
        my $parser = HTML::Parser->new();
        my $pattern = qr/
                        ^
                        (\d+) \s x \s (\d+)
                        \s - \s (\d+)k
                        (?:&nbsp; - &nbsp; \w*)?
                        $
                /x;
        my $callback = sub {
            my ($text) = @_;
            if ($text =~ $pattern) {
                push(@data, { width => $1, height => $2, size => $3 });
            }
        };
        $parser->handler(text => $callback, 'text');
        $parser->parse($self->{_agent}->content());
        # Signal end of document so text still buffered is reported too.
        $parser->eof();
    }

    my ($upper, $lower);
    if ($arg{ratio}) {
        # Honour an explicit tolerance of 0 instead of replacing it with
        # the default.
        my $delta = defined $arg{ratio_delta} ? $arg{ratio_delta} : 1.0;
        $lower = $arg{ratio} - $delta;
        $upper = $arg{ratio} + $delta;
    }

    for my $i (0 .. $#links) {
        if ($filter_dimensions) {
            my $info = $data[$i];
            next unless $info;

            next if $arg{min_size} && $info->{size} < $arg{min_size};
            next if $arg{max_size} && $info->{size} > $arg{max_size};
            next if $arg{min_width} && $info->{width} < $arg{min_width};
            next if $arg{max_width} && $info->{width} > $arg{max_width};
            next if $arg{min_height} && $info->{height} < $arg{min_height};
            next if $arg{max_height} && $info->{height} > $arg{max_height};
            if ($arg{ratio}) {
                # A zero height would make the division below die.
                next unless $info->{height};
                my $ratio = $info->{width} / $info->{height};
                next if $ratio < $lower || $ratio > $upper;
            }
        }

        # Take the captures from the match itself: after a failed match
        # $1 and $2 would not describe this link.
        my ($content, $context) =
            $links[$i]->url() =~ /imgurl=([^&]+)&imgrefurl=([^&]+)/;
        next unless defined $content;

        next if $arg{regex} && $content !~ /$arg{regex}/;
        next if $arg{iregex} && $content !~ /$arg{iregex}/i;
        push(@images, { content => $content, context => $context});
        last if $limit && @images == $limit;
    }

    return @images;
}

1;