WWW::Search::MetaCrawler - class for searching http://mc3.go2net.com!


WWW-Search-Backends documentation Contained in the WWW-Search-Backends distribution.

Index


Code Index:

NAME

Top

WWW::Search::MetaCrawler - class for searching http://mc3.go2net.com!

SYNOPSIS

Top

    require WWW::Search;
    $search = new WWW::Search('MetaCrawler');

DESCRIPTION

Top

Class specialization of WWW::Search for searching http://mc3.go2net.com.

This class exports no public interface; all interaction should be done through WWW::Search objects.

SEE ALSO

Top

To make new back-ends, see WWW::Search, or the specialized AltaVista searches described in options.

HOW DOES IT WORK?

Top

native_setup_search is called before we do anything. It initializes our private variables (which all begin with underscores) and sets up a URL to the first results page in {_next_url}.

native_retrieve_some is called (from WWW::Search::retrieve_some) whenever more hits are needed. It calls the LWP library to fetch the page specified by {_next_url}. It parses this page, appending any search hits it finds to {cache}. If it finds a ``next'' button in the text, it sets {_next_url} to point to the page for the next set of results, otherwise it sets it to undef to indicate we're done.

AUTHOR

Top

WWW::Search::MetaCrawler is written and maintained by Jim Smyser - <jsmyser@bigfoot.com>.

COPYRIGHT

Top


WWW-Search-Backends documentation Contained in the WWW-Search-Backends distribution.

# MetaCrawler.pm
# by Jim Smyser
# $Id: MetaCrawler.pm,v 2.72 2008/02/01 02:50:26 Daddy Exp $

#'

#####################################################################

package WWW::Search::MetaCrawler;

use strict;
use warnings;

use base 'WWW::Search';

our
$VERSION = do { my @r = ( q$Revision: 2.72 $ =~ /\d+/g ); sprintf "%d." . "%03d" x $#r, @r };

my $MAINTAINER = 'Jim Smyser <jsmyser@bigfoot.com>';
my $TEST_CASES = <<"ENDTESTCASES";
&test('MetaCrawler', '$MAINTAINER', 'zero', \$bogus_query, \$TEST_EXACTLY);
&test('MetaCrawler', '$MAINTAINER', 'one', 'ohme'.'ohmy', \$TEST_RANGE, 2,20);
&test('MetaCrawler', '$MAINTAINER', 'two', 'Bos'.'sk', \$TEST_GREATER_THAN, 101);
ENDTESTCASES

use Carp ();
use WWW::Search qw(generic_option strip_tags);
require WWW::SearchResult;

sub native_setup_search {

       my($self, $native_query, $native_options_ref) = @_;
       $self->{_debug} = $native_options_ref->{'search_debug'};
       $self->{_debug} = 2 if ($native_options_ref->{'search_parse_debug'});
       $self->{_debug} = 0 if (!defined($self->{_debug}));
       $self->{agent_e_mail} = 'jsmyser@bigfoot.com';
       $self->user_agent('user');
       $self->{'_num_hits'} = 30;
       $self->{'_next_to_retrieve'} = 1;
               if (!defined($self->{_options})) {
             $self->{_options} = {
             'search_url' => 'http://mc3.go2net.com/crawler',
             'method'=> '0',
             'region' => '0',
             'rpp' => $self->{'_num_hits'},
             'timeout' => '10',
             'hpe' => '30',
             'sort'=> '0',
             'eng' => 'AltaVista&eng=Excite&eng=Infoseek&eng=Lycos&eng=WebCrawler&eng=Yahoo&eng=Thunderstone&eng=LookSmart&eng=About&eng=GoTo&eng=DirectHit',
             'refer' => 'mc-power',
             'general' =>  $native_query,
             };
             }

       my $options_ref = $self->{_options};
       if (defined($native_options_ref))
         {
       foreach (keys %$native_options_ref)
         {
       $options_ref->{$_} = $native_options_ref->{$_};
         } 
         } 
       my($options) = '';
       foreach (sort keys %$options_ref)
         {
       next if (generic_option($_));
       $options .= $_ . '=' . $options_ref->{$_} . '&';
         }
       chop $options;
       $self->{_next_url} = $self->{_options}{'search_url'} .'?'. $self->hash_to_cgi_string($self->{_options});
         } 

# private
sub native_retrieve_some {

       my ($self) = @_;
       print STDERR "**MetaCrawler::native_retrieve_some()\n" if $self->{_debug};
       # Fast exit if already done:
       return undef if (!defined($self->{_next_url}));
           
       # If this is not the first page of results, sleep so as to not
       # overload the server:
       $self->user_agent_delay if 1 < $self->{'_next_to_retrieve'};
           
       # Get some:
       print STDERR "**Requesting (",$self->{_next_url},")\n" if $self->{_debug};
       my($response) = $self->http_request('GET', $self->{_next_url});
       $self->{response} = $response;
       if (!$response->is_success)
         {
       return undef;
         }
       $self->{'_next_url'} = undef;
       print STDERR "**Found Some\n" if $self->{_debug};

       my ($HEADER, $HITS, $NEXT, $DESC) = qw(HE HI NX DE);
       my $state = $HEADER;
       my $hit = ();
       my $hits_found = 0;
       foreach ($self->split_lines($response->content()))
         {
       next if m@^$@; # short circuit for blank lines
       print STDERR " * $state ===$_=== " if 2 <= $self->{'_debug'};
       if (m|(\d+) results</font>|i) 
         {
       $self->approximate_result_count($1);
       $state = $HITS;
         } 
       elsif ($state eq $HITS && m@<dt>.*?<a href="([^"]+)">(.*)</a>@i) 
         {
       print STDERR "**Found a URL\n" if 2 <= $self->{_debug};
       my ($url,$title) = ($1,$2);
       if (defined($hit))
         {
       push(@{$self->{cache}}, $hit);
         }
       $hit = new WWW::SearchResult;
       $hit->add_url($url);
       $hits_found++;
       $hit->title(strip_tags($title));
       $state = $NEXT;
         } 
       elsif ($state eq $NEXT && m@^<br>@i) 
         {
       $state = $DESC;
         } 
       elsif ($state eq $DESC && m@(<i>.*?</i>:.*)<br>@i || m@<dd>(.*)<br>@i) 
         {
       print STDERR "**Found description\n" if 2 <= $self->{_debug};
       $hit->description(strip_tags($1));
       $state = $HITS;
         } 
       elsif ($state eq $HITS && m|HREF="([^"]+)"><b>next</b></a>|i) 
         {
       print STDERR "**Found 'next' Tag\n" if 2 <= $self->{_debug};
       my $sURL = $1;
       $self->{'_next_url'} = $sURL;
       print STDERR " **Next URL is: ", $self->{'_next_url'}, "\n" if 2 <= $self->{_debug};
       $state = $HITS;
         }
         else
         {
       print STDERR "**Nothing Matched\n" if 2 <= $self->{_debug};
         }
         } 
         if (defined($hit)) 
         {
       push(@{$self->{cache}}, $hit);
         }
       return $hits_found;
         }

1;

__END__