WWW::Search::Metapedia - class for searching online Encyclopedias


WWW-Search-Backends documentation Contained in the WWW-Search-Backends distribution.

Index


Code Index:

NAME

Top

WWW::Search::Metapedia - class for searching online Encyclopedias

SYNOPSIS

Top

    require WWW::Search;
    $search = new WWW::Search('Metapedia');




DESCRIPTION

Top

http://www.savvysearch.com.

Silly class I made to sponge off a Encyclopedia meta search engine. Searches multi online Encyclopedias.

Covers:

Encylopedia Britannica - paid subscription service Encarta Knowledge Adventure Infoplease

This class exports no public interface; all interaction should be done through WWW::Search objects.

SEE ALSO

Top

To make new back-ends, see WWW::Search.

HOW DOES IT WORK?

Top

native_setup_search is called before we do anything. It initializes our private variables (which all begin with underscores) and sets up a URL to the first results page in {_next_url}.

native_retrieve_some is called (from WWW::Search::retrieve_some) whenever more hits are needed. It calls the LWP library to fetch the page specified by {_next_url}. It parses this page, appending any search hits it finds to {cache}. If it finds a ``next'' button in the text, it sets {_next_url} to point to the page for the next set of results, otherwise it sets it to undef to indicate we're done.

AUTHOR

Top

Jim Smyser <jsmyser@bigfoot.com>

COPYRIGHT

Top


WWW-Search-Backends documentation Contained in the WWW-Search-Backends distribution.

# Metapedia.pm
# by Jim Smyser
# Copyright (C) 1996-1998 by USC/ISI
# $Id: Metapedia.pm,v 2.42 2008/02/01 02:50:26 Daddy Exp $

package WWW::Search::Metapedia;

use strict;
use warnings;

use base 'WWW::Search';

our
$VERSION = do { my @r = ( q$Revision: 2.42 $ =~ /\d+/g ); sprintf "%d." . "%03d" x $#r, @r };

use Carp ();
use WWW::Search(generic_option);
use WWW::SearchResult;


# private
sub native_setup_search {
    my($self, $native_query, $native_options_ref) = @_;
    $self->user_agent('user');
    $self->{_next_to_retrieve} = 0;
    if (!defined($self->{_options})) {
    $self->{_options} = {
        'cat' => '63',
        'search_url' => 'http://www.savvysearch.com/search',
        };
    };
    my($options_ref) = $self->{_options};
    if (defined($native_options_ref)) {
    # Copy in new options.
    foreach (keys %$native_options_ref) {
        $options_ref->{$_} = $native_options_ref->{$_};
    };
    };
    # Process the options.
    my($options) = '';
    foreach (keys %$options_ref) {
    # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n";
    next if (generic_option($_));
    $options .= $_ . '=' . $options_ref->{$_} . '&';
    };
    $self->{_debug} = $options_ref->{'search_debug'};
    $self->{_debug} = 2 if ($options_ref->{'search_parse_debug'});
    $self->{_debug} = 0 if (!defined($self->{_debug}));
    
    # Finally figure out the url.
    $self->{_base_url} = 
    $self->{_next_url} =
    $self->{_options}{'search_url'} .
    "?" . $options .
    "q=" . $native_query;
    print $self->{_base_url} . "\n" if ($self->{_debug});
    }
# private
sub native_retrieve_some {
    my ($self) = @_;
    # fast exit if already done
    return undef if (!defined($self->{_next_url}));

    # get some
    print STDERR "WWW::Search::Metapedia::native_retrieve_some: fetching " . $self->{_next_url} . "\n" if ($self->{_debug});
    my($response) = $self->http_request('GET', $self->{_next_url});
    $self->{response} = $response;
    if (!$response->is_success) {
    return undef;
    };
    # parse the output
    my($HEADER, $HITS, $TRAILER, $POST_NEXT) = (1..10);
    my($hits_found) = 0;
    my($state) = ($HEADER);
    my($hit, $title, $url, $sDesc) = ();
    foreach ($self->split_lines($response->content())) {
        next if m@^$@; # short circuit for blank lines
     if ($state == $HEADER && m@metasearch results.*?of\s(\d+)@i) { 
        $self->approximate_result_count($1);
        print STDERR "PARSE(HEADER->HITS-1): $_\n" if ($self->{_debug} >= 2);
        $state = $HITS;
   } elsif ($state == $HITS && m@<dt><a href=.*?,http%3A%2F%2F(.*?)\">(.*)</font></dt><dd><.*?>(.*)<br>@i) { 
        my ($url, $title, $sDesc) = ($1,$2,$3);
        print STDERR "PARSE(HITS->HITS): $_\n" if ($self->{_debug} >= 2);
        my($hit) = new WWW::SearchResult;
        $url = 'http://'. $url;        
        $title =~ s/<b>//g;
        $hit->add_url(&decodeURL($url));
        $hit->title($title);
        $hit->description($sDesc);
        $hits_found++;
        push(@{$self->{cache}}, $hit);
        $state = $HITS;
        # Make sure we don't miss anything
   } elsif ($state == $HITS && m@<dt><a href=.*?,http%3A%2F%2F(.*?)\">(.*)</font></dd><br>@i) { 
        my ($url, $title, $sDesc) = ($1,$2,$3);
        $sDesc .= $3;
        print STDERR "PARSE(HITS->HITS): $_\n" if ($self->{_debug} >= 2);
        my($hit) = new WWW::SearchResult;
        $url = 'http://'. $url; 
        $title =~ s/<b>//g;
        $hit->add_url(&decodeURL($url));
        $hit->title($title);
        $hit->description($sDesc);
        $hits_found++;
        push(@{$self->{cache}}, $hit);
        $state = $HITS;
    } elsif ($state == $HITS && (m@</font></dl>@i)) {
        print STDERR "PARSE(HITS->TRAILER): $_\n\n" if ($self->{_debug} >= 2);
        $state = $TRAILER;
    } elsif ($state == $TRAILER && m@<a href="([^"]+)">next\<\/a\>(.*)@i) { 
        my($relative_url) = $1;
        $self->{_next_url} = new URI::URL($relative_url, $self->{_base_url});
        print STDERR "PARSE(TRAILER->POST_NEXT): $_\n\n" if ($self->{_debug} >= 2);
        $state = $POST_NEXT;
        } else {
        print STDERR "PARSE: read:\"$_\"\n" if ($self->{_debug} >= 2);
        };
        };
        if ($state != $POST_NEXT) {
        # no "next" tag
    if (defined($hit)) {
        push(@{$self->{cache}}, $hit);
        };
        $self->{_next_url} = undef;
        };
        # sleep so as to not overload server
        $self->user_agent_delay if (defined($self->{_next_url}));
        return $hits_found;
        }
# private
sub decodeURL {
     # rolled my own unescape stuff 
     my $url = shift;
           $url=~ s/%2E/./g;
           $url=~ s/%2F/\//g;
           $url=~ s/%3A/:/g;
           $url=~ s/%2D/-/g;
           $url=~ s/%5F/_/g;
           $url=~ s/%7E/~/g;
     return $url;
        } 

1;

__END__