WWW::Search::Simple - class for searching any web site


WWW-Search documentation Contained in the WWW-Search distribution.

Index


Code Index:

NAME

Top

WWW::Search::Simple - class for searching any web site

SYNOPSIS

Top

    require WWW::Search;
    $search = new WWW::Search('Simple');

DESCRIPTION

Top

This class is a specialization of WWW::Search for simple web based search indices. It extracts all links from a given page.

This class exports no public interface; all interaction should be done through WWW::Search objects.

Note that this module will probably get a lot of false hits.

AUTHOR

Top

WWW::Search::Simple is written by Paul Lindner, <lindner@itu.int>

COPYRIGHT

Top


WWW-Search documentation Contained in the WWW-Search distribution.

#!/usr/local/bin/perl

# contributed from Paul Lindner <lindner@itu.int>



package WWW::Search::Simple;

use strict;
use warnings;

use base 'WWW::Search';

use Carp ();
require WWW::SearchResult;

my($debug) = 0;

#private
sub native_setup_search {
    my($self, $native_query, $native_opt) = @_;
    my($native_url);
    my($default_native_url) =
	"http://www.itu.int/cgi-bin/SFgate?application=itu&database=local//usr/local/wais/WWW/www-pages&listenv=table&httppath=/usr/local/www-data/&httpprefix=/&tie=and&maxhits=%n&text=%s";
    if (defined($native_opt)) {
	#print "Got " . join(' ', keys(%$native_opt)) . "\n";
	# Process options..
	# Substitute query terms for %s...

	if ($self->{'search_url'} && $native_opt->{'search_args'}) {
	    $native_url = $native_opt->{'search_url'} . "?" . $native_opt->{'search_args'};
	}
    } 

    
    $native_url = $default_native_url if (!$native_url);

    $native_url =~ s/%s/$native_query/g; # Substitute search terms...

    $self->user_agent();
    $self->{_next_to_retrieve} = 0;
    $self->{_base_url} = $self->{_next_url} = $native_url;
}


# private
sub native_retrieve_some
{
    my ($self) = @_;
    my ($hit)  = ();
    my ($hits_found) = 0;

    # fast exit if already done
    return undef if (!defined($self->{_next_url}));

    # get some
    print "GET " . $self->{_next_url} . "\n" if ($debug);
    my($response) = $self->http_request($self->{search_method}, 
					$self->{_next_url});

    $self->{response} = $response;
    if (!$response->is_success) {
	print "Some problem\n" if ($Debug);
	return undef;
    };
    # parse the output
    use HTML::TreeBuilder;


    my $score = 800;
    my $results = $response->content();

    my($h) = new HTML::TreeBuilder;
    $h->parse($results);


    for (@{ $h->extract_links(qw(a)) }) {
	my($link, $linkelem) = @$_;
	
	my($linkobj)       = new URI::URL $link, $self->{_next_url};
	print "Fixing $link\n" if ($Debug);
	
	my($hit) = new WWW::SearchResult;
	$hit->add_url($linkobj->abs->as_string());
	$hit->title(join(' ',@{$linkelem->content}));
	$hit->score($score);
	$hit->normalized_score($score);

	if (!($srchitem{'title'} =~ /HASH\(0x/)) {
	    $hits_found++;
	    push(@{$self->{cache}}, $hit);
	}
	
	
	#$srchitem{'origin'} = $self->{'myurl'};
	#$srchitem{'index'}  = $self->{'index'};
	
	$score = int ($score * .95);
    }
    $self->approximate_result_count($hits_found);
    $self->{_next_url} = undef;
    return($hits_found);
}


1;