SWISH::Prog - information retrieval application framework


SWISH-Prog documentation Contained in the SWISH-Prog distribution.

Index


Code Index:

NAME

Top

SWISH::Prog - information retrieval application framework

SYNOPSIS

Top

  use SWISH::Prog;
  my $program = SWISH::Prog->new(
    invindex    => 'path/to/myindex',
    aggregator  => 'fs',
    indexer     => 'native',
    config      => 'some/swish/config/file',
    filter      => sub { print $_[0]->url . "\n" },
  );

  $program->run('some/dir');

  print $program->count . " documents indexed\n";




DESCRIPTION

Top

NOTE: As of version 0.20 this API has been completely redesigned from previous versions.

SWISH::Prog is a full-text search framework based on Swish-e. SWISH::Prog handles document and data aggregation and indexing.

The name "SWISH::Prog" comes from the Swish-e -S prog feature. "prog" is short for "program". SWISH::Prog makes it easy to write indexing and search programs.

The API is a work in progress and subject to change.

METHODS

Top

All of the following methods may be overridden when subclassing this module.

init

Overrides base SWISH::Prog::Class init() method.

aggregator( $swish_prog_aggregator )

Get the SWISH::Prog::Aggregator object. You should set this in new().

run( collection )

Execute the program. This is an alias for index().

index( collection )

Add items in collection to the invindex().

config

Returns the aggregator's config() object.

invindex

Returns the indexer's invindex.

indexer

Returns the indexer.

count

Returns the indexer's count. NOTE: This is the number of documents actually indexed; it does not include documents that the aggregator considered and then discarded. If you want the number of documents the aggregator looked at, regardless of whether they were indexed, use the aggregator's count() method.

test_mode

Dry-run mode: prints information on stderr but does not build the index. This flag is set in new() and passed to the indexer and aggregator.

AUTHOR

Top

Peter Karman, <perl@peknet.com>

BUGS

Top

Please report any bugs or feature requests to bug-swish-prog at rt.cpan.org, or through the web interface at http://rt.cpan.org/NoAuth/ReportBug.html?Queue=SWISH-Prog. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.

SUPPORT

Top

You can find documentation for this module with the perldoc command.

    perldoc SWISH::Prog




You can also look for information at:

* Mailing list

http://lists.swish-e.org/listinfo/users

* RT: CPAN's request tracker

http://rt.cpan.org/NoAuth/Bugs.html?Dist=SWISH-Prog

* AnnoCPAN: Annotated CPAN documentation

http://annocpan.org/dist/SWISH-Prog

* CPAN Ratings

http://cpanratings.perl.org/d/SWISH-Prog

* Search CPAN

http://search.cpan.org/dist/SWISH-Prog/

COPYRIGHT AND LICENSE

Top

SEE ALSO

Top

http://swish-e.org/

SWISH::Prog::Doc, SWISH::Prog::Headers, SWISH::Prog::Indexer, SWISH::Prog::InvIndex, SWISH::Prog::Utils, SWISH::Prog::Aggregator, SWISH::Prog::Config


SWISH-Prog documentation Contained in the SWISH-Prog distribution.
package SWISH::Prog;
use 5.008003;
use strict;
use warnings;
use base qw( SWISH::Prog::Class );
use Carp;
use Data::Dump qw( dump );
use Scalar::Util qw( blessed );
use SWISH::Prog::Config;
use SWISH::Prog::InvIndex;

our $VERSION = '0.51';

# Create accessor methods for these two attributes. mk_accessors() is not
# defined in this file; it is inherited through the base class.
__PACKAGE__->mk_accessors(qw( aggregator test_mode ));

# Object layout: each SWISH::Prog object has an aggregator, the aggregator
# has an indexer, and the indexer has an invindex.

# Short aggregator names accepted in new(). init() maps a short name to the
# class listed here, then loads that class and calls ->new on it.
my %ashort = (
    fs     => 'SWISH::Prog::Aggregator::FS',
    mail   => 'SWISH::Prog::Aggregator::Mail',
    mailfs => 'SWISH::Prog::Aggregator::MailFS',
    dbi    => 'SWISH::Prog::Aggregator::DBI',
    spider => 'SWISH::Prog::Aggregator::Spider',
    object => 'SWISH::Prog::Aggregator::Object',
);
# Short indexer names, resolved by init() in the same way as %ashort.
# Note that 'dbi' is a key in both tables and names a different class
# in each.
my %ishort = (
    native => 'SWISH::Prog::Native::Indexer',
    xapian => 'SWISH::Prog::Xapian::Indexer',
    ks     => 'SWISH::Prog::KSx::Indexer',
    lucy   => 'SWISH::Prog::Lucy::Indexer',
    dbi    => 'SWISH::Prog::DBI::Indexer',
);

# Private helper: load the class named $class and return the class name.
#
# $role ('indexer' or 'aggregator') is used only to build error messages.
# Croaks if no name is given, if the name is not a syntactically valid
# package name, or if the class cannot be loaded.
sub _require_plugin_class {
    my ( $class, $role ) = @_;

    if ( !defined $class or !length $class ) {
        croak "invalid $role: no class name given";
    }

    # The name can come straight from the caller of new(), so it is never
    # interpolated into a string eval. Validate it, then require it by file
    # name, which loads exactly the same file as "require Some::Class".
    if ( $class !~ m/\A[A-Za-z_][A-Za-z0-9_]*(?:::[A-Za-z0-9_]+)*\z/ ) {
        croak "invalid $role $class: not a valid class name";
    }
    ( my $file = "$class.pm" ) =~ s{::}{/}g;
    if ( !eval { require $file; 1 } ) {
        croak "invalid $role $class: $@";
    }

    return $class;
}

# Overrides the base class init().
#
# Accepts the same key/value pairs as new(). The 'filter' key is removed
# before the rest are handed to SUPER::init(); if present it is installed on
# the aggregator with set_filter().
#
# 'indexer' and 'aggregator' may each be an object, a full class name, or a
# short name from %ishort / %ashort. They default to 'native' and 'fs'.
#
# Returns nothing in search mode (a query but no indexer and no aggregator),
# otherwise returns $self. Croaks if a class cannot be loaded or if a
# supplied object is not derived from the expected base class.
sub init {
    my $self   = shift;
    my %arg    = @_;
    my $filter = delete $arg{filter};    # no such method. just convenience.
    $self->SUPER::init(%arg);

    # search mode requires only invindex
    if ( $self->{query} && !$self->{indexer} && !$self->{aggregator} ) {
        return;
    }

    my $config     = $self->{config};                # ok if undef
    my $indexer    = $self->{indexer} || 'native';
    my $aggregator = $self->{aggregator} || 'fs';

    # An aggregator object that is already built carries its own indexer and
    # config, and those take precedence over the values passed to new().
    # Check its type first, before calling any method on it, so that a wrong
    # object produces the intended error message.
    if ( blessed($aggregator) ) {
        if ( !$aggregator->isa('SWISH::Prog::Aggregator') ) {
            croak
                "$aggregator is not a SWISH::Prog::Aggregator-derived object";
        }
        $indexer = $aggregator->indexer;
        $config  = $aggregator->config;
    }

    if ( !blessed($indexer) ) {

        # $indexer is undef here when a supplied aggregator has no indexer;
        # the helper below reports that case.
        if ( defined $indexer and exists $ishort{$indexer} ) {
            $indexer = $ishort{$indexer};
        }

        $self->debug and warn "creating indexer: $indexer";
        _require_plugin_class( $indexer, 'indexer' );
        $indexer = $indexer->new(
            debug     => $self->debug,
            invindex  => $self->{invindex},    # may be undef
            verbose   => $self->verbose,
            config    => $config,              # may be undef
            test_mode => $self->test_mode,
        );
    }
    elsif ( !$indexer->isa('SWISH::Prog::Indexer') ) {
        croak "$indexer is not a SWISH::Prog::Indexer-derived object";
    }

    if ( !blessed($aggregator) ) {

        if ( exists $ashort{$aggregator} ) {
            $aggregator = $ashort{$aggregator};
        }

        $self->debug and warn "creating aggregator: $aggregator";
        _require_plugin_class( $aggregator, 'aggregator' );
        $aggregator = $aggregator->new(
            indexer   => $indexer,
            debug     => $self->debug,
            verbose   => $self->verbose,
            test_mode => $self->test_mode,
        );
    }

    if ($filter) {
        $aggregator->set_filter($filter);
    }

    $self->{aggregator} = $aggregator;
    $self->{indexer}    = $indexer;

    # Pass our test_mode down only where the object has no setting of its own.
    $indexer->{test_mode} = $self->{test_mode}
        unless exists $indexer->{test_mode};
    $aggregator->{test_mode} = $self->{test_mode}
        unless exists $aggregator->{test_mode};

    $self->debug and carp dump $self;

    return $self;
}

# run() is another name for index().
*run = \&index;

# Index a collection.
#
# All arguments after the invocant are passed untouched to the aggregator's
# crawl(). The aggregator's indexer has start() called before the crawl and
# finish() called after it.
#
# Returns the indexer's count(). Croaks when no aggregator is set or when the
# aggregator is not a SWISH::Prog::Aggregator.
sub index {
    my $self = shift;

    my $aggregator = $self->aggregator;
    croak 'aggregator required' unless $aggregator;

    if ( !$aggregator->isa('SWISH::Prog::Aggregator') ) {
        croak "aggregator is not a SWISH::Prog::Aggregator";
    }

    # The indexer is looked up through the aggregator at every step.
    $aggregator->indexer->start;
    $aggregator->crawl(@_);
    $aggregator->indexer->finish;

    return $aggregator->indexer->count;
}

# Returns a config object, or undef if none can be found.
#
# The aggregator's config is preferred. Without an aggregator, the config of
# the indexer stored on this object is returned, if there is one.
#
# The stored indexer is read directly from the object rather than through
# indexer(): that method goes through the aggregator, so calling it on this
# path (where there is no aggregator) would die instead of returning.
sub config {
    my $self = shift;

    if ( my $aggregator = $self->aggregator ) {
        return $aggregator->config;
    }

    my $indexer = $self->{indexer};
    if ( blessed($indexer) ) {
        return $indexer->config;
    }

    # An explicit undef is kept (rather than a bare return) so callers that
    # use the result inside a key/value list keep getting exactly one value.
    return undef;
}

# Returns the invindex.
#
# With an aggregator set, this is the invindex of the indexer. Otherwise the
# value stored under 'invindex' is used: an object is returned as is, and
# anything else is treated as a path and wrapped in a new
# SWISH::Prog::InvIndex.
sub invindex {
    my $self = shift;

    return $self->indexer->invindex if $self->aggregator;

    my $stored = $self->{invindex};
    return $stored if blessed($stored);

    return SWISH::Prog::InvIndex->new( path => $self->{invindex} );
}

# Returns the indexer.
#
# When an aggregator is set, its indexer is returned. Without an aggregator
# (init() returns early in search mode, before one is created) this falls
# back to the value stored in the object's own 'indexer' slot, which may be
# undef, instead of dying on a method call against an undefined aggregator.
sub indexer {
    my $self       = shift;
    my $aggregator = $self->aggregator;

    return $aggregator->indexer if $aggregator;
    return $self->{indexer};
}

# Returns the count reported by the indexer, i.e. whatever indexer()->count
# gives back.
sub count {
    my $self    = shift;
    my $indexer = $self->indexer;
    return $indexer->count;
}

1;

__END__