SWISH::Prog::Aggregator::FS - crawl a filesystem


SWISH-Prog documentation Contained in the SWISH-Prog distribution.

Index


Code Index:

NAME

Top

SWISH::Prog::Aggregator::FS - crawl a filesystem

SYNOPSIS

Top

 use SWISH::Prog::Aggregator::FS;
 my $fs = SWISH::Prog::Aggregator::FS->new(
        indexer => SWISH::Prog::Indexer->new
    );

 $fs->indexer->start;
 $fs->crawl( $path );
 $fs->indexer->finish;

DESCRIPTION

Top

SWISH::Prog::Aggregator::FS is a filesystem aggregator implementation of the SWISH::Prog::Aggregator API. It is similar to the DirTree.pl script in the Swish-e 2.4 distribution.

METHODS

Top

See SWISH::Prog::Aggregator.

init

Implements the base init() method called by new().

file_ok( full_path )

Check full_path before fetch()ing it.

Returns 0 if full_path should be skipped.

Returns file extension of full_path if full_path should be processed.

dir_ok( directory )

Called by find() for all directories. You can control the recursion into directory via the config() params

 TODO

get_doc( file_path [, stat, ext ] )

Returns a doc_class() instance representing file_path.

crawl( paths_or_files )

Crawl the filesystem recursively within paths_or_files, processing each document specified by the config().

AUTHOR

Top

Peter Karman, <perl@peknet.com>

BUGS

Top

Please report any bugs or feature requests to bug-swish-prog at rt.cpan.org, or through the web interface at http://rt.cpan.org/NoAuth/ReportBug.html?Queue=SWISH-Prog. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.

SUPPORT

Top

You can find documentation for this module with the perldoc command.

    perldoc SWISH::Prog




You can also look for information at:

* Mailing list

http://lists.swish-e.org/listinfo/users

* RT: CPAN's request tracker

http://rt.cpan.org/NoAuth/Bugs.html?Dist=SWISH-Prog

* AnnoCPAN: Annotated CPAN documentation

http://annocpan.org/dist/SWISH-Prog

* CPAN Ratings

http://cpanratings.perl.org/d/SWISH-Prog

* Search CPAN

http://search.cpan.org/dist/SWISH-Prog/

COPYRIGHT AND LICENSE

Top

SEE ALSO

Top

http://swish-e.org/


SWISH-Prog documentation Contained in the SWISH-Prog distribution.
package SWISH::Prog::Aggregator::FS;
use strict;
use warnings;
use base qw( SWISH::Prog::Aggregator );

use Carp;
use File::Slurp;
use File::Find;
use File::Rules;
use Data::Dump qw( dump );

our $VERSION = '0.51';

sub init {
    my $self = shift;
    $self->SUPER::init(@_);

    # read from $self->config and set some flags
    # TODO FileRules, FileMatch

    # create .ext regex to match in file_ok()
    if ( $self->config->IndexOnly ) {
        my $re = join( '|',
            grep {s/^\.//} split( m/\s+/, $self->config->IndexOnly ) );
        $self->{_ext_re} = qr{\.($re)}io;
    }
    else {
        $self->{_ext_re} = $SWISH::Prog::Utils::ExtRE;
    }

    # if running with SWISH::3,
    # instantiate for the slurp advantage
    eval "use SWISH::3 0.09";
    if ( !$@ ) {
        $self->{_swish3} = SWISH::3->new;
    }

}

sub file_ok {
    my $self      = shift;
    my $full_path = shift;
    my $stat      = shift;

    $self->debug and warn "checking file $full_path\n";

    my ( $path, $file, $ext )
        = SWISH::Prog::Utils->path_parts( $full_path, $self->{_ext_re} );

    $self->debug and warn "path=$path file=$file ext=$ext\n";

    return 0 unless $ext;
    return 0 if $full_path =~ m![\\/](\.svn|RCS)[\\/]!; # TODO configure this.
    return 0 if $file =~ m/^\./;

    #carp "parsed file: $file\npath: $path\next: $ext";

    $stat ||= [ stat($full_path) ];
    return 0 unless -r _;
    return 0 if -d _;
    if (    $self->ok_if_newer_than
        and $self->ok_if_newer_than >= $stat->[9] )
    {
        return 0;
    }
    return 0
        if ( $self->_apply_file_rules($full_path)
        && !$self->_apply_file_match($full_path) );

    $self->debug and warn "  $full_path -> ok\n";
    if ( $self->verbose & 4 ) {
        local $| = 1;    # don't buffer
        print "crawling $full_path\n";
    }

    return $ext;
}

sub dir_ok {
    my $self = shift;
    my $dir  = shift;
    my $stat = shift || [ stat($dir) ];

    $self->debug and warn "checking dir $dir\n";

    return 0 unless -d _;
    return 0 if $dir =~ m!/\.!;
    return 0 if $dir =~ m/^\.[^\.]/;        # could be ../foo
    return 0 if $dir =~ m!/(\.svn|RCS)/!;
    return 0
        if ( $self->_apply_file_rules($dir)
        && !$self->_apply_file_match($dir) );

    $self->debug and warn "  $dir -> ok\n";
    if ( $self->verbose & 2 ) {
        local $| = 1;                       # don't buffer
        print "crawling $dir\n";
    }

    1;                                      # TODO esp RecursionDepth
}

sub _apply_file_rules {
    my ( $self, $file ) = @_;
    if ( !exists $self->{_file_rules} && $self->config->FileRules ) {

        # cache obj
        $self->{_file_rules} = File::Rules->new( $self->config->FileRules );
    }
    if ( exists $self->{_file_rules} ) {
        $self->debug and warn "applying FileRules";
        my $match = $self->{_file_rules}->match($file);
        return $match;
    }
    return 0;    # no rules
}

sub _apply_file_match {
    my ( $self, $file ) = @_;

    # TODO
    return 0;    # no-op for now
}

sub get_doc {
    my $self = shift;
    my $url = shift or croak "file path required";
    my ( $stat, $ext ) = @_;
    my $buf;

    # the SWISH::3->slurp is about 50% faster
    # but obviously only available if SWISH::3 is loaded.
    # It also handles .gz files transparently based on .gz
    # extension, so must remove the extension to avoid
    # double-unzip via SWISH::Filter.

    # NOTE we always read in binary (raw) mode in case
    # the file is compressed, binary, etc.
    if ( $self->{_swish3} ) {

        #warn "$url using swish3->slurp\n";
        eval {
            $buf = $self->{_swish3}->slurp( $url, 1 );
            $url =~ s/\.gz$//;    # post-slurp, in case it failed.
        };
    }
    else {
        eval { $buf = read_file( $url, binmode => ':raw' ) };
    }

    if ($@) {
        carp "unable to read $url - skipping";
        return;
    }

    $stat ||= [ stat($url) ];

    # TODO SWISH::3 has this function too.
    # might be faster since no OO overhead.
    my $type = SWISH::Prog::Utils->mime_type( $url, $ext );

    if (    $self->ok_if_newer_than
        and $self->ok_if_newer_than >= $stat->[9] )
    {
        warn "skipping $url ... too old\n";
        return;
    }

    return $self->doc_class->new(
        url     => $url,
        modtime => $stat->[9],
        content => $buf,
        type    => $type,
        size    => $stat->[7],
        debug   => $self->debug
    );

}

sub _do_file {
    my $self = shift;
    my $file = shift;
    if ( my $ext = $self->file_ok($file) ) {
        my $doc = $self->get_doc( $file, [ stat(_) ], $ext );
        $self->swish_filter($doc);
        if ( $self->test_mode ) {
            warn join( ' ', $doc->url, $doc->type ) . "\n";
        }
        else {
            $self->{indexer}->process($doc);
        }
        $self->_increment_count;
    }
    else {
        $self->debug and warn "skipping file $file\n";
        if ( $self->verbose & 4 ) {
            local $| = 1;
            print "skipping $file\n";
        }
    }
}

#
# the basic wanted() code here based on Bill Moseley's DirTree.pl,
# part of the Swish-e 2.4 distrib.

sub crawl {
    my $self = shift;

    my @paths = @_;

    my @files = grep { !-d } @paths;
    my @dirs  = grep {-d} @paths;

    for my $f (@files) {
        $self->_do_file($f);
    }

    # TODO set some flags here for filtering out files/dirs
    # based on $self->indexer->config.

    if (@dirs) {

        find(
            {   wanted => sub {

                    # canonpath cleans up any leading .
                    my $path = File::Spec->canonpath($File::Find::name);

                    if (-d) {
                        unless ( $self->dir_ok( $path, [ stat(_) ] ) ) {
                            if ( $self->verbose & 2 ) {
                                local $| = 1;
                                print "skipping $path\n";
                            }
                            $File::Find::prune = 1;
                            return;
                        }

                        #warn "-d $path\n";
                        return;
                    }
                    else {

                        #warn "!-d $path\n";
                    }

                    $self->_do_file($path);

                },
                no_chdir => 1,
                follow   => $self->config->FollowSymLinks,

            },
            @dirs
        );
    }

    return $self->{count};
}

1;

__END__