| SWISH-Prog documentation | Contained in the SWISH-Prog distribution. |
SWISH::Prog::Aggregator::FS - crawl a filesystem
use SWISH::Prog::Aggregator::FS;
my $fs = SWISH::Prog::Aggregator::FS->new(
indexer => SWISH::Prog::Indexer->new
);
$fs->indexer->start;
$fs->crawl( $path );
$fs->indexer->finish;
SWISH::Prog::Aggregator::FS is a filesystem aggregator implementation of the SWISH::Prog::Aggregator API. It is similar to the DirTree.pl script in the Swish-e 2.4 distribution.
See SWISH::Prog::Aggregator.
Implements the base init() method called by new().
Check full_path before fetch()ing it.
Returns 0 if full_path should be skipped.
Returns file extension of full_path if full_path should be processed.
Called by find() for all directories. You can control the recursion into directory via the config() params
TODO
Returns a doc_class() instance representing file_path.
Crawl the filesystem recursively within paths_or_files, processing each document specified by the config().
Peter Karman, <perl@peknet.com>
Please report any bugs or feature requests to bug-swish-prog at rt.cpan.org, or through
the web interface at http://rt.cpan.org/NoAuth/ReportBug.html?Queue=SWISH-Prog.
I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.
You can find documentation for this module with the perldoc command.
perldoc SWISH::Prog
You can also look for information at:
Copyright 2008-2009 by Peter Karman
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
| SWISH-Prog documentation | Contained in the SWISH-Prog distribution. |
package SWISH::Prog::Aggregator::FS; use strict; use warnings; use base qw( SWISH::Prog::Aggregator ); use Carp; use File::Slurp; use File::Find; use File::Rules; use Data::Dump qw( dump ); our $VERSION = '0.51';
sub init { my $self = shift; $self->SUPER::init(@_); # read from $self->config and set some flags # TODO FileRules, FileMatch # create .ext regex to match in file_ok() if ( $self->config->IndexOnly ) { my $re = join( '|', grep {s/^\.//} split( m/\s+/, $self->config->IndexOnly ) ); $self->{_ext_re} = qr{\.($re)}io; } else { $self->{_ext_re} = $SWISH::Prog::Utils::ExtRE; } # if running with SWISH::3, # instantiate for the slurp advantage eval "use SWISH::3 0.09"; if ( !$@ ) { $self->{_swish3} = SWISH::3->new; } }
sub file_ok { my $self = shift; my $full_path = shift; my $stat = shift; $self->debug and warn "checking file $full_path\n"; my ( $path, $file, $ext ) = SWISH::Prog::Utils->path_parts( $full_path, $self->{_ext_re} ); $self->debug and warn "path=$path file=$file ext=$ext\n"; return 0 unless $ext; return 0 if $full_path =~ m[\\/]!; # TODO configure this. return 0 if $file =~ m/^\./; #carp "parsed file: $file\npath: $path\next: $ext"; $stat ||= [ stat($full_path) ]; return 0 unless -r _; return 0 if -d _; if ( $self->ok_if_newer_than and $self->ok_if_newer_than >= $stat->[9] ) { return 0; } return 0 if ( $self->_apply_file_rules($full_path) && !$self->_apply_file_match($full_path) ); $self->debug and warn " $full_path -> ok\n"; if ( $self->verbose & 4 ) { local $| = 1; # don't buffer print "crawling $full_path\n"; } return $ext; }
sub dir_ok { my $self = shift; my $dir = shift; my $stat = shift || [ stat($dir) ]; $self->debug and warn "checking dir $dir\n"; return 0 unless -d _; return 0 if $dir =~ m!/\.!; return 0 if $dir =~ m/^\.[^\.]/; # could be ../foo return 0 if $dir =~ m!/(\.svn|RCS)/!; return 0 if ( $self->_apply_file_rules($dir) && !$self->_apply_file_match($dir) ); $self->debug and warn " $dir -> ok\n"; if ( $self->verbose & 2 ) { local $| = 1; # don't buffer print "crawling $dir\n"; } 1; # TODO esp RecursionDepth } sub _apply_file_rules { my ( $self, $file ) = @_; if ( !exists $self->{_file_rules} && $self->config->FileRules ) { # cache obj $self->{_file_rules} = File::Rules->new( $self->config->FileRules ); } if ( exists $self->{_file_rules} ) { $self->debug and warn "applying FileRules"; my $match = $self->{_file_rules}->match($file); return $match; } return 0; # no rules } sub _apply_file_match { my ( $self, $file ) = @_; # TODO return 0; # no-op for now }
sub get_doc { my $self = shift; my $url = shift or croak "file path required"; my ( $stat, $ext ) = @_; my $buf; # the SWISH::3->slurp is about 50% faster # but obviously only available if SWISH::3 is loaded. # It also handles .gz files transparently based on .gz # extension, so must remove the extension to avoid # double-unzip via SWISH::Filter. # NOTE we always read in binary (raw) mode in case # the file is compressed, binary, etc. if ( $self->{_swish3} ) { #warn "$url using swish3->slurp\n"; eval { $buf = $self->{_swish3}->slurp( $url, 1 ); $url =~ s/\.gz$//; # post-slurp, in case it failed. }; } else { eval { $buf = read_file( $url, binmode => ':raw' ) }; } if ($@) { carp "unable to read $url - skipping"; return; } $stat ||= [ stat($url) ]; # TODO SWISH::3 has this function too. # might be faster since no OO overhead. my $type = SWISH::Prog::Utils->mime_type( $url, $ext ); if ( $self->ok_if_newer_than and $self->ok_if_newer_than >= $stat->[9] ) { warn "skipping $url ... too old\n"; return; } return $self->doc_class->new( url => $url, modtime => $stat->[9], content => $buf, type => $type, size => $stat->[7], debug => $self->debug ); } sub _do_file { my $self = shift; my $file = shift; if ( my $ext = $self->file_ok($file) ) { my $doc = $self->get_doc( $file, [ stat(_) ], $ext ); $self->swish_filter($doc); if ( $self->test_mode ) { warn join( ' ', $doc->url, $doc->type ) . "\n"; } else { $self->{indexer}->process($doc); } $self->_increment_count; } else { $self->debug and warn "skipping file $file\n"; if ( $self->verbose & 4 ) { local $| = 1; print "skipping $file\n"; } } } # # the basic wanted() code here based on Bill Moseley's DirTree.pl, # part of the Swish-e 2.4 distrib.
sub crawl { my $self = shift; my @paths = @_; my @files = grep { !-d } @paths; my @dirs = grep {-d} @paths; for my $f (@files) { $self->_do_file($f); } # TODO set some flags here for filtering out files/dirs # based on $self->indexer->config. if (@dirs) { find( { wanted => sub { # canonpath cleans up any leading . my $path = File::Spec->canonpath($File::Find::name); if (-d) { unless ( $self->dir_ok( $path, [ stat(_) ] ) ) { if ( $self->verbose & 2 ) { local $| = 1; print "skipping $path\n"; } $File::Find::prune = 1; return; } #warn "-d $path\n"; return; } else { #warn "!-d $path\n"; } $self->_do_file($path); }, no_chdir => 1, follow => $self->config->FollowSymLinks, }, @dirs ); } return $self->{count}; } 1; __END__