/usr/local/CPAN/KSx-IndexManager/USConManager.pm
# this takes the example application from KinoSearch and shows how it could be
# written using IndexManager. -- hdp, 2007-07-01
use strict;
use warnings;
package USConManager;
use base qw(KSx::IndexManager);
use File::Basename qw(basename);
# Generate a 'base_url' accessor in the 'inherited' accessor group.
# NOTE(review): mk_group_accessors is not defined in this file; presumably it
# is provided through KSx::IndexManager (Class::Accessor::Grouped-style API)
# -- confirm against the parent class.
__PACKAGE__->mk_group_accessors(inherited => qw(base_url));
# Class-level value for base_url; to_doc() joins it with each input file's
# basename to build the document's url field.
__PACKAGE__->base_url('/us_constitution');
# Name of the schema class this manager is configured with.
# NOTE(review): how KSx::IndexManager consumes schema_class is not visible
# here -- verify in the parent class.
__PACKAGE__->schema_class('USConSchema');
# to_doc($filepath)
#
# Expects a filename as input and turns that HTML file into a document hash.
# This is largely a copy of invindexer.plx from the KS example.
#
# Parameters:
#   $self     - invocant; only its base_url accessor is used here
#   $filepath - path of the HTML file to read
#
# Returns a hash reference with three keys:
#   url     - base_url and the file's basename joined with "/"
#   title   - text between <title> and </title>
#   content - text between <div id="bodytext"> and </div><!--bodytext-->,
#             with every <...> tag replaced by a single space
#
# Dies if the file cannot be opened or read, or if either the title or the
# bodytext section cannot be found in it.
sub to_doc {
  my ($self, $filepath) = @_;

  open my $fh, '<', $filepath or die "Can't open $filepath: $!";
  my $raw = do { local $/; <$fh> };
  # In slurp mode an empty file yields '', so undef here means the read itself
  # failed (e.g. $filepath is a directory). Report that I/O error directly
  # instead of falling through to a misleading "couldn't isolate title".
  defined $raw or die "Can't read $filepath: $!";
  # Everything needed is in memory now; release the handle early.
  close $fh;

  # A list assignment evaluated in scalar context yields the number of
  # captures returned, so "or die" fires exactly when the match fails, while
  # an empty-but-present capture still counts as success.
  my ($title) = $raw =~ m#<title>(.*?)</title>#s
    or die "couldn't isolate title in '$filepath'";

  my ($content) = $raw =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
    or die "couldn't isolate bodytext in '$filepath'";
  # Crude tag stripping: each tag becomes one space so that words on either
  # side of a tag do not run together.
  $content =~ s/<.*?>/ /gsm;

  my %doc = (
    url     => join("/", $self->base_url, basename($filepath)),
    title   => $title,
    content => $content,
  );
  return \%doc;
}
1;