# /usr/local/CPAN/AxKit-XSP-Wiki/AxKit/XSP/Wiki/Indexer.pm
# $Id: Indexer.pm,v 1.2 2003/02/02 21:20:46 matt Exp $
package AxKit::XSP::Wiki::Indexer;
use strict;
use XML::SAX::Base;
use vars qw($VERSION @ISA);
$VERSION = '1.00';
@ISA = qw(XML::SAX::Base);
# Constructor.
#
# Named arguments (both required; a false value is rejected):
#   DB     - database handle; must provide prepare()
#   PageId - id of the page whose index is being (re)built
#
# Prepares the four statements used while indexing and initialises the
# per-document state: Words (word => occurrence count) and DocSize.
# Dies with "DB argument required" / "PageId argument required" when the
# corresponding argument is missing.
sub new {
    my ($class, %opts) = @_;

    my $db      = $opts{DB}     || die "DB argument required";
    my $page_id = $opts{PageId} || die "PageId argument required";

    my $self = bless { DB => $db, PageId => $page_id }, $class;

    # Slot name => SQL, kept as an ordered list so the statements are
    # prepared in a fixed sequence.
    my @statements = (
        [ InsertCTI  => "INSERT INTO ContentIndex (page_id, word_id, value) VALUES (?, ?, ?)" ],
        [ InsertWord => "INSERT INTO Word (word) VALUES (?)" ],
        [ FindWord   => "SELECT id FROM Word WHERE word = ?" ],
        [ DeleteCTI  => "DELETE FROM ContentIndex WHERE page_id = ?" ],
    );
    for my $spec (@statements) {
        my ($slot, $sql) = @{$spec};
        $self->{$slot} = $db->prepare($sql);
    }

    # insert_word() expects this INSERT to fail for words that are already
    # stored, so keep those failures out of the error log.
    $self->{InsertWord}->{PrintError} = 0;

    $self->{Words}   = {};
    $self->{DocSize} = 0;

    return $self;
}
# SAX end_document() handler: replace the stored index for this page with
# the word counts accumulated in $self->{Words}, then commit.
#
# For every collected word the word id is obtained via insert_word() and a
# ContentIndex row (page_id, word_id, count) is inserted. Words for which
# no id could be determined are skipped.
#
# Returns whatever $self->{DB}->commit returns.
#
# NOTE(review): nothing here rolls back if a statement dies part-way
# through; that is left to the caller, which may own a wider transaction.
sub end_document {
    my ($self) = @_;
    my $page_id = $self->{PageId};

    # Delete current index for this page
    $self->{DeleteCTI}->execute($page_id);

    for my $word (keys %{ $self->{Words} }) {
        # Skip only the empty string. A plain truth test would also drop
        # the perfectly valid word "0".
        next unless length $word;

        my $word_id = $self->insert_word($word);
        next unless $word_id;

        warn("Indexing: $self->{PageId}, $word_id, $word\n");
        $self->{InsertCTI}->execute($page_id, $word_id, $self->{Words}{$word});
    }

    return $self->{DB}->commit;
}
# Return the id of $word in the Word table, inserting the word first when
# possible.
#
# The INSERT is tried optimistically. If it succeeds, the id is read with
# the driver's 'last_insert_rowid' function. If it fails, the word is
# looked up with the FindWord statement instead (presumably the failure
# means the word already exists -- this assumes a uniqueness constraint on
# Word.word; confirm against the schema).
#
# Returns the id, or undef when the word could be neither inserted nor
# found.
sub insert_word {
    my ($self, $word) = @_;

    # execute() reports failure either by dying (RaiseError on) or by
    # returning false (RaiseError off). Both must lead to the lookup below;
    # otherwise a failed INSERT would hand back the rowid of some earlier,
    # unrelated insert.
    my $word_id = eval {
        $self->{InsertWord}->execute($word)
            ? $self->{DB}->func('last_insert_rowid')
            : undef;
    };
    return $word_id if defined $word_id;

    my $find = $self->{FindWord};
    $find->execute($word);
    my $row = $find->fetch;
    # Copy the value out before finishing: fetch() may reuse its row buffer.
    my $found_id = $row ? $row->[0] : undef;
    # Only one row is wanted, so release the cursor straight away.
    $find->finish;

    return $found_id;
}
# SAX characters() handler: split the text chunk in $node->{Data} on
# whitespace and count every word in $self->{Words}. Words are lower-cased
# and have leading/trailing non-word characters stripped; tokens that are
# empty after stripping (e.g. "--") are ignored.
#
# NB: This implementation assumes SAX parsers that don't break mid-word.
# (A buffering filter could be used if this is a problem.)
sub characters {
    my ($self, $node) = @_;

    # Scan a private copy so the /g match position is never left behind on
    # the caller's event data (which would make a second pass over the same
    # node find nothing).
    my $text = $node->{Data};
    return unless defined $text;

    while ($text =~ /(\S+)/g) {
        my $word = $1;
        $word =~ s/\W+$//; # strip trailing non-word chars
        $word =~ s/^\W+//; # strip leading non-word chars
        next unless length $word;
        $self->{Words}{lc($word)}++;
    }

    return;
}
1;