Plucene::SearchEngine::Index::HTML - Backend for simply parsing HTML


Plucene-SearchEngine documentation Contained in the Plucene-SearchEngine distribution.

Index


Code Index:

NAME

Top

Plucene::SearchEngine::Index::HTML - Backend for simply parsing HTML

DESCRIPTION

Top

This backend analyses an HTML file for the following Plucene fields:

text

The text part of the HTML

link

A list of links in the HTML

Additionally, any META tags are turned into Plucene fields.


Plucene-SearchEngine documentation Contained in the Plucene-SearchEngine distribution.

# Indexing backend that extracts Plucene fields from HTML files.
package Plucene::SearchEngine::Index::HTML;
# Inherit from the common indexing backend base class.
use base 'Plucene::SearchEngine::Index::Base';
# HTML parser used by gather_data_from_file to build a tree of the document.
use HTML::TreeBuilder;
# Class-method call made at load time with the MIME type "text/html" and the
# extension ".html"; presumably this tells the base class to dispatch such
# files to this backend (register_handler is defined outside this file --
# confirm against Plucene::SearchEngine::Index::Base).
__PACKAGE__->register_handler("text/html", ".html");

# Parse the HTML file at $filename and record its contents as Plucene fields
# on $self via add_data:
#
#   * one "Text" field per <meta> tag that has a name and a non-empty value
#     (tags carrying an http-equiv attribute are skipped);
#   * one "link" "Text" field per link found in <a> elements;
#   * one "text" "UnStored" field holding the trimmed text of the document.
#
# Parameters:
#   $self     - the backend object; must provide add_data(name, type, value).
#   $filename - path of the HTML file to parse.
#
# Returns $self.
#
# A file that cannot be opened produces a warning and is indexed as an empty
# document (the previous behaviour, minus the silence).
sub gather_data_from_file {
    my ($self, $filename) = @_;
    my $tree = HTML::TreeBuilder->new;

    # parse_file returns undef (with $! set) when the file cannot be opened.
    defined $tree->parse_file($filename)
        or warn "Cannot parse HTML file '$filename': $!\n";

    # Pull everything we need out of the tree first, so that the tree can be
    # released before any add_data call has a chance to fail.
    my @meta_fields;
    for my $meta ($tree->look_down(_tag => "meta")) {
        next if $meta->attr("http-equiv");

        # A META tag without a name cannot be mapped onto a field.
        my $name = $meta->attr("name");
        next unless defined $name and length $name;

        # HTML stores the META payload in "content"; "value" is still
        # honoured as a fallback for documents written for the old behaviour.
        my $value = $meta->attr("content");
        $value = $meta->attr("value")
            unless defined $value and length $value;
        next unless defined $value and length $value;

        push @meta_fields, [ $name, $value ];
    }

    # extract_links returns an array ref of [link, element, attr, tag]
    # entries; only the link itself is indexed.
    my @links = map { $_->[0] } @{ $tree->extract_links("a") };
    my $text  = $tree->as_trimmed_text;

    # HTML::TreeBuilder trees must be deleted explicitly to free their
    # (potentially circular) node structure.
    $tree->delete;

    for my $field (@meta_fields) {
        $self->add_data($field->[0], "Text", $field->[1]);
    }
    for my $link (@links) {
        $self->add_data("link", "Text", $link);
    }
    $self->add_data("text", "UnStored", $text);
    return $self;
}

1;