| Plucene-SearchEngine documentation | Contained in the Plucene-SearchEngine distribution. |
Plucene::SearchEngine::Index::HTML - Backend for simply parsing HTML
This backend analysis a HTML file for the following Plucene fields:
The text part of the HTML
A list of links in the HTML
Additionally, any META tags are turned into Plucene fields.
| Plucene-SearchEngine documentation | Contained in the Plucene-SearchEngine distribution. |
package Plucene::SearchEngine::Index::HTML; use base 'Plucene::SearchEngine::Index::Base'; use HTML::TreeBuilder; __PACKAGE__->register_handler("text/html", ".html");
sub gather_data_from_file { my ($self, $filename) = @_; my $tree = HTML::TreeBuilder->new; $tree->parse_file($filename); for($tree->look_down(_tag => "meta")) { next if $_->attr("http-equiv"); next unless $_->attr("value"); $self->add_data($_->attr("name"), "Text", $_->attr("value")); } for (@{$tree->extract_links("a")}) { $self->add_data("link", "Text", $_->[0]); } $self->add_data("text", "UnStored", $tree->as_trimmed_text); return $self; } 1;