Plucene::SearchEngine::Index::PDF - Backend for parsing PDF


Plucene-SearchEngine documentation. Contained in the Plucene-SearchEngine distribution.

Index


Code Index:

NAME

Top

Plucene::SearchEngine::Index::PDF - Backend for parsing PDF

DESCRIPTION

Top

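This backend analyzes a PDF file for its textual content by running the external pdftotext program with the -htmlmeta option, which writes the text and the PDF's metadata to a temporary HTML file. That file is then handed to the HTML backend, which turns the text and any metadata found in the PDF into Plucene fields.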


Plucene-SearchEngine documentation. Contained in the Plucene-SearchEngine distribution.

package Plucene::SearchEngine::Index::PDF;

# Strictures are lexically scoped, so they only affect the code in this file.
use strict;
use warnings;

use base 'Plucene::SearchEngine::Index::Base';
use File::Temp qw/tmpnam/;

# Hand the "application/pdf" MIME type and the ".pdf" extension to the
# inherited register_handler class method. Presumably this makes the indexer
# dispatch PDF files to this package -- register_handler is defined outside
# this file, so confirm the exact semantics in the Base class.
__PACKAGE__->register_handler("application/pdf", ".pdf");

# Convert a PDF to HTML with the external "pdftotext" program and let the
# HTML backend's gather_data_from_file extract the data from the result.
#
# Arguments:
#   $self     - the invocant; it is passed on unchanged as the invocant of
#               the HTML backend's method
#   $filename - path of the PDF file to convert
#
# Returns $self after the HTML backend has processed the converted file, or
# nothing (bare return) when pdftotext left no output behind.
#
# NOTE(review): Plucene::SearchEngine::Index::HTML is not loaded by this
# file; this call presumably relies on it having been loaded elsewhere
# before PDFs are indexed -- confirm.
sub gather_data_from_file {
    my ($self, $filename) = @_;

    # File::Temp->new creates the file itself instead of merely suggesting
    # an unused name (as tmpnam does), so nobody else can claim the path
    # before pdftotext writes to it. The file is removed automatically when
    # $tmp goes out of scope -- including when the HTML backend dies.
    my $tmp  = File::Temp->new;
    my $html = $tmp->filename;

    # We only need the path; do not keep our own handle open while an
    # external program writes to the file.
    close $tmp;

    # List form of system: the file names never pass through a shell.
    system("pdftotext", "-htmlmeta", $filename, $html);

    # The temp file always exists now, so "pdftotext produced output" is
    # detected by a non-zero size rather than by mere existence.
    return unless -s $html;

    $self->Plucene::SearchEngine::Index::HTML::gather_data_from_file($html);
    return $self;
}

1;