| Plucene-SearchEngine documentation | Contained in the Plucene-SearchEngine distribution. |
Plucene::SearchEngine::Index::PDF - Backend for parsing PDF
This backend analyzes a PDF file for its textual content (using pdftotext)
and turns any metadata found in the PDF into Plucene fields.
| Plucene-SearchEngine documentation | Contained in the Plucene-SearchEngine distribution. |
package Plucene::SearchEngine::Index::PDF;

# Strictures were missing; every variable in this package is already
# lexical, so enabling them does not change behavior.
use strict;
use warnings;

use base 'Plucene::SearchEngine::Index::Base';
use File::Temp qw/tmpnam/;

# Hand the PDF MIME type and the ".pdf" extension to the inherited
# register_handler class method.
__PACKAGE__->register_handler("application/pdf", ".pdf");

# Convert a PDF to HTML with the external "pdftotext" program (using its
# -htmlmeta option) and pass the resulting HTML file to
# Plucene::SearchEngine::Index::HTML::gather_data_from_file, invoked on
# $self.
#
# Parameters:
#   $self     - the object the HTML handler is invoked on
#   $filename - path of the PDF file handed to pdftotext
#
# Returns $self once the HTML handler has run; returns nothing (bare
# return) when pdftotext did not produce an output file.
sub gather_data_from_file {
    my ($self, $filename) = @_;

    # Write the intermediate HTML into a freshly created private directory
    # instead of a tmpnam() path: tmpnam() only returns a predictable name
    # without creating the file, so another user could pre-create or
    # symlink it before pdftotext writes. The directory object removes
    # itself and its contents when it goes out of scope, so the scratch
    # file is cleaned up even if the HTML handler dies.
    require File::Spec;
    my $scratch = File::Temp->newdir();
    my $html    = File::Spec->catfile($scratch->dirname, 'pdftotext.html');

    # List form of system: no shell is involved, so $filename needs no quoting.
    system("pdftotext", "-htmlmeta", $filename, $html);

    # pdftotext missing or unable to convert the input: nothing to index.
    return unless -e $html;

    # The method below is called by fully qualified name, so the package
    # must be loaded; this is a no-op when it already is.
    require Plucene::SearchEngine::Index::HTML;
    $self->Plucene::SearchEngine::Index::HTML::gather_data_from_file($html);

    return $self;
}

1;