| File-Extract documentation | Contained in the File-Extract distribution. |
File::Extract - Extract Text From Arbitrary File Types
use File::Extract; my $e = File::Extract->new(); my $r = $e->extract($filename); my $e = File::Extract->new(encodings => [...]); my $class = "MyExtractor"; File::Extract->register_processor($class); my $filter = MyCustomFilter->new; File::Extact->register_filter($mime_type => $filter);
File::Extract is a framework to extract text data out of arbitrary file types, useful to collect data for indexing.
Registers a new text-extractor. The processor is used as the default processor for a given MIME type, but it can be overridden by specifying the 'processors' parameter
The specified class needs to implement two functions:
Returns the MIME type that $class can extract files from.
Extracts the text from $file. Returns a File::Extract::Result object.
Registers a filter to be used when a particular mime type has been found.
Returns the File::MMagic::XS object that used by the object. Use this to modify, set options, etc. E.g.:
my $extract = File::Extract->new(...); $extract->magic->add_file_ext(t => 'text/perl-test'); $extract->extract(...);
A hashref of filters to be applied before attempting to extract the text out of it.
Here's a trivial example that puts line numbers in the beginning of each line before extracting the output out of it.
use File::Extract;
use File::Extract::Filter::Exec;
my $extract = File::Extract->new(
filters => {
'text/plain' => [
File::Extract::Filter::Exec->new(cmd => "perl -pe 's/^/\$. /'")
]
}
);
my $r = $extract->extract($file);
A list of processors to be used for this instance. This overrides any processors that were registered previously via register_processor() class method.
List of encodings that you expect your files to be in. This is used to re-encode and normalize the contents of the file via Encode::Guess.
The final encoding that you the extracted test to be in. The default encoding is UTF8.
Copyright 2005-2007 Daisuke Maki <daisuke@endeworks.jp>. All rights reserved.
This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
See http://www.perl.com/perl/misc/Artistic.html
| File-Extract documentation | Contained in the File-Extract distribution. |
# $Id: /mirror/perl/File-Extract/trunk/lib/File/Extract.pm 9350 2007-11-18T13:33:38.729170Z daisuke $ # # Copyright (c) 2005-2007 Daisuke Maki <daisuke@endeworks.jp> # All rights reserved. package File::Extract; use strict; use warnings; use base qw(Class::Data::Inheritable); use File::MMagic::XS qw(:compat); use File::Temp(); our $VERSION = '0.07000'; sub new { my $class = shift; my %args = @_; my $encoding = $args{output_encoding} || 'utf8'; my @encodings = $args{encodings} ? (ref($args{encodings}) eq 'ARRAY' ? @{$args{encodings}} : $args{encodings}) : (); my $self = bless { filters => $args{filters}, processors => $args{processors}, magic => $args{file_mmagic_args} ? File::MMagic::XS->new(%{$args{file_mmagic_args}}) : File::MMagic::XS->new(), encodings => \@encodings, output_encoding => $encoding }, $class; return $self; } sub magic { shift->{magic} } sub register_processor { my $class = shift; my $pkg = shift; eval "require $pkg" or die; my $mime = $pkg->mime_type; $class->RegisteredProcessors->{$mime} ||= []; push @{$class->RegisteredProcessors->{$mime}}, $pkg; } sub register_filter { my $class = shift; my $pkg = shift; eval "require $pkg" or die; my $mime = $pkg->mime_type; $class->RegisteredFilter->{$mime} ||= []; push @{$class->RegisteredFilter->{$mime}}, $pkg; } sub _processors { my $self = shift; my $mime = shift; my $processors; # First, check if we have instance specific processors $processors = $self->{processors}{$mime}; if ($processors) { return @$processors; } $processors = ref($self)->RegisteredProcessors->{$mime}; if ($processors) { return @$processors; } return (); } sub _filters { my $self = shift; my $mime = shift; my $filters; # First, check if we have instance specific filters $filters = $self->{filters}{$mime}; if ($filters) { return @$filters; } $filters = ref($self)->RegisteredFilters->{$mime}; if ($filters) { return @$filters; } return (); } sub extract { my $self = shift; my $file = shift; my $magic = $self->{magic}; my $mime = $magic->checktype_filename($file); return unless $mime; my $o_mime = $mime; my $tmp; my $source = $file; if (my @filters = $self->_filters($mime)) { # Filters are applied one after the other, even if that may cause the # underlying MIME type to change (i.e. maybe you are crazy enough to # apply a filter that changes a plain text file to HTML -- god knows # why ;). This may be a bit confusing, since text extractors are # applied from the MIME type of the resulting file. foreach my $f (@filters) { $tmp = File::Temp->new(UNLINK => 1); $f->filter(file => $source, output => $tmp); $source = $tmp->filename; } $tmp->flush; $mime = $magic->checktype_filename($source); return unless $mime; } if (my @processors = $self->_processors($mime)) { foreach my $pkg (@processors) { my $p = $pkg->new( encodings => $self->{encodings}, output_encoding => $self->{output_encoding} ); my $r = eval { $p->extract($source) }; # Restore the original mime type of the source file. This is # required because we might have passed through several filters if ($r) { if ($source ne $file) { $r->filename($file); $r->mime_type($o_mime); } return $r; } } } return undef; } BEGIN { __PACKAGE__->mk_classdata('RegisteredFilters'); __PACKAGE__->mk_classdata('RegisteredProcessors'); __PACKAGE__->RegisteredFilters({}); __PACKAGE__->RegisteredProcessors({}); my @p = qw( File::Extract::Excel File::Extract::HTML File::Extract::MP3 File::Extract::PDF File::Extract::Plain File::Extract::RTF ); foreach my $p (@p) { __PACKAGE__->register_processor($p); } } 1; __END__