| Alvis-Convert documentation | Contained in the Alvis-Convert distribution. |
Alvis::Document::Type - Perl extension for guessing and checking the type of a document (an extension of MIME types).
use Alvis::Document::Type;
# Create a new instance
my $t=Alvis::Document::Type->new(defaultType=>'text',
defaultSubType=>'html');
if (!defined($t))
{
die('Ugh!');
}
my ($doc_type,$doc_sub_type)=$t->guess($doc_text);
if (!(defined($doc_type) && defined($doc_sub_type)))
{
die("Guess what? " . $t->errmsg());
}
Tries to guess the type of a document similarly to MIME types (type and a subtype).
Adds subtypes 'rss' and 'html' to MIME type 'text'.
Options:
defaultType The default type (text).
defaultSubType The default subtype (plain).
Tries to guess the type of $text.
Returns a stack of error messages, if any. Empty string otherwise.
Kimmo Valtonen, <kimmo.valtonen@hiit.fi>
Copyright (C) 2006 by Kimmo Valtonen
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.4 or, at your option, any later version of Perl 5 you may have available.
| Alvis-Convert documentation | Contained in the Alvis-Convert distribution. |
package Alvis::Document::Type; use warnings; use strict; $Alvis::Document::Type::VERSION = '0.1'; use File::Type; ############################################################################# # # Tries to predict the type of a document. Currently pretty crude. # # Kimmo Valtonen # ############################################################################# ############################################################################# # # Error message stuff # ############################################################################# my $ErrStr; my ($ERR_OK, $ERR_DOC, $ERR_FILE_TYPE )=(0..2); my %ErrMsgs=($ERR_OK=>"", $ERR_DOC=>"No document.", $ERR_FILE_TYPE=>"Unable to instantiate File::Type" ); sub _set_err_state { my $self=shift; my $errcode=shift; my $errmsg=shift; if (!defined($errcode)) { confess("set_err_state() called with an undefined argument."); } if (exists($ErrMsgs{$errcode})) { if ($errcode==$ERR_OK) { $self->{errstr}=""; } else { $self->{errstr}.=" " . $ErrMsgs{$errcode}; if (defined($errmsg)) { $self->{errstr}.=" " . $errmsg; } } } else { confess("Internal error: set_err_state() called with an " . "unrecognized argument ($errcode).") } } sub errmsg { my $self=shift; return $self->{errstr}; } ############################################################################# # # Methods # ############################################################################## sub new { my $proto=shift; my $class=ref($proto)||$proto; my $parent=ref($proto)&&$proto; my $self={}; bless($self,$class); $self->_init(@_); $self->_set_err_state($ERR_OK); $self->{fileType}=File::Type->new(); if (!defined($self->{fileType})) { $self->_set_err_state($ERR_FILE_TYPE); return undef; } return $self; } sub _init { my $self=shift; $self->{defaultType}='text'; $self->{defaultSubType}='plain'; if (defined(@_)) { my %args=@_; @$self{ keys %args }=values(%args); } } # # Returns similarly to MIME ($type,$sub_type), but is broader # sub guess { my $self=shift; my $text=shift; $self->_set_err_state($ERR_OK); # clean the slate if (!defined($text)) { $self->_set_err_state($ERR_DOC); return undef; } my ($type,$sub_type); # Try File::Type first my $mime_type=$self->{fileType}->mime_type($text); if (!defined($mime_type)) { $type=$self->{defaultType}; $sub_type=$self->{defaultSubType}; } else { ($type,$sub_type)=split(/\//,$mime_type,-1); } # If the result is a generic one, check for our types of interest # by other means # BTW, File::Type should make it clear and checkable what its # "I dunno" reply is if ($type eq 'application' && $sub_type eq 'octet-stream') { if ($self->_looks_like_HTML($text)) { ($type,$sub_type)=('text','html'); } elsif ($self->_looks_like_RSS($text)) { # not a MIME type ($type,$sub_type)=('text','rss') } } return ($type,$sub_type); } sub _looks_like_HTML { my $self=shift; my $text=shift; # # If we're lucky... # if ($text=~/<!DOCTYPE\s+(\S+)/isgo) { my $type=$1; if ($type=~/(?:html|wml)/igo) { return 1; } } # Otherwise, use a weaker way of checking... a single # signature start tag will do. # if ($text=~/<(?:(?i)html|body)\W/sgo) { return 1; } return 0; } sub _looks_like_RSS { my $self=shift; my $text=shift; # # If we're lucky... # if ($text=~/<!DOCTYPE\s+(\S+)/isgo) { my $type=$1; if ($type=~/(?:rss)/igo) { return 1; } } # Otherwise, use a weaker way of checking... a single # signature start tag will do. # if ($text=~/<(?:(?i)rss|channel)\W/sgo) { return 1; } return 0; } 1; 1; __END__