URI::Title - get the titles of things on the web in a sensible way


URI-Title documentation Contained in the URI-Title distribution.

Index


Code Index:

NAME

Top

URI::Title - get the titles of things on the web in a sensible way

SYNOPSIS

Top

  use URI::Title qw( title );
  my $title = title('http://microsoft.com');
  print "Title is $title\n";

DESCRIPTION

Top

I keep having to find the title of things on the web. This seems like a really simple request, just get() the object, parse for a title tag, you're done. Ha, I wish. There are several problems with this approach:

What if the resource is on a very slow server? Do we wait for ever or what?
What if the resource is a 900 gig file? You don't want to download that.
What if the page title isn't in a title tag, but is buried in the HTML somewhere?
What if the resource is an MP3 file, or a word document or something?
...

So, let's solve these issues once.

METHODS

Top

only one, the title(url) method. Call it with an url, get the title if possible, undef if it wasn't. Very simple.

TODO

Top

Many, many, many things. Still unimplemented:

Get titles of MP3 files, Word Docs, PDFs, etc.
Configurable.. well, anything, in fact. Timeout would be a good start.
Better error reporting.

AUTHOR

Top

Tom Insam <tom@jerakeen.org>

This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.

CREDITS

Top

Invented because of a conversation with rjp, who contributed some eyeball-melting and as-yet-unused code to get titles from MP3s and PDFs, and hex, who has also solved the problem, and got bits done in a nicer way than I did.


URI-Title documentation Contained in the URI-Title distribution.

package URI::Title;
use warnings;
use strict;

use base qw(Exporter);
our @EXPORT_OK = qw( title );

our $VERSION = '1.85';

use Module::Pluggable (search_path => ['URI::Title'], require => 1 );
use File::Type;

use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;


sub ua {
  my $ua = LWP::UserAgent->new;
  $ua->agent("URI::Title/$VERSION");
  $ua->timeout(20);
  $ua->default_header('Accept-Encoding' => 'gzip');
  return $ua;
}

sub get_limited {
  my $url = shift;
  my $size = shift || 32*1024;
  my $ua = ua();
  $ua->max_size($size);
  my $req = HTTP::Request->new(GET => $url);
  $req->header( Range => "bytes=0-$size" );
  $req->header( "Accept-Encoding" => "" ); # vox sends invalid gzipped data?
  my $res = eval { $ua->request($req) };
  return unless $res; # useragent explodes for non-valid uris

  # some servers don't like the Range header. If we
  # get an odd 4xx response that isn't 404, just try getting
  # the full thing. This may be a little impolite.
  return get_all($url) if $res->code >= 400 and $res->code < 500 and $res->code != 404;
  return unless $res->is_success;
  if (!wantarray) {
    return $res->decoded_content || $res->content;
  }
  my $cset = "iso-8859-1"; # default;
  my $ct = $res->header("Content-type");
  if ($ct =~ /charset\s*=\>?\s*\"?([\w-]+)/i) {
    $cset = lc($1);
    #warn "Got charset $cset from URI headers\n";
  }
  return ($res->decoded_content || $res->content, $cset);
}

sub get_end {
  my $url = shift;
  my $size = shift || 16*1024;

  my $ua = ua();

  my $request = HTTP::Request->new(HEAD => $url);
  my $response = $ua->request($request);
  return unless $response; # useragent explodes for non-valid uris
  my $length = $response->header('Content-Length');

  return unless $length; # We can't get the length, and we're _not_
                         # going to get the whole thing.

  my $start = $length - $size;

  $ua->max_size($size);

  my $req = HTTP::Request->new(GET => $url);
  $req->header( Range => "bytes=$start-$length" );
  my $res = $ua->request($req);
  return unless $res; # useragent explodes for non-valid uris

  return unless $res->is_success;
  return $res->decoded_content unless wantarray;
  my $cset = "iso-8859-1"; # default;
  my $ct = $res->header("Content-type");
  if ($ct =~ /charset=\"?(.*)\"?$/) {
    $cset = $1;
  }
  return ($res->decoded_content, $cset);
}

sub get_all {
  my $url = shift;
  my $ua = ua();
  my $req = HTTP::Request->new(GET => $url);
  my $res = $ua->request($req);
  return unless $res->is_success;
  return $res->decoded_content unless wantarray;
  my $cset = "iso-8859-1"; # default;
  my $ct = $res->header("Content-type");
  if ($ct =~ /charset=\"?(.*)\"?$/) {
    $cset = $1;
  }
  return ($res->decoded_content, $cset);
}

# cache
our $HANDLERS;
sub handlers {
  my @plugins = plugins();
  return $HANDLERS if $HANDLERS;
  for my $plugin (@plugins) {
    for my $type ($plugin->types) {
      $HANDLERS->{$type} = $plugin;
    }
  }
  return $HANDLERS;
}

sub title {
  my $param = shift;
  my $data;
  my $url;
  my $type;
  my $cset = "iso-8859-1"; # default

  # we can be passed a hashref. Keys are url, or data.  
  if (ref($param)) {
    if ($param->{data}) {
      $data = $param->{data};
      $data = $$data if ref($data); # we can be passed a ref to the data
    } elsif ($param->{url}) {
      $url = $param->{url};
    } else {
      use Carp qw(croak);
      croak("Expected a single parameter, or an 'url' or 'data' key");
    }

  # otherwise, assume we're passed an url
  } else {
    $url = $param;
  }

  if (!$url and !$data) {
    warn "Need at least an url or data";
    return;
  }

  # If we don't have data, we will have an url, so try to get data.
  if (!$data) {
    # url might be a filename
    if (-e $url) {
      local $/ = undef;
      unless (open DATA, $url) {
        warn "$url looks like a file and isn't";
        return;
      }
      $data = <DATA>;
      close DATA;
      
    # If not, assume it's an url
    } else {
      # special case for itms
      if ($url =~ s/^itms:/http:/) {
        $type = "itms";
        $data = 1; # we don't need it, fake it.

      } else {
        # special case for spotify
        $url =~ s{^(?:http://open.spotify.com/|spotify:)(\w+)[:/]}{http://spotify.url.fi/$1/};
        
        $url =~ s{#!}{?_escaped_fragment_=};

        ($data, $cset) = get_limited($url);
      }
    }
  }
  if (!$data) {
    #warn "Can't get content for $url";
    return;
  }

  return undef unless $data;

  $type ||= File::Type->new->checktype_contents($data);

  my $handlers = handlers();
  my $handler = $handlers->{$type} || $handlers->{default}
    or return;

  return $handler->title($url, $data, $type, $cset);
}

1;