| URI-Title documentation | Contained in the URI-Title distribution. |
URI::Title - get the titles of things on the web in a sensible way
use URI::Title qw( title );
my $title = title('http://microsoft.com');
print "Title is $title\n";
I keep having to find the title of things on the web. This seems like a really simple request, just get() the object, parse for a title tag, you're done. Ha, I wish. There are several problems with this approach:
So, let's solve these issues once.
only one, the title(url) method. Call it with an url, get the title if possible, undef if it wasn't. Very simple.
Many, many, many things. Still unimplemented:
Tom Insam <tom@jerakeen.org>
This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
Invented because of a conversation with rjp, who contributed some eyeball-melting and as-yet-unused code to get titles from MP3s and PDFs, and hex, who has also solved the problem, and got bits done in a nicer way than I did.
| URI-Title documentation | Contained in the URI-Title distribution. |
package URI::Title; use warnings; use strict; use base qw(Exporter); our @EXPORT_OK = qw( title ); our $VERSION = '1.85'; use Module::Pluggable (search_path => ['URI::Title'], require => 1 ); use File::Type; use LWP::UserAgent; use HTTP::Request; use HTTP::Response; sub ua { my $ua = LWP::UserAgent->new; $ua->agent("URI::Title/$VERSION"); $ua->timeout(20); $ua->default_header('Accept-Encoding' => 'gzip'); return $ua; } sub get_limited { my $url = shift; my $size = shift || 32*1024; my $ua = ua(); $ua->max_size($size); my $req = HTTP::Request->new(GET => $url); $req->header( Range => "bytes=0-$size" ); $req->header( "Accept-Encoding" => "" ); # vox sends invalid gzipped data? my $res = eval { $ua->request($req) }; return unless $res; # useragent explodes for non-valid uris # some servers don't like the Range header. If we # get an odd 4xx response that isn't 404, just try getting # the full thing. This may be a little impolite. return get_all($url) if $res->code >= 400 and $res->code < 500 and $res->code != 404; return unless $res->is_success; if (!wantarray) { return $res->decoded_content || $res->content; } my $cset = "iso-8859-1"; # default; my $ct = $res->header("Content-type"); if ($ct =~ /charset\s*=\>?\s*\"?([\w-]+)/i) { $cset = lc($1); #warn "Got charset $cset from URI headers\n"; } return ($res->decoded_content || $res->content, $cset); } sub get_end { my $url = shift; my $size = shift || 16*1024; my $ua = ua(); my $request = HTTP::Request->new(HEAD => $url); my $response = $ua->request($request); return unless $response; # useragent explodes for non-valid uris my $length = $response->header('Content-Length'); return unless $length; # We can't get the length, and we're _not_ # going to get the whole thing. my $start = $length - $size; $ua->max_size($size); my $req = HTTP::Request->new(GET => $url); $req->header( Range => "bytes=$start-$length" ); my $res = $ua->request($req); return unless $res; # useragent explodes for non-valid uris return unless $res->is_success; return $res->decoded_content unless wantarray; my $cset = "iso-8859-1"; # default; my $ct = $res->header("Content-type"); if ($ct =~ /charset=\"?(.*)\"?$/) { $cset = $1; } return ($res->decoded_content, $cset); } sub get_all { my $url = shift; my $ua = ua(); my $req = HTTP::Request->new(GET => $url); my $res = $ua->request($req); return unless $res->is_success; return $res->decoded_content unless wantarray; my $cset = "iso-8859-1"; # default; my $ct = $res->header("Content-type"); if ($ct =~ /charset=\"?(.*)\"?$/) { $cset = $1; } return ($res->decoded_content, $cset); } # cache our $HANDLERS; sub handlers { my @plugins = plugins(); return $HANDLERS if $HANDLERS; for my $plugin (@plugins) { for my $type ($plugin->types) { $HANDLERS->{$type} = $plugin; } } return $HANDLERS; } sub title { my $param = shift; my $data; my $url; my $type; my $cset = "iso-8859-1"; # default # we can be passed a hashref. Keys are url, or data. if (ref($param)) { if ($param->{data}) { $data = $param->{data}; $data = $$data if ref($data); # we can be passed a ref to the data } elsif ($param->{url}) { $url = $param->{url}; } else { use Carp qw(croak); croak("Expected a single parameter, or an 'url' or 'data' key"); } # otherwise, assume we're passed an url } else { $url = $param; } if (!$url and !$data) { warn "Need at least an url or data"; return; } # If we don't have data, we will have an url, so try to get data. if (!$data) { # url might be a filename if (-e $url) { local $/ = undef; unless (open DATA, $url) { warn "$url looks like a file and isn't"; return; } $data = <DATA>; close DATA; # If not, assume it's an url } else { # special case for itms if ($url =~ s/^itms:/http:/) { $type = "itms"; $data = 1; # we don't need it, fake it. } else { # special case for spotify $url =~ s{^(?:http://open.spotify.com/|spotify:)(\w+)[:/]}{http://spotify.url.fi/$1/}; $url =~ s{#!}{?_escaped_fragment_=}; ($data, $cset) = get_limited($url); } } } if (!$data) { #warn "Can't get content for $url"; return; } return undef unless $data; $type ||= File::Type->new->checktype_contents($data); my $handlers = handlers(); my $handler = $handlers->{$type} || $handlers->{default} or return; return $handler->title($url, $data, $type, $cset); } 1;