| Win32-UrlCache documentation | Contained in the Win32-UrlCache distribution. |
Win32::UrlCache - parse Internet Explorer's history/cache/cookies
use Win32::UrlCache;
my $index = Win32::UrlCache->new( 'index.dat' );
foreach my $url ( $index->urls ) {
print $url->url, "\n";
}
Or, you can use callback function if you care memory usage.
use Win32::UrlCache;
my $index = Win32::UrlCache->new( 'index.dat' );
$index->urls( callback => \&callback )
sub callback {
my $entry = shift;
my $url = $entry->url;
$url =~ s/^Visited: //;
$entry->url( $url );
print $entry->url, "\n";
return; # to prevent the entry from being kept in the object
}
If you want to know the title of the cached page (for Win32 only):
use Win32::UrlCache::Cache;
use Win32::UrlCache::Title;
use Encode;
my $cache = Win32::UrlCache::Cache->new;
$cache->urls( callback => \&callback )
sub callback {
my $entry = shift;
print $entry->url, "\n";
my $title = Win32::UrlCache::Title->extract( $entry->filename );
print encode( shiftjis => $title ), "\n\n" if $title;
return;
}
This parses so-called "Client UrlCache MMF Ver 5.2" index.dat files, which are used to store Internet Explorer's history, cache, and cookies. As of writing this, I've only tested on Win2K + IE 6.0, but I hope this also works with some of the other versions of OS/Internet Explorer. However, note that this is not based on the official/public MSDN specification, but on a hack on the web. So, caveat emptor in every sense, especially for the redr entries ;)
Patches and feedbacks are welcome.
receives a path to an 'index.dat', and parses it to create an object.
returns URL entries in the 'index.dat' file. Each entry has url, filename, headers, filesize, last_modified, last_accessed, and optionally, title accessors (note that some of them would return meaningless values). As of 0.02, it can receive a callback function. See below. As of 0.04, you can also pass ( extract_title => 1 ) to extract title. However, this extraction is processed after a callback. So, if you want both to use a callback and to extract title, you might want to insert extraction code into the callback as shown in the synopsis.
almost the same as urls, but returns LEAK entries (if any) in the 'index.dat' file.
returns REDR entries (if any) in the 'index.dat' file. Each entry has a url accessor. As of 0.02, it can receive a callback function.
Three methods shown above return all the entries found in the index by default, but this may eat lots of memory especially if you use IE as a main browser. As of 0.02, those methods may receive a callback function, which will take an entry for the first (and only, as of writing this) argument. If the callback returns true, the entry will be stored in the ::UrlCache object, and if the callback returns false, the entry will be discarded after the callback is executed.
Kenichi Ishigaki, <ishigaki at cpan.org>
Copyright (C) 2007 by Kenichi Ishigaki.
This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
| Win32-UrlCache documentation | Contained in the Win32-UrlCache distribution. |
package Win32::UrlCache; use strict; use warnings; use Carp; BEGIN { if ( $^O eq 'MSWin32' ) { require Win32::UrlCache::FileTime; Win32::UrlCache::FileTime->import; } else { require Win32::UrlCache::FileTimePP; Win32::UrlCache::FileTimePP->import; } } our $VERSION = '0.06'; use constant BADFOOD => chr(0x0D).chr(0xF0).chr(0xAD).chr(0x0B); sub new { my $class = shift; my $self = bless {}, $class; $self->_read_file( @_ ); $self->_version_check; $self->_size_check; $self->_get_pointer_to_first_hash; $self; } sub _read_file { my ($self, $file) = @_; open my $fh, '<', $file or croak $!; binmode $fh; sysread $fh, ( my $data ), ( my $size = -s $fh ); close $fh; $self->{_data} = $data; $self->{_size} = $size; $self->{_pos} = 0; } sub _version_check { my $self = shift; my $header = 'Client UrlCache MMF Ver 5.2'; my $read = $self->_read_string; unless ( $read eq $header ) { croak "unsupported file type: $read"; } } sub _size_check { my $self = shift; my $read = _to_int( $self->_read ); unless ( $read == $self->{_size} ) { croak "index file seems broken: $read / ".$self->{_size}; } } sub _get_pointer_to_first_hash { my $self = shift; $self->{_from} = _to_int( $self->_read ); } sub _read_hashes { my ($self, $target, %options) = @_; my $pointer = $self->{_from}; while( $pointer ) { unless ( $self->_read_from( $pointer ) eq 'HASH' ) { croak "index file seems broken: HASH not found"; } my $hash_length = _to_int( $self->_read ); my $next_hash = _to_int( $self->_read ); my $unknown = _to_int( $self->_read ); my $hash_end = $pointer + ( $hash_length * 0x80 ); while ( $self->{_pos} < $hash_end ) { my ( $hashkey, $offset ) = ( $self->_read, $self->_read ); next if $offset eq BADFOOD; my $int_offset = _to_int( $offset ); next unless $int_offset; # last of the offset should be 0x80/0x00 (not 0x03 etc) next unless ( $int_offset & 0xf ) == 0; my $tag = $self->_test_from( $int_offset ); next if $tag eq BADFOOD; if ( $tag =~ /^(?:URL|REDR|LEAK)/ ) { next if $target && $target ne $tag; my $pos = $self->{_pos}; $self->_read_entry( $int_offset, %options ); $self->{_pos} = $pos; } } $pointer = $next_hash or last; } } sub _read_entry { my ($self, $offset, %options) = @_; my $tag = $self->_read_from( $offset ); $tag =~ s/ $//; my $class = 'Win32::UrlCache::'.$tag; my $item; if ( $tag eq 'REDR' ) { my $block = $self->_read; my $unknown = $self->_read(8); my $url = $self->_read_string; $item = { url => $url }; } if ( $tag eq 'URL' or $tag eq 'LEAK' ) { my $block = $self->_read; my $last_modified = filetime( $self->_read(8) ); my $last_accessed = filetime( $self->_read(8) ); my $maybe_expire = $self->_read(8); my $maybe_filesize = $self->_read(8); my $unknown = $self->_read(20); my $offset_to_filename = _to_int( $self->_read ); my $unknown2 = $self->_read; my $offset_to_headers = _to_int( $self->_read ); my $unknown3 = $self->_read(32); my $url = $self->_read_string; my $filename = $offset_to_filename ? $self->_read_string_from( $offset + $offset_to_filename ) : ''; my $headers = $offset_to_headers ? $self->_read_string_from( $offset + $offset_to_headers ) : ''; $item = { url => $url, filename => $filename, headers => $headers, filesize => $maybe_filesize, last_modified => $last_modified, last_accessed => $last_accessed, }; } return unless $item; my $object = bless $item, $class; if ( $options{callback} ) { my $ret = $options{callback}->( $object ); return unless $ret; } if ( $options{extract_title} && $item->{filename} && $^O eq 'MSWin32' ) { require Win32::UrlCache::Title; Win32::UrlCache::Title->import; $item->title( Win32::UrlCache::Title->extract( $item->filename ) ); } push @{ $self->{$tag} ||= [] }, $object; } sub urls { my $self = shift; $self->_read_hashes( 'URL ', @_ ); return @{ $self->{URL} || [] }; } sub redrs { my $self = shift; $self->_read_hashes( 'REDR', @_ ); return @{ $self->{REDR} || [] }; } sub leaks { my $self = shift; $self->_read_hashes( 'LEAK', @_ ); return @{ $self->{LEAK} || [] }; } sub _to_int { my $dword = shift; my @bytes = split //, $dword; return ( ord( $bytes[3] ) * (256 ** 3) + ord( $bytes[2] ) * (256 ** 2) + ord( $bytes[1] ) * (256 ** 1) + ord( $bytes[0] ) * (256 ** 0) ); } sub _read { my ($self, $length) = @_; $length ||= 4; my $str = substr( $self->{_data}, $self->{_pos}, $length ); $self->{_pos} += $length; return $str; } sub _read_from { my ($self, $from, $length) = @_; $self->{_pos} = $from; $self->_read( $length ); } sub _read_string { my $self = shift; my $from = $self->{_pos}; my $to = index( $self->{_data}, "\000", $from ); my $str = substr( $self->{_data}, $from, $to - $from ); $self->{_pos} = $to + 1; return $str; } sub _read_string_from { my ($self, $from) = @_; $self->{_pos} = $from; $self->_read_string; } sub _test_from { my ($self, $from, $length) = @_; $length ||= 4; $from = $self->{_pos} unless defined $from; return substr( $self->{_data}, $from, $length ); } package # Win32::UrlCache::URL; use strict; use warnings; use base qw( Class::Accessor::Fast ); __PACKAGE__->mk_accessors(qw( url filename headers filesize last_modified last_accessed title )); package # Win32::UrlCache::LEAK; use strict; use warnings; use base qw( Class::Accessor::Fast ); __PACKAGE__->mk_accessors(qw( url filename headers filesize last_modified last_accessed title )); package # Win32::UrlCache::REDR; use strict; use warnings; use base qw( Class::Accessor::Fast ); __PACKAGE__->mk_accessors(qw( url )); 1; __END__