WWW::Scraper::Delicious - Retrieve links from del.icio.us


WWW-Scraper-Delicious documentation Contained in the WWW-Scraper-Delicious distribution.

Index


Code Index:

NAME

Top

WWW::Scraper::Delicious - Retrieve links from del.icio.us

SYNOPSIS

Top

    use WWW::Scraper::Delicious;
    my $delicious = WWW::Scraper::Delicious->new();
    my %linkset = $delicious->getlinks('blahuser');

    map { print "\n".$delicious->dumplink($linkset{$_}) } keys %linkset;

REQUIRED MODULES

Top

LWP::UserAgent

EXPORT

Top

None.

DESCRIPTION

Top

This module implements a very simple and effective way to scrape links from the http://del.icio.us/ site without requiring the del.icio.us API, authentication, or RSS. Although links can be scraped from any valid del.icio.us URL, the intended use of this module is to give users a simple way to back up and/or mirror their own links. There is no hard limit on the number of user links that can be returned, but a limit of 100 is respected for other types of link queries.

METHODS

Top

new()

    $delicious = WWW::Scraper::Delicious->new();
    $delicious = WWW::Scraper::Delicious->new( limit => 5, ua => $ua );

The constructor method returns a WWW::Scraper::Delicious object. The limit and ua arguments are optional. The limit option allows you to restrict the number of results returned (default of 0 is unlimited). You may also pass a custom LWP::UserAgent object handle.

getlinksarray()

    my @links = $delicious->getlinksarray('blahuser');

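The argument is the same as for getlinks() (a del.icio.us user name, a site-relative path, or a full URL), but this method returns the link results as an array in reverse chronological order (newest first). Each element is an array reference containing id, url, desc, notes, pop, date, tag, and the link's hash reference, in that order.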

ua()

    $delicious->ua($myCustomUA);

This method can be invoked without an argument to obtain the current LWP::UserAgent object handle. Invoking with an argument will establish the new setting.

limit()

    $delicious->limit(9);

This method can be invoked without an argument to obtain the current limit setting (default 0 is unlimited). Invoking with an argument will establish the new setting.

AUTHOR

Top

Adam Foust, <agf@cpan.org>

COPYRIGHT AND LICENSE

Top


WWW-Scraper-Delicious documentation Contained in the WWW-Scraper-Delicious distribution.

package WWW::Scraper::Delicious;

use strict;
use warnings;

use LWP::UserAgent;
use Scalar::Util ();

our $VERSION = '0.10';

# new(%args)
#
# Constructor. Recognised arguments (both optional):
#   limit - maximum number of links to return; stored via limit()
#   ua    - user agent object to fetch pages with; stored via ua()
#
# When no user agent ends up stored, a default LWP::UserAgent is created.
# Returns the new object blessed into $class.
sub new {
    my ($class, %args) = @_;
    my $self = {};

    # Route both options through the accessors so their validation applies.
    if ($args{limit}) {
        limit($self, $args{limit});
    }
    if ($args{ua}) {
        ua($self, $args{ua});
    }

    # Fall back to a stock agent when none was supplied or it was rejected.
    $self->{ua} ||= LWP::UserAgent->new();

    return bless($self, $class);
}

# getlinks($path)
#
# Scrapes the links found at $path and returns them as a hash keyed by
# link id (see _scrape for the shape of each value).
#
# $path may be:
#   - an absolute http:// or https:// URL, used as is
#   - a site-relative path starting with "/", appended to http://del.icio.us
#   - anything else (typically a user name), treated as http://del.icio.us/$path
#
# Returns nothing when $path is empty.
sub getlinks {
    my ($self, $path) = @_;
    return unless $path;

    my $url;
    if ($path =~ m{\Ahttps?://}i) {
        # Require a real scheme: a bare "htt" prefix would misclassify
        # user names such as "httpfan" as absolute URLs.
        $url = $path;
    }
    elsif ($path =~ m{\A/}) {
        $url = "http://del.icio.us$path";
    }
    else {
        $url = "http://del.icio.us/$path";
    }

    my $limit = $self->{limit} || 0;    # 0 means unlimited
    my %linkset = _scrape($self->{ua}, $url, $limit);
    return %linkset;
}

# getlinksarray($path)
#
# Same lookup as getlinks($path), but the links come back as a list of
# array references sorted by descending date string. Each row holds, in
# order: id, url, desc, notes, pop, date, tag, and the link's own hash
# reference. Returns nothing when no links were found.
sub getlinksarray {
    my ($self, $path) = @_;

    my %linkset = getlinks($self, $path);
    return unless keys %linkset;

    my @fields = qw(id url desc notes pop date tag);

    # Build one row per link; the trailing element keeps the original hash.
    my @rows;
    for my $link (map { $linkset{$_} } keys %linkset) {
        push @rows, [ @{$link}{@fields}, $link ];
    }

    my @table = sort { $b->[5] cmp $a->[5] } @rows;    # column 5 is the date
    return @table;
}

# ua([$ua])
#
# Accessor for the user agent used to fetch pages. With an argument that is
# an LWP::UserAgent object (or an object of any subclass), stores it as the
# new agent; any other argument is ignored. Always returns the currently
# stored agent.
sub ua {
    my ($self, $ua) = @_;

    # isa() rather than an exact class-name comparison, so that customised
    # subclasses of LWP::UserAgent are accepted as well.
    if (Scalar::Util::blessed($ua) && $ua->isa('LWP::UserAgent')) {
        $self->{ua} = $ua;
    }

    return $self->{ua};
}

# limit([$limit])
#
# Accessor for the maximum number of links to return. With an argument made
# up solely of digits, stores it as the new limit; any other argument is
# ignored. Returns the current limit, or 0 (unlimited) when none was set.
sub limit {
    my ($self, $limit) = @_;

    # \A...\z so that a value with a trailing newline is not accepted.
    if (defined $limit && $limit =~ /\A\d+\z/) {
        $self->{limit} = $limit;
    }

    # Report the documented default instead of undef when nothing is stored.
    return defined $self->{limit} ? $self->{limit} : 0;
}

# dumplink($linkref)
#
# Formats one link as a multi-line "label = value" string. $linkref is
# either a link hash reference or a row as built by getlinksarray (an array
# reference whose element 7 is the link hash). The id and url lines are
# always present; desc, notes, pop, date and tags lines appear only when the
# link has a true value for them. Returns nothing when $linkref is false or
# the link has no id.
sub dumplink {
    my ($self, $linkref) = @_;
    return unless $linkref;

    # Rows from getlinksarray carry the original hash in their last slot.
    if (ref($linkref) eq 'ARRAY') {
        $linkref = $linkref->[7];
    }
    return unless $linkref->{id};

    my @lines = (
        "   id = " . $linkref->{id},
        "  url = " . $linkref->{url},
    );

    # Optional scalar fields, in output order, with their padded labels.
    my @optional = (
        [ desc  => " desc = " ],
        [ notes => "notes = " ],
        [ pop   => "  pop = " ],
        [ date  => " date = " ],
    );
    for my $field (@optional) {
        my ($key, $label) = @{$field};
        next unless $linkref->{$key};
        push @lines, $label . $linkref->{$key};
    }

    if ($linkref->{tag}) {
        push @lines, " tags = " . join(', ', sort keys %{ $linkref->{tag} });
    }

    return join('', map { $_ . "\n" } @lines);
}

# _scrape($ua, $url, $limit)
#
# Private helper that downloads a del.icio.us listing page by page and
# extracts the posts it contains.
#
#   $ua    - object whose get() returns a response supporting is_success()
#            and content() (normally an LWP::UserAgent)
#   $url   - absolute URL of the listing to scrape
#   $limit - maximum number of links to collect; 0 or undef means no limit
#
# Returns a hash keyed by post id. Each value is a hash reference with the
# keys id, url and desc, plus notes, pop, date and tag (a hash reference
# keyed by tag name) when those were found in the page. Returns nothing if
# any request fails, including links already collected from earlier pages.
sub _scrape {
    my ($ua, $url, $limit) = @_;
    $limit ||= 0;
    my %linkset;
    my $page = 1;

    # Extend an existing query string instead of starting a second one.
    my $joiner = index($url, '?') >= 0 ? '&' : '?';

  PAGE:
    while (1) {
        my $page_url = $url . $joiner . 'setcount=100'
                     . ($page > 1 ? "&page=$page" : '');
        my $rs = $ua->get($page_url);
        return unless $rs->is_success;
        my $html = $rs->content;

        # Everything before the first post marker is page chrome.
        my @chunks = split /<li class="post" key="/si, $html;
        shift @chunks;

        for my $scrap (@chunks) {
            # Keep only this post's own markup.
            $scrap =~ s/\s*<\/li>.*$//si;

            my ($id, $href, $desc) = $scrap =~
              /^(.*?)".*?a href="(.*?)".*?>(.*?)<\/a>/si;
            next unless defined $desc;    # no match: not a usable post

            my %link = ( id => $id, url => $href, desc => $desc );

            my ($notes) = $scrap =~ /class="notes">(.*?)<\/p>/si;
            $link{notes} = $notes if $notes;

            my %tags;
            for my $str (split /<a class="tag" /, $scrap) {
                next unless my ($tag) = $str =~ /^href=.*?>(.*?)<\/a>/;
                $tags{$tag}++;
            }
            $link{tag} = \%tags if %tags;

            my ($pop) = $scrap =~ /a class="pop".*?>.*?by (\d+) /si;
            $link{pop} = $pop if $pop;

            my ($date) = $scrap =~ / class="date" title="(.*?)"/si;
            $link{date} = $date if $date;

            $linkset{$id} = \%link;

            # Leave the paging loop too, otherwise later pages would be
            # fetched and the limit exceeded.
            last PAGE if $limit && keys(%linkset) >= $limit;
        }

        my ($current, $total) = $html =~ / page (\d+) of (\d+)/si;
        last PAGE unless defined $total;

        # >= rather than == so an out-of-range page number cannot loop forever.
        last PAGE if $current >= $total;
        $page++;
    }

    return %linkset;
}

1;
__END__