| Web-Scraper-Config documentation | Contained in the Web-Scraper-Config distribution. |
Web::Scraper::Config - Run Web::Scraper From Config Files
---
scraper:
- process:
- td>ul>li
- trailers[]
- scraper:
- process_first:
- li>b
- title
- TEXT
- process_first:
- ul>li>a[href]
- url
- @href
- process:
- ul>li>ul>li>a
- movies[]
- __callback(process_movie)__
my $scraper = Web::Scraper::Config->new(
$config,
{
callbacks => {
process_movie => sub {
my $elem = shift;
return {
text => $elem->as_text,
href => $elem->attr('href')
}
}
}
}
);
$scraper->scrape($uri);
Web::Scraper::Config allows you to harness the power of Web::Scraper from a config file.
The config files can be written in any format that Config::Any understands, as long as it conforms to this module's rules.
Creates a new Web::Scraper::Config instance.
The first arguments is either a hashref that represents a config, or a filename to the config. The config file can be in any format that Config::Any understands as long as it returns a hash that's conformant to the Web::Scraper::Config rules.
The second argument (options) is optional, and is currently only used to provider callbacks to be called from the scraper. When Web::Scraper::Config encounters an element in the form of:
__callback(function_name)__
then that is replaced by the corresponding callback specified in the options hash.
Starts scraping. The semantics are exactly the same as Web::Scraper::scrape
Daisuke Maki <daisuke@endeworks.jp>
This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
See http://www.perl.com/perl/misc/Artistic.html
| Web-Scraper-Config documentation | Contained in the Web-Scraper-Config distribution. |
# $Id: /mirror/perl/Web-Scraper-Config/trunk/lib/Web/Scraper/Config.pm 7145 2007-05-09T16:36:57.901467Z daisuke $ # # Copyright (c) 2007 Daisuke Maki <daisuke@endeworks.jp> package Web::Scraper::Config; use strict; use warnings; use Config::Any; use Data::Visitor::Callback; use Web::Scraper; use URI; our $VERSION = '0.01'; sub new { my $class = shift; my $self = bless {}, $class; my $config = shift; my $opts = shift; $config = $self->_load_config($config); { # BROKEN YAML::Syck my $v = Data::Visitor::Callback->new( plain_value => sub { $_[1] =~ s/(\w:) /$1/g; $_[1] } ); $config = $v->visit($config); } $self->{config} = $config; $self->{callbacks} = $opts->{callbacks} if $opts && $opts->{callbacks}; return $self; } sub _load_config { my $self = shift; my $file = shift; if (ref $file eq 'HASH') { return $file; } else { # This is a bit hackish, but we're only loading one file, so # we should be okay my $list = Config::Any->load_files({ files => [ $file ]}); if (! @$list ) { require Carp; Carp::croak("Could not load config file $file: $@"); } return (values %{$list->[0]})[0]; } } sub scrape { my $self = shift; my $config = $self->{config}; my $scraper = $self->_recurse($config)->(); return $scraper->scrape(@_); } sub _recurse { my ($self, $rules) = @_; my $ref = ref($rules); my $ret; if (! $ref) { if ($rules =~ /^__callback\(([^\)]+)\)__$/) { $rules = $self->{callbacks}{$1}; } $ret = $rules; } elsif ($ref eq 'ARRAY') { my @elements; foreach my $rule (@$rules) { if ($rule =~ /^__callback\(([^\)]+)\)__$/) { $rule = $self->{callbacks}{$1}; push @elements, sub { sub { $rule->(@_) } }; } else { push @elements, ref $rule ? $self->_recurse($rule) : $rule; } } if (! grep { my $ref = ref($_); $ref ? $ref ne 'CODE' : 1 } @elements) { $ret = sub { foreach my $code (@elements) { $code->() } }; } else { $ret = \@elements; } } elsif ($ref eq 'HASH'){ my($op) = keys %$rules; my $h = $self->_recurse($rules->{$op}); my $is_func = ($op =~ /^(?:scraper|process(?:_first)?|result)$/); if ($is_func) { my @args = (ref $h eq 'ARRAY') ? @$h : ($h); if ($op eq 'scraper') { $ret = sub { scraper(sub { for (@args) { $_->() } }) }; } else { $ret = sub { my $code = sub { @_ = map { (ref $_ eq 'CODE') ? $_->() : $_ }@args; goto &$op; }; $code->() }; } } else { $ret = { $op => $h }; } } else { require Data::Dumper; die "Web::Scraper::Config does not know how to parse: " . Data::Dumper::Dumper($rules); } return $ret; } 1; __END__