WWW::Scraper::ScraperDiscovery - discovers forms and inputs on an HTML page.


Scraper documentation Contained in the Scraper distribution.

Index


Code Index:

NAME

Top

WWW::Scraper::ScraperDiscovery - discovers forms and inputs on an HTML page.

SYNOPSIS

Top

    use WWW::Scraper;
    $scraper = new WWW::Scraper('ScraperDiscovery',{'url' => 'http://someplace.com/formInQuestion.html'});

See eg/ScraperDiscovery.pl

DESCRIPTION

Top

This class is an experimental exploration of "Scraper Discovery".

AUTHOR and CURRENT VERSION

Top

WWW::Scraper::ScraperDiscovery is written and maintained by Glenn Wood, http://search.cpan.org/search?mode=author&query=GLENNWOOD.

COPYRIGHT

Top


Scraper documentation Contained in the Scraper distribution.

use strict;
package WWW::Scraper::ScraperDiscovery;

#####################################################################

use base qw(WWW::Scraper Exporter);
# Derive the module version from the CVS "Revision" keyword below, which CVS
# rewrites on commit: revision "major.minor" becomes "major.MM", with the
# minor number zero-padded to two digits (e.g. 1.0 -> "1.00").
use vars qw($VERSION);
# The dot is escaped so that only a literal "." may separate the two numbers;
# an unescaped "." would accept any single character there.
$VERSION = sprintf("%d.%02d", q$Revision: 1.0 $ =~ /(\d+)\.(\d+)/);

use WWW::Scraper(qw(3.00 generic_option trimLFs trimTags removeScriptsInHTML));

use strict;

# Module-level default request descriptor. init() passes it to
# SetScraperRequest() unless the caller supplies a SCRAPERREQUEST option, and
# generateQuery() overwrites its 'url' entry.
my $scraperRequest = {
    # This engine's method is QUERY.
    'type' => 'QUERY',

    # Basic URL on which to get the form used to build the query.
    'url' => 'http://google.com',

    # Defaults, keyed by native field name.
    'nativeQuery'    => undef,
    'nativeDefaults' => { 'rootUrl' => '1' },

    # Translations from canonical fields to native fields.
    'defaultRequestClass' => undef,
    'fieldTranslations'   => {
        '*' => { '*' => '*' },
    },

    # Miscellaneous options for the Scraper operation.
    'cookies' => 0,
};

# Use the object's 'native_query' value as the URL of the request.
# The value is stored in the module-level $scraperRequest hash (under 'url')
# and then returned to the caller.
sub generateQuery {
    my $self = shift;

    my $targetUrl = $self->{'native_query'};
    $scraperRequest->{'url'} = $targetUrl;

    return $targetUrl;
}

# Scraper frame nesting FORM -> INPUT and FORM -> SELECT -> OPTION under HTML.
# NOTE(review): this structure is identical to $scraperFrame1, and init() in
# this file only uses $scraperFrame1 and $scraperFrame2 - presumably this copy
# is a leftover; confirm before removing it.
my $scraperFrame = do {
    # Built innermost-first. Every 'HIT*' entry has the shape
    # [ 'HIT*', 'ScraperDiscovery::<NAME>', [ sub-frames ] ].
    my $optionHit = [ 'HIT*', 'ScraperDiscovery::OPTION', [ [ 'OPTION' ] ] ];
    my $selectHit = [ 'HIT*', 'ScraperDiscovery::SELECT',
                      [ [ 'SELECT', [ $optionHit ] ] ] ];
    my $inputHit  = [ 'HIT*', 'ScraperDiscovery::INPUT', [ [ 'INPUT' ] ] ];
    my $formHit   = [ 'HIT*', 'ScraperDiscovery::FORM',
                      [ [ 'FORM', [ $inputHit, $selectHit ] ] ] ];

    [ 'HTML', [ $formHit ] ];
};

# Scraper frame nesting FORM -> INPUT and FORM -> SELECT -> OPTION under HTML.
# init() installs this frame when PHASE is 1 or when no PHASE option is given.
my $scraperFrame1 = do {
    # Built innermost-first. Every 'HIT*' entry has the shape
    # [ 'HIT*', 'ScraperDiscovery::<NAME>', [ sub-frames ] ].
    my $optionHit = [ 'HIT*', 'ScraperDiscovery::OPTION', [ [ 'OPTION' ] ] ];
    my $selectHit = [ 'HIT*', 'ScraperDiscovery::SELECT',
                      [ [ 'SELECT', [ $optionHit ] ] ] ];
    my $inputHit  = [ 'HIT*', 'ScraperDiscovery::INPUT', [ [ 'INPUT' ] ] ];
    my $formHit   = [ 'HIT*', 'ScraperDiscovery::FORM',
                      [ [ 'FORM', [ $inputHit, $selectHit ] ] ] ];

    [ 'HTML', [ $formHit ] ];
};

# Scraper frame nesting BODY -> NEXT and BODY -> TABLE -> TR -> TD under HTML,
# with the table part wrapped in a 'MACRO' named TABLELOOP that is referenced
# again (via 'MACROX') from inside the TD entry.
# init() installs this frame when PHASE is 2.
my $scraperFrame2 = do {
    # Built innermost-first. Four separate, identical anchor entries.
    my @anchors = (
        [ 'A', 'url', 'urlCaption' ],
        [ 'A', 'url', 'urlCaption' ],
        [ 'A', 'url', 'urlCaption' ],
        [ 'A', 'url', 'urlCaption' ],
    );

    my $tdHit = [ 'HIT*', 'ScraperDiscovery::TD',
                  [ @anchors, [ 'TD' ], [ 'MACROX', 'TABLELOOP' ] ] ];
    my $trHit = [ 'HIT*', 'ScraperDiscovery::TR',
                  [ [ 'TR', [ $tdHit ] ] ] ];
    my $tableHit = [ 'HIT*', 'ScraperDiscovery::TABLE',
                     [ [ 'TABLE' ], $trHit ] ];
    my $tableLoop = [ 'MACRO', 'TABLELOOP', [ $tableHit ] ];

    # NOTE(review): unlike the other 'HIT*' entries, the third element here is
    # the frame [ 'DISCOVERNEXT' ] itself rather than a list of frames -
    # kept exactly as found; confirm this is what the frame parser expects.
    my $nextHit = [ 'HIT*', 'ScraperDiscovery::NEXT', [ 'DISCOVERNEXT' ] ];

    my $bodyHit = [ 'HIT*', 'ScraperDiscovery::BODY',
                    [ [ 'BODY', [ $nextHit, $tableLoop ] ] ] ];

    [ 'HTML', [ $bodyHit ] ];
};


# Configure the scraper's request descriptor and frame from the native
# options, then delegate to the parent class' init().
#
# Parameters:
#   $self           - the scraper object.
#   $subclass       - forwarded unchanged to SUPER::init().
#   $native_query   - forwarded unchanged to SUPER::init().
#   $native_options - optional hash ref (not forwarded); recognised keys:
#       SCRAPERREQUEST - request descriptor to use instead of the module-level
#                        default $scraperRequest.
#       PHASE          - 1 selects $scraperFrame1, 2 selects $scraperFrame2.
#                        A missing or false value selects $scraperFrame1.
#                        Any other value falls back to $scraperFrame1 with a
#                        warning, so a frame is always set by this method.
#
# Returns whatever SUPER::init() returns.
sub init {
    my ($self, $subclass, $native_query, $native_options) = @_;

    # Tolerate callers that pass no options at all.
    $native_options ||= {};

    $self->SetScraperRequest( $native_options->{'SCRAPERREQUEST'} || $scraperRequest );

    my $phase = $native_options->{'PHASE'};
    my $frame;
    if ( !$phase || $phase == 1 ) {
        $frame = $scraperFrame1;
    }
    elsif ( $phase == 2 ) {
        $frame = $scraperFrame2;
    }
    else {
        # Previously an unrecognised phase skipped SetScraperFrame() entirely.
        warn "WWW::Scraper::ScraperDiscovery::init: unsupported PHASE '$phase'; using the default frame\n";
        $frame = $scraperFrame1;
    }
    $self->SetScraperFrame($frame);

    # The scraper detail is explicitly set to undef.
    $self->SetScraperDetail(undef);
    return $self->SUPER::init($subclass, $native_query);
}

1;

__END__