WWW::Scraper::NorthernLight - Scrapes NorthernLight.com


Scraper documentation Contained in the Scraper distribution.

Index


Code Index:

NAME

Top

WWW::Scraper::NorthernLight - Scrapes NorthernLight.com

SYNOPSIS

Top

    require WWW::Scraper;
    $search = new WWW::Scraper('NorthernLight');




DESCRIPTION

Top

This class is a NorthernLight specialization of WWW::Search. It handles making and interpreting NorthernLight searches at http://www.NorthernLight.com.

This class exports no public interface; all interaction should be done through WWW::Search objects.

OPTIONS

Top

None at this time (2001.05.06)

search_url=URL

Specifies which URL to query with the NorthernLight protocol. The default is http://www.northernlight.com/power.html

search_debug, search_parse_debug, search_ref - Specified in WWW::Search.

SEARCH FIELDS

Top

displayResultsPerPage - Results per Page

"5" => 5
"10" => 10
"20" => 20
"50" => 50
"100" => 100

postingAge - Age of Posting

"0" => any time
"1" => 1 day
"3" => 3 days
"7" => 1 week
"8" => 2 weeks
"10" => 1 month

workTermTypeIDs - Work Term

"1" => Full Time
"2" => Part Time
"3" => Contract
"4" => Temporary/Seasonal
"5" => Internship

countyIDs - Job Location-County

"0" => Any
"1" => Alameda
"2" => Contra Costa
"3" => Marin
"4" => Napa
"5" => San Benito
"6" => San Francisco
"7" => San Mateo
"8" => Santa Clara
"9" => Santa Cruz
"10" => Solano
"11" => Sonoma
"12" => Other

jobPostingCategoryIDs - Job Category

"0" => Any
"1" => Accounting/Finance
"2" => Administrative/Clerical
"3" => Advertising
"4" => Aerospace/Aviation
"5" => Agricultural
"6" => Architecture
"7" => Arts/Entertainment
"8" => Assembly
"9" => Audio/Visual
"10" => Automotive
"11" => Banking/Financial Services
"12" => Biotechnology
"13" => Bookkeeping
"14" => Business Development
"15" => Child Care Services
"16" => Colleges & Universities
"17" => Communications/Media
"18" => Computer
"19" => Computer - Hardware
"20" => Computer - Software
"21" => Construction
"22" => Consulting/Professional Services
"23" => Customer Service/Support
"24" => Data Entry/Processing
"25" => Education/Training
"26" => Engineering
"27" => Engineering - Civil
"28" => Engineering - Hardware
"29" => Engineering - Software
"30" => Environmental
"31" => Executive/Management
"32" => Fund Raising/Development
"33" => Government/Civil Service
"34" => Graphic Design
"35" => Health Care/Health Services
"36" => Hospitality/Tourism
"37" => Human Resources
"38" => Information Technology
"39" => Insurance
"40" => Internet/E-Commerce
"41" => Law Enforcement/Security
"43" => Maintenance/Custodial
"44" => Manufacturing
"45" => Marketing
"46" => Miscellaneous
"47" => Non-Profit
"48" => Pharmaceutical
"49" => Printing/Publishing
"50" => Property Management/Facilities
"51" => Public Relations
"74" => Purchasing
"52" => QA/QC
"53" => Radio/Television/Film/Video
"54" => Real Estate
"57" => Receptionist
"55" => Recruiting/Staffing
"56" => Research
"58" => Restaurant/Food Service
"59" => Retail
"60" => Sales
"61" => Sales - Inside/Telemarketing
"62" => Sales - Outside
"63" => Security/Investment
"64" => Shipping/Receiving
"65" => Social Work/Services
"66" => Technical Support
"67" => Telecommunications
"68" => Training
"69" => Transportation
"70" => Travel
"71" => Warehouse
"72" => Web Design
"73" => Writer

AUTHOR

Top

WWW::Scraper::NorthernLight is written and maintained by Glenn Wood, http://search.cpan.org/search?mode=author&query=GLENNWOOD.

COPYRIGHT

Top


Scraper documentation Contained in the Scraper distribution.

# Scraper engine module for NorthernLight; everything below lives in this package.
package WWW::Scraper::NorthernLight;

#####################################################################

use strict;
# Declare the package globals assigned below so they pass `use strict`.
use vars qw($VERSION @ISA);
# Inherit from WWW::Scraper.
@ISA = qw(WWW::Scraper);
# Version string built from the RCS/CVS Revision keyword: the two numeric
# groups are formatted as major.two-digit-minor (revision 1.0 gives "1.00").
$VERSION = sprintf("%d.%02d", q$Revision: 1.0 $ =~ /(\d+)\.(\d+)/);

# Empty import list: Carp is loaded but none of its functions are imported.
use Carp ();
# Requests generic_option, addURL and trimTags from WWW::Scraper.
# NOTE(review): the leading 2.27 is presumably treated by the exporter as a
# minimum required version of WWW::Scraper - confirm.
use WWW::Scraper(qw(2.27 generic_option addURL trimTags));
use WWW::Scraper::FieldTranslation;

# Request description for this engine; returned by the scraperRequest() accessor.
# NOTE(review): the meaning of each key is defined by WWW::Scraper, not in this
# file - the remarks below restate the original author's notes; confirm there.
my $scraperRequest = {
    # Query generation type, plus the form and submit button it refers to.
    'type'             => 'FORM',
    'formNameOrNumber' => 'powSearch',
    'submitButton'     => 'search',

    # Base URL on which the query is built.
    'url' => 'http://www.northernlight.com/power.html',

    # Mapping of Scraper attributes onto the engine's native input fields.
    'nativeQuery'       => 'qr',
    'nativeDefaults'    => { 'qr' => undef },
    'fieldTranslations' => {
        '*' => {
            'skills' => 'qr',
            # Disabled translations, kept for reference:
            #   'payrate'   => undef
            #   'locations' => new WWW::Scraper::FieldTranslation('NorthernLight', 'Job', 'locations')
            '*' => '*',
        },
    },

    # Additional options for the Scraper operation.
    'cookies' => 0,
};

# Parsing frame for NorthernLight result pages; passed to the parent class by
# the scraperFrame() accessor. The nested frame is assembled bottom-up from
# named pieces so each level can be read on its own.
# NOTE(review): the semantics of the frame tags (COUNT, NEXT, BODY, HIT*, SNIP,
# REGEX, AQ, ...) are defined by WWW::Scraper, not in this file - confirm there.
my $scraperFrame = do {
    # First cell of a result row: a SPAN stored as 'number'.
    my $numberCell = [ 'TD', [ [ 'SPAN', 'number' ] ] ];

    # Nested table inside the second cell, carrying a second relevance/link pair.
    my $secondaryTable = [
        'TABLE',
        [
            [ 'TR' ],
            [ 'REGEX', '(\d+)%:', 'secondRelevance' ],
            [ 'A', 'secondUrl', 'secondTitle' ],
        ],
    ];

    # Second cell of a result row: link, relevance, source, description, etc.
    my $detailCell = [
        'TD',
        [
            [ 'A', 'url', 'title' ],
            [ 'REGEX', '<!--NLResultRelevanceStart-->(\d+)% -', 'relevance' ],
            [ 'REGEX', '<!--NLResultRelevanceEnd-->(.*?)&nbsp;', 'source' ],
            [ 'REGEX', '</b>(.*?)<br>', 'description' ],
            [ 'REGEX', '<!-- Misc Block --><!-- \d+ -->(.*?)<!-- Misc Block --><!-- \d+ -->', 'miscBlock' ],
            $secondaryTable,
            # Disabled: ['SPAN', 'avail'] - per the original author it needs better
            # treatment of the <SPAN> at the top of this <TD> before it can work.
            [ 'AQ', 'more\s+results', 'moreResultsUrl', undef ],
            # The page has an "<!-- Inline Clustering -->" marker after this point.
        ],
    ];

    # One result row. The SNIP pattern targets comment runs such as
    #   <!--  --><!-- <td>&nbsp;</td> -->
    my $resultRow = [
        'TR',
        [
            [ 'SNIP', '<!--[^>]*?<td>.*?-->', [ $numberCell, $detailCell ] ],
        ],
    ];

    # Each hit is delimited by the NLResultStart / NLResultEnd markers.
    my $hit = [
        'HIT*',
        [
            [ 'BODY', '<!--NLResultStart-->', '<!--NLResultEnd-->', [ $resultRow ] ],
        ],
    ];

    [
        'HTML',
        [
            # Sample of the text the COUNT pattern targets:
            #   </b> found <b>10,032,977 items</b>
            [ 'COUNT', 'found\s+<b>([0-9,]+)\s+items?</b>' ],
            [ 'NEXT', 'alt="Next Page"' ],
            [ 'BODY', '<!--NLBannerStart-->', '<!--NLResultListEnd-->', [ $hit ] ],
        ],
    ];
};



# Access methods for the structural declarations of this Scraper engine.
# Returns the file-level $scraperRequest hash reference (the same reference on
# every call, not a copy).
sub scraperRequest {
    return $scraperRequest;
}
# Passes the file-level $scraperFrame structure to the parent class's
# scraperFrame method and returns whatever that call returns.
sub scraperFrame {
    my ($self) = @_;
    return $self->SUPER::scraperFrame($scraperFrame);
}
# This engine declares no detail frame: the accessor always yields undef.
# An explicit undef (rather than a bare return) is used so that list-context
# callers still receive a single undef element.
sub scraperDetail {
    return undef;
}

# Returns a fresh hash reference of test parameters for this engine.
#
# May be called on the class or on an instance. When the invocant is a
# reference, its 'isTesting' entry is set to 1 as a side effect.
#
# Returned keys: 'SKIP' (a message string), 'testNativeQuery' (the query text)
# and the three expected-count entries 'expectedOnePage', 'expectedMultiPage'
# and 'expectedBogusPage'.
# NOTE(review): how the test harness interprets these keys is not visible in
# this file - confirm against WWW::Scraper.
sub testParameters {
    my $self = $_[0];

    # A plain class-name invocant has nowhere to store the flag.
    $self->{'isTesting'} = 1 if ref $self;

    my %parameters = (
        'SKIP'              => "NorthernLight's search engine seems to be down these days!?",
        'testNativeQuery'   => 'search scraper',
        'expectedOnePage'   => 9,
        'expectedMultiPage' => 12,
        'expectedBogusPage' => 0,
    );

    return \%parameters;
}

1;