/usr/local/CPAN/Bundle-WWW-Scraper-News/WWW/Scraper/CNN.pm


use strict;

package WWW::Scraper::CNN;

use vars qw($VERSION @ISA);
@ISA = qw(WWW::Scraper);
$VERSION = sprintf("%d.%02d", q$Revision: 1.00 $ =~ /(\d+)\.(\d+)/);

use WWW::Scraper(qw(1.24 generic_option addURL trimTags trimLFs));

my $scraperRequest = 
   { 
      'type' => 'GET'
     ,'formNameOrNumber' => 0
     ,'submitButton' => undef

     # This is the basic URL on which to build the query.
     ,'url' => 'http://cnn.looksmart.com/r_search?'
     #qc=&col=cnni&qm=0&st=1&nh=10&lk=1&rf=1&look=&venue=all&keyword=&qp=&comefrom=izch&isp=zch&search=0&key=Infospace
     ,'nativeQuery' => 'key'
     ,'nativeDefaults' => {
                             'qc' => ''
                            ,'col' => 'cnn'
                            ,'qm' => '0'
                            ,'st' => '1'
                            ,'nh' => '10'
                            ,'lk' => '1'
                            ,'rf' => '1'
                            ,'look' => ''
                            ,'venue' => 'all'
                            ,'keyword' => ''
                            ,'qp' => ''
                            ,'comefrom' => 'izch'
                            ,'isp' => 'zch'
                            ,'search' => '0'
                            ,'key' => undef
                          }
     ,'fieldTranslations' =>
             {
                 '*' =>
                     {    '*'             => '*'
                     }
             }
      # Some more options for the Scraper operation.
     ,'cookies' => 0
   };

my $scraperFrame =
      [ 'HTML', 
         [  
            [ 'HIT*' , 'News', 
               [  
                    [ 'DL',
                        [
                            [ 'DT', [ [ 'AN', 'url', 'Title' ] ] ]
                           ,[ 'DD', 'Description' ]
                        ]
                     ]
               ]
            ]
         ] 
      ];


my $scraperDetail =
      [ 'HTML', 
         [  
            [ 'BODY', '<html>', '</html>',
            [
                [ 'HIT' , 
                   [  
                        [ 'F', \&get_authors, 'authors']
                       ,[ 'F', \&get_description, 'description']
                       ,[ 'F', \&get_text, 'text', 'dateline', 'source']
                       ,[ 'F', \&get_section, 'section']
                       ,[ 'F', \&get_sub_section, 'sub_section']
                       ,[ 'F', \&get_creation_date, 'creation_date' ]
                       ,[ 'F', \&get_title, 'title']
                       ,[ 'F', \&get_posted, 'posted']
                   ]
                ]
            ]
            ]
         ] 
      ];



# Access methods for the structural declarations of this Scraper engine.
sub scraperRequest { $scraperRequest }
sub scraperFrame { $scraperFrame }
sub scraperDetail{ $scraperDetail }




sub testParameters {
    my ($self) = @_;

    if ( ref $self ) {
        $self->{'isTesting'} = 1;
    }
    
    my $isNotTestable = WWW::Scraper::isGlennWood()?0:'No testParameters provided.';
    return { 
             'SKIP' => $isNotTestable
            ,'testNativeQuery' => 'NASA'
            ,'expectedOnePage' => 9
            ,'expectedMultiPage' => 100
            ,'expectedBogusPage' => 0
           };
}









sub get_authors {
   my ($self, $hit, $dat) = @_;
#	my ($self, $cs, $ds) = @_;
	$dat =~ m{<meta name="AUTHOR" content="(.*?)">}si;
return ($1);
}

sub get_description {
   my ($self, $hit, $dat) = @_;
#	my ($self, $cs, $ds) = @_;
	$dat =~ m{<meta name="DESCRIPTION" content="(.*?)">}si;
return ($1);

}

sub get_text {
    my ($self, $hit, $dat) = @_;
#	my ($self, $cs, $ds) = @_;
	my $text = '';
    # preserve the <p> tags here for _get_dateline()
    $text = join '', ($dat =~ m{\n(<p>.*?</p>)}gsi);
    my ($dateline, $source) = $self->_get_dateline(\$text);
return ($text,$dateline,$source);
}

sub _get_dateline {
    my ($self, $text) = @_;
    # <P><a href="map.nevada.las.vegas.jpg">LAS VEGAS, Nevada</a> (IDG) --
    my $dtln = $1 if ( $$text =~ s{<[pbPB]>\s*([^-<]*?)\s*--}{}s );
    # Doing the tripTags before the above regex would allow us to capture datelines with <A>
    #  anchors in them, but we need a trimTags() that preserves selected tags to make that work.
    #  (or a much more elaborate regex here!)
    $dtln = $self->trimTags(undef, $dtln);
    my ($dateline, $source) = ($2,$3) if ( $dtln =~ m{^\s*(([^-<(]*)\s+)?\((\w+)\)$}s );
    unless ( $source ) {
        # <b>HONG KONG, China --</b>  
        ($dateline, $source) = ($1,$2) if ( $dtln =~ m{^\s*([^,]+)?\s*,\s*([^\s-]+)}s );
    }
    return ($dateline, $source);
}

sub get_section {
    my ($self, $hit, $dat) = @_;
#	my ($self, $cs, $ds) = @_;
	my ($section) = ($dat =~ m{<meta name="SECTION" content="(.*?)">}si);
    unless ( $section ) {
        $section = $1 if $dat =~ m{/([^.]+)\.([^.]+)\.story.gif};
    }
    unless ( $section ) {
        $section = $1 if $dat =~ m{<SPAN CLASS="Bnr1">\s*(\w+)\s*>?</SPAN>}s;
    }
return ($section);
}

sub get_sub_section {
    my ($self, $hit, $dat) = @_;
#	my ($self, $cs, $ds) = @_;
	my ($sub_section) = ($dat =~ m{<meta name="SUBSECTION" content="(.*?)">}si);
    unless ( $sub_section ) {
        $sub_section = $2 if $dat =~ m{/([^.]+)\.([^.]+)\.story.gif}s;
    }
    unless ( $sub_section ) {
        $sub_section = $1 if $dat =~ m{<SPAN CLASS="Bnr2">\s*(\w+)\s*>?</SPAN>}s;
    }
return ($sub_section);
}

sub get_title {
    my ($self, $hit, $dat) = @_;
#	my ($self, $cs, $ds) = @_;
    #<title>CNN.com - On September 11, final words of love - September  9, 2002</title>
    my ($title) = ($dat =~ m{<title>(.*?)</title>}si);
    $title =~ s{[\w.]+\s+\d+,\s+\d\d\d\d$}{};
    unless ( $title ) {
        ($title) = ($dat =~ m{<h1>(.*?)</h1>}si);
    }
return ($title);
}

sub get_creation_date {
    my ($self, $hit, $dat) = @_;
#	my ($self, $cs, $ds) = @_;
	$dat =~ m{<meta name="(publicationDate|DATE)" content="(.*?)">}si;
    my $date = $2;
    # or <META NAME="date" CONTENT="<!-- CNN date -->"> ? ? ?
    unless ( $date =~ m{^\d\d\d\d-\d\d-\d\d} ) {
        ($date) = m{<p class="timestamp">([\w\.]+\s+\d+,\s+\d\d\d\d)}si;
    }
    unless ( $date ) {
        my ($title) = ($dat =~ m{<title>(.*?)</title>}si);
        ($date) = ($title =~ m{([\w\.]+\s+\d+,\s+\d\d\d\d)$});
    }
    return ($date);
}

sub get_posted {
    my ($self, $hit, $dat) = @_;
    #document.write('<p><span class="Small">September  6, 2002 Posted: 0837 GMT<br><\/span><\/p>')
    my ($postedPre, undef, $postedPost) = ($dat =~ m{document\.write\('<p><span class="Small">(\w+\s+\d+,\s+\d\d\d\d)(\s+Posted:\s+)(\d\d\d\d\s+\w+)<br>'}s);
    unless ( $postedPre and $postedPost ) {
        # older docs: <p><span class="Small">August 6, 2001 Posted: 1603 GMT<br></span></p>
        ($postedPre, undef, $postedPost) = ($dat =~ m{<p><span class="Small">(\w+\s+\d+,\s+\d\d\d\d)(\s+Posted:\s+)(\d\d\d\d\s+\w+)<br>}s);
    }
    unless ( $postedPre and $postedPost ) {
        #<p class="timestamp">March 1, 2001<br> Web posted at: 1432 GMT</p>
        #<p class="timestamp">April 24, 2001<br> Web posted at: 1540 GMT</p>
        ($postedPre, undef, $postedPost) = ($dat =~ m{<p\s+class="timestamp">([\w\.]+\s+\d+,\s+\d\d\d\d)(<br> Web posted at:\s+)(\d\d\d\d\s+\w+)</p>}si);
        }
    # MORE: older documents.
    #<p><FONT FACE="Verdana, Arial, Helvetica, sans-serif" SIZE="1" color="#333333"><i><b>January 10, 2000</b><br>
    # Web posted at: 10:57 a.m. EST (1557 GMT)</i></font></p>
    
    return "$postedPre $postedPost" if ( $postedPre and $postedPost );
    return undef;
}
    
sub is_OK {
	my ($self, $cs, $ds) = @_;

	return (1, 'Registration Required')
		if ($ds->{'content'} =~ m{<title>Please register</title>}si);

	unless($ds->{content} =~ m{<div class="content">}si) {
		return (99, 'Unknown Error');
	}

	return (0);
}