| News-Search documentation | Contained in the News-Search distribution. |
News::Search - Usenet news searching toolkit
use News::Search; my $ns = News::Search->new(); $ns->search_for(\@ARGV); my %newsarticles = $ns->SearchNewsgroups;
News::Search searches Usenet news postings.
It can be used to search local news groups that google doesn't cover. Or, even for news groups that are covered by google, it can give you all the hits in one file, in the format that you prescribed.
You can also use the provided news-search in cron to watch specific news groups for specific criteria and mail you reports according to the interval you set.
Initialize the object.
my $searcher = News::Search->new();
or,
my $searcher = News::Search->new( {} );
which are the same as:
my $searcher = News::Search->new( {
nntp_server => 'news',
msg_headers => 'Date|From', # + Subject, which is always printed
msg_limit => 200,
verbose => 0,
on_group => \&default_group_handler,
on_message => \&default_message_handler,
} );
What shown above are default settings. Any of the %config_param attribute can be omitted when calling the new method.
The new is the only class method. All the rest methods are object methods.
The following object attributes are accessible.
The nntp server to search.
Message headers to print.
Maximum number of posts to search (not return).
Be verbose.
Handler for group starts. Refer to news-search for the example.
Handler for news message. Refer to news-search for the example.
Provide the set_val to change the attribute, omitting it to retrieve the attribute value. E.g.,
$searcher->nntp_server("news.easysw.com");
$searcher->search_for(\@ARGV);
Command line parameter handling. Refer to news-search section "command line arguments" for details.
Search the given newsgroups with the given criteria:
my %newsarticles = $ns->SearchNewsgroups;
foreach my $article (values %newsarticles) {
# deal with $article->{"SUBJECT"}, @{$article->{"HEADER"}})
# and $article->{"BODY"}
}
Refer to news-search for usage example.
Please report any bugs or feature requests to bug-news-search at rt.cpan.org, or through
the web interface at http://rt.cpan.org/NoAuth/ReportBug.html?Queue=News-Search. I will be notified, and then you'll
automatically be notified of progress on your bug as I make changes.
You can find documentation for this module with the perldoc command.
perldoc News::Search
You can also look for information at:
SUN, Tong <suntong at cpan.org>
http://xpt.sourceforge.net/
Copyright 2003-2008 Tong Sun, all rights reserved.
This program is released under the BSD license.
| News-Search documentation | Contained in the News-Search distribution. |
package News::Search; use warnings; use strict; # @Author: Tong SUN, (c)2001-2008, all right reserved # @Version: $Date: 2008/11/04 17:19:30 $ $Revision: 1.15 $ # @HomeURL: http://xpt.sourceforge.net/ # {{{ LICENSE: # # Permission to use, copy, modify, and distribute this software and its # documentation for any purpose and without fee is hereby granted, provided # that the above copyright notices appear in all copies and that both those # copyright notices and this permission notice appear in supporting # documentation, and that the names of author not be used in advertising or # publicity pertaining to distribution of the software without specific, # written prior permission. Tong Sun makes no representations about the # suitability of this software for any purpose. It is provided "as is" # without express or implied warranty. # # TONG SUN DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL ADOBE # SYSTEMS INCORPORATED AND DIGITAL EQUIPMENT CORPORATION BE LIABLE FOR ANY # SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER # RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF # CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # # }}} # {{{ POD, Intro:
# }}} # {{{ Global Declaration: # ============================================================== &us === # ............................................................. Uses ... # -- global modules use Carp; use Net::NNTP; use base qw(Class::Accessor::Fast); # ============================================================== &cs === # ................................................. Constant setting ... # our @EXPORT = ( ); # may even omit this line our $VERSION = sprintf("%d.%02d", q$Revision: 1.15 $ =~ /(\d+)\.(\d+)/); # }}} # ############################################################## &ss ### # ................................................ Subroutions start ...
News::Search->mk_accessors(qw(nntp_server msg_headers msg_limit verbose on_group on_message nntp_handle newsgroups nntp_query)); my %config = ( nntp_server => 'news', msg_headers => 'Date|From', # + Subject, which is always printed msg_limit => 200, verbose => 0, on_group => \&default_group_handler, on_message => \&default_message_handler, ); my $verbose; sub new { my ($class, $arg_ref) = @_; my $self = $class->SUPER::new({%config, %$arg_ref}); $verbose = $self->verbose; return $self; }
sub search_for { my ($self, $array_ref) = @_; my $nntp_server; $nntp_server = $self->nntp_server; $nntp_server = $ENV{"NNTPSERVER"} if $ENV{"NNTPSERVER"}; my $nntp; if (defined($ENV{DEBUG}) && $ENV{DEBUG} eq "1") { $nntp = Net::NNTP->new($nntp_server, Debug=>'On', Timeout=>10) || croak "Cant connect to News Server: $@"; } else { $nntp = Net::NNTP->new($nntp_server) || croak "Cant connect to News Server: $@"; } my @newsgroups; my %args; foreach (@$array_ref) { if (/=/) { # key/value pair my ($name, $value) = split(/=/); $name = lc $name; $args{$name} = $value; } else { # group name my $ngname = $_; if (index($ngname, "\*") > -1) { # have wildcard (*) in group name. my $nntplist = $nntp->list() || die "Cannot list newsgroups"; $ngname =~ s/\*/.*/g; foreach (sort(keys(%$nntplist))) { if (/$ngname/) { push(@newsgroups, $_); } } } else { push(@newsgroups, $ngname); } } } print STDERR "Searching the top ". $self->msg_limit. " messages " . " in newsgroups: @newsgroups...\n\n" if $verbose; $self->nntp_handle($nntp); $self->newsgroups(\@newsgroups); $self->nntp_query(\%args); } # default handler for group starts ... sub default_group_handler { my $newsgroup = shift; #print STDERR "\n\nSearching group '$newsgroup'\n\n"; } # default handler for news message ... sub default_message_handler { print STDERR "." if $verbose; } sub dbg_msg { my $show_msg = shift; my $show_level = shift; $show_level = 1 unless $show_level; return unless $verbose >= $show_level; warn "[News::Search] $show_msg\n"; }
sub SearchNewsgroups { my $self = shift; my ($newsgroups) = @_; $newsgroups = $self->{newsgroups} unless $newsgroups; my $nntp = $self->{nntp_handle}; my $args = $self->{nntp_query}; my %newsarticles; foreach my $newsgroup (@$newsgroups) { my ($first, $last) = ($nntp->group($newsgroup))[1,2]; #warn "] $first => $last\n"; if (($first == 0) && ($first == $last)) { next; } $first = $last - $self->msg_limit if $last - $self->msg_limit > $first; #warn "] $first => $last\n" if $verbose; # == news article loop $self->{on_group}->($newsgroup); my $msg_headers = $self->msg_headers; for ($nntp->nntpstat($first);$nntp->next() || last;) { my $msghead = $nntp->head(); unless(defined($msghead)){ dbg_msg "No message head found"; next; } # Ignore html postings if(arrary_search("Content-Type: text/html",$msghead)){ dbg_msg "html posting ignored (found in head)"; next; } my ($msgfound, $msgsubj, $msgfrom, $newsarticle) = SearchMessage($nntp, $msghead, $args); next unless $msgfound; $self->{on_message}->($newsgroup, $msghead, $newsarticle); # Ignore html postings if($newsarticle =~ "Content-Type: text/html"){ dbg_msg "html posting ignored (found in body)"; next; } # zap excessive spaces $newsarticle =~ s/\n(\s*\n){2,}/\n\n/; # eliminate duplicated posts #$newsarticles{"$msgfrom $msgsubj"} = $newsarticles{"$msgfrom"} = { "SUBJECT" => $msgsubj, "HEADER" => [ grep(/^($msg_headers): /, @$msghead) ], #"BODY" => $newsarticle, "BODY" => $newsarticle }; } } $nntp->quit(); return %newsarticles; } # message search sub SearchMessage($$$){ my ($nntp, $msghead, $args, ) = @_; my $headmatched = my $bodymatched = 0; my $msgfrom = "nofrom"; my $msgsubj = "nosubj"; my $i = 0; # -- message head loop #warn "] @$msghead\n"; foreach my $headline (@$msghead) { chomp($headline); $headline =~ /^([^:]+): /; my $argname = lc $1; my $argval = "$'"; $msgfrom = $argval if ($argname eq 'from'); $msgsubj = $argval if ($argname eq 'subject'); # look for search patterns if (defined($args->{$argname})) { $i++; if ($argval =~ m/$args->{$argname}/i) { #warn "] <$args->{$argname}> $argname => $argval\n"; $headmatched = 1; } } # look for ignore patterns if (defined($args->{"no$argname"})) { if ($argval =~ m/$args->{"no$argname"}/i) { return (0, undef, undef, undef); } } } $msgsubj =~ s/^\w+: //; # remove re: fw:, etc #warn "] headmatched = $i\n"; if ($i == 0 && defined($args->{"body"})){ #warn "] search in the body only\n"; $headmatched = 1; } my $msgbodyfh = $nntp->bodyfh() || Carp::shortmess "Can't get body filehandle of article\n"; # get the whole body my $newsarticle = ''; while (my $bodyline=<$msgbodyfh>) { $newsarticle .= $bodyline; } # Ignore html postings #next if $newsarticle =~ m{^Content-Type: text/html|Mississauga|Scarborough|Etobicoke}mi; if (defined($args->{"body"})) { if ($newsarticle =~ m/$args->{"body"}/i) { $bodymatched = 1; } } else { # not searching the body $bodymatched = 1; } return ($headmatched == 1 && $bodymatched == 1, $msgsubj, $msgfrom, $newsarticle); } sub arrary_search($$){ my ($look_for, $arrary_ref) = @_; my $is_there = 0; foreach my $elt (@$arrary_ref) { if ($elt =~ /$look_for/) { $is_there = 1; last; } } return $is_there; } # {{{ POD, Appendixes:
# }}} 1; # End of News::Search