Mail::Abuse::Incident::Normalize - Filter the text of a report within an Email


Mail-Abuse documentation Contained in the Mail-Abuse distribution.

Index


Code Index:

NAME

Top

Mail::Abuse::Incident::Normalize - Filter the text of a report within an Email

SYNOPSIS

Top

  use Mail::Abuse::Report;
  use Mail::Abuse::Incident::Normalize;
  my $i = new Mail::Abuse::Incident::Normalize;
  my $report = new Mail::Abuse::Report (incidents => [$i] );

DESCRIPTION

Top

This class handles various important tasks related with recognizing an abuse report. The specific tassks are:

The parser based on this class must be the first in the list of parsers passed to the report object, so that all parser that need its services can access them automatically.

The following new keys are created into the corresponding report after running this class. These are:

header

A Mail::Header object with the header of the message.

body

Contains a reference to the body of the message, as a single scalar or string.

normalized

Set to the scalar Mail::Abuse::Incident::Normalize to indicate that this class did the normalization.

tz

The guessed timezone from the message. This might come from matching it in the body of the message, from the Date header or from other sources. This should be used as a "last-resort hint" by other Incident parsers.

The following method is implemented bu this class:

parse($report)

Pushes incidents into the given report, based on parsing of the text in the report itself.

It must return a list of objects of the same class, with the incident data (IP address, timestamp and other information) filled.

EXPORT

None by default.

HISTORY

Top

0.01

Original version; created by h2xs 1.2 with options

  -ACOXcfkn
	Mail::Abuse
	-v
	0.01

LICENSE AND WARRANTY

Top

This code and all accompanying software comes with NO WARRANTY. You use it at your own risk.

This code and all accompanying software can be used freely under the same terms as Perl itself.

AUTHOR

Top

Luis E. Muñoz <luismunoz@cpan.org>

SEE ALSO

Top

perl(1).


Mail-Abuse documentation Contained in the Mail-Abuse distribution.
package Mail::Abuse::Incident::Normalize;

require 5.005_62;

use Carp;
use strict;
use warnings;
use MIME::Parser;
use HTML::Parser;

use base 'Mail::Abuse::Incident';

				# The code below should be in a single line

our $VERSION = do { my @r = (q$Revision: 1.11 $ =~ /\d+/g); sprintf " %d."."%03d" x $#r, @r };

sub parse
{
    my $self	= shift;
    my $rep	= shift;

				# Phase 0 - Build the parser we'll
				# be using this time. We do so with
				# closures, so that they can reach
				# $rep w/o black magic

    my $p = HTML::Parser->new
	(
	 api_version => 3,
	 default_h		=> [ "" ],
	 start_h => [ sub {  $ {$rep->body} .= 
				 "[IMG " . 
				     ($_[1]->{alt} || $_[1]->{src} || 'n/a') . 
					 "]\n" 
					     if $_[0] && $_[0] eq 'img';
			 }, "tagname, attr" ],
	 text_h => [ sub { $ {$rep->body} .= shift; }, "dtext" ],
	 ) or return;
    
    $p->ignore_elements(qw(script style));
    $p->strict_comment(1);

    $self->html_parser($p);

    my $parser = new MIME::Parser;
    return unless $parser;

    $parser->ignore_errors(1);
    $parser->decode_headers(1);
    $parser->extract_nested_messages(1);
    $parser->extract_uuencode(1);

				# Phase 1 - Decode and find a suitable
				# part containing the message text.
				# Store its headers and body

    my $e = eval { $parser->parse_data($rep->text); };

    if ($@ or !$e)
    {
	$parser->filer->purge;
	return;
    }

    my $text;
    $rep->header($e->head);
    $rep->body(\$text);

    my $decoded_body = ($self->decode_parts($rep, 'any', $e) || '');
    $rep->body(\$decoded_body) if $decoded_body;

    $rep->normalized(ref $self);
    $self->html_parser(undef);
    $parser->filer->purge;

    return unless $rep->body and $ {$rep->body};

				# Phase 2 - This might be actually a
				# reply of forward. Remove any
				# indications from the message body

    $ {$rep->body} =~ s/
		^(([\t ]|[^\w\s])*	# 0 or more non-space, non-word chars
	  	  [^\w\s]+		# followed by one or more non-space, non-words
	  	  )+			# repeated one or more times,
	 	 [\t ]*			# followed by zero or more spaces.
		//xmsg;

				# Phase 3 - Try to recognize a
				# timezone in the body. If this fails,
				# try at the Date: header. Finally,
				# use UTC as the guess 

    # These timezones were taken w/o permission from Time::Zone.
    # Thanks to the authors anyway :)

    my %Zones =
	(
	 "GMT"	=> '+0000',	# Greenwich Mean
	 "UT"	=> '+0000',	# Universal (Coordinated)
	 "UTC"	=> '+0000',
	 "WET"	=> '+0000',	# Western European
	 "WAT"	=> '-0100',	# West Africa
	 "AT"	=> '-0200',	# Azores
	 "FNT" 	=> '-0200',	# Brazil Time (Extreme East)
	 "BRT"	=> '-0300',	# Brazil Time (East Standard)
	 "MNT"	=> '-0400',	# Brazil Time (West Standard)
	 "EWT"	=> '-0400',	# U.S. Eastern War Time
	 "AST"	=> '-0400',	# Atlantic Standard
	 "VET"	=> '-0400',	# Venezuela Standard Time
	 "EST"	=> '-0500',	# Eastern Standard
	 "ACT"	=> '-0500',	# Brazil Time (Extreme West - Acre)
	 "CST"	=> '-0600',	# Central Standard
	 "MST"	=> '-0700',	# Mountain Standard
	 "PST"	=> '-0800',	# Pacific Standard
	 "YST"	=> '-0900',	# Yukon Standard
	 "HST"	=> '-1000',	# Hawaii Standard
	 "CAT"	=> '-1000',	# Central Alaska
	 "AHST"	=> '-1000',	# Alaska-Hawaii Standard
	 "NT"	=> '-1100',	# Nome
	 "IDLW"	=> '-1200',	# International Date Line West
	 "CET"	=> '+0100',	# Central European
	 "MEZ"	=> '+0100',	# Central European (German)
	 "ECT"	=> '+0100',	# Central European (French)
	 "MET"	=> '+0100',	# Middle European
	 "MEWT"	=> '+0100',	# Middle European Winter
	 "SWT"	=> '+0100',	# Swedish Winter
	 "SET"	=> '+0100',	# Seychelles
	 "FWT"	=> '+0100',	# French Winter
	 "EET"	=> '+0200',	# Eastern Europe, USSR Zone 1
	 "UKR"	=> '+0200',	# Ukraine
	 "BT"	=> '+0300',	# Baghdad, USSR Zone 2
	 "IT"	=> '+0330',	# Iran
	 "ZP4"	=> '+0400',	# USSR Zone 3
	 "ZP5"	=> '+0500',	# USSR Zone 4
	 "IST"	=> '+0530',	# Indian Standard 
	 "ZP6"	=> '+0600',	# USSR Zone 5
	 "WST"	=> '+0800',	# West Australian Standard
	 "HKT"	=> '+0800',	# Hong Kong
	 "CCT"	=> '+0800',	# China Coast, USSR Zone 7
	 "JST"	=> '+0900',	# Japan Standard, USSR Zone 8
	 "KST"	=> '+0900',	# Korean Standard
	 "CAST"	=> '+0930',	# Central Australian Standard 
	 "EAST"	=> '+1000',	# Eastern Australian Standard
	 "GST"	=> '+1000',	# Guam Standard, USSR Zone 9
	 "NZT"	=> '+1200',	# New Zealand
	 "NZST"	=> '+1200',	# New Zealand Standard
	 "IDLE"	=> '+1200',	# International Date Line East
	 );

    # In the following matches, we'll attempt to use the inverted
    # message to avoid mistaking a timezone in a forwarded header for
    # our target timestamp...

    my $rev = join "\n", reverse split /\n/, $ {$rep->body};

				# Step 3a - Try to locate a numeric
				# timezone in the format [+-]\d\d:?\d\d

#    warn "Body: ${$rep->body}\n*********************\n";
#    warn "Rev: $rev\n******************\n";

    if ($rev =~ m!([-+]\d\d:?\d\d)!ms) 
    {
	$rep->tz($1);
    }

				# Step 3b - If failed, try to locate the
				# longest timezone posssible

    unless ($rep->tz)
    {
	my $zone = '';

	for my $tz ( sort keys %Zones )
	{
	    next if length($tz) < length($zone);
	    next unless ($rev =~ m/\W ${tz} \W /msx
			 or $rev =~ m/^ ${tz} \W /msx
			 or $rev =~ m/\W ${tz} $ /msx
			 or $rev =~ m/^ ${tz} $ /msx);
#	    warn "*** Zone $tz matched\n";
	    $zone = $tz;
	}

	$rep->tz($Zones{$zone}) if $zone;
    }

				# Step 3c - If failed, declare UTC

    $rep->tz('UTC') unless defined $rep->tz;

    return; 
}

sub decode_parts
{
    my $self	= shift;
    my $rep	= shift;
    my $type	= shift;
    my $e	= shift;

    if (my @parts = $e->parts)
    {
	my $r = '';
	$r .= ($self->decode_parts($rep, $type, $_) || '') for @parts;
	return $r;
    }
    elsif (my $body = $e->bodyhandle)
    {
	my $mime = $e->head->mime_type;
	if (grep { $mime eq $_ } qw(text/plain message/rfc822))
	{
	    return $body->as_string;
	}
	elsif ($type eq 'any' and $mime eq 'text/html')
	{
	    my $b = $body->as_string;
	    $self->html_parser->parse($b);
	    return $ {$rep->body};
	}
    }
    return;			# False by default
}

__END__