Log::Parallel::ApacheCLF - parse apache common log format


Log-Parallel documentation Contained in the Log-Parallel distribution.

Index


Code Index:

NAME

Top

 Log::Parallel::ApacheCLF - parse apache common log format

SYNOPSIS

Top

 use Log::Parallel::ApacheCLF;

 my $parser = Log::Parallel::ApacheCLF->return_parser($fh, %info);

LOG PROCESSING CONFIG

Top

 sources:
   - name:          raw apache server logs
     hosts:         host1.domain
     path:          /var/apache_archive/%YYYY%.%MM%.%DD%{,.bz2}
     format:        ApacheCLF
     valid_from:    2009-01-01
     valid_to:      yesterday
 jobs:
   - name:          server logs
     destination:   server logs
     source:        raw apache server logs
     path:          '%DATADIR%/%YYYY%/%MM%/%DD%/%JOBNAME%.%DURATION%.%BUCKET%.%SOURCE_BKT%'
     valid_from:    2008-01-01
     valid_to:      yesterday
     frequency:     daily
     output_format: TSV
     use:           Log::Parallel::TSV Log::Parallel::ApacheCLF
     buckets:       20
     hosts:         host10,host11,host12,host13
     bucketizer:    $log->{server_time}

DESCRIPTION

Top

Parse the apache web server logs in Common Log Format. The fields from the apache logs are named as follows:

ip

The IP address header field. Sometimes -.

auth_user

The HTTP authenticated user.

server_time

The time, in unix epoch seconds, at which the server wrote the log line.

request

The HTTP request line. Eg: GET / HTTP/1.0.

status

The HTTP status code. 200, 301, etc.

bytes_sent

The number of bytes transferred.

user_agent

The HTTP UserAgent.

referrer

The HTTP Referer field.

This module can also be used to parse more extended Apache logs. Create a new module and invoke this one to do a bunch of the work. There are three extra construction arguments that can be used:

pre_rx

A regular expression that matches whatever comes before the regular Apache log format on each line. If this has saved matches, they'll be returned as an array: pre_match.

pre_rx_saved_match_count

If you have a pre_rx, and if that regular expression has saved matches, you must say how many for Log::Parallel::ApacheCLF to work. This is how.

post_rx

A regular expression that matches whatever comes after the regular Apache log format on each line.

If this has saved matches, they'll be returned as an array: post_match.

LICENSE

Top

This package may be used and redistributed under the terms of either the Artistic 2.0 or LGPL 2.1 license.


Log-Parallel documentation Contained in the Log-Parallel distribution.

# Parser plug-in that turns Apache access-log lines into hash records.
package Log::Parallel::ApacheCLF;

use strict;
use warnings;
# NOTE(review): Data::Dumper is not referenced by the code in this file.
use Data::Dumper;
require Log::Parallel::Parsers;
require Exporter;
# jd_timegm() converts a calendar date to epoch seconds for server_time.
use Time::JulianDay qw(jd_timegm);;
# NOTE(review): uri_unescape/uri_escape are not referenced by the code in this file.
use URI::Escape::XS qw(uri_unescape uri_escape);

our @ISA = qw(Log::Parallel::Parsers::BaseClass Exporter);
# Nothing is exported; the module is used through its class methods.
our @EXPORT = qw();

# Runs at load time.  register_parser() is not defined in this file;
# presumably it is inherited from Log::Parallel::Parsers::BaseClass -- confirm.
__PACKAGE__->register_parser();

# Verbosity threshold: the parser's warning helper stays silent for any
# message whose level is greater than this value.
our $warn_level = 1;

# Three-letter English month abbreviations in calendar order; the lookup
# table below maps each one to its 1-based month number.
my @month_abbr = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
my %mon_num;
@mon_num{@month_abbr} = (1 .. 12);

# Regex building blocks.
my $month_rx = qr/(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)/;	# a month abbreviation
my $num255_rx = qr/(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)/;			# decimal 0..255
my $ip_rx = qr/$num255_rx\.$num255_rx\.$num255_rx\.$num255_rx/;		# dotted-quad IPv4
my $host_rx = qr/(?:$ip_rx|unknown)/;						# IPv4 or the word "unknown"
my $quoted_rx = qr/(?:[^\\"]|\\.)*/;						# body of a "..." field, backslash escapes allowed

# return_parser($pkg, $fh, %info)
#
# Build and return an iterator (a code ref) over the log lines readable
# from $fh.  Each call to the iterator returns a hash ref describing the
# next parsable line, or undef once $fh is exhausted.  Lines that do not
# match the expected layout are reported with a warning and skipped.
#
# Recognised %info keys:
#   pre_rx                    regex for text preceding the Apache fields
#   pre_rx_saved_match_count  number of capture groups inside pre_rx
#   post_rx                   regex for text following the Apache fields
#                             (extra_rx is accepted as an alias)
#   host, filename            only used to label warnings
#
# Returned record keys: ip, auth_user, server_time (UTC epoch seconds),
# request, status, bytes_sent, referrer, user_agent, plus pre_match /
# post_match (array refs) when pre_rx / post_rx captured anything.
# A field logged as "-" is returned as '' for ip, auth_user, referrer
# and user_agent.
#
# Dies if a date carries an unknown month name.
sub return_parser
{
	my ($pkg, $fh, %info) = @_;

	# post_rx is the documented argument name; extra_rx is still honoured
	# because it is the key this code used to read.
	my $post_rx = $info{post_rx} || $info{extra_rx} || qr//;
	my $pre_rx = $info{pre_rx} || qr//;
	my $pre_rx_save_match_count = $info{pre_rx_saved_match_count} || 0;

	# Labels for warnings; avoid "uninitialized" noise when not supplied.
	my $host = defined $info{host} ? $info{host} : '?';
	my $filename = defined $info{filename} ? $info{filename} : '?';

	# Emit a warning with high-bit and control bytes made printable, so a
	# hostile or corrupt log line cannot garble the terminal or the log.
	my $safewarn = sub {
		my ($level, $err) = @_;
		return unless $level <= $warn_level;
		use bytes;
		$err =~ s/([\200-\377])/sprintf("M-%c",ord($1)&0177)/eg;
		$err =~ s/([\0-\37\177])/sprintf("^%c",ord($1)^64)/eg;
		my (undef, $file, $line) = caller;
		warn "$err at $file:$line processing $host:$filename\n";
	};

	my $line_number = 0;

	# Caches: consecutive lines almost always share the same date and
	# timezone, so midnight and the offset are only recomputed on change.
	my $midnight;
	my $day = '';
	my $offset;
	my $zone = '';

	return sub {
		# A named lexical is used rather than the global default variable
		# so that the caller's copy is never clobbered.
		while (defined(my $line = <$fh>)) {

			$line_number++;

			# - - - [19/Mar/2009:00:00:03 -0700] "HEAD / HTTP/1.0" 401 - "-" "-"
			#
			# In list context the match returns every capture group, so the
			# number of pre/post captures is not limited and no numbered
			# capture variables need to be copied by hand.
			my @match = $line =~ m{
				^
				$pre_rx

				(?: \S+ [ ] apache: [ ] )?		# optional prefix: server hostname, then apache:
				([^]["\@/]+?)				# forward ip, *1
				[ ]
				\S+					# ident user
				[ ]
				(\S+)					# apache auth user, *2
				[ ]
				\[
					(\d\d?/$month_rx/\d\d\d\d)	# apache date, *3
					:
					(\d\d)				# apache hour, *4
					:
					(\d\d)				# apache minute, *5
					:
					(\d\d)				# apache second, *6
					[ ]
					([-+]\d\d\d\d)			# apache timezone, *7
				\]
				[ ]
				"($quoted_rx)"				# request, *8
				[ ]
				(\d+)					# apache result code, *9
				[ ]
				(\S+)					# number of bytes sent, *10
				[ ]
				"($quoted_rx)"				# referrer, *11
				[ ]
				"($quoted_rx)"				# user agent, *12

				$post_rx
			}x;

			unless (@match) {
				chomp(my $shown = $line);
				$safewarn->(1, "Could not parse line $line_number: $shown");
				next;
			}

			# Capture layout: pre_rx groups, the 12 fixed groups, post_rx groups.
			my @pre = splice(@match, 0, $pre_rx_save_match_count);
			my ($ip, $auth_user, $date, $hh, $mm, $ss, $tzoff,
				$request, $status, $sent, $referrer, $user_agent) = splice(@match, 0, 12);
			# Whatever remains in @match was captured by post_rx.

			unless ($day eq $date) {
				my ($mday, $month, $year) = split(/\//, $date);
				my $mnum = $mon_num{$month} || die "month = $month ($date)";
				# jd_timegm takes a 0-based month, like timegm.
				$midnight = jd_timegm(0, 0, 0, $mday, $mnum-1, $year);
				$day = $date;
			}
			unless ($zone eq $tzoff) {
				my ($sign, $tzh, $tzm) = $tzoff =~ /^([+-])(\d\d)(\d\d)$/
					or die "bad timezone offset '$tzoff'";
				# Offset of local time from UTC, in seconds.
				$offset = ($sign eq '-' ? -1 : 1) * ($tzh * 3600 + $tzm * 60);
				$zone = $tzoff;
			}
			# The logged clock time is local; UTC = local - offset.
			my $time = $midnight - $offset + $hh * 3600 + $mm * 60 + $ss;

			# Apache writes "-" for an absent value; hand back '' instead.
			for my $field ($ip, $referrer, $user_agent, $auth_user) {
				$field = '' if ! defined($field) || $field eq '-';
			}

			return {
				ip		=> $ip,
				auth_user	=> $auth_user,
				server_time	=> $time,
				request		=> $request,
				status		=> $status,
				bytes_sent	=> $sent,
				referrer	=> $referrer,
				user_agent	=> $user_agent,
				@pre ? (pre_match => \@pre) : (),
				@match ? (post_match => \@match) : (),
			};
		}
		# End of input.
		return undef;
	};
}

1;

__END__