/usr/local/CPAN/DSlib/DS/Transformer/StatCounter.pm
#!perl
# ########################################################################## #
# Title: Gather counts of selected fields
# Creation date: 2007-03-05
# Author: Michael Zedeler
# Description: Counts the number of occurrences of various field values
# Data Stream class
# Data transformer
# File: $Source: /data/cvs/lib/DSlib/lib/DS/Transformer/StatCounter.pm,v $
# Repository: kronhjorten
# State: $State: Exp $
# Documentation: inline
# Recipient: -
# ########################################################################## #
package DS::Transformer::StatCounter;
use base qw{ DS::Transformer };
use strict;
use Carp::Assert;
use IO::Handle;
# Module version: copied from the package variable $DS::VERSION as it is
# when this file is compiled (presumably set by the DS distribution's main
# module - confirm that it is loaded before this one).
our ($VERSION) = $DS::VERSION;
# Revision number ("major.minor") extracted from the CVS Revision keyword
# string by the regular expression match in list context.
our ($REVISION) = '$Revision: 1.1 $' =~ /(\d+\.\d+)/;
# Constructor.
#
# Parameters:
#   $typespec, $source - passed unchanged to the parent class constructor
#   $count_config      - array reference listing the names of the fields
#                        to count, outermost field first
# Returns the new object, with an empty statistics tree attached.
sub new {
    my $class = shift;
    my ( $typespec, $source, $count_config ) = @_;

    my $self = $class->SUPER::new( $typespec, $source );

    # Keep the field list and start out with no statistics gathered
    @{$self}{ qw( count_config stats ) } = ( $count_config, {} );

    return $self;
}
# Process a single row: update the counters for the configured fields and
# pass the row on untouched.
#
# Parameters:
#   $row - hash reference holding the field values of the current row
# Returns the very same $row.
sub process {
    my $self = shift;
    my ( $row ) = @_;

    my ( $fields, $stats ) = @{$self}{ qw( count_config stats ) };
    $self->_update_counts( $row, $fields, $stats );

    return $row;
}
# Update the nested statistics tree with the values found in one row.
#
# Parameters:
#   $row        - hash reference with the field values of the row
#   $field_list - array reference with the field names to count; each field
#                 is counted within the value of the field preceding it
#   $stats      - hash reference holding the statistics tree to update
#
# Layout of one level of the tree:
#   $level->{FIELD}->{VALUE}->{stats}->{count}  - number of rows seen
#   $level->{FIELD}->{VALUE}->{subfield}        - next level (only present
#                                                 when more fields follow)
sub _update_counts {
    my ( $self, $row, $field_list, $stats ) = @_;

    my $last_index = $#$field_list;
    my $level      = $stats;

    foreach my $depth ( 0 .. $last_index ) {
        # The field handled at this depth and its value in the current row
        my $field = $field_list->[$depth];
        my $value = $row->{$field};

        # Create any missing data structures on the way down
        $level->{$field} = {} unless defined $level->{$field};
        my $values = $level->{$field};

        $values->{$value} = { stats => { count => 0 } }
            unless defined $values->{$value};
        my $node = $values->{$value};

        # This is where the actual statistics are being stored
        $node->{stats}->{count}++;

        # The innermost field has no subfield level below it
        next if $depth >= $last_index;

        # Descend, so that the next field is counted within this value
        $node->{subfield} = {} unless defined $node->{subfield};
        $level = $node->{subfield};
    }
}
# Print the gathered statistics as an indented tree.
#
# Parameters:
#   $fh - optional handle object providing a print() method. When it is
#         left out, a new write handle is opened on the file descriptor
#         of STDOUT.
#
# The first line written holds the configured field names separated by
# two tabs. The rest of the output is produced by _print().
#
# Returns whatever _print() returns.
# Dies if no handle was given and a handle on STDOUT could not be opened.
sub print {
    my( $self, $fh ) = @_;
    unless( defined( $fh ) ) {
        # new_from_fd() returns undef when the underlying fdopen fails, so
        # a broken STDOUT is reported here instead of silently losing output
        $fh = IO::Handle->new_from_fd( fileno(STDOUT), "w" )
            or die "Unable to open a write handle on STDOUT: $!";
    }
    $fh->print(join("\t\t", @{$self->{count_config}}), "\n");
    return $self->_print( $fh, $self->{stats}, '' );
}
# Recursively print one level of the statistics tree.
#
# Parameters:
#   $fh     - handle object to print to
#   $stats  - hash reference holding the level of the tree to print
#   $indent - string put in front of every line printed for this level
#
# Fields and values are visited in string sort order. Each value gives one
# line holding the value, a tab and its count. The values of any subfield
# follow immediately below, indented by two more tabs.
sub _print {
    my( $self, $fh, $stats, $indent ) = @_;

    my $child_indent = "$indent\t\t";

    foreach my $field ( sort keys %{$stats} ) {
        foreach my $value ( sort keys %{ $stats->{$field} } ) {
            my $node = $stats->{$field}->{$value};

            $fh->print( "$indent$value\t", $node->{stats}->{count}, "\n" );

            # Values of the innermost field have nothing below them
            next unless defined( $node->{subfield} );
            $self->_print( $fh, $node->{subfield}, $child_indent );
        }
    }
}
# Print the one-line summary built by terse_sum_line(), followed by a
# newline.
#
# Parameters:
#   $fh - optional handle object providing a print() method. When it is
#         left out, a new write handle is opened on the file descriptor
#         of STDOUT.
#
# Dies if no handle was given and a handle on STDOUT could not be opened.
sub print_terse_sum {
    my( $self, $fh ) = @_;
    unless( defined( $fh ) ) {
        # new_from_fd() returns undef when the underlying fdopen fails, so
        # a broken STDOUT is reported here instead of silently losing output
        $fh = IO::Handle->new_from_fd( fileno(STDOUT), "w" )
            or die "Unable to open a write handle on STDOUT: $!";
    }
    $fh->print($self->terse_sum_line(), "\n");
}
# Format the totals from terse_sum() as a single line of text.
#
# For every configured field, in configuration order, the line holds the
# field name, the count and the sum reported by terse_sum(), each in a
# column at least six characters wide.
#
# Returns the formatted line without a trailing newline.
sub terse_sum_line {
    my( $self ) = @_;

    my $totals = $self->terse_sum();

    my @columns;
    foreach my $field ( @{ $self->{count_config} } ) {
        my $entry = $totals->{$field};
        push @columns,
            sprintf( "%6s % 6d % 6d ", $field, $entry->{count}, $entry->{sum} );
    }

    return join( '', @columns );
}
# Compute per-field totals over the whole statistics tree.
#
# Returns a hash reference with one entry per configured field, each
# holding
#   count - how many value entries _terse_sum() found for the field
#   sum   - the row counts of those entries added together
# Both start at zero, so a field without any data still has an entry.
sub terse_sum {
    my( $self ) = @_;

    my %totals;
    foreach my $field ( @{ $self->{count_config} } ) {
        $totals{$field} = { count => 0, sum => 0 };
    }

    $self->_terse_sum( $self->{stats}, \%totals );

    return \%totals;
}
# Recursively accumulate totals from one level of the statistics tree.
#
# Parameters:
#   $stats  - hash reference holding the level of the tree to walk
#   $result - hash reference keyed by field name; for every value entry
#             found, the field's "count" is incremented by one and its
#             "sum" is increased by the row count stored in that entry
#
# Nothing is printed; $result is updated in place.
sub _terse_sum {
    my( $self, $stats, $result ) = @_;

    foreach my $field ( keys %{$stats} ) {
        # Only the entries are needed here, not the values they belong to
        foreach my $node ( values %{ $stats->{$field} } ) {
            $result->{$field}->{count}++;
            $result->{$field}->{sum} += $node->{stats}->{count};

            # Continue into the next field level, if there is one
            next unless defined( $node->{subfield} );
            $self->_terse_sum( $node->{subfield}, $result );
        }
    }
}
1;