/usr/local/CPAN/Simulation-Automate/Simulation/Automate/Analysis.pm


package Simulation::Automate::Analysis;

use vars qw( $VERSION );
$VERSION = "1.0.1";

#################################################################################
#                                                                              	#
#  Copyright (C) 2000,2002 Wim Vanderbauwhede. All rights reserved.             #
#  This program is free software; you can redistribute it and/or modify it      #
#  under the same terms as Perl itself.                                         #
#                                                                              	#
#################################################################################

#=headers

#Package for statistical analysis of SynSim results.
#Based on create_histograms.pl
#This is not finished by far, but already useful.
#This module is used by SynSim.pm and PostProcessors.pm

#$Id$

#=cut

use strict;
use Carp;
use FileHandle;
use Exporter;

@Simulation::Automate::Analysis::ISA = qw(Exporter);
@Simulation::Automate::Analysis::EXPORT = qw(
		     &calc_statistics
		     &build_histograms
                  );

my $v=0;
my %bins=();
my %spec=();
my %trend=();
my @parnames;
my @parunits;
my $info;
my @info;
my $nlots=0;
my @pars=();
my $ncols;
my $data=0;
my $l=0;
my $nsites=0;

my $binfile;
my $title;
my $nbins;
my $min;
my $max;
my $uctitle;
my $lctitle;
my $log='';


################################################################################
##
## PROCESS THE DATAFILE
##
sub process_datafile {
my $binfile=shift;
my %trend=();
push @{$trend{DIFF}},0;

open(IN,"<$binfile")||carp "Can't open $binfile: $!\n";
my $j=0;
$nlots++;
my $in_table=0;
$info=0;
@info=();

#if just 1 column:
my $par=$pars[0];
if($par =~/dif/i){$par='DIFF'};
#else{$par='PAR'}

my $datacol=$pars[1];

while(<IN>) {
/^\s+$/ && next;
/^\s*\#/ && next;
chomp;
s/\s+$//;
s/^\s+//;
my @row=split(/[\s\t]+/,$_);
my $ri=0;

#foreach my $par (@pars) {
#push @{$trend{$par}},$row[$ri];
#$ri++
#}

#if just 1 column:
#print "$row[$datacol-1]\n";
push @{$trend{$par}},$row[$datacol-1];
#print STDERR "$par ($datacol) : ",$row[$datacol-1],"\n";
$nsites++;

}#while

if($v){print "# Done parsing $binfile: $nsites sites\n";}
close IN;


if (exists $trend{DIFF}) {
foreach my $i (0..@{$trend{DIFF}}-2) {
${$trend{DIFF}}[$i]=${$trend{DIFF}}[$i+1]-${$trend{DIFF}}[$i];
}
pop @{$trend{DIFF}};
}

if($log=~/log/i) {
foreach my $logkey (sort keys %trend) {
($logkey !~/log/i) && next;
foreach my $i (0..@{$trend{$logkey}}-1) {
#carp "$i:${$trend{$logkey}}[$i]\n";
if(${$trend{$logkey}}[$i]>0){
${$trend{$logkey}}[$i]=log(${$trend{$logkey}}[$i])/log(10); 
} else {
${$trend{$logkey}}[$i]='';
}
} # each $i
} # each $logkey
} # if LOG
return(\%trend);
} # END of process_datafile


##############################################################################################

sub calc_average {
my $value_array_ref=shift;

my $avg=0;
my $stdev=0;
my $pct_bad=shift||0; # means we throw 50% of the devices away as outliers to calculate the rough mean
my $delta=shift||2e10; # means we take all devices within 20% deviation from actual mean 

my @tmp_values=sort numerical @{$value_array_ref};

my $min=$tmp_values[0];
my $max=$tmp_values[@tmp_values-1];
#print "#all values:".scalar(@tmp_values)."\n";
my $n_samp=scalar @tmp_values;
my $n_iter=int($n_samp*0.5*$pct_bad);

#1. allow $pct_bad bad devices
if($n_iter>0){

foreach my $iter (1..$n_iter) {
pop @tmp_values;
shift @tmp_values;
}
}
#2. calc approx average of these
my $tmpav=0;
foreach my $tmp_val (@tmp_values) {
#print "$tmp_val\n";
$tmpav+=$tmp_val;
}
#print "tmpav*ngood:$tmpav\n";
$tmpav/=@tmp_values;
#print "tmpav:$tmpav\n";
#3. reject based on $tmpav and $delta; calc actual average
my $n_good=0;
my $sumx=0;
my $sumxsq=0;
foreach my $tmp_val (@$value_array_ref) {
#print "$tmp_val ";
  if ($tmpav==0||abs(($tmp_val-$tmpav)/$tmpav)<=$delta) {
$sumx+=$tmp_val;
$sumxsq+=$tmp_val*$tmp_val;
$n_good++;
#print " :pass\n";
} #else {print " :fail\n";}
}
if ($n_good>1) {
$avg=$sumx/$n_good;
# calc stdev: stdev^2=(n*sum(xi^2)-sum(xi)^2)/n/(n-1)
$stdev=sqrt($sumxsq/($n_good-1)-$sumx*$sumx/($n_good*($n_good-1)));
} else {
$avg='';
$stdev='';}
if($v){print "# samples:$n_good\n";}
if($v){print "# AVG:$avg\tSTDEV:$stdev\n";}
return [$avg,$stdev,$min,$max];
}

##############################################################################################

sub numerical { $a<=>$b }

##############################################################################################

sub min {
my $a=shift;
my $b=shift;
my $min=abs(($a+$b)-abs($a-$b))/2;
return $min;
}


################################################################################
# Build Histogram

sub build_histogram {
# reference to array with values
my $value_array_ref=shift;

# number of bins, number of sigmas for with ofinterval
my $nbins=shift; 
my $nsigma=shift;
my $min=shift||'CALC';
my $max=shift||'CALC';

# calculate mean & sigma and min/max/binwidth
my ($avg,$stdev)=@{&calc_average($value_array_ref)};
if(not $nsigma){$nsigma=6}
#if((not $min) && ($min!=0)){
if($min=~/C/) { 
$min=$avg-$nsigma*$stdev;
if($stdev>$avg) {
# in this case take min and max value of set
$min=${$value_array_ref}[0];
  foreach my $val (@{$value_array_ref}) {
    if($val<$min){$min=$val}
  }
}
}
if($v){print "# MIN:$min\n";}
if( $max=~/C/) { 
$max=$avg+$nsigma*$stdev;

if(1||$stdev>$avg) {
# in this case take min and max value of set
$max=${$value_array_ref}[0];
  foreach my $val (@{$value_array_ref}) {
    if($val>$max){$max=$val}
  }
}
}
if($v){print "# MAX:$max\n";}

#For traffic studies, all values are always positive.
if($min<0){$min=0;}
my $binwidth=($max-$min)/$nbins;

## first sort

my @tmp_values=sort numerical @$value_array_ref;
my $i=0;

my $n_samp=scalar @tmp_values;
my @counts;
for my $i (0..$nbins+1) {
$counts[$i]=0;
}
my $bin=1;
my $binh;
my $binl;

foreach my $val (@tmp_values) {

$binh=$bin*$binwidth+$min; 
$binl=($bin-1)*$binwidth+$min;

  if($val>=$binl&&$val<$binh) { # if inside bin
$counts[$bin]++;

}
elsif($val<$min) { # if lower than min
#print "lower:$counts[0]\n";
$counts[0]++;
}
elsif($val>=$binh) { # if higher than bin, next bin
  while($val >=$binh && $bin<$nbins) {
#print STDERR "$bin\n";
if($bin<$nbins) {
$bin++; #max. 25
$counts[$bin]=0;
}else{$bin=$nbins} 
#so $bin <=25
$binl=($bin-1)*$binwidth+$min; #WV15072002
$binh=$bin*$binwidth+$min;
}

if($val<$binh){$counts[$bin]++} # if lower than bin+1
elsif($val>=$max) {
#print "higher:$counts[$nbins]\n";
$counts[$nbins+1]++
} elsif(not defined $counts[$bin] ) {$counts[$bin]=0} 
} else {print STDERR "#$binl#$val#$binh#\n";}
}

return [\@counts,$min,$binwidth,$avg,$stdev];
}

#------------------------------------------------------------------------------
sub get_common_args {
if(not @_){die 'arguments: $datafilename,\@parameters,$title,$log'."\n"}

 $binfile=shift;
 @pars= @{shift(@_)}; # deref an array ref
 $title=shift||'';
 $log=shift||'';
#carp "LOG:$log\n";
$uctitle=uc($title);
$lctitle=lc($title);
$lctitle=~s/\s+/_/g;
if($v){print "# $uctitle DATA ANALYSIS\n\n";}
return [@_];
}
#------------------------------------------------------------------------------
sub calc_statistics {
my @specific=@{&get_common_args(@_)};
my $reject=shift  @specific;
my $delta=shift  @specific;
my %trend=%{&process_datafile($binfile)};
my %stats=();

#foreach my $par (@pars) {
#just 1 par
my $par=$pars[0];
#($par=~/none/i) && next;
if($v){print "# Processing $par\n";}
($stats{$par}{AVG},$stats{$par}{STDEV},$stats{$par}{MIN},$stats{$par}{MAX})=@{&calc_average(\@{$trend{$par}})};
#} # foreach par

return( \%stats );

} # END of calc_statistics

#------------------------------------------------------------------------------
sub build_histograms {

my @specific=@{&get_common_args(@_)};
$nbins=shift  @specific;
$min =shift  @specific;
$max =shift  @specific;

my %trend=%{&process_datafile($binfile)};

my %hists=();


my $par=$pars[0];

print "# Processing $par\n" if $v;

my @tmpa=@{&build_histogram(\@{$trend{$par}},$nbins,3,$min,$max)};
my $tmp=$tmpa[0];
my $minbin=$tmpa[1];
my $binwidth=$tmpa[2];
my $avg=$tmpa[@tmpa-2];
my $stdev=$tmpa[@tmpa-1];
my $bi=0;

foreach my $bin (@{$tmp}) {
  if($bi>0){
if($v){print $minbin+($bi-1)*$binwidth,"\t$bin\n";}
push @{$hists{$par}},{BIN=>$minbin+($bi-1)*$binwidth,COUNT=>$bin};
}
$bi++;
}



return( \%hists );

} # END of build_histograms
#------------------------------------------------------------------------------
1;