Statistics::LSNoHistory - Least-Squares linear regression package without


Statistics-LSNoHistory documentation Contained in the Statistics-LSNoHistory distribution.

Index


Code Index:

NAME

Top

Statistics::LSNoHistory - Least-Squares linear regression package without data history

SYNOPSIS

Top

  # construct from points
  $reg = Statistics::LSNoHistory->new(points => [
    1.0 => 1.0,
    2.1 => 1.9,
    2.8 => 3.2,
    4.0 => 4.1,
    5.2 => 4.9
  ]);

  # other equivalent constructions
  $reg = Statistics::LSNoHistory->new(
    xvalues => [1.0, 2.1, 2.8, 4.0, 5.2],
    yvalues => [1.0, 1.9, 3.2, 4.1, 4.9]
  );
  # or
  $reg = Statistics::LSNoHistory->new;
  $reg->append_arrays(
    [1.0, 2.1, 2.8, 4.0, 5.2],
    [1.0, 1.9, 3.2, 4.1, 4.9]
  );
  # or
  $reg = Statistics::LSNoHistory->new;
  $reg->append_points(
    1.0 => 1.0, 2.1 => 1.9, 2.8 => 3.2, 4.0 => 4.1, 5.2 => 4.9
  );

  # You may also construct from the preliminary statistics of a 
  # previous regression:
  $reg = Statistics::LSNoHistory->new(
    num => 5,
    sumx => 15.1,
    sumy => 15.1,
    sumxx => 56.29,
    sumyy => 55.67,
    sumxy => 55.83,
    minx => 1.0,
    maxx => 5.2,
    miny => 1.0,
    maxy => 4.9
  );
  # thus a branch may be instantiated as follows
  $branch = Statistics::LSNoHistory->new(%{$reg->dump_stats});
  $reg->append_point(6.1, 5.9);
  $branch->append_point(5.8, 6.0);

  # calculate regression values, print some
  printf("Slope: %.2f\n", $reg->slope);
  printf("Intercept %.2f\n", $reg->intercept);
  printf("Correlation Coefficient: %.2f\n", $reg->pearson_r);
  ...




DESCRIPTION

Top

This package provides standard least squares linear regression functionality without the need for storing the complete data history. Like any other, it finds best m,k (in least squares sense) so that y = m*x + k fits data points (x_1,y_1),...,(x_n,y_n).

In many applications involving linear regression, it is desirable to compute a regression based on the intermediate statistics of a previous regression along with any new data points. Thus there is no need to store a complete data history, but rather only a minimal set of intermediate statistics, the number of which, thanks to Gauss, is 6.

The user interface provides a way to instantiate a regression object with either raw data or previous intermediate statistics.

CONSTRUCTOR ARGUMENTS

Top

The constructor (or class method new) takes several possible arguments. The initialization scenario depends on the kinds of arguments passed and falls into one of the following categories:

METHODS

Top

BUGS

Top



This technique is more susceptible to roundoff errors than others which store the data. Extra care must be taken to scale the data before processing.

AUTHOR

Top



John Pliam <pliam@cpan.org>

SEE ALSO

Top



CPAN modules: Statistics::OLS, Statistics::Descriptive, Statistics::GaussHelmert, Statistics::Regression.

Any book on statistics, any handbook of mathematics, any comprehensive book on numerical algorithms.

Press et al, Numerical Recipes in L [L in {C,Fortran, ...}], Nth edition [N > 0], Cambridge Univ Press.

COPYING

Top



See distribution file COPYING for complete information.

Statistics-LSNoHistory documentation Contained in the Statistics-LSNoHistory distribution.
#
# LSNoHistory.pm - least-squares regression without data history
#
# $Id: LSNoHistory.pm,v 1.6 2003/02/23 05:11:29 pliam Exp $
#

package Statistics::LSNoHistory;
use strict;

use vars qw($VERSION);
$VERSION = sprintf("%d.%02d", (q$Name: LSNoHist_Release_0_01 $ =~ /\d+/g));

#############################################################################
# top-level pod 
#############################################################################

#############################################################################
# construction
#############################################################################

## new constructor
sub new {
	my $class = shift;
	my %args = @_;
	my $self;
	my @stats = qw(num sumx sumy sumxx sumyy sumxy);
	push(@stats, qw(minx maxx miny maxy)); # min/max

	# if complete set of statistics, construct from previous state
	# if (@stats == scalar(grep {defined($args{$_})} @stats)) {
	if (@stats == grep {defined($args{$_})} @stats) {
		# reject unsupported arguments and combinations 
		if (grep {defined($args{$_})} qw(points xvalues yvalues)) {
			die "Cannot give new data along with previous state.";
		}
		unless (@stats == keys %args) {
			die "Unknown constructor arguments.";
		}
		# check the number of points for consistency
		unless (abs(int($args{num})) == $args{num}) {
			die "Bad number of points: must be positive integer.";
		}
		$self = \%args;
    	bless $self, $class;
		return $self;
	}
	# in any other case we're starting from scratch
	$self = {};
	bless $self, $class;
	$self->_init;
	# x & y value array refs
	if (defined($args{xvalues}) && defined($args{yvalues})) {
		if (defined $args{points}) {
			die "Must give points or array values, but not both";
		}
		unless (scalar(keys %args) == 2) {
			die "Unknown constructor arguments.";
		}
		$self->append_arrays($args{xvalues}, $args{yvalues});
	}
	# (x,y) point array ref
	elsif (defined($args{points})) {
		if (grep {defined($args{$_})} qw(xvalues yvalues)) {
			die "Must give points or array values, but not both";
		}
		unless (scalar(keys %args) == 1) {
			die "Unknown constructor arguments.";
		}
		$self->append_points(@{$args{points}});
	}
	# default constructor (already initialized above)
	else { 
		if (scalar(keys %args)) {
			die "Unknown constructor arguments.";
		}
	}
    return $self;
}

## _init in this context really means start with state of 0's
sub _init {
	my $self = shift;
	my @stats = qw(num sumx sumy sumxx sumyy sumxy);
	push(@stats, qw(minx maxx miny maxy)); # min/max

	@$self{@stats} = (0) x scalar(@stats);
}


#############################################################################
# other methods
#############################################################################
#
# adding data
#

## append_point
sub append_point {
	my $self = shift;
	my($x,$y) = @_;

	## will have to recompute regression
	$self->{cached} = 0;

	# min/max
	if ($self->{num}) {
		$self->{minx} = ($x < $self->{minx}) ? $x : $self->{minx};
		$self->{maxx} = ($x > $self->{maxx}) ? $x : $self->{maxx};
		$self->{miny} = ($y < $self->{miny}) ? $y : $self->{miny};
		$self->{maxy} = ($y > $self->{maxy}) ? $y : $self->{maxy};
	}
	else {
		$self->{minx} = $x;
		$self->{maxx} = $x;
		$self->{miny} = $y;
		$self->{maxy} = $y;
	}

	# classic stats
	$self->{num}++;
	$self->{sumx} += $x;
	$self->{sumy} += $y;
	$self->{sumxx} += $x**2;
	$self->{sumyy} += $y**2;
	$self->{sumxy} += $x*$y;
}

## append_points
sub append_points {
	my $self = shift;
	my @points = @_;
	my $num = scalar(@points);

	if ($num % 2) { die "Incomplete list of points."; }

	$num /= 2;
	for (1..$num) { $self->append_point(splice(@points, 0, 2)); }
}


## append_arrays
sub append_arrays {
	my $self = shift;
	my ($xr, $yr) = @_;
	my ($xn, $yn);

	# check arg type
	unless ((ref($xr) eq 'ARRAY') && (ref($yr) eq 'ARRAY'))  { 
		die "Must pass pair of array references."; 
	}

	# check that sizes match
	$xn = scalar(@$xr);
	$yn = scalar(@$yr);
	unless ($xn == $yn) { die "Incomplete list of points."; }

	for (1..$xn) { $self->append_point(shift(@$xr), shift(@$yr)); }
}

#
# computing the regression
#

## _regress method -- done behind the scenes & considered private
sub _regress {
	my $self = shift;
	my($n) = $self->{num};
	my($dx) = $n*$self->{sumxx} - $self->{sumx}**2;
	my($dy) = $n*$self->{sumyy} - $self->{sumy}**2;

	# check that we have 2 points 
	unless ($n >= 2) { die "Must have at least 2 points for regression."; }
	# check data for consistency
	unless (($dx!=0) && ($dy!=0)) { 
		die "Inconsistent data: would divide by zero."; 
	}

	# means and variances
	$self->{avgx} = $self->{sumx}/$n;
	$self->{avgy} = $self->{sumy}/$n;
	$self->{varx} = $dx/$n/($n-1);
	$self->{vary} = $dy/$n/($n-1);

	# slopes and intercepts
	$self->{mx} = ($n*$self->{sumxy} - $self->{sumx}*$self->{sumy})/$dx;
	$self->{kx} = $self->{avgy} - $self->{mx}*$self->{avgx};
	$self->{my} = ($n*$self->{sumxy} - $self->{sumx}*$self->{sumy})/$dy;
	$self->{ky} = $self->{avgx} - $self->{my}*$self->{avgy};
	
	# correlation coefficient (Pearson's r) and chi squared
	$self->{r} = ($n*$self->{sumxy} - $self->{sumx}*$self->{sumy}) 
		/ sqrt($dx*$dy);
	$self->{chi2} = (1-$self->{r}**2)*$dy/$n;

	# flag that regression calculations are up to date
	$self->{cached} = 1;
}

#
# presentation of stats, prediction
#

## average_x 
sub average_x { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{avgx}
}

## average_y 
sub average_y { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{avgy}
}

## variance_x
sub variance_x { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{varx}
}

## variance_y
sub variance_y { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{vary}
}

## slope
sub slope { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{mx}
}

## intercept
sub intercept { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{kx}
}

## predict - predicte a y value given an x value
sub predict {
	my $self = shift;
	my($x) = @_;

	$self->_regress unless $self->{cached};
	return $self->{mx}*$x + $self->{kx};
}

## slope_y
sub slope_y { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{my}
}

## intercept_y
sub intercept_y { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{ky}
}

## predict_x - predicte an x value given a y value
sub predict_x {
	my $self = shift;
	my($y) = @_;

	$self->_regress unless $self->{cached};
	return $self->{my}*$y + $self->{ky};
}

## pearson_r
sub pearson_r { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{r}
}

## chi_squared
sub chi_squared { 
	my $self = shift;
	$self->_regress unless $self->{cached};
	return $self->{chi2}
}

## minimum_x
sub minimum_x { return shift->{minx}; }

## maximum_x
sub maximum_x { return shift->{maxx}; }

## minimum_y
sub minimum_y { return shift->{miny}; }

## maximum_y
sub maximum_y { return shift->{maxy}; }

## dump_stats
sub dump_stats { 
	my $self = shift;
	my @stats = qw(num sumx sumy sumxx sumyy sumxy);
	push(@stats, qw(minx maxx miny maxy)); # min/max
	my %dump;

	@dump{@stats} = @$self{@stats};
	return \%dump;
}

1;

__END__