Compress::LZW::Progressive - Progressive LZW-like compression


Compress-LZW-Progressive documentation Contained in the Compress-LZW-Progressive distribution.

Index


Code Index:

NAME

Top

Compress::LZW::Progressive - Progressive LZW-like compression

SYNOPSIS

Top

  use Compress::LZW::Progressive;

  my $codec = Compress::LZW::Progressive->new();

  my $compressed = $codec->compress($data);

DESCRIPTION

Top

This module implements a progressive LZW-like compression technique. The progressive nature means that, in general, the more times you call $codec->compress(), the more efficient the codec will get (more highly compressed the data will be).

CUSTOM LZW

Top

The codec is LZW-like because it has the following differences with Compress::LZW:

- Compressor can request the decompressor to delete a certain number of least frequently used codes - Stream will have a end_segment codeword at the end of a segment - Number of bits used is predefined, and cannot change.

USAGE

Top

new (...)

Creates a new codec for compressing and/or decompressing

* bits => $num_bits (default 16)

Number of bits to use in dictionary entries. Generally this will be between 12-16. The greater the number of bits, the more dictionary entries can be held (2^^$num_bits entries), and therefore the larger memory requirements necessary on compression and decompression. Additionally, the more bits used, the easier it is for the decompressor to decompress.

* debug => $boolean (default 0)

If true, will print debug information to STDOUT.

reset ($which)

Reset the state of the compressor/decompressor. If $which is either 'compress' or 'decompress', it'll only reset that part. Otherwise, it'll reset both.

compress ($str)

decompress ($str)

Compress or decompress the string given and return the (de)compressed data.

stats ($show_phrases)

Prints efficiency statistics: how compressed the data is, how many code words used, how many Kb the dictionary is occupying in memory. If $show_phrases is true, it'll also spit out phrase length statistics in the dictionary.

TO DO

Top

For more efficiency, the codec should support outputting codes over less than two bytes. For example, a 12 bit compressed segment would be better expressed using 1.5 bytes per code, since you're not going to be using 4 bits of each output code using "pack 'S*'".

KNOWN BUGS

Top

The LZW algorithim implemented here is not compatible with any other LZW implementation. It is a slight varient from that implemented in Compress::LZW, but don't expect it to work with any other LZW compressed data.

COPYRIGHT

Top

AUTHOR

Top

Eric Waters <ewaters@uarc.com>


Compress-LZW-Progressive documentation Contained in the Compress-LZW-Progressive distribution.

package Compress::LZW::Progressive;

use strict;
use warnings;
use Compress::LZW::Progressive::Dict;
use bytes;

our $VERSION = '0.102';

my @empty_dict;
@empty_dict[0..255] = map { chr } 0..255;

sub new {
	my ($class, %args) = @_;

	$args{bits} ||= 16;

	my $code_counter = (2 ** $args{bits}) - 1;
	$args{code_end_segment} = $code_counter--;
	$args{code_add_start} = $code_counter--;
	$args{code_add_end} = $code_counter--;
	$args{code_delete_start} = $code_counter--;
	$args{code_delete_end} = $code_counter--;
	$args{code_delete_count} = $code_counter--;
	$args{code_max} = $code_counter--;

	$args{compress_resets} = 0;
	$args{decompress_resets} = 0;

	$args{compress_deleted_least_used_codes} = 0;

	my $self = bless \%args, $class;
	$self->reset();

	return $self;
}

sub reset {
	my ($self, $which) = @_;
	$which ||= '';

	if (! $which || $which eq 'compress') {
		$self->{cdict} = Compress::LZW::Progressive::Dict->new();
		$self->{compress_resets}++;
	}

	if (! $which || $which eq 'decompress') {
		$self->{ddict_reuse_codes} = [];
		$self->{ddict} = [ @empty_dict ];
		$self->{ddict_usage} = Compress::LZW::Progressive::Dict->new();
		$self->{dnext} = 256;
		$self->{decompress_resets}++;
	}

	$self->{code_frequency} = {};
	$self->{stats} = {};
}

sub compress {
	my ($self, $str) = @_;

	my $dict  = $self->{cdict};
	my $debug = $self->{debug};

	my @out = ();

	my @char = split //, $str;
	while (int @char > 0) {
		print "Matching '".join('', @char[0..($#char > 20 ? 20 : $#char)])."'\n" if $debug;

		# Find the code that matches the most of the upcoming chars
		my $code = $dict->code_matching_array(\@char);
		die "Caouldn't find code to match '".join('', @char)."'" if ! defined $code;

		my $phrase = $dict->phrase($code);
		die "Found code that has no phrase ($code)" if ! length $phrase;

		$dict->increment_code_usage_count($code);
		print " + $code for '$phrase'" if $debug;
		push @out, $code;

		# Remove the phrase found from the start of the @char
		splice @char, 0, length($phrase);
		if (! defined $char[0]) {
			print "\n" if $debug;
			last;
		}

		# If I'm running out of code space...
		if ($dict->codes_used + 1 == $self->{code_max}) {
			# First, try getting some old, unused codes, and asking the client to delete that many (1/4th of custom codes)
			my $delete_max_old_codes = int(($dict->codes_used - 256) * .25);
			if (my @delete = $dict->least_used_codes($delete_max_old_codes)) {
				print "Asking reusal of ".int(@delete)." codes ".join(', ', @delete)."\n" if $debug;
				die "Couldn't delete codes" unless $dict->delete_codes(@delete);

				# Push to out codestream
				push @out, $self->{code_delete_count};
				push @out, int @delete;

				$self->{compress_deleted_least_used_codes}++;

				# ...and continue with next code creation
			}
			# Otherwise (probably won't get here), do a full dict reset and skip to next char (can't create new)
			else {
				print " + reset code '".$self->{code_max}."'\n" if $debug;
				push @out, $self->{code_max};

				$self->{compress_resets}++;
				$dict = Compress::LZW::Progressive::Dict->new();
				print "\n" if $debug;
				next;
			}
		}

		my $new_phrase = $phrase . $char[0];
		my $new_code = $dict->add($new_phrase);
		print ", creating $new_code => '$new_phrase'\n" if $debug;
		$dict->increment_code_usage_count($new_code);
	}
	print "End of \@char; putting end segment code\n" if $debug;
	push @out, $self->{code_end_segment};

	$self->{stats}{last_compress_in_bytes} = length $str;
	$self->{stats}{last_compress_out_bytes} = int(@out) * 2;
	$self->{stats}{compress_in_bytes} += $self->{stats}{last_compress_in_bytes};
	$self->{stats}{compress_out_bytes} += $self->{stats}{last_compress_out_bytes};
    
	$self->{cdict} = $dict;
	return pack 'S*', @out;
}

sub decompress {
	my ($self, $str) = @_;
    
	my $dict       = $self->{ddict};
	my $dict_usage = $self->{ddict_usage};
	my $reuse      = $self->{ddict_reuse_codes};
	my $debug      = $self->{debug};
	my $next       = $self->{dnext};

	my @code = unpack 'S*', $str;
    
	my $last_code;
	my $return = '';
	while (defined (my $code = shift @code)) {
		if ($code >= $self->{code_max}) {
			print "Code $code\n" if $debug;
			# Resetting dictionary to scratch
			if ($code == $self->{code_max}) {
				print "Resetting decompress as have reached the max code '$self->{code_max}'\n" if $debug;

				$self->{decompress_resets}++;
				$next = 256;
				$dict = [ @empty_dict ];
				$last_code = undef;
			}
			# End of segment; don't allow last code to affect new codes
			elsif ($code == $self->{code_end_segment}) {
				print "Reached seg code '$self->{code_end_segment}'\n" if $debug;
				$last_code = undef;
			}
			# Process a list of codes to delete
			elsif ($code == $self->{code_delete_start}) {
				while (defined (my $delete_code = shift @code)) {
					last if $delete_code == $self->{code_delete_end};
					$dict_usage->{codes_used}[$delete_code] = undef;
					$dict->[$delete_code] = undef;
					push @$reuse, $delete_code;
				}
			}
			# Received a request to delete a number of unused codes; find that many least used codes and delete them
			elsif ($code == $self->{code_delete_count}) {
				my $delete_count = shift @code;

				my @delete = $dict_usage->least_used_codes($delete_count);
				if (int(@delete) != $delete_count) {
					die "Tried to find $delete_count unused codes, but found ".int(@delete)." instead; (".join(', ', @delete).")\n";
				}
				print "Reusing ".int(@delete)." (asked $delete_count) codes ".join(', ', @delete)."\n" if $debug;
				foreach my $delete_code (@delete) {
					if (! $dict->[$delete_code]) {
						die "Attempting to delete non-defined code $delete_code";
					}
					$dict_usage->{codes_used}[$delete_code] = undef;
					$dict->[$delete_code] = undef;
					push @$reuse, $delete_code;
				}
			}
			next;
		}

		my $next_code;
		if (defined $dict->[$code]) {
			$return .= $dict->[$code];

			print " + '".$dict->[$code]."' from $code" if $debug;
			if (defined $last_code) {
				$next_code = @$reuse ? shift @$reuse : $next++;
				$dict->[$next_code] = $dict->[$last_code] . substr($dict->[$code], 0, 1);
				print " and adding '".$dict->[$next_code]."' to dict on code $next_code" if $debug;
			}
			print "\n" if $debug;
		}
		# This is the edge case where repeating phrase won't be defined (see wikipedia.org on LZW)
		else {
			$next_code = @$reuse ? shift @$reuse : $next++;
			my $dp = $dict->[$last_code];
			$return .= $dict->[$code] = $dp . substr($dp, 0, 1);
			print " + '".$dict->[$code]."' from $code\n" if $debug;
		}
		$dict_usage->increment_code_usage_count($next_code) if defined $next_code;
		$dict_usage->increment_code_usage_count($code);
		$last_code = $code;
	}

	$self->{stats}{last_decompress_in_bytes} = length $str;
	$self->{stats}{last_decompress_out_bytes} = length $return;
	$self->{stats}{decompress_in_bytes} += $self->{stats}{last_decompress_in_bytes};
	$self->{stats}{decompress_out_bytes} += $self->{stats}{last_decompress_out_bytes};

	$self->{dnext} = $next;
	$self->{ddict} = $dict;
	return $return;
}

sub stats {
	my ($self, $type, $phrases) = @_;

	my $devel_size;
	eval {
		require Devel::Size;
		$devel_size = 1;
	};
	if ($@) {
		print STDERR "Devel::Size not installed so stats() will exclude data size\n";
	}

	my @return;

	push @return, sprintf "Bits %d", $self->{bits};
	if (! $type || $type eq 'compress') {
		push @return, sprintf "Compress efficiency: %3.1f%% (%3.1f%% last) with %d/%d codes used",
			100 * (1 - ($self->{stats}{compress_out_bytes} / $self->{stats}{compress_in_bytes})),
			100 * (1 - ($self->{stats}{last_compress_out_bytes} / $self->{stats}{last_compress_in_bytes})),
			$self->{cdict}->codes_used,
			$self->{code_max},
			;
		push @return, sprintf "cdict: %.2f Kb",
			Devel::Size::total_size($self->{cdict}) / 1024
			if $devel_size;

		if ($phrases) {
			# Collect stats on phrase lengths
			my $smallest = 100;
			my $largest = 0;
			my $total = 0;
			my $avg_count = 0;
			foreach my $code (256..$#{ $self->{cdict}{array} }) {
				my $phrase = $self->{cdict}->phrase($code);
				next unless defined $phrase;
				my $length = length $phrase;
				$smallest = $length if $length < $smallest;
				$total += $length;
				$avg_count++;
				$largest = $length if $length > $largest;
			}
			my $average = $total / $avg_count;

			push @return, sprintf "phrase lengths, sm: %d, avg: %d, lg: %d, total: %d",
				$smallest, $average, $largest, $total;
		}
	}
	if (! $type || $type eq 'decompress') {
		push @return, sprintf "ddict: %.2f Kb [%d/%d codes used]",
			(Devel::Size::total_size($self->{ddict}) +
		 	 Devel::Size::total_size($self->{ddict_reuse_codes}) + 
			 Devel::Size::total_size($self->{ddict_usage})) / 1024,
			$self->{dnext} - int @{ $self->{ddict_reuse_codes} },
			$self->{code_max}
			if $devel_size;
	}

	return join("; ", @return);
}

sub dict_dump {
	my $self = shift;
	my $return = " Index | Compress | Decompress \n";

	my $comp = $self->{cdict}{array}; #[ sort { $self->{cdict}{$a} <=> $self->{cdict}{$b} } keys %{ $self->{cdict} } ];
	my $decomp = $self->{ddict};
	my $count = $#{ $comp } > $#{ $decomp } ? $#{ $comp } : $#{ $decomp }; 
	
	my @char_map = qw/nul soh stx etx eot enq ack bel bs ht lf vt ff cr so si dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us/;
	$char_map[127] = 'del';
	my $show_invis = sub {
		my $return = '';
		foreach my $char (split(//, shift)) {
			my $num = ord $char;
			if (($num < 32 || $num == 127 || $num > 128) && $num != 10) {
				if (defined $char_map[$num]) {
					$return .= "^$char_map[$num]"."[$num]";
				} else {
					$return .= "[$num]";
				}
			} else {
				$return .= $char;
			}
		}
		return $return;
	};

	for my $i (0..$count) {
		my $c = defined $comp->[$i] ? $comp->[$i] : '[undef]';
		my $d = defined $decomp->[$i] ? $decomp->[$i] : '[undef]';
		next if $c eq $d;
		
		printf " %-6d | %6s %s %6s\n", $i, $show_invis->($c), ($c eq $d ? '=' : '!'), $show_invis->($d);
	}
}

1;


__END__