| Compress-LZW-Progressive documentation | Contained in the Compress-LZW-Progressive distribution. |
Compress::LZW::Progressive - Progressive LZW-like compression
use Compress::LZW::Progressive; my $codec = Compress::LZW::Progressive->new(); my $compressed = $codec->compress($data);
This module implements a progressive LZW-like compression technique. The progressive nature means that, in general, the more times you call $codec->compress(), the more efficient the codec will get (more highly compressed the data will be).
The codec is LZW-like because it has the following differences with Compress::LZW:
- Compressor can request the decompressor to delete a certain number of least frequently used codes - Stream will have a end_segment codeword at the end of a segment - Number of bits used is predefined, and cannot change.
Creates a new codec for compressing and/or decompressing
Number of bits to use in dictionary entries. Generally this will be between 12-16. The greater the number of bits, the more dictionary entries can be held (2^^$num_bits entries), and therefore the larger memory requirements necessary on compression and decompression. Additionally, the more bits used, the easier it is for the decompressor to decompress.
If true, will print debug information to STDOUT.
Reset the state of the compressor/decompressor. If $which is either 'compress' or 'decompress', it'll only reset that part. Otherwise, it'll reset both.
Compress or decompress the string given and return the (de)compressed data.
Prints efficiency statistics: how compressed the data is, how many code words used, how many Kb the dictionary is occupying in memory. If $show_phrases is true, it'll also spit out phrase length statistics in the dictionary.
For more efficiency, the codec should support outputting codes over less than two bytes. For example, a 12 bit compressed segment would be better expressed using 1.5 bytes per code, since you're not going to be using 4 bits of each output code using "pack 'S*'".
The LZW algorithim implemented here is not compatible with any other LZW implementation. It is a slight varient from that implemented in Compress::LZW, but don't expect it to work with any other LZW compressed data.
Copyright (c) 2006 Eric Waters and XMission LLC (http://www.xmission.com/). All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
The full text of the license can be found in the LICENSE file included with this module.
Eric Waters <ewaters@uarc.com>
| Compress-LZW-Progressive documentation | Contained in the Compress-LZW-Progressive distribution. |
package Compress::LZW::Progressive; use strict; use warnings; use Compress::LZW::Progressive::Dict; use bytes; our $VERSION = '0.102'; my @empty_dict; @empty_dict[0..255] = map { chr } 0..255; sub new { my ($class, %args) = @_; $args{bits} ||= 16; my $code_counter = (2 ** $args{bits}) - 1; $args{code_end_segment} = $code_counter--; $args{code_add_start} = $code_counter--; $args{code_add_end} = $code_counter--; $args{code_delete_start} = $code_counter--; $args{code_delete_end} = $code_counter--; $args{code_delete_count} = $code_counter--; $args{code_max} = $code_counter--; $args{compress_resets} = 0; $args{decompress_resets} = 0; $args{compress_deleted_least_used_codes} = 0; my $self = bless \%args, $class; $self->reset(); return $self; } sub reset { my ($self, $which) = @_; $which ||= ''; if (! $which || $which eq 'compress') { $self->{cdict} = Compress::LZW::Progressive::Dict->new(); $self->{compress_resets}++; } if (! $which || $which eq 'decompress') { $self->{ddict_reuse_codes} = []; $self->{ddict} = [ @empty_dict ]; $self->{ddict_usage} = Compress::LZW::Progressive::Dict->new(); $self->{dnext} = 256; $self->{decompress_resets}++; } $self->{code_frequency} = {}; $self->{stats} = {}; } sub compress { my ($self, $str) = @_; my $dict = $self->{cdict}; my $debug = $self->{debug}; my @out = (); my @char = split //, $str; while (int @char > 0) { print "Matching '".join('', @char[0..($#char > 20 ? 20 : $#char)])."'\n" if $debug; # Find the code that matches the most of the upcoming chars my $code = $dict->code_matching_array(\@char); die "Caouldn't find code to match '".join('', @char)."'" if ! defined $code; my $phrase = $dict->phrase($code); die "Found code that has no phrase ($code)" if ! length $phrase; $dict->increment_code_usage_count($code); print " + $code for '$phrase'" if $debug; push @out, $code; # Remove the phrase found from the start of the @char splice @char, 0, length($phrase); if (! defined $char[0]) { print "\n" if $debug; last; } # If I'm running out of code space... if ($dict->codes_used + 1 == $self->{code_max}) { # First, try getting some old, unused codes, and asking the client to delete that many (1/4th of custom codes) my $delete_max_old_codes = int(($dict->codes_used - 256) * .25); if (my @delete = $dict->least_used_codes($delete_max_old_codes)) { print "Asking reusal of ".int(@delete)." codes ".join(', ', @delete)."\n" if $debug; die "Couldn't delete codes" unless $dict->delete_codes(@delete); # Push to out codestream push @out, $self->{code_delete_count}; push @out, int @delete; $self->{compress_deleted_least_used_codes}++; # ...and continue with next code creation } # Otherwise (probably won't get here), do a full dict reset and skip to next char (can't create new) else { print " + reset code '".$self->{code_max}."'\n" if $debug; push @out, $self->{code_max}; $self->{compress_resets}++; $dict = Compress::LZW::Progressive::Dict->new(); print "\n" if $debug; next; } } my $new_phrase = $phrase . $char[0]; my $new_code = $dict->add($new_phrase); print ", creating $new_code => '$new_phrase'\n" if $debug; $dict->increment_code_usage_count($new_code); } print "End of \@char; putting end segment code\n" if $debug; push @out, $self->{code_end_segment}; $self->{stats}{last_compress_in_bytes} = length $str; $self->{stats}{last_compress_out_bytes} = int(@out) * 2; $self->{stats}{compress_in_bytes} += $self->{stats}{last_compress_in_bytes}; $self->{stats}{compress_out_bytes} += $self->{stats}{last_compress_out_bytes}; $self->{cdict} = $dict; return pack 'S*', @out; } sub decompress { my ($self, $str) = @_; my $dict = $self->{ddict}; my $dict_usage = $self->{ddict_usage}; my $reuse = $self->{ddict_reuse_codes}; my $debug = $self->{debug}; my $next = $self->{dnext}; my @code = unpack 'S*', $str; my $last_code; my $return = ''; while (defined (my $code = shift @code)) { if ($code >= $self->{code_max}) { print "Code $code\n" if $debug; # Resetting dictionary to scratch if ($code == $self->{code_max}) { print "Resetting decompress as have reached the max code '$self->{code_max}'\n" if $debug; $self->{decompress_resets}++; $next = 256; $dict = [ @empty_dict ]; $last_code = undef; } # End of segment; don't allow last code to affect new codes elsif ($code == $self->{code_end_segment}) { print "Reached seg code '$self->{code_end_segment}'\n" if $debug; $last_code = undef; } # Process a list of codes to delete elsif ($code == $self->{code_delete_start}) { while (defined (my $delete_code = shift @code)) { last if $delete_code == $self->{code_delete_end}; $dict_usage->{codes_used}[$delete_code] = undef; $dict->[$delete_code] = undef; push @$reuse, $delete_code; } } # Received a request to delete a number of unused codes; find that many least used codes and delete them elsif ($code == $self->{code_delete_count}) { my $delete_count = shift @code; my @delete = $dict_usage->least_used_codes($delete_count); if (int(@delete) != $delete_count) { die "Tried to find $delete_count unused codes, but found ".int(@delete)." instead; (".join(', ', @delete).")\n"; } print "Reusing ".int(@delete)." (asked $delete_count) codes ".join(', ', @delete)."\n" if $debug; foreach my $delete_code (@delete) { if (! $dict->[$delete_code]) { die "Attempting to delete non-defined code $delete_code"; } $dict_usage->{codes_used}[$delete_code] = undef; $dict->[$delete_code] = undef; push @$reuse, $delete_code; } } next; } my $next_code; if (defined $dict->[$code]) { $return .= $dict->[$code]; print " + '".$dict->[$code]."' from $code" if $debug; if (defined $last_code) { $next_code = @$reuse ? shift @$reuse : $next++; $dict->[$next_code] = $dict->[$last_code] . substr($dict->[$code], 0, 1); print " and adding '".$dict->[$next_code]."' to dict on code $next_code" if $debug; } print "\n" if $debug; } # This is the edge case where repeating phrase won't be defined (see wikipedia.org on LZW) else { $next_code = @$reuse ? shift @$reuse : $next++; my $dp = $dict->[$last_code]; $return .= $dict->[$code] = $dp . substr($dp, 0, 1); print " + '".$dict->[$code]."' from $code\n" if $debug; } $dict_usage->increment_code_usage_count($next_code) if defined $next_code; $dict_usage->increment_code_usage_count($code); $last_code = $code; } $self->{stats}{last_decompress_in_bytes} = length $str; $self->{stats}{last_decompress_out_bytes} = length $return; $self->{stats}{decompress_in_bytes} += $self->{stats}{last_decompress_in_bytes}; $self->{stats}{decompress_out_bytes} += $self->{stats}{last_decompress_out_bytes}; $self->{dnext} = $next; $self->{ddict} = $dict; return $return; } sub stats { my ($self, $type, $phrases) = @_; my $devel_size; eval { require Devel::Size; $devel_size = 1; }; if ($@) { print STDERR "Devel::Size not installed so stats() will exclude data size\n"; } my @return; push @return, sprintf "Bits %d", $self->{bits}; if (! $type || $type eq 'compress') { push @return, sprintf "Compress efficiency: %3.1f%% (%3.1f%% last) with %d/%d codes used", 100 * (1 - ($self->{stats}{compress_out_bytes} / $self->{stats}{compress_in_bytes})), 100 * (1 - ($self->{stats}{last_compress_out_bytes} / $self->{stats}{last_compress_in_bytes})), $self->{cdict}->codes_used, $self->{code_max}, ; push @return, sprintf "cdict: %.2f Kb", Devel::Size::total_size($self->{cdict}) / 1024 if $devel_size; if ($phrases) { # Collect stats on phrase lengths my $smallest = 100; my $largest = 0; my $total = 0; my $avg_count = 0; foreach my $code (256..$#{ $self->{cdict}{array} }) { my $phrase = $self->{cdict}->phrase($code); next unless defined $phrase; my $length = length $phrase; $smallest = $length if $length < $smallest; $total += $length; $avg_count++; $largest = $length if $length > $largest; } my $average = $total / $avg_count; push @return, sprintf "phrase lengths, sm: %d, avg: %d, lg: %d, total: %d", $smallest, $average, $largest, $total; } } if (! $type || $type eq 'decompress') { push @return, sprintf "ddict: %.2f Kb [%d/%d codes used]", (Devel::Size::total_size($self->{ddict}) + Devel::Size::total_size($self->{ddict_reuse_codes}) + Devel::Size::total_size($self->{ddict_usage})) / 1024, $self->{dnext} - int @{ $self->{ddict_reuse_codes} }, $self->{code_max} if $devel_size; } return join("; ", @return); } sub dict_dump { my $self = shift; my $return = " Index | Compress | Decompress \n"; my $comp = $self->{cdict}{array}; #[ sort { $self->{cdict}{$a} <=> $self->{cdict}{$b} } keys %{ $self->{cdict} } ]; my $decomp = $self->{ddict}; my $count = $#{ $comp } > $#{ $decomp } ? $#{ $comp } : $#{ $decomp }; my @char_map = qw/nul soh stx etx eot enq ack bel bs ht lf vt ff cr so si dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us/; $char_map[127] = 'del'; my $show_invis = sub { my $return = ''; foreach my $char (split(//, shift)) { my $num = ord $char; if (($num < 32 || $num == 127 || $num > 128) && $num != 10) { if (defined $char_map[$num]) { $return .= "^$char_map[$num]"."[$num]"; } else { $return .= "[$num]"; } } else { $return .= $char; } } return $return; }; for my $i (0..$count) { my $c = defined $comp->[$i] ? $comp->[$i] : '[undef]'; my $d = defined $decomp->[$i] ? $decomp->[$i] : '[undef]'; next if $c eq $d; printf " %-6d | %6s %s %6s\n", $i, $show_invis->($c), ($c eq $d ? '=' : '!'), $show_invis->($d); } } 1; __END__