| Encode-BOCU1 documentation | Contained in the Encode-BOCU1 distribution. |
Encode::BOCU1 -- encodes / decodes BOCU-1 string, works as part of Encode.pm
use Encode::BOCU1;
$string = 'Some text to convert... in UTF-8' Encode::from_to($string,'utf8','bocu1'); Encode::from_to($string,'bocu1','shiftjis');
BOCU-1 is a MIME-compatible application of the Binary Ordered Compression for Unicode [BOCU] base algorithm.
Encode::BOCU1 enables to convert any encoding systems supported by Encode.pm from/to BOCU-1 through UTF-8.
http://www.unicode.org/notes/tn6/ http://icu.sourceforge.net/docs/papers/binary_ordered_compression_for_unicode.html
Based on pure-perl port of "Sample C Code" on http://www.unicode.org/notes/tn6/, written by Markus W. Scherer on 2002jan24 and is
Copyright (C) 2002, International Business Machines Corporation and others. All Rights Reserved.
The "Sample C Code" is under the X license (ICU version). ICU License : http://dev.icu-project.org/cgi-bin/viewcvs.cgi/*checkout*/icu/license.html
BOCU "Binary-Ordered Compression For Unicode" is a patent-protected technology of IBM. (US Patent 6737994)
Ported and modified by Naoya Tozuka <naoyat@naochan.com> As with the original C code, this port is licensed under the X license (ICU version).
| Encode-BOCU1 documentation | Contained in the Encode-BOCU1 distribution. |
package Encode::BOCU1; use 5.008; use strict; use warnings; use Carp; use base qw(Encode::Encoding); our $VERSION = '0.03'; __PACKAGE__->Define('bocu1'); use Encode::Alias; define_alias( qr/^bocu.1$/i => '"bocu1"'); define_alias( qr/^bocu$/i => '"bocu1"'); # # encode / decode # sub encode($$;$) { my ($obj, $str, $check) = @_; my $octet = utf8_to_bocu1($str); $_[1] = '' if $check; return $octet; } sub decode($$;$) { my ($obj, $octet, $check) = @_; my $str = bocu1_to_utf8($octet); $_[1] = '' if $check; return $str; } # # subroutines # my @bocu1_trail_to_byte = ( # 0 - 19 (0x0 - 0x13) 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1c, 0x1d, 0x1e, 0x1f, # 20 - 242 (0x14 - 0xf2) 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff ); my @bocu1_byte_to_trail = ( # 0x00 - 0x20 -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, -1, # 0x21 - 0xff 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2 ); sub bocu1_to_utf8 { my $bocu1str = shift; my @chars = unpack("C*", $bocu1str); my $pc = 0x40; my @codepoints; for (my $i=0; $i<=$#chars; $i++) { my $lead = $chars[$i]; my $cp = 0; my $diff = 0; if ($lead <= 0x20) { $cp = $lead; } elsif ($lead == 0x21) { # 21 t1 t2 t3 my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; my $t2 = $bocu1_byte_to_trail[$chars[++$i]]; my $t3 = $bocu1_byte_to_trail[$chars[++$i]]; croak "illegal trail char" if $t1 < 0 || $t2 < 0 || $t3 < 0; $diff = 14161247 + $t1 * 59049 + $t2 * 243 + $t3 } elsif ($lead < 0x25) { # [22-24] t1 t2 my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; my $t2 = $bocu1_byte_to_trail[$chars[++$i]]; croak "illegal trail char" if $t1 < 0 || $t2 < 0; $diff = -2195326 + $lead * 59049 + $t1 * 243 + $t2; } elsif ($lead < 0x50) { # [25-4F] t1 my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; croak "illegal trail char" if $t1 < 0; $diff = -19504 + $lead * 243 + $t1; } elsif ($lead < 0xd0) { # [50-CF] $diff = $lead - 0x90; } elsif ($lead < 0xfb) { # [D0-FA] t1 my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; croak "illegal trail char" if $t1 < 0; $diff = -50480 + $lead * 243 + $t1; } elsif ($lead < 0xfe) { # [FB-FD] t1 t2 my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; my $t2 = $bocu1_byte_to_trail[$chars[++$i]]; croak "illegal trail char" if $t1 < 0 || $t2 < 0; $diff = -14810786 + $lead * 59049 + $t1 * 243 + $t2; } elsif ($lead == 0xfe) { # FE t1 t2 t3 my $t1 = $bocu1_byte_to_trail[$chars[++$i]]; my $t2 = $bocu1_byte_to_trail[$chars[++$i]]; my $t3 = $bocu1_byte_to_trail[$chars[++$i]]; croak "illegal trail char" if $t1 < 0 || $t2 < 0 || $t3 < 0; $diff = 187660 + $t1 * 59049 + $t2 * 243 + $t3; } elsif ($lead == 0xff) { ## reset $cp = 0; $diff = 0; } # codepoint, next pc if ($lead <= 0x20) { $pc = 0x40 if ($lead < 0x20); push(@codepoints,$lead); } elsif ($lead < 0xff) { $cp = $pc + $diff; $cp = 0 if $cp < 0; push(@codepoints,$cp); if ($cp < 0x20) { $pc = 0x40; } elsif ($cp == 0x20) { # keep pc } elsif (0x3040 <= $cp && $cp <= 0x309f) { $pc = 0x3070; } elsif (0x4e00 <= $cp && $cp <= 0x9fa5) { $pc = 0x7711; } elsif (0xac00 <= $cp && $cp <= 0xd7a3) { $pc = 0xc1d1; } else { $pc = ($cp & ~0x7f) + 0x40; } } else { # 0xff : reset $pc = 0x40; } } my $utf8str = pack("U*", @codepoints); Encode::_utf8_on($utf8str); $utf8str; } sub utf8_to_bocu1 { my $utf8str = shift; my @chars = unpack("U*", $utf8str); my $bocu1str = '*' x $#chars; $bocu1str = ''; my $pc = 0x40; for (my $i=0; $i<=$#chars; $i++) { my $cp = $chars[$i]; next if $i == 0 && $cp == 0xfeff; croak "unsupported codepoint (>0x1fffff)." if $cp > 0x001fffff; # cp -> diff -> bocu1 if ($cp <= 0x20) { $bocu1str .= chr($cp); $pc = 0x40 unless $cp == 0x20; } else { my $diff = $cp - $pc; my $b; if ($diff < -187660) { # [...,-187660) : 21 $diff -= -14536567; my $t3 = $diff % 243; $diff = int($diff / 243); my $t2 = $diff % 243; $diff = int($diff / 243); my $t1 = $diff % 243; $diff = int($diff / 243); # my $t0 = $diff; $b = pack("C4", 0x21, $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2], $bocu1_trail_to_byte[$t3]); } elsif ($diff < -10513) { # [-187660,-10513) : 22-24 $diff -= -187660; my $t2 = $diff % 243; $diff = int($diff / 243); my $t1 = $diff % 243; $diff = int($diff / 243); my $t0 = $diff; $b = pack("C3", (0x22 + $t0), $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2]); } elsif ($diff < -64) { # [-10513,-64) : 25-4F $diff -= -10513; my $t1 = $diff % 243; $diff = int($diff / 243); my $t0 = $diff; $b = pack("C2", (0x25 + $t0), $bocu1_trail_to_byte[$t1]); } elsif ($diff < 64) { # [-64,63) : 50-CF $diff -= -64; my $t0 = $diff; $b = pack("C", (0x50 + $t0)); } elsif ($diff < 10513) { # [64,10513) : D0-FA $diff -= 64; my $t1 = $diff % 243; $diff = int($diff / 243); my $t0 = $diff; $b = pack("C2", (0xd0 + $t0), $bocu1_trail_to_byte[$t1]); } elsif ($diff < 187660) { # [10513,187660) : FB-FD $diff -= 10513; my $t2 = $diff % 243; $diff = int($diff / 243); my $t1 = $diff % 243; $diff = int($diff / 243); my $t0 = $diff; $b = pack("C3", (0xfb + $t0), $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2]); } else { # [187660,...) : FE $diff -= 187660; my $t3 = $diff % 243; $diff = int($diff / 243); my $t2 = $diff % 243; $diff = int($diff / 243); my $t1 = $diff % 243; $diff = int($diff / 243); # my $t0 = $diff; $b = pack("C4", 0xfe, $bocu1_trail_to_byte[$t1], $bocu1_trail_to_byte[$t2], $bocu1_trail_to_byte[$t3]); } $bocu1str .= $b; # next pc if (0x3040 <= $cp && $cp <= 0x309f) { $pc = 0x3070; } elsif (0x4e00 <= $cp && $cp <= 0x9fa5) { $pc = 0x7711; } elsif (0xac00 <= $cp && $cp <= 0xd7a3) { $pc = 0xc1d1; } else { $pc = $cp & ~0x7f | 0x40; } } } $bocu1str; } 1; __END__