| Encode-UTR22 documentation | Contained in the Encode-UTR22 distribution. |
Encode::UTR22 - Implement Unicode TR22 complex conversions
Implements all of UTR22 except: validity, header, bidirectional re-ordering, history, v attribute for versioning, aliases - that's the job of another module fbu and fub are treated synonymously with single directional a, with equal priority
Supports UTR22c extensions including: contexts, reordering
Hash containing attributes from the <characterMapping> element.
Two element array containing, in order, the bytes and Unicode replacement characters
Hash, indexed by classname, returning Encode::UTR22::Regexp::class (Encode::UTR22::Regexp::class) object.
Array of rules, each rule being a hash.
Hash, indexed by contextname, returning Encode::UTR22::Regexp::group (Encode::UTR22::Regexp::group) object representing a context expression.
Hash, indexed by 'bytes' or 'unicode', returning hash containing ordering elements 'b', 'u', 'bctxt', 'actxt'
Create new instance, parsing and compiling the UTR22 xml
Create and return a new instance, and parse (but do not compile) a UTR22 xml file
Compile a UTR22 object. Parameters recognized
Determines which direction that map will be compiled. True = compile for Unicode to Bytes. False = compile for Bytes to Unicode. Default is both.
Turn on debugging.
Perform Bytes to Unicode conversion.
Perform Unicode to Bytes conversion.
This module is copyright SIL International and is distributed under the same terms as Perl itself.
| Encode-UTR22 documentation | Contained in the Encode-UTR22 distribution. |
package Encode::UTR22;
require 5.8.0; use XML::Parser::Expat; use Unicode::Normalize; use Encode; use Carp; use strict; use vars qw($curr_side $VERSION); $VERSION = 0.03; # MJPH 6-FEB-2004 Add Normalization to encode() # $VERSION = 0.02; # MJPH 7-JUL-2004 Add bctxt to reorder rules
sub new { my ($class, $infile, %attrs) = @_; my ($self) = $class->process_file($infile, %attrs) || return undef; $self->compile(%attrs); $self; }
sub process_file { my ($class, $infile, %attrs) = @_; my ($xml) = XML::Parser::Expat->new(); my ($context) = {}; my ($r); bless $context, ref $class || $class; $xml->{' mycontext'} = $context; my (%regex_classes) = ( 'group' => 'Encode::UTR22::Regexp::Group', 'class-ref' => 'Encode::UTR22::Regexp::classRef', 'context-ref' => 'Encode::UTR22::Regexp::contextRef', 'eos' => 'Encode::UTR22::Regexp::EOS' ); my $current ; $xml->setHandlers('Start' => sub { my ($xml, $tag, %attrs) = @_; my ($this, $temp); $attrs{'line'} = $xml->current_line; if ($tag eq 'characterMapping') { $xml->{' mycontext'}{'info'} = {%attrs}; } elsif ($tag eq 'assignments') { $xml->{' mycontext'}{'sub'}[0] = pack('C', hex($attrs{'sub'})); $xml->{' mycontext'}{'sub'}[1] = pack('U', 0xFFFD); } elsif ($tag eq 'a' || $tag eq 'fbu' || $tag eq 'fub') { error($xml, undef, "b and u attributes are required in a element") unless (defined $attrs{'b'} && defined $attrs{'u'}); push(@{$xml->{' mycontext'}{'rules'}}, { 'line' => $xml->current_line, # 'b' => pack('C0C*', map {hex($_)} $attrs{'b'} =~ m/\G\s*([0-9a-fA-F]{2})/og), # 'u' => pack('U0U*', map {hex($_)} $attrs{'u'} =~ m/\G\s*([0-9a-fA-F]{4,6})/og), 'b' => pack('C*', map {hex($_)} split(' ', $attrs{'b'})), 'u' => pack('U*', map {hex($_)} split(' ', $attrs{'u'})), 'type' => $tag, 'bactxt' => $attrs{'bactxt'}, 'bbctxt' => $attrs{'bbctxt'}, 'uactxt' => $attrs{'uactxt'}, 'ubctxt' => $attrs{'ubctxt'}, 'priority' => $attrs{'priority'}}); } elsif ($tag eq 'range') { $xml->{' mycontext'}{'rules'} = [] unless $xml->{' mycontext'}{'rules'}; process_range($xml, $xml->{' mycontext'}{'rules'}, %attrs); } elsif (defined($regex_classes{$tag})) { $this = $regex_classes{$tag}->new(%attrs) || error($xml, undef, "illegal $tag element"); if ($current) { $current = $current->add_child($this); } else { error($xml, $this, "top level regexps must be named") unless ($attrs{'id'}); error($xml, $this, "top level regexps may not share names") if (defined $xml->{' mycontext'}{'contexts'}{$attrs{'id'}}); $xml->{' mycontext'}{'contexts'}{$attrs{'id'}} = $this; $current = $this; } $this->{'owner'} = $xml->{' mycontext'}; error($xml, $this, "context name $attrs{'id'} must not include /") if ($attrs{'id'} && $attrs{'id'} =~ m|/|o); } elsif ($tag eq 'class') { error($xml, undef, "classes must be named") unless ($attrs{'name'}); error($xml, undef, "size must be 'bytes' in class definition $attrs{'name'}") if (defined $attrs{'size'} && $attrs{'size'} ne 'bytes'); $this = Encode::UTR22::Regexp::class->new(%attrs); $current = $this; $xml->{' mycontext'}{'classes'}{$attrs{'name'}} = $this; } elsif ($tag eq 'class-include') { error($xml, undef, "class-include $attrs{'name'} not in classes element") unless (defined $current && $current->can('add_from')); $temp = $xml->{' mycontext'}{'classes'}{$attrs{'name'}} || error($xml, $current, "Class $attrs{'name'} not yet defined"); error($xml, $current, "Class-include $attrs{'name'} must be the same data size") if ($temp->{'size'} ne $current->{'size'}); $current->add_from($temp); } elsif ($tag eq 'class-range') { $current->add_range(hex($attrs{'first'}), hex($attrs{'last'})); } elsif ($tag eq 'ordering') { $curr_side = lc($attrs{'side'}); error($xml, undef, "ordering must have a side of 'unicode' or 'bytes'") unless ($curr_side eq 'unicode' || $curr_side eq 'bytes'); } elsif ($tag eq 'order') { error($xml, undef, "order element must have b and u attributes") unless (defined $attrs{'b'} && defined $attrs{'u'}); error($xml, undef, "order must occur inside ordering") unless (defined $curr_side); push(@{$xml->{' mycontext'}{'orders'}{$curr_side}}, { 'line' => $xml->current_line, 'b' => $attrs{'b'}, 'u' => $attrs{'u'}, 'bctxt' => $attrs{'bctxt'}, 'actxt' => $attrs{'actxt'}}); } }, 'End' => sub { my ($xml, $tag) = @_; if ($tag eq 'class') { undef $current; } elsif (defined $regex_classes{$tag}) { $current = $current->{'parent'}; } elsif ($tag eq 'ordering') { $curr_side = ''; } }, 'Char' => sub { my ($xml, $str) = @_; if (defined $current && $current->can('add_elements')) { if ($current->{'size'} && $current->{'size'} eq 'bytes') { $current->add_elements(map {pack('C', hex($_))} split(' ', $str)); } # { $current->add_elements(map {pack('C0C', hex($_))} $str =~ m/\G\s*([0-9a-fA-F]{2})\s*/og); } else { $current->add_elements(map {pack('U', hex($_))} split(' ', $str)); } # { $current->add_elements(map {pack('U0U', hex($_))} $str =~ m/\G\s*([0-9a-fA-F]{4,6})\s*/og); } } elsif ($str !~ /^\s*$/ && !$xml->in_element('modified')) {error($xml, undef, "unexpected text '$str' ignored"); } }); if ($attrs{'-path'}) { my ($done); foreach $r (@{$attrs{'-path'}}) { if (-f "$r/$infile") { $xml->parsefile("$r/$infile"); $done = 1; last; } } $xml->parsefile($infile) unless $done; } elsif (ref $infile) { $xml->parse($infile); } else { $xml->parsefile($infile); } return $context; }
sub compile { my ($self, %attrs) = @_; my ($r); foreach $r (sort {$self->{'contexts'}{$a}{'line'} <=> $self->{'contexts'}{$b}{'line'}} keys %{$self->{'contexts'}}) { $self->{'regexps'}{$r} = $self->{'contexts'}{$r}->as_perl(1); } if (defined $attrs{'toBytes'}) { $self->compile_map($attrs{'toBytes'}); $self->compile_order($attrs{'toBytes'}); } else { $self->compile_map(0); $self->compile_order(0); $self->compile_map(1); $self->compile_order(1); unless($attrs{'debug'}) { foreach $r (qw(rules classes contexts regexps orders)) { delete $self->{$r}; } } } $self; }
sub decode($$;$) { my ($self, $str, $check) = @_; my ($res, $len, $c, $temp, $r, $count, $tpos, $found); return undef unless ($self->{'bsimple'} || $self->{'bconv'}); Encode::_utf8_on($res); $str = $self->reorder($str, $self->{'border'}[0], 1) if (defined $self->{'border'}[0]); $len = length($str); pos($str) = 0; while (pos($str) < $len) { $found = 0; $temp = pos($str); $str =~ m/\G(.)/ogcs; $c = $1; $tpos = pos($str); pos($str) = $temp; if (defined $self->{'bconv'}{$c}) { foreach $r (@{$self->{'bconv'}{$c}}) { if ($str =~ m/$r->[0]/gcs) { $res .= $r->[1]; $found = 1; last; } } } unless ($found) { if (defined $self->{'bsimple'}{$c}) { $res .= $self->{'bsimple'}{$c}; } elsif (ref $check eq 'CODE') { $res .= &{$check}($str, pos($str)); } elsif ($check) { $res .= $check; } else { $res .= pack('U', 0xFFFD); } pos($str) = $tpos; } } $res = $self->reorder($res, $self->{'border'}[1], 0) if (defined $self->{'border'}[1]); $res; }
sub encode($$;$) { my ($self, $str, $check) = @_; my ($res, $len, $c, $temp, $r, $tpos, $found); return undef unless ($self->{'usimple'} || $self->{'uconv'}); if ($self->{'info'}{'normalization'} eq 'NFD') { $str = NFD($str); } elsif ($self->{'info'}{'normalization'} eq 'NFC') { $str = NFC($str); } Encode::_utf8_off($res); $str = $self->reorder($str, $self->{'uorder'}[0], 0) if (defined $self->{'uorder'}[0]); $len = length($str); pos($str) = 0; while (pos($str) < $len) { $found = 0; $temp = pos($str); $str =~ m/\G(.)/ogcs; $c = $1; $tpos = pos($str); pos($str) = $temp; if (defined $self->{'uconv'}{$c}) { foreach $r (@{$self->{'uconv'}{$c}}) { if ($str =~ m/$r->[0]/gcs) { $res .= $r->[1]; $found = 1; last; } } } unless ($found) { if (defined $self->{'usimple'}{$c}) { $res .= $self->{'usimple'}{$c}; } elsif (ref $check eq 'CODE') { $res .= &{$check}($str, pos($str)); } elsif ($check) { $res .= $check; } else { $res .= $self->{'sub'}[0]; } pos($str) = $tpos; } } $res = $self->reorder($res, $self->{'uorder'}[1], 1) if (defined $self->{'uorder'}[1]); $res; } sub debug_decode { my ($self, $str, $check) = @_; my ($res, $len, $c, $temp, $r, $count, $tpos, $found, $debug, $debstr); return undef unless ($self->{'bsimple'} || $self->{'bconv'}); Encode::_utf8_on($res); ($str, $debug) = $self->debug_reorder($str, $self->{'border'}[0], 1) if (defined $self->{'border'}[0]); $debug .= "\nMapping from Bytes to Unicode\n"; $len = length($str); pos($str) = 0; while (pos($str) < $len) { $found = 0; $temp = pos($str); $str =~ m/\G(.)/ogcs; $c = $1; $tpos = pos($str); pos($str) = $temp; if (defined $self->{'bconv'}{$c}) { foreach $r (@{$self->{'bconv'}{$c}}) { if ($str =~ m/$r->[0]/gcs) { $res .= $r->[1]; $found = 1; $debug .= "matched line $r->[2]: " . debug_blist($str, $temp) . " =~ $r->[0] -> " . debug_ulist($r->[1]) . "\n\n"; last; } else { $debug .= "tried line $r->[2]: " . debug_blist($str, $temp) . " =~ $r->[0]\n"; } } } unless ($found) { if (defined $self->{'bsimple'}{$c}) { $res .= $self->{'bsimple'}{$c}; $debug .= "simple: " . debug_blist($c) . " = " . debug_ulist($self->{'bsimple'}{$c}) . "\n\n"; } elsif (ref $check eq 'CODE') { $debug .= "checked at " . pos($str) . "\n\n"; $res .= &{$check}($str, pos($str)); } elsif ($check) { $debug .= "added check: $check\n\n"; $res .= $check; } else { $debug .= "failed\n\n"; $res .= pack('U', 0xFFFD); } pos($str) = $tpos; } } ($res, $debstr) = $self->debug_reorder($res, $self->{'border'}[1], 0) if (defined $self->{'border'}[1]); ($res, $debug . $debstr); } sub debug_encode { my ($self, $str, $check) = @_; my ($res, $len, $c, $temp, $r, $tpos, $found, $debug, $debstr); return undef unless ($self->{'usimple'} || $self->{'uconv'}); if ($self->{'info'}{'normalization'} eq 'NFD') { $str = NFD($str); } elsif ($self->{'info'}{'normalization'} eq 'NFC') { $str = NFC($str); } Encode::_utf8_off($res); ($str, $debug) = $self->debug_reorder($str, $self->{'uorder'}[0], 0) if (defined $self->{'uorder'}[0]); $debug .= "\nMapping from Unicode to Bytes\n"; $len = length($str); pos($str) = 0; while (pos($str) < $len) { use utf8; $found = 0; $temp = pos($str); $str =~ m/\G(.)/ogcs; $c = $1; $tpos = pos($str); pos($str) = $temp; if (defined $self->{'uconv'}{$c}) { foreach $r (@{$self->{'uconv'}{$c}}) { if ($str =~ m/$r->[0]/gcs) { $res .= $r->[1]; $debug .= "matched line $r->[2]: " . debug_ulist($str, $temp) . " =~ $r->[0] -> " . debug_blist($r->[1]) . "\n\n"; $found = 1; last; } else { $debug .= "tried line $r->[2]: " . debug_ulist($str, $temp) . " =~ $r->[0]\n"; } } } unless ($found) { if (defined $self->{'usimple'}{$c}) { $debug .= "simple: " . debug_ulist($c) . " = " . debug_blist($self->{'usimple'}{$c}) . "\n\n"; $res .= $self->{'usimple'}{$c}; } elsif (ref $check eq 'CODE') { $debug .= "check at " . pos($str) . "\n\n"; $res .= &{$check}($str, pos($str)); } elsif ($check) { $debug .= "check: $check\n\n"; $res .= $check; } else { $debug .= "failed: $self->{'sub'}[0]\n\n"; $res .= $self->{'sub'}[0]; } pos($str) = $tpos; } } ($res, $debstr) = $self->debug_reorder($res, $self->{'uorder'}[1], 1) if (defined $self->{'uorder'}[1]); ($res, $debug . $debstr); } sub name { my ($self) = @_; return $self->{'info'}{'id'}; } sub new_sequence { return $_[0]; } sub compile_map { my ($self, $toBytes) = @_; my ($srcl) = $toBytes ? 'u' : 'b'; my ($destl) = $toBytes ? 'b' : 'u'; my ($r, $res, $pre, $post, $lres, $lpre, $lpost, $line, $first, $dump); return $self if ($self->{"${srcl}simple"} || $self->{"${srcl}conv"}); foreach $r (@{$self->{'rules'}}) { next if ($r->{'type'} ne 'a' && (($toBytes == 1) ^ ($r->{'type'} eq 'fub'))); $pre = ''; $post = ''; $res = ''; $lpre = 0; $lpost = 0; $line = $r->{'line'}; if ($r->{"${srcl}bctxt"}) { error (undef, undef, "No regexp " . $r->{"${srcl}bctxt"} . " for ${srcl}bctxt at line $r->{line}") unless ($self->{'regexps'}{$r->{"${srcl}bctxt"}}[0]); ($pre, $dump, $lpre) = @{$self->{'regexps'}{$r->{"${srcl}bctxt"}}}; } $res = $r->{$srcl}; error (undef, undef, "Empty mapping to " . strerror($r->{$destl}, $toBytes) . " not allowed") if ($res eq ''); if ($r->{"${srcl}actxt"}) { error (undef, undef, "No regexp " . $r->{"${srcl}actxt"} . " for ${srcl}actxt at line $r->{line}") unless ($self->{'regexps'}{$r->{"${srcl}actxt"}}[0]); ($post, $dump, $lpost) = @{$self->{'regexps'}{$r->{"${srcl}actxt"}}}; $post = "(?=" . $post . ")"; } if ($toBytes) { use utf8; $r->{'u'} =~ m/^(.)/os; $first = $1; # substr() not working yet # $line = -length($res) if (length($res) > 1); # doesn't work yet my (@temp) = unpack('U*', $res); if ($#temp == 0 && $pre eq '' and $post eq '') { error (undef, undef, "Ambiguous mapping from " . strerror($first, !$toBytes) . " to at least " . strerror($self->{'usimple'}{$first}, $toBytes) . " and " . strerror($r->{$destl}, $toBytes)) if (defined $self->{'usimple'}{$first}); $self->{'usimple'}{$first} = $r->{$destl}; } else { $lres = $#temp + 1; $res =~ s/([$%\\^&*(){}\[\]|"'?\/+.`~\-])/\\$1/ogs; #" $res =~ s/([^\x21-\x7e])/sprintf("\\x{%04X}", unpack('U', $1))/ogse; push (@{$self->{"uconv"}{$first}}, [qr/$pre\G$res$post/, $r->{$destl}, $line, $lres, $lpre, $lpost, $r->{'priority'}]); # print STDERR "qr/$pre\\G$res$post/, $r->{$destl}, $line\n"; } } else { use bytes; $first = substr($res, 0, 1); if (length($res) == 1 && $pre eq '' && $post eq '') { error (undef, undef, "Ambiguous mapping from " . strerror($first, !$toBytes) . " to at least " . strerror($self->{'bsimple'}{$first}, $toBytes) . " and " . strerror($r->{$destl}, $toBytes)) if (defined $self->{'bsimple'}{$first}); $self->{'bsimple'}{$first} = $r->{$destl}; } else { $lres = length($res); $res =~ s/([$%\\^&*(){}\[\]|"'?\/+.`~\-])/\\$1/ogs; #" $res =~ s/([^\x21-\x7e])/sprintf("\\x%02x", ord($1))/ogse; push (@{$self->{"bconv"}{$first}}, [qr/$pre\G$res$post/, $r->{$destl}, $line, $lres, $lpre, $lpost, $r->{'priority'}]); } } } $res = $self->{"${srcl}conv"}; foreach $first (keys %{$res}) { my ($has_short); $r = $res->{$first}; $res->{$first} = [sort { $b->[6] <=> $a->[6] || # highest priority attribute first $b->[3] <=> $a->[3] || # longest match string first $b->[4] + $b->[5] <=> $a->[4] + $a->[5] || # pre+post longest first $a->[2] <=> $b->[2]; # lowest line number first } @{$r}]; foreach (@{$res->{$first}}) { $has_short ||= ($_->[3] == 0);} error (undef, undef, 'No default mapping for ' . ($toBytes ? sprintf("U+%04X", unpack('U', $first)) : sprintf("0x%02x", ord($first)))) unless (!$has_short || defined $self->{"${srcl}simple"}{$first}); } $self; } sub compile_order { my ($self, $toBytes) = @_; my ($srcl) = $toBytes ? 'u' : 'b'; my ($destl) = $toBytes ? 'b' : 'u'; my ($output) = $toBytes ? 'uorder' : 'border'; my (@sides) = $toBytes ? ('unicode', 'bytes') : ('bytes', 'unicode'); my ($count, $r, $obj, $i, $reg, $list, %names, @nums, $name, $reg1, $eval, $rega, $regb, $namec); for ($count = 0; $count < 2; $count++) { $obj = $self->{'orders'}{$sides[$count]}; next unless defined $obj; foreach $r (@{$obj}) { @nums = (); %names = (); if ($r->{'bctxt'}) { ($regb, $list) = @{$self->{'regexps'}{$r->{'bctxt'}}}; $namec = scalar @{$list}; } else { $regb = ''; $namec = 0; } if ($r->{'actxt'}) { ($rega) = @{$self->{'regexps'}{$r->{'actxt'}}}; } else { $rega = ''; } error(undef, undef, "No regexp called $r->{$srcl} available") unless defined $r->{$srcl}; ($reg, $list) = @{$self->{'regexps'}{$r->{$srcl}}}; for ($i = 0; $i <= $#{$list}; $i++) { $name = $list->[$i]; if ($name =~ s{^\Q$r->{$srcl}\E(?:/|$)}{}) { $name =~ s{^\Q$r->{$destl}\E(?:/|$)}{}; } # next unless ($name !~ m|/|o && $name ne ''); $names{$name} = $i; } error(undef, undef, "No regexp called $r->{$destl} available") unless defined $r->{$destl}; ($reg1, $list) = @{$self->{'regexps'}{$r->{$destl}}}; for ($i = 0; $i <= $#{$list}; $i++) { $name = $list->[$i]; if ($name =~ s{^\Q$r->{$destl}\E(?:/|$)}{}) { $name =~ s{^\Q$r->{$srcl}\E(?:/|$)}{}; } # next unless ($name !~ m|/|o && $name ne ''); push (@nums, $names{$name}+1+$namec) if ($name && $names{$name}); } $eval = join('', map {"\$$_"} @nums); if ($sides[$count] eq 'unicode') { use utf8; push (@{$self->{$output}[$count]}, [qr/$regb\G$reg$rega/, $eval, $r->{'line'}, $namec]); } else { use bytes; push (@{$self->{$output}[$count]}, [qr/$regb\G$reg$rega/, $eval, $r->{'line'}, $namec]); } } } $self; } sub reorder { my ($self, $str, $rules, $isbytes) = @_; my ($r, $res, $found, $len, @ress, $temp, $oldpos); if ($isbytes || $] < 5.008) { use bytes; $len = length($str); } else { use utf8; $len = length($str); } while (pos($str) < $len) { $found = 0; foreach $r (@{$rules}) { if ($isbytes) { use bytes; # the \G seems to be anchoring the global search # here so it only finds $r->[0] once next unless (@ress = $str =~ m/$r->[0]/gcs); } else { use utf8; # and here next unless (@ress = $str =~ m/$r->[0]/gcs); } $oldpos += length($ress[$r->[3]]); pos($str) = $oldpos; $temp = $r->[1]; $temp =~ s/\$(\d+)/$ress[$1 - 1]/og; $res .= $temp; $found = 1; last; } unless ($found) { if ($isbytes) { use bytes; $str =~ m/\G(.)/ogcs; $res .= $1; } else { use utf8; $str =~ m/\G(.)/ogcs; $res .= $1; } $oldpos++; } } $res; } sub debug_reorder { my ($self, $str, $rules, $isbytes) = @_; my ($r, $res, $found, $len, @ress, $temp, $debug, $oldpos); $debug = "\nRe-ordering:\n"; if ($isbytes || $] < 5.008) { use bytes; $len = length($str); } else { use utf8; $len = length($str); } foreach $r (@{$rules}) { if ($isbytes) { $debug .= "reorder(line $r->[2]): $r->[0] = "; $debug .= " -> $r->[1]\n"; } else { $debug .= "reorder(line $r->[2]): $r->[0] = "; $debug .= " -> $r->[1])\n"; } } while (pos($str) < $len) { $found = 0; foreach $r (@{$rules}) { if ($isbytes) { use bytes; # the \G seems to be anchoring the global search # here so it only finds $r->[0] once next unless (@ress = $str =~ m/$r->[0]/gcs) } else { use utf8; # and here next unless (@ress = $str =~ m/$r->[0]/gcs); } $oldpos += length($ress[$r->[3]]); pos($str) = $oldpos; $temp = $r->[1]; $temp =~ s/\$(\d+)/$ress[$1 - 1]/og; $res .= $temp; if ($isbytes) { $debug .= "reorder(line $r->[2]): " . join(",", map {debug_blist($_)} @ress); $debug .= " -> $r->[1] = "; $debug .= debug_blist($temp) . "\n\n"; } else { $debug .= "reorder(line $r->[2]): $r->[0]" . join(",", map {debug_ulist($_)} @ress); $debug .= " -> $r->[1]) = "; $debug .= debug_ulist($temp) . "\n\n"; } $found = 1; last; } unless ($found) { if ($isbytes) { use bytes; $str =~ m/\G(.)/ogcs; $res .= $1; } else { use utf8; $str =~ m/\G(.)/ogcs; $res .= $1; } $oldpos++; } } if ($isbytes) { $debug .= "Final result: " . debug_blist($res) . "\n"; } else { $debug .= "Final result: " . debug_ulist($res) . "\n"; } ($res, $debug); } sub process_range { my ($xml, $store, %attrs) = @_; my (@first, @last, @max, @min, $uFirst, $uLast); my (@current, $i, $j, $done); @first = map {hex($_)} ($attrs{'bFirst'} =~ m/\G\s*([0-9a-fA-F]{2})\s*/og) or return error($xml, undef, "bFirst attribute required in range"); @last = map {hex($_)} ($attrs{'bLast'} =~ m/\G\s*([0-9a-fA-F]{2})\s*/og) or return error($xml, undef, "bLast attribute required in range"); @max = map {hex($_)} ($attrs{'bMax'} =~ m/\G\s*([0-9a-fA-F]{2})\s*/og); @min = map {hex($_)} ($attrs{'bMin'} =~ m/\G\s*([0-9a-fA-F]{2})\s*/og); $uFirst = hex($attrs{'uFirst'}); $uLast = hex($attrs{'uLast'}) || return error($xml, undef, "uLast attribute require in range"); @current = @first; for ($i = $uFirst; $i <= $uLast; $i++) { push(@{$store}, { 'line' => $xml->current_line, 'b' => pack('C0C*', @current), 'u' => pack('U0U', $i), 'type' => 'a', 'bactxt' => $attrs{'bactxt'}, 'bbctxt' => $attrs{'bbctxt'}, 'uactxt' => $attrs{'uactxt'}, 'ubctxt' => $attrs{'ubctxt'}, 'priority' => $attrs{'priority'}}); last if $i == $uLast; for ($j = 0; $j <= $#current; $j++) { $current[$j]++; if (defined $max[$j] && $current[$j] > $max[$j]) { $current[$j] = $min[$j]; } else { last; } } } for ($j = 0; $j <= $#current; $j++) { if ($current[$j] != $last[$j]) { error($xml, undef, "Number of byte codes does not correspond to number of Unicodes in range"); last; } } } sub strerror { my ($str, $isBytes) = @_; my ($res); if ($isBytes) { use bytes; $res = join(" ", map {sprintf("0x%02x", ord($_))} split('', $str)); } else { use utf8; $res = join(" ", map {sprintf("U+%04X", unpack('U', $_))} split('', $str)); } $res; } sub error { my ($xml, $obj, $str, $die) = @_; if (defined $obj && $obj->can('as_error')) { $str .= "\n in " . $obj->as_error; } if ($die) { if ($xml) { $xml->xpcroak($str); } else { die($str); } } else { if ($xml) { $xml->xpcarp($str); } else { print STDERR "$str\n"; } } undef; } sub debug_ulist { my ($str, $pos) = @_; my (@res1) = map{sprintf("%04X",$_)} unpack('U*', substr($str, 0, $pos)); my (@res2) = map{sprintf("%04X",$_)} unpack('U*', substr($str, $pos)); return join(" ", defined $pos ? (@res1, '|') : (), @res2); } sub debug_blist { my ($str, $pos) = @_; my (@res) = map{sprintf("x%02X",$_)} unpack('C*', $str); splice(@res, $pos, 0, '|') if defined $pos; return join(" ", @res); } no strict 'refs'; package Encode::UTR22::Regexp::Element; use Carp; sub new { my ($class, %attrs) = @_; my ($self) = {%attrs}; bless $self, ref ($class) || $class; $self; } sub add_child { my ($self, $child) = @_; my ($name); $child->{'parent'} = $self; push (@{$self->{'child'}}, $child); if ($name = $child->{'id'}) { if (defined $self->{'named'}{$name}) { carp("child with duplicate name at line $child->{'line'}"); } } else { $name = $child->{'name'}; while (defined $self->{'named'}{$name}) { $name =~ s/(\d*)$/$1 + 1/oe; } } $self->{'named'}{$name} = $child; $child; } # returns two-element array containing minimum and maximum length of the resultant regex element sub count { my $self = shift; my ($mymin, $mymax); if (exists $self->{'child'} && $#{$self->{'child'}} >= 0) { foreach (@{$self->{'child'}}) { unless (defined $mymin) { ($mymin, $mymax) = $_->count(); } else { my ($cmin, $cmax) = $_->count(); if ($self->{'alt'}) { $mymin = $cmin if $cmin < $mymin; $mymax = $cmax if $cmax > $mymax; } else { $mymin += $cmin; $mymax += $cmax; } } } } else { $mymin = $mymax = 0; } $mymin *= $self->{'min'} if defined $self->{'min'}; $mymax *= $self->{'max'} if defined $self->{'max'}; return ($mymin, $mymax); } sub as_error { $_[0]->{'id'}; } package Encode::UTR22::Regexp::Group; use vars qw(@ISA); BEGIN { @ISA = qw(Encode::UTR22::Regexp::Element); } sub as_perl { my ($self, $atstart, %opts) = @_; my ($r, $res, $names, $count, $text, $sub, $subl, $lacc); my ($min, $max); my ($fn) = $opts{'-fn'} || "as_perl"; $min = defined $self->{'min'} ? $self->{'min'} : 1; $max = defined $self->{'max'} ? $self->{'max'} : 1; $names = []; if ($self->{'id'} ne '' && !$opts{'-noref'}) { $res = "("; $names = [$self->{'id'}]; } if ($self->{'id'} eq '' || $min != 1 || $max != 1) { $res .= "(?:"; } $lacc = 0; foreach $r (@{$self->{'child'}}) { ($text, $sub, $subl) = @{$r->${fn}($atstart, %opts)}; if (defined $self->{'alt'}) { if ($count) { $res .= "|"; } else { $count = 1; } $res .= $text; $lacc = $subl if $subl > $lacc; } else { $res .= $text; $atstart = 0; $lacc += $subl; } push (@{$names}, map {"$self->{'id'}/$_"} @{$sub}); } $res .= ')' if ($self->{'id'} eq '' || $min != 1 || $max != 1); if ($max > 1) { if ($max != $min) { $res .= "{$min,$max}"; } else { $res .= "{$max}"; } } elsif ($min == 0) { $res .= "?"; } $res .= ")" if ($self->{'id'} ne '' && !$opts{'-noref'}); return [$res, $names, $lacc * $max]; } package Encode::UTR22::Regexp::classRef; use vars qw(@ISA); BEGIN { @ISA = qw(Encode::UTR22::Regexp::Element); } sub count { my $self = shift; my ($min, $max); $min = defined $self->{'min'} ? $self->{'min'} : 1; $max = defined $self->{'max'} ? $self->{'max'} : 1; return ($min, $max); } sub as_perl { my ($self, $atstart, %opts) = @_; my ($class) = $self->{'owner'}{'classes'}{$self->{'name'}}; my ($res, $temp, $wrap); my ($min, $max); $min = defined $self->{'min'} ? $self->{'min'} : 1; $max = defined $self->{'max'} ? $self->{'max'} : 1; return warn("No class defined for $self->{'name'}\n in " . $self->as_error) unless defined $class; return warn("Empty class $self->{'name'}\n in ". $self->as_error) unless (defined $class->{'elements'}); if ($class->{'size'} && $class->{'size'} eq 'bytes') { $temp = join('', @{$class->{'elements'}}); $temp =~ s/([$%\\^&*(){}\[\]|+"'?\/.`~\-])/\\$1/og; $temp =~ s/([^\x21-\x7e])/sprintf("\\x%02x", ord($1))/oge; } else { use utf8; $temp = join('', @{$class->{'elements'}}); $temp =~ s/([$%\\^&*(){}\[\]|+"'?\/.`~\-])/\\$1/og; $temp =~ s/([^\x21-\x7e])/sprintf("\\x{%04X}", unpack('U', $1))/oge; } $res = "(" if ($self->{'id'} && !$opts{'-noref'}); $wrap = ($#{$class->{'elements'}} > 0 || defined $self->{'neg'} || $min != 1 || $max != 1); $res .= "[" if $wrap; $res .= '^' if (defined $self->{'neg'}); $res .= "$temp"; $res .= "]" if $wrap; if ($max > 1) { if ($min != $max) { $res .= "{$min,$max}"; } else { $res .= "{$max}"; } } elsif ($min == 0) { $res .= "?"; } if ($self->{'id'}) { $res .= ')' unless ($opts{'-noref'}); return [$res, [$self->{'id'}], $max]; } else { return [$res, [], $max]; } } package Encode::UTR22::Regexp::contextRef; use vars qw(@ISA); BEGIN { @ISA = qw(Encode::UTR22::Regexp::Element); } sub as_perl { my ($self, $atstart, %opts) = @_; my ($ref, $n, $res, $ind, $temp, $len, $id); my ($fn) = $opts{'-fn'} || "as_perl"; unless ($self->{'name'}) { print STDERR "No name attribute in context-ref at line $self->{'line'}\n"; return [undef, '', 0]; } foreach $n (split('/', $self->{'name'})) { if ($ref) { $ref = $ref->{'named'}{$n}; } else { $ref = $self->{'owner'}{'contexts'}{$n}; } unless ($ref) { print STDERR "Can't find reference to $n in $self->{'name'} at line $self->{'line'}\n"; return ['', []]; } } $self->{'named'} = $ref->{'named'}; if (defined $self->{'max'} || defined $self->{'min'}) { $temp = bless {%$ref}, ref $ref; $temp->{'max'} = $self->{'max'} if defined $self->{'max'}; $temp->{'min'} = $self->{'min'} if defined $self->{'min'}; ($res, $ind, $len) = @{$temp->${fn}($atstart, %opts)}; } else { ($res, $ind, $len) = @{$ref->${fn}($atstart, %opts)}; } $id = $self->{'id'} || $self->{'name'}; foreach $n (@{$ind}) { $n =~ s|^[^/]+|$id|o; } return [$res, $ind, $len]; } package Encode::UTR22::Regexp::EOS; use vars qw(@ISA); BEGIN { @ISA = qw(Encode::UTR22::Regexp::Element); } sub as_perl { my ($self, $atstart, %opts) = @_; return [$atstart ? '^' : '$', [], 0]; } package Encode::UTR22::Regexp::class; sub new { my ($class, %attrs) = @_; my ($self) = {%attrs}; bless $self, ref $class || $class; } sub add_elements { my ($self) = shift; push (@{$self->{'elements'}}, @_); $self; } sub add_from { my ($self, $other) = @_; push (@{$self->{'elements'}}, @{$other->{'elements'}}); $self; } sub add_range { my ($self, $start, $end) = @_; push (@{$self->{'elements'}}, map {pack($self->{'size'} eq 'bytes' ? 'C' : 'U', $_)} ($start .. $end)); $self; } 1;