| MARC-Charset documentation | Contained in the MARC-Charset distribution. |
MARC::Charset::Compiler - compile XML mapping rules from LoC
$compiler = MARC::Charset::Compiler->new();
$table = $compiler->compile('codetables.xml');
MARC::Charset uses mapping rules from the Library of Congress for generating a MARC::Charset::Table for looking up utf8 values based on the source MARC-8 character set and the character.
The constructor.
Pass in the path to an XML file to compile.
| MARC-Charset documentation | Contained in the MARC-Charset distribution. |
package MARC::Charset::Compiler;
use strict; use warnings; use base qw( XML::SAX::Base ); use XML::SAX::ParserFactory; use Unicode::UCD qw(charinfo); use MARC::Charset::Table; use MARC::Charset::Code;
sub new { my $self = bless {}, 'MARC::Charset::Compiler'; $self->{table} = MARC::Charset::Table->brand_new(); $self->{current_code} = undef; $self->{text} = ''; return $self; }
sub compile { my ($self, $file) = @_; my $factory = XML::SAX::ParserFactory->new(); my $parser = $factory->parser(Handler => $self); $parser->parse_uri($file); } ## SAX event handlers are below sub start_element { my ($self, $data) = @_; my $name = $data->{Name}; if ($name eq 'code') { $self->{current_code} = MARC::Charset::Code->new(); } elsif ($name eq 'characterSet') { my $charset = $data->{Attributes}{'{}ISOcode'}{Value}; warn('missing ISOcode in characterSet element') unless $charset; $self->{current_charset} = $charset; } } sub end_element { my ($self, $data) = @_; my $name = $data->{Name}; # normalize some names for method lookup $name = 'is_combining' if $name eq 'isCombining'; # get the existing code if we have one my $code = $self->{current_code}; # if we're ending a code element if ($code and $name eq 'code') { # if there is no ucs code, use what's in alt $code->ucs($code->alt()) unless $code->ucs; # can't process a code point that lacks a unicode representation die("invalid code: " . $code->to_string()) unless $code->ucs; # set the charset code $code->charset($self->{current_charset}); # lookup the name from perl's character db my $info = charinfo(hex($code->ucs())); $code->name($info->{name}) if $info; # add it to the table $self->{table}->add_code($code); # start with a clean slate $self->{current_code} = undef; } # add these elements elsif ($code and $name =~ /^(marc|ucs|is_combining|alt)$/) { $code->$name($self->text()); } # ending an element so forget all text $self->{text} = ''; } sub characters { my ($self, $data) = @_; return unless $self->{current_code}; my $text = $data->{Data}; $self->{text} .= $data->{Data}; } sub text { my $text = shift->{text}; # collapse whitespace $text =~ s/\s\s+/ /g; # strip new lines $text =~ s/[\r\n]//g; # strip leading whitespace $text =~ s/^\s+//; # strip trailing whitespace $text =~ s/\s+$//; return $text; } 1;