## /usr/local/CPAN/perlSGML.1997Sep/SGML/ISO8859.pm


##---------------------------------------------------------------------------##
##  File:
##	@(#) ISO8859.pm 1.2 97/09/15 14:58:20 @(#)
##  Author:
##      Earl Hood       ehood@medusa.acs.uci.edu
##  Description:
##	Module to deal with ISO-8859 data.
##---------------------------------------------------------------------------##
##    Copyright (C) 1997        Earl Hood, ehood@medusa.acs.uci.edu
##
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with this program; if not, write to the Free Software
##    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
##    02111-1307, USA
##---------------------------------------------------------------------------##

package SGML::ISO8859;

# Inherit import() from Exporter so callers can request the routines
# named in @EXPORT_OK.
use Exporter;
@ISA = ('Exporter');

# Module version string.
$VERSION = '0.01';

# Routines available for import on request; no @EXPORT list is set in
# this file, so callers must name what they want.
@EXPORT_OK = ('&str2sgml', '&sgml2str');

##---------------------------------------------------------------------------
##      US-ASCII/Common characters
##---------------------------------------------------------------------------

# Character code => entity name for the characters listed here as
# US-ASCII/common.  str2sgml consults this table after the
# charset-specific one.
%Char2Ent = (
    0x26 => 'amp',	# ISOnum : Ampersand
    0x3C => 'lt',	# ISOnum : Less-than sign
    0x3E => 'gt',	# ISOnum : Greater-than sign
    0xA0 => 'nbsp',	# ISOnum : NO-BREAK SPACE
);

# Inverse table: entity name => character code.
%Ent2Char = map { ($Char2Ent{$_}, $_) } keys %Char2Ent;

##---------------------------------------------------------------------------
##      Charset specification to mapping
##---------------------------------------------------------------------------

# Charset name => reference to that charset's code-to-entity hash.
# 'us-ascii' uses the common table of this package; 'iso-8859-N'
# (N = 1..10) refers to %SGML::ISO8859::S<N>::Char2Ent.
%CharsetSpec2Char2Ent = ('us-ascii' => \%SGML::ISO8859::Char2Ent);
foreach my $part (1 .. 10) {
    no strict 'refs';		# package hash is addressed by name
    $CharsetSpec2Char2Ent{"iso-8859-$part"} =
	\%{"SGML::ISO8859::S${part}::Char2Ent"};
}

# Charset name => reference to that charset's entity-to-code hash.
# 'us-ascii' uses the common table of this package; 'iso-8859-N'
# (N = 1..10) refers to %SGML::ISO8859::S<N>::Ent2Char.
%CharsetSpec2Ent2Char = ('us-ascii' => \%SGML::ISO8859::Ent2Char);
foreach my $part (1 .. 10) {
    no strict 'refs';		# package hash is addressed by name
    $CharsetSpec2Ent2Char{"iso-8859-$part"} =
	\%{"SGML::ISO8859::S${part}::Ent2Char"};
}

##---------------------------------------------------------------------------

###############################################################################
##	Routines
###############################################################################

##---------------------------------------------------------------------------##
##	str2sgml converts a string encoded by $charset to an sgml
##	string where special characters are converted to entity
##	references.
##
##	$return_data = SGML::ISO8859::str2sgml($data, $charset, $only8bit);
##
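##	If $only8bit is non-zero, then only characters with codes of
##	0xA0 and above are translated; every character below 0xA0
##	(including "&", "<" and ">") is copied through unchanged.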
##
##	Arguments:
##	    $data	String to convert.
##	    $charset	Character set name.  Case is ignored and "_" is
##			treated as "-".  A name containing "iso-8859-N"
##			selects the table registered for part N, loading
##			SGML/ISO8859/SN.pm on first use.  Any other name,
##			and any part N without a registered table, is
##			converted with the common table only.
##	    $only8bit	If true, characters below 0xA0 are left alone.
##
##	Returns the converted string ('' for empty or undefined $data).
##	Dies (via require) only if the module for a registered part
##	cannot be loaded.
##
sub str2sgml {
    my $data     =    shift;
    my $charset  = lc shift;
    my $only8bit =    shift;

    $charset =~ tr/_/-/;

    # Get mapping.  Start from the common table and switch to a
    # charset-specific one only when that part is registered.  The lookup
    # key is rebuilt from the captured part number because the pattern is
    # unanchored: "x-iso-8859-1" must find the same table as "iso-8859-1".
    my $map = $CharsetSpec2Char2Ent{'us-ascii'};
    if ($charset =~ /iso-8859-(\d+)/) {
        my $set = $1;
        my $key = "iso-8859-$set";
        if (ref $CharsetSpec2Char2Ent{$key}) {
            require "SGML/ISO8859/S$set.pm";	# Load mapping
            $map = $CharsetSpec2Char2Ent{$key};
        }
    }
    $map = {} unless ref $map;

    # Convert string one character at a time.
    my $ret = '';
    my $len = defined($data) ? length($data) : 0;
    for (my $offset = 0; $offset < $len; $offset++) {
        my $raw  = substr($data, $offset, 1);
        # ord() rather than unpack("C"): a code point above 0xFF is
        # passed through as is instead of being wrapped modulo 256 and
        # possibly mistaken for a mapped character.
        my $char = ord($raw);

        if ($only8bit && $char < 0xA0) {
            $ret .= $raw;
        } elsif ($map->{$char}) {
            $ret .= '&' . $map->{$char} . ';';
        } elsif ($Char2Ent{$char}) {
            # Common characters apply to every charset.
            $ret .= '&' . $Char2Ent{$char} . ';';
        } else {
            $ret .= $raw;
        }
    }
    return $ret;
}

##---------------------------------------------------------------------------##
##	sgml2str converts a string with sdata entity references to the
##	raw character values denoted by a character set.
##
##	$return_data = SGML::ISO8859::sgml2str($data, $charset);
##
##	Arguments:
##	    $data	String containing "&name;" entity references.
##	    $charset	Character set name.  Case is ignored and "_" is
##			treated as "-".  A name containing "iso-8859-N"
##			selects the table registered for part N, loading
##			SGML/ISO8859/SN.pm on first use.  Any other name,
##			and any part N without a registered table, is
##			converted with the common table only.
##
##	Returns the converted string.  A reference whose name is in
##	neither the charset table nor the common table is left as is.
##	Dies (via require) only if the module for a registered part
##	cannot be loaded.
##
sub sgml2str {
    my $data     =    shift;
    my $charset  = lc shift;

    $charset =~ tr/_/-/;

    # Get mapping.  Start from the common table and switch to a
    # charset-specific one only when that part is registered.  The lookup
    # key is rebuilt from the captured part number because the pattern is
    # unanchored: "x-iso-8859-1" must find the same table as "iso-8859-1".
    my $map = $CharsetSpec2Ent2Char{'us-ascii'};
    if ($charset =~ /iso-8859-(\d+)/) {
        my $set = $1;
        my $key = "iso-8859-$set";
        if (ref $CharsetSpec2Ent2Char{$key}) {
            require "SGML/ISO8859/S$set.pm";	# Load mapping
            $map = $CharsetSpec2Ent2Char{$key};
        }
    }
    $map = {} unless ref $map;

    # Replace each known reference with its character; the charset table
    # takes precedence over the common one.
    $data =~ s{\&([\w\.\-]+);}{
        defined($map->{$1})      ? sprintf("%c", $map->{$1})
      : defined($Ent2Char{$1})   ? sprintf("%c", $Ent2Char{$1})
      :                            "&$1;"
    }ge;
    return $data;
}

##---------------------------------------------------------------------------##
1;