MHonArc::UTF8::MhaEncode - UTF-8 based routines for MHonArc


MHonArc documentation Contained in the MHonArc distribution.

Index


Code Index:

NAME

Top

MHonArc::UTF8::MhaEncode - UTF-8 based routines for MHonArc

SYNOPSIS

Top

  use MHonArc::UTF8::MhaEncode;

DESCRIPTION

Top

MHonArc::UTF8::MhaEncode provides UTF-8 related routines for use in MHonArc. Implementation of routines are designed to work with non-Unicode aware versions of Perl 5.

This module is generally not accessed directly since it is used by MHonArc::UTF8 when determining what encoding routines it can use based on your perl installation. However, the following shows you how to use it directly:

  <CharsetConverters override>
  plain;   mhonarc::htmlize;
  default; MHonArc::UTF8::MhaEncode::str2sgml; MHonArc/UTF8/MhaEncode.pm
  </CharsetConverters>

  <TextClipFunc>
  MHonArc::UTF8::MhaEncode::clip; MHonArc/UTF8/MhaEncode.pm
  </TextClipFunc>

FUNCTIONS

Top

to_utf8($data, $from_charset, $to_charset)

Converts $data encoded in $from_charset into UTF-8. $to_charset is ignored since it assumed to be utf-8.

str2sgml($data, $charset)

All data passed in is converted to utf-8 with HTML specials converted into entity references.

clip($text, $clip_len, $is_html, $has_tags)

Clip $text to $clip_len number of characters.

SEE ALSO

Top

MHonArc::UTF8

VERSION

Top

$Id: MhaEncode.pm,v 1.3 2003/03/05 22:17:15 ehood Exp $

AUTHOR

Top

Earl Hood, earl@earlhood.com

MHonArc comes with ABSOLUTELY NO WARRANTY and MHonArc may be copied only under the terms of the GNU General Public License, which may be found in the MHonArc distribution.


MHonArc documentation Contained in the MHonArc distribution.

##---------------------------------------------------------------------------##
##  File:
##	$Id: MhaEncode.pm,v 1.3 2003/03/05 22:17:15 ehood Exp $
##  Author:
##      Earl Hood       earl@earlhood.com
##  Description:
##	POD after __END__.
##---------------------------------------------------------------------------##
##    Copyright (C) 2002	Earl Hood, earl@earlhood.com
##
##    This program is free software; you can redistribute it and/or modify
##    it under the terms of the GNU General Public License as published by
##    the Free Software Foundation; either version 2 of the License, or
##    (at your option) any later version.
##
##    This program is distributed in the hope that it will be useful,
##    but WITHOUT ANY WARRANTY; without even the implied warranty of
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##    GNU General Public License for more details.
##
##    You should have received a copy of the GNU General Public License
##    along with this program; if not, write to the Free Software
##    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
##    02111-1307, USA
##---------------------------------------------------------------------------##

package MHonArc::UTF8::MhaEncode;

use strict;
use MHonArc::CharMaps;
use MHonArc::Char;

my %CharsetMaps = (
    'iso-8859-1'     =>	'MHonArc/UTF8/ISO8859_1.pm',
    'iso-8859-2'     =>	'MHonArc/UTF8/ISO8859_2.pm',
    'iso-8859-3'     =>	'MHonArc/UTF8/ISO8859_3.pm',
    'iso-8859-4'     =>	'MHonArc/UTF8/ISO8859_4.pm',
    'iso-8859-5'     =>	'MHonArc/UTF8/ISO8859_5.pm',
    'iso-8859-6'     =>	'MHonArc/UTF8/ISO8859_6.pm',
    'iso-8859-7'     =>	'MHonArc/UTF8/ISO8859_7.pm',
    'iso-8859-8'     =>	'MHonArc/UTF8/ISO8859_8.pm',
    'iso-8859-9'     =>	'MHonArc/UTF8/ISO8859_9.pm',
    'iso-8859-10'    =>	'MHonArc/UTF8/ISO8859_10.pm',
    'iso-8859-11'    =>	'MHonArc/UTF8/ISO8859_11.pm',
    'iso-8859-13'    =>	'MHonArc/UTF8/ISO8859_13.pm',
    'iso-8859-14'    =>	'MHonArc/UTF8/ISO8859_14.pm',
    'iso-8859-15'    =>	'MHonArc/UTF8/ISO8859_15.pm',
    'iso-8859-16'    =>	'MHonArc/UTF8/ISO8859_16.pm',
    'cp866'	     =>	'MHonArc/UTF8/CP866.pm',
    'cp949'	     =>	'MHonArc/UTF8/CP949.pm', # euc-kr
    'cp932'	     =>	'MHonArc/UTF8/CP932.pm', # shiftjis
    'cp936'	     =>	'MHonArc/UTF8/CP936.pm', # GBK
    'cp950'	     =>	'MHonArc/UTF8/CP950.pm',
    'cp1250'	     =>	'MHonArc/UTF8/CP1250.pm',
    'cp1251'	     =>	'MHonArc/UTF8/CP1251.pm',
    'cp1252'	     =>	'MHonArc/UTF8/CP1252.pm',
    'cp1253'	     =>	'MHonArc/UTF8/CP1253.pm',
    'cp1254'	     =>	'MHonArc/UTF8/CP1254.pm',
    'cp1255'	     =>	'MHonArc/UTF8/CP1255.pm',
    'cp1256'	     =>	'MHonArc/UTF8/CP1256.pm',
    'cp1257'	     =>	'MHonArc/UTF8/CP1257.pm',
    'cp1258'	     =>	'MHonArc/UTF8/CP1258.pm',
    'koi-0'	     =>	'MHonArc/UTF8/KOI_0.pm',
    'koi-7'	     =>	'MHonArc/UTF8/KOI_7.pm',
    'koi8-a'	     =>	'MHonArc/UTF8/KOI8_A.pm',
    'koi8-b'	     =>	'MHonArc/UTF8/KOI8_B.pm',
    'koi8-e'	     =>	'MHonArc/UTF8/KOI8_E.pm',
    'koi8-f'	     =>	'MHonArc/UTF8/KOI8_F.pm',
    'koi8-r'	     =>	'MHonArc/UTF8/KOI8_R.pm',
    'koi8-u'	     =>	'MHonArc/UTF8/KOI8_U.pm',
    'gost19768-87'   =>	'MHonArc/UTF8/GOST19768_87.pm',
    'viscii'	     =>	'MHonArc/UTF8/VISCII.pm',
    'macarabic'	     =>	'MHonArc/UTF8/AppleArabic.pm',
    'maccentraleurroman' => 'MHonArc/UTF8/AppleCenteuro.pm',
    'maccroatian'    =>	'MHonArc/UTF8/AppleCroatian.pm',
    'maccyrillic'    =>	'MHonArc/UTF8/AppleCyrillic.pm',
    'macgreek'	     =>	'MHonArc/UTF8/AppleGreek.pm',
    'machebrew'	     =>	'MHonArc/UTF8/AppleHebrew.pm',
    'macicelandic'   =>	'MHonArc/UTF8/AppleIceland.pm',
    'macromanian'    =>	'MHonArc/UTF8/AppleRomanian.pm',
    'macroman'	     =>	'MHonArc/UTF8/AppleRoman.pm',
    'macthai'	     =>	'MHonArc/UTF8/AppleThai.pm',
    'macturkish'     =>	'MHonArc/UTF8/AppleTurkish.pm',
    'big5-eten'      =>	'MHonArc/UTF8/BIG5_ETEN.pm',
    'big5-hkscs'     =>	'MHonArc/UTF8/BIG5_HKSCS.pm',
    'gb2312'         =>	'MHonArc/UTF8/GB2312.pm',
    'euc-jp'         =>	'MHonArc/UTF8/EUC_JP.pm',
    'hp-roman8'      =>	'MHonArc/UTF8/HP_ROMAN8.pm',
);

my $char_maps = MHonArc::CharMaps->new(\%CharsetMaps);

##---------------------------------------------------------------------------##

# We do not care for valid sequences, just that we catch everything
my $utf8_re = q/[\x00-\x7F]|
				[\xC0-\xDF][\x00-\xFF]|
				[\xE0-\xEF][\x00-\xFF]{2}|
				[\xF0-\xF7][\x00-\xFF]{3}|
				[\xF8-\xFB][\x00-\xFF]{4}|
				[\xFC\xFD][\x00-\xFF]{5}|
				[\x80-\xFF]/;

# Return the length of an utf-8 string
sub utf8_length {
    my $n = 0;
    while ($_[0] =~ m/($utf8_re)/gox) { ++$n; };
    $n;
}

##---------------------------------------------------------------------------##

## Version of TEXTCLIPFUNC for utf8 strings for versions of Perl without
## decent utf8 support (Perl <= 5.6.x).
sub clip {
    my $str      = shift;   # Unfortunately, it is much easier to make a copy
    my $len      = shift;   # Clip length
    my $is_html  = shift;   # If entity references should be considered
    my $has_tags = shift;   # If html tags should be stripped

    # If not HTML text, things are alot easier
    if (!$is_html) {
	# do nothing if we know for sure there is nothing to do
	return $str
	    if length($str) <= $len;

	# Get $len utf8 chars
	$str =~ m/^((?:$utf8_re){1,$len})/x;
	return $1;
    }

    $str =~ s/<[^>]*>//g  if $has_tags;
    return $str  if length($str) <= $len; # nothing to do

    my($utf8_len, $er_len);
    my $text = "";
    my $subtext = "";
    my $sub_len = $len;
    my $real_len = 0;
    
    while ($str ne '') {
	if (!($str =~ s/^((?:$utf8_re){1,$sub_len})//x)) {
	    # pattern should always match, but just in-case...
	    warn qq/Warning: MHonArc::UTF8::MhaEncode::clip:/,
			 qq/ Internal error/;
	    return $text . $str;
	}
	$subtext = $1;

	# check for clipped entity reference
	if (($str ne '') && ($subtext =~ /\&[^;]*\Z/)) {
	    if ($str =~ s/^([^;]*;)//) {
		$subtext .= $1;
	    } else {
		warn qq/Warning: MHonArc::UTF8::MhaEncode::clip: malformed/,
			     qq/ entity reference detected\n/;
		$subtext .= $str;
		$str = '';
	    }
	}

	# compute entity reference lengths to determine "real" character
	# count and not raw character count.
	$er_len = 0;
	while ($subtext =~ /(\&[^;]+);/g) {
	    $er_len += length($1);
	}

	# done if we have enough
	$utf8_len  = utf8_length($subtext);
	$real_len += $utf8_len - $er_len;
	$text     .= $subtext;
	last       if ($real_len >= $len);
	$sub_len   = $len - $real_len;
    }
    $text;
}

sub to_utf8 {
    my $data    = shift;
    my $charset = lc shift;
    my $data_r  = ref($data) ? $data : \$data;

    return $$data_r  if ($charset eq 'us-ascii' ||
			 $charset eq 'utf-8' ||
			 $charset eq 'utf8');
    MHonArc::Char::map_conv($data_r, $charset, $char_maps);
}

sub str2sgml {
    my $data    = shift;
    my $charset = lc shift;
    my $data_r  = ref($data) ? $data : \$data;

    if ($charset eq 'us-ascii') {
	if ($$data_r =~ /[\x80-\xFF]/) {
	    $charset = 'iso-8859-1';
	} else {
	    $$data_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go;
	    return $$data_r;
	}
    }
    if ($charset eq 'utf-8' || $charset eq 'utf8') {
	$$data_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go;
	return $$data_r;
    }
    MHonArc::Char::map_conv($data_r, $charset, $char_maps);
    $$data_r =~ s/([$HTMLSpecials])/$HTMLSpecials{$1}/go;
    $$data_r;
}

##---------------------------------------------------------------------------##
1;
__END__