/usr/local/CPAN/Lingua-IW-Logical/Lingua/IW/Logical.pm
#!/usr/bin/perl -w
package Lingua::IW::Logical;
use strict;
use integer;
require Exporter;
@Lingua::IW::Logical::ISA = qw(Exporter);
@Lingua::IW::Logical::EXPORT = qw(log2vis_string log2vis_text);
@Lingua::IW::Logical::EXPORT_OK = qw(set_debug);
$Lingua::IW::Logical::VERSION="0.5";
################################################################################
# Logical-Visual Hebrew subroutines
# Author: Stanislav Malyshev <frodo@sharat.co.il>
# Date: 02/08/1998
# Based on the algorithm from the book 'The Unicode Standard, Version 2.0'
# Redistribution and modification of the code is allowed freely under LGPL terms
################################################################################
my($ALEPH,$TAV,$debug,$STRONG_RTL,$STRONG_LTR,$WEAK_EN,$WEAK_CS,$WEAK_ET,$WEAK_ES,$NEUTRAL_WS,$NEUTRAL_ON,%mirror);
$ALEPH='à';
$TAV='ú';
$debug=0; # put 1 here for full report of what algorithm is doing
# this is entity-type constants
# see Unicode 2.0, page 4-11
$STRONG_RTL=0;
$STRONG_LTR=1;
$WEAK_EN=2;
$WEAK_CS=3;
$WEAK_ET=6;
$WEAK_ES=7;
$NEUTRAL_WS=4;
$NEUTRAL_ON=5;
# This is the list of "mirrored" characters
# see Unicode 2.0, page 4-22
%mirror = (
'(' => ')',
')' => '(',
'[' => ']',
']' => '[',
'{' => '}',
'}' => '{',
);
# subroutine to get type of a character
# needs to be converted to a hash for efficiency
sub get_type {
my($l)=@_;
return $STRONG_RTL if($l ge $ALEPH && $l le $TAV);
return $STRONG_LTR if($l =~ /[a-zA-Z]/);
return $WEAK_EN if($l =~ /[0-9]/);
return $WEAK_ET if($l =~ m|[-\$%+\^#]|);
return $WEAK_ES if($l =~ m%[./\(\)]%);
return $WEAK_CS if($l =~ /[:,;]/);
return $NEUTRAL_WS if($l =~ /\s/);
return $NEUTRAL_ON;
}
sub set_debug {
my($db)=@_;
$debug=$db;
}
# main reverse subroutine
# expects text in "logical" encoding, converts to "visual"
sub log2vis_string ($) {
my($str)=@_;
my($i)=();
return $str unless $str =~ /[$ALEPH-$TAV]/o; # shortcut - no hebrew
my($len)=length($str);
# making levels
my(@str_types)=map { &get_type($_) } split(//,$str); # get character types
print "{",join(":",@str_types),"}\n" if $debug;
# resolving weak types
# see page 3-15 to 3-23
for($i=0;$i<$len;$i++) {
# EN,ES,EN -> EN,EN,EN
if($str_types[$i] == $WEAK_ES && $str_types[$i-1] == $WEAK_EN && $str_types[$i+1] == $WEAK_EN) {
$str_types[$i]=$WEAK_EN;
next;
}
# EN,CS,EN -> EN,EN,EN
if($str_types[$i] == $WEAK_CS && $str_types[$i-1] == $WEAK_EN && $str_types[$i+1] == $WEAK_EN) {
$str_types[$i]=$WEAK_EN;
next;
}
# EN, ET -> EN,EN
if($i>0 && $str_types[$i-1] == $WEAK_EN && $str_types[$i] == $WEAK_ET && $str_types[$i+1] != $STRONG_RTL) {
$str_types[$i]=$WEAK_EN;
next;
}
# ET, EN -> EN,EN
if($str_types[$i+1] == $WEAK_EN && $str_types[$i] == $WEAK_ET) {
$str_types[$i]=$WEAK_EN;
next;
}
# otherwise: L, ES, EN -> L, N, EN
# etc.
## if($i>0 && $str_types[$i-1] == $STRONG_LTR && $str_types[$i+1] == $WEAK_EN) {
if($str_types[$i] == $WEAK_CS || $str_types[$i] == $WEAK_ES || $str_types[$i] == $WEAK_ET) {
$str_types[$i]=$NEUTRAL_ON;
next;
}
} ## for
print "<",join(":",@str_types),">\n" if $debug;
# making directions
# r - RTL, l - LTR, n - neutral (takes current direction), e - embedding level direction
my($levels)='-' x $len; # initially characters have no directionality
my($base)=''; # base directionality of the string
my($last_strong,@next_strong)=();
for($i=0;$i<$len;$i++) {
# first strong character is LTR - all before are LTR
if($str_types[$i] == $STRONG_LTR) {
substr($levels,$i,1) = 'l';
# substr($levels,0,$i) = 'l' x $i unless $base;
$base='l' unless $base;
for(my($j)=$last_strong;$j<$i;$j++) { $next_strong[$j]=$STRONG_LTR; }
$last_strong=$i;
next;
}
# first strong character is RTL - all before are RTL
if($str_types[$i] == $STRONG_RTL) {
substr($levels,$i,1) = 'r';
# substr($levels,0,$i) = 'r' x $i unless $base;
$base='r' unless $base;
for(my($j)=$last_strong;$j<$i;$j++) { $next_strong[$j]=$STRONG_RTL; }
$last_strong=$i;
next;
}
# directioning neutrals
if($str_types[$i] == $NEUTRAL_ON || $str_types[$i] == $NEUTRAL_WS) {
# RNR -> RRR
if($str_types[$i-1] == $STRONG_RTL && $str_types[$i+1] == $STRONG_RTL) {
substr($levels,$i,1)='r';
next;
}
# LNL -> LLL
if($str_types[$i-1] == $STRONG_LTR && $str_types[$i+1] == $STRONG_LTR) {
substr($levels,$i,1)='l';
next;
}
# RNL -> ReL
if($str_types[$i-1] == $STRONG_RTL && $str_types[$i+1] == $STRONG_LTR) {
substr($levels,$i,1)='e';
next;
}
# LNR -> LeR
if($str_types[$i-1] == $STRONG_LTR && $str_types[$i+1] == $STRONG_RTL) {
substr($levels,$i,1)='e';
next;
}
# RNW -> RR?
if($str_types[$i-1] == $STRONG_RTL && $str_types[$i+1] == $WEAK_EN) {
substr($levels,$i,1)='r';
next;
}
# LNW -> LL?
if($str_types[$i-1] == $STRONG_LTR && $str_types[$i+1] == $WEAK_EN) {
substr($levels,$i,1)='l';
next;
}
# if basic directionality is RTL : WNL -> ?LL
if($base == 'r' && $str_types[$i-1] == $WEAK_EN && $str_types[$i+1] == $STRONG_LTR) {
substr($levels,$i,1)='l';
next;
}
# if basic directionality is RTL : WNR -> ?RR
if($base == 'r' && $str_types[$i-1] == $WEAK_EN && $str_types[$i+1] == $STRONG_RTL) {
substr($levels,$i,1)='r';
next;
}
substr($levels,$i,1)='e'; # default for neutrals is 'e'
next;
}
# weak entity
if($str_types[$i] == $WEAK_EN) {
substr($levels,$i,1) = 'n';
next;
}
substr($levels,$i,1) = 'e'; # if not matched - take 'e'
}
print $levels,"\n" if $debug;
# compose string
my($dir)=$base; ##substr($str,0,1); # current direction
my($cursor)=0;
my($outstr)='';
my($nowdir)=$dir;
for($i=0;$i<$len;$i++) {
my($c)=substr($str,$i,1);
if(substr($levels,$i,1) eq 'l') {
$dir = 'l';
}
if(substr($levels,$i,1) eq 'r') {
$dir = 'r';
}
if(substr($levels,$i,1) eq 'e') {
if($dir eq 'r' && $next_strong[$i] == $STRONG_LTR) {
# $dir = 'l';
}
elsif($dir eq 'l' && $next_strong[$i] == $STRONG_RTL) {
$dir = 'r'
}
elsif($next_strong[$i] == '') {
$dir = $base;
}
}
$nowdir=$dir;
# space between LTR and RTL is moved towards RTL, like
# abc ABC -> CBA abc
if(substr($levels,$i,1) eq 'e' && $dir eq 'l' && $next_strong[$i] == $STRONG_RTL) {
$nowdir = 'r';
}
if(substr($levels,$i,1) eq 'n') {
$nowdir='l';
}
$cursor=0 if $nowdir eq 'r';
$c=$mirror{$c} if($nowdir eq 'r' && $mirror{$c} ne '');
substr($outstr,$cursor,0)=$c;
$cursor++ if $nowdir eq 'l';
print "$dir:$nowdir:$cursor: [$outstr]\n" if $debug;
}
return $outstr;
}
# this one works with texts, i.e. handles linebreaks, etc. and splits text on the given width
sub log2vis_text {
my($text,$string_len,$before,$after) = @_;
$string_len = 80 unless $string_len;
$after = "\n" unless defined($after);
my($logstr,$outtext,$visstr);
while($text =~ /(.*(\n|$))/g) { # for each line
next unless $1;
$logstr = $1;
chomp($logstr);
$visstr = log2vis_string($logstr);
while(length($visstr) > $string_len) { # we need to divide
substr($visstr,-$string_len) =~ /.*?\s(.*)/; # find first space after length
$outtext .= ( defined($before) ? $before : ' ' x ($string_len - length($1) - 1) ). $1 . $after; #
substr($visstr,-length($1))='';
}
$outtext .= (defined($before) ? $before : ' ' x ($string_len - length($visstr) - 1) ) . $visstr . $after;
}
return $outtext;
}