| Text-AsReadWithoutIntonation documentation | Contained in the Text-AsReadWithoutIntonation distribution. |
Text::AsReadWithoutIntonation - Perl extension for converting sentences to text resembling the way it would be read without intonation.
use Text::AsReadWithoutIntonation; $readWord = Text::AsReadWithoutIntonation::inSpanish($text);
Perl extension for converting sentences to text resembling the way it would be read without intonation (no questions, no pauses or any other prosodic clues).
None by default.
Alberto Montero, <amonero@gsi.dit.upm.es>
Copyright (C) 2007 by Alberto Montero
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.8 or, at your option, any later version of Perl 5 you may have available.
Convert the given sentence to text resembling the way it would be read without intonation (no questions, no pauses or any other prosodic clues).
Non-interpretable symbols are suppressed.
Numbers are read in Spanish. If ª or º is supplied, the appropriate gender is used.
Roman numerals less than or equal to 10 are read as ordinals, and as cardinals otherwise.
Supported currencies are euros and dollars.
Supported mathematical symbols are equals, plus and percent.
'@' is correctly read
Read the supplied single word in Spanish, with the same conventions as inSpanish.
Read the supplied single word in Spanish, symbol by symbol.
| Text-AsReadWithoutIntonation documentation | Contained in the Text-AsReadWithoutIntonation distribution. |
package Text::AsReadWithoutIntonation;

use 5.008008;
use strict;
use warnings;

require Exporter;
use AutoLoader qw(AUTOLOAD);

our $VERSION = '0.9';
our @ISA     = qw(Exporter);

# Export policy: nothing is pushed into the caller's namespace by default.
# inSpanish can be requested by name or through the ':test' tag, e.g.
#     use Text::AsReadWithoutIntonation ':test';
our %EXPORT_TAGS = ( test => [ qw(inSpanish) ] );
our @EXPORT_OK   = @{ $EXPORT_TAGS{test} };
our @EXPORT      = ();

# No preloaded subroutines: everything below __END__ is autoloaded and is
# processed by the autosplit program.
1;
__END__
use Lingua::ES::Numeros qw(parse_num); use Text::IdMor; use Text::Roman; use Symbol::Name; use String::Multibyte;
# inSpanish($text)
#
# Convert a sentence into the text that would be spoken when reading it
# without intonation: the sentence is split on whitespace and every token is
# read on its own by _wordInSpanish.
#
# Parameters:
#   $text - the sentence to read.
# Returns:
#   The readings of all tokens joined by single spaces, with no leading or
#   trailing whitespace; the empty string when $text is undefined or holds
#   no readable token.
sub inSpanish($) {
    my $text = shift;
    return "" unless defined $text;

    my @readings;

    # split ' ' (awk mode) discards leading whitespace, so a sentence that
    # starts with a blank no longer hands an empty token to _wordInSpanish.
    foreach my $word (split ' ', $text) {
        my $reading = _wordInSpanish($word);

        # Tokens that read as nothing (e.g. suppressed symbols) are skipped
        # so they do not leave a double space in the result.
        push @readings, $reading if defined $reading && length $reading;
    }

    return join ' ', @readings;
}
# _wordInSpanish($word)
#
# Read a single whitespace-free token in Spanish.  The token is classified
# with the Text::IdMor predicates and dispatched, in this order, to: acronym
# (spelled symbol by symbol), roman number, plain word (lower-cased), integer,
# Spanish real number ("3,05"), ordinal number ("1º", "2ª"), token carrying
# exclamation/question/quote marks, token containing a currency/math/'@'/
# parenthesis symbol, and finally a fallback that strips unsupported symbols
# from the ends or spells the token symbol by symbol.
#
# Parameters:
#   $word - the token to read.
# Returns:
#   The reading with leading and trailing whitespace removed; the empty
#   string when there is nothing to read.
#
# Number readers that die are deliberately ignored (best effort): the token
# then simply contributes nothing to the result.
sub _wordInSpanish($) {
    my $word = shift;
    return "" unless defined $word;

    # Drop one trailing punctuation mark (full stop, comma or colon).  The
    # full stop is kept when it belongs to an acronym.
    chop($word)
        if (   ($word =~ /\.$/ && !Text::IdMor::isAcronym($word))
            || ($word =~ /\,$/)
            || ($word =~ /:$/));

    # Nothing left to read; this also keeps the recursive calls below from
    # probing an empty token.
    return "" unless length $word;

    my $readWord = "";
    if (Text::IdMor::isAcronym($word)) {
        $word =~ s/\.//g;
        $readWord .= _readSymbolBySymbolInSpanish($word);
    } elsif (Text::IdMor::isRomanNumber($word)) {
        # Roman numbers up to ten are read as ordinals, larger ones as plain
        # numbers.  Guarded by eval like the other numeric branches (the
        # original had this guard only in a second, unreachable copy of this
        # branch).
        my $numReader = Lingua::ES::Numeros->new(FORMATO => "");
        my $intVersion = roman2int($word);
        eval {
            if ($intVersion <= 10) {
                $readWord .= " ".$numReader->ordinal($intVersion);
            } else {
                $readWord .= " ".$numReader->real($intVersion);
            }
        };
    } elsif (Text::IdMor::isWord($word)) {
        # Lower-case the word.  tr/// only handles the ASCII letters; the
        # accented capitals are replaced one by one with s///.
        # NOTE(review): the left-hand sides were mis-encoded in the source
        # under review and have been restored from their lower-case
        # counterparts - confirm against the file's real encoding.
        $word =~ tr/A-Z/a-z/;
        $word =~ s/Á/á/g;
        $word =~ s/É/é/g;
        $word =~ s/Í/í/g;
        $word =~ s/Ó/ó/g;
        $word =~ s/Ú/ú/g;
        $word =~ s/Ü/ü/g;
        $word =~ s/Ñ/ñ/g;
        $readWord = $word;
        # Disabled alternative kept from the original:
        #   my $utf8 = new String::Multibyte('UTF8');
        #   $readWord = $utf8->strtr($word,
        #                            ["A-Z", "Á-Ú", "Ü", "Ñ"],
        #                            ["a-z", "á-ú", "ü", "ñ"]);
    } elsif (Text::IdMor::isInteger($word)) {
        my $numReader = Lingua::ES::Numeros->new(FORMATO => "", UNMIL => 0);
        $word = _convertNumbersFromSpanishToEnglishForm($word);
        eval {
            $readWord .= " ".$numReader->cardinal($word);
        };
    } elsif (Text::IdMor::isSpanishRealNumber($word)) {
        # "<integer part> con <fraction>": leading zeros of the fraction are
        # read one by one, the remaining digits as a single number.
        my ($ent, $frac) = split /\,/, $word;
        $frac = "" unless defined $frac;
        $readWord .= " "._wordInSpanish($ent)." con";

        # The second group takes everything after the leading zeros, so
        # fractions with inner or trailing zeros ("50", "105") are split too.
        my ($fracZeros, $fracDigits) = $frac =~ /\A(0*)(.*)\z/s;
        $readWord .= " ".Symbol::Name::inSpanish('0') foreach (split //, $fracZeros);
        $readWord .= " "._wordInSpanish($fracDigits) if length $fracDigits;
    } elsif (Text::IdMor::isOrdinalNumber($word)) {
        # Peel the ordinal indicator off the end.  Two units are chopped
        # because the indicator may be stored as two bytes; when the second
        # one turns out to be a digit it belongs to the number and is put
        # back, leaving the single-unit indicator untouched.
        my $lastChar = chop($word);
        my $c2 = chop($word);
        if ($c2 =~ /[0-9]/) {
            $word .= $c2;
        } else {
            $lastChar = $c2.$lastChar;
        }

        # The masculine indicator is U+00BA.  Looking for 0xBA recognises it
        # both as a single unit and as the tail of the UTF-8 pair C2 BA.
        my $gender = $lastChar =~ /\xBA/ ? 'o' : 'a';
        my $numReader = Lingua::ES::Numeros->new(FORMATO => "", UNMIL => 0, SEXO => $gender);
        $word = _convertNumbersFromSpanishToEnglishForm($word);
        eval {
            $readWord .= " ".$numReader->ordinal($word);
        };
#    } elsif (Text::IdMor::isURL($word)) {
#        my @items = split( /\/+/, $word);
#        $readWord .= " "._readSymbolBySymbolInSpanish(shift @items); # read protocol
#        $readWord .= " ".Symbol::Name::inSpanish(':');
#        $readWord .= " ".Symbol::Name::inSpanish('/');
#        $readWord .= " ".Symbol::Name::inSpanish('/');
#        while (@items) {
#            my $item = shift @items;
#            my @byDot = split(/\./, $item);
#            while (@byDot) {
#                my $deeperItem = shift;
#                if ($deeperItem eq "www" ||
#                    $deeperItem eq "html" ||
#                    $deeperItem eq "htm" ||
#                    $deeperItem eq "sgml" ||
#                    $deeperItem eq "asp" ||
#                    $deeperItem eq "jsp") { # consider as an acronym
#                    $readWord .= " "._readSymbolBySymbolInSpanish($deeperItem);
#                } else {
#                    $readWord .= " "._wordInSpanish($deeperItem);
#                }
#                $readWord .= " ".Symbol::Name::inSpanish('.') if (@byDot);
#            }
#            $readWord .= " ".Symbol::Name::inSpanish('/') if (@items);
#        }
#    } elsif (Text::IdMor::isEMail($word)) {
#
    } elsif ($word =~ /¡|!|¿|\?|"/) {
        # Exclamation, question and quote marks carry intonation only: drop
        # them and read what is left.  An alternation is used instead of a
        # bracketed class so that a multi-byte mark is matched as a whole
        # sequence even when the pattern is compiled as bytes.
        $word =~ s/¡|!|¿|\?|"//g;
        $readWord .= _wordInSpanish($word);
    } elsif ($word =~ /^([^\$€%\@\+=\(\)]*)(\$|€|%|\@|\+|=|\(|\))(.*)$/) {
        # Split around the first currency / math / '@' / parenthesis symbol
        # and read "<before> <symbol name> <after>".
        my $beforeSymbol = $1;
        my $symbol = $2;
        my $afterSymbol = $3;

        # length, not truth: a literal "0" next to the symbol must be read.
        $readWord .= " "._wordInSpanish($beforeSymbol) if length $beforeSymbol;
        $readWord .= " ".Symbol::Name::inSpanish($symbol);
        $readWord .= " "._wordInSpanish($afterSymbol) if length $afterSymbol;
    } else {
        # Fallback: strip one unsupported symbol from each end and retry;
        # when both ends are supported, spell the token symbol by symbol.
        my $supportedSymbols = Symbol::Name::supportedSpanishSymbols();
        my $supportedSymbolsEnum = join('', @$supportedSymbols);

        # \Q...\E keeps symbols such as ']', '^', '-' or '\' from being
        # interpreted as character-class syntax.  With no supported symbols
        # at all, use a pattern that never matches.
        my $isSupported = length($supportedSymbolsEnum)
            ? qr/[\Q$supportedSymbolsEnum\E]/
            : qr/(?!)/;

        my @symbols = split //, $word;
        my $nOriginalSymbols = @symbols;
        shift @symbols unless $symbols[0] =~ $isSupported;
        pop @symbols if (@symbols && $symbols[-1] !~ $isSupported);
        if (@symbols == $nOriginalSymbols) {
            $readWord = _readSymbolBySymbolInSpanish($word);
        } else {
            $readWord = _wordInSpanish(join('', @symbols));
        }
    }

    $readWord =~ s/^\s+//;
    $readWord =~ s/\s+$//;
    return $readWord;
}
# _readSymbolBySymbolInSpanish($arg)
#
# Spell $arg out one symbol at a time: every symbol is replaced by the name
# Symbol::Name::inSpanish gives it, each name preceded by a space, and runs
# of spaces are then collapsed to a single one.  The result is not trimmed.
sub _readSymbolBySymbolInSpanish($) {
    my ($arg) = @_;

    my $spelled = join '', map { " ".Symbol::Name::inSpanish($_) } split //, $arg;
    $spelled =~ s/ +/ /g;

    return $spelled;
}

# _convertNumbersFromSpanishToEnglishForm($num)
#
# Swap the separators of a number written the Spanish way ("1.234,56") so it
# reads the English way ("1,234.56"): every '.' becomes ',' and the first ','
# becomes '.'.  The letter 'a' serves as a temporary placeholder for the dots
# while the comma is being replaced.
sub _convertNumbersFromSpanishToEnglishForm($) {
    my ($num) = @_;

    for ($num) {
        s/\./a/g;    # park the thousands dots
        s/,/./;      # decimal comma -> decimal point (first one only)
        s/a/,/g;     # parked dots -> thousands commas
    }

    return $num;
}