| Lingua-PT-PLN documentation | Contained in the Lingua-PT-PLN distribution. |
Lingua::PT::PLN - Perl extension for NLP of the Portuguese Language
use Lingua::PT::PLN;
# occurrence counter
%o = oco("file");
oco({num=>1,output=>"outfile"},"file");
$p = accent($phrase); ## mark word accent of all words
$w = syllable($word);
$w = wordaccent($word);
This is a module for Natural Language Processing of the Portuguese.
Because you are processing Portuguese, you must use a correct locale.
ocoCounts word occurrence from a string or a set of files. Returns an hash with the information or creates a sorted file with the results.
This function takes optionally as first argument an hash of options where you can specify:
means the output should be sorted by ocurrence number;
mean the output should be sorted lexicographically
means the output will be written to the file "f";
means that next argument (after the option hash) is a string which should be used as input for the function.
means that remaining arguments to the function are filenames which should be used as input for the function. This is the default option.
To force UTF8 encoding (default latin1)
XML tags are striped.
All words are lower-cased.
to obtain logaritmic output. Output values are between 0..log(1000000) or (0..13.85).
log => 20 -- to obtain values between 0 and 20
Examples:
oco({num=>1,output=>"f"}, "f1","f2")
# sort by occurrence
# store output on file "f"
# process files "f1" and "f2"
oco({alpha=>1,output=>"f"}, "f1","f2")
# sort lexicographically
# store output on file "f"
# process files "f1" and "f2"
%oc = oco("f1","f2")
# return a hash with the occurrences
# use "f1" and "f2" as input files
%oc = oco( {from=>"string"},"text in a string")
# use a string as input
# return a hash with the occurrences
syllablemy $sylls = syllable( $word )
Returns the word with the syllables separated by "|"
my $accent = accent( $phrase )
Returns the phrase with the syllables separated by "|" and accents marked with the charater ":".
Retuns the word splited into syllables and with the accent character marked.
Projecto Natura (http://natura.di.uminho.pt)
Alberto Simoes (albie@alfarrabio.di.uminho.pt)
José João Almeida (jj@di.uminho.pt)
Paulo Rocha (paulo.rocha@di.uminho.pt)
Lingua::PT::PLNbase(3pm), perl(1), cqp(1),
| Lingua-PT-PLN documentation | Contained in the Lingua-PT-PLN distribution. |
package Lingua::PT::PLN; use strict; use Lingua::PT::PLNbase; require Exporter; our @ISA = qw(Exporter AutoLoader); our @EXPORT = (@Lingua::PT::PLNbase::EXPORT, qw(syllable accent wordaccent oco)); our $VERSION = '0.17'; use POSIX qw(locale_h); setlocale(&POSIX::LC_ALL, "pt_PT"); use locale; our ($consoante, $vogal, $acento, %names); my ($lmax,$maxlog,$magicF); BEGIN { $consoante=qr{[bcçdfghjklmñnpqrstvwyxz]}i; $vogal=qr{[áéíóúâêôãõàèaeiouüöäë]}i; $acento=qr{[áéíóúâêôãõüöäë]}i; setlocale(&POSIX::LC_ALL, "pt_PT"); use POSIX; POSIX::setlocale(LC_CTYPE,"pt_PT"); $lmax = 1000000; $maxlog = 13.815; $magicF = $maxlog/log($lmax); } sub oco { ### { from => (file|string), ### num => 1, ### log => 1, # logaritmic output ### alpha => 1, ### output => file, ### encoding => utf8, ### ignorexml => 1, ### ignorecase => 1} my %opt = (from => 'file', ignorecase => 0, ignorexml => 0, encoding => "latin1"); %opt = (%opt , %{shift(@_)}) if ref($_[0]) eq "HASH"; local $\ = "\n"; # set output record separator my $P="(?:[,;:?!]|[.]+|[-]+)"; # pontuacao a contar my $A="[A-ZñÑa-záàãâçéèêíóòõôöúùûüÁÀÃÂÇÉÈÊÍÓÒÕÔÚÙÛÜÖ_]"; my $I="[ \"(){}+*=<>\250\256\257\277\253\273]"; # car. a ignorar my %oco=(); my $tot=0; if ($opt{from} eq 'string') { my (@str) = (@_); for (@str) { $_ = lc if $opt{ignorecase}; s/<[^>]+>//g if $opt{ignorexml}; for (/($A+(?:['-]$A+)*|$P)/g) { $oco{$_}++; $tot++ } } } else { my (@file) = (@_); for(@file) { open F,"< $_" or die "cant open $_: $!"; binmode(F, ":utf8") if $opt{encoding} =~ /utf8/i ; while (<F>) { $_ = lc if $opt{ignorecase}; s/<[^>]+>//g if $opt{ignorexml}; for (/($A+(?:['-]$A+)*|$P)/g) { $oco{$_}++; $tot++} } close F; } } if ($opt{log}){ print "total = $tot\n"; _setmax($tot); _setmax($opt{log}) if($opt{log} > 1); for (keys %oco){ $oco{$_}=_logit($oco{$_}); } } if ($opt{num}) { # imprime por ordem de quantidade de ocorrencias # TODO: não é portável if (defined $opt{output}) { open SORT,"| sort -nr > $opt{output}" } else { open SORT,"| sort -nr" } for my $i (keys %oco) { print SORT "$oco{$i} $i" } close SORT; } elsif ($opt{alpha}) { # imprime ordenado alfabeticamente if (defined $opt{output}) { open SORT ,"> $opt{output}"; for my $i (sort keys %oco ) { print SORT "$i $oco{$i}"; } } else { for my $i (sort keys %oco ) { print "$i $oco{$i}"; } } } else { return (%oco) } } ### syllabs, and accents sub accent { local $/ = ""; # input record separator=1 or more empty lines my $p=shift; $p =~ s/(\w+)/ wordaccent($1) /ge; $p } sub wordaccent { my $p = syllable($_[0]); for ($p) { s{(\w*$acento)}{"$1}i or # word with an accent character # s{(\w*)([ua])(ir)$}{$1$2|"$3}i or # word ending with air uir s{(\w*([zlr]|[iu]s?))$}{"$1}i or # word ending with z l r i u is us s{(\w+\|\w+)$}{"$1} or # accent in 2 syllable frm the end s{(\w)}{"$1}; # accent in the only syllable s{"(([qg]u|$consoante)*($vogal|[yw]))}{$1:}i ; # accent in the 1.st vowel s{:($acento)}{$1:}i ; # mv accent after accents s{"}{}g; } $p } my %syl = ( 20 => " -.!?:;", 10 => "bçdfgjkpqtv", 8 => "sc", 7 => "m", 6 => "lzx", 5 => "nr", 4 => "h", 3 => "wy", 2 => "eaoáéíóúôâêûàãõäëïöü", 1 => "iu", breakpair => #"ie|ia|io|ee|oo|oa|sl|sm|sn|sc|sr|rn|bc|lr|lz|bd|bj|bg|bq|bt|bv|pt|pc|dj|pç|ln|nr|mn|tp|bf|bp", "sl|sm|sn|sc|sr|rn|bc|lr|lz|bd|bj|bg|bq|bt|bv|pt|pc|dj|pç|ln|nr|mn|tp|bf|bp", ); my %spri = (); for my $pri (grep(/\d/, keys %syl)){ for(split(//,$syl{$pri})) { $spri{$_} = $pri}} (my $sylseppair= $syl{breakpair}) =~ s/(\w)(\w)/(\?<=($1))(\?=($2))/g; sub syllable{ my $p=shift; for($p){ s/$sylseppair/|/g; s{(\w)(?=(\w)(\w))} {if($spri{lc($1)}<$spri{lc($2)} && $spri{lc($2)}>=$spri{lc($3)}){"$1|"} else{$1} }ge; s{([a])(i[ru])}{$1|$2}i; #ditongos and friends s{([ioeê])([aoe])}{$1|$2}ig; s{u(ai|ou)}{u|$1}i; s{([^qg]u)(ei|iu|ir|$acento)}{$1|$2}i; s{([aeio])($acento)}{$1|$2}i; s{([íúô])($vogal)}{$1|$2}i; s{([qg]u)\|([eií])}{$1$2}i; s{^($consoante)\|}{$1}i; s{êm$}{ê|_nhem}i; } $p } sub compara { # ordena pela lista de palavras invertida join(" ", reverse(split(" ",$a))) cmp join(" ", reverse(split(" ",$b))); } sub compacta { my $s; my $p = shift; my $r = $p; my $q = $names{$p}; while ($s = shift) { if ($s =~ (/^(.+) $p/)) { $r = "($1) $r" ; $q += $names{$s}; } else { print "$r - $q"; $r = $s; $q = $names{$s}; } $p=$s; } print "$r - $q"; } my %savit_p = (); my $savit_n = 0; sub _savit { my $a = shift; $savit_p{++$savit_n} = $a ; " __MARCA__$savit_n " } sub _loadit { my $a = shift; $a =~ s/ ?__MARCA__(\d+) ?/$savit_p{$1}/g; $savit_n = 0; $a; } 1; #sub setlogmax{ # $maxlog = shift; # $magicF=$maxlog/log($lmax); ### print "Debud .... Maxlog=$maxlog; magic=$magicF\n"; #} sub _setmax{ $lmax = shift; $magicF=$maxlog/log($lmax); ## print "Debud .... Max=$lmax; magic=$magicF\n"; } sub _logit{ my $n=shift; return 0 unless $n; ## print STDERR "...$n,", log($n*$magicF) ,"\n" ; log($n)*$magicF } 1; __END__ $lm='[a-záéíóúâêôàãõçüöñ]'; # letra minuscula $lM='[A-ZÁÉÍÓÚÂÊÔÀÃÕÇÜÖÑ]'; # letra Maiuscula $l1='[A-ZÁÉÍÓÚÂÊÔÀÃÕÇÜÖÑa-záéíóúâêôàãõçüöñ0-9]'; # letra e numero $c1='[^»a-záéíóúâêà,;?!)]';