Lingua::PT::PLN - Perl extension for NLP of the Portuguese Language


Lingua-PT-PLN documentation Contained in the Lingua-PT-PLN distribution.

Index


Code Index:

NAME

Top

Lingua::PT::PLN - Perl extension for NLP of the Portuguese Language

SYNOPSIS

Top

  use Lingua::PT::PLN;

  # occurrence counter
  %o = oco("file");
  oco({num=>1,output=>"outfile"},"file");

  $p = accent($phrase);        ## mark word accent of all words

  $w = syllable($word);
  $w = wordaccent($word);

DESCRIPTION

Top

This is a module for Natural Language Processing of the Portuguese.

Because you are processing Portuguese, you must use a correct locale.

Occurrence counting: oco

Counts word occurrence from a string or a set of files. Returns an hash with the information or creates a sorted file with the results.

This function takes optionally as first argument an hash of options where you can specify:

num => 1

means the output should be sorted by ocurrence number;

alpha => 1

mean the output should be sorted lexicographically

output => "f"

means the output will be written to the file "f";

from => "string"

means that next argument (after the option hash) is a string which should be used as input for the function.

from => "file"

means that remaining arguments to the function are filenames which should be used as input for the function. This is the default option.

encoding => "utf8"

To force UTF8 encoding (default latin1)

ignorexml => 1

XML tags are striped.

ignorecase => 1

All words are lower-cased.

log => 1

to obtain logaritmic output. Output values are between 0..log(1000000) or (0..13.85).

  log => 20    -- to obtain values between 0 and 20

Examples:

  oco({num=>1,output=>"f"}, "f1","f2")
  # sort by occurrence
  # store output on file "f"
  # process files "f1" and "f2"

  oco({alpha=>1,output=>"f"}, "f1","f2")
  # sort lexicographically
  # store output on file "f"
  # process files "f1" and "f2"

  %oc = oco("f1","f2")
  # return a hash with the occurrences
  # use "f1" and "f2" as input files

  %oc = oco( {from=>"string"},"text in a string")
  # use a string as input
  # return a hash with the occurrences

syllable

  my $sylls = syllable( $word )

Returns the word with the syllables separated by "|"

accent

  my $accent = accent( $phrase )

Returns the phrase with the syllables separated by "|" and accents marked with the charater ":".

wordaccent

Retuns the word splited into syllables and with the accent character marked.

compacta

compara

AUTHOR

Top

Projecto Natura (http://natura.di.uminho.pt)

Alberto Simoes (albie@alfarrabio.di.uminho.pt)

José João Almeida (jj@di.uminho.pt)

Paulo Rocha (paulo.rocha@di.uminho.pt)

SEE ALSO

Top

Lingua::PT::PLNbase(3pm), perl(1), cqp(1),


Lingua-PT-PLN documentation Contained in the Lingua-PT-PLN distribution.

package Lingua::PT::PLN;

use strict;

use Lingua::PT::PLNbase;

require Exporter;
our @ISA = qw(Exporter AutoLoader);

our @EXPORT = 
  (@Lingua::PT::PLNbase::EXPORT,
   qw(syllable accent wordaccent oco));
our $VERSION = '0.17';

use POSIX qw(locale_h);
setlocale(&POSIX::LC_ALL, "pt_PT");
use locale;

our ($consoante, $vogal, $acento, %names);

my ($lmax,$maxlog,$magicF);

BEGIN {
  $consoante=qr{[bcçdfghjklmñnpqrstvwyxz]}i;
  $vogal=qr{[áéíóúâêôãõàèaeiouüöäë]}i;
  $acento=qr{[áéíóúâêôãõüöäë]}i;
  setlocale(&POSIX::LC_ALL, "pt_PT");

  use POSIX;
  POSIX::setlocale(LC_CTYPE,"pt_PT");

  $lmax = 1000000;
  $maxlog = 13.815;
  $magicF = $maxlog/log($lmax);
}



sub oco {
  ### { from => (file|string),
  ###    num => 1,
  ###    log => 1,    # logaritmic output 
  ###  alpha => 1,
  ### output => file,
  ### encoding => utf8,
  ### ignorexml => 1,
  ### ignorecase => 1}

  my %opt = (from => 'file', ignorecase => 0, ignorexml => 0, encoding => "latin1");
  %opt = (%opt , %{shift(@_)}) if ref($_[0]) eq "HASH";

  local $\ = "\n";                    # set output record separator

  my $P="(?:[,;:?!]|[.]+|[-]+)";      # pontuacao a contar
  my $A="[A-ZñÑa-záàãâçéèêíóòõôöúùûüÁÀÃÂÇÉÈÊÍÓÒÕÔÚÙÛÜÖ_]";
  my $I="[ \"(){}+*=<>\250\256\257\277\253\273]"; # car. a  ignorar
  my %oco=();
  my $tot=0;

  if ($opt{from} eq 'string') {
    my (@str) = (@_);
    for (@str) {
      $_ = lc if $opt{ignorecase};
      s/<[^>]+>//g if $opt{ignorexml};
      for (/($A+(?:['-]$A+)*|$P)/g) { $oco{$_}++; $tot++ }
    }
  } else {
    my (@file) = (@_);
    for(@file) {
      open F,"< $_" or die "cant open $_: $!";
      binmode(F, ":utf8") if $opt{encoding} =~ /utf8/i ;
      while (<F>) {
	$_ = lc if $opt{ignorecase};
	s/<[^>]+>//g if $opt{ignorexml};
	for (/($A+(?:['-]$A+)*|$P)/g) { $oco{$_}++;  $tot++}
      }
      close F;
    }
  }

  if ($opt{log}){
    print "total = $tot\n";
    _setmax($tot);
    _setmax($opt{log}) if($opt{log} > 1);
    for (keys %oco){
      $oco{$_}=_logit($oco{$_});
    }
  }

  if ($opt{num}) { # imprime por ordem de quantidade de ocorrencias

    # TODO: não é portável
    if (defined $opt{output}) {
      open SORT,"| sort -nr > $opt{output}"
    } else {
      open SORT,"| sort -nr"
    }

    for my $i (keys %oco) {
      print SORT "$oco{$i} $i"
    }
    close SORT;

  } elsif ($opt{alpha}) { # imprime ordenado alfabeticamente

    if (defined $opt{output}) {
      open SORT ,"> $opt{output}";
      for my $i (sort keys %oco ) {
	print SORT  "$i $oco{$i}";
      }
    } else {
      for my $i (sort keys %oco ) {
	print  "$i $oco{$i}";
      }
    }
  } else {
    return (%oco)
  }
}

### syllabs, and accents

sub accent {
  local $/ = "";           # input record separator=1 or more empty lines
  my $p=shift;
  $p =~ s/(\w+)/ wordaccent($1) /ge;
  $p
}

sub wordaccent {
  my $p = syllable($_[0]);
  for ($p) {
    s{(\w*$acento)}{"$1}i             or  # word with an accent character
#      s{(\w*)([ua])(ir)$}{$1$2|"$3}i  or  # word ending with air uir
      s{(\w*([zlr]|[iu]s?))$}{"$1}i   or  # word ending with z l r i u is us
      s{(\w+\|\w+)$}{"$1}             or  # accent in 2 syllable frm the end
      s{(\w)}{"$1};                       # accent in the only syllable

    s{"(([qg]u|$consoante)*($vogal|[yw]))}{$1:}i ; # accent in the 1.st vowel
    s{:($acento)}{$1:}i  ;                         # mv accent after accents
    s{"}{}g;

  }
  $p
}

my %syl = (
   20 => " -.!?:;",
   10 => "bçdfgjkpqtv",
   8 => "sc",
   7 => "m",
   6 => "lzx",
   5 => "nr",
   4 => "h",
   3 => "wy",
   2 => "eaoáéíóúôâêûàãõäëïöü",
   1 => "iu",
   breakpair =>
      #"ie|ia|io|ee|oo|oa|sl|sm|sn|sc|sr|rn|bc|lr|lz|bd|bj|bg|bq|bt|bv|pt|pc|dj|pç|ln|nr|mn|tp|bf|bp",
      "sl|sm|sn|sc|sr|rn|bc|lr|lz|bd|bj|bg|bq|bt|bv|pt|pc|dj|pç|ln|nr|mn|tp|bf|bp",
  );

my %spri = ();

for my $pri (grep(/\d/, keys %syl)){
  for(split(//,$syl{$pri})) { $spri{$_} = $pri}}

(my $sylseppair= $syl{breakpair}) =~ s/(\w)(\w)/(\?<=($1))(\?=($2))/g;

sub syllable{
  my $p=shift;

  for($p){
    s/$sylseppair/|/g;
    s{(\w)(?=(\w)(\w))}
            {if($spri{lc($1)}<$spri{lc($2)} && $spri{lc($2)}>=$spri{lc($3)}){"$1|"}
              else{$1}
            }ge;

    s{([a])(i[ru])}{$1|$2}i;              #ditongos and friends
    s{([ioeê])([aoe])}{$1|$2}ig;
    s{u(ai|ou)}{u|$1}i;
    s{([^qg]u)(ei|iu|ir|$acento)}{$1|$2}i;
    s{([aeio])($acento)}{$1|$2}i;
    s{([íúô])($vogal)}{$1|$2}i;
    
    s{([qg]u)\|([eií])}{$1$2}i;
    s{^($consoante)\|}{$1}i;
    s{êm$}{ê|_nhem}i;
  }
  $p
}

sub compara {
  # ordena pela lista de palavras invertida
  join(" ", reverse(split(" ",$a))) cmp join(" ", reverse(split(" ",$b)));
}

sub compacta {
  my $s;
  my $p = shift;
  my $r = $p;
  my $q = $names{$p};
  while ($s = shift) {
    if ($s =~ (/^(.+) $p/))
      {
	$r = "($1) $r" ;
	$q += $names{$s};
      }
    else
      {
	print "$r - $q";
	$r = $s;
	$q = $names{$s};
      }
    $p=$s;
  }
  print "$r - $q";
}

my %savit_p = ();
my $savit_n = 0;

sub _savit {
  my $a = shift;
  $savit_p{++$savit_n} = $a ;
  " __MARCA__$savit_n "
}

sub _loadit {
  my $a = shift;
  $a =~ s/ ?__MARCA__(\d+) ?/$savit_p{$1}/g;
  $savit_n = 0;
  $a;
}

1;

#sub setlogmax{
# $maxlog = shift;
# $magicF=$maxlog/log($lmax);
###  print "Debud .... Maxlog=$maxlog; magic=$magicF\n";
#}

sub _setmax{
 $lmax = shift;
 $magicF=$maxlog/log($lmax);
##  print "Debud .... Max=$lmax; magic=$magicF\n";
}

sub _logit{
  my $n=shift;
  return 0 unless $n;
##  print STDERR "...$n,", log($n*$magicF) ,"\n" ;
  log($n)*$magicF
}

1;

__END__

$lm='[a-záéíóúâêôàãõçüöñ]';                      # letra minuscula
$lM='[A-ZÁÉÍÓÚÂÊÔÀÃÕÇÜÖÑ]';                      # letra Maiuscula
$l1='[A-ZÁÉÍÓÚÂÊÔÀÃÕÇÜÖÑa-záéíóúâêôàãõçüöñ0-9]'; # letra e numero
$c1='[^»a-záéíóúâêà,;?!)]';