Lingua::PT::ProperNames - Simple module to extract proper names from Portuguese Text


Lingua-PT-ProperNames documentation Contained in the Lingua-PT-ProperNames distribution.

Index


Code Index:

NAME

Top

Lingua::PT::ProperNames - Simple module to extract proper names from Portuguese Text

Version

Top

Version 0.09

Synopsis

Top

This module contains simple Perl-based functions to detect and extract proper names from Portuguese text.

  use Lingua::PT::ProperNames;




  printPN(@options);
  printPNstring({ %options... } ,$textstrint);
  printPNstring([ @options... ] ,$textstrint);

  forPN( sub{my ($pn, $contex)=@_;... } ) ;
  forPN( {t=>"double"},
         sub{my ($pn, $contex)=@_;... }, sub{...} ) ;
  $outstr = forPN($instr, sub{my ($pn, $contex)=@_;... }, ... ) ;

  forPNstring(sub{my ($pn, $contex)=@_;... },
         $textstring, regsep) ;




  my $pndict = Lingua::PT::ProperNames->new;

ProperNames dictionary

Top

new

Creates a new ProperNames dictionary

is_name

This method checks if a name exists in the Names dictionary.

is_surname

Thie method checks if a name exists in the Names dictionary as a Surname.

Export the following functions

Top

forPN

Substitutes all propername by <funref-($propername,$context)>> in STDIN and sends output to STDOUT

Usage:

   forPN({options...}, sub{ propername processor...})

Optionally you can define input or output files:

   forPN({in=> "inputfile", out => "outputfile" }, sub{...})

Optionally you can use option type : <{t = "double"}>> to have special treatment for process names after pontuation (".", etc). With this options you must provide 2 functions: one for normal propernames and one for names after pontuation.

   forPN({t=>"double"}, sub{...}, sub{...})

You can also define record paragraph separator

   forPN({sep=>"\n", t=>"normal"}, sub{...}) ## each line is a par.
   forPN({sep=>""}, sub{...})                ## par. empty lines

forPNstring

   forPNstring( $funref, "textstring" [, regSeparator] )>

Substitutes all propername by funref(propername) in the text string.

printPNstring

   printPNstring("oco")

getPN

printPN

  printPN("oco")

  printPN  - extrai os nomes próprios dum texto.
   -comp    junta certos nomes: Fermat + Pierre de Fermat = (Pierre de) Fermat
   -prof
   -e       "Sebastiao e Silva" "e" como pertencente a PN
   -em      "em Famalicão" como pertencente a PN




Author

Top

José João Almeida, <jj@di.uminho.pt>

Alberto Simões, <ambs@di.uminho.pt>

Bugs

Top

NOTE: We know documentation for exported methods is inexistent. We are working on that for very soon.

Please report any bugs or feature requests to bug-lingua-pt-propernames@rt.cpan.org, or through the web interface at http://rt.cpan.org. I will be notified, and then you'll automatically be notified of progress on your bug as I make changes.

COPYRIGHT & LICENSE

Top


Lingua-PT-ProperNames documentation Contained in the Lingua-PT-ProperNames distribution.
package Lingua::PT::ProperNames;

#require Exporter;
use locale;
use IO::String;
use warnings;
use strict;

our $VERSION = '0.09';
use base 'Exporter';
our @EXPORT = qw/getPN printPN printPNstring forPN forPNstring/;

our ($em, $np1, $np, $prof, $sep1, $sep2, %vazia, @stopw);

BEGIN {

  $np1 = qr{(?:(?: [A-ZÈÉÚÓÁÂ][.])+
                              |   [sS]r[.]
                              |   [dD]r[.]
                              |   St[oa]?[.]
                              |   [A-ZÈÉÚÓÁÂ]\w+(?:[\'\-]\w+)*
                  )}x;

  #if ($e) {
  #$np= qr{$np1(?:\s+(?:d[eao]s?\s+|e\s+)?$np1)*};
  #} else {
  $np= qr{$np1
                    (?: \s+ (?:d[eaou]s?\s+
                                    |  d'
                                    |  de \s+ l[ae]s? \s+
                                    |  v[oa]n\s+
                                    )?
                            $np1)*
                  }x;
  #}

  @stopw = qw{
              no com se em segundo a o os as na nos nas do das dos da tanto
              para de desde mas quando esta sem nem só apenas mesmo até uma uns um
              pela por pelo pelas pelos depois ao sobre como umas já enquanto aos
              também amanhã ontem embora essa nesse olhe hoje não eu ele eles
              primeiro simplesmente era foi é será são seja nosso nossa nossos nossas
              chama-se chamam-se subtitui resta diz salvo disse diz vamos entra entram
              aqui começou lá seu vinham passou quanto sou vi onde este então temos
              num aquele tivemos

              en la pour le
             };

  $prof = join("|", qw{
                       astrólogo astrónomo advogado actor
                       baterista
                       cantor compositor
                       dramaturgo
                       engenheiro escritor
                       filósofo flautista físico
                       investigador
                       jogador
                       matemático médico ministro músico
                       pianista poeta professor
                       químico
                       teólogo
                      });
  $sep1 = join("|", qw{chamado "conhecido como"});

  $sep2 = join("|", qw{brilhante conhecido reputado popular});
  @vazia{@stopw} = (@stopw); # para ser mais facil ver se uma pal é stopword
  $em = '\b(?:[Ee]m|[nN][oa]s?)';
}

sub new {
  my $class = shift;
  # my $filename = shift;

  my $self = bless {}, $class;
  $self->_load_dictionary;
  return $self;
}

sub _load_dictionary {
  my $self = shift;
  my $file = shift || undef;

  if ($file) {
    open C, $file or die;
    while(<C>) {
      chomp;
      next if m!^\s*$!;
      $self->{cdic}{$_} = $_;
    }
    close C;
  } else {
    my $f = _find_file();
    open D, $f or die "Cannot open file $f: $!\n";
    while(<D>) {
      chomp;
      next if m!^\s*$!;
      my ($nome,$prob,$type) = split /\s+/;
      $self->{dic}{$nome} = {type=>$type,prob=>$prob};
    }
    close D;
  }
}

sub _exists {
  my $self = shift;
  my $word = shift;
  return exists($self->{dic}{$word}) or
    exists($self->{cdic}{$word}) or
      exists($self->{sdic}{$word})
}

sub is_name {
  return _exists(@_)
}

sub is_surname {
  return _exists(@_) && _type(@_) eq "apelido";
}

sub _type {
  my $self = shift;
  my $word = shift;
  if (exists($self->{dic}{$word})) {
    return $self->{dic}{$word}{type}
  } elsif (exists($self->{cdic}{$word})) {
    return $self->{cdic}{$word}{type}
  } elsif (exists($self->{sdic}{$word})) {
    return $self->{sdic}{$word}{type}
  } else {
    return undef;
  }
}



sub forPN{
  ## opt:  in=> inputfile(sdtin), out => file(stdout)
  my %opt = (sep => "", t => "normal" );

  %opt = (%opt , %{shift(@_)}) if   ref($_[0]) eq "HASH";
  my $instring = "";
  $instring = shift(@_)        if ! ref($_[0]);

  my ($f,$f1) = @_;
  my $m="\x01";
  my $old;
  my ($F1, $F2) ;

  die "invalid parameter to 'forPN'" unless ref($f) eq "CODE";

  if ($opt{t} eq "double") {
    die "invalid parameter ". ref($f1) unless ref($f1) eq "CODE";
  }

  local $/ = $opt{sep};  # input record separator=one or more empty lines

  if (defined $opt{in}) {
    open $F1, "$opt{in}" or die "cant open $opt{in}\n";
  } elsif (defined $instring) {          ## input is a string (1st parameter)
    $F1 = IO::String->new($instring);
  } else {
    $F1=*STDIN;
  }

  if (defined $opt{out}) {
    open F, ">$opt{out}" or die "cant create $opt{out}\n";
    $old = select(F);
  } elsif (defined $instring) {          ## input is a string (1st parameter)
    $F2 = IO::String->new();
    $old = select($F2);
  }

  while (<$F1>) {
    my $ctx = $_;
    if ($opt{t} eq "double") {

      s{($np)}{$m($1$m)}g;
      s{(^\s* 
                | [-]\s+
                | [.!?]\s*
                )  $m\( ($np) $m\)
              }{
                      my ($aux1,$aux2,$aux3)= ($1,$2, $f1->($2,$ctx));
	   	   if   (defined($aux3)){$aux1 . $aux3}
	   	   else                 {$aux1 . _tryright($aux2)} }xge;
      
      s{$m\(($np)$m\)}{  $f->($1,$ctx) }ge;

    } else {
      s{( \w+\s+
                | [\«\»,:()'`"]\s*
                )  ($np)
              }{$1 . $f->($2,$ctx) }xge;
    }
    print;
  }
  close $F1 if $opt{in};
  if (defined $opt{out}) {
    select $old;
    close F;
  } elsif (defined $instring) {          ## input is a string (1st parameter)
    return ${$F2->string_ref()};
  }
}

sub forPNstring {
  my $f = shift;
  die "invalid parameter to 'forPNstring': function expected" unless ref($f) eq "CODE";
  my $text = shift;
  my $sep = shift || "\n";
  my $r = '';
  for (split(/$sep/,$text)) {
    my $ctx = $_;
    s/(\w+\s+|[\«\»,()'`i"]\s*)($np)/$1 . $f->($2,$ctx)/ge       ;
    $r .= "$_$sep";
  }
  return $r;
}

sub printPNstring{
  my $text = shift;
  my %opt = ();

  if   (ref($text) eq "HASH") { %opt = %$text        ; $text = shift; }
  elsif(ref($text) eq "ARRAY"){ @opt{@$text} = @$text; $text = shift; }

  my (%profissao, %names, %namesduv, %gnames);

  for ($text) {
    chop;
    s/\n/ /g;
    for (m/[.?!:;"]\s+($np1\s+$np)/gxs)  { $namesduv{$_}++ }
    for (m![)>(]\s*($np1\s+$np)!gxs)     { $namesduv{$_}++ }
    for (m/(?:[\w\«\»,]\s+)($np)/gxs)    { $names{$_}++ }
    if ($opt{em}) {
      for (/$em\s+($np)/g) { $gnames{$_}++ }
    }
    if ($opt{prof}) {
      while(/\b($prof)\s+(?:(?:$sep1)\s+)?($np)/g)
	{ $profissao{$2} = $1 }
      while(/(?:[\w\«\»,]\s+|[(])($np),\s*(?:(?:$sep2)\s+)?($prof)/g)
	{ $profissao{$1} = $2 }
    }
  }

  # tratamento dos nomes "duvidosos" = Nome prop no inicio duma frase
  #

  for (keys %namesduv) {
    if (/^(\w+)/ && $vazia{lc($1)}) { #exemplo "Como Jose Manuel"
      s/^\w+\s*//;                    # retira-se a 1.a palavra
      $names{$_}++
    } else { 
      $names{$_}++
    }
  }

  for (keys %names) {
    if (/^(\w+)/ && $vazia{lc($1)}) {  #exemplo "Como Jose Manuel"
      my $ant = $_;
      s/^\w+\s*//;                     # retira-se a 1.a palavra
      $names{$_} += $names{$ant};
      delete $names{$ant}
    }
  }

  if ($opt{oco}) {
    for (sort {$names{$b} <=> $names{$a}} keys %names ) {
      printf("%60s - %d\n", $_ ,$names{$_});
    }
  } else {
    if ($opt{comp}) {
      my @l = sort _compara keys %names;
      _compacta(\%names, @l)
    } else {
      for (sort _compara keys %names ) {
	printf("%60s - %d\n", $_ ,$names{$_});
      }
    }
    if ($opt{prof}) {
      print "\nProfissões\n";
      for (keys %profissao) {
	print "$_ -- $profissao{$_}"
      }
    }
    if ($opt{em}) {
      print "\nGeograficos\n";
      for (sort _compara keys %gnames ) {
	printf("%60s - %d\n", $_ ,$gnames{$_})
      }
    }
  }
}


sub getPN {
  local $/ = "";           # input record separator=1 or more empty lines

  my %opt;
  @opt{@_} = @_;
  my (%profissao, %names, %namesduv, %gnames);

  while (<>) {
    chop;
    s/\n/ /g;
    for (/[.?!:;"]\s+($np1\s+$np)/g)     { $namesduv{$_}++;}
    for (/[)>(]\s*($np1\s+$np)/g)        { $namesduv{$_}++;}
    for (/(?:[\w\«\»,]\s+)($np)/g)       { $names{$_}++;}
    if ($opt{em}) {
      for (/$em\s+($np)/g) { $gnames{$_}++;}}
    if ($opt{prof}) {
       while(/\b($prof)\s+(?:(?:$sep1)\s+)?($np)/g)
	 { $profissao{$2} = $1 }
       while(/(?:[\w\«\»,]\s+|[(])($np),\s*(?:(?:$sep2)\s+)?($prof)/g)
	 { $profissao{$1} = $2 }
     }
  }

  # tratamento dos nomes "duvidosos" = Nome prop no inicio duma frase
  #

  for (keys %namesduv) {
    if(/^(\w+)/ && $vazia{lc($1)}) {  # exemplo "Como Jose Manuel"
      s/^\w+\s*//;                    # retira-se a 1.a palavra
      $names{$_}++
    } else {
      $names{$_}++
    }
  }
  return (%names)
}


sub printPN{
  local $/ = "";           # input record separator=1 or more empty lines

  my %opt;
  @opt{@_} = @_;
  my (%profissao, %names, %namesduv, %gnames);

  while (<>) {
    chop;
    s/\n/ /g;
    for (/[.?!:;"]\s+($np1\s+$np)/g)     { $namesduv{$_}++ }
    for (/[)>(]\s*($np1\s+$np)/g)        { $namesduv{$_}++ }
    for (/(?:[\w\«\»,]\s+)($np)/g)       { $names{$_}++ }
    if ($opt{em}) {
      for (/$em\s+($np)/g) { $gnames{$_}++ }
    }
    if ($opt{prof}) {
       while(/\b($prof)\s+(?:(?:$sep1)\s+)?($np)/g)
	 { $profissao{$2} = $1 }
       while(/(?:[\w\«\»,]\s+|[(])($np),\s*(?:(?:$sep2)\s+)?($prof)/g)
	 { $profissao{$1} = $2 }
     }
  }

  # tratamento dos nomes "duvidosos" = Nome prop no inicio duma frase
  #

  for (keys %namesduv){
    if(/^(\w+)/ && $vazia{lc($1)} )   #exemplo "Como Jose Manuel"
      {s/^\w+\s*//;                  # retira-se a 1.a palavra
       $names{$_}++;}
    else
      { $names{$_}++;}
  }

  ##### Não sei bem se isto serve...

  for (keys %names){
    if(/^(\w+)/ && $vazia{lc($1)} )   #exemplo "Como Jose Manuel"
      { my $ant = $_;
        s/^\w+\s*//;                  # retira-se a 1.a palavra
        $names{$_}+=$names{$ant};
        delete $names{$ant};}
  }

  if($opt{oco}){
    for (sort {$names{$b} <=> $names{$a}} keys %names )
      {printf("%6d - %s\n",$names{$_}, $_ );}
  }
  else
    {
      if($opt{comp}){my @l = sort _compara keys %names;
		     _compacta(\%names, @l); }
      else{for (sort _compara keys %names )
	     {printf("%60s - %d\n", $_ ,$names{$_});} }

      if($opt{prof}){print "\nProfissões\n";
		     for (keys %profissao){print "$_ -- $profissao{$_}";} }

      if($opt{em}){print "\nGeograficos\n";
		   for (sort _compara keys %gnames )
		     {printf("%60s - %d\n", $_ ,$gnames{$_});} }
  }
}



##
# Auxiliary stuff

sub _tryright{
  my $a = shift;
  return $a unless $a =~ /(\w+)(.*)$/;
  my ($w,$r) = ($1,$2);
  my $m = "\x01";
  $r =~ s{($np)}{$m($1$m)}g;
  return "$w$r";
}


sub _compacta{
  my $s;
  my $names = shift;

  my $p = shift;
  my $r = $p;
  my $q = $names->{$p};
  while ($s = shift)
    { if ($s =~ (/^(.+) $p/)) { $r = "($1) $r" ;
				$q += $names->{$s};
			      }
      else {print "$r - $q"; $r=$s; $q = $names->{$s}; }
      $p=$s;
    }
  print "$r - $q";
}

sub _compara {
  # ordena pela lista de palavras invertida
  join(" ", reverse(split(" ",$a))) cmp join(" ", reverse(split(" ",$b)));
}

sub _find_file {
    my @files = grep { -e $_ } map { "$_/Lingua/PT/ProperNames/names.dat" } @INC;
    return $files[0];
}

1; # End of Lingua::PT::ProperNames