| Lingua-PT-Conjugate documentation | Contained in the Lingua-PT-Conjugate distribution. |
$verb_forms = unconj( [-a] , $string )
@res = string_entries( ['l'], \%verb_forms )@res = list_entries( ['l'], \%verb_forms )Lingua::PT::UnConjugate - Recognition of the conjugated forms of portuguese verbs.
This module provides functions for the recognition of the conjugated forms of portuguese verbs.
Composed tenses are not recognized. The verb list contains many non-verbs that I have not removed yet.
$verb_forms = unconj( [-a] , $string )Attempts to recognize a conjugated form of a Portuguese verb, and returns the result as a reference to hash : if the element
C<$verb_forms-E<gt>{$infinitive}-E<gt>{$tense}-E<gt>[$person]>
is true, then the conjugation of the verb "$infinitive" at the tense "$tense" and the person "$person" should yield "$string".
The first argument may an option :
@res = string_entries( ['l'], \%verb_forms )Convert a hash of recognized forms into a list of strings
"$verb, $tense, $person".
If the first argument is a 'l', then long forms of verb names will be used.
@res = list_entries( ['l'], \%verb_forms )Convert a hash of recognized forms into a list of triplets
[ $verb, $tense, $person ].
If the first argument is a 'l', then long forms of verb names will be used.
Thanks to Soraia Almeida (salmeida@logos.it) from the Logos project (http://www.logos.it) and Ulisses Pinto and José João Almeida from Projecto Natura (http://shiva.di.uminho.pt/~jj/pln) who made Ispell available.
A big part of the list of verb infinitives comes from files used in Ispell (http://shiva.di.uminho.pt/~jj/pln) and in Logos (http://www.verba.org, http://www.logos.it). these projects. Some verbs were removed and others added by hand.
| Lingua-PT-Conjugate documentation | Contained in the Lingua-PT-Conjugate distribution. |
#!/usr/bin/perl -w # # Perl package exporting a function "unconj" that un-conjugates # Portuguese verbs. # # Author : Etienne Grossmann (etienne@isr.ist.utl.pt) # # Date : September 1999 onwards. # # package Lingua::PT::UnConjugate ;
use Lingua::PT::Conjugate qw( conjug %long_tense ) ; use Lingua::PT::Infinitives ; use Lingua::PT::VerbSuffixes ; import Lingua::PT::Accent_iso_8859_1 qw(asc2iso); use Exporter ; @ISA = qw(Exporter); # Yes, this package is a namespace polluter. @EXPORT = qw( unconj ); @EXPORT_OK = qw( unconj list_entries string_entries ); BEGIN { # Suffixes and Infinitives. # ####################### VOCALS, CONSONANTS ##################### # Vocals and Consonants $vocs = "aeiouáàäâãéèëêíìïîóòöôõúùüû"; $cons = 'qwrtypsdfghjklzxcvbnm'; $letter = "ç$vocs$cons"; $lpat = "[$letter]" ; # Equivalent accent-matching regexp %equiv = ( "a"=>"[aáàäâã]", "á"=>"[aáàäâã]", "à"=>"[aáàäâã]", "ä"=>"[aáàäâã]", "â"=>"[aáàäâã]", "ã"=>"[aáàäâã]", "e"=>"[eéèëê]", "é"=>"[eéèëê]", "è"=>"[eéèëê]", "ë"=>"[eéèëê]", "ê"=>"[eéèëê]", "i"=>"[iíìïî]", "í"=>"[iíìïî]", "ì"=>"[iíìïî]", "ï"=>"[iíìïî]", "î"=>"[iíìïî]", "o"=>"[oóòöôõ]", "ó"=>"[oóòöôõ]", "ò"=>"[oóòöôõ]", "ö"=>"[oóòöôõ]", "ô"=>"[oóòöôõ]", "õ"=>"[oóòöôõ]", "u"=>"[uúùüû]", "ú"=>"[uúùüû]", "ù"=>"[uúùüû]", "ü"=>"[uúùüû]", "û"=>"[uúùüû]", "c"=>"[cç]", "ç"=>"[cç]", ) ; $equivk = join "", "[", keys(%equiv), "]" ; # Lower_case %mylc = split "", "ÇçÁáÀàÄäÂâÃãÉéÈèËëÊêÍíÌìÏïÎîÓóÒòÖöÔôÕõÚúÙùÜüÛû"; $mylck = join "", "[", keys(%mylc), "]" ; # print "$equivk\n$mylck\n" ; } sub my_lc # lc() for accentuated characters too { my $a = shift ; $a = lc($a) ; $a =~ s/($mylck)/$mylc{$1}/g ; return $a ; } # $r = regexify( $w ) # $r is a regex that will match any ending substring of $w sub regexify { my $r = shift ; my $r0 = $r ; while( $r =~ s/($lpat+)($lpat)/(\?:$1)\?$2/ ){} # print "regexify : $r0 -> $r\n" ; return $r ; }
sub unconj { my $acc = 0 ; # Check errors in accentuation ? my $ret = 0 ; # Retry in case of failure ? while( $_[0] =~ /^-[aA]$/ ) # Get options { my $opt = shift ; $opt =~ s/-//; $acc = 1 if $opt =~ /a/ ; $ret = 1 if $opt =~ /A/ ; # print "unconj : option $opt\n" ; } my $v0 = shift ; # print "unconj : $v0\n" ; my $v = asc2iso( $v0 ); # No ascii-style accents $v = my_lc($v) ; # my @res = (); my %res = (); my $p = regexify( $v ) ; if( $acc ) { ## HERE : assume letters are isolated in $p $p =~ s/\b($equivk)\b/$equiv{$1}/g ; ## print "$p\n" ; } my $p2 = "($p .*)" ; my @matches = $verb_suffixes =~ /^$p2/mg ; push @matches, " cfut,1,", " cfut,3," if $infinitives =~ /^$v$/m ; # print join "\n", @matches,"\n" ; foreach $m (@matches) { my ($s,$t,$p,$r) = $m =~ /^(\S*) (\w+),(\d+),(.*)/ ; my @endings = split ",",$r ; @endings = ("") unless @endings ; # print "Found $m ",0+@endings," endings\n"; # print "-- $s, $t, $p, $r\n" ; my $root = $s ? substr( $v, 0, -length($s) ) : $v ; foreach (@endings) { my $i = $root . $_ ; # $i =~ s/r+r$/r/ ; # Why? ##print "Trying : $i, $t, $p, $root + $_\n" ; next unless $infinitives =~ /^$i$/m ; my $check = conjug("xs",$i,$t,$p) ; $check =~ s/($equivk)/$equiv{$1}/g if $acc ; # print "Check $check\n" if $acc ; # print "Checking $i, $t, $p against $check\n" ; next unless $v =~ /^$check$/ ; # print "Found $i, $t, $p in $m\n" ; # push @res , [$i, $t, $p] ; $res{$i}->{$t}->[$p] = 1 ; } } %res = %{unconj("-a",$v0)} if $ret && !$acc && !keys(%res) ; return \%res ; }
sub string_entries { my $long = 0 ; if( $_[0] eq 'l' ) # Accept a "long verb name" option { $long = 1 ; shift ; } my $vdb = shift ; my ($w,$x,$y,$z); my @res = (); my $vcnt = 0 ; while( ($w,$x) = each %$vdb ) { next if $w eq " " ; next unless $w ; while( ($t,$y) = each %$x ) { foreach $p (1..6) { # print "." if $vcnt % 50 == 49 ; # print "\n".sprintf("%-6d ",$vcnt) # if $vcnt %1000 == 999 ; next unless defined( $y->[$p] ); # HERE : A bug ? If I don't check for "defined" the # first time $long_tense{$t} is used, it is undef'd. $t = $long_tense{$t} if $long && defined( $long_tense{$t} ); push @res, "$w, $t, $p" ; } } } return sort @res ; }
sub list_entries { my $long = 0 ; if( $_[0] eq 'l' ) # Accept a "long verb name" option { $long = 1 ; shift ; } my $vdb = shift ; my ($w,$x,$y,$z); my @res = (); my $vcnt = 0 ; while( ($w,$x) = each %$vdb ) { next if $w eq " " ; next unless $w ; while( ($t,$y) = each %$x ) { foreach $p (1..6) { # print "." if $vcnt % 50 == 49 ; # print "\n".sprintf("%-6d ",$vcnt) # if $vcnt %1000 == 999 ; next unless defined( $y->[$p] ); $t = $long_tense{$t} if $long && defined( $long_tense{$t} ); push @res, [$w, $t, $p ] ; } } } return sort @res ; }
1;