Lingua::PT::UnConjugate - Recognition of the conjugated forms of


Lingua-PT-Conjugate documentation Contained in the Lingua-PT-Conjugate distribution.

Index


Code Index:

NAME

Top

Lingua::PT::UnConjugate - Recognition of the conjugated forms of portuguese verbs.

DESCRIPTION

Top

This module provides functions for the recognition of the conjugated forms of portuguese verbs.

BUGS

Top

Composed tenses are not recognized. The verb list contains many non-verbs that I have not removed yet.

SYNOPSIS

Top

$verb_forms = unconj( [-a] , $string )

Top

Attempts to recognize a conjugated form of a Portuguese verb, and returns the result as a reference to hash : if the element

    C<$verb_forms-E<gt>{$infinitive}-E<gt>{$tense}-E<gt>[$person]>

is true, then the conjugation of the verb "$infinitive" at the tense "$tense" and the person "$person" should yield "$string".

OPTIONS

The first argument may an option :

-a : Try to match accentuation errors.
-A : If no match is found, try matching with option -a.

@res = string_entries( ['l'], \%verb_forms )

Top

Convert a hash of recognized forms into a list of strings "$verb, $tense, $person".

If the first argument is a 'l', then long forms of verb names will be used.

@res = list_entries( ['l'], \%verb_forms )

Top

Convert a hash of recognized forms into a list of triplets [ $verb, $tense, $person ].

If the first argument is a 'l', then long forms of verb names will be used.

SEE ALSO : unconj, conjug, treinar.

Top

AUTHOR Etienne Grossmann, 1999 [etienne@isr.ist.utl.pt]

Top

CREDITS

Top

Thanks to Soraia Almeida (salmeida@logos.it) from the Logos project (http://www.logos.it) and Ulisses Pinto and José João Almeida from Projecto Natura (http://shiva.di.uminho.pt/~jj/pln) who made Ispell available.

A big part of the list of verb infinitives comes from files used in Ispell (http://shiva.di.uminho.pt/~jj/pln) and in Logos (http://www.verba.org, http://www.logos.it). these projects. Some verbs were removed and others added by hand.


Lingua-PT-Conjugate documentation Contained in the Lingua-PT-Conjugate distribution.
#!/usr/bin/perl -w
#
# Perl package exporting a function "unconj" that un-conjugates 
# Portuguese verbs. 
# 
# Author : Etienne Grossmann (etienne@isr.ist.utl.pt) 
# 
# Date   : September 1999 onwards.
#
# 
package Lingua::PT::UnConjugate ;

use Lingua::PT::Conjugate qw( conjug %long_tense ) ;

use Lingua::PT::Infinitives ;
use Lingua::PT::VerbSuffixes ;
import Lingua::PT::Accent_iso_8859_1 qw(asc2iso);
use Exporter ;
@ISA = qw(Exporter);
# Yes, this package is a namespace polluter. 
@EXPORT = qw( unconj ); 
@EXPORT_OK = qw( unconj list_entries string_entries ); 


BEGIN {
				# Suffixes and Infinitives.

    # ####################### VOCALS, CONSONANTS ##################### 
    # Vocals and Consonants   
    $vocs = "aeiouáàäâãéèëêíìïîóòöôõúùüû";
    $cons = 'qwrtypsdfghjklzxcvbnm';
    $letter = "ç$vocs$cons";
    $lpat = "[$letter]" ;
				# Equivalent accent-matching regexp
    %equiv = ( "a"=>"[aáàäâã]",
	       "á"=>"[aáàäâã]",
	       "à"=>"[aáàäâã]",
	       "ä"=>"[aáàäâã]",
	       "â"=>"[aáàäâã]",
	       "ã"=>"[aáàäâã]",
	       "e"=>"[eéèëê]",
	       "é"=>"[eéèëê]",
	       "è"=>"[eéèëê]",
	       "ë"=>"[eéèëê]",
	       "ê"=>"[eéèëê]",
	       "i"=>"[iíìïî]",
	       "í"=>"[iíìïî]",
	       "ì"=>"[iíìïî]",
	       "ï"=>"[iíìïî]",
	       "î"=>"[iíìïî]",
	       "o"=>"[oóòöôõ]",
	       "ó"=>"[oóòöôõ]",
	       "ò"=>"[oóòöôõ]",
	       "ö"=>"[oóòöôõ]",
	       "ô"=>"[oóòöôõ]",
	       "õ"=>"[oóòöôõ]",
	       "u"=>"[uúùüû]",
	       "ú"=>"[uúùüû]",
	       "ù"=>"[uúùüû]",
	       "ü"=>"[uúùüû]",
	       "û"=>"[uúùüû]",
	       "c"=>"[cç]",
	       "ç"=>"[cç]",
	       ) ;
    $equivk = join "", "[", keys(%equiv), "]" ;

				# Lower_case
    %mylc = split "",
    "ÇçÁáÀàÄäÂâÃãÉéÈèËëÊêÍíÌìÏïÎîÓóÒòÖöÔôÕõÚúÙùÜüÛû";  
    $mylck = join "",  "[", keys(%mylc), "]" ;
    # print "$equivk\n$mylck\n" ;

}

sub my_lc			# lc() for accentuated characters too 
{
    my $a = shift ;
    $a = lc($a) ;
    $a =~ s/($mylck)/$mylc{$1}/g ;
    return $a ;
}

# $r = regexify( $w ) 
# $r is a regex that will match any ending substring of $w 
sub regexify			
{
    my $r = shift ;
    my $r0 = $r ;
    while( $r =~ s/($lpat+)($lpat)/(\?:$1)\?$2/ ){}
    # print "regexify : $r0 -> $r\n" ;
    return $r ;
}

sub unconj
{
    my $acc = 0 ;		# Check errors in accentuation ?
    my $ret = 0 ;		# Retry in case of failure ?
    while( $_[0] =~ /^-[aA]$/ )	# Get options
    {
	my $opt = shift ;
	$opt =~ s/-//;
	$acc = 1 if $opt =~ /a/ ;
	$ret = 1 if $opt =~ /A/ ;
	# print "unconj : option $opt\n" ;
    }
    my $v0 = shift ;
    # print "unconj : $v0\n" ;
    my $v = asc2iso( $v0 );	# No ascii-style accents
    $v = my_lc($v) ;
    # my @res = ();
    my %res = ();
    
    my $p = regexify( $v ) ;
    if( $acc )
    {
	## HERE : assume letters are isolated in $p
	$p =~ s/\b($equivk)\b/$equiv{$1}/g ;
	## print "$p\n" ;
    }
    my $p2 = "($p .*)" ;
    my @matches = $verb_suffixes =~ /^$p2/mg ;
    
    push @matches, " cfut,1,", " cfut,3," if $infinitives =~ /^$v$/m ;
    
    # print join "\n", @matches,"\n" ;
    foreach $m (@matches)
    {
	my ($s,$t,$p,$r) = $m =~ /^(\S*) (\w+),(\d+),(.*)/ ;
	my @endings = split ",",$r ;
	@endings = ("") unless @endings ;
	# print "Found $m ",0+@endings," endings\n";
	# print "-- $s, $t, $p, $r\n" ;
	my $root = $s ? substr( $v, 0, -length($s) ) : $v ;
	foreach (@endings)
	{
	    my $i = $root . $_ ;
	    # $i =~ s/r+r$/r/ ;	# Why?
	    ##print "Trying : $i, $t, $p, $root + $_\n" ;
	    next unless $infinitives =~ /^$i$/m ;
	    my $check = conjug("xs",$i,$t,$p) ;
	    $check =~ s/($equivk)/$equiv{$1}/g if $acc ;
	    # print "Check $check\n" if $acc ;
	    # print "Checking $i, $t, $p against $check\n" ;
	    next unless $v =~ /^$check$/ ;
	    # print "Found $i, $t, $p in $m\n" ;
	    # push @res , [$i, $t, $p] ; 
	    $res{$i}->{$t}->[$p] = 1 ;
	}
    }
    %res = %{unconj("-a",$v0)} if $ret && !$acc && !keys(%res) ;
    return \%res ;
}

sub string_entries
{
    my $long = 0 ;
    if( $_[0] eq 'l' )		# Accept a "long verb name" option
    {
	$long = 1 ;
	shift ;
    }
    my $vdb = shift ;
    my ($w,$x,$y,$z);
    my @res = ();

    my $vcnt = 0 ;

    while( ($w,$x) = each %$vdb ) {
	next if $w eq " " ;
	next unless $w ;
	while( ($t,$y) = each %$x ) {
	    foreach $p (1..6)
	    {
		# print "." if $vcnt % 50 == 49 ;
		# print "\n".sprintf("%-6d ",$vcnt) 
		# if $vcnt %1000 == 999 ;
		next unless defined( $y->[$p] );
		# HERE : A bug ? If I don't check for "defined" the
		# first time $long_tense{$t} is used, it is undef'd.
		$t = $long_tense{$t} if $long   && 
		     defined( $long_tense{$t}  );
		push @res, "$w, $t, $p" ;
	    }
	}
    }
    return sort @res ;
}

 
sub list_entries
{
    my $long = 0 ;
    if( $_[0] eq 'l' )		# Accept a "long verb name" option
    {
	$long = 1 ;
	shift ;
    }
    my $vdb = shift ;
    my ($w,$x,$y,$z);
    my @res = ();

    my $vcnt = 0 ;

    while( ($w,$x) = each %$vdb ) {
	next if $w eq " " ;
	next unless $w ;
	while( ($t,$y) = each %$x ) {
	    foreach $p (1..6)
	    {
		# print "." if $vcnt % 50 == 49 ;
		# print "\n".sprintf("%-6d ",$vcnt) 
		# if $vcnt %1000 == 999 ;
		next unless defined( $y->[$p] );
		$t = $long_tense{$t} if $long && 
		     defined( $long_tense{$t} );
		push @res, [$w, $t, $p ] ;
	    }
	}
    }
    return sort @res ;
}


1;