| Lingua-MAF documentation | Contained in the Lingua-MAF distribution. |
Lingua::MAF - Morpho-Syntactic Annotation Framework (MAF) format writer
Version 0.1
This module makes it easy to produce MAF-format documents. MAF (Morpho-Syntactic Annotation Framework) is a standard proposal developed by the ISO TC37SC4 committee; see http://www.tc37sc4.org for further details.
use Lingua::MAF;

# Create a document using the reduced and compact format variants.
my $maf = Lingua::MAF->new(
    author   => 'me',
    language => 'english',
    format   => {
        reduced => 1,
        compact => 1,
    }
);

# Register the token that the word forms below refer to by id.
$maf->add_token(
    id      => 't1',
    content => 'content1',
    pleft   => 0,
    pright  => 6
);

# Two consecutive word forms: state 0 -> 1, then state 1 -> 2.
$maf->add_word_form(
    source => 0,
    target => 1,
    tokens => [ 't1' ],
    form   => 'form1',
    entry  => 'entry1',
    tag    => 'tag1'
);
$maf->add_word_form(
    source => 1,
    target => 2,
    tokens => [ 't1' ],
    form   => 'form2',
    entry  => 'entry2',
    tag    => 'tag2'
);

# Emit the pending word forms, then flush the rest of the document.
$maf->flush_fsm();
$maf->flush_document();
new(%options): Creates and returns a new Lingua::MAF object.
%options is a hash with the following keys:
author: the author of the document
language: the language of the document
format: the format of this document, as a hashref with the following keys:
tagset: add the full tagset at the beginning of the document
reduced: use reduced fsm format
compact: use compact tag format
standoff: use standoff format
embedded_token: use embedded token format
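add_token(%token): Adds a token to the document. %token is a hash with the following keys: id (the identifier word forms use to refer to the token), content (the token text, written when the standoff format is not used), pleft and pright (the start and end positions, written as the "from" and "to" attributes in standoff format).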
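add_word_form(%word_form): Adds a word form to the document. %word_form is a hash with the following keys: source and target (the states the word form links), tokens (an arrayref of token ids), form, entry, tag, and optionally author.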
paste_fsm_twig($fsm): Directly pastes an fsm twig extracted from another MAF document.
flush_fsm(): Merges the current word forms and related tokens into a finite state automaton, flushed to STDOUT.
flush_document(): Flushes the current document to STDOUT.
Copyright (C) 2005, INRIA.
This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
Guillaume Rousse <grousse@cpan.org>
Eric de la Clergerie, <Eric.De_La_Clergerie@inria.fr>
| Lingua-MAF documentation | Contained in the Lingua-MAF distribution. |
# $Id: MAF.pm,v 1.6 2005/05/26 15:53:17 rousse Exp $

# The package declaration must sit on its own line: when it shares a line
# with the comment above it is commented out and every sub below is compiled
# into package main instead.
package Lingua::MAF;

use warnings;
use strict;

use XML::Twig;
use Lingua::Features;
use DateTime;
use Carp;

our $VERSION = '0.1';
# Constructor: builds the XML::Twig document with its <maf> root element and
# returns a new object blessed into $class.
#
# Options (flat key/value list):
#   author   - value of the root element's "author" attribute
#   language - value of the root element's "language" attribute
#   format   - optional hash reference of flags: tagset, reduced, compact,
#              standoff, embedded_token, debug
#
# Croaks if "format" is supplied but is not a hash reference.
sub new {
    my ($class, %options) = @_;

    # Work on a private copy of the format flags: the tagset option forces
    # compact mode below, and that must not leak into the caller's hash.
    my $format_in = $options{format};
    require Scalar::Util;
    croak "format must be a hash reference"
        if defined $format_in
        && (Scalar::Util::reftype($format_in) || '') ne 'HASH';
    my %format = $format_in ? %{$format_in} : ();

    my $twig = XML::Twig->new(
        pretty_print    => 'indented',
        output_encoding => 'ISO-8859-1',
    );

    my $root = XML::Twig::Elt->new(
        'maf',
        {
            author     => $options{author},
            date       => DateTime->now()->ymd(),
            language   => $options{language},
            addressing => $format{standoff} ? 'byte' : 'embedded',
        }
    );

    # The "variant" attribute lists the requested variants in this fixed
    # order; it is omitted when none was requested.
    my @variant = grep { $format{$_} } qw(reduced compact embedded_token);
    $root->set_att('variant' => join(' ', @variant)) if @variant;

    $twig->set_root($root);

    if ($format{tagset}) {
        # paste() is called without a position, as in the original code
        # (presumably XML::Twig's default, first child -- TODO confirm).
        my $tagset = _tagset2xml();
        $tagset->paste($root);

        # NOTE(review): compact mode is forced only after "variant" has been
        # written, so the attribute does not mention it; kept as is.
        $format{compact} = 1;
    }

    return bless {
        _twig           => $twig,
        _debug          => $format{debug},
        _reduced        => $format{reduced},
        _compact        => $format{compact},
        _standoff       => $format{standoff},
        _embedded_token => $format{embedded_token},
    }, $class;
}
# Registers a token so that word forms can later refer to it by id.
#
# %token keys read elsewhere in this file: id, content, pleft, pright.
# Registering a second token with the same id replaces the first one.
#
# Returns a reference to the stored token hash.
# Croaks when called as a class method, or when no id is given (the token
# table is keyed by id, so an undefined id would collapse to the key '').
sub add_token {
    my ($self, %token) = @_;
    croak "not a class method" unless ref $self;
    croak "token id is required" unless defined $token{id};

    $self->{_tokens}->{ $token{id} } = \%token;
    return \%token;
}
# Queues a word form for the next flush_fsm() call.
#
# Word forms are stored under $self->{_fsm}{source}{target}, several word
# forms may share the same pair of states.
#
# %word_form keys read elsewhere in this file: source, target, tokens
# (array reference of token ids), form, entry, tag, author.
#
# Returns the number of word forms now queued for that pair of states.
# Croaks when called as a class method, when source or target is missing,
# or when tokens is given but is not an array reference (these would
# otherwise only fail later, while flushing).
sub add_word_form {
    my ($self, %word_form) = @_;
    croak "not a class method" unless ref $self;

    foreach my $state (qw(source target)) {
        croak "word form $state is required"
            unless defined $word_form{$state};
    }
    croak "word form tokens must be an array reference"
        if defined $word_form{tokens} && ref $word_form{tokens} ne 'ARRAY';

    my ($source, $target) = @word_form{qw(source target)};
    return push @{ $self->{_fsm}->{$source}->{$target} }, \%word_form;
}
# Moves an already built fsm element into this document and flushes it.
#
# $fsm is cut from wherever it currently lives, pasted as the last child of
# this document's root element, and the twig is then flushed through
# _flush($fsm). The value returned by _flush() is passed back to the caller.
#
# Croaks when called as a class method.
sub paste_fsm_twig {
    my ($self, $fsm) = @_;

    if (!ref $self) {
        croak "not a class method";
    }

    # Detach first, then look up the destination root.
    $fsm->cut();
    my $document_root = $self->{_twig}->root();
    $fsm->paste('last_child', $document_root);

    return $self->_flush($fsm);
}
# Turns the queued word forms (and the tokens they reference) into XML,
# pastes the result under the document root and flushes the twig.
#
# Reduced format: a state with a single outgoing target yields a bare word
# form (or a <wfAlt> when several word forms share that arc); a state with
# several targets opens an <fsm> element, which is closed when the state
# equal to the largest target of that first branching state is reached (or
# at the end of the input).
# Full format: everything goes into one <fsm> whose init/final attributes are
# the smallest source state and the largest target state.
#
# Afterwards the emitted tokens are removed from the token table and the
# word form queue is emptied. Returns whatever _flush() returns.
# Croaks when called as a class method.
sub flush_fsm {
    my ($self) = @_;
    croak "not a class method" unless ref $self;

    my $root    = $self->{_twig}->root();
    my $current = $root;
    my %used_tokens;

    # Pastes under $parent the <token> element of every token referenced by
    # the given word forms that was not emitted yet during this call.
    # Nothing is done in embedded-token mode (tokens are then emitted by
    # _word_form2xml). Ids missing from the token table -- never added, or
    # already emitted by an earlier flush -- are skipped instead of producing
    # an empty <token> element.
    my $paste_tokens = sub {
        my ($parent, @word_forms) = @_;
        return if $self->{_embedded_token};
        foreach my $word_form (@word_forms) {
            foreach my $id (@{ $word_form->{tokens} }) {
                next if $used_tokens{$id};
                my $token = $self->{_tokens}->{$id};
                next unless defined $token;
                $self->_token2xml($token)->paste('last_child', $parent);
                $used_tokens{$id} = 1;
            }
        }
        return;
    };

    if ($self->{_reduced}) {
        # $in_fsm is both a flag and a value: false when no <fsm> is open,
        # otherwise the state at which the open <fsm> ends.
        my $in_fsm   = 0;
        my $fsmstart = 0;

        # Finalizes the open <fsm> and makes the root current again.
        my $close_fsm = sub {
            my ($final) = @_;
            $current->set_att('final', $final);
            $current->set_att('init',  $fsmstart);
            $current->paste('last_child', $root);
            $current = $root;
            $in_fsm  = 0;
            return;
        };

        foreach my $left (sort { $a <=> $b } keys %{ $self->{_fsm} }) {
            $close_fsm->($left) if $in_fsm && $left == $in_fsm;

            my $arcs   = $self->{_fsm}->{$left};
            my @rights = sort { $b <=> $a } keys %{$arcs};

            if (@rights == 1) {
                my $right      = $rights[0];
                my @word_forms = @{ $arcs->{$right} };

                # Tokens always go directly under the root in reduced mode.
                $paste_tokens->($root, @word_forms);

                my $node;
                if (@word_forms > 1) {
                    $node = XML::Twig::Elt->new('wfAlt');
                    foreach my $word_form (@word_forms) {
                        $self->_word_form2xml($word_form)
                            ->paste('last_child', $node);
                    }
                }
                else {
                    $node = $self->_word_form2xml($word_forms[0]);
                }

                if ($in_fsm) {
                    # Inside an <fsm> the arc must be spelled out.
                    my $transition = XML::Twig::Elt->new(
                        'transition',
                        { source => $left, target => $right }
                    );
                    $node->paste('last_child', $transition);
                    $transition->paste('last_child', $current);
                }
                else {
                    $node->paste('last_child', $current);
                }
            }
            else {
                unless ($in_fsm) {
                    $fsmstart = $left;
                    $in_fsm   = $rights[0];    # largest target of this state
                    $current  = XML::Twig::Elt->new('fsm');
                }

                # NOTE(review): these word forms are pasted without a
                # <transition> wrapper, unlike the single-target case above;
                # kept as in the original -- confirm against the MAF schema.
                foreach my $right (@rights) {
                    foreach my $word_form (@{ $arcs->{$right} }) {
                        $paste_tokens->($root, $word_form);
                        $self->_word_form2xml($word_form)
                            ->paste('last_child', $current);
                    }
                }
            }
        }

        $close_fsm->($in_fsm) if $in_fsm;
    }
    else {
        $current = XML::Twig::Elt->new('fsm');
        $current->paste('last_child', $root);

        # Track the real extremes instead of seeding the minimum with a
        # magic 1000, which gave a wrong "init" for states above 1000.
        my ($init, $final);
        foreach my $left (sort { $a <=> $b } keys %{ $self->{_fsm} }) {
            $init = $left if !defined $init || $left <= $init;

            my $arcs = $self->{_fsm}->{$left};
            foreach my $right (sort { $b <=> $a } keys %{$arcs}) {
                $final = $right if !defined $final || $right >= $final;

                foreach my $word_form (@{ $arcs->{$right} }) {
                    $paste_tokens->($current, $word_form);
                    $self->_word_form2xml($word_form)
                        ->paste('last_child', $current);
                }
            }
        }

        # Nothing queued: fall back to 0 for both bounds.
        $init  = 0 unless defined $init;
        $final = 0 unless defined $final;

        $current->set_att('init',  $init);
        $current->set_att('final', $final);
    }

    # clear variables
    delete $self->{_tokens}->{$_} foreach keys %used_tokens;
    $self->{_fsm} = {};

    # flush until current fsm
    return $self->_flush($current);
}
# Flushes everything that is still held in the twig.
# Croaks when called as a class method.
sub flush_document {
    my ($self) = @_;
    croak "not a class method" unless ref $self;

    # flush all the twig
    return $self->_flush();
}

# Flushes the twig, up to $elt when one is given. Without $elt, flush() is
# called with no argument at all rather than with an explicit undef.
sub _flush {
    my ($self, $elt) = @_;

    my $twig = $self->{_twig};
    return defined $elt ? $twig->flush($elt) : $twig->flush();
}

# Builds the <token> element for a token hash (keys: id, content, pleft,
# pright). In standoff mode the token carries from/to attributes; otherwise
# its text content is written.
sub _token2xml {
    my ($self, $token) = @_;

    my $xml = XML::Twig::Elt->new('token', { id => $token->{id} });

    if ($self->{_standoff}) {
        $xml->set_att('from', $token->{pleft});
        $xml->set_att('to',   $token->{pright});
    }
    else {
        $xml->set_content($token->{content});
    }

    return $xml;
}

# Builds the XML for one word form hash and returns it: a <wordForm>
# element, wrapped in a <transition source=... target=...> element unless
# the reduced format is in use.
#
# Tokens: in embedded-token mode the tokens still present in the token
# table are embedded as children, and a "tokens" attribute listing all ids
# is added only when some of them cannot be embedded; otherwise the
# "tokens" attribute is always written.
# Tag: in compact mode it is written verbatim as the "tag" attribute;
# otherwise it is parsed with Lingua::Features and expanded into an <fs>
# feature structure.
sub _word_form2xml {
    my ($self, $word_form) = @_;

    my $xml = XML::Twig::Elt->new(
        'wordForm',
        {
            author => $word_form->{author},
            entry  => $word_form->{entry},
            form   => $word_form->{form},
        }
    );

    my @token_ids = @{ $word_form->{tokens} || [] };

    if ($self->{_embedded_token}) {
        # Only tokens that are actually known can be embedded; building an
        # element from an undefined token would yield an empty <token>.
        my @known = grep { defined $self->{_tokens}->{$_} } @token_ids;

        $xml->set_att('tokens', join(' ', @token_ids))
            if @known < @token_ids;

        foreach my $id (@known) {
            $self->_token2xml($self->{_tokens}->{$id})
                ->paste('last_child', $xml);
        }
    }
    else {
        $xml->set_att('tokens', join(' ', @token_ids));
    }

    if ($self->{_compact}) {
        $xml->set_att('tag', $word_form->{tag});
    }
    else {
        my $fs = XML::Twig::Elt->new('fs');

        # NOTE(review): assumes from_string() always returns an object for
        # the given tag -- confirm against Lingua::Features.
        my %features = Lingua::Features::Structure->from_string(
            $word_form->{tag}
        )->get_features();

        # Sorted so that the output does not depend on hash ordering.
        foreach my $name (sort keys %features) {
            my $values = $features{$name};
            next unless $values;

            my $f = XML::Twig::Elt->new('f', { name => $name });

            # Several values are alternatives: they are grouped in a <vAlt>
            # placed INSIDE the <f> element, so that <fs> only ever contains
            # <f> children.
            my $holder = $f;
            if (@$values > 1) {
                $holder = XML::Twig::Elt->new('vAlt');
                $holder->paste('last_child', $f);
            }

            foreach my $value (@$values) {
                my $symbol = XML::Twig::Elt->new(
                    'symbol', { value => $value }
                );
                $symbol->paste('last_child', $holder);
            }

            $f->paste('last_child', $fs);
        }

        $fs->paste('last_child', $xml);
    }

    unless ($self->{_reduced}) {
        my $transition = XML::Twig::Elt->new(
            transition => {
                source => $word_form->{source},
                target => $word_form->{target},
            }
        );
        $xml->paste('last_child', $transition);
        $xml = $transition;
    }

    return $xml;
}

# Builds the <tagset> element: for every feature type known to
# Lingua::Features, a <dcs> declaration followed by a <vLib> holding one
# <symbol> per value of that type. Takes no argument (new() calls it as a
# plain function).
sub _tagset2xml {
    my $tagset = XML::Twig::Elt->new('tagset');

    foreach my $type (Lingua::Features::FeatureType->types()) {
        my $id = $type->id();

        my $dcs = XML::Twig::Elt->new(
            dcs => {
                private    => $id,
                registered => "dcs:morphosyntax:fr:$id",
                rel        => "eq",
            }
        );
        $dcs->paste('last_child', $tagset);

        my $vlib = XML::Twig::Elt->new(vLib => { name => $id });
        $vlib->paste('last_child', $tagset);

        foreach my $value_id ($type->values()) {
            my $value_name = $type->value_name($value_id);
            my $symbol     = XML::Twig::Elt->new(
                symbol => { value => $value_name, id => $value_id }
            );
            $symbol->paste('last_child', $vlib);
        }
    }

    return $tagset;
}
1;