Lingua::TagSet - Natural language tagset conversion


Lingua-TagSet documentation Contained in the Lingua-TagSet distribution.

Index


Code Index:

NAME

Top

Lingua::TagSet - Natural language tagset conversion

VERSION

Top

Version 0.3

DESCRIPTION

Top

This module allows to convert values between different tagsets used in natural language processing, using Lingua::Features as a pivot format

SYNOPSIS

Top

    use Lingua::TagSet::Multext;

    # tagset to features conversions
    my $struct = Lingua::TagSet::Multext->tag2structure($multext);
    my $string = Lingua::TagSet::Multext->tag2string($multext);

    # features to tagset conversions
    my $multext = Lingua::TagSet::Multext->string2tag($string);
    my $multext = Lingua::TagSet::Multext->structure2tag($structure);

Lingua::TagSet->tag2structure()

Convert a tag to a features structure.

Lingua::TagSet->structure2tag()

Convert a features structure to a tag.

Lingua::TagSet->tag2string()

Convert a tag into a string representation of a features structure. The result is cached.

Lingua::TagSet->string2tag()

Convert a string representation of a features structure into a tag. The result is cached.

COPYRIGHT AND LICENSE

Top

AUTHOR

Top

Guillaume Rousse <grousse@cpan.org>


Lingua-TagSet documentation Contained in the Lingua-TagSet distribution.
# $Id: TagSet.pm,v 1.8 2004/06/11 11:35:18 rousse Exp $
package Lingua::TagSet;

use Memoize;
use Lingua::Features::Structure;
use Lingua::TagSet::Tag;
use strict;
use warnings;

our $VERSION = '0.3';

my (%tag2string, %string2tag);
memoize 'tag2string', SCALAR_CACHE => [ HASH => \%tag2string ];
memoize 'string2tag', SCALAR_CACHE => [ HASH => \%string2tag ];

my %tokens_trees;
my %features_trees;
my %tokens_tables;
my %features_tables;

sub _init {
    my ($class) = @_;

    no strict 'refs';

    my $tokens_tree    = {};
    my $features_tree  = {};
    my $tokens_table   = {};
    my $features_table = {};

    foreach my $map (@{$class . '::id_maps'}) {

	# token to feature: tree of tokens valued by mappings
	my $token_leaf;
	$token_leaf->{features} = $map->{features};
	$token_leaf->{submap}   = { _index(@{$map->{submap}}) } if $map->{submap};
	$token_leaf->{specific} = ref $map->{tokens}->[0] ne 'ARRAY';
	my $tag        = Lingua::TagSet::Tag->new(@{$map->{tokens}});
	my @token_list = _get_tokens_list($tag);
	_build_tree($tokens_tree, \@token_list, 0, $token_leaf);

	# feature to token: tree of features valued by mappings
	my $feature_leaf;
	$feature_leaf->{tokens} = $map->{tokens};
	$feature_leaf->{submap} = { _index(reverse @{$map->{submap}}) } if $map->{submap};
	$feature_leaf->{specific} = ref $map->{features}->{cat} ne 'ARRAY';
	my $structure = Lingua::Features::Structure->new(%{$map->{features}});
	my @feature_list = _get_features_list($structure);
	_build_tree($features_tree, \@feature_list, 0, $feature_leaf);
    }

    while (my ($type, $map) = each %{$class . '::value_maps'}) {
	# token to feature: feature values indexed by token value
	$tokens_table->{$type} = { _index(@{$map}) };
	# feature to token: token values indexed by feature value
	$features_table->{$type} = { _index(reverse @{$map}) };
    }

    $tokens_trees{$class}    = $tokens_tree;
    $features_trees{$class}  = $features_tree;
    $tokens_tables{$class}   = $tokens_table;
    $features_tables{$class} = $features_table;
}

sub _index {
    my (@list) = @_;
    my %hash;
    while (@list) {
	my $key   = shift @list;
	my $value = shift @list;
	push(@{$hash{$key}}, $value); 
    }
    return %hash;
}

sub _get_features_list {
    my ($structure) = @_;
    my $i;
    my @list = grep { $i++ % 2 } $structure->get_features();
    pop @list while ! defined $list[-1];
    return @list;
}

sub _get_tokens_list {
    my ($tag) = @_;
    my @list = $tag->get_tokens();
    pop @list while ! defined $list[-1];
    return @list;
}

sub tag2structure {
    my ($class, $tag, %args) = @_;

    return unless $tag;

    # get converter data
    my $tree  = $tokens_trees{$class};
    my $table = $tokens_tables{$class};

    # find matching maps
    my @tokens  = _get_tokens_list($tag);
    my @id_maps = _parse_tree($tree, \@tokens, 0);
    return unless @id_maps;

    # select most relevant one
    my $id_map = $id_maps[0];

    # create base structure
    my $structure = Lingua::Features::Structure->new(%{$id_map->{features}});

    # compute other features
    my $submap = $id_map->{submap};
    if ($submap) {
	foreach my $token_id (sort keys %{$submap}) {
	    my $token_values = $tag->get_token($token_id);
	    next unless $token_values; # unknown value

	    my $feature_ids = $submap->{$token_id};
	    foreach my $feature_id (@{$feature_ids}) {
		my $type = $structure->get_type()->{$feature_id};
		die "no feature $feature_id" unless $type;

		my $type_id   = $type->id();
		my $value_map = $table->{$type_id};
		die "no value map for type $type_id" unless $value_map;

		my @feature_values;
		foreach my $token_value (@{$token_values}) {
		    push(@feature_values, @{$value_map->{$token_value}}) if $value_map->{$token_value};

		}

		if (@feature_values) {
		    $structure->set_feature($feature_id, \@feature_values);
		} else {
		    if ($args{no_strict_align}) {
			# some tagset skip unknown values
			$tag->insert_token($token_id, undef);
		    } else {
			# unknown value
			$structure->set_feature($feature_id, undef);
		    }
		}
	    }
	}
    }

    return $structure;
}

sub structure2tag {
    my ($class, $structure, %args) = @_;

    return unless $structure;

    # get converter data
    my $tree  = $features_trees{$class};
    my $table = $features_tables{$class};

    # find matching maps
    my @features = _get_features_list($structure);
    my @id_maps = _parse_tree($tree, \@features, 0);
    return unless @id_maps;

    # select most relevant one
    my $id_map = _select_alternative_maps(
	\@id_maps,
	[
	    sub { return $_->{specific} }, # prefer specific maps
	    sub { return $_->{submap} }    # prefer exhaustive maps
	]
    );

    # create base tag
    my $tag = Lingua::TagSet::Tag->new(@{$id_map->{tokens}});

    # compute other tokens
    my $submap = $id_map->{submap};
    if ($submap) {
	foreach my $feature_id (%{$submap}) {
	    my $feature_values = $structure->get_feature($feature_id);
	    next unless $feature_values; # unknown value

	    my $type = $structure->get_type()->{$feature_id};
	    die "no feature $feature_id" unless $type;

	    my $type_id   = $type->id();
	    my $value_map = $table->{$type_id};
	    die "no value map for type $type_id" unless $value_map;

	    my $token_ids = $submap->{$feature_id};
	    foreach my $token_id (@{$token_ids}) {
		my @token_values;
		foreach my $feature_value (@{$feature_values}) {
		    push(@token_values, @{$value_map->{$feature_value}}) if $value_map->{$feature_value};
		}

		# take care of multiple-mapped features
		my $current_values = $tag->get_token($token_id);
		if ($current_values) {
		    # keep only intersection of two sets
		    my (%union, %intersection);
		    foreach my $value (@token_values, @{$current_values}) {
			$union{$value}++ && $intersection{$value}++;
		    }
		    @token_values = keys %intersection;
		}

		if (@token_values) {
		    $tag->set_token($token_id, \@token_values);
		} else {
		    $tag->set_token($token_id, undef);
		}
	    }
	}
    }

    return $tag;
}

sub _build_tree {
    my ($node, $list, $index, $leaf) = @_;

    # extract values for current position
    my $values = $list->[$index] || [ '_any' ];

    # construct a branch for each value
    foreach my $value (@{$values}) {
	# add a new node
	$node->{$value} = {} unless $node->{$value};

	if ($index != $#$list) {
	    # build tree further
	    _build_tree($node->{$value}, $list, ++$index, $leaf);
	} else {
	    # add leaves
	    if ($node->{$value}->{_map}) {
		push (@{$node->{$value}->{_map}}, $leaf);
	    } else {
		$node->{$value}->{_map} = [ $leaf ]
	    }
	}
    }
}

sub _parse_tree {
    my ($node, $list, $index) = @_;

    # extract values for current position
    my $values = $list->[$index];

    my @results;

    # browse each potential specific branch if values are known
    if ($values) {
	foreach my $value (@{$values}) {
	    if ($node->{$value}) {
		push(@results, _parse_tree($node->{$value}, $list, ++$index));
	    }
	}
    }

    # browse generic branch if present
    if ($node->{_any}) {
	push(@results, _parse_tree($node->{_any}, $list, ++$index));
    }

    # return current maps if no other results found sofar
    unless (@results) {
	@results = @{$node->{_map}} if $node->{_map};
    }

    # otherwise return current result
    return @results;
}

sub _select_alternative_maps {
    my ($maps, $functions) = @_;

    # return unique solution
    return $maps->[0] if $#$maps == 0;

    my $function = shift @$functions;

    # keep filtering while criterias available
    if ($function) {
	my @filtered_maps = grep { $function->($_) } @$maps;
	return @filtered_maps ?
	    _select_alternative_maps(\@filtered_maps, $functions) :
	    _select_alternative_maps($maps, $functions) ;
    }

    # otherwise merge remaining mappings
    my $max = 0;
    for my $map (@$maps) {
	$max = $#{$map->{tokens}} if $#{$map->{tokens}} > $max;
    }

    my @tokens;
    for (my $i = 0; $i <= $max; $i++) {
	foreach my $map (@$maps) {
	    my $item = $map->{tokens}->[$i];
	    if ($item) {
		if (ref $item eq 'ARRAY') {
		    push(@{$tokens[$i]}, @{$item});
		} else {
		    push(@{$tokens[$i]}, $item);
		}
	    }
	}
    }

    return {
	tokens => \@tokens
    }
}

sub tag2string {
    my ($class, $tag_string) = @_;
    return unless $tag_string;
    my $structure = $class->tag2structure($tag_string);
    return $structure ? $structure->to_string(): '';
}

sub string2tag {
    my ($class, $structure_string) = @_;
    return unless $structure_string;
    my $structure = Lingua::Features::Structure->from_string($structure_string);
    my $tag_string = $class->structure2tag($structure);
    return $tag_string ? $tag_string : '';
}

1;