| Lingua-EN-Semtags-Engine documentation | Contained in the Lingua-EN-Semtags-Engine distribution. |
Lingua::EN::Semtags::Engine - extract semantic tags (semtags) from English text
use Lingua::EN::Semtags::Engine;
my $engine = Lingua::EN::Semtags::Engine->new;
my @semtags = $engine->semtags("your blog post title");
Lingua::EN::Semtags uses Lingua::EN::Tagger and WordNet::QueryData to extract semantic tags (semtags) from English text. Semtags are words which reflect the semantic essence of the text (similar to topic keywords).
Lingua::EN::Semtags was designed and developed to solve a particular problem I was facing.
Problem: a user is processing blog post titles and needs to programmatically determine the posts' semantic context.
Solution: the user feeds a blog post title to Lingua::EN::Semtags and gets back a set of semtags which can be used for further processing (e.g., web searches).
Example: a blog post title like "BBtv: Graffiti Research Lab, the movie" (boingboing.net, Posted by Xeni Jardin, April 24, 2008 8:00 AM) would produce the following semtags: [DECORATION WORKPLACE SHOW].
Please note that the module makes the following assumptions when attempting to extract semtags:
Calls sentence($string), gets back a populated instance of
Lingua::EN::Semtags::Sentence, iterates over its
Lingua::EN::Semtags::LangUnits, populates and returns an array of their
semtags.
Returns an instance of Lingua::EN::Semtags::Sentence populates with
Lingua::EN::Semtags::LangUnit objects which represnet meaningful language
units.
Returns the Lingua::EN::Tagger instance used by the engine.
Returns/sets the verbose mode.
Returns the WordNet::QueryData instance used by the engine.
Igor Myroshnichenko <igorm@cpan.org>
Copyright (c) 2008, All Rights Reserved.
This software is free software and may be redistributed and/or modified under the same terms as Perl itself.
| Lingua-EN-Semtags-Engine documentation | Contained in the Lingua-EN-Semtags-Engine distribution. |
package Lingua::EN::Semtags::Engine; use strict; use warnings; #use Data::Dumper; use WordNet::QueryData 1.46; use Lingua::EN::Tagger 0.11; use List::Util qw(min max); use Lingua::EN::Semtags::Sentence; use Lingua::EN::Semtags::LangUnit; use constant SEMTAG_ISA_INDEX => 1; # TODO May be calculated use constant PHRASE_FRAME_SIZE => 3; use constant MIN_ISAS => 3; use constant ISAS => 'hypes'; # Hypernyms use constant TRUE => 1; use constant FALSE => 0; our $VERSION = '0.01'; #============================================================ sub new { #============================================================ my ($invocant, %args) = @_; my $self = bless ({}, ref $invocant || $invocant); $self->_init(%args); return $self; } #============================================================ sub _init { #============================================================ my ($self, %args) = @_; # Initialize attributes $self->{wn} = WordNet::QueryData->new; $self->{tagger} = Lingua::EN::Tagger->new; $self->{verbose} = FALSE; # Set the args that came from the constructor foreach my $arg (sort keys %args) { die "Unknown argument: $arg!" unless exists $self->{$arg}; $self->{$arg} = $args{$arg}; } } #============================================================ sub semtags { #============================================================ my ($self, $string) = @_; my @semtags = (); foreach my $lunit ($self->sentence($string)->lunits) { my $semtag = ($lunit->isas)[SEMTAG_ISA_INDEX]; $semtag =~ s/#\w#\d+$//; push @semtags, uc $semtag; } return @semtags; } #============================================================ sub sentence { #============================================================ my ($self, $string) = @_; my $sentence = Lingua::EN::Semtags::Sentence->new(string => $string); $self->_detect_words($sentence); $self->_detect_phrases($sentence); $self->_set_lunits($sentence); return $sentence; } # Detects the POS of every token in the string. Populates $sentence->word_tokens. # Only tokens of nouns, verbs, adjectives, adverbs go into $sentence->word_tokens. #============================================================ sub _detect_words { #============================================================ my ($self, $sentence) = @_; $sentence->string(&_clean_for_words($sentence->string)); my $tagged_string = $self->tagger->get_readable($sentence->string); foreach my $token_pos (split /\s/, $tagged_string) { my ($token, $pos) = split /\//, $token_pos; # Nouns, verbs, adjectives, adverbs $sentence->word_tokens->{$token} = $pos if ($pos =~ /^(NN|VB|JJ|RB)/); } # print '_detect_words: ', Dumper($sentence->word_tokens) if $self->verbose; } # Detects WordNet phrases. Updates $sentence->string: glues phrase tokens # together with underscores. Populates $sentence->phrase_tokens. #============================================================ sub _detect_phrases { #============================================================ my ($self, $sentence) = @_; $sentence->string(&_clean_for_phrases($sentence->string)); # Move a frame across the sentence and test the contents for a sense my @tokens = split /\s/, $sentence->string; for (my $i = 0; $i < @tokens; $i++) { my $phrase_string = $tokens[$i]; my $frame = min($i + 1 + PHRASE_FRAME_SIZE, scalar @tokens); for (my $j = $i + 1; $j < $frame; $j++) { $phrase_string .= ' '.$tokens[$j]; if ($self->wn->validForms($phrase_string)) { print "_detect_phrases: [$phrase_string]\n" if $self->verbose; my @phrase_tokens = split /\s/, $phrase_string; my $phrase_string_padded = join '_', @phrase_tokens; (my $string = $sentence->string) =~ s/$phrase_string/$phrase_string_padded/g; $sentence->string($string); $sentence->phrase_tokens->{$phrase_string_padded} = TRUE; $i += $#phrase_tokens; # Avoid frame overlaps last; # Stop growing the frame if a phrase is detected } } } # print '_detect_phrases: ', Dumper($sentence->phrase_tokens) if $self->verbose; } #============================================================ sub _set_lunits { #============================================================ my ($self, $sentence) = @_; my %word_tokens = %{$sentence->word_tokens}; my %phrase_tokens = %{$sentence->phrase_tokens}; my %seen_tokens = (); foreach my $token (split /\s/, $sentence->string) { if ((exists $word_tokens{$token} or exists $phrase_tokens{$token}) and !exists $seen_tokens{$token}) { $seen_tokens{$token} = TRUE; my $pos = exists $word_tokens{$token} ? $word_tokens{$token} : undef; my $is_word = exists $word_tokens{$token} ? TRUE : FALSE; my $is_phrase = exists $phrase_tokens{$token} ? TRUE : FALSE; my $lunit = Lingua::EN::Semtags::LangUnit->new( pos => $pos, token => $token, is_word => $is_word, is_phrase => $is_phrase ); $self->_set_isas($lunit) if $self->_set_sense($lunit); $sentence->add_lunit($lunit) if &_is_meaningful($lunit); } } } #============================================================ sub _set_sense { #============================================================ my ($self, $lunit) = @_; my $token = $lunit->token; my $sense = undef; if ($sense = $self->_sense($lunit)) { print "_set_sense: [$token] is [$sense]\n" if $self->verbose; $lunit->sense($sense); return TRUE; } else { print "_set_sense: [$token] has no sense!\n" if $self->verbose; return FALSE; } } #============================================================ sub _sense { #============================================================ my ($self, $lunit) = @_; my $token = $lunit->token; my $poswn = $lunit->pos ? $lunit->poswn : undef; my $sense = undef; # Query for the token without a POS my @senses = $self->wn->validForms($token); if (@senses == 1) { $sense = $senses[0]; } elsif (@senses > 1) { # Requires disambiguation if (defined $poswn) { # Query for the token with a POS my @senses_pos = $self->wn->validForms("$token#$poswn"); if (@senses_pos == 1) { $sense = $senses_pos[0]; } elsif (@senses_pos > 1) { $sense = $self->_disambiguate_senses(@senses_pos); } else { $sense = $self->_disambiguate_senses(@senses); } } else { $sense = $self->_disambiguate_senses(@senses); } } return $sense; } #============================================================ sub _disambiguate_senses { #============================================================ my ($self, @senses) = @_; my %freqs2senses = (); foreach my $sense (@senses) { my $freq = $self->wn->frequency("$sense#1"); $freqs2senses{$freq} = $sense; } # We are interested in the most frequently used sense my $max_freq = max keys %freqs2senses; my $sense = $freqs2senses{$max_freq}; print "_disambiguate_senses: [@senses]->[$sense]\n" if $self->verbose; return $sense; } #============================================================ sub _set_isas { #============================================================ my ($self, $lunit) = @_; my $isa = $lunit->sense; while ($isa = ($self->wn->querySense($isa, ISAS))[0]) { $lunit->add_isa($isa); } my @isas = $lunit->isas; print "_set_isas: [@isas]\n" if $self->verbose; } #============================================================ sub wn { $_[0]->{wn}; } sub tagger { $_[0]->{tagger}; } sub verbose { defined $_[1] ? $_[0]->{verbose} = $_[1] : $_[0]->{verbose}; } #============================================================ #============================================================ sub _clean_for_words { #============================================================ for (my $string = $_[0]) { s/\// /g; s/\s+/ /g; # Collapse multiple spaces into one return $string; } } #============================================================ sub _clean_for_phrases { #============================================================ for (my $string = $_[0]) { s/\W/ /g; # Remove non-word chars s/\b\w\b//g; # Remove single chars s/\s+/ /g; # Collapse multiple spaces into one return $string; } } #============================================================ sub _is_meaningful { #============================================================ return $_[0]->isas > MIN_ISAS ? TRUE : FALSE; } TRUE; __END__