Combine::utilPlugIn - Combine::utilPlugIn documentation


Combine documentation Contained in the Combine distribution.

Index


Code Index:

NAME

Top

utilPlugIn

DESCRIPTION

Top

Utilities for: * extracting text from XWI's * SVM classification * language and country identification

AUTHOR

Top

Ignacio Garcia Dorado Anders Ardö <anders.ardo@eit.lth.se>

COPYRIGHT AND LICENSE

Top


Combine documentation Contained in the Combine distribution.

## $Id: utilPlugIn.pm 305 2009-03-30 07:24:01Z it-aar $

# See the file LICENCE included in the distribution.
# Ignacio Garcia Dorado 2008, and Anders Ardö 2008
# Utils to control the SVM, language, access to database, update...

package Combine::utilPlugIn;
use strict;

use Combine::XWI;
use Combine::MySQLhdb;
use Combine::LoadTermList;

my $svm;
my %term_set;
my $stoplist;
my $geoinfo;
my %server2country;

#######################################
#Analyze
#######################################
sub analyse {
    my ($xwi) = @_;

#Country for server
    use Geo::IP;
    use Locale::Country;
    # semi-private routines
    if ( ref($geoinfo) ne 'Geo::IP' ) { #INIT
      Locale::Country::alias_code('uk' => 'gb');
      Locale::Country::rename_country('gb' => 'Great Britain');
      Locale::Country::rename_country('tw' => 'Taiwan');
      $geoinfo = Geo::IP->new(GEOIP_STANDARD);
      my $configDir = Combine::Config::Get('configDir');
#      print "INIT server2country\n";
      if ( open(TT,"<$configDir/server2country") ) {
	  while (<TT>) { 
	      next if (/^\s*#/);
	      next if (/^\s*$/);
	      s/[\n\r\f]//g;
	      if (/^([^\s]+)\s+(.*)\s*$/) {
		  my $server=$1;
		  my $country=$2;
		  $server =~ s|^([^/]+)/.*$|$1|; #clean up
		  #do server translation
		  $server = $Combine::Config::serverbyalias{$server} || $server;
		  $server2country{$server}=$country;
	      }
	  }
	  close(TT);
      }
    }

   my $url = $xwi->url;
	#print "Doing $url\n";
   if ($url =~ m|http://([^/]+)/|) {
        my $country='';
        my $host=$1;
        $host =~ s/:\d+$//;
        $host =~ s/%..$//;
        $host =~ m/\.([a-z]+)$/;
        my $topdom = $1;

	if (defined($server2country{$host})) {
	    $country=$server2country{$host};
	    #print "server2country: $host => $country\n";
        } elsif ( $host =~ /^[\d\.]+$/ ) {
	    $country = $geoinfo->country_name_by_addr($host);
         #print "IP: $host -> $country\n";
        } elsif ( (length($topdom)==2) && !(($topdom eq 'tv') || ($topdom eq 'nu') || ($topdom eq 'to')) ) {
	    $country = code2country($topdom);
         #print "topdom: $host -> $country\n";
        } elsif ( ($topdom eq 'gov') || ($topdom eq 'edu') ) {
	    $country = 'United States';
         #print "gov,edu: $host -> $country\n";
        } else {
	    $country = $geoinfo->country_name_by_name($host);
         #print "geoip: $host -> $country\n";
        }
         #print "Got: $host -> $country\n";
	if ($country ne '') { $xwi->robot_add('country', $country); }
	else  { $xwi->robot_add('country', 'NotKnown'); }
   }

#Language of content
    my $text;
    if (defined($xwi->text)) { $text = substr(${$xwi->text},0,5000); } else { return; }
    use Lingua::Identify qw(:language_identification);
    if (length($text)<1000) {$text .= ' ' . $xwi->title;}
    my $lang = langof($text); # gives the most probable language
    if ($lang ne '') { $xwi->robot_add('language', $lang); }

##Plugin for more analysis
  my $analysePlugin = Combine::Config::Get('analysePlugin');
  if (defined($analysePlugin) && $analysePlugin ne '') {
    eval "require $analysePlugin";
    $analysePlugin->analyse($xwi);
  }
}

########################################
sub init_stoplist {
    $stoplist = new Combine::LoadTermList;
    my $configDir = Combine::Config::Get('configDir');
    $stoplist->LoadStopWordList("$configDir/stopwords.txt");
}

#################################

#########################################################
# GETtext given a XMI								    								#
#########################################################
sub getTextXWI {
  my ( $xwi, $DoStem, $stopwords ) = @_;
  
  if ( ref($stopwords) eq 'Combine::LoadTermList' ) { }
  elsif ( ref($stoplist) eq 'Combine::LoadTermList' ) { $stopwords = $stoplist; }
  else { init_stoplist(); $stopwords=$stoplist; }

  my $urlpath ="";
  my $title="No Title";
  my $meta=""; my $head=""; my $text="";

  $urlpath = $xwi->urlpath;
  $urlpath =~ s/^\///;
  $urlpath =~ s/\// /g;

  $xwi->meta_rewind;
  my ($name,$content);
  while (1) {
    ($name,$content) = $xwi->meta_get;
    last unless $name;
    next if ($name eq 'Rsummary');
    next if ($name =~ /^autoclass/);
    $meta .= $content . " ";
  } 

  $title = $xwi->title;
  
  $xwi->heading_rewind;
  my $this;
  while (1) {
    $this = $xwi->heading_get or last; 
    $head .= $this . " "; 
  }

  $this = $xwi->text;
  if ($this) {
    $this = $$this;
    $text = $this ;
  }

    if ( defined($meta)  && ($meta ne '') )  { $meta = SimpletextConv($meta,  $DoStem, $stopwords); }
    if ( defined($head)  && ($head ne '') )  { $head = SimpletextConv($head,  $DoStem, $stopwords); }
    if ( defined($text)  && ($text ne '') )  { $text = SimpletextConv($text,  $DoStem, $stopwords); }
    if ( defined($urlpath) && ($urlpath ne '') ) { $urlpath =  SimpletextConv($urlpath,   $DoStem, $stopwords); }
    if ( defined($title) && ($title ne '') ) { $title = SimpletextConv($title, $DoStem, $stopwords); }

  return ($meta, $head, $text, $urlpath, $title);
}

########################
sub SimpletextConv {
  my ($txt, $DoStem, $stopwords) = @_;

#remove unwanted characters, pure numbers, and lowercase everything
  $txt =~ tr/0-9a-zA-ZÅÄÖåäöøæØÆ\/\'-/ /c; #Keep ' - /
  $txt =~ s/\b\d+\b/ /g;
  $txt =~ tr/A-ZÅÄÖØÆ/a-zåäöøæ/;

  my @text = map { 
       if ( exists ${$stopwords->{StopWord}}{$_} ) { (); } else { $_; }
       } split(/\s+/,$txt);

#  if ( $DoStem ) { $stext = Lingua::Stem::stem(@text); }
  return join(' ', @text);
}

#########################################################
# SVM												    												#
#########################################################
#Given a SVM (prepared or not), the term list used in
#the svm, the file of the trained svm and the text to
#analyce, it returns the score
sub SVM {
  require Algorithm::SVMLight;

	my ( $SVMtrainingFile, @text ) = @_;

	if ( !defined($svm) ) {
		($svm) = CreateSVM($SVMtrainingFile);
	}
	if ( !defined(%term_set) ) {
		(%term_set) = CreateTermSet($svm);
	}
	my %set;
	foreach my $term (@text) {
		if ( ( ( exists $term_set{$term} ) && ( !( exists $set{$term} ) ) ) )
		{    #
			$set{$term} = 1;
		}
	}

	#use the svm function
	my %test;
	$test{attributes} = {%set};
	my $result = $svm->predict(%test);
	return ($result);
}

sub CreateTermSet {
	my ($svm) = @_;
#	print "Creating term list\n";
	my @terms = $svm->feature_names();
	chomp(@terms);
	my %term_set = map { $_, 1 } @terms;
	delete $term_set{""};
#	print "End creation list\n";
	return (%term_set);
}

sub CreateSVM {
	my ($SVMtrainingFile) = @_;
	#print "----------------------------------------------->Loading SVM... $SVMtrainingFile\n";
	my $svm = Algorithm::SVMLight->new();
	$svm->read_model($SVMtrainingFile);
	#print "End loading SVM\n";
	return ($svm);
}

1;


__END__