/usr/local/CPAN/FreeHAL/AI/FreeHAL/Module/WebAccess.pm
#!/usr/bin/env perl
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# (at your option) any later version.
#
package AI::FreeHAL::Module::WebAccess;
use AI::FreeHAL::Config;
our $data = {};
our %config;
use AI::Util;
use AI::POS;
use Data::Dumper;
our @functions = qw{
download_pos
download_synonyms
download_genus
};
use LWP::UserAgent;
use HTTP::Request;
use LWP::Protocol;
use LWP::Protocol::http;
our $ua = LWP::UserAgent->new( timeout => 5 );
$ua->agent(
"Mozilla/5.0 (X11; U; Linux i686; de; rv:1.8.1.10) Gecko/20071213 Firefox/2.0.0.12"
);
sub download_pos {
my ( $CLIENT_ref, $word, $at_beginning ) = @_;
local $| = 1;
say;
say "download_pos: ( $CLIENT_ref, $word, $at_beginning )";
return if $config{'modes'}{'offline_mode'};
return if LANGUAGE() eq 'en';
return if $data->{modes}{batch};
return if !$AI::SemanticNetwork::initialized;
print '.';
my $url =
'http://wortschatz.uni-leipzig.de/cgi-portal/de/wort_www?site=10&Wort='
. $word;
# Create a request
# my $req = HTTP::Request->new( GET => $url );
# Pass request to the user agent and get a response back
# my $res = $ua->request($req);
$ua->timeout(5);
my $res = $ua->get($url);
# Check the outcome of the response
if ( !$res->is_success ) {
# print $res->content;
say 'Error while Downloading:';
say $url;
say $res->status_line;
return;
}
open my $d, ">", "downloaded.html";
print $d $res->content;
close $d;
my @lines = split /\n/, $res->content;
my @not_correct_but_conjugated_last;
my @not_correct_but_conjugated;
my $found_right_spelling = 0;
while ( defined( my $line = shift @lines ) ) {
print ".";
# say $line;
if ( $line =~ /Wort:/i && $line !~ /searchform/i ) {
chomp $line;
$line =~ s/.*?[<]\/B[>]//igm;
if ( ( $word eq lc $word ) == ( $line eq lc $line ) ) {
$found_right_spelling = 1;
print "($word eq lc $word) == ($line eq lc $line) \n";
# exit 0;
# select undef, undef, undef, 10;
}
else {
print "($word eq lc $word) != ($line eq lc $line) \n";
$found_right_spelling = 0;
# select undef, undef, undef, 10;
}
}
if ( $line =~ /licherweise haben Sie eine Seite zu schn/i ) {
select undef, undef, undef, 5;
return download_pos( $CLIENT_ref, $word, $at_beginning );
}
if ( $line =~ /Wortart: /i ) {
say 'wrong part, posible part of speech:';
say $line;
}
next if !$found_right_spelling;
if ( $line =~ /Stammform:/i && !@not_correct_but_conjugated ) {
$line =~ s/Stammform: //igm;
$line =~ s/[<].*?[>]//igm;
$line =~ s/^\s+//igm;
$line =~ s/\s+$//igm;
print $line . "\n";
if ( ( $word eq lc $word ) == ( $line eq lc $line ) ) {
my $wt = download_pos( $CLIENT_ref, $line, $at_beginning );
if ( $wt && $wt != $data->{const}{NO_POS} ) {
return $wt;
}
}
}
if ( $line =~ /falsche Rechtschreibung von/i ) {
shift @lines;
my $right_word = shift @lines;
$right_word =~ s/[<](.*)//igm;
my $ae = chr 228;
my $Ae = chr 196;
my $ue = chr 252;
my $Ue = chr 220;
my $oe = chr 246;
my $Oe = chr 214;
my $ss = chr 223;
$right_word =~ s/[&]auml[;]/$ae/igm;
$right_word =~ s/[&]Auml[;]/$Ae/igm;
$right_word =~ s/[&]ouml[;]/$oe/igm;
$right_word =~ s/[&]Ouml[;]/$Oe/igm;
$right_word =~ s/[&]uuml[;]/$ue/igm;
$right_word =~ s/[&]Uuml[;]/$Ue/igm;
$right_word =~ s/[&]szlig[;]/$ss/igm;
return download_pos( $CLIENT_ref, $right_word, $at_beginning );
}
if ( $line !~ /Wortart: /i ) {
next;
}
$line =~ s/[<](.+?)[>]//igm;
$line = ascii($line);
say "Line: ", $line;
$line =~ s/Wortart: //igm;
$line =~ s/^\s+//igm;
$line =~ s/\s+$//igm;
say "Line: ", $line;
push @not_correct_but_conjugated, $line;
}
foreach my $line (@not_correct_but_conjugated) {
next if $line !~ /adverb/i;
my $wt = detect_pos_from_string( $CLIENT_ref, $line, $at_beginning );
if ( $wt != $data->{const}{NO_POS} ) {
say "- Downloaded word type (5): ", $wt;
return $wt;
}
}
foreach my $line (@not_correct_but_conjugated) {
my $wt = detect_pos_from_string( $CLIENT_ref, $line, $at_beginning );
if ( $wt != $data->{const}{NO_POS} && $wt == $data->{const}{PREP} ) {
say "- Downloaded word type (4): ", $wt;
return $wt;
}
}
foreach my $line (@not_correct_but_conjugated) {
my $wt = detect_pos_from_string( $CLIENT_ref, $line, $at_beginning );
if ( $wt != $data->{const}{NO_POS} && $wt == $data->{const}{ADJ} ) {
say "- Downloaded word type (2): ", $wt;
return $wt;
}
}
foreach my $line (@not_correct_but_conjugated) {
my $wt = detect_pos_from_string( $CLIENT_ref, $line, $at_beginning );
if ( $wt != $data->{const}{NO_POS} && $wt == $data->{const}{VERB} ) {
say "- Downloaded word type (2): ", $wt;
return $wt;
}
}
foreach my $line (@not_correct_but_conjugated) {
my $wt = detect_pos_from_string( $CLIENT_ref, $line, $at_beginning );
if ( $wt != $data->{const}{NO_POS} ) {
say "- Downloaded word type (3): ", $wt;
return $wt;
}
}
foreach my $line (@not_correct_but_conjugated_last) {
my $wt = detect_pos_from_string( $CLIENT_ref, $line, $at_beginning );
if ( $wt != $data->{const}{NO_POS} ) {
say "- Downloaded word type (last): ", $wt;
return $wt;
}
}
say;
return $data->{const}{NO_POS};
}
sub download_synonyms {
my ( $word, $count, $real_word ) = @_;
$real_word ||= $word;
$count ||= 0;
read_config $data->{intern}{config_file} => my %config;
return if !$word;
chomp $word;
chomp $real_word;
return () if $count >= 5;
return () if $word =~ /^[_]/;
return if $data->{modes}{batch};
return if LANGUAGE() eq 'en';
return map { $_ => 1 }
split /[,]\s/, $data{'synonyms'}{ lc $word }
if $data{'synonyms'}{ lc $word };
chomp $data{'synonyms'}{ lc $word } if $data{'synonyms'}{ lc $word };
return ()
if $data{'synonyms'}{ lc $word }
&& $data{'synonyms'}{ lc $word } =~ /^[.]/;
return () if !$config{'modes'}{'offline_mode'};
# $word = ascii( $word );
say '-> Downloading synonyms: ', $word;
my $url = 'http://wortschatz.uni-leipzig.de/abfrage/';
# Create a request
my $req = HTTP::Request->new( POST => $url );
$req->content_type('application/x-www-form-urlencoded');
$req->content( 'Wort=' . $word . '&Submit=Suche!&site=10' );
# Pass request to the user agent and get a response back
my $res = $ua->request($req);
# Check the outcome of the response
if ( !$res->is_success ) {
# print $res->content;
say 'Error while Downloading:';
say $url;
say $res->status_line;
}
my @lines = split /\n/, $res->content;
my %synonyms;
while ( defined( my $line = shift @lines ) ) {
chomp $line;
$line =~ s/[<].*?[>]//igm;
if ( $line =~ /Stammform:/i ) {
$line =~ s/Stammform: //igm;
chomp $line;
$line = ascii( lc $line );
say '-> base form of ' . $word . ': ' . $line;
$line =~ s/(^\s+)|(\s+$)//igm;
$synonyms{$line} = 1;
}
if ( $line =~ /Flexion:/i ) {
my $flex = lc shift @lines;
$flex .= lc shift @lines;
$flex =~ s/Stammform: //igm;
$flex = ascii($flex);
chomp $flex;
my @flexion = split /[,]|([<]br[>])/, $flex;
foreach my $fl (@flexion) {
$fl =~ s/(^\s+)|(\s+$)//igm;
chomp $fl;
say '-> flexion ' . $word . ': ' . $fl;
$synonyms{$fl} = 1;
}
}
if ( $line =~ /(falsche Rechtschreibung von)|(Form\(en)/i ) {
shift @lines;
my $right_word = shift @lines;
$right_word =~ s/[<].*?[>]//igm;
$right_word =~ s/[<](.*?)//igm;
$right_word =~ s/[,.-;]//igm;
my $ae = chr 228;
my $Ae = chr 196;
my $ue = chr 252;
my $Ue = chr 220;
my $oe = chr 246;
my $Oe = chr 214;
my $ss = chr 223;
$right_word =~ s/[&]auml/$ae/igm;
$right_word =~ s/[&]Auml/$Ae/igm;
$right_word =~ s/[&]ouml/$oe/igm;
$right_word =~ s/[&]Ouml/$Oe/igm;
$right_word =~ s/[&]uuml/$ue/igm;
$right_word =~ s/[&]Uuml/$Ue/igm;
$right_word =~ s/[&]szlig/$ss/igm;
if ( lc $right_word eq lc $word && lc $real_word ) {
$data{'synonyms'}{ lc $real_word } = '.'
if !$data{'synonyms'}{ lc $real_word };
foreach my $item ( values %{ $data{'synonyms'} } ) {
$item = '.' if !$item;
}
delete $config{''};
delete $data{'synonyms'}{''};
foreach my $value ( values %{ $data{'synonyms'} } ) {
$value = '' if !$value;
}
#write_config %config, $data->{intern}{config_file};
return ();
}
return download_synonyms( $right_word, $count + 1, $real_word );
}
}
# say join ', ', keys %synonyms;
say;
foreach my $item ( values %{ $data{'synonyms'} } ) {
$item = '.' if !$item;
}
$data{'synonyms'}{ lc $real_word } = join ', ', keys %synonyms;
$data{'synonyms'}{ lc $real_word } =~ s/(^\s+)|(\s+$)//igm;
$data{'synonyms'}{ lc $real_word } = '.'
if !$data{'synonyms'}{ lc $real_word };
delete $config{''};
delete $data{'synonyms'}{''};
foreach my $value ( values %{ $data{'synonyms'} } ) {
$value = '' if !$value;
}
#write_config %config, $data->{intern}{config_file};
return %synonyms;
}
sub download_genus {
my ( $word, $count, $real_word ) = @_;
$real_word ||= $word;
$count ||= 0;
chomp $word;
chomp $real_word;
return if $count >= 5;
return if LANGUAGE() eq 'en';
return if $data->{modes}{batch};
return if !$AI::SemanticNetwork::initialized;
return if $config{'modes'}{'offline_mode'};
# $word = ascii( $word );
say '-> Downloading genus: ', $word;
my $url = 'http://wortschatz.uni-leipzig.de/abfrage/';
# Create a request
my $req = HTTP::Request->new( POST => $url );
$req->content_type('application/x-www-form-urlencoded');
$req->content( 'Wort=' . $word . '&Submit=Suche!&site=10' );
# Pass request to the user agent and get a response back
my $res = $ua->request($req);
# Check the outcome of the response
if ( !$res->is_success ) {
# print $res->content;
say 'Error while Downloading:';
say $url;
say $res->status_line;
}
my $ctn = $res->content;
my @lines = split /\n/, $ctn;
while ( defined( my $line = shift @lines ) ) {
print ".";
chomp $line;
$line =~ s/[<].*?[>]//igm;
$line = ascii($line);
if ( $line =~ /Flexion:/i ) {
my $flex = lc shift @lines;
$flex = ascii($flex);
return 'm' if $flex =~ /^der/;
return 'm' if $flex =~ /^die/;
return 'm' if $flex =~ /^das/;
}
if ( $line =~ /eschlecht/i && $line =~ /nnlich/i ) {
say;
return 'm';
}
if ( $line =~ /eschlecht/i && $line =~ /weiblich/i ) {
say;
return 'f';
}
if ( $line =~ /eschlecht/i && $line =~ /chlich/i ) {
say;
return 's';
}
}
say;
return;
}