/usr/local/CPAN/Combine/Combine/MySQLhdb.pm


# MySql replacement for hdb
# AA0 2002-09-30
#    Modified Open to return DBI connection and HDB table name

package Combine::MySQLhdb;

use strict;
use Combine::XWI;
use HTTP::Date;
use Encode;

my $sv; # holds the mysql connection
my $table = ''; # holds the hdb table name
my $savehtml;
my $doOAI;

sub Open { #needed??
    use Combine::Config;
    $sv = Combine::Config::Get('MySQLhandle');
    $savehtml = Combine::Config::Get('saveHTML');
    $doOAI = Combine::Config::Get('doOAI');
    my $hdbd = 'hdb';
    return ($sv,$hdbd);
}

sub Close {
#    print "MySQLhdb::Close\n";
    $sv->disconnect ;
}

sub DESTROY {
    print STDERR "MySQLhdb::DESTROY\n";
    $sv->disconnect ;
}

sub Write {
    my ($xwi) = @_;
    return undef unless $xwi;
    if (!defined($sv)) { Open(); } #Init $sv CHANGE?
    my $md5 = $xwi->md5; 
    my $recordid = $xwi->recordid;  #Set by DataBase.pm
#OAI
     if ($doOAI) {
       $sv->prepare("REPLACE INTO oai SET status='created', recordid=?, md5=?")->execute($recordid, $md5);
     }
#OAI
#    $xwi->url_rewind; MORE THAN one URL??
#    my $url = $xwi->url_get;
    my $urlid = $xwi->urlid;
    my $my_netlocid = $xwi->netlocid;
    my $type = $xwi->type;
    my $title = $xwi->title;
#checkedDate is inserted/updated in DataBase.pm and harvpars.pl
    my $modifiedDate = $xwi->modifiedDate;
    if ( ! $modifiedDate) { $modifiedDate = $xwi->checkedDate; }
    my $expiryDate = $xwi->expiryDate;
#    if ($expiryDate) { $expiryDate = str2time($expiryDate) ; }
#    else { $expiryDate = 'NULL'; }
    my $length = $xwi->length;
    my $server = $xwi->server;
    my $etag = $xwi->etag;
    my $nheadings = $xwi->heading_count;
    my $headings='';
# headings
    $xwi->heading_rewind;
    while (1) {
        my $this = $xwi->heading_get or last; 
	$headings .= $this . '; ';
    }
    my $nlinks = $xwi->link_count;
    my $this = $xwi->text;
    my $ip;
    if ($this) {
      $this = $$this;
      if ($xwi->truncated()) {

        # IMPORTANT! This document was truncated. Therefore:
        #
        # 1) Discard it if no space characters in it, because then it
        #    could be binary.
        #
        # 2) If a space is found, then truncate after the last space,
        #    so as to avoid erroneous indexing (since the truncation
        #    most likely cut a word).

        my $last_blank = rindex($this,' ');
        if ($last_blank > 0) {
          $ip = substr($this, 0, $last_blank) ;
        }
      }
      else {
        $ip = $this ;
      }
    } else { my $t=''; $xwi->text(\$t); } #make sure xwi->text is defined
#??    if (length($ip)>250000) {$ip = substr($ip, 0, 250000);}

    $sv->prepare("REPLACE INTO hdb VALUES (?, ?, ?, FROM_UNIXTIME( ? ), FROM_UNIXTIME( ? ), ?, ?, ?, ?, ?, ?, COMPRESS(?))")->execute(
      $recordid, $type, Encode::encode('utf8',$title), $modifiedDate, $expiryDate, $length, $server, $etag, $nheadings, $nlinks, Encode::encode('utf8',$headings), Encode::encode('utf8',$ip));

    if ( $savehtml == 1 ) {
	my $html = $xwi->content;
	$sv->prepare("REPLACE INTO html SET html=COMPRESS(?), recordid=?")->execute(Encode::encode('utf8',$$html),$recordid);
    }

    my $res;

#save links
    my ( $urlstr, $anchor, $ltype);
    $xwi->link_rewind;
    my $link_count = 1;
    my $netlocid;
    $res = $sv->do(qq{DELETE FROM links WHERE recordid='$recordid';}); #needed?
    while(1) { #links
        ($urlstr, $netlocid, $urlid, $anchor, $ltype) = $xwi->link_get;
        if (defined($urlstr)) {
#Convert urlstr to urlid,netlocid if needed
	    if ( ($netlocid <= 0) || ($urlid <= 0) ) {
		if ( $urlstr eq '') { print STDERR "ERR MySQLhdb, save links, no info\n"; }  ## sanity check -> log error 
		use Combine::selurl;
		my $u;
		if ( $u = new Combine::selurl($urlstr) ) {
		    $urlstr = $u->normalise();
		    my $netlocstr = $u->authority;
		    my $path_query = $u->path_query;
		    my $lsth = $sv->prepare(qq{SELECT netlocid,urlid FROM urls WHERE urlstr=?;});
		    $lsth->execute($urlstr);
		    ($netlocid,$urlid) = $lsth->fetchrow_array;
		    if ( !defined($urlid) ) {
			$sv->prepare(qq{INSERT IGNORE INTO netlocs SET netlocstr=?;})->execute($netlocstr);
#			($netlocid) =  $sv->selectrow_array(qq{SELECT netlocid FROM netlocs WHERE netlocstr='$netlocstr';});
			my $nlsth =  $sv->prepare(qq{SELECT netlocid FROM netlocs WHERE netlocstr=?;});
			$nlsth->execute($netlocstr);
			($netlocid) =  $nlsth->fetchrow_array();
			$sv->prepare(qq{INSERT IGNORE INTO urls SET urlstr=?, netlocid=?, path=?;})->execute($urlstr,$netlocid,$path_query);
			$lsth->execute($urlstr);
			($netlocid,$urlid) = $lsth->fetchrow_array;
		    }
		    $sv->prepare("INSERT INTO links (recordid,mynetlocid,urlid,netlocid,anchor,linktype) VALUES (?, ?, ?, ?, ?, ?)")->execute($recordid,$my_netlocid,$urlid,$netlocid,Encode::encode('utf8',$anchor),$ltype);
		}
	    } else {
		$sv->prepare("INSERT INTO links (recordid,mynetlocid,urlid,netlocid,anchor,linktype) VALUES (?, ?, ?, ?, ?, ?)")->execute($recordid,$my_netlocid,$urlid,$netlocid,Encode::encode('utf8',$anchor),$ltype);
	    }
        } else { last; }
        last if ($link_count++ >= 500);  # limit on number of links
    }

#save metadata
     $xwi->meta_rewind;
    $res = $sv->do(qq{DELETE FROM meta WHERE recordid='$recordid';}); #needed?
     my ($name,$content);
     while (1) {
        ($name,$content) = $xwi->meta_get;
        last unless $name;
        $sv->prepare("INSERT INTO meta VALUES (?, ?, ?)")->execute($recordid, Encode::encode('utf8',$name), Encode::encode('utf8',$content));
     } 

#OLD
#save URLs
#    $xwi->url_rewind;
#    $res = $sv->do(qq{DELETE FROM urls WHERE recordid='$recordid';});
#    while (1) {
#        $this = $xwi->url_get or last;
##        $res = $sv->do(qq{INSERT INTO urls VALUES ('$recordid','$this');});
#	my $machine = $this;
#	$machine =~ s|http://([^:/]+)[:/]?.*|$1|;
#        $sv->prepare("INSERT INTO urls VALUES (?, ?, ?)")->execute($recordid, $this, $machine);
#    }

#save robot data in analys table (uses that URL is stored)
    $xwi->robot_rewind;
    $res = $sv->do(qq{DELETE FROM analys WHERE recordid='$recordid';}); #needed?
    while (1) {
        ($name,$content) = $xwi->robot_get;
        last unless $name;
        $sv->prepare("INSERT INTO analys VALUES (?, ?, ?)")->execute($recordid, $name, Encode::encode('utf8',$content));
     }
##    my $alinks = calclinks($recordid,$machine); #?
#What if link-stats are inserted double after a Get and following write?
    my $sth = $sv->prepare(qq{SELECT COUNT(DISTINCT(links.recordid)), COUNT(DISTINCT(mynetlocid)) FROM links,recordurl WHERE recordurl.recordid= ? AND
                links.urlid = recordurl.urlid AND mynetlocid<>links.netlocid;});
    $sth->execute($recordid);
    my ($inlinks,$hostinlinks)=$sth->fetchrow_array;
    $sv->prepare("INSERT INTO analys VALUES (?, ?, ?)")->execute($recordid, 'inlinks', $inlinks);
    $sv->prepare("INSERT INTO analys VALUES (?, ?, ?)")->execute($recordid, 'hostinlinks', $hostinlinks);
    $sth = $sv->prepare(qq{SELECT count(distinct(netlocid)) FROM links WHERE recordid=?;});
    $sth->execute($recordid);
    my ($outlinks)=$sth->fetchrow_array;
    $sv->prepare("INSERT INTO analys VALUES (?, ?, ?)")->execute($recordid, 'outlinks', $outlinks);

#save topic, ie result of autoclassification
    $xwi->topic_rewind;
    $res = $sv->do(qq{DELETE FROM topic WHERE recordid='$recordid';}); #needed?
    my ($cls,$absscore, $relscore, $terms, $alg);
    while (1) {
        ($cls,$absscore, $relscore,$terms, $alg) = $xwi->topic_get;
        last unless $cls;
        $sv->prepare("INSERT INTO topic VALUES (?, ?, ?, ?, ?, ?)")->execute($recordid, Encode::encode('utf8',$cls), $absscore, $relscore, Encode::encode('utf8',$terms), $alg);
     }
    if (my $zh = Combine::Config::Get('ZebraHost')) {
      require Combine::Zebra;
      Combine::Zebra::update($zh,$xwi);
    }
    if (Combine::Config::Get('MySQLfulltext')) {
      $sv->prepare("REPLACE INTO search VALUES (?, ?)")->execute($recordid, Encode::encode('utf8',$title .' '. $ip));
    }
    if (my $sh = Combine::Config::Get('SolrHost')) {
      require Combine::Solr;
      Combine::Solr::update($sh,$xwi);
    }
}

sub Delete { #Used??
    my ($xwi) = @_;
    return undef unless $xwi;

    my $recordid = $xwi->recordid; 
#print "MySQLhdb::DeleteMD5 $recordid\n";
    DeleteKey($recordid, $xwi->md5);
}

sub DeleteKey {
    my ($key, $md5) = @_;
    if (!defined($sv)) { Open(); } #Init $sv CHANGE?
#OAI
     if ($doOAI) {
#     $sv->prepare("REPLACE INTO oai SET status='deleted', recordid=?, md5=?")->execute($key,$md5);
##FEL recurdurl updaterad i Database.pm FIX!
       $sv->prepare("REPLACE INTO oai SELECT recordid,md5,NOW(),'deleted' FROM recordurl WHERE recordid=?")->execute($key);
     }
#OAI

#Zebra
    if (my $zh = Combine::Config::Get('ZebraHost')) {
      require Combine::Zebra;
#Not needed: if ($md5 eq '') { ($md5)=$sv->selectrow_array('SELECT md5 FROM recordurl WHERE recordid=$key'); }
      Combine::Zebra::delete($zh, $md5, $key);
    }
    if (my $sh = Combine::Config::Get('SolrHost')) {
      require Combine::Solr;
      Combine::Solr::delete($sh, $md5, $key);
    }

#print "MySQLhdb::DeleteKey $key\n";
    my $res = $sv->do(qq{DELETE FROM hdb WHERE recordid=$key;});
    $res = $sv->do(qq{DELETE FROM html WHERE recordid=$key;});
    $res = $sv->do(qq{DELETE FROM search WHERE recordid=$key;});
    $res = $sv->do(qq{DELETE FROM meta WHERE recordid=$key;});
    $res = $sv->do(qq{DELETE FROM analys WHERE recordid=$key});
    $res = $sv->do(qq{DELETE FROM links WHERE recordid=$key;});
    $res = $sv->do(qq{DELETE FROM topic WHERE recordid=$key;});
    $res = $sv->do(qq{DELETE FROM recordurl WHERE recordid=$key;});
}

sub Get {
    my ($key) = @_;
    #should return an initalized xwi-object
    if (!defined($sv)) { Open(); } #Init $sv CHANGE?

    my ($type, $title, $modifiedDate, $expiryDate, $length, $server, $etag, $nheadings, $nlinks, $headings, $ip) =
      $sv->selectrow_array(qq{SELECT type,title,
      UNIX_TIMESTAMP(mdate),IF(expiredate,UNIX_TIMESTAMP(expiredate),0),
      length,server,etag,nheadings,nlinks,headings,UNCOMPRESS(ip)
      FROM hdb WHERE recordid='$key';});

    my $xwi = new Combine::XWI ;
    $xwi->recordid($key);
#url Relies on that all urls are in table urls
    $xwi->type($type);
    $xwi->title(Encode::decode('utf8',$title));
    $xwi->modifiedDate($modifiedDate);
    if ($expiryDate>0) {$xwi->expiryDate($expiryDate)};
    $xwi->length($length);
    $xwi->server($server);
    $xwi->etag($etag);
    $xwi->nheadings($nheadings);
    $xwi->nlinks($nlinks);
    $headings =~ s/; $//;
    $xwi->heading_add(Encode::decode('utf8',$headings)) ;
    my $ip1=Encode::decode('utf8',$ip);
    $xwi->text(\$ip1);
    my ($html1) = $sv->selectrow_array(qq{SELECT UNCOMPRESS(html) FROM html WHERE recordid='$key';});
    my $html = Encode::decode('utf8',$html1);
    $xwi->content(\$html);

    my ($urlpath) = $sv->selectrow_array(qq{SELECT path FROM urls,recordurl WHERE recordid='$key' AND recordurl.urlid=urls.urlid;});
    $xwi->urlpath($urlpath);

    my ($url,$anchor,$lty,$name,$value,$heading);
#links
    my $sth = $sv->prepare(qq{SELECT urlid,netlocid,anchor,linktype from links WHERE recordid='$key';});
    $sth->execute;
    my ($urlid,$netlocid,$checkedDate,$md5,$fingerprint,$cls,$absscore,$relscore,$terms,$alg);
   while (($urlid,$netlocid,$anchor,$lty)=$sth->fetchrow_array) {
	$xwi->link_add('', $netlocid, $urlid, Encode::decode('utf8',$anchor), $lty) ; #no URLstr add?
    }

#meta
    $sth = $sv->prepare(qq{SELECT name,value from meta WHERE recordid='$key';});
    $sth->execute;
    while (($name,$value)=$sth->fetchrow_array) {
	$xwi->meta_add(Encode::decode('utf8',$name),Encode::decode('utf8',$value)) ;
    }

# analys -> robot
    $sth = $sv->prepare(qq{SELECT name,value FROM analys WHERE recordid='$key';});
    $sth->execute;
    while (($name,$value)=$sth->fetchrow_array) {
	$xwi->robot_add($name,Encode::decode('utf8',$value)) ;
    }

# topic
    $sth = $sv->prepare(qq{SELECT notation,abscore,relscore,terms,algorithm FROM topic WHERE recordid='$key';});
    $sth->execute;
    while (($cls,$absscore,$relscore,$terms,$alg)=$sth->fetchrow_array) {
	$xwi->topic_add(Encode::decode('utf8',$cls),$absscore,$relscore,Encode::decode('utf8',$terms),$alg) ;
    }

#recordurl
    $sth = $sv->prepare(qq{SELECT urlid,UNIX_TIMESTAMP(lastchecked),md5,fingerprint FROM recordurl WHERE recordid='$key';});
    $sth->execute;
    while (($urlid,$checkedDate,$md5,$fingerprint)=$sth->fetchrow_array) {
	$xwi->urlid($urlid);
	$xwi->checkedDate($checkedDate);
	$xwi->md5($md5);
        $xwi->fingerprint($fingerprint);
    }

    return $xwi;
}

1;