/usr/local/CPAN/Combine/Combine/DataBase.pm


# Copyright (c) 2004, 2005 Anders Ardö
## $Id: DataBase.pm 286 2008-10-13 09:13:26Z it-aar $

# 
# See the file LICENCE included in the distribution.

package Combine::DataBase;

use strict;
use Combine::MySQLhdb;
use Digest::MD5;
use Encode qw(encode_utf8);

sub new { 
    my ($class, $xwi, $sv, $loghandle) = @_;
    $xwi = new Combine::XWI unless ref $xwi;
    my $self = {};
    $self->{'xwi'} = $xwi;
    $self->{'databasehandle'} = $sv;
    $self->{'loghandle'} = $loghandle;
    bless $self, $class;
    return $self;
} 

#uses table recordurl with columns: recordid, urlid, lastchecked PRIMARY KEY (urlid), key recordid
#  recordid and urlid starts at 1 !!

sub delete {
    my ($self) = @_;
    my $xwi = $self->{'xwi'};
    return undef unless ref $xwi;
    
    my $urlid = $xwi->urlid;
    my ($recordid, $md5) = $self->{databasehandle}->selectrow_array(
              qq{SELECT recordid,md5 FROM recordurl WHERE urlid=$urlid;}); #Only one
    return if !defined($recordid);
    $self->{'loghandle'}->say("DataBase::delete $urlid, $recordid, $md5;");
#LOCK recordurl - needed?
    $self->{databasehandle}->prepare(qq{LOCK TABLES recordurl WRITE;})->execute();
    #delete URL from recordurl
    $self->{databasehandle}->prepare(qq{DELETE FROM recordurl WHERE urlid=?;})->execute($urlid);
    my ($ant) = $self->{databasehandle}->selectrow_array(
              qq{SELECT recordid FROM recordurl WHERE recordid=$recordid LIMIT 1;});
#UNLOCK recordurl - needed?
    $self->{databasehandle}->prepare(qq{UNLOCK TABLES;})->execute();
    if ( !defined($ant) || ($ant == 0) ) { Combine::MySQLhdb::DeleteKey($recordid,$md5); } #Should handle del's of non-existing recs

    $xwi->nofollow("true"); #?? check??
}

sub insert {
    my ($self) = @_;
    my $xwi = $self->{'xwi'};
    return undef unless ref $xwi;

    my $urlid = $xwi->urlid;
##    my $md5 = $xwi->md5;

    my $md5D = new Digest::MD5;
    $md5D->reset;
    if ( length($xwi->text) > 0 ) {
	my $text = ${$xwi->text};
	$text =~ s/[\s\n\r]+//g;
	$md5D->add(encode_utf8($text)); #use only visible text without whitespace
    } else {
	$md5D->add($xwi->url);
	$md5D->add($xwi->type());
    }
    $_ = $md5D->hexdigest;
    tr/a-z/A-Z/;
    $xwi->md5($_);
    my $md5 = $_;

    $self->{'loghandle'}->say("DataBase::insert $urlid, $md5;");

    #actions according the following truth table based presence in recordurl
    #urlid: there is a document in the database for this url
    #recordid: there is as documenent in the database with the same MD5 as the new page
#
    #              recordid       |        ! recordid
    #  urlid   if same md5        |    delete(urlid_recordid);
    #           update(lastcheck) |    update(urlid); insertRec
    #          else delete(urlid_recordid);|
    #           add(urlid)        |
    # -----------------------------------------------------------------------------
    # ! urlid  add(urlid)         |   add(urlid); insertRec

    my $existurlid = 0;
    my $existrecordid = 0;
    my $oldmd5='';
#LOCK recordurl
    $self->{databasehandle}->prepare(qq{LOCK TABLES recordurl WRITE;})->execute();
    ($existurlid,$oldmd5) = $self->{databasehandle}->selectrow_array(
			   qq{SELECT urlid,md5 FROM recordurl WHERE urlid=$urlid;});
    ($existrecordid) = $self->{databasehandle}->selectrow_array(
                           qq{SELECT recordid FROM recordurl WHERE md5='$md5';});

    if (!defined($existrecordid)) { $existrecordid = 0; }
    if (!defined($existurlid)) { $existurlid = 0; }
    if (!defined($oldmd5)) { $oldmd5 = ''; }
#Log not locked    $self->{'loghandle'}->say("DataBase:: $urlid, $md5; $existrecordid; $existurlid; $oldmd5;");

#CASE 1: There are documents for both the URL and the MD5 and they have the same md5
    if ( ($existrecordid && $existurlid) && ($md5 eq $oldmd5) ) {
        # updateLastCheck
	$self->{databasehandle}->prepare(
               qq{UPDATE recordurl SET lastchecked=NOW() WHERE urlid=?;})->execute($urlid);
	$self->{databasehandle}->prepare(qq{UNLOCK TABLES;})->execute(); #UNLOCK recordurl
	$self->{'loghandle'}->say("DataBase:: case 1: $existrecordid; $existurlid; $oldmd5;");

#CASE 2
    } elsif ( $existrecordid && $existurlid ) {
    #eg    } elsif ( ($existrecordid && $existurlid) && ($md5 ne $oldmd5) ) {
    #There are documents for both the URL and the MD5 and they have different md5
        # deleteOld
	my $oldrecordid = 0;
	($oldrecordid) = $self->{databasehandle}->selectrow_array(
                          qq{SELECT recordid FROM recordurl WHERE urlid=$urlid;});
	#delete URL from recordurl
	$self->{databasehandle}->prepare(qq{DELETE FROM recordurl WHERE urlid=?;})->execute($urlid);
	$self->{databasehandle}->prepare(
              qq{INSERT INTO recordurl SET urlid=?, recordid=?, md5=?, lastchecked=NOW();})->execute($urlid, $existrecordid, $md5);
	$self->{databasehandle}->prepare(qq{UNLOCK TABLES;})->execute(); #UNLOCK recordurl
	my ($ant) = $self->{databasehandle}->selectrow_array(
	          qq{SELECT recordid FROM recordurl WHERE recordid=$oldrecordid LIMIT 1;}); #Outside LOCK?
	if ( ! defined($ant) ) { $ant = 0; }
	$self->{'loghandle'}->say("DataBase::DelURL case 2: $oldrecordid; $ant;; $existrecordid; $existurlid; $oldmd5;");
	if ( $ant == 0 ) { Combine::MySQLhdb::DeleteKey($oldrecordid, $oldmd5); }

#CASE 3
    } elsif ( $existrecordid && ! $existurlid ) {
        # addUrlId
	$self->{databasehandle}->prepare(
              qq{INSERT INTO recordurl SET urlid=?, recordid=?, md5=?, lastchecked=NOW();})->execute($urlid, $existrecordid, $md5);
	$self->{databasehandle}->prepare(qq{UNLOCK TABLES;})->execute(); #UNLOCK recordurl
	$self->{'loghandle'}->say("DataBase:: case 3: $existrecordid; $existurlid; $oldmd5;");

#CASE 4
    } elsif ( ! $existrecordid && $existurlid ) {
        # deleteOld
	my $oldrecordid = 0;
	($oldrecordid) = $self->{databasehandle}->selectrow_array(
                          qq{SELECT recordid FROM recordurl WHERE urlid=$urlid;});

#	delete($self, $urlid, $oldrecordid); #Problem med LOCK!!! -> ny subrutin
	#delete URL from recordurl
	$self->{databasehandle}->prepare(qq{DELETE FROM recordurl WHERE urlid=?;})->execute($urlid);
        #ASSIGN NEW RECORDID done with auto_increment in SQL
	$self->{databasehandle}->prepare(
             qq{INSERT INTO recordurl SET urlid=?, md5=?, lastchecked=NOW();})->execute($urlid,$md5);
	$self->{databasehandle}->prepare(qq{UNLOCK TABLES;})->execute(); #UNLOCK recordurl
	my ($ant) = $self->{databasehandle}->selectrow_array(
              qq{SELECT recordid FROM recordurl WHERE recordid=$oldrecordid LIMIT 1;});
	if ( ! defined($ant) ) { $ant = 0; }
	$self->{'loghandle'}->say("DataBase::DelURL $oldrecordid; $ant;");
	if ( $ant == 0 ) { Combine::MySQLhdb::DeleteKey($oldrecordid, $oldmd5); }
	my ($recordid) = $self->{databasehandle}->selectrow_array(
              qq{SELECT recordid FROM recordurl WHERE urlid=$urlid;});
	$xwi->recordid($recordid);
	$self->{'loghandle'}->say("DataBase::Write $recordid case 4: $existrecordid; $existurlid; $oldmd5;");
	Combine::MySQLhdb::Write($xwi);

#CASE 5
    } elsif ( ! $existrecordid && ! $existurlid ) {
        #ASSIGN NEW RECORDID done with auto_increment in SQL
	$self->{databasehandle}->prepare(
             qq{INSERT INTO recordurl SET urlid=?, md5=?, lastchecked=NOW();})->execute($urlid,$md5);
	$self->{databasehandle}->prepare(qq{UNLOCK TABLES;})->execute(); #UNLOCK recordurl
	my ($recordid) = $self->{databasehandle}->selectrow_array(
              qq{SELECT recordid FROM recordurl WHERE urlid=$urlid;});
	$xwi->recordid($recordid);
	$self->{'loghandle'}->say("DataBase::Write $recordid case 5: $existrecordid; $existurlid; $oldmd5;");
	Combine::MySQLhdb::Write($xwi);
    }

#Should not happen
    else {
	$self->{'loghandle'}->say("DataBase::ERR $existrecordid; $existurlid; $oldmd5;");
	print "ERR DataBase impossible case\n";
    }

    $xwi->nofollow("false"); # was set to true by delete...???
    # my ($follow,$add,$replaced) = &COMB::Policy::url_accept($url,@urls);??? 
}

sub newLinks {
    my ($self) = @_;
    my $xwi = $self->{'xwi'};
    return undef unless ref $xwi;
    my $recordid = $xwi->recordid; #SANITY CHECK?
    $self->{databasehandle}->prepare(
       qq{INSERT IGNORE INTO newlinks SELECT urlid,netlocid FROM links WHERE recordid=?;})->execute($recordid);
}

sub newRedirect {
    my ($self) = @_;
    my $xwi = $self->{'xwi'};
    return undef unless ref $xwi;
    use Combine::selurl;
    my ($u, $netlocid, $urlid, $urlstr);
#    my $tl=$xwi->location; my $tb=$xwi->base; print "NL: $tl, $tb\n";
    if ( $u =  Combine::selurl->new_abs($xwi->location, $xwi->base) ) {
	$urlstr = $u->normalise();
#	print "NL: $urlstr\n";
	my $lsth = $self->{databasehandle}->prepare(qq{SELECT netlocid,urlid FROM urls WHERE urlstr=?;});
	$lsth->execute($urlstr);
	($netlocid,$urlid) = $lsth->fetchrow_array;
	if ( !defined($urlid) ) {
	    my $netlocstr = $u->authority;
	    my $path_query = $u->path_query;
#	    print "NL: $netlocstr, $path_query\n";
	    $self->{databasehandle}->prepare(qq{INSERT IGNORE INTO netlocs SET netlocstr=?;})->execute($netlocstr);
	    ($netlocid) =  $self->{databasehandle}->selectrow_array(qq{SELECT netlocid FROM netlocs WHERE netlocstr='$netlocstr';});
	    $self->{databasehandle}->prepare(qq{INSERT IGNORE INTO urls SET urlstr=?, netlocid=?, path=?;})->execute($urlstr,$netlocid,$path_query);
	    $lsth->execute($urlstr);
	    ($netlocid,$urlid) = $lsth->fetchrow_array;
	}
#	print "NL INS: $urlid,$netlocid\n";
#test if undefined
	$self->{databasehandle}->prepare(
	     qq{INSERT IGNORE INTO newlinks SET urlid=?, netlocid=?;})->execute($urlid,$netlocid);
    }
}

1;