/usr/local/CPAN/go-db-perl/GO/Reasoner.pm


package GO::Reasoner;

use strict;
use Carp;
use base qw(GO::Model::Root);

my $REL_ID_COL = "id";
my $REL_TBL = "term";
my $ACC_COL = "acc";
my $CLASS_TABLE="term";
my $LINK_TABLE="graph_path";
my $ASSERTED_LINK_TABLE="term2term";
my $LINK_ID_COL="id";
my $SUBJECT_COL="term2_id";
my $TARGET_COL="term1_id";
my $REL_COL="relationship_type_id";
my %time_in_view_h = ();

our $sth_link;
our $sth_store;
our $is_a;

sub _valid_params {
    return qw(dbh skip ruleconf verbose);
}

sub run {
    my $self = shift;
    my $dbh = $self->dbh;
    my %skip = %{$self->skip || {}};
    # make sure relations are correctly indicated
    $dbh->do("UPDATE term SET is_relation=1 WHERE id IN (SELECT DISTINCT relationship_type_id FROM term2term)");

    my @is_a_nodes = 
        $dbh->selectrow_array("SELECT $REL_ID_COL FROM $REL_TBL WHERE $ACC_COL='OBO_REL:is_a'");
    if (@is_a_nodes != 1) {
        @is_a_nodes = 
            $dbh->selectrow_array("SELECT $REL_ID_COL FROM $REL_TBL WHERE $ACC_COL='is_a'");
    }
    if (@is_a_nodes != 1) {
        die "@is_a_nodes";
    }
    $is_a = shift @is_a_nodes;
    $self->logmsg("is_a node id: $is_a");

    my $isa_term_ids = 
        $dbh->selectcol_arrayref("SELECT relationship_type_id FROM relation_properties WHERE is_transitive=1");
    my $isa_id = shift @$isa_term_ids;

    if (!@{$dbh->selectcol_arrayref("SELECT relationship_type_id FROM relation_properties WHERE relationship_type_id=$isa_id")}) {
        $dbh->do("UPDATE relation_properties SET is_transitive=1, is_reflexive=1 WHERE relationship_type_id=$isa_id");
    }

    my $transitive_relation_node_ids = 
        $dbh->selectcol_arrayref("SELECT relationship_type_id FROM relation_properties WHERE is_transitive=1");

    # hard-wire is_a just to be safe
    my $reflexive_relation_node_ids = 
        $dbh->selectcol_arrayref("SELECT relationship_type_id FROM relation_properties WHERE is_reflexive=1 UNION SELECT id FROM term WHERE acc='is_a'");

    my $inheritable_relation_node_ids = 
        $dbh->selectcol_arrayref("SELECT relationship_type_id FROM relation_properties WHERE is_metadata_tag != 1 or is_metadata_tag IS NULL");

    $self->logmsg("inheritable relations: @$inheritable_relation_node_ids");
    my $superrelation_h = {};
    foreach my $sid (@$inheritable_relation_node_ids) {
        my $parent_rids =
            $dbh->selectcol_arrayref("SELECT term1_id FROM $ASSERTED_LINK_TABLE WHERE relationship_type_id=$is_a AND term2_id = $sid ");
        $superrelation_h->{$sid} = $parent_rids;
        #print STDERR "SUPERREL: $sid @$parent_rids\n";
    }

    $self->logmsg("finding transitive closure of relation hierarchy");
    my $t_superrelation_h = {};
    foreach my $sid (keys %$superrelation_h) {
        my @pids = @{$superrelation_h->{$sid}};
        my %done = ();
        while (my $pid = shift @pids) {
            next if $done{$pid};
            push(@{$t_superrelation_h->{$sid}}, $pid);
            push(@pids, @{$superrelation_h->{$pid}});
            $done{$pid} = 1;
            $self->logmsg("  super: $sid $pid");
        }
    }

    my $t_subrelation_h = {};
    foreach my $sid (keys %$superrelation_h) {
        foreach my $pid (@{$superrelation_h->{$sid}}) {
            push(@{$t_subrelation_h->{$pid}}, $sid);
        }
    }

    $self->logmsg("retrieving relation_properties");
    my $relation_property_h = 
        $dbh->selectall_hashref("SELECT * FROM relation_properties","relationship_type_id");

    my $relation_compositions = 
        $dbh->selectall_arrayref("SELECT relation1_id,relation2_id,inferred_relation_id FROM relation_composition");

    my $rc_h = {};
    foreach my $rc (@$relation_compositions) {
        my $key = join('-',@$rc);
        $rc_h->{$key} = 1;
        #print STDERR "In db <$key> $rc->[0] $rc->[1] $rc->[2]\n";
    }

    my @new_rcs = ();

# transitivity over and under is_a
    foreach my $id (@$inheritable_relation_node_ids) {
        push(@new_rcs, [$id, $is_a, $id]);
        push(@new_rcs, [$is_a, $id, $id]);
    }

# transitivity: is_a should be pre-declared transitive
    foreach my $id (@$transitive_relation_node_ids) {
        push(@new_rcs, [$id, $id, $id]);
    }

    foreach my $rc (@new_rcs, @$relation_compositions) {
        $self->logmsg("relation_composition: @$rc");
        foreach my $r0 ($rc->[0], @{$t_subrelation_h->{$rc->[0]}}) {
            #print STDERR "  subrel: $r0 < $rc->[0]\n";
            
            foreach my $r1 ($rc->[1], @{$t_subrelation_h->{$rc->[1]}}) {
                $self->logmsg("    subrel: $r1 < $rc->[1]");
                my $new_rc = [$r0, $r1, $rc->[2]];
                unless ($rc_h->{"@$new_rc"}) {
                    push(@new_rcs, $new_rc);
                }
            }
        }
    }

# complete relation composition table in db. TODO: make this configurable?
    foreach my $rc (@new_rcs) {
        my $key = join('-',@$rc);
        if ($rc_h->{$key}) {
            next;
        }
        $dbh->do("INSERT INTO relation_composition (relation1_id, relation2_id, inferred_relation_id) VALUES ($rc->[0],$rc->[1],$rc->[2])");
        $rc_h->{$key} = 1;
    }
    push(@$relation_compositions, @new_rcs);

    $self->logmsg("seeding graph_path");

# seed graph_path table
# todo: no dupes
    $dbh->do("INSERT INTO $LINK_TABLE (distance,relation_distance,term1_id,term2_id,relationship_type_id) SELECT 1,1,term1_id,term2_id,relationship_type_id FROM $ASSERTED_LINK_TABLE AS alt WHERE NOT EXISTS (SELECT * FROM $LINK_TABLE AS lt WHERE lt.term1_id=alt.term1_id AND term2_id=alt.term2_id AND lt.relationship_type_id=alt.relationship_type_id)")
        unless $skip{seed};

    my $lj=qq[
    LEFT JOIN $LINK_TABLE AS existing_link
                ON (x.$SUBJECT_COL=existing_link.$SUBJECT_COL AND
                        x.$REL_COL=existing_link.$REL_COL AND
                        y.$TARGET_COL=existing_link.$TARGET_COL)
];
    my $lj_cond = "AND existing_link.$LINK_ID_COL IS NULL";

# TODO: transitive_over and relation compositions
    my @views = ();

    unless ($skip{chain}) {

        my %done = ();
        foreach my $rc (@$relation_compositions) {
            my $r1id = $rc->[0];
            my $r2id = $rc->[1];
            my $rid = $rc->[2];
            my $relation_distance = "1";
            if ($r1id == $r2id && $r2id == $rid) {
                $relation_distance = "x.relation_distance+y.relation_distance";
            }
            elsif ($r1id == $rid) {
                $relation_distance = "x.relation_distance";
            }
            elsif ($r2id == $rid) {
                $relation_distance = "y.relation_distance";
            }
            else {
                $relation_distance = "x.relation_distance+y.relation_distance";
            }
            my $sql =
                qq[
  SELECT DISTINCT
    x.$SUBJECT_COL             AS node_id,
    $rid                  AS predicate_id,
    y.$TARGET_COL           AS object_id,
    x.distance+y.distance   AS distance,
    $relation_distance   AS relation_distance
  FROM $LINK_TABLE              AS x
    INNER JOIN $LINK_TABLE       AS y ON (x.$TARGET_COL=y.$SUBJECT_COL)
    $lj
  WHERE x.$REL_COL = $r1id
    AND y.$REL_COL = $r2id
    $lj_cond 
];
            my $view_id = "chain: @$rc";
            if ($done{$view_id}) {
                next;
            }
            $done{$view_id} = 1;
            push(@views,
                 {id=>$view_id,
                  type=>'composition',
                  rule=>"@$rc",
                  sql=>$sql});
        }
    }

    for my $rid (@$reflexive_relation_node_ids) {
        $self->logmsg("reflexive_relation: $rid");
        push(@views,
             {id=>"reflexive: $rid",
              type=>'reflexive',
              rule=>"reflexivity",
              sql=>qq[
                                    SELECT DISTINCT
                                    $CLASS_TABLE.id          AS node_id,
                                    $rid                       AS predicate_id,
                                    $CLASS_TABLE.id          AS object_id,
                                    0   AS distance,
                                    0   AS relation_distance
                                    FROM $CLASS_TABLE
                                    LEFT JOIN  $LINK_TABLE AS existing_link
                                    ON ($CLASS_TABLE.id=existing_link.$SUBJECT_COL AND
                                            $rid=existing_link.$REL_COL AND
                                            $CLASS_TABLE.id=existing_link.$TARGET_COL)
                                    WHERE $CLASS_TABLE.is_relation=0
                                    $lj_cond
                                    ],
             });
    }

    for my $sid (keys %$t_superrelation_h) {
        for my $pid (@{$t_superrelation_h->{$sid}}) {
            #print STDERR "  SUBREL: $sid < $pid\n";

            push(@views,
                 {id=>"subrelation $sid $pid",
                  type=>'subrelation',
                  rule=>"A R B, R is_a R2 => A R2 B: reflexivity",
                  sql=>qq[
                                            SELECT DISTINCT
                                            x.$SUBJECT_COL          AS node_id,
                                            $pid               AS predicate_id,
                                            x.$TARGET_COL        AS object_id,
                                            x.distance         AS distance,
                                            x.relation_distance         AS relation_distance
                                            FROM  $LINK_TABLE AS x
                                            LEFT JOIN  $LINK_TABLE AS existing_link
                                            ON (x.$SUBJECT_COL=existing_link.$SUBJECT_COL AND
                                                    $pid=existing_link.$REL_COL AND
                                                    x.$TARGET_COL=existing_link.$TARGET_COL)
                                            WHERE x.$REL_COL = $sid
                                            $lj_cond
                                            ],
                 });
            
        }
        
    }

    my %ruleconf = %{$self->{ruleconf} || {}};
    if (%ruleconf) {
        $self->logmsg("applying only: ".join(" ",keys %ruleconf));
        @views = grep {$ruleconf{$_->{type}}} @views;
    }
    $self->logmsg("total views: ".scalar(@views));


    $sth_link = $dbh->prepare_cached("SELECT $LINK_ID_COL FROM $LINK_TABLE WHERE $SUBJECT_COL=? AND $REL_COL=? AND $TARGET_COL=?");
#my $sth_store = $dbh->prepare_cached("INSERT INTO $LINK_TABLE ($SUBJECT_COL,$REL_COL,$TARGET_COL,is_inferred) VALUES (?,?,?,'t')");
    $sth_store = $dbh->prepare_cached("INSERT INTO $LINK_TABLE ($SUBJECT_COL,$REL_COL,$TARGET_COL,distance,relation_distance) VALUES (?,?,?,?,?)");

    my $i_by_node_id = $self->get_intersections();
    foreach my $node_id (keys %$i_by_node_id) {
        my $intersection_h = $i_by_node_id->{$node_id};
        my $sql = $self->intersection_to_query($node_id,$intersection_h);
        #print STDERR "$sql\n";
        # we do this at the start - unless new intersections can be added
        unshift(@views,
                {id=>"intersection_for_$node_id",
                 type=>'intersection',
                 sql=>$sql});
    }

    my $done = 0;
    my $sweep = 0;
    unless ($skip{sweep}) {
        while (!$done) {
            $self->logmsg( "Sweep: $sweep" );
            my $links_added_this_sweep = 0;
            foreach my $view (@views) {
                my $links_added = $self->cache_view($view);
                $links_added_this_sweep += $links_added;
            }
            $self->logmsg( "Sweep: $sweep total_added: $links_added_this_sweep" );
            $done = 1 unless $links_added_this_sweep;
        }
    }
    unless ($skip{equivalence}) {
        $self->assert_sameas();
    }


    foreach my $view (@views) {
        my $view_id = $view->{id};
        print STDERR "  View: $view_id. Total time: $time_in_view_h{$view_id}\n";
    }

}

# TODO: insert and select in same step; or temp table
sub cache_view {
    my $self = shift;
    my $dbh = $self->dbh;
    my $view = shift;
    my $view_links_added = 0;
    my $offset = 0;
    my $init_time = time;

    my $view_id = $view->{id};
    $self->logmsg( "  View: $view_id" );
    my $done_with_view;
    while (!$done_with_view) {

        my $sql = $view->{sql};
        #$sql.= "ORDER BY x.$LINK_ID_COL,y.$LINK_ID_COL";
        #$sql.= " LIMIT $limit OFFSET $offset";
        my $sth = 
            $dbh->prepare_cached($sql);

        #$self->logmsg( "    Executing [$offset,$limit]" );
        $self->logmsg( "    Executing $sql" );
        
        $sth->execute;
        $self->logmsg( "    EXECUTED" );
        my $links_added = 0;
        my $links_in_db = 0;
        my $n_rows = 0;
        while (my $link = $sth->fetchrow_hashref) {
            $n_rows++;
            my @triple =
                ($link->{node_id},
                 $link->{predicate_id},
                 $link->{object_id});
            if ($triple[0] == $triple[2] && $view_id ne 'isa*' && $view->{type} ne 'reflexive') {
                # TODO: proper reflexivity rules. hardcode OK for is_a for now
                # also: will report cycles for intersections to self, which is normal?
                #
                # this gives us lots of spurious messages for GALEN, since the obo translation
                # uses anonymous IDs and class expression syntax
                $self->logmsg("    Cycle detected for node: $triple[0] pred: $triple[1]");
                next;
            }
            my $rv = $sth_link->execute(@triple);
            if ($n_rows % 1000 == 0) {
                $self->logmsg("    Checked $n_rows links. Current: @triple");
            }
            if ($sth_link->fetchrow_array) {
                $links_in_db++;
            }
            else {
                #print STDERR "NEW @triple\n";
                $sth_store->execute(@triple,                
                                    $link->{distance},
                                    $link->{relation_distance},
                    );
                $links_added++;
            }
        }
        #$offset += $limit;
        $done_with_view=1 unless $links_added;
        $view_links_added += $links_added;
        $self->logmsg( "    Links added: $links_added [in_view: $view_links_added] already_there: $links_in_db" );
    }
    my $end_time = time;
    my $time_in_view = $end_time-$init_time;
    $time_in_view_h{$view_id} += $time_in_view;
    return $view_links_added;
}

sub assert_sameas {
    my $self = shift;
    my $dbh = $self->dbh;
    $self->logmsg("fetching reciprocal subclass links");
    my $eqs =
        $dbh->selectall_arrayref("SELECT DISTINCT x.$SUBJECT_COL, x.$TARGET_COL FROM $LINK_TABLE AS x INNER JOIN $LINK_TABLE AS y ON (y.$TARGET_COL=x.$SUBJECT_COL AND x.$TARGET_COL=y.$SUBJECT_COL) WHERE x.$REL_COL=$is_a AND y.$REL_COL=$is_a AND x.$SUBJECT_COL != x.$TARGET_COL", {Slice=>{}});
    $self->logmsg("got reciprocal subclass links: ".scalar(@$eqs));
    foreach (@$eqs) {
        $dbh->do("INSERT INTO sameas ($SUBJECT_COL,$TARGET_COL,is_inferred) VALUES ($_->{node_id},$_->{object_id},'t')");
        $dbh->do("INSERT INTO sameas ($TARGET_COL,$SUBJECT_COL,is_inferred) VALUES ($_->{node_id},$_->{object_id},'t')");
    }
    $self->logmsg("done sameas");
}

# TODO: port this
sub get_intersections {
    my $self = shift;
    my $dbh = $self->dbh;
    my %skip = %{$self->skip || {}};
    my $i_by_node_id = {};
    unless ($skip{intersections}) {
        #my $ilinks = $dbh->selectall_arrayref("SELECT DISTINCT $SUBJECT_COL,$REL_COL,$TARGET_COL,combinator FROM $LINK_TABLE WHERE combinator='I'",{Slice=>{}});
        my $ilinks = $dbh->selectall_arrayref("SELECT DISTINCT $SUBJECT_COL,$REL_COL,$TARGET_COL FROM $ASSERTED_LINK_TABLE WHERE complete=1",{Slice=>{}});
        foreach (@$ilinks) {
            push(@{$i_by_node_id->{$_->{node_id}}}, $_);
        }
    }
    return $i_by_node_id;
}

sub intersection_to_query {
    my $self = shift;
    my $dbh = $self->dbh;
    my $defined_node_id = shift;
    my $i_h = shift;
    my @conds = @$i_h;
    my $linknum=0;
    my @links = ();

    # TODO: remember, is_a is reflexive..
    # TODO: sub-relations
    my $where =
        join(" AND ",
             map {
                 $linknum++;
                 my $link = "link_".$linknum;
                 push(@links,"link AS $link");
                 # TODO: omit negation links
                 my $q = "$link.$SUBJECT_COL=subsumed_node.$SUBJECT_COL AND $link.$REL_COL = $_->{predicate_id} AND $link.$TARGET_COL = $_->{object_id} AND $link.combinator!='U'";
                 $q;
             } @conds);
    my $from = join(', ',@links);
    
    my $sql =
        qq[
  SELECT DISTINCT
    subsumed_node.$SUBJECT_COL  AS node_id,
    $is_a                  AS predicate_id,
    $defined_node_id       AS object_id
  FROM node AS subsumed_node, $from
  WHERE
      $where
    ];
    return $sql;
}

sub delete_inferred_links {
    my $self = shift;
    my $dbh = $self->dbh;
    my $link_ids = $dbh->selectcol_arrayref("SELECT $LINK_ID_COL FROM $LINK_TABLE WHERE is_inferred='t'");
    $dbh->{AutoCommit}=0;
    my $n=0;
    foreach my $link_id (@$link_ids) {
        print STDERR "Deleting $link_id\n";
        $dbh->do("DELETE FROM $LINK_TABLE WHERE $LINK_ID_COL=$link_id");
        $n++;
        if ($n % 1000 == 0) {
            print STDERR "COMMITTING\n";
            $dbh->commit;
        }
    }
    $dbh->commit;
    print STDERR "Deleted all inferred links\n";
}

sub get_or_put_relation {
    my $self = shift;
    my $dbh = $self->dbh;
    my $rel = shift;
    my @nids = 
        $dbh->selectrow_array("SELECT $SUBJECT_COL FROM node WHERE $ACC_COL='$rel'");
    if (@nids == 1) {
        return $nids[0];
    }
    elsif (@nids > 1) {
        die "@nids";
    }
    else {
        $dbh->do("INSERT INTO node ($ACC_COL,metatype) VALUES ('$rel','R')");
        return $self->get_or_put_relation($rel);
    }
    
}

sub logmsg {
    my $self = shift;
    return unless $self->verbose;
    my $msg = shift;
    my $t = time;
    print STDERR "LOG $t : $msg\n";
}


1;