/usr/local/CPAN/Combine/Combine/GraphAlgorithm.pm
package Combine::GraphAlgorithm;
sub new {
my $this = shift;
my $class = ref($this) || $this;
my $self = {};
$self->{NumberNodes}=0;
$self->{Nodes} = {}; # a hash with all nodes
#each node will have 2 lists (possibly empty): inlinks and outlinks
# and possibly a topic score associated: score
bless ($self , $class);
return $self;
}
sub addLink {
my ($self,$fromnode, $tonode, $weight) = @_;
# if ($fromnode == $tonode) { warn "No selflinking!"; return; }
# if (!defined($self->{Nodes}->{$fromnode})) {
if (! defined($weight)) { $weight = 1.0; }
#fails if link $from -> $to already added!!!
push(@{$self->{Nodes}->{$fromnode}->{outlinks}}, $tonode);
push(@{$self->{Nodes}->{$tonode}->{inlinks}}, $fromnode);
${$self->{Nodes}->{$fromnode}->{relevance}}{$tonode} += $weight;
}
sub setScore {
my ($self,$node,$score) = @_;
$self->{Nodes}->{$node}->{score} = $score;
}
sub nodes {
my ($self) = @_;
return keys(%{$self->{Nodes}});
}
sub outDegree {
my ($self,$node) = @_;
my @n = @{$self->{Nodes}->{$node}->{outlinks}};
return $#n+1;
}
sub inDegree {
my ($self,$node) = @_;
my @n = @{$self->{Nodes}->{$node}->{inlinks}};
return $#n+1;
}
sub hasScore {
my ($self,$node) = @_;
return defined($self->{Nodes}->{$node}->{score});
}
sub getScore {
my ($self,$node) = @_;
if ( defined($self->{Nodes}->{$node}->{score}) ) {
return $self->{Nodes}->{$node}->{score};
} else {return 0; }
}
sub deleteNode {
my ($self,$node) = @_;
delete($self->{Nodes}->{$node}->{outlinks});
my @l = $self->linkedToBy($node);
foreach my $n (@l) {
#walk through list of nodes that link to me and remove my node
my @links = $self->linksTo($n);
@{$self->{Nodes}->{$n}->{outlinks}} = ();
foreach my $l (@links) {
if ($l != $node) { push(@{$self->{Nodes}->{$n}->{outlinks}},$l); }
}
}
delete($self->{Nodes}->{$node}->{inlinks});
delete($self->{Nodes}->{$node}->{score});
delete($self->{Nodes}->{$node}->{relevance});
delete($self->{Nodes}->{$node});
}
sub linksTo {
my ($self,$node) = @_;
return @{$self->{Nodes}->{$node}->{outlinks}};
}
sub linkedToBy {
my ($self,$node) = @_;
return @{$self->{Nodes}->{$node}->{inlinks}};
}
sub addBackLinks {
my ($self) = @_;
my @pages = $self->nodes();
foreach my $i (@pages) {
my @backlinks = $self->linkedToBy($i);
my @links = $self->linksTo($i);
my %li = ();
foreach my $l (@links) { $li{$l}=1; } #To detect duplicates
foreach my $l (@backlinks) {
#$l is a backlink for $i
#use score divided by total no of backlinks as weight
#what if link already exists?
if (!defined($li{$l})) {
push(@{$self->{Nodes}->{$i}->{outlinks}}, $l);
}
${$self->{Nodes}->{$i}->{relevance}}{$l} += $self->getScore($l)/($#backlinks+1);
}
}
}
sub PageRank {
my ($self,$rmDang, $Bias) = @_;
# Parameters
# $rmDang (boolean) remove dangling pages NOT USED
# $Bias (boolean) use bias vector when calculating PageRank
# $d1 is an Graph object that contains bias vector as attribute
# to the vertices. May be modified by this subroutine!
# Using algorithm (1) The Power method from "Deeper Inside PageRank"
my @pages;
my $n;
my $i;
my %Ri; #The pageRank vector (x(k?)T)
my %Ri1;#The pageRank vector (x(k?)T)
my %Ei; #Topical (personalization) vector (vT in alg)
my $d = 0.85; #The alpha param
@pages = $self->nodes();
my $npages = $#pages + 1;
print "Got $npages nodes\n";
########
if ( $Bias ) {
#Init Ei with autoclass scores
# maintain sum(Ei) == 1
print "Initializing bias vector\n";
my $totscore=0;
foreach $i (@pages) {
$totscore += $self->getScore($i);
}
$normconst = 1.0/$totscore;
print "Using $normconst as normalization factor; Tot=$totscore\n";
my $riNorm=0.0;
foreach $i (@pages) {
$Ei{$i}=0.0;
$Ri{$i}=0.0;
if ( $self->hasScore($i) ) {
$Ei{$i} = $self->getScore($i) * $normconst;
# $Ri{$i}=1.0/$npages; #Uniform starting PageRank vector
$Ri{$i}=$Ei{$i}; # Topic-score starting PageRank vector
} else { print "WARN node $i has no score\n"; }
}
my $eisum; foreach $i (@pages) { $eisum += $Ei{$i}; }
print "EiSUM = $eisum\n";
my $risum; foreach $i (@pages) { $risum += $Ri{$i}; }
print "RiSUM = $risum\n";
} else {
#Uniform normalization (so that sum(Ei) = 1)
foreach $i (@pages) { $Ri{$i}=1.0/$npages; $Ei{$i}= 1.0/$npages;}
}
#Normalize relevance weights
foreach $i (@pages) {
# $i links to $l
my $nlinks=$self->outDegree($i);
if ($nlinks==0) {
next;
}
my @links = $self->linksTo($i); #Sparse P matrix
my $sum=0.0;
foreach my $l (@links) {
$sum += ${$self->{Nodes}->{$i}->{relevance}}{$l};
}
# print "Node $i: SUM=$sum\n";
foreach my $l (@links) {
${$self->{Nodes}->{$i}->{relevance}}{$l} = ${$self->{Nodes}->{$i}->{relevance}}{$l} / $sum;
$r = ${$self->{Nodes}->{$i}->{relevance}}{$l};
# print " link to $l relevance=$r\n";
}
}
my $loops=0;
my $lenDelta = 1.0;
my $Ri1sum;
while ( ($lenDelta > 0.0000001) ) {
$loops++;
my $dangContr=0.0; # the term x(k-1)T*a
foreach $i (@pages) {
# $i links to $l
my $nlinks=$self->outDegree($i);
if ($nlinks==0) {
$dangContr += $Ri{$i};
next;
}
my @links = $self->linksTo($i); #Sparse P matrix
# $mcontr = $Ri{$i}/($nlinks); #Uniform probability for all links
foreach my $l (@links) {
#nonuniform jump probability
my $mcontr = $Ri{$i} * (${$self->{Nodes}->{$i}->{relevance}}{$l});
$Ri1{$l} += $mcontr; # x(k-1)T*P
#print " Adding $mcontr to node $l making sum=$Ri1{$l}\n";
}
}
$Ri1sum=0.0;
$lenDelta=0.0;
foreach $i (@pages) {
my $tmp = $Ri{$i}; #Old PageRank
$Ri{$i} = $d * $Ri1{$i} + ($d * $dangContr + (1.0-$d))*$Ei{$i};
$Ri1sum += $Ri{$i};
$lenDelta = ($Ri{$i} - $tmp) * ($Ri{$i} - $tmp); #Euclidian length
$Ri1{$i}=0.0;
}
$lenDelta = sqrt($lenDelta);
# print "Difference lenDelta=$lenDelta; Ri1sum=$Ri1sum\n";
} #end while
print "LOOPS: $loops; Convergence lenDelta=$lenDelta; Ri1sum=$Ri1sum\n";
return %Ri;
}
#################
sub PageRankBL {
my ($self,$rmDang, $Bias) = @_;
# Parameters
# $rmDang (boolean) remove dangling pages NOT USED
# $Bias (boolean) use bias vector when calculating PageRank
# $d1 is an Graph object that contains bias vector as attribute
# to the vertices. May be modified by this subroutine!
# Using algorithm (1) The Power method from "Deeper Inside PageRank"
# But doing calculations on backlinks only
my @pages;
my $n;
my $i;
my %Ri; #The pageRank vector (x(k?)T)
my %Ri1;#The pageRank vector (x(k?)T)
my %Ei; #Topical (personalization) vector (vT in alg)
my $d = 0.85; #The alpha param
@pages = $self->nodes();
my $npages = $#pages + 1;
print "Got $npages nodes\n";
########
if ( $Bias ) {
#Init Ei with autoclass scores
# maintain sum(Ei) == 1
print "Initializing bias vector\n";
my $totscore=0;
foreach $i (@pages) {
$totscore += $self->getScore($i);
}
$normconst = 1.0/$totscore;
print "Using $normconst as normalization factor; Tot=$totscore\n";
my $riNorm=0.0;
foreach $i (@pages) {
$Ei{$i}=0.0;
$Ri{$i}=0.0;
if ( $self->hasScore($i) ) {
$Ei{$i} = $self->getScore($i) * $normconst;
# $Ri{$i}=1.0/$npages; #Uniform starting PageRank vector
$Ri{$i}=$Ei{$i}; # Topic-score starting PageRank vector
} else { print "WARN node $i has no score\n"; }
}
my $eisum; foreach $i (@pages) { $eisum += $Ei{$i}; }
print "EiSUM = $eisum\n";
my $risum; foreach $i (@pages) { $risum += $Ri{$i}; }
print "RiSUM = $risum\n";
} else {
#Uniform normalization (so that sum(Ei) = 1)
foreach $i (@pages) { $Ri{$i}=1.0/$npages; $Ei{$i}= 1.0/$npages;}
}
#Normalize relevance weights
foreach $i (@pages) {
# $i links to $l
my $nlinks=$self->outDegree($i);
if ($nlinks==0) {
next;
}
### my @links = $self->linksTo($i); #Sparse P matrix
my @links = $self->linkedToBy($i); #Sparse P matrix
my $sum=0.0;
foreach my $l (@links) {
$sum += ${$self->{Nodes}->{$l}->{relevance}}{$i};
}
# print "Node $i: SUM=$sum\n";
foreach my $l (@links) {
${$self->{Nodes}->{$l}->{relevance}}{$i} = ${$self->{Nodes}->{$l}->{relevance}}{$i} / $sum;
$r = ${$self->{Nodes}->{$l}->{relevance}}{$i};
# print " link to $l relevance=$r\n";
}
}
my $loops=0;
my $lenDelta = 1.0;
my $Ri1sum;
while ( ($lenDelta > 0.0000001) ) {
$loops++;
my $dangContr=0.0; # the term x(k-1)T*a
foreach $i (@pages) {
# $i links to $l
### my $nlinks=$self->outDegree($i);
my $nlinks=$self->inDegree($i);
if ($nlinks==0) {
$dangContr += $Ri{$i};
next;
}
### my @links = $self->linksTo($i); #Sparse P matrix
my @links = $self->linkedToBy($i); #Sparse P matrix
# $mcontr = $Ri{$i}/($nlinks); #Uniform probability for all links
foreach my $l (@links) {
#nonuniform jump probability
my $mcontr = $Ri{$i} * (${$self->{Nodes}->{$l}->{relevance}}{$i});
$Ri1{$i} += $mcontr; # x(k-1)T*P
#print " Adding $mcontr to node $l making sum=$Ri1{$l}\n";
}
}
$Ri1sum=0.0;
$lenDelta=0.0;
foreach $i (@pages) {
my $tmp = $Ri{$i}; #Old PageRank
$Ri{$i} = $d * $Ri1{$i} + ($d * $dangContr + (1.0-$d))*$Ei{$i};
$Ri1sum += $Ri{$i};
$lenDelta = ($Ri{$i} - $tmp) * ($Ri{$i} - $tmp); #Euclidian length
$Ri1{$i}=0.0;
}
$lenDelta = sqrt($lenDelta);
# print "Difference lenDelta=$lenDelta; Ri1sum=$Ri1sum\n";
} #end while
print "LOOPS: $loops; Convergence lenDelta=$lenDelta; Ri1sum=$Ri1sum\n";
return %Ri;
}
#################
sub printProbMatrix {
my ($self)=@_;
my %P;
foreach my $i ($self->nodes()) {
foreach my $j ($self->nodes()) {
$P{$i}{$j}=0;
}
}
foreach my $i ($self->nodes()) {
foreach my $j (@{$self->{Nodes}->{$i}->{outlinks}}) {
# $P{$i}{$j}=1;
$P{$i}{$j}=${$self->{Nodes}->{$i}->{relevance}}{$j};
}
}
print "P: ";
foreach my $i ($self->nodes()) { print "$i "; }
print "\n";
foreach my $i ($self->nodes()) {
print "$i: ";
foreach my $j ($self->nodes()) {
my $p=$P{$i}{$j};
print "$p ";
}
print "\n";
}
}
sub HITS {
my ($Graph) = @_;
my $k=5;
my %x;
my %y;
my @pages = $Graph->vertices;
#Evt initialize to rscore?
foreach $p (@pages) { $x{$p}=1; $y{$p}=1; }
my %yp;
foreach $i (0..$k) {
#I operation
print " #I operation\n";
my %xp;
foreach $p (@pages) {
$w=0;
@q = $Graph->predecessors($p);
foreach $q (@q) { $xp{$p} += $y{$q}; }
## foreach $q (0..$n) {
## if ( $links{$q,$p} ) { $w += $y[$q]; }
## }
## $xp[$p]=$w;
##
}
#O operation
print " #O operation\n";
my %yp;
foreach $p (@pages) {
@q = $Graph->successors($p);
foreach $q (@q) { $yp{$p} += $xp{$q}; }
## $w=0;
## foreach $q (0..$n) {
## if ( $links{$p,$q} ) { $w += $xp[$q]; }
## }
## $yp[$p]=$w;
}
%x = Normalize(%xp);
%y = Normalize(%yp);
# &PrProgress($i);
}
my %HITS;
foreach $i (keys(%x)) {
$HITS{$i}{AUTH}=$x{$i};
$HITS{$i}{HUB}=$y{$i};
delete($y{$i});
}
foreach $i (keys(%y)) {
$HITS{$i}{HUB}=$y{$i};
}
return %HITS;
}
sub Normalize {
my (%arr) = @_;
my $sum = 0.0;
foreach my $v (keys(%arr)) { $sum += $arr{$v}*$arr{$v}; }
my $nv = sqrt($sum);
# print "Norm: NV=$nv; #=$#_\n";
my %xn;
foreach my $v (keys(%arr)) {
$xn{$v} = $arr{$v}/$nv;
}
return %xn;
}
##################
sub PrProgress {
($it) = @_;
my %TMx;
my %TMy;
print "Status iteration $it\n";
foreach my $ii (0..$n) {
if ($x[$ii]>0.001) {$TMx{$ii}=$x[$ii];}
if ($y[$ii]>0.001) {$TMy{$ii}=$y[$ii];}
}
my $ant=0;
foreach $ii (sort {$TMx{$b} <=> $TMx{$a};} keys(%TMx)) {
print "X($ant): $TMx{$ii} $ii $urls[$ii]\n";
last if ($ant++>10);
}
$ant=0;
foreach $ii (sort {$TMy{$b} <=> $TMy{$a};} keys(%TMy)) {
print "Y($ant): $TMy{$ii} $ii $urls[$ii]\n";
last if ($ant++>10);
}
}
############
1;