/usr/local/CPAN/Combine/Combine/XWI2XML.pm
package Combine::XWI2XML;
use strict;
use Combine::XWI;
use Encode;
use Combine::Config;
use Compress::Zlib;
use MIME::Base64;
my $level=0; #Used to calculate indentation for pretty printing
our %dcMap;
%dcMap = (
'rights' => 'dc:rights',
'coverage' => 'dc:coverage',
'creator' => 'dc:creator',
'content' => 'dc:description',
'geo.country' => 'dc:coverage',
'email' => 'dc:publisher',
'language' => 'dc:language',
'identifier-url' => 'dc:identifier',
'timemodified' => 'dc:date',
'copyright' => 'dc:rights',
'classification' => 'dc:subject',
'url' => 'dc:identifier',
'timecreated' => 'dc:date',
'category' => 'dc:subject',
'description' => 'dc:description',
'location' => 'dc:coverage',
'originator' => 'dc:creator',
'subject' => 'dc:subject',
'author' => 'dc:creator',
'publisher' => 'dc:publisher',
'pd' => 'dc:date',
'publisher-email' => 'dc:publisher',
'abstract' => 'dc:description',
'documenttype' => 'dc:type',
'doc-rights' => 'dc:rights',
'page-topic' => 'dc:subject',
'topicname' => 'dc:subject',
'keyword' => 'dc:subject',
'document-rights' => 'dc:rights',
'keywords' => 'dc:subject',
'resource-type' => 'dc:type',
'summary' => 'dc:description',
'creation-date' => 'dc:date',
'type' => 'dc:type',
'document-classification' => 'dc:subject',
'country' => 'dc:coverage',
'progid' => 'dc:format',
'content-language' => 'dc:language',
'content-type' => 'dc:format',
'title' => 'dc:title',
'created' => 'dc:date',
'timemodified' => 'dc:date',
'doc-type' => 'dc:type',
'mimetype' => 'dc:type'
);
##########################SUBS########################
sub XWI2XML {
my ($xwi, $inclHTML, $inclCanonDoc, $collapseinLinks, $nooutLinks) = @_;
my $recordid=$xwi->recordid;
my $Rsummary = '';
my $md5 = $xwi->md5;
my $res = StartTag("documentRecord md5id=\"$md5\" id=\"$recordid\"");
$res .= ToXML('modifiedDate', time2iso($xwi->modifiedDate));
$res .= ToXML('expiryDate', time2iso($xwi->expiryDate));
$res .= ToXML('checkedDate', time2iso($xwi->checkedDate));
$res .= ToXML('mimeType', $xwi->type);
$res .= ToXML('httpServer', $xwi->server);
#urls
#? $xwi->url_rewind;
$res .= StartTag('urls');
my $sv = Combine::Config::Get('MySQLhandle');
my $sth = $sv->prepare(qq{SELECT urlstr from urls,recordurl WHERE recordurl.recordid=? AND recordurl.urlid=urls.urlid;});
my $rv = $sth->execute($recordid);
my $urlstr;
my %servers=();
while (($urlstr)=$sth->fetchrow_array) {
$res .= ToXML('url',$urlstr);
$urlstr =~ s|http://([^/:]+).*|$1|;
$servers{$urlstr}=1;
}
$res .= EndTag('urls');
$res .= StartTag('servers');
foreach my $s (keys(%servers)) { $res .= ToXML('server',$s); }
$res .= EndTag('servers');
#originalDoc
if ($inclHTML) {
my $base64=MIME::Base64::encode(Compress::Zlib::memGzip(${$xwi->content}));
$res .= '<originalDocument mimeType="text/html" compression="gzip" encoding="base64" charSet="UTF-8">' . "\n" . $base64 . '</originalDocument>' . "\n";
}
#documentText
my $ok = 1;
if ($inclCanonDoc) {
my $html = ${$xwi->content};
if (length($html) > 10) {
$html =~ s/<!DOCTYPE[^>]+>//;
$html =~ s/ / /g;
my ($canonicalDoc,$errs);
require Combine::CleanXML2CanDoc;
my $converter = Combine::CleanXML2CanDoc->new('indentation'=>2);
($ok,$canonicalDoc,$errs) = $converter->convert($html);
if ($ok == 0) { $res .= $canonicalDoc . "\n"; } #if conversion OK save resul
}
}
if ( (!$inclCanonDoc) || ($ok != 0) ) { #fallback if conversion fails
$res .= StartTag('canonicalDocument');
$res .= ToXML('section', ${$xwi->text});
$res .= EndTag('canonicalDocument');
}
#meta
$xwi->meta_rewind;
$res .= StartTag('metaData');
my $tit= $xwi->title;
$res .= ToXMLAttr('meta',"name=\"title\"", $tit);
if ( !defined($tit) || $tit =~ /^\s*$/ ) {
#Empty title => generate title from text and 1st heading
my @ip = split(/\s+/,substr(${$xwi->text},0,100),5);
my ($head,$t) = split(/;/, $xwi->heading_get, 2);
my $ip = join(' ', $ip[0], $ip[1], $ip[2], $ip[3]);
# if ( $ip =~ /$head/ ) { $res .= ToXMLAttr('meta',"name=\"title\"", $ip); }
# else { $res .= ToXMLAttr('meta',"name=\"title\"", $head . ' ' . $ip); }
$res .= ToXMLAttr('meta',"name=\"title\"", $head . ' ' . $ip);
}
my ($name,$content);
while (1) {
($name,$content) = $xwi->meta_get;
last unless $name;
if ( $name eq 'Rsummary' ) {
$Rsummary = $content;
next;
}
$name =~ s/"/ /g;
$name = encodeXML($name);
if (($name =~ /^dc\./)) {
$name =~ s/^dc\.\s*/dc:/;
my $t;
($name,$t) = split('\.', $name, 2);
if ($name =~ /subject/) { $res .= DCsubj($name,$content); }
else { $res .= ToXMLAttr('meta', "name=\"$name\"", $content); }
} elsif ( defined($dcMap{$name}) ) {
if ($dcMap{$name} =~ /subject/) { $res .= DCsubj($dcMap{$name},$content); }
else { $res .= ToXMLAttr('meta', "name=\"$dcMap{$name}\"", $content); }
}
# else { $res .= ToXMLAttr('meta', "name=\"$name\"", $content); } #????????
}
$res .= EndTag('metaData');
# links
$res .= StartTag('links');
$res .= StartTag('outlinks');
$xwi->link_rewind;
my $antImgLinks=0;
my ($netlocid, $urlid, $anchor, $ltype, $lmd5);
$sth = $sv->prepare(qq{SELECT urlstr FROM urls WHERE urls.urlid=?;});
my $sth1 = $sv->prepare(qq{SELECT md5 FROM recordurl WHERE urlid=?;});
my %seen = ();
# while (1) {
while (!$nooutLinks) {
($urlstr, $netlocid, $urlid, $anchor, $ltype) = $xwi->link_get;
last unless ($urlstr || $netlocid);
if ( $urlstr eq '' ) {
$rv = $sth->execute($urlid);
($urlstr)=$sth->fetchrow_array;
}
next if (defined($seen{$urlstr,$anchor}));
$seen{$urlstr,$anchor}=1;
$res .= StartTag('link' . " type=\"$ltype\"");
$rv = $sth1->execute($urlid);
($lmd5)=$sth1->fetchrow_array;
# $res .= ToXML('linkurl', $urlstr . '; ' . $urlid); #Keep? Attribute?
$res .= ToXML('anchorText', $anchor);
if (defined($lmd5)) {
$res .= ToXMLAttr('location', "documentId=\"$lmd5\"", $urlstr);
} else {
$res .= ToXML('location', $urlstr);
}
if ($ltype eq 'img') { $antImgLinks++; }
$res .= EndTag('link');
}
$res .= EndTag('outlinks');
#anchors from other pages linking to this page
my $from;
$sth = $sv->prepare(qq{SELECT urlstr,anchor,md5,linktype from links,urls,recordurl WHERE links.urlid=? AND links.recordid=recordurl.recordid AND recordurl.urlid=urls.urlid;});
$rv = $sth->execute($xwi->urlid);
if ( $rv >= 1 ) {
%seen = ();
my @internalLinks =();
my %inlinkHosts;
$res .= StartTag('inlinks');
while (($from,$anchor,$lmd5,$ltype)=$sth->fetchrow_array) {
$anchor = Encode::decode('utf8',$anchor);
next if ( defined($seen{$from,$anchor}) || ($anchor eq '') || defined($seen{$anchor}) );
$seen{$from,$anchor}=1;
if ($collapseinLinks) {
$seen{$anchor}=1;
}
my $s = $from;
$s =~ s|http://([^/:]+).*|$1|;
if (defined($servers{$s})) {#from same server as page, just save and put last in list
my $tres = StartTag('link' . " type=\"$ltype\"");
$tres .= ToXML('anchorText',$anchor);
$tres .= ToXMLAttr('location', "documentId=\"$lmd5\"", $from);
$tres .= EndTag('link');
push(@internalLinks,$tres);
next;
}
$inlinkHosts{$s}=1;
$res .= StartTag('link' . " type=\"$ltype\"");
$res .= ToXML('anchorText',$anchor);
$res .= ToXMLAttr('location', "documentId=\"$lmd5\"", $from);
$res .= EndTag('link');
}
$res .= join('',@internalLinks);
$res .= EndTag('inlinks');
my $no = scalar(keys %inlinkHosts);
if (defined($no) && $no>0) {$res .= ToXML('inlinkHosts', $no);}
}
$res .= EndTag('links');
# analysis
$xwi->heading_rewind;
$res .= ToXML('headings', $xwi->heading_get); # Only one cumulative heading stored in database
$res .= ToXML('Rsummary',$Rsummary);
#analys - robot
$xwi->robot_rewind;
while (1) {
($name,$content) = $xwi->robot_get;
last unless $name;
next if ( ($name eq 'hostinlinks') || ($name eq 'inlinks') ||
($name eq 'outlinks') || ($name eq 'charsetlist') );
if ( $name eq 'domain' ) { $name = 'topLevelDomain'; }
$name =~ s/"/ /g;
$res .= ToXMLAttr('property', "name=\"$name\"", $content);
}
#topic
$xwi->topic_rewind;
my ($cls,$absscore, $relscore, $terms);
while (1) {
($cls,$absscore, $relscore,$terms) = $xwi->topic_get;
last unless $cls;
$res .= StartTag('topic' . " absoluteScore=\"$absscore\" relativeScore=\"$relscore\"");
$res .= ToXML('class', $cls);
# $res .= ToXML('terms', $terms);
my %seen;
foreach my $t (split(',\s*',$terms)) {
$t =~ s/\\.//g;
$t =~ tr/\?\*\^\[\]//d;
# remove trailing 's' (OK?) and replace '@and' with ','
$t = join(', ', map {s/s$//;$_} split(' @and ', $t) );
next if (defined($seen{$t}));
$res .= ToXML('terms', $t);
$seen{$t}=1;
}
$res .= EndTag('topic');
}
$res .= EndTag('documentRecord');
return $res;
}
sub DCsubj {
my ($name,$val) = @_;
my $frag = '';
foreach my $t (split(',\s*',$val)) {
$frag .= ToXMLAttr('meta', "name=\"$name\"", $t);
}
return $frag;
}
sub ToXML {
my ($name,$val) = @_;
return '' if ( !defined($val) || $val =~ /^\s*$/ );
#XMLify tag-name
$name =~ tr/0-9a-zA-Z:_\-./_/c;
if ($name =~ /^\d/) { $name='_' . $name; }
#XMLify val!!
$val =~ s/&/&/g;
$val =~ s/</</g;
$val =~ s/>/>/g;
$val =~ tr/[\x00-\x08\x0B\x0C\x0E-\x1F]/ /s;
return &prefix()."<$name>$val</$name>\n";
}
sub ToXMLAttr {
my ($name,$attr,$val) = @_;
return '' if ( !defined($val) || $val =~ /^\s*$/ );
#XMLify tag-name
$name =~ tr/0-9a-zA-Z:_\-./_/c;
if ($name =~ /^\d/) { $name='M' . $name; }
#XMLify val!!
$val =~ s/&/&/sg;
$val =~ s/</</sg;
$val =~ s/>/>/sg;
# $val =~ tr/- -\^/ /;
$val =~ tr/[\x00-\x08\x0B\x0C\x0E-\x1F]/ /s;
return &prefix()."<$name $attr>$val</$name>\n";
}
sub encodeXML {
my ($val)=@_;
$val =~ s/&/&/g;
$val =~ s/</</g;
$val =~ s/>/>/g;
$val =~ tr/[\x00-\x08\x0B\x0C\x0E-\x1F]/ /s;
return $val;
}
sub normalizeName {
#Names and Tokens
# [4] NameStartChar ::= ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] | [#x10000-#xEFFFF]
# [4a] NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
}
sub prefix {
my $prefix='';
foreach my $i (1..$level) { $prefix.=' '; }
return $prefix;
}
sub StartTag {
my ($name) = @_;
my $str = &prefix() . "<$name>\n";
$level++;
return $str;
}
sub EndTag {
my ($name) = @_;
$level--;
my $str = &prefix() . "</$name>\n";
return $str;
}
sub XMLtext {
my ($name) = @_;
if ( $name eq '' ) { return ''; }
else { return &prefix() . "$name\n"; }
}
sub time2iso {
my ($t) = @_;
if ($t) {
my @t = gmtime($t);
$t[5] = 1900 + $t[5]; #year
$t[4] = $t[4] + 1; #month
foreach my $i (4,3,1,0) { if ($t[$i]<10) { $t[$i] = "0$t[$i]"; } }
return join('-',($t[5],$t[4],$t[3])) . ' ' . join(':',($t[2],$t[1],$t[0]));
} else { return ''; }
}
########################################
1;