/usr/local/CPAN/GOBO/GOBO/Parsers/GAFParser.pm
package GOBO::Parsers::GAFParser;
use Moose;
use strict;
extends 'GOBO::Parsers::Parser';
with 'GOBO::Parsers::GraphParser';
use GOBO::Node;
use GOBO::Gene;
use GOBO::Evidence;
use GOBO::Annotation;
use GOBO::ClassExpression;
use Carp;
sub parse_header {
my $self = shift;
my $g = $self->graph;
while($_ = $self->next_line) {
if (/^\!.*/) {
}
else {
$self->unshift_line($_);
# set the parse_header to 1
$self->parsed_header(1);
return;
}
}
# we are still in the header and have reached the end of the file
$self->parsed_header(1);
return;
}
sub parse_body {
my $self = shift;
my $g = $self->graph;
while($_ = $self->next_line) {
if (/^\!/) {
print STDERR "Warning! header in unexpected location: $_\n";
next;
}
chomp;
my @vals = split(/\t/);
my ($genedb,
$geneacc,
$genesymbol,
$qualifier,
$termacc,
$ref,
$evcode,
$with,
$aspect,
$genename,
$genesyn,
$genetype,
$genetaxa,
$assocdate,
$source_db,
$annotxp, # experimental!
$geneproduct
) = @vals;
if (!$genesymbol) {
confess("No symbol in line: $_");
}
my $geneid = "$genedb:$geneacc";
my $gene = $g->noderef($geneid);
my @taxa = split(/[\|\;]/,$genetaxa);
@taxa = map {s/^taxon:/NCBITaxon:/;$_} @taxa;
my $taxon = shift @taxa;
if (!$gene->label) {
bless $gene, 'GOBO::Gene';
$gene->label($genesymbol);
$gene->add_synonyms(split(/\|/,$genesyn));
# TODO; split
$gene->taxon($g->noderef($taxon));
$gene->type($g->noderef($genetype));
}
my $cnode = $g->term_noderef($termacc);
if (!$cnode->namespace) {
$cnode->namespace(_aspect2ns($aspect));
}
my %qualh = map {lc($_)=>1} (split(/[\|]\s*/,$qualifier || ''));
my $ev = new GOBO::Evidence(type=>$g->term_noderef($evcode));
# TODO: discriminate between pipes and commas
# (semicolon is there for legacy reasons - check if this can be removed)
my @with_objs = map {$g->noderef($_)} split(/\s*[\|\;\,]\s*/, $with);
$ev->supporting_entities(\@with_objs);
my @refs = split(/\|/,$ref);
my $provenance = $g->noderef(pop @refs); # last is usually PMID
$provenance->add_xrefs([@refs]);
my $annot =
new GOBO::Annotation(node=>$gene,
target=>$cnode,
provenance=>$provenance,
source=>$g->noderef($source_db),
date=>$assocdate,
);
if ($geneproduct) {
$geneproduct =~ s/\s+//g;
if ($geneproduct) {
$annot->specific_node($g->noderef($geneproduct));
}
}
# if >1 taxon supplied, additional taxon specifies target species
if (@taxa) {
my $xp =
GOBO::ClassExpression::RelationalExpression->new(relation=>'target_taxon',
target=>$g->noderef($taxon));
$annot->add_target_differentia($xp);
}
$annot->evidence($ev);
foreach my $qk (keys %qualh) {
$annot->add_qualifier($g->noderef($qk));
}
if ($qualh{not}) {
$annot->negated(1);
}
if ($annotxp) {
$annotxp =~ s/\s+//g;
if ($annotxp) {
my $xp = GOBO::ClassExpression->parse_idexpr($g,$annotxp);
$annot->add_target_differentia($xp);
}
}
$g->add_annotation($annot);
#push(@{$g->annotations},$annot);
}
return;
}
# the following is specific to GO
sub _aspect2ns {
my $aspect = shift;
return 'molecular_function' if $aspect eq 'F';
return 'biological_process' if $aspect eq 'P';
return 'cellular_component' if $aspect eq 'C';
}
1;