| ONTO-PERL documentation | Contained in the ONTO-PERL distribution. |
OBO::CCO::IntActParser - An IntAct to OBO parser/filter.
A parser for IntAct to OBO conversion. The conversion is filtered according to the proteins already existing in the input ontology.
Vladimir Mironov <vladimir.mironov@bio.ntnu.no>
Copyright (C) 2006 by Vladimir Mironov
This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself, either Perl version 5.8.7 or, at your option, any later version of Perl 5 you may have available.
Usage - $intact_parser->parse($intact_file) Returns - XML::XPath object Args - IntAct data file path Function - parses IntAct data file
Usage - $intact_parser->work($ontology, $xpath, $up_map, $up_core_map, $adding_new_terms) Returns - OBO::Core::Ontology object Args - OBO::Core::Ontology object; XML::XPath object; reference to a hash mapping UniProt AC => ID for all proteins of the taxon ($up_map); reference to a hash mapping UniProt AC => ID for the core proteins ($up_core_map); reference to an array ($adding_new_terms) holding, in this order: an indirect file handle (for writing a UniProt map, AC => ID, of the added proteins), the parent term name for proteins (string; new proteins are linked to it, e.g. 'cell cycle protein'), and the taxon id (string, e.g. '3702') Function - adds OBO::Core::Term objects for interactions and proteins to the ontology
| ONTO-PERL documentation | Contained in the ONTO-PERL distribution. |
# $Id: IntActParser.pm 2109 2010-09-29 Erick Antezana $ # # Module : IntActParser.pm # Purpose : Parse IntAct files # License : Copyright (c) 2006, 2007, 2008 Cell Cycle Ontology. All rights reserved. # This program is free software; you can redistribute it and/or # modify it under the same terms as Perl itself. # Contact : CCO <ccofriends@psb.ugent.be> # package OBO::CCO::IntActParser;
use strict;
use warnings;
use Carp;
use OBO::Core::Term;
use XML::XPath;

# Constructor.
#
# Usage    - OBO::CCO::IntActParser->new()
# Returns  - a new parser object: an empty hash blessed into the invoking class
# Args     - none besides the class name
sub new {
    my ($class) = @_;
    return bless {}, $class;
}
# TODO - the parser should return a hash: interaction_object => \(array of participant_objects) iso $xpath ?

# Wraps an IntAct data file in an XML::XPath object.
#
# Usage    - $intact_parser->parse($intact_file)
# Returns  - XML::XPath object created for the given file
# Args     - IntAct data file path
# Croaks   - when no (true) file path is given, or when XML::XPath->new
#            returns a false value
sub parse {
    my ( $self, $intact_file ) = @_;

    unless ($intact_file) {
        croak "No IntAct file to parse!\n";
    }

    my $xpath = XML::XPath->new( filename => $intact_file );
    croak "Failed to parse file $intact_file" unless $xpath;

    return $xpath;
}
# Adds terms for IntAct interactions and their participating proteins to an
# ontology. Only interactions with at least one core protein are added.
#
# Usage    - $intact_parser->work($ontology, $xpath, $up_map, $up_core_map, $adding_new_terms)
# Returns  - the ontology object that was passed in (terms/relations are added to it)
# Args     - $onto             : ontology object
#            $xpath            : XML::XPath object for the IntAct file (see parse())
#            $up_map           : hash ref, UniProt AC => protein name, for all
#                                proteins of the biological taxon in question
#            $up_core_map      : hash ref keyed by UniProt AC of the core proteins
#                                of ALL the taxa in the project (generated by GoaParser)
#            $adding_new_terms : array ref holding, in order,
#                                  - a file handle; one "AC<TAB>name" line is
#                                    printed to it for every protein term created here
#                                  - the name of the parent term for new proteins
#                                  - the taxon id, e.g. '3702'
# Croaks   - if one of the relationship types 'is_a', 'participates_in',
#            'has_participant', 'has_source' is missing from the ontology, or
#            if the parent protein term or the taxon term cannot be found
sub work {
    my ( $self, $onto, $xpath, $up_map, $up_core_map, $adding_new_terms ) = @_;

    # the notion of a 'core protein' is essential for preventing chaining
    # core proteins are defined as those labelled in GOA files with GO
    # biological process terms relevant to the current project (e.g. 'gene regulation')
    my ( $FH, $parent_protein_name, $taxon_id ) = @{$adding_new_terms};

    foreach my $rel_type ( 'is_a', 'participates_in', 'has_participant', 'has_source' ) {
        croak "Not a valid relationship type: '$rel_type'"
            unless ( $onto->{RELATIONSHIP_TYPES}->{$rel_type} );
    }

    # NB: no $! in these messages - a failed lookup does not set errno, so it
    # would only append an unrelated, stale system error text.
    my $parent_protein = $onto->get_term_by_name($parent_protein_name)
        || croak "No term for $parent_protein_name in ontology";
    my $taxon = $onto->get_term_by_id("NCBI:$taxon_id")
        || croak "No taxon term for $taxon_id in ontology";

    my %proteins;    # UniProt AC => protein term object, for proteins seen so far

    my $int_set = $xpath->find("/entrySet/entry/interactionList/interaction");

    INTERACTION:
    foreach my $interaction ( $int_set->get_nodelist() ) {

        # interaction type; must already be a term (name or synonym) of the ontology
        my $int_type = _node_text( $interaction, "interactionType/names/shortLabel/text()" );
        my $mi_type  = $onto->get_term_by_name_or_synonym($int_type);
        if ( !$mi_type ) {
            carp "The ontology does not contain the term: '$int_type'\n";
            next INTERACTION;
        }

        my $int_name    = _node_text( $interaction, "names/shortLabel/text()" );    # interaction name
        my $int_comment = _node_text( $interaction, "names/fullName/text()" );      # interaction full name
        my $xref        = _node_text( $interaction, "xref/primaryRef/\@id" );

        # building the hashes
        my %core_interactors;    # 'participant IntAct id' => '[0|1]', '1' for core proteins, '0' otherwise
        my %exp_roles;           # 'participant IntAct id' => 'experimental role' (collected only; see TODO below)
        my %accs;                # 'participant IntAct id' => 'UniProt AC'

        # The participants are looked up relative to the current interaction
        # node instead of re-querying the whole document by interaction id.
        my $participants = $xpath->find( "participantList/participant", $interaction );
        foreach my $participant ( $participants->get_nodelist() ) {
            my $part_id = _node_text( $participant, "\@id" );                     # participant id
            my $acc     = _participant_accession( $xpath, $participant );        # UniProt accession
            my $role    = _node_text( $participant,
                "experimentalRoleList/experimentalRole/names/shortLabel/text()" );    # experimental role

            $exp_roles{$part_id}        = $role;
            $accs{$part_id}             = $acc;
            $core_interactors{$part_id} = contains_key( $up_core_map, $acc );
        }

        # filtering interactions
        # only interactions containing at least one core protein
        # TODO if the core protein is a prey, only the bait is retained (strict interpretation, as an option)
        # now may still contain heterologous proteins
        next INTERACTION unless contains_value( \%core_interactors, 1 );

        # creating interaction terms
        my $int_term = OBO::Core::Term->new();
        $int_term->name("$int_name $int_type");
        $int_comment =~ tr/\n\t\r//d;    # cleaning the comment lines
        $int_term->comment($int_comment);
        $int_term->id("IntAct:$xref");
        $onto->add_term($int_term);
        $onto->create_rel( $int_term, 'is_a', $mi_type );

        # now individual participants:
        foreach my $ac ( values %accs ) {
            my $prot_name = $up_map->{$ac};
            next if ( !$prot_name );    # heterologous protein

            my $protein = $proteins{$ac};
            if ( !defined $protein ) {    # the first occurrence of this protein
                $protein = $onto->get_term_by_name($prot_name);
                if ( !defined $protein ) {

                    # not in the ontology yet: create the term, link it to the
                    # parent protein and the taxon, and log it to the map file
                    $protein = OBO::Core::Term->new();
                    $protein->name($prot_name);
                    $protein->id("UniProtKB:$ac");
                    $onto->add_term($protein);
                    $onto->create_rel( $protein, 'is_a',       $parent_protein );
                    $onto->create_rel( $protein, 'has_source', $taxon );
                    print {$FH} "$ac\t$prot_name\n";
                }
                $proteins{$ac} = $protein;    # the protein was not yet in the hash
            }
            $onto->create_rel( $protein,  'participates_in', $int_term );
            $onto->create_rel( $int_term, 'has_participant', $protein );
        }
    }

    return $onto;
}

# Returns the string value of the XPath expression $path evaluated relative
# to the XML node $node.
sub _node_text {
    my ( $node, $path ) = @_;
    return $node->find($path)->string_value();
}

# Returns the primary xref id (the UniProt accession) of a participant's
# interactor. The interactor is resolved through the participant's
# <interactorRef>; when there is no usable reference, an <interactor>
# element given inline in the participant is consulted instead, so that a
# missing reference no longer produces a malformed XPath expression.
sub _participant_accession {
    my ( $xpath, $participant ) = @_;

    my $int_ref = _node_text( $participant, "interactorRef/text()" );
    $int_ref =~ s/\A\s+//;
    $int_ref =~ s/\s+\z//;

    if ( $int_ref ne '' && $int_ref !~ /'/ ) {

        # plain integers are compared as numbers (as before); anything else
        # is passed as a quoted string literal
        my $literal = ( $int_ref =~ /\A\d+\z/ ) ? $int_ref : "'$int_ref'";
        my $acc = $xpath->find(
            "/entrySet/entry/interactorList/interactor[\@id = $literal]/xref/primaryRef/\@id");
        return $acc->string_value();
    }

    return _node_text( $participant, "interactor/xref/primaryRef/\@id" );
}

# Returns 1 if $hash (a hash ref) holds a defined value under $key, 0 otherwise.
sub contains_key {
    my ( $hash, $key ) = @_;
    return ( defined $hash->{$key} ) ? 1 : 0;
}

# Returns 1 if at least one value of $hash (a hash ref) is string-equal to
# $value, 0 otherwise.
sub contains_value {
    my ( $hash, $value ) = @_;
    foreach my $candidate ( values %{$hash} ) {
        return 1 if ( "$candidate" eq $value );
    }
    return 0;
}

1;