OBO::CCO::IntActParser - An IntAct to OBO parser/filter.


ONTO-PERL documentation Contained in the ONTO-PERL distribution.

Index


Code Index:

NAME

Top

OBO::CCO::IntActParser - An IntAct to OBO parser/filter.

DESCRIPTION

Top

A parser for IntAct to OBO conversion. The conversion is filtered according to the proteins already existing in the input ontology.

AUTHOR

Top

Vladimir Mironov <vladimir.mironov@bio.ntnu.no>

COPYRIGHT AND LICENSE

Top

parse

  Usage    - $intact_parser->parse($intact_file)
  Returns  - XML::XPath object
  Args     - IntAct data file path
  Function - parses IntAct data file 

work

  Usage - $intact_parser->work($ontology, $xpath, $up_map, $up_core_map, $adding_new_terms)
  Returns - OBO::Core::Ontology object 
  Args - OBO::Core::Ontology object, XML::XPath object, \( map UniProt AC=>ID for all proteins of the taxon ), \( map UniProt AC=>ID for core proteins ), 
  \(
	  indirect file handle, # for writing a UniProt map for added proteins (AC=>ID)
	  parent term name for proteins (string), # to link new proteins to, e.g. 'cell cycle protein'
	  taxon id (string), # e.g. '3702'
  )
  Function - adds OBO::Core::Term objects for interactions and proteins to ontology


ONTO-PERL documentation Contained in the ONTO-PERL distribution.
# $Id: IntActParser.pm 2109 2010-09-29 Erick Antezana $
#
# Module  : IntActParser.pm
# Purpose : Parse IntAct files
# License : Copyright (c) 2006, 2007, 2008 Cell Cycle Ontology. All rights reserved.
#           This program is free software; you can redistribute it and/or
#           modify it under the same terms as Perl itself.
# Contact : CCO <ccofriends@psb.ugent.be>
#

package OBO::CCO::IntActParser;

use strict;
use warnings;
use Carp;
use OBO::Core::Term;
use XML::XPath;

# Constructor.
#   Usage    - OBO::CCO::IntActParser->new()
#   Returns  - a new parser object (an empty hash blessed into the given class)
#   Args     - none besides the class name
sub new {
	my ($class) = @_;
	return bless {}, $class;
}

# TODO - the parser should return a hash: interaction_object => \(array of participant_objects) instead of $xpath ?
# Creates the XML::XPath object through which an IntAct data file is queried.
#   Usage    - $intact_parser->parse($intact_file)
#   Returns  - XML::XPath object constructed for $intact_file
#   Args     - IntAct data file path
#   Croaks   - if no (or a false) file path is given, or if XML::XPath->new
#              returns a false value
sub parse {
	my $self        = shift;
	my $intact_file = shift;

	# a false path (undef, '', 0) is rejected before XML::XPath is touched
	unless ($intact_file) {
		croak "No IntAct file to parse!\n";
	}

	my $xpath = XML::XPath->new( filename => $intact_file );
	croak "Failed to parse file $intact_file" unless $xpath;

	return $xpath;
}

# Adds terms for IntAct interactions and their participating proteins to an ontology.
#   Usage    - $intact_parser->work($ontology, $xpath, $up_map, $up_core_map, $adding_new_terms)
#   Returns  - the ontology object that was passed in (modified in place)
#   Args     - $onto             : ontology object to extend
#              $xpath            : XML::XPath object as returned by parse()
#              $up_map           : hash ref, UniProt AC => protein name, all proteins of the taxon in question
#              $up_core_map      : hash ref keyed by UniProt AC, core proteins for ALL the taxa in the project
#              $adding_new_terms : array ref holding, in this order:
#                                    file handle for writing "AC<TAB>name" lines for newly added proteins
#                                      (may be undef, in which case nothing is written),
#                                    parent term name for proteins (string), e.g. 'cell cycle protein',
#                                    taxon id (string), e.g. '3702'
#   Croaks   - if one of the relationship types 'is_a', 'participates_in', 'has_participant',
#              'has_source' is missing from the ontology, or if the parent protein term or the
#              taxon term ("NCBI:<taxon id>") cannot be found
#   Function - for every interaction whose type is known to the ontology and which involves at least
#              one core protein, creates an interaction term ("IntAct:<xref>") and links every
#              participant found in $up_map to it; proteins not yet in the ontology are created
#              as "UniProtKB:<AC>" terms
sub work {
	my ($self, $onto, $xpath, $up_map, $up_core_map, $adding_new_terms) = @_;
	# the notion of a 'core protein' is essential for preventing chaining
	# core proteins are defined as those labelled in GOA files with GO biological process terms relevant to the current project (e.g. 'gene regulation')
	# $up_map contains all proteins for the biological taxon in question
	# $up_core_map contains core proteins for ALL the taxa in the project (generated by GoaParser)
	my ( $FH, $parent_protein_name, $taxon_id ) = @{$adding_new_terms};

	# NOTE(review): this reads the ontology object's internal hash directly, as the original code did;
	# an accessor would be preferable if the ontology class offers one - confirm.
	foreach my $rel_type ( 'is_a', 'participates_in', 'has_participant', 'has_source' ) {
		croak "Not a valid relationship type: $rel_type" unless ( $onto->{RELATIONSHIP_TYPES}->{$rel_type} );
	}

	# no system call is involved in these lookups, so $! is deliberately not reported
	my $parent_protein = $onto->get_term_by_name($parent_protein_name)
		|| croak "No term for $parent_protein_name in ontology";
	my $taxon = $onto->get_term_by_id("NCBI:$taxon_id")
		|| croak "No taxon term for $taxon_id in ontology";
	my %proteins; # UP AC => term object

	# Index the interactors once (interactor id => UniProt AC) instead of running a
	# document-wide XPath query for every single participant.
	# The first interactor carrying a given id wins.
	my %acc_for_interactor;
	foreach my $interactor ( $xpath->find("/entrySet/entry/interactorList/interactor")->get_nodelist() ) {
		my $interactor_id = _xpath_string( $interactor, "\@id" );
		$interactor_id =~ s/\A\s+|\s+\z//g;
		next if exists $acc_for_interactor{$interactor_id};
		$acc_for_interactor{$interactor_id} = _xpath_string( $interactor, "xref/primaryRef/\@id" );
	}

	my $int_set = $xpath->find("/entrySet/entry/interactionList/interaction");
	INTERACTION:
	foreach my $interaction ( $int_set->get_nodelist() ) {
		my $int_type = _xpath_string( $interaction, "interactionType/names/shortLabel/text()" ); # interaction type

		my $mi_type = $onto->get_term_by_name_or_synonym($int_type);
		if (!$mi_type) {
			carp "The ontology does not contain the term: '$int_type'\n";
			next INTERACTION;
		}
		my $int_name    = _xpath_string( $interaction, "names/shortLabel/text()" ); # interaction name
		my $int_comment = _xpath_string( $interaction, "names/fullName/text()" );   # interaction full name
		my $xref        = _xpath_string( $interaction, "xref/primaryRef/\@id" );

		# building the hashes
		my %core_interactors; # 'participant IntAct id' => '[0|1]', '1' for core proteins, '0' otherwise
		my %accs;             # 'participant IntAct id' => 'UniProt AC'
		# The participants are read relative to the current interaction node, so no
		# XPath has to be assembled from the interaction id.
		foreach my $participant ( $interaction->find("participantList/participant")->get_nodelist() ) {
			my $part_id = _xpath_string( $participant, "\@id" );               # participant id
			my $int_ref = _xpath_string( $participant, "interactorRef/text()" ); # ref for the interactor
			$int_ref =~ s/\A\s+|\s+\z//g;
			# an unresolvable reference yields an empty accession; such a participant is
			# neither core nor present in $up_map and is skipped further down
			my $acc = exists $acc_for_interactor{$int_ref} ? $acc_for_interactor{$int_ref} : ''; # UniProt accession

			$accs{$part_id}             = $acc;
			$core_interactors{$part_id} = ( contains_key( $up_core_map, $acc ) ) ? 1 : 0;
		} # end of foreach participant; %core_interactors, %accs are now built

		# filtering interactions
		# only interactions containing at least one core protein
		# TODO if the core protein is a prey, only the bait is retained (strict interpretation, as an option);
		#      the experimental role is found under "experimentalRoleList/experimentalRole/names/shortLabel/text()"
		# now may still contain heterologous proteins
		next INTERACTION unless contains_value( \%core_interactors, 1 );

		# creating interaction terms
		my $int_term = OBO::Core::Term->new();
		$int_term->name("$int_name $int_type");
		$int_comment =~ tr/\n\t\r//d; # cleaning the comment lines
		$int_term->comment($int_comment);
		$int_term->id("IntAct:$xref");
		$onto->add_term($int_term);
		$onto->create_rel( $int_term, 'is_a', $mi_type );

		# now individual participants (sorted so that the output order is reproducible):
		foreach my $part_id ( sort keys %accs ) {
			my $ac        = $accs{$part_id};
			my $prot_name = $up_map->{$ac};
			next if ( !$prot_name ); # heterologous protein
			my $protein = $proteins{$ac};
			if ( !defined $protein ) { # the first occurrence of this protein
				$protein = $onto->get_term_by_name($prot_name);
				if ( !defined $protein ) {
					$protein = OBO::Core::Term->new();
					$protein->name($prot_name);
					$protein->id("UniProtKB:$ac");
					$onto->add_term($protein);
					$onto->create_rel( $protein, 'is_a',       $parent_protein );
					$onto->create_rel( $protein, 'has_source', $taxon );
					# the map of added proteins is only written when a handle was supplied
					if ( defined $FH ) {
						print {$FH} "$ac\t$prot_name\n"
							or carp "Failed to write the UniProt map entry for $ac: $!";
					}
				}
				$proteins{$ac} = $protein; # the protein was not yet in the hash
			}
			$onto->create_rel( $protein,  'participates_in', $int_term );
			$onto->create_rel( $int_term, 'has_participant', $protein );
		} # end of foreach participant
	} # end of foreach interaction
	return $onto;
}

# Evaluates the XPath expression $path relative to $node and returns the string value of the result.
sub _xpath_string {
	my ( $node, $path ) = @_;
	return $node->find($path)->string_value();
}

# Tells whether a hash holds a defined value under a given key.
#   Usage    - contains_key($hash_ref, $key)
#   Returns  - 1 if $hash_ref->{$key} is defined, 0 otherwise
#              (a key that is present with an undef value counts as absent)
#   Args     - hash reference, key (string)
sub contains_key {
	my $hash = shift;
	my $key  = shift;
	if ( defined $hash->{$key} ) {
		return 1;
	}
	return 0;
}

# Tells whether any value of a hash equals a given value (compared as strings).
#   Usage    - contains_value($hash_ref, $value)
#   Returns  - 1 as soon as a value whose string form is 'eq' to $value is found, 0 otherwise
#   Args     - hash reference, value to look for
sub contains_value {
	my ( $hash, $value ) = @_;
	for my $candidate ( values %{$hash} ) {
		if ( "$candidate" eq $value ) {
			return 1;
		}
	}
	return 0;
}

1;