# /usr/local/CPAN/Bio-ConnectDots/Bio/ConnectDots/ConnectorSet/IPI.pm
package Bio::ConnectDots::ConnectorSet::IPI;
use strict;
use vars qw(@ISA);
use Bio::ConnectDots::ConnectorSet;
@ISA = qw(Bio::ConnectDots::ConnectorSet);
# DR databases whose identifier is a single field of the line.
# Key:   database tag exactly as it appears after "DR" (trailing ';' included).
# Value: [ dot label, index of the identifier among the fields that follow
#          the database tag ].
my %IPI_DR_SIMPLE = (
    'PRINTS;'             => [ 'PRINTS',    0 ],
    'InterPro;'           => [ 'InterPro',  0 ],
    'Pfam;'               => [ 'pfam',      0 ],
    'ProDom;'             => [ 'ProDom',    0 ],
    'SMART;'              => [ 'SMART',     0 ],
    'PROSITE;'            => [ 'PROSITE',   0 ],
    'HUGO;'               => [ 'HUGO',      1 ],
    'LocusLink;'          => [ 'LocusLink', 0 ],
    'GO;'                 => [ 'GO_id',     0 ],
    'TIGRFAMs;'           => [ 'TIGRFAMs',  0 ],
    'UniProt/Swiss-Prot;' => [ 'SwissProt', 0 ],
);

# DR databases that carry an accession plus an optional "GI:<number>" field.
my %IPI_DR_REFSEQ = (
    'REFSEQ_XP;' => 'REFSEQ_XP',
    'REFSEQ_NP;' => 'REFSEQ_NP',
);

# DR databases for which everything after the database tag is stored verbatim.
my %IPI_DR_VERBATIM = (
    'UniProt/TrEMBL;' => 'TREMBL',
    'RZPD;'           => 'RZPD',
    'Genew;'          => 'Genew',
);

# parse_entry($self)
#
# Reads lines from $self->input_fh and turns one flat-file entry into dots via
# $self->put_dot(LABEL, VALUE).  The two-character line code at the start of
# each line selects the handler (ID, AC, DE, OS, OX, DR, SQ); lines with any
# other prefix, including indented sequence lines, are skipped.
#
# Returns 1 as soon as a '//' terminator line is read while $self->have_dots
# is true; a '//' seen while no dots are pending is skipped.  Returns undef
# when the filehandle is exhausted.
#
# Fields that are missing or empty, and lines whose layout does not match the
# expected pattern, produce no dot at all rather than an undef/empty one.
sub parse_entry {
    my ($self) = @_;
    my $input_fh = $self->input_fh;

  LINE:
    # A lexical line variable keeps the caller's $_ intact.
    while ( defined( my $line = <$input_fh> ) ) {
        chomp $line;
        my $code = substr $line, 0, 2;

        if ( $code eq '//' ) {
            next LINE unless $self->have_dots;
            return 1;
        }
        elsif ( $code eq 'ID' ) {
            my ( undef, $ipi_id ) = split /\s+/, $line;
            next LINE unless defined $ipi_id;
            _ipi_put( $self, 'IPI_id', $ipi_id );

            # \d+ so that multi-digit versions (".10") are stripped as well.
            ( my $unversioned = $ipi_id ) =~ s/\.\d+$//;
            _ipi_put( $self, 'IPI_id_old', $unversioned );
        }
        elsif ( $code eq 'AC' ) {
            my ( undef, @retired ) = split /\s+/, $line;
            foreach my $id (@retired) {
                $id =~ s/;//g;
                _ipi_put( $self, 'Retired_id', $id );
            }
        }
        elsif ( $code eq 'DE' ) {
            if ( $line =~ /^DE\s+(.+)$/ ) {
                my $protein_name = $1;

                # Drops the final character of the description
                # (presumably the closing period -- confirm against the data).
                $protein_name =~ s/.$//;
                _ipi_put( $self, 'Protein_Name', $protein_name );
            }
        }
        elsif ( $code eq 'OS' ) {

            # Only lines of the form "OS  Word Word (anything)." yield a dot;
            # testing the match avoids reading an unset capture.
            if ( $line =~ /^OS\s+(\w+\s+\w+)\s+\(.+\)\./ ) {
                my $organism = $1;
                _ipi_put( $self, 'Organism', $organism );
            }
        }
        elsif ( $code eq 'OX' ) {
            my ( undef, $tax_id ) = split /\s+/, $line;
            next LINE unless defined $tax_id;
            $tax_id =~ s/NCBI_TaxID=//g;
            $tax_id =~ s/;//g;
            _ipi_put( $self, 'NCBI_taxID', $tax_id );
        }
        elsif ( $code eq 'DR' ) {
            _ipi_parse_dr( $self, $line );
        }
        elsif ( $code eq 'SQ' ) {

            # Whitespace-separated fields 2 and 4 of the SQ header line hold
            # the length and the molecular weight.  The sequence lines that
            # follow are intentionally not parsed.
            my @sq = split /\s+/, $line;
            _ipi_put( $self, 'AA_length',        $sq[2] );
            _ipi_put( $self, 'Molecular_Weight', $sq[4] );
        }
    }

    # Input exhausted.  Scalar undef is returned explicitly so the value seen
    # by existing callers is unchanged in every calling context.
    return undef;
}

# Private: handle one "DR" (database cross-reference) line.
# The database tag is the second whitespace-separated field; the fields after
# it are interpreted according to the lookup tables above.
sub _ipi_parse_dr {
    my ( $self, $line ) = @_;
    my ( undef, $db, @fields ) = split /\s+/, $line;
    return unless defined $db;

    if ( my $spec = $IPI_DR_SIMPLE{$db} ) {
        my ( $label, $index ) = @{$spec};
        my $value = _ipi_trim_field( $fields[$index] );
        return unless defined $value;
        $value =~ tr/;//d;

        # Remove the whole "-<digits>" suffix, not just its first digit.
        $value =~ s/-\d+//g if $label eq 'SwissProt';
        _ipi_put( $self, $label, $value );
    }
    elsif ( my $refseq_label = $IPI_DR_REFSEQ{$db} ) {
        my $accession = _ipi_trim_field( $fields[0] );
        $accession =~ s/;$// if defined $accession;
        _ipi_put( $self, $refseq_label, $accession );

        my $gi = $fields[1];
        if ( defined $gi && $gi =~ /GI:/ ) {
            $gi =~ s/^GI://;
            $gi =~ s/;$//;
            _ipi_put( $self, 'GI', $gi );
        }
    }
    elsif ( $db eq 'ENSEMBL;' ) {
        my $peptide = _ipi_trim_field( $fields[0] );
        $peptide =~ s/;$// if defined $peptide;
        _ipi_put( $self, 'ENSEMBL_peptide', $peptide );

        my $gene = $fields[1];
        $gene =~ s/;$// if defined $gene;
        _ipi_put( $self, 'ENSEMBL_gene', $gene );
    }
    elsif ( my $verbatim_label = $IPI_DR_VERBATIM{$db} ) {
        if ( $line =~ /^DR\s+\Q$db\E\s*(.*)$/ ) {
            my $rest = $1;
            _ipi_put( $self, $verbatim_label, $rest );
        }
    }
    return;
}

# Private: return a copy of $value without one trailing ';' or '.' terminator.
# Unlike chop, a value that has no terminator keeps its last character.
# Returns undef when $value is undef.
sub _ipi_trim_field {
    my ($value) = @_;
    return undef unless defined $value;
    $value =~ s/[;.]\z//;
    return $value;
}

# Private: forward ($label, $value) to $self->put_dot unless the value is
# undefined or the empty string.
sub _ipi_put {
    my ( $self, $label, $value ) = @_;
    return unless defined $value && length $value;
    $self->put_dot( $label, $value );
    return;
}
1;