Bio::DOOP::Util::Run::Fuzznuc - Fuzznuc module


Bio-DOOP-DOOP documentation Contained in the Bio-DOOP-DOOP distribution.

Index


Code Index:

NAME

Top

Bio::DOOP::Util::Run::Fuzznuc - Fuzznuc module

VERSION

Top

Version 0.7

SYNOPSIS

Top

   #!/usr/bin/perl -w

   use Bio::DOOP::DOOP;

   $db = Bio::DOOP::DBSQL->connect("user","pass","doop-plant-1_5","localhost");

   @list = ("81001020","81001110","81001200","81001225","81001230","81001290","81001470","81001580","81001610","81001620","81001680");

   $fuzznuc = Bio::DOOP::Util::Run::Fuzznuc->new($db,'500','M',\@list,"/data/DOOP/dummy.txt");

   print $fuzznuc->get_tmp_file_name,"\n";

   $error = $fuzznuc->run('TTGGGC' , 1 , 0);

   if ($error == -1){
      die "No results or error!\n";
   }

   @res = @{$fuzznuc->get_results};

   for $result (@res){
      print $$result[0]->get_id,"| ",$$result[1]," ",$$result[2]," ",$$result[3]," ",$$result[4],"\n";
   }

DESCRIPTION

Top

This module is a wrapper for the EMBOSS (http://emboss.sourceforge.net) program fuzznuc. You can search for patterns in the promoter sequences.

AUTHORS

Top

Tibor Nagy, Godollo, Hungary and Endre Sebestyen, Martonvasar, Hungary

METHODS

Top

new

Create new Fuzznuc object.

Arguments:

1. Bio::DOOP::DBSQL object 2. promoter type (500, 1000, 3000) 3. subset type (depends on reference species) 4. arrayref of clusters 5. temporary file name (default: /tmp/fuzznuc_run.txt, will contain fasta sequences)

  $fuzznuc = Bio::DOOP::Util::Run::Fuzznuc->new($db,500,'M',\@list,'/tmp/tmpfile');

new_by_file

Create new fuzznuc object from query file, containing cluster ids.

Arguments:

1. Bio::DOOP::DBSQL object 2. promoter type (500, 1000, 3000) 3. subset type (depends on reference species) 4. file containing cluster ids 5. temporary file name (default: /tmp/fuzznuc_run.txt, will contain fasta sequences)

  $fuzznuc = Bio::DOOP::Util::Run::Fuzznuc->new($db,500,'M','/tmp/clusters.txt','/tmp/tmpfile');

new_by_tmp

Create new fuzznuc object from existing temporary file, containing query sequences in fasta format.

Arguments:

1. Bio::DOOP::DBSQL object 2. file containing fasta sequences

  $fuzznuc = Bio::DOOP::Util::Run::Fuzznuc->new($db,'/tmp/sequences.fasta');

get_tmp_file_name

Get the temporary file name.

   $tempname = $fuzznuc->get_tmp_file_name;

get_emboss_version

Get the installed emboss version.

   $version = $fuzznuc->get_emboss_version;

run

Runs fuzznuc, returns 0 on success, otherwise -1.

Arguments :

1. query pattern 2. mismatch number 3. complement (0 or 1)

   $fuzznuc_error = $fuzznuc->run('AACCAGGTT','1','1');

run_background

Runs fuzznuc in background, returns the process id.

Arguments :

1. query pattern 2. mismatch number 3. complement (0 or 1) 4. output filename

   $fuzznuc_pid = $fuzznuc->run_background('AACCAGGTT','1','1','/tmp/fuzznuc_result.txt');

get_raw_results

Returns an arrayref of arrays with the raw fuzznuc results, without Bio::DOOP objects. This is much faster as it does not use the database.

The results contain the following:

1. sequence ID 2. hit start 3. hit end 4. mismatch number 5. hit sequence 6. hit strand

   @result = @{$fuzznuc->get_raw_results};

get_results

Returns an arrayref of arrays with sequence objects and other information of the results.

The results contain the following:

1. Bio::DOOP::Sequence object 2. hit start 3. hit end 4. mismatch number 5. hit sequence 6. hit strand

   @result = @{$fuzznuc->get_raw_results};

get_results_from_file

Returns an arrayref of arrays with sequence objects and other information of the results from a results file. With this method you can fetch the results of different fuzznuc objects.

The results contain the following:

1. Bio::DOOP::Sequence object 2. hit start 3. hit end 4. mismatch number 5. hit sequence 6. hit strand

   @result = @{$fuzznuc->get_results_from_file};


Bio-DOOP-DOOP documentation Contained in the Bio-DOOP-DOOP distribution.
package Bio::DOOP::Util::Run::Fuzznuc;

use strict;
use warnings;
use Carp qw(cluck carp verbose);

our $VERSION = '0.7';

sub new {
  my $self                 = {};
  my $dummy                = shift;
  my $db                   = shift;
  my $promo_type           = shift;
  my $subset_type          = shift;
  my $cluster_id_list      = shift;
  my $tmp_filename         = shift;

  # TODO use File::Temp module
  if (!$tmp_filename) { $tmp_filename = "/tmp/fuzznuc_run.txt" }
  open TMP,">$tmp_filename";
  for my $cl_id (@{$cluster_id_list}){
     my $cl = Bio::DOOP::Cluster->new($db,,$cl_id,$promo_type);
     if ($cl == -1){ next }
     my $subset = $cl->get_subset_by_type($subset_type);
     if ($subset == -1){ next }
     my @seqs = @{$subset->get_all_seqs};
     for my $seq (@seqs){
        print TMP ">",$seq->get_id,"\n";
	print TMP $seq->get_raw_seq,"\n\n";
     }
  }
  close TMP;
  $self->{DB}              = $db;
  $self->{CLLIST}          = $cluster_id_list;
  $self->{TMP_FILE}        = $tmp_filename;

  bless $self;
  return($self);
}

sub new_by_file {
  my $self                 = {};
  my $dummy                = shift;
  my $db                   = shift;
  my $promo_type           = shift;
  my $subset_type          = shift;
  my $filename             = shift;
  my $tmp_filename         = shift;
  my @cluster_id_list;

  # TODO use File::Temp module
  if (!$tmp_filename) { $tmp_filename = "/tmp/fuzznuc_run.txt" }

  open CLUSTER_ID_FILE,$filename or cluck("No such file or directory!\n");
  open TMP,">$tmp_filename" or cluck("Can't write to the temporary file!\n");
  while(<CLUSTER_ID_FILE>){
     chomp;
     my $cl_id = $_;
     push @cluster_id_list,$cl_id;
     my $cl = Bio::DOOP::Cluster->new($db,,$cl_id,$promo_type);
     my $subset = $cl->get_subset_by_type($subset_type);
     if ($subset == -1) { next }
     my @seqs = @{$subset->get_all_seqs};
     for my $seq (@seqs){
        print TMP ">",$seq->get_id,"\n";
	print TMP $seq->get_raw_seq,"\n\n";
     }
  }
  close CLUSTER_ID_FILE;
  close TMP;

  $self->{DB}              = $db;
  $self->{CLLIST}          = \@cluster_id_list;
  $self->{TMP_FILE}        = $tmp_filename;

  bless $self;
  return($self);
}

sub new_by_tmp {
  my $self                 = {};
  my $dummy                = shift;
  my $db                   = shift;
  my $tmp_filename         = shift;

  $self->{DB}              = $db;
  $self->{TMP_FILE}        = $tmp_filename;
      
  bless $self;
  return($self);
}

sub get_tmp_file_name {
  my $self                 = shift;
  return($self->{TMP_FILE});
}

sub get_emboss_version {
  my $self                 = shift;
  return($self->{EMBOSSVER});
}

sub run {
  my $self                 = shift;
  my $pattern              = shift;
  my $mismatch             = shift;
  my $complement           = shift;

  my $file = $self->{TMP_FILE};

  my @result = `fuzznuc $file -pattern='$pattern' -sformat=fasta -pmismatch=$mismatch -complement=$complement -stdout -auto`;
  
  my $seq_id;
  my $start;
  my $end;
  my $mism;
  my $hitseq;
  my @parsed;
  my $strand;

  if ($#result == -1) { return(-1) } #No results or an error happened.
  for my $line (@result){
     if ($line =~ / Sequence: (\S+)/){
        $seq_id = $1;
     }
     if ($line =~ /\s+(\d+)\s+(\d+)\s+(\w+)\s+([0123456789.]+)\s+(\w+)/){
        $start  = $1;
	$end    = $2;
	$mism   = $4;
	$hitseq = $5;
	$mism =~ s/\./0/;
	$strand = $start < $end ? 1 : -1;
	push @parsed, "$seq_id $start $end $mism $hitseq $strand";
     }
  }

  $self->{RESULT} = \@parsed;
  return(0);
}

sub run_background {
  my $self                 = shift;
  my $pattern              = shift;
  my $mismatch             = shift;
  my $complement           = shift;
  my $outfile              = shift;
  my $file = $self->{TMP_FILE};
  my $pid;

  unless($pid = fork){
     `fuzznuc $file -pattern='$pattern' -pmismatch=$mismatch -sformat=fasta -complement=$complement -outfile=$outfile`;
  }

  return($pid);
}

sub get_raw_results {
  my $self                = shift;

  my @fuzznuc_res;
  my $res = $self->{RESULT};
  my $seq_id;
  my $start;
  my $end;
  my $mism;
  my $hitseq;
  my $strand;

  for my $line (@{$res}){
     ($seq_id,$start,$end,$mism,$hitseq,$strand) = split(/\s+/,$line);

     push @fuzznuc_res,[$seq_id,$start,$end,$mism,$hitseq,$strand];
  }

  return(\@fuzznuc_res);
}

sub get_results {
  my $self                = shift;

  my @fuzznuc_res;
  my $res = $self->{RESULT};
  my $seq_id;
  my $start;
  my $end;
  my $mism;
  my $hitseq;
  my $strand;

  for my $line (@{$res}){
     ($seq_id,$start,$end,$mism,$hitseq,$strand) = split(/\s+/,$line);
    
     my $seq = Bio::DOOP::Sequence->new_from_dbid($self->{DB},$seq_id);
     push @fuzznuc_res,[$seq,$start,$end,$mism,$hitseq,$strand];
  }

  return(\@fuzznuc_res);
}

sub get_results_from_file {
  my $self                 = shift;
  my $filename             = shift;

  my $seq_id;
  my $start;
  my $end;
  my $mism;
  my $hitseq;
  my @parsed;
  my $strand;

  open FILE, $filename or return(-1);
  while(<FILE>){
     chomp;
     my $line = $_;
     if ($line =~ / Sequence: (\S+)/){
        $seq_id = $1;
     }
     if ($line =~ /\s+(\d+)\s+(\d+)\s+(\w+)\s+([0123456789.]+)\s+(\w+)/){
        $start  = $1;
	$end    = $2;
	$mism   = $4;
	$hitseq = $5;
	$mism =~ s/\./0/;
	$strand = $start < $end ? 1 : -1;
        my $seq = Bio::DOOP::Sequence->new($self->{DB},$seq_id);
	push @parsed, [$seq,$start,$end,$mism,$hitseq,$strand];
     }
  }
  close FILE;

  $self->{RESULT} = \@parsed;
  return(\@parsed);
}

1;