/usr/local/CPAN/Bioinf/Bioinf.pm
package Bioinf;
require 5.000;
require Exporter;
use Carp;
@ISA = qw(Exporter);
@EXPORT= qw( ISS_server ONE_TO_THREE_LETTER One_To_Three_Letter Richardson_alpha_matrix
Roman Sheraga_alpha_matrix abs_numerically add_columns
add_ranges_in_msp_line amino_acid_compos_id_percent amino_acid_compos_id_percent_trend amino_acid_homology_matrix
arabic array_average array_chk array_least_occur
array_median array_most_occur array_sum ask_for_ENV_vars
assign_options_to_variables attach_classification_to_pdb_seq average_array average_of_array
beep bla_to_msf break_down_clu_file by_values
calc_compos_id_hash calc_factorial calculate_protein_volume capitalize_sentence
capitalize_word cc check_common_elements_in_array check_file_exists_in_path
check_homology_of_seq_pair check_if_defined check_if_files_exist check_if_sec_str_form_hash
check_input_file_extension check_linkage_of_2_similar_seqlet_sets check_parf_files chop_word
cls clu_to_sso_to_msp cluster_merged_seqlet_sets com_gap_pos_hash
common_compos_2_hash common_compos_id_hash compare_sec_template_with_db compos_id_percent_array
compos_id_percent_hash composition_table compress_files_by_gzip condense_number_string
condense_script convert_1_to_3_letter convert_3_to_1_letter convert_arr_and_str_2_hash
convert_array_to_hash convert_bla_multaln_to_msf convert_bla_to_msf convert_bla_to_msp
convert_char_to_0_or_1_hash convert_clu_to_msp convert_clu_to_sso_to_msp convert_dna_to_protein
convert_hmmls_to_msp_files convert_mmp_to_mrg convert_msp_line_to_mmp_line convert_num_0_or_1_hash_opposite
convert_num_to_0_or_1_hash convert_rna_to_protein convert_sso_to_msp convert_string_to_hash
convert_to_anti_sense corelation_coefficient correct_head_box count_num_of_char
cp create_sorted_cluster ctime default_help
define_secondary_structure_segments delbut detect_file_format_type die_if_file_not_present
diff_dates digitize_char dir_name dir_path
dir_search dir_search_single divide_array divide_clusters
divide_string do_hmm_sequence_search do_intermediate_sequence_search do_psi_blast_search
do_self_blastp_search do_sequence_search do_window_scan encrypt_passwd
exchange_query_with_match_in_msp exchange_query_with_match_in_msp extract_num_to_array extract_ori_seq
extract_words fasta_append fasta_kt1_search fasta_out_seq_no
fasta_output fasta_permute_array_write fasta_permute_hash_write fetch_seq
fetch_sequence_from_db fetch_subroutines fetch_swiss_seq file_size
fill_ending_space filter_by_string_length filter_hash_by_num_value filter_intermediates_by_E_value
filter_seq_DB_by_seq_length find_central_seq_msp_chunk find_central_sequence find_low_complexity_region
find_program_in_path find_seq_file_old find_seq_files find_source_perl_library
follow_seqlet_link fromJulian full_pwd_path geanfammer
geanfammer_main get_added_matched_regions_in_msp get_all_dirs_from_ENV get_all_msp_files
get_av_and_sd_seq_length get_av_seq_length get_average_sequence_size get_averaged_prediction
get_base_names get_column get_common_array_entry get_common_column
get_common_hash_keys get_correct_percent_alignment_rate get_date get_dir_names_only
get_domain_inside_domain get_each_posi_diff_hash get_extension_names get_false_positive_seq_matches
get_file_dir_names get_file_extensions get_first_seq_in_alignment get_full_dir_names
get_full_file_name get_full_path_dir_names get_full_pwd_path get_gap_positions
get_hash_value_average get_high_score_blocks get_homology_info_of_seq_pairs get_host_by_addr
get_host_by_name get_id_among_2 get_id_among_2_1 get_id_among_2_2
get_internal_dup_in_a_cluster get_isearch_result_stat get_largest_element get_largest_file
get_linked_sequence get_linux_kernel_version get_longest_str_size get_max_hash_by_value
get_median get_median get_msp_enquiry_sequence get_msp_matched_sequence
get_msp_range get_multiple_array_entry get_occurances_of_char get_occurances_of_shift_type_hash
get_occurances_of_shift_type_hash_all get_overlapping_range get_overlapping_seq_match_size get_pair_homol_array
get_pair_homol_hash get_path_dirs_from_ENV get_pdb_file_start_number get_peptide_occurance
get_percent_homo_hash get_percent_homol_arr get_percentage get_perl_keywords
get_posi_diff get_posi_diff_abs get_posi_diff_and_rms_hash get_posi_diff_hash
get_posi_rates_hash_out get_posi_rates_hash_out get_posi_rates_hash_out_compact get_posi_rates_hash_out_jp
get_posi_rates_hash_out_msf get_posi_sans_gaps get_posi_shift_hash get_posi_shift_hash_rms
get_posi_shift_rate get_posi_shift_rms_hash get_posi_shift_rms_whole get_position_shift_rate
get_probable_half get_pwd_dir get_pwd_dir_name get_residue_error_rate
get_scop_correcting_pairs get_sd_of_length_diff get_segment_shift_rate get_seq_fragments
get_seq_hash_sans_gaps get_seq_identity get_seqblock get_sequence_complexity
get_sequence_number get_shortest_str_size get_smallest_file get_stat_FASTA_search_result_in_msp_0_files
get_sub_hash get_subdir_names get_subroutine_calls get_time
get_total_memory_size_in_linux get_unix_shell_name get_whole_pwd_path get_windows_cs_rate_array
get_windows_sc_rate_array get_wrong_segment_rate handle_arguments handle_arguments_old
hash_average hash_catenate hash_chk hash_common
hash_common2 hash_common_by_keys hash_no_common hash_output_chk
hash_stat_for_all hash_substract_by_keys hostname if_file_older_than_x_days
import_ENV_vars initialize_code insert_gaps_in_seq_hash insert_lines_anywhere
interm_lib_search is_html isroman key_ready
link_ranges load_mount_info mail_it make_2D_aa_residue_matrix_array
make_2D_identity_matrix make_2D_identity_matrix_array make_6_frame_dna_sequences make_cdf_file
make_clustering_summary make_composition_ratio_table make_composition_ratio_table_simple make_composition_table
make_fasta_files_from_msp_1_files make_filtered_list make_hmm_from_alignment make_intermediate_sequence_library
make_one_array make_pairs_from_hash make_random_sequence make_reverse_seq_database
make_scrambled_seq_database make_seq_alignment_length_even make_seq_index_file make_sequence_match_table
make_singlet_list_from_pdb_entries make_standalone_subroutines make_swiss_index make_template_from_sec_str
max max_elem_array max_elem_string_array max_elem_string_array
max_str_key_hash max_str_value_hash maximum merge_array
merge_arrays_by_common_elements merge_hash merge_many_arrays merge_sequence_alignments
merge_sequence_in_msp_chunk merge_sequence_in_msp_file merge_similar_ranges merge_similar_seqlets
merge_superfam_fasta_files_for_ISL min min_elem_array min_str_key_hash
min_str_value_hash minimum msf_permute_array_write msf_permute_hash_write
msp_single_link_hash mv n normalize_numbers
numerically occurances one_to_three_letter open_ali_files
open_aln_files open_brk_files open_cel_files open_clu_files
open_dna_files open_dssp_files open_embl_files open_fasta_files
open_fil_file open_hlx_files open_hmmfs_files open_hmmls_files
open_jp_files open_lottery_file open_msf_files open_msf_jp_files
open_msp_files open_out_files open_pdb_files open_pdbg_files
open_phd_files open_pir_files open_predator_files open_rms_files
open_rms_files2 open_sdb_files open_self open_seq_alignment_files
open_seq_files open_sequence_index_files open_slx_files open_sso_files
open_sst_files open_sst_files_with_gap open_stride_dat_files open_stride_dat_files
open_subdir_and_go_in_and_do open_swissprot_seq_files open_tem_files opendir_and_go
opendir_and_go_in_and_do_something opendir_and_go_rand_fasta opendir_and_go_rand_fasta_and_clustal overlay_seq_by_certain_chars
overlay_seq_for_identical_chars overlay_seq_hash pair_percent_id_trend pairwise_iden_pos
pairwise_percent_id parse_arguments permute permute_binary
pick_random_files pick_random_hash_pairs pir_permute_array_write pir_permute_hash_write
pir_write pir_write plot_histogram_horizontally plot_vertically
print_clusfile_from_hash print_in_block print_seq_in_block print_seq_in_block_old
print_seq_in_block_with_print print_seq_in_columns produce_random_numbers push_if_not_already
push_if_not_already put_gaps_every_x_position_in_string put_gaps_every_x_position_in_string_special put_gaps_in_hash
put_msp_lines_to_hash_from_bla put_position_back_to_str_seq put_slash_before_special_chars pwd_dir_name
pwd_path rand_DNA_seq_generate rand_RNA_seq_generate rand_sequence_mul_array
rand_sequence_one_array rand_sequence_one_string rand_word randomise_file_contents
randomise_lines read_all_head_boxes read_any_dir read_any_dir2
read_any_dir_for_dir read_any_dir_simple read_any_seq_files read_blast_hits
read_correct_head_box read_dir_names_only read_file_extension_names_only read_file_names_only
read_first_head_box read_fssp_files read_full_dir_names read_head_box
read_head_box2 read_head_boxes read_hssp_no_inserts read_machine_readable_sso_lines
read_machine_unreadable_sso_lines read_option_table read_seq_matrix_files read_sso_lines
read_sst_files read_subroutines remov_com_column remov_com_column2
remov_common_gap remove_dup_in_array remove_dup_in_hash remove_dup_match_in_msp_files
remove_dup_seq_entry remove_elements_by_name remove_elements_by_pattern remove_elements_by_position
remove_file_extension remove_mail_header_in_files remove_non_char remove_repetitives_in_array
remove_similar_seqlets remove_small_files remove_text replace_lines
replace_subroutines replace_text reset_all_the_vars reset_shell_environment
rev_abs_numerically rev_lines_pdb rev_sequence_mul_array reverse_hash
reverse_sequences roman rotate_seq round
round_number round_numbers run_fasta_sequence_search scale_for_horizontal_histogram
scan_win_and_get_cs_rate_pairs scan_win_and_get_sc_rate_pairs scan_win_get_average scan_window_and_calc_average
scan_window_and_calc_something scan_windows_and_get_compos_seqid_rate scoring scramble_array
scramble_sequences sd se search_files_in_subdir
search_palindromes search_self self_self_search send_mail
sep seq_comp_percent1 seq_comp_percent2 seq_id_percent_array
seq_to_regexp set_debug set_debug_option set_special_options
shift_word shift_word_recursively show_array show_default_help
show_hash show_in_fasta show_options show_subclusterings
smaller_one sort_by_E_values sort_by_cluster_size sort_by_column
sort_by_column_bigger_first sort_by_digits_in_string sort_by_hash_values sort_by_keys
sort_files_by_size sort_files_by_time sort_hash_by_keys sort_hash_by_value
sort_hash_by_value_and_make_array sort_hash_value_by_column sort_string_by_length sort_words_in_string
split_fasta_files split_file_by_string split_files split_sequence
sqrt_array square_array sso_to_msp ssp_permute_array_write
ssp_permute_hash_write ssp_write steve_permute_array strip_rotated_seq
strip_sequence_ranges substract_array substract_hash_by_keys substract_hash_by_values
sum_array sum_digits_in_string sum_hash_values sum_hash_values_of_string
sum_of_array sum_of_squared_array sum_x_mul_y_arrays superpose_hash
superpose_seq_hash take_file_name takeout_subroutines tally_2_hashes
tell_seq_length tempname three_to_one_letter tidy_secondary_structure_segments
time_date toJulian transform_values trim_numbers
update_subroutines weighted_av weighted_average word_wrap
write_aln_files write_ardf_files write_c3al_files write_c3ss_files
write_dof_files write_evss_files write_fasta write_fasta_array
write_fasta_seq_by_seq write_gcg_file write_gcg_genbank_file write_genbank_file
write_good_bad_list_in_divide_clusters write_head_box write_html_headbox write_iss_file
write_jp write_modeller_ali_file write_modeller_top_file write_mprf_files
write_msf write_msp3_files write_msp_files write_nhco_files
write_parf_files write_pdbg_files write_pir_file write_prdl_files
write_pred_files write_primer_file write_rdif2_files write_rdif_files
write_reverse_seq_files write_sdb_file write_seq_files write_staden_file
write_subroutines x_mul_y_arrays );
#!/usr/bin/perl
#______________________________________________________________________
# Title : Bio::Bioinf (for Bio Perlogical) or bio_lib.pl
# Usage : require "Bioinf.pl"; ##<-- This is very slow, so you'd better
# use Bio::Bioinf; <-- When you have Bioinf.pm module installed under Bio
# or
# use Bioinf;
#
# or, copy the subroutines in your prog. or make a smaller lib files
# which are classified according to functions(like, Bio_Seq.pl
# for sequence handling, Bio_Array.pl for various array
# subroutines..), or make your own module out of this, do whatever
# you want....
#
# Function : 1) This is a comprehensive perl subroutine library developed
# under Bioperl project and others.
# URL: http://cyrah.med.harvard.edu/Bioperlsub/
#
# 2) The design of this module is for simple layer biological
# module than multilevel object oriented module.
#
# 3) This also serves as the depository database for various
# perl subroutines or algorithms developed in
# Bioinformatics and Genome projects.
#
# 4) You can copy any of the sub routines in this file, modify, use
# in yours...
# PLEASE MODIFY AS FREELY AS YOU WANT !! All has the
# same PERL copyright
#
# 5) All the subroutines are tested in small files
# If you want to have such single example program
# to see how they really work, pls contact me( A Biomatic )
# For example, a file called 'handle_arguments.pl' exists to
# test the subroutine 'handle_arguments'. Usually you can find them
# in http://cyrah.med.harvard.edu/Bioinf.pl.html
#
# Example : require "Bioinf.pl"; BUT, I recommand you take subroutines out and
# use it directly or modify in your programs.
# use Bioinf;
#
# Warning : For the enhancement of Biology, Biomatics, and Science.
# This is a development companion.
# Class is for classification of my subroutines. If it is Bio, it can
# be useful for biological sequence data handling. If it's Utility,
# it can also be used for general purpose file handling stuff.
# File, Array, Hash,... are my classification items.
# Keywords : Biology, perl library, sequence handling lib
# Options : nothing (used as subroutine library or as Bioinf.pm module)
#
# Author : J. Park, Jason Johnson, Sarah Teichmann, Alex Bateman,
# Astrid Reinhardt, and anybody contributed.
# jong@salt2.med.harvard.edu
# Category : Bioinf
# Version : 2.0 (Aug/1/1998)
#------------------------------------------------------------------
print "\n ################################################################\n";
print " # #\n";
print " # Using Bioperl subroutine Module for Bioinformatics & Biology #\n";
print " # #\n";
print " ################################################################\n";
#&parse_arguments(1);
## The following box is used as the header for any subroutines developed to
## give information on the subroutines. It is used by Jong as a template.
#______________________________________________________________________________
# Title :
# Usage :
# Function :
# Example :
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
## Following variables in 'my' are very commonly used ones. I have
## put those to be copied into any new subroutines to be developed
## This is because, in Perl, every variable is global unless you mark them
## to be inside the subroutines. Many BUGs are coming from not localizing vars.
## This array variables are used as a defalt insertion for the subroutine
## 'handle_arguments'. If you add this box in any sub, 'handle_arguments'
## subroutine will be called and any arguments passed to the subroutine will
## be classified to file, dir, string, hash(as reference), array(as reference),
## pure number, or option(with -) prefix. etc. For more detail look at
## handle_argument's header.
sub IGNORE_THIS{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
}
#________________________________________________________________________
# Title : handle_arguments
# Usage : Just put the whole box delimited by the two '###..' lines below
# to inside of your subroutines. It will call 'handle_arguments'
# subroutine and parse all the given input arguments.
# To use, claim the arguments, just use the variable in the box.
# For example, if you had passed 2 file names for files existing
# in your PWD(or if the string looks like this: xxxx.ext),
# you can claim them by $file[0], $file[1] in
# your subroutine.
# Function : Sorts input arguments going into subroutines and returns default
# arrays of references for various types (file, dir, hash, array,,,,)
# If you give (\@out, @file), it will put @out into @array as a ref
# and also the contents of @out will be dereferenced and put to
# raw_string regardless what is in it).
#
# Example : 'handle_arguments(\@array, $string, \%hash, 8, 'any_string')
# Warning :
# Keywords : handling arguments, parsing arguments,
# Options :
# Returns : Following GLOBAL variables
#
# $num_opt, @num_opt @file @dir
# $char_opt, @char_opt %vars @array,
# @hash @string, @raw_string @range,
#
# $num_opt has 10,20
# @num_opt has (10, 20)
# @file has xxxx.ext
# @dir has dir or /my/dir
# $char_opt has 'A,B'
# @char_opt has (A, B)
# @array has (\@ar1, \@ar2)
# @hash has (\%hash1, \%hash2)
# @string ('sdfasf', 'dfsf')
# @raw_string (file.ext, dir_name, 'strings',,)
# @range has values like 10-20
# %vars deals with x=2, y=3 stuff.
#
# Argument : any type, any amount
# Category : general programming
# Version : 4.8
#--------------------------------------------------------------------
sub handle_arguments{
my($c, $d, $e, $f, $i, $j, $k, $l, $s, $t, $x, $y, $z, $char_opt, $dir, @hash,
$file, $in_dir, $num_opt, @char_opt, @dir, @file, @string, @file_dir, @k,
@num_opt, @raw_string,@string, @array, %vars, @range, @temp, $temp,
@char_options);
&set_debug_option;
if(@_<1){ print chr(7),"\n This is handle_arguments. No args Passed, Error?\n"}
elsif( (@_ ==1)&& (ref($_[0]) eq 'ARRAY') ){ # when there is only 1 argument
push(@array, $_[0]);
push(@k, $_[0]);
}elsif( (@_==1)&&( !ref($_[0]) ) ){
if(-f $_[0]){ push(@file, $_[0]); push(@string, $_[0]) }
elsif(-d $_[0]){ push(@dir, $_[0]); push(@string, $_[0]) }
elsif($_[0]=~/^\d+$/){ push(@num_opt, $_[0]); $num_opt.=$_[0] }
elsif($_[0]=~/^\w+$/){ push(@string, $_[0]); }
}elsif(@_ >=1){ @k = @_ }
#####______Start of general argument handling______######
for($k=0; $k < @k ;$k++){
if( !ref($k[$k]) ){
if($k[$k]=~ /^[\-]?([a-zA-Z]\d*) {0,5}$/){ push(@char_opt, $1); $char_opt .= "$1\,";
}elsif($k[$k]=~ /^\-([a-zA-Z]+)$/){ ## When multiple option is given,
@char_options = split(/\,|/, $1); push(@char_opt, @char_options);
$char_opt .= join("\,", @char_options); ## '-' should be used. eg. '-HEGI'
}elsif($k[$k]=~ /^(\w+)\=(\S* *)$/){ $vars{$1}=$2; $vars .= "$1\,";
}elsif($k[$k]=~ /^(\-?\d+)$/){ push(@num_opt, $1); $num_opt .= "$1\,";
}elsif($k[$k]=~ /^\d+\.?\d*\-\d+\.?\d*$/){ push(@range, $k[$k] );
}elsif(-f $k[$k]){ push(@file, $k[$k] );
}elsif(-d $k[$k]){ push(@dir, $k[$k] );
}elsif($k[$k]=~ /\/[\w\d\.\-]+[\/].+[\/]$/){push(@dir, $k[$k] );
}elsif($k[$k]=~ /^\/[\w\d\.\-]+[\/]*$/){ push(@dir, $k[$k] );
}elsif($k[$k]=~ /^[\/\w\d\-\.]+\.\w+$/){ push(@file, $k[$k] );
}elsif($k[$k]=~ /\S\/[\/\w\d\-\.]+\.\w+$/){ push(@file, $k[$k] );
}elsif($k[$k]=~/^\w+[\/\\\w\d\.\-]+$/){ push(@string, $k[$k] );
# string does not have space, but includes '\', '/', '.'
}else{ push(@raw_string, $k[$k] ); }
}elsif( ref($k[$k]) ){
if( ref($k[$k]) eq "SCALAR"){
if(${$k[$k]} =~ /^[\-]?([a-zA-Z]\d*) {0,5}$/){ push(@char_opt, $1); $char_opt .= "$1\,";
}elsif(${$k[$k]}=~ /^\-([a-zA-Z]+)$/){ push(@char_opt, @char_options);
$char_opt .= join("\,", @char_options); ## as an option string.
}elsif(${$k[$k]}=~ /^(\w+)\=(\S* *)$/){ $vars{$1}=$2; $vars .= "$1\,";
}elsif(${$k[$k]}=~ /^(\-?\d+)$/){ $num_opt .= "$1\,"; push(@num_opt, $1);
}elsif(${$k[$k]}=~ /^\d+\.?\d*\-\d+\.?\d*$/){ push(@range, $k[$k] );
}elsif(-f ${$k[$k]}){ push(@file, ${$k[$k]} );
}elsif(-d ${$k[$k]}){ push(@dir, ${$k[$k]} );
}elsif(${$k[$k]}=~ /\/[\/\w\d\.\-]+[\/].+[\/]/){ push(@dir, ${$k[$k]} );
}elsif(${$k[$k]}=~/^\/[\/\w\d\.\-]+[\/]*$/){ push(@dir, ${$k[$k]} );
}elsif(${$k[$k]}=~ /^[\/\w\d\-\.]+\.\w+$/){ push(@file, ${$k[$k]} );
}elsif(${$k[$k]}=~/^\w+[\w\d\.\-]+$/){ push(@string, ${$k[$k]} );
}else{ push(@raw_string, ${$k[$k]}); }
}elsif(ref($k[$k]) eq "ARRAY"){ my @temp_arr = @{$k[$k]}; push(@array, $k[$k]);
for ($i=0; $i<@temp_arr; $i++){
if(-f $temp_arr[$i]){ push(@file, $temp_arr[$i]);
}elsif($temp_arr[$i]=~/^\d+\.?\d*\-\d+\.?\d*$/){ push(@range,$temp_arr[$i] );
}elsif(-d $temp_arr[$i]){ push(@dir , $temp_arr[$i]);
}elsif($temp_arr[$i]=~/\/[\/\w\d\.\-]+[\/].+[\/]/){ push(@dir, $temp_arr[$i] );
}elsif($temp_arr[$i]=~/^\/[\/\w\d\.\-]+[\/]*$/){ push(@dir, $temp_arr[$i] );
}elsif($temp_arr[$i]=~/^[\/\w\d\-\.]+\.\w+$/){ push(@file,$temp_arr[$i] );
push(@string,$temp_arr[$i] );
}elsif($temp_arr[$i]=~/^\w+[\w\d\.\-]+$/){ push(@string,$temp_arr[$i]);
}else{ push(@raw_string, $temp_arr[$i]); }
}
}elsif(ref($k[$k]) eq "HASH"){ push(@hash, $k[$k] ); }
}
}
@raw_string=(@raw_string, @string);
@file = @{&remove_dup_in_arrayH(\@file)};
#-----------------------------------------------------
sub remove_dup_in_arrayH{ my($i, @nondup, @out_ref, %duplicate, @orig, @out_ref);
for($i=0; $i<@_; $i++){ undef(%duplicate);
if(ref($_[$i]) eq 'ARRAY'){ @orig = @{$_[$i]}; }
@nondup = grep { ! $duplicate{$_}++ } @orig; push(@out_ref, \@nondup);
}
if(@out_ref ==1){ return($out_ref[0]);}
elsif(@out_ref >1){ return(@out_ref); }
}
#-----------------------------------------------------
return(\@hash, \@array, \@string, \@dir, \@file, \@num_opt,
\@char_opt, \$num_opt, \$char_opt, \@raw_string, \%vars, \@range );
}
#________________________________________________________________________
# Title : set_debug_option
# Usage : &set_debug_option;
# Function : If you put '#' or '##' at the prompt of any program which uses
# this sub you will get verbose printouts for the program if the program
# has a lot of comments.
# Example : set_debug_option # <-- at prompt.
# Warning :
# Keywords :
# Options : # for 1st level of verbose printouts
# ## for even more verbose printouts
# $debug becomes 1 by '#' or '_'
# $debug2 becomes 1 by '##' or '__'
#
# Returns : $debug
# Argument :
# Category :
# Version : 1.8
#--------------------------------------------------------------------
sub set_debug_option{
my($j, $i, $level);
unless( defined($debug) ){
for($j=0; $j < @ARGV; $j ++){
if( $ARGV[$j] =~/^(_+)$|^(#+)$/){ # in bash, '#' is a special var, so use '_'
print __LINE__," >>>>>>> Debug option is set by $1 <<<<<<<<<\n";
$debug=1;
print chr(7);
print __LINE__," \$debug is set to ", $debug, "\n";
splice(@ARGV,$j,1); $j-- ;
$level = length($1)+1;
for($i=0; $i < $level; $i++){
${"debug$i"}=1;
print __LINE__," \$debug${i} is set to ", ${"debug$i"}, "\n";
}
}
}
}
}
#__________________________________________________________________________
# Title : sort_by_E_values
# Usage : @out=@{&sort_by_E_values(\@input_line_array)};
# Function : it sorts by the 2nd column(E-value, in msp file), small comes top
# Example :
# Keywords : sort_by_2nd_column, sort_by_second_column, sort_by_e_values,
# sort_by_evalues,
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#----------------------------------------------------------------------------
sub sort_by_E_values{
my (@in);
if(ref $_[0] eq 'ARRAY'){
@in = @{$_[0]};
}else{
@in = @_;
}
@in= map {$_->[0]} sort { $a->[1] <=> $b->[1] } map {/^ {0,3}\S+ +(\S+)/ && [$_, $1] } @in;
return(\@in);
}
#__________________________________________________________________________
# Title : sort_hash_value_by_column
# Usage : @out=@{&sort_by_column(\%input_line_hash, <column num>)};
# Function : it sorts values of hash by the given column , small comes top. Unless number is
# is given, it sorts by the first column.
# It returnns ARRAY of the keys of the input HASH!!!
#
# It can handle gzipped file. It called gunzip to open and sort.
#
# Example : Above will sort the file xxxx.msp by its 3rd column(numerically)
# small numbers will come to the top.
# Keywords : sort_by_2nd_column, sort_by_second_column, sort_by_e_values,
# sort_by_evalues, sort_hash_by_column, sort_value_by_column,
# Options :
# s for sorting stringwise
# d for sorting by digit
# n for sorting by digit(numerically)
# numerically an alias of n
#
# Category :
# Version : 1.1
#----------------------------------------------------------------------------
sub sort_hash_value_by_column{
my (%in, $i, $col, $sort_numerically, $sort_non_numerically, @keys);
$sort_numerically=1;
if(@_ < 2 ){ print "\n# FATAL: sort_by_column needs 2 arguments\n"; exit }
for (@_){
if(ref $_ eq 'HASH'){ %in =%{$_}; }
elsif( ref $_ eq 'SCALAR'){ $col=${$_}; }
elsif(/^\d+$/){ $col=$_ }
elsif(/^ *[nd] *$/i){ $sort_numerically=1; $sort_non_numerically=0; }
elsif(/^ *n[umerically]* *$/i){ $sort_numerically=1; $sort_non_numerically=0; }
elsif(/^ *s *$/i){ $sort_non_numerically=1; $sort_numerically=0; }
}
$col--;
@keys= keys %in;
if($sort_numerically ){
@keys= map {$_->[0]} sort { $a->[1] <=> $b->[1] } map { [$_, ($in{$_}=~/(\S+)/g)[$col] ] } @keys;
}else{ # here let's do the sring sort
@keys= map {$_->[0]} sort { $a->[1] cmp $b->[1] } map { [$_, ($in{$_}=~/(\S+)/g)[$col] ] } @keys;
}
return(\@keys);
}
#__________________________________________________________________________
# Title : sort_by_column
# Usage : @out=@{&sort_by_column(\@input_line_array, <column num>)};
# Function : it sorts by the given column , small comes top. Unless number is
# is given, it sorts by the first column.
#
# It can handle gzipped file. It called gunzip to open and sort.
#
# Example : sort_by_column.pl 3 xxxx.msp
# Above will sort the file xxxx.msp by its 3rd column(numerically)
# small numbers will come to the top.
# Keywords : sort_by_2nd_column, sort_by_second_column, sort_by_e_values,
# sort_by_evalues,
# Options :
# s for sorting stringwise
# d for sorting by digit
# n for sorting by digit(numerically)
# Category :
# Version : 1.4
#----------------------------------------------------------------------------
sub sort_by_column{
my (@in, @M, $col, $sort_numerically, $sort_non_numerically);
unless(@_ ==2 ){ print "\n# FATAL: sort_by_column needs 2 arguments\n"; exit }
$sort_numerically=1;
for (@_){
if(ref $_ eq 'ARRAY'){ @in =@{$_}; }
elsif( ref $_ eq 'SCALAR'){ $col=${$_}; }
elsif(/^\d+$/){ $col=$_ }
elsif(/^ *[nd] *$/i){ $sort_numerically=1; $sort_non_numerically=0; }
elsif(/^ *s *$/i){ $sort_non_numerically=1; $sort_numerically=0; }
}
$col--;
if($sort_numerically ){ ## if the first and last elements are digits?
@in= map {$_->[0]} sort { $a->[1] <=> $b->[1] } map { [$_, ($_=~/(\S+)/g)[$col] ] } @in;
}else{ # here let's do the sring sort
@in= map {$_->[0]} sort { $a->[1] cmp $b->[1] } map { [$_, ($_=~/(\S+)/g)[$col] ] } @in;
}
return(\@in);
}
#__________________________________________________________________________
# Title : sort_by_cluster_size
# Usage : @out=@{&sort_by_cluster_size(\@input_line_array)};
# Function : it sorts by the 1st digit before '-' as in 2-183_cluster, 2-140_cluster,
# etc.
# Example :
# Keywords : sort_by_columns, sort_by_text_columns, sort_by_column_numerically
# sort_by_pattern
# Options :
# Category :
# Version : 1.2
#----------------------------------------------------------------------------
sub sort_by_cluster_size{
my (@in, @M, $col);
if(@_ < 1 ){ print "\n# FATAL: sort_by_cluster_size needs 1 argument\n"; exit }
if(ref $_[0] eq 'ARRAY'){ @in = @{$_[0]}; }else{ @in = @_; }
$col=0;
@in= map {$_->[0]} sort { $a->[1] <=> $b->[1] } map { [$_, ($_=~/^(\S+)\-/)[$col] ] } @in;
return(\@in);
}
#__________________________________________________________________________
# Title : sort_by_column_bigger_first
# Usage : @out=@{&sort_by_column_bigger_first(\@input_line_array, 1)};
# Function : it sorts by the 2nd column(E-value, in msp file), small comes top
# by the help of ts <decoux@moulon.inra.fr>
# Example :
# Keywords : sort_by_columns, sort_by_text_columns, sort_by_column_numerically
#
# Options :
# Category :
# Version : 1.1
#----------------------------------------------------------------------------
sub sort_by_column_bigger_first{
my (@in, @M);
unless(@_ ==2 ){ print "\n# FATAL: sort_by_column_bigger_first needs 2 arguments\n"; exit }
if(ref $_[0] eq 'ARRAY'){ @in = @{$_[0]}; }else{ @in = @_; }
if(ref $_[1] eq 'SCALAR'){ $col=${$_[1]}; }else{ $col=$_[1]; }
$col--;
@in= map {$_->[0]} sort { $b->[1] <=> $a->[1] } map { [$_, ($_=~/(\S+)/g)[$col] ] } @in;
return(\@in);
}
#______________________________________________________________________________
# Title : make_6_frame_dna_sequences
# Usage : %six_dna_frame_seqs=%{&make_6_frame_dna_sequences(\%input_seq)};
# Function : It makes 3 different types of frames for input seq
# Then it reverses the input seq and makes another 3 frames!!
# Example :
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub make_6_frame_dna_sequences{
my(%six_frame_seqs, %original_input_seq, $orig_name,
$frame_fr1_name, $frame_fr2_name, $frame_fr3_name,
$frame_rf1_name, $frame_rf2_name, $frame_rf3_name,
$frame_fr1, $frame_fr2, $frame_fr3);
%original_input_seq=%{$_[0]};
($orig_name, $frame_fr1)=%original_input_seq;
$frame_fr1_name="$orig_name\_fr1";
$frame_fr2_name="$orig_name\_fr2";
$frame_fr3_name="$orig_name\_fr3";
$frame_rf1_name="$orig_name\_rf1";
$frame_rf2_name="$orig_name\_rf2";
$frame_rf3_name="$orig_name\_rf3";
$frame_fr1 =$frame_fr1;
$frame_fr2 =substr($frame_fr1, 1);
$frame_fr3 =substr($frame_fr1, 2);
$frame_rf1 =reverse($frame_fr1);
$frame_rf2 =substr($frame_rf1, 1);
$frame_rf3 =substr($frame_rf1, 2);
%six_frame_seqs=($frame_fr1_name, $frame_fr1,
$frame_fr2_name, $frame_fr2,
$frame_fr3_name, $frame_fr3,
$frame_rf1_name, $frame_rf1,
$frame_rf2_name, $frame_rf2,
$frame_rf3_name, $frame_rf3 );
return(\%six_frame_seqs);
}
#_____________________________________________________________________
# Title : make_scrambled_seq_database
# Usage : &make_reverse_seq_database(\@input_database_fasta_file);
# Function :
# Example :
# Warning :
# Keywords : scramble_seq_database, create_scrambled_seq_database
# Options :
# Category :
# Version : 1.1
#-------------------------------------------------------------------
sub make_scrambled_seq_database{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my (%seqs, %scrambled_seqs, $fasta_file_for_DB,$base,$ext,$out_file_name );
for($i=0; $i< @file; $i++){
$fasta_file_for_DB =$file[$i];
$base=${&get_base_names($fasta_file_for_DB)};
#~~~~~~~~~~~~~~~~~~~ To prevent growing of _sc_sc_sc... ~~~~~~~~~~~~`
if($base=~/^(\S+)_sc[\_sc]*$/){ $base=$1 }
$ext =${&get_file_extensions($file[$i])};
if($ext=~/\S/){
$out_file_name="$base\_sc\.$ext";
$out_bak_file ="$base\_sc_bak\.$ext";
}else{
$out_file_name="$base\_sc\.fa";
$out_bak_file ="$base\_sc_bak\.fa";
print "\n# There was no file ext for $base, attaching \"fa\" as default\n";
}
if(-s $out_file_name){
print "\n# $out_file_name already exists, moving it to $out_bak_file\n";
}
%seqs=%{&open_fasta_files(\$fasta_file_for_DB)};
%reversed_seqs=%{&scramble_sequences(\%seqs)};
&write_fasta(\%reversed_seqs, $out_file_name );
if(-s $out_file_name){
print "\n# make_scrambled_seq_database: Supposedly wrote new file: $out_file_name\n";
}else{
print "\n# make_scrambled_seq_database: Error in writing: $out_file_name\n";
}
}
print "\n# make_scrambled_seq_database sub finished \n";
}
#__________________________________________________________________________
# Title : make_2D_identity_matrix_array
# Usage : @matrix=@{&make_2D_identity_matrix(\@seq1, \@seq2)};
# Function : @matrix is like $matrix[1][2]=1;
# This assigns number 1 to array element
# If one array is given, it makes self to self matrix.
# When 2 are given, make matrix for the 2
# Example :
# Keywords : make_matrix
# Options :
# $skip_gap_char = g for skipping gap char (any special char)
# Returns :
# Argument :
# Category :
# Version : 1.2
#----------------------------------------------------------------------------
sub make_2D_identity_matrix_array{
my (@matrix, $skip_gap_char, $k, $l, @seq_1, @seq_0);
for($i=0; $i< @_; $i++){
if($_[$i]=~/g/){
$skip_gap_char='g';
splice (@_, $i, 1);
$i--;
}elsif(ref($_[$i]) eq 'ARRAY'){
push(@seqs, $_[$i]);
}
}
@seq_0=@{$seqs[0]};
@seq_1=@{$seqs[1]};
unless(@seq_1){ @seq_1=@seq_0; };
for($k=0; $k< @seq_0; $k++){
for($l=0; $l< @seq_1; $l++){
if($seq_1[$l] =~/\W/ and $skip_gap_char){ next };
if($seq_0[$k] eq $seq_1[$l]){
$matrix[$k][$l]=1;
print "# X\[$k\] Y\[$l\] = 1 \n";
}
}
}
return(\@matrix);
}
#__________________________________________________________________________
# Title : make_2D_aa_residue_matrix_array
# Usage : @matrix=@{&make_2D_aa_residue_matrix_array(\@seq)};
# Function : @matrix is like $matrix[1][2]='A'; when aa residue is identical
# This assigns identical residue to array element
# If one array is given, it makes self to self matrix.
# When 2 are given, make matrix for the 2
# Example :
# Keywords : make_matrix
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.1
#----------------------------------------------------------------------------
sub make_2D_aa_residue_matrix_array{
my @seq=@{$_[0]};
my @seq2=@{$_[1]};
if(@_ == 1){ @seq2=@seq };
my (@residue_matrix, $k, $l);
for($k=0; $k< @seq; $k++){
for($l=0; $l< @seq2; $l++){
if($seq[$k] eq $seq2[$l]){
$residue_matrix[$k][$l]="$seq[$k]";
print "# $seq[$k] = $l \n";
}
}
}
return(\@residue_matrix);
}
#__________________________________________________________________________
# Title : make_2D_identity_matrix
# Usage : @matrix=@{&make_2D_identity_matrix(\$seq, [\$seq2] )};
# Function : @matrix is like $matrix[1][2]=1;
# This assigns number 1 to array element
# Example :
# Keywords : make_matrix, make_identity_matrix
# Options :
# s for show axis
# Returns :
# Argument :
# Category :
# Version : 1.3
#----------------------------------------------------------------------------
sub make_2D_identity_matrix{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my (@matrix, @seq1, @seq2 );
if(@_ > 1){ @seq1=split(//, $string[0]); @seq2=split(//, $string[1]);
}else{ @seq2=@seq1= split(//, $string[0]); }
if($char_opt=~/s/){
@matrix = map { $m = $_; "$m ".join('', map {$_ eq $m or ' '} @seq1 )."\n"; } @seq2;
}else{
@matrix = map { $m = $_; join('', map {$_ eq $m or ' '} @seq1 )."\n"; } @seq2;
}
foreach (@matrix) { print ; }
return(\@matrix);
}
#________________________________________________________________________________
# Title : amino_acid_homology_matrix
# Usage : $yes_no=${&amino_acid_homology_matrix('E', 'D')};
# Function :
# Example :
# Keywords : are_they_homologous, amino_acid_homology_table, compare_amino_acid_homology
# Options :
# Category :
# Version : 1.0
#--------------------------------------------------------------------------------
sub amino_acid_homology_matrix{
my ($amino_acid1, $amino_acid2, $hydrophobic_group, $neural_polar,
$acidic_group, $basic_group, $proline);
$amino_acid1=${$_[0]} || $_[0];
$amino_acid2=${$_[1]} || $_[1];
$hydrophobic_group='LIFV'; # A excluded by me
$neural_polar ='STCNQ'; # M excluded by me
$acidic_group='ED';
$basic_group='KRH';
$proline='P';
@groups=($hydrophobic_group, $neural_polar, $acidic_group, $basic_group);
for($i=0; $i< @groups; $i++){
if($groups[$i] =~/$amino_acid1/ and $groups[$i] =~/$amino_acid2/){
return(\1);
}
}
return(\0);
}
#______________________________________________________________________________
# Title : write_reverse_seq_files
# Usage :
# Function :
# Example :
# Keywords : write_rev_seq_files, write_reverse_msf_files
# Options :
# Author : jong@salt2.med.harvard.edu
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub write_reverse_seq_files{
my $inputfile=$_[0];
my $base=${&get_base_names(\$inputfile)};
my $ext=${&get_extension_names(\$inputfile)};
my $out_rev_file="$base\_rv\.$ext";
my ($hash_out, $order_array)=&open_msf_files(\$inputfile, 'o');
my %seq=%{$hash_out};
my @order=@{$order_array};
my @file_written=@{&write_msf(\%seq, \$out_rev_file, \@order)};
print "\n# (INFO) \@file_written: @file_written\n";
return(\@file_written);
}
#_____________________________________________________________________
# Title : make_reverse_seq_database
# Usage : &make_reverse_seq_database(\@input_database_fasta_file);
# Function :
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.2
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sub make_reverse_seq_database{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
$| = 1;
for($i=0; $i< @file; $i++){
my $fasta_file_for_DB =$file[$i];
my $base=${&get_base_names($fasta_file_for_DB)};
my $ext =${&get_file_extensions($file[$i])};
my($out_file_name, %seqs, %reversed_seqs);
if($ext=~/\S/){
$out_file_name="$base\_rv\.$ext";
}else{
$out_file_name="$base\_rv\.fa";
}
%seqs=%{&open_fasta_files(\$fasta_file_for_DB)};
%reversed_seqs=%{&reverse_sequences(\%seqs)};
&write_fasta(\%reversed_seqs, $out_file_name );
if(-s $out_file_name){
print "\n# make_reverse_seq_database: Supposedly wrote: $out_file_name\n";
}else{
print "\n# make_reverse_seq_database: Error in writing: $out_file_name\n";
}
}
print "\n# make_reverse_seq_database sub finished \n";
}
#__________________________________________________________________________
# Title : make_hmm_from_alignment
# Usage : @out_hmm_file_names=@{&make_hmm_from_alignment(\@file, "$over_write")};
# Function :
# Example :
# Keywords : HMM, hidden markov model, make_HMM_from_alignment,
# make_hmm_from_msf_file, create_hmm_from_alignment,
# create_hmm_from_msf_file,
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.1
#----------------------------------------------------------------------------
sub make_hmm_from_alignment{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
local (@out_hmm_files, $base, $out_hmm_file);
for($i=0; $i< @file; $i++){
if($file[$i]=~/\.msf/){
$base=${&get_base_names($file[$i])};
$out_hmm_file= "$base\.hmm";
if($char_opt=~/o/ or !(-s $out_hmm_file) ){
system("hmmb -P BLOSUM62 -B 200 -w $out_hmm_file $file[$i]");
push(@out_hmm_files, $out_hmm_file);
}else{
print "\n# The $out_hmm_file file already exists. To overwrite use -o opt\n";
}
}
}
if(@out_hmm_files > 1){
return(\@out_hmm_files);
}else{
return(\$out_hmm_files[0]);
}
}
#__________________________________________________________________
# Title : get_false_positive_seq_matches
# Usage : %seq=%{&get_false_positive_seq_matches(\%msp_1, \%msp2)};
# Function : gets sequences which are wrongly matched from intermediate seq search
# Example :
#
# OUTPUT looks like the following;
# d1dvh__=d1fcdc1 7.1e-08
# d1fcdc1=d1dvh__ 7.1e-08
# d5cytr_=d351c__ 5.3e-08
# d351c__=d5cytr_ 5.3e-08
# d1cyi__=d2mtac_ 9.1e-06
# d2mtac_=d1cyi__ 9.1e-06
# d1cyi__=d5cytr_ 0.00045
# d5cytr_=d1cyi__ 0.00045
#
# Warning : The default is to show the best E value(lowest that is)
#
# Keywords :
# Options : _ for debugging.
# # for debugging.
#
# Returns :
# Argument :
# Category :
# Version : 1.0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sub get_false_positive_seq_matches{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(%iss_input)= %{$hash[0]};
my(%final_table_Evalue, %final_table_score, $inter_seq_seq_name, $inter_seq_score,
$inter_seq_E_value, $match_seq_seq_name, $match_seq_score, $match_seq_E_value,
$name_combi1, $name_combi2, $each_iss_line, $all_enquiry_seqs);
@iss_lines = sort values %iss_input;
if(@array > 0){ ## When the names of enquiry was given as an array, use it!
$all_enquiry_seqs=join(' ', sort @{$array[0]} );
}else{ ## otherwise, detect yourself.
for($i=0; $i< @iss_lines; $i++){
$each_iss_line=$iss_lines[$i];
if($each_iss_line=~/^ *(\S+) +/){
$all_enquiry_seqs{$1}++;
}
}
$all_enquiry_seqs=join(' ', sort keys %all_enquiry_seqs );
}
for($i=0; $i< @iss_lines; $i++){
$each_iss_line=$iss_lines[$i];
if($each_iss_line=~/^ *(\S+) +(\S+)\((\d+)\)\((\S+)\) +(\S+)\((\d+)\)\((\S+)\)/){
$inter_seq_seq_name= $2;
$inter_seq_score = $3;
$inter_seq_E_value = $4;
$match_seq_seq_name= $5;
$match_seq_score = $6;
$match_seq_E_value = $7;
$name_combi1="$1\=$match_seq_seq_name";
$name_combi2="$match_seq_seq_name\=$1";
if($all_enquiry_seqs !~/$match_seq_seq_name/){
$false_positive_matches{$name_combi1}="$inter_seq_score $inter_seq_E_value";
next;
}
}
}
if($char_opt=~/v/){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Wrting the false positives
#__________________________________
@keys_false=sort keys %false_positive_matches;
print "\n\n# False positives: " if @keys_false > 0;
for $key_false (@keys_false){
if($key_false =~/\S+/){
printf ("\n%-30s %-30s", $key_false, $false_positive_matches{$key_false});
}
}
print "\n";
}
return(\%false_positive_matches);
}
#__________________________________________________________________
# Title : make_sequence_match_table
# Usage : %sequence_match_table=%{&make_sequence_match_table(\%msp_1, \%msp2)};
# Function : makes a table of match with the values for E values.
# Example :
#
# INPUT looks like this: (the iss file format), first column is key
#
# d1ten__(110)(0.00031) d1fna__ d1fna___1-91(578)(6.9e-37) d1ten__(110)(0.00031)
# d1cfb_2(255)(7.8e-16) d1cfb_2 HSU55258_741-838(255)(5.6e-12) d1cfb_2(255)(7.8e-16)
#
# OUTPUT looks like the following;
# d1dvh__=d1fcdc1 Correct: 7.1e-08
# d1fcdc1=d1dvh__ Correct: 7.1e-08
# d5cytr_=d351c__ Correct: 5.3e-08
# d351c__=d5cytr_ Correct: 5.3e-08
# d1cyi__=d2mtac_ Wrong: 9.1e-06
#
# Keywords : make_sequence_match_Evalue_table, Evalue_table, make_Evalue_table
# make_iss_sequence_match_table
# Options : _ for debugging.
# # for debugging.
# s for skip SELF to SELF match entries
# w for Smith-Waterman score result out than E value out
# r for reflexive output
#
# Reference : http://sonja.acad.cai.cam.ac.uk/perl_for_bio.html
# Returns :
# Argument :
# Category :
# Version : 1.5
#-------------------------------------------------------------------------------
sub make_sequence_match_table{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(%iss_input)= %{$hash[0]};
my(%final_table_Evalue, %final_table_score, $inter_seq_seq_name, $inter_seq_score,
$inter_seq_E_value, $match_seq_seq_name, $match_seq_score, $match_seq_E_value, $key_seq,
$each_iss_line, $all_enquiry_seqs, $name_combi1, $name_combi2, @sorted_names, $name_sorted,
%final_table_interm_and_matched_score, %final_table_interm_and_matched_Evalue, %scop_bugs);
@iss_lines = sort values %iss_input;
if($char_opt=~/v/){ print "\n# make_sequence_match_table: \$char_opt is $char_opt\n" ; }
if($char_opt=~/r/){ $non_reflexive=0;
}else{ $non_reflexive=1; } # default , not to print result in two ways
if(@array > 0){ ## When the names of enquiry was given as an array, use it!
$all_enquiry_seqs=join(' ', sort @{$array[0]} );
}else{ ## otherwise, detect yourself.
for($i=0; $i< @iss_lines; $i++){
$each_iss_line=$iss_lines[$i];
if($each_iss_line=~/^ *(\S+) +/){ $all_enquiry_seqs{$1}++; }
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# $all_enquiry_seqs contains all the sequences in the group
#____________________________________________________________________
$all_enquiry_seqs=join(' ', sort keys %all_enquiry_seqs );
}
for($i=0; $i< @iss_lines; $i++){
my $each_iss_line=$iss_lines[$i];
if($each_iss_line=~/^ *(\S+) +(\S+)\((\d+)\)\((\S+)\) +(\S+)\((\d+)\)\((\S+)\)/){
$key_seq=$1;
$inter_seq_seq_name= $2;
$inter_seq_score = $3;
$inter_seq_E_value = $4;
$match_seq_seq_name= $5;
$match_seq_score = $6;
$match_seq_E_value = $7;
if( $key_seq eq $match_seq_seq_name and $char_opt=~/s */ ){ next } ## avoiding self self match
@sorted_names=sort ($1, $match_seq_seq_name);
$name_combi1="$1\=$match_seq_seq_name";
$name_combi2="$match_seq_seq_name\=$1";
$name_sorted="$sorted_names[0]\=$sorted_names[1]";
if($all_enquiry_seqs !~/$match_seq_seq_name/){
if($non_reflexive){
$false_positive_matches{$name_sorted}="$inter_seq_score $inter_seq_E_value : $match_seq_score $match_seq_E_value";
}else{
$false_positive_matches{$name_combi1}="$inter_seq_score $inter_seq_E_value : $match_seq_score $match_seq_E_value";
}
next;
}elsif($final_table_score{$name_combi1} < $inter_seq_score or
$final_table_score{$name_combi2} < $inter_seq_score or
$final_table_score{$name_sorted} < $inter_seq_score){
$final_table_score{$name_combi1}=$inter_seq_score;
$final_table_score{$name_combi2}=$inter_seq_score;
$final_table_Evalue{$name_combi1}=$inter_seq_E_value;
$final_table_Evalue{$name_combi2}=$inter_seq_E_value;
if($non_reflexive){
$final_table_interm_and_matched_score{$name_sorted} = "$inter_seq_score $match_seq_score";
$final_table_interm_and_matched_Evalue{$name_sorted} = "$inter_seq_E_value $match_seq_E_value";
}else{
$final_table_interm_and_matched_score{$name_combi1} = "$inter_seq_score $match_seq_score";
$final_table_interm_and_matched_score{$name_combi2} = "$inter_seq_score $match_seq_score";
$final_table_interm_and_matched_Evalue{$name_combi1} = "$inter_seq_E_value $match_seq_E_value";
$final_table_interm_and_matched_Evalue{$name_combi2} = "$inter_seq_E_value $match_seq_E_value";
}
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# printing out to screen
#__________________________________________________________
if($char_opt =~/w/){ ## returning Smith-waterman score than E value
@keys = sort keys %final_table_interm_and_matched_Evalue;
for $key (@keys){
if($key =~/\S+/){
printf ("\n%-30s Correct: %-50s", $key, $final_table_interm_and_matched_Evalue{$key});
}
}
}else{
@keys = sort keys %final_table_interm_and_matched_score;
for $key (@keys){
if($key =~/\S+/){
printf ("\n%-30s Correct: %-50s", $key, $final_table_interm_and_matched_score{$key});
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Writing the false positives
#__________________________________
@keys_false=sort keys %false_positive_matches;
#print "\n\n# False positives: " if @keys_false > 0;
for $key_false (@keys_false){
if($key_false =~/\S+/){
if($scop_bugs{$key_false}){
printf ("\n%-30s Correct: %-50s", $key_false, $false_positive_matches{$key_false});
}else{
printf ("\n%-30s Wrong: %-50s", $key_false, $false_positive_matches{$key_false});
}
%scop_bugs=qw(d2kauc1=d2kauc 1 d1pkya2=d1pkya1 1 d1pbe_1=d1pbe_2 1
d1dih_1=d1dih_2 1 d2ohxa2=d2ohxa1 1 d1poxa3=d1pvda2 1
d1efga1=d1efga2 1 d1bct__=d1brd__ 1 d1qora1=d1qora2 1
d2ohxa1=d2ohxa2 1);
}
}
print "\n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Returning the hash result
#_____________________________________
if($char_opt =~/w/){ ## returning Smith-waterman score than E value
return(\%final_table_interm_and_matched_score);
}else{
return(\%final_table_interm_and_matched_Evalue);
}
}
#__________________________________________________________________
# Title : write_iss_file
# Usage : &write_iss_file(\%msp1, \%msp2); ## for 2 msp_x file input
# Function : writes the intermediate sequence search file.
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : write_interm_seq_search_file
# v for showing the output in STDOUT
# Reference : http://sonja.acad.cai.cam.ac.uk/perl_for_bio.html
# Category :
# Version : 1.2
#---------------------------------------------------------------------------
sub write_iss_file{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(%msp_1, %msp_2, %merged_1, %merged_2);
%msp_1=%{$hash[0]};
%msp_2=%{$hash[1]};
@msp1_keys=sort keys %msp_1;
@msp2_keys=sort keys %msp_2;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~```
# removing the empty matches and merging matches according to the enquiry seqs.
# Following input will become;
# ..
# xxxxx
# xxxxx YYYYY(xx)(yy)
# xxxxx zzzzz(xx)(yy) ttttt(xx)(yy)
# ..
# -> xxxxx 'YYYYY(xx)(yy) zzzzz(xx)(yy) ttttt(xx)(yy)'
#____________________________________________________________________________________
for($i=0; $i< @msp1_keys; $i++){
$enquiry_seq = $msp1_keys[$i];
#my ($seq_name, $sw_score, $evalue)=$enquiry_seq=~/(\S+)\((\S+)\)\((\S+)\)/;
#-- if $msp_1{$enquiry_seq} is not empty, assigns name, score, evalue etc to vars, or next
if($msp_1{$enquiry_seq}=~/\S+/){
($seq_name, $sw_score, $evalue)=$enquiry_seq=~/(\S+)\((\S+)\)\((\S+)\)/;
$merged_msp1{$seq_name} .=$msp_1{$enquiry_seq};
}else{
next;
}
}
for($i=0; $i< @msp2_keys; $i++){
$enquiry_seq = $msp2_keys[$i];
#-- if $msp_2{$enquiry_seq} is not empty, assigns name, score, evalue etc to vars, or next
if($msp_2{$enquiry_seq}=~/\S+/){
$merged_msp2{$enquiry_seq} .=$msp_2{$enquiry_seq};
}else{
next;
}
}
@merged_msp1_keys=sort keys %merged_msp1;
@merged_msp2_keys=sort keys %merged_msp2;
for($i=0; $i< @merged_msp1_keys; $i++){
$enquiry_seq=$merged_msp1_keys[$i];
@intermediate_seqs=sort split(/ +/, $merged_msp1{$enquiry_seq});
for($j=0; $j< @intermediate_seqs; $j++){
$intermediate_seq=$intermediate_seqs[$j];
($inter_seq_name, $sw_score, $evalue)=$intermediate_seq=~/(\S+)\((\S+)\)\((\S+)\)/;
@final_matches=sort split(/ +/, $merged_msp2{$inter_seq_name});
for($k=0; $k < @final_matches; $k ++){
$final_matched_seq = $final_matches[$k];
if($char_opt=~/v/){
printf ("%-18s %-40s %-38s\n", $enquiry_seq, $intermediate_seq, $final_matched_seq);
}
$final_out{$final_matched_seq}=
sprintf ("%-18s %-40s %-38s\n", $enquiry_seq, $intermediate_seq, $final_matched_seq);
}
#print "\n";
}
#print "\n";
}
#print "\n";
return(\%final_out);
}
#______________________________________________________________________________
# Title : get_perl_keywords
# Usage :
# Function :
# Example :
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub get_perl_keywords{
my(%perl_keywords);
my @keywords=qw( AUTOLOAD BEGIN CORE DESTROY END abs accept alarm and atan2 bind binmode bless caller chdir chmod chop chown chr chroot
close closedir cmp connect continue cos crypt dbmclose dbmopen defined delete die do dump each else elsif endgrent endhostent endnetent endprotoent endpwent endservent
eof eq eval exec exit exp fcntl fileno flock for foreach fork format formline ge getc getgrent getgrgid getgrnam gethostbyaddr gethostbyname gethostent getlogin
getnetbyaddr getnetbyname getnetent getpeername getpgrp getppid getpriority getprotobyname getprotobynumber getprotoent getpwent getpwnam getpwuid getservbyname
getservbyport getservent getsockname getsockopt glob gmtime goto grep gt hex if index int ioctl join keys kill last lc lcfirst le length link listen local localtime log lstat
lt m mkdir msgctl msgget msgrcv msgsnd my ne next no not oct open opendir or ord pack package pipe pop print printf push q qq quotemeta qw qx rand read readdir readline
readlink readpipe recv redo ref rename require reset return reverse rewinddir rindex rmdir s scalar seek seekdir select
semctl semget semop send setgrent sethostent setnetent setpgrp setpriority setprotoent setpwent setservent setsockopt shift
shmctl shmget shmread shmwrite shutdown sin sleep socket socketpair sort splice split sprintf sqrt srand stat
study sub substr symlink syscall sysread system syswrite tell telldir tie time times tr truncate uc ucfirst
umask undef unless unlink unpack unshift untie until use utime values vec wait waitpid wantarray
warn while write x xor y
);
foreach(@keywords){
$perl_keywords{$_}=$_;
}
return(\%perl_keywords);
}
#______________________________________________________________________________
# Title : get_homology_info_of_seq_pairs
# Usage :
# Function :
# Example : %seq_pair_homology_table=%{&get_homology_info_of_seq_pairs(\%pairs_excluded,
# \%pdbg_hash_table)};
#
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub get_homology_info_of_seq_pairs{
my($i, %pairs_to_be_checked,%seq_pairs_homology_table, @pairs,
$homology_info, %pdbg_hash_table);
for($i=0; $i< @_; $i++){
my %in_hash=%{$_[$i]};
my @seq_names=keys %in_hash;
if($in_hash{$seq_names[0]}=~/^\S+[\t ]+\S+$/){
%pairs_to_be_checked=%in_hash; %in_hash=();
}elsif($in_hash{$seq_names[0]}=~/^\S+$/){
%pdbg_hash_table=%in_hash; %in_hash=();
}
}
@pairs=keys %pairs_to_be_checked;
for($i=0; $i< @pairs; $i++){
if($pairs[$i]=~/^(\S+)[\t ]+(\S+)/){
$homology_info=${&check_homology_of_seq_pair(\$pairs[$i], \%pdbg_hash_table)};
$seq_pairs_homology_table{$pairs[$i]}=$homology_info;
print "\n#>> $pairs[$i] $homology_info" if $verbose;
}
}
return(\%seq_pairs_homology_table);
}
#________________________________________________________________________________
# Title : get_overlapping_seq_match_size
# Usage : $ovlapsize=${&get_overlapping_seq_match_size($st1, $en1, $st2, $en2)
# Function :
# Example :
# Keywords : CF: get_overlapping_range, get_overlapping_seq_match
# Options :
# Category :
# Version : 1.1
#--------------------------------------------------------------------------------
sub get_overlapping_seq_match_size{
my($start1, $end1, $start2, $end2, $overlapping_region_matched);
if(@_ == 4){
$start1=$_[0]; $end1 =$_[1]; $start2=$_[2]; $end2 =$_[3];
}elsif(@_==2){
if( $_[0]=~/(\d+)\-(\d+)/ ){
$start1=$1; $end1 =$2;
}elsif($_[1]=~/(\d+)\-(\d+)/ ){
$start2=$1; $end2 =$2;
}else{
print "\n# (ERROR) get_overlapping_seq_match_size: I need 2 or 4 arguments for regions\n";
print " They look like ($start1, $end1, $start2, $end2) or ('10-100', '20-211')\n";
print " You got it, Sarah?? Try again my dear!\n";
}
}else{
print "\n# (ERROR) get_overlapping_seq_match_size: I need 2 or 4 arguments for regions\n";
print " They look like ($start1, $end1, $start2, $end2) or ('10-100', '20-211')\n";
print " You got it, Sarah?? Try again my dear!\n";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ---------
# ------
#___________________________________
if($start1 >= $start2 and $end1 >= $end2){
$overlapping_region_matched=$end2-$start1;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ---------
# ----------
#___________________________________
elsif($start1 <= $start2 and $end1 <= $end2){
$overlapping_region_matched=$end1-$start2;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# -----
# ----------
#___________________________________
elsif($start1 >= $start2 and $end1 <= $end2){
$overlapping_region_matched=$end1-$start1;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ---------
# ----
#___________________________________
elsif($start1 <= $start2 and $end1 >= $end2){
$overlapping_region_matched=$end2-$start2;
}
return(\$overlapping_region_matched);
}
#______________________________________________________________________________
# Title : get_unix_shell_name
# Usage :
# Function :
# Example :
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu, On commercial use issue, Email me.
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub get_unix_shell_name{
my($shell_env);
if($ENV{'SHELL'}=~/\/(\w+)$/ or $ENV{'SHELL'}=~/(\w+)$/){
$shell_env=$1
}else{
print "\n# (ERROR) SHELL env setting is not on, I can not give you SHELL type\n";
exit;
}
return(\$shell_env);
}
#______________________________________________________________________________
# Title : get_stat_FASTA_search_result_in_msp_0_files
# Usage : &get_stat_FASTA_search_result_in_msp_0_files(\@file);
# Function :
# Example :
# Keywords : get_stat00_result, get_stat_msp0_files, get_stat_single_search_result
# Options :
#
# $E_value= by e=
# $verbose=v by v
# $show_options=o by o
# $step = by s=
# $score_thresh1= by t1=
# $score_thresh2= by t2=
# $E_mult_factor1 = by m1=
# $E_mult_factor2 = by m2=
#
# Category : statistics, search, bio
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub get_stat_FASTA_search_result_in_msp_0_files{
my($num_enq_seq, $pdbg_file, %input_file_base, $score_thresh1, $score_thresh2,
$E_mult_factor1, $E_mult_factor2, @seqs, @pdbg_seqs, @MSP0, @array,
$E_value, %final_stat_big_hash, @bases, $i, $j, $k);
my $leng_thresh=10;
$score_thresh1 = 73;
my $simple_pdbg_read_opt='b';
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Arguments handling
#_____________________________________
for($i=0; $i< @_; $i++){
if($_[$i]=~/e=(\S+)/){ $E_value=$1;
splice(@_, $i, 1); $i--;
}elsif(ref($_[$i]) eq 'ARRAY'){
@array=@{$_[$i]};
for($j=0; $j < @array; $j++){
if($array[$j]=~/(\S+)\.pdbg$/){ $input_file_base{$1}=$1;
}elsif($array[$j]=~/(\S+)\.msp_?0$/){ $input_file_base{$1}=$1;
}
}
}elsif($_[$i]=~/(\S+)\.pdbg$/){
$input_file_base{$1}=$1;
}elsif($_[$i]=~/(\S+)\.msp_?0$/){
$input_file_base{$1}=$1;
}elsif($_[$i]=~/m1=(\S+)/){ $E_mult_factor1=$1;
splice(@_, $i, 1); $i--;
}elsif($_[$i]=~/m2=(\S+)/){ $E_mult_factor2=$1;
splice(@_, $i, 1); $i--;
}elsif($_[$i]=~/t1=(\S+)/){ $score_thresh1=$1;
splice(@_, $i, 1); $i--;
}elsif($_[$i]=~/t2=(\S+)/){ $score_thresh2=$1;
splice(@_, $i, 1); $i--;
}elsif($_[$i]=~/o=(\S+)/){ $show_options=$1;
splice(@_, $i, 1); $i--;
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Handling options
#_________________________________
if($E_value=~/^ *$/){
$E_value=5;
print "\n# WARNING: you did not set 'e=x.xxx' option default $E_value used\n";
}
if($E_mult_factor1 !~/\S/){ $E_mult_factor1 =1; }
if($E_mult_factor2 !~/\S/){ $E_mult_factor2 =1; }
if($show_options=~/o/){
print "\n#---- \$step : $step";
print "\n#---- \$score_thresh1 : $score_thresh1";
print "\n#---- \$score_thresh2 : $score_thresh2\n";
}
@bases=keys %input_file_base;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main loop
#__________________________________________
for($i=0; $i< @bases; $i++){
my(%msp_0, %msp_00, $score, $evalue, $enquiry, @keys0, @keys2, $j,
%stat, %stat2, $sum_correct, $sum_false, @non_dup, $base, $pdbg_file,
@seqs, @pdbg_seqs, $msp_0_file, @MSP0, @keys, $k, %correct );
$base=$bases[$i];
$pdbg_file ="$base\.pdbg";
$msp_0_file="$base\.msp0";
print "\n# $base $pdbg_file $msp_0_file\n" if $verbose;
unless(-s $pdbg_file or -s $msp_0_file){
print "\n", __LINE__, "# file is missing. I need xxx.pdbg, xxx.msp_0\n\n";
}
@seqs=@pdbg_seqs= keys %{&open_pdbg_files($pdbg_file, $simple_pdbg_read_opt)};
if(@pdbg_seqs < 2){
print "\n# too little sequneces @pdbg_seqs $pdbg_file\n";
exit;
}
print "\n# Result of open_pdbg_files, \@seqs are : @seqs \n" if $verbose;
open(MSP0, "$msp_0_file");
@MSP0=<MSP0>;
close(MSP0);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# opening each msp0 file
#______________________________________
for(@MSP0){
if(/^(\S+) +(\S+) +\S* *(\d+) +(\d+) +(\S+) +(\d+) +(\d+) +(\S+)/){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
# Length checking
#__________________________________________________
$seq_leng1=$4-$3;
$seq_leng2=$7-$6;
if($seq_leng1 < $leng_thresh or $seq_leng2 < $leng_thresh){
if($verbose){
print "\n# LENG $seq_leng1, $seq_leng2: $seq_leng1 $seq_leng2 $5 => $8 $1 $2 skipping\n";
next;
}
}
$score=$1;
$evalue=$2;
$enquiry=$5;
$match_seq=$8;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Evalue checking
#__________________________________________________
if($evalue > $E_value ){
if($enquiry=~/^(\S+)_\d+\-\d+/){
$msp_0{"$1"} ="" unless $msp_0{"$1"};
next;
}else{
$msp_0{"$enquiry"} ="" unless $msp_0{"$enquiry"};;
next;
}
}
if($score < $score_thresh1){ next; }
if($enquiry=~/^(\S+)_\d+\-\d+/){
$msp_0{"$1"} .="$match_seq ";
}else{
$msp_0{"$enquiry"} .="$match_seq ";
$msp_00{join(' ', sort($enquiry, $match_seq))} = " $score $evalue";
}
}
}
%stat=%msp_0;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# filtering duplicates
#____________________________________________
@keys=keys %stat;
for($k=0; $k< @keys; $k++){
@split=split(/ +/,$stat{$keys[$k]});
@non_dup=@{&remove_dup_in_array(\@split)};
for($j=0; $j<@non_dup; $j++){
if($non_dup[$j]=~/^ *$/){
splice(@non_dup, $j, 1); $j--;
next;
}
if($non_dup[$j] eq $keys[$k]){
splice(@non_dup, $j, 1); $j--;
next;
}
}
$stat2{$keys[$k]}=join(' ', @non_dup);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Showing the actual matched sequences
# %stat has following contents
# d1ash__ d1bam__ d1mba__ d2lhb__
# d1baba_ d1flp__ d1hbg__ d1hlb__ d1mba__ d1mbd__ d2lhb__ d3aaha_ d3sdha_
# d1cpca_ d1cpcb_ d1gof_1 d2ts1_1
#______________________________________________________________________________
if($verbose=~/v/){
@keys= sort keys %stat2;
for($k=0; $k< @keys; $k++){
print "$keys[$k]: $stat2{$keys[$k]}\n";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Getting statistics
#_________________________________________
$evalue=$s;
$E_mult_factor1=1;
@output=@{&get_isearch_result_stat(\%stat2, \@pdbg_seqs, \$evalue,
\$base, \$E_mult_factor1, $leng_thresh, \%msp_00)};
%correct=%{$output[3]};
%final_stat_big_hash=(%final_stat_big_hash, %correct);
if($verbose){
@keys=sort keys %correct;
for($k=0; $k< @keys; $k++){
print "$keys[$k] $correct{$keys[$k]}\n";
}
}
}
return(\%final_stat_big_hash);
}
#________________________________________________________________________________
# Title : get_scop_correcting_pairs
# Usage : %correct=%{&get_scop_correcting_pairs()};
# Function :
# Example :
# Keywords : get_pdb_correcting_pairs , correct_pairs_in_scop, correct_homology_pairs
# Options :
# Category :
# Version : 1.4
#--------------------------------------------------------------------------------
sub get_scop_correcting_pairs{
my (%correcting_pairs, @correcting_pairs);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# %correcting_pairs is a correcting table for old pdb40d file classi
#_____________________________________________________________________
@correcting_pairs=( # should be pairs
'd2kauc1 d2kauc2', 'd1pkya1 d1pkya2',
'd1pvda2 d1trka1', 'd1pbe_1 d1pbe_2',
'd1poxa3 d1pvda2', 'd1efga1 d1efga2',
'd1dsba1 d1dsba2', 'd2gsta1 d2gsta2',
'd1bct__ d1brd__', 'd1qora1 d1qora2',
'd2ohxa1 d2ohxa2', 'd1efga2 d1eft_1',
'd1tada1 d1tada2', 'd1gsea1 d1gsea2',
'd1gesa2 d2tmda3', 'd1lvl_2 d2tmda3',
'd2tmda3 d2tpra2', 'd1tde_1 d2tmda3',
'd1nhp_2 d2tmda3', 'd1gesa1 d2tmda3',
'd1lvl_1 d2tmda3', 'd2tmda3 d2tpra1',
'd1fcda1 d2tmda3', 'd1nhp_1 d2tmda3',
'd1tde_2 d2tmda3', 'd1pbe_1 d2tmda3',
'd1ebha1 d1ebha2', 'd1gesa2 d2dlda2', ## 3.4.1 with
'd1gesa2 d1psda2', 'd1nhp_2 d2dlda2',
'd1ldm_1 d1tde_2', 'd1coy_1 d1ldb_1',
'd1lvl_2 d1psda2', 'd1psda2 d1tde_2',
'd1hyha1 d1tde_2', 'd1fcda1 d1ldm_1',
'd1hdca_ d1nhp_2', 'd1fcda1 d1hlpa1',
'd1llda1 d1lvl_2', 'd2dlda2 d2tpra2',
'd1ldm_1 d1nhp_2', 'd1llda1 d1pbe_1',
'd1gdha2 d2tpra1', 'd1ldb_1 d1nhp_2',
'd1gesa2 d1scua2', 'd1fcda1 d1hyha1',
'd1gesa1 d1hlpa1', 'd1gdha2 d1gesa2',
'd1lvl_2 d2dlda2', 'd1gesa1 d2dlda2',
'd1nhp_2 d2ohxa2', 'd1tde_2 d2dlda2', # 3.4.1. with 3.18.1, 3.17.1.
'd1nhp_1 d2cmd_1', 'd1fcda1 d1ldb_1',
'd1lvl_1 d2ohxa2', 'd1nhp_2 d2naca2',
'd1pbe_1 d2ohxa2', 'd1gdha2 d1nhp_2',
'd2cmd_1 d2tpra1', 'd1tde_1 d2cmd_1',
'd1llda1 d1nhp_2', 'd1hlpa1 d1nhp_2',
'd1nhp_1 d2dlda2', 'd1hyha1 d1nhp_2',
'd1nhp_2 d1psda2', 'd1fcda1 d2cmd_1',
'd1fcda1 d1llda1', 'd1lvl_2 d1udpa_',
'd1psda2 d2tpra2', 'd1hdca_ d1lvl_2',
'd1gesa2 d1llda1', 'd1nhp_2 d1qora2',
'd1ldm_1 d2tpra1', 'd1coy_1 d2dlda2',
'd2dlda2 d2tpra1', 'd1hdca_ d1pbe_1',
'd1coy_1 d1gdha2', 'd1nhp_2 d2cmd_1',
'd1llda1 d1tde_1', 'd1llda1 d1lvl_1',
'd1bdma1 d2tpra1', 'd1gd1o1 d2tpra2',
'd1ldb_1 d1lvl_1', 'd1hlpa1 d1tde_2',
'd1coy_1 d1psda2', 'd1nhp_2 d1udpa_',
'd1llda1 d1tde_2', 'd1tde_2 d2cmd_1',
'd1llda1 d2tpra2', 'd1ldb_1 d1tde_1',
'd1coy_1 d1hlpa1', 'd1coy_1 d2cmd_1',
'd1bdma1 d1gesa2', 'd1hyha1 d2tpra2',
'd1gesa2 d1hyha1', 'd1gesa2 d2ohxa2',
'd1ldb_1 d1tde_2', 'd1hlpa1 d1pbe_1',
'd1ldm_1 d2tpra2', 'd2ohxa2 d2tpra1',
'd1ldb_1 d2tpra2', 'd1gesa2 d1ldm_1',
'd1lvl_2 d1qora2', 'd1gesa1 d2naca2',
'd1coy_1 d1llda1', 'd1coy_1 d1hyha1',
'd1coy_1 d1ldm_1', 'd1ldm_1 d1lvl_2',
'd1eny__ d1nhp_2', 'd1pbe_1 d2pgd_2',
'd1ldb_1 d1pbe_1', 'd1ldb_1 d1lvl_2',
'd1gesa2 d1hlpa1', 'd1dhr__ d1nhp_2',
'd1hdca_ d1tde_1', 'd1gesa1 d1psda2',
'd1pbe_1 d2cmd_1', 'd1tde_2 d1udpa_',
'd1pbe_1 d2dlda2', 'd1hdca_ d1tde_2',
'd1gesa2 d1ldb_1', 'd1psda2 d2tpra1',
'd1gdha2 d1lvl_2', 'd1tde_1 d2dlda2',
'd1ldm_1 d1pbe_1', 'd1pbe_1 d1scua2',
'd1gesa1 d2ohxa2', 'd1lvl_2 d2naca2',
'd1gd1o1 d1lvl_1', 'd1fvl__ d1kst__',
'd1kst__ d2ech__', 'd1hsaa2 d1std__', ## d1hsaa.. is NOT homol, but to fix a problem in E_100_e_0.0005_j30_segged_2092
'd1afp__ d1hfi__'
);
for($i=0; $i< @correcting_pairs; $i++){
$correcting_pairs{$correcting_pairs[$i]}=$correcting_pairs[$i];
}
return(\%correcting_pairs);
}
#__________________________________________________________________
# Title : get_isearch_result_stat
# Usage : &get_self_isearch_stat(\%stat2, \@pdbg_seqs, \$evalue);
# Function :
# Example : Following input (hash eg: %stat2, input with the first word as key)
# will become columnar output.
#
# d1ash__ d1bam__ d1mba__ d2lhb__
# d1baba_ d1flp__ d1hbg__ d1hlb__ d1mba__ d1mbd__ d2lhb__ d3aaha_ d3sdha_
# d1cpca_ d1cpcb_ d1gof_1 d2ts1_1
#
# Will become:
# ....
# d1ash__ d2lhb__ Homolog: G1 98 0.012
# d1baba_ d1flp__ Homolog: G1 82 0.072
# d1baba_ d1hbg__ Homolog: G1 79 0.13
# d1baba_ d2lhb__ Homolog: G1 228 8e-12
# d1baba_ d3aaha_ Nomolog: G1 74 2
# d1baba_ d3sdha_ Homolog: G1 92 0.012
# d1cola_ d1hbg__ Nomolog: G1 79 0.59
# d1cpca_ d1cpcb_ Homolog: G1 176 4.9e-08
# ....
#
# Keywords : get_stat_interm_search, get_intermediate_search_stat
# Options : _ for debugging.
# # for debugging.
# Package : Bio
# Reference : http://sonja.acad.cai.cam.ac.uk/perl_for_bio.html
# Returns : [$av_correct, $num_enq_seq]
# Tips :
# Argument :
# Todo :
# Author : A Scientist
# Category :
# Version : 2.2
#-----------------------------------------------------------------------------
sub get_isearch_result_stat{
my (@keys, $num_enq_seq, @pdbg_seqs_ori, $c, $d, $i, %correct_pairs,
$sum_correct, $sum_false, $match_seq, $percent_correct, $correct, @correct,
$av_correct, $av_false, $actual_e_value, $correct_matched,
%correcting_pairs, @correcting_pairs, %correct);
my %seqs=%{$_[0]};
my @pdbg_seqs=@{$_[1]};
my $evalue=${$_[2]};
my $pdbg_base=${$_[3]} || $ARGV[3];
my $E_mult_factor1=${$_[4]};
my $E_mult_factor2=${$_[4]};
if(ref($_[5])){ $leng_thresh =${$_[5]} }else{ $leng_thresh=$_[5]; }
my %msp_0=%{$_[6]};
my %msp_00=%{$_[7]};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# %correcting_pairs is a correcting table for old pdb40d file classi
#_____________________________________________________________________
@correcting_pairs=( # should be pairs
'd2kauc1 d2kauc2', 'd1pkya1 d1pkya2',
'd1pvda2 d1trka1', 'd1pbe_1 d1pbe_2',
'd1poxa3 d1pvda2', 'd1efga1 d1efga2',
'd1dsba1 d1dsba2', 'd2gsta1 d2gsta2',
'd1bct__ d1brd__', 'd1qora1 d1qora2',
'd2ohxa1 d2ohxa2', 'd1efga2 d1eft_1',
'd1tada1 d1tada2', 'd1gsea1 d1gsea2',
'd1gesa2 d2tmda3', 'd1lvl_2 d2tmda3',
'd2tmda3 d2tpra2', 'd1tde_1 d2tmda3',
'd1nhp_2 d2tmda3', 'd1gesa1 d2tmda3',
'd1lvl_1 d2tmda3', 'd2tmda3 d2tpra1',
'd1fcda1 d2tmda3', 'd1nhp_1 d2tmda3',
'd1tde_2 d2tmda3', 'd1pbe_1 d2tmda3',
'd1ebha1 d1ebha2', 'd1gesa2 d2dlda2', ## 3.4.1 with 3.15.1
'd1gesa2 d1psda2', 'd1nhp_2 d2dlda2',
'd1ldm_1 d1tde_2', 'd1coy_1 d1ldb_1',
'd1lvl_2 d1psda2', 'd1psda2 d1tde_2',
'd1hyha1 d1tde_2', 'd1fcda1 d1ldm_1',
'd1hdca_ d1nhp_2', 'd1fcda1 d1hlpa1',
'd1llda1 d1lvl_2', 'd2dlda2 d2tpra2',
'd1ldm_1 d1nhp_2', 'd1llda1 d1pbe_1',
'd1gdha2 d2tpra1', 'd1ldb_1 d1nhp_2',
'd1gesa2 d1scua2', 'd1fcda1 d1hyha1',
'd1gesa1 d1hlpa1', 'd1gdha2 d1gesa2',
'd1lvl_2 d2dlda2', 'd1gesa1 d2dlda2',
'd1nhp_2 d2ohxa2', 'd1tde_2 d2dlda2',
'd1nhp_1 d2cmd_1', 'd1fcda1 d1ldb_1',
'd1lvl_1 d2ohxa2', 'd1nhp_2 d2naca2',
'd1pbe_1 d2ohxa2', 'd1gdha2 d1nhp_2',
'd2cmd_1 d2tpra1', 'd1tde_1 d2cmd_1',
'd1llda1 d1nhp_2', 'd1hlpa1 d1nhp_2',
'd1nhp_1 d2dlda2', 'd1hyha1 d1nhp_2',
'd1nhp_2 d1psda2', 'd1fcda1 d2cmd_1',
'd1fcda1 d1llda1', 'd1lvl_2 d1udpa_',
'd1psda2 d2tpra2', 'd1hdca_ d1lvl_2',
'd1gesa2 d1llda1', 'd1nhp_2 d1qora2',
'd1ldm_1 d2tpra1', 'd1coy_1 d2dlda2',
'd2dlda2 d2tpra1', 'd1hdca_ d1pbe_1',
'd1coy_1 d1gdha2', 'd1nhp_2 d2cmd_1',
'd1llda1 d1tde_1', 'd1llda1 d1lvl_1',
'd1bdma1 d2tpra1', 'd1gd1o1 d2tpra2',
'd1ldb_1 d1lvl_1', 'd1hlpa1 d1tde_2',
'd1coy_1 d1psda2', 'd1nhp_2 d1udpa_',
'd1llda1 d1tde_2', 'd1tde_2 d2cmd_1',
'd1llda1 d2tpra2', 'd1ldb_1 d1tde_1',
'd1coy_1 d1hlpa1', 'd1coy_1 d2cmd_1',
'd1bdma1 d1gesa2', 'd1hyha1 d2tpra2',
'd1gesa2 d1hyha1', 'd1gesa2 d2ohxa2',
'd1ldb_1 d1tde_2', 'd1hlpa1 d1pbe_1',
'd1ldm_1 d2tpra2', 'd2ohxa2 d2tpra1',
'd1ldb_1 d2tpra2', 'd1gesa2 d1ldm_1',
'd1lvl_2 d1qora2', 'd1gesa1 d2naca2',
'd1coy_1 d1llda1', 'd1coy_1 d1hyha1',
'd1coy_1 d1ldm_1', 'd1ldm_1 d1lvl_2',
'd1eny__ d1nhp_2', 'd1pbe_1 d2pgd_2',
'd1ldb_1 d1pbe_1', 'd1ldb_1 d1lvl_2',
'd1gesa2 d1hlpa1', 'd1dhr__ d1nhp_2',
'd1hdca_ d1tde_1', 'd1gesa1 d1psda2',
'd1pbe_1 d2cmd_1', 'd1tde_2 d1udpa_',
'd1pbe_1 d2dlda2', 'd1hdca_ d1tde_2',
'd1gesa2 d1ldb_1', 'd1psda2 d2tpra1',
'd1gdha2 d1lvl_2', 'd1tde_1 d2dlda2',
'd1ldm_1 d1pbe_1', 'd1pbe_1 d1scua2',
'd1gesa1 d2ohxa2', 'd1lvl_2 d2naca2',
'd1gd1o1 d1lvl_1'
);
for($i=0; $i< @correcting_pairs; $i++){
$correcting_pairs{$correcting_pairs[$i]}=$correcting_pairs[$i];
}
if($E_mult_factor1=~/^ *$/){ $E_mult_factor1=1; };
@keys=sort keys %seqs;
@keys=@{&strip_sequence_ranges(\@keys)};
@keys=@{&remove_dup_in_array(\@keys)};
@pdbg_seqs_ori=@pdbg_seqs;
$num_enq_seq=@pdbg_seqs;
print "\n# In get_isearch_result_stat: PDBG seqs $num_enq_seq \n=> @pdbg_seqs\n\n" if $verbose;
#@pdbg_seqs=@{&strip_sequence_ranges(\@pdbg_seqs)};
#@pdbg_seqs=@{&remove_dup_in_array(\@pdbg_seqs)};
if($num_enq_seq < 2){ print "\n# \$num_enq_seq is less than 2 @pdbg_seqs $base\n"; exit; }
for($c=0; $c < @keys; $c++){
my($enq_seq, $correct, $false_positive);
$num_of_matched=@match_seqs=split(/ +/, $seqs{$keys[$c]});
$enq_seq=$keys[$c];
for($d=0; $d< @match_seqs; $d++){
my($correct_matched, @sorted);
$match_seq=$match_seqs[$d];
$sorted=join(' ', sort ($enq_seq, $match_seq) );
for($i=0; $i< @pdbg_seqs; $i++){
if($match_seq =~/d?$pdbg_seqs[$i]/i or $correcting_pairs{$sorted} ){
print "\n# \$match_seq = $match_seq, \$pdbg_seqs $pdbg_seqs[$i] \$enq_seq: $enq_seq\n" if $verbose;
$correct++;
$correct_matched=1;
unless($correct{$sorted}){
$correct_group{$base} .="Homolog: $sorted $base $msp_0{$sorted}\n";
}
$correct{$sorted} = "Homolog: $base $msp_0{$sorted}";
}
}
if($correct_matched !=1){
$false_positive++;
unless( $correct{$sorted} ){
$correct_group{$base} .="Nomolog: $sorted $base $msp_0{$sorted}\n";
}
$correct{$sorted} = "Nomolog: $base $msp_0{$sorted}";
}
}
if(@match_seqs == 0){ @match_seqs=1; $percent_correct=0; }
$sum_correct += $correct;
$sum_false += $false_positive;
}
$av_correct = $sum_correct/$num_enq_seq;
$av_false = $sum_false /($num_enq_seq);
#### $actual_e_value becomes whatever $E_mult_factor1 defined ~~~~~~~~~~~~
if($E_mult_factor1 != 1){
$actual_e_value=$evalue * $E_mult_factor1;
}elsif($E_mult_factor2 != 1){
$actual_e_value= $evalue * $E_mult_factor2;
}else{ $actual_e_value=$evalue }
$num_enq_seq--;
$sum_correct_for_additional = $num_enq_seq+1;
$match_count=$sum_correct_for_additional * $av_correct;
#$sum_correct= $sum_correct_for_additional;
if($verbose){
printf ("%-10s %-12s %-13f %-13f %-7s %-7s %-7s %-7s %-4s\n", $pdbg_base,
$actual_e_value, $av_correct, $av_false, $num_enq_seq,
$sum_correct_for_additional, $sum_false, $match_count, $leng_thresh);
}
@correct_new=@{&remove_dup_in_array(\@correct_new)};
for($i=0; $i< @correct_new; $i++){
print "\n# correct new: $correct_new[$i]" ;
}
$num_correct=$match_count/2;
print "Num of non-reflective correcct: $num_correct Nomolog: $sum_false \n\n" if $verbose;
return([$av_correct, $sum_correct, $num_enq_seq, \%correct, \%correct_group]);
}
#__________________________________________________________________
# Title : strip_sequence_ranges
# Usage :
# Function :
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : remove_sequence_ranges, remove_sequence_name_ranges,
# remove_ranges_in_sequences, strip_sequence_name_ranges,
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.0
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sub strip_sequence_ranges{
my (@out, $i);
my @in=@{$_[0]} or @in=@_;
for($i=0; $i< @in; $i++){
if($in[$i]=~/^(\S+)_\d+\-\d+/){
push(@out, $1);
}else{
push(@out, $in[$i]);
}
}
return(\@out);
}
#__________________________________________________________________________
# Title : open_sequence_index_files
# Usage : open_sequence_index_files(<indexfilename>, <sequencename>);
# Function : returns seqname with its seek pos in fasta sequence db file.
# Example : %index=%{&open_sequence_index_files(\@INDEX_FILE, \@input_seq_names)};
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : open_seq_index_files, open_seq_idx_files, open_idx_files,
# get_sequence_index, get_seq_index, get_sequence_with_index
# Options : _ or # for debugging
# Returns :
# Argument :
# Category :
# Version : 1.2
#--------------------------------------------------------------------------
sub open_sequence_index_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my( %final_index, %long_index);
for($i=0; $i< @file; $i++){
open(INDEX, "$file[$i]");
while(<INDEX>){
if(/^(\S+) +(\S+)$/){
$long_index{$1}=$2;
}
}
for($j =0; $j < @string; $j++){ #<<<< @string has the sequence NAMEs >>>>
if($input_seq_names[$j]=~/^(\S+)_\d+\-\d+/){
$seq_with_index{$input_seq_names[$j]}=$long_index{$1};
}else{
$seq_with_index{$input_seq_names[$j]}=$long_index{$input_seq_names[$j]};
}
}
}
return(\%final_index);
}
#__________________________________________________________________
# Title : do_intermediate_sequence_search
# Usage : &do_intermediate_sequence_search(\%pdb_seq, $owl_db_fasta, $ARGV[0], $single_msp, $over_write,
# "u=$upper_expect_limit", "l=$lower_expect_limit", "k=$k_tuple" );
#
# Function :
# Example : &do_intermediate_sequence_search(\%pdb_seq, $owl_db_fasta, $ARGV[0], $single_msp, $over_write,
# "u=$upper_expect_limit", "l=$lower_expect_limit", "k=$k_tuple" );
#
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords :
# Options :
# Query_seqs= for enquiry sequences eg) "Query_seqs=$ref_of_hash"
# DB= for target DB "DB=$DB_used"
# File= to get file base(root) name. "File=$file[0]"
# m for MSP format directly from FASTA or Ssearch result than through sso_to_msp to save mem
# s for the big single output (msp file output I mean)
# o for overwrite existing xxxx.fa files for search
# c for create SSO file (sequence search out file)
# R for adding ranges to the enquiry sequences as well.
# k= for k-tuple value. default is 1 (ori. FASTA prog. default is 2)
# u= for $upper_expect_limit
# l= for $lower_expect_limit
# a= for choosing either fasta or ssearch algorithm
#
# Returns : the names of files created (xxxxx.msp, yyy.msp,,)
# Argument :
# Category :
# Version : 1.1
#----------------------------------------------------------------------------------------
sub do_intermediate_sequence_search{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my (@final_out, $add_range, $single_big_msp, $base_name, $create_sso, @nondup,
$Single_msp_out_file, %duplicate, $Evalue_thresh, $Score_thresh, @SSO, $sequence_DB,
@sso, @temp, $algorithm, $margin, $out_msp_file, @MSP, @final_msp_file_names_out,
$upper_expect_limit, $lower_expect_limit, $k_tuple, %seq_input, %MSP, $add_range_to_enquiry );
my ($E_val) = 5; ## default 5 <<<<<<<<<<<<<<<<<<<<<
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# DEFAULTS
#________________________________________
$k_tuple=1;
$algorithm='fasta';
$upper_expect_limit=10;
$lower_expect_limit=0;
$Score_thresh =75;
$margin =0;
$add_range ='';
$sequence_DB =$ENV{'PDB40D_FASTA'};
if($vars{'a'}=~/\S+/){ $algorithm = $vars{'a'} };
if($vars{'u'}=~/\d+/){ $upper_expect_limit = $vars{'u'} };
if($vars{'l'}=~/\d+/){ $lower_expect_limit = $vars{'l'} };
if($vars{'k'}=~/\d+/){ $k_tuple = $vars{'k'} };
if($vars{'t'}=~/\d+/){ $Score_thresh = $vars{'t'} };
if($vars{'m'}=~/\d+/){ $margin = $vars{'m'} };
if($vars{'r'}=~/\S+/){ $add_range = 'r' };
if($vars{'s'}=~/\S+/){ $single_big_msp = 's' };
if($vars{'DB'}=~/\S+/){ $sequence_DB = $vars{'DB'} };
if($vars{'File'}=~/\S+/){ $input_file_name = $vars{'File'} };
if($vars{'Query_seqs'}=~/\S+/){ %seq_input = %{$vars{'Query_seqs'}}};
if($vars{'e'} =~/\S+/){ $E_val = $vars{'e'} };
if($char_opt=~/r/){ $add_range = 'r' }
if($char_opt=~/R/){ $add_range_to_enquiry = 'R' }
if($char_opt=~/c/){ $create_sso = 'c' }
if($char_opt=~/s/){ $single_big_msp = 's'; print "\n# Single file opt is set\n"; }
if($char_opt=~/m/){ $msp_directly_opt = 'm' }
if($char_opt=~/i/){ $do_intermediate_search = 'i' }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# When, you didn't use "DB=$XXX" and "File=$FXXX" format, first file input is DB etc
#_______________________________________________________________________________________
if($input_file_name=~/^$/){ $input_file_name=$file[1];
print "\n# You did not use \"File=\$XXX\" format\n" };
if($sequence_DB=~/^$/){ $sequence_DB =$file[0];
print "\n# You did not use \"DB=\$XXX\" format\n" };
print "\n# Finished writing the enquiry fasta files from \%seq_input by write_fasta_seq_by_seq";
print "\n# I am in do_sequence_search sub, Target database used : $sequence_DB \n";
my $base = ${&get_base_names($input_file_name)};
$out_msp_file="$base\.msp";
@temp=`$algorithm -m 10 -H -E $E_val $input_file_name $sequence_DB $k_tuple`;
if(@temp < 40){ print "\n# There must be error , \@temp is too small\n\n"; }
my @msp_hashes_from_temp = @{&open_sso_files(\@temp, $add_range,
"u=$upper_expect_limit",
"l=$lower_expect_limit",
$add_range_to_enquiry)};
my @msp_from_temp= values %{$msp_hashes_from_temp[0]};
$MSP{$out_msp_file} = \@msp_from_temp;
open(MSPOUT, ">$out_msp_file");
for($i=0; $i< @msp_from_temp; $i++){
print MSPOUT $msp_from_temp[$i];
print $msp_from_temp[$i];
}
close MSPOUT;
return(\$out_msp_file);
}
#____________________________________________________________________________________
# Title : do_sequence_search
# Usage : &do_sequence_search("Query_seqs=\%pdb_seq", "DB=$sequence_db_fasta",
# "File=$ARGV[0]", $single_msp, $over_write,
# "u=$upper_expect_limit", "l=$lower_expect_limit",
# "k=$k_tuple", $No_processing );
# Function : do FASTA, SSEARCH or BLASTPGP(psi-blast) search
# Example : &do_sequence_search(\%pdb_seq, $owl_db_fasta, $ARGV[0], $single_msp, $over_write,
# "u=$upper_expect_limit", "l=$lower_expect_limit", "k=$k_tuple" );
#
# Keywords : sequence_search
# Options :
# Query_seqs= for enquiry sequences eg) "Query_seqs=$ref_of_hash"
# DB= for target DB "DB=$DB_used"
# File= to get file base(root) name. "File=$file[0]"
# m for MSP format directly from FASTA or Ssearch result than through sso_to_msp to save mem
# s for the big single output (msp file output I mean)
# s= for the single big msp file name
# o for overwrite existing xxxx.fa files for search
# c for create SSO file (sequence search out file)
# d for very simple run and saving the result in xxxx.gz format in sub dir starting with one char
# r for reverse the query sequence
# R for attaching ranges of sequences
# k= for k-tuple value. default is 1 (ori. FASTA prog. default is 2)
# u= for $upper_expect_limit
# l= for $lower_expect_limit
# a= for choosing either fasta or ssearch algorithm
# d= for defining the size of subdir made. 2 means it creates
# eg, DE while 1 makes D
# d for $make_gz_in_sub_dir_opt, putting resultant sso files in gz format and in single char subdir
# D for $make_msp_in_sub_dir_opt, convert sso to msp and put in sub dir like /D/, /S/
# n for new format to create new msp file format with sso_to_msp routine
# PVM= for PVM run of FASTA (FASTA only)
# M for machine readable format -m 10 option
# M= for machine readable format -m 10 option
# N for 'NO' do not do any processing but, do the searches only.
# FILE_AGE for defining the age of file in days to be overwritten.
# L for Lean output(removes xxxx.fa query seq file)
#
# Returns : the names of files created (xxxxx.msp, yyy.msp,,)
# Version : 5.4
#----------------------------------------------------------------------------------------
sub do_sequence_search{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my (@final_out, $add_range, $single_big_msp, $base_name, $create_sso, @nondup,
$Single_msp_out_file, %duplicate, $Evalue_thresh, $Score_thresh, @SSO, $sequence_DB,
@sso, @temp, $algorithm, $margin, $out_msp_file, @MSP, @final_msp_file_names_out,
$upper_expect_limit, $lower_expect_limit, $k_tuple, %seq_input, %MSP, $No_processing,
$new_format, $PVM_FASTA_run, $over_write, $sub_dir_size, $age_in_days_of_out_file,
$over_write_by_age, $Lean_output, $gzipped_msp_file, $gzipped_sso_file,
$defined_all_ok, $make_msp_in_sub_dir_opt );
my ($E_val) = 5; ## default 5 <<<<<<<<<<<<<<<<<<<<<
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# DEFAULTS
#________________________________________
$k_tuple =1; # 1 or 2, 1 is more sensitive
$algorithm ='fasta';
$sub_dir_size =2; # the default char number taken from seq name to make sub dirs
$upper_expect_limit=1;
$lower_expect_limit=0;
$Score_thresh =75; # FASTA or SSSEARCH score
$margin =0; # sequence region margin. If it is 2, 2 more edged residues will be added
$add_range ='';
$pwd =`pwd`; chomp($pwd);
$age_in_days_of_out_file=1000; ## If the files being tested is older than this, let's make anew
if($vars{'a'}=~/\S+/){ $algorithm = $vars{'a'} };
if($vars{'u'}=~/\d+/){ $upper_expect_limit = $vars{'u'} };
if($vars{'l'}=~/\d+/){ $lower_expect_limit = $vars{'l'} };
if($vars{'k'}=~/\d+/){ $k_tuple = $vars{'k'} };
if($vars{'t'}=~/\d+/){ $Score_thresh = $vars{'t'} };
if($vars{'m'}=~/\d+/){ $margin = $vars{'m'} };
if($vars{'d'}=~/\d+/){ $sub_dir_size = $vars{'d'} };
if($vars{'r'}=~/\S+/){ $add_range = 'r' };
if($vars{'s'}=~/\S+/){ $single_big_msp = 's' };
if($vars{'DB'}=~/\S+/){ $sequence_DB=$vars{'DB'} ;
if(-s $sequence_DB){
}elsif(-s "../$sequence_DB"){ $sequence_DB= "../$sequence_DB"
}elsif(-s "../../$sequence_DB"){ $sequence_DB= "../../$sequence_DB";
}else{
print "\n# (E) do_sequence_search: You set DB param, but I can\'t find $sequence_DB\n";
exit;
}
}else{ print "\n# (E) do_sequence_search: I need DB param defined, sorry, aborting\n"; }
if($vars{'FILE'}=~/\S+/){ $input_file_name = $vars{'FILE'}; push(@file,$input_file_name) };
if($vars{'File'}=~/\S+/){ $input_file_name = $vars{'File'}; push(@file,$input_file_name) };
if($vars{'FILE_AGE'}=~/\S+/){ $age_in_days_of_out_file= $vars{'FILE_AGE'}; };
if($vars{'Query_seqs'}=~/\S+/){ %seq_input = %{$vars{'Query_seqs'}}};
if($vars{'Query'}=~/\S+/){ %seq_input = %{$vars{'Query'}}};
if($vars{'u'} =~/\S+/){ $E_val = $vars{'u'} };
if($vars{'PVM'} =~/\S+/){ $PVM_FASTA_run = $vars{'PVM'}; print "\n# PVM opt is set\n"; };
if($vars{'M'} =~/\S+/){ $machine_readable = $vars{'M'}; };
if($char_opt=~/r/){ $add_range = 'r' }
if($char_opt=~/L/){ $Lean_output = 'L' }
if($char_opt=~/o/){ $over_write = 'o' }
if($char_opt=~/c/){ $create_sso = 'c' }
if($char_opt=~/s/){ $single_big_msp = 's'; print "\n# Single file opt is set\n"; }
if($char_opt=~/m/){ $msp_directly_opt = 'm' }
if($char_opt=~/M/){ $machine_readable = 'M' }
if($char_opt=~/d/){ $save_in_gz_in_sub_dir = 'd' }
if($char_opt=~/D/){$make_msp_in_sub_dir_opt= 'D' } # for simple search and storing msp file
if($char_opt=~/N/){ $No_processing = 'N'; $create_sso='c'; }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
# When no %seq is given, but files
#___________________________________________
if(@hash==0 and @file > 0){
print "\n# (i) do_sequence_search: You did not put sequences as in \%seq, but raw sequence file @file!\n";
print " I will run \'open_fasta_files\' sub to fetch sequences to store in \%seq_input\n";
%seq_input=%{&open_fasta_files(\@file)};
}else{
print "\n# (i) do_sequence_search: I will use given seqs in \%seq_input from \%\{\$hash\[0\]\}\n";
%seq_input=%{$hash[0]};
}
my (@seq_names) = keys %seq_input;
$base_name = ${&get_base_names($input_file_name)};
print "\n# (i) line:",__LINE__, ", do_sequence_search, \$algorithm => $algorithm, \$base_name:$base_name
$input_file_name <--> $sequence_DB\n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~```
# If one of the files is missing, ask if it is for self self search and
# make query=target, else die
#_______________________________________________________________________
$defined_all_ok=&check_if_defined($input_file_name, $sequence_DB);
unless($defined_all_ok){
print "\n Did you want to do self self search? ->(y/n) ";
$answer_for_self_self=getc;
if($answer_for_self_self =~/y/i){
if($input_file_name){ $sequence_DB=$input_file_name }
else{ $input_file_name=$sequence_DB }
}else{
print "\n# You seemed made a mistake, O.K., I will kill myself!\n\n";
print chr(7); exit;
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (0) If blast is chosen run Blast
#_________________________________________________________
if($algorithm=~/[psi\-]*[pb][last]*/i){
print "\n# (i) Doing PSI search with @file\n";
@final_out=@{&do_psi_blast_search(\@file, "d=$source_DB_file",
"i=$input_seq_file", $over_write,
$make_msp_in_sub_dir_opt, $Lean_output)};
return(\@final_out); #<<<<<<----------- F I N I S H
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Controlling which kind of search it should do. Do save_in_gz_in_sub_dir first if d is set
#______________________________________________________________________________________________
if( $make_msp_in_sub_dir_opt ){ ## convert sso to msp and put in sub dir like /D/, /S/
for($x=0; $x < @seq_names; $x++){
my ($over_write_sso_by_age, $over_write_msp_by_age, %single_seq,
$out_file_sso_gz_name, $out_file_msp_name, $out_file_gz_name, $existing_sso);
my ($seq_name, $seq)= ($seq_names[$x], $seq_input{$seq_names[$x]});
my $first_char= substr("\U$seq_name", 0, $sub_dir_size);
mkdir ("$first_char", 0777) unless -d $first_char;
chdir("$first_char");
print "\n# (i) do_sequence_search: You set \'d\' or \'D\' opt\n";
print "# (i) making subDIRs ($first_char) with $seq_name $sequence_DB to store MSP files\n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Let's make each fasta file for each seq to be used in searching
#_____________________________________________________________________
my $temp_file_name="$seq_name.fa";
%single_seq=($seq_name, $seq_input{$seq_name});
&write_fasta(\%single_seq, $temp_file_name ); ## e for writing each file
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Making output file name according to the option given
#_______________________________________________________________________
if($machine_readable and $algorithm=~/[fastassearch]+/){
$out_file_sso_name="$seq_name\.msso";
}else{ $out_file_sso_name="$seq_name\.sso"; }
$out_file_sso_gz_name="$out_file_sso_name\.gz";
$out_file_msp_name="$seq_name\.msp";
$out_file_gz_name="$seq_name\.msp\.gz";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Check if SSO file already there
#_______________________________________________________________________
if(-s $out_sso_file){ $existing_sso=$out_file_sso_name }
elsif(-s $out_sso_gz_name){ $existing_sso=$out_file_sso_gz_name }
if(-s $out_msp_name){ $existing_msp=$out_file_msp_name }
elsif(-s $out_gz_name){ $existing_msp=$out_file_gz_name }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If the dates of files created are long ago, overwrite to refresh
#____________________________________________________________________
if( (localtime(time- (stat($existing_sso))[9]))[3] > $age_in_days_of_out_file ){
$over_write_sso_by_age='o';
}
if( (localtime(time- (stat($existing_msp))[9]))[3] > $age_in_days_of_out_file ){
$over_write_msp_by_age='o';
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To check if the target seq DB is in ../
#________________________________________________
if(-s $sequence_DB){
print "\n# (i) Good, target \$sequence_DB $sequence_DB is in this working dir\n";
}elsif( -s "../$sequence_DB"){ $sequence_DB="../$sequence_DB"; }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) Searching: Making MSP files directly, MSP file format is the major format used in geanfammer!
#_________________________________________________________________________________________
if($char_opt =~/D/){ #### To make MSP file
if( !(-s $out_file_gz_name or -s $out_file_msp_name) or $over_write or $over_write_msp_by_age){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2.1) Running run_fasta_sequence_search !!
#_______________________________________________________
print "\n# (i) Running run_fasta_sequence_search !!\n";
$gzipped_msp_file=${&run_fasta_sequence_search(
"a=$algorithm",
"O=$out_file_msp_name",
"File=$temp_file_name", "e=$E_val",
"DB=$sequence_DB", "k=$k_tuple", "$machine_readable")};
$gzipped_sso_file=${&compress_files_by_gzip($out_file_sso_name)};
}else{
print "\n# Line No. ", __LINE__,", $out_file_gz_name already exists or
\$over_write is set or NOT older than $age_in_days_of_out_file\n";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To make gzipped SSO files and MSP files
#_______________________________________________
elsif($create_sso or $char_opt=~/m/){ ### To make gzipped SSO files
if( !(-s $out_file_sso_name or -s $out_file_sso_gz_name ) or $over_write or $over_write_sso_by_age){
print "\n# (i) Running run_fasta_sequence_search with \"\$create_sso option\"!!\n\n";
$gzipped_msp_file=${&run_fasta_sequence_search(
"a=$algorithm",
"O=$out_file_msp_name", "$create_sso",
"File=$temp_file_name", "e=$E_val",
"DB=$sequence_DB", "k=$k_tuple", "$machine_readable")};
$gzipped_sso_file=${&compress_files_by_gzip($out_file_sso_name)};
}else{
print "\n# Line No. ", __LINE__,", $out_file_gz_name already exists or
\$over_write is set or NOT older than $age_in_days_of_out_file\n";
}
}else{
if( !(-s $out_file_sso_name or -s $out_file_sso_gz_name ) or $over_write or $over_write_sso_by_age){
system(" $algorithm -m 10 -H -E $E_val $temp_file_name $sequence_DB $k_tuple > $out_file_sso_name");
system("gzip $out_file_sso_name");
}else{
print "\n# Line No. ", __LINE__,", $out_file_gz_name already exists or
\$over_write is set or NOT older than $age_in_days_of_out_file\n";
}
}
unlink("$seq_name.fa") if -s "$seq_name.fa";
unlink("$first_char/$seq_name\.fa") if -s "$first_char/$seq_name\.fa";
print "\n# Sub dir $first_char and $seq_name\.msp has been made, finishing do_sequence_search\n";
chdir ('..');
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# F I N I S H
#________________________________________
goto EXIT;
} # if ($char_opt =~/[dD]/){ is finished
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) Writing on PWD. This is the big single MSP output
#____________________________________________________________
$Single_msp_out_file="$base_name\.msp" if($single_big_msp eq 's');
if(-s $Single_msp_out_file and !$over_write ){
print "\n# (i) $Single_msp_out_file exists, and no \$over_write is set, skipping \n";
push(@final_out, $Single_msp_out_file);
}else{ $over_write ='o'; }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Check if it is necessary to write each sequences.fa files
#______________________________________________________
if( $over_write ){ &write_fasta_seq_by_seq(\%seq_input, 'e'); } ## e for writing each seq file
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# When, you didn't use "DB=$XXX" and "File=$FXXX" format, first file input is DB etc
#_______________________________________________________________________________________
$defined_all_ok=&check_if_defined($input_file_name, $sequence_DB);
if(!$defined_all_ok){ print "\n# (E) FATAL: do_sequence_search: You did not use \"DB=\$XXX\" format\n"; exit };
print "\n# Finished writing the enquiry fasta files from \%seq_input by write_fasta";
print "\n# I am in do_sequence_search sub, Target database used : $sequence_DB with seqs of \'@seq_names\'\n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main search with given @seq_names
#______________________________________________________________
for($j=0; $j< @seq_names; $j++){ # @seq_names has sequence names coming from (@seq_names) = keys %seq_input;
my ($over_write_sso_by_age, @temp, $existing_sso, $out_gz_name,
$over_write_msp_by_age, $existing_msp, $out_msp_file, $seq_name);
$seq_name=$seq_names[$j];
$each_seq_fasta="$seq_name\.fa";
$out_msp_file="$seq_name\.msp";
$out_gz_name="$seq_name\.msp\.gz";
$out_msso_file="$seq_name\.msso";
&die_if_file_not_present($each_seq_fasta);
print "\n# (i) :-) Found $each_seq_fasta is searched against $sequence_DB\n";
if($algorithm=~/fasta/){ $out_sso_file="$seq_name\.fsso";
}elsif($algorithm=~/ssearch/){ $out_sso_file="$seq_name\.ssso"; }
$out_sso_gz_name="$out_sso_name\.gz";
if(-s $out_sso_file){ $existing_sso=$out_sso_file }
elsif(-s $out_sso_gz_name){ $existing_sso=$out_sso_gz_name }
if(-s $out_msp_file){ $existing_msp=$out_msp_file }
elsif(-s $out_gz_name){ $existing_msp=$out_gz_name }
if( (localtime(time- (stat($existing_sso))[9]))[3] > $age_in_days_of_out_file ){
$over_write_sso_by_age='o';
}
if( (localtime(time- (stat($existing_msp))[9]))[3] > $age_in_days_of_out_file ){
$over_write_msp_by_age='o';
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# To check if the target seq DB is in ../
#________________________________________________
if(-s $sequence_DB){ print "\n# (i) \$sequence_DB $sequence_DB exists, Good\n";
}elsif( -s "../$sequence_DB"){ $sequence_DB="../$sequence_DB";
}elsif( -s "../../$sequence_DB"){ $sequence_DB="../../$sequence_DB"; }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If MSP file already exist
#_____________________________________________________________
if( -s $out_msp_file and !$over_write_msp_by_age and !$over_write ){
print "\n# (i) File: $out_msp_file exists, skipping, to overwrite use \'o\' opt or set days";
push(@final_out, $out_msp_file);
}else{ ## -E is for e value cutoff. -b is for num of seq fetched
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~~~~~~~~~~~~~~
# K-tuple is 1 by default. If xxxx.sso exsts, skip running fasta or ssearch
#________________________________________________________________________________
if(-s $out_sso_file and !$over_write ){ ## If SSO is already present, JUST READ IT!
print "\n# (i) Just opening existing $out_sso_file $out_sso_file $out_msp_file $over_write_msp_by_age $over_write\n";
open(SSO_ALREADY, "$out_sso_file");
@temp=<SSO_ALREADY>;
print "\n# (i) \@temp has ", scalar(@temp), " lines\n";
close(SSO_ALREADY);
&compress_files_by_gzip($out_sso_file);
}else{ ## Run FASTA HERE
print "\n# (i) Running \"run_fasta_sequence_search\" ";
$gzipped_msp_file=${&run_fasta_sequence_search(
"a=$algorithm",
"O=$out_msp_file", "$create_sso",
"File=$each_seq_fasta", "e=$E_val",
"DB=$sequence_DB", "k=$k_tuple", "$machine_readable")};
push(@final_out, $gzipped_msp_file) if -s $gzipped_msp_file ;
unlink($each_seq_fasta) if $Lean_output;
}
}
if($machine_readable and $create_sso and -s $out_sso_file){ &cp($out_sso_file, $out_msso_file); }
} # end of for($j=0; $j< @seq_names; $j++){
return(\@final_out);
EXIT:
} # do_sequence_search
#__________________________________________________________________________
# Title : do_hmm_sequence_search
# Usage : &do_hmm_sequence_search(\@file, "method=$default_search_method",
# $over_write, "DB=$pdbd40_seq_fasta");
#
# Function : does hmm sequence search using Sean Eddy's HMMER (hmmls, hmmfs)
# Example :
# Keywords : do_seq_search_with_hmm, do_hmmt_sequence_search
# Options :
# "method=ls" for turning hmmls search option on (default)
# "method=fs" for turning hmmfs search option on
# method= by method=
# o for overwriting existint xxxxx.hmm files
# E=Enguiry_name for specifying enquiry seq name rather than 'HMM', the default
# t=20 for score thresh at the level of hmmls. Default of hmmls is 0. example showed has 15
# $evalue_cutoff= by e=
# $over_write = o by -o o
# Returns :
# Argument :
# Version : 1.6
#----------------------------------------------------------------------------
sub do_hmm_sequence_search{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($version_number, @out_hmm_file_names, $evalue_cutoff);
my $score_thresh=5; # default threshold
$evalue_cutoff=3;
$default_search_method='hmmsearch';
$version_number=2; ## default
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# checking the existence of hmm search file
#_____________________________________________________
if(&check_file_exists_in_path("hmmsearch")){
$default_search_method='hmmsearch';
$version_number=2;
}elsif(&check_file_exists_in_path("hmmls")){
$default_search_method='hmmls';
$version_number=1;
}else{
print "\n# (ERROR) $0 can not find hmmsearch or hmmls, Please put them in the PATH\n\n";
if($vars{'method'}=~/ls/){
}elsif( $vars{'method'}=~/fs/){ $default_search_method='hmmfs';
}else{ exit; }
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Checking the target DB file
#______________________________________________________________
if( $vars{'DB'} =~/\S/ and -s $vars{'DB'}){ $target_DB = $vars{'DB'}
}else{ print "\n# (WARN) I need target DB to search for hmmls-fs. Use: DB=xxxx.fa form";
print "\n# or Default PDB40D_FASTA ENV setting will be used for DB\n";
$target_DB= $ENV{'PDBD40_SEQ_FASTA'};
unless(-s $target_DB){
print "\n# (ERROR) Even the default DB setting $target_DB does not exist, check path/file\n\n";
exit;
}
}
if($vars{'E'}=~/\S/){ $enquiry_name =$vars{'E'} }
if($vars{'t'}=~/\S/){ $score_thresh =$vars{'t'} }
if($vars{'e'}=~/\S/){ $evalue_cutoff =$vars{'e'} }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# running hmm with @file (hmmb files) against target_DB
#____________________________________________________________
for($i=0; $i< @file; $i++){
print "\n# Running do_hmm_sequence_search with $file[$i], options are: $char_opt\n";
if($vars{'E'}=~/\S/){ $base=$enquiry_name; # When $enquiry_name is given, it uses for output name
}else{
$base=${&get_base_names($file[$i])};
}
if($default_search_method=~/hmmfs/){
$output_hmm_result = "$base\.hmmfs";
}elsif($default_search_method=~/hmmsearch/ or $default_search_method=~/hmmls/){
$output_hmm_result = "$base\.hmmls";
}
if($char_opt=~/o/ or !(-s $output_hmm_result) ){
if($version_number==2){
print "Running: $default_search_method -T $score_thresh -E $evalue_cutoff $file[$i] $target_DB \> $output_hmm_result\n";
system("$default_search_method -T $score_thresh -E $evalue_cutoff $file[$i] $target_DB > $output_hmm_result");
}else{
print "Running: $default_search_method -t $score_thresh $file[$i] $target_DB \> $output_hmm_result\n";
system("$default_search_method -t $score_thresh $file[$i] $target_DB > $output_hmm_result");
}
}else{
print "\n# The $out_hmm_file file already exists. To overwrite use -o opt\n";
}
push(@out_hmm_file_names, $output_hmm_result);
}
if(@out_hmm_file_names > 1){
return(\@out_hmm_file_names);
}else{
return(\$out_hmm_file_names[0]);
}
}
#_______________________________________________________________________
# Title : divide_clusters
# Usage : ÷_clusters(\@file);
# Function : This is the main funciton for divclus.pl
# divides complex single linkage cluster into smaller duplication
# module level sub clusters.
# Example : ÷_clusters(\@file, $verbose, $range, $merge, $sat_file,
# $dindom, $indup, "T=$length_thresh", "e=$evalue", $over_write,
# $optimize, "s=$score", "f=$factor");
#
# Keywords : divicl, divclus, div_clus, divide clusters
# Options : _ for debugging.
# f=<digit> for determing the factor in filtering out non-homologous
# regions, 7 = 70% now!!
# l=<digit> for seqlet(duplication module) length threshold
# t=<digit> for seqlet(duplication module) length threshold
# (same as l opt, confusing, huh? )
# s=<digit> for score threshold
# e=<digit> for evalue threshold
# z for activating remove_similar_sequences, rather than remove_dup....
# o for overwriting
# v for verbose printout (infor)
# D for dynamic factor
# S $short_region= S by S -S # taking shorter region overlap in removing similar reg
# L $large_region= L by L -L # taking larger region overlap in removing similar reg
# A $average_region=A by A -A # taking average region overlap in removing similar reg
# o for $over_write
#
# Version : 2.9
#------------------------------------------------------------------------
sub divide_clusters{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($merge, $verbose, $sat_file, $length_thresh, $factor, $indup, $indup_percent,
$score, @temp_show_sub, $optimize, $file, $evalue, $over_write, $din_dom,
$sum_seq_num, $base_1, $output_clu_file, $short_region, $large_region,
$average_region, $dynamic_factor, @sub_clustering_clu_files);
$factor=7; # default factor is 7 for 70%
$length_thresh=30;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Dealing with options
#_________________________________________
if($char_opt=~/m/){ $merge='m';
}if($char_opt=~/v/){ $verbose='v'; # for showing debugging information
}if($char_opt=~/i/){ $indup='i';
}if($char_opt=~/z/){ $optimize='z';
}if($char_opt=~/o/){ $over_write='o';
}if($char_opt=~/d/){ $din_dom='d';
}if($char_opt=~/s/){ $sat_file='s';
}if($char_opt=~/y/){ $dynamic_factor='y';
}if($char_opt=~/S/){ $short_region ='S';
}if($char_opt=~/L/){ $large_region ='L';
}if($char_opt=~/A/){ $average_region='A';
}if($vars{'T'}=~/\d+/){ $length_thresh= $vars{'T'};
}if($vars{'l'}=~/\d+/){ $length_thresh= $vars{'l'}; ## synonym of 't'
}if($vars{'f'}=~/\S+/){ $factor= $vars{'f'};
}if($vars{'s'}=~/\d+/){ $score = $vars{'s'};
}if($vars{'e'}=~/\d+/){ $evalue= $vars{'e'};
}if($vars{'E'}=~/\d+/){ $evalue= $vars{'E'}; # synonym of e
}
$percent_fac=$factor*10;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (0) When one file input was given (yes, divclus can handle multiple files, Sarah!)
#________________________________________________________________________________
if(@file == 1){ #<=== @file has xxxx.msp, yyyy.msp zzzz.msp ....,
print "\n# (1) divide_clusters: One single file was given=> \"@file\"\n" if $verbose;
$file=$file[0];
$base_1=${&get_base_names($file)};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) Define the output cluster file name: eg, 3-232_cluster_F7.clu , F7 means factor used is 7
#______________________________________________________________________________________________
$output_clu_file="$base_1\_F${factor}\.clu";
if( !$over_write and -s $output_clu_file){
print "\n# $output_clu_file Already EXISTS, skipping. Use \'o\' opt to overwrite\n"; exit;
}
print "# (2) divide_clusters: processing ONE single file \"@file\" with merge_sequence_in_msp_file\n" if $verbose;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (3) merge_sequence_in_msp_file does not do much. Just filtering and producing
# sequences in ISPA_PBS_21-215 VPR_PBS_160-354 format from msp format
#________________________________________________________________________________
@grouped_seq_names=@{&merge_sequence_in_msp_file(\@file, "s=$score", $optimize, $din_dom, $sat_file,
$optimize, "T=$length_thresh", "e=$evalue", "f=$factor", "$range", "$merge", $verbose,
$short_region, $large_region, $average_region, $over_write, $dynamic_factor)};
if($verbose){
print "\n\n# (3) divide_clusters: finished running \"merge_sequence_in_msp_file\"\n ==>";
for($i=0; $i< @grouped_seq_names; $i++){
print "\n-->> $grouped_seq_names[$i]";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (4) This is critical seqlet merging step
#________________________________________________________________________________
@out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, $dynamic_factor,
"f=$factor", $short_region, $large_region, $average_region, $optimize)};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (5) This is showing the result in clu file format
#________________________________________________________________________________
@temp_show_sub=&show_subclusterings(\@out, $file, $sat_file, $dindom, $indup,
"e=$evalue", "p=$percent_fac", "f=$factor" );
$good_bad = $temp_show_sub[0];
$indup_c = $temp_show_sub[1];
$sum_seq_num += $temp_show_sub[2];
push(@sub_clustering_out_files, @{$temp_show_sub[3]});
if($good_bad==1){ push(@good, $file);
}else{ push(@bad, $file); }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (6) Final write up stage (unecessary)
#_______________________________________________________________
&write_good_bad_list_in_divide_clusters(\@good, \@bad);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# when more than one single file input is given
#____________________________________________________________
elsif(@file >1 ){
my (@good, @bad);
if($indup =~/i/i){ open (INDUP, ">indup_stat\.txt"); } # this is not essential.
for($i=0; $i< @file; $i++){
my (@grouped_seq_names, @temp_show_sub);
my $indup_c=0;
my $big_msp_file=$file[$i];
unless(-s $big_msp_file){ print "\n# (E) \$big_msp_file does not exist\n"; exit }
$base_1=${&get_base_names($big_msp_file)};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Define the output cluster file name: eg, 3-232_cluster_F7.clu , F7 means factor used is 7
#______________________________________________________________________________________________
$output_clu_file="$base_1\_F${factor}\.clu";
print "\n# DIVCLUS: just before merge_sequence_in_msp_file, \$output_clu_file is $output_clu_file from input file $big_msp_file" if $verbose;
if( !$over_write and -s $output_clu_file){
print "\n# $output_clu_file Already EXISTS, skipping. Use \'w\' opt to overwrite\n";
next; }
print "\n# (1) divide_clusters: processing file \"$big_msp_file\" for $output_clu_file" if $verbose;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) If clu file(eg 2-1618_ss.clu ) is in pwd, tries to skip
#____________________________________________________________
if((-s $output_clu_file) > 10 and $over_write !~/o/){
print "# $output_clu_file exists, skipping, use \"o\" option to overwrite\n"; next;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (3) merge_sequence_in_msp_file does not do much. Just filtering and producing
# sequences in ISPA_PBS_21-215 VPR_PBS_160-354 format of STRING from msp format
# $big_msp_file is an MSP file
#________________________________________________________________________________
print "\n# (i) divide_clusters : I am merging seq in $big_msp_file file for $output_clu_file\n" if $verbose;
@grouped_seq_names=@{&merge_sequence_in_msp_file(\$big_msp_file, "s=$score", $din_dom, $sat_file, $optimize,
"T=$length_thresh", "e=$evalue", "f=$factor", "$range", "$merge", $verbose, $over_write,
$short_region, $large_region, $average_region, $dynamic_factor )};
if($verbose){
print "\n# \@file has more than one input file\n # The result of \"merge_sequence_in_msp_file\"\n";
print "@grouped_seq_names";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (4) Clustering the sets of merged seqlets => CORE algorithm
#____________________________________________________________
@out=@{&cluster_merged_seqlet_sets(\@grouped_seq_names, "f=$factor", $optimize, $dynamic_factor,
$short_region, $large_region, $average_region)};
@temp_show_sub=&show_subclusterings(\@out, $big_msp_file, $sat_file, $dindom, $indup,
"e=$evalue", "p=$percent_fac", "f=$factor");
$good_bad = $temp_show_sub[0];
$indup_c = $temp_show_sub[1];
$sum_seq_num += $temp_show_sub[2];
push(@sub_clustering_out_files, @{$temp_show_sub[3]});
if($good_bad==1){ push(@good, $big_msp_file);
}else{ push(@bad, $big_msp_file); }
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
&write_good_bad_list_in_divide_clusters(\@good, \@bad);
sub write_good_bad_list_in_divide_clusters{
my (@good, @bad, $i); @good=@{$_[0]}; @bad=@{$_[1]};
open(GOODBAD, ">good_bad.list");
print GOODBAD "GOOD: all link : 000\n";
for($i=0; $i< @good; $i++){ print GOODBAD "$good[$i]\n"; }
print GOODBAD "BAD : Not all link: 000\n";
for($i=0; $i< @bad; $i++){ print GOODBAD "$bad[$i]\n"; }
close(GOODBAD);
}
#_______________________________________________________________
}
return(\@sub_clustering_out_files); # contains (xxxx.clu, yyy.clu,, )
}
#______________________________________________________________________________
# Title : remove_file_extension
# Usage :
# Function :
# Example :
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub remove_file_extension{
my (@modified_files, $i, @files);
@files=@_;
for($i=0; $i< @files; $i++){
$base=${&get_base_names($files[$i])};
rename($files[$i], $base);
push(@modified_files, $base);
}
return(\@modified_files);
}
#______________________________________________________________________________
# Title : remove_small_files
# Usage : @files_removed=@{&remove_small_files(@ARGV)};
# Function :
# Example :
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub remove_small_files{
my($file_size_cut_line, $size, @files_removed, $i, @files);
$file_size_cut_line=$_[0];
$file_size_cut_line_bigger_than=$ARGV[1];
@files=@{&read_file_names_only('.')};
for($i=0; $i< @files; $i++){
$size= -s $files[$i];
if($size <= $file_size_cut_line and $size >= $file_size_cut_line_bigger_than){
unlink($files[$i]);
push(@files_removed, $files[$i]);
print "\n# (i) $files[$i] is removed , size= $size byte";
}
}
print "\n$0 finished. You might have killed some useful files :-) \n\n\n\n";
return(\@files_removed);
}
#______________________________________________________________________________
# Title : remove_mail_header_in_files
# Usage :
# Function :
# Example :
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub remove_mail_header_in_files{
my (@modified_files, $i, @files);
@files=@_;
for($i=0; $i< @files; $i++){
my ($line_count);
&cp("$files[$i]", "$files[$i]\.bak");
open(FILE_BAK, "$files[$i]\.bak");
open(FILE, ">$files[$i]");
while(<FILE_BAK>){
$line_count++;
if($line_count > 30){
print FILE $_; next;
} # No point in looking very far down !
if(/^From +\S/){ next;
}elsif(/^Date\: +/){ next;
}elsif(/^From: +/){ next;
}elsif(/^To: +/){ next;
}elsif(/^Subject: +/){ next;
}else{ print FILE;
}
}
close(FILE_BAK);
close(FILE);
if(-s $files[$i] > ( $original_file_size - 400) ){
push(@modified_files,$files[$i]);
print "\n# (i) $files[$i] has real size, I am removing $files[$i]\.bak\n";
}else{
print "\n# (i) The file size of new $files[$i] is a bit small, I am leaving $files[$i]\.bak\n";
}
}
return(\@modified_files);
}
#_____________________________________________________________________________
# Title : remove_similar_seqlets
# Usage : @seqlets=@{&remove_similar_seqlets(\@split)};
# Function : merges(gets average starts and ends ) of similar
# seqlets to reduce them into smaller numbers. This can also handle
# names like XLBGLO2R_8-119_d1hlm__.
#
# Example : @seqlets=@{&remove_similar_seqlets(\@mrg1, $mrg2, \@mrg3)};
# while @mrg1=qw(M_2-100 M_2-110 M_8-105 M_4-108 N_10-110 N_12-115);
# $mrg2='Z_3-400 Z_2-420';
# @mrg3=('X_2-300 X_3-300', 'X_2-300', 'X_5-300 X_2-301' );
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : merge_sequence_names, merge_seq_names, merge_sequence_ranges
# merge_seq_ranges
# Options : _ for debugging.
# # for debugging.
# f= for factor
# S for shorter region matched is used
# A for average region matched is used
# L for larger region matched is used
# Category :
# Version : 2.0
#-------------------------------------------------------------------------------
sub remove_similar_seqlets{
my ($i, $j, $seq1, $smaller_leng, $leng1, $leng2, $start1, $end1, $seq2, $start2,
$av_diff, $num_of_seq, $av_end, $av_start, $end2, @seqlets,
@array_input, @seqlet, $tail1, $tail2, $shorter_region, $larger_region,
$average_region);
my $factor=7; ## !!! This var makes big difference in the final clustering
$average_region = 'A'; ## default is to get the average of comparing regions
for($i=0; $i< @_; $i++){
if(ref($_[$i]) eq 'ARRAY'){
@array_input=@{$_[$i]};
for($j=0; $j<@array_input; $j++){
@seqlet=split(/ +/, $array_input[$j]);
push(@seqlets, @seqlet);
}
if($verbose){
print "\n# remove_similar_seqlets: ARRAY ref is given as input\n";
print "# They are: @seqlets, Sarah\n";
}
}elsif($_[$i]=~/f=(\S+)/){ $factor=$1
}elsif($_[$i]=~/^(S) *$/){ $shorter_region=$1 ; $average_region=0;
}elsif($_[$i]=~/^(L) *$/){ $larger_region =$1 ; $average_region=0;
}elsif($_[$i]=~/^(A) *$/){ $average_region=$1 ; $shorter_region=$larger_region=0;
}elsif($_[$i]=~/\S+\_\d+\-\d+/){
push(@seqlets, split(/ +/, $_[$i]) );
}elsif(ref($_[$i]) eq 'SCALAR' and ${$_[$i]}=~/\S+\_\d+\-\d+/){
push(@seqlets, split(/ +/, ${$_[$i]}) );
}
}
print "\n# remove_similar_seqlets : I am using \$factor : $factor\n" if $verbose;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Sorting is necessary as I am not doing the real thorough comparison
#______________________________________________________________________
$num_of_seq=@seqlets=sort @seqlets;
if($verbose){
print "\n# (1) remove_similar_seqlets: Num of seq to merge: $num_of_seq (from \@seqlets)";
print "\n# (2) remove_similar_seqlets: \@seqlets are @seqlets\n";
}
my ($short_start, $large_start, $short_end, $large_end);
for($i=0; $i< @seqlets; $i++){
if($seqlets[$i]=~/^ *(\S+)_(\d+)\-(\d+)(\S*)/){ ## last (\S*) is necessary for XLBGLO2R_8-119_d1hlm__
my($seq1, $start1, $end1, $tail1)=($1, $2, $3, $4);
if($seqlets[$i+1]=~/^(\S+)_(\d+)\-(\d+)(\S*)/){
($seq2, $start2, $end2, $tail2)=($1, $2, $3, $4);
if($seq1 eq $seq2){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
$diff_start=abs($start1 - $start2);
$diff_end =abs($end1 - $end2 );
$leng1=$end1-$start1;
$leng2=$end2-$start2;
if($leng1 >= $leng2){ $smaller_leng=$leng2; }else{ $smaller_leng=$leng1; }
if( ($diff_start+$diff_end)/2 <= $smaller_leng*($factor/10) ){
if($average_region){
$av_start=int(($start1+$start2) / 2);
$av_end =int(($end1 + $end2) / 2);
$seqlets[$i]="$seq1\_$av_start\-${av_end}$tail1"; # $tail1 is for names like XLBGLO2R_8-119_d1hlm__
print "\n# new seqlet : $seqlets[$i]\n" if $verbose;
splice(@seqlets, $i+1, 1);
$i--;
}else{
if($start1 < $start2){
$short_start=$start2; $large_start=$start1; ## note that short start should be $start2 if $start2 is bigger
}else{
$short_start=$start1; $large_start=$start2;
}
if($end1 < $end2){
$short_end=$end1; $large_end=$end2;
}else{
$short_end=$end2; $large_end=$end1;
}
if($shorter_region){
$seqlets[$i]="$seq1\_$short_start\-${short_end}$tail1";
}elsif($larger_region){
$seqlets[$i]="$seq1\_$large_start\-${large_end}$tail1";
}
splice(@seqlets, $i+1, 1);
$i--;
}
}
}
}
}
}
print "\n# (3) remove_similar_seqlets: The final out are: @seqlets\n" if $verbose;
return(\@seqlets);
}
#__________________________________________________________________________
# Title : show_subclusterings
# Usage : &show_subclusterings(\@out);
# Function : This is the very final sub of divclus.pl
# Example : @temp_show_sub=&show_subclusterings(\@out, $file, $sat_file, $dindom, $indup);
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : print_subclusterings, sum_subclusterings, write_subclustering
# show_clusterings, display_subclusterings
# Options :
# f for file output, eg: xxxxxxx.sat
# Category :
# Version : 2.7
#-------------------------------------------------------------------------
sub show_subclusterings{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($max_size, $sat_file_name, $clu_file_name,
$ori_cluster_size, $ori_cluster_num, $good_bad, @keys, $percentage_fac,
$indup, @sizes, $sum_seq_num, $indup_percent, $indup_count, %tem4,
@sub_clustering_out_files); # clusall_1e-5_clu_14-324_ss.sat
my @out=@{$array[0]};
$indup_count=0;
if($char_opt=~/d/){ $dindom=1; }
if($char_opt=~/i/){ $indup=1; }
if($vars{'f'}=~/\S+/){ $factor= $vars{'f'}; }
if($vars{'p'}=~/\d+/){ $percentage_fac= int($vars{'p'}); }
if($vars{'s'}=~/\d+/){ $score = $vars{'s'}; }
if($vars{'e'}=~/\d+/){ $evalue= $vars{'e'}; }
#print "\n# (1) show_subclusterings : \@file has : @file\n";
if( $file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.msp/ or
$file[0]=~/([\S+_]*?(\d+)\-(\d+)[_\w]*)\.sat/ ){
$ori_cluster_size=$2;
$ori_cluster_num =$3;
$base=$1;
$sat_file_name="$base\.sat";
$clu_file_name="$base\.clu";
}else{
print "\n# (2) LINE:",__LINE__," The \@file input to show_subclusterings is not the right format, dying\n";
print "\n Sarah!, right format looks like: 13-234.msp or 8-420_cluster.msp \n"; exit;
}
open(CLU, ">$clu_file_name") or die "\n# (ERROR) show_subclusterings failed miserably to open \"$clu_file_name\" \n";
push(@sub_clustering_out_files, $clu_file_name);
@out=@{&sort_string_by_length(\@out)};
for($i=0; $i< @out; $i++){ # @out has ( 'YAL054C_98-695 YBR041W_90-617', 'YBR115C_230-842 YBR222C_16-537 YER015W_121-686', etc)
my $count+=$i+1;
my ( $int_dup_number, $sub_clu_size, $seq_with_range, @sp, $new_clus_NAME,
%tem, %tem2, %tem3, $j, @keys, $num_seq);
if($out[$i]=~/^ *$/){ next }
@sp=sort split(/ +/, $out[$i]);
for($j=0; $j < @sp; $j++){
$seq_with_range=$sp[$j];
if($seq_with_range=~/^((\S+)_((\d+)\-(\d+)))/){
$tem{$2}++;
$tem2{$2}.=sprintf("%-15s ", $1);
$tem3{$2} =$3;
$tem4{$2} =$5-$4;
}
}
@keys=sort keys %tem;
$num_seq=$sub_clu_size=@keys;
if($max_size < $sub_clu_size){
$max_size=$sub_clu_size; ## This is to collect the sizes of clusters to see if it is good.
}
$indup_count= &print_summary_for_divclus(
$count, \%tem2, \%tem,
$ori_cluster_num,
$ori_cluster_size,
$dindom,
$clu_file_name,
\%tem3, \%tem4,
$indup, );
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Local subroutine
#_______________________________________________________________
sub print_summary_for_divclus{ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
my(@keys, $indup_count, $x, $m, $percentage_fac);
my $count=$_[0]; # count of cluster
my %tem2=%{$_[1]}; my $num_seq=@keys=sort keys %tem2;
my %tem=%{$_[2]}; my $ori_cluster_num=$_[3];
my $new_clus_NAME=$ori_cluster_num.'0'.$count.'0'.$num_seq;
my $ori_cluster_size=$_[4];
my $dindom=$_[5]; my %tem3=%{$_[7]};
my $indup=$_[9]; my (%internal_dup);
my %tem4=%{$_[8]};
#~~~~~~~~~~ Domain Inside Domain ~~~~~~~~~~~~~~~~~
if($dindom){
for($x=0; $x <@keys; $x++){
@domain_inside_domain=@{&get_domain_inside_domain($tem2{$keys[$x]})};
@domain_inside_domain=@{&remove_dup_in_array(\@domain_inside_domain)};
for($m=0; $m< @domain_inside_domain; $m++){ print " # Dindom: $m : $domain_inside_domain[$m]\n"; }
print "\n";
}
}
#==========================================================================================
#~~~~~~~~~~ Internal duplication ~~~~~~~~~~~~~~
if($indup==1){
# @keys is the same as sub cluster size,
for($x=0; $x < @keys; $x++){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Checks each sequence for duplication
#___________________________________________________
my %internal_dup=%{&get_internal_dup_in_a_cluster( $tem2{$keys[$x]} )};
my @dup_keys=keys %internal_dup;
if(@dup_keys > 0){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# This calculates the actual duplicated number rather than jus tthe sequences
#______________________________________________________________________________
$indup_count++;
printf ("%-14s %-12s %-4s", $keys[$x], $new_clus_NAME, $num_seq);
for($m=0; $m< @dup_keys; $m++){
printf ("%-19s=> %s\n", $dup_keys[$m], $internal_dup{ $dup_keys[$m] } );
}
}
}
}
#~~~~~~~~~~ Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~
print CLU "Cluster size $num_seq\n";
printf CLU ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
$new_clus_NAME, $evalue, $factor, $percentage_fac,
$ori_cluster_size, $num_seq, $ori_cluster_num);
print "Cluster size $num_seq\n";
printf ("Cluster number %-12s # E:%-5s Factor:%-2s P:%-2s, Ori size:%-4s Sub:%-4s From:%-12s\n",
$new_clus_NAME, $evalue, $factor, $percentage_fac,
$ori_cluster_size, $num_seq, $ori_cluster_num);
for($x=0; $x <@keys; $x++){
printf CLU (" %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
$num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
printf (" %-4s %-5s %-17s %-10s %-3s leng: %-s\n",
$num_seq, $ori_cluster_num, $keys[$x], $tem3{$keys[$x]}, $tem{$keys[$x]}, $tem4{$keys[$x]});
}
return($indup_count);
}
}
close(CLU); ## this is a bug fix
if($max_size == $ori_cluster_size){ $good_bad=1;
}else{ $good_bad=0; }
print "\n# Sarah, Do you think the subclusterings are O.K.?" if $verbose;
print "\n# Tell me, if you feel suspicious, jong\@salts.med.harvard.edu\n\n" if $verbose;
return($good_bad, $indup_count, $ori_cluster_size, \@sub_clustering_out_files);
}
#__________________________________________________________________________
# Title : exchange_query_with_match_in_msp
# Usage : @exchanged_msp=@{&exchange_query_with_match_in_msp(\@file)};
# Function :
# Example :
# Keywords : swap_query_with_match_in_msp, invert_query_with_match_in_msp,
# swap_query_seq_with_match_seq_in_msp,
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.1
#----------------------------------------------------------------------------
sub exchange_query_with_match_in_msp{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(%exchanged_msp, @sorted_by_query_seq_names, @new_msp_lines);
$open_msp_files_x_opt = 'x';
if($char_opt=~/n/){ $names_only='n' }
%exchanged_msp=%{&open_msp_files(@file, $open_msp_files_x_opt, $names_only )};
@new_msp_lines=values %exchanged_msp;
@sorted_by_query_seq_names=
map{ $_->[0] } sort {$a->[1] cmp $b->[1]} map {/^\d+ +\S+ +\d+ +\d+ +(\S+)/ && [$_, $1] } @new_msp_lines;
return(\@sorted_by_query_seq_names);
}
#______________________________________________________________
# Title : get_internal_dup_in_a_cluster
# Usage :
# Function :
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------
sub get_internal_dup_in_a_cluster{
$cluster_line=$_[0] || ${$_[0]};
my($i, $j, @seq, %out, $seq_name);
my $overlap_factor=40;
my $min_inside_dom_size=40;
@seq=split(/ +/, $cluster_line); ## These sequence are single seq with different regions
@seq= map{$_->[0]} sort{$a->[1] cmp $b->[1] or $a->[2] <=> $b->[2] }
map {/^((\S+)_(\d+)\-(\d+) *.*)$/ && [$1, $2, $3, $4]} @seq;
F1:for($i=0; $i< @seq; $i++){
$seq1=$seq[$i];
if($seq1=~/^(\S+)_(\d+)\-(\d+)/){
$seq_name=$1;
$start1=$2;
$end1=$3;
}
F:for($j=1; $j< @seq; $j++){
$seq2=$seq[$j];
if($seq1 eq $seq2){ next } ### Skip IDENTICAL ones (xxxx_1-10, xxxx_1-10)
if($seq2=~/^(\S+)_(\d+)\-(\d+)/){
$start2=$2;
$end2=$3;
}
$leng2=$end2-$start2;
$margin=$leng2/12; ## 8% overlap is regarded as not overlapping
if(( ($start1+$margin) > $end2)||
( ($start2+$margin) > $end1)){ # skips non overlapping seqlets
$out{"$start1\-$end1"}.="$start2\-$end2 ";
splice(@seq, $j, 1);
$j--;
}
}
}
#@out=sort (@out);
#@out=@{&remove_dup_in_array(\@out)};
#@out=@{&remove_similar_seqlets(\@temp, "f=2")};
return(\%out);
}
#______________________________________________________________
# Title : get_domain_inside_domain
# Usage :
# Function :
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : find_dindoms, domain_inside_domain, domain_in_domain
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------
sub get_domain_inside_domain{
$cluster_line=$_[0] || ${$_[0]};
my($i, $j, @seq, @out);
my $overlap_factor=40;
my $min_inside_dom_size=40;
@seq=split(/ +/, $cluster_line);
F1:for($i=0; $i< @seq; $i++){
$seq1=$seq[$i];
if($seq1=~/^(\S+)_(\d+)\-(\d+)/){
$seq_name=$1;
$start1=$2;
$end1=$3;
}
F:for($j=0; $j< @seq; $j++){
$seq2=$seq[$j];
if($seq1 eq $seq2){ next } ### Skip IDENTICAL ones (xxxx_1-10, xxxx_1-10)
if($seq2=~/^(\S+)_(\d+)\-(\d+)/){
$start2=$2;
$end2=$3;
}
if(($start1 > $end2)||($start2 > $end1)){ # skips non overlapping seqlets
next;
}
if(($start1 > $start2)&&($end1 < $end2)){ # -----
$leng_seq1=$end1-$start1; # ----------
$leng_seq2=$end2-$start2;
if(( ($leng_seq2/2) >= $leng_seq1 )&&
($leng_seq1 > $min_inside_dom_size) ){ # if seq1 is less than 60% of seq2, it is a hidden domain
push(@out, "$seq2\($seq1\)");
}
}elsif(($start1 < $start2)&&($end1 > $end2)){ # -----------
$leng_seq1=$end1-$start1; # ------
$leng_seq2=$end2-$start2;
if(( ($leng_seq1/2) >= $leng_seq2)&&
($leng_seq2 > $min_inside_dom_size) ){ # if seq1 is less than 60% of seq2, it is a hidden domain
push(@out, "$seq1\($seq2\)");
}
}
}
}
return(\@out);
}
#______________________________________________________________
# Title : scale_for_horizontal_histogram
# Usage :
# Function : used to make things like:
#
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------
sub scale_for_horizontal_histogram{
my @query=@{$_[0]};
if(@query > 2400){
$condense_factor=20;
}elsif(@query > 2200){
$condense_factor=18;
}elsif(@query > 1900){
$condense_factor=16;
}elsif(@query > 1600){
$condense_factor=15;
}elsif(@query > 1400){
$condense_factor=14;
}elsif(@query > 1200){
$condense_factor=12;
}elsif(@query > 1000){
$condense_factor=10;
}elsif(@query > 800){
$condense_factor=9;
}elsif(@query > 630){
$condense_factor=8;
}elsif(@query > 440){
$condense_factor=6;
}elsif(@query> 220){
$condense_factor=4;
}elsif(@query > 120){
$condense_factor=3;
}else{
$condense_factor=2;
}
return(\$condense_factor);
}
#______________________________________________________________
# Title : get_added_matched_regions_in_msp
# Usage :
# Function : This reads MSP file regions matched for a target seq
# and adds things up to plot horizontally.
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------
sub get_added_matched_regions_in_msp{
my @lines=@{$_[0]};
for($i=0; $i< @lines; $i++){
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# $1 $2 $3 $4 $5 $6 $7 $8
# 171 41.18 6 73 HI1690 9 76 HI0736 sodium...
#,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
if($lines[$i]=~/^ *(\d+) +\d+\.?[e\-\d]* +(\d+) +(\d+) +(\S+) +(\d+) +(\d+) +(\S+) *(.*)/){
if($4 eq $7){
$query_name=$4;
$query_leng=$3;
for($j=0; $j<$query_leng; $j++){ $query[$j]=0; }
next;
}else{
if($match_name ne $7){ push(@matched_members, $7); }
$query_start=$2;
$query_end =$3;
$query_seq =$4;
$match_start=$5;
$match_end =$6;
$desc =$8;
$match_name =$7;
for($k= $query_start; $k<$query_end; $k++){
$query[$k]++;
}
}
}
}
return(\@query);
}
#______________________________________________________________
# Title : cluster_merged_seqlet_sets
# Usage : @out=@{&cluster_merged_seqlet_sets(\@lines)};
# Function :
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords :
# Options : _ for debugging.
# # for debugging.
# $short_region= S by S -S # taking shorter region overlapped in removing similar regions
# $large_region= L by L -L # taking larger region overlapped in removing similar regions
# $average_region=A by A -A # taking average region overlapped in removing similar regions
#
# Version : 1.8
#--------------------------------------------------------------
sub cluster_merged_seqlet_sets{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($optimize, @splited1, @splited2, $verbose, $link_or_not);
my @seq_names_in_clu=@{$array[0]};
$link_or_not=0;
my $factor=7; # 7 means 70% now
if($vars{'f'}=~/(\S+)$/){ $factor=$1 }
if($char_opt=~/o/){ $optimize=1 }
if($char_opt=~/S/){ $short_region='S'; }
if($char_opt=~/L/){ $large_region='L'; }
if($char_opt=~/A/){ $average_region='A'; }
if($char_opt=~/v/){ $verbose=1 }
if($verbose){ print "\n# (1) cluster_merged_seqlet_sets: Checking linkage and merging <<<<<>>>>>\n@seq_names_in_clu\n"; }
F1: for($i=0; $i< @seq_names_in_clu; $i++){
@splited1=split(/ +/, $seq_names_in_clu[$i]);
for($j=0; $j< @seq_names_in_clu; $j++){
if($seq_names_in_clu[$i] eq $seq_names_in_clu[$j]){ next }
@splited2=split(/ +/, $seq_names_in_clu[$j]);
$link_or_not=${&check_linkage_of_2_similar_seqlet_sets(\@splited1, \@splited2, "f=$factor")};
print "\n +++++ \$link_or_not is $link_or_not +++" if $verbose;
if($link_or_not==1){
if($verbose){
print "\n# (2) cluster_merged_seqlet_sets: \n $seq_names_in_clu[$i] \n and $seq_names_in_clu[$j] \n are linked \n";
}
if($optimize){ ##---- This will also remove similar seqlets, not only identical ones
$seq_names_in_clu[$i]=join(' ', sort @{&remove_similar_seqlets( [@splited1, @splited2],
$short_region, $large_region, $average_region)} );
}else{
$seq_names_in_clu[$i]=join(' ', sort @{&remove_dup_in_array( [@splited1, @splited2])} );
}
splice(@seq_names_in_clu, $j,1);
$j--; $i--;
next F1;
}
}
}
return(\@seq_names_in_clu);
}
#______________________________________________________________________________
# Title : check_homology_of_seq_pair
# Usage :
# Function :
# Example : $homology_info=${&check_homology_of_seq_pair(\$pairs[$i], \%pdbg_hash_table)};
#
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub check_homology_of_seq_pair{
my($i, %pdbg_hash_table,%input_hash, $seq_name1, $seq_name2,
$classification1, $classification2);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Argument handling
#___________________________________
for($i=0; $i< @_; $i++){
if(ref($_[$i]) eq 'SCALAR' and ${$_[$i]}=~/^(\S+)[\t ]+(\S+)$/){
($seq_name1, $seq_name2)=($1,$2);
splice(@_, $i, 1); $i--;
}elsif(ref($_[$i]) eq 'ARRAY' and @{$_[$i]} eq 2){
($seq_name1, $seq_name2)=@{$_[$i]}; splice(@_, $i, 1); $i--;
}elsif(ref($_[$i]) eq 'HASH'){
%input_hash=%{$_[$i]};
my @keys=keys %input_hash;
if($keys[0]=~/\S+/ and $input_hash{$keys[0]}=~/(\d+\.\d+\.\d+)/){
%pdbg_hash_table=%input_hash; splice(@_, $i, 1); $i--;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# When seq pair name is given in HASH=(seq1, seq2)
#___________________________________________________
elsif($keys[0]=~/^\S+$/ and $input_hash{$keys[0]}=~/^\S+$/){
($seq_name1, $seq_name2)= ($keys[0], $input_hash{$keys[0]});
splice(@_, $i, 1); $i--;
}elsif($keys[0]=~/^(\S+)[\t ](\S+)$/){
($seq_name1, $seq_name2)= ($1, $2); splice(@_, $i, 1); $i--;
}
}
}
if($pdbg_hash_table{$seq_name1}=~/^(\d+\.\d+\.\d+)/){
$classification1=$1;
}else{
print "\n\t# (W) \'$pdbg_hash_table{$seq_name1}\' is NOT FOUND in \%pdbg_hash_table, One member group??";
}
if($pdbg_hash_table{$seq_name2}=~/^(\d+\.\d+\.\d+)/){
$classification2=$1;
}else{
print "\n\t# (W) \"$pdbg_hash_table{$seq_name2}\" is NOT FOUND in \%pdbg_hash_table, One member group??";
}
if($classification1 and $classification1 eq $classification2){
print "\n\t# (i) $seq_name1 $classification1 == $seq_name2 $classification2";
return(\'Homolog');
}else{
print "\n\t# (i) $seq_name1 $classification1 =X= $seq_name2 $classification2";
return(\'Nomolog');
}
}
#______________________________________________________________
# Title : check_linkage_of_2_similar_seqlet_sets
# Usage :
# Function : connects two clusters of seqlets if they share
# identical or near identical seqlets
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : check_link, check_relation, check_relatedness
# Options : _ for debugging.
# $factor = by f= # eg) "f=$factor" in the higher level sub
#
# Returns :
# Argument :
# Category :
# Version : 1.7
#--------------------------------------------------------------
sub check_linkage_of_2_similar_seqlet_sets{
my ($seq1, $name1, $start1, $end1, $seq2,
$leng1, $leng2, $name2, $start2, $end2, $diff_start,
$diff_end);
my @splited1=@{$_[0]};
my @splited2=@{$_[1]};
my $link_or_not=0;
my $factor=7; # this means 70% sequence region overlap of the intermediate is chosen
if($_[2]=~/f=(\S+)/i){
$factor=$1;
}
F1: for($s=0; $s<@splited1; $s++){
if($splited1[$s]=~/^ *((\S+)_(\d+)\-(\d+))/){
$seq1=$1;
$name1=$2;
$start1=$3;
$end1=$4;
}
F2: for($t=0; $t< @splited2; $t++){
if($splited2[$t]=~/^ *((\S+)_(\d+)\-(\d+))/){
$seq2=$1;
$name2=$2;
$start2=$3;
$end2=$4;
}
if($seq1 eq $seq2){ $link_or_not=1; return(\$link_or_not) }
if($name1 ne $name2){
next F2;
}elsif($name1 eq $name2){ ## ~~~~~~~~~~~~~ THIS is the MOST IMP CORE PART ~~~~~~~~~~~~~
$leng1=$end1-$start1;
$leng2=$end2-$start2;
if($leng1 >= $leng2){ $smaller_leng=$leng2; }else{ $smaller_leng=$leng1; }
$diff_start=abs($start1-$start2);
$diff_end =abs($end1 -$end2 );
if((($diff_start+$diff_end)/2) <= ($smaller_leng/$factor) ){
$link_or_not=1;
return(\$link_or_not);
}
}## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
}
}
return(\$link_or_not);
}
#__________________________________________________________________________
# Title : merge_arrays_by_common_elements
# Usage : @out=@{&merge_arrays_by_common_elements(\@ref_of_arrays)}
# Function : merges arrays if there are common array elements.
# if @A has (1,2,3) and @B has (2, 4, 5), they share 2, so
# they are merged to be (1,2,3,4,5)
# Example :
# Keywords : cluster_arrays_by_common_elements, merge_arrays_if_common_elements
# merge_array_if_common_elements, merge_arrays_when_common_elements_occur
# merge_arrays
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.1
#----------------------------------------------------------------------------
sub merge_arrays_by_common_elements{
my ($i, @mother_array);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Handling input array
#______________________________________
if( @_==1 and ref($_[0]) eq 'ARRAY'){ @mother_array=@{$_[0]};
}elsif(@_ > 1){ @mother_array=@_;
}else{
print "\n# The input for merge_arrays_by_common_elements needs one ref of array or multiple refs of array\n";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main algo
#______________________________________
for($i=0; $i< @mother_array; $i++){
my @merged=(@{$mother_array[$i]}, @{$mother_array[$i+1]});
my ($common_or_not, %merged_hash, $j);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Checks if there is a common element
#______________________________________
for($j=0; $j< @merged; $j++){
$merged_hash{$merged[$j]}++;
if($merged_hash{$merged[$j]} > 1){ $common_or_not=1 }
}
my @non_redundant=keys %merged_hash;
if($common_or_not==1){
$mother_array[$i]=\@non_redundant;
splice(@mother_array, ($i+1), 1);
$i--;
}
}
return(\@mother_array);
}
#________________________________________________________________________________
# Title : check_parf_files
# Usage : $number_of_parf=${&check_parf_files(@input)};
# Function : checks if given file(s) is a parf file and returns the number of
# identified parf file. If you check 2 files and both are parf, you
# will get (\$num_of_parf_file) value of 2.
# Example :
# PARF file looks like this>
# d1nsca_ d3nn9__ Homolog -664.92 2.43.1.1.3 2.43.1.1.2
# d1dppa_ d2olba_ Homolog -617.41 3.68.1.1.6 3.68.1.1.1
# d2ach.1a1 d9api.1a1 Homolog -556.38 5.2.1.1.3 5.2.1.1.4
# Keywords :
# Options :
# Author :
# Category :
# Version : 1.0
#--------------------------------------------------------------------------------
sub check_parf_files{
my(@parf_file, $i, $j, @array, $counter, $num_of_parf_file);
for($i=0; $i< @_; $i++){
if(ref($_[$i]) eq 'SCALAR' and -f ${$_[$i]}){
push(@parf_file, ${$_[$i]})
}elsif(-f $_[$i]){
push(@parf_file, $_[$i])
}elsif(ref($_[$i]) eq 'ARRAY'){
@array=@{$_[$i]};
for($j=0; $j< @array; $j++){
if(-f $array[$j]){
push(@parf_file, $array[$j]);
}
}
}
}
print "\n# There were: @parf_file\n" if $verbose;
for($i=0; $i< @parf_file; $i++){ ## usually @parf_file has only one element!
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Check if it is parf file inside the file
#__________________________________________________________
open(INPUT_FILE, "<$parf_file[$i]");
while(<INPUT_FILE>){
$counter++;
if(/^ *\S+[\t ]+\S+[\t ]+[HN]omolog[\t ]+\S+[\t ]+\S+[\t ]+\S+/){
$num_of_parf_file++;
print "\n# $parf_file[$i] is a PARF file\n" if $verbose;
last;
}else{
if($counter > 100){ ## giving up, it is not PARF file!
print "\n# $0 needs to have PARF files, others are ignored" if $verbose;
last;
}else{
next;
}
}
}
close(INPUT_FILE);
}
return(\$num_of_parf_file);
}
#__________________________________________________________________________
# Title : check_common_elements_in_array
# Usage : &check_common_elements_in_array($mother_array[$i], $mother_array[$i+1]));
# Function : accepts 1 or 2 refs of arrays and checks if there is any
# common(repeating) elements between the two (or inside one)
# The result is either ref of 1, or 0
# Example :
# Keywords : is_there_common_element, if_common_elements
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#----------------------------------------------------------------------------
sub check_common_elements_in_array{
my(@merged_array, %common_elemnts_count, $m);
if(@_ > 1){
@merged_array=(@{$_[0]}, @{$_[1]});
}else{
@merged_array=(@{$_[0]});
}
for $m (@merged_array){
$common_elemnts_count{$m}++;
if($common_elemnts_count{$m} > 1){
print "\n# $common_elemnts_count{$m} $m common!\n";
return(\1);
}
}
return(\0);
}
#__________________________________________________________________________
# Title : link_ranges
# Usage : @all_ranges = @{&link_ranges(@all_ranges)};
# Function : merges ranges(10-20, 11-21 etc) when there is any overlap
# is present
# If you put a reverse range like '2000-20', it will
# complain and reverse the order and do the job after correction.
#
# Example : INPUT:
#
# @input=( '1-30 1-40 1-50',
# '2-49 4-40 2-99'....)
#
# Keywords : connect_ranges, link_overlapping_ranges, connect_overlapping_ranges
# Options : _ for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------------------
sub link_ranges{
my (@all_ranges, $new_start, $new_end, @output, $i, $seq1, $start1,
$end1, $seq2,
$smaller_leng, $start2, $end2, @split, @split1, @split2);
my $leng_thresh=30;
my $optimize=0;
for($i=0; $i< @_; $i++){
if(ref($_[$i]) eq 'ARRAY'){
@all_ranges=@{$_[$i]};
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Sorting the ranges by the starting range number.(essential)
#______________________________________________________________
@all_ranges=map {$_->[0]} sort { $a->[1] <=> $b->[1] }
map { $_=~/(\d+)\-/ and [$_, $1] } @all_ranges;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# iterating merger
#_________________________________________________________
for($i=0; $i< @all_ranges; $i++){
if($all_ranges[$i] =~/(\d+)\-(\d+)/){
($start1, $end1)=($1, $2);
if($start1 > $end1){
print "\n# link_ranges: Error, \$start1 :$start1 is larger than \$end1: $end1\n";
print "\n# Exchanging the start and end, and starting it all over again";
$all_ranges[$i]="$2\-$1";
@all_ranges=map {$_->[0]} sort { $a->[1] <=> $b->[1] }
map { $_=~/(\d+)\-/ and [$_, $1] } @all_ranges;
$i= -1;
next;
}
}
if($all_ranges[$i+1] =~/(\d+)\-(\d+)/){
($start2, $end2)=($1, $2);
if($start2 > $end2){
print "\n# link_ranges: Error, \$start2 :$start2 is larger than \$end2: $end2\n";
print "\n# Exchanging the start and end, and starting it all over again";
$all_ranges[$i+1]="$2\-$1";
@all_ranges=map {$_->[0]} sort { $a->[1] <=> $b->[1] }
map { $_=~/(\d+)\-/ and [$_, $1] } @all_ranges;
$i= -1;
next;
}
if($start1 <= $start2 and $end1 >= $end2){ ## -----------------
$new_start=$start1; # --------
$new_end =$end1;
splice(@all_ranges, $i, 2, "$new_start\-$new_end");
$i--;
next;
}
if( $start2 <= $start1 and $end2 >= $end1){ ## -------
$new_start=$start2; # ----------------
$new_end =$end2;
splice(@all_ranges, $i, 2, "$new_start\-$new_end");
$i--;
next;
}
if($start1 <= $start2 and $start2 <= $end1){ # -----------
$new_start=$start1; # -----------
$new_end =$end2;
splice(@all_ranges, $i, 2, "$new_start\-$new_end");
$i--;
next;
}
if($start2 <= $start1 and $start1 <= $end2){ # -----------
$new_start=$start2; # ---------
$new_end =$end1;
splice(@all_ranges, $i, 2, "$new_start\-$new_end");
$i--;
next;
}
}
}
return(\@all_ranges);
}
#__________________________________________________________________________
# Title : merge_similar_ranges
# Usage : @all_ranges = @{&merge_similar_seqlets(@all_ranges)};
# Function : merges ranges(10-20, 11-21 etc) when there is any overlap
# is present (resulting in average start and end at each level)
# If you put a reverse range like '2000-20', it will
# complain and reverse the order and do the job after correction.
#
# Example : INPUT:
#
# @input=( '1-30 1-40 1-50',
# '2-49 4-40 2-99'....)
#
# Keywords : merge_similar_regions, merge_ranges, merge_regions,
# merge_sequence_ranges, merge_overlap_ranges, connect_ranges
# connect_overlapping_ranges, connect_similar_ranges,
# remove_similar_ranges
# Options : f= for setting factor (0.7 for 70% overlap minimum)
#
# Returns :
# Argument :
# Category :
# Version : 1.3
#--------------------------------------------------------------------------
sub merge_similar_ranges{
my (@all_ranges, $new_start, $new_end, @output, $i, $seq1, $start1,
$end1, $seq2, $average_leng,
$smaller_leng, $start2, $end2, @split, @split1, @split2);
my $factor=0.9; # 0.8 means 80% overlap
my $leng_thresh=30;
my $optimize=0;
for($i=0; $i< @_; $i++){
if(ref($_[$i]) eq 'ARRAY'){
@all_ranges=@{$_[$i]};
}elsif($_[$i]=~/f=(\S+)/){
$factor=$factor_ori=$1;
#print "\n# merge_similar_ranges: Factor used will be $factor\n\n";
}elsif($_[$i]=~/z/i){
$optimize=1 }
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Sorting the ranges by the starting range number.(essential)
#______________________________________________________________
@all_ranges=map {$_->[0]} sort { $a->[1] <=> $b->[1] }
map { $_=~/(\d+)\-\d+ *$/ and [$_, $1] } @all_ranges;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# iterating merger
#_________________________________________________________
for($i=0; $i< @all_ranges; $i++){
$factor=$factor_ori;
if($all_ranges[$i] =~/(\d+)\-(\d+) *$/){ ## ranges are at the end
($start1, $end1)=($1, $2);
if($start1 > $end1){
print "\n# merge_similar_ranges: Error, \$start1 :$start1 is larger than \$end1: $end1\n";
print "\n# Exchanging the start and end, and starting it all over again";
$all_ranges[$i]="$2\-$1";
@all_ranges=map {$_->[0]} sort { $a->[1] <=> $b->[1] }
map { $_=~/(\d+)\-\d+ *$/ and [$_, $1] } @all_ranges;
$i= -1;
next;
}
}
if($all_ranges[$i+1] =~/(\d+)\-(\d+) *$/){ ## ranges are at the end
($start2, $end2)=($1, $2);
if($start2 > $end2){
print "\n# merge_similar_ranges: Error, \$start2 :$start2 is larger than \$end2: $end2\n";
print "\n# Exchanging the start and end, and starting it all over again";
$all_ranges[$i+1]="$2\-$1";
@all_ranges=map {$_->[0]} sort { $a->[1] <=> $b->[1] }
map { $_=~/(\d+)\-\d+ *$/ and [$_, $1] } @all_ranges;
$i= -1;
next;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
$average_leng=(($end2+$end1)/2)-(($start1+$start2)/2); ## this is protein seq ranges
$factor-=$average_leng/5000; ## adjusting $factor according to the size of range.
#_____________________________________________________________
if($start1 <= $start2 and $end1 >= $end2){ ## -----------------
if(($end1-$start1)*$factor <= ($end2-$start2) ){ # --------
$new_start=(($start1+$start2)/2);
$new_end =(($end1+$end2)/2);
splice(@all_ranges, $i, 2, "$new_start\-$new_end");
$i--; next;
}
}
if( $start2 <= $start1 and $end2 >= $end1){ ## -------
if(($end2-$start2)*$factor <= ($end1-$start1) ){ # ----------------
$new_start=(($start2+$start1)/2);
$new_end =(($end1+$end2)/2);
splice(@all_ranges, $i, 2, "$new_start\-$new_end");
$i--; next;
}
}
if($start1 <= $start2 and $start2 <= $end1){ # -----------
if(($end1-$start2) >= ($end2-$start1)*$factor ){ # -----------
$new_start=(($start2+$start1)/2);
$new_end =(($end1+$end2)/2);
splice(@all_ranges, $i, 2, "$new_start\-$new_end");
$i--; next;
}
}
if($start2 <= $start1 and $start1 <= $end2){ # -----------
if(($end2-$start1) >= ($end1-$start2)*$factor ){ # -----------
$new_start=(($start2+$start1)/2);
$new_end =(($end1+$end2)/2);
splice(@all_ranges, $i, 2, "$new_start\-$new_end");
$i--; next;
}
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# making everything integer at the last minute to save CPU
#___________________________________________________________
@all_ranges = map{ /(\S+)\-(\S+) *$/ and int($1).'-'.int($2) } @all_ranges;
return(\@all_ranges);
}
#_________________________________________________________________________
# Title : merge_similar_seqlets
# Usage : @all_seqlets = @{&merge_similar_seqlets(@all_seqlets)};
# Function : merges seqlet sets which have identical
# sequences and share similar regions by connection factor of 30%
# This means, if any two seqlets from the same sequences which
# share more than 70% seqlet regions overlapping are merged
# This only sees the very first sequence in the seqlets line!!!
# (so, PARTIAL MERGE !!)
# Example : INPUT:
#
# @input=( 'seq1_1-30 seq2_1-40 seq3_1-50',
# 'seq1_2-49 seq3_4-40 seq4_2-99'....)
#
# @output=('seq1_1-30 seq2_1-45 seq3_2-45 seq4_2-99');
#
# Keywords : merge_similar_sequences, merge_sequence_names, merge_sequences,
# merge_sequence_ranges, merge_similar_sequences_with_ranges,
# merge_seqlets, merge_duplication_modules
# Options :
#
# f=<digit> for determing the factor in filtering out non-homologous
# regions, 7 = 70% now!!
# l=<digit> for seqlet(duplication module) length threshold
# z for activating remove_similar_sequences, rather than remove_dup....
# S $short_region= S by S -S # taking shorter region overlap in removing similar reg
# L $large_region= L by L -L # taking larger region overlap in removing similar reg
# A $average_region=A by A -A # taking average region overlap in removing similar reg
#
# Category :
# Version : 2.0
#-------------------------------------------------------------------------------
sub merge_similar_seqlets{
my (@all_seqlets, @result_all_seqlets, $i, $j, $k, $seq1, $start1, $end1, $seq2,
$smaller_leng, $start2, $end2, @split, @split1, @split2,
$short_region, $large_region, $average_region, $overlapping_seq_match_size);
my $factor=7; # 30% sequence mismatch region is allowed(3)
my $leng_thresh=30;
my $optimize=1;
$average_region='A'; # default
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Sorting (parsing) input to get options and input array
#_________________________________________________________
for($i=0; $i< @_; $i++){
if(ref($_[$i]) eq 'ARRAY'){
@all_seqlets=@{$_[$i]};
}elsif($_[$i]=~/f=(\S+)/){ $factor=$1;
}elsif($_[$i]=~/z/i){ $optimize=1;
}elsif($_[$i]=~/l=(\d+)/i){ $leng_thresh=$1;
}elsif($_[$i]=~/^S/){ $short_region='S'; $large_region=$average_region='';
}elsif($_[$i]=~/^L/){ $large_region='L'; $short_region=$average_region='';
}elsif($_[$i]=~/^A/){ $average_region='A'; $short_region=$large_region =''; }
}
if(@all_seqlets==1){
print "\n# (1) merge_similar_seqlets: \@all_seqlets == 1, returning\n" if $verbose;
print "\n# \@all_seqlets = @all_seqlets\n" if $verbose;
return(\@all_seqlets);
}else{
print "\n# (1) merge_similar_seqlets: \@all_seqlets > 1, Processing\n" if $verbose;
if( $verbose){
for($i=0; $i< @all_seqlets; $i++){
print "\n# $all_seqlets[$i]" ;
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This is to remove which are identical in @all_seqlets;
#_________________________________________________________
F1: for($i=0; $i< @all_seqlets; $i++){
my $merged_two_seqlet_lines;
if($all_seqlets[$i] eq $all_seqlets[$i+1]){
print "\n# (2) merge_similar_seqlets: \$all_seqlets\[$i\] equals \$all_seqlets\[$i+1\]\n" if $verbose;
splice(@all_seqlets, $i+1, 1);
$i-- if $i >0;
next F1;
}else{
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# @split1 and 2 are arrays from different string entry in @all_seqlets
#______________________________________________________________________
@split1=sort split(/ +/, $all_seqlets[$i]);
@split2=sort split(/ +/, $all_seqlets[$i+1]);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
# (3) If the first elements of @split1 and 2 are identical, lets merge the two arrays
#________________________________________________________________________________
if($split1[0] eq $split2[0] or $split1[0] eq $split2[1] or $split1[0] eq $split2[2]){
print "\n# (3) \$split1[0] and \$split2[0] are identical, good!\n" if $verbose;
@split=(@split1, @split2);
if(1){ #~~~~~ optimize option removes similar seqlets
$all_seqlets[$i]= join(' ', sort @{&remove_similar_seqlets(\@split,
$short_region, $large_region, $average_region)} );
}else{
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Only removes exactly identical ones
#__________________________________________________________
$all_seqlets[$i]= join(' ', @{&remove_dup_in_array(\@split, 's')} );
}
print "\n# (3) New \$all_seqlets\[\$i\] is \n$all_seqlets[$i]\n" if $verbose;
splice(@all_seqlets, $i+1, 1);
$i-- if $i >0;
next F1;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (4) If the first elements of @split1 and 2 are NOT identical, lets check the sequence ranges
#_____________________________________________________________________________________________
if($verbose){
print "\n# (4) merge_similar_seqlets: the first elements of \@split1 and \@split2 are NOT identical\n";
print "# (4) >>>>>>>>> Checking the regions of them, \$factor= $factor\n";
}
F2: for($j=0; $j < @split1; $j++){
if($split1[$j] =~/^ *(\S+)_(\d+)\-(\d+)/){
my ($seq1, $start1, $end1)=($1, $2, $3);
F3: for($k=0; $k<@split2; $k++){
if($split2[$k] =~/^ *(\S+)_(\d+)\-(\d+)/){
my($seq2, $start2, $end2)=($1, $2, $3);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~````
# Check if the seqs are identicl (from the two arrays), no point to merge which are not identical from the first
#__________________________________________________________________________________________
if($seq1 eq $seq2){
if($verbose){
print "# (5) <<<<< The sequence names are identical with different regions, See if we can merge!\n";
}
$diff_start=abs($start1-$start2);
$diff_end =abs($end1 -$end2 );
$leng1=$end1-$start1;
$leng2=$end2-$start2;
if($leng1 >= $leng2){
$smaller_leng=$leng2;
$larger_leng =$leng1
}else{
$smaller_leng=$leng1;
$larger_leng =$leng2
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Checking the minimal seq region leng here
#______________________________________________________
if($smaller_leng < $leng_thresh){ next }
$overlapping_seq_match_size=${&get_overlapping_seq_match_size($start1, $end1, $start2, $end2)};
$averge_seq_leng_of_2_seqs=($leng1+$leng2)/2;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This is the critically important part
#_______________________________________________________________
if($average_region){
$finally_adjusted_seq_leng=$averge_seq_leng_of_2_seqs*($factor/10);
}elsif($short_region){
$finally_adjusted_seq_leng=$smaller_leng*($factor/10);
}elsif($large_region){
$finally_adjusted_seq_leng=$larger_leng*($factor/10);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Now let's check if we regard them homologous or not\
#_______________________________________________________
if( $overlapping_seq_match_size >= $finally_adjusted_seq_leng){
@split= (@split1, @split2);
if ($verbose){
print "# (5) merge_similar_seqlets: \$averge_seq_leng_of_2_seqs=$averge_seq_leng_of_2_seqs";
print " $$$$ Merging occurs with \n@split \n";
}
if($optimize){ #~~~~~ $optimize option removes similar seqlets
$all_seqlets[$i]= join(' ', sort @{&remove_similar_seqlets(\@split,
$short_region, $large_region, $average_region)} );
}else{
$all_seqlets[$i]= join(' ', @{&remove_dup_in_array(\@split, 's')} );
}
$merged_two_seqlet_lines=1;
splice(@all_seqlets, $i+1, 1);
$i-- if $i >0;
next F1;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# We believe they are not homologous
#____________________________________________
else{
if ($verbose){
print "\n# (5) merge_similar_seqlets !! MERGING DID NOT occur->
$split1[$j] $split2[$k] \n";
print "\n# \$averge_seq_leng_of_2_seqs= $averge_seq_leng_of_2_seqs, \$overlapping_seq_match_size= $overlapping_seq_match_size \$finally_adjusted_seq_leng= $finally_adjusted_seq_leng\n";
print "\n# (5) merge_similar_seqlets, \$all_seqlets[$i]\$all_seqlets[$i+1]\n$all_seqlets[$i]\n$all_seqlets[$i+1]\n";
}
next F3;
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If there is no range (region) in seq naem, let's skip, as there is no way to check
#__________________________________________________________________________________
else{ # when split2 does not match xxx_10-20 format
next;
}
}
}else{ # when split1 does not match xxx_10-20 format
next;
}
}
unless($merged_two_seqlet_lines){
}
}
print "\n# \@all_seqlets is @all_seqlets =======\n" if $verbose;
return(\@all_seqlets);
}
#______________________________________________________________
# Title : sort_by_digits_in_string
# Usage :
# Function : sorts arrays of strings like
#
# MJ0228_314-573 MJ1197_348-601
# MJ0228_451-576 sll0078_502-594 sll1425_489-611
# MJ0228_479-572 sll0078_502-594
#
# According to the digits after seq names _314-, _451-, _479-
# in the above
# This only looks at the very first sequence in the string
#
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.4
#--------------------------------------------------------------
sub sort_by_digits_in_string{
my (@out, $i, @temp1, @temp2, $old, @T);
my @array_of_string=sort @{$_[0]};
for($i=0; $i<= @array_of_string; $i++){
if($array_of_string[$i]=~/^((\S+)_(\d+)\-(\d+) *.*)$/){
unless(defined($old)){
$old=$2;
push(@temp1, $1);
push(@temp2, $3);
next;
}elsif($2 eq $old){
push(@temp1, $1);
push(@temp2, $3);
next;
}elsif( ($2 ne $old)||($i==$#array_of_string) ){
&sort_and_put_strings_to_out;
push(@temp1, $1);
push(@temp2, $3);
$old =$2;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
sub sort_and_put_strings_to_out{
my ($j, $k, $num);
@temp2=sort { $a<=>$b } @temp2; ## sort numerically
F1: for($j=0; $j< @temp2; $j++){
$num=$temp2[$j];
for($k=0; $k< @temp1; $k++){
if($temp1[$k]=~/^(\S+)_$num\-/){
push(@out, $temp1[$k]);
splice(@temp1, $k, 1);
$k--;
splice(@temp2, $j, 1);
$j--;
next F1;
}
}
}
@temp1=@temp2=();
}#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
}
}elsif($i > 0){ ## for the very last sort
&sort_and_put_strings_to_out;
}
}
return(\@out);
}
#______________________________________________________________
# Title : sort_words_in_string
# Usage :
# Function : sort words in strings sperated by ' ' or "\n"
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : sort_words_in_sequences, sort_sequences_in_string,
# sort_strings_in_string, sort_string_by_words, sort_elements_in_string
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------
sub sort_words_in_string{
my @in=@{$_[0]} || @_;
my @OUT;
for (@_){
push(@OUT, join(' ', sort split(/ +|\n/) ));
}
return(\@OUT);
}
#__________________________________________________________________________
# Title : convert_hmmls_to_msp_files
# Usage : @out=@{&convert_hmmls_to_msp_files(\@file)};
# Function :
# Example :
# Keywords : convert_hmmls_to_msp
# Options :
# S=$single_out_file_name for producing single msp file with all the hmmls contents
# E=Enguiry_name for specifying enquiry seq name rather than 'HMM', the default
# $bit_score_threshold= by t=
# Returns :
# Argument :
# Version : 1.4
#----------------------------------------------------------------------------
sub convert_hmmls_to_msp_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my (@all_the_files_written, $written_file, $base,
$bit_score_threshold, $out_msp_file_name, $input_hmmls_file,
%out);
$bit_score_threshold=10;
if($vars{'S'}=~/\S/){ $single_out_file_name=$vars{'S'} };
if($vars{'t'}=~/\S/){ $bit_score_threshold =$vars{'t'} };
if($vars{'E'}=~/\S/){ $enquiry_name =$vars{'E'} # default $enquiry_name is input file base
}else{ $enquiry_name='HMM' }
for($i=0; $i< @file; $i++){
if($vars{'E'}=~/\S/){ $base=$enquiry_name; # When $enquiry_name is given, it uses for output name
}else{
$base=${&get_base_names($file[$i])};
}
$out_msp_file_name="$base\.msp";
$input_hmmls_file=$file[$i];
if($vars{'S'}=~/\S/){
%out=(%out, %{&open_hmmls_files($input_hmmls_file,
'm', "E=$enquiry_name",
"t=$bit_score_threshold"
)} );
}else{
%out=%{&open_hmmls_files($input_hmmls_file,
"t=$bit_score_threshold", 'm')}; # m for msp out
$written_file=${&write_msp_files(\%out, $out_msp_file_name)};
push(@all_the_files_written, $written_file);
}
}
if($vars{'S'}=~/\S/){
$written_file=${&write_msp_files(\%out, $single_out_file_name)};
push(@all_the_files_written, $written_file);
}
if(@all_the_files_written > 1){
return(\@all_the_files_written);
}else{
return(\$all_the_files_written[0]);
}
}
#______________________________________________________________
# Title : convert_mmp_to_mrg
# Usage :
# Function :
# Example :
# Example OUT as string
#
# slr1950 sll1920 sll0672 sll1076 sll1614 slr0797 slr0798 slr0822 slr1729
# slr1729 sll1076 sll0672 sll1614 sll1920 slr0797 slr0798 slr0822 slr1950
#
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------
sub convert_mmp_to_mrg{
my @mmp=@{$_[0]};
my($i, $long, $out, @out, $leading_seq);
for($i=0; $i< @mmp; $i++){
if(($mmp[$i]=~/^ *\d+ +\d+\.?[e\-\d]* +\d+ +\d+ +(\S+) +\d+ +\d+ +(\S+) *$/)&&($1 eq $2)){
next;
}elsif($mmp[$i]=~/^ *\d+ +\d+\.?[e\-\d]* +\d+ +\d+ +(\S+) +\d+ +\d+ +(\S+) *$/){
$leading_seq=$1;
$long=$2;
$long=~s/\,/ /g;
$out="$leading_seq $long";
push(@out, $out);
}
}
return(\@out);
}
#_______________________________________________________________________________
# Title : add_ranges_in_msp_line
# Usage :
# Function : this adds ranges to the seqnames of msp files
# mmp line is msp line with additional sequences at the end
# Example :
# Keywords : convert_msp_to_mmp, convert_msp, convert_msp_2_mmp
# change_msp_to_mmp, add_range_in_msp, convert_msp_line_to_mmp_line
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.5
#-------------------------------------------------------------------------------
sub add_ranges_in_msp_line{
my $input_msp=${$_[0]} || $_[0];
my($score, $evalue, $long_1, $new_seq1, $new_seq2, $middle,
$start1, $end1, $start2, $end2, $seq1, $seq2, $new);
if($input_msp=~/^ *(\d+) +(\S+) *\S*[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\S+)[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\S+)/){
($score, $evalue, $start1, $end1, $start2, $end2)=($1, $2, $3, $4, $6, $7);
($seq1, $seq2)=($5, $8);
if($seq1=~/(\S+)\_\d+\-\d+/){
$new_seq1="$1\_$start1\-$end1";
}else{
$new_seq1="$seq1\_$start1\-$end1";
}
if($seq2=~/(\S+)\_\d+\-\d+/){
$new_seq2="$1\_$start2\-$end2";
}else{
$new_seq2="$seq2\_$start2\-$end2";
}
$new=sprintf("%-6s %-9s %-5s %-5s %-32s %-5s %-5s %-32s",
$score, $evalue, $start1, $end1, $new_seq1, $start2, $end2, $new_seq2);
}
return(\$new);
}
#______________________________________________________________
# Title : convert_msp_line_to_mmp_line
# Usage :
# Function : this adds ranges to the seqnames of msp files
# mmp line is msp line with additional sequences at the end
# Example :
# Keywords : convert_msp_to_mmp, convert_msp, convert_msp_2_mmp
# change_msp_to_mmp, add_range_in_msp
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.5
#--------------------------------------------------------------
sub convert_msp_line_to_mmp_line{
my $input_msp=${$_[0]} || $_[0];
my($score, $evalue, $long_1, $new_seq1, $new_seq2, $middle,
$start1, $end1, $start2, $end2, $seq1, $seq2, $new);
if($input_msp=~/^ *(\d+) +(\S+) *\S*[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\S+)[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\S+)/){
($score, $evalue, $start1, $end1, $start2, $end2)=($1, $2, $3, $4, $6, $7);
($seq1, $seq2)=($5, $8);
if($seq1=~/(\S+)\_\d+\-\d+/){
$new_seq1="$1\_$start1\-$end1";
}else{
$new_seq1="$seq1\_$start1\-$end1";
}
if($seq2=~/(\S+)\_\d+\-\d+/){
$new_seq2="$1\_$start2\-$end2";
}else{
$new_seq2="$seq2\_$start2\-$end2";
}
$new=sprintf("%-6s %-9s %-5s %-5s %-32s %-5s %-5s %-32s",
$score, $evalue, $start1, $end1, $new_seq1, $start2, $end2, $new_seq2);
}
return(\$new);
}
#________________________________________________________________________________
# Title : merge_sequence_alignments
# Usage : &merge_sequence_alignments(@seq); while @seq has
# @seq=(\%hash1, \%hash2); while %hash1 and %hash2 have
# %hash1=qw(seq1 ANN-NTMQQRRQQQRKRRRQQQSSSSTTST seq2 --NNN--QQ--QQQ--RRRR--SSSS--);
# %hash2=qw(seq2 NN-QQQQQ--RRRR----SS--SS--- seq3 -NNXQQQXQRTRRRXTTSTSSMMSSTTT);
#
# Function :
# Example :
# Keywords : combine_sequence_alignment, merge_sequence_alignment_pairs
# merge_seq_alignment, make_interm_alignment, make_3_way_alignment
# merge_alignment, combine_alignment
# Options :
# l= for sequence block length by print_seq_in_block subroutine
# t= for specifying the length of seq names shown.
# t for truncating the seq names in printing out.
# s for sorting the final output lines (default anyway for print_seq_in_block)
#
# Category :
# Version : 1.3
#--------------------------------------------------------------------------------
sub merge_sequence_alignments{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my (@splited1_common, @splited2_common, @splited1_non_common, @splited2_non_common,
@gap_pos1, @gap_pos2, $block_length, $sort_seq_names,
$truncate_name_to_10_char, $trunc_name_to, $block_range );
$block_length=60;
if($vars{'l'}=~/\d+/){ $block_length=$vars{'l'}; }
if($char_opt=~/t/){ $truncate_name_to_10_char='t' }
if($vars{'t'}=~/\d+/){ $trunc_name_to=$vars{'t'}; }
if($char_opt=~/s/){ $sort_seq_names='s' }
if($vars{'r'}=~/(\d+\-\d+)/){ $block_range= $1 };
for($i=0; $i< @hash; $i+=2){
my %hash1=%{$hash[$i]};
my %hash2=%{$hash[$i+1]};
my ($pair1_name1, $pair1_name2)=keys %hash1;
my ($pair2_name1, $pair2_name2)=keys %hash2;
# finding the common entry
if("$pair1_name1" eq "$pair2_name1"){
$common_seq_entry=$pair1_name1;
$non_common_seq_entry1=$pair1_name2;
$non_common_seq_entry2=$pair2_name2;
}elsif("$pair1_name1" eq "$pair2_name2"){
$common_seq_entry=$pair2_name2;
$non_common_seq_entry1=$pair1_name2;
$non_common_seq_entry2=$pair2_name1;
}elsif("$pair1_name2" eq "$pair2_name1"){
$common_seq_entry=$pair1_name2;
$non_common_seq_entry1=$pair1_name1;
$non_common_seq_entry2=$pair2_name2;
}else{
print "\n# merge_sequence_alignments:
ERROR, I can not find common seq entry: $pair1_name1 $pair2_name1 $pair2_name1 $pair2_name2\n";
next;
}
%hash1=%{&make_seq_alignment_length_even(\%hash1)};
%hash2=%{&make_seq_alignment_length_even(\%hash2)};
#~~~~~~~~~~~~~~~~~~~~~~~`~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Aligning the start of common sequences
# i.e. putting pre gap part to a sequence which does not have it.
#
# NKNWKLRAHLC-KHTGEKP---FPCKEEGCDKGFTSLHHLTRHS---ITHTGEKN--FKCDSDKCDLTFTTKANMKKHFNRFH splited1_common
# --------------NKNWKLRAHLCKHTGEKPFPCKEEGCDKGFTSLHHLTRHSITHTGEKNFKCDSDKCDLTFTTKANMKKHFNRFH-- splited2_common
# becomes->
#
# --------------NKNWKLRAHLC-KHTGEKP---FPCKEEGCDKGFTSLHHLTRHS---ITHTGEKN--FKCDSDKCDLTFTTKANMKKHFNRFH splited1_common
# --------------NKNWKLRAHLCKHTGEKPFPCKEEGCDKGFTSLHHLTRHSITHTGEKNFKCDSDKCDLTFTTKANMKKHFNRFH-- splited2_common
#________________________________________________________________________
if( $hash1{$common_seq_entry}=~/^(\-+)/){
$hash2{$common_seq_entry}="$1".$hash2{$common_seq_entry};
$hash2{$non_common_seq_entry2}="$1".$hash2{$non_common_seq_entry2};
}elsif($hash2{$common_seq_entry}=~/^(\-+)/){
$hash1{$common_seq_entry}="$1".$hash1{$common_seq_entry};
$hash1{$non_common_seq_entry1}="$1".$hash1{$non_common_seq_entry1};
}
@gap_pos1=@{&get_gap_positions(\$hash1{$common_seq_entry}, 'p' )}; # p means all positive positions wanted
@gap_pos2=@{&get_gap_positions(\$hash2{$common_seq_entry}, 'p' )};
@splited1_common =split(//, $hash1{$common_seq_entry} );
@splited1_non_common=split(//, $hash1{$non_common_seq_entry1} );
@splited2_common =split(//, $hash2{$common_seq_entry} );
@splited2_non_common=split(//, $hash2{$non_common_seq_entry2} );
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Core algorithm
#__________________________________________
for($i=0; $i< @splited1_common; $i++){
if($splited1_common[$i] ne $splited2_common[$i]){
if($splited1_common[$i]=~/\W/){
splice(@splited2_common, $i, 0, '-');
splice(@splited2_non_common, $i, 0, '-');
}elsif($splited2_common[$i]=~/\W/){
splice(@splited1_common, $i, 0, '-');
splice(@splited1_non_common, $i, 0, '-');
if($splited1_common[$i] eq $splited1_non_common[$i]){
$homology_line2[$i]=':';
}else{
$homology_line2[$i]=' ';
}
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Making Homology line
#_____________________________________________
for($i=0; $i< @splited1_non_common; $i++){
if($splited1_non_common[$i] eq '-'){ $homology_line1[$i]=' '; }
if($splited1_non_common[$i] eq $splited1_common[$i] and $splited1_non_common[$i] eq $splited2_non_common[$i] ){
$homology_line1[$i]= $splited1_non_common[$i] unless $splited1_non_common[$i] eq '-';
}elsif($splited1_non_common[$i] eq $splited1_common[$i] ){
$homology_line1[$i]=$splited1_non_common[$i] unless $splited1_non_common[$i] eq '-';
}elsif($splited2_non_common[$i] eq $splited1_non_common[$i]){
$homology_line1[$i]=':' unless $splited1_non_common[$i] eq '-';
}elsif( ${&amino_acid_homology_matrix($splited2_non_common[$i], $splited1_non_common[$i])} ){
$homology_line1[$i]='.' unless $splited1_non_common[$i] eq '-';
}
else{ $homology_line1[$i]=' '; }
if($splited2_non_common[$i] eq '-'){ $homology_line2[$i]=' '; }
if($splited2_non_common[$i] eq $splited2_common[$i] and $splited2_non_common[$i] eq $splited1_non_common[$i] ){
$homology_line2[$i]=$splited2_non_common[$i] unless $splited2_non_common[$i] eq '-';
}elsif($splited2_non_common[$i] eq $splited2_common[$i]){
$homology_line2[$i]=$splited2_non_common[$i] unless $splited2_non_common[$i] eq '-';
}elsif($splited2_non_common[$i] eq $splited1_non_common[$i]){
$homology_line2[$i]=':' unless $splited2_non_common[$i] eq '-';
}elsif( ${&amino_acid_homology_matrix($splited2_non_common[$i], $splited1_non_common[$i])} ){
$homology_line2[$i]='.' unless $splited1_non_common[$i] eq '-';
}else{ $homology_line2[$i]=' '; }
}
if($verbose){
print @splited1_non_common, " \t splited1_non_common\n";
#print @homology_line1, " \t homology line\n";
#print @splited1_common, " \t splited1_common\n";
print @homology_line1, " \t homology line\n";
print @splited2_common, " \t splited2_common\n";
print @homology_line2, " \t homology line\n";
print @splited2_non_common, " \t splited2_non_common\n";
}
$out_hash_issa{"1 $non_common_seq_entry1 "}=join('', @splited1_non_common);
$out_hash_issa{"3 $common_seq_entry"}=join('', @splited2_common);
$out_hash_issa{'2 homol_line1'}=join('', @homology_line1);
$out_hash_issa{'4 homol_line2'}=join('', @homology_line2);
$out_hash_issa{"5 $non_common_seq_entry2"}=join('', @splited2_non_common);
&print_seq_in_block(\%out_hash_issa, $sort_seq_names, "t=$trunc_name_to", "f=defaul_result\.issa",
$truncate_name_to_10_char, "l=$block_length", "r=$block_range");
}
}
#________________________________________________________________________________________
# Title : merge_sequence_in_msp_file
# Usage :
# Function :
# Example : INPUT: (MSP file) ===>
# 59 2.6 47 64 d2pia_3 10 30 d1erd___10-30
# 161 1.1e-07 24 91 d2pia_3 16 85 d1frd___16-85
#
# 722 0 1 106 d1put__ 1 106 d1put___1-106
# 66 4.9 2 68 d1put__ 43 106 d2lbp___43-106
# 69 1.3 12 49 d1put__ 81 120 d1cgo___81-120
#
# 60 3.3 13 38 d1frd__ 32 57 d1orda1_32-57
# 65 1.7 21 58 d1frd__ 40 69 d2mtac__40-69
#
# ==== OUTPUT ===>
# d1frd___1-98 d1frd___1-98_1-98 d1frd___16-85 d2pia_3_24-91_24-91
# d1frd___16-85_16-85 d2pia_3_24-91
# d1put___1-106 d1put___1-106_1-106
# d2pia_3_1-98 d2pia_3_1-98_1-98
#
# Keywords : mergr_seq_in_msp_file, merge_sequence_in_msp, merge_sequences_in_msp_file
# Options :
# $dynamic_factor = y by y -y # adjusting factor value dynamically(more seq higher factor)
# $short_region = S by S -S # taking shorter region overlapped in removing similar regions
# $large_region = L by L -L # taking larger region overlapped in removing similar regions
# $average_region = A by A -A # taking average region overlapped in removing similar regions
#
# Version : 2.9
#----------------------------------------------------------------------------------------
sub merge_sequence_in_msp_file{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($msp_value, @all_seqlets, %temp_hash, @msp_chunks, $clu_out, $size_of_all_seqlets,
$base, $optimize, $mrg_out, @arr, $sat_out, %final_hash_out, @final_pre_hash,
$length_thresh, $merge, $factor, $evalue, $score, $dynamic_factor, $score_match,
$eval_match, $query_seq, $query_start, $query_stop, $match_seq, $match_start,
$short_region, $large_region, $average_region, $original_clu_size, $match_stop);
$factor=$default_factor=7; #~~~~ default connection factor U, 7 means 70% now!
$length_thresh=30;
$evalue=10;
$score =75;
if(@file < 1){ print "\n# (E) merge_sequence_in_msp_file needs at least 1 MSP file\n"; exit }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Following changes the defaults with given parameters
#_____________________________________________________________
if($char_opt=~/z/i){ $optimize='z'; ## This will cause using remove_similar_seqlets than remove_dup_in_array !
}if($char_opt=~/m/){ $merge='m';
}if($char_opt=~/y/){ $dynamic_factor='y';
}if($char_opt=~/v/){ $verbose='v';
}if($char_opt=~/S/){ $short_region='S';
}if($char_opt=~/L/){ $large_region='L';
}if($char_opt=~/A/){ $average_region='A';
}if($vars{'T'}=~/\d+/){ $length_thresh=$vars{'T'};
}if($vars{'f'}=~/\S+/){ $factor=$vars{'f'}; ## Here I give a generous $factor !
}if($vars{'s'}=~/\d+/){ $score = $vars{'s'};
}if($vars{'e'}=~/\d+/){ $evalue= $vars{'e'}; }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Just to inform what parameters have been chosen
#_____________________________________________________________
if($verbose){
print "\n# (1) merge_sequence_in_msp_file : default \$score : $score";
print "\n# : default \$evalue : $evalue";
print "\n# : used \$length_thresh : $length_thresh";
print "\n# : default \$factor : $default_factor";
print "\n# : used \$factor : $factor";
print "\n# : \$dynamic_factor : $dynamic_factor\n";
}
for($c=0; $c< @file; $c++){
open(MSP, "$file[$c]");
$base=${&get_base_names($file[$c])};
$clu_out="$base\_F${factor}.clu"; # <-- This is the most important output. Sarah's program will process this
$sat_out="$base\_F${factor}.sat";
print "# (2) merge_sequence_in_msp_file : processing $file[$c] for $clu_out\n" if $verbose;
my @msp1=<MSP>;
for($i=0; $i< @msp1; $i++){
#~~~~~~~~~~ Include range or NOT in the seq name ~~~~~~~~~~~~~~~~~~~~~~~~~~`
# %temp_hash is just to get the chunk of MSP block. As msp file uses empty line as a delimiter
#____________________________________________________________________________
if($char_opt=~/r/){
if($msp1[$i]=~/^ *(\d+) +(\S+) *\S* +(\d+) +(\d+) +(\S+)[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\S+)/){
$score_match=$1;
$eval_match=$2;
$query_seq=$5;
$query_start=$3;
$query_stop=$4;
$match_seq=$8;
$match_start=$6;
$match_stop=$7;
if($query_seq=~/\S+_\d+\-\d+$/){ $new_seq1=$query_seq }else{ $new_seq1="$query_seq\_$query_start\-$query_stop"; }
if($match_seq=~/\S+_\d+\-\d+$/){ $new_seq2=$match_seq }else{ $new_seq2="$match_seq\_$match_start\-$match_stop"; }
if($new_seq1 eq $new_seq2){
print "# (3.0) merge_sequence_in_msp_file: Skipped = $msp1[$i]" if $verbose;
next
};
if($score_match < $score or $eval_match > $evalue){
print "# (3.1) merge_sequence_in_msp_file: Skipped = $msp1[$i]" if $verbose;
next
};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Modifying $msp1[$i] line !!!
#______________________________
$msp1[$i]=sprintf("%s %-3s %s %s %s %s %s %s",
$score_match, $eval_match, $query_start, $query_stop, $new_seq1, $match_start,
$match_stop, $new_seq2);
$temp_hash{$query_seq}.="$msp1[$i]\n";
}
}else{
if($msp1[$i]=~/^ *(\d+)[ \t]+(\S+)[ \t]*\S*[ \t]+\d+[ \t]+\d+[ \t]+(\S+)[_\d+\-\d+]? +\d+[\t ]+\d+[ \t]+\S+/){
if($1 < $score or $2 > $evalue){
print "# (3.0) merge_sequence_in_msp_file: Skipped = $msp1[$i]" if $verbose;
next };
$temp_hash{$3}.="$msp1[$i]\n";
}
}#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
}
close(MSP);
}
$original_clu_size=@msp_chunks= values(%temp_hash); ## Using temp hash is more than 2 times faster than push
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Determining the dynamic factor here (when 'd' opt is set)
#____________________________________________________________
if($dynamic_factor){
#--> 100 => 10, 1000 => 15, 10000 => 20
print "\n# ### \$factor: $factor\n";
$factor += (log($original_clu_size)*5)/10 - 1; ## This is a simplistic.
if($factor > 9.5){ $factor=9.5 } # this is the very upper limit for any factor.
print "\n# ### \$factor: $factor\n";
}
if($verbose){
print "# (4) merge_sequence_in_msp_file: The msp chunks used are:\n";
for($i=0; $i< @msp_chunks; $i++){
print "$msp_chunks[$i]\n";
}
}
for($i=0; $i< @msp_chunks; $i++){
print "\n# (5) merge_sequence_in_msp_file: Processing eash chunk with merge_sequence_in_msp_chunk\n" if $verbose;
@arr=@{&merge_sequence_in_msp_chunk($msp_chunks[$i], $verbose, $optimize,
"$merge", "e=$evalue", "s=$score", "f=$factor", "T=$length_thresh",
$short_region, $large_region, $average_region)};
push(@all_seqlets, @arr);
}
#~~~~~~~~~ sorting inner sequences in strings ~~~~~~~~~
#______________________________________________________
@all_seqlets=@{&sort_words_in_string(@all_seqlets)}; ## This speeds up about 2 times !!!
#~~~~~~~ Sort by the _digit- in seqlet names ~~~~~~~~~
@all_seqlets= map{$_->[0]} sort{$a->[1] cmp $b->[1] or $a->[2] <=> $b->[2] }
map {/^ *((\S+)_(\d+)\-(\d+).*)/ && [$1, $2, $3, $4]} @all_seqlets;
if( $verbose){
print "\n# (6) merge_sequence_in_msp_chunk: Showing the very final result before merging\n";
for($i=0; $i< @all_seqlets; $i++){
print "\n$all_seqlets[$i]";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# merge sequences in a simple way until there is no change in the array
# This is an incomplete merge(merges first seqlets of string ...
#______________________________________________________________________
for($i=0; $i< @msp_chunks; $i ++){
ITERATION_RETURN_POINT:
$size_of_all_seqlets=@all_seqlets;
@all_seqlets = @{&merge_similar_seqlets(\@all_seqlets, $optimize,
$short_region, $large_region, $average_region, "f=$factor")};
if($size_of_all_seqlets > @all_seqlets){
@all_seqlets = @{&merge_similar_seqlets(\@all_seqlets, $optimize,
$short_region, $large_region, $average_region, "f=$factor")};
goto ITERATION_RETURN_POINT;
}else{
last;
}
}
if($optimize){
@all_seqlets=@{&remove_similar_seqlets(\@all_seqlets,
$short_region, $large_region, $average_region)};
@all_seqlets=@{&remove_dup_in_array(\@all_seqlets)};
}else{
@all_seqlets=@{&remove_dup_in_array(\@all_seqlets)};
}
return(\@all_seqlets);
}
#__________________________________________________________________________
# Title : merge_sequence_in_msp_chunk
# Usage :
# Function : merges sequences which are linked by common regions
# This filters the sequences by evalue and ssearch score
# This is the main algorithm of merging similar sequences.
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : connect_sequence_in_msp, link_sequence_in_msp_chunk
# connect_sequence_in_msp_chunk, link_sequence_in_msp
# merge_sequence, link_sequence, connect_sequence
# Options : _ for debugging.
# # for debugging.
# m for merge file output format (.mrg)
# t= for threshold of seqlet length eg) "t=30"
# f= for overlap factor (usually between 2 to 7 )
# 2 means, if the two regions are not overlapped
# by more than HALF of of the smaller region
# it will not regard as common seqlet block
# s= for ssearch score minimum
# e= for ssearch e value maximum
# S for S -S # taking shorter region overlapped in removing similar regions
# L for L -L # taking larger region overlapped in removing similar regions
# A for A -A # taking average region overlapped in removing similar regions
#
# Returns :
# Argument :
# Version : 2.4
#--------------------------------------------------------------
sub merge_sequence_in_msp_chunk{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($ssearch_score2, $evalue_found2, $evalue_found1, $ssearch_score1, $optimize );
my ($L, %out_hash, @out, $LL, @Final_out, $verbose, $final_factor, $R_diff,
$short_region, $large_region, $average_region);
my $factor =4; # default factor for around 30% sequence mis-overlap is the threshold for common block
#~~~~~~~~~~~~~~ The lower the factor the larger clustering will occur ~~~~~~~~~~~~
my $score =75; # default ssearch score. seq below this will be chucked out
my $evalue =40; # default maximum e value used. Seq higher than this will be thrown out
my $length_thresh =30; # sequence length threshold. overlap less than this will be ignored
if($char_opt=~/v/){ $verbose = 'v'
}if($char_opt=~/z/){ $optimize = 'z'
}if($char_opt=~/S/){ $short_region='S';
}if($char_opt=~/L/){ $large_region='L';
}if($char_opt=~/A/){ $average_region='A'; }
if($vars{'T'}=~/\d+/){
$length_thresh=$vars{'T'}; print "\n# merge_sequence_in_msp_chunk: Thresh is $length_thresh\n" if (defined $verbose);
}if($vars{'f'}=~/\S+/){
$factor=$vars{'f'}; print "\n# merge_sequence_in_msp_chunk: Factor is $factor\n" if (defined $verbose);
}if($vars{'s'}=~/\d+/){
$score = $vars{'s'}; print "\n# merge_sequence_in_msp_chunk: Score is $score\n" if (defined $verbose);
}if($vars{'e'}=~/\d+/){
$evalue= $vars{'e'}; print "\n# merge_sequence_in_msp_chunk: Evalue is $evalue\n" if (defined $verbose);
}
my @seqlets=split(/\n+/, (${$_[0]} || $_[0]) );
print "@seqlets" if $verbose;
F1: for($i=0; $i < @seqlets; $i ++){
if($seqlets[$i]=~/^ *((\d+) +(\d+\.?[e\-\d]*) +(\d+) +(\d+) +(\S+) +(\d+) +(\d+)) +(\S+) *(.*)/){
if($6 eq $9){ splice(@seqlets, $i, 1); $i--; next };
($long_match1, $enq_seq1, $mat_seq1, $R_start1, $R_end1 )=($1, $6, $9, $4, $5);
$R_leng1=$R_end1-$R_start1; $ssearch_score1= $2; $evalue_found1 = $3;
}
if( ($R_leng1 < $length_thresh) || ($ssearch_score1 < $score) ){ splice(@seqlets, $i, 1); $i--; next; }
if( $evalue_found1 > $evalue){ splice(@seqlets, $i, 1); $i--; next; }
F2: for($j=0; $j < @seqlets; $j ++){
if($seqlets[$i] eq $seqlets[$j]){ next };
if($seqlets[$j]=~/^ *((\d+) +(\d+\.?[e\-\d]*) +(\d+) +(\d+) +(\S+) +(\d+) +(\d+)) +(\S+) *(.*)/){
($long_match2, $enq_seq2, $mat_seq2, $R_start2, $R_end2)=($1, $6, $9, $4, $5);
$R_leng2=$R_end2-$R_start2; $ssearch_score2=$2; $evalue_found2= $3;
}
if( ($R_leng2 < $length_thresh)||($ssearch_score2 < $score) ){ splice(@seqlets, $j, 1); $j--; next; }
if( $evalue_found2 > $evalue){ splice(@seqlets, $j, 1); $j--; next; }
$R_diff=abs($R_leng1-$R_leng2)/2; ## <<<---- Note it is div by 2
if($R_leng2 < $R_leng1){ $smaller_leng=$R_leng2; }else{ $smaller_leng=$R_leng1; }
$Start_diff=abs($R_start1-$R_start2)/2; ## <<<---- Note it is div by 2
$final_factor=$smaller_leng/$factor;
#~~~~~~~~~~ If average R_diff and average Start_diff are less then 1/7 of the smaller seqlet
#~~~~~~~~~~ we regard they are same selqets
if(( $R_diff < $final_factor ) && ### $Start_diff is essential!
($Start_diff < $final_factor ) ){ ### if diff is less than around 30% of the smaller length
if($verbose=~/v/){
print "\n\$R_diff:$R_diff \$Start_diff:$Start_diff $smaller_leng $final_factor $factor";
}
if($R_leng2 >= $R_leng1){
#~~~~~ $mat_seq1 or $mat_seq2 can increase to 'slr1453,sll0238', so you need ',' in the middle only
$extended_name="$mat_seq2,$mat_seq1";
$L=length($extended_name);
$LL=length($long_match2)+2;
$seqlets[$i]= sprintf("%-${LL}s %-${L}s", $long_match2, $extended_name);
splice(@seqlets, $j, 1);
$i-- unless($i==0);
$j--;
next F1;
}elsif( $R_leng1 >= $R_leng2){ ## chooses the bigger range seq
$extended_name="$mat_seq1,$mat_seq2"; # must be ',' not ' '
$L=length($extended_name);
$LL=length($long_match1)+2;
$seqlets[$i]=sprintf("%-${LL}s %-${L}s", $long_match1, $extended_name);
splice(@seqlets, $j, 1);
$i-- unless($i <= 0);
$j--;
next F1;
}
}else{
next F2;
}
}
}
if($char_opt=~/m/){
for($i=0; $i< @seqlets; $i++){
if($seqlets[$i]=~/^ *\d+ +\d+\.?[e\-\d]* +\d+ +\d+ +(\S+) +\d+ +\d+ +(\S+) *$/){
if($1 eq $2){ next }
$leading_seq=$1; $long=$2; $long=~s/\,/ /g;
push(@Final_out, "$leading_seq $long" );
}
}
}
@Final_out=sort @Final_out;
print "\n\n\n# \@Final_out\n@Final_out \n=================\n " if $verbose;
return(\@Final_out);
}
#______________________________________________________________
# Title : get_overlapping_range
# Usage : @n1=@{&get_overlapping_range(\@ranges1, \@ranges2)};
# Function :
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : get_overlapping_range_in_msp, get_overlapping_range_in_msp_file,
# get_overlapping_seq_match_range, get_overlap_seq_match_range
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------
sub get_overlapping_range{
my (@new_range, $R_start1, $R_start2);
($R_start1, $R_end1)=@{$_[0]}[0..1];
($R_start2, $R_end2)=@{$_[1]}[0..1];
if(($R_start1 <= $R_start2)&& # ------------
( $R_end1 >= $R_end2) ){ # -------
@new_range= ($R_start2, $R_end2);
}elsif(($R_start1 <= $R_start2)&& # -----------
( $R_end1 <= $R_end2) && # -----------
( $R_end1 > $R_start2) ){
@new_range= ($R_start2, $R_end1);
}elsif(($R_start1 >= $R_start2)&& # -----------
( $R_end1 >= $R_end2 ) && # -----------
( $R_end2 > $R_start1) ){
@new_range= ($R_start1, $R_end2);
}elsif(($R_start1 >= $R_start2)&& # ------
( $R_end1 <= $R_end2) ){ # -----------
@new_range= ($R_start1, $R_end1);
}else{ # ----
@new_range=(0,0); # --------
}
return(\@new_range);
}
#______________________________________________________________________________
# Title : find_source_perl_library
# Usage : $source_library=${&find_source_perl_library};
# Function : gets the default perl sub source library from ENV setenv
# Example :
# Keywords :
# Options :
# Author :
# Category :
# Version : 1.1
#------------------------------------------------------------------------------
sub find_source_perl_library{
my($source_library);
print "\n# $0: You did not use \"s=\" option for \$source_library\n";
print "\n# I am trying to retrieve your default source lib. \n";
if( defined( $ENV{'MY_PERL_LIB'} ) ){
$source_library=$ENV{'MY_PERL_LIB'};
}elsif( defined( $ENV{'BIO_PERL'} ) ){
$bioperl_lib=$ENV{'BIO_PERL'};
}elsif(-e "/gn0/jong/Perl/Bio.pl"){
$source_library="/gn0/jong/Perl/Bio.pl";
}elsif(-e "/home/jong/Perl/Bio.pl"){
$source_library="/home/jong/Perl/Bio.pl";
}elsif(-e "/Perl/Bio.pl"){
$source_library="/Perl/Bio.pl";
}elsif(-e "Bio.pl"){
$source_library="Bio.pl";
}elsif(-e "/usr/Perl/Bio.pl"){
$source_library="/usr/Perl/Bio.pl";
}elsif(-e "/ss0/sat/Script/Bio.pl"){
$source_library="/ss0/sat/Script/Bio.pl";
}elsif(-e "/ss0/agb/Script/Bio.pl"){
$source_library="/ss0/agb/Script/Bio.pl";
}else{
print "\n $0 can not find source library, please set BIO_PERL env\n";
}
return(\$source_library);
}
#______________________________________________________________
# Title : find_central_seq_msp_chunk
# Usage : This finds the correct msp chunk with given seq name
# and big original or any msp chunk
# Function :
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------
sub find_central_seq_msp_chunk{
my $central_seq=${$_[0]};
my @MSP=@{$_[1]};
my ($j, $range, @MSP_1);
for($j=0; $j<@MSP; $j++){
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# $1 $2 $3 $4 $5 $6 $7 $8
# 171 41.18 6 73 HI1690 9 76 HI0736 sodium...
#,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
if($MSP[$j]=~/^ *(\d+) +\d+\.*\d* +(\d+) +(\d+) +(\S+) +(\d+) +(\d+) +(\S+) +(.+)/i){
if( ($4 eq $central_seq) && ($4 ne $7) ){
$range="$2 \- $3";
push(@MSP_1, $range);
}
}
}
return(\@MSP_1);
}
#______________________________________________________________
# Title : find_central_sequence
# Usage :
# Function : accepts msp file and finds the central sequence.
# central sequence is in the centre of all the member
# sequences in a group or cluster
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------
sub find_central_sequence{
#"""""""""""""""""< handle_arguments{ head Ver 3.9 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my (%score, $out, @Keys);
for($i=0; $i< @file; $i++){
my($input_file) = ${$file[$i]} || $file[$i];
if($debug eq 1){ print "\n inputfile is $input_file\n" };
unless (-e $input_file){
print chr(7);
print "\n\n\t This is sub open_msp_files in $0 \n\n";
print "\t Fatal: The input file $input_file is not in the directory \n";
}
open(FILE_1,"$input_file");
@MSP=<FILE_1>;
for($j=0; $j<@MSP; $j++){
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# $1 $2 $3 $4 $5 $6 $7 $8
# 171 41.18 6 73 HI1690 9 76 HI0736 sodium...
#,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
if($MSP[$j]=~/^ *(\d+) +\d+\.?[e\-\d]* +(\d+) +(\d+) +(\S+) +(\d+) +(\d+) +(\S+) +(.+)/){
if($4 eq $7){
$seq_name=$7;
}elsif( ($4 ne $7) && ( defined($seq_name) ) ){
$score{$seq_name}+= $1;
}
}
}
}
@Keys=keys %score;
for($i=0; $i< @Keys; $i++){
if($score{$Keys[$i]} > $largest){
$largest=$score{$Keys[$i]};
$out=$Keys[$i];
}
}
return(\$out);
}
#______________________________________________________________
# Title : write_dof_files
# Usage : &write_dof_files(\@msps);
# while @msps means msp file names
# Function : write Alex's domfam file. it prints out tilde lines
# if the seqlet matched are below threshold defined.
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords :
# Options : _ for debugging.
# # for debugging.
# v for verbose STDOUT
# n for NO seq start and end number display
# t= for teshold (eg, t=40 for Blastp(or ssearch) score 40 threshold)
#
# Returns :
# Argument :
# Category :
# Version : 1.2
#--------------------------------------------------------------
sub write_dof_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my $thresh = 0;
my @msps=@file;
my (@msp_name, $msp_file, @list, $leng, $scale);
$thresh=$vars{'t'} if defined( $vars{'t'} ) ;
$scale =$vars{'s'};
for($i=0; $i < @msps; $i++){ ### @msps should have msp file names
my($x, $leng, $line_size, $o, $I_len, $SC, %count);
my @msp_name=split(/\./, $msps[$i]);
my $base=$msp_name[0];
my $out_dof_file="$base\_$thresh\.dof";
open(DOF, ">$out_dof_file");
open(MSP, "$msps[$i]");
my @output=<MSP>;
###### Getting automatic $scale ~~~~~~~~~~~~~~~~~~~~~~~
unless($scale=~/\d+/){
for($j=0; $j< @output; $j++){
if($output[$j]=~/^ *\S+[\t ]+\S+[\t ]+1[\t ]+(\d+)[\t ]+\S+/){
$leng=$1 if ($1 > $leng);
}
}
if($leng > 1300){ $scale = 20;
}else{
$scale=int($leng / (log($leng)*10) );
}
if($scale < 5){
$scale=5;
}
}
if($output[0]=~/^$/){ splice(@output, 0, 1); }
if($remove=~/r/){ shift(@output); } ## removing the first line
#######====== Drawing the top line ###########
$line_size=int($leng/$scale);
#######====== SCALE writing =======###########
print DOF "\n NAME LENG FROM- TO ";
print "\n NAME LENG FROM- TO " if ($char_opt=~/v/);
my $div=int($leng/$scale);
my $Scaled=int($div/$scale);
for($x=1; $x< $leng; $x+=$div){
$I_len=length($x);
$SC=$Scaled-$I_len;
print DOF $x."."x$SC;
print $x."."x$SC if ($char_opt=~/v/);
}
#####~~~~ Processing MSP file lines ~~~~~~~~
for ($o=0; $o< @output; $o++){
my $each_msp_line=$output[$o];
my ($line,$score, $start1, $end1, $query, $start2, $end2,
$put_blank_line, $no_num, $target_seq,$first_time,
$S2L, $E2L, $L);
if($each_msp_line =~/^$/){
print DOF "\n";
print "\n" if ($char_opt=~/v/);
$first_time=1;
next;
}elsif($each_msp_line =~/^ *(\d+)[ \t]+\S+[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\S+)[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\S+)/){
$score=$1;
#if($score < $thresh){
# print "\n>$4 score too low"; next;
#}
$start1 =$2;
$end1 =$3;
$query =$4;
$start2 =$5;
$end2 =$6;
$target_seq=$7;
$target_leng=length($fasta{$target_seq});
if($count{$target_seq} >= 1){
$first_time=0;
$put_blank_line=0;
}elsif($count{$target_seq} < 1){
$first_time=1;
}
$count{$target_seq}++;
}
my $S=int($start1/$scale);
my $E=int($end1/$scale);
$L=$E-$S+1;
if($char_opt=~/n/i){
if($score < $thresh){
$line=" "x$S."\~"x$L;
}else{
$line=" "x$S."\-"x$L;
}
}else{
$S2L=length($start2);
$E2L=length($end2);
$L=$L-$S2L-$E2L;
if($L < 1){ $L=1 }
if($score < $thresh){
$line=" "x$S.$start2."\~"x$L.$end2;
}else{
$line=" "x$S.$start2."\-"x$L.$end2;
}
}
### Actual writing ####
if($first_time==1){
$first_time=0;
# Name leng strt-end |---------------------------------------------------------
printf DOF "\n\>%-11s %-4d %-4d\-%4d %-${line_size}s", $target_seq, $target_leng, $start1, $end1, $line;
printf ("\n\>%-11s %-4d %-4d\-%4d %-${line_size}s",
$target_seq, $target_leng, $start1, $end1, $line) if ($char_opt=~/v/);
}elsif($first_time !=1){
if($put_blank_line==1){
print DOF "\n";
printf DOF "\n %-11s %-4d %-4d\-%4d %-${line_size}s", $target_seq, $target_leng, $start1, $end1,$line;
print "\n" if ($char_opt=~/v/);
printf ("\n %-11s %-4d %-4d\-%4d %-${line_size}s",
$target_seq, $target_leng, $start1, $end1,$line) if ($char_opt=~/v/);
}else{
printf DOF "\n %-11s %-4d %-4d\-%4d %-${line_size}s", $target_seq, $target_leng, $start1, $end1,$line;
printf ( "\n %-11s %-4d %-4d\-%4d %-${line_size}s",
$target_seq, $target_leng, $start1, $end1,$line) if ($char_opt=~/v/);
}
}
}
print DOF "\n\n";
print "\n" if ($char_opt=~/v/);
print "\n# ~~~~~ lines mean match regions with below threshold ($thresh)" if $thresh > $score;
print "\n# $out_dof_file is created \n";
}
}
#______________________________________________________________
# Title : make_filtered_list
# Usage :
# Function : this is the core of check_genome_cluster.pl
# finds good linkage seqlets in msp files
# Example :
# Warning :
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------
sub make_filtered_list{ #####################################33
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
$|=1;
my ($result, @msp,%temp_msp, @num_of_all_links,$link_counter,$diff1, $diff2);
my $num_seq=0;
my @sizes_of_seqlets;
open(MSP, "$file[0]");
FIRST_FOR:for($c=0; $c< @file; $c++){
my %temp_msp;
#print "\nFirst\(${c}\)th INPUT file processing\n";
my @msp1=<MSP>;
my (@msp, $MSP);
for($i=0; $i< @msp1; $i++){
if($msp1[$i]=~/^ *(\d+) +\d+\.?[e\-\d]*[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\w+)[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\w+)/i){
if($1 > $msp_value){
$temp_msp{$4} .= $msp1[$i];
}
}
}
close(MSP);
#print "\n", %temp_msp, "\n";
@msp=values %temp_msp;
$num_seq=@msp;
print "\nTOP Number of seq is: $num_seq \n";
#"""""" Real algorithm starts HERE##''''''''''''''''''''''''''''''''''''''''
# optimization: I found no need to go through all msp chunk. One is enough by experience
#MSP1: for($i=0; $i< @msp; $i++){ # @msp has (mspchunk1, mspchunk2...)
if($fast==1){ $msp_chunk_num=1
}else{ $msp_chunk_num=@msp }
MSP1: for($i=0; $i< $msp_chunk_num; $i++){ # @msp has (mspchunk1, mspchunk2...)
my $pos=$i+1;
my @seqlets1=split(/\n+/, $msp[$i]);
print " MSP1 ${i}th MSP chunk is handled #######","\n";
my $temp=@seqlets1-1;
SEQLET1: for($j=1; $j < @seqlets1; $j++){
my @OUTPUT=&follow_seqlet_link($seqlets1[$j], @msp);
my $depth_of_linking=${$OUTPUT[0]};
my $size_of_common_seqlet=${$OUTPUT[1]};
if(($depth_of_linking==@msp)&&($size_of_common_seqlet > $threshold)){
$result=1;
if($fast == 1){
last FIRST_FOR;
#goto EXIT_1;
}
}
#push(@num_of_all_links, $depth_of_linking);
#push(@sizes_of_seqlets, $size_of_common_seqlet);
}
}
}
#print "\n All searched links: \n", "@num_of_all_links", "\n";
#print "\n Sizes of common seqlet\n", "@sizes_of_seqlets\n";
EXIT_1:
return(\$result, \$num_seq);
}
#______________________________________________________________
# Title : follow_seqlet_link
# Usage :
# Function :
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------
sub follow_seqlet_link{
my $seqlet_line=shift;
my @msp=@_;
my ($i, $j, $link_counter, @common_range,$seqlet_very_ori, @ranges_very_ori,
@new_ranges, $seqlet_ori, $matched_ori, @ranges1, @ranges2);
if($seqlet_line=~/^ *\d+[ \t]+\d+\.?[e\-\d]*[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\w+)[ \t]+(\d+)[ \t]+(\d+)[ \t]+(\w+)/i){
$seqlet_ori =$3;
$seqlet_very_ori=$3;
$matched_ori=$6;
@ranges1=($1, $2, $4, $5);
@ranges_very_ori=@ranges1;
}
my $visited .= "$seqlet_ori ";
#print "\n\=======$seqlet_ori \@ranges1 is @ranges1 =================================\n";
MSP: for($i=0; $i<@msp; $i++){
my @seqlets1=split(/\n+/, $msp[$i]);
#my @common_range;
SEQLET1: for($j=1; $j < @seqlets1; $j++){
my $seqlet_num=$j;
my @temp= &get_msp_range($seqlets1[$j]) if $seqlets1[$j]=~/\S/;
my @ranges2=@{$temp[0]}; # has (1 2 3 4);
my $seq2 = $temp[1];
my $matched_seq2= $temp[2];
#splice(@seqlets1, $j, 1);
#$j--;
if($seq2 eq $seqlet_ori){
next MSP; # to the next MSP chunk
}elsif(($seq2 eq $matched_ori)&&($visited !~ /$matched_seq2/)){
if($verbose=~/v$/i){
print "\n SEQLET1_________________________________________\(${pos}th MSP chunk\)\n";
print " SEQLET1 $temp number of seqlets for the ${pos}th msp file\n";
print " SEQLET1_________________________________________\(${seqlet_num}th seqlet )\n";
print " QUERY seqlet is $temp[1]: ", "@ranges2[0..1] MATCH seqlet is $temp[2]: ", "@ranges2[2..3]","\n";
print " Target seqlet is $seq2 : @ranges2\n";
}
if(($ranges1[2] >= $ranges2[0])&& ## =======
($ranges1[3] <= $ranges2[1]) ){ ## ==============
$link_counter++;
#push(@common_range, @ranges1, @ranges2);
#print "\n MSP2 \@ranges1 is ", "@ranges1" if($verbose=~/v/i);
#print "\n \@ranges2 is ", "@ranges2" if($verbose=~/v/i);
$diff1=$ranges1[2] - $ranges2[0];
$diff2=$ranges2[1] - $ranges1[3];
@new_ranges =($ranges1[2], $ranges1[3], ($ranges2[2]+$diff1), ($ranges2[3]-$diff2) );
@ranges1=(@new_ranges); #, $ranges2[2], $ranges2[3]);
$seqlet_ori=$seq2;
$matched_ori=$matched_seq2;
$visited .= "$seqlet_ori ";
if($verbose=~/v/i){
print "\n FIRST elsif Finalout @new_ranges \$link_counter=$link_counter", "\n";
print " \$num_seq = $num_seq\n";
}
if( ($link_counter+2) >= @msp){
#print "\n All link found \$link_counter = $link_counter, \$num_seq=$num_seq\n";
$result=1;
#$link_counter=0;
#print "\n Common range: ", "@common_range", "\n";
$visited .= "$matched_seq2 ";
#print " Sequence visited: $visited \n";
@common_range=();
$not_visited_msp_chunk=0;
goto EXIT;
}
$i=0;
next MSP;
}elsif(($ranges1[2] <= $ranges2[0])&& ## --------------
($ranges1[3] >= $ranges2[1]) ){ ## --------
$link_counter++;
#push(@common_range, @ranges1, @ranges2);
#print "\n \@ranges1 is ", "@ranges1";
#print "\n \@ranges2 is ", "@ranges2";
@new_ranges =($ranges2[0], $ranges2[1], $ranges2[2], $ranges2[3],);
#print "\n Second elsif Finalout ", @new_ranges, " \$link_counter=$link_counter\n";
#print " \$num_seq = $num_seq\n";
@ranges1=(@new_ranges);
$seqlet_ori=$seq2;
$matched_ori=$matched_seq2;
$visited .= "$seqlet_ori ";
if( ($link_counter+2) >= @msp){
#print "\n All link found \$link_counter = $link_counter, \$num_seq=$num_seq\n";
$result=1;
#$link_counter=0;
#print "\n ", "@common_range", "\n";
$visited .= "$matched_seq2 ";
#print " Sequence visited: $visited \n";
@common_range=();
$not_visited_msp_chunk=0;
goto EXIT;
}
$i=0;
next MSP;
}elsif(($ranges1[2] <= $ranges2[0])&& # ======
($ranges1[3] <= $ranges2[1]) && ## =======
($ranges1[3] >= $ranges2[0]) ){
$link_counter++;
#push(@common_range, @ranges1, @ranges2);
#print "\n \@ranges1 is ", "@ranges1";
#print "\n \@ranges2 is ", "@ranges2";
#print " \$num_seq = $num_seq\n";
$diff2=$ranges2[1] - $ranges1[3];
@new_ranges=($ranges2[0], $ranges1[3], $ranges2[2], ($ranges2[3]-$diff2));
@ranges1=(@new_ranges);
$seqlet_ori=$seq2;
$matched_ori=$matched_seq2;
$visited .= "$seqlet_ori ";
#print "\n Third elsif Finalout ", @new_ranges, " \$link_counter=$link_counter\n";
if( ($link_counter+2) >= @msp){
#print "\n All link found \$link_counter = $link_counter, \$num_seq=$num_seq\n";
$result=1;
#$link_counter=0;
#print "\n ", "@common_range", "\n";
$visited .= "$matched_seq2 ";
#print " Sequence visited: $visited \n";
@common_range=();
$not_visited_msp_chunk=0;
goto EXIT;
}
$i=0;
next MSP;
}elsif(($ranges1[2] >= $ranges2[0])&& # ======
($ranges1[3] >= $ranges2[1])&& ## =======
($ranges1[2] <= $ranges2[1]) ){
$link_counter++;
#push(@common_range, @ranges1, @ranges2);
#print "\n \@ranges1 is ", "@ranges1";
#print "\n \@ranges2 is ", "@ranges2";
$diff1=$ranges1[2] - $ranges2[0];
@new_ranges=($ranges1[2], $ranges2[1], ($ranges2[0]+$diff1), $ranges2[1]);
@ranges1=(@new_ranges);
$seqlet_ori=$seq2;
$matched_ori=$matched_seq2;
$visited .= "$seqlet_ori ";
#print "\n Fourth elsif Finalout ", @new_ranges, " \$link_counter=$link_counter\n";
#print " \$num_seq = $num_seq\n";
if( ($link_counter+2) >= @msp){
#print "\n All link found \$link_counter = $link_counter, \$num_seq=$num_seq\n";
$result=1;
#$link_counter=0;
#print "\n ", "@common_range", "\n";
$visited .= "$matched_seq2 ";
#print " Sequence visited: $visited \n";
@common_range=();
$not_visited_msp_chunk=0;
goto EXIT;
}
$i=0;
next MSP;
}else{
if($verbose=~/v/i){
print "\nX X X X X Link broken ", @new_ranges, " \$link_counter=$link_counter\n";
}
next SEQLET1;
}
}
}
}
EXIT:
$final_num_of_seq_linked=$link_counter+2;
if($final_num_of_seq_linked==@msp){
$seqlet_leng=$new_ranges[$#common_range]-$new_ranges[$#common_range-1]+1;
#print "\n Common Seqlet size: $seqlet_leng \n";
}else{
$seqlet_leng=0;
}
#print "\nLINKING seq num for seqlet $seqlet_very_ori \(","@ranges_very_ori","\) is $final_num_of_seq_linked \n";
return(\$final_num_of_seq_linked, \$seqlet_leng);
}
#________________________________________________________________________________
# Title : convert_clu_to_msp
# Usage : @written_msp_files=@{&convert_clu_to_msp(\$single_linkage_file)};
# Function : reads in a big single linkage cluster file(or normal cluster file)
# and creates a big msp file which contains all the entries in the
# cluster file (usually with the extension of sclu or clu)
# This normally reads in xxxx.mso, xxxx.sso like files, but if the
# corresponding xxx.msp file already exists, it concatenates them to
# make a bigger one.
# Example :
# Keywords : clu_2_sso_2_msp, cluster_to_msp, cluster_to_sso_to_msp
# clu_to_sso_to_msp
# Options :
# Category :
# Version : 2.3
#--------------------------------------------------------------------------------
sub convert_clu_to_msp{
my($i, $j, $k, $s, $u, $v, $p, $m, $n, $y, @possible_extensions, $single_file_name,
@seq_names, @final_files, @U_L_case, $file, @file, @name_types,
@poss_sub_dir_heads, @written_msp_files, $Lean_output, $subdir_char_size,
$search_file_base, $found_real_subdir_name, $found_search_prog_exention_used);
$subdir_char_size=2; # default
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Opening cluster file (xx.clu)
# %clus looks like this: 2-507 YGR041W YLR353W
# 3-308 YDR222W YDR346C YLR225C
# 2-184 YCL066W YCR040W
#______________________________________________________________
my $clu=${$_[0]} || $_[0];
$Lean_output=${$_[1]} || $_[1];
if($verbose){
print "\n# convert_clu_to_msp : \"$clu\" is given
and I am processing it with clu_to_sso_to_msp\n" if defined $clu;
}
my %clus=%{&open_clu_files(\$clu)};
my @clusters= keys %clus;
my $num_of_cluster=@clusters=@{&sort_by_cluster_size(\@clusters)};
print "# (i) $0: convert_clu_to_msp: No. of cluster=$num_of_cluster after open_clu_files \n" if $verbose;
&show_array(\@clusters) if $verbose;
&show_hash(\%clus) if $verbose;
@possible_extensions=('msp', 'msp.gz', 'msso', 'msso.gz','fsso', 'pbla', 'pbla.gz',
'ssso', 'fso', 'out', 'prot.sso', 'prot.ts');
@U_L_case=('\U', '\L');
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Making each SINGLE linkage clu to MSP file format to be ready for divclus
#______________________________________________________________________________
for($i=0; $i< @clusters; $i++){
my (@seq_names, @final_files, $clus_name, $big_out_msp, @msp_hashes);
$clus_name=$clusters[$i];
unless($single_file_name=~/\S/){
$big_out_msp="$clus_name\_cluster\.msp"; #<<<----- final output name
}else{
$big_out_msp=$single_file_name;
}
push(@written_msp_files, $big_out_msp); ## This is the output of this sub
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If $clus_name.msp is already there, skip
#_____________________________________________
if( (-s $big_out_msp) > 100 and !$over_write ){
print "\n# (i) convert_clu_to_msp : $big_out_msp MSP file already exists, skipping\n";
print "# Use \$over_write option \'o\' to start all over again or \n";
print "# delete clustering files like XX-XX_cluster.clu to go on\n";
next ;
}
$num_of_seq_member=@seq_names=split(/ +/, $clus{$clusters[$i]}); # @seq_names has (HIU001, HI002, HI333, MJ111, etc)
print "# $0: convert_clu_to_msp: No. of seq member=$num_of_seq_member after split \n" if $verbose;
FOR0: for($j=0; $j < @seq_names; $j++){
my($sub_dir_head, $file_name_low, $file_name_up, $file_name_prot_low,
$file_name_prot_up, $file_name_low_gz, $file_name_up_gz,
$file_name_prot_low_gz, $file_name_prot_up_gz);
$each_seq_name=$seq_names[$j];
my @poss_sub_dir_heads=('.'); ## <<<<------- This is critically important, when 'D' opt is not used!
if($each_seq_name=~/(\S+)_\d+\-\d+$/){
$each_seq_name_range=$each_seq_name;
$each_seq_name=$1;
@name_types=($each_seq_name, $each_seq_name_range);
}else{
@name_types=($each_seq_name);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Here I take chars from the sequ names, as dirs have fragments of chars
#_______________________________________________________________________________
for($s=1; $s <= $subdir_char_size ; $s++){ ## here, number 2 indicates, I check single or 2 char sub dir names
$sub_dir_head= substr($seq_names[$j], 0, $s);
push(@poss_sub_dir_heads, "\L$sub_dir_head") if (-d "\L$sub_dir_head" );
push(@poss_sub_dir_heads, "\U$sub_dir_head") if (-d "\U$sub_dir_head" );
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Checking all the possible subdirectories to crop all the sso files
#_______________________________________________________________________________
FOR1: for($p=0; $p <= @poss_sub_dir_heads; $p++){ ## Default has '.' will make things like '././fam_8_8.pbla.gz'
$subd=$poss_sub_dir_heads[$p]; ## Also, the '<=' not '<' cures the same problem.
FOR2 : for($e=0; $e < @possible_extensions; $e++){
$ext=$possible_extensions[$e];
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This makes all the possible lower upper case names
#______________________________________________________
for( $u=0; $u < @U_L_case; $u++){
for($v=0; $v <@name_types; $v++){
$each_seq_name=$name_types[$v];
if($U_L_case[$u]=~/U/){ $each_seq_name="\U$each_seq_name";
}else{ $each_seq_name="\L$each_seq_name"; }
if(-s "$each_seq_name\.$ext"){
push(@final_files, "$each_seq_name\.$ext" ) ;
$found_search_prog_exention_used=$ext;
$found_real_subdir_name=$subd; ## This is to report the name of the actual subd found
$found_search_prog_exention_used=$ext;
next FOR0
}elsif(-s "$each_seq_name\.$ext\.gz"){
push(@final_files, "$each_seq_name\.$ext\.gz" ) ;
$found_search_prog_exention_used=$ext;
$found_real_subdir_name=$subd; ## This is to report the name of the actual subd found
$found_search_prog_exention_used=$ext;
next FOR0
}else{
$file_wanted="\.\/$subd\/$each_seq_name\.$ext";
if(-s $file_wanted){
push( @final_files, $file_wanted);
$found_real_subdir_name=$subd; ## This is to report the name of the actual subd found
$found_search_prog_exention_used=$ext;
next FOR0
}elsif(-s "$file_wanted\.gz"){
push( @final_files, "$file_wanted\.gz");
$found_search_prog_exention_used=$ext;
$found_real_subdir_name=$subd; ## This is to report the name of the actual subd found
next FOR0;
}
}
}
}
} # FOR2
} # FOR1
print @final_files, "\n";
} # FOR0
#print "\n# @final_files \n=============> $big_out_msp \n\n";
if(@final_files < 1){
print "\n# convert_clu_to_msp :LINE no.: ", __LINE__, " ERROR: \@final_files is empty. Serious error\n";
print "\n If you have sub dir which have more than 2 chars as names, you may increase the default 2 to 3 in the above\n";
next;
}
$write_each_msp_to_disk='';
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Check if small msp files have already made in previous steps
#________________________________________________________________
if($final_files[0]=~/(\S+)\.msp/){ ## concatenate msp into big_msp
$search_file_base=$1;
print "\n# $search_file_base $found_real_subdir_name $found_search_prog_exention_used\n" if $verbose;
if($final_files[0]=~/\S\.gz$/){
print "\n# $final_files[0] is gzipped \n";
system("gzip -d $final_files[0]");
$final_files[0]=~s/\.gz//;
}
open(BIG_MSP_FILE, ">$big_out_msp");
for($y=0; $y< @final_files; $y++){
open(SINGLE_MSP, "$final_files[$y]");
while(<SINGLE_MSP>){
print BIG_MSP_FILE $_;
}
}
close(BIG_MSP_FILE);
close(SINGLE_MSP);
push(@written_msp_files, $big_out_msp);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
# Doing something for L option ($Lean_output)
#___________________________________________________
if($Lean_output and -d $found_real_subdir_name and $found_search_prog_exention_used){
for($y=0; $y< @seq_names; $y++){
unlink("$found_real_subdir_name\/$seq_names[$y]\.$found_search_prog_exention_used");
unlink("$found_real_subdir_name\/$seq_names[$y]\.$found_search_prog_exention_used\.gz");
}
}elsif($Lean_output){
for($y=0; $y< @seq_names; $y++){
unlink("$seq_names[$y]\.$found_search_prog_exention_used");
unlink("$seq_names[$y]\.$found_search_prog_exention_used\.gz");
}
}
}else{
if($write_each_msp_to_disk){
print "\# $0 : going to run open_sso_files with $write_each_msp_to_disk opt\n";
$big_out_msp=${&open_sso_files(\@final_files, $uppercase_seq_name, $write_each_msp_to_disk,
"u=$upper_expect_limit", $new_format, $add_range, $add_range2, $big_out_msp, $over_write)};
if(-s $big_out_msp > 200){ print "\n# $0: SUCCESS to create $big_out_msp :) :) :-) :-) ?\n"; }
}else{
print "\n# convert_clu_to_msp: I am running open_sso_files. \n";
@msp_hashes=@{&open_sso_files(\@final_files, $uppercase_seq_name, $write_each_msp_to_disk,
"u=$upper_expect_limit", $new_format, $add_range,
$add_range2, $big_out_msp, $over_write)};
&write_msp_files(@msp_hashes, $big_out_msp); ## concatenates all the hash ref to one
}
}
}## end of for($i=0; $i< @clusters; $i++){
return(\@written_msp_files);
}# end of
#________________________________________________________________________________
# Title : clu_to_sso_to_msp (use convert_clu_to_msp)
# Usage : &clu_to_sso_to_msp(\$clu);
# Function : reads in a big single linkage cluster file(or normal cluster file)
# and creates a big msp file which contains all the entries in the
# cluster file (usually with the extension of sclu or clu)
# This normally reads in xxxx.mso, xxxx.sso like files, but if the
# corresponding xxx.msp file already exists, it concatenates them to
# make a bigger one.
# Example :
# Keywords : clu_2_sso_2_msp, cluster_to_msp, cluster_to_sso_to_msp
# convert_clu_to_sso_to_msp
# Options : USE, convert_clu_to_sso_to_msp, this is obsolute now
# Category :
# Version : 1.7
#--------------------------------------------------------------------------------
sub clu_to_sso_to_msp{
my($i, $j, $k, $s, $u, $p, $m, $n, $y, @possible_extensions, @list,
@final_files, @U_L_case, $file, @file, @written_msp_files);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Opening cluster file (xx.clu)
# %clus looks like this: 2-507 YGR041W YLR353W
# 3-308 YDR222W YDR346C YLR225C
# 2-184 YCL066W YCR040W
#______________________________________________________________
my $clu=${$_[0]} || $_[0];
if($verbose){
print "\n# clu_to_sso_to_msp : \"$clu\" is given
and I am processing it with clu_to_sso_to_msp\n" if defined $clu;
}
my %clus=%{&open_clu_files(\$clu)};
my @keys= keys %clus;
my $num_of_cluster=@keys=@{&sort_by_cluster_size(\@keys)};
print "# $0: clu_to_sso_to_msp: No. of cluster=$num_of_cluster after open_clu_files \n" if $verbose;
&show_array(\@keys) if $verbose;
&show_hash(\%clus) if $verbose;
@possible_extensions=('msp', 'sso', 'msso', 'msso.gz','fsso', 'ssso', 'fso', 'out', 'prot.sso', 'prot.ts');
@U_L_case=('\U', '\L');
for($i=0; $i< @keys; $i++){
my (@list, @final_files, $clus_name, $big_out_msp, @msp_hashes);
$clus_name=$keys[$i];
unless($single_file_name=~/\S/){
$big_out_msp="$clus_name\_cluster\.msp"; #<<<----- final output name
}else{
$big_out_msp=$single_file_name;
}
push(@written_msp_files, $big_out_msp); ## This is the output of this sub
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If $clus_name.msp is already there, skip
#_____________________________________________
if( (-s $big_out_msp) > 100 and !$over_write ){
print "\n# clu_to_sso_to_msp : $big_out_msp MSP file already exists, skipping\n";
print "# Use \$over_write option \'o\' to start all over again or \n";
print "# delete clustering files like XX-XX_cluster.clu to go on\n";
next ;
}
$num_of_seq_member=@list=split(/ +/, $clus{$keys[$i]}); # @list has (HIU001, HI002, HI333, MJ111, etc)
print "# $0: clu_to_sso_to_msp: No. of seq member=$num_of_seq_member after split \n" if $verbose;
FOR0: for($j=0; $j < @list; $j++){
my($sub_dir_head, $file_name_low, $file_name_up, $file_name_prot_low, @sub_dir_heads,
$file_name_prot_up, $file_name_low_gz, $file_name_up_gz,
$file_name_prot_low_gz, $file_name_prot_up_gz);
$each_seq_name=$list[$j];
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Here I take chars from the sequ names, as dirs have fragments of chars
#_______________________________________________________________________________
for($s=1; $s <=2 ; $s++){ ## here, number 2 indicates, I check single or 2 char sub dir names
$sub_dir_head= substr($list[$j], 0, $s);
push(@sub_dir_heads, "\L$sub_dir_head") if (-d "\L$sub_dir_head" );
push(@sub_dir_heads, "\U$sub_dir_head") if (-d "\U$sub_dir_head" );
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Checking all the possible subdirectories to crop all the sso files
#_______________________________________________________________________________
FOR1: for($p=0; $p < @sub_dir_heads; $p++){
$subd=$sub_dir_heads[$p];
FOR2 : for($e=0; $e < @possible_extensions; $e++){
$ext=$possible_extensions[$e];
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This makes all the possible lower upper case names
#______________________________________________________
for( $u=0; $u < @U_L_case; $u++){
if($U_L_case[$u]=~/U/){ $each_seq_name="\U$each_seq_name";
}else{ $each_seq_name="\L$each_seq_name"; }
if(-s "$each_seq_name\.$ext"){ push(@final_files, "$each_seq_name\.$ext" ) ; next FOR0 }
elsif(-s "$each_seq_name\.$ext\.gz"){ push(@final_files, "$each_seq_name\.$ext\.gz" ) ; next FOR0 }
else{
$file_wanted="\.\/$subd\/$each_seq_name\.$ext";
if(-s $file_wanted){
push( @final_files, $file_wanted); next FOR0 }
elsif(-s "$file_wanted\.gz"){
push( @final_files, "$file_wanted\.gz");
next FOR0
}
}
}
} # FOR2
} # FOR1
} # FOR0
print "\n# @final_files \n=============> $big_out_msp \n\n" if $verbose;
if(@final_files < 1){
print "\n# clu_to_sso_to_msp :LINE no.: ", __LINE__, " ERROR: \@final_files is empty. Serious error\n";
print "\n If you have sub dir which have more than 2 chars as names, you may increase the default 2 to 3 in the above\n";
next;
}
# $write_each_msp_to_disk='w';
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Check if small msp files have already made in previous steps
#________________________________________________________________
if($final_files[0]=~/\.msp *$/){ ## concatenate msp into big_msp
open(BIG_MSP_FILE, ">$big_out_msp");
for($y=0; $y< @final_files; $y++){
open(SINGLE_MSP, "$final_files[$y]");
while(<SINGLE_MSP>){
print BIG_MSP_FILE $_;
}
}
close(BIG_MSP_FILE);
close(SINGLE_MSP);
push(@written_msp_files, $big_out_msp);
}else{
if($write_each_msp_to_disk){
print "\# $0 : going to run open_sso_files with $write_each_msp_to_disk opt\n";
$big_out_msp=${&open_sso_files(\@final_files, $uppercase_seq_name, $write_each_msp_to_disk,
"u=$upper_expect_limit", $new_format, $add_range, $add_range2, $big_out_msp, $over_write)};
if(-s $big_out_msp > 200){ print "\n# $0: SUCCESS to create $big_out_msp :) :) :-) :-) ?\n"; }
}else{
print "\n# clu_to_sso_to_msp: I am running open_sso_files. \n";
@msp_hashes=@{&open_sso_files(\@final_files, $uppercase_seq_name, $write_each_msp_to_disk,
"u=$upper_expect_limit", $new_format, $add_range, $add_range2, $big_out_msp, $over_write)};
&write_msp_files(@msp_hashes, $big_out_msp); ## concatenates all the hash ref to one
}
}
}
return(\@written_msp_files);
}# end of
#________________________________________________________________________________
# Title : convert_clu_to_sso_to_msp
# Usage : &clu_to_sso_to_msp(\$clu);
# Function : reads in a big single linkage cluster file(or normal cluster file)
# and creates a big msp file which contains all the entries in the
# cluster file (usually with the extension of sclu or clu)
# This normally reads in xxxx.mso, xxxx.sso like files, but if the
# corresponding xxx.msp file already exists, it concatenates them to
# make a bigger one.
# Example :
# Keywords : clu_2_sso_2_msp, cluster_to_msp, cluster_to_sso_to_msp
# clu_to_sso_to_msp
# Options :
# Category :
# Version : 1.8
#--------------------------------------------------------------------------------
sub convert_clu_to_sso_to_msp{
my($i, $j, $k, $s, $u, $p, $m, $n, $y, @possible_extensions, @list,
@final_files, @U_L_case, $file, @file, @written_msp_files);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Opening cluster file (xx.clu)
# %clus looks like this: 2-507 YGR041W YLR353W
# 3-308 YDR222W YDR346C YLR225C
# 2-184 YCL066W YCR040W
#______________________________________________________________
my $clu=${$_[0]} || $_[0];
if($verbose){
print "\n# clu_to_sso_to_msp : \"$clu\" is given
and I am processing it with clu_to_sso_to_msp\n" if defined $clu;
}
my %clus=%{&open_clu_files(\$clu)};
my @keys= keys %clus;
my $num_of_cluster=@keys=@{&sort_by_cluster_size(\@keys)};
print "# $0: clu_to_sso_to_msp: No. of cluster=$num_of_cluster after open_clu_files \n" if $verbose;
&show_array(\@keys) if $verbose;
&show_hash(\%clus) if $verbose;
@possible_extensions=('msp', 'sso', 'msso', 'msso.gz','fsso', 'ssso', 'fso', 'out', 'prot.sso', 'prot.ts');
@U_L_case=('\U', '\L');
for($i=0; $i< @keys; $i++){
my (@list, @final_files, $clus_name, $big_out_msp, @msp_hashes);
$clus_name=$keys[$i];
unless($single_file_name=~/\S/){
$big_out_msp="$clus_name\_cluster\.msp"; #<<<----- final output name
}else{
$big_out_msp=$single_file_name;
}
push(@written_msp_files, $big_out_msp); ## This is the output of this sub
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If $clus_name.msp is already there, skip
#_____________________________________________
if( (-s $big_out_msp) > 100 and !$over_write ){
print "\n# clu_to_sso_to_msp : $big_out_msp MSP file already exists, skipping\n";
print "# Use \$over_write option \'o\' to start all over again or \n";
print "# delete clustering files like XX-XX_cluster.clu to go on\n";
next ;
}
$num_of_seq_member=@list=split(/ +/, $clus{$keys[$i]}); # @list has (HIU001, HI002, HI333, MJ111, etc)
print "# $0: clu_to_sso_to_msp: No. of seq member=$num_of_seq_member after split \n" if $verbose;
FOR0: for($j=0; $j < @list; $j++){
my($sub_dir_head, $file_name_low, $file_name_up, $file_name_prot_low, @sub_dir_heads,
$file_name_prot_up, $file_name_low_gz, $file_name_up_gz,
$file_name_prot_low_gz, $file_name_prot_up_gz);
$each_seq_name=$list[$j];
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Here I take chars from the sequ names, as dirs have fragments of chars
#_______________________________________________________________________________
for($s=1; $s <=2 ; $s++){ ## here, number 2 indicates, I check single or 2 char sub dir names
$sub_dir_head= substr($list[$j], 0, $s);
push(@sub_dir_heads, "\L$sub_dir_head") if (-d "\L$sub_dir_head" );
push(@sub_dir_heads, "\U$sub_dir_head") if (-d "\U$sub_dir_head" );
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Checking all the possible subdirectories to crop all the sso files
#_______________________________________________________________________________
FOR1: for($p=0; $p < @sub_dir_heads; $p++){
$subd=$sub_dir_heads[$p];
FOR2 : for($e=0; $e < @possible_extensions; $e++){
$ext=$possible_extensions[$e];
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This makes all the possible lower upper case names
#______________________________________________________
for( $u=0; $u < @U_L_case; $u++){
if($U_L_case[$u]=~/U/){ $each_seq_name="\U$each_seq_name";
}else{ $each_seq_name="\L$each_seq_name"; }
if(-s "$each_seq_name\.$ext"){ push(@final_files, "$each_seq_name\.$ext" ) ; next FOR0 }
elsif(-s "$each_seq_name\.$ext\.gz"){ push(@final_files, "$each_seq_name\.$ext\.gz" ) ; next FOR0 }
else{
$file_wanted="\.\/$subd\/$each_seq_name\.$ext";
if(-s $file_wanted){
push( @final_files, $file_wanted); next FOR0 }
elsif(-s "$file_wanted\.gz"){
push( @final_files, "$file_wanted\.gz");
next FOR0
}
}
}
} # FOR2
} # FOR1
} # FOR0
print "\n# @final_files \n=============> $big_out_msp \n\n" if $verbose;
if(@final_files < 1){
print "\n# clu_to_sso_to_msp :LINE no.: ", __LINE__, " ERROR: \@final_files is empty. Serious error\n";
print "\n If you have sub dir which have more than 2 chars as names, you may increase the default 2 to 3 in the above\n";
next;
}
# $write_each_msp_to_disk='w';
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Check if small msp files have already made in previous steps
#________________________________________________________________
if($final_files[0]=~/\.msp *$/){ ## concatenate msp into big_msp
open(BIG_MSP_FILE, ">$big_out_msp");
for($y=0; $y< @final_files; $y++){
open(SINGLE_MSP, "$final_files[$y]");
while(<SINGLE_MSP>){
print BIG_MSP_FILE $_;
}
}
close(BIG_MSP_FILE);
close(SINGLE_MSP);
push(@written_msp_files, $big_out_msp);
}else{
if($write_each_msp_to_disk){
print "\# $0 : going to run open_sso_files with $write_each_msp_to_disk opt\n";
$big_out_msp=${&open_sso_files(\@final_files, $uppercase_seq_name, $write_each_msp_to_disk,
"u=$upper_expect_limit", $new_format, $add_range, $add_range2, $big_out_msp, $over_write)};
if(-s $big_out_msp > 200){ print "\n# $0: SUCCESS to create $big_out_msp :) :) :-) :-) ?\n"; }
}else{
print "\n# clu_to_sso_to_msp: I am running open_sso_files. \n";
@msp_hashes=@{&open_sso_files(\@final_files, $uppercase_seq_name, $write_each_msp_to_disk,
"u=$upper_expect_limit", $new_format, $add_range, $add_range2, $big_out_msp, $over_write)};
&write_msp_files(@msp_hashes, $big_out_msp); ## concatenates all the hash ref to one
}
}
}
return(\@written_msp_files);
}# end of
#______________________________________________________________________________
# Title : sso_to_msp
# Usage : &sso_to_msp(@ARGV, $single_out_opt);
# Function : This takes sso file(s) and produces MSP file. It
# concatenate sso file contents when more than one
# sso file is given.
# Example : &sso_to_msp(@ARGV, 'OUT.msp', $single_out_opt);
# Warning : This capitalize all the input file names when
# producing xxxxx.msp. xxxxx.sso -> XXXX.sso
# Keywords : sso_file_to_msp_file, convert_sso_to_msp,
# Options : _ for debugging.
# # for debugging.
# v for showing the MSP result to screen
# s for making single MSP file for each sso file
# as well as big MSP file which has all sso
# u= for upper expectation value limit
# l= for lower expect val limit
# s= for single file name input eg. "s=xxxxx.msp"
# n for new format (msp2 format)
# r for adding range
# r2 for adding ranges in all sequence names
#
# Returns : the file names created (xxxx.msp, yyyy.msp,,,,)
# Argument :
# Category :
# Version : 2.6
#-----------------------------------------------------------------------------
sub sso_to_msp{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($upper_expect_limit, $lower_expect_limit)=(50, 0);
my (%sso, @sso, @SSO, $big_out_msp1, @final_out, $big_out_msp2,
$create_sso, $single_out_opt, $add_range, $add_range2, $big_out_msp,
$Evalue_thresh, $new_format, $Score_thresh, $margin, $single_file_name);
if($vars{'u'}=~/([\.\d]+)/){ $upper_expect_limit = $vars{'u'} };
if($vars{'l'}=~/([\.\d]+)/){ $lower_expect_limit = $vars{'l'} };
if($vars{'t'}=~/(\d+)/){ $Score_thresh = $vars{'t'} };
if($vars{'m'}=~/(\d+)/){ $margin = $vars{'m'} };
if($vars{'s'}=~/\S/){ $single_file_name = $vars{'s'} };
if($char_opt=~/r2/){ $add_range='r'; $add_range2='r2' }
if($char_opt=~/r/){ $add_range = 'r' }
if($char_opt=~/c/){ $create_sso = 'c' }
if($char_opt=~/s/){ $single_out_opt='s' }
if($char_opt=~/n/){ $new_format='n' }
print "\n# File given to sso_to_msp is \"@file\", Normally xxx.sso file names\n";
if($single_file_name=~/\S/){
$big_out_msp=$single_file_name;
}else{
for($i=0; $i < @file; $i++){
if($file[$i]=~/\.msp$/){ ## when output file name is given
$big_out_msp=$file[$i];
splice(@file, $i, 1);
$i--;
}elsif($file[$i]=~/^(\d+\-\d+)([_\d]*)\.[mfs]?sso/){ ## creates xxxx.msp file name from xxxx.sso
$big_out_msp1="\U$1"."$2"."\.msp";
$big_out_msp2="\U$1".".msp";
}elsif($file[$i]=~/^(\S+)\.[mfs]?sso$/){
$big_out_msp1="\U$1"."\.msp";
$big_out_msp2="\U$1"."_all".".msp";
print "\n# sso_to_msp: File matched xxxx.sso format \n";
}elsif($file[$i]=~/^(\S+)\.out$/){
$big_out_msp1="\U$1"."\.msp";
$big_out_msp2="\U$1"."_all".".msp";
print "\n# sso_to_msp: File matched xxxx.out format \n";
}elsif($file[$i]=~/^(\S+)\.p[rot\,]*\.ts\.gz/){
$big_out_msp1="\U$1".".msp";
$big_out_msp2="\U$1"."_all".".msp";
}elsif($file[$i]=~/^(\S+)\.ts\.gz/){
$big_out_msp1="\U$1".".msp";
$big_out_msp2="\U$1"."_all".".msp";
}elsif($file[$i]=~/^(\S+)\.out\.gz/ or $file[$i]=~/^(\S+)\.[mfs]?sso\.gz/){
$big_out_msp1="\U$1".".msp";
$big_out_msp2="\U$1"."_all".".msp";
}
}
}
if(defined($big_out_msp)){
$big_out_msp1=$big_out_msp2=$big_out_msp;
print "\n# \$big_out_msp is defined as \'$big_out_msp\'\n";
}else{
print "\n# sso_to_msp: You did not define the big MSP file out format, so $big_out_msp1 \n";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) When File was given to this sub routine
#__________________________________________
if(@file == 1){ ## ONE single file input??
print "# one file @file is given, OUT will be: $big_out_msp1 \n";
@sso=@{&open_sso_files(@file, $add_range, $add_range2,
"u=$upper_expect_limit",
"l=$lower_expect_limit",
"m=$margin",
$new_format,
"s=$big_out_msp")};
push(@final_out, &write_msp_files(@sso, $big_out_msp1,
$single_out_opt, $add_range) );
}elsif(@file > 1){ ## MOre than 1 file input??
@sso=@{&open_sso_files(@file, $add_range, $add_range2,
"l=$lower_expect_limit",
"u=$upper_expect_limit",
"m=$margin",
$new_format)};
push(@final_out, @{&write_msp_files(@sso, $big_out_msp2,
$single_out_opt, $add_range)} ); ## concatenates all the hash ref to one
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) When NO File but ARRAY is given
# Here, you can have SSO files created
#__________________________________________
elsif(@array >=1){
print "\n# In sso_to_msp, \@array is given rather than \@file";
@sso=@{&open_sso_files(@array, "u=$upper_expect_limit", $add_range2,
"l=$lower_expect_limit", $add_range, $create_sso,
"m=$margin", $new_format)};
push(@final_out, @{&write_msp_files(@sso, $big_out_msp,
$single_out_opt, $add_range)} );
}
return(\@final_out);
}
#______________________________________________________________________________
# Title : convert_sso_to_msp
# Usage : &convert_sso_to_msp(@ARGV, $single_out_opt);
# Function : This takes sso file(s) and produces MSP file. It
# concatenate sso file contents when more than one
# sso file is given.
# Example : &convert_sso_to_msp(@ARGV, 'OUT.msp', $single_out_opt);
# Warning : This capitalize all the input file names when
# producing xxxxx.msp. xxxxx.sso -> XXXX.sso
# Keywords : sso_file_to_msp_file, convert_sso_to_msp,
# Options : _ for debugging.
# # for debugging.
# v for showing the MSP result to screen
# s for making single MSP file for each sso file
# as well as big MSP file which has all sso
# u= for upper expectation value limit
# l= for lower expect val limit
# s= for single file name input eg. "s=xxxxx.msp"
# n for new format (msp2 format)
# r for adding range
# r2 for adding ranges in all sequence names
#
# Returns : the file names created (xxxx.msp, yyyy.msp,,,,)
# Argument :
# Category :
# Version : 2.6
#-----------------------------------------------------------------------------
sub convert_sso_to_msp{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($upper_expect_limit, $lower_expect_limit)=(50, 0);
my (%sso, @sso, @SSO, $big_out_msp1, @final_out, $big_out_msp2,
$create_sso, $single_out_opt, $add_range, $add_range2, $big_out_msp,
$Evalue_thresh, $new_format, $Score_thresh, $margin, $single_file_name);
if($vars{'u'}=~/([\.\d]+)/){ $upper_expect_limit = $vars{'u'} };
if($vars{'l'}=~/([\.\d]+)/){ $lower_expect_limit = $vars{'l'} };
if($vars{'t'}=~/(\d+)/){ $Score_thresh = $vars{'t'} };
if($vars{'m'}=~/(\d+)/){ $margin = $vars{'m'} };
if($vars{'s'}=~/\S/){ $single_file_name = $vars{'s'} };
if($char_opt=~/r2/){ $add_range='r'; $add_range2='r2' }
if($char_opt=~/r/){ $add_range = 'r' }
if($char_opt=~/c/){ $create_sso = 'c' }
if($char_opt=~/s/){ $single_out_opt='s' }
if($char_opt=~/n/){ $new_format='n' }
print "\n# File given to convert_sso_to_msp is \"@file\", Normally xxx.sso file names\n";
if($single_file_name=~/\S/){
$big_out_msp=$single_file_name;
}else{
for($i=0; $i < @file; $i++){
if($file[$i]=~/\.msp$/){ ## when output file name is given
$big_out_msp=$file[$i];
splice(@file, $i, 1);
$i--;
}elsif($file[$i]=~/^(\d+\-\d+)([_\d]*)\.[mfs]?sso/){ ## creates xxxx.msp file name from xxxx.sso
$big_out_msp1="\U$1"."$2"."\.msp";
$big_out_msp2="\U$1".".msp";
}elsif($file[$i]=~/^(\S+)\.[mfs]?sso$/){
$big_out_msp1="\U$1"."\.msp";
$big_out_msp2="\U$1"."_all".".msp";
print "\n# convert_sso_to_msp: File matched xxxx.sso format \n";
}elsif($file[$i]=~/^(\S+)\.out$/){
$big_out_msp1="\U$1"."\.msp";
$big_out_msp2="\U$1"."_all".".msp";
print "\n# convert_sso_to_msp: File matched xxxx.out format \n";
}elsif($file[$i]=~/^(\S+)\.p[rot\,]*\.ts\.gz/){
$big_out_msp1="\U$1".".msp";
$big_out_msp2="\U$1"."_all".".msp";
}elsif($file[$i]=~/^(\S+)\.ts\.gz/){
$big_out_msp1="\U$1".".msp";
$big_out_msp2="\U$1"."_all".".msp";
}elsif($file[$i]=~/^(\S+)\.out\.gz/ or $file[$i]=~/^(\S+)\.[mfs]?sso\.gz/){
$big_out_msp1="\U$1".".msp";
$big_out_msp2="\U$1"."_all".".msp";
}
}
}
if(defined($big_out_msp)){
$big_out_msp1=$big_out_msp2=$big_out_msp;
print "\n# \$big_out_msp is defined as \'$big_out_msp\'\n";
}else{
print "\n# convert_sso_to_msp: You did not define the big MSP file out format, so $big_out_msp1 \n";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) When File was given to this sub routine
#__________________________________________
if(@file == 1){ ## ONE single file input??
print "# one file @file is given, OUT will be: $big_out_msp1 \n";
@sso=@{&open_sso_files(@file, $add_range, $add_range2,
"u=$upper_expect_limit",
"l=$lower_expect_limit",
"m=$margin",
$new_format,
"s=$big_out_msp")};
push(@final_out, &write_msp_files(@sso, $big_out_msp1,
$single_out_opt, $add_range) );
}elsif(@file > 1){ ## MOre than 1 file input??
@sso=@{&open_sso_files(@file, $add_range, $add_range2,
"l=$lower_expect_limit",
"u=$upper_expect_limit",
"m=$margin",
$new_format)};
push(@final_out, @{&write_msp_files(@sso, $big_out_msp2,
$single_out_opt, $add_range)} ); ## concatenates all the hash ref to one
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) When NO File but ARRAY is given
# Here, you can have SSO files created
#__________________________________________
elsif(@array >=1){
print "\n# In convert_sso_to_msp, \@array is given rather than \@file";
@sso=@{&open_sso_files(@array, "u=$upper_expect_limit", $add_range2,
"l=$lower_expect_limit", $add_range, $create_sso,
"m=$margin", $new_format)};
push(@final_out, @{&write_msp_files(@sso, $big_out_msp,
$single_out_opt, $add_range)} );
}
return(\@final_out);
}
#________________________________________________________________________________
# Title : bla_to_msf (this is not used. Use convert_bla_to_msf)
# Usage : @msf_file_made=@{&bla_to_msf(\@bla_file)};
# Function : matched each query seq name and if the E value is lower than
# my arbitrary threshold, I put the subject and target pair
# alignment into a hash.
# In later iterations, the latest is replaced
# Example :
# Keywords : convert_bla_to_msf
# Options :
# Author :
# Category :
# Version : 1.1
#--------------------------------------------------------------------------------
sub bla_to_msf{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($e_val_threshold)=0.0005;
my(@template_query_seq, @keys, %alignment_hash, %alignment_hash_query,
%alignment_hash_subject);
$choose_iteration=1;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Opening file
#______________________________________________
for($i=0; $i< @file; $i++){
$file_base_name=${&get_base_names($file[$i])};
open(BLAST_OUTPUT, $file[$i]);
while(<BLAST_OUTPUT>){
if(/^Query=(\S+)/){
$query_seq=$1; last;
}
}
close(BLAST_OUTPUT);
open(BLAST_OUTPUT, $file[$i]);
while(<BLAST_OUTPUT>){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Finds the query sequence, resets $start_point and next line
#____________________________________________
if(/^Searching\.\.\.\.\.\.\.\.\.\.\./){
$present_iteration++;
if($present_iteration > $choose_iteration){
last
}else{
%alignment_hash_subject=%alignment_hash_query=();
}
}elsif(/^\> *(\S+)/){
$subject_seq=$1;
$start_point='';
if($alignment_hash_subject{$subject_seq}){
$seq_already_in=1;
$subject_seq='';
next;
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# If $subject_seq defined, match the line to get expectation value
#________________________________________________________________
elsif($subject_seq
and /^[\t ]*Score[\t ]*\=[\t ]*(\S+)[\t ]*bits.+\,[\t ]*Expect[\t ]*=[\t ]*(\S+)/i){
$expect_value=$2;
unless($alignment_hash_subject{$subject_seq} or $expect_value > $e_val_threshold){
$alignment_hash_subject{"$subject_seq"}="$expect_value ";
$alignment_hash_query{"$subject_seq"}="$expect_value ";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# If $subject_seq defined, and expectation val is less than thresh, match Query seq line,
# 0.0005 = $e_val_threshold
#_____________________________________________________________________________________
elsif($subject_seq and $expect_value < $e_val_threshold and /Query +(\d+) +(\S+) +\d+/){
if($start_point){
$alignment_hash_query{"$subject_seq"}.=$2;
}else{ # If this is the first match of 'query', put dashes according to the start point
$start_point=$1;
$alignment_hash_query{"$subject_seq"}.="_"x($start_point-1).$2;
$alignment_hash_subject{"$subject_seq"}.="_"x($start_point-1);
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# If $subject_seq defined, and expectation val is less than thresh, match Sbjct seq line
#_____________________________________________________________________________________
elsif($subject_seq and $expect_value < $e_val_threshold and /Sbjct +\d+ +(\S+) +\d+/){
$alignment_hash_subject{"$subject_seq"}.=$1;
}
}
close(BLAST_OUTPUT);
# now in %alignment_hash, I have many pairs like:
# --------VAVCQNMGIGK--DGNLPWPPLRNEYKYFQR
# --------WARKNKLGWGFELKGSMPSAPLITEQTYFKD
# -----------------------KTWFSIPEKNRPLK
# -----------------------KTWEEIPALDKELK
$output_msf="$file_base_name\.msf";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~````
# sorting the keys and then the first column of hash value(which is evals) and then
# keys by keys names when the E vals are equal
# This way, I get:
# 0 : d8dfr__ query 7e-92 VRSLNSIVAVCQNMGIGKDGNLPWPPLRNEYKYFQRMTSTSHVEGKQNAVIM
# 1 : d8dfr__ sbjct 7e-92 VRSLNSIVAVCQNMGIGKDGNLPWPPLRNEYKYFQRMTSTSHVEGKQNAVIM
# 2 : nr_DYR_CHICK query 7e-92 VRSLNSIVAVCQNMGIGKDGNLPWPPLRNEYKYFQRMTSTSHVEGKQNAVIM
# 3 : nr_DYR_CHICK sbjct 7e-92 VRSLNSIVAVCQNMGIGKDGNLPWPPLRNEYKYFQRMTSTSHVEGKQNAVIM
# ...
#___________________________________________________________________
@keys= map{ $_->[1] }
sort { $a->[0] <=> $b->[0] }
map{ $alignment_hash_subject{$_}=~/^(\S+)/ or $_=~/^(\S+)/ ; [$1, $_] }
sort keys %alignment_hash_subject;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
# Making the final alignments by adjusting gaps in every pairwise step comparison
# keys are 'd8dfr__ query', 'd8dfr__ sbjct',,,
# values are '7e-92 VRSLNSIVAVCQ....', '7e-92 VRSLNSIVAVCQN....'
#________________________________________________
$template_query=$alignment_hash_query{$keys[0]};
if($alignment_hash_query{$keys[0]}=~/^\S+ +(\S+)/){ @template_query_seq=split(//, $1); }
print "\n", @template_query_seq, "\n" if $verbose;
for($j=0; $j < @keys; $j++){
my($k, $evalue, @gapped_position, $query_seq, $g);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# matching query entry and inserting gaps
#__________________________________________________
$query_name=$keys[$j];
if($alignment_hash_query{$query_name}=~/^(\S+) +(\S+)$/){
$evalue=$1;
$query_seq=$2; }
if($query_seq !~/\-/){ next }
my @splited_query_seq=split(//, $query_seq);
$longest_query_seq=@splited_query_seq if @splited_query_seq > $longest_query_seq;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# skip gaps at the beginning
#_____________________________________________________
if($splited_query_seq[0] eq '_'){
for($k=0; $k < @splited_query_seq; $k++){
if($splited_query_seq[$k] ne '_' and $splited_query_seq[$k] ne '-'){
last;
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Finding all the gapped position and saving them
#___________________________________________________
for( $s=$k-1; $s < @splited_query_seq; $s++){
if($splited_query_seq[$s] eq '-'){
push(@gapped_position, $s);
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) TEMPLATE: matching sbjct entry and inserting gaps
#__________________________________________________
if(@gapped_position < 1){ next }
for($g=0; $g< @gapped_position; $g++){
$char_posi=$template_query_seq[$gapped_position[$g]] ;
if($char_posi ne '-'){
splice(@template_query_seq, $gapped_position[$g], 0, '-');
}
}
print "\n# gaps are @gapped_position \n" if $verbose;
@gapped_position=();
next;
}
#print "\n ", @template_query_seq, "\n The raw subject lines are:\n";
if($verbose){
for($k=0; $k< @keys; $k++){
print $alignment_hash_subject{$keys[$k]}, "\n";
}
print "\n The raw QUERY lines\n";
for($k=0; $k< @keys; $k++){
print $alignment_hash_query{$keys[$k]}, "\n";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Fixing subject sequences according to template and query seqs
#____________________________________________________________________
for($g=0; $g < @keys; $g++){
$subject_name=$keys[$g];
my($evalue, @splited_subject_seq, @splited_query_seq);
$alignment_hash_query{ $subject_name}=~/^(\S+) +(\S+)/;
@splited_query_seq =split(//, $2);
$alignment_hash_subject{$subject_name}=~/^(\S+) +(\S+)/;
@splited_subject_seq=split(//, $2);
$evalue=$1;
for($t=0; $t< @template_query_seq; $t++){
if($template_query_seq[$t] ne '-' ){
next
}elsif($template_query_seq[$t] eq '-'){
$char_of_the_position=$splited_query_seq[$t];
if($char_of_the_position ne '-' and $char_of_the_position ne '_'){
#print "\n# \$t is $t";
#print "\n# \$evalue is $evalue\n ==>";
#print @splited_query_seq, "\n ==>";
#print @splited_subject_seq, "\n ==>";
splice(@splited_subject_seq, $t, 0, '-');
splice(@splited_query_seq, $t, 0, '-');
#print @splited_query_seq, "\n ==>";
#print @splited_subject_seq, "\n";
next;
}elsif($char_of_the_position eq '_'){
splice(@splited_subject_seq, 0, 0, '_');
splice(@splited_query_seq, 0, 0, '_');
}elsif($char_of_the_position eq '-'){
next;
}
}
}
$new_subject_seq=join('', @splited_subject_seq);
$new_query_seq =join('', @splited_query_seq);
#$alignment_hash{$keys[$g]}="$evalue $new_subject_seq";
#$alignment_hash{$keys[$g-1]}="$evalue $new_query_seq";
$alignment_hash_subject{$subject_name}="$evalue $new_subject_seq";
$alignment_hash_query{$subject_name} ="$evalue $new_query_seq";
}
print "\n";print @template_query_seq, "\n" if $verbose;
for($h=0; $h< @keys; $h++){
$subject_name=$keys[$h];
$alignment_hash_subject{$subject_name}=~/^(\S+) +(\S+)/;
#print "\n $alignment_hash_query{$subject_name}";
print "\n $alignment_hash_subject{$subject_name}";
$final_seq_out{$subject_name}=$2;
}
&write_msf(\%final_seq_out, \$output_msf);
push(@final_out, $output_msf);
}
return(\@final_out);
}
#________________________________________________________________________________
# Title : convert_bla_to_msf
# Usage : @msf_file_made=@{&convert_bla_to_msf(\@bla_file)};
# Function : matched each query seq name and if the E value is lower than
# my arbitrary threshold, I put the subject and target pair
# alignment into a hash.
# In later iterations, the latest is replaced
# Example :
# Keywords : convert_bla_to_msf
# Options :
# Author :
# Category :
# Version : 1.1
#--------------------------------------------------------------------------------
sub convert_bla_to_msf{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($e_val_threshold)=0.0005;
my(@template_query_seq, @keys, %alignment_hash, %alignment_hash_query,
%alignment_hash_subject);
$choose_iteration=1;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Opening file
#______________________________________________
for($i=0; $i< @file; $i++){
$file_base_name=${&get_base_names($file[$i])};
open(BLAST_OUTPUT, $file[$i]);
while(<BLAST_OUTPUT>){
if(/^Query=(\S+)/){
$query_seq=$1; last;
}
}
close(BLAST_OUTPUT);
open(BLAST_OUTPUT, $file[$i]);
while(<BLAST_OUTPUT>){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Finds the query sequence, resets $start_point and next line
#____________________________________________
if(/^Searching\.\.\.\.\.\.\.\.\.\.\./){
$present_iteration++;
if($present_iteration > $choose_iteration){
last
}else{
%alignment_hash_subject=%alignment_hash_query=();
}
}elsif(/^\> *(\S+)/){
$subject_seq=$1;
$start_point='';
if($alignment_hash_subject{$subject_seq}){
$seq_already_in=1;
$subject_seq='';
next;
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# If $subject_seq defined, match the line to get expectation value
#________________________________________________________________
elsif($subject_seq
and /^[\t ]*Score[\t ]*\=[\t ]*(\S+)[\t ]*bits.+\,[\t ]*Expect[\t ]*=[\t ]*(\S+)/i){
$expect_value=$2;
unless($alignment_hash_subject{$subject_seq} or $expect_value > $e_val_threshold){
$alignment_hash_subject{"$subject_seq"}="$expect_value ";
$alignment_hash_query{"$subject_seq"}="$expect_value ";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# If $subject_seq defined, and expectation val is less than thresh, match Query seq line,
# 0.0005 = $e_val_threshold
#_____________________________________________________________________________________
elsif($subject_seq and $expect_value < $e_val_threshold and /Query +(\d+) +(\S+) +\d+/){
if($start_point){
$alignment_hash_query{"$subject_seq"}.=$2;
}else{ # If this is the first match of 'query', put dashes according to the start point
$start_point=$1;
$alignment_hash_query{"$subject_seq"}.="_"x($start_point-1).$2;
$alignment_hash_subject{"$subject_seq"}.="_"x($start_point-1);
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# If $subject_seq defined, and expectation val is less than thresh, match Sbjct seq line
#_____________________________________________________________________________________
elsif($subject_seq and $expect_value < $e_val_threshold and /Sbjct +\d+ +(\S+) +\d+/){
$alignment_hash_subject{"$subject_seq"}.=$1;
}
}
close(BLAST_OUTPUT);
# now in %alignment_hash, I have many pairs like:
# --------VAVCQNMGIGK--DGNLPWPPLRNEYKYFQR
# --------WARKNKLGWGFELKGSMPSAPLITEQTYFKD
# -----------------------KTWFSIPEKNRPLK
# -----------------------KTWEEIPALDKELK
$output_msf="$file_base_name\.msf";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~````
# sorting the keys and then the first column of hash value(which is evals) and then
# keys by keys names when the E vals are equal
# This way, I get:
# 0 : d8dfr__ query 7e-92 VRSLNSIVAVCQNMGIGKDGNLPWPPLRNEYKYFQRMTSTSHVEGKQNAVIM
# 1 : d8dfr__ sbjct 7e-92 VRSLNSIVAVCQNMGIGKDGNLPWPPLRNEYKYFQRMTSTSHVEGKQNAVIM
# 2 : nr_DYR_CHICK query 7e-92 VRSLNSIVAVCQNMGIGKDGNLPWPPLRNEYKYFQRMTSTSHVEGKQNAVIM
# 3 : nr_DYR_CHICK sbjct 7e-92 VRSLNSIVAVCQNMGIGKDGNLPWPPLRNEYKYFQRMTSTSHVEGKQNAVIM
# ...
#___________________________________________________________________
@keys= map{ $_->[1] }
sort { $a->[0] <=> $b->[0] }
map{ $alignment_hash_subject{$_}=~/^(\S+)/ or $_=~/^(\S+)/ ; [$1, $_] }
sort keys %alignment_hash_subject;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
# Making the final alignments by adjusting gaps in every pairwise step comparison
# keys are 'd8dfr__ query', 'd8dfr__ sbjct',,,
# values are '7e-92 VRSLNSIVAVCQ....', '7e-92 VRSLNSIVAVCQN....'
#________________________________________________
$template_query=$alignment_hash_query{$keys[0]};
if($alignment_hash_query{$keys[0]}=~/^\S+ +(\S+)/){ @template_query_seq=split(//, $1); }
print "\n", @template_query_seq, "\n" if $verbose;
for($j=0; $j < @keys; $j++){
my($k, $evalue, @gapped_position, $query_seq, $g);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# matching query entry and inserting gaps
#__________________________________________________
$query_name=$keys[$j];
if($alignment_hash_query{$query_name}=~/^(\S+) +(\S+)$/){
$evalue=$1;
$query_seq=$2; }
if($query_seq !~/\-/){ next }
my @splited_query_seq=split(//, $query_seq);
$longest_query_seq=@splited_query_seq if @splited_query_seq > $longest_query_seq;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# skip gaps at the beginning
#_____________________________________________________
if($splited_query_seq[0] eq '_'){
for($k=0; $k < @splited_query_seq; $k++){
if($splited_query_seq[$k] ne '_' and $splited_query_seq[$k] ne '-'){
last;
}
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Finding all the gapped position and saving them
#___________________________________________________
for( $s=$k-1; $s < @splited_query_seq; $s++){
if($splited_query_seq[$s] eq '-'){
push(@gapped_position, $s);
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) TEMPLATE: matching sbjct entry and inserting gaps
#__________________________________________________
if(@gapped_position < 1){ next }
for($g=0; $g< @gapped_position; $g++){
$char_posi=$template_query_seq[$gapped_position[$g]] ;
if($char_posi ne '-'){
splice(@template_query_seq, $gapped_position[$g], 0, '-');
}
}
print "\n# gaps are @gapped_position \n" if $verbose;
@gapped_position=();
next;
}
#print "\n ", @template_query_seq, "\n The raw subject lines are:\n";
if($verbose){
for($k=0; $k< @keys; $k++){
print $alignment_hash_subject{$keys[$k]}, "\n";
}
print "\n The raw QUERY lines\n";
for($k=0; $k< @keys; $k++){
print $alignment_hash_query{$keys[$k]}, "\n";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Fixing subject sequences according to template and query seqs
#____________________________________________________________________
for($g=0; $g < @keys; $g++){
$subject_name=$keys[$g];
my($evalue, @splited_subject_seq, @splited_query_seq);
$alignment_hash_query{ $subject_name}=~/^(\S+) +(\S+)/;
@splited_query_seq =split(//, $2);
$alignment_hash_subject{$subject_name}=~/^(\S+) +(\S+)/;
@splited_subject_seq=split(//, $2);
$evalue=$1;
for($t=0; $t< @template_query_seq; $t++){
if($template_query_seq[$t] ne '-' ){
next
}elsif($template_query_seq[$t] eq '-'){
$char_of_the_position=$splited_query_seq[$t];
if($char_of_the_position ne '-' and $char_of_the_position ne '_'){
#print "\n# \$t is $t";
#print "\n# \$evalue is $evalue\n ==>";
#print @splited_query_seq, "\n ==>";
#print @splited_subject_seq, "\n ==>";
splice(@splited_subject_seq, $t, 0, '-');
splice(@splited_query_seq, $t, 0, '-');
#print @splited_query_seq, "\n ==>";
#print @splited_subject_seq, "\n";
next;
}elsif($char_of_the_position eq '_'){
splice(@splited_subject_seq, 0, 0, '_');
splice(@splited_query_seq, 0, 0, '_');
}elsif($char_of_the_position eq '-'){
next;
}
}
}
$new_subject_seq=join('', @splited_subject_seq);
$new_query_seq =join('', @splited_query_seq);
#$alignment_hash{$keys[$g]}="$evalue $new_subject_seq";
#$alignment_hash{$keys[$g-1]}="$evalue $new_query_seq";
$alignment_hash_subject{$subject_name}="$evalue $new_subject_seq";
$alignment_hash_query{$subject_name} ="$evalue $new_query_seq";
}
print "\n";print @template_query_seq, "\n" if $verbose;
for($h=0; $h< @keys; $h++){
$subject_name=$keys[$h];
$alignment_hash_subject{$subject_name}=~/^(\S+) +(\S+)/;
#print "\n $alignment_hash_query{$subject_name}";
print "\n $alignment_hash_subject{$subject_name}";
$final_seq_out{$subject_name}=$2;
}
&write_msf(\%final_seq_out, \$output_msf);
push(@final_out, $output_msf);
}
return(\@final_out);
}
#________________________________________________________________________________
# Title : convert_bla_to_msp
# Usage : %hash_out_final=%{&convert_bla_to_msp(\$file, [$Lean_output])};
# Function : reads in PSI blast output and produces MSP file format.
# Takes all the good hits below certain threshold in multiple iteration
# Reports the best evalue with a given sequence name
# Example : %hash_out=%{&convert_bla_to_msp(\$file)};
# Keywords : pbla_to_msp, blast_to_msp, bla_2_msp, blastp_to_msp_format,
# blast_to_msp_format, convert_bla_to_msp, convert_bla_to_msp_files
# bla_to_msp
# Options :
# $pdbd_seq_only d for getting dxxxx_ like seq names only(pdb40d names for examp)
# $all_seq a for forcing all seq conversion
# $which_iteration= by i= # choose which iteration result you want to take
# $which_iteration as just a digit
# $report_only_the_best=b by b -b
# $take_only_the_last_iteration=l by l
# $accumulative_hits_eval_thresh= by e=
# $genome_seq_only=g by g
# $nrdb_seq_only=n by n
# $evalue_thresh= by E=
# $Accumulate_matches=A by A -A
# $Lean_output=L by L -L # to remove search output to unclutter
#
# Author : Sarah Teichmann and Jong Park, jong@salt2.med.harvard.edu
# Version : 4.0
#--------------------------------------------------------------------------------
sub convert_bla_to_msp{
my($i, $j, $k, @lines, $match_string_count, $line_count, $query_string_count,
$match_length, $Lean_output,
$lines, $duplicated_match_count, $new_sorted_name, $sorted_name, $verbose,
$pdbd_seq_only, $entry_found, $which_iteration, $report_only_the_best,
$genome_seq_only, $all_seq, $header_found, $accumulative_hits_eval_thresh,
$take_only_the_last_iteration, $original_query, $nrdb_seq_only,
$get_the_final_iteration, $read_entry_lines, $verbose, $Accumulate_matches);
my $match_leng_thresh=10;
### This localization is critial NOT my, as I use a sub which relies on this
local(%hash_out, %accumulative_hits, $file, $score, $score_ori, $evalue,
$evalue_ori, $seq_id, $query_range_start, $query_range_stop,
$query, $match_string_start, $match_string_stop, $matched,
$read_point_found);
$duplicated_match_count=0;
my $evalue_thresh=$accumulative_hits_eval_thresh=1; ## default eval threshes
$query='query_seq'; ## default query seq name, to avoid blank name
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Processing the input arguments to get file and options etc
#_____________________________________________________________
for (@_){
if(ref $_ eq 'ARRAY'){ @lines =@{$_};
}elsif( ref $_ eq 'SCALAR' and -s ${$_} ){ $file=${$_};
}elsif( -s $_ ){ $file=$_;
}elsif(/^ *d *$/){ $pdbd_seq_only='d'; $all_seq=''; $genome_seq_only='';
print "\n $0: convert_bla_to_msp, You set \$pdbd_seq_only option, I will skip others.\n";
}elsif(/^ *[i=]*(\d+) *$/){ $which_iteration=$1;
}elsif(/^ *b *$/){ $report_only_the_best='b';
}elsif(/^ *a *$/){ $all_seq='a'; $genome_seq_only=''; $pdbd_seq_only=''; $nrdb_seq_only='';
}elsif(/^ *g *$/){ $genome_seq_only='g'; $all_seq=''; $pdbd_seq_only='';$nrdb_seq_only='';
}elsif(/^ *n *$/){ $nrdb_seq_only='n'; $all_seq=''; $pdbd_seq_only=''; $genome_seq_only='';
}elsif(/^ *l *$/){ $take_only_the_last_iteration='l';
}elsif(/^ *v *$/){ $verbose='v';
}elsif(/^ *L *$/){ $Lean_output='L';
}elsif(/e=(\S+)/){ $accumulative_hits_eval_thresh=$1;
}elsif(/E=(\S+)/){ $evalue_thresh=$1;
}elsif(/A$/){ $Accumulate_matches='A'; }
}
unless($which_iteration){ $get_the_final_iteration=1 }
print "\n\n# (W) convert_bla_to_msp: NO \$Accumulate_matches opt \'A\' set. Sure??\n\n", chr(7) if !$Accumulate_matches;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
# If the input file is gzipped, uncompress it to text file and then open
#__________________________________________________________________
if($file=~/\.gz *$/){
print "\n# (INFO) >>>>> Running gunzip to open $file";
open(BLA_FILE, "gunzip -c $file|") || die "\n# $0: Failed to open $file\n";
print "\n# (INFO) $0: bla_to_msp, input file was $file\n";
if($file=~/^([de]*\d\d*\w\w\w\w\w)\./){ $query=$1;
}
}else{
open(BLA_FILE, "$file") || die "\n# $0: Failed to open $file\n";
print "\n# (i) $0: convert_bla_to_msp, input file was $file\n";
if($file=~/^([de]*\d\d*\w\w\w\w\w)\./){ $query=$1;
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# UP to NOW is frivalous option handling stuff
#_______________________________________________________
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Main reading in .pbla file (or any extension)
#____________________________________________________________________________
while(<BLA_FILE>){
$line_count++; $lines=$_; ## putting $_ to $lines var
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Matching "BLASTP 2.0.4 [Feb-24-1998]"
#____________________________________________________________
if(!$header_found and $lines=~/^ *BLASTP +\d/){ # blastp header as BLASTP 2.0.2 [Sep-3-1997] or BLASTP 2.0.4 [Feb-24-1998]
$header_found=1;
print "\n# (1) \$header_found becomes $header_found\n";
}elsif($header_found and $lines=~/^ *BLASTP +\d/){ print "\n# (E) \$header_found is already SET and I found \"BLASTP\", ERROR!!!\n"; exit }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1.1) If I reach the end of the opened file, I use &put_msp_lines_to_hash_from_bla sub to write the final msp line and finish
#________________________________________________________________________________________________________________
if( eof ){
if( $read_point_found <= $which_iteration or $get_the_final_iteration){
print " # (i) <<<< The end of file reached, writing $sorted_name \n\n";
@out_from_put_msp_lines=@{&put_msp_lines_to_hash_from_bla(\%hash_out, \%accumulative_hits, $query,$matched,$evalue, $score, $seq_id,
$sorted_name, $query_range_start, $query_range_stop,$match_string_start,
$match_string_stop, $read_point_found, $accumulative_hits_eval_thresh,
$take_only_the_last_iteration, $accumulative_hits_eval_thresh, $evalue_thresh)};
%hash_out= %{$out_from_put_msp_lines[0]};
%accumulative_hits=%{$out_from_put_msp_lines[1]};
$read_point_found= $out_from_put_msp_lines[2];
last;
}
}
if($lines=~/^ *$/ or $lines=~/^ +Length +\= +\d+ *$/){ next } ## skipping some junk lines
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) Extracting query seq name(this is the only place to get it)
#____________________________________________________________
if($lines=~/^ *Query= +(\S+)/){ $query=$original_query=$1; next }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~
# (3) 'Searching......done' line indicates new search step(iteration)
#_________________________________________________________________________
if( $lines=~/^ *Searching\.\.+[done]/i ){
$read_point_found++;
if($verbose){
print "\n==========================================================";
print "\n# (3.1) Searching........ line found, so \$read_point_found is $read_point_found\n";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (3.2) This is just error checking
#__________________________________________________________
if(!$header_found){ ## header is 'BLASTP 2.0.4 [Feb-24-1998]', the very first line in bla output
print chr(7); print chr(7); print chr(7);print chr(7);print chr(7);
print "\n# (ERROR)\'Sequences producing\' line is matched while BLASTP header is not found\n";
print "\n# (ERROR) convert_bla_to_msp: It means the file is truncated !!!!\n"; exit;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# (3.3) Following is the KEY part for controlling iteration
#__________________________________________________________
if( $which_iteration and $read_point_found < $which_iteration){
print "\n# (INFO) skipped, \$which_iteration: ($which_iteration), \$read_point_found: ($read_point_found)" if $verbose;
$match_string_count=$query_string_count=$score=$evalue=$seq_id=$score_ori=$evalue_ori='';
$query_range_stop=$query_range_start=$match_string_stop=$msp_line=$new_sorted_name='';
$entry_found=$duplicated_match_count=0;
if( !$Accumulate_matches){
%hash_out=(); ## this is to remove any discarded pairs in the iteration
}
print "\n# (INFO) ===> New iteration ====\$read_point_found: $read_point_found, \$which_iteration:$which_iteration\n";
next;
}elsif( $which_iteration and $read_point_found == $which_iteration){
$read_entry_lines=1; next;
}elsif( $which_iteration and $read_point_found > $which_iteration){
@out_from_put_msp_lines=@{&put_msp_lines_to_hash_from_bla(\%hash_out, \%accumulative_hits, $query,$matched,$evalue, $score, $seq_id,
$sorted_name, $query_range_start, $query_range_stop,$match_string_start,
$match_string_stop, $read_point_found, $accumulative_hits_eval_thresh,
$take_only_the_last_iteration, $accumulative_hits_eval_thresh, $evalue_thresh)};
%hash_out= %{$out_from_put_msp_lines[0]};
%accumulative_hits=%{$out_from_put_msp_lines[1]};
$read_point_found= $out_from_put_msp_lines[2];
last;
}elsif(!$which_iteration){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# (3.4) Default situation
#____________________________________________________________
print "\n# (WARN) You did not set \$which_iteration option \n\n" if $verbose;
if($read_point_found > 1){
print "\n (3.3) Writing the last entry $sorted_name BEFORE next Searching........ line\n";
@out_from_put_msp_lines=@{&put_msp_lines_to_hash_from_bla(\%hash_out, \%accumulative_hits, $query,$matched,$evalue, $score, $seq_id,
$sorted_name, $query_range_start, $query_range_stop,$match_string_start,
$match_string_stop, $read_point_found, $accumulative_hits_eval_thresh,
$take_only_the_last_iteration, $accumulative_hits_eval_thresh, $evalue_thresh)};
%hash_out= %{$out_from_put_msp_lines[0]};
%accumulative_hits=%{$out_from_put_msp_lines[1]};
$read_point_found= $out_from_put_msp_lines[2];
}
$match_string_count=$query_string_count=$score=$evalue=$seq_id=$score_ori=$evalue_ori='';
$query_range_stop=$query_range_start=$match_string_stop=$msp_line=$new_sorted_name='';
$entry_found=$duplicated_match_count=0;
if( !$Accumulate_matches){ %hash_out=(); $entry_found=0; $duplicated_match_count=0; }
$read_entry_lines=1; ## this is set by 'Searching......' line
next;
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (4) '> xxxx ' New sequence entry, '>' starts
#__________________________________________________________
elsif($read_entry_lines and $lines=~/^\> *(\S+)/){
$temp_match=$1;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (4.0) To get the pdbd seq names only 'dxxx__' sort of thing
#________________________________________________________
if($pdbd_seq_only and ($temp_match !~/^pdb_\S+/ and $temp_match !~/^[cde]\d\S+/) ){
$entry_found=0; print "\n# NOT pdb seq\n"; next;
}elsif($genome_seq_only and $temp_match !~/^gn_\S+/){
$entry_found=0; print "\n# NOT genome seq\n"; next
}elsif($nrdb_seq_only and $temp_match !~/^nr_\S+/){
$entry_found=0; print "\n# NOT nrdb\n"; next
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
# (4.1) This is the DEFAULT
#_____________________________________________________________
}else{ ## This is default and equivalent to have the $all_seq option on.
$entry_found=1;
if($match_string_count){ ## $match_string_count is incremented only by 'Sbjct' line
print " (4.1) Writing $sorted_name ent to \%hash_out\n" if $verbose;
@out_from_put_msp_lines=@{&put_msp_lines_to_hash_from_bla(\%hash_out, \%accumulative_hits, $query,$matched,$evalue, $score, $seq_id,
$sorted_name, $query_range_start, $query_range_stop,$match_string_start,
$match_string_stop, $read_point_found, $accumulative_hits_eval_thresh,
$take_only_the_last_iteration, $accumulative_hits_eval_thresh, $evalue_thresh)};
%hash_out= %{$out_from_put_msp_lines[0]};
%accumulative_hits=%{$out_from_put_msp_lines[1]};
$read_point_found= $out_from_put_msp_lines[2];
$match_string_count=0;
$duplicated_match_count=0;
print " (4.1) reset \$match_string_count\n\n" if $verbose;
}else{ print " (4.1) \$match_string_count or \$duplicated_match_count is not right\n" if $verbose; }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Only with new seq entry, I count the pair occurrances
#__________________________________________________________________
$query=$original_query; $query_string_count='';
$matched=$temp_match; ## this should be here, after if
$sorted_name=join(' ', sort($query, $matched) );
print "\n# (4) >>> NEW \$sorted_name is $sorted_name <<------ \$entry_found ($entry_found)\n" if $verbose;
}
$match_string_count=0;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (5) Matching Score = 325 bits (824), Expect = 6e-89 << 2 >>
#_________________________________________________________________
elsif( ($entry_found and $lines=~/^[\t ]*Score[\t ]*\=[\t ]*(\S+)[\t ]*bits +\(\S+\)\,[\t ]*Expect[\t ]*=[\t ]*(\S+)/i)
or ($entry_found and $lines=~/^[\t ]*Score[\t ]*\=[\t ]*(\S+)[\t ]*bits.+\,[\t ]*Expect[\t ]*=[\t ]*(\S+)/i)){
$score_ori=$1;
$evalue_ori=$2;
if($evalue_ori=~/^e\-\d\d\d/){ $evalue_ori="1".$evalue_ori; } ## bug fix for short eval in blast distribution
if($match_string_count){ # $match_string_count is increased when Sbjct word is found
if($evalue > $evalue_thresh){ $evalue=$evalue_ori; $score=$score_ori; next }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# When Only the first match(best evalue) is required, write msp line and reset $entry_found var
#_________________________________________________________________________________________________
if($report_only_the_best){
print " (5) \$report_only_the_best is set\n" if $verbose;
@out_from_put_msp_lines=@{&put_msp_lines_to_hash_from_bla(\%hash_out, \%accumulative_hits, $query,$matched,$evalue, $score, $seq_id,
$sorted_name, $query_range_start, $query_range_stop,$match_string_start,
$match_string_stop, $read_point_found, $accumulative_hits_eval_thresh,
$take_only_the_last_iteration, $accumulative_hits_eval_thresh, $evalue_thresh)};
%hash_out= %{$out_from_put_msp_lines[0]};
%accumulative_hits=%{$out_from_put_msp_lines[1]};
$read_point_found= $out_from_put_msp_lines[2];
$entry_found=0; next;
}else{
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~`
# duplicated match count means, query matched more than one region of a match seq
#__________________________________________________________________________________
$duplicated_match_count++;
$sorted_name="$sorted_name $duplicated_match_count";
print " ====(5) Multiple region for \"$new_sorted_name\" is found =========== e= $evalue\n" if $verbose;
@out_from_put_msp_lines=@{&put_msp_lines_to_hash_from_bla(\%hash_out, \%accumulative_hits, $query,$matched,$evalue, $score, $seq_id,
$sorted_name, $query_range_start, $query_range_stop,$match_string_start,
$match_string_stop, $read_point_found, $accumulative_hits_eval_thresh,
$take_only_the_last_iteration, $accumulative_hits_eval_thresh, $evalue_thresh)};
%hash_out= %{$out_from_put_msp_lines[0]};
%accumulative_hits=%{$out_from_put_msp_lines[1]};
$read_point_found= $out_from_put_msp_lines[2];
}
$score=$score_ori; $evalue=$evalue_ori;
}else{
print " (5) \$match_string_count is not set NO write \$evalue_ori $evalue_ori\n" if $verbose;
$evalue=$evalue_ori; $score=$score_ori;
} ## to next line
sub reset_all_the_vars{
print " !!!! Reseting all the vars !!!!\n" if $verbose;
$query_string_count=$score=$evalue=$seq_id=$query_range_stop=$query_range_start='';
$match_string_stop=$msp_line=$new_sorted_name=$match_string_count='';
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# (6) Matching Identities = 158/158 (100%), Positives = 158/158 (100%) ,
#____________________________________________________________________________________
elsif( $entry_found and $lines=~/^ *Identities += +\S+\/(\S+) +\( *(\S+) *\%\)/i){
$query_string_count=$match_string_count=0;
$seq_id=$2/100;
$match_length=$1;
if($match_length < $match_leng_thresh){
print " (6) \$match_leng_thresh $match_leng_thresh > \$match_length $match_length" if $verbose;
$entry_found=0;
$match_string_count=1;
next;
}else{
print " (6) $sorted_name : ABOVE leng thresh. \$seq_id= $seq_id, \$match_length= $match_length\n" if $verbose;
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (7) Matching 'Query: 2 GIRAATSQEINELT..' line ,
#_________________________________________________________________
elsif($entry_found and $lines=~/^ *Query\:?[\t ]+(\d+) +\D+ +(\d+)/){
$query_string_count++;
$query_line_found=1;
if($query_string_count==1){ $query_range_start=$1; $query_range_stop =$2;
}elsif($query_string_count > 1){ $query_range_stop=$2; }
print " (7) Query: line found: $query\_$query_range_start\-$query_range_stop\n" if $verbose;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (8) Matching 'Sbjct: 2 GIRAATSQEINELT..' line
#_________________________________________________________________
elsif($entry_found and $query_line_found and $lines=~/^ *Sbjct\:? +(\d+) +[\w\-]+ +(\d+)/i){
$match_string_count++;
$subject_line_found=1;
if($match_string_count==1){ $match_string_start=$1;
$match_string_stop =$2;
}elsif($match_string_count > 1){ $match_string_stop=$2; }
print " (8) Sbjct: line found: $temp_match\_$match_string_start\-$match_string_stop\n" if $verbose;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (9) Matching ' Database: ' line , << END >>
#_________________________________________________________________
elsif( ($entry_found and $lines=~/^ +Database: +\S+/) or eof){ # the very last write
if($evalue > $evalue_thresh){ last
}else{
print " <<<< The end of file reached, writing $sorted_name\n" if $verbose;
@out_from_put_msp_lines=@{&put_msp_lines_to_hash_from_bla(\%hash_out, \%accumulative_hits, $query,$matched,$evalue, $score, $seq_id,
$sorted_name, $query_range_start, $query_range_stop,$match_string_start,
$match_string_stop, $read_point_found, $accumulative_hits_eval_thresh,
$take_only_the_last_iteration, $accumulative_hits_eval_thresh, $evalue_thresh)};
%hash_out= %{$out_from_put_msp_lines[0]};
%accumulative_hits=%{$out_from_put_msp_lines[1]};
$read_point_found= $out_from_put_msp_lines[2];
last;
}
}
}
close(BLA_FILE);
unless( $take_only_the_last_iteration){
print "\n# >> ACCUMULATIVE HITS are reported as you did not set \$take_only_the_last_iteration opt!!\n";
%hash_out=(%hash_out, %accumulative_hits);
}
print "\n Congratulations!\n";
print ">>>>>>>---- Finished reading in xxxx.pbla file Now sorting \%hash_out ----<<<<<<<<\n\n";
&show_hash(\%hash_out) if $verbose;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~```
# CLeaning up the BLA file if $Lean_output is set
#_____________________________________________________
$gzipped_search_file="$file\.gz";
if($Lean_output ){ ## If Lean_out opt is set and $file exists and %hash_out is not empty, remove $file
if(-s $file){
unlink($file); ## removes fam_8_8.pbla etc,
}elsif(-s $gzipped_search_file){
unlink($gzipped_search_file); ## removes fam_8_8.pbla.gz etc,
}else{
print "\n# (E) convert_bla_to_msp: tried to remove search out file for \$Lean_output opt,
but failed. Something is wrong. Think! or report to jong\@salt2.med.harvard.edu,
jong\@mrc-lmb.cam.ac.uk, sat\@mrc-lmb.cam.ac.uk, jong_p\@hotmail.com\n";
exit;
}
}
return(\%hash_out);
}
#______________________________________________________________________________
# Title : put_msp_lines_to_hash_from_bla
# Usage : @out_from_put_msp_lines=@{&put_msp_lines_to_hash_from_bla(\%hash_out,
# $query,$matched,$evalue, $score, $seq_id,
# $sorted_name, $query_range_start,
# $query_range_stop,$match_string_start,
# $match_string_stop, $read_point_found,
# $accumulative_hits_eval_thresh,
# $take_only_the_last_iteration)};
# Function :
# Example :
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.2
#------------------------------------------------------------------------------
sub put_msp_lines_to_hash_from_bla{
my (@finale_out, $sorted_name, $msp_line, $evalue, $score, $matched, $seq_id, $query_range_start,$accumulative_hits_eval_thresh,
$query_range_stop, $query, $match_string_start, $match_string_stop, $read_point_found, %hash_out, %accumulative_hits, $evalue_thresh);
%hash_out=%{$_[0]}; %accumulative_hits=%{$_[1]};
$query=$_[2]; $matched=$_[3];
$evalue=$_[4]; $score=$_[5];
$seq_id=$_[6]; $sorted_name=$_[7];
$query_range_start=$_[8]; $query_range_stop =$_[9];
$match_string_start=$_[10]; $match_string_stop=$_[11];
$read_point_found=$_[12]; $accumulative_hits_eval_thresh=$_[13];
$take_only_the_last_iteration=$_[14];
$accumulative_hits_eval_thresh=$_[15];
$evalue_thresh=$_[16];
$query ="$query\_$query_range_start\-$query_range_stop";
if($matched !~/^\S+\_\d+\-\d+ *$/){ $matched="$matched\_$match_string_start\-$match_string_stop";
}elsif($matched =~/^(\S+)\_\d+\-\d+ *$/){ $matched="$1\_$match_string_start\-$match_string_stop"; }
if($score=~/\S/ and $evalue=~/\S/ and $match_string_start=~/\S/ and $evalue_thresh > $evalue){
$msp_line=sprintf("%-6s %-9s %-5s %-5s %-5s %-32s %-5s %-5s %-38s %-3s\n",
$score, $evalue, $seq_id, $query_range_start, $query_range_stop,
$query, $match_string_start, $match_string_stop, $matched, $read_point_found);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This is where I really put the matches !!!
#_____________________________________________________
if($hash_out{$sorted_name}=~/^\S+ +(\S+) +/){
if($1 >= $evalue){
print " (1) put_msp_lines_to_hash_from_bla: $1 >= $evalue WRITING to hash. 1\n" if $verbose;
$hash_out{$sorted_name}=$msp_line;
}else{
print " put_msp_lines_to_hash_from_bla: $1 < $evalue_ori NO write to hash\n" if $verbose; }
}else{
print " (2) put_msp_lines_to_hash_from_bla: NO eval >= $evalue WRITING to hash. 2\n" if $verbose;
$hash_out{$sorted_name}=$msp_line;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This part is to rescue the hits dropped by matrix migration
#_________________________________________________________________
if(!$take_only_the_last_iteration and $evalue <= $accumulative_hits_eval_thresh ){
if($accumulative_hits{$sorted_name}){
if($accumulative_hits{$sorted_name}=~/^[\t ]*\S+[\t ]+(\S+)[\t ]/){
if($evalue < $1){
$accumulative_hits{$sorted_name}=$msp_line; } }
}else{ $accumulative_hits{$sorted_name}=$msp_line; }
}
}else{ }
@finale_out=(\%hash_out, \%accumulative_hits, $read_point_found, $query, $matched, $evalue, $score, $seq_id, $sorted_name,
$query_range_start, $query_range_stop, $match_string_start, $match_string_stop );
return(\@finale_out);
}
#________________________________________________________________________________
# Title : convert_bla_multaln_to_msf
# Usage : @msf_file_made=@{&convert_bla_multaln_to_msf(\@bla_file, [i=2])};
# Function : matched each query seq name and if the E value is lower than
# my arbitrary threshold, I put the subject and target pair
# alignment into a hash.
# In later iterations, the latest is replaced,
# when you use m6 option for PSI blast
# this adds '00x' extensions to the repeatedly occurring seq names
#
# Example : @msf_file_made=@{&convert_bla_multaln_to_msf(\@bla_file,
# $verbose, "i=$iteration")};
# Keywords : psi_blast_to_msf, psi_blast_multaln_to_msf
# Options :
# i=$iteration
# v for verbose
# Author :
# Category :
# Version : 1.6
#--------------------------------------------------------------------------------
sub convert_bla_multaln_to_msf{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($e_val_threshold)=0.0005;
my(@template_query_seq, @keys, $present_iteration, $blank_line_counter,
%alignment_hash_subject, $seq_order, $choose_iteration, %final_output_hash,
$seq_name, %seq_names_in_block, $put_alphabet_to_number_only_name);
$choose_iteration=1;
if($vars{'i'}=~/(\d+)/){
$choose_iteration=$1;
}
if($char_opt=~/v/){ $verbose='v' }
if($char_opt=~/a/){ $put_alphabet_to_number_only_name='a' }
print "\n# $0: bla_multaln_to_msf, \$choose_iteration is $choose_iteration\n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Opening file
#______________________________________________
for($i=0; $i< @file; $i++){
$file_base_name=${&get_base_names($file[$i])};
print "\n# bla_multaln_to_msf: processing $file[$i]\n";
my($present_iteration, %seq_names_in_block, $seq_name_ori, $sequence);
open(BLAST_OUTPUT, $file[$i]);
while(<BLAST_OUTPUT>){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Finds the query sequence, resets $start_point and next line
#____________________________________________
if(/^Query= *(\S+)/){
$query_seq=$1;
print "\n# The query sequence is: $query_seq\n";
}elsif(/^Searching\.\.\.\.\.\.\.\./ or eof){ ### to make sure it gets the last one
$present_iteration++;
if($present_iteration > $choose_iteration){
%final_output_hash=%alignment_hash_subject;
last;
}else{
%final_output_hash=%alignment_hash_subject;
%alignment_hash_subject=();
%seq_names_in_block=();
$seq_order='';
}
}elsif(/^(QUERY) +\d* *(\S\S+) *\d*$/){
%seq_names_in_block=();
$seq_name=$1;
$seq_order=$seq_name;
$seq_names_in_block{$seq_name}++;
$alignment_hash_subject{$seq_name} .= $2; ## bug fix '.'
}elsif(/^(\S+) +\d+ +(\S+) +\d+ *$/){
$seq_name=$seq_name_ori=$1;
$sequence=$2;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# This is to replace number only seq names to alphabetical ones
#_____________________________________________________________
if($put_alphabet_to_number_only_name and $seq_name=~/^\d+$/){
$seq_name='T'.$seq_name;
$seq_name_ori='T'.$seq_name_ori;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This is to handle multiple identical seq name entry
#_____________________________________________________
if($seq_names_in_block{$seq_name} > 0){
$seq_name=$seq_name."00$seq_names_in_block{$seq_name}";
}
$seq_order.=" $seq_name";
$alignment_hash_subject{$seq_name}.=$sequence;
$seq_names_in_block{$seq_name_ori}++; ## NOTE that it is $seq_name_ori not $seq_name
}elsif(/^(\S+) +(\-+) *$/){
$seq_name=$seq_name_ori=$1;
$sequence=$2;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# This is to replace number only seq names to alphabetical ones
#_____________________________________________________________
if($put_alphabet_to_number_only_name and $seq_name=~/^\d+$/){
$seq_name='T'.$seq_name;
$seq_name_ori='T'.$seq_name_ori;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This is to handle multiple identical seq name entry
#_____________________________________________________
if($seq_names_in_block{$seq_name} > 0){
$seq_name=$seq_name."00$seq_names_in_block{$seq_name}";
}
$seq_order.=" $seq_name";
$alignment_hash_subject{$seq_name}.=$sequence;
$seq_names_in_block{$seq_name_ori}++; ## NOTE that it is $seq_name_ori not $seq_name
}
}
close(BLAST_OUTPUT);
print "\n# finished reading in BLAST output(@file) \n";
# now in %alignment_hash, I have many pairs like:
# --------VAVCQNMGIGK--DGNLPWPPLRNEYKYFQR
# --------WARKNKLGWGFELKGSMPSAPLITEQTYFKD
# -----------------------KTWFSIPEKNRPLK
# -----------------------KTWEEIPALDKELK
print "\n# the seq order is: $seq_order\n" if $verbose;
$output_msf="$file_base_name\.msf";
print "\n# $0: running write_msf subroutine\n";
&write_msf(\%final_output_hash, \$output_msf, "o=$seq_order");
print "\n# $0: $output_msf is created\n";
push(@final_out, $output_msf);
}
return(\@final_out);
}
#______________________________________________________________
# Title : get_sub_hash
# Usage : %sub_hash=%{&get_sub_hash(\%FASTA, \@list)};
# Function : fetches hash keys and values by giving keys to
# a hash
# Example :
# Warning : You MUST NOT delete '# options : ..' entry
# as it is read by various subroutines.
# Keywords : subhash, sub_hash, get_hash_elements, fetch_sub_hash
# take_sub_hash, get_hash_by_keys, get_sub_hash_by_keys
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------
sub get_sub_hash{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my (%out_hash, @out_hash_all_ref);
for($i=0; $i < @hash; $i++){
my %hash=%{$hash[$i]};
my @keys = keys %hash;
for($j=0; $j < @raw_string; $j++){
for($l=0; $l < @keys; $l++){
if($keys[$l] eq $raw_string[$j]){
$out_hash{$keys[$l]}=$hash{$keys[$l]};
}
}
}
push(@out_hash_all_ref, \%out_hash);
}
if(@out_hash_all_ref > 1){
return(@out_hash_all_ref);
}else{
return($out_hash_all_ref[0]);
}
}
#______________________________________________________________
# Title : get_smallest_file
# Usage : $smallest_file_name=${&get_largest_file(@ARGV)};
# Function : checks the size of files and returns the smallest
# one's name. If a file is not present in pwd or
# specified absolute path, it ignores it.
# Example :
# Keywords : choose_smallest_file, smallest_file, find_smallest_file
# get_the_smallest_file, choose_the_smallest_file,
# fetch_smallest_file, take_smallest_file, get_smaller_file,
# Options : _ for debugging.
# # for debugging.
# e for extract the smallest from the input array
# leaving it one element less, in this case
# there will be two returning refs.
# Category :
# Version : 1.3
#--------------------------------------------------------------
sub get_smallest_file{
my @in;
if(ref $_[0] eq 'ARRAY'){
@in = @{$_[0]};
}else{
@in = @_;
}
my $smallest=10000000000;
my ($smallest_file, $i, $extract_opt);
for($i=0; $i< @in; $i++){
if(($in[$i]=~/^\-?e$/i)&&(!(-f $in[$i])) ){
$extract_opt=1;
splice(@in, $i, 1);
$i--;
}
}
for($i=0; $i< @in; $i++){
my $size=(-s $in[$i]);
if($size < $smallest){
$smallest=$size;
if($extract_opt ==1){
print "\$extract_opt is $extract_opt \n";
push(@in, $smallest_file) if defined($smallest_file);
$smallest_file = splice(@in, $i, 1);
print "\n $smallest_file \n";
$i--;
}else{
$smallest_file=$in[$i];
}
}
}
if($extract_opt==1){
return(\$smallest_file, \@in);
}else{ return(\$smallest_file); }
}
#______________________________________________________________
# Title : get_largest_file
# Usage : $largest_file_name=${&get_largest_file(@ARGV)};
# Function : checks the size of files and returns the largest
# one's name. If a file is not present in pwd or
# specified absolute path, it ignores it.
# Example :
# Keywords : choose_largest_file, largest_file, find_largest_file
# get_the_largest_file, choose_the_largest_file, get_biggest_file
# fetch_largest_file, take_largest_file, get_bigger_file, get_larger_file
# Options : _ for debugging.
# # for debugging.
# e for extract the largest from the input array
# leaving it one element less, in this case
# there will be two returning refs.
# Category :
# Version : 1.4
#--------------------------------------------------------------
sub get_largest_file{
my @in;
if(ref $_[0] eq 'ARRAY'){
@in = @{$_[0]};
}else{
@in = @_;
}
my ($largest_file, $largest, $i, $extract_opt);
for($i=0; $i< @in; $i++){
if(($in[$i]=~/^\-?e$/i)&&(!(-f $in[$i])) ){
$extract_opt=1;
splice(@in, $i, 1);
$i--;
}
}
for($i=0; $i< @in; $i++){
my $size=(-s $in[$i]);
if($size > $largest){
$largest=$size;
if($extract_opt ==1){
print "\$extract_opt is $extract_opt \n";
push(@in, $largest_file) if defined($largest_file);
$largest_file = splice(@in, $i, 1);
print "\n $largest_file \n";
$i--;
}else{
$largest_file=$in[$i];
}
}
}
if($extract_opt==1){
return(\$largest_file, \@in);
}else{ return(\$largest_file); }
}
#______________________________________________________________
# Title : get_sequence_complexity
# Usage : print "\n", ${&get_sequence_complexity(\$seq)};
# Function : caculates the single sequence's sequence complexity
# If the seq given is larger than 20, it divides it into
# frags of 20 aa and gets the average of it.
# Example : ${&get_sequence_complexity(\$seq)};
# while $seq='TTTTTACDEFGHIKLMNPQRSTVWYAAAAACCCADFADFA'
# Warning :
# Keywords : sequence_complexity, calc_sequence_complexity,
# calc_seq_complexity, get_seq_complexity, seg
# Options : _ for debugging.
# # for debugging.
# 'w=' for window size as the first arg
# Returns : Ref. for a scalar digit.
# Argument : ref. of string.
# Category :
# Version : 1.3
#--------------------------------------------------------------
sub get_sequence_complexity{
my ($complexity, @seq,$i, $j, @frag);
my $win=20;
if(ref($_[0]) eq 'ARRAY'){
@seq=@{$_[0]};
}else{
$seq=${$_[0]} || $_[0];
@seq=split(//, $seq);
}
if(defined($_[1])){ $win=${$_[1]} || $_[1]; }
if(@seq <= $win){
my (%seq, @keys);
for($i=0; $i< @seq; $i++){
$seq{$seq[$i]}++;
}
@keys= keys %seq;
$complexity=@keys/@seq;
}else{
my @frag=@{÷_array(\@seq, "s=$win")};
my @complexity=();
for($i=0; $i < @frag; $i++){
my (%seq, @keys);
my @arr=@{$frag[$i]};
for($j=0; $j< @arr; $j++){
$seq{$arr[$j]}++;
}
@keys=keys %seq;
push(@complexity, @keys/$win);
}
$complexity=${&array_average(\@complexity)};
}
return(\$complexity);
}
#______________________________________________________________________________
# Title : calc_factorial
# Usage :
# Function :
# Example :
# Keywords : calculate_factorial, get_factorial
# Options :
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub calc_factorial{
my ($number, $factorial, $i);
$factorial=1;
if(ref($_[0]) eq 'SCALAR'){
$number=${$_[0]};
}else{
$number=$_[0];
}
for($i=1; $i <=$number; $i++){
$factorial=$factorial*$i;
}
return(\$factorial);
}
#______________________________________________________________
# Title : make_swiss_index
# Usage :
# Function :
# Example :
# Warning :
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------
sub make_swiss_index{
my ($swiss, %index);
if(-e "$ENV{'SWDIR'}seq.dat" ){
$swiss="$ENV{'SWDIR'}seq.dat";
}elsif( -f "$ENV{'SWISS'}seq.dat" ){
$swiss="$ENV{'SWISS'}seq.dat";
}elsif( -e 'seq.dat'){
$swiss="seq.dat";
}elsif( -f "$ENV{'swiss'}seq.dat"){
$swiss="$ENV{'swiss'}seq.dat";
}else{
ASK: print "\n Where is your swissprot seq.dat file?\n";
$swiss=<>;
chomp($swiss);
if(-e "$swiss"){
goto OPEN;
}else{
goto ASK;
}
}
OPEN: open(DB, "$swiss");
while(<DB>){
if(/^ID[\t ]+(\w+) +/){
$index{$1}=tell(DB);
print "\n$1 $index{$1}";
}
}
}
#_____________________________________________________________________________
# Title : fetch_sequence_from_db
# Usage : %sequence=%{&fetch_sequence_from_db($input_file, \@string)};
# Function : accept seq names (with or without ranges like _10-111 )
# and produces hash ref.
# As an option, you can write(xxxx.fa) the sequences in pwd
# with the file names with sequence names.
# The default database used is FASTA format OWL database.
# You can change this by S (for Swissprot either fasta
# or full format), P for PDB fasta format data.
# If you give the path name of DB, it will look for the
# DB given.
#
# This automatically checks sequence family number as
# in >d1bpi___7.6.1
# and attaches the number in final %sequence output
#
# Example : %seq=%{&fetch_sequence_from_db(\@input, seq.fa, seq.fa.idx)};
# while @input=qw( 11S3_HELAN_11-31 A1AB_CANFA A1AT_PIG )
# Keywords : fetch_seq_from_db, fetch_sequence_from_database
# Options : _ or # for debugging.
# w for write fasta file
# s= for putting source DB file name manually
# d=p100 for PDB100 fasta database from ENV
# d=p40 for PDB40 fasta database from ENV
# d=p for PDB database (usually p100) from ENV
# d=s for Swissprot database from ENV
# d=o for OWL database from ENV
# i= for index filename. If not specified, this looks for it in the same dir as fast
# t= for msp_threshold
# msp_threshold=0.0005 # when MSP file is given as input for getting seq names
#
# Returns : ref of hash
# Argument : gets names of sequences
# eg) \@array, \%hash, \$seq, while @array=(seq1, seq2), $seq='seq1 seq1'
# %hash=(seq1, xxxx, seq2, yyyy);
#
# Version : 3.6
#------------------------------------------------------------------------------
sub fetch_sequence_from_db{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(@DATABASE, @INDEX_FILE, %sequence, %seq_with_index, @input_seq_names,
%long_index, @Keys, $R_start, $NAME, $R_leng, $found_seq_count,
$eval_blastpgp_thresh, $seq_found1, $sequence, @keys, $index_file,
$source_DB_file, $matching_seq, $match_start, $match_stop);
$eval_blastpgp_thresh=0.001; # default
if($vars{'msp_threshold'}=~/(\S+)/ or $vars{'mt'}=~/(\S+)/ or $vars{'t'}=~/(\S+)/ ){
$eval_blastpgp_thresh=$1;
print "\n# (INFO) YOU have set the \$eval_blastpgp_thresh $eval_blastpgp_thresh\n\n";
}
if($vars{'s'}=~/(\S+)/ or $vars{'DB'}=~/(\S+)/ ){
$source_DB_file=$1; push(@DATABASE, $source_DB_file);
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# getting input seq names from all sources
#________________________________________________________
for(0..@hash){ # from the given hash (keys names)
push(@input_seq_names, keys %{$hash[$_]} );
}
for(0..@raw_string){ ## from given sequence names
push(@input_seq_names, split(/ +/, $raw_string[$_]) );
}
for($i=0; $i<@file; $i++){ ## From MSP file input (get only MATCHED sequences)
if($file[$i]=~/\.msp/){
print "\n# (INFO) MSP file input is detected !\n";
my ($seq_with_range);
open(MSP, $file[$i]);
while(<MSP>){
if(/^ *\S+ +(\S+) +\S+ +\S+ +\S+ +\S+ +\S+ +\S+ +(nr_\S+_DROME_\S+) +/){
push(@input_seq_names, $2) if $1 < $eval_blastpgp_thresh;
}elsif(/^[\t ]*\S+[\t ]+(\S+)[\t ]+\S*[\t ]*(\d+)[\t ]+(\d+)[\t ]+\S+[\t ]+(\d+)[\t ]+(\d+)[\t ]+(\S+) */){
$matching_seq=$6; $match_start=$4; $match_stop=$5;
$evalue=$1;
if($matching_seq=~/^(\S+)_\d+\-\d+/){
$seq_with_range=$matching_seq;
}else{
$seq_with_range="$matching_seq\_$match_start\-$match_stop";
}
push(@input_seq_names, $seq_with_range) if $evalue < $eval_blastpgp_thresh;
print "\n# (INFO) pushing $seq_with_range" if $verbose;
}
}
close(MSP);
splice(@file, $i, 1);
$i--;
}
}
print "\n# (1) fetch_sequence_from_db: \@raw_string has: ", scalar(@raw_string), " elements";
print "\n# (2) fetch_sequence_from_db: No. of seq to fetch is:",scalar(@input_seq_names);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Choose the DBs and INDEX for fetching sequences. All files input must be DATABAE or INDEXfile
#___________________________________________
if(@file > 0){ print "\n# (i) GOOD! Input \@file has \"@file\"";
for($i=0; $i< @file; $i++){
if(-T $file[$i] and $file[$i]=~/\.fa[sta]?$/){ push(@DATABASE, $file[$i]);
if(-T "$file[$i]\.idx"){ push(@INDEX_FILE, "$file[$i]\.idx"); next; }
}elsif((-T $file[$i]) and ($file[$i]=~/\.seq$/)){ push(@DATABASE, $file[$i]);
}elsif((-T $file[$i]) and ($file[$i]=~/\.dat$/)){ push(@DATABASE, $file[$i]);
}elsif(-T $file[$i] and $file[$i]=~/(\S+)\.idx$/){ push(@INDEX_FILE, $file[$i]);
push(@DATABASE, $1) if -s $1; next;
}else{
print "\n# WARN: fetch_sequence_from_db:
You put a file-name-like which is not a fasta DB. Error. I am removing $file[$i]";
splice(@file, $i, 1);
$i--;
}
}
}else{
print "\n# (E)fetch_sequence_from_db: \@file is <= 0, ERROR\n"; exit;
}
if($vars{'d'}=~/^p[100]*$/){
if( -T $ENV{'PDB_FASTA'} ){ push(@DATABASE, $ENV{'PDB_FASTA'} ); }
elsif( -T $ENV{'PDB_SEQ_FASTA'} ){ push(@DATABASE, $ENV{'PDB_SEQ_FASTA'} ); }
elsif( -T $ENV{'PDB100_FASTA'} ){ push(@DATABASE, $ENV{'PDB100_FASTA'} ); }
if( -T $ENV{'PDB_FASTA_INDEX'} ){ push(@INDEX_FILE, $ENV{'PDB_FASTA_INDEX'} ); }
}elsif( $vars{'d'}=~/^p\d+d$/ ){
if( -T $ENV{'PDB100D_FASTA'} ){ push(@DATABASE, $ENV{'PDB100D_FASTA'}); }
elsif( -T $ENV{'PDBD100_FASTA'} ){ push(@DATABASE, $ENV{'PDBD100_FASTA'}); }
elsif( -T $ENV{'PDB100D_SEQ_FASTA'} ){ push(@DATABASE, $ENV{'PDB100D_SEQ_FASTA'}); }
elsif( -T $ENV{'PDBD100_SEQ_FASTA'} ){ push(@DATABASE, $ENV{'PDBD100_SEQ_FASTA'}); }
if( -T $ENV{'PDB100D_SEQ_FASTA_INDEX'} ){ push(@INDEX_FILE, $ENV{'PDB100D_SEQ_FASTA_INDEX'}); }
elsif( -T $ENV{'PDBD100_SEQ_FASTA_INDEX'} ){ push(@INDEX_FILE, $ENV{'PDBD100_SEQ_FASTA_INDEX'}); }
}elsif( $vars{'d'}=~/^p40/ ){
if( -T $ENV{'PDB40_FASTA'} ){ push(@DATABASE, $ENV{'PDB40_FASTA'}); }
elsif( -T $ENV{'PDB40_SEQ_FASTA'} ){ push(@DATABASE, $ENV{'PDB40_SEQ_FASTA'}); }
if( -T $ENV{'PDB40_FASTA_INDEX'} ){ push(@INDEX_FILE, $ENV{'PDB40_FASTA_INDEX'}); }
}elsif( $vars{'d'}=~/^p90/ ){
if( -T $ENV{'PDB90_FASTA'} ){ push(@DATABASE, $ENV{'PDB90_FASTA'} ); }
elsif( -T $ENV{'PDB90_SEQ_FASTA'} ){ push(@DATABASE, $ENV{'PDB90_SEQ_FASTA'}); }
if( -T $ENV{'PDB90_FASTA_INDEX'} ){ push(@INDEX_FILE, $ENV{'PDB90_FASTA_INDEX'}); }
}
if( $vars{'d'}=~/^s *$/){
if( -T $ENV{'SWISS_FASTA'} ){ push(@DATABASE, $ENV{'SWISS_FASTA'}); }
elsif( -T $ENV{'SWISS_SEQ_FASTA'} ){ push(@DATABASE, $ENV{'SWISS_SEQ_FASTA'}); }
elsif( -T $ENV{"SWISS_DIR\/seq.fa"} ){ push(@DATABASE, $ENV{"SWISS_DIR\/seq.fa"}); }
if( -T $ENV{'SWISS_FASTA_INDEX'} ){ push(@INDEX_FILE, $ENV{'SWISS_FASTA_INDEX'}); }
elsif( -T $ENV{'SWINDEX'} ){ push(@INDEX_FILE, $ENV{'SWINDEX'}); }
}elsif( $vars{'d'}=~/^o *$/){
if( -T $ENV{'OWL_FASTA'} ){ push(@DATABASE, $ENV{'OWL_FASTA'}); }
elsif( -T $ENV{'OWL_SEQ_FASTA'} ){ push(@DATABASE, $ENV{'OWL_SEQ_FASTA'}); }
elsif( -T $ENV{"OWL_DIR\/owl.fa"} ){ push(@DATABASE, $ENV{"OWL_DIR\/owl.fa"}); }
if( -T $ENV{'OWL_FASTA_INDEX'} ){ push(@INDEX_FILE, $ENV{'OWL_FASTA_INDEX'}); }
print "\n# Fetching sequences from OWL\n";
}elsif( $vars{'d'}=~/^n *$/){
if( -T $ENV{'NRDB_FASTA'} ){ push(@DATABASE, $ENV{'NRDB_FASTA'}); }
elsif( -T $ENV{'NRDB_SEQ_FASTA'} ){ push(@DATABASE, $ENV{'NRDB_SEQ_FASTA'}); }
if( -T $ENV{'NRDB_FASTA_INDEX'} ){ push(@INDEX_FILE, $ENV{'NRDB_FASTA_INDEX'}); }
elsif( -T $ENV{'NRDB_FASTA_IDX'} ){ push(@INDEX_FILE, $ENV{'NRDB_FASTA_IDX'}); }
print "\n# Fetching sequences from OWL\n";
}elsif( $vars{'d'}=~/^\S+\.\S+$/ and -T $vars{'d'} ){ push(@DATABASE, $vars{'d'} ); }
if( $vars{'i'}=~/\S+\.\S+$/ and -T $vars{'i'} ){ push(@INDEX_FILE, $vars{'i'} ); }
if(@INDEX_FILE == 0 and @DATABASE > 0){
if( ${&if_file_older_than_x_days("$DATABASE[0]\.idx", 5)} > 0 ){
print "\n# (i) fetch_sequence_from_db: $DATABASE[0]\.idx is old, rerunning make_seq_index_file\n";
$index_file=${&make_seq_index_file(\@DATABASE)};
push(@INDEX_FILE, $index_file);
}elsif((-s "$DATABASE[0]\.idx") > 20){
print "\n# (i) fetch_sequence_from_db: $DATABASE[0]\.idx is larger than 20, USING IT";
push(@INDEX_FILE, "$DATABASE[0]\.idx");
}else{
print "\n# (ERROR) fetch_sequence_from_db: Some weird error in pushing \$index_file to \@INDEX_FILE\n"; exit;
}
}elsif(@INDEX_FILE > 0 and @DATABASE > 0){
}else{
print "\n\n# (W) \@INDEX_FILE and \@DATABASE are not big enough(0)\n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Final check for ALL the inputs
#___________________________________________________
if( @DATABASE < 1){ print "\n# fetch_sequence_from_db: DATABASE file no found. Error\n"; exit }
if( @INDEX_FILE < 1){
print "\n# (E) fetch_sequence_from_db: \@INDEX_FILE has less than 1 elem. Error\n";
push(@INDEX_FILE, ${&make_seq_index_file(@DATABASE)});
print " fetch_sequence_from_db called make_seq_index_file to make @INDEX_FILE\n";
}
}
if($debug==1){
print "\n# DATABASE used : @DATABASE";
print "\n# INDEX_FILE used : @INDEX_FILE";
print "\n# input_seq_names : @input_seq_names";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# To skip a seq name bug (from Sarah)
#________________________________________________________
if($input_seq_names[0]=~/\S\-\S+\-\S/){
print "\n# (W) NO good having more than 2 dashes in seq. name: $input_seq_names[0], dying \n";
sleep(2);
}
##~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
## Now I have @DATABASE, @INDEX_FILE, @input_seq_names
##_______________________________________________________________
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Reading in index file to get 'seq' 'seek pos' to make %seq_with_index
#__________________________________________________________________________
print "\n# fetch_sequence_from_db: \@INDEX_FILE @INDEX_FILE, \@DATABASE :@DATABASE\n";
for($i=0; $i< @INDEX_FILE; $i++){
open(INDEX, "$INDEX_FILE[$i]");
while(<INDEX>){ if(/(\S+) +(\S+)/){ $long_index{$1}=$2; } }
for($j =0; $j < @input_seq_names; $j++){
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~``
# If DATABASE has sequence names with ranges already index the seq with ranges
#____________________________________________________________________________________
if($input_seq_names[$j]=~/(\S+\_\d+\-\d+)$/ and $long_index{$1}){
$seq_with_index{$1}=$long_index{$1};
}elsif( ($input_seq_names[$j]=~/pdb_(\S+\_\d+\-\d+)/ or $input_seq_names[$j]=~/nr_(\S+\_\d+\-\d+)/ )
and $long_index{$1}){
$seq_with_index{$1}=$long_index{$1};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~``
# If DATABASE has sequence names without ranges index the seq without ranges
#____________________________________________________________________________________
}elsif($input_seq_names[$j]=~/(\S+)\_\d+\-\d+$/ and $long_index{$1}){
$seq_with_index{$input_seq_names[$j]}=$long_index{$1}; # !!!! <--- This line is critical
}elsif($input_seq_names[$j]=~/(\S+)\_\d+\-\d+/ and $long_index{"$1\_"}){ # to handle Tim's new pdb100.fa files
$seq_with_index{$input_seq_names[$j]}=$long_index{"$1\_"};
print "\n# Warning: $1 (from $input_seq_names[$j]) matched with $1\_ in $INDEX_FILE[$i],
I hope this is correct!!\n";
}elsif($input_seq_names[$j]=~/nr_(\S+)\_\d+\-\d+/ and $long_index{"$1"}){ # to handle Tim's new pdb100.fa files
print " (W) $input_seq_names[$j] matched nr_XXXX_ddd-ddd format removing nr_\n";
$seq_with_index{$1}=$long_index{$1};
}elsif($input_seq_names[$j]=~/pdb_(\S+)\_\d+\-\d+/ and $long_index{$1}){
print " (W) $input_seq_names[$j] matched pdb_XXXX_ddd-ddd format removing pdb_\n";
$seq_with_index{$1}=$long_index{$1}; # !!!! <--- This line is critical
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`~``
# If input_seq_name has SCOP superfamily numbers
#____________________________________________________________________________________
elsif($input_seq_names[$j]=~/^(\S+)\_(\d+\.\d+\.\d+)[\.\d+\.\d+]*/ and $long_index{$1}){
$seq_with_index{"$1\_$2"}=$long_index{$1}; # !!!! <--- This line is critical
}elsif($input_seq_names[$j]=~/\S/ and $long_index{$input_seq_names[$j]}){
$seq_with_index{$input_seq_names[$j]}=$long_index{$input_seq_names[$j]}
}else{
print chr(7);
print "\n# (E) $input_seq_names[$j](with, sans range) has NO corrspndng indx in $INDEX_FILE[$i]";
}
}
close INDEX;
if ( scalar(keys %seq_with_index) < 1){
print "\n# fetch_sequence_from_db: \%seq_with_index is too small, ERROR?\n";
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
# Fetching sequences from DATABASE
#_______________________________________________________________
print "\n# fetch_sequence_from_db: Fetching seqs from @DATABASE with @INDEX_FILE ";
@Keys= sort {$seq_with_index{$a} <=> $seq_with_index{$b} } keys %seq_with_index; ## <<< NOTE it is @Keys, not @keys
print "\n# (3) fetch_sequence_from_db: No. of seq indexed is:", scalar(@Keys);
for($f=0; $f< @DATABASE; $f++){
open(DB_FASTA, $DATABASE[$f]);
F0: for($e=0; $e< @Keys; $e++){
my ($seq_found1, $super_fam_class, $NAME, $R_leng, $R_start, $sequence);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# When seq name has range attachment, it handles
#________________________________________________
if($Keys[$e]=~/(\S+)_(\d+)\-(\d+)$/){
$NAME=$1;
$R_start=$2-1; ## to fit in substr function
$R_leng =$3-$2+1; ## to fit in substr
print "\n# (4) fetch_sequence_from_db: Sequences have ranges ($R_start-$R_leng) only (not superfamily numb.) \n";
}
elsif($Keys[$e]=~/(\S+)_(\d+)\-(\d+)\_(\d+\.\d+\.\d+)[\.\d+\.\d+]*/){
$NAME=$1;
$R_start=$2-1; ## to fit in substr function
$R_leng =$3-$2+1; ## to fit in substr
$super_fam_class=$4;
print "\n# (4) fetch_sequence_from_db: Sequences have ranges and superfamily numb.\n";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# When superfamily (scop) number is attached
#___________________________________________________
elsif($Keys[$e]=~/(\S+)\_(\d+\.\d+\.\d+)[\.\d+\.\d+]*/){
$NAME=$1;
$super_fam_class=$2;
print "\n# (4) fetch_sequence_from_db: Sequences have SCOP superfamily numbers only \n";
}elsif($Keys[$e]=~/^ *(\S+)[\,]*$/){
print "\n# (4) fetch_sequence_from_db: Sequences DON't have ranges or SCOP superfam numb.\n";
$NAME=$1;
}
print "\n# BEFORE reading in DB file. \$NAME is $NAME";
if($seq_with_index{$NAME}=~/(\d+)/ # It is importnt having $seq_with_index{$Keys[$e]}
or $seq_with_index{$Keys[$e]}=~/(\d+)/
or $seq_with_index{"$NAME\,"}=~/(\d+)/ # this is for overcoming '>xxxx,' entry(the comma)
or $seq_with_index{"$NAME\_"}=~/(\d+)/ # to handle Tim's >c1eru_ 3.30.1.1.4
or $seq_with_index{"pdb\_$NAME"}=~/(\d+)/ # to handle Sarah's pdb_xxxxx
or $seq_with_index{"nr\_$NAME"}=~/(\d+)/ # to handle Sarah's nr_xxxxx
){
my $finding_position= $1-300;
if( $finding_position >= 0 ){ seek(DB_FASTA, $1-300, 0); # -300 is necessary
}elsif($finding_position < 0){ seek(DB_FASTA, 0, 0); } ## This is essential !!!
while(<DB_FASTA>){
if(!$seq_found1){
if(/\> *$NAME[\,_]? *\d*/){
$seq_found1=1;
print "\n# $NAME is found in DB, Good ";
}
}else{
if(/^ *(\w+) *$/ ){
$sequence .=$1; ## you should use $1 to avoid including NEW line
unless(eof DB_FASTA){ next ## This is critically important to prevent error.
}else{ goto PUT_SEQ } ## If the last seq has only one single line seq string, it could be a problem
}elsif( (/^ *\> *\S+/) or (eof DB_FASTA) ){
#======= When range is defined ==================
PUT_SEQ:
if($R_start =~/\d+/){
$sequence{$Keys[$e]}=substr($sequence, $R_start, $R_leng); next; #
print "\n# $sequence{$Keys[$e]} is put to \%sequence";
}
#======= To handle superfamily information given ==========
if($super_fam_class){
$sequence{$Keys[$e]}=$sequence;
$acquired_seq_count++;
}
#======= When range is NOT defined ==================
else{
$sequence{$Keys[$e]}=$sequence;
print "\n# $sequence is put to \%sequence";
}
$R_start='';
$sequence='';
$seq_found1=''; ## reset $R_start, $seq_found1,,
next F0;
}
}
}
}else{
print "\n# Error, the sequence pos for $NAME (from $Keys[$e]) in DB doesnt exist in xxxx.idx file?\n";
}
}
close DB_FASTA;
}
#print "\n# (6) fetch_sequence_from_db: counted fetched seqs: $found_seq_count, $acquired_seq_count";
#print "\n# (7) fetch_sequence_from_db: Fetching seq has finished \n";
return(\%sequence);
}
#______________________________________________________________
# Title : fetch_seq
# Usage : &fetch_seq(@ARGV);
# Function : fetches swissprot entry or fasta format seq with
# given seq name(like SAA_HORSE, SA*HORSE, SAA,..)
# you can give multi files(SAA*, SAU*) at the same
# time. This uses ENV setting of 'SWDIR'
# Example : &fetch_swiss_seq(@ARGV);
# Keywords : fetch_swissprot_sequence, fetch_sequence,
# find_swiss_sequence, find_sequence
# Options : _ for debugging.
# # for debugging.
# -f for fasta format file output
# -a is for ALL matched seq. (same as using glob=> *YEAST)
# -c is for Creating seq.idx file
# -h is for HELP!
# -g is for GDF file format output
# -l is for list of match entries(in 1 column)
# -s is for species option (input name mst be species (YEAST, RAT, HUMAN..)
# n= is for Number of seq you want to get from swissprot
# s= is for Size limit. Min seq size in swiss, s=10 -> minimum 11 aa seq.
# S= is for Size limit. Max seq size in swiss, s=1000 -> get less than 1000
#
# Argument : swissprot seqname
# Category :
# Version : 1.7
#--------------------------------------------------------------
sub fetch_seq{
my @in=@_;
my ($FASTA_index, $FASTA, $where_index, %index, $question, $i,
$s,$t,$fasta,$index_file, $all, $species,$target, $matched, $seq, $gdf, $list, $count, $create);
my $SEQ_size_max=100000000;
if(@_ < 1){ &HELP_fetch_seq;
}else{
F: for($t=0; $t<@in; $t++){ #'''''''''''' PROMPT ARGV processing ''''''''''''''''''
if($in[$t]=~/^\-c$/i){
$create=1; splice(@in, $t, 1); $t--;
print "\n You should provide database\(e.g, seq.dat\) file with this opt, I guess you did\n";
print "\n If you wanted to make an index with any fasta db, you also have to\n";
print " give the file name. e.g:\n $0 -c /DB/swiss/seq.dat\n";
print " or $0 -c my_db.fa\n\n";
next; }
if($in[$t]=~/^\-af$/){ $fasta=$all=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-afs$/){ $species=$fasta=$all=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-ag$/){ $gdf=$all=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-g$/){ $gdf=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-f$/i){ $fasta=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-a$/i){ $all=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-l$/i){ $list=$all=1; splice(@in, $t, 1); $t--; next; }
if($in[$t]=~/^\-s$/i){ $species=$all=1; splice(@in, $t, 1); $t--; next; }
if( ($in[$t]=~/seq\.dat/)&&(-f $in[$t])){ ## if the path for swiss prot is given
$DB=$in[$t]; splice(@in, $t, 1); $t--; next; }
if( ($in[$t]=~/seq\.idx/)&&(-e $in[$t])){ ## if the path for swiss index is given
$index_file=$in[$t]; splice(@in, $t, 1); $t--; next; }
#''''''' SWiss prompt input file check ''''''''''''''''''
if( -f $in[$t]){
open(TEMP, "$in[$t]");
while(<TEMP>){
if(/^ID[\t ]+\w+/){$DB=$in[$t]; splice(@in, $t, 1);$t--;next F;}}
close TEMP;
}
#'''''''' FASTA prmpt input file check '''''''''''''''''''
if( (-f $in[$t]) && !(defined($FASTA))){
open(TEMP, "$in[$t]");
while(<TEMP>){
if(/^\> {0,4}\S+/){$FASTA=$in[$t]; $fasta=1;
if(-s "$FASTA\.idx"){ $FASTA_index="$FASTA\.idx"; }
splice(@in, $t, 1);$t--;next F;}}
close TEMP;
}
#'''''''' INDEX file automatic check ''''''''''''''''''
if( -f $in[$t]){
open(TEMP2, "$in[$t]");
my ($first_pos, $Count, @splited);
while(<TEMP2>){
$Count++;
if( $Count>3 ){
if(/^ {0,2}\S+ +(\d+)/){
if(defined($first_pos) && ($1-$first_pos ) > 1000 ){
$index_file=$in[$t];
splice(@in, $t, 1);$t--;next F;
}elsif( defined($first_pos) && ($1-$first_pos)<1000 ){
$FASTA_index=$in[$t]; $fasta=1;
if($FASTA_index=~/^(\S+)\.\w+$/){
if(-s $1){ $FASTA= $1; }
}
splice(@in, $t, 1);$t--;next F;
}
$first_pos=$1;
}
}
}
close TEMP2;
}
if($in[$t]=~/^\-h$/i){ &HELP_fetch_seq; exit;}
if($in[$t]=~/^n=(\d+)$/i){ $SEQ_num_to_fetch=$1;
splice(@in, $t, 1);$t--;next F;}
if($in[$t]=~/^s=(\d+)$/){ $SEQ_size_min=$1; $fasta=1;
splice(@in, $t, 1);$t--;next F;}
if($in[$t]=~/^S=(\d+)$/){ $SEQ_size_max=$1; $fasta=1;
splice(@in, $t, 1);$t--;next F;}
}
if(($create==1)&&(defined($DB)) ){ goto CREATE; }
elsif(($create==1) && (defined($FASTA)) ){ goto CREATE; }
elsif($create==1){
print "\n You must give db filename (e.g. seq.dat) with path to make an index";
print "\n I can handle fasta db file to make an index\n";
exit;
}
}
if($SEQ_size_max < $SEQ_size_min){ print "\n Seq size Max is smaller than min\n"; exit; }
##""""""""""""""""""""""" DB file if not defined """"""""""""""""""""""""""""""""""""""""""""
if (!defined($DB)){
if((!defined($FASTA))&&($fasta==1)&&(-T "$ENV{'FASTADB'}")){
$FASTA=$ENV{'FASTADB'};
}elsif(defined($FASTA) && ($fasta==1) &&($create !=1) ){
goto SW_INDEX;
}elsif(!defined($FASTA) && (defined($FASTA_index))&& !(-e "$ENV{'FASTADB'}") ){
print "\n NO fasta db is defined\n";
goto ASK;
}elsif(-e "$ENV{'SWDIR'}seq.dat" ){
$DB="$ENV{'SWDIR'}seq.dat";
}elsif(-e "$ENV{'FETCHSWISS'}seq.dat" ){
$DB="$ENV{'FETCHSWISS'}seq.dat";
}elsif(-e "$ENV{'FETCHSWISS'}" ){
$DB="$ENV{'FETCHSWISS'}";
}elsif(-e "$ENV{'SWDIR'}\/seq.dat" ){
$DB="$ENV{'SWDIR'}\/seq.dat";
}elsif( -f "$ENV{'SWISS'}seq.dat" ){
$DB="$ENV{'SWISS'}seq.dat";
}elsif( -f "$ENV{'SWISS'}\/seq.dat" ){
$DB="$ENV{'SWISS'}\/seq.dat";
}elsif( -e 'seq.dat'){
$DB="seq.dat";
}elsif( -f "$ENV{'swiss'}seq.dat"){
$DB="$ENV{'swiss'}seq.dat";
}elsif(-f "ENV{'HOME'}seq.dat"){
$DB="ENV{'HOME'}seq.dat";
}elsif(-f "ENV{'SWDIR'}\/seq.dat"){
$DB="ENV{'SWDIR'}\/seq.dat";
}else{
ASK: print "\n Where is your swissprot seq.dat(or fasta db) file?\n";
print " I recommand you to set the path for them in ENV vars\n";
print " e.g. export SWDIR=/DB/Swiss/ to where you put seq.dat\n";
print " e.g. export FASTADB=/DB/Swiss/my_swiss.fa for fasta database\n";
$swiss=<STDIN>;
chomp($swiss);
if( -f $swiss){
open(TEMP, "$swiss");
while(<TEMP>){
if(/^ID[\t ]+\w+/){ $DB=$swiss; goto SW_INDEX; }
elsif(/^\> {0,3}\S+/){ $FASTA=$swiss; goto SW_INDEX;}
}
close TEMP;
}else{
goto ASK;
}
}
}
##""""""""""""""""""""""""""""" INDEX file """"""""""""""""""""""""""""""""""""""""
if( !defined($index_file)){
SW_INDEX:
if((!defined($FASTA_index))&&($fasta==1)&&(-T "$ENV{'FASTAINDEX'}")){
$FASTA_index=$ENV{'FASTAINDEX'};
}elsif(!defined($FASTA_index)&&(-T $FASTA)){
goto W;
}elsif(defined($FASTA_index)&&(-T $FASTA)){
goto MAIN_SEARCH;
}elsif(-e "$ENV{'FETCHSWISSINDEX'}seq.idx" ){
$index_file="$ENV{'FETCHSWISSINDEX'}seq.idx";
}elsif(-e "$ENV{'FETCHSWISSINDEX'}\/seq.idx" ){
$index_file="$ENV{'FETCHSWISSINDEX'}\/seq.idx";
}elsif(-e "$ENV{'SWDIR'}seq.idx" ){
$index_file="$ENV{'SWDIR'}seq.idx";
}elsif( -f "$ENV{'SWISS'}seq.idx" ){
$index_file="$ENV{'SWISS'}seq.idx";
}elsif( -f "$ENV{'SWISS'}\/seq.idx" ){
$index_file="$ENV{'SWISS'}\/seq.idx";
}elsif( -f "$ENV{'SWINDEX'}" ){
$index_file="$ENV{'SWINDEX'}";
}elsif( -e 'seq.idx'){
$index_file="seq.idx";
}elsif( -f "$ENV{'swiss'}seq.idx"){
$index_file= "$ENV{'swiss'}seq.idx";
}elsif( -f "$ENV{'SWINDEX'}seq.idx"){
$index_file= "$ENV{'SWINDEX'}seq.idx";
}elsif( -f "$ENV{'HOME'}seq.idx"){
$index_file= "$ENV{'HOME'}seq.idx";
}elsif( -f "$ENV{'SWINDEX'}seq.idx"){
$index_file="$ENV{'SWINDEX'}\/seq.idx";
}elsif( -f "$ENV{'swindex'}seq.idx"){
$index_file="$ENV{'swindex'}seq.idx";
}elsif(defined($DB)|| defined($FASTA) ){
print "\n Your swissprot is in $DB, but no seq.idx file for it.\n";
W: print "\n Where is seq.idx(or fasta idx file eg. $FASTA\.idx), type path and filename?\n";
print " I recommand you to set the path for them in ENV vars later\n";
print " e.g. export SWINDEX=/DB/Swiss/ to where you put seq.dat index\n";
print " e.g. export FASTAINDEX=/DB/Swiss/my.fa.idx for fasta db index\n";
print " Asking where 3 times. After, will ask creation of seq.idx or $FASTA.idx\n";
$question++;
$where_index=<STDIN>;
chomp($where_index);
if(-f $where_index){
open(TMP, "$where_index");
while(<TMP>){
if($_=~/^ {0,2}\S+ +\d+/){
$index_file=$where_index;
print "\n Your index file seems to be right \($index_file\) \n";
goto MAIN_SEARCH;
}elsif($count > 4){ # read at least 4 lines and see if they are index
print "\n $where_index doesn't seem to be index file\n";
print "\n Terminate(t) or go on (g) trying\n";
$try=getc;
if($try=~/t/i){ exit; }
else{ goto W; }
}else{
$count++;
}
}
close TMP;
}else{
if($question > 2){
print "\n I can create the index in pwd for you run $0 and \n";
print "\n you can copy seq.idx(or $FASTA\.idx) into your swissprot dir later\n";
goto CREATE;
}
goto W;
}
#""""""""""""""" CREATION of INDEX file """""""""""""""""""""""""""""""""""""""""""""
CREATE:
if(defined($DB)){ print "\n Can I create seq.idx in pwd? (y+return or return)\n" }
if(defined($FASTA)){ print "\n Can I create $FASTA\.idx in pwd? (y+return or return)\n" }
$yes_no=getc;
if($yes_no=~/y/i){
if(defined($DB)){
print "\n seq.idx being created...\(1 min in my Linux\)\n";
open(DB, "$DB");
open(IDX, ">seq.idx");
print IDX "# swiss_index\n";
while(<DB>){
if(/^ID[\t ]+(\w+) +/){
$index{$1}=tell(DB);
print IDX "\n$1 $index{$1}";
}
}
close(DB);
close(IDX);
if(-s "seq.idx"){
print "\nGood. seq\.idx is created.";
print "\n Copy seq.idx to SWISSPROT dir or you can set\n";
print "absolute path ENV var \'SWINDEX\' to your seq.idx path\n";
print "e.g. #bash\> export SWINDEX=\/DB\/Swiss\/seq.idx\n\n";
if($create==1){ exit; }
}else{
print "\n Creation of seq.dat seems to have gone wrong";
}
}elsif(defined($FASTA)){
$F_idx="$FASTA\.idx";
print "\n $F_idx being created...\n";
open(FASTADB, "$FASTA");
open(FASTAIDX, ">$F_idx");
print FASTAIDX "# fasta_index\n";
while(<FASTADB>){
if(/^\> {0,4}(\S+) */){
$index{$1}=tell(FASTADB);
print FASTAIDX "\n$1 $index{$1}";
}
}
close(FASTADB);
close(FASTAIDX);
if(-s $F_idx){
print "\nGood! Copy $F_idx to your DB dir and set two ENV vars\n";
print "absolute path ENV var \'FASTADB\' to your fastadb path\n";
print "absolute path ENV var \'FASTAINDEX\' to your $F_idx path\n";
print "e.g. #bash\> export FASTADB =\/DB\/mySwiss\/$FASTA\n";
print "e.g. #bash\> export FASTAINDEX=\/DB\/mySwiss\/$F_idx\n";
print "e.g. #tcsh\> setenv FASTADB \/DB\/mySwiss\/$FASTA\n";
print "e.g. #tcsh\> setenv FASTAINDEX \/DB\/mySwiss\/$F_idx\n";
print "Unless, you can specify the database each time at prompt\n\n";
if($create==1){ exit; }
}else{
print "\n Creation of seq.dat or $F_idx seems to have gone wrong";
}
}
}else{
exit;
}
}
}
#""""""""""""""""""""""""""" MAIN SERACH """""""""""""""""""""""""""""""""""""""""""""""
MAIN_SEARCH:
for($i=0; $i<@in; $i++){
my (@possible, @pos, %possible); my $target=$in[$i];
if($target=~/\*/){
$target=~s/\*/\\\w\{0,6\}/; # to handle glob input
$all=1;
}
if(defined($index_file)){
open(INDEX, "$index_file");
if($species==1){
while(<INDEX>){
if( /(\w*\_$target) +(\d+)/ ){ $possible{$1}=$2; }
}
}else{
while(<INDEX>){
if( /(\w*$target\w*) +(\d+)/ ){ $possible{$1}=$2; }
}
}
close INDEX;
goto SWISS;
}elsif(($fasta==1) && (defined($FASTA_index)) ){
open(INDEX, "$FASTA_index");
if($species==1){
while(<INDEX>){
if( /(\w*\_$target) +(\d+)/ ){ $possible{$1}=$2; }
}
}else{
while(<INDEX>){
if( /(\w*$target\w*) +(\d+)/ ){ $possible{$1}=$2; }
}
}
close INDEX;
goto FASTA;
}
SWISS:
@poss = sort keys %possible;
if( (@poss >1)&&($all !=1)){
print "\n @poss","\n";
print chr(7);
print "\n There are more than a few seqs for $in[$i]";
print "\n be more specific! OR use -a option for all matched\n\n";
exit;
}elsif($all !=1){
print "\n";
open (DB, "$DB");
if(defined($SEQ_num_to_fetch)){
print "\n# You defined the number of sequence to fetch: $SEQ_num_to_fetch\n";
$num_sequence=$SEQ_num_to_fetch;
}else{ $num_sequence=@poss; }
A:for($p=0; $p < $num_sequence; $p++){
if($poss[$p]=~/\w*$target\w*/){
$matched=$possible{$poss[$p]}; # %possible has the name and index num
seek(DB, ($matched-52), 0);
while(<DB>){
if($gdf==1){
if(/ID[\t ]+$poss[$p] +\S+ +\S+ +(\d+)/){
printf ("%-24s %-3d %-7d %-14s %4s\n", "$poss[$p]\/1\-$1", 1, $1, $poss[$p], '0.0');
next A;
}
}
elsif(/^ {0,2}\/\// and $fasta !=1){ # !!! DO NOT put $ in /^ {0,2}\/\// as there is something
print "\/\/\n";
next A;
}elsif(/^ {0,2}\/\// and $fasta==1){ # !!! DO NOT put $ in /^ {0,2}\/\// as there is something
$seq=~s/ //g;
if( ($SEQ_size_min < length($seq))&&(length($seq) < $SEQ_size_max) ){
print "\>$poss[$p]\n$seq\n"; $seq=''; next A;
}else{ $seq=''; $num_sequence++; next A; }
}elsif( $fasta==1 and /^[\t ]+\w+/){
$seq.=$_;
next ;
}elsif($list==1){
if(/ID[\t ]+$poss[$p] +\S+ +\S+ +(\d+)/){
print "$poss[$p]\n";
next A;
}
}elsif($fasta !=1){
print ;
}
}
}
}
close(DB);
}elsif($all==1){
print "\n";
open (DB, "$DB");
if(defined($SEQ_num_to_fetch)){ $num_sequence=$SEQ_num_to_fetch;
}else{ $num_sequence=@poss; }
A:for($p=0; $p < $num_sequence; $p++){
if($poss[$p]=~/\w*$target\w*/){
$matched=$possible{$poss[$p]};
seek(DB, ($matched-51), 0);
while(<DB>){
if($gdf==1){
if(/ID[\t ]+$poss[$p] +\S+ +\S+ +(\d+)/){
printf ("%-24s %-3d %-7d %-14s %4s\n", "$poss[$p]\/1\-$1", 1, $1, $poss[$p], '0.0');
next A;
}
}elsif(/^ {0,2}\/\// and $fasta==1){ # !!! DO NOT put $ in /^ {0,2}\/\// as there is something
$seq=~s/ //g;
if( ($SEQ_size_min < length($seq))&&(length($seq) < $SEQ_size_max) ){
print "\>$poss[$p]\n$seq\n"; $seq=''; next A;
}else{ $seq=''; $num_sequence++; next A; }
}elsif(/^ {0,2}\/\// and $fasta !=1){ # !!! DO NOT put $ in /^ {0,2}\/\// as there is something
print "\/\/\n";
next A;
}elsif(($fasta==1)&&(/^[\t ]+\w+/)){
$seq.=$_;
next ;
}elsif($list==1){
if(/ID[\t ]+$poss[$p] +\S+ +\S+ +(\d+)/){
printf "$poss[$p]\n";
next A;
}
}elsif($fasta !=1){
print ;
}
}
}
}
close(DB);
}
FASTA:
@poss = sort keys %possible;
if( (@poss >1)&&($all !=1)){
print "\n @poss","\n";
print chr(7);
print "\n There are more than a few seqs for $in[$i]";
print "\n be more specific! OR use -a option for all matched\n\n";
exit;
}elsif($all !=1){
print "\n";
open (FAS, "$FASTA");
B:for($p=0; $p < @poss; $p++){
if($poss[$p]=~/\w*$target\w*/){
$matched=$possible{$poss[$p]};
seek(FAS, ($matched-350), 0);
my $seq_found;
while(<FAS>){
if((/^> {0,4}(\S+)/)&&($seq_found==1)){
next B;
}elsif(/^> {0,4}($poss[$p])/){
print;
$seq_found=1;
}elsif($seq_found==1){
print;
}
}
}
}
close(FAS);
}elsif($all==1){
print "\n";
open (FAS, "$FASTA");
B2:for($p=0; $p < @poss; $p++){
if($poss[$p]=~/\w*$target\w*/){
$matched=$possible{$poss[$p]};
seek(FAS, ($matched-350), 0);
my $seq_found;
while(<FAS>){
if((/^> {0,4}(\S+)/)&&($seq_found==1)){
next B2;
}elsif(/^> *($poss[$p])/){
print;
$seq_found=1;
}elsif($seq_found==1){
print;
}
}
}
}
close(FAS);
}
}
}
#______________________________________________________________
# Title : fetch_swiss_seq
# Usage :
# Function : fetches swissprot entry or fasta format seq with
# given seq name(like SAA_HORSE, SA*HORSE, SAA,..)
# you can give multi files(SAA*, SAU*) at the same
# time. This uses ENV setting of 'SWDIR'
# Example : &fetch_swiss_seq(@ARGV);
# Warning :
# Keywords : fetch_swissprot_sequence, fetch_sequence,
# find_swiss_sequence, find_sequence, fetch
# Options : _ for debugging.
# # for debugging.
# -f for fasta format file output
# Returns :
# Argument : swissprot seqname
# Category :
# Version : 1.0
#--------------------------------------------------------------
sub fetch_swiss_seq{
my @in=@_;
my ($i, $index_file, $target, $matched, $seq);
if(@_ < 1){
print "\n Usage: $0 [-f] <any swissprot name entry>\n";
print " -f is for fasta output format only\n";
print "\n You have to set ENV var, SWDIR to seq.dat path\n";
print chr(7);
}
for($i=0; $i<@in; $i++){
if($in[$i]=~/\-f$/i){
$fasta=1;
splice(@in, $i, 1);
next;
}
}
if(-e "$ENV{'SWDIR'}seq.dat" ){
open(DB, "$ENV{'SWDIR'}seq.dat");
}elsif( -f "$ENV{'SWISS'}seq.dat" ){
open(DB, "$ENV{'SWISS'}seq.dat");
}elsif( -e 'seq.dat'){
open(DB, "seq.dat");
}elsif( -f "$ENV{'swiss'}seq.dat"){
open(DB, "$ENV{'swiss'}seq.dat");
}
if(-e "$ENV{'SWDIR'}seq.idx" ){
$index_file="$ENV{'SWDIR'}seq.idx";
}elsif( -f "$ENV{'SWISS'}seq.idx" ){
$index_file="$ENV{'SWISS'}seq.idx";
}elsif( -e 'seq.idx'){
$index_file="seq.idx";
}elsif( -f "$ENV{'swiss'}seq.idx"){
$index_file= "$ENV{'swiss'}seq.idx";
}
for($i=0; $i<@in; $i++){
my @possible;
my $target=$in[$i];
$target=~s/\*/\\\w\{0,4\}/; # to handle glob input
open(INDEX, "$index_file");
while(<INDEX>){
if( /(\w*$target\w*)/ ){
push(@possible, $1);
}
}
close INDEX;
open(INDEX, "$index_file");
if(@possible >1){
print "\n@possible", "\n";
print chr(7);
print "\n There are more than a few seqs for $in[$i], \n be more specific!\n\n";
}else{
print "\n";
A:while(<INDEX>){
if(/(\w*$target\w*)[\t ]+(\d+)/){
$matched=$1;
seek(DB, ($2-51), 0);
while(<DB>){
if((/^\/\/$/)&&($fasta==1)){
$seq=~s/ //g;
print "\>$matched\n$seq\n";
$seq='';
next A;
}elsif((/^\/\/$/) && ($fasta !=1)){
print "\n";
next A;
}elsif(($fasta==1)&&(/^[\t ]+\w+/)){
$seq.=$_;
next ;
}elsif($fasta !=1){
print ;
}
}
}
}
print "========= Search for $ARGV[$i] was a success\n" if @in > 1;
}
}
}
#______________________________________________________________
# Title : get_sequence_number
# Usage :
# Function : reads database and tells how many sequences are there
# fasta format db is only accepted for now.
# Example :
# Warning :
# Keywords : count_number_of_sequence, get_number_of_sequence
# get_sequence_number_in_fasta
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.2
#--------------------------------------------------------------
sub get_sequence_number{
my @file=@{$_[0]};
my %out;
for($i=0; $i< @file; $i++){
my $seq_number_in_db;
open(DB, "$file[$i]");
while(<DB>){
if(/^\> {0,5}\w+/){
$seq_number_in_db++;
}
}
close DB;
$out{$file[$i]}=$seq_number_in_db;
}
return(\%out);
}
#______________________________________________________________
# Title : write_msp_files
# Usage : &write_msp_files(\%in1, \%in2, ['s'], [$filename],,)
# Function : Writes input which is already in msp file format to
# files either the name is given or generated
# If more than one ref of hash is given, this will
# concatenate all the hashes to one big one to
# make one file.
# When NO output xxx.msp file name is given, it creates
# with the query sequence name.
# Example : &write_msp_files(@sso, 's', $out_file);
# Warning : When NO output xxx.msp file name is given, it creates
# with the query sequence name.
# Keywords : write_msp,
# Options : _ for debugging.
# # for debugging.
# s for each single file output for each hash input
# filename for putting output to the specified filename, should be xxx.msp
#
# Returns : if 's' option is set, it will make say,
# HI001.msp HI002.msp HI003.msp rather than
#
# HI001HI002HI003.msp
# eg of one output(single file case)
#
# 1027 0.0 1 154 HI0004 1 154 HI0004
# 40 0.0 84 132 HI0004 63 108 HI0001
# 31 0.0 79 84 HI0004 98 103 HI0003
#
# Category :
# Version : 2.8
#--------------------------------------------------------------
sub write_msp_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($out_msp_file, $add_range, @final_out, $msp_file_out,
@keys, $N, $temp_1, %hash, $query_seq_name, $single_out_opt);
if($char_opt=~/r/){ $add_range ='r' };
if($char_opt=~/s/){ $single_out_opt ='s' };
if(@file == 1){ $out_msp_file=$file[0]; $single_out_opt='' } # s is for single file output
if($single_out_opt eq 's'){ #~~~~~~~~~~~` single files output option WITHOUT a given outfilename
$msp_file_out='default_single_out.msp';
for($i=0; $i< @hash; $i++){
my %hash=%{$hash[$i]};
my @keys =sort keys %hash;
#------------------ Writing the first line ---------------------------
for($j=0; $j< @keys; $j++){
if($keys[$j]=~/(\S+)_\d+\-\d+/){ $N = $1 }else{ $N = $keys[$j] }
if($hash{$keys[$j]}=~/ +$N[\_\d+\-\d+]* +\d+ +\d+ +$N[\_\d+\-\d+]*/){
open(MSP_FILE, ">$msp_file_out") ||
die "# write_msp_files: I can not create $msp_file_out, check permission\n";
chomp( $hash{$keys[$j]} ); ## precaution
print MSP_FILE "# (H) $0 write_msp_files: $keys[$j]\n";
print MSP_FILE $hash{$keys[$j]}, "\n";
splice(@keys, $j, 1);
$j--; last;
}
}
#------------- Writing the rest of the lines ____________________
for($j=0; $j< @keys; $j++){
chomp( $hash{$keys[$j]} );
print MSP_FILE $hash{$keys[$j]}, "\n";
}
print MSP_FILE "\n";
}
if(-s $msp_file_out){
print "\n# write_msp_files: $msp_file_out is written \n";
}else{
print "\n# Error, write_msp_files\n"; exit
}
push(@final_out, $msp_file_out);
close(MSP_FILE);
return(\@final_out);
}else{
#~~~~~~~~~~~~~ DEfault ~~~~~~~~~~~~~~~~~~
# When output file name was given!
#________________________________________
if(@file==1){
my($temp_1);
open(MSP_FILE, ">$out_msp_file") || die "# write_msp_files: I can not create $out_msp_file, check permission\n";
print MSP_FILE "# (H) $0 write_msp_files: @file\n";
for($i=0; $i< @hash; $i++){
my %hash=%{$hash[$i]};
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Sorting %hash values by the second column(Evalue)
#_______________________________________________________
@keys= map {$_->[0]} sort { $a->[1] <=> $b->[1] } map { $hash{$_}=~/^ *\S+[\t ]+(\S+)[\t ]+/ and [$_, $1] } keys %hash;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# for Final output
#_____________________________
push(@final_out, $out_msp_file);
#--------- Writing the first line only --------------
for($j=0; $j< @keys; $j++){
if($keys[$j]=~/(\S+)_\d+\-\d+$/){ $N = $1 }else{ $N = $keys[$j] }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Following is to put the self match on top of the list
#________________________________________________________
if($hash{$keys[$j]}=~/ +$N[\_\d+\-\d+]* +\d+ +\d+ +$N[\_\d+\-\d+]*/){
$temp_1=$keys[0]; $keys[0]=$keys[$j]; $keys[$j]=$temp_1;
}
}
for($j=0; $j< @keys; $j++){
chomp($hash{$keys[$j]});
print MSP_FILE $hash{$keys[$j]}, "\n";
}
print MSP_FILE "\n";
}
print MSP_FILE "\n";
close(MSP_FILE);
if(-s $out_msp_file and $out_msp_file !~/^ *\.msp$/){
print "\n# write_msp_files: $out_msp_file is written\n" if(-s $out_msp_file);
}else{
print "\n# write_msp_files: ERROR. Either $out_msp_file is empty or \".msp\" is written\n";
}
}else{
for($i=0; $i< @hash; $i++){
my %hash=%{$hash[$i]};
my @keys =sort keys %hash;
($query_seq_name)=$hash{$keys[0]}=~/\S+ +\d+ +\d+ +(\S+) +\d+ +\d+ +\S+/;
$msp_file_out="$query_seq_name\.msp";
open(MSP_FILE, ">$msp_file_out") or die "\n# write_msp_files: Failed to open $msp_file_out\n";
print MSP_FILE "# (H) $0 write_msp_files: $query_seq_name\n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# for Final output
#_____________________________
push(@final_out, $msp_file_out);
#~~~~~~~ Writing the first line only ~~~~~~~~~~~~~~~~~~
for($j=0; $j< @keys; $j++){
if($keys[$j]=~/(\S+)_\d+\-\d+$/){ $N = $1 }else{ $N = $keys[$j] }
if($hash{$keys[$j]}=~/ +$N[\_\d+\-\d+]* +\d+ +\d+ +$N[\_\d+\-\d+]*/){
$keys[0]=$temp_1; $keys[0]=$keys[$j]; $keys[$j]=$temp_1;
}
}
for($j=0; $j< @keys; $j++){
chomp($hash{$keys[$j]});
print MSP_FILE $hash{$keys[$j]}, "\n";
}
print MSP_FILE "\n";
}
print MSP_FILE "\n";
if(-s $out_msp_file and $out_msp_file !~/^ *\.msp$/){
print "\n# write_msp_files: $out_msp_file is written\n" if(-s $out_msp_file);
}else{
print "\n# write_msp_files: ERROR. Either $out_msp_file is empty or only \".msp\" is written\n";
}
close MSP_FILE;
}
}
if(@final_out ==1){ return( \$final_out[0] ); }else{
return(\@final_out);
}
}
#______________________________________________________________________
# Title : write_aln_files
# Function : writes multiple seqs. in msf format (takes one or more than one seq.!!)
# Usage : two argments: $seq_hash_reference and $output_file_name
# takes a hash which has got names keys and sequences values.
# uses Perl5 pointers(references).
# Example : &write_aln(\%hash, \$out_file_name);
# CLUSTAL W (1.74) multiple sequence alignment
#
#
# MMAF6040_1 -----MATDD--SIIVLDD----DDEDEA-AAQP-GPSNLPPN-PASTGPGPGLSQQATG
# AF015956_1 -----MATAN--SIIVLDD----DDEDEA-AAQP-GPSHPLPN-AASPGAG---------
# HSAB2381_80-900 KQRLLSVTSDEGSMNAFTGRGSPDTEIKINIKQESADVNVIGNKDVVTEEDLDVFKQAQE
# .* : *: .: . * * : * . : * . . .
#
# Options :
# $first_sequence_name= by f= # to put a certain seq at the first in writing
# Keywords :
# Version : 1.1
#----------------------------------------------------------------------
sub write_aln_files{
$| =1;
my($string, %input, $temp, $output_file, $first_sequence_name,
$name, $k, %final_output);
for($k=0; $k< @_; $k++){
if(ref($_[$k]) eq 'HASH'){
%input=%{$_[$k]};
}elsif(ref($_[$k]) eq 'SCALAR'){
$output_file=${$_[$k]};
}else{
if($_[$k]=~/f=(\S+)/){
$first_sequence_name=$1;
}else{
$output_file=$_[$k];
}
}
}
open (ALN_FILE_OUT,">$output_file"); # $string is the seq string.
print ALN_FILE_OUT 'CLUSTAL W (1.74) multiple sequence alignment', "\n\n";
my(@names) = sort keys %input;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Putting the wanted top first seq name at the top
#____________________________________________________
if($first_sequence_name){
for($k=0; $k< @names; $k++){
if($names[$k]=~/$first_sequence_name/){
$temp=$names[0];
$names[0]=$names[$k];
$names[$k]=$temp;
}
}
}
my($larg) = length($input{$names[0]});
for ($k=0; $k < $larg; $k+= 60){ # 60 residues interval
for($i=0; $i < @names; $i++){ # number of sequences
$names = $names[$i];
$input{$names[$i]}=~ s/\n//g; ## this is to remove MANY new lines in the input !!
$seq = substr($input{$names[$i]}, $k, 60);
#$seq = &put_gaps_every_x_position_in_string($seq, 10, ' ');
printf ALN_FILE_OUT ("%-18s %-60s\n", $names, $seq);
$final_output{$output_file}.=sprintf("%-18s %-60s\n", $names, $seq);
}
printf ALN_FILE_OUT "\n";
$final_output{$output_file}.="\n";
}
close(ALN_FILE_OUT);
return(\%final_output);
}
#______________________________________________________________________
# Title : write_msf
# Function : writes multiple seqs. in msf format (takes one or more than one seq.!!)
# Usage : two argments: $seq_hash_reference and $output_file_name
# takes a hash which has got names keys and sequences values.
# uses Perl5 pointers(references).
# Example : &write_msf(\%hash, \$out_file_name, ["o=$seq_order"]);
# eg) $seq_order='asdf seq2 seq3 seq5';
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# PileUp
#
# MSF: 1205 Type: P Check: 9937 ..
#
# Name: PYC1_YEAST oo Len: 1205 Check: 7954 Weight: 1.00
# Name: PYC2_YEAST oo Len: 1205 Check: 5807 Weight: 1.00
# Name: PYC_MOUSE oo Len: 1205 Check: 6176 Weight: 1.00
#
# //
#
# PYC1_YEAST MSQ.RKFAGL RDNFNLLGEK N......... .......... .KILVANRGE
# PYC2_YEAST MSSSKKLAGL RDNFSLLGEK N......... .......... .KILVANRGE
# PYC_MOUSE ...MLKFQTV RGGLRLLGVR RSSSAPVASP NVRRLEYKPI KKVMVANRGE
#
# PYC1_YEAST IPIRIFRTAH ELSMQTVAIY SHEDRLSTHK QKADEAYVIG EVGQYTPVGA
# PYC2_YEAST IPIRIFRSAH ELSMRTIAIY SHEDRLSMHR LKADEAYVIG EEGQYTPVGA
# PYC_MOUSE IAIRVFRACT ELGIRTVAVY SEQDTGQMHR QKADEAYLIG R..GLAPVQA
#
# PYC1_YEAST YLAIDEIISI AQKHQVDFIH PGYGFLSENS EFADKVVKAG ITWIGPPAEV
# PYC2_YEAST YLAMDEIIEI AKKHKVDFIH PGYGFLSENS EFADKVVKAG ITWIGPPAEV
# PYC_MOUSE YLHIPDIIKV AKENGVDAVH PGYGFLSERA DFAQACQDAG VRFIGPSPEV
#
# Keywords : write_msf_files, save_msf_files
# Version : 2.2
#------------------------------------------------------------
sub write_msf{
my($seq, $string, $name, $k, $i, $longest_seq_leng, @seq_order_final,
$seq_order, @files_created, @names, $fill_seq_to_the_end);
$| =1;
if(@_ < 2){
print "\n# write_msf: I need 2 arguments(hash and filename). Look at the header box\n";
print chr(7); exit;
}
my($gap_char)='-';
my(%input)=%{$_[0]};
my($output_file)=${$_[1]} || $_[1];
if($_[2]=~/o=(.+)/){
$seq_order=$1;
@seq_order=split(/ +/, $seq_order);
}elsif(ref($_[2]) eq 'ARRAY'){
@seq_order=@{$_[2]};
}
@names = sort keys %input;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If seq order information is given, sort the out output
#________________________________________________________________
if(@seq_order > 0){ # when seq names were given in an order, you just use it
print "\n# (INFO) \@seq_order is given \n";
for($i=0; $i< @seq_order; $i++){
$seq_order_item=$seq_order[$i];
if($seq_order_item=~/_rv$/){ $seq_order_item=~s/_rv$// }
for($j=0; $j<@names; $j++){
$names_item=$names[$j];
if($names_item=~/_rv$/){ $names_item=~s/_rv$// }
if($seq_order_item eq $names_item){
push(@seq_order_final, $names[$j]);
splice(@names, $j, 1); $j--;
}else{ next }
}
}
@names=@seq_order_final;
}
$longest_seq_leng=length($input{$names[0]});
for $name (@names){
$len = length($input{$name});
if($len< 1){ print "\n# (ERROR) The length of seq. in \%input with $name key is 0, error!\n";
exit; }
$longest_seq_leng=$len if $len > $longest_seq_leng;
}
push(@files_created, $output_file);
open (MSF_FILE_OUT,">$output_file"); # $string is the seq string.
print MSF_FILE_OUT " $output_file MSF: $longest_seq_leng",' Type: P Check: 9937 .. '; ## This is dummy
print MSF_FILE_OUT "\n\n";
for $name (@names){
$len = length($input{$name});
printf MSF_FILE_OUT (" Name: %-15s Len: %-5s Check: 9999 Weight: 1.00\n", $name, $longest_seq_leng);
}
print MSF_FILE_OUT "\n";
print MSF_FILE_OUT "\/\/\n\n";
#""""""""""""""""""""""""""""""""""""""""""""""""""
# MSF file form
#==================================================
format MSF_FILE_OUT =
@<<<<<<<<<<<<<<<<<<<< @<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
$names, $seq
.
for ($k=0; $k < $longest_seq_leng; $k+=50){ # 50 residues interval
for($i=0; $i < @names; $i++){ # number of sequences
$names = $names[$i];
$input{$names[$i]}=~ s/\n//g;
#$input{$names[$i]}=~ s/_/$gap_char/g; # automatically changes '_' to '-'
$seq = substr($input{$names[$i]}, $k, 50);
$seq_leng=length($seq);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# This is to fill the empty space to make complete block of seq
#________________________________________________________________
if($fill_seq_to_the_end){
$seq .="$gap_char"x($longest_seq_leng-$seq_leng); # putting '---' at the blank line end
# Above option will coredump 'seaview' seq editor
}
$seq=~s/[\-_ ]/$gap_char/g; # setting the final gap_char you like
$seq = &put_gaps_every_x_position_in_string_special($seq, 10, ' ');
sub put_gaps_every_x_position_in_string_special{
my($string); if(ref($_[0])){ $string = ${$_[0]};
}else{ $string = $_[0]; }
my($interval) = $_[1]; my($gap_char) = $_[2];
$string =~ s/(.{$interval,$interval})/$1$gap_char/g;
return($string);
}
select (MSF_FILE_OUT); ## to print out to a FILE
write MSF_FILE_OUT;
}
print "\n"; # next block starts.
}
close(MSF_FILE_OUT);
select STDOUT; # <- this is necessary to normalize output for other sub
return(\@files_created);
}
#______________________________________________________________
# Title : get_seqblock
# Usage :
# Function :
# Example : @blocks_in_hash=@{&get_seqblock(\%msf, 30)};
# Warning :
# Keywords : find_sequence_block, get_sequence_block,
# make_seq_block, make_seqblock, find_seqblock
# Options : _ for debugging.
# # for debugging.
# m= for margin length of the seqblock
# t= for threshold
# l= for min seqlet length
#
# Returns :
# Argument :
# Category :
# Version : 1.4
#--------------------------------------------------------------
sub get_seqblock{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($connect_gap, @seq_frag, %digitized, $verbose, %hash, $best_block_opt);
my $margin=3;
my $threshold=0.8;
my $min_seqlet_size=25;
$connect_gap=5;
my @vars=keys %vars;
for($i=0; $i< @vars; $i++){
if($vars[$i] eq 'm'){
$margin=$margin_ori=$vars{$vars[$i]};
}elsif($vars[$i] eq 't'){
$threshold=$vars{$vars[$i]};
}elsif($vars[$i] eq 'l'){
$min_seqlet_size=$vars{$vars[$i]};
}elsif($vars[$i] eq 'c'){
$connect_gap=$vars{$vars[$i]} if( defined($vars{$vars[$i]}) );
}
}
if($char_opt=~/b/){ $best_block_opt='b' }
if($char_opt=~/r/){ $range_in_name='r' }
if($char_opt=~/c/){ $connect_opt ='c' }
if($char_opt=~/v/){ $verbose='v' }
for($o=0; $o<@hash; $o++){
%hash=%{$hash[$o]};
%digitized=%{&convert_char_to_0_or_1_hash($hash[$o])};
}
%added=%{&add_columns(\%digitized)}; # 11111 + 1010101 => 2121211
&show_hash(\%added) if ($debug==1);
%blocks=%{&get_high_score_blocks(\%added,
"m=$margin", "t=$threshold", "l=$min_seqlet_size", $verbose,
"c=$connect_gap", $connect_opt, $best_block_opt, $range_in_name)};
my @keys=keys %blocks;
for($e=0; $e< @keys; $e++){
my $range="$keys[$e]\-$blocks{$keys[$e]}";
my $seq_let_leng=$blocks{$keys[$e]} - $keys[$e] + 1;
if($seq_let_leng < $min_seqlet_size){
next;
}else{
push(@RANGE, $range);
}
}
@seq_frag=&get_seq_fragments(\%hash, @RANGE,
"l=$min_seqlet_size", "$range_in_name");
return(\@seq_frag);
}
#______________________________________________________________
# Title : add_columns
# Usage :
# Function :
# Example :
# Warning : if the attached name is too long(over 12 char),
# it changes to 'Added_upX' while X is a numb.
# Keywords : add_seq_columns, add_sequence_columns,
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.2
#--------------------------------------------------------------
sub add_columns{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my $gap_char=' ';
for($i=0; $i<@hash; $i++){
my %hash=%{$hash[$i]};
my @names=keys %hash;
my %final_hash_out;
my @final_added;
my $out_seq_name='Add';
for($j=0; $j<@names; $j++){
$out_seq_name.= "_$names[$j]";
my $string =$hash{$names[$j]};
my @ar_string;
if($string=~/\d{1,5}[ \,]\d{1,5}[ \,]\d{1,5}/){
@ar_string =split(/$gap_char|\,/, $string );
}elsif($string=~/^\d{5,}$/){
@ar_string =split(//, $string );
}
for($s=0; $s < @ar_string; $s++){
$final_added[$s]=$ar_string[$s]+$final_added[$s];
}
}
if(length($out_seq_name) > 12){ $out_seq_name="Added_up${i}"; }
$final_hash_out{$out_seq_name}=join("$gap_char", @final_added);
push(@OUT_HASH, \%final_hash_out);
}
wantarray ? return(@OUT_HASH) : return($OUT_HASH[0]);
}
#____________________________________________________________________
# Title : get_high_score_blocks
# Usage : get_high_score_blocks(<ref. of hash for number string>)
# Function : gets hash of key and number string and filters out the
# number string region which is below certain threshold
# determined inside this sub and returns a selected high
# number regions
# Example : %block_start_end=%{&get_high_score_blocks(\%input_numb_block)};
# %out=%{&get_high_score_blocks(\%inp_numbs, 'v', 'b')};
# Warning : This assumes that the inputs are multiply aligned seq
# Keywords : high_scoring_regions
# get_high_scoring_blocks, find_blocks, get_blocks
# Options : _ for debugging.
# # for debugging.
# b for best_block_opt, returns best block only
# v for showing the final range hash output
# c for connect close blocks
# c= for connect close blocks with specific closing gap size
# m= for margin length of the seqblock
# t= for threshold
# l= for min seqlet length
#
# Returns :
# Argument : accepts one single ref. of hash
# Category :
# Version : 1.4
#--------------------------------------------------------------------
sub get_high_score_blocks{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my $min_seqblock_leng=25;
my $threshold=0.8;
my (%block_start_end, @possible_block, %hash, $range_in_name,$connect_gap);
my $margin=$margin_ori=2; # $margin is m in .....mmm111111111111mmm.....
my $gap_char=' ';
my @vars=keys %vars;
my $connection_gap=5;
my $connect_opt=1;
for($i=0; $i< @vars; $i++){
if($vars[$i] eq 'm'){
$margin=$margin_ori=$vars{$vars[$i]};
}elsif($vars[$i] eq 't'){
$threshold=$vars{$vars[$i]};
}elsif($vars[$i] eq 'l'){
$min_seqblock_leng=$vars{$vars[$i]};
}elsif($vars[$i] eq 'c'){
$connect_opt='c';
$connection_gap=$vars{$vars[$i]};
#print "\n get_high_score_blocks: \$connection_gap is $connection_gap\n";
}
}
if($char_opt=~/c/){ $connect_opt='c' }
if($char_opt=~/r/){ $range_in_name='r' }
if($char_opt=~/v/){
print "\n \$threshold is $threshold ";
print "\n \$margin is $margin ";
print "\n \$min_seqblock_leng is $min_seqblock_leng \n\n";
print "\n \$connection_gap is $connection_gap \n\n";
print "\n \$connect_opt is $connect_opt \n\n";
}
for($i=0; $i<@hash; $i++){
my @range;
my %hash_ori=%{$hash[$i]};
my @names=keys %hash_ori;
if(@names>1){ # If the hash has multi entry, make one added up hash
%hash=%{&add_columns(\%hash_ori)};
@names=keys %hash;
}else{ %hash=%hash_ori; }
for($j=0; $j< @names; $j++){
my $string=$hash{$names[$j]};
if($string=~/\d{1,5}[ \,]\d{1,5}[ \,]\d{1,5}/){
@ar_string =split(/$gap_char|\,/, $string );
}elsif($string=~/^\d{4,}$/){ ## the string should be minimum 4 length
@ar_string =split(//, $string );
}
my $largest = ${&get_largest_element(\@ar_string)};
my $cut_line=$largest*$threshold;
#print "\n \$cutline in get_high_score_blocks is $cut_line \n" if $debug==1;
#~~~~~~~~~~~~ Cutting the tops The core algorythm #######
for($s=0; $s< @ar_string; $s++){
if($ar_string[$s] > $cut_line){ # possible_block is the increasing seqlet
if(@possible_block == $min_seqblock_leng){
while( $ar_string[$s] > $cut_line){
$ar_string[$s]=1;
while($s+1+$margin > @ar_string){ $margin-- }
push(@possible_block, ($s+1+$margin));
$margin=$margin_ori;
$s++;
}
$ar_string[$s]=0; #<--- Should be 0 than 1
$block_start_end{$possible_block[0]}=$possible_block[$#possible_block];
@possible_block=();
}else{
$ar_string[$s]=1;
while(($s+1-$margin) < 0){ $margin-- };
push(@possible_block, ($s+1-$margin) );
$margin=$margin_ori;
}
}elsif($ar_string[$s] <= $cut_line){
$ar_string[$s]=0;
@possible_block=();
}
}
#print "\n", @ar_string,"\n" if $debug==1;
#~~~~~~~~~~~~ Cutting the tops The core algorythm #######
}
}
#print "\n@ar_string\n";
#&show_hash(\%block_start_end);
&show_hash(\%block_start_end) if($char_opt=~/v/);
#~~~~~~~~~~~~ Connecting blocks ~~~~~~~~~~~~~~~~~~~~~~
if($connect_opt=~/c/){
my @keys=sort numerically keys %block_start_end;
sub numerically{ $a <=> $b; }
### sorting the %block_start_end
for($i=0; $i< @keys; $i++){
push(@block_s_e, $keys[$i], $block_start_end{$keys[$i]});
}
for($i=1; $i< $#block_s_e; $i++){ ## must be $#block_s_e to stop
$first_end =$block_s_e[$i]; ## before it removes everything
$second_start=$block_s_e[$i+1];
#""""""" if gap is smaller than connection_gap given """"""""
if($connection_gap > ($second_start-$first_end) ){
splice(@block_s_e, $i, 2);
$i--;
}else{
$i++; # to skip to the next start correctly
}
}
%block_start_end=@block_s_e;
}
print "\n# Blocks start and end after connection(gap was $connection_gap)\n" if($char_opt=~/v/);
&show_hash(\%block_start_end) if($char_opt=~/v/);
#~~~~~~~~~~~~ Getting the largest ~~~~~~~~~~~~~~~~~~~~~~~
if($char_opt=~/b/){
print "\n# Getting the largest block only to get all the blocks use a opt\n";
my @keys=keys %block_start_end;
my ($largest, %largest,$range_size, $largest_key);
for($i=0; $i< @keys; $i++){
$range_size = $block_start_end{$keys[$i]}-$keys[$i];
if($range_size > $largest){
$largest=$range_size;
$largest_key=$keys[$i];
#print "\n $largest_key \n";
}
}
$largest{$largest_key}=$block_start_end{$largest_key};
%block_start_end=%largest;
print "\n# The best block chosen (from to) \n" if($char_opt=~/v/);
&show_hash(\%block_start_end) if($char_opt=~/v/);
}
return(\%block_start_end);
}
#______________________________________________________________
# Title : delbut
# Usage : delbut *.zip (delete files except xxxx.zip)
# Function :
# Example :
# Warning :
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.2
#--------------------------------------------------------------
sub delbut{
my $i;
@save_files=@{$_[0]} || @_;
for(@save_files){
unless(-e $_){
print "\n\n \"$_\" does not exist, so nothing is deleted\n\n";
print chr(7);
exit;
}
}
my @files=@{&read_file_names_only('.')};
my @del_files=@{&subtract_array(\@files, \@save_files)};
for($i=0; $i< @del_files; $i++){
system("rm -f $del_files[$i]");
}
print "\n\n Subdirs are never deleted \n\n";
}
#______________________________________________________________
# Title : get_msp_range
# Usage : @range=@{&get_msp_range($seqlet)};
# @temp=&get_msp_range($seqlet);
#
# Function :
# Example :
# Warning :
# Keywords : get_msp_file_ranges
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.5
#--------------------------------------------------------------
sub get_msp_range{
my $lines1=${$_[0]} || $_[0];
my ($SEQ, $num_seq, $matched_SEQ, @Ranges);
if($lines1 =~/^ *\d+ +\d+\.?[e\-\d]* +(\d+) +(\d+) +(\S+) +(\d+) +(\d+) +(\S+)/){
$SEQ =$3;
$matched_SEQ=$6;
if($SEQ eq $matched_SEQ){ ## skipping self match
$num_seq++;
}else{
@Ranges=($1, $2, $4, $5); ## <-- example. (10-20, 30-45)
}
}
return wantarray ? (\@Ranges, \$SEQ, \$matched_SEQ): \@Ranges;
}
#______________________________________________________________
# Title : get_msp_enquiry_sequence
# Usage :
# Function : gets the name of sequence used as enquiry(target)
# Example :
# Warning :
# Keywords : get_msp_target_sequence, get_msp_enquiry_sequence_name
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------
sub get_msp_enquiry_sequence{
my $lines1=${$_[0]} || $_[0];
my ($SEQ, $matched_SEQ);
if($lines1 =~/^ *\d+ +\d+\.?[e\-\d]* +(\d+) +(\d+) +(\S+) +(\d+) +(\d+) +(\S+)/){
$SEQ =$3;
$matched_SEQ=$6;
}
return \$SEQ;
}
#______________________________________________________________
# Title : get_msp_matched_sequence
# Usage :
# Function : gets the name of sequence used as enquiry(target)
# Example :
# Warning :
# Keywords : get_msp_matched_sequence_name
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------
sub get_msp_matched_sequence{
my $lines1=${$_[0]} || $_[0];
my ($SEQ, $matched_SEQ);
if($lines1 =~/^ *\d+ +\d+\.?[e\-\d]* +(\d+) +(\d+) +(\S+) +(\d+) +(\d+) +(\S+)/){
$SEQ =$3;
$matched_SEQ=$6;
}
return \$matched_SEQ;
}
#______________________________________________________________
# Title : get_linked_sequence
# Function : opens msp file and links the sequences according
# to the matches.
# Usage :
# Example : seq1 ------------------------------
# |||||||||||
# seq2 --------------------------------
# OUT 000000000011111111111000000000000000000
#
# Warning :
# Keywords : link_sequence_from_msp_file, linked_sequenced_length
# get_clustered_sequence_length, get_annexed_sequence_length
# connect_sequences, merge_sequences, combine_sequences
# Options : _ for debugging.
# # for debugging.
# Returns : A ref. of an array
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------
sub get_linked_sequence{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my ($final_leng, $start_diff,@MSP, %seq_sizes, $final_leng);
my ($max_head_overhang, $head_diff, $tail_diff,
$off_set, $max_tail_overhang, @LINKED, $LINKED);
my $Threshold=40;
for($i=0; $i< @file; $i++){
my($input_file) = ${$file[$i]} || $file[$i];
if($debug eq 1){ print "\n inputfile is $input_file\n" };
unless (-e $input_file){
print chr(7);
print "\n\n\t This is sub open_msp_files in $0 \n\n";
print "\t Fatal: The input file $input_file is not in the directory \n";
}
my %seq_sizes=%{&open_msp_files(\$input_file, '-s')};
my @NAmes=keys %seq_sizes;
for($s=0; $s< @NAmes; $s++){ # making '000000000000000.....';
my $len=$seq_sizes{$NAmes[$s]};
for($t=0; $t< $len; $t++){
${"$NAmes[$s]"}[$t]=0;
}
}
open(FILE_1,"$input_file");
my @MSP=<FILE_1>;
close(FILE_1);
for($j=0; $j<@MSP; $j++){
if($MSP[$j]=~/^ *(\d+) +\d+\.?[e\-\d]* +(\d+) +(\d+) +(\w+) +(\d+) +(\d+) +(\w+) +(.+)/i){
if(($1 >= $Threshold)&& ($4 eq $7)){
push(@matched_members, $4);
}elsif(($1 >= $Threshold)&& ($4 ne $7)){
$matched_segment_count++;
if($match_name ne $7){ push(@matched_members, $7); }
$query_start=$2-1; $query_end =$3-1;
$query_seq =$4; $match_start=$5-1;
$match_end =$6-1; $desc =$8;
$match_name =$7;
for($x=$query_start; $x<= $query_end; $x++){
${"$query_name"}[$x]++;
}
for($y=$match_start; $y<= $match_end; $y++){
${"$match_name"}[$y]++;
}
}
}
}
for($j=0; $j<@MSP; $j++){
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# $1 $2 $3 $4 $5 $6 $7 $8
# 171 41.18 6 73 HI1690 9 76 HI0736 sodium...
#,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
if($MSP[$j]=~/^ *(\d+) +\d+\.?[e\-\d]* +(\d+) +(\d+) +(\w+) +(\d+) +(\d+) +(\w+) +(.+)/i){
if(($1 >= $Threshold)&& ($4 eq $7)){
$query_name=$4; $query_leng=$3;
push(@matched_members, $4);
}elsif(($1 >= $Threshold)&& ($4 ne $7)){
$matched_segment_count++;
if($match_name ne $7){ push(@matched_members, $7); }
$query_start=$2-1; $query_end =$3-1;
$query_seq =$4; $match_start=$5-1;
$match_end =$6-1; $desc =$8;
$match_name =$7;
}
@matched_seq_array= @{"$match_name"};
$matched_seq_array=join('', @matched_seq_array);
my $start_diff= $query_start - $match_start;
if($start_diff >= 0){
my $tail_diff= $start_diff + $seq_sizes{$match_name} - $seq_sizes{$query_name};
if($tail_diff > 0){
$max_tail_overhang = $tail_diff if $tail_diff > $max_tail_overhang;
for($x=0; $x< $tail_diff; $x++){
$tail_start=$seq_sizes{$match_name}-$tail_diff + $x;
@matched_seq_array=split(//,$matched_seq_array);
$tail_array[$x] +=$matched_seq_array[$tail_start];
}
}
}elsif($start_diff < 0){
$head_diff = abs($start_diff);
$max_head_overhang=$head_diff if $head_diff > $max_head_overhang;
for($z=0; $z< $head_diff; $z++){
$head_array[$z] += ${"$match_name"}[$z];
}
}
}
}
}
@LINKED=( @{"$match_name"}[0..($max_head_overhang-1)], @{"$query_name"}, @tail_array);
$LINKED=join('', @LINKED);
if($debug eq 1){
print __LINE__, " In open_msp_files \%sequence is", %sequence ,"\n";
}
$final_offset=$extened_number_line - $query_leng;
return(\@LINKED);
}
#______________________________________________________________________________
# Title : get_averaged_prediction
# Usage : %av_for_back_pred=%{&get_averaged_prediction(\%sec1, \%sec_rv)};
# Function : The content of out %average is
# $averaged{$position}=[$residue1, $sec_str2, $dif_reliability];
# Example :
# Keywords : get_average_predator_prediction, average_predator_prediction
# get_averaged_sec_prediction, get_average_prediction
# Options :
# $reverse_order_of_one_hash=r by r
# $give_weight_with_good_match=w by w # this is to give preference to well
# $weight_factor= by w=
# matching sec. str. I add '0.1'
# Author : jong@salt2.med.harvard.edu sat@mrc-lmb.cam.ac.uk
# Version : 1.2
#------------------------------------------------------------------------------
sub get_averaged_prediction{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($position, $residue1, $residue2, %averaged, %hash1, %hash2, $sec_str1,
$sec_str2, $sum_reliability, $average_reliability, $dif_reliability,
$reverse_order_of_one_hash, $give_weight_with_good_match, $weight_factor);
if($char_opt=~/r/){ $reverse_order_of_one_hash='r' }
if($char_opt=~/w/){ $give_weight_with_good_match='w'; $weight_factor=0.05 }
if($vars{'w'}=~/(\S+)/){ $give_weight_with_good_match='w'; $weight_factor=$1 }
for($i=0; $i< @hash; $i++){
unless(%averaged > 2){
%hash1=%{$hash[$i]};
%hash2=%{$hash[$i+1]};
$i++;
}elsif( %averaged ){
%hash1=%averaged;
%hash2=%{$hash[$i]};
}
@keys=sort { $a <=> $b} keys %hash1;
for($k=0; $k< @keys; $k++){
$position =$keys[$k]; ## in case of predator prediction, key is the position of residue
$rev_posi =@keys-$position+1;
unless($reverse_order_of_one_hash){
$rev_posi=$position;
}
$residue1 =$hash1{$position}->[0];
$residue2 =$hash2{$rev_posi}->[0];
if($residue1=~/$residue2/i){ # if they are the same aa
$sec_str1 =$hash1{$position}->[1]; ## usually one of 'c h e'
$sec_str2 =$hash2{$rev_posi}->[1]; ## usually one of 'c h e'
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If both predictions are matching eg) H->H
#________________________________________________
if($sec_str1=~/$sec_str2/i){
$sum_reliability += ($hash1{$position}->[2] + $hash2{$rev_posi}->[2]);
$average_reliability=$sum_reliability/2 + $weight_factor; # usually 0.1
$averaged{$position}=[$residue1, $sec_str1, $average_reliability];
$average_reliability=$sum_reliability='';
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# If both predictions are NOT matching eg) E->H
#________________________________________________
else{
if($hash1{$position}->[2] > $hash2{$rev_posi}->[2]){
$dif_reliability = ( $hash1{$position}->[2] - $hash2{$rev_posi}->[2]);
$averaged{$position}=[$residue1, $sec_str1, $dif_reliability];
}else{
$dif_reliability = ( $hash2{$rev_posi}->[2] - $hash1{$position}->[2]);
$averaged{$position}=[$residue1, $sec_str2, $dif_reliability];
}
}
}
}
}
return(\%averaged);
}
#______________________________________________________________
# Title : get_average_sequence_size
# Usage :
# Function :
# Example :
# Warning :
# Keywords : get_av_sequence_size, get_average_seq_size
# get_av_seq_size, average_seq_size, av_seq_size
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------
sub get_average_sequence_size{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my( @OUT_AV, $size, @size, @names, %hash, $sum, $av);
for($i=0; $i<@hash; $i++){
%hash=%{$hash[$i]};
@names=keys %hash;
for($j=0; $j<@names; $j++){
$size=length($hash{$names[$j]});
push(@size, $size);
}
for($j=0; $j<@size; $j++){
$sum+=$size[$j];
}
$av=int($sum/@names);
push(@OUT_AV, $av);
}
wantarray ? \@OUT_AV : \$OUT_AV[0];
}
#______________________________________________________________
# Title : get_linux_kernel_version
# Usage :
# Function :
# Example :
# Warning :
# Keywords : get_kernel_version, kernel_version,
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------
sub get_linux_kernel_version {
my($image, $version, $i);
for($i=0; $i< @_; $i++){
if(ref($_[$i]) eq 'SCALAR'){
$image = ${$_[$i]};
}else{
$image = $_[$i];
}
unless(defined($image)){
if(-e '/vmlinuz'){
$image='/vmlinuz';
}elsif(-e '/boot/vmlinuz'){
$image='/boot/vmlinuz';
}elsif(-f '/boot/bvmlinux' ){
$image='/boot/bvmlinux';
}
}
print "\n# The final chosn \$image is $image\n";
my($str) = "phlogiston";
my($version_start) = 0;
my($version_length) = 10;
open(DATA, $image) or return(undef);
#seek(DATA, $version_start, 0);
while(<DATA>){
if(/(\d+\.\d+\.\d+)/){
$version=$1;
push(@versions, $version);
last;
}
}
close(DATA);
}
if(@versions > 1){
return(\@versions);
}else{
return(\$version)
}
}
#______________________________________________________________
# Title : load_mount_info
# Usage :
# Function :
# Example :
# Warning :
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------
sub load_mount_info {
undef %mounted;
undef %fs_type;
open(MTAB, "</etc/mtab") or die "Can't read /etc/mtab: $!\n";
while (<MTAB>) {
my($dev, $mp, $type) = split;
next if $dev eq 'none';
$mounted{$dev} = $mp;
$mounted{$mp} = $dev;
$fs_type{$dev} = $type;
}
close(MTAB);
}
#______________________________________________________________
# Title : plot_vertically
# Usage : &plot_vertically(\@query);
# Function : This is a sub used for plot_domains.pl for
# genome_analysis
# Example :
# Warning :
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------
sub plot_vertically{
@numbers=@{$_[0]};
print "\n |===================================================\>\n";
for($i=0; $i<@numbers;$i++){
printf ("%-4d\|", $i);
print "\*"x$numbers[$i], "\n";
}
print " |===================================================\>\n";
}
#______________________________________________________________
# Title : plot_histogram_horizontally
# Usage : &plot_horizontally(\@query);
# Function :
# Example :
# Input: $input= '00001111111113333333333444444444111111111111111';
#
# Output:
# 00001111111113333333333444444444111111111111111
# 1-------------------------------------------47
# |
# |
# | *********
# | *******************
# | *******************
# | *******************************************
# |-----------------------------------------------
#
# Warning :
# Keywords : plot_horizontally, plot_numbers_horizontally, plot,
# plot_numbers,
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.2
#--------------------------------------------------------------
sub plot_histogram_horizontally{
my @numbers=@{$_[0]};
my $leng= @numbers;
my ($largest, @inversed, $m, $i);
for($i=0; $i< @numbers; $i++){
$largest=$numbers[$i] if $largest < $numbers[$i];
}
for($i=0; $i< @numbers; $i++){ # this inverse the digits
$inversed[$i]=abs($numbers[$i]-$largest);
}
print "\n ", @numbers;
print "\n 1", "\-"x($leng-4),$leng;
print "\n\|";
print "\n\|";
for($m=0; $m< $largest; $m++){
print "\n\|";
for($i=0; $i<@inversed;$i++){
if($inversed[$i] > 0){
print " ";
$inversed[$i]--;
}else{
print "\*";
}
}
}
print "\n\|", "\-"x@numbers;
print "\n";
}
#______________________________________________________________
# Title : condense_number_string
# Usage :
# Function : condenses the numbers by making an average with
# given factor. If the factor is 2 on number seq
# 1334284425 , result will be 23543
# 133428442 , 23541 <-- preserved end
# Factor 3 =>
# 133428442 , (1+3+3)/3 = 2
# (4+2+8)/3 = 4,,,
# Example : @output=@{&condense_number_string(\@input, $factor)};
# with @input=qw(1 2 4 10 10 22 2 3 44 2 3); and $factor=3
# Warning :
# Keywords : compact_number_string, compact_digits, condense
# condense_string
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------
sub condense_number_string{
my @ARRAY=@{$_[0]};
my $factor = ${$_[1]} || $_[1];
my ( $i, $j, @out );
unless(defined($factor)){ $factor=1 }
for($i=0; $i< @ARRAY; $i+=$factor){
my $temp_sum;
for($j=$i; $j < ($factor+$i); $j++){
$temp_sum+=$ARRAY[$j]
}
push(@out, int($temp_sum/$factor) );
}
return(\@out);
}
#___________________________________________________________
# Title : get_seq_fragments
# Usage : @seq_frag=&get_seq_fragments(\%msf, @RANGE);
# Function : gets sequence(string) segments with defined
# ranges.
# Example :
# %test=('seq1', '1234AAAAAAAAAAAaaaaa', 'seq2', '1234BBBBBBB');
# @range = ('1-4', '5-8');
#
# %out = %{&get_seq_fragments(\%test, \@range)};
# %out => (seq1_5-8 AAAAA
# seq2_5-8 BBBBB
# seq1_1-4 1234
# seq2_1-4 1234 )
#
# Warning :
# Keywords : get_sequence_fragments,
# Options : _ for debugging.
# # for debugging.
# l= for min seqlet length
# r for adding ranges in the seq names
#
# Returns :
# Argument :
# Category :
# Version : 1.8
#-------------------------------------------------------
sub get_seq_fragments{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my $min_seqlet_size=10;
my @vars=keys %vars;
my $no_range_in_name=1;
for($i=0; $i< @vars; $i++){
if($vars[$i] eq 'l'){
$min_seqlet_size=$vars{$vars[$i]};
}
}
if($char_opt=~/v/){ print "\n \$char_opt is $char_opt @char_opt\n"; }
if($char_opt=~/n/){ $no_range_in_name = 1 }
if($char_opt=~/r/){ $no_range_in_name = 0 }
print "\nget_seq_fragments \$no_range_in_name is $no_range_in_name \n";
for($i=0; $i< @hash; $i++){
my (%out_frag, $frag_name, $range_start, $range_end, @out_hash);
my %seqs = %{$hash[$i]};
my @names = keys %seqs;
if(@names==1){
for($j=0; $j < @names; $j++){
my $seq_name = $names[$j];
my $seq = $seqs{$seq_name};
for($k=0; $k< @range; $k++){
my $range = $range[$k];
if($no_range_in_name==1){
$frag_name = "$seq_name";
}else{
$frag_name = "$seq_name\_$range";
}
#if(length($frag_name)>14 ){
# $frag_name ='x'."${j}_${range}";
#}
($range_start, $range_end)=$range=~/(\d+\.?\d*)\-(\d+\.?\d*)/;
my $frag_len = $range_end-$range_start+1;
if($frag_len < $min_seqlet_size){
next;
}
my $fragment = substr($seq, $range_start-1, $frag_len);
$out_frag{$frag_name}=$fragment;
}
}
push(@out_hash, \%out_frag);
}elsif(@names > 1){
for($k=0; $k< @range; $k++){
my %out_frag=();
my $range=$range[$k];
($range_start, $range_end)=$range=~/(\d+\.?\d*)\-(\d+\.?\d*)/;
my $frag_len = $range_end-$range_start+1;
if($frag_len < $min_seqlet_size){
next;
}
for($j=0; $j < @names; $j++){
my $seq_name=$names[$j];
my $seq = $seqs{$seq_name};
if($no_range_in_name==1){
$frag_name = "$seq_name";
}else{
$frag_name = "$seq_name\_$range";
}
#if(length($frag_name)>15 ){
# $frag_name ='x'."${j}_${range}";
#}
if($range_start==0){ $range_start++; } ## This is a bugfix
my $fragment = substr($seq, $range_start-1, $frag_len);
$out_frag{$frag_name}=$fragment;
}
push(@out_hash, \%out_frag);
}
}
}
if(@out_hash > 1){ return(@out_hash)
}elsif(@out_hash==1){ return($out_hash[0]) }
}
#________________________________________________________________________
# Title : make_standalone_subroutines
# Usage : &make_standalone_subroutines(@ARGV);
# Example : &make_standalone_subroutines(@ARGV);
# Function : Creates each subroutine derived xxx.pl file from Bio.pl or any
# given library file. If there is a file for a sub already, it
# skips.
# Class : Utility
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub make_standalone_subroutines{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($each_sub, %out_subs, %left_out, @lib, $ver, $real_sub_entry_found);
$|=1;
for($i=0; $i < @file; $i++){
open(LIB_FILE, "<$file[$i]")|| die "\n $file[$i] <- $! \n";
my @lib =<LIB_FILE>;
FOR: for($j=0; $j < @lib; $j++){
my (%out_subs, $each_sub);
my $title_found;
#"""" Taking the headbox """""""""""""
if( ($lib[$j]=~/^#+[_\-\*]{10,120} *$/)
&&($lib[$j+1]=~/^(# *title *: *([\w\-]+))[^\.pl]/i) ){
$each_sub=$2;
$title_found =1;
if( (-s "$each_sub\.pl") > 200 ){
print (-s "$each_sub\.pl"), " ";
print " $each_sub", " exists \n";
next FOR;
}elsif((-s "$each_sub\.pl") <= 200){
my $temp;
open (TEMP, "<$each_sub\.pl");
while(<TEMP>){
if(/^#[_\-\*]{10,120} *$/){ $temp++ }
elsif(/^# *title *: *[\w\-]+[^\.pl]/i ){
$temp++;
}elsif(/^# *\w+/){
$temp=$temp+0.5;
}
}
if($temp >2){
next FOR;
}
}
$out_subs{"$each_sub"}.="$lib[$j]$1\n";
$j+=2;
until( ($lib[$j]=~/^sub *\w+ *\{/)||($lib[$j]=~/^#---+ *$/) ||
($lib[$j]=~/^#_____+ *$/) || ($lib[$j]=~/^#\*\*+ *$/) ){
$lib[$j]=~s/( *)$//; #<-- removing ending space
$out_subs{"$each_sub"}.="$lib[$j]";
$j++;
}
$out_subs{"$each_sub"}.="$lib[$j]";
$j++; ## essential to remove #------------- line
}
#"""""""" Reading sub { } """""""
if( ($title_found==1)&&($lib[$j]=~/^sub +([\w\-]+) *\{/) ){
$each_sub=$1;
$out_subs{"$each_sub"}.="$lib[$j]";
if($lib[$j]=~/^sub +([\w\-]+) *\{.+\}/){
goto WRITE;
}
$j++;
until($lib[$j]=~/^\}/){
$out_subs{"$each_sub"}.="$lib[$j]"; $j++;
}
$out_subs{"$each_sub"}.="$lib[$j]"; ## to fetch '}'
$j++;
WRITE:
open (EACH_FILE, ">$each_sub\.pl");
print EACH_FILE "#\!\/perl\n";
print EACH_FILE "# Made by $0 at: ", `date`, "\n";
print EACH_FILE $out_subs{$each_sub};
close EACH_FILE;
%out_subs=();
#chmod
}
}
}#""""""""""""" end of for (@file)
close LIB_FILE;
}
#___________________________________________________________
# Title : is_html
# Usage :
# Function : Checks if it is an html file.
# Example : $html=&is_html(\@test);
# Warning :
# Keywords :
# Options : _ for debugging.
# # for debugging.
# Returns :
# Argument :
# Category :
# Version : 1.0
#-------------------------------------------------------
sub is_html{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my $html=0;
if( @string >0 ){
for($i=0; $i<@string; $i++){
if($string[$i]=~/^[.\n]{0, 100}\< *HTML *\>/i){
$html_head=1;
}if($string[$i]=~/[.\n]+\< *\/HTML *\>[\n.]{0,100}$/i){
$html_end=1;
}
}
if( ($html_head eq $html_end)&&($html_end=1)){
$html=1;
}
if($debug==1){ print "\n \@string is @string\n"; }
}elsif(@file>0){
for($i=0; $i< @file; $i++){
my $all_lines;
open(F, "$file[$i]");
while(<F>){
$all_lines.=$_;
}
print "\n All the lines of $file[$i] is $all_lines\n" if $debug==1;
if($all_lines =~/\< *HTML *\>/i){
if($all_lines=~/\< *\/ *HTML *\>/i){
$html=1;
print "\n html matched $html\n" if $debug ==1;
}
}
}
}elsif( @array>0 ){
@arr = @{$array[$i]};
for($i=0; $i< @arr; $i++){
if($arr[$i]=~/^[.\n]{0, 100}\< *HTML *\>/i){
$html_head=1;
}if($arr[$i]=~/[.\n]+\< *\/HTML *\>[\n.]{0,100}$/i){
$html_end=1;
}
}
if( ($html_head eq $html_end)&&($html_end=1) ){
$html=1;
}
}elsif(@hash>0){
for($i=0; $i< @hash; $i++){
@hash=%{$hash[$i]};
for($i=0; $i< @hash; $i++){
if($hash[$i]=~/^[.\n]{0, 100}\< *HTML *\>/i){
$html_head=1;
}if($hash[$i]=~/[.\n]+\< *\/HTML *\>[\n.]{0,100}$/i){
$html_end=1;
}
}
}
}
return($html);
}
#___________________________________________________________________
# Title : get_column
# Usage : &get_column(\@ar, 1,2 ,3);
# &get_column(\%ha, 1,2 ,3);
# &get_column(@ARGV);
# # where prompt is like: column.pl temp.txt 1 2 3 4
# Function : Prints any specified columns, can change order of them,
# can filter values of columns to filter (max or min value)
# Skipps blank line.
# Example : For getting only necessary columns
# Input: %Hash=(1, 'col1 col2 col3',
# 2, 'col1 col2 col3',
# 3, 'col1 col2 col3');
# input format: &get_column(\%Hash, 3,2,1, 'k'); # k is opt
# Ouput format: STDOUT as
#
# 1 col3 col2 col1
# 2 col3 col2 col1
# 3 col3 col2 col1
#
# Keywords : columns, column.pl, column, get_columns, take_columns,
# Options : # for debugging.
# _ for debugging.
# k for Key print when hash input is given.
# n for no first line display(Handy when you have title line
# and wanna remove it)
# ?max?=xxx for filtering column numbers by maximum of xxx
# ?min?=yyy for filtering column numbers by minimum of yyy
# (eg, min4=100000 means 4th column minimum is 100000)
# (eg, 1min4=10, 2min3=10, means get 4th column values
# below 10 as the first output column. Get 3rd
# column values below 10 as the second out column.
#
# $combine = 1 by -c c # c is for combining columns in different files
# $ignore = 1 by -i i # i is for ignoring leng diff in columns over 1 input
#
# Returns : Ref of
# Argument : Ref of Hash, Array or just filename, and wanted column numbers.
# Category :
# Version : 1.6
#---------------------------------------------------------------
sub get_column{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my (%whole, $previous, @out, @columns);
my $len =4;
my @v_keys= sort keys %vars; ## To be able to exchange order of column.
if($char_opt=~/i/i){
$ignore =1;
}elsif($char_opt=~/c/i){
$combine=1;
}
if(@v_keys > 0){
for($i=0; $i< @v_keys; $i++){
if($v_keys[$i]=~/\d*(m..)(\d+)/i){
$M=$1.$2;
${"$1$2"}= $vars{$&};
push(@columns, $2);
}
}
}else{
@columns=@num_opt;
}
my $troubled_column ;
if((@num_opt==0)&&(@file>1)){ $combine=1 }; # when no column num. is given assume $combine
#""""""""""" When combine option is set """"""""""""""""""""""
if(($combine==1)&&(@file > 1)){
for ($f=0; $f<@file; $f++){
open(IN, "$file[$f]"); # real showing is now.
my @all_lines=<IN>;
if((@all_lines != $previous)&&($ignore !=1)){
print "\n The column lengths do not match in the inputs\n";
print "\n you can use -i option \n";
exit;
}
$previous=@all_lines;
for($w=0; $w< @all_lines; $w++){
if($all_lines[$w]=~/^[\t ]*$/){ next } # skipping blank line
chomp($all_lines[$w]);
$out[$w].="$all_lines[$w] ";
}
close IN;
}
push(@array, \@out);
}
###### File is given as input #######""""""""""""""""""""""""""""""""""""""
if((@file >=1)&&(@array < 1)){
my $file;
for $file(@file){
my ($line_num, $line_read);
my $change=0.1;
open(IN, "$file");
my @all_lines=<IN>;
for($q=0; $q < @all_lines; $q++){ # This open is only for getting largest column width size
$line_read++;
@splited=split(/ +/, $all_lines[$q]);
$l=${&get_longest_str_size(\@splited)};
if($l>$len){ $len=$l; $change++ }
if( ($line_read/$change) > 50 ){ last } # this is to check the column
} # consistency and stops after some
if($debug==1){
print "\n$line_read lines read to get right column size\n";
}
close(IN);
my $line_counter;
for($x=0; $x < @all_lines; $x++){
if($all_lines[$x]=~/^[\t \#]*$/){ next } # skipping blank line and comment lines
if($all_lines[$x]=~/^#/){ next } # skipping comment lines
$line_counter++;
if(($char_opt=~/n/i)&&($line_counter==1)){ next } ## NO title #
if(@M=$all_lines[$x]=~/(\S+)/g){
if(@columns < 1){
for($n=0; $n< @M; $n++){
$columns[$n]=$n+1;
}
}
for($r=0; $r< @columns; $r++){ # columns is from num_opt which is given at prompt (like 3 2 1)
$col =$columns[$r]-1; #
if( defined( ${"max$col"} ) && # when max or min is defined
defined( ${"min$col"} ) ){
if( ( ${"max$col"} > $M[$col])&&
( ${"min$col"} < $M[$col]) ){
printf "%-${len}s ",$M[$col] unless($M[$col] eq '');
}else{
printf "%-${len}s ";
}
}elsif( defined(${"max$col"}) ){ #--- When max and min are not defined.----#
if( ${"max$col"} > $M[$col] ){
printf "%-${len}s ",$M[$col] unless($M[$col] eq '');
}else{
printf "%-${len}s ";
}
}elsif( defined(${"min$col"}) ){
if( ${"min$col"} < $M[$col] ){
printf "%-${len}s ",$M[$col] unless($M[$col] eq '');
}else{
printf "%-${len}s ";
}
}else{
printf "%-${len}s ",$M[$col] unless($M[$col] eq '');
}
}
print "\n";
}
} #for
}
} ###### Array is given as input #######""""""""""""""""""""""""""""""""""
elsif(@array>0){ # if input is ('x y xx y','k t yy zz',,,,)
for($t=0; $t<@array; $t++){
my @arr=@{$array[$t]};
print "\n Array input\n" if $debug ==1;
for($i=0; $i<@arr;$i++){
@splited=split(/ +/,$arr[$i]);
$l=${&get_longest_str_size(\@splited)};
$len=$l if $l>$len;
}
for($i=0; $i< @arr; $i++){
if($arr[$i]=~/^[\t ]*$/){ splice(@arr, $i, 1); $i--; next }
if(($char_opt=~/n/i)&&($i==0)){ next } # skipping the first line
if(@M=$arr[$i]=~/(\S+)/g){
if(@columns < 1){
for($n=0; $n< @M; $n++){
$columns[$n]=$n+1;
}
}
for($j=0; $j< @columns; $j++){
$col =$columns[$j]-1; #
if( defined( ${"max$col"} ) && # when max or min is defined
defined( ${"min$col"} ) ){
if( ( ${"max$col"} > $M["$col"] )&&
( ${"min$col"} < $M[$col]) ){
printf "%-${len}s ",$M["$col"] unless($M["$col"] eq '');
}else{
printf "%-${len}s ";
}
}elsif( defined(${"max$col"}) ){ #--- When max and min are not defined.----#
if( ${"max$col"} > $M["$col"] ){
printf "%-${len}s ",$M["$col"] unless($M["$col"] eq '');
}else{
printf "%-${len}s ";
}
}elsif( defined(${"min$col"}) ){
if( ${"min$col"} < $M["$col"] ){
printf "%-${len}s ",$M["$col"] unless($M["$col"] eq '');
}else{
printf "%-${len}s ";
}
}else{
printf "%-${len}s ",$M["$col"] unless($M["$col"] eq '');
}
}
print "\n";
}
}
}
} ##### Hash is given as input #######""""""""""""""""""""""""""""""""""
elsif(@hash>0){
my @arr;
for($h=0; $h<@hash; $h++){
my @array=values %{$hash[$h]};
my @keys =keys %{$hash[$h]};
for($i=0; $i< @array; $i++){ # getting the longest str size
@arr=split(/ +/,$array[$i]);
$l=${&get_longest_str_size(\@arr)};
$len=$l if $l>$len;
}
for($i=0; $i< @array; $i++){
if($array[$i]=~/^[\t ]*$/){ splice(@array, $i, 1); $i--; next }
if(($char_opt=~/n/i)&&($i==0)){ next } # skipping the first line
printf "%-10s", $keys[$i] if($char_opt=~/k/i); ## Option for key printing
if(@M=$array[$i]=~/(\S+)/g){
if(@columns < 1){
for($n=0; $n< @M; $n++){
$columns[$n]=$n+1;
}
}
for($j=0; $j< @columns; $j++){
$col =$columns[$j]-1;
if( defined( ${"max$col"} ) && # when max or min is defined
defined( ${"min$col"} ) ){
if( ( ${"max$col"} > $M[$col])&&
( ${"min$col"} < $M[$col]) ){
printf "%-${len}s ",$M[$col] unless($M[$col] eq '');
}else{
printf "%-${len}s ";
}
}elsif( defined(${"max$col"}) ){ #--- When max and min are not defined.----#
if( ${"max$col"} > $M[$col] ){
printf "%-${len}s ",$M[$col] unless($M[$col] eq '');
}else{
printf "%-${len}s ";
}
}elsif( defined(${"min$col"}) ){
if( ${"min$col"} < $M[$col] ){
printf "%-${len}s ",$M[$col] unless($M[$col] eq '');
}else{
printf "%-${len}s ";
}
}else{
printf "%-${len}s ",$M[$col] unless($M[$col] eq '');
}
}
print "\n";
}
}
}
}
}
#____________________________________________________________________
# Title : write_sdb_file
# Function : gets a hash ref. and writes the SDB file with 'sprintf'
# Usage : @out=@{&write_sdb_file(\%seq)};
# Example : @out=@{&write_sdb_file(\%seq, 'v')}; ## for STDOUT as well
# ___________________________________________________________________________
# Title : EST_YEAST.sdb
# Full Name : Telomerase_yeast_699aa
# Nicknames :
# EMBL :
# PDB :
# Swissprot :
#
# Argument : \%ref_of_seq
# Keywords : write_sdb
# Returns :
# Options : v for verbose representation. This will print boxes on STDOUT
# n for no '#' leader.
# e for Endline( '-----------------------------..' )
# Version : 1.1
#---------------------------------------------------------------
sub write_sdb_file{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($Commont_Symbol, $delimiter, $Enclosed_came, $end_found, $end_line, $entry,
$length, $line, $name, $name_found, $name_found, $num,
$original_dir, $output, $out_string, $pre, $pwd, $start_line, $string, $string1,
$temp, $title_found, $type_DSSP, @arg_output, @Final_out, @k, @keys, @names, @out, @out_hash,
@out_hash_final, @output_box, @outref, @read_files, @str1, @str2, @string1,
%correct_head_box_entry, %Final_out, %hash, %input, %out_hash_final
);
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($Entry_length) =9 ;
my($VL) =60; ## key length( like in # Title ) ## value length
$num =80;
if($char_opt =~ /n/i){
$Commont_Symbol=' '; ## Comment symbol. For help display, you can change into ' '
}else{
$Commont_Symbol=''; # Comment symbol. Default head_box display.
}
for($x=0; $x < @hash; $x++){
my(%input) = %{$hash[$x]}; my(@keys)= sort (keys %input); my(@out);
#''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
## PUTTING an order in the printout entries. To make 'Title' come first
#''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''''
for($i=0; $i < @keys; $i++){
if($keys[$i]=~/^Title/i){
$temp=$keys[0]; $keys[0]=$keys[$i]; $keys[$i]=$temp;
}elsif($keys[$i]=~/^Enclosed?/i){
$temp=$keys[$#keys]; $keys[$#keys]=$keys[$i]; $keys[$i]=$temp;
}elsif($keys[$i]=~/^Usage$/i){
$temp=$keys[1]; $keys[1]=$keys[$i]; $keys[$i]=$temp;
}elsif($keys[$i]=~/^Function/i){
$temp=$keys[2]; $keys[2]=$keys[$i]; $keys[$i]=$temp;
}elsif($keys[$i]=~/^Example/i){
$temp=$keys[3]; $keys[3]=$keys[$i]; $keys[$i]=$temp;
}elsif($keys[$i]=~/^Version/i){
$temp=$keys[$#keys-2]; $keys[$#keys-2]=$keys[$i]; $keys[$i]=$temp;
#### To make null version value to '1.0'
if($input{$keys[$#keys-2]}=~/^ *$/){ $input{$keys[$#keys-2]}='1.0'; }
}elsif($keys[$i]=~/^Warning/i){
$temp2=$keys[$#keys-1]; $keys[$#keys-1]=$keys[$i]; $keys[$i]=$temp2;
}
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
## Writing starting line
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($start_line) = "$Commont_Symbol".'_'x"$num"."\n";
if( $char_opt =~ /v/i){
print $start_line; } # Prints to STDOUT,
my($Enclosed_came); ## <<-- This should be HERE !
$Entry_length=${&get_longest_str_size(\@keys)};
for( $i =0; $i < @keys; $i++){ #### @keys has been sorted before.
my($Len) = length($input{$keys[$i]});
my $delimiter = ':';
my($entry) = $keys[$i];
$entry =~ s#^\S#(($tmp = $&) =~ tr/[a-z]/[A-Z]/,$tmp)#e; ## capitalizing word
if($entry=~/^Enclosed?$/i){ $Enclosed_came = 1; }
my(@input) = split(/\n+/, $input{$keys[$i]});
if(@input > 0){
for($j =0; $j < @input; $j++){
## If NO entry name(blank) is given ##
if($j > 0){ ## If the value is a multi line.
$entry = ''; $delimiter=' '; }
if( $char_opt =~ /v/i){
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
### This is to reduce the entry length of Enclosed content lines ##
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if( ($Enclosed_came==1)&&($entry eq '') ){
$Entry_length=2; $VL=80; }
printf("$Commont_Symbol %-${Entry_length}s $delimiter %-${VL}s\n", $entry , $input[$j]);
}
if(($Enclosed_came==1)&&($entry eq '')){ $Entry_length=2; $VL=80; }
$out[$k++]=sprintf("$Commont_Symbol %-${Entry_length}s $delimiter %-${VL}s\n", $entry,$input[$j]);
if($entry=~/^Enclosed?/){ $Enclosed_came = 1; } }}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
## If the entries have null descriptions, just print entries ######
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
elsif(@input ==0){
if( $char_opt =~ /v/i){
printf("$Commont_Symbol %-${Entry_length}s $delimiter %-${VL}s\n", $entry, ' '); }
$out[$k++]=sprintf("$Commont_Symbol %-${Entry_length}s $delimiter %-${VL}s\n", $entry, ' ');
}
}
############################################################
## Writing Ending line ##
############################################################
$end_line = "$Commont_Symbol".'-'x"$num"."\n";
if( ($char_opt =~ /v/i)&&($char_opt =~ /e/i) ){ print $end_line; }
if( $char_opt =~ /e/i){ push(@out, $end_line) }
unshift(@out, $start_line);
push(@Final_out, \@out);
}
if(@Final_out > 1){ @Final_out; }
elsif( @Final_out==1){ $Final_out[0] }
} #<--- END of write_sdb_file
#________________________________________________________________________
# Title : push_if_not_already
# Usage : @out=@{&push_if_not_already(\@mother_array, \@adding_array )};
# @out=@{&push_if_not_already(\@mother_array, $adding_scalar)};
# Function : returns ref. of an array for a list of non-repetitive entry.
# Example :
# Warning :
# Keywords : add_if_not_already, add_element_if_not_already, if_not_already
# add_element_if_not_already, push_element_if_not_already,
# if_no_already_push, put_element_if_not_already, add_new_element
# add_new_items_only, push_new_items_only, push_new_elements_only
# put_if_not_already,
# Options :
# Returns : a ref. of an array.
# Argument : two references. The first should be an array ref. The 2nd can be either
# scalar or array reference.
# Category :
# Version : 1.3
#--------------------------------------------------------------------
sub push_if_not_already{
my($already_in, $already, $i, @push_items_given);
my(@out_array)=@{$_[0]};
if(ref($_[0]) ne 'ARRAY'){ print "\n push_if_not_array need ref\n"; exit; }
push(@push_items_given, ${$_[1]}) if(ref($_[1]) eq 'SCALAR');
@push_items_given=@{$_[1]} if(ref($_[1]) eq 'ARRAY');
for $already (@out_array){ ## This for is to remove repetitive
for ($i=0; $i< @push_items_given; $i++){
if($already eq $push_items_given[$i]){ splice(@push_items_given,$i, 1); $i--; }
}
}
push(@out_array, @push_items_given);
return(\@out_array);
}
#_______________________________________________________________
# Title : compare_sec_template_with_db
# Usage :
# Function :
# Example :
# Warning :
# Keywords : sec structure mapping, map sec str, map_sec_structure
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#-----------------------------------------------------------
sub compare_sec_template_with_db{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
$ref_target_hash = shift @hash;
%target = %{$ref_target_hash};
@names = keys %target;
$name_target = $names[0];
if($name_target =~/\w+(\d+)/){ $ori_target_seq_len = length($1); }
@target_frag = split(/ +/, $target{$name_target} );
for($i =0; $i< @hash; $i ++){
%db=%{$hash[$i]};
@db_name = keys %db;
for($j=0; $j < @db_name; $j ++){
$name = $db_name[$j];
@db_frag = split( / +/, $db{$name} );
for($k=0; $k < @target_frag; $k ++){
if( ($target_frag[$k]=~/H(\d+)/i)&&($db_frag[$k]=~/H(\d+)/i) ){
$simple_match_output{$name}++;
$leng_diff = abs($1 - $2)/15;
$simple_match_output{$name} = $simple_match_output{$name}- $leng_diff;
}elsif( ($target_frag[$k]=~/E(\d+)/i)&&($db_frag[$k]=~/H(\d+)/i) ){
$simple_match_output{$name}--;
#$leng_diff = abs($1 - $2)/10;
#$simple_match_output{$name} = $simple_match_output{$name}- $leng_diff;
}elsif( ($target_frag[$k]=~/H(\d+)/i)&&($db_frag[$k]=~/E(\d+)/i) ){
$simple_match_output{$name}--;
#$leng_diff = abs($1 - $2)/10;
#$simple_match_output{$name} = $simple_match_output{$name}- $leng_diff;
}elsif( ($target_frag[$k]=~/E(\d+)/i)&&($db_frag[$k]=~/E(\d+)/i) ){
$simple_match_output{$name}++;
$leng_diff = abs($1 - $2)/15;
$simple_match_output{$name} = $simple_match_output{$name}- $leng_diff;
}
}
}
}
return(\%simple_match_output);
}
#___________________________________________________________________
# Title : get_peptide_occurance
# Usage :
# Function : gets the number of occurances of peptide(with given size) for
# any number of sequences given.
# Example : %stat=%{&get_peptide_occurance(\%pro_sequence, $size)};
# while %pro_sequence has one or more sequences like
# seq1 AAAAAAAAAAAA, seq2 BBBBBBBBBBBBBB, ...
# $size is number. For dipeptide=2, tripeptide=3, tetrapep=4...
# Warning :
# Keywords :
# Options :
# Returns :
# Argument : eg=> (\%ref_hash, 4)
# Category :
# Version : 1.2
#---------------------------------------------------------------
sub get_peptide_occurance{
my($k, $i, $s, $peptide, $pep_entry_num, @name, %stat);
%all=%{$_[0]};
$size=$_[1];
@name=keys %all;
for($k=0; $k<@name; $k++){
if($all{$name[$k]}=~/[BZX]/i){ next;
}else{
$seq_leng += length($all{$name[$k]});
my @seq=split(//, $all{$name[$k]});
my $all_occur_pep;
for($i=0; $i< (@seq-($size-1)); $i ++){
my $peptide;
for($s=0; $s < $size; $s++){
$peptide .= $seq[$i+$s];
}
$stat{$peptide}++;
$all_occur_pep ++;
print "\n$peptide $stat{$peptide}" if $debug==1;
$pep_entry_num=keys %stat;
if( ($debug==1)&&($pep_entry_num%100 == 0 ) ){
print "\n Present peptide entries are: $pep_entry_num out of $all_occur_pep residues \n";
}
}
}
}
return(\%stat);
}
#___________________________________________________________________
# Title : open_lottery_file
# Usage :
# Function :
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.1
#---------------------------------------------------------------
sub open_lottery_file{
my ($i, @lines, @bin1, @bin2, @bin3, @bin4, @bin5, @bin6, @bonus, @allbins);
open(F, "$_[0]");
if($_[1] =~/\-a/i){ $single_array_return=1 };
@lines = <F>;
for($i=0; $i< @lines; $i++){
if($lines[$i]=~/ {1,3}(\d\d) {1,3}(\d\d) {1,3}(\d\d) {1,3}(\d\d) {1,3}(\d\d) {1,3}(\d\d) {1,2}(\d\d)/){
push(@bin1, $1);
#print $1, "\n";
push(@bin2, $2);
push(@bin3, $3);
push(@bin4, $4);
push(@bin5, $5);
push(@bin6, $6);
push(@bonus, $7);
}
}
if($single_array_return == 1){ @allbins=( @bin1, @bin2, @bin3, @bin4, @bin5, @bin6, @bonus); }
else{ return(\@bin1, \@bin2, \@bin3, \@bin4, \@bin5, \@bin6, \@bonus); }
}
#_________________________________________________________________
# Title : get_probable_half
# Usage :
# Function : This produces a hash ref. which is supposed to be most probable
# according to the given array. It divides array into halves
# gets the more probable half until it gets one single number.
# Example :
# Warning :
# Keywords : get_frequent_halves,
# Options :
# Returns :
# Argument : \@array
# Category :
# Version : 1.0
#-----------------------------------------------------------------
sub get_probable_half{
my (%hash, $i, @keys, @values);
%hash=%{$_[0]};
@keys= sort {$a<=$b} keys %hash;
@values= values %hash;
print "\n Hash is ",%hash,"\n";
if(@keys == 1){
return(\%hash); last;
}elsif(@keys >1){
if((@keys % 2) != 0){ # make the number even
$keys[@keys]=$keys[$#keys];
}
@first_half=@keys[0..(@keys/2-1)];
@second_half=@keys[(@keys/2)..$#keys];
print "@first_half", "\n";
print "@second_half", "\n";
my($sum1, $sum2, %hash1, %hash2);
for($i=0; $i<(@keys/2); $i++){
$sum1 +=$hash{$first_half[$i]};
$sum2 +=$hash{$second_half[$i]};
$hash1{$first_half[$i]} =$hash{$first_half[$i]};
$hash2{$second_half[$i]}=$hash{$second_half[$i]};
}
if($sum1 > $sum2){ &get_probable_half(\%hash1);}
else{ &get_probable_half(\%hash2);}
}
}
#_______________________________________________________________
# Title : divide_array
# Usage : &show_array(÷_array(\@input, 6));
# Function : divides any array to the denominator given.
# If you give array of 100 elem, with 5, you will
# get 5 arrays with 20 elem each.
# Example :
# Warning :
# Keywords : split_array_into_pieces, split_array, chop_array,
# fragment_array,
# Options : s= for dividing the array with sub array size
# eg) to get 20 elem length sub arrays from
# a big array
# @ar_ref=@{÷_array(\@array, 's=20')};
# Returns :
# Argument :
# Category :
# Version : 1.4
#-----------------------------------------------------------
sub divide_array{
my ($size,$remaining, $size_div, $s);
my @array = @{$_[0]};
my @final_array_ref=();
if(ref($_[1])){
if(${$_[1]}=~/^\d+$/){
$denominator = ${$_[1]};
}elsif(${$_[1]}=~/s=(\d+)$/i){
$size=$1;
$size_div=1;
}
}elsif($_[1]=~/^\d+$/){
$denominator = $_[1];
}elsif($_[1]=~/s=(\d+)$/i){
$size=$1;
$size_div=1;
}
if((@_ ==1)&&($denominator == 0)){
print "\n Denominator is 0, error, setting to 1\n";
$denominator = 1;
}
if($size_div==1){
while(@array){
push(@final_array_ref, [splice(@array, 0, $size)]);
}
}else{
my $frag_ar_size = int(@array/$denominator);
if($debug eq 1){ print "\n Frag arr size is : $frag_ar_size \n" }
$remaining = @array % $denominator;
if($debug eq 1){ print "\n Remnant elem size is : $remaining \n" }
for($i=0; $i < $denominator; $i++){
if($remaining > 0){
push(@final_array_ref, [splice(@array, 0, ($frag_ar_size+1),)] );
$remaining --;
}elsif(($remaining == 0)&&(@array>0)){
push(@final_array_ref, [splice(@array, 0, ($frag_ar_size),)] );
}
}
}
return(\@final_array_ref);
}
#__________________________________________________________________________
# Title : split_fasta_files
# Usage : @names_of_single_files=@{&split_fasta_files(\@files)};
# Function :
# Example :
# Keywords : divide_fasta_files, split_fasta_db_files, divide_fasta_db_files
# make_single_fasta_files, write_single_fasta, write_single_fasta_files
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#----------------------------------------------------------------------------
sub split_fasta_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my (@each_single_seq_names, $out_single_fasta_file_name );
for($i=0; $i< @file; $i++){
open(FASTA, "$file[$i]");
while(<FASTA>){
if(/\> *(\S+)/){
$out_single_fasta_file_name="$1\.fa";
open(OUT_SINGLE, ">$out_single_fasta_file_name");
print OUT_SINGLE $_;
push(@each_single_seq_names, $1);
}elsif(/\w+/){
print OUT_SINGLE $_;
}
}
close OUT_SINGLE;
}
return(\@each_single_seq_names);
}
#______________________________________________________________________________
# Title : split_files
# Usage :
# Function :
# Example :
# Keywords :
# Options :
# $division_factor= by d=
# Author : jong@salt2.med.harvard.edu,
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub split_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($division_factor, $base_name);
$division_factor=2;
if($vars{'d'}=~/(\d+)/){ $division_factor=$1 };
for($i=0; $i< @file; $i++){
open(FILE, $file[$i]);
$base_name=${&get_base_names($file[$i])};
$extension=${&get_extension_names($file[$i])};
$total_line_num=@lines=<FILE>;
$splited_frag_size=int(@lines/$division_factor);
for($j=0; $j< $division_factor; $j++){
#$frag_file="$base_name\_s${j}\.split${j}";
$frag_file="$base_name\_s${j}\.$extension";
push(@splited_files, $frag_file);
open(FRAGMENT_FILE, ">$frag_file");
for($k=0; $k<= $splited_frag_size; $k++){
print FRAGMENT_FILE $lines[$k];
}
splice(@lines, 0, $splited_frag_size);
}
print FRAGMENT_FILE @lines; # Writing the very last remaining lines
close (FRAGMENT_FILE);
}
return(\@splited_files);
}
#_______________________________________________________________
# Title : split_sequence
# Usage : %out=%{&split_sequence(\%input, 2 )};
# Function : divides any string to the denominator given.
# Example : &show_array( ÷_string(\%input, 3) );
# while $input is 'seq', '12345789ABCDEFHIJKLMN'
# The output will be 'seq_1_half', '1234578'
# 'seq_2_half', '9ABCDEF'
# 'seq_3_half', 'HIJKLMN'
# Warning :
# Keywords : divide_string, split_string, chop_string, divide_sequence
# split_sequence(look at separate split_sequence sub),
# Options :
# $reverse_second_half=S by S -S
# $reverse_first_half =F by F -F
# $reverse_rest =R by R -R ## reversing all except the first
# $reverse_all =A by A -A # reverse all the fragments
# Returns :
# Argument :
# Category :
# Version : 1.3
#-----------------------------------------------------------
sub split_sequence{
my(@string, $frag_str_size, $remaining, @hash, $i, $j,
@seq_names, $denominator, %input_seq, @final_hash_ref,
$frag, $reverse_second_half, $reverse_first_half,
$reverse_rest, $reverse_all);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Checking arguments
#________________________________________________
for($i=0; $i< @_; $i++){
if(ref($_[$i]) eq 'HASH'){
push(@hash, $_[$i]);
}elsif(ref($_[$i]) eq 'SCALAR'){
if(${$_[$i]}=~/^(\d+\.?\d*)$/){ # this can handle fraction number!!
$denominator = int($1); # like 9.5
}
}elsif($_[$i]=~/^(\d+\.?\d*)$/){ # this can handle fraction number!!
$denominator = int($1); # like 9.5
}elsif($_[$i]=~/F/){ # this can handle fraction number!!
$reverse_first_half='F';
}elsif($_[$i]=~/S/){ # this can handle fraction number!!
$reverse_second_half='S';
}elsif($_[$i]=~/R/){ # this can handle fraction number!!
$reverse_rest='R'; $reverse_second_half='s';
}elsif($_[$i]=~/A/){ # this can handle fraction number!!
$reverse_all='A'; $reverse_rest='r'; $reverse_first_half='f';
$reverse_second_half='S';
}elsif($_[$i]=~/\S/){
print "\n# $0: split_sequence, You put some strange stuff to me!!\n\n";
exit;
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
# When s option is given while denominator is over 2, set r option
#____________________________________________________________________
if($denominator > 2 and $reverse_second_half){
$reverse_rest='r';
}
if($denominator == 0){
print "\n Denominator is 0, error, setting to 1\n";
$denominator = 1;
}
print "\n# split_sequence: \$denominator is $denominator, with \@hash\n";
for($i=0; $i< @hash; $i++){
my %input_seq=%{$hash[$i]};
@seq_names=keys %input_seq;
my (%out_seq);
for($k=0; $k < @seq_names; $k++){
$seq_name=$seq_names[$k];
@string = split(//, $input_seq{ $seq_name });
$frag_str_size = int(@string/$denominator);
if($debug eq 1){ print "\n Frag str size is : $frag_str_size \n" }
$remaining = @string % $denominator;
for($j=0; $j < $denominator; $j++){
$frag_number=$j+1;
if($remaining > 0){
$frag=join('', splice(@string, 0, ($frag_str_size+1) ) );
$remaining --;
$seq_name_split="$seq_name\_$frag_number";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~```
# Before final, set the seq types
#_________________________________________________________________
if($frag_number==1 and $reverse_first_half){
$frag=reverse($frag);
$seq_name_split="$seq_name_split\_rv";
}elsif($frag_number==2 and $reverse_second_half){
$frag=reverse($frag);
$seq_name_split="$seq_name_split\_rv";
}elsif($frag_number >2 and $reverse_rest){
$frag=reverse($frag);
$seq_name_split="$seq_name_split\_rv";
}elsif($reverse_all){
$frag=reverse($frag);
$seq_name_split="$seq_name_split\_rv";
}
$out_seq{$seq_name_split}=$frag;
}elsif( $remaining == 0 and @string>0){
$seq_name_split="$seq_name\_$frag_number";
$frag=join('', splice(@string, 0, $frag_str_size,) );
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~```
# Before final, set the seq types
#_________________________________________________________________
if($frag_number==1 and $reverse_first_half){
$frag=reverse($frag);
$seq_name_split="$seq_name_split\_rv";
}elsif($frag_number==2 and $reverse_second_half){
$frag=reverse($frag);
$seq_name_split="$seq_name_split\_rv";
}elsif($frag_number >2 and $reverse_rest){
$frag=reverse($frag);
$seq_name_split="$seq_name_split\_rv";
}elsif($reverse_all){
$frag=reverse($frag);
$seq_name_split="$seq_name_split\_rv";
}
$out_seq{$seq_name_split}=$frag;
}
}
}
push(@final_hash_ref, \%out_seq);
}
wantarray? \@final_hash_ref : $final_hash_ref[0];
}
#_______________________________________________________________
# Title : divide_string
# Usage : &show_array(÷_string(\$input, 6));
# Function : divides any string to the denominator given.
# Example : &show_array( ÷_string(\$input, 3) );
# while $input is '12345789ABCDEFHIJKLMN'
# The output will be '1234578 9ABCDEF HIJKLMN'
# Warning :
# Keywords : divide_string, split_string, chop_string, divide_sequence
# split_sequence(look at separate split_sequence sub),
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.4
#-----------------------------------------------------------
sub divide_string{
my (@array, $i, $j, $denominator, @temp, @string, $frag_str_size,
$remaining, $frag);
for($i=0; $i< @_; $i++){
if(ref($_[$i]) eq 'ARRAY'){
push(@array, @{$_[$i]});
}elsif(ref($_[$i]) eq 'SCALAR'){
if(${$_[$i]} =~/^(\d+)$/){
$denominator = $1;
}else{
push(@array, ${$_[$i]});
}
}elsif($_[$i]=~/^(\d+\.?\d*)$/){ # this can handle fraction number!!
$denominator = int($1); # like 9.5
}else{
push(@array, $_[$i]);
}
}
if($denominator == 0){
print "\n Denominator is 0, error, setting to 1\n";
$denominator = 1;
}
for($i=0; $i< @array; $i++){
my @temp;
my @string = split(//, $array[$i]);
$frag_str_size = int(@string/$denominator);
if($debug eq 1){ print "\n Frag str size is : $frag_str_size \n" }
$remaining = @string % $denominator;
for($j=0; $j < $denominator; $j++){
if($remaining > 0){
$frag=join('', splice(@string, 0, ($frag_str_size+1) ) );
push(@temp, $frag);
$remaining --;
}elsif( $remaining == 0 and @string>0){
$frag=join('', splice(@string, 0, $frag_str_size,) );
push(@temp, $frag);
}
}
push(@final_array_ref, \@temp);
}
wantarray? \@final_array_ref : $final_array_ref[0];
}
#____________________________________________________________
# Title : write_html_headbox
# Usage : &write_html_headbox($outfilename, \%entries);
# Function : write html format headbox explanation with
# given hashes of headbox content.
# Example :
# Warning : It takes off the last '/' when $URL has it
#
# Keywords : write_headbox_html, write headbox in html,
# write_headbox_files
# Options : 'd' for date inclusion at the top of the page
# f= for default ftp dir name
#
# Category :
# Version : 1.7
#-----------------------------------------------------------
sub write_html_headbox{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(%in, $name, $name1);
my $URL='ftp://cyrah.med.harvard.edu/pub/Perl/';
if($vars{'f'}=~/(\S+)\/$/){ ## checking if 'f' opt is set(for default ftp dir name)
$URL=$1;
}
for ($f=0; $f<@file; $f++){
$output_file=$file[$f];
open(FILE, ">$output_file");
print FILE "\<html\>\n";
if($char_opt =~/d/i){
print FILE "\<H2 ALIGN=CENTER\>";
print FILE "\<FONT COLOR=\"#DC143C\"\>$file[$f]\<\/H2\>\<\/FONT\>\n";
print FILE "\<br\>\n";
print FILE "\<H6 ALIGN=RIGHT\>", `date`, "Created by \<A href=\"$0\.html\"\>$0\<\/A\>\<\/H6\> \<hr\>";
}
for($i=0; $i< @hash; $i++){
my %in = %{$hash[$i]};
my @keys = sort keys %in;
$name= $in{'Title'};
if($name=~/(\S+)\.pl/){ ## to prevent making 'xxxx.pl.pl'
$name1=$1;
}else{
$name1=$name;
}
print FILE "\<H3\>\<A href=\"$name1\.pl\.html\"\>$name1\<\/A\>\<\/H3\>";
print FILE "Download \<A href=\"${URL}\/$name1\.pl\"\>$name1\.pl\<\/A\>\n";
print FILE "\<pre\>\n";
for($j = 0; $j < @keys; $j ++){
if($keys[$j]=~/(title)/i){
}elsif( ($keys[$j]=~/\w+/i)&&( $in{$keys[$j]}=~/\w+/) ){
chomp( $in{$keys[$j]} );
printf FILE ("\<b\>%-10s\<\/b\> %s\n", $keys[$j], $in{$keys[$j]});
}
}
print FILE "\<\/pre\>";
print FILE "\<hr\>\n\n";
}
print FILE "\<\/html\>\n";
close FILE;
}
}
#________________________________________________________________________
# Title : open_sdb_files
# Usage : %entries = %{&open_sdb_files(\$file_to_read )};
# Function :
# Example : Output is something like
# ('Title', 'read_head_box', 'Tips', 'Use to parse doc', ...)
# Warning :
# Keywords : read_sdb_files,read_sdb,
# Options : 'b' for remove blank lines. This will remove all the entries
# with no descriptions
# Returns : A hash ref.
# Argument : One or None. If you give an argu. it should be a ref. of an ARRAY
# or a filename, or ref. of a filename.
# If no arg is given, it reads SELF, ie. the program itself.
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub open_sdb_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($title_found, %Final_out, $variable_string, $TITLE, $title, @keys,
$end_found, $line, $entry, $entry_match, $End_line_num, $remove_blank,
$title_entry_null, $end_found, $Enclosed_entry, $Enclosed_var,
$blank_counter, $title_entry_exist, $entry_value, $temp_W, $Warning_part
);
for($r=0; $r<@file; $r++){
open(SEQ_IN, "$file[$r]");
my @whole_file =<SEQ_IN>;
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
for($i=0; $i<@whole_file; $i++){
$whole_file[$i] =~ tr/\t/ {7}/; ## This is quite important to some parsing!!!
#"""""""""""""""""""""""""""""""""""""""""""
## The first and second line of box 1 ##
#"""""""""""""""""""""""""""""""""""""""""""
if( ($whole_file[$i]=~/^[_\*\-\/]{55,}$/)&& ## '______' is discarded
($whole_file[$i+1]=~/^ {0,4}([TitlNam]+e) {0,8}:? {0,20}(\S[\-\w\.:]*) *(Copyright.*)/i) ){
$TITLE = $1; $title = "$2\n"; $Final_out{'Warning'}.="$3\n";
$entry_match=$TITLE; ## The very first $entry_match is set to 'Title' to prevent null entry
if($TITLE =~ /^Title|Name$/i){ #
if( ($title=~/^\s+$/)||( $title eq "\n") ){
$title_entry_null =1; $title = ''; } }
$Final_out{$TITLE}=$title;
$title_found ++ ; $i++; ## << this is essential to prevent reading the same line again.
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
## The first and second line of box 2, #__________ or #**************
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
elsif( ($end_found != 1)&&($whole_file[$i]=~/^[_\*]{20,}$/)&&
($whole_file[$i+1]=~/^ *(\w{1,6}\s{0,2}\w+) {0,7}: {1,5}(.*) */i) ){
$title_found ++ ; $i++;
$entry_match=$1; $entry_value=$2;
$entry_match =~ s#^\S#(($tmp = $&) =~ tr/[a-z]/[A-Z]/,$tmp)#e; ## Capitalize words
$Final_out{$entry_match}.= "$entry_value\n";
last if $title_found > 1; next; }
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
## With PROPER entry 2 : descriptins like. 'Ussage : ssssssxxjkk kj'
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
elsif( ($end_found != 1)&&($title_found==1)&&
($whole_file[$i]=~ /^ {0,3}(\w{1,4}\s{0,2}\w{1,7}) {0,8}[:\)] {0,9}(\S.*) */i)){
$entry_match = $1;
$entry_value = $2;
$entry_match =~ s#^\S#(($tmp = $&) =~ tr/[a-z]/[A-Z]/,$tmp)#e;
$Final_out{$entry_match}.= "$entry_value\n"; }
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
# With proper entry 3 : descriptins like. 'Ussage :', But blank description ##
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
elsif( ($end_found != 1)&&($title_found==1)&&
($whole_file[$i]=~ /^ {0,3}(\w{1,4}\s{0,2}\w{1,7}) {0,8}[:\)]( {0,})$/i)){
$entry_match = $1;
$entry_value = $2;
$entry_match =~ s#^\S#(($tmp = $&) =~ tr/[a-z]/[A-Z]/,$tmp)#e;
$Final_out{$entry_match}.= " $entry_value\n"; }
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
### all space line matching ##
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
elsif( ($end_found != 1)&& ##<----- If blank line is matched. Take the line
($title_found==1)&&($whole_file[$i]=~/^ {0, 110}$/) ){
$blank_counter++;
if($blank_counter > 2){ $blank_counter--; }
else{ $Final_out{$entry_match}.= " \n"; } }
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
### Anything after 3 space to 14 positions eg: '# HHHHHHHHH'
### To match 'examples' etc. INC. ':'
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
elsif( ($end_found != 1)&&
($title_found==1)&&($whole_file[$i]=~/^( {0,50})(\S.+)/) ){
$Final_out{$entry_match}.= "$2\n"; $blank_counter=0; }
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
### Anything after 1 space to 11 positions ##
### To match 'examples' etc. EXC. ':' ##
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
elsif( ($end_found != 1)&&
($title_found==1)&&($whole_file[$i]=~/^ {0,16}([^:.]+)/) ){
$Final_out{$entry_match}.= "$1\n"; $blank_counter=0;}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
###-------End of the read_box reading--------##
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
elsif( ($title_found==1)&&
($whole_file[$i]=~ /^[\*\-_]{15,}/)){ ## to match '#-----..' or '#*******..'(Astrid's)
$End_line_num = $i; $end_found++;
last; }
} ## < End of for loop
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
### If title is not there at all ####
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
@keys=keys %Final_out;
for(@keys){
if(/^Title$/i){ ## No Entry of Title at all??
$TITLE =$&;
$title_entry_exist = 1;
if($Final_out{$_}=~/^ *$/){ ## if Title => Null or just space
$title_entry_null = 1; } } }
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
### When title entry is not there ####
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if( $title_entry_exist != 1){
for($s=$End_line_num+1; $s < $End_line_num+20; $s++){
if( $whole_file[$s] =~ /^sub {1,5}([\w\.]+) {0,6}\{/){
$Final_out{'Title'} = "$1\n"; last; }
elsif( $whole_file[$s] =~/^________________________________+/){
#######################################
## Uses running file name as titile ##
#######################################
$Final_out{'Title'} = "$0"; last;
}else{
#######################################
## Uses running file name as titile ##
#######################################
$Final_out{'Title'} = "$0";
}
}
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
### When title is blank ####
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
elsif($title_entry_null ==1){ ## It looks for 'sub xxxxx{ ' line to get title
### $End_line_num is the last line read.
for($s = $End_line_num+1; $s < $End_line_num+20; $s++){
if( $whole_file[$s] =~ /^sub {1,5}(\w+\.*\w*) {0,7}{/){
$Final_out{$TITLE} = "$1\n"; last; }
elsif( $whole_file[$s] =~/^#________________________+/){
#######################################
## Uses running file name as titile ##
#######################################
$Final_out{$TITLE} = "$0"; last;
}else{
#######################################
## Uses running file name as titile ##
#######################################
$Final_out{$TITLE} = "$0";
}
}
}
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
## Error handling, if no head box is found ####
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if($title_found < 1){ print "\nFatal: No headbox found by read_head_box2 sub.\n"; }
push(@ref_of_seq_entry, \%Final_out);
}
if(@ref_of_seq_entry > 1){
@ref_of_seq_entry;
}else{ return (\%Final_out) }
}
#______________________________________________________________________________
# Title : open_stride_dat_files
# Usage : @out=@{&open_stride_dat_files(@ARGV)};
# Function :
# Example :
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu
# Category :
# Version : 1.2
#------------------------------------------------------------------------------
sub open_stride_dat_files{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my( @residues, @sec_str, $seq_name, $seq, %stride_sec_str_hash,
$stride_dat_file, @stride_sec_str_hash_final);
for($i=0; $i< @file; $i++){
my $stride_dat_file=$file[$i];
print "\n# (INFO) \$stride_dat_file $stride_dat_file is being opened\n";
open(STRIDE_DAT_FILE, $stride_dat_file);
while(<STRIDE_DAT_FILE>){
if(/^NM[\t ]+(\S+)\.brk/){
$seq_name=$1;
}elsif(/^NM[\t ]+(\S+)/){
$seq_name=$1;
}elsif(/^ *SQ +(\S+)/){
$seq=$1;
}elsif(/^ *SS +(\S+)/){
$sec_str=$1;
my %stride_sec_str_hash;
@residues=split(//, $seq);
@sec_str=split(//, $sec_str);
for($j=0; $j < @residues; $j++){
$stride_sec_str_hash{$j}=[$residues[$j], $sec_str[$j], '1.0', "$seq_name"];
}
push(@stride_sec_str_hash_final, \%stride_sec_str_hash);
}
}
close(STRIDE_DAT_FILE);
}
if(@stride_sec_str_hash_final > 1){
return(\@stride_sec_str_hash_final);
}elsif(@stride_sec_str_hash_final==1){
return(\%stride_sec_str_hash);
}
}
#_______________________________________________________________
# Title : get_pdb_file_start_number
# Usage :
# Function :
# Example :
# Warning :
# Keywords : start_number_of_pdb, startnumber, start number of PDB,
# get_start_number_of_pdb_file,
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#-----------------------------------------------------------
sub get_pdb_file_start_number{
my($start_number, $pdb_file);
if( ref($_[0]) ){
$pdb_file = ${$_[0]};
}else{
$pdb_file = $_[0];
}
open (PDB_FILE, "$pdb_file");
while(<PDB_FILE>){
if(/^ATOM 1 +\w+ +\w+ +[\w]* +(\d+) +/){
$start_number = $1; last;
}
}
\$start_number;
}
#_______________________________________________________________
# Title : write_modeller_top_file
# Usage : &write_modeller_top_file(\%hash, [v]);
# Function : Writes Modeller command file format.
# Example :
# $modelname = 'gfct';
# $template = '1ovt';
# %hash=($modelname, $template);
# &write_modeller_top_file(\%hash);
# Warning :
# Keywords :
# Options : v for verbose. You will get STDOUT of the result as well as file
# Returns : a file of xxxx.top form.
# Argument : 1 hash ref which has model name and template name -> (\%hash)
# while %hash is (modelname, tempalatename)
# Category :
# Version : 1.0
#-----------------------------------------------------------
sub write_modeller_top_file{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
format MODELLER_TOP_FORMAT =
INCLUDE
SET ALNFILE = @<<<<<<<<<<<<<<<<
$ali_file
SET KNOWNS = @<<<<<<<<<<<<<<<<
$pdb_file
SET SEQUENCE = @<<<<<<<<<<<<<<<<
$model
SET ATOM_FILES_DIRECTORY = './:../atom_files'
SET STARTING_MODEL = 1
SET ENDING_MODEL = 1
CALL ROUTINE = 'model'
.
format STDOUT =
INCLUDE
SET ALNFILE = @<<<<<<<<<<<<<<<<
$ali_file
SET KNOWNS = @<<<<<<<<<<<<<<<<
$pdb_file
SET SEQUENCE = @<<<<<<<<<<<<<<<<
$model
SET ATOM_FILES_DIRECTORY = './:../atom_files'
SET STARTING_MODEL = 1
SET ENDING_MODEL = 1
CALL ROUTINE = 'model'
.
########## Program starts ####################
for($i=0; $i<@hash;$i++){
($model, $pdb_file) = each %{$hash[$i]};
$out_file = "$model.top";
$ali_file = "$model.ali";
open (MODELLER_TOP_FORMAT, ">$out_file");
$pdb1 = "$ENV{'PDB'}\/$pdb_file.brk";
$pdb2 = "$ENV{'PDB'}\/$pdb_file.pdb";
if( !(-e $pdb1 ) && !( -e $pdb2 ) ){
print "\n Error the file $pdb1 or $pdb2\n";
}
$model ="\'$model\'";
$pdb_file ="\'$pdb_file\'";
$ali_file ="\'$ali_file\'";
write MODELLER_TOP_FORMAT;
if( $char_opt=~/v/i){ write STDOUT; }
}
}
#_______________________________________________________________
# Title : write_modeller_ali_file
# Usage : &write_modeller_ali_file(\%model, \%template, [\$outfilename], [v]);
# Function : Writes Modeller alignment format.
# Example :
# $out = 'test.ali';
# %model = qw(model AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAccccccccccc);
# %template = qw(templ CCAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCC 3 42);
# &write_modeller_ali_file(\%model, \%template, \$out);
# Warning :
# Keywords :
# Options : You can put 2 numbers for the second set of key and element for
# the second hash input as the starting and ending points of
# template(i.e. pdb file seq). Unless I calculate the size of seq.
# By default, it reads PDB file defined by ENV setting of 'PDB' and
# gets the starting number of pdb. If starting number is defined
# explicitly at input hash, the given starting number is used instead
# of PDB's.
# v for verbose. You will get STDOUT of the result as well as file
# Returns : a file of xxxx.ali form.
# Argument : 2 ref. of hash for seq. and optional output.name and option(s).
# If second input hash (for template) has 3rd and 4th element which are
# numbers they are regarded as the starting and ending number of the
# template(i.e. pdb file seq)
# Category :
# Version : 1.0
#-----------------------------------------------------------
sub write_modeller_ali_file{
#""""""""""""""""""""""< handle_arguments{ head Ver 1.2 >""""""""""""""""""""""""""""""""
my(@A ) = &handle_arguments( @_ ); my( $num_opt )=${$A[7]};my( $char_opt )=${$A[8]};
my(@hash) =@{$A[0]};my(@file) =@{$A[4]};my(@dir ) =@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};my(@raw_string)=@{$A[9]};
my($i, $j, $c, $d, $e, $f, $g, $h, $k, $l, $p, $q, $r, $s, $t, $u, $v, $w, $x,$y,$z);
if($debug==1){ print " \@hash has \"@hash\"\n \@raw_string has \"@raw_string\"
\@array has \"@array\"\n \@char_opt has \"@char_opt\"\n \@file has \"@file\"\n"; }
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
format MODELLER_ALI_FORMAT =
>P1;@<<<<<<<<<<<<<<<
$name
structureX: @<<<<<<<<<<<<<<: @<<<: : @<<<<<: : @<<<<<<<<<<<<<: :
$name $start_seq $seq_leng $name
@*
$seq
*
.
format STDOUT =
>P1;@<<<<<<<<<<<<<<<
$name
structureX: @<<<<<<<<<<<<<<: @<<<: : @<<<<<: : @<<<<<<<<<<<<<: :
$name $start_seq $seq_leng $name
@*
$seq
*
.
########## Program starts ####################
if($file[0]){
open (MODELLER_ALI_FORMAT, ">$file[0]");
}else{
@seq = %{$hash[0]};
$name = $seq[0]; print "\n\$name is $name" if $debug ==1;
open (MODELLER_ALI_FORMAT, ">$name.ali");
}
for($i=0; $i<@hash;$i++){
@seq = %{$hash[$i]};
$name = $seq[0]; print "\n\$name is $name" if $debug ==1;
$seq = $seq[1]; print "\n\$seq is $seq" if $debug ==1;
$seq_leng = length($seq);
## checking PDB entry of the template ##
$pdb1 = "$ENV{'PDB'}\/$name.brk";
$pdb2 = "$ENV{'PDB'}\/$name.pdb";
if( (-e $pdb1 ) || ( -e $pdb2 ) ){
$pdb_file = $pdb1;
$start_seq = ${&get_pdb_file_start_number( $pdb_file )};
}
## Handling the starting and ending seq points of template
if( defined($seq[2])&&($seq[2]=~/^\d+/) ){
$start_seq=$seq[2]; $seq_leng+=$start_seq; }
elsif( defined($start_seq) ){
$seq_leng+=$start_seq; }
else{ $start_seq =1; }
if( (defined($seq[2])) &&($end_seq != $seq_leng ) ){
print "\n Your template seq length does not match with actual seq size
\n I will put the calculated value \"$seq_leng\" as the template length\n\n";
}
print "\n\$seq_leng is $seq_leng\n" if $debug ==1;
write MODELLER_ALI_FORMAT;
if( $char_opt=~/v/i){ write STDOUT; }
}
}
#_______________________________________________________________
# Title : make_template_from_sec_str
# Usage : %target = %{&make_template_from_sec_str(\%seq)};
# Function : makes template of sec. str. like: 'H5 E4 E2' out of '__HHHHH__EEEE__EE__'
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.1
#-----------------------------------------------------------
sub make_template_from_sec_str{
my(%out, @name, @fragments, %in, $name, $leng, $frag_seq, $name2);
#""""""""""""""""""""""< handle_arguments{ head Ver 1.1 >""""""""""""""""""""""""""""""""
my(@A ) = &handle_arguments( @_ ); my( $num_opt )=${$A[7]};my( $char_opt )=${$A[8]};
my(@hash) =@{$A[0]};my(@file) =@{$A[4]};my(@dir ) =@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};my(@raw_string)=@{$A[9]};
my($i, $j, $c, $d, $e, $f, $g, $h, $k, $l, $p, $q, $r, $s, $t, $u, $v, $w, $x,$y,$z);
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
for($j = 0; $j < @hash ; $j ++){
%in = %{$hash[$j]};
@name = keys %in;
print "\@name is @name\n", if($debug eq 1);
for($t=0; $t < @name; $t++){
$name = $name[$t];
print "\$name is $name\n", if($debug eq 1);
$leng=length($in{$name});
print "$leng\n", if($debug eq 1);
$name2 ="$name"."$leng"; # to attach sequence length
@fragments = split(/_+/, $in{$name});
print "\@fragments is @fragments\n", if($debug eq 1);
for($i = 0; $i < @fragments; $i++){
if($fragments[$i] =~/(\w)\w+/){
$fraglength = length($fragments[$i]);
$frag_seq .= "$1"."$fraglength "; # space is delimiter 'H5 E3 E5 E4'
print "\$frag_seq is $frag_seq\n", if($debug eq 1);
}
$out{$name2}=$frag_seq;
}
}
}
return(\%out);
}
#_______________________________________________________________
# Title : calculate_protein_volume
# Usage : %volumes=%{&calculate_protein_volume(\%seq)}
# Function :
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#-----------------------------------------------------------
sub calculate_protein_volume{
my %final_volume;
my %volume=("A","88.6" ,"C","108.5","D","111.1","E","138.4",
"F","189.9","G","60.1" ,"H","153.2","I","166.7",
"K","168.6","L","166.7","M","162.9","N","117.7",
"P","122.7","Q","143.9","R","173.4","S","89.0",
"T","116.1","V","140.0","W","227.8","Y","193.6",
"a","88.6" ,"c","108.5","d","111.1","e","138.4",
"f","189.9","g","60.1" ,"h","153.2","i","166.7",
"k","168.6","l","166.7","m","162.9","n","117.7",
"p","122.7","q","143.9","r","173.4","s","89.0",
"t","116.1","v","140.0","w","227.8","y","193.6");
#""""""""""""""""""""""< handle_arguments{ head Ver 1.1 >""""""""""""""""""""""""""""""""""""""
my(@A ) = &handle_arguments( @_ ); my( $num_opt )=${$A[7]};my( $char_opt )=${$A[8]};
my(@hash) =@{$A[0]};my(@file) =@{$A[4]};my(@dir ) =@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};my(@raw_string)=@{$A[9]};
my($i, $j, $c, $d, $e, $f, $g, $h, $k, $l, $p, $q, $r, $s, $t, $u, $v, $w, $x,$y,$z);
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my %seq_hash = %{$hash[0]};
my @names = keys %seq_hash;
#if($debug == 1){
print "\n ",__LINE__, " Seq names are @names\n";
# }
for( $i=0 ; $i < @names; $i++){
my @seq=split(//, $seq_hash{$names[$i]});
for( $j=0; $j < @seq; $j ++){
$final_volume{$names[$i]} += $volume{$seq[$j]};
}
}
\%final_volume;
}
#_______________________________________________________________
# Title : extract_words
# Usage : @words = @{&extract_words(\$string)};
# Function :
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.1
#-----------------------------------------------------------
sub extract_words{
#""""""""""""""""""""""< handle_arguments{ head Ver 1.1 >"""""""""""""""""""""""""""""""
my(@A ) = &handle_arguments( @_ ); my( $num_opt )=${$A[7]}; my( $char_opt )=${$A[8]};
my(@hash) =@{$A[0]}; my(@file) =@{$A[4]}; my(@dir ) =@{$A[3]}; my(@array)=@{$A[1]};
my(@string)=@{$A[2]}; my(@num_opt)=@{$A[5]}; my(@char_opt)=@{$A[6]}; my(@raw_string)=@{$A[9]};
my($i, $j, $c, $d, $e, $f, $g, $h, $k, $l, $p, $q, $r, $s, $t, $u, $v, $w, $x,$y,$z);
#"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
if($debug==1){
print __LINE__, " Args to extract_words are: \"@raw_string\"\n";
}
for($i=0; $i < @raw_string; $i ++){
if(ref($raw_string[$i]) eq 'SCALAR'){
$line = ${$raw_string[$i]};
}else{
$line = $raw_string[$i];
}
push( @words, split(/[\W\-\_]+/, $line) );
}
if($debug==1){
$num = @words;
print __LINE__, " Num of words are : \"$num\"\n";
}
\@words;
}
#________________________________________________________________________
# Title : replace_subroutines
# Usage :
# Function : replaces subroutines of given file(s) with supplied subs.
# Doesn't care version
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.1
#--------------------------------------------------------------------
sub replace_subroutines{
#"""""""""""""""""< handle_arguments{ head Ver 1.6 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my($i,$j,$c,$d,$e,$f,$g,$h,$k,
$l,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($each_sub, %out_subs, $VER, %left_out, @lib, $ver, $sub_name,
$real_sub_entry_found);
my %hash=%{&merge_hash(@hash)};
@array= keys %hash;
my @values= values %hash;
for($i=0; $i < @file; $i++){
open(LIB_FILE, "<$file[$i]")|| die "\n $file[$i] <- $! \n";
@lib =<LIB_FILE>;
for($j=0; $j < @lib; $j++){
for($s=0; $s < @array; $s++){
if($array[$s] =~/^([_a-zA-Z\-]+)(\d*\.*\d*)$/){
$sub_name=$1;
$VER =$2;
}
#"""" Taking the headbox """""""""""""
if( ($lib[$j]=~/^#[_\-\*]{10,130} *$/)&&($lib[$j+1]=~/^(# *title *: *$sub_name)[^\.pl]/i) ){
$real_sub_entry_found=0;
$out_subs{"$sub_name"}.="$lib[$j]$1\n";
$j+=2;
until( ($lib[$j]=~/^sub *\w+ *\{/)||($lib[$j]=~/^#[\-_\*]{10,130} *$/) ){
$lib[$j]=~s/( *)$//; #<-- removing ending space
#"""""""""""""""""""""""""""""""""""
# Taking version no.
#"""""""""""""""""""""""""""""""""""
if( ($char_opt !~ /nv/i) && ($lib[$j]=~/^# *version *: *([\d+\.\d+]*) */i) ){
if( $1=~/^[ ]*$/){ $ver = '1.0'; } ## make null to 1.0
elsif( $1=~/^(\d+)$/){ $ver = "$1\.0"; } ### make 2 to 2.0
elsif($1=~/^([\d+\.\d+]+)$/){ $ver = $1; } ## assign version
}
$out_subs{"$sub_name"}.="$lib[$j]";
$j++;
}
$out_subs{"$sub_name"}.="$lib[$j]";
$j++; ## essential to remove #------------- line
}
#"""""""" Reading sub { } """""""
if($lib[$j]=~/^sub +$sub_name *\{/){
$out_subs{"$sub_name"}.="$lib[$j]";
$j++;
until($lib[$j]=~/^\}/){
$out_subs{"$sub_name"}.="$lib[$j]"; $j++;
}
$out_subs{"$sub_name"}.="$lib[$j]"; ## to fetch '}'
$j++;
splice(@array, $s, 1); ## removing the subnames found
$s--;
unless(defined($ver)){ $ver = '1.0' }
unless($char_opt=~/nv/i){ ## if No version attachment option is set
$out_subs{"$sub_name$ver"}=$out_subs{$sub_name};
delete $out_subs{$sub_name};
}
}
}
$left_out{$file[$i]}.=$lib[$j]; ## Remnant file content of the operation
## just in case you want the left out ones.
}
close LIB_FILE;
open (LEFT_FILE, ">$file[$i]");
for($h= 0; $h < @values; $h++){### appending the new subs.
$left_out{$file[$i]} .= $values[$h];
}
print LEFT_FILE $left_out{$file[$i]};
close LEFT_FILE;
}#""""""""""""" end of for (@file)
@no_of_subs_fetched = keys %out_subs;
if(@array>0){
print chr(7);
print "\n# Following subs are not found in \"", "@file","\"\n ", "@array", "\n\n";
}
return( \%left_out ); # this has all the sub routines and other lines.
}
#______________________________________________________________________________
# Title : write_subroutines
# Usage : @out_file=@{&write_subroutines(\%head_box)};
# Function : Writes subroutine file xxxx.psub with given headbox including
# hash
# Example :
# Keywords :
# Options :
# Author : jong@salt2.med.harvard.edu
# Category :
# Version : 1.0
#------------------------------------------------------------------------------
sub write_subroutines{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(@final_output_file, $sub_title, $subroutine_file);
for($i=0; $i< @hash; $i++){
my %hash=%{$hash[$i]};
$sub_title=$hash{'Title'};
$subroutine_file="$sub_title\.psub";
open(SUBROUTINE_FILE, ">$subroutine_file");
push(@final_output_file, $subroutine_file);
#print SUBROUTINE_FILE "#\!\/usr\/bin\/perl\n";
@keys=qw(Title Function Usage Example Keywords Options Author Category Version);
print SUBROUTINE_FILE "#"."____"x20, "\n";
for($j=0; $j < @keys; $j++){
if($hash{$keys[$j]}){
printf SUBROUTINE_FILE ("# %-9s : %s\n", $keys[$j], $hash{$keys[$j]});
}
}
print SUBROUTINE_FILE "#"."----"x20, "\n";
print SUBROUTINE_FILE $hash{'Content'}, "\n";
close SUBROUTINE_FILE;
}
return(\@final_output_file);
}
#________________________________________________________________________
# Title : read_subroutines
# Usage : @out_subs=@{&read_subroutines(\@file, $separate_hash_entry_opt)}; or
# %out_subs=%{&read_subroutines(\@file)};
# Function : retunrns ALL subroutines with the keys as subroutine names
# with version like ('show_array2.2' => 'subroutine in one string')
# It reports the subroutines not found in searched file(s)
# Example :
# Warning :
# Keywords :
# Options : 'nv' for no version attachment in the keys of returning hash of subroutines
# 'r' for getting remnant file content rather than the sub routines
# 't' for leaving the original file without the sub routines taken.
# $separate_hash_entry_opt=s by s
# Returns :
# Argument :
# Category :
# Version : 1.2
#--------------------------------------------------------------------
sub read_subroutines{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($each_sub, %out_subs, %left_out, @lib, $ver, $real_sub_entry_found,
%final_out_subs, %out_subs, $separate_hash_entry_opt, $long_subname,
@final_separate_entry_out);
if($char_opt=~/s/){ $separate_hash_entry_opt = 's';
print "\n# (INFO) \$separate_hash_entry_opt opt is set"
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Parsing input files of perl programs
#_____________________________________________
for($i=0; $i < @file; $i++){
print "\n# (INFO) Opening $file[$i] to read subroutines\n";
open(LIB_FILE, "<$file[$i]")|| die "\n $file[$i] <- $! \n";
@lib =<LIB_FILE>;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
# This for loop does not allow return until each sub is finished
#_____________________________________________________________________
for($j=0; $j < @lib; $j++){
my(%out_subs_separate);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Reading the first delimiter line and 'Title' line altogether
#_______________________________________________________________
if( ($lib[$j]=~/^#+[_\-\*]{10,120} *$/)
and $lib[$j+1]=~/^(#+ *title *: *([\w\-\.]+))/i ){
$long_subname=$1;
$sub_name=$2;
if($sub_name=~/\.pl$/){ next } ## to avoid the very first headbox
if($separate_hash_entry_opt){
$out_subs_separate{'Title'}=$sub_name;
}else{
$out_subs{"$sub_name"}.="$lib[$j]$long_subname\n";
}
$j+=2;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Until I hit end of headbox or sub {} line
#________________________________________________
until( $lib[$j]=~/^#+[\-\*_]{10,160} *$/ or
$lib[$j]=~/^sub +\w[\w\.]+/ ){
$lib[$j]=~s/( *)$//; #<-- removing ending space
#"""""""""""""""""""""""""""""""""""
# Taking version no.
#___________________________________
if( ($char_opt !~ /nv/i) && ($lib[$j]=~/^# *version *: *([\d+\.\d+]*) */i) ){
if( $1=~/^[ ]*$/){ $ver = '1.0'; } ## make null to 1.0
elsif( $1=~/^(\d+)$/){ $ver = "$1\.0"; } ### make 2 to 2.0
elsif($1=~/^([\d+\.\d+]+)$/){ $ver = $1; } ## assign version
}
if($separate_hash_entry_opt and $lib[$j]=~/^# *(\S\S\S+) *: *(.*)$/){
$sub_entry=$1;
$out_subs_separate{$sub_entry}=$2;
$j++;
}elsif($separate_hash_entry_opt and $lib[$j]=~/^# +(.*)$/){
$out_subs_separate{$sub_entry}.=" $1\n";
$j++;
}else{
$out_subs{"$sub_name"}.="$lib[$j]";
$j++;
}
}
$out_subs{"$sub_name"}.="$lib[$j]";
$j++; ## essential to remove #------------- line
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Reading sub { } part , the Content part
#__________________________________________
if($lib[$j]=~/^sub {1,9}([\w\-]+) *\{/){
print "\n# (INFO) Found sub $1\{ \tline ";
$sub_name=$1;
$out_subs{"$sub_name"}.="$lib[$j]";
$out_subs_separate{'Content'}.="$lib[$j]";
$j++;
until($lib[$j]=~/^\}/){
$out_subs{"$sub_name"}.="$lib[$j]";
$out_subs_separate{'Content'}.="$lib[$j]";
$j++;
print "\n# (INFO) reading in the content of sub" if $verbose;
}
$out_subs{"$sub_name"}.="$lib[$j]"; ## to fetch '}'
$out_subs_separate{'Content'}.="$lib[$j]"; ## to fetch '}'
$j++;
unless(defined($ver)){ $ver = '1.0' }
unless($char_opt=~/nv/i){ ## if No version attachment option is set
$final_out_subs{"$sub_name$ver"}=$out_subs{$sub_name};
$out_subs_separate{'Version'}=$ver;
}
print "\n# (INFO) Pushing the read lines for one sub to \@final_separate_entry_out";
push(@final_separate_entry_out, \%out_subs_separate);
}
}##------- for @lib loop
if($char_opt =~/[rt]/i){
$left_out{$file[$i]}.=$lib[$j]; ## Remnant file content of the operation
}
close LIB_FILE;
if($char_opt =~/t/i){ ## t is for takeout(take the subroutine out of the original file!!
open (LIB_FILE, ">$file[$i]");
print LIB_FILE $left_out{$file[$i]};
close LIB_FILE;
}
print $final_out_subs;
}#""""""""""""" end of for (@file)
if($char_opt =~ /r/i){
return( \%left_out ); # to get the files sans the subroutines.
}else{
if($separate_hash_entry_opt){
if(@final_separate_entry_out==1){
return( $final_separate_entry_out[0]);
}else{
print "\n\n# (INFO) returning multiple subroutine read";
return(\@final_separate_entry_out);
}
}else{
return( \%final_out_subs );
}
}
}
#________________________________________________________________________
# Title : fetch_subroutines
# Usage :
# Function : retunrns subroutines with the keys as subroutine names with version
# like in the form( 'show_array2.2' => 'subroutine in one string')
# It reports the subroutines not found in searched file(s). This
# requires the names of sub you want while read_subroutines will
# read any subroutines with their headbox to a hash.
# Example :
# Warning :
# Keywords :
# Options : 'nv' for no version attachment in the keys of returning hash of subroutines
# 'r' for getting remnant file content rather than the sub routines
# 't' for leaving the original file without the sub routines taken.
# 'h' for headbox only output.
# Returns :
# Argument :
# Category :
# Version : 2.5
#--------------------------------------------------------------------
sub fetch_subroutines{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($each_sub, %hash2, %out_subs, %left_out, @lib, $ver);
@array=@{&remove_dup_in_array(\@string)};
print "\n# $0: I am fetching @array from @file\n\n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) Processing each file of sub lib file(yes, it can read many source lib
#__________________________________________________________________________
for($i=0; $i < @file; $i++){
open(LIB_FILE, "<$file[$i]")|| die "\n $file[$i] <- $! \n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (2) reading the whole lib. file into array
#__________________________________________
@lib =<LIB_FILE>;
for($j=0; $j < @lib; $j++){
my $title_found;
for($s=0; $s < @array; $s++){ # array has subroutine names
$each_sub = $array[$s];
#"""" Taking the headbox """""""""""""
if( ($lib[$j]=~/^#+[_\-\*]{8,140} *$/)
and ($lib[$j+1]=~/^(#+ *Title[ \t]*\:[ \t]*$each_sub\b).*/i) ){
$out_subs{"$each_sub"} .="$lib[$j]$1\n";
$j+=2;
$title_found=1;
until( ($lib[$j]=~/^#+[\-_\*]{8,150} *$/) or
($lib[$j]=~/^ {0,2}sub *$each_sub *\{ *[version]*\:?(\S*)/) ){
$version_number=$1;
$lib[$j]=~s/ *$//; #<-- removing ending space
#"""""""""""""""""""""""""""""""""""
# Taking version no.
#"""""""""""""""""""""""""""""""""""
if( ($char_opt !~ /nv/i)
and ($lib[$j]=~/^# *Version *\: *([\d+\.\d+]*) */i) ){
if( $1=~/^[ ]*$/){ $ver = '1.0'; } ## make null to 1.0
elsif($1=~/^(\d+)$/){ $ver = "$1\.0"; } ### make 2 to 2.0
elsif($1=~/^([\d+\.\d+]+)$/){ $ver = $1; } ## assign version
}
unless($ver){
$ver=$version_number;
}
$out_subs{"$each_sub"}.="$lib[$j]";
$j++;
}
$out_subs{"$each_sub"}.="$lib[$j]";
$j++; ## essential to remove #------------- line
}
if($char_opt =~ /h/i){ # 'h' for headbox only output.
goto SPLICE2;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Reading sub { }, when headbox is included, for JONG's lib
#____________________________________________________________
if( $title_found==1 and $lib[$j]=~/^sub +$each_sub\b *\{/ ){
$out_subs{"$each_sub"}.="$lib[$j]";
$j++; $title_found='';
until($lib[$j]=~/^\}/){
$out_subs{"$each_sub"}.="$lib[$j]"; $j++;
}
$out_subs{"$each_sub"}.="$lib[$j]"; ## to fetch '}'
$j++;
SPLICE2:
splice(@array, $s, 1); ## removing the subnames found
$s--;
unless(defined($ver)){ $ver = '1.0' }
unless($char_opt=~/nv/i){ ## if No version attachment option is set
$hash2{"$each_sub${ver}"}=$out_subs{$each_sub};
%out_subs=();
}else{
$hash2{"$each_sub"}=$out_subs{$each_sub};
%out_subs=();
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# Reading sub { }, when headbox is NOT included
#_________________________________________________________
elsif($lib[$j]=~/^ {0,2}sub +$each_sub\b *\{[\t ]*#* *(.*)/ ){
print "\n# (WARN) $0: $file[$i] does not have headbox(jong\' lib style)\n";
print "\n# I will assume you do NOT have it in your sub lib. All subs will have 1.0 version no.\n";
if($1=~/(\d+\.?\d*)/){
$version_number=$ver=$1;
}
$out_subs{"$each_sub"}.="$lib[$j]";
$j++; $title_found='';
until($lib[$j]=~/^\}/){
$out_subs{"$each_sub"}.="$lib[$j]"; $j++;
}
$out_subs{"$each_sub"}.="$lib[$j]"; ## to fetch '}'
$j++;
goto SPLICE2; # it is in a previous line
}
}
if($char_opt =~/[rt]/i){
$left_out{$file[$i]}.=$lib[$j]; ## Remnant file content of the operation
}
}
close LIB_FILE;
if($char_opt =~/t/i){
open (LIB_FILE, ">$file[$i]");
print LIB_FILE $left_out{$file[$i]};
close LIB_FILE;
}
}#""""""""""""" end of for (@file)
$no_of_subs_fetched = keys %out_subs;
if(@array>0){
print chr(7), chr(7);
print "\n# Following subs are not found in \"", "@file","\"\n ", "@array", "\n\n";
}
if($char_opt =~ /r/i){
return( \%left_out ); # to get the files sans the subroutines.
}else{
return( \%hash2 );
}
}
#________________________________________________________________________
# Title : update_subroutines
# Usage : &update_subroutines(\@file, \%fetched_subs);
# Function : replaces subroutines of given file(s) with supplied subs.
# If the given subroutine versions are not higher than the
# ones in the program, no upgrade would happen.
# This can read version information from '# Version : 1.0' line
# or sub xxxxx{ # Version : 1.0 line
# Example : &update_subroutines($file, \%fetched_subs);
# Warning :
# Keywords : upgrade_subroutines,
# Options :
# Returns :
# Argument :
# Category :
# Version : 2.8
#--------------------------------------------------------------------
sub update_subroutines{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (1) %subs_from_lib is a default variable from &handle_arguments
#______________________________________________________
my %subs_from_lib=%{&merge_hash(@hash)};
my @subs_from_lib = keys %subs_from_lib; # @subs_from_lib are subroutine names
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``
# update_subroutines can handle many input perl files
#__________________________________________________________________
for($i=0; $i < @file; $i++){
open(TARGET_FILE, "<$file[$i]") or die "\n $file[$i] <- $! \n";
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# (2) Reading in all the perl_file_lines into arrary
#___________________________________________________
my @perl_file_lines =<TARGET_FILE>;
close TARGET_FILE;
my (%temp, %temp_with_version_info, %final_out, %latest_sub_hash, $VER, $sub_name,$ver,
$first_line, @found_subs, $sub_name2);
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (3) checking each line in @perl_file_lines of target file with already provided %subs_from_lib of subroutines
#____________________________________________________________________________________________________
for($j=0; $j < @perl_file_lines; $j++){
my ($loop_count, $title_found, $sub_name,
$title_found, $sub_found, $VER, $ver);
if( $perl_file_lines[$j]=~/^(#\!\/\w+.+perl)/){ ## first line match
$final_out{$file[$i]}.=$perl_file_lines[$j];
$j++;
print "\n# (INFO) Good! I found the very first line, $1 !!\n";
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (4) Putting update information line-> # Last Update by ./update_subrout
#______________________________________________________________________________
if($perl_file_lines[$j]=~/^# *last *update */i){
$final_out{$file[$i]}.="# Last Update by $0: ".`date`;
$j++;
}elsif($perl_file_lines[$j+1]=~/^# *last *update */i){
$final_out{$file[$i]}.="$perl_file_lines[$j]# Last Update by $0: ".`date`;
$j+=2;
}elsif($j < 4 and (!$perl_file_lines[$j]=~/^# *last *update */i) ){
$final_out{$file[$i]}.="# Last Update by $0: ".`date`.$perl_file_lines[$j];
$j++;
}
if($perl_file_lines[$j]=~/^__END__/){ last } ## this is to stop $0 reading in junk sub calls after __END__ line
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~````
# (5) this is to read non-subroutine related stuff
#_______________________________________________________
until( ($perl_file_lines[$j]=~/^#+[_\-\*\~]{8,160} *$/
and $perl_file_lines[$j+1]=~/^(#+ *Title *[:\;]* *\S+\b)[^\.]? */i ) ## NO 'xxxx.pl'
or ($perl_file_lines[$j]=~/^ {0,1}sub +[\w\-\.]+ *\{/) # until we hit sub xxxx{ line
or $j == $#perl_file_lines ){ ## when there is no headbox
$final_out{$file[$i]}.=$perl_file_lines[$j]; ## %final_out is the final output hash with perl_file_lines!!
$j++; # %final_out stores only non-subroutine stuff until the end
}
print "\n# (INFO) \$final_out\{\$file\[\$i\]\} so far is: \n$final_out{$file[$i]}\n" if $verbose;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~``~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# (6) NOW we found the headbox and let's go through all @subs_from_lib entry to match
# Going through all sub names. Now I am reading sub routine parts. @subs_from_lib has the names of subs to fetch
#__________________________________________________________________________________________________________
SUBS: for($s=0; $s < @subs_from_lib; $s++){
if($subs_from_lib[$s] =~/^([_a-zA-Z\-\d]+)(\d+\.*\d*) *$/){
$sub_name=$1;
if($2){ $VER =$2; }else{ $VER='1.0' } ## <--- this line is redundant
}elsif($subs_from_lib[$s] =~/^(\S+) *$/){ #### This is critically necessary
$sub_name=$1;
print "\n# (WARN) $0: \$subs_from_lib\[\$s\] doesnt seem to have ver info. Is this O.K?\n";
$VER='1.0';
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (7) matching #______________ and # Title :, IE, matching HEADBOX region
#___________________________________________________________________________
if( ($perl_file_lines[$j]=~/^#+[_\-\*\~]{8,160} *$/)
&& ($perl_file_lines[$j+1]=~/^(#+ *Title *[\:]* *$sub_name)[^\.]? */i) ){
$temp{"$sub_name"}.="$perl_file_lines[$j]$1\n";
$j+=2;
$title_found=1;
until( $perl_file_lines[$j]=~/^#+[\-_\*]{10,180} *$/ ){
$perl_file_lines[$j]=~s/ *$//;
if($perl_file_lines[$j] =~ /^# *Version *[\:\;]? *([\d*\.*\d*]*) */i){
if( $1 =~/^ *$/){ $ver = '1.0'; } ## make null to 1.0
elsif($1 =~/^(\d+)$/){ $ver = "$1\.0"; } ### make 1 to 1.0
elsif($1 =~/^(\d+\.\d+)$/){ $ver = $1; }
}
$temp{"$sub_name"}.="$perl_file_lines[$j]";
$j++;
}
$temp{"$sub_name"}.="$perl_file_lines[$j]";
$j++;
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (8) matching sub xxxxx{
#_____________________________________________
if($perl_file_lines[$j]=~/^sub +$sub_name *\{ *(.*)/){
$possible_version_info=$1;
if($possible_version_info=~/(\d+\.?\d*)/){
$ver=$1;
}
$sub_found =1;
$temp{"$sub_name"}.="$perl_file_lines[$j]";
$j++;
until( $perl_file_lines[$j] =~/^\}/){
$temp{"$sub_name"}.="$perl_file_lines[$j]";
$j++;
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`````
# following is out of paranoia
#___________________________________________
if( $perl_file_lines[$j] =~/^#[_\-\*]{8,150} *$/
and $perl_file_lines[$j+1]=~/^(# *Title *[:]* *$sub_name)[^\.pl]/i ){
$temp{"$sub_name"}.="\n\}\n"; $j++; # fixes missing '}' in the read sub hash.
goto SPLICE;
}elsif($perl_file_lines[$j+1]=~/^sub +[\w\-\.]+ *\{/){
$temp{"$sub_name"}.="\n\}\n";
goto SPLICE;
}
$loop_count++;
if($loop_count > 10000){
$final_out{$file[$i]} .="\n\}\n"; # fixes missing '}' in the input file
$temp{"$sub_name"}.="\n\}\n"; # fixes missing '}' in the read sub hash.
goto SPLICE;
}
}
if($perl_file_lines[$j]=~/^\}/){ $temp{"$sub_name"}.=$perl_file_lines[$j]; }
SPLICE:
push(@found_subs, splice(@subs_from_lib, $s, 1) );
$s--;
unless(defined($ver)){ $ver = '1.0' }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Changing $temp{$sub_name} to $temp{"$sub_name$ver"}
#_________________________________________________________
if(!$temp_with_version_info{"$sub_name$ver"}){
$temp_with_version_info{"$sub_name$ver"}=$temp{$sub_name};
}
%temp=();
last;
}else{ # end of if ($perl_file_lines[$j]=~/sub xxxxxx/)
next; # next to for(@subs_from_lib)
}
}## END of for(@subs_from_lib)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# (9) putting %temp hash to %latest_sub_hash
#____________________________________________________
if( $title_found and $sub_found and $VER < $ver ){
print "\nTitle found, SUB found for $sub_name VER $sub_name${VER} \< ver $ver";
unless($latest_sub_hash{ "$sub_name$ver" }){
$latest_sub_hash{ "$sub_name$ver" }= $temp_with_version_info{ "$sub_name$ver" };
}
delete($subs_from_lib{"$sub_name$VER"} );
$VER=$ver=$title_found=$sub_found='';
}elsif( $title_found and $sub_found and $VER >=$ver ){
print "\nTitle found, SUB found for $sub_name VER \>= ver ";
unless($latest_sub_hash{ "$sub_name$VER" }){
$latest_sub_hash{ "$sub_name$VER" }= $subs_from_lib{"$sub_name$VER"};
}
delete($temp_with_version_info{"$sub_name$ver"});
delete($subs_from_lib{"$sub_name$VER"});
$VER=$ver=$title_found=$sub_found='';
}elsif( !$title_found and $sub_found ){
print "\nTitle Not found, SUB found for $sub_name";
unless($latest_sub_hash{ "$sub_name$VER" }){
$latest_sub_hash{ "$sub_name$VER" }= $subs_from_lib{"$sub_name$VER"};
}
delete($subs_from_lib{"$sub_name$VER"});
delete($temp_with_version_info{"$sub_name$ver"});
$VER=$ver=$title_found=$sub_found='';
}elsif( $title_found and !$sub_found ){
print "\nTitle found, SUB not found for $sub_name";
$VER=$ver=$title_found=$sub_found='';
next;
}elsif( !$title_found and !$sub_found ){
$final_out{$file[$i]}.=$perl_file_lines[$j];
}
} # for (@perl_file_lines)
%merged_final_hash=%{&merge_hash(\%subs_from_lib, \%latest_sub_hash)};
@values= values %merged_final_hash;
open (LEFT_FILE, ">$file[$i]");
for($h= 0; $h < @values; $h++){ ### appending the new subs.
$final_out{$file[$i]} .= $values[$h];
}
print LEFT_FILE $final_out{$file[$i]};
close LEFT_FILE;
}#""""""""""""" end of for (@file)
return( \%final_out ); # this has all the sub routines and other lines.
}
#________________________________________________________________________
# Title : takeout_subroutines
# Usage :
# Function : retunrns subroutines with the keys as subroutine names with version
# like in the form( 'show_array2.2' => 'subroutine in one string')
# It reports the subroutines not found in searched file(s)
# fetch_subroutines also has this feature.
# Example :
# Warning : If there is no headbox and version no. It thinks the version
# is 1.0
# Keywords : take_out_subroutines, take_subroutines, cut_subroutines,
# cutout_subroutines, remove_subroutines
# Options : 'nv' for no version attachment in the keys of returning hash of subroutines
# 'r' for getting remnant file content rather than the sub routines
# Returns :
# Argument :
# Category :
# Version : 1.6
#--------------------------------------------------------------------
sub takeout_subroutines{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my($each_sub, %duplicate, %out_subs, %left_out, @lib, $ver, $real_sub_entry_found);
@array = @ori_array = grep { ! $duplicate{$_}++ } @raw_string; ## removing duplicates
for($i=0; $i < @file; $i++){
@array=@ori_array;
open(LIB_FILE, "$file[$i]") or die "\n $file[$i] <- $! \n";
@lib =<LIB_FILE>;
for($j=0; $j < @lib; $j++){
for($s=0; $s < @array; $s++){
$each_sub = $array[$s];
#"""" Taking the headbox """""""""""""
if( ($lib[$j]=~/^#[_\-\*\~]{20,140} *$/)&&
($lib[$j+1]=~/^(# *title *: *$each_sub)[^\.pl]/i) ){
$real_sub_entry_found=0;
$out_subs{"$each_sub"}.="$lib[$j]$1\n";
$j+=2;
until( ($lib[$j]=~/^sub *\w+ *\{/)||
($lib[$j]=~/^#[\-_\*\~]{20,140} *$/) ){
$lib[$j]=~s/( *)$//; #<-- removing ending space
#"""""""""""""""""""""""""""""""""""
# Taking version no.
#"""""""""""""""""""""""""""""""""""
if( ($char_opt !~ /nv/i) && ($lib[$j]=~/^# *version *: *([\d+\.\d+]*) */i) ){
if( $1=~/^[ ]*$/){ $ver = '1.0'; } ## make null to 1.0
elsif( $1=~/^(\d+)$/){ $ver = "$1\.0"; } ### make 2 to 2.0
elsif($1=~/^([\d+\.\d+]+)$/){ $ver = $1; } ## assign version
}
$out_subs{"$each_sub"}.="$lib[$j]";
$j++;
}
$out_subs{"$each_sub"}.="$lib[$j]";
$j++; ##<< essential to remove #------------- line
}
#"""""""" Reading sub { } """""""
if($lib[$j]=~/^sub +$each_sub *\{/){
$out_subs{"$each_sub"}.="$lib[$j]";
$j++;
until($lib[$j]=~/^\}/){
$out_subs{"$each_sub"}.="$lib[$j]"; $j++;
}
$out_subs{"$each_sub"}.="$lib[$j]"; ## to fetch '}'
$j++;
splice(@array, $s, 1); ## removing the subnames found
$s--;
unless(defined($ver)){ $ver = '1.0' }
unless($char_opt=~/nv/i){ ## if No version attachment option is set
$out_subs{"$each_sub$ver"}=$out_subs{$each_sub};
delete $out_subs{$each_sub};
}
}
}
$left_out{$file[$i]}.=$lib[$j]; ## Remnant file content of the operation
## just in case you want the left out ones.
}
close LIB_FILE;
open (LEFT_FILE, ">$file[$i]");
print LEFT_FILE $left_out{$file[$i]};
close LEFT_FILE;
}#""""""""""""" end of for (@file)
@no_of_subs_fetched = keys %out_subs;
if(@array>0){
print chr(7);
print "\n# Following subs are not found in \"", "@file","\"\n ", "@array", "\n\n";
}
if($char_opt =~ /r/i){
return( \%left_out ); # to get the files without the subroutines.
}else{
return( \%out_subs );
}
}
#________________________________________________________________________
# Title : get_subroutine_calls
# Usage : @sub_name_array= @{&get_subroutine_calls(\@AR))};
# Function : gets all the subroutine calls( like &show_hash ) in the given
# file name or array of lines which is the content of a file,
# text etc. If there is no input arg, it reads the running
# program as default input
# Example :
# Keywords : get_sub_names,get_subroutine_names, get_sub_calls,
# get_subroutine_calls, find_sub_calls, find_subroutine_calls
# Options :
# Category :
# Version : 2.2
#--------------------------------------------------------------------
sub get_subroutine_calls{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(@keywords)=qw(getc shift system warn undef abs cmp close fork caller
eval time chdir connect disconnect wait main);
my($keywords)=join(' ', @keywords);
my(@arr, @sub_names, @nondup, %duplicate, @sub_calls);
if(@_== 0){ open(FILE, "$0"); @arr = <FILE>; } # open self
elsif( @file > 0){
for($i=0; $i < @file; $i++){
open(FILE, "$file[$i]");
push(@arr, <FILE>);
}
}
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~`
# When subroutine names are specified as input
#_______________________________________________
if(@array>0){
for($i=0; $i<@array; $i++){
push(@arr, @{$array[$i]} ) if ref($array[$i]) eq 'ARRAY';
}
}
if( @raw_string>0){ push(@arr, @raw_string) }
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Main pattern matching part for &xxxxxx ; line
#__________________________________________________
for($i=0; $i< @arr; $i ++){
if($arr[$i]=~/^#+/){ next }
$arr[$i]=~s/^(.+)(# ..+)$/$1/; # I have to remove all COMMENTS as comments can have &xxx
if($arr[$i]=~/\$\~ = *\S/){
next;
}elsif($arr[$i]=~/^format *=/){
until($arr[$i]=~/^\./){ $i++ }
}elsif(@sub_calls=$arr[$i]=~/\&([\w\.\-]+)/g ){ # for $X = &round(&some(&some2(\$ssssss))); sort of thing
push(@sub_names, @sub_calls);
}elsif($arr[$i]=~/\&[^\&]+\w+\:\:(\w[\w\-\.\:]+) {0,3}.*[\;\,]/){ ## to handle &main::ssssub::sussb
push(@sub_names, $1);
}elsif($arr[$i]=~/\&(\w[\w\-\.\:]+) {0,3}.*[\;\,]/){
push(@sub_names, $1);
}elsif($arr[$i]=~/= *([a-zA-Z][\w\-]+) {0,3}\;/){
push(@sub_names, $1) unless($keywords=~/\b$1\b/);
}elsif($arr[$i]=~/= *[\@\%\$] *\{ *([a-zA-Z_\-]+[\d]*).+\} *\;/){ # for =${ xxxx }; or =${&xxxxx};
push(@sub_names, $1);
}
}
@nondup = grep { ! $duplicate{$_}++ } @sub_names; ## removing duplicates
for($i=0; $i< @keywords; $i++){
for($j=0; $j < @nondup; $j ++){
if($keywords[$i] eq $nondup[$j]){
splice(@nondup, $j, 1); $j--;
}
}
}
return(\@nondup);
}
#________________________________________________________________________
# Title : set_special_options (derived from set_debug_option)
# Usage : &set_special_options;
# Function : If you put special chars like '#' or '##', '###..' at the
# prompt of any program which uses
# this sub you will get verbose printouts for the program if
# the program has a lot of comments.
# Example : &set_special_options.pl ## <-- at prompt.
# Warning :
# Keywords :
# Options : # for 1st level of debugging printouts
# ## for even more debugging printouts
# + for more outputs(more calculations are shown, like statistics)
# ++ even more outputs.(
# $DEBUG becomes 1 by '#'
# $DEBUG2 becomes 1 by '##'
# $VERBOSE becomes 1 by '+'
# $VERBOSE2 becomes 1 by '++'
#
# Returns : $debug, $verbose
# Argument : Nothing in a program.
# Category :
# Version : 1.0
# generalized debug var is added for more verbose printouts.
#--------------------------------------------------------------------
sub set_special_options{
my($j, $i, $level, $key, %special_chars);
%special_chars=('DEBUG'=>'#', 'VERBOSE'=>'+');
for $key (keys %special_chars){
for($j=0; $j < @ARGV; $j ++){
if( $ARGV[$j] =~/([$special_chars{$key}]+)/){
print __LINE__," >>>>>>> Debug option is set by $1 <<<<<<<<<\n";
${"$key"}=1; print chr(7);
print __LINE__," \$$key is set to ", ${"$key"}, "\n";
splice(@ARGV,$j,1); $j-- ;
$level = length($1)+1;
for($i=0; $i < $level; $i++){
${"$key$i"}=1;
print __LINE__," \$${key}${i} is set to ", ${"$key$i"}, "\n";
}
}
}
}
}
#________________________________________________________________________
# Title : set_debug
# Usage : &set_debug;
# Function : If you put '#' or '##' at the prompt of any program which uses
# this sub you will get verbose printouts for the program if the program
# has a lot of comments.
# Example : set_debug # <-- at prompt.
# Warning :
# Keywords :
# Options : # for 1st level of verbose printouts
# ## for even more verbose printouts
# $debug becomes 1 by '#' or '_'
# $debug2 becomes 1 by '##' or '__'
#
# Returns : $debug
# Argument :
# Category :
# Version : 1.8
# generalized debug var is added for more verbose printouts.
#--------------------------------------------------------------------
sub set_debug{
my($j, $i, $level);
unless( defined($debug) ){
for($j=0; $j < @ARGV; $j ++){
if( $ARGV[$j] =~/^(_+)$|^(#+)$/){ # in bash, '#' is a special var, so use '_'
print __LINE__," >>>>>>> Debug option is set by $1 <<<<<<<<<\n";
$debug=1;
print chr(7);
print __LINE__," \$debug is set to ", $debug, "\n";
splice(@ARGV,$j,1); $j-- ;
$level = length($1)+1;
for($i=0; $i < $level; $i++){
${"debug$i"}=1;
print __LINE__," \$debug${i} is set to ", ${"debug$i"}, "\n";
}
}
}
}
}
#________________________________________________________________________
# Title : open_self
# Usage : @lines = &open_self;
# Function :
# Example :
# Warning :
# Keywords : read self, read_self, open self, open itself
# Options :
# Returns : one array
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub open_self{
open(SELF, "$0");
my(@Line)=<SELF>;
return( \@Line );
}
#________________________________________________________________________
# Title : tell_seq_length
# Usage : %hash_out = %{&tell_seq_length(\%hash_in)};
# Function : tells the sequence sizes of given sequences
# Example :
# Warning :
# Keywords :
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.0
#--------------------------------------------------------------------
sub tell_seq_length{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{$A[11]};
my($i,$j,$c,$d,$e,$f,$g,$h,$k,$l,$m,$n,$o,$p,$q,$r,$s,$t,$u,$v,$w,$x,$y,$z);
if($debug==1){print "\n\t\@hash=\"@hash\"
\@raw_string=\"@raw_string\"\n\t\@array=\"@array\"\n\t\@num_opt=\"@num_opt\"
\@char_opt=\"@char_opt\"\n\t\@file=\"@file\"\n\t\@string=\"@string\"\n" }
#""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
my(@out_hash, %hash);
for($i=0; $i < @hash; $i++){
%hash = %{$hash[$i]};
@keys = keys %hash;
for ($j=0; $j < @keys; $j ++){
if($hash{$keys[$j]}=~/\,\S+\,/){ @string= split(/\,/, $hash{$keys[$j]});
}else{ @string= split(//, $hash{$keys[$j]}); }
$h -> {$keys[$j]} = @string; ## $h is the ref. of the anonymous hash
} ## This is equivalent to "$h{$keys[$j]}= $length;"
push(@out_hash , $h ) ;
}
if(@out_hash == 1){ $out_hash[0]; }
elsif(@out_hash < 1){ die "\nSomething is wrong at tell_seq_length\n"; }
elsif(@out_hash > 1){ return(@out_hash); }
}
#________________________________________________________________________
# Title : do_window_scan
# Usage : @out_array = @{&do_window_scan(\@input_array, $win_size)};
# Often, bioters(Bio Computer Scientists) need to scan a long sequences
# of DNA or Protein like(ABADFAFASDFASFASDFDFA or 109384717817947) to
# caculate something out of them.
# This routine is providing such scanning
# function in perl.
# Function : This is the core part of any window (of sequences)
# scanning function.
# Example :
# Warning :
# Keywords : scan_sequence, scan_window
# Options :
# Returns :
# Argument :
# Category :
# Version : 1.3
#--------------------------------------------------------------------
sub do_window_scan{
my(@string1) = @{$_[0]};
my($win_size) = ${$_[1]} || $_[1];
my($w, $start, $end, $average1, @window_1, $w_abs, @out_string),
$start = -($win_size - ($win_size%2))/2; ## If the window size is odd numbers (eg 5),
## the starting average position is 0 from (-2,-1,0,1,2)
## If it is even numbers (eg 4)
## the starting position is 0 from (-2,-1,0,1)
$end = @string1 - ($win_size - ($win_size%2))/2;
## End point is also dependent on oddity of the numbers
## for window size.
for ($w= $start; $w < $end ; $w ++){
$moving_window = $w + $win_size - 1;
@window_1= @string1[$w .. $moving_window ];
if($w < 0){ #### This if is to prevent the circularization of the array
$w_abs = abs($w); #### (like 6 7 1 2 3 4 5 , for . . 1 2 3 4 5 from 1234567 )
splice(@window_1, 0, $w_abs); # $w_abs is the absolute value of $w
}
################ PUT YOUR calc HERE #####
$average1= ${average_of_array(\@window_1, 'int')};
################ PUT YOUR calc HERE #####
push(@out_string, $average1);
print "\nWinSize:$win_size halfwin: $half_win_size str size: $string_size \(from offset: $offset to mov_wind: $moving_window \) AV: $average1 of win1:", @window_1, "\n";
}
\@out_string;
}
#________________________________________________________________________
# Title : scan_window_and_calc_something
# Usage :
# Function : scans any given length window of sequence and computes something.
# Example :
# Warning :
# Keywords :
# Options : average for getting average of given window size.
# sum for getting sum of given window size.
# Returns :
# Argument :
# Category :
# Version : 1
#--------------------------------------------------------------------
sub scan_window_and_calc_something{
#"""""""""""""""""< handle_arguments{ head Ver 4.1 >"""""""""""""""""""
my(@A)=&handle_arguments(@_);my($num_opt)=${$A[7]};my($char_opt)=${$A[8]};
my(@hash)=@{$A[0]};my(@file)=@{$A[4]};my(@dir)=@{$A[3]};my(@array)=@{$A[1]};
my(@string)=@{$A[2]};my(@num_opt)=@{$A[5]};my(@char_opt)=@{$A[6]};
my(@raw_string)=@{$A[9]};my(%vars)=%{$A[10]};my(@range)=@{