#!/usr/bin/perl package Guidance; #don't forget: a package must end with a return value (1; in the end)!!!!! #use strict; use FileHandle; use Bio::SeqIO; use Bio::TreeIO; use Bio::Tree::TreeFunctionsI; use Bio::AlignIO; use Bio::Align::AlignI; use Bio::Tools::CodonTable; use File::Copy; use FindBin qw($Bin); use lib "$Bin/../bioSequence_scripts_and_constants/"; #use lib "/bioseq/bioSequence_scripts_and_constants/"; use MSA_parser; use GENERAL_CONSTANTS; use constant NEWIC2MAFFT => "$Bin/exec/newick2mafft.rb"; use constant MSA_SET_SCORE => "$Bin/../../programs/msa_set_score/msa_set_score"; use constant HOT_PROGRAM => "$Bin/exec/HoT/COS.pl"; use constant MAFFT_OP_DIST=> "$Bin/balibase.mafft_7123_mafft.op.Dist20bins.txt"; use constant MAFFT_OP_DIST_0_25=> "$Bin/balibase.mafft_7123_mafft.op2.Dist25bins.txt"; use constant MAFFT_EP_DIST_0_25=> "$Bin/balibase.mafft_7123_mafft.ep2.Dist20bins.txt"; use constant HOT_GUIDANCE2_PROGRAM => "$Bin/exec/HoT_COS_GUIDANCE2.pl"; use constant MIDPOINT_ROOTING_R => "$Bin/exec/MidPoint_Rooting.R"; my $MAFFT_OP_DIST_0_25="$Bin/balibase.mafft_7123_mafft.op2.Dist25bins.txt"; my $MAFFT_EP_DIST_0_25="$Bin/balibase.mafft_7123_mafft.ep2.Dist20bins.txt"; my $newick2mafft="$Bin/exec/newick2mafft.rb"; #my $MSA_Score_CSS="http://guidance.tau.ac.il/MSA_Colored.NEW.css"; my $MSA_Score_CSS="http://guidance.tau.ac.il/MSA_Colored.NEW.EM.css"; my $MidPoint_Rooting_R="$Bin/exec/MidPoint_Rooting.R"; #my $phylonet_prog = "$Bin/exec/phylonet_v1_7/phylonet_v1_7.jar"; my $isEqualTopologyProg = "$Bin/../../programs/isEqualTree/isEqualTree"; sub CreateHTML_Graph # Create an HTML BARs graph # GET: 1. CSV FILE (The X var is the first Col and the Y is the second, X must be sorted) # 2. HTML OUTPUT # 3. X Lable (OPTIONAL) ############################################################################################ { my $CSV_File=shift; # The Data Must be in the correct X order my $Out=shift; my $X_LABLE=shift; open (OUT,">>$Out") || return ("Guidance::CreateHTML_Graph: Can't open Output '$Out' $!"); open (DATA,$CSV_File) || return ("Guidance::CreateHTML_Graph: Can't open data file '$CSV_File' $!"); # GRAPH PROPERTIES my $graphHeight = 200; # target height of graph my $BarWidth = 15; # width of bars my $maxResult = 1; my $scale = 1; my $last_x = 0; # Set the scale $scale = $graphHeight / $maxResult; my $line=; # Header # SPACER BEFORE THE GRAPH print OUT " \n"; print OUT "\n"; print OUT "\n"; print OUT "\n"; print OUT " $X_LABLE
SCORE\n"; while ($line=) { chomp ($line); my @data=split(",",$line); # print "$data[0]\t$last_x\n"; if (($data[1] ne "NaN") and (($data[0]-$last_x)==1)) { print OUT " $\"$data[0]:$data[1]\"$ \n"; $last_x=$data[0]; } elsif ($data[1] ne "NaN") { while ($data[0]-$last_x!=1) { print OUT "

\n"; $last_x++; } print OUT " $\"$data[0]:$data[1]\"$ \n"; $last_x=$data[0]; } elsif ($data[1] eq "NaN") { print OUT " $\"$data[0]:$data[1]\"$ \n"; $last_x=$data[0]; } } print OUT "\n"; print OUT "\n"; print OUT "Column
\n"; close (OUT); close (DATA); } sub printColoredAlignment { ############################################################################################################################################# #@ARGV == 3 or die "USAGE: $0 IN_MSA_FILE OUT_HTML_FILE SCORES_FILE #SCORES_FILE - Each line should contain three values: column number, seq number, and a score between 0 and 1 (separated by white spaces)\n"; ############################################################################################################################################# my $inMsaFile=shift; my $outHtmlFile=shift; my $scoresFile=shift; my $codesFile=shift; # OPTIONAL # Read scores open SCORES, $scoresFile or return ("Can't open $scoresFile: $!"); my %scores; my %Code_Names; foreach () { next if (/^#/); s/^\s+//; my ($col, $seq, $score) = split; $scores{$seq}[$col] = $score; } # Read Codes if ($codesFile ne "") { open (CODES,$codesFile) or return ("Guidance::printColoredAlignment Can't open the Codes file: '$codesFile' $!"); while (my $line=) { chomp $line; my ($Seq_name,$Code)=split("\t",$line); $Code_Names{$Code}=$Seq_name; } close (CODES); } # Read MSA my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $inMsaFile) or die "Can't open $inMsaFile: $!"; my $aln = $in->next_aln; $aln->verbose(1); # Otherwise, bioperl adds sequence start/stop values $aln->set_displayname_flat(); # print "DEPTH:",$aln->num_sequences;; @ans=MSA_parser::check_msa_licit_and_size($inMsaFile,"fasta","no"); if ($ans[0] eq "OK"){$MSA_Depth=$ans[1];} else {return "printColoredAlignment: ".join (" ",@ans);} # Print HTML start # Code from Conseq colored MSA: ~/pupkoSVN/trunk/www/conseq/runCalc_Conseq.pl line 985 my %msaColors = (); my %msaPrintColors = (); my $lineCounter; my @line; my $key; my $fontSize=2; my $sequenceLengthForDisplay=400000; my @msaRightOrder=0; my $msaRightOrderCounter=0; my $tdWidth = 5; my @colorstep = (); #color steps $colorstep[0] = "#10C8D1"; #Not confident $colorstep[1] = "#8CFFFF"; $colorstep[2] = "#D7FFFF"; $colorstep[3] = "#EAFFFF"; $colorstep[4] = "#FFFFFF"; #average $colorstep[5] = "#FCEDF4"; $colorstep[6] = "#FAC9DE"; $colorstep[7] = "#F07DAB"; $colorstep[8] = "#A02560"; #Most confident $colorstep[9] = "#A02560"; #Most confident (the score is exactly 1) open MSACOLOREDHTML, ">$outHtmlFile" or die "Can't open $outHtmlFile: $!"; print MSACOLOREDHTML "\n\n\n\n\n"; print MSACOLOREDHTML "

MSA color-coded by GUIDANCE scores

\n\n"; print MSACOLOREDHTML "\n"; # Print colored HTML # counts how many times we print the whole section (relevants to sequences longer than the sequenceLengthForDisplay) for(my $blockStart=1; $blockStart<$aln->length; $blockStart+=$sequenceLengthForDisplay) { my $blockEnd = $blockStart+$sequenceLengthForDisplay; $blockEnd = $aln->length if ($blockEnd > $aln->length); # Iterate over sequences and print up to sequenceLengthForDisplay residues # foreach my $seq ($aln->each_seq) { #HAIM COMMENT # my $depth = $aln->num_sequences; #HAIM ADD for(my $i=1;$i<=$MSA_Depth;$i++){ #HAIM ADD my $seq = $aln->get_seq_by_pos($i); #HAIM ADD # next if ( $seq->id < 42 # || $seq->id > 73); # Print seq id print MSACOLOREDHTML "\n"; print MSACOLOREDHTML "\n" if ($codesFile eq ""); print MSACOLOREDHTML "\n" if ($codesFile ne ""); # Print seq my @seq = split //, $seq->subseq($blockStart, $blockEnd); for(my $pos=0; $pos<@seq; $pos++) { my $res = $seq[$pos]; if ($res eq '-') { print MSACOLOREDHTML "\n"; } else { #print $seq->id,"\tscores{$seq->id}[$pos+1]:$scores{$seq->id}[$pos+1]\n"; #my $color = $colorstep[ int(9 * $scores{$seq->id}[$pos+1]) ]; #HAIM COMMENT my $color = $colorstep[ int(9 * $scores{$i}[$pos+1]) ]; if($color eq "#A02560"){ print MSACOLOREDHTML "\n"; } else { print MSACOLOREDHTML "\n"; } } } print MSACOLOREDHTML "\n\n"; } print MSACOLOREDHTML " \n\n"; } print MSACOLOREDHTML "

", $seq->id, "

", substr($Code_Names{$seq->id},0,25), "

$res

"; # print the color scale print MSACOLOREDHTML "\n
Legend:

\nThe alignment confidence scale:
\n\n\n\n

\n\n"; for (my $i=8 ; $i>=0 ; $i--){ if ($i == 0){ print MSACOLOREDHTML " ", $i+1, " "; } else { print MSACOLOREDHTML " ", $i+1, " "; } } print MSACOLOREDHTML "\n\n\n\n\n\n\n

Confident

<--->

Uncertain

\n"; print MSACOLOREDHTML "\n\n"; close MSACOLOREDHTML; close (SCORES); } sub AssignColorsToAlignment{ my $inMsaFile=shift; my $scoresFile=shift; my $codesFile=shift; # OPTIONAL open SCORES, $scoresFile or return ("Can't open $scoresFile: $!"); my %scores; my %Code_Names; foreach () { next if (/^\#/); s/^\s+//; my ($col, $seq, $score) = split; $scores{$seq}[$col] = $score; } # Read Codes if ($codesFile ne "") { open (CODES,$codesFile) or return ("Guidance::printColoredAlignment Can't open the Codes file: '$codesFile' $!"); while (my $line=) { chomp $line; my ($Seq_name,$Code)=split("\t",$line); $Code_Names{$Code}=$Seq_name; } close (CODES); } # Read MSA my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $inMsaFile) or die "Can't open $inMsaFile: $!"; my $aln = $in->next_aln; $aln->verbose(1); # Otherwise, bioperl adds sequence start/stop values $aln->set_displayname_flat(); # print "DEPTH:",$aln->num_sequences;; @ans=MSA_parser::check_msa_licit_and_size($inMsaFile,"fasta","no"); #HAIM ADD if ($ans[0] eq "OK"){$MSA_Depth=$ans[1];} #HAIM ADD else {return "printColoredAlignment: ".join (" ",@ans);} #HAIM ADD # Print HTML start # Code from Conseq colored MSA: ~/pupkoSVN/trunk/www/conseq/runCalc_Conseq.pl line 985 my %msaColors = (); my %msaPrintColors = (); my $lineCounter; my @line; my $key; my $fontSize=2; my $sequenceLengthForDisplay=400000; my @msaRightOrder=0; my $msaRightOrderCounter=0; my $tdWidth = 5; my @colorstep = (); #color steps $colorstep[0] = "Score1"; #Not confident $colorstep[1] = "Score2"; $colorstep[2] = "Score3"; $colorstep[3] = "Score4"; $colorstep[4] = "Score5"; #average $colorstep[5] = "Score6"; $colorstep[6] = "Score7"; $colorstep[7] = "Score8"; $colorstep[8] = "Score9"; #Most confident $colorstep[9] = "Score9"; #Most confident (the score is exactly 1) # get Align max_seq_length my $seq = $aln->get_seq_by_pos(1); my $Align_width = $seq->length(); # STOPPED HERE } sub printColoredAlignment_With_CSS { ############################################################################################################################################# #@ARGV == 3 or die "USAGE: $0 IN_MSA_FILE OUT_HTML_FILE SCORES_FILE #SCORES_FILE - Each line should contain three values: column number, seq number, and a score between 0 and 1 (separated by white spaces)\n"; ############################################################################################################################################# my $inMsaFile=shift; my $outHtmlFile=shift; my $scoresFile=shift; my $codesFile=shift; # OPTIONAL # my $COL_SCORES_FIGURE=shift; # OPTIONAL # my $MSA_Length=shift; # Parameters for the PLOT beneath alignment my $ColScoresCSV=shift; my $XLable=shift; my $Seq_Scores=shift; # Read scores open (SEQ_SCORES,$Seq_Scores) or return ("Can't open $Seq_Scores: $!"); my %seq_scores=(); foreach () { next if (/^#/); s/^\s+//; my ($seq, $score) = split; $seq_scores{$seq} = $score; } close (SEQ_SCORES); open SCORES, $scoresFile or return ("Can't open $scoresFile: $!"); my %scores; my %Code_Names; foreach () { next if (/^#/); s/^\s+//; my ($col, $seq, $score) = split; $scores{$seq}[$col] = $score; } # Read Codes if ($codesFile ne "") { open (CODES,$codesFile) or return ("Guidance::printColoredAlignment Can't open the Codes file: '$codesFile' $!"); while (my $line=) { chomp $line; my ($Seq_name,$Code)=split("\t",$line); $Code_Names{$Code}=$Seq_name; } close (CODES); } # Read MSA my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $inMsaFile) or die "Can't open $inMsaFile: $!"; my $aln = $in->next_aln; $aln->verbose(1); #HAIM COMMNET # Otherwise, bioperl adds sequence start/stop values $aln->set_displayname_flat(); #HAIM COMMENT # print "DEPTH:",$aln->num_sequences;; @ans=MSA_parser::check_msa_licit_and_size($inMsaFile,"fasta","no"); #HAIM ADD if ($ans[0] eq "OK"){$MSA_Depth=$ans[1];} #HAIM ADD else {return "printColoredAlignment: ".join (" ",@ans);} #HAIM ADD # Print HTML start # Code from Conseq colored MSA: ~/pupkoSVN/trunk/www/conseq/runCalc_Conseq.pl line 985 my %msaColors = (); my %msaPrintColors = (); my $lineCounter; my @line; my $key; my $fontSize=2; my $sequenceLengthForDisplay=400000; my @msaRightOrder=0; my $msaRightOrderCounter=0; my $tdWidth = 5; my @colorstep = (); #color steps $colorstep[0] = "Score1"; #Not confident $colorstep[1] = "Score2"; $colorstep[2] = "Score3"; $colorstep[3] = "Score4"; $colorstep[4] = "Score5"; #average $colorstep[5] = "Score6"; $colorstep[6] = "Score7"; $colorstep[7] = "Score8"; $colorstep[8] = "Score9"; #Most confident $colorstep[9] = "Score9"; #Most confident (the score is exactly 1) my @colorstep_code = (); #color steps $colorstep_code[0] = "#10C8D1"; #Not confident $colorstep_code[1] = "#8CFFFF"; $colorstep_code[2] = "#D7FFFF"; $colorstep_code[3] = "#EAFFFF"; $colorstep_code[4] = "#FFFFFF"; #average $colorstep_code[5] = "#FCEDF4"; $colorstep_code[6] = "#FAC9DE"; $colorstep_code[7] = "#F07DAB"; $colorstep_code[8] = "#A02560"; #Most confident # my $plot_width=0; # if ($MSA_Length>1000) # { # $plot_width=11.15*$MSA_Length; # } # else # { # $plot_width=11.5*$MSA_Length; # } open MSACOLOREDHTML, ">$outHtmlFile" or die "Can't open $outHtmlFile: $!"; print MSACOLOREDHTML "\n"; print MSACOLOREDHTML "\n"; print MSACOLOREDHTML "\n"; print MSACOLOREDHTML "\n"; print MSACOLOREDHTML "\n"; # print MSACOLOREDHTML "\n"; # print MSACOLOREDHTML "\n"; print MSACOLOREDHTML "\n"; # print MSACOLOREDHTML "\n"; print MSACOLOREDHTML "\n"; print MSACOLOREDHTML "

MSA color-coded by GUIDANCE scores

\n\n"; print MSACOLOREDHTML "\n"; # Print colored HTML # get Align max_seq_length my $seq = $aln->get_seq_by_pos(1); my $Align_width = $seq->length(); # Print upper Scale print MSACOLOREDHTML "\n\n"; my $i=2; while ($i<$Align_width) # for (my $i=2;$i<$Align_width;$i++) { if (($i%10)==0) { my @digits=split("",$i); print MSACOLOREDHTML "\n"; foreach my $digit (@digits) { print MSACOLOREDHTML ""; $i++; } } else { print MSACOLOREDHTML ""; $i++; } } print MSACOLOREDHTML "\n\n"; # counts how many times we print the whole section (relevants to sequences longer than the sequenceLengthForDisplay) for(my $blockStart=1; $blockStart<$aln->length; $blockStart+=$sequenceLengthForDisplay) { my $blockEnd = $blockStart+$sequenceLengthForDisplay; $blockEnd = $aln->length if ($blockEnd > $aln->length); # Iterate over sequences and print up to sequenceLengthForDisplay residues # foreach my $seq ($aln->each_seq) { #HAIM COMMENT # my $depth = $aln->num_sequences; #HAIM ADD for(my $i=1;$i<=$MSA_Depth;$i++){ #HAIM ADD my $seq = $aln->get_seq_by_pos($i); #HAIM ADD # next if ( $seq->id < 42 # || $seq->id > 73); # Print seq id print MSACOLOREDHTML "\n"; # print SEQ COLOR my $Color_Class=""; if ($seq_scores{$i} eq "nan") {$Color_Class="ScoreNaN";} else {$Color_Class = $colorstep[ int(9 * $seq_scores{$i})];} print MSACOLOREDHTML "\n"; print MSACOLOREDHTML "\n" if ($codesFile eq ""); print MSACOLOREDHTML "\n" if ($codesFile ne ""); # Print seq my @seq = split //, $seq->subseq($blockStart, $blockEnd); for(my $pos=0; $pos<@seq; $pos++) { # for(my $pos=757; $pos<875; $pos++) { my $res = $seq[$pos]; if ($res eq '-') { print MSACOLOREDHTML "\n"; } else { #print $seq->id,"\tscores{$seq->id}[$pos+1]:$scores{$seq->id}[$pos+1]\n"; #my $color = $colorstep[ int(9 * $scores{$seq->id}[$pos+1]) ]; #HAIM COMMENT my $Color_Class=""; if ($scores{$i}[$pos+1] eq "nan") {$Color_Class="ScoreNaN";} else {$Color_Class = $colorstep[ int(9 * $scores{$i}[$pos+1]) ]}; print MSACOLOREDHTML "\n"; } } print MSACOLOREDHTML "\n\n"; } # print MSACOLOREDHTML " \n\n"; print MSACOLOREDHTML "\n\n"; } # Print lower Scale # print MSACOLOREDHTML "\n\n"; print MSACOLOREDHTML "\n\n"; $i=2; while ($i<$Align_width) # for (my $i=2;$i<$Align_width;$i++) { if (($i%10)==0) { my @digits=split("",$i); foreach my $digit (@digits) { print MSACOLOREDHTML "\n"; $i++; } } else { print MSACOLOREDHTML ""; $i++; } } print MSACOLOREDHTML "\n\n"; # print MSACOLOREDHTML " \n

	1	$digit
", $seq->id, "	", substr($Code_Names{$seq->id},0,25), "	$res	$res
1
	1	$digit

\n"; # Add the figure close (MSACOLOREDHTML); CreateHTML_Graph($ColScoresCSV,$outHtmlFile,$XLable); open MSACOLOREDHTML, ">>$outHtmlFile"; # if ($COL_SCORES_FIGURE ne "") # { # print MSACOLOREDHTML "\n"; # print MSACOLOREDHTML "\n"; # print MSACOLOREDHTML "\n"; # print MSACOLOREDHTML "\n"; # print MSACOLOREDHTML "

\n"; # } # print the color scale print MSACOLOREDHTML "\n
Legend:

\nThe alignment confidence scale:
\n";#\n\n\n

\n\n"; print MSACOLOREDHTML "\n\n\n

\n\n"; for (my $i=8 ; $i>=0 ; $i--){ if ($i == 0){ print MSACOLOREDHTML " ", $i+1, " "; } else { print MSACOLOREDHTML " ", $i+1, " "; } } # print MSACOLOREDHTML "\n\n\n\n\n\n\n

Confident

<--->

Uncertain

\n"; print MSACOLOREDHTML "\n\n\n\n\n\n\n

Confident

<--->

Uncertain

\n"; # print MSACOLOREDHTML "\n\n"; print MSACOLOREDHTML "

Insufficient Data

\n\n"; print MSACOLOREDHTML "\n

Insufficient Data

\n"; close MSACOLOREDHTML; } sub root_BP_trees #################################################################################################################### # Root all trees on BP dir #################################################################################################################### { my $bsDir = shift; my $dataset = shift; my $orig_prog = shift; my $bp_repeats = shift; my $suffix=shift; my $rooting_type=shift; # BioPerl | MidPoint if (!defined $suffix) { $suffix = ""; } if (!defined $rooting_type){ $rooting_type="BioPerl"; } $bsDir .= "/"; for (my $countTrees=0;$countTrees<$bp_repeats;++$countTrees) { #print "**** $dataset tree_$countTrees\n"; my $treeFile = $bsDir."tree_".$countTrees."/".$dataset.".".$orig_prog.".semphy.tree_".$countTrees.$suffix; if (-e $treeFile) { my $rootedTreeFile = $treeFile.".rooted"; # print "treeFile: $treeFile\nrootedTreeFile: $rootedTreeFile\n"; if (uc($rooting_type) eq "BIOPERL") { rootTree($treeFile,$rootedTreeFile); } elsif (uc($rooting_type) eq "MIDPOINT") { system ("R --slave --no-save --no-restore --no-environ --silent --args $treeFile $rootedTreeFile < $MidPoint_Rooting_R"); } open IN, "<$rootedTreeFile", or return ("can't open $rootedTreeFile"); my $newick = ""; foreach my $line () { $newick.=$line; } close IN; # if ($newick =~ m/:0[^\.]/ && $newick !~ m/:0\./) { if ($newick =~ m/:-/) { my $rootedTreeFile_withMinusLengths = $rootedTreeFile.".withMinusLengths"; my $command = "mv $rootedTreeFile $rootedTreeFile_withMinusLengths"; system ($command); $newick =~ s/:-/:/g; $command = "echo '$newick' > $rootedTreeFile"; system ($command); } if ($newick =~ m/:\d+[^\.\d+]/) { my $rootedTreeFile_badBranchLength = $rootedTreeFile.".badBranchLength"; my $command = "mv $rootedTreeFile $rootedTreeFile_badBranchLength"; system ($command); $newick =~ s/:(\d+)([^\.\d+])/:$1\.0$2/g; # print "$newick\n"; $command = "echo '$newick' > $rootedTreeFile"; system ($command); } }else { print "File does not exist: $treeFile\n"; } } return "ok"; } sub convertBPTrees2MafftFormat{ #################################################################################################################### #convert the tree to mafft format #################################################################################################################### #die " $0 required as input\n" if (scalar(@ARGV) < 4); my $bsDir = shift; my $dataset = shift; my $orig_prog = shift; my $bp_repeats = shift; my $ruby_path=shift; my $suffix=shift; my $rooting_type=shift; if (!defined $ruby_path) { $ruby_path="ruby"; } if (!defined $suffix) { $suffix = ""; } if (!defined $rooting_type) { $rooting_type="BioPerl"; } $bsDir .= "/"; for (my $countTrees=0;$countTrees<$bp_repeats;++$countTrees) { #print "**** $dataset tree_$countTrees\n"; my $treeFile = $bsDir."tree_".$countTrees."/".$dataset.".".$orig_prog.".semphy.tree_".$countTrees.$suffix; if (-e $treeFile) { my $rootedTreeFile = $treeFile.".rooted"; my $mafftFormatTreeFile = $treeFile.".mafftFormat"; # print "treeFile: $treeFile\nrootedTreeFile: $rootedTreeFile\nmafftFormatTreeFile: $mafftFormatTreeFile\n"; if (uc($rooting_type) eq "BIOPERL") { rootTree($treeFile,$rootedTreeFile); } elsif (uc($rooting_type) eq "MIDPOINT") { system ("R --slave --no-save --no-restore --no-environ --silent --args $treeFile $rootedTreeFile < $MidPoint_Rooting_R"); } open IN, "<$rootedTreeFile", or return ("can't open $rootedTreeFile"); my $newick = ""; foreach my $line () { $newick.=$line; } close IN; # if ($newick =~ m/:0[^\.]/ && $newick !~ m/:0\./) { if ($newick =~ m/:-/) { my $rootedTreeFile_withMinusLengths = $rootedTreeFile.".withMinusLengths"; my $command = "mv $rootedTreeFile $rootedTreeFile_withMinusLengths"; system ($command); $newick =~ s/:-/:/g; $command = "echo '$newick' > $rootedTreeFile"; system ($command); } if ($newick =~ m/:\d+[^\.\d+]/) { my $rootedTreeFile_badBranchLength = $rootedTreeFile.".badBranchLength"; my $command = "mv $rootedTreeFile $rootedTreeFile_badBranchLength"; system ($command); $newick =~ s/:(\d+)([^\.\d+])/:$1\.0$2/g; # print "$newick\n"; $command = "echo '$newick' > $rootedTreeFile"; system ($command); } system("$ruby_path $newick2mafft $rootedTreeFile > $mafftFormatTreeFile\n"); }else { print "File does not exist: $treeFile\n"; } } return "ok"; } sub runAlignBPtrees{ #die " $0 required as input\n" if (scalar(@ARGV) < 3); my $aln_program = ""; my $prog = shift; my $dataset = shift; my $noBPdir = shift; if ($noBPdir !~/(\/)$/){$noBPdir.="/";} my $aminoFile = shift; my $Seq_Type = shift; if ($Seq_Type eq "") {$Seq_Type="AminoAcids";} my $orig_prog=shift; if ($orig_prog eq "") {$orig_prog = $prog;} my $bp_repeats = shift; if ($bp_repeats eq "") {$bp_repeats = 20;} my $align_param=shift; # alignment additional parameters if (!defined $align_param) {$align_param = "";} my $suffix = shift; if (!defined $suffix) {$suffix = "";} my $update_file=shift; #OPTIONAL FOR SERVER TO UPDATE THE PROGRESS my $mafft_prog = shift; my $prank_prog = shift; my $clustalw_prog = shift; my $muscle_prog = shift; my $pagan_prog=shift; my $proc_num = shift; #AMIT my $PRANK_VERSION=0; if (uc($prog) eq "PRANK") { if ($Seq_Type eq "AminoAcids") {$aln_program = $prank_prog;} elsif ($Seq_Type eq "Nucleotides") {$aln_program = $prank_prog;} elsif ($Seq_Type eq "Codons") {$aln_program = $prank_prog." -translate ";} # if PRANK find out it version my $prank_help=`$aln_program`; if ($prank_help=~/.*prunedata.*/){$PRANK_VERSION="121218";} elsif ($prank_help=~/.*showanc.*/){$PRANK_VERSION="120626";} #print "PRANK VERSION:>=$PRANK_VERSION\n"; } elsif (uc($prog) eq "MAFFT"){ if ($Seq_Type eq "AminoAcids") {$aln_program = $mafft_prog." --quiet --amino ";} elsif ($Seq_Type eq "Nucleotides") {$aln_program = $mafft_prog." --quiet --nuc ";} } elsif (uc($prog) eq "MAFFT_LINSI"){ if ($Seq_Type eq "AminoAcids") {$aln_program = GENERAL_CONSTANTS::MAFFT_LINSI_GUIDANCE." --amino ";} elsif ($Seq_Type eq "Nucleotides") {$aln_program = GENERAL_CONSTANTS::MAFFT_LINSI_GUIDANCE." --nuc ";} } elsif (uc($prog) eq "CLUSTALW"){ if ($Seq_Type eq "AminoAcids") {$aln_program = $clustalw_prog." -QUIET -TYPE=PROTEIN ";} elsif ($Seq_Type eq "Nucleotides") {$aln_program = $clustalw_prog." -QUIET -TYPE=DNA ";} } elsif (uc($prog) eq "MUSCLE"){ if ($Seq_Type eq "AminoAcids") {$aln_program = $muscle_prog." -quiet -seqtype protein ";} elsif ($Seq_Type eq "Nucleotides") {$aln_program = $muscle_prog." -quiet -seqtype dna ";} } elsif (uc($prog) eq "PAGAN"){ $aln_program=$pagan_prog." "; } else { return "please specify either mafft / mafft_linsi / prank / clustalw / muscle / pagan as an alignment program\n"; } my $dir = $noBPdir."BP/"; # my $aminoFile = $noBPdir.$dataset.".fas".$suffix; # Running parallel processes using fork, each will run an equal share of the BP alignments # my $proc_num = 8; my $bp_per_proc = int($bp_repeats / $proc_num) + 1; my @children; # array of pid of children for (my $proc=0; $proc<$proc_num; $proc++) { print "Running proc num $proc\n"; my $pid = fork(); if ($pid) { # parent push(@children, $pid); } elsif ($pid == 0) { # child for (my $tree_num=0; $tree_num<$bp_per_proc; $tree_num++) { my $countTrees = $proc * $bp_per_proc + $tree_num; print "proc num $proc\ttree num $tree_num --> global tree index $countTrees\n"; last if ($countTrees == $bp_repeats); my $treeFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".semphy.tree_".$countTrees.$suffix; if (-e $treeFile) { my $alnFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".aln"; my $cmdFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".cmd"; my $stdFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".std"; # if (-e $alnFile) { # print "skipping tree $countTrees because $alnFile already exists.\n"; # next; # } my $cmd = ""; if (uc($prog) eq "PRANK") { if ($PRANK_VERSION==0) { $cmd = "$aln_program $align_param -d=$aminoFile -o=$alnFile -t=$treeFile -once -quiet -noxml > $stdFile";# >& $stdFile"; # -noxml is not supported any more } else # version >=120626 { $cmd = "$aln_program $align_param -d=$aminoFile -o=$alnFile -t=$treeFile -once -quiet > $stdFile"; # -noxml is not supported any more } } elsif ((uc($prog) eq "MAFFT") or (uc($prog) eq "MAFFT_LINSI")){ my $mafftFormatTreeFile = $treeFile.".mafftFormat"; unless (-e $mafftFormatTreeFile) { return ("ERROR: file does not exist: $mafftFormatTreeFile\n"); } if ($align_param=~/addfragments/) { # first build the core MSA my $core_aln_param=""; my @tmp=split (/\Q\-\-\E/,$align_param); my $tmp_size=@tmp; if ($tmp_size>=1) # command line type { for (my $i=0;$i<$tmp_size;$i++) { if ($tmp[$i]=~/addfragments/){delete $tmp[$i];} elsif ($tmp[$i]=~/multipair/){delete $tmp[$i];} elsif ($tmp[$i]=~/6merpair/){delete $tmp[$i];} } $core_aln_param=join ("\\\-\\\-",@tmp); } else # server type { my @tmp=split (/\Q--\E/,$align_param); my $tmp_size=@tmp; for (my $i=0;$i<$tmp_size;$i++) { if ($tmp[$i]=~/addfragments/){delete $tmp[$i];} elsif ($tmp[$i]=~/multipair/){delete $tmp[$i];} elsif ($tmp[$i]=~/6merpair/){delete $tmp[$i];} } $core_aln_param=join ("\\\-\\\-",@tmp); } #print "CORE PARAMS:$core_aln_param\n";; # QA my $mafftFormatCoreTreeFile=$dir."PRUNE_BP_FOR_CORE_ALN/tree_$countTrees/".$dataset.".".$orig_prog.".semphy.tree_".$countTrees."CORE.mafftFormat"; my $mafftCoreMSA=$dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".aln.CORE"; $cmd="$aln_program $core_aln_param --retree 1 --treein $mafftFormatCoreTreeFile $aminoFile > $mafftCoreMSA; "; # use the core MSA and full tree to build the full alignment $cmd.="$aln_program $align_param --retree 1 --treein $mafftFormatTreeFile $mafftCoreMSA > $alnFile"; } else { $cmd = "($aln_program $align_param --retree 1 --treein $mafftFormatTreeFile $aminoFile > $alnFile)";# >& $stdFile"; } } elsif (uc($prog) eq "CLUSTALW") { $cmd = "$aln_program $align_param -infile=$aminoFile -usetree=$treeFile -outfile=$alnFile > $stdFile"; } elsif (uc($prog) eq "MUSCLE") { my $rooted_tree=$treeFile.".rooted"; unless (-e $rooted_tree) { return ("ERROR: file does not exist: $rooted_tree\n"); } $cmd = "$aln_program $align_param -in $aminoFile -usetree_nowarn $rooted_tree -out $alnFile > $stdFile"; } elsif (uc ($prog) eq "PAGAN") { my $rooted_tree=$treeFile.".rooted"; unless (-e $rooted_tree) { return ("ERROR: file does not exist: $rooted_tree\n"); } $cmd = "$aln_program $align_param --seqfile $aminoFile --treefile $rooted_tree --outfile $alnFile > $stdFile"; } #print "$cmd\n"; # QA system ("$cmd"); if ($update_file ne "") { open (PROGRESS,">$update_file"); print PROGRESS "\n

",$countTrees+1," out of $bp_repeats alternative alignments were created

\n"; close (PROGRESS); } # print "$cmdFile\n"; # open IN, ">$cmdFile" or return ("cannot open file $cmdFile\n"); # print IN "#!/bin/sh\n\ncd ".$dir."tree_$countTrees\n\n"; # print IN "$cmd\n"; # close IN; # system ($cmd); if ($prog eq "PRANK") { if ($PRANK_VERSION==0) { system ("cp $alnFile.1.fas $alnFile"); } elsif ($PRANK_VERSION eq "121218") { system ("cp $alnFile.best.fas $alnFile"); } else # ver >=120626 { system ("cp $alnFile.2.fas $alnFile"); } } if ($prog eq "CLUSTALW") { MSA_parser::convert_msa_format($alnFile,"clustalw","$alnFile.fs","fasta"); system ("mv $alnFile $alnFile.orig"); system ("mv $alnFile.fs $alnFile"); } if ($prog eq "PAGAN") { move("$alnFile".".fas", "$alnFile"); } #my $alias = $dataset.".".$prog; #my $qsub = "qsub -q heavy -N $alias $cmdFile"; # print $qsub."\n"; #system ($qsub); } } exit 0; } else { die "ERROR: fork failed: $!\n"; } } # Wait for child processes to end foreach (@children) { my $pid = waitpid($_, 0); print "done with pid $pid\n"; #if ($update_file ne "") { # open (PROGRESS,">$update_file"); # print PROGRESS "\n

",$countTrees+1," out of $bp_repeats alternative alignments were created

\n"; # close (PROGRESS); #} } return "ok"; } sub runAlignBPtrees_GUIDANCE3{ #die " $0 required as input\n" if (scalar(@ARGV) < 3); my $aln_program = ""; my $prog = shift; my $dataset = shift; my $noBPdir = shift; if ($noBPdir !~/(\/)$/){$noBPdir.="/";} my $aminoFile = shift; my $Seq_Type = shift; if ($Seq_Type eq "") {$Seq_Type="AminoAcids";} my $orig_prog=shift; if ($orig_prog eq "") {$orig_prog = $prog;} my $bp_repeats = shift; if ($bp_repeats eq "") {$bp_repeats = 20;} my $align_param=shift; # alignment additional parameters if (!defined $align_param) {$align_param = "";} my $suffix = shift; if (!defined $suffix) {$suffix = "";} my $update_file=shift; #OPTIONAL FOR SERVER TO UPDATE THE PROGRESS my $mafft_prog = shift; my $prank_prog = shift; my $clustalw_prog = shift; my $muscle_prog = shift; my $pagan_prog=shift; my $proc_num = shift; #AMIT my $GapPenDist=shift; # EMP | UNIF my $PRANK_VERSION=0; if (uc($prog) eq "PRANK") { if ($Seq_Type eq "AminoAcids") {$aln_program = $prank_prog;} elsif ($Seq_Type eq "Nucleotides") {$aln_program = $prank_prog;} elsif ($Seq_Type eq "Codons") {$aln_program = $prank_prog." -translate ";} # if PRANK find out it version my $prank_help=`$aln_program`; if ($prank_help=~/.*prunedata.*/){$PRANK_VERSION="121218";} elsif ($prank_help=~/.*showanc.*/){$PRANK_VERSION="120626";} #print "PRANK VERSION:>=$PRANK_VERSION\n"; } elsif (uc($prog) eq "MAFFT"){ if ($Seq_Type eq "AminoAcids") {$aln_program = $mafft_prog." --quiet --amino ";} elsif ($Seq_Type eq "Nucleotides") {$aln_program = $mafft_prog." --quiet --nuc ";} } elsif (uc($prog) eq "MAFFT_LINSI"){ if ($Seq_Type eq "AminoAcids") {$aln_program = GENERAL_CONSTANTS::MAFFT_LINSI_GUIDANCE." --amino ";} elsif ($Seq_Type eq "Nucleotides") {$aln_program = GENERAL_CONSTANTS::MAFFT_LINSI_GUIDANCE." --nuc ";} } elsif (uc($prog) eq "CLUSTALW"){ if ($Seq_Type eq "AminoAcids") {$aln_program = $clustalw_prog." -QUIET -TYPE=PROTEIN ";} elsif ($Seq_Type eq "Nucleotides") {$aln_program = $clustalw_prog." -QUIET -TYPE=DNA ";} } elsif (uc($prog) eq "MUSCLE"){ if ($Seq_Type eq "AminoAcids") {$aln_program = $muscle_prog." -quiet -seqtype protein ";} elsif ($Seq_Type eq "Nucleotides") {$aln_program = $muscle_prog." -quiet -seqtype dna ";} } elsif (uc($prog) eq "PAGAN"){ $aln_program=$pagan_prog." "; } else { return "please specify either mafft / mafft_linsi / prank / clustalw / muscle / pagan as an alignment program\n"; } # SAMPLE OP and EP my $OutEP="$noBPdir/SampledEPVals.log"; my $OutOP="$noBPdir/SampledOPVals.log"; my $op_vals_arr_ref=""; my $ep_vals_arr_ref=""; if (uc ($GapPenDist) eq "EMP") { if ($prog eq "MAFFT") { $OP_DistFile=Guidance::MAFFT_OP_DIST_0_25; $EP_DistFile=Guidance::MAFFT_EP_DIST_0_25; #print LOG "Sample op according to empiric distribution: SampelFromEmpiricDistribution($OP_DistFile,$OutOP,$bp_repeats)\n"; print "Sample op according to empiric distribution: SampelFromEmpiricDistribution($OP_DistFile,$OutOP,$bp_repeats)\n"; $op_vals_arr_ref=SampelFromEmpiricDistribution($OP_DistFile,$OutOP,$bp_repeats); #print LOG "Sample ep according to empiric distribution: SampelFromEmpiricDistribution($EP_DistFile,$OutEP,$bp_repeats)\n"; print "Sample ep according to empiric distribution: SampelFromEmpiricDistribution($EP_DistFile,$OutEP,$bp_repeats)\n"; $ep_vals_arr_ref=SampelFromEmpiricDistribution($EP_DistFile,$OutEP,$bp_repeats); } else { # print LOG "ERROR: EMPIRICAL DISTRIBTION of gap penelties is available only for MAFFT...\n"; print "ERROR: EMPIRICAL DISTRIBTION of gap penelties is available only for MAFFT...\n"; die "EMPIRICAL DISTRIBTION of gap penelties is available only for MAFFT...\n"; } } elsif (uc ($GapPenDist) eq "UNIF") { if ($prog eq "MAFFT") { #print LOG "Sample op according to uniform distribution: SampleFromUnifomDist(1,3,$OutOP,$FORM{Bootstraps})\n"; #$op_vals_arr_ref=SampleFromUnifomDist(1,3,$OutOP,$bp_repeats); # according to mafft web-site defaults: http://mafft.cbrc.jp/alignment/server/index.html # print LOG "Sample op according to uniform distribution: SampleFromUnifomDist(0,25,$OutOP,$bp_repeats)\n"; print "Sample op according to uniform distribution: SampleFromUnifomDist(0,6,$OutOP,$bp_repeats)\n"; $op_vals_arr_ref=SampleFromUnifomDist(0,6,$OutOP,$bp_repeats); # print LOG "Sample op according to uniform distribution: SampleFromUnifomDist(0,40,$OutOP,$bp_repeats)\n"; print "Sample ep according to uniform distribution: SampleFromUnifomDist(0,4,$OutEP,$bp_repeats)\n"; $ep_vals_arr_ref=SampleFromUnifomDist(0,4,$OutEP,$bp_repeats); } elsif ($prog eq "PRANK") { # print LOG "Sample op according to uniform distribution: Guidance::SampleFromUnifomDist(0,0.01,$OutOP,$FORM{Bootstraps})\n"; # $op_vals_arr_ref=Guidance::SampleFromUnifomDist(0,0.01,$OutOP,$FORM{Bootstraps}); # for prank v.140110 the defaults are: dna 0.025 / prot 0.005 # print LOG "Sample op according to uniform distribution: Guidance::SampleFromUnifomDist(0,0.5,$OutOP,$bp_repeats)\n"; print "Sample op according to uniform distribution: Guidance::SampleFromUnifomDist(0,0.5,$OutOP,$bp_repeats)\n"; $op_vals_arr_ref=SampleFromUnifomDist(0,0.5,$OutOP,$bp_repeats); # for prank v.140110 the defaults are: dna 0.025 / prot 0.005 } elsif ($prog eq "CLUSTALW") { # print LOG "Sample gap opening panelty according to uniform distribution: Guidance::SampleFromUnifomDist(4,16,$OutOP,$FORM{Bootstraps})\n"; print "Sample gap opening panelty according to uniform distribution: Guidance::SampleFromUnifomDist(4,16,$OutOP,$bp_repeats)\n"; $op_vals_arr_ref=SampleFromUnifomDist(4,16,$OutOP,$bp_repeats); } } my $dir = $noBPdir."BP/"; # my $aminoFile = $noBPdir.$dataset.".fas".$suffix; # Running parallel processes using fork, each will run an equal share of the BP alignments # my $proc_num = 8; my $bp_per_proc = int($bp_repeats / $proc_num) + 1; my @children; # array of pid of children for (my $proc=0; $proc<$proc_num; $proc++) { print "Running proc num $proc\n"; my $pid = fork(); if ($pid) { # parent push(@children, $pid); } elsif ($pid == 0) { # child for (my $tree_num=0; $tree_num<$bp_per_proc; $tree_num++) { my $countTrees = $proc * $bp_per_proc + $tree_num; print "proc num $proc\ttree num $tree_num --> global tree index $countTrees\n"; last if ($countTrees == $bp_repeats); my $treeFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".semphy.tree_".$countTrees.$suffix; if (-e $treeFile) { my $alnFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees."."OP_".$op_vals_arr_ref->[$countTrees]."_EP_".$ep_vals_arr_ref->[$countTrees].".".$prog.".aln"; my $cmdFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".cmd"; my $stdFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".std"; # if (-e $alnFile) { # print "skipping tree $countTrees because $alnFile already exists.\n"; # next; # } my $cmd = ""; if (uc($prog) eq "PRANK") { if ($PRANK_VERSION==0) { $cmd = "$aln_program $align_param -d=$aminoFile -o=$alnFile -t=$treeFile -once -quiet -noxml > $stdFile";# >& $stdFile"; # -noxml is not supported any more } else # version >=120626 { $cmd = "$aln_program $align_param -d=$aminoFile -o=$alnFile -t=$treeFile -once -quiet > $stdFile"; # -noxml is not supported any more } } elsif ((uc($prog) eq "MAFFT") or (uc($prog) eq "MAFFT_LINSI")){ my $mafftFormatTreeFile = $treeFile.".mafftFormat"; unless (-e $mafftFormatTreeFile) { return ("ERROR: file does not exist: $mafftFormatTreeFile\n"); } if ($align_param=~/addfragments/) { # first build the core MSA my $core_aln_param=""; my @tmp=split (/\Q\-\-\E/,$align_param); my $tmp_size=@tmp; if ($tmp_size>=1) # command line type { for (my $i=0;$i<$tmp_size;$i++) { if ($tmp[$i]=~/addfragments/){delete $tmp[$i];} elsif ($tmp[$i]=~/multipair/){delete $tmp[$i];} elsif ($tmp[$i]=~/6merpair/){delete $tmp[$i];} } $core_aln_param=join ("\\\-\\\-",@tmp); } else # server type { my @tmp=split (/\Q--\E/,$align_param); my $tmp_size=@tmp; for (my $i=0;$i<$tmp_size;$i++) { if ($tmp[$i]=~/addfragments/){delete $tmp[$i];} elsif ($tmp[$i]=~/multipair/){delete $tmp[$i];} elsif ($tmp[$i]=~/6merpair/){delete $tmp[$i];} } $core_aln_param=join ("\\\-\\\-",@tmp); } #print "CORE PARAMS:$core_aln_param\n";; # QA my $mafftFormatCoreTreeFile=$dir."PRUNE_BP_FOR_CORE_ALN/tree_$countTrees/".$dataset.".".$orig_prog.".semphy.tree_".$countTrees."CORE.mafftFormat"; my $mafftCoreMSA=$dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".aln.CORE"; $cmd="$aln_program $core_aln_param --retree 1 --treein $mafftFormatCoreTreeFile $aminoFile > $mafftCoreMSA; "; # use the core MSA and full tree to build the full alignment $cmd.="$aln_program $align_param --retree 1 --treein $mafftFormatTreeFile $mafftCoreMSA > $alnFile"; } else { $cmd = "$aln_program $align_param --op $op_vals_arr_ref->[$countTrees] --ep $ep_vals_arr_ref->[$countTrees] --retree 1 --treein $mafftFormatTreeFile $aminoFile > $alnFile";# >& $stdFile"; } } elsif (uc($prog) eq "CLUSTALW") { $cmd = "$aln_program $align_param -infile=$aminoFile -usetree=$treeFile -outfile=$alnFile > $stdFile"; } elsif (uc($prog) eq "MUSCLE") { my $rooted_tree=$treeFile.".rooted"; unless (-e $rooted_tree) { return ("ERROR: file does not exist: $rooted_tree\n"); } $cmd = "$aln_program $align_param -in $aminoFile -usetree_nowarn $rooted_tree -out $alnFile > $stdFile"; } elsif (uc ($prog) eq "PAGAN") { my $rooted_tree=$treeFile.".rooted"; unless (-e $rooted_tree) { return ("ERROR: file does not exist: $rooted_tree\n"); } $cmd = "$aln_program $align_param --seqfile $aminoFile --treefile $rooted_tree --outfile $alnFile > $stdFile"; } #print "$cmd\n"; # QA system ("$cmd"); if ($update_file ne "") { open (PROGRESS,">$update_file"); print PROGRESS "\n

",$countTrees+1," out of $bp_repeats alternative alignments were created

",$countTrees+1," out of $bp_repeats alternative alignments were created

\n"; # close (PROGRESS); #} } return "ok"; } sub rootTree { #################################################################################################################### #die "USAGE: $0 inTree outTree #inTree must be an unrooted tree - i.e. root node has at least 3 sons. #In outTree the root will have 2 sons #(all direct sons of root will be made biforcating - the rest of tree is left untouched)\n" #################################################################################################################### my $inTree=shift; my $outTree=shift; my $in = new Bio::TreeIO(-file => "$inTree", -format => "newick"); my $out = new Bio::TreeIO(-file => ">$outTree", -format => "newick"); while (my $tree = $in->next_tree) { my $root = $tree->get_root_node; my @sons = $root->each_Descendent; # Remove edges between root-sons foreach my $son (@sons) { $root->remove_Descendent($son); } # Iteratively add my $currFather = $root; while (@sons > 2) { my $son = shift @sons; $currFather->add_Descendent($son); my $midNode = new Bio::Tree::Node(); $currFather->add_Descendent($midNode); $midNode->branch_length(0); $currFather = $midNode; } $currFather->add_Descendent($sons[0]); $currFather->add_Descendent($sons[1]); $sons[0]->branch_length(0); $sons[1]->branch_length(0); $out->write_tree($tree); } } sub pullOutBPtrees_BBL { # TO DO: Unite with pullOutBPtrees # TO DO in order to unite: 1) get the file to parse as an input # 2) QA... #################################################################################################################### # pull out all the BP NJ trees into the BP directory # pull out the original NJ tree (that was done on the complete MSA file) #################################################################################################################### #die "Usage: $0 " if (scalar(@ARGV) < 4); my $noBPdir = shift; my $dataset = shift; my $bp_repeats = shift; my $alnProg = shift; unless ($noBPdir =~ m/\/$/) { $noBPdir.="/"; } if (-e $noBPdir.$dataset.".".$alnProg.".semphy.tree") { unlik ($noBPdir.$dataset.".".$alnProg.".semphy.tree"); } my $BPdir = $noBPdir."BP/"; unless (-e $BPdir) { system ("mkdir $BPdir"); } my $nonUniqueTreesDir=""; # my $semphyLogFile = $BPdir.$dataset.".".$alnProg.".semphy.log"; # was in pullOutBPtrees my $semphyLogFile = $BPdir.$dataset.".".$alnProg.".semphy.out"; print "semphy log file: $semphyLogFile\n"; my $countTrees=0; my $countUniqueTrees=0; my @numRepeats; if ($alnProg ne "MAFFT") # BUILED ALIGNEMT ONLY FOR UNIQUE TREES { $nonUniqueTreesDir = $BPdir."nonUniqueTrees/"; unless (-e $nonUniqueTreesDir) { system ("mkdir $nonUniqueTreesDir"); } } my $make_unique_trees=""; # yes | no # TO DO: take as arg one day... if ($alnProg ne "MAFFT") { $make_unique_trees="yes"; } else { $make_unique_trees="no"; } my $treeLine=""; my $read_reconstructed_tree=0; # will flag that the starting with BP tree... open IN, "<$semphyLogFile" or return "can't open file $semphyLogFile"; while (my $line=) { if ($line=~/^\# Finished tree reconstruction\./) # The real - not BP tree { my $dumpLine=; $dumpLine=; $dumpLine=; $treeLine=; my $treeFile = $noBPdir.$dataset.".".$alnProg.".semphy.tree"; open OUT, ">$treeFile" or return "can't open file $treeFile"; # write the tree into the treefile print OUT $treeLine; # close treefile close OUT; $treeLine=""; $read_reconstructed_tree=1; } elsif ((($line=~/^\# Tree after BBL\./) or ($line=~/The reconsructed tree:/)) and ($read_reconstructed_tree==1)) # BP tree after BBL { my $dumpLine= if ($line=~/^\# Tree after BBL\./); $treeLine=; if ($make_unique_trees eq "no") { $treeDir = $BPdir."/tree_".$countTrees."/" } else { $treeDir = $nonUniqueTreesDir."/tree_".$countTrees."/"; } unless (-e $treeDir) {system ("mkdir $treeDir");} $treeFile = $treeDir.$dataset.".".$alnProg.".semphy.tree_".$countTrees; $countTrees++; # open treefile open OUT, ">$treeFile" or return "can't open file $treeFile"; # write the tree into the treefile print OUT $treeLine; # close treefile close OUT; if ($make_unique_trees eq "yes") { for ($i=0;$i<$countUniqueTrees;++$i) { $uniqueTreeFile = $BPdir."tree_".$i."/".$dataset.".".$alnProg.".semphy.tree_".$i; $isEqualTopologyResFile = $treeDir."isEqualTopology.".$i.".std"; my $isEqualTopologyCommand = "$isEqualTopologyProg $treeFile $uniqueTreeFile"; my $isEqualTopology = `$isEqualTopologyCommand`; open OUT_EQUAL_TOP, ">$isEqualTopologyResFile" or return "can't open file $isEqualTopologyResFile"; print OUT_EQUAL_TOP "$isEqualTopology\n"; close OUT_EQUAL_TOP; if ($isEqualTopology == 1) { # same tree $numRepeats[$i]++; # print "$isEqualTopology == 1\n"; # debug OP last; } if ($isEqualTopology == 2) { print "skipping ERROR in isEqualTopology of $treeFile and $uniqueTreeFile\n"; next; } } if ($i == $countUniqueTrees) { # The new tree is unique push (@numRepeats,1); my $uniqueTreeDir = $BPdir."tree_".$countUniqueTrees."/"; unless (-e $uniqueTreeDir) {system ("mkdir $uniqueTreeDir");} my $uniqueTreeFile = $uniqueTreeDir.$dataset.".".$alnProg.".semphy.tree_".$countUniqueTrees; system ("cp $treeFile $uniqueTreeFile"); $countUniqueTrees++; } } } } close IN; if ($countTrees != $bp_repeats) { return "ERROR: dataset: $dataset \t countTrees: $countTrees while it should be $bp_repeats \n"; } if ($make_unique_trees eq "yes") { # print the number of repeats per unique tree into file my $numRepeatsFile = $BPdir."numRepeats"; open OUT_NUM_REPEATS, ">$numRepeatsFile" or return "can't open file $numRepeatsFile"; print OUT_NUM_REPEATS "@numRepeats"; close OUT_NUM_REPEATS; return "ok",$countUniqueTrees,\@numRepeats; } else { return "ok"; } } sub pullOutBPtrees { #################################################################################################################### # pull out all the BP NJ trees into the BP directory # pull out the original NJ tree (that was done on the complete MSA file) #################################################################################################################### #die "Usage: $0 " if (scalar(@ARGV) < 4); my $noBPdir = shift; my $dataset = shift; my $bp_repeats = shift; my $alnProg = shift; unless ($noBPdir =~ m/\/$/) { $noBPdir.="/"; } my $BPdir = $noBPdir."BP/"; unless (-e $BPdir) { system ("mkdir $BPdir"); } my $nonUniqueTreesDir=""; if ($alnProg ne "MAFFT") # BUILED ALIGNEMT ONLY FOR UNIQUE TREES { $nonUniqueTreesDir = $BPdir."nonUniqueTrees/"; unless (-e $nonUniqueTreesDir) { system ("mkdir $nonUniqueTreesDir"); } } # my $robinsonFouldCommand = "$phylonet_prog rf"; # open semphy file my $semphyLogFile = $BPdir.$dataset.".".$alnProg.".semphy.log"; print "semphy log file: $semphyLogFile\n"; open IN, "<$semphyLogFile" or return "can't open file $semphyLogFile"; my $countTrees=0; my $countUniqueTrees=0; my @numRepeats; my $readTree=0; # this is a flag. 2 == read the tree. 3 == read the tree and check for uniqueness. my $treeDir=""; my $treeFile=""; foreach my $line () { if ($line =~ m/^\# Finished tree reconstruction\./) { $readTree=1; } elsif (($readTree==1) && ($line =~ m/^\# The tree/)) { $treeFile = $noBPdir.$dataset.".".$alnProg.".semphy.tree"; $readTree=2; } elsif (($readTree==1) && ($line =~ m/The reconsructed tree/)) { $readTree=2; if ($alnProg eq "MAFFT") { $treeDir = $BPdir."/tree_".$countTrees."/" } else # CHECK UNIQUE TREE ONLY NOT FOR MAFFT { $treeDir = $nonUniqueTreesDir."/tree_".$countTrees."/"; } unless (-e $treeDir) {system ("mkdir $treeDir");} $treeFile = $treeDir.$dataset.".".$alnProg.".semphy.tree_".$countTrees; $countTrees++; $readTree=3 if ($alnProg ne "MAFFT"); # CHECK UNIQUE TREE ONLY NOT FOR MAFFT } elsif ($readTree>=2) { # open treefile open OUT, ">$treeFile" or return "can't open file $treeFile"; # write the tree into the treefile print OUT $line; # close treefile close OUT; if ($readTree==3) { # compare the tree to all other unique trees for ($i=0;$i<$countUniqueTrees;++$i) { $uniqueTreeFile = $BPdir."tree_".$i."/".$dataset.".".$alnProg.".semphy.tree_".$i; # $RobinsonFouldResFile = $treeDir."rf.".$i.".std"; $isEqualTopologyResFile = $treeDir."isEqualTopology.".$i.".std"; # my $rfCommand = "$robinsonFouldCommand -m $treeFile -e $uniqueTreeFile"; my $isEqualTopologyCommand = "$isEqualTopologyProg $treeFile $uniqueTreeFile"; # my @rfResults = `$rfCommand`; my $isEqualTopology = `$isEqualTopologyCommand`; # my $editedRFresults = $rfResults[0]; # open OUT_RF, ">$RobinsonFouldResFile" or return "can't open file $RobinsonFouldResFile"; # print OUT_RF "@rfResults"; open OUT_EQUAL_TOP, ">$isEqualTopologyResFile" or return "can't open file $isEqualTopologyResFile"; print OUT_EQUAL_TOP "$isEqualTopology\n"; close OUT_EQUAL_TOP; # if ($editedRFresults =~ m/ERROR/) { # print "skipping error in rfResults of $treeFile and $uniqueTreeFile : $editedRFresults\n"; # print OUT_RF "skipping error in rfResults of $treeFile and $uniqueTreeFile : $editedRFresults\n"; # close OUT_RF; # next; # } # close OUT_RF; # print "isEqualTopology == $isEqualTopology\n"; #debug OP # if ($editedRFresults =~ m/^0\s+0\s+/) { # same tree, FP=0 and FN=0 if ($isEqualTopology == 1) { # same tree $numRepeats[$i]++; # print "$isEqualTopology == 1\n"; # debug OP last; } if ($isEqualTopology == 2) { print "skipping ERROR in isEqualTopology of $treeFile and $uniqueTreeFile\n"; next; } } if ($i == $countUniqueTrees) { # The new tree is unique push (@numRepeats,1); my $uniqueTreeDir = $BPdir."tree_".$countUniqueTrees."/"; unless (-e $uniqueTreeDir) {system ("mkdir $uniqueTreeDir");} my $uniqueTreeFile = $uniqueTreeDir.$dataset.".".$alnProg.".semphy.tree_".$countUniqueTrees; system ("cp $treeFile $uniqueTreeFile"); $countUniqueTrees++; } } $readTree=1; } } close IN; if ($countTrees != $bp_repeats) { return "ERROR: dataset: $dataset \t countTrees: $countTrees while it should be $bp_repeats \n"; } if ($alnProg ne "MAFFT") { # print the number of repeats per unique tree into file my $numRepeatsFile = $BPdir."numRepeats"; open OUT_NUM_REPEATS, ">$numRepeatsFile" or return "can't open file $numRepeatsFile"; print OUT_NUM_REPEATS "@numRepeats"; close OUT_NUM_REPEATS; return "ok",$countUniqueTrees,\@numRepeats; } else { return "ok"; } } sub copyBootstrapMSA2oneDir { #die "Usage: $0 " if (scalar(@ARGV) < 4); my ($BP_MSA_dir,$BPdir,$prog,$RefNumRepeats4UniqueTree) = @_; my @numRepeats4UniqueTree= @$RefNumRepeats4UniqueTree; print "numRepeats4UniqueTree: @numRepeats4UniqueTree\n"; mkdir ($BP_MSA_dir); my $countTrees=0; for ($i=0; $i) { chomp $line; my ($Seq_name,$Code)=split("\t",$line); $Codes{$Code}=$Seq_name; } close (CODES); open (IN,$Aln_with_Codes) or return ("Guidance::codes2nameFastaFrom1 Can't open the Alignment with codes file: '$Aln_with_Codes' $!"); open (OUT,">$Aln_with_names") or return ("Guidance::codes2nameFastaFrom1 Can't open the Alignment with names file: '$Aln_with_names' for writing $!"); while (my $line=){ chomp ($line); if ($line=~/>(seq[0-9]+)$/) { print OUT ">",$Codes{$1},"\n"; } elsif ($line=~/^>([0-9]+)/) { print OUT ">",$Codes{$1},"\n"; } else # Seq Line { print OUT "$line\n"; } } close OUT; close IN; return "OK"; } sub ConvertNamesOfAlignWithSeed ################################################################################################### # Take alignment created with seed (by MAFFT) and convert the seq with _seed_ prefix into numbers # IMPORTANT: The seed seq must be first so MAFFT --reorder argument must not be use ################################################################################################### { my $aln=shift; my $out=shift; open (IN,$aln) || return "Guidance::ConvertNamesOfAlignWithSeed: Can't open In '$aln' $!"; open (OUT,">$out") || return "Guidance::ConvertNamesOfAlignWithSeed: Can't open Out '$out' $!"; my $seed_counter=0; while (my $line=) { if ($line=~/>_seed_(.*)/) { $seed_counter++; $line=~s/>_seed_(.*)/>$seed_counter/; print OUT "$line"; } else { print OUT $line; } } close (IN); close (OUT); return "ok"; } sub name2codeFasta_without_codded_out { #################################################################################################################### # Convert the names in a fasta file to numbers, and creates a code file with the names and the codes (running number) ################################################################################################################### my $in_fileName = shift; my $code_fileName = shift; my $counter_offset= shift; # optional my $in_file = Bio::SeqIO->new(-file => $in_fileName , '-format' => 'Fasta'); my $code_file = new FileHandle(">>$code_fileName") or return ("Can't write to $code_fileName $!"); $counter_offset=1 if (!defined $counter_offset); $counter_offset=1 if ($counter_offset==0); my $counter = $counter_offset; my $i; while ( my $seqObj = $in_file->next_seq() ) { my $name = $seqObj->display_id(); $name.= " ".$seqObj->desc() if ($seqObj->desc()); print $code_file "$name\t$counter\n"; $counter++; } $in_file->close(); $code_file->close(); return "ok",$counter; } sub name2codeFastaFrom1 { #################################################################################################################### # Convert the names in a fasta file to numbers, and creates a code file with the names and the codes (running number) ################################################################################################################### my $in_fileName = shift; my $code_fileName = shift; my $out_fileName = shift; my $counter_offset=shift; # optional my $OutNameFormat=shift; # {num|seqNum} - The format of name in the coded file (num=only the seq number, seqNum=seq%04u [HoT]) default=num; $OutNameFormat="num" if (!defined $OutNameFormat); $OutNameFormat="num" if ($OutNameFormat eq ""); my $in_file = Bio::SeqIO->new(-file => $in_fileName , '-format' => 'Fasta'); my $code_file = new FileHandle(">>$code_fileName") or return ("Can't write to $code_fileName $!"); my $out_file = new FileHandle(">$out_fileName") or return ("Can't write to $out_fileName"); $counter_offset=1 if (!defined $counter_offset); $counter_offset=1 if ($counter_offset==0) and ($OutNameFormat ne "seqNum"); my $counter = $counter_offset; my $i; while ( my $seqObj = $in_file->next_seq() ) { my $name = $seqObj->display_id(); $name.= " ".$seqObj->desc() if ($seqObj->desc()); if ($OutNameFormat eq "seqNum") { my $sn=sprintf('seq%04u',$counter); print $code_file "$name\t$sn\n"; } else { print $code_file "$name\t$counter\n"; } my $seq = $seqObj->seq(); if ($OutNameFormat eq "seqNum") { my $sn=sprintf('seq%04u',$counter); print $out_file ">$sn\n"; } else { print $out_file ">$counter\n"; } for($i=0;$iclose(); $in_file->close(); $code_file->close(); return "ok"; } sub average{ my $data = $_[0]; my $data_size=scalar(@{$data}); if ($data_size==0) { return ("ERROR: Empty array\n"); } my $total = 0; foreach my $val (@{$data}) { $total += $val; } my $average = $total / $data_size; return $average; } sub stdev{ my $data = $_[0]; my $average = $_[1]; my $data_size=scalar (@{$data}); if($data_size == 1){ return 0; } my $sqtotal = 0; foreach my $val (@{$data}) { $sqtotal += ($average-$val) ** 2; } my $std = ($sqtotal / ($data_size-1)) ** 0.5; return $std; } sub Calculate_mean_and_std # get a delimited file and calcilate the std and average { my $DataFile=shift; my $Column=shift; $Column--; # for split my $AVERAGE="NAN"; my $STD="NAN"; my @data=(); open (DATA_FILE,"<$DataFile") or return ("Calculate_mean_and_std: Can't open file '$DataFile' $!"); my $line=; # for header while ($line=) { chomp ($line); $line=~ s/^\s+|\s+$//g; my @line=split (/\s+/,$line); push (@data,$line[$Column]) if ((defined $line[$Column]) and ($line[$Column]=~/\d/)); } close (DATA_FILE); my $avg=average(\@data); my $std=stdev(\@data,$avg); return ($avg,$std); } sub removeLowSPseq_Consider_Z_OLD{ # will remove all sequences in which their Z score is bellow cutoff and their SP score is also below cutoff my $msaFile=shift; my $SeqSpFile=shift; my $outFile=shift; my $cutoof=shift; my $Z_cutoff=shift; my $removed_seq_file=shift; my ($mean,$std)=Calculate_mean_and_std ($SeqSpFile,2); my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $msaFile) or die "Can't open $msaFile: $!"; my $aln = $in->next_aln; $aln->verbose(1); #HAIM COMMNET # Otherwise, bioperl adds sequence start/stop values $aln->set_displayname_flat(); #HAIM COMMENT open (SEQ_SP_SCORES,"<$SeqSpFile") or return ("removeLowSPseq_Consider_Z: Can't open file '$SeqSpFile' $!"); open (OUT,">$outFile") or return ("removeLowSPseq_Consider_Z: Can't open file '$outFile' $!"); open (REMOVED_SEQ,">$removed_seq_file") or return ("removeLowSPseq_Consider_Z: Can't open file '$removed_seq_file' $!"); my $line=; #For header while ($line=){ if ($line=~m/^\s*(\d+)\s+(\d+(\.\d+)?)/) { my $row_num=$1; my $Seq_SP_Score=$2; my $seq_obj = $aln->get_seq_by_pos($row_num); my $seq=$seq_obj->seq(); $seq=~s/-//g; $seq =~ s/(.{60,60})/$1\n/g ; $seq .= "\n" unless (substr($seq, -1, 1) eq "\n") ; my $Z_score="NaN"; if ($std>0) {$Z_score=($Seq_SP_Score-$mean)/$std;} if ($Seq_SP_Score eq "nan") { print OUT ">".$seq_obj->id(),"_SP_$Seq_SP_Score_Z_$Z_score\n",$seq,"\n"; } elsif (($Z_score ne "NaN") and ($Z_score<$Z_cutoff)) # a negative outlier { if (($Seq_SP_Score<$cutoof)) # To avoid filter highly scored position { print REMOVED_SEQ ">".$seq_obj->id(),"_SP_$Seq_SP_Score"."_Z_$Z_score\n",$seq,"\n"; } else { print OUT ">".$seq_obj->id(),"_SP_$Seq_SP_Score"."_Z_$Z_score\n",$seq,"\n"; } } else { print OUT ">".$seq_obj->id(),"_SP_$Seq_SP_Score"."_Z_$Z_score\n",$seq,"\n"; } } } close (SEQ_SP_SCORES); close (OUT); close (REMOVED_SEQ); return "OK"; } sub removeLowSPseq_Consider_Z{ # will remove all sequences in which their Z score is bellow cutoff and their SP score is also below cutoff # Alow removal of sequences by their row number ($type=ByRowNum)in the MSA (when using the MSA set score raw file the scores are for MSA row) # or by sequence name ($type=BySeqName) # when using ByRowNum - we use BioPerl object [default] # when using BySeqName - we use hash to represent MSA (currently without extenssive MSA validation) my $msaFile=shift; my $SeqSpFile=shift; my $outFile=shift; my $cutoof=shift; my $Z_cutoff=shift; my $removed_seq_file=shift; my $type=shift; # ByRowNum | BySeqName if (!defined $type){$type="ByRowNum";} my ($mean,$std)=Calculate_mean_and_std ($SeqSpFile,2); my ($MSA_HashRef,$MSA_Length,$MSA_Order_ArrayRef); # For $type=BySeqName my ($in,$aln); # For $type=ByRowNum if (uc($type) eq "BYSEQNAME") { my @ans=readMSA_to_Hash($msaFile); if ($ans[0] ne "OK"){return "removeLowSPseq_Consider_Z:$ans[0]\n";} $MSA_HashRef=$ans[1]; $MSA_Length=$ans[2]; $MSA_Order_ArrayRef=$ans[3]; } elsif (uc($type) eq "BYROWNUM") { $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $msaFile) or die "Can't open $msaFile: $!"; $aln = $in->next_aln; $aln->verbose(1); #HAIM COMMNET # Otherwise, bioperl adds sequence start/stop values $aln->set_displayname_flat(); #HAIM COMMENT } open (SEQ_SP_SCORES,"<$SeqSpFile") or return ("removeLowSPseq_Consider_Z: Can't open file '$SeqSpFile' $!"); open (OUT,">$outFile") or return ("removeLowSPseq_Consider_Z: Can't open file '$outFile' $!"); open (REMOVED_SEQ,">$removed_seq_file") or return ("removeLowSPseq_Consider_Z: Can't open file '$removed_seq_file' $!"); my $line=; #For header while ($line=){ if ($line=~m/^\s*(\S+)\s+(\d+(\.\d+)?)/) { my $seq_id=$1; my $Seq_SP_Score=$2; my ($seq,$seq_name); if (uc($type) eq "BYSEQNAME") { $seq=join("",@{$MSA_HashRef->{$seq_id}}); # seq_id=seq_name $seq_name=$seq_id; } elsif (uc($type) eq "BYROWNUM") { $seq_obj = $aln->get_seq_by_pos($seq_id); # seq_id=row_num $seq=$seq_obj->seq(); $seq_name=$seq_obj->id(); } $seq=~s/-//g; $seq =~ s/(.{60,60})/$1\n/g ; $seq .= "\n" unless (substr($seq, -1, 1) eq "\n") ; my $Z_score="NaN"; if ($std>0) {$Z_score=($Seq_SP_Score-$mean)/$std;} if ($Seq_SP_Score eq "nan") { print OUT ">".$seq_name,"_SP_$Seq_SP_Score_Z_$Z_score\n",$seq,"\n"; } elsif (($Z_score ne "NaN") and ($Z_score<$Z_cutoff)) # a negative outlier { if (($Seq_SP_Score<$cutoof)) # To avoid filter highly scored position { print REMOVED_SEQ ">".$seq_name,"_SP_$Seq_SP_Score"."_Z_$Z_score\n",$seq,"\n"; } else { print OUT ">".$seq_name,"_SP_$Seq_SP_Score"."_Z_$Z_score\n",$seq,"\n"; } } else { print OUT ">".$seq_name,"_SP_$Seq_SP_Score"."_Z_$Z_score\n",$seq,"\n"; } } } close (SEQ_SP_SCORES); close (OUT); close (REMOVED_SEQ); return "OK"; } sub removeLowSPseq{ # Alow removal of sequences by their row number ($type=ByRowNum)in the MSA (when using the MSA set score raw file the scores are for MSA row) # or by sequence name ($type=BySeqName) # when using ByRowNum - we use BioPerl object [default] # when using BySeqName - we use hash to represent MSA (currently without extenssive MSA validation) my $msaFile=shift; my $SeqSpFile=shift; my $outFile=shift; my $cutoof=shift; my $removed_seq_file=shift; my $type=shift; # ByRowNum | BySeqName if (!defined $type){$type="ByRowNum";} my ($MSA_HashRef,$MSA_Length,$MSA_Order_ArrayRef); # For $type=BySeqName my ($in,$aln); # For $type=ByRowNum if (uc($type) eq "BYSEQNAME") { my @ans=readMSA_to_Hash($msaFile); if ($ans[0] ne "OK"){return "removeLowSPseq:$ans[0]\n";} $MSA_HashRef=$ans[1]; $MSA_Length=$ans[2]; $MSA_Order_ArrayRef=$ans[3]; } elsif (uc($type) eq "BYROWNUM") { $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $msaFile) or die "Can't open $msaFile: $!"; $aln = $in->next_aln; $aln->verbose(1); #HAIM COMMNET # Otherwise, bioperl adds sequence start/stop values $aln->set_displayname_flat(); #HAIM COMMENT } open (SEQ_SP_SCORES,"<$SeqSpFile") or return ("removeLowSPseq: Can't open file '$SeqSpFile' $!"); open (OUT,">$outFile") or return ("removeLowSPseq: Can't open file '$outFile' $!"); open (REMOVED_SEQ,">$removed_seq_file") or return ("removeLowSPseq: Can't open file '$removed_seq_file' $!"); my $line=; #For header while ($line=){ if ($line=~m/^\s*(\S+)\s+(\d+(\.\d+)?)/) { my $seq_id=$1; my $Seq_SP_Score=$2; my ($seq,$seq_name); if (uc($type) eq "BYSEQNAME") { $seq=join ("",@{$MSA_HashRef->{$seq_id}}); # seq_id=seq_name $seq_name=$seq_id; } elsif (uc($type) eq "BYROWNUM") { $seq_obj = $aln->get_seq_by_pos($seq_id); # seq_id=row_num $seq=$seq_obj->seq(); $seq_name=$seq_obj->id(); } $seq=~s/-//g; $seq =~ s/(.{60,60})/$1\n/g ; $seq .= "\n" unless (substr($seq, -1, 1) eq "\n") ; if ($Seq_SP_Score>=$cutoof) { print OUT ">".$seq_name,"\n",$seq,"\n"; } else { print REMOVED_SEQ ">".$seq_name,"\n",$seq,"\n"; } } } close (SEQ_SP_SCORES); close (OUT); close (REMOVED_SEQ); return "OK"; } sub removeLowSPseq_OLD{ # before the option to filter by seq name - do it by row number! my $msaFile=shift; my $SeqSpFile=shift; my $outFile=shift; my $cutoof=shift; my $removed_seq_file=shift; my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $msaFile) or die "Can't open $msaFile: $!"; my $aln = $in->next_aln; $aln->verbose(1); #HAIM COMMNET # Otherwise, bioperl adds sequence start/stop values $aln->set_displayname_flat(); #HAIM COMMENT open (SEQ_SP_SCORES,"<$SeqSpFile") or return ("removeLowSPseq: Can't open file '$SeqSpFile' $!"); open (OUT,">$outFile") or return ("removeLowSPseq: Can't open file '$outFile' $!"); open (REMOVED_SEQ,">$removed_seq_file") or return ("removeLowSPseq: Can't open file '$removed_seq_file' $!"); my $line=; #For header while ($line=){ if ($line=~m/^\s*(\d+)\s+(\d+(\.\d+)?)/) { my $row_num=$1; my $Seq_SP_Score=$2; my $seq_obj = $aln->get_seq_by_pos($row_num); my $seq=$seq_obj->seq(); $seq=~s/-//g; $seq =~ s/(.{60,60})/$1\n/g ; $seq .= "\n" unless (substr($seq, -1, 1) eq "\n") ; if ($Seq_SP_Score>=$cutoof) { print OUT ">".$seq_obj->id(),"\n",$seq,"\n"; } else { print REMOVED_SEQ ">".$seq_obj->id(),"\n",$seq,"\n"; } } } close (SEQ_SP_SCORES); close (OUT); close (REMOVED_SEQ); return "OK"; } sub removeLowSPsites_Consider_Z { # no checks of the input files are done. #die "USAGE: $0 MSA_FILE SP_FILE OUT_FILE CUTOFF Z_CUTOFF" if (@ARGV < 5); my $msaFile=shift; my $spFile=shift; my $outFile=shift; my $cutoff=shift; my $Z_cutoff=shift; my $Pos_removed_file=shift; my $MSA_Length=0; my $Num_Pos_removed=0; my $in_fasta = Bio::SeqIO->new(-file => $msaFile, '-format' => 'fasta'); my @seqs; # read the file into an array while (my $seqObj = $in_fasta->next_seq()) { push(@seqs,$seqObj); if ($MSA_Length==0) { $MSA_Length=$seqObj->length; } } my ($mean,$std)=Calculate_mean_and_std ($spFile,2); open IN, "<$spFile" or return ("removeLowSPsites_Consider_Z: can't open file '$spFile'\n"); open OUT, ">$outFile" or return ("removeLowSPsites_Consider_Z: can't open file '$outFile' $!\n"); if ($Pos_removed_file ne ""){open REMOVED_POS, ">$Pos_removed_file" or return ("removeLowSPsites_Consider_Z: can't open file '$Pos_removed_file' $!\n");} my $numRemovedPos = 0; foreach my $line() { if ($line =~ m/^\s*(\d+)\s+(\d+(\.\d+)?)/) { my $site_num=$1; my $site_score=$2; my $Z_score="NaN"; if ($std>0) {$Z_score=($site_score-$mean)/$std;} if (($Z_score ne "NaN") and ($Z_score<-$Z_cutoff)) # an outlier { if ($site_score < $cutoff) { removePos($site_num,$numRemovedPos,\@seqs); print REMOVED_POS "Remove Pos: $site_num\tScore: $site_score\tZ_Score:$Z_score\n" if ($Pos_removed_file ne ""); $numRemovedPos++; } } } } foreach my $seqObj (@seqs) { my $id = $seqObj->id(); my $seq = $seqObj->seq(); print OUT ">$id\n$seq\n"; } close (IN); close (OUT); close (REMOVED_POS); return ("OK",$numRemovedPos,$MSA_Length); } sub removeLowSPsites { # no checks of the input files are done. #die "USAGE: $0 MSA_FILE SP_FILE OUT_FILE CUTOFF" if (@ARGV < 4); my $msaFile=shift; my $spFile=shift; my $outFile=shift; my $cutoff=shift; my $Pos_removed_file=shift; # my $Alphabet=shift; my $MSA_Length=0; my $Num_Pos_removed=0; my $in_fasta = Bio::SeqIO->new(-file => $msaFile, '-format' => 'fasta'); # my $in_fasta = Bio::SeqIO->new(-file => $msaFile, '-format' => 'fasta',-alphabet => $Alphabet); my @seqs; # read the file into an array while (my $seqObj = $in_fasta->next_seq()) { push(@seqs,$seqObj); if ($MSA_Length==0) { $MSA_Length=$seqObj->length; } } open IN, "<$spFile" or return ("removeLowSPsites: can't open file '$spFile'\n"); open OUT, ">$outFile" or return ("removeLowSPsites: can't open file '$outFile' $!\n"); if ($Pos_removed_file ne ""){open REMOVED_POS, ">$Pos_removed_file" or return ("removeLowSPsites: can't open file '$Pos_removed_file' $!\n");} my $numRemovedPos = 0; foreach my $line() { if ($line =~ m/^\s*(\d+)\s+(\d+(\.\d+)?)/) { if ($2 < $cutoff) { removePos($1,$numRemovedPos,\@seqs); # removePos($1,$numRemovedPos,\@seqs,$Alphabet); print REMOVED_POS "Remove Pos: $1\tScore: $2\n" if ($Pos_removed_file ne ""); $numRemovedPos++; } } } foreach my $seqObj (@seqs) { my $id = $seqObj->id(); my $seq = $seqObj->seq(); print OUT ">$id\n$seq\n"; } close (IN); close (OUT); close (REMOVED_POS); return ("OK",$numRemovedPos,$MSA_Length); } sub removePos { my $numRemovedPos=$_[1]; my $seq_ref=$_[2]; # my $Alphabet=$_[3]; my $pos2remove = $_[0] - $numRemovedPos; foreach my $seqObj (@{$seq_ref}){ my $new_seq = ""; if ($pos2remove>1) { $new_seq = $seqObj->subseq(1,$pos2remove-1); } if ($pos2remove< $seqObj->length()) { $new_seq .= $seqObj->subseq($pos2remove+1,$seqObj->length()); } $seqObj->seq($new_seq); # $seqObj->alphabet($Alphabet); } } sub removeLowSPsites_NoBioPerl_Consider_Z { # no checks of the input files are done. #die "USAGE: $0 MSA_FILE SP_FILE OUT_FILE CUTOFF Z_CUTOFF POS_REMOVED_FILE" if (@ARGV < 6); my $msaFile=shift; my $spFile=shift; my $outFile=shift; my $cutoff=shift; my $Z_cutoff=shift; my $Pos_removed_file=shift; my $Num_Pos_removed=0; my @ans=readMSA_to_Hash($msaFile); if ($ans[0] ne "OK"){return "removeLowSPsites_Consider_Z:$ans[0]\n";} my $MSA_HashRef=$ans[1]; # hash of array, each key is seq ID and the value is an array with the seq my $MSA_Length=$ans[2]; my $MSA_Order_ArrayRef=$ans[3]; my ($mean,$std)=Calculate_mean_and_std ($spFile,2); open IN, "<$spFile" or return ("removeLowSPsites_Consider_Z: can't open file '$spFile'\n"); open OUT, ">$outFile" or return ("removeLowSPsites_Consider_Z: can't open file '$outFile' $!\n"); if ($Pos_removed_file ne ""){open REMOVED_POS, ">$Pos_removed_file" or return ("removeLowSPsites_Consider_Z: can't open file '$Pos_removed_file' $!\n");} my $numRemovedPos = 0; foreach my $line() { if ($line =~ m/^\s*(\d+)\s+(\d+(\.\d+)?)/) { my $site_num=$1; my $site_score=$2; my $Z_score="NaN"; if ($std>0) {$Z_score=($site_score-$mean)/$std;} if (($Z_score ne "NaN") and ($Z_score<$Z_cutoff)) # an outlier { if ($site_score < $cutoff) { $MSA_HashRef=removePos_noBioPerl($site_num-1,$MSA_HashRef); print REMOVED_POS "Remove Pos: $site_num\tScore: $site_score\tZ_Score: $Z_score\n" if ($Pos_removed_file ne ""); $numRemovedPos++; } } } } foreach my $seqID (@{$MSA_Order_ArrayRef}) { my $seq = join ("",@{$MSA_HashRef->{$seqID}}); if ($seq=~/^[-]+$/) {print "WARNNING: After removing positions scored below $cutoff and Z_Score: $Z_cutoff, the sequence $seqID comprised of only gap characters...\n";} print OUT ">$seqID\n$seq\n"; } close (IN); close (OUT); close (REMOVED_POS); return ("OK",$numRemovedPos,$MSA_Length); } sub removeLowSPsites_NoBioPerl { # no checks of the input files are done. #die "USAGE: $0 MSA_FILE SP_FILE OUT_FILE CUTOFF POS_REMOVED_FILE" if (@ARGV < 5); my $msaFile=shift; my $spFile=shift; my $outFile=shift; my $cutoff=shift; my $Pos_removed_file=shift; my $Num_Pos_removed=0; my @ans=readMSA_to_Hash($msaFile); if ($ans[0] ne "OK"){return "removeLowSPsites:$ans[0]\n";} my $MSA_HashRef=$ans[1]; my $MSA_Length=$ans[2]; my $MSA_Order_ArrayRef=$ans[3]; open IN, "<$spFile" or return ("removeLowSPsites: can't open file '$spFile'\n"); open OUT, ">$outFile" or return ("removeLowSPsites: can't open file '$outFile' $!\n"); if ($Pos_removed_file ne ""){open REMOVED_POS, ">$Pos_removed_file" or return ("removeLowSPsites: can't open file '$Pos_removed_file' $!\n");} my $numRemovedPos = 0; foreach my $line() { chomp ($line); if ($line =~ m/^\s*(\d+)\s+(\d+(\.\d+)?)/) { if ($2 < $cutoff) { my $Pos=$1; $MSA_HashRef=removePos_noBioPerl ($Pos-1,$MSA_HashRef); # Pos-1 because array start from 0; print REMOVED_POS "Remove Pos: $1\tScore: $2\n" if ($Pos_removed_file ne ""); $numRemovedPos++; } } } foreach my $seqID (@{$MSA_Order_ArrayRef}) { my $seq = join ("",@{$MSA_HashRef->{$seqID}}); if ($seq=~/^[-]+$/) {print "WARNNING: After removing positions scored below $cutoff, the sequence $seqID comprised of only gap characters...\n";} print OUT ">$seqID\n$seq\n"; } close (IN); close (OUT); close (REMOVED_POS); return ("OK",$numRemovedPos,$MSA_Length); } sub printMSA_Hash { my $refToHash=shift; foreach my $key (keys %{$refToHash}) { print ">$key\n",join ("",@{$refToHash->{$key}}),"\n"; } } sub readMSA_to_Hash { # Take an MSA in FASTA format and return (1) an hash of arrays where seq id is key and the seq is an array; (2) MSA length (3) ref to array with the order of seq header in orig file my $MSA_File=$_[0]; my %MSA_Hash=(); my $MSA_Length=0; my @MSA_Order=(); # open file open (my $inMSA, "<", $MSA_File) or return ("GUIDANCE::readMSA_to_Hash: FAILED to open '$MSA_File' $!"); ## 1.1. Read FASTA header and save it my $fastaLine = <$inMSA>; while (defined $fastaLine) { chomp $fastaLine; my $header = substr($fastaLine,1); ## 1.2. Read seq until next header $fastaLine = <$inMSA>; my $seq = ""; while ((defined $fastaLine) and (substr($fastaLine,0,1) ne ">" )) { chomp $fastaLine; $seq .= $fastaLine; $fastaLine = <$inMSA>; } ## 2.1 update hash my @seq_arr=split(//,$seq); if (exists $MSA_Hash{$header}) { print "[WARN] The sequence name '$header' appear more than once in the MSA '$MSA_File', Consider only the first instance....\n"; } else { $MSA_Hash{$header}=[@seq_arr]; push (@MSA_Order,$header); } if ($MSA_Length==0) { $MSA_Length=length ($seq); } } # close file close ($inMSA); return ("OK",\%MSA_Hash,$MSA_Length,\@MSA_Order); } sub removePos_noBioPerl { # Get MSAhash where each key is seqID and each value is reff to array with seq; # Give a position, rumove the specific char from all sequences my $PosToRemove=$_[0]; my $MSA_Hash_ref=$_[1]; foreach my $Seq_ID (keys %{$MSA_Hash_ref}) { $MSA_Hash_ref->{$Seq_ID}[$PosToRemove]=""; } return $MSA_Hash_ref; } sub codes2name_scoresFile { my $Score_File=shift; my $Codes_File=shift; my $Out=shift; my %Codes=(); open (CODES,$Codes_File) or return ("codes2name_scoresFile:Can't open the Codes file: '$Codes_File' $!"); while (my $line=) { chomp $line; my ($Seq_name,$Code)=split("\t",$line); $Codes{$Code}=$Seq_name; } close (CODES); open (OUT,">$Out") or return ("codes2name_scoresFile: Can't open out file: '$Out' $!"); open (SCORES,$Score_File) or return ("codes2name_scoresFile: Can't open Scores file: '$Score_File' $!"); my $line=; #Header; print OUT "SEQUENCE_NAME\tSEQUENCE_SCORE\n"; while ($line=) { my $code=""; my $score=""; if ($line=~/([0-9]+)\s+([0-9.]+)/){ $code=$1; $score=$2; } if ($Codes{$code}) { print OUT "$Codes{$code}\t$score\n"; } else { print OUT "$line"; } } close (OUT); close (SCORES); return "OK"; } sub Convert_to_Codons_Numbering { my $score_file=shift; my $score_codons_file=shift; open (IN,$score_file)|| return ("Can't open the input file: '$score_file' $!\n"); open (OUT,">$score_codons_file") || return ("Can't open the output file: '$score_codons_file' $!\n"); while (my $line=) { chomp($line); $line=trim($line); my @line=split(/\s+/,$line); # my $array_size=scalar(@line); if ($line[0]=~/[0-9]+/) { $array_size=scalar(@line)-1; print OUT $line[0]*3-2,"\t",join("\t",@line[1..$array_size]),"\n"; print OUT $line[0]*3-1,"\t",join("\t",@line[1..$array_size]),"\n"; print OUT $line[0]*3,"\t",join("\t",@line[1..$array_size]),"\n"; } else { print OUT join("\t",@line),"\n"; } } close (IN); close (OUT); return "OK"; } sub trim($) { my $string = shift; $string =~ s/^[\s\t]+//; $string =~ s/[\s\t]+$//; return $string; } sub MSA_row_Num_to_Seq_Name { my $MSA=shift; my $Codes_Name_File=shift; my %MSA_Row_Seq_Name=(); my %Code_Names=(); open (CODES,$Codes_Name_File) or return ("Guidance::MSA_row_Num_to_Seq_Name Can't open the Codes file: '$Codes_Name_File' $!"); while (my $line=) { chomp $line; my ($Seq_name,$Code)=split("\t",$line); $Code_Names{$Code}=$Seq_name; } close (CODES); # Read MSA my $MSA_Depth; my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $MSA) or return ("MSA_row_Num_to_Seq_Name can't open '$MSA': $!"); my $aln = $in->next_aln; $aln->verbose(1); # Otherwise, bioperl adds sequence start/stop values $aln->set_displayname_flat(); @ans=MSA_parser::check_msa_licit_and_size($MSA,"fasta","no"); if ($ans[0] eq "OK"){$MSA_Depth=$ans[1];} else {return "MSA_row_Num_to_Seq_Name: ".join (" ",@ans);} for(my $i=1;$i<=$MSA_Depth;$i++){ my $seq = $aln->get_seq_by_pos($i); my $Seq_Name=$Code_Names{$seq->id}; $MSA_row_Num_to_Seq_Name{$i}=$Seq_Name; print "ROW $i\t$Seq_Name\n"; } return ("OK",\%MSA_row_Num_to_Seq_Name); } sub codes2name_scoresFile_NEW # The Scores file { my $Score_File=shift; my $Codes_File=shift; my $MSA_File=shift; my $Out=shift; my %MSA_row_Num_to_Seq_Name=(); my %Code_Names=(); # Read codes open (CODES,$Codes_File) or return ("Guidance::codes2name_scoresFile_NEW Can't open the Codes file: '$Codes_File' $!"); while (my $line=) { chomp $line; my ($Seq_name,$Code)=split("\t",$line); $Code_Names{$Code}=$Seq_name; } close (CODES); # Read MSA to see which seq in which row and assign the correct code name my $MSA_Depth; my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $MSA_File) or return ("Guidance::codes2name_scoresFile_NEW can't open $MSA_File: $!"); my $aln = $in->next_aln; $aln->verbose(1); # Otherwise, bioperl adds sequence start/stop values $aln->set_displayname_flat(); @ans=MSA_parser::check_msa_licit_and_size($MSA_File,"fasta","no"); if ($ans[0] eq "OK"){$MSA_Depth=$ans[1];} else {return ": ".join (" ",@ans);} for(my $i=1;$i<=$MSA_Depth;$i++){ my $seq = $aln->get_seq_by_pos($i); my $Seq_Name=$Code_Names{$seq->id}; $MSA_row_Num_to_Seq_Name{$i}=$Seq_Name; # print "ROW $i\t$Seq_Name\n"; } #Add names to score file open (OUT,">$Out") or return ("Guidance::codes2name_scoresFile_NEW: Can't open out file: '$Out' $!"); open (SCORES,$Score_File) or return ("Guidance::codes2name_scoresFile_NEW: Can't open Scores file: '$Score_File' $!"); my $line=; #Header; print OUT "SEQUENCE_NAME\tSEQUENCE_SCORE\n"; while ($line=) { my $code=""; my $score=""; my $MSA_row_Num=""; if ($line=~/([0-9]+)\s+([0-9.]+)/){ $MSA_row_Num=$1; $score=$2; } if (defined $MSA_row_Num_to_Seq_Name{$MSA_row_Num}) { print OUT "$MSA_row_Num_to_Seq_Name{$MSA_row_Num}\t$score\n"; } else { print OUT "$line"; } } close (OUT); close (SCORES); return "OK"; } # JALVIEW OUTPUTS sub make_Jalview_AnnotationGraph { my $Jalview_AnnotFile=shift; # OUT FILE my $Data_File=shift; # file ordered by the X's, first field must be X my $Y_label=shift; # The Y label my $Y_data_Col=1; # The Col of Y data my $last_x = 0; open (OUT,">$Jalview_AnnotFile") || return ("Can't open outAnnotationsFile '$Jalview_AnnotFile': $!"); print OUT "JALVIEW_ANNOTATION\n"; print OUT "BAR_GRAPH\t$Y_label\t"; open (DATA_FILE,$Data_File) || die ("make_Jalview_AnnotationGraph: Can't open data file '$Data_File' $!"); my $line=; # header while ($line=) { chomp ($line); my @data=split(",",$line); if (($data[$Y_data_Col] ne "nan") and (($data[0]-$last_x)==1)) { print OUT "$data[$Y_data_Col],$data[$Y_data_Col],$data[$Y_data_Col]|"; $last_x=$data[0]; } elsif ($data[1] ne "nan") { while ($data[0]-$last_x!=1) { print OUT "0,0,0|"; $last_x++; } print OUT "$data[$Y_data_Col],$data[$Y_data_Col],$data[$Y_data_Col]|"; $last_x=$data[0]; } elsif ($data[$Y_data_Col] eq "nan") { print OUT "$data[$Y_data_Col],$data[$Y_data_Col],$data[$Y_data_Col]|"; $last_x=$data[0]; } } print OUT "\n"; close (OUT); } sub make_Jalview_Color_MSA { # Data for MSA coloring my $inMsaFile=shift; # MSA File my $scoresFile=shift; # Scores File my $outJalviewFeaturesFile=shift; my $codesFile=shift; # OPTIONAL # Global VARS my $sequenceLengthForDisplay=400000; # Print HTML start open JALVIEW_FEATURES, ">$outJalviewFeaturesFile" or return ("Can't open outFeaturesFile '$outJalviewFeaturesFile': $!"); #td.Score9{ color: #FFFFFF; background: #A02560; #td.Score8{ background: #F07DAB; #td.Score7{ background: #FAC9DE; #td.Score6{ background: #FCEDF4; #td.Score5{ background: #FFFFFF; #td.Score4{ background: #EAFFFF; #td.Score3{ background: #D7FFFF; #td.Score2{ background: #8CFFFF; #td.Score1{ background: #10C8D1; #td.ScoreNaN background: #C0C0C0; print JALVIEW_FEATURES "Score1\t10C8D1\n"; print JALVIEW_FEATURES "Score2\t8CFFFF\n"; print JALVIEW_FEATURES "Score3\tD7FFFF\n"; print JALVIEW_FEATURES "Score4\tEAFFFF\n"; print JALVIEW_FEATURES "Score5\tFFFFFF\n"; print JALVIEW_FEATURES "Score6\tFCEDF4\n"; print JALVIEW_FEATURES "Score7\tFAC9DE\n"; print JALVIEW_FEATURES "Score8\tF07DAB\n"; print JALVIEW_FEATURES "Score9\tA02560\n"; print JALVIEW_FEATURES "ScoreNaN\tC0C0C0\n"; print JALVIEW_FEATURES "STARTGROUP\tGUIDANCE\n"; open SCORES, $scoresFile or return ("Can't open $scoresFile: $!"); my %scores; my %Code_Names; foreach () { next if (/^\#/); s/^\s+//; my ($col, $seq, $score) = split; $scores{$seq}[$col] = $score; } # Read Codes if ($codesFile ne "") { open (CODES,$codesFile) or return ("Guidance::printColoredAlignment Can't open the Codes file: '$codesFile' $!"); while (my $line=) { chomp $line; my ($Seq_name,$Code)=split("\t",$line); $Code_Names{$Code}=$Seq_name; } close (CODES); } # Read MSA my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $inMsaFile) or die "Can't open $inMsaFile: $!"; my $aln = $in->next_aln; $aln->verbose(1); #HAIM COMMNET # Otherwise, bioperl adds sequence start/stop values $aln->set_displayname_flat(); #HAIM COMMENT @ans=MSA_parser::check_msa_licit_and_size($inMsaFile,"fasta","no"); #HAIM ADD if ($ans[0] eq "OK"){$MSA_Depth=$ans[1];} #HAIM ADD else {return "printColoredAlignment: ".join (" ",@ans);} #HAIM ADD # Print HTML start my %msaColors = (); my %msaPrintColors = (); my $lineCounter; my @line; my $key; my @msaRightOrder=0; my $msaRightOrderCounter=0; my @colorstep = (); #color steps $colorstep[0] = "Score1"; #Not confident $colorstep[1] = "Score2"; $colorstep[2] = "Score3"; $colorstep[3] = "Score4"; $colorstep[4] = "Score5"; #average $colorstep[5] = "Score6"; $colorstep[6] = "Score7"; $colorstep[7] = "Score8"; $colorstep[8] = "Score9"; #Most confident $colorstep[9] = "Score9"; #Most confident (the score is exactly 1) my @colorstep_code = (); #color steps $colorstep_code[0] = "#10C8D1"; #Not confident $colorstep_code[1] = "#8CFFFF"; $colorstep_code[2] = "#D7FFFF"; $colorstep_code[3] = "#EAFFFF"; $colorstep_code[4] = "#FFFFFF"; #average $colorstep_code[5] = "#FCEDF4"; $colorstep_code[6] = "#FAC9DE"; $colorstep_code[7] = "#F07DAB"; $colorstep_code[8] = "#A02560"; #Most confident # get Align max_seq_length my $seq = $aln->get_seq_by_pos(1); my $Align_width = $seq->length(); # counts how many times we print the whole section (relevants to sequences longer than the sequenceLengthForDisplay) for(my $blockStart=1; $blockStart<$aln->length; $blockStart+=$sequenceLengthForDisplay) { my $blockEnd = $blockStart+$sequenceLengthForDisplay; $blockEnd = $aln->length if ($blockEnd > $aln->length); # Iterate over sequences and print up to sequenceLengthForDisplay residues for(my $i=1;$i<=$MSA_Depth;$i++){ #HAIM ADD my $seq = $aln->get_seq_by_pos($i); #HAIM ADD # NEW my $Color_Class=""; # Print seq my @seq = split //, $seq->subseq($blockStart, $blockEnd);; my $gaps=0; for(my $pos=0; $pos<@seq; $pos++) { my $res = $seq[$pos]; my $Color_Class=""; my $prob=""; if ($res eq '-') { $gaps++; } elsif (($res ne '-') and ($scores{$i}[$pos+1] ne "nan")) { $Color_Class = $colorstep[ int(9 * $scores{$i}[$pos+1]) ]; $prob=$scores{$i}[$pos+1]; if ($Color_Class ne "Score5") { # print JALVIEW_FEATURES "$prob\t$Code_Names{$seq->id}\t-1\t",$pos+1-$gaps,"\t",$pos+1-$gaps,"\t$Color_Class\t$prob\n"; print JALVIEW_FEATURES "$prob\tID_NOT_SPECIFIED\t",$i-1,"\t",$pos+1-$gaps,"\t",$pos+1-$gaps,"\t$Color_Class\t$prob\n"; # COLOR JALVIEW by SEQ NUM - not SEQ ID } } elsif (($res ne '-') and ($scores{$i}[$pos+1] eq "nan")) { $Color_Class="ScoreNaN"; # print JALVIEW_FEATURES "$prob\t$Code_Names{$seq->id}\t-1\t",$pos+1-$gaps,"\t",$pos+1-$gaps,"\t$Color_Class\t$prob\n"; print JALVIEW_FEATURES "NA\tID_NOT_SPECIFIED\t",$i-1,"\t",$pos+1-$gaps,"\t",$pos+1-$gaps,"\t$Color_Class\t$prob\n"; # COLOR JALVIEW by SEQ NUM - not SEQ ID } } } } print JALVIEW_FEATURES "ENDGROUP\tGUIDANCE\n"; close (JALVIEW_FEATURES); } sub make_JalView_output { my $JalView_Applet_Page=shift; my $WorkingDir=shift; my $http=shift; # Colored MSA my $inMsa=shift; # MSA file with codes my $inMsa_With_names=shift; # MSA file with codes my $scores=shift; # hash of scores my $outJalviewFeaturesFile=shift; my $NamesCodeFile=shift; # Annotation Graph my $Jalview_AnnotFile=shift; # OUT FILE my $Data_File=shift; # file ordered by the X's, first field must be X my $Y_label=shift; # The Y label open (JALVIEW,">$JalView_Applet_Page") || return "Can't open Jalview output page: '$JalView_Applet_Page' $!"; make_Jalview_Color_MSA("$WorkingDir$inMsa",$scores,$WorkingDir.$outJalviewFeaturesFile,$NamesCodeFile); make_Jalview_AnnotationGraph($WorkingDir.$Jalview_AnnotFile,$Data_File,$Y_label); print JALVIEW "\n"; print JALVIEW "\n"; print JALVIEW "\n"; print JALVIEW "\n"; print JALVIEW "\n"; print JALVIEW "\n"; print JALVIEW "\n"; print JALVIEW "\n"; print JALVIEW "\n"; print JALVIEW "\n"; print JALVIEW "\n"; print JALVIEW "\n"; } sub validate_Seqs{ my $workingDir=shift; my $in=shift; my $SeqType=shift; # AminoAcids | Nucleotides | Codons my $MSA=shift; # Yes,No my $CodonTable=shift; # requiered if SeqType is Codons open (IN,$workingDir.$in) || return ('sys_error', "Validate_Seqs:Can't open '$workingDir$in': $!"); my $seq=""; my $seq_name=""; my $seq_length=0; my $Warnning=""; my $Counter=0; open (OUT,">$workingDir"."$in".".FIXED") || return ('sys_error', "Validate_Seqs:Can't open '$workingDir$in".".FIXED': $!"); while (my $line=) { chomp ($line); $line=~ s/^\s+|\s+$//g; if (($line!~/>/)and ($line ne "")) { $seq=$seq.$line; } elsif ($line=~/^>(.*)/) { # validate prev seq if (($seq eq "") and ($seq_name ne "")) { return ("The sequence named '$seq_name' is missing
"); } if (($seq ne "") and ($seq_name ne "")) { # validate seq according to type if ($MSA eq "Yes") # Make sure alignment length equal { $seq_length=length($seq) if ($seq_length==0); # initialize the first one if (length($seq)!=$seq_length) { return ("The sequences of the provided MSA are not properly aligned, For example the seq: '$seq_name' does not aligned to all others. Please fix the alignment and run GUIDANCE again or provide GUIDANCE sequences only
"); } if ($SeqType eq "Codons") # Make sure that in Codon Alignment there are no stop Codons and all seq are divided by 3 { my $ans=validate_seq_in_CodonAlign($seq,$seq_name,$CodonTable); return ($ans) if ($ans ne "OK"); } } if ($MSA eq "No") { if ($seq=~/([-]+)$/) { $seq=~s/$1//; $Warnning="Gap characters (-) were removed from the end of the sequences"; } if ($seq=~/[-]/) { return ("Seq: named '$seq_name' contain a gap character '-' which is illigal when sequences are submited to GUIDANCE. If you intended to submit an alignment, please upload the file using the 'Upload MSA file for evaluation' option
"); } } if ($seq=~s/\*$//) { $Warnning="Star character (*) were removed from the end of the sequences"; } my @ans=validate_single_seq($seq_name,$seq,$SeqType); if ($ans[0] eq "OK") { print OUT ">$seq_name\n"; # prev seq print OUT "$seq\n"; # prev seq $Counter++; } else { return ($ans[0]); } } # Start new seq if ($line=~/^>(.*)/) { $seq_name=$1; $seq_name=~ s/^\s+|\s+$//g ; if ($seq_name eq "") { my $Seq_Num=$Counter+1; return ("Seq number $Seq_Num has no sequence name; Please fix and resubmit
"); } else { # $seq_name=$1; $seq=""; } } } } # validate last seq if (($seq eq "") and ($seq_name ne "")) { return ("The sequence named '$seq_name' is missing
"); } else { # validate seq according to type if ($MSA eq "Yes") # Make sure alignment length equal { $seq_length=length($seq) if ($seq_length==0); # initialize the first one if (length($seq)!=$seq_length) { return ("The sequences of the provided MSA are not properly aligned, For example the seq: '$seq_name' does not aligned to all others. Please fix the alignment and run GUIDANCE again or provide GUIDANCE sequences only
"); } if ($SeqType eq "Codons") # Make sure that in Codon Alignment there are no stop Codons and all seq are divided by 3 { my $ans=validate_seq_in_CodonAlign($seq,$seq_name,$CodonTable); return ($ans) if ($ans ne "OK"); } } if ($MSA eq "No") { if ($seq=~/([-]+)$/) { $seq=~s/$1//; $Warnning="Gap characters (-) were removed from the end of the sequences"; } if ($seq=~/[-]/) { return ("Seq: named '$seq_name' contain a gap character '-' which is illigal when sequences are submited to GUIDANCE. If you intended to submit an alignment, please upload the file using the 'Upload MSA file for evaluation' option
"); } } my @ans=validate_single_seq($seq_name,$seq,$SeqType); if ($ans[0] eq "OK") { print OUT ">$seq_name\n"; # prev seq print OUT "$seq\n"; # prev seq $Counter++; } else { return ($ans[0]); } } #if ($Counter<4) #{ # return ("Only $Counter sequences were provided, however at least 4 sequences are requiered"); #} close (OUT); close (IN); return ("OK",$Warnning,$in.".FIXED",$Counter); } sub validate_single_seq { my $Seq_Name=shift; my $seq=shift; my $seq_type=shift; if (($seq!~/[ABRNDCQEGHILKMFPSTWYVXZabrndcqeghilkmfpstwyvxz]+/) and ($seq_type eq "AminoAcids")) { return ("Seq: '$Seq_Name' is empty
"); } elsif (($seq!~/[ACTGUNactgun]+/) and ($seq_type ne "AminoAcids")) { return ("Seq: '$Seq_Name' is empty
"); } if (($seq=~/([^ABRNDCQEGHILKMFPSTWYVXZabrndcqeghilkmfpstwyvxz-])/) and ($seq_type eq "AminoAcids"))#Maybe allow: _*-? { return ("Seq: '$Seq_Name' contained the character: '$1' which is not a standard Amino Acid
"); } #----------- Amit ------------- if (($seq=~/([^ACGTRYWSMKHBVDNUXacgtrywsmkhbvdnux-])/) and ($seq_type ne "AminoAcids"))#Maybe allow: _*-? { my $wrong_char=$1; if (($seq=~ /[Uu]/) and ($seq_type eq "Nucleotides")) { return ("Currently GUIDANCE does not accept 'U's in nucleotide sequences, you may consider replacing the 'U's by 'T's and re-submit.
In addition, seq: '$Seq_Name' contained the character: '$wrong_char' which is not a standard Nucleotide
"); } return ("Seq: '$Seq_Name' contained the character: '$wrong_char' which is not a standard Nucleotide
"); } if (($seq=~ /[Uu]/) and ($seq_type eq "Nucleotides"))#Maybe allow: _*-? { return ("Currently GUIDANCE does not accept 'U's in nucleotide sequences, you may consider replacing the 'U's by 'T's and re-submit.
"); } #----------- Amit ------------- return ("OK"); } sub validate_seq_in_CodonAlign # Make sure no stop Codons { my $DNASequence = shift; my $DNASequenceName = shift; my $codonTableIndex = shift; my $stopCodon_Found="NO"; my $AASeq=""; my $codonTable_obj = Bio::Tools::CodonTable -> new ( -id => $codonTableIndex ); chomp ($DNASequence); my $seq_length = length($DNASequence); my $i =0; return ("Sequence '$DNASequenceName' is not a valid coding sequence: the sequence is of length $seq_length which it is not divided by 3") if ($seq_length % 3>0); while ($i<$seq_length-2) { $codon = substr($DNASequence, $i, 3); if ($codon eq '---') { $AA = '-'; } else { $AA = $codonTable_obj->translate($codon); } $AASeq.= $AA; $i+=3; } if ($AASeq =~ m/\*/){ return ("Sequence: '$DNASequenceName' contains a stop codon, please remove all stop codons (from all sequences) and submit to GUIDANCE again"); } return ("OK"); } sub SampleFromUnifomDist { my $start=shift; my $end=shift; my $out_sample_file_name = shift; my $sample_size = shift; my @sample=(); my @op_vals = (); my $range=$end-$start; open(my $OUT, ">>", $out_sample_file_name) or die "SampeFromEmpiricDistribution:cannot open OUT: '$out_sample_file_name' $!"; my $j = 0; my $RandNum = 0; while ($j<$sample_size){ my $op_rand=rand($range)+$start;; #my $op_rand = 2*rand() + 1; push (@sample,$op_rand); print $OUT "$op_rand\n"; $j++; } close ($OUT); return (\@sample); } sub SampelFromEmpiricDistribution { # Input: distribution file name, output file name; times to run (for debigging) my $distribution_file_name = shift; my $out_sample_file_name = shift; my $sample_size = shift; my @sample=(); my @op_vals = (); my @op_density = (); my @op_prob = (); my @op_CDF = (); my $i = 0; open(my $IN, "<", $distribution_file_name) or die "SampeFromEmpiricDistribution:cannot open IN: '$distribution_file_name' $!"; my $line=<$IN>; while ($line=<$IN>) { chomp $line; my @temp = split(/\s+/,$line); $op_vals[$i] = $temp[1]; $op_density[$i] = $temp[2]; $i++; } close ($IN); for ($i=0;$i>", $out_sample_file_name) or die "SampeFromEmpiricDistribution:cannot open OUT: '$out_sample_file_name' $!"; my $j = 0; my $DistRandNum = 0; my $RandNum = 0; while ($j<$sample_size){ $RandNum = rand(); $i=0; while ($RandNum>$op_CDF[$i]){ $i++; } $RandNum = rand(); $DistRandNum = $op_vals[$i] + $RandNum*($op_vals[$i+1]-$op_vals[$i]); push (@sample,$DistRandNum); print $OUT "$DistRandNum\n"; $j++; } close ($OUT); return (\@sample); } #---------------------------------- sub reformat_trees_branch_length { my $inTree=shift; my $outTree=shift; #print "reformat_trees_branch_length:Command line arguments: \t$inTree\n\t$outTree\n"; # Tree objets my $inputTree = new Bio::TreeIO(-file => "$inTree", -format => "newick"); my $outputTree = new Bio::TreeIO(-file => ">$outTree", -format => "newick"); my $tree = $inputTree->next_tree; foreach my $node ($tree->get_nodes() ) { my $len = $node->branch_length(); if(defined $len){ $node->branch_length(sprintf("%f",$len)); # } } $outputTree->write_tree($tree); #print "--- END --- \n"; } #---------------------------------- 1;