mirror of
http://43.156.76.180:8026/YuuMJ/EukPhylo.git
synced 2025-12-28 01:50:26 +08:00
3023 lines
104 KiB
Perl
3023 lines
104 KiB
Perl
#!/usr/bin/perl
|
|
|
|
package Guidance; #don't forget: a package must end with a return value (1; in the end)!!!!!
|
|
|
|
#use strict;
|
|
use FileHandle;
|
|
use Bio::SeqIO;
|
|
use Bio::TreeIO;
|
|
use Bio::Tree::TreeFunctionsI;
|
|
use Bio::AlignIO;
|
|
use Bio::Align::AlignI;
|
|
use Bio::Tools::CodonTable;
|
|
use File::Copy;
|
|
use FindBin qw($Bin);
|
|
|
|
use lib "$Bin/../bioSequence_scripts_and_constants/";
|
|
#use lib "/bioseq/bioSequence_scripts_and_constants/";
|
|
|
|
|
|
|
|
use MSA_parser;
|
|
use GENERAL_CONSTANTS;
|
|
|
|
use constant NEWIC2MAFFT => "$Bin/exec/newick2mafft.rb";
|
|
use constant MSA_SET_SCORE => "$Bin/../../programs/msa_set_score/msa_set_score";
|
|
use constant HOT_PROGRAM => "$Bin/exec/HoT/COS.pl";
|
|
use constant MAFFT_OP_DIST=> "$Bin/balibase.mafft_7123_mafft.op.Dist20bins.txt";
|
|
use constant MAFFT_OP_DIST_0_25=> "$Bin/balibase.mafft_7123_mafft.op2.Dist25bins.txt";
|
|
use constant MAFFT_EP_DIST_0_25=> "$Bin/balibase.mafft_7123_mafft.ep2.Dist20bins.txt";
|
|
use constant HOT_GUIDANCE2_PROGRAM => "$Bin/exec/HoT_COS_GUIDANCE2.pl";
|
|
use constant MIDPOINT_ROOTING_R => "$Bin/exec/MidPoint_Rooting.R";
|
|
|
|
my $MAFFT_OP_DIST_0_25="$Bin/balibase.mafft_7123_mafft.op2.Dist25bins.txt";
|
|
my $MAFFT_EP_DIST_0_25="$Bin/balibase.mafft_7123_mafft.ep2.Dist20bins.txt";
|
|
|
|
my $newick2mafft="$Bin/exec/newick2mafft.rb";
|
|
#my $MSA_Score_CSS="http://guidance.tau.ac.il/MSA_Colored.NEW.css";
|
|
my $MSA_Score_CSS="http://guidance.tau.ac.il/MSA_Colored.NEW.EM.css";
|
|
my $MidPoint_Rooting_R="$Bin/exec/MidPoint_Rooting.R";
|
|
#my $phylonet_prog = "$Bin/exec/phylonet_v1_7/phylonet_v1_7.jar";
|
|
my $isEqualTopologyProg = "$Bin/../../programs/isEqualTree/isEqualTree";
|
|
|
|
sub CreateHTML_Graph
|
|
# Create an HTML BARs graph
|
|
# GET: 1. CSV FILE (The X var is the first Col and the Y is the second, X must be sorted)
|
|
# 2. HTML OUTPUT
|
|
# 3. X Lable (OPTIONAL)
|
|
############################################################################################
|
|
{
|
|
my $CSV_File=shift; # The Data Must be in the correct X order
|
|
my $Out=shift;
|
|
my $X_LABLE=shift;
|
|
open (OUT,">>$Out") || return ("Guidance::CreateHTML_Graph: Can't open Output '$Out' $!");
|
|
open (DATA,$CSV_File) || return ("Guidance::CreateHTML_Graph: Can't open data file '$CSV_File' $!");
|
|
# GRAPH PROPERTIES
|
|
my $graphHeight = 200; # target height of graph
|
|
my $BarWidth = 15; # width of bars
|
|
my $maxResult = 1;
|
|
my $scale = 1;
|
|
my $last_x = 0;
|
|
# Set the scale
|
|
$scale = $graphHeight / $maxResult;
|
|
my $line=<DATA>; # Header
|
|
|
|
# SPACER BEFORE THE GRAPH
|
|
print OUT "<tr><td class=\"Score5\"> </td><td class=\"Seq_Name\"></td></tr><tr><td class=\"Seq_Name\"></td></tr><tr><td class=\"Seq_Name\"></td></tr><tr><td class=\"Seq_Name\"></td></tr>\n";
|
|
print OUT "<tr><td class=\"Seq_Name\"></td></tr><tr><td class=\"Seq_Name\"></td></tr><tr><td class=\"Seq_Name\"></td></tr><tr><td class=\"Seq_Name\"></td></tr>\n";
|
|
print OUT "<tr><td class=\"Seq_Name\"></td></tr><tr><td class=\"Seq_Name\"></td></tr><tr><td class=\"Seq_Name\"></td></tr><tr><td class=\"Seq_Name\"></td></tr>\n";
|
|
|
|
print OUT "<tr>\n";
|
|
print OUT "<td class=\"Score5\"> </td><td class=\"Seq_Name\" style = 'text-align: right'>$X_LABLE<br>SCORE</td>\n";
|
|
while ($line=<DATA>)
|
|
{
|
|
chomp ($line);
|
|
my @data=split(",",$line);
|
|
# print "$data[0]\t$last_x\n";
|
|
if (($data[1] ne "NaN") and (($data[0]-$last_x)==1))
|
|
{
|
|
print OUT "<td valign = bottom style = 'border-bottom: 1px solid black";
|
|
if ($data[0]==1)#The First Point, plot also the Y bar
|
|
{
|
|
print OUT ";border-left: 1px solid black";
|
|
}
|
|
print OUT ";'><img title=\"$data[0]:$data[1]\" src=\"http://guidance.tau.ac.il/blue.gif\" width=\"$BarWidth\" height=\"".$data[1]*$scale."\" border=\"1\"></td>\n";
|
|
$last_x=$data[0];
|
|
}
|
|
elsif ($data[1] ne "NaN")
|
|
{
|
|
while ($data[0]-$last_x!=1)
|
|
{
|
|
print OUT "<td valign = bottom style = 'border-bottom: 1px solid black;'><img src=\"http://guidance.tau.ac.il/blue.gif\" width=\"$BarWidth\" height=\"0\" border=\"0\"></td>\n";
|
|
$last_x++;
|
|
}
|
|
print OUT "<td valign = bottom style = 'border-bottom: 1px solid black;'><img title=\"$data[0]:$data[1]\" src=\"http://guidance.tau.ac.il/blue.gif\" width=\"$BarWidth\" height=\"".$data[1]*$scale."\" border=\"1\"></td>\n";
|
|
$last_x=$data[0];
|
|
}
|
|
elsif ($data[1] eq "NaN")
|
|
{
|
|
print OUT "<td valign = bottom style = 'border-bottom: 1px solid black;'><img title=\"$data[0]:$data[1]\" src=\"http://guidance.tau.ac.il/blue.gif\" width=\"$BarWidth\" height=\"".$data[1]*$scale."\" border=\"1\"></td>\n";
|
|
$last_x=$data[0];
|
|
}
|
|
}
|
|
print OUT "</tr>\n";
|
|
print OUT "</table>\n";
|
|
print OUT "<b><P align=\"center\">Column</p></b>\n";
|
|
close (OUT);
|
|
close (DATA);
|
|
}
|
|
|
|
sub printColoredAlignment {
|
|
#############################################################################################################################################
|
|
#@ARGV == 3 or die "USAGE: $0 IN_MSA_FILE OUT_HTML_FILE SCORES_FILE
|
|
#SCORES_FILE - Each line should contain three values: column number, seq number, and a score between 0 and 1 (separated by white spaces)\n";
|
|
#############################################################################################################################################
|
|
my $inMsaFile=shift;
|
|
my $outHtmlFile=shift;
|
|
my $scoresFile=shift;
|
|
my $codesFile=shift; # OPTIONAL
|
|
# Read scores
|
|
open SCORES, $scoresFile or return ("Can't open $scoresFile: $!");
|
|
my %scores;
|
|
my %Code_Names;
|
|
foreach (<SCORES>) {
|
|
next if (/^#/);
|
|
s/^\s+//;
|
|
my ($col, $seq, $score) = split;
|
|
$scores{$seq}[$col] = $score;
|
|
}
|
|
# Read Codes
|
|
if ($codesFile ne "")
|
|
{
|
|
open (CODES,$codesFile) or return ("Guidance::printColoredAlignment Can't open the Codes file: '$codesFile' $!");
|
|
while (my $line=<CODES>)
|
|
{
|
|
chomp $line;
|
|
my ($Seq_name,$Code)=split("\t",$line);
|
|
$Code_Names{$Code}=$Seq_name;
|
|
}
|
|
close (CODES);
|
|
}
|
|
# Read MSA
|
|
my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $inMsaFile) or die "Can't open $inMsaFile: $!";
|
|
my $aln = $in->next_aln;
|
|
$aln->verbose(1);
|
|
# Otherwise, bioperl adds sequence start/stop values
|
|
$aln->set_displayname_flat();
|
|
# print "DEPTH:",$aln->num_sequences;<STDIN>;
|
|
@ans=MSA_parser::check_msa_licit_and_size($inMsaFile,"fasta","no");
|
|
if ($ans[0] eq "OK"){$MSA_Depth=$ans[1];}
|
|
else {return "printColoredAlignment: ".join (" ",@ans);}
|
|
# Print HTML start
|
|
# Code from Conseq colored MSA: ~/pupkoSVN/trunk/www/conseq/runCalc_Conseq.pl line 985
|
|
my %msaColors = ();
|
|
my %msaPrintColors = ();
|
|
my $lineCounter;
|
|
my @line;
|
|
my $key;
|
|
|
|
my $fontSize=2;
|
|
my $sequenceLengthForDisplay=400000;
|
|
my @msaRightOrder=0;
|
|
my $msaRightOrderCounter=0;
|
|
my $tdWidth = 5;
|
|
|
|
my @colorstep = (); #color steps
|
|
$colorstep[0] = "#10C8D1"; #Not confident
|
|
$colorstep[1] = "#8CFFFF";
|
|
$colorstep[2] = "#D7FFFF";
|
|
$colorstep[3] = "#EAFFFF";
|
|
$colorstep[4] = "#FFFFFF"; #average
|
|
$colorstep[5] = "#FCEDF4";
|
|
$colorstep[6] = "#FAC9DE";
|
|
$colorstep[7] = "#F07DAB";
|
|
$colorstep[8] = "#A02560"; #Most confident
|
|
$colorstep[9] = "#A02560"; #Most confident (the score is exactly 1)
|
|
open MSACOLOREDHTML, ">$outHtmlFile" or die "Can't open $outHtmlFile: $!";
|
|
print MSACOLOREDHTML "<html>\n<head>\n</head>\n<body>\n\n";
|
|
print MSACOLOREDHTML "<H1 align=center><u>MSA color-coded by GUIDANCE scores</u></H1>\n\n";
|
|
print MSACOLOREDHTML "<table border=0 CELLSPACING=1 CELLPADDING=0 >\n";
|
|
|
|
|
|
# Print colored HTML
|
|
|
|
# counts how many times we print the whole section (relevants to sequences longer than the sequenceLengthForDisplay)
|
|
for(my $blockStart=1; $blockStart<$aln->length; $blockStart+=$sequenceLengthForDisplay) {
|
|
my $blockEnd = $blockStart+$sequenceLengthForDisplay;
|
|
$blockEnd = $aln->length if ($blockEnd > $aln->length);
|
|
|
|
# Iterate over sequences and print up to sequenceLengthForDisplay residues
|
|
# foreach my $seq ($aln->each_seq) { #HAIM COMMENT
|
|
# my $depth = $aln->num_sequences; #HAIM ADD
|
|
for(my $i=1;$i<=$MSA_Depth;$i++){ #HAIM ADD
|
|
my $seq = $aln->get_seq_by_pos($i); #HAIM ADD
|
|
|
|
# next if ( $seq->id < 42
|
|
# || $seq->id > 73);
|
|
|
|
# Print seq id
|
|
print MSACOLOREDHTML "<tr>\n";
|
|
print MSACOLOREDHTML "<td><b><font face='Courier New' color='black' size=$fontSize>", $seq->id, "</font></b></td>\n" if ($codesFile eq "");
|
|
print MSACOLOREDHTML "<td><b><font face='Courier New' color='black' size=$fontSize>", substr($Code_Names{$seq->id},0,25), "</font></b></td>\n" if ($codesFile ne "");
|
|
|
|
# Print seq
|
|
my @seq = split //, $seq->subseq($blockStart, $blockEnd);
|
|
|
|
for(my $pos=0; $pos<@seq; $pos++) {
|
|
my $res = $seq[$pos];
|
|
if ($res eq '-') {
|
|
print MSACOLOREDHTML "<td width=$tdWidth><b><font face='Courier New' color='black' size=$fontSize>$res</font></b></td>\n";
|
|
} else {
|
|
#print $seq->id,"\tscores{$seq->id}[$pos+1]:$scores{$seq->id}[$pos+1]\n";
|
|
#my $color = $colorstep[ int(9 * $scores{$seq->id}[$pos+1]) ]; #HAIM COMMENT
|
|
my $color = $colorstep[ int(9 * $scores{$i}[$pos+1]) ];
|
|
if($color eq "#A02560"){
|
|
print MSACOLOREDHTML "<td width=$tdWidth><b><font face='Courier New' color='white' size=$fontSize><span style='background: $color;'>$res</span></font></b></td>\n";
|
|
}
|
|
else {
|
|
print MSACOLOREDHTML "<td width=$tdWidth><b><font face='Courier New' color='black' size=$fontSize><span style='background: $color;'>$res</span></font></b></td>\n";
|
|
}
|
|
}
|
|
}
|
|
print MSACOLOREDHTML "</tr>\n\n";
|
|
}
|
|
|
|
print MSACOLOREDHTML "<tr> </tr>\n\n";
|
|
}
|
|
print MSACOLOREDHTML "</table>";
|
|
|
|
# print the color scale
|
|
print MSACOLOREDHTML "\n<br><b><u>Legend:</u><br><br>\nThe alignment confidence scale:</b><br>\n<table border=0 cols=1 width=310>\n<tr><td align=center>\n<font face='Courier New' color='black' size=+1><center>\n";
|
|
for (my $i=8 ; $i>=0 ; $i--){
|
|
if ($i == 0){
|
|
print MSACOLOREDHTML "<font face='Courier New' color='white' size=$fontSize><span style='background: $colorstep[$i];'> ", $i+1, " </span></font>";
|
|
}
|
|
else {
|
|
print MSACOLOREDHTML "<font face='Courier New' color='black' size=$fontSize><span style='background: $colorstep[$i];'> ", $i+1, " </span></font>";
|
|
}
|
|
}
|
|
print MSACOLOREDHTML "</font></center>\n<center><table border=0 cols=3 width=310>\n<tr>\n<td align=left><b>Confident</b></td>\n<td align=center><b><---></b></td>\n<td align=right><b>Uncertain</b></td>\n</tr>\n</table></center>\n</td>\n</tr>\n</table>\n";
|
|
print MSACOLOREDHTML "</body>\n<html>\n";
|
|
close MSACOLOREDHTML;
|
|
close (SCORES);
|
|
}
|
|
|
|
sub AssignColorsToAlignment{
|
|
my $inMsaFile=shift;
|
|
my $scoresFile=shift;
|
|
my $codesFile=shift; # OPTIONAL
|
|
|
|
open SCORES, $scoresFile or return ("Can't open $scoresFile: $!");
|
|
my %scores;
|
|
my %Code_Names;
|
|
foreach (<SCORES>) {
|
|
next if (/^\#/);
|
|
s/^\s+//;
|
|
my ($col, $seq, $score) = split;
|
|
$scores{$seq}[$col] = $score;
|
|
}
|
|
# Read Codes
|
|
if ($codesFile ne "")
|
|
{
|
|
open (CODES,$codesFile) or return ("Guidance::printColoredAlignment Can't open the Codes file: '$codesFile' $!");
|
|
while (my $line=<CODES>)
|
|
{
|
|
chomp $line;
|
|
my ($Seq_name,$Code)=split("\t",$line);
|
|
$Code_Names{$Code}=$Seq_name;
|
|
}
|
|
close (CODES);
|
|
}
|
|
# Read MSA
|
|
my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $inMsaFile) or die "Can't open $inMsaFile: $!";
|
|
my $aln = $in->next_aln;
|
|
$aln->verbose(1);
|
|
# Otherwise, bioperl adds sequence start/stop values
|
|
$aln->set_displayname_flat();
|
|
# print "DEPTH:",$aln->num_sequences;<STDIN>;
|
|
@ans=MSA_parser::check_msa_licit_and_size($inMsaFile,"fasta","no"); #HAIM ADD
|
|
if ($ans[0] eq "OK"){$MSA_Depth=$ans[1];} #HAIM ADD
|
|
else {return "printColoredAlignment: ".join (" ",@ans);} #HAIM ADD
|
|
# Print HTML start
|
|
# Code from Conseq colored MSA: ~/pupkoSVN/trunk/www/conseq/runCalc_Conseq.pl line 985
|
|
my %msaColors = ();
|
|
my %msaPrintColors = ();
|
|
my $lineCounter;
|
|
my @line;
|
|
my $key;
|
|
|
|
my $fontSize=2;
|
|
my $sequenceLengthForDisplay=400000;
|
|
my @msaRightOrder=0;
|
|
my $msaRightOrderCounter=0;
|
|
my $tdWidth = 5;
|
|
|
|
my @colorstep = (); #color steps
|
|
$colorstep[0] = "Score1"; #Not confident
|
|
$colorstep[1] = "Score2";
|
|
$colorstep[2] = "Score3";
|
|
$colorstep[3] = "Score4";
|
|
$colorstep[4] = "Score5"; #average
|
|
$colorstep[5] = "Score6";
|
|
$colorstep[6] = "Score7";
|
|
$colorstep[7] = "Score8";
|
|
$colorstep[8] = "Score9"; #Most confident
|
|
$colorstep[9] = "Score9"; #Most confident (the score is exactly 1)
|
|
# get Align max_seq_length
|
|
my $seq = $aln->get_seq_by_pos(1);
|
|
my $Align_width = $seq->length();
|
|
# STOPPED HERE
|
|
|
|
|
|
}
|
|
|
|
sub printColoredAlignment_With_CSS {
|
|
#############################################################################################################################################
|
|
#@ARGV == 3 or die "USAGE: $0 IN_MSA_FILE OUT_HTML_FILE SCORES_FILE
|
|
#SCORES_FILE - Each line should contain three values: column number, seq number, and a score between 0 and 1 (separated by white spaces)\n";
|
|
#############################################################################################################################################
|
|
my $inMsaFile=shift;
|
|
my $outHtmlFile=shift;
|
|
my $scoresFile=shift;
|
|
my $codesFile=shift; # OPTIONAL
|
|
# my $COL_SCORES_FIGURE=shift; # OPTIONAL
|
|
# my $MSA_Length=shift;
|
|
# Parameters for the PLOT beneath alignment
|
|
my $ColScoresCSV=shift;
|
|
my $XLable=shift;
|
|
my $Seq_Scores=shift;
|
|
# Read scores
|
|
open (SEQ_SCORES,$Seq_Scores) or return ("Can't open $Seq_Scores: $!");
|
|
my %seq_scores=();
|
|
foreach (<SEQ_SCORES>)
|
|
{
|
|
next if (/^#/);
|
|
s/^\s+//;
|
|
my ($seq, $score) = split;
|
|
$seq_scores{$seq} = $score;
|
|
}
|
|
close (SEQ_SCORES);
|
|
open SCORES, $scoresFile or return ("Can't open $scoresFile: $!");
|
|
my %scores;
|
|
my %Code_Names;
|
|
foreach (<SCORES>) {
|
|
next if (/^#/);
|
|
s/^\s+//;
|
|
my ($col, $seq, $score) = split;
|
|
$scores{$seq}[$col] = $score;
|
|
}
|
|
# Read Codes
|
|
if ($codesFile ne "")
|
|
{
|
|
open (CODES,$codesFile) or return ("Guidance::printColoredAlignment Can't open the Codes file: '$codesFile' $!");
|
|
while (my $line=<CODES>)
|
|
{
|
|
chomp $line;
|
|
my ($Seq_name,$Code)=split("\t",$line);
|
|
$Code_Names{$Code}=$Seq_name;
|
|
}
|
|
close (CODES);
|
|
}
|
|
# Read MSA
|
|
my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $inMsaFile) or die "Can't open $inMsaFile: $!";
|
|
my $aln = $in->next_aln;
|
|
$aln->verbose(1); #HAIM COMMNET
|
|
# Otherwise, bioperl adds sequence start/stop values
|
|
$aln->set_displayname_flat(); #HAIM COMMENT
|
|
# print "DEPTH:",$aln->num_sequences;<STDIN>;
|
|
@ans=MSA_parser::check_msa_licit_and_size($inMsaFile,"fasta","no"); #HAIM ADD
|
|
if ($ans[0] eq "OK"){$MSA_Depth=$ans[1];} #HAIM ADD
|
|
else {return "printColoredAlignment: ".join (" ",@ans);} #HAIM ADD
|
|
# Print HTML start
|
|
# Code from Conseq colored MSA: ~/pupkoSVN/trunk/www/conseq/runCalc_Conseq.pl line 985
|
|
my %msaColors = ();
|
|
my %msaPrintColors = ();
|
|
my $lineCounter;
|
|
my @line;
|
|
my $key;
|
|
|
|
my $fontSize=2;
|
|
my $sequenceLengthForDisplay=400000;
|
|
my @msaRightOrder=0;
|
|
my $msaRightOrderCounter=0;
|
|
my $tdWidth = 5;
|
|
|
|
my @colorstep = (); #color steps
|
|
$colorstep[0] = "Score1"; #Not confident
|
|
$colorstep[1] = "Score2";
|
|
$colorstep[2] = "Score3";
|
|
$colorstep[3] = "Score4";
|
|
$colorstep[4] = "Score5"; #average
|
|
$colorstep[5] = "Score6";
|
|
$colorstep[6] = "Score7";
|
|
$colorstep[7] = "Score8";
|
|
$colorstep[8] = "Score9"; #Most confident
|
|
$colorstep[9] = "Score9"; #Most confident (the score is exactly 1)
|
|
|
|
my @colorstep_code = (); #color steps
|
|
$colorstep_code[0] = "#10C8D1"; #Not confident
|
|
$colorstep_code[1] = "#8CFFFF";
|
|
$colorstep_code[2] = "#D7FFFF";
|
|
$colorstep_code[3] = "#EAFFFF";
|
|
$colorstep_code[4] = "#FFFFFF"; #average
|
|
$colorstep_code[5] = "#FCEDF4";
|
|
$colorstep_code[6] = "#FAC9DE";
|
|
$colorstep_code[7] = "#F07DAB";
|
|
$colorstep_code[8] = "#A02560"; #Most confident
|
|
# my $plot_width=0;
|
|
# if ($MSA_Length>1000)
|
|
# {
|
|
# $plot_width=11.15*$MSA_Length;
|
|
# }
|
|
# else
|
|
# {
|
|
# $plot_width=11.5*$MSA_Length;
|
|
# }
|
|
open MSACOLOREDHTML, ">$outHtmlFile" or die "Can't open $outHtmlFile: $!";
|
|
print MSACOLOREDHTML "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\"\n";
|
|
print MSACOLOREDHTML "\"http://www.w3.org/TR/html4/strict.dtd\">\n";
|
|
print MSACOLOREDHTML "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"\n";
|
|
print MSACOLOREDHTML "\"http://www.w3.org/TR/html4/loose.dtd\">\n";
|
|
print MSACOLOREDHTML "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Frameset//EN\"\n";
|
|
print MSACOLOREDHTML "\"http://www.w3.org/TR/html4/frameset.dtd\">\n";
|
|
print MSACOLOREDHTML "<head>\n";
|
|
print MSACOLOREDHTML "<meta http-equiv=\"X-UA-Compatible\" content=\"IE=EmulateIE7\"/>\n";
|
|
# print MSACOLOREDHTML "<html>\n";
|
|
# print MSACOLOREDHTML "<head>\n";
|
|
print MSACOLOREDHTML "<link rel=\"stylesheet\" type=\"text/css\" href=\"$MSA_Score_CSS\"/>\n";
|
|
# print MSACOLOREDHTML "<style type=\"text/css\">\n";
|
|
# print MSACOLOREDHTML "img.plot{\n";
|
|
# print MSACOLOREDHTML "width: $plot_width"."px;\n";
|
|
## print MSACOLOREDHTML "border-right: 2px dotted #4169e1;\n";
|
|
## print MSACOLOREDHTML "border-bottom: 2px dotted #4169e1;\n";
|
|
# print MSACOLOREDHTML "}\n";
|
|
# print MSACOLOREDHTML "</style>\n";
|
|
|
|
print MSACOLOREDHTML "</head>\n";
|
|
print MSACOLOREDHTML "<H1 align=center><u>MSA color-coded by GUIDANCE scores</u></H1>\n\n";
|
|
print MSACOLOREDHTML "<table>\n";
|
|
# Print colored HTML
|
|
|
|
# get Align max_seq_length
|
|
my $seq = $aln->get_seq_by_pos(1);
|
|
my $Align_width = $seq->length();
|
|
|
|
# Print upper Scale
|
|
print MSACOLOREDHTML "<tr>\n<td class=\"Score5\"> </td><td class=\"Seq_Name\">                        </td><td>1</td>\n";
|
|
my $i=2;
|
|
while ($i<$Align_width)
|
|
# for (my $i=2;$i<$Align_width;$i++)
|
|
{
|
|
if (($i%10)==0)
|
|
{
|
|
my @digits=split("",$i);
|
|
print MSACOLOREDHTML "\n";
|
|
foreach my $digit (@digits)
|
|
{
|
|
print MSACOLOREDHTML "<td>$digit</td>";
|
|
$i++;
|
|
}
|
|
|
|
}
|
|
else
|
|
{
|
|
print MSACOLOREDHTML "<td></td>";
|
|
$i++;
|
|
}
|
|
}
|
|
print MSACOLOREDHTML "\n</tr>\n";
|
|
|
|
# counts how many times we print the whole section (relevants to sequences longer than the sequenceLengthForDisplay)
|
|
for(my $blockStart=1; $blockStart<$aln->length; $blockStart+=$sequenceLengthForDisplay) {
|
|
my $blockEnd = $blockStart+$sequenceLengthForDisplay;
|
|
$blockEnd = $aln->length if ($blockEnd > $aln->length);
|
|
|
|
# Iterate over sequences and print up to sequenceLengthForDisplay residues
|
|
# foreach my $seq ($aln->each_seq) { #HAIM COMMENT
|
|
# my $depth = $aln->num_sequences; #HAIM ADD
|
|
for(my $i=1;$i<=$MSA_Depth;$i++){ #HAIM ADD
|
|
my $seq = $aln->get_seq_by_pos($i); #HAIM ADD
|
|
|
|
# next if ( $seq->id < 42
|
|
# || $seq->id > 73);
|
|
|
|
# Print seq id
|
|
print MSACOLOREDHTML "<tr>\n";
|
|
# print SEQ COLOR
|
|
my $Color_Class="";
|
|
if ($seq_scores{$i} eq "nan") {$Color_Class="ScoreNaN";}
|
|
else {$Color_Class = $colorstep[ int(9 * $seq_scores{$i})];}
|
|
print MSACOLOREDHTML "<td class=\"$Color_Class\"> </td>\n";
|
|
print MSACOLOREDHTML "<td class=\"Seq_Name\">", $seq->id, "</td>\n" if ($codesFile eq "");
|
|
print MSACOLOREDHTML "<td class=\"Seq_Name\">", substr($Code_Names{$seq->id},0,25), "</td>\n" if ($codesFile ne "");
|
|
|
|
# Print seq
|
|
my @seq = split //, $seq->subseq($blockStart, $blockEnd);
|
|
|
|
for(my $pos=0; $pos<@seq; $pos++) {
|
|
# for(my $pos=757; $pos<875; $pos++) {
|
|
my $res = $seq[$pos];
|
|
if ($res eq '-') {
|
|
print MSACOLOREDHTML "<td>$res</td>\n";
|
|
} else {
|
|
#print $seq->id,"\tscores{$seq->id}[$pos+1]:$scores{$seq->id}[$pos+1]\n";
|
|
#my $color = $colorstep[ int(9 * $scores{$seq->id}[$pos+1]) ]; #HAIM COMMENT
|
|
my $Color_Class="";
|
|
if ($scores{$i}[$pos+1] eq "nan") {$Color_Class="ScoreNaN";}
|
|
else {$Color_Class = $colorstep[ int(9 * $scores{$i}[$pos+1]) ]};
|
|
print MSACOLOREDHTML "<td class=\"$Color_Class\">$res</td>\n";
|
|
}
|
|
}
|
|
print MSACOLOREDHTML "</tr>\n\n";
|
|
}
|
|
|
|
# print MSACOLOREDHTML "<tr> </tr>\n\n";
|
|
print MSACOLOREDHTML "<tr></tr>\n\n";
|
|
}
|
|
|
|
|
|
|
|
|
|
# Print lower Scale
|
|
# print MSACOLOREDHTML "<tr>\n<td class=\"Seq_Name\"> </td><td>1</td>\n";
|
|
print MSACOLOREDHTML "<tr>\n<td class=\"Score5\"> </td><td class=\"Seq_Name\"></td><td>1</td>\n";
|
|
$i=2;
|
|
while ($i<$Align_width)
|
|
# for (my $i=2;$i<$Align_width;$i++)
|
|
{
|
|
if (($i%10)==0)
|
|
{
|
|
my @digits=split("",$i);
|
|
foreach my $digit (@digits)
|
|
{
|
|
print MSACOLOREDHTML "\n<td>$digit</td>";
|
|
$i++;
|
|
}
|
|
|
|
}
|
|
else
|
|
{
|
|
print MSACOLOREDHTML "<td></td>";
|
|
$i++;
|
|
}
|
|
}
|
|
print MSACOLOREDHTML "\n</tr>\n";
|
|
|
|
# print MSACOLOREDHTML "<tr> </tr>\n</table>\n";
|
|
|
|
# Add the figure
|
|
close (MSACOLOREDHTML);
|
|
CreateHTML_Graph($ColScoresCSV,$outHtmlFile,$XLable);
|
|
open MSACOLOREDHTML, ">>$outHtmlFile";
|
|
# if ($COL_SCORES_FIGURE ne "")
|
|
# {
|
|
# print MSACOLOREDHTML "<table>\n";
|
|
# print MSACOLOREDHTML "<tr>\n";
|
|
# print MSACOLOREDHTML "<td class=\"Seq_Name\">                 </td>\n";
|
|
# print MSACOLOREDHTML "<td><img class=\"plot\" src=\"$COL_SCORES_FIGURE\"></td>\n";
|
|
# print MSACOLOREDHTML "</table>\n";
|
|
# }
|
|
|
|
# print the color scale
|
|
print MSACOLOREDHTML "\n<br><b><u>Legend:</u><br><br>\nThe alignment confidence scale:</b><br>\n";#<table border=0 cols=1 width=310>\n<tr><td align=center>\n<font face='Courier New' color='black' size=+1><center>\n";
|
|
print MSACOLOREDHTML "<table style = 'table-layout: auto;margin-left: 0em;margin-right: 0em;padding:1px 1px 1px 1px; margin:1px 1px 1px 1px; border-collapse: collapse;' border=0 cols=1 width=310>\n<tr><td align=center>\n<font face='Courier New' color='black' size=+1><center>\n";
|
|
for (my $i=8 ; $i>=0 ; $i--){
|
|
|
|
if ($i == 0){
|
|
|
|
print MSACOLOREDHTML "<font face='Courier New' color='white' size=$fontSize><span style='background: $colorstep_code[$i];'> ", $i+1, " </span></font>";
|
|
}
|
|
else {
|
|
|
|
print MSACOLOREDHTML "<font face='Courier New' color='black' size=$fontSize><span style='background: $colorstep_code[$i];'> ", $i+1, " </span></font>";
|
|
}
|
|
}
|
|
# print MSACOLOREDHTML "</font></center>\n<center><table border=0 cols=3 width=310>\n<tr>\n<td align=left><b>Confident</b></td>\n<td align=center><b><---></b></td>\n<td align=right><b>Uncertain</b></td>\n</tr>\n</table></center>\n</td>\n</tr>\n</table>\n";
|
|
print MSACOLOREDHTML "</font></center>\n<center><table style = 'table-layout: auto;margin-left:0em;margin-right: 0em;padding:1px 1px 1px 1px; margin:1px 1px 1px 1px; border-collapse: collapse;' border=0 cols=3 width=310>\n<tr>\n<td align=left><b>Confident</b></td>\n<td align=center><b><---></b></td>\n<td align=right><b>Uncertain</b></td>\n</tr>\n</table></center>\n</td>\n</tr>\n</table>\n";
|
|
|
|
# print MSACOLOREDHTML "<left><table border=0 cols=2 width=100>\n<tr>\n<td align=center class=\"ScoreNaN\"> </td><td align=left>Insufficient Data</b></td>";
|
|
print MSACOLOREDHTML "<left><table style = 'table-layout: auto;margin-left: 0em;margin-right:0em;padding:1px 1px 1px 1px; margin:1px 1px 1px 1px; border-collapse:collapse;' border=0 cols=2 width=100>\n<tr>\n<td align=center class=\"ScoreNaN\"> </td><td align=left>Insufficient Data</b></td>";
|
|
|
|
|
|
print MSACOLOREDHTML "</body>\n</table>\n";
|
|
close MSACOLOREDHTML;
|
|
}
|
|
|
|
sub root_BP_trees
|
|
####################################################################################################################
|
|
# Root all trees on BP dir
|
|
####################################################################################################################
|
|
{
|
|
my $bsDir = shift;
|
|
my $dataset = shift;
|
|
my $orig_prog = shift;
|
|
my $bp_repeats = shift;
|
|
my $suffix=shift;
|
|
my $rooting_type=shift; # BioPerl | MidPoint
|
|
if (!defined $suffix) {
|
|
$suffix = "";
|
|
}
|
|
if (!defined $rooting_type){
|
|
$rooting_type="BioPerl";
|
|
}
|
|
$bsDir .= "/";
|
|
|
|
for (my $countTrees=0;$countTrees<$bp_repeats;++$countTrees) {
|
|
#print "**** $dataset tree_$countTrees\n";
|
|
my $treeFile = $bsDir."tree_".$countTrees."/".$dataset.".".$orig_prog.".semphy.tree_".$countTrees.$suffix;
|
|
if (-e $treeFile) {
|
|
my $rootedTreeFile = $treeFile.".rooted";
|
|
# print "treeFile: $treeFile\nrootedTreeFile: $rootedTreeFile\n";
|
|
if (uc($rooting_type) eq "BIOPERL")
|
|
{
|
|
rootTree($treeFile,$rootedTreeFile);
|
|
}
|
|
elsif (uc($rooting_type) eq "MIDPOINT")
|
|
{
|
|
system ("R --slave --no-save --no-restore --no-environ --silent --args $treeFile $rootedTreeFile < $MidPoint_Rooting_R");
|
|
}
|
|
open IN, "<$rootedTreeFile", or return ("can't open $rootedTreeFile");
|
|
my $newick = "";
|
|
foreach my $line (<IN>) {
|
|
$newick.=$line;
|
|
}
|
|
close IN;
|
|
|
|
# if ($newick =~ m/:0[^\.]/ && $newick !~ m/:0\./) {
|
|
if ($newick =~ m/:-/) {
|
|
my $rootedTreeFile_withMinusLengths = $rootedTreeFile.".withMinusLengths";
|
|
my $command = "mv $rootedTreeFile $rootedTreeFile_withMinusLengths";
|
|
system ($command);
|
|
$newick =~ s/:-/:/g;
|
|
$command = "echo '$newick' > $rootedTreeFile";
|
|
system ($command);
|
|
}
|
|
if ($newick =~ m/:\d+[^\.\d+]/) {
|
|
my $rootedTreeFile_badBranchLength = $rootedTreeFile.".badBranchLength";
|
|
my $command = "mv $rootedTreeFile $rootedTreeFile_badBranchLength";
|
|
system ($command);
|
|
$newick =~ s/:(\d+)([^\.\d+])/:$1\.0$2/g;
|
|
# print "$newick\n";
|
|
$command = "echo '$newick' > $rootedTreeFile";
|
|
system ($command);
|
|
}
|
|
}else {
|
|
print "File does not exist: $treeFile\n";
|
|
}
|
|
}
|
|
return "ok";
|
|
}
|
|
|
|
sub convertBPTrees2MafftFormat{
|
|
####################################################################################################################
|
|
#convert the tree to mafft format
|
|
####################################################################################################################
|
|
#die " $0 <BPdir> <dataset> <original alignment program> <num_BP_repeats> <extra suffix - default ''> required as input\n" if (scalar(@ARGV) < 4);
|
|
|
|
my $bsDir = shift;
|
|
my $dataset = shift;
|
|
my $orig_prog = shift;
|
|
my $bp_repeats = shift;
|
|
my $ruby_path=shift;
|
|
my $suffix=shift;
|
|
my $rooting_type=shift;
|
|
if (!defined $ruby_path)
|
|
{
|
|
$ruby_path="ruby";
|
|
}
|
|
if (!defined $suffix) {
|
|
$suffix = "";
|
|
}
|
|
if (!defined $rooting_type)
|
|
{
|
|
$rooting_type="BioPerl";
|
|
}
|
|
$bsDir .= "/";
|
|
|
|
for (my $countTrees=0;$countTrees<$bp_repeats;++$countTrees) {
|
|
#print "**** $dataset tree_$countTrees\n";
|
|
my $treeFile = $bsDir."tree_".$countTrees."/".$dataset.".".$orig_prog.".semphy.tree_".$countTrees.$suffix;
|
|
if (-e $treeFile) {
|
|
my $rootedTreeFile = $treeFile.".rooted";
|
|
my $mafftFormatTreeFile = $treeFile.".mafftFormat";
|
|
# print "treeFile: $treeFile\nrootedTreeFile: $rootedTreeFile\nmafftFormatTreeFile: $mafftFormatTreeFile\n";
|
|
if (uc($rooting_type) eq "BIOPERL")
|
|
{
|
|
rootTree($treeFile,$rootedTreeFile);
|
|
}
|
|
elsif (uc($rooting_type) eq "MIDPOINT")
|
|
{
|
|
system ("R --slave --no-save --no-restore --no-environ --silent --args $treeFile $rootedTreeFile < $MidPoint_Rooting_R");
|
|
}
|
|
open IN, "<$rootedTreeFile", or return ("can't open $rootedTreeFile");
|
|
my $newick = "";
|
|
foreach my $line (<IN>) {
|
|
$newick.=$line;
|
|
}
|
|
close IN;
|
|
|
|
# if ($newick =~ m/:0[^\.]/ && $newick !~ m/:0\./) {
|
|
if ($newick =~ m/:-/) {
|
|
my $rootedTreeFile_withMinusLengths = $rootedTreeFile.".withMinusLengths";
|
|
my $command = "mv $rootedTreeFile $rootedTreeFile_withMinusLengths";
|
|
system ($command);
|
|
$newick =~ s/:-/:/g;
|
|
$command = "echo '$newick' > $rootedTreeFile";
|
|
system ($command);
|
|
}
|
|
if ($newick =~ m/:\d+[^\.\d+]/) {
|
|
my $rootedTreeFile_badBranchLength = $rootedTreeFile.".badBranchLength";
|
|
my $command = "mv $rootedTreeFile $rootedTreeFile_badBranchLength";
|
|
system ($command);
|
|
$newick =~ s/:(\d+)([^\.\d+])/:$1\.0$2/g;
|
|
# print "$newick\n";
|
|
$command = "echo '$newick' > $rootedTreeFile";
|
|
system ($command);
|
|
}
|
|
|
|
system("$ruby_path $newick2mafft $rootedTreeFile > $mafftFormatTreeFile\n");
|
|
}else {
|
|
print "File does not exist: $treeFile\n";
|
|
}
|
|
}
|
|
return "ok";
|
|
}
|
|
|
|
|
|
sub runAlignBPtrees{
|
|
#die " $0 <alignment program (prank/mafft/clustal/muscle/pagan)> <dataset> <noBPdir> <original alignment program - default = same as current program> <num_BP_repeats - default 20> <extra suffix - default ''> required as input\n" if (scalar(@ARGV) < 3);
|
|
my $aln_program = "";
|
|
my $prog = shift;
|
|
|
|
my $dataset = shift;
|
|
my $noBPdir = shift;
|
|
if ($noBPdir !~/(\/)$/){$noBPdir.="/";}
|
|
my $aminoFile = shift;
|
|
|
|
my $Seq_Type = shift;
|
|
if ($Seq_Type eq "") {$Seq_Type="AminoAcids";}
|
|
my $orig_prog=shift;
|
|
if ($orig_prog eq "") {$orig_prog = $prog;}
|
|
my $bp_repeats = shift;
|
|
if ($bp_repeats eq "") {$bp_repeats = 20;}
|
|
my $align_param=shift; # alignment additional parameters
|
|
if (!defined $align_param) {$align_param = "";}
|
|
my $suffix = shift;
|
|
if (!defined $suffix) {$suffix = "";}
|
|
my $update_file=shift; #OPTIONAL FOR SERVER TO UPDATE THE PROGRESS
|
|
my $mafft_prog = shift;
|
|
my $prank_prog = shift;
|
|
my $clustalw_prog = shift;
|
|
my $muscle_prog = shift;
|
|
my $pagan_prog=shift;
|
|
my $proc_num = shift; #AMIT
|
|
|
|
my $PRANK_VERSION=0;
|
|
if (uc($prog) eq "PRANK") {
|
|
if ($Seq_Type eq "AminoAcids") {$aln_program = $prank_prog;}
|
|
elsif ($Seq_Type eq "Nucleotides") {$aln_program = $prank_prog;}
|
|
elsif ($Seq_Type eq "Codons") {$aln_program = $prank_prog." -translate ";}
|
|
# if PRANK find out it version
|
|
my $prank_help=`$aln_program`;
|
|
if ($prank_help=~/.*prunedata.*/){$PRANK_VERSION="121218";}
|
|
elsif ($prank_help=~/.*showanc.*/){$PRANK_VERSION="120626";}
|
|
#print "PRANK VERSION:>=$PRANK_VERSION\n";
|
|
} elsif (uc($prog) eq "MAFFT"){
|
|
if ($Seq_Type eq "AminoAcids") {$aln_program = $mafft_prog." --quiet --amino ";}
|
|
elsif ($Seq_Type eq "Nucleotides") {$aln_program = $mafft_prog." --quiet --nuc ";}
|
|
} elsif (uc($prog) eq "MAFFT_LINSI"){
|
|
if ($Seq_Type eq "AminoAcids") {$aln_program = GENERAL_CONSTANTS::MAFFT_LINSI_GUIDANCE." --amino ";}
|
|
elsif ($Seq_Type eq "Nucleotides") {$aln_program = GENERAL_CONSTANTS::MAFFT_LINSI_GUIDANCE." --nuc ";}
|
|
} elsif (uc($prog) eq "CLUSTALW"){
|
|
if ($Seq_Type eq "AminoAcids") {$aln_program = $clustalw_prog." -QUIET -TYPE=PROTEIN ";}
|
|
elsif ($Seq_Type eq "Nucleotides") {$aln_program = $clustalw_prog." -QUIET -TYPE=DNA ";}
|
|
} elsif (uc($prog) eq "MUSCLE"){
|
|
if ($Seq_Type eq "AminoAcids") {$aln_program = $muscle_prog." -quiet -seqtype protein ";}
|
|
elsif ($Seq_Type eq "Nucleotides") {$aln_program = $muscle_prog." -quiet -seqtype dna ";}
|
|
} elsif (uc($prog) eq "PAGAN"){
|
|
$aln_program=$pagan_prog." ";
|
|
}
|
|
|
|
else {
|
|
return "please specify either mafft / mafft_linsi / prank / clustalw / muscle / pagan as an alignment program\n";
|
|
}
|
|
my $dir = $noBPdir."BP/";
|
|
# my $aminoFile = $noBPdir.$dataset.".fas".$suffix;
|
|
|
|
# Running parallel processes using fork, each will run an equal share of the BP alignments
|
|
# my $proc_num = 8;
|
|
my $bp_per_proc = int($bp_repeats / $proc_num) + 1;
|
|
my @children; # array of pid of children
|
|
for (my $proc=0; $proc<$proc_num; $proc++) {
|
|
print "Running proc num $proc\n";
|
|
|
|
my $pid = fork();
|
|
if ($pid) {
|
|
|
|
# parent
|
|
push(@children, $pid);
|
|
|
|
} elsif ($pid == 0) {
|
|
|
|
# child
|
|
for (my $tree_num=0; $tree_num<$bp_per_proc; $tree_num++) {
|
|
my $countTrees = $proc * $bp_per_proc + $tree_num;
|
|
print "proc num $proc\ttree num $tree_num --> global tree index $countTrees\n";
|
|
last if ($countTrees == $bp_repeats);
|
|
my $treeFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".semphy.tree_".$countTrees.$suffix;
|
|
if (-e $treeFile) {
|
|
my $alnFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".aln";
|
|
my $cmdFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".cmd";
|
|
my $stdFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".std";
|
|
# if (-e $alnFile) {
|
|
# print "skipping tree $countTrees because $alnFile already exists.\n";
|
|
# next;
|
|
# }
|
|
my $cmd = "";
|
|
if (uc($prog) eq "PRANK") {
|
|
if ($PRANK_VERSION==0)
|
|
{
|
|
$cmd = "$aln_program $align_param -d=$aminoFile -o=$alnFile -t=$treeFile -once -quiet -noxml > $stdFile";# >& $stdFile"; # -noxml is not supported any more
|
|
}
|
|
else # version >=120626
|
|
{
|
|
$cmd = "$aln_program $align_param -d=$aminoFile -o=$alnFile -t=$treeFile -once -quiet > $stdFile"; # -noxml is not supported any more
|
|
}
|
|
} elsif ((uc($prog) eq "MAFFT") or (uc($prog) eq "MAFFT_LINSI")){
|
|
my $mafftFormatTreeFile = $treeFile.".mafftFormat";
|
|
unless (-e $mafftFormatTreeFile) {
|
|
return ("ERROR: file does not exist: $mafftFormatTreeFile\n");
|
|
}
|
|
if ($align_param=~/addfragments/)
|
|
{
|
|
# first build the core MSA
|
|
my $core_aln_param="";
|
|
my @tmp=split (/\Q\-\-\E/,$align_param);
|
|
my $tmp_size=@tmp;
|
|
if ($tmp_size>=1) # command line type
|
|
{
|
|
for (my $i=0;$i<$tmp_size;$i++)
|
|
{
|
|
if ($tmp[$i]=~/addfragments/){delete $tmp[$i];}
|
|
elsif ($tmp[$i]=~/multipair/){delete $tmp[$i];}
|
|
elsif ($tmp[$i]=~/6merpair/){delete $tmp[$i];}
|
|
}
|
|
$core_aln_param=join ("\\\-\\\-",@tmp);
|
|
}
|
|
else # server type
|
|
{
|
|
my @tmp=split (/\Q--\E/,$align_param);
|
|
my $tmp_size=@tmp;
|
|
for (my $i=0;$i<$tmp_size;$i++)
|
|
{
|
|
if ($tmp[$i]=~/addfragments/){delete $tmp[$i];}
|
|
elsif ($tmp[$i]=~/multipair/){delete $tmp[$i];}
|
|
elsif ($tmp[$i]=~/6merpair/){delete $tmp[$i];}
|
|
}
|
|
$core_aln_param=join ("\\\-\\\-",@tmp);
|
|
}
|
|
#print "CORE PARAMS:$core_aln_param\n";<STDIN>; # QA
|
|
my $mafftFormatCoreTreeFile=$dir."PRUNE_BP_FOR_CORE_ALN/tree_$countTrees/".$dataset.".".$orig_prog.".semphy.tree_".$countTrees."CORE.mafftFormat";
|
|
my $mafftCoreMSA=$dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".aln.CORE";
|
|
$cmd="$aln_program $core_aln_param --retree 1 --treein $mafftFormatCoreTreeFile $aminoFile > $mafftCoreMSA; ";
|
|
# use the core MSA and full tree to build the full alignment
|
|
$cmd.="$aln_program $align_param --retree 1 --treein $mafftFormatTreeFile $mafftCoreMSA > $alnFile";
|
|
}
|
|
else
|
|
{
|
|
$cmd = "($aln_program $align_param --retree 1 --treein $mafftFormatTreeFile $aminoFile > $alnFile)";# >& $stdFile";
|
|
}
|
|
} elsif (uc($prog) eq "CLUSTALW") {
|
|
$cmd = "$aln_program $align_param -infile=$aminoFile -usetree=$treeFile -outfile=$alnFile > $stdFile";
|
|
} elsif (uc($prog) eq "MUSCLE") {
|
|
my $rooted_tree=$treeFile.".rooted";
|
|
unless (-e $rooted_tree) {
|
|
return ("ERROR: file does not exist: $rooted_tree\n");
|
|
}
|
|
$cmd = "$aln_program $align_param -in $aminoFile -usetree_nowarn $rooted_tree -out $alnFile > $stdFile";
|
|
} elsif (uc ($prog) eq "PAGAN") {
|
|
my $rooted_tree=$treeFile.".rooted";
|
|
unless (-e $rooted_tree) {
|
|
return ("ERROR: file does not exist: $rooted_tree\n");
|
|
}
|
|
$cmd = "$aln_program $align_param --seqfile $aminoFile --treefile $rooted_tree --outfile $alnFile > $stdFile";
|
|
}
|
|
#print "$cmd\n"; # QA
|
|
system ("$cmd");
|
|
if ($update_file ne "") {
|
|
open (PROGRESS,">$update_file");
|
|
print PROGRESS "\n<ul><li>",$countTrees+1," out of $bp_repeats alternative alignments were created</li></ul>\n";
|
|
close (PROGRESS);
|
|
}
|
|
# print "$cmdFile\n";
|
|
# open IN, ">$cmdFile" or return ("cannot open file $cmdFile\n");
|
|
# print IN "#!/bin/sh\n\ncd ".$dir."tree_$countTrees\n\n";
|
|
# print IN "$cmd\n";
|
|
# close IN;
|
|
# system ($cmd);
|
|
if ($prog eq "PRANK") {
|
|
if ($PRANK_VERSION==0)
|
|
{
|
|
system ("cp $alnFile.1.fas $alnFile");
|
|
}
|
|
elsif ($PRANK_VERSION eq "121218")
|
|
{
|
|
system ("cp $alnFile.best.fas $alnFile");
|
|
}
|
|
else # ver >=120626
|
|
{
|
|
system ("cp $alnFile.2.fas $alnFile");
|
|
}
|
|
}
|
|
if ($prog eq "CLUSTALW")
|
|
{
|
|
MSA_parser::convert_msa_format($alnFile,"clustalw","$alnFile.fs","fasta");
|
|
system ("mv $alnFile $alnFile.orig");
|
|
system ("mv $alnFile.fs $alnFile");
|
|
}
|
|
if ($prog eq "PAGAN")
|
|
{
|
|
move("$alnFile".".fas", "$alnFile");
|
|
}
|
|
#my $alias = $dataset.".".$prog;
|
|
#my $qsub = "qsub -q heavy -N $alias $cmdFile";
|
|
# print $qsub."\n";
|
|
#system ($qsub);
|
|
}
|
|
}
|
|
exit 0;
|
|
} else {
|
|
die "ERROR: fork failed: $!\n";
|
|
}
|
|
|
|
}
|
|
|
|
# Wait for child processes to end
|
|
foreach (@children) {
|
|
my $pid = waitpid($_, 0);
|
|
print "done with pid $pid\n";
|
|
#if ($update_file ne "") {
|
|
# open (PROGRESS,">$update_file");
|
|
# print PROGRESS "\n<ul><li>",$countTrees+1," out of $bp_repeats alternative alignments were created</li></ul>\n";
|
|
# close (PROGRESS);
|
|
#}
|
|
|
|
}
|
|
|
|
return "ok";
|
|
}
|
|
|
|
sub runAlignBPtrees_GUIDANCE3{
|
|
#die " $0 <alignment program (prank/mafft/clustal/muscle/pagan)> <dataset> <noBPdir> <original alignment program - default = same as current program> <num_BP_repeats - default 20> <extra suffix - default ''> required as input\n" if (scalar(@ARGV) < 3);
|
|
my $aln_program = "";
|
|
my $prog = shift;
|
|
|
|
my $dataset = shift;
|
|
my $noBPdir = shift;
|
|
if ($noBPdir !~/(\/)$/){$noBPdir.="/";}
|
|
my $aminoFile = shift;
|
|
|
|
my $Seq_Type = shift;
|
|
if ($Seq_Type eq "") {$Seq_Type="AminoAcids";}
|
|
my $orig_prog=shift;
|
|
if ($orig_prog eq "") {$orig_prog = $prog;}
|
|
my $bp_repeats = shift;
|
|
if ($bp_repeats eq "") {$bp_repeats = 20;}
|
|
my $align_param=shift; # alignment additional parameters
|
|
if (!defined $align_param) {$align_param = "";}
|
|
my $suffix = shift;
|
|
if (!defined $suffix) {$suffix = "";}
|
|
my $update_file=shift; #OPTIONAL FOR SERVER TO UPDATE THE PROGRESS
|
|
my $mafft_prog = shift;
|
|
my $prank_prog = shift;
|
|
my $clustalw_prog = shift;
|
|
my $muscle_prog = shift;
|
|
my $pagan_prog=shift;
|
|
my $proc_num = shift; #AMIT
|
|
my $GapPenDist=shift; # EMP | UNIF
|
|
|
|
my $PRANK_VERSION=0;
|
|
if (uc($prog) eq "PRANK") {
|
|
if ($Seq_Type eq "AminoAcids") {$aln_program = $prank_prog;}
|
|
elsif ($Seq_Type eq "Nucleotides") {$aln_program = $prank_prog;}
|
|
elsif ($Seq_Type eq "Codons") {$aln_program = $prank_prog." -translate ";}
|
|
# if PRANK find out it version
|
|
my $prank_help=`$aln_program`;
|
|
if ($prank_help=~/.*prunedata.*/){$PRANK_VERSION="121218";}
|
|
elsif ($prank_help=~/.*showanc.*/){$PRANK_VERSION="120626";}
|
|
#print "PRANK VERSION:>=$PRANK_VERSION\n";
|
|
} elsif (uc($prog) eq "MAFFT"){
|
|
if ($Seq_Type eq "AminoAcids") {$aln_program = $mafft_prog." --quiet --amino ";}
|
|
elsif ($Seq_Type eq "Nucleotides") {$aln_program = $mafft_prog." --quiet --nuc ";}
|
|
} elsif (uc($prog) eq "MAFFT_LINSI"){
|
|
if ($Seq_Type eq "AminoAcids") {$aln_program = GENERAL_CONSTANTS::MAFFT_LINSI_GUIDANCE." --amino ";}
|
|
elsif ($Seq_Type eq "Nucleotides") {$aln_program = GENERAL_CONSTANTS::MAFFT_LINSI_GUIDANCE." --nuc ";}
|
|
} elsif (uc($prog) eq "CLUSTALW"){
|
|
if ($Seq_Type eq "AminoAcids") {$aln_program = $clustalw_prog." -QUIET -TYPE=PROTEIN ";}
|
|
elsif ($Seq_Type eq "Nucleotides") {$aln_program = $clustalw_prog." -QUIET -TYPE=DNA ";}
|
|
} elsif (uc($prog) eq "MUSCLE"){
|
|
if ($Seq_Type eq "AminoAcids") {$aln_program = $muscle_prog." -quiet -seqtype protein ";}
|
|
elsif ($Seq_Type eq "Nucleotides") {$aln_program = $muscle_prog." -quiet -seqtype dna ";}
|
|
} elsif (uc($prog) eq "PAGAN"){
|
|
$aln_program=$pagan_prog." ";
|
|
}
|
|
|
|
else {
|
|
return "please specify either mafft / mafft_linsi / prank / clustalw / muscle / pagan as an alignment program\n";
|
|
}
|
|
|
|
# SAMPLE OP and EP
|
|
my $OutEP="$noBPdir/SampledEPVals.log";
|
|
my $OutOP="$noBPdir/SampledOPVals.log";
|
|
my $op_vals_arr_ref="";
|
|
my $ep_vals_arr_ref="";
|
|
|
|
if (uc ($GapPenDist) eq "EMP")
|
|
{
|
|
if ($prog eq "MAFFT")
|
|
{
|
|
$OP_DistFile=Guidance::MAFFT_OP_DIST_0_25;
|
|
$EP_DistFile=Guidance::MAFFT_EP_DIST_0_25;
|
|
|
|
#print LOG "Sample op according to empiric distribution: SampelFromEmpiricDistribution($OP_DistFile,$OutOP,$bp_repeats)\n";
|
|
print "Sample op according to empiric distribution: SampelFromEmpiricDistribution($OP_DistFile,$OutOP,$bp_repeats)\n";
|
|
$op_vals_arr_ref=SampelFromEmpiricDistribution($OP_DistFile,$OutOP,$bp_repeats);
|
|
#print LOG "Sample ep according to empiric distribution: SampelFromEmpiricDistribution($EP_DistFile,$OutEP,$bp_repeats)\n";
|
|
print "Sample ep according to empiric distribution: SampelFromEmpiricDistribution($EP_DistFile,$OutEP,$bp_repeats)\n";
|
|
$ep_vals_arr_ref=SampelFromEmpiricDistribution($EP_DistFile,$OutEP,$bp_repeats);
|
|
}
|
|
else
|
|
{
|
|
# print LOG "ERROR: EMPIRICAL DISTRIBTION of gap penelties is available only for MAFFT...\n";
|
|
print "ERROR: EMPIRICAL DISTRIBTION of gap penelties is available only for MAFFT...\n";
|
|
die "EMPIRICAL DISTRIBTION of gap penelties is available only for MAFFT...\n";
|
|
}
|
|
}
|
|
elsif (uc ($GapPenDist) eq "UNIF")
|
|
{
|
|
if ($prog eq "MAFFT")
|
|
{
|
|
#print LOG "Sample op according to uniform distribution: SampleFromUnifomDist(1,3,$OutOP,$FORM{Bootstraps})\n";
|
|
#$op_vals_arr_ref=SampleFromUnifomDist(1,3,$OutOP,$bp_repeats); # according to mafft web-site defaults: http://mafft.cbrc.jp/alignment/server/index.html
|
|
|
|
# print LOG "Sample op according to uniform distribution: SampleFromUnifomDist(0,25,$OutOP,$bp_repeats)\n";
|
|
print "Sample op according to uniform distribution: SampleFromUnifomDist(0,6,$OutOP,$bp_repeats)\n";
|
|
$op_vals_arr_ref=SampleFromUnifomDist(0,6,$OutOP,$bp_repeats);
|
|
# print LOG "Sample op according to uniform distribution: SampleFromUnifomDist(0,40,$OutOP,$bp_repeats)\n";
|
|
print "Sample ep according to uniform distribution: SampleFromUnifomDist(0,4,$OutEP,$bp_repeats)\n";
|
|
$ep_vals_arr_ref=SampleFromUnifomDist(0,4,$OutEP,$bp_repeats);
|
|
}
|
|
elsif ($prog eq "PRANK")
|
|
{
|
|
# print LOG "Sample op according to uniform distribution: Guidance::SampleFromUnifomDist(0,0.01,$OutOP,$FORM{Bootstraps})\n";
|
|
# $op_vals_arr_ref=Guidance::SampleFromUnifomDist(0,0.01,$OutOP,$FORM{Bootstraps}); # for prank v.140110 the defaults are: dna 0.025 / prot 0.005
|
|
# print LOG "Sample op according to uniform distribution: Guidance::SampleFromUnifomDist(0,0.5,$OutOP,$bp_repeats)\n";
|
|
print "Sample op according to uniform distribution: Guidance::SampleFromUnifomDist(0,0.5,$OutOP,$bp_repeats)\n";
|
|
$op_vals_arr_ref=SampleFromUnifomDist(0,0.5,$OutOP,$bp_repeats); # for prank v.140110 the defaults are: dna 0.025 / prot 0.005
|
|
}
|
|
elsif ($prog eq "CLUSTALW")
|
|
{
|
|
# print LOG "Sample gap opening panelty according to uniform distribution: Guidance::SampleFromUnifomDist(4,16,$OutOP,$FORM{Bootstraps})\n";
|
|
print "Sample gap opening panelty according to uniform distribution: Guidance::SampleFromUnifomDist(4,16,$OutOP,$bp_repeats)\n";
|
|
$op_vals_arr_ref=SampleFromUnifomDist(4,16,$OutOP,$bp_repeats);
|
|
}
|
|
}
|
|
|
|
my $dir = $noBPdir."BP/";
|
|
# my $aminoFile = $noBPdir.$dataset.".fas".$suffix;
|
|
|
|
# Running parallel processes using fork, each will run an equal share of the BP alignments
|
|
# my $proc_num = 8;
|
|
my $bp_per_proc = int($bp_repeats / $proc_num) + 1;
|
|
my @children; # array of pid of children
|
|
for (my $proc=0; $proc<$proc_num; $proc++) {
|
|
print "Running proc num $proc\n";
|
|
|
|
my $pid = fork();
|
|
if ($pid) {
|
|
|
|
# parent
|
|
push(@children, $pid);
|
|
|
|
} elsif ($pid == 0) {
|
|
|
|
# child
|
|
for (my $tree_num=0; $tree_num<$bp_per_proc; $tree_num++) {
|
|
my $countTrees = $proc * $bp_per_proc + $tree_num;
|
|
print "proc num $proc\ttree num $tree_num --> global tree index $countTrees\n";
|
|
last if ($countTrees == $bp_repeats);
|
|
my $treeFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".semphy.tree_".$countTrees.$suffix;
|
|
if (-e $treeFile) {
|
|
my $alnFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees."."OP_".$op_vals_arr_ref->[$countTrees]."_EP_".$ep_vals_arr_ref->[$countTrees].".".$prog.".aln";
|
|
my $cmdFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".cmd";
|
|
my $stdFile = $dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".std";
|
|
# if (-e $alnFile) {
|
|
# print "skipping tree $countTrees because $alnFile already exists.\n";
|
|
# next;
|
|
# }
|
|
my $cmd = "";
|
|
if (uc($prog) eq "PRANK") {
|
|
if ($PRANK_VERSION==0)
|
|
{
|
|
$cmd = "$aln_program $align_param -d=$aminoFile -o=$alnFile -t=$treeFile -once -quiet -noxml > $stdFile";# >& $stdFile"; # -noxml is not supported any more
|
|
}
|
|
else # version >=120626
|
|
{
|
|
$cmd = "$aln_program $align_param -d=$aminoFile -o=$alnFile -t=$treeFile -once -quiet > $stdFile"; # -noxml is not supported any more
|
|
}
|
|
} elsif ((uc($prog) eq "MAFFT") or (uc($prog) eq "MAFFT_LINSI")){
|
|
my $mafftFormatTreeFile = $treeFile.".mafftFormat";
|
|
unless (-e $mafftFormatTreeFile) {
|
|
return ("ERROR: file does not exist: $mafftFormatTreeFile\n");
|
|
}
|
|
if ($align_param=~/addfragments/)
|
|
{
|
|
# first build the core MSA
|
|
my $core_aln_param="";
|
|
my @tmp=split (/\Q\-\-\E/,$align_param);
|
|
my $tmp_size=@tmp;
|
|
if ($tmp_size>=1) # command line type
|
|
{
|
|
for (my $i=0;$i<$tmp_size;$i++)
|
|
{
|
|
if ($tmp[$i]=~/addfragments/){delete $tmp[$i];}
|
|
elsif ($tmp[$i]=~/multipair/){delete $tmp[$i];}
|
|
elsif ($tmp[$i]=~/6merpair/){delete $tmp[$i];}
|
|
}
|
|
$core_aln_param=join ("\\\-\\\-",@tmp);
|
|
}
|
|
else # server type
|
|
{
|
|
my @tmp=split (/\Q--\E/,$align_param);
|
|
my $tmp_size=@tmp;
|
|
for (my $i=0;$i<$tmp_size;$i++)
|
|
{
|
|
if ($tmp[$i]=~/addfragments/){delete $tmp[$i];}
|
|
elsif ($tmp[$i]=~/multipair/){delete $tmp[$i];}
|
|
elsif ($tmp[$i]=~/6merpair/){delete $tmp[$i];}
|
|
}
|
|
$core_aln_param=join ("\\\-\\\-",@tmp);
|
|
}
|
|
#print "CORE PARAMS:$core_aln_param\n";<STDIN>; # QA
|
|
my $mafftFormatCoreTreeFile=$dir."PRUNE_BP_FOR_CORE_ALN/tree_$countTrees/".$dataset.".".$orig_prog.".semphy.tree_".$countTrees."CORE.mafftFormat";
|
|
my $mafftCoreMSA=$dir."tree_$countTrees/".$dataset.".".$orig_prog.".tree_$countTrees.".$prog.".aln.CORE";
|
|
$cmd="$aln_program $core_aln_param --retree 1 --treein $mafftFormatCoreTreeFile $aminoFile > $mafftCoreMSA; ";
|
|
# use the core MSA and full tree to build the full alignment
|
|
$cmd.="$aln_program $align_param --retree 1 --treein $mafftFormatTreeFile $mafftCoreMSA > $alnFile";
|
|
}
|
|
else
|
|
{
|
|
$cmd = "$aln_program $align_param --op $op_vals_arr_ref->[$countTrees] --ep $ep_vals_arr_ref->[$countTrees] --retree 1 --treein $mafftFormatTreeFile $aminoFile > $alnFile";# >& $stdFile";
|
|
}
|
|
} elsif (uc($prog) eq "CLUSTALW") {
|
|
$cmd = "$aln_program $align_param -infile=$aminoFile -usetree=$treeFile -outfile=$alnFile > $stdFile";
|
|
} elsif (uc($prog) eq "MUSCLE") {
|
|
my $rooted_tree=$treeFile.".rooted";
|
|
unless (-e $rooted_tree) {
|
|
return ("ERROR: file does not exist: $rooted_tree\n");
|
|
}
|
|
$cmd = "$aln_program $align_param -in $aminoFile -usetree_nowarn $rooted_tree -out $alnFile > $stdFile";
|
|
} elsif (uc ($prog) eq "PAGAN") {
|
|
my $rooted_tree=$treeFile.".rooted";
|
|
unless (-e $rooted_tree) {
|
|
return ("ERROR: file does not exist: $rooted_tree\n");
|
|
}
|
|
$cmd = "$aln_program $align_param --seqfile $aminoFile --treefile $rooted_tree --outfile $alnFile > $stdFile";
|
|
}
|
|
#print "$cmd\n"; # QA
|
|
system ("$cmd");
|
|
if ($update_file ne "") {
|
|
open (PROGRESS,">$update_file");
|
|
print PROGRESS "\n<ul><li>",$countTrees+1," out of $bp_repeats alternative alignments were created</li></ul>\n";
|
|
close (PROGRESS);
|
|
}
|
|
# print "$cmdFile\n";
|
|
# open IN, ">$cmdFile" or return ("cannot open file $cmdFile\n");
|
|
# print IN "#!/bin/sh\n\ncd ".$dir."tree_$countTrees\n\n";
|
|
# print IN "$cmd\n";
|
|
# close IN;
|
|
# system ($cmd);
|
|
if ($prog eq "PRANK") {
|
|
if ($PRANK_VERSION==0)
|
|
{
|
|
system ("cp $alnFile.1.fas $alnFile");
|
|
}
|
|
elsif ($PRANK_VERSION eq "121218")
|
|
{
|
|
system ("cp $alnFile.best.fas $alnFile");
|
|
}
|
|
else # ver >=120626
|
|
{
|
|
system ("cp $alnFile.2.fas $alnFile");
|
|
}
|
|
}
|
|
if ($prog eq "CLUSTALW")
|
|
{
|
|
MSA_parser::convert_msa_format($alnFile,"clustalw","$alnFile.fs","fasta");
|
|
system ("mv $alnFile $alnFile.orig");
|
|
system ("mv $alnFile.fs $alnFile");
|
|
}
|
|
if ($prog eq "PAGAN")
|
|
{
|
|
move("$alnFile".".fas", "$alnFile");
|
|
}
|
|
#my $alias = $dataset.".".$prog;
|
|
#my $qsub = "qsub -q heavy -N $alias $cmdFile";
|
|
# print $qsub."\n";
|
|
#system ($qsub);
|
|
}
|
|
}
|
|
exit 0;
|
|
} else {
|
|
die "ERROR: fork failed: $!\n";
|
|
}
|
|
|
|
}
|
|
|
|
# Wait for child processes to end
|
|
foreach (@children) {
|
|
my $pid = waitpid($_, 0);
|
|
print "done with pid $pid\n";
|
|
#if ($update_file ne "") {
|
|
# open (PROGRESS,">$update_file");
|
|
# print PROGRESS "\n<ul><li>",$countTrees+1," out of $bp_repeats alternative alignments were created</li></ul>\n";
|
|
# close (PROGRESS);
|
|
#}
|
|
|
|
}
|
|
|
|
return "ok";
|
|
}
|
|
|
|
sub rootTree {
|
|
####################################################################################################################
|
|
#die "USAGE: $0 inTree outTree
|
|
#inTree must be an unrooted tree - i.e. root node has at least 3 sons.
|
|
#In outTree the root will have 2 sons
|
|
#(all direct sons of root will be made biforcating - the rest of tree is left untouched)\n"
|
|
####################################################################################################################
|
|
my $inTree=shift;
|
|
my $outTree=shift;
|
|
|
|
my $in = new Bio::TreeIO(-file => "$inTree",
|
|
-format => "newick");
|
|
my $out = new Bio::TreeIO(-file => ">$outTree",
|
|
-format => "newick");
|
|
|
|
while (my $tree = $in->next_tree) {
|
|
my $root = $tree->get_root_node;
|
|
my @sons = $root->each_Descendent;
|
|
|
|
# Remove edges between root-sons
|
|
foreach my $son (@sons) {
|
|
$root->remove_Descendent($son);
|
|
}
|
|
|
|
# Iteratively add
|
|
my $currFather = $root;
|
|
while (@sons > 2) {
|
|
my $son = shift @sons;
|
|
$currFather->add_Descendent($son);
|
|
my $midNode = new Bio::Tree::Node();
|
|
$currFather->add_Descendent($midNode);
|
|
$midNode->branch_length(0);
|
|
$currFather = $midNode;
|
|
}
|
|
$currFather->add_Descendent($sons[0]);
|
|
$currFather->add_Descendent($sons[1]);
|
|
$sons[0]->branch_length(0);
|
|
$sons[1]->branch_length(0);
|
|
|
|
$out->write_tree($tree);
|
|
}
|
|
}
|
|
sub pullOutBPtrees_BBL { # TO DO: Unite with pullOutBPtrees
|
|
# TO DO in order to unite: 1) get the file to parse as an input
|
|
# 2) QA...
|
|
####################################################################################################################
|
|
# pull out all the BP NJ trees into the BP directory
|
|
# pull out the original NJ tree (that was done on the complete MSA file)
|
|
####################################################################################################################
|
|
|
|
#die "Usage: $0 <noBPdir> <dataset> <num_BP_repeats> <alignment program>" if (scalar(@ARGV) < 4);
|
|
|
|
my $noBPdir = shift;
|
|
my $dataset = shift;
|
|
my $bp_repeats = shift;
|
|
my $alnProg = shift;
|
|
|
|
unless ($noBPdir =~ m/\/$/) {
|
|
$noBPdir.="/";
|
|
}
|
|
if (-e $noBPdir.$dataset.".".$alnProg.".semphy.tree")
|
|
{
|
|
unlik ($noBPdir.$dataset.".".$alnProg.".semphy.tree");
|
|
}
|
|
my $BPdir = $noBPdir."BP/";
|
|
unless (-e $BPdir) {
|
|
system ("mkdir $BPdir");
|
|
}
|
|
my $nonUniqueTreesDir="";
|
|
# my $semphyLogFile = $BPdir.$dataset.".".$alnProg.".semphy.log"; # was in pullOutBPtrees
|
|
my $semphyLogFile = $BPdir.$dataset.".".$alnProg.".semphy.out";
|
|
print "semphy log file: $semphyLogFile\n";
|
|
|
|
my $countTrees=0;
|
|
my $countUniqueTrees=0;
|
|
my @numRepeats;
|
|
|
|
if ($alnProg ne "MAFFT") # BUILED ALIGNEMT ONLY FOR UNIQUE TREES
|
|
{
|
|
$nonUniqueTreesDir = $BPdir."nonUniqueTrees/";
|
|
unless (-e $nonUniqueTreesDir) {
|
|
system ("mkdir $nonUniqueTreesDir");
|
|
}
|
|
}
|
|
my $make_unique_trees=""; # yes | no # TO DO: take as arg one day...
|
|
if ($alnProg ne "MAFFT")
|
|
{
|
|
$make_unique_trees="yes";
|
|
}
|
|
else
|
|
{
|
|
$make_unique_trees="no";
|
|
}
|
|
my $treeLine="";
|
|
my $read_reconstructed_tree=0; # will flag that the starting with BP tree...
|
|
open IN, "<$semphyLogFile" or return "can't open file $semphyLogFile";
|
|
while (my $line=<IN>)
|
|
{
|
|
if ($line=~/^\# Finished tree reconstruction\./) # The real - not BP tree
|
|
{
|
|
my $dumpLine=<IN>;
|
|
$dumpLine=<IN>;
|
|
$dumpLine=<IN>;
|
|
$treeLine=<IN>;
|
|
my $treeFile = $noBPdir.$dataset.".".$alnProg.".semphy.tree";
|
|
open OUT, ">$treeFile" or return "can't open file $treeFile";
|
|
# write the tree into the treefile
|
|
print OUT $treeLine;
|
|
# close treefile
|
|
close OUT;
|
|
$treeLine="";
|
|
$read_reconstructed_tree=1;
|
|
}
|
|
elsif ((($line=~/^\# Tree after BBL\./) or ($line=~/The reconsructed tree:/)) and ($read_reconstructed_tree==1)) # BP tree after BBL
|
|
{
|
|
my $dumpLine=<IN> if ($line=~/^\# Tree after BBL\./);
|
|
$treeLine=<IN>;
|
|
if ($make_unique_trees eq "no")
|
|
{
|
|
$treeDir = $BPdir."/tree_".$countTrees."/"
|
|
}
|
|
else
|
|
{
|
|
$treeDir = $nonUniqueTreesDir."/tree_".$countTrees."/";
|
|
}
|
|
unless (-e $treeDir) {system ("mkdir $treeDir");}
|
|
$treeFile = $treeDir.$dataset.".".$alnProg.".semphy.tree_".$countTrees;
|
|
$countTrees++;
|
|
# open treefile
|
|
open OUT, ">$treeFile" or return "can't open file $treeFile";
|
|
# write the tree into the treefile
|
|
print OUT $treeLine;
|
|
# close treefile
|
|
close OUT;
|
|
if ($make_unique_trees eq "yes")
|
|
{
|
|
for ($i=0;$i<$countUniqueTrees;++$i) {
|
|
$uniqueTreeFile = $BPdir."tree_".$i."/".$dataset.".".$alnProg.".semphy.tree_".$i;
|
|
$isEqualTopologyResFile = $treeDir."isEqualTopology.".$i.".std";
|
|
my $isEqualTopologyCommand = "$isEqualTopologyProg $treeFile $uniqueTreeFile";
|
|
my $isEqualTopology = `$isEqualTopologyCommand`;
|
|
open OUT_EQUAL_TOP, ">$isEqualTopologyResFile" or return "can't open file $isEqualTopologyResFile";
|
|
print OUT_EQUAL_TOP "$isEqualTopology\n";
|
|
close OUT_EQUAL_TOP;
|
|
if ($isEqualTopology == 1) { # same tree
|
|
$numRepeats[$i]++;
|
|
# print "$isEqualTopology == 1\n"; # debug OP
|
|
last;
|
|
}
|
|
if ($isEqualTopology == 2) {
|
|
print "skipping ERROR in isEqualTopology of $treeFile and $uniqueTreeFile\n";
|
|
next;
|
|
}
|
|
}
|
|
if ($i == $countUniqueTrees) { # The new tree is unique
|
|
push (@numRepeats,1);
|
|
my $uniqueTreeDir = $BPdir."tree_".$countUniqueTrees."/";
|
|
unless (-e $uniqueTreeDir) {system ("mkdir $uniqueTreeDir");}
|
|
my $uniqueTreeFile = $uniqueTreeDir.$dataset.".".$alnProg.".semphy.tree_".$countUniqueTrees;
|
|
system ("cp $treeFile $uniqueTreeFile");
|
|
$countUniqueTrees++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
close IN;
|
|
if ($countTrees != $bp_repeats) {
|
|
return "ERROR: dataset: $dataset \t countTrees: $countTrees while it should be $bp_repeats \n";
|
|
}
|
|
if ($make_unique_trees eq "yes")
|
|
{
|
|
# print the number of repeats per unique tree into file
|
|
my $numRepeatsFile = $BPdir."numRepeats";
|
|
open OUT_NUM_REPEATS, ">$numRepeatsFile" or return "can't open file $numRepeatsFile";
|
|
print OUT_NUM_REPEATS "@numRepeats";
|
|
close OUT_NUM_REPEATS;
|
|
return "ok",$countUniqueTrees,\@numRepeats;
|
|
}
|
|
else
|
|
{
|
|
return "ok";
|
|
}
|
|
}
|
|
sub pullOutBPtrees {
|
|
####################################################################################################################
|
|
# pull out all the BP NJ trees into the BP directory
|
|
# pull out the original NJ tree (that was done on the complete MSA file)
|
|
####################################################################################################################
|
|
|
|
#die "Usage: $0 <noBPdir> <dataset> <num_BP_repeats> <alignment program>" if (scalar(@ARGV) < 4);
|
|
|
|
my $noBPdir = shift;
|
|
my $dataset = shift;
|
|
my $bp_repeats = shift;
|
|
my $alnProg = shift;
|
|
|
|
unless ($noBPdir =~ m/\/$/) {
|
|
$noBPdir.="/";
|
|
}
|
|
|
|
my $BPdir = $noBPdir."BP/";
|
|
unless (-e $BPdir) {
|
|
system ("mkdir $BPdir");
|
|
}
|
|
my $nonUniqueTreesDir="";
|
|
if ($alnProg ne "MAFFT") # BUILED ALIGNEMT ONLY FOR UNIQUE TREES
|
|
{
|
|
$nonUniqueTreesDir = $BPdir."nonUniqueTrees/";
|
|
unless (-e $nonUniqueTreesDir) {
|
|
system ("mkdir $nonUniqueTreesDir");
|
|
}
|
|
}
|
|
|
|
# my $robinsonFouldCommand = "$phylonet_prog rf";
|
|
# open semphy file
|
|
my $semphyLogFile = $BPdir.$dataset.".".$alnProg.".semphy.log";
|
|
print "semphy log file: $semphyLogFile\n";
|
|
open IN, "<$semphyLogFile" or return "can't open file $semphyLogFile";
|
|
my $countTrees=0;
|
|
my $countUniqueTrees=0;
|
|
my @numRepeats;
|
|
my $readTree=0; # this is a flag. 2 == read the tree. 3 == read the tree and check for uniqueness.
|
|
my $treeDir="";
|
|
my $treeFile="";
|
|
foreach my $line (<IN>) {
|
|
if ($line =~ m/^\# Finished tree reconstruction\./) {
|
|
$readTree=1;
|
|
}
|
|
elsif (($readTree==1) && ($line =~ m/^\# The tree/)) {
|
|
$treeFile = $noBPdir.$dataset.".".$alnProg.".semphy.tree";
|
|
$readTree=2;
|
|
}
|
|
elsif (($readTree==1) && ($line =~ m/The reconsructed tree/)) {
|
|
$readTree=2;
|
|
if ($alnProg eq "MAFFT")
|
|
{
|
|
$treeDir = $BPdir."/tree_".$countTrees."/"
|
|
}
|
|
else # CHECK UNIQUE TREE ONLY NOT FOR MAFFT
|
|
{
|
|
$treeDir = $nonUniqueTreesDir."/tree_".$countTrees."/";
|
|
}
|
|
unless (-e $treeDir) {system ("mkdir $treeDir");}
|
|
$treeFile = $treeDir.$dataset.".".$alnProg.".semphy.tree_".$countTrees;
|
|
$countTrees++;
|
|
$readTree=3 if ($alnProg ne "MAFFT"); # CHECK UNIQUE TREE ONLY NOT FOR MAFFT
|
|
}
|
|
elsif ($readTree>=2) {
|
|
# open treefile
|
|
open OUT, ">$treeFile" or return "can't open file $treeFile";
|
|
# write the tree into the treefile
|
|
print OUT $line;
|
|
# close treefile
|
|
close OUT;
|
|
if ($readTree==3) { # compare the tree to all other unique trees
|
|
for ($i=0;$i<$countUniqueTrees;++$i) {
|
|
$uniqueTreeFile = $BPdir."tree_".$i."/".$dataset.".".$alnProg.".semphy.tree_".$i;
|
|
# $RobinsonFouldResFile = $treeDir."rf.".$i.".std";
|
|
$isEqualTopologyResFile = $treeDir."isEqualTopology.".$i.".std";
|
|
# my $rfCommand = "$robinsonFouldCommand -m $treeFile -e $uniqueTreeFile";
|
|
my $isEqualTopologyCommand = "$isEqualTopologyProg $treeFile $uniqueTreeFile";
|
|
# my @rfResults = `$rfCommand`;
|
|
my $isEqualTopology = `$isEqualTopologyCommand`;
|
|
# my $editedRFresults = $rfResults[0];
|
|
# open OUT_RF, ">$RobinsonFouldResFile" or return "can't open file $RobinsonFouldResFile";
|
|
# print OUT_RF "@rfResults";
|
|
open OUT_EQUAL_TOP, ">$isEqualTopologyResFile" or return "can't open file $isEqualTopologyResFile";
|
|
print OUT_EQUAL_TOP "$isEqualTopology\n";
|
|
close OUT_EQUAL_TOP;
|
|
# if ($editedRFresults =~ m/ERROR/) {
|
|
# print "skipping error in rfResults of $treeFile and $uniqueTreeFile : $editedRFresults\n";
|
|
# print OUT_RF "skipping error in rfResults of $treeFile and $uniqueTreeFile : $editedRFresults\n";
|
|
# close OUT_RF;
|
|
# next;
|
|
# }
|
|
# close OUT_RF;
|
|
# print "isEqualTopology == $isEqualTopology\n"; #debug OP
|
|
# if ($editedRFresults =~ m/^0\s+0\s+/) { # same tree, FP=0 and FN=0
|
|
if ($isEqualTopology == 1) { # same tree
|
|
$numRepeats[$i]++;
|
|
# print "$isEqualTopology == 1\n"; # debug OP
|
|
last;
|
|
}
|
|
if ($isEqualTopology == 2) {
|
|
print "skipping ERROR in isEqualTopology of $treeFile and $uniqueTreeFile\n";
|
|
next;
|
|
}
|
|
}
|
|
if ($i == $countUniqueTrees) { # The new tree is unique
|
|
push (@numRepeats,1);
|
|
my $uniqueTreeDir = $BPdir."tree_".$countUniqueTrees."/";
|
|
unless (-e $uniqueTreeDir) {system ("mkdir $uniqueTreeDir");}
|
|
my $uniqueTreeFile = $uniqueTreeDir.$dataset.".".$alnProg.".semphy.tree_".$countUniqueTrees;
|
|
system ("cp $treeFile $uniqueTreeFile");
|
|
$countUniqueTrees++;
|
|
}
|
|
}
|
|
$readTree=1;
|
|
}
|
|
}
|
|
close IN;
|
|
if ($countTrees != $bp_repeats) {
|
|
return "ERROR: dataset: $dataset \t countTrees: $countTrees while it should be $bp_repeats \n";
|
|
}
|
|
if ($alnProg ne "MAFFT")
|
|
{
|
|
# print the number of repeats per unique tree into file
|
|
my $numRepeatsFile = $BPdir."numRepeats";
|
|
open OUT_NUM_REPEATS, ">$numRepeatsFile" or return "can't open file $numRepeatsFile";
|
|
print OUT_NUM_REPEATS "@numRepeats";
|
|
close OUT_NUM_REPEATS;
|
|
return "ok",$countUniqueTrees,\@numRepeats;
|
|
}
|
|
else
|
|
{
|
|
return "ok";
|
|
}
|
|
}
|
|
|
|
sub copyBootstrapMSA2oneDir {
|
|
|
|
#die "Usage: $0 <BP_MSA_dir> <BPdir> <alignment program (prank/mafft/clustal/muscle) <ref2array - numRepeats4UniqueTree>" if (scalar(@ARGV) < 4);
|
|
|
|
my ($BP_MSA_dir,$BPdir,$prog,$RefNumRepeats4UniqueTree) = @_;
|
|
my @numRepeats4UniqueTree= @$RefNumRepeats4UniqueTree;
|
|
print "numRepeats4UniqueTree: @numRepeats4UniqueTree\n";
|
|
mkdir ($BP_MSA_dir);
|
|
my $countTrees=0;
|
|
for ($i=0; $i<scalar(@numRepeats4UniqueTree); ++$i) {
|
|
$numRepeats = $numRepeats4UniqueTree[$i];
|
|
for (my $j=0; $j < $numRepeats; ++$j) {
|
|
system ("cp $BPdir/tree_$i/*.$prog.aln $BP_MSA_dir/repeated_tree_$countTrees.$prog.aln");
|
|
$countTrees++;
|
|
}
|
|
}
|
|
|
|
return "ok";
|
|
}
|
|
|
|
sub codes2nameFastaFrom1 {
|
|
my $Aln_with_Codes=shift;;
|
|
my $codes_file=shift;
|
|
my $Aln_with_names=shift;
|
|
my %Codes=();
|
|
open (CODES,$codes_file) or return ("Guidance::codes2nameFastaFrom1 Can't open the Codes file: '$codes_file' $!");
|
|
while (my $line=<CODES>)
|
|
{
|
|
chomp $line;
|
|
my ($Seq_name,$Code)=split("\t",$line);
|
|
$Codes{$Code}=$Seq_name;
|
|
}
|
|
close (CODES);
|
|
open (IN,$Aln_with_Codes) or return ("Guidance::codes2nameFastaFrom1 Can't open the Alignment with codes file: '$Aln_with_Codes' $!");
|
|
open (OUT,">$Aln_with_names") or return ("Guidance::codes2nameFastaFrom1 Can't open the Alignment with names file: '$Aln_with_names' for writing $!");
|
|
while (my $line=<IN>){
|
|
chomp ($line);
|
|
if ($line=~/>(seq[0-9]+)$/)
|
|
{
|
|
print OUT ">",$Codes{$1},"\n";
|
|
}
|
|
elsif ($line=~/^>([0-9]+)/)
|
|
{
|
|
print OUT ">",$Codes{$1},"\n";
|
|
}
|
|
else # Seq Line
|
|
{
|
|
print OUT "$line\n";
|
|
}
|
|
}
|
|
close OUT;
|
|
close IN;
|
|
return "OK";
|
|
}
|
|
|
|
|
|
sub ConvertNamesOfAlignWithSeed
|
|
###################################################################################################
|
|
# Take alignment created with seed (by MAFFT) and convert the seq with _seed_ prefix into numbers
|
|
# IMPORTANT: The seed seq must be first so MAFFT --reorder argument must not be use
|
|
###################################################################################################
|
|
{
|
|
my $aln=shift;
|
|
my $out=shift;
|
|
open (IN,$aln) || return "Guidance::ConvertNamesOfAlignWithSeed: Can't open In '$aln' $!";
|
|
open (OUT,">$out") || return "Guidance::ConvertNamesOfAlignWithSeed: Can't open Out '$out' $!";
|
|
my $seed_counter=0;
|
|
while (my $line=<IN>)
|
|
{
|
|
if ($line=~/>_seed_(.*)/)
|
|
{
|
|
$seed_counter++;
|
|
$line=~s/>_seed_(.*)/>$seed_counter/;
|
|
print OUT "$line";
|
|
}
|
|
else
|
|
{
|
|
print OUT $line;
|
|
}
|
|
}
|
|
close (IN);
|
|
close (OUT);
|
|
return "ok";
|
|
}
|
|
sub name2codeFasta_without_codded_out {
|
|
####################################################################################################################
|
|
# Convert the names in a fasta file to numbers, and creates a code file with the names and the codes (running number)
|
|
###################################################################################################################
|
|
my $in_fileName = shift;
|
|
my $code_fileName = shift;
|
|
my $counter_offset= shift; # optional
|
|
|
|
my $in_file = Bio::SeqIO->new(-file => $in_fileName , '-format' => 'Fasta');
|
|
my $code_file = new FileHandle(">>$code_fileName") or return ("Can't write to $code_fileName $!");
|
|
$counter_offset=1 if (!defined $counter_offset);
|
|
$counter_offset=1 if ($counter_offset==0);
|
|
my $counter = $counter_offset;
|
|
my $i;
|
|
|
|
while ( my $seqObj = $in_file->next_seq() ) {
|
|
my $name = $seqObj->display_id();
|
|
$name.= " ".$seqObj->desc() if ($seqObj->desc());
|
|
print $code_file "$name\t$counter\n";
|
|
$counter++;
|
|
}
|
|
$in_file->close();
|
|
$code_file->close();
|
|
return "ok",$counter;
|
|
}
|
|
sub name2codeFastaFrom1 {
|
|
####################################################################################################################
|
|
# Convert the names in a fasta file to numbers, and creates a code file with the names and the codes (running number)
|
|
###################################################################################################################
|
|
my $in_fileName = shift;
|
|
my $code_fileName = shift;
|
|
my $out_fileName = shift;
|
|
my $counter_offset=shift; # optional
|
|
my $OutNameFormat=shift; # {num|seqNum} - The format of name in the coded file (num=only the seq number, seqNum=seq%04u [HoT]) default=num;
|
|
$OutNameFormat="num" if (!defined $OutNameFormat);
|
|
$OutNameFormat="num" if ($OutNameFormat eq "");
|
|
|
|
my $in_file = Bio::SeqIO->new(-file => $in_fileName , '-format' => 'Fasta');
|
|
my $code_file = new FileHandle(">>$code_fileName") or return ("Can't write to $code_fileName $!");
|
|
my $out_file = new FileHandle(">$out_fileName") or return ("Can't write to $out_fileName");
|
|
$counter_offset=1 if (!defined $counter_offset);
|
|
$counter_offset=1 if ($counter_offset==0) and ($OutNameFormat ne "seqNum");
|
|
my $counter = $counter_offset;
|
|
my $i;
|
|
|
|
while ( my $seqObj = $in_file->next_seq() ) {
|
|
my $name = $seqObj->display_id();
|
|
$name.= " ".$seqObj->desc() if ($seqObj->desc());
|
|
if ($OutNameFormat eq "seqNum")
|
|
{
|
|
my $sn=sprintf('seq%04u',$counter);
|
|
print $code_file "$name\t$sn\n";
|
|
}
|
|
else
|
|
{
|
|
print $code_file "$name\t$counter\n";
|
|
}
|
|
my $seq = $seqObj->seq();
|
|
if ($OutNameFormat eq "seqNum")
|
|
{
|
|
my $sn=sprintf('seq%04u',$counter);
|
|
print $out_file ">$sn\n";
|
|
}
|
|
else
|
|
{
|
|
print $out_file ">$counter\n";
|
|
}
|
|
for($i=0;$i<length($seq);$i+=60){
|
|
print $out_file substr($seq,$i,60) . "\n";
|
|
}
|
|
if($i<length($seq)){
|
|
print $out_file substr($seq,$i,length($seq)-$i);
|
|
}
|
|
print $out_file "\n";
|
|
$counter++;
|
|
}
|
|
$out_file->close();
|
|
$in_file->close();
|
|
$code_file->close();
|
|
return "ok";
|
|
}
|
|
|
|
sub average{
|
|
my $data = $_[0];
|
|
my $data_size=scalar(@{$data});
|
|
if ($data_size==0) {
|
|
return ("ERROR: Empty array\n");
|
|
}
|
|
my $total = 0;
|
|
foreach my $val (@{$data}) {
|
|
$total += $val;
|
|
}
|
|
my $average = $total / $data_size;
|
|
return $average;
|
|
}
|
|
sub stdev{
|
|
my $data = $_[0];
|
|
my $average = $_[1];
|
|
my $data_size=scalar (@{$data});
|
|
if($data_size == 1){
|
|
return 0;
|
|
}
|
|
my $sqtotal = 0;
|
|
foreach my $val (@{$data}) {
|
|
$sqtotal += ($average-$val) ** 2;
|
|
}
|
|
my $std = ($sqtotal / ($data_size-1)) ** 0.5;
|
|
return $std;
|
|
}
|
|
sub Calculate_mean_and_std
|
|
# get a delimited file and calcilate the std and average
|
|
{
|
|
my $DataFile=shift;
|
|
my $Column=shift;
|
|
$Column--; # for split
|
|
my $AVERAGE="NAN";
|
|
my $STD="NAN";
|
|
my @data=();
|
|
open (DATA_FILE,"<$DataFile") or return ("Calculate_mean_and_std: Can't open file '$DataFile' $!");
|
|
my $line=<DATA_FILE>; # for header
|
|
while ($line=<DATA_FILE>)
|
|
{
|
|
chomp ($line);
|
|
$line=~ s/^\s+|\s+$//g;
|
|
my @line=split (/\s+/,$line);
|
|
push (@data,$line[$Column]) if ((defined $line[$Column]) and ($line[$Column]=~/\d/));
|
|
}
|
|
close (DATA_FILE);
|
|
my $avg=average(\@data);
|
|
my $std=stdev(\@data,$avg);
|
|
return ($avg,$std);
|
|
}
|
|
sub removeLowSPseq_Consider_Z_OLD{
|
|
# will remove all sequences in which their Z score is bellow cutoff and their SP score is also below cutoff
|
|
my $msaFile=shift;
|
|
my $SeqSpFile=shift;
|
|
my $outFile=shift;
|
|
my $cutoof=shift;
|
|
my $Z_cutoff=shift;
|
|
my $removed_seq_file=shift;
|
|
|
|
my ($mean,$std)=Calculate_mean_and_std ($SeqSpFile,2);
|
|
my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $msaFile) or die "Can't open $msaFile: $!";
|
|
my $aln = $in->next_aln;
|
|
$aln->verbose(1); #HAIM COMMNET
|
|
# Otherwise, bioperl adds sequence start/stop values
|
|
$aln->set_displayname_flat(); #HAIM COMMENT
|
|
|
|
open (SEQ_SP_SCORES,"<$SeqSpFile") or return ("removeLowSPseq_Consider_Z: Can't open file '$SeqSpFile' $!");
|
|
open (OUT,">$outFile") or return ("removeLowSPseq_Consider_Z: Can't open file '$outFile' $!");
|
|
open (REMOVED_SEQ,">$removed_seq_file") or return ("removeLowSPseq_Consider_Z: Can't open file '$removed_seq_file' $!");
|
|
my $line=<SEQ_SP_SCORES>; #For header
|
|
while ($line=<SEQ_SP_SCORES>){
|
|
if ($line=~m/^\s*(\d+)\s+(\d+(\.\d+)?)/)
|
|
{
|
|
my $row_num=$1;
|
|
my $Seq_SP_Score=$2;
|
|
my $seq_obj = $aln->get_seq_by_pos($row_num);
|
|
my $seq=$seq_obj->seq();
|
|
$seq=~s/-//g;
|
|
$seq =~ s/(.{60,60})/$1\n/g ;
|
|
$seq .= "\n" unless (substr($seq, -1, 1) eq "\n") ;
|
|
my $Z_score="NaN";
|
|
if ($std>0) {$Z_score=($Seq_SP_Score-$mean)/$std;}
|
|
if ($Seq_SP_Score eq "nan")
|
|
{
|
|
print OUT ">".$seq_obj->id(),"_SP_$Seq_SP_Score_Z_$Z_score\n",$seq,"\n";
|
|
}
|
|
elsif (($Z_score ne "NaN") and ($Z_score<$Z_cutoff)) # a negative outlier
|
|
{
|
|
if (($Seq_SP_Score<$cutoof)) # To avoid filter highly scored position
|
|
{
|
|
print REMOVED_SEQ ">".$seq_obj->id(),"_SP_$Seq_SP_Score"."_Z_$Z_score\n",$seq,"\n";
|
|
}
|
|
else
|
|
{
|
|
print OUT ">".$seq_obj->id(),"_SP_$Seq_SP_Score"."_Z_$Z_score\n",$seq,"\n";
|
|
}
|
|
}
|
|
else
|
|
{
|
|
print OUT ">".$seq_obj->id(),"_SP_$Seq_SP_Score"."_Z_$Z_score\n",$seq,"\n";
|
|
}
|
|
}
|
|
}
|
|
close (SEQ_SP_SCORES);
|
|
close (OUT);
|
|
close (REMOVED_SEQ);
|
|
return "OK";
|
|
|
|
}
|
|
|
|
sub removeLowSPseq_Consider_Z{
|
|
# will remove all sequences in which their Z score is bellow cutoff and their SP score is also below cutoff
|
|
# Alow removal of sequences by their row number ($type=ByRowNum)in the MSA (when using the MSA set score raw file the scores are for MSA row)
|
|
# or by sequence name ($type=BySeqName)
|
|
# when using ByRowNum - we use BioPerl object [default]
|
|
# when using BySeqName - we use hash to represent MSA (currently without extenssive MSA validation)
|
|
|
|
my $msaFile=shift;
|
|
my $SeqSpFile=shift;
|
|
my $outFile=shift;
|
|
my $cutoof=shift;
|
|
my $Z_cutoff=shift;
|
|
my $removed_seq_file=shift;
|
|
my $type=shift; # ByRowNum | BySeqName
|
|
if (!defined $type){$type="ByRowNum";}
|
|
my ($mean,$std)=Calculate_mean_and_std ($SeqSpFile,2);
|
|
|
|
my ($MSA_HashRef,$MSA_Length,$MSA_Order_ArrayRef); # For $type=BySeqName
|
|
my ($in,$aln); # For $type=ByRowNum
|
|
|
|
if (uc($type) eq "BYSEQNAME")
|
|
{
|
|
my @ans=readMSA_to_Hash($msaFile);
|
|
if ($ans[0] ne "OK"){return "removeLowSPseq_Consider_Z:$ans[0]\n";}
|
|
$MSA_HashRef=$ans[1];
|
|
$MSA_Length=$ans[2];
|
|
$MSA_Order_ArrayRef=$ans[3];
|
|
}
|
|
elsif (uc($type) eq "BYROWNUM")
|
|
{
|
|
$in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $msaFile) or die "Can't open $msaFile: $!";
|
|
$aln = $in->next_aln;
|
|
$aln->verbose(1); #HAIM COMMNET
|
|
# Otherwise, bioperl adds sequence start/stop values
|
|
$aln->set_displayname_flat(); #HAIM COMMENT
|
|
}
|
|
open (SEQ_SP_SCORES,"<$SeqSpFile") or return ("removeLowSPseq_Consider_Z: Can't open file '$SeqSpFile' $!");
|
|
open (OUT,">$outFile") or return ("removeLowSPseq_Consider_Z: Can't open file '$outFile' $!");
|
|
open (REMOVED_SEQ,">$removed_seq_file") or return ("removeLowSPseq_Consider_Z: Can't open file '$removed_seq_file' $!");
|
|
my $line=<SEQ_SP_SCORES>; #For header
|
|
while ($line=<SEQ_SP_SCORES>){
|
|
if ($line=~m/^\s*(\S+)\s+(\d+(\.\d+)?)/)
|
|
{
|
|
my $seq_id=$1;
|
|
my $Seq_SP_Score=$2;
|
|
my ($seq,$seq_name);
|
|
if (uc($type) eq "BYSEQNAME")
|
|
{
|
|
$seq=join("",@{$MSA_HashRef->{$seq_id}}); # seq_id=seq_name
|
|
$seq_name=$seq_id;
|
|
}
|
|
elsif (uc($type) eq "BYROWNUM")
|
|
{
|
|
$seq_obj = $aln->get_seq_by_pos($seq_id); # seq_id=row_num
|
|
$seq=$seq_obj->seq();
|
|
$seq_name=$seq_obj->id();
|
|
}
|
|
$seq=~s/-//g;
|
|
$seq =~ s/(.{60,60})/$1\n/g ;
|
|
$seq .= "\n" unless (substr($seq, -1, 1) eq "\n") ;
|
|
my $Z_score="NaN";
|
|
if ($std>0) {$Z_score=($Seq_SP_Score-$mean)/$std;}
|
|
if ($Seq_SP_Score eq "nan")
|
|
{
|
|
print OUT ">".$seq_name,"_SP_$Seq_SP_Score_Z_$Z_score\n",$seq,"\n";
|
|
}
|
|
elsif (($Z_score ne "NaN") and ($Z_score<$Z_cutoff)) # a negative outlier
|
|
{
|
|
if (($Seq_SP_Score<$cutoof)) # To avoid filter highly scored position
|
|
{
|
|
print REMOVED_SEQ ">".$seq_name,"_SP_$Seq_SP_Score"."_Z_$Z_score\n",$seq,"\n";
|
|
}
|
|
else
|
|
{
|
|
print OUT ">".$seq_name,"_SP_$Seq_SP_Score"."_Z_$Z_score\n",$seq,"\n";
|
|
}
|
|
}
|
|
else
|
|
{
|
|
print OUT ">".$seq_name,"_SP_$Seq_SP_Score"."_Z_$Z_score\n",$seq,"\n";
|
|
}
|
|
}
|
|
}
|
|
close (SEQ_SP_SCORES);
|
|
close (OUT);
|
|
close (REMOVED_SEQ);
|
|
return "OK";
|
|
|
|
}
|
|
sub removeLowSPseq{
|
|
# Alow removal of sequences by their row number ($type=ByRowNum)in the MSA (when using the MSA set score raw file the scores are for MSA row)
|
|
# or by sequence name ($type=BySeqName)
|
|
# when using ByRowNum - we use BioPerl object [default]
|
|
# when using BySeqName - we use hash to represent MSA (currently without extenssive MSA validation)
|
|
my $msaFile=shift;
|
|
my $SeqSpFile=shift;
|
|
my $outFile=shift;
|
|
my $cutoof=shift;
|
|
my $removed_seq_file=shift;
|
|
my $type=shift; # ByRowNum | BySeqName
|
|
if (!defined $type){$type="ByRowNum";}
|
|
my ($MSA_HashRef,$MSA_Length,$MSA_Order_ArrayRef); # For $type=BySeqName
|
|
my ($in,$aln); # For $type=ByRowNum
|
|
if (uc($type) eq "BYSEQNAME")
|
|
{
|
|
my @ans=readMSA_to_Hash($msaFile);
|
|
if ($ans[0] ne "OK"){return "removeLowSPseq:$ans[0]\n";}
|
|
$MSA_HashRef=$ans[1];
|
|
$MSA_Length=$ans[2];
|
|
$MSA_Order_ArrayRef=$ans[3];
|
|
}
|
|
elsif (uc($type) eq "BYROWNUM")
|
|
{
|
|
$in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $msaFile) or die "Can't open $msaFile: $!";
|
|
$aln = $in->next_aln;
|
|
$aln->verbose(1); #HAIM COMMNET
|
|
# Otherwise, bioperl adds sequence start/stop values
|
|
$aln->set_displayname_flat(); #HAIM COMMENT
|
|
}
|
|
|
|
open (SEQ_SP_SCORES,"<$SeqSpFile") or return ("removeLowSPseq: Can't open file '$SeqSpFile' $!");
|
|
open (OUT,">$outFile") or return ("removeLowSPseq: Can't open file '$outFile' $!");
|
|
open (REMOVED_SEQ,">$removed_seq_file") or return ("removeLowSPseq: Can't open file '$removed_seq_file' $!");
|
|
my $line=<SEQ_SP_SCORES>; #For header
|
|
while ($line=<SEQ_SP_SCORES>){
|
|
if ($line=~m/^\s*(\S+)\s+(\d+(\.\d+)?)/)
|
|
{
|
|
my $seq_id=$1;
|
|
my $Seq_SP_Score=$2;
|
|
my ($seq,$seq_name);
|
|
if (uc($type) eq "BYSEQNAME")
|
|
{
|
|
$seq=join ("",@{$MSA_HashRef->{$seq_id}}); # seq_id=seq_name
|
|
$seq_name=$seq_id;
|
|
}
|
|
elsif (uc($type) eq "BYROWNUM")
|
|
{
|
|
$seq_obj = $aln->get_seq_by_pos($seq_id); # seq_id=row_num
|
|
$seq=$seq_obj->seq();
|
|
$seq_name=$seq_obj->id();
|
|
}
|
|
$seq=~s/-//g;
|
|
$seq =~ s/(.{60,60})/$1\n/g ;
|
|
$seq .= "\n" unless (substr($seq, -1, 1) eq "\n") ;
|
|
if ($Seq_SP_Score>=$cutoof)
|
|
{
|
|
print OUT ">".$seq_name,"\n",$seq,"\n";
|
|
}
|
|
else
|
|
{
|
|
print REMOVED_SEQ ">".$seq_name,"\n",$seq,"\n";
|
|
}
|
|
}
|
|
}
|
|
close (SEQ_SP_SCORES);
|
|
close (OUT);
|
|
close (REMOVED_SEQ);
|
|
return "OK";
|
|
}
|
|
sub removeLowSPseq_OLD{ # before the option to filter by seq name - do it by row number!
|
|
my $msaFile=shift;
|
|
my $SeqSpFile=shift;
|
|
my $outFile=shift;
|
|
my $cutoof=shift;
|
|
my $removed_seq_file=shift;
|
|
my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $msaFile) or die "Can't open $msaFile: $!";
|
|
my $aln = $in->next_aln;
|
|
$aln->verbose(1); #HAIM COMMNET
|
|
# Otherwise, bioperl adds sequence start/stop values
|
|
$aln->set_displayname_flat(); #HAIM COMMENT
|
|
|
|
open (SEQ_SP_SCORES,"<$SeqSpFile") or return ("removeLowSPseq: Can't open file '$SeqSpFile' $!");
|
|
open (OUT,">$outFile") or return ("removeLowSPseq: Can't open file '$outFile' $!");
|
|
open (REMOVED_SEQ,">$removed_seq_file") or return ("removeLowSPseq: Can't open file '$removed_seq_file' $!");
|
|
my $line=<SEQ_SP_SCORES>; #For header
|
|
while ($line=<SEQ_SP_SCORES>){
|
|
if ($line=~m/^\s*(\d+)\s+(\d+(\.\d+)?)/)
|
|
{
|
|
my $row_num=$1;
|
|
my $Seq_SP_Score=$2;
|
|
my $seq_obj = $aln->get_seq_by_pos($row_num);
|
|
my $seq=$seq_obj->seq();
|
|
$seq=~s/-//g;
|
|
$seq =~ s/(.{60,60})/$1\n/g ;
|
|
$seq .= "\n" unless (substr($seq, -1, 1) eq "\n") ;
|
|
if ($Seq_SP_Score>=$cutoof)
|
|
{
|
|
print OUT ">".$seq_obj->id(),"\n",$seq,"\n";
|
|
}
|
|
else
|
|
{
|
|
print REMOVED_SEQ ">".$seq_obj->id(),"\n",$seq,"\n";
|
|
}
|
|
}
|
|
}
|
|
close (SEQ_SP_SCORES);
|
|
close (OUT);
|
|
close (REMOVED_SEQ);
|
|
return "OK";
|
|
}
|
|
|
|
sub removeLowSPsites_Consider_Z {
|
|
# no checks of the input files are done.
|
|
#die "USAGE: $0 MSA_FILE SP_FILE OUT_FILE CUTOFF Z_CUTOFF" if (@ARGV < 5);
|
|
|
|
my $msaFile=shift;
|
|
my $spFile=shift;
|
|
my $outFile=shift;
|
|
my $cutoff=shift;
|
|
my $Z_cutoff=shift;
|
|
my $Pos_removed_file=shift;
|
|
|
|
my $MSA_Length=0;
|
|
my $Num_Pos_removed=0;
|
|
my $in_fasta = Bio::SeqIO->new(-file => $msaFile, '-format' => 'fasta');
|
|
my @seqs;
|
|
|
|
# read the file into an array
|
|
while (my $seqObj = $in_fasta->next_seq()) {
|
|
push(@seqs,$seqObj);
|
|
if ($MSA_Length==0)
|
|
{
|
|
$MSA_Length=$seqObj->length;
|
|
}
|
|
}
|
|
my ($mean,$std)=Calculate_mean_and_std ($spFile,2);
|
|
open IN, "<$spFile" or return ("removeLowSPsites_Consider_Z: can't open file '$spFile'\n");
|
|
open OUT, ">$outFile" or return ("removeLowSPsites_Consider_Z: can't open file '$outFile' $!\n");
|
|
if ($Pos_removed_file ne ""){open REMOVED_POS, ">$Pos_removed_file" or return ("removeLowSPsites_Consider_Z: can't open file '$Pos_removed_file' $!\n");}
|
|
my $numRemovedPos = 0;
|
|
foreach my $line(<IN>) {
|
|
if ($line =~ m/^\s*(\d+)\s+(\d+(\.\d+)?)/) {
|
|
my $site_num=$1;
|
|
my $site_score=$2;
|
|
my $Z_score="NaN";
|
|
if ($std>0) {$Z_score=($site_score-$mean)/$std;}
|
|
if (($Z_score ne "NaN") and ($Z_score<-$Z_cutoff)) # an outlier
|
|
{
|
|
if ($site_score < $cutoff) {
|
|
removePos($site_num,$numRemovedPos,\@seqs);
|
|
print REMOVED_POS "Remove Pos: $site_num\tScore: $site_score\tZ_Score:$Z_score\n" if ($Pos_removed_file ne "");
|
|
$numRemovedPos++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
foreach my $seqObj (@seqs) {
|
|
my $id = $seqObj->id();
|
|
my $seq = $seqObj->seq();
|
|
print OUT ">$id\n$seq\n";
|
|
}
|
|
close (IN);
|
|
close (OUT);
|
|
close (REMOVED_POS);
|
|
return ("OK",$numRemovedPos,$MSA_Length);
|
|
}
|
|
sub removeLowSPsites {
|
|
# no checks of the input files are done.
|
|
#die "USAGE: $0 MSA_FILE SP_FILE OUT_FILE CUTOFF" if (@ARGV < 4);
|
|
|
|
my $msaFile=shift;
|
|
my $spFile=shift;
|
|
my $outFile=shift;
|
|
my $cutoff=shift;
|
|
my $Pos_removed_file=shift;
|
|
# my $Alphabet=shift;
|
|
my $MSA_Length=0;
|
|
my $Num_Pos_removed=0;
|
|
my $in_fasta = Bio::SeqIO->new(-file => $msaFile, '-format' => 'fasta');
|
|
# my $in_fasta = Bio::SeqIO->new(-file => $msaFile, '-format' => 'fasta',-alphabet => $Alphabet);
|
|
|
|
my @seqs;
|
|
|
|
# read the file into an array
|
|
while (my $seqObj = $in_fasta->next_seq()) {
|
|
push(@seqs,$seqObj);
|
|
if ($MSA_Length==0)
|
|
{
|
|
$MSA_Length=$seqObj->length;
|
|
}
|
|
}
|
|
open IN, "<$spFile" or return ("removeLowSPsites: can't open file '$spFile'\n");
|
|
open OUT, ">$outFile" or return ("removeLowSPsites: can't open file '$outFile' $!\n");
|
|
if ($Pos_removed_file ne ""){open REMOVED_POS, ">$Pos_removed_file" or return ("removeLowSPsites: can't open file '$Pos_removed_file' $!\n");}
|
|
|
|
my $numRemovedPos = 0;
|
|
foreach my $line(<IN>) {
|
|
if ($line =~ m/^\s*(\d+)\s+(\d+(\.\d+)?)/) {
|
|
if ($2 < $cutoff) {
|
|
removePos($1,$numRemovedPos,\@seqs);
|
|
# removePos($1,$numRemovedPos,\@seqs,$Alphabet);
|
|
print REMOVED_POS "Remove Pos: $1\tScore: $2\n" if ($Pos_removed_file ne "");
|
|
$numRemovedPos++;
|
|
}
|
|
}
|
|
}
|
|
foreach my $seqObj (@seqs) {
|
|
my $id = $seqObj->id();
|
|
my $seq = $seqObj->seq();
|
|
print OUT ">$id\n$seq\n";
|
|
}
|
|
close (IN);
|
|
close (OUT);
|
|
close (REMOVED_POS);
|
|
return ("OK",$numRemovedPos,$MSA_Length);
|
|
}
|
|
sub removePos {
|
|
my $numRemovedPos=$_[1];
|
|
my $seq_ref=$_[2];
|
|
# my $Alphabet=$_[3];
|
|
my $pos2remove = $_[0] - $numRemovedPos;
|
|
foreach my $seqObj (@{$seq_ref}){
|
|
my $new_seq = "";
|
|
if ($pos2remove>1) {
|
|
$new_seq = $seqObj->subseq(1,$pos2remove-1);
|
|
}
|
|
if ($pos2remove< $seqObj->length()) {
|
|
$new_seq .= $seqObj->subseq($pos2remove+1,$seqObj->length());
|
|
}
|
|
$seqObj->seq($new_seq);
|
|
# $seqObj->alphabet($Alphabet);
|
|
}
|
|
}
|
|
|
|
sub removeLowSPsites_NoBioPerl_Consider_Z {
|
|
# no checks of the input files are done.
|
|
#die "USAGE: $0 MSA_FILE SP_FILE OUT_FILE CUTOFF Z_CUTOFF POS_REMOVED_FILE" if (@ARGV < 6);
|
|
|
|
my $msaFile=shift;
|
|
my $spFile=shift;
|
|
my $outFile=shift;
|
|
my $cutoff=shift;
|
|
my $Z_cutoff=shift;
|
|
my $Pos_removed_file=shift;
|
|
|
|
my $Num_Pos_removed=0;
|
|
my @ans=readMSA_to_Hash($msaFile);
|
|
if ($ans[0] ne "OK"){return "removeLowSPsites_Consider_Z:$ans[0]\n";}
|
|
my $MSA_HashRef=$ans[1]; # hash of array, each key is seq ID and the value is an array with the seq
|
|
my $MSA_Length=$ans[2];
|
|
my $MSA_Order_ArrayRef=$ans[3];
|
|
my ($mean,$std)=Calculate_mean_and_std ($spFile,2);
|
|
open IN, "<$spFile" or return ("removeLowSPsites_Consider_Z: can't open file '$spFile'\n");
|
|
open OUT, ">$outFile" or return ("removeLowSPsites_Consider_Z: can't open file '$outFile' $!\n");
|
|
if ($Pos_removed_file ne ""){open REMOVED_POS, ">$Pos_removed_file" or return ("removeLowSPsites_Consider_Z: can't open file '$Pos_removed_file' $!\n");}
|
|
my $numRemovedPos = 0;
|
|
foreach my $line(<IN>) {
|
|
if ($line =~ m/^\s*(\d+)\s+(\d+(\.\d+)?)/) {
|
|
my $site_num=$1;
|
|
my $site_score=$2;
|
|
my $Z_score="NaN";
|
|
if ($std>0) {$Z_score=($site_score-$mean)/$std;}
|
|
if (($Z_score ne "NaN") and ($Z_score<$Z_cutoff)) # an outlier
|
|
{
|
|
if ($site_score < $cutoff) {
|
|
$MSA_HashRef=removePos_noBioPerl($site_num-1,$MSA_HashRef);
|
|
print REMOVED_POS "Remove Pos: $site_num\tScore: $site_score\tZ_Score: $Z_score\n" if ($Pos_removed_file ne "");
|
|
$numRemovedPos++;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
foreach my $seqID (@{$MSA_Order_ArrayRef}) {
|
|
my $seq = join ("",@{$MSA_HashRef->{$seqID}});
|
|
if ($seq=~/^[-]+$/) {print "WARNNING: After removing positions scored below $cutoff and Z_Score: $Z_cutoff, the sequence $seqID comprised of only gap characters...\n";}
|
|
print OUT ">$seqID\n$seq\n";
|
|
}
|
|
close (IN);
|
|
close (OUT);
|
|
close (REMOVED_POS);
|
|
return ("OK",$numRemovedPos,$MSA_Length);
|
|
}
|
|
sub removeLowSPsites_NoBioPerl {
|
|
# no checks of the input files are done.
|
|
#die "USAGE: $0 MSA_FILE SP_FILE OUT_FILE CUTOFF POS_REMOVED_FILE" if (@ARGV < 5);
|
|
|
|
my $msaFile=shift;
|
|
my $spFile=shift;
|
|
my $outFile=shift;
|
|
my $cutoff=shift;
|
|
my $Pos_removed_file=shift;
|
|
|
|
my $Num_Pos_removed=0;
|
|
my @ans=readMSA_to_Hash($msaFile);
|
|
if ($ans[0] ne "OK"){return "removeLowSPsites:$ans[0]\n";}
|
|
my $MSA_HashRef=$ans[1];
|
|
my $MSA_Length=$ans[2];
|
|
my $MSA_Order_ArrayRef=$ans[3];
|
|
open IN, "<$spFile" or return ("removeLowSPsites: can't open file '$spFile'\n");
|
|
open OUT, ">$outFile" or return ("removeLowSPsites: can't open file '$outFile' $!\n");
|
|
if ($Pos_removed_file ne ""){open REMOVED_POS, ">$Pos_removed_file" or return ("removeLowSPsites: can't open file '$Pos_removed_file' $!\n");}
|
|
|
|
my $numRemovedPos = 0;
|
|
foreach my $line(<IN>) {
|
|
chomp ($line);
|
|
if ($line =~ m/^\s*(\d+)\s+(\d+(\.\d+)?)/) {
|
|
if ($2 < $cutoff) {
|
|
my $Pos=$1;
|
|
$MSA_HashRef=removePos_noBioPerl ($Pos-1,$MSA_HashRef); # Pos-1 because array start from 0;
|
|
print REMOVED_POS "Remove Pos: $1\tScore: $2\n" if ($Pos_removed_file ne "");
|
|
$numRemovedPos++;
|
|
}
|
|
}
|
|
}
|
|
foreach my $seqID (@{$MSA_Order_ArrayRef}) {
|
|
my $seq = join ("",@{$MSA_HashRef->{$seqID}});
|
|
if ($seq=~/^[-]+$/) {print "WARNNING: After removing positions scored below $cutoff, the sequence $seqID comprised of only gap characters...\n";}
|
|
print OUT ">$seqID\n$seq\n";
|
|
}
|
|
close (IN);
|
|
close (OUT);
|
|
close (REMOVED_POS);
|
|
return ("OK",$numRemovedPos,$MSA_Length);
|
|
}
|
|
sub printMSA_Hash
|
|
{
|
|
my $refToHash=shift;
|
|
foreach my $key (keys %{$refToHash})
|
|
{
|
|
print ">$key\n",join ("",@{$refToHash->{$key}}),"\n";
|
|
}
|
|
}
|
|
sub readMSA_to_Hash
|
|
{
|
|
# Take an MSA in FASTA format and return (1) an hash of arrays where seq id is key and the seq is an array; (2) MSA length (3) ref to array with the order of seq header in orig file
|
|
my $MSA_File=$_[0];
|
|
my %MSA_Hash=();
|
|
my $MSA_Length=0;
|
|
my @MSA_Order=();
|
|
# open file
|
|
open (my $inMSA, "<", $MSA_File) or return ("GUIDANCE::readMSA_to_Hash: FAILED to open '$MSA_File' $!");
|
|
## 1.1. Read FASTA header and save it
|
|
my $fastaLine = <$inMSA>;
|
|
while (defined $fastaLine) {
|
|
chomp $fastaLine;
|
|
my $header = substr($fastaLine,1);
|
|
## 1.2. Read seq until next header
|
|
$fastaLine = <$inMSA>;
|
|
my $seq = "";
|
|
while ((defined $fastaLine) and
|
|
(substr($fastaLine,0,1) ne ">" )) {
|
|
chomp $fastaLine;
|
|
$seq .= $fastaLine;
|
|
$fastaLine = <$inMSA>;
|
|
}
|
|
## 2.1 update hash
|
|
my @seq_arr=split(//,$seq);
|
|
if (exists $MSA_Hash{$header})
|
|
{
|
|
print "[WARN] The sequence name '$header' appear more than once in the MSA '$MSA_File', Consider only the first instance....\n";
|
|
}
|
|
else
|
|
{
|
|
$MSA_Hash{$header}=[@seq_arr];
|
|
push (@MSA_Order,$header);
|
|
}
|
|
if ($MSA_Length==0)
|
|
{
|
|
$MSA_Length=length ($seq);
|
|
}
|
|
}
|
|
# close file
|
|
close ($inMSA);
|
|
return ("OK",\%MSA_Hash,$MSA_Length,\@MSA_Order);
|
|
}
|
|
|
|
sub removePos_noBioPerl {
|
|
# Get MSAhash where each key is seqID and each value is reff to array with seq;
|
|
# Give a position, rumove the specific char from all sequences
|
|
my $PosToRemove=$_[0];
|
|
my $MSA_Hash_ref=$_[1];
|
|
foreach my $Seq_ID (keys %{$MSA_Hash_ref})
|
|
{
|
|
$MSA_Hash_ref->{$Seq_ID}[$PosToRemove]="";
|
|
}
|
|
return $MSA_Hash_ref;
|
|
}
|
|
|
|
sub codes2name_scoresFile
|
|
{
|
|
my $Score_File=shift;
|
|
my $Codes_File=shift;
|
|
my $Out=shift;
|
|
|
|
my %Codes=();
|
|
open (CODES,$Codes_File) or return ("codes2name_scoresFile:Can't open the Codes file: '$Codes_File' $!");
|
|
while (my $line=<CODES>)
|
|
{
|
|
chomp $line;
|
|
my ($Seq_name,$Code)=split("\t",$line);
|
|
$Codes{$Code}=$Seq_name;
|
|
}
|
|
close (CODES);
|
|
open (OUT,">$Out") or return ("codes2name_scoresFile: Can't open out file: '$Out' $!");
|
|
open (SCORES,$Score_File) or return ("codes2name_scoresFile: Can't open Scores file: '$Score_File' $!");
|
|
my $line=<SCORES>; #Header;
|
|
print OUT "SEQUENCE_NAME\tSEQUENCE_SCORE\n";
|
|
while ($line=<SCORES>)
|
|
{
|
|
my $code="";
|
|
my $score="";
|
|
if ($line=~/([0-9]+)\s+([0-9.]+)/){
|
|
$code=$1;
|
|
$score=$2;
|
|
}
|
|
if ($Codes{$code})
|
|
{
|
|
print OUT "$Codes{$code}\t$score\n";
|
|
}
|
|
else
|
|
{
|
|
print OUT "$line";
|
|
}
|
|
}
|
|
close (OUT);
|
|
close (SCORES);
|
|
|
|
return "OK";
|
|
}
|
|
|
|
sub Convert_to_Codons_Numbering
|
|
{
|
|
my $score_file=shift;
|
|
my $score_codons_file=shift;
|
|
|
|
open (IN,$score_file)|| return ("Can't open the input file: '$score_file' $!\n");
|
|
open (OUT,">$score_codons_file") || return ("Can't open the output file: '$score_codons_file' $!\n");
|
|
while (my $line=<IN>)
|
|
{
|
|
chomp($line);
|
|
$line=trim($line);
|
|
my @line=split(/\s+/,$line);
|
|
# my $array_size=scalar(@line);
|
|
if ($line[0]=~/[0-9]+/)
|
|
{
|
|
$array_size=scalar(@line)-1;
|
|
print OUT $line[0]*3-2,"\t",join("\t",@line[1..$array_size]),"\n";
|
|
print OUT $line[0]*3-1,"\t",join("\t",@line[1..$array_size]),"\n";
|
|
print OUT $line[0]*3,"\t",join("\t",@line[1..$array_size]),"\n";
|
|
}
|
|
else
|
|
{
|
|
print OUT join("\t",@line),"\n";
|
|
}
|
|
}
|
|
close (IN);
|
|
close (OUT);
|
|
return "OK";
|
|
}
|
|
|
|
sub trim($)
|
|
{
|
|
my $string = shift;
|
|
$string =~ s/^[\s\t]+//;
|
|
$string =~ s/[\s\t]+$//;
|
|
return $string;
|
|
}
|
|
sub MSA_row_Num_to_Seq_Name
|
|
{
|
|
my $MSA=shift;
|
|
my $Codes_Name_File=shift;
|
|
my %MSA_Row_Seq_Name=();
|
|
my %Code_Names=();
|
|
open (CODES,$Codes_Name_File) or return ("Guidance::MSA_row_Num_to_Seq_Name Can't open the Codes file: '$Codes_Name_File' $!");
|
|
while (my $line=<CODES>)
|
|
{
|
|
chomp $line;
|
|
my ($Seq_name,$Code)=split("\t",$line);
|
|
$Code_Names{$Code}=$Seq_name;
|
|
}
|
|
close (CODES);
|
|
# Read MSA
|
|
my $MSA_Depth;
|
|
my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $MSA) or return ("MSA_row_Num_to_Seq_Name can't open '$MSA': $!");
|
|
my $aln = $in->next_aln;
|
|
$aln->verbose(1);
|
|
# Otherwise, bioperl adds sequence start/stop values
|
|
$aln->set_displayname_flat();
|
|
@ans=MSA_parser::check_msa_licit_and_size($MSA,"fasta","no");
|
|
if ($ans[0] eq "OK"){$MSA_Depth=$ans[1];}
|
|
else {return "MSA_row_Num_to_Seq_Name: ".join (" ",@ans);}
|
|
for(my $i=1;$i<=$MSA_Depth;$i++){
|
|
my $seq = $aln->get_seq_by_pos($i);
|
|
my $Seq_Name=$Code_Names{$seq->id};
|
|
$MSA_row_Num_to_Seq_Name{$i}=$Seq_Name;
|
|
print "ROW $i\t$Seq_Name\n";
|
|
}
|
|
return ("OK",\%MSA_row_Num_to_Seq_Name);
|
|
}
|
|
sub codes2name_scoresFile_NEW
|
|
# The Scores file
|
|
{
|
|
my $Score_File=shift;
|
|
my $Codes_File=shift;
|
|
my $MSA_File=shift;
|
|
my $Out=shift;
|
|
|
|
my %MSA_row_Num_to_Seq_Name=();
|
|
my %Code_Names=();
|
|
|
|
# Read codes
|
|
open (CODES,$Codes_File) or return ("Guidance::codes2name_scoresFile_NEW Can't open the Codes file: '$Codes_File' $!");
|
|
while (my $line=<CODES>)
|
|
{
|
|
chomp $line;
|
|
my ($Seq_name,$Code)=split("\t",$line);
|
|
$Code_Names{$Code}=$Seq_name;
|
|
}
|
|
close (CODES);
|
|
# Read MSA to see which seq in which row and assign the correct code name
|
|
my $MSA_Depth;
|
|
my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $MSA_File) or return ("Guidance::codes2name_scoresFile_NEW can't open $MSA_File: $!");
|
|
my $aln = $in->next_aln;
|
|
$aln->verbose(1);
|
|
# Otherwise, bioperl adds sequence start/stop values
|
|
$aln->set_displayname_flat();
|
|
@ans=MSA_parser::check_msa_licit_and_size($MSA_File,"fasta","no");
|
|
if ($ans[0] eq "OK"){$MSA_Depth=$ans[1];}
|
|
else {return ": ".join (" ",@ans);}
|
|
for(my $i=1;$i<=$MSA_Depth;$i++){
|
|
my $seq = $aln->get_seq_by_pos($i);
|
|
my $Seq_Name=$Code_Names{$seq->id};
|
|
$MSA_row_Num_to_Seq_Name{$i}=$Seq_Name;
|
|
# print "ROW $i\t$Seq_Name\n";
|
|
}
|
|
#Add names to score file
|
|
open (OUT,">$Out") or return ("Guidance::codes2name_scoresFile_NEW: Can't open out file: '$Out' $!");
|
|
open (SCORES,$Score_File) or return ("Guidance::codes2name_scoresFile_NEW: Can't open Scores file: '$Score_File' $!");
|
|
my $line=<SCORES>; #Header;
|
|
print OUT "SEQUENCE_NAME\tSEQUENCE_SCORE\n";
|
|
while ($line=<SCORES>)
|
|
{
|
|
my $code="";
|
|
my $score="";
|
|
my $MSA_row_Num="";
|
|
if ($line=~/([0-9]+)\s+([0-9.]+)/){
|
|
$MSA_row_Num=$1;
|
|
$score=$2;
|
|
}
|
|
if (defined $MSA_row_Num_to_Seq_Name{$MSA_row_Num})
|
|
{
|
|
print OUT "$MSA_row_Num_to_Seq_Name{$MSA_row_Num}\t$score\n";
|
|
}
|
|
else
|
|
{
|
|
print OUT "$line";
|
|
}
|
|
}
|
|
close (OUT);
|
|
close (SCORES);
|
|
return "OK";
|
|
}
|
|
|
|
# JALVIEW OUTPUTS
|
|
sub make_Jalview_AnnotationGraph
|
|
{
|
|
my $Jalview_AnnotFile=shift; # OUT FILE
|
|
my $Data_File=shift; # file ordered by the X's, first field must be X
|
|
my $Y_label=shift; # The Y label
|
|
my $Y_data_Col=1; # The Col of Y data
|
|
my $last_x = 0;
|
|
open (OUT,">$Jalview_AnnotFile") || return ("Can't open outAnnotationsFile '$Jalview_AnnotFile': $!");
|
|
print OUT "JALVIEW_ANNOTATION\n";
|
|
print OUT "BAR_GRAPH\t$Y_label\t";
|
|
|
|
open (DATA_FILE,$Data_File) || die ("make_Jalview_AnnotationGraph: Can't open data file '$Data_File' $!");
|
|
my $line=<DATA_FILE>; # header
|
|
while ($line=<DATA_FILE>)
|
|
{
|
|
chomp ($line);
|
|
my @data=split(",",$line);
|
|
if (($data[$Y_data_Col] ne "nan") and (($data[0]-$last_x)==1))
|
|
{
|
|
print OUT "$data[$Y_data_Col],$data[$Y_data_Col],$data[$Y_data_Col]|";
|
|
$last_x=$data[0];
|
|
}
|
|
elsif ($data[1] ne "nan")
|
|
{
|
|
while ($data[0]-$last_x!=1)
|
|
{
|
|
print OUT "0,0,0|";
|
|
$last_x++;
|
|
}
|
|
print OUT "$data[$Y_data_Col],$data[$Y_data_Col],$data[$Y_data_Col]|";
|
|
$last_x=$data[0];
|
|
}
|
|
elsif ($data[$Y_data_Col] eq "nan")
|
|
{
|
|
print OUT "$data[$Y_data_Col],$data[$Y_data_Col],$data[$Y_data_Col]|";
|
|
$last_x=$data[0];
|
|
}
|
|
}
|
|
print OUT "\n";
|
|
close (OUT);
|
|
}
|
|
sub make_Jalview_Color_MSA
|
|
{
|
|
# Data for MSA coloring
|
|
my $inMsaFile=shift; # MSA File
|
|
my $scoresFile=shift; # Scores File
|
|
my $outJalviewFeaturesFile=shift;
|
|
my $codesFile=shift; # OPTIONAL
|
|
# Global VARS
|
|
|
|
my $sequenceLengthForDisplay=400000;
|
|
|
|
# Print HTML start
|
|
|
|
open JALVIEW_FEATURES, ">$outJalviewFeaturesFile" or return ("Can't open outFeaturesFile '$outJalviewFeaturesFile': $!");
|
|
|
|
#td.Score9{ color: #FFFFFF; background: #A02560;
|
|
#td.Score8{ background: #F07DAB;
|
|
#td.Score7{ background: #FAC9DE;
|
|
#td.Score6{ background: #FCEDF4;
|
|
#td.Score5{ background: #FFFFFF;
|
|
#td.Score4{ background: #EAFFFF;
|
|
#td.Score3{ background: #D7FFFF;
|
|
#td.Score2{ background: #8CFFFF;
|
|
#td.Score1{ background: #10C8D1;
|
|
#td.ScoreNaN background: #C0C0C0;
|
|
print JALVIEW_FEATURES "Score1\t10C8D1\n";
|
|
print JALVIEW_FEATURES "Score2\t8CFFFF\n";
|
|
print JALVIEW_FEATURES "Score3\tD7FFFF\n";
|
|
print JALVIEW_FEATURES "Score4\tEAFFFF\n";
|
|
print JALVIEW_FEATURES "Score5\tFFFFFF\n";
|
|
print JALVIEW_FEATURES "Score6\tFCEDF4\n";
|
|
print JALVIEW_FEATURES "Score7\tFAC9DE\n";
|
|
print JALVIEW_FEATURES "Score8\tF07DAB\n";
|
|
print JALVIEW_FEATURES "Score9\tA02560\n";
|
|
print JALVIEW_FEATURES "ScoreNaN\tC0C0C0\n";
|
|
|
|
print JALVIEW_FEATURES "STARTGROUP\tGUIDANCE\n";
|
|
|
|
open SCORES, $scoresFile or return ("Can't open $scoresFile: $!");
|
|
my %scores;
|
|
my %Code_Names;
|
|
foreach (<SCORES>) {
|
|
next if (/^\#/);
|
|
s/^\s+//;
|
|
my ($col, $seq, $score) = split;
|
|
$scores{$seq}[$col] = $score;
|
|
}
|
|
# Read Codes
|
|
if ($codesFile ne "")
|
|
{
|
|
open (CODES,$codesFile) or return ("Guidance::printColoredAlignment Can't open the Codes file: '$codesFile' $!");
|
|
while (my $line=<CODES>)
|
|
{
|
|
chomp $line;
|
|
my ($Seq_name,$Code)=split("\t",$line);
|
|
$Code_Names{$Code}=$Seq_name;
|
|
}
|
|
close (CODES);
|
|
}
|
|
# Read MSA
|
|
my $in = Bio::AlignIO->new( '-format' => 'fasta' , -file => $inMsaFile) or die "Can't open $inMsaFile: $!";
|
|
my $aln = $in->next_aln;
|
|
$aln->verbose(1); #HAIM COMMNET
|
|
# Otherwise, bioperl adds sequence start/stop values
|
|
$aln->set_displayname_flat(); #HAIM COMMENT
|
|
@ans=MSA_parser::check_msa_licit_and_size($inMsaFile,"fasta","no"); #HAIM ADD
|
|
if ($ans[0] eq "OK"){$MSA_Depth=$ans[1];} #HAIM ADD
|
|
else {return "printColoredAlignment: ".join (" ",@ans);} #HAIM ADD
|
|
# Print HTML start
|
|
my %msaColors = ();
|
|
my %msaPrintColors = ();
|
|
my $lineCounter;
|
|
my @line;
|
|
my $key;
|
|
|
|
my @msaRightOrder=0;
|
|
my $msaRightOrderCounter=0;
|
|
|
|
my @colorstep = (); #color steps
|
|
$colorstep[0] = "Score1"; #Not confident
|
|
$colorstep[1] = "Score2";
|
|
$colorstep[2] = "Score3";
|
|
$colorstep[3] = "Score4";
|
|
$colorstep[4] = "Score5"; #average
|
|
$colorstep[5] = "Score6";
|
|
$colorstep[6] = "Score7";
|
|
$colorstep[7] = "Score8";
|
|
$colorstep[8] = "Score9"; #Most confident
|
|
$colorstep[9] = "Score9"; #Most confident (the score is exactly 1)
|
|
|
|
my @colorstep_code = (); #color steps
|
|
$colorstep_code[0] = "#10C8D1"; #Not confident
|
|
$colorstep_code[1] = "#8CFFFF";
|
|
$colorstep_code[2] = "#D7FFFF";
|
|
$colorstep_code[3] = "#EAFFFF";
|
|
$colorstep_code[4] = "#FFFFFF"; #average
|
|
$colorstep_code[5] = "#FCEDF4";
|
|
$colorstep_code[6] = "#FAC9DE";
|
|
$colorstep_code[7] = "#F07DAB";
|
|
$colorstep_code[8] = "#A02560"; #Most confident
|
|
|
|
# get Align max_seq_length
|
|
my $seq = $aln->get_seq_by_pos(1);
|
|
my $Align_width = $seq->length();
|
|
|
|
# counts how many times we print the whole section (relevants to sequences longer than the sequenceLengthForDisplay)
|
|
for(my $blockStart=1; $blockStart<$aln->length; $blockStart+=$sequenceLengthForDisplay) {
|
|
my $blockEnd = $blockStart+$sequenceLengthForDisplay;
|
|
$blockEnd = $aln->length if ($blockEnd > $aln->length);
|
|
|
|
# Iterate over sequences and print up to sequenceLengthForDisplay residues
|
|
for(my $i=1;$i<=$MSA_Depth;$i++){ #HAIM ADD
|
|
my $seq = $aln->get_seq_by_pos($i); #HAIM ADD
|
|
|
|
# NEW
|
|
my $Color_Class="";
|
|
# Print seq
|
|
my @seq = split //, $seq->subseq($blockStart, $blockEnd);;
|
|
my $gaps=0;
|
|
for(my $pos=0; $pos<@seq; $pos++) {
|
|
my $res = $seq[$pos];
|
|
my $Color_Class="";
|
|
my $prob="";
|
|
if ($res eq '-')
|
|
{
|
|
$gaps++;
|
|
}
|
|
elsif (($res ne '-') and ($scores{$i}[$pos+1] ne "nan")) {
|
|
$Color_Class = $colorstep[ int(9 * $scores{$i}[$pos+1]) ];
|
|
$prob=$scores{$i}[$pos+1];
|
|
if ($Color_Class ne "Score5")
|
|
{
|
|
# print JALVIEW_FEATURES "$prob\t$Code_Names{$seq->id}\t-1\t",$pos+1-$gaps,"\t",$pos+1-$gaps,"\t$Color_Class\t$prob\n";
|
|
print JALVIEW_FEATURES "$prob\tID_NOT_SPECIFIED\t",$i-1,"\t",$pos+1-$gaps,"\t",$pos+1-$gaps,"\t$Color_Class\t$prob\n"; # COLOR JALVIEW by SEQ NUM - not SEQ ID
|
|
}
|
|
}
|
|
elsif (($res ne '-') and ($scores{$i}[$pos+1] eq "nan")) {
|
|
$Color_Class="ScoreNaN";
|
|
# print JALVIEW_FEATURES "$prob\t$Code_Names{$seq->id}\t-1\t",$pos+1-$gaps,"\t",$pos+1-$gaps,"\t$Color_Class\t$prob\n";
|
|
print JALVIEW_FEATURES "NA\tID_NOT_SPECIFIED\t",$i-1,"\t",$pos+1-$gaps,"\t",$pos+1-$gaps,"\t$Color_Class\t$prob\n"; # COLOR JALVIEW by SEQ NUM - not SEQ ID
|
|
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
print JALVIEW_FEATURES "ENDGROUP\tGUIDANCE\n";
|
|
close (JALVIEW_FEATURES);
|
|
}
|
|
|
|
sub make_JalView_output
|
|
{
|
|
my $JalView_Applet_Page=shift;
|
|
my $WorkingDir=shift;
|
|
my $http=shift;
|
|
# Colored MSA
|
|
my $inMsa=shift; # MSA file with codes
|
|
my $inMsa_With_names=shift; # MSA file with codes
|
|
my $scores=shift; # hash of scores
|
|
my $outJalviewFeaturesFile=shift;
|
|
my $NamesCodeFile=shift;
|
|
|
|
# Annotation Graph
|
|
my $Jalview_AnnotFile=shift; # OUT FILE
|
|
my $Data_File=shift; # file ordered by the X's, first field must be X
|
|
my $Y_label=shift; # The Y label
|
|
|
|
open (JALVIEW,">$JalView_Applet_Page") || return "Can't open Jalview output page: '$JalView_Applet_Page' $!";
|
|
make_Jalview_Color_MSA("$WorkingDir$inMsa",$scores,$WorkingDir.$outJalviewFeaturesFile,$NamesCodeFile);
|
|
make_Jalview_AnnotationGraph($WorkingDir.$Jalview_AnnotFile,$Data_File,$Y_label);
|
|
print JALVIEW "<HTML>\n";
|
|
print JALVIEW "<applet CODEBASE=\"http://guidance.tau.ac.il/\"\n";
|
|
print JALVIEW "CODE=\"jalview.bin.JalviewLite\" width=100% height=100%\n";
|
|
print JALVIEW "ARCHIVE=\"jalviewApplet.jar\">\n";
|
|
print JALVIEW "<param name=\"file\" value=\"$http".$inMsa_With_names."\">\n";
|
|
print JALVIEW "<param name=\"features\" value=\"$http".$outJalviewFeaturesFile."\">\n";
|
|
print JALVIEW "<param name=\"annotations\" value=\"$http".$Jalview_AnnotFile."\">\n";
|
|
print JALVIEW "<param name=\"application_url\" value=\"http://www.jalview.org/services/launchApp\">\n";
|
|
print JALVIEW "<param name=\"showbutton\" VALUE=\"false\">\n";
|
|
print JALVIEW "<param name=\"showConservation\" VALUE=\"false\">\n";
|
|
print JALVIEW "<param name=\"showQuality\" VALUE=\"false\">\n";
|
|
print JALVIEW "<param name=\"showConsensus\" VALUE=\"false\">\n";
|
|
print JALVIEW "</APPLET>\n";
|
|
print JALVIEW "</HTML>\n";
|
|
|
|
}
|
|
|
|
sub validate_Seqs{
|
|
my $workingDir=shift;
|
|
my $in=shift;
|
|
my $SeqType=shift; # AminoAcids | Nucleotides | Codons
|
|
my $MSA=shift; # Yes,No
|
|
my $CodonTable=shift; # requiered if SeqType is Codons
|
|
open (IN,$workingDir.$in) || return ('sys_error', "Validate_Seqs:Can't open '$workingDir$in': $!");
|
|
my $seq="";
|
|
my $seq_name="";
|
|
my $seq_length=0;
|
|
my $Warnning="";
|
|
my $Counter=0;
|
|
open (OUT,">$workingDir"."$in".".FIXED") || return ('sys_error', "Validate_Seqs:Can't open '$workingDir$in".".FIXED': $!");
|
|
while (my $line=<IN>)
|
|
{
|
|
chomp ($line);
|
|
$line=~ s/^\s+|\s+$//g;
|
|
if (($line!~/>/)and ($line ne ""))
|
|
{
|
|
$seq=$seq.$line;
|
|
}
|
|
elsif ($line=~/^>(.*)/)
|
|
{
|
|
# validate prev seq
|
|
if (($seq eq "") and ($seq_name ne ""))
|
|
{
|
|
return ("The sequence named '$seq_name' is missing<br>");
|
|
}
|
|
if (($seq ne "") and ($seq_name ne ""))
|
|
{
|
|
# validate seq according to type
|
|
if ($MSA eq "Yes") # Make sure alignment length equal
|
|
{
|
|
$seq_length=length($seq) if ($seq_length==0); # initialize the first one
|
|
if (length($seq)!=$seq_length)
|
|
{
|
|
return ("The sequences of the provided MSA are not properly aligned, For example the seq: '$seq_name' does not aligned to all others. Please fix the alignment and run GUIDANCE again or provide GUIDANCE sequences only<br>");
|
|
}
|
|
if ($SeqType eq "Codons") # Make sure that in Codon Alignment there are no stop Codons and all seq are divided by 3
|
|
{
|
|
my $ans=validate_seq_in_CodonAlign($seq,$seq_name,$CodonTable);
|
|
return ($ans) if ($ans ne "OK");
|
|
}
|
|
}
|
|
if ($MSA eq "No")
|
|
{
|
|
if ($seq=~/([-]+)$/)
|
|
{
|
|
$seq=~s/$1//;
|
|
$Warnning="Gap characters (-) were removed from the end of the sequences";
|
|
}
|
|
if ($seq=~/[-]/)
|
|
{
|
|
return ("Seq: named '$seq_name' contain a gap character '-' which is illigal when sequences are submited to GUIDANCE. If you intended to submit an alignment, please upload the file using the 'Upload MSA file for evaluation' option<br>");
|
|
}
|
|
}
|
|
if ($seq=~s/\*$//)
|
|
{
|
|
$Warnning="Star character (*) were removed from the end of the sequences";
|
|
}
|
|
my @ans=validate_single_seq($seq_name,$seq,$SeqType);
|
|
if ($ans[0] eq "OK")
|
|
{
|
|
print OUT ">$seq_name\n"; # prev seq
|
|
print OUT "$seq\n"; # prev seq
|
|
$Counter++;
|
|
}
|
|
else
|
|
{
|
|
return ($ans[0]);
|
|
}
|
|
}
|
|
# Start new seq
|
|
if ($line=~/^>(.*)/)
|
|
{
|
|
$seq_name=$1;
|
|
$seq_name=~ s/^\s+|\s+$//g ;
|
|
if ($seq_name eq "")
|
|
{
|
|
my $Seq_Num=$Counter+1;
|
|
return ("Seq number $Seq_Num has no sequence name; Please fix and resubmit<br>");
|
|
}
|
|
else
|
|
{
|
|
# $seq_name=$1;
|
|
$seq="";
|
|
}
|
|
}
|
|
}
|
|
}
|
|
# validate last seq
|
|
if (($seq eq "") and ($seq_name ne ""))
|
|
{
|
|
return ("The sequence named '$seq_name' is missing<br>");
|
|
}
|
|
else
|
|
{
|
|
# validate seq according to type
|
|
if ($MSA eq "Yes") # Make sure alignment length equal
|
|
{
|
|
$seq_length=length($seq) if ($seq_length==0); # initialize the first one
|
|
if (length($seq)!=$seq_length)
|
|
{
|
|
return ("The sequences of the provided MSA are not properly aligned, For example the seq: '$seq_name' does not aligned to all others. Please fix the alignment and run GUIDANCE again or provide GUIDANCE sequences only<br>");
|
|
}
|
|
if ($SeqType eq "Codons") # Make sure that in Codon Alignment there are no stop Codons and all seq are divided by 3
|
|
{
|
|
my $ans=validate_seq_in_CodonAlign($seq,$seq_name,$CodonTable);
|
|
return ($ans) if ($ans ne "OK");
|
|
}
|
|
}
|
|
if ($MSA eq "No")
|
|
{
|
|
if ($seq=~/([-]+)$/)
|
|
{
|
|
$seq=~s/$1//;
|
|
$Warnning="Gap characters (-) were removed from the end of the sequences";
|
|
}
|
|
if ($seq=~/[-]/)
|
|
{
|
|
return ("Seq: named '$seq_name' contain a gap character '-' which is illigal when sequences are submited to GUIDANCE. If you intended to submit an alignment, please upload the file using the 'Upload MSA file for evaluation' option<br>");
|
|
}
|
|
}
|
|
my @ans=validate_single_seq($seq_name,$seq,$SeqType);
|
|
if ($ans[0] eq "OK")
|
|
{
|
|
print OUT ">$seq_name\n"; # prev seq
|
|
print OUT "$seq\n"; # prev seq
|
|
$Counter++;
|
|
}
|
|
else
|
|
{
|
|
return ($ans[0]);
|
|
}
|
|
}
|
|
#if ($Counter<4)
|
|
#{
|
|
# return ("Only $Counter sequences were provided, however at least 4 sequences are requiered");
|
|
#}
|
|
close (OUT);
|
|
close (IN);
|
|
return ("OK",$Warnning,$in.".FIXED",$Counter);
|
|
}
|
|
|
|
sub validate_single_seq
|
|
{
|
|
my $Seq_Name=shift;
|
|
my $seq=shift;
|
|
my $seq_type=shift;
|
|
if (($seq!~/[ABRNDCQEGHILKMFPSTWYVXZabrndcqeghilkmfpstwyvxz]+/) and ($seq_type eq "AminoAcids"))
|
|
{
|
|
return ("Seq: '$Seq_Name' is empty<br>");
|
|
}
|
|
elsif (($seq!~/[ACTGUNactgun]+/) and ($seq_type ne "AminoAcids"))
|
|
{
|
|
return ("Seq: '$Seq_Name' is empty<br>");
|
|
}
|
|
if (($seq=~/([^ABRNDCQEGHILKMFPSTWYVXZabrndcqeghilkmfpstwyvxz-])/) and ($seq_type eq "AminoAcids"))#Maybe allow: _*-?
|
|
{
|
|
return ("Seq: '$Seq_Name' contained the character: '$1' which is not a standard Amino Acid<br>");
|
|
}
|
|
#----------- Amit -------------
|
|
if (($seq=~/([^ACGTRYWSMKHBVDNUXacgtrywsmkhbvdnux-])/) and ($seq_type ne "AminoAcids"))#Maybe allow: _*-?
|
|
{
|
|
my $wrong_char=$1;
|
|
if (($seq=~ /[Uu]/) and ($seq_type eq "Nucleotides"))
|
|
{
|
|
return ("Currently GUIDANCE does not accept 'U's in nucleotide sequences, you may consider replacing the 'U's by 'T's and re-submit. <br> In addition, seq: '$Seq_Name' contained the character: '$wrong_char' which is not a standard Nucleotide <br>");
|
|
}
|
|
return ("Seq: '$Seq_Name' contained the character: '$wrong_char' which is not a standard Nucleotide<br>");
|
|
}
|
|
if (($seq=~ /[Uu]/) and ($seq_type eq "Nucleotides"))#Maybe allow: _*-?
|
|
{
|
|
return ("Currently GUIDANCE does not accept 'U's in nucleotide sequences, you may consider replacing the 'U's by 'T's and re-submit.<br>");
|
|
}
|
|
#----------- Amit -------------
|
|
return ("OK");
|
|
}
|
|
sub validate_seq_in_CodonAlign
|
|
# Make sure no stop Codons
|
|
{
|
|
my $DNASequence = shift;
|
|
my $DNASequenceName = shift;
|
|
my $codonTableIndex = shift;
|
|
my $stopCodon_Found="NO";
|
|
my $AASeq="";
|
|
my $codonTable_obj = Bio::Tools::CodonTable -> new ( -id => $codonTableIndex );
|
|
chomp ($DNASequence);
|
|
my $seq_length = length($DNASequence);
|
|
my $i =0;
|
|
return ("Sequence '$DNASequenceName' is not a valid coding sequence: the sequence is of length $seq_length which it is not divided by 3") if ($seq_length % 3>0);
|
|
while ($i<$seq_length-2)
|
|
{
|
|
$codon = substr($DNASequence, $i, 3);
|
|
if ($codon eq '---')
|
|
{
|
|
$AA = '-';
|
|
}
|
|
else
|
|
{
|
|
$AA = $codonTable_obj->translate($codon);
|
|
}
|
|
$AASeq.= $AA;
|
|
$i+=3;
|
|
}
|
|
if ($AASeq =~ m/\*/){
|
|
return ("Sequence: '$DNASequenceName' contains a stop codon, please remove all stop codons (from all sequences) and submit to GUIDANCE again");
|
|
}
|
|
return ("OK");
|
|
}
|
|
|
|
sub SampleFromUnifomDist
|
|
{
|
|
my $start=shift;
|
|
my $end=shift;
|
|
my $out_sample_file_name = shift;
|
|
my $sample_size = shift;
|
|
my @sample=();
|
|
my @op_vals = ();
|
|
my $range=$end-$start;
|
|
open(my $OUT, ">>", $out_sample_file_name) or die "SampeFromEmpiricDistribution:cannot open OUT: '$out_sample_file_name' $!";
|
|
my $j = 0;
|
|
my $RandNum = 0;
|
|
while ($j<$sample_size){
|
|
my $op_rand=rand($range)+$start;;
|
|
#my $op_rand = 2*rand() + 1;
|
|
push (@sample,$op_rand);
|
|
print $OUT "$op_rand\n";
|
|
$j++;
|
|
}
|
|
close ($OUT);
|
|
return (\@sample);
|
|
}
|
|
|
|
sub SampelFromEmpiricDistribution
|
|
{
|
|
# Input: distribution file name, output file name; times to run (for debigging)
|
|
my $distribution_file_name = shift;
|
|
my $out_sample_file_name = shift;
|
|
my $sample_size = shift;
|
|
my @sample=();
|
|
my @op_vals = ();
|
|
my @op_density = ();
|
|
my @op_prob = ();
|
|
my @op_CDF = ();
|
|
my $i = 0;
|
|
open(my $IN, "<", $distribution_file_name) or die "SampeFromEmpiricDistribution:cannot open IN: '$distribution_file_name' $!";
|
|
|
|
my $line=<$IN>;
|
|
while ($line=<$IN>) {
|
|
chomp $line;
|
|
my @temp = split(/\s+/,$line);
|
|
$op_vals[$i] = $temp[1];
|
|
$op_density[$i] = $temp[2];
|
|
$i++;
|
|
}
|
|
close ($IN);
|
|
|
|
for ($i=0;$i<scalar(@op_vals)-1;$i++){
|
|
$op_prob[$i] = $op_density[$i]*($op_vals[$i+1]-$op_vals[$i]);
|
|
$op_CDF[$i] = eval join('+', @op_prob[0..$i]);
|
|
}
|
|
|
|
open(my $OUT, ">>", $out_sample_file_name) or die "SampeFromEmpiricDistribution:cannot open OUT: '$out_sample_file_name' $!";
|
|
|
|
my $j = 0;
|
|
my $DistRandNum = 0;
|
|
my $RandNum = 0;
|
|
while ($j<$sample_size){
|
|
$RandNum = rand();
|
|
$i=0;
|
|
while ($RandNum>$op_CDF[$i]){
|
|
$i++;
|
|
}
|
|
$RandNum = rand();
|
|
$DistRandNum = $op_vals[$i] + $RandNum*($op_vals[$i+1]-$op_vals[$i]);
|
|
push (@sample,$DistRandNum);
|
|
print $OUT "$DistRandNum\n";
|
|
$j++;
|
|
}
|
|
close ($OUT);
|
|
return (\@sample);
|
|
}
|
|
|
|
#----------------------------------
|
|
sub reformat_trees_branch_length
|
|
{
|
|
|
|
my $inTree=shift;
|
|
my $outTree=shift;
|
|
#print "reformat_trees_branch_length:Command line arguments: \t$inTree\n\t$outTree\n";
|
|
|
|
# Tree objets
|
|
my $inputTree = new Bio::TreeIO(-file => "$inTree",
|
|
-format => "newick");
|
|
|
|
my $outputTree = new Bio::TreeIO(-file => ">$outTree",
|
|
-format => "newick");
|
|
my $tree = $inputTree->next_tree;
|
|
|
|
foreach my $node ($tree->get_nodes() ) {
|
|
my $len = $node->branch_length();
|
|
if(defined $len){
|
|
$node->branch_length(sprintf("%f",$len)); #
|
|
}
|
|
}
|
|
|
|
$outputTree->write_tree($tree);
|
|
|
|
#print "--- END --- \n";
|
|
}
|
|
|
|
#----------------------------------
|
|
1;
|