Katzlab dd76ab1d12 Added PTL2 Scripts
These are PTL2 files from Auden 2/9
2023-02-14 11:20:52 -05:00

862 lines
42 KiB
Perl

#!/usr/local/bin/perl
use strict;
use Storable;
use lib "/bioseq/bioSequence_scripts_and_constants/"; #"/db1/System/bioseq/scripts_for_servers";
use GENERAL_CONSTANTS;
use BIOSEQUENCE_FUNCTIONS;
use SELECTON_CONSTANTS;
use lib "/bioseq/Selecton/external_scripts";
use pipe_for_selecton;
# ---------------------------------------------------------------------------
# Script set-up: command-line arguments, file-level state and fixed paths.
# Everything declared here with "my" is file-scoped and is read (and in some
# cases written) by the subroutines further down.
# ---------------------------------------------------------------------------
# Command-line arguments, in order: the working directory, then two file names.
# The file names are appended directly to the working directory, so the
# directory is presumably passed with a trailing "/" (TODO confirm with the caller).
# Both files are loaded with Storable's retrieve() in readInput().
my $WorkingDir = shift;
my $runCalcInput = $WorkingDir.shift;
my $formInput = $WorkingDir.shift;
# hashes to retrieve the runs' input from the storable element
my %FORM = ();
my %run_data = ();
# VARS FROM INPUT FILE
# vars read from form:
my ($epsilonPrecision, $query_seq_name_to_run, $upload_TREE_file, $optimizeBL);
# General vars
my ($OutLogFile, $querySeqFoundinMSA, $tree_faq, $PdbPrefix, $method, $run_name, $PdbFileNameUnc, $was_Pdb_uploaded, $proc_comm, $estimated_run_time, $begin_Q_runtime);
# HTML related vars
my ($SysErrorDef, $ContactDef, $WWWdir, $ErrorDef, $OutputURL);
# files which were created by the cgi
my ($fileDna_aligned, $treeUpload, $OutHtmlFile, $cgi_log_file, $sequences_names_file, $fileName_amino_aligned, $upload_unaligned_file_dna, $upload_MSA_file_dna, $clustal_aligned_file, $pdb_data);
my @sequences_names = (); # This array will hold the names of the sequences, as it appears in the input file.
my $qsub_ans_file = $WorkingDir."qsub_ans.txt"; # The flag file which is read by the daemon process
my $finish_flag = $WorkingDir."END_OK"; # The file which denotes that the run was finished.
# Fills %run_data, %FORM, @sequences_names and most of the scalars declared above.
# It must run before the declarations below that use $run_name, $PdbFileNameUnc,
# $PdbPrefix or %FORM. Note that it also overwrites $WorkingDir with the value
# stored in the run data; $qsub_ans_file and $finish_flag above were already
# built from the command-line value.
&readInput();
my $http_path = GENERAL_CONSTANTS::SELECTON_URL;
### Sending e-mail from the cluster
my $smtp_server = GENERAL_CONSTANTS::SMTP_SERVER;
my $userName = GENERAL_CONSTANTS::ADMIN_USER_NAME;
my $userPass = GENERAL_CONSTANTS::ADMIN_PASSWORD;
# mailto: link embedded in the HTML output so that users can contact the administrators
my $mail = "mailto:".GENERAL_CONSTANTS::ADMIN_EMAIL."?subject=Selecton%20Run%20No:%20$run_name";
my $email_subject;
my $email_message;
my $email_system_return;
my $send_email_dir = GENERAL_CONSTANTS::SEND_EMAIL_DIR;
# VARS DEFINED IN THE SCRIPT:
# scripts or executables
my $biocluster_external_scripts_path = "/bioseq/Selecton/external_scripts/";
#my $selecton = "/bioseq/pupkoSVN/trunk/programs/selecton/selecton"; #"/d/bioinfo/users/adist/pupkoSVN/trunk/programs/selecton/selecton";
my $selecton = "/bioseq/Selecton/selecton.exe";
#my $selecton = $biocluster_external_scripts_path."srcSelecton/srcV2.2/selecton";
#my $mecSelecton = "/bioseq/pupkoSVN/trunk/programs/mec/mec";
my $mecSelecton = "/bioseq/Selecton/mec.exe";
#"/d/bioinfo/users/adist/pupkoSVN/trunk/programs/mec/mec"; # the exe of the kaks using the mec model is added by adid (29.1.07)
my $colorCoding = $biocluster_external_scripts_path . "colorCoding.v2.pl";
my $colorCodingLinear = $biocluster_external_scripts_path . "colorCodingLinear.pl";
my $statTest = "/cgi-bin/statTest.cgi"; # REMARK : SHOULD BE RE-WRITTEN AND RUN FROM THE CLUSTER
# vars
my $pdbUpload = $WorkingDir . $PdbFileNameUnc; #name for an uploaded PDB FILE
my $significance_test_faq = "/overview.html#meth5";
# files which will be created by this script
# (names without $WorkingDir are used as relative links in the HTML output and
# as arguments to commands that are run after "cd $WorkingDir")
my $rsml = "rasmol.txt";#"colors.txt";
my $areTherePositiveSites = $WorkingDir."areSitesPositive.txt";
my $colorsLinear = "colors.html"; #results color-coded onto the linear sequence
my $outputScoreFile="kaks.res";
my $selection4Site_file = $WorkingDir."selection4Site.txt";
my $kaks_file = $WorkingDir."kaks.res";
my $params = "globalResult.txt";
my $log = "kaks4site.log";
my $final_out = $PdbPrefix . ".gradesPE";
my $finalDNAFile_seq_names = "DNA.names.msa";
my $finalAminoFile_seq_names = "AMINO.names.msa";
my $tree_out = "kaks4site.tree";
my $rasmol_file = $PdbPrefix .".rsml";
my $statistics_file = SELECTON_CONSTANTS::STATISTICS_FILE;
#*****************************
### FGiJ path and related files
# FGiJ = "FirstGlance in Jmol" (see the link text printed in the results section)
my $FGiJ_path = "/fgij/";
my $FGiJ_pipe_pdb = $FORM{pdb_ID} . "_selecton" . $run_name . "_pipe.pdb";
my $FGiJ_link = $FGiJ_path . "fg.htm?mol=/results/" . $run_name . "/" . $FGiJ_pipe_pdb;
# file inspected by prepare_pipe() after create_pipe ran; a non-empty file is treated as a failure
my $pipe_error = "pipe.error";
#---------------------------------------------------------------------------
# C A L C U L A T I O N
#---------------------------------------------------------------------------
# Main flow, part 1: open the run log, mark the job as running, run the
# calculation and restore the original sequence names in the alignment files.
# The start time is kept so the total run time can be computed at the end.
$begin_Q_runtime = &printTime();
# LOG is a global handle; the subroutines below print to it as well.
# NOTE(review): the result of this open is not checked.
open LOG, ">".$OutLogFile;
print LOG &printTime();
print LOG "\nEntered selecton_run_calc.pl\nUpdating the HTML with running status\n";
# updating the HTML status to "running"
# (any return value other than the string "OK" is written to the log)
my $ans = &GENERAL_CONSTANTS::print_Q_status_in_html($OutHtmlFile, "Running", "no", $estimated_run_time);
print LOG $ans if ($ans ne "OK");
# runs the selecton executable and the colour-coding scripts; exits the script on failure
&run_calc();
### changing back the names in the DNA and AMINO files, so they will hold the original sequences names.
&return_seq_names_to_files($fileName_amino_aligned, $finalAminoFile_seq_names, \@sequences_names);
&return_seq_names_to_files($fileDna_aligned, $finalDNAFile_seq_names, \@sequences_names);
# Disabled: renaming of the numbered tree output back to the original sequence names.
#if a tree was created, we output it to the users, than first have to write to it original names
#if ((-e $WorkingDir.$tree_out) && !(-z $WorkingDir.$tree_out)){
# print LOG "renaming numbered tree output $tree_out to : no_tree.txt\n";
# my $cmd = "mv ".$WorkingDir.$tree_out." ".$WorkingDir."no_tree.txt";
# my $out = `$cmd`;
# print LOG "moving returned: $out\n";
# chmod 0744, $WorkingDir."no_tree.txt";
# print LOG "going to run change_tree_file_names\n";
# #&change_tree_file_names(\@sequences_names, $WorkingDir."no_tree.txt", $WorkingDir.$tree_out);
# print LOG "after change_tree_file_names\n";
#}
#------------
### create a pdb file with pipe in its header, for FGiJ to read
if ($was_Pdb_uploaded eq "yes"){
print LOG "Going to prepare pipe file\n";
&prepare_pipe(\@sequences_names);
}
else{
print LOG "Not creating pipe, since the value of \$was_Pdb_uploaded is: \"$was_Pdb_uploaded\"\n";
}
### PRINTING output FINAL NOTES AND LINKS
# Main flow, part 2: append the "FINISHED" section to the job's HTML page.
# The section holds the main result link, a note on whether positively
# selected sites were found (with an optional form for the statistical test)
# and links to all output files. The page is locked (flock 2 = exclusive)
# while it is written and unlocked (flock 8) before it is closed.
# NOTE(review): the result of this open is not checked.
open OUTPUT, ">>" .$OutHtmlFile;
flock OUTPUT,2;
print OUTPUT "\n<H1><center><a name=finish>Selecton calculation is <font color=\"red\">FINISHED </font></a></center></H1>\n";
print OUTPUT "<h3><i>Final Result:</i></h3>";
# main result link: the 3D viewer when a PDB structure was used, otherwise the linear colour-coded page
if ($was_Pdb_uploaded eq "yes") { #3D
print OUTPUT "\n<p><b><A HREF='".$FGiJ_link."' TARGET=_blank>Graphical display of Selecton results</b></A> with FirstGlance in Jmol<br></p>\n";
}
else {
print OUTPUT "\n<p><b><A HREF= $colorsLinear> View Color Coded Selecton Results </A></b></p>\n";
}
# check if there are P.S. sites. If yes - a button for statistical testing
# $areTherePositiveSites is passed to the colorCodingLinear script in run_calc();
# only its first line is inspected here. If the file cannot be opened, @pos is
# empty and the "no positively selected sites" branch is taken.
open POS, "<$areTherePositiveSites";
my @pos = <POS>;
chomp @pos;
close POS;
if ($pos[0] eq "yes"){
print OUTPUT "<font face=Verdana size=2>Positively selected sites found.</font><br>\n";
###### here is a link for statistical testing for the M8 and MEC models
if (($FORM{MODEL} eq "M8") || ($FORM{MODEL} eq "MEC")) {
# this inner $ans shadows the file-level $ans for the rest of the enclosing block
my $ans = &add_data_to_input_file;
# in case the data was not added - we don't create a submit button, we let the user know he can contact us
if ($ans eq "OK"){
# form that posts the run details to the statistical-test CGI
print OUTPUT "\n<FORM ENCTYPE=\"multipart/form-data\" ACTION=\"".$statTest."\" METHOD=\"POST\">\n";
print OUTPUT "<INPUT TYPE=\"submit\" VALUE=\"Test statistical significance of positive selection\"><br>\n";
print OUTPUT "<INPUT TYPE=hidden NAME=\"run_number\" VALUE=\"".$run_name ."\">\n";
print OUTPUT "<INPUT TYPE=hidden NAME=\"input_file\" VALUE=\"".$runCalcInput."\">\n";
print OUTPUT "<INPUT TYPE=hidden NAME=\"Q_log\" VALUE=\"".$OutLogFile."\">\n";
print OUTPUT "<INPUT TYPE=hidden NAME=\"receipent\" VALUE=\"".$FORM{email_address}."\">\n";
print OUTPUT "<font face=Verdana size=1><a href = \"".$significance_test_faq."\">This will run your data with a null model of evolution</a></font></FORM><br>\n";
}
else{
print OUTPUT "<font face=Verdana size=2 color=\"green\">Please <a href=\"$mail\">contact us</a> if you wish to run a statistical test for your results, and mention this number: $run_name</font><br>";
}
}
}
else {
print OUTPUT "<font face=Verdana size=2>No positively selected sites found in the protein.</font><br>\n";
}
print OUTPUT "<h3>Output Files:</h3>\n\n";
### print the links
if ($was_Pdb_uploaded eq "yes") { #if user ran selecton with a PDB struct.: Ka/Ks scores 2gether with color coding
print OUTPUT "<p><A HREF= $colorsLinear TARGET=colors_window> Codon Ka/Ks scores color-coded on the linear sequence</A></p>\n";
}
# the anchor opened here is closed in one of the two branches below
print OUTPUT "<p><b><A HREF= $outputScoreFile> Codon Ka/Ks scores (numerical values)";
# add the gapped output only in case there was gapped output (if the files are identical, the only difference will be in the line 'Displayed on sequence 1< including gaps>'
# (the gapped file counts as different when it is more than 15 bytes larger than the plain score file)
if ((-s $WorkingDir."kaks.res.gaps") - (-s $WorkingDir.$outputScoreFile) > 15){
print OUTPUT " - Reference sequence only</A></b></p>\n";
print OUTPUT "<p><img SRC=\"/New.gif\" BORDER=0 height=30 width=40><b><A HREF= \"kaks.res.gaps\"> Codon Ka/Ks scores (numerical values) - For each position in the MSA, including gaps</A></b><img SRC=\"/New.gif\" BORDER=0 height=30 width=40></p>\n";
}
else{
print OUTPUT "</A></b></p>\n";
}
if ( $upload_unaligned_file_dna ne "no"){ # print amino aln only if the user supplied a non-aligned file (then we run codon-align & produce an amino aln)
print OUTPUT "<p><A HREF= $finalAminoFile_seq_names TARGET=MSA_window> Amino Acid Multiple Sequence Alignment (in Fasta format)</A>\n";
}
print OUTPUT "<p><A HREF= $finalDNAFile_seq_names TARGET=codonMSA_window> Codon Multiple Sequence Alignment (in Fasta format)</A>\n";
# $upload_TREE_file holds the literal placeholder "NOT_GIVEN" when the user supplied no tree
if ($upload_TREE_file eq "NOT_GIVEN") { # no user tree supplied - NJ tree created
print OUTPUT "<p><A HREF= $tree_out TARGET=tree_window> Phylogenetic Tree</A>\n";
}
print OUTPUT "<p><A HREF = \"$params\">Likelihood and parameters of Selecton run</A>\n";
print OUTPUT "<p><A HREF = \"$log\">Log-file of Selecton run</A>\n";
if ($was_Pdb_uploaded eq "yes") { #if user ran selecton with a PDB struct.
print OUTPUT "<p><A HREF= $rsml TARGET=spt_window>RasMol coloring script source</A></p>\n";
print OUTPUT "<p><A HREF= $FGiJ_pipe_pdb TARGET=pipe_pdb>PDB file updated with Selecton results in its header</A></p>\n";
}
print OUTPUT "\n<br><br><p><center>Please <a href= \"$mail\">report any problem</a> in case of need.</center></p>\n";
flock OUTPUT,8;
close OUTPUT;
### write to the output that the job has finished
# The status page is rewritten in place: the line holding the page title
# ("Selecton Job Status Page", matched case-insensitively) is replaced by a
# "FINISHED" title plus a link to the results anchor; every other line is
# copied unchanged.
# The page is opened read/write and locked BEFORE it is emptied, and it is
# emptied only after its content was read. Re-opening it with ">" (as was done
# before) truncates the page before the lock is taken, and wipes it completely
# when the preceding read failed.
my @output = ();
if (open(OUTPUT, "+<", $OutHtmlFile)){
    flock OUTPUT,2;                 # exclusive lock
    @output = <OUTPUT>;
    seek OUTPUT, 0, 0;              # back to the start before rewriting
    truncate OUTPUT, 0;
    foreach my $line (@output){
        if ($line =~ /Selecton Job Status Page/i){ #finds the phrase "Selecton" job status page, case-insensitive
            print OUTPUT "<H1 align=center>Selecton Job Status Page - <font color='red'>FINISHED</font></h1>\n";
            print OUTPUT "<a href=#finish><H2 align=center>Go to the results</font></H2></a>\n";
        }
        else {
            print OUTPUT $line;
        }
    }
    flock OUTPUT,8;                 # unlock
    close OUTPUT;
}
else{
    # leave the page as it is rather than destroying the results that were just written
    print LOG "Could not open $OutHtmlFile for updating the job status to FINISHED: $!\n";
}
### stop the automatic reload
# Main flow, part 3: open up the permissions of the results, stop the page
# reload, record the run time, mail the user and close the log.
# gives "group" and "others" read+execute on everything in the working directory (run through tcsh)
system 'echo "(cd '.$WorkingDir.' ; chmod -R og=rx * )" | /bin/tcsh';
&stop_reload;
# reporting the statistics file on a successful ending
my $total_runtime = BIOSEQUENCE_FUNCTIONS::subtract_time_from_now($begin_Q_runtime);
# one line per run is appended to the shared statistics file, under an exclusive lock
# NOTE(review): the result of this open is not checked.
open STATISTICS, ">>".$statistics_file;
flock STATISTICS, 2;
print STATISTICS "$run_name total runTime: $total_runtime\n";
flock STATISTICS, 8;
close STATISTICS;
$email_subject = "Your Selecton results for run number $run_name are ready";
$email_message = "Selecton finished calculation. Please click on the following link to view the results:\n$WWWdir"."output.html\nPlease note: the results will be kept on the server for three months.";
# LOG is re-opened in append mode here (re-opening an open handle closes it first)
open LOG, ">>$OutLogFile";
print LOG "\nSending mail to user.\n";
GENERAL_CONSTANTS::send_mail("Selecton", $FORM{email_address}, $run_name, $email_subject, $email_message);
# remove a "core" file from the working directory if one exists
if (-e $WorkingDir."core"){
print LOG "remove core file from working directory\n";
unlink $WorkingDir."core";
}
print LOG "\nSelecton run completed successfully!";
print LOG "\n************** END OF LOG FILE *****************\n";
close LOG;
exit;
#----------------------------------------------------------------------------------------
# S U B R O U T I N E S
#----------------------------------------------------------------------------------------
# readInput()
# Loads the run parameters prepared by the CGI and reports to the daemon.
#
#   * retrieves the two Storable files $runCalcInput (-> %run_data) and
#     $formInput (-> %FORM);
#   * copies the run parameters from %run_data into the file-level variables;
#   * writes "OK" to the flag file $qsub_ans_file, which the daemon reads
#     ("NOT_OK" is written instead when the Storable files cannot be loaded);
#   * rebuilds @sequences_names from the sequence-names file, whose lines have
#     the form "<number> <name>".
#
# Takes no arguments; the return value is not meaningful.
# Dies when the Storable files cannot be loaded, and leaves through
# sys_error_exit() when the sequence-names file cannot be opened.
#
# The parser for the older plain-text input format, which was kept here as
# commented-out code, has been removed; only its "NOT_OK" signalling is retained.
sub readInput{
    # ------ load both Storable files ------
    my ($input_data, $FORM_data);
    my $loaded = eval {
        $input_data = retrieve($runCalcInput);
        $FORM_data  = retrieve($formInput);
        1;
    };
    my $load_error = $@;
    unless ($loaded && defined $input_data && defined $FORM_data){
        # tell the daemon that the input could not be read, then fail as before
        if (open(my $ans_fh, ">", $qsub_ans_file)){
            print $ans_fh "NOT_OK";
            close $ans_fh;
            chmod 0755, $qsub_ans_file;
        }
        $load_error = "retrieve returned no data" if ($load_error eq "");
        die "readInput: cannot load the run input ($runCalcInput, $formInput): $load_error\n";
    }
    %run_data = %$input_data;
    %FORM = %$FORM_data;

    # ------ copy the run parameters into the file-level variables ------
    $run_name                  = $run_data{run_name};
    $WorkingDir                = $run_data{WorkingDir};
    $WWWdir                    = $run_data{WWWdir};
    $epsilonPrecision          = $run_data{epsilonPrecision};
    $query_seq_name_to_run     = $run_data{query_seq_name_to_run};
    $optimizeBL                = $run_data{optimizeBL};
    $querySeqFoundinMSA        = $run_data{querySeqFoundinMSA};
    $tree_faq                  = $run_data{tree_faq};
    $method                    = $run_data{method};
    $SysErrorDef               = $run_data{SysErrorDef};
    $ErrorDef                  = $run_data{ErrorDef};
    $ContactDef                = $run_data{ContactDef};
    $fileDna_aligned           = $run_data{fileDna_aligned};
    $treeUpload                = $run_data{treeUpload};
    $OutHtmlFile               = $run_data{OutHtmlFile};
    $cgi_log_file              = $run_data{cgi_log_file};
    $OutLogFile                = $run_data{OutLogFile};
    $fileName_amino_aligned    = $run_data{fileName_amino_aligned};
    $OutputURL                 = $run_data{OutputURL};
    $estimated_run_time        = $run_data{estimated_run_time};
    $sequences_names_file      = $run_data{sequences_names_file};
    $was_Pdb_uploaded          = $run_data{was_Pdb_uploaded};

    # These five used to be written as
    #     (VALUE eq "NOT_GIVEN") ? $x = "" : $x = VALUE;
    # The conditional operator yields an lvalue and binds tighter than "=", so
    # that statement means ((COND) ? ($x = "") : $x) = VALUE and always stored
    # VALUE - the "NOT_GIVEN" placeholder was never replaced by "".
    # The rest of the script depends on that (e.g. $upload_TREE_file is compared
    # with "NOT_GIVEN"), so the values are deliberately copied unchanged.
    $upload_TREE_file          = $run_data{upload_TREE_file};
    $PdbFileNameUnc            = $run_data{PdbFileNameUnc};
    $PdbPrefix                 = $run_data{PdbPrefix};
    $pdb_data                  = $run_data{pdb_data};
    $clustal_aligned_file      = $run_data{clustal_aligned_file};

    $upload_MSA_file_dna       = $run_data{upload_MSA_file_dna};
    $upload_unaligned_file_dna = $run_data{upload_unaligned_file_dna};

    # ------ if reading was OK, we report it, for the daemon ------
    if (open(my $ans_fh, ">", $qsub_ans_file)){
        print $ans_fh "OK";
        close $ans_fh;
        chmod 0755, $qsub_ans_file;
    }
    else{
        # the run log is not open yet at this point, so report on STDERR
        warn "readInput: cannot write the flag file $qsub_ans_file: $!\n";
    }

    # ------ recreating the sequences array of the DNA sequences names ------
    my $names_fh;
    unless (open($names_fh, "<", $WorkingDir.$sequences_names_file)){
        &sys_error_exit("cannot open the file ".$WorkingDir.$sequences_names_file." for reading $!\n");
    }
    while (my $entry = <$names_fh>){
        chomp $entry;
        # index 0 is reset to an empty name on every line, as before
        $sequences_names[0] = "";
        if ($entry =~ /(\d+) (.+)/){
            $sequences_names[$1] = $2;
        }
    }
    close $names_fh;
}
#########################################################################################
# CALCULATION AND POST-PROCESSING
# run_calc()
# Runs the whole calculation: the selecton executable (through
# run_kaks4site), then the colour-coding scripts that turn its output into
# the result files. The helper scripts are started from inside the working
# directory by piping the commands to /bin/tcsh.
# Takes no arguments. Leaves the script through sys_error_exit() when the
# selecton output files are missing or empty, and through
# read_colors_error_and_exit() when a colour-coding script left an "error"
# file in the working directory.
sub run_calc {
    my $error_flag = $WorkingDir."error";   # checked after each colour-coding script

    &run_kaks4site();

    # Both output files must exist and contain data before they are processed
    unless ((-s $selection4Site_file) && (-s $kaks_file)){
        &sys_error_exit("run_calc: The file $selection4Site_file or $kaks_file was not created (or contains no data) during the run of selecton.v2.2. Cannot process outputs");
    }

    # 3D colour coding - only when the user ran selecton with a PDB structure
    if ($was_Pdb_uploaded eq "yes"){
        print LOG "run_calc : touch $final_out\n";
        $proc_comm = join(" ",
            "perl", $colorCoding, $method, "\'$query_seq_name_to_run\'", $WorkingDir,
            $selection4Site_file, $clustal_aligned_file, $pdb_data, $kaks_file,
            $final_out, $PdbFileNameUnc, $fileName_amino_aligned, $rsml, $params);
        print LOG "run_calc: running $proc_comm\n";
        # create the grades file up front and make it readable, then run the script
        system(sprintf('echo "(cd %s;touch %s; chmod oug+rx %s)" | /bin/tcsh', $WorkingDir, $final_out, $final_out));
        system(sprintf('echo "(cd %s; %s)" | /bin/tcsh', $WorkingDir, $proc_comm));
        # check if the script $colorCoding found an error
        &read_colors_error_and_exit() if (-e $error_flag);
    }

    # Linear colour coding - always produced
    $proc_comm = join(" ",
        "perl", $colorCodingLinear, $run_name, $WorkingDir, $colorsLinear,
        $selection4Site_file, $areTherePositiveSites);
    print LOG "\nrun_calc: running $proc_comm \n";
    system(sprintf('echo "(cd %s; %s)" | /bin/tcsh', $WorkingDir, $proc_comm));
    # check if the script $colorCodingLinear found an error
    &read_colors_error_and_exit() if (-e $error_flag);
}
######################################################################################
# run kaks4site.exe
# run_kaks4site()
# Builds the command line of the selecton executable (or of the MEC
# executable when the MEC model was chosen) from the form and run parameters,
# runs it inside the working directory and then scans the log it produced for
# known error messages.
# Takes no arguments.
#   * A "USER ERROR" in the log only adds a warning to the HTML page.
#   * Tree-file problems, illegal nucleotide characters and over-long lines are
#     reported through print_to_output_and_exit().
#   * A log file that cannot be opened is reported through sys_error_exit().
sub run_kaks4site {
    ###### should add verification that two refuting arguments aren't given here...
    # command-line switch for each empirical matrix (MEC model only);
    # if no -z is given mecSelecton will run by default with JTT
    my %matrix_switch = (
        "JTT"     => " -z 0",
        "WAG"     => " -z 1",
        "mtREV24" => " -z 2",
        "cpREV45" => " -z 3",
    );
    # extra switches of the non-default models
    my %model_switch = (
        "M7"  => " -p1 -Fp",   # beta, no additional omega1. prob(beta) set to 1
        "M8a" => " -w1 -Fw",   # beta + w = 1: do not optimize omega (omega set to 1)
        "M5"  => " -dg",
    );

    # ----- executable and input file -----
    my $selecton_comm;
    if ($FORM{MODEL} eq "MEC"){
        # the MEC model has its own executable and takes no -e switch
        $selecton_comm = join("", $mecSelecton, " -i ", $WorkingDir, $fileDna_aligned);
        if (exists $matrix_switch{$FORM{EMPIRICAL_MATRIX}}){
            $selecton_comm .= $matrix_switch{$FORM{EMPIRICAL_MATRIX}};
        }
    }
    else{
        # default run: bayesian, beta+w>1 (M8), w=8, ref=1st seq, std nuc.code, NJ tree
        $selecton_comm = join("", $selecton, " -i ", $WorkingDir, $fileDna_aligned, " -e", $epsilonPrecision);
    }

    # ----- optional switches, in the order the executable is given them -----
    $selecton_comm .= " -q \'$query_seq_name_to_run\'" if ($querySeqFoundinMSA eq "yes");
    $selecton_comm .= $model_switch{$FORM{MODEL}}      if (exists $model_switch{$FORM{MODEL}});
    $selecton_comm .= " -n ".$FORM{CATEGORIES}         if ($FORM{CATEGORIES} ne "");
    $selecton_comm .= " -u \'$treeUpload\'"            if ($upload_TREE_file ne "NOT_GIVEN");
    $selecton_comm .= " -bn"                           if ($optimizeBL eq "n");
    $selecton_comm .= " -g ".$FORM{GENCODE}            if ($FORM{GENCODE} != 0);

    print LOG "\nrun_kaks4site: running $selecton_comm\n";
    print LOG "run_kaks4site: SeqName = ***$query_seq_name_to_run***\n";
    system "cd $WorkingDir; $selecton_comm; chmod ogu+rx *"; # The program can't run from rsh

    # ----- check for user errors in kaks4site.log -----
    my $kaksLogFile = $WorkingDir.$log;
    open OUTPUT, ">>$OutHtmlFile";
    unless (open (LOGFILE,"$kaksLogFile")){
        close (LOGFILE);
        &sys_error_exit("Error in run_kaks4site, $kaksLogFile does not exist");
    }
    while (my $line = <LOGFILE>){
        next unless ($line =~ /\S+/);   # skip blank lines

        my ($first_word) = split(/\s/, $line);
        if ($first_word eq "USER"){
            $line =~ s/USER ERROR://; ## query sequence not found
            # The rest of the log is consumed here, so the checks below are
            # applied to the last line that was read.
            while (my $following = <LOGFILE>){
                $line = $following;
            }
            print OUTPUT "\n<p><ul><li><font color='red'>Warning:</b></font> The query sequence name \'$FORM{msa_SEQNAME}\' is not found in the <A HREF=$fileDna_aligned TARGET=MSA_window>MSA file</A>.<br>The calculation continues. The first sequence in MSA is used as a query.</li></ul></p>\n";
            close OUTPUT;
            print LOG "\nrun_kaks4site: query sequence not found. Calculation continues with 1st sequence in MSA.\n";
        }

        if (($line =~ /found in the tree file but not found in the sequence file/) || ($line =~ /Error reading tree file/)){
            # mismatch between MSA names and tree names: the message spans this line and the next two
            my $next_1 = <LOGFILE>;
            my $next_2 = <LOGFILE>;
            my $err = $line.$next_1." ".$next_2;
            close (LOGFILE);
            &print_to_output_and_exit("Error in tree file:<br>$err Please check that all the names in the sequence file are identical to all the names in the tree.", "run_kaks4site: $err");
        }
        elsif ($line =~ /Bad format in tree file/){
            close (LOGFILE);
            &print_to_output_and_exit("<b>Bad format in tree file.</b><br>Please correct your tree file according to <a href=\"$tree_faq\">Selecton accepted format</a> and re-submit your query.", "run_kaks4site: $line");
        }
        elsif ($line =~ /The nucleotide sequences contained the character: (.*)/){
            my $illegal = $1;
            close (LOGFILE);
            &print_to_output_and_exit("<b>The nucleotide sequences file contained an illegal character: $illegal. Only the following characters are accpted: A,C,G,T,-. Please correct your file and re-submit it to Selecton.","run_kaks4site: $line");
        }
        elsif ($line =~ /Unable to read file. It is required that each line is no longer than/){
            close (LOGFILE);
            &print_to_output_and_exit("<b>Selecton does not accept DNA sequences which are longer than ".GENERAL_CONSTANTS::SELECTON_MAX_NUCLEOTIDE." nucleotides.</b>", "run_kaks4site: $line");
        }
    }
    close OUTPUT;
    close (LOGFILE);
}
######################################################################
# creating new files to hold DNASeqNames in DNA file and AMINO file
# return_seq_names_to_files($current_file, $new_file, $ref_seq_name_arr)
# Copies the file $current_file to $new_file (both inside $WorkingDir),
# replacing every header line of the form ">NUMBER" by ">NAME", where NAME is
# element NUMBER of the array referenced by $ref_seq_name_arr. All other lines
# are copied unchanged.
#
#   $current_file      name of the input file, relative to $WorkingDir
#   $new_file          name of the file to create, relative to $WorkingDir
#   $ref_seq_name_arr  reference to the array of original sequence names
#
# When a number has no (or an empty) name in the array, the number itself is
# kept as the header instead of writing an empty ">" line, so the sequence
# stays identifiable.
# Failures are only written to LOG; the sub never aborts the run and its
# return value is not meaningful.
sub return_seq_names_to_files{
    my $current_file = shift;
    my $new_file = shift;
    my $ref_seq_name_arr = shift; # reference to the sequence names array

    my $source_path = $WorkingDir.$current_file;
    my $target_path = $WorkingDir.$new_file;

    my $in_fh;
    unless (open($in_fh, "<", $source_path)){
        print LOG "could not open file $source_path for reading. Names of files will be displayed as numbers.\n";
        return;
    }
    my $out_fh;
    unless (open($out_fh, ">", $target_path)){
        print LOG "could not open file $target_path for writing. Names of files will be displayed as numbers.\n";
        close $in_fh;
        return;
    }

    while (my $line = <$in_fh>){
        if ($line =~ />(\d+)/){
            my $seq_number = $1;
            my $seq_name = $ref_seq_name_arr->[$seq_number];
            # unknown sequence: keep the number rather than an empty header
            $seq_name = $seq_number unless (defined $seq_name && $seq_name ne "");
            print $out_fh ">".$seq_name."\n";
        }
        else{
            print $out_fh $line;
        }
    }

    # buffered write errors only show up when the handle is closed
    unless (close $out_fh){
        print LOG "could not finish writing file $target_path: $!\n";
    }
    close $in_fh;
}
######################################################################################
# change_tree_file_names($ref_sequences_names, $input_tree, $output_tree)
# Rewrites a tree file in which the sequences are identified by numbers, so
# that each number is replaced by the matching entry of the array referenced
# by $ref_sequences_names. Branch lengths that are equal to zero are replaced
# by 0.000000001 on the way.
#
#   $ref_sequences_names  reference to the array of original sequence names
#   $input_tree           path of the tree file to read
#   $output_tree          path of the tree file to write (chmod 0755 afterwards)
#
# Only the FIRST line of $input_tree is read.
# If the input cannot be opened the sub logs this and returns without writing
# anything; if the output cannot be opened it leaves through sys_error_exit().
# NOTE(review): in the visible part of this file the only call to this sub is
# commented out.
sub change_tree_file_names{
my $ref_sequences_names = shift;
my $input_tree = shift;
my $output_tree = shift;
# NOTE(review): $err is declared but never used in this sub.
my ($tree, $err);
print LOG "change_tree_file_names : going to change numbers from tree $input_tree to names in file $output_tree\n";
unless (open TREE, $input_tree){
print LOG "change_tree_file_names : could not open the file $input_tree for reading. the tree file will be presented with numbers\n";
return;
}
#check validity of input tree file
# (scalar read: only the first line of the file is taken)
$tree = <TREE>;
close TREE;
# Step 1: cut the tree text at every "(". The "(" itself is removed by split
# and is put back below in front of every non-empty piece.
my @tree_arr = split(/\(/, $tree);
my @sub_tree = ();
my @temp_arr;
my $sub_counter = 0;
# building the array @sub_tree, so that each cell will hold maximum one sequence name
for(my $i=0; $i<@tree_arr; $i++){
if ($tree_arr[$i] ne ""){
$tree_arr[$i] = "(".$tree_arr[$i];
}
# a piece that has a comma followed by more text is cut further at the commas;
# every resulting part gets a trailing ","
if ($tree_arr[$i] =~ m/.*,.+/){
@temp_arr = split(/,/, $tree_arr[$i]);
foreach (@temp_arr){
$sub_tree[$sub_counter] = $_.",";
$sub_counter++;
}
}
else{
$sub_tree[$sub_counter] = $tree_arr[$i];
$sub_counter++;
}
}
# rebuilding the tree, this time replacing the sequences names with the names found in the DNA input file
my $final_tree = "";
my ($exp, $rest_of_exp, $new_rest_exp);
# NOTE(review): $seq_found is never used in this sub.
my $seq_found = "no";
# Step 2: walk over the cells, starting at index 1 (cell 0 is skipped)
for (my $k=1; $k<@sub_tree; $k++){
#in this part we wish to split the expression to 2 parts; left part : (?seq_name ; right part: all the rest
if ($sub_tree[$k] ne ""){
# the cell contains ":" - everything from the first ":" on is moved into $rest_of_exp
if ($sub_tree[$k] =~ m/(.+)(:.+)/){
$exp = $sub_tree[$k];
$rest_of_exp = "";
while ($exp =~ m/(.+)(:.+)/){
$exp = $1;
$rest_of_exp = $2.$rest_of_exp;
}
}
# the cell has no ":" but contains ");" followed by more text: closing brackets
# are peeled off the end of the left part and moved to the front of $rest_of_exp
elsif($sub_tree[$k] =~ m/(.+)(\);.+)/){
$exp = $1;
$rest_of_exp = $2;
while ($exp =~ m/(.+)(\))/){
$exp = $1;
$rest_of_exp = $2.$rest_of_exp;
}
}
# in case the expression is of format: seq_name)*,
elsif($sub_tree[$k] =~ m/(.+)(\)?.+)/){
$exp = $1;
$rest_of_exp = $2;
while ($exp =~ m/(.+)(\))/){
$exp = $1;
$rest_of_exp = $2.$rest_of_exp;
}
}
# if the length (value after the ":") is equal to zero, we replace it with a very small value,
# because the selecton.exe cannot calculate trees with zeros
$new_rest_exp = "";
while($rest_of_exp =~ m/(.?:)(\d\.?\d*)(.+)/){
# "neither greater nor smaller than zero", i.e. the branch length is zero
if(!($2>0) && !($2<0)){
$rest_of_exp = $3;
$new_rest_exp .= $1."0.000000001";
}
else{
$rest_of_exp = $3;
$new_rest_exp .= $1.$2;
}
}
$new_rest_exp .=$rest_of_exp;
$rest_of_exp = $new_rest_exp;
# $1 = optional leading "(", $2 = what is left of the cell, used as the index into the names array
$exp =~ m/(\(?)(.+)/;
$final_tree.= $1.$ref_sequences_names->[$2].$rest_of_exp;
}
#an empty cell stands for a "(" sign
else{
$final_tree.= "(";
}
}
# drop the "," that the comma-splitting step appended to the last cell
if ($final_tree =~ m/,$/){
chop $final_tree;
}
unless (open NEW_TREE, ">".$output_tree){
&sys_error_exit("change_tree_file_names:: cannot open file $output_tree for writing.");
}
print LOG "change_tree_file_names : printing edited tree to file $output_tree and chmod it.\n";
print NEW_TREE $final_tree;
close NEW_TREE;
chmod 0755, $output_tree;
}
######################################################################
# Prepares the variables to be sent to the "pipe" script, then calls the script with all needed vars.
# prepare_pipe($sequences_names)
# Translates the form and run parameters into the descriptive values expected
# by pipe_for_selecton::create_pipe, calls it, and then checks the pipe error
# file.
#
#   $sequences_names  reference to the array of original sequence names; the
#                     query name is looked up in it by $query_seq_name_to_run
#
# Leaves the script through sys_error_exit() when the error file
# ($WorkingDir.$pipe_error) exists and is not empty after the call.
#
# The default values used to be assigned with statements of the form
#     (COND) ? $x = A : $x = B;
# The conditional operator yields an lvalue and binds tighter than "=", so
# that means ((COND) ? ($x = A) : $x) = B and $x always ended up as B:
# "UPLOADED", "none", the default of 8 categories and "True" were never used,
# and the empirical matrix was always reported as "IRRELEVANT". Each value is
# now computed with the conditional on the right-hand side of a single
# assignment.
sub prepare_pipe{
    my $sequences_names = shift;
    print LOG "\nEentered prepare_pipe()\n";

    # ----- PDB identification -----
    my $pipe_pdb_id = ($FORM{pdb_ID} eq "") ? "UPLOADED" : $FORM{pdb_ID};
    my $pipe_chain  = ($FORM{chain} eq "")  ? "none"     : $FORM{chain};

    # ----- evolutionary model (passed as a reference to a one-element array) -----
    my %model_description = (
        "M8"  => 'Positive selection enabled (M8, beta + w >= 1)',
        "M8a" => 'Null model: no positive selection(M8a, beta + w = 1)',
        "M7"  => 'Null model: no positive selection(M7, beta)',
        "M5"  => 'Positive selection enabled(M5, gamma)',
        "MEC" => 'Mechanistic Empirical Combination Model (MEC)',
    );
    my @pipe_Model = (exists $model_description{$FORM{MODEL}})
                   ? ($model_description{$FORM{MODEL}})
                   : ("IGNORED");
    my $model_ref = \@pipe_Model;
    # the empirical matrix is only meaningful for the MEC model
    my $pipe_empirical = ($FORM{MODEL} eq "MEC") ? $FORM{EMPIRICAL_MATRIX} : "IRRELEVANT";

    # ----- precision -----
    my $pipe_precision;
    if ($epsilonPrecision == 0.1)     {$pipe_precision = "Intermediate precision";}
    elsif ($epsilonPrecision == 1)    {$pipe_precision = "Low precision- faster run";}
    elsif ($epsilonPrecision == 0.01) {$pipe_precision = "High precision- slower run";}
    else                              {$pipe_precision = "DEFAULT";}

    # ----- categories and branch-length optimisation -----
    my $pipe_distribute_categories = ($FORM{CATEGORIES} eq "") ? 8 : $FORM{CATEGORIES};
    my $pipe_optimizeBL = ($optimizeBL eq "y") ? "True" : "False";

    # ----- genetic code (passed as a reference to a one-element array) -----
    # the position in this list is the numeric code received from the form
    my @genetic_code_names = (
        "Nuclear Standard",
        "Nuclear Blepharisma",
        "Nuclear Ciliate",
        "Nuclear Euplotid",
        "Mitochondria Vertebrate",
        "Mitochondria Invertebrate",
        "Mitochondria Yeast",
        "Mitochondria Ascidian",
        "Mitochondria Echinoderm",
        "Mitochondria Flatworm",
        "Mitochondria Protozoan",
    );
    my @pipe_genetic = ("IGNORED");
    foreach my $code (0 .. $#genetic_code_names){
        # numeric comparison, so an empty value counts as code 0
        if ($FORM{GENCODE} == $code){
            $pipe_genetic[0] = $genetic_code_names[$code];
            last;
        }
    }
    my $genetic_ref = \@pipe_genetic;

    # ----- query sequence name; an empty pair of quotes stands for "no name" -----
    my $query_name = $sequences_names->[$query_seq_name_to_run];
    my $pipe_query = (!defined $query_name || $query_name eq "") ? "\"\"" : $query_name;

    # ----- DNA input -----
    #since there is only 1 input file, there will be only var sent to the pipe script. In order that the pipe script will know what kind of input it is - a short string is added at the beginning.
    my $pipe_dna_input;
    if ($upload_unaligned_file_dna ne "no"){
        $pipe_dna_input = "SELECTON_UN".$upload_unaligned_file_dna;
    }
    elsif ($upload_MSA_file_dna ne "no"){
        $pipe_dna_input = "SELECTON_MSA".$upload_MSA_file_dna;
    }
    else{
        $pipe_dna_input = "no_dna_input";
    }

    # NOTE(review): this opens the error file for READING and closes it again,
    # which does not create it. If the intention was to create an empty file,
    # the open mode should be ">" - confirm before changing.
    open ERROR, $WorkingDir.$pipe_error;
    close ERROR;
    chmod 0755, $WorkingDir.$pipe_error;

    # The values are logged in the order in which they are passed below.
    # NOTE(review): this log line used to list the empirical matrix before the
    # precision, the opposite of the call - confirm the order against the
    # signature of pipe_for_selecton::create_pipe.
    print LOG "running pipe_for_selecton::create_pipe with parameters:\n";
    print LOG "$run_name $cgi_log_file $WorkingDir $rsml $FGiJ_pipe_pdb $pdbUpload $pipe_pdb_id $pipe_chain $pipe_dna_input $pipe_query $model_ref $pipe_distribute_categories $pipe_optimizeBL $genetic_ref $pipe_error $pipe_precision $pipe_empirical\n";
    pipe_for_selecton::create_pipe($run_name, $cgi_log_file, $WorkingDir, $rsml, $FGiJ_pipe_pdb, $pdbUpload, $pipe_pdb_id, $pipe_chain, $pipe_dna_input, $pipe_query, $model_ref, $pipe_distribute_categories, $pipe_optimizeBL, $genetic_ref, $pipe_error, $pipe_precision, $pipe_empirical);

    # checking if there was an error and the pipe file was not created properly
    my $error_path = $WorkingDir.$pipe_error;
    if ((-e $error_path) && !(-z $error_path)){
        unless (open ERROR, $error_path){
            &sys_error_exit("An error was found when trying to create the pipe file for Selecton.\nThe Error message should be written to file $WorkingDir"."$pipe_error, however this file could not be opened.\n");
        }
        # only the first line of the error file is reported
        my $first_error_line = <ERROR>;
        close ERROR;
        &sys_error_exit("An error was found while trying to create the pipe file : ".$first_error_line);
    }
}
######################################################################
# Reports a fatal system error and terminates the script.
#   $err - description of the error; it is written to LOG and mailed to the
#          administrator (prefixed with "SYSTEM ERROR").
# Appends $SysErrorDef and $ContactDef to the HTML output page, sends the
# error mails, finalizes the output page through stop_reload() and exits.
# This routine never returns to its caller.
sub sys_error_exit{
    my $err = shift;
    # Three-argument open on a lexical handle. A failure to open the page is
    # recorded in the log instead of being silently ignored, and the rest of
    # the error handling (mails, stop_reload) still runs.
    if (open my $out_fh, '>>', $OutHtmlFile){
        print {$out_fh} $SysErrorDef;
        print {$out_fh} $ContactDef;
        close $out_fh;
    }
    else{
        print LOG "\nsys_error_exit: cannot open $OutHtmlFile for appending: $!\n";
    }
    print LOG "\n$err\n";
    &send_mail();
    &send_mailSelecton("SYSTEM ERROR\n".$err);
    # Explicit empty argument list: a bare "&stop_reload;" would forward @_.
    &stop_reload();
    exit;
}
##########################################################################################
# Stops the reload of the output page and closes the run. In order:
#  - pauses for 5 seconds, then rewrites $OutHtmlFile without the lines that
#    match the refresh / cancel-job patterns listed in the loop below;
#  - writes the end time to LOG and closes LOG;
#  - removes the job from the running jobs list;
#  - creates the $finish_flag file, deletes $qsub_ans_file if it exists and
#    sets the permissions of the run files.
# Takes no arguments.
sub stop_reload {
    sleep 5;
    if (open my $in_fh, '<', $OutHtmlFile){
        flock $in_fh, 2;    # 2 = LOCK_EX
        my @output = <$in_fh>;
        flock $in_fh, 8;    # 8 = LOCK_UN
        close $in_fh;
        # The page is re-opened for writing (and therefore truncated) only
        # after its content was read successfully, so a failed read can no
        # longer wipe the output page.
        if (open my $out_fh, '>', $OutHtmlFile){
            flock $out_fh, 2;
            foreach my $line (@output){ # we remove the refresh lines and the button which codes for Selecton cancelled job
                next if ($line =~ /REFRESH/ or $line =~ /NO-CACHE/ or $line =~ /ACTION=\"cgi.+kill/ or
                    $line =~ /VALUE="Cancel Selecton Job"/ or $line =~/TYPE=hidden NAME="Qstat_file"/ or $line =~/TYPE=hidden NAME="selecton_http"/ or $line =~ /TYPE=hidden NAME="run_no"/ or $line =~ /TYPE=hidden NAME="cgi_pid"/ or $line =~ /Estimated run time is:/ or $line =~ /kill_process.cgi/ or $line =~ /<!--job_/);
                print {$out_fh} $line;
            }
            flock $out_fh, 8;
            close $out_fh;
        }
        else{
            print LOG "\nstop_reload: cannot open $OutHtmlFile for writing: $!\n";
        }
    }
    else{
        print LOG "\nstop_reload: cannot open $OutHtmlFile for reading: $!. The page was left unchanged\n";
    }
    print LOG "\n\nEnd time: ";
    print LOG &printTime();
    close LOG;
    # remove the job from the running jobs list
    &BIOSEQUENCE_FUNCTIONS::remove_job_from_running_log("Selecton", $run_name);
    # LOG is already closed at this point, so a failure is reported on STDERR.
    if (open my $finish_fh, '>', $finish_flag){
        close $finish_fh;
    }
    else{
        warn "stop_reload: cannot create the finish flag $finish_flag: $!\n";
    }
    unlink $qsub_ans_file if (-e $qsub_ans_file);
    chmod 0755, $OutHtmlFile;
    chmod 0600, $WorkingDir."user_email.txt";
    chmod 0711, $WorkingDir;
    chmod 0600, $runCalcInput;
    # if (-e $WorkingDir."core"){
    # print LOG "remove core file from working directory\n";
    # unlink $WorkingDir."core";
    # }
}
#########################################################################################
# Sends an automatic mail to the user when there are errors.
# Does nothing when $FORM{email_address} is empty. Otherwise sets
# $email_subject and $email_message, runs sendEmail.pl from $send_email_dir
# and stores its output in $email_system_return; output that does not
# contain "successfully" is reported in LOG.
# Takes no arguments.
sub send_mail { # to user
    return if ($FORM{email_address} eq "");
    $email_subject = "Error in Selecton running";
    $email_message = "Hello!\n\nUnfortunately there was an error while running Selecton.\nPlease click on the following link to see more details\nWe apologize for the inconvenience\n\n$OutputURL";
    print LOG "send_mail: sending system error to user:\n";
    # The address comes from the submitted form, so it must never reach a
    # shell. It is split on whitespace (what the shell used to do with the
    # unquoted value) and tokens that look like command line options are dropped.
    my @recipients = grep { !/^-/ } split ' ', $FORM{email_address};
    unless (@recipients){
        print LOG "send_mail: no usable recipient address was given, the message was not sent\n";
        return;
    }
    my @mail_cmd = (
        'perl', 'sendEmail.pl',
        '-f',  GENERAL_CONSTANTS::ADMIN_EMAIL(),
        '-t',  @recipients,
        '-u',  $email_subject,
        '-xu', $userName,
        '-xp', $userPass,
        '-s',  $smtp_server,
        '-m',  $email_message,
    );
    unless (chdir $send_email_dir){
        print LOG "send_mail: cannot change directory to $send_email_dir: $!\n";
    }
    # List-form pipe open: the arguments are handed to perl directly, with no
    # shell in between, and the standard output is captured like backticks did.
    if (open my $mail_fh, '-|', @mail_cmd){
        local $/;    # read the whole output at once
        $email_system_return = <$mail_fh>;
        $email_system_return = "" unless defined $email_system_return;
        close $mail_fh;
    }
    else{
        $email_system_return = "could not run sendEmail.pl: $!";
    }
    unless ($email_system_return =~ /successfully/) {
        print LOG "The message was not sent successfully. system returned: $email_system_return\n";
    }
}
#########################################################################################
# Sends an error report to the Selecton administrator.
#   $email_message - text of the report; "User email is: <address>" is
#                    appended to it.
# Side effects: an empty $FORM{email_address} is replaced by "NOT_GIVEN",
# $email_subject and $email_system_return are set, and the working directory
# is changed to $send_email_dir. Output of sendEmail.pl that does not contain
# "successfully" is reported in LOG.
sub send_mailSelecton{ # to selecton administrator
    my $email_message = shift;
    if ($FORM{email_address} eq ""){$FORM{email_address} = "NOT_GIVEN";}
    $email_subject = "Error in Selecton running $run_name";
    print LOG "send_mailSelecton: send error message to admin:\n";
    # The command is built as a list and run without a shell: error texts that
    # contain an apostrophe or other shell metacharacters used to break the
    # single-quoted command line, so the report was never sent.
    my @mail_cmd = (
        'perl', 'sendEmail.pl',
        '-f',  GENERAL_CONSTANTS::ADMIN_EMAIL(),
        # split on whitespace, as the shell did with the unquoted value
        '-t',  (split ' ', GENERAL_CONSTANTS::ADMIN_EMAIL()),
        '-u',  $email_subject,
        '-xu', $userName,
        '-xp', $userPass,
        '-s',  $smtp_server,
        # '\n' is a literal backslash-n here, exactly what the single-quoted
        # shell argument used to pass to sendEmail.pl
        '-m',  $email_message.'\nUser email is: '.$FORM{email_address},
    );
    unless (chdir $send_email_dir){
        print LOG "send_mailSelecton: cannot change directory to $send_email_dir: $!\n";
    }
    if (open my $mail_fh, '-|', @mail_cmd){
        local $/;    # read the whole output at once
        $email_system_return = <$mail_fh>;
        $email_system_return = "" unless defined $email_system_return;
        close $mail_fh;
    }
    else{
        $email_system_return = "could not run sendEmail.pl: $!";
    }
    unless ($email_system_return =~ /successfully/) {
        print LOG "The message was not sent successfully. system returned: $email_system_return\n";
    }
}
#########################################################################################
# Thin wrapper: returns whatever BIOSEQUENCE_FUNCTIONS::printTime yields when
# it is called without arguments in scalar context.
sub printTime {
    return scalar(BIOSEQUENCE_FUNCTIONS::printTime());
}
#########################################################################################
# Reports an error to the user and terminates the script.
#   $html_err - message shown on the HTML output page, after $ErrorDef
#   $log_err  - message written to LOG
# Appends the error paragraph and $ContactDef to $OutHtmlFile, mails the
# user through send_mail(), finalizes the page through stop_reload() and
# exits. This routine never returns to its caller.
sub print_to_output_and_exit{
    my $html_err = shift;
    my $log_err = shift;
    # Three-argument open on a lexical handle. A failure to open the page is
    # recorded in the log instead of being silently ignored.
    if (open my $out_fh, '>>', $OutHtmlFile){
        print {$out_fh} "\n<p>$ErrorDef<br>$html_err</p>\n";
        print {$out_fh} $ContactDef;
        close $out_fh;
    }
    else{
        print LOG "\nprint_to_output_and_exit: cannot open $OutHtmlFile for appending: $!\n";
    }
    print LOG "$log_err";
    &send_mail();
    # Explicit empty argument list: a bare "&stop_reload;" would forward @_.
    &stop_reload();
    exit;
}
#########################################################################################
# Used when an error file was found during the run of either $colorCoding or $colorCodingLinear.
# Reads the file "error" in the working directory, keeps the text of the last
# "HTML: ..." line and of the last "LOG: ..." line, and then exits through
# sys_error_exit (when the HTML text is exactly "sys") or through
# print_to_output_and_exit (in any other case).
sub read_colors_error_and_exit{
    my $error_file = $WorkingDir."error";
    open(ERROR, $error_file)
        or &sys_error_exit("run_calc : An error was found after running $colorCoding, but the error file $WorkingDir"."error could not be opened.");
    my $html_msg;
    my $log_msg;
    while (defined(my $line = <ERROR>)){
        chomp $line;
        if ($line =~ /HTML: (.+)/){
            $html_msg = $1;
        }
        elsif ($line =~ /LOG: (.+)/){
            $log_msg = $1;
        }
    }
    close ERROR;
    if ($html_msg =~ m/^sys$/){
        &sys_error_exit("run_calc : $log_msg");
    }
    else{
        &print_to_output_and_exit($html_msg, $log_msg);
    }
}
#########################################################################################
# Adds the result file names of the run to %run_data and rewrites the
# storable file $runCalcInput with the updated hash.
# Takes no arguments; reads $params, $log, $outputScoreFile and $formInput.
# Returns "OK" on success. When the storable file cannot be written the
# script is terminated through sys_error_exit.
sub add_data_to_input_file{
    # ---- storable on ----
    $run_data{params} = $params;
    $run_data{log} = $log;
    $run_data{outputScoreFile} = $outputScoreFile;
    $run_data{FORMInput} = $formInput;
    # recreating the storable element
    unlink $runCalcInput;
    # store() may die on a serious error; the eval keeps the failure on the
    # regular sys_error_exit path instead of killing the script on the spot.
    my $stored = eval { store(\%run_data, $runCalcInput) };
    my $store_err = defined $@ ? $@ : "";
    if (!$stored or !(-e $runCalcInput) or (-z $runCalcInput)){
        # The details go to the log only: the text handed to sys_error_exit is
        # mailed to the administrator and is kept free of quote characters.
        print LOG "add_data_to_input_file: store died: $store_err\n" if ($store_err ne "");
        &sys_error_exit("Seems that the store failed. The file $runCalcInput does not exist or is of size zero\n");
    }
    chmod 0600, $runCalcInput;
    # ---- storable on ----
    # ---- storable off ----
    #unless (open INPUT, ">>".$runCalcInput){
    # open LOG, ">>$OutLogFile";
    # print LOG "add_data_to_input_file : cannot open file $runCalcInput for writing $!. The statistical testing button cannot be active\n";
    # close LOG;
    # return "no";
    #}
    #print INPUT "GLOBAL RESULTS FILE: $params\n";
    #print INPUT "KAKS LOG FILE: $log\n";
    #print INPUT "SCORE RESULTS FILE: $outputScoreFile\n";
    #close INPUT;
    # ---- storable off ----
    return "OK";
}
######################################################################
# Returns a copy of the given name in which the first whitespace character is
# replaced by an underscore. Only the first one is replaced: the substitution
# is not global. The caller's variable is left untouched.
sub extract_file_name{
    (my $name = $_[0]) =~ s/\s/_/;
    return $name;
}