#!/usr/local/bin/perl
use strict;
use Storable;
use lib "/bioseq/bioSequence_scripts_and_constants/"; #"/db1/System/bioseq/scripts_for_servers";
use GENERAL_CONSTANTS;
use BIOSEQUENCE_FUNCTIONS;
use SELECTON_CONSTANTS;
use lib "/bioseq/Selecton/external_scripts";
use pipe_for_selecton;
# Command-line arguments (taken from @ARGV by the top-level shifts below):
#   1. the working directory of the run
#   2. the name of the Storable file holding the run parameters
#   3. the name of the Storable file holding the CGI form values
# The two file names are appended directly to the working directory, so the
# directory argument is presumably passed with a trailing "/" - TODO confirm.
my $WorkingDir = shift;
my $runCalcInput = $WorkingDir.shift;
my $formInput = $WorkingDir.shift;
# hashes to retrieve the runs' input from the storable element
my %FORM = ();
my %run_data = ();
# VARS FROM INPUT FILE
# vars read from form:
my ($epsilonPrecision, $query_seq_name_to_run, $upload_TREE_file, $optimizeBL);
# General vars
my ($OutLogFile, $querySeqFoundinMSA, $tree_faq, $PdbPrefix, $method, $run_name, $PdbFileNameUnc, $was_Pdb_uploaded, $proc_comm, $estimated_run_time, $begin_Q_runtime);
# HTML related vars
my ($SysErrorDef, $ContactDef, $WWWdir, $ErrorDef, $OutputURL);
# files which were created by the cgi
my ($fileDna_aligned, $treeUpload, $OutHtmlFile, $cgi_log_file, $sequences_names_file, $fileName_amino_aligned, $upload_unaligned_file_dna, $upload_MSA_file_dna, $clustal_aligned_file, $pdb_data);
my @sequences_names = (); # This array will hold the names of the sequences, as it appears in the input file.
# Both flag files below are built from the working directory given on the
# command line, i.e. before readInput() overwrites $WorkingDir.
my $qsub_ans_file = $WorkingDir."qsub_ans.txt"; # The flag file which is read by the daemon process
my $finish_flag = $WorkingDir."END_OK"; # The file which denotes that the run was finished.
# readInput() must run before the definitions below: several of them are built
# from variables it fills in ($run_name, $PdbFileNameUnc, $PdbPrefix, %FORM).
&readInput();
my $http_path = GENERAL_CONSTANTS::SELECTON_URL;
### Sending e-mail from the cluster
my $smtp_server = GENERAL_CONSTANTS::SMTP_SERVER;
my $userName = GENERAL_CONSTANTS::ADMIN_USER_NAME;
my $userPass = GENERAL_CONSTANTS::ADMIN_PASSWORD;
my $mail = "mailto:".GENERAL_CONSTANTS::ADMIN_EMAIL."?subject=Selecton%20Run%20No:%20$run_name";
my $email_subject;
my $email_message;
my $email_system_return;
my $send_email_dir = GENERAL_CONSTANTS::SEND_EMAIL_DIR;
# VARS DEFINED IN THE SCRIPT:
# scripts or executables
my $biocluster_external_scripts_path = "/bioseq/Selecton/external_scripts/";
#my $selecton = "/bioseq/pupkoSVN/trunk/programs/selecton/selecton"; #"/d/bioinfo/users/adist/pupkoSVN/trunk/programs/selecton/selecton";
my $selecton = "/bioseq/Selecton/selecton.exe";
#my $selecton = $biocluster_external_scripts_path."srcSelecton/srcV2.2/selecton";
#my $mecSelecton = "/bioseq/pupkoSVN/trunk/programs/mec/mec";
my $mecSelecton = "/bioseq/Selecton/mec.exe";
#"/d/bioinfo/users/adist/pupkoSVN/trunk/programs/mec/mec"; # the exe of the kaks using the mec model is added by adid (29.1.07)
my $colorCoding = $biocluster_external_scripts_path . "colorCoding.v2.pl";
my $colorCodingLinear = $biocluster_external_scripts_path . "colorCodingLinear.pl";
my $statTest = "/cgi-bin/statTest.cgi"; # REMARK : SHOULD BE RE-WRITTEN AND RUN FROM THE CLUSTER
# vars
my $pdbUpload = $WorkingDir . $PdbFileNameUnc; #name for an uploaded PDB FILE
my $significance_test_faq = "/overview.html#meth5";
# files which will be created by this script
# (names without $WorkingDir are relative names; the ones prefixed with
# $WorkingDir are full paths)
my $rsml = "rasmol.txt";#"colors.txt";
my $areTherePositiveSites = $WorkingDir."areSitesPositive.txt";
my $colorsLinear = "colors.html"; #results color-coded onto the linear sequence
my $outputScoreFile="kaks.res";
my $selection4Site_file = $WorkingDir."selection4Site.txt";
my $kaks_file = $WorkingDir."kaks.res";
my $params = "globalResult.txt";
my $log = "kaks4site.log";
my $final_out = $PdbPrefix . ".gradesPE";
my $finalDNAFile_seq_names = "DNA.names.msa";
my $finalAminoFile_seq_names = "AMINO.names.msa";
my $tree_out = "kaks4site.tree";
my $rasmol_file = $PdbPrefix .".rsml";
my $statistics_file = SELECTON_CONSTANTS::STATISTICS_FILE;
#*****************************
### FGiJ path and related files
my $FGiJ_path = "/fgij/";
# name of the PDB file (with the pipe data in its header) that prepare_pipe() passes to the pipe script
my $FGiJ_pipe_pdb = $FORM{pdb_ID} . "_selecton" . $run_name . "_pipe.pdb";
my $FGiJ_link = $FGiJ_path . "fg.htm?mol=/results/" . $run_name . "/" . $FGiJ_pipe_pdb;
# name of the file (inside the working directory) that prepare_pipe() checks for pipe errors
my $pipe_error = "pipe.error";
#---------------------------------------------------------------------------
# C A L C U L A T I O N
#---------------------------------------------------------------------------
# Remember when the run started; the total runtime is computed from this value
# once the run has finished.
$begin_Q_runtime = &printTime();
# Open the run log. A failure is reported on STDERR but is deliberately not
# fatal: the calculation still runs, only the log output is lost.
open(LOG, '>', $OutLogFile)
    or warn "selecton_run_calc.pl: cannot open log file $OutLogFile for writing: $!\n";
print LOG &printTime();
print LOG "\nEntered selecton_run_calc.pl\nUpdating the HTML with running status\n";
# updating the HTML status to "running"
my $ans = &GENERAL_CONSTANTS::print_Q_status_in_html($OutHtmlFile, "Running", "no", $estimated_run_time);
print LOG $ans if ($ans ne "OK");
# run the Ka/Ks calculation and the color-coding scripts
&run_calc();
### changing back the names in the DNA and AMINO files, so they will hold the original sequences names.
&return_seq_names_to_files($fileName_amino_aligned, $finalAminoFile_seq_names, \@sequences_names);
&return_seq_names_to_files($fileDna_aligned, $finalDNAFile_seq_names, \@sequences_names);
#if a tree was created, we output it to the users, then first have to write to it original names
#if ((-e $WorkingDir.$tree_out) && !(-z $WorkingDir.$tree_out)){
# print LOG "renaming numbered tree output $tree_out to : no_tree.txt\n";
# my $cmd = "mv ".$WorkingDir.$tree_out." ".$WorkingDir."no_tree.txt";
# my $out = `$cmd`;
# print LOG "moving returned: $out\n";
# chmod 0744, $WorkingDir."no_tree.txt";
# print LOG "going to run change_tree_file_names\n";
# #&change_tree_file_names(\@sequences_names, $WorkingDir."no_tree.txt", $WorkingDir.$tree_out);
# print LOG "after change_tree_file_names\n";
#}
#------------
### create a pdb file with pipe in its header, for FGiJ to read
if ($was_Pdb_uploaded eq "yes"){
    print LOG "Going to prepare pipe file\n";
    &prepare_pipe(\@sequences_names);
}
else{
    print LOG "Not creating pipe, since the value of \$was_Pdb_uploaded is: \"$was_Pdb_uploaded\"\n";
}
### PRINTING output FINAL NOTES AND LINKS
# Append the final notes and the result links to the HTML status page.
# The page is locked while it is written (flock 2 locks, flock 8 unlocks).
# A failed open is logged but not fatal; the prints below then have no effect.
unless (open OUTPUT, ">>" .$OutHtmlFile){
    print LOG "could not open $OutHtmlFile for appending the final results: $!\n";
}
flock OUTPUT,2;
print OUTPUT "\n\n\n";
print OUTPUT "Final Result: ";
if ($was_Pdb_uploaded eq "yes") { #3D
    print OUTPUT "\nGraphical display of Selecton results with FirstGlance in Jmol\n\n";
}
else {
    print OUTPUT "\n View Color Coded Selecton Results\n\n";
}
# check if there are P.S. sites. If yes - a button for statistical testing.
# The first line of $areTherePositiveSites is expected to be "yes" when such
# sites were found. If the file cannot be read we behave as if none were found.
my @pos = ();
if (open POS, "<$areTherePositiveSites"){
    @pos = <POS>;
    chomp @pos;
    close POS;
}
else {
    print LOG "could not open $areTherePositiveSites for reading: $!. Assuming no positively selected sites were found.\n";
}
if (@pos && $pos[0] eq "yes"){
    print OUTPUT "Positively selected sites found. \n";
    ###### here is a link for statistical testing for the M8 and MEC models
    if (($FORM{MODEL} eq "M8") || ($FORM{MODEL} eq "MEC")) {
        my $ans = &add_data_to_input_file;
        # in case the data was not added - we don't create a submit button, we let the user know he can contact us
        if ($ans eq "OK"){
            print OUTPUT "\n \n";
        }
        else{
            print OUTPUT "Please contact us if you wish to run a statistical test for your results, and mention this number: $run_name ";
        }
    }
}
else {
    print OUTPUT "No positively selected sites found in the protein. \n";
}
print OUTPUT "Output Files: \n\n";
### print the links
if ($was_Pdb_uploaded eq "yes") { #if user ran selecton with a PDB struct.: Ka/Ks scores together with color coding
    print OUTPUT " Codon Ka/Ks scores color-coded on the linear sequence\n\n";
}
print OUTPUT " Codon Ka/Ks scores (numerical values)";
# add the gapped output only in case there was gapped output (if the files are identical, the only difference will be in the line 'Displayed on sequence 1< including gaps>'
if ((-s $WorkingDir."kaks.res.gaps") - (-s $WorkingDir.$outputScoreFile) > 15){
    print OUTPUT " - Reference sequence only\n\n";
    print OUTPUT " Codon Ka/Ks scores (numerical values) - For each position in the MSA, including gaps\n\n";
}
else{
    print OUTPUT "\n";
}
if ( $upload_unaligned_file_dna ne "no"){ # print amino aln only if the user supplied a non-aligned file (then we run codon-align & produce an amino aln)
    print OUTPUT " Amino Acid Multiple Sequence Alignment (in Fasta format) \n";
}
print OUTPUT "\nCodon Multiple Sequence Alignment (in Fasta format) \n";
if ($upload_TREE_file eq "NOT_GIVEN") { # no user tree supplied - NJ tree created
    print OUTPUT "\nPhylogenetic Tree \n";
}
print OUTPUT "\nLikelihood and parameters of Selecton run \n";
print OUTPUT "\nLog-file of Selecton run \n";
if ($was_Pdb_uploaded eq "yes") { #if user ran selecton with a PDB struct.
    print OUTPUT "\nRasMol coloring script source\n\n";
    print OUTPUT "PDB file updated with Selecton results in its header\n\n";
}
print OUTPUT "\n\nPlease report any problem in case of need. \n";
flock OUTPUT,8;
close OUTPUT;
### write to the output that the job has finished
# The status page is read into memory and written back, with the line holding
# the "Selecton Job Status Page" title replaced by the FINISHED notice.
my @output = ();
if (open OUTPUT, "<$OutHtmlFile"){
    flock OUTPUT,2;
    @output = <OUTPUT>;
    flock OUTPUT,8;
    close OUTPUT;
}
else {
    print LOG "could not open $OutHtmlFile for reading: $!. The status page was not marked as finished.\n";
}
# Only rewrite the page when its content was actually read; opening it with ">"
# after a failed read would replace the whole page by an empty file.
if (@output){
    if (open OUTPUT, ">$OutHtmlFile"){
        flock OUTPUT,2;
        foreach my $line (@output){
            if ($line =~ /Selecton Job Status Page/i){ #finds the phrase "Selecton" job status page, case-insensitive
                print OUTPUT "Selecton Job Status Page - FINISHED \n";
                print OUTPUT "Go to the results \n";
            }
            else {
                print OUTPUT $line;
            }
        }
        flock OUTPUT,8;
        close OUTPUT;
    }
    else {
        print LOG "could not open $OutHtmlFile for writing: $!. The status page was not marked as finished.\n";
    }
}
### stop the automatic reload
# Make everything in the working directory readable for group and others, then
# strip the auto-refresh from the status page.
my $open_permissions_cmd = "echo \"(cd ${WorkingDir} ; chmod -R og=rx * )\" | /bin/tcsh";
system $open_permissions_cmd;
&stop_reload;

# A successful run is recorded in the statistics file together with its total runtime.
my $total_runtime = BIOSEQUENCE_FUNCTIONS::subtract_time_from_now($begin_Q_runtime);
open STATISTICS, ">>".$statistics_file;
flock STATISTICS, 2;
print STATISTICS $run_name . " total runTime: " . $total_runtime . "\n";
flock STATISTICS, 8;
close STATISTICS;

# Tell the user by e-mail that the results are ready.
$email_subject = "Your Selecton results for run number " . $run_name . " are ready";
$email_message = "Selecton finished calculation. Please click on the following link to view the results:\n"
    . $WWWdir . "output.html\n"
    . "Please note: the results will be kept on the server for three months.";
open LOG, ">>$OutLogFile";
print LOG "\nSending mail to user.\n";
GENERAL_CONSTANTS::send_mail("Selecton", $FORM{email_address}, $run_name, $email_subject, $email_message);

# A core file left in the working directory is deleted.
my $core_file = $WorkingDir . "core";
if (-e $core_file){
    print LOG "remove core file from working directory\n";
    unlink $core_file;
}
print LOG "\nSelecton run completed successfully!"
    . "\n************** END OF LOG FILE *****************\n";
close LOG;
exit;
#----------------------------------------------------------------------------------------
# S U B R O U T I N E S
#----------------------------------------------------------------------------------------
sub readInput{
    # Loads the run parameters and the CGI form values from the two Storable
    # files named on the command line, copies them into the file-level
    # variables, reports the outcome to the daemon through $qsub_ans_file and
    # rebuilds @sequences_names from the sequence names file.
    #
    # Takes no arguments and returns nothing useful. Dies (after writing
    # "NOT_OK" to $qsub_ans_file) when one of the Storable files cannot be
    # read, and calls sys_error_exit() when the names file cannot be opened.
    #
    # The commented-out line-based parser that used to live here was dead code
    # and has been removed; its NOT_OK reporting is what the failure branch
    # below reproduces for the Storable input.

    # retrieve() either dies or returns undef on failure, so both cases are
    # caught before the results are dereferenced.
    my ($input_data, $FORM_data);
    eval {
        $input_data = retrieve($runCalcInput);
        $FORM_data = retrieve($formInput);
    };
    unless (ref($input_data) eq 'HASH' && ref($FORM_data) eq 'HASH'){
        my $reason = $@;
        open ANS, ">".$qsub_ans_file;
        print ANS "NOT_OK";
        close ANS;
        chmod 0755, $qsub_ans_file;
        die "readInput: could not retrieve the run input from $runCalcInput and $formInput: $reason\n";
    }
    %run_data = %$input_data;
    %FORM = %$FORM_data;

    $run_name = $run_data{run_name};
    $WorkingDir = $run_data{WorkingDir};
    $WWWdir = $run_data{WWWdir};
    $epsilonPrecision = $run_data{epsilonPrecision};
    $query_seq_name_to_run = $run_data{query_seq_name_to_run};
    $optimizeBL = $run_data{optimizeBL};
    $querySeqFoundinMSA = $run_data{querySeqFoundinMSA};
    $tree_faq = $run_data{tree_faq};
    $method = $run_data{method};
    $SysErrorDef = $run_data{SysErrorDef};
    $ErrorDef = $run_data{ErrorDef};
    $ContactDef = $run_data{ContactDef};
    $fileDna_aligned = $run_data{fileDna_aligned};
    $treeUpload = $run_data{treeUpload};
    $OutHtmlFile = $run_data{OutHtmlFile};
    $cgi_log_file = $run_data{cgi_log_file};
    $OutLogFile = $run_data{OutLogFile};
    $fileName_amino_aligned = $run_data{fileName_amino_aligned};
    $OutputURL = $run_data{OutputURL};
    $estimated_run_time = $run_data{estimated_run_time};
    $sequences_names_file = $run_data{sequences_names_file};
    $was_Pdb_uploaded = $run_data{was_Pdb_uploaded};

    # These five values may hold the marker "NOT_GIVEN". They used to be read
    # with statements of the form
    #     (value eq "NOT_GIVEN") ? $var = "" : $var = value;
    # Since ?: binds tighter than =, such a statement always ends up assigning
    # the raw value, so the marker was never turned into an empty string. The
    # rest of the script relies on that (it compares $upload_TREE_file against
    # "NOT_GIVEN"), therefore the raw values are assigned here on purpose.
    $upload_TREE_file = $run_data{upload_TREE_file};
    $PdbFileNameUnc = $run_data{PdbFileNameUnc};
    $PdbPrefix = $run_data{PdbPrefix};
    $pdb_data = $run_data{pdb_data};
    $clustal_aligned_file = $run_data{clustal_aligned_file};

    $upload_MSA_file_dna = $run_data{upload_MSA_file_dna};
    $upload_unaligned_file_dna = $run_data{upload_unaligned_file_dna};

    # if reading was OK, we report it, for the daemon
    open ANS, ">".$qsub_ans_file;
    print ANS "OK";
    close ANS;
    chmod 0755, $qsub_ans_file;

    # recreating the sequences array of the DNA sequences names.
    # Each line of the names file is expected to look like "<number> <name>".
    unless (open SEQ_NAMES, $WorkingDir.$sequences_names_file){
        &sys_error_exit("cannot open the file ".$WorkingDir.$sequences_names_file." for reading $!\n");}
    # index 0 is a placeholder; presumably the numbering in the file starts at 1 - TODO confirm
    $sequences_names[0] = "";
    while (my $line = <SEQ_NAMES>){
        chomp $line;
        if ($line =~ /(\d+) (.+)/){
            $sequences_names[$1] = $2;
        }
    }
    close SEQ_NAMES;
}
#########################################################################################
# CALCULATION AND POST-PROCESSING
sub run_calc {
    # Runs the Ka/Ks calculation and then the color-coding scripts that turn
    # its output files into the result files. Takes no arguments. Ends the run
    # through sys_error_exit() / read_colors_error_and_exit() when a step fails.
    &run_kaks4site;

    # The post-processing below needs both output files to exist and to hold data.
    my $outputs_ready = (-s $selection4Site_file) && (-s $kaks_file);
    if (!$outputs_ready) {
        &sys_error_exit("run_calc: The file $selection4Site_file or $kaks_file was not created (or contains no data) during the run of selecton.v2.2. Cannot process outputs");
    }
    #print LOG "run_calc : going to run routine change_colors_if_significant\n";
    #&change_colors_if_significant;

    # The color-coding scripts signal a failure by leaving a file named "error"
    # in the working directory.
    my $error_flag = $WorkingDir."error";

    # With a PDB structure: project the scores onto the structure (colorCoding script).
    if ($was_Pdb_uploaded eq "yes") {
        print LOG "run_calc : touch $final_out\n";
        $proc_comm = join(" ",
            "perl", $colorCoding, $method, "'".$query_seq_name_to_run."'", $WorkingDir,
            $selection4Site_file, $clustal_aligned_file, $pdb_data, $kaks_file, $final_out,
            $PdbFileNameUnc, $fileName_amino_aligned, $rsml, $params);
        print LOG "run_calc: running $proc_comm\n";
        # create the output file with open permissions, then run the script
        # from inside the working directory (both through tcsh)
        system "echo \"(cd ${WorkingDir};touch ${final_out}; chmod oug+rx ${final_out})\" | /bin/tcsh";
        system "echo \"(cd ${WorkingDir}; ${proc_comm})\" | /bin/tcsh";
        &read_colors_error_and_exit if (-e $error_flag);
        # REMARK: I don't think it is necessary in the new server, since we won't use PE
        ##### copy final PE files to pdbspt dir and compress the PDB file
        #my $string1 = "cd $WorkingDir";
        #my $string2 = "cp consurf.spt pdbspt/consurf.spt";
        #my $string3 = "gzip -c $PdbFileNameUnc > pdbspt/pdbfile.ent";
        #my $string4 = "mv consurf.spt colors.txt";
        #my $string6 = "mv $rasmol_file rasmol.txt";
        #my $string5 = "chmod ogu+rx pdbspt/*";
        #
        #print LOG "\nrun_calc: Copy the final PE files to pdbspt dir\n";
        #
        #system 'echo "('.$string1.'; '.$string2.'; '.$string3.'; '.$string4.'; '.$string6.';' .$string5.';)" | /bin/tcsh';
    }

    # Always: color-code the scores onto the linear sequence (colorCodingLinear script).
    $proc_comm = join(" ",
        "perl", $colorCodingLinear, $run_name, $WorkingDir, $colorsLinear,
        $selection4Site_file, $areTherePositiveSites);
    print LOG "\nrun_calc: running $proc_comm \n";
    system "echo \"(cd ${WorkingDir}; ${proc_comm})\" | /bin/tcsh";
    &read_colors_error_and_exit if (-e $error_flag);
}
######################################################################################
# run kaks4site.exe
sub run_kaks4site {
    # Builds the command line of the Ka/Ks executable from the run parameters,
    # runs it inside the working directory and then scans its log file
    # ($log) for errors the user has to be told about.
    #
    # Takes no arguments. Ends the run through sys_error_exit() when the log
    # file cannot be opened and through print_to_output_and_exit() when the log
    # reports a problem with the tree or sequence input.

    ###### should add verification that two refuting arguments aren't given here...
    # default run: bayesian, beta+w>1 (M8) , w=8, ref=1st seq, std nuc.code, NJ tree, epsilon by default set to 0.1
    my $selecton_comm="$selecton -i $WorkingDir" . $fileDna_aligned . " -e" . $epsilonPrecision;
    if ($FORM{MODEL} eq "MEC") { #if the model is MEC call another exe
        $selecton_comm = "$mecSelecton -i $WorkingDir" . "$fileDna_aligned";
        # value of -z for each empirical matrix; if no z is given mecSelecton will run by default with JTT
        my %matrix_code = ("JTT" => 0, "WAG" => 1, "mtREV24" => 2, "cpREV45" => 3);
        my $matrix = $FORM{EMPIRICAL_MATRIX};
        if (defined $matrix && exists $matrix_code{$matrix}){
            $selecton_comm .= " -z ".$matrix_code{$matrix};
        }
    }
    if ($querySeqFoundinMSA eq "yes") {
        $selecton_comm .= " -q \'$query_seq_name_to_run\'";
    }
    if ($FORM{MODEL} eq "M7") { # beta no additional omega1. prob(beta) set to 1
        $selecton_comm .= " -p1 -Fp";
    }
    if ($FORM{MODEL} eq "M8a") { #beta + w = 1
        $selecton_comm .= " -w1 -Fw"; #do not optimize omega (omega set to 1) (M8a)
    }
    if ($FORM{MODEL} eq "M5") {
        $selecton_comm .= " -dg";
    }
    if ($FORM{CATEGORIES} ne "") {
        $selecton_comm .= " -n ".$FORM{CATEGORIES};
    }
    if ($upload_TREE_file ne "NOT_GIVEN") {
        $selecton_comm .= " -u \'$treeUpload\'";
    }
    if ($optimizeBL eq "n"){
        $selecton_comm .= " -bn";
    }
    if ($FORM{GENCODE} != 0) {
        $selecton_comm .= " -g ".$FORM{GENCODE};
    }
    print LOG "\nrun_kaks4site: running $selecton_comm\n";
    print LOG "run_kaks4site: SeqName = ***$query_seq_name_to_run***\n";
    system "cd $WorkingDir; $selecton_comm; chmod ogu+rx *"; # The program can't run from rsh

    #check for user errors in kaks4site.log
    my $kaksLogFile = $WorkingDir.$log;
    open OUTPUT, ">>$OutHtmlFile";
    unless (open (LOGFILE,"$kaksLogFile")) {
        &sys_error_exit("Error in run_kaks4site, $kaksLogFile does not exist");
    }
    while (my $line = <LOGFILE>){
        next unless ($line =~ /\S+/); # skip empty lines
        my @userError = split(/\s/,$line);
        if ($userError[0] eq "USER"){
            $line =~ s/USER ERROR://; ## query sequence not found
            # The rest of the log is consumed here, so $line ends up holding the
            # last line of the log and the checks below are applied to that line.
            # NOTE(review): this mirrors the original control flow - confirm it is intended.
            while (my $rest_line = <LOGFILE>){
                $line = $rest_line;
            }
            print OUTPUT "\n\nWarning: The query sequence name \'$FORM{msa_SEQNAME}\' is not found in the MSA file . The calculation continues. The first sequence in MSA is used as a query. \n";
            close OUTPUT;
            print LOG "\nrun_kaks4site: query sequence not found. Calculation continues with 1st sequence in MSA.\n";
        }
        if (($line =~ /found in the tree file but not found in the sequence file/) || ($line =~ /Error reading tree file/)) { #mismatch between MSA names and tree names
            # the two lines following the error line are part of the message
            my $err=$line;
            my $line1 = <LOGFILE>;
            my $line2 = <LOGFILE>;
            $line1 = "" unless defined $line1;
            $line2 = "" unless defined $line2;
            $err .= "$line1 "."$line2";
            close (LOGFILE);
            &print_to_output_and_exit("Error in tree file: $err Please check that all the names in the sequence file are identical to all the names in the tree.", "run_kaks4site: $err");
        }
        elsif ($line =~ /Bad format in tree file/){
            close (LOGFILE);
            &print_to_output_and_exit("Bad format in tree file. Please correct your tree file according to Selecton accepted format and re-submit your query.", "run_kaks4site: $line");
        }
        elsif ($line =~ /The nucleotide sequences contained the character: (.*)/) {
            my $illegal = $1; # copied at once, so the message does not depend on $1 staying intact
            close (LOGFILE);
            &print_to_output_and_exit("The nucleotide sequences file contained an illegal character: $illegal. Only the following characters are accpted: A,C,G,T,-. Please correct your file and re-submit it to Selecton.","run_kaks4site: $line");
        }
        elsif($line =~ /Unable to read file. It is required that each line is no longer than/){
            close (LOGFILE);
            &print_to_output_and_exit("Selecton does not accept DNA sequences which are longer than ".GENERAL_CONSTANTS::SELECTON_MAX_NUCLEOTIDE." nucleotides. ", "run_kaks4site: $line");
        }
    }
    close OUTPUT;
    close (LOGFILE);
}
######################################################################
# creating new files to hold DNASeqNames in DNA file and AMINO file
sub return_seq_names_to_files{
    # Copies a sequence file to a new file, replacing every header line of the
    # form ">number" by ">name", where the name is looked up by that number in
    # the given names array. All other lines are copied unchanged.
    #
    # Arguments (both file names are relative to $WorkingDir):
    #   1. name of the existing file, holding numbered sequences
    #   2. name of the file to create
    #   3. reference to the array of sequence names, indexed by sequence number
    #
    # A file that cannot be opened is only reported in the log; in that case no
    # output is written and the run continues.
    my $current_file = shift;
    my $new_file = shift;
    my $ref_seq_name_arr = shift; # reference to the sequence names array

    unless (open IN, $WorkingDir.$current_file){
        print LOG "could not open file $WorkingDir"."$current_file for reading. Names of files will be displayed as numbers.\n" ;
        return;
    }
    unless (open OUT, ">".$WorkingDir .$new_file){
        print LOG "could not open file $WorkingDir"."$new_file for writing. Names of files will be displayed as numbers.\n";
        close IN;
        return;
    }
    while (my $line = <IN>){
        if ($line =~ />(\d+)/){
            print OUT ">".$ref_seq_name_arr->[$1]."\n";
        }
        else{
            print OUT $line;
        }
    }
    close OUT;
    close IN;
    return;
}
######################################################################################
sub change_tree_file_names{
    # Rewrites a tree file in which the sequences appear as numbers into a tree
    # file holding the sequence names, and replaces branch lengths equal to
    # zero by a very small positive value.
    #
    # Arguments:
    #   1. reference to the array of sequence names, indexed by sequence number
    #   2. path of the tree file to read
    #   3. path of the tree file to write
    #
    # When the input tree cannot be opened this is only logged and the routine
    # returns; when the output tree cannot be opened sys_error_exit() is called.
    my $ref_sequences_names = shift;
    my $input_tree = shift;
    my $output_tree = shift;
    my ($tree, $err);
    print LOG "change_tree_file_names : going to change numbers from tree $input_tree to names in file $output_tree\n";
    unless (open TREE, $input_tree){
        print LOG "change_tree_file_names : could not open the file $input_tree for reading. the tree file will be presented with numbers\n";
        return;
    }
    #check validity of input tree file
    # Only the first line of the file is read; presumably the whole tree is
    # written on a single line - TODO confirm.
    $tree = <TREE>;
    close TREE;
    my @tree_arr = split(/\(/, $tree);
    my @sub_tree = ();
    my @temp_arr;
    my $sub_counter = 0;
    # building the array @sub_tree, so that each cell will hold maximum one sequence name
    for(my $i=0; $i<@tree_arr; $i++){
        if ($tree_arr[$i] ne ""){
            $tree_arr[$i] = "(".$tree_arr[$i];
        }
        if ($tree_arr[$i] =~ m/.*,.+/){
            @temp_arr = split(/,/, $tree_arr[$i]);
            foreach (@temp_arr){
                $sub_tree[$sub_counter] = $_.",";
                $sub_counter++;
            }
        }
        else{
            $sub_tree[$sub_counter] = $tree_arr[$i];
            $sub_counter++;
        }
    }
    # rebuilding the tree, this time replacing the sequences names with the names found in the DNA input file
    my $final_tree = "";
    my ($exp, $rest_of_exp, $new_rest_exp);
    my $seq_found = "no";
    for (my $k=1; $k<@sub_tree; $k++){
        #in this part we wish to split the expression to 2 parts; left part : (?seq_name ; right part: all the rest
        if ($sub_tree[$k] ne ""){
            if ($sub_tree[$k] =~ m/(.+)(:.+)/){
                $exp = $sub_tree[$k];
                $rest_of_exp = "";
                # peel the ":..." parts off the right end, one at a time
                while ($exp =~ m/(.+)(:.+)/){
                    $exp = $1;
                    $rest_of_exp = $2.$rest_of_exp;
                }
            }
            # in case the expression is of format: seq_name:distance,
            elsif($sub_tree[$k] =~ m/(.+)(\);.+)/){
                $exp = $1;
                $rest_of_exp = $2;
                while ($exp =~ m/(.+)(\))/){
                    $exp = $1;
                    $rest_of_exp = $2.$rest_of_exp;
                }
            }
            # in case the expression is of format: seq_name)*,
            elsif($sub_tree[$k] =~ m/(.+)(\)?.+)/){
                $exp = $1;
                $rest_of_exp = $2;
                while ($exp =~ m/(.+)(\))/){
                    $exp = $1;
                    $rest_of_exp = $2.$rest_of_exp;
                }
            }
            # if the length (value after the ":") is equal to zero, we replace it with a very small value,
            # because the selecton.exe cannot calculate trees with zeros
            $new_rest_exp = "";
            while($rest_of_exp =~ m/(.?:)(\d\.?\d*)(.+)/){
                if(!($2>0) && !($2<0)){
                    $rest_of_exp = $3;
                    $new_rest_exp .= $1."0.000000001";
                }
                else{
                    $rest_of_exp = $3;
                    $new_rest_exp .= $1.$2;
                }
            }
            $new_rest_exp .=$rest_of_exp;
            $rest_of_exp = $new_rest_exp;
            # what is left in $exp is an optional "(" followed by the sequence number
            $exp =~ m/(\(?)(.+)/;
            $final_tree.= $1.$ref_sequences_names->[$2].$rest_of_exp;
        }
        #an empty cell stands for a "(" sign
        else{
            $final_tree.= "(";
        }
    }
    # drop the "," which was added after the last element
    if ($final_tree =~ m/,$/){
        chop $final_tree;
    }
    unless (open NEW_TREE, ">".$output_tree){
        &sys_error_exit("change_tree_file_names:: cannot open file $output_tree for writing.");
    }
    print LOG "change_tree_file_names : printing edited tree to file $output_tree and chmod it.\n";
    print NEW_TREE $final_tree;
    close NEW_TREE;
    chmod 0755, $output_tree;
}
######################################################################
# Prepares the variables to be sent to the "pipe" script, then calls the script with all needed vars.
sub prepare_pipe{
    # Translates the run parameters into the descriptive values expected by
    # pipe_for_selecton::create_pipe(), calls it, and ends the run through
    # sys_error_exit() when the pipe error file holds a message afterwards.
    #
    # Argument: reference to the array of sequence names, indexed by sequence number.
    #
    # The defaults below used to be written as
    #     (cond) ? $var = A : $var = B;
    # Since ?: binds tighter than =, such a statement always assigns B, so the
    # values "UPLOADED", "none", 8, "True" and the MEC matrix were never used.
    # They are now plain conditional expressions.
    my $sequences_names = shift;
    print LOG "\nEentered prepare_pipe()\n";
    # vars that should be edits before sent to the pipe script:
    my ($pipe_pdb_id, $pipe_chain, @pipe_Model, $model_ref, $pipe_distribute_categories, $pipe_optimizeBL, @pipe_genetic, $genetic_ref, $pipe_dna_input, $pipe_query, $pipe_precision, $pipe_empirical);

    $pipe_pdb_id = ($FORM{pdb_ID} eq "") ? "UPLOADED" : $FORM{pdb_ID};
    $pipe_chain = ($FORM{chain} eq "") ? "none" : $FORM{chain};

    # description of the evolutionary model
    my %model_description = (
        "M8"  => 'Positive selection enabled (M8, beta + w >= 1)',
        "M8a" => 'Null model: no positive selection(M8a, beta + w = 1)',
        "M7"  => 'Null model: no positive selection(M7, beta)',
        "M5"  => 'Positive selection enabled(M5, gamma)',
        "MEC" => 'Mechanistic Empirical Combination Model (MEC)',
    );
    if (defined $FORM{MODEL} && exists $model_description{$FORM{MODEL}}){
        $pipe_Model[0] = $model_description{$FORM{MODEL}};
    }
    else {
        $pipe_Model[0] = "IGNORED";
    }
    $model_ref = \@pipe_Model;
    $pipe_empirical = ($FORM{MODEL} eq "MEC") ? $FORM{EMPIRICAL_MATRIX} : "IRRELEVANT";

    if ($epsilonPrecision == 0.1) {$pipe_precision = "Intermediate precision";}
    elsif ($epsilonPrecision == 1) {$pipe_precision = "Low precision- faster run";}
    elsif ($epsilonPrecision == 0.01) {$pipe_precision = "High precision- slower run";}
    else {$pipe_precision = "DEFAULT";}

    $pipe_distribute_categories = ($FORM{CATEGORIES} eq "") ? 8 : $FORM{CATEGORIES};
    $pipe_optimizeBL = ($optimizeBL eq "y") ? "True" : "False";

    # description of the genetic code; the position in the list is the code number
    my @genetic_code_names = (
        "Nuclear Standard", "Nuclear Blepharisma", "Nuclear Ciliate", "Nuclear Euplotid",
        "Mitochondria Vertebrate", "Mitochondria Invertebrate", "Mitochondria Yeast",
        "Mitochondria Ascidian", "Mitochondria Echinoderm", "Mitochondria Flatworm",
        "Mitochondria Protozoan",
    );
    $pipe_genetic[0] = "IGNORED";
    for (my $code = 0; $code < @genetic_code_names; $code++){
        if ($FORM{GENCODE} == $code){
            $pipe_genetic[0] = $genetic_code_names[$code];
            last;
        }
    }
    $genetic_ref = \@pipe_genetic;

    # an unknown query is passed as a pair of double quotes
    $pipe_query = ($sequences_names->[$query_seq_name_to_run] eq "") ? "\"\"" : $sequences_names->[$query_seq_name_to_run];

    #since there is only 1 input file, there will be only var sent to the pipe script. In order that the pipe script will know what kind of input it is - a short string is added at the beginning.
    if ( $upload_unaligned_file_dna ne "no"){
        $pipe_dna_input = "SELECTON_UN".$upload_unaligned_file_dna;
    }
    elsif ($upload_MSA_file_dna ne "no"){
        $pipe_dna_input = "SELECTON_MSA".$upload_MSA_file_dna;
    }
    else{
        $pipe_dna_input = "no_dna_input";
    }

    # Start with an empty error file that has the permissions set below. The
    # file used to be opened for reading here, which neither created it nor
    # cleared a message left by an earlier attempt.
    # NOTE(review): assumes create_pipe() writes its errors into this file - confirm.
    if (open ERROR, ">".$WorkingDir.$pipe_error){
        close ERROR;
    }
    else {
        print LOG "prepare_pipe: could not create the file $WorkingDir"."$pipe_error: $!\n";
    }
    chmod 0755, $WorkingDir.$pipe_error;

    print LOG "running pipe_for_selecton::create_pipe with parameters:\n";
    # logged in the same order as the values are passed in the call below
    print LOG "$run_name $cgi_log_file $WorkingDir $rsml $FGiJ_pipe_pdb $pdbUpload $pipe_pdb_id $pipe_chain $pipe_dna_input $pipe_query $model_ref $pipe_distribute_categories $pipe_optimizeBL $genetic_ref $pipe_error $pipe_precision $pipe_empirical\n";
    pipe_for_selecton::create_pipe($run_name, $cgi_log_file, $WorkingDir, $rsml, $FGiJ_pipe_pdb, $pdbUpload, $pipe_pdb_id, $pipe_chain, $pipe_dna_input, $pipe_query, $model_ref, $pipe_distribute_categories, $pipe_optimizeBL, $genetic_ref, $pipe_error, $pipe_precision, $pipe_empirical);

    # checking if there was an error and the pipe file was not created properly
    if (-s $WorkingDir.$pipe_error){
        unless (open ERROR, $WorkingDir.$pipe_error){
            &sys_error_exit("An error was found when trying to create the pipe file for Selecton.\nThe Error message should be written to file $WorkingDir"."$pipe_error, however this file could not be opened.\n");
        }
        my $pipe_error_message = join("", <ERROR>);
        close ERROR;
        &sys_error_exit("An error was found while trying to create the pipe file : ".$pipe_error_message);
    }
}
######################################################################
sub sys_error_exit{
    # Ends the run after a system error: the system-error and contact texts are
    # appended to the HTML page, the given message is written to the log, the
    # mails are sent, the page reload is stopped and the script exits.
    #
    # Argument: the error message for the log and for the mail.
    my $err = shift;

    open(OUTPUT, ">>" . $OutHtmlFile);
    print OUTPUT $SysErrorDef . $ContactDef;
    close(OUTPUT);

    print LOG "\n" . $err . "\n";

    &send_mail();
    &send_mailSelecton("SYSTEM ERROR\n$err");
    &stop_reload;
    exit;
}
##########################################################################################
# Stops the reload of the output page
sub stop_reload {
sleep 5;
open OUTPUT, "<$OutHtmlFile";
flock OUTPUT,2;
my @output = ;
flock OUTPUT,8;
close OUTPUT;
open OUTPUT, ">$OutHtmlFile";
flock OUTPUT,2;
foreach my $line (@output){ # we remove the refresh lines and the button which codes for Selecton cancelled job
unless ($line =~ /REFRESH/ or $line =~ /NO-CACHE/ or $line =~ /ACTION=\"cgi.+kill/ or
$line =~ /VALUE="Cancel Selecton Job"/ or $line =~/TYPE=hidden NAME="Qstat_file"/ or $line =~/TYPE=hidden NAME="selecton_http"/ or $line =~ /TYPE=hidden NAME="run_no"/ or $line =~ /TYPE=hidden NAME="cgi_pid"/ or $line =~ /Estimated run time is:/ or $line =~ /kill_process.cgi/ or $line =~ /