Katzlab dd76ab1d12 Added PTL2 Scripts
These are PTL2 files from Auden 2/9
2023-02-14 11:20:52 -05:00

1382 lines
62 KiB
Perl

#!/usr/bin/perl
use CGI;
use CGI qw(:standard);
use CGI::Carp qw(warningsToBrowser fatalsToBrowser);
use strict;
use Storable;
use Bio::SeqIO;
#use Bio::TreeIO;
use Bio::Tree::NodeI;
use Bio::Root::Root;
use Bio::Tree::TreeI;
use lib "/bioseq/bioSequence_scripts_and_constants";
use GENERAL_CONSTANTS;
use SELECTON_CONSTANTS;
use BIOSEQUENCE_FUNCTIONS;
use TREE_parser;
use lib "/bioseq/Selecton/external_scripts";
use codonAlign;
###### READING DATA FROM FORM - FIRST TO DO
# Everything read in this section comes straight from the submitted web form
# and must be treated as untrusted input.
my $queryForm = CGI->new;
my %FORM; #hash with form information
#***********************************
### input file, ref seq. and email address
my $upload_unaligned_file_dna = $queryForm->param('userFILEunaligned');
my $upload_MSA_file_dna=$queryForm->param('userFILEaligned');
$FORM{msa_SEQNAME} = $queryForm->param("msa_SEQNAME");
$FORM{email_address} = $queryForm->param("email_add");
my $recipient = $FORM{email_address};
# check that the user who submits this run has not exceeded the maximal number of runs
my $user_ip = $ENV{'REMOTE_ADDR'};
BIOSEQUENCE_FUNCTIONS::check_if_user_is_allowed("selecton",$user_ip, $recipient);
#***************************************
### advanced options
$FORM{pdb_ID} = $queryForm->param("pdb_ID");
my $upload_PDB_file = $queryForm->param("pdb_FILE");
# placeholder stored in $FORM{pdb_ID} when the structure is uploaded as a file
my $pdbUploadName = "FILE";
$FORM{chain} = $queryForm->param("chain");
$FORM{MODEL} = $queryForm->param("MODEL");
$FORM{EMPIRICAL_MATRIX} = $queryForm->param("EMPIRICAL_MATRIX"); #adid added the mec model
my $empiricalMatrix = $FORM{EMPIRICAL_MATRIX};
my $epsilonPrecision = $queryForm->param("PRECISION"); # added v2.2
my $model=$FORM{MODEL}; # NOTE: if more models are added (or changed), the info should be edited in 2 more places, as text: 1. in the OUTPUT file, in "start_output_html" routine, 2. in the "prepare_pipe" routine.
my $method="Bayesian"; ## ugly patch, since we removed ML: DO NOT REMOVE THIS LINE, it is necessary for colorCoding
#$FORM{DISTRIBUTION} = $queryForm->param("DISTRIBUTION");
$FORM{CATEGORIES} = $queryForm->param("CATEGORIES");
my $upload_TREE_file = $queryForm->param("tree_FILE");
$FORM{BL} = $queryForm->param("BL");
# branch-length optimisation is on unless the BL form field came back empty
my $optimizeBL="y"; #true if checked
if ($FORM{BL} eq ""){
    $optimizeBL="n";
}
$FORM{GENCODE} = $queryForm->param("GENCODE"); # NOTE: if more genetic codes are added (or changed), the info should be edited in the "prepare_pipe" routine.
#****************************************
#### TRANSLATE to LOWER case the PDB ID and to UPPER case the CHAIN
$FORM{pdb_ID} =~ tr/A-Z/a-z/;
$FORM{chain} =~ tr/a-z/A-Z/;
# Both values are interpolated into shell command lines further down this
# file (the ls/cp/gunzip calls on the PDB path and the
# extract_info_from_pdb.pl command line), so strip every character that is
# not a plain ASCII word character before either can reach a shell.
$FORM{pdb_ID} =~ s/[^A-Za-z0-9_]//g;
$FORM{chain} =~ s/[^A-Za-z0-9_]//g;
#****************************************
#### set pdb_ID to FILE if uploading a file
if ($upload_PDB_file ne "") {
    $FORM{pdb_ID} = $pdbUploadName;
}
#**************************************
##### variables #######
my $querySeqFoundinMSA="no";
# name of the query sequence, as typed by the user in the form
my $query=$FORM{msa_SEQNAME};
#**********************************
##### GENERAL PATHS
# $^T is the time (epoch seconds) at which this script started, so the run
# name is a timestamp.
# NOTE(review): two submissions that start within the same second would get
# the same run name and share one working directory - confirm whether this is
# guarded against elsewhere.
my $run_name = $^T; #the running dir NAME old $$
my $ibis_external_scripts_path = "/bioseq/bioSequence_scripts_and_constants/";
# per-run directory on disk; created by the child process in the main section
my $WorkingDir = GENERAL_CONSTANTS::SERVERS_RESULTS_DIR."Selecton/" . $run_name . "/";
my $http_path = GENERAL_CONSTANTS::SELECTON_URL;
my $job_canceled_page = $http_path.'/cancel_page.html';
# URL counterpart of $WorkingDir
my $WWWdir = $http_path."/results/" . $run_name . "/";
my $PdbPath = GENERAL_CONSTANTS::PDB_DIVIDED;
#***************************************
### PROGRAMS OR PERL OR EXECUTABLES
my $extractPDBinfo = $ibis_external_scripts_path . "extract_info_from_pdb.pl";
# target of the "Cancel Selecton Job" form written by start_output_html
my $kill_job_script = "/cgi-bin/kill_process.cgi";
# shell script generated in the main section and handed to qsub
my $qsub_script = $WorkingDir."qsub.sh";
my $runClac_inQ = "/bioseq/Selecton/selecton_run_calc.pl";
# the aligners are invoked through ssh on the biocluster host; the previous
# local paths are kept in the trailing comments
my $clustalw= 'ssh bioseq@biocluster clustalw'; #'/usr/local/bin/clustalw';
my $muscle = 'ssh bioseq@biocluster muscle'; #'/usr/local/bin/muscle';
#***************************************
### Sending e-mail from ibis
my $send_email_dir = GENERAL_CONSTANTS::SEND_EMAIL_DIR_IBIS;
my $smtp_server = GENERAL_CONSTANTS::SMTP_SERVER;
my $userName = GENERAL_CONSTANTS::ADMIN_USER_NAME;
my $userPass = GENERAL_CONSTANTS::ADMIN_PASSWORD;
my $email_subject;
my $email_message;
my $email_system_return;
#***************************************
### output related paths
my $InpSeqFile = $WorkingDir . "path.txt"; #file containing ls result of pdb
my $OutputURL = $WWWdir ."output.html"; #link to output file
my $OutHtmlFile = $WorkingDir . "output.html"; #OUTPUT to the user
my $Logs_dir = GENERAL_CONSTANTS::SERVERS_LOGS_DIR."Selecton/";
# log written by this CGI (opened as LOG in the main section)
my $OutLogFile = $Logs_dir.$run_name.".log";
# created (touch + chmod) in the main section just before the qsub submission
my $QsubLogFile = $Logs_dir.$run_name."_Q.log";
#**************************************
##### SPECIFIC TO THIS RUN variables
my $pid; #to set the pid of the child
my $PdbFileDir = ""; #pdb file specific dir on path (2 letters) to check
my ($cgi_pid, $cmd);
my @sequences_names = (); # This array will hold the names of the sequences, as they appear in the input file.
# name of the file (inside $WorkingDir) that receives the raw qsub answer
my $Qstat_No_file = "QSTAT_NO";
# "none" makes the main section request the maximal walltime from qsub
my $estimated_run_time = "none";
### making absolute path file for ATEN pdb1ed5.ent.Z or pdbFILE.ent.Z if file was uploaded
my $PdbFileName = "pdb" .$FORM{pdb_ID}. ".ent.gz";
my $PdbFileNameUnc = "pdb" .$FORM{pdb_ID}. ".ent";
my $PdbPrefix = "pdb" .$FORM{pdb_ID};
if ($FORM{pdb_ID} eq "") {
$PdbPrefix="";
}
# file names in use only in case of PDB reading, for the use of the script $extractPDBinfo
my ($pdb_data) = ($PdbFileNameUnc) =~ /(\w+)/; #extracting prefix, put it in first var
my $title_file = $pdb_data.".title";
my $pdb_fasta = $pdb_data.".pdbfasta";
$pdb_data.=".pdbdata";
my $pdb_to_fasta_error = "pdb_to_fasta.error";
my $pdb_msa = $PdbPrefix . "_PDB_MSA.pdbfasta";
my $clustal_outFile = $PdbPrefix . "_PDB_MSA.out"; # the outfile for clusalw
my $clustal_aligned_file = $PdbPrefix."_PDB_MSA.aln";
#*****************************
### FILE NAMES IN USE
my $dnaMSAprefix= $PdbPrefix ."DNA";
my $aminoMSAprefix= $PdbPrefix ."AMINO";
my $fileUploadName_dna_unaligned=$dnaMSAprefix."_unaligned".".txt";
my $fileDna_aligned = $dnaMSAprefix.".msa";
my $fileName_amino_aligned = $aminoMSAprefix.".msa";
my $fileUploadPath_dna_unaligned=$WorkingDir . $fileUploadName_dna_unaligned; #uploaded file from user - unaligned
my $copied_dna_unaligned = $WorkingDir . "COPY_".$fileUploadName_dna_unaligned;; # a copy of the same file
my $wwwfileUploadPath_dna_unaligned = $WWWdir . $fileUploadName_dna_unaligned;#www path : uploaded file from user - unaligned
my $fileDnaPath_aligned = $WorkingDir . $fileDna_aligned; #dna file after alignment
my $fileAminoPath_aligned = $WorkingDir . $fileName_amino_aligned; #amino file after alignment
($PdbFileDir) = ($FORM{pdb_ID}) =~ /\w(\w{2})/;
my $PdbFilePath = $PdbPath . $PdbFileDir . "/" . $PdbFileName;
my $codonAlignLogFile=$WorkingDir."codonAlign.log";
my $treeUpload = $WorkingDir . "userTree.txt";
my $copied_treeUpload = $WorkingDir . "COPY_userTree.txt"; # a copy of the same file
my $userTree = $WWWdir . "userTree.txt";
my $userTree_copy = $WWWdir . "COPY_userTree.txt";
my $PdbFile = $PdbPrefix . ".ent";
my $sequences_names_file = "sequences.names";
my $runCalcInput = "runCalcInput.txt";
my $FormInput = "formInput.txt";
my $qsub_ans = $WorkingDir."qsub_ans.txt";
my $statistics_file = SELECTON_CONSTANTS::STATISTICS_FILE;
#*****************************
### pdb related links
my $pdbUpload = $WorkingDir . $PdbFileNameUnc; #name for an uploaded PDB FILE
#**********************************
### HTML definitions
my $ErrorDef = "<font size=+3 color='red'>ERROR! Selecton session has been terminated: </font>\n";
my $SysErrorDef = "<p><font size=+3 color='red'>SYSTEM ERROR - Selecton session has been terminated!</font><br><b>Please verify that there are no errors in your input file/s, and try to run Selecton again. Specifically, make sure your file is in the <a href=\"$http_path/faq.html#q4\">correct format</a>, and refer to the <a href=\"$http_path/faq.html\">FAQ</a> for further assistance.</b></p>\n";
my $tree_faq = $http_path.'/faq.html#q8';
my $SystemError = "<b>A system error occured during the calculation. Please try to run Selecton again in a few minutes.</b>\n";
my $mail = "mailto:".GENERAL_CONSTANTS::ADMIN_EMAIL."?subject=Selecton%20Run%20No:%20$run_name";
my $ContactDef = "<H3><center>For assistance please <a href=\"$mail\">contact us</a> and mention this number: $run_name</H3>\n";
my $QuickHelp = 'http://consurf.tau.ac.il/quick_helpver3.html#chain'; # REMARK : ugly, should change
my $status_faq = $http_path."/faq.html#q13";
my $reload_interval = 30;
#$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
#***************************************************
#### MAIN CGI THAT FORKS AND USES THE FILES TO RUN
#***************************************************
# Flow: the process forks and the parent exits at once. The child
#   1. creates the run directory and the log file, and records the run in the
#      server-wide lists,
#   2. writes the initial status page and redirects the browser to it,
#   3. detaches from the web server (closes STDIN/STDOUT/STDERR),
#   4. saves and validates the PDB, DNA and tree inputs,
#   5. writes the qsub script, submits it and registers the job number.
#********** FATHER **********
##### TO BE DONE BEFORE LONG PROCESS
if ($pid = fork) {
    exit;
}
#******************************
#### CHILD ###
elsif (defined $pid) {
    # create the WorkingDir
    mkdir $WorkingDir;
    system 'echo "(touch '.$OutLogFile.'; chmod 750 '.$OutLogFile.')" | /bin/tcsh';
    open LOG, ">$OutLogFile";
    print LOG "\n************** LOG FILE *****************\n\n";
    print LOG "\nBegin time: ";
    my $curr_time = &printTime();
    print LOG "\nuser's email is: $recipient\n";
    # add this run to the log
    # (flock operation 2 = exclusive lock, 8 = unlock)
    open LIST, ">>".GENERAL_CONSTANTS::SELECTON_LOG;
    flock LIST, 2;
    print LIST $curr_time." $run_name $user_ip $recipient\n";
    flock LIST, 8;
    close LIST;
    # print the user's ip and run name in the running jobs list
    open RUN_LIST, ">>".GENERAL_CONSTANTS::SELECTON_RUNNING_JOBS;
    flock RUN_LIST, 2;
    print RUN_LIST "$run_name $user_ip $recipient\n";
    flock RUN_LIST, 8;
    close RUN_LIST;
    #********************************************
    ##### CREATE AND START file for output UPDATE
    &start_output_html;
    ### Move directly to the output file
    print "Location: $OutputURL\n\n";
    # unmark comment if you want to print to the screen a message, instead of a new location:
    #print header;
    # print start_html;
    # print h3("Due to a temporal error in Selecton server, the results page can not be viewed via the browser. Please save the link to your results, as soon as the error is fixed - you will be able to view it in this url:<br> $OutputURL<br><br>We deeply apologize for the inconvenience.<br><br><br><br>");
    # print $ContactDef;
    # print end_html;
    #**************************************
    ##### disconnecting CHILD flush buffers
    # from here on nothing can be sent to the browser; messages for the user
    # have to go through the output HTML file
    close(STDIN);
    close(STDOUT);
    close(STDERR);
    # send user's e-mail to a special file
    open USER_MAIL, ">$WorkingDir" . "user_email.txt";
    print USER_MAIL $FORM{email_address};
    close USER_MAIL;
    chmod 0600, $WorkingDir. "user_email.txt";
    ### if PDB ID: checking if PDB FILE exists on aten pdb DB
    if ($FORM{pdb_ID} ne "" and $FORM{pdb_ID} ne $pdbUploadName) {
        &check_copy_uncomp_pdbID;
    }
    ### PDB file: UPLOAD PDB file
    elsif ($upload_PDB_file ne "") {
        &upload_file ($pdbUpload, $upload_PDB_file);
    }
    ### Verifying that user supplied a file:
    if ($upload_unaligned_file_dna eq "" and $upload_MSA_file_dna eq "") {
        my $err1="No file provided by the user. Aborting, this is supposed file name:"." $upload_unaligned_file_dna";
        &print_to_output_and_exit($err1,$err1);
    }
    ### to extract SEQRES and ATOMS fasta format from pdb aa
    if ($upload_PDB_file ne "" or $FORM{pdb_ID} ne "") { #if user ran selecton with a PDB struct.
        &extract_PDB_info();
    }
    ### update page-a.htm file with title and ...
    #&update_page_a_html; # REMARK: i think that this is not neceserray if not having PE
    ### UPLOAD MSA file
    # aligned and unaligned uploads are both saved under $fileUploadPath_dna_unaligned
    if ($upload_unaligned_file_dna ne ""){ # user uploaded an un-aligned file
        print LOG "\nUser uploaded UNALIGNED DNA file. seqname: $FORM{msa_SEQNAME}\n";
        &upload_file ($fileUploadPath_dna_unaligned, $upload_unaligned_file_dna); # the first is the UNIX path, the second is the name user gave
    }
    else{ # user uploaded an aligned file
        print LOG "\nUser uploaded ALIGNED DNA file. seqname: $FORM{msa_SEQNAME}\n";
        &upload_file ($fileUploadPath_dna_unaligned, $upload_MSA_file_dna); # the first is the UNIX path, the second is the name user gave
    }
    ### UPLOAD TREE file
    if ($upload_TREE_file ne "") {
        &upload_file ($treeUpload, $upload_TREE_file);
        &convertNewline($treeUpload) ;
        &removeBPvalues($treeUpload);
        &check_validity_tree_file($treeUpload);
    }
    # converts Mac or DOS newline to Unix newline
    &convertNewline($fileUploadPath_dna_unaligned);
    # create a copy to the original DNA file, since the names of the sequences will be replaced with numbers.
    system("mv $fileUploadPath_dna_unaligned $copied_dna_unaligned");
    # if the user supplies an unaligned file - aligns according to codons, and then translate.
    # if the user supplies a codon-aligned file - verifies that there are no internal stop codons and that ORF/3=whole and translates
    &codonAlign(\@sequences_names);
    # in case there is a tree file: changes its names accordingly, saving the original file under the name $copied_treeUpload
    if ($upload_TREE_file ne "") {
        system("mv $treeUpload $copied_treeUpload");
        &change_tree_file_names(\@sequences_names, "change_to_numbers", $copied_treeUpload, $treeUpload);
    }
    ###**** SET CORRECT FORMAT MSA file - extract query seq and verify no. of seqs >=5
    my $number_of_sequences_in_msa = &msa_format_extract(\@sequences_names);
    # create a file that will hold all the relevant data for this run.
    &print_data_files;
    system 'echo "(touch '.$WorkingDir.'std.out; chmod 755 '.$WorkingDir.'std.out)" | /bin/tcsh';
    # write the two-line shell script that the queue will execute
    unless (open QSUB_SH, ">$qsub_script") {
        &sys_error_exit("cannot open the file $qsub_script for writing $!\n");
    }
    print QSUB_SH '#!/bin/sh';
    print QSUB_SH "\nperl $runClac_inQ $WorkingDir $runCalcInput $FormInput > $WorkingDir"."std.out";
    close QSUB_SH;
    chmod 0755, $qsub_script;
    print LOG "submitting qsub job \"SELECTON_$run_name\"\n";
    system "touch $QsubLogFile";
    chmod 0744, $QsubLogFile;
    #chmod 077, $QsubLogFile;
    #submitting the qsub_script using a qsub command.
    my $qsub_command = "ssh bioseq\@biocluster qsub -q bioseq -e $WorkingDir -o $WorkingDir -N SELECTON_$run_name $qsub_script";
    if ($estimated_run_time ne "none"){
        if ($model eq "M8" and $number_of_sequences_in_msa>48){
            # the estimation for this case is not very accurate, so it is
            # better to give it the maximum run time
            $qsub_command.=" -l walltime=".GENERAL_CONSTANTS::MAX_WALLTIME;
        }
        else{
            $qsub_command.=" -l walltime=$estimated_run_time";
        }
    }
    else{
        $qsub_command.=" -l walltime=".GENERAL_CONSTANTS::MAX_WALLTIME;
    }
    print LOG "going to run qsub command: ".$qsub_command."\n";
    my $qsub_job_no = `$qsub_command`;
    chomp($qsub_job_no); # the command returns an extra new line character
    # writing the job number (the raw qsub answer) to a file.
    open JOB_NO, ">".$WorkingDir.$Qstat_No_file;
    print JOB_NO $qsub_job_no;
    close JOB_NO;
    chmod 0644, $WorkingDir.$Qstat_No_file;
    # extracting only the process (Q) number.
    # $1 must only be read after a successful match: after a failed match it
    # still holds the capture of an earlier, unrelated regex, which would then
    # be logged and enqueued as if it were the job number.
    if ($qsub_job_no =~ /(\d+)/) {
        $qsub_job_no = $1;
    }
    else {
        # NOTE(review): sys_error_exit is expected to terminate the run, as its
        # other call sites in this file assume - confirm.
        &sys_error_exit("qsub did not return a job number. Command: $qsub_command ; answer: '$qsub_job_no'\n");
    }
    print LOG "\nafter submitting qsub job. JOB NUMBER: $qsub_job_no\n";
    my $ans = BIOSEQUENCE_FUNCTIONS::enqueue_job($qsub_job_no, "Selecton", $run_name);
    print LOG $ans if ($ans ne "ok");
    #unless (open LIST, ">>".GENERAL_CONSTANTS::QUEUING_JOBS){
    # print LOG "Could not open file ".GENERAL_CONSTANTS::QUEUING_JOBS.". Reason: $!\nThe job was not listed in the queuing_jobs list.\n";
    # &printTime();
    #}
    #else{
    # flock LIST, 2; # locks the list, so no other process will write to it. On the same time - if the list is currently locked by another process - it waits until the list file is realeased. The "2" and "8" are the operation symbols for "lock" and "unlock".
    # print LIST "$qsub_job_no Selecton $run_name ".&printTime()."\n";
    # flock LIST, 8;
    # close LIST;
    #}
    print LOG "\n\nCGI ended successfully.";
    close LOG;
}
else{
    # fork returned undef
    die "Can not fork the process, please contact ".GENERAL_CONSTANTS::ADMIN_EMAIL."\n";
}
exit;
###################################################################################
# SUB ROUTINES #
###################################################################################
# Start writing the output web page of Selecton.
# Creates $OutHtmlFile with the page header, the job-status placeholders
# (<!--job_stat-->, <!--job_pass-->, <!--job_time-->), the list of running
# parameters and the "Cancel Selecton Job" form. Takes no arguments.
# If the file cannot be opened the failure is logged and the process exits.
# Every value that originates from the web form is HTML-escaped before it is
# written, because the page is served back to browsers.
sub start_output_html {
    system 'echo "(touch '.$OutHtmlFile.'; chmod 755 '.$OutHtmlFile.')" | /bin/tcsh';
    unless (open OUTPUT, ">$OutHtmlFile"){
        print LOG "\nstart_output_html: Cannot open the output file $OutHtmlFile\n";
        exit;
    }
    print LOG "\nstart_output_html: Opening the file $OutHtmlFile, and change the permissions of the WorkingDir\n";

    # HTML-escape a form value (undef is shown as an empty string)
    my $escaped = sub {
        my $value = defined $_[0] ? "$_[0]" : "";
        return CGI::escapeHTML($value);
    };
    # Browsers may send the full client-side path of an uploaded file: keep
    # only what follows the last "/" or "\", then escape it for display
    my $shown_file_name = sub {
        my $path = "$_[0]";
        my $base = ($path =~ m/^.*(?:\\|\/)(.*)/) ? $1 : $path;
        return $escaped->($base);
    };

    # NOTE: the heredoc body is emitted verbatim, so it must stay unindented
    print OUTPUT <<EndOfHTML;
<HTML>
<HEAD> <META HTTP-EQUIV="REFRESH" CONTENT=$reload_interval> </HEAD>
<HEAD> <META HTTP-EQUIV="PRAGMA" CONTENT="NO-CACHE"> </HEAD>
<TITLE>Selecton Results $run_name</TITLE>
<style type="text/css">
#menu {
text-decoration: none;
color: white;
font-size: 12px;
font-weight: 700;
}
</style>
</HEAD>
<BODY bgcolor="#FFF5EE">
<table width=100% bgcolor="#400080"> <tr><td>
<table border=0 cols=6 width=450 bgcolor="#400080" cellpadding=1 cellspacing=0 >
<tr>
<td align=center><a href="/index.html" id=menu target=_top>HOME</a></td>
<td align=center><a href="/overview.html" id=menu target=_top>OVERVIEW</a></td>
<td align=center><a href="/gallery.html" id=menu target=_top>GALLERY</a></td>
<td align=center><a href="/faq.html" id=menu target=_top>FAQ</a></td>
<td align=center><a href="/credits.html" id=menu target=_top>CREDITS</a></td>
</tr>
</table>
</td><tr></table>
<H1 align=center>Selecton Job Status Page</h1>
<blockquote>
<p><font face=Verdana size=2>
<br><b>Selecton is now processing your request.<br><font size=+1>
<!--job_stat--><a href ="$status_faq" target=stat_faq>Your job status is:</a> Queued<br>
<!--job_pass-->The time that passed since submitting the query is: 00:00<br>
<!--job_time--Estimated run time is: -->
</font>
Please note this may be a lengthy process and an email will be sent to the address you supplied once the calculation is finished.</b><br>
This page will be automatically updated every $reload_interval seconds. You can also reload it manually.<br>
Once the job has finished, several links to the output files will appear below.
<br><br>
If you wish to view these results at a later time without recalculating
them, please bookmark this page. The results will be kept on the server for three months.
</font></p>
<h4><font face=Verdana><u>Running Parameters:</u></h4>
EndOfHTML

    ### running parameters: only the options that were actually supplied are listed
    print OUTPUT "<p><font face=Verdana size=2>\n";
    print OUTPUT "PDB ID = ".$escaped->($FORM{pdb_ID})." <br>\n" if ($FORM{pdb_ID} ne "");
    if ($upload_PDB_file ne ""){
        print OUTPUT "PDB file = ".$shown_file_name->($upload_PDB_file)." <br>\n";
    }
    print OUTPUT "Chain identifier = ".$escaped->($FORM{chain})." <br>\n" if ($FORM{chain} ne "");
    if ($upload_unaligned_file_dna ne ""){
        print OUTPUT "DNA unaligned file = ".$shown_file_name->($upload_unaligned_file_dna)." <br>\n";
    }
    if ($upload_MSA_file_dna ne ""){
        print OUTPUT "DNA MSA file = ".$shown_file_name->($upload_MSA_file_dna)." <br>\n";
    }
    print OUTPUT "Query sequence name in MSA file = ".$escaped->($FORM{msa_SEQNAME})." <br>\n" if ($upload_unaligned_file_dna ne "");
    # the model descriptions are fixed texts, so the raw form value is never printed
    print OUTPUT "Model = Positive selection enabled (M8, beta + w >= 1) <br>\n" if ($model eq "M8");
    print OUTPUT "Model = Null model: no positive selection(M8a, beta + w = 1) <br>\n" if ($model eq "M8a");
    print OUTPUT "Model = Null model: no positive selection(M7, beta)<br>\n" if ($model eq "M7");
    print OUTPUT "Model = Positive selection enabled(M5, gamma) <br>\n" if ($model eq "M5");
    print OUTPUT "Model = Mechanistic Empirical Combination Model (MEC) <br>\n" if ($model eq "MEC");
    if ($model eq "MEC"){
        print OUTPUT "Amino-Acid empirical matrix to be expanded = ";
        print OUTPUT $escaped->($empiricalMatrix);
        print OUTPUT "<br>\n";
    }
    print OUTPUT "Number of categories = ".$escaped->($FORM{CATEGORIES})." <br>\n" if ($FORM{CATEGORIES} ne "");
    if ($upload_TREE_file ne ""){
        print OUTPUT "User tree file: ".$shown_file_name->($upload_TREE_file)." <br>\n";
    }
    print OUTPUT "</p></font><br>\n\n";

    ### cancel form: posts the details of this run to $kill_job_script
    $cgi_pid = $$; # process id of this CGI process
    print OUTPUT "\n<FORM ENCTYPE=\"multipart/form-data\" ACTION=\"$kill_job_script\" METHOD=\"POST\">\n";
    print OUTPUT "<INPUT TYPE=\"submit\" VALUE=\"Cancel Selecton Job\"><br>\n";
    print OUTPUT "<INPUT TYPE=hidden NAME=\"Qstat_file\" VALUE=\"".$Qstat_No_file."\">\n";
    print OUTPUT "<INPUT TYPE=hidden NAME=\"selecton_http\" VALUE=\"".$job_canceled_page."\">\n";
    print OUTPUT "<INPUT TYPE=hidden NAME=\"run_no\" VALUE=\"".$WorkingDir."\">\n";
    print OUTPUT "<INPUT TYPE=hidden NAME=\"cgi_pid\" VALUE=\"".$cgi_pid."\">\n";
    print OUTPUT "<h4><u>Messages regarding input and calculation:</u></h4>\n";
    close OUTPUT;
}
###################################################################################
# Make the PDB entry requested by ID available in the run directory.
# Steps: check that $PdbFilePath exists in the PDB database directory, copy
# the compressed file into $WorkingDir, then make it writable and gunzip it.
# A missing entry is reported to the user through print_to_output_and_exit;
# a failed copy is reported through sys_error_exit.
sub check_copy_uncomp_pdbID {
    # existence test: list the file into $InpSeqFile and read the listing back;
    # an empty listing means "ls" found nothing
    my $list_cmd = 'echo "(ls '.$PdbFilePath.' > '.$InpSeqFile.';)" | /bin/tcsh';
    system $list_cmd;
    my $listing = `cat "$InpSeqFile"`;
    if ($listing eq "") {
        print LOG "\ncheck_copy_uncomp_pdbID:ls $PdbFilePath > $InpSeqFile";
        my $user_msg = "The PDB file with the ID \'$FORM{pdb_ID}\' is not found on our database, or not available right now.";
        my $log_msg = "\ncheck_copy_uncomp_pdbID: The pdb file $PdbFilePath does not exist on our DB\n";
        &print_to_output_and_exit($user_msg, $log_msg);
    }

    # the entry exists: bring a private copy into the run directory
    print LOG "\ncheck_copy_uncomp_pdbID: pdb file exists !\n";
    print LOG "\ncheck_copy_uncomp_pdbID: Copy $PdbFilePath to $WorkingDir\n";
    my $copy_cmd = 'echo "(cp '.$PdbFilePath.' '.$WorkingDir.' ;)" | /bin/tcsh';
    system $copy_cmd;

    # make sure the copy really arrived
    my $local_copy = $WorkingDir . $PdbFileName;
    if (! -e $local_copy) {
        &sys_error_exit("check_copy_uncomp_pdbID: The PDB file was not copied to the directory $WorkingDir");
    }

    # open up the permissions of the copy and uncompress it in place
    print LOG "\ncheck_copy_uncomp_pdbID: Change the permissions of $PdbFileName and gunzip it\n";
    my $gunzip_cmd = 'echo "(cd '.$WorkingDir.'; chmod +wxr '.$PdbFileName.'; gunzip '.$PdbFileName.'; )" | /bin/tcsh';
    system $gunzip_cmd;
}
####################################################################################
# UPLOAD file
# Save a file that was uploaded through the web form.
#   $full_path      - path on the server under which the upload is saved
#   $file_full_name - the CGI upload parameter; it is read from as a file
#                     handle and printed as the name the user gave
# After saving, the file must be non-empty and must be recognised as plain
# text by BIOSEQUENCE_FUNCTIONS::check_file_type. Problems are reported
# through sys_error_exit (server side) or print_to_output_and_exit (user side).
sub upload_file {
    my $full_path = shift; #NAME to be saved
    my $file_full_name = shift; #FULL name of file
    print LOG "\n upload_file : FILE_NAME = $full_path \n";
    system 'echo "(touch '.$full_path.'; chmod oug+w '.$full_path.')" | /bin/tcsh';
    # if the upload didn't work
    unless (open(UPLOADFILE, '>', $full_path)){
        &sys_error_exit("upload_file: Can\'t open the file $full_path");
    }
    print LOG "\nupload_file: Upload the file $file_full_name and save it as $full_path\n";
    # copy line by line; a lexical is used so the global $_ is left untouched
    while (my $upload_line = <$file_full_name>) {
        print UPLOADFILE $upload_line;
    }
    # buffered write errors (e.g. a full disk) only surface when closing
    unless (close UPLOADFILE){
        &sys_error_exit("upload_file: Can\'t finish writing the file $full_path: $!");
    }
    # verify that the size of the file is not zero
    if (-z $full_path){
        my $err = "upload_file: Cannot upload the file \'$file_full_name\'";
        &send_mailSelecton($err);
        &print_to_output_and_exit("<b>Cannot upload the file \'$file_full_name\', Please verify that the file exists and contains data.</b>", $err);
    }
    system "cd $WorkingDir; chmod ogu+rx *";
    # to be able to write in the PDB file
    system "chmod ogu+wxr $full_path";
    print LOG "\nupload_file: system 'chmod ogu+wxr $full_path'\n";
    #check file type: element 0 is the status, element 1 the detected type (or the error text)
    my @type = BIOSEQUENCE_FUNCTIONS::check_file_type($full_path);
    if ($type[0] ne "OK"){
        &sys_error_exit("upload_file: ".$type[1]);
    }
    unless ($type[1] eq "PLAIN_TEXT"){
        &print_to_output_and_exit("The file which you have uploaded, '$file_full_name', could not be read by the server. It seems that its type is: $type[1] and not plain text, as required. Make sure your file is in the <a href=\"$http_path/faq.html#q4\">correct format</a>, and refer to the <a href=\"$http_path/faq.html\">FAQ</a> for further assistance.","upload_file : file type is: $type[1]\n");
    }
}
###########################################################################################
# Run the external script $extractPDBinfo on the PDB file of this run, to
# obtain the SEQRES and ATOM sequences, and check its outcome.
# The script signals failure by leaving a non-empty $pdb_to_fasta_error file
# in $WorkingDir, with "HTML: ..." (message for the user) and "LOG: ..."
# (message for the log) lines. On success it must have created $pdb_data,
# $pdb_fasta and $title_file.
sub extract_PDB_info {
    my $msg_for_user = "";
    my $msg_for_log = "";
    my $error_file = $WorkingDir.$pdb_to_fasta_error;

    $cmd = "$extractPDBinfo $PdbFileNameUnc $FORM{chain} $FORM{pdb_ID} none $run_name $OutHtmlFile $pdb_data none none $pdb_to_fasta_error none none none none $pdb_fasta $title_file $QuickHelp selecton";
    ### run the script from inside the run directory
    print LOG "\nextract_PDB_info: going to run $cmd\n";
    `cd $WorkingDir;$cmd`;

    ## a non-empty error file means the script failed
    if (-e $error_file and !(-z $error_file)){
        print LOG "extract_PDB_info: Found error while running $extractPDBinfo. Read the error from the file: $pdb_to_fasta_error\n";
        unless (open ERROR, $error_file){
            &print_to_output_and_exit($SystemError, "extract_PDB_info: unfortunately could not open the error file. abort.\n");
            &send_mail();
            &stop_reload;
            exit;
        }
        # when a tag appears more than once, its last occurrence wins
        while (<ERROR>){
            if (/HTML: (.+)/){
                $msg_for_user = $1;
            }
            elsif(/LOG: (.+)/){
                $msg_for_log = $1;
            }
        }
        close ERROR;
        # "sys" or a missing HTML line stands for a generic system error
        if ($msg_for_user eq "sys" or $msg_for_user eq "") {
            $msg_for_user = $SystemError;
        }
        &print_to_output_and_exit($msg_for_user, "extract_PDB_info: $msg_for_log\n");
    }

    # no error was reported: all three output files must exist
    my @missing_outputs = grep { !(-e $WorkingDir.$_) } ($pdb_data, $pdb_fasta, $title_file);
    if (@missing_outputs){
        &print_to_output_and_exit($SystemError, "extract_PDB_info: The script $extractPDBinfo did not create one of its outputs. \n");
    }
    print LOG "extract_PDB_info: Extracting SEQRES and pdb sequences from $PdbFileNameUnc and chain $FORM{chain} !\n";
}
######################################################################
# Run "dos2unix -q" on the given file, from inside $WorkingDir.
#   argument: path of the file to convert (converted in place)
#   returns:  the return value of system() for that command
sub convertNewline{
    my $target_file = shift;
    print LOG "\nconvertNewLine: running dos2unix -q $target_file\n";
    system "cd $WorkingDir;dos2unix -q $target_file";
}
###########################################################################################
# Remove bootstrap values from the uploaded tree file.
# Thin wrapper: hands the tree path, together with the path
# $WorkingDir/oldUserTreeFile.txt, to TREE_parser::removeBPvalues and returns
# whatever that call returns.
sub removeBPvalues {
    my ($tree_path) = @_;
    return TREE_parser::removeBPvalues($tree_path, $WorkingDir . 'oldUserTreeFile.txt');
}
###########################################################################################
# Check the validity of the newick format of the uploaded tree.
#   argument: path of the tree file
# What is done:
#   * a tree that spans several lines is rewritten as a single line;
#   * the tree is scanned character by character: parentheses are counted,
#     characters from a "non-standard" set are collected, and a digit or a
#     label directly after ")" (bootstrap value / internal node name) aborts
#     the run through print_to_output_and_exit;
#   * unbalanced parentheses or non-standard characters are reported in the
#     output page, after which a mail is sent, the page reload is stopped and
#     the process exits.
# An unreadable tree file, or a temporary file that cannot be written, is
# reported through sys_error_exit.
sub check_validity_tree_file {
    my $treeFile = shift;
    my $lineCounter = 0;
    my $rightBrackets = 0;
    my $leftBrackets = 0;
    my $errorBool = 0;
    my $noRegularFormatChar = "";
    my $treeFileOneLine = "";
    my $read_right_bracket = "no"; # "yes" while the previous character was ")"
    my $tempTreeFile = $WorkingDir .'TempTreeFile.txt';

    # read the tree and join all of its lines into one string; a file that
    # cannot be read must not be mistaken for an empty (and thus valid) tree
    unless (open(TREEFILE, '<', $treeFile)){
        &sys_error_exit("check_validity_tree_file: Can\'t open the file $treeFile for reading: $!");
    }
    while (my $tree_line = <TREEFILE>) {
        chomp($tree_line);
        $treeFileOneLine .= $tree_line;
        $lineCounter++;
    }
    close TREEFILE;

    open OUTPUT, ">>$OutHtmlFile";

    # a multi-line tree is replaced on disk by its one-line form
    if ($lineCounter > 1) {
        # the original tree is unlinked below, so the replacement has to be
        # safely written first
        unless (open(TEMPTREEFILE, '>', $tempTreeFile)){
            &sys_error_exit("check_validity_tree_file: Can\'t open the file $tempTreeFile for writing: $!");
        }
        print TEMPTREEFILE $treeFileOneLine;
        # close before copying, so the data is on disk and the handle is released
        close TEMPTREEFILE;
        unlink ($treeFile);
        system 'echo "(cp -r '.$tempTreeFile.' '.$treeFile.'; chmod -R ogu+xr '.$treeFile.')" | /bin/tcsh';
        unlink ($tempTreeFile);
    }

    foreach my $char (split(//, $treeFileOneLine)) {
        if ($char eq '(') {
            $leftBrackets++;
            $read_right_bracket = "no";
        }
        elsif ($char eq ')') {
            $rightBrackets++;
            $read_right_bracket = "yes";
        }
        # NOTE(review): the "|" separators sit inside the character class, so
        # "|" itself is also flagged as non-standard - confirm this is intended.
        elsif ($char =~ /([\!|\@|\#|\$|\^|\&|\*|\~|\`|\{|\}|\'|\?|\\|\/|\<|\>])/){
            my $bad_char = $1;
            # report every offending character once
            if (index($noRegularFormatChar, $bad_char) == -1) {
                $noRegularFormatChar .= " \"$bad_char\", ";
            }
            $read_right_bracket = "no";
        }
        # if right after a right Bracket we read a character which is not legal (ie: , : ;) we output a message to the user, since we don't handle bootstrap values or internal node names
        else{
            if($read_right_bracket eq "yes"){
                if($char =~ /\d/){
                    &print_to_output_and_exit("<b>The <A HREF=$userTree TARGET=MSA_window>TREE file</A> you uploaded includes bootstrap values. Please remove them and resubmit your query.</b>\n", "check_validity_tree_file : found bootstrap values. Abort\n");
                }
                elsif($char !~ /[,|:|;]/){
                    &print_to_output_and_exit("<b>The <A HREF=$userTree TARGET=MSA_window>TREE file</A> you uploaded includes internal nodes names. Please remove them and resubmit your query.</b>\n", "check_validity_tree_file : found internal nodes name in the tree. Abort\n");
                }
            }
            $read_right_bracket = "no";
        }
    }

    if ($leftBrackets != $rightBrackets) {
        print OUTPUT "\n<p>$ErrorDef<br><b>The <A HREF=$userTree TARGET=MSA_window>TREE file</A> which appears to be in Phylip format is missing parentheses</p>\n";
        print OUTPUT $ContactDef;
        $errorBool++;
    }
    if ($noRegularFormatChar =~ /.+/) {
        # drop the trailing ", " of the collected list
        $noRegularFormatChar =~ s/\,\s$//;
        print OUTPUT "\n<p>$ErrorDef<br><b>The <A HREF=$userTree TARGET=MSA_window>TREE file</A> which appears to be in Phylip format, contains the following non-standard characters: ". qq($noRegularFormatChar) . ".</p>\n";
        print OUTPUT $ContactDef;
        $errorBool++;
    }
    close OUTPUT;

    if ($errorBool) {
        &send_mail();
        &stop_reload;
        exit;
    }
}
####################################################################################
# Hand the user's DNA file to codonAlign.pm.
#   argument: reference to the array that holds the sequence names
# An aligned upload goes to codonAlign::DNA_checkLegal_and_crate_AAFile; an
# unaligned upload goes to codonAlign::DNA_align (which also receives $muscle).
# Both calls answer with a list: a status ("ok", "user" or anything else) and
# a message. A "user" status is reported in the output page through
# print_to_output_and_exit; any other non-"ok" status goes to sys_error_exit.
sub codonAlign{
    print LOG "\ncodonAlign ";
    my $names_ref = shift;

    # The html form numbers the genetic codes 0..10; the BioPerl module uses
    # different numbers, so each form value is mapped to its BioPerl table.
    my %bioperl_table_of = (
        0 => '1',  1 => '15', 2 => '6',  3 => '10', 4 => '2',
        5 => '5',  6 => '3',  7 => '13', 8 => '9',  9 => '14',
        10 => '4',
    );
    my $genetic_code = $bioperl_table_of{$FORM{GENCODE}};

    my ($status, $message);
    if ($upload_unaligned_file_dna eq ""){
        # user supplied a codon aligned file
        my @call_args = ($copied_dna_unaligned, $fileDna_aligned, $WorkingDir, $genetic_code, $names_ref, $fileName_amino_aligned, $OutHtmlFile, $WWWdir);
        print LOG "calling codonAlign::DNA_checkLegal_and_crate_AAFile(".join(", ", @call_args).")\n";
        ($status, $message) = codonAlign::DNA_checkLegal_and_crate_AAFile(@call_args);
    }
    else{
        # user supplied unaligned sequences
        my @call_args = ($copied_dna_unaligned, $fileName_amino_aligned, $WorkingDir, $fileDna_aligned, $OutHtmlFile, $muscle, $WWWdir, $genetic_code, $names_ref);
        print LOG "calling codonAlign::DNA_align(".join(", ", @call_args).")\n";
        ($status, $message) = codonAlign::DNA_align(@call_args);
    }

    if ($status ne "ok"){
        if ($status eq "user"){
            # in case it is a user error, we let him know via the html file
            &print_to_output_and_exit("An Error was found in your input file: $message", "codonAlign : error in codonAlign.pm : $message");
        }
        else{
            # a system error
            &sys_error_exit($message);
        }
    }
    print LOG "codonAlign.pm returned OK\n";
}
######################################################################################
# Rewrites the leaf names of a one-line tree file and writes the result to a new file.
# Arguments (in order):
#   1. reference to the array of sequence names
#   2. mode: "change_to_numbers" replaces each name by its index in that array,
#      any other value replaces each index by the name stored at that index
#   3. path of the tree file to read
#   4. path of the tree file to write
# In "change_to_numbers" mode the input is also validated, and zero branch lengths
# are replaced by 0.000000001 in both modes.
# Errors are reported through sys_error_exit / print_to_output_and_exit; when the input
# cannot be opened in the non-numbers mode the sub only logs and returns.
sub change_tree_file_names{
my $ref_sequences_names = shift;
my $new_tree_mode = shift; # change_to_numbers OR change_to_sequences
my $input_tree = shift;
my $output_tree = shift;
my ($tree, $err);
unless (open TREE, $input_tree){
if ($new_tree_mode eq "change_to_numbers") {
&sys_error_exit("change_tree_file_names : can not open file $input_tree for reading.") ;}
else{
print LOG "could not open the file $input_tree for reading. the tree file will be presented with numbers\n";
return;}
}
#check validity of input tree file
if ($new_tree_mode eq "change_to_numbers"){
print LOG "\nchange_tree_file_names : reading tree file\n";
# every line of the file is checked; $tree ends up holding the tree taken from the last line read
while (<TREE>){
# NOTE(review): if the input record separator is the default "\n", a record cannot contain
# "\n" followed by more text, so only the "\r" alternative looks reachable - confirm
if ((/.+\r.+;/)||(/.+\n.+;/)){
$err = "Return or newLine charachters were found inside the tree file.";
&print_to_output_and_exit("An Error was found in your input tree file: $err<br>Please note: The input tree file should be written in one line. See <a href=\"$tree_faq\">Selecton accepted format</a> for more info.<br>\nPlease correct your input tree file and re-submit your query.<br>\n", "change_tree_file_names :: $err");
}
# chopping last ^M or \n
# NOTE(review): chop removes a single character, so a line ending in ";\r\n" keeps its "\r",
# which the "more than one tree" test below then seems to treat as trailing text - confirm
elsif((/^\(.+:.+\).*;\r$/)||(/^\(.+:.+\).*;\n$/)){
chop;
}
# if there is more than one tree - extracting the first tree
if(/(.+:.+;)(.+)/){
print OUTPUT "\n<p><ul><li><font color='red'><b>Warning:</b></font> There is more than one tree in your <a href=$userTree_copy>input tree file</a>. The first tree will be used for calculations.</li></ul></p>\n";
$tree = $1;
# keep cutting after the last ";" that still has text behind it, until only the first tree is left
while ($tree =~ m/(.+\;)(.+)/){
$tree = $1;
}
}
# the minimum requirements from a tree:
#elsif(/^\(.+:.+\).*;$/){
# $tree=$_;
#}
# accepted as is: starts with "(", contains ")" and ends with ";"
elsif(/^\(.+\).*;$/){
$tree=$_;
}
else{
$err = "The input tree file is not in a legal format.\n";
&print_to_output_and_exit("An Error was found in your input tree file:<br>$err See <a href=\"$tree_faq\">Selecton accepted format</a> for more info.<br>\nPlease correct your input tree file and re-submit your query.<br>", "change_tree_file_names :: $err");
}
}
print LOG "change_tree_file_names : tree input is legal\n";
}
else{
# no validation in this mode: only the first line of the file is used
$tree = <TREE>;
}
close TREE;
# cut the tree at every "("; consecutive "(" (and a leading one) produce empty pieces
my @tree_arr = split(/\(/, $tree);
my @sub_tree = ();
my @temp_arr;
my $sub_counter = 0;
# building the array @sub_tree, so that each cell will hold maximum one sequence name
for(my $i=0; $i<@tree_arr; $i++){
# put back the "(" that split removed; empty pieces stay empty
if ($tree_arr[$i] ne ""){
$tree_arr[$i] = "(".$tree_arr[$i];
}
# a piece with a comma is cut further, and a "," is appended to every part
# (also to the last one; a trailing "," is removed from the final tree further down)
if ($tree_arr[$i] =~ m/.*,.+/){
@temp_arr = split(/,/, $tree_arr[$i]);
foreach (@temp_arr){
$sub_tree[$sub_counter] = $_.",";
$sub_counter++;
}
}
else{
$sub_tree[$sub_counter] = $tree_arr[$i];
$sub_counter++;
}
}
# rebuilding the tree, this time replacing the sequences names with the names found in the DNA input file
my $final_tree = "";
my ($exp, $rest_of_exp, $new_rest_exp);
my $seq_found = "no";
# the loop starts at cell 1, so cell 0 is never copied to the output
for (my $k=1; $k<@sub_tree; $k++){
#in this part we wish to split the expression to 2 parts; left part : (?seq_name ; right part: all the rest
if ($sub_tree[$k] ne ""){
# piece contains ":" - move every ":..." suffix from $exp to the front of $rest_of_exp,
# leaving in $exp the text that precedes the first such ":"
if ($sub_tree[$k] =~ m/(.+)(:.+)/){
$exp = $sub_tree[$k];
$rest_of_exp = "";
while ($exp =~ m/(.+)(:.+)/){
$exp = $1;
$rest_of_exp = $2.$rest_of_exp;
}
}
# in case the expression is of format: seq_name:distance,
# NOTE(review): the comment above does not seem to describe this branch, which matches
# pieces without a "x:y" part that contain ");" followed by more text - confirm
elsif($sub_tree[$k] =~ m/(.+)(\);.+)/){
$exp = $1;
$rest_of_exp = $2;
while ($exp =~ m/(.+)(\))/){
$exp = $1;
$rest_of_exp = $2.$rest_of_exp;
}
}
# in case the expression is of format: seq_name)*,
# the pattern puts the last character in $2 and everything before it in $1;
# the loop then moves ")" from $exp to $rest_of_exp
elsif($sub_tree[$k] =~ m/(.+)(\)?.+)/){
$exp = $1;
$rest_of_exp = $2;
while ($exp =~ m/(.+)(\))/){
$exp = $1;
$rest_of_exp = $2.$rest_of_exp;
}
}
# NOTE(review): when none of the three patterns matches (e.g. a one-character piece),
# $exp and $rest_of_exp keep the values of the previous iteration - confirm this cannot happen
# if the length (value after the ":") is equal to zero, we replace it with a very small value,
# because the selecton.exe cannot calculate trees with zeros
$new_rest_exp = "";
# each pass consumes one ":number" from $rest_of_exp and appends it (possibly replaced) to $new_rest_exp
# NOTE(review): the match is not anchored, so text of $rest_of_exp located before the matched
# part is not copied to $new_rest_exp - confirm this is harmless for the accepted inputs
while($rest_of_exp =~ m/(.?:)(\d\.?\d*)(.+)/){
# neither positive nor negative, i.e. numerically zero
if(!($2>0) && !($2<0)){
$rest_of_exp = $3;
$new_rest_exp .= $1."0.000000001";
}
else{
$rest_of_exp = $3;
$new_rest_exp .= $1.$2;
}
}
$new_rest_exp .=$rest_of_exp;
$rest_of_exp = $new_rest_exp;
# $1 = optional leading "(", $2 = the name (or number); both are used by the code below,
# which relies on no other successful match happening in between
$exp =~ m/(\(?)(.+)/;
if ($new_tree_mode eq "change_to_numbers"){
# indices 1 .. (number of array elements) are tried
# NOTE(review): presumably the names array is 1-based (cell 0 unused) - verify against caller
for (my $in=1; $in<=($#$ref_sequences_names+1); $in++){
$seq_found = "no";
if ($ref_sequences_names->[$in] eq $2){
$seq_found = "yes";
$final_tree.= $1.$in.$rest_of_exp;
last;
}
}
if ($seq_found eq "no") {
# in case a sequence was found in the tree and not in the DNA file
&print_to_output_and_exit("The sequence name: \"$2\" was found in your tree file, but was not found in your DNA input file.<br>When submitting an input tree file, sequence names of both inputs must be identical. See <a href=\"$tree_faq\">Selecton accepted format</a> for more info.<br>\nPlease correct your input files and re-submit your query.<br>\n","change_tree_file_names : the sequence name $2 appears in tree file, does not appear in DNA input file");
}
}
else{
# $2 is used as an index into the names array
$final_tree.= $1.$ref_sequences_names->[$2].$rest_of_exp;
}
}
#an empty cell stands for a "(" sign
else{
$final_tree.= "(";
}
}
# drop the "," that the splitting stage appended after the last part
if ($final_tree =~ m/,$/){
chop $final_tree;
}
unless (open NEW_TREE, ">".$output_tree){
&sys_error_exit("change_tree_file_names:: cannot open file $output_tree for writing.");
}
print LOG "change_tree_file_names : printing edited tree to file $output_tree and chmod it.\n";
print NEW_TREE $final_tree;
close NEW_TREE;
chmod 0755, $output_tree;
}
######################################################################################
# extract the query sequence from the user-provided MSA file
# Argument: reference to the array of sequence names.
# Steps:
#   * when a PDB structure is in use, renames $pdb_fasta to $pdb_msa inside $WorkingDir
#   * replaces $FORM{msa_SEQNAME} by the index of that name in the names array (if found)
#   * scans the amino-acid MSA ($WorkingDir.$fileName_amino_aligned): counts the sequences
#     and picks the query sequence, falling back to the first sequence when the requested
#     name is absent; sets the file-level $query and $querySeqFoundinMSA
#   * when a PDB structure is in use, appends the query to $pdb_msa and calls alignPDB2refseq
#   * exits with an error page for fewer than 3 or more than 100 sequences
#   * appends a line to the statistics file
# Returns the number of sequences read from the MSA.
sub msa_format_extract {
    my $ref_seq_name = shift;
    print LOG "\nentered msa_format_extract : \n";
    my $original_seq_name = $FORM{msa_SEQNAME}; # we change the FORM name variable later, in order to search it in the DNA MSA that was built using numbers instead of original names.
    my $pdb_in_use = ($upload_PDB_file ne "" or $FORM{pdb_ID} ne "");

    # Appends one HTML fragment to the output page; a failure is logged, not fatal.
    my $append_to_output = sub {
        my $html = shift;
        unless (open(OUTPUT, '>>', $OutHtmlFile)) {
            print LOG "\nmsa_format_extract: cannot open $OutHtmlFile for appending: $!\n";
            return;
        }
        print OUTPUT $html;
        close OUTPUT;
    };

    if ($pdb_in_use) { #if user ran selecton with a PDB struct, we change the name of the file ".pdbfasta" which was created by the extract_PDB_info routine and give writing permissions to all
        my $str1 = "mv $pdb_fasta $pdb_msa";
        my $str2 = "chmod ogu+wrx $pdb_msa";
        print LOG "\nmsa_format_extract: mv $pdb_fasta $pdb_msa\n chmod ogu+wrx $pdb_msa\n";
        system 'echo "(cd '.$WorkingDir.'; '.$str1.'; '.$str2.')" | /bin/tcsh';
    }

    # Change the sequence name, so it will fit the number that represents the sequence in the DNA file
    # NOTE(review): indices 1 .. (number of elements) are tried, which presumably means the
    # names array is 1-based - verify against caller
    for (my $i=1; $i<=($#$ref_seq_name+1); $i++){
        if ($FORM{msa_SEQNAME} eq $ref_seq_name->[$i]){
            $FORM{msa_SEQNAME} = $i;
            last;
        }
    }
    print LOG "\nmsa_format_extract: SeqName = \'$FORM{msa_SEQNAME}\'\n";

    # make sure the MSA is readable before handing it to Bio::SeqIO
    my $msa = $WorkingDir.$fileName_amino_aligned;
    unless (open(FILE, '<', $msa)){
        &sys_error_exit("msa_format_extract: can not open file $msa for reading\n");
    }
    close FILE;

    my $counter = 0;
    my $TargetFound = 0;
    my $querySeq="";
    my $firstSequence;
    my $firstSequenceName;
    my $first_seq_length;
    my $inFile = Bio::SeqIO->new('-file' => "$msa" , '-format' => 'Fasta');
    while ( my $seqObj = $inFile->next_seq() ) {
        my $seq = $seqObj->seq();
        my $name = $seqObj->display_id();
        if ($seqObj->desc() ne ""){
            $name .= " ".$seqObj->desc();
        }
        #obtaining first sequence - if query not found, 1st seq. used as query
        if ($counter == 0) {
            $firstSequence = $seq;
            $first_seq_length = length($firstSequence);
            $firstSequenceName = $name;
        }
        $counter++;
        # \Q..\E: when the name was not translated to a number above it is still raw user
        # input, and characters such as "|" or "(" must be matched literally
        if ($name =~ /^(\Q$FORM{msa_SEQNAME}\E\s*)$/){ #find query sequence name
            $querySeqFoundinMSA = "yes";
            $query= $1;
            $TargetFound = 1;
            $querySeq = $seq;
            $querySeq =~ s/\W+//g;
        }
    }
    $inFile->close();
    unless ($querySeqFoundinMSA eq "yes") { # if query sequence not found - use first sequence
        $querySeq=$firstSequence;
        $query=$firstSequenceName;
    }
    $querySeq =~ s/-//g; #clear all gaps in the sequence

    # The query sequence name is not found in the MSA - warning
    if ($TargetFound == 0){
        # the name comes from the submitted form, so it is escaped before going into the page
        my $shown_name = CGI::escapeHTML($original_seq_name);
        $append_to_output->("\n<p><ul><li><font color='red'><b>Warning:</b></font> The query sequence name \'$shown_name\' is not found in the <A HREF=COPY_$fileUploadName_dna_unaligned TARGET=MSA_window>input file</A>.<br>The calculation continues. The first sequence in MSA is used as query.</li></ul></p>\n");
        print LOG "\nmsa_format_extract: The query sequence name is not found in the MSA file. Calculation continues with 1st seq. in file\n";
    }

    if ($pdb_in_use) { #if user ran selecton with a PDB struct.
        ### add the target sequence to the file _PDB_MSA.pdbfasta
        unless (open PDBFASTA, ">>".$WorkingDir.$pdb_msa) {
            &sys_error_exit("msa_format_extract: Can\'t open the file $pdb_msa");
        }
        print PDBFASTA "\n>$query\n$querySeq\n";
        close PDBFASTA;
        ### Call 'alignPDB2refseq' to run clustalw and check the results
        &alignPDB2refseq($pdb_msa, $clustal_outFile);
    }

    # there are less than 3 sequences - write error message and exit
    # (the threshold was changed from 5 to 3 on 1/2/06; the messages now say 3 as well)
    if ($counter < 3){
        if ($counter == 1){
            &print_to_output_and_exit("The <A HREF=COPY_$fileUploadName_dna_unaligned TARGET=MSA_window>input file</A> contains only one sequence. The minimal number of homologues required for the calculation is 3.<br>(Make sure that the file is saved as plain text and does not contain special characters).", "msa_format_extract: The MSA file contains only $counter sequences");
        }
        else {
            &print_to_output_and_exit("The <A HREF=COPY_$fileUploadName_dna_unaligned TARGET=MSA_window>input file</A> contains only $counter sequences. The minimal number of homologues required for the calculation is 3.", "msa_format_extract: The MSA file contains only $counter sequences");
        }
    }
    # there are less than 10 sequences - write a warning
    elsif ($counter < 10){
        $append_to_output->("\n<p><ul><li><font color='red'><b>Warning:</b></font> The MSA file contains only $counter sequences. It is recommended to use an MSA file with at least 10 homologues. The calculation continues nevertheless.</li></ul></p>\n");
    }
    # more than 100 sequences - not supported by the server
    elsif ($counter > 100) {
        &print_to_output_and_exit("The <A HREF=COPY_$fileUploadName_dna_unaligned TARGET=MSA_window>MSA file</A> contains $counter sequences. The Selecton server supports only < 100 sequences. For longer runs, please download the source code under SOURCE and install the program locally .", "msa_format_extract: The MSA file contains $counter (>100) sequences, Selecton aborted.");
    }
    else {
        $append_to_output->("\n<p><ul><li>The calculation is performed on the $counter sequences obtained from the MSA file.</li></ul></p>\n");
    }

    #calculate the estimated run time (only for the M8 and MEC models)
    if ($model eq "M8" or $model eq "MEC"){
        $estimated_run_time = BIOSEQUENCE_FUNCTIONS::selecton_estimated_run_time( $first_seq_length, $counter, $model);
        print LOG "\nmsa_format_extract : sending selecton_estimated_run_time( $first_seq_length, $counter, $model), time is: $estimated_run_time\n";
    }

    # one line per run in the statistics file; 2 and 8 are the flock lock / unlock operations
    if (open(STATISTICS, '>>', $statistics_file)) {
        flock STATISTICS, 2;
        print STATISTICS "$run_name $model $counter $first_seq_length\n";
        flock STATISTICS, 8;
        close STATISTICS;
    }
    else {
        print LOG "\nmsa_format_extract: cannot open $statistics_file for appending: $!\n";
    }
    return $counter;
}
###########################################################################
# The function runs 'clustalw' with two sequences,
# and checks their pairwise alignment.
# The arguments:
# 1. the parameters to run clustalw (input-file and additional parameters)
# 2. an output file
###########################################################################
# Both arguments are file names relative to $WorkingDir.
# After running clustalw the sub reads its output for the two sequence lengths and the
# alignment score, and appends notes / warnings to the HTML output page when the lengths
# differ or the score is below 100. It never stops the run.
# When a value cannot be found in the clustalw output the corresponding check is skipped
# and the fact is written to the log.
sub alignPDB2refseq{
    my $clustalw_parms = shift;
    my $outfile = shift;
    my %message = (PDB => "sequence extracted from the ATOM field of the PDB file ",
                   MSA => "query sequence extracted from the MSA file");
    my %SeqName = (PDB => "ATOM sequence",
                   MSA => "query sequence");

    # Appends one HTML fragment to the output page; a failure is logged, not fatal.
    my $append_to_output = sub {
        my $html = shift;
        unless (open(OUTPUT, '>>', $OutHtmlFile)) {
            print LOG "\nalignPDB2refseq: cannot open $OutHtmlFile for appending: $!\n";
            return;
        }
        print OUTPUT $html;
        close OUTPUT;
    };

    # run clustalw with the given parameters
    my $command = $clustalw . " ". $WorkingDir.$clustalw_parms . " > " . $WorkingDir.$outfile;
    print LOG "\nalignPDB2refseq: running $command\n";
    system 'echo "(cd '.$WorkingDir.'; '.$command.')" | /bin/tcsh';
    my $full_outfile = $WorkingDir . $outfile;
    unless (open(OUT, '<', $full_outfile)){
        &sys_error_exit("alignPDB2refseq: Can\'t open the file $full_outfile");
    }

    my $pdb_length;
    my $msa_seq_length;
    my $score;
    # search the outfile for the length of the sequences
    # and the score of the alignment
    while (<OUT>){
        # the sequence whose name starts with PDB_ is the one taken from the PDB file
        if ($_ =~ /Sequence\s+\d:\s+PDB_\S?\s+(\d+)\s+aa/i){
            $pdb_length = $1;
        }
        # any other "Sequence N:" line is taken to be the query from the MSA
        elsif ($_ =~ /Sequence\s+\d:\s+(.+)\s+(\d+)\s+aa/i){
            $msa_seq_length = $2;
        }
        elsif ($_ =~ /Sequences.+Aligned.+Score:\s+(\d+)/i){
            $score = $1;
        }
    }
    close OUT;
    system 'echo "(cd '.$WorkingDir.' ; chmod -R og+rx * )" | /bin/tcsh';

    my $warning_tag = "<font color='red'><b>Warning:</b></font> ";
    my $uploaded_pdb = ($FORM{pdb_ID} eq "FILE");

    if (defined $pdb_length and defined $msa_seq_length) {
        my $lengths = "The $SeqName{MSA} has $msa_seq_length residues and the $SeqName{PDB} has $pdb_length residues.";
        ### the query sequence is shorter than the ATOM sequence
        if ($msa_seq_length < $pdb_length){
            # flagged as a warning when the difference exceeds 10% or the PDB file was
            # uploaded by the user, otherwise written as a plain note
            my $prefix = ($msa_seq_length < ($pdb_length * 0.9) or $uploaded_pdb) ? $warning_tag : "";
            $append_to_output->("\n<p><ul><li>".$prefix."The $message{MSA} is shorter than the $message{PDB}. $lengths The calculation continues nevertheless.</li></ul></p>\n");
        }
        ### the ATOM sequence is shorter than the query sequence
        elsif ($pdb_length < $msa_seq_length){
            # significant difference (ATOM sequence under 20% of the query):
            # reported with the error header and logged, but the run is not stopped
            if ($pdb_length < ($msa_seq_length * 0.2)){
                $append_to_output->("\n<p>$ErrorDef<br><b>The $message{PDB} is significantly shorter than the $message{MSA}. $lengths</b></p>\n");
                print LOG "\nalignPDB2refseq: The $message{PDB} is significantly shorter than the $message{MSA}\n";
            }
            # write a warning (or a plain note when the PDB file was not uploaded by the user)
            else {
                my $prefix = $uploaded_pdb ? $warning_tag : "";
                $append_to_output->("\n<p><ul><li>".$prefix."The $message{PDB} is shorter than the $message{MSA}. $lengths The calculation continues nevertheless.</li></ul></p>\n");
            }
        }
    }
    else {
        print LOG "\nalignPDB2refseq: the sequence lengths were not found in $full_outfile, length check skipped\n";
    }

    ### check if the score is not 100
    if (!defined $score) {
        print LOG "\nalignPDB2refseq: the alignment score was not found in $full_outfile, score check skipped\n";
    }
    elsif ($score < 100){
        # a low score is only reported; the former "stop below 60" rule is no longer applied
        $append_to_output->("\n<p><ul><li>".$warning_tag."The Score of the alignment between the $message{MSA} and the $message{PDB} is ID% = $score . The calculation continues nevertheless. (<A HREF=$clustal_aligned_file TARGET=Alignment_window>Pairwise Alignment</A>)</li></ul></p>\n");
    }
}
######################################################################
# Saves the state of this run to files in $WorkingDir:
#   * $runCalcInput  - a hash of run settings written with Storable::store; every key is the
#                      name of the variable that is meant to receive the value when the hash
#                      is read back
#   * $FormInput     - the %FORM hash, also written with store
#   * $sequences_names_file - one "index name" line per entry of @sequences_names
# Takes no arguments. Exits when the settings file cannot be created or stored.
sub print_data_files{
    print LOG "\nEntered print_data_files. going to create hash\n";
    my $calc_input_path = $WorkingDir.$runCalcInput;
    my $seq_names_path  = $WorkingDir.$sequences_names_file;

    # values that are copied as they are
    my %run_data = (
        run_name               => $run_name,
        WorkingDir             => $WorkingDir,
        WWWdir                 => $WWWdir,
        epsilonPrecision       => $epsilonPrecision,
        query_seq_name_to_run  => $query,
        optimizeBL             => $optimizeBL,
        querySeqFoundinMSA     => $querySeqFoundinMSA,
        tree_faq               => $tree_faq,
        method                 => $method,
        SysErrorDef            => $SysErrorDef,
        ErrorDef               => $ErrorDef,
        ContactDef             => $ContactDef,
        fileDna_aligned        => $fileDna_aligned,
        treeUpload             => $treeUpload,
        OutHtmlFile            => $OutHtmlFile,
        cgi_log_file           => $OutLogFile,
        OutLogFile             => $QsubLogFile,
        fileName_amino_aligned => $fileName_amino_aligned,
        OutputURL              => $OutputURL,
        estimated_run_time     => $estimated_run_time,
        sequences_names_file   => $sequences_names_file,
        selecton_log_dir       => $Logs_dir,
    );

    # flags describing which inputs the user supplied
    $run_data{upload_MSA_file_dna}       = ($upload_MSA_file_dna eq "")       ? "no" : "USER_DNA_ALIGNED_FILE";
    $run_data{upload_unaligned_file_dna} = ($upload_unaligned_file_dna eq "") ? "no" : "USER_DNA_UNALIGNED_FILE";
    $run_data{upload_TREE_file}          = ($upload_TREE_file ne "")          ? "GIVEN" : "NOT_GIVEN";
    $run_data{was_Pdb_uploaded}          = ($upload_PDB_file ne "" or $FORM{pdb_ID} ne "") ? "yes" : "no";

    # PDB related values, or a NOT_GIVEN placeholder for each of them
    my $has_pdb_id = ($FORM{pdb_ID} ne "");
    $run_data{PdbFileNameUnc}       = $has_pdb_id ? $PdbFileNameUnc       : "NOT_GIVEN";
    $run_data{PdbPrefix}            = $has_pdb_id ? $PdbPrefix            : "NOT_GIVEN";
    $run_data{pdb_data}             = $has_pdb_id ? $pdb_data             : "NOT_GIVEN";
    $run_data{clustal_aligned_file} = $has_pdb_id ? $clustal_aligned_file : "NOT_GIVEN";

    print LOG "print_data_files : hash contains ".scalar(keys %run_data)." keys. Going to Store in ".$calc_input_path."\n";

    # ------ storable on -------
    # create the file and set its mode first, then let store write into it
    unless (open INPUT, ">".$calc_input_path){
        print LOG "cannot open ".$calc_input_path." $!\n";
        exit;
    }
    print INPUT "\n";
    close INPUT;
    chmod 0755, $calc_input_path;
    $! = "";
    if (store \%run_data, $calc_input_path) {
        print LOG "print_data_files : managed to store! $! \n";
    }
    else {
        print LOG "\ncannot store $!\n";
        exit;
    }
    store \%FORM, $WorkingDir.$FormInput; # when storable off, no need for this file
    # ------ storable on -------

    # the sequence names go to a plain text file, numbered from 0
    unless (open SEQ_NAMES, ">".$seq_names_path){
        &sys_error_exit("cannot open the file ".$seq_names_path." for writing $!\n");
    }
    for my $position (0 .. $#sequences_names) {
        print SEQ_NAMES $position." $sequences_names[$position]\n";
    }
    close SEQ_NAMES;
    chmod 0755, $seq_names_path;
    chmod 0600, $calc_input_path;
# ------ storable off -------
#unless (open RUN_CALC, ">".$WorkingDir.$runCalcInput){
# &sys_error_exit("cannot open the file ".$WorkingDir.$runCalcInput." for writing $!\n");}
#print RUN_CALC "RUN NAME: $run_name\n";
#print RUN_CALC "WORKING DIR: $WorkingDir\n";
#print RUN_CALC "WWW DIR: $WWWdir\n";
#print RUN_CALC "PRECISION LEVEL: $epsilonPrecision\n";
#print RUN_CALC "EVOLUTONARY MODEL: $FORM{MODEL}\n";
#($FORM{EMPIRICAL_MATRIX} ne "") ? print RUN_CALC "EMPIRICAL MATRIX: $FORM{EMPIRICAL_MATRIX}\n" : print RUN_CALC "EMPIRICAL MATRIX: NOT_GIVEN\n"; #ONLY IF IT IS MEC MODEL
#print RUN_CALC "QUERY NAME TO RUN: $query\n";
#print RUN_CALC "DISTRIBUTE CATEGORIES: $FORM{CATEGORIES}\n";
#($upload_TREE_file ne "") ? print RUN_CALC "TREE_WAS_UPLOADED?: $upload_TREE_file\n" : print RUN_CALC "TREE_WAS_UPLOADED?: NOT_GIVEN\n"; #REMARK: CHANGE THIS VAR'S CONTENT IN THE REST OF THE SCRIPT TO TRUE/FALSE.
#print RUN_CALC "OPTIMIZE BRANCH LENGTH? $optimizeBL\n";
#print RUN_CALC "GENETIC CODE: $FORM{GENCODE}\n";
#print RUN_CALC "GIVEN QUERY NAME: $FORM{msa_SEQNAME}\n";
#($FORM{pdb_ID} ne "") ? print RUN_CALC "PDB ID: $FORM{pdb_ID}\nPDB NAME: $PdbFileNameUnc\nPDB PREFIX: $PdbPrefix\nPDB DATA FILE: $pdb_data\nCLUSTAL ALN: $clustal_aligned_file\n" : print RUN_CALC "PDB ID: NOT_GIVEN\nPDB NAME: NOT_GIVEN\nPDB PREFIX: NOT_GIVEN\nPDB DATA FILE: NOT_GIVEN\nCLUSTAL ALN: NOT_GIVEN\n";
#($FORM{chain} ne "") ? print RUN_CALC "PDB CHAIN: $FORM{chain}\n" : print RUN_CALC "PDB CHAIN: NOT_GIVEN\n";
#print RUN_CALC "FOUND QUERY IN MSA?: $querySeqFoundinMSA\n";
#print RUN_CALC "TREE FAQ: $tree_faq\n";
#print RUN_CALC "METHOD: $method\n";
#($recipient ne "") ? print RUN_CALC "USER EMAIL: $recipient\n" : print RUN_CALC "USER EMAIL: NOT_GIVEN\n";
#print RUN_CALC "SYS ERROR: $SysErrorDef\n";
#print RUN_CALC "ERROR DEF: $ErrorDef\n";
#print RUN_CALC "CONTACT DEFINITION: $ContactDef\n";
#print RUN_CALC "WAS PDB UPLOADED?: ";
#($upload_PDB_file ne "" or $FORM{pdb_ID} ne "")? print RUN_CALC "yes\n" : print RUN_CALC "no\n";
#print RUN_CALC "DNA FILE NAME: $fileDna_aligned\n";
#print RUN_CALC "UPLOADED TREE PATH: $treeUpload\n";
#print RUN_CALC "OUTPUT HTML PATH: $OutHtmlFile\n";
#print RUN_CALC "LOG PATH: $OutLogFile\n";
#print RUN_CALC "QSUB LOG: $QsubLogFile\n";
#print RUN_CALC "LOG DIR: $Logs_dir\n";
#print RUN_CALC "SEQ NAMES FILE: $sequences_names_file\n";
#print RUN_CALC "AMINO FILE NAME: $fileName_amino_aligned\n";
#print RUN_CALC "WAS A DNA UNALIGNED FILE UPLOADED?: ";
#($upload_unaligned_file_dna eq "") ? print RUN_CALC "no\n" : print RUN_CALC $upload_unaligned_file_dna."\n";
#print RUN_CALC "WAS A DNA ALIGNED FILE UPLOADED?: ";
#($upload_MSA_file_dna eq "") ? print RUN_CALC "no\n" : print RUN_CALC $upload_MSA_file_dna."\n";
#print RUN_CALC "URL OUTPUT: $OutputURL\n";
#close RUN_CALC;
#chmod 0755, $WorkingDir.$runCalcInput;
# ------ storable off -------
}
######################################################################
# Reports an error caused by the user's input and ends the script.
# Arguments:
#   1. HTML text appended to the output page (after $ErrorDef, followed by $ContactDef)
#   2. text written to the log
# Afterwards the user is mailed (send_mail), the page reload is stopped (stop_reload)
# and the script exits; the sub never returns.
sub print_to_output_and_exit{
    my ($html_err, $log_err) = @_;
    if (open(OUTPUT, '>>', $OutHtmlFile)) {
        print OUTPUT "\n<p>$ErrorDef<br>$html_err</p>\n";
        print OUTPUT $ContactDef;
        close OUTPUT;
    }
    else {
        # the page cannot be updated; keep going so that the log and the mail are still produced
        print LOG "\nprint_to_output_and_exit: cannot open $OutHtmlFile for appending: $!\n";
    }
    # terminate the entry so that the next log line does not run into it
    $log_err .= "\n" unless $log_err =~ /\n\z/;
    print LOG $log_err;
    &send_mail();
    &stop_reload();
    exit;
}
###########################################################################################
# Reports an internal (system) error and ends the script.
# Argument: the error text. It is written to the log and mailed to the administrator;
# the output page only receives the generic $SysErrorDef and $ContactDef texts.
# Afterwards the user is mailed (send_mail), the page reload is stopped (stop_reload)
# and the script exits; the sub never returns.
sub sys_error_exit{
    my ($err) = @_;
    if (open(OUTPUT, '>>', $OutHtmlFile)) {
        print OUTPUT $SysErrorDef;
        print OUTPUT $ContactDef;
        close OUTPUT;
    }
    else {
        # the page cannot be updated; keep going so that the log and the mails are still produced
        print LOG "\nsys_error_exit: cannot open $OutHtmlFile for appending: $!\n";
    }
    print LOG "\n$err\n";
    &send_mail();
    &send_mailSelecton("SYSTEM ERROR\n".$err);
    &stop_reload();
    exit;
}
##########################################################################################
# Stops the reload of the output page
# Takes no arguments. After sleeping $reload_interval seconds it:
#   * rewrites the output page without the lines that carry the refresh / no-cache
#     directives, the cancel-job form and the job marker comment
#   * writes the end time to the log
#   * removes the lines mentioning $run_name from the running-jobs list
#   * restricts the permissions of user_email.txt in the working directory
#   * closes the log
# Failures to open a file are logged and the corresponding step is skipped.
sub stop_reload {
    sleep ($reload_interval);

    # The page is rewritten only after it has been read successfully; otherwise
    # opening it for writing would leave it empty.
    if (open(OUTPUT, '<', $OutHtmlFile)) {
        my @output = <OUTPUT>;
        close OUTPUT;
        if (open(OUTPUT, '>', $OutHtmlFile)) {
            foreach my $line (@output){ # we remove the refresh lines and the button which codes for Selecton cancelled job
                # \Q..\E: the script name is matched as literal text, not as a pattern
                next if ($line =~ /REFRESH/ or $line =~ /NO-CACHE/ or $line =~ /ACTION=\"\Q$kill_job_script\E/ or
                         $line =~ /VALUE=\"Cancel Selecton Job\"/ or $line =~ /TYPE=hidden NAME=/ or $line =~ /<!--job_/);
                print OUTPUT $line;
            }
            close OUTPUT;
        }
        else {
            print LOG "\nstop_reload: cannot open $OutHtmlFile for writing: $!\n";
        }
    }
    else {
        print LOG "\nstop_reload: cannot open $OutHtmlFile for reading: $!\n";
    }

    print LOG "\n\nEnd time: ";
    &printTime();

    # remove the job from the running jobs list
    # ("+>>" opens for reading and appending; 2 and 8 are the flock lock / unlock operations)
    if (open(LIST, '+>>', GENERAL_CONSTANTS::SELECTON_RUNNING_JOBS)) {
        flock LIST, 2;
        seek LIST, 0, 0; #rewind the pointer to the beginning
        my @all_lines_in_list = <LIST>; # read the contents into the array
        truncate LIST, 0; # remove all the information, The 0 represents the size of the file that we want
        foreach (@all_lines_in_list){
            chomp;
            # the run name is matched as literal text anywhere in the line
            unless(/\Q$run_name\E/){
                print LIST $_."\n";
            }
        }
        flock LIST, 8;
        close LIST;
    }
    else {
        print LOG "\nstop_reload: cannot open the running jobs list: $!\n";
    }
    chmod 0600, $WorkingDir. "user_email.txt";

    # the log is closed last, so that the steps above can still report to it
    close LOG;
    # if (-e $WorkingDir."core"){
    # print LOG "remove core file from working directory\n";
    # unlink $WorkingDir."core";
    # }
}
#########################################################################################
# Sends an automatic mail when there are errors
# Mails the user ($recipient) that the run failed, with a link to the results page.
# Takes no arguments. Sets the file-level $email_subject, $email_message and
# $email_system_return (the value returned by system).
# sendEmail is started without a shell and every value is passed as its own argument,
# so the address, the subject and the multi-line message arrive exactly as written.
# Nothing is sent when no address was given; all failures are only logged.
sub send_mail { # to user
    $email_subject = "Error in Selecton running";
    $email_message = "Hello!\n\nUnfortunately there was an error while running Selecton.\nPlease click on the following link to see more details\n We apologize for the inconvenience\n\n$OutputURL\n";
    if (!defined $recipient or $recipient eq "") {
        print LOG "send_mail: no user email address was given, no message is sent\n";
        return;
    }
    print LOG "send_mail: sending system error to user\n";
    unless (chdir $send_email_dir) {
        print LOG "send_mail: cannot chdir to $send_email_dir: $!\n";
        return;
    }
    $email_system_return = system('./sendEmail',
        '-f',  GENERAL_CONSTANTS::ADMIN_EMAIL,
        '-t',  $recipient,
        '-u',  $email_subject,
        '-xu', $userName,
        '-xp', $userPass,
        '-s',  $smtp_server,
        '-m',  $email_message);
    # system returns the wait status: 0 means sendEmail ran and exited successfully
    unless ($email_system_return == 0) {
        print LOG "send_mail: The message was not sent successfully. system returned: $email_system_return\n";
    }
}
#########################################################################################
# Mails an error report to the Selecton administrator.
# Argument: the message text; the user's email address is appended to it.
# Sets the file-level $email_subject and $email_system_return (the value returned by system).
# sendEmail is started without a shell and every value is passed as its own argument,
# so quotes, spaces and newlines in the message cannot break the command.
# All failures are only logged.
sub send_mailSelecton{ # to selecton administrator
    my $email_message = shift;
    $email_subject = "Error in Selecton running $run_name";
    print LOG "send_mailSelecton: send error message to admin\n";
    unless (chdir $send_email_dir) {
        print LOG "send_mailSelecton: cannot chdir to $send_email_dir: $!\n";
        return;
    }
    $email_system_return = system('./sendEmail',
        '-f',  GENERAL_CONSTANTS::ADMIN_EMAIL,
        '-t',  GENERAL_CONSTANTS::ADMIN_EMAIL,
        '-u',  $email_subject,
        '-xu', $userName,
        '-xp', $userPass,
        '-s',  $smtp_server,
        '-m',  $email_message."\n User's email is: $recipient\n");
    # system returns the wait status: 0 means sendEmail ran and exited successfully
    unless ($email_system_return == 0) {
        print LOG "The message was not sent successfully. system returned: $email_system_return\n";
    }
}
#########################################################################################
# this function prints the time to the LOG file.
# if used with return arguments: returns time and date in a different format than printed to log (only numbers).
# Writes the current local time to LOG as "h:m:s, Weekday Month day, year" (values not padded)
# and returns it as "hh:mm:ss dd-mm-yyyy", where every field except the year goes through convertNum.
sub printTime {
    my @month_names = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec);
    my @day_names   = qw(Sun Mon Tue Wed Thu Fri Sat Sun);
    my ($sec, $min, $hr, $mday, $mon, $year_offset, $wday) = localtime();
    my $full_year = $year_offset + 1900;

    # human readable form for the log
    print LOG "$hr:$min:$sec, $day_names[$wday] $month_names[$mon] $mday, $full_year";

    # numeric form for the caller; localtime counts months from 0
    my ($hh, $mi, $ss, $dd, $mo) = map { &convertNum($_) } ($hr, $min, $sec, $mday, $mon + 1);
    return "$hh:$mi:$ss $dd-" . $mo . "-$full_year";
}
#########################################################################################
# converts a number from one digit to 2 digits
# Returns the argument with a "0" put in front of it when it is smaller than 10
# (e.g. 7 gives "07"); any other value is returned unchanged.
sub convertNum {
    my ($value) = @_;
    return $value unless $value < 10;
    return "0" . $value;
}