Katzlab dd76ab1d12 Added PTL2 Scripts
These are PTL2 files from Auden 2/9
2023-02-14 11:20:52 -05:00

163 lines
5.9 KiB
Perl

#!/usr/bin/perl
package HandleQueue;
#use GENERAL_CONSTANTS;
use BIOSEQUENCE_FUNCTIONS;
use strict;
#----------------------------------
# Read the listing printed by `qstat bioseq` and record a status string for
# every job row found in it.
#
# Parameters:
#   $hash_ref - reference to a hash that is filled in place. Keys are the
#               digits that precede the first '.' in the first column of a
#               job row; values are one of the status strings below.
#
# Status strings (decided by the fifth whitespace-separated column):
#   "Running"                                 - column contains R or E
#   "Queued. Your number in the queue is: N"  - column contains Q; N is the
#               number of job rows seen so far minus those counted as running
#   ""                                        - anything else
#
# Returns nothing useful; the result is delivered through $hash_ref.
sub find_place_in_Q{
    my $hash_ref = shift;
    my $cmd = 'qstat bioseq';
    my $num_in_Q = 0;
    my $num_of_R = 0;
    my @ans = `$cmd`;
    foreach my $row (@ans){
        chomp $row;
        # split ' ' discards leading whitespace, so an indented row still
        # has the job id in field 0 (split /\s+/ would yield an empty field).
        my @line = split(' ', $row);
        next unless @line;
        next unless $line[0] =~ /^(\d+)\./;
        my $pbs_no = $1;
        # Only job rows advance the position counter; header and separator
        # rows of the listing must not inflate the reported queue number.
        $num_in_Q++;
        # A truncated row may lack the state column altogether.
        my $state = defined $line[4] ? $line[4] : "";
        my $status = "";
        if ($state =~ /[RE]/){
            $num_of_R++;
            $status = "Running";
        }
        elsif ($state =~ /Q/){
            $status = "Queued. Your number in the queue is: ".($num_in_Q-$num_of_R);
        }
        $hash_ref->{$pbs_no} = $status;
    }
    return;
}
#------------------------------------------------------------------
# Run `checkjob` for a job and classify the outcome from the two capture
# files it leaves behind.
#
# Parameters:
#   $qsub_job_no - job id handed to `checkjob`; also used as the prefix of
#                  the temporary "<id>_checkjob.out" / "<id>_checkjob.err"
#                  files, which are created relative to the current directory
#   $run_number  - run identifier, used only in the message text
#   $server_name - server name, used only in the message text
#   $ans_ref     - array reference filled in place:
#                    [0] human-readable message
#                    [1] "error" when the job is considered dead; left
#                        untouched when checkjob produced output and no errors
#
# Both capture files are removed before returning.
sub check_job{
    my ($qsub_job_no, $run_number, $server_name, $ans_ref) = @_;
    my $stdout_capture = $qsub_job_no."_checkjob.out";
    my $stderr_capture = $qsub_job_no."_checkjob.err";
    my $cmd = "sh -c 'checkjob $qsub_job_no 1>$stdout_capture 2>$stderr_capture'";
    `$cmd`;

    # -z is true only for an existing, zero-length file.
    my $stderr_is_empty = -z $stderr_capture;
    my $stdout_is_empty = -z $stdout_capture;

    if ($stderr_is_empty and !$stdout_is_empty){
        # Output without errors: the scheduler still knows this job.
        $ans_ref->[0] = "The job $qsub_job_no of the run $server_name $run_number was not found under \"qstat bioseq\", but created a file with \"checkjob\"\n";
    }
    elsif (!$stderr_is_empty and $stdout_is_empty){
        # Errors without output: the job is treated as dead.
        $ans_ref->[0] = "The job $server_name $run_number was not ended properly. Was not find in the biocluster Q. ";
        $ans_ref->[1] = "error";
    }
    else{
        # Any other combination is also reported as an error.
        $ans_ref->[0] = "The job $qsub_job_no of the run $server_name $run_number was not found under \"qstat bioseq\", and did not create any file with \"checkjob\"\n";
        $ans_ref->[1] = "error";
    }

    unlink $stderr_capture;
    unlink $stdout_capture;
}
#------------------------------------------------------------------
# Record a failed cluster job: drop it from the server's running log, append
# an error note to the run log, e-mail the administrator, and hand back the
# user's e-mail address so the caller can notify the user.
#
# Parameters (positional):
#   $dir_path       - run directory, used as a string prefix (so it is
#                     expected to end with a path separator); searched for
#                     "user_email.txt" and "<job>.bioc.ER" / "<job>.bioc.OU"
#   $server_name    - server name; selects the log sub-directory
#   $run_dir        - accepted for interface compatibility, not used here
#   $log_run        - name of the run log file to append to
#   $run_number     - run identifier
#   $sending_script - name of the calling script, quoted in the message
#   $output_path    - accepted for interface compatibility, not used here
#   $qsub_job_no    - optional job id; when defined, job details are added
#
# Returns the first line of user_email.txt, or "NO" when the file is missing,
# unreadable, empty, or contains "NOT_GIVEN".
sub report_error_to_user{
    my $dir_path = shift;
    my $server_name = shift;
    my $run_dir = shift;
    my $log_run = shift;
    my $run_number = shift;
    my $sending_script = shift;
    my $output_path = shift;
    my $qsub_job_no = shift;
    my $user_email = "";
    my ($err_message, $err_subject);
    # not ok: we write it to HTML, stop reload and send an e-mail to user (if supplied)
    my $email_file = $dir_path."user_email.txt";
    # Three-argument open: the path can never be mistaken for a mode or pipe.
    if (-e $email_file and open(my $mail_fh, '<', $email_file)){
        my $first_line = <$mail_fh>;
        close $mail_fh;
        # An empty file yields undef; keep the "" default in that case.
        if (defined $first_line){
            chomp($first_line);
            $user_email = $first_line;
        }
    }
    if ($user_email eq "NOT_GIVEN" or $user_email eq ""){$user_email = "NO";}
    # print to output that there was a failure and send mail to user
    #GENERAL_CONSTANTS::print_to_output($output_path, $server_name, $run_dir, $user_email);
    # remove the job from the running list of the server
    BIOSEQUENCE_FUNCTIONS::remove_job_from_running_log($server_name, $run_number);
    $err_message = "*** MESSAGE FROM $sending_script : an error occured while trying to run the job in biocluster Q. The run was stopped.***";
    # Constants are called with explicit parentheses so they resolve at run
    # time even if GENERAL_CONSTANTS was not loaded when this file compiled.
    my $run_log_path = GENERAL_CONSTANTS::SERVERS_LOGS_DIR().$server_name."/$log_run";
    # Best effort: a run log that cannot be opened is silently skipped.
    if (open(my $run_log_fh, '>>', $run_log_path)){
        print {$run_log_fh} "\n".$err_message."\n";
        close $run_log_fh;
    }
    $err_subject = "Error in $server_name run $run_number";
    if (defined $qsub_job_no){
        $err_subject .= " job $qsub_job_no";
        $err_message .= "\njob $qsub_job_no";
        # check if the job died for no reason
        my $err_file = $dir_path.$qsub_job_no.".bioc.ER";
        my $out_file = $dir_path.$qsub_job_no.".bioc.OU";
        # Both scheduler files present but empty: look up the execution host.
        if (-e $err_file and -z $err_file and -e $out_file and -z $out_file){
            my $host = find_host($qsub_job_no);
            $err_subject .= " died on host $host";
            $err_message .= " died on host $host";
        }
        else{
            $err_message .= "\n\nDid not file $err_file";
        }
    }
    # report about the error to the administrator
    GENERAL_CONSTANTS::send_mail($server_name, GENERAL_CONSTANTS::ADMIN_EMAIL(), $run_number, $err_subject, $err_message."\nRun: $run_number User: $user_email\n");
    return $user_email;
}
# Look up the execution host of a job by grepping `tracejob` output for
# "exec_host".
#
# The first attempt runs plain `tracejob <job>`; if that does not match, the
# lookup is repeated with `-n 2` through `-n 9` (nine attempts in total).
# NOTE(review): -n presumably widens the number of days of logs searched -
# confirm against the installed tracejob.
#
# Parameters:
#   $job_no - job id passed to tracejob
#
# Returns the host name captured by the pattern (bioc<digits>, with the
# ".tau.ac.il" suffix stripped), or "" when no attempt matched.
sub find_host{
    my $job_no = shift;
    my @attempts = (
        "tracejob $job_no | grep exec_host",
        map { "tracejob -n $_ $job_no | grep exec_host" } 2 .. 9,
    );
    for my $lookup (@attempts){
        my $ans = `$lookup`;
        return $1 if $ans =~ /exec_host=(bioc\d+)\.tau\.ac\.il/;
    }
    return "";
}
#------------------------------------------------------------------
# Return the number of running jobs on a specific node of the cluster.
#
# The count is the number of lines of `ps -ef` output (fetched through
# `ssh bioseq@biocluster ssh <node>`) whose fourth column is non-zero and
# that contain "bioseq". The awk/grep/wc stages run on the local side of the
# pipe.
#
# Parameters:
#   $node - node name, interpolated into the ssh command line
#
# Returns:
#   ("ok", $count)  on success; $count is the first line printed by the
#                   pipeline with its newline removed (undef if it printed
#                   nothing)
#   a single error string when the pipe could not be opened
sub node_status
{
    my $node=shift;
    my $command="ssh bioseq\@biocluster ssh $node ps -ef | awk \'\$4!=0\' | grep \"bioseq\" | wc -l";
    # Three-argument pipe open on a lexical handle: the read-from-command
    # mode is explicit, and the handle is not a package global.
    my $pipe_fh;
    unless (open ($pipe_fh, '-|', $command)) {return "HandleQueue::node_status Can't Execute \'$command|\': $!";}
    my $running_jobs=<$pipe_fh>;
    close ($pipe_fh);
    # The pipeline may produce no output at all; do not chomp undef.
    chomp $running_jobs if defined $running_jobs;
    #print "running: $running_jobs\n";
    return ("ok",$running_jobs);
}
#------------------------------------------------------------------
# Return the number of running jobs on the entire cluster.
#
# The count comes from `ssh bioseq@biocluster qstat`, dropping every line
# that contains "bioseq" and counting the remaining lines that contain "R".
# NOTE(review): `grep -c R` matches an R anywhere on the line, not only in
# the state column - confirm this is the intended count.
#
# Takes no parameters.
#
# Returns:
#   ("ok", $count)  on success; $count is the first line printed by the
#                   pipeline with its newline removed (undef if it printed
#                   nothing)
#   a single error string when the pipe could not be opened
sub queue_status
{
    # Three-argument pipe open on a lexical handle: the read-from-command
    # mode is explicit, and the handle is not a package global.
    my $pipe_fh;
    unless (open ($pipe_fh, '-|', "ssh bioseq\@biocluster qstat | grep -v bioseq | grep -c R")) {return "HandleQueue::queue_status Cna't Execute $!";}
    my $Num_Of_Jobs=<$pipe_fh>;
    # The pipeline may produce no output at all; do not chomp undef.
    chomp ($Num_Of_Jobs) if defined $Num_Of_Jobs;
    close ($pipe_fh);
    return ("ok",$Num_Of_Jobs);
}
1;