From 5851092951db48bf80e3167b8a7aea5da207ec74 Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Fri, 15 Jul 2022 19:32:13 +0100 Subject: [PATCH 01/16] Adds comments where code needs to be modified for upcoming change from LSF (bsub/bjobs) to slurm (sbatch/squeue) --- Rfam/Lib/Bio/Rfam/Infernal.pm | 6 +++--- Rfam/Lib/Bio/Rfam/Utils.pm | 11 ++++++++--- Rfam/Scripts/make/rfsearch.pl | 4 ++-- Rfam/Scripts/view/make_sunburst.pl | 1 + Rfam/Scripts/view/rfam_family_view.pl | 3 ++- 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/Rfam/Lib/Bio/Rfam/Infernal.pm b/Rfam/Lib/Bio/Rfam/Infernal.pm index 83abd49b..b8e259a2 100644 --- a/Rfam/Lib/Bio/Rfam/Infernal.pm +++ b/Rfam/Lib/Bio/Rfam/Infernal.pm @@ -162,7 +162,7 @@ sub cmcalibrate_wrapper { : $seqfilePath: path to sequence file to search : $outPath: file to save standard output to, if undefined send to /dev/null. : $errPath: file to save standard error output to - : $submitExStr: extra string to add to qsub/bsub command + : $submitExStr: extra string to add to qsub/bsub/sbatch command : $queue: queue to submit to, "" for default : $do_locally: '1' to run locally, else run on cluster : $gbPerThread: number of Gb of memory to request per thread @@ -191,7 +191,7 @@ sub cmsearch_wrapper { : $seqfilePath: path to sequence file to search : $outPath: file to save standard output to, if undefined send to /dev/null. : $errPath: file to save standard error output to - : $submitExStr: extra string to add to qsub/bsub command + : $submitExStr: extra string to add to qsub/bsub/sbatch command : $queue: queue to submit to, "" for default : $do_locally: '1' to run locally, else run on cluster : $do_locally: '1' to run locally, else run on cluster @@ -221,7 +221,7 @@ sub cmscan_wrapper { : $seqfilePath: path to sequence file to search : $outPath: file to save standard output to, if undefined send to /dev/null. 
: $errPath: file to save standard error output to - : $submitExStr: extra string to add to qsub/bsub command + : $submitExStr: extra string to add to qsub/bsub/sbatch command : $queue: queue to submit to, "" for default : $do_locally: '1' to run locally, else run on cluster : $gbPerThread: number of Gb of memory to request per thread diff --git a/Rfam/Lib/Bio/Rfam/Utils.pm b/Rfam/Lib/Bio/Rfam/Utils.pm index d7e19f91..ed42a96a 100644 --- a/Rfam/Lib/Bio/Rfam/Utils.pm +++ b/Rfam/Lib/Bio/Rfam/Utils.pm @@ -86,6 +86,8 @@ sub submit_nonmpi_job { if($location eq "EBI") { if(! defined $ncpu) { die "submit_nonmpi_job(), location is EBI, but ncpu is undefined"; } if(! defined $reqMb) { die "submit_nonmpi_job(), location is EBI, but reqMb is undefined"; } + #TODO: update this block to work with 'sbatch' instead of 'bsub', may require creating a + #new file that is the 'script' that is used as command-line argument to 'sbatch' $submit_cmd = "bsub "; if(defined $exStr && $exStr ne "") { $submit_cmd .= "$exStr "; } if(defined $queue && $queue ne "") { @@ -167,6 +169,7 @@ sub submit_mpi_job { # Need to use MPI queue ($queue is irrelevant) # TEMPORARILY USING research queue and span[ptile=8] as per Asier Roa's instructions, see email ("mpi jobs on cluster") # forwarded from Jen, on 08.27.13. + #TODO: update 'bsub' to 'sbatch' $submit_cmd = "bsub -J $jobname -e $errPath -q mpi -I -n $nproc -R \"span[ptile=2]\" -a openmpi mpirun -np $nproc -mca btl tcp,self $cmd"; # ORIGINAL COMMAND (I BELIEVE WE WILL REVERT TO THIS EVENTUALLY): # $submit_cmd = "bsub -J $jobname -e $errPath -q mpi -I -n $nproc -a openmpi mpirun.lsf -np $nproc -mca btl tcp,self $cmd"; @@ -211,7 +214,7 @@ sub submit_mpi_job { : : See an alternative function that serves the same purpose: : 'wait_for_cluster_light' but that uses the expensive - : 'qstat' or 'bjobs' calls less frequently. + : 'qstat', 'bjobs' or 'squeue' calls less frequently. 
: : Ways to return or die: : (1) Returns if all jobs finish and all jobs output files @@ -252,6 +255,7 @@ sub wait_for_cluster { # sanity check if(scalar(@{$outnameAR}) != $n) { die "wait_for_cluster(), internal error, number of elements in jobnameAR and outnameAR differ"; } + # TODO: update this subroutine to work with 'squeue' # modify username > 7 characters and job names > 10 characters if we're at EBI, because bjobs truncates these if($location eq "EBI") { if(length($username) > 7) { @@ -384,8 +388,8 @@ sub wait_for_cluster { : This function (the '_light' version) determines which jobs : are finished mostly using the existence of error files and : by looking for the success string in those error files - : and tries to use expensive 'qstat' or 'bjobs' calls infrequently. - : The non-light version (wait_for_cluster()) calls 'qstat'/'bjobs' + : and tries to use expensive 'qstat', 'bjobs' or 'squeue' calls infrequently. + : The non-light version (wait_for_cluster()) calls 'qstat'/'bjobs'/'squeue' : once every minute. : : If $max_minutes is defined and != -1, we will die if all jobs @@ -432,6 +436,7 @@ sub wait_for_cluster_light { if(scalar(@{$outnameAR}) != $n) { die "wait_for_cluster_light(), internal error, number of elements in jobnameAR and outnameAR differ"; } if(scalar(@{$errnameAR}) != $n) { die "wait_for_cluster_light(), internal error, number of elements in jobnameAR and errnameAR differ"; } + #TODO: update this subroutine to work with 'squeue' # modify username > 7 characters and job names > 10 characters if we're at EBI, because bjobs truncates these if($location eq "EBI") { if(length($username) > 7) { diff --git a/Rfam/Scripts/make/rfsearch.pl b/Rfam/Scripts/make/rfsearch.pl index 96c9ef79..c7c94ae0 100755 --- a/Rfam/Scripts/make/rfsearch.pl +++ b/Rfam/Scripts/make/rfsearch.pl @@ -52,8 +52,8 @@ my $mxsize_opt = undef; # we'll pass '--mxsize $mxsize_opt' to cmsearch my @cmosA = (); # extra single '-' cmsearch options (e.g. 
-g) my @cmodA = (); # extra double '--' cmsearch options (e.g. --cyk) -my @ssoptA = (); # strings to add to cmsearch qsub/bsub commands -my $ssopt_str = ""; # string to add to cmsearch qsub/bsub commands +my @ssoptA = (); # strings to add to cmsearch qsub/bsub/sbatch commands +my $ssopt_str = ""; # string to add to cmsearch qsub/bsub/sbatch commands my $ignore_sm = 0; # TRUE to ignore BM in DESC for cmbuild options # debugging options my $do_hmmonly = 0; # TRUE to run cmsearch in hmmonly mode diff --git a/Rfam/Scripts/view/make_sunburst.pl b/Rfam/Scripts/view/make_sunburst.pl index 97358dc2..cde99054 100755 --- a/Rfam/Scripts/view/make_sunburst.pl +++ b/Rfam/Scripts/view/make_sunburst.pl @@ -173,6 +173,7 @@ sub help{ The ability to process families in chunks allows the script to be run on the farm, using a submission command something like: + TODO: update this from bsub to sbatch bsub -q normal -R"select[mem>4000] rusage[mem=4000]" -M 4000000 \ -J "sunburst[1-20]" -o "sunburst.\%J.\%I.log" '$0 \ -chunksize 684 -chunk \$\{LSB_JOBINDEX\}' diff --git a/Rfam/Scripts/view/rfam_family_view.pl b/Rfam/Scripts/view/rfam_family_view.pl index 4fcfefb2..a46fe30d 100755 --- a/Rfam/Scripts/view/rfam_family_view.pl +++ b/Rfam/Scripts/view/rfam_family_view.pl @@ -145,7 +145,8 @@ =head1 DESCRIPTION This is a script to run the Rfam view process for a given family. It's intended to be run by the job dequeuer, which polls the rfam_jobs.job_history table for -pending view process jobs, and runs this script on the farm via "bsub". +pending view process jobs, and runs this script on the farm via "bsub" (TODO: +update to 'sbatch') We have the concept of plugin sets, so that we can group together view plugins which have common features. 
For example, there may be plugins that are based on From 3913d64863a43e88d3cd9e08e618e29bcf30ea68 Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Thu, 12 Oct 2023 19:24:29 +0100 Subject: [PATCH 02/16] Adds scheduler field --- Rfam/Conf/rfam.conf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Rfam/Conf/rfam.conf b/Rfam/Conf/rfam.conf index b238c858..52881e6b 100644 --- a/Rfam/Conf/rfam.conf +++ b/Rfam/Conf/rfam.conf @@ -178,3 +178,6 @@ productionPath /nfs/production/agb/rfam/ ssImages /nfs/production/agb/rfam/RELEASES/14.8/ss_images/ +# scheduler can be 'slurm' or 'lsf' +scheduler slurm + From 1bb014e2b60c3d232a0c71b817ba2e45736efc1f Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Thu, 12 Oct 2023 19:49:20 +0100 Subject: [PATCH 03/16] Begins support for slurm --- Rfam/Lib/Bio/Rfam/Infernal.pm | 12 ++++---- Rfam/Lib/Bio/Rfam/Utils.pm | 57 +++++++++++++++++++++-------------- 2 files changed, 40 insertions(+), 29 deletions(-) diff --git a/Rfam/Lib/Bio/Rfam/Infernal.pm b/Rfam/Lib/Bio/Rfam/Infernal.pm index 4f8b0efd..7ea70ad9 100644 --- a/Rfam/Lib/Bio/Rfam/Infernal.pm +++ b/Rfam/Lib/Bio/Rfam/Infernal.pm @@ -1,4 +1,4 @@ -package Bio::Rfam::Infernal; +,package Bio::Rfam::Infernal; # Wrappers for Infernal executables called by rfsearch and rfmake. @@ -151,7 +151,7 @@ sub cmcalibrate_wrapper { } else { if($doMPI) { - Bio::Rfam::Utils::submit_mpi_job($config->location, "$cmcalibratePath --mpi $options $cmPath > $outPath", $jobname, $errPath, $nproc, $queue); + Bio::Rfam::Utils::submit_mpi_job($config, "$cmcalibratePath --mpi $options $cmPath > $outPath", $jobname, $errPath, $nproc, $queue); } else { my $gbPerThread = (($predicted_Mb_per_thread * 2.) / 1000.); # double prediction to be safe (yes, it can be that inaccurate...) 
@@ -162,7 +162,7 @@ sub cmcalibrate_wrapper { #$requiredMb = 6000; #} # if the job is run in the cloud, assign the job an index - Bio::Rfam::Utils::submit_nonmpi_job($config->location, "$cmcalibratePath --cpu $nproc $options $cmPath > $outPath", $jobname, $errPath, $nproc, $requiredMb, undef, $queue); + Bio::Rfam::Utils::submit_nonmpi_job($config, "$cmcalibratePath --cpu $nproc $options $cmPath > $outPath", $jobname, $errPath, $nproc, $requiredMb, undef, $queue); } } return ($predicted_seconds / 60); @@ -295,7 +295,7 @@ sub cmsearch_or_cmscan_wrapper { } my $requiredMb = $ncpu * $gbPerThread * 1000.; # - Bio::Rfam::Utils::submit_nonmpi_job($config->location, $config->infernalPath . "$program $options $cmPath $seqfilePath > $outPath", $jobname, $errPath, $ncpu, $requiredMb, $submitExStr, $queue); + Bio::Rfam::Utils::submit_nonmpi_job($config, $config->infernalPath . "$program $options $cmPath $seqfilePath > $outPath", $jobname, $errPath, $ncpu, $requiredMb, $submitExStr, $queue); } return; @@ -421,11 +421,11 @@ sub cmalign_wrapper { my @jobnameA = ($jobname); my @outnameA = ($outPath); if($use_mpi) { - Bio::Rfam::Utils::submit_mpi_job($config->location, "$cmalignPath --mpi $options $cmPath $seqfilePath > $outPath", "a.$$", "a.$$.err", $nproc, $queue); + Bio::Rfam::Utils::submit_mpi_job($config, "$cmalignPath --mpi $options $cmPath $seqfilePath > $outPath", "a.$$", "a.$$.err", $nproc, $queue); Bio::Rfam::Utils::wait_for_cluster($config->location, $uname, \@jobnameA, \@outnameA, "\# CPU time:", "cmalign-mpi", $logFH, "[$nproc processors]", -1, $do_stdout); } else { # don't use MPI - Bio::Rfam::Utils::submit_nonmpi_job($config->location, "$cmalignPath $options $cmPath $seqfilePath > $outPath", "a.$$", "a.$$.err", $nproc, $requiredMb, "", $queue); + Bio::Rfam::Utils::submit_nonmpi_job($config, "$cmalignPath $options $cmPath $seqfilePath > $outPath", "a.$$", "a.$$.err", $nproc, $requiredMb, "", $queue); Bio::Rfam::Utils::wait_for_cluster($config->location, $uname, 
\@jobnameA, \@outnameA, "\# CPU time:", "cmalign-thr", $logFH, "[$nproc processors]", -1, $do_stdout); } unlink $errPath; diff --git a/Rfam/Lib/Bio/Rfam/Utils.pm b/Rfam/Lib/Bio/Rfam/Utils.pm index 0dc9ffd7..45981e15 100644 --- a/Rfam/Lib/Bio/Rfam/Utils.pm +++ b/Rfam/Lib/Bio/Rfam/Utils.pm @@ -58,12 +58,13 @@ sub run_local_command { Title : submit_nonmpi_job() Incept : EPN, Tue Apr 2 05:59:40 2013 - Usage : submit_nonmpi_job($location, $cmd, $jobname, $errPath, $ncpu, $reqMb, $exStr) + Usage : submit_nonmpi_job($config, $cmd, $jobname, $errPath, $ncpu, $reqMb, $exStr) Function : Submits non-MPI job defined by command $cmd. - : Submission syntax depends on $location value. + : Submission syntax depends on $config->location and + : config->scheduler values. : We do *not* wait for job to finish. Caller : must do that, probably with wait_for_cluster(). - Args : $location: config->location, e.g. "EBI" + Args : $config: Rfam config, with 'location' and 'scheduler' : $cmd: command to run : $jobname: name for job : $errPath: path for stderr output @@ -77,30 +78,34 @@ sub run_local_command { =cut sub submit_nonmpi_job { - my ($location, $cmd, $jobname, $errPath, $ncpu, $reqMb, $exStr, $queue) = @_; + my ($config, $cmd, $jobname, $errPath, $ncpu, $reqMb, $exStr, $queue) = @_; my $submit_cmd = ""; if(defined $queue && $queue eq "p") { $queue = "production"; } if(defined $queue && $queue eq "r") { $queue = "research"; } - if($location eq "EBI") { + if($config->location eq "EBI") { if(! defined $ncpu) { die "submit_nonmpi_job(), location is EBI, but ncpu is undefined"; } if(! 
defined $reqMb) { die "submit_nonmpi_job(), location is EBI, but reqMb is undefined"; } - #TODO: update this block to work with 'sbatch' instead of 'bsub', may require creating a - #new file that is the 'script' that is used as command-line argument to 'sbatch' - $submit_cmd = "bsub "; - if(defined $exStr && $exStr ne "") { $submit_cmd .= "$exStr "; } - if(defined $queue && $queue ne "") { - $submit_cmd .= "-q $queue "; + + if($config->scheduler eq "slurm") { + $submit_cmd = "sbatch "; + if(defined $exStr && $exStr ne "") { $submit_cmd .= "$exStr "; } + $submit_cmd .= "-c $ncpu -J $jobname -o /dev/null -e $errPath --mem-per-cpu=$reqMb --wrap \"$cmd\" > /dev/null"; } - else { - $submit_cmd .= "-q research "; + else { # lsf + $submit_cmd = "bsub "; + if(defined $exStr && $exStr ne "") { $submit_cmd .= "$exStr "; } + if(defined $queue && $queue ne "") { + $submit_cmd .= "-q $queue "; + } + else { + $submit_cmd .= "-q research "; + } + $submit_cmd .= "-n $ncpu -J $jobname -o /dev/null -e $errPath -M $reqMb -R \"rusage[mem=$reqMb]\" \"$cmd\" > /dev/null"; } - $submit_cmd .= "-n $ncpu -J $jobname -o /dev/null -e $errPath -M $reqMb -R \"rusage[mem=$reqMb]\" \"$cmd\" > /dev/null"; } elsif($location eq "CLOUD"){ - - # temporarily minimize memory to 6GB only to work with the test cloud # if ($reqMb >= 24000){ # $reqMb = 6000; @@ -142,10 +147,11 @@ sub submit_nonmpi_job { Incept : EPN, Tue Apr 2 05:59:40 2013 Usage : submit_mpi_job($location, $cmd, ) Function : Submits MPI job defined by command $cmd. - : MPI submission syntax depends on $location value. + : MPI submission syntax depends on $config->location and + : config->scheduler values. : We do *not* wait for job to finish. Caller : must do that, probably with wait_for_cluster(). - Args : $location: config->location, e.g. 
"EBI" + Args : $config: Rfam config, with 'location' and 'scheduler' : $cmd: command to run : $jobname: name for job : $errPath: path for stderr output @@ -157,10 +163,10 @@ sub submit_nonmpi_job { =cut sub submit_mpi_job { - my ($location, $cmd, $jobname, $errPath, $nproc, $queue) = @_; + my ($config, $cmd, $jobname, $errPath, $nproc, $queue) = @_; my $submit_cmd = ""; - if($location eq "EBI") { + if($config->location eq "EBI") { # EPN: for some reason, this 'module' command fails inside perl..., I think it may be unnecessary because it's in my .bashrc #my $prepcmd = "module load openmpi-x86_64"; #system($prepcmd); @@ -170,9 +176,14 @@ sub submit_mpi_job { # TEMPORARILY USING research queue and span[ptile=8] as per Asier Roa's instructions, see email ("mpi jobs on cluster") # forwarded from Jen, on 08.27.13. #TODO: update 'bsub' to 'sbatch' - $submit_cmd = "bsub -J $jobname -e $errPath -q mpi -I -n $nproc -R \"span[ptile=2]\" -a openmpi mpirun -np $nproc -mca btl tcp,self $cmd"; - # ORIGINAL COMMAND (I BELIEVE WE WILL REVERT TO THIS EVENTUALLY): - # $submit_cmd = "bsub -J $jobname -e $errPath -q mpi -I -n $nproc -a openmpi mpirun.lsf -np $nproc -mca btl tcp,self $cmd"; + if($config->scheduler eq "slurm") { + $submit_cmd .= "sbatch -J $jobname -e $errPath -c $nproc --mem-per-cpu=$reqMb --wrap \"$cmd\" > /dev/null"; + } + else { # lsf + $submit_cmd = "bsub -J $jobname -e $errPath -q mpi -I -n $nproc -R \"span[ptile=2]\" -a openmpi mpirun -np $nproc -mca btl tcp,self $cmd"; + # ORIGINAL COMMAND (I BELIEVE WE WILL REVERT TO THIS EVENTUALLY): + # $submit_cmd = "bsub -J $jobname -e $errPath -q mpi -I -n $nproc -a openmpi mpirun.lsf -np $nproc -mca btl tcp,self $cmd"; + } } elsif($location eq "JFRC") { my $queue_opt = ""; From 044af88938b20342298dd1adee38715638cda18c Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Wed, 18 Oct 2023 20:11:51 +0100 Subject: [PATCH 04/16] Adds config->scheduler subroutine --- Rfam/Lib/Bio/Rfam/Config.pm | 8 ++++++++ 1 file changed, 8 
insertions(+) diff --git a/Rfam/Lib/Bio/Rfam/Config.pm b/Rfam/Lib/Bio/Rfam/Config.pm index 5e20c313..1e0c113b 100644 --- a/Rfam/Lib/Bio/Rfam/Config.pm +++ b/Rfam/Lib/Bio/Rfam/Config.pm @@ -523,4 +523,12 @@ sub rfamTeam { return $self->{'_config'}->{rfamTeam}; } +sub scheduler { + my $self = shift; + if ( $#_ >= 0 ) { + warn "Passed variable to ro config\n"; + } + return $self->{'_config'}->{scheduler}; +} + 1; From 92b576dad7a55e14d112af40bd3627d82bb8c77e Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Wed, 18 Oct 2023 20:12:23 +0100 Subject: [PATCH 05/16] Replaces wait_for_cluster call with wait_for_cluster_light --- Rfam/Lib/Bio/Rfam/Infernal.pm | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Rfam/Lib/Bio/Rfam/Infernal.pm b/Rfam/Lib/Bio/Rfam/Infernal.pm index 7ea70ad9..dc58964f 100644 --- a/Rfam/Lib/Bio/Rfam/Infernal.pm +++ b/Rfam/Lib/Bio/Rfam/Infernal.pm @@ -1,4 +1,4 @@ -,package Bio::Rfam::Infernal; +package Bio::Rfam::Infernal; # Wrappers for Infernal executables called by rfsearch and rfmake. 
@@ -420,13 +420,14 @@ sub cmalign_wrapper { my $errPath = "a.$$.err"; my @jobnameA = ($jobname); my @outnameA = ($outPath); + my @errnameA = ($errPath); if($use_mpi) { Bio::Rfam::Utils::submit_mpi_job($config, "$cmalignPath --mpi $options $cmPath $seqfilePath > $outPath", "a.$$", "a.$$.err", $nproc, $queue); - Bio::Rfam::Utils::wait_for_cluster($config->location, $uname, \@jobnameA, \@outnameA, "\# CPU time:", "cmalign-mpi", $logFH, "[$nproc processors]", -1, $do_stdout); + Bio::Rfam::Utils::wait_for_cluster_light($config, $uname, \@jobnameA, \@outnameA, \@errnameA, "\# CPU time:", "cmalign-mpi", $logFH, "[$nproc processors]", -1, $do_stdout); } else { # don't use MPI Bio::Rfam::Utils::submit_nonmpi_job($config, "$cmalignPath $options $cmPath $seqfilePath > $outPath", "a.$$", "a.$$.err", $nproc, $requiredMb, "", $queue); - Bio::Rfam::Utils::wait_for_cluster($config->location, $uname, \@jobnameA, \@outnameA, "\# CPU time:", "cmalign-thr", $logFH, "[$nproc processors]", -1, $do_stdout); + Bio::Rfam::Utils::wait_for_cluster_light($config, $uname, \@jobnameA, \@outnameA, \@errnameA, "\# CPU time:", "cmalign-thr", $logFH, "[$nproc processors]", -1, $do_stdout); } unlink $errPath; } From fa03bc0047fda0e63b9c85bd89cda3db307d89db Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Wed, 18 Oct 2023 20:13:06 +0100 Subject: [PATCH 06/16] Adds slurm support for job submission and monitoring --- Rfam/Lib/Bio/Rfam/Utils.pm | 139 +++++++++++++++++++++++-------------- 1 file changed, 85 insertions(+), 54 deletions(-) diff --git a/Rfam/Lib/Bio/Rfam/Utils.pm b/Rfam/Lib/Bio/Rfam/Utils.pm index 45981e15..962124b2 100644 --- a/Rfam/Lib/Bio/Rfam/Utils.pm +++ b/Rfam/Lib/Bio/Rfam/Utils.pm @@ -91,7 +91,7 @@ sub submit_nonmpi_job { if($config->scheduler eq "slurm") { $submit_cmd = "sbatch "; if(defined $exStr && $exStr ne "") { $submit_cmd .= "$exStr "; } - $submit_cmd .= "-c $ncpu -J $jobname -o /dev/null -e $errPath --mem-per-cpu=$reqMb --wrap \"$cmd\" > /dev/null"; + $submit_cmd .= "-c 
$ncpu -J $jobname -o /dev/null -e $errPath --mem-per-cpu=$reqMb --time=48:00:00 --wrap \"$cmd\" > /dev/null"; } else { # lsf $submit_cmd = "bsub "; @@ -105,14 +105,14 @@ $submit_cmd .= "-n $ncpu -J $jobname -o /dev/null -e $errPath -M $reqMb -R \"rusage[mem=$reqMb]\" \"$cmd\" > /dev/null"; } } - elsif($location eq "CLOUD"){ + elsif($config->location eq "CLOUD"){ # temporarily minimize memory to 6GB only to work with the test cloud # if ($reqMb >= 24000){ # $reqMb = 6000; # } $submit_cmd = "/Rfam/software/bin/rfkubesub.py \"$cmd\" $ncpu $reqMb $jobname"; } - elsif($location eq "JFRC") { + elsif($config->location eq "JFRC") { my $batch_opt = ""; if(defined $ncpu && $ncpu > 1) { $batch_opt = "-pe batch $ncpu"; } $submit_cmd = "qsub "; @@ -124,11 +124,11 @@ $submit_cmd .= " -N $jobname -o /dev/null -e $errPath $batch_opt -b y -cwd -V \"$cmd\" > /dev/null"; } # local command - elsif($location eq ""){ + elsif($config->location eq ""){ $submit_cmd = $cmd } else { - die "ERROR unknown location $location in submit_nonmpi_job()"; + die "ERROR unknown location " . $config->location . " in submit_nonmpi_job()"; } # actually submit job @@ -145,7 +145,7 @@ Title : submit_mpi_job() Incept : EPN, Tue Apr 2 05:59:40 2013 - Usage : submit_mpi_job($location, $cmd, ) + Usage : submit_mpi_job($config, $cmd, ) Function : Submits MPI job defined by command $cmd. : MPI submission syntax depends on $config->location and : config->scheduler values. @@ -177,7 +177,7 @@ # forwarded from Jen, on 08.27.13. 
#TODO: update 'bsub' to 'sbatch' if($config->scheduler eq "slurm") { - $submit_cmd .= "sbatch -J $jobname -e $errPath -c $nproc --mem-per-cpu=$reqMb --wrap \"$cmd\" > /dev/null"; + $submit_cmd .= "sbatch -J $jobname -e $errPath -c $nproc --wrap \"mpirun -np $nproc $cmd\" > /dev/null"; } else { # lsf $submit_cmd = "bsub -J $jobname -e $errPath -q mpi -I -n $nproc -R \"span[ptile=2]\" -a openmpi mpirun -np $nproc -mca btl tcp,self $cmd"; @@ -185,16 +185,16 @@ # $submit_cmd = "bsub -J $jobname -e $errPath -q mpi -I -n $nproc -a openmpi mpirun.lsf -np $nproc -mca btl tcp,self $cmd"; } } - elsif($location eq "JFRC") { + elsif($config->location eq "JFRC") { my $queue_opt = ""; if($queue ne "") { $queue_opt = "-l $queue=true "; } $submit_cmd = "qsub -N $jobname -e $errPath -o /dev/null -b y -cwd -V -pe impi $nproc " . $queue_opt . "\"mpirun -np $nproc $cmd\" > /dev/null"; } - elsif ($location eq "CLOUD"){ + elsif ($config->location eq "CLOUD"){ die "ERROR: MPI unavailable on CLOUD. Consider using -cnompi option"; } else { - die "ERROR unknown location $location in submit_mpi_job()"; + die "ERROR unknown location " . $config->location . " in submit_mpi_job()"; } # actually submit job @@ -237,7 +237,7 @@ : (3) Dies if $max_minutes is defined and != -1, and any : job takes longer than $max_minutes to complete. : - Args : $location: location, e.g. 
"JFRC" or "EBI" + Args : $config: Rfam config, with 'location' and 'scheduler' : $username: username the cluster jobs belong to : $jobnameAR: ref to array of list of job names on cluster : $outnameAR: ref to array of list of output file names, one per job @@ -255,7 +255,7 @@ sub submit_mpi_job { =cut sub wait_for_cluster { - my ($location, $username, $jobnameAR, $outnameAR, $success_string, $program, $outFH, $extra_note, $max_minutes, $do_stdout) = @_; + my ($config, $username, $jobnameAR, $outnameAR, $success_string, $program, $outFH, $extra_note, $max_minutes, $do_stdout) = @_; my $start_time = time(); @@ -268,7 +268,7 @@ sub wait_for_cluster { # TODO: update this subroutine to work with 'squeue' # modify username > 7 characters and job names > 10 characters if we're at EBI, because bjobs truncates these - if($location eq "EBI") { + if($config->location eq "EBI") { if(length($username) > 7) { $username = substr($username, 0, 7); # bjobs at EBI only prints first 7 letters of username } @@ -278,8 +278,8 @@ sub wait_for_cluster { } } } - elsif($location ne "JFRC") { - die "ERROR in wait_for_cluster, unrecognized location: $location"; + elsif($config->location ne "JFRC") { + die "ERROR in wait_for_cluster, unrecognized location: $config->location"; } my $sleep_nsecs = 60; # we'll call qstat/bjobs every 5 seconds @@ -297,8 +297,8 @@ sub wait_for_cluster { sleep(2); while($nsuccess != $n) { - if ($location eq "JFRC") { @infoA = split("\n", `qstat`); } - elsif($location eq "EBI") { @infoA = split("\n", `bjobs`); } + if ($config->location eq "JFRC") { @infoA = split("\n", `qstat`); } + elsif($config->location eq "EBI") { @infoA = split("\n", `bjobs`); } for($i = 0; $i < $n; $i++) { $ininfoA[$i] = 0; } $nrunning = 0; @@ -307,13 +307,13 @@ sub wait_for_cluster { if($line =~ m/^\s*\d+\s+/) { $line =~ s/^\s*//; @elA = split(/\s+/, $line); - if($location eq "JFRC") { + if($config->location eq "JFRC") { #1232075 4.79167 QLOGIN davisf r 03/25/2013 14:24:11 
f02.q@f02u09.int.janelia.org 8 # 396183 10.25000 QLOGIN nawrockie r 07/26/2013 10:10:41 new.q@h02u19.int.janelia.org 1 # 565685 0.00000 c.25858 nawrockie qw 08/01/2013 15:18:55 81 ($jobname, $uname, $status) = ($elA[2], $elA[3], $elA[4]); } - elsif($location eq "EBI") { + elsif($config->location eq "EBI") { # jobid uname status queue sub node run node job name date # 5134531 vitor RUN research-r ebi-004 ebi5-037 *lection.R Apr 29 18:00 # 4422939 stauch PEND research-r ebi-001 *ay[16992] Apr 26 12:56 @@ -331,12 +331,12 @@ sub wait_for_cluster { (! $ininfoA[$i]) && # we didn't already find this job in the queue ($jobnameAR->[$i] eq $jobname)) { # jobname match $ininfoA[$i] = 1; - if($location eq "JFRC") { + if($config->location eq "JFRC") { if($status eq "r") { $nrunning++; } elsif($status =~ m/E/) { die "wait_for_cluster(), internal error, qstat shows Error status: $line"; } else { $nwaiting++; } } - elsif($location eq "EBI") { + elsif($config->location eq "EBI") { if ($status eq "RUN") { $nrunning++; } elsif($status eq "PEND") { $nwaiting++; } else { die "wait_for_cluster(), internal error, bjobs shows non-\"RUN\" and non-\"PEND\" status: $line"; } @@ -416,7 +416,7 @@ sub wait_for_cluster { : (3) Dies if $max_minutes is defined and != -1, and any : job takes longer than $max_minutes to complete. : - Args : $location: location, e.g. 
"JFRC" or "EBI" + Args : $config: Rfam config, with 'location' and 'scheduler' : $username: username the cluster jobs belong to : $jobnameAR: ref to array of list of job names on cluster : $outnameAR: ref to array of list of output file names, one per job @@ -435,7 +435,7 @@ sub wait_for_cluster { =cut sub wait_for_cluster_light { - my ($location, $username, $jobnameAR, $outnameAR, $errnameAR, $success_string, $program, $outFH, $extra_note, $max_minutes, $do_stdout) = @_; + my ($config, $username, $jobnameAR, $outnameAR, $errnameAR, $success_string, $program, $outFH, $extra_note, $max_minutes, $do_stdout) = @_; my $start_time = time(); my $n = scalar(@{$jobnameAR}); @@ -447,20 +447,22 @@ sub wait_for_cluster_light { if(scalar(@{$outnameAR}) != $n) { die "wait_for_cluster_light(), internal error, number of elements in jobnameAR and outnameAR differ"; } if(scalar(@{$errnameAR}) != $n) { die "wait_for_cluster_light(), internal error, number of elements in jobnameAR and errnameAR differ"; } - #TODO: update this subroutine to work with 'squeue' - # modify username > 7 characters and job names > 10 characters if we're at EBI, because bjobs truncates these - if($location eq "EBI") { - if(length($username) > 7) { - $username = substr($username, 0, 7); # bjobs at EBI only prints first 7 letters of username - } - for($i = 0; $i < $n; $i++) { - if(length($jobnameAR->[$i]) > 10) { # NOTE: THIS WILL CHANGE THE VALUES IN THE ACTUAL ARRAY jobnameAR POINTS TO! - $jobnameAR->[$i] = "*" . 
substr($jobnameAR->[$i], -9); + # modify username > 7 characters and job names > 10 characters if we're using lsf at EBI, because bjobs truncates these + # if we are using slurm we will use the --format option to squeue to deal with the fact that squeue truncates job names to 8 chars by default + if($config->location eq "EBI") { + if($config->scheduler ne "slurm") { + if(length($username) > 7) { + $username = substr($username, 0, 7); # bjobs at EBI only prints first 7 letters of username + } + for($i = 0; $i < $n; $i++) { + if(length($jobnameAR->[$i]) > 10) { # NOTE: THIS WILL CHANGE THE VALUES IN THE ACTUAL ARRAY jobnameAR POINTS TO! + $jobnameAR->[$i] = "*" . substr($jobnameAR->[$i], -9); + } } } } - elsif(($location ne "JFRC") && ($location ne "CLOUD")) { - die "ERROR in wait_for_cluster_light, unrecognized location: $location"; + elsif(($config->location ne "JFRC") && ($config->location ne "CLOUD")) { + die "ERROR in wait_for_cluster_light, unrecognized location: $config->location"; } my $sleep_nsecs = 30; # we'll look at file system every 30 seconds @@ -514,28 +516,45 @@ sub wait_for_cluster_light { sleep(rand(30)); # randomize wait time here, so all jobs started at same time don't run qstat/bjobs at exact same time $ncycle = 0; # reset to 0 $ncluster_check++; - if ($location eq "JFRC") { @infoA = split("\n", `qstat`); } - elsif($location eq "EBI") { @infoA = split("\n", `bjobs`); } + if ( $config->location eq "JFRC") { + @infoA = split("\n", `qstat`); + } + elsif(($config->location eq "EBI") && ($config->scheduler eq "slurm")) { + @infoA = split("\n", `squeue --format=\"%.8i %.9P %25j %10u %.8T %.12M %9N\"`); + # --format used to specify job names can be 25 characters, instead of default 8 + } + elsif( $config->location eq "EBI") { # lsf + @infoA = split("\n", `bjobs`); + } # Fetch all running jobs of a specific user - elsif($location eq "CLOUD") { @infoA = split("\n", `kubectl get pods --selector=user=$username --selector=tier=backend`);} + 
elsif($config->location eq "CLOUD") { + @infoA = split("\n", `kubectl get pods --selector=user=$username --selector=tier=backend`); + } # initialize array for($i = 0; $i < $n; $i++) { $ininfoA[$i] = 0; } # parse job log foreach $line (@infoA) { - if ($location ne "CLOUD"){ + if ($config->location ne "CLOUD"){ if($line =~ m/^\s*\d+\s+/) { $line =~ s/^\s*//; @elA = split(/\s+/, $line); - if($location eq "JFRC") { + if($config->location eq "JFRC") { #1232075 4.79167 QLOGIN davisf r 03/25/2013 14:24:11 f02.q@f02u09.int.janelia.org 8 # 396183 10.25000 QLOGIN nawrockie r 07/26/2013 10:10:41 new.q@h02u19.int.janelia.org 1 # 565685 0.00000 c.25858 nawrockie qw 08/01/2013 15:18:55 81 ($jobname, $uname, $status) = ($elA[2], $elA[3], $elA[4]); } # closes JFRC if - - elsif($location eq "EBI") { + elsif(($config->location eq "EBI") && ($config->scheduler eq "slurm")) { + #JOBID PARTITION NAME USER STATE TIME NODELIST + #35080251 standard rs.4002890-9 nawrocki RUNNING 0:02 hl-codon- + #35080252 standard rs.4002890-10 nawrocki RUNNING 0:02 hl-codon- + #35080253 standard ss.4002890-1 nawrocki RUNNING 0:02 hl-codon- + ($jobname, $uname, $status) = ($elA[2], $elA[3], $elA[4]); + # print STDERR ("uname: $uname status: $status; jobname: $jobname\n"); + } # closes EBI + slurm elsif + elsif($config->location eq "EBI") { # lsf # jobid uname status queue sub node run node job name date # 5134531 vitor RUN research-r ebi-004 ebi5-037 *lection.R Apr 29 18:00 # 4422939 stauch PEND research-r ebi-001 *ay[16992] Apr 26 12:56 @@ -543,23 +562,36 @@ sub wait_for_cluster_light { if($status eq "RUN") { $jobname = $elA[6]; } else { $jobname = $elA[5]; } #print STDERR ("uname: $uname status: $status; jobname: $jobname\n"); - } # closes EBI if - + } # closes EBI (and not slurm) elsif # no need to do this for CLOUD if($uname ne $username) { die "wait_for_cluster_light(), internal error, uname mismatch ($uname ne $username)"; } # look through our list of jobs and see if this one matches for($i = 0; $i < 
$n; $i++) { #5 #printf("\t\tsuccess: %d\tininfo: %d\tmatch: %d\n", $successA[$i], $ininfoA[$i], ($jobnameAR->[$i] eq $jobname) ? 1 : 0); - if((! $successA[$i]) && # job didn't successfully complete already - (! $ininfoA[$i]) && # we didn't already find this job in the queue - ($jobnameAR->[$i] eq $jobname)) { # jobname match - $ininfoA[$i] = 1; - $i = $n; - - if (($location eq "JFRC") && ($status =~ m/E/)) { die "wait_for_cluster_light(), internal error, qstat shows Error status: $line"; } - if (($location eq "EBI") && ($status ne "RUN" && $status ne "PEND")) { die "wait_for_cluster_light(), internal error, bjobs shows non-\"RUN\" and non-\"PEND\" status: $line"; } + if((! $successA[$i]) && # job didn't successfully complete already + (! $ininfoA[$i]) && # we didn't already find this job in the queue + ($jobnameAR->[$i] eq $jobname)) { # jobname match + $ininfoA[$i] = 1; + $i = $n; + + # make sure job state is either pending, running or completing + if(($config->location eq "JFRC") && ($status =~ m/E/)) { + die "wait_for_cluster_light(), internal error, qstat shows Error status: $line"; } + if($config->location eq "EBI") { + if((! 
defined $config->scheduler) || ($config->scheduler ne "slurm")) { + if(($status ne "RUN") && ($status ne "PEND")) { + die "wait_for_cluster_light(), internal error, bjobs shows non-\"RUN\" and non-\"PEND\" status: $line"; + } + } + elsif((defined $config->scheduler) && ($config->scheduler eq "slurm")) { + if(($status ne "RUNNING") && ($status ne "PENDING") && ($status ne "COMPLETING")) { + die "wait_for_cluster_light(), internal error, squeue shows non-\"RUNNING\", non-\"PENDING\" and non-\"COMPLETING\" status:\n$line"; + } + } + } + } } # EBI/JFRC for loop } # first line check here } # EBI/JFRC location if @@ -588,7 +620,7 @@ sub wait_for_cluster_light { $i = $n; # skip the rest of the computations # check if job is in error status, if it is, then exit # - if (($location eq "CLOUD") && ($status ne "Running" && $status ne "Pending" && $status ne "Completed" && $status ne "ContainerCreating")){ die "wait_for_cluster_light(), internal error, kubectl shows Error status: $line"; } + if (($config->location eq "CLOUD") && ($status ne "Running" && $status ne "Pending" && $status ne "Completed" && $status ne "ContainerCreating")){ die "wait_for_cluster_light(), internal error, kubectl shows Error status: $line"; } } #internal if } # for loop } # cloud segment else @@ -690,7 +722,7 @@ sub wait_for_cluster_light { } } # end of 'if(-e $errnameAR->[$i])' else { # err file doesn't exist yet, job is waiting (or failed) or job is running on cloud - if ($location ne "CLOUD"){ + if ($config->location ne "CLOUD"){ if($finishedA[$i] == 1) { die "wait_for_cluster_light() job $i finished according to qstat/bjobs, but expected output ERROR file $errnameAR->[$i] does not exist\n"; } @@ -3191,4 +3223,3 @@ this program. If not, see . 
=cut -1; From cfc2bbd4e43844caaa3154eb9a5a722a243d10b1 Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Wed, 18 Oct 2023 20:13:41 +0100 Subject: [PATCH 07/16] Updates for slurm support --- Rfam/Scripts/make/rfsearch.pl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Rfam/Scripts/make/rfsearch.pl b/Rfam/Scripts/make/rfsearch.pl index 04c09a9a..bd2bf1da 100755 --- a/Rfam/Scripts/make/rfsearch.pl +++ b/Rfam/Scripts/make/rfsearch.pl @@ -143,6 +143,7 @@ # output header + my $user; if ($config->location eq "CLOUD"){ my $host_string = hostname; @@ -652,7 +653,7 @@ my $cmcalibrate_string = ($calibrate_mpi) ? "cmcalibrate-mpi" : "cmcalibrate-thr"; if(! $do_all_local) { # job is running on the cluster - $calibrate_max_wait_secs = Bio::Rfam::Utils::wait_for_cluster_light($config->location, $user, \@jobnameA, \@outnameA, \@errnameA, "[ok]", $cmcalibrate_string, $logFH, + $calibrate_max_wait_secs = Bio::Rfam::Utils::wait_for_cluster_light($config, $user, \@jobnameA, \@outnameA, \@errnameA, "[ok]", $cmcalibrate_string, $logFH, sprintf("[$ncpus_cmcalibrate procs, should take ~%.0f minute(s)]", $predicted_minutes), -1, $do_stdout); Bio::Rfam::Utils::checkStderrFile($config->location, $calibrate_errO); # if we get here, err file was empty, so we keep going @@ -963,7 +964,7 @@ if(! 
$do_all_local) { # wait for cluster jobs to finish #$search_max_wait_secs = Bio::Rfam::Utils::wait_for_cluster($config->location, $user, \@all_jobnameA, \@all_tblOA, "# [ok]", "cmsearch", $logFH, "", -1, $do_stdout); - $search_max_wait_secs = Bio::Rfam::Utils::wait_for_cluster_light($config->location, $user, \@all_jobnameA, \@all_tblOA, \@all_errOA, "# [ok]", "cmsearch", $logFH, "", -1, $do_stdout); + $search_max_wait_secs = Bio::Rfam::Utils::wait_for_cluster_light($config, $user, \@all_jobnameA, \@all_tblOA, \@all_errOA, "# [ok]", "cmsearch", $logFH, "", -1, $do_stdout); } $search_wall_secs = time() - $search_start_time; From 536b47b427bfeca0acda0bc839d9e7bad2be2260 Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Wed, 18 Oct 2023 20:14:01 +0100 Subject: [PATCH 08/16] Replaces old wait_for_cluster() call with wait_for_cluster_light() --- Rfam/Scripts/motifs/motif_scan.pl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Rfam/Scripts/motifs/motif_scan.pl b/Rfam/Scripts/motifs/motif_scan.pl index 8b61464d..7b47d9c4 100644 --- a/Rfam/Scripts/motifs/motif_scan.pl +++ b/Rfam/Scripts/motifs/motif_scan.pl @@ -388,12 +388,13 @@ sub submitToCmscan { Bio::Rfam::Infernal::cmscan_wrapper($config, $jobname, " --tblout " . $tblO . " " . 
$searchopts, $CMdb, $seed, $cmsO, $errO, "", "", 0); # Change jobname and outname to an array so they can be referenced using wait_for_cluter() - my (@jobnameAR, @outnameAR); + my (@jobnameAR, @outnameAR, @errnameAR); push(@jobnameAR, $jobname); push(@outnameAR, $tblO); + push(@errnameAR, $errO); # Determine when the job is finished and return the time it took - my $wait_time = Bio::Rfam::Utils::wait_for_cluster($config->location, $user, \@jobnameAR, \@outnameAR, "# [ok]", "cmscan $rfam_acc", "", "", -1, 1); + my $wait_time = Bio::Rfam::Utils::wait_for_cluster_light($config, $user, \@jobnameAR, \@outnameAR, \@errnameAR, "# [ok]", "cmscan $rfam_acc", "", "", -1, 1); } #------------------------------------------------------------------------------------- From 81e38af34b0dd3582aaa920c0257199150550363 Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Wed, 18 Oct 2023 20:34:05 +0100 Subject: [PATCH 09/16] Updates submit_mpi_job to take required Mb per cpu --- Rfam/Lib/Bio/Rfam/Infernal.pm | 11 +++++------ Rfam/Lib/Bio/Rfam/Utils.pm | 8 ++++---- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/Rfam/Lib/Bio/Rfam/Infernal.pm b/Rfam/Lib/Bio/Rfam/Infernal.pm index dc58964f..0aa6e7d2 100644 --- a/Rfam/Lib/Bio/Rfam/Infernal.pm +++ b/Rfam/Lib/Bio/Rfam/Infernal.pm @@ -150,14 +150,13 @@ sub cmcalibrate_wrapper { Bio::Rfam::Utils::run_local_command(sprintf("$cmcalibratePath %s $options $cmPath > $outPath", ($nproc eq "") ? "" : "--cpu $nproc")); } else { + my $gbPerThread = (($predicted_Mb_per_thread * 2.) / 1000.); # double prediction to be safe (yes, it can be that inaccurate...) + if($gbPerThread < 3.0) { $gbPerThread = 3.0; } # enforce minimum of 3.0 Gb + my $requiredMb = int($nproc * $gbPerThread * 1000.) . 
"MB"; # round to nearest Mb and append MB if($doMPI) { - Bio::Rfam::Utils::submit_mpi_job($config, "$cmcalibratePath --mpi $options $cmPath > $outPath", $jobname, $errPath, $nproc, $queue); + Bio::Rfam::Utils::submit_mpi_job($config, "$cmcalibratePath --mpi $options $cmPath > $outPath", $jobname, $errPath, $nproc, $requiredMb, $queue); } else { - my $gbPerThread = (($predicted_Mb_per_thread * 2.) / 1000.); # double prediction to be safe (yes, it can be that inaccurate...) - if($gbPerThread < 3.0) { $gbPerThread = 3.0; } # enforce minimum of 3.0 Gb - my $requiredMb = int($nproc * $gbPerThread * 1000.) . "MB"; # round to nearest Mb and append MB - #if ($config->location eq 'CLOUD'){ #$requiredMb = 6000; #} @@ -422,7 +421,7 @@ sub cmalign_wrapper { my @outnameA = ($outPath); my @errnameA = ($errPath); if($use_mpi) { - Bio::Rfam::Utils::submit_mpi_job($config, "$cmalignPath --mpi $options $cmPath $seqfilePath > $outPath", "a.$$", "a.$$.err", $nproc, $queue); + Bio::Rfam::Utils::submit_mpi_job($config, "$cmalignPath --mpi $options $cmPath $seqfilePath > $outPath", "a.$$", "a.$$.err", $nproc, $requiredMb, $queue); Bio::Rfam::Utils::wait_for_cluster_light($config, $uname, \@jobnameA, \@outnameA, \@errnameA, "\# CPU time:", "cmalign-mpi", $logFH, "[$nproc processors]", -1, $do_stdout); } else { # don't use MPI diff --git a/Rfam/Lib/Bio/Rfam/Utils.pm b/Rfam/Lib/Bio/Rfam/Utils.pm index 962124b2..e8fd12dc 100644 --- a/Rfam/Lib/Bio/Rfam/Utils.pm +++ b/Rfam/Lib/Bio/Rfam/Utils.pm @@ -156,6 +156,7 @@ sub submit_nonmpi_job { : $jobname: name for job : $errPath: path for stderr output : $nproc: number of MPI processors to use + : $reqMb: required number of Mb for job, can be undefined if location eq "JFRC" : $queue: queue to submit to, "" for default, ignored if location eq "EBI" Returns : void Dies : If MPI submit command fails. 
@@ -163,7 +164,7 @@ sub submit_nonmpi_job { =cut sub submit_mpi_job { - my ($config, $cmd, $jobname, $errPath, $nproc, $queue) = @_; + my ($config, $cmd, $jobname, $errPath, $nproc, $reqMb, $queue) = @_; my $submit_cmd = ""; if($config->location eq "EBI") { @@ -175,12 +176,11 @@ sub submit_mpi_job { # Need to use MPI queue ($queue is irrelevant) # TEMPORARILY USING research queue and span[ptile=8] as per Asier Roa's instructions, see email ("mpi jobs on cluster") # forwarded from Jen, on 08.27.13. - #TODO: update 'bsub' to 'sbatch' if($config->scheduler eq "slurm") { - $submit_cmd .= "sbatch -J $jobname -e $errPath -c $nproc --wrap \"mpirun -np $nproc $cmd\" > /dev/null"; + $submit_cmd .= "sbatch -J $jobname -e $errPath -c $nproc --mem-per-cpu=$reqMb --time=48:00:00 --wrap \"mpirun -np $nproc $cmd\" > /dev/null"; } else { # lsf - $submit_cmd = "bsub -J $jobname -e $errPath -q mpi -I -n $nproc -R \"span[ptile=2]\" -a openmpi mpirun -np $nproc -mca btl tcp,self $cmd"; + $submit_cmd = "bsub -J $jobname -e $errPath -M $reqMb -q mpi -I -n $nproc -R \"span[ptile=2]\" -a openmpi mpirun -np $nproc -mca btl tcp,self $cmd"; # ORIGINAL COMMAND (I BELIEVE WE WILL REVERT TO THIS EVENTUALLY): # $submit_cmd = "bsub -J $jobname -e $errPath -q mpi -I -n $nproc -a openmpi mpirun.lsf -np $nproc -mca btl tcp,self $cmd"; } From 232f2ac14b8fecfb4ed413635fb97cf60e6e5429 Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Mon, 23 Oct 2023 17:32:06 +0100 Subject: [PATCH 10/16] Adds checks that scheduler is defined before checking its value --- Rfam/Lib/Bio/Rfam/Utils.pm | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Rfam/Lib/Bio/Rfam/Utils.pm b/Rfam/Lib/Bio/Rfam/Utils.pm index e8fd12dc..1032a0e8 100644 --- a/Rfam/Lib/Bio/Rfam/Utils.pm +++ b/Rfam/Lib/Bio/Rfam/Utils.pm @@ -88,7 +88,7 @@ sub submit_nonmpi_job { if(! defined $ncpu) { die "submit_nonmpi_job(), location is EBI, but ncpu is undefined"; } if(! 
defined $reqMb) { die "submit_nonmpi_job(), location is EBI, but reqMb is undefined"; } - if($config->scheduler eq "slurm") { + if((defined $config->scheduler) && ($config->scheduler eq "slurm")) { $submit_cmd = "sbatch "; if(defined $exStr && $exStr ne "") { $submit_cmd .= "$exStr "; } $submit_cmd .= "-c $ncpu -J $jobname -o /dev/null -e $errPath --mem-per-cpu=$reqMb --time=48:00:00 --wrap \"$cmd\" > /dev/null"; @@ -176,7 +176,7 @@ sub submit_mpi_job { # Need to use MPI queue ($queue is irrelevant) # TEMPORARILY USING research queue and span[ptile=8] as per Asier Roa's instructions, see email ("mpi jobs on cluster") # forwarded from Jen, on 08.27.13. - if($config->scheduler eq "slurm") { + if((defined $config->scheduler) && ($config->scheduler eq "slurm")) { $submit_cmd .= "sbatch -J $jobname -e $errPath -c $nproc --mem-per-cpu=$reqMb --time=48:00:00 --wrap \"mpirun -np $nproc $cmd\" > /dev/null"; } else { # lsf @@ -450,7 +450,7 @@ sub wait_for_cluster_light { # modify username > 7 characters and job names > 10 characters if we're using lsf at EBI, because bjobs truncates these # if we are using slurm we will use the --format option to squeue to deal with the fact that squeue truncates job names to 8 chars by default if($config->location eq "EBI") { - if($config->scheduler ne "slurm") { + if((defined $config->scheduler) && ($config->scheduler ne "slurm")) { if(length($username) > 7) { $username = substr($username, 0, 7); # bjobs at EBI only prints first 7 letters of username } @@ -519,7 +519,7 @@ sub wait_for_cluster_light { if ( $config->location eq "JFRC") { @infoA = split("\n", `qstat`); } - elsif(($config->location eq "EBI") && ($config->scheduler eq "slurm")) { + elsif(($config->location eq "EBI") && ((defined $config->scheduler) && ($config->scheduler eq "slurm"))) { @infoA = split("\n", `squeue --format=\"%.8i %.9P %25j %10u %.8T %.12M %9N\"`); # --format used to specify job names can be 25 characters, instead of default 8 } @@ -546,7 +546,7 @@ sub 
wait_for_cluster_light { # 565685 0.00000 c.25858 nawrockie qw 08/01/2013 15:18:55 81 ($jobname, $uname, $status) = ($elA[2], $elA[3], $elA[4]); } # closes JFRC if - elsif(($config->location eq "EBI") && ($config->scheduler eq "slurm")) { + elsif(($config->location eq "EBI") && ((defined $config->scheduler) && ($config->scheduler eq "slurm"))) { #JOBID PARTITION NAME USER STATE TIME NODELIST #35080251 standard rs.4002890-9 nawrocki RUNNING 0:02 hl-codon- #35080252 standard rs.4002890-10 nawrocki RUNNING 0:02 hl-codon- From 3bd7b591635a96f070203829d323420065ec3e7a Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Mon, 23 Oct 2023 22:39:26 +0100 Subject: [PATCH 11/16] Fixes check for scheduler --- Rfam/Lib/Bio/Rfam/Utils.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Rfam/Lib/Bio/Rfam/Utils.pm b/Rfam/Lib/Bio/Rfam/Utils.pm index 1032a0e8..ff815aaa 100644 --- a/Rfam/Lib/Bio/Rfam/Utils.pm +++ b/Rfam/Lib/Bio/Rfam/Utils.pm @@ -450,7 +450,7 @@ sub wait_for_cluster_light { # modify username > 7 characters and job names > 10 characters if we're using lsf at EBI, because bjobs truncates these # if we are using slurm we will use the --format option to squeue to deal with the fact that squeue truncates job names to 8 chars by default if($config->location eq "EBI") { - if((defined $config->scheduler) && ($config->scheduler ne "slurm")) { + if((! 
defined $config->scheduler) || ($config->scheduler ne "slurm")) { if(length($username) > 7) { $username = substr($username, 0, 7); # bjobs at EBI only prints first 7 letters of username } From 07466c72f1224536a94393da9014160b41831103 Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Tue, 24 Oct 2023 16:12:04 +0100 Subject: [PATCH 12/16] Fixes slurm mem request --- Rfam/Lib/Bio/Rfam/Infernal.pm | 2 +- Rfam/Lib/Bio/Rfam/Utils.pm | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/Rfam/Lib/Bio/Rfam/Infernal.pm b/Rfam/Lib/Bio/Rfam/Infernal.pm index 0aa6e7d2..39a5d196 100644 --- a/Rfam/Lib/Bio/Rfam/Infernal.pm +++ b/Rfam/Lib/Bio/Rfam/Infernal.pm @@ -152,7 +152,7 @@ sub cmcalibrate_wrapper { else { my $gbPerThread = (($predicted_Mb_per_thread * 2.) / 1000.); # double prediction to be safe (yes, it can be that inaccurate...) if($gbPerThread < 3.0) { $gbPerThread = 3.0; } # enforce minimum of 3.0 Gb - my $requiredMb = int($nproc * $gbPerThread * 1000.) . "MB"; # round to nearest Mb and append MB + my $requiredMb = int($nproc * $gbPerThread * 1000.); # round to nearest Mb if($doMPI) { Bio::Rfam::Utils::submit_mpi_job($config, "$cmcalibratePath --mpi $options $cmPath > $outPath", $jobname, $errPath, $nproc, $requiredMb, $queue); } diff --git a/Rfam/Lib/Bio/Rfam/Utils.pm b/Rfam/Lib/Bio/Rfam/Utils.pm index ff815aaa..9a138fed 100644 --- a/Rfam/Lib/Bio/Rfam/Utils.pm +++ b/Rfam/Lib/Bio/Rfam/Utils.pm @@ -69,7 +69,7 @@ sub run_local_command { : $jobname: name for job : $errPath: path for stderr output : $ncpu: number of CPUs to run job on, can be undefined if location eq "JFRC" - : $reqMb: required number of Mb for job, can be undefined if location eq "JFRC" + : $reqMb: required number of Mb for job (all threads combined), can be undefined if location eq "JFRC" : $exStr: extra string to add to qsub/sub command : $queue: queue to submit to, "" for default, 'p' = "production", 'r' = "research"; Returns : void @@ -89,6 +89,7 @@ sub submit_nonmpi_job { if(! 
defined $reqMb) { die "submit_nonmpi_job(), location is EBI, but reqMb is undefined"; } if((defined $config->scheduler) && ($config->scheduler eq "slurm")) { + $reqMb = int(($reqMb + $ncpu - 1) / $ncpu); # we specify Mb per thread, others are total Mb for all threads $submit_cmd = "sbatch "; if(defined $exStr && $exStr ne "") { $submit_cmd .= "$exStr "; } $submit_cmd .= "-c $ncpu -J $jobname -o /dev/null -e $errPath --mem-per-cpu=$reqMb --time=48:00:00 --wrap \"$cmd\" > /dev/null"; @@ -156,7 +157,7 @@ sub submit_nonmpi_job { : $jobname: name for job : $errPath: path for stderr output : $nproc: number of MPI processors to use - : $reqMb: required number of Mb for job, can be undefined if location eq "JFRC" + : $reqMb: required number of Mb for job (all threads), can be undefined if location eq "JFRC" : $queue: queue to submit to, "" for default, ignored if location eq "EBI" Returns : void Dies : If MPI submit command fails. @@ -176,8 +177,10 @@ sub submit_mpi_job { # Need to use MPI queue ($queue is irrelevant) # TEMPORARILY USING research queue and span[ptile=8] as per Asier Roa's instructions, see email ("mpi jobs on cluster") # forwarded from Jen, on 08.27.13. 
- if((defined $config->scheduler) && ($config->scheduler eq "slurm")) { - $submit_cmd .= "sbatch -J $jobname -e $errPath -c $nproc --mem-per-cpu=$reqMb --time=48:00:00 --wrap \"mpirun -np $nproc $cmd\" > /dev/null"; + if((defined $config->scheduler) && ($config->scheduler eq "slurm")) { + $reqMb /= $nproc; # we specify Mb per thread, others are total Mb for all threads + $submit_cmd .= "sbatch -J $jobname -e $errPath -n $nproc --mem-per-cpu=$reqMb --time=48:00:00 --wrap \"mpirun -np $nproc $cmd\" > /dev/null"; +# $submit_cmd .= "sbatch -J $jobname -e $errPath -n $nproc --mem-per-cpu=$reqMb --time=48:00:00 --wrap \"srun -n $nproc $cmd\" > /dev/null"; } else { # lsf $submit_cmd = "bsub -J $jobname -e $errPath -M $reqMb -q mpi -I -n $nproc -R \"span[ptile=2]\" -a openmpi mpirun -np $nproc -mca btl tcp,self $cmd"; From 25997c42ca3f55f1bd7b737907260f5e9c008b0c Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Tue, 24 Oct 2023 16:26:36 +0100 Subject: [PATCH 13/16] Removes TODO comments --- Rfam/Scripts/view/make_sunburst.pl | 6 ++---- Rfam/Scripts/view/rfam_family_view.pl | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/Rfam/Scripts/view/make_sunburst.pl b/Rfam/Scripts/view/make_sunburst.pl index cde99054..1c1ddfbe 100755 --- a/Rfam/Scripts/view/make_sunburst.pl +++ b/Rfam/Scripts/view/make_sunburst.pl @@ -171,13 +171,11 @@ sub help{ parameter specifies which chunk should be built for this job The ability to process families in chunks allows the script to be run on the -farm, using a submission command something like: +farm, using a submission command (on lsf) something like: - TODO: update this from bsub to sbatch bsub -q normal -R"select[mem>4000] rusage[mem=4000]" -M 4000000 \ -J "sunburst[1-20]" -o "sunburst.\%J.\%I.log" '$0 \ -chunksize 684 -chunk \$\{LSB_JOBINDEX\}' - + EOF_help } - diff --git a/Rfam/Scripts/view/rfam_family_view.pl b/Rfam/Scripts/view/rfam_family_view.pl index a46fe30d..b97b3024 100755 --- 
a/Rfam/Scripts/view/rfam_family_view.pl +++ b/Rfam/Scripts/view/rfam_family_view.pl @@ -145,8 +145,7 @@ =head1 DESCRIPTION This is a script to run the Rfam view process for a given family. It's intended to be run by the job dequeuer, which polls the rfam_jobs.job_history table for -pending view process jobs, and runs this script on the farm via "bsub" (TODO: -update to 'sbatch') +pending view process jobs, and runs this script on the farm via "bsub" or "sbatch". We have the concept of plugin sets, so that we can group together view plugins which have common features. For example, there may be plugins that are based on From 4d7f45e5dbaf6df582c6dc1a8bbcf0a885f8ecd3 Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Fri, 10 Nov 2023 19:35:55 +0000 Subject: [PATCH 14/16] Adds --mpi=pmix to srun in submit_mpi_job --- Rfam/Lib/Bio/Rfam/Utils.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Rfam/Lib/Bio/Rfam/Utils.pm b/Rfam/Lib/Bio/Rfam/Utils.pm index 9a138fed..c33067dd 100644 --- a/Rfam/Lib/Bio/Rfam/Utils.pm +++ b/Rfam/Lib/Bio/Rfam/Utils.pm @@ -179,8 +179,8 @@ sub submit_mpi_job { # forwarded from Jen, on 08.27.13. 
if((defined $config->scheduler) && ($config->scheduler eq "slurm")) { $reqMb /= $nproc; # we specify Mb per thread, others are total Mb for all threads - $submit_cmd .= "sbatch -J $jobname -e $errPath -n $nproc --mem-per-cpu=$reqMb --time=48:00:00 --wrap \"mpirun -np $nproc $cmd\" > /dev/null"; -# $submit_cmd .= "sbatch -J $jobname -e $errPath -n $nproc --mem-per-cpu=$reqMb --time=48:00:00 --wrap \"srun -n $nproc $cmd\" > /dev/null"; + #$submit_cmd .= "sbatch -J $jobname -e $errPath -n $nproc --mem-per-cpu=$reqMb --time=48:00:00 --wrap \"mpirun -np $nproc $cmd\" > /dev/null"; + $submit_cmd .= "sbatch -J $jobname -e $errPath -n $nproc --mem-per-cpu=$reqMb --time=48:00:00 --wrap \"srun --mpi=pmix -n $nproc $cmd\" > /dev/null"; } else { # lsf $submit_cmd = "bsub -J $jobname -e $errPath -M $reqMb -q mpi -I -n $nproc -R \"span[ptile=2]\" -a openmpi mpirun -np $nproc -mca btl tcp,self $cmd"; From d5cc371d3ce3a8918275997b04a8a4afefed49db Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Tue, 14 Nov 2023 15:31:05 +0000 Subject: [PATCH 15/16] Fixes check for slurm job status --- Rfam/Lib/Bio/Rfam/Utils.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Rfam/Lib/Bio/Rfam/Utils.pm b/Rfam/Lib/Bio/Rfam/Utils.pm index c33067dd..dab05b94 100644 --- a/Rfam/Lib/Bio/Rfam/Utils.pm +++ b/Rfam/Lib/Bio/Rfam/Utils.pm @@ -589,8 +589,8 @@ sub wait_for_cluster_light { } } elsif((defined $config->scheduler) && ($config->scheduler eq "slurm")) { - if(($status ne "RUNNING") && ($status ne "PENDING") && ($status ne "COMPLETING")) { - die "wait_for_cluster_light(), internal error, squeue shows non-\"RUNNING\", non-\"PENDING\" and non-\"COMPLETING\" status:\n$line"; + if(($status !~ m/^RUNNING/) && ($status !~ m/^PENDING/) && ($status !~ m/^COMPLETI/)) { + die "wait_for_cluster_light(), internal error, squeue shows non-\"RUNNING\", non-\"PENDING\" and non-\"COMPLETI\" status:\n$line"; } } } From 1c830b5ba0f064530b82eb3c222408aeef1e69c8 Mon Sep 17 00:00:00 2001 From: Eric 
Nawrocki Date: Tue, 14 Nov 2023 15:31:45 +0000 Subject: [PATCH 16/16] Removes unnecessary check that err file is empty after calibration --- Rfam/Scripts/make/rfsearch.pl | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/Rfam/Scripts/make/rfsearch.pl b/Rfam/Scripts/make/rfsearch.pl index bd2bf1da..51282a75 100755 --- a/Rfam/Scripts/make/rfsearch.pl +++ b/Rfam/Scripts/make/rfsearch.pl @@ -655,9 +655,14 @@ if(! $do_all_local) { # job is running on the cluster $calibrate_max_wait_secs = Bio::Rfam::Utils::wait_for_cluster_light($config, $user, \@jobnameA, \@outnameA, \@errnameA, "[ok]", $cmcalibrate_string, $logFH, sprintf("[$ncpus_cmcalibrate procs, should take ~%.0f minute(s)]", $predicted_minutes), -1, $do_stdout); - Bio::Rfam::Utils::checkStderrFile($config->location, $calibrate_errO); - # if we get here, err file was empty, so we keep going - if(! $do_dirty) { unlink $calibrate_errO; } # this file is empty anyway + + # we used to check the $calibrate_errO file here and die if it was + # not empty but we don't do that anymore because slurm MPI jobs + # output innocuous warnings to .err file and because + # wait_for_cluster_light() has already checked that the stdout has + # the success string. We also used to erase the err file here, but + # now we leave it + # if(! $do_dirty) { unlink $calibrate_errO; } $calibrate_wall_secs = time() - $calibrate_start_time; } else { # job ran locally