From 02ee8a90c4bdebc56ea906d9fd3e299907481475 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jord=C3=A0=20Polo?= Date: Fri, 10 Jan 2025 10:17:00 -0800 Subject: [PATCH] Add warning when squeue is timing out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Jordà Polo --- omnistat/collector_rms.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/omnistat/collector_rms.py b/omnistat/collector_rms.py index 907aafe2..d9fdb4d8 100644 --- a/omnistat/collector_rms.py +++ b/omnistat/collector_rms.py @@ -97,8 +97,13 @@ def querySlurmJob(self, timeout=1, exit_on_error=False, mode="squeue"): if mode == "squeue": data = utils.runShellCommand(self.__squeue_query, timeout=timeout, exit_on_error=exit_on_error) + if data == None: + logging.warning( + "Failed to capture job information: squeue timed out. " + "Please increase sampling interval or switch to file-based mode." + ) # squeue query output format: JOBID:USER:PARTITION:NUM_NODES:BATCHFLAG - if data and data.stdout.strip(): + elif data.stdout.strip(): data = data.stdout.strip().split(":") keys = [ "RMS_JOB_ID", @@ -111,9 +116,14 @@ def querySlurmJob(self, timeout=1, exit_on_error=False, mode="squeue"): results["RMS_TYPE"] = "slurm" # require a 2nd query to ascertain job steps (otherwise, miss out on batchflag) - data = utils.runShellCommand(self.__squeue_steps, timeout=timeout, exit_on_error=exit_on_error) results["RMS_STEP_ID"] = -1 - if data and data.stdout.strip(): + data = utils.runShellCommand(self.__squeue_steps, timeout=timeout, exit_on_error=exit_on_error) + if data == None: + logging.warning( + "Failed to capture job step information: squeue timed out. " + "Please increase sampling interval or switch to file-based mode." + ) + elif data.stdout.strip(): # If we are in an active job step, the STEPID will have an integer index appended, e.g. # 57735.10 # 57735.interactive