Improve "not queued or running" error message.

adam-azarchs · adam-azarchs · commit ee0155813026 · 2024-07-08T12:19:52.000-07:00
Include the job ID for debugging purposes.

Use a different error message depending on whether or not the job had
started up successfully.  The debugging steps are generally different
for jobs which started but then died unexpectedly vs. ones that never
started up at all, and the "not queued" part is just confusing in the
case of jobs which had already started up.
diff --git a/jobmanagers/retry.json b/jobmanagers/retry.json
@@ -8,7 +8,7 @@
     "^error: .Errno 12. Cannot allocate memory",
     "^error: JSV stderr: error: commlib error: got select error (Connection refused)",
     "^Unable to run job: failed receiving gdi request response",
-    "^According to the job manager, the job for .+ was not queued or running,",
+    "^According to the job manager, the job for .+ has not been (?:queued or )?running",
     "^IOError: \\[Errno 116\\] Stale file handle",
     "^OSError: \\[Errno 11\\] Resource temporarily unavailable",
     "^jobcmd error \\(exit status \\d+\\)",
diff --git a/martian/core/metadata.go b/martian/core/metadata.go
@@ -846,10 +846,19 @@ func (self *Metadata) endRefresh(lastRefresh time.Time) {
 			// The job is not running but the metadata thinks it still is.
 			// The check for metadata updates was completed since the time that
 			// the queue query completed.  This job has failed.  Write an error.
-			err := self._writeRawNoLock(Errors, fmt.Sprintf(
-				"According to the job manager, the job for %s was not queued "+
-					"or running, since at least %s.",
-				self.fqname, notRunningSince.Format(util.TIMEFMT)))
+			var err error
+			if state == Running {
+				err = self._writeRawNoLock(Errors, fmt.Sprintf(
+					"According to the job manager, the job for %s (%s) has "+
+						"not been running since at least %s.",
+					self.fqname, jobid, notRunningSince.Format(util.TIMEFMT)))
+			} else {
+				err = self._writeRawNoLock(Errors, fmt.Sprintf(
+					"According to the job manager, the job for %s (%s) "+
+						"has not been queued or running since at least %s, but "+
+						"it does not appear to have started successfully.",
+					self.fqname, jobid, notRunningSince.Format(util.TIMEFMT)))
+			}
 			if err != nil {
 				util.LogError(err, "runtime",
 					"Error writing error message about cluster-mode job not running.")