Skip to content

Commit ee01558

Browse files
committed
Improve "not queued or running" error message.
Include the job ID for debugging purposes. Use a different error message depending on whether or not the job had started up successfully. The debugging steps generally differ between jobs that started but then died unexpectedly and jobs that never started at all, and the "not queued" wording is simply confusing for jobs that had already started.
1 parent 20a0195 commit ee01558

File tree

2 files changed

+14
-5
lines changed

2 files changed

+14
-5
lines changed

jobmanagers/retry.json

+1-1
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
"^error: .Errno 12. Cannot allocate memory",
99
"^error: JSV stderr: error: commlib error: got select error (Connection refused)",
1010
"^Unable to run job: failed receiving gdi request response",
11-
"^According to the job manager, the job for .+ was not queued or running,",
11+
"^According to the job manager, the job for .+ has not been (?:queued or )?running",
1212
"^IOError: \\[Errno 116\\] Stale file handle",
1313
"^OSError: \\[Errno 11\\] Resource temporarily unavailable",
1414
"^jobcmd error \\(exit status \\d+\\)",

martian/core/metadata.go

+13-4
Original file line number | Diff line number | Diff line change
@@ -846,10 +846,19 @@ func (self *Metadata) endRefresh(lastRefresh time.Time) {
846846
// The job is not running but the metadata thinks it still is.
847847
// The check for metadata updates was completed since the time that
848848
// the queue query completed. This job has failed. Write an error.
849-
err := self._writeRawNoLock(Errors, fmt.Sprintf(
850-
"According to the job manager, the job for %s was not queued "+
851-
"or running, since at least %s.",
852-
self.fqname, notRunningSince.Format(util.TIMEFMT)))
849+
var err error
850+
if state == Running {
851+
err = self._writeRawNoLock(Errors, fmt.Sprintf(
852+
"According to the job manager, the job for %s (%s) has "+
853+
"not been running since at least %s.",
854+
self.fqname, jobid, notRunningSince.Format(util.TIMEFMT)))
855+
} else {
856+
err = self._writeRawNoLock(Errors, fmt.Sprintf(
857+
"According to the job manager, the job for %s (%s) "+
858+
"has not been queued or running since at least %s, but "+
859+
"it does not appear to have started successfully.",
860+
self.fqname, jobid, notRunningSince.Format(util.TIMEFMT)))
861+
}
853862
if err != nil {
854863
util.LogError(err, "runtime",
855864
"Error writing error message about cluster-mode job not running.")

0 commit comments

Comments
 (0)