From 06a8928f4baf1f26923c66d1b1cc8503ebbc2bf1 Mon Sep 17 00:00:00 2001 From: Madhur Tandon Date: Mon, 4 Nov 2024 11:07:32 +0530 Subject: [PATCH 1/2] fix timeout reason for k8s --- metaflow/plugins/kubernetes/kubernetes_job.py | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/metaflow/plugins/kubernetes/kubernetes_job.py b/metaflow/plugins/kubernetes/kubernetes_job.py index 76b028471a8..cd8af5a8d70 100644 --- a/metaflow/plugins/kubernetes/kubernetes_job.py +++ b/metaflow/plugins/kubernetes/kubernetes_job.py @@ -732,22 +732,26 @@ def reason(self): # If pod status is dirty, check for newer status self._pod = self._fetch_pod() if self._pod: - if self._pod.get("status", {}).get("container_statuses") is None: - # We're done, but no container_statuses is set - # This can happen when the pod is evicted + pod_status = self._pod.get("status", {}) + pod_reason = pod_status.get("reason") + # Check for pod-level reasons (like timeout) first - whether container_statuses exists or not + # If no container_statuses is set, This can happen when the pod is evicted + if ( + pod_reason == "DeadlineExceeded" + or pod_status.get("container_statuses") is None + ): return None, ": ".join( filter( None, [ - self._pod.get("status", {}).get("reason"), - self._pod.get("status", {}).get("message"), + pod_status.get("reason"), + pod_status.get("message"), ], ) ) - for k, v in ( - self._pod.get("status", {}) - .get("container_statuses", [{}])[0] + for _, v in ( + pod_status.get("container_statuses", [{}])[0] .get("state", {}) .items() ): From 8a239a761beb68f2b3bd57f51f36b7584777f63f Mon Sep 17 00:00:00 2001 From: Madhur Tandon Date: Wed, 6 Nov 2024 23:07:00 +0530 Subject: [PATCH 2/2] remove bits about transient and retry --- metaflow/plugins/kubernetes/kubernetes.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metaflow/plugins/kubernetes/kubernetes.py b/metaflow/plugins/kubernetes/kubernetes.py index dae6e2a648e..8501bec1c8d 100644 --- a/metaflow/plugins/kubernetes/kubernetes.py +++ b/metaflow/plugins/kubernetes/kubernetes.py @@ -761,6 +761,8 @@ def _has_updates(): raise KubernetesException("%s (exit code %s)" % (msg, exit_code)) else: msg = "%s (exit code %s)" % (msg, exit_code) + if msg.startswith("DeadlineExceeded"): + raise KubernetesException(msg) raise KubernetesException( "%s. This could be a transient error. Use @retry to retry." % msg )