TRON-2208: Add toggle in tron config to disable retries on LOST k8s jobs #988

Merged (15 commits) on Aug 28, 2024. The diff below shows changes from 5 commits of the review.
2 changes: 2 additions & 0 deletions tron/config/config_parse.py
@@ -867,12 +867,14 @@ class ValidateKubernetes(Validator):
     defaults = {
         "kubeconfig_path": None,
         "enabled": False,
+        "disable_retries_on_lost": False,
         "default_volumes": (),
     }
 
     validators = {
         "kubeconfig_path": valid_string,
         "enabled": valid_bool,
+        "disable_retries_on_lost": valid_bool,
         "default_volumes": build_list_of_type_validator(valid_volume, allow_empty=True),
     }
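For operators, the new setting is just another boolean under Tron's Kubernetes options. A minimal sketch of how it might be set in a master config, parsed with PyYAML purely to show the shape; the k8s_options key name and the kubeconfig path are illustrative assumptions, not taken from this diff:

import yaml  # PyYAML, used here only to demonstrate the parsed shape

# Hypothetical config snippet; only disable_retries_on_lost comes from this PR.
CONFIG_SNIPPET = """
k8s_options:
  enabled: true
  kubeconfig_path: /etc/kubernetes/kubeconfig
  disable_retries_on_lost: true
"""

options = yaml.safe_load(CONFIG_SNIPPET)["k8s_options"]
assert options["disable_retries_on_lost"] is True  # validated by valid_bool above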
1 change: 1 addition & 0 deletions tron/config/schema.py
@@ -119,6 +119,7 @@ def config_object_factory(name, required=None, optional=None):
     optional=[
         "kubeconfig_path",
         "enabled",
+        "disable_retries_on_lost",
         "default_volumes",
     ],
 )
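For context, config_object_factory builds a namedtuple-style class from required and optional field names, so listing the new string here is what lets the validated value travel on ConfigKubernetes objects. A simplified sketch of that pattern; the factory internals below are paraphrased, not copied from Tron:

from collections import namedtuple

def config_object_factory_sketch(name, required=None, optional=None):
    # Paraphrased behavior: optional fields default to None until configured.
    required = required or []
    optional = optional or []
    cls = namedtuple(name, required + optional)
    cls.__new__.__defaults__ = (None,) * len(optional)
    return cls

ConfigKubernetes = config_object_factory_sketch(
    "ConfigKubernetes",
    optional=["kubeconfig_path", "enabled", "disable_retries_on_lost", "default_volumes"],
)
opts = ConfigKubernetes(enabled=True, disable_retries_on_lost=True)
assert opts.disable_retries_on_lost is True
assert opts.kubeconfig_path is None  # unset optionals default to None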
28 changes: 28 additions & 0 deletions tron/core/actionrun.py
@@ -1314,6 +1314,34 @@ def kill(self, final: bool = True) -> Optional[str]:
 
         return "\n".join(msgs)
 
+    def _exit_unsuccessful(self, exit_status=None, retry_original_command=True) -> Optional[Union[bool, ActionCommand]]:
+
+        k8s_cluster = KubernetesClusterRepository.get_cluster()
+        disable_retries_on_lost = False if not k8s_cluster else k8s_cluster.disable_retries_on_lost
+
+        if self.is_done:
+            log.info(
+                f"{self} got exit code {exit_status} but already in terminal " f'state "{self.state}", not retrying',
+            )
+            return None
+        if self.last_attempt is not None:
+            self.last_attempt.exit(exit_status)
+        if self.retries_remaining is not None:
+            if disable_retries_on_lost and exit_status == exitcode.EXIT_KUBERNETES_TASK_LOST:
+                log.info(f"{self} skipping auto-retries due to disable_retries_on_lost being enabled.")
+            else:
+                if self.retries_remaining > 0:
+                    self.retries_remaining -= 1
+                    return self.restart(original_command=retry_original_command)
+                else:
+                    log.info(
+                        f"Reached maximum number of retries: {len(self.attempts)}",
+                    )
+        if exit_status is None:
+            return self._done("fail_unknown", exit_status)
+        else:
+            return self._done("fail", exit_status)
+
     def handle_action_command_state_change(
         self, action_command: ActionCommand, event: str, event_data=None
     ) -> Optional[Union[bool, ActionCommand]]:
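The heart of the change is the guard at the top of the retry branch: a LOST task still fails, but it no longer consumes an auto-retry when the cluster-wide toggle is on. Distilled into a standalone predicate (the function name is hypothetical; the logic mirrors _exit_unsuccessful above):

from typing import Optional

EXIT_KUBERNETES_TASK_LOST = -12  # mirrors tron/utils/exitcode.py below

def should_auto_retry(
    retries_remaining: Optional[int],
    exit_status: Optional[int],
    disable_retries_on_lost: bool,
) -> bool:
    # No retry budget configured, or budget exhausted: fail outright.
    if retries_remaining is None or retries_remaining <= 0:
        return False
    # New behavior: LOST tasks skip auto-retries when the toggle is enabled.
    if disable_retries_on_lost and exit_status == EXIT_KUBERNETES_TASK_LOST:
        return False
    return True

assert should_auto_retry(3, EXIT_KUBERNETES_TASK_LOST, True) is False
assert should_auto_retry(3, EXIT_KUBERNETES_TASK_LOST, False) is True
assert should_auto_retry(3, 1, True) is True    # ordinary failures still retry
assert should_auto_retry(0, 1, False) is False  # budget exhausted

Note that _exit_unsuccessful reads the toggle from whichever cluster KubernetesClusterRepository.get_cluster() returns, falling back to False when no cluster is configured.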
6 changes: 5 additions & 1 deletion tron/kubernetes.py
@@ -252,7 +252,7 @@ def handle_event(self, event: Event) -> None:
             self.log.warning(f" tronctl skip {self.id}")
             self.log.warning("If you want Tron to NOT run it and consider it as a failure, fail it with:")
             self.log.warning(f" tronctl fail {self.id}")
-            self.exited(None)
+            self.exited(exitcode.EXIT_KUBERNETES_TASK_LOST)
         else:
             self.log.info(
                 f"Did not handle unknown kubernetes event type: {event}",
@@ -280,10 +280,12 @@ def __init__(
         enabled: bool = True,
         default_volumes: Optional[List[ConfigVolume]] = None,
         pod_launch_timeout: Optional[int] = None,
+        disable_retries_on_lost: bool = False,
     ):
         # general k8s config
         self.kubeconfig_path = kubeconfig_path
         self.enabled = enabled
+        self.disable_retries_on_lost = disable_retries_on_lost
         self.default_volumes: Optional[List[ConfigVolume]] = default_volumes or []
         self.pod_launch_timeout = pod_launch_timeout or DEFAULT_POD_LAUNCH_TIMEOUT_S
         # creating a task_proc executor has a couple steps:
@@ -618,6 +620,7 @@ def recover(self, task: KubernetesTask) -> None:
 class KubernetesClusterRepository:
     # Kubernetes config
     kubernetes_enabled: bool = False
+    kubernetes_disable_retries_on_lost: bool = False
     kubeconfig_path: Optional[str] = None
     pod_launch_timeout: Optional[int] = None
     default_volumes: Optional[List[ConfigVolume]] = None
@@ -658,6 +661,7 @@ def shutdown(cls) -> None:
     def configure(cls, kubernetes_options: ConfigKubernetes) -> None:
         cls.kubeconfig_path = kubernetes_options.kubeconfig_path
         cls.kubernetes_enabled = kubernetes_options.enabled
+        cls.kubernetes_disable_retries_on_lost = kubernetes_options.disable_retries_on_lost
         cls.default_volumes = kubernetes_options.default_volumes
 
         for cluster in cls.clusters.values():
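Taken together, the kubernetes.py changes plumb the option through two levels: each KubernetesCluster instance carries its own disable_retries_on_lost, and KubernetesClusterRepository.configure() records the class-wide value on (re)configuration. A minimal sketch of that shape; the body of the reconfiguration loop is not visible in this diff, so pushing the new value down to existing clusters is an assumption:

from typing import Dict

class ClusterSketch:
    """Simplified stand-in for KubernetesCluster."""
    def __init__(self, kubeconfig_path: str, disable_retries_on_lost: bool = False):
        self.kubeconfig_path = kubeconfig_path
        self.disable_retries_on_lost = disable_retries_on_lost

class RepositorySketch:
    """Simplified stand-in for KubernetesClusterRepository."""
    kubernetes_disable_retries_on_lost: bool = False
    clusters: Dict[str, ClusterSketch] = {}

    @classmethod
    def configure(cls, disable_retries_on_lost: bool) -> None:
        cls.kubernetes_disable_retries_on_lost = disable_retries_on_lost
        for cluster in cls.clusters.values():
            # Assumption: existing clusters pick up the new value on reconfigure.
            cluster.disable_retries_on_lost = disable_retries_on_lost

RepositorySketch.clusters["default"] = ClusterSketch("/etc/kubernetes/kubeconfig")
RepositorySketch.configure(disable_retries_on_lost=True)
assert RepositorySketch.clusters["default"].disable_retries_on_lost is True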
2 changes: 2 additions & 0 deletions tron/utils/exitcode.py
@@ -10,6 +10,7 @@
 EXIT_KUBERNETES_ABNORMAL = -9
 EXIT_KUBERNETES_SPOT_INTERRUPTION = -10
 EXIT_KUBERNETES_NODE_SCALEDOWN = -11
+EXIT_KUBERNETES_TASK_LOST = -12
 
 EXIT_REASONS = {
     EXIT_INVALID_COMMAND: "Invalid command",
@@ -23,4 +24,5 @@
     EXIT_KUBERNETES_ABNORMAL: "Kubernetes task failed in an unexpected manner",
     EXIT_KUBERNETES_SPOT_INTERRUPTION: "Kubernetes task failed due to spot interruption",
     EXIT_KUBERNETES_NODE_SCALEDOWN: "Kubernetes task failed due to the autoscaler scaling down a node",
+    EXIT_KUBERNETES_TASK_LOST: "Tron lost track of a pod it already thought it had started for a job.",
 }
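With the constant registered in EXIT_REASONS, anything that reports failures can translate the new code the same way as the existing ones. A tiny usage sketch (the helper name is hypothetical):

EXIT_KUBERNETES_TASK_LOST = -12
EXIT_REASONS = {
    EXIT_KUBERNETES_TASK_LOST: "Tron lost track of a pod it already thought it had started for a job.",
}

def describe_exit(exit_status: int) -> str:
    # Fall back to the raw code for statuses without a registered reason.
    return EXIT_REASONS.get(exit_status, f"exit code {exit_status}")

assert describe_exit(EXIT_KUBERNETES_TASK_LOST).startswith("Tron lost track")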