From 43aa4dfa7bd61520e1033ca39cf9aec7155f907f Mon Sep 17 00:00:00 2001 From: xjules Date: Wed, 6 Mar 2024 13:28:53 +0100 Subject: [PATCH] Do not retry when we get 'Job has finished' from qdel --- src/ert/scheduler/openpbs_driver.py | 48 ++++++++++++------- .../scheduler/test_openpbs_driver.py | 10 ++-- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/src/ert/scheduler/openpbs_driver.py b/src/ert/scheduler/openpbs_driver.py index cfadbb15aeb..8256c52f8d0 100644 --- a/src/ert/scheduler/openpbs_driver.py +++ b/src/ert/scheduler/openpbs_driver.py @@ -3,7 +3,16 @@ import asyncio import logging import shlex -from typing import List, Literal, Mapping, MutableMapping, Optional, Tuple, Union +from typing import ( + Iterable, + List, + Literal, + Mapping, + MutableMapping, + Optional, + Tuple, + Union, +) from pydantic import BaseModel, Field from typing_extensions import Annotated @@ -29,19 +38,11 @@ "X", # Expired (subjobs only) ] -QSUB_INVALID_CREDENTIAL: int = 171 -QSUB_PREMATURE_END_OF_MESSAGE: int = 183 -QSUB_CONNECTION_REFUSED: int = 162 -QDEL_JOB_HAS_FINISHED: int = 35 -QDEL_REQUEST_INVALID: int = 168 - -QSUB_EXIT_CODES = [ - QSUB_INVALID_CREDENTIAL, - QSUB_PREMATURE_END_OF_MESSAGE, - QSUB_CONNECTION_REFUSED, -] - -QDEL_EXIT_CODES = [QDEL_REQUEST_INVALID, QDEL_JOB_HAS_FINISHED] +QSUB_INVALID_CREDENTIAL = 171 +QSUB_PREMATURE_END_OF_MESSAGE = 183 +QSUB_CONNECTION_REFUSED = 162 +QDEL_JOB_HAS_FINISHED = 35 +QDEL_REQUEST_INVALID = 168 class IgnoredJobstates(BaseModel): @@ -123,7 +124,8 @@ def _resource_string(self) -> str: async def _execute_with_retry( self, cmd_with_args: List[str], - exit_codes_triggering_retries: List[int], + retry_codes: Iterable[int] = (), + accept_codes: Iterable[int] = (), ) -> Tuple[bool, str]: error_message: Optional[str] = None @@ -135,10 +137,13 @@ async def _execute_with_retry( ) stdout, stderr = await process.communicate() + assert process.returncode is not None if process.returncode == 0: return True, stdout.decode(errors="ignore").strip() - elif process.returncode in exit_codes_triggering_retries: + elif process.returncode in retry_codes: error_message = stderr.decode(errors="ignore").strip() + elif process.returncode in accept_codes: + return True, stderr.decode(errors="ignore").strip() else: error_message = ( f'Command "{shlex.join(cmd_with_args)}" failed ' @@ -187,7 +192,12 @@ async def submit( logger.debug(f"Submitting to PBS with command {shlex.join(qsub_with_args)}") process_success, process_message = await self._execute_with_retry( - qsub_with_args, exit_codes_triggering_retries=QSUB_EXIT_CODES + qsub_with_args, + retry_codes=( + QSUB_INVALID_CREDENTIAL, + QSUB_PREMATURE_END_OF_MESSAGE, + QSUB_CONNECTION_REFUSED, + ), ) if not process_success: raise RuntimeError(process_message) @@ -207,7 +217,9 @@ async def kill(self, iens: int) -> None: logger.debug(f"Killing realization {iens} with PBS-id {job_id}") process_success, process_message = await self._execute_with_retry( - ["qdel", str(job_id)], exit_codes_triggering_retries=QDEL_EXIT_CODES + ["qdel", str(job_id)], + retry_codes=(QDEL_REQUEST_INVALID,), + accept_codes=(QDEL_JOB_HAS_FINISHED,), ) if not process_success: raise RuntimeError(process_message) diff --git a/tests/unit_tests/scheduler/test_openpbs_driver.py b/tests/unit_tests/scheduler/test_openpbs_driver.py index f8546b7e4e1..21a610505c7 100644 --- a/tests/unit_tests/scheduler/test_openpbs_driver.py +++ b/tests/unit_tests/scheduler/test_openpbs_driver.py @@ -332,9 +332,13 @@ async def test_that_qdel_will_retry_and_succeed( driver._retry_pbs_cmd_interval = 0.2 driver._iens2jobid[0] = 111 await driver.kill(0) - assert "TRIED" in Path(bin_path / "script_try").read_text(encoding="utf-8") - assert "qdel executed" in Path(bin_path / "qdel_output").read_text(encoding="utf-8") - assert error_msg in Path(bin_path / "qdel_error").read_text(encoding="utf-8") + assert "TRIED" in (bin_path / "script_try").read_text() + if exit_code == QDEL_JOB_HAS_FINISHED: + # the job has been already qdel-ed so no need to retry + assert not (bin_path / "qdel_output").exists() + else: + assert "qdel executed" in (bin_path / "qdel_output").read_text() + assert error_msg in (bin_path / "qdel_error").read_text() @pytest.mark.usefixtures("capturing_qsub")