diff --git a/src/clib/lib/job_queue/lsf_driver.cpp b/src/clib/lib/job_queue/lsf_driver.cpp index 7d91850f9a4..00c42f07bf2 100644 --- a/src/clib/lib/job_queue/lsf_driver.cpp +++ b/src/clib/lib/job_queue/lsf_driver.cpp @@ -735,13 +735,22 @@ void lsf_driver_kill_job(void *_driver, void *_job) { char **argv = (char **)calloc(2, sizeof *argv); CHECK_ALLOC(argv); argv[0] = driver->remote_lsf_server; - argv[1] = saprintf("%s %s %s", driver->bkill_cmd, "-s SIGKILL", + argv[1] = saprintf("%s %s %s", driver->bkill_cmd, "-s SIGTERM", job->lsf_jobnr_char); spawn_blocking(driver->rsh_cmd, 2, (const char **)argv, NULL, NULL); free(argv[1]); free(argv); + + char **argv2 = (char **)calloc(2, sizeof *argv2); + argv2[0] = driver->remote_lsf_server; + argv2[1] = saprintf("%s %s %s %s %s", "sleep 30;", driver->bkill_cmd, + "-s", "SIGKILL", job->lsf_jobnr_char); + spawn(driver->rsh_cmd, 2, (const char **)argv2, NULL, NULL); + free(argv2[0]); + free(argv2); + } else if (driver->submit_method == LSF_SUBMIT_LOCAL_SHELL) { char **argv = (char **)calloc(3, sizeof *argv); CHECK_ALLOC(argv); @@ -753,6 +762,14 @@ void lsf_driver_kill_job(void *_driver, void *_job) { free(argv[1]); free(argv[2]); free(argv); + + char **argv2 = (char **)calloc(1, sizeof *argv2); + argv2[0] = saprintf("%s %s %s %s %s", "sleep 30;", driver->bkill_cmd, + "-s", "SIGKILL", job->lsf_jobnr_char); + spawn(saprintf("%s", "/bin/sh -c"), 1, (const char **)argv2, NULL, + NULL); + free(argv2[0]); + free(argv2); } } diff --git a/src/ert/scheduler/lsf_driver.py b/src/ert/scheduler/lsf_driver.py index b55ba7cbdd6..3ae6e03c78b 100644 --- a/src/ert/scheduler/lsf_driver.py +++ b/src/ert/scheduler/lsf_driver.py @@ -7,6 +7,7 @@ import shlex import shutil import stat +import subprocess import tempfile from pathlib import Path from typing import ( @@ -115,6 +116,7 @@ def __init__( self._jobs: MutableMapping[str, Tuple[int, AnyJob]] = {} self._iens2jobid: MutableMapping[int, str] = {} self._max_attempt: int = 100 + self._sleep_time_between_bkills = 30 self._sleep_time_between_cmd_retries = 3 self._bsub_retries = 10 @@ -191,12 +193,16 @@ async def kill(self, iens: int) -> None: process = await asyncio.create_subprocess_exec( self._bkill_cmd, "-s", - "SIGKILL", + "SIGTERM", job_id, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE, ) stdout, stderr = await process.communicate() + _sigkill_process = subprocess.Popen( + f"sleep {self._sleep_time_between_bkills}; {self._bkill_cmd} -s SIGKILL {job_id}", + shell=True, + ) if process.returncode: logger.error( f"LSF kill failed with returncode {process.returncode} " diff --git a/tests/unit_tests/scheduler/test_lsf_driver.py b/tests/unit_tests/scheduler/test_lsf_driver.py index 595c989a6d2..3c6ae80f2c8 100644 --- a/tests/unit_tests/scheduler/test_lsf_driver.py +++ b/tests/unit_tests/scheduler/test_lsf_driver.py @@ -267,7 +267,7 @@ async def test_kill( bkill_path.write_text( f"#!/bin/sh\necho '{bkill_stdout}'\n" f"echo '{bkill_stderr}' >&2\n" - f"echo $@ > 'bkill_args'\n" + f"echo $@ >> 'bkill_args'\n" f"exit {bkill_returncode}", encoding="utf-8", ) @@ -275,12 +275,26 @@ async def test_kill( driver = LsfDriver() driver._iens2jobid = mocked_iens2jobid + driver._sleep_time_between_bkills = 0 + await driver.kill(iens_to_kill) + + async def wait_for_sigkill_in_file(): + while True: + bkill_args = ( + Path("bkill_args").read_text(encoding="utf-8").strip().split("\n") + ) + if f"-s SIGKILL {mocked_iens2jobid[iens_to_kill]}" in bkill_args: + break + await asyncio.sleep(0.1) + if expected_logged_error: assert expected_logged_error in caplog.text else: - bkill_args = Path("bkill_args").read_text(encoding="utf-8").strip() - assert f"-s SIGKILL {mocked_iens2jobid[iens_to_kill]}" in bkill_args + bkill_args = Path("bkill_args").read_text(encoding="utf-8").strip().split("\n") + assert f"-s SIGTERM {mocked_iens2jobid[iens_to_kill]}" in bkill_args + + await asyncio.wait_for(wait_for_sigkill_in_file(), timeout=5) @given(st.text())