From d3be5eca58c17ed315836e42d222151fedfb489b Mon Sep 17 00:00:00 2001 From: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:17:40 +0000 Subject: [PATCH 1/9] Tidy --- cylc/flow/scheduler.py | 7 ++++--- cylc/flow/subprocctx.py | 4 ++++ cylc/flow/task_job_mgr.py | 43 +++++++++++++++++++++++++++------------ cylc/flow/task_proxy.py | 4 ++-- 4 files changed, 40 insertions(+), 18 deletions(-) diff --git a/cylc/flow/scheduler.py b/cylc/flow/scheduler.py index dec3a279ce9..dbfbcdd5069 100644 --- a/cylc/flow/scheduler.py +++ b/cylc/flow/scheduler.py @@ -1250,6 +1250,7 @@ def get_contact_data(self) -> Dict[str, str]: """ fields = workflow_files.ContactFileFields proc = psutil.Process() + platform = get_platform() # fmt: off return { fields.API: @@ -1275,11 +1276,11 @@ def get_contact_data(self) -> Dict[str, str]: fields.VERSION: CYLC_VERSION, fields.SCHEDULER_SSH_COMMAND: - str(get_platform()['ssh command']), + str(platform['ssh command']), fields.SCHEDULER_CYLC_PATH: - str(get_platform()['cylc path']), + str(platform['cylc path']), fields.SCHEDULER_USE_LOGIN_SHELL: - str(get_platform()['use login shell']) + str(platform['use login shell']) } # fmt: on diff --git a/cylc/flow/subprocctx.py b/cylc/flow/subprocctx.py index 41b735a71c5..e54b2815f3b 100644 --- a/cylc/flow/subprocctx.py +++ b/cylc/flow/subprocctx.py @@ -23,6 +23,7 @@ from shlex import quote from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + from cylc.flow.wallclock import get_current_time_string if TYPE_CHECKING: @@ -137,6 +138,9 @@ def __str__(self): 'mesg': mesg} return ret.rstrip() + def __repr__(self) -> str: + return f"<{type(self).__name__} {self.cmd_key}>" + class SubFuncContext(SubProcContext): """Represent the context of a Python function to run as a subprocess. diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index 7e1cfa02cc4..5595fd474f5 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -123,6 +123,11 @@ if TYPE_CHECKING: + # BACK COMPAT: typing_extensions.Literal + # FROM: Python 3.7 + # TO: Python 3.8 + from typing_extensions import Literal + from cylc.flow.task_proxy import TaskProxy from cylc.flow.workflow_db_mgr import WorkflowDatabaseManager @@ -159,7 +164,7 @@ def __init__(self, workflow, proc_pool, workflow_db_mgr, self.workflow = workflow self.proc_pool = proc_pool self.workflow_db_mgr: WorkflowDatabaseManager = workflow_db_mgr - self.task_events_mgr = task_events_mgr + self.task_events_mgr: TaskEventsManager = task_events_mgr self.data_store_mgr = data_store_mgr self.job_file_writer = JobFileWriter() self.job_runner_mgr = self.job_file_writer.job_runner_mgr @@ -220,14 +225,19 @@ def poll_task_jobs(self, workflow, itasks, msg=None): self._poll_task_jobs_callback_255 ) - def prep_submit_task_jobs(self, workflow, itasks, check_syntax=True): + def prep_submit_task_jobs( + self, + workflow: str, + itasks: 'Iterable[TaskProxy]', + check_syntax: bool = True, + ) -> 'Tuple[List[TaskProxy], List[TaskProxy]]': """Prepare task jobs for submit. Prepare tasks where possible. Ignore tasks that are waiting for host select command to complete. Bad host select command or error writing to a job file will cause a bad task - leading to submission failure. 
- Return [list, list]: list of good tasks, list of bad tasks + Return (good_tasks, bad_tasks) """ prepared_tasks = [] bad_tasks = [] @@ -244,16 +254,16 @@ def prep_submit_task_jobs(self, workflow, itasks, check_syntax=True): prepared_tasks.append(itask) elif prep_task is False: bad_tasks.append(itask) - return [prepared_tasks, bad_tasks] + return (prepared_tasks, bad_tasks) def submit_task_jobs( self, workflow, - itasks, + itasks: 'Iterable[TaskProxy]', curve_auth, client_pub_key_dir, run_mode: RunMode = RunMode.LIVE, - ): + ) -> 'List[TaskProxy]': """Prepare for job submission and submit task jobs. Preparation (host selection, remote host init, and remote install) @@ -264,7 +274,7 @@ def submit_task_jobs( Once preparation has completed or failed, reset .waiting_on_job_prep in task instances so the scheduler knows to stop sending them back here. - This method uses prep_submit_task_job() as helper. + This method uses prep_submit_task_jobs() as helper. Return (list): list of tasks that attempted submission. """ @@ -1029,7 +1039,7 @@ def _set_retry_timers( def submit_nonlive_task_jobs( self: 'TaskJobManager', workflow: str, - itasks: 'List[TaskProxy]', + itasks: 'Iterable[TaskProxy]', workflow_run_mode: RunMode, ) -> 'Tuple[List[TaskProxy], List[TaskProxy]]': """Identify task mode and carry out alternative submission @@ -1152,7 +1162,7 @@ def _prep_submit_task_job( workflow: str, itask: 'TaskProxy', check_syntax: bool = True - ): + ) -> 'Union[TaskProxy, None, Literal[False]]': """Prepare a task job submission. Returns: @@ -1217,7 +1227,7 @@ def _prep_submit_task_job( else: # host/platform select not ready if host_n is None and platform_name is None: - return + return None elif ( host_n is None and rtconfig['platform'] @@ -1292,7 +1302,13 @@ def _prep_submit_task_job( itask.local_job_file_path = local_job_file_path return itask - def _prep_submit_task_job_error(self, workflow, itask, action, exc): + def _prep_submit_task_job_error( + self, + workflow: str, + itask: 'TaskProxy', + action: str, + exc: Union[Exception, str], + ) -> None: """Helper for self._prep_submit_task_job. 
On error.""" log_task_job_activity( SubProcContext(self.JOBS_SUBMIT, action, err=exc, ret_code=1), @@ -1306,11 +1322,12 @@ def _prep_submit_task_job_error(self, workflow, itask, action, exc): # than submit-failed # provide a dummy job config - this info will be added to the data # store + try_num = itask.get_try_num() itask.jobs.append({ 'task_id': itask.identity, 'platform': itask.platform, 'submit_num': itask.submit_num, - 'try_num': itask.get_try_num(), + 'try_num': try_num, }) # create a DB entry for the submit-failed job self.workflow_db_mgr.put_insert_task_jobs( @@ -1319,7 +1336,7 @@ def _prep_submit_task_job_error(self, workflow, itask, action, exc): 'flow_nums': serialise_set(itask.flow_nums), 'job_id': itask.summary.get('submit_method_id'), 'is_manual_submit': itask.is_manual_submit, - 'try_num': itask.get_try_num(), + 'try_num': try_num, 'time_submit': get_current_time_string(), 'platform_name': itask.platform['name'], 'job_runner_name': itask.summary['job_runner_name'], diff --git a/cylc/flow/task_proxy.py b/cylc/flow/task_proxy.py index 8d9134a6ac4..620b647f909 100644 --- a/cylc/flow/task_proxy.py +++ b/cylc/flow/task_proxy.py @@ -152,7 +152,7 @@ class TaskProxy: graph children: {msg: [(name, point), ...]} .flow_nums: flows I belong to (if empty, belongs to 'none' flow) - flow_wait: + .flow_wait: wait for flow merge before spawning children .waiting_on_job_prep: True whilst task is awaiting job prep, reset to False once the @@ -316,7 +316,7 @@ def __init__( ) def __repr__(self) -> str: - return f"<{self.__class__.__name__} {self.identity}>" + return f"<{type(self).__name__} {self.identity} {self.state}>" def __str__(self) -> str: """Stringify with tokens, state, submit_num, and flow_nums. From 21be5b01c44be1773627235fc2c04f7d6521da84 Mon Sep 17 00:00:00 2001 From: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Date: Mon, 13 Jan 2025 16:34:04 +0000 Subject: [PATCH 2/9] Kill tasks during job prep --- changes.d/6535.fix.md | 1 + cylc/flow/scheduler.py | 13 +++++---- cylc/flow/task_job_mgr.py | 11 +++++++- tests/integration/test_kill.py | 48 ++++++++++++++++++++++++++++++++++ 4 files changed, 67 insertions(+), 6 deletions(-) create mode 100644 changes.d/6535.fix.md create mode 100644 tests/integration/test_kill.py diff --git a/changes.d/6535.fix.md b/changes.d/6535.fix.md new file mode 100644 index 00000000000..ab5fc6bb3ca --- /dev/null +++ b/changes.d/6535.fix.md @@ -0,0 +1 @@ +Ensure tasks can be killed while in the preparing state. 
diff --git a/cylc/flow/scheduler.py b/cylc/flow/scheduler.py index dbfbcdd5069..eec8dd43b67 100644 --- a/cylc/flow/scheduler.py +++ b/cylc/flow/scheduler.py @@ -1078,9 +1078,14 @@ def kill_tasks( to_kill: List[TaskProxy] = [] unkillable: List[TaskProxy] = [] for itask in itasks: - if itask.state(*TASK_STATUSES_ACTIVE): - if itask.state_reset(is_held=True): - self.data_store_mgr.delta_task_state(itask) + if not itask.state(TASK_STATUS_PREPARING, *TASK_STATUSES_ACTIVE): + unkillable.append(itask) + continue + if itask.state_reset(is_held=True): + self.data_store_mgr.delta_task_state(itask) + if itask.state(TASK_STATUS_PREPARING): + self.task_job_mgr.kill_prep_task(itask) + else: to_kill.append(itask) if jobless: # Directly set failed in sim mode: @@ -1088,8 +1093,6 @@ def kill_tasks( itask, 'CRITICAL', TASK_STATUS_FAILED, flag=self.task_events_mgr.FLAG_RECEIVED ) - else: - unkillable.append(itask) if warn and unkillable: LOG.warning( "Tasks not killable: " diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index 5595fd474f5..69da98eb8ff 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -161,7 +161,7 @@ class TaskJobManager: def __init__(self, workflow, proc_pool, workflow_db_mgr, task_events_mgr, data_store_mgr, bad_hosts): - self.workflow = workflow + self.workflow: str = workflow self.proc_pool = proc_pool self.workflow_db_mgr: WorkflowDatabaseManager = workflow_db_mgr self.task_events_mgr: TaskEventsManager = task_events_mgr @@ -201,6 +201,15 @@ def kill_task_jobs( self._kill_task_jobs_callback_255 ) + def kill_prep_task(self, itask: 'TaskProxy') -> None: + """Kill a preparing task.""" + itask.waiting_on_job_prep = False + itask.local_job_file_path = None # reset for retry + self._set_retry_timers(itask) + self._prep_submit_task_job_error( + self.workflow, itask, '(killed in job prep)', '' + ) + def poll_task_jobs(self, workflow, itasks, msg=None): """Poll jobs of specified tasks. diff --git a/tests/integration/test_kill.py b/tests/integration/test_kill.py new file mode 100644 index 00000000000..be88adf123a --- /dev/null +++ b/tests/integration/test_kill.py @@ -0,0 +1,48 @@ +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import logging +import pytest + +from cylc.flow.commands import kill_tasks, run_cmd +from cylc.flow.scheduler import Scheduler +from cylc.flow.task_state import TASK_STATUS_SUBMIT_FAILED, TASK_STATUS_WAITING + + +async def test_kill_preparing( + one_conf, + flow, + scheduler, + start, + monkeypatch: pytest.MonkeyPatch, + log_filter, +): + """Test killing a preparing task.""" + schd: Scheduler = scheduler( + flow(one_conf), run_mode='live', paused_start=False + ) + async with start(schd): + # Make the task indefinitely preparing: + monkeypatch.setattr( + schd.task_job_mgr, '_prep_submit_task_job', lambda *a, **k: None + ) + itask = schd.pool.get_tasks()[0] + assert itask.state(TASK_STATUS_WAITING, is_held=False) + schd.start_job_submission([itask]) + + await run_cmd(kill_tasks(schd, [itask.tokens.relative_id])) + assert itask.state(TASK_STATUS_SUBMIT_FAILED, is_held=True) + assert log_filter(logging.ERROR, 'killed in job prep') From da76cf48809ecfbe777458a70d511e3f380570e5 Mon Sep 17 00:00:00 2001 From: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Date: Thu, 16 Jan 2025 14:57:03 +0000 Subject: [PATCH 3/9] Add tests for `cylc kill` handlers & retries behaviour --- tests/functional/cylc-kill/04-handlers.t | 47 +++++++++++++++++++ .../cylc-kill/04-handlers/flow.cylc | 39 +++++++++++++++ .../cylc-kill/04-handlers/reference.log | 6 +++ tests/functional/cylc-kill/05-retries.t | 38 +++++++++++++++ .../functional/cylc-kill/05-retries/flow.cylc | 38 +++++++++++++++ .../cylc-kill/05-retries/reference.log | 6 +++ tests/functional/lib/bash/test_header | 4 +- 7 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 tests/functional/cylc-kill/04-handlers.t create mode 100644 tests/functional/cylc-kill/04-handlers/flow.cylc create mode 100644 tests/functional/cylc-kill/04-handlers/reference.log create mode 100644 tests/functional/cylc-kill/05-retries.t create mode 100644 tests/functional/cylc-kill/05-retries/flow.cylc create mode 100644 tests/functional/cylc-kill/05-retries/reference.log diff --git a/tests/functional/cylc-kill/04-handlers.t b/tests/functional/cylc-kill/04-handlers.t new file mode 100644 index 00000000000..f90e0c7a09a --- /dev/null +++ b/tests/functional/cylc-kill/04-handlers.t @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# Test event handlers when killing running/submitted/preparing tasks. +# Any downstream tasks that depend on the `:submit-fail`/`:fail` outputs +# SHOULD run. +# Handlers for the `submission failed`/`failed` events SHOULD run. + +export REQUIRE_PLATFORM='runner:at' +. 
"$(dirname "$0")/test_header" +set_test_number 5 + +# Create platform that ensures job will be in submitted state for long enough +create_test_global_config '' " +[platforms] + [[old_street]] + job runner = at + job runner command template = at now + 5 minutes + hosts = localhost + install target = localhost +" + +install_and_validate +reftest_run + +grep_workflow_log_ok "grep-a" "[(('event-handler-00', 'failed'), 1) out] 1/a" -F + +for task in b c; do + grep_workflow_log_ok "grep-${task}" \ + "[(('event-handler-00', 'submission failed'), 1) out] 1/${task}" -F +done + +purge diff --git a/tests/functional/cylc-kill/04-handlers/flow.cylc b/tests/functional/cylc-kill/04-handlers/flow.cylc new file mode 100644 index 00000000000..28adbfa35cb --- /dev/null +++ b/tests/functional/cylc-kill/04-handlers/flow.cylc @@ -0,0 +1,39 @@ +[scheduler] + allow implicit tasks = True + [[events]] + expected task failures = 1/a, 1/b, 1/c + stall timeout = PT0S + abort on stall timeout = True +[scheduling] + [[graph]] + R1 = """ + a:started => killer + a:failed => end + + b:submitted? => killer + b:submit-failed? => end + + c:submit-failed? => end + + c:submitted? => nope + """ +[runtime] + [[a, b, c]] + [[[events]]] + failed handlers = echo %(id)s + submission failed handlers = echo %(id)s + [[a]] + script = sleep 40 + [[b]] + platform = old_street + [[c]] + platform = $(sleep 40; echo localhost) + [[killer]] + script = """ + cylc kill "$CYLC_WORKFLOW_ID//1/a" "$CYLC_WORKFLOW_ID//1/b" + + cylc__job__poll_grep_workflow_log -E '1\/c.* => preparing' + cylc kill "$CYLC_WORKFLOW_ID//1/c" + """ + [[end]] + script = cylc stop "$CYLC_WORKFLOW_ID" --now --now diff --git a/tests/functional/cylc-kill/04-handlers/reference.log b/tests/functional/cylc-kill/04-handlers/reference.log new file mode 100644 index 00000000000..fa8414bf650 --- /dev/null +++ b/tests/functional/cylc-kill/04-handlers/reference.log @@ -0,0 +1,6 @@ +Initial point: 1 +Final point: 1 +1/a -triggered off [] +1/b -triggered off [] +1/killer -triggered off ['1/a', '1/b'] +1/end -triggered off ['1/a', '1/b', '1/c'] diff --git a/tests/functional/cylc-kill/05-retries.t b/tests/functional/cylc-kill/05-retries.t new file mode 100644 index 00000000000..03a90a6001f --- /dev/null +++ b/tests/functional/cylc-kill/05-retries.t @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. +# Copyright (C) NIWA & British Crown (Met Office) & Contributors. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# Test retries when killing running/submitted/preparing tasks. +# As killing tasks puts them in the held state, retries should NOT go ahead. + +export REQUIRE_PLATFORM='runner:at' +. 
"$(dirname "$0")/test_header" +set_test_number 2 + +# Create platform that ensures job will be in submitted state for long enough +create_test_global_config '' " +[platforms] + [[old_street]] + job runner = at + job runner command template = at now + 5 minutes + hosts = localhost + install target = localhost +" + +install_and_validate +reftest_run + +purge diff --git a/tests/functional/cylc-kill/05-retries/flow.cylc b/tests/functional/cylc-kill/05-retries/flow.cylc new file mode 100644 index 00000000000..0b6c276cd1d --- /dev/null +++ b/tests/functional/cylc-kill/05-retries/flow.cylc @@ -0,0 +1,38 @@ +[scheduler] + allow implicit tasks = True + [[events]] + expected task failures = 1/a, 1/b, 1/c + stall timeout = PT0S + abort on stall timeout = True +[scheduling] + [[graph]] + R1 = """ + a:started & b:submitted? => killer => end + + # Should not get these outputs: + a:failed? | b:submit-failed? | c:submit-failed? => nope + """ +[runtime] + [[a]] + script = sleep 40 + execution retry delays = PT0S + [[b]] + platform = old_street + submission retry delays = PT0S + [[c]] + platform = $(sleep 40; echo localhost) + submission retry delays = PT0S + [[killer]] + script = """ + cylc kill "$CYLC_WORKFLOW_ID//1/a" "$CYLC_WORKFLOW_ID//1/b" + + cylc__job__poll_grep_workflow_log -E '1\/c.* => preparing' + cylc kill "$CYLC_WORKFLOW_ID//1/c" + """ + [[end]] + script = """ + for task in a b c; do + cylc__job__poll_grep_workflow_log -E "1\/${task}.* retrying" + done + cylc stop "$CYLC_WORKFLOW_ID" --now --now + """ diff --git a/tests/functional/cylc-kill/05-retries/reference.log b/tests/functional/cylc-kill/05-retries/reference.log new file mode 100644 index 00000000000..121a4a40def --- /dev/null +++ b/tests/functional/cylc-kill/05-retries/reference.log @@ -0,0 +1,6 @@ +Initial point: 1 +Final point: 1 +1/a -triggered off [] +1/b -triggered off [] +1/killer -triggered off ['1/a', '1/b'] +1/end -triggered off ['1/killer'] diff --git a/tests/functional/lib/bash/test_header b/tests/functional/lib/bash/test_header index 76f52524eb0..07d75a7c4df 100644 --- a/tests/functional/lib/bash/test_header +++ b/tests/functional/lib/bash/test_header @@ -433,8 +433,10 @@ grep_workflow_log_ok() { local TEST_NAME="$1" local PATTERN="$2" shift 2 + OPTS="$*" local LOG_FILE="${WORKFLOW_RUN_DIR}/log/scheduler/log" - if grep "$@" -s -e "$PATTERN" "$LOG_FILE"; then + # shellcheck disable=SC2086 + if grep ${OPTS} -s -e "$PATTERN" "$LOG_FILE"; then ok "${TEST_NAME}" return fi From 02a025e1c563559273e8699411e288f57eff4c2f Mon Sep 17 00:00:00 2001 From: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Date: Thu, 16 Jan 2025 15:02:57 +0000 Subject: [PATCH 4/9] Replace functional test with integration test --- tests/functional/cylc-kill/03-simulation.t | 42 ---------------- .../cylc-kill/03-simulation/flow.cylc | 12 ----- tests/integration/test_kill.py | 50 ++++++++++++++++++- 3 files changed, 48 insertions(+), 56 deletions(-) delete mode 100755 tests/functional/cylc-kill/03-simulation.t delete mode 100644 tests/functional/cylc-kill/03-simulation/flow.cylc diff --git a/tests/functional/cylc-kill/03-simulation.t b/tests/functional/cylc-kill/03-simulation.t deleted file mode 100755 index dadfeede123..00000000000 --- a/tests/functional/cylc-kill/03-simulation.t +++ /dev/null @@ -1,42 +0,0 @@ -#!/usr/bin/env bash -# THIS FILE IS PART OF THE CYLC WORKFLOW ENGINE. -# Copyright (C) NIWA & British Crown (Met Office) & Contributors. 
-# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . - -# Test kill a running simulation job - -. "$(dirname "$0")/test_header" - -set_test_number 3 -install_workflow "${TEST_NAME_BASE}" "${TEST_NAME_BASE}" - -run_ok "${TEST_NAME_BASE}-validate" cylc validate "${WORKFLOW_NAME}" - -# run workflow in background -cylc play --debug -m simulation "${WORKFLOW_NAME}" >/dev/null 2>&1 - -# wait for simulated job start -poll_grep_workflow_log "1/foo.* running" -E - -# kill it -run_ok killer cylc kill "${WORKFLOW_NAME}//1/foo" - -# wait for shut down -poll_grep_workflow_log "INFO - DONE" - -# check the sim job was kiled -grep_workflow_log_ok killed "1/foo.* failed" -E - -purge diff --git a/tests/functional/cylc-kill/03-simulation/flow.cylc b/tests/functional/cylc-kill/03-simulation/flow.cylc deleted file mode 100644 index 03b6249e962..00000000000 --- a/tests/functional/cylc-kill/03-simulation/flow.cylc +++ /dev/null @@ -1,12 +0,0 @@ -[scheduler] - [[events]] - inactivity timeout = PT20S - abort on inactivity timeout = True -[scheduling] - [[graph]] - R1 = "foo?" -[runtime] - [[root]] - [[[simulation]]] - default run length = PT30S - [[foo]] diff --git a/tests/integration/test_kill.py b/tests/integration/test_kill.py index be88adf123a..fd38b7e42ce 100644 --- a/tests/integration/test_kill.py +++ b/tests/integration/test_kill.py @@ -14,12 +14,58 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
+ +import asyncio import logging + +from async_timeout import timeout as async_timeout import pytest -from cylc.flow.commands import kill_tasks, run_cmd +from cylc.flow.commands import ( + kill_tasks, + run_cmd, +) from cylc.flow.scheduler import Scheduler -from cylc.flow.task_state import TASK_STATUS_SUBMIT_FAILED, TASK_STATUS_WAITING +from cylc.flow.task_proxy import TaskProxy +from cylc.flow.task_state import ( + TASK_STATUS_FAILED, + TASK_STATUS_PREPARING, + TASK_STATUS_RUNNING, + TASK_STATUS_SUBMIT_FAILED, +) + + +async def task_state(itask: TaskProxy, state: str, timeout=4, **kwargs): + """Await task state.""" + async with async_timeout(timeout): + while not itask.state(state, **kwargs): + await asyncio.sleep(0.1) + + +async def test_simulation(flow, scheduler, run): + """Test killing a running task in simulation mode.""" + conf = { + 'scheduling': { + 'graph': { + 'R1': 'foo', + }, + }, + 'runtime': { + 'root': { + 'simulation': { + 'default run length': 'PT30S', + }, + }, + }, + } + schd: Scheduler = scheduler(flow(conf), paused_start=False) + async with run(schd): + itask = schd.pool.get_tasks()[0] + await task_state(itask, TASK_STATUS_RUNNING) + + await run_cmd(kill_tasks(schd, [itask.identity])) + await task_state(itask, TASK_STATUS_FAILED, is_held=True) + assert schd.check_workflow_stalled() async def test_kill_preparing( From 9146357590ba9ed10ed3360f2711d16b82f40242 Mon Sep 17 00:00:00 2001 From: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Date: Fri, 17 Jan 2025 15:51:14 +0000 Subject: [PATCH 5/9] Fix return value of `Scheduler.start_job_submission()` Was returning True even if no tasks were submitted --- cylc/flow/scheduler.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/cylc/flow/scheduler.py b/cylc/flow/scheduler.py index eec8dd43b67..324f701486e 100644 --- a/cylc/flow/scheduler.py +++ b/cylc/flow/scheduler.py @@ -19,6 +19,7 @@ from collections import deque from contextlib import suppress import itertools +import logging import os from pathlib import Path from queue import ( @@ -82,6 +83,7 @@ FLOW_NONE, FlowMgr, repr_flow_nums, + stringify_flow_nums, ) from cylc.flow.host_select import ( HostSelectException, @@ -1535,22 +1537,24 @@ def start_job_submission(self, itasks: 'Iterable[TaskProxy]') -> bool: self.task_job_mgr.task_remote_mgr.rsync_includes = ( self.config.get_validated_rsync_includes()) - log = LOG.debug - if self.options.reftest or self.options.genref: - log = LOG.info - - for itask in self.task_job_mgr.submit_task_jobs( + submitted = self.task_job_mgr.submit_task_jobs( self.workflow, itasks, self.server.curve_auth, self.server.client_pub_key_dir, run_mode=self.get_run_mode() - ): - if itask.flow_nums: - flow = ','.join(str(i) for i in itask.flow_nums) - else: - flow = FLOW_NONE - log( + ) + if not submitted: + return False + + log_lvl = logging.DEBUG + if self.options.reftest or self.options.genref: + log_lvl = logging.INFO + + for itask in submitted: + flow = stringify_flow_nums(itask.flow_nums) or FLOW_NONE + LOG.log( + log_lvl, f"{itask.identity} -triggered off " f"{itask.state.get_resolved_dependencies()} in flow {flow}" ) From 475962bdda46efa67fc62bc73ba63b09bf986026 Mon Sep 17 00:00:00 2001 From: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Date: Fri, 17 Jan 2025 15:51:29 +0000 Subject: [PATCH 6/9] Add test --- tests/integration/test_kill.py | 99 +++++++++++++++++++++++++++++----- 1 file changed, 86 insertions(+), 13 deletions(-) diff --git a/tests/integration/test_kill.py 
b/tests/integration/test_kill.py index fd38b7e42ce..3588ccf812e 100644 --- a/tests/integration/test_kill.py +++ b/tests/integration/test_kill.py @@ -17,6 +17,8 @@ import asyncio import logging +from secrets import token_hex +from unittest.mock import Mock from async_timeout import timeout as async_timeout import pytest @@ -27,6 +29,12 @@ ) from cylc.flow.scheduler import Scheduler from cylc.flow.task_proxy import TaskProxy +from cylc.flow.task_remote_mgr import ( + REMOTE_FILE_INSTALL_DONE, + REMOTE_FILE_INSTALL_IN_PROGRESS, + REMOTE_INIT_DONE, + REMOTE_INIT_IN_PROGRESS, +) from cylc.flow.task_state import ( TASK_STATUS_FAILED, TASK_STATUS_PREPARING, @@ -35,6 +43,9 @@ ) +LOCALHOST = 'localhost' + + async def task_state(itask: TaskProxy, state: str, timeout=4, **kwargs): """Await task state.""" async with async_timeout(timeout): @@ -42,6 +53,11 @@ async def task_state(itask: TaskProxy, state: str, timeout=4, **kwargs): await asyncio.sleep(0.1) +def patch_remote_init(schd: Scheduler, value: str): + """Set remote init state.""" + schd.task_job_mgr.task_remote_mgr.remote_init_map[LOCALHOST] = value + + async def test_simulation(flow, scheduler, run): """Test killing a running task in simulation mode.""" conf = { @@ -69,26 +85,83 @@ async def test_simulation(flow, scheduler, run): async def test_kill_preparing( - one_conf, - flow, - scheduler, - start, - monkeypatch: pytest.MonkeyPatch, - log_filter, + flow, scheduler, run, monkeypatch: pytest.MonkeyPatch, log_filter ): """Test killing a preparing task.""" schd: Scheduler = scheduler( - flow(one_conf), run_mode='live', paused_start=False + flow('foo'), run_mode='live', paused_start=False ) - async with start(schd): + async with run(schd): # Make the task indefinitely preparing: monkeypatch.setattr( - schd.task_job_mgr, '_prep_submit_task_job', lambda *a, **k: None + schd.task_job_mgr, '_prep_submit_task_job', Mock(return_value=None) ) itask = schd.pool.get_tasks()[0] - assert itask.state(TASK_STATUS_WAITING, is_held=False) - schd.start_job_submission([itask]) + await task_state(itask, TASK_STATUS_PREPARING, is_held=False) - await run_cmd(kill_tasks(schd, [itask.tokens.relative_id])) - assert itask.state(TASK_STATUS_SUBMIT_FAILED, is_held=True) + await run_cmd(kill_tasks(schd, [itask.identity])) + await task_state(itask, TASK_STATUS_SUBMIT_FAILED, is_held=True) assert log_filter(logging.ERROR, 'killed in job prep') + + +async def test_kill_preparing_pipeline( + flow, scheduler, run, monkeypatch: pytest.MonkeyPatch +): + """Test killing a preparing task through various stages of the preparing + pipeline that involve submitting subprocesses and waiting for them to + complete.""" + # Make localhost look like a remote target so we can test + # remote init/file install stages: + monkeypatch.setattr( + 'cylc.flow.task_job_mgr.get_localhost_install_target', + Mock(return_value=token_hex()), + ) + + schd: Scheduler = scheduler( + flow('one'), run_mode='live', paused_start=False + ) + async with run(schd): + remote_mgr = schd.task_job_mgr.task_remote_mgr + mock_eval_platform = Mock(return_value=None) + monkeypatch.setattr(remote_mgr, 'eval_platform', mock_eval_platform) + mock_remote_init = Mock() + monkeypatch.setattr(remote_mgr, 'remote_init', mock_remote_init) + mock_file_install = Mock() + monkeypatch.setattr(remote_mgr, 'file_install', mock_file_install) + itask = schd.pool.get_tasks()[0] + + # Platform eval: + await task_state(itask, TASK_STATUS_PREPARING) + assert schd.release_tasks_to_run() is False + await run_cmd(kill_tasks(schd, 
[itask.identity])) + await task_state(itask, TASK_STATUS_SUBMIT_FAILED) + # Set to finished: + mock_eval_platform.return_value = LOCALHOST + # Should not submit after finish because it was killed: + assert schd.release_tasks_to_run() is False + + # Remote init: + patch_remote_init(schd, REMOTE_INIT_IN_PROGRESS) + schd.pool._force_trigger(itask) + await task_state(itask, TASK_STATUS_PREPARING) + assert schd.release_tasks_to_run() is False + await run_cmd(kill_tasks(schd, [itask.identity])) + await task_state(itask, TASK_STATUS_SUBMIT_FAILED) + # Set to finished: + patch_remote_init(schd, REMOTE_INIT_DONE) + # Should not submit after finish because it was killed: + assert schd.release_tasks_to_run() is False + assert not mock_remote_init.called + + # Remote file install: + patch_remote_init(schd, REMOTE_FILE_INSTALL_IN_PROGRESS) + schd.pool._force_trigger(itask) + await task_state(itask, TASK_STATUS_PREPARING) + assert schd.release_tasks_to_run() is False + await run_cmd(kill_tasks(schd, [itask.identity])) + await task_state(itask, TASK_STATUS_SUBMIT_FAILED) + # Set to finished: + patch_remote_init(schd, REMOTE_FILE_INSTALL_DONE) + # Should not submit after finish because it was killed: + assert schd.release_tasks_to_run() is False + assert not mock_file_install.called From adf59f3507c4233a181efda15c0ed068030e325f Mon Sep 17 00:00:00 2001 From: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Date: Fri, 17 Jan 2025 18:15:34 +0000 Subject: [PATCH 7/9] Mypy --- cylc/flow/task_job_mgr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index 69da98eb8ff..23d644f812a 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -346,7 +346,7 @@ def submit_livelike_task_jobs( bc_mgr = self.task_events_mgr.broadcast_mgr rtconf = bc_mgr.get_updated_rtconfig(itask) try: - platform = get_platform( + platform = get_platform( # type: ignore[assignment] rtconf, bad_hosts=self.bad_hosts ) @@ -1278,7 +1278,7 @@ def _prep_submit_task_job( workflow, itask, '(platform not defined)', exc) return False else: - itask.platform = platform + itask.platform = platform # type: ignore[assignment] # Retry delays, needed for the try_num self._set_retry_timers(itask, rtconfig) From eae06d76d5311688d0e4d2fe126e4f2d5f83853e Mon Sep 17 00:00:00 2001 From: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Date: Thu, 13 Feb 2025 17:16:38 +0000 Subject: [PATCH 8/9] Tidy API for submitting task jobs --- cylc/flow/network/server.py | 16 +++-- cylc/flow/scheduler.py | 22 +++---- cylc/flow/task_job_mgr.py | 63 +++++++++++-------- cylc/flow/task_remote_mgr.py | 30 +++------ tests/integration/conftest.py | 8 +-- .../run_modes/test_mode_overrides.py | 12 +--- tests/integration/run_modes/test_nonlive.py | 52 +++++++-------- tests/integration/run_modes/test_skip.py | 29 ++------- tests/integration/test_kill.py | 24 ++++--- tests/integration/test_task_events_mgr.py | 8 +-- tests/integration/test_task_job_mgr.py | 6 +- tests/unit/test_scheduler.py | 4 +- tests/unit/test_task_remote_mgr.py | 7 ++- 13 files changed, 122 insertions(+), 159 deletions(-) diff --git a/cylc/flow/network/server.py b/cylc/flow/network/server.py index 2c170e61198..b2b7753ee55 100644 --- a/cylc/flow/network/server.py +++ b/cylc/flow/network/server.py @@ -123,6 +123,10 @@ class WorkflowRuntimeServer: """ endpoints: Dict[str, object] + curve_auth: ThreadAuthenticator + """The ZMQ authenticator.""" + client_pub_key_dir: str + """Client public key directory, used 
by the ZMQ authenticator.""" OPERATE_SLEEP_INTERVAL = 0.2 STOP_SLEEP_INTERVAL = 0.2 @@ -136,8 +140,6 @@ def __init__(self, schd): self.publisher = None self.loop = None self.thread = None - self.curve_auth = None - self.client_pub_key_dir = None self.schd: 'Scheduler' = schd self.resolvers = Resolvers( @@ -184,10 +186,7 @@ def start(self, barrier): self.client_pub_key_dir = client_pub_keyinfo.key_path # Initial load for the localhost key. - self.curve_auth.configure_curve( - domain='*', - location=(self.client_pub_key_dir) - ) + self.configure_curve() min_, max_ = glbl_cfg().get(['scheduler', 'run hosts', 'ports']) self.replier = WorkflowReplier(self, context=self.zmq_context) @@ -207,6 +206,11 @@ def start(self, barrier): self.operate() + def configure_curve(self) -> None: + self.curve_auth.configure_curve( + domain='*', location=self.client_pub_key_dir + ) + async def stop(self, reason: Union[BaseException, str]) -> None: """Stop the TCP servers, and clean up authentication. diff --git a/cylc/flow/scheduler.py b/cylc/flow/scheduler.py index 324f701486e..8fe3f9a4809 100644 --- a/cylc/flow/scheduler.py +++ b/cylc/flow/scheduler.py @@ -442,7 +442,8 @@ async def initialise(self): self.workflow_db_mgr, self.task_events_mgr, self.data_store_mgr, - self.bad_hosts + self.bad_hosts, + self.server, ) self.profiler = Profiler(self, self.options.profile_mode) @@ -912,9 +913,7 @@ def restart_remote_init(self): if install_target == get_localhost_install_target(): continue # set off remote init - self.task_job_mgr.task_remote_mgr.remote_init( - platform, self.server.curve_auth, - self.server.client_pub_key_dir) + self.task_job_mgr.task_remote_mgr.remote_init(platform) # Remote init/file-install is done via process pool self.proc_pool.process() # add platform to map (to be picked up on main loop) @@ -1537,13 +1536,7 @@ def start_job_submission(self, itasks: 'Iterable[TaskProxy]') -> bool: self.task_job_mgr.task_remote_mgr.rsync_includes = ( self.config.get_validated_rsync_includes()) - submitted = self.task_job_mgr.submit_task_jobs( - self.workflow, - itasks, - self.server.curve_auth, - self.server.client_pub_key_dir, - run_mode=self.get_run_mode() - ) + submitted = self.submit_task_jobs(itasks) if not submitted: return False @@ -1562,6 +1555,13 @@ def start_job_submission(self, itasks: 'Iterable[TaskProxy]') -> bool: # one or more tasks were passed through the submission pipeline return True + def submit_task_jobs( + self, itasks: 'Iterable[TaskProxy]' + ) -> 'List[TaskProxy]': + """Submit task jobs, return tasks that attempted submission.""" + # Note: keep this as simple wrapper for task job mgr's method + return self.task_job_mgr.submit_task_jobs(itasks, self.get_run_mode()) + def process_workflow_db_queue(self): """Update workflow DB.""" self.workflow_db_mgr.process_queued_ops() diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index 23d644f812a..e9c0bfc6343 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -159,8 +159,16 @@ class TaskJobManager: REMOTE_INIT_IN_PROGRESS: REMOTE_INIT_MSG } - def __init__(self, workflow, proc_pool, workflow_db_mgr, - task_events_mgr, data_store_mgr, bad_hosts): + def __init__( + self, + workflow, + proc_pool, + workflow_db_mgr, + task_events_mgr, + data_store_mgr, + bad_hosts, + server, + ): self.workflow: str = workflow self.proc_pool = proc_pool self.workflow_db_mgr: WorkflowDatabaseManager = workflow_db_mgr @@ -171,7 +179,8 @@ def __init__(self, workflow, proc_pool, workflow_db_mgr, self.bad_hosts = bad_hosts 
self.bad_hosts_to_clear = set() self.task_remote_mgr = TaskRemoteMgr( - workflow, proc_pool, self.bad_hosts, self.workflow_db_mgr) + workflow, proc_pool, self.bad_hosts, self.workflow_db_mgr, server + ) def check_task_jobs(self, workflow, task_pool): """Check submission and execution timeout and polling timers. @@ -267,40 +276,43 @@ def prep_submit_task_jobs( def submit_task_jobs( self, - workflow, itasks: 'Iterable[TaskProxy]', - curve_auth, - client_pub_key_dir, - run_mode: RunMode = RunMode.LIVE, + run_mode: RunMode, ) -> 'List[TaskProxy]': """Prepare for job submission and submit task jobs. - Preparation (host selection, remote host init, and remote install) - is done asynchronously. Newly released tasks may be sent here several - times until these init subprocesses have returned. Failure during - preparation is considered to be job submission failure. - - Once preparation has completed or failed, reset .waiting_on_job_prep in - task instances so the scheduler knows to stop sending them back here. - - This method uses prep_submit_task_jobs() as helper. - - Return (list): list of tasks that attempted submission. + Return: tasks that attempted submission. """ # submit "simulation/skip" mode tasks, modify "dummy" task configs: itasks, submitted_nonlive_tasks = self.submit_nonlive_task_jobs( - workflow, itasks, run_mode) + self.workflow, itasks, run_mode + ) # submit "live" mode tasks (and "dummy" mode tasks) submitted_live_tasks = self.submit_livelike_task_jobs( - workflow, itasks, curve_auth, client_pub_key_dir) + self.workflow, itasks + ) return submitted_nonlive_tasks + submitted_live_tasks def submit_livelike_task_jobs( - self, workflow, itasks, curve_auth, client_pub_key_dir + self, + workflow: str, + itasks: 'Iterable[TaskProxy]', ) -> 'List[TaskProxy]': """Submission for live tasks and dummy tasks. + + Preparation (host selection, remote host init, and remote install) + is done asynchronously. Newly released tasks may be sent here several + times until these init subprocesses have returned. Failure during + preparation is considered to be job submission failure. + + Once preparation has completed or failed, reset .waiting_on_job_prep in + task instances so the scheduler knows to stop sending them back here. + + This method uses prep_submit_task_jobs() as helper. + + Return: tasks that attempted submission. """ done_tasks: 'List[TaskProxy]' = [] # Mapping of platforms to task proxies: @@ -404,8 +416,7 @@ def submit_livelike_task_jobs( elif install_target not in ri_map: # Remote init not in progress for target, so start it. - self.task_remote_mgr.remote_init( - platform, curve_auth, client_pub_key_dir) + self.task_remote_mgr.remote_init(platform) for itask in itasks: self.data_store_mgr.delta_job_msg( itask.tokens.duplicate( @@ -433,8 +444,7 @@ def submit_livelike_task_jobs( # Remote init previously failed because a host was # unreachable, so start it again. 
del ri_map[install_target] - self.task_remote_mgr.remote_init( - platform, curve_auth, client_pub_key_dir) + self.task_remote_mgr.remote_init(platform) for itask in itasks: self.data_store_mgr.delta_job_msg( itask.tokens.duplicate( @@ -457,8 +467,7 @@ def submit_livelike_task_jobs( ) except NoHostsError: del ri_map[install_target] - self.task_remote_mgr.remote_init( - platform, curve_auth, client_pub_key_dir) + self.task_remote_mgr.remote_init(platform) for itask in itasks: self.data_store_mgr.delta_job_msg( itask.tokens.duplicate( diff --git a/cylc/flow/task_remote_mgr.py b/cylc/flow/task_remote_mgr.py index 50b523fb50c..9132a8e44c5 100644 --- a/cylc/flow/task_remote_mgr.py +++ b/cylc/flow/task_remote_mgr.py @@ -81,7 +81,7 @@ ) if TYPE_CHECKING: - from zmq.auth.thread import ThreadAuthenticator + from cylc.flow.network.server import WorkflowRuntimeServer # Remote installation literals REMOTE_INIT_DONE = 'REMOTE INIT DONE' @@ -103,7 +103,7 @@ class RemoteTidyQueueTuple(NamedTuple): class TaskRemoteMgr: """Manage task remote initialisation, tidy, selection.""" - def __init__(self, workflow, proc_pool, bad_hosts, db_mgr): + def __init__(self, workflow, proc_pool, bad_hosts, db_mgr, server): self.workflow = workflow self.proc_pool = proc_pool # self.remote_command_map = {command: host|PlatformError|None} @@ -117,6 +117,7 @@ def __init__(self, workflow, proc_pool, bad_hosts, db_mgr): self.is_reload = False self.is_restart = False self.db_mgr = db_mgr + self.server: WorkflowRuntimeServer = server def _subshell_eval( self, eval_str: str, command_pattern: re.Pattern @@ -207,9 +208,7 @@ def subshell_eval_reset(self): if value is not None: del self.remote_command_map[key] - def remote_init( - self, platform: Dict[str, Any], curve_auth: 'ThreadAuthenticator', - client_pub_key_dir: str) -> None: + def remote_init(self, platform: Dict[str, Any]) -> None: """Initialise a remote host if necessary. Call "cylc remote-init" to install workflow items to remote: @@ -219,9 +218,6 @@ def remote_init( Args: platform: A dict containing settings relating to platform used in this remote installation. - curve_auth: The ZMQ authenticator. - client_pub_key_dir: Client public key directory, used by the - ZMQ authenticator. """ install_target = platform['install target'] @@ -277,18 +273,13 @@ def remote_init( cmd = construct_ssh_cmd(cmd, platform, host) self.proc_pool.put_command( SubProcContext( - 'remote-init', - cmd, - stdin_files=[tmphandle], - host=host + 'remote-init', cmd, stdin_files=[tmphandle], host=host ), bad_hosts=self.bad_hosts, callback=self._remote_init_callback, - callback_args=[ - platform, tmphandle, curve_auth, client_pub_key_dir - ], + callback_args=[platform, tmphandle], callback_255=self._remote_init_callback_255, - callback_255_args=[platform] + callback_255_args=[platform], ) def construct_remote_tidy_ssh_cmd( @@ -485,9 +476,7 @@ def _remote_init_callback_255(self, proc_ctx, platform): self.ready = True return - def _remote_init_callback( - self, proc_ctx, platform, tmphandle, - curve_auth, client_pub_key_dir): + def _remote_init_callback(self, proc_ctx, platform, tmphandle): """Callback when "cylc remote-init" exits. Write public key for install target into client public key @@ -518,8 +507,7 @@ def _remote_init_callback( # configure_curve must be called every time certificates are # added or removed, in order to update the Authenticator's # state. 
- curve_auth.configure_curve( - domain='*', location=(client_pub_key_dir)) + self.server.configure_curve() self.remote_init_map[install_target] = REMOTE_INIT_DONE self.ready = True return diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index cfcd137b453..5af6a2b47f5 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -424,14 +424,14 @@ def capture_submission(): def _disable_submission(schd: 'Scheduler') -> 'Set[TaskProxy]': submitted_tasks: 'Set[TaskProxy]' = set() - def _submit_task_jobs(_, itasks, *args, **kwargs): + def _submit_task_jobs(itasks): nonlocal submitted_tasks for itask in itasks: itask.state_reset(TASK_STATUS_SUBMITTED) submitted_tasks.update(itasks) return itasks - schd.task_job_mgr.submit_task_jobs = _submit_task_jobs # type: ignore + schd.submit_task_jobs = _submit_task_jobs # type: ignore return submitted_tasks return _disable_submission @@ -564,7 +564,7 @@ def reflog(): """ def _reflog(schd: 'Scheduler', flow_nums: bool = False) -> Set[tuple]: - submit_task_jobs = schd.task_job_mgr.submit_task_jobs + submit_task_jobs = schd.submit_task_jobs triggers = set() def _submit_task_jobs(*args, **kwargs): @@ -580,7 +580,7 @@ def _submit_task_jobs(*args, **kwargs): triggers.add((itask.identity, deps or None)) return itasks - schd.task_job_mgr.submit_task_jobs = _submit_task_jobs + schd.submit_task_jobs = _submit_task_jobs return triggers diff --git a/tests/integration/run_modes/test_mode_overrides.py b/tests/integration/run_modes/test_mode_overrides.py index 25cdf6cf68a..5aa15394d18 100644 --- a/tests/integration/run_modes/test_mode_overrides.py +++ b/tests/integration/run_modes/test_mode_overrides.py @@ -95,11 +95,7 @@ async def test_force_trigger_does_not_override_run_mode( schd.pool.force_trigger_tasks('1/foo', [1]) # ... 
but job submission will always change this to the correct mode: - schd.task_job_mgr.submit_task_jobs( - schd.workflow, - [foo], - schd.server.curve_auth, - schd.server.client_pub_key_dir) + schd.submit_task_jobs([foo]) assert foo.run_mode.value == 'skip' @@ -157,10 +153,6 @@ async def test_run_mode_override_from_broadcast( foo_1000 = schd.pool.get_task(ISO8601Point('1000'), 'foo') foo_1001 = schd.pool.get_task(ISO8601Point('1001'), 'foo') - schd.task_job_mgr.submit_task_jobs( - schd.workflow, - [foo_1000, foo_1001], - schd.server.curve_auth, - schd.server.client_pub_key_dir) + schd.submit_task_jobs([foo_1000, foo_1001]) assert foo_1000.run_mode.value == 'skip' assert capture_live_submissions() == {'1001/foo'} diff --git a/tests/integration/run_modes/test_nonlive.py b/tests/integration/run_modes/test_nonlive.py index 90cefbf7701..d975c34922c 100644 --- a/tests/integration/run_modes/test_nonlive.py +++ b/tests/integration/run_modes/test_nonlive.py @@ -19,6 +19,7 @@ from cylc.flow.cycling.integer import IntegerPoint from cylc.flow.cycling.iso8601 import ISO8601Point +from cylc.flow.scheduler import Scheduler # Define here to ensure test doesn't just mirror code: @@ -30,9 +31,11 @@ 'submit_status': 0, 'run_signal': None, 'run_status': 0, + # capture_live_submissions fixture submits jobs in sim mode 'platform_name': 'simulation', 'job_runner_name': 'simulation', - 'job_id': None}, + 'job_id': None, + }, 'skip': { 'flow_nums': '[1]', 'is_manual_submit': 0, @@ -42,7 +45,8 @@ 'run_status': 0, 'platform_name': 'skip', 'job_runner_name': 'skip', - 'job_id': None}, + 'job_id': None, + }, } @@ -59,12 +63,7 @@ def submit_and_check_db(): """ def _inner(schd): # Submit task jobs: - schd.task_job_mgr.submit_task_jobs( - schd.workflow, - schd.pool.get_tasks(), - schd.server.curve_auth, - schd.server.client_pub_key_dir - ) + schd.submit_task_jobs(schd.pool.get_tasks()) # Make sure that db changes are enacted: schd.workflow_db_mgr.process_queued_ops() @@ -77,7 +76,7 @@ def _inner(schd): # Check that timestamps have been created: for timestamp in [ - 'time_submit', 'time_submit_exit', 'time_run', 'time_run_exit' + 'time_submit', 'time_submit_exit', 'time_run', 'time_run_exit' ]: assert task_jobs[timestamp] is not None return _inner @@ -90,19 +89,25 @@ async def test_db_task_jobs( """Ensure that task job data is added to the database correctly for each run mode. 
""" - schd = scheduler(flow({ - 'scheduling': {'graph': { - 'R1': '&'.join(KGO)}}, - 'runtime': { - mode: {'run mode': mode} for mode in KGO} - })) + schd: Scheduler = scheduler( + flow({ + 'scheduling': { + 'graph': { + 'R1': ' & '.join(KGO) + } + }, + 'runtime': { + mode: {'run mode': mode} for mode in KGO + }, + }), + run_mode='live' + ) async with start(schd): # Reference all task proxies so we can examine them # at the end of the test: itask_skip = schd.pool.get_task(IntegerPoint('1'), 'skip') itask_live = schd.pool.get_task(IntegerPoint('1'), 'live') - submit_and_check_db(schd) # Set outputs to failed: @@ -110,6 +115,7 @@ async def test_db_task_jobs( submit_and_check_db(schd) + # capture_live_submissions fixture submits jobs in sim mode assert itask_live.run_mode.value == 'simulation' assert itask_skip.run_mode.value == 'skip' @@ -124,12 +130,7 @@ async def test_db_task_states( conf['runtime'] = {'one': {'run mode': 'skip'}} schd = scheduler(flow(conf)) async with start(schd): - schd.task_job_mgr.submit_task_jobs( - schd.workflow, - schd.pool.get_tasks(), - schd.server.curve_auth, - schd.server.client_pub_key_dir - ) + schd.submit_task_jobs(schd.pool.get_tasks()) schd.workflow_db_mgr.process_queued_ops() result = schd.workflow_db_mgr.pri_dao.connect().execute( 'SELECT * FROM task_states').fetchone() @@ -165,12 +166,7 @@ async def test_mean_task_time( itask.tdef.elapsed_times.extend([133.0, 132.4]) # Submit two tasks: - schd.task_job_mgr.submit_task_jobs( - schd.workflow, - [itask], - schd.server.curve_auth, - schd.server.client_pub_key_dir - ) + schd.submit_task_jobs([itask]) # Ensure that the skipped task has succeeded, and that the # number of items in the elapsed_times has not changed. diff --git a/tests/integration/run_modes/test_skip.py b/tests/integration/run_modes/test_skip.py index 5c9c31a6d56..1bcdf3891af 100644 --- a/tests/integration/run_modes/test_skip.py +++ b/tests/integration/run_modes/test_skip.py @@ -51,12 +51,7 @@ async def test_settings_override_from_broadcast( foo, = schd.pool.get_tasks() - schd.task_job_mgr.submit_task_jobs( - schd.workflow, - schd.pool.get_tasks(), - schd.server.curve_auth, - schd.server.client_pub_key_dir - ) + schd.submit_task_jobs(schd.pool.get_tasks()) # Run mode has changed: assert foo.platform['name'] == 'skip' # Output failed emitted: @@ -215,13 +210,7 @@ async def test_prereqs_marked_satisfied_by_skip_mode( async with start(schd): foo = schd.pool.get_task(IntegerPoint(1), 'foo') - schd.task_job_mgr.submit_task_jobs( - schd.workflow, - [foo], - schd.server.curve_auth, - schd.server.client_pub_key_dir, - run_mode=schd.get_run_mode() - ) + schd.submit_task_jobs([foo]) bar = schd.pool.get_task(IntegerPoint(1), 'bar') satisfied_message, = bar.state.prerequisites[0]._satisfied.values() assert satisfied_message == 'satisfied by skip mode' @@ -240,20 +229,10 @@ async def test_outputs_can_be_changed(one_conf, flow, start, scheduler, validate {"skip": {"outputs": "failed"}}, ], ) - schd.task_job_mgr.submit_task_jobs( - schd.workflow, - schd.pool.get_tasks(), - None, - None - ) + schd.submit_task_jobs(schd.pool.get_tasks()) # Broadcast the task into skip mode, output succeeded and submit it: schd.broadcast_mgr.put_broadcast( ['1'], ['one'], [{'skip': {'outputs': 'succeeded'}}] ) - schd.task_job_mgr.submit_task_jobs( - schd.workflow, - schd.pool.get_tasks(), - None, - None - ) + schd.submit_task_jobs(schd.pool.get_tasks()) diff --git a/tests/integration/test_kill.py b/tests/integration/test_kill.py index 3588ccf812e..ab13fb43b10 100644 --- 
a/tests/integration/test_kill.py +++ b/tests/integration/test_kill.py @@ -105,7 +105,7 @@ async def test_kill_preparing( async def test_kill_preparing_pipeline( - flow, scheduler, run, monkeypatch: pytest.MonkeyPatch + flow, scheduler, start, monkeypatch: pytest.MonkeyPatch ): """Test killing a preparing task through various stages of the preparing pipeline that involve submitting subprocesses and waiting for them to @@ -120,7 +120,7 @@ async def test_kill_preparing_pipeline( schd: Scheduler = scheduler( flow('one'), run_mode='live', paused_start=False ) - async with run(schd): + async with start(schd): remote_mgr = schd.task_job_mgr.task_remote_mgr mock_eval_platform = Mock(return_value=None) monkeypatch.setattr(remote_mgr, 'eval_platform', mock_eval_platform) @@ -131,10 +131,12 @@ async def test_kill_preparing_pipeline( itask = schd.pool.get_tasks()[0] # Platform eval: - await task_state(itask, TASK_STATUS_PREPARING) + schd.submit_task_jobs([itask]) + assert itask.state(TASK_STATUS_PREPARING) assert schd.release_tasks_to_run() is False await run_cmd(kill_tasks(schd, [itask.identity])) - await task_state(itask, TASK_STATUS_SUBMIT_FAILED) + assert itask.state(TASK_STATUS_SUBMIT_FAILED) + assert schd.release_tasks_to_run() is False # Set to finished: mock_eval_platform.return_value = LOCALHOST # Should not submit after finish because it was killed: @@ -142,11 +144,12 @@ async def test_kill_preparing_pipeline( # Remote init: patch_remote_init(schd, REMOTE_INIT_IN_PROGRESS) - schd.pool._force_trigger(itask) - await task_state(itask, TASK_STATUS_PREPARING) + schd.submit_task_jobs([itask]) + assert itask.state(TASK_STATUS_PREPARING) assert schd.release_tasks_to_run() is False await run_cmd(kill_tasks(schd, [itask.identity])) - await task_state(itask, TASK_STATUS_SUBMIT_FAILED) + assert itask.state(TASK_STATUS_SUBMIT_FAILED) + assert schd.release_tasks_to_run() is False # Set to finished: patch_remote_init(schd, REMOTE_INIT_DONE) # Should not submit after finish because it was killed: @@ -155,11 +158,12 @@ async def test_kill_preparing_pipeline( # Remote file install: patch_remote_init(schd, REMOTE_FILE_INSTALL_IN_PROGRESS) - schd.pool._force_trigger(itask) - await task_state(itask, TASK_STATUS_PREPARING) + schd.submit_task_jobs([itask]) + assert itask.state(TASK_STATUS_PREPARING) assert schd.release_tasks_to_run() is False await run_cmd(kill_tasks(schd, [itask.identity])) - await task_state(itask, TASK_STATUS_SUBMIT_FAILED) + assert itask.state(TASK_STATUS_SUBMIT_FAILED) + assert schd.release_tasks_to_run() is False # Set to finished: patch_remote_init(schd, REMOTE_FILE_INSTALL_DONE) # Should not submit after finish because it was killed: diff --git a/tests/integration/test_task_events_mgr.py b/tests/integration/test_task_events_mgr.py index ac1fb2f9344..4569009196c 100644 --- a/tests/integration/test_task_events_mgr.py +++ b/tests/integration/test_task_events_mgr.py @@ -151,13 +151,7 @@ async def test__always_insert_task_job( schd = scheduler(id_, run_mode='live') schd.bad_hosts = {'no-such-host-1', 'no-such-host-2'} async with start(schd): - schd.task_job_mgr.submit_task_jobs( - schd.workflow, - schd.pool.get_tasks(), - schd.server.curve_auth, - schd.server.client_pub_key_dir, - run_mode=RunMode('live') - ) + schd.submit_task_jobs(schd.pool.get_tasks()) # Both tasks are in a waiting state: assert all( diff --git a/tests/integration/test_task_job_mgr.py b/tests/integration/test_task_job_mgr.py index b1cf1347071..eda704a2d39 100644 --- a/tests/integration/test_task_job_mgr.py +++ 
b/tests/integration/test_task_job_mgr.py @@ -223,11 +223,7 @@ async def test_broadcast_platform_change( schd.task_job_mgr.task_remote_mgr.bad_hosts = {'food'} # Attempt job submission: - schd.task_job_mgr.submit_task_jobs( - schd.workflow, - schd.pool.get_tasks(), - schd.server.curve_auth, - schd.server.client_pub_key_dir) + schd.submit_task_jobs(schd.pool.get_tasks()) # Check that task platform hasn't become "localhost": assert schd.pool.get_tasks()[0].platform['name'] == 'foo' diff --git a/tests/unit/test_scheduler.py b/tests/unit/test_scheduler.py index 9163b9e3567..4ee63695efa 100644 --- a/tests/unit/test_scheduler.py +++ b/tests/unit/test_scheduler.py @@ -94,7 +94,7 @@ def test_should_auto_restart_now( def test_release_tasks_to_run__auto_restart(): """Test that Scheduler.release_tasks_to_run() works as expected during auto restart.""" - mock_schd = Mock( + mock_schd = MagicMock( auto_restart_time=(time() - 100), auto_restart_mode=AutoRestartMode.RESTART_NORMAL, is_paused=False, @@ -113,7 +113,7 @@ def test_release_tasks_to_run__auto_restart(): mock_schd.pool.release_queued_tasks.assert_not_called() Scheduler.start_job_submission(mock_schd, mock_schd.pool.get_tasks()) - mock_schd.task_job_mgr.submit_task_jobs.assert_called() + mock_schd.submit_task_jobs.assert_called() def test_auto_restart_DNS_error(monkeypatch, caplog, log_filter): diff --git a/tests/unit/test_task_remote_mgr.py b/tests/unit/test_task_remote_mgr.py index 61cdcce2bc5..115d3a1c729 100644 --- a/tests/unit/test_task_remote_mgr.py +++ b/tests/unit/test_task_remote_mgr.py @@ -90,7 +90,7 @@ def test_remote_init_skip( 'get_dirs_to_symlink'): monkeypatch.setattr(f'cylc.flow.task_remote_mgr.{item}', MagicMock()) - TaskRemoteMgr.remote_init(mock_task_remote_mgr, platform, None, '') + TaskRemoteMgr.remote_init(mock_task_remote_mgr, platform) call_expected = not skip_expected assert mock_task_remote_mgr._remote_init_items.called is call_expected assert mock_construct_ssh_cmd.called is call_expected @@ -115,7 +115,7 @@ def test_get_log_file_name(tmp_path: Path, install_target: str, load_type: Optional[str], expected: str): - task_remote_mgr = TaskRemoteMgr('some_workflow', None, None, None) + task_remote_mgr = TaskRemoteMgr('some_workflow', None, None, None, None) if load_type == 'restart': task_remote_mgr.is_restart = True elif load_type == 'reload': @@ -285,7 +285,8 @@ def _task_remote_mgr_eval(remote_cmd_map: dict) -> TaskRemoteMgr: workflow='usg_ishimura', proc_pool=Mock(), bad_hosts=[], - db_mgr=None + db_mgr=None, + server=None ) task_remote_mgr.remote_command_map = remote_cmd_map return task_remote_mgr From c28a565b2208b8f2dbb9c0cb519ac9340625fd39 Mon Sep 17 00:00:00 2001 From: Ronnie Dutta <61982285+MetRonnie@users.noreply.github.com> Date: Thu, 13 Feb 2025 17:18:18 +0000 Subject: [PATCH 9/9] Wipe platform info for task killed during preparation --- cylc/flow/task_job_mgr.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cylc/flow/task_job_mgr.py b/cylc/flow/task_job_mgr.py index e9c0bfc6343..9ce496c8fd2 100644 --- a/cylc/flow/task_job_mgr.py +++ b/cylc/flow/task_job_mgr.py @@ -212,6 +212,7 @@ def kill_task_jobs( def kill_prep_task(self, itask: 'TaskProxy') -> None: """Kill a preparing task.""" + itask.summary['platforms_used'][itask.submit_num] = '' itask.waiting_on_job_prep = False itask.local_job_file_path = None # reset for retry self._set_retry_timers(itask)
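Taken with the kill_prep_task() method introduced in the second commit, this final one-liner leaves the method looking roughly as follows (stitched together from the diffs above, not a verbatim checkout):

    def kill_prep_task(self, itask: 'TaskProxy') -> None:
        """Kill a preparing task."""
        # The task never reached a real submission, so clear platform info.
        itask.summary['platforms_used'][itask.submit_num] = ''
        itask.waiting_on_job_prep = False
        itask.local_job_file_path = None  # reset for retry
        self._set_retry_timers(itask)
        # Record the kill as a submit-failure; '(killed in job prep)' is the
        # action string the new integration test greps for in the log.
        self._prep_submit_task_job_error(
            self.workflow, itask, '(killed in job prep)', ''
        )

Because Scheduler.kill_tasks() holds a task before calling this, the retry timers set here stay parked rather than firing, which is exactly the behaviour the new 05-retries.t functional test pins down.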