From 21b5092df014d903aefe57a7367af6c91eee3023 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Thu, 10 Oct 2024 16:55:44 -0400 Subject: [PATCH 01/31] WIP: implement the ProcessJobLaunch. --- .../fed/app/job_launch/job_launch_spec.py | 66 +++++++++++++ .../fed/app/job_launch/process_job_launch.py | 93 +++++++++++++++++++ nvflare/private/fed/client/client_engine.py | 4 +- nvflare/private/fed/client/client_executor.py | 2 +- 4 files changed, 162 insertions(+), 3 deletions(-) create mode 100644 nvflare/private/fed/app/job_launch/job_launch_spec.py create mode 100644 nvflare/private/fed/app/job_launch/process_job_launch.py diff --git a/nvflare/private/fed/app/job_launch/job_launch_spec.py b/nvflare/private/fed/app/job_launch/job_launch_spec.py new file mode 100644 index 0000000000..0a150d9f46 --- /dev/null +++ b/nvflare/private/fed/app/job_launch/job_launch_spec.py @@ -0,0 +1,66 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from abc import abstractmethod + +from nvflare.apis.resource_manager_spec import ResourceManagerSpec + + +class JobLaunchSpec: + @abstractmethod + def launch_job(self, + client, + startup, + job_id, + args, + app_custom_folder, + target: str, + scheme: str, + timeout=None) -> bool: + """To launch a job run. + + Args: + timeout: the job needs to be started within this timeout. Otherwise failed the job launch. + None means no timeout limit. 
+ + Returns: boolean to indicates the job launch success or fail. + + """ + raise NotImplemented + + @abstractmethod + def terminate(self): + """To terminate the job run. + + Returns: the job run return code. + + """ + raise NotImplemented + + @abstractmethod + def return_code(self): + """To get the return code of the job run. + + Returns: return_code + + """ + raise NotImplemented + + @abstractmethod + def wait(self): + """To wait until the job run complete. + + Returns: returns until the job run complete. + + """ + raise NotImplemented diff --git a/nvflare/private/fed/app/job_launch/process_job_launch.py b/nvflare/private/fed/app/job_launch/process_job_launch.py new file mode 100644 index 0000000000..b48089ac55 --- /dev/null +++ b/nvflare/private/fed/app/job_launch/process_job_launch.py @@ -0,0 +1,93 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import logging +import os +import shlex +import subprocess +import sys + +from nvflare.private.fed.app.job_launch.job_launch_spec import JobLaunchSpec +from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path + + +class ProcessJobLaunch(JobLaunchSpec): + def __init__(self, job_id: str): + super().__init__() + self.job_id = job_id + + self.process = None + self.logger = logging.getLogger(self.__class__.__name__) + + def launch_job(self, + client, + startup, + job_id, + args, + app_custom_folder, + target: str, + scheme: str, + timeout=None) -> bool: + + new_env = os.environ.copy() + if app_custom_folder != "": + add_custom_dir_to_path(app_custom_folder, new_env) + + command_options = "" + for t in args.set: + command_options += " " + t + command = ( + f"{sys.executable} -m nvflare.private.fed.app.client.worker_process -m " + + args.workspace + + " -w " + + startup + + " -t " + + client.token + + " -d " + + client.ssid + + " -n " + + job_id + + " -c " + + client.client_name + + " -p " + + str(client.cell.get_internal_listener_url()) + + " -g " + + target + + " -scheme " + + scheme + + " -s fed_client.json " + " --set" + command_options + " print_conf=True" + ) + # use os.setsid to create new process group ID + self.process = subprocess.Popen(shlex.split(command, True), preexec_fn=os.setsid, env=new_env) + + self.logger.info("Worker child process ID: {}".format(self.process.pid)) + + def terminate(self): + try: + os.killpg(os.getpgid(self.process.pid), 9) + self.logger.debug("kill signal sent") + except: + pass + + self.process.terminate() + + def return_code(self): + if self.process: + return self.process.poll() + else: + return None + + def wait(self): + if self.process: + self.process.wait() diff --git a/nvflare/private/fed/client/client_engine.py b/nvflare/private/fed/client/client_engine.py index 6c4acb92e9..073772a779 100644 --- a/nvflare/private/fed/client/client_engine.py +++ b/nvflare/private/fed/client/client_engine.py @@ -32,7 +32,7 @@ from 
nvflare.security.logging import secure_format_exception, secure_log_traceback from .client_engine_internal_spec import ClientEngineInternalSpec -from .client_executor import ProcessExecutor +from .client_executor import JobExecutor from .client_run_manager import ClientRunInfo from .client_status import ClientStatus from .fed_client import FederatedClient @@ -62,7 +62,7 @@ def __init__(self, client: FederatedClient, args, rank, workers=5): self.client_name = client.client_name self.args = args self.rank = rank - self.client_executor = ProcessExecutor(client, os.path.join(args.workspace, "startup")) + self.client_executor = JobExecutor(client, os.path.join(args.workspace, "startup")) self.admin_agent = None self.fl_ctx_mgr = FLContextManager( diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index b93aaf637d..9b9cf40b21 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -122,7 +122,7 @@ def reset_errors(self, job_id): """ -class ProcessExecutor(ClientExecutor): +class JobExecutor(ClientExecutor): """Run the Client executor in a child process.""" def __init__(self, client, startup): From 4c15158479bc676d0c3d8c2dc56cf669e621b25f Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Mon, 14 Oct 2024 10:23:27 -0400 Subject: [PATCH 02/31] WIP: working ProcessJobLauncher impelentation. 
--- nvflare/apis/server_engine_spec.py | 4 +- ...ob_launch_spec.py => job_launcher_spec.py} | 18 +-- ...cess_job_launch.py => process_launcher.py} | 34 +++--- nvflare/private/fed/client/client_engine.py | 2 + .../fed/client/client_engine_internal_spec.py | 1 + nvflare/private/fed/client/client_executor.py | 113 ++++++++++-------- nvflare/private/fed/client/scheduler_cmds.py | 2 + nvflare/private/fed/server/job_runner.py | 2 +- nvflare/private/fed/server/server_engine.py | 5 +- .../job_schedulers/job_scheduler_test.py | 2 +- 10 files changed, 96 insertions(+), 87 deletions(-) rename nvflare/private/fed/app/job_launch/{job_launch_spec.py => job_launcher_spec.py} (78%) rename nvflare/private/fed/app/job_launch/{process_job_launch.py => process_launcher.py} (76%) diff --git a/nvflare/apis/server_engine_spec.py b/nvflare/apis/server_engine_spec.py index 9f8c9a568b..6950b9d160 100644 --- a/nvflare/apis/server_engine_spec.py +++ b/nvflare/apis/server_engine_spec.py @@ -203,12 +203,12 @@ def restore_components(self, snapshot: RunSnapshot, fl_ctx: FLContext): pass @abstractmethod - def start_client_job(self, job_id, client_sites, fl_ctx: FLContext): + def start_client_job(self, job, client_sites, fl_ctx: FLContext): """To send the start client run commands to the clients Args: client_sites: client sites - job_id: job_id + job: job object fl_ctx: FLContext Returns: diff --git a/nvflare/private/fed/app/job_launch/job_launch_spec.py b/nvflare/private/fed/app/job_launch/job_launcher_spec.py similarity index 78% rename from nvflare/private/fed/app/job_launch/job_launch_spec.py rename to nvflare/private/fed/app/job_launch/job_launcher_spec.py index 0a150d9f46..9799870176 100644 --- a/nvflare/private/fed/app/job_launch/job_launch_spec.py +++ b/nvflare/private/fed/app/job_launch/job_launcher_spec.py @@ -13,20 +13,12 @@ # limitations under the License. 
from abc import abstractmethod -from nvflare.apis.resource_manager_spec import ResourceManagerSpec - -class JobLaunchSpec: +class JobLauncherSpec: @abstractmethod - def launch_job(self, - client, - startup, - job_id, - args, - app_custom_folder, - target: str, - scheme: str, - timeout=None) -> bool: + def launch_job( + self, client, startup, job_id, args, app_custom_folder, target: str, scheme: str, timeout=None + ) -> bool: """To launch a job run. Args: @@ -48,7 +40,7 @@ def terminate(self): raise NotImplemented @abstractmethod - def return_code(self): + def poll(self): """To get the return code of the job run. Returns: return_code diff --git a/nvflare/private/fed/app/job_launch/process_job_launch.py b/nvflare/private/fed/app/job_launch/process_launcher.py similarity index 76% rename from nvflare/private/fed/app/job_launch/process_job_launch.py rename to nvflare/private/fed/app/job_launch/process_launcher.py index b48089ac55..8d14faca38 100644 --- a/nvflare/private/fed/app/job_launch/process_job_launch.py +++ b/nvflare/private/fed/app/job_launch/process_launcher.py @@ -17,27 +17,20 @@ import subprocess import sys -from nvflare.private.fed.app.job_launch.job_launch_spec import JobLaunchSpec +from nvflare.private.fed.app.job_launch.job_launcher_spec import JobLauncherSpec from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path -class ProcessJobLaunch(JobLaunchSpec): - def __init__(self, job_id: str): +class ProcessJobLauncher(JobLauncherSpec): + def __init__(self): super().__init__() - self.job_id = job_id self.process = None self.logger = logging.getLogger(self.__class__.__name__) - def launch_job(self, - client, - startup, - job_id, - args, - app_custom_folder, - target: str, - scheme: str, - timeout=None) -> bool: + def launch_job( + self, client, startup, job_id, args, app_custom_folder, target: str, scheme: str, timeout=None + ) -> bool: new_env = os.environ.copy() if app_custom_folder != "": @@ -74,15 +67,16 @@ def launch_job(self, 
self.logger.info("Worker child process ID: {}".format(self.process.pid)) def terminate(self): - try: - os.killpg(os.getpgid(self.process.pid), 9) - self.logger.debug("kill signal sent") - except: - pass + if self.process: + try: + os.killpg(os.getpgid(self.process.pid), 9) + self.logger.debug("kill signal sent") + except: + pass - self.process.terminate() + self.process.terminate() - def return_code(self): + def poll(self): if self.process: return self.process.poll() else: diff --git a/nvflare/private/fed/client/client_engine.py b/nvflare/private/fed/client/client_engine.py index 073772a779..5315959e34 100644 --- a/nvflare/private/fed/client/client_engine.py +++ b/nvflare/private/fed/client/client_engine.py @@ -134,6 +134,7 @@ def get_engine_status(self): def start_app( self, job_id: str, + job_meta: dict, allocated_resource: dict = None, token: str = None, resource_manager=None, @@ -164,6 +165,7 @@ def start_app( self.client_executor.start_app( self.client, job_id, + job_meta, self.args, app_custom_folder, allocated_resource, diff --git a/nvflare/private/fed/client/client_engine_internal_spec.py b/nvflare/private/fed/client/client_engine_internal_spec.py index bd1e852cdc..95008fb7b1 100644 --- a/nvflare/private/fed/client/client_engine_internal_spec.py +++ b/nvflare/private/fed/client/client_engine_internal_spec.py @@ -53,6 +53,7 @@ def deploy_app(self, app_name: str, job_id: str, job_meta: dict, client_name: st def start_app( self, job_id: str, + job_meta: dict, allocated_resource: dict = None, token: str = None, resource_manager=None, diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 9b9cf40b21..4d337e042e 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -31,6 +31,7 @@ from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path, get_return_code from nvflare.security.logging import secure_format_exception, secure_log_traceback 
+from ..app.job_launch.process_launcher import ProcessJobLauncher from .client_status import ClientStatus, get_status_message @@ -40,6 +41,7 @@ def start_app( self, client, job_id, + job_meta, args, app_custom_folder, allocated_resource, @@ -145,6 +147,7 @@ def start_app( self, client, job_id, + job_meta, args, app_custom_folder, allocated_resource, @@ -158,6 +161,7 @@ def start_app( Args: client: the FL client object job_id: the job_id + job_meta: job meta data args: admin command arguments for starting the worker process app_custom_folder: FL application custom folder allocated_resource: allocated resources @@ -166,45 +170,47 @@ def start_app( target: SP target location scheme: SP connection scheme """ - new_env = os.environ.copy() - if app_custom_folder != "": - add_custom_dir_to_path(app_custom_folder, new_env) - - command_options = "" - for t in args.set: - command_options += " " + t - command = ( - f"{sys.executable} -m nvflare.private.fed.app.client.worker_process -m " - + args.workspace - + " -w " - + self.startup - + " -t " - + client.token - + " -d " - + client.ssid - + " -n " - + job_id - + " -c " - + client.client_name - + " -p " - + str(client.cell.get_internal_listener_url()) - + " -g " - + target - + " -scheme " - + scheme - + " -s fed_client.json " - " --set" + command_options + " print_conf=True" - ) - # use os.setsid to create new process group ID - process = subprocess.Popen(shlex.split(command, True), preexec_fn=os.setsid, env=new_env) - - self.logger.info("Worker child process ID: {}".format(process.pid)) + # new_env = os.environ.copy() + # if app_custom_folder != "": + # add_custom_dir_to_path(app_custom_folder, new_env) + # + # command_options = "" + # for t in args.set: + # command_options += " " + t + # command = ( + # f"{sys.executable} -m nvflare.private.fed.app.client.worker_process -m " + # + args.workspace + # + " -w " + # + self.startup + # + " -t " + # + client.token + # + " -d " + # + client.ssid + # + " -n " + # + job_id + # + " -c 
" + # + client.client_name + # + " -p " + # + str(client.cell.get_internal_listener_url()) + # + " -g " + # + target + # + " -scheme " + # + scheme + # + " -s fed_client.json " + # " --set" + command_options + " print_conf=True" + # ) + # # use os.setsid to create new process group ID + # process = subprocess.Popen(shlex.split(command, True), preexec_fn=os.setsid, env=new_env) + # + # self.logger.info("Worker child process ID: {}".format(process.pid)) + job_launcher = self._get_job_launcher(client, job_meta) + job_launcher.launch_job(client, self.startup, job_id, args, app_custom_folder, target, scheme) client.multi_gpu = False with self.lock: self.run_processes[job_id] = { - RunProcessKey.CHILD_PROCESS: process, + RunProcessKey.CHILD_PROCESS: job_launcher, RunProcessKey.STATUS: ClientStatus.STARTING, } @@ -214,6 +220,17 @@ def start_app( ) thread.start() + def _get_job_launcher(self, client, job_meta: dict): + launcher = None + launcher_map = job_meta.get("launcher_map") + for launcher_id, sites in launcher_map.items(): + if client.client_name in sites: + engine = client.engine + launcher = engine.get_component(launcher_id) + if not launcher: + launcher = ProcessJobLauncher() + return launcher + def notify_job_status(self, job_id, job_status): run_process = self.run_processes.get(job_id) if run_process: @@ -336,7 +353,7 @@ def abort_app(self, job_id): if process_status == ClientStatus.STARTED: try: with self.lock: - child_process = self.run_processes[job_id][RunProcessKey.CHILD_PROCESS] + job_launcher = self.run_processes[job_id][RunProcessKey.CHILD_PROCESS] data = {} fqcn = FQCN.join([self.client.client_name, job_id]) request = new_cell_message({}, data) @@ -348,7 +365,7 @@ def abort_app(self, job_id): optional=True, ) self.logger.debug("abort sent to worker") - t = threading.Thread(target=self._terminate_process, args=[child_process, job_id]) + t = threading.Thread(target=self._terminate_process, args=[job_launcher, job_id]) t.start() t.join() break @@ -383,14 
+400,14 @@ def _terminate_process(self, child_process, job_id): time.sleep(0.05) # we want to quickly check - # kill the sub-process group directly - if not done: - self.logger.debug(f"still not done after {max_wait} secs") - try: - os.killpg(os.getpgid(child_process.pid), 9) - self.logger.debug("kill signal sent") - except: - pass + # # kill the sub-process group directly + # if not done: + # self.logger.debug(f"still not done after {max_wait} secs") + # try: + # os.killpg(os.getpgid(child_process.pid), 9) + # self.logger.debug("kill signal sent") + # except: + # pass child_process.terminate() self.logger.info(f"run ({job_id}): child worker process terminated") @@ -417,11 +434,11 @@ def abort_task(self, job_id): def _wait_child_process_finish(self, client, job_id, allocated_resource, token, resource_manager, workspace): self.logger.info(f"run ({job_id}): waiting for child worker process to finish.") - child_process = self.run_processes.get(job_id, {}).get(RunProcessKey.CHILD_PROCESS) - if child_process: - child_process.wait() + job_launcher = self.run_processes.get(job_id, {}).get(RunProcessKey.CHILD_PROCESS) + if job_launcher: + job_launcher.wait() - return_code = get_return_code(child_process, job_id, workspace, self.logger) + return_code = get_return_code(job_launcher, job_id, workspace, self.logger) self.logger.info(f"run ({job_id}): child worker process finished with RC {return_code}") if return_code in [ProcessExitCode.UNSAFE_COMPONENT, ProcessExitCode.CONFIG_ERROR]: diff --git a/nvflare/private/fed/client/scheduler_cmds.py b/nvflare/private/fed/client/scheduler_cmds.py index 7ea2e8d55b..ef47354365 100644 --- a/nvflare/private/fed/client/scheduler_cmds.py +++ b/nvflare/private/fed/client/scheduler_cmds.py @@ -101,6 +101,7 @@ def process(self, req: Message, app_ctx) -> Message: try: resource_spec = req.body job_id = req.get_header(RequestHeader.JOB_ID) + job_meta = req.get_header(RequestHeader.JOB_META) token = 
req.get_header(ShareableHeader.RESOURCE_RESERVE_TOKEN) except Exception as e: msg = f"{ERROR_MSG_PREFIX}: Start job execution exception, missing required information: {secure_format_exception(e)}." @@ -116,6 +117,7 @@ def process(self, req: Message, app_ctx) -> Message: resource_consumer.consume(allocated_resources) result = engine.start_app( job_id, + job_meta=job_meta, allocated_resource=allocated_resources, token=token, resource_manager=resource_manager, diff --git a/nvflare/private/fed/server/job_runner.py b/nvflare/private/fed/server/job_runner.py index d6bf91e888..cac4b30c57 100644 --- a/nvflare/private/fed/server/job_runner.py +++ b/nvflare/private/fed/server/job_runner.py @@ -249,7 +249,7 @@ def _start_run(self, job_id: str, job: Job, client_sites: Dict[str, DispatchInfo if err: raise RuntimeError(f"Could not start the server App for job: {job_id}.") - replies = engine.start_client_job(job_id, client_sites, fl_ctx) + replies = engine.start_client_job(job, client_sites, fl_ctx) client_sites_names = list(client_sites.keys()) check_client_replies(replies=replies, client_sites=client_sites_names, command=f"start job ({job_id})") display_sites = ",".join(client_sites_names) diff --git a/nvflare/private/fed/server/server_engine.py b/nvflare/private/fed/server/server_engine.py index d59128cd81..68e2b1416e 100644 --- a/nvflare/private/fed/server/server_engine.py +++ b/nvflare/private/fed/server/server_engine.py @@ -832,13 +832,14 @@ def cancel_client_resources( if requests: _ = self._send_admin_requests(requests, fl_ctx) - def start_client_job(self, job_id, client_sites, fl_ctx: FLContext): + def start_client_job(self, job, client_sites, fl_ctx: FLContext): requests = {} for site, dispatch_info in client_sites.items(): resource_requirement = dispatch_info.resource_requirements token = dispatch_info.token request = Message(topic=TrainingTopic.START_JOB, body=resource_requirement) - request.set_header(RequestHeader.JOB_ID, job_id) + 
request.set_header(RequestHeader.JOB_ID, job.job_id) + request.set_header(RequestHeader.JOB_META, job.meta) request.set_header(ShareableHeader.RESOURCE_RESERVE_TOKEN, token) client = self.get_client_from_name(site) if client: diff --git a/tests/unit_test/app_common/job_schedulers/job_scheduler_test.py b/tests/unit_test/app_common/job_schedulers/job_scheduler_test.py index 54c6fc49b7..8e0520053c 100644 --- a/tests/unit_test/app_common/job_schedulers/job_scheduler_test.py +++ b/tests/unit_test/app_common/job_schedulers/job_scheduler_test.py @@ -134,7 +134,7 @@ def persist_components(self, fl_ctx: FLContext, completed: bool): def restore_components(self, snapshot, fl_ctx: FLContext): pass - def start_client_job(self, job_id, client_sites, fl_ctx: FLContext): + def start_client_job(self, job, client_sites, fl_ctx: FLContext): pass def check_client_resources( From 610f4609e434cd98742451bc8004740d232cfef1 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Mon, 14 Oct 2024 14:25:54 -0400 Subject: [PATCH 03/31] Added k8s_launcher implementation. --- nvflare/app_opt/job_launcher/__init__.py | 13 + .../job_launcher}/job_launcher_spec.py | 0 nvflare/app_opt/job_launcher/k8s_launcher.py | 253 ++++++++++++++++++ .../job_launcher}/process_launcher.py | 2 +- nvflare/private/fed/client/client_executor.py | 41 +-- 5 files changed, 269 insertions(+), 40 deletions(-) create mode 100644 nvflare/app_opt/job_launcher/__init__.py rename nvflare/{private/fed/app/job_launch => app_opt/job_launcher}/job_launcher_spec.py (100%) create mode 100644 nvflare/app_opt/job_launcher/k8s_launcher.py rename nvflare/{private/fed/app/job_launch => app_opt/job_launcher}/process_launcher.py (97%) diff --git a/nvflare/app_opt/job_launcher/__init__.py b/nvflare/app_opt/job_launcher/__init__.py new file mode 100644 index 0000000000..d9155f923f --- /dev/null +++ b/nvflare/app_opt/job_launcher/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nvflare/private/fed/app/job_launch/job_launcher_spec.py b/nvflare/app_opt/job_launcher/job_launcher_spec.py similarity index 100% rename from nvflare/private/fed/app/job_launch/job_launcher_spec.py rename to nvflare/app_opt/job_launcher/job_launcher_spec.py diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py new file mode 100644 index 0000000000..081af51909 --- /dev/null +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -0,0 +1,253 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import time +from enum import Enum + +from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec + +from kubernetes import config +from kubernetes.client import Configuration +from kubernetes.client.api import core_v1_api +from kubernetes.client.rest import ApiException + + +class JobState(Enum): + STARTING = "starting" + RUNNING = "running" + TERMINATED = "terminated" + SUCCEEDED = "succeeded" + UNKNOWN = "unknown" + + +POD_STATE_MAPPING = { + "Pending": JobState.STARTING, + "Running": JobState.RUNNING, + "Succeeded": JobState.SUCCEEDED, + "Failed": JobState.TERMINATED, + "Unknown": JobState.UNKNOWN + } + + +class K8sJobHandle: + def __init__(self, job_id: str, api_instance: core_v1_api, job_config: dict, namespace='default'): + super().__init__() + self.job_id = job_id + + self.api_instance = api_instance + self.namespace = namespace + self.pod_manifest = { + 'apiVersion': 'v1', + 'kind': 'Pod', + 'metadata': { + 'name': None # set by job_config['name'] + }, + 'spec': { + 'containers': None, # link to container_list + 'volumes': None # link to volume_list + } + } + self.volume_list = [ + { + 'name': None, + 'hostPath': { + 'path': None, + 'type': 'Directory' + } + } + ] + self.container_list = [ + { + 'image': None, + 'name': None, + 'command': ['/usr/local/bin/python'], + 'args': None, # args_list + args_dict + args_sets + 'volumeMounts': None, # volume_mount_list + 'imagePullPolicy': 'Always' + } + ] + self.container_args_python_args_list = [ + '-u', '-m', 'nvflare.private.fed.app.client.worker_process' + ] + self.container_args_module_args_dict = { + '-m': None, + '-w': None, + '-t': None, + '-d': None, + '-n': None, + '-c': None, + '-p': None, + '-g': None, + '-scheme': None, + '-s': None, + } + self.container_volume_mount_list = [ + { + 'name': None, + 'mountPath': None, + } + ] + self._make_manifest(job_config) + + def _make_manifest(self, job_config): + self.container_volume_mount_list = \ + job_config.get('volume_mount_list', + [{'name': 
'workspace-nvflare', 'mountPath': '/workspace/nvflare'}] + ) + set_list = job_config.get('set_list') + if set_list is None: + self.container_args_module_args_sets = list() + else: + self.container_args_module_args_sets = ['--set'] + set_list + self.container_args_module_args_dict = \ + job_config.get('module_args', + { + '-m': None, + '-w': None, + '-t': None, + '-d': None, + '-n': None, + '-c': None, + '-p': None, + '-g': None, + '-scheme': None, + '-s': None + } + ) + self.container_args_module_args_dict_as_list = list() + for k, v in self.container_args_module_args_dict.items(): + self.container_args_module_args_dict_as_list.append(k) + self.container_args_module_args_dict_as_list.append(v) + self.volume_list = \ + job_config.get('volume_list', + [{ + 'name': None, + 'hostPath': { + 'path': None, + 'type': 'Directory' + } + }] + ) + + self.pod_manifest['metadata']['name'] = job_config.get('name') + self.pod_manifest['spec']['containers'] = self.container_list + self.pod_manifest['spec']['volumes'] = self.volume_list + + self.container_list[0]['image'] = job_config.get('image', 'nvflare/nvflare:2.5.0') + self.container_list[0]['name'] = job_config.get('container_name', 'nvflare_job') + self.container_list[0]['args'] = \ + self.container_args_python_args_list + \ + self.container_args_module_args_dict_as_list + \ + self.container_args_module_args_sets + self.container_list[0]['volumeMounts'] = self.container_volume_mount_list + + def get_manifest(self): + return self.pod_manifest + + def abort(self, timeout=None): + resp = self.api_instance.delete_namespaced_pod(name=self.job_id, namespace=self.namespace, grace_period_seconds=0) + return self.enter_states([JobState.TERMINATED], timeout=timeout) + + def get_state(self): + try: + resp = self.api_instance.read_namespaced_pod(name=self.job_id, namespace=self.namespace) + except ApiException as e: + return JobState.UNKNOWN + return POD_STATE_MAPPING.get(resp.status.phase, JobState.UNKNOWN) + + def enter_states(self, 
job_states_to_enter: list, timeout=None): + starting_time = time.time() + if not isinstance(job_states_to_enter, (list, tuple)): + job_states_to_enter = [job_states_to_enter] + if not all([isinstance(js, JobState)] for js in job_states_to_enter): + raise ValueError(f"expect job_states_to_enter with valid values, but get {job_states_to_enter}") + while True: + job_state = self.get_state() + if job_state in job_states_to_enter: + return True + elif timeout is not None and time.time()-starting_time>timeout: + return False + time.sleep(1) + + +class K8sJobLauncher(JobLauncherSpec): + def __init__(self, config_file_path, namespace='default'): + super().__init__() + + config.load_kube_config(config_file_path) + try: + c = Configuration().get_default_copy() + except AttributeError: + c = Configuration() + c.assert_hostname = False + Configuration.set_default(c) + self.core_v1 = core_v1_api.CoreV1Api() + self.namespace = namespace + + self.job_handle = None + + def launch_job(self, client, startup, job_id, args, app_custom_folder, target: str, scheme: str, + timeout=None) -> bool: + + root_hostpath = "/home/azureuser/wksp/k2k/disk" + job_image = "localhost:32000/nvfl-k8s:0.0.1" + job_config = { + "name": job_id, + "image": job_image, + "container_name": f"container-{job_id}", + "volume_mount_list": [{'name':'workspace-nvflare', 'mountPath': '/workspace/nvflare'}], + "volume_list": [{ + 'name': "workspace-nvflare", + 'hostPath': { + 'path': root_hostpath, + 'type': 'Directory' + } + }], + "module_args": { + '-m': args.workspace, + '-w': startup, + '-t': client.token, + '-d': client.ssid, + '-n': job_id, + '-c': client.client_name, + '-p': "tcp://parent-pod:8004", + '-g': target, + '-scheme': scheme, + '-s': "fed_client.json" + }, + "set_list": args.set + } + + self.job_handle = K8sJobHandle(job_id, self.core_v1, job_config, namespace=self.namespace) + try: + self.core_v1.create_namespaced_pod(body=self.job_handle.get_manifest(), namespace=self.namespace) + if 
self.job_handle.enter_states([JobState.RUNNING], timeout=timeout): + return True + else: + return False + except ApiException as e: + return False + + def terminate(self): + if self.job_handle: + self.job_handle.abort() + + def poll(self): + if self.job_handle: + return self.job_handle.get_state() + else: + return JobState.UNKNOWN + + def wait(self): + if self.job_handle: + self.job_handle.enter_states([JobState.SUCCEEDED, JobState.TERMINATED]) diff --git a/nvflare/private/fed/app/job_launch/process_launcher.py b/nvflare/app_opt/job_launcher/process_launcher.py similarity index 97% rename from nvflare/private/fed/app/job_launch/process_launcher.py rename to nvflare/app_opt/job_launcher/process_launcher.py index 8d14faca38..055c65a76e 100644 --- a/nvflare/private/fed/app/job_launch/process_launcher.py +++ b/nvflare/app_opt/job_launcher/process_launcher.py @@ -17,7 +17,7 @@ import subprocess import sys -from nvflare.private.fed.app.job_launch.job_launcher_spec import JobLauncherSpec +from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 4d337e042e..0131293260 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -13,10 +13,6 @@ # limitations under the License. 
import logging -import os -import shlex -import subprocess -import sys import threading import time from abc import ABC, abstractmethod @@ -28,10 +24,10 @@ from nvflare.fuel.f3.cellnet.defs import MessageHeaderKey, ReturnCode from nvflare.fuel.utils.config_service import ConfigService from nvflare.private.defs import CellChannel, CellChannelTopic, JobFailureMsgKey, new_cell_message -from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path, get_return_code +from nvflare.private.fed.utils.fed_utils import get_return_code from nvflare.security.logging import secure_format_exception, secure_log_traceback -from ..app.job_launch.process_launcher import ProcessJobLauncher +from nvflare.app_opt.job_launcher.process_launcher import ProcessJobLauncher from .client_status import ClientStatus, get_status_message @@ -170,39 +166,6 @@ def start_app( target: SP target location scheme: SP connection scheme """ - # new_env = os.environ.copy() - # if app_custom_folder != "": - # add_custom_dir_to_path(app_custom_folder, new_env) - # - # command_options = "" - # for t in args.set: - # command_options += " " + t - # command = ( - # f"{sys.executable} -m nvflare.private.fed.app.client.worker_process -m " - # + args.workspace - # + " -w " - # + self.startup - # + " -t " - # + client.token - # + " -d " - # + client.ssid - # + " -n " - # + job_id - # + " -c " - # + client.client_name - # + " -p " - # + str(client.cell.get_internal_listener_url()) - # + " -g " - # + target - # + " -scheme " - # + scheme - # + " -s fed_client.json " - # " --set" + command_options + " print_conf=True" - # ) - # # use os.setsid to create new process group ID - # process = subprocess.Popen(shlex.split(command, True), preexec_fn=os.setsid, env=new_env) - # - # self.logger.info("Worker child process ID: {}".format(process.pid)) job_launcher = self._get_job_launcher(client, job_meta) job_launcher.launch_job(client, self.startup, job_id, args, app_custom_folder, target, scheme) From 
048df0dcc4c9c482aa8b72bd8de429f3eb272dd0 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Mon, 14 Oct 2024 14:35:19 -0400 Subject: [PATCH 04/31] Added logger for k8s_launcher.py --- nvflare/app_opt/job_launcher/k8s_launcher.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index 081af51909..8df70ff26c 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import logging import time from enum import Enum @@ -195,6 +196,7 @@ def __init__(self, config_file_path, namespace='default'): self.namespace = namespace self.job_handle = None + self.logger = logging.getLogger(self.__class__.__name__) def launch_job(self, client, startup, job_id, args, app_custom_folder, target: str, scheme: str, timeout=None) -> bool: @@ -228,6 +230,8 @@ def launch_job(self, client, startup, job_id, args, app_custom_folder, target: s "set_list": args.set } + self.logger.info(f"launch job with k8s_launcher. Job_id:{job_id}") + self.job_handle = K8sJobHandle(job_id, self.core_v1, job_config, namespace=self.namespace) try: self.core_v1.create_namespaced_pod(body=self.job_handle.get_manifest(), namespace=self.namespace) From 4296f96948cbb23d2301c7d7ffb41fb6afa2ef33 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Mon, 14 Oct 2024 15:03:00 -0400 Subject: [PATCH 05/31] Added empty launcher_map check. 
--- nvflare/private/fed/client/client_executor.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 0131293260..f34d6fb381 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -186,10 +186,11 @@ def start_app( def _get_job_launcher(self, client, job_meta: dict): launcher = None launcher_map = job_meta.get("launcher_map") - for launcher_id, sites in launcher_map.items(): - if client.client_name in sites: - engine = client.engine - launcher = engine.get_component(launcher_id) + if launcher_map: + for launcher_id, sites in launcher_map.items(): + if client.client_name in sites: + engine = client.engine + launcher = engine.get_component(launcher_id) if not launcher: launcher = ProcessJobLauncher() return launcher From 0dfae5ee6fa8d003511b68243d25fe7d89a20c4b Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Wed, 16 Oct 2024 10:33:03 -0400 Subject: [PATCH 06/31] Added more config for K8sJobLauncher. 
--- nvflare/app_opt/job_launcher/k8s_launcher.py | 28 ++++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index 8df70ff26c..9c916b4ccd 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -55,7 +55,8 @@ def __init__(self, job_id: str, api_instance: core_v1_api, job_config: dict, nam }, 'spec': { 'containers': None, # link to container_list - 'volumes': None # link to volume_list + 'volumes': None, # link to volume_list + 'restartPolicy': 'OnFailure' } } self.volume_list = [ @@ -182,9 +183,19 @@ def enter_states(self, job_states_to_enter: list, timeout=None): class K8sJobLauncher(JobLauncherSpec): - def __init__(self, config_file_path, namespace='default'): + def __init__(self, config_file_path, + root_hostpath: str, + job_image: str, + workspace: str, + mount_path: str, + namespace='default'): super().__init__() + self.root_hostpath = root_hostpath + self.job_image = job_image + self.workspace = workspace + self.mount_path = mount_path + config.load_kube_config(config_file_path) try: c = Configuration().get_default_copy() @@ -201,17 +212,18 @@ def __init__(self, config_file_path, namespace='default'): def launch_job(self, client, startup, job_id, args, app_custom_folder, target: str, scheme: str, timeout=None) -> bool: - root_hostpath = "/home/azureuser/wksp/k2k/disk" - job_image = "localhost:32000/nvfl-k8s:0.0.1" + # root_hostpath = "/home/azureuser/wksp/k2k/disk" + # job_image = "localhost:32000/nvfl-k8s:0.0.1" job_config = { "name": job_id, - "image": job_image, + "image": self.job_image, "container_name": f"container-{job_id}", - "volume_mount_list": [{'name':'workspace-nvflare', 'mountPath': '/workspace/nvflare'}], + # "volume_mount_list": [{'name':'workspace-nvflare', 'mountPath': '/workspace/nvflare'}], + "volume_mount_list": [{'name': self.workspace, 'mountPath': 
self.mount_path}], "volume_list": [{ - 'name': "workspace-nvflare", + 'name': self.workspace, 'hostPath': { - 'path': root_hostpath, + 'path': self.root_hostpath, 'type': 'Directory' } }], From 044b7695b69b37fe7e54e504f46ff474c279efb8 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Wed, 16 Oct 2024 11:47:35 -0400 Subject: [PATCH 07/31] renamed RunProcessKey.JOB_LAUNCHER. --- nvflare/apis/fl_constant.py | 2 +- nvflare/private/fed/app/simulator/simulator_runner.py | 2 +- nvflare/private/fed/client/client_executor.py | 6 +++--- nvflare/private/fed/server/server_engine.py | 4 ++-- nvflare/private/fed/simulator/simulator_client_engine.py | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/nvflare/apis/fl_constant.py b/nvflare/apis/fl_constant.py index b955b4435e..50b8865a8a 100644 --- a/nvflare/apis/fl_constant.py +++ b/nvflare/apis/fl_constant.py @@ -321,7 +321,7 @@ class SnapshotKey(object): class RunProcessKey(object): LISTEN_PORT = "_listen_port" CONNECTION = "_conn" - CHILD_PROCESS = "_child_process" + JOB_LAUNCHER = "_job_launcher" STATUS = "_status" JOB_ID = "_job_id" PARTICIPANTS = "_participants" diff --git a/nvflare/private/fed/app/simulator/simulator_runner.py b/nvflare/private/fed/app/simulator/simulator_runner.py index 1d19fac603..d3c1d49fd8 100644 --- a/nvflare/private/fed/app/simulator/simulator_runner.py +++ b/nvflare/private/fed/app/simulator/simulator_runner.py @@ -450,7 +450,7 @@ def simulator_run_main(self): try: self.create_clients() self.server.engine.run_processes[SimulatorConstants.JOB_NAME] = { - RunProcessKey.CHILD_PROCESS: None, + RunProcessKey.JOB_LAUNCHER: None, RunProcessKey.JOB_ID: SimulatorConstants.JOB_NAME, RunProcessKey.PARTICIPANTS: self.server.engine.client_manager.clients, } diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index f34d6fb381..febfc58ef5 100644 --- a/nvflare/private/fed/client/client_executor.py +++ 
b/nvflare/private/fed/client/client_executor.py @@ -173,7 +173,7 @@ def start_app( with self.lock: self.run_processes[job_id] = { - RunProcessKey.CHILD_PROCESS: job_launcher, + RunProcessKey.JOB_LAUNCHER: job_launcher, RunProcessKey.STATUS: ClientStatus.STARTING, } @@ -317,7 +317,7 @@ def abort_app(self, job_id): if process_status == ClientStatus.STARTED: try: with self.lock: - job_launcher = self.run_processes[job_id][RunProcessKey.CHILD_PROCESS] + job_launcher = self.run_processes[job_id][RunProcessKey.JOB_LAUNCHER] data = {} fqcn = FQCN.join([self.client.client_name, job_id]) request = new_cell_message({}, data) @@ -398,7 +398,7 @@ def abort_task(self, job_id): def _wait_child_process_finish(self, client, job_id, allocated_resource, token, resource_manager, workspace): self.logger.info(f"run ({job_id}): waiting for child worker process to finish.") - job_launcher = self.run_processes.get(job_id, {}).get(RunProcessKey.CHILD_PROCESS) + job_launcher = self.run_processes.get(job_id, {}).get(RunProcessKey.JOB_LAUNCHER) if job_launcher: job_launcher.wait() diff --git a/nvflare/private/fed/server/server_engine.py b/nvflare/private/fed/server/server_engine.py index 68e2b1416e..0af06daf85 100644 --- a/nvflare/private/fed/server/server_engine.py +++ b/nvflare/private/fed/server/server_engine.py @@ -284,7 +284,7 @@ def _start_runner_process( with self.lock: self.run_processes[run_number] = { - RunProcessKey.CHILD_PROCESS: process, + RunProcessKey.JOB_LAUNCHER: process, RunProcessKey.JOB_ID: job_id, RunProcessKey.PARTICIPANTS: job_clients, } @@ -332,7 +332,7 @@ def abort_app_on_server(self, job_id: str, turn_to_cold: bool = False) -> str: self.logger.info(f"Abort server status: {status_message}") except Exception: with self.lock: - child_process = self.run_processes.get(job_id, {}).get(RunProcessKey.CHILD_PROCESS, None) + child_process = self.run_processes.get(job_id, {}).get(RunProcessKey.JOB_LAUNCHER, None) if child_process: child_process.terminate() finally: diff --git 
a/nvflare/private/fed/simulator/simulator_client_engine.py b/nvflare/private/fed/simulator/simulator_client_engine.py index fd0a3969c1..cdb4ac3582 100644 --- a/nvflare/private/fed/simulator/simulator_client_engine.py +++ b/nvflare/private/fed/simulator/simulator_client_engine.py @@ -25,7 +25,7 @@ def __init__(self, client, args, rank=0): fl_ctx.set_prop(FLContextKey.SIMULATE_MODE, True, private=True, sticky=True) self.client_executor.run_processes[SimulatorConstants.JOB_NAME] = { - RunProcessKey.CHILD_PROCESS: None, + RunProcessKey.JOB_LAUNCHER: None, RunProcessKey.STATUS: ClientStatus.STARTED, } From 02c71e729ac63998ddebfa0f1d6901147db56ac2 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Thu, 17 Oct 2024 11:40:08 -0400 Subject: [PATCH 08/31] Separated out the JobHandleSpec. --- nvflare/apis/fl_constant.py | 2 +- .../app_opt/job_launcher/job_launcher_spec.py | 37 +++++------ nvflare/app_opt/job_launcher/k8s_launcher.py | 61 ++++++++----------- .../app_opt/job_launcher/process_launcher.py | 56 ++++++++++------- .../fed/app/simulator/simulator_runner.py | 2 +- nvflare/private/fed/client/client_executor.py | 13 ++-- nvflare/private/fed/server/server_engine.py | 4 +- .../fed/simulator/simulator_client_engine.py | 2 +- 8 files changed, 92 insertions(+), 85 deletions(-) diff --git a/nvflare/apis/fl_constant.py b/nvflare/apis/fl_constant.py index 50b8865a8a..2021a7fcc1 100644 --- a/nvflare/apis/fl_constant.py +++ b/nvflare/apis/fl_constant.py @@ -321,7 +321,7 @@ class SnapshotKey(object): class RunProcessKey(object): LISTEN_PORT = "_listen_port" CONNECTION = "_conn" - JOB_LAUNCHER = "_job_launcher" + JOB_HANDLE = "_job_launcher" STATUS = "_status" JOB_ID = "_job_id" PARTICIPANTS = "_participants" diff --git a/nvflare/app_opt/job_launcher/job_launcher_spec.py b/nvflare/app_opt/job_launcher/job_launcher_spec.py index 9799870176..48ddfaae8f 100644 --- a/nvflare/app_opt/job_launcher/job_launcher_spec.py +++ b/nvflare/app_opt/job_launcher/job_launcher_spec.py @@ -14,24 +14,9 
@@ from abc import abstractmethod -class JobLauncherSpec: - @abstractmethod - def launch_job( - self, client, startup, job_id, args, app_custom_folder, target: str, scheme: str, timeout=None - ) -> bool: - """To launch a job run. - - Args: - timeout: the job needs to be started within this timeout. Otherwise failed the job launch. - None means no timeout limit. - - Returns: boolean to indicates the job launch success or fail. - - """ - raise NotImplemented - +class JobHandleSpec: @abstractmethod - def terminate(self): + def terminate(self, timeout=None): """To terminate the job run. Returns: the job run return code. @@ -56,3 +41,21 @@ def wait(self): """ raise NotImplemented + + +class JobLauncherSpec: + @abstractmethod + def launch_job( + self, client, startup, job_id, args, app_custom_folder, target: str, scheme: str, timeout=None + ) -> JobHandleSpec: + """To launch a job run. + + Args: + timeout: the job needs to be started within this timeout. Otherwise failed the job launch. + None means no timeout limit. + + Returns: boolean to indicates the job launch success or fail. 
+ + """ + raise NotImplemented + diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index 9c916b4ccd..4cc941508c 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -15,7 +15,7 @@ import time from enum import Enum -from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec +from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec, JobHandleSpec from kubernetes import config from kubernetes.client import Configuration @@ -40,7 +40,7 @@ class JobState(Enum): } -class K8sJobHandle: +class K8sJobHandle(JobHandleSpec): def __init__(self, job_id: str, api_instance: core_v1_api, job_config: dict, namespace='default'): super().__init__() self.job_id = job_id @@ -156,17 +156,6 @@ def _make_manifest(self, job_config): def get_manifest(self): return self.pod_manifest - def abort(self, timeout=None): - resp = self.api_instance.delete_namespaced_pod(name=self.job_id, namespace=self.namespace, grace_period_seconds=0) - return self.enter_states([JobState.TERMINATED], timeout=timeout) - - def get_state(self): - try: - resp = self.api_instance.read_namespaced_pod(name=self.job_id, namespace=self.namespace) - except ApiException as e: - return JobState.UNKNOWN - return POD_STATE_MAPPING.get(resp.status.phase, JobState.UNKNOWN) - def enter_states(self, job_states_to_enter: list, timeout=None): starting_time = time.time() if not isinstance(job_states_to_enter, (list, tuple)): @@ -174,13 +163,28 @@ def enter_states(self, job_states_to_enter: list, timeout=None): if not all([isinstance(js, JobState)] for js in job_states_to_enter): raise ValueError(f"expect job_states_to_enter with valid values, but get {job_states_to_enter}") while True: - job_state = self.get_state() + job_state = self.poll() if job_state in job_states_to_enter: return True elif timeout is not None and time.time()-starting_time>timeout: return False time.sleep(1) + def 
terminate(self, timeout=None): + resp = self.api_instance.delete_namespaced_pod(name=self.job_id, namespace=self.namespace, + grace_period_seconds=0) + return self.enter_states([JobState.TERMINATED], timeout=timeout) + + def poll(self): + try: + resp = self.api_instance.read_namespaced_pod(name=self.job_id, namespace=self.namespace) + except ApiException as e: + return JobState.UNKNOWN + return POD_STATE_MAPPING.get(resp.status.phase, JobState.UNKNOWN) + + def wait(self): + self.enter_states([JobState.SUCCEEDED, JobState.TERMINATED]) + class K8sJobLauncher(JobLauncherSpec): def __init__(self, config_file_path, @@ -210,7 +214,7 @@ def __init__(self, config_file_path, self.logger = logging.getLogger(self.__class__.__name__) def launch_job(self, client, startup, job_id, args, app_custom_folder, target: str, scheme: str, - timeout=None) -> bool: + timeout=None) -> JobHandleSpec: # root_hostpath = "/home/azureuser/wksp/k2k/disk" # job_image = "localhost:32000/nvfl-k8s:0.0.1" @@ -244,26 +248,15 @@ def launch_job(self, client, startup, job_id, args, app_custom_folder, target: s self.logger.info(f"launch job with k8s_launcher. 
Job_id:{job_id}") - self.job_handle = K8sJobHandle(job_id, self.core_v1, job_config, namespace=self.namespace) + job_handle = K8sJobHandle(job_id, self.core_v1, job_config, namespace=self.namespace) try: - self.core_v1.create_namespaced_pod(body=self.job_handle.get_manifest(), namespace=self.namespace) - if self.job_handle.enter_states([JobState.RUNNING], timeout=timeout): - return True + self.core_v1.create_namespaced_pod(body=job_handle.get_manifest(), namespace=self.namespace) + if job_handle.enter_states([JobState.RUNNING], timeout=timeout): + return job_handle else: - return False + job_handle.terminate() + return None except ApiException as e: - return False + job_handle.terminate() + return None - def terminate(self): - if self.job_handle: - self.job_handle.abort() - - def poll(self): - if self.job_handle: - return self.job_handle.get_state() - else: - return JobState.UNKNOWN - - def wait(self): - if self.job_handle: - self.job_handle.enter_states([JobState.SUCCEEDED, JobState.TERMINATED]) diff --git a/nvflare/app_opt/job_launcher/process_launcher.py b/nvflare/app_opt/job_launcher/process_launcher.py index 055c65a76e..ed419db123 100644 --- a/nvflare/app_opt/job_launcher/process_launcher.py +++ b/nvflare/app_opt/job_launcher/process_launcher.py @@ -17,20 +17,47 @@ import subprocess import sys -from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec +from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec, JobHandleSpec from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path +class ProcessHandle(JobHandleSpec): + def __init__(self, process): + super().__init__() + + self.process = process + self.logger = logging.getLogger(self.__class__.__name__) + + def terminate(self): + if self.process: + try: + os.killpg(os.getpgid(self.process.pid), 9) + self.logger.debug("kill signal sent") + except: + pass + + self.process.terminate() + + def poll(self): + if self.process: + return self.process.poll() + else: + return 
None + + def wait(self): + if self.process: + self.process.wait() + + class ProcessJobLauncher(JobLauncherSpec): def __init__(self): super().__init__() - self.process = None self.logger = logging.getLogger(self.__class__.__name__) def launch_job( self, client, startup, job_id, args, app_custom_folder, target: str, scheme: str, timeout=None - ) -> bool: + ) -> JobHandleSpec: new_env = os.environ.copy() if app_custom_folder != "": @@ -62,26 +89,9 @@ def launch_job( " --set" + command_options + " print_conf=True" ) # use os.setsid to create new process group ID - self.process = subprocess.Popen(shlex.split(command, True), preexec_fn=os.setsid, env=new_env) + process = subprocess.Popen(shlex.split(command, True), preexec_fn=os.setsid, env=new_env) - self.logger.info("Worker child process ID: {}".format(self.process.pid)) + self.logger.info("Worker child process ID: {}".format(process.pid)) - def terminate(self): - if self.process: - try: - os.killpg(os.getpgid(self.process.pid), 9) - self.logger.debug("kill signal sent") - except: - pass + return ProcessHandle(process) - self.process.terminate() - - def poll(self): - if self.process: - return self.process.poll() - else: - return None - - def wait(self): - if self.process: - self.process.wait() diff --git a/nvflare/private/fed/app/simulator/simulator_runner.py b/nvflare/private/fed/app/simulator/simulator_runner.py index d3c1d49fd8..f88d9e5772 100644 --- a/nvflare/private/fed/app/simulator/simulator_runner.py +++ b/nvflare/private/fed/app/simulator/simulator_runner.py @@ -450,7 +450,7 @@ def simulator_run_main(self): try: self.create_clients() self.server.engine.run_processes[SimulatorConstants.JOB_NAME] = { - RunProcessKey.JOB_LAUNCHER: None, + RunProcessKey.JOB_HANDLE: None, RunProcessKey.JOB_ID: SimulatorConstants.JOB_NAME, RunProcessKey.PARTICIPANTS: self.server.engine.client_manager.clients, } diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 
febfc58ef5..64833da1a0 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -19,6 +19,7 @@ from nvflare.apis.fl_constant import AdminCommandNames, RunProcessKey, SystemConfigs from nvflare.apis.resource_manager_spec import ResourceManagerSpec +from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec from nvflare.fuel.common.exit_codes import PROCESS_EXIT_REASON, ProcessExitCode from nvflare.fuel.f3.cellnet.core_cell import FQCN from nvflare.fuel.f3.cellnet.defs import MessageHeaderKey, ReturnCode @@ -166,14 +167,14 @@ def start_app( target: SP target location scheme: SP connection scheme """ - job_launcher = self._get_job_launcher(client, job_meta) - job_launcher.launch_job(client, self.startup, job_id, args, app_custom_folder, target, scheme) + job_launcher: JobLauncherSpec = self._get_job_launcher(client, job_meta) + job_handle = job_launcher.launch_job(client, self.startup, job_id, args, app_custom_folder, target, scheme) client.multi_gpu = False with self.lock: self.run_processes[job_id] = { - RunProcessKey.JOB_LAUNCHER: job_launcher, + RunProcessKey.JOB_HANDLE: job_handle, RunProcessKey.STATUS: ClientStatus.STARTING, } @@ -183,7 +184,7 @@ def start_app( ) thread.start() - def _get_job_launcher(self, client, job_meta: dict): + def _get_job_launcher(self, client, job_meta: dict) -> JobLauncherSpec: launcher = None launcher_map = job_meta.get("launcher_map") if launcher_map: @@ -317,7 +318,7 @@ def abort_app(self, job_id): if process_status == ClientStatus.STARTED: try: with self.lock: - job_launcher = self.run_processes[job_id][RunProcessKey.JOB_LAUNCHER] + job_launcher = self.run_processes[job_id][RunProcessKey.JOB_HANDLE] data = {} fqcn = FQCN.join([self.client.client_name, job_id]) request = new_cell_message({}, data) @@ -398,7 +399,7 @@ def abort_task(self, job_id): def _wait_child_process_finish(self, client, job_id, allocated_resource, token, resource_manager, workspace): 
self.logger.info(f"run ({job_id}): waiting for child worker process to finish.") - job_launcher = self.run_processes.get(job_id, {}).get(RunProcessKey.JOB_LAUNCHER) + job_launcher = self.run_processes.get(job_id, {}).get(RunProcessKey.JOB_HANDLE) if job_launcher: job_launcher.wait() diff --git a/nvflare/private/fed/server/server_engine.py b/nvflare/private/fed/server/server_engine.py index 0af06daf85..4aa2f144dc 100644 --- a/nvflare/private/fed/server/server_engine.py +++ b/nvflare/private/fed/server/server_engine.py @@ -284,7 +284,7 @@ def _start_runner_process( with self.lock: self.run_processes[run_number] = { - RunProcessKey.JOB_LAUNCHER: process, + RunProcessKey.JOB_HANDLE: process, RunProcessKey.JOB_ID: job_id, RunProcessKey.PARTICIPANTS: job_clients, } @@ -332,7 +332,7 @@ def abort_app_on_server(self, job_id: str, turn_to_cold: bool = False) -> str: self.logger.info(f"Abort server status: {status_message}") except Exception: with self.lock: - child_process = self.run_processes.get(job_id, {}).get(RunProcessKey.JOB_LAUNCHER, None) + child_process = self.run_processes.get(job_id, {}).get(RunProcessKey.JOB_HANDLE, None) if child_process: child_process.terminate() finally: diff --git a/nvflare/private/fed/simulator/simulator_client_engine.py b/nvflare/private/fed/simulator/simulator_client_engine.py index cdb4ac3582..47039a1147 100644 --- a/nvflare/private/fed/simulator/simulator_client_engine.py +++ b/nvflare/private/fed/simulator/simulator_client_engine.py @@ -25,7 +25,7 @@ def __init__(self, client, args, rank=0): fl_ctx.set_prop(FLContextKey.SIMULATE_MODE, True, private=True, sticky=True) self.client_executor.run_processes[SimulatorConstants.JOB_NAME] = { - RunProcessKey.JOB_LAUNCHER: None, + RunProcessKey.JOB_HANDLE: None, RunProcessKey.STATUS: ClientStatus.STARTED, } From b14dc759591c7fa50b111fab8799de4a6551b89c Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 18 Oct 2024 10:10:15 -0400 Subject: [PATCH 09/31] Support for the launcher deploy image. 
--- .../app_opt/job_launcher/job_launcher_spec.py | 7 +--- nvflare/app_opt/job_launcher/k8s_launcher.py | 14 ++++--- .../app_opt/job_launcher/process_launcher.py | 4 +- nvflare/private/fed/client/client_executor.py | 3 +- nvflare/private/fed/server/job_runner.py | 3 +- nvflare/private/fed/utils/fed_utils.py | 25 +++++++++++ .../private/fed/utils/fed_utils_test.py | 42 +++++++++++++++++++ 7 files changed, 82 insertions(+), 16 deletions(-) diff --git a/nvflare/app_opt/job_launcher/job_launcher_spec.py b/nvflare/app_opt/job_launcher/job_launcher_spec.py index 48ddfaae8f..4a79ffd566 100644 --- a/nvflare/app_opt/job_launcher/job_launcher_spec.py +++ b/nvflare/app_opt/job_launcher/job_launcher_spec.py @@ -45,14 +45,11 @@ def wait(self): class JobLauncherSpec: @abstractmethod - def launch_job( - self, client, startup, job_id, args, app_custom_folder, target: str, scheme: str, timeout=None - ) -> JobHandleSpec: + def launch_job(self, job_id, job_meta, client, startup, args, app_custom_folder, target: str, scheme: str) -> JobHandleSpec: """To launch a job run. Args: - timeout: the job needs to be started within this timeout. Otherwise failed the job launch. - None means no timeout limit. + job_meta: Returns: boolean to indicates the job launch success or fail. 
diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index 4cc941508c..c4e6916908 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -22,6 +22,8 @@ from kubernetes.client.api import core_v1_api from kubernetes.client.rest import ApiException +from nvflare.private.fed.utils.fed_utils import extract_job_image + class JobState(Enum): STARTING = "starting" @@ -189,16 +191,16 @@ def wait(self): class K8sJobLauncher(JobLauncherSpec): def __init__(self, config_file_path, root_hostpath: str, - job_image: str, workspace: str, mount_path: str, + launch_timeout=None, namespace='default'): super().__init__() self.root_hostpath = root_hostpath - self.job_image = job_image self.workspace = workspace self.mount_path = mount_path + self.launch_timeout = launch_timeout config.load_kube_config(config_file_path) try: @@ -213,14 +215,14 @@ def __init__(self, config_file_path, self.job_handle = None self.logger = logging.getLogger(self.__class__.__name__) - def launch_job(self, client, startup, job_id, args, app_custom_folder, target: str, scheme: str, - timeout=None) -> JobHandleSpec: + def launch_job(self, job_id, job_meta, client, startup, args, app_custom_folder, target: str, scheme: str) -> JobHandleSpec: # root_hostpath = "/home/azureuser/wksp/k2k/disk" # job_image = "localhost:32000/nvfl-k8s:0.0.1" + job_image = extract_job_image(job_meta, client) job_config = { "name": job_id, - "image": self.job_image, + "image": job_image, "container_name": f"container-{job_id}", # "volume_mount_list": [{'name':'workspace-nvflare', 'mountPath': '/workspace/nvflare'}], "volume_mount_list": [{'name': self.workspace, 'mountPath': self.mount_path}], @@ -251,7 +253,7 @@ def launch_job(self, client, startup, job_id, args, app_custom_folder, target: s job_handle = K8sJobHandle(job_id, self.core_v1, job_config, namespace=self.namespace) try: 
self.core_v1.create_namespaced_pod(body=job_handle.get_manifest(), namespace=self.namespace) - if job_handle.enter_states([JobState.RUNNING], timeout=timeout): + if job_handle.enter_states([JobState.RUNNING], timeout=self.launch_timeout): return job_handle else: job_handle.terminate() diff --git a/nvflare/app_opt/job_launcher/process_launcher.py b/nvflare/app_opt/job_launcher/process_launcher.py index ed419db123..6d03894628 100644 --- a/nvflare/app_opt/job_launcher/process_launcher.py +++ b/nvflare/app_opt/job_launcher/process_launcher.py @@ -55,9 +55,7 @@ def __init__(self): self.logger = logging.getLogger(self.__class__.__name__) - def launch_job( - self, client, startup, job_id, args, app_custom_folder, target: str, scheme: str, timeout=None - ) -> JobHandleSpec: + def launch_job(self, job_id, job_meta, client, startup, args, app_custom_folder, target: str, scheme: str) -> JobHandleSpec: new_env = os.environ.copy() if app_custom_folder != "": diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 64833da1a0..402a9b4348 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -168,7 +168,8 @@ def start_app( scheme: SP connection scheme """ job_launcher: JobLauncherSpec = self._get_job_launcher(client, job_meta) - job_handle = job_launcher.launch_job(client, self.startup, job_id, args, app_custom_folder, target, scheme) + job_handle = job_launcher.launch_job(job_id, None, client, self.startup, args, app_custom_folder, target, + scheme) client.multi_gpu = False diff --git a/nvflare/private/fed/server/job_runner.py b/nvflare/private/fed/server/job_runner.py index cac4b30c57..80984ecb9f 100644 --- a/nvflare/private/fed/server/job_runner.py +++ b/nvflare/private/fed/server/job_runner.py @@ -35,7 +35,7 @@ from nvflare.private.fed.server.admin import check_client_replies from nvflare.private.fed.server.server_state import HotState from 
nvflare.private.fed.utils.app_deployer import AppDeployer -from nvflare.private.fed.utils.fed_utils import set_message_security_data +from nvflare.private.fed.utils.fed_utils import set_message_security_data, extract_participants from nvflare.security.logging import secure_format_exception @@ -131,6 +131,7 @@ def _deploy_job(self, job: Job, sites: dict, fl_ctx: FLContext) -> Tuple[str, li for app_name, participants in job.get_deployment().items(): app_data = job.get_application(app_name, fl_ctx) + participants = extract_participants(participants) if len(participants) == 1 and participants[0].upper() == ALL_SITES: participants = ["server"] diff --git a/nvflare/private/fed/utils/fed_utils.py b/nvflare/private/fed/utils/fed_utils.py index 330051d992..9da370ea9b 100644 --- a/nvflare/private/fed/utils/fed_utils.py +++ b/nvflare/private/fed/utils/fed_utils.py @@ -399,3 +399,28 @@ def add_custom_dir_to_path(app_custom_folder, new_env): new_env[SystemVarName.PYTHONPATH] = path + os.pathsep + app_custom_folder else: new_env[SystemVarName.PYTHONPATH] = app_custom_folder + + +def extract_participants(participants_list): + participants = [] + for item in participants_list: + if isinstance(item, str): + participants.append(item) + elif isinstance(item, dict): + sites = item.get("sites") + participants.extend(sites) + else: + raise ValueError(f"Must be tye of str or dict, but got {type(item)}") + return participants + + +def extract_job_image(job_meta, site_name): + # job_image = "localhost:32000/nvfl-k8s:0.0.1" + deploy_map = job_meta.get(JobMetaKey.DEPLOY_MAP, {}) + for _, participants in deploy_map.items(): + for item in participants: + if isinstance(item, dict): + sites = item.get("sites") + if site_name in sites: + return item.get("image") + return None diff --git a/tests/unit_test/private/fed/utils/fed_utils_test.py b/tests/unit_test/private/fed/utils/fed_utils_test.py index 5de4b5e0ed..c334bfcd24 100644 --- a/tests/unit_test/private/fed/utils/fed_utils_test.py +++ 
b/tests/unit_test/private/fed/utils/fed_utils_test.py @@ -18,6 +18,7 @@ from nvflare.fuel.utils.fobs import Decomposer from nvflare.fuel.utils.fobs.datum import DatumManager from nvflare.fuel.utils.fobs.fobs import register_custom_folder +from nvflare.private.fed.utils.fed_utils import extract_participants, extract_job_image class ExampleTestClass: @@ -50,3 +51,44 @@ def test_custom_fobs_initialize(self): decomposer = ExampleTestClassDecomposer() decomposers = fobs.fobs._decomposers assert decomposer in list(decomposers.values()) + + def test_extract_participants(self): + participants = ["site-1", "site-2"] + results = extract_participants(participants) + expected = ["site-1", "site-2"] + assert results == expected + + participants = ["@ALL"] + results = extract_participants(participants) + expected = ["@ALL"] + assert results == expected + + def test_extract_participants_with_image(self): + participants = ["site-1", "site-2", + { + "sites": ["site-3", "site-4"], + "image": "image1" + }, + { + "sites": ["site-5"], + "image": "image2" + } + ] + results = extract_participants(participants) + expected = ["site-1", "site-2", "site-3", "site-4", "site-5"] + assert results == expected + + def test_extract_job_image(self): + job_meta = { + "deploy_map": { + "app": [ "site-1", "site-2", + { + "sites": ["site-3", "site-4"], + "image": "image1" + } + ] + } + } + result = extract_job_image(job_meta, "site-3") + expected = "image1" + assert result == expected From c88e00b067a5cb017ccb77a78270021927672f47 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 18 Oct 2024 11:08:32 -0400 Subject: [PATCH 10/31] Changed the _get_job_launcher logic. 
--- nvflare/private/fed/client/client_executor.py | 15 ++++++--------- .../unit_test/private/fed/utils/fed_utils_test.py | 4 ++++ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 402a9b4348..b24228ec4d 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -25,7 +25,7 @@ from nvflare.fuel.f3.cellnet.defs import MessageHeaderKey, ReturnCode from nvflare.fuel.utils.config_service import ConfigService from nvflare.private.defs import CellChannel, CellChannelTopic, JobFailureMsgKey, new_cell_message -from nvflare.private.fed.utils.fed_utils import get_return_code +from nvflare.private.fed.utils.fed_utils import get_return_code, extract_job_image from nvflare.security.logging import secure_format_exception, secure_log_traceback from nvflare.app_opt.job_launcher.process_launcher import ProcessJobLauncher @@ -186,14 +186,11 @@ def start_app( thread.start() def _get_job_launcher(self, client, job_meta: dict) -> JobLauncherSpec: - launcher = None - launcher_map = job_meta.get("launcher_map") - if launcher_map: - for launcher_id, sites in launcher_map.items(): - if client.client_name in sites: - engine = client.engine - launcher = engine.get_component(launcher_id) - if not launcher: + launch_image = extract_job_image(job_meta, client) + if launch_image: + engine = client.engine + launcher = engine.get_component("image_launcher") + else: launcher = ProcessJobLauncher() return launcher diff --git a/tests/unit_test/private/fed/utils/fed_utils_test.py b/tests/unit_test/private/fed/utils/fed_utils_test.py index c334bfcd24..208f03a9ef 100644 --- a/tests/unit_test/private/fed/utils/fed_utils_test.py +++ b/tests/unit_test/private/fed/utils/fed_utils_test.py @@ -92,3 +92,7 @@ def test_extract_job_image(self): result = extract_job_image(job_meta, "site-3") expected = "image1" assert result == expected + + result = 
extract_job_image(job_meta, "site-1") + expected = None + assert result == expected From a7721edad031b91007a749027c642cbfcad823f4 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 18 Oct 2024 12:22:52 -0400 Subject: [PATCH 11/31] Add more handling for the deployment_map change. --- nvflare/app_common/job_schedulers/job_scheduler.py | 5 ++++- nvflare/private/fed/client/client_executor.py | 2 +- nvflare/private/fed/server/job_meta_validator.py | 8 +++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/nvflare/app_common/job_schedulers/job_scheduler.py b/nvflare/app_common/job_schedulers/job_scheduler.py index c7e03d394f..d5cd35c817 100644 --- a/nvflare/app_common/job_schedulers/job_scheduler.py +++ b/nvflare/app_common/job_schedulers/job_scheduler.py @@ -25,6 +25,7 @@ from nvflare.apis.job_def_manager_spec import JobDefManagerSpec from nvflare.apis.job_scheduler_spec import DispatchInfo, JobSchedulerSpec from nvflare.apis.server_engine_spec import ServerEngineSpec +from nvflare.private.fed.utils.fed_utils import extract_participants SCHEDULE_RESULT_OK = 0 # the job is scheduled SCHEDULE_RESULT_NO_RESOURCE = 1 # job is not scheduled due to lack of resources @@ -109,7 +110,9 @@ def _try_job(self, job: Job, fl_ctx: FLContext) -> (int, Optional[Dict[str, Disp applicable_sites = [] sites_to_app = {} for app_name in job.deploy_map: - for site_name in job.deploy_map[app_name]: + deployments = job.deploy_map[app_name] + deployments = extract_participants(deployments) + for site_name in deployments: if site_name.upper() == ALL_SITES: # deploy_map: {"app_name": ["ALL_SITES"]} will be treated as deploying to all online clients applicable_sites = online_site_names diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index b24228ec4d..30b8277375 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -186,7 +186,7 @@ def start_app( thread.start() def
_get_job_launcher(self, client, job_meta: dict) -> JobLauncherSpec: - launch_image = extract_job_image(job_meta, client) + launch_image = extract_job_image(job_meta, client.client_name) if launch_image: engine = client.engine launcher = engine.get_component("image_launcher") diff --git a/nvflare/private/fed/server/job_meta_validator.py b/nvflare/private/fed/server/job_meta_validator.py index 0856b736eb..9362125081 100644 --- a/nvflare/private/fed/server/job_meta_validator.py +++ b/nvflare/private/fed/server/job_meta_validator.py @@ -24,6 +24,7 @@ from nvflare.apis.job_meta_validator_spec import JobMetaValidatorSpec from nvflare.fuel.utils.config import ConfigFormat from nvflare.fuel.utils.config_factory import ConfigFactory +from nvflare.private.fed.utils.fed_utils import extract_participants from nvflare.security.logging import secure_format_exception CONFIG_FOLDER = "/config/" @@ -101,7 +102,11 @@ def _validate_deploy_map(job_name: str, meta: dict) -> list: if not deploy_map: raise ValueError(f"deploy_map is empty for job {job_name}") - site_list = [site for deployments in deploy_map.values() for site in deployments] + site_list = [] + for deployments in deploy_map.values(): + deployments = extract_participants(deployments) + for site in deployments: + site_list.append(site) if not site_list: raise ValueError(f"No site is specified in deploy_map for job {job_name}") @@ -126,6 +131,7 @@ def _validate_app(self, job_name: str, meta: dict, zip_file: ZipFile) -> None: has_byoc = False for app, deployments in deploy_map.items(): + deployments = extract_participants(deployments) config_folder = job_name + "/" + app + CONFIG_FOLDER if not self._entry_exists(zip_file, config_folder): From 9a7e89e4df3ef63a67a8c4813402c54f984199c4 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 18 Oct 2024 12:35:52 -0400 Subject: [PATCH 12/31] Added logging for job launcher. 
--- nvflare/app_opt/job_launcher/k8s_launcher.py | 1 + nvflare/private/fed/client/client_executor.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index c4e6916908..eb56030b50 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -220,6 +220,7 @@ def launch_job(self, job_id, job_meta, client, startup, args, app_custom_folder, # root_hostpath = "/home/azureuser/wksp/k2k/disk" # job_image = "localhost:32000/nvfl-k8s:0.0.1" job_image = extract_job_image(job_meta, client) + self.logger.info(f"launch job use image: {job_image}") job_config = { "name": job_id, "image": job_image, diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 30b8277375..b84ba17a66 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -190,8 +190,12 @@ def _get_job_launcher(self, client, job_meta: dict) -> JobLauncherSpec: if launch_image: engine = client.engine launcher = engine.get_component("image_launcher") + if not launcher: + raise RuntimeError("There's no image job launcher defined.") + self.logger.info(f"Launch job with job launcher: {type(launcher)}") else: launcher = ProcessJobLauncher() + self.logger.info("Launch job with ProcessJobLauncher.") return launcher def notify_job_status(self, job_id, job_status): From 0c57c7b55f89a6c8c429a14ded0773c75e491314 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 18 Oct 2024 12:46:12 -0400 Subject: [PATCH 13/31] Fixed extract_job_image usage. 
--- nvflare/app_opt/job_launcher/k8s_launcher.py | 3 ++- tests/unit_test/private/fed/utils/fed_utils_test.py | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index eb56030b50..708ebd671a 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -219,7 +219,8 @@ def launch_job(self, job_id, job_meta, client, startup, args, app_custom_folder, # root_hostpath = "/home/azureuser/wksp/k2k/disk" # job_image = "localhost:32000/nvfl-k8s:0.0.1" - job_image = extract_job_image(job_meta, client) + self.logger.info(f"K8sJobLauncher start to launch job: {job_id} for client: {client.client_name}") + job_image = extract_job_image(job_meta, client.client_name) self.logger.info(f"launch job use image: {job_image}") job_config = { "name": job_id, diff --git a/tests/unit_test/private/fed/utils/fed_utils_test.py b/tests/unit_test/private/fed/utils/fed_utils_test.py index 208f03a9ef..b2c53108fe 100644 --- a/tests/unit_test/private/fed/utils/fed_utils_test.py +++ b/tests/unit_test/private/fed/utils/fed_utils_test.py @@ -96,3 +96,13 @@ def test_extract_job_image(self): result = extract_job_image(job_meta, "site-1") expected = None assert result == expected + + job_meta = { + "deploy_map": { + "app": [ "site-1", "site-2" + ] + } + } + result = extract_job_image(job_meta, "site-1") + expected = None + assert result == expected From 68c025dd3430a5762fcad7bdebfa769744db28bc Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 18 Oct 2024 13:03:28 -0400 Subject: [PATCH 14/31] Added job_meta for launch_job. 
--- nvflare/private/fed/client/client_executor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index b84ba17a66..049d4c18f3 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -168,7 +168,7 @@ def start_app( scheme: SP connection scheme """ job_launcher: JobLauncherSpec = self._get_job_launcher(client, job_meta) - job_handle = job_launcher.launch_job(job_id, None, client, self.startup, args, app_custom_folder, target, + job_handle = job_launcher.launch_job(job_id, job_meta, client, self.startup, args, app_custom_folder, target, scheme) client.multi_gpu = False @@ -192,7 +192,7 @@ def _get_job_launcher(self, client, job_meta: dict) -> JobLauncherSpec: launcher = engine.get_component("image_launcher") if not launcher: raise RuntimeError("There's no image job launcher defined.") - self.logger.info(f"Launch job with job launcher: {type(launcher)}") + self.logger.info(f"Launch job image: {launch_image} with job launcher: {type(launcher)} ") else: launcher = ProcessJobLauncher() self.logger.info("Launch job with ProcessJobLauncher.") From 24469b49be783c28a2b1def68aa0ae8512076181 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 18 Oct 2024 17:38:39 -0400 Subject: [PATCH 15/31] codestyle fix. 
--- .../app_opt/job_launcher/job_launcher_spec.py | 5 +- nvflare/app_opt/job_launcher/k8s_launcher.py | 215 ++++++++---------- .../app_opt/job_launcher/process_launcher.py | 7 +- nvflare/private/fed/client/client_executor.py | 18 +- nvflare/private/fed/server/job_runner.py | 2 +- nvflare/private/fed/utils/fed_utils.py | 1 - .../private/fed/utils/fed_utils_test.py | 36 +-- 7 files changed, 119 insertions(+), 165 deletions(-) diff --git a/nvflare/app_opt/job_launcher/job_launcher_spec.py b/nvflare/app_opt/job_launcher/job_launcher_spec.py index 4a79ffd566..95d40541e4 100644 --- a/nvflare/app_opt/job_launcher/job_launcher_spec.py +++ b/nvflare/app_opt/job_launcher/job_launcher_spec.py @@ -45,7 +45,9 @@ def wait(self): class JobLauncherSpec: @abstractmethod - def launch_job(self, job_id, job_meta, client, startup, args, app_custom_folder, target: str, scheme: str) -> JobHandleSpec: + def launch_job( + self, job_id, job_meta, client, startup, args, app_custom_folder, target: str, scheme: str + ) -> JobHandleSpec: """To launch a job run. 
Args: @@ -55,4 +57,3 @@ def launch_job(self, job_id, job_meta, client, startup, args, app_custom_folder, """ raise NotImplemented - diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index 708ebd671a..201821a100 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -15,13 +15,12 @@ import time from enum import Enum -from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec, JobHandleSpec - from kubernetes import config from kubernetes.client import Configuration from kubernetes.client.api import core_v1_api from kubernetes.client.rest import ApiException +from nvflare.app_opt.job_launcher.job_launcher_spec import JobHandleSpec, JobLauncherSpec from nvflare.private.fed.utils.fed_utils import extract_job_image @@ -38,122 +37,103 @@ class JobState(Enum): "Running": JobState.RUNNING, "Succeeded": JobState.SUCCEEDED, "Failed": JobState.TERMINATED, - "Unknown": JobState.UNKNOWN - } + "Unknown": JobState.UNKNOWN, +} class K8sJobHandle(JobHandleSpec): - def __init__(self, job_id: str, api_instance: core_v1_api, job_config: dict, namespace='default'): + def __init__(self, job_id: str, api_instance: core_v1_api, job_config: dict, namespace="default"): super().__init__() self.job_id = job_id self.api_instance = api_instance self.namespace = namespace self.pod_manifest = { - 'apiVersion': 'v1', - 'kind': 'Pod', - 'metadata': { - 'name': None # set by job_config['name'] + "apiVersion": "v1", + "kind": "Pod", + "metadata": {"name": None}, # set by job_config['name'] + "spec": { + "containers": None, # link to container_list + "volumes": None, # link to volume_list + "restartPolicy": "OnFailure", }, - 'spec': { - 'containers': None, # link to container_list - 'volumes': None, # link to volume_list - 'restartPolicy': 'OnFailure' - } } - self.volume_list = [ - { - 'name': None, - 'hostPath': { - 'path': None, - 'type': 'Directory' - } - } - ] + 
self.volume_list = [{"name": None, "hostPath": {"path": None, "type": "Directory"}}] self.container_list = [ { - 'image': None, - 'name': None, - 'command': ['/usr/local/bin/python'], - 'args': None, # args_list + args_dict + args_sets - 'volumeMounts': None, # volume_mount_list - 'imagePullPolicy': 'Always' + "image": None, + "name": None, + "command": ["/usr/local/bin/python"], + "args": None, # args_list + args_dict + args_sets + "volumeMounts": None, # volume_mount_list + "imagePullPolicy": "Always", } ] - self.container_args_python_args_list = [ - '-u', '-m', 'nvflare.private.fed.app.client.worker_process' - ] + self.container_args_python_args_list = ["-u", "-m", "nvflare.private.fed.app.client.worker_process"] self.container_args_module_args_dict = { - '-m': None, - '-w': None, - '-t': None, - '-d': None, - '-n': None, - '-c': None, - '-p': None, - '-g': None, - '-scheme': None, - '-s': None, + "-m": None, + "-w": None, + "-t": None, + "-d": None, + "-n": None, + "-c": None, + "-p": None, + "-g": None, + "-scheme": None, + "-s": None, } self.container_volume_mount_list = [ { - 'name': None, - 'mountPath': None, + "name": None, + "mountPath": None, } ] self._make_manifest(job_config) def _make_manifest(self, job_config): - self.container_volume_mount_list = \ - job_config.get('volume_mount_list', - [{'name': 'workspace-nvflare', 'mountPath': '/workspace/nvflare'}] - ) - set_list = job_config.get('set_list') + self.container_volume_mount_list = job_config.get( + "volume_mount_list", [{"name": "workspace-nvflare", "mountPath": "/workspace/nvflare"}] + ) + set_list = job_config.get("set_list") if set_list is None: self.container_args_module_args_sets = list() else: - self.container_args_module_args_sets = ['--set'] + set_list - self.container_args_module_args_dict = \ - job_config.get('module_args', - { - '-m': None, - '-w': None, - '-t': None, - '-d': None, - '-n': None, - '-c': None, - '-p': None, - '-g': None, - '-scheme': None, - '-s': None - } - ) + 
self.container_args_module_args_sets = ["--set"] + set_list + self.container_args_module_args_dict = job_config.get( + "module_args", + { + "-m": None, + "-w": None, + "-t": None, + "-d": None, + "-n": None, + "-c": None, + "-p": None, + "-g": None, + "-scheme": None, + "-s": None, + }, + ) self.container_args_module_args_dict_as_list = list() for k, v in self.container_args_module_args_dict.items(): self.container_args_module_args_dict_as_list.append(k) self.container_args_module_args_dict_as_list.append(v) - self.volume_list = \ - job_config.get('volume_list', - [{ - 'name': None, - 'hostPath': { - 'path': None, - 'type': 'Directory' - } - }] - ) - - self.pod_manifest['metadata']['name'] = job_config.get('name') - self.pod_manifest['spec']['containers'] = self.container_list - self.pod_manifest['spec']['volumes'] = self.volume_list - - self.container_list[0]['image'] = job_config.get('image', 'nvflare/nvflare:2.5.0') - self.container_list[0]['name'] = job_config.get('container_name', 'nvflare_job') - self.container_list[0]['args'] = \ - self.container_args_python_args_list + \ - self.container_args_module_args_dict_as_list + \ - self.container_args_module_args_sets - self.container_list[0]['volumeMounts'] = self.container_volume_mount_list + self.volume_list = job_config.get( + "volume_list", [{"name": None, "hostPath": {"path": None, "type": "Directory"}}] + ) + + self.pod_manifest["metadata"]["name"] = job_config.get("name") + self.pod_manifest["spec"]["containers"] = self.container_list + self.pod_manifest["spec"]["volumes"] = self.volume_list + + self.container_list[0]["image"] = job_config.get("image", "nvflare/nvflare:2.5.0") + self.container_list[0]["name"] = job_config.get("container_name", "nvflare_job") + self.container_list[0]["args"] = ( + self.container_args_python_args_list + + self.container_args_module_args_dict_as_list + + self.container_args_module_args_sets + ) + self.container_list[0]["volumeMounts"] = self.container_volume_mount_list def 
get_manifest(self): return self.pod_manifest @@ -168,13 +148,14 @@ def enter_states(self, job_states_to_enter: list, timeout=None): job_state = self.poll() if job_state in job_states_to_enter: return True - elif timeout is not None and time.time()-starting_time>timeout: + elif timeout is not None and time.time() - starting_time > timeout: return False time.sleep(1) def terminate(self, timeout=None): - resp = self.api_instance.delete_namespaced_pod(name=self.job_id, namespace=self.namespace, - grace_period_seconds=0) + resp = self.api_instance.delete_namespaced_pod( + name=self.job_id, namespace=self.namespace, grace_period_seconds=0 + ) return self.enter_states([JobState.TERMINATED], timeout=timeout) def poll(self): @@ -189,12 +170,15 @@ def wait(self): class K8sJobLauncher(JobLauncherSpec): - def __init__(self, config_file_path, - root_hostpath: str, - workspace: str, - mount_path: str, - launch_timeout=None, - namespace='default'): + def __init__( + self, + config_file_path, + root_hostpath: str, + workspace: str, + mount_path: str, + launch_timeout=None, + namespace="default", + ): super().__init__() self.root_hostpath = root_hostpath @@ -215,7 +199,9 @@ def __init__(self, config_file_path, self.job_handle = None self.logger = logging.getLogger(self.__class__.__name__) - def launch_job(self, job_id, job_meta, client, startup, args, app_custom_folder, target: str, scheme: str) -> JobHandleSpec: + def launch_job( + self, job_id, job_meta, client, startup, args, app_custom_folder, target: str, scheme: str + ) -> JobHandleSpec: # root_hostpath = "/home/azureuser/wksp/k2k/disk" # job_image = "localhost:32000/nvfl-k8s:0.0.1" @@ -227,34 +213,28 @@ def launch_job(self, job_id, job_meta, client, startup, args, app_custom_folder, "image": job_image, "container_name": f"container-{job_id}", # "volume_mount_list": [{'name':'workspace-nvflare', 'mountPath': '/workspace/nvflare'}], - "volume_mount_list": [{'name': self.workspace, 'mountPath': self.mount_path}], - 
"volume_list": [{ - 'name': self.workspace, - 'hostPath': { - 'path': self.root_hostpath, - 'type': 'Directory' - } - }], + "volume_mount_list": [{"name": self.workspace, "mountPath": self.mount_path}], + "volume_list": [{"name": self.workspace, "hostPath": {"path": self.root_hostpath, "type": "Directory"}}], "module_args": { - '-m': args.workspace, - '-w': startup, - '-t': client.token, - '-d': client.ssid, - '-n': job_id, - '-c': client.client_name, - '-p': "tcp://parent-pod:8004", - '-g': target, - '-scheme': scheme, - '-s': "fed_client.json" + "-m": args.workspace, + "-w": startup, + "-t": client.token, + "-d": client.ssid, + "-n": job_id, + "-c": client.client_name, + "-p": "tcp://parent-pod:8004", + "-g": target, + "-scheme": scheme, + "-s": "fed_client.json", }, - "set_list": args.set + "set_list": args.set, } self.logger.info(f"launch job with k8s_launcher. Job_id:{job_id}") job_handle = K8sJobHandle(job_id, self.core_v1, job_config, namespace=self.namespace) try: - self.core_v1.create_namespaced_pod(body=job_handle.get_manifest(), namespace=self.namespace) + self.core_v1.create_namespaced_pod(body=job_handle.get_manifest(), namespace=self.namespace) if job_handle.enter_states([JobState.RUNNING], timeout=self.launch_timeout): return job_handle else: @@ -263,4 +243,3 @@ def launch_job(self, job_id, job_meta, client, startup, args, app_custom_folder, except ApiException as e: job_handle.terminate() return None - diff --git a/nvflare/app_opt/job_launcher/process_launcher.py b/nvflare/app_opt/job_launcher/process_launcher.py index 6d03894628..f651410df7 100644 --- a/nvflare/app_opt/job_launcher/process_launcher.py +++ b/nvflare/app_opt/job_launcher/process_launcher.py @@ -17,7 +17,7 @@ import subprocess import sys -from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec, JobHandleSpec +from nvflare.app_opt.job_launcher.job_launcher_spec import JobHandleSpec, JobLauncherSpec from nvflare.private.fed.utils.fed_utils import 
add_custom_dir_to_path @@ -55,7 +55,9 @@ def __init__(self): self.logger = logging.getLogger(self.__class__.__name__) - def launch_job(self, job_id, job_meta, client, startup, args, app_custom_folder, target: str, scheme: str) -> JobHandleSpec: + def launch_job( + self, job_id, job_meta, client, startup, args, app_custom_folder, target: str, scheme: str + ) -> JobHandleSpec: new_env = os.environ.copy() if app_custom_folder != "": @@ -92,4 +94,3 @@ def launch_job(self, job_id, job_meta, client, startup, args, app_custom_folder, self.logger.info("Worker child process ID: {}".format(process.pid)) return ProcessHandle(process) - diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 049d4c18f3..f39eeb36c1 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -20,15 +20,15 @@ from nvflare.apis.fl_constant import AdminCommandNames, RunProcessKey, SystemConfigs from nvflare.apis.resource_manager_spec import ResourceManagerSpec from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec +from nvflare.app_opt.job_launcher.process_launcher import ProcessJobLauncher from nvflare.fuel.common.exit_codes import PROCESS_EXIT_REASON, ProcessExitCode from nvflare.fuel.f3.cellnet.core_cell import FQCN from nvflare.fuel.f3.cellnet.defs import MessageHeaderKey, ReturnCode from nvflare.fuel.utils.config_service import ConfigService from nvflare.private.defs import CellChannel, CellChannelTopic, JobFailureMsgKey, new_cell_message -from nvflare.private.fed.utils.fed_utils import get_return_code, extract_job_image +from nvflare.private.fed.utils.fed_utils import extract_job_image, get_return_code from nvflare.security.logging import secure_format_exception, secure_log_traceback -from nvflare.app_opt.job_launcher.process_launcher import ProcessJobLauncher from .client_status import ClientStatus, get_status_message @@ -168,8 +168,9 @@ def start_app( scheme: SP 
connection scheme """ job_launcher: JobLauncherSpec = self._get_job_launcher(client, job_meta) - job_handle = job_launcher.launch_job(job_id, job_meta, client, self.startup, args, app_custom_folder, target, - scheme) + job_handle = job_launcher.launch_job( + job_id, job_meta, client, self.startup, args, app_custom_folder, target, scheme + ) client.multi_gpu = False @@ -367,15 +368,6 @@ def _terminate_process(self, child_process, job_id): time.sleep(0.05) # we want to quickly check - # # kill the sub-process group directly - # if not done: - # self.logger.debug(f"still not done after {max_wait} secs") - # try: - # os.killpg(os.getpgid(child_process.pid), 9) - # self.logger.debug("kill signal sent") - # except: - # pass - child_process.terminate() self.logger.info(f"run ({job_id}): child worker process terminated") diff --git a/nvflare/private/fed/server/job_runner.py b/nvflare/private/fed/server/job_runner.py index 80984ecb9f..7c38294027 100644 --- a/nvflare/private/fed/server/job_runner.py +++ b/nvflare/private/fed/server/job_runner.py @@ -35,7 +35,7 @@ from nvflare.private.fed.server.admin import check_client_replies from nvflare.private.fed.server.server_state import HotState from nvflare.private.fed.utils.app_deployer import AppDeployer -from nvflare.private.fed.utils.fed_utils import set_message_security_data, extract_participants +from nvflare.private.fed.utils.fed_utils import extract_participants, set_message_security_data from nvflare.security.logging import secure_format_exception diff --git a/nvflare/private/fed/utils/fed_utils.py b/nvflare/private/fed/utils/fed_utils.py index 9da370ea9b..b3f192fdf2 100644 --- a/nvflare/private/fed/utils/fed_utils.py +++ b/nvflare/private/fed/utils/fed_utils.py @@ -415,7 +415,6 @@ def extract_participants(participants_list): def extract_job_image(job_meta, site_name): - # job_image = "localhost:32000/nvfl-k8s:0.0.1" deploy_map = job_meta.get(JobMetaKey.DEPLOY_MAP, {}) for _, participants in deploy_map.items(): for item in 
participants: diff --git a/tests/unit_test/private/fed/utils/fed_utils_test.py b/tests/unit_test/private/fed/utils/fed_utils_test.py index b2c53108fe..1a2f784eba 100644 --- a/tests/unit_test/private/fed/utils/fed_utils_test.py +++ b/tests/unit_test/private/fed/utils/fed_utils_test.py @@ -18,7 +18,7 @@ from nvflare.fuel.utils.fobs import Decomposer from nvflare.fuel.utils.fobs.datum import DatumManager from nvflare.fuel.utils.fobs.fobs import register_custom_folder -from nvflare.private.fed.utils.fed_utils import extract_participants, extract_job_image +from nvflare.private.fed.utils.fed_utils import extract_job_image, extract_participants class ExampleTestClass: @@ -64,31 +64,18 @@ def test_extract_participants(self): assert results == expected def test_extract_participants_with_image(self): - participants = ["site-1", "site-2", - { - "sites": ["site-3", "site-4"], - "image": "image1" - }, - { - "sites": ["site-5"], - "image": "image2" - } - ] + participants = [ + "site-1", + "site-2", + {"sites": ["site-3", "site-4"], "image": "image1"}, + {"sites": ["site-5"], "image": "image2"}, + ] results = extract_participants(participants) expected = ["site-1", "site-2", "site-3", "site-4", "site-5"] assert results == expected def test_extract_job_image(self): - job_meta = { - "deploy_map": { - "app": [ "site-1", "site-2", - { - "sites": ["site-3", "site-4"], - "image": "image1" - } - ] - } - } + job_meta = {"deploy_map": {"app": ["site-1", "site-2", {"sites": ["site-3", "site-4"], "image": "image1"}]}} result = extract_job_image(job_meta, "site-3") expected = "image1" assert result == expected @@ -97,12 +84,7 @@ def test_extract_job_image(self): expected = None assert result == expected - job_meta = { - "deploy_map": { - "app": [ "site-1", "site-2" - ] - } - } + job_meta = {"deploy_map": {"app": ["site-1", "site-2"]}} result = extract_job_image(job_meta, "site-1") expected = None assert result == expected From 1d2feceaa90321149c70861c199c29d976d78f83 Mon Sep 17 00:00:00 
2001 From: Yuhong Wen Date: Sat, 19 Oct 2024 08:55:39 -0400 Subject: [PATCH 16/31] Refactored. --- nvflare/app_opt/job_launcher/job_launcher_spec.py | 2 +- nvflare/app_opt/job_launcher/k8s_launcher.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/nvflare/app_opt/job_launcher/job_launcher_spec.py b/nvflare/app_opt/job_launcher/job_launcher_spec.py index 95d40541e4..a6ad3c0aa8 100644 --- a/nvflare/app_opt/job_launcher/job_launcher_spec.py +++ b/nvflare/app_opt/job_launcher/job_launcher_spec.py @@ -16,7 +16,7 @@ class JobHandleSpec: @abstractmethod - def terminate(self, timeout=None): + def terminate(self): """To terminate the job run. Returns: the job run return code. diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index 201821a100..f9da5eb9f9 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -42,9 +42,10 @@ class JobState(Enum): class K8sJobHandle(JobHandleSpec): - def __init__(self, job_id: str, api_instance: core_v1_api, job_config: dict, namespace="default"): + def __init__(self, job_id: str, api_instance: core_v1_api, job_config: dict, namespace="default", timeout=None): super().__init__() self.job_id = job_id + self.timeout = timeout self.api_instance = api_instance self.namespace = namespace @@ -152,11 +153,11 @@ def enter_states(self, job_states_to_enter: list, timeout=None): return False time.sleep(1) - def terminate(self, timeout=None): + def terminate(self): resp = self.api_instance.delete_namespaced_pod( name=self.job_id, namespace=self.namespace, grace_period_seconds=0 ) - return self.enter_states([JobState.TERMINATED], timeout=timeout) + return self.enter_states([JobState.TERMINATED], timeout=self.timeout) def poll(self): try: @@ -176,7 +177,7 @@ def __init__( root_hostpath: str, workspace: str, mount_path: str, - launch_timeout=None, + timeout=None, namespace="default", ): super().__init__() @@
-184,7 +185,7 @@ def __init__( self.root_hostpath = root_hostpath self.workspace = workspace self.mount_path = mount_path - self.launch_timeout = launch_timeout + self.timeout = timeout config.load_kube_config(config_file_path) try: @@ -232,10 +233,10 @@ def launch_job( self.logger.info(f"launch job with k8s_launcher. Job_id:{job_id}") - job_handle = K8sJobHandle(job_id, self.core_v1, job_config, namespace=self.namespace) + job_handle = K8sJobHandle(job_id, self.core_v1, job_config, namespace=self.namespace, timeout=self.timeout) try: self.core_v1.create_namespaced_pod(body=job_handle.get_manifest(), namespace=self.namespace) - if job_handle.enter_states([JobState.RUNNING], timeout=self.launch_timeout): + if job_handle.enter_states([JobState.RUNNING], timeout=self.timeout): return job_handle else: job_handle.terminate() From ceaeb8ed80c90e833c1d190a8cbcd1db86b8262f Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Mon, 21 Oct 2024 15:02:55 -0400 Subject: [PATCH 17/31] extract to use constants. 
--- nvflare/apis/fl_constant.py | 3 +++ nvflare/private/fed/client/client_executor.py | 4 ++-- nvflare/private/fed/utils/fed_utils.py | 9 +++++---- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/nvflare/apis/fl_constant.py b/nvflare/apis/fl_constant.py index 2021a7fcc1..1d73891389 100644 --- a/nvflare/apis/fl_constant.py +++ b/nvflare/apis/fl_constant.py @@ -346,6 +346,7 @@ class SystemComponents(object): JOB_META_VALIDATOR = "job_meta_validator" FED_CLIENT = "fed_client" RUN_MANAGER = "run_manager" + IMAGE_LAUNCHER = "image_launcher" class JobConstants: @@ -353,6 +354,8 @@ class JobConstants: CLIENT_JOB_CONFIG = "config_fed_client.json" META_FILE = "meta.json" META = "meta" + SITES = "sites" + JOB_IMAGE = "image" class WorkspaceConstants: diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index f39eeb36c1..20e17b2f2d 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -17,7 +17,7 @@ import time from abc import ABC, abstractmethod -from nvflare.apis.fl_constant import AdminCommandNames, RunProcessKey, SystemConfigs +from nvflare.apis.fl_constant import AdminCommandNames, RunProcessKey, SystemConfigs, SystemComponents from nvflare.apis.resource_manager_spec import ResourceManagerSpec from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec from nvflare.app_opt.job_launcher.process_launcher import ProcessJobLauncher @@ -190,7 +190,7 @@ def _get_job_launcher(self, client, job_meta: dict) -> JobLauncherSpec: launch_image = extract_job_image(job_meta, client.client_name) if launch_image: engine = client.engine - launcher = engine.get_component("image_launcher") + launcher = engine.get_component(SystemComponents.IMAGE_LAUNCHER) if not launcher: raise RuntimeError("There's no image job launcher defined.") self.logger.info(f"Launch job image: {launch_image} with job launcher: {type(launcher)} ") diff --git 
a/nvflare/private/fed/utils/fed_utils.py b/nvflare/private/fed/utils/fed_utils.py index b3f192fdf2..a5a10fc934 100644 --- a/nvflare/private/fed/utils/fed_utils.py +++ b/nvflare/private/fed/utils/fed_utils.py @@ -26,7 +26,8 @@ from nvflare.apis.client import Client from nvflare.apis.event_type import EventType from nvflare.apis.fl_component import FLContext -from nvflare.apis.fl_constant import ConfigVarName, FLContextKey, FLMetaKey, SiteType, SystemVarName, WorkspaceConstants +from nvflare.apis.fl_constant import ConfigVarName, FLContextKey, FLMetaKey, SiteType, SystemVarName, \ + WorkspaceConstants, JobConstants from nvflare.apis.fl_exception import UnsafeComponentError from nvflare.apis.job_def import JobMetaKey from nvflare.apis.utils.decomposers import flare_decomposers @@ -407,7 +408,7 @@ def extract_participants(participants_list): if isinstance(item, str): participants.append(item) elif isinstance(item, dict): - sites = item.get("sites") + sites = item.get(JobConstants.SITES) participants.extend(sites) else: raise ValueError(f"Must be tye of str or dict, but got {type(item)}") @@ -419,7 +420,7 @@ def extract_job_image(job_meta, site_name): for _, participants in deploy_map.items(): for item in participants: if isinstance(item, dict): - sites = item.get("sites") + sites = item.get(JobConstants.SITES) if site_name in sites: - return item.get("image") + return item.get(JobConstants.JOB_IMAGE) return None From b974d9ae8ab0b06a65f024e39bb25f738cccfb27 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Tue, 22 Oct 2024 09:15:26 -0400 Subject: [PATCH 18/31] Change the JobLauncherSpec API signiture. 
--- nvflare/apis/fl_constant.py | 3 ++ .../app_opt/job_launcher/job_launcher_spec.py | 8 ++- nvflare/app_opt/job_launcher/k8s_launcher.py | 15 ++++-- .../app_opt/job_launcher/process_launcher.py | 50 +++++++++++-------- .../private/fed/app/client/client_train.py | 7 ++- nvflare/private/fed/client/client_engine.py | 4 +- nvflare/private/fed/client/client_executor.py | 16 +++--- 7 files changed, 63 insertions(+), 40 deletions(-) diff --git a/nvflare/apis/fl_constant.py b/nvflare/apis/fl_constant.py index 1d73891389..e8a69b42fc 100644 --- a/nvflare/apis/fl_constant.py +++ b/nvflare/apis/fl_constant.py @@ -160,6 +160,8 @@ class FLContextKey(object): AUTHORIZATION_REASON = "_authorization_reason" DISCONNECTED_CLIENT_NAME = "_disconnected_client_name" RECONNECTED_CLIENT_NAME = "_reconnected_client_name" + SERVER_CONFIG = "_server_config" + SITE_OBJ = "_site_obj_" CLIENT_REGISTER_DATA = "_client_register_data" SECURITY_ITEMS = "_security_items" @@ -177,6 +179,7 @@ class FLContextKey(object): CONFIG_CTX = "__config_ctx__" FILTER_DIRECTION = "__filter_dir__" ROOT_URL = "__root_url__" # the URL for accessing the FL Server + INTERNAL_URL = "__internal_url__" NOT_READY_TO_END_RUN = "not_ready_to_end_run__" # component sets this to indicate it's not ready to end run yet diff --git a/nvflare/app_opt/job_launcher/job_launcher_spec.py b/nvflare/app_opt/job_launcher/job_launcher_spec.py index a6ad3c0aa8..e67f18dd9e 100644 --- a/nvflare/app_opt/job_launcher/job_launcher_spec.py +++ b/nvflare/app_opt/job_launcher/job_launcher_spec.py @@ -13,6 +13,8 @@ # limitations under the License. from abc import abstractmethod +from nvflare.apis.fl_context import FLContext + class JobHandleSpec: @abstractmethod @@ -46,12 +48,14 @@ def wait(self): class JobLauncherSpec: @abstractmethod def launch_job( - self, job_id, job_meta, client, startup, args, app_custom_folder, target: str, scheme: str + self, job_id: str, job_meta: dict, fl_ctx: FLContext ) -> JobHandleSpec: """To launch a job run. 
Args: - job_meta: + job_id: job_id + job_meta: meta data for the job + fl_ctx: FLContext Returns: boolean to indicates the job launch success or fail. diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index f9da5eb9f9..c22add7fb1 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -20,6 +20,9 @@ from kubernetes.client.api import core_v1_api from kubernetes.client.rest import ApiException +from nvflare.apis.fl_constant import FLContextKey +from nvflare.apis.fl_context import FLContext +from nvflare.apis.workspace import Workspace from nvflare.app_opt.job_launcher.job_launcher_spec import JobHandleSpec, JobLauncherSpec from nvflare.private.fed.utils.fed_utils import extract_job_image @@ -201,9 +204,13 @@ def __init__( self.logger = logging.getLogger(self.__class__.__name__) def launch_job( - self, job_id, job_meta, client, startup, args, app_custom_folder, target: str, scheme: str + self, job_id: str, job_meta: dict, fl_ctx: FLContext ) -> JobHandleSpec: + workspace_obj: Workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT) + args = fl_ctx.get_prop(FLContextKey.ARGS) + client = fl_ctx.get_prop(FLContextKey.SITE_OBJ) + # root_hostpath = "/home/azureuser/wksp/k2k/disk" # job_image = "localhost:32000/nvfl-k8s:0.0.1" self.logger.info(f"K8sJobLauncher start to launch job: {job_id} for client: {client.client_name}") @@ -218,14 +225,14 @@ def launch_job( "volume_list": [{"name": self.workspace, "hostPath": {"path": self.root_hostpath, "type": "Directory"}}], "module_args": { "-m": args.workspace, - "-w": startup, + "-w": (workspace_obj.get_startup_kit_dir()), "-t": client.token, "-d": client.ssid, "-n": job_id, "-c": client.client_name, "-p": "tcp://parent-pod:8004", - "-g": target, - "-scheme": scheme, + "-g": fl_ctx.get_prop(FLContextKey.SERVER_CONFIG).get("target"), + "-scheme": fl_ctx.get_prop(FLContextKey.SERVER_CONFIG).get("scheme", "grpc"), "-s": 
"fed_client.json", }, "set_list": args.set, diff --git a/nvflare/app_opt/job_launcher/process_launcher.py b/nvflare/app_opt/job_launcher/process_launcher.py index f651410df7..779edbc164 100644 --- a/nvflare/app_opt/job_launcher/process_launcher.py +++ b/nvflare/app_opt/job_launcher/process_launcher.py @@ -17,6 +17,9 @@ import subprocess import sys +from nvflare.apis.fl_constant import FLContextKey +from nvflare.apis.fl_context import FLContext +from nvflare.apis.workspace import Workspace from nvflare.app_opt.job_launcher.job_launcher_spec import JobHandleSpec, JobLauncherSpec from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path @@ -56,10 +59,15 @@ def __init__(self): self.logger = logging.getLogger(self.__class__.__name__) def launch_job( - self, job_id, job_meta, client, startup, args, app_custom_folder, target: str, scheme: str + self, job_id: str, job_meta: dict, fl_ctx: FLContext ) -> JobHandleSpec: new_env = os.environ.copy() + workspace_obj: Workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT) + args = fl_ctx.get_prop(FLContextKey.ARGS) + client = fl_ctx.get_prop(FLContextKey.SITE_OBJ) + + app_custom_folder = workspace_obj.get_app_custom_dir(job_id) if app_custom_folder != "": add_custom_dir_to_path(app_custom_folder, new_env) @@ -67,26 +75,26 @@ def launch_job( for t in args.set: command_options += " " + t command = ( - f"{sys.executable} -m nvflare.private.fed.app.client.worker_process -m " - + args.workspace - + " -w " - + startup - + " -t " - + client.token - + " -d " - + client.ssid - + " -n " - + job_id - + " -c " - + client.client_name - + " -p " - + str(client.cell.get_internal_listener_url()) - + " -g " - + target - + " -scheme " - + scheme - + " -s fed_client.json " - " --set" + command_options + " print_conf=True" + f"{sys.executable} -m nvflare.private.fed.app.client.worker_process -m " + + args.workspace + + " -w " + + (workspace_obj.get_startup_kit_dir()) + + " -t " + + client.token + + " -d " + + client.ssid + + " -n " 
+ + job_id + + " -c " + + client.client_name + + " -p " + + str(client.cell.get_internal_listener_url()) + + " -g " + + fl_ctx.get_prop(FLContextKey.SERVER_CONFIG).get("target") + + " -scheme " + + fl_ctx.get_prop(FLContextKey.SERVER_CONFIG).get("scheme", "grpc") + + " -s fed_client.json " + " --set" + command_options + " print_conf=True" ) # use os.setsid to create new process group ID process = subprocess.Popen(shlex.split(command, True), preexec_fn=os.setsid, env=new_env) diff --git a/nvflare/private/fed/app/client/client_train.py b/nvflare/private/fed/app/client/client_train.py index 49eaa7fcd2..98cd0736d9 100644 --- a/nvflare/private/fed/app/client/client_train.py +++ b/nvflare/private/fed/app/client/client_train.py @@ -108,9 +108,14 @@ def main(args): time.sleep(1.0) with client_engine.new_context() as fl_ctx: - fl_ctx.set_prop(FLContextKey.WORKSPACE_OBJECT, workspace, private=True) client_engine.fire_event(EventType.SYSTEM_BOOTSTRAP, fl_ctx) + fl_ctx.set_prop(FLContextKey.WORKSPACE_OBJECT, workspace, private=True) + server_config = list(federated_client.servers.values())[0] + fl_ctx.set_prop(FLContextKey.SERVER_CONFIG, server_config, private=True, sticky=True) + fl_ctx.set_prop(FLContextKey.ARGS, args, private=True, sticky=True) + fl_ctx.set_prop(FLContextKey.SITE_OBJ, federated_client, private=True, sticky=True) + component_security_check(fl_ctx) client_engine.fire_event(EventType.BEFORE_CLIENT_REGISTER, fl_ctx) diff --git a/nvflare/private/fed/client/client_engine.py b/nvflare/private/fed/client/client_engine.py index 5315959e34..875dbc6e51 100644 --- a/nvflare/private/fed/client/client_engine.py +++ b/nvflare/private/fed/client/client_engine.py @@ -161,7 +161,6 @@ def start_app( self.logger.info("Starting client app. 
rank: {}".format(self.rank)) - server_config = list(self.client.servers.values())[0] self.client_executor.start_app( self.client, job_id, @@ -171,8 +170,7 @@ def start_app( allocated_resource, token, resource_manager, - target=server_config["target"], - scheme=server_config.get("scheme", "grpc"), + fl_ctx=self.new_context() ) return "Start the client app..." diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 20e17b2f2d..67417adc1e 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -18,6 +18,7 @@ from abc import ABC, abstractmethod from nvflare.apis.fl_constant import AdminCommandNames, RunProcessKey, SystemConfigs, SystemComponents +from nvflare.apis.fl_context import FLContext from nvflare.apis.resource_manager_spec import ResourceManagerSpec from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec from nvflare.app_opt.job_launcher.process_launcher import ProcessJobLauncher @@ -44,8 +45,7 @@ def start_app( allocated_resource, token, resource_manager, - target: str, - scheme: str, + fl_ctx: FLContext ): """Starts the client app. @@ -57,8 +57,7 @@ def start_app( allocated_resource: allocated resources token: token from resource manager resource_manager: resource manager - target: SP target location - scheme: SP target connection scheme + fl_ctx: FLContext """ pass @@ -150,8 +149,7 @@ def start_app( allocated_resource, token, resource_manager: ResourceManagerSpec, - target: str, - scheme: str, + fl_ctx: FLContext ): """Starts the app. 
@@ -164,12 +162,12 @@ def start_app( allocated_resource: allocated resources token: token from resource manager resource_manager: resource manager - target: SP target location - scheme: SP connection scheme + fl_ctx: FLContext """ + job_launcher: JobLauncherSpec = self._get_job_launcher(client, job_meta) job_handle = job_launcher.launch_job( - job_id, job_meta, client, self.startup, args, app_custom_folder, target, scheme + job_id, job_meta, fl_ctx ) client.multi_gpu = False From 765357b0445a78c42ef97331e7a9bc42ec4e245e Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Tue, 22 Oct 2024 12:26:17 -0400 Subject: [PATCH 19/31] Added can_launch() to JobLauncherSpec. --- nvflare/apis/fl_constant.py | 1 + .../app_opt/job_launcher/job_launcher_spec.py | 18 +++++-- nvflare/app_opt/job_launcher/k8s_launcher.py | 18 +++++-- .../app_opt/job_launcher/process_launcher.py | 54 ++++++++++--------- nvflare/private/fed/client/client_engine.py | 2 +- nvflare/private/fed/client/client_executor.py | 35 ++++++------ nvflare/private/fed/utils/fed_utils.py | 11 +++- 7 files changed, 84 insertions(+), 55 deletions(-) diff --git a/nvflare/apis/fl_constant.py b/nvflare/apis/fl_constant.py index e8a69b42fc..355e726b16 100644 --- a/nvflare/apis/fl_constant.py +++ b/nvflare/apis/fl_constant.py @@ -359,6 +359,7 @@ class JobConstants: META = "meta" SITES = "sites" JOB_IMAGE = "image" + JOB_ID = "job_id" class WorkspaceConstants: diff --git a/nvflare/app_opt/job_launcher/job_launcher_spec.py b/nvflare/app_opt/job_launcher/job_launcher_spec.py index e67f18dd9e..94445a00b7 100644 --- a/nvflare/app_opt/job_launcher/job_launcher_spec.py +++ b/nvflare/app_opt/job_launcher/job_launcher_spec.py @@ -47,17 +47,25 @@ def wait(self): class JobLauncherSpec: @abstractmethod - def launch_job( - self, job_id: str, job_meta: dict, fl_ctx: FLContext - ) -> JobHandleSpec: + def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: """To launch a job run. 
Args: - job_id: job_id - job_meta: meta data for the job + launch_data: job launch meta data fl_ctx: FLContext Returns: boolean to indicates the job launch success or fail. """ raise NotImplemented + + @abstractmethod + def can_launch(self, launch_data: dict) -> bool: + """To determine if the launcher can launch this job. + + Args: + launch_data: job launch meta data + + Returns: True / False + + """ diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index c22add7fb1..677a20c2ec 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -20,7 +20,7 @@ from kubernetes.client.api import core_v1_api from kubernetes.client.rest import ApiException -from nvflare.apis.fl_constant import FLContextKey +from nvflare.apis.fl_constant import FLContextKey, JobConstants from nvflare.apis.fl_context import FLContext from nvflare.apis.workspace import Workspace from nvflare.app_opt.job_launcher.job_launcher_spec import JobHandleSpec, JobLauncherSpec @@ -180,6 +180,7 @@ def __init__( root_hostpath: str, workspace: str, mount_path: str, + supported_images: [str] = None, timeout=None, namespace="default", ): @@ -189,6 +190,7 @@ def __init__( self.workspace = workspace self.mount_path = mount_path self.timeout = timeout + self.supported_images = supported_images config.load_kube_config(config_file_path) try: @@ -203,18 +205,17 @@ def __init__( self.job_handle = None self.logger = logging.getLogger(self.__class__.__name__) - def launch_job( - self, job_id: str, job_meta: dict, fl_ctx: FLContext - ) -> JobHandleSpec: + def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: workspace_obj: Workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT) args = fl_ctx.get_prop(FLContextKey.ARGS) client = fl_ctx.get_prop(FLContextKey.SITE_OBJ) + job_id = launch_data.get(JobConstants.JOB_ID) # root_hostpath = "/home/azureuser/wksp/k2k/disk" # job_image = 
"localhost:32000/nvfl-k8s:0.0.1" self.logger.info(f"K8sJobLauncher start to launch job: {job_id} for client: {client.client_name}") - job_image = extract_job_image(job_meta, client.client_name) + job_image = launch_data.get(JobConstants.JOB_IMAGE) self.logger.info(f"launch job use image: {job_image}") job_config = { "name": job_id, @@ -251,3 +252,10 @@ def launch_job( except ApiException as e: job_handle.terminate() return None + + def can_launch(self, launch_data: dict) -> bool: + job_image = launch_data.get(JobConstants.JOB_IMAGE) + if job_image in self.supported_images: + return True + else: + return False diff --git a/nvflare/app_opt/job_launcher/process_launcher.py b/nvflare/app_opt/job_launcher/process_launcher.py index 779edbc164..978750f34a 100644 --- a/nvflare/app_opt/job_launcher/process_launcher.py +++ b/nvflare/app_opt/job_launcher/process_launcher.py @@ -17,7 +17,7 @@ import subprocess import sys -from nvflare.apis.fl_constant import FLContextKey +from nvflare.apis.fl_constant import FLContextKey, JobConstants from nvflare.apis.fl_context import FLContext from nvflare.apis.workspace import Workspace from nvflare.app_opt.job_launcher.job_launcher_spec import JobHandleSpec, JobLauncherSpec @@ -58,14 +58,13 @@ def __init__(self): self.logger = logging.getLogger(self.__class__.__name__) - def launch_job( - self, job_id: str, job_meta: dict, fl_ctx: FLContext - ) -> JobHandleSpec: + def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: new_env = os.environ.copy() workspace_obj: Workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT) args = fl_ctx.get_prop(FLContextKey.ARGS) client = fl_ctx.get_prop(FLContextKey.SITE_OBJ) + job_id = launch_data.get(JobConstants.JOB_ID) app_custom_folder = workspace_obj.get_app_custom_dir(job_id) if app_custom_folder != "": @@ -75,26 +74,26 @@ def launch_job( for t in args.set: command_options += " " + t command = ( - f"{sys.executable} -m nvflare.private.fed.app.client.worker_process -m " - + 
args.workspace - + " -w " - + (workspace_obj.get_startup_kit_dir()) - + " -t " - + client.token - + " -d " - + client.ssid - + " -n " - + job_id - + " -c " - + client.client_name - + " -p " - + str(client.cell.get_internal_listener_url()) - + " -g " - + fl_ctx.get_prop(FLContextKey.SERVER_CONFIG).get("target") - + " -scheme " - + fl_ctx.get_prop(FLContextKey.SERVER_CONFIG).get("scheme", "grpc") - + " -s fed_client.json " - " --set" + command_options + " print_conf=True" + f"{sys.executable} -m nvflare.private.fed.app.client.worker_process -m " + + args.workspace + + " -w " + + (workspace_obj.get_startup_kit_dir()) + + " -t " + + client.token + + " -d " + + client.ssid + + " -n " + + job_id + + " -c " + + client.client_name + + " -p " + + str(client.cell.get_internal_listener_url()) + + " -g " + + fl_ctx.get_prop(FLContextKey.SERVER_CONFIG).get("target") + + " -scheme " + + fl_ctx.get_prop(FLContextKey.SERVER_CONFIG).get("scheme", "grpc") + + " -s fed_client.json " + " --set" + command_options + " print_conf=True" ) # use os.setsid to create new process group ID process = subprocess.Popen(shlex.split(command, True), preexec_fn=os.setsid, env=new_env) @@ -102,3 +101,10 @@ def launch_job( self.logger.info("Worker child process ID: {}".format(process.pid)) return ProcessHandle(process) + + def can_launch(self, launch_data: dict) -> bool: + job_image = launch_data.get(JobConstants.JOB_IMAGE) + if job_image: + return False + else: + return True diff --git a/nvflare/private/fed/client/client_engine.py b/nvflare/private/fed/client/client_engine.py index 875dbc6e51..6f293abdb4 100644 --- a/nvflare/private/fed/client/client_engine.py +++ b/nvflare/private/fed/client/client_engine.py @@ -170,7 +170,7 @@ def start_app( allocated_resource, token, resource_manager, - fl_ctx=self.new_context() + fl_ctx=self.new_context(), ) return "Start the client app..." 
diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 67417adc1e..ce02dfeec8 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -17,7 +17,7 @@ import time from abc import ABC, abstractmethod -from nvflare.apis.fl_constant import AdminCommandNames, RunProcessKey, SystemConfigs, SystemComponents +from nvflare.apis.fl_constant import AdminCommandNames, JobConstants, RunProcessKey, SystemComponents, SystemConfigs from nvflare.apis.fl_context import FLContext from nvflare.apis.resource_manager_spec import ResourceManagerSpec from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec @@ -45,7 +45,7 @@ def start_app( allocated_resource, token, resource_manager, - fl_ctx: FLContext + fl_ctx: FLContext, ): """Starts the client app. @@ -149,7 +149,7 @@ def start_app( allocated_resource, token, resource_manager: ResourceManagerSpec, - fl_ctx: FLContext + fl_ctx: FLContext, ): """Starts the app. 
@@ -165,10 +165,11 @@ def start_app( fl_ctx: FLContext """ - job_launcher: JobLauncherSpec = self._get_job_launcher(client, job_meta) - job_handle = job_launcher.launch_job( - job_id, job_meta, fl_ctx - ) + job_launcher, launch_data = self._get_job_launcher(job_id, client, job_meta) + if not job_launcher: + raise RuntimeError(f"There's no job launcher can handle this job: {launch_data}.") + job_handle = job_launcher.launch_job(launch_data, fl_ctx) + self.logger.info(f"Launch job data: {launch_data} with job launcher: {type(job_launcher)} ") client.multi_gpu = False @@ -184,18 +185,16 @@ def start_app( ) thread.start() - def _get_job_launcher(self, client, job_meta: dict) -> JobLauncherSpec: + def _get_job_launcher(self, job_id, client, job_meta: dict) -> (JobLauncherSpec, dict): launch_image = extract_job_image(job_meta, client.client_name) - if launch_image: - engine = client.engine - launcher = engine.get_component(SystemComponents.IMAGE_LAUNCHER) - if not launcher: - raise RuntimeError("There's no image job launcher defined.") - self.logger.info(f"Launch job image: {launch_image} with job launcher: {type(launcher)} ") - else: - launcher = ProcessJobLauncher() - self.logger.info("Launch job with ProcessJobLauncher.") - return launcher + launch_data = {JobConstants.JOB_IMAGE: launch_image, JobConstants.JOB_ID: job_id} + + launcher = None + for _, component in client.components.items(): + if isinstance(component, JobLauncherSpec): + if component.can_launch(launch_data): + launcher = component + return launcher, launch_data def notify_job_status(self, job_id, job_status): run_process = self.run_processes.get(job_id) diff --git a/nvflare/private/fed/utils/fed_utils.py b/nvflare/private/fed/utils/fed_utils.py index a5a10fc934..e37e834b97 100644 --- a/nvflare/private/fed/utils/fed_utils.py +++ b/nvflare/private/fed/utils/fed_utils.py @@ -26,8 +26,15 @@ from nvflare.apis.client import Client from nvflare.apis.event_type import EventType from nvflare.apis.fl_component 
import FLContext -from nvflare.apis.fl_constant import ConfigVarName, FLContextKey, FLMetaKey, SiteType, SystemVarName, \ - WorkspaceConstants, JobConstants +from nvflare.apis.fl_constant import ( + ConfigVarName, + FLContextKey, + FLMetaKey, + JobConstants, + SiteType, + SystemVarName, + WorkspaceConstants, +) from nvflare.apis.fl_exception import UnsafeComponentError from nvflare.apis.job_def import JobMetaKey from nvflare.apis.utils.decomposers import flare_decomposers From 1d7ebc7c37f33f7d45183f6447c4873bc1375d78 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Wed, 23 Oct 2024 10:57:33 -0400 Subject: [PATCH 20/31] refactor. --- nvflare/apis/fl_constant.py | 3 +-- nvflare/app_opt/job_launcher/k8s_launcher.py | 4 ---- nvflare/lighter/impl/master_template.yml | 5 +++++ nvflare/private/fed/client/client_executor.py | 12 +++++++----- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/nvflare/apis/fl_constant.py b/nvflare/apis/fl_constant.py index 355e726b16..6c75039fbb 100644 --- a/nvflare/apis/fl_constant.py +++ b/nvflare/apis/fl_constant.py @@ -179,7 +179,6 @@ class FLContextKey(object): CONFIG_CTX = "__config_ctx__" FILTER_DIRECTION = "__filter_dir__" ROOT_URL = "__root_url__" # the URL for accessing the FL Server - INTERNAL_URL = "__internal_url__" NOT_READY_TO_END_RUN = "not_ready_to_end_run__" # component sets this to indicate it's not ready to end run yet @@ -349,7 +348,6 @@ class SystemComponents(object): JOB_META_VALIDATOR = "job_meta_validator" FED_CLIENT = "fed_client" RUN_MANAGER = "run_manager" - IMAGE_LAUNCHER = "image_launcher" class JobConstants: @@ -360,6 +358,7 @@ class JobConstants: SITES = "sites" JOB_IMAGE = "image" JOB_ID = "job_id" + JOB_LAUNCHER = "job_launcher" class WorkspaceConstants: diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index 677a20c2ec..e9d05228ca 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ 
-24,7 +24,6 @@ from nvflare.apis.fl_context import FLContext from nvflare.apis.workspace import Workspace from nvflare.app_opt.job_launcher.job_launcher_spec import JobHandleSpec, JobLauncherSpec -from nvflare.private.fed.utils.fed_utils import extract_job_image class JobState(Enum): @@ -212,8 +211,6 @@ def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: client = fl_ctx.get_prop(FLContextKey.SITE_OBJ) job_id = launch_data.get(JobConstants.JOB_ID) - # root_hostpath = "/home/azureuser/wksp/k2k/disk" - # job_image = "localhost:32000/nvfl-k8s:0.0.1" self.logger.info(f"K8sJobLauncher start to launch job: {job_id} for client: {client.client_name}") job_image = launch_data.get(JobConstants.JOB_IMAGE) self.logger.info(f"launch job use image: {job_image}") @@ -221,7 +218,6 @@ def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: "name": job_id, "image": job_image, "container_name": f"container-{job_id}", - # "volume_mount_list": [{'name':'workspace-nvflare', 'mountPath': '/workspace/nvflare'}], "volume_mount_list": [{"name": self.workspace, "mountPath": self.mount_path}], "volume_list": [{"name": self.workspace, "hostPath": {"path": self.root_hostpath, "type": "Directory"}}], "module_args": { diff --git a/nvflare/lighter/impl/master_template.yml b/nvflare/lighter/impl/master_template.yml index decf05c19f..1fc697ddb6 100644 --- a/nvflare/lighter/impl/master_template.yml +++ b/nvflare/lighter/impl/master_template.yml @@ -92,6 +92,11 @@ local_client_resources: | "id": "resource_consumer", "path": "nvflare.app_common.resource_consumers.gpu_resource_consumer.GPUResourceConsumer", "args": {} + }, + { + "id": "process_launcher", + "path": "nvflare.app_opt.job_launcher.process_launcher.ProcessJobLauncher", + "args": {} } ] } diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index ce02dfeec8..0e2c628b93 100644 --- a/nvflare/private/fed/client/client_executor.py +++ 
b/nvflare/private/fed/client/client_executor.py @@ -165,7 +165,8 @@ def start_app( fl_ctx: FLContext """ - job_launcher, launch_data = self._get_job_launcher(job_id, client, job_meta) + launch_data = self._get_job_launcher(job_id, client, job_meta) + job_launcher = launch_data.get(JobConstants.JOB_LAUNCHER) if not job_launcher: raise RuntimeError(f"There's no job launcher can handle this job: {launch_data}.") job_handle = job_launcher.launch_job(launch_data, fl_ctx) @@ -185,16 +186,17 @@ def start_app( ) thread.start() - def _get_job_launcher(self, job_id, client, job_meta: dict) -> (JobLauncherSpec, dict): + def _get_job_launcher(self, job_id, client, job_meta: dict) -> dict: launch_image = extract_job_image(job_meta, client.client_name) launch_data = {JobConstants.JOB_IMAGE: launch_image, JobConstants.JOB_ID: job_id} - launcher = None + job_launcher = None for _, component in client.components.items(): if isinstance(component, JobLauncherSpec): if component.can_launch(launch_data): - launcher = component - return launcher, launch_data + job_launcher = component + launch_data[JobConstants.JOB_LAUNCHER] = job_launcher + return launch_data def notify_job_status(self, job_id, job_status): run_process = self.run_processes.get(job_id) From dc94e012a4332be196d9b88c64d879c68ef27e1c Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Wed, 23 Oct 2024 14:12:06 -0400 Subject: [PATCH 21/31] removed duplicate const. 
--- nvflare/apis/fl_constant.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nvflare/apis/fl_constant.py b/nvflare/apis/fl_constant.py index de593aa242..a79a1dadde 100644 --- a/nvflare/apis/fl_constant.py +++ b/nvflare/apis/fl_constant.py @@ -160,7 +160,6 @@ class FLContextKey(object): AUTHORIZATION_REASON = "_authorization_reason" DISCONNECTED_CLIENT_NAME = "_disconnected_client_name" RECONNECTED_CLIENT_NAME = "_reconnected_client_name" - SERVER_CONFIG = "_server_config" SITE_OBJ = "_site_obj_" CLIENT_REGISTER_DATA = "_client_register_data" From 7f1a9d93c9cc7c74ef522b310d83b577fb297401 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Wed, 23 Oct 2024 16:36:29 -0400 Subject: [PATCH 22/31] removed no use import. --- nvflare/private/fed/client/client_executor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 74aa733e20..6631b17a77 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -17,11 +17,10 @@ import time from abc import ABC, abstractmethod -from nvflare.apis.fl_constant import AdminCommandNames, JobConstants, RunProcessKey, SystemComponents, SystemConfigs +from nvflare.apis.fl_constant import AdminCommandNames, JobConstants, RunProcessKey, SystemConfigs from nvflare.apis.fl_context import FLContext from nvflare.apis.resource_manager_spec import ResourceManagerSpec from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec -from nvflare.app_opt.job_launcher.process_launcher import ProcessJobLauncher from nvflare.fuel.common.exit_codes import PROCESS_EXIT_REASON, ProcessExitCode from nvflare.fuel.f3.cellnet.core_cell import FQCN from nvflare.fuel.f3.cellnet.defs import MessageHeaderKey, ReturnCode From ae8a2dcdba763f1f52c154c69d79f4ac3b28a07a Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Thu, 24 Oct 2024 08:43:35 -0400 Subject: [PATCH 23/31] changed to raise 
NotImplementedError(). --- nvflare/app_opt/job_launcher/job_launcher_spec.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/nvflare/app_opt/job_launcher/job_launcher_spec.py b/nvflare/app_opt/job_launcher/job_launcher_spec.py index 94445a00b7..37afd9db34 100644 --- a/nvflare/app_opt/job_launcher/job_launcher_spec.py +++ b/nvflare/app_opt/job_launcher/job_launcher_spec.py @@ -24,7 +24,7 @@ def terminate(self): Returns: the job run return code. """ - raise NotImplemented + raise NotImplementedError() @abstractmethod def poll(self): @@ -33,7 +33,7 @@ def poll(self): Returns: return_code """ - raise NotImplemented + raise NotImplementedError() @abstractmethod def wait(self): @@ -42,7 +42,7 @@ def wait(self): Returns: returns until the job run complete. """ - raise NotImplemented + raise NotImplementedError() class JobLauncherSpec: @@ -57,7 +57,7 @@ def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: Returns: boolean to indicates the job launch success or fail. """ - raise NotImplemented + raise NotImplementedError() @abstractmethod def can_launch(self, launch_data: dict) -> bool: @@ -69,3 +69,4 @@ def can_launch(self, launch_data: dict) -> bool: Returns: True / False """ + raise NotImplementedError() From f923e97f7c63a85a61801e16002b60ad4dc5a6f1 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 25 Oct 2024 11:48:18 -0400 Subject: [PATCH 24/31] refactored. 
--- .../job_launcher_spec.py | 3 ++- nvflare/app_common/job_launcher/__init__.py | 13 ++++++++++++ .../job_launcher/process_launcher.py | 4 ++-- nvflare/app_opt/job_launcher/k8s_launcher.py | 20 +++++++++++++++---- nvflare/private/fed/client/client_executor.py | 16 +++++++-------- nvflare/private/fed/utils/fed_utils.py | 8 ++++---- 6 files changed, 45 insertions(+), 19 deletions(-) rename nvflare/{app_opt/job_launcher => apis}/job_launcher_spec.py (94%) create mode 100644 nvflare/app_common/job_launcher/__init__.py rename nvflare/{app_opt => app_common}/job_launcher/process_launcher.py (96%) diff --git a/nvflare/app_opt/job_launcher/job_launcher_spec.py b/nvflare/apis/job_launcher_spec.py similarity index 94% rename from nvflare/app_opt/job_launcher/job_launcher_spec.py rename to nvflare/apis/job_launcher_spec.py index 37afd9db34..378c8620e4 100644 --- a/nvflare/app_opt/job_launcher/job_launcher_spec.py +++ b/nvflare/apis/job_launcher_spec.py @@ -60,10 +60,11 @@ def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: raise NotImplementedError() @abstractmethod - def can_launch(self, launch_data: dict) -> bool: + def can_launch(self, launch_data: dict, fl_ctx: FLContext) -> bool: """To determine if the launcher can launch this job. Args: + fl_ctx: FLContext launch_data: job launch meta data Returns: True / False diff --git a/nvflare/app_common/job_launcher/__init__.py b/nvflare/app_common/job_launcher/__init__.py new file mode 100644 index 0000000000..d9155f923f --- /dev/null +++ b/nvflare/app_common/job_launcher/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/nvflare/app_opt/job_launcher/process_launcher.py b/nvflare/app_common/job_launcher/process_launcher.py similarity index 96% rename from nvflare/app_opt/job_launcher/process_launcher.py rename to nvflare/app_common/job_launcher/process_launcher.py index a3e0882dec..87d4e374ba 100644 --- a/nvflare/app_opt/job_launcher/process_launcher.py +++ b/nvflare/app_common/job_launcher/process_launcher.py @@ -20,7 +20,7 @@ from nvflare.apis.fl_constant import FLContextKey, JobConstants from nvflare.apis.fl_context import FLContext from nvflare.apis.workspace import Workspace -from nvflare.app_opt.job_launcher.job_launcher_spec import JobHandleSpec, JobLauncherSpec +from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path @@ -108,7 +108,7 @@ def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: return ProcessHandle(process) - def can_launch(self, launch_data: dict) -> bool: + def can_launch(self, launch_data: dict, fl_ctx: FLContext) -> bool: job_image = launch_data.get(JobConstants.JOB_IMAGE) if job_image: return False diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index e4b22ae322..ae8d68ea25 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -23,7 +23,7 @@ from nvflare.apis.fl_constant import FLContextKey, JobConstants from nvflare.apis.fl_context import FLContext from nvflare.apis.workspace import Workspace -from 
nvflare.app_opt.job_launcher.job_launcher_spec import JobHandleSpec, JobLauncherSpec +from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec class JobState(Enum): @@ -42,6 +42,14 @@ class JobState(Enum): "Unknown": JobState.UNKNOWN, } +RETURN_CODES = { + JobState.SUCCEEDED: 0, + JobState.STARTING: None, + JobState.RUNNING: None, + JobState.TERMINATED: 1, + JobState.UNKNOWN: None +} + class K8sJobHandle(JobHandleSpec): def __init__(self, job_id: str, api_instance: core_v1_api, job_config: dict, namespace="default", timeout=None): @@ -148,7 +156,7 @@ def enter_states(self, job_states_to_enter: list, timeout=None): if not all([isinstance(js, JobState)] for js in job_states_to_enter): raise ValueError(f"expect job_states_to_enter with valid values, but get {job_states_to_enter}") while True: - job_state = self.poll() + job_state = self._query_state() if job_state in job_states_to_enter: return True elif timeout is not None and time.time() - starting_time > timeout: @@ -162,6 +170,10 @@ def terminate(self): return self.enter_states([JobState.TERMINATED], timeout=self.timeout) def poll(self): + job_state = self._query_state() + return RETURN_CODES.get(job_state) + + def _query_state(self): try: resp = self.api_instance.read_namespaced_pod(name=self.job_id, namespace=self.namespace) except ApiException as e: @@ -255,9 +267,9 @@ def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: job_handle.terminate() return None - def can_launch(self, launch_data: dict) -> bool: + def can_launch(self, launch_data: dict, fl_ctx: FLContext) -> bool: job_image = launch_data.get(JobConstants.JOB_IMAGE) - if job_image in self.supported_images: + if job_image: return True else: return False diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 6631b17a77..209ab0a1ec 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -20,7 +20,7 @@ 
from nvflare.apis.fl_constant import AdminCommandNames, JobConstants, RunProcessKey, SystemConfigs from nvflare.apis.fl_context import FLContext from nvflare.apis.resource_manager_spec import ResourceManagerSpec -from nvflare.app_opt.job_launcher.job_launcher_spec import JobLauncherSpec +from nvflare.apis.job_launcher_spec import JobLauncherSpec from nvflare.fuel.common.exit_codes import PROCESS_EXIT_REASON, ProcessExitCode from nvflare.fuel.f3.cellnet.core_cell import FQCN from nvflare.fuel.f3.cellnet.defs import MessageHeaderKey, ReturnCode @@ -192,7 +192,7 @@ def _get_job_launcher(self, job_id, client, job_meta: dict) -> dict: job_launcher = None for _, component in client.components.items(): if isinstance(component, JobLauncherSpec): - if component.can_launch(launch_data): + if component.can_launch(launch_data, None): job_launcher = component launch_data[JobConstants.JOB_LAUNCHER] = job_launcher return launch_data @@ -319,7 +319,7 @@ def abort_app(self, job_id): if process_status == ClientStatus.STARTED: try: with self.lock: - job_launcher = self.run_processes[job_id][RunProcessKey.JOB_HANDLE] + job_handle = self.run_processes[job_id][RunProcessKey.JOB_HANDLE] data = {} request = new_cell_message({}, data) self.client.cell.fire_and_forget( @@ -330,7 +330,7 @@ def abort_app(self, job_id): optional=True, ) self.logger.debug("abort sent to worker") - t = threading.Thread(target=self._terminate_process, args=[job_launcher, job_id]) + t = threading.Thread(target=self._terminate_process, args=[job_handle, job_id]) t.start() t.join() break @@ -389,11 +389,11 @@ def abort_task(self, job_id): def _wait_child_process_finish(self, client, job_id, allocated_resource, token, resource_manager, workspace): self.logger.info(f"run ({job_id}): waiting for child worker process to finish.") - job_launcher = self.run_processes.get(job_id, {}).get(RunProcessKey.JOB_HANDLE) - if job_launcher: - job_launcher.wait() + job_handle = self.run_processes.get(job_id, 
{}).get(RunProcessKey.JOB_HANDLE) + if job_handle: + job_handle.wait() - return_code = get_return_code(job_launcher, job_id, workspace, self.logger) + return_code = get_return_code(job_handle, job_id, workspace, self.logger) self.logger.info(f"run ({job_id}): child worker process finished with RC {return_code}") if return_code in [ProcessExitCode.UNSAFE_COMPONENT, ProcessExitCode.CONFIG_ERROR]: diff --git a/nvflare/private/fed/utils/fed_utils.py b/nvflare/private/fed/utils/fed_utils.py index 1721d6f629..8e807e5fe9 100644 --- a/nvflare/private/fed/utils/fed_utils.py +++ b/nvflare/private/fed/utils/fed_utils.py @@ -380,7 +380,7 @@ def get_target_names(targets): return target_names -def get_return_code(process, job_id, workspace, logger): +def get_return_code(job_handle, job_id, workspace, logger): run_dir = os.path.join(workspace, job_id) rc_file = os.path.join(run_dir, FLMetaKey.PROCESS_RC_FILE) if os.path.exists(rc_file): @@ -391,11 +391,11 @@ def get_return_code(process, job_id, workspace, logger): except Exception: logger.warning( f"Could not get the return_code from {rc_file} of the job:{job_id}, " - f"Return the RC from the process:{process.pid}" + f"Return the RC from the job_handle:{job_handle}" ) - return_code = process.poll() + return_code = job_handle.poll() else: - return_code = process.poll() + return_code = job_handle.poll() return return_code From 037c7bb5b807fba5ad40a7241e7e48b58830a4f1 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 25 Oct 2024 13:23:14 -0400 Subject: [PATCH 25/31] Changed to use event to get the job launcher. 
--- nvflare/apis/event_type.py | 1 + nvflare/apis/fl_constant.py | 1 + nvflare/apis/job_launcher_spec.py | 20 +++-------- .../job_launcher/process_launcher.py | 23 ++++++++----- nvflare/app_opt/job_launcher/k8s_launcher.py | 16 +++++---- nvflare/lighter/impl/master_template.yml | 2 +- nvflare/private/fed/client/client_executor.py | 33 +++++++++---------- 7 files changed, 46 insertions(+), 50 deletions(-) diff --git a/nvflare/apis/event_type.py b/nvflare/apis/event_type.py index 5bdbe50bc8..0868e09379 100644 --- a/nvflare/apis/event_type.py +++ b/nvflare/apis/event_type.py @@ -89,3 +89,4 @@ class EventType(object): AUTHORIZE_COMMAND_CHECK = "_authorize_command_check" BEFORE_BUILD_COMPONENT = "_before_build_component" + GET_JOB_LAUNCHER = "_get_job_launcher" diff --git a/nvflare/apis/fl_constant.py b/nvflare/apis/fl_constant.py index a79a1dadde..70b2e2af41 100644 --- a/nvflare/apis/fl_constant.py +++ b/nvflare/apis/fl_constant.py @@ -161,6 +161,7 @@ class FLContextKey(object): DISCONNECTED_CLIENT_NAME = "_disconnected_client_name" RECONNECTED_CLIENT_NAME = "_reconnected_client_name" SITE_OBJ = "_site_obj_" + JOB_LAUNCHER = "_job_launcher" CLIENT_REGISTER_DATA = "_client_register_data" SECURITY_ITEMS = "_security_items" diff --git a/nvflare/apis/job_launcher_spec.py b/nvflare/apis/job_launcher_spec.py index 378c8620e4..2958d0f30b 100644 --- a/nvflare/apis/job_launcher_spec.py +++ b/nvflare/apis/job_launcher_spec.py @@ -13,6 +13,7 @@ # limitations under the License. from abc import abstractmethod +from nvflare.apis.fl_component import FLComponent from nvflare.apis.fl_context import FLContext @@ -45,29 +46,16 @@ def wait(self): raise NotImplementedError() -class JobLauncherSpec: +class JobLauncherSpec(FLComponent): @abstractmethod - def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: + def launch_job(self, meta_data: dict, fl_ctx: FLContext) -> JobHandleSpec: """To launch a job run. 
Args: - launch_data: job launch meta data + meta_data: job meta data fl_ctx: FLContext Returns: boolean to indicates the job launch success or fail. """ raise NotImplementedError() - - @abstractmethod - def can_launch(self, launch_data: dict, fl_ctx: FLContext) -> bool: - """To determine if the launcher can launch this job. - - Args: - fl_ctx: FLContext - launch_data: job launch meta data - - Returns: True / False - - """ - raise NotImplementedError() diff --git a/nvflare/app_common/job_launcher/process_launcher.py b/nvflare/app_common/job_launcher/process_launcher.py index 87d4e374ba..97f962b309 100644 --- a/nvflare/app_common/job_launcher/process_launcher.py +++ b/nvflare/app_common/job_launcher/process_launcher.py @@ -17,11 +17,13 @@ import subprocess import sys +from nvflare.apis.event_type import EventType from nvflare.apis.fl_constant import FLContextKey, JobConstants from nvflare.apis.fl_context import FLContext +from nvflare.apis.job_def import JobMetaKey from nvflare.apis.workspace import Workspace from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec -from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path +from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path, extract_job_image class ProcessHandle(JobHandleSpec): @@ -58,13 +60,13 @@ def __init__(self): self.logger = logging.getLogger(self.__class__.__name__) - def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: + def launch_job(self, meta_data: dict, fl_ctx: FLContext) -> JobHandleSpec: new_env = os.environ.copy() workspace_obj: Workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT) args = fl_ctx.get_prop(FLContextKey.ARGS) client = fl_ctx.get_prop(FLContextKey.SITE_OBJ) - job_id = launch_data.get(JobConstants.JOB_ID) + job_id = meta_data.get(JobMetaKey.JOB_ID) server_config = fl_ctx.get_prop(FLContextKey.SERVER_CONFIG) if not server_config: raise RuntimeError(f"missing {FLContextKey.SERVER_CONFIG} in FL context") @@ -108,9 
+110,12 @@ def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: return ProcessHandle(process) - def can_launch(self, launch_data: dict, fl_ctx: FLContext) -> bool: - job_image = launch_data.get(JobConstants.JOB_IMAGE) - if job_image: - return False - else: - return True + def handle_event(self, event_type: str, fl_ctx: FLContext): + if event_type == EventType.GET_JOB_LAUNCHER: + job_meta = fl_ctx.get_prop(FLContextKey.JOB_META) + job_image = extract_job_image(job_meta, fl_ctx.get_identity_name()) + if not job_image: + job_launcher: list = fl_ctx.get_prop(FLContextKey.JOB_LAUNCHER, []) + job_launcher.append(self) + fl_ctx.set_prop(FLContextKey.JOB_LAUNCHER, job_launcher, private=True, sticky=False) + diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index ae8d68ea25..256470ed09 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -20,10 +20,12 @@ from kubernetes.client.api import core_v1_api from kubernetes.client.rest import ApiException +from nvflare.apis.event_type import EventType from nvflare.apis.fl_constant import FLContextKey, JobConstants from nvflare.apis.fl_context import FLContext from nvflare.apis.workspace import Workspace from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec +from nvflare.private.fed.utils.fed_utils import extract_job_image class JobState(Enum): @@ -267,9 +269,11 @@ def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: job_handle.terminate() return None - def can_launch(self, launch_data: dict, fl_ctx: FLContext) -> bool: - job_image = launch_data.get(JobConstants.JOB_IMAGE) - if job_image: - return True - else: - return False + def handle_event(self, event_type: str, fl_ctx: FLContext): + if event_type == EventType.GET_JOB_LAUNCHER: + job_meta = fl_ctx.get_prop(FLContextKey.JOB_META) + job_image = extract_job_image(job_meta, fl_ctx.get_identity_name()) + 
if job_image: + job_launcher: list = fl_ctx.get_prop(FLContextKey.JOB_LAUNCHER, []) + job_launcher.append(self) + fl_ctx.set_prop(FLContextKey.JOB_LAUNCHER, job_launcher, private=True, sticky=False) diff --git a/nvflare/lighter/impl/master_template.yml b/nvflare/lighter/impl/master_template.yml index 1fc697ddb6..e4c4e72fdd 100644 --- a/nvflare/lighter/impl/master_template.yml +++ b/nvflare/lighter/impl/master_template.yml @@ -95,7 +95,7 @@ local_client_resources: | }, { "id": "process_launcher", - "path": "nvflare.app_opt.job_launcher.process_launcher.ProcessJobLauncher", + "path": "nvflare.app_common.job_launcher.process_launcher.ProcessJobLauncher", "args": {} } ] diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 209ab0a1ec..625a9d253c 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -17,7 +17,8 @@ import time from abc import ABC, abstractmethod -from nvflare.apis.fl_constant import AdminCommandNames, JobConstants, RunProcessKey, SystemConfigs +from nvflare.apis.event_type import EventType +from nvflare.apis.fl_constant import AdminCommandNames, JobConstants, RunProcessKey, SystemConfigs, FLContextKey from nvflare.apis.fl_context import FLContext from nvflare.apis.resource_manager_spec import ResourceManagerSpec from nvflare.apis.job_launcher_spec import JobLauncherSpec @@ -164,12 +165,9 @@ def start_app( fl_ctx: FLContext """ - launch_data = self._get_job_launcher(job_id, client, job_meta) - job_launcher = launch_data.get(JobConstants.JOB_LAUNCHER) - if not job_launcher: - raise RuntimeError(f"There's no job launcher can handle this job: {launch_data}.") - job_handle = job_launcher.launch_job(launch_data, fl_ctx) - self.logger.info(f"Launch job data: {launch_data} with job launcher: {type(job_launcher)} ") + job_launcher: JobLauncherSpec = self._get_job_launcher(job_meta, fl_ctx) + job_handle = job_launcher.launch_job(job_meta, fl_ctx) 
+ self.logger.info(f"Launch job_id: {job_id} with job launcher: {type(job_launcher)} ") client.multi_gpu = False @@ -185,17 +183,16 @@ def start_app( ) thread.start() - def _get_job_launcher(self, job_id, client, job_meta: dict) -> dict: - launch_image = extract_job_image(job_meta, client.client_name) - launch_data = {JobConstants.JOB_IMAGE: launch_image, JobConstants.JOB_ID: job_id} - - job_launcher = None - for _, component in client.components.items(): - if isinstance(component, JobLauncherSpec): - if component.can_launch(launch_data, None): - job_launcher = component - launch_data[JobConstants.JOB_LAUNCHER] = job_launcher - return launch_data + def _get_job_launcher(self, job_meta: dict, fl_ctx: FLContext) -> JobLauncherSpec: + engine = fl_ctx.get_engine() + fl_ctx.set_prop(FLContextKey.JOB_META, job_meta, private=True, sticky=False) + engine.fire_event(EventType.GET_JOB_LAUNCHER, fl_ctx) + + job_launcher = fl_ctx.get_prop(FLContextKey.JOB_LAUNCHER) + if not (job_launcher or isinstance(job_launcher, list)): + raise RuntimeError(f"There's no job launcher can handle this job: {job_meta}.") + + return job_launcher[0] def notify_job_status(self, job_id, job_status): run_process = self.run_processes.get(job_id) From e6123b102d72deb817dc365212f197b61ecf7671 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 25 Oct 2024 13:37:26 -0400 Subject: [PATCH 26/31] updated K8sJobLauncher. 
--- nvflare/apis/job_launcher_spec.py | 4 ++-- nvflare/app_common/job_launcher/process_launcher.py | 4 ++-- nvflare/app_opt/job_launcher/k8s_launcher.py | 8 +++----- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/nvflare/apis/job_launcher_spec.py b/nvflare/apis/job_launcher_spec.py index 2958d0f30b..19058b1b25 100644 --- a/nvflare/apis/job_launcher_spec.py +++ b/nvflare/apis/job_launcher_spec.py @@ -48,11 +48,11 @@ def wait(self): class JobLauncherSpec(FLComponent): @abstractmethod - def launch_job(self, meta_data: dict, fl_ctx: FLContext) -> JobHandleSpec: + def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec: """To launch a job run. Args: - meta_data: job meta data + job_meta: job meta data fl_ctx: FLContext Returns: boolean to indicates the job launch success or fail. diff --git a/nvflare/app_common/job_launcher/process_launcher.py b/nvflare/app_common/job_launcher/process_launcher.py index 97f962b309..65bc1059ad 100644 --- a/nvflare/app_common/job_launcher/process_launcher.py +++ b/nvflare/app_common/job_launcher/process_launcher.py @@ -60,13 +60,13 @@ def __init__(self): self.logger = logging.getLogger(self.__class__.__name__) - def launch_job(self, meta_data: dict, fl_ctx: FLContext) -> JobHandleSpec: + def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec: new_env = os.environ.copy() workspace_obj: Workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT) args = fl_ctx.get_prop(FLContextKey.ARGS) client = fl_ctx.get_prop(FLContextKey.SITE_OBJ) - job_id = meta_data.get(JobMetaKey.JOB_ID) + job_id = job_meta.get(JobMetaKey.JOB_ID) server_config = fl_ctx.get_prop(FLContextKey.SERVER_CONFIG) if not server_config: raise RuntimeError(f"missing {FLContextKey.SERVER_CONFIG} in FL context") diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index 256470ed09..c6e82b2d3c 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ 
b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -193,7 +193,6 @@ def __init__( root_hostpath: str, workspace: str, mount_path: str, - supported_images: [str] = None, timeout=None, namespace="default", ): @@ -203,7 +202,6 @@ def __init__( self.workspace = workspace self.mount_path = mount_path self.timeout = timeout - self.supported_images = supported_images config.load_kube_config(config_file_path) try: @@ -218,12 +216,12 @@ def __init__( self.job_handle = None self.logger = logging.getLogger(self.__class__.__name__) - def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: + def launch_job(self, job_meta: dict, fl_ctx: FLContext) -> JobHandleSpec: workspace_obj: Workspace = fl_ctx.get_prop(FLContextKey.WORKSPACE_OBJECT) args = fl_ctx.get_prop(FLContextKey.ARGS) client = fl_ctx.get_prop(FLContextKey.SITE_OBJ) - job_id = launch_data.get(JobConstants.JOB_ID) + job_id = job_meta.get(JobConstants.JOB_ID) server_config = fl_ctx.get_prop(FLContextKey.SERVER_CONFIG) if not server_config: raise RuntimeError(f"missing {FLContextKey.SERVER_CONFIG} in FL context") @@ -232,7 +230,7 @@ def launch_job(self, launch_data: dict, fl_ctx: FLContext) -> JobHandleSpec: raise RuntimeError(f"expect server config data to be dict but got {type(service)}") self.logger.info(f"K8sJobLauncher start to launch job: {job_id} for client: {client.client_name}") - job_image = launch_data.get(JobConstants.JOB_IMAGE) + job_image = extract_job_image(job_meta, fl_ctx.get_identity_name()) self.logger.info(f"launch job use image: {job_image}") job_config = { "name": job_id, From 90f868785f00792f91314be92b1e7cfa0562708f Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 25 Oct 2024 15:21:24 -0400 Subject: [PATCH 27/31] codestyle fix. 
--- nvflare/app_common/job_launcher/process_launcher.py | 3 +-- nvflare/app_opt/job_launcher/k8s_launcher.py | 4 ++-- nvflare/private/fed/client/client_executor.py | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/nvflare/app_common/job_launcher/process_launcher.py b/nvflare/app_common/job_launcher/process_launcher.py index 65bc1059ad..95d98e63db 100644 --- a/nvflare/app_common/job_launcher/process_launcher.py +++ b/nvflare/app_common/job_launcher/process_launcher.py @@ -21,8 +21,8 @@ from nvflare.apis.fl_constant import FLContextKey, JobConstants from nvflare.apis.fl_context import FLContext from nvflare.apis.job_def import JobMetaKey -from nvflare.apis.workspace import Workspace from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec +from nvflare.apis.workspace import Workspace from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path, extract_job_image @@ -118,4 +118,3 @@ def handle_event(self, event_type: str, fl_ctx: FLContext): job_launcher: list = fl_ctx.get_prop(FLContextKey.JOB_LAUNCHER, []) job_launcher.append(self) fl_ctx.set_prop(FLContextKey.JOB_LAUNCHER, job_launcher, private=True, sticky=False) - diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index c6e82b2d3c..c9c5f6e37b 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -23,8 +23,8 @@ from nvflare.apis.event_type import EventType from nvflare.apis.fl_constant import FLContextKey, JobConstants from nvflare.apis.fl_context import FLContext -from nvflare.apis.workspace import Workspace from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec +from nvflare.apis.workspace import Workspace from nvflare.private.fed.utils.fed_utils import extract_job_image @@ -49,7 +49,7 @@ class JobState(Enum): JobState.STARTING: None, JobState.RUNNING: None, JobState.TERMINATED: 1, - JobState.UNKNOWN: None + JobState.UNKNOWN: None, } 
diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 625a9d253c..44be0356a5 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -18,10 +18,10 @@ from abc import ABC, abstractmethod from nvflare.apis.event_type import EventType -from nvflare.apis.fl_constant import AdminCommandNames, JobConstants, RunProcessKey, SystemConfigs, FLContextKey +from nvflare.apis.fl_constant import AdminCommandNames, FLContextKey, JobConstants, RunProcessKey, SystemConfigs from nvflare.apis.fl_context import FLContext -from nvflare.apis.resource_manager_spec import ResourceManagerSpec from nvflare.apis.job_launcher_spec import JobLauncherSpec +from nvflare.apis.resource_manager_spec import ResourceManagerSpec from nvflare.fuel.common.exit_codes import PROCESS_EXIT_REASON, ProcessExitCode from nvflare.fuel.f3.cellnet.core_cell import FQCN from nvflare.fuel.f3.cellnet.defs import MessageHeaderKey, ReturnCode From 1aa8d39763d96fdab70b7b283454e36356d7339a Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 25 Oct 2024 15:27:34 -0400 Subject: [PATCH 28/31] removed no use import. 
--- nvflare/app_common/job_launcher/process_launcher.py | 2 +- nvflare/private/fed/client/client_executor.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/nvflare/app_common/job_launcher/process_launcher.py b/nvflare/app_common/job_launcher/process_launcher.py index 95d98e63db..f974aac0e1 100644 --- a/nvflare/app_common/job_launcher/process_launcher.py +++ b/nvflare/app_common/job_launcher/process_launcher.py @@ -18,7 +18,7 @@ import sys from nvflare.apis.event_type import EventType -from nvflare.apis.fl_constant import FLContextKey, JobConstants +from nvflare.apis.fl_constant import FLContextKey from nvflare.apis.fl_context import FLContext from nvflare.apis.job_def import JobMetaKey from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 44be0356a5..6068e4d6e5 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -18,7 +18,7 @@ from abc import ABC, abstractmethod from nvflare.apis.event_type import EventType -from nvflare.apis.fl_constant import AdminCommandNames, FLContextKey, JobConstants, RunProcessKey, SystemConfigs +from nvflare.apis.fl_constant import AdminCommandNames, FLContextKey, RunProcessKey, SystemConfigs from nvflare.apis.fl_context import FLContext from nvflare.apis.job_launcher_spec import JobLauncherSpec from nvflare.apis.resource_manager_spec import ResourceManagerSpec @@ -27,7 +27,7 @@ from nvflare.fuel.f3.cellnet.defs import MessageHeaderKey, ReturnCode from nvflare.fuel.utils.config_service import ConfigService from nvflare.private.defs import CellChannel, CellChannelTopic, JobFailureMsgKey, new_cell_message -from nvflare.private.fed.utils.fed_utils import extract_job_image, get_return_code +from nvflare.private.fed.utils.fed_utils import get_return_code from nvflare.security.logging import secure_format_exception, 
secure_log_traceback from .client_status import ClientStatus, get_status_message From 8153dd3db0d04a4a453ce822a53bb6afba0488ba Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Mon, 28 Oct 2024 12:11:46 -0400 Subject: [PATCH 29/31] JobReturnCode standard. --- nvflare/apis/job_launcher_spec.py | 15 ++++++++++++++ .../job_launcher/process_launcher.py | 12 +++++------ nvflare/app_opt/job_launcher/k8s_launcher.py | 20 +++++++++---------- nvflare/private/fed/client/client_executor.py | 8 ++++---- 4 files changed, 34 insertions(+), 21 deletions(-) diff --git a/nvflare/apis/job_launcher_spec.py b/nvflare/apis/job_launcher_spec.py index 19058b1b25..fcc0e04c94 100644 --- a/nvflare/apis/job_launcher_spec.py +++ b/nvflare/apis/job_launcher_spec.py @@ -14,7 +14,22 @@ from abc import abstractmethod from nvflare.apis.fl_component import FLComponent +from nvflare.apis.fl_constant import FLContextKey from nvflare.apis.fl_context import FLContext +from nvflare.fuel.common.exit_codes import ProcessExitCode + + +class JobReturnCode(ProcessExitCode): + SUCCESS = 0 + EXECUTION_ERROR = 1 + ABORTED = 9 + UNKNOWN = 127 + + +def add_launcher(launcher, fl_ctx: FLContext): + job_launcher: list = fl_ctx.get_prop(FLContextKey.JOB_LAUNCHER, []) + job_launcher.append(launcher) + fl_ctx.set_prop(FLContextKey.JOB_LAUNCHER, job_launcher, private=True, sticky=False) class JobHandleSpec: diff --git a/nvflare/app_common/job_launcher/process_launcher.py b/nvflare/app_common/job_launcher/process_launcher.py index f974aac0e1..912893feff 100644 --- a/nvflare/app_common/job_launcher/process_launcher.py +++ b/nvflare/app_common/job_launcher/process_launcher.py @@ -21,10 +21,12 @@ from nvflare.apis.fl_constant import FLContextKey from nvflare.apis.fl_context import FLContext from nvflare.apis.job_def import JobMetaKey -from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec +from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec, JobReturnCode, add_launcher from 
nvflare.apis.workspace import Workspace from nvflare.private.fed.utils.fed_utils import add_custom_dir_to_path, extract_job_image +JOB_RETURN_CODE_MAPPING = {0: JobReturnCode.SUCCESS, 1: JobReturnCode.EXECUTION_ERROR, 9: JobReturnCode.ABORTED} + class ProcessHandle(JobHandleSpec): def __init__(self, process): @@ -45,9 +47,9 @@ def terminate(self): def poll(self): if self.process: - return self.process.poll() + return JOB_RETURN_CODE_MAPPING.get(self.process.poll(), JobReturnCode.EXECUTION_ERROR) else: - return None + return JobReturnCode.UNKNOWN def wait(self): if self.process: @@ -115,6 +117,4 @@ def handle_event(self, event_type: str, fl_ctx: FLContext): job_meta = fl_ctx.get_prop(FLContextKey.JOB_META) job_image = extract_job_image(job_meta, fl_ctx.get_identity_name()) if not job_image: - job_launcher: list = fl_ctx.get_prop(FLContextKey.JOB_LAUNCHER, []) - job_launcher.append(self) - fl_ctx.set_prop(FLContextKey.JOB_LAUNCHER, job_launcher, private=True, sticky=False) + add_launcher(self, fl_ctx) diff --git a/nvflare/app_opt/job_launcher/k8s_launcher.py b/nvflare/app_opt/job_launcher/k8s_launcher.py index c9c5f6e37b..bba0df01e0 100644 --- a/nvflare/app_opt/job_launcher/k8s_launcher.py +++ b/nvflare/app_opt/job_launcher/k8s_launcher.py @@ -23,7 +23,7 @@ from nvflare.apis.event_type import EventType from nvflare.apis.fl_constant import FLContextKey, JobConstants from nvflare.apis.fl_context import FLContext -from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec +from nvflare.apis.job_launcher_spec import JobHandleSpec, JobLauncherSpec, JobReturnCode, add_launcher from nvflare.apis.workspace import Workspace from nvflare.private.fed.utils.fed_utils import extract_job_image @@ -44,12 +44,12 @@ class JobState(Enum): "Unknown": JobState.UNKNOWN, } -RETURN_CODES = { - JobState.SUCCEEDED: 0, - JobState.STARTING: None, - JobState.RUNNING: None, - JobState.TERMINATED: 1, - JobState.UNKNOWN: None, +JOB_RETURN_CODE_MAPPING = { + JobState.SUCCEEDED: 
JobReturnCode.SUCCESS, + JobState.STARTING: JobReturnCode.UNKNOWN, + JobState.RUNNING: JobReturnCode.UNKNOWN, + JobState.TERMINATED: JobReturnCode.ABORTED, + JobState.UNKNOWN: JobReturnCode.UNKNOWN, } @@ -173,7 +173,7 @@ def terminate(self): def poll(self): job_state = self._query_state() - return RETURN_CODES.get(job_state) + return JOB_RETURN_CODE_MAPPING.get(job_state, JobReturnCode.UNKNOWN) def _query_state(self): try: @@ -272,6 +272,4 @@ def handle_event(self, event_type: str, fl_ctx: FLContext): job_meta = fl_ctx.get_prop(FLContextKey.JOB_META) job_image = extract_job_image(job_meta, fl_ctx.get_identity_name()) if job_image: - job_launcher: list = fl_ctx.get_prop(FLContextKey.JOB_LAUNCHER, []) - job_launcher.append(self) - fl_ctx.set_prop(FLContextKey.JOB_LAUNCHER, job_launcher, private=True, sticky=False) + add_launcher(self, fl_ctx) diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 6068e4d6e5..222431428a 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -189,7 +189,7 @@ def _get_job_launcher(self, job_meta: dict, fl_ctx: FLContext) -> JobLauncherSpe engine.fire_event(EventType.GET_JOB_LAUNCHER, fl_ctx) job_launcher = fl_ctx.get_prop(FLContextKey.JOB_LAUNCHER) - if not (job_launcher or isinstance(job_launcher, list)): + if not isinstance(job_launcher, list): raise RuntimeError(f"There's no job launcher can handle this job: {job_meta}.") return job_launcher[0] @@ -327,7 +327,7 @@ def abort_app(self, job_id): optional=True, ) self.logger.debug("abort sent to worker") - t = threading.Thread(target=self._terminate_process, args=[job_handle, job_id]) + t = threading.Thread(target=self._terminate_job, args=[job_handle, job_id]) t.start() t.join() break @@ -345,7 +345,7 @@ def abort_app(self, job_id): self.logger.info("Client worker process is terminated.") - def _terminate_process(self, child_process, job_id): + def _terminate_job(self, 
job_handle, job_id): max_wait = 10.0 done = False start = time.time() @@ -362,7 +362,7 @@ def _terminate_process(self, child_process, job_id): time.sleep(0.05) # we want to quickly check - child_process.terminate() + job_handle.terminate() self.logger.info(f"run ({job_id}): child worker process terminated") def abort_task(self, job_id): From baecfff46005ca51e4ab9423780d2c397f5e1a25 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Tue, 29 Oct 2024 12:56:48 -0400 Subject: [PATCH 30/31] fixed the _get_job_launcher() condition logic. --- nvflare/private/fed/client/client_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nvflare/private/fed/client/client_executor.py b/nvflare/private/fed/client/client_executor.py index 222431428a..6a8a111239 100644 --- a/nvflare/private/fed/client/client_executor.py +++ b/nvflare/private/fed/client/client_executor.py @@ -189,7 +189,7 @@ def _get_job_launcher(self, job_meta: dict, fl_ctx: FLContext) -> JobLauncherSpe engine.fire_event(EventType.GET_JOB_LAUNCHER, fl_ctx) job_launcher = fl_ctx.get_prop(FLContextKey.JOB_LAUNCHER) - if not isinstance(job_launcher, list): + if not (job_launcher and isinstance(job_launcher, list)): raise RuntimeError(f"There's no job launcher can handle this job: {job_meta}.") return job_launcher[0] From b51ef76d595357447013f86c00390d400c6d3c91 Mon Sep 17 00:00:00 2001 From: Yuhong Wen Date: Fri, 1 Nov 2024 14:01:27 -0400 Subject: [PATCH 31/31] fixed the missing client_name in the workspace object. 
--- nvflare/private/fed/app/client/client_train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nvflare/private/fed/app/client/client_train.py b/nvflare/private/fed/app/client/client_train.py index 1b8f4b587f..7618622ad8 100644 --- a/nvflare/private/fed/app/client/client_train.py +++ b/nvflare/private/fed/app/client/client_train.py @@ -100,6 +100,7 @@ def main(args): federated_client.use_gpu = False federated_client.config_folder = config_folder + workspace = Workspace(args.workspace, federated_client.client_name, config_folder) client_engine = ClientEngine(federated_client, args, rank)