From 1f666c43bb20bdd5dd9cdccaba6acff396492eb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=AE=8B=E5=AE=B6=E4=BA=AE=5FJialiang?= Date: Wed, 6 Nov 2024 21:47:13 +0800 Subject: [PATCH] some refinement on code comment --- dlrover/python/elastic_agent/torch/training.py | 2 +- dlrover/trainer/torch/elastic_run.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dlrover/python/elastic_agent/torch/training.py b/dlrover/python/elastic_agent/torch/training.py index 944588977..656e0166a 100644 --- a/dlrover/python/elastic_agent/torch/training.py +++ b/dlrover/python/elastic_agent/torch/training.py @@ -290,7 +290,7 @@ def _join_rendezvous(self): def next_rendezvous(self): """The handler will periodically query the world from the master until the world is not empty. The world is a dictionary like - like {0: 8, 1: 8, 2: 8} where the key is the node ID and the value is + {0: 8, 1: 8, 2: 8} where the key is the node ID and the value is the local world size. The handler can get its rank by the position of it node ID in the world. """ diff --git a/dlrover/trainer/torch/elastic_run.py b/dlrover/trainer/torch/elastic_run.py index 12ac3cc2e..7761d104f 100644 --- a/dlrover/trainer/torch/elastic_run.py +++ b/dlrover/trainer/torch/elastic_run.py @@ -36,7 +36,7 @@ auto-config will set the nnodes as the number of nodes in a job, nproc_per_node as the number of available GPUs. If the number of nodes >= 4, it will set the network-check as True. If network-check is True, -dlrover-run will launch simple tasks on each node to check wether +dlrover-run will launch simple tasks on each node to check whether the node is slow or fault. Single-node multi-worker