diff --git a/dlrover/python/common/constants.py b/dlrover/python/common/constants.py index d5babb9e9..7977cb62a 100644 --- a/dlrover/python/common/constants.py +++ b/dlrover/python/common/constants.py @@ -387,6 +387,12 @@ class JobConstant(object): TRAINING_AGENT_LOOP_DEFAULT_INTERVAL = 15 + # sleep 5s before next rendezvous round + RENDEZVOUS_DEFAULT_INTERVAL = 5 + + # sleep 5s before next port synchronization + SYNC_PORTS_DEFAULT_INTERVAL = 5 + class Accelerators(object): NVIDIA_GPU = "nvidia.com/gpu" diff --git a/dlrover/python/elastic_agent/torch/training.py b/dlrover/python/elastic_agent/torch/training.py index 10743e0dd..f22aa7521 100644 --- a/dlrover/python/elastic_agent/torch/training.py +++ b/dlrover/python/elastic_agent/torch/training.py @@ -354,9 +354,7 @@ def next_rendezvous(self): "and waits for more nodes." ) start_pending = time.time() - time.sleep( - JobConstant.TRAINING_AGENT_LOOP_DEFAULT_INTERVAL - ) + time.sleep(JobConstant.RENDEZVOUS_DEFAULT_INTERVAL) start_join = time.time() if start_join - start_pending > self.pend_timeout: raise TimeoutError( @@ -373,7 +371,7 @@ def next_rendezvous(self): err_msg, level=TrainingExceptionLevel.RDZV_ERROR ) raise TimeoutError(err_msg) - time.sleep(JobConstant.TRAINING_AGENT_LOOP_DEFAULT_INTERVAL) + time.sleep(JobConstant.RENDEZVOUS_DEFAULT_INTERVAL) rank = list(world.keys()).index(self._node_rank) world_size = len(world) logger.info( @@ -1185,7 +1183,9 @@ def stop_executor(self): """Shutdown the executor to save the checkpoint.""" self._save_ckpt_executor.shutdown(wait=False) - def sync_training_ports(self, interval=20): + def sync_training_ports( + self, interval=JobConstant.SYNC_PORTS_DEFAULT_INTERVAL + ): logger.info(f"Accelerator: {self._config.accelerator}") if ( self._config.accelerator == Accelerators.ASCEND_NPU