more logging (#1309)
BalaBalaYi authored Oct 25, 2024
1 parent 53e10bd commit ae9d476
Showing 3 changed files with 18 additions and 1 deletion.
5 changes: 5 additions & 0 deletions dlrover/python/elastic_agent/torch/training.py
@@ -194,6 +194,11 @@ def auto_configure_params(self):
         device = torch.cuda.get_device_name()
         if "Ascend" in device:
             self.accelerator = Accelerators.ASCEND_NPU
+        logger.info(
+            f"Use {self.accelerator} device for training, "
+            f"cuda is available: {torch.cuda.is_available()}."
+        )
+
         if not self.auto_config:
             return
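For context, this hunk only adds a log line recording which accelerator was detected. Below is a minimal standalone sketch of the same detection-and-logging pattern, assuming a plain Python logger and illustrative stand-ins for dlrover's Accelerators constants (on Ascend hosts the torch_npu plugin surfaces NPUs through the torch.cuda API, which is why the device name is inspected):

import logging

import torch

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("accelerator_check")

# Illustrative stand-ins for dlrover's Accelerators enum values.
NVIDIA_GPU = "nvidia.com/gpu"
ASCEND_NPU = "ascend-npu"

accelerator = NVIDIA_GPU
if torch.cuda.is_available() and "Ascend" in torch.cuda.get_device_name():
    # torch_npu maps Ascend NPUs onto the CUDA API, so the device name
    # is the simplest way to tell NPU and GPU apart.
    accelerator = ASCEND_NPU
logger.info(
    f"Use {accelerator} device for training, "
    f"cuda is available: {torch.cuda.is_available()}."
)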
5 changes: 5 additions & 0 deletions dlrover/trainer/torch/node_check/ascend_npu.py
@@ -42,6 +42,11 @@ def main():
         device = torch.cuda.get_device_name()
         if "Ascend" in device:
             protocol = "hccl"
+    else:
+        logger.warning(
+            f"Use GLOO as comm protocol for use_cuda: {use_cuda} "
+            "or 'Ascend' not in `torch.cuda.get_device_name()`."
+        )
 
     init_process_group(protocol, timeout=get_network_check_timeout())
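The ascend_npu change only adds a warning on the fall-through path; protocol selection itself is unchanged (hccl when an Ascend device is visible, gloo otherwise). A self-contained sketch of that flow, with a few assumptions: a fixed timedelta stands in for get_network_check_timeout(), the single-process rendezvous values are hard-coded for illustration, and the hccl backend is only registered when the torch_npu stack is installed:

import logging
import os
from datetime import timedelta

import torch
import torch.distributed as dist

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("ascend_npu_check")

# Single-process rendezvous so the sketch runs standalone.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")

use_cuda = torch.cuda.is_available()
protocol = "gloo"
if use_cuda:
    if "Ascend" in torch.cuda.get_device_name():
        protocol = "hccl"  # provided by the torch_npu/HCCL stack
else:
    logger.warning(
        f"Use GLOO as comm protocol for use_cuda: {use_cuda} "
        "or 'Ascend' not in `torch.cuda.get_device_name()`."
    )

dist.init_process_group(protocol, timeout=timedelta(minutes=3))
dist.destroy_process_group()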
9 changes: 8 additions & 1 deletion dlrover/trainer/torch/node_check/nvidia_gpu.py
@@ -16,6 +16,8 @@
 import torch
 import torch.distributed as dist
 
+from dlrover.python.common.log import default_logger as logger
+
 from .utils import (
     bm_allreduce,
     get_network_check_timeout,
@@ -37,7 +39,12 @@ def set_nccl_env():
 @record_execution_time
 def main():
     use_cuda = torch.cuda.is_available()
-    protocol = "nccl" if use_cuda else "gloo"
+
+    if use_cuda:
+        protocol = "nccl"
+    else:
+        logger.warning("Use GLOO as comm protocol for cuda is not available.")
+        protocol = "gloo"
 
     init_process_group(protocol, timeout=get_network_check_timeout())
     if use_cuda:
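Here the one-line conditional becomes an explicit if/else purely so the gloo fallback leaves a trace in the logs. A runnable sketch of the resulting check, where a toy all_reduce stands in for dlrover's bm_allreduce and the rendezvous settings are, again, illustrative assumptions:

import logging
import os
from datetime import timedelta

import torch
import torch.distributed as dist

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("nvidia_gpu_check")

# Single-process rendezvous so the sketch runs standalone.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
os.environ.setdefault("RANK", "0")
os.environ.setdefault("WORLD_SIZE", "1")

use_cuda = torch.cuda.is_available()
if use_cuda:
    protocol = "nccl"
else:
    logger.warning("Use GLOO as comm protocol for cuda is not available.")
    protocol = "gloo"

dist.init_process_group(protocol, timeout=timedelta(minutes=3))
tensor = torch.ones(1, device="cuda" if use_cuda else "cpu")
dist.all_reduce(tensor)  # connectivity smoke test in place of bm_allreduce
dist.destroy_process_group()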
