Skip to content

Commit cadc50e

Browse files
ppwwyyxx, kwen2501
authored and committed
LOG(INFO) -> VLOG(2) in ProcessGroupNCCL (pytorch#130696)
In the same spirit as pytorch#105695. Initialization and error handling logs are mostly kept. Routine logs are changed to VLOG. Pull Request resolved: pytorch#130696. Approved by: https://github.com/kwen2501. Co-authored-by: Ke Wen <[email protected]>
1 parent ed30fa7 commit cadc50e

File tree

1 file changed

+18
-17
lines changed

1 file changed

+18
-17
lines changed

torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp

Lines changed: 18 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -1243,8 +1243,8 @@ void ProcessGroupNCCL::waitForFutureOrTimeout(
12431243
try {
12441244
bool result = fut.get();
12451245
if (result) {
1246-
LOG(INFO) << logPrefix()
1247-
<< "future is successfully executed for: " << futDescription;
1246+
VLOG(2) << logPrefix()
1247+
<< "future is successfully executed for: " << futDescription;
12481248
if (log) {
12491249
data.strings["status"] = "SUCCESS";
12501250
}
@@ -1311,8 +1311,9 @@ void ProcessGroupNCCL::abortCommsFromMap(
13111311
// TODO: fix `getIndexFromDeviceKey` or fix `DeviceKey`
13121312
gpuGuard.set_index(deviceIndex);
13131313
}
1314-
LOG(INFO) << logPrefix() << "ProcessGroupNCCL destroying ncclComm_ "
1315-
<< ncclComm->repr() << " on CUDA device: " << devName;
1314+
1315+
VLOG(2) << logPrefix() << "ProcessGroupNCCL destroying ncclComm_ "
1316+
<< ncclComm->repr() << " on CUDA device: " << devName;
13161317
ncclComm->ncclCommAbort(abortReason);
13171318
// Note that we don't remove the aborted communicators from the
13181319
// cache. The reason is that if we do remove the communicator
@@ -1324,8 +1325,8 @@ void ProcessGroupNCCL::abortCommsFromMap(
13241325
// their responsibility to destroy the process group and recreate
13251326
// it to recover from errors.
13261327

1327-
LOG(INFO) << logPrefix() << "ProcessGroupNCCL destroyed "
1328-
<< " communicator on CUDA device: " << devName;
1328+
VLOG(2) << logPrefix() << "ProcessGroupNCCL destroyed "
1329+
<< " communicator on CUDA device: " << devName;
13291330
}
13301331
}
13311332

@@ -1391,7 +1392,7 @@ void ProcessGroupNCCL::shutdown() {
13911392

13921393
// NOLINTNEXTLINE(bugprone-exception-escape)
13931394
ProcessGroupNCCL::~ProcessGroupNCCL() {
1394-
LOG(INFO) << logPrefix() << "ProcessGroupNCCL destructor entered.";
1395+
VLOG(2) << logPrefix() << "ProcessGroupNCCL destructor entered.";
13951396

13961397
if (!terminateProcessGroup_.load()) {
13971398
if (rank_ % localDeviceCount_ == 0) {
@@ -1413,19 +1414,19 @@ ProcessGroupNCCL::~ProcessGroupNCCL() {
14131414
if (!blockingWait_) {
14141415
if (ncclCommWatchdogThread_.joinable()) {
14151416
ncclCommWatchdogThread_.join();
1416-
LOG(INFO) << logPrefix() << "ProcessGroupNCCL watchdog thread joined.";
1417+
VLOG(2) << logPrefix() << "ProcessGroupNCCL watchdog thread joined.";
14171418
}
14181419
if (ncclHeartbeatMonitorThread_.joinable()) {
14191420
ncclHeartbeatMonitorThread_.join();
1420-
LOG(INFO) << logPrefix()
1421-
<< "ProcessGroupNCCL heart beat monitor thread joined.";
1421+
VLOG(2) << logPrefix()
1422+
<< "ProcessGroupNCCL heart beat monitor thread joined.";
14221423
}
14231424
}
14241425
#endif
14251426
if (onCompletionHookThread_.joinable()) {
14261427
onCompletionHookThread_.join();
1427-
LOG(INFO) << logPrefix()
1428-
<< "ProcessGroupNCCL onCompletionHookThread thread joined.";
1428+
VLOG(2) << logPrefix()
1429+
<< "ProcessGroupNCCL onCompletionHookThread thread joined.";
14291430
}
14301431
}
14311432

@@ -1673,7 +1674,7 @@ void ProcessGroupNCCL::heartbeatMonitor() {
16731674
<< "Could not acquire GIL within 300 ms on exit, possible GIL induced hang";
16741675
}
16751676
} else {
1676-
LOG(INFO)
1677+
VLOG(2)
16771678
<< logPrefix()
16781679
<< "GIL checker was not registered, perhaps this is a no-python build?";
16791680
}
@@ -1748,7 +1749,7 @@ void ProcessGroupNCCL::ncclCommWatchdog() {
17481749
} catch (std::exception& e) {
17491750
if (std::string(e.what()).find("driver shutting down") !=
17501751
std::string::npos) {
1751-
LOG(INFO)
1752+
VLOG(2)
17521753
<< logPrefix()
17531754
<< "main process destroyed cuda before watchdog loop exited, terminating watchdog."
17541755
<< " (Watchdog caught exception: " << e.what();
@@ -2481,9 +2482,9 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
24812482
globalRankStride, // globalRankStride
24822483
size_); // worldSize
24832484

2484-
LOG(INFO) << logPrefix() << "ProcessGroupNCCL created ncclComm_ "
2485-
<< ncclComm->repr()
2486-
<< " on CUDA device: " << static_cast<int>(deviceIndex);
2485+
VLOG(2) << logPrefix() << "ProcessGroupNCCL created ncclComm_ "
2486+
<< ncclComm->repr()
2487+
<< " on CUDA device: " << static_cast<int>(deviceIndex);
24872488

24882489
// At this point NCCL should have been initialized, hence we can accurately
24892490
// get the env value even if NCCL sets it by reading from nccl.conf file

0 commit comments

Comments
 (0)