@@ -1243,8 +1243,8 @@ void ProcessGroupNCCL::waitForFutureOrTimeout(
1243
1243
try {
1244
1244
bool result = fut.get ();
1245
1245
if (result) {
1246
- LOG (INFO ) << logPrefix ()
1247
- << " future is successfully executed for: " << futDescription;
1246
+ VLOG ( 2 ) << logPrefix ()
1247
+ << " future is successfully executed for: " << futDescription;
1248
1248
if (log) {
1249
1249
data.strings [" status" ] = " SUCCESS" ;
1250
1250
}
@@ -1311,8 +1311,9 @@ void ProcessGroupNCCL::abortCommsFromMap(
1311
1311
// TODO: fix `getIndexFromDeviceKey` or fix `DeviceKey`
1312
1312
gpuGuard.set_index (deviceIndex);
1313
1313
}
1314
- LOG (INFO) << logPrefix () << " ProcessGroupNCCL destroying ncclComm_ "
1315
- << ncclComm->repr () << " on CUDA device: " << devName;
1314
+
1315
+ VLOG (2 ) << logPrefix () << " ProcessGroupNCCL destroying ncclComm_ "
1316
+ << ncclComm->repr () << " on CUDA device: " << devName;
1316
1317
ncclComm->ncclCommAbort (abortReason);
1317
1318
// Note that we don't remove the aborted communicators from the
1318
1319
// cache. The reason is that if we do remove the communicator
@@ -1324,8 +1325,8 @@ void ProcessGroupNCCL::abortCommsFromMap(
1324
1325
// their responsibility to destroy the process group and recreate
1325
1326
// it to recover from errors.
1326
1327
1327
- LOG (INFO ) << logPrefix () << " ProcessGroupNCCL destroyed "
1328
- << " communicator on CUDA device: " << devName;
1328
+ VLOG ( 2 ) << logPrefix () << " ProcessGroupNCCL destroyed "
1329
+ << " communicator on CUDA device: " << devName;
1329
1330
}
1330
1331
}
1331
1332
@@ -1391,7 +1392,7 @@ void ProcessGroupNCCL::shutdown() {
1391
1392
1392
1393
// NOLINTNEXTLINE(bugprone-exception-escape)
1393
1394
ProcessGroupNCCL::~ProcessGroupNCCL () {
1394
- LOG (INFO ) << logPrefix () << " ProcessGroupNCCL destructor entered." ;
1395
+ VLOG ( 2 ) << logPrefix () << " ProcessGroupNCCL destructor entered." ;
1395
1396
1396
1397
if (!terminateProcessGroup_.load ()) {
1397
1398
if (rank_ % localDeviceCount_ == 0 ) {
@@ -1413,19 +1414,19 @@ ProcessGroupNCCL::~ProcessGroupNCCL() {
1413
1414
if (!blockingWait_) {
1414
1415
if (ncclCommWatchdogThread_.joinable ()) {
1415
1416
ncclCommWatchdogThread_.join ();
1416
- LOG (INFO ) << logPrefix () << " ProcessGroupNCCL watchdog thread joined." ;
1417
+ VLOG ( 2 ) << logPrefix () << " ProcessGroupNCCL watchdog thread joined." ;
1417
1418
}
1418
1419
if (ncclHeartbeatMonitorThread_.joinable ()) {
1419
1420
ncclHeartbeatMonitorThread_.join ();
1420
- LOG (INFO ) << logPrefix ()
1421
- << " ProcessGroupNCCL heart beat monitor thread joined." ;
1421
+ VLOG ( 2 ) << logPrefix ()
1422
+ << " ProcessGroupNCCL heart beat monitor thread joined." ;
1422
1423
}
1423
1424
}
1424
1425
#endif
1425
1426
if (onCompletionHookThread_.joinable ()) {
1426
1427
onCompletionHookThread_.join ();
1427
- LOG (INFO ) << logPrefix ()
1428
- << " ProcessGroupNCCL onCompletionHookThread thread joined." ;
1428
+ VLOG ( 2 ) << logPrefix ()
1429
+ << " ProcessGroupNCCL onCompletionHookThread thread joined." ;
1429
1430
}
1430
1431
}
1431
1432
@@ -1673,7 +1674,7 @@ void ProcessGroupNCCL::heartbeatMonitor() {
1673
1674
<< " Could not acquire GIL within 300 ms on exit, possible GIL induced hang" ;
1674
1675
}
1675
1676
} else {
1676
- LOG (INFO )
1677
+ VLOG ( 2 )
1677
1678
<< logPrefix ()
1678
1679
<< " GIL checker was not registered, perhaps this is a no-python build?" ;
1679
1680
}
@@ -1748,7 +1749,7 @@ void ProcessGroupNCCL::ncclCommWatchdog() {
1748
1749
} catch (std::exception& e) {
1749
1750
if (std::string (e.what ()).find (" driver shutting down" ) !=
1750
1751
std::string::npos) {
1751
- LOG (INFO )
1752
+ VLOG ( 2 )
1752
1753
<< logPrefix ()
1753
1754
<< " main process destroyed cuda before watchdog loop exited, terminating watchdog."
1754
1755
<< " (Watchdog caught exception: " << e.what ();
@@ -2481,9 +2482,9 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
2481
2482
globalRankStride, // globalRankStride
2482
2483
size_); // worldSize
2483
2484
2484
- LOG (INFO ) << logPrefix () << " ProcessGroupNCCL created ncclComm_ "
2485
- << ncclComm->repr ()
2486
- << " on CUDA device: " << static_cast <int >(deviceIndex);
2485
+ VLOG ( 2 ) << logPrefix () << " ProcessGroupNCCL created ncclComm_ "
2486
+ << ncclComm->repr ()
2487
+ << " on CUDA device: " << static_cast <int >(deviceIndex);
2487
2488
2488
2489
// At this point NCCL should have been initialized, hence we can accurately
2489
2490
// get the env value even if NCCL sets it by reading from nccl.conf file
0 commit comments