Skip to content

Commit

Permalink
LOG(INFO) -> VLOG(2) in ProcessGroupNCCL (pytorch#130696)
Browse files Browse the repository at this point in the history
In the same spirit as pytorch#105695

Initialization and error-handling logs are mostly kept as they are; routine logs are changed from LOG(INFO) to VLOG(2).

Pull Request resolved: pytorch#130696
Approved by: https://github.com/kwen2501

Co-authored-by: Ke Wen <[email protected]>
  • Loading branch information
2 people authored and pytorchmergebot committed Nov 4, 2024
1 parent ed30fa7 commit cadc50e
Showing 1 changed file with 18 additions and 17 deletions.
35 changes: 18 additions & 17 deletions torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1243,8 +1243,8 @@ void ProcessGroupNCCL::waitForFutureOrTimeout(
try {
bool result = fut.get();
if (result) {
LOG(INFO) << logPrefix()
<< "future is successfully executed for: " << futDescription;
VLOG(2) << logPrefix()
<< "future is successfully executed for: " << futDescription;
if (log) {
data.strings["status"] = "SUCCESS";
}
Expand Down Expand Up @@ -1311,8 +1311,9 @@ void ProcessGroupNCCL::abortCommsFromMap(
// TODO: fix `getIndexFromDeviceKey` or fix `DeviceKey`
gpuGuard.set_index(deviceIndex);
}
LOG(INFO) << logPrefix() << "ProcessGroupNCCL destroying ncclComm_ "
<< ncclComm->repr() << " on CUDA device: " << devName;

VLOG(2) << logPrefix() << "ProcessGroupNCCL destroying ncclComm_ "
<< ncclComm->repr() << " on CUDA device: " << devName;
ncclComm->ncclCommAbort(abortReason);
// Note that we don't remove the aborted communicators from the
// cache. The reason is that if we do remove the communicator
Expand All @@ -1324,8 +1325,8 @@ void ProcessGroupNCCL::abortCommsFromMap(
// their responsibility to destroy the process group and recreate
// it to recover from errors.

LOG(INFO) << logPrefix() << "ProcessGroupNCCL destroyed "
<< " communicator on CUDA device: " << devName;
VLOG(2) << logPrefix() << "ProcessGroupNCCL destroyed "
<< " communicator on CUDA device: " << devName;
}
}

Expand Down Expand Up @@ -1391,7 +1392,7 @@ void ProcessGroupNCCL::shutdown() {

// NOLINTNEXTLINE(bugprone-exception-escape)
ProcessGroupNCCL::~ProcessGroupNCCL() {
LOG(INFO) << logPrefix() << "ProcessGroupNCCL destructor entered.";
VLOG(2) << logPrefix() << "ProcessGroupNCCL destructor entered.";

if (!terminateProcessGroup_.load()) {
if (rank_ % localDeviceCount_ == 0) {
Expand All @@ -1413,19 +1414,19 @@ ProcessGroupNCCL::~ProcessGroupNCCL() {
if (!blockingWait_) {
if (ncclCommWatchdogThread_.joinable()) {
ncclCommWatchdogThread_.join();
LOG(INFO) << logPrefix() << "ProcessGroupNCCL watchdog thread joined.";
VLOG(2) << logPrefix() << "ProcessGroupNCCL watchdog thread joined.";
}
if (ncclHeartbeatMonitorThread_.joinable()) {
ncclHeartbeatMonitorThread_.join();
LOG(INFO) << logPrefix()
<< "ProcessGroupNCCL heart beat monitor thread joined.";
VLOG(2) << logPrefix()
<< "ProcessGroupNCCL heart beat monitor thread joined.";
}
}
#endif
if (onCompletionHookThread_.joinable()) {
onCompletionHookThread_.join();
LOG(INFO) << logPrefix()
<< "ProcessGroupNCCL onCompletionHookThread thread joined.";
VLOG(2) << logPrefix()
<< "ProcessGroupNCCL onCompletionHookThread thread joined.";
}
}

Expand Down Expand Up @@ -1673,7 +1674,7 @@ void ProcessGroupNCCL::heartbeatMonitor() {
<< "Could not acquire GIL within 300 ms on exit, possible GIL induced hang";
}
} else {
LOG(INFO)
VLOG(2)
<< logPrefix()
<< "GIL checker was not registered, perhaps this is a no-python build?";
}
Expand Down Expand Up @@ -1748,7 +1749,7 @@ void ProcessGroupNCCL::ncclCommWatchdog() {
} catch (std::exception& e) {
if (std::string(e.what()).find("driver shutting down") !=
std::string::npos) {
LOG(INFO)
VLOG(2)
<< logPrefix()
<< "main process destroyed cuda before watchdog loop exited, terminating watchdog."
<< " (Watchdog caught exception: " << e.what();
Expand Down Expand Up @@ -2481,9 +2482,9 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
globalRankStride, // globalRankStride
size_); // worldSize

LOG(INFO) << logPrefix() << "ProcessGroupNCCL created ncclComm_ "
<< ncclComm->repr()
<< " on CUDA device: " << static_cast<int>(deviceIndex);
VLOG(2) << logPrefix() << "ProcessGroupNCCL created ncclComm_ "
<< ncclComm->repr()
<< " on CUDA device: " << static_cast<int>(deviceIndex);

// At this point NCCL should have been initialized, hence we can accurately
// get the env value even if NCCL sets it by reading from nccl.conf file
Expand Down

0 comments on commit cadc50e

Please sign in to comment.