LOG(INFO) -> VLOG(2) in ProcessGroupNCCL (pytorch#130696)

ppwwyyxx · kwen2501 · pytorchmergebot · commit cadc50e7e962 · 2024-11-04T04:43:42.000Z
In the same spirit as pytorch#105695: initialization and error-handling logs are mostly kept at their current level, while routine logs are changed from LOG(INFO) to VLOG(2). Pull Request resolved: pytorch#130696 Approved by: https://github.com/kwen2501 Co-authored-by: Ke Wen <kw2501@fb.com>
diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -1243,8 +1243,8 @@ void ProcessGroupNCCL::waitForFutureOrTimeout(
     try {
       bool result = fut.get();
       if (result) {
-        LOG(INFO) << logPrefix()
-                  << "future is successfully executed for: " << futDescription;
+        VLOG(2) << logPrefix()
+                << "future is successfully executed for: " << futDescription;
         if (log) {
           data.strings["status"] = "SUCCESS";
         }
@@ -1311,8 +1311,9 @@ void ProcessGroupNCCL::abortCommsFromMap(
       // TODO: fix `getIndexFromDeviceKey` or fix `DeviceKey`
       gpuGuard.set_index(deviceIndex);
     }
-    LOG(INFO) << logPrefix() << "ProcessGroupNCCL destroying ncclComm_ "
-              << ncclComm->repr() << " on CUDA device: " << devName;
+
+    VLOG(2) << logPrefix() << "ProcessGroupNCCL destroying ncclComm_ "
+            << ncclComm->repr() << " on CUDA device: " << devName;
     ncclComm->ncclCommAbort(abortReason);
     // Note that we don't remove the aborted communicators from the
     // cache. The reason is that if we do remove the communicator
@@ -1324,8 +1325,8 @@ void ProcessGroupNCCL::abortCommsFromMap(
     // their responsibility to destroy the process group and recreate
     // it to recover from errors.
 
-    LOG(INFO) << logPrefix() << "ProcessGroupNCCL destroyed "
-              << " communicator on CUDA device: " << devName;
+    VLOG(2) << logPrefix() << "ProcessGroupNCCL destroyed "
+            << " communicator on CUDA device: " << devName;
   }
 }
 
@@ -1391,7 +1392,7 @@ void ProcessGroupNCCL::shutdown() {
 
 // NOLINTNEXTLINE(bugprone-exception-escape)
 ProcessGroupNCCL::~ProcessGroupNCCL() {
-  LOG(INFO) << logPrefix() << "ProcessGroupNCCL destructor entered.";
+  VLOG(2) << logPrefix() << "ProcessGroupNCCL destructor entered.";
 
   if (!terminateProcessGroup_.load()) {
     if (rank_ % localDeviceCount_ == 0) {
@@ -1413,19 +1414,19 @@ ProcessGroupNCCL::~ProcessGroupNCCL() {
   if (!blockingWait_) {
     if (ncclCommWatchdogThread_.joinable()) {
       ncclCommWatchdogThread_.join();
-      LOG(INFO) << logPrefix() << "ProcessGroupNCCL watchdog thread joined.";
+      VLOG(2) << logPrefix() << "ProcessGroupNCCL watchdog thread joined.";
     }
     if (ncclHeartbeatMonitorThread_.joinable()) {
       ncclHeartbeatMonitorThread_.join();
-      LOG(INFO) << logPrefix()
-                << "ProcessGroupNCCL heart beat monitor thread joined.";
+      VLOG(2) << logPrefix()
+              << "ProcessGroupNCCL heart beat monitor thread joined.";
     }
   }
 #endif
   if (onCompletionHookThread_.joinable()) {
     onCompletionHookThread_.join();
-    LOG(INFO) << logPrefix()
-              << "ProcessGroupNCCL onCompletionHookThread thread joined.";
+    VLOG(2) << logPrefix()
+            << "ProcessGroupNCCL onCompletionHookThread thread joined.";
   }
 }
 
@@ -1673,7 +1674,7 @@ void ProcessGroupNCCL::heartbeatMonitor() {
           << "Could not acquire GIL within 300 ms on exit, possible GIL induced hang";
     }
   } else {
-    LOG(INFO)
+    VLOG(2)
         << logPrefix()
         << "GIL checker was not registered, perhaps this is a no-python build?";
   }
@@ -1748,7 +1749,7 @@ void ProcessGroupNCCL::ncclCommWatchdog() {
   } catch (std::exception& e) {
     if (std::string(e.what()).find("driver shutting down") !=
         std::string::npos) {
-      LOG(INFO)
+      VLOG(2)
           << logPrefix()
           << "main process destroyed cuda before watchdog loop exited, terminating watchdog."
           << " (Watchdog caught exception: " << e.what();
@@ -2481,9 +2482,9 @@ std::shared_ptr<NCCLComm> ProcessGroupNCCL::initNCCLComm(
       globalRankStride, // globalRankStride
       size_); // worldSize
 
-  LOG(INFO) << logPrefix() << "ProcessGroupNCCL created ncclComm_ "
-            << ncclComm->repr()
-            << " on CUDA device: " << static_cast<int>(deviceIndex);
+  VLOG(2) << logPrefix() << "ProcessGroupNCCL created ncclComm_ "
+          << ncclComm->repr()
+          << " on CUDA device: " << static_cast<int>(deviceIndex);
 
   // At this point NCCL should have been initialized, hence we can accurately
   // get the env value even if NCCL sets it by reading from nccl.conf file