Small enhancements for doc and error messages. #11002

Merged · 1 commit · Nov 19, 2024
3 changes: 2 additions & 1 deletion demo/guide-python/distributed_extmem_basic.py
@@ -106,7 +106,8 @@ def setup_rmm() -> None:
         return

     try:
-        from cuda import cudart
+        # Use the arena pool if available
+        from cuda.bindings import runtime as cudart
         from rmm.mr import ArenaMemoryResource

         status, free, total = cudart.cudaMemGetInfo()
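The hunk above switches the demo to the newer `cuda.bindings.runtime` module layout while keeping the `cudart` alias, so the rest of the function is unchanged. A minimal standalone sketch of the same idea, not part of this PR, assuming the legacy `cuda.cudart` path is still importable as a fallback on older cuda-python releases:

```python
# Sketch only: resolve the CUDA runtime bindings under either module layout,
# then query device memory the same way the demo does.
try:
    from cuda.bindings import runtime as cudart  # newer cuda-python layout
except ImportError:
    from cuda import cudart  # assumed legacy layout on older releases

status, free, total = cudart.cudaMemGetInfo()
if status != cudart.cudaError_t.cudaSuccess:
    raise RuntimeError(cudart.cudaGetErrorString(status))
print(f"free={free} bytes, total={total} bytes")
```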
23 changes: 17 additions & 6 deletions demo/guide-python/external_memory.py
@@ -156,17 +156,28 @@ def main(tmpdir: str, args: argparse.Namespace) -> None:
 def setup_rmm() -> None:
     """Setup RMM for GPU-based external memory training."""
     import rmm
-    from cuda import cudart
     from rmm.allocators.cupy import rmm_cupy_allocator

     if not xgboost.build_info()["USE_RMM"]:
         return

-    # The combination of pool and async is by design. As XGBoost needs to allocate large
-    # pages repeatedly, it's not easy to handle fragmentation. We can use more experiments
-    # here.
-    mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
-    rmm.mr.set_current_device_resource(mr)
+    try:
+        # Use the arena pool if available
+        from cuda.bindings import runtime as cudart
+        from rmm.mr import ArenaMemoryResource
+
+        status, free, total = cudart.cudaMemGetInfo()
+        if status != cudart.cudaError_t.cudaSuccess:
+            raise RuntimeError(cudart.cudaGetErrorString(status))
+
+        mr = rmm.mr.CudaMemoryResource()
+        mr = ArenaMemoryResource(mr, arena_size=int(total * 0.9))
+    except ImportError:
+        # The combination of pool and async is by design. As XGBoost needs to allocate
+        # large pages repeatedly, it's not easy to handle fragmentation. We can use more
+        # experiments here.
+        mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
+    rmm.mr.set_current_device_resource(mr)
     # Set the allocator for cupy as well.
     cp.cuda.set_allocator(rmm_cupy_allocator)
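As with the first demo, the arena pool sized to roughly 90% of total device memory is preferred, and the pool-over-async resource remains the fallback when `ArenaMemoryResource` or the new bindings are unavailable. A small usage sketch, assuming RMM, CuPy, and an RMM-enabled XGBoost build, to confirm the allocator took effect after calling the `setup_rmm` shown above:

```python
# Sketch only: verify that RMM and CuPy share the resource chosen by setup_rmm().
import cupy as cp
import rmm

setup_rmm()  # the function from the diff above
# Either ArenaMemoryResource or PoolMemoryResource, depending on the fallback taken.
print(type(rmm.mr.get_current_device_resource()))
x = cp.zeros(1 << 20)  # served through rmm_cupy_allocator once setup_rmm() has run
```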
6 changes: 3 additions & 3 deletions src/collective/coll.cu
@@ -153,19 +153,19 @@ template <typename Fn, typename R = std::invoke_result_t<Fn, dh::CUDAStreamView>
   auto abort = [&](std::string msg) {
     auto rc = stub->CommAbort(nccl->Handle());
     fut.wait();  // Must block, otherwise the thread might access freed memory.
-    return Fail(std::move(msg)) + std::move(rc);
+    return Fail(msg + ": " + std::to_string(nccl->Timeout().count()) + "s.") + std::move(rc);
   };
   if (!chan.called) {
     // Timeout waiting for the NCCL op to return. With older versions of NCCL, the op
     // might block even if the config is set to nonblocking.
-    return abort("NCCL future timeout.");
+    return abort("NCCL future timeout");
   }

   // This actually includes the time for prior kernels due to CUDA async calls.
   switch (fut.wait_for(nccl->Timeout())) {
     case std::future_status::timeout:
       // Timeout waiting for the NCCL op to finish.
-      return abort("NCCL timeout.");
+      return abort("NCCL timeout");
     case std::future_status::ready:
       return fut.get();
     case std::future_status::deferred:
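The change above appends the configured timeout to both abort messages, so an error now reads like `NCCL timeout: 300s.` rather than `NCCL timeout.`, which is also why the tests below match the `NCCL timeout:` prefix. An illustration of the resulting format, with 300 as a made-up timeout value:

```python
# Illustration only: the message shape produced by the abort() helper above.
timeout_s = 300  # hypothetical value of nccl->Timeout().count()
msg = "NCCL timeout" + ": " + str(timeout_s) + "s."
print(msg)  # NCCL timeout: 300s.
```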
4 changes: 2 additions & 2 deletions tests/cpp/collective/test_allreduce.cu
@@ -111,7 +111,7 @@ TEST_F(MGPUAllreduceTest, Timeout) {
     auto rc = w->NoCheck();
     if (r == 1) {
       auto rep = rc.Report();
-      ASSERT_NE(rep.find("NCCL timeout."), std::string::npos) << rep;
+      ASSERT_NE(rep.find("NCCL timeout:"), std::string::npos) << rep;
     }

     w.reset();
@@ -131,7 +131,7 @@ TEST_F(MGPUAllreduceTest, Timeout) {
     // Only one of the workers is doing allreduce.
     if (r == 0) {
       auto rc = w->NoCheck();
-      ASSERT_NE(rc.Report().find("NCCL timeout."), std::string::npos) << rc.Report();
+      ASSERT_NE(rc.Report().find("NCCL timeout:"), std::string::npos) << rc.Report();
     }

     w.reset();