Small enhancements for doc and error messages. #11002

Merged · 1 commit · Nov 19, 2024
3 changes: 2 additions & 1 deletion demo/guide-python/distributed_extmem_basic.py
@@ -106,7 +106,8 @@ def setup_rmm() -> None:
         return

     try:
-        from cuda import cudart
+        # Use the arena pool if available
+        from cuda.bindings import runtime as cudart
         from rmm.mr import ArenaMemoryResource

         status, free, total = cudart.cudaMemGetInfo()
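The hunk above switches the demo to the newer `cuda.bindings.runtime` module layout while keeping the `cudart` alias, so the rest of the function is unchanged. A minimal standalone sketch of the same idea, not part of this PR, assuming the legacy `cuda.cudart` path is still importable as a fallback on older cuda-python releases:

```python
# Sketch only: resolve the CUDA runtime bindings under either module layout,
# then query device memory the same way the demo does.
try:
    from cuda.bindings import runtime as cudart  # newer cuda-python layout
except ImportError:
    from cuda import cudart  # assumed legacy layout on older releases

status, free, total = cudart.cudaMemGetInfo()
if status != cudart.cudaError_t.cudaSuccess:
    raise RuntimeError(cudart.cudaGetErrorString(status))
print(f"free={free} bytes, total={total} bytes")
```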
23 changes: 17 additions & 6 deletions demo/guide-python/external_memory.py
@@ -156,17 +156,28 @@ def main(tmpdir: str, args: argparse.Namespace) -> None:
 def setup_rmm() -> None:
     """Setup RMM for GPU-based external memory training."""
     import rmm
-    from cuda import cudart
     from rmm.allocators.cupy import rmm_cupy_allocator

     if not xgboost.build_info()["USE_RMM"]:
         return

-    # The combination of pool and async is by design. As XGBoost needs to allocate large
-    # pages repeatedly, it's not easy to handle fragmentation. We can use more experiments
-    # here.
-    mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
-    rmm.mr.set_current_device_resource(mr)
+    try:
+        # Use the arena pool if available
+        from cuda.bindings import runtime as cudart
+        from rmm.mr import ArenaMemoryResource
+
+        status, free, total = cudart.cudaMemGetInfo()
+        if status != cudart.cudaError_t.cudaSuccess:
+            raise RuntimeError(cudart.cudaGetErrorString(status))
+
+        mr = rmm.mr.CudaMemoryResource()
+        mr = ArenaMemoryResource(mr, arena_size=int(total * 0.9))
+    except ImportError:
+        # The combination of pool and async is by design. As XGBoost needs to allocate
+        # large pages repeatedly, it's not easy to handle fragmentation. We can use more
+        # experiments here.
+        mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource())
+    rmm.mr.set_current_device_resource(mr)
     # Set the allocator for cupy as well.
     cp.cuda.set_allocator(rmm_cupy_allocator)
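As with the first demo, the arena pool sized to roughly 90% of total device memory is preferred, and the pool-over-async resource remains the fallback when `ArenaMemoryResource` or the new bindings are unavailable. A small usage sketch, assuming RMM, CuPy, and an RMM-enabled XGBoost build, to confirm the allocator took effect after calling the `setup_rmm` shown above:

```python
# Sketch only: verify that RMM and CuPy share the resource chosen by setup_rmm().
import cupy as cp
import rmm

setup_rmm()  # the function from the diff above
# Either ArenaMemoryResource or PoolMemoryResource, depending on the fallback taken.
print(type(rmm.mr.get_current_device_resource()))
x = cp.zeros(1 << 20)  # served through rmm_cupy_allocator once setup_rmm() has run
```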
6 changes: 3 additions & 3 deletions src/collective/coll.cu
@@ -153,19 +153,19 @@ template <typename Fn, typename R = std::invoke_result_t<Fn, dh::CUDAStreamView>
   auto abort = [&](std::string msg) {
     auto rc = stub->CommAbort(nccl->Handle());
     fut.wait();  // Must block, otherwise the thread might access freed memory.
-    return Fail(std::move(msg)) + std::move(rc);
+    return Fail(msg + ": " + std::to_string(nccl->Timeout().count()) + "s.") + std::move(rc);
   };
   if (!chan.called) {
     // Timeout waiting for the NCCL op to return. With older versions of NCCL, the op
     // might block even if the config is set to nonblocking.
-    return abort("NCCL future timeout.");
+    return abort("NCCL future timeout");
   }

   // This actually includes the time for prior kernels due to CUDA async calls.
   switch (fut.wait_for(nccl->Timeout())) {
     case std::future_status::timeout:
       // Timeout waiting for the NCCL op to finish.
-      return abort("NCCL timeout.");
+      return abort("NCCL timeout");
     case std::future_status::ready:
       return fut.get();
     case std::future_status::deferred:
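The change above appends the configured timeout to both abort messages, so an error now reads like `NCCL timeout: 300s.` rather than `NCCL timeout.`, which is also why the tests below match the `NCCL timeout:` prefix. An illustration of the resulting format, with 300 as a made-up timeout value:

```python
# Illustration only: the message shape produced by the abort() helper above.
timeout_s = 300  # hypothetical value of nccl->Timeout().count()
msg = "NCCL timeout" + ": " + str(timeout_s) + "s."
print(msg)  # NCCL timeout: 300s.
```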
4 changes: 2 additions & 2 deletions tests/cpp/collective/test_allreduce.cu
@@ -111,7 +111,7 @@ TEST_F(MGPUAllreduceTest, Timeout) {
     auto rc = w->NoCheck();
     if (r == 1) {
       auto rep = rc.Report();
-      ASSERT_NE(rep.find("NCCL timeout."), std::string::npos) << rep;
+      ASSERT_NE(rep.find("NCCL timeout:"), std::string::npos) << rep;
     }

     w.reset();
@@ -131,7 +131,7 @@ TEST_F(MGPUAllreduceTest, Timeout) {
     // Only one of the workers is doing allreduce.
     if (r == 0) {
       auto rc = w->NoCheck();
-      ASSERT_NE(rc.Report().find("NCCL timeout."), std::string::npos) << rc.Report();
+      ASSERT_NE(rc.Report().find("NCCL timeout:"), std::string::npos) << rc.Report();
     }

     w.reset();