Retry error logging (#1844)
Retry of the improved error throwing and logging, with bugs fixed and a test added that reproduces the cudf failure. This restores the [original PR](#1827) that was [reverted](#1843).

The changes relative to the previously approved PR, including the fixes and the new test, are in [this commit](c8a8505). The [original while loop](https://github.com/rapidsai/rmm/blob/6e8539e42d51852faab5f9b330232168f9223eee/include/rmm/mr/device/pool_memory_resource.hpp#L253) has been restored with better error handling.

Note that this changes the interface of the macros; one of them is called in cudf and will be updated [here](rapidsai/cudf#18108) after this change goes in.
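
For illustration, here is a minimal sketch of the updated call sites (the helper functions below are hypothetical and not part of this PR): the failure messages may now be runtime-built `std::string`s, and `RMM_CUDA_TRY_ALLOC` accepts an optional second argument carrying the requested byte count.

```cpp
#include <rmm/detail/error.hpp>

#include <cuda_runtime_api.h>

#include <cstddef>
#include <string>

// Hypothetical helper: the byte count passed as the second macro argument is
// embedded in the thrown rmm::bad_alloc / rmm::out_of_memory message.
void* allocate_or_throw(std::size_t bytes)
{
  void* ptr{nullptr};
  RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes), bytes);
  return ptr;
}

// Hypothetical helper: the reason may now be a std::string built at runtime.
void check_resource_name(std::string const& name)
{
  RMM_EXPECTS(!name.empty(), "Invalid memory_resource name: " + name);
}
```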

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #1844
pmattione-nvidia authored Mar 4, 2025
1 parent b44ebb6 commit dea0b21
Showing 14 changed files with 100 additions and 45 deletions.
@@ -138,9 +138,7 @@ MRFactoryFunc get_mr_factory(std::string const& resource_name)
if (resource_name == "arena") { return &make_arena; }
if (resource_name == "binning") { return &make_binning; }

std::cout << "Error: invalid memory_resource name: " << resource_name << std::endl;

RMM_FAIL();
RMM_FAIL("Invalid memory_resource name: " + resource_name);
}

void declare_benchmark(std::string const& name)
@@ -175,7 +173,7 @@ void declare_benchmark(std::string const& name)
return;
}

std::cout << "Error: invalid memory_resource name: " << name << std::endl;
RMM_FAIL("Invalid memory_resource name: " + name);
}

// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
5 changes: 4 additions & 1 deletion benchmarks/utilities/simulated_memory_resource.hpp
@@ -65,7 +65,10 @@ class simulated_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view) override
{
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
RMM_EXPECTS(begin_ + bytes <= end_, "Simulated memory size exceeded", rmm::bad_alloc);
RMM_EXPECTS(begin_ + bytes <= end_,
"Simulated memory size exceeded (failed to allocate " +
rmm::detail::format_bytes(bytes) + ")",
rmm::bad_alloc);
auto* ptr = static_cast<void*>(begin_);
begin_ += bytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
return ptr;
42 changes: 33 additions & 9 deletions include/rmm/detail/error.hpp
@@ -55,11 +55,11 @@
GET_RMM_EXPECTS_MACRO(__VA_ARGS__, RMM_EXPECTS_3, RMM_EXPECTS_2) \
(__VA_ARGS__)
#define GET_RMM_EXPECTS_MACRO(_1, _2, _3, NAME, ...) NAME
#define RMM_EXPECTS_3(_condition, _reason, _exception_type) \
(!!(_condition)) ? static_cast<void>(0) \
: throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/ \
{ \
"RMM failure at: " __FILE__ ":" RMM_STRINGIFY(__LINE__) ": " _reason \
#define RMM_EXPECTS_3(_condition, _reason, _exception_type) \
(!!(_condition)) ? static_cast<void>(0) \
: throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/ \
{ \
std::string("RMM failure at: " __FILE__ ":" RMM_STRINGIFY(__LINE__) ": ") + _reason \
}
#define RMM_EXPECTS_2(_condition, _reason) RMM_EXPECTS_3(_condition, _reason, rmm::logic_error)

@@ -79,9 +79,10 @@
GET_RMM_FAIL_MACRO(__VA_ARGS__, RMM_FAIL_2, RMM_FAIL_1) \
(__VA_ARGS__)
#define GET_RMM_FAIL_MACRO(_1, _2, NAME, ...) NAME
#define RMM_FAIL_2(_what, _exception_type) \
/*NOLINTNEXTLINE(bugprone-macro-parentheses)*/ \
throw _exception_type{"RMM failure at:" __FILE__ ":" RMM_STRINGIFY(__LINE__) ": " _what};
#define RMM_FAIL_2(_what, _exception_type) \
/*NOLINTNEXTLINE(bugprone-macro-parentheses)*/ \
throw _exception_type{std::string{"RMM failure at:" __FILE__ ":" RMM_STRINGIFY(__LINE__) ": "} + \
_what};
#define RMM_FAIL_1(_what) RMM_FAIL_2(_what, rmm::logic_error)

/**
@@ -131,8 +132,31 @@
*
* Defaults to throwing rmm::bad_alloc, but when `cudaErrorMemoryAllocation` is returned,
* rmm::out_of_memory is thrown instead.
*
* Can be called with either 1 or 2 arguments:
* - RMM_CUDA_TRY_ALLOC(cuda_call): Performs error checking without specifying bytes
* - RMM_CUDA_TRY_ALLOC(cuda_call, num_bytes): Includes the number of bytes in the error message
*/
#define RMM_CUDA_TRY_ALLOC(_call) \
#define RMM_CUDA_TRY_ALLOC(...) \
GET_RMM_CUDA_TRY_ALLOC_MACRO(__VA_ARGS__, RMM_CUDA_TRY_ALLOC_2, RMM_CUDA_TRY_ALLOC_1) \
(__VA_ARGS__)
#define GET_RMM_CUDA_TRY_ALLOC_MACRO(_1, _2, NAME, ...) NAME

#define RMM_CUDA_TRY_ALLOC_2(_call, num_bytes) \
do { \
cudaError_t const error = (_call); \
if (cudaSuccess != error) { \
cudaGetLastError(); \
auto const msg = std::string{"CUDA error (failed to allocate "} + \
std::to_string(num_bytes) + " bytes) at: " + __FILE__ + ":" + \
RMM_STRINGIFY(__LINE__) + ": " + cudaGetErrorName(error) + " " + \
cudaGetErrorString(error); \
if (cudaErrorMemoryAllocation == error) { throw rmm::out_of_memory{msg}; } \
throw rmm::bad_alloc{msg}; \
} \
} while (0)

#define RMM_CUDA_TRY_ALLOC_1(_call) \
do { \
cudaError_t const error = (_call); \
if (cudaSuccess != error) { \
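
The one-versus-two argument selection above relies on the standard variadic-macro argument-count dispatch. A minimal standalone sketch of that pattern (illustrative only, not RMM code):

```cpp
#include <cstdio>

// With one argument, NAME binds to MY_FAIL_1; with two, it binds to MY_FAIL_2,
// mirroring how GET_RMM_FAIL_MACRO / GET_RMM_CUDA_TRY_ALLOC_MACRO select overloads.
#define MY_FAIL(...)                                   \
  GET_MY_FAIL_MACRO(__VA_ARGS__, MY_FAIL_2, MY_FAIL_1) \
  (__VA_ARGS__)
#define GET_MY_FAIL_MACRO(_1, _2, NAME, ...) NAME
#define MY_FAIL_2(_what, _code) std::printf("failure %d: %s\n", _code, _what)
#define MY_FAIL_1(_what) MY_FAIL_2(_what, -1)

int main()
{
  MY_FAIL("out of memory");      // dispatches to MY_FAIL_1
  MY_FAIL("out of memory", 12);  // dispatches to MY_FAIL_2
  return 0;
}
```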
4 changes: 3 additions & 1 deletion include/rmm/mr/device/arena_memory_resource.hpp
@@ -166,7 +166,9 @@ class arena_memory_resource final : public device_memory_resource {
void* pointer = arena.allocate(bytes);
if (pointer == nullptr) {
if (dump_log_on_failure_) { dump_memory_log(bytes); }
RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
rmm::detail::format_bytes(bytes) + "): No room in arena.";
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
}
return pointer;
}
3 changes: 2 additions & 1 deletion include/rmm/mr/device/cuda_async_view_memory_resource.hpp
@@ -98,7 +98,8 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
{
void* ptr{nullptr};
if (bytes > 0) {
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()));
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()),
bytes);
}
return ptr;
}
2 changes: 1 addition & 1 deletion include/rmm/mr/device/cuda_memory_resource.hpp
@@ -59,7 +59,7 @@ class cuda_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, [[maybe_unused]] cuda_stream_view stream) override
{
void* ptr{nullptr};
RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes));
RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes), bytes);
return ptr;
}

@@ -214,7 +214,8 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_

size = rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT);
RMM_EXPECTS(size <= this->underlying().get_maximum_allocation_size(),
"Maximum allocation size exceeded",
std::string("Maximum allocation size exceeded (failed to allocate ") +
rmm::detail::format_bytes(size) + ")",
rmm::out_of_memory);
auto const block = this->underlying().get_block(size, stream_event);

5 changes: 4 additions & 1 deletion include/rmm/mr/device/limiting_resource_adaptor.hpp
@@ -18,6 +18,7 @@
#include <rmm/aligned.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>
@@ -150,7 +151,9 @@ class limiting_resource_adaptor final : public device_memory_resource {
}

allocated_bytes_ -= proposed_size;
RMM_FAIL("Exceeded memory limit", rmm::out_of_memory);
auto const msg = std::string("Exceeded memory limit (failed to allocate ") +
rmm::detail::format_bytes(bytes) + ")";
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
}

/**
2 changes: 1 addition & 1 deletion include/rmm/mr/device/managed_memory_resource.hpp
@@ -63,7 +63,7 @@ class managed_memory_resource final : public device_memory_resource {
if (bytes == 0) { return nullptr; }

void* ptr{nullptr};
RMM_CUDA_TRY_ALLOC(cudaMallocManaged(&ptr, bytes));
RMM_CUDA_TRY_ALLOC(cudaMallocManaged(&ptr, bytes), bytes);
return ptr;
}

52 changes: 31 additions & 21 deletions include/rmm/mr/device/pool_memory_resource.hpp
@@ -242,8 +242,8 @@ class pool_memory_resource final
* Attempts to allocate `try_size` bytes from upstream. If it fails, it iteratively reduces the
* attempted size by half until `min_size`, returning the allocated block once it succeeds.
*
* @throws rmm::bad_alloc if `min_size` bytes cannot be allocated from upstream or maximum pool
* size is exceeded.
* @throws rmm::out_of_memory if `min_size` bytes cannot be allocated from upstream or maximum
* pool size is exceeded.
*
* @param try_size The initial requested size to try allocating.
* @param min_size The minimum requested size to try allocating.
@@ -252,21 +252,34 @@
*/
block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)
{
auto report_error = [&](const char* reason) {
RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded: %s]",
rmm::detail::format_stream(stream),
min_size,
reason);
auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
rmm::detail::format_bytes(min_size) + std::string("): ") + reason;
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
};

while (try_size >= min_size) {
auto block = block_from_upstream(try_size, stream);
if (block.has_value()) {
current_pool_size_ += block.value().size();
return block.value();
}
if (try_size == min_size) {
break; // only try `size` once
try {
auto block = block_from_upstream(try_size, stream);
current_pool_size_ += block.size();
return block;
} catch (std::exception const& e) {
if (try_size == min_size) { report_error(e.what()); }
}
try_size = std::max(min_size, try_size / 2);
}
RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded]",
rmm::detail::format_stream(stream),
min_size);
RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);

auto const max_size = maximum_pool_size_.value_or(std::numeric_limits<std::size_t>::max());
auto const msg = std::string("Not enough room to grow, current/max/try size = ") +
rmm::detail::format_bytes(pool_size()) + ", " +
rmm::detail::format_bytes(max_size) + ", " +
rmm::detail::format_bytes(min_size);
report_error(msg.c_str());
return {};
}

/**
@@ -307,6 +320,7 @@ class pool_memory_resource final
// limit each time. If it is not set, grow exponentially, e.g. by doubling the pool size each
// time. Upon failure, attempt to back off exponentially, e.g. by half the attempted size,
// until either success or the attempt is less than the requested size.

return try_to_expand(size_to_grow(size), size, stream);
}

@@ -339,21 +353,17 @@
*
* @param size The size in bytes to allocate from the upstream resource
* @param stream The stream on which the memory is to be used.
* @throws if call to allocate_async() throws
* @return block_type The allocated block
*/
std::optional<block_type> block_from_upstream(std::size_t size, cuda_stream_view stream)
block_type block_from_upstream(std::size_t size, cuda_stream_view stream)
{
RMM_LOG_DEBUG("[A][Stream %s][Upstream %zuB]", rmm::detail::format_stream(stream), size);

if (size == 0) { return {}; }

try {
void* ptr = get_upstream_resource().allocate_async(size, stream);
return std::optional<block_type>{
*upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first};
} catch (std::exception const& e) {
return std::nullopt;
}
void* ptr = get_upstream_resource().allocate_async(size, stream);
return *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first;
}

/**
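
A condensed, self-contained sketch of the restored grow-and-back-off loop in `try_to_expand` (illustrative only; `upstream_allocate` is a hypothetical stand-in for the upstream resource, and plain `std::runtime_error` stands in for `rmm::out_of_memory`):

```cpp
#include <algorithm>
#include <cstddef>
#include <new>
#include <stdexcept>
#include <string>

// Hypothetical upstream allocator with a small fixed budget; throws when exhausted.
void* upstream_allocate(std::size_t bytes)
{
  static std::size_t remaining = std::size_t{1} << 20;  // 1 MiB budget for the sketch
  if (bytes > remaining) { throw std::bad_alloc{}; }
  remaining -= bytes;
  return ::operator new(bytes);
}

// Try `try_size` first, halving on failure down to `min_size`; rethrow with context
// once the minimum acceptable request itself fails.
void* expand_with_backoff(std::size_t try_size, std::size_t min_size)
{
  while (try_size >= min_size) {
    try {
      return upstream_allocate(try_size);
    } catch (std::exception const& e) {
      if (try_size == min_size) {  // smallest acceptable request also failed
        throw std::runtime_error(std::string{"Maximum pool size exceeded: "} + e.what());
      }
    }
    try_size = std::max(min_size, try_size / 2);  // exponential back-off
  }
  throw std::runtime_error("Not enough room to grow");  // try_size < min_size on entry
}

int main()
{
  // 4 MiB fails, 2 MiB fails, 1 MiB succeeds against the 1 MiB budget above.
  void* ptr = expand_with_backoff(std::size_t{4} << 20, std::size_t{64} << 10);
  ::operator delete(ptr);
  return 0;
}
```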
5 changes: 4 additions & 1 deletion include/rmm/mr/device/system_memory_resource.hpp
@@ -19,6 +19,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <cstddef>
@@ -103,7 +104,9 @@ class system_memory_resource final : public device_memory_resource {
return rmm::detail::aligned_host_allocate(
bytes, CUDA_ALLOCATION_ALIGNMENT, [](std::size_t size) { return ::operator new(size); });
} catch (std::bad_alloc const& e) {
RMM_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory);
auto const msg = std::string("Failed to allocate ") + rmm::detail::format_bytes(bytes) +
std::string(" of memory: ") + e.what();
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
}
}

3 changes: 1 addition & 2 deletions include/rmm/mr/host/pinned_memory_resource.hpp
@@ -122,8 +122,7 @@ class pinned_memory_resource final : public host_memory_resource {

return rmm::detail::aligned_host_allocate(bytes, alignment, [](std::size_t size) {
void* ptr{nullptr};
auto status = cudaMallocHost(&ptr, size);
if (cudaSuccess != status) { throw std::bad_alloc{}; }
RMM_CUDA_TRY_ALLOC(cudaMallocHost(&ptr, size), size);
return ptr;
});
}
2 changes: 1 addition & 1 deletion include/rmm/mr/pinned_host_memory_resource.hpp
@@ -72,7 +72,7 @@

return rmm::detail::aligned_host_allocate(bytes, alignment, [](std::size_t size) {
void* ptr{nullptr};
RMM_CUDA_TRY_ALLOC(cudaHostAlloc(&ptr, size, cudaHostAllocDefault));
RMM_CUDA_TRY_ALLOC(cudaHostAlloc(&ptr, size, cudaHostAllocDefault), size);
return ptr;
});
}
11 changes: 11 additions & 0 deletions tests/mr/host/pinned_pool_mr_tests.cpp
@@ -88,5 +88,16 @@ TEST(PinnedPoolTest, NonAlignedPoolSize)
rmm::logic_error);
}

TEST(PinnedPoolTest, ThrowOutOfMemory)
{
rmm::mr::pinned_memory_resource pinned_mr{};
const auto initial{0};
const auto maximum{1024};
pool_mr mr{pinned_mr, initial, maximum};
mr.allocate(1024);

EXPECT_THROW(mr.allocate(1024), rmm::out_of_memory);
}

} // namespace
} // namespace rmm::test
