Retry error logging (#1844)
Retry of the improved error throwing and logging, with bugs fixed and a test added that reproduces the cudf failure. This restores the [original PR](#1827) that was [reverted](#1843).

The changes relative to the previously approved PR, including the fixes and the new test, are in [this commit](c8a8505). The [original while loop](https://github.com/rapidsai/rmm/blob/6e8539e42d51852faab5f9b330232168f9223eee/include/rmm/mr/device/pool_memory_resource.hpp#L253) has been restored with better error handling.

Note that this changes the interface of the macros; one of them is called in cudf and will be updated [here](rapidsai/cudf#18108) after this change goes in.
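
For illustration, here is a minimal sketch of the updated call sites (the helper functions below are hypothetical and not part of this PR): the failure messages may now be runtime-built `std::string`s, and `RMM_CUDA_TRY_ALLOC` accepts an optional second argument carrying the requested byte count.

```cpp
#include <rmm/detail/error.hpp>

#include <cuda_runtime_api.h>

#include <cstddef>
#include <string>

// Hypothetical helper: the byte count passed as the second macro argument is
// embedded in the thrown rmm::bad_alloc / rmm::out_of_memory message.
void* allocate_or_throw(std::size_t bytes)
{
  void* ptr{nullptr};
  RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes), bytes);
  return ptr;
}

// Hypothetical helper: the reason may now be a std::string built at runtime.
void check_resource_name(std::string const& name)
{
  RMM_EXPECTS(!name.empty(), "Invalid memory_resource name: " + name);
}
```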

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: #1844
pmattione-nvidia authored Mar 4, 2025
1 parent b44ebb6 commit dea0b21
Showing 14 changed files with 100 additions and 45 deletions.
@@ -138,9 +138,7 @@ MRFactoryFunc get_mr_factory(std::string const& resource_name)
if (resource_name == "arena") { return &make_arena; }
if (resource_name == "binning") { return &make_binning; }

std::cout << "Error: invalid memory_resource name: " << resource_name << std::endl;

RMM_FAIL();
RMM_FAIL("Invalid memory_resource name: " + resource_name);
}

void declare_benchmark(std::string const& name)
@@ -175,7 +173,7 @@ void declare_benchmark(std::string const& name)
return;
}

std::cout << "Error: invalid memory_resource name: " << name << std::endl;
RMM_FAIL("Invalid memory_resource name: " + name);
}

// NOLINTNEXTLINE(bugprone-easily-swappable-parameters)
5 changes: 4 additions & 1 deletion benchmarks/utilities/simulated_memory_resource.hpp
@@ -65,7 +65,10 @@ class simulated_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, cuda_stream_view) override
{
// NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
RMM_EXPECTS(begin_ + bytes <= end_, "Simulated memory size exceeded", rmm::bad_alloc);
RMM_EXPECTS(begin_ + bytes <= end_,
"Simulated memory size exceeded (failed to allocate " +
rmm::detail::format_bytes(bytes) + ")",
rmm::bad_alloc);
auto* ptr = static_cast<void*>(begin_);
begin_ += bytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
return ptr;
42 changes: 33 additions & 9 deletions include/rmm/detail/error.hpp
@@ -55,11 +55,11 @@
GET_RMM_EXPECTS_MACRO(__VA_ARGS__, RMM_EXPECTS_3, RMM_EXPECTS_2) \
(__VA_ARGS__)
#define GET_RMM_EXPECTS_MACRO(_1, _2, _3, NAME, ...) NAME
#define RMM_EXPECTS_3(_condition, _reason, _exception_type) \
(!!(_condition)) ? static_cast<void>(0) \
: throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/ \
{ \
"RMM failure at: " __FILE__ ":" RMM_STRINGIFY(__LINE__) ": " _reason \
#define RMM_EXPECTS_3(_condition, _reason, _exception_type) \
(!!(_condition)) ? static_cast<void>(0) \
: throw _exception_type /*NOLINT(bugprone-macro-parentheses)*/ \
{ \
std::string("RMM failure at: " __FILE__ ":" RMM_STRINGIFY(__LINE__) ": ") + _reason \
}
#define RMM_EXPECTS_2(_condition, _reason) RMM_EXPECTS_3(_condition, _reason, rmm::logic_error)

@@ -79,9 +79,10 @@
GET_RMM_FAIL_MACRO(__VA_ARGS__, RMM_FAIL_2, RMM_FAIL_1) \
(__VA_ARGS__)
#define GET_RMM_FAIL_MACRO(_1, _2, NAME, ...) NAME
#define RMM_FAIL_2(_what, _exception_type) \
/*NOLINTNEXTLINE(bugprone-macro-parentheses)*/ \
throw _exception_type{"RMM failure at:" __FILE__ ":" RMM_STRINGIFY(__LINE__) ": " _what};
#define RMM_FAIL_2(_what, _exception_type) \
/*NOLINTNEXTLINE(bugprone-macro-parentheses)*/ \
throw _exception_type{std::string{"RMM failure at:" __FILE__ ":" RMM_STRINGIFY(__LINE__) ": "} + \
_what};
#define RMM_FAIL_1(_what) RMM_FAIL_2(_what, rmm::logic_error)

/**
@@ -131,8 +132,31 @@
*
* Defaults to throwing rmm::bad_alloc, but when `cudaErrorMemoryAllocation` is returned,
* rmm::out_of_memory is thrown instead.
*
* Can be called with either 1 or 2 arguments:
* - RMM_CUDA_TRY_ALLOC(cuda_call): Performs error checking without specifying bytes
* - RMM_CUDA_TRY_ALLOC(cuda_call, num_bytes): Includes the number of bytes in the error message
*/
#define RMM_CUDA_TRY_ALLOC(_call) \
#define RMM_CUDA_TRY_ALLOC(...) \
GET_RMM_CUDA_TRY_ALLOC_MACRO(__VA_ARGS__, RMM_CUDA_TRY_ALLOC_2, RMM_CUDA_TRY_ALLOC_1) \
(__VA_ARGS__)
#define GET_RMM_CUDA_TRY_ALLOC_MACRO(_1, _2, NAME, ...) NAME

#define RMM_CUDA_TRY_ALLOC_2(_call, num_bytes) \
do { \
cudaError_t const error = (_call); \
if (cudaSuccess != error) { \
cudaGetLastError(); \
auto const msg = std::string{"CUDA error (failed to allocate "} + \
std::to_string(num_bytes) + " bytes) at: " + __FILE__ + ":" + \
RMM_STRINGIFY(__LINE__) + ": " + cudaGetErrorName(error) + " " + \
cudaGetErrorString(error); \
if (cudaErrorMemoryAllocation == error) { throw rmm::out_of_memory{msg}; } \
throw rmm::bad_alloc{msg}; \
} \
} while (0)

#define RMM_CUDA_TRY_ALLOC_1(_call) \
do { \
cudaError_t const error = (_call); \
if (cudaSuccess != error) { \
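
The one-versus-two argument selection above relies on the standard variadic-macro argument-count dispatch. A minimal standalone sketch of that pattern (illustrative only, not RMM code):

```cpp
#include <cstdio>

// With one argument, NAME binds to MY_FAIL_1; with two, it binds to MY_FAIL_2,
// mirroring how GET_RMM_FAIL_MACRO / GET_RMM_CUDA_TRY_ALLOC_MACRO select overloads.
#define MY_FAIL(...)                                   \
  GET_MY_FAIL_MACRO(__VA_ARGS__, MY_FAIL_2, MY_FAIL_1) \
  (__VA_ARGS__)
#define GET_MY_FAIL_MACRO(_1, _2, NAME, ...) NAME
#define MY_FAIL_2(_what, _code) std::printf("failure %d: %s\n", _code, _what)
#define MY_FAIL_1(_what) MY_FAIL_2(_what, -1)

int main()
{
  MY_FAIL("out of memory");      // dispatches to MY_FAIL_1
  MY_FAIL("out of memory", 12);  // dispatches to MY_FAIL_2
  return 0;
}
```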
4 changes: 3 additions & 1 deletion include/rmm/mr/device/arena_memory_resource.hpp
@@ -166,7 +166,9 @@ class arena_memory_resource final : public device_memory_resource {
void* pointer = arena.allocate(bytes);
if (pointer == nullptr) {
if (dump_log_on_failure_) { dump_memory_log(bytes); }
RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);
auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
rmm::detail::format_bytes(bytes) + "): No room in arena.";
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
}
return pointer;
}
3 changes: 2 additions & 1 deletion include/rmm/mr/device/cuda_async_view_memory_resource.hpp
@@ -98,7 +98,8 @@ class cuda_async_view_memory_resource final : public device_memory_resource {
{
void* ptr{nullptr};
if (bytes > 0) {
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()));
RMM_CUDA_TRY_ALLOC(cudaMallocFromPoolAsync(&ptr, bytes, pool_handle(), stream.value()),
bytes);
}
return ptr;
}
2 changes: 1 addition & 1 deletion include/rmm/mr/device/cuda_memory_resource.hpp
@@ -59,7 +59,7 @@ class cuda_memory_resource final : public device_memory_resource {
void* do_allocate(std::size_t bytes, [[maybe_unused]] cuda_stream_view stream) override
{
void* ptr{nullptr};
RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes));
RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, bytes), bytes);
return ptr;
}

@@ -214,7 +214,8 @@ class stream_ordered_memory_resource : public crtp<PoolResource>, public device_

size = rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT);
RMM_EXPECTS(size <= this->underlying().get_maximum_allocation_size(),
"Maximum allocation size exceeded",
std::string("Maximum allocation size exceeded (failed to allocate ") +
rmm::detail::format_bytes(size) + ")",
rmm::out_of_memory);
auto const block = this->underlying().get_block(size, stream_event);

5 changes: 4 additions & 1 deletion include/rmm/mr/device/limiting_resource_adaptor.hpp
@@ -18,6 +18,7 @@
#include <rmm/aligned.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>
@@ -150,7 +151,9 @@ class limiting_resource_adaptor final : public device_memory_resource {
}

allocated_bytes_ -= proposed_size;
RMM_FAIL("Exceeded memory limit", rmm::out_of_memory);
auto const msg = std::string("Exceeded memory limit (failed to allocate ") +
rmm::detail::format_bytes(bytes) + ")";
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
}

/**
2 changes: 1 addition & 1 deletion include/rmm/mr/device/managed_memory_resource.hpp
@@ -63,7 +63,7 @@ class managed_memory_resource final : public device_memory_resource {
if (bytes == 0) { return nullptr; }

void* ptr{nullptr};
RMM_CUDA_TRY_ALLOC(cudaMallocManaged(&ptr, bytes));
RMM_CUDA_TRY_ALLOC(cudaMallocManaged(&ptr, bytes), bytes);
return ptr;
}

52 changes: 31 additions & 21 deletions include/rmm/mr/device/pool_memory_resource.hpp
@@ -242,8 +242,8 @@ class pool_memory_resource final
* Attempts to allocate `try_size` bytes from upstream. If it fails, it iteratively reduces the
* attempted size by half until `min_size`, returning the allocated block once it succeeds.
*
* @throws rmm::bad_alloc if `min_size` bytes cannot be allocated from upstream or maximum pool
* size is exceeded.
* @throws rmm::out_of_memory if `min_size` bytes cannot be allocated from upstream or maximum
* pool size is exceeded.
*
* @param try_size The initial requested size to try allocating.
* @param min_size The minimum requested size to try allocating.
@@ -252,21 +252,34 @@
*/
block_type try_to_expand(std::size_t try_size, std::size_t min_size, cuda_stream_view stream)
{
auto report_error = [&](const char* reason) {
RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded: %s]",
rmm::detail::format_stream(stream),
min_size,
reason);
auto const msg = std::string("Maximum pool size exceeded (failed to allocate ") +
rmm::detail::format_bytes(min_size) + std::string("): ") + reason;
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
};

while (try_size >= min_size) {
auto block = block_from_upstream(try_size, stream);
if (block.has_value()) {
current_pool_size_ += block.value().size();
return block.value();
}
if (try_size == min_size) {
break; // only try `size` once
try {
auto block = block_from_upstream(try_size, stream);
current_pool_size_ += block.size();
return block;
} catch (std::exception const& e) {
if (try_size == min_size) { report_error(e.what()); }
}
try_size = std::max(min_size, try_size / 2);
}
RMM_LOG_ERROR("[A][Stream %s][Upstream %zuB][FAILURE maximum pool size exceeded]",
rmm::detail::format_stream(stream),
min_size);
RMM_FAIL("Maximum pool size exceeded", rmm::out_of_memory);

auto const max_size = maximum_pool_size_.value_or(std::numeric_limits<std::size_t>::max());
auto const msg = std::string("Not enough room to grow, current/max/try size = ") +
rmm::detail::format_bytes(pool_size()) + ", " +
rmm::detail::format_bytes(max_size) + ", " +
rmm::detail::format_bytes(min_size);
report_error(msg.c_str());
return {};
}

/**
@@ -307,6 +320,7 @@ class pool_memory_resource final
// limit each time. If it is not set, grow exponentially, e.g. by doubling the pool size each
// time. Upon failure, attempt to back off exponentially, e.g. by half the attempted size,
// until either success or the attempt is less than the requested size.

return try_to_expand(size_to_grow(size), size, stream);
}

@@ -339,21 +353,17 @@
*
* @param size The size in bytes to allocate from the upstream resource
* @param stream The stream on which the memory is to be used.
* @throws if call to allocate_async() throws
* @return block_type The allocated block
*/
std::optional<block_type> block_from_upstream(std::size_t size, cuda_stream_view stream)
block_type block_from_upstream(std::size_t size, cuda_stream_view stream)
{
RMM_LOG_DEBUG("[A][Stream %s][Upstream %zuB]", rmm::detail::format_stream(stream), size);

if (size == 0) { return {}; }

try {
void* ptr = get_upstream_resource().allocate_async(size, stream);
return std::optional<block_type>{
*upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first};
} catch (std::exception const& e) {
return std::nullopt;
}
void* ptr = get_upstream_resource().allocate_async(size, stream);
return *upstream_blocks_.emplace(static_cast<char*>(ptr), size, true).first;
}

/**
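
A condensed, self-contained sketch of the restored grow-and-back-off loop in `try_to_expand` (illustrative only; `upstream_allocate` is a hypothetical stand-in for the upstream resource, and plain `std::runtime_error` stands in for `rmm::out_of_memory`):

```cpp
#include <algorithm>
#include <cstddef>
#include <new>
#include <stdexcept>
#include <string>

// Hypothetical upstream allocator with a small fixed budget; throws when exhausted.
void* upstream_allocate(std::size_t bytes)
{
  static std::size_t remaining = std::size_t{1} << 20;  // 1 MiB budget for the sketch
  if (bytes > remaining) { throw std::bad_alloc{}; }
  remaining -= bytes;
  return ::operator new(bytes);
}

// Try `try_size` first, halving on failure down to `min_size`; rethrow with context
// once the minimum acceptable request itself fails.
void* expand_with_backoff(std::size_t try_size, std::size_t min_size)
{
  while (try_size >= min_size) {
    try {
      return upstream_allocate(try_size);
    } catch (std::exception const& e) {
      if (try_size == min_size) {  // smallest acceptable request also failed
        throw std::runtime_error(std::string{"Maximum pool size exceeded: "} + e.what());
      }
    }
    try_size = std::max(min_size, try_size / 2);  // exponential back-off
  }
  throw std::runtime_error("Not enough room to grow");  // try_size < min_size on entry
}

int main()
{
  // 4 MiB fails, 2 MiB fails, 1 MiB succeeds against the 1 MiB budget above.
  void* ptr = expand_with_backoff(std::size_t{4} << 20, std::size_t{64} << 10);
  ::operator delete(ptr);
  return 0;
}
```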
5 changes: 4 additions & 1 deletion include/rmm/mr/device/system_memory_resource.hpp
@@ -19,6 +19,7 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/detail/error.hpp>
#include <rmm/detail/export.hpp>
#include <rmm/detail/format.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>

#include <cstddef>
@@ -103,7 +104,9 @@ class system_memory_resource final : public device_memory_resource {
return rmm::detail::aligned_host_allocate(
bytes, CUDA_ALLOCATION_ALIGNMENT, [](std::size_t size) { return ::operator new(size); });
} catch (std::bad_alloc const& e) {
RMM_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory);
auto const msg = std::string("Failed to allocate ") + rmm::detail::format_bytes(bytes) +
std::string(" of memory: ") + e.what();
RMM_FAIL(msg.c_str(), rmm::out_of_memory);
}
}

3 changes: 1 addition & 2 deletions include/rmm/mr/host/pinned_memory_resource.hpp
@@ -122,8 +122,7 @@ class pinned_memory_resource final : public host_memory_resource {

return rmm::detail::aligned_host_allocate(bytes, alignment, [](std::size_t size) {
void* ptr{nullptr};
auto status = cudaMallocHost(&ptr, size);
if (cudaSuccess != status) { throw std::bad_alloc{}; }
RMM_CUDA_TRY_ALLOC(cudaMallocHost(&ptr, size), size);
return ptr;
});
}
2 changes: 1 addition & 1 deletion include/rmm/mr/pinned_host_memory_resource.hpp
@@ -72,7 +72,7 @@

return rmm::detail::aligned_host_allocate(bytes, alignment, [](std::size_t size) {
void* ptr{nullptr};
RMM_CUDA_TRY_ALLOC(cudaHostAlloc(&ptr, size, cudaHostAllocDefault));
RMM_CUDA_TRY_ALLOC(cudaHostAlloc(&ptr, size, cudaHostAllocDefault), size);
return ptr;
});
}
11 changes: 11 additions & 0 deletions tests/mr/host/pinned_pool_mr_tests.cpp
@@ -88,5 +88,16 @@ TEST(PinnedPoolTest, NonAlignedPoolSize)
rmm::logic_error);
}

TEST(PinnedPoolTest, ThrowOutOfMemory)
{
rmm::mr::pinned_memory_resource pinned_mr{};
const auto initial{0};
const auto maximum{1024};
pool_mr mr{pinned_mr, initial, maximum};
mr.allocate(1024);

EXPECT_THROW(mr.allocate(1024), rmm::out_of_memory);
}

} // namespace
} // namespace rmm::test
