Add red.async.add.s64 emulation

NVIDIA · Nov 10, 2023 · af46a06 · af46a06
1 parent 33f4899
commit af46a06
Show file tree

Hide file tree

Showing 3 changed files with 70 additions and 0 deletions.
diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.red.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.red.async.compile.pass.cpp
@@ -165,6 +165,16 @@ __global__ void test_compilation() {
     }
   ));
 #endif // __cccl_ptx_isa >= 810
+
+#if __cccl_ptx_isa >= 810
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    if (non_eliminated_false()) {
+      // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64  [dest], value, [remote_bar]; // .u64 intentional
+      auto overload = static_cast<void (*)(cuda::ptx::op_add_t, int64_t* , const int64_t& , int64_t* )>(cuda::ptx::red_async);
+      fn_ptr = reinterpret_cast<void*>(overload);
+    }
+  ));
+#endif // __cccl_ptx_isa >= 810
 }
 
 int main(int, char**)

diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md
@@ -415,6 +415,19 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
 
 -  PTX ISA: [`red.async`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async)
 
+PTX does not currently (CTK 12.3) expose `red.async.add.s64`. This exposure is emulated in `cuda::ptx` using
+
+```cuda
+// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64  [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90
+// .op        = { .add }
+template <typename=void>
+__device__ static inline void red_async(
+  cuda::ptx::op_add_t,
+  int64_t* dest,
+  const int64_t& value,
+  int64_t* remote_bar);
+```
+
 **red_async**:
 ```cuda
 // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type}  [dest], value, [remote_bar];  // PTX ISA 81, SM_90
@@ -536,6 +549,15 @@ __device__ static inline void red_async(
   uint64_t* dest,
   const uint64_t& value,
   uint64_t* remote_bar);
+
+// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64  [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90
+// .op        = { .add }
+template <typename=void>
+__device__ static inline void red_async(
+  cuda::ptx::op_add_t,
+  int64_t* dest,
+  const int64_t& value,
+  int64_t* remote_bar);
 ```
 
 ### [9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier)

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
@@ -1038,6 +1038,44 @@ _LIBCUDACXX_DEVICE static inline void red_async(
 }
 #endif // __cccl_ptx_isa >= 810
 
+/*
+// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64  [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90
+// .op        = { .add }
+template <typename=void>
+__device__ static inline void red_async(
+  cuda::ptx::op_add_t,
+  int64_t* dest,
+  const int64_t& value,
+  int64_t* remote_bar);
+*/
+#if __cccl_ptx_isa >= 810
+extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__();
+template <typename=void>
+_LIBCUDACXX_DEVICE static inline void red_async(
+  op_add_t,
+  _CUDA_VSTD::int64_t* __dest,
+  const _CUDA_VSTD::int64_t& __value,
+  _CUDA_VSTD::int64_t* __remote_bar)
+{
+  // __op == op_add (due to parameter type constraint)
+
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    asm (
+      "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64  [%0], %1, [%2]; // .u64 intentional"
+      :
+      : "r"(__as_ptr_remote_dsmem(__dest)),
+        "l"(__value),
+        "r"(__as_ptr_remote_dsmem(__remote_bar))
+      : "memory"
+    );
+
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 810
+
 
 // 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated)
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated