diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.red.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.red.async.compile.pass.cpp index e61249e8784..cfc76a58586 100644 --- a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.red.async.compile.pass.cpp +++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.red.async.compile.pass.cpp @@ -165,6 +165,16 @@ __global__ void test_compilation() { } )); #endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; // .u64 intentional + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 } int main(int, char**) diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index a93f2b54cab..cd3d2648778 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -415,6 +415,19 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release - PTX ISA: [`red.async`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async) +PTX does not currently (CTK 12.3) expose `red.async.add.s64`. This exposure is emulated in `cuda::ptx` using + +```cuda +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); +``` + **red_async**: ```cuda // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 @@ -536,6 +549,15 @@ __device__ static inline void red_async( uint64_t* dest, const uint64_t& value, uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); ``` ### [9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier) diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index df312de91be..975a1e425cc 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -1038,6 +1038,44 @@ _LIBCUDACXX_DEVICE static inline void red_async( } #endif // __cccl_ptx_isa >= 810 +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_add_t, + _CUDA_VSTD::int64_t* __dest, + const _CUDA_VSTD::int64_t& __value, + _CUDA_VSTD::int64_t* __remote_bar) +{ + // __op == op_add (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 intentional" + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "l"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + // 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated) // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated