diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.red.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.red.async.compile.pass.cpp
new file mode 100644
index 00000000000..cfc76a58586
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.red.async.compile.pass.cpp
@@ -0,0 +1,183 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads

+// <cuda/ptx>

+#include <cuda/ptx>
+#include <cuda/std/utility>

+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is that certain older versions of ptxas
+ * segfault when a nonsensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the `__device__`
+ * variable `fn_ptr`. Now, because weak stores from a single thread may be
+ * elided, we also wrap the store in an if branch that cannot be removed.
+ *
+ * To prevent dead-code-elimination of the if branch, we use
+ * `non_eliminated_false`, which uses inline assembly to hide the fact that it
+ * is always false from NVVM.
+ *
+ * This is how we ensure that none of the function pointer stores are elided.
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ * + */ + +__device__ void * fn_ptr = nullptr; + +__device__ bool non_eliminated_false(void){ + int ret = 0; + asm ("": "=r"(ret)::); + return ret != 0; +} + +__global__ void test_compilation() { +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [dest], value, [remote_bar]; + auto overload = 
static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; // .u64 intentional + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 +} + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index b76f7ce4398..06c546dd584 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -490,22 +490,22 @@ int main() { ### [9.7.12. Parallel Synchronization and Communication Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions) -| Instruction | Available in libcu++ | -|------------------------------------------|----------------------| -| [`bar, barrier`] | No | -| [`bar.warp.sync`] | No | -| [`barrier.cluster`] | No | -| [`membar/fence`] | No | -| [`atom`] | No | -| [`red`] | No | -| [`red.async`] | No | -| [`vote (deprecated)`] | No | -| [`vote.sync`] | No | -| [`match.sync`] | No | -| [`activemask`] | No | -| [`redux.sync`] | No | -| [`griddepcontrol`] | No | -| [`elect.sync`] | No | +| Instruction | Available in libcu++ | +|-----------------------|-------------------------| +| [`bar, barrier`] | No | +| [`bar.warp.sync`] | No | +| [`barrier.cluster`] | No | +| [`membar/fence`] | No | +| [`atom`] | No | +| [`red`] | No | +| [`red.async`] | CTK-FUTURE, CCCL v2.3.0 | +| [`vote (deprecated)`] | No | +| [`vote.sync`] | No | +| [`match.sync`] | No | +| [`activemask`] | No | +| [`redux.sync`] | No | +| [`griddepcontrol`] | No | +| [`elect.sync`] | No | [`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier [`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync @@ -513,7 +513,7 @@ int main() { [`membar/fence`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence [`atom`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom [`red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red -[`red.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async +[`red.async`]: #redasync [`vote 
(deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated [`vote.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync [`match.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync @@ -522,6 +522,155 @@ int main() { [`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol [`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync +#### `red.async` + +- PTX ISA: [`red.async`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async) + +PTX does not currently (CTK 12.3) expose `red.async.add.s64`. This exposure is emulated in `cuda::ptx` using + +```cuda +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); +``` + +**red_async**: +```cuda +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void red_async( + cuda::ptx::op_inc_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void red_async( + cuda::ptx::op_dec_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + uint32_t* dest, + const int32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .max } +template 
+__device__ static inline void red_async( + cuda::ptx::op_max_t, + uint32_t* dest, + const int32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint32_t* dest, + const int32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .and } +template +__device__ static inline void red_async( + cuda::ptx::op_and_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .or } +template +__device__ static inline void red_async( + cuda::ptx::op_or_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .xor } +template +__device__ static inline void red_async( + cuda::ptx::op_xor_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint64_t* dest, + const uint64_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); +``` + ### [9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier) | Instruction | Available in libcu++ | diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 48dd076986c..b29d617b7ef 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -31,7 +31,6 @@ #include "../__cuda/ptx/ptx_helper_functions.h" #include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" #include "../cstdint" // uint32_t - /* * The cuda::ptx namespace intends to provide PTX wrappers for new hardware * features and new PTX instructions so that they can be experimented with @@ -691,6 +690,528 @@ _LIBCUDACXX_DEVICE static inline void st_async( // 9.7.12.7. 
Parallel Synchronization and Communication Instructions: red.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void red_async( + cuda::ptx::op_inc_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_inc_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::uint32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void red_async( + cuda::ptx::op_dec_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_dec_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::uint32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_min_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::uint32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 
[%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_max_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::uint32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_add_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::uint32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + uint32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_min_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::int32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter 
type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + uint32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_max_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::int32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_add_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::int32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .and } +template +__device__ static inline void red_async( + cuda::ptx::op_and_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_and_op_t, + _B32* __dest, + const _B32& __value, + _CUDA_VSTD::uint64_t* 
__remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__as_b32(__value)), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .or } +template +__device__ static inline void red_async( + cuda::ptx::op_or_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_or_op_t, + _B32* __dest, + const _B32& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__as_b32(__value)), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .xor } +template +__device__ static inline void red_async( + cuda::ptx::op_xor_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_xor_op_t, + _B32* __dest, + const _B32& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__as_b32(__value)), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint64_t* dest, + const uint64_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void 
__void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_add_t, + _CUDA_VSTD::uint64_t* __dest, + const _CUDA_VSTD::uint64_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "l"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_add_t, + _CUDA_VSTD::int64_t* __dest, + const _CUDA_VSTD::int64_t& __value, + _CUDA_VSTD::int64_t* __remote_bar) +{ + // __op == op_add (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 intentional" + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "l"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + + // 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated) // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated
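
For illustration, a minimal usage sketch of the `cuda::ptx::red_async` wrapper added above, assuming an SM_90 target and PTX ISA 8.1. The kernel name and the bare `uint64_t` used as mbarrier storage are placeholders, not part of the change; real code would initialize the mbarrier, wait on its transaction count before reading the result, and guard the call on `__cccl_ptx_isa >= 810` / SM_90 as the compile test does.

```cuda
#include <cuda/ptx>
#include <cuda/std/cstdint>

// Sketch only: each thread asynchronously adds 1 to a shared-memory counter;
// completion is signalled through the mbarrier `bar` via its transaction count.
__global__ void red_async_add_u32_example()
{
  __shared__ cuda::std::uint32_t counter;
  __shared__ cuda::std::uint64_t bar; // mbarrier storage, assumed initialized elsewhere

  if (threadIdx.x == 0) {
    counter = 0;
  }
  __syncthreads();

  // Emits: red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32
  cuda::ptx::red_async(cuda::ptx::op_add_t{}, &counter, 1u, &bar);

  // Reading `counter` here would require waiting on `bar` first.
}
```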