diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index 06c546dd584..ec7220c8382 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -673,18 +673,18 @@ __device__ static inline void red_async( ### [9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier) -| Instruction | Available in libcu++ | -|------------------------------------------|----------------------| -| [`mbarrier.init`] | No | -| [`mbarrier.inval`] | No | -| [`mbarrier.expect_tx`] | No | -| [`mbarrier.complete_tx`] | No | +| Instruction | Available in libcu++ | +|------------------------------------------|-------------------------| +| [`mbarrier.init`] | No | +| [`mbarrier.inval`] | No | +| [`mbarrier.expect_tx`] | No | +| [`mbarrier.complete_tx`] | No | | [`mbarrier.arrive`] | CTK-FUTURE, CCCL v2.3.0 | -| [`mbarrier.arrive_drop`] | No | -| [`cp.async.mbarrier.arrive`] | No | -| [`mbarrier.test_wait/mbarrier.try_wait`] | No | -| [`mbarrier.pending_count`] | No | -| [`tensormap.cp_fenceproxy`] | No | +| [`mbarrier.arrive_drop`] | No | +| [`cp.async.mbarrier.arrive`] | No | +| [`mbarrier.test_wait/mbarrier.try_wait`] | CTK-FUTURE, CCCL v2.3.0 | +| [`mbarrier.pending_count`] | No | +| [`tensormap.cp_fenceproxy`] | No | [`mbarrier.init`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init [`mbarrier.inval`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval @@ -693,14 +693,15 @@ __device__ static inline void red_async( [`mbarrier.arrive`]: #mbarrierarrive [`mbarrier.arrive_drop`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive-drop [`cp.async.mbarrier.arrive`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive -[`mbarrier.test_wait/mbarrier.try_wait`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait +[`mbarrier.test_wait/mbarrier.try_wait`]: #mbarriertest_waitmbarriertry_wait [`mbarrier.pending_count`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count [`tensormap.cp_fenceproxy`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy + #### `mbarrier.arrive` -- PTX ISA: [mbarrier.arrive](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) +- PTX ISA: [`mbarrier.arrive`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-arrive) ```cuda // mbarrier.arrive.shared.b64 state, [addr]; // 1. 
PTX ISA 70, SM_80 @@ -834,6 +835,123 @@ __global__ void kernel() { ) } ``` + +#### `mbarrier.test_wait/mbarrier.try_wait` + +- PTX ISA: [`mbarrier.test_wait/mbarrier.try_wait`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait) + +**mbarrier_test_wait**: +```cuda +// mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 +template +__device__ static inline bool mbarrier_test_wait( + uint64_t* addr, + const uint64_t& state); + +// mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_test_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); +``` + +**mbarrier_test_wait_parity**: +```cuda +// mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 +template +__device__ static inline bool mbarrier_test_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); + +// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); +``` + +**mbarrier_try_wait**: +```cuda +// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 +template +__device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state); + +// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. PTX ISA 78, SM_90 +template +__device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); + +// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); + +// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +``` + +**mbarrier_try_wait_parity**: +```cuda +// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 +template +__device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); + +// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. PTX ISA 78, SM_90 +template +__device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); + +// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. 
PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); + +// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +``` + ### [9.7.13. Warp Level Matrix Multiply-Accumulate Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions) | Instruction | Available in libcu++ | diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h index 1f5c0d2b5a3..6b42a151288 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h @@ -508,6 +508,561 @@ _LIBCUDACXX_DEVICE static inline void mbarrier_arrive_expect_tx( // 9.7.12.15.16. Parallel Synchronization and Communication Instructions: mbarrier.test_wait/mbarrier.try_wait // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-test-wait-mbarrier-try-wait +/* +// mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. PTX ISA 70, SM_80 +template +__device__ static inline bool mbarrier_test_wait( + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 700 +extern "C" _LIBCUDACXX_DEVICE bool __bool__cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); +template +_LIBCUDACXX_DEVICE static inline bool mbarrier_test_wait( + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + _CUDA_VSTD::uint32_t __waitComplete; + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.shared.b64 P_OUT, [%1], %2; // 1. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "l"(__state) + : "memory" + ); + return static_cast(__waitComplete); // Deliberate downcast. + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __bool__cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_80__(); + )); +} +#endif // __cccl_ptx_isa >= 700 + +/* +// mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. 
PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_test_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE bool __bool__cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline bool mbarrier_test_wait( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint32_t __waitComplete; + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "l"(__state) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 2. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "l"(__state) + : "memory" + ); + } + return static_cast(__waitComplete); // Deliberate downcast. + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __bool__cuda_ptx_mbarrier_test_wait_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. PTX ISA 71, SM_80 +template +__device__ static inline bool mbarrier_test_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 710 +extern "C" _LIBCUDACXX_DEVICE bool __bool__cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); +template +_LIBCUDACXX_DEVICE static inline bool mbarrier_test_wait_parity( + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_80,( + _CUDA_VSTD::uint32_t __waitComplete; + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.shared.b64 P_OUT, [%1], %2; // 3. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity) + : "memory" + ); + return static_cast(__waitComplete); // Deliberate downcast. + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __bool__cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_80__(); + )); +} +#endif // __cccl_ptx_isa >= 710 + +/* +// mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. 
PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_test_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE bool __bool__cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline bool mbarrier_test_wait_parity( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint32_t __waitComplete; + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.test_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 4. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity) + : "memory" + ); + } + return static_cast(__waitComplete); // Deliberate downcast. + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __bool__cuda_ptx_mbarrier_test_wait_parity_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. PTX ISA 78, SM_90 +template +__device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _LIBCUDACXX_DEVICE bool __bool__cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline bool mbarrier_try_wait( + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint32_t __waitComplete; + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2; // 5a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "l"(__state) + : "memory" + ); + return static_cast(__waitComplete); // Deliberate downcast. + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __bool__cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. 
PTX ISA 78, SM_90 +template +__device__ static inline bool mbarrier_try_wait( + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _LIBCUDACXX_DEVICE bool __bool__cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline bool mbarrier_try_wait( + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint32_t __waitComplete; + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.shared::cta.b64 P_OUT, [%1], %2, %3; // 5b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "l"(__state), + "r"(__suspendTimeHint) + : "memory" + ); + return static_cast(__waitComplete); // Deliberate downcast. + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __bool__cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE bool __bool__cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline bool mbarrier_try_wait( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint32_t __waitComplete; + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "l"(__state) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 6a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "l"(__state) + : "memory" + ); + } + return static_cast(__waitComplete); // Deliberate downcast. + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __bool__cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. 
PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint64_t& state, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE bool __bool__cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline bool mbarrier_try_wait( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint64_t& __state, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint32_t __waitComplete; + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cta.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "l"(__state), + "r"(__suspendTimeHint) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2 , %3; // 6b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "l"(__state), + "r"(__suspendTimeHint) + : "memory" + ); + } + return static_cast(__waitComplete); // Deliberate downcast. + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __bool__cuda_ptx_mbarrier_try_wait_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. PTX ISA 78, SM_90 +template +__device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _LIBCUDACXX_DEVICE bool __bool__cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline bool mbarrier_try_wait_parity( + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint32_t __waitComplete; + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2; // 7a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity) + : "memory" + ); + return static_cast(__waitComplete); // Deliberate downcast. + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __bool__cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. 
PTX ISA 78, SM_90 +template +__device__ static inline bool mbarrier_try_wait_parity( + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _LIBCUDACXX_DEVICE bool __bool__cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline bool mbarrier_try_wait_parity( + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint32_t __waitComplete; + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.shared::cta.b64 P_OUT, [%1], %2, %3; // 7b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity), + "r"(__suspendTimeHint) + : "memory" + ); + return static_cast(__waitComplete); // Deliberate downcast. + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __bool__cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 780 + +/* +// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE bool __bool__cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline bool mbarrier_try_wait_parity( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint32_t __waitComplete; + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2; // 8a. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity) + : "memory" + ); + } + return static_cast(__waitComplete); // Deliberate downcast. + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __bool__cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + +/* +// mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. 
PTX ISA 80, SM_90 +// .sem = { .acquire } +// .scope = { .cta, .cluster } +template +__device__ static inline bool mbarrier_try_wait_parity( + cuda::ptx::sem_acquire_t, + cuda::ptx::scope_t scope, + uint64_t* addr, + const uint32_t& phaseParity, + const uint32_t& suspendTimeHint); +*/ +#if __cccl_ptx_isa >= 800 +extern "C" _LIBCUDACXX_DEVICE bool __bool__cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline bool mbarrier_try_wait_parity( + sem_acquire_t, + scope_t<_Scope> __scope, + _CUDA_VSTD::uint64_t* __addr, + const _CUDA_VSTD::uint32_t& __phaseParity, + const _CUDA_VSTD::uint32_t& __suspendTimeHint) +{ + // __sem == sem_acquire (due to parameter type constraint) + static_assert(__scope == scope_cta || __scope == scope_cluster, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint32_t __waitComplete; + if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) { + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cta.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity), + "r"(__suspendTimeHint) + : "memory" + ); + } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) { + asm ( + "{\n\t .reg .pred P_OUT; \n\t" + "mbarrier.try_wait.parity.acquire.cluster.shared::cta.b64 P_OUT, [%1], %2, %3; // 8b. \n\t" + "selp.b32 %0, 1, 0, P_OUT; \n" + "}" + : "=r"(__waitComplete) + : "r"(__as_ptr_smem(__addr)), + "r"(__phaseParity), + "r"(__suspendTimeHint) + : "memory" + ); + } + return static_cast(__waitComplete); // Deliberate downcast. + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __bool__cuda_ptx_mbarrier_try_wait_parity_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 800 + + // 9.7.12.15.17. Parallel Synchronization and Communication Instructions: mbarrier.pending_count // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp new file mode 100644 index 00000000000..524bace98c4 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mbarrier.wait.compile.pass.cpp @@ -0,0 +1,200 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +/* + * We use a special strategy to force the generation of the PTX. This is mainly + * a fight against dead-code-elimination in the NVVM layer. + * + * The reason we need this strategy is because certain older versions of ptxas + * segfault when a non-sensical sequence of PTX is generated. So instead, we try + * to force the instantiation and compilation to PTX of all the overloads of the + * PTX wrapping functions. 
+ * + * We do this by writing a function pointer of each overload to the `__device__` + * variable `fn_ptr`. Now, because weak stores from a single thread may be + * elided, we also wrap the store in an if branch that cannot be removed. + * + * To prevent dead-code-elimination of the if branch, we use + * `non_eliminated_false`, which uses inline assembly to hide the fact that is + * always false from NVVM. + * + * So this is how we ensure that none of the function pointer stores are elided. + * Because `fn_ptr` is possibly visible outside this translation unit, the + * compiler must compile all the functions which are stored. + * + */ + +__device__ void * fn_ptr = nullptr; + +__device__ bool non_eliminated_false(void){ + int ret = 0; + asm ("": "=r"(ret)::); + return ret != 0; +} + +__global__ void test_compilation() { +#if __cccl_ptx_isa >= 700 + NV_IF_TARGET(NV_PROVIDES_SM_80, ( + if (non_eliminated_false()) { + // mbarrier.test_wait.shared.b64 waitComplete, [addr], state; // 1. + auto overload = static_cast(cuda::ptx::mbarrier_test_wait); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 700 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. + auto overload = static_cast(cuda::ptx::mbarrier_test_wait); + fn_ptr = reinterpret_cast(overload); + } + if (non_eliminated_false()) { + // mbarrier.test_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 2. + auto overload = static_cast(cuda::ptx::mbarrier_test_wait); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 710 + NV_IF_TARGET(NV_PROVIDES_SM_80, ( + if (non_eliminated_false()) { + // mbarrier.test_wait.parity.shared.b64 waitComplete, [addr], phaseParity; // 3. + auto overload = static_cast(cuda::ptx::mbarrier_test_wait_parity); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 710 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. + auto overload = static_cast(cuda::ptx::mbarrier_test_wait_parity); + fn_ptr = reinterpret_cast(overload); + } + if (non_eliminated_false()) { + // mbarrier.test_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 4. + auto overload = static_cast(cuda::ptx::mbarrier_test_wait_parity); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state; // 5a. + auto overload = static_cast(cuda::ptx::mbarrier_try_wait); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // mbarrier.try_wait.shared::cta.b64 waitComplete, [addr], state, suspendTimeHint; // 5b. + auto overload = static_cast(cuda::ptx::mbarrier_try_wait); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. 
+ auto overload = static_cast(cuda::ptx::mbarrier_try_wait); + fn_ptr = reinterpret_cast(overload); + } + if (non_eliminated_false()) { + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state; // 6a. + auto overload = static_cast(cuda::ptx::mbarrier_try_wait); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. + auto overload = static_cast(cuda::ptx::mbarrier_try_wait); + fn_ptr = reinterpret_cast(overload); + } + if (non_eliminated_false()) { + // mbarrier.try_wait{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], state , suspendTimeHint; // 6b. + auto overload = static_cast(cuda::ptx::mbarrier_try_wait); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 800 +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity; // 7a. + auto overload = static_cast(cuda::ptx::mbarrier_try_wait_parity); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // mbarrier.try_wait.parity.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 7b. + auto overload = static_cast(cuda::ptx::mbarrier_try_wait_parity); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 780 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. + auto overload = static_cast(cuda::ptx::mbarrier_try_wait_parity); + fn_ptr = reinterpret_cast(overload); + } + if (non_eliminated_false()) { + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity; // 8a. + auto overload = static_cast(cuda::ptx::mbarrier_try_wait_parity); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 800 + +#if __cccl_ptx_isa >= 800 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. + auto overload = static_cast(cuda::ptx::mbarrier_try_wait_parity); + fn_ptr = reinterpret_cast(overload); + } + if (non_eliminated_false()) { + // mbarrier.try_wait.parity{.sem}{.scope}.shared::cta.b64 waitComplete, [addr], phaseParity, suspendTimeHint; // 8b. + auto overload = static_cast(cuda::ptx::mbarrier_try_wait_parity); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 800 +} + +int main(int, char**) +{ + return 0; +}
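For reference, a minimal usage sketch of the wrappers this patch adds (illustration only, not part of the diff): the helper name `arrive_and_wait` is made up, and the mbarrier at `bar` is assumed to have been initialized elsewhere with the expected arrival count (e.g. via `mbarrier.init` inline PTX or a `cuda::barrier`), since `cuda::ptx::mbarrier_init` is not exposed here.

```cuda
// Illustration only: poll a shared-memory mbarrier with the new wrappers.
// Assumes SM_80+ and __cccl_ptx_isa >= 700, and that the barrier at `bar`
// was already initialized with the expected arrival count.
#include <cuda/ptx>
#include <cuda/std/cstdint>

__device__ void arrive_and_wait(cuda::std::uint64_t* bar) // `bar` points into shared memory
{
  // mbarrier.arrive returns the state token of the phase being arrived on.
  cuda::std::uint64_t state = cuda::ptx::mbarrier_arrive(bar);

  // mbarrier.test_wait is non-blocking: it only reports whether the phase
  // that `state` belongs to has completed, so poll it in a loop.
  while (!cuda::ptx::mbarrier_test_wait(bar, state))
  {
    // A real kernel could do independent work or back off here.
  }
}
```

On SM_90 with PTX ISA 7.8/8.0, `cuda::ptx::mbarrier_try_wait`, `mbarrier_test_wait_parity`, and `mbarrier_try_wait_parity` follow the same call pattern; the try_wait overloads additionally accept an optional `suspendTimeHint`, and the `.sem`/`.scope` overloads take `cuda::ptx::sem_acquire` plus a `cta` or `cluster` scope argument, as listed in the documentation above.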