diff --git a/libcudacxx/docs/ptx.md b/libcudacxx/docs/ptx.md index c1e373cfe50..5df26d146df 100644 --- a/libcudacxx/docs/ptx.md +++ b/libcudacxx/docs/ptx.md @@ -281,8 +281,8 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release | [`cvta`] | No | | [`cvt`] | No | | [`cvt.pack`] | No | -| [`getctarank`] | No | | [`mapa`] | CTK-FUTURE, CCCL v2.4.0 | +| [`getctarank`] | CTK-FUTURE, CCCL v2.4.0 | [`mov`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 [`shfl (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated @@ -302,8 +302,8 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release [`cvta`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta [`cvt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt [`cvt.pack`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack -[`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank [`mapa`]: #mapa +[`getctarank`]: #getctarank #### `st.async` @@ -434,6 +434,21 @@ __device__ static inline Tp* mapa( ``` +#### `getctarank` + +- PTX ISA: [`getctarank`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank) + +**getctarank**: +```cuda +// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 +// .space = { .shared::cluster } +template +__device__ static inline uint32_t getctarank( + cuda::ptx::space_cluster_t, + const void* addr); +``` + + ### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy) | Instruction | Available in libcu++ | diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 1c794a1ada2..53cd738a7ff 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -587,6 +587,38 @@ _LIBCUDACXX_DEVICE static inline _Tp* mapa( // 9.7.8.23. Data Movement and Conversion Instructions: getctarank // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank +/* +// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90 +// .space = { .shared::cluster } +template +__device__ static inline uint32_t getctarank( + cuda::ptx::space_cluster_t, + const void* addr); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint32_t getctarank( + space_cluster_t, + const void* __addr) +{ + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint32_t __dest; + asm ( + "getctarank.shared::cluster.u32 %0, %1;" + : "=r"(__dest) + : "r"(__as_ptr_smem(__addr)) + : + ); + return __dest; + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_getctarank_is_not_supported_before_SM_90__(); + return 0; + )); +} +#endif // __cccl_ptx_isa >= 780 /* diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp new file mode 100644 index 00000000000..a98c7639603 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +/* + * We use a special strategy to force the generation of the PTX. This is mainly + * a fight against dead-code-elimination in the NVVM layer. + * + * The reason we need this strategy is because certain older versions of ptxas + * segfault when a non-sensical sequence of PTX is generated. So instead, we try + * to force the instantiation and compilation to PTX of all the overloads of the + * PTX wrapping functions. + * + * We do this by writing a function pointer of each overload to the kernel + * parameter `fn_ptr`. + * + * Because `fn_ptr` is possibly visible outside this translation unit, the + * compiler must compile all the functions which are stored. + * + */ + +__global__ void test_getctarank(void ** fn_ptr) { +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + // getctarank.shared::cluster.u32 dest, addr; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::getctarank)); + )); +#endif // __cccl_ptx_isa >= 780 +} + +int main(int, char**) +{ + return 0; +}