Add cuda::ptx::getctarank

NVIDIA · Feb 7, 2024 · fbce1cd · fbce1cd
1 parent 5f02fe8
commit fbce1cd
Show file tree

Hide file tree

Showing 3 changed files with 95 additions and 2 deletions.
diff --git a/libcudacxx/docs/ptx.md b/libcudacxx/docs/ptx.md
@@ -281,8 +281,8 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
 | [`cvta`]                                          | No                      |
 | [`cvt`]                                           | No                      |
 | [`cvt.pack`]                                      | No                      |
-| [`getctarank`]                                    | No                      |
 | [`mapa`]                                          | CTK-FUTURE, CCCL v2.4.0 |
+| [`getctarank`]                                    | CTK-FUTURE, CCCL v2.4.0 |
 
 [`mov`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2
 [`shfl (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated
@@ -302,8 +302,8 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
 [`cvta`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta
 [`cvt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt
 [`cvt.pack`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack
-[`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank
 [`mapa`]: #mapa
+[`getctarank`]: #getctarank
 
 #### `st.async`
 
@@ -434,6 +434,21 @@ __device__ static inline Tp* mapa(
 ```
 
 
+#### `getctarank`
+
+- PTX ISA: [`getctarank`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank)
+
+**getctarank**:
+```cuda
+// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90
+// .space     = { .shared::cluster }
+template <typename=void>
+__device__ static inline uint32_t getctarank(
+  cuda::ptx::space_cluster_t,
+  const void* addr);
+```
+
+
 ### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy)
 
 | Instruction                              | Available in libcu++ |

diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
@@ -587,6 +587,38 @@ _LIBCUDACXX_DEVICE static inline _Tp* mapa(
 
 // 9.7.8.23. Data Movement and Conversion Instructions: getctarank
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank
+/*
+// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90
+// .space     = { .shared::cluster }
+template <typename=void>
+__device__ static inline uint32_t getctarank(
+  cuda::ptx::space_cluster_t,
+  const void* addr);
+*/
+#if __cccl_ptx_isa >= 780
+extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90__();
+template <typename=void>
+_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(
+  space_cluster_t,
+  const void* __addr)
+{
+  // __space == space_cluster (due to parameter type constraint)
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    _CUDA_VSTD::uint32_t __dest;
+    asm (
+      "getctarank.shared::cluster.u32 %0, %1;"
+      : "=r"(__dest)
+      : "r"(__as_ptr_smem(__addr))
+      :
+    );
+    return __dest;
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_getctarank_is_not_supported_before_SM_90__();
+    return 0;
+  ));
+}
+#endif // __cccl_ptx_isa >= 780
 
 
 /*

diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.getctarank.compile.pass.cpp
@@ -0,0 +1,46 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is because certain older versions of ptxas
+ * segfault when a non-sensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the kernel
+ * parameter `fn_ptr`.
+ *
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ *
+ */
+
+__global__ void test_getctarank(void ** fn_ptr) {
+#if __cccl_ptx_isa >= 780
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // getctarank.shared::cluster.u32 dest, addr;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<uint32_t (*)(cuda::ptx::space_cluster_t, const void* )>(cuda::ptx::getctarank));
+  ));
+#endif // __cccl_ptx_isa >= 780
+}
+
+int main(int, char**)
+{
+    return 0;
+}