Add cuda::ptx::mapa
ahendriksen committed Feb 7, 2024
1 parent 12d959d commit 5f02fe8
Showing 3 changed files with 101 additions and 2 deletions.
21 changes: 19 additions & 2 deletions libcudacxx/docs/ptx.md
@@ -281,8 +281,8 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
| [`cvta`] | No |
| [`cvt`] | No |
| [`cvt.pack`] | No |
| [`mapa`] | No |
| [`getctarank`] | No |
| [`mapa`] | CTK-FUTURE, CCCL v2.4.0 |

[`mov`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2
[`shfl (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated
@@ -302,8 +302,8 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
[`cvta`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta
[`cvt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt
[`cvt.pack`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack
[`mapa`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa
[`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank
[`mapa`]: #mapa

#### `st.async`

@@ -417,6 +417,23 @@ int main() {
cudaDeviceSynchronize();
}
```

#### `mapa`

- PTX ISA: [`mapa`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa)

**mapa**:
```cuda
// mapa{.space}.u32 dest, addr, target_cta; // PTX ISA 78, SM_90
// .space = { .shared::cluster }
template <typename Tp>
__device__ static inline Tp* mapa(
  cuda::ptx::space_cluster_t,
  const Tp* addr,
  uint32_t target_cta);
```
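
A minimal usage sketch, not part of this commit: it assumes the `cuda::ptx::space_cluster` tag value that accompanies `space_cluster_t`, a kernel launched as a cluster (fixed here with `__cluster_dims__`), and cooperative groups' `cluster_group` for ranks and synchronization; the kernel name and output layout are illustrative.

```cuda
#include <cuda/ptx>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Each block publishes its rank in shared memory, then reads the value of the
// next block in the cluster through the pointer returned by mapa.
__global__ void __cluster_dims__(2, 1, 1) read_neighbor(int* out) {
  __shared__ int smem_value;
  cg::cluster_group cluster = cg::this_cluster();
  const unsigned int rank = cluster.block_rank();

  smem_value = static_cast<int>(rank);
  cluster.sync();  // make every block's smem_value visible cluster-wide

  // Translate our shared-memory address into the corresponding address in the
  // target block's shared memory.
  const unsigned int peer = (rank + 1) % cluster.num_blocks();
  int* peer_ptr = cuda::ptx::mapa(cuda::ptx::space_cluster, &smem_value, peer);

  out[blockIdx.x] = *peer_ptr;  // expected: the peer block's rank
  cluster.sync();  // keep smem_value alive until all peer reads are done
}
```

`mapa` only translates the address; the subsequent ordinary load through `peer_ptr` is what actually touches the peer block's shared memory.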


### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy)

| Instruction | Available in libcu++ |
36 changes: 36 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
@@ -549,6 +549,42 @@ _LIBCUDACXX_DEVICE static inline void st_async(
// 9.7.8.22. Data Movement and Conversion Instructions: mapa
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa

/*
// mapa{.space}.u32 dest, addr, target_cta; // PTX ISA 78, SM_90
// .space = { .shared::cluster }
template <typename Tp>
__device__ static inline Tp* mapa(
  cuda::ptx::space_cluster_t,
  const Tp* addr,
  uint32_t target_cta);
*/
#if __cccl_ptx_isa >= 780
extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_mapa_is_not_supported_before_SM_90__();
template <typename _Tp>
_LIBCUDACXX_DEVICE static inline _Tp* mapa(
  space_cluster_t,
  const _Tp* __addr,
  _CUDA_VSTD::uint32_t __target_cta)
{
  // __space == space_cluster (due to parameter type constraint)
  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
    _CUDA_VSTD::uint32_t __dest;
    asm (
      "mapa.shared::cluster.u32 %0, %1, %2;"
      : "=r"(__dest)
      : "r"(__as_ptr_smem(__addr)),
        "r"(__target_cta)
      :
    );
    return __from_ptr_dsmem<_Tp>(__dest);
  ),(
    // Unsupported architectures will have a linker error with a semi-decent error message
    __cuda_ptx_mapa_is_not_supported_before_SM_90__();
    return __from_ptr_dsmem<_Tp>(0);
  ));
}
#endif // __cccl_ptx_isa >= 780

// 9.7.8.23. Data Movement and Conversion Instructions: getctarank
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank

46 changes: 46 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/ptx.mapa.compile.pass.cpp
@@ -0,0 +1,46 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

/*
 * We use a special strategy to force the generation of the PTX. This is mainly
 * a fight against dead-code-elimination in the NVVM layer.
 *
 * The reason we need this strategy is that certain older versions of ptxas
 * segfault when a nonsensical sequence of PTX is generated. So instead, we try
 * to force the instantiation and compilation to PTX of all the overloads of the
 * PTX wrapping functions.
 *
 * We do this by writing a function pointer of each overload to the kernel
 * parameter `fn_ptr`.
 *
 * Because `fn_ptr` is possibly visible outside this translation unit, the
 * compiler must compile all the functions that are stored.
 *
 */

__global__ void test_mapa(void ** fn_ptr) {
#if __cccl_ptx_isa >= 780
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // mapa.shared::cluster.u32 dest, addr, target_cta;
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<uint64_t* (*)(cuda::ptx::space_cluster_t, const uint64_t* , uint32_t )>(cuda::ptx::mapa));
  ));
#endif // __cccl_ptx_isa >= 780
}

int main(int, char**)
{
return 0;
}
