diff --git a/libcudacxx/docs/ptx.md b/libcudacxx/docs/ptx.md index 006ef9c66d1..c1e373cfe50 100644 --- a/libcudacxx/docs/ptx.md +++ b/libcudacxx/docs/ptx.md @@ -281,8 +281,8 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release | [`cvta`] | No | | [`cvt`] | No | | [`cvt.pack`] | No | -| [`mapa`] | No | | [`getctarank`] | No | +| [`mapa`] | CTK-FUTURE, CCCL v2.4.0 | [`mov`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2 [`shfl (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated @@ -302,8 +302,8 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release [`cvta`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta [`cvt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt [`cvt.pack`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack -[`mapa`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa [`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank +[`mapa`]: #mapa #### `st.async` @@ -417,6 +417,23 @@ int main() { cudaDeviceSynchronize(); } ``` + +#### `mapa` + +- PTX ISA: [`mapa`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa) + +**mapa**: +```cuda +// mapa{.space}.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 +// .space = { .shared::cluster } +template +__device__ static inline Tp* mapa( + cuda::ptx::space_cluster_t, + const Tp* addr, + uint32_t target_cta); +``` + + ### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy) | Instruction | Available in libcu++ | diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 31382449f08..1c794a1ada2 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -549,6 +549,42 @@ _LIBCUDACXX_DEVICE static inline void st_async( // 9.7.8.22. Data Movement and Conversion Instructions: mapa // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa +/* +// mapa{.space}.u32 dest, addr, target_cta; // PTX ISA 78, SM_90 +// .space = { .shared::cluster } +template +__device__ static inline Tp* mapa( + cuda::ptx::space_cluster_t, + const Tp* addr, + uint32_t target_cta); +*/ +#if __cccl_ptx_isa >= 780 +extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_mapa_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline _Tp* mapa( + space_cluster_t, + const _Tp* __addr, + _CUDA_VSTD::uint32_t __target_cta) +{ + // __space == space_cluster (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + _CUDA_VSTD::uint32_t __dest; + asm ( + "mapa.shared::cluster.u32 %0, %1, %2;" + : "=r"(__dest) + : "r"(__as_ptr_smem(__addr)), + "r"(__target_cta) + : + ); + return __from_ptr_dsmem<_Tp>(__dest); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_mapa_is_not_supported_before_SM_90__(); + return __from_ptr_dsmem<_Tp>(0); + )); +} +#endif // __cccl_ptx_isa >= 780 + // 9.7.8.23. Data Movement and Conversion Instructions: getctarank // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mapa.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mapa.compile.pass.cpp new file mode 100644 index 00000000000..3f1347f7f05 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.mapa.compile.pass.cpp @@ -0,0 +1,46 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// +// UNSUPPORTED: libcpp-has-no-threads + +// + +#include +#include + +/* + * We use a special strategy to force the generation of the PTX. This is mainly + * a fight against dead-code-elimination in the NVVM layer. + * + * The reason we need this strategy is because certain older versions of ptxas + * segfault when a non-sensical sequence of PTX is generated. So instead, we try + * to force the instantiation and compilation to PTX of all the overloads of the + * PTX wrapping functions. + * + * We do this by writing a function pointer of each overload to the kernel + * parameter `fn_ptr`. + * + * Because `fn_ptr` is possibly visible outside this translation unit, the + * compiler must compile all the functions which are stored. + * + */ + +__global__ void test_mapa(void ** fn_ptr) { +#if __cccl_ptx_isa >= 780 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + // mapa.shared::cluster.u32 dest, addr, target_cta; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::mapa)); + )); +#endif // __cccl_ptx_isa >= 780 +} + +int main(int, char**) +{ + return 0; +}