Add cuda::ptx::mapa
ahendriksen committed Feb 7, 2024
1 parent 12d959d commit 5f02fe8
Showing 3 changed files with 101 additions and 2 deletions.
21 changes: 19 additions & 2 deletions libcudacxx/docs/ptx.md
@@ -281,8 +281,8 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
| [`cvta`] | No |
| [`cvt`] | No |
| [`cvt.pack`] | No |
| [`mapa`] | No |
| [`getctarank`] | No |
| [`mapa`] | CTK-FUTURE, CCCL v2.4.0 |

[`mov`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2
[`shfl (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated
@@ -302,8 +302,8 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
[`cvta`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta
[`cvt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt
[`cvt.pack`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack
[`mapa`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa
[`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank
[`mapa`]: #mapa

#### `st.async`

@@ -417,6 +417,23 @@ int main() {
cudaDeviceSynchronize();
}
```

#### `mapa`

- PTX ISA: [`mapa`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa)

**mapa**:
```cuda
// mapa{.space}.u32 dest, addr, target_cta; // PTX ISA 78, SM_90
// .space = { .shared::cluster }
template <typename Tp>
__device__ static inline Tp* mapa(
  cuda::ptx::space_cluster_t,
  const Tp* addr,
  uint32_t target_cta);
```
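
A minimal usage sketch, not part of this commit: it assumes the `cuda::ptx::space_cluster` tag value that accompanies `space_cluster_t`, a kernel launched as a cluster (fixed here with `__cluster_dims__`), and cooperative groups' `cluster_group` for ranks and synchronization; the kernel name and output layout are illustrative.

```cuda
#include <cuda/ptx>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// Each block publishes its rank in shared memory, then reads the value of the
// next block in the cluster through the pointer returned by mapa.
__global__ void __cluster_dims__(2, 1, 1) read_neighbor(int* out) {
  __shared__ int smem_value;
  cg::cluster_group cluster = cg::this_cluster();
  const unsigned int rank = cluster.block_rank();

  smem_value = static_cast<int>(rank);
  cluster.sync();  // make every block's smem_value visible cluster-wide

  // Translate our shared-memory address into the corresponding address in the
  // target block's shared memory.
  const unsigned int peer = (rank + 1) % cluster.num_blocks();
  int* peer_ptr = cuda::ptx::mapa(cuda::ptx::space_cluster, &smem_value, peer);

  out[blockIdx.x] = *peer_ptr;  // expected: the peer block's rank
  cluster.sync();  // keep smem_value alive until all peer reads are done
}
```

`mapa` only translates the address; the subsequent ordinary load through `peer_ptr` is what actually touches the peer block's shared memory.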


### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy)

| Instruction | Available in libcu++ |
36 changes: 36 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
@@ -549,6 +549,42 @@ _LIBCUDACXX_DEVICE static inline void st_async(
// 9.7.8.22. Data Movement and Conversion Instructions: mapa
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mapa

/*
// mapa{.space}.u32 dest, addr, target_cta; // PTX ISA 78, SM_90
// .space = { .shared::cluster }
template <typename Tp>
__device__ static inline Tp* mapa(
  cuda::ptx::space_cluster_t,
  const Tp* addr,
  uint32_t target_cta);
*/
#if __cccl_ptx_isa >= 780
extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_mapa_is_not_supported_before_SM_90__();
template <typename _Tp>
_LIBCUDACXX_DEVICE static inline _Tp* mapa(
  space_cluster_t,
  const _Tp* __addr,
  _CUDA_VSTD::uint32_t __target_cta)
{
  // __space == space_cluster (due to parameter type constraint)
  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
    _CUDA_VSTD::uint32_t __dest;
    asm (
      "mapa.shared::cluster.u32 %0, %1, %2;"
      : "=r"(__dest)
      : "r"(__as_ptr_smem(__addr)),
        "r"(__target_cta)
      :
    );
    return __from_ptr_dsmem<_Tp>(__dest);
  ),(
    // Unsupported architectures will have a linker error with a semi-decent error message
    __cuda_ptx_mapa_is_not_supported_before_SM_90__();
    return __from_ptr_dsmem<_Tp>(0);
  ));
}
#endif // __cccl_ptx_isa >= 780

// 9.7.8.23. Data Movement and Conversion Instructions: getctarank
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank

46 changes: 46 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/ptx/ptx.mapa.compile.pass.cpp
@@ -0,0 +1,46 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

/*
 * We use a special strategy to force the generation of the PTX. This is mainly
 * a fight against dead-code-elimination in the NVVM layer.
 *
 * The reason we need this strategy is that certain older versions of ptxas
 * segfault when a nonsensical sequence of PTX is generated. So instead, we try
 * to force the instantiation and compilation to PTX of all the overloads of the
 * PTX wrapping functions.
 *
 * We do this by writing a function pointer of each overload to the kernel
 * parameter `fn_ptr`.
 *
 * Because `fn_ptr` is possibly visible outside this translation unit, the
 * compiler must compile all the functions that are stored.
 *
 */

__global__ void test_mapa(void ** fn_ptr) {
#if __cccl_ptx_isa >= 780
  NV_IF_TARGET(NV_PROVIDES_SM_90, (
    // mapa.shared::cluster.u32 dest, addr, target_cta;
    *fn_ptr++ = reinterpret_cast<void*>(static_cast<uint64_t* (*)(cuda::ptx::space_cluster_t, const uint64_t* , uint32_t )>(cuda::ptx::mapa));
  ));
#endif // __cccl_ptx_isa >= 780
}

int main(int, char**)
{
return 0;
}
