Skip to content

Commit

Permalink
Add cuda::ptx::getctarank
Browse files Browse the repository at this point in the history
  • Loading branch information
ahendriksen committed Feb 7, 2024
1 parent 5f02fe8 commit fbce1cd
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 2 deletions.
19 changes: 17 additions & 2 deletions libcudacxx/docs/ptx.md
Original file line number Diff line number Diff line change
Expand Up @@ -281,8 +281,8 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
| [`cvta`] | No |
| [`cvt`] | No |
| [`cvt.pack`] | No |
| [`getctarank`] | No |
| [`mapa`] | CTK-FUTURE, CCCL v2.4.0 |
| [`getctarank`] | CTK-FUTURE, CCCL v2.4.0 |

[`mov`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-mov-2
[`shfl (deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-shfl-deprecated
Expand All @@ -302,8 +302,8 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release
[`cvta`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvta
[`cvt`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt
[`cvt.pack`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cvt-pack
[`getctarank`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank
[`mapa`]: #mapa
[`getctarank`]: #getctarank

#### `st.async`

Expand Down Expand Up @@ -434,6 +434,21 @@ __device__ static inline Tp* mapa(
```


#### `getctarank`

- PTX ISA: [`getctarank`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank)

**getctarank**:
```cuda
// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90
// .space = { .shared::cluster }
template <typename=void>
__device__ static inline uint32_t getctarank(
cuda::ptx::space_cluster_t,
const void* addr);
```


### [9.7.8.24. Data Movement and Conversion Instructions: Asynchronous copy](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-asynchronous-copy)

| Instruction | Available in libcu++ |
Expand Down
32 changes: 32 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
Original file line number Diff line number Diff line change
Expand Up @@ -587,6 +587,38 @@ _LIBCUDACXX_DEVICE static inline _Tp* mapa(

// 9.7.8.23. Data Movement and Conversion Instructions: getctarank
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-getctarank
/*
// getctarank{.space}.u32 dest, addr; // PTX ISA 78, SM_90
// .space = { .shared::cluster }
template <typename=void>
__device__ static inline uint32_t getctarank(
cuda::ptx::space_cluster_t,
const void* addr);
*/
#if __cccl_ptx_isa >= 780
extern "C" _LIBCUDACXX_DEVICE void __cuda_ptx_getctarank_is_not_supported_before_SM_90__();
template <typename=void>
_LIBCUDACXX_DEVICE static inline _CUDA_VSTD::uint32_t getctarank(
space_cluster_t,
const void* __addr)
{
// __space == space_cluster (due to parameter type constraint)
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
_CUDA_VSTD::uint32_t __dest;
asm (
"getctarank.shared::cluster.u32 %0, %1;"
: "=r"(__dest)
: "r"(__as_ptr_smem(__addr))
:
);
return __dest;
),(
// Unsupported architectures will have a linker error with a semi-decent error message
__cuda_ptx_getctarank_is_not_supported_before_SM_90__();
return 0;
));
}
#endif // __cccl_ptx_isa >= 780


/*
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

/*
* We use a special strategy to force the generation of the PTX. This is mainly
* a fight against dead-code-elimination in the NVVM layer.
*
* The reason we need this strategy is because certain older versions of ptxas
* segfault when a non-sensical sequence of PTX is generated. So instead, we try
* to force the instantiation and compilation to PTX of all the overloads of the
* PTX wrapping functions.
*
* We do this by writing a function pointer of each overload to the kernel
* parameter `fn_ptr`.
*
* Because `fn_ptr` is possibly visible outside this translation unit, the
* compiler must compile all the functions which are stored.
*
*/

__global__ void test_getctarank(void ** fn_ptr) {
#if __cccl_ptx_isa >= 780
NV_IF_TARGET(NV_PROVIDES_SM_90, (
// getctarank.shared::cluster.u32 dest, addr;
*fn_ptr++ = reinterpret_cast<void*>(static_cast<uint32_t (*)(cuda::ptx::space_cluster_t, const void* )>(cuda::ptx::getctarank));
));
#endif // __cccl_ptx_isa >= 780
}

int main(int, char**)
{
return 0;
}

0 comments on commit fbce1cd

Please sign in to comment.