Skip to content

Commit

Permalink
Add red.async.add.s64 emulation
Browse files Browse the repository at this point in the history
  • Loading branch information
ahendriksen committed Nov 10, 2023
1 parent 33f4899 commit af46a06
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,16 @@ __global__ void test_compilation() {
}
));
#endif // __cccl_ptx_isa >= 810

#if __cccl_ptx_isa >= 810
NV_IF_TARGET(NV_PROVIDES_SM_90, (
if (non_eliminated_false()) {
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; // .u64 intentional
auto overload = static_cast<void (*)(cuda::ptx::op_add_t, int64_t* , const int64_t& , int64_t* )>(cuda::ptx::red_async);
fn_ptr = reinterpret_cast<void*>(overload);
}
));
#endif // __cccl_ptx_isa >= 810
}

int main(int, char**)
Expand Down
22 changes: 22 additions & 0 deletions libcudacxx/docs/extended_api/ptx.md
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,19 @@ notes](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#release

- PTX ISA: [`red.async`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async)

PTX does not currently (CTK 12.3) expose `red.async.add.s64`. This exposure is emulated in `cuda::ptx` using

```cuda
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90
// .op = { .add }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_add_t,
int64_t* dest,
const int64_t& value,
int64_t* remote_bar);
```

**red_async**:
```cuda
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90
Expand Down Expand Up @@ -536,6 +549,15 @@ __device__ static inline void red_async(
uint64_t* dest,
const uint64_t& value,
uint64_t* remote_bar);
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90
// .op = { .add }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_add_t,
int64_t* dest,
const int64_t& value,
int64_t* remote_bar);
```

### [9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier)
Expand Down
38 changes: 38 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h
Original file line number Diff line number Diff line change
Expand Up @@ -1038,6 +1038,44 @@ _LIBCUDACXX_DEVICE static inline void red_async(
}
#endif // __cccl_ptx_isa >= 810

/*
// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90
// .op = { .add }
template <typename=void>
__device__ static inline void red_async(
cuda::ptx::op_add_t,
int64_t* dest,
const int64_t& value,
int64_t* remote_bar);
*/
#if __cccl_ptx_isa >= 810
extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__();
template <typename=void>
_LIBCUDACXX_DEVICE static inline void red_async(
op_add_t,
_CUDA_VSTD::int64_t* __dest,
const _CUDA_VSTD::int64_t& __value,
_CUDA_VSTD::int64_t* __remote_bar)
{
// __op == op_add (due to parameter type constraint)

NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
asm (
"red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 intentional"
:
: "r"(__as_ptr_remote_dsmem(__dest)),
"l"(__value),
"r"(__as_ptr_remote_dsmem(__remote_bar))
: "memory"
);

),(
// Unsupported architectures will have a linker error with a semi-decent error message
return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__();
));
}
#endif // __cccl_ptx_isa >= 810


// 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated)
// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated
Expand Down

0 comments on commit af46a06

Please sign in to comment.