diff --git a/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.red.async.compile.pass.cpp b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.red.async.compile.pass.cpp
new file mode 100644
index 00000000000..cfc76a58586
--- /dev/null
+++ b/libcudacxx/.upstream-tests/test/cuda/ptx/ptx.red.async.compile.pass.cpp
@@ -0,0 +1,183 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads

+// <cuda/ptx>

+#include <cuda/ptx>
+#include <cuda/std/utility>

+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is that certain older versions of ptxas
+ * segfault when a nonsensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the `__device__`
+ * variable `fn_ptr`. Now, because weak stores from a single thread may be
+ * elided, we also wrap the store in an if branch that cannot be removed.
+ *
+ * To prevent dead-code-elimination of the if branch, we use
+ * `non_eliminated_false`, which uses inline assembly to hide the fact that it
+ * is always false from NVVM.
+ *
+ * This is how we ensure that none of the function pointer stores are elided.
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ * + */ + +__device__ void * fn_ptr = nullptr; + +__device__ bool non_eliminated_false(void){ + int ret = 0; + asm ("": "=r"(ret)::); + return ret != 0; +} + +__global__ void test_compilation() { +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [dest], value, [remote_bar]; + auto overload = 
static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 + +#if __cccl_ptx_isa >= 810 + NV_IF_TARGET(NV_PROVIDES_SM_90, ( + if (non_eliminated_false()) { + // red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [dest], value, [remote_bar]; // .u64 intentional + auto overload = static_cast(cuda::ptx::red_async); + fn_ptr = reinterpret_cast(overload); + } + )); +#endif // __cccl_ptx_isa >= 810 +} + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/docs/extended_api/ptx.md b/libcudacxx/docs/extended_api/ptx.md index b76f7ce4398..06c546dd584 100644 --- a/libcudacxx/docs/extended_api/ptx.md +++ b/libcudacxx/docs/extended_api/ptx.md @@ -490,22 +490,22 @@ int main() { ### [9.7.12. Parallel Synchronization and Communication Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions) -| Instruction | Available in libcu++ | -|------------------------------------------|----------------------| -| [`bar, barrier`] | No | -| [`bar.warp.sync`] | No | -| [`barrier.cluster`] | No | -| [`membar/fence`] | No | -| [`atom`] | No | -| [`red`] | No | -| [`red.async`] | No | -| [`vote (deprecated)`] | No | -| [`vote.sync`] | No | -| [`match.sync`] | No | -| [`activemask`] | No | -| [`redux.sync`] | No | -| [`griddepcontrol`] | No | -| [`elect.sync`] | No | +| Instruction | Available in libcu++ | +|-----------------------|-------------------------| +| [`bar, barrier`] | No | +| [`bar.warp.sync`] | No | +| [`barrier.cluster`] | No | +| [`membar/fence`] | No | +| [`atom`] | No | +| [`red`] | No | +| [`red.async`] | CTK-FUTURE, CCCL v2.3.0 | +| [`vote (deprecated)`] | No | +| [`vote.sync`] | No | +| [`match.sync`] | No | +| [`activemask`] | No | +| [`redux.sync`] | No | +| [`griddepcontrol`] | No | +| [`elect.sync`] | No | [`bar, barrier`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-barrier [`bar.warp.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-bar-warp-sync @@ -513,7 +513,7 @@ int main() { [`membar/fence`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-membar-fence [`atom`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-atom [`red`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red -[`red.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async +[`red.async`]: #redasync [`vote 
(deprecated)`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated [`vote.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-sync [`match.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync @@ -522,6 +522,155 @@ int main() { [`griddepcontrol`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-griddepcontrol [`elect.sync`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-elect-sync +#### `red.async` + +- PTX ISA: [`red.async`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async) + +PTX does not currently (CTK 12.3) expose `red.async.add.s64`. This exposure is emulated in `cuda::ptx` using + +```cuda +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); +``` + +**red_async**: +```cuda +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void red_async( + cuda::ptx::op_inc_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void red_async( + cuda::ptx::op_dec_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + uint32_t* dest, + const int32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .max } +template 
+__device__ static inline void red_async( + cuda::ptx::op_max_t, + uint32_t* dest, + const int32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint32_t* dest, + const int32_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .and } +template +__device__ static inline void red_async( + cuda::ptx::op_and_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .or } +template +__device__ static inline void red_async( + cuda::ptx::op_or_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .xor } +template +__device__ static inline void red_async( + cuda::ptx::op_xor_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint64_t* dest, + const uint64_t& value, + uint64_t* remote_bar); + +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); +``` + ### [9.7.12.15. Parallel Synchronization and Communication Instructions: mbarrier](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier) | Instruction | Available in libcu++ | diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index 48dd076986c..b29d617b7ef 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -31,7 +31,6 @@ #include "../__cuda/ptx/ptx_helper_functions.h" #include "../__cuda/ptx/parallel_synchronization_and_communication_instructions_mbarrier.h" #include "../cstdint" // uint32_t - /* * The cuda::ptx namespace intends to provide PTX wrappers for new hardware * features and new PTX instructions so that they can be experimented with @@ -691,6 +690,528 @@ _LIBCUDACXX_DEVICE static inline void st_async( // 9.7.12.7. 
Parallel Synchronization and Communication Instructions: red.async // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-red-async +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .inc } +template +__device__ static inline void red_async( + cuda::ptx::op_inc_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_inc_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::uint32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_inc (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.inc.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .dec } +template +__device__ static inline void red_async( + cuda::ptx::op_dec_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_dec_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::uint32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_dec (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.dec.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_min_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::uint32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_min (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.u32 
[%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_max_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::uint32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint32_t* dest, + const uint32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_add_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::uint32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .min } +template +__device__ static inline void red_async( + cuda::ptx::op_min_t, + uint32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_min_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::int32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_min (due to parameter 
type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.min.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .max } +template +__device__ static inline void red_async( + cuda::ptx::op_max_t, + uint32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_max_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::int32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_max (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.max.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .s32 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint32_t* dest, + const int32_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_add_t, + _CUDA_VSTD::uint32_t* __dest, + const _CUDA_VSTD::int32_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_s32 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.s32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .and } +template +__device__ static inline void red_async( + cuda::ptx::op_and_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_and_op_t, + _B32* __dest, + const _B32& __value, + _CUDA_VSTD::uint64_t* 
__remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_and_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.and.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__as_b32(__value)), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .or } +template +__device__ static inline void red_async( + cuda::ptx::op_or_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_or_op_t, + _B32* __dest, + const _B32& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_or_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.or.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__as_b32(__value)), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .b32 } +// .op = { .xor } +template +__device__ static inline void red_async( + cuda::ptx::op_xor_op_t, + B32* dest, + const B32& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_xor_op_t, + _B32* __dest, + const _B32& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_b32 (due to parameter type constraint) + // __op == op_xor_op (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.xor.b32 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "r"(__as_b32(__value)), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}{.type} [dest], value, [remote_bar]; // PTX ISA 81, SM_90 +// .type = { .u64 } +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + uint64_t* dest, + const uint64_t& value, + uint64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void 
__void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_add_t, + _CUDA_VSTD::uint64_t* __dest, + const _CUDA_VSTD::uint64_t& __value, + _CUDA_VSTD::uint64_t* __remote_bar) +{ + // __type == type_u64 (due to parameter type constraint) + // __op == op_add (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; " + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "l"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + +/* +// red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes{.op}.u64 [dest], value, [remote_bar]; // .u64 intentional PTX ISA 81, SM_90 +// .op = { .add } +template +__device__ static inline void red_async( + cuda::ptx::op_add_t, + int64_t* dest, + const int64_t& value, + int64_t* remote_bar); +*/ +#if __cccl_ptx_isa >= 810 +extern "C" _LIBCUDACXX_DEVICE void __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); +template +_LIBCUDACXX_DEVICE static inline void red_async( + op_add_t, + _CUDA_VSTD::int64_t* __dest, + const _CUDA_VSTD::int64_t& __value, + _CUDA_VSTD::int64_t* __remote_bar) +{ + // __op == op_add (due to parameter type constraint) + + NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,( + asm ( + "red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u64 [%0], %1, [%2]; // .u64 intentional" + : + : "r"(__as_ptr_remote_dsmem(__dest)), + "l"(__value), + "r"(__as_ptr_remote_dsmem(__remote_bar)) + : "memory" + ); + + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + return __void__cuda_ptx_red_async_is_not_supported_before_SM_90__(); + )); +} +#endif // __cccl_ptx_isa >= 810 + + // 9.7.12.8. Parallel Synchronization and Communication Instructions: vote (deprecated) // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-vote-deprecated
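
For illustration, a minimal usage sketch of the `cuda::ptx::red_async` wrapper added above, assuming an SM_90 target and PTX ISA 8.1. The kernel name and the bare `uint64_t` used as mbarrier storage are placeholders, not part of the change; real code would initialize the mbarrier, wait on its transaction count before reading the result, and guard the call on `__cccl_ptx_isa >= 810` / SM_90 as the compile test does.

```cuda
#include <cuda/ptx>
#include <cuda/std/cstdint>

// Sketch only: each thread asynchronously adds 1 to a shared-memory counter;
// completion is signalled through the mbarrier `bar` via its transaction count.
__global__ void red_async_add_u32_example()
{
  __shared__ cuda::std::uint32_t counter;
  __shared__ cuda::std::uint64_t bar; // mbarrier storage, assumed initialized elsewhere

  if (threadIdx.x == 0) {
    counter = 0;
  }
  __syncthreads();

  // Emits: red.async.relaxed.cluster.shared::cluster.mbarrier::complete_tx::bytes.add.u32
  cuda::ptx::red_async(cuda::ptx::op_add_t{}, &counter, 1u, &bar);

  // Reading `counter` here would require waiting on `bar` first.
}
```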