diff --git a/libcudacxx/docs/ptx.md b/libcudacxx/docs/ptx.md index b7f8d64da80..354d99f1d93 100644 --- a/libcudacxx/docs/ptx.md +++ b/libcudacxx/docs/ptx.md @@ -478,7 +478,7 @@ __device__ static inline uint32_t getctarank( | [`cp.async.bulk.prefetch.tensor`] | No | | [`cp.async.bulk.commit_group`] | CTK-FUTURE, CCCL v2.4.0 | | [`cp.async.bulk.wait_group`] | CTK-FUTURE, CCCL v2.4.0 | -| [`tensormap.replace`] | No | +| [`tensormap.replace`] | CTK-FUTURE, CCCL v2.4.0 | [`cp.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async [`cp.async.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group @@ -491,7 +491,7 @@ __device__ static inline uint32_t getctarank( [`cp.async.bulk.prefetch.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor [`cp.async.bulk.commit_group`]: #cpasyncbulkcommit_group [`cp.async.bulk.wait_group`]: #cpasyncbulkwait_group -[`tensormap.replace`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace +[`tensormap.replace`]: #tensormapreplace #### `cp.async.bulk` @@ -839,6 +839,182 @@ template __device__ static inline void cp_async_bulk_wait_group_read( cuda::ptx::n32_t N); ``` + +#### `tensormap.replace` + +- PTX ISA: [`tensormap.replace`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace) + +**tensormap_replace**: +```cuda +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_global_t, + void* tm_addr, + B64 new_val); + +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_shared_t, + void* tm_addr, + B64 new_val); + +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_global_t, + void* tm_addr, + B32 new_val); + +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_shared_t, + void* tm_addr, + B32 new_val); + +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +// 
tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); + +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); + +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_fill_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void 
tensormap_replace_fill_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +``` + ### [9.7.9. Texture Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions) | Instruction | Available in libcu++ | @@ -1202,7 +1378,7 @@ __device__ static inline void red_async( | [`cp.async.mbarrier.arrive`] | No | | [`mbarrier.test_wait/mbarrier.try_wait`] | CTK-FUTURE, CCCL v2.3.0 | | [`mbarrier.pending_count`] | No | -| [`tensormap.cp_fenceproxy`] | No | +| [`tensormap.cp_fenceproxy`] | CTK-FUTURE, CCCL v2.4.0 | [`mbarrier.init`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init [`mbarrier.inval`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval @@ -1213,7 +1389,7 @@ __device__ static inline void red_async( [`cp.async.mbarrier.arrive`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive [`mbarrier.test_wait/mbarrier.try_wait`]: #mbarriertest_waitmbarriertry_wait [`mbarrier.pending_count`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count -[`tensormap.cp_fenceproxy`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy +[`tensormap.cp_fenceproxy`]: #tensormapcpfenceproxy @@ -1473,6 +1649,25 @@ __device__ static inline bool mbarrier_try_wait_parity( const uint32_t& suspendTimeHint); ``` +#### `tensormap.cp_fenceproxy` + +- PTX ISA: [`tensormap.cp_fenceproxy`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy) + +**tensormap_cp_fenceproxy**: +```cuda +// tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA 83, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void tensormap_cp_fenceproxy( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + void* dst, + const void* src, + cuda::ptx::n32_t size); +``` + + ### [9.7.13. 
Warp Level Matrix Multiply-Accumulate Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions) | Instruction | Available in libcu++ | diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index 594d8a582a8..7dc17309fbd 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -105,4 +105,13 @@ #endif #endif // __cccl_ptx_isa >= 800 +// NVRTC uses its own header, so we need to manually tell it when we expect SM90a to be available +#if defined(_CCCL_COMPILER_NVRTC) && !defined(NV_HAS_FEATURE_SM_90a) +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL)) +#define NV_HAS_FEATURE_SM_90a NV_PROVIDES_SM_90 +#else // ^^^ SM90a ^^^ / vvv !SM90a vvv +#define NV_HAS_FEATURE_SM_90a NV_NO_TARGET +#endif // +#endif // _CCCL_COMPILER_NVRTC && !NV_HAS_FEATURE_SM_90a + #endif // __CCCL_PTX_ISA_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index c472b0467ca..90bbeb190b9 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -2161,6 +2161,701 @@ _CCCL_DEVICE static inline void cp_async_bulk_wait_group_read( // 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace +/* +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_global_t, + void* tm_addr, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_address( + space_global_t, + void* __tm_addr, + _B64 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "l"(__as_b64(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_shared_t, + void* tm_addr, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_address( + space_shared_t, + void* __tm_addr, + _B64 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" 
+ : + : "r"(__as_ptr_smem(__tm_addr)), + "l"(__as_b64(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_global_t, + void* tm_addr, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_rank( + space_global_t, + void* __tm_addr, + _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_shared_t, + void* tm_addr, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_rank( + space_shared_t, + void* __tm_addr, + _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_box_dim( + space_global_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__ord), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { 
.shared::cta } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_box_dim( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__ord), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_dim( + space_global_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__ord), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_dim( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__ord), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 
830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_stride( + space_global_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B64 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__ord), + "l"(__as_b64(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_stride( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B64 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__ord), + "l"(__as_b64(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_element_size( + space_global_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__ord), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE 
static inline void tensormap_replace_element_size( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__ord), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_elemtype( + space_global_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_elemtype( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_interleave_layout( + space_global_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // 
Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_interleave_layout( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode( + space_global_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_fill_mode( + 
cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_fill_mode( + space_global_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_fill_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_fill_mode( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 /* * 9.7.9. Texture Instructions @@ -3263,6 +3958,74 @@ _CCCL_DEVICE static inline void red_async( // 9.7.12.15.18. 
Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy
 
+/*
+// tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA 83, SM_90
+// .sem = { .release }
+// .scope = { .cta, .cluster, .gpu, .sys }
+template <cuda::ptx::dot_scope Scope, int N32>
+__device__ static inline void tensormap_cp_fenceproxy(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_t<Scope> scope,
+  void* dst,
+  const void* src,
+  cuda::ptx::n32_t<N32> size);
+*/
+#if __cccl_ptx_isa >= 830
+extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();
+template <dot_scope _Scope, int _N32>
+_CCCL_DEVICE static inline void tensormap_cp_fenceproxy(
+  sem_release_t,
+  scope_t<_Scope> __scope,
+  void* __dst,
+  const void* __src,
+  n32_t<_N32> __size)
+{
+  // __sem == sem_release (due to parameter type constraint)
+  static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, "");
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
+      asm volatile (
+        "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;"
+        :
+        : "l"(__as_ptr_gmem(__dst)),
+          "r"(__as_ptr_smem(__src)),
+          "n"(__size)
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) {
+      asm volatile (
+        "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;"
+        :
+        : "l"(__as_ptr_gmem(__dst)),
+          "r"(__as_ptr_smem(__src)),
+          "n"(__size)
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_gpu) {
+      asm volatile (
+        "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;"
+        :
+        : "l"(__as_ptr_gmem(__dst)),
+          "r"(__as_ptr_smem(__src)),
+          "n"(__size)
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_sys) {
+      asm volatile (
+        "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;"
+        :
+        : "l"(__as_ptr_gmem(__dst)),
+          "r"(__as_ptr_smem(__src)),
+          "n"(__size)
+        : "memory"
+      );
+    }
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 830
 
 /*
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp
new file mode 100644
index 00000000000..4e905039b5f
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is because certain older versions of ptxas
+ * segfault when a non-sensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the kernel
+ * parameter `fn_ptr`.
+ *
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ *
+ */
+
+__global__ void test_tensormap_cp_fenceproxy(void ** fn_ptr) {
+#if __cccl_ptx_isa >= 830
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [dst], [src], size;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t, cuda::ptx::scope_cta_t, void*, const void*, cuda::ptx::n32_t<128>)>(cuda::ptx::tensormap_cp_fenceproxy));
+    // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [dst], [src], size;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, void*, const void*, cuda::ptx::n32_t<128>)>(cuda::ptx::tensormap_cp_fenceproxy));
+    // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [dst], [src], size;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t, cuda::ptx::scope_gpu_t, void*, const void*, cuda::ptx::n32_t<128>)>(cuda::ptx::tensormap_cp_fenceproxy));
+    // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [dst], [src], size;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t, cuda::ptx::scope_sys_t, void*, const void*, cuda::ptx::n32_t<128>)>(cuda::ptx::tensormap_cp_fenceproxy));
+  ));
+#endif // __cccl_ptx_isa >= 830
+}
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp
new file mode 100644
index 00000000000..dbc25eeb921
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp
@@ -0,0 +1,179 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is because certain older versions of ptxas
+ * segfault when a non-sensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the kernel
+ * parameter `fn_ptr`.
+ *
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ * + */ + +__global__ void test_tensormap_replace(void ** fn_ptr) { +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tensormap_replace_global_address)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tensormap_replace_global_address)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tensormap_replace_rank)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tensormap_replace_rank)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int32_t )>(cuda::ptx::tensormap_replace_box_dim)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int32_t )>(cuda::ptx::tensormap_replace_box_dim)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int32_t )>(cuda::ptx::tensormap_replace_global_dim)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int32_t )>(cuda::ptx::tensormap_replace_global_dim)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int64_t )>(cuda::ptx::tensormap_replace_global_stride)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int64_t )>(cuda::ptx::tensormap_replace_global_stride)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int32_t )>(cuda::ptx::tensormap_replace_element_size)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int32_t )>(cuda::ptx::tensormap_replace_element_size)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // 
tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_elemtype)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_elemtype)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_interleave_layout)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_interleave_layout)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_swizzle_mode)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_swizzle_mode)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_fill_mode)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_fill_mode)); + )); +#endif // __cccl_ptx_isa >= 830 +} + +int main(int, char**) +{ + return 0; +}
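
A minimal usage sketch of the wrappers introduced by this patch, not part of the change itself. It assumes a 128-byte (`.b1024`) tensor-map descriptor that has already been copied into shared memory; the kernel name `update_gmem_tensormap`, the staging comment, the dimension ordinal `0`, and the value `64` are purely illustrative. The `tensormap_replace_*` calls link only when the kernel is compiled for `sm_90a`; `tensormap_cp_fenceproxy` requires `sm_90`.

```cuda
#include <cuda/ptx>
#include <cuda/std/cstdint>

// Hypothetical example: patch two fields of a tensor-map copy held in shared
// memory, then publish it to global memory through the tensormap proxy.
__global__ void update_gmem_tensormap(void* gmem_tensormap, void* new_base_ptr)
{
  // A tensor-map is an opaque 128-byte (.b1024) descriptor.
  __shared__ alignas(128) char smem_tensormap[128];

  // ... copy the existing descriptor from gmem_tensormap into smem_tensormap
  //     and synchronize, so that a single thread can edit it below ...

  if (threadIdx.x == 0)
  {
    // Replace individual fields of the shared-memory copy (SM_90a only).
    cuda::ptx::tensormap_replace_global_address(
      cuda::ptx::space_shared, smem_tensormap,
      reinterpret_cast<cuda::std::uint64_t>(new_base_ptr));
    cuda::ptx::tensormap_replace_box_dim(
      cuda::ptx::space_shared, smem_tensormap,
      cuda::ptx::n32_t<0>{},      // ordinal of the box dimension to replace (illustrative)
      cuda::std::int32_t{64});    // illustrative new extent

    // Copy the edited descriptor back to global memory and order the write
    // with later reads through the tensormap::generic proxy, e.g. by
    // subsequent cp.async.bulk.tensor operations. The size operand is 128.
    cuda::ptx::tensormap_cp_fenceproxy(
      cuda::ptx::sem_release, cuda::ptx::scope_cta,
      gmem_tensormap, smem_tensormap, cuda::ptx::n32_t<128>{});
  }
}
```

A single thread performs the edits; the proxy fence then makes the updated descriptor visible at the chosen scope, and a wider scope (`scope_cluster`, `scope_gpu`, or `scope_sys`) follows the same pattern with a different scope argument.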