diff --git a/libcudacxx/docs/ptx.md b/libcudacxx/docs/ptx.md index b7f8d64da80..354d99f1d93 100644 --- a/libcudacxx/docs/ptx.md +++ b/libcudacxx/docs/ptx.md @@ -478,7 +478,7 @@ __device__ static inline uint32_t getctarank( | [`cp.async.bulk.prefetch.tensor`] | No | | [`cp.async.bulk.commit_group`] | CTK-FUTURE, CCCL v2.4.0 | | [`cp.async.bulk.wait_group`] | CTK-FUTURE, CCCL v2.4.0 | -| [`tensormap.replace`] | No | +| [`tensormap.replace`] | CTK-FUTURE, CCCL v2.4.0 | [`cp.async`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async [`cp.async.commit_group`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-commit-group @@ -491,7 +491,7 @@ __device__ static inline uint32_t getctarank( [`cp.async.bulk.prefetch.tensor`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor [`cp.async.bulk.commit_group`]: #cpasyncbulkcommit_group [`cp.async.bulk.wait_group`]: #cpasyncbulkwait_group -[`tensormap.replace`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace +[`tensormap.replace`]: #tensormapreplace #### `cp.async.bulk` @@ -839,6 +839,182 @@ template __device__ static inline void cp_async_bulk_wait_group_read( cuda::ptx::n32_t N); ``` + +#### `tensormap.replace` + +- PTX ISA: [`tensormap.replace`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace) + +**tensormap_replace**: +```cuda +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_global_t, + void* tm_addr, + B64 new_val); + +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_shared_t, + void* tm_addr, + B64 new_val); + +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_global_t, + void* tm_addr, + B32 new_val); + +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_shared_t, + void* tm_addr, + B32 new_val); + +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +// 
tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); + +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); + +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); + +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_fill_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); + +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void 
tensormap_replace_fill_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +``` + ### [9.7.9. Texture Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#texture-instructions) | Instruction | Available in libcu++ | @@ -1202,7 +1378,7 @@ __device__ static inline void red_async( | [`cp.async.mbarrier.arrive`] | No | | [`mbarrier.test_wait/mbarrier.try_wait`] | CTK-FUTURE, CCCL v2.3.0 | | [`mbarrier.pending_count`] | No | -| [`tensormap.cp_fenceproxy`] | No | +| [`tensormap.cp_fenceproxy`] | CTK-FUTURE, CCCL v2.4.0 | [`mbarrier.init`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-init [`mbarrier.inval`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-inval @@ -1213,7 +1389,7 @@ __device__ static inline void red_async( [`cp.async.mbarrier.arrive`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive [`mbarrier.test_wait/mbarrier.try_wait`]: #mbarriertest_waitmbarriertry_wait [`mbarrier.pending_count`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-mbarrier-pending-count -[`tensormap.cp_fenceproxy`]: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy +[`tensormap.cp_fenceproxy`]: #tensormapcpfenceproxy @@ -1473,6 +1649,25 @@ __device__ static inline bool mbarrier_try_wait_parity( const uint32_t& suspendTimeHint); ``` +#### `tensormap.cp_fenceproxy` + +- PTX ISA: [`tensormap.cp_fenceproxy`](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy) + +**tensormap_cp_fenceproxy**: +```cuda +// tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA 83, SM_90 +// .sem = { .release } +// .scope = { .cta, .cluster, .gpu, .sys } +template +__device__ static inline void tensormap_cp_fenceproxy( + cuda::ptx::sem_release_t, + cuda::ptx::scope_t scope, + void* dst, + const void* src, + cuda::ptx::n32_t size); +``` + + ### [9.7.13. 
Warp Level Matrix Multiply-Accumulate Instructions](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-multiply-accumulate-instructions) | Instruction | Available in libcu++ | diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h index 594d8a582a8..7dc17309fbd 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cccl/ptx_isa.h @@ -105,4 +105,13 @@ #endif #endif // __cccl_ptx_isa >= 800 +// NVRTC uses its own header, so we need to manually tell it when we expect SM90a to be available +#if defined(_CCCL_COMPILER_NVRTC) && !defined(NV_HAS_FEATURE_SM_90a) +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900) && defined(__CUDA_ARCH_FEAT_SM90_ALL)) +#define NV_HAS_FEATURE_SM_90a NV_PROVIDES_SM_90 +#else // ^^^ SM90a ^^^ / vvv !SM90a vvv +#define NV_HAS_FEATURE_SM_90a NV_NO_TARGET +#endif // +#endif // _CCCL_COMPILER_NVRTC && !NV_HAS_FEATURE_SM_90a + #endif // __CCCL_PTX_ISA_H_ diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h index c472b0467ca..90bbeb190b9 100644 --- a/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h +++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/ptx.h @@ -2161,6 +2161,701 @@ _CCCL_DEVICE static inline void cp_async_bulk_wait_group_read( // 9.7.8.25. Data Movement and Conversion Instructions: tensormap.replace // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-tensormap-replace +/* +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_global_t, + void* tm_addr, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_address( + space_global_t, + void* __tm_addr, + _B64 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.global_address.global.b1024.b64 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "l"(__as_b64(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_address.space.b1024.b64 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_address( + cuda::ptx::space_shared_t, + void* tm_addr, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_address( + space_shared_t, + void* __tm_addr, + _B64 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.global_address.shared::cta.b1024.b64 [%0], %1;" 
+ : + : "r"(__as_ptr_smem(__tm_addr)), + "l"(__as_b64(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_address_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_global_t, + void* tm_addr, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_rank( + space_global_t, + void* __tm_addr, + _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.rank.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.rank.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_rank( + cuda::ptx::space_shared_t, + void* tm_addr, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_rank( + space_shared_t, + void* __tm_addr, + _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.rank.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_rank_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_box_dim( + space_global_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.box_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__ord), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.box_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { 
.shared::cta } +template +__device__ static inline void tensormap_replace_box_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_box_dim( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__ord), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_box_dim_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_dim( + space_global_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.global_dim.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__ord), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_dim.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_dim( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_dim( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__ord), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_dim_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 
830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_stride( + space_global_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B64 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.global_stride.global.b1024.b64 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__ord), + "l"(__as_b64(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.global_stride.space.b1024.b64 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_global_stride( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B64 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_global_stride( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B64 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B64) == 8, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__ord), + "l"(__as_b64(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_global_stride_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_element_size( + space_global_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B32 __new_val) +{ + // __space == space_global (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.element_stride.global.b1024.b32 [%0], %1, %2;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__ord), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.element_stride.space.b1024.b32 [tm_addr], ord, new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_element_size( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t ord, + B32 new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE 
static inline void tensormap_replace_element_size( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __ord, + _B32 __new_val) +{ + // __space == space_shared (due to parameter type constraint) + static_assert(sizeof(_B32) == 4, ""); + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [%0], %1, %2;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__ord), + "r"(__as_b32(__new_val)) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_element_size_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_elemtype( + space_global_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.elemtype.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.elemtype.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_elemtype( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_elemtype( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_elemtype_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_interleave_layout( + space_global_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.interleave_layout.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // 
Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.interleave_layout.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_interleave_layout( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_interleave_layout( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_interleave_layout_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode( + space_global_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.swizzle_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.swizzle_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_swizzle_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_swizzle_mode( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_swizzle_mode_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .global } +template +__device__ static inline void tensormap_replace_fill_mode( + 
cuda::ptx::space_global_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_fill_mode( + space_global_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_global (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.fill_mode.global.b1024.b32 [%0], %1;" + : + : "l"(__as_ptr_gmem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 + +/* +// tensormap.replace.tile.fill_mode.space.b1024.b32 [tm_addr], new_val; // PTX ISA 83, SM_90a +// .space = { .shared::cta } +template +__device__ static inline void tensormap_replace_fill_mode( + cuda::ptx::space_shared_t, + void* tm_addr, + cuda::ptx::n32_t new_val); +*/ +#if __cccl_ptx_isa >= 830 +extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); +template +_CCCL_DEVICE static inline void tensormap_replace_fill_mode( + space_shared_t, + void* __tm_addr, + n32_t<_N32> __new_val) +{ + // __space == space_shared (due to parameter type constraint) + NV_IF_ELSE_TARGET(NV_HAS_FEATURE_SM_90a,( + asm ( + "tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [%0], %1;" + : + : "r"(__as_ptr_smem(__tm_addr)), + "n"(__new_val) + : "memory" + ); + ),( + // Unsupported architectures will have a linker error with a semi-decent error message + __cuda_ptx_tensormap_replace_fill_mode_is_not_supported_before_SM_90a__(); + )); +} +#endif // __cccl_ptx_isa >= 830 /* * 9.7.9. Texture Instructions @@ -3263,6 +3958,74 @@ _CCCL_DEVICE static inline void red_async( // 9.7.12.15.18. 
Parallel Synchronization and Communication Instructions: tensormap.cp_fenceproxy
 // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-tensormap-cp-fenceproxy
 
+/*
+// tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.sem.scope.sync.aligned [dst], [src], size; // PTX ISA 83, SM_90
+// .sem = { .release }
+// .scope = { .cta, .cluster, .gpu, .sys }
+template <cuda::ptx::dot_scope Scope, int N32>
+__device__ static inline void tensormap_cp_fenceproxy(
+  cuda::ptx::sem_release_t,
+  cuda::ptx::scope_t<Scope> scope,
+  void* dst,
+  const void* src,
+  cuda::ptx::n32_t<N32> size);
+*/
+#if __cccl_ptx_isa >= 830
+extern "C" _CCCL_DEVICE void __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();
+template <dot_scope _Scope, int _N32>
+_CCCL_DEVICE static inline void tensormap_cp_fenceproxy(
+  sem_release_t,
+  scope_t<_Scope> __scope,
+  void* __dst,
+  const void* __src,
+  n32_t<_N32> __size)
+{
+  // __sem == sem_release (due to parameter type constraint)
+  static_assert(__scope == scope_cta || __scope == scope_cluster || __scope == scope_gpu || __scope == scope_sys, "");
+  NV_IF_ELSE_TARGET(NV_PROVIDES_SM_90,(
+    if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cta) {
+      asm volatile (
+        "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [%0], [%1], %2;"
+        :
+        : "l"(__as_ptr_gmem(__dst)),
+          "r"(__as_ptr_smem(__src)),
+          "n"(__size)
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_cluster) {
+      asm volatile (
+        "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [%0], [%1], %2;"
+        :
+        : "l"(__as_ptr_gmem(__dst)),
+          "r"(__as_ptr_smem(__src)),
+          "n"(__size)
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_gpu) {
+      asm volatile (
+        "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [%0], [%1], %2;"
+        :
+        : "l"(__as_ptr_gmem(__dst)),
+          "r"(__as_ptr_smem(__src)),
+          "n"(__size)
+        : "memory"
+      );
+    } else if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (__scope == scope_sys) {
+      asm volatile (
+        "tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [%0], [%1], %2;"
+        :
+        : "l"(__as_ptr_gmem(__dst)),
+          "r"(__as_ptr_smem(__src)),
+          "n"(__size)
+        : "memory"
+      );
+    }
+  ),(
+    // Unsupported architectures will have a linker error with a semi-decent error message
+    __cuda_ptx_tensormap_cp_fenceproxy_is_not_supported_before_SM_90__();
+  ));
+}
+#endif // __cccl_ptx_isa >= 830
 
 /*
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp
new file mode 100644
index 00000000000..4e905039b5f
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.cp_fenceproxy.compile.pass.cpp
@@ -0,0 +1,52 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is because certain older versions of ptxas
+ * segfault when a non-sensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the kernel
+ * parameter `fn_ptr`.
+ *
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ *
+ */
+
+__global__ void test_tensormap_cp_fenceproxy(void ** fn_ptr) {
+#if __cccl_ptx_isa >= 830
+  NV_IF_TARGET(NV_PROVIDES_SM_90, (
+    // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cta.sync.aligned [dst], [src], size;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t, cuda::ptx::scope_cta_t, void*, const void*, cuda::ptx::n32_t<128>)>(cuda::ptx::tensormap_cp_fenceproxy));
+    // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.cluster.sync.aligned [dst], [src], size;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t, cuda::ptx::scope_cluster_t, void*, const void*, cuda::ptx::n32_t<128>)>(cuda::ptx::tensormap_cp_fenceproxy));
+    // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.gpu.sync.aligned [dst], [src], size;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t, cuda::ptx::scope_gpu_t, void*, const void*, cuda::ptx::n32_t<128>)>(cuda::ptx::tensormap_cp_fenceproxy));
+    // tensormap.cp_fenceproxy.global.shared::cta.tensormap::generic.release.sys.sync.aligned [dst], [src], size;
+    *fn_ptr++ = reinterpret_cast<void*>(static_cast<void (*)(cuda::ptx::sem_release_t, cuda::ptx::scope_sys_t, void*, const void*, cuda::ptx::n32_t<128>)>(cuda::ptx::tensormap_cp_fenceproxy));
+  ));
+#endif // __cccl_ptx_isa >= 830
+}
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp
new file mode 100644
index 00000000000..dbc25eeb921
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/ptx/ptx.tensormap.replace.compile.pass.cpp
@@ -0,0 +1,179 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+// UNSUPPORTED: libcpp-has-no-threads
+
+// <cuda/ptx>
+
+#include <cuda/ptx>
+#include <cuda/std/utility>
+
+/*
+ * We use a special strategy to force the generation of the PTX. This is mainly
+ * a fight against dead-code-elimination in the NVVM layer.
+ *
+ * The reason we need this strategy is because certain older versions of ptxas
+ * segfault when a non-sensical sequence of PTX is generated. So instead, we try
+ * to force the instantiation and compilation to PTX of all the overloads of the
+ * PTX wrapping functions.
+ *
+ * We do this by writing a function pointer of each overload to the kernel
+ * parameter `fn_ptr`.
+ *
+ * Because `fn_ptr` is possibly visible outside this translation unit, the
+ * compiler must compile all the functions which are stored.
+ * + */ + +__global__ void test_tensormap_replace(void ** fn_ptr) { +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.global_address.global.b1024.b64 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tensormap_replace_global_address)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.global_address.shared::cta.b1024.b64 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tensormap_replace_global_address)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.rank.global.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tensormap_replace_rank)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.rank.shared::cta.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast(cuda::ptx::tensormap_replace_rank)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.box_dim.global.b1024.b32 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int32_t )>(cuda::ptx::tensormap_replace_box_dim)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.box_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int32_t )>(cuda::ptx::tensormap_replace_box_dim)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.global_dim.global.b1024.b32 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int32_t )>(cuda::ptx::tensormap_replace_global_dim)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.global_dim.shared::cta.b1024.b32 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int32_t )>(cuda::ptx::tensormap_replace_global_dim)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.global_stride.global.b1024.b64 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int64_t )>(cuda::ptx::tensormap_replace_global_stride)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.global_stride.shared::cta.b1024.b64 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int64_t )>(cuda::ptx::tensormap_replace_global_stride)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.element_stride.global.b1024.b32 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int32_t )>(cuda::ptx::tensormap_replace_element_size)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.element_stride.shared::cta.b1024.b32 [tm_addr], ord, new_val; + *fn_ptr++ = reinterpret_cast(static_cast, int32_t )>(cuda::ptx::tensormap_replace_element_size)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // 
tensormap.replace.tile.elemtype.global.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_elemtype)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.elemtype.shared::cta.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_elemtype)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.interleave_layout.global.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_interleave_layout)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.interleave_layout.shared::cta.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_interleave_layout)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.swizzle_mode.global.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_swizzle_mode)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.swizzle_mode.shared::cta.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_swizzle_mode)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.fill_mode.global.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_fill_mode)); + )); +#endif // __cccl_ptx_isa >= 830 + +#if __cccl_ptx_isa >= 830 + NV_IF_TARGET(NV_HAS_FEATURE_SM_90a, ( + // tensormap.replace.tile.fill_mode.shared::cta.b1024.b32 [tm_addr], new_val; + *fn_ptr++ = reinterpret_cast(static_cast)>(cuda::ptx::tensormap_replace_fill_mode)); + )); +#endif // __cccl_ptx_isa >= 830 +} + +int main(int, char**) +{ + return 0; +}
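
A minimal usage sketch of the wrappers introduced by this patch, not part of the change itself. It assumes a 128-byte (`.b1024`) tensor-map descriptor that has already been copied into shared memory; the kernel name `update_gmem_tensormap`, the staging comment, the dimension ordinal `0`, and the value `64` are purely illustrative. The `tensormap_replace_*` calls link only when the kernel is compiled for `sm_90a`; `tensormap_cp_fenceproxy` requires `sm_90`.

```cuda
#include <cuda/ptx>
#include <cuda/std/cstdint>

// Hypothetical example: patch two fields of a tensor-map copy held in shared
// memory, then publish it to global memory through the tensormap proxy.
__global__ void update_gmem_tensormap(void* gmem_tensormap, void* new_base_ptr)
{
  // A tensor-map is an opaque 128-byte (.b1024) descriptor.
  __shared__ alignas(128) char smem_tensormap[128];

  // ... copy the existing descriptor from gmem_tensormap into smem_tensormap
  //     and synchronize, so that a single thread can edit it below ...

  if (threadIdx.x == 0)
  {
    // Replace individual fields of the shared-memory copy (SM_90a only).
    cuda::ptx::tensormap_replace_global_address(
      cuda::ptx::space_shared, smem_tensormap,
      reinterpret_cast<cuda::std::uint64_t>(new_base_ptr));
    cuda::ptx::tensormap_replace_box_dim(
      cuda::ptx::space_shared, smem_tensormap,
      cuda::ptx::n32_t<0>{},      // ordinal of the box dimension to replace (illustrative)
      cuda::std::int32_t{64});    // illustrative new extent

    // Copy the edited descriptor back to global memory and order the write
    // with later reads through the tensormap::generic proxy, e.g. by
    // subsequent cp.async.bulk.tensor operations. The size operand is 128.
    cuda::ptx::tensormap_cp_fenceproxy(
      cuda::ptx::sem_release, cuda::ptx::scope_cta,
      gmem_tensormap, smem_tensormap, cuda::ptx::n32_t<128>{});
  }
}
```

A single thread performs the edits; the proxy fence then makes the updated descriptor visible at the chosen scope, and a wider scope (`scope_cluster`, `scope_gpu`, or `scope_sys`) follows the same pattern with a different scope argument.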