diff --git a/include/cute/arch/xe_copy_1B.hpp b/include/cute/arch/xe_copy_1B.hpp index 77da428802..d4a23afa36 100644 --- a/include/cute/arch/xe_copy_1B.hpp +++ b/include/cute/arch/xe_copy_1B.hpp @@ -106,6 +106,15 @@ SYCL_DEVICE_BUILTIN( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord)); +// 8bits NO transform transpose +SYCL_DEVICE_BUILTIN( + cute::intel::ushort8 __builtin_IB_subgroup_block_read_cacheopts_transpose_u8_m32k8( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, int cache = 0)); +SYCL_DEVICE_BUILTIN( + cute::intel::ushort4 __builtin_IB_subgroup_block_read_cacheopts_transpose_u8_m32k4( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord, int cache = 0)); // 8bits VNNI transform No transpose SYCL_DEVICE_BUILTIN( @@ -443,6 +452,67 @@ struct XE_2D_U8x32x32_LD_N { } }; +struct XE_2D_U8x32x4_LD_T { + using BlockShape = Shape<_4, _32>; + using inst_dtype = uint8_t; + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(SYCL_INTEL_TARGET) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + *reinterpret_cast(dst) = + __builtin_IB_subgroup_block_read_cacheopts_transpose_u8_m32k4( + (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); +#endif + } +}; + +struct XE_2D_U8x32x8_LD_T { + using BlockShape = Shape<_8, _32>; + using inst_dtype = uint8_t; + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(SYCL_INTEL_TARGET) + static_assert(sizeof(T) == 1, "Expected T to have size 1"); + *reinterpret_cast(dst) = + __builtin_IB_subgroup_block_read_cacheopts_transpose_u8_m32k8( + (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); +#endif + } +}; + +struct XE_2D_U8x16x32_LD_T { + using BlockShape = Shape<_32, _16>; + using inst_dtype = uint32_t; + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(SYCL_INTEL_TARGET) + static_assert(sizeof(T) == 1, "Expected T to have size 2"); + *reinterpret_cast(dst) = + __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( + (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); +#endif + } +}; + + struct XE_2D_U4x16x16_LD_T { using BlockShape = Shape<_16, _16>; using inst_dtype = uint32_t; diff --git a/include/cute/arch/xe_copy_4B.hpp b/include/cute/arch/xe_copy_4B.hpp index 9074ec1ea7..266a428511 100644 --- a/include/cute/arch/xe_copy_4B.hpp +++ b/include/cute/arch/xe_copy_4B.hpp @@ -117,9 +117,10 @@ SYCL_DEVICE_BUILTIN( int pitch_minus_one, cute::intel::coord_t coord)); // 32bits No transform No transpose -SYCL_DEVICE_BUILTIN(cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m1k16v1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint __builtin_IB_subgroup_block_read_flat_u32_m1k16v1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); SYCL_DEVICE_BUILTIN( cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_u32_m2k16v1( intptr_t baseoffset, int width_minus_one, int height_minus_one, @@ -142,9 +143,10 @@ SYCL_DEVICE_BUILTIN( int pitch_minus_one, cute::intel::coord_t coord)); // 32bits No transform Transpose -SYCL_DEVICE_BUILTIN(cute::intel::uint __builtin_IB_subgroup_block_read_flat_transpose_u32_k1( - intptr_t baseoffset, int width_minus_one, int height_minus_one, - int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint __builtin_IB_subgroup_block_read_flat_transpose_u32_k1( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); SYCL_DEVICE_BUILTIN( cute::intel::uint2 __builtin_IB_subgroup_block_read_flat_transpose_u32_k2( intptr_t baseoffset, int width_minus_one, int height_minus_one, @@ -157,6 +159,10 @@ SYCL_DEVICE_BUILTIN( cute::intel::uint8 __builtin_IB_subgroup_block_read_flat_transpose_u32_k8( intptr_t baseoffset, int width_minus_one, int height_minus_one, int pitch_minus_one, cute::intel::coord_t coord)); +SYCL_DEVICE_BUILTIN( + cute::intel::uint4 __builtin_IB_subgroup_block_read_flat_transpose_u32_m8k8( + intptr_t baseoffset, int width_minus_one, int height_minus_one, + int pitch_minus_one, cute::intel::coord_t coord)); // 32bits SYCL_DEVICE_BUILTIN(void __builtin_IB_subgroup_block_write_flat_u32_m1k16v1( @@ -710,6 +716,27 @@ struct XE_2D_U32x16x8_LD_T { }; }; +struct XE_2D_TF32x8x8_LD_T { + using BlockShape = Shape<_8, _8>; + using ValueShape = Shape<_4, _16>; + + static constexpr bool is_transpose = true; + + template + CUTE_HOST_DEVICE static void copy(const void *baseoffset, int width, + int height, int pitch, intel::coord_t coord, + T *dst) { +#if defined(SYCL_INTEL_TARGET) + static_assert(sizeof(T) == 4, "Expected T to have size 4"); + *reinterpret_cast(dst) = + __builtin_IB_subgroup_block_read_flat_transpose_u32_m8k8( + (intptr_t)(baseoffset), width - 1, height - 1, pitch - 1, coord); +#else + CUTE_INVALID_CONTROL_PATH("Trying to use block loads on non-PVC hardware"); +#endif + } +}; + struct XE_2D_U32x1x16_ST_N { using BlockShape = Shape<_1, _16>; diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index e92d184210..181754f4b8 100644 --- a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -1398,6 +1398,24 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0, Stride<_32, _1>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride< _32, Stride<_32, _1>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + + template + Copy_Traits_(ArgTs... args) + : XE_2D_LD_Unpack(args...) {} +}; + template struct Copy_Traits_ : XE_2D_LD_Unpack { @@ -1688,6 +1706,60 @@ struct Copy_Traits_ : XE_2D_LD_Unpack(args...) {} }; +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride< _0, Stride<_1, _8>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_256,Stride< _1,_8>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + + template + Copy_Traits_(ArgT... args) + : XE_2D_LD_Unpack(args...) {} +}; + +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride<_0, Stride<_1, _8, _16>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_256,Stride<_1, _8, _16>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + + template + Copy_Traits_(ArgT... args) + : XE_2D_LD_Unpack(args...) {} +}; + +template +struct Copy_Traits_ + : XE_2D_LD_Unpack { + using ThrID = Layout<_16>; + // Map from (src-thr,src-val) to bit + using SrcLayout = Layout>, + Stride<_0, Stride<_1, _8, _16>>>; + // Map from (dst-thr,dst-val) to bit + using DstLayout = Layout>, + Stride<_256,Stride<_1, _8, _16>>>; + // Reference map from (thr,val) to bit + using RefLayout = DstLayout; + + template + Copy_Traits_(ArgT... args) + : XE_2D_LD_Unpack(args...) {} +}; + // template // struct Copy_Traits // : XE_2D_LD_Unpack { @@ -2232,6 +2304,9 @@ COPY_TRAIT_LD_DEF(XE_2D_U8x16x32_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U8x32x32_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U8x16x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U8x32x64_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_U8x32x8_LD_T) +COPY_TRAIT_LD_DEF(XE_2D_U8x32x4_LD_T) +COPY_TRAIT_LD_DEF(XE_2D_U8x16x32_LD_T) COPY_TRAIT_LD_DEF(XE_2D_U16x1x16_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U16x2x16_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U16x4x16_LD_N) @@ -2274,6 +2349,7 @@ COPY_TRAIT_LD_DEF(XE_2D_U16x16x32_LD_V) COPY_TRAIT_LD_DEF(XE_2D_U16x16x16_LD_T) COPY_TRAIT_LD_DEF(XE_2D_TF32x16x16_LD_N) COPY_TRAIT_LD_DEF(XE_2D_TF32x32x16_LD_N) +COPY_TRAIT_LD_DEF(XE_2D_TF32x8x8_LD_T) COPY_TRAIT_LD_DEF(XE_2D_U4x32x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U4x16x64_LD_N) COPY_TRAIT_LD_DEF(XE_2D_U4x32x16_LD_T)