From 8bac470b2844916a2001533c13c32e9b16a077ca Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Tue, 22 Apr 2025 21:42:37 +0100 Subject: [PATCH 1/2] Avoid offsetting base_ptr in XE_2D_LD_Unpack --- include/cute/atom/copy_traits_xe.hpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index e92d184210..0f6a17e704 100644 --- a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -216,6 +216,7 @@ struct XE_2D_LD_Unpack { uint32_t height; uint32_t pitch; uint32_t stride_l = 0; + uint32_t height_offset = 0; @@ -253,6 +254,10 @@ struct XE_2D_LD_Unpack { if constexpr (stride_rank == 3) { stride_l = size<2>(tensor.stride()); } + if(stride_l % pitch != 0){ + CUTE_INVALID_CONTROL_PATH("Incompatible strides in tensor.\n"); + }; + height_offset = stride_l / pitch; } XE_2D_LD_Unpack(Traits_LD_t const &traits) : base_ptr(traits.base_ptr), @@ -279,11 +284,12 @@ struct XE_2D_LD_Unpack { auto [m, n, l] = src.data().coord_; int x = is_need_reversed ? m : n; int y = is_need_reversed ? n : m; + y += l * traits.height_offset; constexpr auto inst_size_bits = detail::size_of_inst_bits; - CopyOp::copy(base_addr + l * traits.stride_l, - (traits.width * sizeof_bits_v) / sizeof_bits_v, traits.height, + CopyOp::copy(base_addr, + (traits.width * sizeof_bits_v) / sizeof_bits_v, traits.height + l * traits.height_offset, (traits.pitch * sizeof_bits_v) / sizeof_bits_v, intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}, raw_pointer_cast(&((&*dst.data())[0]))); From ac49513fdf4b24234988107798dae7374760fc34 Mon Sep 17 00:00:00 2001 From: Joe Todd Date: Tue, 22 Apr 2025 22:18:47 +0100 Subject: [PATCH 2/2] Apply changes to prefetch & store --- include/cute/atom/copy_traits_xe.hpp | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/include/cute/atom/copy_traits_xe.hpp b/include/cute/atom/copy_traits_xe.hpp index 0f6a17e704..5a250b6156 100644 --- a/include/cute/atom/copy_traits_xe.hpp +++ b/include/cute/atom/copy_traits_xe.hpp @@ -262,7 +262,7 @@ struct XE_2D_LD_Unpack { XE_2D_LD_Unpack(Traits_LD_t const &traits) : base_ptr(traits.base_ptr), width(traits.width), height(traits.height), pitch(traits.pitch), - stride_l(traits.stride_l) {} + stride_l(traits.stride_l), height_offset(traits.height_offset){} XE_2D_LD_Unpack() {} @@ -288,8 +288,8 @@ struct XE_2D_LD_Unpack { constexpr auto inst_size_bits = detail::size_of_inst_bits; - CopyOp::copy(base_addr, - (traits.width * sizeof_bits_v) / sizeof_bits_v, traits.height + l * traits.height_offset, + CopyOp::copy(base_addr, (traits.width * sizeof_bits_v) / sizeof_bits_v, + traits.height + l * traits.height_offset, (traits.pitch * sizeof_bits_v) / sizeof_bits_v, intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}, raw_pointer_cast(&((&*dst.data())[0]))); @@ -311,11 +311,12 @@ struct XE_2D_LD_Unpack { int x = is_need_reversed ? m : n; int y = is_need_reversed ? n : m; + y += l * atom.height_offset; constexpr auto inst_size_bits = detail::size_of_inst_bits; - CopyOp::PREFETCH::copy(base_addr + l * atom.stride_l, - (atom.width * sizeof_bits_v) / sizeof_bits_v, atom.height, + CopyOp::PREFETCH::copy(base_addr, (atom.width * sizeof_bits_v) / sizeof_bits_v, + atom.height + l * atom.height_offset, (atom.pitch * sizeof_bits_v) / sizeof_bits_v, intel::coord_t{(int)(x * sizeof_bits_v / inst_size_bits), y}); } @@ -345,6 +346,7 @@ template (tensor.stride()); } + if(stride_l % pitch != 0){ + CUTE_INVALID_CONTROL_PATH("Incompatible strides in tensor.\n"); + }; + height_offset = stride_l / pitch; } XE_2D_ST_Unpack(Traits_ST_t const &traits) : base_ptr(traits.base_ptr), width(traits.width), height(traits.height), pitch(traits.pitch), - stride_l(traits.stride_l) {} + stride_l(traits.stride_l), height_offset(traits.height_offset) {} XE_2D_ST_Unpack() {} @@ -389,11 +395,14 @@ template