diff --git a/test/unit/cute/intel_xe/copy_block.cpp b/test/unit/cute/intel_xe/copy_block.cpp
index 8d50704881..575d3b9413 100644
--- a/test/unit/cute/intel_xe/copy_block.cpp
+++ b/test/unit/cute/intel_xe/copy_block.cpp
@@ -42,6 +42,7 @@ using namespace cutlass;
 using namespace syclcompat::experimental;
 
 #define SUBGROUP_SIZE (16)
+constexpr int row_alignment = 16; // Alignment requirement for Xe 2D Block Copy Instructions
 
 template
@@ -265,31 +266,34 @@ struct copy_op {
     // Allocate and initialize
     //
     using dtype = uint32_t;
-    cutlass::host_vector<dtype> host_src(M * N);
-    cutlass::host_vector<dtype> host_output(M * N);
-    for (size_t i = 0; i < host_src.size(); ++i) {
-      host_src[i] = static_cast<dtype>(i);
+    constexpr int elem_alignment = row_alignment / sizeof(dtype);
+    constexpr int row_pitch_S = cute::ceil_div(N, elem_alignment) * elem_alignment;
+    constexpr int row_pitch_D = cute::ceil_div(M, elem_alignment) * elem_alignment;
+    using TensorLayoutS = decltype(make_layout(Shape<Int<M>, Int<N>>{}, make_stride(Int<row_pitch_S>{}, _1{})));
+    using TensorLayoutD = decltype(make_layout(Shape<Int<N>, Int<M>>{}, make_stride(Int<row_pitch_D>{}, _1{})));
+
+    cutlass::host_vector<dtype> host_src(M * row_pitch_S);
+    cutlass::host_vector<dtype> host_output(N * row_pitch_D);
+
+    for (size_t i = 0; i < cute::cosize(TensorLayoutS{}); ++i) {
+      host_src[TensorLayoutS{}(i)] = static_cast<dtype>(i);
     }
 
     cutlass::device_vector<dtype> device_src = host_src;
     cutlass::device_vector<dtype> device_output = host_output;
 
-    Tensor S =
-        make_tensor(make_gmem_ptr(device_src.data()),
-                    make_layout(Shape<Int<M>, Int<N>>{}, Stride<Int<N>, _1>{}));
-    Tensor D =
-        make_tensor(make_gmem_ptr(device_output.data()),
-                    make_layout(Shape<Int<N>, Int<M>>{}, Stride<Int<M>, _1>{}));
+    Tensor S = make_tensor(make_gmem_ptr(device_src.data()), TensorLayoutS{});
+    Tensor D = make_tensor(make_gmem_ptr(device_output.data()), TensorLayoutD{});
 
     auto tiled_load = make_tiled_copy(
-        Copy_Atom<Copy_Traits<load>, dtype>{}.with(device_src.data(), M, N),
+        Copy_Atom<Copy_Traits<load>, dtype>{}.with(S),
        Layout<Shape<Int<SUBGROUP_SIZE>, _1>>{},
        make_layout(shape_div(typename Copy_Traits<load>::BlockShape{}, Shape<_16, _1>{})));
     auto tiled_store = make_tiled_copy(
-        Copy_Atom<Copy_Traits<store>, dtype>{}.with(device_output.data(), N, M),
+        Copy_Atom<Copy_Traits<store>, dtype>{}.with(D),
        Layout<Shape<_1, Int<SUBGROUP_SIZE>>>{},
-       make_layout(shape_div(typename Copy_Traits<store>::BlockShape{}, Shape<_1, _16>{})));
+        make_layout(shape_div(typename Copy_Traits<store>::BlockShape{}, Shape<_1, _16>{})));
     auto blockDim = syclcompat::dim3(size(tiled_load));
     //
     // Launch the kernel
@@ -306,7 +310,7 @@ struct copy_op {
     host_output = device_output;
     for (int i = 0; i < N; ++i) {
       for (int j = 0; j < M; ++j) {
-        EXPECT_EQ(host_output[i * M + j], host_src[j * N + i]);
+        EXPECT_EQ(host_output[i * row_pitch_D + j], host_src[j * row_pitch_S + i]);
       }
     }
   }
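The change above replaces densely packed M x N test surfaces with pitched ones: each row is padded out to the 16-byte boundary that the Xe 2D block-copy instructions require, and all indexing goes through the pitched layout instead of assuming a stride of N. A minimal standalone sketch of that arithmetic, in plain C++ with illustrative names and sizes that are not part of the patch:

```cpp
#include <cassert>
#include <cstdint>

// Same rounding as cute::ceil_div (assumed semantics: round a/b upward).
constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
  using dtype = uint32_t;
  constexpr int row_alignment  = 16;                            // bytes; Xe 2D block copy requirement
  constexpr int elem_alignment = row_alignment / sizeof(dtype); // 4 uint32_t elements per 16 bytes
  constexpr int M = 8, N = 10;                                  // hypothetical tile extents
  constexpr int row_pitch = ceil_div(N, elem_alignment) * elem_alignment; // 10 -> 12

  // A row-major (M,N) tensor with stride (row_pitch, 1): element (m,n) sits at
  // m * row_pitch + n, so every row starts on a 16-byte boundary.
  auto offset = [](int m, int n) { return m * row_pitch + n; };
  assert(offset(1, 0) * sizeof(dtype) % row_alignment == 0);

  // The backing store must cover the padded rows. CuTe's cosize() of this layout
  // is (M-1)*row_pitch + N; allocating M * row_pitch elements covers it.
  static_assert(M * row_pitch >= (M - 1) * row_pitch + N);
  return 0;
}
```

Rounding the pitch rather than the shape keeps the logical M x N extent intact; only the backing allocation and the row stride change.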
diff --git a/test/unit/cute/intel_xe/copy_subgroup_block.cpp b/test/unit/cute/intel_xe/copy_subgroup_block.cpp
index b2c5b29ba7..dbc5436d51 100644
--- a/test/unit/cute/intel_xe/copy_subgroup_block.cpp
+++ b/test/unit/cute/intel_xe/copy_subgroup_block.cpp
@@ -43,7 +43,7 @@ using namespace syclcompat::experimental;
 
 template
-void copy_kernel_vectorized(TensorS S, TensorD D, uint32_t M, uint32_t N) {
+void copy_kernel_vectorized(TensorS S, TensorD D) {
   using namespace cute;
 
   using Element = typename TensorS::value_type;
@@ -158,15 +158,19 @@ bool copy(uint32_t M, uint32_t N) {
   //
   // Given a 2D shape, perform an efficient copy
   //
+  constexpr int elem_alignment = 16 / sizeof(dtype);
+  int row_pitch = cute::ceil_div(N, elem_alignment) * elem_alignment;
+
   auto tensor_shape = make_shape(M, N);
+  auto tensor_layout = make_layout(tensor_shape, make_stride(row_pitch, 1));
   auto block_shape = make_shape(Int<wg_tile_m>{}, Int<wg_tile_n>{});
   auto subgroup_shape = make_shape(Int<sg_tile_m>{}, Int<sg_tile_n>{});
 
   //
   // Allocate and initialize
   //
-  cutlass::host_vector<dtype> host_src(size(tensor_shape));
-  cutlass::host_vector<dtype> host_output(size(tensor_shape));
+  cutlass::host_vector<dtype> host_src(cute::cosize(tensor_layout));
+  cutlass::host_vector<dtype> host_output(cute::cosize(tensor_layout));
 
   for (size_t i = 0; i < host_src.size(); ++i) {
     host_src[i] = static_cast<dtype>(i);
@@ -179,10 +183,8 @@ bool copy(uint32_t M, uint32_t N) {
   // Make tensors
   //
-  Tensor tensor_S = make_tensor(make_gmem_ptr(device_src.data()),
-                                make_layout(tensor_shape, make_stride(N, 1)));
-  Tensor tensor_D = make_tensor(make_gmem_ptr(device_output.data()),
-                                make_layout(tensor_shape, make_stride(N, 1)));
+  Tensor tensor_S = make_tensor(make_gmem_ptr(device_src.data()), tensor_layout);
+  Tensor tensor_D = make_tensor(make_gmem_ptr(device_output.data()), tensor_layout);
 
   //
   // Tile tensors
@@ -216,7 +218,7 @@ bool copy(uint32_t M, uint32_t N) {
                 wg_tile_m, wg_tile_n, sg_tile_m, sg_tile_n>>(
       launch_policy{gridDim, blockDim,
                     kernel_properties{sycl_exp::sub_group_size<SUBGROUP_SIZE>}},
-      tensor_S, tensor_D, M, N);
+      tensor_S, tensor_D);
 
   syclcompat::wait_and_throw();
@@ -226,22 +228,21 @@ bool copy(uint32_t M, uint32_t N) {
 
   host_output = device_output;
 
-  auto surface_pitch = N;
   for (int i = 0; i < sg_tile_m && i < M; i++) {
     for (int j = 0; j < sg_tile_n && j < N; j++) {
-      EXPECT_EQ(host_output[surface_pitch * i + j], surface_pitch * i + j);
+      EXPECT_EQ(host_output[row_pitch * i + j], row_pitch * i + j);
     }
   }
 
   for (int i = sg_tile_m; i < sg_tile_m + 1 && i < M; i++) {
     for (int j = 0; j < sg_tile_n && j < N; j++) {
-      EXPECT_NE(host_output[surface_pitch * i + j], surface_pitch * i + j);
+      EXPECT_NE(host_output[row_pitch * i + j], row_pitch * i + j);
     }
   }
 
   for (int i = 0; i < sg_tile_m && i < M; i++) {
     for (int j = sg_tile_n; j < sg_tile_n + 1 && j < N; j++) {
-      EXPECT_NE(host_output[surface_pitch * i + j], surface_pitch * i + j);
+      EXPECT_NE(host_output[row_pitch * i + j], row_pitch * i + j);
    }
   }
   return true;
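With the pitched layout, the verification loops address the padded buffer as row_pitch * i + j, and the expected value is that same expression only because host_src is filled linearly over the whole padded allocation, padding included. A small self-contained sketch of that invariant, with hypothetical sizes and a host-only stand-in for the device round trip:

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  // Hypothetical sizes; the pitch is N rounded up to the alignment, so pitch >= N.
  const int M = 4, N = 6, row_pitch = 8;

  // Like the test's host_src fill: every slot holds its own linear offset, so the
  // element at logical coordinate (i, j) holds exactly row_pitch * i + j.
  std::vector<uint32_t> src(M * row_pitch);
  for (size_t k = 0; k < src.size(); ++k) src[k] = static_cast<uint32_t>(k);

  std::vector<uint32_t> dst = src;  // stands in for the copy under test
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j)
      assert(dst[row_pitch * i + j] == static_cast<uint32_t>(row_pitch * i + j));
  return 0;
}
```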
diff --git a/test/unit/gemm/device/gemm_testbed_3x.hpp b/test/unit/gemm/device/gemm_testbed_3x.hpp
index 13625638a3..c26ddf4de4 100644
--- a/test/unit/gemm/device/gemm_testbed_3x.hpp
+++ b/test/unit/gemm/device/gemm_testbed_3x.hpp
@@ -4018,7 +4018,7 @@ template <typename Gemm, template <class T> class ActivationFunctor =
 // TODO(Codeplay): remove the test_batch option once batching is enabled for all tests
 bool TestXe(
     double alpha = 1.0, double beta = 0.0,
-    bool test_batch = true, int max_alignment = 4,
+    bool test_batch = true, int max_alignment = 8,
     CheckEquality check_relative_equality = CheckEquality::RELATIVE) {
   using ElementScalar = typename Gemm::EpilogueOutputOp::ElementScalar;
   using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
@@ -4040,7 +4040,7 @@ bool TestXe(
   std::vector<int> problem_size_l = test_batch ?
       std::vector<int>{1, 3, 4} : std::vector<int>{1};
 
   constexpr int TileShapeK = cute::size<2>(typename Gemm::GemmKernel::TileShape{});
-  std::vector<int> problem_size_k{TileShapeK};
+  std::vector<int> problem_size_k{TileShapeK, TileShapeK*32};
 
   using DecompositionMode = typename cutlass::gemm::kernel::detail::PersistentTileSchedulerXeStreamKParams::DecompositionMode;
   std::vector<DecompositionMode> decomposition_modes = {DecompositionMode::Heuristic};
diff --git a/test/unit/gemm/device/gemm_universal_f16t_s4n_f32t_mixed_input_tensor_op_f32_xe.cpp b/test/unit/gemm/device/gemm_universal_f16t_s4n_f32t_mixed_input_tensor_op_f32_xe.cpp
index b2271b6217..9e492ac4d0 100644
--- a/test/unit/gemm/device/gemm_universal_f16t_s4n_f32t_mixed_input_tensor_op_f32_xe.cpp
+++ b/test/unit/gemm/device/gemm_universal_f16t_s4n_f32t_mixed_input_tensor_op_f32_xe.cpp
@@ -131,7 +131,7 @@ TEST(XE_Device_GemmUniversal_f16t_s4n_f32t_mixed_input_tensor_op_f32, 128x128x64
   using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
 
   // TODO(Codeplay): gemm batch doesn't work for mixed type
-  bool passed = test::gemm::device::TestXe<Gemm>(1.0, 1.0, false, 8);
+  bool passed = test::gemm::device::TestXe<Gemm>(1.0, 1.0, false, 16);
   EXPECT_TRUE(passed);
 }
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/test/unit/gemm/device/gemm_universal_f16t_s4t_f32t_mixed_input_tensor_op_f32_xe.cpp b/test/unit/gemm/device/gemm_universal_f16t_s4t_f32t_mixed_input_tensor_op_f32_xe.cpp
index 87c8e1227f..b6f3af0f40 100644
--- a/test/unit/gemm/device/gemm_universal_f16t_s4t_f32t_mixed_input_tensor_op_f32_xe.cpp
+++ b/test/unit/gemm/device/gemm_universal_f16t_s4t_f32t_mixed_input_tensor_op_f32_xe.cpp
@@ -131,7 +131,7 @@ TEST(XE_Device_GemmUniversal_f16t_s4t_f32t_mixed_input_tensor_op_f32, 128x128x64
   using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
 
   // TODO(Codeplay): gemm batch doesn't work for mixed type
-  bool passed = test::gemm::device::TestXe<Gemm>(1.0, 1.0, false, 8);
+  bool passed = test::gemm::device::TestXe<Gemm>(1.0, 1.0, false, 32);
   EXPECT_TRUE(passed);
 }
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32.cpp b/test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32.cpp
index bf42112ca9..f22eec7df1 100644
--- a/test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32.cpp
+++ b/test/unit/gemm/device/xe_gemm_s8_s8_s32_tensor_op_s32.cpp
@@ -65,7 +65,7 @@ TEST(XE_Device_Gemm_s8t_s8t_s32t_tensor_op_s32, 256x256x32) {
   using LayoutA = layout::RowMajor;
   using LayoutB = layout::RowMajor;
   using Gemm = XE_Device_Gemm_s8_s8_s32_tensor_op_s32<LayoutA, LayoutB>::Gemm;
-  EXPECT_TRUE(test::gemm::device::TestXe<Gemm>());
+  EXPECT_TRUE(test::gemm::device::TestXe<Gemm>(1.0, 0.0, true, 16));
 }
 
 /* TODO(Codeplay): Transposed copy are not implemented
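The max_alignment bumps in the GEMM tests follow the same 16-byte requirement expressed in element counts: halving the element width doubles the number of elements per aligned chunk. A quick sketch of that mapping; the correspondence to the patch's chosen values (8 for the f16 default, 16 for s8, 32 for row-major s4) is our reading and is not stated in the patch itself:

```cpp
#include <cstdio>

// Elements of a given bit-width that fit in one 16-byte-aligned chunk.
constexpr int elems_per_16_bytes(int bits_per_element) {
  return 16 * 8 / bits_per_element;
}

int main() {
  std::printf("f32: %d\n", elems_per_16_bytes(32)); // 4  -- matches the old default
  std::printf("f16: %d\n", elems_per_16_bytes(16)); // 8  -- matches the new default
  std::printf("s8 : %d\n", elems_per_16_bytes(8));  // 16 -- matches the s8 GEMM test
  std::printf("s4 : %d\n", elems_per_16_bytes(4));  // 32 -- matches the row-major s4 test
  return 0;
}
```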