From 734811221a58e88cfc6e8cd2010820bb5b10b697 Mon Sep 17 00:00:00 2001 From: Dimple Prajapati Date: Wed, 28 Aug 2024 11:00:56 -0700 Subject: [PATCH 1/2] [test] Fix test cases with unsupported shaped buffer sizes (#847) [test] Fix test cases that does xegpu.load_nd on unsupported shaped buffer In several test cases, we use load_nd on unsupported shaped buffer. This PR fixes one of those test cases. Co-authored-by: Md Abdullah Shahneous Bari --- .../imex/ExecutionEngine/ImexRunnerUtils.h | 5 + .../Dialect/XeGPU/dynamic_memref.vc.mlir | 44 +++--- .../Integration/Dialect/XeGPU/exp_f32.vc.mlir | 89 +++++------- .../Dialect/XeGPU/fmax_f32.vc.mlir | 7 +- .../Dialect/XeGPU/load2d-padding.mlir | 42 +++--- .../Dialect/XeGPU/load2d_dpas_store2d.mlir | 97 ------------- .../Integration/Dialect/XeGPU/preop_dpas.mlir | 102 +++++++------ .../Dialect/XeGPU/vector_broadcast_1.mlir | 132 ++++++++++------- .../Dialect/XeGPU/vector_broadcast_2.mlir | 136 ++++++++++-------- .../vector_extract_strided_slice_1.vc.mlir | 122 ++++++++++------ .../Dialect/XeGPU/vector_insert_1.mlir | 51 +++---- .../Dialect/XeGPU/vector_insert_2.mlir | 49 +++---- .../Dialect/XeGPU/xegpu-to-vc.mlir | 135 +++++++++++------ 13 files changed, 510 insertions(+), 501 deletions(-) delete mode 100644 test/Integration/Dialect/XeGPU/load2d_dpas_store2d.mlir diff --git a/include/imex/ExecutionEngine/ImexRunnerUtils.h b/include/imex/ExecutionEngine/ImexRunnerUtils.h index b9f03023a..464ad6418 100644 --- a/include/imex/ExecutionEngine/ImexRunnerUtils.h +++ b/include/imex/ExecutionEngine/ImexRunnerUtils.h @@ -72,6 +72,11 @@ _mlir_ciface_fillResource1DRandomF16(UnrankedMemRefType *ptr, const float lower, const float upper, const bool genInt); +extern "C" IMEX_RUNNERUTILS_EXPORT void +_mlir_ciface_fillResource1DRandomF32(UnrankedMemRefType *ptr, + const float lower, const float upper, + const bool genInt); + extern "C" IMEX_RUNNERUTILS_EXPORT void _mlir_ciface_printMemrefBF16(UnrankedMemRefType *m); extern "C" IMEX_RUNNERUTILS_EXPORT void diff --git a/test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir b/test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir index a70a08c76..55c8f0902 100644 --- a/test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir +++ b/test/Integration/Dialect/XeGPU/dynamic_memref.vc.mlir @@ -7,55 +7,49 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A : memref<8x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + func.func @test(%A : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref_0 = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %A, %memref_0 : memref<8x16xf16> to memref<8x16xf16> + %memref_0 = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %A, %memref_0 : memref<8x16xf32> to memref<8x16xf32> %memref_1 = gpu.alloc host_shared () : memref<8x16xf32> - %memref_0_cast = memref.cast %memref_0 : memref<8x16xf16> to memref + %memref_0_cast = memref.cast %memref_0 : memref<8x16xf32> to memref %memref_1_cast = memref.cast %memref_1 : memref<8x16xf32> to memref - gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref_0_cast : memref, %memref_1_cast : memref) - gpu.dealloc %memref_0 : memref<8x16xf16> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref_0_cast : 
memref, %memref_1_cast : memref) + gpu.dealloc %memref_0 : memref<8x16xf32> return %memref_1 : memref<8x16xf32> } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%arg0 : memref, %arg1: memref) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_kernel(%arg0 : memref, %arg1: memref) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index %c16 = arith.constant 16 : index - %1 = xegpu.create_nd_tdesc %arg0[0, 0], [%c8, %c16], [%c16, %c1] : memref -> !xegpu.tensor_desc<8x16xf16> - %2 = xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %3 = vector.shape_cast %2 : vector<8x16xf16> to vector<128xf16> - %5 = arith.extf %3 : vector<128xf16> to vector<128xf32> - %4 = vector.shape_cast %5 : vector<128xf32> to vector<8x16xf32> + %1 = xegpu.create_nd_tdesc %arg0[0, 0], [%c8, %c16], [%c16, %c1] : memref -> !xegpu.tensor_desc<8x16xf32> + %2 = xegpu.load_nd %1 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> %6 = xegpu.create_nd_tdesc %arg1[0, 0], [%c8, %c16], [%c16, %c1] : memref -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %4, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %2, %6 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } func.func @main() attributes {llvm.emit_c_interface} { - %A = memref.alloc() : memref<8x16xf16> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> + %A = memref.alloc() : memref<8x16xf32> + %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32> %c_gen_int = arith.constant 0 : i1 %cf_lower = arith.constant -0.5 : f32 %cf_upper = arith.constant 0.5 : f32 - call @fillResource1DRandomF16(%A_random, %cf_lower, %cf_upper, %c_gen_int) : (memref<*xf16>, f32, f32, i1) -> () + call @fillResource1DRandomF32(%A_random, %cf_lower, %cf_upper, %c_gen_int) : (memref<*xf32>, f32, f32, i1) -> () - %B = call @test(%A) : (memref<8x16xf16>) -> memref<8x16xf32> + %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32> %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32> - %A_cast = memref.cast %A : memref<8x16xf16> to memref<*xf16> - // call @printMemrefF16(%A_cast) : (memref<*xf16>) -> () + %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32> // call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () // CHECK: [ALLCLOSE: TRUE] - call @printAllcloseF16(%A_cast, %B_cast) : (memref<*xf16>, memref<*xf32>) -> () + call @printAllcloseF32(%A_cast, %B_cast) : (memref<*xf32>, memref<*xf32>) -> () - memref.dealloc %A : memref<8x16xf16> + memref.dealloc %A : memref<8x16xf32> return } - func.func private @printMemrefF16(memref<*xf16>) attributes {llvm.emit_c_interface} func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface} - func.func private @fillResource1DF16(memref<*xf16>, f32) attributes {llvm.emit_c_interface} - func.func private @printAllcloseF16(memref<*xf16>, memref<*xf32>) attributes {llvm.emit_c_interface} + func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface} + func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) 
attributes {llvm.emit_c_interface} } diff --git a/test/Integration/Dialect/XeGPU/exp_f32.vc.mlir b/test/Integration/Dialect/XeGPU/exp_f32.vc.mlir index 45036fbbf..92111fc1d 100644 --- a/test/Integration/Dialect/XeGPU/exp_f32.vc.mlir +++ b/test/Integration/Dialect/XeGPU/exp_f32.vc.mlir @@ -7,35 +7,28 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16> ) -> (memref<8x16xf32>, memref<8x16xf32>) attributes {llvm.emit_c_interface} { + func.func @test(%A: memref<8x16xf32>) -> (memref<8x16xf32>, memref<8x16xf32>) attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<16x16xf16> - memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16> - memref.copy %B, %memref_1 : memref<16x16xf16> to memref<16x16xf16> + %memref = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %A, %memref : memref<8x16xf32> to memref<8x16xf32> + %memref_2 = gpu.alloc host_shared () : memref<8x16xf32> %memref_3 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @module0::@test_exp_larger_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_2 : memref<8x16xf32>) - gpu.launch_func @module1::@test_exp_generic_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_3 : memref<8x16xf32>) - gpu.dealloc %memref : memref<8x16xf16> - gpu.dealloc %memref_1 : memref<16x16xf16> + gpu.launch_func @module0::@test_exp_larger_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_2 : memref<8x16xf32>) + gpu.launch_func @module1::@test_exp_generic_vec blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_3 : memref<8x16xf32>) + gpu.dealloc %memref : memref<8x16xf32> return %memref_2, %memref_3 : memref<8x16xf32>, memref<8x16xf32> } gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_exp_larger_vec(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_exp_larger_vec(%A: memref<8x16xf32>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index // load A tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - // load B tile - %b_tile0 = xegpu.create_nd_tdesc %B [%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %val2 = xegpu.load_nd %b_tile0 { packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - // do DPAS - %val4 = xegpu.dpas %val0, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> // take exp - %t6 = math.exp %val4 : vector<8x16xf32> + %t6 = math.exp %val0 : vector<8x16xf32> // store 
%out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> xegpu.store_nd %t6, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> @@ -43,27 +36,22 @@ module @gemm attributes {gpu.container_module} { } } gpu.module @module1 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_exp_generic_vec(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_exp_generic_vec(%A: memref<8x16xf32>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index // load A tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - // load B tile - %b_tile0 = xegpu.create_nd_tdesc %B [%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %val2 = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - // do DPAS - %val4 = xegpu.dpas %val0, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> - // extract dpas out into 16xf32 vectors - %cst1 = arith.constant dense<1.4426950408889634> : vector<128xf32> - %v0 = vector.extract %val4[0] : vector<16xf32> from vector<8x16xf32> - %v1 = vector.extract %val4[1] : vector<16xf32> from vector<8x16xf32> - %v2 = vector.extract %val4[2] : vector<16xf32> from vector<8x16xf32> - %v3 = vector.extract %val4[3] : vector<16xf32> from vector<8x16xf32> - %v4 = vector.extract %val4[4] : vector<16xf32> from vector<8x16xf32> - %v5 = vector.extract %val4[5] : vector<16xf32> from vector<8x16xf32> - %v6 = vector.extract %val4[6] : vector<16xf32> from vector<8x16xf32> - %v7 = vector.extract %val4[7] : vector<16xf32> from vector<8x16xf32> + %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + + // extract the loaded vector into 16xf32 vectors + %v0 = vector.extract %val0[0] : vector<16xf32> from vector<8x16xf32> + %v1 = vector.extract %val0[1] : vector<16xf32> from vector<8x16xf32> + %v2 = vector.extract %val0[2] : vector<16xf32> from vector<8x16xf32> + %v3 = vector.extract %val0[3] : vector<16xf32> from vector<8x16xf32> + %v4 = vector.extract %val0[4] : vector<16xf32> from vector<8x16xf32> + %v5 = vector.extract %val0[5] : vector<16xf32> from vector<8x16xf32> + %v6 = vector.extract %val0[6] : vector<16xf32> from vector<8x16xf32> + %v7 = vector.extract %val0[7] : vector<16xf32> from vector<8x16xf32> // do generic size exp %v0_exp = math.exp %v0 : vector<16xf32> %v1_exp = math.exp %v1 : vector<16xf32> @@ -104,31 +92,19 @@ module @gemm attributes {gpu.container_module} { %rand_lower = arith.constant -1.0 : f32 %rand_upper = arith.constant 1.0 : f32 %gen_int = arith.constant 0 : i1 - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> + %A = memref.alloc() : memref<8x16xf32> %Out_cpu = memref.alloc() : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> () - call 
@fillResource1DRandomF16(%B_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> () + %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32> + call @fillResource1DRandomF32(%A_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf32>, f32, f32, i1) -> () // run GPU version - %Out_gpu_large, %Out_gpu_generic = call @test(%A, %B) : (memref<8x16xf16>, memref<16x16xf16>) -> (memref<8x16xf32>, memref<8x16xf32>) + %Out_gpu_large, %Out_gpu_generic = call @test(%A) : (memref<8x16xf32>) -> (memref<8x16xf32>, memref<8x16xf32>) %Out_gpu_generic_cast = memref.cast %Out_gpu_generic : memref<8x16xf32> to memref<*xf32> %Out_gpu_large_cast = memref.cast %Out_gpu_large : memref<8x16xf32> to memref<*xf32> // run CPU version scf.for %i = %c0 to %c8 step %c1 { scf.for %j = %c0 to %c16 step %c1 { - %v0_init = arith.constant 0.0 : f32 - %result:1 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 { - %a0 = memref.load %A[%i, %k] : memref<8x16xf16> - %b0 = memref.load %B[%k, %j] : memref<16x16xf16> - %a0_f32 = arith.extf %a0 : f16 to f32 - %b0_f32 = arith.extf %b0 : f16 to f32 - %t0 = arith.mulf %a0_f32, %b0_f32 : f32 - %v0_new = arith.addf %v0, %t0 : f32 - scf.yield %v0_new : f32 - } - %vexp = math.exp %result#0: f32 + %a0 = memref.load %A[%i, %j] : memref<8x16xf32> + %vexp = math.exp %a0: f32 memref.store %vexp, %Out_cpu[%i, %j] : memref<8x16xf32> } } @@ -141,8 +117,7 @@ module @gemm attributes {gpu.container_module} { call @printAllcloseF32(%Out_gpu_generic_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () call @printAllcloseF32(%Out_gpu_large_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () // dealloc - memref.dealloc %A : memref<8x16xf16> - memref.dealloc %B : memref<16x16xf16> + memref.dealloc %A : memref<8x16xf32> memref.dealloc %Out_cpu : memref<8x16xf32> // gpu dealloc gpu.dealloc %Out_gpu_generic : memref<8x16xf32> @@ -150,6 +125,6 @@ module @gemm attributes {gpu.container_module} { return } func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface} + func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface} func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} } diff --git a/test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir b/test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir index b2bb6829a..8cb0eab7b 100644 --- a/test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir +++ b/test/Integration/Dialect/XeGPU/fmax_f32.vc.mlir @@ -50,6 +50,7 @@ module @gemm attributes {gpu.container_module} { %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index %c16 = arith.constant 16 : index + %A = memref.alloc() : memref<8x32xf16> %B = memref.alloc() : memref<16x32xf16> %Out_cpu = memref.alloc() : memref<8x16xf32> @@ -72,9 +73,9 @@ module @gemm attributes {gpu.container_module} { %v0_init = arith.constant 0.0 : f32 %v1_init = arith.constant 0.0 : f32 %result:2 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init, %v1 = %v1_init) -> (f32, f32){ - %a0 = memref.load %A[%i, %k] : memref<8x32xf16> %1 = arith.addi %k, %c16 : index %2 = arith.addi %j, %c16 : index + %a0 = memref.load %A[%i, %k] : memref<8x32xf16> %a1 = memref.load %A[%i, %1] : memref<8x32xf16> %b0 = memref.load %B[%k, %j] : memref<16x32xf16> %b1 = memref.load %B[%k, %2] : memref<16x32xf16> @@ -94,8 +95,8 @@ module @gemm attributes {gpu.container_module} { 
} %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to memref<*xf32> // print GPU and CPU outs - // call @printMemrefF32(%Out_cpu_cast) : (memref<*xf32>) -> () - // call @printMemrefF32(%Out_gpu_cast) : (memref<*xf32>) -> () + call @printMemrefF32(%Out_cpu_cast) : (memref<*xf32>) -> () + call @printMemrefF32(%Out_gpu_cast) : (memref<*xf32>) -> () // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () // dealloc diff --git a/test/Integration/Dialect/XeGPU/load2d-padding.mlir b/test/Integration/Dialect/XeGPU/load2d-padding.mlir index 22a97f496..b6c488f46 100644 --- a/test/Integration/Dialect/XeGPU/load2d-padding.mlir +++ b/test/Integration/Dialect/XeGPU/load2d-padding.mlir @@ -7,46 +7,46 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - // memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<1.0> - memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<1.0> + // memref.global "private" constant @__constant_8x16xf32 : memref<8x16xf32> = dense<1.0> + memref.global "private" constant @__constant_8x16xf32 : memref<8x16xf32> = dense<1.0> - func.func @test(%arg0: memref<8x16xf16>,%arg3:index) -> memref<8x16xf16> attributes {llvm.emit_c_interface} { + func.func @test(%arg0: memref<8x16xf32>,%arg3:index) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %arg0, %memref : memref<8x16xf16> to memref<8x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<8x16xf16> - gpu.launch_func @test_kernel::@test_padding blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<8x16xf16>, %arg3:index) + %memref = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %arg0, %memref : memref<8x16xf32> to memref<8x16xf32> + %memref_1 = gpu.alloc host_shared () : memref<8x16xf32> + gpu.launch_func @test_kernel::@test_padding blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_1 : memref<8x16xf32>, %arg3:index) - gpu.dealloc %memref : memref<8x16xf16> - return %memref_1 : memref<8x16xf16> + gpu.dealloc %memref : memref<8x16xf32> + return %memref_1 : memref<8x16xf32> } gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_padding(%arg0: memref<8x16xf16>, %arg1: memref<8x16xf16>,%arg3:index) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_padding(%arg0: memref<8x16xf32>, %arg1: memref<8x16xf32>,%arg3:index) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %0 = xegpu.create_nd_tdesc %arg0[%arg3, %arg3] - : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> + : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> %2 = xegpu.create_nd_tdesc %arg1[0, 0] - : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %3 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - xegpu.store_nd %3,%2 : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16> + : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %3 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + xegpu.store_nd %3,%2 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } 
func.func @main() attributes {llvm.emit_c_interface} { - %0 = memref.get_global @__constant_8x16xf16 : memref<8x16xf16> + %0 = memref.get_global @__constant_8x16xf32 : memref<8x16xf32> %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index - %2 = call @test(%0, %c1) : (memref<8x16xf16>, index) -> memref<8x16xf16> - %3 = call @test(%0, %c2) : (memref<8x16xf16>, index) -> memref<8x16xf16> + %2 = call @test(%0, %c1) : (memref<8x16xf32>, index) -> memref<8x16xf32> + %3 = call @test(%0, %c2) : (memref<8x16xf32>, index) -> memref<8x16xf32> %c7 = arith.constant 7 : index - %vector_0 = vector.load %2[%c7,%c0] :memref<8x16xf16>, vector<16xf16> + %vector_0 = vector.load %2[%c7,%c0] :memref<8x16xf32>, vector<16xf32> // CHECK: ( 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) - vector.print %vector_0 : vector<16xf16> + vector.print %vector_0 : vector<16xf32> - %vector_1 = vector.load %3[%c0,%c0] :memref<8x16xf16>, vector<16xf16> + %vector_1 = vector.load %3[%c0,%c0] :memref<8x16xf32>, vector<16xf32> // CHECK: ( 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 ) - vector.print %vector_1 : vector<16xf16> + vector.print %vector_1 : vector<16xf32> return } } diff --git a/test/Integration/Dialect/XeGPU/load2d_dpas_store2d.mlir b/test/Integration/Dialect/XeGPU/load2d_dpas_store2d.mlir deleted file mode 100644 index 98de82596..000000000 --- a/test/Integration/Dialect/XeGPU/load2d_dpas_store2d.mlir +++ /dev/null @@ -1,97 +0,0 @@ -// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ -// RUN: --runner imex-cpu-runner -e main \ -// RUN: --entry-point-result=void \ -// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck -// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xegpu-to-func-vc.pp \ -// RUN: --runner imex-cpu-runner -e main \ -// RUN: --entry-point-result=void \ -// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -module @gemm attributes {gpu.container_module} { - memref.global "private" @__constant_8x16xf16 : memref<8x16xf16> = dense<1.0> - memref.global "private" @__constant_16x16xf16 : memref<16x16xf16> = dense<1.0> - memref.global "private" @__constant_16x16xf32 : memref<16x16xf32> = dense<0.0> - - func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { - %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %arg0, %memref : memref<8x16xf16> to memref<8x16xf16> - %memref_0 = gpu.alloc host_shared () : memref<16x16xf16> - memref.copy %arg1, %memref_0 : memref<16x16xf16> to memref<16x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_0 : memref<16x16xf16>, %memref_1 : memref<8x16xf32>) - gpu.dealloc %memref : memref<8x16xf16> - gpu.dealloc %memref_0 : memref<16x16xf16> - return %memref_1 : memref<8x16xf32> - } - - gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { - %0 = xegpu.create_nd_tdesc %arg0[0, 0] - : memref<8x16xf16> -> 
!xegpu.tensor_desc<8x16xf16> - %1 = xegpu.create_nd_tdesc %arg1[0, 0] - : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %2 = xegpu.create_nd_tdesc %arg2[0, 0] - : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - %3 = xegpu.load_nd %0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %4 = xegpu.load_nd %1 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - %5 = xegpu.load_nd %2 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> - %6 = xegpu.dpas %3, %4, %5 : vector<8x16xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32> - xegpu.store_nd %6,%2 : vector<8x16xf32>,!xegpu.tensor_desc<8x16xf32> - gpu.return - } - } - func.func @main() attributes {llvm.emit_c_interface} { - %0 = memref.get_global @__constant_8x16xf16 : memref<8x16xf16> - %1 = memref.get_global @__constant_16x16xf16 : memref<16x16xf16> - %ref = memref.get_global @__constant_16x16xf32 : memref<16x16xf32> - - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %c16 = arith.constant 16 : index - %c128 = arith.constant 128 : index - %c1024 = arith.constant 1024 : index - - scf.for %arg0 = %c0 to %c8 step %c1 { - scf.for %arg1 = %c0 to %c16 step %c1 { - %int0 = arith.index_cast %arg0 : index to i16 - %int1 = arith.index_cast %arg1 : index to i16 - %c16_i16 = arith.constant 16 : i16 - %idx0 = arith.muli %int0, %c16_i16 : i16 - %idx1 = arith.addi %int1, %idx0 : i16 - %fp = arith.uitofp %idx1 : i16 to f16 - %cst100 = arith.constant 1.0 : f16 - %val0 = arith.divf %fp, %cst100 : f16 - %cst1 = arith.constant 1.0 : f16 - %val1 = arith.addf %val0, %cst1 : f16 - memref.store %val0, %0[%arg0, %arg1] : memref<8x16xf16> - memref.store %val1, %1[%arg0, %arg1] : memref<16x16xf16> - } - } - // caculate the result C matrix - scf.for %arg0 = %c0 to %c8 step %c1 { - scf.for %arg1 = %c0 to %c16 step %c1 { - %acc = memref.load %ref[%arg0, %arg1] : memref<16x16xf32> - %res = scf.for %arg2 = %c0 to %c1024 step %c1 iter_args(%arg3 = %acc) -> f32 { - %a = memref.load %0[%arg0, %arg2] : memref<8x16xf16> - %b = memref.load %1[%arg2, %arg1] : memref<16x16xf16> - %c = arith.mulf %a, %b : f16 - %cc = arith.extf %c : f16 to f32 - %ccc = arith.addf %cc, %arg3 : f32 - scf.yield %ccc : f32 - } - memref.store %res, %ref[%arg0, %arg1] : memref<16x16xf32> - } - } - - %cast_ref = memref.cast %ref : memref<16x16xf32> to memref<*xf32> - %2 = call @test(%0, %1) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32> - %cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32> - // call @printMemrefF32(%cast) : (memref<*xf32>) -> () - // CHECK: [ALLCLOSE: TRUE] - call @printAllcloseF32(%cast, %cast_ref) : (memref<*xf32>, memref<*xf32>) -> () - return - } - func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} -} diff --git a/test/Integration/Dialect/XeGPU/preop_dpas.mlir b/test/Integration/Dialect/XeGPU/preop_dpas.mlir index c3bef77ca..cc984886d 100644 --- a/test/Integration/Dialect/XeGPU/preop_dpas.mlir +++ b/test/Integration/Dialect/XeGPU/preop_dpas.mlir @@ -7,39 +7,57 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - memref.global "private" @__constant_8x16xf32 : memref<8x16xf32> = dense<0.0> - func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { 
+ memref.global "private" @__constant_32x32xf16 : memref<32x32xf16> = dense<1.0> + memref.global "private" @__Bconstant_32x32xf16 : memref<32x32xf16> = dense<2.0> + func.func @test(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf16>) -> memref<32x32xf32> attributes {llvm.emit_c_interface} { %c64 = arith.constant 64 : index - %c128 = arith.constant 128 : index + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %arg0, %memref : memref<8x16xf16> to memref<8x16xf16> - %memref_0 = gpu.alloc host_shared () : memref<16x16xf16> - memref.copy %arg1, %memref_0 : memref<16x16xf16> to memref<16x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @test_kernel::@test_kernel blocks in (%c128, %c64, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_0 : memref<16x16xf16>, %memref_1 : memref<8x16xf32>) - gpu.dealloc %memref : memref<8x16xf16> - gpu.dealloc %memref_0 : memref<16x16xf16> - return %memref_1 : memref<8x16xf32> + %memref = gpu.alloc host_shared () : memref<32x32xf16> + memref.copy %arg0, %memref : memref<32x32xf16> to memref<32x32xf16> + %memref_0 = gpu.alloc host_shared () : memref<32x32xf16> + memref.copy %arg1, %memref_0 : memref<32x32xf16> to memref<32x32xf16> + %memref_1 = gpu.alloc host_shared () : memref<32x32xf32> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c4, %c2, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<32x32xf16>, %memref_0 : memref<32x32xf16>, %memref_1 : memref<32x32xf32>) + gpu.dealloc %memref : memref<32x32xf16> + gpu.dealloc %memref_0 : memref<32x32xf16> + return %memref_1 : memref<32x32xf32> } - gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %C: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { +gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { + gpu.func @test_kernel(%A: memref<32x32xf16>, %B: memref<32x32xf16>, %C: memref<32x32xf32>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + + %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index %c8 = arith.constant 8 : index - %c1024 = arith.constant 1024 : index %cst = arith.constant dense<1.0> : vector<8x16xf16> %0 = gpu.block_id x %1 = gpu.block_id y - %4 = xegpu.create_nd_tdesc %C[%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - %7 = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %8 = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %9 = xegpu.load_nd %7 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - %10 = xegpu.load_nd %8 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - %13 = arith.addf %9, %cst : vector<8x16xf16> - %11 = xegpu.dpas %13, %10 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> - xegpu.store_nd %11, %4 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + + %2 = arith.muli %0, %c8 : index + %3 = arith.muli %1, %c16 : index + + %4 = xegpu.create_nd_tdesc %C[%2, %3] : memref<32x32xf32> -> 
!xegpu.tensor_desc<8x16xf32> + %5 = xegpu.load_nd %4 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + + %6 = scf.for %arg3 = %c0 to %c32 step %c16 iter_args(%arg4 = %5) -> (vector<8x16xf32>) { + %A0 = xegpu.create_nd_tdesc %A[%2, %arg3] : memref<32x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %A0_val = xegpu.load_nd %A0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + + %B0 = xegpu.create_nd_tdesc %B[%arg3, %3] : memref<32x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %B0_val = xegpu.load_nd %B0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + + %A0_preop = arith.addf %A0_val, %cst : vector<8x16xf16> + + %dpas0 = xegpu.dpas %A0_preop, %B0_val , %arg4: vector<8x16xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32> + scf.yield %dpas0 : vector<8x16xf32> + } + xegpu.store_nd %6, %4 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + gpu.return } } @@ -47,28 +65,19 @@ module @gemm attributes {gpu.container_module} { func.func @main() attributes {llvm.emit_c_interface} { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %c16 = arith.constant 16 : index - %c128 = arith.constant 128 : index + %c32 = arith.constant 32 : index - %rand_lower = arith.constant -2.0 : f32 - %rand_upper = arith.constant 2.0 : f32 - %gen_int = arith.constant 1 : i1 + %A = memref.get_global @__constant_32x32xf16 : memref<32x32xf16> + %B = memref.get_global @__Bconstant_32x32xf16 : memref<32x32xf16> + %C_ref = memref.alloc() : memref<32x32xf32> - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> - %C_ref = memref.get_global @__constant_8x16xf32 : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> () - call @fillResource1DRandomF16(%B_random, %rand_lower, %rand_upper, %gen_int) : (memref<*xf16>, f32, f32, i1) -> () // caculate the result C matrix - scf.for %i = %c0 to %c8 step %c1 { - scf.for %j = %c0 to %c16 step %c1 { - %acc = memref.load %C_ref[%i, %j] : memref<8x16xf32> - %res = scf.for %k = %c0 to %c16 step %c1 iter_args(%acc1 = %acc) -> f32 { - %a = memref.load %A[%i, %k] : memref<8x16xf16> - %b = memref.load %B[%k, %j] : memref<16x16xf16> + scf.for %i = %c0 to %c32 step %c1 { + scf.for %j = %c0 to %c32 step %c1 { + %acc = arith.constant 0.0 : f32 + %res = scf.for %k = %c0 to %c32 step %c1 iter_args(%acc1 = %acc) -> f32 { + %a = memref.load %A[%i, %k] : memref<32x32xf16> + %b = memref.load %B[%k, %j] : memref<32x32xf16> // adjust for preop in GPU kernel, where we add 1 between load and dpas %cst1 = arith.constant 1.0 : f16 %a_adj = arith.addf %a, %cst1 : f16 @@ -77,21 +86,20 @@ module @gemm attributes {gpu.container_module} { %ccc = arith.addf %cc, %acc1 : f32 scf.yield %ccc : f32 } - memref.store %res, %C_ref[%i, %j] : memref<8x16xf32> + memref.store %res, %C_ref[%i, %j] : memref<32x32xf32> } } - %2 = call @test(%A, %B) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32> - %cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32> + %2 = call @test(%A, %B) : (memref<32x32xf16>, memref<32x32xf16>) -> memref<32x32xf32> + %cast = memref.cast %2 : memref<32x32xf32> to memref<*xf32> // call @printMemrefF32(%cast) : (memref<*xf32>) -> () - %cast_ref = memref.cast %C_ref : memref<8x16xf32> to memref<*xf32> + %cast_ref = memref.cast %C_ref : memref<32x32xf32> to memref<*xf32> // call 
@printMaxErrorF32(%cast, %cast_ref) : (memref<*xf32>, memref<*xf32>) -> () // call @printMemrefF32(%cast_ref) : (memref<*xf32>) -> () // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%cast, %cast_ref) : (memref<*xf32>, memref<*xf32>) -> () return } - func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface} func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} func.func private @printMaxErrorF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} diff --git a/test/Integration/Dialect/XeGPU/vector_broadcast_1.mlir b/test/Integration/Dialect/XeGPU/vector_broadcast_1.mlir index 4cbfab6bc..2b6922ec3 100644 --- a/test/Integration/Dialect/XeGPU/vector_broadcast_1.mlir +++ b/test/Integration/Dialect/XeGPU/vector_broadcast_1.mlir @@ -7,47 +7,73 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %bcast : memref<1x32xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + memref.global "private" @__constant_32x32xf16 : memref<32x32xf16> = dense<1.0> + memref.global "private" @__constant_B32x32xf16 : memref<32x32xf16> = dense<2.0> + memref.global "private" @__constant_1x32xf16 : memref<1x32xf16> = dense<10.0> + func.func @test(%A: memref<32x32xf16>, %B: memref<32x32xf16>, %bcast : memref<1x32xf16> ) -> memref<32x32xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<16x16xf16> + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %memref = gpu.alloc host_shared () : memref<32x32xf16> + %memref_1 = gpu.alloc host_shared () : memref<32x32xf16> %memref_2 = gpu.alloc host_shared () : memref<1x32xf16> - memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16> - memref.copy %B, %memref_1 : memref<16x16xf16> to memref<16x16xf16> + memref.copy %A, %memref : memref<32x32xf16> to memref<32x32xf16> + memref.copy %B, %memref_1 : memref<32x32xf16> to memref<32x32xf16> memref.copy %bcast, %memref_2 : memref<1x32xf16> to memref<1x32xf16> - %memref_3 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_3 : memref<8x16xf32>, %memref_2 : memref<1x32xf16>) - gpu.dealloc %memref : memref<8x16xf16> - gpu.dealloc %memref_1 : memref<16x16xf16> + %memref_3 = gpu.alloc host_shared () : memref<32x32xf32> + gpu.launch_func @module0::@test_kernel blocks in (%c4, %c2, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<32x32xf16>, %memref_1 : memref<32x32xf16>, %memref_3 : memref<32x32xf32>, %memref_2 : memref<1x32xf16>) + gpu.dealloc %memref : memref<32x32xf16> + gpu.dealloc %memref_1 : memref<32x32xf16> gpu.dealloc %memref_2 : memref<1x32xf16> - return %memref_3 : memref<8x16xf32> + return %memref_3 : memref<32x32xf32> } gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>, %bcast : memref<1x32xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = 
#spirv.entry_point_abi<>} { + gpu.func @test_kernel(%A: memref<32x32xf16>, %B: memref<32x32xf16>, %Out: memref<32x32xf32>, %bcast : memref<1x32xf16>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index - // load A tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - // load B tile - %b_tile0 = xegpu.create_nd_tdesc %B [%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %val2 = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - // load B cast - %bcast_tile = xegpu.create_nd_tdesc %bcast [%c0, %c0] : memref<1x32xf16> -> !xegpu.tensor_desc<1x32xf16> - %val3 = xegpu.load_nd %bcast_tile : !xegpu.tensor_desc<1x32xf16> -> vector<1x32xf16> - // extract first 16 elems - %val5 = vector.extract_strided_slice %val3 {offsets = [0, 0], strides = [1, 1], sizes = [1, 16]} - : vector<1x32xf16> to vector<1x16xf16> - // broadcast over row dim - %val6 = vector.broadcast %val5 : vector<1x16xf16> to vector<8x16xf16> - // add to A - %val8 = arith.addf %val0, %val6 : vector<8x16xf16> - // do DPAS - %val4 = xegpu.dpas %val8, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %c32 = arith.constant 32 : index + %c8 = arith.constant 8 : index + + %0 = gpu.block_id x + %1 = gpu.block_id y + + %2 = arith.muli %0, %c8 : index + %3 = arith.muli %1, %c16 : index + + %4 = xegpu.create_nd_tdesc %Out[%2, %3] : memref<32x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %5 = xegpu.load_nd %4 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + + %6 = scf.for %arg3 = %c0 to %c32 step %c16 iter_args(%arg4 = %5) -> (vector<8x16xf32>) { + + // load A tile + %a_tile0 = xegpu.create_nd_tdesc %A [%2, %arg3] : memref<32x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %A0_val = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + + // load B tile + %b_tile0 = xegpu.create_nd_tdesc %B [%arg3, %3] : memref<32x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %B0_val = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + + // load B cast + %bcast_tile = xegpu.create_nd_tdesc %bcast [%c0, %c0] : memref<1x32xf16> -> !xegpu.tensor_desc<1x32xf16> + %val3 = xegpu.load_nd %bcast_tile : !xegpu.tensor_desc<1x32xf16> -> vector<1x32xf16> + + // extract first 16 elems + %val5 = vector.extract_strided_slice %val3 {offsets = [0, 0], strides = [1, 1], sizes = [1, 16]} + : vector<1x32xf16> to vector<1x16xf16> + // broadcast over row dim + %val6 = vector.broadcast %val5 : vector<1x16xf16> to vector<8x16xf16> + // add to A + %A0_val8 = arith.addf %A0_val, %val6 : vector<8x16xf16> + + // do DPAS + %dpas = xegpu.dpas %A0_val8, %B0_val, %arg4 : vector<8x16xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32> + + scf.yield %dpas : vector<8x16xf32> + } // store - %out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %val4, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + + xegpu.store_nd %6, %4 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } @@ -57,31 +83,33 @@ module @gemm attributes {gpu.container_module} { %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c1_f32 = 
arith.constant 1.0 : f32 // random init %lower = arith.constant -1.0 : f32 %upper = arith.constant 1.0 : f32 %false = arith.constant 0 : i1 - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> - %bcast = memref.alloc() : memref<1x32xf16> - %Out_cpu = memref.alloc() : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16> + %A = memref.get_global @__constant_32x32xf16 : memref<32x32xf16> + %B = memref.get_global @__constant_B32x32xf16 : memref<32x32xf16> + %bcast = memref.get_global @__constant_1x32xf16 : memref<1x32xf16> + + %Out_cpu = memref.alloc() : memref<32x32xf32> + + %A_random = memref.cast %A : memref<32x32xf16> to memref<*xf16> + %B_random = memref.cast %B : memref<32x32xf16> to memref<*xf16> %bcast_random = memref.cast %bcast : memref<1x32xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - call @fillResource1DRandomF16(%B_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - call @fillResource1DRandomF16(%bcast_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () + // run GPU version - %Out_gpu = call @test(%A, %B, %bcast) : (memref<8x16xf16>, memref<16x16xf16>, memref<1x32xf16>) -> memref<8x16xf32> - %Out_gpu_cast = memref.cast %Out_gpu : memref<8x16xf32> to memref<*xf32> + %Out_gpu = call @test(%A, %B, %bcast) : (memref<32x32xf16>, memref<32x32xf16>, memref<1x32xf16>) -> memref<32x32xf32> + %Out_gpu_cast = memref.cast %Out_gpu : memref<32x32xf32> to memref<*xf32> // run CPU version - scf.for %i = %c0 to %c8 step %c1 { - scf.for %j = %c0 to %c16 step %c1 { + scf.for %i = %c0 to %c32 step %c1 { + scf.for %j = %c0 to %c32 step %c1 { %v0_init = arith.constant 0.0 : f32 - %result:1 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 { - %a0 = memref.load %A[%i, %k] : memref<8x16xf16> - %b0 = memref.load %B[%k, %j] : memref<16x16xf16> + %result:1 = scf.for %k = %c0 to %c32 step %c1 iter_args(%v0 = %v0_init) -> f32 { + %a0 = memref.load %A[%i, %k] : memref<32x32xf16> + %b0 = memref.load %B[%k, %j] : memref<32x32xf16> %bcast_val = memref.load %bcast[%c0, %k] : memref<1x32xf16> %t1 = arith.addf %a0, %bcast_val : f16 %a0_f32 = arith.extf %t1 : f16 to f32 @@ -91,21 +119,15 @@ module @gemm attributes {gpu.container_module} { scf.yield %v0_new : f32 } // only update the first 8x8 of the result, next 8x8 is value 1 - memref.store %result#0, %Out_cpu[%i, %j] : memref<8x16xf32> + memref.store %result#0, %Out_cpu[%i, %j] : memref<32x32xf32> } } - %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to memref<*xf32> + %Out_cpu_cast = memref.cast %Out_cpu : memref<32x32xf32> to memref<*xf32> // print GPU and CPU outs // call @printMemrefF32(%Out_cpu_cast) : (memref<*xf32>) -> () // call @printMemrefF32(%Out_gpu_cast) : (memref<*xf32>) -> () // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () - // dealloc - memref.dealloc %A : memref<8x16xf16> - memref.dealloc %B : memref<16x16xf16> - memref.dealloc %Out_cpu : memref<8x16xf32> - // gpu dealloc - gpu.dealloc %Out_gpu : memref<8x16xf32> return } func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} diff --git a/test/Integration/Dialect/XeGPU/vector_broadcast_2.mlir b/test/Integration/Dialect/XeGPU/vector_broadcast_2.mlir index e523f0c9e..007768ce7 100644 --- 
a/test/Integration/Dialect/XeGPU/vector_broadcast_2.mlir +++ b/test/Integration/Dialect/XeGPU/vector_broadcast_2.mlir @@ -7,49 +7,75 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %bcast : memref<1x32xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + memref.global "private" @__constant_32x32xf16 : memref<32x32xf16> = dense<1.0> + memref.global "private" @__constant_B32x32xf16 : memref<32x32xf16> = dense<2.0> + memref.global "private" @__constant_1x32xf16 : memref<1x32xf16> = dense<10.0> + func.func @test(%A: memref<32x32xf16>, %B: memref<32x32xf16>, %bcast : memref<1x32xf16> ) -> memref<32x32xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<16x16xf16> + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %memref = gpu.alloc host_shared () : memref<32x32xf16> + %memref_1 = gpu.alloc host_shared () : memref<32x32xf16> %memref_2 = gpu.alloc host_shared () : memref<1x32xf16> - memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16> - memref.copy %B, %memref_1 : memref<16x16xf16> to memref<16x16xf16> + memref.copy %A, %memref : memref<32x32xf16> to memref<32x32xf16> + memref.copy %B, %memref_1 : memref<32x32xf16> to memref<32x32xf16> memref.copy %bcast, %memref_2 : memref<1x32xf16> to memref<1x32xf16> - %memref_3 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_3 : memref<8x16xf32>, %memref_2 : memref<1x32xf16>) - gpu.dealloc %memref : memref<8x16xf16> - gpu.dealloc %memref_1 : memref<16x16xf16> + %memref_3 = gpu.alloc host_shared () : memref<32x32xf32> + gpu.launch_func @module0::@test_kernel blocks in (%c4, %c2, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<32x32xf16>, %memref_1 : memref<32x32xf16>, %memref_3 : memref<32x32xf32>, %memref_2 : memref<1x32xf16>) + gpu.dealloc %memref : memref<32x32xf16> + gpu.dealloc %memref_1 : memref<32x32xf16> gpu.dealloc %memref_2 : memref<1x32xf16> - return %memref_3 : memref<8x16xf32> + return %memref_3 : memref<32x32xf32> } gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>, %bcast : memref<1x32xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_kernel(%A: memref<32x32xf16>, %B: memref<32x32xf16>, %Out: memref<32x32xf32>, %bcast : memref<1x32xf16>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index - // load A tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> - // load B tile - %b_tile0 = xegpu.create_nd_tdesc %B [%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %val2 = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - // load B cast - 
%bcast_tile = xegpu.create_nd_tdesc %bcast [%c0, %c0] : memref<1x32xf16> -> !xegpu.tensor_desc<1x32xf16> - %val3 = xegpu.load_nd %bcast_tile : !xegpu.tensor_desc<1x32xf16> -> vector<1x32xf16> - // extract first 8 elems - %val5 = vector.extract_strided_slice %val3 {offsets = [0, 0], strides = [1, 1], sizes = [1, 8]} - : vector<1x32xf16> to vector<1x8xf16> - // reshape and broadcast over col dim - %val6 = vector.shape_cast %val5 : vector<1x8xf16> to vector<8xf16> - %t = vector.shape_cast %val6 : vector<8xf16> to vector<8x1xf16> - %val7 = vector.broadcast %t : vector<8x1xf16> to vector<8x16xf16> - // add to A - %val9 = arith.addf %val0, %val7 : vector<8x16xf16> - // do DPAS - %val4 = xegpu.dpas %val9, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %c32 = arith.constant 32 : index + %c8 = arith.constant 8 : index + + %0 = gpu.block_id x + %1 = gpu.block_id y + + %2 = arith.muli %0, %c8 : index + %3 = arith.muli %1, %c16 : index + + %4 = xegpu.create_nd_tdesc %Out[%2, %3] : memref<32x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %5 = xegpu.load_nd %4 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> + + %6 = scf.for %arg3 = %c0 to %c32 step %c16 iter_args(%arg4 = %5) -> (vector<8x16xf32>) { + + // load A tile + %a_tile0 = xegpu.create_nd_tdesc %A [%2, %arg3] : memref<32x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %A0_val = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + + // load B tile + %b_tile0 = xegpu.create_nd_tdesc %B [%arg3, %3] : memref<32x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %B0_val = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + + // load B cast + %bcast_tile = xegpu.create_nd_tdesc %bcast [%c0, %c0] : memref<1x32xf16> -> !xegpu.tensor_desc<1x32xf16> + %val3 = xegpu.load_nd %bcast_tile : !xegpu.tensor_desc<1x32xf16> -> vector<1x32xf16> + + // extract first 8 elems + %val5 = vector.extract_strided_slice %val3 {offsets = [0, 0], strides = [1, 1], sizes = [1, 8]} + : vector<1x32xf16> to vector<1x8xf16> + // reshape and broadcast over col dim + %val6 = vector.shape_cast %val5 : vector<1x8xf16> to vector<8xf16> + %t = vector.shape_cast %val6 : vector<8xf16> to vector<8x1xf16> + %val7 = vector.broadcast %t : vector<8x1xf16> to vector<8x16xf16> + // add to A + %A0_val8 = arith.addf %A0_val, %val7 : vector<8x16xf16> + + // do DPAS + %dpas = xegpu.dpas %A0_val8, %B0_val, %arg4 : vector<8x16xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32> + + scf.yield %dpas : vector<8x16xf32> + } // store - %out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %val4, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + + xegpu.store_nd %6, %4 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } @@ -59,31 +85,33 @@ module @gemm attributes {gpu.container_module} { %c1 = arith.constant 1 : index %c8 = arith.constant 8 : index %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c1_f32 = arith.constant 1.0 : f32 // random init %lower = arith.constant -1.0 : f32 %upper = arith.constant 1.0 : f32 %false = arith.constant 0 : i1 - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> - %bcast = memref.alloc() : memref<1x32xf16> - %Out_cpu = memref.alloc() : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16> + %A = memref.get_global @__constant_32x32xf16 : memref<32x32xf16> + %B = 
memref.get_global @__constant_B32x32xf16 : memref<32x32xf16> + %bcast = memref.get_global @__constant_1x32xf16 : memref<1x32xf16> + + %Out_cpu = memref.alloc() : memref<32x32xf32> + + %A_random = memref.cast %A : memref<32x32xf16> to memref<*xf16> + %B_random = memref.cast %B : memref<32x32xf16> to memref<*xf16> %bcast_random = memref.cast %bcast : memref<1x32xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - call @fillResource1DRandomF16(%B_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - call @fillResource1DRandomF16(%bcast_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () + // run GPU version - %Out_gpu = call @test(%A, %B, %bcast) : (memref<8x16xf16>, memref<16x16xf16>, memref<1x32xf16>) -> memref<8x16xf32> - %Out_gpu_cast = memref.cast %Out_gpu : memref<8x16xf32> to memref<*xf32> + %Out_gpu = call @test(%A, %B, %bcast) : (memref<32x32xf16>, memref<32x32xf16>, memref<1x32xf16>) -> memref<32x32xf32> + %Out_gpu_cast = memref.cast %Out_gpu : memref<32x32xf32> to memref<*xf32> // run CPU version - scf.for %i = %c0 to %c8 step %c1 { - scf.for %j = %c0 to %c16 step %c1 { + scf.for %i = %c0 to %c32 step %c1 { + scf.for %j = %c0 to %c32 step %c1 { %v0_init = arith.constant 0.0 : f32 - %result:1 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 { - %a0 = memref.load %A[%i, %k] : memref<8x16xf16> - %b0 = memref.load %B[%k, %j] : memref<16x16xf16> + %result:1 = scf.for %k = %c0 to %c32 step %c1 iter_args(%v0 = %v0_init) -> f32 { + %a0 = memref.load %A[%i, %k] : memref<32x32xf16> + %b0 = memref.load %B[%k, %j] : memref<32x32xf16> %bcast_val = memref.load %bcast[%c0, %i] : memref<1x32xf16> %t1 = arith.addf %a0, %bcast_val : f16 %a0_f32 = arith.extf %t1 : f16 to f32 @@ -93,21 +121,15 @@ module @gemm attributes {gpu.container_module} { scf.yield %v0_new : f32 } // only update the first 8x8 of the result, next 8x8 is value 1 - memref.store %result#0, %Out_cpu[%i, %j] : memref<8x16xf32> + memref.store %result#0, %Out_cpu[%i, %j] : memref<32x32xf32> } } - %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to memref<*xf32> + %Out_cpu_cast = memref.cast %Out_cpu : memref<32x32xf32> to memref<*xf32> // print GPU and CPU outs // call @printMemrefF32(%Out_cpu_cast) : (memref<*xf32>) -> () // call @printMemrefF32(%Out_gpu_cast) : (memref<*xf32>) -> () // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () - // dealloc - memref.dealloc %A : memref<8x16xf16> - memref.dealloc %B : memref<16x16xf16> - memref.dealloc %Out_cpu : memref<8x16xf32> - // gpu dealloc - gpu.dealloc %Out_gpu : memref<8x16xf32> return } func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} diff --git a/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_1.vc.mlir b/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_1.vc.mlir index 53494e0cf..46ded4f4f 100644 --- a/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_1.vc.mlir +++ b/test/Integration/Dialect/XeGPU/vector_extract_strided_slice_1.vc.mlir @@ -7,41 +7,63 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A: memref<8x16xf16>, %B: memref<16x16xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + memref.global "private" @__constant_8x32xf16 : 
memref<8x32xf16> = dense<1.0> + memref.global "private" @__constant_16x32xf16 : memref<16x32xf16> = dense<2.0> + + func.func @test(%A: memref<8x32xf16>, %B: memref<16x32xf16> ) -> memref<8x32xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<16x16xf16> - memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16> - memref.copy %B, %memref_1 : memref<16x16xf16> to memref<16x16xf16> - %memref_2 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_2 : memref<8x16xf32>) - gpu.dealloc %memref : memref<8x16xf16> - gpu.dealloc %memref_1 : memref<16x16xf16> - return %memref_2 : memref<8x16xf32> + %memref = gpu.alloc host_shared () : memref<8x32xf16> + %memref_1 = gpu.alloc host_shared () : memref<16x32xf16> + memref.copy %A, %memref : memref<8x32xf16> to memref<8x32xf16> + memref.copy %B, %memref_1 : memref<16x32xf16> to memref<16x32xf16> + %memref_2 = gpu.alloc host_shared () : memref<8x32xf32> + gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x32xf16>, %memref_1 : memref<16x32xf16>, %memref_2 : memref<8x32xf32>) + gpu.dealloc %memref : memref<8x32xf16> + gpu.dealloc %memref_1 : memref<16x32xf16> + return %memref_2 : memref<8x32xf32> } gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%A: memref<8x16xf16>, %B: memref<16x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_kernel(%A: memref<8x32xf16>, %B: memref<16x32xf16>, %C: memref<8x32xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index // load A tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %A0 = xegpu.create_nd_tdesc %A[%c0, %c0] : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %A1 = xegpu.create_nd_tdesc %A[%c0, %c16] : memref<8x32xf16> -> !xegpu.tensor_desc<8x16xf16> + %A0_val = xegpu.load_nd %A0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %A1_val = xegpu.load_nd %A1 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + // load B tile - %b_tile0 = xegpu.create_nd_tdesc %B [%c0, %c0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %val2 = xegpu.load_nd %b_tile0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + %B0 = xegpu.create_nd_tdesc %B[%c0, %c0] : memref<16x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %B1 = xegpu.create_nd_tdesc %B[%c0, %c16] : memref<16x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %B0_val = xegpu.load_nd %B0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + %B1_val = xegpu.load_nd %B1 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + // do DPAS - %val4 = xegpu.dpas %val0, %val2 : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %dpas0 = xegpu.dpas %A0_val, %B0_val : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %dpas1 = xegpu.dpas %A1_val, %B1_val : vector<8x16xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + // extract second 8x8 - %val5 = 
vector.extract_strided_slice %val4 {sizes = [8, 8], strides = [1, 1], offsets = [0, 8]} : vector<8x16xf32> to vector<8x8xf32> + %val5_0 = vector.extract_strided_slice %dpas0 {sizes = [8, 8], strides = [1, 1], offsets = [0, 8]} : vector<8x16xf32> to vector<8x8xf32> + %val5_1 = vector.extract_strided_slice %dpas1 {sizes = [8, 8], strides = [1, 1], offsets = [0, 8]} : vector<8x16xf32> to vector<8x8xf32> + %cst_8x8_flat = arith.constant dense<1.0> : vector<64xf32> %cst_8x8 = vector.shape_cast %cst_8x8_flat : vector<64xf32> to vector<8x8xf32> // shift the first half to left and use %cst_8x8 as the second half - %val6 = vector.shuffle %val5, %cst_8x8 [0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] : vector<8x8xf32>, vector<8x8xf32> - %val7 = vector.shape_cast %val6 : vector<16x8xf32> to vector<8x16xf32> + + %val6_0 = vector.shuffle %val5_0, %cst_8x8 [0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] : vector<8x8xf32>, vector<8x8xf32> + %val6_1 = vector.shuffle %val5_1, %cst_8x8 [0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15] : vector<8x8xf32>, vector<8x8xf32> + + %val7_0 = vector.shape_cast %val6_0 : vector<16x8xf32> to vector<8x16xf32> + %val7_1 = vector.shape_cast %val6_1 : vector<16x8xf32> to vector<8x16xf32> + // store - %out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %val7, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + %out_tile_0 = xegpu.create_nd_tdesc %C [%c0, %c0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32> + %out_tile_1 = xegpu.create_nd_tdesc %C [%c0, %c16] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32> + + xegpu.store_nd %val7_0, %out_tile_0 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %val7_1, %out_tile_1 : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + gpu.return } } @@ -52,51 +74,67 @@ module @gemm attributes {gpu.container_module} { %c8 = arith.constant 8 : index %c16 = arith.constant 16 : index %c1_f32 = arith.constant 1.0 : f32 + %c24 = arith.constant 24 : index + + %c32 = arith.constant 32 : index + // random init %lower = arith.constant -1.0 : f32 %upper = arith.constant 1.0 : f32 %false = arith.constant 0 : i1 - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> - %Out_cpu = memref.alloc() : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - %B_random = memref.cast %B : memref<16x16xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - call @fillResource1DRandomF16(%B_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () + %A = memref.get_global @__constant_8x32xf16 : memref<8x32xf16> + %B =memref.get_global @__constant_16x32xf16 : memref<16x32xf16> + %Out_cpu = memref.alloc() : memref<8x32xf32> // run GPU version - %Out_gpu = call @test(%A, %B) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32> - %Out_gpu_cast = memref.cast %Out_gpu : memref<8x16xf32> to memref<*xf32> + %Out_gpu = call @test(%A, %B) : (memref<8x32xf16>, memref<16x32xf16>) -> memref<8x32xf32> + %Out_gpu_cast = memref.cast %Out_gpu : memref<8x32xf32> to memref<*xf32> // run CPU version scf.for %i = %c0 to %c8 step %c1 { scf.for %j = %c8 to %c16 step %c1 { %v0_init = arith.constant 0.0 : f32 %result:1 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 { - %a0 = memref.load %A[%i, %k] : memref<8x16xf16> - %b0 = memref.load %B[%k, %j] : memref<16x16xf16> + %a0 = memref.load %A[%i, %k] : 
memref<8x32xf16> + %b0 = memref.load %B[%k, %j] : memref<16x32xf16> + %a0_f32 = arith.extf %a0 : f16 to f32 + %b0_f32 = arith.extf %b0 : f16 to f32 + %t0 = arith.mulf %a0_f32, %b0_f32 : f32 + %v0_new = arith.addf %v0, %t0 : f32 + scf.yield %v0_new : f32 + } + // only update the 8x8 of first half of 8x32 of the result, next 8x8 is value 1 + %shifted_j = arith.subi %j, %c8 : index + memref.store %result#0, %Out_cpu[%i, %shifted_j] : memref<8x32xf32> + memref.store %c1_f32, %Out_cpu[%i, %j] : memref<8x32xf32> + } + } + + // run CPU version + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c24 to %c32 step %c1 { + %v0_init = arith.constant 0.0 : f32 + %result:1 = scf.for %k = %c0 to %c16 step %c1 iter_args(%v0 = %v0_init) -> f32 { + %a0 = memref.load %A[%i, %k] : memref<8x32xf16> + %b0 = memref.load %B[%k, %j] : memref<16x32xf16> %a0_f32 = arith.extf %a0 : f16 to f32 %b0_f32 = arith.extf %b0 : f16 to f32 %t0 = arith.mulf %a0_f32, %b0_f32 : f32 %v0_new = arith.addf %v0, %t0 : f32 scf.yield %v0_new : f32 } - // only update the first 8x8 of the result, next 8x8 is value 1 + // only update the 8x8 of second half of 8x32 of the result, next 8x8 is value 1 %shifted_j = arith.subi %j, %c8 : index - memref.store %result#0, %Out_cpu[%i, %shifted_j] : memref<8x16xf32> - memref.store %c1_f32, %Out_cpu[%i, %j] : memref<8x16xf32> + memref.store %result#0, %Out_cpu[%i, %shifted_j] : memref<8x32xf32> + memref.store %c1_f32, %Out_cpu[%i, %j] : memref<8x32xf32> } } - %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to memref<*xf32> + %Out_cpu_cast = memref.cast %Out_cpu : memref<8x32xf32> to memref<*xf32> + // print GPU and CPU outs // call @printMemrefF32(%Out_cpu_cast) : (memref<*xf32>) -> () // call @printMemrefF32(%Out_gpu_cast) : (memref<*xf32>) -> () // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () - // dealloc - memref.dealloc %A : memref<8x16xf16> - memref.dealloc %B : memref<16x16xf16> - memref.dealloc %Out_cpu : memref<8x16xf32> - // gpu dealloc - gpu.dealloc %Out_gpu : memref<8x16xf32> + return } func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} diff --git a/test/Integration/Dialect/XeGPU/vector_insert_1.mlir b/test/Integration/Dialect/XeGPU/vector_insert_1.mlir index 53b57afc8..9ee4a30d0 100644 --- a/test/Integration/Dialect/XeGPU/vector_insert_1.mlir +++ b/test/Integration/Dialect/XeGPU/vector_insert_1.mlir @@ -7,31 +7,30 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A: memref<8x16xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + func.func @test(%A: memref<8x16xf32> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16> + %memref = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %A, %memref : memref<8x16xf32> to memref<8x16xf32> %memref_2 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_2 : memref<8x16xf32>) - gpu.dealloc %memref : memref<8x16xf16> + gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_2 : memref<8x16xf32>) + gpu.dealloc 
%memref : memref<8x16xf32> return %memref_2 : memref<8x16xf32> } gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%A: memref<8x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_kernel(%A: memref<8x16xf32>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index // load tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> // extract row at pos 2 - %a_row = vector.extract %val0 [2] : vector<16xf16> from vector<8x16xf16> + %a_row = vector.extract %val0 [2] : vector<16xf32> from vector<8x16xf32> // insert row at pos 7 - %val3 = vector.insert %a_row, %val0 [7] : vector<16xf16> into vector<8x16xf16> - %val4 = arith.extf %val3 : vector<8x16xf16> to vector<8x16xf32> + %val3 = vector.insert %a_row, %val0 [7] : vector<16xf32> into vector<8x16xf32> // store %out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %val4, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %val3, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } @@ -45,33 +44,29 @@ module @gemm attributes {gpu.container_module} { %c16 = arith.constant 16 : index %c1_f32 = arith.constant 1.0 : f32 %c2_f32 = arith.constant 2.0 : f32 - %cst = arith.constant 2.0 : f16 + %cst = arith.constant 2.0 : f32 // random init %lower = arith.constant -3.0 : f32 %upper = arith.constant 3.0 : f32 %false = arith.constant 0 : i1 - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> + %A = memref.alloc() : memref<8x16xf32> %Out_cpu = memref.alloc() : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - // call @fillResource1DF16(%A_random, %c1_f32) : (memref<*xf16>, f32) -> () + %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32> + call @fillResource1DRandomF32(%A_random, %lower, %upper, %false) : (memref<*xf32>, f32, f32, i1) -> () // run GPU version - %Out_gpu = call @test(%A) : (memref<8x16xf16>) -> memref<8x16xf32> + %Out_gpu = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32> %Out_gpu_cast = memref.cast %Out_gpu : memref<8x16xf32> to memref<*xf32> // run CPU version scf.for %i = %c0 to %c8 step %c1 { scf.for %j = %c0 to %c16 step %c1 { - %v = memref.load %A[%i, %j] : memref<8x16xf16> - %v_f32 = arith.extf %v : f16 to f32 - memref.store %v_f32, %Out_cpu[%i, %j] : memref<8x16xf32> + %v = memref.load %A[%i, %j] : memref<8x16xf32> + memref.store %v, %Out_cpu[%i, %j] : memref<8x16xf32> } } scf.for %i = %c0 to %c16 step %c1 { - %v = memref.load %A[%c2, %i] : memref<8x16xf16> - %v_f32 = arith.extf %v : f16 to f32 - memref.store %v_f32, %Out_cpu[%c7, %i] : memref<8x16xf32> + %v = memref.load %A[%c2, %i] : memref<8x16xf32> + memref.store %v, %Out_cpu[%c7, %i] : memref<8x16xf32> } %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to 
memref<*xf32> @@ -81,15 +76,13 @@ module @gemm attributes {gpu.container_module} { // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () // dealloc - memref.dealloc %A : memref<8x16xf16> - memref.dealloc %B : memref<16x16xf16> + memref.dealloc %A : memref<8x16xf32> memref.dealloc %Out_cpu : memref<8x16xf32> // gpu dealloc gpu.dealloc %Out_gpu : memref<8x16xf32> return } func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface} func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @fillResource1DF16(memref<*xf16>, f32) attributes {llvm.emit_c_interface} + func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface} } diff --git a/test/Integration/Dialect/XeGPU/vector_insert_2.mlir b/test/Integration/Dialect/XeGPU/vector_insert_2.mlir index bcca52e98..aec7eef99 100644 --- a/test/Integration/Dialect/XeGPU/vector_insert_2.mlir +++ b/test/Integration/Dialect/XeGPU/vector_insert_2.mlir @@ -7,31 +7,30 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module} { - func.func @test(%A: memref<8x16xf16> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + func.func @test(%A: memref<8x16xf32> ) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index - %memref = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %A, %memref : memref<8x16xf16> to memref<8x16xf16> + %memref = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %A, %memref : memref<8x16xf32> to memref<8x16xf32> %memref_2 = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf16>, %memref_2 : memref<8x16xf32>) - gpu.dealloc %memref : memref<8x16xf16> + gpu.launch_func @module0::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref : memref<8x16xf32>, %memref_2 : memref<8x16xf32>) + gpu.dealloc %memref : memref<8x16xf32> return %memref_2 : memref<8x16xf32> } gpu.module @module0 attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - gpu.func @test_kernel(%A: memref<8x16xf16>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + gpu.func @test_kernel(%A: memref<8x16xf32>, %Out: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index %c16 = arith.constant 16 : index // load tile - %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16> - %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf16> -> vector<8x16xf16> + %a_tile0 = xegpu.create_nd_tdesc %A [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %val0 = xegpu.load_nd %a_tile0 : !xegpu.tensor_desc<8x16xf32> -> vector<8x16xf32> // define const vector - %cst = arith.constant dense<1.23> : vector<16xf16> + %cst = arith.constant dense<1.23> : vector<16xf32> // insert row at pos 7 - %val3 = vector.insert %cst, %val0 [7] : vector<16xf16> into vector<8x16xf16> - %val4 = arith.extf %val3 : vector<8x16xf16> 
to vector<8x16xf32> + %val3 = vector.insert %cst, %val0 [7] : vector<16xf32> into vector<8x16xf32> // store %out_tile = xegpu.create_nd_tdesc %Out [%c0, %c0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> - xegpu.store_nd %val4, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> + xegpu.store_nd %val3, %out_tile : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32> gpu.return } } @@ -45,32 +44,28 @@ module @gemm attributes {gpu.container_module} { %c16 = arith.constant 16 : index %c1_f32 = arith.constant 1.0 : f32 %c2_f32 = arith.constant 2.0 : f32 - %cst = arith.constant 1.23 : f16 + %cst = arith.constant 1.23 : f32 // random init %lower = arith.constant -3.0 : f32 %upper = arith.constant 3.0 : f32 %false = arith.constant 0 : i1 - %A = memref.alloc() : memref<8x16xf16> - %B = memref.alloc() : memref<16x16xf16> + %A = memref.alloc() : memref<8x16xf32> %Out_cpu = memref.alloc() : memref<8x16xf32> - %A_random = memref.cast %A : memref<8x16xf16> to memref<*xf16> - call @fillResource1DRandomF16(%A_random, %lower, %upper, %false) : (memref<*xf16>, f32, f32, i1) -> () - // call @fillResource1DF16(%A_random, %c1_f32) : (memref<*xf16>, f32) -> () + %A_random = memref.cast %A : memref<8x16xf32> to memref<*xf32> + call @fillResource1DRandomF32(%A_random, %lower, %upper, %false) : (memref<*xf32>, f32, f32, i1) -> () // run GPU version - %Out_gpu = call @test(%A) : (memref<8x16xf16>) -> memref<8x16xf32> + %Out_gpu = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32> %Out_gpu_cast = memref.cast %Out_gpu : memref<8x16xf32> to memref<*xf32> // run CPU version scf.for %i = %c0 to %c8 step %c1 { scf.for %j = %c0 to %c16 step %c1 { - %v = memref.load %A[%i, %j] : memref<8x16xf16> - %v_f32 = arith.extf %v : f16 to f32 - memref.store %v_f32, %Out_cpu[%i, %j] : memref<8x16xf32> + %v = memref.load %A[%i, %j] : memref<8x16xf32> + memref.store %v, %Out_cpu[%i, %j] : memref<8x16xf32> } } scf.for %i = %c0 to %c16 step %c1 { - %cst_f32 = arith.extf %cst : f16 to f32 - memref.store %cst_f32, %Out_cpu[%c7, %i] : memref<8x16xf32> + memref.store %cst, %Out_cpu[%c7, %i] : memref<8x16xf32> } %Out_cpu_cast = memref.cast %Out_cpu : memref<8x16xf32> to memref<*xf32> @@ -80,15 +75,13 @@ module @gemm attributes {gpu.container_module} { // CHECK: [ALLCLOSE: TRUE] call @printAllcloseF32(%Out_gpu_cast, %Out_cpu_cast) : (memref<*xf32>, memref<*xf32>) -> () // dealloc - memref.dealloc %A : memref<8x16xf16> - memref.dealloc %B : memref<16x16xf16> + memref.dealloc %A : memref<8x16xf32> memref.dealloc %Out_cpu : memref<8x16xf32> // gpu dealloc gpu.dealloc %Out_gpu : memref<8x16xf32> return } func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @fillResource1DRandomF16(memref<*xf16>, f32, f32, i1) attributes {llvm.emit_c_interface} + func.func private @fillResource1DRandomF32(memref<*xf32>, f32, f32, i1) attributes {llvm.emit_c_interface} func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface} - func.func private @fillResource1DF16(memref<*xf16>, f32) attributes {llvm.emit_c_interface} } diff --git a/test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir b/test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir index fe13bcc47..c16cc3815 100644 --- a/test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir +++ b/test/Integration/Dialect/XeGPU/xegpu-to-vc.mlir @@ -8,46 +8,77 @@ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck module @gemm attributes {gpu.container_module, spirv.target_env = 
#spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { - memref.global "private" constant @__constant_8x16xf16 : memref<8x16xf16> = dense<5.000000e-01> - memref.global "private" constant @__constant_16x16xf16 : memref<16x16xf16> = dense<1.099610e+00> - func.func @test(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>) -> memref<8x16xf32> { + memref.global "private" constant @__constant_32x32xf16 : memref<32x32xf16> = dense<5.000000e-01> + memref.global "private" constant @__Bconstant_32x32xf16 : memref<32x32xf16> = dense<1.099610e+00> + func.func @test(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf16>) -> memref<32x32xf32> { %c1 = arith.constant 1 : index - %memref_0 = gpu.alloc host_shared () : memref<8x16xf16> - memref.copy %arg0, %memref_0 : memref<8x16xf16> to memref<8x16xf16> - %memref_1 = gpu.alloc host_shared () : memref<16x16xf16> - memref.copy %arg1, %memref_1 : memref<16x16xf16> to memref<16x16xf16> - %memref_c = gpu.alloc host_shared () : memref<8x16xf32> - gpu.launch_func @test_kernel::@test_kernel blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<8x16xf16>, %memref_1 : memref<16x16xf16>, %memref_c : memref<8x16xf32>) - %result = memref.alloc() : memref<8x16xf32> - memref.copy %memref_c, %result: memref<8x16xf32> to memref<8x16xf32> - gpu.dealloc %memref_0 : memref<8x16xf16> - gpu.dealloc %memref_1 : memref<16x16xf16> - gpu.dealloc %memref_c :memref<8x16xf32> + %c4 = arith.constant 4 : index + %c2 = arith.constant 2 : index + %memref_0 = gpu.alloc host_shared () : memref<32x32xf16> + memref.copy %arg0, %memref_0 : memref<32x32xf16> to memref<32x32xf16> + %memref_1 = gpu.alloc host_shared () : memref<32x32xf16> + memref.copy %arg1, %memref_1 : memref<32x32xf16> to memref<32x32xf16> + %memref_c = gpu.alloc host_shared () : memref<32x32xf32> + gpu.launch_func @test_kernel::@test_kernel blocks in (%c4, %c2, %c1) threads in (%c1, %c1, %c1) args(%memref_0 : memref<32x32xf16>, %memref_1 : memref<32x32xf16>, %memref_c : memref<32x32xf32>) + %result = memref.alloc() : memref<32x32xf32> + memref.copy %memref_c, %result: memref<32x32xf32> to memref<32x32xf32> + gpu.dealloc %memref_0 : memref<32x32xf16> + gpu.dealloc %memref_1 : memref<32x32xf16> + gpu.dealloc %memref_c :memref<32x32xf32> - return %result : memref<8x16xf32> + return %result : memref<32x32xf32> } gpu.module @test_kernel { - gpu.func @test_kernel(%arg0: memref<8x16xf16>, %arg1: memref<16x16xf16>, %arg2: memref<8x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>}{ + gpu.func @test_kernel(%arg0: memref<32x32xf16>, %arg1: memref<32x32xf16>, %arg2: memref<32x32xf32>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array, gpu.known_grid_size = array, spirv.entry_point_abi = #spirv.entry_point_abi<>}{ + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c128 = arith.constant 128 : index + %c8 = arith.constant 8 : index + + %0 = gpu.block_id x + %1 = gpu.block_id y + + %2 = arith.muli %0, %c8 : index + %3 = arith.muli %1, %c16 : index + %128 = arith.muli %c8, %c16 : index + %256 = arith.muli %128, %c2 : index + %x = arith.muli %256, %0 : index + %y = arith.muli %128, %1 : index + + %c_index = arith.addi %x, %y : index + %arg02 = memref.reinterpret_cast %arg2 to offset: [0], sizes: [1024], strides: [1] : memref<32x32xf32> to memref<1024xf32> + %C0 = xegpu.create_nd_tdesc %arg02[%c_index] : memref<1024xf32> -> 
!xegpu.tensor_desc<128xf32> + %5 = xegpu.load_nd %C0 : !xegpu.tensor_desc<128xf32> -> vector<128xf32> + + %arg00 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [1024], strides: [1] : memref<32x32xf16> to memref<1024xf16> + + %6 = scf.for %arg3 = %c0 to %c32 step %c16 iter_args(%arg4 = %5) -> (vector<128xf32>) { + %a_index = arith.addi %x, %arg3 : index + %A0 = xegpu.create_nd_tdesc %arg00[%a_index]: memref<1024xf16> -> !xegpu.tensor_desc<128xf16> + %A0_val = xegpu.load_nd %A0 : !xegpu.tensor_desc<128xf16> -> vector<128xf16> + + %B0 = xegpu.create_nd_tdesc %arg1[%arg3, %3] {boundary_check = true} : memref<32x32xf16> -> !xegpu.tensor_desc<16x16xf16> + %B0_val = xegpu.load_nd %B0 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> + + %A0_cast = vector.shape_cast %A0_val : vector<128xf16> to vector<8x8x2xf16> + + %dpas0 = xegpu.dpas %A0_cast, %B0_val : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32> + %dpas0_cast = vector.shape_cast %dpas0: vector<8x16xf32> to vector<128xf32> + + scf.yield %dpas0_cast : vector<128xf32> + } + xegpu.store_nd %6, %C0 : vector<128xf32>, !xegpu.tensor_desc<128xf32> - %arg00 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [128], strides: [1] : memref<8x16xf16> to memref<128xf16> - %0 = xegpu.create_nd_tdesc %arg00[0]: memref<128xf16> -> !xegpu.tensor_desc<128xf16> - %1 = xegpu.create_nd_tdesc %arg1[0, 0] {boundary_check = true} : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16> - %arg02 = memref.reinterpret_cast %arg2 to offset: [0], sizes: [128], strides: [1] : memref<8x16xf32> to memref<128xf32> - %2 = xegpu.create_nd_tdesc %arg02[0] : memref<128xf32> -> !xegpu.tensor_desc<128xf32> - %3 = xegpu.load_nd %0 : !xegpu.tensor_desc<128xf16> -> vector<128xf16> - %4 = xegpu.load_nd %1 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16> - %6 = vector.shape_cast %3: vector<128xf16> to vector<8x8x2xf16> - %5 = xegpu.dpas %6, %4 : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32> - %7 = vector.shape_cast %5: vector<8x16xf32> to vector<128xf32> - xegpu.store_nd %7, %2 : vector<128xf32>, !xegpu.tensor_desc<128xf32> gpu.return } } func.func @main() { - %0 = memref.get_global @__constant_8x16xf16 : memref<8x16xf16> - %1 = memref.get_global @__constant_16x16xf16 : memref<16x16xf16> - %2 = call @test(%0, %1) : (memref<8x16xf16>, memref<16x16xf16>) -> memref<8x16xf32> - %cast = memref.cast %2 : memref<8x16xf32> to memref<*xf32> + %0 = memref.get_global @__constant_32x32xf16 : memref<32x32xf16> + %1 = memref.get_global @__Bconstant_32x32xf16 : memref<32x32xf16> + %2 = call @test(%0, %1) : (memref<32x32xf16>, memref<32x32xf16>) -> memref<32x32xf32> + %cast = memref.cast %2 : memref<32x32xf32> to memref<*xf32> call @printMemrefF32(%cast) : (memref<*xf32>) -> () return } @@ -55,12 +86,36 @@ spirv.target_env = #spirv.target_env<#spirv.vce Date: Wed, 28 Aug 2024 13:50:32 -0500 Subject: [PATCH 2/2] Reimplement the blocking pass with backward dataflow analysis framework. 
(#848) --- include/imex/Dialect/XeTile/IR/XeTileOps.td | 9 +- .../imex/Dialect/XeTile/Transforms/Passes.h | 2 + .../imex/Dialect/XeTile/Transforms/Passes.td | 26 + .../XeTileToXeGPU/XeTileOpConversion.cpp | 13 +- lib/Dialect/XeTile/IR/XeTileOps.cpp | 4 +- lib/Dialect/XeTile/Transforms/Blocking.cpp | 23 +- .../XeTile/Transforms/BlockingAnalysis.cpp | 778 ++++++++++++++++ .../XeTile/Transforms/BlockingAnalysis.h | 68 ++ .../XeTile/Transforms/BlockingRewrite.cpp | 875 ++++++++++++++++++ lib/Dialect/XeTile/Transforms/CMakeLists.txt | 6 +- .../XeTile/Transforms/Canonicalization.cpp | 2 +- lib/Dialect/XeTile/Transforms/PassDetail.h | 4 + test/Conversion/XeTileToXeGPU/sg_softmax.mlir | 4 +- .../XeTileToXeGPU/sg_tiled_softmax.mlir | 4 +- test/Dialect/XeTile/IR/invalid.mlir | 2 +- test/Dialect/XeTile/IR/ops.mlir | 8 +- .../Blocking/persistent_kernel.mlir | 201 ++++ .../Blocking/sg_gemm_1k_1k_1k_f16_f32.mlir | 99 ++ .../sg_gemm_1k_1k_1k_f16_f32_slm.mlir | 45 + .../Blocking/sg_gemm_1k_1k_1k_i8_i32.mlir | 79 ++ .../Transforms/Blocking/unit_tests.mlir | 339 +++++++ test/Dialect/XeTile/Transforms/blocking.mlir | 12 +- .../XeTile/block_reduce_dim_0_fp32.mlir | 2 +- .../XeTile/block_reduce_dim_1_fp32.mlir | 2 +- .../XeTile/block_softmax_dim_0_fp32.mlir | 2 +- .../XeTile/block_softmax_dim_1_fp32.mlir | 2 +- 26 files changed, 2567 insertions(+), 44 deletions(-) create mode 100644 lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp create mode 100644 lib/Dialect/XeTile/Transforms/BlockingAnalysis.h create mode 100644 lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp create mode 100644 test/Dialect/XeTile/Transforms/Blocking/persistent_kernel.mlir create mode 100644 test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_f16_f32.mlir create mode 100644 test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_f16_f32_slm.mlir create mode 100644 test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_i8_i32.mlir create mode 100644 test/Dialect/XeTile/Transforms/Blocking/unit_tests.mlir diff --git a/include/imex/Dialect/XeTile/IR/XeTileOps.td b/include/imex/Dialect/XeTile/IR/XeTileOps.td index 55399a7a9..cf0f6eed5 100644 --- a/include/imex/Dialect/XeTile/IR/XeTileOps.td +++ b/include/imex/Dialect/XeTile/IR/XeTileOps.td @@ -447,6 +447,9 @@ def XeTile_TileMMAOp : XeTile_Op<"tile_mma", []> { mlir::Type getElementType() { return getA().getType().getElementType(); } + mlir::VectorType getOutputType() { + return getOutput().getType(); + } }]; let hasVerifier = 1; @@ -581,7 +584,7 @@ def XeTile_TransposeOp: XeTile_Op<"transpose", []> { let hasVerifier = 1; } -def XeTile_ReduceOp: XeTile_Op<"reduce", []> { +def XeTile_ReductionOp: XeTile_Op<"reduction", []> { let summary = "performs a reduction operation over a 2D vector."; let description = [{ It has the same semantics as the `vector.multi_reduction`, @@ -591,10 +594,10 @@ def XeTile_ReduceOp: XeTile_Op<"reduce", []> { let arguments = (ins Vector_CombiningKindAttr: $kind, XeTile_2DOr4DVector: $source, - DenseI64ArrayAttr: $reduction_dim); + DenseI64ArrayAttr: $reduction_dims); let results = (outs XeTile_2DOr4DVector: $result); let assemblyFormat = [{ - $kind `,` $source $reduction_dim attr-dict `:` type($source) `->` type($result) + $kind `,` $source $reduction_dims attr-dict `:` type($source) `->` type($result) }]; let hasVerifier = 1; diff --git a/include/imex/Dialect/XeTile/Transforms/Passes.h b/include/imex/Dialect/XeTile/Transforms/Passes.h index 0f1d948a4..91b002c76 100644 --- a/include/imex/Dialect/XeTile/Transforms/Passes.h +++ 
b/include/imex/Dialect/XeTile/Transforms/Passes.h @@ -40,6 +40,8 @@ std::unique_ptr createXeTileInitDuplicatePass(); std::unique_ptr createXeTileBlockingPass(const std::string &device = "pvc"); +std::unique_ptr +createNewXeTileBlockingPass(const std::string &device = "pvc"); std::unique_ptr createXeTileBlockAligningPass(); std::unique_ptr createXeTileWgToSgPass(); std::unique_ptr createXeTileOptimizeTransposePass(); diff --git a/include/imex/Dialect/XeTile/Transforms/Passes.td b/include/imex/Dialect/XeTile/Transforms/Passes.td index d0737931c..242a90a5e 100644 --- a/include/imex/Dialect/XeTile/Transforms/Passes.td +++ b/include/imex/Dialect/XeTile/Transforms/Passes.td @@ -130,5 +130,31 @@ def XeTileCanonicalization : Pass<"xetile-canonicalization", "::mlir::gpu::GPUMo ]; } +def NewXeTileBlocking : Pass<"new-xetile-blocking", "::mlir::gpu::GPUModuleOp">{ + let summary = "transform XeTile large tiles(input) into arrays of smaller " + "blocks with appropriate size, such that the operator on each " + "of the blocks can be mapped into one hardware instruction."; + + let description = [{ + This transform pass preprocesses the xetile program by decomposing large XeTile tiles + into smaller ones that can be handled by a hardware instruction. It is going to replace + the xetile-blocking pass. + }]; + + let constructor = "imex::createNewXeTileBlockingPass()"; + let dependentDialects = ["imex::xetile::XeTileDialect", + "mlir::arith::ArithDialect", + "mlir::math::MathDialect", + "mlir::gpu::GPUDialect", + "mlir::memref::MemRefDialect", + "mlir::vector::VectorDialect"]; + + let options = [ + Option<"device", "device", "std::string", + /*default=*/"\"pvc\"", + "gpu platform architecture where these ops are running"> + ]; +} + #endif // _XeTile_PASSES_TD_INCLUDED_ diff --git a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp index d471c8f1b..83d392e54 100644 --- a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp +++ b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp @@ -736,15 +736,16 @@ extern llvm::SmallVector lowerInnerReductionWithVectorReduction( mlir::vector::CombiningKind kind, mlir::Location loc, mlir::Type elemTy, XeOneToNPatternRewriter &rewriter); -struct SgTileReduceOpPattern : public XeOneToNConversion { - using XeOneToNConversion::XeOneToNConversion; +struct SgTileReductionOpPattern + : public XeOneToNConversion { + using XeOneToNConversion::XeOneToNConversion; mlir::LogicalResult - matchAndRewrite(xetile::ReduceOp op, OpAdaptor adaptor, + matchAndRewrite(xetile::ReductionOp op, OpAdaptor adaptor, XeOneToNPatternRewriter &rewriter) const override { auto srcTy = op.getSource().getType(); auto elemTy = srcTy.getElementType(); - auto dims = op.getReductionDim(); + auto dims = op.getReductionDims(); // its input should be a 4D vector, and has 2 reduction dims, // otherwise run the blocking pass first. 
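+    // Illustrative example (shapes are hypothetical, not taken from a test):
+    // reducing dim 1 of a vector<64x64xf16> that was blocked with inner
+    // blocks [1, 16] reaches this pattern as a vector<64x4x1x16xf16> whose
+    // reduction dims are [1, 3], i.e. one grid dim plus one inner-block dim.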
if (dims.size() != 2 || srcTy.getRank() != 4) @@ -1092,8 +1093,8 @@ void populateXeTileOpConversionPatterns(imex::XeOneToNTypeConverter &converter, SgTileMMAOpPattern, SgUpdateTileOffsetOpPattern, SgTransposeOpPattern, SgTransposeOpPattern, SgBroadcastOpPattern, - SgTileReduceOpPattern, SgVectorCreateMaskOpPattern>(patterns.getContext(), - converter, analysis); + SgTileReductionOpPattern, SgVectorCreateMaskOpPattern>( + patterns.getContext(), converter, analysis); patterns.insert, ElementWiseOpPattern, ElementWiseOpPattern, diff --git a/lib/Dialect/XeTile/IR/XeTileOps.cpp b/lib/Dialect/XeTile/IR/XeTileOps.cpp index d52caf164..060b93a69 100644 --- a/lib/Dialect/XeTile/IR/XeTileOps.cpp +++ b/lib/Dialect/XeTile/IR/XeTileOps.cpp @@ -859,8 +859,8 @@ mlir::LogicalResult TransposeOp::verify() { return mlir::success(); } -mlir::LogicalResult ReduceOp::verify() { - auto dims = getReductionDim(); +mlir::LogicalResult ReductionOp::verify() { + auto dims = getReductionDims(); auto resShape = getResult().getType().getShape(); for (auto i : dims) if (resShape[i] != 1) diff --git a/lib/Dialect/XeTile/Transforms/Blocking.cpp b/lib/Dialect/XeTile/Transforms/Blocking.cpp index fdefa8e76..7bbec87d1 100644 --- a/lib/Dialect/XeTile/Transforms/Blocking.cpp +++ b/lib/Dialect/XeTile/Transforms/Blocking.cpp @@ -556,15 +556,16 @@ struct VectorMultiDimReductionOpPattern } }; -struct TileReduceOpPattern - : public XeTileConversion { +struct TileReductionOpPattern + : public XeTileConversion { - using XeTileConversion::XeTileConversion; + using XeTileConversion::XeTileConversion; - TileReduceOpPattern(mlir::MLIRContext *context, - imex::XeTypeConverter &converter, - TileUsageAnalysis &analysis, - std::shared_ptr ptruArch) + TileReductionOpPattern(mlir::MLIRContext *context, + imex::XeTypeConverter &converter, + TileUsageAnalysis &analysis, + std::shared_ptr ptruArch) : XeTileConversion(context, converter, analysis) { this->uArchInterface = ptruArch; } @@ -572,13 +573,13 @@ struct TileReduceOpPattern std::shared_ptr uArchInterface = nullptr; mlir::LogicalResult - matchAndRewrite(xetile::ReduceOp op, OpAdaptor adaptor, + matchAndRewrite(xetile::ReductionOp op, OpAdaptor adaptor, OpPatternRewriter &rewriter) const override { auto loc = op.getLoc(); auto srcTy = op.getSource().getType(); auto elemTy = srcTy.getElementType(); auto shape = srcTy.getShape(); - auto reductionDims = op.getReductionDim(); + auto reductionDims = op.getReductionDims(); if (srcTy.getRank() != 2 || reductionDims.size() != 1) return rewriter.notifyMatchFailure( @@ -611,7 +612,7 @@ struct TileReduceOpPattern auto newSource = addPackOp(adaptor.getSource(), {blkSizes[0], blkSizes[1]}, rewriter); - auto newDest = rewriter.create( + auto newDest = rewriter.create( loc, newDestType, op.getKind(), newSource, newReductionDims); auto unpack = addUnpackOp(newDest.getResult(), rewriter); rewriter.replaceOp(op, unpack); @@ -1161,7 +1162,7 @@ void populateXeTileBlockingPatterns( VectorizableOpPattern, SCFForOpPattern, SCFYieldOpPattern, InitTileOpPattern, LoadTileOpPattern, StoreTileOpPattern, TileMMAOpPattern, UpdateTileOffsetOpPattern, - VectorMultiDimReductionOpPattern, TileReduceOpPattern, + VectorMultiDimReductionOpPattern, TileReductionOpPattern, TileBroadcastOpPattern>(patterns.getContext(), converter, analysis, ptruArch); patterns.insert, diff --git a/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp b/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp new file mode 100644 index 000000000..ada144241 --- /dev/null +++ 
b/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp @@ -0,0 +1,778 @@ +#include +#include +#include + +#include "BlockingAnalysis.h" + +namespace llvm { +using imex::Block; +// Implementation of llvm::DenseMapInfo for Block, required for +// using Block as a value in DenseMap. +template <> struct DenseMapInfo { + static inline Block getEmptyKey() { + return Block(-1, -1); // the empty key + } + + static inline Block getTombstoneKey() { + return Block(-2, -2); // the tombstone key + } + + static unsigned getHashValue(const Block &b) { + return hash_combine(b[0], b[1]); + } + + static bool isEqual(const Block &lhs, const Block &rhs) { return lhs == rhs; } +}; +} // namespace llvm + +namespace imex { + +// ===------------------ Block Implementation --------------------------===// + +int64_t &Block::operator[](size_t index) { + assert(index < 2 && "Index out of bounds"); + return values[index]; +} + +const int64_t &Block::operator[](size_t index) const { + assert(index < 2 && "Index out of bounds"); + return values[index]; +} + +bool Block::operator==(Block &other) const { + return values[0] == other.values[0] && values[1] == other.values[1]; +} + +bool Block::operator==(const Block &other) const { + return values[0] == other.values[0] && values[1] == other.values[1]; +} + +void Block::print(llvm::raw_ostream &os) const { + os << "[" << values[0] << ", " << values[1] << "]"; +} + +llvm::ArrayRef Block::asArrayRef() const { + return llvm::ArrayRef(values, 2); +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, Block blk) { + blk.print(os); + return os; +} + +// ===------------------ BlockRequests Implementation --------------------===// +// A class holding all blocking requests for a given mlir::Value. +// For convience, it also tracks the UsePoint of the value. +class BlockingRequests { +public: + BlockingRequests() = default; + BlockingRequests(int64_t h, int64_t w, mlir::Operation *user, int64_t pos) + : BlockingRequests(h, w, UsePoint(user, pos)) {} + + BlockingRequests(int64_t h, int64_t w, UsePoint point) + : BlockingRequests(Block(h, w), point) {} + + BlockingRequests(llvm::ArrayRef shape, UsePoint point) + : BlockingRequests(shape[0], shape[1], point) { + assert(shape.size() == 2 && "Invalid block size."); + } + + BlockingRequests(Block block, UsePoint point); + + bool operator==(const BlockingRequests &other) const; + bool operator!=(const BlockingRequests &other) const; + + Block getDefBlock() const; + Block getUseBlock(UsePoint point) const; + + void print(llvm::raw_ostream &os) const; + + static BlockingRequests meet(const BlockingRequests &lhs, + const BlockingRequests &rhs); + + static BlockingRequests join(const BlockingRequests &lhs, + const BlockingRequests &rhs); + + // indicate that one use of the result operand + // has decided on the inner block size. 
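+  // For example, once a store user of a load result has recorded its
+  // preferred block shape here, isInitialized() returns true and later
+  // visits can reconcile that request with requests from other users.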
+ bool isInitialized() const { return requests.size() != 0; } + + int64_t getNumUniqRequests() const { return getRequests().size(); } + + llvm::SmallVector getRequests() const { + llvm::SmallDenseSet reqs; + for (auto [point, block] : requests) + reqs.insert(block); + return llvm::SmallVector(reqs.begin(), reqs.end()); + } + + void updateDefBlock(Block block) { def = block; } + +private: + Block def; + llvm::DenseMap requests; +}; + +BlockingRequests::BlockingRequests(Block block, UsePoint point) { + assert(block && "Invalid block."); + requests.try_emplace(point, block); +} + +Block BlockingRequests::getDefBlock() const { + if (def) + return def; + if (requests.size()) + return (requests.begin()->second); + return Block(); +} + +Block BlockingRequests::getUseBlock(UsePoint point) const { + return requests.lookup(point); +} + +void BlockingRequests::print(llvm::raw_ostream &os) const { + if (!isInitialized()) { + os << "Uninitialized"; + } else { + os << "Requests (" << requests.size() << ", " + << "def: " << def << "): "; + for (auto [i, iter] : llvm::enumerate(requests)) { + os << "(" << *(iter.first).first << ", " << (iter.first).second + << "): \n\t" << iter.second; + if (i != requests.size() - 1) + os << ", "; + } + } +} + +bool BlockingRequests::operator==(const BlockingRequests &other) const { + return requests == other.requests; +} + +bool BlockingRequests::operator!=(const BlockingRequests &other) const { + return !(*this == other); +} + +BlockingRequests BlockingRequests::meet(const BlockingRequests &lhs, + const BlockingRequests &rhs) { + return join(lhs, rhs); +} + +BlockingRequests BlockingRequests::join(const BlockingRequests &lhs, + const BlockingRequests &rhs) { + BlockingRequests newReq; + if (lhs.isInitialized()) { + for (auto [point, block] : lhs.requests) { + newReq.requests.try_emplace(point, block); + } + } + if (rhs.isInitialized()) { + for (auto [point, block] : rhs.requests) { + newReq.requests.try_emplace(point, block); + } + } + return newReq; +} + +llvm::raw_ostream &operator<<(llvm::raw_ostream &os, + BlockingRequests requests) { + requests.print(os); + return os; +} + +// ===---------------- BlockingLattice Implementation -----------------===// +// A lattice wrapper for BlockingRequests +struct BlockingLattice : public mlir::dataflow::Lattice { + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(BlockingLattice) + using Lattice::Lattice; + + mlir::ChangeResult join(const AbstractSparseLattice &rhs) override { + return join(static_cast(rhs).getValue()); + } + + mlir::ChangeResult join(const BlockingRequests &other) { + auto &val = getValue(); + BlockingRequests newValue = BlockingRequests::join(val, other); + if (newValue == val) + return mlir::ChangeResult::NoChange; + val = newValue; + return mlir::ChangeResult::Change; + } +}; + +// ===----------------------BlockingAnalysisImpl ---------------------===// +class BlockingAnalysisImpl + : public mlir::dataflow::SparseBackwardDataFlowAnalysis { +public: + BlockingAnalysisImpl(mlir::DataFlowSolver &solver, + mlir::SymbolTableCollection &symbolTable, + std::shared_ptr uArch) + : SparseBackwardDataFlowAnalysis(solver, symbolTable), uArch(uArch) {} + + void visitOperation(mlir::Operation *op, + mlir::ArrayRef operands, + mlir::ArrayRef results) override; + + void visitBranchOperand(mlir::OpOperand &operand) override {} + + void visitCallOperand(mlir::OpOperand &operand) override {} + + void setToExitState(BlockingLattice *lattice) override {} + +private: + void visitPrefetchTileOp(xetile::PrefetchTileOp op, + 
mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitLoadTileOp(xetile::LoadTileOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitStoreTileOp(xetile::StoreTileOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitUpdateTileOp(xetile::UpdateTileOffsetOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitTileMMAOp(xetile::TileMMAOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitVectorizableOp(mlir::Operation *op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitShapecastOp(mlir::vector::ShapeCastOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitReductionOp(xetile::ReductionOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitBroadcastOp(xetile::BroadcastOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + void visitTransposeOp(xetile::TransposeOp op, + mlir::ArrayRef operands, + mlir::ArrayRef results); + + int getMaxSLMBlockSize(int elemBitWidth, int height); + + template + Block getInnerBlockSize(mlir::Operation *op, mlir::Type elemTy, + llvm::ArrayRef &shape, + int memorySpace = 0); + + llvm::SmallVector + getMMASize(mlir::Type elemTy, const int APrecision, const int BPrecision, + const int CPrecision, const int DPrecision); + +private: + std::shared_ptr uArch = nullptr; +}; + +void BlockingAnalysisImpl::visitOperation( + mlir::Operation *op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + + if (auto updateTileOp = mlir::dyn_cast(op)) + visitUpdateTileOp(updateTileOp, operands, results); + + if (auto prefetchOp = mlir::dyn_cast(op)) + visitPrefetchTileOp(prefetchOp, operands, results); + + if (auto loadOp = mlir::dyn_cast(op)) + visitLoadTileOp(loadOp, operands, results); + + if (auto storeOp = mlir::dyn_cast(op)) + visitStoreTileOp(storeOp, operands, results); + + if (auto tileMMAOp = mlir::dyn_cast(op)) + visitTileMMAOp(tileMMAOp, operands, results); + + if (auto reductionOp = mlir::dyn_cast(op)) + visitReductionOp(reductionOp, operands, results); + + if (auto transposeOp = mlir::dyn_cast(op)) + visitTransposeOp(transposeOp, operands, results); + + if (auto broadcastOp = mlir::dyn_cast(op)) + visitBroadcastOp(broadcastOp, operands, results); + + if (op->hasTrait()) + visitVectorizableOp(op, operands, results); + + if (auto shapecastOp = mlir::dyn_cast(op)) + visitShapecastOp(shapecastOp, operands, results); +} + +void BlockingAnalysisImpl::visitPrefetchTileOp( + xetile::PrefetchTileOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto tileTy = op.getTile().getType(); + auto elemTy = tileTy.getElementType(); + auto shape = tileTy.getShape(); + auto memSpace = tileTy.getMemoryScopeAsInt(); + // initialized with a default size queried from the architecture + auto size = getInnerBlockSize(op, elemTy, shape, memSpace); + if (!size) + return; // do nothing if didnot get a valid block size + auto BlockingRequest = BlockingRequests(size, UsePoint(op, 0)); + propagateIfChanged(operands[0], operands[0]->join(BlockingRequest)); +} + +void BlockingAnalysisImpl::visitLoadTileOp( + xetile::LoadTileOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto lattice = results[0]->getValue(); + + if (lattice.getNumUniqRequests() > 1) + op.emitWarning("multiple users requesting different blocking sizes."); + + auto tileTy = op.getSource().getType(); + auto elemTy = tileTy.getElementType(); + auto shape = tileTy.getShape(); + auto memSpace = tileTy.getMemoryScopeAsInt(); + // initialized with a 
default size queried from the architecture + Block block = getInnerBlockSize(op, elemTy, shape, memSpace); + + // It has users but users' requirements are not available yet. + // Worth to wait until all users are visited. + if (!op.getValue().use_empty() && !lattice.isInitialized()) + return; + + // adjust according to user's requirements if it is available + if (lattice.isInitialized()) { + // align the height dimension if user is a transpose op, + // otherwise align the width dimension to minimize the + // in-register data movements. + bool hasTransposeUser = op.getValue().hasOneUse() && + mlir::isa(*(op->user_begin())); + + int dim = hasTransposeUser ? 0 : 1; + for (auto rq : lattice.getRequests()) + block[dim] = std::min(block[dim], rq[dim]); + } + + if (!block) + return; // do nothing if didnot get a valid block size + + auto BlockingRequest = BlockingRequests(block, UsePoint({op, 0})); + // propagate the blocking size to its def op + propagateIfChanged(operands[0], operands[0]->join(BlockingRequest)); + + // update the def block size for the result value + BlockingRequests &def = getLatticeElement(op.getValue())->getValue(); + def.updateDefBlock(block); +} + +void BlockingAnalysisImpl::visitStoreTileOp( + xetile::StoreTileOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto tileTy = op.getTile().getType(); + auto elemTy = tileTy.getElementType(); + auto shape = tileTy.getShape(); + auto memSpace = tileTy.getMemoryScopeAsInt(); + auto size = getInnerBlockSize(op, elemTy, shape, memSpace); + + if (!size) + return; // do nothing if didnot get a valid block size + + for (auto &&[i, inputOpr] : llvm::enumerate(operands)) { + auto blockingRequest = BlockingRequests(size, UsePoint(op, i)); + propagateIfChanged(inputOpr, inputOpr->join(blockingRequest)); + } +} + +void BlockingAnalysisImpl::visitUpdateTileOp( + xetile::UpdateTileOffsetOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto lattice = results[0]->getValue(); + if (lattice.isInitialized()) { + auto block = lattice.getRequests()[0]; + auto request = BlockingRequests(block, UsePoint(op, 0)); + propagateIfChanged(operands[0], operands[0]->join(request)); + } +} + +void BlockingAnalysisImpl::visitTileMMAOp( + xetile::TileMMAOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + + auto getElemBitWidth = [](mlir::VectorType vecTy) { + return vecTy.getElementType().getIntOrFloatBitWidth(); + }; + + auto C = op.getC(); + auto aPrecision = getElemBitWidth(op.getAType()); + auto bPrecision = getElemBitWidth(op.getBType()); + auto dPrecision = getElemBitWidth(op.getOutputType()); + auto cPrecision = !C ? 
dPrecision : getElemBitWidth(C.getType()); + + auto mmaSize = getMMASize(op.getElementType(), aPrecision, bPrecision, + cPrecision, dPrecision); + + auto blockSizeForA = + BlockingRequests(mmaSize[0], mmaSize[1], UsePoint({op, 0})); + auto blockSizeForB = + BlockingRequests(mmaSize[1], mmaSize[2], UsePoint({op, 1})); + + propagateIfChanged(operands[0], operands[0]->join(blockSizeForA)); + propagateIfChanged(operands[1], operands[1]->join(blockSizeForB)); + if (C) { + auto blockSizeForC = + BlockingRequests(mmaSize[0], mmaSize[2], UsePoint(op, 2)); + propagateIfChanged(operands[2], operands[2]->join(blockSizeForC)); + } + + // update the def block size for the result value + BlockingRequests &def = getLatticeElement(op.getOutput())->getValue(); + def.updateDefBlock(Block(mmaSize[0], mmaSize[2])); +} + +void BlockingAnalysisImpl::visitReductionOp( + xetile::ReductionOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto srcTy = op.getSource().getType(); + auto dims = op.getReductionDims(); + // We only support reduction on 2D types now. + if (srcTy.getRank() != 2 || dims.size() != 1) + return; + + auto elemTy = srcTy.getElementType(); + auto shape = srcTy.getShape(); + // ReductionOp is special. Its blocking size is fixed to {1, + // min(subgroupSize, width)} + auto size = getInnerBlockSize(op, elemTy, shape); + if (!size) + return; // do nothing if didnot get a valid block size + + auto blockingRequest = BlockingRequests(size, UsePoint(op, 0)); + propagateIfChanged(operands[0], operands[0]->join(blockingRequest)); +} + +void BlockingAnalysisImpl::visitBroadcastOp( + xetile::BroadcastOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto srcTy = op.getSource().getType(); + auto dims = op.getBroadcastDim(); + // We only support reduction on 2D types now. + if (srcTy.getRank() != 2 || dims.size() != 1) + return; + + auto elemTy = srcTy.getElementType(); + auto shape = srcTy.getShape(); + // BroadcastOp is special. Its blocking size is fixed to {1, + // min(subgroupSize, width)} + auto size = getInnerBlockSize(op, elemTy, shape); + if (!size) + return; // do nothing if didnot get a valid block size + + auto blockingRequest = BlockingRequests(size, UsePoint(op, 0)); + propagateIfChanged(operands[0], operands[0]->join(blockingRequest)); +} + +void BlockingAnalysisImpl::visitTransposeOp( + xetile::TransposeOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + + auto permutation = op.getPermutation(); + auto resType = op.getResult().getType(); + // we only support true 2D transpose now + if (resType.getRank() != 2 || permutation != mlir::ArrayRef({1, 0})) + return; + + auto lattice = results[0]->getValue(); + + // Wait for requests from users. + if (!op->use_empty() && !lattice.isInitialized()) + return; + + Block block; + + // use the default size if no users + if (op->use_empty()) { + auto srcTy = op.getVector().getType(); + auto shape = srcTy.getShape(); + block = getInnerBlockSize(op, srcTy.getElementType(), shape); + } + + // TransposeOp determines its blocking size based on requests from + // its users, by swapping the blocking size of its users. 
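+  // For example (hypothetical request): if the single user asks for an
+  // [8, 16] block of the transpose result, the operand is asked for [16, 8].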
+ if (lattice.isInitialized()) { + // TODO: handle multiple users + if (lattice.getNumUniqRequests() == 1) { + auto req = lattice.getRequests()[0]; + block = Block(req[1], req[0]); + } + } + + if (!block) + return; // do nothing if didnot get a valid block size + + auto request = BlockingRequests(block, UsePoint(op, 0)); + propagateIfChanged(operands[0], operands[0]->join(request)); + + // update the def block size for the result value + BlockingRequests &def = getLatticeElement(op.getResult())->getValue(); + def.updateDefBlock(Block(block[1], block[0])); +} + +void BlockingAnalysisImpl::visitVectorizableOp( + mlir::Operation *op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + // Currently only supports simple elementwise math ops. + if (op->getNumResults() != 1) + return; + + auto type = mlir::dyn_cast(op->getResult(0).getType()); + if (!type) + return; + + auto lattice = results[0]->getValue(); + + // Wait for requests from users. + if (!op->use_empty() && !lattice.isInitialized()) + return; + + auto elemTy = type.getElementType(); + auto shape = type.getShape(); + Block block = getInnerBlockSize(op, elemTy, shape); + + // elementwise operations are not sensitive to the block size. + // It will use the block size requested by its users. + if (lattice.isInitialized()) { + block[0] = 0; + for (auto &req : lattice.getRequests()) { + block[0] = std::max(block[0], req[0]); + block[1] = std::min(block[1], req[1]); + } + } + + // do nothing if get an invalid block + if (!block) + return; + + // propagate the block size on its operands + for (auto &&[i, inputOpr] : llvm::enumerate(operands)) { + auto req = BlockingRequests(block, UsePoint(op, i)); + propagateIfChanged(inputOpr, inputOpr->join(req)); + } + + // update the def block size for the result value + BlockingRequests &def = getLatticeElement(op->getResult(0))->getValue(); + def.updateDefBlock(block); +} + +void BlockingAnalysisImpl::visitShapecastOp( + mlir::vector::ShapeCastOp op, mlir::ArrayRef operands, + mlir::ArrayRef results) { + auto shape = op.getSource().getType().getShape(); + if (shape.size() == 2) { + auto BlockingRequest = BlockingRequests(shape, UsePoint(op, 0)); + propagateIfChanged(operands[0], operands[0]->join(BlockingRequest)); + } +} + +int BlockingAnalysisImpl::getMaxSLMBlockSize(int elemBitWidth, int height) { + // TODO: use uArch to get max vec size? + const int lscConstraint = 512; // lsc supports upto 512 bytes per load/store + int numElems = (lscConstraint * 8) / elemBitWidth; + int width = numElems / height; + return width; +} + +// Determine the inner block size for the given operation based on the +// operand's element data type, shape, and also memory space. +template +Block BlockingAnalysisImpl::getInnerBlockSize( + mlir::Operation *op, mlir::Type elemTy, llvm::ArrayRef &shape, + int memorySpace) { + assert(elemTy.isIntOrFloat() && "only support int or float element type."); + + // TODO: get from uArch ? + const int64_t subgroupSize = 16; + int elemSize = elemTy.getIntOrFloatBitWidth(); + + int maxHeight = 0, minHeight = 0, maxWidth = 0, minWidth = 0; + if (mlir::isa(op) || + mlir::isa(op)) { + // for reduction and broadcast ops, we simply using + // [1, subgroupSize] as innerblock size + maxWidth = subgroupSize; + minWidth = 1; + maxHeight = 1; + minHeight = 1; + } else if (op->hasTrait()) { + // for elementwise operations, they are pretty flexiable + // on the block size. But we expect its second dimension + // is subgroupSize aligned. 
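+    // For example (hypothetical shape): a 64x64 elementwise op with
+    // subgroupSize = 16 gets minHeight = minWidth = 1, maxHeight = 64 and
+    // maxWidth = 16, so the default block computed below is [64, 16].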
+ minWidth = 1; + minHeight = 1; + maxWidth = std::min(shape[1], subgroupSize); + maxHeight = shape[0]; + } else if (mlir::isa(op)) { + // for transpose op, we will use the original shape + // as the default size, and adjust it if it is defined + // by a load op + minWidth = 1; + minHeight = 1; + maxWidth = shape[1]; + maxHeight = shape[0]; + + // if the transpose follows a load op, and data element is 32-bit + // or 64-bit, it is expected to be folded with a load, and need to + // be aligned to hardware constraints. + auto defOp = op->getOperand(0).getDefiningOp(); + if (defOp && elemSize >= 32) { + auto params = uArch->get2DLoadConfig(defOp, elemSize, false, true); + minHeight = params->blockHeight.min; + minWidth = params->blockWidth.min; + // to be compatible with the SIMT instrinsic, the maximum height is + // limited to 16, which is maximum supported value by SIMT instrinsic. + maxHeight = std::min(params->blockHeight.max, 16); + maxWidth = params->blockWidth.max; + } + } else if (memorySpace == 3) { + // this is supposed for load/store from/to SLM, they will use regular + // load/store instructions with chunk size. lsc instrinsic and hardware + // has serveral limits on the size per load/store. + minHeight = minWidth = 1; + // If shape[0] is divisible by subgroup size, we use regular load (with + // chunk size) with XeGPU.load_gather (maxHeight = 16). Otherwise, we + // use 1D load with XeGPU.load_nd(1d, maxHeight = 1). + maxHeight = shape[0] % subgroupSize == 0 ? subgroupSize : 1; + maxWidth = getMaxSLMBlockSize(elemSize, maxHeight); + } else { // for load/store from/to global memory + mlir::FailureOr params; + if (mlir::isa(op)) + params = uArch->get2DStoreConfig(elemSize); + if (mlir::isa(op) || + mlir::isa(op)) { + bool transpose = false; + // if its user is a transpose op, and data element is 32-bit + // or 64-bit, we will use the transpose supported size. + if (auto loadOp = mlir::dyn_cast(op)) { + auto value = loadOp.getValue(); + transpose = elemSize >= 32 && value.hasOneUse() && + mlir::isa(*(value.user_begin())); + } + params = uArch->get2DLoadConfig(op, elemSize, false, transpose); + } + if (mlir::succeeded(params)) { + maxHeight = params->blockHeight.max; + minHeight = params->blockHeight.min; + maxWidth = params->blockWidth.max; + minWidth = params->blockWidth.min; + } + } + + auto findLargestDivisorInRange = [&](int64_t v, int64_t l, int64_t h) { + for (int i = h; i >= l; i--) { + if (v % i == 0) + return i; + } + // irregular shape or shape is not in the supported range. + return 0; + }; + + auto height = findLargestDivisorInRange(shape[0], minHeight, maxHeight); + auto width = findLargestDivisorInRange(shape[1], minWidth, maxWidth); + return Block(height, width); +} + +llvm::SmallVector +BlockingAnalysisImpl::getMMASize(mlir::Type elemTy, const int APrecision, + const int BPrecision, const int CPrecision, + const int DPrecision) { + assert(elemTy.isIntOrFloat() && "only support int or float data type."); + auto dpasParams = + uArch->getDPASConfig(APrecision, BPrecision, CPrecision, DPrecision); + return llvm::SmallVector( + {dpasParams.m, dpasParams.k, dpasParams.n}); +} + +// ===--------------------------------BlockingAnalysis---------------------------------===// + +mlir::LogicalResult BlockingAnalysis::run(mlir::Operation *op) { + mlir::SymbolTableCollection symbolTable; + // BlockingAnalysisImpl is using default initialize method + // provided by SparseBackwardDataFlowAnalysis. 
And this default + // initialize method relies on results of DeadCodeAnalysis to + // skip analysis on the dead code. + solver.load(); + solver.load(); + solver.load(symbolTable, uArch); + target = op; + return solver.initializeAndRun(op); +} + +void BlockingAnalysis::printAnalysisResult() { + llvm::dbgs() << "\n\nBlockingAnalysis Results:\n"; + target->walk([&](mlir::Operation *op) { + if (op->getNumRegions() == 0 && op->getNumResults() == 1) { + auto resTy = op->getResult(0).getType(); + if (mlir::isa(resTy) || + mlir::isa(resTy)) { + llvm::dbgs() << "\nOp: " << *op; + for (auto [i, inputOpr] : llvm::enumerate(op->getOperands())) { + if (mlir::isa(inputOpr.getType()) || + mlir::isa(inputOpr.getType())) { + UsePoint p(op, i); + llvm::dbgs() << "\n opr[" << i << "]: " << inputOpr + << " --> blkSZ: " << getUseBlockSize(inputOpr, p); + } + } + + for (auto [i, res] : llvm::enumerate(op->getResults())) + llvm::dbgs() << "\n res[" << i << "]: " << res + << " --> blkSZ: " << getDefBlockSize(res); + llvm::dbgs() << "\n"; + } + } else if (auto forOp = mlir::dyn_cast(op)) { + llvm::dbgs() << "\nOp: " << op->getName(); + for (auto [i, arg] : llvm::enumerate(forOp.getRegionIterArgs())) + llvm::dbgs() << "\n arg[" << i << "]: " + << " --> blkSZ: " << getDefBlockSize(arg); + + for (auto [i, res] : llvm::enumerate(forOp.getResults())) + llvm::dbgs() << "\n res[" << i << "]: " + << " --> blkSZ: " << getDefBlockSize(res); + llvm::dbgs() << "\n"; + } else if (auto YieldOp = mlir::dyn_cast(op)) { + llvm::dbgs() << "\nOp: " << op->getName(); + for (auto [i, res] : llvm::enumerate(YieldOp.getResults())) + llvm::dbgs() << "\n res[" << i << "]: " << res + << " --> blkSZ: " << getDefBlockSize(res) << ", " + << getUseBlockSize(res, UsePoint(op, i)); + llvm::dbgs() << "\n"; + } else if (auto StoreOp = mlir::dyn_cast(op)) { + llvm::dbgs() << "\nOp: " << *op; + for (auto [i, inputOpr] : llvm::enumerate(op->getOperands())) { + llvm::dbgs() << "\n opr[" << i << "]: " << inputOpr << " --> blkSZ: " + << getUseBlockSize(inputOpr, UsePoint(StoreOp, i)); + } + llvm::dbgs() << "\n"; + } + }); +} + +Block BlockingAnalysis::getUseBlockSize(mlir::Value val, UsePoint point) const { + auto *state = solver.lookupState(val); + if (!state) + return Block(); + return state->getValue().getUseBlock(point); +} + +Block BlockingAnalysis::getDefBlockSize(mlir::Value val) const { + auto *state = solver.lookupState(val); + if (!state) + return Block(); + return state->getValue().getDefBlock(); +} + +} // namespace imex diff --git a/lib/Dialect/XeTile/Transforms/BlockingAnalysis.h b/lib/Dialect/XeTile/Transforms/BlockingAnalysis.h new file mode 100644 index 000000000..96f2249e2 --- /dev/null +++ b/lib/Dialect/XeTile/Transforms/BlockingAnalysis.h @@ -0,0 +1,68 @@ + +#ifndef IMEX_BLOCKING_ANALYSIS_H +#define IMEX_BLOCKING_ANALYSIS_H + +#include +#include +#include + +#include + +#include "imex/Utils/XeArch.h" + +namespace imex { + +/// a class representing a inner block size, provides some +/// convinient methods for manipulation. 
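+/// A minimal usage sketch (illustrative values, based on the interface below):
+///   Block blk(8, 16);   // an 8x16 inner block: blk[0] == 8, blk[1] == 16
+///   if (blk) { ... }    // true, since both dims are non-zero
+///   Block none;         // default-constructed [0, 0]; evaluates to false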
+class Block {
+public:
+  Block() : values{0, 0} {}
+
+  Block(int64_t h, int64_t w) : values{h, w} {}
+
+  int64_t &operator[](size_t index);
+  const int64_t &operator[](size_t index) const;
+
+  bool operator==(Block &other) const;
+  bool operator==(const Block &other) const;
+
+  bool operator!=(Block &other) const { return !(*this == other); }
+  bool operator!=(const Block &other) const { return !(*this == other); }
+
+  void print(llvm::raw_ostream &os) const;
+
+  llvm::ArrayRef asArrayRef() const;
+
+  operator bool() const { return values[0] != 0 && values[1] != 0; }
+
+private:
+  int64_t values[2];
+};
+
+llvm::raw_ostream &operator<<(llvm::raw_ostream &os, Block blk);
+
+// A pair of an operation and an operand index, representing
+// the use point of a value.
+typedef std::pair UsePoint;
+
+class BlockingAnalysis {
+public:
+  explicit BlockingAnalysis(std::shared_ptr uArch) {
+    this->uArch = uArch;
+  };
+
+  mlir::LogicalResult run(mlir::Operation *op);
+
+  Block getUseBlockSize(mlir::Value val, UsePoint point) const;
+  Block getDefBlockSize(mlir::Value val) const;
+  void printAnalysisResult();
+
+private:
+  mlir::DataFlowSolver solver;
+  std::shared_ptr uArch;
+  mlir::Operation *target;
+};
+
+} // namespace imex
+
+#endif // IMEX_BLOCKING_ANALYSIS_H
diff --git a/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp b/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp
new file mode 100644
index 000000000..74604dd52
--- /dev/null
+++ b/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp
@@ -0,0 +1,875 @@
+//===----------- BlockingRewrite.cpp --------- Blocking Pass ---*- C++ -*-===//
+//
+// Copyright 2024 Intel Corporation
+// Part of the IMEX Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the lowering transformation for determining the problem
+/// size that can be handled by an XeGPU operator (hardware instruction). An
+/// XeTile program can work on a bigger problem size that cannot be handled by
+/// a single hardware instruction, but it then needs to be decomposed into
+/// smaller pieces such that each piece can be handled by a hardware
+/// instruction.
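The file-header comment above summarizes the core idea: an XeTile-sized problem is decomposed into inner blocks that individual hardware instructions can handle, with the block size chosen as the largest divisor of each dimension inside the hardware-supported range (see findLargestDivisorInRange in BlockingAnalysis.cpp above). The standalone C++ sketch below only illustrates that selection; it is not part of the patch, and the 1..32 / 1..16 limits are assumed example values, not real uArch parameters.

#include <cstdint>
#include <cstdio>

// Largest divisor of `v` within [lo, hi]; returns 0 if none exists
// (irregular shape, or shape outside the supported range).
static int64_t largestDivisorInRange(int64_t v, int64_t lo, int64_t hi) {
  for (int64_t i = hi; i >= lo; --i)
    if (v % i == 0)
      return i;
  return 0;
}

int main() {
  // A hypothetical 64x64 tile with assumed hardware ranges of height in
  // [1, 32] and width in [1, 16] gets a 32x16 inner block, i.e. a 2x4 grid
  // of hardware-sized pieces.
  int64_t shape[2] = {64, 64};
  int64_t h = largestDivisorInRange(shape[0], 1, 32);
  int64_t w = largestDivisorInRange(shape[1], 1, 16);
  std::printf("block = %lldx%lld, grid = %lldx%lld\n", (long long)h,
              (long long)w, (long long)(shape[0] / h),
              (long long)(shape[1] / w));
  return 0;
}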
+/// +//===----------------------------------------------------------------------===// +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "imex/Dialect/XeTile/Transforms/Passes.h" +#include "imex/Utils/DebugUtils.h" +#include "imex/Utils/XeArch.h" + +#include "BlockingAnalysis.h" +#include "PassDetail.h" + +using namespace mlir; +using namespace llvm; +using namespace imex; +namespace imex { +#define GEN_PASS_DECL_NEWXETILEBLOCKING +#define GEN_PASS_DEF_NEWXETILEBLOCKING +#include "imex/Dialect/XeTile/Transforms/Passes.h.inc" +} // namespace imex + +namespace imex { +namespace Blocking { + +static xetile::TileUnpackOp +addUnpackOp(mlir::Value src, mlir::ConversionPatternRewriter &rewriter) { + auto srcTy = llvm::dyn_cast_if_present(src.getType()); + assert(srcTy && srcTy.getRank() == 4); + auto shape = srcTy.getShape(); + auto grids = shape.take_front(2); + auto innerBlocks = shape.take_back(2); + llvm::SmallVector unpackShape( + {grids[0] * innerBlocks[0], grids[1] * innerBlocks[1]}); + + auto unpackTy = mlir::VectorType::get(unpackShape, srcTy.getElementType()); + return rewriter.create( + src.getLoc(), unpackTy, src, + mlir::DenseI64ArrayAttr::get(src.getContext(), innerBlocks)); +} + +static mlir::Value addPackOp(mlir::Value src, + llvm::ArrayRef targetBlkSizes, + mlir::ConversionPatternRewriter &rewriter) { + auto srcTy = mlir::dyn_cast(src.getType()); + assert(srcTy && targetBlkSizes.size() == 2); + auto shape = srcTy.getShape(); + llvm::SmallVector packShape({shape[0] / targetBlkSizes[0], + shape[1] / targetBlkSizes[1], + targetBlkSizes[0], targetBlkSizes[1]}); + + auto packTy = mlir::VectorType::get(packShape, srcTy.getElementType()); + auto packOp = rewriter.create( + src.getLoc(), packTy, src, + mlir::DenseI64ArrayAttr::get(src.getContext(), targetBlkSizes)); + return packOp; +} + +/// OpConversionPatternWithAnalysis is a wrapper around OpConversionPattern +/// but takes an extra AnalysisT object as an argument, such that patterns +/// can leverage the analysis results. +template +class OpConversionPatternWithAnalysis + : public mlir::OpConversionPattern { +public: + using OpPatternRewriter = typename mlir::ConversionPatternRewriter; + + OpConversionPatternWithAnalysis(mlir::MLIRContext *context, + AnalysisT &analysis) + : mlir::OpConversionPattern(context), analysis(analysis) {} + +protected: + AnalysisT &analysis; +}; + +/// OpTraitConversionPatternWithAnalysis is a wrapper around +/// OpTraitConversionPattern but takes an extra AnalysisT object as an argument, +/// such that patterns can leverage the analysis results. +template
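As a side note on addPackOp/addUnpackOp above: they only reshape between a 2D vector [H, W] and a 4D blocked vector [H/h, W/w, h, w] for an inner block [h, w]. The sketch below mirrors that shape bookkeeping in plain C++ with example values; it is illustrative only and does not touch any MLIR APIs.

#include <array>
#include <cassert>
#include <cstdint>

// Pack: 2D shape [H, W] + inner block [h, w] -> 4D blocked shape
// [H/h, W/w, h, w] (grid dims first, inner block dims last).
static std::array<int64_t, 4> packShape(std::array<int64_t, 2> shape,
                                        std::array<int64_t, 2> blk) {
  assert(shape[0] % blk[0] == 0 && shape[1] % blk[1] == 0);
  return {shape[0] / blk[0], shape[1] / blk[1], blk[0], blk[1]};
}

// Unpack: 4D blocked shape -> original 2D shape (grid dims * inner block dims).
static std::array<int64_t, 2> unpackShape(std::array<int64_t, 4> s) {
  return {s[0] * s[2], s[1] * s[3]};
}

int main() {
  auto packed = packShape({64, 64}, {32, 16}); // -> {2, 4, 32, 16}
  auto restored = unpackShape(packed);         // -> {64, 64}
  assert(restored[0] == 64 && restored[1] == 64);
  (void)restored;
  return 0;
}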