From 1fd2e0002630fc16e73cc55039a7ea175de650e6 Mon Sep 17 00:00:00 2001 From: Igor Zamyatin Date: Tue, 8 Oct 2024 14:30:25 -0500 Subject: [PATCH] llvm pulldown 10/2024 (#915) --- build_tools/llvm_version.txt | 2 +- ...upport-for-VectorAnyINTEL-capability.patch | 99 ++-- ...e-spirv.CL.printf-op-assembly-format.patch | 49 -- ...onstant-attribute-in-ParseDecoration.patch | 36 -- ...n-and-de-serialization-support-for-s.patch | 33 +- ...0007-Move-chunk_size-into-TensorDesc.patch | 432 ------------------ ...mporary-downstream-defintion-changes.patch | 38 +- ...ative-bf16-support-in-SPIR-V-dialect.patch | 52 +-- .../0010-refine-the-XeGPU-definition.patch | 206 --------- docs/rfcs/XeGPU.md | 30 +- include/imex/Dialect/XeTile/IR/XeTileAttrs.td | 14 +- include/imex/Dialect/XeTile/IR/XeTileTypes.td | 14 +- lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp | 113 ----- lib/Conversion/XeGPUToVC/LSCPatterns.cpp | 55 +-- lib/Conversion/XeGPUToVC/XeGPUToVC.cpp | 46 +- .../XeTileToXeGPU/ArithOpConversion.cpp | 6 +- .../XeTileToXeGPU/XeTileOpConversion.cpp | 8 +- lib/Dialect/XeTile/IR/XeTileDialect.cpp | 2 +- lib/Dialect/XeTile/IR/XeTileOps.cpp | 2 +- .../XeTile/Transforms/BlockAligning.cpp | 2 +- lib/Dialect/XeTile/Transforms/Blocking.cpp | 6 +- .../XeTile/Transforms/BlockingAnalysis.cpp | 17 +- .../XeTile/Transforms/BlockingRewrite.cpp | 2 +- .../XeTile/Transforms/Canonicalization.cpp | 8 +- lib/Transforms/OptimizeTranspose.cpp | 4 +- lib/Transforms/PropagatePackedLayout.cpp | 12 +- lib/Transforms/VnniTransformation.cpp | 16 +- lib/Utils/XeArch.cpp | 4 +- test/Conversion/GPUToSPIRV/printf.mlir | 2 +- test/Conversion/XeGPUToVC/atomiclsc.mlir | 44 +- .../XeGPUToVC/load_global_no_chunk_f16.mlir | 37 +- .../XeGPUToVC/load_global_no_chunk_f32.mlir | 29 +- .../prefetch_global_no_chunk_f16.mlir | 19 +- .../prefetch_global_no_chunk_f32.mlir | 18 +- .../store_load_slm_no_chunk_f16.mlir | 50 +- .../store_load_slm_no_chunk_f32.mlir | 36 +- .../XeTileToXeGPU/array_length_load.mlir | 4 +- test/Conversion/XeTileToXeGPU/lit.local.cfg | 8 + test/Conversion/XeTileToXeGPU/reduction.mlir | 20 +- .../sg_gemm_1k_1k_1k_f16_f32.mlir | 222 ++++----- .../sg_gemm_1k_1k_1k_f16_f32_slm.mlir | 80 ++-- .../sg_gemm_1k_1k_1k_i8_i32.mlir | 38 +- .../sg_gemm_1k_1k_1k_tf32_tf32.mlir | 80 ++-- .../XeTileToXeGPU/sg_gemm_transpose_b.mlir | 10 +- .../XeTileToXeGPU/sg_load_tile.mlir | 4 +- .../XeTileToXeGPU/sg_mixed_scf.mlir | 50 +- test/Conversion/XeTileToXeGPU/sg_scf_for.mlir | 26 +- test/Conversion/XeTileToXeGPU/sg_softmax.mlir | 16 +- .../XeTileToXeGPU/sg_store_tile.mlir | 32 +- .../Conversion/XeTileToXeGPU/sg_tile_mma.mlir | 12 +- .../XeTileToXeGPU/sg_tiled_broadcast.mlir | 4 +- .../XeTileToXeGPU/sg_tiled_load_tile.mlir | 4 +- .../XeTileToXeGPU/sg_tiled_scf_for.mlir | 26 +- .../XeTileToXeGPU/sg_tiled_softmax.mlir | 16 +- .../XeTileToXeGPU/sg_tiled_store_tile.mlir | 32 +- .../XeTileToXeGPU/sg_tiled_tile_mma.mlir | 12 +- test/Conversion/XeTileToXeGPU/test_order.mlir | 8 +- test/Dialect/XeGPU/IR/XeGPUOps.mlir | 8 +- test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir | 12 +- test/Dialect/XeGPU/IR/create_nd_tdesc.mlir | 16 +- test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir | 19 +- test/Dialect/XeGPU/IR/create_tdesc_vc.mlir | 50 +- test/Dialect/XeGPU/IR/invalid_vc.mlir | 26 +- test/Dialect/XeGPU/IR/load_gather_vc.mlir | 24 +- test/Dialect/XeGPU/IR/store_scatter_vc.mlir | 13 +- test/Dialect/XeGPU/IR/update_offset_vc.mlir | 14 +- test/Dialect/XeTile/IR/ops.mlir | 4 +- .../sg_gemm_1k_1k_1k_f16_f32_slm.mlir | 6 +- .../sg_gemm_1k_1k_1k_f16_f32_slm.mlir | 6 +- 
.../Transforms/wg_to_sg_btranspose.mlir | 28 +- .../Transforms/wg_to_sg_gemm_postop.mlir | 6 +- .../load_global_chunk_4_f32.mlir | 8 +- .../load_global_chunk_8_f32.mlir | 8 +- .../load_global_no_chunk_f16.mlir | 7 +- .../load_global_no_chunk_f32.mlir | 6 +- .../store_global_chunk_4_f32.mlir | 5 +- .../store_global_chunk_8_f32.mlir | 5 +- .../store_global_no_chunk_f16.mlir | 5 +- .../store_global_no_chunk_f32.mlir | 5 +- .../store_load_slm_chunk_4_f32.mlir | 9 +- .../store_load_slm_chunk_8_f32.mlir | 9 +- .../store_load_slm_chunk_8_f32_mask.mlir | 8 +- .../store_load_slm_no_chunk_f16.mlir | 9 +- .../store_load_slm_no_chunk_f32.mlir | 9 +- .../Dialect/XeGPU/load1d-slm-f32.mlir | 2 +- .../XeGPU/loadgather2d_masked_f32.mlir | 11 +- .../XeGPU/loadgather_chunk_size_f32.mlir | 5 +- .../XeGPU/loadgather_chunk_size_i32.mlir | 5 +- .../Dialect/XeGPU/loadgather_f32.mlir | 5 +- .../Dialect/XeGPU/loadgather_masked_f32.mlir | 5 +- .../Dialect/XeGPU/optimize_transpose.mlir | 46 +- test/SPIRV/OpTest.spirv.CL.printf.mlir | 2 +- .../postop_reduce_n.mlir | 76 +-- .../VectorLinearize/postop_reduce_n.mlir | 76 +-- .../VnniTransform/gemm_with_extract.mlir | 58 +-- .../VnniTransform/gemm_with_extract_e2e.mlir | 56 +-- 96 files changed, 1034 insertions(+), 1885 deletions(-) delete mode 100644 build_tools/patches/0002-change-spirv.CL.printf-op-assembly-format.patch delete mode 100644 build_tools/patches/0003-Add-Constant-attribute-in-ParseDecoration.patch delete mode 100644 build_tools/patches/0007-Move-chunk_size-into-TensorDesc.patch delete mode 100644 build_tools/patches/0010-refine-the-XeGPU-definition.patch create mode 100644 test/Conversion/XeTileToXeGPU/lit.local.cfg diff --git a/build_tools/llvm_version.txt b/build_tools/llvm_version.txt index 0ca8e4c0b..33000613b 100644 --- a/build_tools/llvm_version.txt +++ b/build_tools/llvm_version.txt @@ -1 +1 @@ -08a61eb01172054fc5f8c78ff527f01d9768569b +add6b2f35f2bcf1f59a2ab2d5b3dab124fe0895a diff --git a/build_tools/patches/0001-Add-support-for-VectorAnyINTEL-capability.patch b/build_tools/patches/0001-Add-support-for-VectorAnyINTEL-capability.patch index b04bc1020..531b66a3e 100644 --- a/build_tools/patches/0001-Add-support-for-VectorAnyINTEL-capability.patch +++ b/build_tools/patches/0001-Add-support-for-VectorAnyINTEL-capability.patch @@ -1,35 +1,14 @@ -From 94cc2bb6a778cad3b762244d6d78ecf2e19b5372 Mon Sep 17 00:00:00 2001 -From: Md Abdullah Shahneous Bari -Date: Fri, 26 Apr 2024 20:20:28 +0000 -Subject: [PATCH 1/7] Add-support-for-VectorAnyINTEL-capability - -Allow vector of any lengths between [2-2^63-1]. -VectorAnyINTEL capability (part of "SPV_INTEL_vector_compute" extension) -relaxes the length constraint on SPIR-V vector sizes from 2,3, and 4. - -Also add support for following: - -- Add support for capability inferred extension requirement checking. -If a capability is a requirement, the respective extension that implements -it should also become an extension requirement, there were no support for -that check, as a result, the extension requirement had to be added separately. -This separate requirement addition causes problem when a feature is enabled by -multiple capability, and one of the capability is part of an extension. E.g., -vector size of 16 can be enabled by both "Vector16" and "vectorAnyINTEL" -capability, however, only "vectorAnyINTEL" has an extension requirement -("SPV_INTEL_vector_compute"). Since the process of adding capability -and extension requirement are independent, there is no way, to handle -cases like this. 
Therefore, for cases like this, enable adding capability -requirement initially, then do the check for capability inferred extension. - -- Add support for optionally skipping capability and extension requirement +From 45b150c9a0c4e4bd60c153e5142da17fd6cde6da Mon Sep 17 00:00:00 2001 +From: izamyati +Date: Tue, 24 Sep 2024 17:42:02 -0500 +Subject: [PATCH] Add support for VectorAnyINTEL capability --- .../mlir/Dialect/SPIRV/IR/SPIRVBase.td | 9 +- mlir/include/mlir/IR/CommonTypeConstraints.td | 86 ++++++++++++ mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp | 7 +- mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp | 24 +++- - .../SPIRV/Transforms/SPIRVConversion.cpp | 132 +++++++++++++++--- + .../SPIRV/Transforms/SPIRVConversion.cpp | 126 +++++++++++++++--- .../arith-to-spirv-unsupported.mlir | 4 +- .../ArithToSPIRV/arith-to-spirv.mlir | 34 +++++ .../FuncToSPIRV/types-to-spirv.mlir | 17 ++- @@ -42,13 +21,13 @@ requirement initially, then do the check for capability inferred extension. mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir | 34 ++--- mlir/test/Target/SPIRV/arithmetic-ops.mlir | 6 +- mlir/test/Target/SPIRV/ocl-ops.mlir | 6 + - 17 files changed, 319 insertions(+), 68 deletions(-) + 17 files changed, 316 insertions(+), 65 deletions(-) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td -index 6ec97e17c5dc..75e42c024553 100644 +index 3b7da9b44a08..ddaeb13ef253 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td -@@ -4138,7 +4138,12 @@ def SPIRV_Int32 : TypeAlias; +@@ -4142,7 +4142,12 @@ def SPIRV_Int32 : TypeAlias; def SPIRV_Float32 : TypeAlias; def SPIRV_Float : FloatOfWidths<[16, 32, 64]>; def SPIRV_Float16or32 : FloatOfWidths<[16, 32]>; @@ -62,8 +41,8 @@ index 6ec97e17c5dc..75e42c024553 100644 [SPIRV_Bool, SPIRV_Integer, SPIRV_Float]>; // Component type check is done in the type parser for the following SPIR-V // dialect-specific types so we use "Any" here. -@@ -4189,7 +4194,7 @@ class SPIRV_JointMatrixOfType allowedTypes> : - "Joint Matrix">; +@@ -4185,7 +4190,7 @@ class SPIRV_CoopMatrixOfType allowedTypes> : + "Cooperative Matrix">; class SPIRV_VectorOf : - VectorOfLengthAndType<[2, 3, 4, 8,16], [type]>; @@ -72,10 +51,10 @@ index 6ec97e17c5dc..75e42c024553 100644 class SPIRV_ScalarOrVectorOf : AnyTypeOf<[type, SPIRV_VectorOf]>; diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td -index af4f13dc0936..28d49d9e91f0 100644 +index 211385245555..671ec270efe0 100644 --- a/mlir/include/mlir/IR/CommonTypeConstraints.td +++ b/mlir/include/mlir/IR/CommonTypeConstraints.td -@@ -608,6 +608,92 @@ class ScalableVectorOfRankAndLengthAndType allowedRanks, +@@ -637,6 +637,92 @@ class ScalableVectorOfRankAndLengthAndType allowedRanks, ScalableVectorOfLength.summary, "::mlir::VectorType">; @@ -169,7 +148,7 @@ index af4f13dc0936..28d49d9e91f0 100644 // Negative values for `n` index in reverse. 
class ShapedTypeWithNthDimOfSize allowedSizes> : Type< diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp -index 72488d6e5d0b..b38f20458d32 100644 +index 48be287ef833..aec6d64209dd 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp @@ -187,9 +187,12 @@ static Type parseAndVerifyType(SPIRVDialect const &dialect, @@ -188,7 +167,7 @@ index 72488d6e5d0b..b38f20458d32 100644 return Type(); } diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp -index 3f25696aa5eb..2d64fea0dc26 100644 +index 337df3a5a65f..542c6beba2e4 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp @@ -100,9 +100,11 @@ bool CompositeType::classof(Type type) { @@ -206,7 +185,7 @@ index 3f25696aa5eb..2d64fea0dc26 100644 } Type CompositeType::getElementType(unsigned index) const { -@@ -170,7 +172,21 @@ void CompositeType::getCapabilities( +@@ -164,7 +166,21 @@ void CompositeType::getCapabilities( .Case([&](VectorType type) { auto vecSize = getNumElements(); if (vecSize == 8 || vecSize == 16) { @@ -230,10 +209,10 @@ index 3f25696aa5eb..2d64fea0dc26 100644 capabilities.push_back(ref); } diff --git a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp -index 4072608dc8f8..3fc675632970 100644 +index d833ec9309ba..36840582a114 100644 --- a/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp +++ b/mlir/lib/Dialect/SPIRV/Transforms/SPIRVConversion.cpp -@@ -43,9 +43,13 @@ using namespace mlir; +@@ -88,9 +88,13 @@ static std::optional> getTargetShape(VectorType vecType) { template static LogicalResult checkExtensionRequirements( LabelT label, const spirv::TargetEnv &targetEnv, @@ -249,7 +228,7 @@ index 4072608dc8f8..3fc675632970 100644 continue; LLVM_DEBUG({ -@@ -71,9 +75,13 @@ static LogicalResult checkExtensionRequirements( +@@ -116,9 +120,13 @@ static LogicalResult checkExtensionRequirements( template static LogicalResult checkCapabilityRequirements( LabelT label, const spirv::TargetEnv &targetEnv, @@ -265,7 +244,7 @@ index 4072608dc8f8..3fc675632970 100644 continue; LLVM_DEBUG({ -@@ -90,6 +98,55 @@ static LogicalResult checkCapabilityRequirements( +@@ -135,6 +143,55 @@ static LogicalResult checkCapabilityRequirements( return success(); } @@ -321,27 +300,24 @@ index 4072608dc8f8..3fc675632970 100644 /// Returns true if the given `storageClass` needs explicit layout when used in /// Shader environments. 
static bool needsExplicitLayout(spirv::StorageClass storageClass) { -@@ -247,12 +304,17 @@ convertScalarType(const spirv::TargetEnv &targetEnv, +@@ -280,11 +337,16 @@ convertScalarType(const spirv::TargetEnv &targetEnv, return nullptr; } -- if (auto floatType = dyn_cast(type)) { + //if (auto floatType = dyn_cast(type)) { + // Convert to 32-bit float and remove floatType related capability + // restriction -+ if (auto floatType = dyn_cast(type)) { + if (auto floatType = dyn_cast(type)) { LLVM_DEBUG(llvm::dbgs() << type << " converted to 32-bit for SPIR-V\n"); return Builder(targetEnv.getContext()).getF32Type(); } -- auto intType = cast(type); + //auto intType = cast(type); + // Convert to 32-bit int and remove intType related capability restriction -+ auto intType = cast(type); + auto intType = cast(type); LLVM_DEBUG(llvm::dbgs() << type << " converted to 32-bit for SPIR-V\n"); return IntegerType::get(targetEnv.getContext(), /*width=*/32, - intType.getSignedness()); -@@ -342,16 +404,40 @@ convertVectorType(const spirv::TargetEnv &targetEnv, +@@ -375,16 +437,40 @@ convertVectorType(const spirv::TargetEnv &targetEnv, cast(type).getExtensions(extensions, storageClass); cast(type).getCapabilities(capabilities, storageClass); @@ -389,7 +365,7 @@ index 4072608dc8f8..3fc675632970 100644 } static Type -@@ -1163,16 +1249,18 @@ bool SPIRVConversionTarget::isLegalOp(Operation *op) { +@@ -1553,16 +1639,18 @@ bool SPIRVConversionTarget::isLegalOp(Operation *op) { SmallVector, 4> typeExtensions; SmallVector, 8> typeCapabilities; for (Type valueType : valueTypes) { @@ -400,10 +376,9 @@ index 4072608dc8f8..3fc675632970 100644 - return false; - typeCapabilities.clear(); -- cast(valueType).getCapabilities(typeCapabilities); + cast(valueType).getCapabilities(typeCapabilities); - if (failed(checkCapabilityRequirements(op->getName(), this->targetEnv, - typeCapabilities))) -+ cast(valueType).getCapabilities(typeCapabilities); + typeExtensions.clear(); + cast(valueType).getExtensions(typeExtensions); + // Checking for capability and extension requirements along with capability @@ -418,10 +393,10 @@ index 4072608dc8f8..3fc675632970 100644 } diff --git a/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv-unsupported.mlir b/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv-unsupported.mlir -index 0d92a8e676d8..d61ace8d6876 100644 +index 24a0bab352c3..96b8ea6e7975 100644 --- a/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv-unsupported.mlir +++ b/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv-unsupported.mlir -@@ -11,9 +11,9 @@ module attributes { +@@ -28,9 +28,9 @@ module attributes { #spirv.vce, #spirv.resource_limits<>> } { @@ -434,10 +409,10 @@ index 0d92a8e676d8..d61ace8d6876 100644 } diff --git a/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv.mlir b/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv.mlir -index ae47ae36ca51..644996fe0fa7 100644 +index 1abe0fd2ec46..e485296ad026 100644 --- a/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv.mlir +++ b/mlir/test/Conversion/ArithToSPIRV/arith-to-spirv.mlir -@@ -1447,6 +1447,40 @@ func.func @ops_flags(%arg0: i64, %arg1: i64) { +@@ -1462,6 +1462,40 @@ func.func @ops_flags(%arg0: i64, %arg1: i64) { %2 = arith.muli %arg0, %arg1 overflow : i64 // CHECK: %{{.*}} = spirv.IMul %{{.*}}, %{{.*}} : i64 %3 = arith.muli %arg0, %arg1 overflow : i64 @@ -586,7 +561,7 @@ index 53a1015de75b..6970b8ec0628 100644 spirv.Return } diff --git a/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir b/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir -index 7dc0bd99f54b..5dd9901828cd 100644 +index 
5c24f0e6a7d3..3ca61ab48096 100644 --- a/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/logical-ops.mlir @@ -166,7 +166,7 @@ func.func @logicalUnary(%arg0 : i1) @@ -599,10 +574,10 @@ index 7dc0bd99f54b..5dd9901828cd 100644 return } diff --git a/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir b/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir -index f7fd05b36bae..5228bb719d94 100644 +index d8a26c71d12f..d22378817dbb 100644 --- a/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/non-uniform-ops.mlir -@@ -439,7 +439,7 @@ func.func @group_non_uniform_bitwise_and(%val: i32) -> i32 { +@@ -495,7 +495,7 @@ func.func @group_non_uniform_bitwise_and(%val: i32) -> i32 { // ----- func.func @group_non_uniform_bitwise_and(%val: i1) -> i1 { @@ -611,7 +586,7 @@ index f7fd05b36bae..5228bb719d94 100644 %0 = spirv.GroupNonUniformBitwiseAnd "Workgroup" "Reduce" %val : i1 return %0: i1 } -@@ -460,7 +460,7 @@ func.func @group_non_uniform_bitwise_or(%val: i32) -> i32 { +@@ -516,7 +516,7 @@ func.func @group_non_uniform_bitwise_or(%val: i32) -> i32 { // ----- func.func @group_non_uniform_bitwise_or(%val: i1) -> i1 { @@ -620,7 +595,7 @@ index f7fd05b36bae..5228bb719d94 100644 %0 = spirv.GroupNonUniformBitwiseOr "Workgroup" "Reduce" %val : i1 return %0: i1 } -@@ -481,7 +481,7 @@ func.func @group_non_uniform_bitwise_xor(%val: i32) -> i32 { +@@ -537,7 +537,7 @@ func.func @group_non_uniform_bitwise_xor(%val: i32) -> i32 { // ----- func.func @group_non_uniform_bitwise_xor(%val: i1) -> i1 { @@ -629,7 +604,7 @@ index f7fd05b36bae..5228bb719d94 100644 %0 = spirv.GroupNonUniformBitwiseXor "Workgroup" "Reduce" %val : i1 return %0: i1 } -@@ -502,7 +502,7 @@ func.func @group_non_uniform_logical_and(%val: i1) -> i1 { +@@ -558,7 +558,7 @@ func.func @group_non_uniform_logical_and(%val: i1) -> i1 { // ----- func.func @group_non_uniform_logical_and(%val: i32) -> i32 { @@ -638,7 +613,7 @@ index f7fd05b36bae..5228bb719d94 100644 %0 = spirv.GroupNonUniformLogicalAnd "Workgroup" "Reduce" %val : i32 return %0: i32 } -@@ -523,7 +523,7 @@ func.func @group_non_uniform_logical_or(%val: i1) -> i1 { +@@ -579,7 +579,7 @@ func.func @group_non_uniform_logical_or(%val: i1) -> i1 { // ----- func.func @group_non_uniform_logical_or(%val: i32) -> i32 { @@ -647,7 +622,7 @@ index f7fd05b36bae..5228bb719d94 100644 %0 = spirv.GroupNonUniformLogicalOr "Workgroup" "Reduce" %val : i32 return %0: i32 } -@@ -544,7 +544,7 @@ func.func @group_non_uniform_logical_xor(%val: i1) -> i1 { +@@ -600,7 +600,7 @@ func.func @group_non_uniform_logical_xor(%val: i1) -> i1 { // ----- func.func @group_non_uniform_logical_xor(%val: i32) -> i32 { diff --git a/build_tools/patches/0002-change-spirv.CL.printf-op-assembly-format.patch b/build_tools/patches/0002-change-spirv.CL.printf-op-assembly-format.patch deleted file mode 100644 index 69232a7ba..000000000 --- a/build_tools/patches/0002-change-spirv.CL.printf-op-assembly-format.patch +++ /dev/null @@ -1,49 +0,0 @@ -From dc1e914409a9d4c02c21a292227754fa4ac0cea7 Mon Sep 17 00:00:00 2001 -From: Dimple Prajapati -Date: Fri, 26 Apr 2024 20:30:34 +0000 -Subject: [PATCH 2/7] change-spirv.CL.printf-op-assembly-format - ---- - mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td | 4 ++-- - mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir | 4 ++-- - 2 files changed, 4 insertions(+), 4 deletions(-) - -diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td -index c7c2fe8bc742..b5ca27d7d753 100644 ---- 
a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td -+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVCLOps.td -@@ -875,7 +875,7 @@ def SPIRV_CLPrintfOp : SPIRV_CLOp<"printf", 184, []> { - #### Example: - - ```mlir -- %0 = spirv.CL.printf %0 %1 %2 : (!spirv.ptr, (i32, i32)) -> i32 -+ %0 = spirv.CL.printf %0 : !spirv.ptr(%1, %2 : i32, i32) -> i32 - ``` - }]; - -@@ -889,7 +889,7 @@ def SPIRV_CLPrintfOp : SPIRV_CLOp<"printf", 184, []> { - ); - - let assemblyFormat = [{ -- $format `,` $arguments attr-dict `:` `(` type($format) `,` `(` type($arguments) `)` `)` `->` type($result) -+ $format `:` type($format) ( `(` $arguments^ `:` type($arguments) `)`)? attr-dict `->` type($result) - }]; - - let hasVerifier = 0; -diff --git a/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir -index 7a29abd44b34..b15ffdbbb767 100644 ---- a/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir -+++ b/mlir/test/Dialect/SPIRV/IR/ocl-ops.mlir -@@ -275,8 +275,8 @@ func.func @rintvec(%arg0 : vector<3xf16>) -> () { - //===----------------------------------------------------------------------===// - // CHECK-LABEL: func.func @printf( - func.func @printf(%arg0 : !spirv.ptr, %arg1 : i32, %arg2 : i32) -> i32 { -- // CHECK: spirv.CL.printf {{%.*}}, {{%.*}}, {{%.*}} : (!spirv.ptr, (i32, i32)) -> i32 -- %0 = spirv.CL.printf %arg0, %arg1, %arg2 : (!spirv.ptr, (i32, i32)) -> i32 -+ // CHECK: spirv.CL.printf {{%.*}} : !spirv.ptr({{%.*}}, {{%.*}} : i32, i32) -> i32 -+ %0 = spirv.CL.printf %arg0 : !spirv.ptr(%arg1, %arg2 : i32, i32) -> i32 - return %0 : i32 - } - --- -2.34.1 diff --git a/build_tools/patches/0003-Add-Constant-attribute-in-ParseDecoration.patch b/build_tools/patches/0003-Add-Constant-attribute-in-ParseDecoration.patch deleted file mode 100644 index 81d751d81..000000000 --- a/build_tools/patches/0003-Add-Constant-attribute-in-ParseDecoration.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 85635423ba70290147e674672854b90bbb81f555 Mon Sep 17 00:00:00 2001 -From: "Prajapati, Dimple" -Date: Fri, 26 Apr 2024 20:32:04 +0000 -Subject: [PATCH 3/7] Add-Constant-attribute-in-ParseDecoration - ---- - mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp | 1 + - mlir/lib/Target/SPIRV/Serialization/Serializer.cpp | 1 + - 2 files changed, 2 insertions(+) - -diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp -index faaa42023a80..cfe3121bbe95 100644 ---- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp -+++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp -@@ -297,6 +297,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { - decorations[words[0]].set(symbol, llvm::dyn_cast(linkageAttr)); - break; - } -+ case spirv::Decoration::Constant: - case spirv::Decoration::Aliased: - case spirv::Decoration::AliasedPointer: - case spirv::Decoration::Block: -diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp -index 200abdf993ce..a7d195d7fcb0 100644 ---- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp -+++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp -@@ -267,6 +267,7 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID, - << stringifyDecoration(decoration); - case spirv::Decoration::Aliased: - case spirv::Decoration::AliasedPointer: -+ case spirv::Decoration::Constant: - case spirv::Decoration::Flat: - case spirv::Decoration::NonReadable: - case spirv::Decoration::NonWritable: --- -2.34.1 diff --git 
a/build_tools/patches/0004-Add-serialization-and-de-serialization-support-for-s.patch b/build_tools/patches/0004-Add-serialization-and-de-serialization-support-for-s.patch index 6bc75a749..9d2618753 100644 --- a/build_tools/patches/0004-Add-serialization-and-de-serialization-support-for-s.patch +++ b/build_tools/patches/0004-Add-serialization-and-de-serialization-support-for-s.patch @@ -1,22 +1,29 @@ +From 4cb4411e2451b1549bafd6a8a3723f78251ef6f3 Mon Sep 17 00:00:00 2001 +From: izamyati +Date: Tue, 1 Oct 2024 08:59:35 -0500 +Subject: [PATCH] Add serialization and deserialization support for s + +--- + mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp | 6 ++++++ + mlir/lib/Target/SPIRV/Serialization/Serializer.cpp | 6 ++++++ + 2 files changed, 12 insertions(+) + diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp -index 12980879b20a..b5fbe8c5ceb8 100644 +index 6c7fe4106982..b1be812e74eb 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp -@@ -259,8 +259,9 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { +@@ -259,6 +259,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { symbol, FPRoundingModeAttr::get(opBuilder.getContext(), static_cast(words[2]))); break; -- case spirv::Decoration::DescriptorSet: + case spirv::Decoration::Alignment: + case spirv::Decoration::DescriptorSet: case spirv::Decoration::Binding: -+ case spirv::Decoration::DescriptorSet: if (words.size() != 3) { - return emitError(unknownLoc, "OpDecorate with ") - << decorationName << " needs a single integer literal"; -@@ -319,6 +320,10 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { - case spirv::Decoration::Restrict: +@@ -320,6 +321,10 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { case spirv::Decoration::RestrictPointer: case spirv::Decoration::NoContraction: + case spirv::Decoration::Constant: + case spirv::Decoration::SingleElementVectorINTEL: + case spirv::Decoration::VectorComputeCallableFunctionINTEL: + case spirv::Decoration::VectorComputeFunctionINTEL: @@ -24,7 +31,7 @@ index 12980879b20a..b5fbe8c5ceb8 100644 if (words.size() != 2) { return emitError(unknownLoc, "OpDecoration with ") << decorationName << "needs a single target "; -@@ -329,6 +334,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { +@@ -330,6 +335,7 @@ LogicalResult spirv::Deserializer::processDecoration(ArrayRef words) { // it is needed for many validation rules. 
decorations[words[0]].set(symbol, opBuilder.getUnitAttr()); break; @@ -33,7 +40,7 @@ index 12980879b20a..b5fbe8c5ceb8 100644 case spirv::Decoration::SpecId: if (words.size() != 3) { diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp -index 714a3edfb565..bb3c68530aa9 100644 +index f355982e9ed8..d6080185eefe 100644 --- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp +++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp @@ -252,8 +252,10 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID, @@ -47,10 +54,10 @@ index 714a3edfb565..bb3c68530aa9 100644 case spirv::Decoration::Location: if (auto intAttr = dyn_cast(attr)) { args.push_back(intAttr.getValue().getZExtValue()); -@@ -286,6 +288,10 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID, - case spirv::Decoration::Restrict: +@@ -287,6 +289,10 @@ LogicalResult Serializer::processDecorationAttr(Location loc, uint32_t resultID, case spirv::Decoration::RestrictPointer: case spirv::Decoration::NoContraction: + case spirv::Decoration::Constant: + case spirv::Decoration::SingleElementVectorINTEL: + case spirv::Decoration::VectorComputeCallableFunctionINTEL: + case spirv::Decoration::VectorComputeFunctionINTEL: @@ -58,3 +65,5 @@ index 714a3edfb565..bb3c68530aa9 100644 // For unit attributes and decoration attributes, the args list // has no values so we do nothing. if (isa(attr)) +-- +2.34.1 diff --git a/build_tools/patches/0007-Move-chunk_size-into-TensorDesc.patch b/build_tools/patches/0007-Move-chunk_size-into-TensorDesc.patch deleted file mode 100644 index 33c132dfb..000000000 --- a/build_tools/patches/0007-Move-chunk_size-into-TensorDesc.patch +++ /dev/null @@ -1,432 +0,0 @@ -From c1a7d459790db5335907947cf44dcbd230cec783 Mon Sep 17 00:00:00 2001 -From: Chao Chen -Date: Thu, 29 Aug 2024 17:58:34 +0000 -Subject: [PATCH] move chunk_size into TensorDesc - ---- - .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 46 +++++++++++--- - .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 19 ++---- - .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 63 ++++++++++++------- - mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 40 ++++++++---- - mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 31 ++++----- - 5 files changed, 122 insertions(+), 77 deletions(-) - -diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td -index f3ca09a6a68e..6ffb4eb3c60f 100644 ---- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td -+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td -@@ -19,9 +19,15 @@ class XeGPUAttr traits = [], - let mnemonic = attrMnemonic; - } - --def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { -+class XeGPU_TensorDescAttr traits = [], -+ string baseCppClass = "::mlir::Attribute"> -+ : XeGPUAttr { -+ let assemblyFormat = "`<` struct(params) `>`"; -+} -+ -+def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_tdesc_attr"> { - let summary = [{a composite attribute for `TensorDescType`}]; -- let description = [{`TensorDescAttr` (or `tdesc_attr`) is a composite -+ let description = [{`BlockTensorDesc` (or `block_tdesc_attr`) is a composite - attribute defined for `TensorDescType` for describing following - properties of a `TensorDesc`. - 1. `memory_scope`: It describes where the data block described by the -@@ -33,29 +39,49 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { - 8x32. Its default value is 1. - 3. 
`boundary_check`: It is used to indicates the hardware whether to do - out-of-boundary check. The default value is true. -- 4. `scattered`: It is used to differenciate TensorDescs created from -- `create_nd_tdesc` vs from `create_tdesc`. - }]; - - let parameters = (ins - OptionalParameter<"MemoryScopeAttr">: $memory_scope, - OptionalParameter<"IntegerAttr", "1">: $array_length, -- OptionalParameter<"BoolAttr", "true">: $boundary_check, -- OptionalParameter<"BoolAttr", "false">: $scattered -+ OptionalParameter<"BoolAttr", "true">: $boundary_check - ); - - let builders = [ - AttrBuilder<(ins - CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, - CArg<"int", "1">:$array_length, -- CArg<"bool", "true">: $boundary_check, -- CArg<"bool", "false">: $scattered -+ CArg<"bool", "true">: $boundary_check - )> - ]; - -- let assemblyFormat = "`<` struct(params) `>`"; - } - -+def XeGPU_ScatterTensorDescAttr: XeGPU_TensorDescAttr<"ScatterTensorDesc", "scatter_tdesc_attr"> { -+ let summary = [{a composite attribute for `TensorDescType`}]; -+ let description = [{`ScatterTensorDesc` (or `scatter_tdesc_attr`) is a composite -+ attribute defined for `TensorDescType` for describing following -+ properties of a `TensorDesc`. -+ 1. `memory_scope`: It describes where the data block described by the -+ TensorDesc is located, `Global` device memory or `Shared` local memory. -+ It is default to `Global`. -+ 2. `chunk_size`: indicates number of continious elements accessed for each -+ offset, default is 1. It is used with `scattered` attr only. -+ }]; -+ -+ let parameters = (ins -+ OptionalParameter<"MemoryScopeAttr">: $memory_scope, -+ OptionalParameter<"IntegerAttr", "1">: $chunk_size -+ ); -+ -+ let builders = [ -+ AttrBuilder<(ins -+ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, -+ CArg<"int", "1">: $chunk_size -+ )> -+ ]; -+ } -+ - //===----------------------------------------------------------------------===// - // XeGPU Memory Scope Enums. - //===----------------------------------------------------------------------===// -@@ -116,4 +142,4 @@ def XeGPU_FenceScopeAttr: - let assemblyFormat = "$value"; - } - --#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD -\ No newline at end of file -+#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD -diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -index c32c7541c397..13a0bff5de1a 100644 ---- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -@@ -411,42 +411,33 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { - is fixed to the hardware supportted subgroup size, e.g., 16 on PVC, - implying each element in the array corresponds to a work-item (SIMT lane) - in the subgroup. -- * chunk_size: [optional attribute] indicates number of continious -- elements accessed for each offset, default is 1. - - Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] - ```mlir - %a = memref.alloc() : memref<1024xf32> -- %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32> -+ %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32, chunk_size_per_lane = 1> - ``` - - Example 2. It assumes subgroup size is 4, and each workitem access 8 elements. 
- It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] - ```mlir - %0 = memref.alloc() : memref<1024xf32> -- %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> -+ %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8> - ``` - - Example 3. It is similar to Example 2, but there is some overlaps among workitems. - It accesses: a[0:7], a[4:11], a[8:15], a[12:19] - ```mlir - %0 = memref.alloc() : memref<1024xf32> -- %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> -+ %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8>> - ``` - }]; - - let arguments = (ins XeGPU_BaseAddrType: $source, - Variadic: $offsets, -- DenseI64ArrayAttr: $const_offsets, -- DefaultValuedAttr: $chunk_size); -+ DenseI64ArrayAttr: $const_offsets); - let results = (outs XeGPU_TensorDesc:$TensorDesc); - -- let builders = [ -- OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, -- "llvm::ArrayRef": $offsets, -- CArg<"uint32_t", "1"> : $chunk_size)>, -- ]; -- - let assemblyFormat = [{ - $source - custom($offsets, $const_offsets) -@@ -723,7 +714,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] - - def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, - AllElementTypesMatch<["tensorDesc", "value", "result"]>, -- AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> { -+ AllShapesMatch<["tensorDesc", "value", "result"]>]> { - let summary = "Atomic ready-modify-write operation on the TensorDesc. "; - - let description = [{ -diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td -index 9f101a71697b..8b22baf365af 100644 ---- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td -+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td -@@ -88,11 +88,14 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", - TypeBuilderWithInferredContext<(ins - "llvm::ArrayRef": $shape, - "mlir::Type": $elementType, -- CArg<"bool", "false">: $scattered, - CArg<"int", "1">: $array_length, -- CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, -- CArg<"bool", "true">: $boundary_check -- )> -+ CArg<"bool", "true">: $boundary_check, -+ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope)>, -+ TypeBuilderWithInferredContext<(ins -+ "llvm::ArrayRef": $shape, -+ "mlir::Type": $elementType, -+ CArg<"int", "1">: $chunk_size, -+ CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope)> - ]; - - let extraClassDeclaration = [{ -@@ -110,40 +113,58 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", - return llvm::cast(cloneWith(getShape(), elementType)); - } - -- TensorDescAttr getEncodingAsTensorDescAttr() const { -- return llvm::dyn_cast_if_present(getEncoding()); -+ BlockTensorDescAttr getEncodingAsBlockTensorDescAttr() const { -+ return llvm::dyn_cast_if_present(getEncoding()); -+ } -+ -+ ScatterTensorDescAttr getEncodingAsScatterTensorDescAttr() const { -+ return llvm::dyn_cast_if_present(getEncoding()); - } - - xegpu::MemoryScope getMemoryScope() const { -- auto attr = getEncodingAsTensorDescAttr(); -- if (attr && attr.getMemoryScope()) -- return attr.getMemoryScope().getValue(); -+ auto block_attr = getEncodingAsBlockTensorDescAttr(); -+ if (block_attr && block_attr.getMemoryScope()) -+ return block_attr.getMemoryScope().getValue(); -+ -+ 
auto scatter_attr = getEncodingAsScatterTensorDescAttr(); -+ if (scatter_attr && scatter_attr.getMemoryScope()) -+ return scatter_attr.getMemoryScope().getValue(); -+ - // return default value - return MemoryScope::Global; - } - - int getArrayLength() { -- auto attr = getEncodingAsTensorDescAttr(); -- if (attr && attr.getArrayLength()) -- return attr.getArrayLength().getInt(); -+ auto attr = getEncoding(); -+ auto block_attr = mlir::dyn_cast_if_present(attr); -+ assert((!attr || block_attr) && "invalid on non BlockTensorDescAttr."); -+ if (block_attr && block_attr.getArrayLength()) -+ return block_attr.getArrayLength().getInt(); - // return default value - return 1; - } - - bool getBoundaryCheck() { -- auto attr = getEncodingAsTensorDescAttr(); -- if (attr && attr.getBoundaryCheck()) -- return attr.getBoundaryCheck().getValue(); -+ auto attr = getEncoding(); -+ auto block_attr = mlir::dyn_cast_if_present(attr); -+ assert((!attr || block_attr) && "invalid on non BlockTensorDescAttr."); -+ if (block_attr && block_attr.getBoundaryCheck()) -+ return block_attr.getBoundaryCheck().getValue(); - // return default value - return true; - } - -- bool getScattered() { -- auto attr = getEncodingAsTensorDescAttr(); -- if (attr && attr.getScattered()) -- return attr.getScattered().getValue(); -- // return default value -- return false; -+ bool isScattered() { -+ return bool(getEncodingAsScatterTensorDescAttr()); -+ } -+ -+ int getChunkSize() { -+ auto attr = getEncoding(); -+ auto scatter_attr = mlir::dyn_cast_if_present(attr); -+ assert((!attr || scatter_attr) && "invalid on non ScatterTensorDescAttr."); -+ if (scatter_attr && scatter_attr.getChunkSize()) -+ return scatter_attr.getChunkSize().getInt(); -+ return 1; - } - }]; - -diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp -index 24719fe748fe..0eab601bbaac 100644 ---- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp -+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp -@@ -30,18 +30,28 @@ void XeGPUDialect::initialize() { - } - - //===----------------------------------------------------------------------===// --// XeGPU_TensorDescAttr -+// XeGPU_BlockTensorDescAttr - //===----------------------------------------------------------------------===// --TensorDescAttr TensorDescAttr::get(mlir::MLIRContext *context, -- xegpu::MemoryScope memory_scope, -- int array_length, bool boundary_check, -- bool scattered) { -+BlockTensorDescAttr BlockTensorDescAttr::get(mlir::MLIRContext *context, -+ xegpu::MemoryScope memory_scope, -+ int array_length, bool boundary_check) { - auto scopeAttr = MemoryScopeAttr::get(context, memory_scope); - auto lengthAttr = - IntegerAttr::get(IntegerType::get(context, 64), array_length); - auto boundaryAttr = BoolAttr::get(context, boundary_check); -- auto scatteredAttr = BoolAttr::get(context, scattered); -- return Base::get(context, scopeAttr, lengthAttr, boundaryAttr, scatteredAttr); -+ return Base::get(context, scopeAttr, lengthAttr, boundaryAttr); -+} -+ -+//===----------------------------------------------------------------------===// -+// XeGPU_ScatterTensorDescAttr -+//===----------------------------------------------------------------------===// -+ScatterTensorDescAttr ScatterTensorDescAttr::get(mlir::MLIRContext *context, -+ xegpu::MemoryScope memory_scope, -+ int chunk_size) { -+ auto scopeAttr = MemoryScopeAttr::get(context, memory_scope); -+ auto chunkSizeAttr = -+ IntegerAttr::get(IntegerType::get(context, 64), chunk_size); -+ return Base::get(context, scopeAttr, 
chunkSizeAttr); - } - - //===----------------------------------------------------------------------===// -@@ -108,12 +118,18 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const { - } - - TensorDescType TensorDescType::get(llvm::ArrayRef shape, -- mlir::Type elementType, bool scattered, -- int array_length, MemoryScope memory_scope, -- bool boundary_check) { -+ mlir::Type elementType, int array_length, -+ bool boundary_check, MemoryScope memory_scope) { -+ auto context = elementType.getContext(); -+ auto attr = BlockTensorDescAttr::get(context, memory_scope, array_length, boundary_check); -+ return Base::get(context, shape, elementType, attr); -+} -+ -+TensorDescType TensorDescType::get(llvm::ArrayRef shape, -+ mlir::Type elementType, int chunk_size, -+ MemoryScope memory_scope) { - auto context = elementType.getContext(); -- auto attr = TensorDescAttr::get(context, memory_scope, array_length, -- boundary_check, scattered); -+ auto attr = ScatterTensorDescAttr::get(context, memory_scope, chunk_size); - return Base::get(context, shape, elementType, attr); - } - -diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -index 8e185b8d2586..ee3834bd0d9c 100644 ---- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -@@ -153,7 +153,7 @@ LogicalResult CreateNdDescOp::verify() { - return emitOpError("TensorDesc should have the same element " - "type with the source if it is a memref.\n"); - -- if (getType().getScattered()) -+ if (getType().isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - - return success(); -@@ -164,7 +164,7 @@ LogicalResult CreateNdDescOp::verify() { - //===----------------------------------------------------------------------===// - LogicalResult PrefetchNdOp::verify() { - auto tdescTy = getTensorDescType(); -- if (tdescTy.getScattered()) -+ if (tdescTy.isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - - if (!isReadHintOrNone(getL1HintAttr())) -@@ -189,7 +189,7 @@ LogicalResult LoadNdOp::verify() { - if (tdescTy.getRank() > 2) - return emitOpError("Expecting a 1D/2D TensorDesc.\n"); - -- if (tdescTy.getScattered()) -+ if (tdescTy.isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - - if (!valueTy) -@@ -257,7 +257,7 @@ LogicalResult StoreNdOp::verify() { - if (dstTy.getRank() > 2) - return emitOpError("Expecting a 1D/2D TensorDesc.\n"); - -- if (dstTy.getScattered()) -+ if (dstTy.isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - - if (!valTy) -@@ -280,7 +280,7 @@ LogicalResult StoreNdOp::verify() { - //===----------------------------------------------------------------------===// - LogicalResult UpdateNdOffsetOp::verify() { - auto ty = getTensorDescType(); -- if (ty.getScattered()) -+ if (ty.isScattered()) - return emitOpError("Expects a non-scattered TensorDesc.\n"); - - // number of offsets specified must match the rank of the tensor descriptor -@@ -293,28 +293,19 @@ LogicalResult UpdateNdOffsetOp::verify() { - //===----------------------------------------------------------------------===// - // XeGPU_CreateDescOp - //===----------------------------------------------------------------------===// --void CreateDescOp::build(OpBuilder &builder, OperationState &state, -- TensorDescType TensorDesc, Value source, -- llvm::ArrayRef offsets, -- uint32_t chunk_size) { -- llvm::SmallVector staticOffsets; -- llvm::SmallVector dynamicOffsets; -- dispatchIndexOpFoldResults(offsets, 
dynamicOffsets, staticOffsets); -- build(builder, state, TensorDesc, source, dynamicOffsets, staticOffsets, -- chunk_size); --} - - LogicalResult CreateDescOp::verify() { - auto tdescTy = getTensorDescType(); -- auto chunkSize = getChunkSize(); - - if (getRankOf(getSource()) > 1) - return emitOpError( - "Expecting the source is a 1D memref or pointer (uint64_t)."); - -- if (!tdescTy.getScattered()) -+ if (!tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc.\n"); - -+ auto chunkSize = tdescTy.getChunkSize(); -+ - SmallVector shape({(int64_t)getNumOffsets()}); - if (chunkSize != 1) - shape.push_back(chunkSize); -@@ -332,7 +323,7 @@ LogicalResult CreateDescOp::verify() { - //===----------------------------------------------------------------------===// - LogicalResult PrefetchOp::verify() { - auto tdescTy = getTensorDescType(); -- if (!tdescTy.getScattered()) -+ if (!tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc.\n"); - - if (!isReadHintOrNone(getL1HintAttr())) -@@ -355,7 +346,7 @@ LogicalResult LoadGatherOp::verify() { - auto maskTy = getMaskType(); - auto valueTy = getValueType(); - -- if (!tdescTy.getScattered()) -+ if (!tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc.\n"); - - if (!isReadHintOrNone(getL1HintAttr())) -@@ -401,7 +392,7 @@ LogicalResult LoadGatherOp::verify() { - //===----------------------------------------------------------------------===// - LogicalResult StoreScatterOp::verify() { - auto tdescTy = getTensorDescType(); -- if (!tdescTy.getScattered()) -+ if (!tdescTy.isScattered()) - return emitOpError("Expects a scattered TensorDesc.\n"); - - if (!isWriteHintOrNone(getL1HintAttr())) --- -2.34.1 diff --git a/build_tools/patches/0008-xegpu-temporary-downstream-defintion-changes.patch b/build_tools/patches/0008-xegpu-temporary-downstream-defintion-changes.patch index 623657e55..72b2739c6 100644 --- a/build_tools/patches/0008-xegpu-temporary-downstream-defintion-changes.patch +++ b/build_tools/patches/0008-xegpu-temporary-downstream-defintion-changes.patch @@ -1,19 +1,19 @@ -From c5e6d0bd63d6aab004ae4e795f1466800c54b3ff Mon Sep 17 00:00:00 2001 -From: Chao Chen -Date: Thu, 29 Aug 2024 19:18:42 +0000 -Subject: [PATCH] Add temporary changes for downstream: - add transposeBitWidth - for load_nd - add CompileHintOp +From 0829723718f1e80834d9d0051069e263fcfea82a Mon Sep 17 00:00:00 2001 +From: izamyati +Date: Tue, 24 Sep 2024 18:25:53 -0500 +Subject: [PATCH] xegpu temporary downstream defintion changes --- - mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 6 ++++++ - mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +- - 2 files changed, 7 insertions(+), 1 deletion(-) + mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 6 ++++++ + mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp | 1 + + mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 2 +- + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -index 13a0bff5de1a..64b15fd1cc32 100644 +index e24a056de2ca..948cc40e8595 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -@@ -285,6 +285,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "Tensor +@@ -302,6 +302,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "Tensor let arguments = (ins XeGPU_TensorDesc: $TensorDesc, OptionalAttr: $packed, OptionalAttr: $transpose, @@ -21,7 +21,7 @@ index 
13a0bff5de1a..64b15fd1cc32 100644 OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, OptionalAttr: $l3_hint); -@@ -805,4 +806,9 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> { +@@ -850,4 +851,9 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> { let extraClassDeclaration = extraBaseClassDeclaration; } @@ -31,11 +31,23 @@ index 13a0bff5de1a..64b15fd1cc32 100644 +} + #endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD +diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +index fa0344276553..849de4fced8f 100644 +--- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp ++++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +@@ -184,6 +184,7 @@ struct TransferReadLowering : public OpRewritePattern { + xegpu::CachePolicyAttr hint = nullptr; + auto loadOp = rewriter.create( + loc, vecTy, ndDesc, /*packed=*/nullptr, transposeAttr, ++ /*transpose_bit_width*/nullptr, + /*l1_hint=*/hint, + /*l2_hint=*/hint, /*l3_hint=*/hint); + rewriter.replaceOp(readOp, loadOp); diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -index ee3834bd0d9c..98fc3308d96e 100644 +index 1a7a6b347840..121a7007208b 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -@@ -222,7 +222,7 @@ LogicalResult LoadNdOp::verify() { +@@ -236,7 +236,7 @@ LogicalResult LoadNdOp::verify() { emitWarning("Invalid transpose attr. It is ignored."); } diff --git a/build_tools/patches/0009-SPIR-V-Enable-native-bf16-support-in-SPIR-V-dialect.patch b/build_tools/patches/0009-SPIR-V-Enable-native-bf16-support-in-SPIR-V-dialect.patch index 248af9c41..d282deb4c 100644 --- a/build_tools/patches/0009-SPIR-V-Enable-native-bf16-support-in-SPIR-V-dialect.patch +++ b/build_tools/patches/0009-SPIR-V-Enable-native-bf16-support-in-SPIR-V-dialect.patch @@ -1,9 +1,7 @@ -From a2e340bcdb9936074795f1d28bef235be33a53b8 Mon Sep 17 00:00:00 2001 -From: Md Abdullah Shahneous Bari -Date: Tue, 20 Aug 2024 21:38:22 +0000 -Subject: [PATCH] This Patch enables Khronos extension: SPV_KHR_bfloat16. Most - of the ops specified in the extension is supported. Some notable exceptions - are: OpDot, OpCooperativeMatrixMulAddKHR. 
+From 1f270ef0932e583d3d12fa9af7082ddecf8d9546 Mon Sep 17 00:00:00 2001 +From: izamyati +Date: Tue, 24 Sep 2024 18:19:04 -0500 +Subject: [PATCH] SPIR-V Enable native bf16 support in SPIR-V dialect --- .../Dialect/SPIRV/IR/SPIRVArithmeticOps.td | 10 ++--- @@ -66,7 +64,7 @@ index 22d5afcd7738..de9e11493793 100644 let assemblyFormat = "operands attr-dict `:` type($vector1) `->` type($result)"; diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td -index 04952dd1dc61..6c9c348490ab 100644 +index ddaeb13ef253..9b43dbfe2341 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -343,6 +343,7 @@ def SPV_KHR_subgroup_rotate : I32EnumAttrCase<"SPV_KHR_subgroup @@ -77,7 +75,7 @@ index 04952dd1dc61..6c9c348490ab 100644 def SPV_EXT_demote_to_helper_invocation : I32EnumAttrCase<"SPV_EXT_demote_to_helper_invocation", 1000>; def SPV_EXT_descriptor_indexing : I32EnumAttrCase<"SPV_EXT_descriptor_indexing", 1001>; -@@ -435,7 +436,7 @@ def SPIRV_ExtensionAttr : +@@ -434,7 +435,7 @@ def SPIRV_ExtensionAttr : SPV_KHR_fragment_shader_barycentric, SPV_KHR_ray_cull_mask, SPV_KHR_uniform_group_instructions, SPV_KHR_subgroup_rotate, SPV_KHR_non_semantic_info, SPV_KHR_terminate_invocation, @@ -86,7 +84,7 @@ index 04952dd1dc61..6c9c348490ab 100644 SPV_EXT_demote_to_helper_invocation, SPV_EXT_descriptor_indexing, SPV_EXT_fragment_fully_covered, SPV_EXT_fragment_invocation_density, SPV_EXT_fragment_shader_interlock, SPV_EXT_physical_storage_buffer, -@@ -1193,6 +1194,22 @@ def SPIRV_C_ShaderClockKHR : I32EnumAttrCase<"Shade +@@ -1192,6 +1193,22 @@ def SPIRV_C_ShaderClockKHR : I32EnumAttrCase<"Shade Extension<[SPV_KHR_shader_clock]> ]; } @@ -109,7 +107,7 @@ index 04952dd1dc61..6c9c348490ab 100644 def SPIRV_C_FragmentFullyCoveredEXT : I32EnumAttrCase<"FragmentFullyCoveredEXT", 5265> { list implies = [SPIRV_C_Shader]; list availability = [ -@@ -1491,6 +1508,7 @@ def SPIRV_CapabilityAttr : +@@ -1484,6 +1501,7 @@ def SPIRV_CapabilityAttr : SPIRV_C_RayQueryKHR, SPIRV_C_RayTracingKHR, SPIRV_C_Float16ImageAMD, SPIRV_C_ImageGatherBiasLodAMD, SPIRV_C_FragmentMaskAMD, SPIRV_C_StencilExportEXT, SPIRV_C_ImageReadWriteLodAMD, SPIRV_C_Int64ImageEXT, SPIRV_C_ShaderClockKHR, @@ -117,7 +115,7 @@ index 04952dd1dc61..6c9c348490ab 100644 SPIRV_C_FragmentFullyCoveredEXT, SPIRV_C_MeshShadingNV, SPIRV_C_FragmentDensityEXT, SPIRV_C_ShaderNonUniform, SPIRV_C_RuntimeDescriptorArray, SPIRV_C_StorageTexelBufferArrayDynamicIndexing, SPIRV_C_RayTracingNV, -@@ -4148,16 +4166,21 @@ def SPIRV_Bool : TypeAlias; +@@ -4139,16 +4157,21 @@ def SPIRV_Bool : TypeAlias; def SPIRV_Integer : AnyIntOfWidths<[8, 16, 32, 64]>; def SPIRV_Int16 : TypeAlias; def SPIRV_Int32 : TypeAlias; @@ -142,7 +140,7 @@ index 04952dd1dc61..6c9c348490ab 100644 // Component type check is done in the type parser for the following SPIR-V // dialect-specific types so we use "Any" here. 
def SPIRV_AnyPtr : DialectType; @@ -152,14 +150,14 @@ index 04952dd1dc61..6c9c348490ab 100644 def SPIRV_Aggregate : AnyTypeOf<[SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct]>; def SPIRV_Composite : AnyTypeOf<[SPIRV_Vector, SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct, - SPIRV_AnyCooperativeMatrix, SPIRV_AnyJointMatrix, SPIRV_AnyMatrix]>; + SPIRV_AnyCooperativeMatrix, SPIRV_AnyMatrix]>; def SPIRV_Type : AnyTypeOf<[ - SPIRV_Void, SPIRV_Bool, SPIRV_Integer, SPIRV_Float, SPIRV_Vector, + SPIRV_Void, SPIRV_Bool, SPIRV_Integer, SPIRV_Float, SPIRV_BFloat16KHR, SPIRV_Vector, SPIRV_AnyPtr, SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct, - SPIRV_AnyCooperativeMatrix, SPIRV_AnyJointMatrix, SPIRV_AnyMatrix, - SPIRV_AnySampledImage -@@ -4764,6 +4787,12 @@ def SPIRV_FPFMM_AllowReassocINTEL : I32BitEnumAttrCaseBit<"AllowReassocINTEL", 1 + SPIRV_AnyCooperativeMatrix, SPIRV_AnyMatrix, SPIRV_AnySampledImage + ]>; +@@ -4738,6 +4761,12 @@ def SPIRV_FPFMM_AllowReassocINTEL : I32BitEnumAttrCaseBit<"AllowReassocINTEL", 1 ]; } @@ -273,7 +271,7 @@ index b05ee0251df5..a5c8aa8fb450 100644 let summary = [{ Convert value numerically from one floating-point width to another diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp -index 654b0a8a2ed0..74f7d06d5272 100644 +index b4ad5923e975..d477c089732a 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVDialect.cpp @@ -171,8 +171,10 @@ static Type parseAndVerifyType(SPIRVDialect const &dialect, @@ -290,10 +288,10 @@ index 654b0a8a2ed0..74f7d06d5272 100644 } } else if (auto t = llvm::dyn_cast(type)) { diff --git a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp -index 0eac34ee3a0f..16dcdd60a4bb 100644 +index 542c6beba2e4..27bfc1871528 100644 --- a/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp +++ b/mlir/lib/Dialect/SPIRV/IR/SPIRVTypes.cpp -@@ -596,7 +596,7 @@ bool ScalarType::classof(Type type) { +@@ -521,7 +521,7 @@ bool ScalarType::classof(Type type) { } bool ScalarType::isValid(FloatType type) { @@ -302,7 +300,7 @@ index 0eac34ee3a0f..16dcdd60a4bb 100644 } bool ScalarType::isValid(IntegerType type) { -@@ -605,6 +605,14 @@ bool ScalarType::isValid(IntegerType type) { +@@ -530,6 +530,14 @@ bool ScalarType::isValid(IntegerType type) { void ScalarType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, std::optional storage) { @@ -317,7 +315,7 @@ index 0eac34ee3a0f..16dcdd60a4bb 100644 // 8- or 16-bit integer/floating-point numbers will require extra extensions // to appear in interface storage classes. See SPV_KHR_16bit_storage and // SPV_KHR_8bit_storage for more details. 
-@@ -623,7 +631,7 @@ void ScalarType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, +@@ -548,7 +556,7 @@ void ScalarType::getExtensions(SPIRVType::ExtensionArrayRefVector &extensions, [[fallthrough]]; case StorageClass::Input: case StorageClass::Output: @@ -326,7 +324,7 @@ index 0eac34ee3a0f..16dcdd60a4bb 100644 static const Extension exts[] = {Extension::SPV_KHR_16bit_storage}; ArrayRef ref(exts, std::size(exts)); extensions.push_back(ref); -@@ -710,7 +718,20 @@ void ScalarType::getCapabilities( +@@ -635,7 +643,20 @@ void ScalarType::getCapabilities( } else { assert(llvm::isa(*this)); switch (bitwidth) { @@ -349,7 +347,7 @@ index 0eac34ee3a0f..16dcdd60a4bb 100644 case 32: break; diff --git a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp -index 14fd4d5d4e40..4960dc7053e0 100644 +index cccf360b8e21..d38615eed7f1 100644 --- a/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp +++ b/mlir/lib/Target/SPIRV/Deserialization/Deserializer.cpp @@ -817,14 +817,20 @@ LogicalResult spirv::Deserializer::processType(spirv::Opcode opcode, @@ -377,7 +375,7 @@ index 14fd4d5d4e40..4960dc7053e0 100644 case 32: floatTy = opBuilder.getF32Type(); break; -@@ -1366,6 +1372,9 @@ LogicalResult spirv::Deserializer::processConstant(ArrayRef operands, +@@ -1330,6 +1336,9 @@ LogicalResult spirv::Deserializer::processConstant(ArrayRef operands, } else if (floatType.isF16()) { APInt data(16, operands[2]); value = APFloat(APFloat::IEEEhalf(), data); @@ -388,7 +386,7 @@ index 14fd4d5d4e40..4960dc7053e0 100644 auto attr = opBuilder.getFloatAttr(floatType, value); diff --git a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp -index 64c23c75d4cd..b7d50073db99 100644 +index 10e5264bffac..26a8f7bb5fa9 100644 --- a/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp +++ b/mlir/lib/Target/SPIRV/Serialization/Serializer.cpp @@ -477,6 +477,9 @@ LogicalResult Serializer::prepareBasicType( @@ -401,7 +399,7 @@ index 64c23c75d4cd..b7d50073db99 100644 return success(); } -@@ -984,7 +987,8 @@ uint32_t Serializer::prepareConstantFp(Location loc, FloatAttr floatAttr, +@@ -965,7 +968,8 @@ uint32_t Serializer::prepareConstantFp(Location loc, FloatAttr floatAttr, } words = llvm::bit_cast(value.convertToDouble()); encodeInstructionInto(typesGlobalValues, opcode, {typeID, resultID, words.word1, words.word2}); @@ -411,3 +409,5 @@ index 64c23c75d4cd..b7d50073db99 100644 uint32_t word = static_cast(value.bitcastToAPInt().getZExtValue()); encodeInstructionInto(typesGlobalValues, opcode, {typeID, resultID, word}); +-- +2.34.1 diff --git a/build_tools/patches/0010-refine-the-XeGPU-definition.patch b/build_tools/patches/0010-refine-the-XeGPU-definition.patch deleted file mode 100644 index 89d37c371..000000000 --- a/build_tools/patches/0010-refine-the-XeGPU-definition.patch +++ /dev/null @@ -1,206 +0,0 @@ -From 8a734652353bdd85b9cc7d2426e7395404372d72 Mon Sep 17 00:00:00 2001 -From: Chao Chen -Date: Wed, 28 Aug 2024 23:57:49 +0000 -Subject: [PATCH] refine the XeGPU definition - add verification for - scattered tensordesc regarding to chunk size and total size - refine - load_gather and store_scatter to reveal transpose effect - ---- - .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 40 +++++++++++------ - mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 1 + - mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 44 ++++++++++++++++--- - 3 files changed, 65 insertions(+), 20 deletions(-) - -diff --git 
a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -index a3922bbad2b3..3e0c6f243fd4 100644 ---- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td -@@ -413,24 +413,28 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { - implying each element in the array corresponds to a work-item (SIMT lane) - in the subgroup. - -+ The first dimension of the result TensorDesc corresponds to work-items, so it should -+ match the dimension of offsets. It may also has a second dimension corresponding to -+ the chunk_size if the chunk size is larger than 1. -+ - Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] - ```mlir - %a = memref.alloc() : memref<1024xf32> -- %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32, chunk_size_per_lane = 1> -+ %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32> - ``` - - Example 2. It assumes subgroup size is 4, and each workitem access 8 elements. - It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] - ```mlir - %0 = memref.alloc() : memref<1024xf32> -- %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8> -+ %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8> - ``` - - Example 3. It is similar to Example 2, but there is some overlaps among workitems. - It accesses: a[0:7], a[4:11], a[8:15], a[12:19] - ```mlir - %0 = memref.alloc() : memref<1024xf32> -- %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size_per_lane = 8>> -+ %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>> - ``` - }]; - -@@ -500,28 +504,31 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"] - - let description = [{ It (aka. load) load data per each work-item. The output - describes the data being loaded at the subgroup level, so its size is -- consistent with the number of work-items in a subgroup. When `chunk_size_per_lane` -- attribute is larger than 1 in TensorDesc, the output vector will be 2D vector, -- with dim-1 correspoding to the chunk size. -+ consistent with the number of work-items in a subgroup. When the chunk size -+ is larger than 2, the output vector is a 2D vector, with dim-1 correspoding -+ to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item. -+ Specially, there is a transpose effect on the result (as compared to the TensorDesc) -+ due to the hardware implementation. Therefore, a transpose attribute is introduced -+ on purpose, making sure users are aware of this implicit transformation. - - The mask operand masks out memory access so that it is safe to pass out-of-boundary - addresses/offsets as long as they are masked. It applies to slots of SIMD lanes. 
- - Example: - ```mlir -- %2 = xegpu.load %1, %0 {transpose = [1, 0], -+ %2 = xegpu.load %1, %0 {transpose, - l1_hint = #xegpu.cache_hint, - l2_hint = #xegpu.cache_hint, - l3_hint = #xegpu.cache_hint} -- : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> -- -> vector<16xf32> -+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, -+ vector<16xi1> -> vector<16xf32> - ``` - - }]; - - let arguments = (ins XeGPU_TensorDesc: $TensorDesc, - XeGPU_MaskType: $mask, -- OptionalAttr: $transpose, -+ OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); -@@ -553,11 +560,15 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"] - let hasVerifier = 1; - } - --def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>, -- AllElementTypesMatch<["value", "TensorDesc"]>]> { -+def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "TensorDesc"]>, -+ AllElementTypesMatch<["value", "TensorDesc"]>]> { - let summary = "store data to scattered memory locations."; -- let description = [{ It (aka. store) stores data to scattered memory locations. -- It has similar semantic to `load_gather`. -+ let description = [{ It (aka. store) stores data to scattered memory locations. The value is -+ typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be -+ a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes -+ and the dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter` -+ has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is -+ introduced on purpose, making sure users are aware of this implicit transformation. - - Example: - ```mlir -@@ -572,6 +583,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDe - XeGPU_ValueType: $value, - XeGPU_TensorDesc: $TensorDesc, - XeGPU_MaskType: $mask, -+ OptionalAttr: $transpose, - OptionalAttr: $l1_hint, - OptionalAttr: $l2_hint, - OptionalAttr: $l3_hint); -diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp -index 0eab601bbaac..555c232ff1f0 100644 ---- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp -+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp -@@ -57,6 +57,7 @@ ScatterTensorDescAttr ScatterTensorDescAttr::get(mlir::MLIRContext *context, - //===----------------------------------------------------------------------===// - // XeGPU_TensorDescType - //===----------------------------------------------------------------------===// -+ - mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) { - llvm::SmallVector shape; - mlir::Type elementType; -diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -index c9e399a7149f..b35a639540aa 100644 ---- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp -@@ -305,6 +305,26 @@ LogicalResult CreateDescOp::verify() { - - auto chunkSize = tdescTy.getChunkSize(); - -+ // check chunk_size -+ llvm::SmallVector supportedChunkSizes = {1, 2, 3, 4, 8, 16, 32, 64, 128, 256}; -+ if (!llvm::is_contained(supportedChunkSizes, chunkSize)) -+ return emitOpError("Invalid chunk_size. 
Supported values are 1, 2, 3, 4, 8, 16, 32, 64, 128, or 256."); -+ -+ // check total size -+ auto elemBits = tdescTy.getElementType().getIntOrFloatBitWidth(); -+ auto bitsPerLane = elemBits * chunkSize; -+ if (chunkSize > 1 && bitsPerLane % 32) { -+ // For 8-bit and 16-bit data, the hardware only supports chunk size of 1. -+ // For 32-bit data, the hardware can support larger larger chunk size. So -+ // we can bitcast 8-bit/16-bit data to 32-bit data for better performance. -+ // But this requires the total size is 32 bit aligned to make the optimization work. -+ return emitOpError("access size (chunk_size * sizeof(elemTy)) should be 32-bit aligned."); -+ } -+ -+ auto lscConstraints = 512 * 8; // each access is upto 512 bytes. -+ if (elemBits * tdescTy.getNumElements() > lscConstraints) -+ return emitOpError("total access size (simd_lanes * chunk_size * sizeof(elemTy)) is upto 512 bytes."); -+ - SmallVector shape({(int64_t)getNumOffsets()}); - if (chunkSize != 1) - shape.push_back(chunkSize); -@@ -370,14 +390,13 @@ LogicalResult LoadGatherOp::verify() { - if (tdescShape[0] != maskShape[0]) - return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); - -- if (getTransposeAttr()) { -- auto trans = getTranspose().value(); -- if (tdescShape.size() < trans.size()) -- emitWarning("Invalid transpose attr. It is ignored."); -- else -- transpose(trans, tdescShape); -+ if (tdescTy.getRank() == 2) { -+ if (!getTransposeAttr()) -+ return emitOpError("load_gather has to be transposed."); -+ transpose({1, 0}, tdescShape); - } - -+ - if (valueShape != tdescShape) - return emitOpError("Unexpected result shape") - << "(Expected shape: " << makeString(tdescShape) -@@ -404,11 +423,24 @@ LogicalResult StoreScatterOp::verify() { - return emitOpError("invlid l3_hint: ") << getL3HintAttr(); - - auto maskTy = getMaskType(); -+ auto valueTy = getValueType(); - auto maskShape = getShapeOf(maskTy); - auto tdescShape = getShapeOf(tdescTy); -+ auto valueShape = getShapeOf(valueTy); - if (tdescShape[0] != maskShape[0]) - return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); - -+ if (tdescTy.getRank() == 2) { -+ if (!getTransposeAttr()) -+ return emitOpError("load_gather has to be transposed."); -+ transpose({1, 0}, tdescShape); -+ } -+ -+ if (valueShape != tdescShape) -+ return emitOpError("Unexpected value shape") -+ << "(Expected shape: " << makeString(tdescShape) -+ << ", Given shape: " << makeString(valueShape) << ").\n"; -+ - return success(); - } - //===----------------------------------------------------------------------===// --- -2.34.1 diff --git a/docs/rfcs/XeGPU.md b/docs/rfcs/XeGPU.md index 0524f4175..3d87e116f 100644 --- a/docs/rfcs/XeGPU.md +++ b/docs/rfcs/XeGPU.md @@ -16,13 +16,13 @@ Below is a summary. 
| Ops | Syntax | Example | | :--- | :---- | :--- | -|create_tdesc | operation ::= xegpu.create_tdesc $base_addr, $offset attr-dict : type($base_addr), type($offset) -> type($tdesc) | %scatter_tdesc = xegpu.create_tdesc %mem_addr, %offset: int64, Vector<16 x index> -> tensor_desc<16 x bf16, #xegpu.scatter_tdesc_attr> | +|create_tdesc | operation ::= xegpu.create_tdesc $base_addr, $offset attr-dict : type($base_addr), type($offset) -> type($tdesc) | %scatter_tdesc = xegpu.create_tdesc %mem_addr, %offset: int64, Vector<16 x index> -> tensor_desc<16 x bf16, #xegpu.scatter_tdesc_attr> | |load_gather | operation ::= xegpu.load_gather $tdesc, $mask attr-dict : type($tdesc), type($mask) -> type($res) | %result = xegpu.load_gather %scatter_tdesc, %mask {L1 = cached, L2 = uncached, transpose} : tensor_desc<16x8xbf16, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<8x16xbf16> | |store_scatter | operation ::= xegpu.store_scatter $value, $tdesc, $mask attr-dict : type($value), type($tdesc), type($mask) | xegpu.store_scatter %value, %scatter_tdesc, %mask {L1 = cached, L2 = uncached} : vector<16xbf16>, tensor_desc<16xbf16, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> | |update_offset | operation ::= xegpu.update_offset $tdesc, $delta : type($tdesc), type($delta) -> type($tdesc) | %tdesc_updated = xegpu.update_offset %tdesc, %offsets: tensor_desc<16xbf16, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> -> tensor_desc<16xbf16, #xegpu.scatter_tdesc_attr<>> | |Prefetch | operation ::= xegpu.prefetch $tdesc attr-dict : type($tdesc) | xegpu.prefetch %scatter_tdesc1 {L1 = cached, L2 = uncached} : tensor_desc<16xbf16, #xegpu.scatter_tdesc_attr<>> | |atomic_rmw | operation ::= xegpu.atomic_rmw $kind, $value, $tdesc, $mask attr-dict : type($value), type($tdesc), type($mask) | %ret_value = xegpu.atomic_rmw “addf”, %value, %scatter_mem2, %mask : vector<16xbf16>, tensor_desc<16xbf16, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> | -|create_nd_tdesc | operation ::= xegpu.create_nd_tdesc $base_addr, $offset0, $offset1, $tdim0, $tdim1, $tstride0 attr-dict : type($base_addr), index, index, index, index, index, index -> type($tdesc) | %tdesc = xegpu.create_nd_tdesc %mem_addr, %tile_offset:2, %base_shape:2,%base_strides:2: int64, index, index, index, index, index, index -> tensor_desc<8x16xbf16, #xegpu.block_tdesc_attr> | +|create_nd_tdesc | operation ::= xegpu.create_nd_tdesc $base_addr, $offset0, $offset1, $tdim0, $tdim1, $tstride0 attr-dict : type($base_addr), index, index, index, index, index, index -> type($tdesc) | %tdesc = xegpu.create_nd_tdesc %mem_addr, %tile_offset:2, %base_shape:2,%base_strides:2: int64, index, index, index, index, index, index -> tensor_desc<8x16xbf16, #xegpu.block_tdesc_attr> | |load_nd | operation ::= xegpu.load_nd $tdesc attr-dict : type($tdesc) -> type($res) | %result = xegpu.load_nd %tdesc {L1_hint = uncached, L3_hint = uncached} : tensor_desc<8x16xbf16> -> vector<8x16xbf16> | |dpas | operation ::= xegpu.dpas $matC, $matA, $matB attr_dict : type($matC), type($matA), type($matB) -> type($res) | %vector_c = xegpu.dpas %vector_c, %vector_a, %vector_b: vector<8x16xfloat>, vector<8x8x2xbf16>, vector<8x16x2xbf16> -> vector<8x16xfloat> | |store_nd | operation ::= xegpu.store_nd $value, $tdesc attr-dict : type($value), type($tdesc) | xegpu.store_nd %value, %tdesc {L1_hint = uncached, L3_hint = uncached} : vector<8x16xbf16>, tensor_desc<8x16xbf16> | @@ -66,7 +66,7 @@ data fragments and will be introduced in the next section in details. 
XeGPU oper `create_nd_tdesc` can also accept an optional `block_tdesc_attr` to extend its capablity. The `block_tdesc_attr` could encode the following optional attributes: -- `memory_scope`. It describes where the data block being described is located. `global` means device memory, or `slm` means shared local memory. +- `memory_space`. It describes where the data block being described is located. `global` means device memory, or `slm` means shared local memory. It is default to `global`. However, it has to match with the memory scope of the base addresses. If the base address is for shared local memory, than the memory scope of the tensor_desc has to be shared local memory too. - `array_length`. It is only used for load. It describes how many horizontally consecutive blocks will be loaded by a hardware load instruction. @@ -96,16 +96,16 @@ create_nd_tdesc also accepts a memref as input instead of a memory address, shap The example below accepts a memory address and an offset and creates a 1D tensor_desc. The tensor_desc describes a 1D vector that is loaded by all work items combined within the subgroup. ```mlir #sg_map_a = xegpu.sg_map - #tdesc_attr1 = !xegpu.block_tdesc_attr + #tdesc_attr1 = !xegpu.block_tdesc_attr %tdesc1 = xegpu.create_nd_tdesc %mem_addr, %offset : uint64, index into tensor_desc<16xbf16, #tdesc_attr1> - #tdesc_attr2 = !xegpu.block_tdesc_attr + #tdesc_attr2 = !xegpu.block_tdesc_attr %tdesc2 = xegpu.create_nd_tdesc %mem_addr, %offset : uint64, index into tensor_desc<16xbf16, #tdesc_attr2> ``` -Attribute `memory_scope` indicates whether the tensor is located in the global or shared local memory. The default value is global. +Attribute `memory_space` indicates whether the tensor is located in the global or shared local memory. The default value is global. Attribute `boundary_check` indicates whether the operation detects the boundary and pads with zero for out-of-boundary access. The default value is true. For 1D tensor description, the base_shape and base_stride are optional, the attribute “boundary_check” must be false, “%mem_add + %offset” must not access out-of-boundary memory to avoid undefined behavior. @@ -197,7 +197,7 @@ When this variant is used, the matrix B must be in VNNI layout, and the matrix A ``` `prefetch_nd` prefetches the memory specified by tensor_desc to cache. -Attributes `L1_hint`, `L2_hint`, `L3_hint`, and `memory_scope` can be applied to prefetch_nd. +Attributes `L1_hint`, `L2_hint`, `L3_hint`, and `memory_space` can be applied to prefetch_nd. ```mlir xegpu.prefetch_nd %tdesc2: tensor_desc<8x16xbf16> xegpu.prefetch_nd %tdesc2: tensor_desc<16xbf16> @@ -230,7 +230,7 @@ creates a tensor_desc, which describes the memory base address and offsets for 1 `scatter_tdesc_attr` could also contain the following optional attributes to extend the capbility of the operator, as shown in the following example. -- `memory_scope`. It has the same semantic to the one in `block_tdesc_attr`, describing where the data block being +- `memory_space`. It has the same semantic to the one in `block_tdesc_attr`, describing where the data block being described is located: global means device memory, and slm means shared local memory. It has to match with the memory scope of the base addresses. It is default to global. - `chunk_size`. It specifies the size being loaded per each work item, when each work item may load a consecutive @@ -241,7 +241,7 @@ as shown in the following example. 
a valid chunk size could be 2, 4, 8, 16, 32, 64, and for int8, a valid chunk size could be 4, 8, 16, 32, 64. ```mlir - #tdesc_attr = !xegpu.scatter_tdesc_attr< memory_scope=slm, chunk_size=8> + #tdesc_attr = !xegpu.scatter_tdesc_attr< memory_space=slm, chunk_size=8> %scatter_tdesc_chunk = xegpu.create_tdesc, %base_addr, %offsets : uint64, vector<16xindex> into tensor_desc<16x8xuint16, #tdesc_attr> ``` @@ -258,7 +258,7 @@ When loading a tensor_desc with chunk_size attribute, the output vector must be The transpose attribute must be present to explicitly describe the transpose effect. ```mlir - #tdesc_attr = #xegpu.scatter_tdesc_attr + #tdesc_attr = #xegpu.scatter_tdesc_attr %result = xegpu.load_gather %scatter_tdesc_chunk, %mask {L1 = cached, L2 = uncached, transpose} : tensor_desc<16x8xbf16, #tdesc_attr>, vector<16xi1> -> vector<8x16xbf16> ``` @@ -276,7 +276,7 @@ uint32, uint64. xegpu.store_scatter %value, %scatter_tdesc1, %mask : vector<16xuint16>, vector<16xi1>, tensor_desc<16xuint16, #xegpu.scatter_tdesc_attr<>> ``` -Attributes `L1_hint`, `L2_hint`, `L3_hint`, and `memory_scope` can be applied to `store_scatter`. Similar to `load_gather`, +Attributes `L1_hint`, `L2_hint`, `L3_hint`, and `memory_space` can be applied to `store_scatter`. Similar to `load_gather`, when the `chunk_size` of `tensor_desc` is specified, the `value` is a 2D vector with the shape of [chunk_size, subgroup_size]. `prefetch` prefetches data from the memory specified by tensor_desc. @@ -392,7 +392,7 @@ For load_nd with `transpose` attribute, wi_layout is transposed to match with th `xegpu.sg_map` is also used to describe the WI data distribution for regular load. Below example shows that each WI loads one fp32 data element. The result vector <16xfp32> is loaded and distributed to each WI as <1xf32>. ```mlir #sg_map_t = xegpu.sg_map - #scatter_attr = !xegpu.tdesc_attr< memory_scope=slm, scattered=true> + #scatter_attr = !xegpu.tdesc_attr< memory_space=slm, scattered=true> %scatter_tdesc = xegpu.create_tdesc, %src_addr, %offsets: uint64, vector<16xindex> into tensor_desc<16xfp32, #scatter_attr, #sg_map_t> @@ -403,7 +403,7 @@ For load_nd with `transpose` attribute, wi_layout is transposed to match with th Below example shows that each WI loads 4 fp32 data element with the chunk_size_per_lane. This load with chunk_size_per_lane is effectively load 2D tensor and transpose. The data fragement <1x4xf32> is loaded and transposed as <4x1xf32>. 
```mlir #sg_map_t = xegpu.sg_map - #scatter_attr = !xegpu.tdesc_attr< memory_scope=slm, scattered=true> + #scatter_attr = !xegpu.tdesc_attr< memory_space=slm, scattered=true> %scatter_tdesc_chunk = xegpu.create_tdesc, %src_addr, %offsets {chunk_size_per_lane=4} : uint64, vector<16xindex> into tensor_desc<16x4xfp32, #scatter_attr, #sg_map_t> @@ -554,7 +554,7 @@ An example on how to perform transpose using load_gather with chunk_size_per_lan ```mlir #sg_map_t = xegpu.sg_map - #scatter_attr = !xegpu.tdesc_attr< memory_scope=slm, scattered=true> + #scatter_attr = !xegpu.tdesc_attr< memory_space=slm, scattered=true> %scatter_tdesc_chunk = xegpu.create_tdesc, %src_addr, %offsets {chunk_size_per_lane=4} : uint64, vector<16xindex> into tensor_desc<16x4xfp32, #scatter_attr, #sg_map_t> @@ -563,7 +563,7 @@ An example on how to perform transpose using load_gather with chunk_size_per_lan tensor_desc<16x4xfp32, #tdesc_attr, #sg_map_t>, vector<16xi1> -> vector<4x1xfp32> #sg_map = xegpu.sg_map - #tdesc_attr = !xegpu.tdesc_attr< memory_scope=slm, boundary_check=false> + #tdesc_attr = !xegpu.tdesc_attr< memory_space=slm, boundary_check=false> %tdesc2 = xegpu.create_nd_tdesc %dest_addr, %offset: uint64, index into tensor_desc<64xfp32, #tdesc_attr> xegpu.store_nd %value, %tdesc2: diff --git a/include/imex/Dialect/XeTile/IR/XeTileAttrs.td b/include/imex/Dialect/XeTile/IR/XeTileAttrs.td index e8c168e19..70de8c0e8 100644 --- a/include/imex/Dialect/XeTile/IR/XeTileAttrs.td +++ b/include/imex/Dialect/XeTile/IR/XeTileAttrs.td @@ -64,7 +64,7 @@ def XeTile_TileAttr : XeTile_Attr<"XeTile", "tile_attr"> { OptionalParameter<"xetile::WorkGroupMapAttr">:$wg_map, DefaultValuedParameter<"mlir::DenseI32ArrayAttr", "mlir::DenseI32ArrayAttr::get($_ctxt, {1, 0})">:$order, OptionalParameter<"mlir::DenseI64ArrayAttr">:$inner_blocks, - OptionalParameter<"mlir::Attribute">:$memory_scope + OptionalParameter<"mlir::Attribute">:$memory_space ); let assemblyFormat = "`<` struct(params) `>`"; let genVerifyDecl = true; @@ -73,31 +73,31 @@ def XeTile_TileAttr : XeTile_Attr<"XeTile", "tile_attr"> { CArg<"xetile::WorkGroupMapAttr", "{}">:$wg_map, CArg<"llvm::ArrayRef", "{1, 0}">:$order, CArg<"llvm::ArrayRef", "{}">:$inner_blocks, - CArg<"int", "0">:$memory_scope), + CArg<"int", "0">:$memory_space), [{ mlir::Type intType = mlir::IntegerType::get($_ctxt, 32); return $_get($_ctxt, sg_map, wg_map, mlir::DenseI32ArrayAttr::get($_ctxt, order), mlir::DenseI64ArrayAttr::get($_ctxt, inner_blocks), - mlir::IntegerAttr::get(intType, memory_scope)); + mlir::IntegerAttr::get(intType, memory_space)); }]>, AttrBuilder<(ins CArg<"llvm::ArrayRef", "{1, 0}">:$order, - CArg<"int", "0">:$memory_scope), + CArg<"int", "0">:$memory_space), [{ mlir::Type intType = mlir::IntegerType::get($_ctxt, 32); return $_get($_ctxt, xetile::SubGroupMapAttr(), xetile::WorkGroupMapAttr(), mlir::DenseI32ArrayAttr::get($_ctxt, order), mlir::DenseI64ArrayAttr::get($_ctxt, {}), - mlir::IntegerAttr::get(intType, memory_scope)); + mlir::IntegerAttr::get(intType, memory_space)); }]>, AttrBuilder<(ins CArg<"xetile::SubGroupMapAttr", "{}">:$sg_map, CArg<"xetile::WorkGroupMapAttr", "{}">:$wg_map, CArg<"llvm::ArrayRef", "{1, 0}">:$order, - CArg<"int", "0">:$memory_scope), + CArg<"int", "0">:$memory_space), [{ mlir::Type intType = mlir::IntegerType::get($_ctxt, 32); return $_get($_ctxt, sg_map, wg_map, mlir::DenseI32ArrayAttr::get($_ctxt, order), mlir::DenseI64ArrayAttr::get($_ctxt, {}), - mlir::IntegerAttr::get(intType, memory_scope)); + mlir::IntegerAttr::get(intType, memory_space)); }]> 
]; } diff --git a/include/imex/Dialect/XeTile/IR/XeTileTypes.td b/include/imex/Dialect/XeTile/IR/XeTileTypes.td index 0e9ee2908..df6fa1b40 100644 --- a/include/imex/Dialect/XeTile/IR/XeTileTypes.td +++ b/include/imex/Dialect/XeTile/IR/XeTileTypes.td @@ -117,20 +117,20 @@ def XeTile : XeTile_Type<"Tile", "tile", [ShapedTypeInterface], return mlir::DenseI32ArrayAttr::get(getContext(), {1, 0}); } - mlir::Attribute getMemoryScope() { + mlir::Attribute getMemorySpace() { auto encoding = llvm::dyn_cast_if_present(getEncoding()); if (encoding) - return encoding.getMemoryScope(); + return encoding.getMemorySpace(); return mlir::Attribute(); } - int getMemoryScopeAsInt() { + int getMemorySpaceAsInt() { auto encoding = llvm::dyn_cast_if_present(getEncoding()); - if (encoding && encoding.getMemoryScope()) { - auto memoryScope = encoding.getMemoryScope(); - assert(mlir::isa(memoryScope) && + if (encoding && encoding.getMemorySpace()) { + auto MemorySpace = encoding.getMemorySpace(); + assert(mlir::isa(MemorySpace) && "Using `getMemorySpaceAsInt` with non-Integer attribute"); - return mlir::cast(memoryScope).getInt(); + return mlir::cast(MemorySpace).getInt(); } // return default value 0 indicating Global memory return 0; diff --git a/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp b/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp index 521dd2b5f..b4ea05ec8 100644 --- a/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp +++ b/lib/Conversion/GPUToSPIRV/GPUToSPIRVPass.cpp @@ -69,118 +69,6 @@ class GPUXToSPIRVPass : public impl::ConvertGPUXToSPIRVBase { bool mapMemorySpace; }; -class PrintfOpPattern : public mlir::OpConversionPattern { -public: - using mlir::OpConversionPattern::OpConversionPattern; - mlir::LogicalResult - matchAndRewrite(mlir::gpu::PrintfOp gpuPrintfOp, OpAdaptor adaptor, - mlir::ConversionPatternRewriter &rewriter) const override { - auto loc = gpuPrintfOp.getLoc(); - - auto funcOp = rewriter.getBlock() - ->getParent() - ->getParentOfType(); - - auto moduleOp = funcOp->getParentOfType(); - - const char formatStringPrefix[] = "printfMsg"; - unsigned stringNumber = 0; - mlir::SmallString<16> globalVarName; - mlir::spirv::GlobalVariableOp globalVar; - - // formulate spirv global variable name - do { - globalVarName.clear(); - (formatStringPrefix + llvm::Twine(stringNumber++)) - .toStringRef(globalVarName); - } while (moduleOp.lookupSymbol(globalVarName)); - - auto i8Type = rewriter.getI8Type(); - auto i32Type = rewriter.getI32Type(); - - unsigned scNum = 0; - auto createSpecConstant = [&](unsigned value) { - auto attr = rewriter.getI8IntegerAttr(value); - mlir::SmallString<16> specCstName; - (llvm::Twine(globalVarName) + "_sc" + llvm::Twine(scNum++)) - .toStringRef(specCstName); - - return rewriter.create( - loc, rewriter.getStringAttr(specCstName), attr); - }; - - // define GlobalVarOp with printf format string using SpecConstants - // and make composite of SpecConstants - { - mlir::Operation *parent = - mlir::SymbolTable::getNearestSymbolTable(gpuPrintfOp->getParentOp()); - - mlir::ConversionPatternRewriter::InsertionGuard guard(rewriter); - - mlir::Block &entryBlock = *parent->getRegion(0).begin(); - rewriter.setInsertionPointToStart( - &entryBlock); // insertion point at module level - - // Create Constituents with SpecConstant to construct - // SpecConstantCompositeOp - llvm::SmallString<20> formatString(gpuPrintfOp.getFormat()); - formatString.push_back('\0'); // Null terminate for C - mlir::SmallVector constituents; - for (auto c : formatString) { - auto cSpecConstantOp = createSpecConstant(c); 
- constituents.push_back(mlir::SymbolRefAttr::get(cSpecConstantOp)); - } - - // Create specialization constant composite defined via spirv.SpecConstant - size_t contentSize = constituents.size(); - auto globalType = mlir::spirv::ArrayType::get(i8Type, contentSize); - mlir::spirv::SpecConstantCompositeOp specCstComposite; - mlir::SmallString<16> specCstCompositeName; - (llvm::Twine(globalVarName) + "_scc").toStringRef(specCstCompositeName); - specCstComposite = rewriter.create( - loc, mlir::TypeAttr::get(globalType), - rewriter.getStringAttr(specCstCompositeName), - rewriter.getArrayAttr(constituents)); - - // Define GlobalVariable initialized from Constant Composite - globalVar = rewriter.create( - loc, - mlir::spirv::PointerType::get( - globalType, mlir::spirv::StorageClass::UniformConstant), - globalVarName, mlir::FlatSymbolRefAttr::get(specCstComposite)); - globalVar->setAttr("Constant", rewriter.getUnitAttr()); - } - - // Get SSA value of Global variable - mlir::Value globalPtr = - rewriter.create(loc, globalVar); - - mlir::Value fmtStr = rewriter.create( - loc, - mlir::spirv::PointerType::get( - i8Type, mlir::spirv::StorageClass::UniformConstant), - globalPtr); - - // Get printf arguments - auto argsRange = adaptor.getArgs(); - mlir::SmallVector printfArgs; - printfArgs.reserve(argsRange.size() + 1); - printfArgs.append(argsRange.begin(), argsRange.end()); - - rewriter.create(loc, i32Type, fmtStr, printfArgs); - - rewriter.eraseOp(gpuPrintfOp); - - return mlir::success(); - } -}; - -void populateGPUPrintfToSPIRVPatterns(mlir::SPIRVTypeConverter &typeConverter, - mlir::RewritePatternSet &patterns) { - - patterns.add(typeConverter, patterns.getContext()); -} - // This op: // vector.create_mask %maskVal : vector // is lowered to: @@ -519,7 +407,6 @@ void GPUXToSPIRVPass::runOnOperation() { mlir::populateSCFToSPIRVPatterns(typeConverter, scfToSpirvCtx, patterns); mlir::cf::populateControlFlowToSPIRVPatterns(typeConverter, patterns); mlir::populateMathToSPIRVPatterns(typeConverter, patterns); - imex::populateGPUPrintfToSPIRVPatterns(typeConverter, patterns); imex::populateVectorToSPIRVPatterns(typeConverter, patterns); if (failed(applyFullConversion(gpuModule, *target, std::move(patterns)))) diff --git a/lib/Conversion/XeGPUToVC/LSCPatterns.cpp b/lib/Conversion/XeGPUToVC/LSCPatterns.cpp index c883399b8..221d94dcd 100644 --- a/lib/Conversion/XeGPUToVC/LSCPatterns.cpp +++ b/lib/Conversion/XeGPUToVC/LSCPatterns.cpp @@ -177,11 +177,11 @@ static LogicalResult isValidScatterSetup(Type elemTy, int simd_lanes, // or the data for store. It is not used for prefetch. prefetch on slm is not // available. static std::string getLSCIntrinsicStr(llvm::StringRef opName, int simd_lanes, - xegpu::MemoryScope memoryScope, + xegpu::MemorySpace MemorySpace, llvm::StringRef dataTyStr = "") { - auto kind = memoryScope == xegpu::MemoryScope::SLM ? "slm" : "stateless"; + auto kind = MemorySpace == xegpu::MemorySpace::SLM ? "slm" : "stateless"; // using 32bit for slm and 64bit for stateless - auto addrBits = memoryScope == xegpu::MemoryScope::SLM ? 32 : 64; + auto addrBits = MemorySpace == xegpu::MemorySpace::SLM ? 
32 : 64; auto predTyStr = llvm::formatv("v{0}i1", simd_lanes).str(); auto offsetTyStr = llvm::formatv("v{0}i{1}", simd_lanes, addrBits).str(); if (opName == "load") { @@ -443,7 +443,7 @@ static Value genLoadIntrinsicCallWithC32BConversion( ConversionPatternRewriter &rewriter, Location &loc, VectorType resultTy, int simd_lanes, Value pred, std::optional l1, std::optional l3, Type elemTy, int chunkSize, - xegpu::MemoryScope scope, Value addresses) { + xegpu::MemorySpace scope, Value addresses) { // truncate the value from i32Ty to elemTy. auto truncate = [&](Value value, Type elemTy, @@ -486,7 +486,7 @@ static Value gen1DLoadInstrinsicCall(ConversionPatternRewriter &rewriter, std::optional l1, std::optional l3, Type elemTy, int elems, - xegpu::MemoryScope scope, Value payload) { + xegpu::MemorySpace scope, Value payload) { const int simd_lanes = 1; auto pred = dense_vector_int_val(1, i1Ty, simd_lanes); auto bitWidth = elemTy.getIntOrFloatBitWidth(); @@ -512,9 +512,9 @@ static func::CallOp genPrefetchIntrinsicCall(ConversionPatternRewriter &rewriter, Location &loc, int simd_lanes, std::optional l1, std::optional l3, Type elemTy, - int chunkSize, xegpu::MemoryScope memoryScope, + int chunkSize, xegpu::MemorySpace MemorySpace, Value addresses) { - auto intrinsicStr = getLSCIntrinsicStr("prefetch", simd_lanes, memoryScope); + auto intrinsicStr = getLSCIntrinsicStr("prefetch", simd_lanes, MemorySpace); auto pred = dense_vector_int_val(1, i1Ty, simd_lanes); return genLSCIntrinsicCallWithEncoding( rewriter, loc, intrinsicStr, {} /* null resultType */, pred, LSC_LOAD, l1, @@ -526,12 +526,12 @@ genPrefetchIntrinsicCall(ConversionPatternRewriter &rewriter, Location &loc, static func::CallOp gen1DPrefetchIntrinsicCall( ConversionPatternRewriter &rewriter, Location &loc, std::optional l1, std::optional l3, - Type elemTy, int elems, xegpu::MemoryScope memoryScope, Value payload) { + Type elemTy, int elems, xegpu::MemorySpace MemorySpace, Value payload) { const int simd_lanes = 1; auto bitWidth = elemTy.getIntOrFloatBitWidth(); assert(bitWidth >= 32 && "1D block is only for 32/64-bit data."); return genPrefetchIntrinsicCall(rewriter, loc, simd_lanes, l1, l3, elemTy, - elems, memoryScope, payload); + elems, MemorySpace, payload); } // Generate a call to lsc.store intrinsic, using convert-to-32b conversion @@ -545,7 +545,7 @@ static func::CallOp genStoreIntrinsicCallWithC32BConversion( ConversionPatternRewriter &rewriter, Location &loc, int simd_lanes, Value pred, std::optional l1, std::optional l3, Type elemTy, int chunkSize, - xegpu::MemoryScope scope, Value addresses, Value data) { + xegpu::MemorySpace scope, Value addresses, Value data) { // lsc store only takes 32-bit data as input and save the least 8-bit, // or 16-bit to the memory. 
So we need to extend the data to 32-bit if @@ -597,7 +597,7 @@ static func::CallOp gen1DStoreInstrinsicCall(ConversionPatternRewriter &rewriter, Location &loc, std::optional l1, std::optional l3, Type elemTy, - int elems, xegpu::MemoryScope scope, Value payload, + int elems, xegpu::MemorySpace scope, Value payload, Value data) { auto bitWidth = elemTy.getIntOrFloatBitWidth(); assert(bitWidth >= 32 && "1D block is only for 32/64-bit data."); @@ -729,7 +729,7 @@ class LoadNdPattern : public OpConversionPattern { auto tdescTy = op.getTensorDescType(); auto elemTy = tdescTy.getElementType(); auto rank = tdescTy.getRank(); - auto scope = tdescTy.getMemoryScope(); + auto scope = tdescTy.getMemorySpace(); auto l1hint = op.getL1Hint(); auto l3hint = op.getL3Hint(); @@ -763,11 +763,11 @@ class LoadNdPattern : public OpConversionPattern { auto newValue = gen1DLoadInstrinsicCall( rewriter, loc, op.getType(), l1hint, l3hint, elemTy, elems, - tdescTy.getMemoryScope(), adaptor.getTensorDesc()); + tdescTy.getMemorySpace(), adaptor.getTensorDesc()); rewriter.replaceOp(op, newValue); return success(); } else if (rank == 2) { // 2d.ugm.desc - if (scope != xegpu::MemoryScope::Global) + if (scope != xegpu::MemorySpace::Global) return rewriter.notifyMatchFailure( op, "Only global access supported for block load."); auto payload = adaptor.getTensorDesc(); @@ -800,8 +800,9 @@ class LoadNdPattern : public OpConversionPattern { // keep the clean interface. This part of the logic will be moved out. auto shape = tdescTy.getShape().vec(); shape[1] = shape[1] / factor; - tdescTy = TensorDescType::get(tdescTy.getContext(), shape, elemTy, - tdescTy.getEncoding()); + tdescTy = + TensorDescType::get(tdescTy.getContext(), shape, elemTy, + tdescTy.getEncoding(), /*sg_map*/ nullptr); // update arg7 of the payload auto nblks = tdescTy.getArrayLength(); @@ -862,14 +863,14 @@ class PrefetchNdPattern : public OpConversionPattern { auto tdescTy = op.getTensorDescType(); auto elemTy = tdescTy.getElementType(); auto rank = tdescTy.getRank(); - auto scope = tdescTy.getMemoryScope(); + auto scope = tdescTy.getMemorySpace(); auto l1hint = op.getL1Hint(); auto l3hint = op.getL3Hint(); if (rank == 1) { // for 1D tensor desc, use lsc.load - if (scope == xegpu::MemoryScope::SLM) { + if (scope == xegpu::MemorySpace::SLM) { // no prefetch for slm. 
rewriter.eraseOp(op); return success(); @@ -887,7 +888,7 @@ class PrefetchNdPattern : public OpConversionPattern { rewriter.replaceOp(op, callOp); return success(); } else if (rank == 2) { // 2d.ugm.desc - if (scope != xegpu::MemoryScope::Global) + if (scope != xegpu::MemorySpace::Global) return rewriter.notifyMatchFailure( op, "Only global access supported for block prefetch."); auto callOp = gen2DPrefetchIntrinsicCall( @@ -910,7 +911,7 @@ class StoreNdPattern : public OpConversionPattern { auto tdescTy = op.getTensorDescType(); auto elemTy = tdescTy.getElementType(); auto rank = tdescTy.getRank(); - auto scope = tdescTy.getMemoryScope(); + auto scope = tdescTy.getMemorySpace(); auto l1hint = op.getL1Hint(); auto l3hint = op.getL3Hint(); @@ -943,7 +944,7 @@ class StoreNdPattern : public OpConversionPattern { return success(); } else if (rank == 2) { // store.2d.ugm.desc - if (scope != xegpu::MemoryScope::Global) + if (scope != xegpu::MemorySpace::Global) return rewriter.notifyMatchFailure( op, "Only global access supported for block store."); @@ -996,7 +997,7 @@ class LoadGatherPattern : public OpConversionPattern { auto resultTy = cast(op.getType()); auto newValue = genLoadIntrinsicCallWithC32BConversion( rewriter, loc, resultTy, simd_lanes, op.getMask(), l1hint, l3hint, - elemTy, chunkSize, tdescTy.getMemoryScope(), adaptor.getTensorDesc()); + elemTy, chunkSize, tdescTy.getMemorySpace(), adaptor.getTensorDesc()); rewriter.replaceOp(op, newValue); return success(); @@ -1015,11 +1016,11 @@ class PrefetchPattern : public OpConversionPattern { auto elemTy = tdescTy.getElementType(); auto chunkSize = tdescTy.getChunkSize(); auto simd_lanes = tdescTy.getShape()[0]; - auto scope = tdescTy.getMemoryScope(); + auto scope = tdescTy.getMemorySpace(); // For SLM, there is not prefetch available, we will simply // remove the prefetch op. - if (scope == xegpu::MemoryScope::SLM) { + if (scope == xegpu::MemorySpace::SLM) { rewriter.eraseOp(op); return success(); } @@ -1080,7 +1081,7 @@ class StoreScatterPattern : public OpConversionPattern { auto l3hint = op.getL3Hint(); auto callOp = genStoreIntrinsicCallWithC32BConversion( rewriter, loc, simd_lanes, op.getMask(), l1hint, l3hint, elemTy, - chunkSize, tdescTy.getMemoryScope(), adaptor.getTensorDesc(), + chunkSize, tdescTy.getMemorySpace(), adaptor.getTensorDesc(), adaptor.getValue()); rewriter.replaceOp(op, callOp); @@ -1204,10 +1205,10 @@ class FencePattern : public OpConversionPattern { fence_scope = lscFenceScope::GROUP; switch (op.getMemoryKind()) { - case xegpu::MemoryScope::Global: + case xegpu::MemorySpace::Global: sfid = lscSFID::UGM; break; - case xegpu::MemoryScope::SLM: + case xegpu::MemorySpace::SLM: sfid = lscSFID::TGM; break; } diff --git a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp index 3d6858464..b97010633 100644 --- a/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp +++ b/lib/Conversion/XeGPUToVC/XeGPUToVC.cpp @@ -227,13 +227,13 @@ class CreateNdDescPattern : public OpConversionPattern { ConversionPatternRewriter &rewriter) const override { auto loc = op.getLoc(); auto tdescTy = op.getType(); - auto scope = tdescTy.getMemoryScope(); + auto scope = tdescTy.getMemorySpace(); auto rank = tdescTy.getRank(); auto elemBytes = tdescTy.getElementType().getIntOrFloatBitWidth() / 8; // SLM has to use 32-bit address, while ugm needs to use 64-bit address. auto addrTy = - (scope == xegpu::MemoryScope::SLM) ? (Type)i32Ty : (Type)i64Ty; + (scope == xegpu::MemorySpace::SLM) ? 
(Type)i32Ty : (Type)i64Ty; // Handle different source types: memref and i64/i32/ui64/ui32 auto memRefType = dyn_cast(op.getSource().getType()); @@ -249,7 +249,7 @@ class CreateNdDescPattern : public OpConversionPattern { base = adjustBasePointer(rewriter, op, base); base = rewriter.create(loc, addrTy, base); - if (scope == xegpu::MemoryScope::SLM || rank == 1) { + if (scope == xegpu::MemorySpace::SLM || rank == 1) { // for SLM and 1D, we need to create message for use regular load/store // instead of matrix descriptor, the shape of accepted TensorDescs are // limited to 1xN (rank = 2 with leading dimension to be 1) or N (rank = @@ -369,14 +369,14 @@ class UpdateNDOffsetPattern : public OpConversionPattern { auto loc = op.getLoc(); auto tdescTy = op.getTensorDescType(); - auto scope = tdescTy.getMemoryScope(); + auto scope = tdescTy.getMemorySpace(); auto rank = tdescTy.getRank(); auto addrTy = - (scope == xegpu::MemoryScope::SLM) ? (Type)i32Ty : (Type)i64Ty; + (scope == xegpu::MemorySpace::SLM) ? (Type)i32Ty : (Type)i64Ty; auto desc = adaptor.getTensorDesc(); - if (scope == xegpu::MemoryScope::SLM || rank == 1) { + if (scope == xegpu::MemorySpace::SLM || rank == 1) { // for SLM and 1D, we need to create message for use regular load/store // instead of matrix descriptor @@ -435,20 +435,6 @@ class UpdateNDOffsetPattern : public OpConversionPattern { } }; -// converts an array of OpFoldResult into a vector of index. -static Value convertToIndexVector(llvm::ArrayRef ofrs, - Location loc, - ConversionPatternRewriter &rewriter) { - SmallVector array; - for (auto ofr : ofrs) { - auto value = getValueOrConstantOp(ofr, loc, rewriter, indexTy); - assert(value.getType().isIndex() && "expecting an index type value."); - array.push_back(value); - } - return rewriter.create( - loc, vecTy(ofrs.size(), indexTy), ValueRange(array)); -} - class CreateDescPattern : public OpConversionPattern { public: using OpConversionPattern::OpConversionPattern; @@ -462,8 +448,8 @@ class CreateDescPattern : public OpConversionPattern { assert(elemTy.isIntOrFloat() && "only support int or float element type."); // use 32-bit address for SLM and 64-bit address for UGM - auto scope = tdescTy.getMemoryScope(); - auto addrTy = scope == xegpu::MemoryScope::SLM ? (Type)i32Ty : (Type)i64Ty; + auto scope = tdescTy.getMemorySpace(); + auto addrTy = scope == xegpu::MemorySpace::SLM ? (Type)i32Ty : (Type)i64Ty; Value base = rewriter.create( loc, adaptor.getSource()); @@ -478,8 +464,7 @@ class CreateDescPattern : public OpConversionPattern { // offset is represented in number of elements, need to scale it to bytes auto elemBytes = elemTy.getIntOrFloatBitWidth() / 8; auto factor = dense_vector_int_val(elemBytes, addrTy, simd_lanes); - Value offsets = convertToIndexVector(op.getMixedOffsets(), loc, rewriter); - offsets = castValueTo(offsets, payloadTy, loc, rewriter); + Value offsets = castValueTo(adaptor.getOffsets(), payloadTy, loc, rewriter); offsets = muli(factor, offsets); // create a payload with the base address broadcasted to all simd lanes @@ -506,16 +491,15 @@ class UpdateOffsetOpPattern : public OpConversionPattern { assert(elemTy.isIntOrFloat() && "only support int or float element type."); // use 32-bit address for SLM and 64-bit address for UGM - auto scope = tdescTy.getMemoryScope(); - auto addrTy = scope == xegpu::MemoryScope::SLM ? (Type)i32Ty : (Type)i64Ty; + auto scope = tdescTy.getMemorySpace(); + auto addrTy = scope == xegpu::MemorySpace::SLM ? 
(Type)i32Ty : (Type)i64Ty; auto simd_lanes = tdescTy.getShape()[0]; auto payloadTy = VectorType::get(simd_lanes, addrTy); auto elemBytes = elemTy.getIntOrFloatBitWidth() / 8; Value factor = dense_vector_int_val(elemBytes, addrTy, simd_lanes); - Value offsets = convertToIndexVector(op.getMixedOffsets(), loc, rewriter); - offsets = castValueTo(offsets, payloadTy, loc, rewriter); + Value offsets = castValueTo(adaptor.getOffsets(), payloadTy, loc, rewriter); offsets = muli(factor, offsets); auto payload = addi(adaptor.getTensorDesc(), offsets); @@ -917,14 +901,14 @@ struct XeGPUToVCPass : public imex::impl::ConvertXeGPUToVCBase { typeConverter.addConversion([&](IndexType type) { return type; }); typeConverter.addConversion([&](xegpu::TensorDescType type) -> Type { - auto scope = type.getMemoryScope(); + auto scope = type.getMemorySpace(); auto rank = type.getRank(); auto i32Type = IntegerType::get(&getContext(), 32); auto i64Type = IntegerType::get(&getContext(), 64); - if (type.isScattered() || rank == 1 || scope == xegpu::MemoryScope::SLM) { + if (type.isScattered() || rank == 1 || scope == xegpu::MemorySpace::SLM) { auto addrTy = - scope == xegpu::MemoryScope::SLM ? (Type)i32Type : (Type)i64Type; + scope == xegpu::MemorySpace::SLM ? (Type)i32Type : (Type)i64Type; auto simd_lanes = type.isScattered() ? type.getShape()[0] : 1; return VectorType::get(simd_lanes, addrTy); } else if (rank == 2) { diff --git a/lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp index 31cffb595..1102a8b64 100644 --- a/lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp +++ b/lib/Conversion/XeTileToXeGPU/ArithOpConversion.cpp @@ -311,8 +311,7 @@ class SgVectorMultiDimReductionOpPattern rewriter.setInsertionPoint(op); // doing reduction on outer dimension - if (mlir::isConstantIntValue(dims[0], 0) && - mlir::isConstantIntValue(dims[1], 2)) { + if (dims[0] == 0 && dims[1] == 2) { auto intermediates = lowerOuterReduction(sources, shape, op.getKind(), loc, elemTy, rewriter); { @@ -330,8 +329,7 @@ class SgVectorMultiDimReductionOpPattern } // doing reduction on inner dimension - if (mlir::isConstantIntValue(dims[0], 1) && - mlir::isConstantIntValue(dims[1], 3)) { + if (dims[0] == 1 && dims[1] == 3) { auto intermediates = lowerInnerReductionWithIntraVectorShuffles( sources, shape, op.getKind(), loc, elemTy, rewriter); diff --git a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp index bd9201d10..60033b870 100644 --- a/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp +++ b/lib/Conversion/XeTileToXeGPU/XeTileOpConversion.cpp @@ -407,9 +407,9 @@ class SgInitTileOpPattern : public XeOneToNConversion { auto shape = llvm::to_vector(tileTy.getShape()); auto indexType = rewriter.getIndexType(); - auto memoryScope = op.getSourceMemorySpaceAsInt() == 3 - ? mlir::xegpu::MemoryScope::SLM - : mlir::xegpu::MemoryScope::Global; + auto MemorySpace = op.getSourceMemorySpaceAsInt() == 3 + ? 
mlir::xegpu::MemorySpace::SLM + : mlir::xegpu::MemorySpace::Global; if (tileTy.getRank() != 2) return op.emitOpError("The tile shape should be 2D."); @@ -454,7 +454,7 @@ class SgInitTileOpPattern : public XeOneToNConversion { auto offsetsX = offsets.pop_back_val(); auto tDescTy = mlir::xegpu::TensorDescType::get( - innerBlk, elemTy, array_length, true /*boundary_check*/, memoryScope); + innerBlk, elemTy, array_length, true /*boundary_check*/, MemorySpace); auto createIndexConstant = [&](mlir::Type type, int64_t value) { auto attr = rewriter.getIndexAttr(value); diff --git a/lib/Dialect/XeTile/IR/XeTileDialect.cpp b/lib/Dialect/XeTile/IR/XeTileDialect.cpp index a2afb71e3..0813ea315 100644 --- a/lib/Dialect/XeTile/IR/XeTileDialect.cpp +++ b/lib/Dialect/XeTile/IR/XeTileDialect.cpp @@ -116,7 +116,7 @@ mlir::LogicalResult XeTileAttr::verify( ::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError, ::imex::xetile::SubGroupMapAttr sg_map, xetile::WorkGroupMapAttr wg_map, mlir::DenseI32ArrayAttr order, mlir::DenseI64ArrayAttr inner_blocks, - mlir::Attribute memoryScope) { + mlir::Attribute MemorySpace) { if (order != mlir::DenseI32ArrayAttr() && order.size() != 2) emitError() << "expect integer array of size 2 for order"; diff --git a/lib/Dialect/XeTile/IR/XeTileOps.cpp b/lib/Dialect/XeTile/IR/XeTileOps.cpp index f51249f49..c9767a9dc 100644 --- a/lib/Dialect/XeTile/IR/XeTileOps.cpp +++ b/lib/Dialect/XeTile/IR/XeTileOps.cpp @@ -129,7 +129,7 @@ mlir::LogicalResult InitTileOp::verify() { auto tileTy = getType(); // Check for memory space validity. if (getSourceMemorySpaceAsInt() != - static_cast(tileTy.getMemoryScopeAsInt())) + static_cast(tileTy.getMemorySpaceAsInt())) return emitOpError( "memory space of the tile doesn't match with the source."); diff --git a/lib/Dialect/XeTile/Transforms/BlockAligning.cpp b/lib/Dialect/XeTile/Transforms/BlockAligning.cpp index 3b1c902a4..b579e7772 100644 --- a/lib/Dialect/XeTile/Transforms/BlockAligning.cpp +++ b/lib/Dialect/XeTile/Transforms/BlockAligning.cpp @@ -203,7 +203,7 @@ struct InitTileOpPattern auto attr = imex::xetile::XeTileAttr::get( op.getContext(), tileTy.getSgMap(), tileTy.getWgMap(), - tileTy.getOrder(), newBlockSize, tileTy.getMemoryScope()); + tileTy.getOrder(), newBlockSize, tileTy.getMemorySpace()); auto newTileTy = imex::xetile::TileType::get(tileTy.getShape(), tileTy.getElementType(), attr); diff --git a/lib/Dialect/XeTile/Transforms/Blocking.cpp b/lib/Dialect/XeTile/Transforms/Blocking.cpp index 83f82c4a3..02f8584e8 100644 --- a/lib/Dialect/XeTile/Transforms/Blocking.cpp +++ b/lib/Dialect/XeTile/Transforms/Blocking.cpp @@ -548,8 +548,8 @@ struct VectorMultiDimReductionOpPattern // will be transformed to // multi_reduction, %e, %a[1, 3]: vector<16x2x1x16xf16> to // vector<16x1xf16> - auto dim = mlir::cast(reductionDims[0]).getInt(); - auto newReductionDims = rewriter.getI64ArrayAttr({dim, dim + 2}); + auto dim = reductionDims[0]; + auto newReductionDims = rewriter.getDenseI64ArrayAttr({dim, dim + 2}); auto newDestShape = (dim == 0) @@ -955,7 +955,7 @@ struct InitTileOpPattern auto attr = imex::xetile::XeTileAttr::get( op.getContext(), tileTy.getSgMap(), tileTy.getWgMap(), - tileTy.getOrder(), innerBlocks, tileTy.getMemoryScope()); + tileTy.getOrder(), innerBlocks, tileTy.getMemorySpace()); auto newTileTy = imex::xetile::TileType::get(tileTy.getShape(), elemTy, attr); diff --git a/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp b/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp index 9f575df8e..2005f185c 100644 --- 
a/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp +++ b/lib/Dialect/XeTile/Transforms/BlockingAnalysis.cpp @@ -213,9 +213,10 @@ class BlockingAnalysisImpl std::shared_ptr uArch) : SparseBackwardDataFlowAnalysis(solver, symbolTable), uArch(uArch) {} - void visitOperation(mlir::Operation *op, - mlir::ArrayRef operands, - mlir::ArrayRef results) override; + mlir::LogicalResult + visitOperation(mlir::Operation *op, + mlir::ArrayRef operands, + mlir::ArrayRef results) override; void visitBranchOperand(mlir::OpOperand &operand) override {} @@ -283,7 +284,7 @@ class BlockingAnalysisImpl std::shared_ptr uArch = nullptr; }; -void BlockingAnalysisImpl::visitOperation( +mlir::LogicalResult BlockingAnalysisImpl::visitOperation( mlir::Operation *op, mlir::ArrayRef operands, mlir::ArrayRef results) { @@ -319,6 +320,8 @@ void BlockingAnalysisImpl::visitOperation( if (auto createMaskOp = mlir::dyn_cast(op)) visitCreateMaskOp(createMaskOp, operands, results); + + return mlir::success(); } void BlockingAnalysisImpl::visitPrefetchTileOp( @@ -327,7 +330,7 @@ void BlockingAnalysisImpl::visitPrefetchTileOp( auto tileTy = op.getTile().getType(); auto elemTy = tileTy.getElementType(); auto shape = tileTy.getShape(); - auto memSpace = tileTy.getMemoryScopeAsInt(); + auto memSpace = tileTy.getMemorySpaceAsInt(); // initialized with a default size queried from the architecture auto size = getInnerBlockSize(op, elemTy, shape, memSpace); if (!size) @@ -348,7 +351,7 @@ void BlockingAnalysisImpl::visitLoadTileOp( auto elemTy = tileTy.getElementType(); auto bitWidth = elemTy.getIntOrFloatBitWidth(); auto shape = tileTy.getShape(); - auto memSpace = tileTy.getMemoryScopeAsInt(); + auto memSpace = tileTy.getMemorySpaceAsInt(); // initialized with a default size queried from the architecture Block block = getInnerBlockSize(op, elemTy, shape, memSpace); @@ -387,7 +390,7 @@ void BlockingAnalysisImpl::visitStoreTileOp( auto tileTy = op.getTile().getType(); auto elemTy = tileTy.getElementType(); auto shape = tileTy.getShape(); - auto memSpace = tileTy.getMemoryScopeAsInt(); + auto memSpace = tileTy.getMemorySpaceAsInt(); auto size = getInnerBlockSize(op, elemTy, shape, memSpace); if (!size) diff --git a/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp b/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp index a988583d3..878987bdc 100644 --- a/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp +++ b/lib/Dialect/XeTile/Transforms/BlockingRewrite.cpp @@ -213,7 +213,7 @@ struct InitTileOpPattern auto attr = imex::xetile::XeTileAttr::get( op.getContext(), tileTy.getSgMap(), tileTy.getWgMap(), - tileTy.getOrder(), innerBlockAttr, tileTy.getMemoryScope()); + tileTy.getOrder(), innerBlockAttr, tileTy.getMemorySpace()); auto elemTy = tileTy.getElementType(); auto newTileTy = imex::xetile::TileType::get(shape, elemTy, attr); diff --git a/lib/Dialect/XeTile/Transforms/Canonicalization.cpp b/lib/Dialect/XeTile/Transforms/Canonicalization.cpp index 53c7769f4..8cc02340a 100644 --- a/lib/Dialect/XeTile/Transforms/Canonicalization.cpp +++ b/lib/Dialect/XeTile/Transforms/Canonicalization.cpp @@ -301,16 +301,14 @@ struct VectorMultiReductionToXeTileReduce return mlir::failure(); // If result is not 1D, we can not convert it to xetile.reduce. This // requires that the reduction dimensions has rank 1. - auto reductionDims = op.getReductionDims().getValue(); + auto reductionDims = op.getReductionDims(); if (reductionDims.size() != 1) return mlir::failure(); // Retain discardable attributes if any. 
llvm::SmallVector discardableAttrs( op->getDiscardableAttrs().begin(), op->getDiscardableAttrs().end()); // Create an equivalent XeTileReduceOp - int64_t reduceDim = llvm::cast(reductionDims[0]) - .getValue() - .getSExtValue(); + int64_t reduceDim = reductionDims[0]; auto resultTy = llvm::cast(op.getType()); auto xetileResultTy = mlir::VectorType::get( (reduceDim == 0 ? llvm::ArrayRef({1, resultTy.getDimSize(0)}) @@ -410,7 +408,7 @@ struct XeTileCanonicalizationPass final auto newAttr = imex::xetile::XeTileAttr::get( tileTy.getContext(), tileTy.getSgMap(), tileTy.getWgMap(), mlir::DenseI32ArrayAttr::get(tileTy.getContext(), {1, 0}), - tileTy.getInnerBlocks(), tileTy.getMemoryScope()); + tileTy.getInnerBlocks(), tileTy.getMemorySpace()); return imex::xetile::TileType::get( swapLastTwoElems(tileTy.getShape()), tileTy.getElementType(), diff --git a/lib/Transforms/OptimizeTranspose.cpp b/lib/Transforms/OptimizeTranspose.cpp index 9033786d8..7a150f48e 100644 --- a/lib/Transforms/OptimizeTranspose.cpp +++ b/lib/Transforms/OptimizeTranspose.cpp @@ -128,8 +128,8 @@ struct LoadTransposeAnalysis { transposeAttr.asArrayRef() == llvm::ArrayRef{1, 0}) return mlir::WalkResult::skip(); // Memory space of the load op must be global. - if (loadOp.getTensorDesc().getType().getMemoryScope() != - mlir::xegpu::MemoryScope::Global) + if (loadOp.getTensorDesc().getType().getMemorySpace() != + mlir::xegpu::MemorySpace::Global) return mlir::WalkResult::skip(); // Single user must be a transpose op. auto transposeOp = llvm::dyn_cast_if_present( diff --git a/lib/Transforms/PropagatePackedLayout.cpp b/lib/Transforms/PropagatePackedLayout.cpp index c872ba865..2f220be88 100644 --- a/lib/Transforms/PropagatePackedLayout.cpp +++ b/lib/Transforms/PropagatePackedLayout.cpp @@ -160,9 +160,9 @@ class LayoutAnalysisImpl public: using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis; - void visitOperation(mlir::Operation *op, - mlir::ArrayRef operands, - mlir::ArrayRef results) override { + mlir::LogicalResult + visitOperation(mlir::Operation *op, mlir::ArrayRef operands, + mlir::ArrayRef results) override { if (mlir::OpTrait::hasElementwiseMappableTraits(op)) { Layout layout; for (auto &&[res, resLattice] : @@ -182,7 +182,7 @@ class LayoutAnalysisImpl propagateIfChanged(argLattice, argLattice->meet(tmpLayout)); } - return; + return mlir::success(); } if (auto dpas = mlir::dyn_cast(op)) { @@ -193,12 +193,14 @@ class LayoutAnalysisImpl propagateIfChanged(operand, operand->meet(std::nullopt)); } } - return; + return mlir::success(); } // Unknown ops: mark all args as invalid layout (no layout change). 
for (auto operand : operands) propagateIfChanged(operand, operand->meet(std::nullopt)); + + return mlir::success(); } void visitBranchOperand(mlir::OpOperand &operand) override {} diff --git a/lib/Transforms/VnniTransformation.cpp b/lib/Transforms/VnniTransformation.cpp index 36f2a7f8f..bb5ee5ae2 100644 --- a/lib/Transforms/VnniTransformation.cpp +++ b/lib/Transforms/VnniTransformation.cpp @@ -127,9 +127,9 @@ class LayoutAnalysisImpl public: using SparseBackwardDataFlowAnalysis::SparseBackwardDataFlowAnalysis; - void visitOperation(mlir::Operation *op, - mlir::ArrayRef operands, - mlir::ArrayRef results) override { + mlir::LogicalResult + visitOperation(mlir::Operation *op, mlir::ArrayRef operands, + mlir::ArrayRef results) override { // the B operand of a dpas operation is always in vnni layout // and it is the start point of the layout propagation if (auto dpas = mlir::dyn_cast(op)) { @@ -144,7 +144,7 @@ class LayoutAnalysisImpl // for C operand, it cannot be in vnni format propagateIfChanged(operands[2], operands[2]->meet(Layout(false))); } - return; + return mlir::success(); } if (mlir::OpTrait::hasElementwiseMappableTraits(op)) { @@ -175,7 +175,7 @@ class LayoutAnalysisImpl for (auto &&lattice : operands) propagateIfChanged(lattice, lattice->meet(layout)); } - return; + return mlir::success(); } if (auto extractStrideSliceOp = @@ -186,7 +186,7 @@ class LayoutAnalysisImpl layout = Layout::meet(layout, Layout(isVNNIApplicable(srcTy))); propagateIfChanged(operands[0], operands[0]->meet(layout)); } - return; + return mlir::success(); } if (auto extractOp = mlir::dyn_cast(op)) { @@ -201,12 +201,14 @@ class LayoutAnalysisImpl layout = Layout::meet(layout, Layout(isVNNIApplicable(vecTy))); propagateIfChanged(operands[0], operands[0]->meet(layout)); } - return; + return mlir::success(); } // Unknown ops: mark all args as non-vnni layout (no layout change). 
for (auto operand : operands) propagateIfChanged(operand, operand->join(Layout(false))); + + return mlir::success(); } void visitBranchOperand(mlir::OpOperand &operand) override {} diff --git a/lib/Utils/XeArch.cpp b/lib/Utils/XeArch.cpp index c9f3657cf..649faaa3a 100644 --- a/lib/Utils/XeArch.cpp +++ b/lib/Utils/XeArch.cpp @@ -304,7 +304,7 @@ mlir::LogicalResult XeuArchInterface::isLegalLoad2dOp(mlir::Operation *op) { auto tdescTy = loadOp.getTensorDescType(); // TODO: need more thinking on SLM - if (tdescTy.getMemoryScope() == mlir::xegpu::MemoryScope::SLM) + if (tdescTy.getMemorySpace() == mlir::xegpu::MemorySpace::SLM) return mlir::success(); int elementSize = loadOp.getTensorDescType().getElementTypeBitWidth(); @@ -347,7 +347,7 @@ mlir::LogicalResult XeuArchInterface::isLegalStore2dOp(mlir::Operation *op) { int elementSize = tdescTy.getElementTypeBitWidth(); // TODO: need more thinking on SLM - if (tdescTy.getMemoryScope() == mlir::xegpu::MemoryScope::SLM) + if (tdescTy.getMemorySpace() == mlir::xegpu::MemorySpace::SLM) return mlir::success(); LoadStore2DConfig storeParams; diff --git a/test/Conversion/GPUToSPIRV/printf.mlir b/test/Conversion/GPUToSPIRV/printf.mlir index 5c0255cbe..206b156da 100644 --- a/test/Conversion/GPUToSPIRV/printf.mlir +++ b/test/Conversion/GPUToSPIRV/printf.mlir @@ -47,7 +47,7 @@ module @test attributes { %2 = gpu.thread_id x // CHECK: [[FMTSTR_ADDR:%.*]] = spirv.mlir.addressof [[PRINTMSG]] : !spirv.ptr, UniformConstant> // CHECK-NEXT: [[FMTSTR_PTR:%.*]] = spirv.Bitcast [[FMTSTR_ADDR]] : !spirv.ptr, UniformConstant> to !spirv.ptr - // CHECK-NEXT: {{%.*}} = spirv.CL.printf [[FMTSTR_PTR]] : !spirv.ptr({{%.*}}, {{%.*}}, {{%.*}} : i32, f32, i64) -> i32 + // CHECK-NEXT: {{%.*}} = spirv.CL.printf [[FMTSTR_PTR]] {{%.*}}, {{%.*}}, {{%.*}} : !spirv.ptr, i32, f32, i64 -> i32 gpu.printf "\nHello, world : %d %f \n Thread id: %d\n" %arg0, %arg1, %2: i32, f32, index gpu.return } diff --git a/test/Conversion/XeGPUToVC/atomiclsc.mlir b/test/Conversion/XeGPUToVC/atomiclsc.mlir index 8402e53b1..353e94860 100644 --- a/test/Conversion/XeGPUToVC/atomiclsc.mlir +++ b/test/Conversion/XeGPUToVC/atomiclsc.mlir @@ -4,46 +4,30 @@ module @gemm attributes {gpu.container_module} { gpu.module @test_kernel { // CHECK: func.func private @llvm.genx.lsc.xatomic.stateless.v16i32.v16i1.v16i64(vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, vector<16xi1>, vector<16xi64>, vector<16xi32>, vector<16xi32>, i32, vector<16xi32>) -> vector<16xi32> attributes {VectorComputeFunctionINTEL, linkage_attributes = #spirv.linkage_attributes>} gpu.func @test_atomiclsc(%arg0: memref<128xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + // CHECK: %[[cst:.*]] = arith.constant dense : vector<16xi1> + %mask = arith.constant dense : vector<16xi1> + + // CHECK: %[[cst_0:.*]] = arith.constant dense<5.000000e-01> : vector<16xf32> + %1 = arith.constant dense<0.5> : vector<16xf32> - //CHECK: %[[cst:.*]] = arith.constant dense : vector<16xi1> - //CHECK: %[[cst_0:.*]] = arith.constant dense<5.000000e-01> : vector<16xf32> //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %{{.*}} : memref<128xf32> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[intptr]] : index to i64 - //CHECK: %[[cst_1:.*]] = arith.constant dense<4> : vector<16xi64> - //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[c1:.*]] = arith.constant 1 : index - //CHECK: %[[c2:.*]] = arith.constant 2 : index - //CHECK: %[[c3:.*]] = arith.constant 3 : index - //CHECK: %[[c4:.*]] = 
arith.constant 4 : index - //CHECK: %[[c5:.*]] = arith.constant 5 : index - //CHECK: %[[c6:.*]] = arith.constant 6 : index - //CHECK: %[[c7:.*]] = arith.constant 7 : index - //CHECK: %[[c8:.*]] = arith.constant 8 : index - //CHECK: %[[c9:.*]] = arith.constant 9 : index - //CHECK: %[[c10:.*]] = arith.constant 10 : index - //CHECK: %[[c11:.*]] = arith.constant 11 : index - //CHECK: %[[c12:.*]] = arith.constant 12 : index - //CHECK: %[[c13:.*]] = arith.constant 13 : index - //CHECK: %[[c14:.*]] = arith.constant 14 : index - //CHECK: %[[c15:.*]] = arith.constant 15 : index - //CHECK: %[[r1:.*]] = vector.from_elements %[[c0]], %[[c1]], %[[c2]], %[[c3]], %[[c4]], %[[c5]], %[[c6]], %[[c7]], %[[c8]], %[[c9]], %[[c10]], %[[c11]], %[[c12]], %[[c13]], %[[c14]], %[[c15]] : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_1]] : vector<16xi64> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> - //CHECK: %[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi64> + //CHECK: %[[cst_1:.*]] = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xi64> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> + //CHECK: %[[r2:.*]] = arith.addi %[[r1]], %[[cst_1]] : vector<16xi64> //CHECK: %[[c19_i8:.*]] = arith.constant 19 : i8 //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 //CHECK: %[[c1_i16:.*]] = arith.constant 1 : i16 //CHECK: %[[c0_i32:.*]] = arith.constant 0 : i32 //CHECK: %[[c3_i8:.*]] = arith.constant 3 : i8 //CHECK: %[[cst_2:.*]] = arith.constant dense<0> : vector<16xi32> - //CHECK: %[[r6:.*]] = vector.bitcast %[[cst_0]] : vector<16xf32> to vector<16xi32> - //CHECK: %[[r7:.*]] = func.call @llvm.genx.lsc.xatomic.stateless.v16i32.v16i1.v16i64(%[[cst]], %[[c19_i8]], %[[c1_i8]], %[[c1_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[cst]], %[[r5]], %[[r6]], %[[cst_2]], %[[c0_i32]], %[[cst_2]]) : (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, vector<16xi1>, vector<16xi64>, vector<16xi32>, vector<16xi32>, i32, vector<16xi32>) -> vector<16xi32> - - %mask = arith.constant dense : vector<16xi1> - %1 = arith.constant dense<0.5> : vector<16xf32> + //CHECK: %[[r3:.*]] = vector.bitcast %[[cst_0]] : vector<16xf32> to vector<16xi32> + //CHECK: %[[r4:.*]] = func.call @llvm.genx.lsc.xatomic.stateless.v16i32.v16i1.v16i64( + //CHECK-SAME: %[[cst]], %[[c19_i8]], %[[c1_i8]], %[[c1_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], + //CHECK-SAME: %[[c1_i8]], %[[c1_i8]], %[[cst]], %[[r2]], %[[r3]], %[[cst_2]], %[[c0_i32]], %[[cst_2]]) + //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, vector<16xi1>, vector<16xi64>, vector<16xi32>, vector<16xi32>, i32, vector<16xi32>) -> vector<16xi32> %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> - %2 = xegpu.create_tdesc %arg0[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<128xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %2 = xegpu.create_tdesc %arg0, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %3 = xegpu.atomic_rmw "addf" %2, %mask, %1 : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32> gpu.return } diff --git a/test/Conversion/XeGPUToVC/load_global_no_chunk_f16.mlir b/test/Conversion/XeGPUToVC/load_global_no_chunk_f16.mlir index c1f3de856..10a9a7f77 100644 --- 
a/test/Conversion/XeGPUToVC/load_global_no_chunk_f16.mlir +++ b/test/Conversion/XeGPUToVC/load_global_no_chunk_f16.mlir @@ -1,6 +1,6 @@ // RUN: imex-opt -convert-xegpu-to-vc -cse %s | FileCheck %s -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr gpu.module @test_kernel { //CHECK: gpu.func @test_copy(%[[arg0:.*]]: memref<16xf16>, %[[arg1:.*]]: memref<16xf16>) kernel @@ -9,39 +9,38 @@ gpu.module @test_kernel { //CHECK: %[[mask:.*]] = arith.constant dense : vector<16xi1> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> + //CHECK: %[[a_ptr:.*]] = memref.extract_aligned_pointer_as_index %[[arg0]] : memref<16xf16> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[a_ptr]] : index to i64 - //CHECK: %[[cst_0:.*]] = arith.constant dense<2> : vector<16xi64> - //CHECK: %[[r1:.*]] = vector.from_elements {{.*}} : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_0]] : vector<16xi64> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> - //CHECK: %[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi64> - %a_tdesc = xegpu.create_tdesc %a[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + //CHECK: %[[cst_0:.*]] = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]> : vector<16xi64> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> + //CHECK: %[[r2:.*]] = arith.addi %[[r1]], %[[cst_0]] : vector<16xi64> + %a_tdesc = xegpu.create_tdesc %a, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> //CHECK: %[[c0_i8:.*]] = arith.constant 0 : i8 //CHECK: %[[c1_i16:.*]] = arith.constant 1 : i16 //CHECK: %[[c0_i32:.*]] = arith.constant 0 : i32 //CHECK: %[[c6_i8:.*]] = arith.constant 6 : i8 //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 - //CHECK: %[[r6:.*]] = func.call @llvm.genx.lsc.load.stateless.v16i32.v16i1.v16i64 - //CHECK-SAME: (%[[mask]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[c0_i32]]) + //CHECK: %[[r3:.*]] = func.call @llvm.genx.lsc.load.stateless.v16i32.v16i1.v16i64 + //CHECK-SAME: (%[[mask]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, i32) -> vector<16xi32> - //CHECK: %[[r7:.*]] = arith.trunci %[[r6]] : vector<16xi32> to vector<16xi16> - //CHECK: %[[r8:.*]] = vector.bitcast %[[r7]] : vector<16xi16> to vector<16xf16> + //CHECK: %[[r4:.*]] = arith.trunci %[[r3]] : vector<16xi32> to vector<16xi16> + //CHECK: %[[r5:.*]] = vector.bitcast %[[r4]] : vector<16xi16> to vector<16xf16> %data = xegpu.load %a_tdesc, %mask : !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> -> vector<16xf16> //CHECK: %[[b_ptr:.*]] = memref.extract_aligned_pointer_as_index %[[arg1]] : memref<16xf16> -> index - //CHECK: %[[r9:.*]] = arith.index_castui %[[b_ptr]] : index to i64 - //CHECK: %[[r10:.*]] = vector.broadcast %[[r9]] : i64 to vector<16xi64> - //CHECK: %[[r11:.*]] = arith.addi %[[r10]], %[[r3]] : vector<16xi64> - %b_tdesc = xegpu.create_tdesc %b[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + //CHECK: %[[r6:.*]] = 
arith.index_castui %[[b_ptr]] : index to i64 + //CHECK: %[[r7:.*]] = vector.broadcast %[[r6]] : i64 to vector<16xi64> + //CHECK: %[[r8:.*]] = arith.addi %[[r7]], %[[cst_0]] : vector<16xi64> + %b_tdesc = xegpu.create_tdesc %b, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> - //CHECK: %[[r12:.*]] = vector.bitcast %[[r8]] : vector<16xf16> to vector<16xi16> - //CHECK: %[[r13:.*]] = arith.extui %[[r12]] : vector<16xi16> to vector<16xi32> + //CHECK: %[[r9:.*]] = vector.bitcast %[[r5]] : vector<16xf16> to vector<16xi16> + //CHECK: %[[r10:.*]] = arith.extui %[[r9]] : vector<16xi16> to vector<16xi32> //CHECK: %[[c4_i8:.*]] = arith.constant 4 : i8 //CHECK: func.call @llvm.genx.lsc.store.stateless.v16i1.v16i64.v16i32 - //CHECK-SAME: (%[[mask]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r11]], %[[r13]], %[[c0_i32]]) + //CHECK-SAME: (%[[mask]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r8]], %[[r10]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, vector<16xi32>, i32) -> () xegpu.store %data, %b_tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> gpu.return diff --git a/test/Conversion/XeGPUToVC/load_global_no_chunk_f32.mlir b/test/Conversion/XeGPUToVC/load_global_no_chunk_f32.mlir index b99c8e289..a7a52f3d4 100644 --- a/test/Conversion/XeGPUToVC/load_global_no_chunk_f32.mlir +++ b/test/Conversion/XeGPUToVC/load_global_no_chunk_f32.mlir @@ -1,7 +1,7 @@ // RUN: imex-opt -convert-xegpu-to-vc -cse %s | FileCheck %s -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr gpu.module @test_kernel { //CHECK: gpu.func @test_copy(%[[arg0:.*]]: memref<16xf32>, %[[arg1:.*]]: memref<16xf32>) kernel @@ -9,37 +9,34 @@ gpu.module @test_kernel { //CHECK: %[[mask:.*]] = arith.constant dense : vector<16xi1> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> //CHECK: %[[a_ptr:.*]] = memref.extract_aligned_pointer_as_index %[[arg0]] : memref<16xf32> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[a_ptr]] : index to i64 - //CHECK: %[[cst_0:.*]] = arith.constant dense<4> : vector<16xi64> - - //CHECK: %[[r1:.*]] = vector.from_elements {{.*}} : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_0]] : vector<16xi64> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> - //CHECK: %[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi64> - %a_tdesc = xegpu.create_tdesc %a[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + //CHECK: %[[cst_0:.*]] = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xi64> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> + //CHECK: %[[r2:.*]] = arith.addi %[[r1]], %[[cst_0]] : vector<16xi64> + %a_tdesc = xegpu.create_tdesc %a, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> //CHECK: %[[c0_i8:.*]] = arith.constant 0 : i8 //CHECK: %[[c1_i16:.*]] = arith.constant 1 : i16 //CHECK: %[[c0_i32:.*]] = arith.constant 0 : i32 //CHECK: %[[c3_i8:.*]] = arith.constant 3 : i8 //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 - //CHECK: %[[r6:.*]] 
= func.call @llvm.genx.lsc.load.stateless.v16f32.v16i1.v16i64 - //CHECK-SAME: (%[[mask]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[c0_i32]]) + //CHECK: %[[r3:.*]] = func.call @llvm.genx.lsc.load.stateless.v16f32.v16i1.v16i64 + //CHECK-SAME: (%[[mask]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, i32) -> vector<16xf32> %data = xegpu.load %a_tdesc, %mask : !xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> -> vector<16xf32> //CHECK: %[[b_ptr:.*]] = memref.extract_aligned_pointer_as_index %[[arg1]] : memref<16xf32> -> index - //CHECK: %[[r7:.*]] = arith.index_castui %[[b_ptr]] : index to i64 - //CHECK: %[[r8:.*]] = vector.broadcast %[[r7]] : i64 to vector<16xi64> - //CHECK: %[[r9:.*]] = arith.addi %[[r8]], %[[r3]] : vector<16xi64> - %b_tdesc = xegpu.create_tdesc %b[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + //CHECK: %[[r4:.*]] = arith.index_castui %[[b_ptr]] : index to i64 + //CHECK: %[[r5:.*]] = vector.broadcast %[[r4]] : i64 to vector<16xi64> + //CHECK: %[[r6:.*]] = arith.addi %[[r5]], %[[cst_0]] : vector<16xi64> + %b_tdesc = xegpu.create_tdesc %b, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> //CHECK: %[[c4_i8:.*]] = arith.constant 4 : i8 //CHECK: func.call @llvm.genx.lsc.store.stateless.v16i1.v16i64.v16f32 - //CHECK-SAME: (%[[mask]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r9]], %[[r6]], %[[c0_i32]]) + //CHECK-SAME: (%[[mask]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r6]], %[[r3]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, vector<16xf32>, i32) -> () xegpu.store %data, %b_tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> gpu.return diff --git a/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f16.mlir b/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f16.mlir index a42062a72..4ad2f4880 100644 --- a/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f16.mlir +++ b/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f16.mlir @@ -1,6 +1,6 @@ // RUN: imex-opt -convert-xegpu-to-vc -cse %s | FileCheck %s -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr gpu.module @test_kernel { //CHECK: gpu.func @test_copy(%[[arg0:.*]]: memref<16xf16>, %[[arg1:.*]]: memref<16xf16>) kernel @@ -8,17 +8,14 @@ gpu.module @test_kernel { //CHECK: %[[cst:.*]] = arith.constant dense : vector<16xi1> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[arg0]] : memref<16xf16> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[intptr]] : index to i64 - //CHECK: %[[cst_0:.*]] = arith.constant dense<2> : vector<16xi64> - - //CHECK: %[[r1:.*]] = vector.from_elements {{.*}} : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_0]] : vector<16xi64> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> - //CHECK: 
%[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi64> - %a_tdesc = xegpu.create_tdesc %a[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + //CHECK: %[[cst_0:.*]] = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]> : vector<16xi64> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> + //CHECK: %[[r2:.*]] = arith.addi %[[r1]], %[[cst_0]] : vector<16xi64> + %a_tdesc = xegpu.create_tdesc %a, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> //CHECK: %[[c0_i8:.*]] = arith.constant 0 : i8 //CHECK: %[[c1_i16:.*]] = arith.constant 1 : i16 @@ -27,12 +24,12 @@ gpu.module @test_kernel { //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 //CHECK: func.call @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64 - //CHECK-SAME: (%[[cst]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[c0_i32]]) + //CHECK-SAME: (%[[cst]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, i32) -> () xegpu.prefetch %a_tdesc : !xegpu.tensor_desc<16xf16, #scatter> %data = xegpu.load %a_tdesc, %mask : !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> -> vector<16xf16> - %b_tdesc = xegpu.create_tdesc %b[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + %b_tdesc = xegpu.create_tdesc %b, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> xegpu.store %data, %b_tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f32.mlir b/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f32.mlir index 64d310ae9..f748b25ca 100644 --- a/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f32.mlir +++ b/test/Conversion/XeGPUToVC/prefetch_global_no_chunk_f32.mlir @@ -1,21 +1,19 @@ // RUN: imex-opt -convert-xegpu-to-vc -cse %s | FileCheck %s -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr gpu.module @test_kernel { gpu.func @test_copy(%a: memref<16xf32>, %b: memref<16xf32>) kernel { //CHECK: %[[cst:.*]] = arith.constant dense : vector<16xi1> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %arg0 : memref<16xf32> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[intptr]] : index to i64 - //CHECK: %[[cst_0:.*]] = arith.constant dense<4> : vector<16xi64> - //CHECK: %[[r1:.*]] = vector.from_elements {{.*}} : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_0]] : vector<16xi64> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> - //CHECK: %[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi64> - %a_tdesc = xegpu.create_tdesc %a[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + //CHECK: %[[cst_0:.*]] = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xi64> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i64 to vector<16xi64> + //CHECK: 
%[[r2:.*]] = arith.addi %[[r1]], %[[cst_0]] : vector<16xi64> + %a_tdesc = xegpu.create_tdesc %a, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> //CHECK: %[[c0_i8:.*]] = arith.constant 0 : i8 //CHECK: %[[c1_i16:.*]] = arith.constant 1 : i16 @@ -24,11 +22,11 @@ gpu.module @test_kernel { //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 //CHECK: func.call @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64 - //CHECK-SAME: (%[[cst]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[c0_i32]]) + //CHECK-SAME: (%[[cst]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, i32) -> () xegpu.prefetch %a_tdesc : !xegpu.tensor_desc<16xf32, #scatter> %data = xegpu.load %a_tdesc, %mask : !xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> -> vector<16xf32> - %b_tdesc = xegpu.create_tdesc %b[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + %b_tdesc = xegpu.create_tdesc %b, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> xegpu.store %data, %b_tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f16.mlir b/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f16.mlir index fda563970..696a44e85 100644 --- a/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f16.mlir +++ b/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f16.mlir @@ -1,7 +1,7 @@ // RUN: imex-opt -convert-xegpu-to-vc -cse %s | FileCheck %s -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr gpu.module @test_kernel { //CHECK: gpu.func @test_store_scatter(%[[arg0:.*]]: memref<16xf16>) kernel @@ -14,21 +14,20 @@ gpu.module @test_kernel { //CHECK: %[[cst_0:.*]] = arith.constant dense : vector<16xi1> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> + //CHECK: %[[alloc:.*]] = memref.alloc() : memref<16xf16, 3> %slm = memref.alloc() : memref<16xf16, 3> //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[alloc]] : memref<16xf16, 3> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[intptr]] : index to i32 - //CHECK: %[[cst_1:.*]] = arith.constant dense<2> : vector<16xi32> - //CHECK: %[[r1:.*]] = vector.from_elements {{.*}} : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi32> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_1]] : vector<16xi32> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i32 to vector<16xi32> - //CHECK: %[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi32> - %slm_tdesc = xegpu.create_tdesc %slm[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16, 3> -> !xegpu.tensor_desc<16xf16, #slm> + //CHECK: %[[cst_1:.*]] = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]> : vector<16xi32> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i32 to vector<16xi32> + //CHECK: %[[r2:.*]] = arith.addi %[[r1]], %[[cst_1]] : vector<16xi32> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<16xf16, 3>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #slm> - 
//CHECK: %[[r6:.*]] = vector.bitcast %[[cst]] : vector<16xf16> to vector<16xi16> - //CHECK: %[[r7:.*]] = arith.extui %[[r6]] : vector<16xi16> to vector<16xi32> + //CHECK: %[[r3:.*]] = vector.bitcast %[[cst]] : vector<16xf16> to vector<16xi16> + //CHECK: %[[r4:.*]] = arith.extui %[[r3]] : vector<16xi16> to vector<16xi32> //CHECK: %[[c4_i8:.*]] = arith.constant 4 : i8 //CHECK: %[[c0_i8:.*]] = arith.constant 0 : i8 //CHECK: %[[c1_i16:.*]] = arith.constant 1 : i16 @@ -36,32 +35,29 @@ gpu.module @test_kernel { //CHECK: %[[c6_i8:.*]] = arith.constant 6 : i8 //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 //CHECK: func.call @llvm.genx.lsc.store.slm.v16i1.v16i32.v16i32 - //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[r7]], %[[c0_i32]]) + //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[r4]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi32>, vector<16xi32>, i32) -> () xegpu.store %cst, %slm_tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #slm>, vector<16xi1> - //CHECK: %[[r8:.*]] = func.call @llvm.genx.lsc.load.slm.v16i32.v16i1.v16i32 - //CHECK-SAME: (%[[cst_0]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[c0_i32]]) + //CHECK: %[[r5:.*]] = func.call @llvm.genx.lsc.load.slm.v16i32.v16i1.v16i32 + //CHECK-SAME: (%[[cst_0]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi32>, i32) -> vector<16xi32> - //CHECK: %[[r9:.*]] = arith.trunci %[[r8]] : vector<16xi32> to vector<16xi16> - //CHECK: %[[r10:.*]] = vector.bitcast %[[r9]] : vector<16xi16> to vector<16xf16> + //CHECK: %[[r6:.*]] = arith.trunci %[[r5]] : vector<16xi32> to vector<16xi16> + //CHECK: %[[r7:.*]] = vector.bitcast %[[r6]] : vector<16xi16> to vector<16xf16> %data = xegpu.load %slm_tdesc, %mask : !xegpu.tensor_desc<16xf16, #slm>, vector<16xi1> -> vector<16xf16> //CHECK: %[[intptr_2:.*]] = memref.extract_aligned_pointer_as_index %[[arg0]] : memref<16xf16> -> index - //CHECK: %[[r11:.*]] = arith.index_castui %[[intptr_2]] : index to i64 - //CHECK: %[[cst_3:.*]] = arith.constant dense<2> : vector<16xi64> - - //CHECK: %[[r12:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r13:.*]] = arith.muli %[[r12]], %[[cst_3]] : vector<16xi64> - //CHECK: %[[r14:.*]] = vector.broadcast %[[r11]] : i64 to vector<16xi64> - //CHECK: %[[r15:.*]] = arith.addi %[[r14]], %[[r13]] : vector<16xi64> - %tdesc = xegpu.create_tdesc %mem[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #global> + //CHECK: %[[r8:.*]] = arith.index_castui %[[intptr_2]] : index to i64 + //CHECK: %[[cst_3:.*]] = arith.constant dense<[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]> : vector<16xi64> + //CHECK: %[[r9:.*]] = vector.broadcast %[[r8]] : i64 to vector<16xi64> + //CHECK: %[[r10:.*]] = arith.addi %[[r9]], %[[cst_3]] : vector<16xi64> + %tdesc = xegpu.create_tdesc %mem, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #global> - //CHECK: %[[r16:.*]] = vector.bitcast %[[r10]] : vector<16xf16> to vector<16xi16> - //CHECK: %[[r17:.*]] = arith.extui %[[r16]] : vector<16xi16> to 
vector<16xi32> + //CHECK: %[[r11:.*]] = vector.bitcast %[[r7]] : vector<16xf16> to vector<16xi16> + //CHECK: %[[r12:.*]] = arith.extui %[[r11]] : vector<16xi16> to vector<16xi32> //CHECK: func.call @llvm.genx.lsc.store.stateless.v16i1.v16i64.v16i32 - //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r15]], %[[r17]], %[[c0_i32]]) + //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c6_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r10]], %[[r12]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, vector<16xi32>, i32) -> () xegpu.store %data, %tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #global>, vector<16xi1> diff --git a/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f32.mlir b/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f32.mlir index 1af7818e7..14024fcf9 100644 --- a/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f32.mlir +++ b/test/Conversion/XeGPUToVC/store_load_slm_no_chunk_f32.mlir @@ -1,7 +1,7 @@ // RUN: imex-opt -convert-xegpu-to-vc -cse %s | FileCheck %s -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr gpu.module @test_kernel { //CHECK: gpu.func @test_store_scatter(%[[arg0:.*]]: memref<16xf32>) kernel @@ -12,19 +12,17 @@ gpu.module @test_kernel { //CHECK: %[[cst_0:.*]] = arith.constant dense : vector<16xi1> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> //CHECK: %[[alloc:.*]] = memref.alloc() : memref<16xf32, 3> %slm = memref.alloc() : memref<16xf32, 3> //CHECK: %[[intptr:.*]] = memref.extract_aligned_pointer_as_index %[[alloc]] : memref<16xf32, 3> -> index //CHECK: %[[r0:.*]] = arith.index_castui %[[intptr]] : index to i32 - //CHECK: %[[cst_1:.*]] = arith.constant dense<4> : vector<16xi32> - //CHECK: %[[r1:.*]] = vector.from_elements {{.*}} : vector<16xindex> - //CHECK: %[[r2:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi32> - //CHECK: %[[r3:.*]] = arith.muli %[[r2]], %[[cst_1]] : vector<16xi32> - //CHECK: %[[r4:.*]] = vector.broadcast %[[r0]] : i32 to vector<16xi32> - //CHECK: %[[r5:.*]] = arith.addi %[[r4]], %[[r3]] : vector<16xi32> - %slm_tdesc = xegpu.create_tdesc %slm[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32, 3> -> !xegpu.tensor_desc<16xf32, #slm> + //CHECK: %[[cst_1:.*]] = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xi32> + //CHECK: %[[r1:.*]] = vector.broadcast %[[r0]] : i32 to vector<16xi32> + //CHECK: %[[r2:.*]] = arith.addi %[[r1]], %[[cst_1]] : vector<16xi32> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<16xf32, 3>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #slm> //CHECK: %[[c4_i8:.*]] = arith.constant 4 : i8 //CHECK: %[[c0_i8:.*]] = arith.constant 0 : i8 @@ -33,26 +31,24 @@ gpu.module @test_kernel { //CHECK: %[[c3_i8:.*]] = arith.constant 3 : i8 //CHECK: %[[c1_i8:.*]] = arith.constant 1 : i8 //CHECK: func.call @llvm.genx.lsc.store.slm.v16i1.v16i32.v16f32 - //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[cst]], %[[c0_i32]]) + //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], 
%[[c0_i8]], %[[r2]], %[[cst]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi32>, vector<16xf32>, i32) -> () xegpu.store %cst, %slm_tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #slm>, vector<16xi1> - //CHECK: %[[r6:.*]] = func.call @llvm.genx.lsc.load.slm.v16f32.v16i1.v16i32 - //CHECK-SAME: (%[[cst_0]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r5]], %[[c0_i32]]) + //CHECK: %[[r3:.*]] = func.call @llvm.genx.lsc.load.slm.v16f32.v16i1.v16i32 + //CHECK-SAME: (%[[cst_0]], %[[c0_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r2]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi32>, i32) -> vector<16xf32> %data = xegpu.load %slm_tdesc, %mask : !xegpu.tensor_desc<16xf32, #slm>, vector<16xi1> -> vector<16xf32> //CHECK: %[[intptr_2:.*]] = memref.extract_aligned_pointer_as_index %[[arg0]] : memref<16xf32> -> index - //CHECK: %[[r7:.*]] = arith.index_castui %[[intptr_2]] : index to i64 - //CHECK: %[[cst_3:.*]] = arith.constant dense<4> : vector<16xi64> - //CHECK: %[[r8:.*]] = arith.index_castui %[[r1]] : vector<16xindex> to vector<16xi64> - //CHECK: %[[r9:.*]] = arith.muli %[[r8]], %[[cst_3]] : vector<16xi64> - //CHECK: %[[r10:.*]] = vector.broadcast %[[r7]] : i64 to vector<16xi64> - //CHECK: %[[r11:.*]] = arith.addi %[[r10]], %[[r9]] : vector<16xi64> - %tdesc = xegpu.create_tdesc %mem[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #global> + //CHECK: %[[r4:.*]] = arith.index_castui %[[intptr_2]] : index to i64 + //CHECK: %[[cst_3:.*]] = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xi64> + //CHECK: %[[r5:.*]] = vector.broadcast %[[r4]] : i64 to vector<16xi64> + //CHECK: %[[r6:.*]] = arith.addi %[[r5]], %[[cst_3]] : vector<16xi64> + %tdesc = xegpu.create_tdesc %mem, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #global> //CHECK: func.call @llvm.genx.lsc.store.stateless.v16i1.v16i64.v16f32 - //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r11]], %[[r6]], %[[c0_i32]]) + //CHECK-SAME: (%[[cst_0]], %[[c4_i8]], %[[c0_i8]], %[[c0_i8]], %[[c1_i16]], %[[c0_i32]], %[[c3_i8]], %[[c1_i8]], %[[c1_i8]], %[[c0_i8]], %[[r6]], %[[r3]], %[[c0_i32]]) //CHECK-SAME: (vector<16xi1>, i8, i8, i8, i16, i32, i8, i8, i8, i8, vector<16xi64>, vector<16xf32>, i32) -> () xegpu.store %data, %tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #global>, vector<16xi1> diff --git a/test/Conversion/XeTileToXeGPU/array_length_load.mlir b/test/Conversion/XeTileToXeGPU/array_length_load.mlir index 749e58e46..7fd8f87ca 100644 --- a/test/Conversion/XeTileToXeGPU/array_length_load.mlir +++ b/test/Conversion/XeTileToXeGPU/array_length_load.mlir @@ -7,8 +7,8 @@ gpu.module @test_kernel { %a_loaded = xetile.load_tile %a_tile : !xetile.tile<32x32xf16> -> vector<32x32xf16> // Do not let XeGPU do one load with multiple blocks (array_length > 1), where each block is finer than one GRF. 
- //CHECK: xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<1x32xf16> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %arg1[%c0, %c16] : memref<1x32xf16> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %arg1[%c0, %c0] : memref<1x32xf16> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %arg1[%c0, %c16] : memref<1x32xf16> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> %b_tile = xetile.init_tile %b[%c0, %c0] : memref<1x32xf16> -> !xetile.tile<1x32xf16> %b_loaded = xetile.load_tile %b_tile : !xetile.tile<1x32xf16> -> vector<1x32xf16> diff --git a/test/Conversion/XeTileToXeGPU/lit.local.cfg b/test/Conversion/XeTileToXeGPU/lit.local.cfg new file mode 100644 index 000000000..097b2470c --- /dev/null +++ b/test/Conversion/XeTileToXeGPU/lit.local.cfg @@ -0,0 +1,8 @@ + +# need slm support for XeTile lowering +excludes_slm_tests = [ + 'sg_mixed_scf.mlir', + 'sg_gemm_1k_1k_1k_f16_f32_slm.mlir', + ] + +config.excludes.update(excludes_slm_tests) diff --git a/test/Conversion/XeTileToXeGPU/reduction.mlir b/test/Conversion/XeTileToXeGPU/reduction.mlir index 2db5c548d..fa61a6170 100644 --- a/test/Conversion/XeTileToXeGPU/reduction.mlir +++ b/test/Conversion/XeTileToXeGPU/reduction.mlir @@ -10,10 +10,10 @@ module { %c0 = arith.constant 0 : index %acc = arith.constant dense<0.0> : vector<16xf16> //CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<128x256xf16> - //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> %t = xetile.init_tile %a[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<16x32xf16> //CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - //CHECK-SAME : !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> -> vector<16x32xf16> + //CHECK-SAME : !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> -> vector<16x32xf16> %v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16> //CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [0, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16> @@ -118,9 +118,9 @@ module { %r = vector.multi_reduction , %e, %acc [1] : vector<16x32xf16> to vector<16xf16> //CHECK: %[[R161:.*]] = vector.shape_cast %[[R160]] : vector<16xf16> to vector<2x8xf16> %c = vector.shape_cast %r: vector<16xf16> to vector<2x8xf16> - //CHECK: %[[R162:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[R162:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr> %s = xetile.init_tile %b[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<2x8xf16> - //CHECK: xegpu.store_nd %[[R161]], %[[R162]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<2x8xf16>, !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R161]], %[[R162]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<2x8xf16>, !xegpu.tensor_desc<2x8xf16, #xegpu.block_tdesc_attr> xetile.store_tile %c, %s : vector<2x8xf16>, !xetile.tile<2x8xf16> gpu.return } @@ -132,8 +132,8 @@ module { %a_tile = xetile.init_tile %a[%c0, %c0] : memref<8x32xf32> -> !xetile.tile<8x32xf32> 
%b_tile = xetile.init_tile %b[%c0, %c0] : memref<8x1xf32> -> !xetile.tile<8x1xf32> - //CHECK: xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> - //CHECK: xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> + //CHECK: xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> + //CHECK: xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> %a_loaded = xetile.load_tile %a_tile: !xetile.tile<8x32xf32> -> vector<8x32xf32> //CHECK: %[[R1:.*]] = vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23] : vector<16xf32>, vector<16xf32> @@ -173,10 +173,10 @@ module { %c0 = arith.constant 0 : index %acc = arith.constant dense<0.0> : vector<32xf16> //CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<128x256xf16> - //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> %t = xetile.init_tile %a[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<16x32xf16> //CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> -> vector<16x32xf16> + //CHECK-SAME: !xegpu.tensor_desc<16x32xf16, #xegpu.block_tdesc_attr> -> vector<16x32xf16> %v = xetile.load_tile %t : !xetile.tile<16x32xf16> -> vector<16x32xf16> //CHECK: %[[R2:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [0, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16> //CHECK: %[[R3:.*]] = vector.extract_strided_slice %[[R1]] {offsets = [1, 0], sizes = [1, 32], strides = [1, 1]} : vector<16x32xf16> to vector<1x32xf16> @@ -231,9 +231,9 @@ module { %r = vector.multi_reduction , %e, %acc [0] : vector<16x32xf16> to vector<32xf16> //CHECK: %[[R118:.*]] = vector.shape_cast %[[R117]] : vector<32xf16> to vector<4x8xf16> %c = vector.shape_cast %r: vector<32xf16> to vector<4x8xf16> - //CHECK: %[[R119:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[R119:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c0]]] : memref<128x256xf16> -> !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr> %s = xetile.init_tile %b[%c0, %c0] : memref<128x256xf16> -> !xetile.tile<4x8xf16> - //CHECK: xegpu.store_nd %[[R118]], %[[R119]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<4x8xf16>, !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R118]], %[[R119]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<4x8xf16>, !xegpu.tensor_desc<4x8xf16, #xegpu.block_tdesc_attr> xetile.store_tile %c, %s : vector<4x8xf16>, !xetile.tile<4x8xf16> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir index 15c9d742b..6df1b36bf 100644 --- 
a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32.mlir @@ -25,95 +25,95 @@ gpu.module @test_kernel { //CHECK: %[[r2:.*]] = arith.addi %[[r0]], %[[c0]] : index //CHECK: %[[r3:.*]] = arith.addi %[[r1]], %[[c0]] : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[r5:.*]] = arith.addi %[[r1]], %[[c16]] : index - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index //CHECK: %[[r7:.*]] = arith.addi %[[r1]], %[[c32]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c48:.*]] = arith.constant 48 : index //CHECK: %[[r9:.*]] = arith.addi %[[r1]], %[[c48]] : index - //CHECK: %[[r10:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r10:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index //CHECK: %[[r11:.*]] = arith.addi %[[r0]], %[[c8]] : index - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r11]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r16:.*]] = arith.addi %[[r0]], %[[c16]] : index - //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r18:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: 
%[[r19:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r18:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r19:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r16]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index //CHECK: %[[r21:.*]] = arith.addi %[[r0]], %[[c24]] : index - //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r24:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r25:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r24:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r25:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r21]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r26:.*]] = arith.addi %[[r0]], %[[c32]] : index - //CHECK: %[[r27:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r28:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r29:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r30:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r27:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r28:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r29:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r30:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: 
%[[c40:.*]] = arith.constant 40 : index //CHECK: %[[r31:.*]] = arith.addi %[[r0]], %[[c40]] : index - //CHECK: %[[r32:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r33:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r34:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r35:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r32:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r33:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r34:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r35:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r31]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r36:.*]] = arith.addi %[[r0]], %[[c48]] : index - //CHECK: %[[r37:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r38:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r39:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r40:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r37:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r38:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r39:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r40:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r36]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c56:.*]] = arith.constant 56 : index //CHECK: %[[r41:.*]] = arith.addi %[[r0]], %[[c56]] : index - //CHECK: %[[r42:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r43:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r44:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r45:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r46:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r47:.*]] = xegpu.create_nd_tdesc 
%[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r48:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r49:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r50:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r51:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r52:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r53:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r42:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r43:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r44:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r45:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r41]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r46:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r47:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r48:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r49:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r2]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r50:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r51:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r52:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r7]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r53:.*]] = xegpu.create_nd_tdesc %[[C]][%[[r26]], %[[r9]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> %c_init_tile = xetile.init_tile %C[%m, %n] : memref<1024x1024xf32> -> !xetile.tile<64x64xf32> - //CHECK: %[[r54:.*]] = xegpu.load_nd %[[r46]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r55:.*]] = xegpu.load_nd %[[r47]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r56:.*]] = xegpu.load_nd %[[r48]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : 
!xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r57:.*]] = xegpu.load_nd %[[r49]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r58:.*]] = xegpu.load_nd %[[r50]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r59:.*]] = xegpu.load_nd %[[r51]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r60:.*]] = xegpu.load_nd %[[r52]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r61:.*]] = xegpu.load_nd %[[r53]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r54:.*]] = xegpu.load_nd %[[r46]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r55:.*]] = xegpu.load_nd %[[r47]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r56:.*]] = xegpu.load_nd %[[r48]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r57:.*]] = xegpu.load_nd %[[r49]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r58:.*]] = xegpu.load_nd %[[r50]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r59:.*]] = xegpu.load_nd %[[r51]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r60:.*]] = xegpu.load_nd %[[r52]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r61:.*]] = xegpu.load_nd %[[r53]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> %c_init_value = xetile.load_tile %c_init_tile : !xetile.tile<64x64xf32> -> vector<64x64xf32> - //CHECK: %[[r62:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r63:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r64:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r65:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c32]]] : memref<1024x1024xf16> -> 
!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r62:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r63:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r2]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r64:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r65:.*]] = xegpu.create_nd_tdesc %[[A]][%[[r26]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %a_init_tile = xetile.init_tile %A[%m, %c0] : memref<1024x1024xf16> -> !xetile.tile<64x64xf16> - //CHECK: %[[r66:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r67:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r68:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r69:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r66:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r67:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c0]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r68:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r3]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r69:.*]] = xegpu.create_nd_tdesc %[[B]][%[[c32]], %[[r7]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %b_init_tile = xetile.init_tile %B[%c0, %n] : memref<1024x1024xf16> -> !xetile.tile<64x64xf16> //CHECK: %[[r72:.*]]:16 = scf.for %[[arg3:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] //CHECK-SAME: iter_args(%[[arg4:.*]] = %[[r62]], %[[arg5:.*]] = %[[r63]], %[[arg6:.*]] = %[[r64]], %[[arg7:.*]] = %[[r65]], %[[arg8:.*]] = %[[r66]], //CHECK-SAME: %[[arg9:.*]] = %[[r67]], %[[arg10:.*]] = %[[r68]], %[[arg11:.*]] = %[[r69]], %[[arg12:.*]] = %[[r54]], %[[arg13:.*]] = %[[r55]], //CHECK-SAME: %[[arg14:.*]] = %[[r56]], %[[arg15:.*]] = %[[r57]], %[[arg16:.*]] = %[[r58]], %[[arg17:.*]] = %[[r59]], %[[arg18:.*]] = %[[r60]], - //CHECK-SAME: %[[arg19:.*]] = %[[r61]]) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: %[[arg19:.*]] = %[[r61]]) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, 
#xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, //CHECK-SAME: vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>) { %out:3 = scf.for %k = %c0 to %c1024 step %c64 iter_args(%a_tile = %a_init_tile, %b_tile = %b_init_tile, %c_value = %c_init_value) @@ -152,16 +152,16 @@ gpu.module @test_kernel { //CHECK: %[[r208:.*]] = vector.extract_strided_slice %[[arg19]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: %[[r105:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r105:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r106:.*]] = vector.extract %[[r105]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r107:.*]] = vector.extract %[[r105]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r108:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r108:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r109:.*]] = vector.extract %[[r108]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r110:.*]] = vector.extract %[[r108]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r111:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r111:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r112:.*]] = vector.extract %[[r111]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r113:.*]] = vector.extract %[[r111]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r114:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r114:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r115:.*]] = vector.extract %[[r114]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r116:.*]] = vector.extract %[[r114]][1] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r117:.*]] = vector.extract_strided_slice %[[r106]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> @@ -198,16 +198,16 @@ gpu.module @test_kernel { //CHECK: %[[r148:.*]] = vector.extract_strided_slice %[[r116]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to 
vector<8x16xf16> %a_value = xetile.load_tile %a_tile : !xetile.tile<64x64xf16> -> vector<64x64xf16> - //CHECK: %[[r149:.*]] = xegpu.load_nd %[[arg8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r149:.*]] = xegpu.load_nd %[[arg8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r150:.*]] = vector.extract %[[r149]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r151:.*]] = vector.extract %[[r149]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r152:.*]] = xegpu.load_nd %[[arg9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r152:.*]] = xegpu.load_nd %[[arg9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r153:.*]] = vector.extract %[[r152]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r154:.*]] = vector.extract %[[r152]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r155:.*]] = xegpu.load_nd %[[arg10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r155:.*]] = xegpu.load_nd %[[arg10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r156:.*]] = vector.extract %[[r155]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r157:.*]] = vector.extract %[[r155]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r158:.*]] = xegpu.load_nd %[[arg11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r158:.*]] = xegpu.load_nd %[[arg11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r159:.*]] = vector.extract %[[r158]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r160:.*]] = vector.extract %[[r158]][1] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r161:.*]] = vector.extract_strided_slice %[[r150]] {offsets = [0, 0], sizes = [16, 16], strides = [1, 1]} : vector<32x16xf16> to vector<16x16xf16> @@ -384,21 +384,21 @@ gpu.module @test_kernel { //CHECK: %[[r358:.*]] = vector.shuffle %[[r288]], %[[r304]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> //CHECK: %[[r359:.*]] = vector.shuffle %[[r320]], %[[r336]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> //CHECK: %[[r360:.*]] = vector.shuffle %[[r358]], %[[r359]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16x16xf32>, vector<16x16xf32> - //CHECK: %[[r361:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r362:.*]] = xegpu.update_nd_offset 
%[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r363:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r364:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r365:.*]] = xegpu.update_nd_offset %[[arg8]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r366:.*]] = xegpu.update_nd_offset %[[arg9]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r367:.*]] = xegpu.update_nd_offset %[[arg10]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r368:.*]] = xegpu.update_nd_offset %[[arg11]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r361:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r362:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r363:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r364:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r365:.*]] = xegpu.update_nd_offset %[[arg8]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r366:.*]] = xegpu.update_nd_offset %[[arg9]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r367:.*]] = xegpu.update_nd_offset %[[arg10]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r368:.*]] = xegpu.update_nd_offset %[[arg11]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %a_next_tile = xetile.update_tile_offset %a_tile, [%c0, %c64] : !xetile.tile<64x64xf16>, index, index -> !xetile.tile<64x64xf16> %b_next_tile = xetile.update_tile_offset %b_tile, [%c64, %c0] : !xetile.tile<64x64xf16>, index, index -> !xetile.tile<64x64xf16> //CHECK: scf.yield %[[r361]], %[[r362]], %[[r363]], %[[r364]], %[[r365]], %[[r366]], %[[r367]], %[[r368]], %[[r339]], %[[r345]], %[[r351]], %[[r357]], %[[r342]], %[[r348]], %[[r354]], %[[r360]] - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>, //CHECK-SAME: 
vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32>, vector<32x16xf32> scf.yield %a_next_tile, %b_next_tile, %c_new_value : !xetile.tile<64x64xf16>, !xetile.tile<64x64xf16>, vector<64x64xf32> @@ -435,38 +435,38 @@ gpu.module @test_kernel { //CHECK: %[[r102:.*]] = vector.extract_strided_slice %[[r72]]#15 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r103:.*]] = vector.extract_strided_slice %[[r72]]#15 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r104:.*]] = vector.extract_strided_slice %[[r72]]#15 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: xegpu.store_nd %[[r73]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r77]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r81]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r85]], %[[r10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r74]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r78]], %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r82]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r86]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r75]], %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r79]], %[[r18]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r83]], %[[r19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r87]], %[[r20]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r76]], %[[r22]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r80]], %[[r23]] <{l1_hint = #xegpu.cache_hint, 
l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r84]], %[[r24]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r88]], %[[r25]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r89]], %[[r27]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r93]], %[[r28]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r97]], %[[r29]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r101]], %[[r30]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r90]], %[[r32]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r94]], %[[r33]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r98]], %[[r34]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r102]], %[[r35]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r91]], %[[r37]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r95]], %[[r38]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r99]], %[[r39]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r103]], %[[r40]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r92]], %[[r42]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r96]], %[[r43]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r100]], %[[r44]] <{l1_hint = 
#xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r104]], %[[r45]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r73]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r77]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r81]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r85]], %[[r10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r74]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r78]], %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r82]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r86]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r75]], %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r79]], %[[r18]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r83]], %[[r19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r87]], %[[r20]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r76]], %[[r22]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r80]], %[[r23]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r84]], %[[r24]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r88]], %[[r25]] <{l1_hint = 
#xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r89]], %[[r27]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r93]], %[[r28]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r97]], %[[r29]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r101]], %[[r30]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r90]], %[[r32]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r94]], %[[r33]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r98]], %[[r34]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r102]], %[[r35]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r91]], %[[r37]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r95]], %[[r38]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r99]], %[[r39]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r103]], %[[r40]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r92]], %[[r42]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r96]], %[[r43]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r100]], %[[r44]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r104]], %[[r45]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %out#2, %c_init_tile: 
vector<64x64xf32>, !xetile.tile<64x64xf32> gpu.return diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32_slm.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32_slm.mlir index a57b1300f..2e075d714 100644 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32_slm.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_f16_f32_slm.mlir @@ -2,7 +2,7 @@ // RUN: --cse --convert-xetile-to-xegpu --cse %s -o -| FileCheck %s -#tile_attr = #xetile.tile_attr +#tile_attr = #xetile.tile_attr // CHECK-LABEL: gpu.module @test_kernel { gpu.module @test_kernel { @@ -26,37 +26,37 @@ gpu.module @test_kernel { %0 = arith.muli %block_id_x, %c16 : index %1 = arith.muli %block_id_y, %c16 : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<128x128xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r5:.*]] = xegpu.load_nd %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<128x128xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r5:.*]] = xegpu.load_nd %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> -> vector<8x16xf32> %2 = xetile.init_tile %arg2[%0, %1] : memref<128x128xf32> -> !xetile.tile<8x16xf32> %3 = xetile.load_tile %2 {padding = 0.000000e+00 : f32} : !xetile.tile<8x16xf32> -> vector<8x16xf32> - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c1:.*]] = arith.constant 1 : index //CHECK: %[[r7:.*]] = arith.addi %[[r0]], %[[c1]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r7]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r7]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c2:.*]] = arith.constant 2 : index //CHECK: %[[r9:.*]] = arith.addi %[[r0]], %[[c2]] : index - //CHECK: %[[r10:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r9]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r10:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r9]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c3:.*]] = arith.constant 3 : index //CHECK: %[[r11:.*]] = arith.addi %[[r0]], %[[c3]] : index - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r11]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r11]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c4:.*]] = arith.constant 4 : index //CHECK: %[[r13:.*]] = arith.addi %[[r0]], %[[c4]] : index - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r13]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r13]], %[[c0]]] : 
memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c5:.*]] = arith.constant 5 : index //CHECK: %[[r15:.*]] = arith.addi %[[r0]], %[[c5]] : index - //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r15]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r15]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c6:.*]] = arith.constant 6 : index //CHECK: %[[r17:.*]] = arith.addi %[[r0]], %[[c6]] : index - //CHECK: %[[r18:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r17]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r18:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r17]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c7:.*]] = arith.constant 7 : index //CHECK: %[[r19:.*]] = arith.addi %[[r0]], %[[c7]] : index - //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r19]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r19]], %[[c0]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> %4 = xetile.init_tile %arg0[%0, %c0] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #tile_attr> - //CHECK: %[[r21:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r3]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r21:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r3]]] : memref<128x128xf16, 3> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> %5 = xetile.init_tile %arg1[%c0, %1] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #tile_attr> //CHECK: %[[r37:.*]]:10 = scf.for %[[arg3:.*]] = %[[c0]] to %[[c128]] step %[[c16]] @@ -64,25 +64,25 @@ gpu.module @test_kernel { //CHECK-SAME: %[[arg7:.*]] = %[[r12]], %[[arg8:.*]] = %[[r14]], %[[arg9:.*]] = %[[r16]], //CHECK-SAME: %[[arg10:.*]] = %[[r18]], %[[arg11:.*]] = %[[r20]], %[[arg12:.*]] = %[[r21]], //CHECK-SAME: %[[arg28:.*]] = %[[r5]]) - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32> + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32> %6:3 = scf.for %arg3 = %c0 
to %c128 step %c16 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) -> (!xetile.tile<8x16xf16, #tile_attr>, !xetile.tile<16x16xf16, #tile_attr>, vector<8x16xf32>) { - //CHECK: %[[r38:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r39:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r40:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r41:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r42:.*]] = xegpu.load_nd %[[arg8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r43:.*]] = xegpu.load_nd %[[arg9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r44:.*]] = xegpu.load_nd %[[arg10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> - //CHECK: %[[r45:.*]] = xegpu.load_nd %[[arg11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r38:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r39:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r40:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r41:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r42:.*]] = xegpu.load_nd %[[arg8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r43:.*]] = xegpu.load_nd %[[arg9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r44:.*]] = xegpu.load_nd %[[arg10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> + //CHECK: %[[r45:.*]] = xegpu.load_nd %[[arg11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : 
!xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> -> vector<1x16xf16> //CHECK: %[[r46:.*]] = vector.shuffle %[[r38]], %[[r39]] [0, 1] : vector<1x16xf16>, vector<1x16xf16> //CHECK: %[[r47:.*]] = vector.shuffle %[[r40]], %[[r41]] [0, 1] : vector<1x16xf16>, vector<1x16xf16> //CHECK: %[[r48:.*]] = vector.shuffle %[[r42]], %[[r43]] [0, 1] : vector<1x16xf16>, vector<1x16xf16> @@ -92,27 +92,27 @@ gpu.module @test_kernel { //CHECK: %[[r52:.*]] = vector.shuffle %[[r50]], %[[r51]] [0, 1, 2, 3, 4, 5, 6, 7] : vector<4x16xf16>, vector<4x16xf16> %7 = xetile.load_tile %arg4 {padding = 0.000000e+00 : f32} : !xetile.tile<8x16xf16, #tile_attr> -> vector<8x16xf16> - //CHECK: %[[r53:.*]] = xegpu.load_nd %[[arg12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<16x16xf16> + //CHECK: %[[r53:.*]] = xegpu.load_nd %[[arg12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<16x16xf16> %8 = xetile.load_tile %arg5 {padding = 0.000000e+00 : f32} : !xetile.tile<16x16xf16, #tile_attr> -> vector<16x16xf16> //CHECK: %[[r84:.*]] = xegpu.dpas %[[r52]], %[[r53]], %[[arg28]] : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> %9 = xetile.tile_mma %7, %8, %arg6 : vector<8x16xf16>, vector<16x16xf16>, vector<8x16xf32> -> vector<8x16xf32> - //CHECK: %[[r85:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r86:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r87:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r88:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r89:.*]] = xegpu.update_nd_offset %[[arg8]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r90:.*]] = xegpu.update_nd_offset %[[arg9]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r91:.*]] = xegpu.update_nd_offset %[[arg10]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> - //CHECK: %[[r92:.*]] = xegpu.update_nd_offset %[[arg11]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r85:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r86:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r87:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r88:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r89:.*]] = xegpu.update_nd_offset %[[arg8]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r90:.*]] = xegpu.update_nd_offset %[[arg9]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r91:.*]] = xegpu.update_nd_offset %[[arg10]], [%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r92:.*]] = xegpu.update_nd_offset %[[arg11]], 
[%[[c0]], %[[c16]]] : !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> %10 = xetile.update_tile_offset %arg4, [%c0, %c16] : !xetile.tile<8x16xf16, #tile_attr>, index, index -> !xetile.tile<8x16xf16, #tile_attr> - //CHECK: %[[r108:.*]] = xegpu.update_nd_offset %[[arg12]], [%[[c16]], %[[c0]]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r108:.*]] = xegpu.update_nd_offset %[[arg12]], [%[[c16]], %[[c0]]] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> %11 = xetile.update_tile_offset %arg5, [%c16, %c0] : !xetile.tile<16x16xf16, #tile_attr>, index, index -> !xetile.tile<16x16xf16, #tile_attr> scf.yield %10, %11, %9 : !xetile.tile<8x16xf16, #tile_attr>, !xetile.tile<16x16xf16, #tile_attr>, vector<8x16xf32> } - //CHECK: xegpu.store_nd %[[r37]]#9, %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r37]]#9, %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %6#2, %2 : vector<8x16xf32>, !xetile.tile<8x16xf32> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir index 2f477344e..f1cf087eb 100644 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_i8_i32.mlir @@ -24,44 +24,44 @@ gpu.module @test_kernel { //CHECK: %[[r2:.*]] = arith.addi %[[r0]], %[[c0]] : index //CHECK: %[[r3:.*]] = arith.addi %[[r1]], %[[c0]] : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[r5:.*]] = arith.addi %[[r1]], %[[c16]] : index - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index //CHECK: %[[r7:.*]] = arith.addi %[[r0]], %[[c8]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> - //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[r10:.*]] = arith.addi %[[r0]], %[[c16]] : index - //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: 
%[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index //CHECK: %[[r13:.*]] = arith.addi %[[r0]], %[[c24]] : index - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> - //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> - //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> - //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> - //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> -> vector<32x16xi32> - //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> -> vector<32x16xi32> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xi32> -> !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> + //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> -> vector<32x16xi32> + //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi32, #xegpu.block_tdesc_attr> -> vector<32x16xi32> %c_init_tile = xetile.init_tile %C[%m, %n] : memref<1024x1024xi32> -> !xetile.tile<32x32xi32> %c_init_value = xetile.load_tile %c_init_tile : !xetile.tile<32x32xi32> -> vector<32x32xi32> - //CHECK: %20 = xegpu.create_nd_tdesc %[[arg0]][%2, %[[c0]]] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> + //CHECK: %20 = xegpu.create_nd_tdesc %[[arg0]][%2, %[[c0]]] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> %a_init_tile = xetile.init_tile %A[%m, %c0] : memref<1024x1024xi8> -> !xetile.tile<32x32xi8> - //CHECK: %21 = xegpu.create_nd_tdesc %[[arg1]][%c0, %3] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> + //CHECK: %21 = xegpu.create_nd_tdesc %[[arg1]][%c0, %3] : memref<1024x1024xi8> -> !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> %b_init_tile = xetile.init_tile %B[%c0, %n] : memref<1024x1024xi8> -> !xetile.tile<32x32xi8> %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%a_tile = %a_init_tile, %b_tile = 
%b_init_tile, %c_value = %c_init_value) -> (!xetile.tile<32x32xi8>, !xetile.tile<32x32xi8>, vector<32x32xi32>) { - //CHECK: %[[r39:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> -> vector<32x32xi8> + //CHECK: %[[r39:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> -> vector<32x32xi8> //CHECK: %[[r40:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [0, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> //CHECK: %[[r41:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [8, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> //CHECK: %[[r42:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [16, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> //CHECK: %[[r43:.*]] = vector.extract_strided_slice %[[r39]] {offsets = [24, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xi8> to vector<8x32xi8> %a_value = xetile.load_tile %a_tile : !xetile.tile<32x32xi8> -> vector<32x32xi8> - //CHECK: %[[r44:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> -> vector<2x32x16xi8> + //CHECK: %[[r44:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> -> vector<2x32x16xi8> //CHECK: %[[r45:.*]] = vector.extract %[[r44]][0] : vector<32x16xi8> from vector<2x32x16xi8> //CHECK: %[[r46:.*]] = vector.extract %[[r44]][1] : vector<32x16xi8> from vector<2x32x16xi8> %b_value = xetile.load_tile %b_tile : !xetile.tile<32x32xi8> -> vector<32x32xi8> @@ -69,14 +69,14 @@ gpu.module @test_kernel { //CHECK-COUNT-8: xegpu.dpas {{.*}} : vector<8x32xi8>, vector<32x16xi8>, vector<8x16xi32> -> vector<8x16xi32> %c_new_value = xetile.tile_mma %a_value, %b_value, %c_value : vector<32x32xi8>, vector<32x32xi8>, vector<32x32xi32> -> vector<32x32xi32> - //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c0]], %[[c32]]] : !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> - //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c32]], %[[c0]]] : !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> + //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c0]], %[[c32]]] : !xegpu.tensor_desc<32x32xi8, #xegpu.block_tdesc_attr> + //CHECK: xegpu.update_nd_offset %{{.*}}, [%[[c32]], %[[c0]]] : !xegpu.tensor_desc<32x16xi8, #xegpu.block_tdesc_attr> %a_next_tile = xetile.update_tile_offset %a_tile, [%c0, %c32] : !xetile.tile<32x32xi8>, index, index -> !xetile.tile<32x32xi8> %b_next_tile = xetile.update_tile_offset %b_tile, [%c32, %c0] : !xetile.tile<32x32xi8>, index, index -> !xetile.tile<32x32xi8> scf.yield %a_next_tile, %b_next_tile, %c_new_value : !xetile.tile<32x32xi8>, !xetile.tile<32x32xi8>, vector<32x32xi32> } - //CHECK-COUNT-8: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> + //CHECK-COUNT-8: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xi32>, !xegpu.tensor_desc<8x16xi32, #xegpu.block_tdesc_attr> xetile.store_tile %out#2, %c_init_tile 
{innner_blocks = [8, 16]}: vector<32x32xi32>, !xetile.tile<32x32xi32> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir index db8022dad..0e29a1fc5 100755 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_1k_1k_1k_tf32_tf32.mlir @@ -23,43 +23,43 @@ gpu.module @test_kernel { //CHECK: %[[r2:.*]] = arith.addi %[[r0]], %[[c0]] : index //CHECK: %[[r3:.*]] = arith.addi %[[r1]], %[[c0]] : index - //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r4:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index //CHECK: %[[r5:.*]] = arith.addi %[[r1]], %[[c16]] : index - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index //CHECK: %[[r7:.*]] = arith.addi %[[r0]], %[[c8]] : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r7]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[r10:.*]] = arith.addi %[[r0]], %[[c16]] : index - //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r11:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r10]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index //CHECK: %[[r13:.*]] = arith.addi %[[r0]], %[[c24]] : index - //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r14:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r3]]] : memref<1024x1024xf32> -> 
!xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r15:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r13]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r16:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r3]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r17:.*]] = xegpu.create_nd_tdesc %[[arg2]][%[[r2]], %[[r5]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> %2 = xetile.init_tile %arg2[%0, %1] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32> - //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r18:.*]] = xegpu.load_nd %[[r16]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + //CHECK: %[[r19:.*]] = xegpu.load_nd %[[r17]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> %3 = xetile.load_tile %2 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xf32> -> vector<32x32xf32> - //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c0]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r21:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c16]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r20:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c0]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r21:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[r2]], %[[c16]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> %4 = xetile.init_tile %arg0[%0, %c0] : memref<1024x1024xtf32> -> !xetile.tile<32x32xtf32> - //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r3]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r5]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r22:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r3]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r23:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[r5]]] : memref<1024x1024xtf32> -> !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> %5 = xetile.init_tile %arg1[%c0, %1] : memref<1024x1024xtf32> -> !xetile.tile<32x32xtf32> //CHECK: %[[r24:.*]]:6 = scf.for %[[arg3:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] //CHECK-SAME: iter_args(%[[arg4:.*]] = %[[r20]], %[[arg5:.*]] = %[[r21]], %[[arg6:.*]] = %[[r22]], %[[arg7:.*]] = %[[r23]], %[[arg8:.*]] = %[[r18]], %[[arg9:.*]] = %[[r19]]) - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, 
#xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> %6:3 = scf.for %arg3 = %c0 to %c1024 step %c64 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) -> (!xetile.tile<32x32xtf32>, !xetile.tile<32x32xtf32>, vector<32x32xf32>) { //CHECK: %[[r65:.*]] = vector.extract_strided_slice %[[arg8]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r66:.*]] = vector.extract_strided_slice %[[arg8]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -70,10 +70,10 @@ gpu.module @test_kernel { //CHECK: %[[r71:.*]] = vector.extract_strided_slice %[[arg9]] {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r72:.*]] = vector.extract_strided_slice %[[arg9]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: %[[r33:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xtf32> + //CHECK: %[[r33:.*]] = xegpu.load_nd %[[arg4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xtf32> //CHECK: %[[r34:.*]] = vector.extract %[[r33]][0] : vector<32x8xtf32> from vector<2x32x8xtf32> //CHECK: %[[r35:.*]] = vector.extract %[[r33]][1] : vector<32x8xtf32> from vector<2x32x8xtf32> - //CHECK: %[[r36:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xtf32> + //CHECK: %[[r36:.*]] = xegpu.load_nd %[[arg5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> -> vector<2x32x8xtf32> //CHECK: %[[r37:.*]] = vector.extract %[[r36]][0] : vector<32x8xtf32> from vector<2x32x8xtf32> //CHECK: %[[r38:.*]] = vector.extract %[[r36]][1] : vector<32x8xtf32> from vector<2x32x8xtf32> //CHECK: %[[r39:.*]] = vector.extract_strided_slice %[[r34]] {offsets = [0, 0], sizes = [8, 8], strides = [1, 1]} : vector<32x8xtf32> to vector<8x8xtf32> @@ -94,8 +94,8 @@ gpu.module @test_kernel { //CHECK: %[[r54:.*]] = vector.extract_strided_slice %[[r38]] {offsets = [24, 0], sizes = [8, 8], strides = [1, 1]} : vector<32x8xtf32> to vector<8x8xtf32> %7 = xetile.load_tile %arg4 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xtf32> -> vector<32x32xtf32> - //CHECK: %[[r55:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> -> vector<32x16xtf32> - //CHECK: %[[r56:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> -> vector<32x16xtf32> + //CHECK: %[[r55:.*]] = xegpu.load_nd %[[arg6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : 
!xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> -> vector<32x16xtf32> + //CHECK: %[[r56:.*]] = xegpu.load_nd %[[arg7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> -> vector<32x16xtf32> //CHECK: %[[r57:.*]] = vector.extract_strided_slice %[[r55]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xtf32> to vector<8x16xtf32> //CHECK: %[[r58:.*]] = vector.extract_strided_slice %[[r55]] {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xtf32> to vector<8x16xtf32> //CHECK: %[[r59:.*]] = vector.extract_strided_slice %[[r55]] {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xtf32> to vector<8x16xtf32> @@ -149,19 +149,19 @@ gpu.module @test_kernel { %9 = xetile.tile_mma %7, %8, %arg6 : vector<32x32xtf32>, vector<32x32xtf32>, vector<32x32xf32> -> vector<32x32xf32> - //CHECK: %[[r111:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r112:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r111:.*]] = xegpu.update_nd_offset %[[arg4]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r112:.*]] = xegpu.update_nd_offset %[[arg5]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr> %10 = xetile.update_tile_offset %arg4, [%c0, %c64] : !xetile.tile<32x32xtf32>, index, index -> !xetile.tile<32x32xtf32> - //CHECK: %[[r113:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> - //CHECK: %[[r114:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r113:.*]] = xegpu.update_nd_offset %[[arg6]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> + //CHECK: %[[r114:.*]] = xegpu.update_nd_offset %[[arg7]], [%[[c64]], %[[c0]]] : !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr> %11 = xetile.update_tile_offset %arg5, [%c64, %c0] : !xetile.tile<32x32xtf32>, index, index -> !xetile.tile<32x32xtf32> //CHECK: scf.yield %[[r111]], %[[r112]], %[[r113]], %[[r114]], %[[r107]], %[[r110]] - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, - //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x8xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, + //CHECK-SAME: !xegpu.tensor_desc<32x16xtf32, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> scf.yield %10, %11, %9 : !xetile.tile<32x32xtf32>, !xetile.tile<32x32xtf32>, vector<32x32xf32> } @@ -173,14 +173,14 @@ gpu.module @test_kernel { //CHECK: %[[r30:.*]] = vector.extract_strided_slice %[[r24]]#5 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r31:.*]] = vector.extract_strided_slice %[[r24]]#5 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> //CHECK: %[[r32:.*]] = vector.extract_strided_slice %[[r24]]#5 {offsets = [24, 0], 
sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - //CHECK: xegpu.store_nd %[[r25]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r29]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r26]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r30]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r27]], %[[r11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r31]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r28]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r32]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r25]], %[[r4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r29]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r26]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r30]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r27]], %[[r11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r31]], %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r28]], %[[r14]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r32]], %[[r15]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %6#2, %2 : vector<32x32xf32>, !xetile.tile<32x32xf32> //CHECK: gpu.return gpu.return diff --git 
a/test/Conversion/XeTileToXeGPU/sg_gemm_transpose_b.mlir b/test/Conversion/XeTileToXeGPU/sg_gemm_transpose_b.mlir index 43deb87d2..0723d481e 100644 --- a/test/Conversion/XeTileToXeGPU/sg_gemm_transpose_b.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_gemm_transpose_b.mlir @@ -17,18 +17,18 @@ gpu.module @test_kernel { %c_init_tile = xetile.init_tile %C[%m, %n] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32> %c_init_value = xetile.load_tile %c_init_tile : !xetile.tile<32x32xf32> -> vector<32x32xf32> %a_init_tile = xetile.init_tile %A[%m, %c0] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> -// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[arg1]][%{{.*}}, %{{.*}}] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[arg1]][%{{.*}}, %{{.*}}] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[arg1]][%{{.*}}, %{{.*}}] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T2:.*]] = xegpu.create_nd_tdesc %[[arg1]][%{{.*}}, %{{.*}}] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %b_init_tile = xetile.init_tile %B[%c0, %n] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> -// CHECK: scf.for %{{.*}}= %{{.*}}to %{{.*}}step %{{.*}}iter_args(%{{.*}}= %{{.*}}, %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[T2]], %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}} = %{{.*}}) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { +// CHECK: scf.for %{{.*}}= %{{.*}}to %{{.*}}step %{{.*}}iter_args(%{{.*}}= %{{.*}}, %[[ARG5:.*]] = %[[T1]], %[[ARG6:.*]] = %[[T2]], %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}}= %{{.*}}, %{{.*}} = %{{.*}}) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>) { %out:3 = scf.for %k = %c0 to %c1024 step %c32 iter_args(%a_tile = %a_init_tile, %b_tile = %b_init_tile, %c_value = %c_init_value) -> (!xetile.tile<32x32xf16>, !xetile.tile<32x32xf16>, vector<32x32xf32>) { %a_value = xetile.load_tile %a_tile : !xetile.tile<32x32xf16> -> vector<32x32xf16> // Check if array_length is 1 for the load + transpose + MMA B case. 
//
- // xegpu.load_nd %[[ARG5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16>
- // xegpu.load_nd %[[ARG6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16>
+ // xegpu.load_nd %[[ARG5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16>
+ // xegpu.load_nd %[[ARG6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16>
%b_value = xetile.load_tile %b_tile : !xetile.tile<32x32xf16> -> vector<32x32xf16>
%b_transpose = xetile.transpose %b_value, [1, 0] : vector<32x32xf16> -> vector<32x32xf16>
%c_new_value = xetile.tile_mma %a_value, %b_transpose, %c_value : vector<32x32xf16>, vector<32x32xf16>, vector<32x32xf32> -> vector<32x32xf32>
diff --git a/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir
index 2fb7cc259..194edb960 100644
--- a/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_load_tile.mlir
@@ -8,10 +8,10 @@ gpu.module @test_kernel {
//CHECK: %[[c64:.*]] = arith.constant 64 : index
%c64 = arith.constant 64 : index
//CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]]
- //CHECK-SAME: memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
+ //CHECK-SAME: memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
%1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16>
//CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- //CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ //CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
%2 = xetile.load_tile %1 : !xetile.tile<32x32xf16> -> vector<32x32xf16>
gpu.return
}
diff --git a/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir b/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir
index 3a8ec5510..565804d46 100755
--- a/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_mixed_scf.mlir
@@ -56,7 +56,7 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp
%26 = arith.muli %arg3, %c1024 : index
%27 = arith.addi %26, %13 : index
%28 = arith.addi %27, %16 : index
- //CHECK: %{{.*}} = xegpu.create_nd_tdesc {{.*}} : memref<2048x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>
+ //CHECK: %{{.*}} = xegpu.create_nd_tdesc {{.*}} : memref<2048x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>
%29 = xetile.init_tile %arg1[%28, %15] : memref<2048x12288xbf16> -> !xetile.tile<32x32xbf16>
%30 = scf.for %arg4 = %c0 to %c2 step %c1 iter_args(%arg5 = %cst) -> (vector<1x4xf32>) {
@@ -65,23 +65,23 @@
%35 = arith.addi %34, %10 : index
%36 = arith.addi %35, %11 : index
- //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<16384x12288xbf16> ->
!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %37 = xetile.init_tile %arg0[%36, %9] : memref<16384x12288xbf16> -> !xetile.tile<32x32xbf16> %38:3 = scf.for %arg6 = %c0 to %c12288 step %c32 iter_args(%arg7 = %37, %arg8 = %29, %arg9 = %cst_0) -> (!xetile.tile<32x32xbf16>, !xetile.tile<32x32xbf16>, vector<32x32xf32>) { - //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> + //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> //CHECK-COUNT-2: %{{.*}} = vector.extract %{{.*}} : vector<32x16xbf16> from vector<2x32x16xbf16> //CHECK-COUNT-8: %{{.*}} = vector.extract_strided_slice %{{.*}} {offsets = {{.*}}, sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> %48 = xetile.load_tile %arg7 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xbf16> -> vector<32x32xbf16> - //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> + //CHECK: %{{.*}} = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> //CHECK-COUNT-2: %{{.*}} = vector.extract %{{.*}} : vector<32x16xbf16> from vector<2x32x16xbf16> //CHECK-COUNT-4: %{{.*}} = vector.extract_strided_slice %{{.*}} {offsets = {{.*}}, sizes = [16, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<16x16xbf16> %49 = xetile.load_tile %arg8 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xbf16> -> vector<32x32xbf16> - //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> - //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + //CHECK: %{{.*}} = xegpu.update_nd_offset %{{.*}} : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %50 = xetile.update_tile_offset %arg7, [%c0, %c32] : !xetile.tile<32x32xbf16>, index, index -> !xetile.tile<32x32xbf16> %51 = xetile.update_tile_offset %arg8, [%c0, %c32] : !xetile.tile<32x32xbf16>, index, index -> !xetile.tile<32x32xbf16> @@ -101,24 +101,24 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp %41 = vector.shape_cast %40 : vector<32xf32> to vector<1x32xf32> %alloc = memref.alloc() : memref<8x128xf32, 3> - //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x32xf32, #xegpu.block_tdesc_attr> - %42 = xetile.init_tile %alloc[%17, %13] : memref<8x128xf32, 3> -> !xetile.tile<1x32xf32, #xetile.tile_attr> - - //CHECK: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.block_tdesc_attr> - xetile.store_tile %41, %42 : vector<1x32xf32>, !xetile.tile<1x32xf32, #xetile.tile_attr> - - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> 
!xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK-COUNT-8: xegpu.load_nd {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> -> vector<1x4xf32> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x32xf32, #xegpu.block_tdesc_attr> + %42 = xetile.init_tile %alloc[%17, %13] : memref<8x128xf32, 3> -> !xetile.tile<1x32xf32, #xetile.tile_attr> + + //CHECK: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x32xf32>, !xegpu.tensor_desc<1x32xf32, #xegpu.block_tdesc_attr> + xetile.store_tile %41, %42 : vector<1x32xf32>, !xetile.tile<1x32xf32, #xetile.tile_attr> + + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.create_nd_tdesc %{{.*}} : memref<8x128xf32, 3> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK-COUNT-8: xegpu.load_nd {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> -> vector<1x4xf32> //CHECK-COUNT-8: arith.addf %{{.*}}, %{{.*}} : vector<1x4xf32> - %43 = xetile.init_tile %alloc[%21, %23] : memref<8x128xf32, 3> -> !xetile.tile<8x4xf32, #xetile.tile_attr> - %44 = xetile.load_tile %43 {padding = 0.000000e+00 : f32} : !xetile.tile<8x4xf32, #xetile.tile_attr> -> vector<8x4xf32> + %43 = xetile.init_tile %alloc[%21, %23] : memref<8x128xf32, 3> -> !xetile.tile<8x4xf32, #xetile.tile_attr> + %44 = xetile.load_tile %43 {padding = 0.000000e+00 : f32} : !xetile.tile<8x4xf32, #xetile.tile_attr> -> vector<8x4xf32> %45 = vector.multi_reduction , %44, %cst_2 [0] : vector<8x4xf32> to vector<4xf32> %46 = vector.shape_cast %45 : vector<4xf32> to vector<1x4xf32> %47 = arith.addf %arg5, %46 : vector<1x4xf32> @@ -126,8 +126,8 @@ gpu.module @postop_reduce_m attributes {spirv.target_env = #spirv.target_env<#sp } {lowerBoundMap = affine_map<() -> (0)>, 
operandSegmentSizes = array, step = 1 : index, syn.mm_dim = 0 : i64, syn.parall_level = 2 : i64, upperBoundMap = affine_map<() -> (2)>} //CHECK: %{{.*}} = arith.addi %{{.*}}, %{{.*}} : index - //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<32x2048xf32> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x4xf32>, !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: %{{.*}} = xegpu.create_nd_tdesc %{{.*}} : memref<32x2048xf32> -> !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x4xf32>, !xegpu.tensor_desc<1x4xf32, #xegpu.block_tdesc_attr> %31 = arith.addi %26, %16 : index %32 = xetile.init_tile %arg2[%25, %31] : memref<32x2048xf32> -> !xetile.tile<1x4xf32> xetile.store_tile %30, %32 : vector<1x4xf32>, !xetile.tile<1x4xf32> diff --git a/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir b/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir index a452e0454..b500bf23d 100644 --- a/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_scf_for.mlir @@ -12,19 +12,19 @@ gpu.module @test_kernel { %c64 = arith.constant 64 : index %c1024 = arith.constant 1024 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> - //CHECK: %[[r1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] iter_args(%[[arg3:.*]] = %[[r0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>) { + //CHECK: %[[r1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]] iter_args(%[[arg3:.*]] = %[[r0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>) { %nexta, %res = scf.for %k= %c0 to %c1024 step %c64 iter_args(%subA = %1, %subB = %cst) -> (!xetile.tile<32x32xf16>, vector<32x32xf16>) { - //CHECK: %[[r10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> %3 = xetile.load_tile %subA : !xetile.tile<32x32xf16> -> vector<32x32xf16> - //CHECK: %[[r11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %5 = xetile.update_tile_offset %subA, [%c0, %c64]: !xetile.tile<32x32xf16>, index, index -> !xetile.tile<32x32xf16> - //CHECK: scf.yield %[[r11]], %[[r10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16> + //CHECK: scf.yield %[[r11]], %[[r10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16> scf.yield %5, %3: 
!xetile.tile<32x32xf16>, vector<32x32xf16> } //CHECK: %[[r2:.*]] = vector.extract_strided_slice %[[r1]]#1 {offsets = [0, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16> @@ -33,19 +33,19 @@ gpu.module @test_kernel { //CHECK: %[[r5:.*]] = vector.extract_strided_slice %[[r1]]#1 {offsets = [24, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16> - //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r6:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index - //CHECK: %[[r7:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r7:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index - //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r8:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = arith.constant 24 : index - //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r9:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> %5 = xetile.init_tile %b[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> - //CHECK: xegpu.store_nd %[[r2]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r3]], %[[r7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r4]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[r5]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r2]], %[[r6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r3]], %[[r7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r4]], %[[r8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[r5]], %[[r9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> 
xetile.store_tile %res, %5: vector<32x32xf16>, !xetile.tile<32x32xf16> //CHECK: gpu.return diff --git a/test/Conversion/XeTileToXeGPU/sg_softmax.mlir b/test/Conversion/XeTileToXeGPU/sg_softmax.mlir index e07c15bb5..f874de792 100644 --- a/test/Conversion/XeTileToXeGPU/sg_softmax.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_softmax.mlir @@ -4,13 +4,13 @@ gpu.module @test_kernel { //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>) gpu.func @sglevel_softmax_dim_0(%a: memref<1024x1024xf16>) { //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index - //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16> - //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> - //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> %2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16> //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16> @@ -30,12 +30,12 @@ gpu.module @test_kernel { //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>) gpu.func @sglevel_softmax_dim_1(%a: memref<1024x1024xf16>) { //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index - //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16> - //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> 
vector<32x32xf16> - //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> %2 = xetile.load_tile %1: !xetile.tile<32x64xf16> -> vector<32x64xf16> //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16> //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r3]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16> diff --git a/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir index 07d7111f4..9b8007a62 100644 --- a/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_store_tile.mlir @@ -12,28 +12,28 @@ gpu.module @test_kernel { %result = arith.constant dense<0.0>: vector<32x32xf32> //CHECK: %[[c0:.*]] = arith.constant 0 : index //CHECK: %[[c32:.*]] = arith.constant 32 : index - //CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c48:.*]] = arith.constant 48 : index - //CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c8:.*]] = arith.constant 8 : index - //CHECK: %[[R6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[R7:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R7:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c16:.*]] = arith.constant 16 : index - //CHECK: %[[R8:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[R9:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R8:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R9:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> //CHECK: %[[c24:.*]] = 
arith.constant 24 : index - //CHECK: %[[R10:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: %[[R11:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R10:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c32]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: %[[R11:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c48]]] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[0, 32] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32> - //CHECK: xegpu.store_nd %[[R0]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R0]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R1]], %[[R6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R1]], %[[R7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R2]], %[[R8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R2]], %[[R9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R3]], %[[R10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - //CHECK: xegpu.store_nd %[[R3]], %[[R11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R0]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R0]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R1]], %[[R6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R1]], %[[R7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R2]], %[[R8]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R2]], %[[R9]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = 
#xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R3]], %[[R10]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + //CHECK: xegpu.store_nd %[[R3]], %[[R11]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %result, %1: vector<32x32xf32>, !xetile.tile<32x32xf32> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir b/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir index 93524d094..bc1871af5 100644 --- a/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_tile_mma.mlir @@ -9,10 +9,10 @@ gpu.module @test_kernel { //CHECK: %[[c64:.*]] = arith.constant 64 : index %c64 = arith.constant 64 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %1 = xetile.init_tile %a[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16> - //CHECK: %[[r1:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r1:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r2:.*]] = vector.extract %[[r1]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r3:.*]] = vector.extract %[[r1]][1] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r4:.*]] = vector.extract_strided_slice %[[r2]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> @@ -24,15 +24,15 @@ gpu.module @test_kernel { //CHECK: %[[r10:.*]] = vector.extract_strided_slice %[[r3]] {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> //CHECK: %[[r11:.*]] = vector.extract_strided_slice %[[r3]] {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> %2 = xetile.load_tile %1 : !xetile.tile<32x32xf16> -> vector<32x32xf16> - //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c64]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r12:.*]] = xegpu.create_nd_tdesc %[[arg1]][%[[c64]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index - //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %arg1[%[[c64]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r13:.*]] = xegpu.create_nd_tdesc %arg1[%[[c64]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %3 = xetile.init_tile %b[%c64, %c0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16> - //CHECK: %[[r14:.*]] = xegpu.load_nd %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + 
//CHECK: %[[r14:.*]] = xegpu.load_nd %[[r12]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r15:.*]] = vector.extract %[[r14]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r16:.*]] = vector.extract %[[r14]][1] : vector<32x16xf16> from vector<2x32x16xf16> - //CHECK: %[[r17:.*]] = xegpu.load_nd %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[r17:.*]] = xegpu.load_nd %[[r13]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[r18:.*]] = vector.extract %[[r17]][0] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r19:.*]] = vector.extract %[[r17]][1] : vector<32x16xf16> from vector<2x32x16xf16> //CHECK: %[[r20:.*]] = vector.extract_strided_slice %[[r15]] {offsets = [0, 0], sizes = [16, 16], strides = [1, 1]} : vector<32x16xf16> to vector<16x16xf16> diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir index d24ae787b..f503b4dea 100644 --- a/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_tiled_broadcast.mlir @@ -10,7 +10,7 @@ gpu.module @test_kernel { %3 = xetile.tile_unpack %2 {inner_blocks = array}: vector<32x4x1x16xf16> -> vector<32x64xf16> %4 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr> %5 = xetile.tile_pack %3 {inner_blocks = array}: vector<32x64xf16> -> vector<32x4x1x16xf16> - // CHECK-COUNT-128: xegpu.store_nd %[[cst]], %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + // CHECK-COUNT-128: xegpu.store_nd %[[cst]], %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> xetile.store_tile %5, %4 : vector<32x4x1x16xf16>, !xetile.tile<32x64xf16, #xetile.tile_attr> gpu.return } @@ -87,7 +87,7 @@ gpu.module @test_kernel { %2 = xetile.tile_unpack %1 {inner_blocks = array}: vector<32x4x1x16xf16> -> vector<32x64xf16> %3 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr> %4 = xetile.tile_pack %2 {inner_blocks = array}: vector<32x64xf16> -> vector<32x4x1x16xf16> - // CHECK-COUNT-128: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> + // CHECK-COUNT-128: xegpu.store_nd %{{.*}}, %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<1x16xf16>, !xegpu.tensor_desc<1x16xf16, #xegpu.block_tdesc_attr> xetile.store_tile %4, %3 : vector<32x4x1x16xf16>, !xetile.tile<32x64xf16, #xetile.tile_attr> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir index cb2f84467..92d59f0ee 100644 --- a/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_tiled_load_tile.mlir @@ -9,11 +9,11 @@ gpu.module @test_kernel { %c64 = 
arith.constant 64 : index
// CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %arg0[%[[C0]], %[[C64]]] : memref<1024x1024xf16>
- // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
+ // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
%0 = xetile.init_tile %arg0[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr>
// CHECK: %[[R1:.*]] = xegpu.load_nd %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}>
- // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ // CHECK-SAME: !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
%1 = xetile.load_tile %0 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xf16, #xetile.tile_attr> -> vector<1x1x32x32xf16>
gpu.return
}
diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir
index a12536ed1..62d7e313d 100644
--- a/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir
+++ b/test/Conversion/XeTileToXeGPU/sg_tiled_scf_for.mlir
@@ -14,30 +14,30 @@
// CHECK: %[[c1024:.*]] = arith.constant 1024 : index
%c1024 = arith.constant 1024 : index
- // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
+ // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
%0 = xetile.init_tile %arg0[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr>
// CHECK: %[[R1:.*]]:2 = scf.for %[[arg2:.*]] = %[[c0]] to %[[c1024]] step %[[c64]]
- // CHECK-SAME: iter_args(%[[arg3:.*]] = %[[R0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>)
+ // CHECK-SAME: iter_args(%[[arg3:.*]] = %[[R0]], %[[arg4:.*]] = %[[cst]]) -> (!xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>)
%1:2 = scf.for %arg2 = %c0 to %c1024 step %c64 iter_args(%arg3 = %0, %arg4 = %cst) -> (!xetile.tile<32x32xf16, #xetile.tile_attr>, vector<1x1x32x32xf16>) {
- // CHECK: %[[R10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
+ // CHECK: %[[R10:.*]] = xegpu.load_nd %[[arg3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16>
%5 = xetile.load_tile %arg3 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xf16, #xetile.tile_attr> -> vector<1x1x32x32xf16>
- // CHECK: %[[R11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
+ // CHECK: %[[R11:.*]] = xegpu.update_nd_offset %[[arg3]], [%[[c0]], %[[c64]]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>
%6 = xetile.update_tile_offset %arg3, [%c0, %c64] : !xetile.tile<32x32xf16, #xetile.tile_attr>, index, index -> !xetile.tile<32x32xf16, #xetile.tile_attr>
- // CHECK: scf.yield %[[R11]], %[[R10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>
+ // CHECK: scf.yield %[[R11]], %[[R10]] : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr>, vector<32x32xf16>
scf.yield %6, %5 : !xetile.tile<32x32xf16, #xetile.tile_attr>, vector<1x1x32x32xf16>
}
- // CHECK: %[[R2:.*]] = xegpu.create_nd_tdesc
%[[ARG1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R2:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c0]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> // CHECK: %[[c8:.*]] = arith.constant 8 : index - // CHECK: %[[R3:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R3:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c8]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> // CHECK: %[[c16:.*]] = arith.constant 16 : index - // CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c16]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> // CHECK: %[[c24:.*]] = arith.constant 24 : index - // CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[ARG1]][%[[c24]], %[[c64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> %2 = xetile.init_tile %arg1[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr> // CHECK: %[[R6:.*]] = vector.extract_strided_slice %[[R1]]#1 {offsets = [0, 0], sizes = [8, 32], strides = [1, 1]} : vector<32x32xf16> to vector<8x32xf16> @@ -47,10 +47,10 @@ %3 = xetile.tile_unpack %1#1 {inner_blocks = array} : vector<1x1x32x32xf16> -> vector<32x32xf16> %4 = xetile.tile_pack %3 {inner_blocks = array}: vector<32x32xf16> -> vector<4x1x8x32xf16> - // CHECK: xegpu.store_nd %[[R6]], %[[R2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> - // CHECK: xegpu.store_nd %[[R7]], %[[R3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> - // CHECK: xegpu.store_nd %[[R8]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> - // CHECK: xegpu.store_nd %[[R9]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: xegpu.store_nd %[[R6]], %[[R2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: xegpu.store_nd %[[R7]], %[[R3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: xegpu.store_nd %[[R8]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> + // CHECK: xegpu.store_nd %[[R9]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x32xf16>, !xegpu.tensor_desc<8x32xf16, #xegpu.block_tdesc_attr> xetile.store_tile %4, %2 : vector<4x1x8x32xf16>, 
!xetile.tile<32x32xf16, #xetile.tile_attr> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir index a520e588b..9929788d0 100644 --- a/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_tiled_softmax.mlir @@ -4,13 +4,13 @@ gpu.module @test_kernel { //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>) gpu.func @sglevel_softmax_dim_0(%arg0: memref<1024x1024xf16>) { //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index - //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %0 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr> - //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> - //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> %1 = xetile.load_tile %0 {padding = 0.000000e+00 : f32} : !xetile.tile<32x64xf16, #xetile.tile_attr> -> vector<1x2x32x32xf16> //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16> @@ -35,12 +35,12 @@ gpu.module @test_kernel { //CHECK-SAME: (%[[arg0:.*]]: memref<1024x1024xf16>) gpu.func @sglevel_softmax_dim_1(%arg0: memref<1024x1024xf16>) { //CHECK: %[[c0:.*]] = arith.constant 0 : index - //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> //CHECK: %[[c32:.*]] = arith.constant 32 : index - //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> + //CHECK: %[[r1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> %0 = xetile.init_tile %arg0[0, 0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr> - //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint 
= #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> - //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r2:.*]] = xegpu.load_nd %[[r0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> + //CHECK: %[[r3:.*]] = xegpu.load_nd %[[r1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x32xf16, #xegpu.block_tdesc_attr> -> vector<32x32xf16> %1 = xetile.load_tile %0 {padding = 0.000000e+00 : f32} : !xetile.tile<32x64xf16, #xetile.tile_attr> -> vector<1x2x32x32xf16> //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r2]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16> //CHECK-COUNT-32: {{.*}} = vector.extract_strided_slice %[[r3]] {offsets = [{{.*}}], sizes = [1, 32], strides = [1, 1]} : vector<32x32xf16> to vector<1x32xf16> diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir index 4419001ca..52ba152c0 100644 --- a/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_tiled_store_tile.mlir @@ -8,25 +8,25 @@ // CHECK: %[[c0:.*]] = arith.constant 0 : index // CHECK: %[[c32:.*]] = arith.constant 32 : index // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c32]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[c48:.*]] = arith.constant 48 : index // CHECK: %[[R1:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c0]], %[[c48]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[c8:.*]] = arith.constant 8 : index // CHECK: %[[R2:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c32]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[R3:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c8]], %[[c48]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[c16:.*]] = arith.constant 16 : index // CHECK: %[[R4:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c32]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[R5:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c16]], %[[c48]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[c24:.*]] = arith.constant 24 : index // CHECK: %[[R6:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c32]]] : memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: %[[R7:.*]] = xegpu.create_nd_tdesc %[[arg0]][%[[c24]], %[[c48]]] : 
memref<1024x1024xf32> - // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %0 = xetile.init_tile %arg0[0, 32] : memref<1024x1024xf32> -> !xetile.tile<32x32xf32, #xetile.tile_attr> // CHECK: %[[R8:.*]] = vector.extract_strided_slice %[[cst]] {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -37,21 +37,21 @@ %2 = xetile.tile_pack %1 {inner_blocks = array} : vector<32x32xf32> -> vector<4x2x8x16xf32> // CHECK: xegpu.store_nd %[[R8]], %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R8]], %[[R1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R9]], %[[R2]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R9]], %[[R3]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R10]], %[[R4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R10]], %[[R5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R11]], %[[R6]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> // CHECK: xegpu.store_nd %[[R11]], %[[R7]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> - // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> xetile.store_tile %2, %0 : vector<4x2x8x16xf32>, !xetile.tile<32x32xf32, #xetile.tile_attr> gpu.return } diff --git a/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir b/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir index 31cac7e21..6079e9f29 100644 --- a/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir +++ b/test/Conversion/XeTileToXeGPU/sg_tiled_tile_mma.mlir @@ -8,24 +8,24 @@ gpu.module @test_kernel { // CHECK: %[[C64:.*]] = arith.constant 64 : index %c64 = arith.constant 64 : index - // CHECK: %[[REG0:.*]] = 
xegpu.create_nd_tdesc %[[ARG0]][%[[C0]], %[[C64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[REG0:.*]] = xegpu.create_nd_tdesc %[[ARG0]][%[[C0]], %[[C64]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %0 = xetile.init_tile %arg0[%c0, %c64] : memref<1024x1024xf16> -> !xetile.tile<32x32xf16, #xetile.tile_attr> - // CHECK: %[[REG1:.*]] = xegpu.load_nd %[[REG0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + // CHECK: %[[REG1:.*]] = xegpu.load_nd %[[REG0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> // CHECK: %[[REG2:.*]] = vector.extract %[[REG1]][0] : vector<32x16xf16> from vector<2x32x16xf16> // CHECK: %[[REG3:.*]] = vector.extract %[[REG1]][1] : vector<32x16xf16> from vector<2x32x16xf16> %1 = xetile.load_tile %0 {padding = 0.000000e+00 : f32} : !xetile.tile<32x32xf16, #xetile.tile_attr> -> vector<1x2x32x16xf16> - // CHECK: %[[REG4:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[REG4:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C0]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> // CHECK: %[[C32:.*]] = arith.constant 32 : index - // CHECK: %[[REG5:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + // CHECK: %[[REG5:.*]] = xegpu.create_nd_tdesc %arg1[%[[C64]], %[[C32]]] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> %2 = xetile.init_tile %arg1[%c64, %c0] : memref<1024x1024xf16> -> !xetile.tile<32x64xf16, #xetile.tile_attr> - // CHECK: %[[REG6:.*]] = xegpu.load_nd %[[REG4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + // CHECK: %[[REG6:.*]] = xegpu.load_nd %[[REG4]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> // CHECK: %[[REG7:.*]] = vector.extract %[[REG6]][0] : vector<32x16xf16> from vector<2x32x16xf16> // CHECK: %[[REG8:.*]] = vector.extract %[[REG6]][1] : vector<32x16xf16> from vector<2x32x16xf16> - // CHECK: %[[REG9:.*]] = xegpu.load_nd %[[REG5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + // CHECK: %[[REG9:.*]] = xegpu.load_nd %[[REG5]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> // CHECK: %[[REG10:.*]] = vector.extract %[[REG9]][0] : vector<32x16xf16> from vector<2x32x16xf16> // CHECK: %[[REG11:.*]] = vector.extract %[[REG9]][1] : vector<32x16xf16> from vector<2x32x16xf16> %3 = xetile.load_tile %2 {padding = 0.000000e+00 : f32} : !xetile.tile<32x64xf16, #xetile.tile_attr> -> vector<1x4x32x16xf16> diff --git a/test/Conversion/XeTileToXeGPU/test_order.mlir b/test/Conversion/XeTileToXeGPU/test_order.mlir index 60b8b029f..b0eedce8e 100644 --- a/test/Conversion/XeTileToXeGPU/test_order.mlir +++ 
b/test/Conversion/XeTileToXeGPU/test_order.mlir @@ -5,10 +5,10 @@ // CHECK: %[[C0:.*]] = arith.constant 0 : index // CHECK: %[[C16:.*]] = arith.constant 16 : index // CHECK: %[[R_CAST:.*]] = memref.reinterpret_cast %[[ARG1]] to offset: [0], sizes: [128, 64], strides: [64, 1] : memref<64x128xf16, strided<[1, 64]>> to memref<128x64xf16, strided<[64, 1]>> -// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[R_CAST]][%[[C0]], %[[C0]]] : memref<128x64xf16, strided<[64, 1]>> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -// CHECK: %[[T8:.*]] = xegpu.load_nd %[[T1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> -// CHECK: %[[T19:.*]] = xegpu.update_nd_offset %[[T1]], [%[[C0]], %[[C16]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -// CHECK: %[[T26:.*]] = xegpu.load_nd %[[T19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> +// CHECK: %[[T1:.*]] = xegpu.create_nd_tdesc %[[R_CAST]][%[[C0]], %[[C0]]] : memref<128x64xf16, strided<[64, 1]>> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T8:.*]] = xegpu.load_nd %[[T1]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> +// CHECK: %[[T19:.*]] = xegpu.update_nd_offset %[[T1]], [%[[C0]], %[[C16]]] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> +// CHECK: %[[T26:.*]] = xegpu.load_nd %[[T19]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> gpu.module @test_kernel { func.func @test_func(%A : memref<128x64xf16>, %B : memref<64x128xf16, strided<[1, 64], offset: 0>>) { %c0 = arith.constant 0 : index diff --git a/test/Dialect/XeGPU/IR/XeGPUOps.mlir b/test/Dialect/XeGPU/IR/XeGPUOps.mlir index 6f729cc9f..7d88ec0a7 100644 --- a/test/Dialect/XeGPU/IR/XeGPUOps.mlir +++ b/test/Dialect/XeGPU/IR/XeGPUOps.mlir @@ -24,10 +24,10 @@ func.func @test_create_nd_tdesc_vc(%src: memref<24x32xf32>) { // CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) { func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) { - // CHECK: xegpu.create_tdesc %{{.*}} : ui64 - // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 - -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK: xegpu.create_tdesc %{{.*}} : ui64, vector<16xindex> + // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> + -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> return } diff --git a/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir b/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir index 7551f7308..159c338a0 100644 --- a/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir +++ b/test/Dialect/XeGPU/IR/atomic_rmw_vc.mlir @@ -5,8 +5,8 @@ // RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s // CHECK-LABEL: func @test_atomic_rmw({{.*}}) { -func.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) { - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> +func.func @test_atomic_rmw(%src: 
ui64, %offsets : vector<16 x index>, %value : vector<16xf32>, %mask : vector<16xi1>) { + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> @@ -16,8 +16,8 @@ func.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<1 } // CHECK-LABEL: func @test_atomic_rmw_0({{.*}}) { -func.func @test_atomic_rmw_0(%src: ui64, %value : vector<16x2xf32>, %mask : vector<16xi1>) { - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> +func.func @test_atomic_rmw_0(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xf32>, %mask : vector<16xi1>) { + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1>, vector<16x2xf32> @@ -27,8 +27,8 @@ func.func @test_atomic_rmw_0(%src: ui64, %value : vector<16x2xf32>, %mask : vect } // CHECK-LABEL: func @test_atomic_rmw_1({{.*}}) { -func.func @test_atomic_rmw_1(%src: ui64, %value : vector<16x2xi32>, %mask : vector<16xi1>) { - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> +func.func @test_atomic_rmw_1(%src: ui64, %offsets : vector<16 x index>, %value : vector<16x2xi32>, %mask : vector<16xi1>) { + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> // CHECK: xegpu.atomic_rmw // CHECK-SAME: !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16xi1>, vector<16x2xi32> diff --git a/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir b/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir index e437622a6..d351fb826 100644 --- a/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir +++ b/test/Dialect/XeGPU/IR/create_nd_tdesc.mlir @@ -65,9 +65,9 @@ func.func @test_create_nd_tdesc_4(%src: memref, %w : index, %h : index, func.func @test_create_nd_tdesc_5(%src: memref, %w : index, %h : index, %x : index, %y : index) { %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>> %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] - : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> + : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> return } @@ -75,9 +75,9 @@ func.func @test_create_nd_tdesc_5(%src: memref, %w : index, %h : index, func.func @test_create_nd_tdesc_6(%src: memref, %w : index, %h : index, %x : index, %y : index) { %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>> %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] - : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> + : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> return } @@ -94,9 +94,9 @@ func.func @test_create_nd_tdesc_7(%src: memref<1024xf16>, %offset : index) { func.func @test_create_nd_tdesc_8(%src: memref, %w : index, %h : index, %x : index) { %c1 = arith.constant 1 : 
index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr>> %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] - : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> + : memref -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr> return } @@ -104,8 +104,8 @@ func.func @test_create_nd_tdesc_8(%src: memref, %w : index, %h : index, func.func @test_create_nd_tdesc_9(%src: memref, %w : index, %h : index, %x : index) { %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref -> !xegpu.tensor_desc<64x128xf16, #xegpu.block_tdesc_attr>> + // CHECK-SAME: memref -> !xegpu.tensor_desc<64x128xf16, #xegpu.block_tdesc_attr>> %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] : memref - -> !xegpu.tensor_desc<64x128xf16, #xegpu.block_tdesc_attr> + -> !xegpu.tensor_desc<64x128xf16, #xegpu.block_tdesc_attr> return } diff --git a/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir b/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir index e24c15574..6f652a21c 100644 --- a/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir +++ b/test/Dialect/XeGPU/IR/create_nd_tdesc_vc.mlir @@ -69,9 +69,8 @@ func.func @test_create_nd_tdesc_vc_5(%src: memref, %w : index, %h : ind %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1] - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] - : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : memref -> !xegpu.tensor_desc<8x16xf32> return } @@ -80,9 +79,8 @@ func.func @test_create_nd_tdesc_vc_6(%src: memref, %w : index, %h : ind %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc // CHECK-SAME: %arg0[%arg3, %arg4], [%arg2, %arg1], [%arg1, %c1] - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] - : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src[%x, %y], [%h, %w], [%w, %c1] : memref -> !xegpu.tensor_desc<8x16xf32> return } @@ -100,16 +98,15 @@ func.func @test_create_nd_tdesc_vc_7(%src: memref<1024xf32>, %offset : index) { func.func @test_create_nd_tdesc_vc_8(%src: memref, %w : index, %h : index, %x : index) { %c1 = arith.constant 1 : index // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] - : memref -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: memref -> !xegpu.tensor_desc<8x16xf32> + %1 = xegpu.create_nd_tdesc %src[8, %x], [%h, %w], [%w, %c1] : memref -> !xegpu.tensor_desc<8x16xf32> return } // CHECK-LABEL: func @test_create_nd_tdesc_vc_9({{.*}}) { func.func @test_create_nd_tdesc_vc_9(%src: memref<8x32xf32>) { // CHECK: xegpu.create_nd_tdesc - // CHECK-SAME: memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + // CHECK-SAME: memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %1 = 
xegpu.create_nd_tdesc %src[0, 0] : memref<8x32xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr>
return
}
diff --git a/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir b/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
index fd2da2354..137f77816 100644
--- a/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
+++ b/test/Dialect/XeGPU/IR/create_tdesc_vc.mlir
@@ -1,49 +1,51 @@
-// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt %s | FileCheck %s
+// RUN: imex-opt %s | FileCheck %s
// Verify the printed output can be parsed.
-// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt %s | IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt | FileCheck %s
+// RUN: imex-opt %s | imex-opt | FileCheck %s
// Verify the generic form can be parsed.
-// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt -mlir-print-op-generic %s | IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt | FileCheck %s
+// RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s
// CHECK-LABEL: func @test_create_tdesc_vc({{.*}}) {
-func.func @test_create_tdesc_vc(%src: ui64) {
- // CHECK: xegpu.create_tdesc %{{.*}} : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
- %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+func.func @test_create_tdesc_vc(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %arg0, %arg1
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
return
}
// CHECK-LABEL: func @test_create_tdesc_vc_2({{.*}}) {
-func.func @test_create_tdesc_vc_2(%src: ui64) {
- // CHECK: xegpu.create_tdesc %{{.*}} : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
- %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64
- -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>
+func.func @test_create_tdesc_vc_2(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %arg0, %arg1
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
+ %1 = xegpu.create_tdesc %src, %offsets :
+ ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>
return
}
// CHECK-LABEL: func @test_create_tdesc_vc_3({{.*}}) {
-func.func @test_create_tdesc_vc_3(%src: ui64) {
- // CHECK: xegpu.create_tdesc %{{.*}} : ui64 -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
- %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64
+func.func @test_create_tdesc_vc_3(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %arg0, %arg1
+ // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
+ %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index>
-> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>
return
}
// CHECK-LABEL: func @test_create_tdesc_vc_4({{.*}}) {
-func.func @test_create_tdesc_vc_4(%src: ui64) {
- // CHECK: xegpu.create_tdesc %{{.*}} : ui64
- // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
- %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64
- -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
+func.func @test_create_tdesc_vc_4(%src: ui64, %offsets : vector<16 x index>) {
+ // CHECK: xegpu.create_tdesc %arg0, %arg1 : ui64, vector<16xindex>
+ // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>
+ %1 =
xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> + -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> return } // CHECK-LABEL: func @test_create_tdesc_vc_5({{.*}}) { -func.func @test_create_tdesc_vc_5(%src: memref) { - // CHECK: xegpu.create_tdesc - // CHECK-SAME: memref - // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref - -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> +func.func @test_create_tdesc_vc_5(%src: memref, %offsets : vector<16 x index>) { + // CHECK: xegpu.create_tdesc {{.*}} : memref, vector<16xindex> + // CHECK-SAME: !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src, %offsets : memref, vector<16 x index> + -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> return } diff --git a/test/Dialect/XeGPU/IR/invalid_vc.mlir b/test/Dialect/XeGPU/IR/invalid_vc.mlir index 3cfb4ad9b..aef5e77a5 100644 --- a/test/Dialect/XeGPU/IR/invalid_vc.mlir +++ b/test/Dialect/XeGPU/IR/invalid_vc.mlir @@ -47,19 +47,19 @@ func.func @test_create_nd_tdesc_vc_5(%input: memref<24x32x64xf32>) { } // ----- -func.func @test_create_tdesc(%src: ui64) { - // expected-error@+1 {{Incorrect TensorDesc shape}} - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17] - : ui64 -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<>> +func.func @test_create_tdesc(%src: ui64, %offsets : vector<16x8xindex>) { + // expected-error@+1 {{operand #1 must be vector of index values of ranks 1}} + %1 = xegpu.create_tdesc %src, %offsets + : ui64, vector<16x8xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<>> return } // ----- -func.func @test_load_gather(%src: ui64) { +func.func @test_load_gather(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc {{.*}} : ui64 + // CHECK: xegpu.create_tdesc {{.*}} : ui64, vector<16xindex> // CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf16, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{failed to verify that all of {value, TensorDesc} have same rank}} @@ -69,25 +69,25 @@ func.func @test_load_gather(%src: ui64) { } // ----- -func.func @test_create_tdesc_oversized(%src: ui64) { +func.func @test_create_tdesc_oversized(%src: ui64, %offsets : vector<16xindex>) { // expected-error@+1 {{total access size (simd_lanes * chunk_size * sizeof(elemTy)) is upto 512 bytes}} - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x16xf32, #xegpu.scatter_tdesc_attr> return } // ----- -func.func @test_create_tdesc_invalid_chunk_size(%src: ui64) { +func.func @test_create_tdesc_invalid_chunk_size(%src: ui64, %offsets : vector<16xindex>) { // expected-error@+1 {{Invalid chunk_size. 
Supported values are 1, 2, 3, 4, 8, 16, 32, 64, 128, or 256.}} - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x7xf32, #xegpu.scatter_tdesc_attr> return } // ----- -func.func @test_create_tdesc_unaligned(%src: ui64) { +func.func @test_create_tdesc_unaligned(%src: ui64, %offsets : vector<16xindex>) { // expected-error@+1 {{access size (chunk_size * sizeof(elemTy)) should be 32-bit aligned}} - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x3xf16, #xegpu.scatter_tdesc_attr> return } diff --git a/test/Dialect/XeGPU/IR/load_gather_vc.mlir b/test/Dialect/XeGPU/IR/load_gather_vc.mlir index 3209205ac..e65275267 100644 --- a/test/Dialect/XeGPU/IR/load_gather_vc.mlir +++ b/test/Dialect/XeGPU/IR/load_gather_vc.mlir @@ -1,16 +1,16 @@ -// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt %s | FileCheck %s +// RUN: imex-opt %s | FileCheck %s // Verify the printed output can be parsed. -// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt %s | IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt | FileCheck %s +// RUN: imex-opt %s | imex-opt | FileCheck %s // Verify the generic form can be parsed. -// RUN: IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt -mlir-print-op-generic %s | IMEX_XEGPU_PRINT_DEFAULTS=true imex-opt | FileCheck %s +// RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s // CHECK-LABEL: func @test_load_gather_vc({{.*}}) { -func.func @test_load_gather_vc(%src: ui64) { +func.func @test_load_gather_vc(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> - //CHECK: {{.*}} = xegpu.create_tdesc {{.*}} : ui64 + //CHECK: {{.*}} = xegpu.create_tdesc {{.*}}, {{.*}} : ui64, vector<16xindex> //CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK: {{.*}} = xegpu.load {{.*}}, {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> @@ -20,12 +20,12 @@ func.func @test_load_gather_vc(%src: ui64) { } // CHECK-LABEL: func @test_load_gather_vc_2({{.*}}) { -func.func @test_load_gather_vc_2(%src: ui64) { +func.func @test_load_gather_vc_2(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> - //CHECK: {{.*}} = xegpu.create_tdesc {{.*}} : ui64 + //CHECK: {{.*}} = xegpu.create_tdesc {{.*}} : ui64, vector<16xindex> //CHECK-SAME: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> //CHECK: {{.*}} = xegpu.load {{.*}}, {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> @@ -36,11 +36,11 @@ func.func @test_load_gather_vc_2(%src: ui64) { } // CHECK-LABEL: func @test_load_gather_vc_4({{.*}}) { -func.func @test_load_gather_vc_4(%src: ui64) { +func.func @test_load_gather_vc_4(%src: ui64, %offsets : vector<16xindex>) { %0 = arith.constant dense<1>: vector<16xi1> - //CHECK: {{.*}} = xegpu.create_tdesc {{.*}} : ui64 
-> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 + //CHECK: {{.*}} = xegpu.create_tdesc {{.*}}, {{.*}} : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> //CHECK: {{.*}} = xegpu.load {{.*}}, {{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> diff --git a/test/Dialect/XeGPU/IR/store_scatter_vc.mlir b/test/Dialect/XeGPU/IR/store_scatter_vc.mlir index ad8a5b9f4..df304e739 100644 --- a/test/Dialect/XeGPU/IR/store_scatter_vc.mlir +++ b/test/Dialect/XeGPU/IR/store_scatter_vc.mlir @@ -5,13 +5,16 @@ // RUN: imex-opt -mlir-print-op-generic %s | imex-opt | FileCheck %s // CHECK-LABEL: func @test_store_scatter_vc({{.*}}) { -func.func @test_store_scatter_vc(%src: ui64, %dst: ui64) { +func.func @test_store_scatter_vc(%src: ui64, %offsets : vector<16 x index>, %dst: ui64) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc %{{.*}} : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + // CHECK: xegpu.create_tdesc + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %1 = xegpu.create_tdesc %src, %offsets : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - // CHECK: xegpu.create_tdesc %{{.*}} : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %2 = xegpu.create_tdesc %dst[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + // CHECK: xegpu.create_tdesc + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %2 = xegpu.create_tdesc %dst, %offsets + : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK: xegpu.load // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} diff --git a/test/Dialect/XeGPU/IR/update_offset_vc.mlir b/test/Dialect/XeGPU/IR/update_offset_vc.mlir index 15f03b34e..2a90d4c07 100644 --- a/test/Dialect/XeGPU/IR/update_offset_vc.mlir +++ b/test/Dialect/XeGPU/IR/update_offset_vc.mlir @@ -7,8 +7,10 @@ // CHECK-LABEL: func @test_update_offset_VC({{.*}}) { func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) { %0 = arith.constant dense<1>: vector<16xi1> - // CHECK: xegpu.create_tdesc %{{.*}} : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %1 = xegpu.create_tdesc %src[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + // CHECK: xegpu.create_tdesc + // CHECK-SAME: ui64, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %1 = xegpu.create_tdesc %src, %offsets + : ui64, vector<16 x index> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> // CHECK: xegpu.load // CHECK-SAME: {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} @@ -16,8 +18,12 @@ func.func @test_update_offset_VC(%src: ui64, %offsets : vector<16 x index>) { %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint} : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> - // CHECK: xegpu.update_offset %{{.*}} : !xegpu.tensor_desc<16xf32, 
#xegpu.scatter_tdesc_attr<>> - %5 = xegpu.update_offset %1, [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %3 = arith.constant dense<16>: vector<16 x index> + + // CHECK: xegpu.update_offset + // CHECK-SAME: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> + %5 = xegpu.update_offset %1, %3 + : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16 x index> return } diff --git a/test/Dialect/XeTile/IR/ops.mlir b/test/Dialect/XeTile/IR/ops.mlir index f0ba5981d..1a952e3f0 100644 --- a/test/Dialect/XeTile/IR/ops.mlir +++ b/test/Dialect/XeTile/IR/ops.mlir @@ -22,8 +22,8 @@ #wg_map_b2 = #xetile.wg_map func.func @test_init_tile_for_slm(%a: memref<1024x1024xf16, 3>) { - //CHECK: xetile.init_tile {{.*}}[8, 16] : memref<1024x1024xf16, 3> -> !xetile.tile<32x64xf16, #xetile.tile_attr> - %1 = xetile.init_tile %a[8, 16] : memref<1024x1024xf16, 3> -> !xetile.tile<32x64xf16, #xetile.tile_attr> + //CHECK: xetile.init_tile {{.*}}[8, 16] : memref<1024x1024xf16, 3> -> !xetile.tile<32x64xf16, #xetile.tile_attr> + %1 = xetile.init_tile %a[8, 16] : memref<1024x1024xf16, 3> -> !xetile.tile<32x64xf16, #xetile.tile_attr> return } diff --git a/test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_f16_f32_slm.mlir b/test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_f16_f32_slm.mlir index 2bd13c234..0843b7485 100644 --- a/test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_f16_f32_slm.mlir +++ b/test/Dialect/XeTile/Transforms/Blocking/sg_gemm_1k_1k_1k_f16_f32_slm.mlir @@ -1,6 +1,6 @@ // RUN: imex-opt --xetile-init-duplicate --new-xetile-blocking --canonicalize --cse %s | FileCheck %s -#slm = #xetile.tile_attr +#slm = #xetile.tile_attr // CHECK-LABEL: gpu.module @test_kernel { gpu.module @test_kernel { @@ -26,8 +26,8 @@ gpu.module @test_kernel { %2 = xetile.init_tile %arg2[%0, %1] : memref<128x128xf32> -> !xetile.tile<8x16xf32> %3 = xetile.load_tile %2 {padding = 0.000000e+00 : f32} : !xetile.tile<8x16xf32> -> vector<8x16xf32> - //CHECK: %[[r5:.*]] = xetile.init_tile %[[arg0]][%[[r0]], %[[c0]]] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #xetile.tile_attr> - //CHECK: %[[r6:.*]] = xetile.init_tile %[[arg1]][%[[c0]], %[[r1]]] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #xetile.tile_attr> + //CHECK: %[[r5:.*]] = xetile.init_tile %[[arg0]][%[[r0]], %[[c0]]] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #xetile.tile_attr> + //CHECK: %[[r6:.*]] = xetile.init_tile %[[arg1]][%[[c0]], %[[r1]]] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #xetile.tile_attr> %4 = xetile.init_tile %arg0[%0, %c0] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #slm> %5 = xetile.init_tile %arg1[%c0, %1] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #slm> %6:3 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) diff --git a/test/Dialect/XeTile/Transforms/sg_gemm_1k_1k_1k_f16_f32_slm.mlir b/test/Dialect/XeTile/Transforms/sg_gemm_1k_1k_1k_f16_f32_slm.mlir index b2270afb1..91ae5b63c 100644 --- a/test/Dialect/XeTile/Transforms/sg_gemm_1k_1k_1k_f16_f32_slm.mlir +++ b/test/Dialect/XeTile/Transforms/sg_gemm_1k_1k_1k_f16_f32_slm.mlir @@ -1,6 +1,6 @@ // RUN: imex-opt --xetile-init-duplicate --xetile-blocking --canonicalize --cse %s | FileCheck %s -#slm = #xetile.tile_attr +#slm = #xetile.tile_attr // CHECK-LABEL: gpu.module @test_kernel { gpu.module @test_kernel { @@ -26,8 +26,8 @@ gpu.module @test_kernel { %2 = xetile.init_tile %arg2[%0, %1] : memref<128x128xf32> -> 
!xetile.tile<8x16xf32> %3 = xetile.load_tile %2 {padding = 0.000000e+00 : f32} : !xetile.tile<8x16xf32> -> vector<8x16xf32> - //CHECK: %[[r5:.*]] = xetile.init_tile %[[arg0]][%[[r0]], %[[c0]]] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #xetile.tile_attr> - //CHECK: %[[r6:.*]] = xetile.init_tile %[[arg1]][%[[c0]], %[[r1]]] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #xetile.tile_attr> + //CHECK: %[[r5:.*]] = xetile.init_tile %[[arg0]][%[[r0]], %[[c0]]] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #xetile.tile_attr> + //CHECK: %[[r6:.*]] = xetile.init_tile %[[arg1]][%[[c0]], %[[r1]]] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #xetile.tile_attr> %4 = xetile.init_tile %arg0[%0, %c0] : memref<128x128xf16, 3> -> !xetile.tile<8x16xf16, #slm> %5 = xetile.init_tile %arg1[%c0, %1] : memref<128x128xf16, 3> -> !xetile.tile<16x16xf16, #slm> %6:3 = scf.for %arg3 = %c0 to %c128 step %c16 iter_args(%arg4 = %4, %arg5 = %5, %arg6 = %3) diff --git a/test/Dialect/XeTile/Transforms/wg_to_sg_btranspose.mlir b/test/Dialect/XeTile/Transforms/wg_to_sg_btranspose.mlir index b05102cb1..312a07dc0 100644 --- a/test/Dialect/XeTile/Transforms/wg_to_sg_btranspose.mlir +++ b/test/Dialect/XeTile/Transforms/wg_to_sg_btranspose.mlir @@ -51,11 +51,11 @@ gpu.module @test_gemm_btranspose{ %4 = arith.muli %block_id_x, %c2048 : index %5 = arith.muli %0, %c256 : index %6 = arith.addi %4, %5 : index - %7 = xetile.init_tile %arg2[%6, %3] : memref<16384x1536xf32> -> !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> + %7 = xetile.init_tile %arg2[%6, %3] : memref<16384x1536xf32> -> !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> %8 = arith.muli %block_id_x, %c2048 : index %9 = arith.muli %0, %c256 : index %10 = arith.addi %8, %9 : index - %11 = xetile.init_tile %arg0[%10, %c0] : memref<16384x12288xf16> -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> + %11 = xetile.init_tile %arg0[%10, %c0] : memref<16384x12288xf16> -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> //CHECK: %[[R7:.*]] = index.floordivs %[[R6]], %[[c8]] //CHECK: %[[R8:.*]] = index.remu %[[R6]], %[[c8]] @@ -69,16 +69,16 @@ gpu.module @test_gemm_btranspose{ //CHECK: %[[R16:.*]] = index.add %[[R15]], %[[c0]] //CHECK: %[[INITTILE:.*]] = xetile.init_tile %[[arg1]][%[[R12]], %[[R16]]] : memref<1536x12288xf16> -> !xetile.tile<64x32xf16> - %12 = xetile.init_tile %arg1[%2, %c0] : memref<1536x12288xf16> -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> - %13:2 = scf.for %arg15 = %c0 to %c2 step %c1_1 iter_args(%arg16 = %7, %arg17 = %11) -> (!xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>) { - %14 = xetile.update_tile_offset %arg17, [%c1024, %c0] : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, index, index -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> - %15 = xetile.update_tile_offset %arg16, [%c1024, %c0] : !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, index, index -> !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> - %16:3 = scf.for %arg18 = %c0 to %c12288 step %c32_2 iter_args(%arg19 = %cst, %arg20 = %arg17, %arg21 = %12) -> (vector<256x256xf32>, 
!xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>) { - %18 = xetile.update_tile_offset %arg21, [%c0, %c32_2] : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, index, index -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> - %19 = xetile.update_tile_offset %arg20, [%c0, %c32_2] : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, index, index -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> - %20 = xetile.load_tile %arg20 {padding = 0.000000e+00 : f32} : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> -> vector<256x32xf16> + %12 = xetile.init_tile %arg1[%2, %c0] : memref<1536x12288xf16> -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> + %13:2 = scf.for %arg15 = %c0 to %c2 step %c1_1 iter_args(%arg16 = %7, %arg17 = %11) -> (!xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>) { + %14 = xetile.update_tile_offset %arg17, [%c1024, %c0] : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, index, index -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> + %15 = xetile.update_tile_offset %arg16, [%c1024, %c0] : !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, index, index -> !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> + %16:3 = scf.for %arg18 = %c0 to %c12288 step %c32_2 iter_args(%arg19 = %cst, %arg20 = %arg17, %arg21 = %12) -> (vector<256x256xf32>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>) { + %18 = xetile.update_tile_offset %arg21, [%c0, %c32_2] : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, index, index -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> + %19 = xetile.update_tile_offset %arg20, [%c0, %c32_2] : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, index, index -> !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> + %20 = xetile.load_tile %arg20 {padding = 0.000000e+00 : f32} : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> -> vector<256x32xf16> %21 = math.exp %20 {map = #xetile.wg_map} : vector<256x32xf16> - %22 = xetile.load_tile %arg21 {padding = 0.000000e+00 : f32} : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> -> vector<256x32xf16> + %22 = xetile.load_tile %arg21 {padding = 0.000000e+00 : f32} : !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> -> vector<256x32xf16> //CHECK: %[[TRANSPOSE:.*]] vector.transpose {{%.*}}, [1, 0] : vector<64x32xf16> to vector<32x64xf16> %23 = vector.transpose %22, [1, 0] {map = #xetile.wg_map} : vector<256x32xf16> to vector<32x256xf16> %24 = math.exp %23 {map = #xetile.wg_map} : vector<32x256xf16> @@ -86,11 +86,11 @@ gpu.module @test_gemm_btranspose{ %25 = xetile.tile_mma %21, %24, %cst 
{wg_map_a =#xetile.wg_map, wg_map_b =#xetile.wg_map, wg_map_c =#xetile.wg_map} : vector<256x32xf16>, vector<32x256xf16>, vector<256x256xf32> -> vector<256x256xf32> xegpu.compile_hint %26 = arith.addf %arg19, %25 {map = #xetile.wg_map} : vector<256x256xf32> - scf.yield %26, %19, %18 : vector<256x256xf32>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> + scf.yield %26, %19, %18 : vector<256x256xf32>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> } %17 = math.exp %16#0 {map = #xetile.wg_map} : vector<256x256xf32> - xetile.store_tile %17, %arg16 : vector<256x256xf32>, !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> - scf.yield %15, %14 : !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_scope = 0 : i32>> + xetile.store_tile %17, %arg16 : vector<256x256xf32>, !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> + scf.yield %15, %14 : !xetile.tile<256x256xf32, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>>, !xetile.tile<256x32xf16, #xetile.tile_attr, inner_blocks = [], memory_space = 0 : i32>> } gpu.terminator } diff --git a/test/Dialect/XeTile/Transforms/wg_to_sg_gemm_postop.mlir b/test/Dialect/XeTile/Transforms/wg_to_sg_gemm_postop.mlir index 888bb0bad..ac2f1261a 100644 --- a/test/Dialect/XeTile/Transforms/wg_to_sg_gemm_postop.mlir +++ b/test/Dialect/XeTile/Transforms/wg_to_sg_gemm_postop.mlir @@ -1,11 +1,11 @@ // RUN: imex-opt --split-input-file --xetile-wg-to-sg --cse %s -verify-diagnostics | FileCheck %s #wg_map_a = #xetile.wg_map -#tile_attr_a = #xetile.tile_attr +#tile_attr_a = #xetile.tile_attr #wg_map_b = #xetile.wg_map -#tile_attr_b = #xetile.tile_attr +#tile_attr_b = #xetile.tile_attr #wg_map_c = #xetile.wg_map -#tile_attr_c = #xetile.tile_attr +#tile_attr_c = #xetile.tile_attr #map = affine_map<() -> (0)> #map1 = affine_map<() -> (12288)> diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_4_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_4_f32.mlir index 2309e13f5..098eb8928 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_4_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_4_f32.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test(%arg0: memref<16x4xf32>) -> memref<16x4xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index @@ -23,11 +23,13 @@ module @gemm attributes {gpu.container_module} { gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_copy(%a: memref<16x4xf32>, %b: memref<16x4xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 
44, 48, 52, 56, 60]> : vector<16xindex> // load from a using load_gather %a_cast = memref.reinterpret_cast %a to offset: [0], sizes: [64], strides: [1] : memref<16x4xf32> to memref<64xf32> - %a_tdesc = xegpu.create_tdesc %a_cast[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60] : memref<64xf32> -> !xegpu.tensor_desc<16x4xf32, #scatter> + %a_tdesc = xegpu.create_tdesc %a_cast, %offsets : memref<64xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #scatter> xegpu.prefetch %a_tdesc : !xegpu.tensor_desc<16x4xf32, #scatter> %data = xegpu.load %a_tdesc, %mask {transpose} : !xegpu.tensor_desc<16x4xf32, #scatter>, vector<16xi1> -> vector<4x16xf32> @@ -38,7 +40,7 @@ module @gemm attributes {gpu.container_module} { // store to b using store_scatter %b_cast = memref.reinterpret_cast %b to offset: [0], sizes: [64], strides: [1] : memref<16x4xf32> to memref<64xf32> - %b_tdesc = xegpu.create_tdesc %b_cast[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60] : memref<64xf32> -> !xegpu.tensor_desc<16x4xf32, #scatter> + %b_tdesc = xegpu.create_tdesc %b_cast, %offsets : memref<64xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #scatter> xegpu.store %data, %b_tdesc, %mask {transpose} : vector<4x16xf32>, !xegpu.tensor_desc<16x4xf32, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_8_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_8_f32.mlir index 748b24ab4..3fbd1b227 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_8_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_chunk_8_f32.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test(%arg0: memref<16x8xf32>) -> memref<16x8xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index @@ -23,11 +23,13 @@ module @gemm attributes {gpu.container_module} { gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_copy(%a: memref<16x8xf32>, %b: memref<16x8xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { + %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> + %offsets = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex> // load from a using load_gather %a_cast = memref.reinterpret_cast %a to offset: [0], sizes: [128], strides: [1] : memref<16x8xf32> to memref<128xf32> - %a_tdesc = xegpu.create_tdesc %a_cast[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32> -> !xegpu.tensor_desc<16x8xf32, #scatter> + %a_tdesc = xegpu.create_tdesc %a_cast, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #scatter> xegpu.prefetch %a_tdesc : !xegpu.tensor_desc<16x8xf32, #scatter> %data = xegpu.load %a_tdesc, %mask {transpose} : !xegpu.tensor_desc<16x8xf32, #scatter>, vector<16xi1> -> vector<8x16xf32> @@ -38,7 +40,7 @@ module @gemm attributes {gpu.container_module} { // store to b using store_scatter %b_cast = memref.reinterpret_cast %b to offset: [0], sizes: [128], strides: [1] : memref<16x8xf32> to memref<128xf32> - %b_tdesc 
= xegpu.create_tdesc %b_cast[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32> -> !xegpu.tensor_desc<16x8xf32, #scatter> + %b_tdesc = xegpu.create_tdesc %b_cast, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #scatter> xegpu.store %data, %b_tdesc, %mask {transpose} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f16.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f16.mlir index f4edaad18..bd699c38d 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f16.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f16.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test(%arg0: memref<16xf16>) -> memref<16xf16> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index @@ -25,16 +25,17 @@ module @gemm attributes {gpu.container_module} { gpu.func @test_copy(%a: memref<16xf16>, %b: memref<16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> // load from a using load_gather - %a_tdesc = xegpu.create_tdesc %a[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + %a_tdesc = xegpu.create_tdesc %a, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> %data = xegpu.load %a_tdesc, %mask : !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> -> vector<16xf16> // %v1 = vector.extract %data[4]: f16 from vector<16xf16> // gpu.printf "\ndata[4] : %f.\n" %v1: f16 // store to b using store_scatter - %b_tdesc = xegpu.create_tdesc %b[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + %b_tdesc = xegpu.create_tdesc %b, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> xegpu.store %data, %b_tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f32.mlir index 7cffad49b..0516e79a6 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/load_global_no_chunk_f32.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test(%arg0: memref<16xf32>) -> memref<16xf32> attributes {llvm.emit_c_interface} { %c1 = arith.constant 1 : index @@ -28,12 +28,12 @@ module @gemm attributes {gpu.container_module} { %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : 
vector<16xindex> // load from a using load_gather - %a_tdesc = xegpu.create_tdesc %a[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + %a_tdesc = xegpu.create_tdesc %a, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> xegpu.prefetch %a_tdesc : !xegpu.tensor_desc<16xf32, #scatter> %data = xegpu.load %a_tdesc, %mask : !xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> -> vector<16xf32> // store to b using store_scatter - %b_tdesc = xegpu.create_tdesc %b[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + %b_tdesc = xegpu.create_tdesc %b, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> xegpu.store %data, %b_tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_4_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_4_f32.mlir index 282c41306..012be6c1d 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_4_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_4_f32.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16x4xf32> attributes {llvm.emit_c_interface} { @@ -23,8 +23,9 @@ module @gemm attributes {gpu.container_module} { [48., 49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63.]]> : vector<4x16xf32> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xindex> %cast = memref.reinterpret_cast %mem to offset: [0], sizes: [64], strides: [1] : memref<16x4xf32> to memref<64xf32> - %5 = xegpu.create_tdesc %cast[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60] : memref<64xf32> -> !xegpu.tensor_desc<16x4xf32, #scatter> + %5 = xegpu.create_tdesc %cast, %offsets : memref<64xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #scatter> xegpu.store %cst, %5, %mask {transpose} : vector<4x16xf32>, !xegpu.tensor_desc<16x4xf32, #scatter>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_8_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_8_f32.mlir index b614e21e5..f0241bfb0 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_8_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_chunk_8_f32.mlir @@ -6,7 +6,7 @@ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16x8xf32> attributes {llvm.emit_c_interface} { @@ -28,9 +28,10 @@ module @gemm attributes {gpu.container_module} { [112., 113., 114., 115., 116., 117., 118., 119., 120., 121., 122., 123., 124., 125., 126., 127.]]> : vector<8x16xf32> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 
88, 96, 104, 112, 120]> : vector<16xindex> %cast = memref.reinterpret_cast %mem to offset: [0], sizes: [128], strides: [1] : memref<16x8xf32> to memref<128xf32> - %5 = xegpu.create_tdesc %cast[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32> -> !xegpu.tensor_desc<16x8xf32, #scatter> + %5 = xegpu.create_tdesc %cast, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #scatter> xegpu.store %cst, %5, %mask {transpose} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #scatter>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f16.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f16.mlir index 000547969..932f113e1 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f16.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f16.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16xf16> attributes {llvm.emit_c_interface} { @@ -20,7 +20,8 @@ module @gemm attributes {gpu.container_module} { %cst = arith.constant dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]> : vector<16xf16> %mask = arith.constant dense<1> : vector<16xi1> - %tdesc = xegpu.create_tdesc %mem[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #scatter> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> + %tdesc = xegpu.create_tdesc %mem, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #scatter> xegpu.store %cst, %tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #scatter>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f32.mlir index e83027672..f1508f56c 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_global/store_global_no_chunk_f32.mlir @@ -5,7 +5,7 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#scatter = #xegpu.scatter_tdesc_attr +#scatter = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16xf32> attributes {llvm.emit_c_interface} { @@ -20,7 +20,8 @@ module @gemm attributes {gpu.container_module} { %cst = arith.constant dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]> : vector<16xf32> %mask = arith.constant dense<1> : vector<16xi1> - %tdesc = xegpu.create_tdesc %mem[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #scatter> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> + %tdesc = xegpu.create_tdesc %mem, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #scatter> xegpu.store %cst, %tdesc, %mask : vector<16xf32>, 
!xegpu.tensor_desc<16xf32, #scatter>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_4_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_4_f32.mlir index 11f8ff90f..2a354496e 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_4_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_4_f32.mlir @@ -5,8 +5,8 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16x4xf32> attributes {llvm.emit_c_interface} { @@ -24,17 +24,18 @@ module @gemm attributes {gpu.container_module} { [48., 49., 50., 51., 52., 53., 54., 55., 56., 57., 58., 59., 60., 61., 62., 63.]]> : vector<4x16xf32> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60]> : vector<16xindex> // store the cst into slm and load it back; %slm = memref.alloc() : memref<64xf32, 3> - %slm_tdesc = xegpu.create_tdesc %slm[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60] : memref<64xf32, 3> -> !xegpu.tensor_desc<16x4xf32, #slm> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<64xf32, 3>, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #slm> xegpu.store %cst, %slm_tdesc, %mask {transpose} : vector<4x16xf32>, !xegpu.tensor_desc<16x4xf32, #slm>, vector<16xi1> // load from slm %data = xegpu.load %slm_tdesc, %mask {transpose} : !xegpu.tensor_desc<16x4xf32, #slm>, vector<16xi1> -> vector<4x16xf32> // store data to global memory %cast = memref.reinterpret_cast %mem to offset: [0], sizes: [64], strides: [1] : memref<16x4xf32> to memref<64xf32> - %5 = xegpu.create_tdesc %cast[0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60] : memref<64xf32> -> !xegpu.tensor_desc<16x4xf32, #global> + %5 = xegpu.create_tdesc %cast, %offsets : memref<64xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x4xf32, #global> xegpu.store %data, %5, %mask {transpose} : vector<4x16xf32>, !xegpu.tensor_desc<16x4xf32, #global>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32.mlir index 56c7aa516..bc5bf1708 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32.mlir @@ -6,8 +6,8 @@ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16x8xf32> attributes {llvm.emit_c_interface} { @@ -30,9 +30,10 @@ module @gemm attributes {gpu.container_module} { [112., 113., 114., 115., 116., 117., 118., 119., 120., 121., 122., 123., 124., 125., 126., 127.]]> : vector<8x16xf32> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex> // store the cst 
into slm - %slm_tdesc = xegpu.create_tdesc %slm[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32, 3> -> !xegpu.tensor_desc<16x8xf32, #slm> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<128xf32, 3>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #slm> xegpu.store %cst, %slm_tdesc, %mask {transpose} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #slm>, vector<16xi1> // load from slm @@ -40,7 +41,7 @@ module @gemm attributes {gpu.container_module} { // store data to global memory %cast = memref.reinterpret_cast %mem to offset: [0], sizes: [128], strides: [1] : memref<16x8xf32> to memref<128xf32> - %5 = xegpu.create_tdesc %cast[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32> -> !xegpu.tensor_desc<16x8xf32, #global> + %5 = xegpu.create_tdesc %cast, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #global> xegpu.store %data, %5, %mask {transpose} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #global>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32_mask.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32_mask.mlir index 1e8edb13a..ce5e0f521 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32_mask.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_chunk_8_f32_mask.mlir @@ -6,8 +6,8 @@ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16x8xf32> attributes {llvm.emit_c_interface} { @@ -33,7 +33,7 @@ module @gemm attributes {gpu.container_module} { %offsets = arith.constant dense<[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120]> : vector<16xindex> // store the cst into slm - %slm_tdesc = xegpu.create_tdesc %slm[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32, 3> -> !xegpu.tensor_desc<16x8xf32, #slm> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<128xf32, 3>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #slm> xegpu.store %cst, %slm_tdesc, %mask {transpose} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #slm>, vector<16xi1> // load from slm @@ -41,7 +41,7 @@ module @gemm attributes {gpu.container_module} { // store data to global memory %cast = memref.reinterpret_cast %mem to offset: [0], sizes: [128], strides: [1] : memref<16x8xf32> to memref<128xf32> - %5 = xegpu.create_tdesc %cast[0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120] : memref<128xf32> -> !xegpu.tensor_desc<16x8xf32, #global> + %5 = xegpu.create_tdesc %cast, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x8xf32, #global> xegpu.store %data, %5, %mask {transpose} : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #global>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f16.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f16.mlir index ea000b14b..695157f95 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f16.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f16.mlir @@ -5,8 +5,8 @@ // RUN: --runner imex-cpu-runner -e main 
--entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16xf16> attributes {llvm.emit_c_interface} { @@ -20,15 +20,16 @@ module @gemm attributes {gpu.container_module} { gpu.func @test_store_scatter(%mem: memref<16xf16>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %cst = arith.constant dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]> : vector<16xf16> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> // store the cst into slm and load it back; %slm = memref.alloc() : memref<16xf16, 3> - %slm_tdesc = xegpu.create_tdesc %slm[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16, 3> -> !xegpu.tensor_desc<16xf16, #slm> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<16xf16, 3>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #slm> xegpu.store %cst, %slm_tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #slm>, vector<16xi1> %data = xegpu.load %slm_tdesc, %mask : !xegpu.tensor_desc<16xf16, #slm>, vector<16xi1> -> vector<16xf16> // store data to global memory - %tdesc = xegpu.create_tdesc %mem[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf16> -> !xegpu.tensor_desc<16xf16, #global> + %tdesc = xegpu.create_tdesc %mem, %offsets : memref<16xf16>, vector<16xindex> -> !xegpu.tensor_desc<16xf16, #global> xegpu.store %data, %tdesc, %mask : vector<16xf16>, !xegpu.tensor_desc<16xf16, #global>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f32.mlir b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f32.mlir index 2a10527fb..e75d5ee9d 100644 --- a/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f32.mlir +++ b/test/Integration/Dialect/XeGPU/gather_scatter_slm/store_load_slm_no_chunk_f32.mlir @@ -5,8 +5,8 @@ // RUN: --runner imex-cpu-runner -e main --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#global = #xegpu.scatter_tdesc_attr -#slm = #xegpu.scatter_tdesc_attr +#global = #xegpu.scatter_tdesc_attr +#slm = #xegpu.scatter_tdesc_attr module @gemm attributes {gpu.container_module} { func.func @test() -> memref<16xf32> attributes {llvm.emit_c_interface} { @@ -21,14 +21,15 @@ module @gemm attributes {gpu.container_module} { %cst = arith.constant dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0]> : vector<16xf32> %mask = arith.constant dense<1> : vector<16xi1> + %offsets = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex> // store the cst into slm and load it back; %slm = memref.alloc() : memref<16xf32, 3> - %slm_tdesc = xegpu.create_tdesc %slm[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32, 3> -> !xegpu.tensor_desc<16xf32, #slm> + %slm_tdesc = xegpu.create_tdesc %slm, %offsets : memref<16xf32, 3>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #slm> xegpu.store %cst, %slm_tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #slm>, vector<16xi1> %data = 
xegpu.load %slm_tdesc, %mask : !xegpu.tensor_desc<16xf32, #slm>, vector<16xi1> -> vector<16xf32> - %tdesc = xegpu.create_tdesc %mem[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #global> + %tdesc = xegpu.create_tdesc %mem, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #global> xegpu.store %cst, %tdesc, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #global>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/load1d-slm-f32.mlir b/test/Integration/Dialect/XeGPU/load1d-slm-f32.mlir index 7ac7761e5..353a233ed 100644 --- a/test/Integration/Dialect/XeGPU/load1d-slm-f32.mlir +++ b/test/Integration/Dialect/XeGPU/load1d-slm-f32.mlir @@ -7,7 +7,7 @@ // RUN: --entry-point-result=void \ // RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck -#slm = #xegpu.block_tdesc_attr +#slm = #xegpu.block_tdesc_attr module @gemm attributes {gpu.container_module} { memref.global "private" constant @__constant_8x16xf32 : memref<32xf32> = dense<[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0]> diff --git a/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir index 34d8183fd..c082c7c3c 100644 --- a/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather2d_masked_f32.mlir @@ -43,20 +43,21 @@ module @gemm attributes {gpu.container_module} { // Spirv has no lowering for memref.reinterpret_cast with different sizes (doesn't work: memref<3x16xf32> to memref<16xf32>) // Each row has a tdesc with offsets that determine linearized memref's values to be loaded - %row_1_in_td = xegpu.create_tdesc %arg0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %row_1_out_td = xegpu.create_tdesc %arg1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %offsets_row1 = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> + %row_1_in_td = xegpu.create_tdesc %arg0, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_1_out_td = xegpu.create_tdesc %arg1, %offsets_row1 : memref, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %row_1_loaded = xegpu.load %row_1_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %row_1_store = arith.select %row_mask, %row_1_loaded, %user_val : vector<16xi1>, vector<16xf32> xegpu.store %row_1_store, %row_1_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> - %row_2_in_td = xegpu.update_offset %row_1_in_td, [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %row_2_out_td = xegpu.update_offset %row_1_out_td, [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_2_in_td = xegpu.update_offset %row_1_in_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> + %row_2_out_td = xegpu.update_offset %row_1_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> 
%row_2_loaded = xegpu.load %row_2_in_td, %row_mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> %row_2_store = arith.select %row_mask, %row_2_loaded, %user_val : vector<16xi1>, vector<16xf32> xegpu.store %row_2_store, %row_2_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> // The entire row is out of bounds - %row_3_out_td = xegpu.update_offset %row_2_out_td, [16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %row_3_out_td = xegpu.update_offset %row_2_out_td, %offset_step : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xindex> xegpu.store %user_val, %row_3_out_td, %store_mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> gpu.return } diff --git a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir index 46abbae1f..ca2e4bba0 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_f32.mlir @@ -32,9 +32,10 @@ module @gemm attributes {gpu.container_module} { gpu.func @test_scattered(%in: memref, %out: memref) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { // We have 16 work items, each accesses 2 elements: {chunk_size = 2}, hence 16x2 tensor. // Valid offsets (%offsets for which %mask is 1) should not exceed 16*2=32. + %offsets = arith.constant dense<[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58]> : vector<16xindex> %mask = arith.constant dense<[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]> : vector<16xi1> - %tdesc_in = xegpu.create_tdesc %in[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58] : memref -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> - %tdesc_out = xegpu.create_tdesc %out[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58] : memref -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %tdesc_in = xegpu.create_tdesc %in, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> + %tdesc_out = xegpu.create_tdesc %out, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr> %loaded = xegpu.load %tdesc_in, %mask {transpose} : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<2x16xf32> xegpu.store %loaded, %tdesc_out, %mask {transpose} : vector<2x16xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir index bd8ca7111..638db7833 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_chunk_size_i32.mlir @@ -32,9 +32,10 @@ module @gemm attributes {gpu.container_module} { gpu.func @test_scattered(%in: memref, %out: memref) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { // We have 16 work items, each accesses 2 elements: {chunk_size = 2}, hence 16x2 tensor. // Valid offsets (%offsets for which %mask is 1) should not exceed 16*2=32. 
+ %offsets = arith.constant dense<[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58]> : vector<16xindex> %mask = arith.constant dense<[1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]> : vector<16xi1> - %tdesc_in = xegpu.create_tdesc %in[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58] : memref -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> - %tdesc_out = xegpu.create_tdesc %out[0,4,8,12,16,20,24,28,32,34,38,42,46,50,54,58] : memref -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> + %tdesc_in = xegpu.create_tdesc %in, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> + %tdesc_out = xegpu.create_tdesc %out, %offsets : memref, vector<16xindex> -> !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr> %loaded = xegpu.load %tdesc_in, %mask {transpose} : !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16xi1> -> vector<2x16xi32> xegpu.store %loaded, %tdesc_out, %mask {transpose} : vector<2x16xi32>, !xegpu.tensor_desc<16x2xi32, #xegpu.scatter_tdesc_attr>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/loadgather_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_f32.mlir index 0eb253e82..88e2cbf8c 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_f32.mlir @@ -24,11 +24,12 @@ module @gemm attributes {gpu.container_module} { gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_scattered(%arg0: memref<1x16xf32>, %arg1: memref<1x16xf32>) kernel attributes {VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index + %offsets = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> %1 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> %2 = memref.reinterpret_cast %arg1 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> - %tdesc1 = xegpu.create_tdesc %1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %tdesc2 = xegpu.create_tdesc %2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir b/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir index daa6e9d12..349576169 100644 --- a/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir +++ b/test/Integration/Dialect/XeGPU/loadgather_masked_f32.mlir @@ -24,11 +24,12 @@ module @gemm attributes {gpu.container_module} { gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} { gpu.func @test_scattered(%arg0: memref<1x16xf32>, %arg1: memref<1x16xf32>) kernel attributes 
{VectorComputeFunctionINTEL, spirv.entry_point_abi = #spirv.entry_point_abi<>} { %c0 = arith.constant 0 : index + %offsets = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> %mask = arith.constant dense<[1,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1]> : vector<16xi1> %1 = memref.reinterpret_cast %arg0 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> %2 = memref.reinterpret_cast %arg1 to offset: [0], sizes: [16], strides: [1] : memref<1x16xf32> to memref<16xf32> - %tdesc1 = xegpu.create_tdesc %1[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> - %tdesc2 = xegpu.create_tdesc %2[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] : memref<16xf32> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %tdesc1 = xegpu.create_tdesc %1, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %tdesc2 = xegpu.create_tdesc %2, %offsets : memref<16xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> %loaded = xegpu.load %tdesc1, %mask : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> -> vector<16xf32> xegpu.store %loaded, %tdesc2, %mask : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1> gpu.return diff --git a/test/Integration/Dialect/XeGPU/optimize_transpose.mlir b/test/Integration/Dialect/XeGPU/optimize_transpose.mlir index 93327f688..8f8d61a67 100644 --- a/test/Integration/Dialect/XeGPU/optimize_transpose.mlir +++ b/test/Integration/Dialect/XeGPU/optimize_transpose.mlir @@ -34,34 +34,34 @@ module @gemm attributes {gpu.container_module} { %1 = arith.muli %block_id_y, %c32 : index %2 = arith.addi %0, %c0 : index %3 = arith.addi %1, %c0 : index - %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %5 = arith.addi %1, %c16 : index - %6 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %6 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c8 = arith.constant 8 : index %7 = arith.addi %0, %c8 : index - %8 = xegpu.create_nd_tdesc %arg2[%7, %3] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %9 = xegpu.create_nd_tdesc %arg2[%7, %5] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %10 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> - %11 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> - %12 = xegpu.load_nd %10 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> -> vector<16x16xf32> - %13 = xegpu.load_nd %11 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> -> vector<16x16xf32> - %14 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - %15 = xegpu.create_nd_tdesc %arg1[%3, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %16 = xegpu.create_nd_tdesc 
%arg1[%3, %c16] : memref<256x256xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %17:5 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %14, %arg5 = %15, %arg6 = %16, %arg7 = %12, %arg8 = %13) -> (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<16x16xf32>, vector<16x16xf32>) { + %8 = xegpu.create_nd_tdesc %arg2[%7, %3] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %9 = xegpu.create_nd_tdesc %arg2[%7, %5] : memref<256x256xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %10 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> + %11 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<256x256xf32> -> !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> + %12 = xegpu.load_nd %10 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> -> vector<16x16xf32> + %13 = xegpu.load_nd %11 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf32, #xegpu.block_tdesc_attr> -> vector<16x16xf32> + %14 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %15 = xegpu.create_nd_tdesc %arg1[%3, %c0] : memref<256x256xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %16 = xegpu.create_nd_tdesc %arg1[%3, %c16] : memref<256x256xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %17:5 = scf.for %arg3 = %c0 to %c256 step %c32 iter_args(%arg4 = %14, %arg5 = %15, %arg6 = %16, %arg7 = %12, %arg8 = %13) -> (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<16x16xf32>, vector<16x16xf32>) { %22 = vector.extract_strided_slice %arg7 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> %23 = vector.extract_strided_slice %arg7 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> %24 = vector.extract_strided_slice %arg8 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> %25 = vector.extract_strided_slice %arg8 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> - %26 = xegpu.load_nd %arg4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> + %26 = xegpu.load_nd %arg4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16xf16> %27 = vector.extract %26[0] : vector<16x16xf16> from vector<2x16x16xf16> %28 = vector.extract %26[1] : vector<16x16xf16> from vector<2x16x16xf16> %29 = vector.extract_strided_slice %27 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> %30 = vector.extract_strided_slice %27 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> %31 = vector.extract_strided_slice %28 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> %32 = vector.extract_strided_slice %28 {offsets = 
[8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf16> to vector<8x16xf16> - %33 = xegpu.load_nd %arg5 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> - %34 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> + %33 = xegpu.load_nd %arg5 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> + %34 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<32x16xf16> %35 = vector.transpose %33, [1, 0] : vector<32x16xf16> to vector<16x32xf16> %36 = vector.shape_cast %35 {packed} : vector<16x32xf16> to vector<512xf16> %37 = vector.shuffle %36, %36 [0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63, 64, 96, 65, 97, 66, 98, 67, 99, 68, 100, 69, 101, 70, 102, 71, 103, 72, 104, 73, 105, 74, 106, 75, 107, 76, 108, 77, 109, 78, 110, 79, 111, 80, 112, 81, 113, 82, 114, 83, 115, 84, 116, 85, 117, 86, 118, 87, 119, 88, 120, 89, 121, 90, 122, 91, 123, 92, 124, 93, 125, 94, 126, 95, 127, 128, 160, 129, 161, 130, 162, 131, 163, 132, 164, 133, 165, 134, 166, 135, 167, 136, 168, 137, 169, 138, 170, 139, 171, 140, 172, 141, 173, 142, 174, 143, 175, 144, 176, 145, 177, 146, 178, 147, 179, 148, 180, 149, 181, 150, 182, 151, 183, 152, 184, 153, 185, 154, 186, 155, 187, 156, 188, 157, 189, 158, 190, 159, 191, 192, 224, 193, 225, 194, 226, 195, 227, 196, 228, 197, 229, 198, 230, 199, 231, 200, 232, 201, 233, 202, 234, 203, 235, 204, 236, 205, 237, 206, 238, 207, 239, 208, 240, 209, 241, 210, 242, 211, 243, 212, 244, 213, 245, 214, 246, 215, 247, 216, 248, 217, 249, 218, 250, 219, 251, 220, 252, 221, 253, 222, 254, 223, 255, 256, 288, 257, 289, 258, 290, 259, 291, 260, 292, 261, 293, 262, 294, 263, 295, 264, 296, 265, 297, 266, 298, 267, 299, 268, 300, 269, 301, 270, 302, 271, 303, 272, 304, 273, 305, 274, 306, 275, 307, 276, 308, 277, 309, 278, 310, 279, 311, 280, 312, 281, 313, 282, 314, 283, 315, 284, 316, 285, 317, 286, 318, 287, 319, 320, 352, 321, 353, 322, 354, 323, 355, 324, 356, 325, 357, 326, 358, 327, 359, 328, 360, 329, 361, 330, 362, 331, 363, 332, 364, 333, 365, 334, 366, 335, 367, 336, 368, 337, 369, 338, 370, 339, 371, 340, 372, 341, 373, 342, 374, 343, 375, 344, 376, 345, 377, 346, 378, 347, 379, 348, 380, 349, 381, 350, 382, 351, 383, 384, 416, 385, 417, 386, 418, 387, 419, 388, 420, 389, 421, 390, 422, 391, 423, 392, 424, 393, 425, 394, 426, 395, 427, 396, 428, 397, 429, 398, 430, 399, 431, 400, 432, 401, 433, 402, 434, 403, 435, 404, 436, 405, 437, 406, 438, 407, 439, 408, 440, 409, 441, 410, 442, 411, 443, 412, 444, 413, 445, 414, 446, 415, 447, 448, 480, 449, 481, 450, 482, 451, 483, 452, 484, 453, 485, 454, 486, 455, 487, 456, 488, 457, 489, 458, 490, 459, 491, 460, 492, 461, 493, 462, 494, 463, 495, 464, 496, 465, 497, 466, 498, 467, 499, 468, 500, 469, 501, 470, 502, 471, 503, 472, 504, 473, 505, 474, 506, 475, 507, 476, 508, 477, 509, 478, 510, 479, 511] {packed} : vector<512xf16>, vector<512xf16> @@ -84,19 
+84,19 @@ module @gemm attributes {gpu.container_module} { %54 = xegpu.dpas %32, %46, %53 : vector<8x16xf16>, vector<8x16x2xf16>, vector<8x16xf32> -> vector<8x16xf32> %55 = vector.shuffle %48, %52 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> %56 = vector.shuffle %50, %54 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> - %57 = xegpu.update_nd_offset %arg4, [%c0, %c32] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> - %58 = xegpu.update_nd_offset %arg5, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %59 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - scf.yield %57, %58, %59, %55, %56 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<16x16xf32>, vector<16x16xf32> + %57 = xegpu.update_nd_offset %arg4, [%c0, %c32] : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr> + %58 = xegpu.update_nd_offset %arg5, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %59 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + scf.yield %57, %58, %59, %55, %56 : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<16x16xf32>, vector<16x16xf32> } %18 = vector.extract_strided_slice %17#3 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> %19 = vector.extract_strided_slice %17#3 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> %20 = vector.extract_strided_slice %17#4 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> %21 = vector.extract_strided_slice %17#4 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16xf32> to vector<8x16xf32> - xegpu.store_nd %18, %4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %20, %6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %19, %8 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %21, %9 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %18, %4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %20, %6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %19, %8 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %21, %9 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, 
!xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> gpu.return } } diff --git a/test/SPIRV/OpTest.spirv.CL.printf.mlir b/test/SPIRV/OpTest.spirv.CL.printf.mlir index a7be187f6..b771a2726 100644 --- a/test/SPIRV/OpTest.spirv.CL.printf.mlir +++ b/test/SPIRV/OpTest.spirv.CL.printf.mlir @@ -45,7 +45,7 @@ module @print_simple attributes {gpu.container_module} { %printfMsg1_addr = spirv.mlir.addressof @printfMsg1 : !spirv.ptr, UniformConstant> %0 = spirv.Bitcast %printfMsg1_addr : !spirv.ptr, UniformConstant> to !spirv.ptr - %1 = spirv.CL.printf %0 : !spirv.ptr (%arg0, %arg1 : i32, f32) -> i32 + %1 = spirv.CL.printf %0 %arg0, %arg1 : !spirv.ptr, i32, f32 -> i32 spirv.Return } diff --git a/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir b/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir index 864aad766..464eb9507 100644 --- a/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir +++ b/test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir @@ -53,11 +53,11 @@ module { %26 = arith.muli %25, %c256 : index %27 = arith.divsi %15, %c32 : index %28 = arith.muli %27, %c32 : index - %29 = xegpu.create_nd_tdesc %arg0[%19, %15] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %29 = xegpu.create_nd_tdesc %arg0[%19, %15] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %30 = arith.divsi %23, %c32 : index %31 = arith.muli %30, %c32 : index %32 = arith.addi %26, %2 : index - %33 = xegpu.create_nd_tdesc %arg0[%32, %28] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %33 = xegpu.create_nd_tdesc %arg0[%32, %28] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> %34 = arith.remsi %11, %c4 : index %35 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %cst) -> (vector<8x1xf32>) { %39 = vector.shape_cast %arg4 : vector<8x1xf32> to vector<8xf32> @@ -75,29 +75,29 @@ module { %50 = arith.addi %49, %24 : index %51 = arith.divsi %50, %c128 : index %52 = arith.muli %51, %c128 : index - %53 = xegpu.create_nd_tdesc %arg1[%50, %23] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %33 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %54 = xegpu.update_nd_offset %33, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %54 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %55 = xegpu.update_nd_offset %54, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %55 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %56 = xegpu.update_nd_offset %55, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %53 = xegpu.create_nd_tdesc %arg1[%50, %23] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %33 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %54 = xegpu.update_nd_offset %33, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %54 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : 
!xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %55 = xegpu.update_nd_offset %54, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %55 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %56 = xegpu.update_nd_offset %55, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> %57 = arith.addi %52, %3 : index - %58 = xegpu.create_nd_tdesc %arg1[%57, %31] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %58 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %59 = xegpu.update_nd_offset %58, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %59 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %60 = xegpu.update_nd_offset %59, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %60 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %61 = xegpu.update_nd_offset %60, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %62:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %29, %arg7 = %53, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %56, %arg17 = %61, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index) { + %58 = xegpu.create_nd_tdesc %arg1[%57, %31] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %58 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %59 = xegpu.update_nd_offset %58, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %59 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %60 = xegpu.update_nd_offset %59, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %60 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %61 = xegpu.update_nd_offset %60, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %62:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %29, %arg7 = %53, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %56, %arg17 = %61, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, 
vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index) { %391 = arith.cmpi eq, %arg18, %c21 : index %392 = arith.select %391, %c0, %arg18 : index scf.if %391 { gpu.barrier } %393 = arith.addi %392, %c1 : index - %394 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> + %394 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> %395 = vector.shape_cast %394 : vector<2x32x16xbf16> to vector<1024xbf16> %396 = vector.shuffle %395, %395 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> %397 = vector.shuffle %395, %395 [512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 
558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023] : vector<1024xbf16>, vector<1024xbf16> @@ -117,7 +117,7 @@ module { %411 = vector.shape_cast %410 : vector<128xbf16> to vector<8x16xbf16> %412 = vector.shuffle %397, %397 [384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<512xbf16>, vector<512xbf16> %413 = vector.shape_cast %412 : vector<128xbf16> to vector<8x16xbf16> - %414 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> + %414 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = 
#xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> %415 = vector.shape_cast %414 : vector<2x16x16x2xbf16> to vector<1024xbf16> %416 = vector.shuffle %415, %415 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> %417 = vector.shuffle %415, %415 [512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 
668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702, 703, 704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761, 762, 763, 764, 765, 766, 767, 768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793, 794, 795, 796, 797, 798, 799, 800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825, 826, 827, 828, 829, 830, 831, 832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, 860, 861, 862, 863, 864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889, 890, 891, 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023] : vector<1024xbf16>, vector<1024xbf16> @@ -130,15 +130,15 @@ module { %424 = vector.shuffle %417, %417 [256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<512xbf16>, vector<512xbf16> %425 = vector.shape_cast %424 : vector<256xbf16> to vector<8x16x2xbf16> xegpu.compile_hint - xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, 
l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint - %426 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %427 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %426 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %427 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint xegpu.compile_hint - %428 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> - %429 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %428 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %429 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint %430 = xegpu.dpas %399, %419, %arg8 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %431 = xegpu.dpas %407, %421, %430 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> @@ -157,7 +157,7 @@ module { %444 = xegpu.dpas %405, %423, %arg15 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %445 = xegpu.dpas %413, %425, %444 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> xegpu.compile_hint - scf.yield %428, %429, %431, %433, %435, %437, %439, %441, %443, %445, %426, %427, %393 : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index + scf.yield %428, %429, %431, %433, %435, %437, %439, %441, %443, %445, %426, %427, %393 : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index } %63 = vector.shape_cast %62#2 : vector<8x16xf32> to vector<128xf32> %64 = vector.shape_cast %62#3 : vector<8x16xf32> to vector<128xf32> @@ -419,13 +419,13 @@ module { %320 = arith.addf %318, %319 : vector<16xf32> %321 = vector.shuffle %317, %320 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16xf32>, vector<16xf32> %alloc = memref.alloc() : memref<256x4xf32, #spirv.storage_class> - %322 = xegpu.create_nd_tdesc %alloc[%13, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %322 = xegpu.create_nd_tdesc %alloc[%13, %34] : memref<256x4xf32, #spirv.storage_class> 
-> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %323 = arith.addi %13, %c8 : index - %324 = xegpu.create_nd_tdesc %alloc[%323, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %324 = xegpu.create_nd_tdesc %alloc[%323, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %325 = arith.addi %13, %c16 : index - %326 = xegpu.create_nd_tdesc %alloc[%325, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %326 = xegpu.create_nd_tdesc %alloc[%325, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %327 = arith.addi %13, %c24 : index - %328 = xegpu.create_nd_tdesc %alloc[%327, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %328 = xegpu.create_nd_tdesc %alloc[%327, %34] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %329 = vector.shuffle %321, %321 [0, 1, 2, 3, 4, 5, 6, 7] : vector<32xf32>, vector<32xf32> %330 = vector.shape_cast %329 : vector<8xf32> to vector<8x1xf32> %331 = vector.shuffle %321, %321 [8, 9, 10, 11, 12, 13, 14, 15] : vector<32xf32>, vector<32xf32> @@ -434,13 +434,13 @@ module { %334 = vector.shape_cast %333 : vector<8xf32> to vector<8x1xf32> %335 = vector.shuffle %321, %321 [24, 25, 26, 27, 28, 29, 30, 31] : vector<32xf32>, vector<32xf32> %336 = vector.shape_cast %335 : vector<8xf32> to vector<8x1xf32> - xegpu.store_nd %330, %322 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %332, %324 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %334, %326 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %336, %328 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %330, %322 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %332, %324 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %334, %326 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %336, %328 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> gpu.barrier - %337 = xegpu.create_nd_tdesc %alloc[%9, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> - %338 = xegpu.load_nd %337 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32> + %337 = xegpu.create_nd_tdesc %alloc[%9, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, 
#xegpu.block_tdesc_attr> + %338 = xegpu.load_nd %337 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32> %339 = vector.shape_cast %338 : vector<8x4xf32> to vector<32xf32> %340 = vector.shuffle %339, %339 [0, 1, 2, 3] : vector<32xf32>, vector<32xf32> %341 = vector.shuffle %339, %339 [4, 5, 6, 7] : vector<32xf32>, vector<32xf32> @@ -499,8 +499,8 @@ module { } %36 = arith.addi %16, %18 : index %37 = arith.addi %36, %9 : index - %38 = xegpu.create_nd_tdesc %arg2[%37, %7] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %35, %38 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %38 = xegpu.create_nd_tdesc %arg2[%37, %7] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %35, %38 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> gpu.return } } diff --git a/test/Transforms/VectorLinearize/postop_reduce_n.mlir b/test/Transforms/VectorLinearize/postop_reduce_n.mlir index 09f28d414..506eb36f4 100644 --- a/test/Transforms/VectorLinearize/postop_reduce_n.mlir +++ b/test/Transforms/VectorLinearize/postop_reduce_n.mlir @@ -56,13 +56,13 @@ module { %28 = arith.muli %27, %c32 : index %29 = arith.addi %19, %c0 : index %30 = arith.addi %15, %c0 : index - %31 = xegpu.create_nd_tdesc %arg0[%29, %30] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %31 = xegpu.create_nd_tdesc %arg0[%29, %30] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> %32 = arith.divsi %23, %c32 : index %33 = arith.muli %32, %c32 : index %34 = arith.addi %26, %2 : index %35 = arith.addi %34, %c0 : index %36 = arith.addi %28, %c0 : index - %37 = xegpu.create_nd_tdesc %arg0[%35, %36] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %37 = xegpu.create_nd_tdesc %arg0[%35, %36] : memref<16384x12288xbf16> -> !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> %38 = arith.remsi %11, %c4 : index %39 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %cst) -> (vector<8x1xf32>) { @@ -89,31 +89,31 @@ module { %57 = arith.muli %56, %c128 : index %58 = arith.addi %55, %c0 : index %59 = arith.addi %23, %c0 : index - %60 = xegpu.create_nd_tdesc %arg1[%58, %59] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %37 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %61 = xegpu.update_nd_offset %37, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %61 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %62 = xegpu.update_nd_offset %61, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %62 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %63 = xegpu.update_nd_offset %62, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %60 = xegpu.create_nd_tdesc %arg1[%58, %59] : 
memref<1536x12288xbf16> -> !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %37 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %61 = xegpu.update_nd_offset %37, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %61 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %62 = xegpu.update_nd_offset %61, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %62 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %63 = xegpu.update_nd_offset %62, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> %64 = arith.addi %57, %3 : index %65 = arith.addi %64, %c0 : index %66 = arith.addi %33, %c0 : index - %67 = xegpu.create_nd_tdesc %arg1[%65, %66] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %67 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %68 = xegpu.update_nd_offset %67, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %68 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %69 = xegpu.update_nd_offset %68, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %69 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %70 = xegpu.update_nd_offset %69, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> - %71:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %31, %arg7 = %60, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %63, %arg17 = %70, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index) { + %67 = xegpu.create_nd_tdesc %arg1[%65, %66] : memref<1536x12288xbf16> -> !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %67 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %68 = xegpu.update_nd_offset %67, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %68 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %69 = xegpu.update_nd_offset %68, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %69 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %70 = xegpu.update_nd_offset %69, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, 
#xegpu.block_tdesc_attr> + %71:13 = scf.for %arg5 = %c0 to %c12288 step %c32 iter_args(%arg6 = %31, %arg7 = %60, %arg8 = %cst_0, %arg9 = %cst_0, %arg10 = %cst_0, %arg11 = %cst_0, %arg12 = %cst_0, %arg13 = %cst_0, %arg14 = %cst_0, %arg15 = %cst_0, %arg16 = %63, %arg17 = %70, %arg18 = %c0) -> (!xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index) { %437 = arith.cmpi eq, %arg18, %c21 : index %438 = arith.select %437, %c0, %arg18 : index scf.if %437 { gpu.barrier } %439 = arith.addi %438, %c1 : index - %440 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> + %440 = xegpu.load_nd %arg6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xbf16> //CHECK: vector.shape_cast %{{.*}} : vector<2x32x16xbf16> to vector<1024xbf16> //CHECK: vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 
463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> @@ -144,7 +144,7 @@ module { %448 = vector.extract_strided_slice %442 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> %449 = vector.extract_strided_slice %442 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> %450 = vector.extract_strided_slice %442 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xbf16> to vector<8x16xbf16> - %451 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> + %451 = xegpu.load_nd %arg7 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xbf16> //CHECK: vector.shape_cast %{{.*}} : vector<2x16x16x2xbf16> to vector<1024xbf16> //CHECK: vector.shuffle %{{.*}}, %{{.*}} [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 
478, 479, 480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511] : vector<1024xbf16>, vector<1024xbf16> @@ -164,15 +164,15 @@ module { %456 = vector.extract_strided_slice %453 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xbf16> to vector<8x16x2xbf16> %457 = vector.extract_strided_slice %453 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<16x16x2xbf16> to vector<8x16x2xbf16> xegpu.compile_hint - xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %arg16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + xegpu.prefetch_nd %arg17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint - %458 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> - %459 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> + %458 = xegpu.update_nd_offset %arg16, [%c0, %c32] : !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr> + %459 = xegpu.update_nd_offset %arg17, [%c0, %c32] : !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint xegpu.compile_hint - %460 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> - %461 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %460 = xegpu.update_nd_offset %arg6, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> + %461 = xegpu.update_nd_offset %arg7, [%c0, %c32] : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr> xegpu.compile_hint %462 = xegpu.dpas %443, %454, %arg8 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %463 = xegpu.dpas %447, %455, %462 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> @@ -191,7 +191,7 @@ module { %476 = xegpu.dpas %446, %456, %arg15 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> %477 = xegpu.dpas %450, %457, %476 : vector<8x16xbf16>, vector<8x16x2xbf16>, vector<8x16xf32> -> vector<8x16xf32> xegpu.compile_hint - scf.yield %460, %461, %463, %465, %467, %469, %471, %473, %475, %477, %458, %459, %439 : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index + scf.yield %460, %461, %463, %465, %467, %469, %471, %473, %475, %477, %458, %459, %439 : !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xbf16, #xegpu.block_tdesc_attr>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<8x32xbf16, 
#xegpu.block_tdesc_attr>, !xegpu.tensor_desc<4x32xbf16, #xegpu.block_tdesc_attr>, index } //CHECK-COUNT-8: vector.shape_cast %{{.*}} : vector<8x16xf32> to vector<128xf32> @@ -489,27 +489,27 @@ module { %alloc = memref.alloc() : memref<256x4xf32, #spirv.storage_class> %359 = arith.addi %13, %c0 : index %360 = arith.addi %38, %c0 : index - %361 = xegpu.create_nd_tdesc %alloc[%359, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %361 = xegpu.create_nd_tdesc %alloc[%359, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %362 = arith.addi %13, %c8 : index - %363 = xegpu.create_nd_tdesc %alloc[%362, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %363 = xegpu.create_nd_tdesc %alloc[%362, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %c16 = arith.constant 16 : index %364 = arith.addi %13, %c16 : index - %365 = xegpu.create_nd_tdesc %alloc[%364, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %365 = xegpu.create_nd_tdesc %alloc[%364, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %c24 = arith.constant 24 : index %366 = arith.addi %13, %c24 : index - %367 = xegpu.create_nd_tdesc %alloc[%366, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %367 = xegpu.create_nd_tdesc %alloc[%366, %360] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> %368 = vector.extract_strided_slice %358 {offsets = [0, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32> %369 = vector.extract_strided_slice %358 {offsets = [8, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32> %370 = vector.extract_strided_slice %358 {offsets = [16, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32> %371 = vector.extract_strided_slice %358 {offsets = [24, 0], sizes = [8, 1], strides = [1, 1]} : vector<32x1xf32> to vector<8x1xf32> - xegpu.store_nd %368, %361 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %369, %363 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %370, %365 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %371, %367 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %368, %361 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %369, %363 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %370, %365 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> 
+ xegpu.store_nd %371, %367 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> gpu.barrier %372 = arith.addi %9, %c0 : index - %373 = xegpu.create_nd_tdesc %alloc[%372, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> - %374 = xegpu.load_nd %373 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32> + %373 = xegpu.create_nd_tdesc %alloc[%372, %c0] : memref<256x4xf32, #spirv.storage_class> -> !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> + %374 = xegpu.load_nd %373 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x4xf32, #xegpu.block_tdesc_attr> -> vector<8x4xf32> %375 = vector.extract_strided_slice %374 {offsets = [0, 0], sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32> %376 = vector.extract_strided_slice %374 {offsets = [1, 0], sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32> %377 = vector.extract_strided_slice %374 {offsets = [2, 0], sizes = [1, 4], strides = [1, 1]} : vector<8x4xf32> to vector<1x4xf32> @@ -591,8 +591,8 @@ module { %41 = arith.addi %40, %9 : index %42 = arith.addi %41, %c0 : index %43 = arith.addi %7, %c0 : index - %44 = xegpu.create_nd_tdesc %arg2[%42, %43] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %39, %44 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + %44 = xegpu.create_nd_tdesc %arg2[%42, %43] : memref<16384x4xf32> -> !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %39, %44 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x1xf32>, !xegpu.tensor_desc<8x1xf32, #xegpu.block_tdesc_attr> gpu.return } } diff --git a/test/Transforms/VnniTransform/gemm_with_extract.mlir b/test/Transforms/VnniTransform/gemm_with_extract.mlir index a1685f920..206486397 100644 --- a/test/Transforms/VnniTransform/gemm_with_extract.mlir +++ b/test/Transforms/VnniTransform/gemm_with_extract.mlir @@ -11,28 +11,28 @@ gpu.module @test_kernel { %1 = arith.muli %block_id_y, %c32 : index %2 = arith.addi %0, %c0 : index %3 = arith.addi %1, %c0 : index - %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c16 = arith.constant 16 : index %5 = arith.addi %1, %c16 : index - %6 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %6 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c8 = arith.constant 8 : index %7 = arith.addi %0, %c8 : index - %8 = xegpu.create_nd_tdesc %arg2[%7, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %9 = xegpu.create_nd_tdesc %arg2[%7, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %8 = xegpu.create_nd_tdesc %arg2[%7, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %9 = xegpu.create_nd_tdesc %arg2[%7, %5] : 
memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %10 = arith.addi %0, %c16 : index - %11 = xegpu.create_nd_tdesc %arg2[%10, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %12 = xegpu.create_nd_tdesc %arg2[%10, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %11 = xegpu.create_nd_tdesc %arg2[%10, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %12 = xegpu.create_nd_tdesc %arg2[%10, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c24 = arith.constant 24 : index %13 = arith.addi %0, %c24 : index - %14 = xegpu.create_nd_tdesc %arg2[%13, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %15 = xegpu.create_nd_tdesc %arg2[%13, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %16 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - %17 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - %18 = xegpu.load_nd %16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - %19 = xegpu.load_nd %17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - %20 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %21 = xegpu.create_nd_tdesc %arg1[%c0, %3] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %22:4 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %20, %arg5 = %21, %arg6 = %18, %arg7 = %19) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>) { + %14 = xegpu.create_nd_tdesc %arg2[%13, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %15 = xegpu.create_nd_tdesc %arg2[%13, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %16 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + %17 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + %18 = xegpu.load_nd %16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + %19 = xegpu.load_nd %17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + %20 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %21 = xegpu.create_nd_tdesc %arg1[%c0, %3] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %22:4 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %20, %arg5 = %21, %arg6 = %18, %arg7 = %19) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>) { %31 = vector.extract_strided_slice %arg6 
{offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %32 = vector.extract_strided_slice %arg6 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %33 = vector.extract_strided_slice %arg6 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -41,7 +41,7 @@ gpu.module @test_kernel { %36 = vector.extract_strided_slice %arg7 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %37 = vector.extract_strided_slice %arg7 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %38 = vector.extract_strided_slice %arg7 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - %39 = xegpu.load_nd %arg4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + %39 = xegpu.load_nd %arg4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %40 = vector.extract %39[0] : vector<32x16xf16> from vector<2x32x16xf16> %41 = vector.extract %39[1] : vector<32x16xf16> from vector<2x32x16xf16> %42 = vector.extract_strided_slice %40 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> @@ -53,8 +53,8 @@ gpu.module @test_kernel { %48 = vector.extract_strided_slice %41 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> %49 = vector.extract_strided_slice %41 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> - //CHECK: %[[R50:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> - %50 = xegpu.load_nd %arg5 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + //CHECK: %[[R50:.*]] = xegpu.load_nd %{{.*}} <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint, packed}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x16x16x2xf16> + %50 = xegpu.load_nd %arg5 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> //CHECK: %[[R51:.*]] = vector.extract %[[R50]][0] : vector<16x16x2xf16> from vector<2x16x16x2xf16> //CHECK: %[[R52:.*]] = vector.extract %[[R50]][1] : vector<16x16x2xf16> from vector<2x16x16x2xf16> @@ -117,9 +117,9 @@ gpu.module @test_kernel { %80 = vector.shuffle %64, %68 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> %81 = vector.shuffle %72, %76 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> %82 = vector.shuffle %80, %81 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16x16xf32>, vector<16x16xf32> - %83 = xegpu.update_nd_offset %arg4, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %84 = xegpu.update_nd_offset %arg5, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - scf.yield 
%83, %84, %79, %82 : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> + %83 = xegpu.update_nd_offset %arg4, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %84 = xegpu.update_nd_offset %arg5, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + scf.yield %83, %84, %79, %82 : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> } %23 = vector.extract_strided_slice %22#2 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %24 = vector.extract_strided_slice %22#2 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -129,14 +129,14 @@ gpu.module @test_kernel { %28 = vector.extract_strided_slice %22#3 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %29 = vector.extract_strided_slice %22#3 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %30 = vector.extract_strided_slice %22#3 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - xegpu.store_nd %23, %4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %27, %6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %24, %8 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %28, %9 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %25, %11 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %29, %12 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %26, %14 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %30, %15 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %23, %4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %27, %6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %24, %8 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %28, %9 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, 
#xegpu.block_tdesc_attr> + xegpu.store_nd %25, %11 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %29, %12 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %26, %14 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %30, %15 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> gpu.return } } diff --git a/test/Transforms/VnniTransform/gemm_with_extract_e2e.mlir b/test/Transforms/VnniTransform/gemm_with_extract_e2e.mlir index e1c803622..af6696bc4 100644 --- a/test/Transforms/VnniTransform/gemm_with_extract_e2e.mlir +++ b/test/Transforms/VnniTransform/gemm_with_extract_e2e.mlir @@ -35,28 +35,28 @@ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv. %1 = arith.muli %block_id_y, %c32 : index %2 = arith.addi %0, %c0 : index %3 = arith.addi %1, %c0 : index - %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %4 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c16 = arith.constant 16 : index %5 = arith.addi %1, %c16 : index - %6 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %6 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c8 = arith.constant 8 : index %7 = arith.addi %0, %c8 : index - %8 = xegpu.create_nd_tdesc %arg2[%7, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %9 = xegpu.create_nd_tdesc %arg2[%7, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %8 = xegpu.create_nd_tdesc %arg2[%7, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %9 = xegpu.create_nd_tdesc %arg2[%7, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %10 = arith.addi %0, %c16 : index - %11 = xegpu.create_nd_tdesc %arg2[%10, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %12 = xegpu.create_nd_tdesc %arg2[%10, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %11 = xegpu.create_nd_tdesc %arg2[%10, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %12 = xegpu.create_nd_tdesc %arg2[%10, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> %c24 = arith.constant 24 : index %13 = arith.addi %0, %c24 : index - %14 = xegpu.create_nd_tdesc %arg2[%13, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %15 = xegpu.create_nd_tdesc %arg2[%13, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - %16 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - %17 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> - %18 = xegpu.load_nd %16 
<{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - %19 = xegpu.load_nd %17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> - %20 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %21 = xegpu.create_nd_tdesc %arg1[%c0, %3] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %22:4 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %20, %arg5 = %21, %arg6 = %18, %arg7 = %19) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>) { + %14 = xegpu.create_nd_tdesc %arg2[%13, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %15 = xegpu.create_nd_tdesc %arg2[%13, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + %16 = xegpu.create_nd_tdesc %arg2[%2, %3] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + %17 = xegpu.create_nd_tdesc %arg2[%2, %5] : memref<1024x1024xf32> -> !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> + %18 = xegpu.load_nd %16 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + %19 = xegpu.load_nd %17 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf32, #xegpu.block_tdesc_attr> -> vector<32x16xf32> + %20 = xegpu.create_nd_tdesc %arg0[%2, %c0] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %21 = xegpu.create_nd_tdesc %arg1[%c0, %3] : memref<1024x1024xf16> -> !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %22:4 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %20, %arg5 = %21, %arg6 = %18, %arg7 = %19) -> (!xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32>) { %31 = vector.extract_strided_slice %arg6 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %32 = vector.extract_strided_slice %arg6 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %33 = vector.extract_strided_slice %arg6 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -65,7 +65,7 @@ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv. 
%36 = vector.extract_strided_slice %arg7 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %37 = vector.extract_strided_slice %arg7 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %38 = vector.extract_strided_slice %arg7 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - %39 = xegpu.load_nd %arg4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + %39 = xegpu.load_nd %arg4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %40 = vector.extract %39[0] : vector<32x16xf16> from vector<2x32x16xf16> %41 = vector.extract %39[1] : vector<32x16xf16> from vector<2x32x16xf16> %42 = vector.extract_strided_slice %40 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> @@ -77,7 +77,7 @@ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv. %48 = vector.extract_strided_slice %41 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> %49 = vector.extract_strided_slice %41 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf16> to vector<8x16xf16> - %50 = xegpu.load_nd %arg5 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> + %50 = xegpu.load_nd %arg5 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> -> vector<2x32x16xf16> %51 = vector.extract %50[0] : vector<32x16xf16> from vector<2x32x16xf16> %52 = vector.extract %50[1] : vector<32x16xf16> from vector<2x32x16xf16> @@ -109,9 +109,9 @@ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv. 
%80 = vector.shuffle %64, %68 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> %81 = vector.shuffle %72, %76 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : vector<8x16xf32>, vector<8x16xf32> %82 = vector.shuffle %80, %81 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31] : vector<16x16xf32>, vector<16x16xf32> - %83 = xegpu.update_nd_offset %arg4, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - %84 = xegpu.update_nd_offset %arg5, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> - scf.yield %83, %84, %79, %82 : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> + %83 = xegpu.update_nd_offset %arg4, [%c0, %c32] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + %84 = xegpu.update_nd_offset %arg5, [%c32, %c0] : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr> + scf.yield %83, %84, %79, %82 : !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, !xegpu.tensor_desc<32x16xf16, #xegpu.block_tdesc_attr>, vector<32x16xf32>, vector<32x16xf32> } %23 = vector.extract_strided_slice %22#2 {offsets = [0, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %24 = vector.extract_strided_slice %22#2 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> @@ -121,14 +121,14 @@ gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv. %28 = vector.extract_strided_slice %22#3 {offsets = [8, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %29 = vector.extract_strided_slice %22#3 {offsets = [16, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> %30 = vector.extract_strided_slice %22#3 {offsets = [24, 0], sizes = [8, 16], strides = [1, 1]} : vector<32x16xf32> to vector<8x16xf32> - xegpu.store_nd %23, %4 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %27, %6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %24, %8 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %28, %9 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %25, %11 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %29, %12 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %26, %14 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> - xegpu.store_nd %30, %15 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %23, %4 
<{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %27, %6 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %24, %8 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %28, %9 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %25, %11 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %29, %12 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %26, %14 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + xegpu.store_nd %30, %15 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint}> : vector<8x16xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> gpu.return } }