Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add XeTile block operation fallback pass #991

Merged
merged 1 commit into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/imex/Dialect/XeTile/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ std::unique_ptr<mlir::Pass>
createXeTileBlockingPass(const std::string &device = "pvc");
std::unique_ptr<mlir::Pass> createXeTileWgToSgPass();
std::unique_ptr<mlir::Pass> createXeTileCanonicalizationPass();
std::unique_ptr<mlir::Pass> createXeTileBlockOpFallbackPass();

#define GEN_PASS_DECL_XETILEBLOCKING
#define GEN_PASS_DECL_XETILECANONICALIZATION
Expand Down
17 changes: 17 additions & 0 deletions include/imex/Dialect/XeTile/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -96,4 +96,21 @@ def XeTileBlocking : Pass<"xetile-blocking", "::mlir::gpu::GPUModuleOp">{
}


// Declarative spec for the xetile-blockop-fallback pass, run on gpu.module ops.
// Block ops that HW cannot service directly (see the fallback integration
// tests: narrow tiles, SLM-backed tiles) are rewritten into scattered ops.
def XeTileBlockOpFallback : Pass<"xetile-blockop-fallback", "::mlir::gpu::GPUModuleOp">{
let summary = "Transform unsuitable block ops to fallback scattered ops";

let description = [{
This pass rewrites XeTile block ops that are not suitable due to HW
restrictions into equivalent scattered XeTile ops.
}];

let constructor = "imex::createXeTileBlockOpFallbackPass()";
// Dialects whose ops the rewrite patterns may create; they must be loaded
// before the pass runs.
let dependentDialects = ["imex::xetile::XeTileDialect",
"mlir::arith::ArithDialect",
"mlir::gpu::GPUDialect",
"mlir::index::IndexDialect",
"mlir::memref::MemRefDialect",
"mlir::vector::VectorDialect"];
}

#endif // _XeTile_PASSES_TD_INCLUDED_
443 changes: 443 additions & 0 deletions lib/Dialect/XeTile/Transforms/BlockOpFallback.cpp

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions lib/Dialect/XeTile/Transforms/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
add_imex_dialect_library(IMEXXeTileTransforms
Blocking.cpp
BlockingAnalysis.cpp
BlockOpFallback.cpp
InitDuplicate.cpp
Canonicalization.cpp
WgToSg.cpp
Expand Down
4 changes: 3 additions & 1 deletion lib/Transforms/RemoveSingleElemVector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,9 @@ struct RemoveSingleElemVectorPass final
});

mlir::RewritePatternSet patterns(context);
patterns.add<VectorExtractStridedSliceConversion, VectorizableOpPattern,
// Disable VectorExtractStridedSliceConversion for now as it interferes with
// the xetile-blockop-fallback pass
patterns.add</*VectorExtractStridedSliceConversion,*/ VectorizableOpPattern,
VectorShffleOpConversion, VectorInterleaveOpConversion,
VectorSplatOpConversion, VectorExtractElementOpConversion>(
typeConverter, context);
Expand Down
391 changes: 391 additions & 0 deletions test/Dialect/XeTile/Transforms/block_op_fallback.mlir

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

// Integration test: round-trips a 64x1 f32 buffer through a GPU kernel that
// copies it tile-by-tile with 16x1 XeTile block ops. The 1-column tile is
// presumably below the HW 2D block-load minimum width, so it exercises the
// xetile-blockop-fallback pass in the RUN pipeline — TODO confirm against the
// pass implementation.
module @narrow_tile attributes {gpu.container_module} {
// Host wrapper: stage %A into a host_shared GPU buffer, launch the kernel,
// copy the GPU output into a fresh host buffer, and free the GPU buffers.
func.func @test(%A: memref<64x1xf32>) -> memref<64x1xf32> attributes {llvm.emit_c_interface} {
%c1 = arith.constant 1 : index
%A_gpu = gpu.alloc host_shared() : memref<64x1xf32>
memref.copy %A, %A_gpu : memref<64x1xf32> to memref<64x1xf32>
%B_gpu = gpu.alloc host_shared() : memref<64x1xf32>
// Single workgroup / single thread: the kernel's own loop covers all rows.
gpu.launch_func @test_module::@test_scf_for blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%A_gpu : memref<64x1xf32>, %B_gpu : memref<64x1xf32>)
%B = memref.alloc() : memref<64x1xf32>
memref.copy %B_gpu, %B : memref<64x1xf32> to memref<64x1xf32>
gpu.dealloc %A_gpu : memref<64x1xf32>
gpu.dealloc %B_gpu : memref<64x1xf32>
return %B : memref<64x1xf32>
}
gpu.module @test_module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Bfloat16ConversionINTEL, BFloat16TypeKHR, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, VectorAnyINTEL, VectorComputeINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_bfloat16, SPV_KHR_expect_assume, SPV_INTEL_bfloat16_conversion, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
// Kernel: copy %arg0 into %arg1 in four iterations of 16 rows each,
// advancing both 16x1 tiles by [16, 0] per step.
gpu.func @test_scf_for(%arg0: memref<64x1xf32>, %arg1: memref<64x1xf32>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 1, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%cst0 = arith.constant 0 : index
%cst16 = arith.constant 16 : index
%cst64 = arith.constant 64 : index
%0 = xetile.init_tile %arg0 [0, 0] : memref<64x1xf32> -> !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
%1 = xetile.init_tile %arg1 [0, 0] : memref<64x1xf32> -> !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
%out:2 = scf.for %k = %cst0 to %cst64 step %cst16
iter_args(%a_tile = %0, %b_tile = %1)
-> (!xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>) {
%a_value = xetile.load_tile %a_tile : !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>> -> vector<16x1xf32>
xetile.store_tile %a_value, %b_tile : vector<16x1xf32>, !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
%a_next_tile = xetile.update_tile_offset %a_tile, [%cst16, %cst0] : !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
%b_next_tile = xetile.update_tile_offset %b_tile, [%cst16, %cst0] : !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
scf.yield %a_next_tile, %b_next_tile : !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<16x1xf32, #xetile.tile_attr<order = [1, 0]>>
}
gpu.return
}
}
// Driver: fill A[i, 0] = i, run the kernel, and verify the output matches A.
func.func @main() attributes {llvm.emit_c_interface} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index

%A = memref.alloc() : memref<64x1xf32>
scf.for %arg0 = %c0 to %c64 step %c1 {
%0 = index.castu %arg0 : index to i32
%val = arith.uitofp %0 : i32 to f32
memref.store %val, %A[%arg0, %c0] : memref<64x1xf32>
}
%C = call @test(%A) : (memref<64x1xf32>) -> memref<64x1xf32>
%cast_A = memref.cast %A : memref<64x1xf32> to memref<*xf32>
%cast_C = memref.cast %C : memref<64x1xf32> to memref<*xf32>
// The kernel is a pure copy, so output must be element-wise close to input.
// CHECK: [ALLCLOSE: TRUE]
call @printAllcloseF32(%cast_C, %cast_A) : (memref<*xf32>, memref<*xf32>) -> ()
//call @printMemrefF32(%cast_A) : (memref<*xf32>) -> ()
//call @printMemrefF32(%cast_C) : (memref<*xf32>) -> ()
return
}
func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
//func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

// Integration test: same tiled-copy kernel as the 64x1 case but with a 64x2
// buffer and 16x2 tiles — a second narrow-tile shape presumably requiring the
// xetile-blockop-fallback rewrite (see RUN pipeline) — TODO confirm against
// the pass implementation.
module @narrow_tile attributes {gpu.container_module} {
// Host wrapper: stage %A into a host_shared GPU buffer, launch the kernel,
// copy the GPU output into a fresh host buffer, and free the GPU buffers.
func.func @test(%A: memref<64x2xf32>) -> memref<64x2xf32> attributes {llvm.emit_c_interface} {
%c1 = arith.constant 1 : index
%A_gpu = gpu.alloc host_shared() : memref<64x2xf32>
memref.copy %A, %A_gpu : memref<64x2xf32> to memref<64x2xf32>
%B_gpu = gpu.alloc host_shared() : memref<64x2xf32>
// Single workgroup / single thread: the kernel's own loop covers all rows.
gpu.launch_func @test_module::@test_scf_for blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%A_gpu : memref<64x2xf32>, %B_gpu : memref<64x2xf32>)
%B = memref.alloc() : memref<64x2xf32>
memref.copy %B_gpu, %B : memref<64x2xf32> to memref<64x2xf32>
gpu.dealloc %A_gpu : memref<64x2xf32>
gpu.dealloc %B_gpu : memref<64x2xf32>
return %B : memref<64x2xf32>
}
gpu.module @test_module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Bfloat16ConversionINTEL, BFloat16TypeKHR, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, VectorAnyINTEL, VectorComputeINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_bfloat16, SPV_KHR_expect_assume, SPV_INTEL_bfloat16_conversion, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
// Kernel: copy %arg0 into %arg1 in four iterations of 16 rows each,
// advancing both 16x2 tiles by [16, 0] per step.
gpu.func @test_scf_for(%arg0: memref<64x2xf32>, %arg1: memref<64x2xf32>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 1, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%cst0 = arith.constant 0 : index
%cst16 = arith.constant 16 : index
%cst64 = arith.constant 64 : index
%0 = xetile.init_tile %arg0 [0, 0] : memref<64x2xf32> -> !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
%1 = xetile.init_tile %arg1 [0, 0] : memref<64x2xf32> -> !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
%out:2 = scf.for %k = %cst0 to %cst64 step %cst16
iter_args(%a_tile = %0, %b_tile = %1)
-> (!xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>) {
%a_value = xetile.load_tile %a_tile : !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>> -> vector<16x2xf32>
xetile.store_tile %a_value, %b_tile : vector<16x2xf32>, !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
%a_next_tile = xetile.update_tile_offset %a_tile, [%cst16, %cst0] : !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
%b_next_tile = xetile.update_tile_offset %b_tile, [%cst16, %cst0] : !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
scf.yield %a_next_tile, %b_next_tile : !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<16x2xf32, #xetile.tile_attr<order = [1, 0]>>
}
gpu.return
}
}
// Driver: fill both columns with A[i, *] = i, run the kernel, verify output.
func.func @main() attributes {llvm.emit_c_interface} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c64 = arith.constant 64 : index

%A = memref.alloc() : memref<64x2xf32>
scf.for %arg0 = %c0 to %c64 step %c1 {
%0 = index.castu %arg0 : index to i32
%val = arith.uitofp %0 : i32 to f32
memref.store %val, %A[%arg0, %c0] : memref<64x2xf32>
memref.store %val, %A[%arg0, %c1] : memref<64x2xf32>
}
%C = call @test(%A) : (memref<64x2xf32>) -> memref<64x2xf32>
%cast_A = memref.cast %A : memref<64x2xf32> to memref<*xf32>
%cast_C = memref.cast %C : memref<64x2xf32> to memref<*xf32>
// The kernel is a pure copy, so output must be element-wise close to input.
// CHECK: [ALLCLOSE: TRUE]
call @printAllcloseF32(%cast_C, %cast_A) : (memref<*xf32>, memref<*xf32>) -> ()
//call @printMemrefF32(%cast_A) : (memref<*xf32>) -> ()
//call @printMemrefF32(%cast_C) : (memref<*xf32>) -> ()
return
}
func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
//func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
}
80 changes: 80 additions & 0 deletions test/Integration/Dialect/XeTile/fallback/slm.mlir
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// RUN: %python_executable %imex_runner --requires=l0-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%levelzero_runtime --filecheck
// RUN: %python_executable %imex_runner --requires=sycl-runtime -i %s --pass-pipeline-file=%p/xetile-fallback-to-func-vc.pp \
// RUN: --runner imex-cpu-runner -e main \
// RUN: --entry-point-result=void \
// RUN: --shared-libs=%irunner_utils,%mlir_runner_utils,%mlir_c_runner_utils,%sycl_runtime --filecheck

// Integration test: copies a 32x32 buffer in 8x16 tiles, staging every tile
// through shared local memory (memory_space = 3). Block ops on SLM tiles are
// presumably one of the HW-restricted cases the xetile-blockop-fallback pass
// rewrites (see RUN pipeline) — TODO confirm against the pass implementation.
module @narrow_tile attributes {gpu.container_module} {
// Host wrapper: stage %A into a host_shared GPU buffer, launch the kernel,
// copy the GPU output into a fresh host buffer, and free the GPU buffers.
func.func @test(%A: memref<32x32xf32>) -> memref<32x32xf32> attributes {llvm.emit_c_interface} {
%c1 = arith.constant 1 : index
%A_gpu = gpu.alloc host_shared() : memref<32x32xf32>
memref.copy %A, %A_gpu : memref<32x32xf32> to memref<32x32xf32>
%B_gpu = gpu.alloc host_shared() : memref<32x32xf32>
// Single workgroup / single thread: the kernel's nested loops cover the
// whole 32x32 surface.
gpu.launch_func @test_module::@test_scf_for blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1) args(%A_gpu : memref<32x32xf32>, %B_gpu : memref<32x32xf32>)
%B = memref.alloc() : memref<32x32xf32>
memref.copy %B_gpu, %B : memref<32x32xf32> to memref<32x32xf32>
gpu.dealloc %A_gpu : memref<32x32xf32>
gpu.dealloc %B_gpu : memref<32x32xf32>
return %B : memref<32x32xf32>
}
gpu.module @test_module attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Bfloat16ConversionINTEL, BFloat16TypeKHR, Float16Buffer, Int64, Int16, Int8, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR, VectorAnyINTEL, VectorComputeINTEL], [SPV_EXT_shader_atomic_float_add, SPV_KHR_bfloat16, SPV_KHR_expect_assume, SPV_INTEL_bfloat16_conversion, SPV_INTEL_vector_compute]>, api=OpenCL, #spirv.resource_limits<>>} {
// Kernel: for each 8-row band (outer loop) and each 16-column slab (inner
// loop), load an 8x16 tile from %arg0, store it to the SLM tile, load it
// back from SLM, and store it to %arg1 — a copy that round-trips through
// shared local memory.
gpu.func @test_scf_for(%arg0: memref<32x32xf32>, %arg1: memref<32x32xf32>) kernel attributes {VectorComputeFunctionINTEL, known_block_size = array<i32: 1, 1, 1>, known_grid_size = array<i32: 1, 1, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
%cst0 = arith.constant 0 : index
%cst8 = arith.constant 8 : index
%cst16 = arith.constant 16 : index
%cst32 = arith.constant 32 : index
%0 = xetile.init_tile %arg0 [0, 0] : memref<32x32xf32> -> !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
%1 = xetile.init_tile %arg1 [0, 0] : memref<32x32xf32> -> !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
// One 8x16 staging tile in shared local memory (address space 3), reused
// for every tile of the copy.
%slm = memref.alloc() : memref<8x16xf32, 3>
%slm_tile = xetile.init_tile %slm [0, 0] : memref<8x16xf32, 3> -> !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0], memory_space = 3 : i32>>
%out:2 = scf.for %j = %cst0 to %cst32 step %cst8
iter_args(%a_tile = %0, %b_tile = %1)
-> (!xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>) {
// Inner loop walks the columns; its yielded tiles are deliberately
// discarded — the outer loop re-derives the next band from %a_tile /
// %b_tile, resetting the column offset to 0.
%out:2 = scf.for %k = %cst0 to %cst32 step %cst16
iter_args(%c_tile = %a_tile, %d_tile = %b_tile)
-> (!xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>) {
%c_value = xetile.load_tile %c_tile : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>> -> vector<8x16xf32>
xetile.store_tile %c_value, %slm_tile : vector<8x16xf32>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0], memory_space = 3 : i32>>
%d_value = xetile.load_tile %slm_tile : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0], memory_space = 3 : i32>> -> vector<8x16xf32>
xetile.store_tile %d_value, %d_tile : vector<8x16xf32>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
%c_next_tile = xetile.update_tile_offset %c_tile, [%cst0, %cst16] : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
%d_next_tile = xetile.update_tile_offset %d_tile, [%cst0, %cst16] : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
scf.yield %c_next_tile, %d_next_tile : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
}
%a_next_tile = xetile.update_tile_offset %a_tile, [%cst8, %cst0] : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
%b_next_tile = xetile.update_tile_offset %b_tile, [%cst8, %cst0] : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
scf.yield %a_next_tile, %b_next_tile : !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>, !xetile.tile<8x16xf32, #xetile.tile_attr<order = [1, 0]>>
}
gpu.return
}
}
// Driver: fill A[i, j] = i + j, run the kernel, and verify the output.
func.func @main() attributes {llvm.emit_c_interface} {
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index

%A = memref.alloc() : memref<32x32xf32>
scf.for %arg0 = %c0 to %c32 step %c1 {
scf.for %arg1 = %c0 to %c32 step %c1 {
%0 = index.castu %arg0 : index to i32
%1 = index.castu %arg1 : index to i32
%2 = arith.addi %0, %1 : i32
%val = arith.uitofp %2 : i32 to f32
memref.store %val, %A[%arg0, %arg1] : memref<32x32xf32>
}
}
%C = call @test(%A) : (memref<32x32xf32>) -> memref<32x32xf32>
%cast_A = memref.cast %A : memref<32x32xf32> to memref<*xf32>
%cast_C = memref.cast %C : memref<32x32xf32> to memref<*xf32>
// The kernel is a pure copy, so output must be element-wise close to input.
// CHECK: [ALLCLOSE: TRUE]
call @printAllcloseF32(%cast_C, %cast_A) : (memref<*xf32>, memref<*xf32>) -> ()
//call @printMemrefF32(%cast_A) : (memref<*xf32>) -> ()
//call @printMemrefF32(%cast_C) : (memref<*xf32>) -> ()
return
}
func.func private @printAllcloseF32(memref<*xf32>, memref<*xf32>) attributes {llvm.emit_c_interface}
//func.func private @printMemrefF32(memref<*xf32>) attributes {llvm.emit_c_interface}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
builtin.module(
cse
gpu.module(xetile-init-duplicate
xetile-canonicalization
xetile-blockop-fallback
xetile-blocking
cse
convert-xetile-to-xegpu
cse
imex-xegpu-hoist-transpose
imex-xegpu-apply-vnni-transformation
imex-xegpu-optimize-transpose)
cse
imex-vector-linearize
cse
imex-remove-single-elem-vector
canonicalize
cse
gpu.module(convert-xegpu-to-vc)
reconcile-unrealized-casts
bf16-to-gpu
cse
imex-convert-gpu-to-spirv
spirv.module(spirv-lower-abi-attrs
spirv-update-vce)
func.func(llvm-request-c-wrappers)
serialize-spirv
convert-vector-to-scf
convert-gpu-to-gpux
convert-scf-to-cf
expand-strided-metadata
finalize-memref-to-llvm
convert-cf-to-llvm
convert-vector-to-llvm
convert-index-to-llvm
convert-arith-to-llvm
convert-func-to-llvm
convert-math-to-llvm
convert-gpux-to-llvm
lower-affine
reconcile-unrealized-casts)
4 changes: 3 additions & 1 deletion test/Transforms/RemoveSingleElemVector/postop_reduce_n.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ module {
%34 = arith.remsi %11, %c4 : index
%35 = scf.for %arg3 = %c0 to %c3 step %c1 iter_args(%arg4 = %cst) -> (vector<8x1xf32>) {
%39 = vector.shape_cast %arg4 : vector<8x1xf32> to vector<8xf32>
//CHECK-COUNT-8: vector.extractelement {{.*}} : vector<8xf32>
// Disabling removal of single-element vector.extract_strided_slice for now.
// DISABLE-CHECK-COUNT-8: vector.extractelement {{.*}} : vector<8xf32>
// CHECK-COUNT-8: vector.extract_strided_slice
%40 = vector.extract_strided_slice %39 {offsets = [0], sizes = [1], strides = [1]} : vector<8xf32> to vector<1xf32>
%41 = vector.extract_strided_slice %39 {offsets = [1], sizes = [1], strides = [1]} : vector<8xf32> to vector<1xf32>
%42 = vector.extract_strided_slice %39 {offsets = [2], sizes = [1], strides = [1]} : vector<8xf32> to vector<1xf32>
Expand Down
Loading