From 18efe7ca25567b7074193f2ce30e456f2ea318f9 Mon Sep 17 00:00:00 2001 From: Konstantin Schwarz Date: Mon, 14 Oct 2024 19:59:20 +0100 Subject: [PATCH] [GlobalISel] Match G_SHUFFLE_VECTORs representing sub-vector extracts --- .../llvm/CodeGen/GlobalISel/CombinerHelper.h | 4 ++ .../include/llvm/Target/GlobalISel/Combine.td | 12 +++- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 55 ++++++++++++++++ llvm/test/CodeGen/AArch64/ext-narrow-index.ll | 7 +- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 12 ++-- .../CodeGen/AIE/aie2/intrinsics-shufflevec.ll | 65 +++---------------- 6 files changed, 86 insertions(+), 69 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h index b2132562ac3f..ce2cd978859a 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -856,6 +856,10 @@ class CombinerHelper { bool matchExtractVectorElementWithDifferentIndices(const MachineOperand &MO, BuildFnTy &MatchInfo); + /// Transform: + /// res = G_SHUFFLE_VECTOR a, b, <0, 1> -> res, undef = G_UNMERGE_VALUES a + bool matchShuffleToExtractSubvector(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Transform: /// G_INTTOPTR (int G_CONSTANT x) -> (pointer G_CONSTANT x) bool matchIntToPtrContant(MachineInstr &MI, MachineRegisterInfo &MRI, diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td index 3c8d968c2764..a67690b6371f 100644 --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -1527,6 +1527,16 @@ def combine_shuffle_concat : GICombineRule< [{ return Helper.matchCombineShuffleConcat(*${root}, ${matchinfo}); }]), (apply [{ Helper.applyCombineShuffleConcat(*${root}, ${matchinfo}); }])>; +// Combines Shuffles representing vector extracts into Unmerges +// res = G_SHUFFLE_VECTOR a, b, <0, 1> +// ===> +// res, undef = G_UNMERGE_VALUES a 
+def combine_shuffle_to_extract_vector : GICombineRule< + (defs root:$root, build_fn_matchinfo:$matchinfo), + (match (wip_match_opcode G_SHUFFLE_VECTOR):$root, + [{ return Helper.matchShuffleToExtractSubvector(*${root}, ${matchinfo}); }]), + (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])>; + def combine_inttoptr_constant : GICombineRule< (defs root:$root, build_fn_matchinfo:$info), (match (wip_match_opcode G_INTTOPTR):$root, @@ -1642,7 +1652,7 @@ def all_combines : GICombineGroup<[trivial_combines, vector_ops_combines, sub_add_reg, select_to_minmax, redundant_binop_in_equality, fsub_to_fneg, commute_constant_to_rhs, match_ands, match_ors, combine_concat_vector, double_icmp_zero_and_or_combine, match_addos, - combine_shuffle_concat]>; + combine_shuffle_concat, combine_shuffle_to_extract_vector]>; // A combine group used to for prelegalizer combiners at -O0. The combines in // this group have been selected based on experiments to balance code size and diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index ec7ca5dc8e2b..34bea6f26aad 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/Analysis/CmpInstAnalysis.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" @@ -7254,6 +7255,60 @@ bool CombinerHelper::matchAddOverflow(MachineInstr &MI, BuildFnTy &MatchInfo) { return false; } +bool CombinerHelper::matchShuffleToExtractSubvector(MachineInstr &MI, + BuildFnTy &MatchInfo) { + // Match a shuffle that selects the low or high half of its first source. + assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR); + const Register DstReg = MI.getOperand(0).getReg(); + const Register Src1Reg = MI.getOperand(1).getReg(); + ArrayRef<int> Mask = 
MI.getOperand(3).getShuffleMask(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT Src1Ty = MRI.getType(Src1Reg); + + if (!DstTy.isVector() || !Src1Ty.isVector()) + return false; + + const unsigned NumDstElems = DstTy.getNumElements(); + const unsigned NumSrc1Elems = Src1Ty.getNumElements(); + + if (NumDstElems * 2 != NumSrc1Elems) + return false; + + auto CheckExtractMask = [=](unsigned Start, unsigned NumElems) -> bool { + auto ExtractMask = createSequentialMask(Start, NumElems, 0); + // Undef lanes (-1) are compatible with any source position. + for (unsigned I = 0; I < NumDstElems; I++) { + if (Mask[I] == -1) + continue; + + if (Mask[I] != ExtractMask[I]) + return false; + } + + return true; + }; + + // Classify the mask before touching MRI so that a failed match does not + // leave a dead virtual register behind. + const bool IsLowHalf = CheckExtractMask(0, NumDstElems); + if (!IsLowHalf && !CheckExtractMask(NumDstElems, NumDstElems)) + return false; + + MatchInfo = [=](MachineIRBuilder &B) { + // The half that is not extracted is defined into an unused register. + const Register UnusedReg = + B.getMRI()->createGenericVirtualRegister(DstTy); + + if (IsLowHalf) + B.buildUnmerge({DstReg, UnusedReg}, Src1Reg); + else + B.buildUnmerge({UnusedReg, DstReg}, Src1Reg); + }; + + return true; +} + bool CombinerHelper::matchIntToPtrContant(MachineInstr &MI, MachineRegisterInfo &MRI, BuildFnTy &MatchInfo) { diff --git a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll index 2c5d33da93c8..db8250db4320 100644 --- a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll +++ b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll @@ -42,8 +42,7 @@ define <8 x i8> @i8_off8(<16 x i8> %arg1, <16 x i8> %arg2) { ; ; CHECK-GISEL-LABEL: i8_off8: ; CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: ext v0.16b, v0.16b, v1.16b, #8 -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GISEL-NEXT: mov d0, v0.d[1] ; CHECK-GISEL-NEXT: ret entry: %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> @@ -254,9 +253,7 @@ define <8 x i8> @i8_zero_off8(<16 x i8> %arg1) { ; ; CHECK-GISEL-LABEL: i8_zero_off8: ; 
CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: movi v1.2d, #0000000000000000 -; CHECK-GISEL-NEXT: ext v0.16b, v0.16b, v1.16b, #8 -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GISEL-NEXT: mov d0, v0.d[1] ; CHECK-GISEL-NEXT: ret entry: %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 3254c5ebe9c6..42c68883351f 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -3744,17 +3744,13 @@ define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i1 ; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-GI-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-GI-NEXT: ushll v6.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0 ; CHECK-GI-NEXT: ushll v7.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: add v0.4s, v4.4s, v0.4s -; CHECK-GI-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-GI-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-GI-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-GI-NEXT: uaddw2 v0.4s, v4.4s, v0.8h +; CHECK-GI-NEXT: uaddw2 v1.4s, v5.4s, v1.8h +; CHECK-GI-NEXT: uaddw2 v2.4s, v6.4s, v2.8h +; CHECK-GI-NEXT: uaddw2 v3.4s, v7.4s, v3.8h ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s diff --git a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll index f61e90620642..6c0af33b059a 100644 --- a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll +++ b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll @@ -15,63 +15,18 @@ define <8 x i32> @test_extract_vector(<16 x i32> noundef %a, i32 noundef %idx) { ; CHECK-NEXT: nopa ; nopx // Delay Slot 
5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: mov r8, r16 // Delay Slot 1 +; CHECK-NEXT: vmov x0, x2 // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.end -; CHECK-NEXT: mova r16, #8; nopb ; nopxm -; CHECK-NEXT: vextract.s32 r0, x2, r16 -; CHECK-NEXT: mova r16, #9 -; CHECK-NEXT: vextract.s32 r1, x2, r16 -; CHECK-NEXT: mova r16, #10 -; CHECK-NEXT: vextract.s32 r2, x2, r16 -; CHECK-NEXT: mova r16, #11 -; CHECK-NEXT: vextract.s32 r3, x2, r16 -; CHECK-NEXT: mova r16, #12 -; CHECK-NEXT: vextract.s32 r4, x2, r16 -; CHECK-NEXT: mova r16, #13 -; CHECK-NEXT: vextract.s32 r5, x2, r16 -; CHECK-NEXT: mova r16, #15 -; CHECK-NEXT: vextract.s32 r6, x2, r16 -; CHECK-NEXT: mova r16, #14 -; CHECK-NEXT: vextract.s32 r7, x2, r16 -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 -; CHECK-NEXT: vpush.lo.32 x0, r7, x0 -; CHECK-NEXT: vpush.lo.32 x0, r5, x0 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 3 -; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r8 // Delay Slot 1 +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wl0, wh0; nopv ; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_2: // %if.then -; CHECK-NEXT: mova r16, #0; nopb ; nopxm -; CHECK-NEXT: vextract.s32 r0, x2, r16 -; CHECK-NEXT: mova r16, #1 -; CHECK-NEXT: vextract.s32 r1, x2, r16 -; CHECK-NEXT: mova r16, #2 -; CHECK-NEXT: vextract.s32 r2, x2, r16 -; CHECK-NEXT: mova r16, #3 -; CHECK-NEXT: vextract.s32 r3, x2, r16 -; CHECK-NEXT: mova r16, #4 -; CHECK-NEXT: vextract.s32 r4, x2, r16 -; CHECK-NEXT: mova r16, #5 -; CHECK-NEXT: vextract.s32 r5, x2, r16 -; CHECK-NEXT: mova r16, #7 -; CHECK-NEXT: vextract.s32 r6, x2, r16 -; CHECK-NEXT: mova r16, #6 -; CHECK-NEXT: vextract.s32 r7, x2, r16 -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 -; CHECK-NEXT: 
vpush.lo.32 x0, r7, x0 -; CHECK-NEXT: vpush.lo.32 x0, r5, x0 -; CHECK-NEXT: vpush.lo.32 x0, r4, x0 -; CHECK-NEXT: ret lr -; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 5 -; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 4 -; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 3 -; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2 -; CHECK-NEXT: mov r16, r8 // Delay Slot 1 +; CHECK-NEXT: .LBB0_2: // %return +; CHECK-NEXT: nopa ; ret lr +; CHECK-NEXT: nop // Delay Slot 5 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 entry: %cmp = icmp eq i32 %idx, 0 br i1 %cmp, label %if.then, label %if.end