Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Generic][AIE2] Combiner for shufflevectors that use build vector #129

Open
wants to merge 13 commits into
base: vvandebe.vshuffle.impl
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,15 @@ class CombinerHelper {
applyCombineUnmergeMergeToPlainValues(MachineInstr &MI,
SmallVectorImpl<Register> &Operands);

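/// Transform G_SHUFFLE_VECTOR(G_BUILD_VECTOR X..., G_BUILD_VECTOR Y...), mask
/// -> G_BUILD_VECTOR of the elements of X.../Y... selected by the mask.
/// Either source may also be a G_IMPLICIT_DEF.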
bool
matchCombineShuffleVectorBuildVector(MachineInstr &MI,
SmallVectorImpl<Register> &Operands);

void
applyCombineShuffleVectorBuildVector(MachineInstr &MI,
SmallVectorImpl<Register> &Operands);

/// Transform G_UNMERGE Constant -> Constant1, Constant2, ...
bool matchCombineUnmergeConstant(MachineInstr &MI,
SmallVectorImpl<APInt> &Csts);
Expand Down
19 changes: 19 additions & 0 deletions llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its
// affiliates
//
//===----------------------------------------------------------------------===//
/// \file
/// Declares convenience wrapper classes for interpreting MachineInstr instances
Expand Down Expand Up @@ -240,6 +243,22 @@ class GUnmerge : public GenericMachineInstr {
}
};

/// Convenience wrapper around a G_SHUFFLE_VECTOR instruction.
class GShuffleVector : public GenericMachineInstr {
public:
  /// Number of source registers: every operand except the first and the
  /// last one.
  unsigned getNumSources() const {
    const unsigned NumOps = getNumOperands();
    return NumOps - 2;
  }

  /// Register of the I'th source. Sources start at operand index 1.
  Register getSourceReg(unsigned I) const {
    const unsigned OpIdx = I + 1;
    assert(OpIdx <= getNumSources());
    return getReg(OpIdx);
  }

  static bool classof(const MachineInstr *MI) {
    const unsigned Opcode = MI->getOpcode();
    return Opcode == TargetOpcode::G_SHUFFLE_VECTOR;
  }
};

/// Represents G_BUILD_VECTOR, G_CONCAT_VECTORS or G_MERGE_VALUES.
/// All these have the common property of generating a single value from
/// multiple sources.
Expand Down
12 changes: 11 additions & 1 deletion llvm/include/llvm/Target/GlobalISel/Combine.td
Original file line number Diff line number Diff line change
Expand Up @@ -756,7 +756,7 @@ def fneg_fneg_fold: GICombineRule <
(apply (GIReplaceReg $dst, $src))
>;

// Fold (unmerge(merge x, y, z)) -> z, y, z.
// Fold (unmerge(merge x, y, z)) -> x, y, z.
def unmerge_merge_matchinfo : GIDefMatchData<"SmallVector<Register, 8>">;
def unmerge_merge : GICombineRule<
(defs root:$d, unmerge_merge_matchinfo:$info),
Expand All @@ -765,6 +765,16 @@ def unmerge_merge : GICombineRule<
(apply [{ Helper.applyCombineUnmergeMergeToPlainValues(*${d}, ${info}); }])
>;

// Fold shufflevector(build_vector(x...), build_vector(y...), mask)
//   -> build_vector of the elements selected by the mask.
def shufflevector_merge_matchinfo : GIDefMatchData<"SmallVector<Register, 8>">;
def shufflevector_merge : GICombineRule<
(defs root:$d, shufflevector_merge_matchinfo:$info),
(match (wip_match_opcode G_SHUFFLE_VECTOR): $d,
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A recent move in LLVM is that you shouldn't use wip_match_opcode anymore since it slows down compilation.

https://llvm.org/docs/GlobalISel/MIRPatterns.html#gallery

[{ return Helper.matchCombineShuffleVectorBuildVector(*${d}, ${info}); }]),
(apply [{ Helper.applyCombineShuffleVectorBuildVector(*${d}, ${info}); }])
>;


// Fold merge(unmerge).
def merge_unmerge : GICombineRule<
(defs root:$d, register_matchinfo:$matchinfo),
Expand Down
88 changes: 88 additions & 0 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
Expand All @@ -27,6 +28,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterBankInfo.h"
Expand Down Expand Up @@ -2269,6 +2271,92 @@ static Register peekThroughBitcast(Register Reg,
return Reg;
}

// Match a G_SHUFFLE_VECTOR whose two sources are each either a G_BUILD_VECTOR
// or a G_IMPLICIT_DEF.
//
// On success \p Operands describes the concatenation of both sources, one
// entry per element:
//   * the scalar register feeding that element of a G_BUILD_VECTOR, or
//   * an invalid Register() for an element of an undefined source.
// \p Operands is left empty when both sources are undefined; the apply step
// then replaces the whole shuffle with a single G_IMPLICIT_DEF.
//
// This function does not create registers or instructions.
bool CombinerHelper::matchCombineShuffleVectorBuildVector(
    MachineInstr &MI, SmallVectorImpl<Register> &Operands) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
         "Expected a shuffle vector");
  auto &ShuffleVector = cast<GShuffleVector>(MI);
  const Register SrcReg1 = ShuffleVector.getSourceReg(0);
  const Register SrcReg2 = ShuffleVector.getSourceReg(1);

  Operands.clear();

  // An undefined value stays undefined through a bitcast, so it is safe to
  // look through bitcasts when searching for a G_IMPLICIT_DEF.
  const bool IsUndef1 =
      getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF,
                   peekThroughBitcast(SrcReg1, MRI), MRI) != nullptr;
  const bool IsUndef2 =
      getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF,
                   peekThroughBitcast(SrcReg2, MRI), MRI) != nullptr;

  // Build vectors, on the other hand, must define the shuffle source
  // directly: a bitcast in between may change the number and the type of the
  // elements, in which case the shuffle mask no longer indexes the build
  // vector's operands.
  auto *SrcInstr1 = getOpcodeDef<GBuildVector>(SrcReg1, MRI);
  auto *SrcInstr2 = getOpcodeDef<GBuildVector>(SrcReg2, MRI);

  // Our inputs need to be either build vectors or undefined; plain register
  // inputs break this optimization. You could maybe do something clever where
  // you concatenate vectors to save half a build vector.
  if ((!SrcInstr1 && !IsUndef1) || (!SrcInstr2 && !IsUndef2))
    return false;

  if (IsUndef1 && IsUndef2)
    return true;

  // The replacement is a G_BUILD_VECTOR, which has to define a vector.
  if (!MRI.getType(ShuffleVector.getReg(0)).isVector())
    return false;

  // Since the inputs of a shuffle vector must have the same type, the element
  // count of whichever build vector is present applies to both sources.
  const unsigned NumElements =
      SrcInstr1 ? SrcInstr1->getNumSources() : SrcInstr2->getNumSources();

  for (const GBuildVector *SrcInstr : {SrcInstr1, SrcInstr2}) {
    for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
      // Register() marks an element of an undefined source.
      Operands.push_back(SrcInstr ? SrcInstr->getSourceReg(Idx) : Register());
    }
  }

  return true;
}

// Rewrite the G_SHUFFLE_VECTOR matched by matchCombineShuffleVectorBuildVector
// into a G_BUILD_VECTOR of the scalars selected by the shuffle mask, or into
// a G_IMPLICIT_DEF when \p Operands is empty (both sources undefined).
void CombinerHelper::applyCombineShuffleVectorBuildVector(
    MachineInstr &MI, SmallVectorImpl<Register> &Operands) {
  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
         "Expected a shuffle vector");
  auto &ShuffleVector = cast<GShuffleVector>(MI);
  const Register DstReg = ShuffleVector.getReg(0);

  // Insert the replacement instructions right before the shuffle.
  Builder.setInstrAndDebugLoc(MI);

  // If both sources are undefined, the result is undefined as well.
  if (Operands.empty()) {
    Builder.buildUndef(DstReg);
    MI.eraseFromParent();
    return;
  }

  const LLT EltTy = MRI.getType(DstReg).getScalarType();
  const ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();

  // Scalar undef shared by all undefined lanes; only created when needed so
  // that no dead G_IMPLICIT_DEF is left behind.
  Register UndefReg;
  SmallVector<Register, 8> Arguments;
  Arguments.reserve(ShuffleMask.size());
  for (const int Index : ShuffleMask) {
    Register Argument;
    // A negative mask entry selects an undefined lane.
    if (Index >= 0) {
      assert(static_cast<size_t>(Index) < Operands.size() &&
             "Shuffle mask index out of range");
      Argument = Operands[Index];
    }

    // Either the mask lane is undefined or it picks an element of an
    // undefined source.
    if (!Argument.isValid()) {
      if (!UndefReg.isValid())
        UndefReg = Builder.buildUndef(EltTy).getReg(0);
      Argument = UndefReg;
    }

    Arguments.push_back(Argument);
  }

  Builder.buildBuildVector(DstReg, Arguments);
  MI.eraseFromParent();
}

bool CombinerHelper::matchCombineUnmergeMergeToPlainValues(
MachineInstr &MI, SmallVectorImpl<Register> &Operands) {
assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64Combine.td
Original file line number Diff line number Diff line change
Expand Up @@ -295,5 +295,5 @@ def AArch64PostLegalizerCombiner
ptr_add_immed_chain, overlapping_and,
split_store_zero_128, undef_combines,
select_to_minmax, or_to_bsp, combine_concat_vector,
commute_constant_to_rhs]> {
commute_constant_to_rhs, shufflevector_merge]> {
}
Comment on lines 297 to 299
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is needed since some ARM64 tests rely on the legalizer changing the inputs of the shufflevector; if the combiner is not run again afterwards, you get worse code.

3 changes: 2 additions & 1 deletion llvm/lib/Target/AIE/AIECombine.td
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def AIE2PreLegalizerCombiner
all_combines, combine_S20NarrowingOpt,
combine_globalval_offset,
combine_extract_vector_elt_and_zsa_ext,
combine_splat_vector ]> {
combine_splat_vector,
shufflevector_merge ]> {
let CombineAllMethodName = "tryCombineAllImpl";
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUCombine.td
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>;

def AMDGPUPreLegalizerCombiner: GICombiner<
"AMDGPUPreLegalizerCombinerImpl",
[all_combines, clamp_i64_to_i16, foldable_fneg]> {
[all_combines, clamp_i64_to_i16, foldable_fneg, shufflevector_merge]> {
let CombineAllMethodName = "tryCombineAllImpl";
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/RISCV/RISCVCombine.td
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
include "llvm/Target/GlobalISel/Combine.td"

def RISCVPreLegalizerCombiner: GICombiner<
"RISCVPreLegalizerCombinerImpl", [all_combines]> {
"RISCVPreLegalizerCombinerImpl", [all_combines, shufflevector_merge]> {
}

def RISCVO0PreLegalizerCombiner: GICombiner<
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,18 @@ declare i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32>) #0
define i32 @bar() {
; CHECK-LABEL: bar:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: mov b1, v0[1]
; CHECK-NEXT: mov b2, v0[2]
; CHECK-NEXT: mov b3, v0[3]
; CHECK-NEXT: mov.h v0[1], v1[0]
; CHECK-NEXT: mov w8, #0 ; =0x0
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: mov.16b v1, v0
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add copyright header!

; CHECK-NEXT: mov.b v1[1], v0[0]
; CHECK-NEXT: mov.b v1[2], v0[0]
; CHECK-NEXT: mov.b v1[3], v0[0]
; CHECK-NEXT: mov b0, v1[1]
; CHECK-NEXT: mov b2, v1[2]
; CHECK-NEXT: mov b3, v1[3]
; CHECK-NEXT: mov.h v1[1], v0[0]
; CHECK-NEXT: mov.h v2[1], v3[0]
; CHECK-NEXT: ushll.4s v0, v0, #0
; CHECK-NEXT: ushll.4s v0, v1, #0
; CHECK-NEXT: ushll.4s v1, v2, #0
; CHECK-NEXT: mov.d v0[1], v1[0]
; CHECK-NEXT: movi.4s v1, #1
Expand Down
75 changes: 28 additions & 47 deletions llvm/test/CodeGen/AArch64/shufflevector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -210,24 +210,14 @@ define i32 @shufflevector_v4i8(<4 x i8> %a, <4 x i8> %b){
; CHECK-GI-LABEL: shufflevector_v4i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov h2, v0.h[1]
; CHECK-GI-NEXT: mov h3, v1.h[1]
; CHECK-GI-NEXT: adrp x8, .LCPI15_0
; CHECK-GI-NEXT: mov h4, v0.h[2]
; CHECK-GI-NEXT: mov h5, v0.h[3]
; CHECK-GI-NEXT: mov h6, v1.h[3]
; CHECK-GI-NEXT: mov v0.b[1], v2.b[0]
; CHECK-GI-NEXT: mov h2, v1.h[2]
; CHECK-GI-NEXT: mov v1.b[1], v3.b[0]
; CHECK-GI-NEXT: mov v0.b[2], v4.b[0]
; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
; CHECK-GI-NEXT: mov v0.b[3], v5.b[0]
; CHECK-GI-NEXT: mov v1.b[3], v6.b[0]
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI15_0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov h0, v0.h[2]
; CHECK-GI-NEXT: mov v2.b[1], v0.b[0]
; CHECK-GI-NEXT: mov h0, v1.h[3]
; CHECK-GI-NEXT: mov v2.b[2], v1.b[0]
; CHECK-GI-NEXT: mov v2.b[3], v0.b[0]
; CHECK-GI-NEXT: fmov w0, s2
; CHECK-GI-NEXT: ret
%c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 1, i32 2, i32 4, i32 7>
%d = bitcast <4 x i8> %c to i32
Expand Down Expand Up @@ -280,14 +270,8 @@ define i32 @shufflevector_v2i16(<2 x i16> %a, <2 x i16> %b){
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-GI-NEXT: mov s2, v0.s[1]
; CHECK-GI-NEXT: mov s3, v1.s[1]
; CHECK-GI-NEXT: adrp x8, .LCPI17_0
; CHECK-GI-NEXT: mov v0.h[1], v2.h[0]
; CHECK-GI-NEXT: mov v1.h[1], v3.h[0]
; CHECK-GI-NEXT: mov v0.d[1], v1.d[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI17_0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT: mov s0, v0.s[1]
; CHECK-GI-NEXT: mov v0.h[1], v1.h[0]
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
%c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 1, i32 2>
Expand Down Expand Up @@ -397,9 +381,12 @@ define i32 @shufflevector_v4i8_zeroes(<4 x i8> %a, <4 x i8> %b){
;
; CHECK-GI-LABEL: shufflevector_v4i8_zeroes:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.8b, w8
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov v1.16b, v0.16b
; CHECK-GI-NEXT: mov v1.b[1], v0.b[0]
; CHECK-GI-NEXT: mov v1.b[2], v0.b[0]
; CHECK-GI-NEXT: mov v1.b[3], v0.b[0]
; CHECK-GI-NEXT: fmov w0, s1
; CHECK-GI-NEXT: ret
%c = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 0, i32 0, i32 0, i32 0>
%d = bitcast <4 x i8> %c to i32
Expand Down Expand Up @@ -433,8 +420,8 @@ define i32 @shufflevector_v2i16_zeroes(<2 x i16> %a, <2 x i16> %b){
;
; CHECK-GI-LABEL: shufflevector_v2i16_zeroes:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov w8, s0
; CHECK-GI-NEXT: dup v0.4h, w8
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT: mov v0.h[1], v0.h[0]
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: ret
%c = shufflevector <2 x i16> %a, <2 x i16> %b, <2 x i32> <i32 0, i32 0>
Expand Down Expand Up @@ -492,20 +479,11 @@ define <3 x i8> @shufflevector_v3i8(<3 x i8> %a, <3 x i8> %b) {
;
; CHECK-GI-LABEL: shufflevector_v3i8:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: fmov s1, w1
; CHECK-GI-NEXT: adrp x8, .LCPI30_0
; CHECK-GI-NEXT: fmov s2, w3
; CHECK-GI-NEXT: fmov s3, w4
; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
; CHECK-GI-NEXT: fmov s0, w1
; CHECK-GI-NEXT: fmov s1, w2
; CHECK-GI-NEXT: mov v2.b[1], v3.b[0]
; CHECK-GI-NEXT: fmov s3, w5
; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
; CHECK-GI-NEXT: fmov s1, w4
; CHECK-GI-NEXT: mov v0.b[2], v1.b[0]
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI30_0]
; CHECK-GI-NEXT: mov v2.b[2], v3.b[0]
; CHECK-GI-NEXT: mov v0.d[1], v2.d[0]
; CHECK-GI-NEXT: tbl v0.16b, { v0.16b }, v1.16b
; CHECK-GI-NEXT: mov b1, v0.b[1]
; CHECK-GI-NEXT: mov b2, v0.b[2]
; CHECK-GI-NEXT: fmov w0, s0
Expand Down Expand Up @@ -614,11 +592,14 @@ define <3 x i8> @shufflevector_v3i8_zeroes(<3 x i8> %a, <3 x i8> %b) {
;
; CHECK-GI-LABEL: shufflevector_v3i8_zeroes:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: dup v0.8b, w0
; CHECK-GI-NEXT: mov b1, v0.b[1]
; CHECK-GI-NEXT: mov b2, v0.b[2]
; CHECK-GI-NEXT: fmov w0, s0
; CHECK-GI-NEXT: fmov w1, s1
; CHECK-GI-NEXT: fmov s0, w0
; CHECK-GI-NEXT: mov v1.16b, v0.16b
; CHECK-GI-NEXT: mov v1.b[1], v0.b[0]
; CHECK-GI-NEXT: mov v1.b[2], v0.b[0]
; CHECK-GI-NEXT: mov b0, v1.b[1]
; CHECK-GI-NEXT: mov b2, v1.b[2]
; CHECK-GI-NEXT: fmov w0, s1
; CHECK-GI-NEXT: fmov w1, s0
; CHECK-GI-NEXT: fmov w2, s2
; CHECK-GI-NEXT: ret
%c = shufflevector <3 x i8> %a, <3 x i8> %b, <3 x i32> <i32 0, i32 0, i32 0>
Expand Down
Loading