From 718d50d6d03449962b14a3c8357a6ee3fa145f36 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 1 Nov 2024 17:28:37 +0000 Subject: [PATCH] [VectorCombine] foldPermuteOfBinops - prefer the new fold for matching costs. Minor tweak to #114101 - as we're reducing the instruction count, we should prefer the fold if the old/new costs are the same. --- .../Transforms/Vectorize/VectorCombine.cpp | 4 +- .../Transforms/PhaseOrdering/X86/pr50392.ll | 46 +++++++++++-------- .../Transforms/PhaseOrdering/X86/pr94546.ll | 45 +++++++++--------- 3 files changed, 54 insertions(+), 41 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 3283cc8a229e5c..025234c54956b2 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -1478,7 +1478,9 @@ bool VectorCombine::foldPermuteOfBinops(Instruction &I) { LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost << "\n"); - if (NewCost >= OldCost) + + // If costs are equal, still fold as we reduce instruction count. 
+ if (NewCost > OldCost) return false; Value *Shuf0 = Builder.CreateShuffleVector(Op00, Op01, NewMask0); diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll index 53d4b1ad96cb82..d0568f3b961fde 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll @@ -1,30 +1,39 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -O3 -S < %s | FileCheck %s --check-prefixes=SSE -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -O3 -S < %s | FileCheck %s --check-prefixes=SSE +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -O3 -S < %s | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -O3 -S < %s | FileCheck %s --check-prefixes=SSE,SSE4 ; RUN: opt -mtriple=x86_64-- -mcpu=btver2 -O3 -S < %s | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -O3 -S < %s | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE,SSE4 ; RUN: opt -mtriple=x86_64-- -mcpu=btver2 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=AVX,AVX2 define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) { -; SSE-LABEL: @PR50392( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: 
[[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; SSE-NEXT: [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2 -; SSE-NEXT: [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3 -; SSE-NEXT: [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]] -; SSE-NEXT: [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP4]], double [[ADD12]], i64 3 -; SSE-NEXT: ret <4 x double> [[SHUFFLE]] +; SSE2-LABEL: @PR50392( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; SSE2-NEXT: [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2 +; SSE2-NEXT: [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3 +; SSE2-NEXT: [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]] +; SSE2-NEXT: [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP4]], double [[ADD12]], i64 3 +; SSE2-NEXT: ret <4 x double> [[SHUFFLE]] +; +; SSE4-LABEL: @PR50392( +; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2 +; SSE4-NEXT: [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3 +; SSE4-NEXT: [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]] +; SSE4-NEXT: [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD12]], i64 3 +; SSE4-NEXT: ret <4 x double> [[SHUFFLE]] ; ; AVX1-LABEL: @PR50392( -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> 
-; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX1-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] ; AVX1-NEXT: [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2 ; AVX1-NEXT: [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3 ; AVX1-NEXT: [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]] @@ -61,3 +70,4 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) { } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX: {{.*}} +; SSE: {{.*}} diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll index 6ff68f50db1b7a..ec9bfc542e32f5 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr94546.ll @@ -1,33 +1,32 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -O3 -S < %s | FileCheck %s --check-prefixes=SSE -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -O3 -S < %s | FileCheck %s --check-prefixes=SSE +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -O3 -S < %s | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -O3 -S < %s | FileCheck %s --check-prefixes=SSE,SSE4 ; RUN: opt -mtriple=x86_64-- -mcpu=btver2 -O3 -S < %s | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -O3 -S < %s | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE -; RUN: opt 
-mtriple=x86_64-- -mcpu=x86-64-v2 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=SSE,SSE4 ; RUN: opt -mtriple=x86_64-- -mcpu=btver2 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes="default<O3>" -S < %s | FileCheck %s --check-prefixes=AVX,AVX2 define <4 x double> @PR94546(<4 x double> %a, <4 x double> %b) { -; SSE-LABEL: @PR94546( -; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; SSE-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; SSE-NEXT: ret <4 x double> [[TMP4]] +; SSE2-LABEL: @PR94546( +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] +; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> +; SSE2-NEXT: ret <4 x double> [[TMP4]] ; -; AVX1-LABEL: @PR94546( -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> -; AVX1-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]] -; AVX1-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; AVX1-NEXT: ret <4 x double> [[TMP4]] +; SSE4-LABEL: @PR94546( +; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; 
SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; SSE4-NEXT: ret <4 x double> [[TMP3]] ; -; AVX2-LABEL: @PR94546( -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX2-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; AVX2-NEXT: ret <4 x double> [[TMP3]] +; AVX-LABEL: @PR94546( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: ret <4 x double> [[TMP3]] ; %vecext = extractelement <4 x double> %a, i32 0 %vecext1 = extractelement <4 x double> %a, i32 1 @@ -49,4 +48,6 @@ define <4 x double> @PR94546(<4 x double> %a, <4 x double> %b) { ret <4 x double> %shuffle } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX: {{.*}} +; AVX1: {{.*}} +; AVX2: {{.*}} +; SSE: {{.*}}