diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3842af56e6b3d7..4c76592c42e1eb 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5616,6 +5616,15 @@ class TargetLowering : public TargetLoweringBase {
     return true;
   }
 
+  /// Expand a vector operation by splitting it into two half-length operations
+  /// and concatenating their results. Every operand of \p Node is split using
+  /// the half types of the result, so all operands are expected to have the
+  /// result type. SDValue() is returned when expansion did not happen: the
+  /// result is not a vector with a known-even element count, the two halves
+  /// differ or are not a legal type, or the operation is not legal, custom or
+  /// promoted for the half type.
+  SDValue expandVectorNaryOpBySplitting(SDNode *Node, SelectionDAG &DAG) const;
+
 private:
   SDValue foldSetCCWithAnd(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
                            const SDLoc &DL, DAGCombinerInfo &DCI) const;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 0adf3cfb34c949..ffecca78a2252c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1197,6 +1197,24 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
   case ISD::UCMP:
     Results.push_back(TLI.expandCMP(Node, DAG));
     return;
+
+  case ISD::FADD:
+  case ISD::FMUL:
+  case ISD::FMA:
+  case ISD::FDIV:
+  case ISD::FCEIL:
+  case ISD::FFLOOR:
+  case ISD::FNEARBYINT:
+  case ISD::FRINT:
+  case ISD::FROUND:
+  case ISD::FROUNDEVEN:
+  case ISD::FTRUNC:
+  case ISD::FSQRT:
+    if (SDValue Expanded = TLI.expandVectorNaryOpBySplitting(Node, DAG)) {
+      Results.push_back(Expanded);
+      return;
+    }
+    break;
   }
 
   SDValue Unrolled = DAG.UnrollVectorOp(Node);
@@ -1885,6 +1903,11 @@ void VectorLegalizer::ExpandFSUB(SDNode *Node,
       TLI.isOperationLegalOrCustom(ISD::FADD, VT))
     return; // Defer to LegalizeDAG
 
+  if (SDValue Expanded = TLI.expandVectorNaryOpBySplitting(Node, DAG)) {
+    Results.push_back(Expanded);
+    return;
+  }
+
   SDValue Tmp = DAG.UnrollVectorOp(Node);
   Results.push_back(Tmp);
 }
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index f19975557a0a77..9df0f0adcc1ea7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8440,15 +8440,18 @@ TargetLowering::createSelectForFMINNUM_FMAXNUM(SDNode *Node,
 
 SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
                                               SelectionDAG &DAG) const {
-  SDLoc dl(Node);
-  unsigned NewOp = Node->getOpcode() == ISD::FMINNUM ?
-    ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
-  EVT VT = Node->getValueType(0);
+  if (SDValue Expanded = expandVectorNaryOpBySplitting(Node, DAG))
+    return Expanded;
 
+  EVT VT = Node->getValueType(0);
   if (VT.isScalableVector())
     report_fatal_error(
         "Expanding fminnum/fmaxnum for scalable vectors is undefined.");
 
+  SDLoc dl(Node);
+  unsigned NewOp =
+      Node->getOpcode() == ISD::FMINNUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
+
   if (isOperationLegalOrCustom(NewOp, VT)) {
     SDValue Quiet0 = Node->getOperand(0);
     SDValue Quiet1 = Node->getOperand(1);
@@ -8493,6 +8496,9 @@ SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
 
 SDValue TargetLowering::expandFMINIMUM_FMAXIMUM(SDNode *N,
                                                 SelectionDAG &DAG) const {
+  if (SDValue Expanded = expandVectorNaryOpBySplitting(N, DAG))
+    return Expanded;
+
   SDLoc DL(N);
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
@@ -11920,3 +11926,37 @@ bool TargetLowering::LegalizeSetCCCondCode(SelectionDAG &DAG, EVT VT,
   }
   return false;
 }
+
+SDValue TargetLowering::expandVectorNaryOpBySplitting(SDNode *Node,
+                                                      SelectionDAG &DAG) const {
+  EVT VT = Node->getValueType(0);
+  // Despite its documentation, GetSplitDestVTs will assert if VT cannot be
+  // split into two equal parts.
+  if (!VT.isVector() || !VT.getVectorElementCount().isKnownMultipleOf(2))
+    return SDValue();
+
+  // Restrict expansion to cases where both parts can be concatenated.
+  auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT);
+  if (LoVT != HiVT || !isTypeLegal(LoVT))
+    return SDValue();
+
+  SDLoc DL(Node);
+  unsigned Opcode = Node->getOpcode();
+
+  // Don't expand if the result is likely to be unrolled anyway.
+  if (!isOperationLegalOrCustomOrPromote(Opcode, LoVT))
+    return SDValue();
+
+  SmallVector<SDValue, 4> LoOps, HiOps;
+  for (const SDValue &V : Node->op_values()) {
+    auto [Lo, Hi] = DAG.SplitVector(V, DL, LoVT, HiVT);
+    LoOps.push_back(Lo);
+    HiOps.push_back(Hi);
+  }
+
+  // The split is element-wise, so the node's flags stay valid for each half.
+  const SDNodeFlags Flags = Node->getFlags();
+  SDValue SplitOpLo = DAG.getNode(Opcode, DL, LoVT, LoOps, Flags);
+  SDValue SplitOpHi = DAG.getNode(Opcode, DL, HiVT, HiOps, Flags);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SplitOpLo, SplitOpHi);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 48e1b96d841efb..460ac79991e233 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1663,12 +1663,42 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     for (auto VT : {MVT::nxv2bf16, MVT::nxv4bf16, MVT::nxv8bf16}) {
       setOperationAction(ISD::BITCAST, VT, Custom);
       setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+      setOperationAction(ISD::FABS, VT, Legal);
+      setOperationAction(ISD::FNEG, VT, Legal);
       setOperationAction(ISD::FP_EXTEND, VT, Custom);
       setOperationAction(ISD::FP_ROUND, VT, Custom);
       setOperationAction(ISD::MLOAD, VT, Custom);
       setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::SPLAT_VECTOR, VT, Legal);
       setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
+
+      if (Subtarget->hasSVEB16B16()) {
+        setOperationAction(ISD::FADD, VT, Legal);
+        setOperationAction(ISD::FMA, VT, Custom);
+        setOperationAction(ISD::FMAXIMUM, VT, Custom);
+        setOperationAction(ISD::FMAXNUM, VT, Custom);
+        setOperationAction(ISD::FMINIMUM, VT, Custom);
+        setOperationAction(ISD::FMINNUM, VT, Custom);
+        setOperationAction(ISD::FMUL, VT, Legal);
+        setOperationAction(ISD::FSUB, VT, Legal);
+      }
+    }
+
+    for (auto Opcode :
+         {ISD::FCEIL, ISD::FDIV, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
+          ISD::FROUND, ISD::FROUNDEVEN, ISD::FSQRT, ISD::FTRUNC}) {
+      setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
+      setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
+      setOperationAction(Opcode, MVT::nxv8bf16, Expand);
+    }
+
+    if (!Subtarget->hasSVEB16B16()) {
+      for (auto Opcode : {ISD::FADD, ISD::FMA, ISD::FMAXIMUM, ISD::FMAXNUM,
+                          ISD::FMINIMUM, ISD::FMINNUM, ISD::FMUL, ISD::FSUB}) {
+        setOperationPromotedToType(Opcode, MVT::nxv2bf16, MVT::nxv2f32);
+        setOperationPromotedToType(Opcode, MVT::nxv4bf16, MVT::nxv4f32);
+        setOperationAction(Opcode, MVT::nxv8bf16, Expand);
+      }
     }
 
     setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 76362768e0aa6b..e78c67abeca30e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -663,6 +663,15 @@ let Predicates = [HasSVEorSME] in {
   defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", AArch64fabs_mt>;
   defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", AArch64fneg_mt>;
 
+  foreach VT = [nxv2bf16, nxv4bf16, nxv8bf16] in {
+    // No dedicated instruction, so just clear the sign bit.
+    def : Pat<(VT (fabs VT:$op)),
+              (AND_ZI $op, (i64 (logical_imm64_XFORM(i64 0x7fff7fff7fff7fff))))>;
+    // No dedicated instruction, so just invert the sign bit.
+    def : Pat<(VT (fneg VT:$op)),
+              (EOR_ZI $op, (i64 (logical_imm64_XFORM(i64 0x8000800080008000))))>;
+  }
+
   // zext(cmpeq(x, splat(0))) -> cnot(x)
   def : Pat<(nxv16i8 (zext (nxv16i1 (AArch64setcc_z (nxv16i1 (SVEAllActive):$Pg), nxv16i8:$Op2, (SVEDup0), SETEQ)))),
             (CNOT_ZPmZ_B $Op2, $Pg, $Op2)>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 13c2a90a963f8c..4a720270df9120 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -2299,6 +2299,8 @@ multiclass sve_fp_3op_u_zd_bfloat<bits<3> opc, string asm, SDPatternOperator op>
   def NAME : sve_fp_3op_u_zd<0b00, opc, asm, ZPR16>;
 
   def : SVE_2_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+  def : SVE_2_Op_Pat<nxv4bf16, op, nxv4bf16, nxv4bf16, !cast<Instruction>(NAME)>;
+  def : SVE_2_Op_Pat<nxv2bf16, op, nxv2bf16, nxv2bf16, !cast<Instruction>(NAME)>;
 }
 
 multiclass sve_fp_3op_u_zd_ftsmul<bits<3> opc, string asm, SDPatternOperator op> {
@@ -9078,6 +9080,8 @@ multiclass sve_fp_bin_pred_bfloat<SDPatternOperator op> {
   def _UNDEF : PredTwoOpPseudo<NAME, ZPR16, FalseLanesUndef>;
 
   def : SVE_3_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, !cast<Pseudo>(NAME # _UNDEF)>;
+  def : SVE_3_Op_Pat<nxv4bf16, op, nxv4i1, nxv4bf16, nxv4bf16, !cast<Pseudo>(NAME # _UNDEF)>;
+  def : SVE_3_Op_Pat<nxv2bf16, op, nxv2i1, nxv2bf16, nxv2bf16, !cast<Pseudo>(NAME # _UNDEF)>;
 }
 
 // Predicated pseudo floating point three operand instructions.
@@ -9099,6 +9103,8 @@ multiclass sve_fp_3op_pred_bfloat<SDPatternOperator op> {
   def _UNDEF : PredThreeOpPseudo<NAME, ZPR16, FalseLanesUndef>;
 
   def : SVE_4_Op_Pat<nxv8bf16, op, nxv8i1, nxv8bf16, nxv8bf16, nxv8bf16, !cast<Pseudo>(NAME # _UNDEF)>;
+  def : SVE_4_Op_Pat<nxv4bf16, op, nxv4i1, nxv4bf16, nxv4bf16, nxv4bf16, !cast<Pseudo>(NAME # _UNDEF)>;
+  def : SVE_4_Op_Pat<nxv2bf16, op, nxv2i1, nxv2bf16, nxv2bf16, nxv2bf16, !cast<Pseudo>(NAME # _UNDEF)>;
 }
 
 // Predicated pseudo integer two operand instructions.
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll new file mode 100644 index 00000000000000..e8468ddfeed181 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-bf16-arith.ll @@ -0,0 +1,752 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16 +; RUN: llc -mattr=+sve,+bf16,+sve-b16b16 < %s | FileCheck %s --check-prefixes=CHECK,B16B16 +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,NOB16B16 +; RUN: llc -mattr=+sme,+sve-b16b16 -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,B16B16 + +target triple = "aarch64-unknown-linux-gnu" + +; +; FABS +; + +define @fabs_nxv2bf16( %a) { +; CHECK-LABEL: fabs_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.h, z0.h, #0x7fff +; CHECK-NEXT: ret + %res = call @llvm.fabs.nxv2bf16( %a) + ret %res +} + +define @fabs_nxv4bf16( %a) { +; CHECK-LABEL: fabs_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.h, z0.h, #0x7fff +; CHECK-NEXT: ret + %res = call @llvm.fabs.nxv4bf16( %a) + ret %res +} + +define @fabs_nxv8bf16( %a) { +; CHECK-LABEL: fabs_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: and z0.h, z0.h, #0x7fff +; CHECK-NEXT: ret + %res = call @llvm.fabs.nxv8bf16( %a) + ret %res +} + +; +; FADD +; + +define @fadd_nxv2bf16( %a, %b) { +; NOB16B16-LABEL: fadd_nxv2bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.d +; NOB16B16-NEXT: fadd z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fadd_nxv2bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: bfadd z0.h, z0.h, z1.h +; B16B16-NEXT: ret + %res = fadd %a, %b + ret %res +} + +define @fadd_nxv4bf16( %a, %b) { +; NOB16B16-LABEL: fadd_nxv4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, 
#16 +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: fadd z0.s, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fadd_nxv4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: bfadd z0.h, z0.h, z1.h +; B16B16-NEXT: ret + %res = fadd %a, %b + ret %res +} + +define @fadd_nxv8bf16( %a, %b) { +; NOB16B16-LABEL: fadd_nxv8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: uunpkhi z2.s, z1.h +; NOB16B16-NEXT: uunpkhi z3.s, z0.h +; NOB16B16-NEXT: uunpklo z1.s, z1.h +; NOB16B16-NEXT: uunpklo z0.s, z0.h +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: lsl z2.s, z2.s, #16 +; NOB16B16-NEXT: lsl z3.s, z3.s, #16 +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: fadd z2.s, z3.s, z2.s +; NOB16B16-NEXT: fadd z0.s, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fadd_nxv8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: bfadd z0.h, z0.h, z1.h +; B16B16-NEXT: ret + %res = fadd %a, %b + ret %res +} + +; +; FDIV +; + +define @fdiv_nxv2bf16( %a, %b) { +; CHECK-LABEL: fdiv_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = fdiv %a, %b + ret %res +} + +define @fdiv_nxv4bf16( %a, %b) { +; CHECK-LABEL: fdiv_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = fdiv %a, %b + ret %res +} + +define @fdiv_nxv8bf16( %a, %b) { +; CHECK-LABEL: fdiv_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpkhi z3.s, z0.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: 
lsl z2.s, z2.s, #16 +; CHECK-NEXT: lsl z3.s, z3.s, #16 +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: fdivr z2.s, p0/m, z2.s, z3.s +; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s +; CHECK-NEXT: bfcvt z1.h, p0/m, z2.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = fdiv %a, %b + ret %res +} + +; +; FMAX +; + +define @fmax_nxv2bf16( %a, %b) { +; NOB16B16-LABEL: fmax_nxv2bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.d +; NOB16B16-NEXT: fmax z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmax_nxv2bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.d +; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: ret + %res = call @llvm.maximum.nxv2bf16( %a, %b) + ret %res +} + +define @fmax_nxv4bf16( %a, %b) { +; NOB16B16-LABEL: fmax_nxv4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: fmax z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmax_nxv4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.s +; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: ret + %res = call @llvm.maximum.nxv4bf16( %a, %b) + ret %res +} + +define @fmax_nxv8bf16( %a, %b) { +; NOB16B16-LABEL: fmax_nxv8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: uunpkhi z2.s, z1.h +; NOB16B16-NEXT: uunpkhi z3.s, z0.h +; NOB16B16-NEXT: uunpklo z1.s, z1.h +; NOB16B16-NEXT: uunpklo z0.s, z0.h +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: lsl z2.s, z2.s, #16 +; NOB16B16-NEXT: lsl z3.s, z3.s, #16 +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: fmax z2.s, p0/m, z2.s, z3.s +; NOB16B16-NEXT: fmax z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s +; NOB16B16-NEXT: 
bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmax_nxv8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h +; B16B16-NEXT: bfmax z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: ret + %res = call @llvm.maximum.nxv8bf16( %a, %b) + ret %res +} + +; +; FMAXNM +; + +define @fmaxnm_nxv2bf16( %a, %b) { +; NOB16B16-LABEL: fmaxnm_nxv2bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.d +; NOB16B16-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmaxnm_nxv2bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.d +; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: ret + %res = call @llvm.maxnum.nxv2bf16( %a, %b) + ret %res +} + +define @fmaxnm_nxv4bf16( %a, %b) { +; NOB16B16-LABEL: fmaxnm_nxv4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmaxnm_nxv4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.s +; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: ret + %res = call @llvm.maxnum.nxv4bf16( %a, %b) + ret %res +} + +define @fmaxnm_nxv8bf16( %a, %b) { +; NOB16B16-LABEL: fmaxnm_nxv8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: uunpkhi z2.s, z1.h +; NOB16B16-NEXT: uunpkhi z3.s, z0.h +; NOB16B16-NEXT: uunpklo z1.s, z1.h +; NOB16B16-NEXT: uunpklo z0.s, z0.h +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: lsl z2.s, z2.s, #16 +; NOB16B16-NEXT: lsl z3.s, z3.s, #16 +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: fmaxnm z2.s, p0/m, z2.s, z3.s +; NOB16B16-NEXT: fmaxnm z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: uzp1 z0.h, z0.h, 
z1.h +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmaxnm_nxv8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h +; B16B16-NEXT: bfmaxnm z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: ret + %res = call @llvm.maxnum.nxv8bf16( %a, %b) + ret %res +} + +; +; FMIN +; + +define @fmin_nxv2bf16( %a, %b) { +; NOB16B16-LABEL: fmin_nxv2bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.d +; NOB16B16-NEXT: fmin z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmin_nxv2bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.d +; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: ret + %res = call @llvm.minimum.nxv2bf16( %a, %b) + ret %res +} + +define @fmin_nxv4bf16( %a, %b) { +; NOB16B16-LABEL: fmin_nxv4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: fmin z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmin_nxv4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.s +; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: ret + %res = call @llvm.minimum.nxv4bf16( %a, %b) + ret %res +} + +define @fmin_nxv8bf16( %a, %b) { +; NOB16B16-LABEL: fmin_nxv8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: uunpkhi z2.s, z1.h +; NOB16B16-NEXT: uunpkhi z3.s, z0.h +; NOB16B16-NEXT: uunpklo z1.s, z1.h +; NOB16B16-NEXT: uunpklo z0.s, z0.h +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: lsl z2.s, z2.s, #16 +; NOB16B16-NEXT: lsl z3.s, z3.s, #16 +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: fmin z2.s, p0/m, z2.s, z3.s +; NOB16B16-NEXT: fmin z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmin_nxv8bf16: +; B16B16: // %bb.0: 
+; B16B16-NEXT: ptrue p0.h +; B16B16-NEXT: bfmin z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: ret + %res = call @llvm.minimum.nxv8bf16( %a, %b) + ret %res +} + +; +; FMINNM +; + +define @fminnm_nxv2bf16( %a, %b) { +; NOB16B16-LABEL: fminnm_nxv2bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.d +; NOB16B16-NEXT: fminnm z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fminnm_nxv2bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.d +; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: ret + %res = call @llvm.minnum.nxv2bf16( %a, %b) + ret %res +} + +define @fminnm_nxv4bf16( %a, %b) { +; NOB16B16-LABEL: fminnm_nxv4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: fminnm z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fminnm_nxv4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.s +; B16B16-NEXT: bfminnm z0.h, p0/m, z0.h, z1.h +; B16B16-NEXT: ret + %res = call @llvm.minnum.nxv4bf16( %a, %b) + ret %res +} + +define @fminnm_nxv8bf16( %a, %b) { +; NOB16B16-LABEL: fminnm_nxv8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: uunpkhi z2.s, z1.h +; NOB16B16-NEXT: uunpkhi z3.s, z0.h +; NOB16B16-NEXT: uunpklo z1.s, z1.h +; NOB16B16-NEXT: uunpklo z0.s, z0.h +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: lsl z2.s, z2.s, #16 +; NOB16B16-NEXT: lsl z3.s, z3.s, #16 +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: fminnm z2.s, p0/m, z2.s, z3.s +; NOB16B16-NEXT: fminnm z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fminnm_nxv8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h +; B16B16-NEXT: bfminnm z0.h, 
p0/m, z0.h, z1.h +; B16B16-NEXT: ret + %res = call @llvm.minnum.nxv8bf16( %a, %b) + ret %res +} + +; +; FMLA +; + +define @fmla_nxv2bf16( %a, %b, %c) { +; NOB16B16-LABEL: fmla_nxv2bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: lsl z2.s, z2.s, #16 +; NOB16B16-NEXT: ptrue p0.d +; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmla_nxv2bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.d +; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; B16B16-NEXT: ret + %res = call @llvm.fma.nxv2bf16( %a, %b, %c) + ret %res +} + +define @fmla_nxv4bf16( %a, %b, %c) { +; NOB16B16-LABEL: fmla_nxv4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: lsl z2.s, z2.s, #16 +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmla_nxv4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.s +; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; B16B16-NEXT: ret + %res = call @llvm.fma.nxv4bf16( %a, %b, %c) + ret %res +} + +define @fmla_nxv8bf16( %a, %b, %c) { +; NOB16B16-LABEL: fmla_nxv8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: uunpkhi z3.s, z1.h +; NOB16B16-NEXT: uunpkhi z4.s, z0.h +; NOB16B16-NEXT: uunpkhi z5.s, z2.h +; NOB16B16-NEXT: uunpklo z1.s, z1.h +; NOB16B16-NEXT: uunpklo z0.s, z0.h +; NOB16B16-NEXT: uunpklo z2.s, z2.h +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: lsl z3.s, z3.s, #16 +; NOB16B16-NEXT: lsl z4.s, z4.s, #16 +; NOB16B16-NEXT: lsl z5.s, z5.s, #16 +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: lsl z2.s, z2.s, #16 +; NOB16B16-NEXT: fmad z3.s, p0/m, z4.s, z5.s +; NOB16B16-NEXT: fmad z0.s, p0/m, z1.s, z2.s +; NOB16B16-NEXT: bfcvt z1.h, p0/m, z3.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: uzp1 
z0.h, z0.h, z1.h +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmla_nxv8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: ptrue p0.h +; B16B16-NEXT: bfmla z0.h, p0/m, z1.h, z2.h +; B16B16-NEXT: ret + %res = call @llvm.fma.nxv8bf16( %a, %b, %c) + ret %res +} + +; +; FMUL +; + +define @fmul_nxv2bf16( %a, %b) { +; NOB16B16-LABEL: fmul_nxv2bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.d +; NOB16B16-NEXT: fmul z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmul_nxv2bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: bfmul z0.h, z0.h, z1.h +; B16B16-NEXT: ret + %res = fmul %a, %b + ret %res +} + +define @fmul_nxv4bf16( %a, %b) { +; NOB16B16-LABEL: fmul_nxv4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: fmul z0.s, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmul_nxv4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: bfmul z0.h, z0.h, z1.h +; B16B16-NEXT: ret + %res = fmul %a, %b + ret %res +} + +define @fmul_nxv8bf16( %a, %b) { +; NOB16B16-LABEL: fmul_nxv8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: uunpkhi z2.s, z1.h +; NOB16B16-NEXT: uunpkhi z3.s, z0.h +; NOB16B16-NEXT: uunpklo z1.s, z1.h +; NOB16B16-NEXT: uunpklo z0.s, z0.h +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: lsl z2.s, z2.s, #16 +; NOB16B16-NEXT: lsl z3.s, z3.s, #16 +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: fmul z2.s, z3.s, z2.s +; NOB16B16-NEXT: fmul z0.s, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fmul_nxv8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: bfmul z0.h, z0.h, z1.h +; B16B16-NEXT: ret + %res = fmul %a, %b + ret %res +} + +; +; FNEG +; + +define 
@fneg_nxv2bf16( %a) { +; CHECK-LABEL: fneg_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: eor z0.h, z0.h, #0x8000 +; CHECK-NEXT: ret + %res = fneg %a + ret %res +} + +define @fneg_nxv4bf16( %a) { +; CHECK-LABEL: fneg_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: eor z0.h, z0.h, #0x8000 +; CHECK-NEXT: ret + %res = fneg %a + ret %res +} + +define @fneg_nxv8bf16( %a) { +; CHECK-LABEL: fneg_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: eor z0.h, z0.h, #0x8000 +; CHECK-NEXT: ret + %res = fneg %a + ret %res +} + +; +; FSQRT +; + +define @fsqrt_nxv2bf16( %a) { +; CHECK-LABEL: fsqrt_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.sqrt.nxv2bf16( %a) + ret %res +} + +define @fsqrt_nxv4bf16( %a) { +; CHECK-LABEL: fsqrt_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.sqrt.nxv4bf16( %a) + ret %res +} + +define @fsqrt_nxv8bf16( %a) { +; CHECK-LABEL: fsqrt_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: fsqrt z1.s, p0/m, z1.s +; CHECK-NEXT: fsqrt z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.sqrt.nxv8bf16( %a) + ret %res +} + +; +; FSUB +; + +define @fsub_nxv2bf16( %a, %b) { +; NOB16B16-LABEL: fsub_nxv2bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.d +; NOB16B16-NEXT: fsub z0.s, p0/m, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fsub_nxv2bf16: +; B16B16: // %bb.0: 
+; B16B16-NEXT: bfsub z0.h, z0.h, z1.h +; B16B16-NEXT: ret + %res = fsub %a, %b + ret %res +} + +define @fsub_nxv4bf16( %a, %b) { +; NOB16B16-LABEL: fsub_nxv4bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: fsub z0.s, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fsub_nxv4bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: bfsub z0.h, z0.h, z1.h +; B16B16-NEXT: ret + %res = fsub %a, %b + ret %res +} + +define @fsub_nxv8bf16( %a, %b) { +; NOB16B16-LABEL: fsub_nxv8bf16: +; NOB16B16: // %bb.0: +; NOB16B16-NEXT: uunpkhi z2.s, z1.h +; NOB16B16-NEXT: uunpkhi z3.s, z0.h +; NOB16B16-NEXT: uunpklo z1.s, z1.h +; NOB16B16-NEXT: uunpklo z0.s, z0.h +; NOB16B16-NEXT: ptrue p0.s +; NOB16B16-NEXT: lsl z2.s, z2.s, #16 +; NOB16B16-NEXT: lsl z3.s, z3.s, #16 +; NOB16B16-NEXT: lsl z1.s, z1.s, #16 +; NOB16B16-NEXT: lsl z0.s, z0.s, #16 +; NOB16B16-NEXT: fsub z2.s, z3.s, z2.s +; NOB16B16-NEXT: fsub z0.s, z0.s, z1.s +; NOB16B16-NEXT: bfcvt z1.h, p0/m, z2.s +; NOB16B16-NEXT: bfcvt z0.h, p0/m, z0.s +; NOB16B16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOB16B16-NEXT: ret +; +; B16B16-LABEL: fsub_nxv8bf16: +; B16B16: // %bb.0: +; B16B16-NEXT: bfsub z0.h, z0.h, z1.h +; B16B16-NEXT: ret + %res = fsub %a, %b + ret %res +} + +declare @llvm.fabs.nxv2bf16() +declare @llvm.fabs.nxv4bf16() +declare @llvm.fabs.nxv8bf16() + +declare @llvm.fma.nxv2bf16(, , ) +declare @llvm.fma.nxv4bf16(, , ) +declare @llvm.fma.nxv8bf16(, , ) + +declare @llvm.maximum.nxv2bf16(, ) +declare @llvm.maximum.nxv4bf16(, ) +declare @llvm.maximum.nxv8bf16(, ) + +declare @llvm.maxnum.nxv2bf16(, ) +declare @llvm.maxnum.nxv4bf16(, ) +declare @llvm.maxnum.nxv8bf16(, ) + +declare @llvm.minimum.nxv2bf16(, ) +declare @llvm.minimum.nxv4bf16(, ) +declare @llvm.minimum.nxv8bf16(, ) + +declare @llvm.minnum.nxv2bf16(, ) +declare @llvm.minnum.nxv4bf16(, ) +declare @llvm.minnum.nxv8bf16(, ) + +declare 
@llvm.sqrt.nxv2bf16() +declare @llvm.sqrt.nxv4bf16() +declare @llvm.sqrt.nxv8bf16() diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-rounding.ll b/llvm/test/CodeGen/AArch64/sve-bf16-rounding.ll new file mode 100644 index 00000000000000..65d273d1209827 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-bf16-rounding.ll @@ -0,0 +1,355 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; +; FCEIL +; + +define @frintp_nxv2bf16( %a) { +; CHECK-LABEL: frintp_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: frintp z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.ceil.nxv2bf16( %a) + ret %res +} + +define @frintp_nxv4bf16( %a) { +; CHECK-LABEL: frintp_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: frintp z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.ceil.nxv4bf16( %a) + ret %res +} + +define @frintp_nxv8bf16( %a) { +; CHECK-LABEL: frintp_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: frintp z1.s, p0/m, z1.s +; CHECK-NEXT: frintp z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.ceil.nxv8bf16( %a) + ret %res +} + +; +; FFLOOR +; + +define @frintm_nxv2bf16( %a) { +; CHECK-LABEL: frintm_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: frintm z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call 
@llvm.floor.nxv2bf16( %a) + ret %res +} + +define @frintm_nxv4bf16( %a) { +; CHECK-LABEL: frintm_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: frintm z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.floor.nxv4bf16( %a) + ret %res +} + +define @frintm_nxv8bf16( %a) { +; CHECK-LABEL: frintm_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: frintm z1.s, p0/m, z1.s +; CHECK-NEXT: frintm z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.floor.nxv8bf16( %a) + ret %res +} + +; +; FNEARBYINT +; + +define @frinti_nxv2bf16( %a) { +; CHECK-LABEL: frinti_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: frinti z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.nearbyint.nxv2bf16( %a) + ret %res +} + +define @frinti_nxv4bf16( %a) { +; CHECK-LABEL: frinti_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: frinti z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.nearbyint.nxv4bf16( %a) + ret %res +} + +define @frinti_nxv8bf16( %a) { +; CHECK-LABEL: frinti_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: frinti z1.s, p0/m, z1.s +; CHECK-NEXT: frinti z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.nearbyint.nxv8bf16( %a) + ret %res +} + 
+; +; FRINT +; + +define @frintx_nxv2bf16( %a) { +; CHECK-LABEL: frintx_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: frintx z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.rint.nxv2bf16( %a) + ret %res +} + +define @frintx_nxv4bf16( %a) { +; CHECK-LABEL: frintx_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: frintx z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.rint.nxv4bf16( %a) + ret %res +} + +define @frintx_nxv8bf16( %a) { +; CHECK-LABEL: frintx_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: frintx z1.s, p0/m, z1.s +; CHECK-NEXT: frintx z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.rint.nxv8bf16( %a) + ret %res +} + +; +; ROUND +; + +define @frinta_nxv2bf16( %a) { +; CHECK-LABEL: frinta_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: frinta z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.round.nxv2bf16( %a) + ret %res +} + +define @frinta_nxv4bf16( %a) { +; CHECK-LABEL: frinta_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: frinta z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.round.nxv4bf16( %a) + ret %res +} + +define @frinta_nxv8bf16( %a) { +; CHECK-LABEL: frinta_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; 
CHECK-NEXT: frinta z1.s, p0/m, z1.s +; CHECK-NEXT: frinta z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.round.nxv8bf16( %a) + ret %res +} + +; +; ROUNDEVEN +; + +define @frintn_nxv2bf16( %a) { +; CHECK-LABEL: frintn_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: frintn z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.roundeven.nxv2bf16( %a) + ret %res +} + +define @frintn_nxv4bf16( %a) { +; CHECK-LABEL: frintn_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: frintn z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.roundeven.nxv4bf16( %a) + ret %res +} + +define @frintn_nxv8bf16( %a) { +; CHECK-LABEL: frintn_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: frintn z1.s, p0/m, z1.s +; CHECK-NEXT: frintn z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.roundeven.nxv8bf16( %a) + ret %res +} + +; +; FTRUNC +; + +define @frintz_nxv2bf16( %a) { +; CHECK-LABEL: frintz_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: frintz z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call @llvm.trunc.nxv2bf16( %a) + ret %res +} + +define @frintz_nxv4bf16( %a) { +; CHECK-LABEL: frintz_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: frintz z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: ret + %res = call 
@llvm.trunc.nxv4bf16( %a) + ret %res +} + +define @frintz_nxv8bf16( %a) { +; CHECK-LABEL: frintz_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: lsl z1.s, z1.s, #16 +; CHECK-NEXT: lsl z0.s, z0.s, #16 +; CHECK-NEXT: frintz z1.s, p0/m, z1.s +; CHECK-NEXT: frintz z0.s, p0/m, z0.s +; CHECK-NEXT: bfcvt z1.h, p0/m, z1.s +; CHECK-NEXT: bfcvt z0.h, p0/m, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: ret + %res = call @llvm.trunc.nxv8bf16( %a) + ret %res +} + +declare @llvm.ceil.nxv2bf16( ) +declare @llvm.ceil.nxv4bf16( ) +declare @llvm.ceil.nxv8bf16( ) + +declare @llvm.floor.nxv2bf16( ) +declare @llvm.floor.nxv4bf16( ) +declare @llvm.floor.nxv8bf16( ) + +declare @llvm.nearbyint.nxv2bf16( ) +declare @llvm.nearbyint.nxv4bf16( ) +declare @llvm.nearbyint.nxv8bf16( ) + +declare @llvm.rint.nxv2bf16( ) +declare @llvm.rint.nxv4bf16( ) +declare @llvm.rint.nxv8bf16( ) + +declare @llvm.round.nxv2bf16( ) +declare @llvm.round.nxv4bf16( ) +declare @llvm.round.nxv8bf16( ) + +declare @llvm.roundeven.nxv2bf16( ) +declare @llvm.roundeven.nxv4bf16( ) +declare @llvm.roundeven.nxv8bf16( ) + +declare @llvm.trunc.nxv2bf16( ) +declare @llvm.trunc.nxv4bf16( ) +declare @llvm.trunc.nxv8bf16( ) diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll index 5b7f0e72b70da5..95d579be04ed27 100644 --- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -955,13 +955,10 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] ; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v1, 0 +; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, 0 ; SDAG-GFX1100-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v6, 0 -; SDAG-GFX1100-NEXT: v_pk_max_f16 v2, v0, 0 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v1, 1.0 op_sel_hi:[1,0] -; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v2, 1.0 op_sel_hi:[1,0] +; SDAG-GFX1100-NEXT: v_pk_max_f16 v0, v6, v6 clamp +; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt: @@ -971,10 +968,8 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] ; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0 ; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, 0 -; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0 -; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] -; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, v6 clamp +; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt: @@ -984,10 +979,8 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s ; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1] ; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0 ; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, 0 -; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0 -; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] -; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, v6 clamp +; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp ; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt: @@ -1162,11 
+1155,8 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pk_max_f16 v0, v6, 0 -; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v7, 0 -; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] -; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX1100-NEXT: v_pk_max_f16 v0, v6, v6 clamp +; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v7, v7 clamp ; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt: @@ -1176,10 +1166,8 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] ; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v7, 0 -; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0 -; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] -; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, v6 clamp +; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v7, v7 clamp ; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt: @@ -1189,10 +1177,8 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s ; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] ; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] ; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] -; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v7, 0 
-; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0 -; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0] -; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0] +; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, v6 clamp +; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v7, v7 clamp ; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31] ; ; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt: