From 4952addb0c00546fcf8a9d6b0ca7aec3f4243cca Mon Sep 17 00:00:00 2001
From: Nekotekina
Date: Tue, 19 Jun 2018 21:38:53 +0300
Subject: [PATCH] X86: optimize VSELECT for v16i8 with shl + sign bit test

---
 lib/Target/X86/X86ISelLowering.cpp | 76 ++++++++++++++++++++++++++++++
 1 file changed, 76 insertions(+)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 6c7e316686fb..db8cee65cf9e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -31858,6 +31858,82 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
   if (!TLI.isTypeLegal(VT))
     return SDValue();
 
+  // Simplify VSELECT if the condition is a sign test, i.e. a signed compare
+  // of a byte vector against zero with SETLT or SETGE. Such a compare only
+  // depends on the sign bit of each byte, so the SETCC is dropped and its
+  // non-zero operand is used directly as the X86ISD::SHRUNKBLEND condition.
+  if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+      ((Subtarget.hasSSE2() && VT == MVT::v16i8) ||
+       (Subtarget.hasAVX2() && VT == MVT::v32i8))) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+    SDValue C0 = Cond.getOperand(0);
+    SDValue C1 = Cond.getOperand(1);
+
+    // Check if one of the operands of the SETCC is a zero vector. If it's on
+    // the left side, swap the predicate's operands so that the logic below
+    // only has to handle "Other <CC> 0". The predicate must be swapped, not
+    // inverted: "0 < X" means "X > 0", whereas inverting it would yield
+    // "X >= 0" and select the wrong arm for X == 0.
+    SDValue Other;
+    if (ISD::isBuildVectorAllZeros(C0.getNode())) {
+      Other = C1;
+      CC = ISD::getSetCCSwappedOperands(CC);
+    } else if (ISD::isBuildVectorAllZeros(C1.getNode())) {
+      Other = C0;
+    }
+
+    if (Other.getNode() && Other.getValueType() == VT &&
+        (CC == ISD::SETLT || CC == ISD::SETGE)) {
+      SDValue NextOp = peekThroughBitcasts(Other);
+
+      // Try to lower v16i8 SHL by const to v8i16 shift (with AND).
+      if (NextOp.getOpcode() == ISD::SHL && NextOp.getValueType() == VT) {
+        SDValue ShAmt = NextOp.getOperand(1);
+        if (ISD::isBuildVectorOfConstantSDNodes(ShAmt.getNode())) {
+          if (SDValue ShiftOp = LowerShift(NextOp, Subtarget, DAG))
+            NextOp = ShiftOp;
+        }
+      }
+
+      // Try to eliminate an unnecessary AND: if the constant mask keeps the
+      // sign bit of every byte, the AND cannot affect the sign test and its
+      // first operand can serve as the blend condition instead.
+      if (NextOp.getOpcode() == ISD::AND) {
+        SDValue M = NextOp.getOperand(1);
+        EVT MT = M.getValueType();
+        unsigned EltBits = MT.getScalarSizeInBits();
+        // Elements wider than 64 bits cannot be checked with a uint64_t mask.
+        if (EltBits <= 64 &&
+            ISD::isBuildVectorOfConstantSDNodes(M.getNode())) {
+          // One sign bit per byte of a mask element, e.g. 0x8080 for i16.
+          const uint64_t SignBits = 0x8080808080808080ull >> (64 - EltBits);
+          bool KeepsSignBits = true;
+
+          for (unsigned i = 0, e = M.getNumOperands(); i != e; ++i) {
+            // isBuildVectorOfConstantSDNodes also accepts undef elements;
+            // they guarantee nothing about the sign bits, so give up on them
+            // instead of asserting in cast<>.
+            auto *C = dyn_cast<ConstantSDNode>(M.getOperand(i));
+            if (!C || (C->getZExtValue() & SignBits) != SignBits) {
+              KeepsSignBits = false;
+              break;
+            }
+          }
+
+          if (KeepsSignBits) {
+            Other = peekThroughBitcasts(NextOp.getOperand(0));
+            Other = DAG.getBitcast(VT, Other);
+          }
+        }
+      }
+
+      // SETGE is the opposite of SETLT, so exchange the arms of the blend.
+      if (CC == ISD::SETGE)
+        std::swap(LHS, RHS);
+      return DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, Other, LHS, RHS);
+    }
+  }
+
   // Match VSELECTs into subs with unsigned saturation.
   if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
       // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.