diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
index 84a2673fecb5bf..4383249658e606 100644
--- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -35,6 +35,7 @@ struct ComplexDeinterleavingPass
 enum class ComplexDeinterleavingOperation {
   CAdd,
   CMulPartial,
+  CDot,
   // The following 'operations' are used to represent internal states. Backends
   // are not expected to try and support these in any capacity.
   Deinterleave,
@@ -43,6 +44,7 @@ enum class ComplexDeinterleavingOperation {
   ReductionPHI,
   ReductionOperation,
   ReductionSelect,
+  ReductionSingle
 };
 
 enum class ComplexDeinterleavingRotation {
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index f3f7ea9407b46f..aec8df962ffb7c 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -108,6 +108,13 @@ static bool isNeg(Value *V);
 static Value *getNegOperand(Value *V);
 
 namespace {
+template <typename T, typename IterT>
+std::optional<T> findCommonBetweenCollections(IterT A, IterT B) {
+  auto Common = llvm::find_if(A, [B](T I) { return llvm::is_contained(B, I); });
+  if (Common != A.end())
+    return std::make_optional(*Common);
+  return std::nullopt;
+}
 
 class ComplexDeinterleavingLegacyPass : public FunctionPass {
 public:
@@ -144,6 +151,7 @@ struct ComplexDeinterleavingCompositeNode {
   friend class ComplexDeinterleavingGraph;
   using NodePtr = std::shared_ptr<ComplexDeinterleavingCompositeNode>;
   using RawNodePtr = ComplexDeinterleavingCompositeNode *;
+  bool OperandsValid = true;
 
 public:
   ComplexDeinterleavingOperation Operation;
@@ -160,7 +168,11 @@ struct ComplexDeinterleavingCompositeNode {
   SmallVector<RawNodePtr> Operands;
   Value *ReplacementNode = nullptr;
 
-  void addOperand(NodePtr Node) { Operands.push_back(Node.get()); }
+  void addOperand(NodePtr Node) {
+    if (!Node || !Node.get())
+      OperandsValid = false;
+    Operands.push_back(Node.get());
+  }
 
   void dump() { dump(dbgs()); }
   void dump(raw_ostream &OS) {
@@ -194,6 +206,8 @@ struct ComplexDeinterleavingCompositeNode {
       PrintNodeRef(Op);
     }
   }
+
+  bool areOperandsValid() { return OperandsValid; }
 };
 
 class ComplexDeinterleavingGraph {
@@ -293,7 +307,7 @@ class ComplexDeinterleavingGraph {
 
   NodePtr submitCompositeNode(NodePtr Node) {
     CompositeNodes.push_back(Node);
-    if (Node->Real && Node->Imag)
+    if (Node->Real)
      CachedResult[{Node->Real, Node->Imag}] = Node;
     return Node;
   }
@@ -327,6 +341,8 @@ class ComplexDeinterleavingGraph {
   /// i: ai - br
   NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
   NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
+  NodePtr identifyPartialReduction(Value *R, Value *I);
+  NodePtr identifyDotProduct(Value *Inst);
 
   NodePtr identifyNode(Value *R, Value *I);
 
@@ -396,6 +412,7 @@ class ComplexDeinterleavingGraph {
   /// * Deinterleave the final value outside of the loop and repurpose original
   /// reduction users
   void processReductionOperation(Value *OperationReplacement, RawNodePtr Node);
+  void processReductionSingle(Value *OperationReplacement, RawNodePtr Node);
 
 public:
   void dump() { dump(dbgs()); }
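The new helper returns the first element that two ranges have in common; identifyPartialReduction (further down) uses it to find the one partial-reduce intrinsic that consumes both the real and imaginary multiply chains. A minimal usage sketch, not part of the patch and with illustrative values:

  // Hypothetical illustration only: Common receives 3, the first element
  // of A that also appears in B; an empty optional means no overlap.
  llvm::SmallVector<int> A = {1, 2, 3};
  llvm::SmallVector<int> B = {9, 3};
  std::optional<int> Common = findCommonBetweenCollections<int>(A, B);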
@@ -891,17 +908,163 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
 }
 
 ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
-  LLVM_DEBUG(dbgs() << "identifyNode on " << *R << " / " << *I << "\n");
-  assert(R->getType() == I->getType() &&
-         "Real and imaginary parts should not have different types");
+ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
+
+  if (!TL->isComplexDeinterleavingOperationSupported(
+          ComplexDeinterleavingOperation::CDot, V->getType())) {
+    LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving "
+                         "operation CDot with the type "
+                      << *V->getType() << "\n");
+    return nullptr;
+  }
+
+  auto *Inst = cast<Instruction>(V);
+  auto *RealUser = cast<Instruction>(*Inst->user_begin());
+
+  NodePtr CN =
+      prepareCompositeNode(ComplexDeinterleavingOperation::CDot, Inst, nullptr);
+
+  NodePtr ANode;
+
+  const Intrinsic::ID PartialReduceInt =
+      Intrinsic::experimental_vector_partial_reduce_add;
+
+  Value *AReal = nullptr;
+  Value *AImag = nullptr;
+  Value *BReal = nullptr;
+  Value *BImag = nullptr;
+  Value *Phi = nullptr;
+
+  auto UnwrapCast = [](Value *V) -> Value * {
+    if (auto *CI = dyn_cast<CastInst>(V))
+      return CI->getOperand(0);
+    return V;
+  };
+
+  auto PatternRot0 = m_Intrinsic<PartialReduceInt>(
+      m_Intrinsic<PartialReduceInt>(m_Value(Phi),
+                                    m_Mul(m_Value(BReal), m_Value(AReal))),
+      m_Neg(m_Mul(m_Value(BImag), m_Value(AImag))));
+
+  auto PatternRot270 = m_Intrinsic<PartialReduceInt>(
+      m_Intrinsic<PartialReduceInt>(
+          m_Value(Phi), m_Neg(m_Mul(m_Value(BReal), m_Value(AImag)))),
+      m_Mul(m_Value(BImag), m_Value(AReal)));
+
+  if (match(Inst, PatternRot0)) {
+    CN->Rotation = ComplexDeinterleavingRotation::Rotation_0;
+  } else if (match(Inst, PatternRot270)) {
+    CN->Rotation = ComplexDeinterleavingRotation::Rotation_270;
+  } else {
+    Value *A0, *A1;
+    // The rotations 90 and 180 share the same operation pattern, so inspect
+    // the order of the operands, identifying where the real and imaginary
+    // components of A go, to discern between the aforementioned rotations.
+    auto PatternRot90Rot180 = m_Intrinsic<PartialReduceInt>(
+        m_Intrinsic<PartialReduceInt>(m_Value(Phi),
+                                      m_Mul(m_Value(BReal), m_Value(A0))),
+        m_Mul(m_Value(BImag), m_Value(A1)));
+
+    if (!match(Inst, PatternRot90Rot180))
+      return nullptr;
+
+    A0 = UnwrapCast(A0);
+    A1 = UnwrapCast(A1);
+
+    // Test if A0 is real/A1 is imag
+    ANode = identifyNode(A0, A1);
+    if (!ANode) {
+      // Test if A0 is imag/A1 is real
+      ANode = identifyNode(A1, A0);
+      // Unable to identify operand components, thus unable to identify
+      // rotation
+      if (!ANode)
+        return nullptr;
+      CN->Rotation = ComplexDeinterleavingRotation::Rotation_90;
+      AReal = A1;
+      AImag = A0;
+    } else {
+      AReal = A0;
+      AImag = A1;
+      CN->Rotation = ComplexDeinterleavingRotation::Rotation_180;
+    }
+  }
+
+  AReal = UnwrapCast(AReal);
+  AImag = UnwrapCast(AImag);
+  BReal = UnwrapCast(BReal);
+  BImag = UnwrapCast(BImag);
+
+  VectorType *VTy = cast<VectorType>(V->getType());
+  Type *ExpectedOperandTy = VectorType::getSubdividedVectorType(VTy, 2);
+  if (AReal->getType() != ExpectedOperandTy)
+    return nullptr;
+  if (AImag->getType() != ExpectedOperandTy)
+    return nullptr;
+  if (BReal->getType() != ExpectedOperandTy)
+    return nullptr;
+  if (BImag->getType() != ExpectedOperandTy)
+    return nullptr;
+
+  if (Phi->getType() != VTy && RealUser->getType() != VTy)
+    return nullptr;
+
+  NodePtr Node = identifyNode(AReal, AImag);
+
+  // In the case that a node was identified to figure out the rotation, ensure
+  // that trying to identify a node with AReal and AImag post-unwrap results in
+  // the same node
+  if (ANode && Node != ANode) {
+    LLVM_DEBUG(
+        dbgs()
+        << "Identified node is different from previously identified node. "
+           "Unable to confidently generate a complex operation node\n");
+    return nullptr;
+  }
+
+  CN->addOperand(Node);
+  CN->addOperand(identifyNode(BReal, BImag));
+  CN->addOperand(identifyNode(Phi, RealUser));
+
+  return submitCompositeNode(CN);
+}
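For reference, the four accepted rotations correspond to the following accumulation formulas, read directly off the patterns above. The helper below is a hedged scalar model of one complex-dot step (function and parameter names are mine, not from the patch):

  #include <cstdint>
  // (Ar, Ai) and (Br, Bi) are one complex element of A and B; Rot is the
  // rotation in degrees as used by the pass.
  int64_t cdotStep(int64_t Acc, int64_t Ar, int64_t Ai, int64_t Br,
                   int64_t Bi, int Rot) {
    switch (Rot) {
    case 0:   return Acc + Br * Ar - Bi * Ai; // PatternRot0
    case 90:  return Acc + Br * Ai + Bi * Ar; // Rot90/Rot180 pattern, A swapped
    case 180: return Acc + Br * Ar + Bi * Ai; // Rot90/Rot180 pattern, A direct
    case 270: return Acc - Br * Ai + Bi * Ar; // PatternRot270
    default:  return Acc;
    }
  }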
" + "Unable to confidently generate a complex operation node\n"); + return nullptr; + } + + CN->addOperand(Node); + CN->addOperand(identifyNode(BReal, BImag)); + CN->addOperand(identifyNode(Phi, RealUser)); + + return submitCompositeNode(CN); +} + +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) { + // Partial reductions don't support non-vector types, so check these first + if (!isa(R->getType()) || !isa(I->getType())) + return nullptr; + + auto CommonUser = + findCommonBetweenCollections(R->users(), I->users()); + if (!CommonUser) + return nullptr; + + auto *IInst = dyn_cast(*CommonUser); + if (!IInst || IInst->getIntrinsicID() != + Intrinsic::experimental_vector_partial_reduce_add) + return nullptr; + + if (NodePtr CN = identifyDotProduct(IInst)) + return CN; + + return nullptr; +} +ComplexDeinterleavingGraph::NodePtr +ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) { auto It = CachedResult.find({R, I}); if (It != CachedResult.end()) { LLVM_DEBUG(dbgs() << " - Folding to existing node\n"); return It->second; } + if (NodePtr CN = identifyPartialReduction(R, I)) + return CN; + + bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I); + if (!IsReduction && R->getType() != I->getType()) + return nullptr; + if (NodePtr CN = identifySplat(R, I)) return CN; @@ -1427,12 +1590,20 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) { if (It != RootToNode.end()) { auto RootNode = It->second; assert(RootNode->Operation == - ComplexDeinterleavingOperation::ReductionOperation); + ComplexDeinterleavingOperation::ReductionOperation || + RootNode->Operation == + ComplexDeinterleavingOperation::ReductionSingle); // Find out which part, Real or Imag, comes later, and only if we come to // the latest part, add it to OrderedRoots. auto *R = cast(RootNode->Real); - auto *I = cast(RootNode->Imag); - auto *ReplacementAnchor = R->comesBefore(I) ? I : R; + auto *I = RootNode->Imag ? cast(RootNode->Imag) : nullptr; + + Instruction *ReplacementAnchor; + if (I) + ReplacementAnchor = R->comesBefore(I) ? I : R; + else + ReplacementAnchor = R; + if (ReplacementAnchor != RootI) return false; OrderedRoots.push_back(RootI); @@ -1523,7 +1694,6 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { for (size_t j = i + 1; j < OperationInstruction.size(); ++j) { if (Processed[j]) continue; - auto *Real = OperationInstruction[i]; auto *Imag = OperationInstruction[j]; if (Real->getType() != Imag->getType()) @@ -1556,6 +1726,28 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() { break; } } + + auto *Real = OperationInstruction[i]; + // We want to check that we have 2 operands, but the function attributes + // being counted as operands bloats this value. 
@@ -1523,7 +1694,6 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
     for (size_t j = i + 1; j < OperationInstruction.size(); ++j) {
       if (Processed[j])
         continue;
-
       auto *Real = OperationInstruction[i];
       auto *Imag = OperationInstruction[j];
       if (Real->getType() != Imag->getType())
@@ -1556,6 +1726,28 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
         break;
       }
     }
+
+    auto *Real = OperationInstruction[i];
+    // We want to check that we have 2 operands, but the function attributes
+    // being counted as operands bloats this value.
+    if (Real->getNumOperands() < 2)
+      continue;
+
+    RealPHI = ReductionInfo[Real].first;
+    ImagPHI = nullptr;
+    PHIsFound = false;
+    auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1));
+    if (Node && PHIsFound) {
+      LLVM_DEBUG(
+          dbgs() << "Identified single reduction starting from instruction: "
                 << *Real << "/" << *ReductionInfo[Real].second << "\n");
+      Processed[i] = true;
+      auto RootNode = prepareCompositeNode(
+          ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr);
+      RootNode->addOperand(Node);
+      RootToNode[Real] = RootNode;
+      submitCompositeNode(RootNode);
+    }
   }
 
   RealPHI = nullptr;
@@ -1563,6 +1755,24 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
 }
 
 bool ComplexDeinterleavingGraph::checkNodes() {
+
+  bool FoundDeinterleaveNode = false;
+  for (NodePtr N : CompositeNodes) {
+    if (!N->areOperandsValid())
+      return false;
+    if (N->Operation == ComplexDeinterleavingOperation::Deinterleave)
+      FoundDeinterleaveNode = true;
+  }
+
+  // We need a deinterleave node in order to guarantee that we're working with
+  // complex numbers.
+  if (!FoundDeinterleaveNode) {
+    LLVM_DEBUG(
+        dbgs() << "Couldn't find a deinterleave node within the graph, cannot "
+                  "guarantee safety during graph transformation.\n");
+    return false;
+  }
+
   // Collect all instructions from roots to leaves
   SmallPtrSet<Instruction *, 16> AllInstructions;
   SmallVector<Instruction *, 8> Worklist;
@@ -1831,7 +2041,7 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) {
 ComplexDeinterleavingGraph::NodePtr
 ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real,
                                             Instruction *Imag) {
-  if (Real != RealPHI || Imag != ImagPHI)
+  if (Real != RealPHI || (ImagPHI && Imag != ImagPHI))
     return nullptr;
 
   PHIsFound = true;
@@ -1926,6 +2136,16 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
 
   Value *ReplacementNode;
   switch (Node->Operation) {
+  case ComplexDeinterleavingOperation::CDot: {
+    Value *Input0 = ReplaceOperandIfExist(Node, 0);
+    Value *Input1 = ReplaceOperandIfExist(Node, 1);
+    Value *Accumulator = ReplaceOperandIfExist(Node, 2);
+    assert(!Input1 || (Input0->getType() == Input1->getType() &&
+                       "Node inputs need to be of the same type"));
+    ReplacementNode = TL->createComplexDeinterleavingIR(
+        Builder, Node->Operation, Node->Rotation, Input0, Input1, Accumulator);
+    break;
+  }
   case ComplexDeinterleavingOperation::CAdd:
   case ComplexDeinterleavingOperation::CMulPartial:
   case ComplexDeinterleavingOperation::Symmetric: {
@@ -1969,13 +2189,18 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
   case ComplexDeinterleavingOperation::ReductionPHI: {
     // If Operation is ReductionPHI, a new empty PHINode is created.
     // It is filled later when the ReductionOperation is processed.
+    auto *OldPHI = cast<PHINode>(Node->Real);
     auto *VTy = cast<VectorType>(Node->Real->getType());
     auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
     auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHIIt());
-    OldToNewPHI[dyn_cast<PHINode>(Node->Real)] = NewPHI;
+    OldToNewPHI[OldPHI] = NewPHI;
     ReplacementNode = NewPHI;
     break;
   }
+  case ComplexDeinterleavingOperation::ReductionSingle:
+    ReplacementNode = replaceNode(Builder, Node->Operands[0]);
+    processReductionSingle(ReplacementNode, Node);
+    break;
   case ComplexDeinterleavingOperation::ReductionOperation:
     ReplacementNode = replaceNode(Builder, Node->Operands[0]);
     processReductionOperation(ReplacementNode, Node);
@@ -2000,6 +2225,38 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
   return ReplacementNode;
 }
 
+void ComplexDeinterleavingGraph::processReductionSingle(
+    Value *OperationReplacement, RawNodePtr Node) {
+  auto *Real = cast<Instruction>(Node->Real);
+  auto *OldPHI = ReductionInfo[Real].first;
+  auto *NewPHI = OldToNewPHI[OldPHI];
+  auto *VTy = cast<VectorType>(Real->getType());
+  auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
+
+  Value *Init = OldPHI->getIncomingValueForBlock(Incoming);
+
+  IRBuilder<> Builder(Incoming->getTerminator());
+
+  Value *NewInit = nullptr;
+  if (auto *C = dyn_cast<Constant>(Init)) {
+    if (C->isZeroValue())
+      NewInit = Constant::getNullValue(NewVTy);
+  }
+
+  if (!NewInit)
+    NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
+                                      {Init, Constant::getNullValue(VTy)});
+
+  NewPHI->addIncoming(NewInit, Incoming);
+  NewPHI->addIncoming(OperationReplacement, BackEdge);
+
+  auto *FinalReduction = ReductionInfo[Real].second;
+  Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt());
+
+  auto *AddReduce = Builder.CreateAddReduce(OperationReplacement);
+  FinalReduction->replaceAllUsesWith(AddReduce);
+}
+
 void ComplexDeinterleavingGraph::processReductionOperation(
     Value *OperationReplacement, RawNodePtr Node) {
   auto *Real = cast<Instruction>(Node->Real);
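processReductionSingle widens the accumulator PHI to twice the element count; a non-zero initial value is interleaved with zeros so each complex lane starts as (init, 0). A hedged model with std::array standing in for the scalable vector (names mine):

  #include <array>
  #include <cstddef>
  // Mirrors vector_interleave2(Init, zeroinitializer): even (real) lanes get
  // the original values, odd (imaginary) lanes stay zero.
  template <std::size_t N>
  std::array<int, 2 * N> interleaveWithZeros(const std::array<int, N> &Init) {
    std::array<int, 2 * N> Widened{}; // value-initialized to all zeros
    for (std::size_t I = 0; I < N; ++I)
      Widened[2 * I] = Init[I];
    return Widened;
  }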
@@ -2059,8 +2316,13 @@ void ComplexDeinterleavingGraph::replaceNodes() {
       auto *RootImag = cast<Instruction>(RootNode->Imag);
       ReductionInfo[RootReal].first->removeIncomingValue(BackEdge);
       ReductionInfo[RootImag].first->removeIncomingValue(BackEdge);
-      DeadInstrRoots.push_back(cast<Instruction>(RootReal));
-      DeadInstrRoots.push_back(cast<Instruction>(RootImag));
+      DeadInstrRoots.push_back(RootReal);
+      DeadInstrRoots.push_back(RootImag);
+    } else if (RootNode->Operation ==
+               ComplexDeinterleavingOperation::ReductionSingle) {
+      auto *RootInst = cast<Instruction>(RootNode->Real);
+      ReductionInfo[RootInst].first->removeIncomingValue(BackEdge);
+      DeadInstrRoots.push_back(ReductionInfo[RootInst].second);
     } else {
       assert(R && "Unable to find replacement for RootInstruction");
       DeadInstrRoots.push_back(RootInstruction);
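For a ReductionSingle root, the retired instruction is the old scalar reduce (ReductionInfo[RootInst].second), which processReductionSingle has already rewired to an add-reduce over the widened accumulator. That rewiring is sound because the loop's final value is the sum of every partial product, so summing real and imaginary lanes together gives the same total; a hedged scalar check (function is mine):

  // Summing all lanes of the widened accumulator equals summing the two
  // deinterleaved halves separately and then adding them.
  int reduceAllLanes(const int *Acc, int NumLanes) {
    int Sum = 0;
    for (int I = 0; I < NumLanes; ++I)
      Sum += Acc[I];
    return Sum;
  }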
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 070163a5fb297c..c965659b0fef11 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29654,9 +29654,16 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
   if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
     unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
+
+    if (Operation == ComplexDeinterleavingOperation::CDot)
+      return ScalarWidth == 32 || ScalarWidth == 64;
     return 8 <= ScalarWidth && ScalarWidth <= 64;
   }
 
+  // CDot is not supported outside of scalable/sve scopes
+  if (Operation == ComplexDeinterleavingOperation::CDot)
+    return false;
+
   return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
          ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
 }
@@ -29666,6 +29673,8 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
     ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
     Value *Accumulator) const {
   VectorType *Ty = cast<VectorType>(InputA->getType());
+  if (Accumulator == nullptr)
+    Accumulator = Constant::getNullValue(Ty);
   bool IsScalable = Ty->isScalableTy();
   bool IsInt = Ty->getElementType()->isIntegerTy();
 
@@ -29677,6 +29686,10 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
 
   if (TyWidth > 128) {
     int Stride = Ty->getElementCount().getKnownMinValue() / 2;
+    int AccStride = cast<VectorType>(Accumulator->getType())
+                        ->getElementCount()
+                        .getKnownMinValue() /
+                    2;
     auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
     auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
     auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
@@ -29686,25 +29699,26 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
         B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
     Value *LowerSplitAcc = nullptr;
     Value *UpperSplitAcc = nullptr;
-    if (Accumulator) {
-      LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
-      UpperSplitAcc =
-          B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
-    }
+    Type *FullTy = Ty;
+    FullTy = Accumulator->getType();
+    auto *HalfAccTy = VectorType::getHalfElementsVectorType(
+        cast<VectorType>(Accumulator->getType()));
+    LowerSplitAcc =
+        B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0));
+    UpperSplitAcc =
+        B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride));
     auto *LowerSplitInt = createComplexDeinterleavingIR(
         B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
     auto *UpperSplitInt = createComplexDeinterleavingIR(
         B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
 
-    auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
-                                        B.getInt64(0));
-    return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
+    auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
+                                        LowerSplitInt, B.getInt64(0));
+    return B.CreateInsertVector(FullTy, Result, UpperSplitInt,
+                                B.getInt64(AccStride));
   }
 
   if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
-    if (Accumulator == nullptr)
-      Accumulator = Constant::getNullValue(Ty);
-
     if (IsScalable) {
       if (IsInt)
         return B.CreateIntrinsic(
@@ -29756,6 +29770,13 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
     return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
   }
 
+  if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
+      IsScalable) {
+    return B.CreateIntrinsic(
+        Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
+        {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
+  }
+
   return nullptr;
 }
 
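On the AArch64 side, the rotation enum's ordinal times 90 yields the immediate the SVE CDOT intrinsic expects. A hedged mirror of that computation (enumerator names are mine; the pass's enum uses Rotation_0 through Rotation_270 with the same ordinals):

  enum class Rotation { R0 = 0, R90 = 1, R180 = 2, R270 = 3 };
  constexpr int cdotImmediate(Rotation R) { return static_cast<int>(R) * 90; }
  static_assert(cdotImmediate(Rotation::R270) == 270,
                "matches B.getInt32((int)Rotation * 90)");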
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
new file mode 100644
index 00000000000000..11cf4c31936d8f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
@@ -0,0 +1,1136 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s --check-prefix=CHECK-SVE2
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-SVE
+; RUN: opt -S --passes=complex-deinterleaving %s -o - | FileCheck %s --check-prefix=CHECK-NOSVE
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @cdotp_i8_rot0(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 0)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 0)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i32 
@llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot90( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; 
CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 90) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 90) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot90( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; 
CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot180( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 180) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 180) 
+; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot180( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL]]) +; 
CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i32 @cdotp_i8_rot270( %a, %b) { +; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[A]], i64 16) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv16i8.nxv32i8( [[B]], i64 16) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP11]], i64 4) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP5]], [[TMP1]], [[TMP2]], i32 270) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv4i32( [[TMP6]], [[TMP3]], [[TMP4]], i32 270) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP9]], [[TMP8]], i64 4) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i32 [[TMP0]] +; +; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] 
] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i32 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot270( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub zeroinitializer, [[REAL_MUL]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[REAL_MUL_NEG]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i32 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ 
%partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v32i8( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.neg = sub zeroinitializer, %real.mul + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %vec.phi, %real.mul.neg) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i32 @llvm.vector.reduce.add.nxv4i32( %partial.reduce.sub) + ret i32 %0 +} + +define i64 @cdotp_i16_rot0( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 0) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 0) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext 
[[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot0( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub zeroinitializer, [[IMAG_MUL]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL_NEG]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.real.ext + %real.mul.reduced = 
call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.imag.ext + %imag.mul.neg = sub zeroinitializer, %imag.mul + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul.neg) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot90( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: [[VECTOR_BODY]]: +; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 0) +; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 0) +; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[A]], i64 8) +; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call @llvm.vector.extract.nxv8i16.nxv16i16( [[B]], i64 8) +; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 0) +; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2i64.nxv4i64( [[TMP11]], i64 2) +; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP5]], [[TMP1]], [[TMP2]], i32 90) +; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.cdot.nxv2i64( [[TMP6]], [[TMP3]], [[TMP4]], i32 90) +; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, [[TMP7]], i64 0) +; CHECK-SVE2-NEXT: [[TMP10]] = call @llvm.vector.insert.nxv4i64.nxv2i64( [[TMP9]], [[TMP8]], i64 2) +; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE2: [[MIDDLE_BLOCK]]: +; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64( [[TMP10]]) +; CHECK-SVE2-NEXT: ret i64 [[TMP0]] +; +; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-SVE-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE-NEXT: [[ENTRY:.*]]: +; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE: [[VECTOR_BODY]]: +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call 
@llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-SVE: [[MIDDLE_BLOCK]]: +; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-SVE-NEXT: ret i64 [[TMP11]] +; +; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot90( +; CHECK-NOSVE-SAME: [[A:%.*]], [[B:%.*]]) { +; CHECK-NOSVE-NEXT: [[ENTRY:.*]]: +; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-NOSVE: [[VECTOR_BODY]]: +; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[A]]) +; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i16( [[B]]) +; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { , } [[A_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 0 +; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { , } [[B_DEINTERLEAVED]], 1 +; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext [[A_REAL]] to +; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext [[A_IMAG]] to +; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext [[B_REAL]] to +; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext [[B_IMAG]] to +; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul [[B_REAL_EXT]], [[A_IMAG_EXT]] +; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[VEC_PHI]], [[REAL_MUL]]) +; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul [[B_IMAG_EXT]], [[A_REAL_EXT]] +; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( [[REAL_MUL_REDUCED]], [[IMAG_MUL]]) +; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]] +; CHECK-NOSVE: [[MIDDLE_BLOCK]]: +; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64( [[PARTIAL_REDUCE_SUB]]) +; CHECK-NOSVE-NEXT: ret i64 [[TMP0]] +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.phi = phi [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ] + %a.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %a) + %b.deinterleaved = call { , } @llvm.vector.deinterleave2.v16i16( %b) + %a.real = extractvalue { , } %a.deinterleaved, 0 + %a.imag = extractvalue { , } %a.deinterleaved, 1 + %b.real = extractvalue { , } %b.deinterleaved, 0 + %b.imag = extractvalue { , } %b.deinterleaved, 1 + %a.real.ext = sext %a.real to + %a.imag.ext = sext %a.imag to + %b.real.ext = sext %b.real to + %b.imag.ext = sext %b.imag to + %real.mul = mul %b.real.ext, %a.imag.ext + %real.mul.reduced = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %vec.phi, %real.mul) + %imag.mul = mul %b.imag.ext, %a.real.ext + %partial.reduce.sub = call @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64( %real.mul.reduced, %imag.mul) + br i1 true, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %0 = call i64 @llvm.vector.reduce.add.nxv2i64( %partial.reduce.sub) + ret i64 %0 +} + +define i64 @cdotp_i16_rot180( %a, %b) { +; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot180( +; CHECK-SVE2-SAME: [[A:%.*]], [[B:%.*]]) #[[ATTR0]] { +; CHECK-SVE2-NEXT: [[ENTRY:.*]]: +; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-SVE2: 
+define i64 @cdotp_i16_rot180(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
+; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot180(
+; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 180)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 180)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i64 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot180(
+; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i64 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot180(
+; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
+  %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
+  %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
+  %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
+  %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
+  %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
+  %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
+  %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.imag.ext
+  %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
+  ret i64 %0
+}
+
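+; rot270 pattern: acc - b.real*a.imag + b.imag*a.real (only the real-side product is negated).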
+define i64 @cdotp_i16_rot270(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
+; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot270(
+; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
+; CHECK-SVE2-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
+; CHECK-SVE2-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 270)
+; CHECK-SVE2-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 270)
+; CHECK-SVE2-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-SVE2-NEXT:    [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
+; CHECK-SVE2-NEXT:    ret i64 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot270(
+; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[REAL_MUL]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL_NEG]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i64 [[TMP11]]
+;
+; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot270(
+; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[REAL_MUL]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i64 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
+  %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
+  %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
+  %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
+  %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
+  %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
+  %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.imag.ext
+  %real.mul.neg = sub <vscale x 8 x i64> zeroinitializer, %real.mul
+  %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul.neg)
+  %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.real.ext
+  %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
+  ret i64 %0
+}
+
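+; Negative test: both products are negated (acc - b.real*a.real - b.imag*a.imag), which
+; matches none of the four cdot rotations, so no transformation is expected.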
+define i32 @not_cdotp(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @not_cdotp(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE2-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE2-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-SVE2-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-SVE2-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @not_cdotp(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-NOSVE-LABEL: define i32 @not_cdotp(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %real.mul
+  %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul.neg)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
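+; Negative test: the accumulator is <vscale x 8 x i16>, which is not a supported cdot
+; result type for these i32 products, so the IR should be left unchanged.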
+define i16 @invalid_type(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i16 @invalid_type(
+; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE2-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE2-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE2-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE2-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE2-NEXT:    ret i16 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i16 @invalid_type(
+; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i16 [[TMP0]]
+;
+; CHECK-NOSVE-LABEL: define i16 @invalid_type(
+; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i16 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <vscale x 8 x i16> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
+  %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
+  %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+  %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+  %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+  %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+  %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> %vec.phi, <vscale x 16 x i32> %real.mul)
+  %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> %partial.reduce.sub)
+  ret i16 %0
+}
+
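+; Negative test: the same rot0 pattern but over fixed-length vectors, which the target
+; does not report as supported for CDot, so no transformation is expected.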
+define i32 @not_cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
+; CHECK-SVE2-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
+; CHECK-SVE2-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE2-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE2-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE2:       [[VECTOR_BODY]]:
+; CHECK-SVE2-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE2-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
+; CHECK-SVE2-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
+; CHECK-SVE2-NEXT:    [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE2-NEXT:    [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE2-NEXT:    [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
+; CHECK-SVE2-NEXT:    [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
+; CHECK-SVE2-NEXT:    [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
+; CHECK-SVE2-NEXT:    [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
+; CHECK-SVE2-NEXT:    [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE2-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
+; CHECK-SVE2-NEXT:    [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE2-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE2-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE2-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE2:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE2-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE2-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-SVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
+; CHECK-SVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-SVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-SVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-SVE:       [[VECTOR_BODY]]:
+; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-SVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
+; CHECK-SVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
+; CHECK-SVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-SVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-SVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
+; CHECK-SVE-NEXT:    [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-SVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
+; CHECK-SVE-NEXT:    [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-SVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-SVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-SVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-SVE:       [[MIDDLE_BLOCK]]:
+; CHECK-SVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-SVE-NEXT:    ret i32 [[TMP0]]
+;
+; CHECK-NOSVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
+; CHECK-NOSVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) {
+; CHECK-NOSVE-NEXT:  [[ENTRY:.*]]:
+; CHECK-NOSVE-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK-NOSVE:       [[VECTOR_BODY]]:
+; CHECK-NOSVE-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NOSVE-NEXT:    [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
+; CHECK-NOSVE-NEXT:    [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
+; CHECK-NOSVE-NEXT:    [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
+; CHECK-NOSVE-NEXT:    [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
+; CHECK-NOSVE-NEXT:    [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
+; CHECK-NOSVE-NEXT:    [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
+; CHECK-NOSVE-NEXT:    [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
+; CHECK-NOSVE-NEXT:    [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
+; CHECK-NOSVE-NEXT:    [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
+; CHECK-NOSVE-NEXT:    [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
+; CHECK-NOSVE-NEXT:    [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
+; CHECK-NOSVE-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK-NOSVE:       [[MIDDLE_BLOCK]]:
+; CHECK-NOSVE-NEXT:    [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
+; CHECK-NOSVE-NEXT:    ret i32 [[TMP0]]
+;
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
+  %a.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %a)
+  %b.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %b)
+  %a.real = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 0
+  %a.imag = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 1
+  %b.real = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 0
+  %b.imag = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 1
+  %a.real.ext = sext <16 x i8> %a.real to <16 x i32>
+  %a.imag.ext = sext <16 x i8> %a.imag to <16 x i32>
+  %b.real.ext = sext <16 x i8> %b.real to <16 x i32>
+  %b.imag.ext = sext <16 x i8> %b.imag to <16 x i32>
+  %real.mul = mul <16 x i32> %b.real.ext, %a.real.ext
+  %real.mul.reduced = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %vec.phi, <16 x i32> %real.mul)
+  %imag.mul = mul <16 x i32> %b.imag.ext, %a.imag.ext
+  %imag.mul.neg = sub <16 x i32> zeroinitializer, %imag.mul
+  %partial.reduce.sub = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %real.mul.reduced, <16 x i32> %imag.mul.neg)
+  br i1 true, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %partial.reduce.sub)
+  ret i32 %0
+}
+
+declare <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16>, <vscale x 16 x i32>)
+declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
+declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 8 x i32>)
+
+declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
+declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
+declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
index 68cb29f8f5c8f8..7542e9c4b8f5be 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-crash.ll
@@ -29,3 +29,92 @@ bb193:                                            ; preds = %bb173
   store volatile i32 0, ptr null, align 4
   unreachable
 }
+
+; Check that the deinterleaving pass doesn't try to transform reduction patterns that
+; have no matching deinterleaving pattern.
+define i32 @check_deinterleaving_has_deinterleave(ptr %a) {
+; CHECK-LABEL: check_deinterleaving_has_deinterleave:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    movi v1.4s, #1
+; CHECK-NEXT:    add x8, x0, #16
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    mov w9, #32 // =0x20
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    movi v7.2d, #0000000000000000
+; CHECK-NEXT:    movi v6.2d, #0000000000000000
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:  .LBB1_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldp q17, q18, [x8, #-16]
+; CHECK-NEXT:    subs x9, x9, #32
+; CHECK-NEXT:    add x8, x8, #32
+; CHECK-NEXT:    cmeq v17.16b, v17.16b, #0
+; CHECK-NEXT:    cmeq v18.16b, v18.16b, #0
+; CHECK-NEXT:    ushll2 v19.8h, v17.16b, #0
+; CHECK-NEXT:    ushll v17.8h, v17.8b, #0
+; CHECK-NEXT:    ushll2 v20.8h, v18.16b, #0
+; CHECK-NEXT:    ushll v18.8h, v18.8b, #0
+; CHECK-NEXT:    ushll v21.4s, v19.4h, #0
+; CHECK-NEXT:    ushll2 v19.4s, v19.8h, #0
+; CHECK-NEXT:    ushll v22.4s, v17.4h, #0
+; CHECK-NEXT:    ushll2 v17.4s, v17.8h, #0
+; CHECK-NEXT:    ushll2 v23.4s, v20.8h, #0
+; CHECK-NEXT:    ushll v24.4s, v18.4h, #0
+; CHECK-NEXT:    ushll2 v18.4s, v18.8h, #0
+; CHECK-NEXT:    ushll v20.4s, v20.4h, #0
+; CHECK-NEXT:    and v21.16b, v21.16b, v1.16b
+; CHECK-NEXT:    and v19.16b, v19.16b, v1.16b
+; CHECK-NEXT:    and v22.16b, v22.16b, v1.16b
+; CHECK-NEXT:    and v17.16b, v17.16b, v1.16b
+; CHECK-NEXT:    and v23.16b, v23.16b, v1.16b
+; CHECK-NEXT:    and v24.16b, v24.16b, v1.16b
+; CHECK-NEXT:    and v18.16b, v18.16b, v1.16b
+; CHECK-NEXT:    and v20.16b, v20.16b, v1.16b
+; CHECK-NEXT:    add v4.4s, v4.4s, v19.4s
+; CHECK-NEXT:    add v2.4s, v2.4s, v21.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v22.4s
+; CHECK-NEXT:    add v3.4s, v3.4s, v17.4s
+; CHECK-NEXT:    add v16.4s, v16.4s, v23.4s
+; CHECK-NEXT:    add v5.4s, v5.4s, v24.4s
+; CHECK-NEXT:    add v6.4s, v6.4s, v20.4s
+; CHECK-NEXT:    add v7.4s, v7.4s, v18.4s
+; CHECK-NEXT:    b.ne .LBB1_1
+; CHECK-NEXT:  // %bb.2: // %middle.block
+; CHECK-NEXT:    add v1.4s, v7.4s, v3.4s
+; CHECK-NEXT:    add v3.4s, v16.4s, v4.4s
+; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
+; CHECK-NEXT:    add v2.4s, v6.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    addv s0, v0.4s
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.phi = phi <16 x i32> [ zeroinitializer, %entry ], [ %9, %vector.body ]
+  %vec.phi50 = phi <16 x i32> [ zeroinitializer, %entry ], [ %10, %vector.body ]
+  %next.gep = getelementptr i8, ptr %a, i64 %index
+  %4 = getelementptr i8, ptr %next.gep, i64 16
+  %wide.load = load <16 x i8>, ptr %next.gep, align 1
+  %wide.load51 = load <16 x i8>, ptr %4, align 1
+  %5 = icmp eq <16 x i8> %wide.load, zeroinitializer
+  %6 = icmp eq <16 x i8> %wide.load51, zeroinitializer
+  %7 = zext <16 x i1> %5 to <16 x i32>
+  %8 = zext <16 x i1> %6 to <16 x i32>
+  %9 = add <16 x i32> %vec.phi, %7
+  %10 = add <16 x i32> %vec.phi50, %8
+  %index.next = add nuw i64 %index, 32
+  %11 = icmp eq i64 %index.next, 32
+  br i1 %11, label %middle.block, label %vector.body
+
+middle.block:
+  %bin.rdx = add <16 x i32> %10, %9
+  %12 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %bin.rdx)
+  ret i32 %12
+}