diff --git a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp index c43a7b095d0c..e451f3c94e3f 100644 --- a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp +++ b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates // //===----------------------------------------------------------------------===// // @@ -27,10 +27,6 @@ cl::opt EnableSubregRenaming("aie-subreg-renaming", cl::Hidden, cl::init(false), cl::desc("Enable RenameIndependentSubregs pass")); -static cl::opt - EnableWAWRegRewrite("aie-wawreg-rewrite", - cl::desc("Enable the WAW Register Renaming in loops"), - cl::init(true), cl::Hidden); static cl::opt EnableReservedRegsLICM("aie-reserved-regs-licm", cl::Hidden, cl::init(true), cl::desc("Enable LICM for some reserved registers")); @@ -45,6 +41,7 @@ extern cl::opt EnableStagedRA; extern cl::opt EnableSuperRegSplitting; extern cl::opt AllocateMRegsFirst; extern cl::opt EnablePreMISchedCoalescer; +extern cl::opt EnableWAWRegRewrite; extern bool AIEDumpArtifacts; @@ -161,8 +158,10 @@ bool AIE2PassConfig::addRegAssignAndRewriteOptimized() { addPass(createAIESuperRegRewriter()); } addPass(createGreedyRegisterAllocator()); - if (EnableWAWRegRewrite) + if (EnableWAWRegRewrite) { addPass(createAIEWawRegRewriter()); + addPass(createGreedyRegisterAllocator()); + } addPass(createVirtRegRewriter()); return true; diff --git a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp index 53888fc9c79f..dbd15a05eeb0 100644 --- a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp +++ b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp @@ -77,6 +77,11 @@ cl::opt EnableStagedRA("aie-staged-ra", cl::Hidden, cl::init(true), cl::desc("Enable multi-stage 
register allocation")); +cl::opt + EnableWAWRegRewrite("aie-wawreg-rewrite", + cl::desc("Enable the WAW Register Renaming in loops"), + cl::init(true), cl::Hidden); + cl::opt EnableSuperRegSplitting("aie-split-superregs", cl::Hidden, cl::init(true), cl::desc("Enable splitting super-regs into their " diff --git a/llvm/lib/Target/AIE/AIEWawRegRewriter.cpp b/llvm/lib/Target/AIE/AIEWawRegRewriter.cpp index 0a371119479a..546164cf17b4 100644 --- a/llvm/lib/Target/AIE/AIEWawRegRewriter.cpp +++ b/llvm/lib/Target/AIE/AIEWawRegRewriter.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates // //===----------------------------------------------------------------------===// // @@ -17,6 +17,7 @@ #include "AIEBaseRegisterInfo.h" #include "Utils/AIELoopUtils.h" +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/LiveDebugVariables.h" #include "llvm/CodeGen/LiveIntervals.h" @@ -28,6 +29,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/MC/MCRegister.h" #include "llvm/Support/Debug.h" @@ -38,8 +40,17 @@ using namespace llvm; #define DEBUG_TYPE "aie-waw-reg-rewrite" +static cl::opt AggressiveReAlloc( + "aie-aggressive-realloc", cl::Hidden, cl::init(false), + cl::desc("Aggressively de-allocate live-through registers to favor " + "loop-local registers")); +static cl::opt GPRRealloc("aie-gpr-realloc", cl::Hidden, cl::init(false), + cl::desc("Re-allocate GPRs as well")); + namespace { +using RoundRobin = std::list; + /// /// This pass rewrites physical register assignments in critical parts of the /// code (like loops) to break WAW and WAR 
dependencies. @@ -91,17 +102,19 @@ class AIEWawRegRewriter : public MachineFunctionPass { /// pass tries to remove. BitVector getDefinedPhysRegs(const MachineBasicBlock *MBB) const; - /// returns true if the physical register of Reg was replaced - bool replaceReg(const Register Reg, BitVector &BlockedPhysRegs); + /// Returns true if the physical register \p Reg was replaced + bool replaceReg(const Register Reg, RoundRobin &Registers, + BitVector &UsedUnits); + + void unassignReg(Register Reg); + void assignReg(Register Reg, MCPhysReg PhysReg); - /// Find a free register of the same register class type, but - /// exclude the blocked physical registers from the result. - /// Otherwise a new WAW dependencies can be introduced, that was previously - /// removed. - MCPhysReg getReplacementPhysReg(const Register Reg, - const BitVector &BlockedPhysRegs) const; + /// Find a free register of the same register class type + MCPhysReg getReplacementPhysReg(const Register Reg, RoundRobin &Registers, + BitVector &UsedUnits) const; - bool isWorthRenaming(const Register &Reg, const BitVector &UsedPhysRegs, + /// Whether \p Reg should be considered a candidate for re-assignment. + bool isWorthRenaming(const Register &Reg, const BitVector &VRegWithCopies) const; /// return the Physical register of the Register, look it up in VirtRegMap if @@ -132,11 +145,6 @@ class AIEWawRegRewriter : public MachineFunctionPass { /// instruction. IndexedMap getLastVRegDef(const MachineBasicBlock &MBB) const; - - /// Block every sub- and super-register of a physical register, so that it is - /// removed for future replacement strategies, i.e. block wl4, wh4, y2 if X4 - /// is used. 
- void addAliasRegs(BitVector &BlockedPhysRegs, const MCPhysReg PhysReg) const; }; MCPhysReg AIEWawRegRewriter::getAssignedPhysReg(const Register Reg) const { @@ -209,16 +217,6 @@ AIEWawRegRewriter::getVRegWithCopies(const MachineBasicBlock &MBB) const { bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "WAW Reg Renaming BasicBlock "; MBB->dump(); dbgs() << "\n"); - bool Modified = false; - // Add the used physical registers one machine instruction at a time. - // This vector is used to determine, if a physical register has already been - // defined in the machine basic block. - BitVector UsedPhysRegs(TRI->getNumRegs()); - - // Get a list of registers, that are not allowed as a replacement register. - // This list gets updated with the newly replaced physical register, so that - // this pass does not introduce WAW dependencies. - BitVector BlockedPhysRegs = getDefinedPhysRegs(MBB); // Collect all the virtual registers that have at least a copy instruction // that defines them. 
Subregisters may contain constants that may be shared @@ -230,15 +228,18 @@ bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) { IndexedMap LastVRegDef = getLastVRegDef(*MBB); - for (const MachineInstr &MI : *MBB) { + // Record the candidates and their original allocation + using OriginalAllocation = + std::vector>; + OriginalAllocation Candidates; + for (const MachineInstr &MI : *MBB) { // Identity copies will be removed in a later pass, therefore, these are not // real defines of a physical register if (isIdentityCopy(MI)) continue; for (const MachineOperand &MO : MI.defs()) { - Register Reg = MO.getReg(); if (!Reg.isVirtual()) continue; @@ -249,103 +250,252 @@ bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) { // several definitions of the same virtual register are not relevant // because even if the virtual register is renamed, by construction // all the definitions would be renamed as well and achieve nothing wrt - // WAW dependecy resolution + // WAW dependency resolution if (LastVRegDef[Reg] != &MI) continue; - if (isWorthRenaming(Reg, UsedPhysRegs, VRegWithCopies) && - replaceReg(Reg, BlockedPhysRegs)) { - - LLVM_DEBUG(dbgs() << MI); - Modified = true; - - } else { - // Keep track of already visited physical registers. - // Incrementally add the encountered physical registers, so that a - // second occurrence of a physical register can trigger the register - // rewriting - // Blocked registers for the replacement are recorded in - // BlockedPhysRegs. Initially all the used physical registers - // from the MBB are blocked, so that replacements do not introduce WAW - // dependencies. Additionally, replaced registers are already blocked in - // BlockedPhysRegs, so that an additional replacement will not cause a - // WAW, which this pass is trying to remove. 
- UsedPhysRegs[VRM->getPhys(Reg)] = true; + if (isWorthRenaming(Reg, VRegWithCopies)) { + assert(VRM->hasPhys(Reg)); + MCRegister AssignedPhysReg = VRM->getPhys(Reg); + Candidates.emplace_back(&MO, AssignedPhysReg); + LLVM_DEBUG(dbgs() << "Candidate " << printReg(Reg, TRI, 0, MRI) << ":" + << TRI->getRegClassName(MRI->getRegClass(Reg)) << " (" + << TRI->getName(AssignedPhysReg) << ")\n"); } } } - return Modified; + // Free physregs of all candidates and register their regclasses + std::set RegClasses; + for (auto &[MO, Org] : Candidates) { + auto VReg = MO->getReg(); + if (VRM->hasPhys(VReg)) + unassignReg(VReg); + auto *RC = MRI->getRegClass(VReg); + RegClasses.insert(RC); + } + LLVM_DEBUG(dbgs() << "Renaming " << Candidates.size() << " candidates in " + << RegClasses.size() << " classes\n"); + + // If requested, unassign MBB's liveins as well to get even more freedom + if (AggressiveReAlloc) { + for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) { + Register Reg = Register::index2VirtReg(I); + if (!LIS->hasInterval(Reg) || !RegClasses.count(MRI->getRegClass(Reg))) + continue; + LiveInterval &LI = LIS->getInterval(Reg); + if (LIS->isLiveInToMBB(LI, MBB) && VRM->hasPhys(Reg)) { + unassignReg(Reg); + } + } + } + + // Reallocate all virtual registers in Candidates. + // Return true if successful. + auto ReAllocate = [&](OriginalAllocation &Candidates, RoundRobin &Registers) { + BitVector UsedUnits; + UsedUnits.resize(TRI->getNumRegUnits()); + for (auto &[MO, Org] : Candidates) { + auto VReg = MO->getReg(); + if (!replaceReg(VReg, Registers, UsedUnits)) { + LLVM_DEBUG(dbgs() << "Renaming " << printReg(VReg, TRI, 0, MRI) + << " failed\n"); + return false; + } + } + return true; + }; + + // Reapply the original allocation to all Candidates + auto RevertAllocation = [&](OriginalAllocation &Candidates) { + // The partial allocation may conflict with the original one in ugly ways. + // To be safe, reset all allocations first. 
+ for (auto &[MO, Org] : Candidates) { + auto VReg = MO->getReg(); + if (VRM->hasPhys(VReg)) { + unassignReg(VReg); + } + } + for (auto &[MO, Org] : Candidates) { + auto VReg = MO->getReg(); + assignReg(VReg, Org); + } + }; + + // Least-Recently-Used list of physical registers for assignments to VRegs. + // Physical registers that have recently been used are moved to the back. + std::list LRURegisters; + + // For each reg class, allocate the candidates in round-robin fashion. + // If we fail, we fall back to the original allocation + BitVector ExcludedPhysRegs{TRI->getNumRegs()}; + + // Exclude CSRs + for (const MCPhysReg *CSR = MRI->getCalleeSavedRegs(); CSR && *CSR; ++CSR) + ExcludedPhysRegs[*CSR] = true; + + for (const auto *RC : RegClasses) { + + LLVM_DEBUG(dbgs() << "Allowed registers in RC=" << TRI->getRegClassName(RC) + << ":"); + for (MCPhysReg PhysReg : RC->getRegisters()) { + if (!ExcludedPhysRegs[PhysReg]) { + LLVM_DEBUG(dbgs() << " " << printReg(PhysReg, TRI)); + LRURegisters.push_back(PhysReg); + } + ExcludedPhysRegs[PhysReg] = true; + } + LLVM_DEBUG(dbgs() << "\n"); + } + if (!ReAllocate(Candidates, LRURegisters)) { + RevertAllocation(Candidates); + return false; + } + + return true; } bool AIEWawRegRewriter::isWorthRenaming(const Register &Reg, - const BitVector &UsedPhysRegs, const BitVector &VRegWithCopies) const { assert(Reg.isVirtual()); - // Only rename registers mapped to a phys reg assigned more than once - if (!UsedPhysRegs[VRM->getPhys(Reg)]) + // The register might have been de-allocated when processing another loop. + if (!VRM->hasPhys(Reg)) return false; - if (!TRI->isVecOrAccRegClass(*(MRI->getRegClass(Reg)))) + // Only consider vec/acc registers as candidates, and optionally GPRs. 
+ bool IsCandidateClass = + TRI->isVecOrAccRegClass(*(MRI->getRegClass(Reg))) || + (GPRRealloc && + TRI->getGPRRegClass(*MF)->hasSubClassEq(MRI->getRegClass(Reg))); + if (!IsCandidateClass) return false; return !VRegWithCopies[Reg.virtRegIndex()]; } -BitVector -AIEWawRegRewriter::getDefinedPhysRegs(const MachineBasicBlock *MBB) const { - BitVector BlockedPhysRegs(TRI->getNumRegs()); +void AIEWawRegRewriter::unassignReg(Register VReg) { + const LiveInterval &LI = LIS->getInterval(VReg); + LRM->unassign(LI); +} - for (const MachineInstr &MI : *MBB) { - for (const MachineOperand &Op : MI.defs()) { - MCPhysReg PhysReg = getAssignedPhysReg(Op.getReg()); - if (MCRegister::isPhysicalRegister(PhysReg)) - addAliasRegs(BlockedPhysRegs, PhysReg); - } +void AIEWawRegRewriter::assignReg(Register VReg, MCPhysReg PhysReg) { + const LiveInterval &LI = LIS->getInterval(VReg); + if (VRM->hasPhys(VReg)) { + LRM->unassign(LI); } - - return BlockedPhysRegs; + LRM->assign(LI, PhysReg); } -bool AIEWawRegRewriter::replaceReg(const Register Reg, - BitVector &BlockedPhysRegs) { - assert(Reg.isVirtual()); - LLVM_DEBUG(dbgs() << " WAW RegRewriter: Register to replace " - << TRI->getName(VRM->getPhys(Reg)) << "\n"); - - MCPhysReg ReplacementPhysReg = getReplacementPhysReg(Reg, BlockedPhysRegs); +bool AIEWawRegRewriter::replaceReg(const Register VReg, + RoundRobin &LRURegisters, + BitVector &UsedUnits) { + assert(VReg.isVirtual()); + MCPhysReg ReplacementPhysReg = + getReplacementPhysReg(VReg, LRURegisters, UsedUnits); if (ReplacementPhysReg == MCRegister::NoRegister) return false; - LLVM_DEBUG(dbgs() << " WAW Replacement: Virtual Register " - << printReg(VRM->getPhys(Reg), TRI, 0, MRI) - << " will replace " - << printReg(VRM->getPhys(Reg), TRI, 0, MRI) << " with " - << printReg(ReplacementPhysReg, TRI, 0, MRI) << '\n'); + LLVM_DEBUG(dbgs() << " replace: " << printReg(VReg, TRI) << " with " + << TRI->getName(ReplacementPhysReg) << '\n'); + 
assert(Register::isPhysicalRegister(ReplacementPhysReg)); - const LiveInterval &LI = LIS->getInterval(Reg); - LRM->unassign(LI); - LRM->assign(LI, ReplacementPhysReg); - addAliasRegs(BlockedPhysRegs, ReplacementPhysReg); + assignReg(VReg, ReplacementPhysReg); return true; } -MCPhysReg AIEWawRegRewriter::getReplacementPhysReg( - const Register Reg, const BitVector &BlockedPhysRegs) const { - assert(Reg.isVirtual() && "Reg has to be a virtual register"); - const TargetRegisterClass *RC = MRI->getRegClass(Reg); +/// Returns a vreg of the same class that is exclusively used (and killed) +/// at the point \p VReg gets defined. +std::optional +getKilledRegAtSingledDefPoint(Register VReg, const MachineRegisterInfo &MRI) { + MachineOperand *MO = MRI.getOneDef(VReg); + if (!MO) + return std::nullopt; + + MachineInstr &DefMI = *MO->getParent(); + auto OnlyUsedByInstr = [&MRI](Register Reg, const MachineInstr &MI) { + return all_of(MRI.use_instructions(Reg), + [&MI](const MachineInstr &UseMI) { return &UseMI == &MI; }); + }; + + for (MachineOperand &UseMO : DefMI.explicit_uses()) { + if (UseMO.isReg() && UseMO.getReg().isVirtual() && + MRI.getRegClass(VReg) == MRI.getRegClass(UseMO.getReg()) && + OnlyUsedByInstr(UseMO.getReg(), DefMI)) { + return UseMO.getReg(); + } + } + return std::nullopt; +} + +void moveRegAndAliasesBack(MCPhysReg PhysReg, RoundRobin &LRURegisters, + const TargetRegisterInfo *TRI) { + for (MCRegAliasIterator AI(MCRegister(PhysReg), TRI, true); AI.isValid(); + ++AI) { + // TODO: Use hints to speed up the search of aliases? 
+ auto AliasIt = llvm::find(LRURegisters, *AI); + if (AliasIt != LRURegisters.end()) { + LRURegisters.erase(AliasIt); + LRURegisters.emplace_back(*AI); + } + } +} + +MCPhysReg AIEWawRegRewriter::getReplacementPhysReg(const Register VReg, + RoundRobin &LRURegisters, + BitVector &UsedUnits) const { + assert(VReg.isVirtual() && "Reg has to be a virtual register"); - LiveInterval &LI = LIS->getInterval(Reg); - for (const MCPhysReg &PhysReg : RC->getRegisters()) { + /// Whether \p PhysReg was ever used for re-assigning a vreg + auto WasUsedForReassignment = [TRI = this->TRI, + &UsedUnits](MCPhysReg PhysReg) { + return any_of(TRI->regunits(PhysReg), + [&UsedUnits](MCRegUnit RU) { return UsedUnits.test(RU); }); + }; - if (BlockedPhysRegs[PhysReg]) - continue; + LLVM_DEBUG(dbgs() << " Try to re-assign" << printReg(VReg, TRI) << "\n"); + const TargetRegisterClass *RC = MRI->getRegClass(VReg); + const LiveInterval &LI = LIS->getInterval(VReg); + // Find the least-recently assigned register to assign to VReg. + for (auto It = LRURegisters.begin(); It != LRURegisters.end(); ++It) { + MCPhysReg PhysReg = *It; + + if (!RC->contains(PhysReg)) { + continue; + } LiveRegMatrix::InterferenceKind IK = LRM->checkInterference(LI, PhysReg); - if (IK == LiveRegMatrix::IK_Free) + if (IK == LiveRegMatrix::IK_Free) { + // If the chosen physical register has already been used and the vreg to + // allocate is defined at a point where another vreg gets killed, prefer + // reusing the assignment of the killed reg. 
+ if (std::optional KilledReg = + getKilledRegAtSingledDefPoint(VReg, *MRI); + KilledReg && WasUsedForReassignment(PhysReg)) { + MCRegister KilledPhysReg = getAssignedPhysReg(*KilledReg); + if (KilledPhysReg && LRM->checkInterference(LI, KilledPhysReg) == + LiveRegMatrix::IK_Free) { + + LLVM_DEBUG(dbgs() << " re-use killed physreg for assigning: " + << printReg(VReg, TRI) << " to " + << TRI->getName(KilledPhysReg) << '\n'); + PhysReg = KilledPhysReg; + It = llvm::find(LRURegisters, KilledPhysReg); + assert(It != LRURegisters.end()); + } + } + + // Move it to the end of the list. We return, so don't have to + // care about invalidation + moveRegAndAliasesBack(PhysReg, LRURegisters, TRI); + for (MCRegUnit RU : TRI->regunits(PhysReg)) + UsedUnits.set(RU); return PhysReg; + } + LLVM_DEBUG(dbgs() << " Cannot assign " << printReg(VReg, TRI) + << " to " << TRI->getName(PhysReg) + << " due to interference\n"); } return MCRegister::NoRegister; } @@ -379,20 +529,6 @@ AIEWawRegRewriter::getLastVRegDef(const MachineBasicBlock &MBB) const { return LastVRegDef; } -void AIEWawRegRewriter::addAliasRegs(BitVector &BlockedPhysRegs, - const MCPhysReg PhysReg) const { - assert(MCRegister::isPhysicalRegister(PhysReg)); - - LLVM_DEBUG(dbgs() << "Adding to Blocked Regs (" - << printReg(PhysReg, TRI, 0, MRI) << ") with alias: "); - for (MCRegAliasIterator AI(MCRegister(PhysReg), TRI, true); AI.isValid(); - ++AI) { - BlockedPhysRegs[*AI] = true; - LLVM_DEBUG(dbgs() << printReg(*AI, TRI, 0, MRI) << " "); - } - LLVM_DEBUG(dbgs() << "\n"); -} - } // end anonymous namespace char AIEWawRegRewriter::ID = 0; diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp index 7ea7bc8ab9cf..ab0158f21334 100644 --- a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp +++ b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp @@ -22,6 +22,7 @@ extern cl::opt EnableSuperRegSplitting; extern cl::opt AllocateMRegsFirst; extern cl::opt 
EnablePreMISchedCoalescer; extern cl::opt EnableAddressChaining; +extern cl::opt EnableWAWRegRewrite; void AIE2PTargetMachine::anchor() {} @@ -102,6 +103,10 @@ bool AIE2PPassConfig::addRegAssignAndRewriteOptimized() { addPass(createAIESuperRegRewriter()); } addPass(createGreedyRegisterAllocator()); + if (EnableWAWRegRewrite) { + addPass(createAIEWawRegRewriter()); + addPass(createGreedyRegisterAllocator()); + } addPass(createVirtRegRewriter()); return true; diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll index 7af1e07aa841..981c2f53d80a 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll @@ -88,71 +88,71 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c ; ASM-NEXT: lda r24, [p6, #0]; paddb [p7], #-128; mov p6, sp ; ASM-NEXT: lda m7, [p7, #0]; paddb [p6], #-124; movx r8, #11; mov dc7, dj3 ; ASM-NEXT: lda m4, [p6, #0]; movx r9, #31; mov r26, dj3 -; ASM-NEXT: // implicit-def: $x4 -; ASM-NEXT: // implicit-def: $x2 +; ASM-NEXT: // implicit-def: $x8 +; ASM-NEXT: // implicit-def: $x1 ; ASM-NEXT: .p2align 4 ; ASM-NEXT: .LBB0_1: // %outer.loop.header ; ASM-NEXT: // =>This Loop Header: Depth=1 ; ASM-NEXT: // Child Loop BB0_2 Depth 2 -; ASM-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p4 -; ASM-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m1 -; ASM-NEXT: vlda.ups.s32.s16 bmh2, s0, [p2, #32]; mov m2, p5 -; ASM-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m2 -; ASM-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32] -; ASM-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m1 -; ASM-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32]; mov m3, r15 -; ASM-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m3 +; ASM-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32]; mov m1, p4 +; ASM-NEXT: vlda.ups.s32.s16 bml0, s0, [p2], m1 +; ASM-NEXT: vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m2, p5 +; ASM-NEXT: vlda.ups.s32.s16 bml1, s0, [p2], m2 +; ASM-NEXT: 
vlda.ups.s32.s16 bmh2, s0, [p2, #32] +; ASM-NEXT: vlda.ups.s32.s16 bml2, s0, [p2], m1 +; ASM-NEXT: vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m3, r15 +; ASM-NEXT: vlda.ups.s32.s16 bml3, s0, [p2], m3 +; ASM-NEXT: vlda.ups.s32.s16 bmh4, s0, [p2, #32] +; ASM-NEXT: vlda.ups.s32.s16 bml4, s0, [p2], m1 ; ASM-NEXT: vlda.ups.s32.s16 bmh5, s0, [p2, #32] -; ASM-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m1 +; ASM-NEXT: vlda.ups.s32.s16 bml5, s0, [p2], m2 ; ASM-NEXT: vlda.ups.s32.s16 bmh6, s0, [p2, #32] -; ASM-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m2 -; ASM-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32] -; ASM-NEXT: vlda.ups.s32.s16 bml7, s0, [p2], m1; mov r0, p0 -; ASM-NEXT: vlda.ups.s32.s16 bmh0, s0, [p2, #32]; and r0, r0, r9 -; ASM-NEXT: vlda.ups.s32.s16 bml0, s0, [p2, #0]; add r1, r0, #33; mov r0, r5 +; ASM-NEXT: vlda.ups.s32.s16 bml6, s0, [p2], m1; mov r0, p0 +; ASM-NEXT: vlda.ups.s32.s16 bmh7, s0, [p2, #32]; and r0, r0, r9 +; ASM-NEXT: vlda.ups.s32.s16 bml7, s0, [p2, #0]; add r1, r0, #33; mov r0, r5 ; ASM-NEXT: .p2align 4 ; ASM-NEXT: .LBB0_2: // %inner.loop ; ASM-NEXT: // Parent Loop BB0_1 Depth=1 ; ASM-NEXT: // => This Inner Loop Header: Depth=2 -; ASM-NEXT: vldb wl6, [p0], m6; nopx -; ASM-NEXT: vldb wh6, [p0], m6 -; ASM-NEXT: vldb wl8, [p0], m6 -; ASM-NEXT: vldb.3d wh8, [p0], d0 +; ASM-NEXT: vldb wl2, [p0], m6; nopx +; ASM-NEXT: vldb wh2, [p0], m6 +; ASM-NEXT: vldb wl4, [p0], m6 +; ASM-NEXT: vldb.3d wh4, [p0], d0 ; ASM-NEXT: nop +; ASM-NEXT: vldb wl6, [p1], #32 +; ASM-NEXT: vldb wh6, [p1], #32 ; ASM-NEXT: vldb wl10, [p1], #32 ; ASM-NEXT: vldb wh10, [p1], #32 -; ASM-NEXT: vldb wl7, [p1], #32 -; ASM-NEXT: vldb wh7, [p1], #32 -; ASM-NEXT: vshift.align x4, x4, s1, x6, r1 -; ASM-NEXT: vshift.align x2, x2, s1, x8, r1 -; ASM-NEXT: vshuffle x9, x4, x2, r2 -; ASM-NEXT: vshuffle x3, x4, x2, r3 -; ASM-NEXT: vmac cm1, cm1, x9, x10, r4 -; ASM-NEXT: add r0, r0, #-1; vshuffle x1, x9, x0, r8; vmac cm3, cm3, x3, x10, r4 -; ASM-NEXT: jnz r0, #.LBB0_2; vmac cm5, cm5, x9, x7, r4 -; ASM-NEXT: 
vshuffle x5, x3, x0, r8; vmac cm7, cm7, x3, x7, r4 // Delay Slot 5 -; ASM-NEXT: vmac cm2, cm2, x1, x10, r4 // Delay Slot 4 -; ASM-NEXT: mov r1, p0; vmac cm4, cm4, x5, x10, r4 // Delay Slot 3 -; ASM-NEXT: and r1, r1, r9; vmac cm6, cm6, x1, x7, r4 // Delay Slot 2 -; ASM-NEXT: add r1, r1, #33; vmac cm0, cm0, x5, x7, r4 // Delay Slot 1 +; ASM-NEXT: vshift.align x8, x8, s1, x2, r1 +; ASM-NEXT: vshift.align x1, x1, s1, x4, r1 +; ASM-NEXT: vshuffle x3, x8, x1, r2 +; ASM-NEXT: vshuffle x7, x8, x1, r3 +; ASM-NEXT: vmac cm0, cm0, x3, x6, r4 +; ASM-NEXT: add r0, r0, #-1; vshuffle x5, x3, x0, r8; vmac cm2, cm2, x7, x6, r4 +; ASM-NEXT: jnz r0, #.LBB0_2; vmac cm4, cm4, x3, x10, r4 +; ASM-NEXT: vshuffle x9, x7, x0, r8; vmac cm6, cm6, x7, x10, r4 // Delay Slot 5 +; ASM-NEXT: vmac cm1, cm1, x5, x6, r4 // Delay Slot 4 +; ASM-NEXT: mov r1, p0; vmac cm3, cm3, x9, x6, r4 // Delay Slot 3 +; ASM-NEXT: and r1, r1, r9; vmac cm5, cm5, x5, x10, r4 // Delay Slot 2 +; ASM-NEXT: add r1, r1, #33; vmac cm7, cm7, x9, x10, r4 // Delay Slot 1 ; ASM-NEXT: // %bb.3: // %outer.loop.latch ; ASM-NEXT: // in Loop: Header=BB0_1 Depth=1 -; ASM-NEXT: nopb ; nopa ; vst.srs.s16.s32 bmh1, s2, [p3, #32]; nopxm ; nopv -; ASM-NEXT: vst.srs.s16.s32 bml1, s3, [p3], #64 +; ASM-NEXT: nopb ; nopa ; vst.srs.s16.s32 bmh0, s2, [p3, #32]; nopxm ; nopv +; ASM-NEXT: vst.srs.s16.s32 bml0, s3, [p3], #64 +; ASM-NEXT: vst.srs.s16.s32 bmh1, s3, [p3, #32] +; ASM-NEXT: vst.srs.s16.s32 bml1, s3, [p3], m4 ; ASM-NEXT: vst.srs.s16.s32 bmh2, s3, [p3, #32] -; ASM-NEXT: vst.srs.s16.s32 bml2, s3, [p3], m4 +; ASM-NEXT: vst.srs.s16.s32 bml2, s3, [p3], #64 ; ASM-NEXT: vst.srs.s16.s32 bmh3, s3, [p3, #32] -; ASM-NEXT: vst.srs.s16.s32 bml3, s3, [p3], #64 +; ASM-NEXT: vst.srs.s16.s32 bml3, s3, [p3], m7 ; ASM-NEXT: vst.srs.s16.s32 bmh4, s3, [p3, #32] -; ASM-NEXT: vst.srs.s16.s32 bml4, s3, [p3], m7 -; ASM-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32] -; ASM-NEXT: vst.srs.s16.s32 bml5, s3, [p3], #64 -; ASM-NEXT: vst.srs.s16.s32 bmh6, s3, [p3, #32]; mov 
dc5, r26 -; ASM-NEXT: vst.srs.s16.s32 bml6, s3, [p3], m4; mov dn5, r27 -; ASM-NEXT: vst.srs.s16.s32 bmh7, s3, [p3, #32]; mov dj5, r28 -; ASM-NEXT: vst.srs.s16.s32 bml7, s3, [p3], #64; mov m1, r10 -; ASM-NEXT: vst.srs.s16.s32 bmh0, s3, [p3, #32]; mov m2, r13 -; ASM-NEXT: vst.2d.srs.s16.s32 bml0, s3, [p3], d5; mov dj5, r11 +; ASM-NEXT: vst.srs.s16.s32 bml4, s3, [p3], #64 +; ASM-NEXT: vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov dc5, r26 +; ASM-NEXT: vst.srs.s16.s32 bml5, s3, [p3], m4; mov dn5, r27 +; ASM-NEXT: vst.srs.s16.s32 bmh6, s3, [p3, #32]; mov dj5, r28 +; ASM-NEXT: vst.srs.s16.s32 bml6, s3, [p3], #64; mov m1, r10 +; ASM-NEXT: vst.srs.s16.s32 bmh7, s3, [p3, #32]; mov m2, r13 +; ASM-NEXT: vst.2d.srs.s16.s32 bml7, s3, [p3], d5; mov dj5, r11 ; ASM-NEXT: add r7, r7, #-1; mov dn5, r12 ; ASM-NEXT: jnz r7, #.LBB0_1 ; ASM-NEXT: mov r26, dc5 // Delay Slot 5 diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll index 63dc588fa834..4a58268d3241 100644 --- a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll +++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll @@ -106,36 +106,36 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non ; CHECK-NEXT: nopb ; nopa ; nops ; nopxm ; nopv ; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl5, bmh7; nopx ; vmin_ge.bf16 x3, r16, x3, x1; nopv ; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmax_lt.bf16 x3, r16, x3, x10; nopv -; CHECK-NEXT: nopb ; mova r0, #28; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv +; CHECK-NEXT: nopb ; mova r0, #28; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x11, r16, x5, x1; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: nopa ; nopb ; nopx ; vband x9, x8, x3; nops -; CHECK-NEXT: vmax_lt.bf16 x5, r16, x5, x10 -; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmov wh3, wl2 +; 
CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vband x9, x8, x3; nopv +; CHECK-NEXT: vldb wl7, [p0], #32; vmov wh3, wl2 +; CHECK-NEXT: nopx ; vmov wh9, wl2; vmul.f bmh5, x7, x0, r1 +; CHECK-NEXT: vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x11, x10; vmac.f bmh4, bmh0, x3, x4, r1 +; CHECK-NEXT: vband x9, x8, x5; vmul.f bmh2, x6, x9, r1 ; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh6, x7, x0, r1 -; CHECK-NEXT: vldb wl7, [p0], #32; vband x9, x8, x5; vmul.f bmh2, x7, x0, r1 -; CHECK-NEXT: vmov wh9, wl2; vmul.f bmh3, x6, x9, r1 -; CHECK-NEXT: vmac.f bmh5, bmh0, x3, x4, r1 -; CHECK-NEXT: vmul.f bmh4, x6, x9, r1 -; CHECK-NEXT: vmov wh5, wl2; vsub.f bml1, bmh6, bmh1, r0 +; CHECK-NEXT: vsub.f bml0, bmh5, bmh1, r0 +; CHECK-NEXT: vmul.f bmh3, x6, x9, r1 ; CHECK-NEXT: vmul.f bmh7, x0, x7, r1 +; CHECK-NEXT: vmov wh5, wl2; vsub.f bml1, bmh6, bmh1, r0 +; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh2; vmul.f bmh8, x0, x7, r1 ; CHECK-NEXT: vmac.f bml2, bmh0, x5, x4, r1 -; CHECK-NEXT: vconv.bf16.fp32 wl7, bmh3; vmul.f bmh8, x0, x7, r1 -; CHECK-NEXT: vsub.f bml0, bmh2, bmh1, r0 -; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh4; vmsc.f bml3, bmh5, x7, x3, r1 -; CHECK-NEXT: nop -; CHECK-NEXT: vconv.bf16.fp32 wl11, bmh7; vmsc.f bml4, bml2, x3, x5, r1 -; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32 -; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh8; vmin_ge.bf16 x3, r16, x11, x1 -; CHECK-NEXT: vst.conv.bf16.fp32 bml0, [p1], #32; vmax_lt.bf16 x3, r16, x3, x10 +; CHECK-NEXT: vmsc.f bml3, bmh4, x7, x3, r1 +; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh3 +; CHECK-NEXT: vconv.bf16.fp32 wl3, bmh7 +; CHECK-NEXT: vst.conv.bf16.fp32 bml0, [p1], #32; vmsc.f bml4, bml2, x3, x5, r1 +; CHECK-NEXT: vconv.bf16.fp32 wl5, bmh8; vmin_ge.bf16 x9, r16, x3, x1 +; CHECK-NEXT: vst.conv.bf16.fp32 bml1, [p1], #32; vmax_lt.bf16 x3, r16, x9, x10 ; CHECK-NEXT: .L_LEnd0: -; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv +; CHECK-NEXT: nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; 
vmin_ge.bf16 x11, r16, x5, x1; nopv ; CHECK-NEXT: // %bb.2: -; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wh7, wl2; nopv +; CHECK-NEXT: nopa ; nopb ; nopxm +; CHECK-NEXT: vmov wh7, wl2 ; CHECK-NEXT: vconv.bf16.fp32 wl1, bml4; vmov wh1, wl2 ; CHECK-NEXT: vmov wh6, wl2; vmul.f bmh3, x7, x0, r1 -; CHECK-NEXT: vmax_lt.bf16 x10, r16, x5, x10; vmul.f bmh2, x1, x0, r1 +; CHECK-NEXT: vmax_lt.bf16 x10, r16, x11, x10; vmul.f bmh2, x1, x0, r1 ; CHECK-NEXT: vband x1, x8, x3 ; CHECK-NEXT: vband x8, x8, x10 ; CHECK-NEXT: vmov wh1, wl2; vsub.f bmh3, bmh3, bmh1, r0 diff --git a/llvm/test/CodeGen/AIE/aie2/llc-pipeline-aie2.ll b/llvm/test/CodeGen/AIE/aie2/llc-pipeline-aie2.ll index 67d9fc16fa44..444c0b0bc709 100644 --- a/llvm/test/CodeGen/AIE/aie2/llc-pipeline-aie2.ll +++ b/llvm/test/CodeGen/AIE/aie2/llc-pipeline-aie2.ll @@ -212,6 +212,7 @@ ; AIE-O123-NEXT: AIE super-reg rewrite ; AIE-O123-NEXT: Greedy Register Allocator ; AIE-O123-NEXT: AIE waw-reg rewrite +; AIE-O123-NEXT: Greedy Register Allocator ; AIE-O123-NEXT: Virtual Register Rewriter ; AIE-O123-NEXT: Stack Slot Coloring ; AIE-O123-NEXT: AIE 1D operands to 2D/3D rewriter diff --git a/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_aggressive.mir b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_aggressive.mir new file mode 100644 index 000000000000..4dbb982b1f0b --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_aggressive.mir @@ -0,0 +1,132 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# NOTE: Example file for Write After Write Register Renaming in Loop test +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. 
or its affiliates +# unit test for the WAW register renaming pass and check edge cases +# RUN: llc -mtriple=aie2 -verify-machineinstrs --start-before=greedy --stop-after=virtregrewriter \ +# RUN: --aie-aggressive-realloc %s -o - | FileCheck %s + + +# This is a simplified example taken from the GEMM_bf16 kernel. +# We want to make sure that live-through registers are de-allocated to make +# space for loop-local registers when --aie-aggressive-realloc is passed. +--- +name: dealloc_live_through +alignment: 16 +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: dealloc_live_through + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $m0, $p0, $r0, $d1_3d, $d2_3d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $p1 = COPY $p0 + ; CHECK-NEXT: renamable $m3 = COPY $m0 + ; CHECK-NEXT: renamable $m4 = COPY $m0 + ; CHECK-NEXT: renamable $r1 = COPY $r0 + ; CHECK-NEXT: renamable $r2 = COPY $r0 + ; CHECK-NEXT: renamable $wl0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wh0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: VST_X_SPILL killed renamable $x0, %stack.0, implicit $sp :: (store (s512) into %stack.0, align 32) + ; CHECK-NEXT: renamable $wl0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wh0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: VST_X_SPILL killed renamable $x0, %stack.1, implicit $sp :: (store (s512) into %stack.1, align 32) + ; CHECK-NEXT: renamable $wl0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wh0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: VST_X_SPILL killed renamable $x0, %stack.2, implicit $sp :: (store (s512) into %stack.2, align 32) + ; CHECK-NEXT: LoopStart $r0, 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: 
%bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $m0, $m3, $m4, $p0, $p1, $r0, $r1, $r2, $d1_3d:0x000000000003C870, $d2_3d:0x000000000003C870 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $wl0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wh0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wl2, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wh2, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $p1 = PADD_mod_pseudo killed renamable $p1, renamable $m0 + ; CHECK-NEXT: renamable $wl4, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wh4, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wl6, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: $wh6, $p1, $dc2, $dc6 = VLD_3D_pseudo killed $p1, $d2_3d + ; CHECK-NEXT: renamable $x8 = VSHUFFLE renamable $x0, renamable $x4, renamable $r0 + ; CHECK-NEXT: renamable $x10 = VSHUFFLE killed renamable $x0, killed renamable $x4, renamable $r1 + ; CHECK-NEXT: renamable $x1 = VSHUFFLE renamable $x2, renamable $x6, renamable $r0 + ; CHECK-NEXT: renamable $x3 = VSHUFFLE killed renamable $x2, killed renamable $x6, renamable $r1 + ; CHECK-NEXT: renamable $wl5, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3 + ; CHECK-NEXT: renamable $wh5, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m4 + ; CHECK-NEXT: renamable $wl7, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3 + ; CHECK-NEXT: renamable $wh7, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m4 + ; CHECK-NEXT: renamable $wl9, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3 + ; CHECK-NEXT: renamable $wh9, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m4 + ; 
CHECK-NEXT: renamable $wl11, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3 + ; CHECK-NEXT: $wh11, $p0, $dc1, $dc5 = VLD_3D_pseudo killed $p0, $d1_3d + ; CHECK-NEXT: renamable $x5 = VSHUFFLE killed renamable $x5, renamable $x5, renamable $r2 + ; CHECK-NEXT: renamable $x7 = VSHUFFLE killed renamable $x7, renamable $x7, renamable $r2 + ; CHECK-NEXT: renamable $x9 = VSHUFFLE killed renamable $x9, renamable $x9, renamable $r2 + ; CHECK-NEXT: renamable $x11 = VSHUFFLE killed renamable $x11, renamable $x11, renamable $r2 + ; CHECK-NEXT: PseudoLoopEnd , %bb.1, implicit killed renamable $x8, implicit killed renamable $x10, implicit killed renamable $x1, implicit killed renamable $x3, implicit killed renamable $x5, implicit killed renamable $x7, implicit killed renamable $x9, implicit killed renamable $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: renamable $x0 = VLDA_X_SPILL %stack.0, implicit $sp :: (load (s512) from %stack.0, align 32) + ; CHECK-NEXT: renamable $x2 = VLDA_X_SPILL %stack.1, implicit $sp :: (load (s512) from %stack.1, align 32) + ; CHECK-NEXT: renamable $x4 = VLDA_X_SPILL %stack.2, implicit $sp :: (load (s512) from %stack.2, align 32) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit killed renamable $x0, implicit killed renamable $x2, implicit killed renamable $x4 + bb.0.entry: + successors: %bb.1 + liveins: $r0, $p0, $m0, $d1_3d, $d2_3d + %598:ep_as_32bit = COPY $p0 + %599:ep_as_32bit = COPY $p0 + %7:em = COPY $m0 + %13:em = COPY $m0 + %14:em = COPY $m0 + %582:eds = COPY $d1_3d + %583:eds = COPY $d2_3d + %108:er = COPY $r0 + %110:er = COPY $r0 + %20:er = COPY $r0 + undef %10.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %10.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + undef %11.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %11.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + undef %12.sub_256_lo:vec512, 
%599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %12.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + LoopStart $r0, 0 + + bb.1: + successors: %bb.1, %bb.2 + undef %338.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %338.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + undef %346.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %346.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %599:ep_as_32bit = PADD_mod_pseudo %599, %7 + undef %355.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %355.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + undef %361.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %361.sub_256_hi:vec512, %599:ep_as_32bit, %583.sub_dim_count:eds, %583.sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo %599, %583 + %369:vec512 = VSHUFFLE %338, %355, %108 + %370:vec512 = VSHUFFLE %338, %355, %110 + %371:vec512 = VSHUFFLE %346, %361, %108 + %372:vec512 = VSHUFFLE %346, %361, %110 + undef %378.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13 + %378.sub_256_hi:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %14 + undef %386.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13 + %386.sub_256_hi:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %14 + undef %394.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13 + %394.sub_256_hi:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %14 + undef %402.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13 + %402.sub_256_hi:vec512, %598:ep_as_32bit, %582.sub_dim_count:eds, %582.sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo %598, %582 + %410:vec512 = VSHUFFLE %378, %378, %20 + %411:vec512 = VSHUFFLE %386, %386, %20 + %412:vec512 = VSHUFFLE %394, %394, %20 + %413:vec512 = VSHUFFLE %402, %402, %20 + PseudoLoopEnd , %bb.1, implicit %369, implicit %370, implicit 
%371, implicit %372, implicit %410, implicit %411, implicit %412, implicit %413 + + bb.2: + PseudoRET implicit $lr, implicit %10, implicit %11, implicit %12 +... diff --git a/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_gpr.mir b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_gpr.mir new file mode 100644 index 000000000000..ea219fdcc073 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_gpr.mir @@ -0,0 +1,55 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# NOTE: Example file for Write After Write Register Renaming in Loop test +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# RUN: llc -mtriple=aie2 -verify-machineinstrs --start-before=greedy --stop-after=virtregrewriter \ +# RUN: --aie-gpr-realloc %s -o - | FileCheck %s + + +# Check general purpose registers can also be renamed. 
+--- +name: gpr_renaming +alignment: 16 +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: gpr_renaming + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $r0, $r1, $r2, $r8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: LoopStart $r0, 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $r1, $r2, $r8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $r0 = AND $r1, $r2 + ; CHECK-NEXT: renamable $r3 = AND $r1, $r8 + ; CHECK-NEXT: renamable $r4 = AND killed renamable $r0, renamable $r3 + ; CHECK-NEXT: dead renamable $r5 = AND killed renamable $r3, killed renamable $r4 + ; CHECK-NEXT: PseudoLoopEnd , %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.0.entry: + successors: %bb.1 + liveins: $r0, $r1, $r2, $r8 + LoopStart $r0, 0 + bb.1: + successors: %bb.1, %bb.2 + liveins: $r1, $r2, $r8 + %0:er = AND $r1, $r2 + %1:er = AND $r1, $r8 + %2:er = AND %0, %1 + %3:er = AND %1, %2 + PseudoLoopEnd , %bb.1 + bb.2: + PseudoRET implicit $lr +... + diff --git a/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_loop.mir b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_loop.mir index c8875a371847..9dadcec0609b 100644 --- a/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_loop.mir +++ b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_loop.mir @@ -5,7 +5,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates # unit test for the WAW register renaming pass and check edge cases # RUN: llc -mtriple=aie2 -verify-machineinstrs --start-before=greedy --stop-after=virtregrewriter %s -o - | FileCheck %s @@ -509,6 +509,110 @@ body: | PseudoRET implicit $lr ... 
+# This is a simplified example taken from the GEMM_bf16 kernel. +# When we get to the last 4 VSHUFFLEs, all 12 X registers have already been +# distributed. Those VSHUFFLEs represent kill points for other X registers. +# We should then prefer "re-using" the killed registers instead of starting +# anew from x0, x2, x4 and x6. +--- +name: def_kill_instrs +alignment: 16 +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: def_kill_instrs + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $m0, $p0, $r0, $d1_3d, $d2_3d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $p1 = COPY $p0 + ; CHECK-NEXT: renamable $m3 = COPY $m0 + ; CHECK-NEXT: renamable $m4 = COPY $m0 + ; CHECK-NEXT: renamable $r1 = COPY $r0 + ; CHECK-NEXT: renamable $r2 = COPY $r0 + ; CHECK-NEXT: LoopStart $r0, 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $m0, $m3, $m4, $p0, $p1, $r0, $r1, $r2, $d1_3d:0x000000000003C870, $d2_3d:0x000000000003C870 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $wl0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wh0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wl2, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wh2, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $p1 = PADD_mod_pseudo killed renamable $p1, renamable $m0 + ; CHECK-NEXT: renamable $wl4, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wh4, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: renamable $wl6, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32 + ; CHECK-NEXT: $wh6, $p1, $dc2, $dc6 = VLD_3D_pseudo killed $p1, $d2_3d + ; CHECK-NEXT: renamable $x8 = VSHUFFLE renamable $x0, 
renamable $x4, renamable $r0 + ; CHECK-NEXT: renamable $x10 = VSHUFFLE killed renamable $x0, killed renamable $x4, renamable $r1 + ; CHECK-NEXT: renamable $x1 = VSHUFFLE renamable $x2, renamable $x6, renamable $r0 + ; CHECK-NEXT: renamable $x3 = VSHUFFLE killed renamable $x2, killed renamable $x6, renamable $r1 + ; CHECK-NEXT: renamable $wl5, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3 + ; CHECK-NEXT: renamable $wh5, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m4 + ; CHECK-NEXT: renamable $wl7, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3 + ; CHECK-NEXT: renamable $wh7, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m4 + ; CHECK-NEXT: renamable $wl9, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3 + ; CHECK-NEXT: renamable $wh9, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m4 + ; CHECK-NEXT: renamable $wl11, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3 + ; CHECK-NEXT: $wh11, $p0, $dc1, $dc5 = VLD_3D_pseudo killed $p0, $d1_3d + ; CHECK-NEXT: renamable $x5 = VSHUFFLE killed renamable $x5, renamable $x5, renamable $r2 + ; CHECK-NEXT: renamable $x7 = VSHUFFLE killed renamable $x7, renamable $x7, renamable $r2 + ; CHECK-NEXT: renamable $x9 = VSHUFFLE killed renamable $x9, renamable $x9, renamable $r2 + ; CHECK-NEXT: renamable $x11 = VSHUFFLE killed renamable $x11, renamable $x11, renamable $r2 + ; CHECK-NEXT: PseudoLoopEnd , %bb.1, implicit killed renamable $x8, implicit killed renamable $x10, implicit killed renamable $x1, implicit killed renamable $x3, implicit killed renamable $x5, implicit killed renamable $x7, implicit killed renamable $x9, implicit killed renamable $x11 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.0.entry: + successors: %bb.1 + liveins: $r0, $p0, $m0, $d1_3d, $d2_3d + %598:ep_as_32bit = COPY $p0 + %599:ep_as_32bit = COPY $p0 + %7:em = COPY $m0 + %13:em = 
COPY $m0 + %14:em = COPY $m0 + %582:eds = COPY $d1_3d + %583:eds = COPY $d2_3d + %108:er = COPY $r0 + %110:er = COPY $r0 + %20:er = COPY $r0 + LoopStart $r0, 0 + + bb.1: + successors: %bb.1, %bb.2 + undef %338.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %338.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + undef %346.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %346.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %599:ep_as_32bit = PADD_mod_pseudo %599, %7 + undef %355.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %355.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + undef %361.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32 + %361.sub_256_hi:vec512, %599:ep_as_32bit, %583.sub_dim_count:eds, %583.sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo %599, %583 + %369:vec512 = VSHUFFLE %338, %355, %108 + %370:vec512 = VSHUFFLE %338, %355, %110 + %371:vec512 = VSHUFFLE %346, %361, %108 + %372:vec512 = VSHUFFLE %346, %361, %110 + undef %378.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13 + %378.sub_256_hi:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %14 + undef %386.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13 + %386.sub_256_hi:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %14 + undef %394.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13 + %394.sub_256_hi:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %14 + undef %402.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13 + %402.sub_256_hi:vec512, %598:ep_as_32bit, %582.sub_dim_count:eds, %582.sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo %598, %582 + %410:vec512 = VSHUFFLE %378, %378, %20 + %411:vec512 = VSHUFFLE %386, %386, %20 + %412:vec512 = VSHUFFLE %394, %394, %20 + %413:vec512 = VSHUFFLE %402, %402, %20 + PseudoLoopEnd , %bb.1, implicit %369, implicit %370, 
implicit %371, implicit %372, implicit %410, implicit %411, implicit %412, implicit %413 + + bb.2: + PseudoRET implicit $lr +... + # Ignore renaming of general purpose registers. --- name: gpr_replacement diff --git a/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll b/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll index 4acdfb6ebdbf..982e6fe361fd 100644 --- a/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll +++ b/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll @@ -235,6 +235,8 @@ ; AIE-O1-NEXT: Greedy Register Allocator ; AIE-O1-NEXT: AIE super-reg rewrite ; AIE-O1-NEXT: Greedy Register Allocator +; AIE-O1-NEXT: AIE waw-reg rewrite +; AIE-O1-NEXT: Greedy Register Allocator ; AIE-O1-NEXT: Virtual Register Rewriter ; AIE-O1-NEXT: Stack Slot Coloring ; AIE-O1-NEXT: AIE 1D operands to 2D/3D rewriter @@ -441,6 +443,8 @@ ; AIE-O23-NEXT: Greedy Register Allocator ; AIE-O23-NEXT: AIE super-reg rewrite ; AIE-O23-NEXT: Greedy Register Allocator +; AIE-O23-NEXT: AIE waw-reg rewrite +; AIE-O23-NEXT: Greedy Register Allocator ; AIE-O23-NEXT: Virtual Register Rewriter ; AIE-O23-NEXT: Stack Slot Coloring ; AIE-O23-NEXT: AIE 1D operands to 2D/3D rewriter diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/waw_reg_renaming.mir b/llvm/test/CodeGen/AIE/aie2p/ra/waw_reg_renaming.mir new file mode 100644 index 000000000000..261cd827283c --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2p/ra/waw_reg_renaming.mir @@ -0,0 +1,56 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 +# NOTE: Example file for Write After Write Register Renaming in Loop test +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates + +# Basic test for the WAW register renaming pass. Check AIE2 tests for more coverage. 
+ +# RUN: llc -mtriple=aie2p -verify-machineinstrs --start-before=greedy --stop-after=virtregrewriter %s -o - | FileCheck %s + +# Make sure VLD and VMAX define different X registers. +--- +name: simple_waw_replacement +alignment: 16 +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: simple_waw_replacement + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $p0, $p1, $r0, $x0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: LoopStart $r0, 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $d0, $d2, $p0, $p1, $p2, $x0, $d1_3d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $x2, renamable $p0 = VLDA_dmx_lda_x_pstm_nrm_imm killed renamable $p0, 64 + ; CHECK-NEXT: renamable $x4, dead renamable $r16 = VMAX_LT_32_vaddSign1 killed renamable $x2, renamable $x0, implicit $vaddsign1 + ; CHECK-NEXT: renamable $p1 = VST_dmx_sts_x_pstm_nrm_imm killed renamable $x4, killed renamable $p1, 64 + ; CHECK-NEXT: PseudoLoopEnd , %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.0.entry: + successors: %bb.1 + liveins: $r0, $p0, $p1, $x0 + %0:ep = COPY $p0 + %1:vec512 = COPY $x0 + %2:ep = COPY $p1 + LoopStart $r0, 0 + bb.1: + successors: %bb.1, %bb.2 + liveins: $p0, $p1, $p2, $d0, $d1_3d, $d2 + + %10:vec512, %0:ep = VLDA_dmx_lda_x_pstm_nrm_imm %0, 64 + %11:vec512, %12:mr16_vcompare = VMAX_LT_32_vaddSign1 %10, %1, implicit $vaddsign1 + %2:ep = VST_dmx_sts_x_pstm_nrm_imm %11, %2, 64 + PseudoLoopEnd , %bb.1 + bb.2: + PseudoRET implicit $lr +... 
diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/end-to-end.ll b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/end-to-end.ll index 8107e2e98e19..d684a5641161 100644 --- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/end-to-end.ll +++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/end-to-end.ll @@ -20,27 +20,29 @@ define <32 x i16> @zol(i32 %n, ptr %p) { ; CHECK-NEXT: add.nc lc, r0, #-7 ; CHECK-NEXT: movxm ls, #.LBB0_1 ; CHECK-NEXT: movxm le, #.L_LEnd0 -; CHECK-NEXT: nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv -; CHECK-NEXT: nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv -; CHECK-NEXT: // implicit-def: $x0 +; CHECK-NEXT: nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv +; CHECK-NEXT: nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv +; CHECK-NEXT: // implicit-def: $x2 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: // %for.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: .L_LEnd0: -; CHECK-NEXT: nopa ; vldb x2, [p0], #64; nops ; nopx ; vadd.16 x0, x2, x0; nopv +; CHECK-NEXT: nopa ; vldb x0, [p0], #64; nops ; nopx ; vadd.16 x2, x0, x2; nopv ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup -; CHECK-NEXT: nopa ; nopx ; vadd.16 x0, x2, x0 -; CHECK-NEXT: vadd.16 x0, x2, x0 -; CHECK-NEXT: vadd.16 x0, x2, x0 -; CHECK-NEXT: vadd.16 x0, x2, x0 -; CHECK-NEXT: vadd.16 x0, x2, x0 -; CHECK-NEXT: vadd.16 x0, x2, x0 -; 
CHECK-NEXT: vadd.16 x0, x2, x0 +; CHECK-NEXT: vadd.16 x2, x0, x2 +; CHECK-NEXT: vadd.16 x2, x0, x2 +; CHECK-NEXT: vadd.16 x2, x0, x2 +; CHECK-NEXT: vadd.16 x2, x0, x2 +; CHECK-NEXT: vadd.16 x2, x0, x2 +; CHECK-NEXT: vadd.16 x2, x0, x2 +; CHECK-NEXT: vadd.16 x2, x0, x2 +; CHECK-NEXT: nop +; CHECK-NEXT: vmov x0, x2 ; CHECK-NEXT: ret lr ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4