diff --git a/llvm/lib/Target/AIE/AIEWawRegRewriter.cpp b/llvm/lib/Target/AIE/AIEWawRegRewriter.cpp index 0a371119479a..550120dd01d2 100644 --- a/llvm/lib/Target/AIE/AIEWawRegRewriter.cpp +++ b/llvm/lib/Target/AIE/AIEWawRegRewriter.cpp @@ -22,24 +22,27 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/LiveStacks.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/MC/MCRegister.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" -#include using namespace llvm; #define DEBUG_TYPE "aie-waw-reg-rewrite" -namespace { +static cl::opt + RenameAllRegs("aie-waw-rename-all", cl::Hidden, cl::init(true), + cl::desc("Rename every Register, not only accumulator and " + "vector registers (default true).")); +namespace { +using PhysRegVec = SmallVector; /// /// This pass rewrites physical register assignments in critical parts of the /// code (like loops) to break WAW and WAR dependencies. 
@@ -55,11 +58,11 @@ class AIEWawRegRewriter : public MachineFunctionPass { AU.setPreservesCFG(); AU.addRequired(); AU.addPreserved(); - // no Machine Instructions are added, therefore the SlotIndexes remain + // No Machine Instructions are added, therefore the SlotIndexes remain // constant and preserved AU.addRequired(); AU.addPreserved(); - // no new Virtual Registers are generated, therefore the LiveDebugVariables + // No new Virtual Registers are generated, therefore the LiveDebugVariables // do not have to be updated AU.addRequired(); AU.addPreserved(); @@ -82,16 +85,25 @@ class AIEWawRegRewriter : public MachineFunctionPass { LiveRegMatrix *LRM = nullptr; LiveIntervals *LIS = nullptr; const TargetInstrInfo *TII = nullptr; + /// Vector to keep track which additional registers are blocked + /// by + /// a) being unused GPR registers within the MachineFunction or \ + /// b) being Callee save registers. \ + /// Unused GPR registers can be used for spill destinations, therefore + /// avoiding costly spills to memory. + PhysRegVec AdditionalBlockedRegs; bool renameMBBPhysRegs(const MachineBasicBlock *MBB); - /// Get all the defined physical registers that the MachineBasicBlock already - /// uses. These physical registers should not be used for replacement - /// candidates, since this would introduce new WAW dependencies, which this - /// pass tries to remove. - BitVector getDefinedPhysRegs(const MachineBasicBlock *MBB) const; + /// Return a Bitvector with all the defined physical registers of the + /// \param MBB. These physical registers should not be + /// used for replacement candidates, since this would introduce new WAW + /// dependencies, which this pass tries to remove. Additionally, block all the + /// physical register in \param AdditionalBlockedRegs. 
+ BitVector getBlockedPhysRegs(const MachineBasicBlock *MBB, + const PhysRegVec &AdditionalBlockedRegs) const; - /// returns true if the physical register of Reg was replaced + /// Returns true if the physical register of Reg was replaced bool replaceReg(const Register Reg, BitVector &BlockedPhysRegs); /// Find a free register of the same register class type, but @@ -104,13 +116,13 @@ class AIEWawRegRewriter : public MachineFunctionPass { bool isWorthRenaming(const Register &Reg, const BitVector &UsedPhysRegs, const BitVector &VRegWithCopies) const; - /// return the Physical register of the Register, look it up in VirtRegMap if + /// Return the Physical register of \param Reg , look it up in VirtRegMap if /// the Reg is virtual MCPhysReg getAssignedPhysReg(const Register Reg) const; bool isIdentityCopy(const MachineInstr &MI) const; - /// return a BitVector to identify if a VirtualRegister has been defined by at + /// Return a BitVector to identify if a VirtualRegister has been defined by at /// least one copy. /// The Virtual Registers are accessed by the VirtRegIndex BitVector getVRegWithCopies(const MachineBasicBlock &MBB) const; @@ -137,6 +149,27 @@ class AIEWawRegRewriter : public MachineFunctionPass { /// removed for future replacement strategies, i.e. block wl4, wh4, y2 if X4 /// is used. void addAliasRegs(BitVector &BlockedPhysRegs, const MCPhysReg PhysReg) const; + + /// \return additional blocked registers of the \p MF + PhysRegVec getAdditionalBlockedRegs(const MachineFunction &MF) const; + + /// Accumulate defined registers within the \param MBB in \param UsedRegs + void accumulateDefRegs(const MachineBasicBlock &MBB, + std::set &UsedRegs) const; + + /// Return all the GPR and alias register within \p MF + PhysRegVec getGPRAndAliasRegs(const MachineFunction &MF) const; + + /// Add Register to \p UsedRegs, if all its alias register are already used + /// and exist in \p AdditionalUsedRegs , i.e. 
if r16 and r17 are already
+  /// used, also insert their combined alias register (l0) into \p UsedRegs
+  /// since it is consequently also used.
+  /// Fixme: Work on Regunits and remove this method
+  void setAliasUsage(std::set<MCPhysReg> &UsedRegs,
+                     const PhysRegVec &AdditionalUsedRegs) const;
+
+  /// Remove callee save registers from \param UsedRegs
+  void removeCalleeSaveRegs(std::set<MCPhysReg> &UsedRegs) const;
 };
 
 MCPhysReg AIEWawRegRewriter::getAssignedPhysReg(const Register Reg) const {
@@ -168,6 +201,10 @@ bool AIEWawRegRewriter::runOnMachineFunction(MachineFunction &MF) {
   LLVM_DEBUG(dbgs() << "*** WAW Loop Register Rewriting: " << MF.getName());
   LLVM_DEBUG(dbgs() << " ***\n");
 
+  AdditionalBlockedRegs.clear();
+  if (RenameAllRegs)
+    AdditionalBlockedRegs = getAdditionalBlockedRegs(MF);
+
   for (const MachineBasicBlock *MBB : LoopMBBs)
     Modified |= renameMBBPhysRegs(MBB);
 
@@ -198,7 +235,7 @@ AIEWawRegRewriter::getVRegWithCopies(const MachineBasicBlock &MBB) const {
     }
   }
 
-  // copy to BitVector so that lookups become very cheap
+  // Copy to BitVector so that lookups become very cheap
   BitVector VRegWithCopies(MaxVReg + 1);
   for (const unsigned RegIndex : VRegs)
     VRegWithCopies[RegIndex] = true;
@@ -206,6 +243,138 @@ AIEWawRegRewriter::getVRegWithCopies(const MachineBasicBlock &MBB) const {
   return VRegWithCopies;
 }
 
+PhysRegVec
+AIEWawRegRewriter::getGPRAndAliasRegs(const MachineFunction &MF) const {
+  LLVM_DEBUG(dbgs() << "Collecting All GPR and Alias register\n");
+  PhysRegVec GPRRegs;
+
+  for (auto PhysReg : *TRI->getGPRRegClass(MF)) {
+    LLVM_DEBUG(dbgs() << "GPR check: " << printReg(PhysReg, TRI, 0, MRI)
+                      << "\n");
+    // IncludeSelf=true: PhysReg itself is collected together with its aliases.
+    for (MCRegAliasIterator AI(MCRegister(PhysReg), TRI, true); AI.isValid();
+         ++AI) {
+      auto AliasReg = *AI;
+      if (is_contained(GPRRegs, AliasReg))
+        continue;
+
+      GPRRegs.push_back(AliasReg);
+      LLVM_DEBUG(dbgs() << " Added: " << printReg(AliasReg, TRI, 0, MRI)
+                        << "\n");
+    }
+  }
+
+  return GPRRegs;
+}
+
+void AIEWawRegRewriter::setAliasUsage(
+    std::set<MCPhysReg> &UsedRegs, const PhysRegVec &AdditionalUsedRegs) const {
+
+  for (auto PhysReg : AdditionalUsedRegs) {
+    if (UsedRegs.count(PhysReg))
+      continue;
+    // Assume PhysReg is used until an unused, not-larger alias is found.
+    bool OccupiedAllAliasRegs = true;
+    for (MCRegAliasIterator AI(MCRegister(PhysReg), TRI, true); AI.isValid();
+         ++AI) {
+      auto AliasReg = *AI;
+      if (PhysReg == AliasReg)
+        continue;
+
+      // If even one sub-register of PhysReg is not used, PhysReg is also not
+      // used
+      if (TRI->getRegSizeInBits(PhysReg, *MRI) >=
+              TRI->getRegSizeInBits(AliasReg, *MRI) &&
+          UsedRegs.count(AliasReg) == 0) {
+        OccupiedAllAliasRegs = false;
+        break;
+      }
+    }
+
+    // If all the smaller register classes of PhysReg are used, PhysReg is also
+    // used, even if it is not directly used
+    if (OccupiedAllAliasRegs) {
+      LLVM_DEBUG(dbgs() << "All subregisters are used, so alias is also used: "
+                        << printReg(PhysReg, TRI, 0, MRI) << "\n");
+      UsedRegs.insert(PhysReg);
+    }
+  }
+}
+
+void AIEWawRegRewriter::accumulateDefRegs(const MachineBasicBlock &MBB,
+                                          std::set<MCPhysReg> &UsedRegs) const {
+  for (const MachineInstr &MI : MBB) {
+    for (const MachineOperand &OP : MI.defs()) {
+      MCPhysReg PhysReg = getAssignedPhysReg(OP.getReg());
+      if (!MCRegister::isPhysicalRegister(PhysReg))
+        continue;
+
+      UsedRegs.insert(PhysReg);
+      LLVM_DEBUG(dbgs() << "Encountered " << printReg(PhysReg, TRI, 0, MRI)
+                        << " \n");
+    }
+  }
+}
+
+/// Erase callee-saved regs and aliases from \p UsedRegs so they stay blocked
+void AIEWawRegRewriter::removeCalleeSaveRegs(
+    std::set<MCPhysReg> &UsedRegs) const {
+  LLVM_DEBUG(dbgs() << "Removing Calle Save Regs from UsedRegs:\n");
+  const auto *CSRegs = MRI->getCalleeSavedRegs();
+
+  for (unsigned I = 0; CSRegs[I]; ++I) {
+    for (MCRegAliasIterator AI(MCRegister(CSRegs[I]), TRI, true); AI.isValid();
+         ++AI) {
+      auto AliasReg = *AI;
+      if (!UsedRegs.count(AliasReg)) {
+        LLVM_DEBUG(dbgs() << "Already blocked: "
+                          << printReg(AliasReg, TRI, 0, MRI) << " \n");
+        continue;
+      }
+      // A used CSR must not stay in UsedRegs, otherwise it would be unblocked.
+      UsedRegs.erase(AliasReg);
+      LLVM_DEBUG(dbgs() << "Erasing " << printReg(AliasReg, TRI, 0, MRI)
+                        << "\n");
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "\n");
+}
+
+PhysRegVec
+AIEWawRegRewriter::getAdditionalBlockedRegs(const MachineFunction &MF) const {
+  // Only GPR registers have to be handled specially, since callee save
+  // registers should not be used for renaming (could cause spilling) and unused
+  // gpr registers can be used for register spilling to regs instead of memory.
+  // Spilling to regs is more performant, therefore block these unused registers
+  // for register renaming
+  LLVM_DEBUG(dbgs() << "Setting Additional Blocked Registers\n");
+
+  PhysRegVec GPRRegs = getGPRAndAliasRegs(MF);
+
+  std::set<MCPhysReg> UsedRegs;
+  for (const MachineBasicBlock &MBB : MF)
+    accumulateDefRegs(MBB, UsedRegs);
+
+  setAliasUsage(UsedRegs, GPRRegs);
+
+  removeCalleeSaveRegs(UsedRegs);
+
+  // Get unused GPR registers by removing the UsedRegs from
+  // the set of all available GPR Registers
+  GPRRegs.erase(std::remove_if(
+                    GPRRegs.begin(), GPRRegs.end(),
+                    [&UsedRegs](MCPhysReg Reg) { return UsedRegs.count(Reg); }),
+                GPRRegs.end());
+
+  LLVM_DEBUG(dbgs() << "Permanently blocking: "; for (auto Reg
+                                                       : GPRRegs) {
+    dbgs() << printReg(Reg, TRI, 0, MRI) << " ";
+  } dbgs() << "\n");
+
+  return GPRRegs;
+}
+
 bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) {
   LLVM_DEBUG(dbgs() << "WAW Reg Renaming BasicBlock "; MBB->dump();
              dbgs() << "\n");
@@ -218,7 +387,7 @@ bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) {
   // Get a list of registers, that are not allowed as a replacement register.
   // This list gets updated with the newly replaced physical register, so that
   // this pass does not introduce WAW dependencies.
-  BitVector BlockedPhysRegs = getDefinedPhysRegs(MBB);
+  BitVector BlockedPhysRegs = getBlockedPhysRegs(MBB, AdditionalBlockedRegs);
 
   // Collect all the virtual registers that have at least a copy instruction
   // that defines them.
Subregisters may contain constants that may be shared @@ -246,7 +415,7 @@ bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) { continue; if (MO.isTied()) continue; - // several definitions of the same virtual register are not relevant + // Several definitions of the same virtual register are not relevant // because even if the virtual register is renamed, by construction // all the definitions would be renamed as well and achieve nothing wrt // WAW dependecy resolution @@ -287,14 +456,15 @@ bool AIEWawRegRewriter::isWorthRenaming(const Register &Reg, if (!UsedPhysRegs[VRM->getPhys(Reg)]) return false; - if (!TRI->isVecOrAccRegClass(*(MRI->getRegClass(Reg)))) + if (!RenameAllRegs && !TRI->isVecOrAccRegClass(*(MRI->getRegClass(Reg)))) return false; return !VRegWithCopies[Reg.virtRegIndex()]; } -BitVector -AIEWawRegRewriter::getDefinedPhysRegs(const MachineBasicBlock *MBB) const { +BitVector AIEWawRegRewriter::getBlockedPhysRegs( + const MachineBasicBlock *MBB, + const PhysRegVec &AdditionalBlockedRegs) const { BitVector BlockedPhysRegs(TRI->getNumRegs()); for (const MachineInstr &MI : *MBB) { @@ -305,6 +475,13 @@ AIEWawRegRewriter::getDefinedPhysRegs(const MachineBasicBlock *MBB) const { } } + // Do not allow additional blocked regs for register replacement + LLVM_DEBUG(dbgs() << "Additionally blocking: \n"); + for (auto PhysReg : AdditionalBlockedRegs) { + addAliasRegs(BlockedPhysRegs, PhysReg); + } + LLVM_DEBUG(dbgs() << "\n"); + return BlockedPhysRegs; } @@ -312,7 +489,7 @@ bool AIEWawRegRewriter::replaceReg(const Register Reg, BitVector &BlockedPhysRegs) { assert(Reg.isVirtual()); LLVM_DEBUG(dbgs() << " WAW RegRewriter: Register to replace " - << TRI->getName(VRM->getPhys(Reg)) << "\n"); + << printReg(Reg, TRI, 0, MRI) << "\n"); MCPhysReg ReplacementPhysReg = getReplacementPhysReg(Reg, BlockedPhysRegs); @@ -320,8 +497,7 @@ bool AIEWawRegRewriter::replaceReg(const Register Reg, return false; LLVM_DEBUG(dbgs() << " WAW Replacement: Virtual 
Register " - << printReg(VRM->getPhys(Reg), TRI, 0, MRI) - << " will replace " + << printReg(Reg, TRI, 0, MRI) << " will replace " << printReg(VRM->getPhys(Reg), TRI, 0, MRI) << " with " << printReg(ReplacementPhysReg, TRI, 0, MRI) << '\n'); @@ -347,6 +523,8 @@ MCPhysReg AIEWawRegRewriter::getReplacementPhysReg( if (IK == LiveRegMatrix::IK_Free) return PhysReg; } + LLVM_DEBUG(dbgs() << "No free register found for " + << printReg(Reg, TRI, 0, MRI) << "\n"); return MCRegister::NoRegister; } diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll b/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll index 2915bbb173ef..2fd17ed4b371 100644 --- a/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/legalize-dyn-stackalloc.ll @@ -77,19 +77,19 @@ define void @test_loop_dyn_alloca(i32 noundef %n) { ; CHECK-NEXT: nopa ; nopx ; mov p6, sp ; CHECK-NEXT: mov p1, sp ; CHECK-NEXT: lshl r0, r17, r19 -; CHECK-NEXT: add r0, r0, #31 +; CHECK-NEXT: add r1, r0, #31 ; CHECK-NEXT: jl #extern_call ; CHECK-NEXT: mov p0, p1 // Delay Slot 5 -; CHECK-NEXT: and r0, r0, r20 // Delay Slot 4 -; CHECK-NEXT: mov m0, r0 // Delay Slot 3 +; CHECK-NEXT: and r2, r1, r20 // Delay Slot 4 +; CHECK-NEXT: mov m0, r2 // Delay Slot 3 ; CHECK-NEXT: paddb [p1], m0 // Delay Slot 2 ; CHECK-NEXT: mov sp, p1 // Delay Slot 1 ; CHECK-NEXT: nopa ; nopb ; add r17, r17, #1; nopm ; nops -; CHECK-NEXT: ltu r0, r17, r16 -; CHECK-NEXT: add r21, r21, r0 -; CHECK-NEXT: xor r0, r17, r18 -; CHECK-NEXT: or r0, r0, r21 -; CHECK-NEXT: jnz r0, #.LBB1_1 +; CHECK-NEXT: ltu r3, r17, r16 +; CHECK-NEXT: xor r4, r17, r18 +; CHECK-NEXT: add r21, r21, r3 +; CHECK-NEXT: or r5, r4, r21 +; CHECK-NEXT: jnz r5, #.LBB1_1 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 diff --git a/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll b/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll index 337fecd1e4bd..a98049d72d9e 
100644 --- a/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll +++ b/llvm/test/CodeGen/AIE/aie2/dyn-stackalloc.ll @@ -77,19 +77,19 @@ define void @test_loop_dyn_alloca(i32 noundef %n) { ; CHECK-NEXT: nopa ; nopx ; mov p6, sp ; CHECK-NEXT: mov p1, sp ; CHECK-NEXT: lshl r0, r17, r19 -; CHECK-NEXT: add r0, r0, #31 +; CHECK-NEXT: add r1, r0, #31 ; CHECK-NEXT: jl #extern_call ; CHECK-NEXT: mov p0, p1 // Delay Slot 5 -; CHECK-NEXT: and r0, r0, r20 // Delay Slot 4 -; CHECK-NEXT: mov m0, r0 // Delay Slot 3 +; CHECK-NEXT: and r2, r1, r20 // Delay Slot 4 +; CHECK-NEXT: mov m0, r2 // Delay Slot 3 ; CHECK-NEXT: paddb [p1], m0 // Delay Slot 2 ; CHECK-NEXT: mov sp, p1 // Delay Slot 1 ; CHECK-NEXT: nopa ; nopb ; add r17, r17, #1; nopm ; nops -; CHECK-NEXT: ltu r0, r17, r16 -; CHECK-NEXT: add r21, r21, r0 -; CHECK-NEXT: xor r0, r17, r18 -; CHECK-NEXT: or r0, r0, r21 -; CHECK-NEXT: jnz r0, #.LBB1_1 +; CHECK-NEXT: ltu r3, r17, r16 +; CHECK-NEXT: xor r4, r17, r18 +; CHECK-NEXT: add r21, r21, r3 +; CHECK-NEXT: or r5, r4, r21 +; CHECK-NEXT: jnz r5, #.LBB1_1 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 diff --git a/llvm/test/CodeGen/AIE/aie2/hardware-loops/nested.ll b/llvm/test/CodeGen/AIE/aie2/hardware-loops/nested.ll index 66977e1274ca..d7582c409444 100644 --- a/llvm/test/CodeGen/AIE/aie2/hardware-loops/nested.ll +++ b/llvm/test/CodeGen/AIE/aie2/hardware-loops/nested.ll @@ -30,14 +30,14 @@ define void @nested(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 ; CHECK-NEXT: nopb ; nopa ; nops ; lshl r7, r6, r4; nopm ; nopv ; CHECK-NEXT: mov dj0, r7 -; CHECK-NEXT: lda r7, [p3, dj0] +; CHECK-NEXT: lda r8, [p3, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: jnzd r5, r5, p2 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: add r6, r6, #1 // Delay Slot 3 -; CHECK-NEXT: add r2, r2, r7 // Delay Slot 2 +; CHECK-NEXT: add r2, r2, r8 // Delay 
Slot 2 ; CHECK-NEXT: st r2, [p0, #0] // Delay Slot 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: // %bb.2: // %for.cond3.for.cond.cleanup5_crit_edge diff --git a/llvm/test/CodeGen/AIE/aie2/hardware-loops/sibling.ll b/llvm/test/CodeGen/AIE/aie2/hardware-loops/sibling.ll index 5e1501961f7a..0cecd744965b 100644 --- a/llvm/test/CodeGen/AIE/aie2/hardware-loops/sibling.ll +++ b/llvm/test/CodeGen/AIE/aie2/hardware-loops/sibling.ll @@ -23,14 +23,14 @@ define void @sibling(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: nopb ; nopa ; nops ; lshl r6, r5, r4; nopm ; nopv ; CHECK-NEXT: mov dj0, r6 -; CHECK-NEXT: lda r6, [p1, dj0] +; CHECK-NEXT: lda r7, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: jnzd r0, r0, p2 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: add r5, r5, #1 // Delay Slot 3 -; CHECK-NEXT: add r3, r3, r6 // Delay Slot 2 +; CHECK-NEXT: add r3, r3, r7 // Delay Slot 2 ; CHECK-NEXT: st r3, [p0, #0] // Delay Slot 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: // %bb.2: // %for.body6.lr.ph @@ -43,14 +43,14 @@ define void @sibling(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: nopb ; nopa ; nops ; lshl r4, r2, r3; nopm ; nopv ; CHECK-NEXT: mov dj0, r4 -; CHECK-NEXT: lda r4, [p1, dj0] +; CHECK-NEXT: lda r5, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: jnzd r1, r1, p2 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: add r2, r2, #1 // Delay Slot 3 -; CHECK-NEXT: add r0, r0, r4 // Delay Slot 2 +; CHECK-NEXT: add r0, r0, r5 // Delay Slot 2 ; CHECK-NEXT: st r0, [p0, #0] // Delay Slot 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: // %bb.4: // %for.cond.cleanup5 diff --git a/llvm/test/CodeGen/AIE/aie2/hardware-loops/simple.ll b/llvm/test/CodeGen/AIE/aie2/hardware-loops/simple.ll index 7168d4023e37..9be9a2b14b2b 100644 --- 
a/llvm/test/CodeGen/AIE/aie2/hardware-loops/simple.ll +++ b/llvm/test/CodeGen/AIE/aie2/hardware-loops/simple.ll @@ -22,14 +22,14 @@ define void @simple(ptr nocapture %out, ptr nocapture readonly %in, i32 noundef ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: nopb ; nopa ; nops ; lshl r4, r2, r3; nopm ; nopv ; CHECK-NEXT: mov dj0, r4 -; CHECK-NEXT: lda r4, [p1, dj0] +; CHECK-NEXT: lda r5, [p1, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: jnzd r0, r0, p2 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: add r2, r2, #1 // Delay Slot 3 -; CHECK-NEXT: add r1, r1, r4 // Delay Slot 2 +; CHECK-NEXT: add r1, r1, r5 // Delay Slot 2 ; CHECK-NEXT: st r1, [p0, #0] // Delay Slot 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: // %bb.2: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AIE/aie2/hardware-loops/unknown-tc.ll b/llvm/test/CodeGen/AIE/aie2/hardware-loops/unknown-tc.ll index 22a430126228..9e7ff4260629 100644 --- a/llvm/test/CodeGen/AIE/aie2/hardware-loops/unknown-tc.ll +++ b/llvm/test/CodeGen/AIE/aie2/hardware-loops/unknown-tc.ll @@ -19,14 +19,14 @@ define void @cbz_exit(ptr %in, ptr %res) { ; CHECK-NEXT: nopa ; nopb ; add r0, r0, #1 ; CHECK-NEXT: lshl r2, r0, r1 ; CHECK-NEXT: mov dj0, r2 -; CHECK-NEXT: lda r2, [p0, dj0] +; CHECK-NEXT: lda r3, [p0, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: jnz r2, #.LBB0_1 +; CHECK-NEXT: jnz r3, #.LBB0_1 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 @@ -67,14 +67,14 @@ define void @cbnz_exit(ptr %in, ptr %res) { ; CHECK-NEXT: nopa ; nopb ; add r0, r0, #1 ; CHECK-NEXT: lshl r2, r0, r1 ; CHECK-NEXT: mov dj0, r2 -; CHECK-NEXT: lda r2, [p0, dj0] +; CHECK-NEXT: lda r3, [p0, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop -; CHECK-NEXT: jz r2, #.LBB1_1 +; CHECK-NEXT: jz r3, #.LBB1_1 ; 
CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 diff --git a/llvm/test/CodeGen/AIE/aie2/hardware-loops/zol-loop.ll b/llvm/test/CodeGen/AIE/aie2/hardware-loops/zol-loop.ll index 2eea3c89b871..f11ca8b03d54 100644 --- a/llvm/test/CodeGen/AIE/aie2/hardware-loops/zol-loop.ll +++ b/llvm/test/CodeGen/AIE/aie2/hardware-loops/zol-loop.ll @@ -42,9 +42,9 @@ define void @simple_loop(i32 noundef %n, ptr nocapture readonly %in, ptr nocaptu ; CHECK-NEXT: nop ; CHECK-NEXT: lshl r4, r1, r0 ; CHECK-NEXT: add r1, r1, #1 -; CHECK-NEXT: add r3, r2, r3; mov dj0, r4 +; CHECK-NEXT: add r5, r2, r3; mov dj0, r4 ; CHECK-NEXT: .L_LEnd0: -; CHECK-NEXT: nopb ; nopa ; st r3, [p1, dj0]; add r2, r2, #-1; nopm ; nopv +; CHECK-NEXT: nopb ; nopa ; st r5, [p1, dj0]; add r2, r2, #-1; nopm ; nopv ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup ; CHECK-NEXT: nopa ; ret lr diff --git a/llvm/test/CodeGen/AIE/aie2/loop.ll b/llvm/test/CodeGen/AIE/aie2/loop.ll index cdedd49d48bf..fbde84aabc0c 100644 --- a/llvm/test/CodeGen/AIE/aie2/loop.ll +++ b/llvm/test/CodeGen/AIE/aie2/loop.ll @@ -26,20 +26,20 @@ define i32 @accumulate(i32 %size, ptr %array) { ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: lshl r3, r2, r1 ; CHECK-NEXT: mov dj0, r3 -; CHECK-NEXT: lda r3, [p0, dj0] +; CHECK-NEXT: lda r5, [p0, dj0] ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: nop ; CHECK-NEXT: add r2, r2, #1 -; CHECK-NEXT: eq r4, r2, r3 +; CHECK-NEXT: eq r4, r2, r5 ; CHECK-NEXT: jz r4, #.LBB0_2 ; CHECK-NEXT: nop // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: add r0, r3, r0 // Delay Slot 1 +; CHECK-NEXT: add r0, r5, r0 // Delay Slot 1 ; CHECK-NEXT: // %bb.3: // %for.cond.cleanup ; CHECK-NEXT: nopa ; ret lr ; CHECK-NEXT: nop // Delay Slot 5 diff --git a/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_loop.mir 
b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_loop.mir index c8875a371847..45a719fa982d 100644 --- a/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_loop.mir +++ b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_loop.mir @@ -7,7 +7,7 @@ # # (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates # unit test for the WAW register renaming pass and check edge cases -# RUN: llc -mtriple=aie2 -verify-machineinstrs --start-before=greedy --stop-after=virtregrewriter %s -o - | FileCheck %s +# RUN: llc -mtriple=aie2 -verify-machineinstrs --start-before=greedy --stop-after=virtregrewriter --aie-waw-rename-all=1 %s -o - | FileCheck %s # Basic check if %7:mxm will be renamed ($x0 -> $x4) and will not use the another @@ -529,8 +529,8 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: renamable $r0 = AND $r1, $r2 ; CHECK-NEXT: renamable $r3 = AND $r1, $r8 - ; CHECK-NEXT: renamable $r0 = AND killed renamable $r0, renamable $r3 - ; CHECK-NEXT: dead renamable $r0 = AND killed renamable $r3, killed renamable $r0 + ; CHECK-NEXT: renamable $r4 = AND killed renamable $r0, renamable $r3 + ; CHECK-NEXT: dead renamable $r5 = AND killed renamable $r3, killed renamable $r4 ; CHECK-NEXT: PseudoLoopEnd , %bb.1 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: @@ -548,6 +548,195 @@ body: | %3:er = AND %1, %2 PseudoLoopEnd , %bb.1 bb.2: + liveins: PseudoRET implicit $lr ... +# Rename %2 with an already used, free GPR register. 
+--- +name: reuse_gpr_renaming +alignment: 16 +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: reuse_gpr_renaming + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $r0, $r1, $r2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $r3 = AND $r1, $r2 + ; CHECK-NEXT: LoopStart $r0, 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: liveins: $r1, $r2, $r3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $r0 = AND $r1, $r2 + ; CHECK-NEXT: renamable $r4 = AND $r1, renamable $r3 + ; CHECK-NEXT: renamable $r5 = AND renamable $r0, renamable $r4 + ; CHECK-NEXT: dead renamable $r4 = AND killed renamable $r4, killed renamable $r5 + ; CHECK-NEXT: $r8 = AND killed renamable $r3, killed renamable $r0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) + ; CHECK-NEXT: liveins: $r1, $r2, $r8 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $r0 = AND $r1, $r2 + ; CHECK-NEXT: renamable $r3 = AND $r1, $r8 + ; CHECK-NEXT: renamable $r4 = ADD killed renamable $r0, renamable $r3, implicit-def dead $srcarry + ; CHECK-NEXT: dead renamable $r5 = ADD killed renamable $r3, killed renamable $r4, implicit-def dead $srcarry + ; CHECK-NEXT: PseudoLoopEnd , %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.0.entry: + successors: %bb.1 + liveins: $r0, $r1, $r2 + %0:er = AND $r1, $r2 + LoopStart $r0, 0 + + bb.1: + successors: %bb.2 + liveins: $r1, $r2 + %1:er = AND $r1, $r2 + %2:er = AND $r1, %0 + %3:er = AND %1, %2 + %4:er = AND %2, %3 + $r8 = AND %0, %1 + + bb.2: + successors: %bb.2, %bb.3 + liveins: $r1, $r2, $r8 + %5:er = AND $r1, $r2 + %6:er = AND $r1, $r8 + %7:er = ADD %5, %6, implicit-def dead $srcarry + %8:er = ADD %6, %7, implicit-def dead $srcarry + PseudoLoopEnd , %bb.2 + bb.3: + PseudoRET implicit $lr +... 
+ + +# check that %10:el is replaced, since long registers are already used in the function +--- +name: long_replacement +alignment: 16 +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: long_replacement + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $d0, $d2, $p0, $p1, $p2, $r0, $x2, $d1_3d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $x4, $l4 = VMAX_LT_D8 $x2, $x2, implicit $crvaddsign + ; CHECK-NEXT: $x4, $l5 = VMAX_LT_D8 $x2, $x2, implicit $crvaddsign + ; CHECK-NEXT: LoopStart $r0, 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $d0, $d2, $p0, $p1, $p2, $d1_3d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $p3 = COPY $p0 + ; CHECK-NEXT: renamable $d3 = COPY $d0 + ; CHECK-NEXT: $wl0, dead $p3, dead $dc3 = VLDA_2D_dmw_lda_w killed $p3, $d3 + ; CHECK-NEXT: renamable $p3 = COPY $p1 + ; CHECK-NEXT: renamable $d3_3d = COPY $d1_3d + ; CHECK-NEXT: $wl2, dead $p3, dead $dc3, dead $dc7 = VLDA_3D_dmw_lda_w killed $p3, $d3_3d + ; CHECK-NEXT: renamable $x4, dead renamable $l4 = VMAX_LT_D8 renamable $x0, renamable $x2, implicit $crvaddsign + ; CHECK-NEXT: dead renamable $x6, dead renamable $l5 = VMAX_LT_D8 renamable $x0, renamable $x2, implicit $crvaddsign + ; CHECK-NEXT: renamable $p3 = COPY $p2 + ; CHECK-NEXT: renamable $d3 = COPY $d2 + ; CHECK-NEXT: dead $p3, dead $dc3 = VST_2D_dmw_sts_w killed $wl4, killed $p3, $d3 + ; CHECK-NEXT: PseudoLoopEnd , %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.0.entry: + successors: %bb.1 + liveins: $r0, $p0, $p1, $p2, $d0, $d1_3d, $d2, $x2 + $x4, $l4 = VMAX_LT_D8 $x2, $x2, implicit $crvaddsign + $x4, $l5 = VMAX_LT_D8 $x2, $x2, implicit $crvaddsign + + LoopStart $r0, 0 + bb.1: + successors: %bb.1, %bb.2 + liveins: $p0, $p1, $p2, $d0, $d1_3d, $d2 + %110:ep = COPY $p0 + %111:ed = COPY $d0 + undef %0.sub_256_lo:mxa, %110:ep, 
%111.sub_dim_count:ed = VLDA_2D_dmw_lda_w %110, killed %111 + %99:ep = COPY $p1 + %100:eds = COPY $d1_3d + undef %3.sub_256_lo:mxa, %99:ep, %100.sub_dim_count:eds, %100.sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_dmw_lda_w killed %99, killed %100 + %7:mxm, %8:el = VMAX_LT_D8 %0, %3, implicit $crvaddsign + %9:mxm, %10:el = VMAX_LT_D8 %0, %3, implicit $crvaddsign + %120:ep = COPY $p2 + %121:ed = COPY $d2 + %120:ep, %121.sub_dim_count:ed = VST_2D_dmw_sts_w %7.sub_256_lo:mxm, killed %120, killed %121 + PseudoLoopEnd , %bb.1 + bb.2: + PseudoRET implicit $lr +... + +# check that %10:el is renamed, even though only the underlining GPR registers were used +--- +name: defGPRsAllowsLongs +alignment: 16 +legalized: true +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: defGPRsAllowsLongs + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $d0, $d2, $p0, $p1, $p2, $r0, $r1, $r2, $r8, $d1_3d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $r30 = AND $r1, $r2 + ; CHECK-NEXT: $r31 = AND $r1, $r8 + ; CHECK-NEXT: dead renamable $r1 = AND $r30, $r31 + ; CHECK-NEXT: LoopStart $r0, 0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: liveins: $d0, $d2, $p0, $p1, $p2, $d1_3d + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: renamable $p3 = COPY $p0 + ; CHECK-NEXT: renamable $d3 = COPY $d0 + ; CHECK-NEXT: $wl0, dead $p3, dead $dc3 = VLDA_2D_dmw_lda_w killed $p3, $d3 + ; CHECK-NEXT: renamable $p3 = COPY $p1 + ; CHECK-NEXT: renamable $d3_3d = COPY $d1_3d + ; CHECK-NEXT: $wl2, dead $p3, dead $dc3, dead $dc7 = VLDA_3D_dmw_lda_w killed $p3, $d3_3d + ; CHECK-NEXT: renamable $x4, dead renamable $l4 = VMAX_LT_D8 renamable $x0, renamable $x2, implicit $crvaddsign + ; CHECK-NEXT: dead renamable $x6, dead renamable $l7 = VMAX_LT_D8 renamable $x0, renamable $x2, implicit $crvaddsign + ; CHECK-NEXT: renamable $p3 = COPY $p2 + ; CHECK-NEXT: renamable $d3 = COPY $d2 + ; CHECK-NEXT: dead $p3, dead 
$dc3 = VST_2D_dmw_sts_w killed $wl4, killed $p3, $d3 + ; CHECK-NEXT: PseudoLoopEnd , %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: PseudoRET implicit $lr + bb.0.entry: + successors: %bb.1 + liveins: $r0, $p0, $p1, $p2, $d0, $d1_3d, $d2, $r1, $r2, $r8 + $r30 = AND $r1, $r2 + $r31 = AND $r1, $r8 + %2:er = AND $r30, $r31 + + LoopStart $r0, 0 + bb.1: + successors: %bb.1, %bb.2 + liveins: $p0, $p1, $p2, $d0, $d1_3d, $d2 + %110:ep = COPY $p0 + %111:ed = COPY $d0 + undef %0.sub_256_lo:mxa, %110:ep, %111.sub_dim_count:ed = VLDA_2D_dmw_lda_w %110, killed %111 + %99:ep = COPY $p1 + %100:eds = COPY $d1_3d + undef %3.sub_256_lo:mxa, %99:ep, %100.sub_dim_count:eds, %100.sub_hi_dim_then_sub_dim_count:eds = VLDA_3D_dmw_lda_w killed %99, killed %100 + %7:mxm, %8:el = VMAX_LT_D8 %0, %3, implicit $crvaddsign + %9:mxm, %10:el = VMAX_LT_D8 %0, %3, implicit $crvaddsign + %120:ep = COPY $p2 + %121:ed = COPY $d2 + %120:ep, %121.sub_dim_count:ed = VST_2D_dmw_sts_w %7.sub_256_lo:mxm, killed %120, killed %121 + PseudoLoopEnd , %bb.1 + bb.2: + PseudoRET implicit $lr +...