diff --git a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp
index c43a7b095d0c..e451f3c94e3f 100644
--- a/llvm/lib/Target/AIE/AIE2TargetMachine.cpp
+++ b/llvm/lib/Target/AIE/AIE2TargetMachine.cpp
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates
+// (c) Copyright 2023-2025 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 //
@@ -27,10 +27,6 @@ cl::opt<bool>
     EnableSubregRenaming("aie-subreg-renaming", cl::Hidden, cl::init(false),
                          cl::desc("Enable RenameIndependentSubregs pass"));
 
-static cl::opt<bool>
-    EnableWAWRegRewrite("aie-wawreg-rewrite",
-                        cl::desc("Enable the WAW Register Renaming in loops"),
-                        cl::init(true), cl::Hidden);
 static cl::opt<bool>
     EnableReservedRegsLICM("aie-reserved-regs-licm", cl::Hidden, cl::init(true),
                            cl::desc("Enable LICM for some reserved registers"));
@@ -45,6 +41,7 @@ extern cl::opt<bool> EnableStagedRA;
 extern cl::opt<bool> EnableSuperRegSplitting;
 extern cl::opt<bool> AllocateMRegsFirst;
 extern cl::opt<bool> EnablePreMISchedCoalescer;
+extern cl::opt<bool> EnableWAWRegRewrite;
 
 extern bool AIEDumpArtifacts;
 
@@ -161,8 +158,10 @@ bool AIE2PassConfig::addRegAssignAndRewriteOptimized() {
     addPass(createAIESuperRegRewriter());
   }
   addPass(createGreedyRegisterAllocator());
-  if (EnableWAWRegRewrite)
+  if (EnableWAWRegRewrite) {
     addPass(createAIEWawRegRewriter());
+    addPass(createGreedyRegisterAllocator());
+  }
   addPass(createVirtRegRewriter());
 
   return true;
diff --git a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp
index 53888fc9c79f..dbd15a05eeb0 100644
--- a/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp
+++ b/llvm/lib/Target/AIE/AIEBaseTargetMachine.cpp
@@ -77,6 +77,11 @@ cl::opt<bool>
     EnableStagedRA("aie-staged-ra", cl::Hidden, cl::init(true),
                    cl::desc("Enable multi-stage register allocation"));
 
+cl::opt<bool>
+    EnableWAWRegRewrite("aie-wawreg-rewrite",
+                        cl::desc("Enable the WAW Register Renaming in loops"),
+                        cl::init(true), cl::Hidden);
+
 cl::opt<bool>
     EnableSuperRegSplitting("aie-split-superregs", cl::Hidden, cl::init(true),
                             cl::desc("Enable splitting super-regs into their "
diff --git a/llvm/lib/Target/AIE/AIEWawRegRewriter.cpp b/llvm/lib/Target/AIE/AIEWawRegRewriter.cpp
index 0a371119479a..546164cf17b4 100644
--- a/llvm/lib/Target/AIE/AIEWawRegRewriter.cpp
+++ b/llvm/lib/Target/AIE/AIEWawRegRewriter.cpp
@@ -4,7 +4,7 @@
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
-// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+// (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
 //
 //===----------------------------------------------------------------------===//
 //
@@ -17,6 +17,7 @@
 #include "AIEBaseRegisterInfo.h"
 #include "Utils/AIELoopUtils.h"
 
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/LiveDebugVariables.h"
 #include "llvm/CodeGen/LiveIntervals.h"
@@ -28,6 +29,7 @@
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
 #include "llvm/MC/MCRegister.h"
 #include "llvm/Support/Debug.h"
@@ -38,8 +40,17 @@ using namespace llvm;
 
 #define DEBUG_TYPE "aie-waw-reg-rewrite"
 
+static cl::opt<bool> AggressiveReAlloc(
+    "aie-aggressive-realloc", cl::Hidden, cl::init(false),
+    cl::desc("Aggressively de-allocate live-through registers to favor "
+             "loop-local registers"));
+static cl::opt<bool> GPRRealloc("aie-gpr-realloc", cl::Hidden, cl::init(false),
+                                cl::desc("Re-allocate GPRs as well"));
+
 namespace {
 
+using RoundRobin = std::list<MCPhysReg>;
+
 ///
 /// This pass rewrites physical register assignments in critical parts of the
 /// code (like loops) to break WAW and WAR dependencies.
@@ -91,17 +102,19 @@ class AIEWawRegRewriter : public MachineFunctionPass {
   /// pass tries to remove.
   BitVector getDefinedPhysRegs(const MachineBasicBlock *MBB) const;
 
-  /// returns true if the physical register of Reg was replaced
-  bool replaceReg(const Register Reg, BitVector &BlockedPhysRegs);
+  /// Returns true if the physical register \p Reg was replaced
+  bool replaceReg(const Register Reg, RoundRobin &Registers,
+                  BitVector &UsedUnits);
+
+  void unassignReg(Register Reg);
+  void assignReg(Register Reg, MCPhysReg PhysReg);
 
-  /// Find a free register of the same register class type, but
-  /// exclude the blocked physical registers from the result.
-  /// Otherwise a new WAW dependencies can be introduced, that was previously
-  /// removed.
-  MCPhysReg getReplacementPhysReg(const Register Reg,
-                                  const BitVector &BlockedPhysRegs) const;
+  /// Find a free register of the same register class type
+  MCPhysReg getReplacementPhysReg(const Register Reg, RoundRobin &Registers,
+                                  BitVector &UsedUnits) const;
 
-  bool isWorthRenaming(const Register &Reg, const BitVector &UsedPhysRegs,
+  /// Whether \p Reg should be considered a candidate for re-assignment.
+  bool isWorthRenaming(const Register &Reg,
                        const BitVector &VRegWithCopies) const;
 
   /// return the Physical register of the Register, look it up in VirtRegMap if
@@ -132,11 +145,6 @@ class AIEWawRegRewriter : public MachineFunctionPass {
   /// instruction.
   IndexedMap<const MachineInstr *, VirtReg2IndexFunctor>
   getLastVRegDef(const MachineBasicBlock &MBB) const;
-
-  /// Block every sub- and super-register of a physical register, so that it is
-  /// removed for future replacement strategies, i.e. block wl4, wh4, y2 if X4
-  /// is used.
-  void addAliasRegs(BitVector &BlockedPhysRegs, const MCPhysReg PhysReg) const;
 };
 
 MCPhysReg AIEWawRegRewriter::getAssignedPhysReg(const Register Reg) const {
@@ -209,16 +217,6 @@ AIEWawRegRewriter::getVRegWithCopies(const MachineBasicBlock &MBB) const {
 bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) {
   LLVM_DEBUG(dbgs() << "WAW Reg Renaming BasicBlock "; MBB->dump();
              dbgs() << "\n");
-  bool Modified = false;
-  // Add the used physical registers one machine instruction at a time.
-  // This vector is used to determine, if a physical register has already been
-  // defined in the machine basic block.
-  BitVector UsedPhysRegs(TRI->getNumRegs());
-
-  // Get a list of registers, that are not allowed as a replacement register.
-  // This list gets updated with the newly replaced physical register, so that
-  // this pass does not introduce WAW dependencies.
-  BitVector BlockedPhysRegs = getDefinedPhysRegs(MBB);
 
   // Collect all the virtual registers that have at least a copy instruction
   // that defines them. Subregisters may contain constants that may be shared
@@ -230,15 +228,18 @@ bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) {
   IndexedMap<const MachineInstr *, VirtReg2IndexFunctor> LastVRegDef =
       getLastVRegDef(*MBB);
 
-  for (const MachineInstr &MI : *MBB) {
+  // Record the candidates and their original allocation
+  using OriginalAllocation =
+      std::vector<std::pair<const MachineOperand *, Register>>;
+  OriginalAllocation Candidates;
 
+  for (const MachineInstr &MI : *MBB) {
     // Identity copies will be removed in a later pass, therefore, these are not
     // real defines of a physical register
     if (isIdentityCopy(MI))
       continue;
 
     for (const MachineOperand &MO : MI.defs()) {
-
       Register Reg = MO.getReg();
       if (!Reg.isVirtual())
         continue;
@@ -249,103 +250,252 @@ bool AIEWawRegRewriter::renameMBBPhysRegs(const MachineBasicBlock *MBB) {
       // several definitions of the same virtual register are not relevant
       // because even if the virtual register is renamed, by construction
       // all the definitions would be renamed as well and achieve nothing wrt
-      // WAW dependecy resolution
+      // WAW dependency resolution
       if (LastVRegDef[Reg] != &MI)
         continue;
 
-      if (isWorthRenaming(Reg, UsedPhysRegs, VRegWithCopies) &&
-          replaceReg(Reg, BlockedPhysRegs)) {
-
-        LLVM_DEBUG(dbgs() << MI);
-        Modified = true;
-
-      } else {
-        // Keep track of already visited physical registers.
-        // Incrementally add the encountered physical registers, so that a
-        // second occurrence of a physical register can trigger the register
-        // rewriting
-        // Blocked registers for the replacement are recorded in
-        // BlockedPhysRegs. Initially all the used physical registers
-        // from the MBB are blocked, so that replacements do not introduce WAW
-        // dependencies. Additionally, replaced registers are already blocked in
-        // BlockedPhysRegs, so that an additional replacement will not cause a
-        // WAW, which this pass is trying to remove.
-        UsedPhysRegs[VRM->getPhys(Reg)] = true;
+      if (isWorthRenaming(Reg, VRegWithCopies)) {
+        assert(VRM->hasPhys(Reg));
+        MCRegister AssignedPhysReg = VRM->getPhys(Reg);
+        Candidates.emplace_back(&MO, AssignedPhysReg);
+        LLVM_DEBUG(dbgs() << "Candidate " << printReg(Reg, TRI, 0, MRI) << ":"
+                          << TRI->getRegClassName(MRI->getRegClass(Reg)) << " ("
+                          << TRI->getName(AssignedPhysReg) << ")\n");
       }
     }
   }
 
-  return Modified;
+  // Free physregs of all candidates and register their regclasses
+  std::set<const TargetRegisterClass *> RegClasses;
+  for (auto &[MO, Org] : Candidates) {
+    auto VReg = MO->getReg();
+    if (VRM->hasPhys(VReg))
+      unassignReg(VReg);
+    auto *RC = MRI->getRegClass(VReg);
+    RegClasses.insert(RC);
+  }
+  LLVM_DEBUG(dbgs() << "Renaming " << Candidates.size() << " candidates in "
+                    << RegClasses.size() << " classes\n");
+
+  // If requested, unassign MBB's liveins as well to get even more freedom
+  if (AggressiveReAlloc) {
+    for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
+      Register Reg = Register::index2VirtReg(I);
+      if (!LIS->hasInterval(Reg) || !RegClasses.count(MRI->getRegClass(Reg)))
+        continue;
+      LiveInterval &LI = LIS->getInterval(Reg);
+      if (LIS->isLiveInToMBB(LI, MBB) && VRM->hasPhys(Reg)) {
+        unassignReg(Reg);
+      }
+    }
+  }
+
+  // Assign a fresh physical register to every virtual register in
+  // Candidates. Stops at the first failure and returns false in that case.
+  auto ReAllocate = [&](OriginalAllocation &Candidates, RoundRobin &Registers) {
+    // Register units handed out so far by this re-allocation round.
+    BitVector UsedUnits(TRI->getNumRegUnits());
+    for (auto &[MO, Org] : Candidates) {
+      const Register VReg = MO->getReg();
+      if (replaceReg(VReg, Registers, UsedUnits))
+        continue;
+      LLVM_DEBUG(dbgs() << "Renaming " << printReg(VReg, TRI, 0, MRI)
+                        << " failed\n");
+      return false;
+    }
+    return true;
+  };
+
+  // Restore the allocation recorded in Candidates for every candidate vreg.
+  auto RevertAllocation = [&](OriginalAllocation &Candidates) {
+    // Phase 1: drop whatever the failed attempt assigned, since a partial
+    // re-allocation may overlap with the original assignment of another vreg.
+    for (auto &Entry : Candidates) {
+      const Register VReg = Entry.first->getReg();
+      if (VRM->hasPhys(VReg))
+        unassignReg(VReg);
+    }
+    // Phase 2: all candidates are free now; put the recorded registers back.
+    for (auto &Entry : Candidates) {
+      const Register VReg = Entry.first->getReg();
+      assignReg(VReg, Entry.second);
+    }
+  };
+
+  // Least-Recently-Used list of physical registers for assignments to VRegs.
+  // Physical registers that have recently been used are moved to the back.
+  std::list<MCPhysReg> LRURegisters;
+
+  // For each reg class, allocate the candidates in round-robin fashion.
+  // If we fail, we fall back to the original allocation
+  BitVector ExcludedPhysRegs{TRI->getNumRegs()};
+
+  // Exclude CSRs
+  for (const MCPhysReg *CSR = MRI->getCalleeSavedRegs(); CSR && *CSR; ++CSR)
+    ExcludedPhysRegs[*CSR] = true;
+
+  for (const auto *RC : RegClasses) {
+
+    LLVM_DEBUG(dbgs() << "Allowed registers in RC=" << TRI->getRegClassName(RC)
+                      << ":");
+    for (MCPhysReg PhysReg : RC->getRegisters()) {
+      if (!ExcludedPhysRegs[PhysReg]) {
+        LLVM_DEBUG(dbgs() << " " << printReg(PhysReg, TRI));
+        LRURegisters.push_back(PhysReg);
+      }
+      ExcludedPhysRegs[PhysReg] = true;
+    }
+    LLVM_DEBUG(dbgs() << "\n");
+  }
+  if (!ReAllocate(Candidates, LRURegisters)) {
+    RevertAllocation(Candidates);
+    return false;
+  }
+
+  return true;
 }
 
 bool AIEWawRegRewriter::isWorthRenaming(const Register &Reg,
-                                        const BitVector &UsedPhysRegs,
                                         const BitVector &VRegWithCopies) const {
   assert(Reg.isVirtual());
 
-  // Only rename registers mapped to a phys reg assigned more than once
-  if (!UsedPhysRegs[VRM->getPhys(Reg)])
+  // The register might have been de-allocated when processing another loop.
+  if (!VRM->hasPhys(Reg))
     return false;
 
-  if (!TRI->isVecOrAccRegClass(*(MRI->getRegClass(Reg))))
+  // Only consider vec/acc registers as candidates, and optionally GPRs.
+  bool IsCandidateClass =
+      TRI->isVecOrAccRegClass(*(MRI->getRegClass(Reg))) ||
+      (GPRRealloc &&
+       TRI->getGPRRegClass(*MF)->hasSubClassEq(MRI->getRegClass(Reg)));
+  if (!IsCandidateClass)
     return false;
 
   return !VRegWithCopies[Reg.virtRegIndex()];
 }
 
-BitVector
-AIEWawRegRewriter::getDefinedPhysRegs(const MachineBasicBlock *MBB) const {
-  BitVector BlockedPhysRegs(TRI->getNumRegs());
+/// Remove the current physical register assignment of \p VReg.
+void AIEWawRegRewriter::unassignReg(Register VReg) {
+  LRM->unassign(LIS->getInterval(VReg));
+}
 
-  for (const MachineInstr &MI : *MBB) {
-    for (const MachineOperand &Op : MI.defs()) {
-      MCPhysReg PhysReg = getAssignedPhysReg(Op.getReg());
-      if (MCRegister::isPhysicalRegister(PhysReg))
-        addAliasRegs(BlockedPhysRegs, PhysReg);
-    }
+void AIEWawRegRewriter::assignReg(Register VReg, MCPhysReg PhysReg) {
+  const LiveInterval &LI = LIS->getInterval(VReg);
+  if (VRM->hasPhys(VReg)) {
+    LRM->unassign(LI);
   }
-
-  return BlockedPhysRegs;
+  LRM->assign(LI, PhysReg);
 }
 
-bool AIEWawRegRewriter::replaceReg(const Register Reg,
-                                   BitVector &BlockedPhysRegs) {
-  assert(Reg.isVirtual());
-  LLVM_DEBUG(dbgs() << " WAW RegRewriter: Register to replace "
-                    << TRI->getName(VRM->getPhys(Reg)) << "\n");
-
-  MCPhysReg ReplacementPhysReg = getReplacementPhysReg(Reg, BlockedPhysRegs);
+bool AIEWawRegRewriter::replaceReg(const Register VReg,
+                                   RoundRobin &LRURegisters,
+                                   BitVector &UsedUnits) {
+  assert(VReg.isVirtual());
+  MCPhysReg ReplacementPhysReg =
+      getReplacementPhysReg(VReg, LRURegisters, UsedUnits);
 
   if (ReplacementPhysReg == MCRegister::NoRegister)
     return false;
 
-  LLVM_DEBUG(dbgs() << "     WAW Replacement: Virtual Register "
-                    << printReg(VRM->getPhys(Reg), TRI, 0, MRI)
-                    << " will replace "
-                    << printReg(VRM->getPhys(Reg), TRI, 0, MRI) << " with "
-                    << printReg(ReplacementPhysReg, TRI, 0, MRI) << '\n');
+  LLVM_DEBUG(dbgs() << "     replace: " << printReg(VReg, TRI) << " with "
+                    << TRI->getName(ReplacementPhysReg) << '\n');
+  assert(Register::isPhysicalRegister(ReplacementPhysReg));
 
-  const LiveInterval &LI = LIS->getInterval(Reg);
-  LRM->unassign(LI);
-  LRM->assign(LI, ReplacementPhysReg);
-  addAliasRegs(BlockedPhysRegs, ReplacementPhysReg);
+  assignReg(VReg, ReplacementPhysReg);
   return true;
 }
 
-MCPhysReg AIEWawRegRewriter::getReplacementPhysReg(
-    const Register Reg, const BitVector &BlockedPhysRegs) const {
-  assert(Reg.isVirtual() && "Reg has to be a virtual register");
-  const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+/// Returns a vreg of the same class as \p VReg whose only (non-debug) user is
+/// the instruction holding the single definition of \p VReg, if there is one.
+std::optional<Register>
+getKilledRegAtSingledDefPoint(Register VReg, const MachineRegisterInfo &MRI) {
+  MachineOperand *MO = MRI.getOneDef(VReg);
+  if (!MO)
+    return std::nullopt;
+
+  MachineInstr &DefMI = *MO->getParent();
+  // Debug uses must not influence allocation decisions, so skip them.
+  auto OnlyUsedByInstr = [&MRI](Register Reg, const MachineInstr &MI) {
+    return all_of(MRI.use_nodbg_instructions(Reg),
+                  [&MI](const MachineInstr &UseMI) { return &UseMI == &MI; });
+  };
+
+  for (MachineOperand &UseMO : DefMI.explicit_uses()) {
+    if (UseMO.isReg() && UseMO.getReg().isVirtual() &&
+        MRI.getRegClass(VReg) == MRI.getRegClass(UseMO.getReg()) &&
+        OnlyUsedByInstr(UseMO.getReg(), DefMI))
+      return UseMO.getReg();
+  }
+  return std::nullopt;
+}
+
+/// Move \p PhysReg and all its aliases to the back (most recent) of the list.
+void moveRegAndAliasesBack(MCPhysReg PhysReg, RoundRobin &LRURegisters,
+                           const TargetRegisterInfo *TRI) {
+  for (MCRegAliasIterator AI(MCRegister(PhysReg), TRI, true); AI.isValid();
+       ++AI) {
+    // TODO: Use hints to speed up the search of aliases?
+    auto AliasIt = llvm::find(LRURegisters, *AI);
+    // Relink the existing node rather than erasing and re-inserting it.
+    if (AliasIt != LRURegisters.end())
+      LRURegisters.splice(LRURegisters.end(), LRURegisters, AliasIt);
+  }
+}
+
+/// Pick a physical register for \p VReg from \p LRURegisters, preferring the
+/// least-recently used one that does not interfere. The chosen register and
+/// its aliases are moved to the back of the list, and its register units are
+/// recorded in \p UsedUnits. Returns NoRegister if every candidate interferes.
+MCPhysReg AIEWawRegRewriter::getReplacementPhysReg(const Register VReg,
+                                                   RoundRobin &LRURegisters,
+                                                   BitVector &UsedUnits) const {
+  assert(VReg.isVirtual() && "Reg has to be a virtual register");
 
-  LiveInterval &LI = LIS->getInterval(Reg);
-  for (const MCPhysReg &PhysReg : RC->getRegisters()) {
+  /// Whether \p PhysReg was ever used for re-assigning a vreg
+  auto WasUsedForReassignment = [TRI = this->TRI,
+                                 &UsedUnits](MCPhysReg PhysReg) {
+    return any_of(TRI->regunits(PhysReg),
+                  [&UsedUnits](MCRegUnit RU) { return UsedUnits.test(RU); });
+  };
 
-    if (BlockedPhysRegs[PhysReg])
-      continue;
+  LLVM_DEBUG(dbgs() << "     Try to re-assign " << printReg(VReg, TRI) << "\n");
+  const TargetRegisterClass *RC = MRI->getRegClass(VReg);
+  const LiveInterval &LI = LIS->getInterval(VReg);
 
+  // Find the least-recently assigned register to assign to VReg.
+  for (MCPhysReg PhysReg : LRURegisters) {
+    if (!RC->contains(PhysReg))
+      continue;
     LiveRegMatrix::InterferenceKind IK = LRM->checkInterference(LI, PhysReg);
-    if (IK == LiveRegMatrix::IK_Free)
+    if (IK == LiveRegMatrix::IK_Free) {
+      // If the chosen physical register has already been used and the vreg to
+      // allocate is defined at a point where another vreg gets killed, prefer
+      // reusing the assignment of the killed reg.
+      if (std::optional<Register> KilledReg =
+              getKilledRegAtSingledDefPoint(VReg, *MRI);
+          KilledReg && WasUsedForReassignment(PhysReg)) {
+        MCRegister KilledPhysReg = getAssignedPhysReg(*KilledReg);
+        // Only reuse a register from the allowed pool: the killed vreg may
+        // have been assigned a register that was filtered out of the pool.
+        if (KilledPhysReg && llvm::is_contained(LRURegisters, KilledPhysReg) &&
+            LRM->checkInterference(LI, KilledPhysReg) ==
+                LiveRegMatrix::IK_Free) {
+          LLVM_DEBUG(dbgs() << "     re-use killed physreg for assigning: "
+                            << printReg(VReg, TRI) << " to "
+                            << TRI->getName(KilledPhysReg) << '\n');
+          PhysReg = KilledPhysReg;
+        }
+      }
+      // Mark the register and its aliases as most recently used. Reordering
+      // the list while iterating is fine because we return right away.
+      moveRegAndAliasesBack(PhysReg, LRURegisters, TRI);
+      for (MCRegUnit RU : TRI->regunits(PhysReg))
+        UsedUnits.set(RU);
       return PhysReg;
+    }
+    LLVM_DEBUG(dbgs() << "       Cannot assign " << printReg(VReg, TRI)
+                      << " to " << TRI->getName(PhysReg)
+                      << " due to interference\n");
   }
   return MCRegister::NoRegister;
 }
@@ -379,20 +529,6 @@ AIEWawRegRewriter::getLastVRegDef(const MachineBasicBlock &MBB) const {
   return LastVRegDef;
 }
 
-void AIEWawRegRewriter::addAliasRegs(BitVector &BlockedPhysRegs,
-                                     const MCPhysReg PhysReg) const {
-  assert(MCRegister::isPhysicalRegister(PhysReg));
-
-  LLVM_DEBUG(dbgs() << "Adding to Blocked Regs ("
-                    << printReg(PhysReg, TRI, 0, MRI) << ") with alias: ");
-  for (MCRegAliasIterator AI(MCRegister(PhysReg), TRI, true); AI.isValid();
-       ++AI) {
-    BlockedPhysRegs[*AI] = true;
-    LLVM_DEBUG(dbgs() << printReg(*AI, TRI, 0, MRI) << " ");
-  }
-  LLVM_DEBUG(dbgs() << "\n");
-}
-
 } // end anonymous namespace
 
 char AIEWawRegRewriter::ID = 0;
diff --git a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp
index 7ea7bc8ab9cf..ab0158f21334 100644
--- a/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp
+++ b/llvm/lib/Target/AIE/aie2p/AIE2PTargetMachine.cpp
@@ -22,6 +22,7 @@ extern cl::opt<bool> EnableSuperRegSplitting;
 extern cl::opt<bool> AllocateMRegsFirst;
 extern cl::opt<bool> EnablePreMISchedCoalescer;
 extern cl::opt<bool> EnableAddressChaining;
+extern cl::opt<bool> EnableWAWRegRewrite;
 
 void AIE2PTargetMachine::anchor() {}
 
@@ -102,6 +103,10 @@ bool AIE2PPassConfig::addRegAssignAndRewriteOptimized() {
     addPass(createAIESuperRegRewriter());
   }
   addPass(createGreedyRegisterAllocator());
+  if (EnableWAWRegRewrite) {
+    addPass(createAIEWawRegRewriter());
+    addPass(createGreedyRegisterAllocator());
+  }
   addPass(createVirtRegRewriter());
 
   return true;
diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll
index 7af1e07aa841..981c2f53d80a 100644
--- a/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll
+++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/Conv2D-red.ll
@@ -88,71 +88,71 @@ define dso_local void @conv2d.loop.nest(ptr %add.ptr6.i51, ptr %add.ptr5, ptr %c
 ; ASM-NEXT:    lda r24, [p6, #0]; paddb [p7], #-128; mov p6, sp
 ; ASM-NEXT:    lda m7, [p7, #0]; paddb [p6], #-124; movx r8, #11; mov dc7, dj3
 ; ASM-NEXT:    lda m4, [p6, #0]; movx r9, #31; mov r26, dj3
-; ASM-NEXT:    // implicit-def: $x4
-; ASM-NEXT:    // implicit-def: $x2
+; ASM-NEXT:    // implicit-def: $x8
+; ASM-NEXT:    // implicit-def: $x1
 ; ASM-NEXT:    .p2align 4
 ; ASM-NEXT:  .LBB0_1: // %outer.loop.header
 ; ASM-NEXT:    // =>This Loop Header: Depth=1
 ; ASM-NEXT:    // Child Loop BB0_2 Depth 2
-; ASM-NEXT:    vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m1, p4
-; ASM-NEXT:    vlda.ups.s32.s16 bml1, s0, [p2], m1
-; ASM-NEXT:    vlda.ups.s32.s16 bmh2, s0, [p2, #32]; mov m2, p5
-; ASM-NEXT:    vlda.ups.s32.s16 bml2, s0, [p2], m2
-; ASM-NEXT:    vlda.ups.s32.s16 bmh3, s0, [p2, #32]
-; ASM-NEXT:    vlda.ups.s32.s16 bml3, s0, [p2], m1
-; ASM-NEXT:    vlda.ups.s32.s16 bmh4, s0, [p2, #32]; mov m3, r15
-; ASM-NEXT:    vlda.ups.s32.s16 bml4, s0, [p2], m3
+; ASM-NEXT:    vlda.ups.s32.s16 bmh0, s0, [p2, #32]; mov m1, p4
+; ASM-NEXT:    vlda.ups.s32.s16 bml0, s0, [p2], m1
+; ASM-NEXT:    vlda.ups.s32.s16 bmh1, s0, [p2, #32]; mov m2, p5
+; ASM-NEXT:    vlda.ups.s32.s16 bml1, s0, [p2], m2
+; ASM-NEXT:    vlda.ups.s32.s16 bmh2, s0, [p2, #32]
+; ASM-NEXT:    vlda.ups.s32.s16 bml2, s0, [p2], m1
+; ASM-NEXT:    vlda.ups.s32.s16 bmh3, s0, [p2, #32]; mov m3, r15
+; ASM-NEXT:    vlda.ups.s32.s16 bml3, s0, [p2], m3
+; ASM-NEXT:    vlda.ups.s32.s16 bmh4, s0, [p2, #32]
+; ASM-NEXT:    vlda.ups.s32.s16 bml4, s0, [p2], m1
 ; ASM-NEXT:    vlda.ups.s32.s16 bmh5, s0, [p2, #32]
-; ASM-NEXT:    vlda.ups.s32.s16 bml5, s0, [p2], m1
+; ASM-NEXT:    vlda.ups.s32.s16 bml5, s0, [p2], m2
 ; ASM-NEXT:    vlda.ups.s32.s16 bmh6, s0, [p2, #32]
-; ASM-NEXT:    vlda.ups.s32.s16 bml6, s0, [p2], m2
-; ASM-NEXT:    vlda.ups.s32.s16 bmh7, s0, [p2, #32]
-; ASM-NEXT:    vlda.ups.s32.s16 bml7, s0, [p2], m1; mov r0, p0
-; ASM-NEXT:    vlda.ups.s32.s16 bmh0, s0, [p2, #32]; and r0, r0, r9
-; ASM-NEXT:    vlda.ups.s32.s16 bml0, s0, [p2, #0]; add r1, r0, #33; mov r0, r5
+; ASM-NEXT:    vlda.ups.s32.s16 bml6, s0, [p2], m1; mov r0, p0
+; ASM-NEXT:    vlda.ups.s32.s16 bmh7, s0, [p2, #32]; and r0, r0, r9
+; ASM-NEXT:    vlda.ups.s32.s16 bml7, s0, [p2, #0]; add r1, r0, #33; mov r0, r5
 ; ASM-NEXT:    .p2align 4
 ; ASM-NEXT:  .LBB0_2: // %inner.loop
 ; ASM-NEXT:    // Parent Loop BB0_1 Depth=1
 ; ASM-NEXT:    // => This Inner Loop Header: Depth=2
-; ASM-NEXT:    vldb wl6, [p0], m6; nopx
-; ASM-NEXT:    vldb wh6, [p0], m6
-; ASM-NEXT:    vldb wl8, [p0], m6
-; ASM-NEXT:    vldb.3d wh8, [p0], d0
+; ASM-NEXT:    vldb wl2, [p0], m6; nopx
+; ASM-NEXT:    vldb wh2, [p0], m6
+; ASM-NEXT:    vldb wl4, [p0], m6
+; ASM-NEXT:    vldb.3d wh4, [p0], d0
 ; ASM-NEXT:    nop
+; ASM-NEXT:    vldb wl6, [p1], #32
+; ASM-NEXT:    vldb wh6, [p1], #32
 ; ASM-NEXT:    vldb wl10, [p1], #32
 ; ASM-NEXT:    vldb wh10, [p1], #32
-; ASM-NEXT:    vldb wl7, [p1], #32
-; ASM-NEXT:    vldb wh7, [p1], #32
-; ASM-NEXT:    vshift.align x4, x4, s1, x6, r1
-; ASM-NEXT:    vshift.align x2, x2, s1, x8, r1
-; ASM-NEXT:    vshuffle x9, x4, x2, r2
-; ASM-NEXT:    vshuffle x3, x4, x2, r3
-; ASM-NEXT:    vmac cm1, cm1, x9, x10, r4
-; ASM-NEXT:    add r0, r0, #-1; vshuffle x1, x9, x0, r8; vmac cm3, cm3, x3, x10, r4
-; ASM-NEXT:    jnz r0, #.LBB0_2; vmac cm5, cm5, x9, x7, r4
-; ASM-NEXT:    vshuffle x5, x3, x0, r8; vmac cm7, cm7, x3, x7, r4 // Delay Slot 5
-; ASM-NEXT:    vmac cm2, cm2, x1, x10, r4 // Delay Slot 4
-; ASM-NEXT:    mov r1, p0; vmac cm4, cm4, x5, x10, r4 // Delay Slot 3
-; ASM-NEXT:    and r1, r1, r9; vmac cm6, cm6, x1, x7, r4 // Delay Slot 2
-; ASM-NEXT:    add r1, r1, #33; vmac cm0, cm0, x5, x7, r4 // Delay Slot 1
+; ASM-NEXT:    vshift.align x8, x8, s1, x2, r1
+; ASM-NEXT:    vshift.align x1, x1, s1, x4, r1
+; ASM-NEXT:    vshuffle x3, x8, x1, r2
+; ASM-NEXT:    vshuffle x7, x8, x1, r3
+; ASM-NEXT:    vmac cm0, cm0, x3, x6, r4
+; ASM-NEXT:    add r0, r0, #-1; vshuffle x5, x3, x0, r8; vmac cm2, cm2, x7, x6, r4
+; ASM-NEXT:    jnz r0, #.LBB0_2; vmac cm4, cm4, x3, x10, r4
+; ASM-NEXT:    vshuffle x9, x7, x0, r8; vmac cm6, cm6, x7, x10, r4 // Delay Slot 5
+; ASM-NEXT:    vmac cm1, cm1, x5, x6, r4 // Delay Slot 4
+; ASM-NEXT:    mov r1, p0; vmac cm3, cm3, x9, x6, r4 // Delay Slot 3
+; ASM-NEXT:    and r1, r1, r9; vmac cm5, cm5, x5, x10, r4 // Delay Slot 2
+; ASM-NEXT:    add r1, r1, #33; vmac cm7, cm7, x9, x10, r4 // Delay Slot 1
 ; ASM-NEXT:  // %bb.3: // %outer.loop.latch
 ; ASM-NEXT:    // in Loop: Header=BB0_1 Depth=1
-; ASM-NEXT:    nopb ; nopa ; vst.srs.s16.s32 bmh1, s2, [p3, #32]; nopxm ; nopv
-; ASM-NEXT:    vst.srs.s16.s32 bml1, s3, [p3], #64
+; ASM-NEXT:    nopb ; nopa ; vst.srs.s16.s32 bmh0, s2, [p3, #32]; nopxm ; nopv
+; ASM-NEXT:    vst.srs.s16.s32 bml0, s3, [p3], #64
+; ASM-NEXT:    vst.srs.s16.s32 bmh1, s3, [p3, #32]
+; ASM-NEXT:    vst.srs.s16.s32 bml1, s3, [p3], m4
 ; ASM-NEXT:    vst.srs.s16.s32 bmh2, s3, [p3, #32]
-; ASM-NEXT:    vst.srs.s16.s32 bml2, s3, [p3], m4
+; ASM-NEXT:    vst.srs.s16.s32 bml2, s3, [p3], #64
 ; ASM-NEXT:    vst.srs.s16.s32 bmh3, s3, [p3, #32]
-; ASM-NEXT:    vst.srs.s16.s32 bml3, s3, [p3], #64
+; ASM-NEXT:    vst.srs.s16.s32 bml3, s3, [p3], m7
 ; ASM-NEXT:    vst.srs.s16.s32 bmh4, s3, [p3, #32]
-; ASM-NEXT:    vst.srs.s16.s32 bml4, s3, [p3], m7
-; ASM-NEXT:    vst.srs.s16.s32 bmh5, s3, [p3, #32]
-; ASM-NEXT:    vst.srs.s16.s32 bml5, s3, [p3], #64
-; ASM-NEXT:    vst.srs.s16.s32 bmh6, s3, [p3, #32]; mov dc5, r26
-; ASM-NEXT:    vst.srs.s16.s32 bml6, s3, [p3], m4; mov dn5, r27
-; ASM-NEXT:    vst.srs.s16.s32 bmh7, s3, [p3, #32]; mov dj5, r28
-; ASM-NEXT:    vst.srs.s16.s32 bml7, s3, [p3], #64; mov m1, r10
-; ASM-NEXT:    vst.srs.s16.s32 bmh0, s3, [p3, #32]; mov m2, r13
-; ASM-NEXT:    vst.2d.srs.s16.s32 bml0, s3, [p3], d5; mov dj5, r11
+; ASM-NEXT:    vst.srs.s16.s32 bml4, s3, [p3], #64
+; ASM-NEXT:    vst.srs.s16.s32 bmh5, s3, [p3, #32]; mov dc5, r26
+; ASM-NEXT:    vst.srs.s16.s32 bml5, s3, [p3], m4; mov dn5, r27
+; ASM-NEXT:    vst.srs.s16.s32 bmh6, s3, [p3, #32]; mov dj5, r28
+; ASM-NEXT:    vst.srs.s16.s32 bml6, s3, [p3], #64; mov m1, r10
+; ASM-NEXT:    vst.srs.s16.s32 bmh7, s3, [p3, #32]; mov m2, r13
+; ASM-NEXT:    vst.2d.srs.s16.s32 bml7, s3, [p3], d5; mov dj5, r11
 ; ASM-NEXT:    add r7, r7, #-1; mov dn5, r12
 ; ASM-NEXT:    jnz r7, #.LBB0_1
 ; ASM-NEXT:    mov r26, dc5 // Delay Slot 5
diff --git a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll
index 63dc588fa834..4a58268d3241 100644
--- a/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll
+++ b/llvm/test/CodeGen/AIE/aie2/end-to-end/TanhTemplated-swp.ll
@@ -106,36 +106,36 @@ define dso_local void @TanhTemplated(ptr noalias %ifm, ptr noalias %ofm, ptr non
 ; CHECK-NEXT:    nopb ; nopa ; nops ; nopxm ; nopv
 ; CHECK-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl5, bmh7; nopx ; vmin_ge.bf16 x3, r16, x3, x1; nopv
 ; CHECK-NEXT:    nopb ; nopa ; nops ; nopx ; vmax_lt.bf16 x3, r16, x3, x10; nopv
-; CHECK-NEXT:    nopb ; mova r0, #28; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
+; CHECK-NEXT:    nopb ; mova r0, #28; vconv.bf16.fp32 wl7, bmh3; nopx ; vmin_ge.bf16 x11, r16, x5, x1; nopv
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    nopa ; nopb ; nopx ; vband x9, x8, x3; nops
-; CHECK-NEXT:    vmax_lt.bf16 x5, r16, x5, x10
-; CHECK-NEXT:    vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmov wh3, wl2
+; CHECK-NEXT:    nopb ; nopa ; nops ; nopx ; vband x9, x8, x3; nopv
+; CHECK-NEXT:    vldb wl7, [p0], #32; vmov wh3, wl2
+; CHECK-NEXT:    nopx ; vmov wh9, wl2; vmul.f bmh5, x7, x0, r1
+; CHECK-NEXT:    vconv.bf16.fp32 wl7, bml4; vldb wl7, [p0], #32; vmax_lt.bf16 x5, r16, x11, x10; vmac.f bmh4, bmh0, x3, x4, r1
+; CHECK-NEXT:    vband x9, x8, x5; vmul.f bmh2, x6, x9, r1
 ; CHECK-NEXT:    vmov wh9, wl2; vmul.f bmh6, x7, x0, r1
-; CHECK-NEXT:    vldb wl7, [p0], #32; vband x9, x8, x5; vmul.f bmh2, x7, x0, r1
-; CHECK-NEXT:    vmov wh9, wl2; vmul.f bmh3, x6, x9, r1
-; CHECK-NEXT:    vmac.f bmh5, bmh0, x3, x4, r1
-; CHECK-NEXT:    vmul.f bmh4, x6, x9, r1
-; CHECK-NEXT:    vmov wh5, wl2; vsub.f bml1, bmh6, bmh1, r0
+; CHECK-NEXT:    vsub.f bml0, bmh5, bmh1, r0
+; CHECK-NEXT:    vmul.f bmh3, x6, x9, r1
 ; CHECK-NEXT:    vmul.f bmh7, x0, x7, r1
+; CHECK-NEXT:    vmov wh5, wl2; vsub.f bml1, bmh6, bmh1, r0
+; CHECK-NEXT:    vconv.bf16.fp32 wl7, bmh2; vmul.f bmh8, x0, x7, r1
 ; CHECK-NEXT:    vmac.f bml2, bmh0, x5, x4, r1
-; CHECK-NEXT:    vconv.bf16.fp32 wl7, bmh3; vmul.f bmh8, x0, x7, r1
-; CHECK-NEXT:    vsub.f bml0, bmh2, bmh1, r0
-; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh4; vmsc.f bml3, bmh5, x7, x3, r1
-; CHECK-NEXT:    nop
-; CHECK-NEXT:    vconv.bf16.fp32 wl11, bmh7; vmsc.f bml4, bml2, x3, x5, r1
-; CHECK-NEXT:    vst.conv.bf16.fp32 bml1, [p1], #32
-; CHECK-NEXT:    vconv.bf16.fp32 wl5, bmh8; vmin_ge.bf16 x3, r16, x11, x1
-; CHECK-NEXT:    vst.conv.bf16.fp32 bml0, [p1], #32; vmax_lt.bf16 x3, r16, x3, x10
+; CHECK-NEXT:    vmsc.f bml3, bmh4, x7, x3, r1
+; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh3
+; CHECK-NEXT:    vconv.bf16.fp32 wl3, bmh7
+; CHECK-NEXT:    vst.conv.bf16.fp32 bml0, [p1], #32; vmsc.f bml4, bml2, x3, x5, r1
+; CHECK-NEXT:    vconv.bf16.fp32 wl5, bmh8; vmin_ge.bf16 x9, r16, x3, x1
+; CHECK-NEXT:    vst.conv.bf16.fp32 bml1, [p1], #32; vmax_lt.bf16 x3, r16, x9, x10
 ; CHECK-NEXT:  .L_LEnd0:
-; CHECK-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x5, r16, x5, x1; nopv
+; CHECK-NEXT:    nopb ; nopa ; vconv.bf16.fp32 wl7, bml3; nopx ; vmin_ge.bf16 x11, r16, x5, x1; nopv
 ; CHECK-NEXT:  // %bb.2:
-; CHECK-NEXT:    nopb ; nopa ; nops ; nopx ; vmov wh7, wl2; nopv
+; CHECK-NEXT:    nopa ; nopb ; nopxm
+; CHECK-NEXT:    vmov wh7, wl2
 ; CHECK-NEXT:    vconv.bf16.fp32 wl1, bml4; vmov wh1, wl2
 ; CHECK-NEXT:    vmov wh6, wl2; vmul.f bmh3, x7, x0, r1
-; CHECK-NEXT:    vmax_lt.bf16 x10, r16, x5, x10; vmul.f bmh2, x1, x0, r1
+; CHECK-NEXT:    vmax_lt.bf16 x10, r16, x11, x10; vmul.f bmh2, x1, x0, r1
 ; CHECK-NEXT:    vband x1, x8, x3
 ; CHECK-NEXT:    vband x8, x8, x10
 ; CHECK-NEXT:    vmov wh1, wl2; vsub.f bmh3, bmh3, bmh1, r0
diff --git a/llvm/test/CodeGen/AIE/aie2/llc-pipeline-aie2.ll b/llvm/test/CodeGen/AIE/aie2/llc-pipeline-aie2.ll
index 67d9fc16fa44..444c0b0bc709 100644
--- a/llvm/test/CodeGen/AIE/aie2/llc-pipeline-aie2.ll
+++ b/llvm/test/CodeGen/AIE/aie2/llc-pipeline-aie2.ll
@@ -212,6 +212,7 @@
 ; AIE-O123-NEXT:      AIE super-reg rewrite
 ; AIE-O123-NEXT:      Greedy Register Allocator
 ; AIE-O123-NEXT:      AIE waw-reg rewrite
+; AIE-O123-NEXT:      Greedy Register Allocator
 ; AIE-O123-NEXT:      Virtual Register Rewriter
 ; AIE-O123-NEXT:      Stack Slot Coloring
 ; AIE-O123-NEXT:      AIE 1D operands to 2D/3D rewriter
diff --git a/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_aggressive.mir b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_aggressive.mir
new file mode 100644
index 000000000000..4dbb982b1f0b
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_aggressive.mir
@@ -0,0 +1,132 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# NOTE: Example file for Write After Write Register Renaming in Loop test
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
+# Unit test for the WAW register renaming pass; checks edge cases.
+# RUN: llc -mtriple=aie2 -verify-machineinstrs --start-before=greedy --stop-after=virtregrewriter \
+# RUN:    --aie-aggressive-realloc %s -o - | FileCheck %s
+
+
+# This is a simplified example taken from the GEMM_bf16 kernel.
+# We want to make sure that live-through registers are de-allocated to make
+# space for loop-local registers when --aie-aggressive-realloc is passed.
+---
+name:            dealloc_live_through
+alignment:       16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: dealloc_live_through
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $m0, $p0, $r0, $d1_3d, $d2_3d
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $p1 = COPY $p0
+  ; CHECK-NEXT:   renamable $m3 = COPY $m0
+  ; CHECK-NEXT:   renamable $m4 = COPY $m0
+  ; CHECK-NEXT:   renamable $r1 = COPY $r0
+  ; CHECK-NEXT:   renamable $r2 = COPY $r0
+  ; CHECK-NEXT:   renamable $wl0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wh0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   VST_X_SPILL killed renamable $x0, %stack.0, implicit $sp :: (store (s512) into %stack.0, align 32)
+  ; CHECK-NEXT:   renamable $wl0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wh0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   VST_X_SPILL killed renamable $x0, %stack.1, implicit $sp :: (store (s512) into %stack.1, align 32)
+  ; CHECK-NEXT:   renamable $wl0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wh0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   VST_X_SPILL killed renamable $x0, %stack.2, implicit $sp :: (store (s512) into %stack.2, align 32)
+  ; CHECK-NEXT:   LoopStart $r0, 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $m0, $m3, $m4, $p0, $p1, $r0, $r1, $r2, $d1_3d:0x000000000003C870, $d2_3d:0x000000000003C870
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $wl0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wh0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wl2, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wh2, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $p1 = PADD_mod_pseudo killed renamable $p1, renamable $m0
+  ; CHECK-NEXT:   renamable $wl4, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wh4, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wl6, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   $wh6, $p1, $dc2, $dc6 = VLD_3D_pseudo killed $p1, $d2_3d
+  ; CHECK-NEXT:   renamable $x8 = VSHUFFLE renamable $x0, renamable $x4, renamable $r0
+  ; CHECK-NEXT:   renamable $x10 = VSHUFFLE killed renamable $x0, killed renamable $x4, renamable $r1
+  ; CHECK-NEXT:   renamable $x1 = VSHUFFLE renamable $x2, renamable $x6, renamable $r0
+  ; CHECK-NEXT:   renamable $x3 = VSHUFFLE killed renamable $x2, killed renamable $x6, renamable $r1
+  ; CHECK-NEXT:   renamable $wl5, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3
+  ; CHECK-NEXT:   renamable $wh5, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m4
+  ; CHECK-NEXT:   renamable $wl7, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3
+  ; CHECK-NEXT:   renamable $wh7, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m4
+  ; CHECK-NEXT:   renamable $wl9, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3
+  ; CHECK-NEXT:   renamable $wh9, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m4
+  ; CHECK-NEXT:   renamable $wl11, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3
+  ; CHECK-NEXT:   $wh11, $p0, $dc1, $dc5 = VLD_3D_pseudo killed $p0, $d1_3d
+  ; CHECK-NEXT:   renamable $x5 = VSHUFFLE killed renamable $x5, renamable $x5, renamable $r2
+  ; CHECK-NEXT:   renamable $x7 = VSHUFFLE killed renamable $x7, renamable $x7, renamable $r2
+  ; CHECK-NEXT:   renamable $x9 = VSHUFFLE killed renamable $x9, renamable $x9, renamable $r2
+  ; CHECK-NEXT:   renamable $x11 = VSHUFFLE killed renamable $x11, renamable $x11, renamable $r2
+  ; CHECK-NEXT:   PseudoLoopEnd <mcsymbol .L_1120>, %bb.1, implicit killed renamable $x8, implicit killed renamable $x10, implicit killed renamable $x1, implicit killed renamable $x3, implicit killed renamable $x5, implicit killed renamable $x7, implicit killed renamable $x9, implicit killed renamable $x11
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   renamable $x0 = VLDA_X_SPILL %stack.0, implicit $sp :: (load (s512) from %stack.0, align 32)
+  ; CHECK-NEXT:   renamable $x2 = VLDA_X_SPILL %stack.1, implicit $sp :: (load (s512) from %stack.1, align 32)
+  ; CHECK-NEXT:   renamable $x4 = VLDA_X_SPILL %stack.2, implicit $sp :: (load (s512) from %stack.2, align 32)
+  ; CHECK-NEXT:   PseudoRET implicit $lr, implicit killed renamable $x0, implicit killed renamable $x2, implicit killed renamable $x4
+  bb.0.entry:
+  successors: %bb.1
+  liveins: $r0, $p0, $m0, $d1_3d, $d2_3d
+  %598:ep_as_32bit = COPY $p0
+  %599:ep_as_32bit = COPY $p0
+  %7:em = COPY $m0
+  %13:em = COPY $m0
+  %14:em = COPY $m0
+  %582:eds = COPY $d1_3d
+  %583:eds = COPY $d2_3d
+  %108:er = COPY $r0
+  %110:er = COPY $r0
+  %20:er = COPY $r0
+  undef %10.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+  %10.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+  undef %11.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+  %11.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+  undef %12.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+  %12.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+  LoopStart $r0, 0
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    undef %338.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    %338.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    undef %346.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    %346.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    %599:ep_as_32bit = PADD_mod_pseudo %599, %7
+    undef %355.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    %355.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    undef %361.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    %361.sub_256_hi:vec512, %599:ep_as_32bit, %583.sub_dim_count:eds, %583.sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo %599, %583
+    %369:vec512 = VSHUFFLE %338, %355, %108
+    %370:vec512 = VSHUFFLE %338, %355, %110
+    %371:vec512 = VSHUFFLE %346, %361, %108
+    %372:vec512 = VSHUFFLE %346, %361, %110
+    undef %378.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13
+    %378.sub_256_hi:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %14
+    undef %386.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13
+    %386.sub_256_hi:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %14
+    undef %394.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13
+    %394.sub_256_hi:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %14
+    undef %402.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13
+    %402.sub_256_hi:vec512, %598:ep_as_32bit, %582.sub_dim_count:eds, %582.sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo %598, %582
+    %410:vec512 = VSHUFFLE %378, %378, %20
+    %411:vec512 = VSHUFFLE %386, %386, %20
+    %412:vec512 = VSHUFFLE %394, %394, %20
+    %413:vec512 = VSHUFFLE %402, %402, %20
+    PseudoLoopEnd <mcsymbol .L_1120>, %bb.1, implicit %369, implicit %370, implicit %371, implicit %372, implicit %410, implicit %411, implicit %412, implicit %413
+
+  bb.2:
+    PseudoRET implicit $lr, implicit %10, implicit %11, implicit %12
+...
diff --git a/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_gpr.mir b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_gpr.mir
new file mode 100644
index 000000000000..ea219fdcc073
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_gpr.mir
@@ -0,0 +1,55 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# NOTE: Example file for Write After Write Register Renaming in Loop test
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
+
+# RUN: llc -mtriple=aie2 -verify-machineinstrs --start-before=greedy --stop-after=virtregrewriter \
+# RUN:   --aie-gpr-realloc %s -o - | FileCheck %s
+
+
+# Check that general purpose registers can also be renamed with --aie-gpr-realloc.
+---
+name:            gpr_renaming
+alignment:       16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: gpr_renaming
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $r0, $r1, $r2, $r8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   LoopStart $r0, 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $r1, $r2, $r8
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $r0 = AND $r1, $r2
+  ; CHECK-NEXT:   renamable $r3 = AND $r1, $r8
+  ; CHECK-NEXT:   renamable $r4 = AND killed renamable $r0, renamable $r3
+  ; CHECK-NEXT:   dead renamable $r5 = AND killed renamable $r3, killed renamable $r4
+  ; CHECK-NEXT:   PseudoLoopEnd <mcsymbol .L_1120>, %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   PseudoRET implicit $lr
+  bb.0.entry:
+  successors: %bb.1
+  liveins: $r0, $r1, $r2, $r8
+  LoopStart $r0, 0
+  bb.1:
+    successors: %bb.1, %bb.2
+    liveins: $r1, $r2, $r8
+    %0:er = AND $r1, $r2
+    %1:er = AND $r1, $r8
+    %2:er = AND %0, %1
+    %3:er = AND %1, %2
+    PseudoLoopEnd <mcsymbol .L_1120>, %bb.1
+  bb.2:
+    PseudoRET implicit $lr
+...
+
diff --git a/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_loop.mir b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_loop.mir
index c8875a371847..9dadcec0609b 100644
--- a/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_loop.mir
+++ b/llvm/test/CodeGen/AIE/aie2/ra/waw_reg_renaming_loop.mir
@@ -5,7 +5,7 @@
 # See https://llvm.org/LICENSE.txt for license information.
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
-# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates
 # unit test for the WAW register renaming pass and check edge cases
 # RUN: llc -mtriple=aie2 -verify-machineinstrs --start-before=greedy --stop-after=virtregrewriter %s -o - | FileCheck %s
 
@@ -509,6 +509,110 @@ body:             |
     PseudoRET implicit $lr
 ...
 
+# This is a simplified example taken from the GEMM_bf16 kernel.
+# When we get to the last 4 VSHUFFLEs, all 12 X registers have already been
+# distributed. Those VSHUFFLEs represent kill points for other X registers.
+# We should then prefer "re-using" the killed registers instead of starting
+# anew from x0, x2, x4 and x6.
+---
+name:            def_kill_instrs
+alignment:       16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: def_kill_instrs
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $m0, $p0, $r0, $d1_3d, $d2_3d
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $p1 = COPY $p0
+  ; CHECK-NEXT:   renamable $m3 = COPY $m0
+  ; CHECK-NEXT:   renamable $m4 = COPY $m0
+  ; CHECK-NEXT:   renamable $r1 = COPY $r0
+  ; CHECK-NEXT:   renamable $r2 = COPY $r0
+  ; CHECK-NEXT:   LoopStart $r0, 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $m0, $m3, $m4, $p0, $p1, $r0, $r1, $r2, $d1_3d:0x000000000003C870, $d2_3d:0x000000000003C870
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $wl0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wh0, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wl2, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wh2, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $p1 = PADD_mod_pseudo killed renamable $p1, renamable $m0
+  ; CHECK-NEXT:   renamable $wl4, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wh4, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   renamable $wl6, renamable $p1 = VLD_pstm_imm_4x32_pseudo killed renamable $p1, 32
+  ; CHECK-NEXT:   $wh6, $p1, $dc2, $dc6 = VLD_3D_pseudo killed $p1, $d2_3d
+  ; CHECK-NEXT:   renamable $x8 = VSHUFFLE renamable $x0, renamable $x4, renamable $r0
+  ; CHECK-NEXT:   renamable $x10 = VSHUFFLE killed renamable $x0, killed renamable $x4, renamable $r1
+  ; CHECK-NEXT:   renamable $x1 = VSHUFFLE renamable $x2, renamable $x6, renamable $r0
+  ; CHECK-NEXT:   renamable $x3 = VSHUFFLE killed renamable $x2, killed renamable $x6, renamable $r1
+  ; CHECK-NEXT:   renamable $wl5, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3
+  ; CHECK-NEXT:   renamable $wh5, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m4
+  ; CHECK-NEXT:   renamable $wl7, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3
+  ; CHECK-NEXT:   renamable $wh7, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m4
+  ; CHECK-NEXT:   renamable $wl9, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3
+  ; CHECK-NEXT:   renamable $wh9, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m4
+  ; CHECK-NEXT:   renamable $wl11, renamable $p0 = VLD_pstm_pseudo killed renamable $p0, renamable $m3
+  ; CHECK-NEXT:   $wh11, $p0, $dc1, $dc5 = VLD_3D_pseudo killed $p0, $d1_3d
+  ; CHECK-NEXT:   renamable $x5 = VSHUFFLE killed renamable $x5, renamable $x5, renamable $r2
+  ; CHECK-NEXT:   renamable $x7 = VSHUFFLE killed renamable $x7, renamable $x7, renamable $r2
+  ; CHECK-NEXT:   renamable $x9 = VSHUFFLE killed renamable $x9, renamable $x9, renamable $r2
+  ; CHECK-NEXT:   renamable $x11 = VSHUFFLE killed renamable $x11, renamable $x11, renamable $r2
+  ; CHECK-NEXT:   PseudoLoopEnd <mcsymbol .L_1120>, %bb.1, implicit killed renamable $x8, implicit killed renamable $x10, implicit killed renamable $x1, implicit killed renamable $x3, implicit killed renamable $x5, implicit killed renamable $x7, implicit killed renamable $x9, implicit killed renamable $x11
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   PseudoRET implicit $lr
+  bb.0.entry:
+  successors: %bb.1
+  liveins: $r0, $p0, $m0, $d1_3d, $d2_3d
+  %598:ep_as_32bit = COPY $p0
+  %599:ep_as_32bit = COPY $p0
+  %7:em = COPY $m0
+  %13:em = COPY $m0
+  %14:em = COPY $m0
+  %582:eds = COPY $d1_3d
+  %583:eds = COPY $d2_3d
+  %108:er = COPY $r0
+  %110:er = COPY $r0
+  %20:er = COPY $r0
+  LoopStart $r0, 0
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    undef %338.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    %338.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    undef %346.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    %346.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    %599:ep_as_32bit = PADD_mod_pseudo %599, %7
+    undef %355.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    %355.sub_256_hi:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    undef %361.sub_256_lo:vec512, %599:ep_as_32bit = VLD_pstm_imm_4x32_pseudo %599, 32
+    %361.sub_256_hi:vec512, %599:ep_as_32bit, %583.sub_dim_count:eds, %583.sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo %599, %583
+    %369:vec512 = VSHUFFLE %338, %355, %108
+    %370:vec512 = VSHUFFLE %338, %355, %110
+    %371:vec512 = VSHUFFLE %346, %361, %108
+    %372:vec512 = VSHUFFLE %346, %361, %110
+    undef %378.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13
+    %378.sub_256_hi:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %14
+    undef %386.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13
+    %386.sub_256_hi:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %14
+    undef %394.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13
+    %394.sub_256_hi:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %14
+    undef %402.sub_256_lo:vec512, %598:ep_as_32bit = VLD_pstm_pseudo %598, %13
+    %402.sub_256_hi:vec512, %598:ep_as_32bit, %582.sub_dim_count:eds, %582.sub_hi_dim_then_sub_dim_count:eds = VLD_3D_pseudo %598, %582
+    %410:vec512 = VSHUFFLE %378, %378, %20
+    %411:vec512 = VSHUFFLE %386, %386, %20
+    %412:vec512 = VSHUFFLE %394, %394, %20
+    %413:vec512 = VSHUFFLE %402, %402, %20
+    PseudoLoopEnd <mcsymbol .L_1120>, %bb.1, implicit %369, implicit %370, implicit %371, implicit %372, implicit %410, implicit %411, implicit %412, implicit %413
+
+  bb.2:
+    PseudoRET implicit $lr
+...
+
 # Ignore renaming of general purpose registers.
 ---
 name:            gpr_replacement
diff --git a/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll b/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll
index 4acdfb6ebdbf..982e6fe361fd 100644
--- a/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/llc-pipeline-aie2p.ll
@@ -235,6 +235,8 @@
 ; AIE-O1-NEXT:      Greedy Register Allocator
 ; AIE-O1-NEXT:      AIE super-reg rewrite
 ; AIE-O1-NEXT:      Greedy Register Allocator
+; AIE-O1-NEXT:      AIE waw-reg rewrite
+; AIE-O1-NEXT:      Greedy Register Allocator
 ; AIE-O1-NEXT:      Virtual Register Rewriter
 ; AIE-O1-NEXT:      Stack Slot Coloring
 ; AIE-O1-NEXT:      AIE 1D operands to 2D/3D rewriter
@@ -441,6 +443,8 @@
 ; AIE-O23-NEXT:      Greedy Register Allocator
 ; AIE-O23-NEXT:      AIE super-reg rewrite
 ; AIE-O23-NEXT:      Greedy Register Allocator
+; AIE-O23-NEXT:      AIE waw-reg rewrite
+; AIE-O23-NEXT:      Greedy Register Allocator
 ; AIE-O23-NEXT:      Virtual Register Rewriter
 ; AIE-O23-NEXT:      Stack Slot Coloring
 ; AIE-O23-NEXT:      AIE 1D operands to 2D/3D rewriter
diff --git a/llvm/test/CodeGen/AIE/aie2p/ra/waw_reg_renaming.mir b/llvm/test/CodeGen/AIE/aie2p/ra/waw_reg_renaming.mir
new file mode 100644
index 000000000000..261cd827283c
--- /dev/null
+++ b/llvm/test/CodeGen/AIE/aie2p/ra/waw_reg_renaming.mir
@@ -0,0 +1,56 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# NOTE: Example file for Write After Write Register Renaming in Loop test
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates
+
+# Basic test for the WAW register renaming pass. Check AIE2 tests for more coverage.
+
+# RUN: llc -mtriple=aie2p -verify-machineinstrs --start-before=greedy --stop-after=virtregrewriter %s -o - | FileCheck %s
+
+# Make sure VLD and VMAX define different X registers.
+---
+name:            simple_waw_replacement
+alignment:       16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: simple_waw_replacement
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT:   liveins: $p0, $p1, $r0, $x0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   LoopStart $r0, 0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT:   liveins: $d0, $d2, $p0, $p1, $p2, $x0, $d1_3d
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   renamable $x2, renamable $p0 = VLDA_dmx_lda_x_pstm_nrm_imm killed renamable $p0, 64
+  ; CHECK-NEXT:   renamable $x4, dead renamable $r16 = VMAX_LT_32_vaddSign1 killed renamable $x2, renamable $x0, implicit $vaddsign1
+  ; CHECK-NEXT:   renamable $p1 = VST_dmx_sts_x_pstm_nrm_imm killed renamable $x4, killed renamable $p1, 64
+  ; CHECK-NEXT:   PseudoLoopEnd <mcsymbol .L_1120>, %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   PseudoRET implicit $lr
+  bb.0.entry:
+  successors: %bb.1
+  liveins: $r0, $p0, $p1, $x0
+  %0:ep = COPY $p0
+  %1:vec512 = COPY $x0
+  %2:ep = COPY $p1
+  LoopStart $r0, 0
+  bb.1:
+    successors: %bb.1, %bb.2
+    liveins: $p0, $p1, $p2, $d0, $d1_3d, $d2
+
+    %10:vec512, %0:ep = VLDA_dmx_lda_x_pstm_nrm_imm %0, 64
+    %11:vec512, %12:mr16_vcompare = VMAX_LT_32_vaddSign1 %10, %1, implicit $vaddsign1
+    %2:ep = VST_dmx_sts_x_pstm_nrm_imm %11, %2, 64
+    PseudoLoopEnd <mcsymbol .L_1120>, %bb.1
+  bb.2:
+    PseudoRET implicit $lr
+...
diff --git a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/end-to-end.ll b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/end-to-end.ll
index 8107e2e98e19..d684a5641161 100644
--- a/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/end-to-end.ll
+++ b/llvm/test/CodeGen/AIE/aie2p/schedule/postpipeliner/end-to-end.ll
@@ -20,27 +20,29 @@ define <32 x i16> @zol(i32 %n, ptr %p) {
 ; CHECK-NEXT:    add.nc lc, r0, #-7
 ; CHECK-NEXT:    movxm ls, #.LBB0_1
 ; CHECK-NEXT:    movxm le, #.L_LEnd0
-; CHECK-NEXT:    nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv
-; CHECK-NEXT:    nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv
-; CHECK-NEXT:    nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv
-; CHECK-NEXT:    nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv
-; CHECK-NEXT:    nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv
-; CHECK-NEXT:    nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv
-; CHECK-NEXT:    nopa ; vldb x2, [p0], #64; nops ; nopxm ; nopv
-; CHECK-NEXT:    // implicit-def: $x0
+; CHECK-NEXT:    nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv
+; CHECK-NEXT:    nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv
+; CHECK-NEXT:    nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv
+; CHECK-NEXT:    nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv
+; CHECK-NEXT:    nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv
+; CHECK-NEXT:    nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv
+; CHECK-NEXT:    nopa ; vldb x0, [p0], #64; nops ; nopxm ; nopv
+; CHECK-NEXT:    // implicit-def: $x2
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  .LBB0_1: // %for.body
 ; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:  .L_LEnd0:
-; CHECK-NEXT:    nopa ; vldb x2, [p0], #64; nops ; nopx ; vadd.16 x0, x2, x0; nopv
+; CHECK-NEXT:    nopa ; vldb x0, [p0], #64; nops ; nopx ; vadd.16 x2, x0, x2; nopv
 ; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
-; CHECK-NEXT:    nopa ; nopx ; vadd.16 x0, x2, x0
-; CHECK-NEXT:    vadd.16 x0, x2, x0
-; CHECK-NEXT:    vadd.16 x0, x2, x0
-; CHECK-NEXT:    vadd.16 x0, x2, x0
-; CHECK-NEXT:    vadd.16 x0, x2, x0
-; CHECK-NEXT:    vadd.16 x0, x2, x0
-; CHECK-NEXT:    vadd.16 x0, x2, x0
+; CHECK-NEXT:    vadd.16 x2, x0, x2
+; CHECK-NEXT:    vadd.16 x2, x0, x2
+; CHECK-NEXT:    vadd.16 x2, x0, x2
+; CHECK-NEXT:    vadd.16 x2, x0, x2
+; CHECK-NEXT:    vadd.16 x2, x0, x2
+; CHECK-NEXT:    vadd.16 x2, x0, x2
+; CHECK-NEXT:    vadd.16 x2, x0, x2
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    vmov x0, x2
 ; CHECK-NEXT:    ret lr
 ; CHECK-NEXT:    nop // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4