diff --git a/example/optsched-cfg/sched.ini b/example/optsched-cfg/sched.ini
index 07e9a626..d1c88a18 100644
--- a/example/optsched-cfg/sched.ini
+++ b/example/optsched-cfg/sched.ini
@@ -8,12 +8,38 @@ USE_OPT_SCHED YES
 # Same options as use optimal scheduling.
 PRINT_SPILL_COUNTS YES
 
+# Print clustering information.
+# YES
+# NO
+PRINT_CLUSTER YES
+
 # Use two pass scheduling approach.
 # First pass minimizes RP and second pass tries to balance RP and ILP.
 # YES
 # NO
 USE_TWO_PASS NO
 
+# Sets a limit for occupancy in the second ILP pass. We will not go below this
+# occupancy when attempting rescheduling.
+# Valid values: 1-10 (whole integers)
+MIN_OCCUPANCY_FOR_RESCHEDULE 3
+
+# Sets the required schedule length improvement percentage for the second ILP
+# pass. If we do not meet this minimum improvement, we do not keep the
+# lower occupancy schedules.
+# Valid values: 1-100 (whole integers)
+MIN_ILP_IMPROVEMENT 10
+
+# Allow enumerator to try to cluster memory operations together in the second
+# pass.
+# YES
+# NO
+CLUSTER_MEMORY_OPS NO
+
+# The weight for clustering. This factor determines the importance of
+# trying to find clusters when enumerating.
+CLUSTER_WEIGHT 1000
+
 # These 3 flags control which schedulers will be used.
 # Each one can be individually toggled. The heuristic
 # list scheduler or ACO must be run before the
@@ -85,7 +111,8 @@ HEURISTIC LUC_CP_NID
 ENUM_HEURISTIC LUC_CP_NID
 
 # The heuristic used for the enumerator in the second pass of the two-pass scheduling approach.
-# Same valid values as HEURISTIC.
+# Same valid values as HEURISTIC with an additional heuristic:
+# Cluster: Favor instructions that are part of an active memory clustering group.
 SECOND_PASS_ENUM_HEURISTIC LUC_CP_NID
 
 # The spill cost function to be used. Valid values are:
diff --git a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h
index 8eb1499d..6180e344 100644
--- a/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h
+++ b/include/opt-sched/Scheduler/OptSchedDDGWrapperBase.h
@@ -14,9 +14,12 @@ class OptSchedDDGWrapperBase {
 public:
   virtual ~OptSchedDDGWrapperBase() = default;
 
-  virtual void convertSUnits() = 0;
+  virtual void convertSUnits(bool IgnoreRealEdges,
+                             bool IgnoreArtificialEdges) = 0;
 
   virtual void convertRegFiles() = 0;
+
+  virtual int findPossibleClusters(bool IsLoad) = 0;
 };
 
 } // namespace opt_sched
diff --git a/include/opt-sched/Scheduler/bb_spill.h b/include/opt-sched/Scheduler/bb_spill.h
index 27e3cbed..ef536b85 100644
--- a/include/opt-sched/Scheduler/bb_spill.h
+++ b/include/opt-sched/Scheduler/bb_spill.h
@@ -12,8 +12,11 @@ Last Update: Apr. 2011
 #include "opt-sched/Scheduler/OptSchedTarget.h"
 #include "opt-sched/Scheduler/defines.h"
 #include "opt-sched/Scheduler/sched_region.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include <map>
+#include <memory>
 #include <set>
 #include <vector>
 
@@ -32,6 +35,91 @@ class BBWithSpill : public SchedRegion {
   InstCount crntSpillCost_;
   InstCount optmlSpillCost_;
 
+  int CurrentClusterCost;
+
+  /// Used to calculate the dynamic lower bound for clustering.
+  llvm::SmallVector<int, 4> ClusterCount;
+  llvm::SmallVector<int, 4> ClusterInstrRemainderCount;
+  int ClusterGroupCount;
+
+  void computeAndPrintClustering(InstSchedule *Sched) override;
+
+  /// Print the current clusters found so far in the schedule.
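+  /// (The output is written to LLVM's dbgs() stream.)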
+  void printCurrentClustering() override;
+
+  void initForClustering();
+
+  /// Calculate the lower bound cost for memory operations clustering and
+  /// return the lower bound cost. Does not take into account the clustering
+  /// weight.
+  int calculateClusterStaticLB();
+
+  /// Helper function for clustering to save the state of the current cluster.
+  void saveCluster(SchedInstruction *inst);
+
+  /// Helper function for clustering to start a new cluster.
+  void initCluster(SchedInstruction *inst);
+
+  /// Reset the active cluster to 0 (none).
+  void resetActiveCluster(SchedInstruction *inst);
+
+  /// Helper function to restore the previous cluster.
+  void restorePreviousCluster(SchedInstruction *inst);
+
+  bool isClusterFinished();
+
+  int calculateClusterDLB();
+
+  /// Current cluster size
+  unsigned int CurrentClusterSize;
+
+  /// The minimum number of cluster blocks possible.
+  int MinClusterBlocks;
+
+  /// The minimum number of cluster blocks + the optimistic expected cluster
+  /// blocks remaining.
+  int DynamicClusterLowerBound;
+
+  /// Current active cluster group.
+  int ClusterActiveGroup;
+
+  int StartCycle;
+
+  /// Data struct to contain information about the previous clusters
+  struct PastClusters {
+    /// The cluster group
+    int ClusterGroup;
+    /// Size of the cluster when it was ended by an instruction not in the
+    /// cluster
+    int ClusterSize;
+
+    /// Instruction number that ended this cluster. Used to check if we should
+    /// restore the cluster state when backtracking.
+    int InstNum;
+
+    int Start;
+
+    /// Contains the actual names of the instructions in the cluster. Only used
+    /// for printing and debugging purposes.
+    std::unique_ptr<llvm::SmallVector<llvm::StringRef, 4>> InstrList;
+
+    /// Constructor for this struct
+    PastClusters(int Cluster, int Size, int Instructions, int CycleStart)
+        : ClusterGroup(Cluster), ClusterSize(Size), InstNum(Instructions),
+          Start(CycleStart) {}
+  };
+
+  /// Vector containing the (n-1) past clusters
+  llvm::SmallVector<std::unique_ptr<PastClusters>, 4> PastClustersList;
+
+  /// Contains the actual names of the instructions in the current cluster.
+  /// Only used for printing and debugging purposes.
+  std::unique_ptr<llvm::SmallVector<llvm::StringRef, 4>> InstrList;
+
+  /// Pointer to the last cluster. This is kept out of the vector to avoid
+  /// having to fetch it every time we compare the current instruction
+  /// number to the one that ended the cluster.
+  std::unique_ptr<PastClusters> LastCluster;
 
   // The target machine
   const OptSchedTarget *OST;
@@ -103,7 +191,8 @@ class BBWithSpill : public SchedRegion {
   void InitForCostCmputtn_();
   InstCount CmputDynmcCost_();
 
-  void UpdateSpillInfoForSchdul_(SchedInstruction *inst, bool trackCnflcts);
+  void UpdateSpillInfoForSchdul_(SchedInstruction *inst, bool trackCnflcts,
+                                 int Start);
   void UpdateSpillInfoForUnSchdul_(SchedInstruction *inst);
   void SetupPhysRegs_();
   void CmputCrntSpillCost_();
diff --git a/include/opt-sched/Scheduler/data_dep.h b/include/opt-sched/Scheduler/data_dep.h
index 3ef48cab..5b021145 100644
--- a/include/opt-sched/Scheduler/data_dep.h
+++ b/include/opt-sched/Scheduler/data_dep.h
@@ -13,6 +13,7 @@ Last Update: Mar. 2011
 #include "opt-sched/Scheduler/buffers.h"
 #include "opt-sched/Scheduler/defines.h"
 #include "opt-sched/Scheduler/sched_basic_data.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include
 
@@ -291,7 +292,24 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase,
 
   RegisterFile *getRegFiles() { return RegFiles.get(); }
 
+  // Memory clustering helper functions
+  int getMinClusterCount() { return MinClusterCount; }
+  void setMinClusterCount(int Max) { MinClusterCount = Max; }
+  int getTotalInstructionsInAllClusters() {
+    return TotalInstructionsInAllClusters;
+  }
+  void setTotalInstructionsInAllClusters(int Max) {
+    TotalInstructionsInAllClusters = Max;
+  }
+  int getTotalInstructionsInCluster(int Cluster);
+
 protected:
+  int MinClusterCount;
+  int TotalInstructionsInAllClusters;
+  /// Map the cluster block to the total number of instructions found in the
+  /// block
+  MapVector<int, int> MaxInstructionsInEachClusters;
+
   // TODO(max): Get rid of this.
   // Number of basic blocks
   int32_t bscBlkCnt_;
@@ -391,7 +409,7 @@ class DataDepGraph : public llvm::opt_sched::OptSchedDDGWrapperBase,
                         InstCount fileUB, int blkNum);
   FUNC_RESULT FinishNode_(InstCount nodeNum, InstCount edgeCnt = -1);
   void CreateEdge_(InstCount frmInstNum, InstCount toInstNum, int ltncy,
-                   DependenceType depType);
+                   DependenceType depType, bool IsArtificial = false);
 
   FUNC_RESULT Finish_();
 
@@ -629,6 +647,9 @@ class InstSchedule {
   // The schedule's spill cost according to the cost function used
   InstCount spillCost_;
 
+  // The number of cluster blocks in the schedule
+  int ClusterSize;
+
   // An array of peak reg pressures for all reg types in the schedule
   InstCount *peakRegPressures_;
 
@@ -676,6 +697,8 @@ class InstSchedule {
   InstCount GetExecCost() const;
   void SetSpillCost(InstCount cost);
   InstCount GetSpillCost() const;
+  void setClusterSize(int size);
+  int getClusterSize() const;
 
   void ResetInstIter();
   InstCount GetFrstInst(InstCount &cycleNum, InstCount &slotNum);
@@ -699,6 +722,7 @@ class InstSchedule {
   void Print(std::ostream &out, char const *const title);
   void PrintInstList(FILE *file, DataDepGraph *dataDepGraph,
                      const char *title) const;
+  void Print(std::ostream &out, char const *const title, DataDepGraph *ddg);
   void PrintRegPressures() const;
   bool Verify(MachineModel *machMdl, DataDepGraph *dataDepGraph);
   void PrintClassData();
diff --git a/include/opt-sched/Scheduler/enumerator.h b/include/opt-sched/Scheduler/enumerator.h
index be2f376f..d165ddd0 100644
--- a/include/opt-sched/Scheduler/enumerator.h
+++ b/include/opt-sched/Scheduler/enumerator.h
@@ -153,6 +153,12 @@ class EnumTreeNode {
   InstCount peakSpillCost_;
   InstCount spillCostSum_;
   InstCount totalCost_ = -1;
+  int ClusterCost;
+  int ClusterActiveGroup;
+  int ClusterAbsorbCount;
+  int ClusterDLB;
+  int ClusterTotalCost = -1;
+  int ClusterBestCost;
   bool totalCostIsActualCost_ = false;
   ReserveSlot *rsrvSlots_;
 
@@ -276,6 +282,18 @@ class EnumTreeNode {
   inline void SetSpillCostSum(InstCount cost);
   inline InstCount GetSpillCostSum();
 
+  inline void setClusteringCost(int Cost);
+  inline int getClusteringCost();
+  inline void setCurClusteringGroup(int Group);
+  inline int getCurClusteringGroup();
+  inline void setClusterAbsorbCount(int Absorb);
+  inline int getClusterAbsorbCount();
+  inline void setClusterLwrBound(int ClusterDynamicLowerBound);
+  inline int getClusterLwrBound();
+  inline void setTotalClusterCost(int Cost);
+  inline int getTotalClusterCost();
+  inline bool isClustering();
+
   bool ChkInstRdndncy(SchedInstruction *inst, int brnchNum);
   bool IsNxtSlotStall();
 
@@ -317,6 +335,9 @@ class Enumerator : public ConstrainedScheduler {
   friend class HistEnumTreeNode;
   friend class CostHistEnumTreeNode;
 
+  // Should we cluster memory operations
+  bool Clustering;
+
   // TODO(max): Document.
   bool isCnstrctd_;
 
@@ -508,7 +529,7 @@ class Enumerator : public ConstrainedScheduler {
              InstCount schedUprBound, int16_t sigHashSize,
              SchedPriorities prirts, Pruning PruningStrategy,
              bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout,
-             InstCount preFxdInstCnt = 0,
+             bool ClusteringEnabled, InstCount preFxdInstCnt = 0,
              SchedInstruction *preFxdInsts[] = NULL);
   virtual ~Enumerator();
   virtual void Reset();
@@ -525,6 +546,8 @@ class Enumerator : public ConstrainedScheduler {
   // (Chris)
   inline bool IsSchedForRPOnly() const { return SchedForRPOnly_; }
 
+  inline bool isClustering() const { return Clustering; }
+
   // Calculates the schedule and returns it in the passed argument.
   FUNC_RESULT FindSchedule(InstSchedule *sched, SchedRegion *rgn) {
     return RES_ERROR;
@@ -586,6 +609,7 @@ class LengthCostEnumerator : public Enumerator {
   bool WasObjctvMet_();
   bool BackTrack_();
   InstCount GetBestCost_();
+  int GetBestClusterCost_();
   void CreateRootNode_();
 
   // Check if branching from the current node by scheduling this instruction
@@ -603,7 +627,7 @@ class LengthCostEnumerator : public Enumerator {
                       SchedPriorities prirts, Pruning PruningStrategy,
                       bool SchedForRPOnly, bool enblStallEnum,
                       Milliseconds timeout, SPILL_COST_FUNCTION spillCostFunc,
-                      InstCount preFxdInstCnt = 0,
+                      bool ClusteringEnabled, InstCount preFxdInstCnt = 0,
                       SchedInstruction *preFxdInsts[] = NULL);
   virtual ~LengthCostEnumerator();
   void Reset();
@@ -616,6 +640,7 @@ class LengthCostEnumerator : public Enumerator {
   bool IsCostEnum();
   SPILL_COST_FUNCTION GetSpillCostFunc() { return spillCostFunc_; }
   inline InstCount GetBestCost() { return GetBestCost_(); }
+  int getBestClusterCost() { return GetBestClusterCost_(); }
 };
 /*****************************************************************************/
 
@@ -851,6 +876,44 @@ void EnumTreeNode::SetSpillCostSum(InstCount cost) {
 InstCount EnumTreeNode::GetSpillCostSum() { return spillCostSum_; }
 /*****************************************************************************/
 
+void EnumTreeNode::setClusteringCost(int Cost) {
+  assert(Cost >= 0);
+  ClusterCost = Cost;
+}
+
+int EnumTreeNode::getClusteringCost() { return ClusterCost; }
+
+void EnumTreeNode::setCurClusteringGroup(int Group) {
+  assert(Group >= 0);
+  ClusterActiveGroup = Group;
+}
+
+int EnumTreeNode::getCurClusteringGroup() { return ClusterActiveGroup; }
+
+void EnumTreeNode::setClusterAbsorbCount(int Absorb) {
+  assert(Absorb >= 0);
+  ClusterAbsorbCount = Absorb;
+}
+
+int EnumTreeNode::getClusterAbsorbCount() { return ClusterAbsorbCount; }
+
+void EnumTreeNode::setClusterLwrBound(int ClusterDynamicLowerBound) {
+  assert(ClusterDynamicLowerBound >= 0);
+  ClusterDLB = ClusterDynamicLowerBound;
+}
+
+int EnumTreeNode::getClusterLwrBound() { return ClusterDLB; }
+
+void EnumTreeNode::setTotalClusterCost(int Cost) {
+  assert(Cost >= 0);
+  ClusterTotalCost = Cost;
+}
+
+int EnumTreeNode::getTotalClusterCost() { return ClusterTotalCost; }
+
+bool EnumTreeNode::isClustering() { return enumrtr_->isClustering(); }
+/*****************************************************************************/
+
 bool EnumTreeNode::IsNxtCycleNew_() {
   if (enumrtr_->issuRate_ == 1) {
     return true;
diff --git a/include/opt-sched/Scheduler/graph.h b/include/opt-sched/Scheduler/graph.h
index af8ba8f2..fea0576f 100644
--- a/include/opt-sched/Scheduler/graph.h
+++ b/include/opt-sched/Scheduler/graph.h
@@ -49,11 +49,15 @@ struct GraphEdge {
   UDT_GEDGES predOrder;
   // The second node's order in the first node's successor list.
   UDT_GEDGES succOrder;
+  // Whether the edge is an artificial dependence, i.e., one that is not
+  // required for correctness
+  bool IsArtificial;
 
   // Creates an edge between two nodes with labels label and label2.
   GraphEdge(GraphNode *from, GraphNode *to, UDT_GLABEL label,
-            UDT_GLABEL label2 = 0)
-      : from(from), to(to), label(label), label2(label2) {}
+            UDT_GLABEL label2 = 0, bool IsArtificial = false)
+      : from(from), to(to), label(label), label2(label2),
+        IsArtificial(IsArtificial) {}
 
   // Returns the node on the other side of the edge from the provided node.
   // Assumes that the argument is one of the nodes on the sides of the edge.
@@ -512,7 +516,7 @@ inline UDT_GEDGES GraphNode::GetRcrsvScsrCnt() const {
 }
 
 inline LinkedList<GraphEdge> *GraphNode::GetNghbrLst(DIRECTION dir) {
-  return dir == DIR_FRWRD ? scsrLst_ : prdcsrLst_;
+  return dir == DIR_FRWRD ? prdcsrLst_ : scsrLst_;
 }
 
 inline GraphEdge *GraphNode::GetFrstScsrEdge() {
diff --git a/include/opt-sched/Scheduler/hist_table.h b/include/opt-sched/Scheduler/hist_table.h
index 982c87a6..85f6592b 100644
--- a/include/opt-sched/Scheduler/hist_table.h
+++ b/include/opt-sched/Scheduler/hist_table.h
@@ -109,6 +109,10 @@ class CostHistEnumTreeNode : public HistEnumTreeNode {
   InstCount cost_;
   InstCount peakSpillCost_;
   InstCount spillCostSum_;
+  int ClusterCost;
+  int ClusterActiveGroup;
+  int ClusterAbsorbCount;
+  int ClusterTotalCost;
 
   // (Chris)
   InstCount totalCost_ = -1;
@@ -119,7 +123,6 @@ class CostHistEnumTreeNode : public HistEnumTreeNode {
 #ifdef IS_DEBUG
   bool costInfoSet_;
 #endif
-
   bool ChkCostDmntnForBBSpill_(EnumTreeNode *node, Enumerator *enumrtr);
   bool ChkCostDmntn_(EnumTreeNode *node, Enumerator *enumrtr,
                      InstCount &maxShft);
diff --git a/include/opt-sched/Scheduler/lnkd_lst.h b/include/opt-sched/Scheduler/lnkd_lst.h
index 8861d843..537de59a 100644
--- a/include/opt-sched/Scheduler/lnkd_lst.h
+++ b/include/opt-sched/Scheduler/lnkd_lst.h
@@ -573,43 +573,66 @@ inline T *PriorityList<T, K>::GetNxtPriorityElmnt(K &key) {
   }
 }
 
+//(Vlad) Added functionality to decrease an entry's priority, used for
+// lowering the priority of clusterable instructions when leaving a cluster.
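+// A minimal usage sketch (illustrative, not part of this patch): when the
+// enumerator leaves a cluster, the ready list recomputes each clusterable
+// instruction's key and calls BoostEntry with the new, now smaller key; the
+// second branch below then moves the entry down the list instead of tripping
+// the old boost-only assert.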
 template <class T, class K>
 void PriorityList<T, K>::BoostEntry(KeyedEntry<T, K> *entry, K newKey) {
   KeyedEntry<T, K> *crnt;
   KeyedEntry<T, K> *next = entry->GetNext();
   KeyedEntry<T, K> *prev = entry->GetPrev();
 
-  assert(newKey > entry->key);
   assert(LinkedList<T>::topEntry_ != NULL);
 
-  entry->key = newKey;
+  if (entry->key < newKey) // behave normally
+  {
+    entry->key = newKey;
 
-  // If it is already at the top, or its previous still has a larger key,
-  // then the entry is already in place and no boosting is needed
-  if (entry == LinkedList<T>::topEntry_ || prev->key >= newKey)
-    return;
+    // If it is already at the top, or its previous still has a larger key,
+    // then the entry is already in place and no boosting is needed
+    if (entry == LinkedList<T>::topEntry_ || prev->key >= newKey)
+      return;
 
-  prev = NULL;
+    prev = NULL;
 
-  for (crnt = entry->GetPrev(); crnt != NULL; crnt = crnt->GetPrev()) {
-    if (crnt->key >= newKey) {
-      assert(crnt != entry);
-      assert(crnt != entry->GetPrev());
-      prev = crnt;
-      break;
+    for (crnt = entry->GetPrev(); crnt != NULL; crnt = crnt->GetPrev()) {
+      if (crnt->key >= newKey) {
+        assert(crnt != entry);
+        assert(crnt != entry->GetPrev());
+        prev = crnt;
+        break;
+      }
     }
-  }
 
-  if (prev == NULL) {
-    next = (KeyedEntry<T, K> *)LinkedList<T>::topEntry_;
-  } else {
-    next = prev->GetNext();
-    assert(next != NULL);
-  }
+    if (prev == NULL) {
+      next = (KeyedEntry<T, K> *)LinkedList<T>::topEntry_;
+    } else {
+      next = prev->GetNext();
+      assert(next != NULL);
+    }
 
-  assert(next != entry->GetNext());
-  LinkedList<T>::RmvEntry_(entry, false);
-  InsrtEntry_(entry, next);
+    assert(next != entry->GetNext());
+    LinkedList<T>::RmvEntry_(entry, false);
+    InsrtEntry_(entry, next);
+  } else // move entry down on priority list
+  {
+    entry->key = newKey;
+
+    // if it is at the bottom or next entry still has a smaller key,
+    // then the entry is already in place
+    if (entry == LinkedList<T>::bottomEntry_ || next->key <= newKey)
+      return;
+
+    for (crnt = entry->GetNext(); crnt != NULL; crnt = crnt->GetNext()) {
+      if (crnt->key <= newKey) {
+        next = crnt;
+        break;
+      }
+    }
+
+    LinkedList<T>::RmvEntry_(entry, false);
+    InsrtEntry_(entry, next);
+  }
 
   this->itrtrReset_ = true;
 }
diff --git a/include/opt-sched/Scheduler/ready_list.h b/include/opt-sched/Scheduler/ready_list.h
index 3c7bb1a6..054b19f1 100644
--- a/include/opt-sched/Scheduler/ready_list.h
+++ b/include/opt-sched/Scheduler/ready_list.h
@@ -115,6 +115,7 @@ class ReadyList {
   int16_t ltncySumBits_;
   int16_t nodeID_Bits_;
   int16_t inptSchedOrderBits_;
+  int16_t ClusterBit;
 
   // Constructs the priority-list key based on the schemes listed in prirts_.
   unsigned long CmputKey_(SchedInstruction *inst, bool isUpdate, bool &changed);
diff --git a/include/opt-sched/Scheduler/sched_basic_data.h b/include/opt-sched/Scheduler/sched_basic_data.h
index c177c77f..46117e9e 100644
--- a/include/opt-sched/Scheduler/sched_basic_data.h
+++ b/include/opt-sched/Scheduler/sched_basic_data.h
@@ -8,14 +8,11 @@ Last Update: Sept. 2013
 #ifndef OPTSCHED_BASIC_SCHED_BASIC_DATA_H
 #define OPTSCHED_BASIC_SCHED_BASIC_DATA_H
 
-// For class string.
-#include <string>
-// For class ostream.
 #include "opt-sched/Scheduler/defines.h"
 #include "opt-sched/Scheduler/graph.h"
 #include "opt-sched/Scheduler/hash_table.h"
 #include "opt-sched/Scheduler/machine_model.h"
-#include <iostream>
+#include <string>
 
 namespace llvm {
 namespace opt_sched {
@@ -51,7 +48,11 @@ enum LISTSCHED_HEURISTIC {
   LSH_LS = 7,
 
   // LLVM list scheduler order
-  LSH_LLVM = 8
+  LSH_LLVM = 8,
+
+  // Dynamic memory clustering heuristic: favor instructions that are part of
+  // an active cluster
+  LSH_CLUSTER = 9
 };
 
 #define MAX_SCHED_PRIRTS 10
@@ -204,12 +205,14 @@ class SchedInstruction : public GraphNode {
   // depType: the type of dependence between this node and the successor.
   SchedInstruction *GetFrstScsr(InstCount *prdcsrNum = NULL,
                                 UDT_GLABEL *ltncy = NULL,
-                                DependenceType *depType = NULL);
+                                DependenceType *depType = NULL,
+                                bool *IsArtificial = nullptr);
   // Returns the next successor of this instruction node and moves the
   // successor iterator forward. Fills parameters as above.
   SchedInstruction *GetNxtScsr(InstCount *prdcsrNum = NULL,
                                UDT_GLABEL *ltncy = NULL,
-                               DependenceType *depType = NULL);
+                               DependenceType *depType = NULL,
+                               bool *IsArtificial = nullptr);
 
   // Returns the last successor of this instruction node and moves the
   // successor iterator to the end of the list. If prdcsrNum is provided, this
@@ -414,6 +417,15 @@ class SchedInstruction : public GraphNode {
 
   InstType GetCrtclPathFrmRoot() { return crtclPathFrmRoot_; }
 
+  /// Set MayCluster to true if clustering memory operations was found
+  /// to be possible.
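+  /// A ClusteringGroup value of 0 means the instruction belongs to no
+  /// cluster; in that case the setter leaves the flags unchanged.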
+  void SetMayCluster(int ClusteringGroup);
+  bool GetMayCluster() { return MayCluster; }
+  int GetClusterGroup() { return ClusterGroup; }
+  static int GetActiveCluster() { return ActiveCluster; }
+  static void SetActiveCluster(int Active) { ActiveCluster = Active; }
+  bool getWasActive() { return WasActive; }
+  bool computeWasActive();
 
   friend class SchedRange;
 
 protected:
   string name_;
   // The mnemonic of this instruction, e.g. "add" or "jmp".
   string opCode_;
+
+  bool WasActive;
+
+  /// The cluster group that the current instruction is a part of.
+  /// Default of 0 means that it is not part of any cluster.
+  int ClusterGroup;
+  /// This value should be set to true if clustering may be possible.
+  bool MayCluster;
+  /// Currently active cluster. Used for ready list.
+  static int ActiveCluster;
   // A numerical ID for this instruction.
   int nodeID_;
   // The type of this instruction.
diff --git a/include/opt-sched/Scheduler/sched_region.h b/include/opt-sched/Scheduler/sched_region.h
index de36f85b..2685b7d0 100644
--- a/include/opt-sched/Scheduler/sched_region.h
+++ b/include/opt-sched/Scheduler/sched_region.h
@@ -52,12 +52,18 @@ class SchedRegion {
   // Destroys the region. Must be overridden by child classes.
   virtual ~SchedRegion() {}
 
+  bool PrintClustering;
+  bool TwoPassEnabled;
+  virtual void computeAndPrintClustering(InstSchedule *Sched) = 0;
+
+  virtual void printCurrentClustering() = 0;
   // Returns the dependence graph of this region.
   inline DataDepGraph *GetDepGraph() { return dataDepGraph_; }
   // Returns the lower bound on the cost of this region.
   inline int GetCostLwrBound() { return costLwrBound_; }
   // Returns the best cost found so far for this region.
   inline InstCount GetBestCost() { return bestCost_; }
+  inline int getBestClusterCost() { return BestClusterCost; }
   // Returns a pointer to the list scheduler heuristics.
   inline SchedPriorities GetHeuristicPriorities() { return hurstcPrirts_; }
   // Get the number of simulated spills code added for this block.
@@ -107,6 +113,9 @@ class SchedRegion {
   // Initialize variables for the second pass of the two-pass-optsched
   void InitSecondPass();
 
+  bool enumFoundSchedule() { return EnumFoundSchedule; }
+  void setEnumFoundSchedule() { EnumFoundSchedule = true; }
+
 private:
   // The algorithm to use for calculating lower bounds.
   LB_ALG lbAlg_;
@@ -127,11 +136,14 @@ class SchedRegion {
   // Used for two-pass-optsched to enable second pass functionalities.
   bool isSecondPass_;
 
+  bool EnumFoundSchedule;
+
   // The absolute cost lower bound to be used as a ref for normalized costs.
   InstCount costLwrBound_ = 0;
-
+
   // The best results found so far.
   InstCount bestCost_;
+  int BestClusterCost;
   InstCount bestSchedLngth_;
 
   // (Chris): The cost function. Defaults to PERP.
@@ -160,6 +172,11 @@ class SchedRegion {
   InstSchedule *enumBestSched_;
   // The best schedule found so far (may be heuristic or enumerator generated)
   InstSchedule *bestSched_;
+  /// Flag to enable or disable clustering memory operations in the ILP pass.
+  /// Read from the sched.ini file, which sets the flag accordingly.
+  bool ClusterMemoryOperations;
+  /// The weight for memory ops clustering.
+  int ClusteringWeight;
 
   // TODO(max): Document.
   InstCount schedLwrBound_;
 
@@ -180,9 +197,13 @@ class SchedRegion {
 
   void SetBestCost(InstCount bestCost) { bestCost_ = bestCost; }
 
-  void SetBestSchedLength(InstCount bestSchedLngth) { bestSchedLngth_ = bestSchedLngth; }
+  void setBestClusterCost(int BestCost) { BestClusterCost = BestCost; }
+
+  void SetBestSchedLength(InstCount bestSchedLngth) {
+    bestSchedLngth_ = bestSchedLngth;
+  }
 
-  const SchedPriorities& GetEnumPriorities() const { return enumPrirts_; }
+  const SchedPriorities &GetEnumPriorities() const { return enumPrirts_; }
 
   int16_t GetSigHashSize() const { return sigHashSize_; }
 
diff --git a/lib/Scheduler/bb_spill.cpp b/lib/Scheduler/bb_spill.cpp
index 4acd4903..ee817ec7 100644
--- a/lib/Scheduler/bb_spill.cpp
+++ b/lib/Scheduler/bb_spill.cpp
@@ -25,7 +25,10 @@ extern bool OPTSCHED_gPrintSpills;
 using namespace llvm::opt_sched;
 
 // The denominator used when calculating cost weight.
-static const int COST_WGHT_BASE = 10;
+static const int COST_WGHT_BASE = 100;
+
+// The max number of instructions in a cluster
+static const unsigned MAX_INSTR_IN_CLUSTER = 15;
 
 BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph,
                          long rgnNum, int16_t sigHashSize, LB_ALG lbAlg,
@@ -67,9 +70,36 @@ BBWithSpill::BBWithSpill(const OptSchedTarget *OST_, DataDepGraph *dataDepGraph,
   schduldEntryInstCnt_ = 0;
   schduldExitInstCnt_ = 0;
   schduldInstCnt_ = 0;
+  ClusterGroupCount = dataDepGraph_->getMinClusterCount();
+  MinClusterBlocks = 0;
+// if (ClusterMemoryOperations && ClusterGroupCount > 0) {
+  if (ClusterGroupCount > 0) {
+    ClusterCount.resize(ClusterGroupCount + 1);
+    ClusterInstrRemainderCount.resize(ClusterGroupCount + 1);
+    MinClusterBlocks = calculateClusterStaticLB();
+    initForClustering();
+  }
 }
 /****************************************************************************/
 
+void BBWithSpill::initForClustering() {
+  // Memory clustering variables initialization
+  SchedInstruction::SetActiveCluster(0);
+  CurrentClusterSize = 0;
+  ClusterActiveGroup = 0;
+  CurrentClusterCost = 0;
+  PastClustersList.clear();
+  LastCluster.reset();
+  InstrList.reset();
+  DynamicClusterLowerBound = 0;
+
+  for (int begin = 1; begin <= ClusterGroupCount; begin++) {
+    ClusterCount[begin] = 0;
+    ClusterInstrRemainderCount[begin] =
+        dataDepGraph_->getTotalInstructionsInCluster(begin);
+  }
+}
+
 BBWithSpill::~BBWithSpill() {
   if (enumrtr_ != NULL) {
     delete enumrtr_;
@@ -82,6 +112,26 @@ BBWithSpill::~BBWithSpill() {
 }
 /*****************************************************************************/
 
+int BBWithSpill::calculateClusterStaticLB() {
+  // No cluster in this scheduling region
+  if (ClusterGroupCount == 0)
+    return 0;
+
+  // Calculate the minimum number of cluster blocks needed to cluster all of
+  // the instructions. The maximum number of instructions in a cluster block
+  // is determined by the constant MAX_INSTR_IN_CLUSTER.
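+  // Illustrative example (values assumed): with MAX_INSTR_IN_CLUSTER = 15, a
+  // group of 32 clusterable instructions needs at least ceil(32 / 15) = 3
+  // cluster blocks.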
+  int ClusterCost = 0;
+  for (int begin = 1; begin <= ClusterGroupCount; begin++) {
+    int InstructionCount = dataDepGraph_->getTotalInstructionsInCluster(begin);
+    int CurrentClusterCost =
+        std::ceil(double(InstructionCount) / MAX_INSTR_IN_CLUSTER);
+    Logger::Info("Cost for block %d is %d", begin, CurrentClusterCost);
+    ClusterCost += CurrentClusterCost;
+  }
+
+  return ClusterCost;
+}
+
 bool BBWithSpill::EnableEnum_() {
   return true;
   /*
@@ -305,6 +355,11 @@ InstCount BBWithSpill::CmputCostLwrBound() {
   InstCount staticLowerBound =
       schedLwrBound_ * schedCostFactor_ + spillCostLwrBound * SCW_;
 
+  // Add the minimum of the possible clusters to the lower bound
+  if (IsSecondPass() && ClusterMemoryOperations) {
+    staticLowerBound += MinClusterBlocks * ClusteringWeight;
+  }
+
 #if defined(IS_DEBUG_STATIC_LOWER_BOUND)
   Logger::Info(
       "DAG %s spillCostLB %d scFactor %d lengthLB %d lenFactor %d staticLB %d",
@@ -326,6 +381,9 @@ void BBWithSpill::InitForSchdulng() {
 /*****************************************************************************/
 
 void BBWithSpill::InitForCostCmputtn_() {
+  if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled))
+    initForClustering();
+
   int i;
 
   crntCycleNum_ = 0;
@@ -376,8 +434,23 @@ InstCount BBWithSpill::CmputNormCost_(InstSchedule *sched,
 
 InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode,
                                   InstCount &execCost, bool trackCnflcts) {
+
+  InstCount instNum;
+  InstCount cycleNum;
+  InstCount slotNum;
+  SchedInstruction *inst;
+
   if (compMode == CCM_STTC) {
-    if (GetSpillCostFunc() == SCF_SPILLS) {
+    if (GetSpillCostFunc() != SCF_SPILLS) {
+      InitForCostCmputtn_();
+
+      for (instNum = sched->GetFrstInst(cycleNum, slotNum);
+           instNum != INVALID_VALUE;
+           instNum = sched->GetNxtInst(cycleNum, slotNum)) {
+        inst = dataDepGraph_->GetInstByIndx(instNum);
+        SchdulInst(inst, cycleNum, slotNum, trackCnflcts);
+      }
+    } else {
       LocalRegAlloc regAlloc(sched, dataDepGraph_);
       regAlloc.SetupForRegAlloc();
       regAlloc.AllocRegs();
@@ -389,6 +462,13 @@ InstCount BBWithSpill::CmputCost_(InstSchedule *sched, COST_COMP_MODE compMode,
   InstCount cost = sched->GetCrntLngth() * schedCostFactor_;
   execCost = cost;
   cost += crntSpillCost_ * SCW_;
+  // Add the current clustering cost
+  if (IsSecondPass() && ClusterMemoryOperations) {
+    cost += CurrentClusterCost * ClusteringWeight;
+    assert(calculateClusterDLB() == CurrentClusterCost);
+    sched->setClusterSize(CurrentClusterCost);
+  }
+
   sched->SetSpillCosts(spillCosts_);
   sched->SetPeakRegPressures(peakRegPressures_);
   sched->SetSpillCost(crntSpillCost_);
@@ -421,8 +501,108 @@ void BBWithSpill::CmputCrntSpillCost_() {
 }
 /*****************************************************************************/
 
+void BBWithSpill::computeAndPrintClustering(InstSchedule *Sched) {
+  InstCount instNum;
+  InstCount cycleNum;
+  InstCount slotNum;
+  SchedInstruction *inst;
+  bool temp = ClusterMemoryOperations;
+
+  ClusterMemoryOperations = true;
+  InitForCostCmputtn_();
+  for (instNum = Sched->GetFrstInst(cycleNum, slotNum);
+       instNum != INVALID_VALUE;
+       instNum = Sched->GetNxtInst(cycleNum, slotNum)) {
+    inst = dataDepGraph_->GetInstByIndx(instNum);
+    SchdulInst(inst, cycleNum, slotNum, false);
+  }
+  printCurrentClustering();
+  ClusterMemoryOperations = temp;
+}
+
+void BBWithSpill::saveCluster(SchedInstruction *inst) {
+  if (LastCluster)
+    // Save previous clusters in a vector except the last cluster
+    // that we just exited out of.
+    PastClustersList.push_back(std::move(LastCluster));
+
+  // Last cluster that we just exited out of, used for fast accessing
+  // to its contents.
+  LastCluster = llvm::make_unique<PastClusters>(
+      ClusterActiveGroup, CurrentClusterSize, inst->GetNum(), StartCycle);
+
+  LastCluster->InstrList = std::move(InstrList);
+}
+
+void BBWithSpill::initCluster(SchedInstruction *inst) {
+  ClusterActiveGroup = inst->GetClusterGroup();
+  inst->SetActiveCluster(ClusterActiveGroup);
+  CurrentClusterSize = 1;
+  ClusterInstrRemainderCount[ClusterActiveGroup]--;
+  InstrList = llvm::make_unique<llvm::SmallVector<llvm::StringRef, 4>>();
+  InstrList->push_back(inst->GetName());
+  ClusterCount[ClusterActiveGroup]++;
+  CurrentClusterCost++;
+}
+
+void BBWithSpill::resetActiveCluster(SchedInstruction *inst) {
+  ClusterActiveGroup = 0;
+  inst->SetActiveCluster(0);
+  CurrentClusterSize = 0;
+}
+
+void BBWithSpill::restorePreviousCluster(SchedInstruction *inst) {
+  CurrentClusterSize = LastCluster->ClusterSize;
+  ClusterActiveGroup = LastCluster->ClusterGroup;
+  StartCycle = LastCluster->Start;
+  inst->SetActiveCluster(ClusterActiveGroup);
+  InstrList = std::move(LastCluster->InstrList);
+  LastCluster.reset(); // Release current cluster pointer
+
+  // Get previous cluster from vector list
+  if (!PastClustersList.empty()) {
+    LastCluster = std::move(PastClustersList.back());
+    PastClustersList.pop_back();
+  }
+}
+
+bool BBWithSpill::isClusterFinished() {
+  assert(ClusterActiveGroup != 0);
+  if (ClusterInstrRemainderCount[ClusterActiveGroup] == 0 ||
+      CurrentClusterSize == MAX_INSTR_IN_CLUSTER) {
+    return true;
+  }
+  return false;
+}
+
+int BBWithSpill::calculateClusterDLB() {
+  int OptimisticLowerBound = 0;
+
+  for (int begin = 1; begin <= ClusterGroupCount; begin++) {
+    if (begin != ClusterActiveGroup)
+      OptimisticLowerBound += std::ceil(
+          double(ClusterInstrRemainderCount[begin]) / MAX_INSTR_IN_CLUSTER);
+    else {
+      // The number of instructions remaining that the current open cluster
+      // can add
+      int AbsorbCount = MAX_INSTR_IN_CLUSTER - CurrentClusterSize;
+      // Assume the current open cluster can absorb up to the maximum number
+      // of instructions that a cluster can contain.
+      int Remainder = ClusterInstrRemainderCount[begin] - AbsorbCount;
+      // If the remainder is negative then the open cluster can
+      // absorb all of the remaining instructions.
+      if (Remainder < 0)
+        Remainder = 0;
+      // Estimate the optimistic dynamic lower bound for the current cluster
+      OptimisticLowerBound +=
+          std::ceil(double(Remainder) / MAX_INSTR_IN_CLUSTER);
+    }
+  }
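+  // Illustrative example (values assumed): with MAX_INSTR_IN_CLUSTER = 15, an
+  // open cluster of size 10 whose group has 8 instructions left can absorb
+  // 15 - 10 = 5 of them, leaving ceil(3 / 15) = 1 extra block; a second group
+  // with 20 instructions left needs ceil(20 / 15) = 2 blocks, so the bound
+  // below evaluates to CurrentClusterCost + 3.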
+  return CurrentClusterCost + OptimisticLowerBound;
+}
+
 void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst,
-                                            bool trackCnflcts) {
+                                            bool trackCnflcts, int Start) {
   int16_t regType;
   int defCnt, useCnt, regNum, physRegNum;
   Register **defs, **uses;
@@ -430,6 +610,72 @@ void BBWithSpill::UpdateSpillInfoForSchdul_(SchedInstruction *inst,
   int liveRegs;
   InstCount newSpillCost;
 
+  // Conditions for creating a new cluster block:
+  // 1.) A block was ended before it reached MAX_INSTR_IN_CLUSTER and there
+  //     are remaining instructions in its group
+
+  // Conditions for removing a cluster block:
+  // 1.) The block is not at MAX_INSTR_IN_CLUSTER and there are remaining
+  //     instructions in its group
+
+  // Scheduling cases for the clustering project:
+  // 1.) Same Cluster -> Same Cluster
+  //     If size == MAX_INSTR_IN_CLUSTER
+  //       Save cluster to restore
+  //       Set active to 0
+  // 2.) Cluster -> Different Cluster
+  // 3.) Non-Cluster -> Cluster
+  // 4.) Cluster -> Non-Cluster
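+  // Illustrative walk-through (assumed sequence): scheduling A (group 1),
+  // B (group 1), C (group 2), then D (no group) hits case 3 at A (open a
+  // cluster for group 1), case 1 at B (grow it), case 2 at C (save the
+  // group 1 cluster, open one for group 2), and case 4 at D (save and close
+  // the group 2 cluster).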
+
+  // Possibly keep track of the current memory clustering size here
+  // and in UpdateSpillInfoForUnSchdul_()
+  if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) {
+    // Check if the current instruction is part of a cluster
+    if (inst->GetMayCluster()) {
+      // Check if there is a currently active cluster. ClusterActiveGroup == 0
+      // indicates that there is no active clustering, while
+      // ClusterActiveGroup != 0 indicates that there is.
+      if (ClusterActiveGroup != 0) {
+        // Check if the instruction is in the same cluster group as the active
+        // cluster
+        if (ClusterActiveGroup == inst->GetClusterGroup()) {
+          // Case 1: Simple case where the current instruction is part of an
+          // already active cluster.
+          CurrentClusterSize++;
+          ClusterInstrRemainderCount[ClusterActiveGroup]--;
+          InstrList->push_back(inst->GetName());
+
+          // If we reach the max size for this cluster then save the cluster
+          // and reset.
+          if (isClusterFinished()) {
+            saveCluster(inst);
+            resetActiveCluster(inst);
+          }
+        } else {
+          // Case 2: Else the instruction is part of a different cluster that
+          // is not currently active. Store the information of the old cluster
+          // group and start clustering for the new cluster.
+          saveCluster(inst);
+
+          // Finish setting up the new cluster
+          initCluster(inst);
+          StartCycle = Start;
+        }
+      } else {
+        // Case 3: Not currently clustering. Initialize clustering
+        initCluster(inst);
+        StartCycle = Start;
+      }
+    } else if (ClusterActiveGroup != 0) {
+      // Case 4: Exiting out of an active cluster
+      // Save the cluster to restore when backtracking.
+      saveCluster(inst);
+
+      // Reset active cluster
+      resetActiveCluster(inst);
+    }
+  }
+
   defCnt = inst->GetDefs(defs);
   useCnt = inst->GetUses(uses);
 
@@ -621,6 +867,56 @@ void BBWithSpill::UpdateSpillInfoForUnSchdul_(SchedInstruction *inst)
                inst->GetNum());
 #endif
 
+  // Backtracking cases for the clustering project:
+  // 1.) Same Cluster <- Same Cluster
+  // 2.) Non-Cluster <- Cluster
+  // 3.) Different Cluster <- Cluster
+  // 4.) Cluster <- Non-Cluster
+  if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) {
+    // If the instruction we are backtracking from is part of a cluster
+    if (inst->GetMayCluster()) {
+      if (CurrentClusterSize != 0) {
+        // Cases 1, 2, and 3
+        // Reduce the cluster size
+        CurrentClusterSize--;
+        ClusterInstrRemainderCount[ClusterActiveGroup]++;
+        // Remove the instruction's name from the list
+        InstrList->pop_back();
+
+        // Case 2: If there are no more instructions in the currently active
+        // cluster then it indicates that we backtracked out of a cluster.
+        if (CurrentClusterSize == 0) {
+          ClusterCount[ClusterActiveGroup]--;
+          assert(ClusterCount[ClusterActiveGroup] >= 0);
+          CurrentClusterCost--;
+          // Set active cluster to none.
+          resetActiveCluster(inst);
+
+          // Case 3: Check if this instruction ended another cluster
+          if (LastCluster && LastCluster->InstNum == inst->GetNum()) {
+            // If so, then we need to restore the state of the previous cluster
+            restorePreviousCluster(inst);
+          }
+        }
+      }
+      // A cluster size of 0 while the instruction may cluster indicates that
+      // the current instruction is at the end of a finished cluster
+      else {
+        assert(inst->GetNum() == LastCluster->InstNum);
+        restorePreviousCluster(inst);
+
+        CurrentClusterSize--;
+        ClusterInstrRemainderCount[ClusterActiveGroup]++;
+        // Remove the instruction's name from the list
+        InstrList->pop_back();
+      }
+    } else if (LastCluster && LastCluster->InstNum == inst->GetNum()) {
+      // Case 4: If there was a previous cluster and this instruction
+      // ended the cluster then restore the previous cluster's state
+      restorePreviousCluster(inst);
+    }
+  }
+
   defCnt = inst->GetDefs(defs);
   useCnt = inst->GetUses(uses);
 
@@ -728,7 +1024,7 @@ void BBWithSpill::SchdulInst(SchedInstruction *inst, InstCount cycleNum,
   if (inst == NULL)
     return;
   assert(inst != NULL);
-  UpdateSpillInfoForSchdul_(inst, trackCnflcts);
+  UpdateSpillInfoForSchdul_(inst, trackCnflcts, crntCycleNum_);
 }
 /*****************************************************************************/
 
@@ -764,7 +1060,7 @@ void BBWithSpill::FinishHurstc_() {
 
 void BBWithSpill::FinishOptml_() {
 #ifdef IS_DEBUG_BBSPILL_COST
-  stats::traceOptimalCost.Record(bestCost_);
+  stats::traceOptimalCost.Record(GetBestCost());
   stats::traceOptimalScheduleLength.Record(bestSchedLngth_);
 #endif
 }
@@ -772,6 +1068,7 @@ void BBWithSpill::FinishOptml_() {
 
 Enumerator *BBWithSpill::AllocEnumrtr_(Milliseconds timeout) {
   bool enblStallEnum = enblStallEnum_;
+  bool ClusteringEnabled = IsSecondPass() && ClusterMemoryOperations;
   /*  if (!dataDepGraph_->IncludesUnpipelined()) {
       enblStallEnum = false;
     }*/
@@ -779,7 +1076,7 @@ Enumerator *BBWithSpill::AllocEnumrtr_(Milliseconds timeout) {
   enumrtr_ = new LengthCostEnumerator(
       dataDepGraph_, machMdl_, schedUprBound_, GetSigHashSize(),
       GetEnumPriorities(), GetPruningStrategy(), SchedForRPOnly_, enblStallEnum,
-      timeout, GetSpillCostFunc(), 0, NULL);
+      timeout, GetSpillCostFunc(), ClusteringEnabled, 0, NULL);
 
   return enumrtr_;
 }
@@ -813,26 +1110,14 @@ FUNC_RESULT BBWithSpill::Enumerate_(Milliseconds startTime,
     HandlEnumrtrRslt_(rslt, trgtLngth);
 
     if (GetBestCost() == 0 || rslt == RES_ERROR ||
-        (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT) ||
-        (rslt == RES_SUCCESS && IsSecondPass())) {
-
-      // If doing two pass optsched and on the second pass then terminate if a
-      // schedule is found with the same min-RP found in first pass.
-      if (rslt == RES_SUCCESS && IsSecondPass()) {
-        Logger::Info("Schedule found in second pass, terminating BB loop.");
-
-        if (trgtLngth < schedUprBound_)
-          Logger::Info("Schedule found with length %d is shorter than current schedule with length %d.", trgtLngth, schedUprBound_);
-      }
-
+        (lngthDeadline == rgnDeadline && rslt == RES_TIMEOUT)) {
       break;
     }
 
     enumrtr_->Reset();
     enumCrntSched_->Reset();
 
-    if (!IsSecondPass())
-      CmputSchedUprBound_();
+    CmputSchedUprBound_();
 
     iterCnt++;
     costLwrBound += 1;
@@ -880,14 +1165,54 @@ InstCount BBWithSpill::UpdtOptmlSched(InstSchedule *crntSched,
       Logger::Info("$$$ GOOD_HIT: Better spill cost for a longer schedule");
 
     SetBestCost(crntCost);
+    if (IsSecondPass() && ClusterMemoryOperations)
+      setBestClusterCost(CurrentClusterCost);
     optmlSpillCost_ = crntSpillCost_;
     SetBestSchedLength(crntSched->GetCrntLngth());
     enumBestSched_->Copy(crntSched);
     bestSched_ = enumBestSched_;
+    if (!enumFoundSchedule())
+      setEnumFoundSchedule();
   }
 
   return GetBestCost();
 }
+
+void BBWithSpill::printCurrentClustering() {
+  // Print the instructions in the clusters after finding a schedule.
+  if (ClusterMemoryOperations && (IsSecondPass() || !TwoPassEnabled)) {
+    dbgs() << "Printing clustered instructions:\n";
+    int i = 1;
+    for (const auto &clusters : PastClustersList) {
+      dbgs() << "Printing cluster " << i << ", start cycle (" << clusters->Start
+             << "): ";
+      for (const auto &instr : *clusters->InstrList) {
+        dbgs() << instr << " ";
+      }
+      i++;
+      dbgs() << '\n';
+    }
+
+    if (LastCluster) {
+      dbgs() << "Printing cluster " << i << ", start cycle ("
+             << LastCluster->Start << "): ";
+      for (const auto &instr : *(LastCluster->InstrList)) {
+        dbgs() << instr << " ";
+      }
+      i++;
+      dbgs() << '\n';
+    }
+
+    if (InstrList && InstrList->size() > 0) {
+      dbgs() << "Printing cluster " << i << ", start cycle (" << StartCycle
+             << "): ";
+      for (const auto &instr : *InstrList) {
+        dbgs() << instr << " ";
+      }
+      dbgs() << '\n';
+    }
+  }
+}
 /*****************************************************************************/
 
 void BBWithSpill::SetupForSchdulng_() {
@@ -914,17 +1239,32 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) {
   bool fsbl = true;
   InstCount crntCost, dynmcCostLwrBound;
+  int ClusterDynamicLowerBound;
   if (GetSpillCostFunc() == SCF_SLIL) {
     crntCost = dynamicSlilLowerBound_ * SCW_ + trgtLngth * schedCostFactor_;
   } else {
     crntCost = crntSpillCost_ * SCW_ + trgtLngth * schedCostFactor_;
   }
+  // Add the cost of clustering
+  if (IsSecondPass() && ClusterMemoryOperations) {
+    ClusterDynamicLowerBound = calculateClusterDLB();
+    crntCost += ClusterDynamicLowerBound * ClusteringWeight;
+  }
+
   crntCost -= GetCostLwrBound();
   dynmcCostLwrBound = crntCost;
 
   // assert(cost >= 0);
   assert(dynmcCostLwrBound >= 0);
 
+  /*
+  if (IsSecondPass() && ClusterMemoryOperations) {
+    dbgs() << "Current cycle: " << node->GetTime() <<", current cost is: " <<
+    dynmcCostLwrBound << ". Current best is: " << GetBestCost() << '\n';
+    printCurrentClustering();
+  }
+  */
+
   fsbl = dynmcCostLwrBound < GetBestCost();
 
   // FIXME: RP tracking should be limited to the current SCF. We need RP
@@ -934,6 +1274,16 @@ bool BBWithSpill::ChkCostFsblty(InstCount trgtLngth, EnumTreeNode *node) {
     node->SetCostLwrBound(dynmcCostLwrBound);
     node->SetPeakSpillCost(peakSpillCost_);
     node->SetSpillCostSum(totSpillCost_);
+    if (IsSecondPass() && ClusterMemoryOperations) {
+      node->setClusteringCost(CurrentClusterCost);
+      node->setCurClusteringGroup(ClusterActiveGroup);
+      node->setClusterLwrBound(ClusterDynamicLowerBound);
+      if (ClusterActiveGroup != 0) {
+        node->setClusterAbsorbCount(MAX_INSTR_IN_CLUSTER - CurrentClusterSize);
+      } else {
+        node->setClusterAbsorbCount(0);
+      }
+    }
   }
   return fsbl;
 }
diff --git a/lib/Scheduler/data_dep.cpp b/lib/Scheduler/data_dep.cpp
index c7273b78..ef6e2cda 100644
--- a/lib/Scheduler/data_dep.cpp
+++ b/lib/Scheduler/data_dep.cpp
@@ -197,6 +197,9 @@ DataDepGraph::DataDepGraph(MachineModel *machMdl, LATENCY_PRECISION ltncyPrcsn)
   exitInstCnt_ = 0;
 
   RegFiles = llvm::make_unique(machMdl_->GetRegTypeCnt());
+
+  MinClusterCount = 0;
+  TotalInstructionsInAllClusters = 0;
 }
 
 DataDepGraph::~DataDepGraph() {
@@ -211,6 +214,11 @@ DataDepGraph::~DataDepGraph() {
   delete[] instCntPerType_;
 }
 
+int DataDepGraph::getTotalInstructionsInCluster(int Cluster) {
+  assert(Cluster > 0);
+  return MaxInstructionsInEachClusters[Cluster];
+}
+
 FUNC_RESULT DataDepGraph::SetupForSchdulng(bool cmputTrnstvClsr) {
   assert(wasSetupForSchduling_ == false);
 
@@ -899,7 +907,8 @@ void DataDepGraph::CreateEdge(SchedInstruction *frmNode,
 }
 
 void DataDepGraph::CreateEdge_(InstCount frmNodeNum, InstCount toNodeNum,
-                               int ltncy, DependenceType depType) {
+                               int ltncy, DependenceType depType,
+                               bool IsArtificial) {
   GraphEdge *edge;
 
   assert(frmNodeNum < instCnt_);
@@ -928,7 +937,7 @@ void DataDepGraph::CreateEdge_(InstCount frmNodeNum, InstCount toNodeNum,
   Logger::Info("Creating edge from %d to %d of type %d and latency %d",
                frmNodeNum, toNodeNum, depType, ltncy);
 #endif
-  edge = new GraphEdge(frmNode, toNode, ltncy, depType);
+  edge = new GraphEdge(frmNode, toNode, ltncy, depType, IsArtificial);
 
   frmNode->AddScsr(edge);
   toNode->AddPrdcsr(edge);
@@ -2753,6 +2762,7 @@ void InstSchedule::Copy(InstSchedule *src) {
 
   SetSpillCosts(src->spillCosts_);
   SetPeakRegPressures(src->peakRegPressures_);
+  setClusterSize(src->getClusterSize());
   cost_ = src->cost_;
   execCost_ = src->execCost_;
   spillCost_ = src->spillCost_;
@@ -2827,6 +2837,44 @@ void InstSchedule::Print(std::ostream &out, char const *const label) {
   }
 }
 
+
+void InstSchedule::Print(std::ostream &out, char const *const title,
+                         DataDepGraph *ddg) {
+  InstCount slotInCycle = 0;
+  InstCount cycleNum = 0;
+  InstCount i;
+
+  // out << '\n' << label << " Schedule";
+  Logger::Info("Printing Schedule");
+
+  for (i = 0; i < crntSlotNum_; i++) {
+    if (slotInCycle == 0) {
+      if (instInSlot_[i] != SCHD_STALL) {
+        InstCount instNum = instInSlot_[i];
+        SchedInstruction *inst = ddg->GetInstByIndx(instNum);
+        Logger::Info("Cycle# %d : %d - %s", cycleNum, instInSlot_[i], inst->GetName());
+      } else
+        Logger::Info("Cycle# %d : %d -", cycleNum, instInSlot_[i]);
+    }
+    /*
+    out << "\nCycle# " << cycleNum << ": ";
+
+    if (instInSlot_[i] == SCHD_STALL) {
+      out << "X ";
+    } else {
+      out << instInSlot_[i] << ' ';
+    }
+    */
+
+    slotInCycle++;
+
+    if (slotInCycle == issuRate_) {
+      slotInCycle = 0;
+      cycleNum++;
+    }
+  }
+}
+
 #if defined(IS_DEBUG_PEAK_PRESSURE) || defined(IS_DEBUG_OPTSCHED_PRESSURES)
 void InstSchedule::PrintRegPressures() const {
   Logger::Info("OptSched max reg pressures:");
@@ -2972,8 +3020,15 @@ bool InstSchedule::VerifyDataDeps_(DataDepGraph *dataDepGraph) {
     UDT_GLABEL ltncy;
     DependenceType depType;
-    for (SchedInstruction *scsr = inst->GetFrstScsr(NULL, &ltncy, &depType);
-         scsr != NULL; scsr = inst->GetNxtScsr(NULL, &ltncy, &depType)) {
+    bool IsArtificial;
+    for (SchedInstruction *scsr =
+             inst->GetFrstScsr(NULL, &ltncy, &depType, &IsArtificial);
+         scsr != NULL;
+         scsr = inst->GetNxtScsr(NULL, &ltncy, &depType, &IsArtificial)) {
+      // Artificial edges are not required for the schedule to be correct
+      if (IsArtificial)
+        continue;
+
       InstCount scsrCycle = GetSchedCycle(scsr);
       if (scsrCycle < (instCycle + ltncy)) {
         Logger::Error("Invalid schedule: Latency from %d to %d not satisfied",
@@ -3043,6 +3098,10 @@ void InstSchedule::SetSpillCost(InstCount cost) { spillCost_ = cost; }
 
 InstCount InstSchedule::GetSpillCost() const { return spillCost_; }
 
+void InstSchedule::setClusterSize(int size) { ClusterSize = size; }
+
+int InstSchedule::getClusterSize() const { return ClusterSize; }
+
 /*******************************************************************************
  * Previously inlined functions
  ******************************************************************************/
@@ -3205,7 +3264,6 @@ bool DataDepGraph::DoesFeedUser(SchedInstruction *inst) {
       // If there is a successor instruction that decreases live intervals
       // or one that does not increase live intervals, then return true.
       return true;
-
   }
   // Return false if there is no recursive successor of inst
   // that uses a live register.
diff --git a/lib/Scheduler/enumerator.cpp b/lib/Scheduler/enumerator.cpp
index d9c4e3b1..43bf6ed6 100644
--- a/lib/Scheduler/enumerator.cpp
+++ b/lib/Scheduler/enumerator.cpp
@@ -64,6 +64,12 @@ void EnumTreeNode::Init_() {
   isLeaf_ = false;
   cost_ = INVALID_VALUE;
   costLwrBound_ = INVALID_VALUE;
+  ClusterCost = INVALID_VALUE;
+  ClusterActiveGroup = INVALID_VALUE;
+  ClusterAbsorbCount = INVALID_VALUE;
+  ClusterDLB = INVALID_VALUE;
+  ClusterTotalCost = -1;
+  ClusterBestCost = 99999999;
   crntCycleBlkd_ = false;
   rsrvSlots_ = NULL;
   totalCostIsActualCost_ = false;
@@ -434,8 +440,8 @@ Enumerator::Enumerator(DataDepGraph *dataDepGraph, MachineModel *machMdl,
                        InstCount schedUprBound, int16_t sigHashSize,
                        SchedPriorities prirts, Pruning PruningStrategy,
                        bool SchedForRPOnly, bool enblStallEnum,
-                       Milliseconds timeout, InstCount preFxdInstCnt,
-                       SchedInstruction *preFxdInsts[])
+                       Milliseconds timeout, bool ClusteringEnabled,
+                       InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[])
     : ConstrainedScheduler(dataDepGraph, machMdl, schedUprBound) {
   memAllocBlkSize_ = (int)timeout / TIMEOUT_TO_MEMBLOCK_RATIO;
   assert(preFxdInstCnt >= 0);
@@ -454,6 +460,7 @@ Enumerator::Enumerator(DataDepGraph *dataDepGraph, MachineModel *machMdl,
   prune_ = PruningStrategy;
   SchedForRPOnly_ = SchedForRPOnly;
   enblStallEnum_ = enblStallEnum;
+  Clustering = ClusteringEnabled;
 
   isEarlySubProbDom_ = true;
 
@@ -1316,17 +1323,27 @@ void SetTotalCostsAndSuffixes(EnumTreeNode *const currentNode,
     Logger::Info("Leaf node total cost %d", currentNode->GetCost());
#endif
     currentNode->SetTotalCost(currentNode->GetCost());
+    if (currentNode->isClustering())
+      currentNode->setTotalClusterCost(currentNode->getClusteringCost());
     currentNode->SetTotalCostIsActualCost(true);
   } else {
-    if (!currentNode->GetTotalCostIsActualCost() &&
-        (currentNode->GetTotalCost() == -1 ||
-         currentNode->GetCostLwrBound() < currentNode->GetTotalCost())) {
-#if defined(IS_DEBUG_ARCHIVE)
-      Logger::Info("Inner node doesn't have a real cost yet. Setting total "
-                   "cost to dynamic lower bound %d",
-                   currentNode->GetCostLwrBound());
-#endif
-      currentNode->SetTotalCost(currentNode->GetCostLwrBound());
+    if (!currentNode->GetTotalCostIsActualCost()) {
+      // Set overall weighted sum cost
+      if (currentNode->GetTotalCost() == -1 ||
+          currentNode->GetCostLwrBound() < currentNode->GetTotalCost()) {
+        #if defined(IS_DEBUG_ARCHIVE)
+        Logger::Info("Inner node doesn't have a real cost yet. Setting total "
+                     "cost to dynamic lower bound %d",
+                     currentNode->GetCostLwrBound());
+        #endif
+        currentNode->SetTotalCost(currentNode->GetCostLwrBound());
+      }
+
+      // Set clustering cost
+      if (currentNode->isClustering() &&
+          (currentNode->getTotalClusterCost() == -1 ||
+           currentNode->getClusterLwrBound() < currentNode->getTotalClusterCost())) {
+        currentNode->setTotalClusterCost(currentNode->getClusterLwrBound());
+      }
     }
   }
 
@@ -1359,16 +1376,25 @@ void SetTotalCostsAndSuffixes(EnumTreeNode *const currentNode,
                    currentNode->GetTotalCost());
#endif
       parentNode->SetTotalCost(currentNode->GetTotalCost());
+      if (currentNode->isClustering())
+        parentNode->setTotalClusterCost(currentNode->getTotalClusterCost());
       parentNode->SetTotalCostIsActualCost(true);
       parentNode->SetSuffix(std::move(parentSuffix));
-    } else if (currentNode->GetTotalCost() < parentNode->GetTotalCost()) {
-#if defined(IS_DEBUG_ARCHIVE)
-      Logger::Info(
-          "Current node has a real cost (%d), and so does parent. (%d)",
(%d)", + currentNode->GetTotalCost(), parentNode->GetTotalCost()); + #endif + parentNode->SetTotalCost(currentNode->GetTotalCost()); + parentNode->SetSuffix(std::move(parentSuffix)); + } + + // Set clustering cost + if (currentNode->isClustering() && currentNode->getTotalClusterCost() < parentNode->getTotalClusterCost()) { + parentNode->setTotalClusterCost(currentNode->getTotalClusterCost()); + } } } } @@ -1856,7 +1882,7 @@ LengthEnumerator::LengthEnumerator( bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[]) : Enumerator(dataDepGraph, machMdl, schedUprBound, sigHashSize, prirts, - PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, + PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, false, preFxdInstCnt, preFxdInsts) { SetupAllocators_(); tmpHstryNode_ = new HistEnumTreeNode; @@ -1941,11 +1967,11 @@ LengthCostEnumerator::LengthCostEnumerator( DataDepGraph *dataDepGraph, MachineModel *machMdl, InstCount schedUprBound, int16_t sigHashSize, SchedPriorities prirts, Pruning PruningStrategy, bool SchedForRPOnly, bool enblStallEnum, Milliseconds timeout, - SPILL_COST_FUNCTION spillCostFunc, InstCount preFxdInstCnt, - SchedInstruction *preFxdInsts[]) + SPILL_COST_FUNCTION spillCostFunc, bool ClusteringEnabled, + InstCount preFxdInstCnt, SchedInstruction *preFxdInsts[]) : Enumerator(dataDepGraph, machMdl, schedUprBound, sigHashSize, prirts, PruningStrategy, SchedForRPOnly, enblStallEnum, timeout, - preFxdInstCnt, preFxdInsts) { + ClusteringEnabled, preFxdInstCnt, preFxdInsts) { SetupAllocators_(); costChkCnt_ = 0; @@ -2141,6 +2167,7 @@ bool LengthCostEnumerator::BackTrack_() { /*****************************************************************************/ InstCount LengthCostEnumerator::GetBestCost_() { return rgn_->GetBestCost(); } +int LengthCostEnumerator::GetBestClusterCost_() { return rgn_->getBestClusterCost(); } /*****************************************************************************/ void LengthCostEnumerator::CreateRootNode_() { diff --git a/lib/Scheduler/hist_table.cpp b/lib/Scheduler/hist_table.cpp index a4c1cae7..8a9ff356 100644 --- a/lib/Scheduler/hist_table.cpp +++ b/lib/Scheduler/hist_table.cpp @@ -400,6 +400,10 @@ void CostHistEnumTreeNode::Init_() { costInfoSet_ = false; #endif cost_ = 0; + ClusterCost = 9999999; + ClusterTotalCost = 9999999; + ClusterActiveGroup = 0; + ClusterAbsorbCount = 0; } bool CostHistEnumTreeNode::DoesDominate(EnumTreeNode *node, @@ -467,6 +471,41 @@ static bool doesHistoryPeakCostDominate(InstCount OtherPrefixCost, return LCE->GetBestCost() <= OtherPrefixCost; } +static bool doesClusterCostDominate(EnumTreeNode *CurEnumNode, + int ClusterActiveGroup, int ClusterCost, + int ClusterAbsorbCount, int ClusterTotalCost, + int ClusterBest) { + // Correct but too restrictive + if (CurEnumNode->getCurClusteringGroup() != ClusterActiveGroup) + return false; + + // Count the instructions only if there is an instruction in the ready list that belongs + // to the open cluster. If there is none, you can't add any instructions. If there are no instructions + // on the ready list that belong to the open cluster, we can set the cluster absorb count to 0. 
+
+  // Two cases for a history node:
+  // 1.) One without a full schedule below it. Look at DLB.
+  // 2.) One with a full schedule below it. Look at the best found below the
+  //     history node.
+  if (ClusterBest != INVALID_VALUE && improvement <= ClusterTotalCost - ClusterBest)
+    return true;
+
+  return false;
+}
+
 // Should we prune the other node based on RP cost.
 bool CostHistEnumTreeNode::ChkCostDmntnForBBSpill_(EnumTreeNode *Node,
                                                    Enumerator *E) {
@@ -502,6 +541,10 @@ bool CostHistEnumTreeNode::ChkCostDmntnForBBSpill_(EnumTreeNode *Node,
       ShouldPrune =
          spillCostSum_ % instCnt >= Node->GetSpillCostSum() % instCnt;
     }
+    if (!ShouldPrune && LCE->isClustering()) {
+      int ClusterBest = LCE->getBestClusterCost();
+      ShouldPrune = doesClusterCostDominate(Node, ClusterActiveGroup, ClusterCost, ClusterAbsorbCount, ClusterTotalCost, ClusterBest);
+    }
   }
   return ShouldPrune;
 }
@@ -511,6 +554,10 @@ void CostHistEnumTreeNode::SetCostInfo(EnumTreeNode *node, bool, Enumerator *) {
   peakSpillCost_ = node->GetPeakSpillCost();
   spillCostSum_ = node->GetSpillCostSum();
   isLngthFsbl_ = node->IsLngthFsbl();
+  ClusterCost = node->getClusteringCost();
+  ClusterActiveGroup = node->getCurClusteringGroup();
+  ClusterAbsorbCount = node->getClusterAbsorbCount();
+  ClusterTotalCost = node->getTotalClusterCost();
 
   // (Chris)
   partialCost_ = node->GetCostLwrBound();
diff --git a/lib/Scheduler/ready_list.cpp b/lib/Scheduler/ready_list.cpp
index 7abd3ff0..6bee513b 100644
--- a/lib/Scheduler/ready_list.cpp
+++ b/lib/Scheduler/ready_list.cpp
@@ -15,7 +15,7 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) {
   // enable fast updating for dynamic heuristics.
   if (prirts_.isDynmc)
     keyedEntries_ = new KeyedEntry<SchedInstruction, unsigned long>
-        *[dataDepGraph->GetInstCnt()];
+        *[dataDepGraph->GetInstCnt()]();
   else
     keyedEntries_ = nullptr;
 
@@ -34,9 +34,6 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) {
       break;
 
     case LSH_LUC:
-      for (int j = 0; j < dataDepGraph->GetInstCnt(); j++) {
-        keyedEntries_[j] = NULL;
-      }
      maxUseCnt_ = dataDepGraph->GetMaxUseCnt();
      useCntBits_ = Utilities::clcltBitsNeededToHoldNum(maxUseCnt_);
      totKeyBits += useCntBits_;
      break;
@@ -73,6 +70,14 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) {
      ltncySumBits_ = Utilities::clcltBitsNeededToHoldNum(maxLtncySum_);
      totKeyBits += ltncySumBits_;
      break;
+
+    case LSH_CLUSTER:
+      // Bits needed: 1
+      // 0: Not part of an active cluster
+      // 1: Part of an active cluster
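+      // Note (explanatory): CmputKey_ composes key fields in the order the
+      // priorities are listed, so listing Cluster first (e.g. in
+      // SECOND_PASS_ENUM_HEURISTIC) makes this bit the most significant
+      // field, ranking instructions of the active cluster above all others
+      // while the remaining heuristics break ties.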
+      ClusterBit = Utilities::clcltBitsNeededToHoldNum(1);
+      totKeyBits += ClusterBit;
+      break;
     } // end switch
   } // end for
 
@@ -116,6 +121,10 @@ ReadyList::ReadyList(DataDepGraph *dataDepGraph, SchedPriorities prirts) {
         AddPrirtyToKey_(maxPriority_, keySize, ltncySumBits_, maxLtncySum_,
                         maxLtncySum_);
         break;
+
+      case LSH_CLUSTER:
+        AddPrirtyToKey_(maxPriority_, keySize, ClusterBit, 1, 1);
+        break;
       }
     }
   }
@@ -152,6 +161,8 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate,
   int16_t keySize = 0;
   int i;
   int16_t oldLastUseCnt, newLastUseCnt;
+  unsigned long ValueForKey;
+  bool OldWasActive, NewWasActive;
   changed = true;
   if (isUpdate)
     changed = false;
@@ -198,6 +209,24 @@ unsigned long ReadyList::CmputKey_(SchedInstruction *inst, bool isUpdate,
       AddPrirtyToKey_(key, keySize, ltncySumBits_, inst->GetLtncySum(),
                       maxLtncySum_);
       break;
+
+    case LSH_CLUSTER:
+      // Modeled on how the LUC priority is recomputed during updates.
+      if (inst->GetClusterGroup() == 0)
+        ValueForKey = 0;
+      else {
+        OldWasActive = inst->getWasActive();
+        NewWasActive = inst->computeWasActive();
+
+        if (OldWasActive != NewWasActive) {
+          changed = true;
+        }
+        ValueForKey =
+            inst->GetClusterGroup() == SchedInstruction::GetActiveCluster() ? 1
+                                                                            : 0;
+      }
+      AddPrirtyToKey_(key, keySize, ClusterBit, ValueForKey, 1);
+      break;
     }
   }
   return key;
@@ -214,14 +243,17 @@ void ReadyList::AddLatestSubLists(LinkedList<SchedInstruction> *lst1,
 }
 
 void ReadyList::Print(std::ostream &out) {
+  PriorityList<SchedInstruction> *OutList = new PriorityList<SchedInstruction>;
+  OutList->CopyList(prirtyLst_, nullptr);
   out << "Ready List: ";
-  for (const auto *crntInst = prirtyLst_->GetFrstElmnt(); crntInst != NULL;
-       crntInst = prirtyLst_->GetNxtElmnt()) {
-    out << " " << crntInst->GetNum();
+  for (auto *crntInst = OutList->GetFrstElmnt(); crntInst != NULL;
+       crntInst = OutList->GetNxtElmnt()) {
+    out << " " << crntInst->GetNum() << "(" << crntInst->GetClusterGroup()
+        << ")";
   }
   out << '\n';
 
-  prirtyLst_->ResetIterator();
+  delete OutList;
 }
 
 void ReadyList::AddLatestSubList_(LinkedList<SchedInstruction> *lst) {
@@ -280,6 +312,7 @@ void ReadyList::AddInst(SchedInstruction *inst) {
   assert(changed == true);
   KeyedEntry<SchedInstruction, unsigned long> *entry =
       prirtyLst_->InsrtElmnt(inst, key, true);
+
   InstCount instNum = inst->GetNum();
   if (prirts_.isDynmc)
     keyedEntries_[instNum] = entry;
diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp
index 2c2e4752..4aec6ec6 100644
--- a/lib/Scheduler/sched_basic_data.cpp
+++ b/lib/Scheduler/sched_basic_data.cpp
@@ -4,6 +4,9 @@
 
 using namespace llvm::opt_sched;
 
+// Initially set the active clustering to 0 for none.
diff --git a/lib/Scheduler/sched_basic_data.cpp b/lib/Scheduler/sched_basic_data.cpp
index 2c2e4752..4aec6ec6 100644
--- a/lib/Scheduler/sched_basic_data.cpp
+++ b/lib/Scheduler/sched_basic_data.cpp
@@ -4,6 +4,9 @@

 using namespace llvm::opt_sched;

+// Initially set the active cluster to 0, meaning no cluster is active.
+int SchedInstruction::ActiveCluster = 0;
+
 SchedInstruction::SchedInstruction(InstCount num, const string &name,
                                    InstType instType, const string &opCode,
                                    InstCount maxInstCnt, int nodeID,
@@ -15,6 +18,8 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name,
   name_ = name;
   opCode_ = opCode;
   instType_ = instType;
+  ClusterGroup = 0;
+  MayCluster = false;

   frwrdLwrBound_ = INVALID_VALUE;
   bkwrdLwrBound_ = INVALID_VALUE;
@@ -60,6 +65,7 @@ SchedInstruction::SchedInstruction(InstCount num, const string &name,

   mustBeInBBEntry_ = false;
   mustBeInBBExit_ = false;
+  WasActive = false;
 }

 SchedInstruction::~SchedInstruction() {
@@ -68,6 +74,11 @@ SchedInstruction::~SchedInstruction() {
   delete crntRange_;
 }

+bool SchedInstruction::computeWasActive() {
+  WasActive = GetActiveCluster() == GetClusterGroup();
+  return WasActive;
+}
+
 void SchedInstruction::SetupForSchdulng(InstCount instCnt, bool isCP_FromScsr,
                                         bool isCP_FromPrdcsr) {
   if (memAllocd_)
@@ -373,7 +384,8 @@ SchedInstruction *SchedInstruction::GetNxtPrdcsr(InstCount *scsrNum,

 SchedInstruction *SchedInstruction::GetFrstScsr(InstCount *prdcsrNum,
                                                 UDT_GLABEL *ltncy,
-                                                DependenceType *depType) {
+                                                DependenceType *depType,
+                                                bool *IsArtificial) {
   GraphEdge *edge = GetFrstScsrEdge();
   if (!edge)
     return NULL;
@@ -383,12 +395,15 @@ SchedInstruction *SchedInstruction::GetFrstScsr(InstCount *prdcsrNum,
     *ltncy = edge->label;
   if (depType)
     *depType = (DependenceType)edge->label2;
+  if (IsArtificial)
+    *IsArtificial = edge->IsArtificial;
   return (SchedInstruction *)(edge->to);
 }

 SchedInstruction *SchedInstruction::GetNxtScsr(InstCount *prdcsrNum,
                                                UDT_GLABEL *ltncy,
-                                               DependenceType *depType) {
+                                               DependenceType *depType,
+                                               bool *IsArtificial) {
   GraphEdge *edge = GetNxtScsrEdge();
   if (!edge)
     return NULL;
@@ -398,6 +413,8 @@ SchedInstruction *SchedInstruction::GetNxtScsr(InstCount *prdcsrNum,
     *ltncy = edge->label;
   if (depType)
     *depType = (DependenceType)edge->label2;
+  if (IsArtificial)
+    *IsArtificial = edge->IsArtificial;
   return (SchedInstruction *)(edge->to);
 }
@@ -717,6 +734,13 @@ int16_t SchedInstruction::CmputLastUseCnt() {
   return lastUseCnt_;
 }

+void SchedInstruction::SetMayCluster(int ClusteringGroup) {
+  if (ClusteringGroup > 0) {
+    ClusterGroup = ClusteringGroup;
+    MayCluster = true;
+  }
+}
+
 /******************************************************************************
  * SchedRange                                                                 *
  ******************************************************************************/
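A typical consumer of the new IsArtificial out-parameter on GetFrstScsr/GetNxtScsr would walk the successor list and skip cluster or other artificial edges. This is a hypothetical usage sketch written against the signatures above; countRealSuccessors is not part of this patch and it assumes the OptSched headers are available:

#include "opt-sched/Scheduler/sched_basic_data.h"

using namespace llvm::opt_sched;

// Count only the real (non-artificial) successors of an instruction,
// ignoring edges added for clustering.
static int countRealSuccessors(SchedInstruction *inst) {
  int count = 0;
  InstCount scsrNum;
  UDT_GLABEL ltncy;
  DependenceType depType;
  bool isArtificial;
  for (SchedInstruction *scsr =
           inst->GetFrstScsr(&scsrNum, &ltncy, &depType, &isArtificial);
       scsr != NULL;
       scsr = inst->GetNxtScsr(&scsrNum, &ltncy, &depType, &isArtificial)) {
    if (!isArtificial)
      ++count;
  }
  return count;
}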
diff --git a/lib/Scheduler/sched_region.cpp b/lib/Scheduler/sched_region.cpp
index 23dfd165..64e4bc56 100644
--- a/lib/Scheduler/sched_region.cpp
+++ b/lib/Scheduler/sched_region.cpp
@@ -2,6 +2,7 @@
 #include
 #include
+#include "Wrapper/OptSchedDDGWrapperBasic.h"
 #include "opt-sched/Scheduler/aco.h"
 #include "opt-sched/Scheduler/bb_spill.h"
 #include "opt-sched/Scheduler/config.h"
@@ -39,6 +40,7 @@ SchedRegion::SchedRegion(MachineModel *machMdl, DataDepGraph *dataDepGraph,
   totalSimSpills_ = INVALID_VALUE;
   bestCost_ = INVALID_VALUE;
+  BestClusterCost = INVALID_VALUE;
   bestSchedLngth_ = INVALID_VALUE;
   hurstcCost_ = INVALID_VALUE;
   enumCrntSched_ = NULL;
@@ -47,6 +49,8 @@ SchedRegion::SchedRegion(MachineModel *machMdl, DataDepGraph *dataDepGraph,
   schedUprBound_ = INVALID_VALUE;
   spillCostFunc_ = spillCostFunc;
+
+  PrintClustering = false;
+  EnumFoundSchedule = false;
 }

 void SchedRegion::UseFileBounds_() {
@@ -122,6 +126,10 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule(
   // heuristic scheduler or ACO before the branch & bound enumerator must be
   // enabled.
   Config &schedIni = SchedulerOptions::getInstance();
+  PrintClustering = schedIni.GetBool("PRINT_CLUSTER");
+  TwoPassEnabled = schedIni.GetBool("USE_TWO_PASS");
+  ClusterMemoryOperations = schedIni.GetBool("CLUSTER_MEMORY_OPS");
+  ClusteringWeight = schedIni.GetInt("CLUSTER_WEIGHT");
   bool HeuristicSchedulerEnabled = schedIni.GetBool("HEUR_ENABLED");
   bool AcoSchedulerEnabled = schedIni.GetBool("ACO_ENABLED");
   bool BbSchedulerEnabled = isBbEnabled(schedIni, rgnTimeout);
@@ -176,17 +184,6 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule(
   CmputAbslutUprBound_();
   schedLwrBound_ = dataDepGraph_->GetSchedLwrBound();

-  // We can calculate lower bounds here since it is only dependent
-  // on schedLwrBound_
-  if (!BbSchedulerEnabled)
-    costLwrBound_ = CmputCostLwrBound();
-  else
-    CmputLwrBounds_(false);
-
-  // Log the lower bound on the cost, allowing tools reading the log to compare
-  // absolute rather than relative costs.
-  Logger::Info("Lower bound of cost before scheduling: %d", costLwrBound_);
-
   // Step #1: Find the heuristic schedule if enabled.
   // Note: Heuristic scheduler is required for the two-pass scheduler
   // to use the sequential list scheduler which inserts stalls into
@@ -208,9 +205,37 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule(
     hurstcTime = Utilities::GetProcessorTime() - hurstcStart;
     stats::heuristicTime.Record(hurstcTime);
+
     if (hurstcTime > 0)
       Logger::Info("Heuristic_Time %d", hurstcTime);
+  }
+
+  // After the sequential scheduler runs in the second pass, add the artificial
+  // edges to the DDG. Some DAG mutations add artificial edges that conflict
+  // with the sequential scheduler, so wait until it has finished before adding
+  // them.
+  if (IsSecondPass()) {
+    static_cast<OptSchedDDGWrapperBasic *>(dataDepGraph_)->addArtificialEdges();
+    rslt = dataDepGraph_->UpdateSetupForSchdulng(needTransitiveClosure);
+    if (rslt != RES_SUCCESS) {
+      Logger::Info("Invalid DAG after adding artificial cluster edges");
+      return rslt;
+    }
+  }
+
+  // This must be done after SetupForSchdulng() or UpdateSetupForSchdulng() to
+  // avoid resetting lower bound values.
+  if (!BbSchedulerEnabled)
+    costLwrBound_ = CmputCostLwrBound();
+  else
+    CmputLwrBounds_(false);
+
+  // Log the lower bound on the cost, allowing tools reading the log to compare
+  // absolute rather than relative costs.
+ Logger::Info("Lower bound of cost before scheduling: %d", costLwrBound_); + // Cost calculation must be below lower bounds calculation + if (HeuristicSchedulerEnabled || IsSecondPass()) { heuristicScheduleLength = lstSched->GetCrntLngth(); InstCount hurstcExecCost; // Compute cost for Heuristic list scheduler, this must be called before @@ -225,6 +250,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = lstSched; bestSchedLngth_ = heuristicScheduleLength; bestCost_ = hurstcCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(lstSched->getClusterSize()); } FinishHurstc_(); @@ -279,6 +306,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = AcoSchedule; bestSchedLngth_ = AcoScheduleLength_; bestCost_ = AcoScheduleCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(AcoSchedule->getClusterSize()); } } @@ -294,6 +323,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = lstSched; bestSchedLngth_ = heuristicScheduleLength; bestCost_ = hurstcCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(lstSched->getClusterSize()); } // B) Heuristic was never run. In that case, just use ACO and run with its // results, into B&B. @@ -301,6 +332,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_ = AcoSchedule; bestSchedLngth_ = AcoScheduleLength_; bestCost_ = AcoScheduleCost_; + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(AcoSchedule->getClusterSize()); // C) Neither scheduler was optimal. In that case, compare the two // schedules and use the one that's better as the input (initialSched) for // B&B. @@ -309,6 +342,8 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( bestSched = bestSched_; bestSchedLngth_ = bestSched_->GetCrntLngth(); bestCost_ = bestSched_->GetCost(); + if (IsSecondPass() && ClusterMemoryOperations) + bestSched->setClusterSize(bestSched_->getClusterSize()); } } // Step #3: Compute the cost upper bound. @@ -376,6 +411,9 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( InitialSchedule = bestSched_; InitialScheduleCost = bestCost_; InitialScheduleLength = bestSchedLngth_; + /*Logger::Info("Printing Initiial schedule"); + InitialSchedule->Print(Logger::GetLogStream(), "InitialSched", dataDepGraph_); + Logger::Info("Finish printing initial schedule");*/ // Step #4: Find the optimal schedule if the heuristc and ACO was not optimal. if (BbSchedulerEnabled) { @@ -606,6 +644,14 @@ FUNC_RESULT SchedRegion::FindOptimalSchedule( Logger::Info("DAG %s PEAK %d", dataDepGraph_->GetDagID(), maxSpillCost); } #endif + + if (PrintClustering && bestSched != NULL && (IsSecondPass() || !TwoPassEnabled)) { + computeAndPrintClustering(bestSched); + } + + //if (bestSched != NULL) + // bestSched->Print(Logger::GetLogStream(), "FinalSched", dataDepGraph_); + return rslt; } @@ -708,11 +754,6 @@ bool SchedRegion::CmputUprBounds_(InstSchedule *schedule, bool useFileBounds) { // If the heuristic schedule is optimal, we are done! schedUprBound_ = bestSchedLngth_; return true; - } else if (IsSecondPass()) { - // In the second pass, the upper bound is the length of the min-RP schedule - // that was found in the first pass with stalls inserted. 
-    schedUprBound_ = schedule->GetCrntLngth();
-    return false;
   } else {
     CmputSchedUprBound_();
     return false;
diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.cpp b/lib/Wrapper/AMDGPU/GCNOptSched.cpp
index 46d6c1a3..915f4e6b 100644
--- a/lib/Wrapper/AMDGPU/GCNOptSched.cpp
+++ b/lib/Wrapper/AMDGPU/GCNOptSched.cpp
@@ -7,8 +7,13 @@
 #include "GCNOptSched.h"
 #include "AMDGPUMacroFusion.h"
 #include "GCNSchedStrategy.h"
+#include "OptSchedGCNTarget.h"
 #include "SIMachineFunctionInfo.h"
+//#include "llvm/CodeGen/OptSequential.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <memory>
+#include <string>

 #define DEBUG_TYPE "optsched"
@@ -43,7 +48,31 @@ static void getRealRegionPressure(MachineBasicBlock::const_iterator Begin,

 ScheduleDAGOptSchedGCN::ScheduleDAGOptSchedGCN(
     llvm::MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S)
-    : ScheduleDAGOptSched(C, std::move(S)) {}
+    : ScheduleDAGOptSched(C, std::move(S)) {
+  MinOcc = getMinOcc();
+}
+
+unsigned ScheduleDAGOptSchedGCN::getMinOcc() {
+  SchedulerOptions &schedIni = SchedulerOptions::getInstance();
+  int MinOcc = schedIni.GetInt("MIN_OCCUPANCY_FOR_RESCHEDULE");
+  if (MinOcc <= 10 && MinOcc >= 1)
+    return MinOcc;
+
+  llvm::report_fatal_error(
+      "Invalid value for MIN_OCCUPANCY_FOR_RESCHEDULE setting: " +
+          std::to_string(MinOcc),
+      false);
+}
+
+int ScheduleDAGOptSchedGCN::getMinILPImprovement() {
+  SchedulerOptions &schedIni = SchedulerOptions::getInstance();
+  int MinIlpImprovement = schedIni.GetInt("MIN_ILP_IMPROVEMENT");
+  if (MinIlpImprovement <= 100 && MinIlpImprovement >= 0)
+    return MinIlpImprovement;
+
+  llvm::report_fatal_error(
+      "Invalid value for MIN_ILP_IMPROVEMENT setting: " +
+          std::to_string(MinIlpImprovement),
+      false);
+}

 void ScheduleDAGOptSchedGCN::initSchedulers() {
   // Add DAG mutations that apply to both GCN and OptSched DAG's
@@ -58,22 +87,54 @@ void ScheduleDAGOptSchedGCN::initSchedulers() {
   // First
   SchedPasses.push_back(OptSchedMaxOcc);
-  // Second
+  // Second: the ILP passes
   SchedPasses.push_back(OptSchedBalanced);
+  SchedPasses.push_back(OptSchedLowerOccAnalysis);
+  SchedPasses.push_back(OptSchedCommitLowerOcc);
+}
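Two gates control the new GCN passes: the lower-occupancy analysis pass (pass 3) only runs while some region still needs rescheduling and the occupancy floor has not been reached, and the commit pass (pass 4) only runs if the analysis met the minimum ILP improvement. A standalone sketch of the first gate, using the default MIN_OCCUPANCY_FOR_RESCHEDULE of 3; the function name is illustrative, not part of this patch:

#include <cstdio>

// Stand-in for the gate that decides whether the lower-occupancy analysis
// pass should run at all.
static bool shouldTryLowerOccupancy(bool anyRegionPending, unsigned targetOcc,
                                    unsigned minOcc) {
  return anyRegionPending && targetOcc > minOcc;
}

int main() {
  // At target occupancy 4 we may try occupancy 3, but at target 3 the
  // configured floor stops any further lowering.
  std::printf("occ 4 -> %d\n", shouldTryLowerOccupancy(true, 4, 3)); // 1
  std::printf("occ 3 -> %d\n", shouldTryLowerOccupancy(true, 3, 3)); // 0
  return 0;
}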
 // Execute scheduling passes.
 // Partially copied from GCNScheduleDAGMILive::finalizeSchedule.
 void ScheduleDAGOptSchedGCN::finalizeSchedule() {
   if (TwoPassEnabled && OptSchedEnabled) {
     initSchedulers();
+    RescheduleRegions.resize(Regions.size());
+    ILPAnalysis.resize(Regions.size());
+    CostAnalysis.resize(Regions.size());
+    LowerOccScheds.resize(Regions.size());
+    RescheduleRegions.set();

     LLVM_DEBUG(dbgs() << "Starting two pass scheduling approach\n");
     TwoPassSchedulingStarted = true;
     for (const SchedPassStrategy &S : SchedPasses) {
       MachineBasicBlock *MBB = nullptr;
       // Reset
-      RegionNumber = ~0u;
+      RegionIdx = 0;
+
+      if (S == OptSchedLowerOccAnalysis) {
+        if (RescheduleRegions.none())
+          break;
+        else {
+          auto GCNOST = static_cast<OptSchedGCNTarget *>(OST.get());
+          unsigned TargetOccupancy = GCNOST->getTargetOcc();
+          if (TargetOccupancy <= MinOcc)
+            break;
+
+          unsigned NewTarget = TargetOccupancy - 1u;
+          dbgs() << "Decreasing current target occupancy " << TargetOccupancy
+                 << " to new target " << NewTarget << '\n';
+          GCNOST->limitOccupancy(NewTarget);
+        }
+      }
+
+      if (S == OptSchedCommitLowerOcc) {
+        if (!shouldCommitLowerOccSched()) {
+          dbgs()
+              << "Lower occupancy schedule did not meet minimum improvement.\n";
+          break;
+        }
+        dbgs() << "Lower occupancy met minimum improvement requirement!\n";
+      }

       for (auto &Region : Regions) {
         RegionBegin = Region.first;
@@ -93,36 +154,42 @@ void ScheduleDAGOptSchedGCN::finalizeSchedule() {
           exitRegion();
           continue;
         }
-        LLVM_DEBUG(getRealRegionPressure(RegionBegin, RegionEnd, LIS, "Before"));
+        LLVM_DEBUG(
+            getRealRegionPressure(RegionBegin, RegionEnd, LIS, "Before"));
         runSchedPass(S);
         LLVM_DEBUG(getRealRegionPressure(RegionBegin, RegionEnd, LIS, "After"));
         Region = std::make_pair(RegionBegin, RegionEnd);
         exitRegion();
+        ++RegionIdx;
       }
       finishBlock();
     }
   }
   ScheduleDAGMILive::finalizeSchedule();
-
-  LLVM_DEBUG(if (isSimRegAllocEnabled()) {
-    dbgs() << "*************************************\n";
-    dbgs() << "Function: " << MF.getName()
-           << "\nTotal Simulated Spills: " << SimulatedSpills << "\n";
-    dbgs() << "*************************************\n";
-  });
 }

 void ScheduleDAGOptSchedGCN::runSchedPass(SchedPassStrategy S) {
+  RescheduleRegions[RegionIdx] = false;
   switch (S) {
   case GCNMaxOcc:
     scheduleGCNMaxOcc();
     break;
   case OptSchedMaxOcc:
     scheduleOptSchedMaxOcc();
+    Logger::Info("End of first pass through");
     break;
   case OptSchedBalanced:
     scheduleOptSchedBalanced();
+    Logger::Info("End of second pass through");
+    break;
+  case OptSchedLowerOccAnalysis:
+    scheduleOptSchedLowerOccAnalysis();
+    Logger::Info("End of third pass through");
+    break;
+  case OptSchedCommitLowerOcc:
+    scheduleCommitLowerOcc();
+    Logger::Info("End of fourth pass through");
     break;
   }
 }
@@ -144,3 +211,37 @@ void ScheduleDAGOptSchedGCN::scheduleOptSchedMaxOcc() {
 void ScheduleDAGOptSchedGCN::scheduleOptSchedBalanced() {
   ScheduleDAGOptSched::scheduleOptSchedBalanced();
 }
+
+void ScheduleDAGOptSchedGCN::scheduleOptSchedLowerOccAnalysis() {
+  IsThirdPass = true;
+  ScheduleDAGOptSched::scheduleOptSchedBalanced();
+  IsThirdPass = false;
+}
+
+void ScheduleDAGOptSchedGCN::scheduleCommitLowerOcc() {
+  IsFourthPass = true;
+  ScheduleDAGOptSched::scheduleOptSchedBalanced();
+  IsFourthPass = false;
+}
+
+bool ScheduleDAGOptSchedGCN::shouldCommitLowerOccSched() {
+  // First analyze the ILP improvement.
+  int FirstPassLengthSum = 0;
+  int SecondPassLengthSum = 0;
+  int MinILPImprovement = getMinILPImprovement();
+  for (std::pair<int, int> &RegionLength : ILPAnalysis) {
+    FirstPassLengthSum += RegionLength.first;
+    SecondPassLengthSum += RegionLength.second;
+  }
+  double FirstPassAverageLength = (double)FirstPassLengthSum / Regions.size();
+  double SecondPassAverageLength = (double)SecondPassLengthSum / Regions.size();
+  double ILPImprovement = ((FirstPassAverageLength - SecondPassAverageLength) /
+                           FirstPassAverageLength) *
+                          100.0;
+  dbgs() << "ILP improvement from the second ILP pass is " << ILPImprovement
+         << ", min improvement is: " << MinILPImprovement << '\n';
+  return ILPImprovement >= MinILPImprovement;
+}
diff --git a/lib/Wrapper/AMDGPU/GCNOptSched.h b/lib/Wrapper/AMDGPU/GCNOptSched.h
index f08056aa..c24c93c1 100644
--- a/lib/Wrapper/AMDGPU/GCNOptSched.h
+++ b/lib/Wrapper/AMDGPU/GCNOptSched.h
@@ -9,17 +9,37 @@

 #include "../OptimizingScheduler.h"
 #include "GCNRegPressure.h"
+#include "OptSchedGCNTarget.h"

 namespace llvm {
 namespace opt_sched {

 class ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched {
 private:
-  enum SchedPassStrategy { GCNMaxOcc, OptSchedMaxOcc, OptSchedBalanced };
+  enum SchedPassStrategy {
+    GCNMaxOcc,
+    OptSchedMaxOcc,
+    OptSchedBalanced,
+    OptSchedLowerOccAnalysis,
+    OptSchedCommitLowerOcc
+  };
+
+  /// Get the minimum occupancy value from the sched.ini settings file. Checks
+  /// that the value is between 1 and 10 and reports a fatal error if it is
+  /// outside the valid range.
+  unsigned getMinOcc();
+
+  /// Get the minimum required ILP improvement percentage from the sched.ini
+  /// settings file.
+  int getMinILPImprovement();
+
+  /// Analyze the possible improvements from lowering the target occupancy
+  /// and decide if we should keep the schedules.
+  bool shouldCommitLowerOccSched();

   // Vector of scheduling passes to execute.
   SmallVector<SchedPassStrategy, 4> SchedPasses;

+  unsigned MinOcc;
+
 public:
   ScheduleDAGOptSchedGCN(llvm::MachineSchedContext *C,
                          std::unique_ptr<MachineSchedStrategy> S);
@@ -45,6 +65,13 @@ class ScheduleDAGOptSchedGCN : public ScheduleDAGOptSched {

   // Run OptSched in ILP/RP balanced mode.
   void scheduleOptSchedBalanced() override;
+
+  // Lower occupancy and run OptSched in ILP/RP balanced mode for analysis.
+  void scheduleOptSchedLowerOccAnalysis();
+
+  // Run OptSched in ILP/RP balanced mode at the lower occupancy to commit
+  // the schedules accepted by the analysis pass.
+  void scheduleCommitLowerOcc();
 };

 } // namespace opt_sched
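As a quick sanity check of the percentage math in shouldCommitLowerOccSched, here is a standalone program with hypothetical region lengths (all values made up):

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Hypothetical (firstPassLength, analysisPassLength) pairs per region.
  std::vector<std::pair<int, int>> ILPAnalysis = {{40, 34}, {60, 54}};
  int First = 0, Second = 0;
  for (auto &P : ILPAnalysis) {
    First += P.first;
    Second += P.second;
  }
  double FirstAvg = (double)First / ILPAnalysis.size();   // 50.0
  double SecondAvg = (double)Second / ILPAnalysis.size(); // 44.0
  // (50 - 44) / 50 * 100 = 12%, which clears a MIN_ILP_IMPROVEMENT of 10,
  // so the lower-occupancy schedules would be committed.
  double Improvement = (FirstAvg - SecondAvg) / FirstAvg * 100.0;
  std::printf("ILP improvement: %.1f%%\n", Improvement);
  return 0;
}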
diff --git a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp
index 21faf51e..9f63a720 100644
--- a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp
+++ b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.cpp
@@ -3,6 +3,7 @@
 // AMDGCN OptSched target.
 //
 //===----------------------------------------------------------------------===//
+#include "OptSchedGCNTarget.h"
 #include "OptSchedDDGWrapperGCN.h"
 #include "SIMachineFunctionInfo.h"
 #include "Wrapper/OptSchedMachineWrapper.h"
@@ -22,7 +23,7 @@ using namespace llvm::opt_sched;

 // This is necessary because we cannot perfectly predict the number of
 // registers of each type that will be allocated.
-static const unsigned GPRErrorMargin = 3;
+static const unsigned GPRErrorMargin = 0;

 #ifndef NDEBUG
 static unsigned getOccupancyWeight(unsigned Occupancy) {
@@ -62,56 +63,6 @@ static unsigned getAdjustedOccupancy(const GCNSubtarget *ST, unsigned VGPRCount,

 namespace {

-class OptSchedGCNTarget : public OptSchedTarget {
-public:
-  std::unique_ptr<OptSchedMachineModel>
-  createMachineModel(const char *ConfigPath) override {
-    return llvm::make_unique<OptSchedMachineModel>(ConfigPath);
-  }
-
-  std::unique_ptr<OptSchedDDGWrapperBase>
-  createDDGWrapper(llvm::MachineSchedContext *Context, ScheduleDAGOptSched *DAG,
-                   OptSchedMachineModel *MM, LATENCY_PRECISION LatencyPrecision,
-                   const std::string &RegionID) override {
-    return llvm::make_unique<OptSchedDDGWrapperGCN>(Context, DAG, MM,
-                                                    LatencyPrecision, RegionID);
-  }
-
-  void initRegion(llvm::ScheduleDAGInstrs *DAG, MachineModel *MM_) override;
-
-  void finalizeRegion(const InstSchedule *Schedule) override;
-
-  // Returns occupancy cost with number of VGPRs and SGPRs from PRP for
-  // a partial or complete schedule.
-  InstCount getCost(const llvm::SmallVectorImpl<unsigned> &PRP) const override;
-
-  void dumpOccupancyInfo(const InstSchedule *Schedule) const;
-
-  // Revert scheduling if we decrease occupancy.
-  bool shouldKeepSchedule() override;
-
-private:
-  const llvm::MachineFunction *MF;
-  SIMachineFunctionInfo *MFI;
-  ScheduleDAGOptSched *DAG;
-  const GCNSubtarget *ST;
-
-  unsigned RegionStartingOccupancy;
-  unsigned RegionEndingOccupancy;
-  unsigned TargetOccupancy;
-
-  // Max occupancy with local memory size.
-  unsigned MaxOccLDS;
-
-  // In RP only (max occupancy) scheduling mode we should try to find
-  // a min-RP schedule without considering perf hints which suggest limiting
-  // occupancy. Returns true if we should consider perf hints.
-  bool shouldLimitWaves() const;
-
-  // Find occupancy with spill cost.
-  unsigned getOccupancyWithCost(const InstCount Cost) const;
-};
-
 std::unique_ptr<OptSchedTarget> createOptSchedGCNTarget() {
   return llvm::make_unique<OptSchedGCNTarget>();
 }
@@ -161,9 +112,9 @@ void OptSchedGCNTarget::initRegion(llvm::ScheduleDAGInstrs *DAG_,
   TargetOccupancy =
       shouldLimitWaves() ? MFI->getMinAllowedOccupancy() : MFI->getOccupancy();
-  LLVM_DEBUG(dbgs() << "Region starting occupancy is "
-                    << RegionStartingOccupancy << "\n"
-                    << "Target occupancy is " << TargetOccupancy << "\n");
+  dbgs() << "Region starting occupancy is " << RegionStartingOccupancy << "\n"
+         << "Target occupancy is " << TargetOccupancy << "\n";
 }

 bool OptSchedGCNTarget::shouldLimitWaves() const {
@@ -173,6 +124,16 @@ bool OptSchedGCNTarget::shouldLimitWaves() const {
   return false;
 }

+void OptSchedGCNTarget::setTargetOcc(unsigned Target) {
+  dbgs() << "Setting target occupancy to " << Target << '\n';
+  TargetOccupancy = Target;
+}
+
+void OptSchedGCNTarget::limitOccupancy(unsigned Limit) {
+  dbgs() << "Limiting occupancy to " << Limit << '\n';
+  MFI->limitOccupancy(Limit);
+  TargetOccupancy = MFI->getOccupancy();
+}
+
 unsigned OptSchedGCNTarget::getOccupancyWithCost(const InstCount Cost) const {
   return TargetOccupancy - Cost;
 }
@@ -184,9 +145,9 @@ void OptSchedGCNTarget::finalizeRegion(const InstSchedule *Schedule) {
   // If we decrease occupancy we may revert scheduling.
   unsigned RegionOccupancy =
       std::max(RegionStartingOccupancy, RegionEndingOccupancy);
-  LLVM_DEBUG(if (RegionOccupancy < MFI->getOccupancy()) dbgs()
-             << "Limiting occupancy to " << RegionEndingOccupancy
-             << " waves.\n");
+  if (RegionOccupancy < MFI->getOccupancy())
+    dbgs() << "Limiting occupancy to " << RegionOccupancy << " waves.\n";
   MFI->limitOccupancy(RegionOccupancy);
 }
diff --git a/lib/Wrapper/AMDGPU/OptSchedGCNTarget.h b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.h
new file mode 100644
index 00000000..996caaff
--- /dev/null
+++ b/lib/Wrapper/AMDGPU/OptSchedGCNTarget.h
@@ -0,0 +1,73 @@
+#ifndef LLVM_GCN_OPT_SCHED_TARGET_H
+#define LLVM_GCN_OPT_SCHED_TARGET_H
+
+#include "OptSchedDDGWrapperGCN.h"
+#include "SIMachineFunctionInfo.h"
+#include "Wrapper/OptSchedMachineWrapper.h"
+#include "opt-sched/Scheduler/OptSchedTarget.h"
+#include "opt-sched/Scheduler/data_dep.h"
+#include "opt-sched/Scheduler/defines.h"
+#include "opt-sched/Scheduler/machine_model.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include <memory>
+#include <string>
+
+using namespace llvm;
+using namespace llvm::opt_sched;
+
+class OptSchedGCNTarget : public OptSchedTarget {
+public:
+  std::unique_ptr<OptSchedMachineModel>
+  createMachineModel(const char *ConfigPath) override {
+    return llvm::make_unique<OptSchedMachineModel>(ConfigPath);
+  }
+
+  std::unique_ptr<OptSchedDDGWrapperBase>
+  createDDGWrapper(llvm::MachineSchedContext *Context, ScheduleDAGOptSched *DAG,
+                   OptSchedMachineModel *MM, LATENCY_PRECISION LatencyPrecision,
+                   const std::string &RegionID) override {
+    return llvm::make_unique<OptSchedDDGWrapperGCN>(Context, DAG, MM,
+                                                    LatencyPrecision, RegionID);
+  }
+
+  void initRegion(llvm::ScheduleDAGInstrs *DAG, MachineModel *MM_) override;
+
+  void finalizeRegion(const InstSchedule *Schedule) override;
+
+  // Returns occupancy cost with number of VGPRs and SGPRs from PRP for
+  // a partial or complete schedule.
+  InstCount getCost(const llvm::SmallVectorImpl<unsigned> &PRP) const override;
+
+  void dumpOccupancyInfo(const InstSchedule *Schedule) const;
+
+  // Revert scheduling if we decrease occupancy.
+  bool shouldKeepSchedule() override;
+
+  void limitOccupancy(unsigned Limit);
+  unsigned getTargetOcc() { return TargetOccupancy; }
+  void setTargetOcc(unsigned Target);
+
+private:
+  const llvm::MachineFunction *MF;
+  SIMachineFunctionInfo *MFI;
+  ScheduleDAGOptSched *DAG;
+  const GCNSubtarget *ST;
+
+  unsigned RegionStartingOccupancy;
+  unsigned RegionEndingOccupancy;
+  unsigned TargetOccupancy;
+
+  // Max occupancy with local memory size.
+  unsigned MaxOccLDS;
+
+  // In RP only (max occupancy) scheduling mode we should try to find
+  // a min-RP schedule without considering perf hints which suggest limiting
+  // occupancy. Returns true if we should consider perf hints.
+  bool shouldLimitWaves() const;
+
+  // Find occupancy with spill cost.
+  unsigned getOccupancyWithCost(const InstCount Cost) const;
+};
+
+#endif
diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp
index ba6985cf..f5b03fe7 100644
--- a/lib/Wrapper/OptSchedDDGWrapperBasic.cpp
+++ b/lib/Wrapper/OptSchedDDGWrapperBasic.cpp
@@ -9,6 +9,8 @@
 #include "opt-sched/Scheduler/logger.h"
 #include "opt-sched/Scheduler/register.h"
 #include "opt-sched/Scheduler/sched_basic_data.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -26,6 +28,7 @@
 #include
 #include
 #include
+#include
 #include

 #define DEBUG_TYPE "optsched-ddg-wrapper"
@@ -71,9 +74,12 @@ OptSchedDDGWrapperBasic::OptSchedDDGWrapperBasic(
   if (ShouldFilterRegisterTypes)
     RTFilter = createLLVMRegTypeFilter(MM, DAG->TRI,
                                        DAG->getRegPressure().MaxSetPressure);
+
+  ClusterCount = 0;
 }

-void OptSchedDDGWrapperBasic::convertSUnits() {
+void OptSchedDDGWrapperBasic::convertSUnits(bool IgnoreRealEdges,
+                                            bool IgnoreArtificialEdges) {
   LLVM_DEBUG(dbgs() << "Building opt_sched DAG\n");
   // The extra 2 are for the artificial root and leaf nodes.
   instCnt_ = nodeCnt_ = DAG->SUnits.size() + 2;
@@ -89,7 +95,7 @@ void OptSchedDDGWrapperBasic::convertSUnits() {

   // Create edges.
   for (const auto &SU : DAG->SUnits) {
-    convertEdges(SU);
+    convertEdges(SU, IgnoreRealEdges, IgnoreArtificialEdges);
   }

   // Add artificial root and leaf nodes and edges.
@@ -407,13 +413,27 @@ inline void OptSchedDDGWrapperBasic::setupLeaf() {
     CreateEdge_(i, LeafNum, 0, DEP_OTHER);
 }

-void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU) {
+void OptSchedDDGWrapperBasic::addArtificialEdges() {
+  for (const auto &SU : DAG->SUnits) {
+    convertEdges(SU, true, false);
+  }
+}
+
+void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU,
+                                           bool IgnoreRealEdges,
+                                           bool IgnoreArtificialEdges) {
   const MachineInstr *instr = SU.getInstr();
   SUnit::const_succ_iterator I, E;
   for (I = SU.Succs.begin(), E = SU.Succs.end(); I != E; ++I) {
     if (I->getSUnit()->isBoundaryNode())
       continue;

+    bool IsArtificial = I->isArtificial() || I->isCluster();
+    if (IgnoreArtificialEdges && IsArtificial)
+      continue;
+    else if (IgnoreRealEdges && !IsArtificial)
+      continue;
+
     DependenceType DepType;
     switch (I->getKind()) {
     case SDep::Data:
@@ -440,7 +460,8 @@ void OptSchedDDGWrapperBasic::convertEdges(const SUnit &SU) {
     else
       Latency = 1; // unit latency = ignore ilp

-    CreateEdge_(SU.NodeNum, I->getSUnit()->NodeNum, Latency, DepType);
+    CreateEdge_(SU.NodeNum, I->getSUnit()->NodeNum, Latency, DepType,
+                IsArtificial);
   }
 }
@@ -500,6 +521,148 @@ void OptSchedDDGWrapperBasic::countBoundaryLiveness(
   }
 }

+// Iterate through all chains found by LLVM and verify that the instructions
+// are actually able to be clustered together.
+// Partially copied from
+// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1554
+int OptSchedDDGWrapperBasic::clusterNeighboringMemOps(
+    ArrayRef<const SUnit *> MemOps) {
+  // True while the next clusterable pair must start a new cluster; cleared
+  // once the current chain has opened one.
+  bool InitForNewCluster = true;
+  // Keep track of the count of instructions that are able to be clustered
+  // and return that number.
+  int TotalInstructionsPossible = 0;
+  int InstructionsInEachCluster = 0;
+  SmallVector<MemOpInfo, 32> MemOpRecords;
+  for (const SUnit *SU : MemOps) {
+    MachineOperand *BaseOp;
+    int64_t Offset;
+    if (DAG->TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset,
+                                          DAG->TRI))
+      MemOpRecords.push_back(MemOpInfo(SU, BaseOp, Offset));
+  }
+
+  if (MemOpRecords.size() < 2) {
+    LLVM_DEBUG(dbgs() << "  Unable to cluster a chain with fewer than 2 mem ops.\n");
+    return 0;
+  }
+
+  llvm::sort(MemOpRecords);
+  unsigned ClusterLength = 1;
+  for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
+    const SUnit *SUa = MemOpRecords[Idx].SU;
+    const SUnit *SUb = MemOpRecords[Idx + 1].SU;
+    LLVM_DEBUG(dbgs() << "  Checking possible clustering of (" << SUa->NodeNum
+                      << ") and (" << SUb->NodeNum << ")\n");
+
+    // Pass a constant of 1 to AMD's function that determines clustering, to
+    // remove its limit of 15. Our enumerator can determine when it has reached
+    // the limit instead of depending on AMD's cutoff.
+    if (DAG->TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp,
+                                      *MemOpRecords[Idx + 1].BaseOp, 1u)) {
+      LLVM_DEBUG(dbgs() << "  Cluster possible at SU(" << SUa->NodeNum
+                        << ")- SU(" << SUb->NodeNum << ")\n");
+
+      // If clustering is possible then increase the cluster count. This only
+      // happens once per new cluster.
+      if (InitForNewCluster) {
+        InitForNewCluster = false;
+        ClusterCount++;
+        setMinClusterCount(ClusterCount);
+        dbgs() << "  Setting total cluster count to " << ClusterCount << "\n";
+      }
+
+      // Tell the instructions what cluster group they are in.
+      if (insts_[SUa->NodeNum]->GetClusterGroup() == 0) {
+        insts_[SUa->NodeNum]->SetMayCluster(ClusterCount);
+        InstructionsInEachCluster++;
+      }
+
+      if (insts_[SUb->NodeNum]->GetClusterGroup() == 0) {
+        insts_[SUb->NodeNum]->SetMayCluster(ClusterCount);
+        InstructionsInEachCluster++;
+      }
+
+      ++ClusterLength;
+    } else {
+      if (!InitForNewCluster) {
+        // If a cluster was initialized and started, save its information
+        // before starting a new one.
+        MaxInstructionsInEachClusters.insert(
+            std::make_pair(ClusterCount, InstructionsInEachCluster));
+        TotalInstructionsPossible += InstructionsInEachCluster;
+        InitForNewCluster = true;
+        InstructionsInEachCluster = 0;
+      }
+      ClusterLength = 1;
+    }
+  }
+  // Save the total instructions possible in this cluster. This number will be
+  // used in enumeration to estimate an optimistic cost on the remaining
+  // cluster blocks.
+  if (!InitForNewCluster) {
+    MaxInstructionsInEachClusters.insert(
+        std::make_pair(ClusterCount, InstructionsInEachCluster));
+    TotalInstructionsPossible += InstructionsInEachCluster;
+  }
+
+  // Return the total number of clusterable instructions found in this chain.
+  return TotalInstructionsPossible;
+}
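The pairwise scan in clusterNeighboringMemOps is easier to follow on a toy chain. The sketch below replaces TII->shouldClusterMemOps with a trivial offset-distance predicate and mirrors the open/close bookkeeping; everything here is illustrative and not OptSched code:

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for shouldClusterMemOps: here, two ops cluster when their sorted
// offsets are within 4 bytes. Real targets apply their own rule.
static bool canCluster(int64_t OffA, int64_t OffB) { return OffB - OffA <= 4; }

int main() {
  // Sorted offsets of one chain: {0, 4, 8, 100}. The pairs (0,4) and (4,8)
  // both cluster, forming one cluster group of 3 instructions; the jump to
  // 100 closes the group, so the last op stays unclustered.
  std::vector<int64_t> Offsets = {0, 4, 8, 100};
  int ClusterCount = 0, InCurrentCluster = 0, Total = 0;
  bool Open = false;
  for (size_t i = 0; i + 1 < Offsets.size(); ++i) {
    if (canCluster(Offsets[i], Offsets[i + 1])) {
      if (!Open) { // open a new cluster and count the first op (SUa)
        Open = true;
        ++ClusterCount;
        ++InCurrentCluster;
      }
      ++InCurrentCluster; // the second op (SUb) joins the open cluster
    } else if (Open) {    // a gap closes the cluster; save its size
      Total += InCurrentCluster;
      Open = false;
      InCurrentCluster = 0;
    }
  }
  if (Open)
    Total += InCurrentCluster;
  std::printf("clusters=%d clustered instrs=%d\n", ClusterCount, Total);
  return 0;
}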
+// Iterate through the SUnits and find all possible clusters using LLVM/AMD's
+// clustering detection method, then transfer the information to our scheduler
+// so it can be accessed during enumeration.
+// Partially copied from
+// https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1595
+int OptSchedDDGWrapperBasic::findPossibleClusters(bool IsLoad) {
+  // The count of all of the instructions that are in a load/store cluster.
+  int TotalInstructionsPossible = 0;
+  // Map DAG NodeNum to store chain ID.
+  DenseMap<unsigned, unsigned> StoreChainIDs;
+  // Map each store chain to a set of dependent MemOps.
+  SmallVector<SmallVector<const SUnit *, 4>, 32> StoreChainDependents;
+  for (const SUnit &SU : DAG->SUnits) {
+    if ((IsLoad && !SU.getInstr()->mayLoad()) ||
+        (!IsLoad && !SU.getInstr()->mayStore()))
+      continue;
+    auto MI = SU.getInstr();
+
+    // Print which instruction may load or store. Used for debugging purposes.
+    dbgs() << "Instruction (" << SU.NodeNum << ") "
+           << DAG->TII->getName(MI->getOpcode()) << " may "
+           << (IsLoad ? "load" : "store") << "\n";
+
+    unsigned ChainPredID = DAG->SUnits.size();
+    for (const SDep &Pred : SU.Preds) {
+      if (Pred.isCtrl() && !(Pred.isArtificial() || Pred.isCluster())) {
+        ChainPredID = Pred.getSUnit()->NodeNum;
+        break;
+      }
+    }
+    // Check if this chain-like pred has been seen
+    // before. ChainPredID==MaxNodeID at the top of the schedule.
+    unsigned NumChains = StoreChainDependents.size();
+    std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
+        StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
+    if (Result.second)
+      StoreChainDependents.resize(NumChains + 1);
+    StoreChainDependents[Result.first->second].push_back(&SU);
+  }
+
+  // Iterate over the store chains.
+  for (auto &SCD : StoreChainDependents) {
+    // Print the chain that LLVM has found.
+    LLVM_DEBUG(dbgs() << "Printing the Node ID of the current chain: ");
+    for (auto SU1 : SCD)
+      LLVM_DEBUG(dbgs() << SU1->NodeNum << " ");
+    LLVM_DEBUG(dbgs() << '\n');
+
+    TotalInstructionsPossible += clusterNeighboringMemOps(SCD);
+  }
+  return TotalInstructionsPossible;
+}
+
 LLVMRegTypeFilter::LLVMRegTypeFilter(
     const MachineModel *MM, const llvm::TargetRegisterInfo *TRI,
     const std::vector &RegionPressure, float RegFilterFactor)
diff --git a/lib/Wrapper/OptSchedDDGWrapperBasic.h b/lib/Wrapper/OptSchedDDGWrapperBasic.h
index 88631511..0679e2b8 100644
--- a/lib/Wrapper/OptSchedDDGWrapperBasic.h
+++ b/lib/Wrapper/OptSchedDDGWrapperBasic.h
@@ -13,6 +13,7 @@
 #include "opt-sched/Scheduler/graph_trans.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include
 #include
@@ -47,8 +48,10 @@ class OptSchedDDGWrapperBasic : public DataDepGraph {
   /// Dump Optsched register def/use information for the region.
   void dumpOptSchedRegisters() const;

-  void convertSUnits() override;
+  void convertSUnits(bool IgnoreRealEdges, bool IgnoreArtificialEdges) override;
+  void addArtificialEdges();
   void convertRegFiles() override;
+  int findPossibleClusters(bool IsLoad) override;

 protected:
   // A convenience machMdl_ pointer casted to OptSchedMachineModel*.
@@ -123,7 +126,8 @@ class OptSchedDDGWrapperBasic : public DataDepGraph {
   void convertSUnit(const llvm::SUnit &SU);

   // Create edges between optsched graph nodes using SUnit successors.
-  void convertEdges(const llvm::SUnit &SU);
+  void convertEdges(const llvm::SUnit &SU, bool IgnoreRealEdges,
+                    bool IgnoreArtificialEdges);

   // Count the number of registers defined by the region boundary.
   void countBoundaryLiveness(std::vector &RegDefCounts,
@@ -133,6 +137,8 @@ class OptSchedDDGWrapperBasic : public DataDepGraph {
   // Find liveness info generated by the region boundary.
   void discoverBoundaryLiveness(const llvm::MachineInstr *MI);

+  int clusterNeighboringMemOps(ArrayRef<const SUnit *> MemOps);
+
   // Holds a register live range, mapping a producer to a set of consumers.
   struct LiveRange {
     // The node which defines the register tracked by this live range.
std::vector consumers; }; + + /// Count of the total clusters possible + int ClusterCount; + + // Copied from + // https://github.com/RadeonOpenCompute/llvm/blob/roc-ocl-2.4.0/lib/CodeGen/MachineScheduler.cpp#L1467 + struct MemOpInfo { + const SUnit *SU; + MachineOperand *BaseOp; + int64_t Offset; + + MemOpInfo(const SUnit *su, MachineOperand *Op, int64_t ofs) + : SU(su), BaseOp(Op), Offset(ofs) {} + + bool operator<(const MemOpInfo &RHS) const { + if (BaseOp->getType() != RHS.BaseOp->getType()) + return BaseOp->getType() < RHS.BaseOp->getType(); + + if (BaseOp->isReg()) + return std::make_tuple(BaseOp->getReg(), Offset, SU->NodeNum) < + std::make_tuple(RHS.BaseOp->getReg(), RHS.Offset, + RHS.SU->NodeNum); + if (BaseOp->isFI()) { + const MachineFunction &MF = + *BaseOp->getParent()->getParent()->getParent(); + const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering(); + bool StackGrowsDown = TFI.getStackGrowthDirection() == + TargetFrameLowering::StackGrowsDown; + // Can't use tuple comparison here since we might need to use a + // different order when the stack grows down. + if (BaseOp->getIndex() != RHS.BaseOp->getIndex()) + return StackGrowsDown ? BaseOp->getIndex() > RHS.BaseOp->getIndex() + : BaseOp->getIndex() < RHS.BaseOp->getIndex(); + + if (Offset != RHS.Offset) + return StackGrowsDown ? Offset > RHS.Offset : Offset < RHS.Offset; + + return SU->NodeNum < RHS.SU->NodeNum; + } + + llvm_unreachable("MemOpClusterMutation only supports register or frame " + "index bases."); + } + }; }; // Exclude certain registers from being visible to the scheduler. Use LLVM's -// register pressure tracker to find the MAX register pressure for each register -// type (pressure set). If the MAX pressure is below a certain threshold don't -// track that register. +// register pressure tracker to find the MAX register pressure for each +// register type (pressure set). If the MAX pressure is below a certain +// threshold don't track that register. 
 class LLVMRegTypeFilter {
 private:
   const MachineModel *MM;
diff --git a/lib/Wrapper/OptimizingScheduler.cpp b/lib/Wrapper/OptimizingScheduler.cpp
index 5d0416c5..a3ad4e1c 100644
--- a/lib/Wrapper/OptimizingScheduler.cpp
+++ b/lib/Wrapper/OptimizingScheduler.cpp
@@ -16,10 +16,15 @@
 #include "opt-sched/Scheduler/register.h"
 #include "opt-sched/Scheduler/sched_region.h"
 #include "opt-sched/Scheduler/utilities.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/MachineScheduler.h"
+/*#include "llvm/CodeGen/OptSequential.h"*/
+#include "AMDGPU/OptSchedGCNTarget.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
@@ -39,20 +44,21 @@

 using namespace llvm::opt_sched;

+llvm::SmallVector UniqueRegionNames;
+llvm::DenseMap RegionCounter;
+
 // hack to print spills
 bool OPTSCHED_gPrintSpills;

 // An array of possible OptSched heuristic names
 constexpr struct {
-  const char* Name;
+  const char *Name;
   LISTSCHED_HEURISTIC HID;
-} HeuristicNames[] = {
-    {"CP", LSH_CP},   {"LUC", LSH_LUC},
-    {"UC", LSH_UC},   {"NID", LSH_NID},
-    {"CPR", LSH_CPR}, {"ISO", LSH_ISO},
-    {"SC", LSH_SC},   {"LS", LSH_LS},
-    {"LLVM", LSH_LLVM}
-};
+} HeuristicNames[] = {{"CP", LSH_CP},     {"LUC", LSH_LUC},
+                      {"UC", LSH_UC},     {"NID", LSH_NID},
+                      {"CPR", LSH_CPR},   {"ISO", LSH_ISO},
+                      {"SC", LSH_SC},     {"LS", LSH_LS},
+                      {"LLVM", LSH_LLVM}, {"CLUSTER", LSH_CLUSTER}};

 // Default path to the configuration directory for opt-sched.
 static constexpr const char *DEFAULT_CFG_DIR = "~/.optsched-cfg/";
@@ -128,7 +134,8 @@
 nextIfDebug(MachineBasicBlock::iterator I,
   return I;
 }

-static bool scheduleSpecificRegion(const StringRef RegionName, const Config &SchedIni) {
+static bool scheduleSpecificRegion(const StringRef RegionName,
+                                   const Config &SchedIni) {
   const bool ScheduleSpecificRegions =
       SchedIni.GetBool("SCHEDULE_SPECIFIC_REGIONS");
@@ -259,10 +266,8 @@ void ScheduleDAGOptSched::schedule() {
   ShouldTrackLaneMasks = true;
   Config &schedIni = SchedulerOptions::getInstance();

-  ++RegionNumber;
   const std::string RegionName = C->MF->getFunction().getName().data() +
-                                 std::string(":") +
-                                 std::to_string(RegionNumber);
+                                 std::string(":") + std::to_string(RegionIdx);

   // If two pass scheduling is enabled then
   // first just record the scheduling region.
@@ -375,10 +380,59 @@ void ScheduleDAGOptSched::schedule() {
   // Build LLVM DAG
   SetupLLVMDag();
   OST->initRegion(this, MM.get());
+  /*if (IsSecondPass && !IsThirdPass && !IsFourthPass) {
+    auto GCNOST = static_cast<OptSchedGCNTarget *>(OST.get());
+    GCNOST->setTargetOcc(5);
+  }*/
+
   // Convert graph
   auto DDG =
       OST->createDDGWrapper(C, this, MM.get(), LatencyPrecision, RegionName);
-  DDG->convertSUnits();
+
+  // In the second pass, ignore artificial edges before running the sequential
+  // heuristic list scheduler.
+  if (IsSecondPass)
+    DDG->convertSUnits(/*IgnoreRealEdges=*/false,
+                       /*IgnoreArtificialEdges=*/true);
+  else
+    DDG->convertSUnits(/*IgnoreRealEdges=*/false,
+                       /*IgnoreArtificialEdges=*/false);
+
+  // Find all clusterable instructions for the second pass.
+  if (IsSecondPass || (!TwoPassEnabled && schedIni.GetBool("PRINT_CLUSTER"))) {
+    dbgs() << "Finding load clusters.\n";
+    int TotalLoadsInstructionsClusterable = DDG->findPossibleClusters(true);
+    if (TotalLoadsInstructionsClusterable == 0)
+      dbgs() << "  No load clustering possible\n";
+
+    dbgs() << "Finding store clusters.\n";
+    int TotalStoreInstructionsClusterable = DDG->findPossibleClusters(false);
+    if (TotalStoreInstructionsClusterable == 0)
+      dbgs() << "  No store clustering possible\n";
+
+    Logger::Info("Total clusterable instructions: %d loads, %d stores",
+                 TotalLoadsInstructionsClusterable,
+                 TotalStoreInstructionsClusterable);
+
+    // Get the DDG instance so that we can set and get information that will
+    // be read later on during enumeration.
+    auto DataDepGraphInstance =
+        static_cast<OptSchedDDGWrapperBasic *>(DDG.get());
+    // Store the total instructions in all clusters in the DDG instance.
+    DataDepGraphInstance->setTotalInstructionsInAllClusters(
+        TotalLoadsInstructionsClusterable + TotalStoreInstructionsClusterable);
+    int NumClusters = DataDepGraphInstance->getMinClusterCount();
+
+    // Iterate through all of the cluster blocks and print the total
+    // instructions in each block.
+    if (NumClusters > 0) {
+      Logger::Info("Total clusters in region: %d", NumClusters);
+      for (int Cluster = 1; Cluster <= NumClusters; Cluster++) {
+        Logger::Info(
+            "  Cluster %d has total instructions %d", Cluster,
+            DataDepGraphInstance->getTotalInstructionsInCluster(Cluster));
+      }
+    }
+  }
+
   DDG->convertRegFiles();

   auto *BDDG = static_cast<OptSchedDDGWrapperBasic *>(DDG.get());
@@ -409,30 +463,60 @@ void ScheduleDAGOptSched::schedule() {
   }

   // Used for two-pass-optsched to alter upper bound value.
-  if (SecondPass)
+  if (IsSecondPass)
     region->InitSecondPass();

   // Setup time before scheduling
   Utilities::startTime = std::chrono::high_resolution_clock::now();

   // Schedule region.
-  Rslt = region->FindOptimalSchedule(CurrentRegionTimeout, CurrentLengthTimeout,
-                                     IsEasy, NormBestCost, BestSchedLngth,
-                                     NormHurstcCost, HurstcSchedLngth, Sched,
-                                     FilterByPerp, blocksToKeep(schedIni));
-
-  if ((!(Rslt == RES_SUCCESS || Rslt == RES_TIMEOUT) || Sched == NULL)) {
-    LLVM_DEBUG(
-        Logger::Info("OptSched run failed: rslt=%d, sched=%p. Falling back.",
-                     Rslt, (void *)Sched));
-    // Scheduling with opt-sched failed.
-    // fallbackScheduler();
-    return;
+  if (!IsFourthPass) {
+    Rslt = region->FindOptimalSchedule(
+        CurrentRegionTimeout, CurrentLengthTimeout, IsEasy, NormBestCost,
+        BestSchedLngth, NormHurstcCost, HurstcSchedLngth, Sched, FilterByPerp,
+        blocksToKeep(schedIni));
+
+    if (!(Rslt == RES_SUCCESS || Rslt == RES_TIMEOUT) || Sched == NULL) {
+      LLVM_DEBUG(
+          Logger::Info("OptSched run failed: rslt=%d, sched=%p. Falling back.",
+                       Rslt, (void *)Sched));
+      // Scheduling with opt-sched failed.
+      // fallbackScheduler();
+      return;
+    }
+  } else {
+    dbgs() << "Processing DAG " << RegionName << '\n';
+    dbgs() << "Restoring schedule from second ILP pass:\n";
+    Sched = LowerOccScheds[RegionIdx];
+    dbgs() << "Applying lower occupancy schedule\n";
   }

+  // The B&B enumerator did not find a schedule; add the region to the list of
+  // regions to be rescheduled.
+  if (IsSecondPass && !region->enumFoundSchedule() && !IsEasy &&
+      !IsThirdPass && !IsFourthPass)
+    RescheduleRegions[RegionIdx] = true;
+
   LLVM_DEBUG(Logger::Info("OptSched succeeded."));
+
   OST->finalizeRegion(Sched);
-  if (!OST->shouldKeepSchedule())
+
+  if (!IsThirdPass && !IsFourthPass && (IsFirstPass || IsSecondPass) &&
+      !OST->shouldKeepSchedule()) {
+    if (IsSecondPass) {
+      // We are not keeping the schedule, so the result of the sequential
+      // heuristic scheduler is the final result for the second pass.
+      ILPAnalysis[RegionIdx].first = HurstcSchedLngth;
+    }
+    return;
+  }
+
+  if (IsSecondPass && !IsThirdPass && !IsFourthPass)
+    ILPAnalysis[RegionIdx].first = BestSchedLngth;
+  else if (IsThirdPass) {
+    ILPAnalysis[RegionIdx].second = BestSchedLngth;
+    LowerOccScheds[RegionIdx] = Sched;
     return;
+  }

   // Count simulated spills.
   if (isSimRegAllocEnabled()) {
@@ -529,7 +613,10 @@ void ScheduleDAGOptSched::loadOptSchedConfig() {
   OptSchedEnabled = isOptSchedEnabled();
   TwoPassEnabled = isTwoPassEnabled();
   TwoPassSchedulingStarted = false;
-  SecondPass = false;
+  IsFirstPass = false;
+  IsSecondPass = false;
+  IsThirdPass = false;
+  IsFourthPass = false;
   LatencyPrecision = fetchLatencyPrecision();

   TreatOrderAsDataDeps = schedIni.GetBool("TREAT_ORDER_DEPS_AS_DATA_DEPS");
@@ -665,6 +752,7 @@ SchedPriorities ScheduleDAGOptSched::parseHeuristic(const std::string &Str) {
     Priorities.vctr[Priorities.cnt++] = LSH;
     switch (LSH) {
     // LUC and CLUSTER are the dynamic heuristics.
+    case LSH_CLUSTER:
     case LSH_LUC:
       Priorities.isDynmc = true;
       break;
@@ -743,13 +831,14 @@ bool ScheduleDAGOptSched::rpMismatch(InstSchedule *sched) {
 void ScheduleDAGOptSched::finalizeSchedule() {
   if (TwoPassEnabled && OptSchedEnabled) {
     initSchedulers();
+    RescheduleRegions.resize(Regions.size());

     LLVM_DEBUG(dbgs() << "Starting two pass scheduling approach\n");
     TwoPassSchedulingStarted = true;
     for (const SchedPassStrategy &S : SchedPasses) {
       MachineBasicBlock *MBB = nullptr;
       // Reset
-      RegionNumber = ~0u;
+      RegionIdx = 0;

       for (auto &Region : Regions) {
         RegionBegin = Region.first;
@@ -791,14 +880,17 @@ void ScheduleDAGOptSched::runSchedPass(SchedPassStrategy S) {
   switch (S) {
   case OptSchedMinRP:
     scheduleOptSchedMinRP();
+    Logger::Info("End of first pass through");
     break;
   case OptSchedBalanced:
     scheduleOptSchedBalanced();
+    Logger::Info("End of second pass through");
     break;
   }
 }

 void ScheduleDAGOptSched::scheduleOptSchedMinRP() {
+  IsFirstPass = true;
   LatencyPrecision = LTP_UNITY;
   // Set times for the first pass
   RegionTimeout = FirstPassRegionTimeout;
@@ -806,11 +898,11 @@ void ScheduleDAGOptSched::scheduleOptSchedMinRP() {
   HeurSchedType = SCHED_LIST;

   schedule();
-  Logger::Info("End of first pass through\n");
+  IsFirstPass = false;
 }

 void ScheduleDAGOptSched::scheduleOptSchedBalanced() {
-  SecondPass = true;
+  IsSecondPass = true;
   LatencyPrecision = LTP_ROUGH;

   // Set times for the second pass
@@ -837,7 +929,7 @@ void ScheduleDAGOptSched::scheduleOptSchedBalanced() {
   MultiPassStaticNodeSup = false;

   schedule();
-  Logger::Info("End of second pass through");
+  IsSecondPass = false;
 }

 bool ScheduleDAGOptSched::isSimRegAllocEnabled() const {
diff --git a/lib/Wrapper/OptimizingScheduler.h b/lib/Wrapper/OptimizingScheduler.h
index 13b92e7d..72191801 100644
--- a/lib/Wrapper/OptimizingScheduler.h
+++ b/lib/Wrapper/OptimizingScheduler.h
@@ -14,6 +14,7 @@
 #include "opt-sched/Scheduler/data_dep.h"
 #include "opt-sched/Scheduler/graph_trans.h"
 #include "opt-sched/Scheduler/sched_region.h"
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallString.h"
"llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/Support/Debug.h" @@ -37,12 +38,21 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { SmallVector SchedPasses; protected: - // Vector of regions recorded for later rescheduling SmallVector< std::pair, 32> Regions; + /// Contains the results of the first ILP pass and second analysis ILP pass. + /// Used to calculate if we should keep the lower target occupancy schedules + /// in the second ILP pass. First element is the first ILP pass and second + /// element is the second analysis ILP pass. + SmallVector, 32> ILPAnalysis; + /// TODO: Same as above for cost analysis. + SmallVector, 32> CostAnalysis; + /// Store the lower occupancy schedules from the second ILP pass. + SmallVector LowerOccScheds; + // Path to opt-sched config options directory. SmallString<128> PathCfg; @@ -55,17 +65,27 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // Path to the machine model specification file for opt-sched. SmallString<128> PathCfgMM; + bool IsFirstPass; + // Bool value indicating that the scheduler is in the second // pass. Used for the two pass scheduling approach. - bool SecondPass; + bool IsSecondPass; + + bool IsThirdPass; + + bool IsFourthPass; // Region number uniquely identifies DAGs. - unsigned RegionNumber = ~0u; + size_t RegionIdx; + + // Records if a region is not yet scheduled, or schedule has been reverted, + // or we generally desire to reschedule it. + llvm::BitVector RescheduleRegions; MachineSchedContext *C; // The OptSched target machine. - std::unique_ptr OST; + std::shared_ptr OST; // into the OptSched machine model std::unique_ptr MM; @@ -158,7 +178,10 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { // The heuristic used for the enumerator. SchedPriorities EnumPriorities; - // The heuristic used for the second pass enumerator in the two-pass scheduling approach. + SchedPriorities SecondPassPriorities; + + // The heuristic used for the second pass enumerator in the two-pass + // scheduling approach. SchedPriorities SecondPassEnumPriorities; // Static node superiority RP only graph transformation. @@ -249,7 +272,7 @@ class ScheduleDAGOptSched : public ScheduleDAGMILive { void dumpLLVMRegisters() const; // Getter for region number - int getRegionNum() const { return RegionNumber; } + int getRegionNum() const { return RegionIdx; } // Return the boundary instruction for this region if it is not a sentinel // value.