From 9cc25bd7511983f3fab3f1ef46277e0d80a8c4db Mon Sep 17 00:00:00 2001
From: nychiang <sorakid507@gmail.com>
Date: Thu, 23 Feb 2023 22:42:38 -0800
Subject: [PATCH 01/10] add sparse raja example

---
 src/Drivers/Sparse/NlpSparseRajaEx2.cpp | 476 ++++++++++++++++++++++++
 src/Drivers/Sparse/NlpSparseRajaEx2.hpp | 309 +++++++++++++++
 2 files changed, 785 insertions(+)
 create mode 100644 src/Drivers/Sparse/NlpSparseRajaEx2.cpp
 create mode 100644 src/Drivers/Sparse/NlpSparseRajaEx2.hpp

diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
new file mode 100644
index 000000000..d1b9c2552
--- /dev/null
+++ b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
@@ -0,0 +1,476 @@
+// Copyright (c) 2017, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory (LLNL).
+// LLNL-CODE-742473. All rights reserved.
+//
+// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. HiOp
+// is released under the BSD 3-clause license (https://opensource.org/licenses/BSD-3-Clause).
+// Please also read "Additional BSD Notice" below.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+// i. Redistributions of source code must retain the above copyright notice, this list
+// of conditions and the disclaimer below.
+// ii. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the disclaimer (as noted below) in the documentation and/or
+// other materials provided with the distribution.
+// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may be used to
+// endorse or promote products derived from this software without specific prior written
+// permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+// SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Additional BSD Notice
+// 1. This notice is required to be provided under our contract with the U.S. Department
+// of Energy (DOE). This work was produced at Lawrence Livermore National Laboratory under
+// Contract No. DE-AC52-07NA27344 with the DOE.
+// 2. Neither the United States Government nor Lawrence Livermore National Security, LLC
+// nor any of their employees, makes any warranty, express or implied, or assumes any
+// liability or responsibility for the accuracy, completeness, or usefulness of any
+// information, apparatus, product, or process disclosed, or represents that its use would
+// not infringe privately-owned rights.
+// 3. Also, reference herein to any specific commercial products, process, or services by
+// trade name, trademark, manufacturer or otherwise does not necessarily constitute or
+// imply its endorsement, recommendation, or favoring by the United States Government or
+// Lawrence Livermore National Security, LLC. The views and opinions of authors expressed
+// herein do not necessarily state or reflect those of the United States Government or
+// Lawrence Livermore National Security, LLC, and shall not be used for advertising or
+// product endorsement purposes.
+
+/**
+ * @file NlpSparseRajaEx2.cpp
+ * 
+ * @author Nai-Yuan Chiang <chiang7@llnl.gov>, LLNL
+ *
+ */
+
+#include "SparseRajaEx2.hpp"
+
+#include <umpire/Allocator.hpp>
+#include <umpire/ResourceManager.hpp>
+#include <RAJA/RAJA.hpp>
+
+#include <hiopMatrixRajaSparseTriplet.hpp>
+
+#include <cmath>
+#include <cstring> //for memcpy
+#include <cstdio>
+#include <algorithm> //for std::transform
+
+
+//TODO: It would be a good idea not to use the internal HiOp RAJA policies here and, instead,
+// give self-contained definitions of the policies so that the user gets a better grasp of the
+// concept and does not rely on the internals of HiOp. For example:
+// #define RAJA_LAMBDA [=] __device__
+// using ex2_raja_exec = RAJA::cuda_exec<128>;
+// more defs here
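+// A fuller self-contained sketch for a CUDA build (illustrative only; this example still
+// relies on HiOp's policy headers below) would also define a reduce policy:
+// using ex2_raja_reduce = RAJA::cuda_reduce;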
+
+
+#if defined(HIOP_USE_CUDA)
+#include "ExecPoliciesRajaCudaImpl.hpp"
+using ex2_raja_exec = hiop::ExecRajaPoliciesBackend<hiop::ExecPolicyRajaCuda>::hiop_raja_exec;
+using ex2_raja_reduce = hiop::ExecRajaPoliciesBackend<hiop::ExecPolicyRajaCuda>::hiop_raja_reduce;
+using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet<hiop::MemBackendUmpire, hiop::ExecPolicyRajaCuda>;
+#elif defined(HIOP_USE_HIP)
+#include <ExecPoliciesRajaHipImpl.hpp>
+using ex2_raja_exec = hiop::ExecRajaPoliciesBackend<hiop::ExecPolicyRajaHip>::hiop_raja_exec;
+using ex2_raja_reduce = hiop::ExecRajaPoliciesBackend<hiop::ExecPolicyRajaHip>::hiop_raja_reduce;
+using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet<hiop::MemBackendUmpire, hiop::ExecPolicyRajaHip>;
+#else
+//#if !defined(HIOP_USE_CUDA) && !defined(HIOP_USE_HIP)
+#include <ExecPoliciesRajaOmpImpl.hpp>
+using ex2_raja_exec = hiop::ExecRajaPoliciesBackend<hiop::ExecPolicyRajaOmp>::hiop_raja_exec;
+using ex2_raja_reduce = hiop::ExecRajaPoliciesBackend<hiop::ExecPolicyRajaOmp>::hiop_raja_reduce;
+using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet<hiop::MemBackendUmpire, hiop::ExecPolicyRajaOmp>;
+#endif
+
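+// note: RAJA::forall<policy>(RAJA::RangeSegment(b, e), ...) iterates the half-open index
+// range [b, e); the loops below use such segments over the variable and constraint indices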
+
+/** Nonlinear *highly nonconvex* and *rank-deficient* test problem for the Filter IPM
+ * Newton solver of HiOp. It uses a sparse NLP formulation. The problem is based on SparseEx2.
+ *
+ *  min   (2*convex_obj-1) * scal * sum_{i=1,...,n} 0.25 * (x_{i}-1)^4  +  0.5*x^T*x
+ *  s.t.
+ *            4*x_1 + 2*x_2                     == 10
+ *        5<= 2*x_1         + x_3
+ *        1<= 2*x_1                 + 0.5*x_i   <= 2*n, for i=4,...,n
+ *        x_1 free
+ *        0.0 <= x_2
+ *        1.0 <= x_3 <= 10
+ *        x_i >=0.5, i=4,...,n
+ *
+ * Optionally, one can add the following constraints to obtain a rank-deficient Jacobian
+ *
+ *  s.t.  [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]                  (rnkdef-con1)
+ *        4*x_1 + 2*x_2 == 10                                (rnkdef-con2)
+ *
+ *  other parameters are:
+ *  convex_obj: set to 1 to have a convex problem, otherwise set it to 0.
+ *  scal_neg_obj: scaling factor for the quartic term in the objective (1.0 by default).
+ *
+ */
+SparseRajaEx2::SparseRajaEx2(std::string mem_space,
+                             int n,
+                             bool convex_obj,
+                             bool rankdefic_Jac_eq,
+                             bool rankdefic_Jac_ineq,
+                             double scal_neg_obj)
+  : n_vars_{n},
+    n_cons_{2},
+    convex_obj_{convex_obj},
+    rankdefic_eq_{rankdefic_Jac_eq},
+    rankdefic_ineq_{rankdefic_Jac_ineq},
+    scal_neg_obj_{scal_neg_obj},
+    mem_space_{mem_space}
+{
+  // Make sure mem_space_ is uppercase
+  std::transform(mem_space_.begin(), mem_space_.end(), mem_space_.begin(), ::toupper);
+
+  assert(n>=3 && "number of variables should be greater than or equal to 3 for this example");
+  if(n>3) {
+    n_cons_ += n-3;
+  }
+  n_cons_ += rankdefic_eq_ + rankdefic_ineq_;
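+  // e.g., n=5 with both rank-deficient constraints enabled gives n_cons_ = 2 + (5-3) + 1 + 1 = 6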
+}
+
+SparseRajaEx2::~SparseRajaEx2()
+{
+}
+
+bool SparseRajaEx2::get_prob_sizes(size_type& n, size_type& m)
+{
+  n = n_vars_;
+  m = n_cons_;
+  return true;
+}
+
+bool SparseRajaEx2::get_vars_info(const size_type& n, double *xlow, double* xupp, NonlinearityType* type)
+{
+  assert(n==n_vars_);
+
+  RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(0, 3),
+    RAJA_LAMBDA(RAJA::Index_type i)
+    {
+      if(i==0) {
+        xlow[i] = -1e20;
+        xupp[i] =  1e20;
+      } else if(i==1) {
+        xlow[i] = 0.0;
+        xupp[i] = 1e20;
+      } else { 
+        xlow[i] = 1.0;
+        xupp[i] = 10.0;
+      }
+    });
+
+  if(n>3) {
+    RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(3, n),
+      RAJA_LAMBDA(RAJA::Index_type i)
+      {
+        xlow[i] = 0.5;
+        xupp[i] = 1e20;
+      });
+  }
+
+  RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(0, n),
+    RAJA_LAMBDA(RAJA::Index_type i)
+    {
+      type[i] = hiopNonlinear;
+    });
+  return true;
+}
+
+bool SparseRajaEx2::get_cons_info(const size_type& m, double* clow, double* cupp, NonlinearityType* type)
+{
+  assert(m==n_cons_);
+
+  RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(0, 2),
+    RAJA_LAMBDA(RAJA::Index_type i)
+    {
+      if(i==0) {
+        clow[i] = 10.0;
+        cupp[i] = 10.0;
+      } else {
+        clow[i] = 5.0;
+        cupp[i] = 1e20;
+      }
+    });
+
+  if(n>3) {
+    RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(3, m),
+      RAJA_LAMBDA(RAJA::Index_type conidx)
+      {
+        clow[conidx] = 1.0;
+        cupp[conidx] = 2*n;
+      });
+  }
+
+  if(rankdefic_ineq_) {
+    RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(m, m+1),
+      RAJA_LAMBDA(RAJA::Index_type conidx)
+      {
+        // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
+        clow[conidx] = -1e+20;
+        cupp[conidx] = 19.;
+      });
+  }
+
+  if(rankdefic_eq_) {
+    RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(m+rankdefic_ineq_, m+rankdefic_ineq_+1),
+      RAJA_LAMBDA(RAJA::Index_type conidx)
+      {
+        //  4*x_1 + 2*x_2 == 10
+        clow[conidx] = 10;
+        cupp[conidx] = 10;
+      });
+  }
+
+  RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(0, m),
+    RAJA_LAMBDA(RAJA::Index_type i)
+    {
+      type[i] = hiopInterfaceBase::hiopNonlinear;
+    });
+
+  return true;
+}
+
+bool SparseRajaEx2::get_sparse_blocks_info(index_type& nx,
+                                           index_type& nnz_sparse_Jaceq,
+                                           index_type& nnz_sparse_Jacineq,
+                                           index_type& nnz_sparse_Hess_Lagr)
+{
+  nx = n_vars_;
+  nnz_sparse_Jaceq = 2 + 2*rankdefic_eq_;
+  nnz_sparse_Jacineq = 2 + 2*(n_vars_-3) + 2*rankdefic_ineq_;
+  nnz_sparse_Hess_Lagr = n_vars_;
+  return true;
+}
+
+bool SparseRajaEx2::eval_f(const size_type& n, const double* x, bool new_x, double& obj_value)
+{
+  assert(n==n_vars_);
+  obj_value=0.;
+  {
+    int convex_obj = (int) convex_obj_;
+    double scal_neg_obj = scal_neg_obj_;
+
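+    // per-term value: (2*convex_obj-1)*scal_neg_obj*0.25*(x_i-1)^4 + 0.5*x_i^2, accumulated
+    // with a RAJA reduction so the sum is computed safely on the device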
+    RAJA::ReduceSum<ex2_raja_reduce, double> aux(0);
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
+      RAJA_LAMBDA(RAJA::Index_type i)
+      {
+        aux += (2*convex_obj-1) * scal_neg_obj * 0.25 * std::pow(x[i]-1., 4) + 0.5 * std::pow(x[i], 2);
+      });
+    obj_value += aux.get();
+  }
+  return true;
+}
+
+bool SparseRajaEx2::eval_grad_f(const size_type& n, const double* x, bool new_x, double* gradf)
+{
+  assert(n==n_vars_);
+  {
+    int convex_obj = (int) convex_obj_;
+    double scal_neg_obj = scal_neg_obj_;
+
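+    // gradient of the objective, term by term: (2*convex_obj-1)*scal_neg_obj*(x_i-1)^3 + x_i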
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
+      RAJA_LAMBDA(RAJA::Index_type i)
+      {
+        gradf[i] = (2*convex_obj-1) * scal_neg_obj * std::pow(x[i]-1.,3) + x[i];
+      });
+  }
+  return true;
+}
+
+bool SparseRajaEx2::eval_cons(const size_type& n, const size_type& m, const double* x, bool new_x, double* cons)
+{
+  assert(n==n_vars_); assert(m==n_cons_);
+  assert(n_cons_==2+n-3+rankdefic_eq_+rankdefic_ineq_);
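+  // row layout: rows 0-1 hold the two base constraints, rows 2..2+(n-3)-1 the range
+  // constraints, followed by the optional rank-deficient inequality and equality rows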
+
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 2),
+    RAJA_LAMBDA(RAJA::Index_type i)
+    {
+      if(i==0) {
+        // --- constraint 1 body --->  4*x_1 + 2*x_2 == 10
+        cons[i] = 4*x[0] + 2*x[1];
+      } else if(i==1) {
+        // --- constraint 2 body ---> 2*x_1 + x_3
+        cons[i] = 2*x[0] + 1*x[2];
+      }
+    });
+
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2, n-1),
+    RAJA_LAMBDA(RAJA::Index_type i)
+    {
+      // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4
+      cons[i] = 2*x[0] + 0.5*x[i+1];
+    });
+
+
+  if(rankdefic_ineq_) {
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(n-1, n),
+      RAJA_LAMBDA(RAJA::Index_type i)
+      {
+        // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
+        cons[i] = 4*x[0] + 2*x[2];
+      });
+  }
+
+  if(rankdefic_eq_) {
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(n-1+rankdefic_ineq_, n+rankdefic_ineq_),
+      RAJA_LAMBDA(RAJA::Index_type i)
+      {
+        //  4*x_1 + 2*x_2 == 10
+        cons[i] = 4*x[0] + 2*x[1];
+      });
+  }
+  assert(conidx==m);
+
+  return true;
+}
+
+bool SparseRajaEx2::eval_Jac_cons(const size_type& n,
+                                  const size_type& m,
+                                  const double* x,
+                                  bool new_x,
+                                  const int& nnzJacS,
+                                  index_type* iJacS,
+                                  index_type* jJacS,
+                                  double* MJacS)
+{
+  assert(n==n_vars_); assert(m==n_cons_);
+  assert(n>=3);
+
+  assert(nnzJacS == 4 + 2*(n-3) + 2*rankdefic_eq_ + 2*rankdefic_ineq_);
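+  // every constraint row has exactly two nonzeros, hence nnzJacS == 2*m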
+
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 2),
+    RAJA_LAMBDA(RAJA::Index_type itrow)
+    {
+      if(itrow==0) {
+        // --- constraint 1 body --->  4*x_1 + 2*x_2 == 10
+        iJacS[2*itrow] = itrow;
+        jJacS[2*itrow] = 0;
+        iJacS[2*itrow+1] = itrow;
+        jJacS[2*itrow+1] = 1;
+        if(MJacS != nullptr) {
+          MJacS[2*itrow] = 4.0;
+          MJacS[2*itrow+1] = 2.0;
+        }
+      } else if(itrow==1) {
+        // --- constraint 2 body ---> 2*x_1 + x_3
+        iJacS[2*itrow] = itrow;
+        jJacS[2*itrow] = 0;
+        iJacS[2*itrow+1] = itrow;
+        jJacS[2*itrow+1] = 2;
+        if(MJacS != nullptr) {
+          MJacS[2*itrow] = 2.0;
+          MJacS[2*itrow+1] = 1.0;
+        }
+      }
+    });
+
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2, n-1),
+    RAJA_LAMBDA(RAJA::Index_type itrow)
+    {
+      // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4
+      iJacS[2*itrow] = itrow;
+      jJacS[2*itrow] = 0;
+      iJacS[2*itrow+1] = itrow;
+      jJacS[2*itrow+1] = itrow+1;
+      if(MJacS != nullptr) {
+        MJacS[2*itrow] = 2.0;
+        MJacS[2*itrow+1] = 0.5;
+      }
+    });
+
+
+  if(rankdefic_ineq_) {
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(n-1, n),
+      RAJA_LAMBDA(RAJA::Index_type itrow)
+      {
+        // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
+        iJacS[2*itrow] = itrow;
+        jJacS[2*itrow] = 0;
+        iJacS[2*itrow+1] = itrow;
+        jJacS[2*itrow+1] = 2;
+        if(MJacS != nullptr) {
+          MJacS[2*itrow] = 4.0;
+          MJacS[2*itrow+1] = 2.0;
+        }
+      });
+  }
+
+  if(rankdefic_eq_) {
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(n-1+rankdefic_ineq_, n+rankdefic_ineq_),
+      RAJA_LAMBDA(RAJA::Index_type i)
+      {
+        //  4*x_1 + 2*x_2 == 10
+        iJacS[2*itrow] = itrow;
+        jJacS[2*itrow] = 0;
+        iJacS[2*itrow+1] = itrow;
+        jJacS[2*itrow+1] = 1;
+        if(MJacS != nullptr) {
+          MJacS[2*itrow] = 4.0;
+          MJacS[2*itrow+1] = 2.0;
+        }
+      });
+  }
+
+  return true;
+}
+
+bool SparseRajaEx2::eval_Hess_Lagr(const size_type& n,
+                                   const size_type& m,
+                                   const double* x,
+                                   bool new_x,
+                                   const double& obj_factor,
+                                   const double* lambda,
+                                   bool new_lambda,
+                                   const size_type& nnzHSS,
+                                   index_type* iHSS,
+                                   index_type* jHSS,
+                                   double* MHSS)
+{
+    //Note: lambda is not used since all the constraints are linear and, therefore, do
+    //not contribute to the Hessian of the Lagrangian
+    assert(nnzHSS == n);
+
+    if(iHSS!=NULL && jHSS!=NULL) {
+      RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
+        RAJA_LAMBDA(RAJA::Index_type i)
+        {
+          iHSS[i] = i;
+          jHSS[i] = i;
+        });
+    }
+
+    int convex_obj = (int) convex_obj_;
+    double scal_neg_obj = scal_neg_obj_;
+    if(MHSS!=NULL) {
+      RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
+        RAJA_LAMBDA(RAJA::Index_type i)
+        {
+          MHSS[i] = obj_factor * ( (2*convex_obj-1) * scal_neg_obj * 3 * std::pow(x[i]-1., 2) + 1);
+        });
+    }
+    return true;
+}
+
+bool SparseRajaEx2::get_starting_point(const size_type& n, double* x0)
+{
+  assert(n==n_vars_);
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
+    RAJA_LAMBDA(RAJA::Index_type i)
+    {
+      x0[i] = 0.0;
+    });
+  return true;
+}
diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2.hpp b/src/Drivers/Sparse/NlpSparseRajaEx2.hpp
new file mode 100644
index 000000000..acec2cb13
--- /dev/null
+++ b/src/Drivers/Sparse/NlpSparseRajaEx2.hpp
@@ -0,0 +1,309 @@
+// Copyright (c) 2017, Lawrence Livermore National Security, LLC.
+// Produced at the Lawrence Livermore National Laboratory (LLNL).
+// LLNL-CODE-742473. All rights reserved.
+//
+// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. HiOp
+// is released under the BSD 3-clause license (https://opensource.org/licenses/BSD-3-Clause).
+// Please also read "Additional BSD Notice" below.
+//
+// Redistribution and use in source and binary forms, with or without modification,
+// are permitted provided that the following conditions are met:
+// i. Redistributions of source code must retain the above copyright notice, this list
+// of conditions and the disclaimer below.
+// ii. Redistributions in binary form must reproduce the above copyright notice,
+// this list of conditions and the disclaimer (as noted below) in the documentation and/or
+// other materials provided with the distribution.
+// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may be used to
+// endorse or promote products derived from this software without specific prior written
+// permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
+// SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Additional BSD Notice
+// 1. This notice is required to be provided under our contract with the U.S. Department
+// of Energy (DOE). This work was produced at Lawrence Livermore National Laboratory under
+// Contract No. DE-AC52-07NA27344 with the DOE.
+// 2. Neither the United States Government nor Lawrence Livermore National Security, LLC
+// nor any of their employees, makes any warranty, express or implied, or assumes any
+// liability or responsibility for the accuracy, completeness, or usefulness of any
+// information, apparatus, product, or process disclosed, or represents that its use would
+// not infringe privately-owned rights.
+// 3. Also, reference herein to any specific commercial products, process, or services by
+// trade name, trademark, manufacturer or otherwise does not necessarily constitute or
+// imply its endorsement, recommendation, or favoring by the United States Government or
+// Lawrence Livermore National Security, LLC. The views and opinions of authors expressed
+// herein do not necessarily state or reflect those of the United States Government or
+// Lawrence Livermore National Security, LLC, and shall not be used for advertising or
+// product endorsement purposes.
+
+/**
+ * @file NlpSparseRajaEx2.hpp
+ * 
+ * @author Nai-Yuan Chiang <chiang7@llnl.gov>, LLNL
+ *
+ */
+
+#ifndef HIOP_EXAMPLE_SPARSE_RAJA_EX2
+#define HIOP_EXAMPLE_SPARSE_RAJA_EX2
+
+#include "hiopInterface.hpp"
+#include "LinAlgFactory.hpp"
+
+#ifdef HIOP_USE_MPI
+#include "mpi.h"
+#else
+#define MPI_COMM_WORLD 0
+#define MPI_Comm int
+#endif
+
+#include <cassert>
+#include <cstring> //for memcpy
+#include <cstdio>
+#include <cmath>
+
+using size_type = hiop::size_type;
+using index_type = hiop::index_type;
+
+/** Nonlinear *highly nonconvex* and *rank-deficient* test problem for the Filter IPM
+ * Newton solver of HiOp. It uses a sparse NLP formulation. The problem is based on SparseEx2.
+ *
+ *  min   (2*convex_obj-1) * scal * sum_{i=1,...,n} 0.25 * (x_{i}-1)^4  +  0.5*x^T*x
+ *  s.t.
+ *            4*x_1 + 2*x_2                     == 10
+ *        5<= 2*x_1         + x_3
+ *        1<= 2*x_1                 + 0.5*x_i   <= 2*n, for i=4,...,n
+ *        x_1 free
+ *        0.0 <= x_2
+ *        1.0 <= x_3 <= 10
+ *        x_i >=0.5, i=4,...,n
+ *
+ * Optionally, one can add the following constraints to obtain a rank-deficient Jacobian
+ *
+ *  s.t.  [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]                  (rnkdef-con1 --- ineq con)
+ *        4*x_1 + 2*x_2 == 10                                (rnkdef-con2 ---   eq con)
+ *
+ *  other parameters are:
+ *  convex_obj: set to 1 to have a convex problem, otherwise set it to 0.
+ *  scal_neg_obj: scaling factor for the quartic term in the objective (1.0 by default).
+ *
+ * @note All pointers marked as "managed by Umpire" are allocated by HiOp using the
+ * Umpire's API. They all are addresses in the same memory space; however, the memory 
+ * space can be host (typically CPU), device (typically GPU), or unified memory (um) 
+ * spaces as per Umpire specification. The selection of the memory space is done via 
+ * the option "mem_space" of HiOp. It is the responsibility of the implementers of 
+ * HiOp's interfaces (such as the hiop::hiopInterfaceSparse used in this example) to 
+ * work with the "managed by Umpire" pointers in the same memory space as the one 
+ * specified by the "mem_space" option.
+ * 
+ */
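+// A minimal sketch of how a "managed by Umpire" array is obtained (illustrative only; the
+// local names are hypothetical, and HiOp performs the actual allocations internally):
+//
+//   auto& rm = umpire::ResourceManager::getInstance();
+//   umpire::Allocator alloc = rm.getAllocator("DEVICE");
+//   double* xlow = static_cast<double*>(alloc.allocate(n * sizeof(double)));
+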
+class SparseRajaEx2 : public hiop::hiopInterfaceSparse
+{
+public:
+  SparseRajaEx2(std::string mem_space,
+                int n,
+                bool convex_obj,
+                bool rankdefic_Jac_eq,
+                bool rankdefic_Jac_ineq,
+                double scal_neg_obj = 1.0);
+  virtual ~SparseRajaEx2();
+
+  /**
+   * @brief Number of variables and constraints.
+   */
+  virtual bool get_prob_sizes(size_type& n, size_type& m);
+
+  /**
+   * @brief Get types and bounds on the variables. 
+   * 
+   * @param[in] n number of variables
+   * @param[out] xlow array with lower bounds (managed by Umpire)
+   * @param[out] xupp array with upper bounds (managed by Umpire)
+   * @param[out] type array with the variable types (on host)
+   */
+  virtual bool get_vars_info(const size_type& n, double *xlow, double* xupp, NonlinearityType* type);
+
+  /**
+   * Get types and bounds corresponding to constraints. An equality constraint is specified
+   * by setting the lower and upper bounds equal.
+   * 
+   * @param[in] m Number of constraints
+   * @param[out] clow array with lower bounds (managed by Umpire)
+   * @param[out] cupp array with upper bounds (managed by Umpire)
+   * @param[out] type array with the variable types (on host)
+   */
+  virtual bool get_cons_info(const size_type& m, double* clow, double* cupp, NonlinearityType* type);
+
+  /**
+   *  Returns the sizes and number of nonzeros of the sparse blocks
+   *
+   * @param[out] nx number of variables
+   * @param[out] nnz_sparse_Jace number of nonzeros in the Jacobian of the equalities w.r.t. 
+   *                             sparse variables 
+   * @param[out] nnz_sparse_Jaci number of nonzeros in the Jacobian of the inequalities w.r.t. 
+   *                             sparse variables 
+   * @param[out] nnz_sparse_Hess_Lagr number of nonzeros in the (sparse) Hessian
+   */
+  virtual bool get_sparse_blocks_info(index_type& nx,
+                                      index_type& nnz_sparse_Jace,
+                                      index_type& nnz_sparse_Jaci,
+                                      index_type& nnz_sparse_Hess_Lagr);
+
+  /**
+   * Evaluate objective. 
+   * 
+   * @param[in] n number of variables
+   * @param[in] x array with the optimization variables or point at which to evaluate 
+   *              (managed by Umpire)
+   * @param[in] new_x indicates whether any of the other eval functions have been 
+   * evaluated previously (false) or not (true) at x
+   * @param[out] obj_value the objective function value.
+   */
+  virtual bool eval_f(const size_type& n, const double* x, bool new_x, double& obj_value);
+
+  /**
+   * Evaluate a subset of the constraints specified by idx_cons
+   *
+   * @param[in] num_cons number of constraints to evaluate (size of idx_cons array)
+   * @param[in] idx_cons indexes of the constraints to evaluate (managed by Umpire)
+   * @param[in] x the point at which to evaluate (managed by Umpire) 
+   * @param[in] new_x indicates whether any of the other eval functions have been evaluated 
+   *            previously (false) or not (true) at x
+   * @param[out] cons array with values of the constraints (managed by Umpire, size num_cons) 
+   */
+  virtual bool eval_cons(const size_type& n,
+                         const size_type& m,
+                         const size_type& num_cons,
+                         const index_type* idx_cons,
+                         const double* x,
+                         bool new_x,
+                         double* cons)
+  {
+    //return false so that HiOp will rely on the constraint evaluator defined below
+    return false;
+  }
+
+  virtual bool eval_cons(const size_type& n,
+                         const size_type& m,
+                         const double* x,
+                         bool new_x,
+                         double* cons);
+
+  /**
+   * Evaluation of the gradient of the objective. 
+   *
+   * @param[in] n number of variables
+   * @param[in] x array with the optimization variables or point at which to evaluate
+   *              (managed by Umpire)
+   * @param[in] new_x indicates whether any of the other eval functions have been evaluated 
+   *                  previously (false) or not (true) at x
+   * @param[out] gradf array with the values of the gradient (managed by Umpire) 
+   */
+  virtual bool eval_grad_f(const size_type& n,
+                           const double* x,
+                           bool new_x,
+                           double* gradf);
+
+  /**
+   * Evaluates the Jacobian of the constraints. Please check the user manual and the 
+   * documentation of hiop::hiopInterfaceSparse for a detailed discussion of how the last 
+   * four arguments are expected to behave. 
+   *
+   * @param[in] n number of variables
+   * @param[in] m Number of constraints
+   * @param[in] num_cons number of constraints to evaluate (size of idx_cons array)
+   * @param[in] idx_cons indexes of the constraints to evaluate (managed by Umpire)
+   * @param[in] x the point at which to evaluate (managed by Umpire)
+   * @param[in] new_x indicates whether any of the other eval functions have been evaluated 
+   *                  previously (false) or not (true) at x
+   * @param[in] nnzJacS number of nonzeros in the sparse Jacobian
+   * @param[out] iJacS array of row indexes in the sparse Jacobian (managed by Umpire)
+   * @param[out] jJacS array of column indexes in the sparse Jacobian (managed by Umpire)
+   * @param[out] MJacS array of nonzero values in the sparse Jacobian (managed by Umpire)
+   */
+  virtual bool eval_Jac_cons(const size_type& n,
+                             const size_type& m,
+                             const size_type& num_cons, 
+                             const index_type* idx_cons,
+                             const double* x,
+                             bool new_x,
+                             const size_type& nnzJacS,
+                             index_type* iJacS,
+                             index_type* jJacS,
+                             double* MJacS)
+  {
+    //return false so that HiOp will rely on the Jacobian evaluator defined below
+    return false;
+  }
+
+  /// Similar to the above; this overload is the one implemented and used in this example.
+  virtual bool eval_Jac_cons(const size_type& n,
+                             const size_type& m,
+                             const double* x,
+                             bool new_x,
+                             const size_type& nnzJacS,
+                             index_type* iJacS,
+                             index_type* jJacS,
+                             double* MJacS);
+
+
+  /**
+   * Evaluate the Hessian of the Lagrangian function. Please consult the user manual for a 
+   * detailed discussion of the form the Lagrangian function takes.
+   * 
+   * @param[in] n number of variables
+   * @param[in] m Number of constraints
+   * @param[in] x the point at which to evaluate (managed by Umpire)
+   * @param[in] new_x indicates whether any of the other eval functions have been evaluated 
+   *                  previously (false) or not (true) at x
+   * @param[in] obj_factor scalar that multiplies the objective term in the Lagrangian function
+   * @param[in] lambda array with values of the multipliers used by the Lagrangian function
+   * @param[in] new_lambda indicates whether lambda  values changed since last call
+   * @param[in] nnzHSS number of nonzeros in the (sparse) Hessian w.r.t. sparse variables 
+   * @param[out] iHSS array of row indexes in the Hessian w.r.t. sparse variables
+   *                  (managed by Umpire)
+   * @param[out] jHSS array of column indexes in the Hessian w.r.t. sparse variables 
+   *                  (managed by Umpire)
+   * @param[out] MHSS array of nonzero values in the Hessian w.r.t. sparse variables
+   *                  (managed by Umpire)
+   */
+  virtual bool eval_Hess_Lagr(const size_type& n,
+                              const size_type& m,
+                              const double* x,
+                              bool new_x,
+                              const double& obj_factor,
+                              const double* lambda,
+                              bool new_lambda,
+                              const size_type& nnzHSS,
+                              index_type* iHSS,
+                              index_type* jHSS,
+                              double* MHSS);
+
+  /**
+   * Implementation of the primal starting point specification
+   * 
+   * @param[in] n number of variables
+   * @param[out] x0 array with the primal starting point (managed by Umpire)
+   */
+  virtual bool get_starting_point(const size_type& n, double* x0);
+
+private:
+  int n_vars_;
+  int n_cons_;
+  bool convex_obj_;
+  bool rankdefic_eq_;
+  bool rankdefic_ineq_;
+  double scal_neg_obj_;
+  
+  std::string mem_space_;
+};
+
+#endif

From 994f748c09991503d12513b108f7021d265c33da Mon Sep 17 00:00:00 2001
From: nychiang <sorakid507@gmail.com>
Date: Thu, 23 Feb 2023 22:43:33 -0800
Subject: [PATCH 02/10] minor changes

---
 src/Drivers/MDS/NlpMdsRajaEx1.hpp   | 4 ++--
 src/Drivers/Sparse/NlpSparseEx2.cpp | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/Drivers/MDS/NlpMdsRajaEx1.hpp b/src/Drivers/MDS/NlpMdsRajaEx1.hpp
index c43346ccd..fef3747d6 100644
--- a/src/Drivers/MDS/NlpMdsRajaEx1.hpp
+++ b/src/Drivers/MDS/NlpMdsRajaEx1.hpp
@@ -170,7 +170,7 @@ class MdsEx1 : public hiop::hiopInterfaceMDS
    *  Returns the sizes and number of nonzeros of the sparse and dense blocks within MDS
    *
    * @param[out] nx_sparse number of sparse variables
-   * @param[out] nx_ense number of dense variables
+   * @param[out] nx_dense number of dense variables
    * @param[out] nnz_sparse_Jace number of nonzeros in the Jacobian of the equalities w.r.t. 
    *                             sparse variables 
    * @param[out] nnz_sparse_Jaci number of nonzeros in the Jacobian of the inequalities w.r.t. 
@@ -233,7 +233,7 @@ class MdsEx1 : public hiop::hiopInterfaceMDS
    * @param[in] x array with the optimization variables or point at which to evaluate
    *              (managed by Umpire)
    * @param[in] new_x indicates whether any of the other eval functions have been evaluated 
-   *             previously (false) or not (true) at x
+   *                  previously (false) or not (true) at x
    * @param[out] gradf array with the values of the gradient (managed by Umpire) 
    */
   bool eval_grad_f(const size_type& n, const double* x, bool new_x, double* gradf);
diff --git a/src/Drivers/Sparse/NlpSparseEx2.cpp b/src/Drivers/Sparse/NlpSparseEx2.cpp
index 72073e914..ce4a0615e 100644
--- a/src/Drivers/Sparse/NlpSparseEx2.cpp
+++ b/src/Drivers/Sparse/NlpSparseEx2.cpp
@@ -153,7 +153,6 @@ bool SparseEx2::eval_cons(const size_type& n,
   return false;
 }
 
-/* Four constraints no matter how large n is */
 bool SparseEx2::eval_cons(const size_type& n, const size_type& m, const double* x, bool new_x, double* cons)
 {
   assert(n==n_vars); assert(m==n_cons);
@@ -170,7 +169,7 @@ bool SparseEx2::eval_cons(const size_type& n, const size_type& m, const double*
   // --- constraint 2 body ---> 2*x_1 + x_3
   cons[conidx++] += 2*x[0] + 1*x[2];
 
-  // --- constraint 3 body --->   2*x_1 + 0.5*x_i, for i>=4
+  // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4
   for(auto i=3; i<n; i++) {
     cons[conidx++] += 2*x[0] + 0.5*x[i];
   }

From 3b42f8df5f06de139baa5ad174ab5b8874b7c5c4 Mon Sep 17 00:00:00 2001
From: nychiang <sorakid507@gmail.com>
Date: Mon, 27 Feb 2023 18:00:50 -0800
Subject: [PATCH 03/10] update cmake file

---
 src/Drivers/Sparse/CMakeLists.txt | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt
index 646c6e43b..1d6c505a5 100644
--- a/src/Drivers/Sparse/CMakeLists.txt
+++ b/src/Drivers/Sparse/CMakeLists.txt
@@ -7,6 +7,19 @@ target_link_libraries(NlpSparseEx2.exe HiOp::HiOp)
 add_executable(NlpSparseEx3.exe NlpSparseEx3.cpp NlpSparseEx3Driver.cpp)
 target_link_libraries(NlpSparseEx3.exe HiOp::HiOp)
 
+if(HIOP_USE_RAJA)
+  if(HIOP_USE_GPU AND HIOP_USE_CUDA)
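+    # compile the RAJA sources as CUDA so that their device lambdas are instantiated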
+    set_source_files_properties(
+      NlpSparseRajaEx2.cpp 
+      NlpSparseRajaEx2Driver.cpp 
+      PROPERTIES LANGUAGE CUDA
+    )
+  endif()
+  add_executable(NlpSparseRajaEx2.exe NlpSparseRajaEx2Driver.cpp NlpSparseRajaEx2.cpp)
+  target_link_libraries(NlpSparseRajaEx2.exe HiOp::HiOp)
+  install(TARGETS NlpSparseRajaEx2.exe DESTINATION bin)
+endif()
+
 if(HIOP_BUILD_SHARED)
   add_executable(NlpSparseCEx1.exe NlpSparseCEx1.c)
   target_link_libraries(NlpSparseCEx1.exe HiOp::HiOp)
@@ -52,6 +65,11 @@ if(HIOP_USE_GINKGO)
       add_test(NAME NlpSparse2_6 COMMAND ${RUNCMD} "$<TARGET_FILE:NlpSparseEx2.exe>" "500" "-ginkgo_hip" "-inertiafree" "-selfcheck")
   endif(HIOP_USE_HIP)
 endif(HIOP_USE_GINKGO)
+
+if(HIOP_USE_RAJA)
+  add_test(NAME NlpSparseRaja2_1 COMMAND ${RUNCMD} bash -c "$<TARGET_FILE:NlpSparseRajaEx2.exe> 500 -selfcheck ")
+endif()
+
 add_test(NAME NlpSparse3_1 COMMAND ${RUNCMD} "$<TARGET_FILE:NlpSparseEx3.exe>" "500" "-selfcheck")
 if(HIOP_BUILD_SHARED AND NOT HIOP_USE_GPU)
   add_test(NAME NlpSparseCinterface COMMAND ${RUNCMD} "$<TARGET_FILE:NlpSparseCEx1.exe>")

From f051f7d066491d95aaadd8b4136e156ff6570f34 Mon Sep 17 00:00:00 2001
From: nychiang <sorakid507@gmail.com>
Date: Mon, 27 Feb 2023 18:30:35 -0800
Subject: [PATCH 04/10] add driver

---
 src/Drivers/Sparse/NlpSparseRajaEx2.cpp       |  94 +++---
 src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp | 318 ++++++++++++++++++
 src/LinAlg/hiopLinSolver.cpp                  |   8 +-
 src/LinAlg/hiopLinSolver.hpp                  |   6 +-
 src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp    |  78 ++++-
 src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp    |  26 +-
 src/LinAlg/hiopMatrixRajaSparseTriplet.hpp    |   4 +-
 .../hiopMatrixRajaSparseTripletImpl.hpp       |  14 +-
 src/Optimization/hiopKKTLinSysSparse.cpp      |  44 ++-
 .../hiopKKTLinSysSparseCondensed.hpp          |   1 -
 10 files changed, 509 insertions(+), 84 deletions(-)
 create mode 100644 src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp

diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
index d1b9c2552..51c8a7232 100644
--- a/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
+++ b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
@@ -52,13 +52,13 @@
  *
  */
 
-#include "SparseRajaEx2.hpp"
+#include "NlpSparseRajaEx2.hpp"
 
 #include <umpire/Allocator.hpp>
 #include <umpire/ResourceManager.hpp>
 #include <RAJA/RAJA.hpp>
 
-#include <hiopMatrixRajaSparseTriplet.hpp>
+//#include <hiopMatrixRajaSparseTriplet.hpp>
 
 #include <cmath>
 #include <cstring> //for memcpy
@@ -77,18 +77,18 @@
 #include "ExecPoliciesRajaCudaImpl.hpp"
 using ex2_raja_exec = hiop::ExecRajaPoliciesBackend<hiop::ExecPolicyRajaCuda>::hiop_raja_exec;
 using ex2_raja_reduce = hiop::ExecRajaPoliciesBackend<hiop::ExecPolicyRajaCuda>::hiop_raja_reduce;
-using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet<hiop::MemBackendUmpire, hiop::ExecPolicyRajaCuda>;
+//using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet<hiop::MemBackendUmpire, hiop::ExecPolicyRajaCuda>;
 #elif defined(HIOP_USE_HIP)
 #include <ExecPoliciesRajaHipImpl.hpp>
 using ex2_raja_exec = hiop::ExecRajaPoliciesBackend<hiop::ExecPolicyRajaHip>::hiop_raja_exec;
 using ex2_raja_reduce = hiop::ExecRajaPoliciesBackend<hiop::ExecPolicyRajaHip>::hiop_raja_reduce;
-using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet<hiop::MemBackendUmpire, hiop::ExecPolicyRajaHip>;
+//using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet<hiop::MemBackendUmpire, hiop::ExecPolicyRajaHip>;
 #else
 //#if !defined(HIOP_USE_CUDA) && !defined(HIOP_USE_HIP)
 #include <ExecPoliciesRajaOmpImpl.hpp>
 using ex2_raja_exec = hiop::ExecRajaPoliciesBackend<hiop::ExecPolicyRajaOmp>::hiop_raja_exec;
 using ex2_raja_reduce = hiop::ExecRajaPoliciesBackend<hiop::ExecPolicyRajaOmp>::hiop_raja_reduce;
-using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet<hiop::MemBackendUmpire, hiop::ExecPolicyRajaOmp>;
+//using hiopMatrixRajaSparse = hiop::hiopMatrixRajaSparseTriplet<hiop::MemBackendUmpire, hiop::ExecPolicyRajaOmp>;
 #endif
 
 
@@ -154,7 +154,7 @@ bool SparseRajaEx2::get_vars_info(const size_type& n, double *xlow, double* xupp
 {
   assert(n==n_vars_);
 
-  RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(0, 3),
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 3),
     RAJA_LAMBDA(RAJA::Index_type i)
     {
       if(i==0) {
@@ -170,7 +170,7 @@ bool SparseRajaEx2::get_vars_info(const size_type& n, double *xlow, double* xupp
     });
 
   if(n>3) {
-    RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(3, n),
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(3, n),
       RAJA_LAMBDA(RAJA::Index_type i)
       {
         xlow[i] = 0.5;
@@ -178,7 +178,7 @@ bool SparseRajaEx2::get_vars_info(const size_type& n, double *xlow, double* xupp
       });
   }
 
-  RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(0, n),
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
     RAJA_LAMBDA(RAJA::Index_type i)
     {
       type[i] = hiopNonlinear;
@@ -189,8 +189,9 @@ bool SparseRajaEx2::get_vars_info(const size_type& n, double *xlow, double* xupp
 bool SparseRajaEx2::get_cons_info(const size_type& m, double* clow, double* cupp, NonlinearityType* type)
 {
   assert(m==n_cons_);
+  size_type n = n_vars_;
 
-  RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(0, 2),
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 2),
     RAJA_LAMBDA(RAJA::Index_type i)
     {
       if(i==0) {
@@ -203,16 +204,16 @@ bool SparseRajaEx2::get_cons_info(const size_type& m, double* clow, double* cupp
     });
 
   if(n>3) {
-    RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(3, m),
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2, 2+n-3),
       RAJA_LAMBDA(RAJA::Index_type conidx)
       {
         clow[conidx] = 1.0;
-        cupp[conidx] = 2*n;
+        cupp[conidx] = 2*n_vars_;
       });
   }
 
   if(rankdefic_ineq_) {
-    RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(m, m+1),
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2+n-3, 2+n-3+1),
       RAJA_LAMBDA(RAJA::Index_type conidx)
       {
         // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
@@ -222,7 +223,7 @@ bool SparseRajaEx2::get_cons_info(const size_type& m, double* clow, double* cupp
   }
 
   if(rankdefic_eq_) {
-    RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(m+rankdefic_ineq_, m+rankdefic_ineq_+1),
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2+n-3+rankdefic_ineq_, m),
       RAJA_LAMBDA(RAJA::Index_type conidx)
       {
         //  4*x_1 + 2*x_2 == 10
@@ -231,7 +232,7 @@ bool SparseRajaEx2::get_cons_info(const size_type& m, double* clow, double* cupp
       });
   }
 
-  RAJA::forall<ex1_raja_exec>(RAJA::RangeSegment(0, m),
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, m),
     RAJA_LAMBDA(RAJA::Index_type i)
     {
       type[i] = hiopInterfaceBase::hiopNonlinear;
@@ -304,16 +305,15 @@ bool SparseRajaEx2::eval_cons(const size_type& n, const size_type& m, const doub
       }
     });
 
-  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2, n-1),
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2, 2+n-3),
     RAJA_LAMBDA(RAJA::Index_type i)
     {
       // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4
       cons[i] = 2*x[0] + 0.5*x[i+1];
     });
 
-
   if(rankdefic_ineq_) {
-    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(n-1, n),
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2+n-3, 2+n-3+1),
       RAJA_LAMBDA(RAJA::Index_type i)
       {
         // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
@@ -322,14 +322,13 @@ bool SparseRajaEx2::eval_cons(const size_type& n, const size_type& m, const doub
   }
 
   if(rankdefic_eq_) {
-    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(n-1+rankdefic_ineq_, n+rankdefic_ineq_),
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2+n-3+rankdefic_ineq_, m),
       RAJA_LAMBDA(RAJA::Index_type i)
       {
         //  4*x_1 + 2*x_2 == 10
         cons[i] = 4*x[0] + 2*x[1];
       });
   }
-  assert(conidx==m);
 
   return true;
 }
@@ -353,20 +352,24 @@ bool SparseRajaEx2::eval_Jac_cons(const size_type& n,
     {
       if(itrow==0) {
         // --- constraint 1 body --->  4*x_1 + 2*x_2 == 10
-        iJacS[2*itrow] = itrow;
-        jJacS[2*itrow] = 0;
-        iJacS[2*itrow+1] = itrow;
-        jJacS[2*itrow+1] = 1;
+        if(iJacS !=nullptr && jJacS != nullptr) {
+          iJacS[2*itrow] = itrow;
+          jJacS[2*itrow] = 0;
+          iJacS[2*itrow+1] = itrow;
+          jJacS[2*itrow+1] = 1;
+        }
         if(MJacS != nullptr) {
           MJacS[2*itrow] = 4.0;
           MJacS[2*itrow+1] = 2.0;
         }
       } else if(itrow==1) {
         // --- constraint 2 body ---> 2*x_1 + x_3
-        iJacS[2*itrow] = itrow;
-        jJacS[2*itrow] = 0;
-        iJacS[2*itrow+1] = itrow;
-        jJacS[2*itrow+1] = 2;
+        if(iJacS !=nullptr && jJacS != nullptr) { 
+          iJacS[2*itrow] = itrow;
+          jJacS[2*itrow] = 0;
+          iJacS[2*itrow+1] = itrow;
+          jJacS[2*itrow+1] = 2;
+        }
         if(MJacS != nullptr) {
           MJacS[2*itrow] = 2.0;
           MJacS[2*itrow+1] = 1.0;
@@ -378,26 +381,29 @@ bool SparseRajaEx2::eval_Jac_cons(const size_type& n,
     RAJA_LAMBDA(RAJA::Index_type itrow)
     {
       // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4
-      iJacS[2*itrow] = itrow;
-      jJacS[2*itrow] = 0;
-      iJacS[2*itrow+1] = itrow;
-      jJacS[2*itrow+1] = itrow+1;
+      if(iJacS !=nullptr && jJacS != nullptr) {
+        iJacS[2*itrow] = itrow;
+        jJacS[2*itrow] = 0;
+        iJacS[2*itrow+1] = itrow;
+        jJacS[2*itrow+1] = itrow+1;
+      }
       if(MJacS != nullptr) {
         MJacS[2*itrow] = 2.0;
         MJacS[2*itrow+1] = 0.5;
       }
     });
 
-
   if(rankdefic_ineq_) {
     RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(n-1, n),
       RAJA_LAMBDA(RAJA::Index_type itrow)
       {
         // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
-        iJacS[2*itrow] = itrow;
-        jJacS[2*itrow] = 0;
-        iJacS[2*itrow+1] = itrow;
-        jJacS[2*itrow+1] = 2;
+        if(iJacS !=nullptr && jJacS != nullptr) {
+          iJacS[2*itrow] = itrow;
+          jJacS[2*itrow] = 0;
+          iJacS[2*itrow+1] = itrow;
+          jJacS[2*itrow+1] = 2;
+        }
         if(MJacS != nullptr) {
           MJacS[2*itrow] = 4.0;
           MJacS[2*itrow+1] = 2.0;
@@ -407,13 +413,15 @@ bool SparseRajaEx2::eval_Jac_cons(const size_type& n,
 
   if(rankdefic_eq_) {
     RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(n-1+rankdefic_ineq_, n+rankdefic_ineq_),
-      RAJA_LAMBDA(RAJA::Index_type i)
+      RAJA_LAMBDA(RAJA::Index_type itrow)
       {
         //  4*x_1 + 2*x_2 == 10
-        iJacS[2*itrow] = itrow;
-        jJacS[2*itrow] = 0;
-        iJacS[2*itrow+1] = itrow;
-        jJacS[2*itrow+1] = 1;
+        if(iJacS !=nullptr && jJacS != nullptr) {
+          iJacS[2*itrow] = itrow;
+          jJacS[2*itrow] = 0;
+          iJacS[2*itrow+1] = itrow;
+          jJacS[2*itrow+1] = 1;
+        }
         if(MJacS != nullptr) {
           MJacS[2*itrow] = 4.0;
           MJacS[2*itrow+1] = 2.0;
@@ -440,7 +448,7 @@ bool SparseRajaEx2::eval_Hess_Lagr(const size_type& n,
     //not contribute to the Hessian of the Lagrangian
     assert(nnzHSS == n);
 
-    if(iHSS!=NULL && jHSS!=NULL) {
+    if(iHSS!=nullptr && jHSS!=nullptr) {
       RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
         RAJA_LAMBDA(RAJA::Index_type i)
         {
@@ -451,7 +459,7 @@ bool SparseRajaEx2::eval_Hess_Lagr(const size_type& n,
 
     int convex_obj = (int) convex_obj_;
     double scal_neg_obj = scal_neg_obj_;
-    if(MHSS!=NULL) {
+    if(MHSS!=nullptr) {
       RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
         RAJA_LAMBDA(RAJA::Index_type i)
         {
diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp
new file mode 100644
index 000000000..4df767d99
--- /dev/null
+++ b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp
@@ -0,0 +1,318 @@
+#include "NlpSparseRajaEx2.hpp"
+#include "hiopNlpFormulation.hpp"
+#include "hiopAlgFilterIPM.hpp"
+
+#include <cstdlib>
+#include <string>
+
+#include <umpire/Allocator.hpp>
+#include <umpire/ResourceManager.hpp>
+#include <RAJA/RAJA.hpp>
+
+using namespace hiop;
+
+static bool self_check(size_type n, double obj_value, const bool inertia_free);
+
+static bool parse_arguments(int argc,
+                            char **argv,
+                            size_type& n,
+                            bool& self_check,
+                            bool& inertia_free,
+                            bool& use_cusolver,
+                            bool& use_cusolver_lu,
+                            bool& use_ginkgo,
+                            bool& use_ginkgo_cuda,
+                            bool& use_ginkgo_hip)
+{
+  self_check = false;
+  n = 50;
+  inertia_free = false;
+  use_cusolver = false;
+  use_cusolver_lu = false;
+  use_ginkgo = false;
+  use_ginkgo_cuda = false;
+  use_ginkgo_hip = false;
+  switch(argc) {
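+  // the cases below deliberately fall through, so each trailing argument is parsed in turn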
+  case 1:
+    //no arguments
+    return true;
+    break;
+  case 5: //4 arguments
+    {
+      if(std::string(argv[4]) == "-selfcheck") {
+        self_check = true;    
+      } else if(std::string(argv[4]) == "-inertiafree") {
+        inertia_free = true;
+      } else if(std::string(argv[4]) == "-cusolver") {
+        use_cusolver = true;
+      } else if(std::string(argv[4]) == "-ginkgo"){
+        use_ginkgo = true;
+      } else if(std::string(argv[4]) == "-ginkgo_cuda"){
+        use_ginkgo = true;
+        use_ginkgo_cuda = true;
+      } else if(std::string(argv[4]) == "-ginkgo_hip"){
+        use_ginkgo = true;
+        use_ginkgo_hip = true;
+      } else {
+        n = std::atoi(argv[4]);
+        if(n<=0) {
+          return false;
+        }
+      }
+    }
+  case 4: //3 arguments
+    {
+      if(std::string(argv[3]) == "-selfcheck") {
+        self_check = true;    
+      } else if(std::string(argv[3]) == "-inertiafree") {
+        inertia_free = true;
+      } else if(std::string(argv[3]) == "-cusolver") {
+        use_cusolver = true;
+      } else if(std::string(argv[3]) == "-ginkgo"){
+        use_ginkgo = true;
+      } else if(std::string(argv[3]) == "-ginkgo_cuda"){
+        use_ginkgo = true;
+        use_ginkgo_cuda = true;
+      } else if(std::string(argv[3]) == "-ginkgo_hip"){
+        use_ginkgo = true;
+        use_ginkgo_hip = true;
+      } else {
+        n = std::atoi(argv[3]);
+        if(n<=0) {
+          return false;
+        }
+      }
+    }
+  case 3: //2 arguments
+    {
+      if(std::string(argv[2]) == "-selfcheck") {
+        self_check = true;    
+      } else if(std::string(argv[2]) == "-inertiafree") {
+        inertia_free = true;
+      } else if(std::string(argv[2]) == "-cusolver") {
+        use_cusolver = true;
+      } else if(std::string(argv[2]) == "-ginkgo"){
+        use_ginkgo = true;
+      } else if(std::string(argv[2]) == "-ginkgo_cuda"){
+        use_ginkgo = true;
+        use_ginkgo_cuda = true;
+      } else if(std::string(argv[2]) == "-ginkgo_hip"){
+        use_ginkgo = true;
+        use_ginkgo_hip = true;
+      } else {
+        n = std::atoi(argv[2]);
+        if(n<=0) {
+          return false;
+        }
+      }
+    }
+  case 2: //1 argument
+    {
+      if(std::string(argv[1]) == "-selfcheck") {
+        self_check = true;    
+      } else if(std::string(argv[1]) == "-inertiafree") {
+        inertia_free = true;
+      } else if(std::string(argv[1]) == "-cusolver") {
+        use_cusolver = true;
+      } else if(std::string(argv[1]) == "-ginkgo"){
+        use_ginkgo = true;
+      } else if(std::string(argv[1]) == "-ginkgo_cuda"){
+        use_ginkgo = true;
+        use_ginkgo_cuda = true;
+      } else if(std::string(argv[1]) == "-ginkgo_hip"){
+        use_ginkgo = true;
+        use_ginkgo_hip = true;
+      } else {
+        n = std::atoi(argv[1]);
+        if(n<=0) {
+          return false;
+        }
+      }
+    }
+    break;
+  default:
+    return false; // 5 or more arguments
+  }
+
+// If CUDA is not available, de-select cuSOLVER
+#ifndef HIOP_USE_CUDA
+  if(use_cusolver) {
+    printf("HiOp built without CUDA support. ");
+    printf("Using default instead of cuSOLVER ...\n");
+    use_cusolver = false;
+  }
+#endif
+
+// Use cuSOLVER's LU factorization, if it was configured
+#ifdef HIOP_USE_CUSOLVER_LU
+  if(use_cusolver) {
+    use_cusolver_lu = true;
+  }
+#endif
+
+  // If cuSOLVER was selected but the inertia-free approach was not, enable it
+  if(use_cusolver && !(inertia_free)) {
+    inertia_free = true;
+    printf("LU solver from cuSOLVER library requires inertia free approach. ");
+    printf("Enabling now ...\n");
+  }
+
+// If Ginkgo is not available, de-select it.
+#ifndef HIOP_USE_GINKGO
+  if(use_ginkgo) {
+    printf("HiOp not built with GINKGO support, using default linear solver ...\n");
+    use_ginkgo = false;
+  }
+#endif
+
+  // If Ginkgo was selected but the inertia-free approach was not, enable it
+  if(use_ginkgo && !(inertia_free)) {
+    inertia_free = true;
+    printf("LU solver from GINKGO library requires inertia free approach. ");
+    printf("Enabling now ...\n");
+  }
+
+  return true;
+}
+
+static void usage(const char* exeName)
+{
+  printf("hiOp driver %s that solves a synthetic convex problem of variable size.\n", exeName);
+  printf("Usage: \n");
+  printf("  '$ %s problem_size -inertiafree -selfcheck'\n", exeName);
+  printf("Arguments:\n");
+  printf("  'problem_size': number of decision variables [optional, default is 50]\n");
+  printf("  '-inertiafree': indicate if inertia free approach should be used [optional]\n");
+  printf("  '-selfcheck': compares the optimal objective with a previously saved value for the "
+         "problem specified by 'problem_size'. [optional]\n");
+  printf("  '-cusolver': use cuSOLVER linear solver [optional]\n");
+  printf("  '-ginkgo': use GINKGO linear solver [optional]\n");
+}
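+
+// example invocation (illustrative): ./NlpSparseRajaEx2.exe 500 -inertiafree -selfcheck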
+
+
+int main(int argc, char **argv)
+{
+  int rank=0;
+#ifdef HIOP_USE_MPI
+  MPI_Init(&argc, &argv);
+  int comm_size;
+  int ierr = MPI_Comm_size(MPI_COMM_WORLD, &comm_size); assert(MPI_SUCCESS==ierr);
+  if(comm_size != 1) {
+    printf("[error] driver detected more than one rank but the driver should be run "
+           "in serial only; will exit\n");
+    MPI_Finalize();
+    return 1;
+  }
+#endif
+
+  // Set memory space where to create models and perform NLP solve
+#ifdef HIOP_USE_GPU
+  std::string mem_space = "device";
+#else
+  std::string mem_space = "host";
+#endif
+
+  bool selfCheck = false;
+  size_type n = 50;
+  bool inertia_free = false;
+  bool use_cusolver = false;
+  bool use_cusolver_lu = false;
+  bool use_ginkgo = false;
+  bool use_ginkgo_cuda = false;
+  bool use_ginkgo_hip = false;
+  if(!parse_arguments(argc, argv, n, selfCheck, inertia_free, use_cusolver, use_cusolver_lu, use_ginkgo, use_ginkgo_cuda, use_ginkgo_hip)) { 
+    usage(argv[0]);
+#ifdef HIOP_USE_MPI
+    MPI_Finalize();
+#endif
+    return 1;
+  }
+
+  bool convex_obj = false;
+  bool rankdefic_Jac_eq = true;
+  bool rankdefic_Jac_ineq = true;
+  double scal_neg_obj = 0.1;
+
+  //first test
+  {
+    SparseRajaEx2 nlp_interface(mem_space, n, convex_obj, rankdefic_Jac_eq, rankdefic_Jac_ineq, scal_neg_obj);
+    hiopNlpSparse nlp(nlp_interface);
+    nlp.options->SetStringValue("compute_mode", "gpu");
+    nlp.options->SetStringValue("KKTLinsys", "xdycyd");
+
+    // only cusolverLU is supported right now (2023.02.28)
+    //lsq initialization of the duals fails for this example since the Jacobian is rank deficient
+    //use zero initialization
+    nlp.options->SetStringValue("duals_init", "zero");
+    nlp.options->SetStringValue("mem_space", "device");
+    nlp.options->SetStringValue("fact_acceptor", "inertia_free");
+    nlp.options->SetStringValue("linsol_mode", "speculative");
+
+    hiopAlgFilterIPMNewton solver(&nlp);
+    hiopSolveStatus status = solver.run();
+    
+    double obj_value = solver.getObjective();
+    
+    if(status<0) {
+      if(rank==0) {
+        printf("solver returned negative solve status: %d (with objective is %18.12e)\n", status, obj_value);
+      }
+#ifdef HIOP_USE_MPI
+      MPI_Finalize();
+#endif
+      return -1;
+    }
+
+    //this is used for "regression" testing when the driver is called with -selfcheck
+    if(selfCheck) {
+      if(!self_check(n, obj_value, inertia_free)) {
+#ifdef HIOP_USE_MPI
+        MPI_Finalize();
+#endif
+        return -1;
+      }
+    } else {
+      if(rank==0) {
+        printf("Optimal objective: %22.14e. Solver status: %d\n", obj_value, status);
+      }
+    }
+  }
+  
+  
+#ifdef HIOP_USE_MPI
+  MPI_Finalize();
+#endif
+
+  return 0;
+}
+
+
+static bool self_check(size_type n, double objval, const bool inertia_free)
+{
+#define num_n_saved 3 //keep this in sync with n_saved and objval_saved
+  const size_type n_saved[] = {50, 500, 10000};
+  const double objval_saved[] = { 8.7754974e+00,  6.4322371e+01,  1.2369786e+03};
+
+#define relerr 1e-6
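+  // pass if |(objval_saved - objval) / (1 + objval_saved)| <= relerr (about 6 matching digits)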
+  bool found=false;
+  for(int it=0; it<num_n_saved; it++) {
+    if(n_saved[it]==n) {
+      found=true;
+      if(fabs( (objval_saved[it]-objval)/(1+objval_saved[it])) > relerr) {
+        printf("selfcheck failure. Objective (%18.12e) does not agree (%d digits) with the saved value (%18.12e) for n=%d.\n",
+               objval, -(int)log10(relerr), objval_saved[it], n);
+        return false;
+      } else {
+        printf("selfcheck success (%d digits)\n",  -(int)log10(relerr));
+      }
+      break;
+    }
+  }
+
+  if(!found) {
+    printf("selfcheck: driver does not have the objective for n=%d saved. BTW, obj=%18.12e was obtained for this n.\n", n, objval);
+    return false;
+  }
+
+  return true;
+}
diff --git a/src/LinAlg/hiopLinSolver.cpp b/src/LinAlg/hiopLinSolver.cpp
index 0c3760dcc..480aa9e28 100644
--- a/src/LinAlg/hiopLinSolver.cpp
+++ b/src/LinAlg/hiopLinSolver.cpp
@@ -99,7 +99,8 @@ namespace hiop {
     //we default to triplet matrix for now; derived classes using CSR matrices will not call
     //this constructor (will call the 1-parameter constructor below) so they avoid creating
     //the triplet matrix
-    M_ = new hiopMatrixSparseTriplet(n, n, nnz);
+    M_ = LinearAlgebraFactory::create_matrix_sparse(nlp->options->GetString("mem_space"), n, n, nnz);
+    M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz);
     //this class will own `M_`
     sys_mat_owned_ = true;
     nlp_ = nlp;
@@ -112,19 +113,22 @@ namespace hiop {
     sys_mat_owned_ = false;
     nlp_ = nlp;
     perf_report_ = "on"==hiop::tolower(nlp->options->GetString("time_kkt"));
+    M_host_ = nullptr;
   }
 
   hiopLinSolverSymSparse::hiopLinSolverSymSparse(hiopMatrixSparse* M, hiopNlpFormulation* nlp)
   {
     M_ = M;
     sys_mat_owned_ = false;
+    M_host_ = nullptr;
     nlp_ = nlp;
     perf_report_ = "on"==hiop::tolower(nlp->options->GetString("time_kkt"));
   }
   
   hiopLinSolverNonSymSparse::hiopLinSolverNonSymSparse(int n, int nnz, hiopNlpFormulation* nlp)
   {
-    M_ = new hiopMatrixSparseTriplet(n, n, nnz);
+    M_ = LinearAlgebraFactory::create_matrix_sparse(nlp->options->GetString("mem_space"), n, n, nnz);
+    M_host_ = nullptr;
     sys_mat_owned_ = false;
     nlp_ = nlp;
     perf_report_ = "on"==hiop::tolower(nlp->options->GetString("time_kkt"));
diff --git a/src/LinAlg/hiopLinSolver.hpp b/src/LinAlg/hiopLinSolver.hpp
index 79fe53bc5..db953be1f 100644
--- a/src/LinAlg/hiopLinSolver.hpp
+++ b/src/LinAlg/hiopLinSolver.hpp
@@ -136,7 +136,8 @@ class hiopLinSolverSparseBase : public hiopLinSolver
 public:
   hiopLinSolverSparseBase()
     : M_(nullptr),
-      sys_mat_owned_(true)
+      sys_mat_owned_(true),
+      M_host_(nullptr)
   {
   }
   
@@ -144,6 +145,7 @@ class hiopLinSolverSparseBase : public hiopLinSolver
   {
     if(sys_mat_owned_) {
       delete M_;
+      delete M_host_;
     }
   }
   
@@ -157,12 +159,14 @@ class hiopLinSolverSparseBase : public hiopLinSolver
     if(sys_mat_owned_) {
       assert(false && "system matrix should not have been allocated internally when calling set_sys_matrix");
       delete M_;
+      delete M_host_;
     }
     sys_mat_owned_ = false;
     M_ = M;
   }
 protected:
   hiopMatrixSparse* M_;
+  hiopMatrixSparse* M_host_;  // TODO: temporary host mirror for cusolver; delete this later!
   bool sys_mat_owned_;
 };
 
diff --git a/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp b/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp
index 957b31f33..e4a51f5f7 100644
--- a/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp
+++ b/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp
@@ -252,12 +252,12 @@ namespace hiop
     } else {
       // update matrix
       for(int k = 0; k < nnz_; k++) {
-        kVal_[k] = M_->M()[index_covert_CSR2Triplet_[k]];
+        kVal_[k] = M_host_->M()[index_covert_CSR2Triplet_[k]];
       }
       for(int i = 0; i < n_; i++) {
         if(index_covert_extra_Diag2CSR_[i] != -1)
           kVal_[index_covert_extra_Diag2CSR_[i]]
-            += M_->M()[M_->numberOfNonzeros() - n_ + i];
+            += M_host_->M()[M_host_->numberOfNonzeros() - n_ + i];
       }
     } // else
 
@@ -469,10 +469,10 @@ namespace hiop
     //
     // off-diagonal part
     kRowPtr_[0] = 0;
-    for(int k = 0; k < M_->numberOfNonzeros() - n_; k++) {
-      if(M_->i_row()[k] != M_->j_col()[k]) {
-        kRowPtr_[M_->i_row()[k] + 1]++;
-        kRowPtr_[M_->j_col()[k] + 1]++;
+    for(int k = 0; k < M_host_->numberOfNonzeros() - n_; k++) {
+      if(M_host_->i_row()[k] != M_host_->j_col()[k]) {
+        kRowPtr_[M_host_->i_row()[k] + 1]++;
+        kRowPtr_[M_host_->j_col()[k] + 1]++;
         nnz_ += 2;
       }
     }
@@ -502,16 +502,16 @@ namespace hiop
       index_covert_extra_Diag2CSR_[k] = -1;
     }
 
-    for(int k = 0; k < M_->numberOfNonzeros() - n_; k++) {
-      rowID_tmp = M_->i_row()[k];
-      colID_tmp = M_->j_col()[k];
+    for(int k = 0; k < M_host_->numberOfNonzeros() - n_; k++) {
+      rowID_tmp = M_host_->i_row()[k];
+      colID_tmp = M_host_->j_col()[k];
       if(rowID_tmp == colID_tmp) {
         nnz_tmp = nnz_each_row_tmp[rowID_tmp] + kRowPtr_[rowID_tmp];
         jCol_[nnz_tmp] = colID_tmp;
-        kVal_[nnz_tmp] = M_->M()[k];
+        kVal_[nnz_tmp] = M_host_->M()[k];
         index_covert_CSR2Triplet_[nnz_tmp] = k;
 
-        kVal_[nnz_tmp] += M_->M()[M_->numberOfNonzeros() - n_ + rowID_tmp];
+        kVal_[nnz_tmp] += M_host_->M()[M_host_->numberOfNonzeros() - n_ + rowID_tmp];
         index_covert_extra_Diag2CSR_[rowID_tmp] = nnz_tmp;
 
         nnz_each_row_tmp[rowID_tmp]++;
@@ -519,12 +519,12 @@ namespace hiop
       } else {
         nnz_tmp = nnz_each_row_tmp[rowID_tmp] + kRowPtr_[rowID_tmp];
         jCol_[nnz_tmp] = colID_tmp;
-        kVal_[nnz_tmp] = M_->M()[k];
+        kVal_[nnz_tmp] = M_host_->M()[k];
         index_covert_CSR2Triplet_[nnz_tmp] = k;
 
         nnz_tmp = nnz_each_row_tmp[colID_tmp] + kRowPtr_[colID_tmp];
         jCol_[nnz_tmp] = rowID_tmp;
-        kVal_[nnz_tmp] = M_->M()[k];
+        kVal_[nnz_tmp] = M_host_->M()[k];
         index_covert_CSR2Triplet_[nnz_tmp] = k;
 
         nnz_each_row_tmp[rowID_tmp]++;
@@ -538,8 +538,8 @@ namespace hiop
         assert(nnz_each_row_tmp[i] == kRowPtr_[i + 1] - kRowPtr_[i] - 1);
         nnz_tmp = nnz_each_row_tmp[i] + kRowPtr_[i];
         jCol_[nnz_tmp] = i;
-        kVal_[nnz_tmp] = M_->M()[M_->numberOfNonzeros() - n_ + i];
-        index_covert_CSR2Triplet_[nnz_tmp] = M_->numberOfNonzeros() - n_ + i;
+        kVal_[nnz_tmp] = M_host_->M()[M_host_->numberOfNonzeros() - n_ + i];
+        index_covert_CSR2Triplet_[nnz_tmp] = M_host_->numberOfNonzeros() - n_ + i;
         total_nnz_tmp += 1;
 
         std::vector<int> ind_temp(kRowPtr_[i + 1] - kRowPtr_[i]);
@@ -1619,4 +1619,52 @@ namespace hiop
         break;
     } // switch
   } // GramSchmidt
+
+
+
+  hiopLinSolverSymSparseCUSOLVERGPU::hiopLinSolverSymSparseCUSOLVERGPU(const int& n, 
+                                                                       const int& nnz, 
+                                                                       hiopNlpFormulation* nlp)
+    : hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp), 
+      rhs_host_{nullptr}
+  {
+    rhs_host_ = LinearAlgebraFactory::create_vector("default", n);
+  }
+  
+  hiopLinSolverSymSparseCUSOLVERGPU::~hiopLinSolverSymSparseCUSOLVERGPU()
+  {
+    delete rhs_host_;
+  }
+
+  int hiopLinSolverSymSparseCUSOLVERGPU::matrixChanged()
+  {
+    size_type nnz = M_->numberOfNonzeros();
+    double* mval_dev = M_->M();
+    double* mval_host= M_host_->M();
+    checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * nnz, cudaMemcpyDeviceToHost));
+
+    index_type* irow_dev = M_->i_row();
+    index_type* irow_host= M_host_->i_row();
+    checkCudaErrors(cudaMemcpy(irow_host, irow_dev, sizeof(index_type) * nnz, cudaMemcpyDeviceToHost));
+  
+    index_type* jcol_dev = M_->j_col();
+    index_type* jcol_host= M_host_->j_col();
+    checkCudaErrors(cudaMemcpy(jcol_host, jcol_dev, sizeof(index_type) * nnz, cudaMemcpyDeviceToHost));
+
+    return hiopLinSolverSymSparseCUSOLVER::matrixChanged();
+  }
+  
+  bool hiopLinSolverSymSparseCUSOLVERGPU::solve(hiopVector& x)
+  {
+    double* mval_dev = x.local_data();
+    double* mval_host= rhs_host_->local_data();
+    
+    checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * n_, cudaMemcpyDeviceToHost));
+
+    bool bret = hiopLinSolverSymSparseCUSOLVER::solve(*rhs_host_);
+    
+    checkCudaErrors(cudaMemcpy(mval_dev, mval_host, sizeof(double) * n_, cudaMemcpyHostToDevice));
+    return bret;
+  }
+
 } // namespace hiop
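
The new hiopLinSolverSymSparseCUSOLVERGPU above is a thin device-to-host shim:
it keeps host mirrors of the system matrix and right-hand side, stages the
device data into them, and delegates the actual factorization and solve to the
existing host implementation. A self-contained sketch of the pattern, with
std::vector standing in for device buffers and all names hypothetical:

    #include <cstddef>
    #include <vector>

    struct HostSolver {
      virtual ~HostSolver() = default;
      // pretend this routine can only touch host-resident memory
      virtual bool solve(std::vector<double>& x_host)
      {
        for(double& v : x_host) v *= 0.5; // placeholder for the real solve
        return true;
      }
    };

    struct DeviceShimSolver : HostSolver {
      explicit DeviceShimSolver(std::size_t n) : x_host_(n) {}
      bool solve(std::vector<double>& x_dev) override
      {
        x_host_ = x_dev;  // stands in for cudaMemcpyDeviceToHost
        const bool ok = HostSolver::solve(x_host_);
        x_dev = x_host_;  // stands in for cudaMemcpyHostToDevice
        return ok;
      }
    private:
      std::vector<double> x_host_; // host mirror, analogous to rhs_host_
    };

The cost is an O(n) host/device transfer per solve and an O(nnz) transfer per
refactorization, which the TODO on M_host_ suggests is a stopgap until the
cusolver path consumes device data directly.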
diff --git a/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp b/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp
index 16518052c..db6ea84e0 100644
--- a/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp
+++ b/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp
@@ -88,12 +88,12 @@ class hiopLinSolverSymSparseCUSOLVER : public hiopLinSolverSymSparse
   /** Triggers a refactorization of the matrix, if necessary.
    * Overload from base class.
    * In this case, KLU (SuiteSparse) is used to refactor*/
-  int matrixChanged();
+  virtual int matrixChanged();
 
   /** solves a linear system.
    * param 'x' is on entry the right hand side(s) of the system to be solved.
    * On exit is contains the solution(s).  */
-  bool solve(hiopVector& x_);
+  virtual bool solve(hiopVector& x_);
 
   /** Multiple rhs not supported yet */
   virtual bool
@@ -103,7 +103,7 @@ class hiopLinSolverSymSparseCUSOLVER : public hiopLinSolverSymSparse
     return false;
   }
 
-private:
+protected:
   //
   int m_;   // number of rows of the whole matrix
   int n_;   // number of cols of the whole matrix
@@ -204,6 +204,26 @@ class hiopLinSolverSymSparseCUSOLVER : public hiopLinSolverSymSparse
   void IRsetup();
 };
 
+class hiopLinSolverSymSparseCUSOLVERGPU : public hiopLinSolverSymSparseCUSOLVER
+{
+public:  
+  hiopLinSolverSymSparseCUSOLVERGPU(const int& n, const int& nnz, hiopNlpFormulation* nlp);
+  virtual ~hiopLinSolverSymSparseCUSOLVERGPU();
+
+  virtual int matrixChanged();
+  virtual bool solve(hiopVector& x_);
+
+  /** Multiple rhs not supported yet */
+  virtual bool
+  solve(hiopMatrix& /* x */)
+  {
+    assert(false && "not yet supported");
+    return false;
+  }
+
+private:
+  hiopVector* rhs_host_;
+};
 
 // Forward declaration of LU class
 class hiopLinSolverSymSparseCUSOLVERLU;
diff --git a/src/LinAlg/hiopMatrixRajaSparseTriplet.hpp b/src/LinAlg/hiopMatrixRajaSparseTriplet.hpp
index 4769d47fa..00bc19a08 100644
--- a/src/LinAlg/hiopMatrixRajaSparseTriplet.hpp
+++ b/src/LinAlg/hiopMatrixRajaSparseTriplet.hpp
@@ -390,11 +390,13 @@ class hiopMatrixRajaSparseTriplet : public hiopMatrixSparse
   {
     index_type *idx_start_; //size num_rows+1
     index_type *idx_start_host_; //size num_rows+1
+    index_type register_row_st_;
     size_type num_rows_;
     std::string mem_space_;
     RowStartsInfo()
       : idx_start_(nullptr),
-        num_rows_(0)
+        num_rows_(0), 
+        register_row_st_{0}
     {}
     RowStartsInfo(size_type n_rows, std::string memspace);
     virtual ~RowStartsInfo();
diff --git a/src/LinAlg/hiopMatrixRajaSparseTripletImpl.hpp b/src/LinAlg/hiopMatrixRajaSparseTripletImpl.hpp
index 11919070f..fd538740f 100644
--- a/src/LinAlg/hiopMatrixRajaSparseTripletImpl.hpp
+++ b/src/LinAlg/hiopMatrixRajaSparseTripletImpl.hpp
@@ -1123,8 +1123,6 @@ copyRowsBlockFrom(const hiopMatrix& src_gen,
   assert(n_rows + rows_src_idx_st <= src.m());
   assert(n_rows + rows_dst_idx_st <= this->m());
 
-  index_type* src_row_st = src.row_starts_->idx_start_;
-
   const index_type* iRow_src = src.i_row();
   const index_type* jCol_src = src.j_col();
   const double* values_src = src.M();
@@ -1143,6 +1141,8 @@ copyRowsBlockFrom(const hiopMatrix& src_gen,
   }
   assert(src.row_starts_);
 
+  index_type* src_row_st = src.row_starts_->idx_start_;
+  
   //
   // The latest CPU code can be found in 342eb99ec16d45f57a492be1bf1e39cce73995a5
   // It is replaced by RAJA::inclusive_scan after that commit
@@ -1165,10 +1165,12 @@ copyRowsBlockFrom(const hiopMatrix& src_gen,
   umpire::Allocator hostalloc = rm.getAllocator("HOST");
 
   int *next_row_nnz = static_cast<size_type*>(hostalloc.allocate(sizeof(size_type)));
+  index_type register_row_st = row_starts_->register_row_st_;
 
   rm.copy(next_row_nnz, dst_row_st_dev+1+rows_dst_idx_st, 1*sizeof(size_type));
 
   if(next_row_nnz[0] == 0) {
+    assert(rows_dst_idx_st >= register_row_st);
     // comput nnz in each row from source
     RAJA::forall<hiop_raja_exec>(
       RAJA::RangeSegment(0, n_rows),
@@ -1179,7 +1181,8 @@ copyRowsBlockFrom(const hiopMatrix& src_gen,
         dst_row_st_dev[row_dst+1] = src_row_st[row_src+1] - src_row_st[row_src];
       }
     );
-    RAJA::inclusive_scan_inplace<hiop_raja_exec>(RAJA::make_span(dst_row_st_dev, n_rows+1), RAJA::operators::plus<index_type>());
+    RAJA::inclusive_scan_inplace<hiop_raja_exec>(RAJA::make_span(dst_row_st_dev+register_row_st, n_rows+1+rows_dst_idx_st-register_row_st), RAJA::operators::plus<index_type>());
+    row_starts_->register_row_st_ = n_rows + rows_dst_idx_st;
   }
 
   index_type* dst_row_st = row_starts_->idx_start_;
@@ -1191,7 +1194,7 @@ copyRowsBlockFrom(const hiopMatrix& src_gen,
       const index_type row_src = rows_src_idx_st + row_add;
       const index_type row_dst = rows_dst_idx_st + row_add;
       index_type k_src = src_row_st[row_src];
-      index_type k_dst = dst_row_st[row_dst];
+      index_type k_dst = dst_row_st[row_dst] - dst_row_st[rows_dst_idx_st] + dest_nnz_st;
   
       // copy from src
       while(k_src < src_row_st[row_src+1]) {
@@ -1283,7 +1286,8 @@ template<class MEMBACKEND, class RAJAEXECPOL>
 hiopMatrixRajaSparseTriplet<MEMBACKEND, RAJAEXECPOL>::RowStartsInfo::
 RowStartsInfo(size_type n_rows, std::string memspace)
   : num_rows_(n_rows),
-    mem_space_(memspace)
+    mem_space_(memspace), 
+    register_row_st_{0}
 {
   auto& resmgr = umpire::ResourceManager::getInstance();
   umpire::Allocator alloc = resmgr.getAllocator(mem_space_);
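
The register_row_st_ bookkeeping above lets repeated copyRowsBlockFrom calls
build the row-start array incrementally: each call scans only from the last
registered offset instead of re-scanning (and double-counting) the prefix
filled in by earlier calls. A minimal host-side sketch of the sub-span scan,
using the same RAJA span interface as the code above:

    #include <cstdio>
    #include "RAJA/RAJA.hpp"

    int main()
    {
      // row_st holds cumulative offsets up to index 2 (rows 0-1 were
      // registered by an earlier call); the tail holds per-row counts.
      int row_st[6] = {0, 2, 3, 1, 4, 2};
      const int register_row_st = 2;
      RAJA::inclusive_scan_inplace<RAJA::seq_exec>(
          RAJA::make_span(row_st + register_row_st, 6 - register_row_st),
          RAJA::operators::plus<int>{});
      for(int v : row_st) {
        std::printf("%d ", v); // prints: 0 2 3 4 8 10
      }
      std::printf("\n");
      return 0;
    }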
diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp
index 7a1360bf2..8a6419f7e 100644
--- a/src/Optimization/hiopKKTLinSysSparse.cpp
+++ b/src/Optimization/hiopKKTLinSysSparse.cpp
@@ -63,6 +63,10 @@
 #ifdef HIOP_USE_GINKGO
 #include "hiopLinSolverSparseGinkgo.hpp"
 #endif
+
+#ifdef HIOP_USE_GPU
+#include "hiopMatrixRajaSparseTriplet.hpp"
+#endif
 #endif
 
 namespace hiop
@@ -184,7 +188,12 @@ namespace hiop
       write_linsys_counter_++;
     }
     if(write_linsys_counter_>=0) {
-      csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq);
+#ifndef HIOP_USE_GPU
+      auto* MsysSp = dynamic_cast<hiopMatrixSparseTriplet*>(linSys->sys_matrix());
+      csr_writer_.writeMatToFile(*MsysSp, write_linsys_counter_, nx, neq, nineq);
+#else
+      // TODO: csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq);
+#endif
     }
 
     return true;
@@ -418,14 +427,13 @@ namespace hiop
     delta_wd_ = perturb_calc_->get_curr_delta_wd();
     delta_cc_ = perturb_calc_->get_curr_delta_cc();
     delta_cd_ = perturb_calc_->get_curr_delta_cd();
-    
-    HessSp_ = dynamic_cast<hiopMatrixSymSparseTriplet*>(Hess_);
-    if(!HessSp_) { assert(false); return false; }
 
-    Jac_cSp_ = dynamic_cast<const hiopMatrixSparseTriplet*>(Jac_c_);
+    HessSp_ = dynamic_cast<hiopMatrixSparse*>(Hess_);
+    Jac_cSp_ = dynamic_cast<const hiopMatrixSparse*>(Jac_c_);
+    Jac_dSp_ = dynamic_cast<const hiopMatrixSparse*>(Jac_d_);
+ 
+    if(!HessSp_) { assert(false); return false; }
     if(!Jac_cSp_) { assert(false); return false; }
-
-    Jac_dSp_ = dynamic_cast<const hiopMatrixSparseTriplet*>(Jac_d_);
     if(!Jac_dSp_) { assert(false); return false; }
 
     size_type nx = HessSp_->n(), nd=Jac_dSp_->m(), neq=Jac_cSp_->m(), nineq=Jac_dSp_->m();
@@ -436,7 +444,7 @@ namespace hiop
     auto* linSys = dynamic_cast<hiopLinSolverSymSparse*> (linSys_);
     assert(linSys);
 
-    auto* Msys = dynamic_cast<hiopMatrixSparseTriplet*>(linSys->sys_matrix());
+    auto* Msys = dynamic_cast<hiopMatrixSparse*>(linSys->sys_matrix());
     assert(Msys);
     if(perf_report_) {
       nlp_->log->printf(hovSummary,
@@ -510,7 +518,12 @@ namespace hiop
       write_linsys_counter_++;
     }
     if(write_linsys_counter_>=0) {
-      csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq);
+#ifndef HIOP_USE_GPU
+      auto* MsysSp = dynamic_cast<hiopMatrixSparseTriplet*>(linSys->sys_matrix());
+      csr_writer_.writeMatToFile(*MsysSp, write_linsys_counter_, nx, neq, nineq);
+#else
+      // TODO: csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq);
+#endif
     }
 
     return true;
@@ -750,13 +763,13 @@ namespace hiop
         //
         // We don't allow CPU linear solvers.
         /////////////////////////////////////////////////////////////////////////////////////////////
-        assert(false && "KKT_SPARSE_XDYcYd linsys: GPU compute mode not yet supported.");
-        assert(false == safe_mode_);
+        // assert(false && "KKT_SPARSE_XDYcYd linsys: GPU compute mode not yet supported.");
+        // assert(false == safe_mode_);
         assert(nullptr == linSys_);
         
         if(linear_solver == "cusolver-lu" || linear_solver == "auto") {
 #if defined(HIOP_USE_CUSOLVER_LU)        
-          linSys_ = new hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp_);
+          linSys_ = new hiopLinSolverSymSparseCUSOLVERGPU(n, nnz, nlp_);
           nlp_->log->printf(hovScalars,
                             "KKT_SPARSE_XDYcYd linsys: alloc CUSOLVER-LU size %d (%d cons) (gpu)\n",
                             n,
@@ -1046,7 +1059,12 @@ namespace hiop
       write_linsys_counter_++;
     }
     if(write_linsys_counter_>=0) {
-      csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq);
+#ifndef HIOP_USE_GPU
+      auto* MsysSp = dynamic_cast<hiopMatrixSparseTriplet*>(linSys->sys_matrix());
+      csr_writer_.writeMatToFile(*MsysSp, write_linsys_counter_, nx, neq, nineq);
+#else
+      // TODO: csr_writer_.writeMatToFile(*Msys, write_linsys_counter_, nx, neq, nineq);
+#endif
     }
 
     return true;
diff --git a/src/Optimization/hiopKKTLinSysSparseCondensed.hpp b/src/Optimization/hiopKKTLinSysSparseCondensed.hpp
index 1e0747c61..cb9c6fc8d 100644
--- a/src/Optimization/hiopKKTLinSysSparseCondensed.hpp
+++ b/src/Optimization/hiopKKTLinSysSparseCondensed.hpp
@@ -210,7 +210,6 @@ class hiopKKTLinSysCondensedSparse : public hiopKKTLinSysCompressedSparseXDYcYd
     } else {
       //(opt_compute_mode == "hybrid" || opt_compute_mode == "gpu") {
 #ifdef HIOP_USE_CUDA
-      assert(opt_compute_mode != "gpu" && "When code is GPU-ready, remove this method");
       return "CUDA";
 #else
       assert(false && "compute mode not supported without HIOP_USE_CUDA build");

From cbe3a5331a710a2294a2a741b6377ce342c55d1d Mon Sep 17 00:00:00 2001
From: nychiang <sorakid507@gmail.com>
Date: Wed, 1 Mar 2023 21:29:17 -0800
Subject: [PATCH 05/10] fix cusolver for hybrid mode

---
 src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp | 32 ++++++++++++++++------
 src/Optimization/hiopKKTLinSysSparse.cpp   |  4 +--
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp b/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp
index e4a51f5f7..5403d29a0 100644
--- a/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp
+++ b/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp
@@ -1641,15 +1641,22 @@ namespace hiop
     size_type nnz = M_->numberOfNonzeros();
     double* mval_dev = M_->M();
     double* mval_host= M_host_->M();
-    checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * nnz, cudaMemcpyDeviceToHost));
 
     index_type* irow_dev = M_->i_row();
     index_type* irow_host= M_host_->i_row();
-    checkCudaErrors(cudaMemcpy(irow_host, irow_dev, sizeof(index_type) * nnz, cudaMemcpyDeviceToHost));
-  
+
     index_type* jcol_dev = M_->j_col();
     index_type* jcol_host= M_host_->j_col();
-    checkCudaErrors(cudaMemcpy(jcol_host, jcol_dev, sizeof(index_type) * nnz, cudaMemcpyDeviceToHost));
+
+    if("device" == nlp_->options->GetString("mem_space")) {
+      checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * nnz, cudaMemcpyDeviceToHost));
+      checkCudaErrors(cudaMemcpy(irow_host, irow_dev, sizeof(index_type) * nnz, cudaMemcpyDeviceToHost));
+      checkCudaErrors(cudaMemcpy(jcol_host, jcol_dev, sizeof(index_type) * nnz, cudaMemcpyDeviceToHost));      
+    } else {
+      checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * nnz, cudaMemcpyHostToHost));
+      checkCudaErrors(cudaMemcpy(irow_host, irow_dev, sizeof(index_type) * nnz, cudaMemcpyHostToHost));
+      checkCudaErrors(cudaMemcpy(jcol_host, jcol_dev, sizeof(index_type) * nnz, cudaMemcpyHostToHost));
+    }
 
     return hiopLinSolverSymSparseCUSOLVER::matrixChanged();
   }
@@ -1658,12 +1665,21 @@ namespace hiop
   {
     double* mval_dev = x.local_data();
     double* mval_host= rhs_host_->local_data();
-    
-    checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * n_, cudaMemcpyDeviceToHost));
+   
+    if("device" == nlp_->options->GetString("mem_space")) {
+      checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * n_, cudaMemcpyDeviceToHost));
+    } else {
+      checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * n_, cudaMemcpyHostToHost));
+    }
 
     bool bret = hiopLinSolverSymSparseCUSOLVER::solve(*rhs_host_);
-    
-    checkCudaErrors(cudaMemcpy(mval_dev, mval_host, sizeof(double) * n_, cudaMemcpyHostToDevice));
+   
+    if("device" == nlp_->options->GetString("mem_space")) {
+      checkCudaErrors(cudaMemcpy(mval_dev, mval_host, sizeof(double) * n_, cudaMemcpyHostToDevice));
+    } else {
+      checkCudaErrors(cudaMemcpy(mval_dev, mval_host, sizeof(double) * n_, cudaMemcpyHostToHost));     
+    }
+ 
     return bret;
   }
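
The fix selects the cudaMemcpy direction from HiOp's "mem_space" option: in
"device" mode the matrix and right-hand side live on the GPU, while in hybrid
mode they are already host-resident and a host-to-host copy suffices, so the
same wrapper serves both compute modes. A hypothetical helper (not part of
HiOp) condensing the branches added above:

    #include <string>
    #include <cuda_runtime.h>

    inline cudaMemcpyKind kind_to_host(const std::string& mem_space)
    {
      return ("device" == mem_space) ? cudaMemcpyDeviceToHost : cudaMemcpyHostToHost;
    }

    inline cudaMemcpyKind kind_from_host(const std::string& mem_space)
    {
      return ("device" == mem_space) ? cudaMemcpyHostToDevice : cudaMemcpyHostToHost;
    }

With such helpers each transfer collapses to a single cudaMemcpy call; the
committed code keeps the explicit if/else instead.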
 
diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp
index 8a6419f7e..517b9fd79 100644
--- a/src/Optimization/hiopKKTLinSysSparse.cpp
+++ b/src/Optimization/hiopKKTLinSysSparse.cpp
@@ -338,7 +338,7 @@ namespace hiop
 
         if( (nullptr == linSys_ && linear_solver == "auto") || linear_solver == "cusolver-lu") {
 #if defined(HIOP_USE_CUSOLVER_LU)
-          linSys_ = new hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp_);
+          linSys_ = new hiopLinSolverSymSparseCUSOLVERGPU(n, nnz, nlp_);
           linsol_actual = "CUSOLVER-LU";
           auto* fact_acceptor_ic = dynamic_cast<hiopFactAcceptorIC*> (fact_acceptor_);
           if(fact_acceptor_ic) {
@@ -698,7 +698,7 @@ namespace hiop
         if(linear_solver == "cusolver-lu" || linear_solver == "auto") {
 #if defined(HIOP_USE_CUSOLVER_LU)
           actual_lin_solver = "CUSOLVER-LU";
-          linSys_ = new hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp_);
+          linSys_ = new hiopLinSolverSymSparseCUSOLVERGPU(n, nnz, nlp_);
           auto* fact_acceptor_ic = dynamic_cast<hiopFactAcceptorIC*> (fact_acceptor_);
           if(fact_acceptor_ic) {
             nlp_->log->printf(hovError,

From bed6ac821d30c0d3286ead023535b5b635841cf7 Mon Sep 17 00:00:00 2001
From: nychiang <sorakid507@gmail.com>
Date: Wed, 1 Mar 2023 21:58:31 -0800
Subject: [PATCH 06/10] update cmake file

---
 src/Drivers/Sparse/CMakeLists.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt
index 1d6c505a5..7c96910e4 100644
--- a/src/Drivers/Sparse/CMakeLists.txt
+++ b/src/Drivers/Sparse/CMakeLists.txt
@@ -14,10 +14,11 @@ if(HIOP_USE_RAJA)
       NlpSparseRajaEx2Driver.cpp 
       PROPERTIES LANGUAGE CUDA
     )
+  
+    add_executable(NlpSparseRajaEx2.exe  NlpSparseRajaEx2Driver.cpp  NlpSparseRajaEx2.cpp)
+    target_link_libraries(NlpSparseRajaEx2.exe HiOp::HiOp)
+    install(TARGETS NlpSparseRajaEx2.exe DESTINATION bin)
   endif()
-  add_executable(NlpSparseRajaEx2.exe  NlpSparseRajaEx2Driver.cpp  NlpSparseRajaEx2.cpp)
-  target_link_libraries(NlpSparseRajaEx2.exe HiOp::HiOp)
-  install(TARGETS NlpSparseRajaEx2.exe DESTINATION bin)
 endif()
 
 if(HIOP_BUILD_SHARED)
@@ -66,7 +67,7 @@ if(HIOP_USE_GINKGO)
   endif(HIOP_USE_HIP)
 endif(HIOP_USE_GINKGO)
 
-if(HIOP_USE_RAJA)
+if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_CUDA)
   add_test(NAME NlpSparseRaja2_1 COMMAND ${RUNCMD} bash -c "$<TARGET_FILE:NlpSparseRajaEx2.exe> 500 -selfcheck ")
 endif()
 

From 8d16e81b90561647c85220bc9acad82f3f6e876a Mon Sep 17 00:00:00 2001
From: nychiang <sorakid507@gmail.com>
Date: Thu, 2 Mar 2023 10:11:38 -0800
Subject: [PATCH 07/10] re-engineer the code

---
 src/Drivers/Sparse/NlpSparseRajaEx2.cpp    | 308 ++++++++++-----------
 src/LinAlg/hiopLinSolver.cpp               |   4 -
 src/LinAlg/hiopLinSolver.hpp               |   6 +-
 src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp |  57 ++--
 src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp |   1 +
 5 files changed, 182 insertions(+), 194 deletions(-)

diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
index 51c8a7232..1d730ba39 100644
--- a/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
+++ b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
@@ -154,19 +154,18 @@ bool SparseRajaEx2::get_vars_info(const size_type& n, double *xlow, double* xupp
 {
   assert(n==n_vars_);
 
-  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 3),
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 1),
     RAJA_LAMBDA(RAJA::Index_type i)
     {
-      if(i==0) {
-        xlow[i] = -1e20;
-        xupp[i] =  1e20;
-      } else if(i==1) {
-        xlow[i] = 0.0;
-        xupp[i] = 1e20;
-      } else { 
-        xlow[i] = 1.0;
-        xupp[i] = 10.0;
-      }
+      xlow[0] = -1e20;
+      xupp[0] =  1e20;
+      type[0] = hiopNonlinear;
+      xlow[1] = 0.0;
+      xupp[1] = 1e20;
+      type[1] = hiopNonlinear;
+      xlow[2] = 1.0;
+      xupp[2] = 10.0;
+      type[2] = hiopNonlinear;      
     });
 
   if(n>3) {
@@ -175,14 +174,10 @@ bool SparseRajaEx2::get_vars_info(const size_type& n, double *xlow, double* xupp
       {
         xlow[i] = 0.5;
         xupp[i] = 1e20;
+        type[i] = hiopNonlinear;
       });
   }
 
-  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
-    RAJA_LAMBDA(RAJA::Index_type i)
-    {
-      type[i] = hiopNonlinear;
-    });
   return true;
 }
 
@@ -190,54 +185,44 @@ bool SparseRajaEx2::get_cons_info(const size_type& m, double* clow, double* cupp
 {
   assert(m==n_cons_);
   size_type n = n_vars_;
-
-  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 2),
+  assert(m-1 == n-1+rankdefic_ineq_);
+  
+  // serial part
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 1),
     RAJA_LAMBDA(RAJA::Index_type i)
-    {
-      if(i==0) {
-        clow[i] = 10.0;
-        cupp[i] = 10.0;
-      } else {
-        clow[i] = 5.0;
-        cupp[i] = 1e20;
+    {      
+      clow[0] = 10.0;
+      cupp[0] = 10.0;
+      type[0] = hiopInterfaceBase::hiopNonlinear;
+      clow[1] = 5.0;
+      cupp[1] = 1e20;
+      type[1] = hiopInterfaceBase::hiopNonlinear;
+
+      if(rankdefic_ineq_) {
+        // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
+        clow[n-1] = -1e+20;
+        cupp[n-1] = 19.;
+        type[n-1] = hiopInterfaceBase::hiopNonlinear;
+      }
+
+      if(rankdefic_eq_) {
+          //  4*x_1 + 2*x_2 == 10
+          clow[m-1] = 10;
+          cupp[m-1] = 10;
+          type[m-1] = hiopInterfaceBase::hiopNonlinear;
       }
     });
 
   if(n>3) {
-    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2, 2+n-3),
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2, n-1),
       RAJA_LAMBDA(RAJA::Index_type conidx)
       {
         clow[conidx] = 1.0;
-        cupp[conidx] = 2*n_vars_;
+        cupp[conidx] = 2*n;
+        type[conidx] = hiopInterfaceBase::hiopNonlinear;
       });
   }
 
-  if(rankdefic_ineq_) {
-    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2+n-3, 2+n-3+1),
-      RAJA_LAMBDA(RAJA::Index_type conidx)
-      {
-        // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
-        clow[conidx] = -1e+20;
-        cupp[conidx] = 19.;
-      });
-  }
-
-  if(rankdefic_eq_) {
-    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2+n-3+rankdefic_ineq_, m),
-      RAJA_LAMBDA(RAJA::Index_type conidx)
-      {
-        //  4*x_1 + 2*x_2 == 10
-        clow[conidx] = 10;
-        cupp[conidx] = 10;
-      });
-  }
-
-  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, m),
-    RAJA_LAMBDA(RAJA::Index_type i)
-    {
-      type[i] = hiopInterfaceBase::hiopNonlinear;
-    });
-
   return true;
 }
 
@@ -290,46 +275,37 @@ bool SparseRajaEx2::eval_grad_f(const size_type& n, const double* x, bool new_x,
 
 bool SparseRajaEx2::eval_cons(const size_type& n, const size_type& m, const double* x, bool new_x, double* cons)
 {
-  assert(n==n_vars_); assert(m==n_cons_);
+  assert(n==n_vars_);
+  assert(m==n_cons_);
   assert(n_cons_==2+n-3+rankdefic_eq_+rankdefic_ineq_);
 
-  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 2),
+  // serial part
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 1),
     RAJA_LAMBDA(RAJA::Index_type i)
-    {
-      if(i==0) {
-        // --- constraint 1 body --->  4*x_1 + 2*x_2 == 10
-        cons[i] = 4*x[0] + 2*x[1];
-      } else if(i==1) {
-        // --- constraint 2 body ---> 2*x_1 + x_3
-        cons[i] = 2*x[0] + 1*x[2];
+    {      
+      // --- constraint 1 body --->  4*x_1 + 2*x_2 == 10
+      cons[0] = 4*x[0] + 2*x[1];
+      // --- constraint 2 body ---> 2*x_1 + x_3
+      cons[1] = 2*x[0] + 1*x[2];
+
+      if(rankdefic_ineq_) {
+        // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
+        cons[n-1] = 4*x[0] + 2*x[2];
+      }
+
+      if(rankdefic_eq_) {
+        //  4*x_1 + 2*x_2 == 10
+        cons[m-1] = 4*x[0] + 2*x[1];
       }
     });
 
-  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2, 2+n-3),
+  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2, n-1),
     RAJA_LAMBDA(RAJA::Index_type i)
     {
       // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4
       cons[i] = 2*x[0] + 0.5*x[i+1];
     });
 
-  if(rankdefic_ineq_) {
-    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2+n-3, 2+n-3+1),
-      RAJA_LAMBDA(RAJA::Index_type i)
-      {
-        // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
-        cons[i] = 4*x[0] + 2*x[2];
-      });
-  }
-
-  if(rankdefic_eq_) {
-    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2+n-3+rankdefic_ineq_, m),
-      RAJA_LAMBDA(RAJA::Index_type i)
-      {
-        //  4*x_1 + 2*x_2 == 10
-        cons[i] = 4*x[0] + 2*x[1];
-      });
-  }
-
   return true;
 }
 
@@ -346,87 +322,85 @@ bool SparseRajaEx2::eval_Jac_cons(const size_type& n,
   assert(n>=3);
 
   assert(nnzJacS == 4 + 2*(n-3) + 2*rankdefic_eq_ + 2*rankdefic_ineq_);
-
-  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 2),
-    RAJA_LAMBDA(RAJA::Index_type itrow)
-    {
-      if(itrow==0) {
+ 
+  if(iJacS !=nullptr && jJacS != nullptr) {
+    // serial part
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 1),
+      RAJA_LAMBDA(RAJA::Index_type itrow)
+      {
         // --- constraint 1 body --->  4*x_1 + 2*x_2 == 10
-        if(iJacS !=nullptr && jJacS != nullptr) {
-          iJacS[2*itrow] = itrow;
-          jJacS[2*itrow] = 0;
-          iJacS[2*itrow+1] = itrow;
-          jJacS[2*itrow+1] = 1;
-        }
-        if(MJacS != nullptr) {
-          MJacS[2*itrow] = 4.0;
-          MJacS[2*itrow+1] = 2.0;
-        }
-      } else if(itrow==1) {
+        iJacS[0] = 0;
+        jJacS[0] = 0;
+        iJacS[1] = 0;
+        jJacS[1] = 1;
         // --- constraint 2 body ---> 2*x_1 + x_3
-        if(iJacS !=nullptr && jJacS != nullptr) { 
-          iJacS[2*itrow] = itrow;
-          jJacS[2*itrow] = 0;
-          iJacS[2*itrow+1] = itrow;
-          jJacS[2*itrow+1] = 2;
+        iJacS[2] = 1;
+        jJacS[2] = 0;
+        iJacS[3] = 1;
+        jJacS[3] = 2;
+
+        if(rankdefic_ineq_) {   
+          // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
+          iJacS[2*n-2] = n-1;
+          jJacS[2*n-2] = 0;
+          iJacS[2*n-1] = n-1;
+          jJacS[2*n-1] = 2;
         }
-        if(MJacS != nullptr) {
-          MJacS[2*itrow] = 2.0;
-          MJacS[2*itrow+1] = 1.0;
-        }
-      }
-    });
 
-  RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2, n-1),
-    RAJA_LAMBDA(RAJA::Index_type itrow)
-    {
-      // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4
-      if(iJacS !=nullptr && jJacS != nullptr) {
+        if(rankdefic_eq_) {
+          //  4*x_1 + 2*x_2 == 10
+          iJacS[2*m-2] = m-1;
+          jJacS[2*m-2] = 0;
+          iJacS[2*m-1] = m-1;
+          jJacS[2*m-1] = 1;
+        }
+      });
+    
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2, n-1),
+      RAJA_LAMBDA(RAJA::Index_type itrow)
+      {
+        // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4
         iJacS[2*itrow] = itrow;
         jJacS[2*itrow] = 0;
         iJacS[2*itrow+1] = itrow;
         jJacS[2*itrow+1] = itrow+1;
-      }
-      if(MJacS != nullptr) {
-        MJacS[2*itrow] = 2.0;
-        MJacS[2*itrow+1] = 0.5;
-      }
-    });
+      });
+
+  }
 
-  if(rankdefic_ineq_) {
-    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(n-1, n),
+  if(MJacS != nullptr) {
+    // serial part
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 1),
       RAJA_LAMBDA(RAJA::Index_type itrow)
       {
-        // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
-        if(iJacS !=nullptr && jJacS != nullptr) {
-          iJacS[2*itrow] = itrow;
-          jJacS[2*itrow] = 0;
-          iJacS[2*itrow+1] = itrow;
-          jJacS[2*itrow+1] = 2;
-        }
-        if(MJacS != nullptr) {
-          MJacS[2*itrow] = 4.0;
-          MJacS[2*itrow+1] = 2.0;
+        // --- constraint 1 body --->  4*x_1 + 2*x_2 == 10
+        MJacS[0] = 4.0;
+        MJacS[1] = 2.0;
+        // --- constraint 2 body ---> 2*x_1 + x_3
+        MJacS[2] = 2.0;
+        MJacS[3] = 1.0;
+
+        if(rankdefic_ineq_) {   
+          // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
+          MJacS[2*n-2] = 4.0;
+          MJacS[2*n-1] = 2.0;
+        }      
+
+        if(rankdefic_eq_) {
+          //  4*x_1 + 2*x_2 == 10
+          MJacS[2*m-2] = 4.0;
+          MJacS[2*m-1] = 2.0;
         }
       });
-  }
 
-  if(rankdefic_eq_) {
-    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(n-1+rankdefic_ineq_, n+rankdefic_ineq_),
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(2, n-1),
       RAJA_LAMBDA(RAJA::Index_type itrow)
       {
-        //  4*x_1 + 2*x_2 == 10
-        if(iJacS !=nullptr && jJacS != nullptr) {
-          iJacS[2*itrow] = itrow;
-          jJacS[2*itrow] = 0;
-          iJacS[2*itrow+1] = itrow;
-          jJacS[2*itrow+1] = 1;
-        }
-        if(MJacS != nullptr) {
-          MJacS[2*itrow] = 4.0;
-          MJacS[2*itrow+1] = 2.0;
-        }
+        // --- constraint 3 body ---> 2*x_1 + 0.5*x_i, for i>=4
+        MJacS[2*itrow] = 2.0;
+        MJacS[2*itrow+1] = 0.5;
       });
+
   }
 
   return true;
@@ -444,29 +418,29 @@ bool SparseRajaEx2::eval_Hess_Lagr(const size_type& n,
                                index_type* jHSS,
                                double* MHSS)
 {
-    //Note: lambda is not used since all the constraints are linear and, therefore, do
-    //not contribute to the Hessian of the Lagrangian
-    assert(nnzHSS == n);
-
-    if(iHSS!=nullptr && jHSS!=nullptr) {
-      RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
-        RAJA_LAMBDA(RAJA::Index_type i)
-        {
-          iHSS[i] = i;
-          jHSS[i] = i;
-        });
-    }
+  //Note: lambda is not used since all the constraints are linear and, therefore, do
+  //not contribute to the Hessian of the Lagrangian
+  assert(nnzHSS == n);
 
-    int convex_obj = (int) convex_obj_;
-    double scal_neg_obj = scal_neg_obj_;
-    if(MHSS!=nullptr) {
-      RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
-        RAJA_LAMBDA(RAJA::Index_type i)
-        {
-          MHSS[i] = obj_factor * ( (2*convex_obj-1) * scal_neg_obj * 3 * std::pow(x[i]-1., 2) + 1);
-        });
-    }
-    return true;
+  if(iHSS!=nullptr && jHSS!=nullptr) {
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
+      RAJA_LAMBDA(RAJA::Index_type i)
+      {
+        iHSS[i] = i;
+        jHSS[i] = i;
+      });
+  }
+
+  int convex_obj = (int) convex_obj_;
+  double scal_neg_obj = scal_neg_obj_;
+  if(MHSS!=nullptr) {
+    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, n),
+      RAJA_LAMBDA(RAJA::Index_type i)
+      {
+        MHSS[i] = obj_factor * ( (2*convex_obj-1) * scal_neg_obj * 3 * std::pow(x[i]-1., 2) + 1);
+      });
+  }
+  return true;
 }
 
 
diff --git a/src/LinAlg/hiopLinSolver.cpp b/src/LinAlg/hiopLinSolver.cpp
index 480aa9e28..868981175 100644
--- a/src/LinAlg/hiopLinSolver.cpp
+++ b/src/LinAlg/hiopLinSolver.cpp
@@ -100,7 +100,6 @@ namespace hiop {
     //this constructor (will call the 1-parameter constructor below) so they avoid creating
     //the triplet matrix
     M_ = LinearAlgebraFactory::create_matrix_sparse(nlp->options->GetString("mem_space"), n, n, nnz);
-    M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz);
     //this class will own `M_`
     sys_mat_owned_ = true;
     nlp_ = nlp;
@@ -113,14 +112,12 @@ namespace hiop {
     sys_mat_owned_ = false;
     nlp_ = nlp;
     perf_report_ = "on"==hiop::tolower(nlp->options->GetString("time_kkt"));
-    M_host_ = nullptr;
   }
 
   hiopLinSolverSymSparse::hiopLinSolverSymSparse(hiopMatrixSparse* M, hiopNlpFormulation* nlp)
   {
     M_ = M;
     sys_mat_owned_ = false;
-    M_host_ = nullptr;
     nlp_ = nlp;
     perf_report_ = "on"==hiop::tolower(nlp->options->GetString("time_kkt"));
   }
@@ -128,7 +125,6 @@ namespace hiop {
   hiopLinSolverNonSymSparse::hiopLinSolverNonSymSparse(int n, int nnz, hiopNlpFormulation* nlp)
   {
     M_ = LinearAlgebraFactory::create_matrix_sparse(nlp->options->GetString("mem_space"), n, n, nnz);
-    M_host_ = nullptr;
     sys_mat_owned_ = false;
     nlp_ = nlp;
     perf_report_ = "on"==hiop::tolower(nlp->options->GetString("time_kkt"));
diff --git a/src/LinAlg/hiopLinSolver.hpp b/src/LinAlg/hiopLinSolver.hpp
index db953be1f..79fe53bc5 100644
--- a/src/LinAlg/hiopLinSolver.hpp
+++ b/src/LinAlg/hiopLinSolver.hpp
@@ -136,8 +136,7 @@ class hiopLinSolverSparseBase : public hiopLinSolver
 public:
   hiopLinSolverSparseBase()
     : M_(nullptr),
-      sys_mat_owned_(true),
-      M_host_(nullptr)
+      sys_mat_owned_(true)
   {
   }
   
@@ -145,7 +144,6 @@ class hiopLinSolverSparseBase : public hiopLinSolver
   {
     if(sys_mat_owned_) {
       delete M_;
-      delete M_host_;
     }
   }
   
@@ -159,14 +157,12 @@ class hiopLinSolverSparseBase : public hiopLinSolver
     if(sys_mat_owned_) {
       assert(false && "system matrix should not have been allocated internally when calling set_sys_matrix");
       delete M_;
-      delete M_host_;
     }
     sys_mat_owned_ = false;
     M_ = M;
   }
 protected:
   hiopMatrixSparse* M_;
-  hiopMatrixSparse* M_host_;  // TODO: for cusolver, delete this later! 
   bool sys_mat_owned_;
 };
 
diff --git a/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp b/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp
index 5403d29a0..b50537324 100644
--- a/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp
+++ b/src/LinAlg/hiopLinSolverSparseCUSOLVER.cpp
@@ -252,12 +252,12 @@ namespace hiop
     } else {
       // update matrix
       for(int k = 0; k < nnz_; k++) {
-        kVal_[k] = M_host_->M()[index_covert_CSR2Triplet_[k]];
+        kVal_[k] = M_->M()[index_covert_CSR2Triplet_[k]];
       }
       for(int i = 0; i < n_; i++) {
         if(index_covert_extra_Diag2CSR_[i] != -1)
           kVal_[index_covert_extra_Diag2CSR_[i]]
-            += M_host_->M()[M_host_->numberOfNonzeros() - n_ + i];
+            += M_->M()[M_->numberOfNonzeros() - n_ + i];
       }
     } // else
 
@@ -469,10 +469,10 @@ namespace hiop
     //
     // off-diagonal part
     kRowPtr_[0] = 0;
-    for(int k = 0; k < M_host_->numberOfNonzeros() - n_; k++) {
-      if(M_host_->i_row()[k] != M_host_->j_col()[k]) {
-        kRowPtr_[M_host_->i_row()[k] + 1]++;
-        kRowPtr_[M_host_->j_col()[k] + 1]++;
+    for(int k = 0; k < M_->numberOfNonzeros() - n_; k++) {
+      if(M_->i_row()[k] != M_->j_col()[k]) {
+        kRowPtr_[M_->i_row()[k] + 1]++;
+        kRowPtr_[M_->j_col()[k] + 1]++;
         nnz_ += 2;
       }
     }
@@ -502,16 +502,16 @@ namespace hiop
       index_covert_extra_Diag2CSR_[k] = -1;
     }
 
-    for(int k = 0; k < M_host_->numberOfNonzeros() - n_; k++) {
-      rowID_tmp = M_host_->i_row()[k];
-      colID_tmp = M_host_->j_col()[k];
+    for(int k = 0; k < M_->numberOfNonzeros() - n_; k++) {
+      rowID_tmp = M_->i_row()[k];
+      colID_tmp = M_->j_col()[k];
       if(rowID_tmp == colID_tmp) {
         nnz_tmp = nnz_each_row_tmp[rowID_tmp] + kRowPtr_[rowID_tmp];
         jCol_[nnz_tmp] = colID_tmp;
-        kVal_[nnz_tmp] = M_host_->M()[k];
+        kVal_[nnz_tmp] = M_->M()[k];
         index_covert_CSR2Triplet_[nnz_tmp] = k;
 
-        kVal_[nnz_tmp] += M_host_->M()[M_host_->numberOfNonzeros() - n_ + rowID_tmp];
+        kVal_[nnz_tmp] += M_->M()[M_->numberOfNonzeros() - n_ + rowID_tmp];
         index_covert_extra_Diag2CSR_[rowID_tmp] = nnz_tmp;
 
         nnz_each_row_tmp[rowID_tmp]++;
@@ -519,12 +519,12 @@ namespace hiop
       } else {
         nnz_tmp = nnz_each_row_tmp[rowID_tmp] + kRowPtr_[rowID_tmp];
         jCol_[nnz_tmp] = colID_tmp;
-        kVal_[nnz_tmp] = M_host_->M()[k];
+        kVal_[nnz_tmp] = M_->M()[k];
         index_covert_CSR2Triplet_[nnz_tmp] = k;
 
         nnz_tmp = nnz_each_row_tmp[colID_tmp] + kRowPtr_[colID_tmp];
         jCol_[nnz_tmp] = rowID_tmp;
-        kVal_[nnz_tmp] = M_host_->M()[k];
+        kVal_[nnz_tmp] = M_->M()[k];
         index_covert_CSR2Triplet_[nnz_tmp] = k;
 
         nnz_each_row_tmp[rowID_tmp]++;
@@ -538,8 +538,8 @@ namespace hiop
         assert(nnz_each_row_tmp[i] == kRowPtr_[i + 1] - kRowPtr_[i] - 1);
         nnz_tmp = nnz_each_row_tmp[i] + kRowPtr_[i];
         jCol_[nnz_tmp] = i;
-        kVal_[nnz_tmp] = M_host_->M()[M_host_->numberOfNonzeros() - n_ + i];
-        index_covert_CSR2Triplet_[nnz_tmp] = M_host_->numberOfNonzeros() - n_ + i;
+        kVal_[nnz_tmp] = M_->M()[M_->numberOfNonzeros() - n_ + i];
+        index_covert_CSR2Triplet_[nnz_tmp] = M_->numberOfNonzeros() - n_ + i;
         total_nnz_tmp += 1;
 
         std::vector<int> ind_temp(kRowPtr_[i + 1] - kRowPtr_[i]);
@@ -1626,14 +1626,17 @@ namespace hiop
                                                                        const int& nnz, 
                                                                        hiopNlpFormulation* nlp)
     : hiopLinSolverSymSparseCUSOLVER(n, nnz, nlp), 
-      rhs_host_{nullptr}
+      rhs_host_{nullptr},
+      M_host_{nullptr}
   {
     rhs_host_ = LinearAlgebraFactory::create_vector("default", n);
+    M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz);
   }
   
   hiopLinSolverSymSparseCUSOLVERGPU::~hiopLinSolverSymSparseCUSOLVERGPU()
   {
     delete rhs_host_;
+    delete M_host_;
   }
 
   int hiopLinSolverSymSparseCUSOLVERGPU::matrixChanged()
@@ -1657,8 +1660,18 @@ namespace hiop
       checkCudaErrors(cudaMemcpy(irow_host, irow_dev, sizeof(index_type) * nnz, cudaMemcpyHostToHost));
       checkCudaErrors(cudaMemcpy(jcol_host, jcol_dev, sizeof(index_type) * nnz, cudaMemcpyHostToHost));
     }
+    
+    hiopMatrixSparse* swap_ptr = M_;
+    M_ = M_host_;
+    M_host_ = swap_ptr;
+    
+    int vret = hiopLinSolverSymSparseCUSOLVER::matrixChanged();
 
-    return hiopLinSolverSymSparseCUSOLVER::matrixChanged();
+    swap_ptr = M_;
+    M_ = M_host_;
+    M_host_ = swap_ptr;
+    
+    return vret;
   }
   
   bool hiopLinSolverSymSparseCUSOLVERGPU::solve(hiopVector& x)
@@ -1672,8 +1685,16 @@ namespace hiop
       checkCudaErrors(cudaMemcpy(mval_host, mval_dev, sizeof(double) * n_, cudaMemcpyHostToHost));
     }
 
+    hiopMatrixSparse* swap_ptr = M_;
+    M_ = M_host_;
+    M_host_ = swap_ptr;
+
     bool bret = hiopLinSolverSymSparseCUSOLVER::solve(*rhs_host_);
-   
+
+    swap_ptr = M_;
+    M_ = M_host_;
+    M_host_ = swap_ptr;
+
     if("device" == nlp_->options->GetString("mem_space")) {
       checkCudaErrors(cudaMemcpy(mval_dev, mval_host, sizeof(double) * n_, cudaMemcpyHostToDevice));
     } else {
diff --git a/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp b/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp
index db6ea84e0..1833afe67 100644
--- a/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp
+++ b/src/LinAlg/hiopLinSolverSparseCUSOLVER.hpp
@@ -223,6 +223,7 @@ class hiopLinSolverSymSparseCUSOLVERGPU : public hiopLinSolverSymSparseCUSOLVER
 
 private:
   hiopVector* rhs_host_;
+  hiopMatrixSparse* M_host_;
 };
 
 // Forward declaration of LU class
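
The re-engineered wrapper no longer rewrites the base class to read a host
matrix; instead it temporarily swaps M_ with the host mirror M_host_, so the
host-only CUSOLVER routines, which access M_ directly, see valid host memory,
and the device view is restored afterwards. A standalone sketch of the
swap-and-delegate idea (all names hypothetical; the committed code uses an
explicit temporary rather than std::swap):

    #include <utility>

    struct Base {
      int* data = nullptr; // host-only code reads this member directly
      int use() { return data[0]; }
    };

    struct Derived : Base {
      int* host_mirror = nullptr;
      int use_via_mirror()
      {
        std::swap(data, host_mirror); // Base::use() now sees the host mirror
        const int r = Base::use();
        std::swap(data, host_mirror); // restore the original (device) pointer
        return r;
      }
    };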

From 353268c27052b65deab71d585a38b5b08495e5a8 Mon Sep 17 00:00:00 2001
From: nychiang <sorakid507@gmail.com>
Date: Mon, 6 Mar 2023 16:42:19 -0800
Subject: [PATCH 08/10] try to fix the Marianas RAJA issue

---
 src/Drivers/Sparse/NlpSparseRajaEx2.cpp | 32 +++++++++++++++++--------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
index 1d730ba39..55628a224 100644
--- a/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
+++ b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
@@ -186,6 +186,10 @@ bool SparseRajaEx2::get_cons_info(const size_type& m, double* clow, double* cupp
   assert(m==n_cons_);
   size_type n = n_vars_;
   assert(m-1 == n-1+rankdefic_ineq_);
+
+  // copy data members into locals: RAJA device lambdas cannot capture 'this' members
+  bool rankdefic_eq = rankdefic_eq_;
+  bool rankdefic_ineq = rankdefic_ineq_;
   
   // serial part
   RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 1),
@@ -198,14 +202,14 @@ bool SparseRajaEx2::get_cons_info(const size_type& m, double* clow, double* cupp
       cupp[1] = 1e20;
       type[1] = hiopInterfaceBase::hiopNonlinear;
 
-      if(rankdefic_ineq_) {
+      if(rankdefic_ineq) {
         // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
         clow[n-1] = -1e+20;
         cupp[n-1] = 19.;
         type[n-1] = hiopInterfaceBase::hiopNonlinear;
       }
 
-      if(rankdefic_eq_) {
+      if(rankdefic_eq) {
           //  4*x_1 + 2*x_2 == 10
           clow[m-1] = 10;
           cupp[m-1] = 10;
@@ -279,6 +283,10 @@ bool SparseRajaEx2::eval_cons(const size_type& n, const size_type& m, const doub
   assert(m==n_cons_);
   assert(n_cons_==2+n-3+rankdefic_eq_+rankdefic_ineq_);
 
+  // copy data members into locals: RAJA device lambdas cannot capture 'this' members
+  bool rankdefic_eq = rankdefic_eq_;
+  bool rankdefic_ineq = rankdefic_ineq_;
+
   // serial part
   RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 1),
     RAJA_LAMBDA(RAJA::Index_type i)
@@ -288,12 +296,12 @@ bool SparseRajaEx2::eval_cons(const size_type& n, const size_type& m, const doub
       // --- constraint 2 body ---> 2*x_1 + x_3
       cons[1] = 2*x[0] + 1*x[2];
 
-      if(rankdefic_ineq_) {
+      if(rankdefic_ineq) {
         // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
         cons[n-1] = 4*x[0] + 2*x[2];
       }
 
-      if(rankdefic_eq_) {
+      if(rankdefic_eq) {
         //  4*x_1 + 2*x_2 == 10
         cons[m-1] = 4*x[0] + 2*x[1];
       }
@@ -322,7 +330,11 @@ bool SparseRajaEx2::eval_Jac_cons(const size_type& n,
   assert(n>=3);
 
   assert(nnzJacS == 4 + 2*(n-3) + 2*rankdefic_eq_ + 2*rankdefic_ineq_);
- 
+
+  // copy data members into locals: RAJA device lambdas cannot capture 'this' members
+  bool rankdefic_eq = rankdefic_eq_;
+  bool rankdefic_ineq = rankdefic_ineq_;
+
   if(iJacS !=nullptr && jJacS != nullptr) {
     // serial part
     RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 1),
@@ -339,7 +351,7 @@ bool SparseRajaEx2::eval_Jac_cons(const size_type& n,
         iJacS[3] = 1;
         jJacS[3] = 2;
 
-        if(rankdefic_ineq_) {   
+        if(rankdefic_ineq) {
           // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
           iJacS[2*n-2] = n-1;
           jJacS[2*n-2] = 0;
@@ -347,7 +359,7 @@ bool SparseRajaEx2::eval_Jac_cons(const size_type& n,
           jJacS[2*n-1] = 2;
         }
 
-        if(rankdefic_eq_) {
+        if(rankdefic_eq) {
           //  4*x_1 + 2*x_2 == 10
           iJacS[2*m-2] = m-1;
           jJacS[2*m-2] = 0;
@@ -380,13 +392,13 @@ bool SparseRajaEx2::eval_Jac_cons(const size_type& n,
         MJacS[2] = 2.0;
         MJacS[3] = 1.0;
 
-        if(rankdefic_ineq_) {   
+        if(rankdefic_ineq) {
           // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
           MJacS[2*n-2] = 4.0;
           MJacS[2*n-1] = 2.0;
-        }      
+        }
 
-        if(rankdefic_eq_) {
+        if(rankdefic_eq) {
           //  4*x_1 + 2*x_2 == 10
           MJacS[2*m-2] = 4.0;
           MJacS[2*m-1] = 2.0;
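
The comments about member objects are shorthand for a device-lambda capture
rule: reading a data member such as rankdefic_eq_ inside RAJA_LAMBDA implicitly
captures this, and in CUDA builds dereferencing a host this pointer in device
code is invalid. Copying the member into a local first makes the lambda capture
a plain value. A fragment restating the before/after (assuming the surrounding
class context):

    // broken on device: rankdefic_eq_ implicitly captures the host `this`
    // RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 1),
    //   RAJA_LAMBDA(RAJA::Index_type i) { if(rankdefic_eq_) { /* ... */ } });

    // safe: the local is captured by value into the device lambda
    bool rankdefic_eq = rankdefic_eq_;
    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(0, 1),
      RAJA_LAMBDA(RAJA::Index_type i)
      {
        if(rankdefic_eq) { /* ... */ }
      });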

From 5fdb4d547560f5bfbc2ccfebc872fd394006792e Mon Sep 17 00:00:00 2001
From: nychiang <sorakid507@gmail.com>
Date: Fri, 10 Mar 2023 16:02:57 -0800
Subject: [PATCH 09/10] host data for NonlinearityType

---
 src/Drivers/Sparse/NlpSparseRajaEx2.cpp | 32 ++++++++++++++++++-------
 1 file changed, 23 insertions(+), 9 deletions(-)

diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
index 55628a224..bd9fca748 100644
--- a/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
+++ b/src/Drivers/Sparse/NlpSparseRajaEx2.cpp
@@ -159,13 +159,13 @@ bool SparseRajaEx2::get_vars_info(const size_type& n, double *xlow, double* xupp
     {
       xlow[0] = -1e20;
       xupp[0] =  1e20;
-      type[0] = hiopNonlinear;
+//      type[0] = hiopNonlinear;
       xlow[1] = 0.0;
       xupp[1] = 1e20;
-      type[1] = hiopNonlinear;
+//      type[1] = hiopNonlinear;
       xlow[2] = 1.0;
       xupp[2] = 10.0;
-      type[2] = hiopNonlinear;      
+//      type[2] = hiopNonlinear;      
     });
 
   if(n>3) {
@@ -174,10 +174,17 @@ bool SparseRajaEx2::get_vars_info(const size_type& n, double *xlow, double* xupp
       {
         xlow[i] = 0.5;
         xupp[i] = 1e20;
-        type[i] = hiopNonlinear;
+//        type[i] = hiopNonlinear;
       });
   }
 
+  // Use a sequential policy for host computations for now
+  RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0, n),
+    [=] (RAJA::Index_type i)
+    {
+      type[i] = hiopNonlinear;
+    });
+
   return true;
 }
 
@@ -197,23 +204,23 @@ bool SparseRajaEx2::get_cons_info(const size_type& m, double* clow, double* cupp
     {      
       clow[0] = 10.0;
       cupp[0] = 10.0;
-      type[0] = hiopInterfaceBase::hiopNonlinear;
+//      type[0] = hiopInterfaceBase::hiopNonlinear;
       clow[1] = 5.0;
       cupp[1] = 1e20;
-      type[1] = hiopInterfaceBase::hiopNonlinear;
+//      type[1] = hiopInterfaceBase::hiopNonlinear;
 
       if(rankdefic_ineq) {
         // [-inf] <= 4*x_1 + 2*x_3 <= [ 19 ]
         clow[n-1] = -1e+20;
         cupp[n-1] = 19.;
-        type[n-1] = hiopInterfaceBase::hiopNonlinear;
+//        type[n-1] = hiopInterfaceBase::hiopNonlinear;
       }
 
       if(rankdefic_eq) {
           //  4*x_1 + 2*x_2 == 10
           clow[m-1] = 10;
           cupp[m-1] = 10;
-          type[m-1] = hiopInterfaceBase::hiopNonlinear;
+//          type[m-1] = hiopInterfaceBase::hiopNonlinear;
       }
     });
 
@@ -223,10 +230,17 @@ bool SparseRajaEx2::get_cons_info(const size_type& m, double* clow, double* cupp
       {
         clow[conidx] = 1.0;
         cupp[conidx] = 2*n;
-        type[conidx] = hiopInterfaceBase::hiopNonlinear;
+//        type[conidx] = hiopInterfaceBase::hiopNonlinear;
       });
   }
 
+  // Must be a sequential host policy for now
+  RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0, m),
+    [=] (RAJA::Index_type i)
+    {
+      type[i] = hiopNonlinear;
+    });
+
   return true;
 }
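
The convention this patch settles on, per the commit title: numeric bounds,
which the driver allocates in the chosen memory space, are written under the
(possibly device) ex2_raja_exec policy, while the NonlinearityType arrays stay
host-resident and are filled with RAJA::loop_exec, a sequential host policy.
Note the plain [=] capture instead of RAJA_LAMBDA, so the loop body is never
compiled for the device. A fragment restating the split (context as in
get_vars_info above):

    // device-resident bounds: device policy and device lambda
    RAJA::forall<ex2_raja_exec>(RAJA::RangeSegment(3, n),
      RAJA_LAMBDA(RAJA::Index_type i)
      {
        xlow[i] = 0.5;
        xupp[i] = 1e20;
      });

    // host-resident metadata: sequential host policy and plain host lambda
    RAJA::forall<RAJA::loop_exec>(RAJA::RangeSegment(0, n),
      [=] (RAJA::Index_type i)
      {
        type[i] = hiopNonlinear;
      });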
 

From 3a4ee914bf651ec1451aaa748c7523c94ee8d95d Mon Sep 17 00:00:00 2001
From: nychiang <sorakid507@gmail.com>
Date: Mon, 13 Mar 2023 10:52:53 -0700
Subject: [PATCH 10/10] clean the code

---
 src/Optimization/hiopKKTLinSysSparse.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp
index 517b9fd79..c3c43c832 100644
--- a/src/Optimization/hiopKKTLinSysSparse.cpp
+++ b/src/Optimization/hiopKKTLinSysSparse.cpp
@@ -64,9 +64,6 @@
 #include "hiopLinSolverSparseGinkgo.hpp"
 #endif
 
-#ifdef HIOP_USE_GPU
-#include "hiopMatrixRajaSparseTriplet.hpp"
-#endif
 #endif
 
 namespace hiop