Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CUDA, HIP and DPCPP batch bicgstab kernels #1443

Merged
merged 28 commits into from
Nov 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ff31a16
Add cuda batch bicgstab kernels
pratikvn Oct 26, 2023
b357c6b
Add hip bicgstab solver kernels
pratikvn Oct 26, 2023
cf9839a
Add dpcpp kernels
pratikvn Oct 27, 2023
1ef1f68
Fix dpcpp kernel issues
pratikvn Oct 28, 2023
6bcdd57
add mvec single rhs specializations
pratikvn Oct 29, 2023
20fe495
minor dpcpp fixes
pratikvn Oct 29, 2023
2a41fd7
Review updates
pratikvn Oct 29, 2023
5c0f4f4
Fix sycl group and subgroup sizes
pratikvn Oct 29, 2023
a3fe9bb
Format files
ginkgo-bot Oct 29, 2023
4072b50
Review updates
pratikvn Oct 30, 2023
84be7dd
Use synchronize for error handling
pratikvn Nov 1, 2023
c6e9543
Format files
ginkgo-bot Nov 1, 2023
1054b7b
Add scoped cuda shmem config
pratikvn Nov 1, 2023
cc22557
move max_shmem query to internal
pratikvn Nov 1, 2023
501c4e7
Update size_type in tests
pratikvn Nov 2, 2023
7b0ebfd
Update contributors.txt
pratikvn Nov 2, 2023
f1babfd
review updates
pratikvn Nov 2, 2023
221bba9
Format files
ginkgo-bot Nov 2, 2023
aa026c1
dpcpp group size and doc fixes
pratikvn Nov 2, 2023
79e5cad
use global_and_local barrier
pratikvn Nov 3, 2023
693d308
Fix Intel2020 apply call issue
pratikvn Nov 3, 2023
705339e
Fix diag_dominance and tol issue
pratikvn Nov 3, 2023
6729f68
Fix some include issues
pratikvn Nov 4, 2023
eebc06a
Review updates
pratikvn Nov 4, 2023
498512c
use fence_space::global_and_local
pratikvn Nov 4, 2023
1bc6d83
Use updated deferred factory macros.
pratikvn Nov 5, 2023
79e68b3
Review updates
pratikvn Nov 5, 2023
a1b84d4
Format files
ginkgo-bot Nov 5, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,28 @@ __global__ __launch_bounds__(
}


// Computes the conjugated dot product conj(x) . y of two single-rhs vectors
// cooperatively within one subgroup: each lane accumulates a subgroup-strided
// partial sum, the partials are reduced across the subgroup, and lane 0
// writes the final value into `result`.
template <typename Group, typename ValueType>
__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup,
                                                            const int num_rows,
                                                            const ValueType* x,
                                                            const ValueType* y,
                                                            ValueType& result)
{
    auto partial = zero<ValueType>();
    for (int row = subgroup.thread_rank(); row < num_rows;
         row += subgroup.size()) {
        partial += conj(x[row]) * y[row];
    }

    // combine the per-lane partial sums across the subgroup
    partial = reduce(subgroup, partial, thrust::plus<ValueType>{});

    // only a single lane stores the reduced value
    if (subgroup.thread_rank() == 0) {
        result = partial;
    }
}


template <typename Group, typename ValueType, typename Mapping>
__device__ __forceinline__ void gen_one_dot(
const gko::batch::multi_vector::batch_item<const ValueType>& x,
Expand Down Expand Up @@ -165,6 +187,27 @@ __launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_
}


// Computes the Euclidean (2-)norm of a single-rhs vector cooperatively
// within one subgroup: each lane accumulates a subgroup-strided partial sum
// of squared absolute values, the partials are reduced across the subgroup,
// and lane 0 writes sqrt of the total into `result`.
template <typename Group, typename ValueType>
__device__ __forceinline__ void single_rhs_compute_norm2(
    Group subgroup, const int num_rows, const ValueType* x,
    remove_complex<ValueType>& result)
{
    // remove_complex is an alias template, so no `typename` is required
    using real_type = remove_complex<ValueType>;
    real_type val = zero<real_type>();

    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
        val += squared_norm(x[r]);
    }

    // subgroup level reduction; use real_type consistently with the
    // accumulator instead of re-spelling remove_complex<ValueType>
    val = reduce(subgroup, val, thrust::plus<real_type>{});

    if (subgroup.thread_rank() == 0) {
        result = sqrt(val);
    }
}


template <typename Group, typename ValueType>
__device__ __forceinline__ void one_norm2(
const gko::batch::multi_vector::batch_item<const ValueType>& x,
Expand Down Expand Up @@ -238,6 +281,17 @@ __global__ __launch_bounds__(
}


// Copies a single-rhs vector of length `num_rows` from `in` to `out`,
// parallelized over the threads of the calling thread block.
template <typename ValueType>
__device__ __forceinline__ void single_rhs_copy(const int num_rows,
                                                const ValueType* in,
                                                ValueType* out)
{
    // block-strided loop: thread t handles rows t, t + blockDim.x, ...
    for (int row = threadIdx.x; row < num_rows; row += blockDim.x) {
        out[row] = in[row];
    }
}


/**
* Copies the values of one multi-vector into another.
*
Expand Down
2 changes: 1 addition & 1 deletion common/cuda_hip/log/batch_logger.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
template <typename RealType>
class SimpleFinalLogger final {
public:
using real_type = remove_complex<RealType>;
using real_type = RealType;

SimpleFinalLogger(real_type* const batch_residuals, int* const batch_iters)
: final_residuals_{batch_residuals}, final_iters_{batch_iters}
Expand Down
13 changes: 3 additions & 10 deletions common/cuda_hip/preconditioner/batch_identity.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,9 @@ public:
return 0;
}

__device__ __forceinline__ void generate(
size_type,
const gko::batch::matrix::ell::batch_item<const ValueType, gko::int32>&,
ValueType*)
{}

__device__ __forceinline__ void generate(
size_type,
const gko::batch::matrix::dense::batch_item<const ValueType>&,
ValueType*)
// Generating the identity preconditioner is a no-op: nothing is computed
// and the workspace pointer is never written. Templated on the batch item
// type so a single overload covers every batched matrix format.
template <typename batch_item_type>
__device__ __forceinline__ void generate(size_type, const batch_item_type&,
ValueType*)
{}

__device__ __forceinline__ void apply(const int num_rows,
Expand Down
Loading
Loading