From 48fe94bafb1953f20e984eef8bdb699dafcb3ba0 Mon Sep 17 00:00:00 2001
From: Marcel Koch <marcel.koch@kit.edu>
Date: Tue, 17 Sep 2024 16:40:20 +0200
Subject: [PATCH] fixup! [batch] split cg compilation (cuda)

---
 cuda/solver/batch_cg_launch.cuh            |  2 +-
 cuda/solver/batch_cg_launch.instantiate.cu | 25 ++++++++++++----------
 2 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh
index d6bae86df87..7196d6f8366 100644
--- a/cuda/solver/batch_cg_launch.cuh
+++ b/cuda/solver/batch_cg_launch.cuh
@@ -8,7 +8,7 @@
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
-#include "core/solver/batch_bicgstab_kernels.hpp"
+#include "core/solver/batch_cg_kernels.hpp"
 
 
 namespace gko {
diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu
index 0c130c20ed2..9fca587f33e 100644
--- a/cuda/solver/batch_cg_launch.instantiate.cu
+++ b/cuda/solver/batch_cg_launch.instantiate.cu
@@ -2,13 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "cuda/solver/batch_cg_launch.cuh"
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
+#include "common/cuda_hip/solver/batch_cg_kernels.hpp"
 #include "core/matrix/batch_struct.hpp"
-#include "core/solver/batch_bicgstab_kernels.hpp"
+#include "core/solver/batch_cg_kernels.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "cuda/solver/batch_bicgstab_launch.cuh"
 
 
 namespace gko {
@@ -28,9 +29,10 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
     const int device_max_threads =
         (std::max(num_rows, min_block_size) / warp_sz) * warp_sz;
     cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel<StopType, 5, true, PrecType, LogType,
-                                       BatchMatrixType, ValueType>);
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
     const int num_regs_used = funcattr.numRegs;
     int max_regs_blk = 0;
     cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
@@ -52,13 +54,14 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
                            cudaDevAttrMaxSharedMemoryPerMultiprocessor,
                            exec->get_device_id());
     GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
-        apply_kernel<StopType, 5, true, PrecType, LogType, BatchMatrixType,
-                     ValueType>,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>,
         cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
     cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel<StopType, 5, true, PrecType, LogType,
-                                       BatchMatrixType, ValueType>);
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
     return funcattr.maxDynamicSharedSizeBytes;
 }