
Commit

fixup! [batch] split cg compilation (cuda)
MarcelKoch committed Sep 17, 2024
1 parent d26a1ca commit 48fe94b
Showing 2 changed files with 15 additions and 12 deletions.
cuda/solver/batch_cg_launch.cuh: 2 changes (1 addition, 1 deletion)
@@ -8,7 +8,7 @@
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
-#include "core/solver/batch_bicgstab_kernels.hpp"
+#include "core/solver/batch_cg_kernels.hpp"
 
 
 namespace gko {
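The .cuh/.instantiate.cu pair touched by this commit follows the usual split-compilation layout: the launch header only declares the templated entry point, while a single translation unit includes the kernel definitions and instantiates it explicitly, so the header presumably needs the CG kernel declarations rather than BiCGSTAB's, which is what the one-line include swap above fixes. Below is a minimal generic sketch of that layout, with illustrative names rather than Ginkgo's actual interfaces:

// solver_launch.cuh: declaration only, cheap to include from other files
#pragma once

template <typename ValueType>
void launch_apply(const ValueType* b, ValueType* x, int num_rows);


// solver_launch.instantiate.cu: the only file that compiles the kernel body
#include "solver_launch.cuh"

template <typename ValueType>
__global__ void apply_kernel(const ValueType* b, ValueType* x, int num_rows)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_rows) {
        x[i] = b[i];
    }
}

template <typename ValueType>
void launch_apply(const ValueType* b, ValueType* x, int num_rows)
{
    apply_kernel<<<(num_rows + 255) / 256, 256>>>(b, x, num_rows);
}

// Explicit instantiations keep the expensive kernel compilation in this
// single translation unit.
template void launch_apply<float>(const float*, float*, int);
template void launch_apply<double>(const double*, double*, int);
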
cuda/solver/batch_cg_launch.instantiate.cu: 25 changes (14 additions, 11 deletions)
@@ -2,13 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "cuda/solver/batch_cg_launch.cuh"
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
+#include "common/cuda_hip/solver/batch_cg_kernels.hpp"
 #include "core/matrix/batch_struct.hpp"
-#include "core/solver/batch_bicgstab_kernels.hpp"
+#include "core/solver/batch_cg_kernels.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "cuda/solver/batch_bicgstab_launch.cuh"
 
 
 namespace gko {
@@ -28,9 +29,10 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
     const int device_max_threads =
         (std::max(num_rows, min_block_size) / warp_sz) * warp_sz;
     cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel<StopType, 5, true, PrecType, LogType,
-                                       BatchMatrixType, ValueType>);
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
     const int num_regs_used = funcattr.numRegs;
     int max_regs_blk = 0;
     cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
@@ -52,13 +54,14 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
                                cudaDevAttrMaxSharedMemoryPerMultiprocessor,
                                exec->get_device_id());
     GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
-        apply_kernel<StopType, 5, true, PrecType, LogType, BatchMatrixType,
-                     ValueType>,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>,
         cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
     cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel<StopType, 5, true, PrecType, LogType,
-                                       BatchMatrixType, ValueType>);
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
     return funcattr.maxDynamicSharedSizeBytes;
 }
 
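Both changed functions in this file query attributes of the now namespace-qualified apply_kernel at runtime: get_num_threads_per_block reads the per-thread register count to bound the block size, and get_max_dynamic_shared_memory raises the shared-memory carveout before reading the dynamic shared-memory limit. The following standalone sketch exercises the same CUDA runtime calls against a placeholder kernel; the kernel and the register-based block-size formula are illustrative assumptions, not the Ginkgo implementation:

#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

namespace batch_single_kernels {
// Placeholder standing in for the templated apply_kernel.
__global__ void apply_kernel_stub(const double* in, double* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = in[i];
    }
}
}  // namespace batch_single_kernels

int main()
{
    int device = 0;
    cudaGetDevice(&device);

    // Register-limited block size, in the spirit of get_num_threads_per_block:
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, batch_single_kernels::apply_kernel_stub);
    int max_regs_blk = 0;
    cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
                           device);
    // Largest multiple of the warp size whose total register demand still fits.
    const int warp_sz = 32;
    int reg_limited = max_regs_blk / std::max(attr.numRegs, 1);
    reg_limited = (reg_limited / warp_sz) * warp_sz;
    std::printf("regs/thread: %d, reg-limited block size: %d\n", attr.numRegs,
                reg_limited);

    // Dynamic shared-memory limit, in the spirit of get_max_dynamic_shared_memory:
    cudaFuncSetAttribute(batch_single_kernels::apply_kernel_stub,
                         cudaFuncAttributePreferredSharedMemoryCarveout,
                         99 /* % of unified cache preferred as shared memory */);
    cudaFuncGetAttributes(&attr, batch_single_kernels::apply_kernel_stub);
    std::printf("max dynamic shared memory: %d bytes\n",
                attr.maxDynamicSharedSizeBytes);
    return 0;
}
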
