From 48fe94bafb1953f20e984eef8bdb699dafcb3ba0 Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Tue, 17 Sep 2024 16:40:20 +0200 Subject: [PATCH] fixup! [batch] split cg compilation (cuda) --- cuda/solver/batch_cg_launch.cuh | 2 +- cuda/solver/batch_cg_launch.instantiate.cu | 25 ++++++++++++---------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/cuda/solver/batch_cg_launch.cuh b/cuda/solver/batch_cg_launch.cuh index d6bae86df87..7196d6f8366 100644 --- a/cuda/solver/batch_cg_launch.cuh +++ b/cuda/solver/batch_cg_launch.cuh @@ -8,7 +8,7 @@ #include "common/cuda_hip/matrix/batch_struct.hpp" #include "core/base/batch_struct.hpp" #include "core/matrix/batch_struct.hpp" -#include "core/solver/batch_bicgstab_kernels.hpp" +#include "core/solver/batch_cg_kernels.hpp" namespace gko { diff --git a/cuda/solver/batch_cg_launch.instantiate.cu b/cuda/solver/batch_cg_launch.instantiate.cu index 0c130c20ed2..9fca587f33e 100644 --- a/cuda/solver/batch_cg_launch.instantiate.cu +++ b/cuda/solver/batch_cg_launch.instantiate.cu @@ -2,13 +2,14 @@ // // SPDX-License-Identifier: BSD-3-Clause +#include "cuda/solver/batch_cg_launch.cuh" + #include -#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp" +#include "common/cuda_hip/solver/batch_cg_kernels.hpp" #include "core/matrix/batch_struct.hpp" -#include "core/solver/batch_bicgstab_kernels.hpp" +#include "core/solver/batch_cg_kernels.hpp" #include "core/solver/batch_dispatch.hpp" -#include "cuda/solver/batch_bicgstab_launch.cuh" namespace gko { @@ -28,9 +29,10 @@ int get_num_threads_per_block(std::shared_ptr exec, const int device_max_threads = (std::max(num_rows, min_block_size) / warp_sz) * warp_sz; cudaFuncAttributes funcattr; - cudaFuncGetAttributes(&funcattr, - apply_kernel); + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); const int num_regs_used = funcattr.numRegs; int max_regs_blk = 0; cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock, @@ -52,13 +54,14 @@ int get_max_dynamic_shared_memory(std::shared_ptr exec) cudaDevAttrMaxSharedMemoryPerMultiprocessor, exec->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute( - apply_kernel, + batch_single_kernels::apply_kernel, cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/)); cudaFuncAttributes funcattr; - cudaFuncGetAttributes(&funcattr, - apply_kernel); + cudaFuncGetAttributes( + &funcattr, + batch_single_kernels::apply_kernel); return funcattr.maxDynamicSharedSizeBytes; }