
Commit

fixup! [batch] split cg compilation (cuda)
MarcelKoch committed Sep 17, 2024
1 parent d26a1ca commit 48fe94b
Showing 2 changed files with 15 additions and 12 deletions.
cuda/solver/batch_cg_launch.cuh: 2 changes (1 addition, 1 deletion)
@@ -8,7 +8,7 @@
 #include "common/cuda_hip/matrix/batch_struct.hpp"
 #include "core/base/batch_struct.hpp"
 #include "core/matrix/batch_struct.hpp"
-#include "core/solver/batch_bicgstab_kernels.hpp"
+#include "core/solver/batch_cg_kernels.hpp"
 
 
 namespace gko {
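The .cuh/.instantiate.cu pair touched by this commit follows the usual split-compilation layout: the launch header only declares the templated entry point, while a single translation unit includes the kernel definitions and instantiates it explicitly, so the header presumably needs the CG kernel declarations rather than BiCGSTAB's, which is what the one-line include swap above fixes. Below is a minimal generic sketch of that layout, with illustrative names rather than Ginkgo's actual interfaces:

// solver_launch.cuh: declaration only, cheap to include from other files
#pragma once

template <typename ValueType>
void launch_apply(const ValueType* b, ValueType* x, int num_rows);


// solver_launch.instantiate.cu: the only file that compiles the kernel body
#include "solver_launch.cuh"

template <typename ValueType>
__global__ void apply_kernel(const ValueType* b, ValueType* x, int num_rows)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_rows) {
        x[i] = b[i];
    }
}

template <typename ValueType>
void launch_apply(const ValueType* b, ValueType* x, int num_rows)
{
    apply_kernel<<<(num_rows + 255) / 256, 256>>>(b, x, num_rows);
}

// Explicit instantiations keep the expensive kernel compilation in this
// single translation unit.
template void launch_apply<float>(const float*, float*, int);
template void launch_apply<double>(const double*, double*, int);
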
cuda/solver/batch_cg_launch.instantiate.cu: 25 changes (14 additions, 11 deletions)
@@ -2,13 +2,14 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "cuda/solver/batch_cg_launch.cuh"
+
 #include <ginkgo/core/base/exception_helpers.hpp>
 
-#include "common/cuda_hip/solver/batch_bicgstab_kernels.hpp"
+#include "common/cuda_hip/solver/batch_cg_kernels.hpp"
 #include "core/matrix/batch_struct.hpp"
-#include "core/solver/batch_bicgstab_kernels.hpp"
+#include "core/solver/batch_cg_kernels.hpp"
 #include "core/solver/batch_dispatch.hpp"
-#include "cuda/solver/batch_bicgstab_launch.cuh"
 
 
 namespace gko {
@@ -28,9 +29,10 @@ int get_num_threads_per_block(std::shared_ptr<const DefaultExecutor> exec,
     const int device_max_threads =
         (std::max(num_rows, min_block_size) / warp_sz) * warp_sz;
     cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel<StopType, 5, true, PrecType, LogType,
-                                       BatchMatrixType, ValueType>);
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
     const int num_regs_used = funcattr.numRegs;
     int max_regs_blk = 0;
     cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
@@ -52,13 +54,14 @@ int get_max_dynamic_shared_memory(std::shared_ptr<const DefaultExecutor> exec)
                                cudaDevAttrMaxSharedMemoryPerMultiprocessor,
                                exec->get_device_id());
     GKO_ASSERT_NO_CUDA_ERRORS(cudaFuncSetAttribute(
-        apply_kernel<StopType, 5, true, PrecType, LogType, BatchMatrixType,
-                     ValueType>,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>,
         cudaFuncAttributePreferredSharedMemoryCarveout, 99 /*%*/));
     cudaFuncAttributes funcattr;
-    cudaFuncGetAttributes(&funcattr,
-                          apply_kernel<StopType, 5, true, PrecType, LogType,
-                                       BatchMatrixType, ValueType>);
+    cudaFuncGetAttributes(
+        &funcattr,
+        batch_single_kernels::apply_kernel<StopType, 5, true, PrecType, LogType,
+                                           BatchMatrixType, ValueType>);
     return funcattr.maxDynamicSharedSizeBytes;
 }
 
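Both changed functions in this file query attributes of the now namespace-qualified apply_kernel at runtime: get_num_threads_per_block reads the per-thread register count to bound the block size, and get_max_dynamic_shared_memory raises the shared-memory carveout before reading the dynamic shared-memory limit. The following standalone sketch exercises the same CUDA runtime calls against a placeholder kernel; the kernel and the register-based block-size formula are illustrative assumptions, not the Ginkgo implementation:

#include <algorithm>
#include <cstdio>
#include <cuda_runtime.h>

namespace batch_single_kernels {
// Placeholder standing in for the templated apply_kernel.
__global__ void apply_kernel_stub(const double* in, double* out, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        out[i] = in[i];
    }
}
}  // namespace batch_single_kernels

int main()
{
    int device = 0;
    cudaGetDevice(&device);

    // Register-limited block size, in the spirit of get_num_threads_per_block:
    cudaFuncAttributes attr;
    cudaFuncGetAttributes(&attr, batch_single_kernels::apply_kernel_stub);
    int max_regs_blk = 0;
    cudaDeviceGetAttribute(&max_regs_blk, cudaDevAttrMaxRegistersPerBlock,
                           device);
    // Largest multiple of the warp size whose total register demand still fits.
    const int warp_sz = 32;
    int reg_limited = max_regs_blk / std::max(attr.numRegs, 1);
    reg_limited = (reg_limited / warp_sz) * warp_sz;
    std::printf("regs/thread: %d, reg-limited block size: %d\n", attr.numRegs,
                reg_limited);

    // Dynamic shared-memory limit, in the spirit of get_max_dynamic_shared_memory:
    cudaFuncSetAttribute(batch_single_kernels::apply_kernel_stub,
                         cudaFuncAttributePreferredSharedMemoryCarveout,
                         99 /* % of unified cache preferred as shared memory */);
    cudaFuncGetAttributes(&attr, batch_single_kernels::apply_kernel_stub);
    std::printf("max dynamic shared memory: %d bytes\n",
                attr.maxDynamicSharedSizeBytes);
    return 0;
}
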
