From c96003862d47a82425389916805877c59a861641 Mon Sep 17 00:00:00 2001
From: Pratik Nayak
Date: Tue, 23 Jul 2024 13:34:59 +0200
Subject: [PATCH] unify cuda/hip batch_mvec
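
The batch multi-vector kernel launchers were duplicated between
cuda/base/batch_multi_vector_kernels.cu and
hip/base/batch_multi_vector_kernels.hip.cpp, which differed only in
their namespace and includes. Compile the shared launcher as
common/cuda_hip/base/batch_multi_vector_kernels.cpp for both backends
instead, move the device kernels from the textually included .hpp.inc
into a self-contained header under a new batch_single_kernels
namespace, and drop the sm_oversubscription parameter from the kernel
launch bounds.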
---
 common/cuda_hip/CMakeLists.txt                |   1 +
 ...hpp.inc => batch_multi_vector_kernels.cpp} |  67 +++-
 .../base/batch_multi_vector_kernels.hpp       | 326 ++++++++++++++++++
 .../base/batch_multi_vector_kernels.hpp.inc   |  43 +--
 cuda/CMakeLists.txt                           |   1 -
 cuda/base/batch_multi_vector_kernels.cu       |  56 ---
 hip/CMakeLists.txt                            |   1 -
 hip/base/batch_multi_vector_kernels.hip.cpp   |  56 ---
 8 files changed, 390 insertions(+), 161 deletions(-)
 rename common/cuda_hip/base/{batch_multi_vector_kernel_launcher.hpp.inc => batch_multi_vector_kernels.cpp} (67%)
 create mode 100644 common/cuda_hip/base/batch_multi_vector_kernels.hpp
 delete mode 100644 cuda/base/batch_multi_vector_kernels.cu
 delete mode 100644 hip/base/batch_multi_vector_kernels.hip.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 463abfd9284..15d3a82419e 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -1,5 +1,6 @@
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 set(CUDA_HIP_SOURCES
+    base/batch_multi_vector_kernels.cpp
     base/device_matrix_data_kernels.cpp
     base/index_set_kernels.cpp
     components/prefix_sum_kernels.cpp
diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
similarity index 67%
rename from common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc
rename to common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 19b5b74a547..17f65487464 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -2,6 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_multi_vector_kernels.hpp"
+#include "core/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
            const batch::MultiVector<ValueType>* const alpha,
@@ -11,16 +37,19 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto x_ub = get_batch_struct(x);
     if (alpha->get_common_size()[1] == 1) {
-        scale_kernel<<<num_blocks, default_block_size, 0,
-                       exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return 0; });
     } else if (alpha->get_common_size() == x->get_common_size()) {
-        scale_kernel<<<num_blocks, default_block_size, 0,
-                       exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub, [] __device__(int row, int col, int stride) {
                 return row * stride + col;
             });
     } else {
-        scale_kernel<<<num_blocks, default_block_size, 0,
-                       exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return col; });
     }
@@ -42,12 +71,12 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     if (alpha->get_common_size()[1] == 1) {
-        add_scaled_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(
+        batch_single_kernels::add_scaled_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
     } else {
-        add_scaled_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(
+        batch_single_kernels::add_scaled_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
     }
 }
@@ -67,8 +96,8 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
-                                     exec->get_stream()>>>(
+    batch_single_kernels::compute_gen_dot_product_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
         x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
 }
 
@@ -87,8 +116,8 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
-                                     exec->get_stream()>>>(
+    batch_single_kernels::compute_gen_dot_product_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
         x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
 }
 
@@ -105,8 +134,9 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rhs = x->get_common_size()[1];
     const auto x_ub = get_batch_struct(x);
     const auto res_ub = get_batch_struct(result);
-    compute_norm2_kernel<<<num_blocks, default_block_size, 0,
-                           exec->get_stream()>>>(x_ub, res_ub);
+    batch_single_kernels::compute_norm2_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        x_ub, res_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -121,8 +151,15 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_blocks = x->get_num_batch_items();
     const auto result_ub = get_batch_struct(result);
     const auto x_ub = get_batch_struct(x);
-    copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, result_ub);
+    batch_single_kernels::
+        copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            x_ub, result_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+
+
+}  // namespace batch_multi_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
new file mode 100644
index 00000000000..36aa69d7d99
--- /dev/null
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -0,0 +1,326 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/batch_struct.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/batch_struct.hip.hpp"
+#else
+#error "batch struct def missing"
+#endif
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_multi_vector {
+namespace batch_single_kernels {
+
+
+constexpr auto default_block_size = 256;
+
+
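+// Scales one batch item in place: x = alpha * x entry-wise. Threads stride
+// over the flattened (row, col) index space, and the Mapping functor selects
+// which alpha entry applies, so the same body implements scalar, column-wise
+// and entry-wise scaling (see the launch-side lambdas above).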
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void scale(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<ValueType>& x, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        x.values[row * x.stride + col] =
+            alpha.values[map(row, col, alpha.stride)] *
+            x.values[row * x.stride + col];
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__ __launch_bounds__(default_block_size) void scale_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        scale(alpha_b, x_b, map);
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void add_scaled(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<ValueType>& y, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        y.values[row * y.stride + col] +=
+            alpha.values[map(col)] * x.values[row * x.stride + col];
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__ __launch_bounds__(default_block_size) void add_scaled_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<ValueType> y, Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
+        add_scaled(alpha_b, x_b, y_b, map);
+    }
+}
+
+
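+// Cooperative dot-product helpers: each thread of a subgroup accumulates a
+// strided partial sum over the rows, the partials are combined with a
+// subgroup-level reduction, and lane 0 writes the result.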
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup,
+                                                            const int num_rows,
+                                                            const ValueType* x,
+                                                            const ValueType* y,
+                                                            ValueType& result)
+
+{
+    ValueType val = zero<ValueType>();
+    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
+        val += conj(x[r]) * y[r];
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<ValueType>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result = val;
+    }
+}
+
+
+template <typename ValueType, typename Mapping, typename Group>
+__device__ __forceinline__ void gen_one_dot(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const int rhs_index,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    Group subgroup, Mapping conj_map)
+{
+    ValueType val = zero<ValueType>();
+
+    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
+        val += conj_map(x.values[r * x.stride + rhs_index]) *
+               y.values[r * y.stride + rhs_index];
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<ValueType>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result.values[rhs_index] = val;
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void compute_gen_dot_product(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    Mapping conj_map)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups_per_block) {
+        gen_one_dot(x, y, rhs_index, result, subgroup, conj_map);
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__
+__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> y,
+    const gko::batch::multi_vector::uniform_batch<ValueType> result,
+    Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
+        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
+        compute_gen_dot_product(x_b, y_b, r_b, map);
+    }
+}
+
+
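+// 2-norm helpers: same cooperative pattern as the dot product above, but
+// accumulating squared_norm(x[r]) in the real type and applying sqrt to the
+// reduced value.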
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void single_rhs_compute_norm2(
+    Group subgroup, const int num_rows, const ValueType* x,
+    remove_complex<ValueType>& result)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
+        val += squared_norm(x[r]);
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result = sqrt(val);
+    }
+}
+
+
+template <typename ValueType, typename Group>
+__device__ __forceinline__ void one_norm2(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const int rhs_index,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result,
+    Group subgroup)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
+        val += squared_norm(x.values[r * x.stride + rhs_index]);
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result.values[rhs_index] = sqrt(val);
+    }
+}
+
+
+/**
+ * Computes the 2-norms of some column vectors in global or shared memory.
+ *
+ * @param x  A row-major multivector with nrhs columns.
+ * @param result  Holds norm value for each vector in x.
+ */
+template <typename ValueType>
+__device__ __forceinline__ void compute_norm2(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups_per_block) {
+        one_norm2(x, rhs_index, result, subgroup);
+    }
+}
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<remove_complex<ValueType>>
+        result)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
+        compute_norm2(x_b, r_b);
+    }
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void single_rhs_copy(const int num_rows,
+                                                const ValueType* in,
+                                                ValueType* out)
+{
+    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
+        out[iz] = in[iz];
+    }
+}
+
+
+/**
+ * Copies the values of one multi-vector into another.
+ *
+ * Note that the output multi-vector should already have memory allocated
+ * and stride set.
+ */
+template <typename ValueType>
+__device__ __forceinline__ void copy(
+    const gko::batch::multi_vector::batch_item<const ValueType>& in,
+    const gko::batch::multi_vector::batch_item<ValueType>& out)
+{
+    for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs;
+         iz += blockDim.x) {
+        const int i = iz / in.num_rhs;
+        const int j = iz % in.num_rhs;
+        out.values[i * out.stride + j] = in.values[i * in.stride + j];
+    }
+}
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void copy_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> src,
+    const gko::batch::multi_vector::uniform_batch<ValueType> dst)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto dst_b = gko::batch::extract_batch_item(dst, batch_id);
+        const auto src_b = gko::batch::extract_batch_item(src, batch_id);
+        copy(src_b, dst_b);
+    }
+}
+
+
+}  // namespace batch_single_kernels
+}  // namespace batch_multi_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
index 9b6301674be..7af3c84303f 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
@@ -20,8 +20,7 @@ __device__ __forceinline__ void scale(
 
 
 template <typename ValueType, typename Mapping>
-__global__
-__launch_bounds__(default_block_size, sm_oversubscription) void scale_kernel(
+__global__ __launch_bounds__(default_block_size) void scale_kernel(
     const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
     const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
 {
@@ -52,20 +51,10 @@ __device__ __forceinline__ void add_scaled(
 
 
 template <typename ValueType, typename Mapping>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void add_scaled_kernel(const gko::batch::multi_vector::
-                                                    uniform_batch<
-                                                        const ValueType>
-                                                    alpha,
-                                                const gko::batch::multi_vector::
-                                                    uniform_batch<
-                                                        const ValueType>
-                                                    x,
-                                                const gko::batch::multi_vector::
-                                                    uniform_batch<ValueType>
-                                                    y,
-                                                Mapping map)
+__global__ __launch_bounds__(default_block_size) void add_scaled_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<ValueType> y, Mapping map)
 {
     for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
          batch_id += gridDim.x) {
@@ -145,7 +134,7 @@ __device__ __forceinline__ void compute_gen_dot_product(
 template <typename ValueType, typename Mapping>
 __global__
-__launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel(
+__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel(
     const gko::batch::multi_vector::uniform_batch<const ValueType> x,
     const gko::batch::multi_vector::uniform_batch<const ValueType> y,
     const gko::batch::multi_vector::uniform_batch<ValueType> result,
     Mapping map)
@@ -232,19 +221,10 @@ __device__ __forceinline__ void compute_norm2(
 
 
 template <typename ValueType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void compute_norm2_kernel(const gko::batch::
-                                                       multi_vector::
-                                                           uniform_batch<
-                                                               const ValueType>
-                                                           x,
-                                                   const gko::batch::
-                                                       multi_vector::
-                                                           uniform_batch<
-                                                               remove_complex<
-                                                                   ValueType>>
-                                                           result)
+__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<remove_complex<ValueType>>
+        result)
 {
     for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
          batch_id += gridDim.x) {
@@ -287,8 +267,7 @@ __device__ __forceinline__ void copy(
 
 
 template <typename ValueType>
-__global__
-__launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel(
+__global__ __launch_bounds__(default_block_size) void copy_kernel(
     const gko::batch::multi_vector::uniform_batch<const ValueType> src,
     const gko::batch::multi_vector::uniform_batch<ValueType> dst)
 {
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index d4a94eda802..3631a65f48d 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -7,7 +7,6 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp CUDA_FBCSR_INSTANTIATE_FILES)
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
     PRIVATE
-    base/batch_multi_vector_kernels.cu
     base/device.cpp
     base/exception.cpp
     base/executor.cpp
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
deleted file mode 100644
index 3dad5ba94f1..00000000000
--- a/cuda/base/batch_multi_vector_kernels.cu
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The MultiVector matrix format namespace.
- *
- * @ingroup batch_multi_vector
- */
-namespace batch_multi_vector {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_multi_vector
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 30e675509d5..0882250dc0e 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -5,7 +5,6 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp HIP_FBCSR_INSTANTIATE_FILES)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
-    base/batch_multi_vector_kernels.hip.cpp
     base/device.hip.cpp
     base/exception.hip.cpp
     base/executor.hip.cpp
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
deleted file mode 100644
index 701f4655a9a..00000000000
--- a/hip/base/batch_multi_vector_kernels.hip.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
"common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The MultiVector matrix format namespace. - * - * @ingroup batch_multi_vector - */ -namespace batch_multi_vector { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" - - -#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_multi_vector -} // namespace hip -} // namespace kernels -} // namespace gko