From c96003862d47a82425389916805877c59a861641 Mon Sep 17 00:00:00 2001
From: Pratik Nayak
Date: Tue, 23 Jul 2024 13:34:59 +0200
Subject: [PATCH] unify cuda/hip batch_mvec
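
The batch multi-vector kernel launchers were duplicated between
cuda/base/batch_multi_vector_kernels.cu and
hip/base/batch_multi_vector_kernels.hip.cpp, which differed only in
their namespace and includes. Compile the shared launcher as
common/cuda_hip/base/batch_multi_vector_kernels.cpp for both backends
instead, move the device kernels from the textually included .hpp.inc
into a self-contained header under a new batch_single_kernels
namespace, and drop the sm_oversubscription parameter from the kernel
launch bounds.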
---
 common/cuda_hip/CMakeLists.txt                |   1 +
 ...hpp.inc => batch_multi_vector_kernels.cpp} |  67 +++-
 .../base/batch_multi_vector_kernels.hpp       | 326 ++++++++++++++++++
 .../base/batch_multi_vector_kernels.hpp.inc   |  43 +--
 cuda/CMakeLists.txt                           |   1 -
 cuda/base/batch_multi_vector_kernels.cu       |  56 ---
 hip/CMakeLists.txt                            |   1 -
 hip/base/batch_multi_vector_kernels.hip.cpp   |  56 ---
 8 files changed, 390 insertions(+), 161 deletions(-)
 rename common/cuda_hip/base/{batch_multi_vector_kernel_launcher.hpp.inc => batch_multi_vector_kernels.cpp} (67%)
 create mode 100644 common/cuda_hip/base/batch_multi_vector_kernels.hpp
 delete mode 100644 cuda/base/batch_multi_vector_kernels.cu
 delete mode 100644 hip/base/batch_multi_vector_kernels.hip.cpp

diff --git a/common/cuda_hip/CMakeLists.txt b/common/cuda_hip/CMakeLists.txt
index 463abfd9284..15d3a82419e 100644
--- a/common/cuda_hip/CMakeLists.txt
+++ b/common/cuda_hip/CMakeLists.txt
@@ -1,5 +1,6 @@
 include(${PROJECT_SOURCE_DIR}/cmake/template_instantiation.cmake)
 set(CUDA_HIP_SOURCES
+    base/batch_multi_vector_kernels.cpp
     base/device_matrix_data_kernels.cpp
     base/index_set_kernels.cpp
     components/prefix_sum_kernels.cpp
diff --git a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
similarity index 67%
rename from common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc
rename to common/cuda_hip/base/batch_multi_vector_kernels.cpp
index 19b5b74a547..17f65487464 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.cpp
@@ -2,6 +2,32 @@
 //
 // SPDX-License-Identifier: BSD-3-Clause
 
+#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp"
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "core/base/batch_multi_vector_kernels.hpp"
+#include "core/base/batch_struct.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_multi_vector {
+
+
+constexpr auto default_block_size = 256;
+
+
 template <typename ValueType>
 void scale(std::shared_ptr<const DefaultExecutor> exec,
            const batch::MultiVector<ValueType>* const alpha,
@@ -11,16 +37,19 @@ void scale(std::shared_ptr<const DefaultExecutor> exec,
     const auto alpha_ub = get_batch_struct(alpha);
     const auto x_ub = get_batch_struct(x);
     if (alpha->get_common_size()[1] == 1) {
-        scale_kernel<<<num_blocks, default_block_size, 0,
-                       exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return 0; });
     } else if (alpha->get_common_size() == x->get_common_size()) {
-        scale_kernel<<<num_blocks, default_block_size, 0,
-                       exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub, [] __device__(int row, int col, int stride) {
                 return row * stride + col;
             });
     } else {
-        scale_kernel<<<num_blocks, default_block_size, 0,
-                       exec->get_stream()>>>(
+        batch_single_kernels::scale_kernel<<<num_blocks, default_block_size, 0,
+                                             exec->get_stream()>>>(
             alpha_ub, x_ub,
             [] __device__(int row, int col, int stride) { return col; });
     }
@@ -42,12 +71,12 @@ void add_scaled(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     if (alpha->get_common_size()[1] == 1) {
-        add_scaled_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(
+        batch_single_kernels::add_scaled_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return 0; });
     } else {
-        add_scaled_kernel<<<num_blocks, default_block_size, 0,
-                            exec->get_stream()>>>(
+        batch_single_kernels::add_scaled_kernel<<<
+            num_blocks, default_block_size, 0, exec->get_stream()>>>(
             alpha_ub, x_ub, y_ub, [] __device__(int col) { return col; });
     }
 }
@@ -67,8 +96,8 @@ void compute_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
-                                     exec->get_stream()>>>(
+    batch_single_kernels::compute_gen_dot_product_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
         x_ub, y_ub, res_ub, [] __device__(auto val) { return val; });
 }
 
@@ -87,8 +116,8 @@ void compute_conj_dot(std::shared_ptr<const DefaultExecutor> exec,
     const auto x_ub = get_batch_struct(x);
     const auto y_ub = get_batch_struct(y);
     const auto res_ub = get_batch_struct(result);
-    compute_gen_dot_product_kernel<<<num_blocks, default_block_size, 0,
-                                     exec->get_stream()>>>(
+    batch_single_kernels::compute_gen_dot_product_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
         x_ub, y_ub, res_ub, [] __device__(auto val) { return conj(val); });
 }
 
@@ -105,8 +134,9 @@ void compute_norm2(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_rhs = x->get_common_size()[1];
     const auto x_ub = get_batch_struct(x);
     const auto res_ub = get_batch_struct(result);
-    compute_norm2_kernel<<<num_blocks, default_block_size, 0,
-                           exec->get_stream()>>>(x_ub, res_ub);
+    batch_single_kernels::compute_norm2_kernel<<<
+        num_blocks, default_block_size, 0, exec->get_stream()>>>(
+        x_ub, res_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -121,8 +151,15 @@ void copy(std::shared_ptr<const DefaultExecutor> exec,
     const auto num_blocks = x->get_num_batch_items();
     const auto result_ub = get_batch_struct(result);
     const auto x_ub = get_batch_struct(x);
-    copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
-        x_ub, result_ub);
+    batch_single_kernels::
+        copy_kernel<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
+            x_ub, result_ub);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BATCH_MULTI_VECTOR_COPY_KERNEL);
+
+
+}  // namespace batch_multi_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
new file mode 100644
index 00000000000..36aa69d7d99
--- /dev/null
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp
@@ -0,0 +1,326 @@
+// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
+//
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include <thrust/functional.h>
+#include <thrust/transform.h>
+
+#include <ginkgo/core/base/batch_multi_vector.hpp>
+#include <ginkgo/core/base/exception_helpers.hpp>
+#include <ginkgo/core/base/math.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+#include "common/cuda_hip/base/config.hpp"
+#include "common/cuda_hip/base/math.hpp"
+#include "common/cuda_hip/base/runtime.hpp"
+#include "common/cuda_hip/base/thrust.hpp"
+#include "common/cuda_hip/base/types.hpp"
+#include "common/cuda_hip/components/cooperative_groups.hpp"
+#include "common/cuda_hip/components/format_conversion.hpp"
+#include "common/cuda_hip/components/reduction.hpp"
+#include "common/cuda_hip/components/segment_scan.hpp"
+#include "common/cuda_hip/components/thread_ids.hpp"
+#include "common/cuda_hip/components/warp_blas.hpp"
+
+#if defined(GKO_COMPILING_CUDA)
+#include "cuda/base/batch_struct.hpp"
+#elif defined(GKO_COMPILING_HIP)
+#include "hip/base/batch_struct.hip.hpp"
+#else
+#error "batch struct def missing"
+#endif
+
+
+namespace gko {
+namespace kernels {
+namespace GKO_DEVICE_NAMESPACE {
+namespace batch_multi_vector {
+namespace batch_single_kernels {
+
+
+constexpr auto default_block_size = 256;
+
+
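+// Scales one batch item in place: x = alpha * x entry-wise. Threads stride
+// over the flattened (row, col) index space, and the Mapping functor selects
+// which alpha entry applies, so the same body implements scalar, column-wise
+// and entry-wise scaling (see the launch-side lambdas above).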
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void scale(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<ValueType>& x, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        x.values[row * x.stride + col] =
+            alpha.values[map(row, col, alpha.stride)] *
+            x.values[row * x.stride + col];
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__ __launch_bounds__(default_block_size) void scale_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        scale(alpha_b, x_b, map);
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void add_scaled(
+    const gko::batch::multi_vector::batch_item<const ValueType>& alpha,
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<ValueType>& y, Mapping map)
+{
+    const int max_li = x.num_rows * x.num_rhs;
+    for (int li = threadIdx.x; li < max_li; li += blockDim.x) {
+        const int row = li / x.num_rhs;
+        const int col = li % x.num_rhs;
+
+        y.values[row * y.stride + col] +=
+            alpha.values[map(col)] * x.values[row * x.stride + col];
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__ __launch_bounds__(default_block_size) void add_scaled_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<ValueType> y, Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto alpha_b = gko::batch::extract_batch_item(alpha, batch_id);
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
+        add_scaled(alpha_b, x_b, y_b, map);
+    }
+}
+
+
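+// Cooperative dot-product helpers: each thread of a subgroup accumulates a
+// strided partial sum over the rows, the partials are combined with a
+// subgroup-level reduction, and lane 0 writes the result.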
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup,
+                                                            const int num_rows,
+                                                            const ValueType* x,
+                                                            const ValueType* y,
+                                                            ValueType& result)
+
+{
+    ValueType val = zero<ValueType>();
+    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
+        val += conj(x[r]) * y[r];
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<ValueType>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result = val;
+    }
+}
+
+
+template <typename ValueType, typename Mapping, typename Group>
+__device__ __forceinline__ void gen_one_dot(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const int rhs_index,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    Group subgroup, Mapping conj_map)
+{
+    ValueType val = zero<ValueType>();
+
+    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
+        val += conj_map(x.values[r * x.stride + rhs_index]) *
+               y.values[r * y.stride + rhs_index];
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<ValueType>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result.values[rhs_index] = val;
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__device__ __forceinline__ void compute_gen_dot_product(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<const ValueType>& y,
+    const gko::batch::multi_vector::batch_item<ValueType>& result,
+    Mapping conj_map)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups_per_block) {
+        gen_one_dot(x, y, rhs_index, result, subgroup, conj_map);
+    }
+}
+
+
+template <typename ValueType, typename Mapping>
+__global__
+__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> y,
+    const gko::batch::multi_vector::uniform_batch<ValueType> result,
+    Mapping map)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto y_b = gko::batch::extract_batch_item(y, batch_id);
+        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
+        compute_gen_dot_product(x_b, y_b, r_b, map);
+    }
+}
+
+
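+// 2-norm helpers: same cooperative pattern as the dot product above, but
+// accumulating squared_norm(x[r]) in the real type and applying sqrt to the
+// reduced value.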
+template <typename Group, typename ValueType>
+__device__ __forceinline__ void single_rhs_compute_norm2(
+    Group subgroup, const int num_rows, const ValueType* x,
+    remove_complex<ValueType>& result)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
+        val += squared_norm(x[r]);
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result = sqrt(val);
+    }
+}
+
+
+template <typename ValueType, typename Group>
+__device__ __forceinline__ void one_norm2(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const int rhs_index,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result,
+    Group subgroup)
+{
+    using real_type = typename gko::remove_complex<ValueType>;
+    real_type val = zero<real_type>();
+
+    for (int r = subgroup.thread_rank(); r < x.num_rows; r += subgroup.size()) {
+        val += squared_norm(x.values[r * x.stride + rhs_index]);
+    }
+
+    // subgroup level reduction
+    val = reduce(subgroup, val, thrust::plus<remove_complex<ValueType>>{});
+
+    if (subgroup.thread_rank() == 0) {
+        result.values[rhs_index] = sqrt(val);
+    }
+}
+
+
+/**
+ * Computes the 2-norms of some column vectors in global or shared memory.
+ *
+ * @param x  A row-major multivector with nrhs columns.
+ * @param result  Holds norm value for each vector in x.
+ */
+template <typename ValueType>
+__device__ __forceinline__ void compute_norm2(
+    const gko::batch::multi_vector::batch_item<const ValueType>& x,
+    const gko::batch::multi_vector::batch_item<remove_complex<ValueType>>&
+        result)
+{
+    constexpr auto tile_size = config::warp_size;
+    auto thread_block = group::this_thread_block();
+    auto subgroup = group::tiled_partition<tile_size>(thread_block);
+    const auto subgroup_id = static_cast<int>(threadIdx.x / tile_size);
+    const int num_subgroups_per_block = ceildiv(blockDim.x, tile_size);
+
+    for (int rhs_index = subgroup_id; rhs_index < x.num_rhs;
+         rhs_index += num_subgroups_per_block) {
+        one_norm2(x, rhs_index, result, subgroup);
+    }
+}
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<remove_complex<ValueType>>
+        result)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto x_b = gko::batch::extract_batch_item(x, batch_id);
+        const auto r_b = gko::batch::extract_batch_item(result, batch_id);
+        compute_norm2(x_b, r_b);
+    }
+}
+
+
+template <typename ValueType>
+__device__ __forceinline__ void single_rhs_copy(const int num_rows,
+                                                const ValueType* in,
+                                                ValueType* out)
+{
+    for (int iz = threadIdx.x; iz < num_rows; iz += blockDim.x) {
+        out[iz] = in[iz];
+    }
+}
+
+
+/**
+ * Copies the values of one multi-vector into another.
+ *
+ * Note that the output multi-vector should already have memory allocated
+ * and stride set.
+ */
+template <typename ValueType>
+__device__ __forceinline__ void copy(
+    const gko::batch::multi_vector::batch_item<const ValueType>& in,
+    const gko::batch::multi_vector::batch_item<ValueType>& out)
+{
+    for (int iz = threadIdx.x; iz < in.num_rows * in.num_rhs;
+         iz += blockDim.x) {
+        const int i = iz / in.num_rhs;
+        const int j = iz % in.num_rhs;
+        out.values[i * out.stride + j] = in.values[i * in.stride + j];
+    }
+}
+
+
+template <typename ValueType>
+__global__ __launch_bounds__(default_block_size) void copy_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> src,
+    const gko::batch::multi_vector::uniform_batch<ValueType> dst)
+{
+    for (size_type batch_id = blockIdx.x; batch_id < src.num_batch_items;
+         batch_id += gridDim.x) {
+        const auto dst_b = gko::batch::extract_batch_item(dst, batch_id);
+        const auto src_b = gko::batch::extract_batch_item(src, batch_id);
+        copy(src_b, dst_b);
+    }
+}
+
+
+}  // namespace batch_single_kernels
+}  // namespace batch_multi_vector
+}  // namespace GKO_DEVICE_NAMESPACE
+}  // namespace kernels
+}  // namespace gko
diff --git a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
index 9b6301674be..7af3c84303f 100644
--- a/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
+++ b/common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
@@ -20,8 +20,7 @@ __device__ __forceinline__ void scale(
 
 
 template <typename ValueType, typename Mapping>
-__global__
-__launch_bounds__(default_block_size, sm_oversubscription) void scale_kernel(
+__global__ __launch_bounds__(default_block_size) void scale_kernel(
     const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
     const gko::batch::multi_vector::uniform_batch<ValueType> x, Mapping map)
 {
@@ -52,20 +51,10 @@ __device__ __forceinline__ void add_scaled(
 
 
 template <typename ValueType, typename Mapping>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void add_scaled_kernel(const gko::batch::multi_vector::
-                                                    uniform_batch<
-                                                        const ValueType>
-                                                    alpha,
-                                                const gko::batch::multi_vector::
-                                                    uniform_batch<
-                                                        const ValueType>
-                                                    x,
-                                                const gko::batch::multi_vector::
-                                                    uniform_batch<ValueType>
-                                                    y,
-                                                Mapping map)
+__global__ __launch_bounds__(default_block_size) void add_scaled_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> alpha,
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<ValueType> y, Mapping map)
 {
     for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
          batch_id += gridDim.x) {
@@ -145,7 +134,7 @@ __device__ __forceinline__ void compute_gen_dot_product(
 template <typename ValueType, typename Mapping>
 __global__
-__launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_product_kernel(
+__launch_bounds__(default_block_size) void compute_gen_dot_product_kernel(
     const gko::batch::multi_vector::uniform_batch<const ValueType> x,
     const gko::batch::multi_vector::uniform_batch<const ValueType> y,
     const gko::batch::multi_vector::uniform_batch<ValueType> result,
     Mapping map)
@@ -232,19 +221,10 @@ __device__ __forceinline__ void compute_norm2(
 
 
 template <typename ValueType>
-__global__ __launch_bounds__(
-    default_block_size,
-    sm_oversubscription) void compute_norm2_kernel(const gko::batch::
-                                                       multi_vector::
-                                                           uniform_batch<
-                                                               const ValueType>
-                                                           x,
-                                                   const gko::batch::
-                                                       multi_vector::
-                                                           uniform_batch<
-                                                               remove_complex<
-                                                                   ValueType>>
-                                                           result)
+__global__ __launch_bounds__(default_block_size) void compute_norm2_kernel(
+    const gko::batch::multi_vector::uniform_batch<const ValueType> x,
+    const gko::batch::multi_vector::uniform_batch<remove_complex<ValueType>>
+        result)
 {
     for (size_type batch_id = blockIdx.x; batch_id < x.num_batch_items;
          batch_id += gridDim.x) {
@@ -287,8 +267,7 @@ __device__ __forceinline__ void copy(
 
 
 template <typename ValueType>
-__global__
-__launch_bounds__(default_block_size, sm_oversubscription) void copy_kernel(
+__global__ __launch_bounds__(default_block_size) void copy_kernel(
     const gko::batch::multi_vector::uniform_batch<const ValueType> src,
     const gko::batch::multi_vector::uniform_batch<ValueType> dst)
 {
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index d4a94eda802..3631a65f48d 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -7,7 +7,6 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp CUDA_FBCSR_INSTANTIATE_FILES)
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 target_sources(ginkgo_cuda
     PRIVATE
-    base/batch_multi_vector_kernels.cu
     base/device.cpp
     base/exception.cpp
     base/executor.cpp
diff --git a/cuda/base/batch_multi_vector_kernels.cu b/cuda/base/batch_multi_vector_kernels.cu
deleted file mode 100644
index 3dad5ba94f1..00000000000
--- a/cuda/base/batch_multi_vector_kernels.cu
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
-#include "common/cuda_hip/components/cooperative_groups.hpp"
-#include "common/cuda_hip/components/reduction.hpp"
-#include "common/cuda_hip/components/thread_ids.hpp"
-#include "common/cuda_hip/components/warp_blas.hpp"
-#include "core/base/batch_struct.hpp"
-#include "cuda/base/batch_struct.hpp"
-
-
-namespace gko {
-namespace kernels {
-namespace cuda {
-/**
- * @brief The MultiVector matrix format namespace.
- *
- * @ingroup batch_multi_vector
- */
-namespace batch_multi_vector {
-
-
-constexpr auto default_block_size = 256;
-constexpr int sm_oversubscription = 4;
-
-
-// clang-format off
-
-// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES
-
-#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc"
-
-
-#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc"
-
-// clang-format on
-
-
-}  // namespace batch_multi_vector
-}  // namespace cuda
-}  // namespace kernels
-}  // namespace gko
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index 30e675509d5..0882250dc0e 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -5,7 +5,6 @@ add_instantiation_files(${PROJECT_SOURCE_DIR}/common/cuda_hip matrix/fbcsr_kernels.instantiate.cpp HIP_FBCSR_INSTANTIATE_FILES)
 # we don't split up the dense kernels into distinct compilations
 list(APPEND GKO_UNIFIED_COMMON_SOURCES ${PROJECT_SOURCE_DIR}/common/unified/matrix/dense_kernels.instantiate.cpp)
 set(GINKGO_HIP_SOURCES
-    base/batch_multi_vector_kernels.hip.cpp
     base/device.hip.cpp
     base/exception.hip.cpp
     base/executor.hip.cpp
diff --git a/hip/base/batch_multi_vector_kernels.hip.cpp b/hip/base/batch_multi_vector_kernels.hip.cpp
deleted file mode 100644
index 701f4655a9a..00000000000
--- a/hip/base/batch_multi_vector_kernels.hip.cpp
+++ /dev/null
@@ -1,56 +0,0 @@
-// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
-//
-// SPDX-License-Identifier: BSD-3-Clause
-
-#include "core/base/batch_multi_vector_kernels.hpp"
-
-#include <thrust/functional.h>
-#include <thrust/transform.h>
-
-#include <ginkgo/core/base/batch_multi_vector.hpp>
-#include <ginkgo/core/base/math.hpp>
-
-#include "common/cuda_hip/base/blas_bindings.hpp"
-#include "common/cuda_hip/base/config.hpp"
-#include "common/cuda_hip/base/pointer_mode_guard.hpp"
-#include "common/cuda_hip/base/runtime.hpp"
-#include "common/cuda_hip/base/thrust.hpp"
"common/cuda_hip/components/cooperative_groups.hpp" -#include "common/cuda_hip/components/reduction.hpp" -#include "common/cuda_hip/components/thread_ids.hpp" -#include "common/cuda_hip/components/uninitialized_array.hpp" -#include "core/base/batch_struct.hpp" -#include "hip/base/batch_struct.hip.hpp" - - -namespace gko { -namespace kernels { -namespace hip { -/** - * @brief The MultiVector matrix format namespace. - * - * @ingroup batch_multi_vector - */ -namespace batch_multi_vector { - - -constexpr auto default_block_size = 256; -constexpr int sm_oversubscription = 4; - - -// clang-format off - -// NOTE: DO NOT CHANGE THE ORDERING OF THE INCLUDES - -#include "common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc" - - -#include "common/cuda_hip/base/batch_multi_vector_kernel_launcher.hpp.inc" - -// clang-format on - - -} // namespace batch_multi_vector -} // namespace hip -} // namespace kernels -} // namespace gko