Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CUDA, HIP and DPCPP batch bicgstab kernels #1443

Merged
merged 28 commits into from
Nov 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
ff31a16
Add cuda batch bicgstab kernels
pratikvn Oct 26, 2023
b357c6b
Add hip bicgstab solver kernels
pratikvn Oct 26, 2023
cf9839a
Add dpcpp kernels
pratikvn Oct 27, 2023
1ef1f68
Fix dpcpp kernel issues
pratikvn Oct 28, 2023
6bcdd57
add mvec single rhs specializations
pratikvn Oct 29, 2023
20fe495
minor dpcpp fixes
pratikvn Oct 29, 2023
2a41fd7
Review updates
pratikvn Oct 29, 2023
5c0f4f4
Fix sycl group and subgroup sizes
pratikvn Oct 29, 2023
a3fe9bb
Format files
ginkgo-bot Oct 29, 2023
4072b50
Review updates
pratikvn Oct 30, 2023
84be7dd
Use synchronize for error handling
pratikvn Nov 1, 2023
c6e9543
Format files
ginkgo-bot Nov 1, 2023
1054b7b
Add scoped cuda shmem config
pratikvn Nov 1, 2023
cc22557
move max_shmem query to internal
pratikvn Nov 1, 2023
501c4e7
Update size_type in tests
pratikvn Nov 2, 2023
7b0ebfd
Update contributors.txt
pratikvn Nov 2, 2023
f1babfd
review updates
pratikvn Nov 2, 2023
221bba9
Format files
ginkgo-bot Nov 2, 2023
aa026c1
dpcpp group size and doc fixes
pratikvn Nov 2, 2023
79e5cad
use global_and_local barrier
pratikvn Nov 3, 2023
693d308
Fix Intel2020 apply call issue
pratikvn Nov 3, 2023
705339e
Fix diag_dominance and tol issue
pratikvn Nov 3, 2023
6729f68
Fix some include issues
pratikvn Nov 4, 2023
eebc06a
Review updates
pratikvn Nov 4, 2023
498512c
use fence_space::global_and_local
pratikvn Nov 4, 2023
1bc6d83
Use updated deferred factory macros.
pratikvn Nov 5, 2023
79e68b3
Review updates
pratikvn Nov 5, 2023
a1b84d4
Format files
ginkgo-bot Nov 5, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 54 additions & 0 deletions common/cuda_hip/base/batch_multi_vector_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,28 @@ __global__ __launch_bounds__(
}


// Computes the conjugated dot product conj(x) . y of two single-rhs vectors
// cooperatively within one subgroup: each lane accumulates a subgroup-strided
// partial sum, the partials are reduced across the subgroup, and lane 0
// writes the final value into `result`.
template <typename Group, typename ValueType>
__device__ __forceinline__ void single_rhs_compute_conj_dot(Group subgroup,
                                                            const int num_rows,
                                                            const ValueType* x,
                                                            const ValueType* y,
                                                            ValueType& result)
{
    auto partial = zero<ValueType>();
    for (int row = subgroup.thread_rank(); row < num_rows;
         row += subgroup.size()) {
        partial += conj(x[row]) * y[row];
    }

    // combine the per-lane partial sums across the subgroup
    partial = reduce(subgroup, partial, thrust::plus<ValueType>{});

    // only a single lane stores the reduced value
    if (subgroup.thread_rank() == 0) {
        result = partial;
    }
}


template <typename Group, typename ValueType, typename Mapping>
__device__ __forceinline__ void gen_one_dot(
const gko::batch::multi_vector::batch_item<const ValueType>& x,
Expand Down Expand Up @@ -165,6 +187,27 @@ __launch_bounds__(default_block_size, sm_oversubscription) void compute_gen_dot_
}


// Computes the Euclidean (2-)norm of a single-rhs vector cooperatively
// within one subgroup: each lane accumulates a subgroup-strided partial sum
// of squared absolute values, the partials are reduced across the subgroup,
// and lane 0 writes sqrt of the total into `result`.
template <typename Group, typename ValueType>
__device__ __forceinline__ void single_rhs_compute_norm2(
    Group subgroup, const int num_rows, const ValueType* x,
    remove_complex<ValueType>& result)
{
    // remove_complex is an alias template, so no `typename` is required
    using real_type = remove_complex<ValueType>;
    real_type val = zero<real_type>();

    for (int r = subgroup.thread_rank(); r < num_rows; r += subgroup.size()) {
        val += squared_norm(x[r]);
    }

    // subgroup level reduction; use real_type consistently with the
    // accumulator instead of re-spelling remove_complex<ValueType>
    val = reduce(subgroup, val, thrust::plus<real_type>{});

    if (subgroup.thread_rank() == 0) {
        result = sqrt(val);
    }
}


template <typename Group, typename ValueType>
__device__ __forceinline__ void one_norm2(
const gko::batch::multi_vector::batch_item<const ValueType>& x,
Expand Down Expand Up @@ -238,6 +281,17 @@ __global__ __launch_bounds__(
}


// Copies a single-rhs vector of length `num_rows` from `in` to `out`,
// parallelized over the threads of the calling thread block.
template <typename ValueType>
__device__ __forceinline__ void single_rhs_copy(const int num_rows,
                                                const ValueType* in,
                                                ValueType* out)
{
    // block-strided loop: thread t handles rows t, t + blockDim.x, ...
    for (int row = threadIdx.x; row < num_rows; row += blockDim.x) {
        out[row] = in[row];
    }
}


/**
* Copies the values of one multi-vector into another.
*
Expand Down
2 changes: 1 addition & 1 deletion common/cuda_hip/log/batch_logger.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
template <typename RealType>
class SimpleFinalLogger final {
public:
using real_type = remove_complex<RealType>;
using real_type = RealType;

SimpleFinalLogger(real_type* const batch_residuals, int* const batch_iters)
: final_residuals_{batch_residuals}, final_iters_{batch_iters}
Expand Down
13 changes: 3 additions & 10 deletions common/cuda_hip/preconditioner/batch_identity.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -45,16 +45,9 @@ public:
return 0;
}

__device__ __forceinline__ void generate(
size_type,
const gko::batch::matrix::ell::batch_item<const ValueType, gko::int32>&,
ValueType*)
{}

__device__ __forceinline__ void generate(
size_type,
const gko::batch::matrix::dense::batch_item<const ValueType>&,
ValueType*)
// Generating the identity preconditioner is a no-op: nothing is computed
// and the workspace pointer is never written. Templated on the batch item
// type so a single overload covers every batched matrix format.
template <typename batch_item_type>
__device__ __forceinline__ void generate(size_type, const batch_item_type&,
ValueType*)
{}

__device__ __forceinline__ void apply(const int num_rows,
Expand Down
Loading
Loading