Skip to content

Commit

Permalink
[prec] implement sycl sor kernels
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcelKoch committed Jul 10, 2024
1 parent 903862b commit 720c76b
Show file tree
Hide file tree
Showing 3 changed files with 194 additions and 94 deletions.
110 changes: 110 additions & 0 deletions dpcpp/factorization/factorization_helpers.dp.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors
//
// SPDX-License-Identifier: BSD-3-Clause

#include <CL/sycl.hpp>

#include "core/factorization/factorization_helpers.hpp"
#include "dpcpp/base/config.hpp"
#include "dpcpp/base/dim3.dp.hpp"
#include "dpcpp/base/dpct.hpp"
#include "dpcpp/components/thread_ids.dp.hpp"


namespace gko {
namespace kernels {
namespace dpcpp {
namespace factorization {
namespace helpers {

using namespace ::gko::factorization;


template <typename ValueType, typename IndexType, typename LClosure,
typename UClosure>
void initialize_l_u(size_type num_rows, const IndexType* __restrict__ row_ptrs,
const IndexType* __restrict__ col_idxs,
const ValueType* __restrict__ values,
const IndexType* __restrict__ l_row_ptrs,
IndexType* __restrict__ l_col_idxs,
ValueType* __restrict__ l_values,
const IndexType* __restrict__ u_row_ptrs,
IndexType* __restrict__ u_col_idxs,
ValueType* __restrict__ u_values, LClosure l_closure,
UClosure u_closure, sycl::nd_item<3> item_ct1)
{
const auto row = thread::get_thread_id_flat<IndexType>(item_ct1);
if (row < num_rows) {
auto l_idx = l_row_ptrs[row];
auto u_idx = u_row_ptrs[row] + 1; // we treat the diagonal separately
// default diagonal to one
auto diag_val = one<ValueType>();
for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
const auto col = col_idxs[i];
const auto val = values[i];
// save diagonal entry for later
if (col == row) {
diag_val = val;
}
if (col < row) {
l_col_idxs[l_idx] = col;
l_values[l_idx] = l_closure.map_off_diag(val);
++l_idx;
}
if (row < col) {
u_col_idxs[u_idx] = col;
u_values[u_idx] = u_closure.map_off_diag(val);
++u_idx;
}
}
// store diagonal entries
auto l_diag_idx = l_row_ptrs[row + 1] - 1;
auto u_diag_idx = u_row_ptrs[row];
l_col_idxs[l_diag_idx] = row;
u_col_idxs[u_diag_idx] = row;
l_values[l_diag_idx] = l_closure.map_diag(diag_val);
u_values[u_diag_idx] = u_closure.map_diag(diag_val);
}
}


template <typename ValueType, typename IndexType, typename LClosure>
void initialize_l(size_type num_rows, const IndexType* __restrict__ row_ptrs,
const IndexType* __restrict__ col_idxs,
const ValueType* __restrict__ values,
const IndexType* __restrict__ l_row_ptrs,
IndexType* __restrict__ l_col_idxs,
ValueType* __restrict__ l_values, LClosure l_closure,
sycl::nd_item<3> item_ct1)
{
const auto row = thread::get_thread_id_flat<IndexType>(item_ct1);
if (row < num_rows) {
auto l_idx = l_row_ptrs[row];
// if there was no diagonal entry, default to one
auto diag_val = one<ValueType>();
for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
const auto col = col_idxs[i];
const auto val = values[i];
// save diagonal entry for later
if (col == row) {
diag_val = val;
}
if (col < row) {
l_col_idxs[l_idx] = col;
l_values[l_idx] = l_closure.map_off_diag(val);
++l_idx;
}
}
// store diagonal entries
auto l_diag_idx = l_row_ptrs[row + 1] - 1;
l_col_idxs[l_diag_idx] = row;
l_values[l_diag_idx] = l_closure.map_diag(diag_val);
}
}


} // namespace helpers
} // namespace factorization
} // namespace dpcpp
} // namespace kernels
} // namespace gko
114 changes: 23 additions & 91 deletions dpcpp/factorization/factorization_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "dpcpp/components/intrinsics.dp.hpp"
#include "dpcpp/components/searching.dp.hpp"
#include "dpcpp/components/thread_ids.dp.hpp"
#include "dpcpp/factorization/factorization_helpers.dp.hpp"


namespace gko {
Expand Down Expand Up @@ -320,51 +321,6 @@ void count_nnz_per_l_u_row(dim3 grid, dim3 block,
}


template <typename ValueType, typename IndexType>
void initialize_l_u(size_type num_rows, const IndexType* __restrict__ row_ptrs,
const IndexType* __restrict__ col_idxs,
const ValueType* __restrict__ values,
const IndexType* __restrict__ l_row_ptrs,
IndexType* __restrict__ l_col_idxs,
ValueType* __restrict__ l_values,
const IndexType* __restrict__ u_row_ptrs,
IndexType* __restrict__ u_col_idxs,
ValueType* __restrict__ u_values, sycl::nd_item<3> item_ct1)
{
const auto row = thread::get_thread_id_flat<IndexType>(item_ct1);
if (row < num_rows) {
auto l_idx = l_row_ptrs[row];
auto u_idx = u_row_ptrs[row] + 1; // we treat the diagonal separately
// default diagonal to one
auto diag_val = one<ValueType>();
for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
const auto col = col_idxs[i];
const auto val = values[i];
// save diagonal entry for later
if (col == row) {
diag_val = val;
}
if (col < row) {
l_col_idxs[l_idx] = col;
l_values[l_idx] = val;
++l_idx;
}
if (row < col) {
u_col_idxs[u_idx] = col;
u_values[u_idx] = val;
++u_idx;
}
}
// store diagonal entries
auto l_diag_idx = l_row_ptrs[row + 1] - 1;
auto u_diag_idx = u_row_ptrs[row];
l_col_idxs[l_diag_idx] = row;
u_col_idxs[u_diag_idx] = row;
l_values[l_diag_idx] = one<ValueType>();
u_values[u_diag_idx] = diag_val;
}
}

template <typename ValueType, typename IndexType>
void initialize_l_u(dim3 grid, dim3 block, size_type dynamic_shared_memory,
sycl::queue* queue, size_type num_rows,
Expand All @@ -376,9 +332,14 @@ void initialize_l_u(dim3 grid, dim3 block, size_type dynamic_shared_memory,
{
queue->parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
initialize_l_u(num_rows, row_ptrs, col_idxs, values, l_row_ptrs,
l_col_idxs, l_values, u_row_ptrs, u_col_idxs,
u_values, item_ct1);
helpers::initialize_l_u(
num_rows, row_ptrs, col_idxs, values, l_row_ptrs, l_col_idxs,
l_values, u_row_ptrs, u_col_idxs, u_values,
helpers::triangular_mtx_closure(
[](auto) { return one<ValueType>(); }, helpers::identity{}),
helpers::triangular_mtx_closure(helpers::identity{},
helpers::identity{}),
item_ct1);
});
}

Expand Down Expand Up @@ -418,47 +379,6 @@ void count_nnz_per_l_row(dim3 grid, dim3 block, size_type dynamic_shared_memory,
}


template <typename ValueType, typename IndexType>
void initialize_l(size_type num_rows, const IndexType* __restrict__ row_ptrs,
const IndexType* __restrict__ col_idxs,
const ValueType* __restrict__ values,
const IndexType* __restrict__ l_row_ptrs,
IndexType* __restrict__ l_col_idxs,
ValueType* __restrict__ l_values, bool use_sqrt,
sycl::nd_item<3> item_ct1)
{
const auto row = thread::get_thread_id_flat<IndexType>(item_ct1);
if (row < num_rows) {
auto l_idx = l_row_ptrs[row];
// if there was no diagonal entry, default to one
auto diag_val = one<ValueType>();
for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
const auto col = col_idxs[i];
const auto val = values[i];
// save diagonal entry for later
if (col == row) {
diag_val = val;
}
if (col < row) {
l_col_idxs[l_idx] = col;
l_values[l_idx] = val;
++l_idx;
}
}
// store diagonal entries
auto l_diag_idx = l_row_ptrs[row + 1] - 1;
l_col_idxs[l_diag_idx] = row;
// compute square root with sentinel
if (use_sqrt) {
diag_val = std::sqrt(diag_val);
if (!is_finite(diag_val)) {
diag_val = one<ValueType>();
}
}
l_values[l_diag_idx] = diag_val;
}
}

template <typename ValueType, typename IndexType>
void initialize_l(dim3 grid, dim3 block, size_type dynamic_shared_memory,
sycl::queue* queue, size_type num_rows,
Expand All @@ -468,8 +388,20 @@ void initialize_l(dim3 grid, dim3 block, size_type dynamic_shared_memory,
{
queue->parallel_for(
sycl_nd_range(grid, block), [=](sycl::nd_item<3> item_ct1) {
initialize_l(num_rows, row_ptrs, col_idxs, values, l_row_ptrs,
l_col_idxs, l_values, use_sqrt, item_ct1);
helpers::initialize_l(num_rows, row_ptrs, col_idxs, values,
l_row_ptrs, l_col_idxs, l_values,
helpers::triangular_mtx_closure(
[use_sqrt](auto val) {
if (use_sqrt) {
val = sqrt(val);
if (!is_finite(val)) {
val = one<ValueType>();
}
}
return val;
},
helpers::identity{}),
item_ct1);
});
}

Expand Down
64 changes: 61 additions & 3 deletions dpcpp/preconditioner/sor_kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,44 @@
#include <ginkgo/core/base/math.hpp>
#include <ginkgo/core/matrix/csr.hpp>

#include "dpcpp/factorization/factorization_helpers.dp.hpp"

namespace gko {
namespace kernels {
namespace dpcpp {
namespace sor {


constexpr int default_block_size{256};


template <typename ValueType, typename IndexType>
void initialize_weighted_l(
std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* system_matrix,
remove_complex<ValueType> weight,
matrix::Csr<ValueType, IndexType>* l_mtx) GKO_NOT_IMPLEMENTED;
remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx)
{
const size_type num_rows{system_matrix->get_size()[0]};
const dim3 block_size{default_block_size, 1, 1};
const dim3 grid_dim{static_cast<uint32>(ceildiv(
num_rows, static_cast<size_type>(block_size.x))),
1, 1};

auto inv_weight = one(weight) / weight;

exec->get_queue()->parallel_for(
sycl_nd_range(grid_dim, block_size), [=](sycl::nd_item<3> item_ct1) {
factorization::kernel::initialize_l(
num_rows, system_matrix->get_const_row_ptrs(),
system_matrix->get_const_col_idxs(),
system_matrix->get_const_values(), l_mtx->get_const_row_ptrs(),
l_mtx->get_col_idxs(), l_mtx->get_values(),
factorization::kernel::triangular_mtx_closure(
[inv_weight](auto val) { return val * inv_weight; },
[](auto val) { return val; }),
item_ct1);
});
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L);
Expand All @@ -30,7 +55,40 @@ void initialize_weighted_l_u(
std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<ValueType, IndexType>* system_matrix,
remove_complex<ValueType> weight, matrix::Csr<ValueType, IndexType>* l_mtx,
matrix::Csr<ValueType, IndexType>* u_mtx) GKO_NOT_IMPLEMENTED;
matrix::Csr<ValueType, IndexType>* u_mtx)
{
const size_type num_rows{system_matrix->get_size()[0]};
const dim3 block_size{default_block_size, 1, 1};
const dim3 grid_dim{static_cast<uint32>(ceildiv(
num_rows, static_cast<size_type>(block_size.x))),
1, 1};

auto inv_weight = one(weight) / weight;
auto inv_two_minus_weight =
one(weight) / (static_cast<remove_complex<ValueType>>(2.0) - weight);

exec->get_queue()->parallel_for(
sycl_nd_range(grid_dim, block_size), [=](sycl::nd_item<3> item_ct1) {
initialize_l_u(
num_rows, system_matrix->get_const_row_ptrs(),
system_matrix->get_const_col_idxs(),
system_matrix->get_const_values(), l_mtx->get_const_row_ptrs(),
l_mtx->get_col_idxs(), l_mtx->get_values(),
u_mtx->get_const_row_ptrs(), u_mtx->get_col_idxs(),
u_mtx->get_values(),
factorization::kernel::triangular_mtx_closure(
[inv_weight](auto val) { return val * inv_weight; },
factorization::kernel::identity{}),
factorization::kernel::triangular_mtx_closure(
[inv_two_minus_weight](auto val) {
return val * inv_two_minus_weight;
},
[weight, inv_two_minus_weight](auto val) {
return val * weight * inv_two_minus_weight;
}),
item_ct1);
});
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_SOR_INITIALIZE_WEIGHTED_L_U);
Expand Down

0 comments on commit 720c76b

Please sign in to comment.