Skip to content

Commit

Permalink
Implement specialization constants
Browse files Browse the repository at this point in the history
  • Loading branch information
fknorr committed Jan 5, 2024
1 parent b74370d commit 4dd312c
Show file tree
Hide file tree
Showing 5 changed files with 167 additions and 56 deletions.
86 changes: 60 additions & 26 deletions include/simsycl/detail/parallel_for.hh
Original file line number Diff line number Diff line change
Expand Up @@ -64,64 +64,88 @@ template<int Dimensions>
void cooperative_for_nd_range(const sycl::device &device, const sycl::nd_range<Dimensions> &range,
const std::vector<local_memory_requirement> &local_memory, const nd_kernel<Dimensions> &kernel);

template<typename KernelName, int Dimensions, typename Offset, typename KernelFunc, typename... Params>
void execute_parallel_for(
const sycl::range<Dimensions> &range, const Offset &offset, KernelFunc &&func, Params &&...args) {
template<typename KernelName, int Dimensions, typename Offset, typename KernelFunc, typename... Reducers>
void execute_parallel_for(const sycl::range<Dimensions> &range, const Offset &offset, sycl::kernel_handler kh,
const KernelFunc &func,
Reducers &...reducers) //
{
register_kernel_on_static_construction<KernelName, KernelFunc>();
const simple_kernel<Dimensions, with_offset_v<Offset>> kernel(
[&](const sycl::item<Dimensions> &item) { func(item, std::forward<Params>(args)...); });

simple_kernel<Dimensions, with_offset_v<Offset>> kernel;
if constexpr(std::is_invocable_v<const KernelFunc, sycl::item<Dimensions, with_offset_v<Offset>>, Reducers &...,
sycl::kernel_handler>) {
kernel = [&](const sycl::item<Dimensions> &item) { func(item, reducers..., kh); };
} else {
static_assert(
std::is_invocable_v<const KernelFunc, sycl::item<Dimensions, with_offset_v<Offset>>, Reducers &...>);
kernel = [&](const sycl::item<Dimensions> &item) { func(item, reducers...); };
}
sequential_for(range, offset, kernel);
}

template<typename KernelName, int Dimensions, typename KernelFunc, typename... Params>
template<typename KernelName, int Dimensions, typename KernelFunc, typename... Reducers>
void execute_parallel_for(const sycl::device &device, const sycl::nd_range<Dimensions> &range,
const std::vector<local_memory_requirement> &local_memory, KernelFunc &&func, Params &&...args) {
const nd_kernel<Dimensions> kernel(
[&](const sycl::nd_item<Dimensions> &item) { func(item, std::forward<Params>(args)...); });
const std::vector<local_memory_requirement> &local_memory, sycl::kernel_handler kh, const KernelFunc &func,
Reducers &...reducers) //
{
register_kernel_on_static_construction<KernelName, KernelFunc>();

nd_kernel<Dimensions> kernel;
if constexpr(std::is_invocable_v<const KernelFunc, sycl::nd_item<Dimensions>, Reducers &...,
sycl::kernel_handler>) {
kernel = [&](const sycl::nd_item<Dimensions> &item) { func(item, reducers..., kh); };
} else {
static_assert(std::is_invocable_v<const KernelFunc, sycl::nd_item<Dimensions>, Reducers &...>);
kernel = [&](const sycl::nd_item<Dimensions> &item) { func(item, reducers...); };
}
cooperative_for_nd_range(device, range, local_memory, kernel);
}

template<typename KernelName, typename KernelFunc>
void execute_single_task(KernelFunc &&func) {
void execute_single_task(sycl::kernel_handler kh, KernelFunc &&func) {
register_kernel_on_static_construction<KernelName, KernelFunc>();
func();
if constexpr(std::is_invocable_v<const KernelFunc, sycl::kernel_handler>) {
func(kh);
} else {
static_assert(std::is_invocable_v<const KernelFunc>);
func();
}
}

template<typename KernelName, int Dimensions, typename ParamTuple, size_t... ReductionIndices, size_t KernelIndex>
void dispatch_parallel_for(const sycl::range<Dimensions> &range, ParamTuple &&params,
void dispatch_parallel_for(const sycl::range<Dimensions> &range, sycl::kernel_handler kh, ParamTuple &&params,
std::index_sequence<ReductionIndices...> /* reduction_indices */,
std::index_sequence<KernelIndex> /* kernel_index */) {
auto &kernel_func = std::get<KernelIndex>(params);
execute_parallel_for<KernelName>(range, no_offset, kernel_func, std::get<ReductionIndices>(params)...);
execute_parallel_for<KernelName>(range, no_offset, kh, kernel_func, std::get<ReductionIndices>(params)...);
}

template<typename KernelName, int Dimensions, typename ParamTuple, size_t... ReductionIndices, size_t KernelIndex>
template<typename KernelName, int Dimensions, typename RestTuple, size_t... ReductionIndices, size_t KernelIndex>
void dispatch_parallel_for(const sycl::device &device, const sycl::nd_range<Dimensions> &range,
const std::vector<local_memory_requirement> &local_memory, ParamTuple &&params,
const std::vector<local_memory_requirement> &local_memory, sycl::kernel_handler kh, RestTuple &&rest,
std::index_sequence<ReductionIndices...> /* reduction_indices */,
std::index_sequence<KernelIndex> /* kernel_index */) {
const auto &kernel_func = std::get<KernelIndex>(params);
execute_parallel_for<KernelName>(device, range, local_memory, kernel_func, std::get<ReductionIndices>(params)...);
const auto &kernel_func = std::get<KernelIndex>(rest);
execute_parallel_for<KernelName>(device, range, local_memory, kh, kernel_func, std::get<ReductionIndices>(rest)...);
}

template<typename KernelName, int Dimensions, typename... Rest, std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
void parallel_for(sycl::range<Dimensions> num_work_items, Rest &&...rest) {
dispatch_parallel_for<KernelName>(num_work_items, std::forward_as_tuple(std::forward<Rest>(rest)...),
void parallel_for(sycl::range<Dimensions> num_work_items, sycl::kernel_handler kh, Rest &&...rest) {
dispatch_parallel_for<KernelName>(num_work_items, kh, std::forward_as_tuple(std::forward<Rest>(rest)...),
std::make_index_sequence<sizeof...(Rest) - 1>(), std::index_sequence<sizeof...(Rest) - 1>());
}

template<typename KernelName, typename KernelFunc, int Dimensions>
void parallel_for(
sycl::range<Dimensions> num_work_items, sycl::id<Dimensions> work_item_offset, KernelFunc &&kernel_func) {
execute_parallel_for<KernelName>(num_work_items, work_item_offset, kernel_func);
void parallel_for(sycl::range<Dimensions> num_work_items, sycl::id<Dimensions> work_item_offset,
sycl::kernel_handler kh, const KernelFunc &kernel_func) {
execute_parallel_for<KernelName>(num_work_items, work_item_offset, kh, kernel_func);
}

template<typename KernelName = unnamed_kernel, int Dimensions, typename... Rest,
std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
void parallel_for(const sycl::device &device, sycl::nd_range<Dimensions> execution_range,
const std::vector<local_memory_requirement> &local_memory, Rest &&...rest) {
detail::dispatch_parallel_for<KernelName>(device, execution_range, local_memory,
const std::vector<local_memory_requirement> &local_memory, sycl::kernel_handler kh, Rest &&...rest) {
detail::dispatch_parallel_for<KernelName>(device, execution_range, local_memory, kh,
std::forward_as_tuple(std::forward<Rest>(rest)...), std::make_index_sequence<sizeof...(Rest) - 1>(),
std::index_sequence<sizeof...(Rest) - 1>());
}
Expand All @@ -133,10 +157,20 @@ template<int Dimensions>
template<typename KernelName, int Dimensions, typename WorkgroupFunctionType>
void parallel_for_work_group(const sycl::device &device, sycl::range<Dimensions> num_work_groups,
std::optional<sycl::range<Dimensions>> work_group_size, const std::vector<local_memory_requirement> &local_memory,
const WorkgroupFunctionType &kernel_func) {
sycl::kernel_handler kh, const WorkgroupFunctionType &kernel_func) //
{
register_kernel_on_static_construction<KernelName, WorkgroupFunctionType>();

hierarchical_kernel<Dimensions> kernel;
if constexpr(std::is_invocable_v<const WorkgroupFunctionType, sycl::group<Dimensions>, sycl::kernel_handler>) {
kernel = [&](const sycl::group<Dimensions> &group) { kernel_func(group, kh); };
} else {
static_assert(std::is_invocable_v<const WorkgroupFunctionType, sycl::group<Dimensions>>);
kernel = kernel_func;
}

const auto local_allocations = prepare_hierarchical_parallel_for(device, work_group_size, local_memory);
sequential_for_work_group(num_work_groups, work_group_size, hierarchical_kernel<Dimensions>(kernel_func));
sequential_for_work_group(num_work_groups, work_group_size, kernel);
}

} // namespace simsycl::detail
3 changes: 3 additions & 0 deletions include/simsycl/sycl/forward.hh
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,9 @@ class sampled_image;
template<typename DataT, int Dimensions, image_target AccessTarget = image_target::device>
class sampled_image_accessor;

template<typename T>
class specialization_id;

class stream;

template<typename DataT, int Dimensions, access_mode AccessMode, image_target AccessTarget = image_target::device>
Expand Down
50 changes: 41 additions & 9 deletions include/simsycl/sycl/handler.hh
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <any>
#include <cassert>
#include <cstddef>
#include <cstring>
Expand Down Expand Up @@ -72,43 +73,45 @@ class handler {

template<typename KernelName = simsycl::detail::unnamed_kernel, typename KernelType>
void single_task(const KernelType &kernel_func) {
detail::execute_single_task<KernelName>(kernel_func);
detail::execute_single_task<KernelName>(kernel_handler(this), kernel_func);
}

template<typename KernelName = simsycl::detail::unnamed_kernel, typename... Rest>
requires(sizeof...(Rest) > 0)
void parallel_for(size_t num_work_items, Rest &&...rest) {
detail::parallel_for<KernelName>(range<1>(num_work_items), std::forward<Rest>(rest)...);
detail::parallel_for<KernelName>(range<1>(num_work_items), kernel_handler(this), std::forward<Rest>(rest)...);
}

template<typename KernelName = simsycl::detail::unnamed_kernel, int Dimensions, typename... Rest>
requires(sizeof...(Rest) > 0 && Dimensions > 0)
void parallel_for(range<Dimensions> num_work_items, Rest &&...rest) {
detail::parallel_for<KernelName>(num_work_items, std::forward<Rest>(rest)...);
detail::parallel_for<KernelName>(num_work_items, kernel_handler(this), std::forward<Rest>(rest)...);
}

template<typename KernelName = simsycl::detail::unnamed_kernel, typename KernelType, int Dimensions>
SIMSYCL_DETAIL_DEPRECATED_IN_SYCL void parallel_for(
range<Dimensions> num_work_items, id<Dimensions> work_item_offset, KernelType &&kernel_func) {
detail::parallel_for<KernelName>(num_work_items, work_item_offset, kernel_func);
detail::parallel_for<KernelName>(num_work_items, work_item_offset, kernel_handler(this), kernel_func);
}

template<typename KernelName = simsycl::detail::unnamed_kernel, int Dimensions, typename... Rest>
requires(sizeof...(Rest) > 0)
void parallel_for(nd_range<Dimensions> execution_range, Rest &&...rest) {
detail::parallel_for<KernelName>(m_device, execution_range, m_local_memory, std::forward<Rest>(rest)...);
detail::parallel_for<KernelName>(
m_device, execution_range, m_local_memory, kernel_handler(this), std::forward<Rest>(rest)...);
}

template<typename KernelName = simsycl::detail::unnamed_kernel, typename WorkgroupFunctionType, int Dimensions>
void parallel_for_work_group(range<Dimensions> num_work_groups, const WorkgroupFunctionType &kernel_func) {
detail::parallel_for_work_group<KernelName>(m_device, num_work_groups, {}, m_local_memory, kernel_func);
detail::parallel_for_work_group<KernelName>(
m_device, num_work_groups, {}, m_local_memory, kernel_handler(this), kernel_func);
}

template<typename KernelName = simsycl::detail::unnamed_kernel, typename WorkgroupFunctionType, int Dimensions>
void parallel_for_work_group(range<Dimensions> num_work_groups, range<Dimensions> work_group_size,
const WorkgroupFunctionType &kernel_func) {
detail::parallel_for_work_group<KernelName>(
m_device, num_work_groups, {work_group_size}, m_local_memory, kernel_func);
m_device, num_work_groups, {work_group_size}, m_local_memory, kernel_handler(this), kernel_func);
}

void single_task(const kernel &kernel_object) {
Expand Down Expand Up @@ -206,21 +209,50 @@ class handler {
void use_kernel_bundle(const kernel_bundle<bundle_state::executable> &exec_bundle);

template<auto &SpecName>
void set_specialization_constant(typename std::remove_reference_t<decltype(SpecName)>::value_type value);
void set_specialization_constant(typename std::remove_reference_t<decltype(SpecName)>::value_type value) {
static_assert(detail::is_specialization_id_v<std::remove_cvref_t<decltype(SpecName)>>);
if(auto existing = find_specialization_constant(this, &SpecName)) {
*existing = value;
} else {
m_specialization_constants.emplace_back(&SpecName, value);
}
}

template<auto &SpecName>
typename std::remove_reference_t<decltype(SpecName)>::value_type get_specialization_constant();
typename std::remove_reference_t<decltype(SpecName)>::value_type get_specialization_constant() const {
static_assert(detail::is_specialization_id_v<std::remove_cvref_t<decltype(SpecName)>>);
if(auto existing = find_specialization_constant(this, &SpecName)) {
return std::any_cast<typename std::remove_reference_t<decltype(SpecName)>::value_type>(*existing);
}
return detail::get_specialization_default(SpecName);
}

private:
friend handler simsycl::detail::make_handler(const sycl::device &device);
friend void **simsycl::detail::require_local_memory(handler &cgh, size_t size, size_t align);

device m_device;
std::vector<detail::local_memory_requirement> m_local_memory;
std::vector<std::pair<const void *, std::any>> m_specialization_constants;

explicit handler(const device &device) : m_device(device) {}

static auto find_specialization_constant(auto self, const void *spec_id)
-> decltype(&self->m_specialization_constants[0].second) {
if(const auto it = std::find_if(self->m_specialization_constants.begin(),
self->m_specialization_constants.end(), [&](const auto &pair) { return pair.first == spec_id; });
it != self->m_specialization_constants.end()) {
return &it->second;
}
return nullptr;
}
};

template<auto &SpecName>
typename std::remove_reference_t<decltype(SpecName)>::value_type kernel_handler::get_specialization_constant() {
return m_cgh->get_specialization_constant<SpecName>();
}

} // namespace simsycl::sycl

namespace simsycl::detail {
Expand Down
63 changes: 57 additions & 6 deletions include/simsycl/sycl/kernel.hh
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,63 @@
#include <vector>


namespace simsycl::detail {

template<typename T>
struct is_specialization_id : std::false_type {};

template<typename T>
struct is_specialization_id<sycl::specialization_id<T>> : std::true_type {};

template<typename T>
inline constexpr bool is_specialization_id_v = is_specialization_id<T>::value;

template<typename T>
const T &get_specialization_default(const sycl::specialization_id<T> &id) {
return id.m_default_value;
}

} // namespace simsycl::detail

namespace simsycl::sycl {

template<typename T>
class specialization_id {
public:
using value_type = T;

template<class... Args>
explicit constexpr specialization_id(Args &&...args)
requires(std::is_constructible_v<T, Args...>)
: m_default_value(std::forward<Args>(args)...) {}

specialization_id(const specialization_id &rhs) = delete;
specialization_id(specialization_id &&rhs) = delete;
specialization_id &operator=(const specialization_id &rhs) = delete;
specialization_id &operator=(specialization_id &&rhs) = delete;

private:
template<typename U>
friend const U &detail::get_specialization_default(const sycl::specialization_id<U> &id);

value_type m_default_value;
};

class kernel_handler {
public:
template<auto &SpecName>
typename std::remove_reference_t<decltype(SpecName)>::value_type get_specialization_constant();
// implemented in handler.hh

private:
friend class handler;
explicit kernel_handler(handler *cgh) : m_cgh(cgh) {}

handler *m_cgh;
};

} // namespace simsycl::sycl

namespace simsycl::detail {

struct kernel_id_state {
Expand Down Expand Up @@ -226,12 +283,6 @@ kernel kernel_bundle<State>::get_kernel() const
return get_kernel(get_kernel_id<KernelName>());
}

class kernel_handler {
public:
template<auto &SpecName>
typename std::remove_reference_t<decltype(SpecName)>::value_type get_specialization_constant();
};

template<typename KernelName>
kernel_id get_kernel_id() {
return detail::get_kernel_id(typeid(KernelName *));
Expand Down
21 changes: 6 additions & 15 deletions include/simsycl/sycl/queue.hh
Original file line number Diff line number Diff line change
Expand Up @@ -112,25 +112,19 @@ class queue final : public detail::reference_type<queue, detail::queue_state>,

template<typename KernelName = simsycl::detail::unnamed_kernel, typename KernelFunc>
event single_task(const KernelFunc &kernel_func) {
auto status = detail::event_state::submit_and_start();
detail::execute_single_task<KernelName>(kernel_func);
return status.end();
return submit([&](handler &cgh) { cgh.single_task<KernelName>(kernel_func); });
}

template<typename KernelName = simsycl::detail::unnamed_kernel, typename KernelType>
event single_task(event dep_event, const KernelType &kernel_func) {
(void)dep_event;
auto status = detail::event_state::submit_and_start();
detail::execute_single_task<KernelName>(kernel_func);
return status.end();
return submit([&](handler &cgh) { cgh.single_task<KernelName>(kernel_func); });
}

template<typename KernelName = simsycl::detail::unnamed_kernel, typename KernelType>
event single_task(const std::vector<event> &dep_events, const KernelType &kernel_func) {
(void)dep_events;
auto status = detail::event_state::submit_and_start();
detail::execute_single_task<KernelName>(kernel_func);
return status.end();
return submit([&](handler &cgh) { cgh.single_task<KernelName>(kernel_func); });
}

template<typename KernelName = simsycl::detail::unnamed_kernel, typename... Rest,
Expand Down Expand Up @@ -430,16 +424,13 @@ class queue final : public detail::reference_type<queue, detail::queue_state>,

template<typename KernelName, int Dims, typename... Rest, std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
event simple_parallel_for(range<Dims> num_work_items, Rest &&...rest) {
auto status = detail::event_state::submit_and_start();
simsycl::detail::parallel_for<KernelName>(num_work_items, std::forward<Rest>(rest)...);
return status.end();
return submit([&](handler &cgh) { cgh.parallel_for<KernelName>(num_work_items, std::forward<Rest>(rest)...); });
}

template<typename KernelName, int Dims, typename... Rest, std::enable_if_t<(sizeof...(Rest) > 0), int> = 0>
event parallel_for_nd_range(nd_range<Dims> execution_range, Rest &&...rest) {
auto status = detail::event_state::submit_and_start();
simsycl::detail::parallel_for<KernelName>(get_device(), execution_range, {}, std::forward<Rest>(rest)...);
return status.end();
return submit(
[&](handler &cgh) { cgh.parallel_for<KernelName>(execution_range, std::forward<Rest>(rest)...); });
}
};

Expand Down

0 comments on commit 4dd312c

Please sign in to comment.