diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index e616dafc2e3cd..6e7a54679f75b 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1566,12 +1566,12 @@ def SYCLType: InheritableAttr { let Subjects = SubjectList<[CXXRecord, Enum], ErrorDiag>; let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost]; let Args = [EnumArgument<"Type", "SYCLType", /*is_string=*/true, - ["accessor", "local_accessor", + ["accessor", "local_accessor", "work_group_memory", "specialization_id", "kernel_handler", "buffer_location", "no_alias", "accessor_property_list", "group", "private_memory", "aspect", "annotated_ptr", "annotated_arg", "stream", "sampler", "host_pipe", "multi_ptr"], - ["accessor", "local_accessor", + ["accessor", "local_accessor", "work_group_memory", "specialization_id", "kernel_handler", "buffer_location", "no_alias", "accessor_property_list", "group", "private_memory", "aspect", "annotated_ptr", "annotated_arg", diff --git a/clang/include/clang/Sema/SemaSYCL.h b/clang/include/clang/Sema/SemaSYCL.h index 451d24f087c44..a79d9f55d7736 100644 --- a/clang/include/clang/Sema/SemaSYCL.h +++ b/clang/include/clang/Sema/SemaSYCL.h @@ -62,7 +62,8 @@ class SYCLIntegrationHeader { kind_pointer, kind_specialization_constants_buffer, kind_stream, - kind_last = kind_stream + kind_work_group_memory, + kind_last = kind_work_group_memory }; public: diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 9150ece385879..d8925669b1ae3 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -4693,6 +4693,9 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { CurOffset + offsetOf(FD, FieldTy)); } else if (SemaSYCL::isSyclType(FieldTy, SYCLTypeAttr::stream)) { addParam(FD, FieldTy, SYCLIntegrationHeader::kind_stream); + } else if (SemaSYCL::isSyclType(FieldTy, SYCLTypeAttr::work_group_memory)) { + addParam(FieldTy, SYCLIntegrationHeader::kind_work_group_memory, + offsetOf(FD, FieldTy)); } else if (SemaSYCL::isSyclType(FieldTy, SYCLTypeAttr::sampler) || SemaSYCL::isSyclType(FieldTy, SYCLTypeAttr::annotated_ptr) || SemaSYCL::isSyclType(FieldTy, SYCLTypeAttr::annotated_arg)) { @@ -5773,6 +5776,7 @@ static const char *paramKind2Str(KernelParamKind K) { CASE(stream); CASE(specialization_constants_buffer); CASE(pointer); + CASE(work_group_memory); } return ""; diff --git a/clang/test/CodeGenSYCL/Inputs/sycl.hpp b/clang/test/CodeGenSYCL/Inputs/sycl.hpp index 63b304535302f..9fbf260b0c350 100644 --- a/clang/test/CodeGenSYCL/Inputs/sycl.hpp +++ b/clang/test/CodeGenSYCL/Inputs/sycl.hpp @@ -649,6 +649,24 @@ const stream& operator<<(const stream &S, T&&) { return S; } +// Dummy implementation of work_group_memory for use in CodeGenSYCL tests. +template +class __attribute__((sycl_special_class)) +__SYCL_TYPE(work_group_memory) work_group_memory { +public: + work_group_memory(handler &CGH) {} +#ifdef __SYCL_DEVICE_ONLY__ + // Default constructor for objects later initialized with __init member. + work_group_memory() = default; +#endif + + void __init(__attribute((opencl_local)) DataT *Ptr) { this->Ptr = Ptr; } + __attribute((opencl_local)) DataT *operator&() const { return Ptr; } + +private: + __attribute((opencl_local)) DataT *Ptr; +}; + template class buffer { diff --git a/clang/test/CodeGenSYCL/work_group_memory.cpp b/clang/test/CodeGenSYCL/work_group_memory.cpp new file mode 100644 index 0000000000000..a4ebb72ad862d --- /dev/null +++ b/clang/test/CodeGenSYCL/work_group_memory.cpp @@ -0,0 +1,37 @@ +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -disable-llvm-passes -emit-llvm %s -o %t.ll +// RUN: FileCheck < %t.ll %s --check-prefix CHECK-IR +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsycl-int-header=%t.h %s +// RUN: FileCheck < %t.h %s --check-prefix CHECK-INT-HEADER +// +// Tests for work_group_memory kernel parameter using the dummy implementation in Inputs/sycl.hpp. +// The first two RUN commands verify that the init call is generated with the correct arguments in LLVM IR +// and the second two RUN commands verify the contents of the integration header produced by the frontend. +// +// CHECK-IR: define dso_local spir_kernel void @ +// CHECK-IR-SAME: ptr addrspace(3) noundef align 4 [[PTR:%[a-zA-Z0-9_]+]] +// +// CHECK-IR: [[PTR]].addr = alloca ptr addrspace(3), align 8 +// CHECK-IR: [[PTR]].addr.ascast = addrspacecast ptr [[PTR]].addr to ptr addrspace(4) +// CHECK-IR: store ptr addrspace(3) [[PTR]], ptr addrspace(4) [[PTR]].addr.ascast, align 8 +// CHECK-IR: [[PTR_LOAD:%[a-zA-Z0-9_]+]] = load ptr addrspace(3), ptr addrspace(4) [[PTR]].addr.ascast, align 8 +// +// CHECK-IR: call spir_func void @{{.*}}__init{{.*}}(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %{{[a-zA-Z0-9_]+}}, ptr addrspace(3) noundef [[PTR_LOAD]]) +// +// CHECK-INT-HEADER: const kernel_param_desc_t kernel_signatures[] = { +// CHECK-INT-HEADER-NEXT: //--- _ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_4itemILi1EEEE_ +// CHECK-INT-HEADER-NEXT: { kernel_param_kind_t::kind_work_group_memory, {{[4,8]}}, 0 }, +// CHECK-INT-HEADER-EMPTY: +// CHECK-INT-HEADER-NEXT: { kernel_param_kind_t::kind_invalid, -987654321, -987654321 }, +// CHECK-INT-HEADER-NEXT: }; + +#include "Inputs/sycl.hpp" + +int main() { + sycl::queue Q; + Q.submit([&](sycl::handler &CGH) { + sycl::work_group_memory mem; + sycl::range<1> ndr; + CGH.parallel_for(ndr, [=](sycl::item<1> it) { int *ptr = &mem; }); + }); + return 0; +} diff --git a/clang/test/SemaSYCL/Inputs/sycl/detail/kernel_desc.hpp b/clang/test/SemaSYCL/Inputs/sycl/detail/kernel_desc.hpp index 6ef9122d45209..820013ed0eff7 100644 --- a/clang/test/SemaSYCL/Inputs/sycl/detail/kernel_desc.hpp +++ b/clang/test/SemaSYCL/Inputs/sycl/detail/kernel_desc.hpp @@ -18,6 +18,7 @@ namespace detail { kind_pointer = 3, kind_specialization_constants_buffer = 4, kind_stream = 5, + kind_work_group_memory = 6, kind_invalid = 0xf, // not a valid kernel kind }; diff --git a/sycl-jit/common/include/Kernel.h b/sycl-jit/common/include/Kernel.h index 4b4bb35b9bbf9..2959b2e1b9871 100644 --- a/sycl-jit/common/include/Kernel.h +++ b/sycl-jit/common/include/Kernel.h @@ -58,6 +58,7 @@ enum class ParameterKind : uint32_t { Pointer = 3, SpecConstBuffer = 4, Stream = 5, + WorkGroupMemory = 6, Invalid = 0xF, }; diff --git a/sycl/include/sycl/detail/kernel_desc.hpp b/sycl/include/sycl/detail/kernel_desc.hpp index 1049c4d78aadd..d4fd31fc889f5 100644 --- a/sycl/include/sycl/detail/kernel_desc.hpp +++ b/sycl/include/sycl/detail/kernel_desc.hpp @@ -58,6 +58,7 @@ enum class kernel_param_kind_t { kind_pointer = 3, kind_specialization_constants_buffer = 4, kind_stream = 5, + kind_work_group_memory = 6, kind_invalid = 0xf, // not a valid kernel kind }; diff --git a/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp b/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp new file mode 100644 index 0000000000000..7870ebd3ca73e --- /dev/null +++ b/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp @@ -0,0 +1,84 @@ +//===-------------------- work_group_memory.hpp ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include + +namespace sycl { +inline namespace _V1 { +namespace detail { +template struct is_unbounded_array : std::false_type {}; + +template struct is_unbounded_array : std::true_type {}; + +template +inline constexpr bool is_unbounded_array_v = is_unbounded_array::value; + +class work_group_memory_impl { +public: + work_group_memory_impl() : buffer_size{0} {} + work_group_memory_impl(const work_group_memory_impl &rhs) = default; + work_group_memory_impl & + operator=(const work_group_memory_impl &rhs) = default; + work_group_memory_impl(size_t buffer_size) : buffer_size{buffer_size} {} + +private: + size_t buffer_size; + friend class sycl::handler; +}; + +} // namespace detail +namespace ext::oneapi::experimental { + +template +class __SYCL_SPECIAL_CLASS __SYCL_TYPE(work_group_memory) work_group_memory + : sycl::detail::work_group_memory_impl { +public: + using value_type = std::remove_all_extents_t; + +private: + using decoratedPtr = typename sycl::detail::DecoratedType< + value_type, access::address_space::local_space>::type *; + +public: + work_group_memory() = default; + work_group_memory(const work_group_memory &rhs) = default; + work_group_memory &operator=(const work_group_memory &rhs) = default; + template >> + work_group_memory(handler &) + : sycl::detail::work_group_memory_impl(sizeof(DataT)) {} + template >> + work_group_memory(size_t num, handler &) + : sycl::detail::work_group_memory_impl( + num * sizeof(std::remove_extent_t)) {} + template + multi_ptr + get_multi_ptr() const { + return sycl::address_space_cast(ptr); + } + DataT *operator&() const { return reinterpret_cast(ptr); } + operator DataT &() const { return *reinterpret_cast(ptr); } + template >> + const work_group_memory &operator=(const DataT &value) const { + *ptr = value; + return *this; + } +#ifdef __SYCL_DEVICE_ONLY__ + void __init(decoratedPtr ptr) { this->ptr = ptr; } +#endif +private: + decoratedPtr ptr; +}; +} // namespace ext::oneapi::experimental +} // namespace _V1 +} // namespace sycl diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index cda66914dcb06..32084bd8ddc2e 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -163,6 +163,8 @@ class pipe; } namespace ext ::oneapi ::experimental { +template +class work_group_memory; struct image_descriptor; } // namespace ext::oneapi::experimental @@ -171,6 +173,7 @@ class graph_impl; } // namespace ext::oneapi::experimental::detail namespace detail { +class work_group_memory_impl; class handler_impl; class kernel_impl; class queue_impl; @@ -564,8 +567,8 @@ class __SYCL_EXPORT handler { // The version for regular(standard layout) argument. template void setArgsHelper(int ArgIndex, T &&Arg, Ts &&...Args) { - set_arg(ArgIndex, std::move(Arg)); - setArgsHelper(++ArgIndex, std::move(Args)...); + set_arg(ArgIndex, std::forward(Arg)); + setArgsHelper(++ArgIndex, std::forward(Args)...); } void setArgsHelper(int) {} @@ -603,6 +606,8 @@ class __SYCL_EXPORT handler { #endif } + void setArgHelper(int ArgIndex, detail::work_group_memory_impl &Arg); + // setArgHelper for non local accessor argument. template @@ -1096,7 +1101,7 @@ class __SYCL_EXPORT handler { KernelType KernelFunc) { #ifndef __SYCL_DEVICE_ONLY__ throwIfActionIsCreated(); - throwOnLocalAccessorMisuse(); + throwOnKernelParameterMisuse(); if (!range_size_fits_in_size_t(UserRange)) throw sycl::exception(make_error_code(errc::runtime), "The total number of work-items in " @@ -1641,7 +1646,7 @@ class __SYCL_EXPORT handler { kernel_single_task_wrapper(KernelFunc); #ifndef __SYCL_DEVICE_ONLY__ throwIfActionIsCreated(); - throwOnLocalAccessorMisuse(); + throwOnKernelParameterMisuse(); verifyUsedKernelBundleInternal( detail::string_view{detail::getKernelName()}); // No need to check if range is out of INT_MAX limits as it's compile-time @@ -1840,6 +1845,14 @@ class __SYCL_EXPORT handler { setArgHelper(ArgIndex, std::move(Arg)); } + template + void set_arg( + int ArgIndex, + ext::oneapi::experimental::work_group_memory &Arg) { + setArgHelper(ArgIndex, Arg); + } + // set_arg for graph dynamic_parameters template void set_arg(int argIndex, @@ -1858,9 +1871,8 @@ class __SYCL_EXPORT handler { /// /// \param Args are argument values to be set. template void set_args(Ts &&...Args) { - setArgsHelper(0, std::move(Args)...); + setArgsHelper(0, std::forward(Args)...); } - /// Defines and invokes a SYCL kernel function as a function object type. /// /// If it is a named function object and the function object type is @@ -3233,7 +3245,6 @@ class __SYCL_EXPORT handler { private: std::shared_ptr impl; std::shared_ptr MQueue; - std::vector MLocalAccStorage; std::vector> MStreamStorage; detail::string MKernelName; @@ -3554,7 +3565,7 @@ class __SYCL_EXPORT handler { /// must not be used in a SYCL kernel function that is invoked via single_task /// or via the simple form of parallel_for that takes a range parameter. template - void throwOnLocalAccessorMisuse() const { + void throwOnKernelParameterMisuse() const { using NameT = typename detail::get_kernel_name_t::name; for (unsigned I = 0; I < detail::getKernelNumParams(); ++I) { @@ -3570,6 +3581,12 @@ class __SYCL_EXPORT handler { "A local accessor must not be used in a SYCL kernel function " "that is invoked via single_task or via the simple form of " "parallel_for that takes a range parameter."); + if (Kind == detail::kernel_param_kind_t::kind_work_group_memory) + throw sycl::exception( + make_error_code(errc::kernel_argument), + "A work group memory object must not be used in a SYCL kernel " + "function that is invoked via single_task or via the simple form " + "of parallel_for that takes a range parameter."); } } diff --git a/sycl/include/sycl/sycl.hpp b/sycl/include/sycl/sycl.hpp index 868fec035eed7..8c3c5c388cf4c 100644 --- a/sycl/include/sycl/sycl.hpp +++ b/sycl/include/sycl/sycl.hpp @@ -100,6 +100,7 @@ #include #include #include +#include #include #include #include diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index b34c22ad2777e..2c2a4963bed98 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -197,6 +197,9 @@ class handler_impl { /// True if MCodeLoc is sycl entry point code location bool MIsTopCodeLoc = true; + + /// List of work group memory objects associated with this handler + std::vector> MWorkGroupMemoryObjects; }; } // namespace detail diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index c48ba4c15b802..f4332143aa617 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -133,6 +133,8 @@ translateArgType(kernel_param_kind_t Kind) { return PK::SpecConstBuffer; case kind::kind_stream: return PK::Stream; + case kind::kind_work_group_memory: + return PK::WorkGroupMemory; case kind::kind_invalid: return PK::Invalid; } diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 6556b8008b30a..946d2917895ed 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2297,6 +2297,8 @@ void SetArgBasedOnType( const std::function &getMemAllocationFunc, const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex) { switch (Arg.MType) { + case kernel_param_kind_t::kind_work_group_memory: + break; case kernel_param_kind_t::kind_stream: break; case kernel_param_kind_t::kind_accessor: { diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 12f38de0c1c8e..61ea7ee7be0a0 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -34,6 +34,7 @@ #include #include +#include #include namespace sycl { @@ -795,6 +796,12 @@ void handler::processArg(void *Ptr, const detail::kernel_param_kind_t &Kind, } break; } + case kernel_param_kind_t::kind_work_group_memory: { + addArg(kernel_param_kind_t::kind_std_layout, nullptr, + static_cast(Ptr)->buffer_size, + Index + IndexShift); + break; + } case kernel_param_kind_t::kind_sampler: { addArg(kernel_param_kind_t::kind_sampler, Ptr, sizeof(sampler), Index + IndexShift); @@ -812,6 +819,13 @@ void handler::processArg(void *Ptr, const detail::kernel_param_kind_t &Kind, } } +void handler::setArgHelper(int ArgIndex, detail::work_group_memory_impl &Arg) { + impl->MWorkGroupMemoryObjects.push_back( + std::make_shared(Arg)); + addArg(detail::kernel_param_kind_t::kind_work_group_memory, + impl->MWorkGroupMemoryObjects.back().get(), 0, ArgIndex); +} + // The argument can take up more space to store additional information about // MAccessRange, MMemoryRange, and MOffset added with addArgsForGlobalAccessor. // We use the worst-case estimate because the lifetime of the vector is short. diff --git a/sycl/test-e2e/WorkGroupMemory/invalid_usage.cpp b/sycl/test-e2e/WorkGroupMemory/invalid_usage.cpp new file mode 100644 index 0000000000000..7501fcd7591d5 --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/invalid_usage.cpp @@ -0,0 +1,39 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +#include +#include +#include +using namespace sycl::ext::oneapi::experimental; + +// As per the spec, a work_group_memory object cannot be used in a single task +// kernel or in a sycl::range kernel. An exception with error code +// errc::kernel_argument must be thrown in such cases. This test verifies this. + +int main() { + + sycl::queue q; + try { + q.submit([&](sycl::handler &cgh) { + work_group_memory mem{cgh}; + cgh.single_task([=]() { mem = 42; }); + }); + assert(false && "Work group memory was used in a single_task kernel and an " + "exception was not seen"); // Fail, exception was not seen + } catch (sycl::exception &e) { + // Exception seen but must verify that the error code is correct + assert(e.code() == sycl::errc::kernel_argument); + } + // Same thing but with a range kernel + try { + q.submit([&](sycl::handler &cgh) { + work_group_memory mem{cgh}; + cgh.parallel_for(sycl::range{1}, [=](sycl::id<> it) { mem = 42; }); + }); + assert(false && "Work group memory was used in a range kernel and an " + "exception was not seen"); // Fail, exception was not seen + } catch (sycl::exception &e) { + // Exception seen but must verify that the error code is correct + assert(e.code() == sycl::errc::kernel_argument); + } + return 0; +} diff --git a/sycl/test-e2e/WorkGroupMemory/swap_test.cpp b/sycl/test-e2e/WorkGroupMemory/swap_test.cpp new file mode 100644 index 0000000000000..13fbde212a47d --- /dev/null +++ b/sycl/test-e2e/WorkGroupMemory/swap_test.cpp @@ -0,0 +1,382 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +#include +#include +#include +#include +#include +namespace syclexp = sycl::ext::oneapi::experimental; + +sycl::queue q; + +// This test performs a swap of two scalars/arrays inside a kernel using a +// work_group_memory object as a temporary buffer. The test is done for scalar +// types and bounded arrays. After the kernel finishes, it is verified on the +// host side that the swap worked. + +template void swap_scalar(T &a, T &b) { + const T old_a = a; + const T old_b = b; + const size_t size = 1; + const size_t wgsize = 1; + { + sycl::buffer buf_a{&a, 1}; + sycl::buffer buf_b{&b, 1}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_a{buf_a, cgh}; + sycl::accessor acc_b{buf_b, cgh}; + syclexp::work_group_memory temp{cgh}; + sycl::nd_range<1> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<1> it) { + temp = acc_a[0]; + acc_a[0] = acc_b[0]; + acc_b[0] = temp; + }); + }); + } + assert(a == old_b && b == old_a && "Incorrect swap!"); + + // swap again but this time using two temporaries. The first temporary will be + // used to save the value of a and the second temporay will be + // default-constructed and then copy-assigned from the first temporary to be + // then used to write that value to b. + { + sycl::buffer buf_a{&a, 1}; + sycl::buffer buf_b{&b, 1}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_a{buf_a, cgh}; + sycl::accessor acc_b{buf_b, cgh}; + syclexp::work_group_memory temp{cgh}; + sycl::nd_range<1> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<1> it) { + syclexp::work_group_memory temp2; + temp2 = temp; // temp and temp2 have the same underlying data + temp = acc_a[0]; + acc_a[0] = acc_b[0]; + acc_b[0] = temp2; // safe to use temp2 + }); + }); + } + // Two swaps same as no swaps + assert(a == old_a && b == old_b && "Incorrect swap!"); + + // Initialize a second temporary and instead of assigning the first temporary + // to it, assign only the value of the data of the first temporary so that + // unlike above, the two temporaries will not be aliasing the same memory + // location but they will have equal values. + { + sycl::buffer buf_a{&a, 1}; + sycl::buffer buf_b{&b, 1}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_a{buf_a, cgh}; + sycl::accessor acc_b{buf_b, cgh}; + syclexp::work_group_memory temp{cgh}; + syclexp::work_group_memory temp2{cgh}; + sycl::nd_range<1> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<> it) { + temp = acc_a[0]; + acc_a[0] = acc_b[0]; + temp2 = *(temp.get_multi_ptr()); // temp2 now has the same value as temp + // but not the same memory location + acc_b[0] = temp2; + }); + }); + } + // Three swaps same as one swap + assert(a == old_b && b == old_a && "Incorrect swap!"); + + // Same as above but instead of using multi_ptr, use address-of operator. + { + sycl::buffer buf_a{&a, 1}; + sycl::buffer buf_b{&b, 1}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_a{buf_a, cgh}; + sycl::accessor acc_b{buf_b, cgh}; + syclexp::work_group_memory temp{cgh}; + syclexp::work_group_memory temp2{cgh}; + sycl::nd_range<1> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<> it) { + temp = acc_a[0]; + acc_a[0] = acc_b[0]; + temp2 = *(&temp); + acc_b[0] = temp2; + }); + }); + } + // Four swaps same as no swap + assert(a == old_a && b == old_b && "Incorrect swap!"); +} + +// Swap two 1d arrays in batches of size batch_size where each batch will be +// swapped by items in the same work group. +template +void swap_array_1d(T (&a)[N], T (&b)[N], size_t batch_size) { + sycl::queue q; + T old_a[N]; + std::memcpy(old_a, a, sizeof(a)); + T old_b[N]; + std::memcpy(old_b, b, sizeof(b)); + const size_t size = N; + const size_t wgsize = batch_size; + { + sycl::buffer buf_a{a, N}; + sycl::buffer buf_b{b, N}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_a{buf_a, cgh}; + sycl::accessor acc_b{buf_b, cgh}; + syclexp::work_group_memory temp{cgh}; + sycl::nd_range<1> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<> it) { + const auto i = it.get_global_id(); + temp[i] = acc_a[i]; + acc_a[i] = acc_b[i]; + acc_b[i] = temp[i]; + }); + }); + } + for (int i = 0; i < N; ++i) { + assert(a[i] == old_b[i] && b[i] == old_a[i] && "Incorrect swap!"); + } + + // Instead of working with the temporary work group memory object, we retrieve + // its corresponding multi-pointer and work with it instead. + { + sycl::buffer buf_a{a, N}; + sycl::buffer buf_b{b, N}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_a{buf_a, cgh}; + sycl::accessor acc_b{buf_b, cgh}; + syclexp::work_group_memory temp{cgh}; + sycl::nd_range<1> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<> it) { + auto ptr = temp.get_multi_ptr(); + const auto i = it.get_global_id(); + ptr[i] = acc_a[i]; + acc_a[i] = acc_b[i]; + acc_b[i] = ptr[i]; + }); + }); + } + // Two swaps same as no swap + for (int i = 0; i < N; ++i) { + assert(a[i] == old_a[i] && b[i] == old_b[i] && "Incorrect swap!"); + } + + // Same as above but use a pointer returned by the address-of operator + // instead. + { + sycl::buffer buf_a{a, N}; + sycl::buffer buf_b{b, N}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_a{buf_a, cgh}; + sycl::accessor acc_b{buf_b, cgh}; + syclexp::work_group_memory temp{cgh}; + sycl::nd_range<1> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<> it) { + const auto i = it.get_global_id(); + auto ptr = &temp; + (*ptr)[i] = acc_a[i]; + acc_a[i] = acc_b[i]; + acc_b[i] = (*ptr)[i]; + }); + }); + } + // Three swaps same as one swap + for (int i = 0; i < N; ++i) { + assert(a[i] == old_b[i] && b[i] == old_a[i] && "Incorrect swap!"); + } + + // Same as above but use an unbounded array as temporary storage + { + sycl::buffer buf_a{a, N}; + sycl::buffer buf_b{b, N}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_a{buf_a, cgh}; + sycl::accessor acc_b{buf_b, cgh}; + syclexp::work_group_memory temp{N, cgh}; + sycl::nd_range<1> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<> it) { + const auto i = it.get_global_id(); + auto ptr = &temp; + (*ptr)[i] = acc_a[i]; + acc_a[i] = acc_b[i]; + acc_b[i] = (*ptr)[i]; + }); + }); + } + // Four swaps same as no swap + for (int i = 0; i < N; ++i) { + assert(a[i] == old_a[i] && b[i] == old_b[i] && "Incorrect swap!"); + } +} + +template +void swap_array_2d(T (&a)[N][N], T (&b)[N][N], size_t batch_size) { + sycl::queue q; + T old_a[N][N]; + for (int i = 0; i < N; ++i) { + std::memcpy(old_a[i], a[i], sizeof(a[0])); + } + T old_b[N][N]; + for (int i = 0; i < N; ++i) { + + std::memcpy(old_b[i], b[i], sizeof(b[0])); + } + const auto size = sycl::range{N, N}; + const auto wgsize = sycl::range{batch_size, batch_size}; + { + sycl::buffer buf_a{a[0], sycl::range{N, N}}; + sycl::buffer buf_b{b[0], sycl::range{N, N}}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_a{buf_a, cgh}; + sycl::accessor acc_b{buf_b, cgh}; + syclexp::work_group_memory temp{cgh}; + sycl::nd_range<2> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<2> it) { + const auto i = it.get_global_id()[0]; + const auto j = it.get_global_id()[1]; + temp[i][j] = acc_a[i][j]; + acc_a[i][j] = acc_b[i][j]; + acc_b[i][j] = temp[i][j]; + }); + }); + } + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + assert(a[i][j] == old_b[i][j] && b[i][j] == old_a[i][j] && + "Incorrect swap!"); + } + } + + // Perform the swap but this time use two temporary work group memory objects. + // One will save the value of acc_a and the other will be copy-assigned from + // it and will be used to write the values back to acc_b. + { + sycl::buffer buf_a{a[0], sycl::range{N, N}}; + sycl::buffer buf_b{b[0], sycl::range{N, N}}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_a{buf_a, cgh}; + sycl::accessor acc_b{buf_b, cgh}; + syclexp::work_group_memory temp{cgh}; + sycl::nd_range<2> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<2> it) { + const auto i = it.get_global_id()[0]; + const auto j = it.get_global_id()[1]; + temp[i][j] = acc_a[i][j]; + acc_a[i][j] = acc_b[i][j]; + syclexp::work_group_memory temp2; + temp2 = temp; + acc_b[i][j] = temp2[i][j]; + }); + }); + } + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + // Two swaps are the same as no swap + assert(a[i][j] == old_a[i][j] && b[i][j] == old_b[i][j] && + "Incorrect swap!"); + } + } + + // Same as above but construct the second temporary inside the kernel and + // copy-construct it from the first temporary. + { + sycl::buffer buf_a{a[0], sycl::range{N, N}}; + sycl::buffer buf_b{b[0], sycl::range{N, N}}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_a{buf_a, cgh}; + sycl::accessor acc_b{buf_b, cgh}; + syclexp::work_group_memory temp{cgh}; + sycl::nd_range<2> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<2> it) { + const auto i = it.get_global_id()[0]; + const auto j = it.get_global_id()[1]; + temp[i][j] = acc_a[i][j]; + acc_a[i][j] = acc_b[i][j]; + syclexp::work_group_memory temp2{temp}; + acc_b[i][j] = temp2[i][j]; + }); + }); + } + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + // Three swaps are the same as one swap + assert(a[i][j] == old_b[i][j] && b[i][j] == old_a[i][j] && + "Incorrect swap!"); + } + } + + // Same as above but use an unbounded array as temporary storage + { + sycl::buffer buf_a{a[0], sycl::range{N, N}}; + sycl::buffer buf_b{b[0], sycl::range{N, N}}; + q.submit([&](sycl::handler &cgh) { + sycl::accessor acc_a{buf_a, cgh}; + sycl::accessor acc_b{buf_b, cgh}; + syclexp::work_group_memory temp{N, cgh}; + sycl::nd_range<2> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<2> it) { + const auto i = it.get_global_id()[0]; + const auto j = it.get_global_id()[1]; + temp[i][j] = acc_a[i][j]; + acc_a[i][j] = acc_b[i][j]; + syclexp::work_group_memory temp2{temp}; + acc_b[i][j] = temp2[i][j]; + }); + }); + } + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + // Four swaps are the same as no swap + assert(a[i][j] == old_a[i][j] && b[i][j] == old_b[i][j] && + "Incorrect swap!"); + } + } +} + +// Coherency test that checks that work group memory is truly shared by +// work-items in a work group. The test allocates an integer in +// work group memory and each leader of the work groups will assign +// its work group index to this integer. The computation that the +// leader does yields the same value for all work-items in the work-group +// so we can verify that each work-item sees the value written by its leader. +// The test also is a sanity check that different work groups get different +// work group memory locations as otherwise we'd have data races. +void coherency(size_t size, size_t wgsize) { + q.submit([&](sycl::handler &cgh) { + syclexp::work_group_memory data{cgh}; + sycl::nd_range<1> ndr{size, wgsize}; + cgh.parallel_for(ndr, [=](sycl::nd_item<1> it) { + if (it.get_group().leader()) { + data = it.get_global_id() / wgsize; + } + sycl::group_barrier(it.get_group()); + assert(data == it.get_global_id() / wgsize); + }); + }); +} + +constexpr size_t N = 32; +int main() { + int intarr1[N][N]; + int intarr2[N][N]; + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + intarr1[i][j] = i + j; + intarr2[i][j] = i * j; + } + } + for (int i = 0; i < N; ++i) { + for (int j = 0; j < N; ++j) { + swap_scalar(intarr1[i][j], intarr2[i][j]); + } + swap_array_1d(intarr1[i], intarr2[i], 8); + } + swap_array_2d(intarr1, intarr2, 8); + coherency(N, N / 2); + coherency(N, N / 4); + coherency(N, N / 8); + coherency(N, N / 16); + coherency(N, N / 32); + return 0; +} diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 2d1dcce2b68e2..11d85801727c7 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3478,6 +3478,7 @@ _ZN4sycl3_V17handler11saveCodeLocENS0_6detail13code_locationE _ZN4sycl3_V17handler11saveCodeLocENS0_6detail13code_locationEb _ZN4sycl3_V17handler11storeRawArgEPKvm _ZN4sycl3_V17handler12addReductionERKSt10shared_ptrIKvE +_ZN4sycl3_V17handler12setArgHelperEiRNS0_6detail22work_group_memory_implE _ZN4sycl3_V17handler13getKernelNameEv _ZN4sycl3_V17handler14addAccessorReqESt10shared_ptrINS0_6detail16AccessorImplHostEE _ZN4sycl3_V17handler14setNDRangeUsedEb diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 33092af8477f5..fa1f641ccd45a 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -3683,6 +3683,7 @@ ?addLifetimeSharedPtrStorage@handler@_V1@sycl@@AEAAXV?$shared_ptr@$$CBX@std@@@Z ?addOrReplaceAccessorProperties@buffer_plain@detail@_V1@sycl@@IEAAXAEBVproperty_list@34@@Z ?addReduction@handler@_V1@sycl@@AEAAXAEBV?$shared_ptr@$$CBX@std@@@Z +?setArgHelper@handler@_V1@sycl@@AEAAXHAEAVwork_group_memory_impl@detail@23@@Z ?addStream@handler@_V1@sycl@@AEAAXAEBV?$shared_ptr@Vstream_impl@detail@_V1@sycl@@@std@@@Z ?alignedAlloc@OSUtil@detail@_V1@sycl@@SAPEAX_K0@Z ?alignedFree@OSUtil@detail@_V1@sycl@@SAXPEAX@Z diff --git a/sycl/unittests/Extensions/CMakeLists.txt b/sycl/unittests/Extensions/CMakeLists.txt index 44a59f9a5e136..ed29406511fbd 100644 --- a/sycl/unittests/Extensions/CMakeLists.txt +++ b/sycl/unittests/Extensions/CMakeLists.txt @@ -15,6 +15,7 @@ add_sycl_unittest(ExtensionsTests OBJECT ProfilingTag.cpp KernelProperties.cpp NoDeviceIPVersion.cpp + WorkGroupMemoryBackendArgument.cpp GetLastEvent.cpp ) diff --git a/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp b/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp new file mode 100644 index 0000000000000..7ddb1ee28310b --- /dev/null +++ b/sycl/unittests/Extensions/WorkGroupMemoryBackendArgument.cpp @@ -0,0 +1,66 @@ +//==--------------------- WorkGroupMemoryBackendArgument.cpp --------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +// Check that the work group memory object is mapped to exactly one backend +// kernel argument. +class WorkGroupMemoryKernel; +namespace syclext = sycl::ext::oneapi::experimental; +using arg_type = syclext::work_group_memory; + +namespace sycl { +inline namespace _V1 { +namespace detail { +template <> struct KernelInfo { + static constexpr unsigned getNumParams() { return 1; } + static const detail::kernel_param_desc_t &getParamDesc(int) { + static detail::kernel_param_desc_t WorkGroupMemory = { + detail::kernel_param_kind_t::kind_work_group_memory, 0, 0}; + return WorkGroupMemory; + } + static constexpr bool isESIMD() { return false; } + static constexpr bool callsThisItem() { return false; } + static constexpr bool callsAnyThisFreeFunction() { return false; } + static constexpr int64_t getKernelSize() { return sizeof(arg_type); } + static constexpr const char *getName() { return "WorkGroupMemoryKernel"; } +}; + +} // namespace detail +} // namespace _V1 +} // namespace sycl + +static sycl::unittest::UrImage Img = + sycl::unittest::generateDefaultImage({"WorkGroupMemoryKernel"}); +static sycl::unittest::UrImageArray<1> ImgArray{&Img}; + +static int urKernelSetArgLocalCalls = 0; +inline ur_result_t redefined_urKernelSetArgLocal(void *) { + ++urKernelSetArgLocalCalls; + return UR_RESULT_SUCCESS; +} + +TEST(URArgumentTest, URArgumentTest) { + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_replace_callback("urKernelSetArgLocal", + &redefined_urKernelSetArgLocal); + sycl::platform Platform = sycl::platform(); + const sycl::device dev = Platform.get_devices()[0]; + sycl::queue q{dev}; + syclext::submit(q, [&](sycl::handler &cgh) { + arg_type data{cgh}; + const auto kernel = [=](sycl::nd_item<1> it) { data = 42; }; + syclext::nd_launch(cgh, sycl::nd_range<1>{1, 1}, + kernel); + }); + q.wait(); + ASSERT_EQ(urKernelSetArgLocalCalls, 1); +}