From bfd4ad6adaa9ad2259ced629ace98d8986ff47a1 Mon Sep 17 00:00:00 2001 From: Przemek Malon Date: Thu, 9 Jan 2025 05:59:29 +0000 Subject: [PATCH] [SYCL][Bindless][E2E] Test for host USM backed images This patch adds a test for an image backed by a host USM allocation. --- sycl/cmake/modules/FetchUnifiedRuntime.cmake | 3 +- sycl/cmake/modules/UnifiedRuntimeTag.cmake | 23 ++- .../bindless_images/sampling_2D_USM_host.cpp | 147 ++++++++++++++++++ 3 files changed, 165 insertions(+), 8 deletions(-) create mode 100644 sycl/test-e2e/bindless_images/sampling_2D_USM_host.cpp diff --git a/sycl/cmake/modules/FetchUnifiedRuntime.cmake b/sycl/cmake/modules/FetchUnifiedRuntime.cmake index 72841724fa01d..92e9287d47bb5 100644 --- a/sycl/cmake/modules/FetchUnifiedRuntime.cmake +++ b/sycl/cmake/modules/FetchUnifiedRuntime.cmake @@ -116,7 +116,8 @@ if(SYCL_UR_USE_FETCH_CONTENT) CACHE PATH "Path to external '${name}' adapter source dir" FORCE) endfunction() - set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + # set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git") + set(UNIFIED_RUNTIME_REPO "https://github.com/przemektmalon/unified-runtime.git") include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake) set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES") diff --git a/sycl/cmake/modules/UnifiedRuntimeTag.cmake b/sycl/cmake/modules/UnifiedRuntimeTag.cmake index de4a5aa14251b..e566012fa5693 100644 --- a/sycl/cmake/modules/UnifiedRuntimeTag.cmake +++ b/sycl/cmake/modules/UnifiedRuntimeTag.cmake @@ -1,8 +1,17 @@ -# commit 7eae5c80a9e969bc12fda57c9cc0a0970f0cd17f -# Merge: 9c652ffb b78cfa71 -# Author: Ross Brunton -# Date: Thu Jan 9 17:28:00 2025 +0000 -# Merge pull request #2048 from RossBrunton/ross/refc +# commit 70053658b3f741f41cf9b62e063e8c4fe51957e0 +# Author: Przemek Malon +# Date: Wed Jan 8 19:53:17 2025 +0000 +# Enable creation of images backed by host USM # -# Use reference counting on factories -set(UNIFIED_RUNTIME_TAG 7eae5c80a9e969bc12fda57c9cc0a0970f0cd17f) +# Small patch to enable bindless images backed by host USM in the CUDA +# adapter. +# +# Host and Device USM pointers are usable across the host and device +# for all versions of CUDA that we support. There is no need to provide +# the `CU_MEMHOSTALLOC_DEVICEMAP` flag during allocation, or calling +# `cuMemHostGetDevicePointer` to retrieve a device usable address. +# +# Passing a `CU_MEMHOSTALLOC_WRITECOMBINED` flag to the host USM +# allocation will enhance performance in certain scenarios, however, an +# extension allowing this is not yet available. +set(UNIFIED_RUNTIME_TAG 70053658b3f741f41cf9b62e063e8c4fe51957e0) diff --git a/sycl/test-e2e/bindless_images/sampling_2D_USM_host.cpp b/sycl/test-e2e/bindless_images/sampling_2D_USM_host.cpp new file mode 100644 index 0000000000000..bca3d2c1c0ddd --- /dev/null +++ b/sycl/test-e2e/bindless_images/sampling_2D_USM_host.cpp @@ -0,0 +1,147 @@ +// REQUIRES: cuda +// REQUIRES: aspect-ext_oneapi_bindless_images_2d_usm + +// RUN: %{build} -o %t.out +// RUN: %{run-unfiltered-devices} %t.out + +#include +#include +#include + +#include +#include + +// Uncomment to print additional test information +// #define VERBOSE_PRINT + +class image_addition; + +int main() { + + sycl::device dev; + sycl::queue q(dev); + auto ctxt = q.get_context(); + + // declare image data + size_t width = 5; + size_t height = 6; + size_t N = width * height; + size_t widthInBytes = width * sizeof(float); + std::vector out(N); + std::vector expected(N); + std::vector dataIn(N); + + for (int i = 0; i < width; i++) { + for (int j = 0; j < height; j++) { + expected[i + (width * j)] = i + (width * j); + dataIn[i + (width * j)] = i + (width * j); + } + } + + try { + sycl::ext::oneapi::experimental::bindless_image_sampler samp( + sycl::addressing_mode::clamp, + sycl::coordinate_normalization_mode::normalized, + sycl::filtering_mode::linear); + + // Extension: image descriptor + sycl::ext::oneapi::experimental::image_descriptor desc( + {width, height}, 1, sycl::image_channel_type::fp32); + + auto devicePitchAlign = dev.get_info< + sycl::ext::oneapi::experimental::info::device::image_row_pitch_align>(); + auto deviceMaxPitch = + dev.get_info(); + + // Pitch requirements: + // - pitch % devicePitchAlign == 0 + // - pitch >= widthInBytes + // - pitch <= deviceMaxPitch + size_t pitch = devicePitchAlign * + std::ceil(float(widthInBytes) / float(devicePitchAlign)); + assert(pitch <= deviceMaxPitch); + + // Host USM allocation + auto imgMem = + sycl::aligned_alloc_host(devicePitchAlign, (pitch * height), ctxt); + + if (imgMem == nullptr) { + std::cerr << "Error allocating images!" << std::endl; + return 1; + } + + // Copy to host USM and incorporate pitch + for (size_t i = 0; i < height; i++) { + memcpy(static_cast(imgMem) + (i * pitch / sizeof(float)), + dataIn.data() + (i * width), widthInBytes); + } + + // Extension: create the image and return the handle + sycl::ext::oneapi::experimental::sampled_image_handle imgHandle = + sycl::ext::oneapi::experimental::create_image(imgMem, pitch, samp, desc, + dev, ctxt); + + sycl::buffer buf((float *)out.data(), + sycl::range<2>{height, width}); + q.submit([&](sycl::handler &cgh) { + auto outAcc = buf.get_access( + cgh, sycl::range<2>{height, width}); + + cgh.parallel_for( + sycl::nd_range<2>{{width, height}, {width, height}}, + [=](sycl::nd_item<2> it) { + size_t dim0 = it.get_local_id(0); + size_t dim1 = it.get_local_id(1); + + // Normalize coordinates -- +0.5 to look towards centre of pixel + float fdim0 = float(dim0 + 0.5f) / (float)width; + float fdim1 = float(dim1 + 0.5f) / (float)height; + + // Extension: sample image data from handle + float px = sycl::ext::oneapi::experimental::sample_image( + imgHandle, sycl::float2(fdim0, fdim1)); + + outAcc[sycl::id<2>{dim1, dim0}] = px; + }); + }); + + q.wait_and_throw(); + + // Extension: cleanup + sycl::ext::oneapi::experimental::destroy_image_handle(imgHandle, dev, ctxt); + sycl::free(imgMem, ctxt); + } catch (sycl::exception e) { + std::cerr << "SYCL exception caught! : " << e.what() << "\n"; + return 1; + } catch (...) { + std::cerr << "Unknown exception caught!\n"; + return 2; + } + + // collect and validate output + bool validated = true; + for (int i = 0; i < N; i++) { + bool mismatch = false; + if (out[i] != expected[i]) { + mismatch = true; + validated = false; + } + + if (mismatch) { +#ifdef VERBOSE_PRINT + std::cout << "Result mismatch! Expected: " << expected[i] + << ", Actual: " << out[i] << std::endl; +#else + break; +#endif + } + } + if (validated) { + std::cout << "Test passed!" << std::endl; + return 0; + } + + std::cout << "Test failed!" << std::endl; + return 3; +}