Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL][Bindless][E2E] Test for host USM backed images #16607

Merged
3 changes: 2 additions & 1 deletion sycl/cmake/modules/FetchUnifiedRuntime.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ if(SYCL_UR_USE_FETCH_CONTENT)
CACHE PATH "Path to external '${name}' adapter source dir" FORCE)
endfunction()

set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
# set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
set(UNIFIED_RUNTIME_REPO "https://github.com/przemektmalon/unified-runtime.git")
kbenzie marked this conversation as resolved.
Show resolved Hide resolved
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/modules/UnifiedRuntimeTag.cmake)

set(UMF_BUILD_EXAMPLES OFF CACHE INTERNAL "EXAMPLES")
Expand Down
24 changes: 17 additions & 7 deletions sycl/cmake/modules/UnifiedRuntimeTag.cmake
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
# commit 222e4b1d51536bb38e03e2000a79679af0a44a6d
# Merge: 30d183a0 28108a7e
# Author: Kenneth Benzie (Benie) <[email protected]>
# Date: Fri Jan 17 10:28:34 2025 +0000
# Merge pull request #2561 from Bensuo/ben/cmd-buffer-l0-fence
# [L0][CMDBUF] Optimize fence/event waits during update
set(UNIFIED_RUNTIME_TAG 222e4b1d51536bb38e03e2000a79679af0a44a6d)
# commit d49c8cd9e4b911444c9abc124a9e6fb0b097c074
# Author: Przemek Malon <[email protected]>
# Date: Wed Jan 8 19:53:17 2025 +0000
# Enable creation of images backed by host USM
#
# Small patch to enable bindless images backed by host USM in the CUDA
# adapter.
#
# Host and Device USM pointers are usable across the host and device
# for all versions of CUDA that we support. There is no need to provide
# the `CU_MEMHOSTALLOC_DEVICEMAP` flag during allocation, or calling
# `cuMemHostGetDevicePointer` to retrieve a device usable address.
#
# Passing a `CU_MEMHOSTALLOC_WRITECOMBINED` flag to the host USM
# allocation will enhance performance in certain scenarios, however, an
# extension allowing this is not yet available.
set(UNIFIED_RUNTIME_TAG d49c8cd9e4b911444c9abc124a9e6fb0b097c074)
147 changes: 147 additions & 0 deletions sycl/test-e2e/bindless_images/sampling_2D_USM_host.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
// REQUIRES: cuda
// REQUIRES: aspect-ext_oneapi_bindless_images_2d_usm

// RUN: %{build} -o %t.out
// RUN: %{run-unfiltered-devices} %t.out

#include <cmath>
#include <iostream>
#include <sycl/detail/core.hpp>

#include <sycl/ext/oneapi/bindless_images.hpp>
#include <sycl/usm.hpp>

// Uncomment to print additional test information
// #define VERBOSE_PRINT

class image_addition;

int main() {

sycl::device dev;
sycl::queue q(dev);
auto ctxt = q.get_context();

// declare image data
size_t width = 5;
size_t height = 6;
size_t N = width * height;
size_t widthInBytes = width * sizeof(float);
std::vector<float> out(N);
std::vector<float> expected(N);
std::vector<float> dataIn(N);

for (int i = 0; i < width; i++) {
for (int j = 0; j < height; j++) {
expected[i + (width * j)] = i + (width * j);
dataIn[i + (width * j)] = i + (width * j);
}
}

try {
sycl::ext::oneapi::experimental::bindless_image_sampler samp(
sycl::addressing_mode::clamp,
sycl::coordinate_normalization_mode::normalized,
sycl::filtering_mode::linear);

// Extension: image descriptor
sycl::ext::oneapi::experimental::image_descriptor desc(
{width, height}, 1, sycl::image_channel_type::fp32);

auto devicePitchAlign = dev.get_info<
sycl::ext::oneapi::experimental::info::device::image_row_pitch_align>();
auto deviceMaxPitch =
dev.get_info<sycl::ext::oneapi::experimental::info::device::
max_image_linear_row_pitch>();

// Pitch requirements:
// - pitch % devicePitchAlign == 0
// - pitch >= widthInBytes
// - pitch <= deviceMaxPitch
size_t pitch = devicePitchAlign *
std::ceil(float(widthInBytes) / float(devicePitchAlign));
assert(pitch <= deviceMaxPitch);

// Host USM allocation
auto imgMem =
sycl::aligned_alloc_host(devicePitchAlign, (pitch * height), ctxt);

if (imgMem == nullptr) {
std::cerr << "Error allocating images!" << std::endl;
return 1;
}

// Copy to host USM and incorporate pitch
for (size_t i = 0; i < height; i++) {
memcpy(static_cast<float *>(imgMem) + (i * pitch / sizeof(float)),
dataIn.data() + (i * width), widthInBytes);
}

// Extension: create the image and return the handle
sycl::ext::oneapi::experimental::sampled_image_handle imgHandle =
sycl::ext::oneapi::experimental::create_image(imgMem, pitch, samp, desc,
dev, ctxt);

sycl::buffer<float, 2> buf((float *)out.data(),
sycl::range<2>{height, width});
q.submit([&](sycl::handler &cgh) {
auto outAcc = buf.get_access<sycl::access_mode::write>(
cgh, sycl::range<2>{height, width});

cgh.parallel_for<image_addition>(
sycl::nd_range<2>{{width, height}, {width, height}},
[=](sycl::nd_item<2> it) {
size_t dim0 = it.get_local_id(0);
size_t dim1 = it.get_local_id(1);

// Normalize coordinates -- +0.5 to look towards centre of pixel
float fdim0 = float(dim0 + 0.5f) / (float)width;
float fdim1 = float(dim1 + 0.5f) / (float)height;

// Extension: sample image data from handle
float px = sycl::ext::oneapi::experimental::sample_image<float>(
imgHandle, sycl::float2(fdim0, fdim1));

outAcc[sycl::id<2>{dim1, dim0}] = px;
});
});

q.wait_and_throw();

// Extension: cleanup
sycl::ext::oneapi::experimental::destroy_image_handle(imgHandle, dev, ctxt);
sycl::free(imgMem, ctxt);
} catch (sycl::exception e) {
std::cerr << "SYCL exception caught! : " << e.what() << "\n";
return 1;
} catch (...) {
std::cerr << "Unknown exception caught!\n";
return 2;
}

// collect and validate output
bool validated = true;
for (int i = 0; i < N; i++) {
bool mismatch = false;
if (out[i] != expected[i]) {
mismatch = true;
validated = false;
}

if (mismatch) {
#ifdef VERBOSE_PRINT
std::cout << "Result mismatch! Expected: " << expected[i]
<< ", Actual: " << out[i] << std::endl;
#else
break;
#endif
}
}
if (validated) {
std::cout << "Test passed!" << std::endl;
return 0;
}

std::cout << "Test failed!" << std::endl;
return 3;
}
Loading