|
| 1 | +/* |
| 2 | +// Copyright (c) 2024 Ben Ashbaugh |
| 3 | +// |
| 4 | +// SPDX-License-Identifier: MIT |
| 5 | +*/ |
| 6 | + |
| 7 | +#include <popl/popl.hpp> |
| 8 | + |
| 9 | +#include <CL/opencl.hpp> |
| 10 | + |
| 11 | +#include "util.hpp" |
| 12 | + |
| 13 | +// TODO: clean this up once support is in the upstream headers. |
| 14 | +#if !defined(cl_intel_concurrent_dispatch) |
| 15 | + |
| 16 | +#define cl_intel_concurrent_dispatch 1 |
| 17 | +#define CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_NAME \ |
| 18 | + "cl_intel_concurrent_dispatch" |
| 19 | + |
| 20 | +#define CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0) |
| 21 | + |
| 22 | +/* cl_kernel_exec_info */ |
| 23 | +#define CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_INTEL 0x4257 |
| 24 | + |
| 25 | +typedef cl_uint cl_kernel_exec_info_dispatch_type_intel; |
| 26 | + |
| 27 | +/* cl_kernel_exec_info_dispatch_type_intel */ |
| 28 | +#define CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_DEFAULT_INTEL 0 |
| 29 | +#define CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_CONCURRENT_INTEL 1 |
| 30 | + |
| 31 | +typedef cl_int CL_API_CALL |
| 32 | +clGetKernelMaxConcurrentWorkGroupCountINTEL_t( |
| 33 | + cl_command_queue command_queue, |
| 34 | + cl_kernel kernel, |
| 35 | + cl_uint work_dim, |
| 36 | + const size_t* global_work_offset, |
| 37 | + const size_t* local_work_size, |
| 38 | + size_t* max_work_group_count); |
| 39 | + |
| 40 | +typedef clGetKernelMaxConcurrentWorkGroupCountINTEL_t * |
| 41 | +clGetKernelMaxConcurrentWorkGroupCountINTEL_fn ; |
| 42 | + |
| 43 | +#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) |
| 44 | + |
| 45 | +extern CL_API_ENTRY cl_int CL_API_CALL |
| 46 | +clGetKernelMaxConcurrentWorkGroupCountINTEL( |
| 47 | + cl_command_queue command_queue, |
| 48 | + cl_kernel kernel, |
| 49 | + cl_uint work_dim, |
| 50 | + const size_t* global_work_offset, |
| 51 | + const size_t* local_work_size, |
| 52 | + size_t* max_work_group_count) ; |
| 53 | + |
| 54 | +#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */ |
| 55 | + |
| 56 | +#endif // !defined(cl_intel_concurrent_dispatch) |
| 57 | + |
| 58 | +static const char kernelString[] = R"CLC( |
| 59 | +#pragma OPENCL EXTENSION cl_intel_concurrent_dispatch : enable |
| 60 | +kernel void DeviceBarrierTest( global uint* dst ) |
| 61 | +{ |
| 62 | + const size_t gws = get_global_size(0); |
| 63 | + atomic_add( &dst[gws], 1 ); |
| 64 | +
|
| 65 | + //if (intel_is_device_barrier_valid()) { |
| 66 | + //intel_device_barrier( CLK_LOCAL_MEM_FENCE ); // TODO: check fence flags |
| 67 | + //intel_device_barrier( CLK_LOCAL_MEM_FENCE, memory_scope_device ); // TODO: check fence flags |
| 68 | + //} |
| 69 | +
|
| 70 | + const uint id = get_global_id(0); |
| 71 | + dst[id] = dst[gws] + 1; |
| 72 | +} |
| 73 | +)CLC"; |
| 74 | + |
| 75 | +int main(int argc, char** argv) |
| 76 | +{ |
| 77 | + int platformIndex = 0; |
| 78 | + int deviceIndex = 0; |
| 79 | + |
| 80 | + size_t iterations = 16; |
| 81 | + size_t lws = 64; |
| 82 | + size_t wgCount = 0; |
| 83 | + |
| 84 | + { |
| 85 | + popl::OptionParser op("Supported Options"); |
| 86 | + op.add<popl::Value<int>>("p", "platform", "Platform Index", platformIndex, &platformIndex); |
| 87 | + op.add<popl::Value<int>>("d", "device", "Device Index", deviceIndex, &deviceIndex); |
| 88 | + op.add<popl::Value<size_t>>("i", "iterations", "Iterations", iterations, &iterations); |
| 89 | + op.add<popl::Value<size_t>>("", "lws", "Local Work-Group Size", lws, &lws); |
| 90 | + |
| 91 | + bool printUsage = false; |
| 92 | + try { |
| 93 | + op.parse(argc, argv); |
| 94 | + } catch (std::exception& e) { |
| 95 | + fprintf(stderr, "Error: %s\n\n", e.what()); |
| 96 | + printUsage = true; |
| 97 | + } |
| 98 | + |
| 99 | + if (printUsage || !op.unknown_options().empty() || !op.non_option_args().empty()) { |
| 100 | + fprintf(stderr, |
| 101 | + "Usage: concurrentdispatch [options]\n" |
| 102 | + "%s", op.help().c_str()); |
| 103 | + return -1; |
| 104 | + } |
| 105 | + } |
| 106 | + |
| 107 | + std::vector<cl::Platform> platforms; |
| 108 | + cl::Platform::get(&platforms); |
| 109 | + |
| 110 | + printf("Running on platform: %s\n", |
| 111 | + platforms[platformIndex].getInfo<CL_PLATFORM_NAME>().c_str() ); |
| 112 | + |
| 113 | + std::vector<cl::Device> devices; |
| 114 | + platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices); |
| 115 | + |
| 116 | + printf("Running on device: %s\n", |
| 117 | + devices[deviceIndex].getInfo<CL_DEVICE_NAME>().c_str() ); |
| 118 | + |
| 119 | + // On some implementations, the feature test macros for float atomics are |
| 120 | + // only defined when compiling for OpenCL C 3.0 or newer. |
| 121 | + if (checkDeviceForExtension(devices[deviceIndex], CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_NAME)) { |
| 122 | + printf("Device supports " CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_NAME ".\n"); |
| 123 | + } else { |
| 124 | + printf("Device does not support " CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_NAME ".\n"); |
| 125 | + return -1; |
| 126 | + } |
| 127 | + |
| 128 | + cl::Context context{devices[deviceIndex]}; |
| 129 | + cl::CommandQueue commandQueue{context, devices[deviceIndex]}; |
| 130 | + |
| 131 | + cl::Program program{ context, kernelString }; |
| 132 | + program.build("-cl-std=CL3.0"); |
| 133 | + cl::Kernel kernel = cl::Kernel{ program, "DeviceBarrierTest" }; |
| 134 | + |
| 135 | + cl_kernel_exec_info_dispatch_type_intel dispatchType = |
| 136 | + CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_CONCURRENT_INTEL; |
| 137 | + kernel.setExecInfo(CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_INTEL, dispatchType); |
| 138 | + |
| 139 | + auto clGetKernelMaxConcurrentWorkGroupCountINTEL_ = (clGetKernelMaxConcurrentWorkGroupCountINTEL_fn) |
| 140 | + clGetExtensionFunctionAddressForPlatform( |
| 141 | + platforms[platformIndex](), |
| 142 | + "clGetKernelMaxConcurrentWorkGroupCountINTEL"); |
| 143 | + clGetKernelMaxConcurrentWorkGroupCountINTEL_( |
| 144 | + commandQueue(), |
| 145 | + kernel(), |
| 146 | + 1, |
| 147 | + nullptr, |
| 148 | + &lws, |
| 149 | + &wgCount); |
| 150 | + |
| 151 | + printf("Max concurrent work-group count for local work size %zu is %zu.\n", |
| 152 | + lws, wgCount); |
| 153 | + |
| 154 | + const size_t gws = lws * wgCount; |
| 155 | + |
| 156 | + cl::Buffer dst = cl::Buffer{ |
| 157 | + context, |
| 158 | + CL_MEM_READ_WRITE, |
| 159 | + (gws + 1) * sizeof(cl_uint) }; |
| 160 | + |
| 161 | + // execution |
| 162 | + { |
| 163 | + kernel.setArg(0, dst); |
| 164 | + |
| 165 | + commandQueue.finish(); |
| 166 | + |
| 167 | + auto start = std::chrono::system_clock::now(); |
| 168 | + for (size_t i = 0; i < iterations; i++) { |
| 169 | + cl_uint zero = 0; |
| 170 | + commandQueue.enqueueFillBuffer( |
| 171 | + dst, |
| 172 | + zero, |
| 173 | + 0, |
| 174 | + (gws + 1) * sizeof(cl_uint)); |
| 175 | + commandQueue.enqueueNDRangeKernel( |
| 176 | + kernel, |
| 177 | + cl::NullRange, |
| 178 | + cl::NDRange{gws}, |
| 179 | + cl::NDRange{lws}); |
| 180 | + } |
| 181 | + |
| 182 | + commandQueue.finish(); |
| 183 | + |
| 184 | + auto end = std::chrono::system_clock::now(); |
| 185 | + std::chrono::duration<float> elapsed_seconds = end - start; |
| 186 | + printf("Finished in %f seconds\n", elapsed_seconds.count()); |
| 187 | + } |
| 188 | + |
| 189 | + // verification |
| 190 | + { |
| 191 | + const cl_uint* pDst = (const cl_uint*)commandQueue.enqueueMapBuffer( |
| 192 | + dst, |
| 193 | + CL_TRUE, |
| 194 | + CL_MAP_READ, |
| 195 | + 0, |
| 196 | + (gws + 1) * sizeof(cl_uint) ); |
| 197 | + |
| 198 | + size_t mismatches = 0; |
| 199 | + |
| 200 | + for( size_t i = 0; i < gws + 1; i++ ) |
| 201 | + { |
| 202 | + uint check = (i == gws) ? gws : gws + 1; |
| 203 | + if( pDst[i] != check ) |
| 204 | + { |
| 205 | + if( mismatches < 16 ) |
| 206 | + { |
| 207 | + fprintf(stderr, "MisMatch! dst[%zu] == %08X, want %08X\n", |
| 208 | + i, |
| 209 | + pDst[i], |
| 210 | + check ); |
| 211 | + } |
| 212 | + mismatches++; |
| 213 | + } |
| 214 | + } |
| 215 | + |
| 216 | + if( mismatches ) |
| 217 | + { |
| 218 | + fprintf(stderr, "Error: Found %zu mismatches / %zu values!!!\n", |
| 219 | + mismatches, |
| 220 | + gws + 1 ); |
| 221 | + } |
| 222 | + else |
| 223 | + { |
| 224 | + printf("Success.\n"); |
| 225 | + } |
| 226 | + |
| 227 | + commandQueue.enqueueUnmapMemObject( |
| 228 | + dst, |
| 229 | + (void*)pDst ); |
| 230 | + } |
| 231 | + |
| 232 | + return 0; |
| 233 | +} |
0 commit comments