Skip to content

Commit 935f200

Browse files
committed
add the start of a test for cl_intel_concurrent_dispatch
1 parent b1a1675 commit 935f200

File tree

3 files changed

+245
-0
lines changed

3 files changed

+245
-0
lines changed
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Copyright (c) 2024 Ben Ashbaugh
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
add_opencl_sample(
6+
TEST
7+
NUMBER 17
8+
TARGET concurrentdispatch
9+
VERSION 200 # for clSetKernelExecInfo
10+
SOURCES main.cpp
11+
LIBS OpenCLExt)
Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
/*
2+
// Copyright (c) 2024 Ben Ashbaugh
3+
//
4+
// SPDX-License-Identifier: MIT
5+
*/
6+
7+
#include <popl/popl.hpp>
8+
9+
#include <CL/opencl.hpp>
10+
11+
#include "util.hpp"
12+
13+
// TODO: clean this up once support is in the upstream headers.
14+
#if !defined(cl_intel_concurrent_dispatch)
15+
16+
#define cl_intel_concurrent_dispatch 1
17+
#define CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_NAME \
18+
"cl_intel_concurrent_dispatch"
19+
20+
#define CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_VERSION CL_MAKE_VERSION(1, 0, 0)
21+
22+
/* cl_kernel_exec_info */
23+
#define CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_INTEL 0x4257
24+
25+
typedef cl_uint cl_kernel_exec_info_dispatch_type_intel;
26+
27+
/* cl_kernel_exec_info_dispatch_type_intel */
28+
#define CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_DEFAULT_INTEL 0
29+
#define CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_CONCURRENT_INTEL 1
30+
31+
typedef cl_int CL_API_CALL
32+
clGetKernelMaxConcurrentWorkGroupCountINTEL_t(
33+
cl_command_queue command_queue,
34+
cl_kernel kernel,
35+
cl_uint work_dim,
36+
const size_t* global_work_offset,
37+
const size_t* local_work_size,
38+
size_t* max_work_group_count);
39+
40+
typedef clGetKernelMaxConcurrentWorkGroupCountINTEL_t *
41+
clGetKernelMaxConcurrentWorkGroupCountINTEL_fn ;
42+
43+
#if !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES)
44+
45+
extern CL_API_ENTRY cl_int CL_API_CALL
46+
clGetKernelMaxConcurrentWorkGroupCountINTEL(
47+
cl_command_queue command_queue,
48+
cl_kernel kernel,
49+
cl_uint work_dim,
50+
const size_t* global_work_offset,
51+
const size_t* local_work_size,
52+
size_t* max_work_group_count) ;
53+
54+
#endif /* !defined(CL_NO_NON_ICD_DISPATCH_EXTENSION_PROTOTYPES) */
55+
56+
#endif // !defined(cl_intel_concurrent_dispatch)
57+
58+
static const char kernelString[] = R"CLC(
59+
#pragma OPENCL EXTENSION cl_intel_concurrent_dispatch : enable
60+
kernel void DeviceBarrierTest( global uint* dst )
61+
{
62+
const size_t gws = get_global_size(0);
63+
atomic_add( &dst[gws], 1 );
64+
65+
//if (intel_is_device_barrier_valid()) {
66+
//intel_device_barrier( CLK_LOCAL_MEM_FENCE ); // TODO: check fence flags
67+
//intel_device_barrier( CLK_LOCAL_MEM_FENCE, memory_scope_device ); // TODO: check fence flags
68+
//}
69+
70+
const uint id = get_global_id(0);
71+
dst[id] = dst[gws] + 1;
72+
}
73+
)CLC";
74+
75+
int main(int argc, char** argv)
76+
{
77+
int platformIndex = 0;
78+
int deviceIndex = 0;
79+
80+
size_t iterations = 16;
81+
size_t lws = 64;
82+
size_t wgCount = 0;
83+
84+
{
85+
popl::OptionParser op("Supported Options");
86+
op.add<popl::Value<int>>("p", "platform", "Platform Index", platformIndex, &platformIndex);
87+
op.add<popl::Value<int>>("d", "device", "Device Index", deviceIndex, &deviceIndex);
88+
op.add<popl::Value<size_t>>("i", "iterations", "Iterations", iterations, &iterations);
89+
op.add<popl::Value<size_t>>("", "lws", "Local Work-Group Size", lws, &lws);
90+
91+
bool printUsage = false;
92+
try {
93+
op.parse(argc, argv);
94+
} catch (std::exception& e) {
95+
fprintf(stderr, "Error: %s\n\n", e.what());
96+
printUsage = true;
97+
}
98+
99+
if (printUsage || !op.unknown_options().empty() || !op.non_option_args().empty()) {
100+
fprintf(stderr,
101+
"Usage: concurrentdispatch [options]\n"
102+
"%s", op.help().c_str());
103+
return -1;
104+
}
105+
}
106+
107+
std::vector<cl::Platform> platforms;
108+
cl::Platform::get(&platforms);
109+
110+
printf("Running on platform: %s\n",
111+
platforms[platformIndex].getInfo<CL_PLATFORM_NAME>().c_str() );
112+
113+
std::vector<cl::Device> devices;
114+
platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);
115+
116+
printf("Running on device: %s\n",
117+
devices[deviceIndex].getInfo<CL_DEVICE_NAME>().c_str() );
118+
119+
// On some implementations, the feature test macros for float atomics are
120+
// only defined when compiling for OpenCL C 3.0 or newer.
121+
if (checkDeviceForExtension(devices[deviceIndex], CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_NAME)) {
122+
printf("Device supports " CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_NAME ".\n");
123+
} else {
124+
printf("Device does not support " CL_INTEL_CONCURRENT_DISPATCH_EXTENSION_NAME ".\n");
125+
return -1;
126+
}
127+
128+
cl::Context context{devices[deviceIndex]};
129+
cl::CommandQueue commandQueue{context, devices[deviceIndex]};
130+
131+
cl::Program program{ context, kernelString };
132+
program.build("-cl-std=CL3.0");
133+
cl::Kernel kernel = cl::Kernel{ program, "DeviceBarrierTest" };
134+
135+
cl_kernel_exec_info_dispatch_type_intel dispatchType =
136+
CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_CONCURRENT_INTEL;
137+
kernel.setExecInfo(CL_KERNEL_EXEC_INFO_DISPATCH_TYPE_INTEL, dispatchType);
138+
139+
auto clGetKernelMaxConcurrentWorkGroupCountINTEL_ = (clGetKernelMaxConcurrentWorkGroupCountINTEL_fn)
140+
clGetExtensionFunctionAddressForPlatform(
141+
platforms[platformIndex](),
142+
"clGetKernelMaxConcurrentWorkGroupCountINTEL");
143+
clGetKernelMaxConcurrentWorkGroupCountINTEL_(
144+
commandQueue(),
145+
kernel(),
146+
1,
147+
nullptr,
148+
&lws,
149+
&wgCount);
150+
151+
printf("Max concurrent work-group count for local work size %zu is %zu.\n",
152+
lws, wgCount);
153+
154+
const size_t gws = lws * wgCount;
155+
156+
cl::Buffer dst = cl::Buffer{
157+
context,
158+
CL_MEM_READ_WRITE,
159+
(gws + 1) * sizeof(cl_uint) };
160+
161+
// execution
162+
{
163+
kernel.setArg(0, dst);
164+
165+
commandQueue.finish();
166+
167+
auto start = std::chrono::system_clock::now();
168+
for (size_t i = 0; i < iterations; i++) {
169+
cl_uint zero = 0;
170+
commandQueue.enqueueFillBuffer(
171+
dst,
172+
zero,
173+
0,
174+
(gws + 1) * sizeof(cl_uint));
175+
commandQueue.enqueueNDRangeKernel(
176+
kernel,
177+
cl::NullRange,
178+
cl::NDRange{gws},
179+
cl::NDRange{lws});
180+
}
181+
182+
commandQueue.finish();
183+
184+
auto end = std::chrono::system_clock::now();
185+
std::chrono::duration<float> elapsed_seconds = end - start;
186+
printf("Finished in %f seconds\n", elapsed_seconds.count());
187+
}
188+
189+
// verification
190+
{
191+
const cl_uint* pDst = (const cl_uint*)commandQueue.enqueueMapBuffer(
192+
dst,
193+
CL_TRUE,
194+
CL_MAP_READ,
195+
0,
196+
(gws + 1) * sizeof(cl_uint) );
197+
198+
size_t mismatches = 0;
199+
200+
for( size_t i = 0; i < gws + 1; i++ )
201+
{
202+
uint check = (i == gws) ? gws : gws + 1;
203+
if( pDst[i] != check )
204+
{
205+
if( mismatches < 16 )
206+
{
207+
fprintf(stderr, "MisMatch! dst[%zu] == %08X, want %08X\n",
208+
i,
209+
pDst[i],
210+
check );
211+
}
212+
mismatches++;
213+
}
214+
}
215+
216+
if( mismatches )
217+
{
218+
fprintf(stderr, "Error: Found %zu mismatches / %zu values!!!\n",
219+
mismatches,
220+
gws + 1 );
221+
}
222+
else
223+
{
224+
printf("Success.\n");
225+
}
226+
227+
commandQueue.enqueueUnmapMemObject(
228+
dst,
229+
(void*)pDst );
230+
}
231+
232+
return 0;
233+
}

samples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,4 +90,5 @@ if(BUILD_EXTENSION_SAMPLES)
9090
add_subdirectory( 13_mutablecommandbuffers )
9191
add_subdirectory( 14_ooqcommandbuffers )
9292
add_subdirectory( 15_mutablecommandbufferasserts )
93+
add_subdirectory( 17_concurrentdispatch )
9394
endif()

0 commit comments

Comments
 (0)