Skip to content

Commit 3f487d8

Browse files
authored
add a float atomics sample (#125)
* initial version * added documentation added AMD intrinsic variant * more updates and intermediate results validation * temporarily enable intermediate result checks for emulated atomics * add documentation, clean up * add a slower emulated fallback that uses a compare-and-swap loop Might try to shift to the older OpenCL 1.x atomics to improve portability. * switch to the OpenCL 1.x atomics for more portability * final tidy up * a few more minor fixes * fix one very long line
1 parent 979eb30 commit 3f487d8

File tree

5 files changed

+330
-0
lines changed

5 files changed

+330
-0
lines changed

include/CL/opencl.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1799,6 +1799,12 @@ CL_HPP_DECLARE_PARAM_TRAITS_(cl_mutable_command_info_khr, CL_MUTABLE_DISPATCH_LO
17991799
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_KERNEL_CLOCK_CAPABILITIES_KHR, cl_device_kernel_clock_capabilities_khr)
18001800
#endif /* cl_khr_kernel_clock */
18011801

1802+
#if defined(cl_ext_float_atomics)
1803+
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)
1804+
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)
1805+
CL_HPP_DECLARE_PARAM_TRAITS_(cl_device_info, CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT, cl_device_fp_atomic_capabilities_ext)
1806+
#endif /* cl_ext_float_atomics */
1807+
18021808
#if defined(cl_intel_command_queue_families)
18031809
CL_HPP_PARAM_NAME_CL_INTEL_COMMAND_QUEUE_FAMILIES_(CL_HPP_DECLARE_PARAM_TRAITS_)
18041810
#endif // cl_intel_command_queue_families
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Copyright (c) 2024 Ben Ashbaugh
2+
#
3+
# SPDX-License-Identifier: MIT
4+
5+
# Declares sample number 16, target "floatatomics", built from main.cpp.
# NOTE(review): add_opencl_sample is defined elsewhere; TEST presumably
# registers the sample as a test and VERSION 120 presumably selects the
# OpenCL 1.2 API version -- confirm against the helper's definition.
add_opencl_sample(
6+
TEST
7+
NUMBER 16
8+
TARGET floatatomics
9+
VERSION 120
10+
SOURCES main.cpp)

samples/16_floatatomics/README.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
# Floating-point Atomic Adds
2+
3+
## Sample Purpose
4+
5+
This is an advanced sample that demonstrates how to do atomic floating-point addition in a kernel.
6+
The most standard way to perform atomic floating-point addition uses the [cl_ext_float_atomics](https://registry.khronos.org/OpenCL/extensions/ext/cl_ext_float_atomics.html) extension.
7+
This extension adds device queries and built-in functions to optionally support floating-point atomic add, min, max, load, and store on 16-bit, 32-bit, and 64-bit floating-point types.
8+
When the `cl_ext_float_atomics` extension is supported, and 32-bit floating point atomic adds are supported, this sample will use the built-in functions added by this extension.
9+
10+
This sample also includes fallback implementations when the `cl_ext_float_atomics` extension is not supported:
11+
12+
* For NVIDIA GPUs, this sample includes a fallback that does the floating-point atomic add using inline PTX assembly language.
13+
* For AMD GPUs, this sample includes a fallback that calls a compiler intrinsic to do the floating-point atomic add.
14+
* For other devices, this sample includes two fallback implementations:
15+
* The first emulates the floating-point atomic add using 32-bit `atomic_xchg` functions.
16+
This fallback implementation cannot reliably return the "old" value that was in memory before performing the atomic add, so it is not suitable for every usage, but it does work for some important use cases, such as reductions.
17+
* The second emulates the floating-point atomic add using 32-bit `atomic_cmpxchg` functions.
18+
This is a slower emulation, but it is able to reliably return the "old" value that was in memory before performing the atomic add.
19+
20+
This sample was inspired by the blog post: https://pipinspace.github.io/blog/atomic-float-addition-in-opencl.html
21+
22+
## Key APIs and Concepts
23+
24+
```
25+
CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT
26+
__opencl_c_ext_fp32_global_atomic_add
27+
atomic_fetch_add_explicit
28+
atomic_xchg
29+
atomic_cmpxchg
30+
```
31+
32+
## Command Line Options
33+
34+
| Option | Default Value | Description |
35+
|:--|:-:|:--|
36+
| `-d <index>` | 0 | Specify the index of the OpenCL device in the platform to execute the sample on.
37+
| `-p <index>` | 0 | Specify the index of the OpenCL platform to execute the sample on.
38+
| `-i <number>` | 16 | Specify the number of iterations to execute.
39+
| `--gwx <number>` | 65536 | Specify the global work size, which is also the number of floating-point atomics to perform.
40+
| `-e` | N/A | Unconditionally use the emulated floating-point atomic add.
41+
| `-s` | N/A | Unconditionally use the slower and safer emulated floating-point atomic add.
42+
| `-c` | N/A | Check intermediate results for correctness, unsupported for the faster emulated atomics, requires adding a positive value.

samples/16_floatatomics/main.cpp

Lines changed: 271 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,271 @@
1+
/*
2+
// Copyright (c) 2024 Ben Ashbaugh
3+
//
4+
// SPDX-License-Identifier: MIT
5+
*/
6+
7+
#include <popl/popl.hpp>
8+
9+
#include <CL/opencl.hpp>
10+
11+
#include <algorithm>
12+
#include <chrono>
13+
#include <cinttypes>
14+
#include <vector>
15+
16+
#include "util.hpp"
17+
18+
// OpenCL C source for this sample, compiled at runtime.
//
// atomic_add_f() atomically adds `val` to `*addr`. The implementation is
// selected when the kernel is compiled:
//   1. the cl_ext_float_atomics built-in, when
//      __opencl_c_ext_fp32_global_atomic_add is defined;
//   2. inline PTX, when cl_nv_pragma_unroll is defined;
//   3. the AMD compiler builtin, when __has_builtin reports it;
//   4. an atomic_xchg based emulation, which always returns 0.0f;
//   5. an atomic_cmpxchg loop (when SLOW_EMULATE is defined), which returns
//      the previous value.
// Defining EMULATE disables options 1-3.
//
// FloatAtomicTest adds 1.0f to dst[0] once per work-item and stores the value
// returned by atomic_add_f into results[get_global_id(0)].
static const char kernelString[] = R"CLC(
// Compilers that do not provide __has_builtin would otherwise fail to
// preprocess the "#elif __has_builtin(...)" test below whenever it is reached
// (for example when EMULATE is defined), so provide a fallback definition.
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif

float atomic_add_f(volatile global float* addr, float val)
{
#if defined(__opencl_c_ext_fp32_global_atomic_add) && !defined(EMULATE)
    //#pragma message("using cl_ext_float_atomics")
    return atomic_fetch_add_explicit((volatile global atomic_float*)addr, val, memory_order_relaxed);
#elif defined(cl_nv_pragma_unroll) && !defined(EMULATE)
    //#pragma message("using PTX atomics")
    float ret; asm volatile("atom.global.add.f32 %0,[%1],%2;":"=f"(ret):"l"(addr),"f"(val):"memory");
    return ret;
#elif __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32) && !defined(EMULATE)
    //#pragma message("using AMD atomics")
    return __builtin_amdgcn_global_atomic_fadd_f32(addr, val);
#elif !defined(SLOW_EMULATE)
    // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
    //#pragma message("using emulated float atomics")
    float old = val; while((old=atomic_xchg(addr, atomic_xchg(addr, 0.0f)+old))!=0.0f);
    // Note: this emulated version cannot reliably return the previous value!
    // This makes it unsuitable for general-purpose use, but it is sufficient
    // for some cases, such as reductions.
    return 0.0f;
#else
    // This is the traditional fallback that uses a compare and exchange loop.
    // It is much slower, but it supports returning the previous value.
    //#pragma message("using slow emulated float atomics")
    volatile global int* iaddr = (volatile global int*)addr;
    int old;
    int check;
    do {
        old = atomic_or(iaddr, 0);  // emulated atomic load
        int new = as_int(as_float(old) + val);
        check = atomic_cmpxchg(iaddr, old, new);
    } while (check != old);
    return as_float(old);
#endif
}

kernel void FloatAtomicTest(global float* dst, global float* results)
{
    int index = get_global_id(0);
    results[index] = atomic_add_f(dst, 1.0f);
}
)CLC";
61+
62+
// Prints one line for every named capability bit that is set in `caps`,
// followed by one line (in hexadecimal) for any remaining set bits that this
// sample does not know by name.
static void PrintFloatAtomicCapabilities(
    cl_device_fp_atomic_capabilities_ext caps )
{
    // Single table of the known bits, used both for printing and for
    // computing the mask of unknown bits, so the two cannot drift apart.
    static const struct {
        cl_device_fp_atomic_capabilities_ext bit;
        const char* name;
    } knownCaps[] = {
        { CL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT, "CL_DEVICE_GLOBAL_FP_ATOMIC_LOAD_STORE_EXT" },
        { CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT,        "CL_DEVICE_GLOBAL_FP_ATOMIC_ADD_EXT" },
        { CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT,    "CL_DEVICE_GLOBAL_FP_ATOMIC_MIN_MAX_EXT" },
        { CL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT,  "CL_DEVICE_LOCAL_FP_ATOMIC_LOAD_STORE_EXT" },
        { CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT,         "CL_DEVICE_LOCAL_FP_ATOMIC_ADD_EXT" },
        { CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT,     "CL_DEVICE_LOCAL_FP_ATOMIC_MIN_MAX_EXT" },
    };

    // Use the float-atomics bitfield type here, not the unrelated
    // command-buffer capabilities type.
    cl_device_fp_atomic_capabilities_ext extra = caps;
    for (const auto& known : knownCaps) {
        if (caps & known.bit) {
            printf("\t\t%s\n", known.name);
        }
        extra &= ~known.bit;
    }

    if (extra) {
        // The cast keeps the argument type in step with the PRIx64 format.
        printf("\t\t(Unknown capability: %016" PRIx64 ")\n",
            static_cast<std::uint64_t>(extra));
    }
}
83+
84+
int main(
85+
int argc,
86+
char** argv )
87+
{
88+
int platformIndex = 0;
89+
int deviceIndex = 0;
90+
91+
size_t iterations = 16;
92+
size_t gwx = 64 * 1024;
93+
94+
bool emulate = false;
95+
bool slowEmulate = false;
96+
bool check = false;
97+
98+
{
99+
popl::OptionParser op("Supported Options");
100+
op.add<popl::Value<int>>("p", "platform", "Platform Index", platformIndex, &platformIndex);
101+
op.add<popl::Value<int>>("d", "device", "Device Index", deviceIndex, &deviceIndex);
102+
op.add<popl::Value<size_t>>("i", "iterations", "Iterations", iterations, &iterations);
103+
op.add<popl::Value<size_t>>("", "gwx", "Global Work Size X AKA Number of Atomics", gwx, &gwx);
104+
op.add<popl::Switch>("e", "emulate", "Unconditionally Emulate Float Atomics", &emulate);
105+
op.add<popl::Switch>("s", "slow-emulate", "Unconditionally Emulate Float Atomics (slowly and safely)", &slowEmulate);
106+
op.add<popl::Switch>("c", "check", "Check Intermediate Results", &check);
107+
108+
bool printUsage = false;
109+
try {
110+
op.parse(argc, argv);
111+
} catch (std::exception& e) {
112+
fprintf(stderr, "Error: %s\n\n", e.what());
113+
printUsage = true;
114+
}
115+
116+
if (printUsage || !op.unknown_options().empty() || !op.non_option_args().empty()) {
117+
fprintf(stderr,
118+
"Usage: floatatomics [options]\n"
119+
"%s", op.help().c_str());
120+
return -1;
121+
}
122+
}
123+
124+
std::vector<cl::Platform> platforms;
125+
cl::Platform::get(&platforms);
126+
127+
printf("Running on platform: %s\n",
128+
platforms[platformIndex].getInfo<CL_PLATFORM_NAME>().c_str() );
129+
130+
std::vector<cl::Device> devices;
131+
platforms[platformIndex].getDevices(CL_DEVICE_TYPE_ALL, &devices);
132+
133+
printf("Running on device: %s\n",
134+
devices[deviceIndex].getInfo<CL_DEVICE_NAME>().c_str() );
135+
136+
// On some implementations, the feature test macros for float atomics are
137+
// only defined when compiling for OpenCL C 3.0 or newer.
138+
std::string buildOptions = "-cl-std=CL3.0";
139+
if (slowEmulate) {
140+
printf("Forcing slow and safe emulation.\n");
141+
buildOptions += " -DEMULATE -DSLOW_EMULATE";
142+
} else if (emulate) {
143+
printf("Forcing emulation.\n");
144+
buildOptions += " -DEMULATE";
145+
} else if (!checkDeviceForExtension(devices[deviceIndex], CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME)) {
146+
printf("Device does not support " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n");
147+
} else {
148+
printf("Device supports " CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME ".\n");
149+
150+
cl_device_fp_atomic_capabilities_ext spcaps =
151+
devices[deviceIndex].getInfo<CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT>();
152+
printf("CL_DEVICE_SINGLE_FP_ATOMIC_CAPABILITIES_EXT:\n");
153+
PrintFloatAtomicCapabilities(spcaps);
154+
155+
cl_device_fp_atomic_capabilities_ext dpcaps =
156+
devices[deviceIndex].getInfo<CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT>();
157+
printf("CL_DEVICE_DOUBLE_FP_ATOMIC_CAPABILITIES_EXT:\n");
158+
PrintFloatAtomicCapabilities(dpcaps);
159+
160+
cl_device_fp_atomic_capabilities_ext hpcaps =
161+
devices[deviceIndex].getInfo<CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT>();
162+
printf("CL_DEVICE_HALF_FP_ATOMIC_CAPABILITIES_EXT:\n");
163+
PrintFloatAtomicCapabilities(hpcaps);
164+
}
165+
166+
cl::Context context{devices[deviceIndex]};
167+
cl::CommandQueue commandQueue{context, devices[deviceIndex]};
168+
169+
cl::Program program{ context, kernelString };
170+
program.build(buildOptions);
171+
cl::Kernel kernel = cl::Kernel{ program, "FloatAtomicTest" };
172+
173+
cl::Buffer dst = cl::Buffer{
174+
context,
175+
CL_MEM_READ_WRITE,
176+
sizeof(cl_float) };
177+
cl::Buffer intermediates = cl::Buffer{
178+
context,
179+
CL_MEM_READ_WRITE,
180+
gwx * sizeof(cl_float) };
181+
182+
// execution
183+
{
184+
kernel.setArg(0, dst);
185+
kernel.setArg(1, intermediates);
186+
187+
commandQueue.finish();
188+
189+
auto start = std::chrono::system_clock::now();
190+
for (size_t i = 0; i < iterations; i++) {
191+
cl_float zero = 0.0f;
192+
commandQueue.enqueueFillBuffer(
193+
dst,
194+
zero,
195+
0,
196+
sizeof(zero));
197+
commandQueue.enqueueNDRangeKernel(
198+
kernel,
199+
cl::NullRange,
200+
cl::NDRange{gwx});
201+
}
202+
203+
commandQueue.finish();
204+
205+
auto end = std::chrono::system_clock::now();
206+
std::chrono::duration<float> elapsed_seconds = end - start;
207+
printf("Finished in %f seconds\n", elapsed_seconds.count());
208+
}
209+
210+
// basic validation
211+
{
212+
cl_float check = 0.0f;
213+
for (size_t i = 0; i < gwx; i++) {
214+
check += 1.0f;
215+
}
216+
217+
cl_float result = 0.0f;
218+
commandQueue.enqueueReadBuffer(
219+
dst,
220+
CL_TRUE,
221+
0,
222+
sizeof(result),
223+
&result);
224+
if (result != check) {
225+
printf("Error: expected %f, got %f!\n", check, result);
226+
} else {
227+
printf("Basic Validation: Success.\n");
228+
}
229+
}
230+
231+
// intermediate results validation
232+
if (check) {
233+
if (emulate && !slowEmulate) {
234+
printf("The emulated float atomic add does not support intermediate results.\n");
235+
} else {
236+
std::vector<cl_float> test(gwx);
237+
commandQueue.enqueueReadBuffer(
238+
intermediates,
239+
CL_TRUE,
240+
0,
241+
gwx * sizeof(cl_float),
242+
test.data());
243+
244+
std::sort(test.begin(), test.end());
245+
246+
size_t mismatches = 0;
247+
for (size_t i = 0; i < gwx; i++) {
248+
if (i == 0 && !(test[i] == 0.0f)) {
249+
if (mismatches < 16) {
250+
printf("Error at index %zu: expected %f, got %f!\n", i, 0.0f, test[i]);
251+
}
252+
mismatches++;
253+
} else if (i > 0 && !(test[i] > test[i-1])) {
254+
if (mismatches < 16) {
255+
printf("Error at index %zu: expected %f > %f!\n", i, test[i], test[i-1]);
256+
}
257+
mismatches++;
258+
}
259+
}
260+
261+
if (mismatches) {
262+
printf("Intermediate Results Validation: Found %zu mismatches / %zu values!!!\n",
263+
mismatches, gwx);
264+
} else {
265+
printf("Intermediate Results Validation: Success.\n");
266+
}
267+
}
268+
}
269+
270+
return 0;
271+
}

samples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ add_subdirectory( 05_spirvkernelfromfile )
7575
add_subdirectory( 06_ndrangekernelfromfile )
7676

7777
add_subdirectory( 10_queueexperiments )
78+
add_subdirectory( 16_floatatomics )
7879

7980
set(BUILD_EXTENSION_SAMPLES TRUE)
8081
if(NOT TARGET OpenCLExt)

0 commit comments

Comments
 (0)