Skip to content

Commit 83aeec8

Browse files
committed
Add a slower emulated fallback that uses a compare-and-swap loop.
A future change might switch to the older OpenCL 1.x atomics to improve portability.
1 parent 9860956 commit 83aeec8

File tree

1 file changed

+22
-13
lines changed

1 file changed

+22
-13
lines changed

samples/16_floatatomics/main.cpp

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,22 +28,26 @@ float atomic_add_f(volatile global float* addr, float val)
2828
#elif __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32) && !defined(EMULATE)
2929
//#pragma message("using AMD atomics")
3030
return __builtin_amdgcn_global_atomic_fadd_f32(addr, val);
31-
#else // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
32-
#if 0
31+
#elif !defined(SLOW_EMULATE)
32+
// fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
3333
//#pragma message("using emulated float atomics")
3434
float old = val; while((old=atomic_xchg(addr, atomic_xchg(addr, 0.0f)+old))!=0.0f);
3535
// Note: this emulated version cannot reliably return the previous value!
3636
// This makes it unsuitable for general-purpose use, but it is sufficient
3737
// for some cases, such as reductions.
38-
// A more reliable version would use a compare-exchange loop, though it
39-
// would be much slower.
4038
return 0.0f;
41-
#else
42-
float old = val;
43-
float ret = 0.0f;
44-
while ((old = atomic_xchg(addr, ret = atomic_xchg(addr, 0.0f) + old)) != 0.0f);
45-
return ret;
46-
#endif
39+
#else
40+
// This is the traditional fallback that uses a compare and exchange loop.
41+
// It is much slower, but it supports returning the previous value.
42+
//#pragma message("using slow emulated float atomics")
43+
volatile global atomic_float* faddr = (volatile global atomic_float*)addr;
44+
float old;
45+
float new;
46+
do {
47+
old = atomic_load_explicit(faddr, memory_order_relaxed);
48+
new = old + val;
49+
} while (!atomic_compare_exchange_strong_explicit(faddr, &old, new, memory_order_relaxed, memory_order_relaxed));
50+
return old;
4751
#endif
4852
}
4953
@@ -87,6 +91,7 @@ int main(
8791
size_t gwx = 64 * 1024;
8892

8993
bool emulate = false;
94+
bool slowEmulate = false;
9095
bool check = false;
9196

9297
{
@@ -96,6 +101,7 @@ int main(
96101
op.add<popl::Value<size_t>>("i", "iterations", "Iterations", iterations, &iterations);
97102
op.add<popl::Value<size_t>>("", "gwx", "Global Work Size X AKA Number of Atomics", gwx, &gwx);
98103
op.add<popl::Switch>("e", "emulate", "Unconditionally Emulate Float Atomics", &emulate);
104+
op.add<popl::Switch>("s", "slow-emulate", "Unconditionally Emulate Float Atomics with Return Support", &slowEmulate);
99105
op.add<popl::Switch>("c", "check", "Check Intermediate Results", &check);
100106

101107
bool printUsage = false;
@@ -129,7 +135,10 @@ int main(
129135
// On some implementations, the feature test macros for float atomics are
130136
// only defined when compiling for OpenCL C 3.0 or newer.
131137
std::string buildOptions = "-cl-std=CL3.0";
132-
if (emulate) {
138+
if (slowEmulate) {
139+
printf("Forcing slow and safe emulation.\n");
140+
buildOptions += " -DEMULATE -DSLOW_EMULATE";
141+
} else if (emulate) {
133142
printf("Forcing emulation.\n");
134143
buildOptions += " -DEMULATE";
135144
} else if (!checkDeviceForExtension(devices[deviceIndex], CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME)) {
@@ -220,8 +229,8 @@ int main(
220229

221230
// intermediate results validation
222231
if (check) {
223-
if (false && emulate) {
224-
printf("Skipping The emulated float atomic add does not support intermediate results.\n");
232+
if (emulate && !slowEmulate) {
233+
printf("The emulated float atomic add does not support intermediate results.\n");
225234
} else {
226235
std::vector<cl_float> test(gwx);
227236
commandQueue.enqueueReadBuffer(

0 commit comments

Comments
 (0)