@@ -28,22 +28,26 @@ float atomic_add_f(volatile global float* addr, float val)
#elif __has_builtin(__builtin_amdgcn_global_atomic_fadd_f32) && !defined(EMULATE)
//#pragma message("using AMD atomics")
return __builtin_amdgcn_global_atomic_fadd_f32(addr, val);
- #else // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
- #if 0
+ #elif !defined(SLOW_EMULATE)
+ // fallback, see: https://forums.developer.nvidia.com/t/atomicadd-float-float-atomicmul-float-float/14639/7
//#pragma message("using emulated float atomics")
float old = val; while((old=atomic_xchg(addr, atomic_xchg(addr, 0.0f)+old))!=0.0f);
// Note: this emulated version cannot reliably return the previous value!
// This makes it unsuitable for general-purpose use, but it is sufficient
// for some cases, such as reductions.
- // A more reliable version would use a compare-exchange loop, though it
- // would be much slower.
return 0.0f;
- #else
- float old = val;
- float ret = 0.0f;
- while ((old = atomic_xchg(addr, ret = atomic_xchg(addr, 0.0f) + old)) != 0.0f);
- return ret;
- #endif
+ #else
+ // This is the traditional fallback that uses a compare and exchange loop.
+ // It is much slower, but it supports returning the previous value.
+ //#pragma message("using slow emulated float atomics")
+ volatile global atomic_float* faddr = (volatile global atomic_float*)addr;
+ float old;
+ float new;
+ do {
+ old = atomic_load_explicit(faddr, memory_order_relaxed);
+ new = old + val;
+ } while (!atomic_compare_exchange_strong_explicit(faddr, &old, new, memory_order_relaxed, memory_order_relaxed));
+ return old;
#endif
}
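For context, the new SLOW_EMULATE branch is the classic load/compare-exchange retry pattern. A host-side C++ analogue of the same loop is sketched below; it is illustrative only (std::atomic<float> stands in for the OpenCL buffer, and the function name atomic_add_f_host is made up), but it shows why this variant can return the previous value:

#include <atomic>

// Sketch: host-side analogue of the kernel's slow-emulation path.
float atomic_add_f_host(std::atomic<float>& addr, float val)
{
    float old = addr.load(std::memory_order_relaxed);
    // On failure, compare_exchange_weak reloads 'old' with the current
    // value, so the next iteration recomputes old + val and retries.
    while (!addr.compare_exchange_weak(old, old + val,
                                       std::memory_order_relaxed,
                                       std::memory_order_relaxed)) {
    }
    return old; // the value observed just before the successful exchange
}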
@@ -87,6 +91,7 @@ int main(
size_t gwx = 64 * 1024;

bool emulate = false;
+ bool slowEmulate = false;
bool check = false;

{
@@ -96,6 +101,7 @@ int main(
op.add<popl::Value<size_t>>("i", "iterations", "Iterations", iterations, &iterations);
op.add<popl::Value<size_t>>("", "gwx", "Global Work Size X AKA Number of Atomics", gwx, &gwx);
op.add<popl::Switch>("e", "emulate", "Unconditionally Emulate Float Atomics", &emulate);
+ op.add<popl::Switch>("s", "slow-emulate", "Unconditionally Emulate Float Atomics with Return Support", &slowEmulate);
op.add<popl::Switch>("c", "check", "Check Intermediate Results", &check);

bool printUsage = false;
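With the switch registered above, the slow path can be requested from the command line. Assuming the sample builds to a binary named floatatomics (name illustrative), a run might look like:

$ ./floatatomics --slow-emulate --check    # short form: -s -c
Forcing slow and safe emulation.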
@@ -129,7 +135,10 @@ int main(
// On some implementations, the feature test macros for float atomics are
// only defined when compiling for OpenCL C 3.0 or newer.
std::string buildOptions = "-cl-std=CL3.0";
- if (emulate) {
+ if (slowEmulate) {
+ printf("Forcing slow and safe emulation.\n");
+ buildOptions += " -DEMULATE -DSLOW_EMULATE";
+ } else if (emulate) {
printf("Forcing emulation.\n");
buildOptions += " -DEMULATE";
} else if (!checkDeviceForExtension(devices[deviceIndex], CL_EXT_FLOAT_ATOMICS_EXTENSION_NAME)) {
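For reference, the program build options that each mode assembles above are:

default:        -cl-std=CL3.0
--emulate:      -cl-std=CL3.0 -DEMULATE
--slow-emulate: -cl-std=CL3.0 -DEMULATE -DSLOW_EMULATE

Defining EMULATE together with SLOW_EMULATE keeps the kernel from selecting the builtin paths, so the preprocessor falls through to the compare-and-exchange branch added in this change.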
@@ -220,8 +229,8 @@ int main(

// intermediate results validation
if (check) {
- if (false && emulate) {
- printf("Skipping The emulated float atomic add does not support intermediate results.\n");
+ if (emulate && !slowEmulate) {
+ printf("The emulated float atomic add does not support intermediate results.\n");
} else {
std::vector<cl_float> test(gwx);
commandQueue.enqueueReadBuffer(
0 commit comments