21
21
#include < unordered_map>
22
22
#endif
23
23
24
+ #ifdef __CUDACC__
24
25
#include < nv/target>
26
+ #endif
25
27
26
28
// Defining these attributes seems to help nvc++ in OpenMP target offload mode.
27
29
#if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
28
30
defined (_OPENMP) && defined(__CUDACC__)
29
31
#define CORENRN_HOST_DEVICE __host__ __device__
30
32
#elif defined(__CUDACC__)
33
+ // This is necessary to make the new CUDA-syntax-in-.cpp version compile
31
34
#define CORENRN_HOST_DEVICE __host__ __device__
32
35
#else
33
36
#define CORENRN_HOST_DEVICE
@@ -88,20 +91,24 @@ using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_
88
91
OMP_Mutex g_instance_count_mutex;
89
92
std::size_t g_instance_count{};
90
93
91
- // not sure quite how nvc++ handles these, not sure we actually need the 2
92
- // different names?
93
94
philox4x32_key_t g_k{};
95
+ #ifdef __CUDACC__
96
+ // Not 100% clear we need a different name (g_k_dev) here in addition to g_k,
97
+ // but it's clearer and the overhead cannot be high (if it exists).
94
98
__constant__ __device__ philox4x32_key_t g_k_dev{};
95
99
// noinline to force "CUDA" not "acc routine seq" behaviour :shrug:
96
100
__attribute__ ((noinline)) philox4x32_key_t& global_state() {
97
101
if target (nv::target::is_device) {
98
- // printf("dev: &g_k=%p [seed %d]\n", &g_k_dev, g_k_dev.v[0]);
99
102
return g_k_dev;
100
103
} else {
101
- // printf("host: &g_k=%p [seed %d]\n", &g_k, g_k.v[0]);
102
104
return g_k;
103
105
}
104
106
}
107
+ #else
108
+ philox4x32_key_t & global_state () {
109
+ return g_k;
110
+ }
111
+ #endif
105
112
106
113
constexpr double SHIFT32 = 1.0 / 4294967297.0 ; /* 1/(2^32 + 1) */
107
114
@@ -114,14 +121,6 @@ CORENRN_HOST_DEVICE philox4x32_ctr_t philox4x32_helper(coreneuron::nrnran123_Sta
114
121
} // namespace
115
122
116
123
namespace coreneuron {
117
- void init_nrnran123 () {
118
- // if(coreneuron::gpu_enabled()) {
119
- // // TODO only do this if it isn't already present?
120
- // auto& g_k = global_state();
121
- // nrn_pragma_acc(enter data copyin(g_k))
122
- // }
123
- }
124
-
125
124
std::size_t nrnran123_instance_count () {
126
125
return g_instance_count;
127
126
}
@@ -216,6 +215,7 @@ void nrnran123_set_globalindex(uint32_t gix) {
216
215
if (g_k.v [0 ] != gix) {
217
216
g_k.v [0 ] = gix;
218
217
if (coreneuron::gpu_enabled ()) {
218
+ #ifdef __CUDACC__
219
219
{
220
220
auto const code = cudaMemcpyToSymbol (g_k_dev, &g_k, sizeof (g_k));
221
221
assert (code == cudaSuccess);
@@ -224,10 +224,10 @@ void nrnran123_set_globalindex(uint32_t gix) {
224
224
auto const code = cudaDeviceSynchronize ();
225
225
assert (code == cudaSuccess);
226
226
}
227
- std::cout << " trying to read g_k_dev from host... " << std::endl;
228
- std::cout << g_k_dev. v [ 0 ] << std::endl;
229
- // nrn_pragma_acc( update device (g_k))
230
- // nrn_pragma_omp(target update to(g_k))
227
+ # else
228
+ nrn_pragma_acc (update device (g_k))
229
+ nrn_pragma_omp (target update to (g_k))
230
+ # endif
231
231
}
232
232
}
233
233
}
0 commit comments