Cleanup: remove unused init_nrnran123(), drop debug output, guard CUDA-only code with __CUDACC__

olupton · olupton · commit 9ebd22d2bbe1 · 2022-04-26T10:54:29.000+02:00
diff --git a/coreneuron/gpu/nrn_acc_manager.cpp b/coreneuron/gpu/nrn_acc_manager.cpp
@@ -670,9 +670,8 @@ void delete_ivoc_vect_from_device(IvocVect& vec) {
     if (n) {
         cnrn_target_delete(vec.data(), n);
     }
-    // cnrn_target_delete(&vec);
 #else
-    (void) vec;
+    static_cast<void>(vec);
 #endif
 }
 
@@ -1336,8 +1335,6 @@ void init_gpu() {
         std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size
                   << " ranks per node\n";
     }
-
-    init_nrnran123();
 }
 
 void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {
diff --git a/coreneuron/gpu/nrn_acc_manager.hpp b/coreneuron/gpu/nrn_acc_manager.hpp
@@ -1,17 +1,16 @@
 /*
 # =============================================================================
-# Copyright (c) 2016 - 2021 Blue Brain Project/EPFL
+# Copyright (c) 2016 - 2022 Blue Brain Project/EPFL
 #
 # See top-level LICENSE file for details.
 # =============================================================================
 */
-
-#ifndef _nrn_device_manager_
-#define _nrn_device_manager_
-
-#include "coreneuron/sim/multicore.hpp"
+#pragma once
 
 namespace coreneuron {
+struct Memb_list;
+struct NrnThread;
+struct NetSendBuffer_t;
 void setup_nrnthreads_on_device(NrnThread* threads, int nthreads);
 void delete_nrnthreads_on_device(NrnThread* threads, int nthreads);
 void update_nrnthreads_on_host(NrnThread* threads, int nthreads);
@@ -24,6 +23,4 @@ void update_net_send_buffer_on_host(NrnThread* nt, NetSendBuffer_t* nsb);
 
 void update_weights_from_gpu(NrnThread* threads, int nthreads);
 void init_gpu();
-void init_nrnran123();
 }  // namespace coreneuron
-#endif  // _nrn_device_manager_
diff --git a/coreneuron/utils/randoms/nrnran123.cpp b/coreneuron/utils/randoms/nrnran123.cpp
@@ -21,13 +21,16 @@
 #include <unordered_map>
 #endif
 
+#ifdef __CUDACC__
 #include <nv/target>
+#endif
 
 // Defining these attributes seems to help nvc++ in OpenMP target offload mode.
 #if defined(CORENEURON_ENABLE_GPU) && defined(CORENEURON_PREFER_OPENMP_OFFLOAD) && \
     defined(_OPENMP) && defined(__CUDACC__)
 #define CORENRN_HOST_DEVICE __host__ __device__
 #elif defined(__CUDACC__)
+// This is necessary to make the new CUDA-syntax-in-.cpp version compile
 #define CORENRN_HOST_DEVICE __host__ __device__
 #else
 #define CORENRN_HOST_DEVICE
@@ -88,20 +91,24 @@ using random123_allocator = coreneuron::unified_allocator<coreneuron::nrnran123_
 OMP_Mutex g_instance_count_mutex;
 std::size_t g_instance_count{};
 
-// not sure quite how nvc++ handles these, not sure we actually need the 2
-// different names?
 philox4x32_key_t g_k{};
+#ifdef __CUDACC__
+// Not 100% clear we need a different name (g_k_dev) here in addition to g_k,
+// but it's clearer and the overhead cannot be high (if it exists).
 __constant__ __device__ philox4x32_key_t g_k_dev{};
 // noinline to force "CUDA" not "acc routine seq" behaviour :shrug:
 __attribute__((noinline)) philox4x32_key_t& global_state() {
     if target (nv::target::is_device) {
-        // printf("dev: &g_k=%p [seed %d]\n", &g_k_dev, g_k_dev.v[0]);
         return g_k_dev;
     } else {
-        // printf("host: &g_k=%p [seed %d]\n", &g_k, g_k.v[0]);
         return g_k;
     }
 }
+#else
+philox4x32_key_t& global_state() {
+    return g_k;
+}
+#endif
 
 constexpr double SHIFT32 = 1.0 / 4294967297.0; /* 1/(2^32 + 1) */
 
@@ -114,14 +121,6 @@ CORENRN_HOST_DEVICE philox4x32_ctr_t philox4x32_helper(coreneuron::nrnran123_Sta
 }  // namespace
 
 namespace coreneuron {
-void init_nrnran123() {
-    // if(coreneuron::gpu_enabled()) {
-    //     // TODO only do this if it isn't already present?
-    //     auto& g_k = global_state();
-    //     nrn_pragma_acc(enter data copyin(g_k))
-    // }
-}
-
 std::size_t nrnran123_instance_count() {
     return g_instance_count;
 }
@@ -216,6 +215,7 @@ void nrnran123_set_globalindex(uint32_t gix) {
     if (g_k.v[0] != gix) {
         g_k.v[0] = gix;
         if (coreneuron::gpu_enabled()) {
+#ifdef __CUDACC__
             {
                 auto const code = cudaMemcpyToSymbol(g_k_dev, &g_k, sizeof(g_k));
                 assert(code == cudaSuccess);
@@ -224,10 +224,10 @@ void nrnran123_set_globalindex(uint32_t gix) {
                 auto const code = cudaDeviceSynchronize();
                 assert(code == cudaSuccess);
             }
-            std::cout << "trying to read g_k_dev from host..." << std::endl;
-            std::cout << g_k_dev.v[0] << std::endl;
-            //     nrn_pragma_acc(update device(g_k))
-            //     nrn_pragma_omp(target update to(g_k))
+#else
+            nrn_pragma_acc(update device(g_k))
+            nrn_pragma_omp(target update to(g_k))
+#endif
         }
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -670,9 +670,8 @@ void delete_ivoc_vect_from_device(IvocVect& vec) {`
`670`	`670`	`if (n) {`
`671`	`671`	`cnrn_target_delete(vec.data(), n);`
`672`	`672`	`}`
`673`		`- // cnrn_target_delete(&vec);`
`674`	`673`	`#else`
`675`		`- (void) vec;`
	`674`	`+ static_cast<void>(vec);`
`676`	`675`	`#endif`
`677`	`676`	`}`
`678`	`677`
`@@ -1336,8 +1335,6 @@ void init_gpu() {`
`1336`	`1335`	`std::cout << " Info : " << num_devices_per_node << " GPUs shared by " << local_size`
`1337`	`1336`	`<< " ranks per node\n";`
`1338`	`1337`	`}`
`1339`		`-`
`1340`		`- init_nrnran123();`
`1341`	`1338`	`}`
`1342`	`1339`
`1343`	`1340`	`void nrn_VecPlay_copyto_device(NrnThread* nt, void** d_vecplay) {`