From 2c643b844e2e4e8bb5b90f4c0bfe8de778f98e61 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Thu, 7 Jul 2022 19:11:49 +0200 Subject: [PATCH 1/3] Put RNG in shared memory in TransportElectrons --- examples/Example19/electrons.cu | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/examples/Example19/electrons.cu b/examples/Example19/electrons.cu index 3abba5fec..a4118e37e 100644 --- a/examples/Example19/electrons.cu +++ b/examples/Example19/electrons.cu @@ -43,6 +43,10 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons constexpr double Mass = copcore::units::kElectronMassC2; fieldPropagatorConstBz fieldPropagatorBz(BzFieldValue); + // The shared memory handles the access pattern to the RNG better than global memory. And we don't have enough + // registers to keep it local. This is a byte array, because RanluxppDouble has a ctor that we do not want to run. + __shared__ std::byte rngSM[ThreadsPerBlock * sizeof(RanluxppDouble)]; + int activeSize = active->size(); for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < activeSize; i += blockDim.x * gridDim.x) { const int globalSlot = (*active)[i]; @@ -53,7 +57,11 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons const int lvolID = volume->GetLogicalVolume()->id(); const int theMCIndex = MCIndex[lvolID]; + auto &rngState = reinterpret_cast(rngSM)[threadIdx.x]; + rngState = currentTrack.rngState; + auto survive = [&](bool push = true) { + currentTrack.rngState = rngState; if (push) activeQueue->push_back(globalSlot); }; @@ -76,7 +84,7 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons // Prepare a branched RNG state while threads are synchronized. Even if not // used, this provides a fresh round of random numbers and reduces thread // divergence because the RNG state doesn't need to be advanced later. - RanluxppDouble newRNG(currentTrack.rngState.BranchNoAdvance()); + RanluxppDouble newRNG(rngState.BranchNoAdvance()); // Compute safety, needed for MSC step limit. double safety = 0; @@ -85,13 +93,13 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons } theTrack->SetSafety(safety); - G4HepEmRandomEngine rnge(¤tTrack.rngState); + G4HepEmRandomEngine rnge(&rngState); // Sample the `number-of-interaction-left` and put it into the track. for (int ip = 0; ip < 3; ++ip) { double numIALeft = currentTrack.numIALeft[ip]; if (numIALeft <= 0) { - numIALeft = -std::log(currentTrack.Uniform()); + numIALeft = -std::log(rngState.Rndm()); } theTrack->SetNumIALeft(numIALeft, ip); } @@ -228,9 +236,9 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons Track &gamma2 = secondaries.gammas.NextTrack(); atomicAdd(&globalScoring->numGammas, 2); - const double cost = 2 * currentTrack.Uniform() - 1; + const double cost = 2 * rngState.Rndm() - 1; const double sint = sqrt(1 - cost * cost); - const double phi = k2Pi * currentTrack.Uniform(); + const double phi = k2Pi * rngState.Rndm(); double sinPhi, cosPhi; sincos(phi, &sinPhi, &cosPhi); @@ -242,7 +250,7 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons gamma2.InitAsSecondary(/*parent=*/currentTrack); // Reuse the RNG state of the dying track. - gamma2.rngState = currentTrack.rngState; + gamma2.rngState = rngState; gamma2.energy = copcore::units::kElectronMassC2; gamma2.dir = -gamma1.dir; } @@ -279,7 +287,7 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons currentTrack.numIALeft[winnerProcessIndex] = -1.0; // Check if a delta interaction happens instead of the real discrete process. - if (G4HepEmElectronManager::CheckDelta(&g4HepEmData, theTrack, currentTrack.Uniform())) { + if (G4HepEmElectronManager::CheckDelta(&g4HepEmData, theTrack, rngState.Rndm())) { // A delta interaction happened, move on. survive(); continue; From 51ab9db069dc779cbf2f7b98071cd28f341eebe4 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 7 Oct 2022 13:28:10 +0200 Subject: [PATCH 2/3] Put RNG in SM for electron interactions --- examples/Example19/electrons.cu | 16 ++++++++++++---- examples/Example19/example.cuh | 2 +- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/examples/Example19/electrons.cu b/examples/Example19/electrons.cu index a4118e37e..946949938 100644 --- a/examples/Example19/electrons.cu +++ b/examples/Example19/electrons.cu @@ -326,13 +326,21 @@ __device__ void ElectronInteraction(int const globalSlot, SOAData const & /*soaD const int lvolID = volume->GetLogicalVolume()->id(); const int theMCIndex = MCIndex[lvolID]; - auto survive = [&] { activeQueue->push_back(globalSlot); }; + __shared__ std::byte rngSM[ThreadsPerBlock * sizeof(RanluxppDouble)]; + + auto &rngState = reinterpret_cast(rngSM)[threadIdx.x]; + rngState = currentTrack.rngState; + + auto survive = [&] { + currentTrack.rngState = rngState; + activeQueue->push_back(globalSlot); + }; const double energy = currentTrack.energy; const double theElCut = g4HepEmData.fTheMatCutData->fMatCutData[theMCIndex].fSecElProdCutE; - RanluxppDouble newRNG{currentTrack.rngState.Branch()}; - G4HepEmRandomEngine rnge{¤tTrack.rngState}; + RanluxppDouble newRNG{rngState.Branch()}; + G4HepEmRandomEngine rnge{&rngState}; if constexpr (ProcessIndex == 0) { // Invoke ionization (for e-/e+): @@ -397,7 +405,7 @@ __device__ void ElectronInteraction(int const globalSlot, SOAData const & /*soaD gamma2.InitAsSecondary(/*parent=*/currentTrack); // Reuse the RNG state of the dying track. - gamma2.rngState = currentTrack.rngState; + gamma2.rngState = rngState; gamma2.energy = theGamma2Ekin; gamma2.dir.Set(theGamma2Dir[0], theGamma2Dir[1], theGamma2Dir[2]); diff --git a/examples/Example19/example.cuh b/examples/Example19/example.cuh index 24327929c..4e831d81e 100644 --- a/examples/Example19/example.cuh +++ b/examples/Example19/example.cuh @@ -141,7 +141,7 @@ template __device__ void InteractionLoop(Func interactionFunction, adept::MParray const *active, SOAData const soaData, Args &&...args) { - constexpr unsigned int sharedSize = 8192; + constexpr unsigned int sharedSize = 7166; __shared__ int candidates[sharedSize]; __shared__ unsigned int counter; __shared__ int threadsRunning; From 232d6db989c98c03399275b4c6a1fcf0d14a66c8 Mon Sep 17 00:00:00 2001 From: Bernhard Manfred Gruber Date: Fri, 7 Oct 2022 14:07:58 +0200 Subject: [PATCH 3/3] Put RNG in SM for gamma interactions --- examples/Example19/gammas.cu | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/examples/Example19/gammas.cu b/examples/Example19/gammas.cu index eb75061b3..87d2bf78b 100644 --- a/examples/Example19/gammas.cu +++ b/examples/Example19/gammas.cu @@ -135,10 +135,18 @@ __device__ void GammaInteraction(int const globalSlot, SOAData const &soaData, i const int theMCIndex = MCIndex[lvolID]; const auto energy = currentTrack.energy; - auto survive = [&] { activeQueue->push_back(globalSlot); }; + __shared__ std::byte rngSM[ThreadsPerBlock * sizeof(RanluxppDouble)]; - RanluxppDouble newRNG{currentTrack.rngState.Branch()}; - G4HepEmRandomEngine rnge{¤tTrack.rngState}; + auto &rngState = reinterpret_cast(rngSM)[threadIdx.x]; + rngState = currentTrack.rngState; + + auto survive = [&] { + currentTrack.rngState = rngState; + activeQueue->push_back(globalSlot); + }; + + RanluxppDouble newRNG{rngState.Branch()}; + G4HepEmRandomEngine rnge{&rngState}; if constexpr (ProcessIndex == 0) { // Invoke gamma conversion to e-/e+ pairs, if the energy is above the threshold. @@ -169,7 +177,7 @@ __device__ void GammaInteraction(int const globalSlot, SOAData const &soaData, i positron.InitAsSecondary(/*parent=*/currentTrack); // Reuse the RNG state of the dying track. - positron.rngState = currentTrack.rngState; + positron.rngState = rngState; positron.energy = posKinEnergy; positron.dir.Set(dirSecondaryPos[0], dirSecondaryPos[1], dirSecondaryPos[2]);