Skip to content

Put RNG in shared memory where beneficial #229

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 27 additions & 11 deletions examples/Example19/electrons.cu
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,10 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
constexpr double Mass = copcore::units::kElectronMassC2;
fieldPropagatorConstBz fieldPropagatorBz(BzFieldValue);

// The shared memory handles the access pattern to the RNG better than global memory. And we don't have enough
// registers to keep it local. This is a byte array, because RanluxppDouble has a ctor that we do not want to run.
__shared__ std::byte rngSM[ThreadsPerBlock * sizeof(RanluxppDouble)];

int activeSize = active->size();
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < activeSize; i += blockDim.x * gridDim.x) {
const int globalSlot = (*active)[i];
Expand All @@ -53,7 +57,11 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
const int lvolID = volume->GetLogicalVolume()->id();
const int theMCIndex = MCIndex[lvolID];

auto &rngState = reinterpret_cast<RanluxppDouble *>(rngSM)[threadIdx.x];
rngState = currentTrack.rngState;

auto survive = [&](bool push = true) {
currentTrack.rngState = rngState;
if (push) activeQueue->push_back(globalSlot);
};

Expand All @@ -76,7 +84,7 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
// Prepare a branched RNG state while threads are synchronized. Even if not
// used, this provides a fresh round of random numbers and reduces thread
// divergence because the RNG state doesn't need to be advanced later.
RanluxppDouble newRNG(currentTrack.rngState.BranchNoAdvance());
RanluxppDouble newRNG(rngState.BranchNoAdvance());

// Compute safety, needed for MSC step limit.
double safety = 0;
Expand All @@ -85,13 +93,13 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
}
theTrack->SetSafety(safety);

G4HepEmRandomEngine rnge(&currentTrack.rngState);
G4HepEmRandomEngine rnge(&rngState);

// Sample the `number-of-interaction-left` and put it into the track.
for (int ip = 0; ip < 3; ++ip) {
double numIALeft = currentTrack.numIALeft[ip];
if (numIALeft <= 0) {
numIALeft = -std::log(currentTrack.Uniform());
numIALeft = -std::log(rngState.Rndm());
}
theTrack->SetNumIALeft(numIALeft, ip);
}
Expand Down Expand Up @@ -228,9 +236,9 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
Track &gamma2 = secondaries.gammas.NextTrack();
atomicAdd(&globalScoring->numGammas, 2);

const double cost = 2 * currentTrack.Uniform() - 1;
const double cost = 2 * rngState.Rndm() - 1;
const double sint = sqrt(1 - cost * cost);
const double phi = k2Pi * currentTrack.Uniform();
const double phi = k2Pi * rngState.Rndm();
double sinPhi, cosPhi;
sincos(phi, &sinPhi, &cosPhi);

Expand All @@ -242,7 +250,7 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons

gamma2.InitAsSecondary(/*parent=*/currentTrack);
// Reuse the RNG state of the dying track.
gamma2.rngState = currentTrack.rngState;
gamma2.rngState = rngState;
gamma2.energy = copcore::units::kElectronMassC2;
gamma2.dir = -gamma1.dir;
}
Expand Down Expand Up @@ -279,7 +287,7 @@ static __device__ __forceinline__ void TransportElectrons(Track *electrons, cons
currentTrack.numIALeft[winnerProcessIndex] = -1.0;

// Check if a delta interaction happens instead of the real discrete process.
if (G4HepEmElectronManager::CheckDelta(&g4HepEmData, theTrack, currentTrack.Uniform())) {
if (G4HepEmElectronManager::CheckDelta(&g4HepEmData, theTrack, rngState.Rndm())) {
// A delta interaction happened, move on.
survive();
continue;
Expand Down Expand Up @@ -318,13 +326,21 @@ __device__ void ElectronInteraction(int const globalSlot, SOAData const & /*soaD
const int lvolID = volume->GetLogicalVolume()->id();
const int theMCIndex = MCIndex[lvolID];

auto survive = [&] { activeQueue->push_back(globalSlot); };
__shared__ std::byte rngSM[ThreadsPerBlock * sizeof(RanluxppDouble)];

auto &rngState = reinterpret_cast<RanluxppDouble *>(rngSM)[threadIdx.x];
rngState = currentTrack.rngState;

auto survive = [&] {
currentTrack.rngState = rngState;
activeQueue->push_back(globalSlot);
};

const double energy = currentTrack.energy;
const double theElCut = g4HepEmData.fTheMatCutData->fMatCutData[theMCIndex].fSecElProdCutE;

RanluxppDouble newRNG{currentTrack.rngState.Branch()};
G4HepEmRandomEngine rnge{&currentTrack.rngState};
RanluxppDouble newRNG{rngState.Branch()};
G4HepEmRandomEngine rnge{&rngState};

if constexpr (ProcessIndex == 0) {
// Invoke ionization (for e-/e+):
Expand Down Expand Up @@ -389,7 +405,7 @@ __device__ void ElectronInteraction(int const globalSlot, SOAData const & /*soaD

gamma2.InitAsSecondary(/*parent=*/currentTrack);
// Reuse the RNG state of the dying track.
gamma2.rngState = currentTrack.rngState;
gamma2.rngState = rngState;
gamma2.energy = theGamma2Ekin;
gamma2.dir.Set(theGamma2Dir[0], theGamma2Dir[1], theGamma2Dir[2]);

Expand Down
2 changes: 1 addition & 1 deletion examples/Example19/example.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ template <int ProcessIndex, typename Func, typename... Args>
__device__ void InteractionLoop(Func interactionFunction, adept::MParray const *active, SOAData const soaData,
Args &&...args)
{
constexpr unsigned int sharedSize = 8192;
constexpr unsigned int sharedSize = 7166;
__shared__ int candidates[sharedSize];
__shared__ unsigned int counter;
__shared__ int threadsRunning;
Expand Down
16 changes: 12 additions & 4 deletions examples/Example19/gammas.cu
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,18 @@ __device__ void GammaInteraction(int const globalSlot, SOAData const &soaData, i
const int theMCIndex = MCIndex[lvolID];
const auto energy = currentTrack.energy;

auto survive = [&] { activeQueue->push_back(globalSlot); };
__shared__ std::byte rngSM[ThreadsPerBlock * sizeof(RanluxppDouble)];

RanluxppDouble newRNG{currentTrack.rngState.Branch()};
G4HepEmRandomEngine rnge{&currentTrack.rngState};
auto &rngState = reinterpret_cast<RanluxppDouble *>(rngSM)[threadIdx.x];
rngState = currentTrack.rngState;

auto survive = [&] {
currentTrack.rngState = rngState;
activeQueue->push_back(globalSlot);
};

RanluxppDouble newRNG{rngState.Branch()};
G4HepEmRandomEngine rnge{&rngState};

if constexpr (ProcessIndex == 0) {
// Invoke gamma conversion to e-/e+ pairs, if the energy is above the threshold.
Expand Down Expand Up @@ -169,7 +177,7 @@ __device__ void GammaInteraction(int const globalSlot, SOAData const &soaData, i

positron.InitAsSecondary(/*parent=*/currentTrack);
// Reuse the RNG state of the dying track.
positron.rngState = currentTrack.rngState;
positron.rngState = rngState;
positron.energy = posKinEnergy;
positron.dir.Set(dirSecondaryPos[0], dirSecondaryPos[1], dirSecondaryPos[2]);

Expand Down