-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMersenneTwister_kernel.cu
136 lines (112 loc) · 4.36 KB
/
MersenneTwister_kernel.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
/*
* Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
//#include <shrUtils.h>
#include <stdio.h>
#include "MersenneTwister.h"
__device__ static mt_struct_stripped ds_MT[MT_RNG_COUNT];
static mt_struct_stripped h_MT[MT_RNG_COUNT];
//Load twister configurations
void loadMTGPU(const char *fname){
FILE *fd = fopen(fname, "rb");
if(!fd){
fprintf(stderr,"initMTGPU(): failed to open %s\n", fname);
fprintf(stderr,"FAILED\n");
exit(0);
}
if( !fread(h_MT, sizeof(h_MT), 1, fd) ){
fprintf(stderr,"initMTGPU(): failed to load %s\n", fname);
fprintf(stderr,"FAILED\n");
exit(0);
}
fclose(fd);
}
//Initialize/seed twister for current GPU context
void seedMTGPU(unsigned int seed){
int i;
//Need to be thread-safe
mt_struct_stripped *MT = (mt_struct_stripped *)malloc(MT_RNG_COUNT * sizeof(mt_struct_stripped));
for(i = 0; i < MT_RNG_COUNT; i++){
MT[i] = h_MT[i];
MT[i].seed = seed;
}
cudaMemcpyToSymbol(ds_MT, MT, sizeof(h_MT));
free(MT);
}
////////////////////////////////////////////////////////////////////////////////
// Write MT_RNG_COUNT vertical lanes of nPerRng random numbers to *d_Random.
// For coalesced global writes MT_RNG_COUNT should be a multiple of warp size.
// Initial states for each generator are the same, since the states are
// initialized from the global seed. In order to improve distribution properties
// on small NPerRng supply dedicated (local) seed to each twister.
// The local seeds, in their turn, can be extracted from global seed
// by means of any simple random number generator, like LCG.
////////////////////////////////////////////////////////////////////////////////
__global__ void RandomGPU(
float *d_Random,
int nPerRng)
{
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
int iState, iState1, iStateM, iOut;
unsigned int mti, mti1, mtiM, x;
unsigned int mt[MT_NN], matrix_a, mask_b, mask_c;
//Load bit-vector Mersenne Twister parameters
matrix_a = ds_MT[tid].matrix_a;
mask_b = ds_MT[tid].mask_b;
mask_c = ds_MT[tid].mask_c;
//Initialize current state
mt[0] = ds_MT[tid].seed;
for (iState = 1; iState < MT_NN; iState++)
mt[iState] = (1812433253U * (mt[iState - 1] ^ (mt[iState - 1] >> 30)) + iState) & MT_WMASK;
iState = 0;
mti1 = mt[0];
for (iOut = 0; iOut < nPerRng; iOut++) {
iState1 = iState + 1;
iStateM = iState + MT_MM;
if(iState1 >= MT_NN) iState1 -= MT_NN;
if(iStateM >= MT_NN) iStateM -= MT_NN;
mti = mti1;
mti1 = mt[iState1];
mtiM = mt[iStateM];
// MT recurrence
x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);
mt[iState] = x;
iState = iState1;
//Tempering transformation
x ^= (x >> MT_SHIFT0);
x ^= (x << MT_SHIFTB) & mask_b;
x ^= (x << MT_SHIFTC) & mask_c;
x ^= (x >> MT_SHIFT1);
//Convert to (0, 1] float and write to global memory
d_Random[tid + iOut * MT_RNG_COUNT] = ((float)x + 1.0f) / 4294967296.0f;
}
}
////////////////////////////////////////////////////////////////////////////////
// Transform each of MT_RNG_COUNT lanes of nPerRng uniformly distributed
// random samples, produced by RandomGPU(), to normally distributed lanes
// using Cartesian form of Box-Muller transformation.
// nPerRng must be even.
////////////////////////////////////////////////////////////////////////////////
#define PI 3.14159265358979f
__device__ inline void BoxMuller(float& u1, float& u2){
float r = sqrtf(-2.0f * logf(u1));
float phi = 2 * PI * u2;
u1 = r * __cosf(phi);
u2 = r * __sinf(phi);
}
__global__ void BoxMullerGPU(float *d_Random, int nPerRng){
const int tid = blockDim.x * blockIdx.x + threadIdx.x;
for (int iOut = 0; iOut < nPerRng; iOut += 2)
BoxMuller(
d_Random[tid + (iOut + 0) * MT_RNG_COUNT],
d_Random[tid + (iOut + 1) * MT_RNG_COUNT]
);
}