-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreduction.cu
114 lines (92 loc) · 3.12 KB
/
reduction.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h> /* time */
#include <curand.h>
#include <curand_kernel.h>
#define LATTICE_LENGTH 256
#define LATTICE_2 (LATTICE_LENGTH * LATTICE_LENGTH)
#define N LATTICE_LENGTH
//
// __global__ void reduce0(int *g_idata, int *g_odata)
// {
// extern __shared__ int sdata[];
// // each thread loads one element from global to shared mem
// unsigned int tidx = threadIdx.x;
// unsigned int tidy = threadIdx.y;
//
// unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
// unsigned int j = blockIdx.y * blockDim.y + threadIdx.y;
//
// sdata[tidx] = g_idata[i + j * N];
// __syncthreads();
// // do reduction in shared mem
// for(unsigned int s=1; s < blockDim.x; s *= 2) {
// if (tid % (2*s) == 0) {
// sdata[tid] += sdata[tid + s];
// }
// __syncthreads();
// }
//
// // write result for this block to global mem
// if (tid == 0) g_odata[blockIdx.x] = sdata[0];
//
// printf("die\n");
// }
__global__ void test(int *input, int *output){
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < N && idy < N){
output[idx + idy * N] = input[idx + idy * N];
}
}
__global__ void printstate(int* output) {
const unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int idy = blockIdx.y * blockDim.y + threadIdx.y;
if (idx < N && idy < N){
printf("%d, %d, %d\n", idx, idy, output[idx + idy * N]);
}
}
int main (int argc, char *argv[]){
int *input;
int *d_input;
int *output;
int *d_output;
int numthreadx = 16;
int numthready = 16;
int numblocksX = LATTICE_LENGTH / numthreadx;
int numblocksY = LATTICE_LENGTH / numthready;
input = (int*)malloc(LATTICE_2 * sizeof(int));
output = (int*)malloc(LATTICE_2 * sizeof(int));
for(int i = 0; i < LATTICE_2; i++){
input[i] = 1;
output[i] = 0;
}
const size_t bytes_input = LATTICE_2 * sizeof(int);
const size_t bytes_output = LATTICE_2 * sizeof(int);
// Set dimensions of block and grid
dim3 grid(numblocksX, numblocksY, 1);
dim3 thread(numthreadx, numthready,1);
// Allocate memoery in device and copy from host to device
cudaMalloc((void **)&d_input, bytes_input);
cudaMalloc((void **)&d_output, bytes_output);
cudaMemcpy(d_input, input, bytes_input, cudaMemcpyHostToDevice);
cudaMemcpy(d_output, output, bytes_output, cudaMemcpyHostToDevice);
cudaDeviceSetLimit(cudaLimitPrintfFifoSize, N * N * sizeof(int) * N);
test<<<grid, thread>>>(d_input, d_output);
cudaMemcpy(output, d_output, bytes_output, cudaMemcpyDeviceToHost);
int sum = 0;
for (int i = 0; i < N ; i++){
for (int j = 0; j < N; j++){
printf("%d \n", output[i + j * N]);
// sum += output[i + j * N];
}
}
// printf("%d\n", sum);
// printstate<<<grid, thread>>>(d_output);
cudaDeviceSynchronize();
free(input);
cudaFree(d_input);
free(output);
cudaFree(d_output);
}