shared.cu
#include <stdio.h>
#include "repeat.h"
// Pointer-chase kernel: walks sdata[] in shared memory and times the chain of
// dependent loads with clock(). It is launched with a single thread so each
// load's latency is fully exposed.
__global__ void shared_latency (unsigned int * my_array, int array_length, int iterations, unsigned long long * duration) {
	unsigned int start_time, end_time;
	int i, k;
	unsigned int j = 0;
	unsigned long long sum_time;

	my_array[array_length - 1] = 0;
	sum_time = 0;
	duration[0] = 0;

	// sdata[] holds the data in shared memory. It is dynamically allocated at
	// launch time via the third <<<>>> parameter.
	extern __shared__ unsigned int sdata[];
	for (i = 0; i < array_length; i++) {
		sdata[i] = my_array[i];
	}

	j = 0;
	for (k = 0; k <= iterations; k++) {
		if (k == 1) {
			sum_time = 0;	// discard the first (warm-up) iteration
		}
		start_time = clock();
		repeat256(j = sdata[j];);	// 256 dependent shared-memory loads
		end_time = clock();
		sum_time += (end_time - start_time);
	}

	my_array[array_length - 1] = j;	// keep j live so the chase is not optimized away
	duration[0] = sum_time;
}
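// Worked example of the access pattern (an illustration, not part of the
// original file): with N-2 = 256 chase elements and stride = 2, the chain
// visits indices 0, 2, 4, ..., 254, 0, ... Each load's address depends on the
// previous load's value, so the 256 loads in the timed region serialize, and
// (end_time - start_time) / 256 approximates the latency of one
// shared-memory load at that stride.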
// Measures shared-memory load latency for one (N, stride) configuration.
// The chase covers N-2 elements; the last two elements of the array are
// used as dummy variables.
void parametric_measure_shared(int N, int iterations, int stride) {
	int i;
	unsigned int * h_a;
	unsigned int * d_a;
	unsigned long long * duration;
	unsigned long long * latency;
	cudaError_t error_id;

	/* allocate arrays on CPU */
	h_a = (unsigned int *)malloc(sizeof(unsigned int) * N);
	latency = (unsigned long long *)malloc(2 * sizeof(unsigned long long));

	/* initialize array elements on CPU: h_a[i] points 'stride' slots ahead,
	   modulo N-2, forming a closed chase loop */
	for (i = 0; i < N-2; i++) {
		h_a[i] = (i + stride) % (N-2);
	}
	h_a[N-2] = 0;
	h_a[N-1] = 0;

	/* allocate arrays on GPU */
	cudaMalloc ((void **) &d_a, sizeof(unsigned int) * N);
	cudaMalloc ((void **) &duration, 2*sizeof(unsigned long long));
	cudaDeviceSynchronize ();
	error_id = cudaGetLastError();
	if (error_id != cudaSuccess) {
		printf("Error 1 is %s\n", cudaGetErrorString(error_id));
	}

	/* copy array elements from CPU to GPU */
	cudaMemcpy((void *)d_a, (void *)h_a, sizeof(unsigned int) * N, cudaMemcpyHostToDevice);
	cudaMemcpy((void *)duration, (void *)latency, 2*sizeof(unsigned long long), cudaMemcpyHostToDevice);
	cudaDeviceSynchronize ();
	error_id = cudaGetLastError();
	if (error_id != cudaSuccess) {
		printf("Error 2 is %s\n", cudaGetErrorString(error_id));
	}

	/* launch kernel: a single thread, with the dynamic shared-memory size
	   passed as the third <<<>>> parameter */
	dim3 Db = dim3(1);
	dim3 Dg = dim3(1,1,1);
	//printf("Launch kernel with parameters: %d, N: %d, stride: %d\n", iterations, N, stride);
	int sharedMemSize = sizeof(unsigned int) * N;
	shared_latency <<<Dg, Db, sharedMemSize>>>(d_a, N, iterations, duration);
	cudaDeviceSynchronize ();
	error_id = cudaGetLastError();
	if (error_id != cudaSuccess) {
		printf("Error 3 is %s\n", cudaGetErrorString(error_id));
	}

	/* copy results from GPU to CPU */
	cudaDeviceSynchronize ();
	cudaMemcpy((void *)h_a, (void *)d_a, sizeof(unsigned int) * N, cudaMemcpyDeviceToHost);
	cudaMemcpy((void *)latency, (void *)duration, 2*sizeof(unsigned long long), cudaMemcpyDeviceToHost);
	cudaDeviceSynchronize ();

	/* print results: average clocks per load = total clocks / (256 loads * iterations) */
	printf(" %d, %f\n", stride, (double)latency[0] / (256.0 * iterations));

	/* free memory on GPU */
	cudaFree(d_a);
	cudaFree(duration);
	cudaDeviceSynchronize ();

	/* free memory on CPU */
	free(h_a);
	free(latency);
}
int main() {
	int N, stride;

	// initialize upper bounds here
	int stride_upper_bound = 1024;

	printf("Shared memory latency for varying stride.\n");
	printf("stride (4-byte words), latency (clocks)\n");

	N = 256;
	stride_upper_bound = N;	// sweep strides up to the chase-array size
	for (stride = 1; stride <= stride_upper_bound; stride += 1) {
		// N+2 accounts for the two dummy elements at the end of the array
		parametric_measure_shared(N+2, 10, stride);
	}
	return 0;
}
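// Build-and-run sketch (the exact -arch flag depends on your GPU, and
// repeat.h must be on the include path):
//   nvcc -O2 -arch=sm_70 -o shared shared.cu
//   ./shared
// The program prints one "stride, latency" line per stride from 1 to 256.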