ping-pong.cpp

#include <cstdio>
#include <cstdlib>
#include <mpi.h>
#include <unistd.h>
#include <hip/hip_runtime.h>

/* HIP kernel to increment every element of a vector by one */
__global__ void add_kernel(double *in, int N)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < N)
        in[tid]++;
}

/*
   This routine can be used to inspect the properties of a node.
   Return arguments:

   nodeRank  (int *) -- My rank in the node communicator
   nodeProcs (int *) -- Total number of processes in this node
   devCount  (int *) -- Number of HIP devices available in the node
*/
void getNodeInfo(int *nodeRank, int *nodeProcs, int *devCount)
{
    MPI_Comm intranodecomm;
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &intranodecomm);

    MPI_Comm_rank(intranodecomm, nodeRank);
    MPI_Comm_size(intranodecomm, nodeProcs);

    MPI_Comm_free(&intranodecomm);
    hipGetDeviceCount(devCount);
}
/* Ping-pong test for CPU-to-CPU communication */
void CPUtoCPU(int rank, double *data, int N, double &timer)
{
    double start, stop;
    start = MPI_Wtime();

    if (rank == 0) {
        MPI_Send(data, N, MPI_DOUBLE, 1, 11, MPI_COMM_WORLD);
        MPI_Recv(data, N, MPI_DOUBLE, 1, 12, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    } else if (rank == 1) {
        MPI_Recv(data, N, MPI_DOUBLE, 0, 11, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        // Increment by one on the CPU
        for (int i = 0; i < N; ++i)
            data[i] += 1.0;
        MPI_Send(data, N, MPI_DOUBLE, 0, 12, MPI_COMM_WORLD);
    }

    stop = MPI_Wtime();
    timer = stop - start;
}
/* Ping-pong test for indirect GPU-to-GPU communication via the host */
void GPUtoGPUviaHost(int rank, double *hA, double *dA, int N, double &timer)
{
    double start, stop;
    start = MPI_Wtime();

    // TODO: Implement a GPU-to-GPU ping-pong that communicates via the host,
    // but uses the GPU to increment the vector elements. Copy data from
    // device to host (and back) and use normal MPI communication on the
    // host. Use the HIP kernel add_kernel() to increment values before
    // sending them back to rank 0.
    if (rank == 0) {
        // TODO: Copy vector to host and send it to rank 1
        // TODO: Receive vector from rank 1 and copy it to the device
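        // One possible sketch (not the only valid solution): stage the data
        // through the pinned host buffer hA with blocking hipMemcpy calls and
        // use plain MPI on the host pointer. Tags 21/22 are arbitrary choices.
        hipMemcpy(hA, dA, sizeof(double) * N, hipMemcpyDeviceToHost);
        MPI_Send(hA, N, MPI_DOUBLE, 1, 21, MPI_COMM_WORLD);
        MPI_Recv(hA, N, MPI_DOUBLE, 1, 22, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        hipMemcpy(dA, hA, sizeof(double) * N, hipMemcpyHostToDevice);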
    } else if (rank == 1) {
        // TODO: Receive vector from rank 0 and copy it to the device
        // TODO: Launch kernel to increment values on the GPU
        // TODO: Copy vector to host and send it to rank 0
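        // One possible sketch (same assumptions as the rank 0 branch above;
        // the block size 256 is an arbitrary choice): receive into hA, move
        // the data to the device, increment it with add_kernel(), and copy
        // it back before returning it to rank 0.
        MPI_Recv(hA, N, MPI_DOUBLE, 0, 21, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        hipMemcpy(dA, hA, sizeof(double) * N, hipMemcpyHostToDevice);
        int blocksize = 256;
        int gridsize = (N + blocksize - 1) / blocksize;
        add_kernel<<<gridsize, blocksize>>>(dA, N);
        // The blocking hipMemcpy below runs in the same (default) stream,
        // so it starts only after the kernel has finished.
        hipMemcpy(hA, dA, sizeof(double) * N, hipMemcpyDeviceToHost);
        MPI_Send(hA, N, MPI_DOUBLE, 0, 22, MPI_COMM_WORLD);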
    }

    stop = MPI_Wtime();
    timer = stop - start;
}
/* Ping-pong test for direct GPU-to-GPU communication using HIP-aware MPI */
void GPUtoGPUdirect(int rank, double *dA, int N, double &timer)
{
    double start, stop;
    start = MPI_Wtime();

    // TODO: Implement a GPU-to-GPU ping-pong that communicates directly
    // from GPU memory using HIP-aware MPI.
    if (rank == 0) {
        // TODO: Send vector to rank 1
        // TODO: Receive vector from rank 1
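        // One possible sketch (assumes the MPI library is HIP-aware, so the
        // device pointer dA can be passed to MPI directly). Tags 31/32 are
        // arbitrary choices.
        MPI_Send(dA, N, MPI_DOUBLE, 1, 31, MPI_COMM_WORLD);
        MPI_Recv(dA, N, MPI_DOUBLE, 1, 32, MPI_COMM_WORLD, MPI_STATUS_IGNORE);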
    } else if (rank == 1) {
        // TODO: Receive vector from rank 0
        // TODO: Launch kernel to increment values on the GPU
        // TODO: Send vector to rank 0
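        // One possible sketch (same HIP-aware MPI assumption as above; the
        // block size 256 is an arbitrary choice).
        MPI_Recv(dA, N, MPI_DOUBLE, 0, 31, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
        int blocksize = 256;
        int gridsize = (N + blocksize - 1) / blocksize;
        add_kernel<<<gridsize, blocksize>>>(dA, N);
        // MPI is not stream-aware, so make sure the kernel has finished
        // before the device buffer is handed to MPI_Send.
        hipDeviceSynchronize();
        MPI_Send(dA, N, MPI_DOUBLE, 0, 32, MPI_COMM_WORLD);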
    }

    stop = MPI_Wtime();
    timer = stop - start;
}
int main(int argc, char *argv[])
{
    int rank, nprocs, noderank, nodenprocs, devcount;
    int N = 256 * 1024 * 1024;
    double GPUtime, CPUtime;
    double *dA, *hA;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    getNodeInfo(&noderank, &nodenprocs, &devcount);

    // Check that we have enough MPI tasks and GPUs
    if (nprocs < 2) {
        printf("Not enough MPI tasks! Need at least 2.\n");
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    } else if (devcount == 0) {
        printf("Could not find any GPU devices.\n");
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    } else {
        printf("MPI rank %d: Found %d GPU devices, using GPU %d\n",
               rank, devcount, noderank % devcount);
    }
    // Select the device according to the node rank
    hipSetDevice(noderank % devcount);

    // Allocate enough pinned host and device memory for hA and dA
    // to store N doubles
    hipHostMalloc((void **) &hA, sizeof(double) * N);
    hipMalloc((void **) &dA, sizeof(double) * N);

    // Dummy transfer to remove the overhead of the first communication
    CPUtoCPU(rank, hA, N, CPUtime);

    // Initialize the vectors
    for (int i = 0; i < N; ++i)
        hA[i] = 1.0;
    hipMemcpy(dA, hA, sizeof(double) * N, hipMemcpyHostToDevice);

    // CPU-to-CPU test
    CPUtoCPU(rank, hA, N, CPUtime);
    if (rank == 0) {
        double errorsum = 0;
        for (int i = 0; i < N; ++i)
            errorsum += hA[i] - 2.0;
        printf("CPU-CPU: time %e, errorsum %f\n", CPUtime, errorsum);
    }

    // Dummy transfer to remove the overhead of the first communication
    GPUtoGPUdirect(rank, dA, N, GPUtime);

    // Re-initialize the vectors
    for (int i = 0; i < N; ++i)
        hA[i] = 1.0;
    hipMemcpy(dA, hA, sizeof(double) * N, hipMemcpyHostToDevice);

    // GPU-to-GPU test, direct communication with HIP-aware MPI
    GPUtoGPUdirect(rank, dA, N, GPUtime);
    hipMemcpy(hA, dA, sizeof(double) * N, hipMemcpyDeviceToHost);
    if (rank == 0) {
        double errorsum = 0;
        for (int i = 0; i < N; ++i)
            errorsum += hA[i] - 2.0;
        printf("GPU-GPU direct: time %e, errorsum %f\n", GPUtime, errorsum);
    }

    // Dummy transfer to remove the overhead of the first communication
    GPUtoGPUviaHost(rank, hA, dA, N, GPUtime);

    // Re-initialize the vectors
    for (int i = 0; i < N; ++i)
        hA[i] = 1.0;
    hipMemcpy(dA, hA, sizeof(double) * N, hipMemcpyHostToDevice);

    // GPU-to-GPU test, communication via host
    GPUtoGPUviaHost(rank, hA, dA, N, GPUtime);
    hipMemcpy(hA, dA, sizeof(double) * N, hipMemcpyDeviceToHost);
    if (rank == 0) {
        double errorsum = 0;
        for (int i = 0; i < N; ++i)
            errorsum += hA[i] - 2.0;
        printf("GPU-GPU via host: time %e, errorsum %f\n", GPUtime, errorsum);
    }

    // Deallocate memory
    hipHostFree(hA);
    hipFree(dA);

    MPI_Finalize();
    return 0;
}