-
Notifications
You must be signed in to change notification settings - Fork 0
/
sdl_allreduce.c
101 lines (79 loc) · 3.59 KB
/
sdl_allreduce.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>
#include <mpix_harmonize.h>
#define NUM_ITERATIONS 100
/*
 * Benchmark: NUM_ITERATIONS rounds of MPI_Barrier + MPI_Allreduce over an
 * integer vector whose length is given on the command line.  Per-rank totals
 * are reduced to rank 0, which reports per-iteration min / max / mean times
 * for both the synchronization and the allreduce phases.
 *
 * Usage: mpirun -np NP ./sdl_allreduce vectorSize
 * Exit:  0 on success, 1 on bad arguments or allocation failure.
 */
int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Parse and validate the vector size; atoi returns 0 on garbage, so a
       non-positive value covers both "not a number" and "bad size". */
    if (argc != 2) {
        if (rank == 0) printf("\nUsage: mpirun -np NP %s vectorSize \n", argv[0]);
        MPI_Finalize();
        exit(1);
    }
    int VECTOR_SIZE = atoi(argv[1]);
    if (VECTOR_SIZE <= 0) {
        if (rank == 0) printf("\nvectorSize must be a positive integer\n");
        MPI_Finalize();
        exit(1);
    }

    /* Each rank builds its own input vector; seeding with the rank keeps the
       run reproducible while giving every process different data. */
    int* local_vector = (int*) malloc(VECTOR_SIZE * sizeof(int));
    int* global_vector = (int*) malloc(VECTOR_SIZE * sizeof(int));
    if (local_vector == NULL || global_vector == NULL) {
        fprintf(stderr, "Rank %d: allocation of %d ints failed\n", rank, VECTOR_SIZE);
        free(local_vector);   /* free(NULL) is a no-op */
        free(global_vector);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    srand(rank);
    for (int i = 0; i < VECTOR_SIZE; ++i) {
        local_vector[i] = rand() % 10;
    }

    /* One barrier + one allreduce per iteration; accumulate per-rank totals
       so averages over iterations can be formed afterwards. */
    double total_sync_time = 0.0;
    double total_allreduce_time = 0.0;
    for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
        double start_sync = MPI_Wtime();
        MPI_Barrier(MPI_COMM_WORLD);
        total_sync_time += MPI_Wtime() - start_sync;

        double start_allreduce = MPI_Wtime();
        MPI_Allreduce(local_vector, global_vector, VECTOR_SIZE, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
        total_allreduce_time += MPI_Wtime() - start_allreduce;
    }

    /* Reduce the per-rank totals to rank 0: min, max, and sum (for the mean)
       of both phases.  Index 0 = sync time, index 1 = allreduce time.
       Rank 0's own numbers alone are not representative. */
    double totals[2] = { total_sync_time, total_allreduce_time };
    double mins[2], maxs[2], sums[2];
    MPI_Reduce(totals, mins, 2, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
    MPI_Reduce(totals, maxs, 2, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(totals, sums, 2, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        /* Divide every reduced total by NUM_ITERATIONS so all figures are
           per-iteration; the mean is additionally averaged over all ranks. */
        double iters = (double) NUM_ITERATIONS;
        printf("Number of processes = %d, vector size = %d \n", size, VECTOR_SIZE);
        printf("Average synchronization time across all iterations: %f seconds\n", sums[0] / size / iters);
        printf("Minimum synchronization time across all processes: %f seconds\n", mins[0] / iters);
        printf("Maximum synchronization time across all processes: %f seconds\n", maxs[0] / iters);
        printf("Average AllReduce time across all iterations: %f seconds\n", sums[1] / size / iters);
        printf("Minimum AllReduce time across all processes: %f seconds\n", mins[1] / iters);
        printf("Maximum AllReduce time across all processes: %f seconds\n", maxs[1] / iters);
    }

    free(local_vector);
    free(global_vector);
    MPI_Finalize();
    return 0;
}