-
Notifications
You must be signed in to change notification settings - Fork 0
/
sdl_allreduce.c
101 lines (79 loc) · 3.59 KB
/
sdl_allreduce.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>
#include <time.h>
#include <mpix_harmonize.h>
#define NUM_ITERATIONS 100
/*
 * Benchmark: NUM_ITERATIONS rounds of MPI_Barrier + MPI_Allreduce over an
 * integer vector whose length is given on the command line.  Per-rank totals
 * are reduced to rank 0, which reports per-iteration min / max / mean times
 * for both the synchronization and the allreduce phases.
 *
 * Usage: mpirun -np NP ./sdl_allreduce vectorSize
 * Exit:  0 on success, 1 on bad arguments or allocation failure.
 */
int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Parse and validate the vector size; atoi returns 0 on garbage, so a
       non-positive value covers both "not a number" and "bad size". */
    if (argc != 2) {
        if (rank == 0) printf("\nUsage: mpirun -np NP %s vectorSize \n", argv[0]);
        MPI_Finalize();
        exit(1);
    }
    int VECTOR_SIZE = atoi(argv[1]);
    if (VECTOR_SIZE <= 0) {
        if (rank == 0) printf("\nvectorSize must be a positive integer\n");
        MPI_Finalize();
        exit(1);
    }

    /* Each rank builds its own input vector; seeding with the rank keeps the
       run reproducible while giving every process different data. */
    int* local_vector = (int*) malloc(VECTOR_SIZE * sizeof(int));
    int* global_vector = (int*) malloc(VECTOR_SIZE * sizeof(int));
    if (local_vector == NULL || global_vector == NULL) {
        fprintf(stderr, "Rank %d: allocation of %d ints failed\n", rank, VECTOR_SIZE);
        free(local_vector);   /* free(NULL) is a no-op */
        free(global_vector);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    srand(rank);
    for (int i = 0; i < VECTOR_SIZE; ++i) {
        local_vector[i] = rand() % 10;
    }

    /* One barrier + one allreduce per iteration; accumulate per-rank totals
       so averages over iterations can be formed afterwards. */
    double total_sync_time = 0.0;
    double total_allreduce_time = 0.0;
    for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
        double start_sync = MPI_Wtime();
        MPI_Barrier(MPI_COMM_WORLD);
        total_sync_time += MPI_Wtime() - start_sync;

        double start_allreduce = MPI_Wtime();
        MPI_Allreduce(local_vector, global_vector, VECTOR_SIZE, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
        total_allreduce_time += MPI_Wtime() - start_allreduce;
    }

    /* Reduce the per-rank totals to rank 0: min, max, and sum (for the mean)
       of both phases.  Index 0 = sync time, index 1 = allreduce time.
       Rank 0's own numbers alone are not representative. */
    double totals[2] = { total_sync_time, total_allreduce_time };
    double mins[2], maxs[2], sums[2];
    MPI_Reduce(totals, mins, 2, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
    MPI_Reduce(totals, maxs, 2, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    MPI_Reduce(totals, sums, 2, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        /* Divide every reduced total by NUM_ITERATIONS so all figures are
           per-iteration; the mean is additionally averaged over all ranks. */
        double iters = (double) NUM_ITERATIONS;
        printf("Number of processes = %d, vector size = %d \n", size, VECTOR_SIZE);
        printf("Average synchronization time across all iterations: %f seconds\n", sums[0] / size / iters);
        printf("Minimum synchronization time across all processes: %f seconds\n", mins[0] / iters);
        printf("Maximum synchronization time across all processes: %f seconds\n", maxs[0] / iters);
        printf("Average AllReduce time across all iterations: %f seconds\n", sums[1] / size / iters);
        printf("Minimum AllReduce time across all processes: %f seconds\n", mins[1] / iters);
        printf("Maximum AllReduce time across all processes: %f seconds\n", maxs[1] / iters);
    }

    free(local_vector);
    free(global_vector);
    MPI_Finalize();
    return 0;
}