From ee89a6a9d923cd928a21121da645f1ce6b04c4c7 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 20 Jun 2017 15:47:02 +0200 Subject: [PATCH 1/8] Pull out min/max values into double --- main.cpp | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/main.cpp b/main.cpp index 33cef1e0..bc71f295 100644 --- a/main.cpp +++ b/main.cpp @@ -201,16 +201,20 @@ void run() { // Get min/max; ignore the first result auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end()); + double min = *minmax.first; + double max = *minmax.second; // Calculate average; ignore the first result - double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1); + double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0); + + average = average / (double)(num_times - 1); // Display results std::cout << std::left << std::setw(12) << labels[i] - << std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / (*minmax.first) - << std::left << std::setw(12) << std::setprecision(5) << *minmax.first - << std::left << std::setw(12) << std::setprecision(5) << *minmax.second + << std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / min + << std::left << std::setw(12) << std::setprecision(5) << min + << std::left << std::setw(12) << std::setprecision(5) << max << std::left << std::setw(12) << std::setprecision(5) << average << std::endl; From f74d7634d85b00408ae227890a678e200f6c7708 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 20 Jun 2017 15:47:05 +0200 Subject: [PATCH 2/8] Add MPI routines to run on multiple processes All changes were made in the driver code, and no changes should be required in the different implementations. Changes involved: 1. Initilising MPI and getting rank and size 2. Guarding std::cout so only rank 0 prints 3. Adding Barriers between kernels 4. Dot produce performs Reduction. --- main.cpp | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 94 insertions(+), 2 deletions(-) diff --git a/main.cpp b/main.cpp index bc71f295..6a4ef3ac 100644 --- a/main.cpp +++ b/main.cpp @@ -51,12 +51,34 @@ void run(); void parseArguments(int argc, char *argv[]); +#ifdef USE_MPI + #include + // MPI parameters + int rank, procs; +#endif + int main(int argc, char *argv[]) { - std::cout +#ifdef USE_MPI + int provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided); + if (provided < MPI_THREAD_FUNNELED) MPI_Abort(MPI_COMM_WORLD, provided); + + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &procs); +#endif + +#ifdef USE_MPI + if (rank == 0) { +#endif + std::cout << "BabelStream" << std::endl << "Version: " << VERSION_STRING << std::endl << "Implementation: " << IMPLEMENTATION_STRING << std::endl; +#ifdef USE_MPI + std::cout << "Number of MPI ranks: " << procs << std::endl; + } +#endif parseArguments(argc, argv); @@ -68,22 +90,37 @@ int main(int argc, char *argv[]) #endif run(); + // End MPI +#ifdef USE_MPI + MPI_Finalize(); +#endif } template void run() { +#ifdef USE_MPI + if (rank == 0) +#endif + + { std::cout << "Running kernels " << num_times << " times" << std::endl; if (sizeof(T) == sizeof(float)) std::cout << "Precision: float" << std::endl; else std::cout << "Precision: double" << std::endl; + } // Create host vectors std::vector a(ARRAY_SIZE); std::vector b(ARRAY_SIZE); std::vector c(ARRAY_SIZE); + +#ifdef USE_MPI +if (rank == 0) +#endif + { std::streamsize ss = std::cout.precision(); std::cout << std::setprecision(1) << std::fixed << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" @@ -91,6 +128,7 @@ void run() std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; std::cout.precision(ss); + } // Result of the Dot kernel T sum; @@ -139,36 +177,59 @@ void run() // Declare timers std::chrono::high_resolution_clock::time_point t1, t2; + // Main loop for (unsigned int k = 0; k < num_times; k++) { + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif + // Execute Copy t1 = std::chrono::high_resolution_clock::now(); stream->copy(); + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif t2 = std::chrono::high_resolution_clock::now(); timings[0].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Mul t1 = std::chrono::high_resolution_clock::now(); stream->mul(); + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif t2 = std::chrono::high_resolution_clock::now(); timings[1].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Add t1 = std::chrono::high_resolution_clock::now(); stream->add(); + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif t2 = std::chrono::high_resolution_clock::now(); timings[2].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Triad t1 = std::chrono::high_resolution_clock::now(); stream->triad(); + #ifdef USE_MPI + MPI_Barrier(MPI_COMM_WORLD); + #endif t2 = std::chrono::high_resolution_clock::now(); timings[3].push_back(std::chrono::duration_cast >(t2 - t1).count()); // Execute Dot t1 = std::chrono::high_resolution_clock::now(); sum = stream->dot(); + #ifdef USE_MPI + if (rank == 0) + MPI_Reduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + else + MPI_Reduce(&sum, NULL, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + #endif t2 = std::chrono::high_resolution_clock::now(); timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); @@ -179,6 +240,10 @@ void run() check_solution(num_times, a, b, c, sum); // Display timing results +#ifdef USE_MPI + if (rank == 0) +#endif + { std::cout << std::left << std::setw(12) << "Function" << std::left << std::setw(12) << "MBytes/sec" @@ -187,6 +252,7 @@ void run() << std::left << std::setw(12) << "Average" << std::endl; std::cout << std::fixed; + } std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"}; size_t sizes[5] = { @@ -207,9 +273,28 @@ void run() // Calculate average; ignore the first result double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0); +#ifdef USE_MPI + // Collate timings + if (rank == 0) + { + MPI_Reduce(MPI_IN_PLACE, &min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + MPI_Reduce(MPI_IN_PLACE, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + } + else + { + MPI_Reduce(&min, NULL, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + MPI_Reduce(&max, NULL, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + } + sizes[i] *= procs; +#endif + average = average / (double)(num_times - 1); // Display results +#ifdef USE_MPI + if (rank == 0) +#endif + { std::cout << std::left << std::setw(12) << labels[i] << std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / min @@ -217,7 +302,7 @@ void run() << std::left << std::setw(12) << std::setprecision(5) << max << std::left << std::setw(12) << std::setprecision(5) << average << std::endl; - + } } delete stream; @@ -247,6 +332,13 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector // Do the reduction goldSum = goldA * goldB * ARRAY_SIZE; +#ifdef USE_MPI + if (rank == 0) + { + goldSum *= (T)procs; + } +#endif + // Calculate the average error double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); }); errA /= a.size(); From afa1e234861e9981a5e14b9456080e4a9859d0b9 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 20 Jun 2017 15:52:12 +0200 Subject: [PATCH 3/8] Dot kernel should do Allreduce so value is the same everywhere --- main.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/main.cpp b/main.cpp index 6a4ef3ac..a07dc1f1 100644 --- a/main.cpp +++ b/main.cpp @@ -225,10 +225,7 @@ if (rank == 0) t1 = std::chrono::high_resolution_clock::now(); sum = stream->dot(); #ifdef USE_MPI - if (rank == 0) - MPI_Reduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); - else - MPI_Reduce(&sum, NULL, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + MPI_Allreduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); #endif t2 = std::chrono::high_resolution_clock::now(); timings[4].push_back(std::chrono::duration_cast >(t2 - t1).count()); @@ -333,10 +330,7 @@ void check_solution(const unsigned int ntimes, std::vector& a, std::vector goldSum = goldA * goldB * ARRAY_SIZE; #ifdef USE_MPI - if (rank == 0) - { goldSum *= (T)procs; - } #endif // Calculate the average error From 5a64ce142f22d9d8f483740fdcc7ef395594b22a Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 27 Jun 2017 16:36:30 +0100 Subject: [PATCH 4/8] Only check verificaiton on master MPI rank Prevents lots of output in case of failure --- main.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/main.cpp b/main.cpp index a07dc1f1..068afc43 100644 --- a/main.cpp +++ b/main.cpp @@ -234,6 +234,11 @@ if (rank == 0) // Check solutions stream->read_arrays(a, b, c); + +#ifdef USE_MPI + // Only check solutions on the master rank in case verificaiton fails + if (rank == 0) +#endif check_solution(num_times, a, b, c, sum); // Display timing results From a57fdecc65f4ad8b847684bd332641c4af7c5123 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 27 Jun 2017 16:46:42 +0100 Subject: [PATCH 5/8] Fix indenting --- main.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/main.cpp b/main.cpp index 068afc43..f27866be 100644 --- a/main.cpp +++ b/main.cpp @@ -276,18 +276,18 @@ if (rank == 0) double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0); #ifdef USE_MPI - // Collate timings - if (rank == 0) - { - MPI_Reduce(MPI_IN_PLACE, &min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); - MPI_Reduce(MPI_IN_PLACE, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - } - else - { - MPI_Reduce(&min, NULL, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); - MPI_Reduce(&max, NULL, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); - } - sizes[i] *= procs; + // Collate timings + if (rank == 0) + { + MPI_Reduce(MPI_IN_PLACE, &min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + MPI_Reduce(MPI_IN_PLACE, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + } + else + { + MPI_Reduce(&min, NULL, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD); + MPI_Reduce(&max, NULL, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + } + sizes[i] *= procs; #endif average = average / (double)(num_times - 1); From fc71f6d4b5d4f33d9af730ad12c36670c85d5c3f Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 27 Jun 2017 16:58:25 +0100 Subject: [PATCH 6/8] [CUDA] Seperate build of driver --- CUDA.make | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/CUDA.make b/CUDA.make index 3edf0f51..e222c06e 100644 --- a/CUDA.make +++ b/CUDA.make @@ -1,10 +1,14 @@ CXXFLAGS=-O3 CUDA_CXX=nvcc +CUDA_LIBS=-lcudart -cuda-stream: main.cpp CUDAStream.cu - $(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@ +cuda-stream: main.cpp CUDAStream.o + $(CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) $(CUDA_LIBS) -o $@ + +CUDAStream.o: CUDAStream.cu + $(CUDA_CXX) -std=c++11 $(CXXFLAGS) $< $(CUDA_EXTRA_FLAGS) -c .PHONY: clean clean: - rm -f cuda-stream + rm -f cuda-stream CUDAStream.o From ef22cbcc84c21790fdef61816131a46f0804f002 Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Tue, 27 Jun 2017 16:58:54 +0100 Subject: [PATCH 7/8] [CUDA] Add MPI=yes option --- CUDA.make | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CUDA.make b/CUDA.make index e222c06e..55889b6e 100644 --- a/CUDA.make +++ b/CUDA.make @@ -2,6 +2,11 @@ CXXFLAGS=-O3 CUDA_CXX=nvcc CUDA_LIBS=-lcudart +ifeq ($(MPI), yes) + CXX=mpicxx + EXTRA_FLAGS+=-DUSE_MPI +endif + cuda-stream: main.cpp CUDAStream.o $(CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) $(CUDA_LIBS) -o $@ From cb92f94b17df16b59d1ff0be8f11fba99da391bb Mon Sep 17 00:00:00 2001 From: Tom Deakin Date: Fri, 30 Jun 2017 16:58:24 +0100 Subject: [PATCH 8/8] Add 'per rank' to size print outs --- main.cpp | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/main.cpp b/main.cpp index f27866be..ae23f884 100644 --- a/main.cpp +++ b/main.cpp @@ -123,9 +123,19 @@ if (rank == 0) { std::streamsize ss = std::cout.precision(); std::cout << std::setprecision(1) << std::fixed - << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" - << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; - std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" +#ifdef USE_MPI + << "Array size (per rank): " +#else + << "Array size: " +#endif + << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" + << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl +#ifdef USE_MPI + << "Total size (per rank): " +#else + << "Total size: " +#endif + << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB" << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl; std::cout.precision(ss); }