From ee89a6a9d923cd928a21121da645f1ce6b04c4c7 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Tue, 20 Jun 2017 15:47:02 +0200
Subject: [PATCH 1/8] Pull out min/max values into double

---
 main.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/main.cpp b/main.cpp
index 33cef1e0..bc71f295 100644
--- a/main.cpp
+++ b/main.cpp
@@ -201,16 +201,20 @@ void run()
   {
     // Get min/max; ignore the first result
     auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
+    double min = *minmax.first;
+    double max = *minmax.second;
 
     // Calculate average; ignore the first result
-    double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1);
+    double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0);
+
+    average = average / (double)(num_times - 1);
 
     // Display results
     std::cout
       << std::left << std::setw(12) << labels[i]
-      << std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / (*minmax.first)
-      << std::left << std::setw(12) << std::setprecision(5) << *minmax.first
-      << std::left << std::setw(12) << std::setprecision(5) << *minmax.second
+      << std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / min
+      << std::left << std::setw(12) << std::setprecision(5) << min
+      << std::left << std::setw(12) << std::setprecision(5) << max
       << std::left << std::setw(12) << std::setprecision(5) << average
       << std::endl;
 

From f74d7634d85b00408ae227890a678e200f6c7708 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Tue, 20 Jun 2017 15:47:05 +0200
Subject: [PATCH 2/8] Add MPI routines to run on multiple processes

All changes were made in the driver code, and no changes should
be required in the different implementations.

Changes involved:
1. Initilising MPI and getting rank and size
2. Guarding std::cout so only rank 0 prints
3. Adding Barriers between kernels
4. Dot produce performs Reduction.
---
 main.cpp | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 94 insertions(+), 2 deletions(-)

diff --git a/main.cpp b/main.cpp
index bc71f295..6a4ef3ac 100644
--- a/main.cpp
+++ b/main.cpp
@@ -51,12 +51,34 @@ void run();
 
 void parseArguments(int argc, char *argv[]);
 
+#ifdef USE_MPI
+  #include <mpi.h>
+  // MPI parameters
+  int rank, procs;
+#endif
+
 int main(int argc, char *argv[])
 {
-  std::cout
+#ifdef USE_MPI
+  int provided;
+  MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
+  if (provided < MPI_THREAD_FUNNELED) MPI_Abort(MPI_COMM_WORLD, provided);
+
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  MPI_Comm_size(MPI_COMM_WORLD, &procs);
+#endif
+
+#ifdef USE_MPI
+  if (rank == 0) {
+#endif
+    std::cout
     << "BabelStream" << std::endl
     << "Version: " << VERSION_STRING << std::endl
     << "Implementation: " << IMPLEMENTATION_STRING << std::endl;
+#ifdef USE_MPI
+    std::cout << "Number of MPI ranks: " << procs << std::endl;
+  }
+#endif
 
   parseArguments(argc, argv);
 
@@ -68,22 +90,37 @@ int main(int argc, char *argv[])
 #endif
     run<double>();
 
+  // End MPI
+#ifdef USE_MPI
+  MPI_Finalize();
+#endif
 }
 
 template <typename T>
 void run()
 {
+#ifdef USE_MPI
+  if (rank == 0)
+#endif
+
+  {
   std::cout << "Running kernels " << num_times << " times" << std::endl;
 
   if (sizeof(T) == sizeof(float))
     std::cout << "Precision: float" << std::endl;
   else
     std::cout << "Precision: double" << std::endl;
+  }
 
   // Create host vectors
   std::vector<T> a(ARRAY_SIZE);
   std::vector<T> b(ARRAY_SIZE);
   std::vector<T> c(ARRAY_SIZE);
+
+#ifdef USE_MPI
+if (rank == 0)
+#endif
+  {
   std::streamsize ss = std::cout.precision();
   std::cout << std::setprecision(1) << std::fixed
     << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
@@ -91,6 +128,7 @@ void run()
   std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
     << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
   std::cout.precision(ss);
+  }
 
   // Result of the Dot kernel
   T sum;
@@ -139,36 +177,59 @@ void run()
   // Declare timers
   std::chrono::high_resolution_clock::time_point t1, t2;
 
+
   // Main loop
   for (unsigned int k = 0; k < num_times; k++)
   {
+    #ifdef USE_MPI
+      MPI_Barrier(MPI_COMM_WORLD);
+    #endif
+
     // Execute Copy
     t1 = std::chrono::high_resolution_clock::now();
     stream->copy();
+    #ifdef USE_MPI
+      MPI_Barrier(MPI_COMM_WORLD);
+    #endif
     t2 = std::chrono::high_resolution_clock::now();
     timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
     // Execute Mul
     t1 = std::chrono::high_resolution_clock::now();
     stream->mul();
+    #ifdef USE_MPI
+      MPI_Barrier(MPI_COMM_WORLD);
+    #endif
     t2 = std::chrono::high_resolution_clock::now();
     timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
     // Execute Add
     t1 = std::chrono::high_resolution_clock::now();
     stream->add();
+    #ifdef USE_MPI
+      MPI_Barrier(MPI_COMM_WORLD);
+    #endif
     t2 = std::chrono::high_resolution_clock::now();
     timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
     // Execute Triad
     t1 = std::chrono::high_resolution_clock::now();
     stream->triad();
+    #ifdef USE_MPI
+      MPI_Barrier(MPI_COMM_WORLD);
+    #endif
     t2 = std::chrono::high_resolution_clock::now();
     timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
     // Execute Dot
     t1 = std::chrono::high_resolution_clock::now();
     sum = stream->dot();
+    #ifdef USE_MPI
+    if (rank == 0)
+      MPI_Reduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    else
+      MPI_Reduce(&sum, NULL, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+    #endif
     t2 = std::chrono::high_resolution_clock::now();
     timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
 
@@ -179,6 +240,10 @@ void run()
   check_solution<T>(num_times, a, b, c, sum);
 
   // Display timing results
+#ifdef USE_MPI
+  if (rank == 0)
+#endif
+  {
   std::cout
     << std::left << std::setw(12) << "Function"
     << std::left << std::setw(12) << "MBytes/sec"
@@ -187,6 +252,7 @@ void run()
     << std::left << std::setw(12) << "Average" << std::endl;
 
   std::cout << std::fixed;
+  }
 
   std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"};
   size_t sizes[5] = {
@@ -207,9 +273,28 @@ void run()
     // Calculate average; ignore the first result
     double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0);
 
+#ifdef USE_MPI
+  // Collate timings
+  if (rank == 0)
+  {
+    MPI_Reduce(MPI_IN_PLACE, &min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
+    MPI_Reduce(MPI_IN_PLACE, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+  }
+  else
+  {
+    MPI_Reduce(&min, NULL, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
+    MPI_Reduce(&max, NULL, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+  }
+  sizes[i] *= procs;
+#endif
+
     average = average / (double)(num_times - 1);
 
     // Display results
+#ifdef USE_MPI
+    if (rank == 0)
+#endif
+    {
     std::cout
       << std::left << std::setw(12) << labels[i]
       << std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / min
@@ -217,7 +302,7 @@ void run()
       << std::left << std::setw(12) << std::setprecision(5) << max
       << std::left << std::setw(12) << std::setprecision(5) << average
       << std::endl;
-
+    }
   }
 
   delete stream;
@@ -247,6 +332,13 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
   // Do the reduction
   goldSum = goldA * goldB * ARRAY_SIZE;
 
+#ifdef USE_MPI
+  if (rank == 0)
+  {
+    goldSum *= (T)procs;
+  }
+#endif
+
   // Calculate the average error
   double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); });
   errA /= a.size();

From afa1e234861e9981a5e14b9456080e4a9859d0b9 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Tue, 20 Jun 2017 15:52:12 +0200
Subject: [PATCH 3/8] Dot kernel should do Allreduce so value is the same
 everywhere

---
 main.cpp | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/main.cpp b/main.cpp
index 6a4ef3ac..a07dc1f1 100644
--- a/main.cpp
+++ b/main.cpp
@@ -225,10 +225,7 @@ if (rank == 0)
     t1 = std::chrono::high_resolution_clock::now();
     sum = stream->dot();
     #ifdef USE_MPI
-    if (rank == 0)
-      MPI_Reduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
-    else
-      MPI_Reduce(&sum, NULL, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
+      MPI_Allreduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
     #endif
     t2 = std::chrono::high_resolution_clock::now();
     timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());
@@ -333,10 +330,7 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
   goldSum = goldA * goldB * ARRAY_SIZE;
 
 #ifdef USE_MPI
-  if (rank == 0)
-  {
     goldSum *= (T)procs;
-  }
 #endif
 
   // Calculate the average error

From 5a64ce142f22d9d8f483740fdcc7ef395594b22a Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Tue, 27 Jun 2017 16:36:30 +0100
Subject: [PATCH 4/8] Only check verificaiton on master MPI rank

Prevents lots of output in case of failure
---
 main.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/main.cpp b/main.cpp
index a07dc1f1..068afc43 100644
--- a/main.cpp
+++ b/main.cpp
@@ -234,6 +234,11 @@ if (rank == 0)
 
   // Check solutions
   stream->read_arrays(a, b, c);
+
+#ifdef USE_MPI
+  // Only check solutions on the master rank in case verificaiton fails
+  if (rank == 0)
+#endif
   check_solution<T>(num_times, a, b, c, sum);
 
   // Display timing results

From a57fdecc65f4ad8b847684bd332641c4af7c5123 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Tue, 27 Jun 2017 16:46:42 +0100
Subject: [PATCH 5/8] Fix indenting

---
 main.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/main.cpp b/main.cpp
index 068afc43..f27866be 100644
--- a/main.cpp
+++ b/main.cpp
@@ -276,18 +276,18 @@ if (rank == 0)
     double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0);
 
 #ifdef USE_MPI
-  // Collate timings
-  if (rank == 0)
-  {
-    MPI_Reduce(MPI_IN_PLACE, &min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
-    MPI_Reduce(MPI_IN_PLACE, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-  }
-  else
-  {
-    MPI_Reduce(&min, NULL, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
-    MPI_Reduce(&max, NULL, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
-  }
-  sizes[i] *= procs;
+    // Collate timings
+    if (rank == 0)
+    {
+      MPI_Reduce(MPI_IN_PLACE, &min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
+      MPI_Reduce(MPI_IN_PLACE, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    }
+    else
+    {
+      MPI_Reduce(&min, NULL, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
+      MPI_Reduce(&max, NULL, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
+    }
+    sizes[i] *= procs;
 #endif
 
     average = average / (double)(num_times - 1);

From fc71f6d4b5d4f33d9af730ad12c36670c85d5c3f Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Tue, 27 Jun 2017 16:58:25 +0100
Subject: [PATCH 6/8] [CUDA] Seperate build of driver

---
 CUDA.make | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/CUDA.make b/CUDA.make
index 3edf0f51..e222c06e 100644
--- a/CUDA.make
+++ b/CUDA.make
@@ -1,10 +1,14 @@
 CXXFLAGS=-O3
 CUDA_CXX=nvcc
+CUDA_LIBS=-lcudart
 
-cuda-stream: main.cpp CUDAStream.cu
-	$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@
+cuda-stream: main.cpp CUDAStream.o
+	$(CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) $(CUDA_LIBS) -o $@
+
+CUDAStream.o: CUDAStream.cu
+	$(CUDA_CXX) -std=c++11 $(CXXFLAGS) $< $(CUDA_EXTRA_FLAGS) -c
 
 .PHONY: clean
 clean:
-	rm -f cuda-stream
+	rm -f cuda-stream CUDAStream.o
 

From ef22cbcc84c21790fdef61816131a46f0804f002 Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Tue, 27 Jun 2017 16:58:54 +0100
Subject: [PATCH 7/8] [CUDA] Add MPI=yes option

---
 CUDA.make | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/CUDA.make b/CUDA.make
index e222c06e..55889b6e 100644
--- a/CUDA.make
+++ b/CUDA.make
@@ -2,6 +2,11 @@ CXXFLAGS=-O3
 CUDA_CXX=nvcc
 CUDA_LIBS=-lcudart
 
+ifeq ($(MPI), yes)
+  CXX=mpicxx
+  EXTRA_FLAGS+=-DUSE_MPI
+endif
+
 cuda-stream: main.cpp CUDAStream.o
 	$(CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) $(CUDA_LIBS) -o $@
 

From cb92f94b17df16b59d1ff0be8f11fba99da391bb Mon Sep 17 00:00:00 2001
From: Tom Deakin <tom.deakin@bristol.ac.uk>
Date: Fri, 30 Jun 2017 16:58:24 +0100
Subject: [PATCH 8/8] Add 'per rank' to size print outs

---
 main.cpp | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/main.cpp b/main.cpp
index f27866be..ae23f884 100644
--- a/main.cpp
+++ b/main.cpp
@@ -123,9 +123,19 @@ if (rank == 0)
   {
   std::streamsize ss = std::cout.precision();
   std::cout << std::setprecision(1) << std::fixed
-    << "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
-    << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
-  std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
+#ifdef USE_MPI
+    << "Array size (per rank): "
+#else
+    << "Array size: "
+#endif
+    << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
+    << " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl
+#ifdef USE_MPI
+    << "Total size (per rank): "
+#else
+    << "Total size: "
+#endif
+    << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
     << " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
   std::cout.precision(ss);
   }