Add MPI to driver code #34

Closed · wants to merge 8 commits

Changes from all commits
15 changes: 12 additions & 3 deletions CUDA.make
@@ -1,10 +1,19 @@
CXXFLAGS=-O3
CUDA_CXX=nvcc
CUDA_LIBS=-lcudart

cuda-stream: main.cpp CUDAStream.cu
	$(CUDA_CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) -o $@
ifeq ($(MPI), yes)
CXX=mpicxx
EXTRA_FLAGS+=-DUSE_MPI
endif

cuda-stream: main.cpp CUDAStream.o
	$(CXX) -std=c++11 $(CXXFLAGS) -DCUDA $^ $(EXTRA_FLAGS) $(CUDA_LIBS) -o $@

CUDAStream.o: CUDAStream.cu
	$(CUDA_CXX) -std=c++11 $(CXXFLAGS) $< $(CUDA_EXTRA_FLAGS) -c

.PHONY: clean
clean:
	rm -f cuda-stream
	rm -f cuda-stream CUDAStream.o

123 changes: 114 additions & 9 deletions main.cpp
@@ -51,12 +51,34 @@ void run();

void parseArguments(int argc, char *argv[]);

#ifdef USE_MPI
#include <mpi.h>
// MPI parameters
int rank, procs;
#endif

int main(int argc, char *argv[])
{
std::cout
#ifdef USE_MPI
int provided;
MPI_Init_thread(&argc, &argv, MPI_THREAD_FUNNELED, &provided);
if (provided < MPI_THREAD_FUNNELED) MPI_Abort(MPI_COMM_WORLD, provided);

MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &procs);
#endif

#ifdef USE_MPI
if (rank == 0) {
#endif
std::cout
<< "BabelStream" << std::endl
<< "Version: " << VERSION_STRING << std::endl
<< "Implementation: " << IMPLEMENTATION_STRING << std::endl;
#ifdef USE_MPI
std::cout << "Number of MPI ranks: " << procs << std::endl;
}
#endif

parseArguments(argc, argv);

@@ -68,29 +90,55 @@ int main(int argc, char *argv[])
#endif
run<double>();

// End MPI
#ifdef USE_MPI
MPI_Finalize();
#endif
}

template <typename T>
void run()
{
#ifdef USE_MPI
if (rank == 0)
#endif

{
std::cout << "Running kernels " << num_times << " times" << std::endl;

if (sizeof(T) == sizeof(float))
std::cout << "Precision: float" << std::endl;
else
std::cout << "Precision: double" << std::endl;
}

// Create host vectors
std::vector<T> a(ARRAY_SIZE);
std::vector<T> b(ARRAY_SIZE);
std::vector<T> c(ARRAY_SIZE);

#ifdef USE_MPI
if (rank == 0)
#endif
{
std::streamsize ss = std::cout.precision();
std::cout << std::setprecision(1) << std::fixed
<< "Array size: " << ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
<< " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
std::cout << "Total size: " << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
#ifdef USE_MPI
<< "Array size (per rank): "
#else
<< "Array size: "
#endif
<< ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
<< " (=" << ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl
#ifdef USE_MPI
<< "Total size (per rank): "
#else
<< "Total size: "
#endif
<< 3.0*ARRAY_SIZE*sizeof(T)*1.0E-6 << " MB"
<< " (=" << 3.0*ARRAY_SIZE*sizeof(T)*1.0E-9 << " GB)" << std::endl;
std::cout.precision(ss);
}

// Result of the Dot kernel
T sum;
@@ -139,46 +187,75 @@ void run()
// Declare timers
std::chrono::high_resolution_clock::time_point t1, t2;


// Main loop
for (unsigned int k = 0; k < num_times; k++)
{
#ifdef USE_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif

// Execute Copy
t1 = std::chrono::high_resolution_clock::now();
stream->copy();
#ifdef USE_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
t2 = std::chrono::high_resolution_clock::now();
timings[0].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

// Execute Mul
t1 = std::chrono::high_resolution_clock::now();
stream->mul();
#ifdef USE_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
t2 = std::chrono::high_resolution_clock::now();
timings[1].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

// Execute Add
t1 = std::chrono::high_resolution_clock::now();
stream->add();
#ifdef USE_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
t2 = std::chrono::high_resolution_clock::now();
timings[2].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

// Execute Triad
t1 = std::chrono::high_resolution_clock::now();
stream->triad();
#ifdef USE_MPI
MPI_Barrier(MPI_COMM_WORLD);
#endif
t2 = std::chrono::high_resolution_clock::now();
timings[3].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

// Execute Dot
t1 = std::chrono::high_resolution_clock::now();
sum = stream->dot();
#ifdef USE_MPI
MPI_Allreduce(MPI_IN_PLACE, &sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
#endif
t2 = std::chrono::high_resolution_clock::now();
timings[4].push_back(std::chrono::duration_cast<std::chrono::duration<double> >(t2 - t1).count());

}

// Check solutions
stream->read_arrays(a, b, c);

#ifdef USE_MPI
// Only check solutions on the master rank, in case verification fails
if (rank == 0)
#endif
check_solution<T>(num_times, a, b, c, sum);
@jrprice (Contributor) commented on Jun 27, 2017:
I think this needs to be in an `#ifdef USE_MPI` / `if (rank == 0)` section as well - otherwise you get massively spammed if verification fails.

On a related note, we really need to sort out verification for the reduction as per #20, since this almost always fails now when using a large number of MPI ranks.
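For illustration, a minimal self-contained sketch of the kind of rank-0 guard being suggested here (hypothetical stub names; this is not the actual change that landed in 5a64ce1):

```cpp
#include <mpi.h>
#include <cstdio>

// Stand-in for BabelStream's check_solution<T>(), which prints errors on failure.
static void check_solution_stub()
{
  std::printf("Validation failed on a[]\n");
}

int main(int argc, char *argv[])
{
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // Guard verification (and therefore its error output) with rank 0 so a
  // failure is reported once, not once per MPI rank.
  if (rank == 0)
    check_solution_stub();

  MPI_Finalize();
  return 0;
}
```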

@tomdeakin (Contributor, PR author) replied on Jun 27, 2017:
Verification printing fixed in 5a64ce1
#20 still needs to be addressed, although all this MPI code assumes T is double since I've used MPI_DOUBLE for messages. We should probably support MPI_FLOAT too.
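On the MPI_FLOAT point, one possible approach (a sketch only, not part of this PR) is a small compile-time dispatch from T to the matching MPI datatype:

```cpp
#include <mpi.h>
#include <type_traits>

// Map the element type T onto the matching MPI datatype so the dot-product
// reduction works for both float and double builds.
template <typename T>
MPI_Datatype mpi_type()
{
  static_assert(std::is_same<T, float>::value || std::is_same<T, double>::value,
                "only float and double are supported");
  return std::is_same<T, float>::value ? MPI_FLOAT : MPI_DOUBLE;
}

// Hypothetical use in the dot-product reduction:
//   MPI_Allreduce(MPI_IN_PLACE, &sum, 1, mpi_type<T>(), MPI_SUM, MPI_COMM_WORLD);
```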


// Display timing results
#ifdef USE_MPI
if (rank == 0)
#endif
{
std::cout
<< std::left << std::setw(12) << "Function"
<< std::left << std::setw(12) << "MBytes/sec"
@@ -187,6 +264,7 @@ void run()
<< std::left << std::setw(12) << "Average" << std::endl;

std::cout << std::fixed;
}
Contributor:

This is fine, but it might be interesting to see these results per rank as well as the aggregate bandwidth. That would make it easier to see how well things scale when running on lots of nodes.

Contributor (PR author):

One could always run the benchmark with -np 1 (or run the non-MPI benchmark). We could also run the benchmark on a single node first, and then print out the efficiency. This would probably need a bit of an overhaul in the way the driver works though. Not sure if it's worth the pain.

Contributor:

I just meant that it might be nice to print out the average per-rank bandwidths; I'm not talking about actually re-running on a single node.

e.g. If I run on a single P100 I get 550 GB/s. When I run on a cluster of 2000 P100s, I get some very large number, but do I still get 550 GB/s per rank?

To be honest I'm just being lazy - it's trivial for the user to open a calculator and do that division themselves. Feel free to ignore this comment :-)
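As a worked example of the division being discussed (hypothetical numbers, chosen to match the 550 GB/s P100 figure above):

```cpp
#include <cstdio>

int main()
{
  const int    procs          = 2000;                  // e.g. 2000 P100-class ranks
  const double bytes_per_rank = 2.0 * 33554432 * 8.0;  // Copy kernel: reads a[], writes c[]
  const double min_time       = 0.000976;              // fastest iteration on one rank, seconds

  // Aggregate bandwidth as the driver would report it (MBytes/sec)...
  const double aggregate = 1.0E-6 * bytes_per_rank * procs / min_time;
  // ...and the average per-rank figure is simply aggregate / procs.
  const double per_rank  = aggregate / procs;

  std::printf("Aggregate: %.0f MBytes/sec\n", aggregate);
  std::printf("Per rank : %.0f MBytes/sec (~550 GB/s)\n", per_rank);
  return 0;
}
```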


std::string labels[5] = {"Copy", "Mul", "Add", "Triad", "Dot"};
size_t sizes[5] = {
@@ -201,19 +279,42 @@ void run()
{
// Get min/max; ignore the first result
auto minmax = std::minmax_element(timings[i].begin()+1, timings[i].end());
double min = *minmax.first;
double max = *minmax.second;

// Calculate average; ignore the first result
double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0) / (double)(num_times - 1);
double average = std::accumulate(timings[i].begin()+1, timings[i].end(), 0.0);

#ifdef USE_MPI
Contributor:

Indentation inside this block looks a little off.

Contributor (PR author):
Fixed in a57fdec
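For reference, a self-contained sketch of the root-only MPI_IN_PLACE reduction pattern used in the block below (hypothetical timing values; only the root passes MPI_IN_PLACE, the other ranks pass their send buffer and a null receive buffer):

```cpp
#include <mpi.h>
#include <cstdio>

int main(int argc, char *argv[])
{
  MPI_Init(&argc, &argv);
  int rank, procs;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &procs);

  // Pretend per-rank timings to collate.
  double min_time = 0.001 * (rank + 1);
  double max_time = 0.002 * (rank + 1);

  if (rank == 0)
  {
    // Root receives the result in place of its own value.
    MPI_Reduce(MPI_IN_PLACE, &min_time, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
    MPI_Reduce(MPI_IN_PLACE, &max_time, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
    std::printf("min=%g max=%g across %d ranks\n", min_time, max_time, procs);
  }
  else
  {
    // Non-root ranks send their values; the receive buffer is unused.
    MPI_Reduce(&min_time, NULL, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
    MPI_Reduce(&max_time, NULL, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
  }

  MPI_Finalize();
  return 0;
}
```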

// Collate timings
if (rank == 0)
{
MPI_Reduce(MPI_IN_PLACE, &min, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
MPI_Reduce(MPI_IN_PLACE, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
}
else
{
MPI_Reduce(&min, NULL, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
MPI_Reduce(&max, NULL, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
}
sizes[i] *= procs;
#endif

average = average / (double)(num_times - 1);

// Display results
#ifdef USE_MPI
if (rank == 0)
#endif
{
std::cout
<< std::left << std::setw(12) << labels[i]
<< std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / (*minmax.first)
<< std::left << std::setw(12) << std::setprecision(5) << *minmax.first
<< std::left << std::setw(12) << std::setprecision(5) << *minmax.second
<< std::left << std::setw(12) << std::setprecision(3) << 1.0E-6 * sizes[i] / min
<< std::left << std::setw(12) << std::setprecision(5) << min
<< std::left << std::setw(12) << std::setprecision(5) << max
<< std::left << std::setw(12) << std::setprecision(5) << average
<< std::endl;

}
}

delete stream;
@@ -243,6 +344,10 @@ void check_solution(const unsigned int ntimes, std::vector<T>& a, std::vector<T>
// Do the reduction
goldSum = goldA * goldB * ARRAY_SIZE;

#ifdef USE_MPI
goldSum *= (T)procs;
#endif

// Calculate the average error
double errA = std::accumulate(a.begin(), a.end(), 0.0, [&](double sum, const T val){ return sum + fabs(val - goldA); });
errA /= a.size();