Skip to content

Commit

Permalink
Add Scan, Read, and Write benchmarks
Browse files Browse the repository at this point in the history
  • Loading branch information
gonzalobg committed Jun 10, 2024
1 parent 62a5051 commit 0b7fdb3
Show file tree
Hide file tree
Showing 33 changed files with 808 additions and 122 deletions.
13 changes: 8 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,8 @@ endmacro()
# the final executable name
set(EXE_NAME babelstream)

# for chrono, make_unique, and some basic CXX features, models can overwrite this if required
set(CMAKE_CXX_STANDARD 14)
# for chrono, make_unique, exclusive_scan, and other basic features, models can override if required
set(CMAKE_CXX_STANDARD 17)

if (NOT CMAKE_BUILD_TYPE)
message("No CMAKE_BUILD_TYPE specified, defaulting to 'Release'")
Expand Down Expand Up @@ -177,10 +177,13 @@ else ()
message(STATUS "Selected model : ${MODEL}")
endif ()

if (MODEL STREQUAL "sycl2020")
if (MODEL STREQUAL "sycl2020-acc")
message(FATAL_ERROR "
Model sycl2020 has been renamed to sycl2020-acc, and a new sycl2020-usm model is now available.
Please use sycl2020-acc for SYCL2020 style accessors and sycl2020-usm for USM")
Model sycl2020-acc has been renamed to sycl2020 and may be enabled with the -DSYCL_ACCESS=ACCESSOR cmake option.")
endif ()
if (MODEL STREQUAL "sycl2020-usm")
message(FATAL_ERROR "
Model sycl2020-usm has been renamed to sycl2020 and may be enabled with the -DSYCL_ACCESS=USM cmake option.")
endif ()

# load the $MODEL.cmake file and setup the correct IMPL_* based on $MODEL
Expand Down
10 changes: 9 additions & 1 deletion src/Stream.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,15 @@
#include <array>
#include <vector>
#include <string>
#include <type_traits>
#include "benchmark.h"

using std::intptr_t;

// Element type used by the Scan benchmark buffers: an unsigned integer with
// the same width as T (4 bytes -> uint32_t, 8 bytes -> uint64_t), so the
// exclusive prefix sum is exact and bit-reproducible; ill-formed (void) for
// any other width.
template <typename T>
using scan_t = std::conditional_t<sizeof(T) == 4, std::uint32_t,
std::conditional_t<sizeof(T) == 8, std::uint64_t, void>>;

template <class T>
class Stream
{
Expand All @@ -29,9 +34,12 @@ class Stream
virtual void triad() = 0;
virtual void nstream() = 0;
virtual T dot() = 0;
virtual void read() = 0;
virtual void write(T initA) = 0;
virtual void scan() = 0;

// Set pointers to read from arrays
virtual void get_arrays(T const*& a, T const*& b, T const*& c) = 0;
virtual void get_arrays(T const*& a, T const*& b, T const*& c, scan_t<T> const*& s) = 0;
};

// Implementation specific device functions
Expand Down
62 changes: 61 additions & 1 deletion src/acc/ACCStream.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
// For full license terms please see the LICENSE file distributed with this
// source code

#include <numeric>
#include "ACCStream.h"

template <class T>
Expand All @@ -24,6 +25,11 @@ ACCStream<T>::ACCStream(BenchId bs, const intptr_t array_size, const int device_
T * restrict b = this->b;
T * restrict c = this->c;

if (needs_buffer(bs, 's')) {
s_i = new scan_t<T>[array_size];
s_o = new scan_t<T>[array_size];
}

#pragma acc enter data create(a[0:array_size], b[0:array_size], c[0:array_size])
{}

Expand All @@ -46,6 +52,11 @@ ACCStream<T>::~ACCStream()
delete[] a;
delete[] b;
delete[] c;

if (s_i) {
delete[] s_i;
delete[] s_o;
}
}

template <class T>
Expand All @@ -62,10 +73,17 @@ void ACCStream<T>::init_arrays(T initA, T initB, T initC)
b[i] = initB;
c[i] = initC;
}

if (s_i) {
for (intptr_t i = 0; i < array_size; i++)
{
s_i[i] = scan_t<T>(i);
}
}
}

template <class T>
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c, scan_t<T> const*& h_s)
{
T *a = this->a;
T *b = this->b;
Expand All @@ -76,6 +94,10 @@ void ACCStream<T>::get_arrays(T const*& h_a, T const*& h_b, T const*& h_c)
h_a = a;
h_b = b;
h_c = c;

if (s_o) {
h_s = s_o;
}
}

template <class T>
Expand Down Expand Up @@ -169,6 +191,44 @@ T ACCStream<T>::dot()
return sum;
}

// Read benchmark: stream through `a` performing loads only.
// The comparison against a sentinel value (3.14, which no kernel ever
// stores) creates a control dependency on the loaded value so the compiler
// cannot elide the load; the store branch is effectively dead.
template <class T>
void ACCStream<T>::read()
{
  intptr_t array_size = this->array_size;
  T * restrict a = this->a;
  #pragma acc parallel loop present(a[0:array_size]) wait
  for (intptr_t i = 0; i < array_size; i++)
  {
    T tmp = a[i];
    if (tmp == T(3.14)) {
      a[i] *= 2;  // fixed: stray empty statement (double semicolon) removed
    }
  }
}

// Write benchmark: overwrite every element of `a` with initA (stores only).
template <class T>
void ACCStream<T>::write(T initA)
{
  intptr_t n = this->array_size;
  T * restrict dst = this->a;
  #pragma acc parallel loop present(dst[0:n]) wait
  for (intptr_t i = 0; i < n; i++)
  {
    dst[i] = initA;
  }
}

// Scan benchmark: exclusive prefix sum of s_i into s_o on the host.
// OpenACC has no scan directive, so this model falls back to the sequential
// std::exclusive_scan (C++17, <numeric>).
// NOTE(review): std::runtime_error requires <stdexcept> — presumably pulled
// in transitively via ACCStream.h; confirm it is included.
template <class T>
void ACCStream<T>::scan()
{
  // s_i/s_o are only allocated when the scan benchmark was requested
  // (see needs_buffer(bs, 's') in the constructor).
  if (!s_i) {
    throw std::runtime_error("Trying to run scan but storage not allocated");
  }

  // OpenACC doesn't have scan; run sequentially
  std::exclusive_scan(s_i, s_i + array_size, s_o, scan_t<T>(0));
}

void listDevices(void)
{
// Get number of devices
Expand Down
7 changes: 6 additions & 1 deletion src/acc/ACCStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ class ACCStream : public Stream<T>
T* restrict a;
T* restrict b;
T* restrict c;
scan_t<T>* restrict s_i;
scan_t<T>* restrict s_o;

public:
ACCStream(BenchId bs, const intptr_t array_size, const int device_id,
Expand All @@ -37,7 +39,10 @@ class ACCStream : public Stream<T>
void triad() override;
void nstream() override;
T dot() override;
void read() override;
void write(T initA) override;
void scan() override;

void get_arrays(T const*& a, T const*& b, T const*& c) override;
void get_arrays(T const*& a, T const*& b, T const*& c, scan_t<T> const*& s) override;
void init_arrays(T initA, T initB, T initC);
};
20 changes: 13 additions & 7 deletions src/benchmark.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
// - Classic: 5 classic kernels: Copy, Mul, Add, Triad, Dot.
// - All: all kernels.
// - Individual kernels only.
enum class BenchId : int {Copy, Mul, Add, Triad, Nstream, Dot, Classic, All};
enum class BenchId : int {Write, Copy, Mul, Add, Triad, Dot, Nstream, Scan, Read, Classic, All};

struct Benchmark {
BenchId id;
Expand All @@ -28,14 +28,17 @@ struct Benchmark {
};

// Benchmarks in the order in which - if present - should be run for validation purposes:
constexpr size_t num_benchmarks = 6;
constexpr std::array<Benchmark, num_benchmarks> bench = {
constexpr size_t num_benchmarks = 9;
inline constexpr std::array<Benchmark, num_benchmarks> bench = {
Benchmark { .id = BenchId::Write, .label = "Write", .weight = 1, .classic = false },
Benchmark { .id = BenchId::Copy, .label = "Copy", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Mul, .label = "Mul", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Add, .label = "Add", .weight = 3, .classic = true },
Benchmark { .id = BenchId::Triad, .label = "Triad", .weight = 3, .classic = true },
Benchmark { .id = BenchId::Dot, .label = "Dot", .weight = 2, .classic = true },
Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false }
Benchmark { .id = BenchId::Nstream, .label = "Nstream", .weight = 4, .classic = false },
Benchmark { .id = BenchId::Scan, .label = "Scan", .weight = 2, .classic = false },
Benchmark { .id = BenchId::Read, .label = "Read", .weight = 1, .classic = false }
};

// Which buffers are needed by each benchmark
Expand All @@ -44,14 +47,17 @@ inline bool needs_buffer(BenchId id, char n) {
return std::find(values.begin(), values.end(), n) != values.end();
};
switch(id) {
case BenchId::All: return in({'a','b','c'});
case BenchId::Classic: return in({'a','b','c'});
case BenchId::All: return in({'a','b','c', 's'});
case BenchId::Classic: return in({'a','b','c'});
case BenchId::Copy: return in({'a','c'});
case BenchId::Mul: return in({'b','c'});
case BenchId::Add: return in({'a','b','c'});
case BenchId::Triad: return in({'a','b','c'});
case BenchId::Dot: return in({'a','b'});
case BenchId::Nstream: return in({'a','b','c'});
case BenchId::Nstream: return in({'a','b','c'});
case BenchId::Read: return in({'a'});
case BenchId::Write: return in({'a'});
case BenchId::Scan: return in({'s'});
default:
std::cerr << "Unknown benchmark" << std::endl;
abort();
Expand Down
61 changes: 58 additions & 3 deletions src/cuda/CUDAStream.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#include "CUDAStream.h"
#include <nvml.h>
#include <thrust/scan.h>
#include <thrust/execution_policy.h>

#if !defined(UNROLL_FACTOR)
#define UNROLL_FACTOR 4
Expand Down Expand Up @@ -128,7 +130,8 @@ CUDAStream<T>::CUDAStream(BenchId bs, const intptr_t array_size, const int devic
// Size of partial sums for dot kernels
size_t sums_bytes = sizeof(T) * dot_num_blocks;
size_t array_bytes = sizeof(T) * array_size;
size_t total_bytes = array_bytes * size_t(3) + sums_bytes;
size_t scan_bytes = needs_buffer(bs, 's')? size_t(2) * array_size * sizeof(scan_t<T>) : 0;
size_t total_bytes = array_bytes * size_t(3) + scan_bytes + sums_bytes;
std::cout << "Reduction kernel config: " << dot_num_blocks << " groups of (fixed) size " << TBSIZE_DOT << std::endl;

// Check buffers fit on the device
Expand All @@ -144,6 +147,11 @@ CUDAStream<T>::CUDAStream(BenchId bs, const intptr_t array_size, const int devic
d_c = alloc_device<T>(array_size);
sums = alloc_host<T>(dot_num_blocks);

if (needs_buffer(bs, 's')) {
d_si = alloc_device<scan_t<T>>(array_size);
d_so = alloc_device<scan_t<T>>(array_size);
}

// Initialize buffers:
init_arrays(initA, initB, initC);
}
Expand All @@ -156,6 +164,10 @@ CUDAStream<T>::~CUDAStream()
free_device(d_b);
free_device(d_c);
free_host(sums);
if (d_si) {
free_device(d_si);
free_device(d_so);
}
}

template <typename F>
Expand Down Expand Up @@ -203,22 +215,26 @@ void for_each(size_t array_size, F f) {
template <class T>
void CUDAStream<T>::init_arrays(T initA, T initB, T initC)
{
for_each(array_size, [=,a=d_a,b=d_b,c=d_c] __device__ (size_t i) {
for_each(array_size, [=,a=d_a,b=d_b,c=d_c,s=d_si] __device__ (size_t i) {
a[i] = initA;
b[i] = initB;
c[i] = initC;
if (s) {
s[i] = static_cast<scan_t<T>>(i);
}
});
}

template <class T>
void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c, scan_t<T> const*& s)
{
CU(cudaStreamSynchronize(stream));
#if defined(PAGEFAULT) || defined(MANAGED)
// Unified memory: return pointers to device memory
a = d_a;
b = d_b;
c = d_c;
s = d_so;
#else
// No Unified memory: copy data to the host
size_t nbytes = array_size * sizeof(T);
Expand All @@ -231,7 +247,14 @@ void CUDAStream<T>::get_arrays(T const*& a, T const*& b, T const*& c)
CU(cudaMemcpy(h_a.data(), d_a, nbytes, cudaMemcpyDeviceToHost));
CU(cudaMemcpy(h_b.data(), d_b, nbytes, cudaMemcpyDeviceToHost));
CU(cudaMemcpy(h_c.data(), d_c, nbytes, cudaMemcpyDeviceToHost));
if (d_so) {
size_t nbytes = array_size * sizeof(scan_t<T>);
h_s.resize(array_size);
s = h_s.data();
CU(cudaMemcpy(h_s.data(), d_so, nbytes, cudaMemcpyDeviceToHost));
}
#endif
CU(cudaStreamSynchronize(stream));
}

template <class T>
Expand Down Expand Up @@ -308,6 +331,38 @@ T CUDAStream<T>::dot()
return sum;
}

// Scan benchmark: device-side exclusive prefix sum of d_si into d_so,
// run with Thrust on this object's CUDA stream, then synchronized so the
// timed region covers the whole operation.
template <class T>
void CUDAStream<T>::scan()
{
  if (d_so == nullptr) {
    std::cerr << "Trying to run scan but storage not allocated" << std::endl;
    std::terminate();
  }

  auto policy = thrust::cuda::par.on(stream);
  scan_t<T>* first = d_si;
  scan_t<T>* last  = d_si + array_size;
  thrust::exclusive_scan(policy, first, last, d_so);

  CU(cudaPeekAtLastError());
  CU(cudaStreamSynchronize(stream));
}

// Read benchmark: loads every element of d_a; comparing against a sentinel
// no kernel ever stores keeps the load from being optimized away.
template <class T>
void CUDAStream<T>::read()
{
  T* a = d_a;
  for_each(array_size, [a] __device__ (size_t i) {
    // Control-dependency on loading a[i]: never true, but checking it requires loading value:
    if (a[i] == T(3.14)) {
      a[i] *= 2;
    }
  });
}

// Write benchmark: stores initA into every element of d_a (no loads).
template <class T>
void CUDAStream<T>::write(T initA)
{
  for_each(array_size, [dst = d_a, v = initA] __device__ (size_t i) {
    dst[i] = v;
  });
}

void listDevices(void)
{
// Get number of devices
Expand Down
9 changes: 8 additions & 1 deletion src/cuda/CUDAStream.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,10 @@ class CUDAStream : public Stream<T>

// If UVM is disabled, host arrays for verification purposes
std::vector<T> h_a, h_b, h_c;
std::vector<scan_t<T>> h_s;

// Allocate memory for scan
scan_t<T> *d_si, *d_so;

// Number of blocks for dot kernel
intptr_t dot_num_blocks;
Expand All @@ -50,7 +54,10 @@ class CUDAStream : public Stream<T>
void triad() override;
void nstream() override;
T dot() override;
void read() override;
void write(T initA) override;
void scan() override;

void get_arrays(T const*& a, T const*& b, T const*& c) override;
void get_arrays(T const*& a, T const*& b, T const*& c, scan_t<T> const*& s) override;
void init_arrays(T initA, T initB, T initC);
};
Loading

0 comments on commit 0b7fdb3

Please sign in to comment.