Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add atomic tests #1

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 131 additions & 0 deletions AMDGPU/atomic.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
#include "hip/hip_runtime.h"
#include <cstdlib>
#include <hip/hip_runtime.h>
#include <stdint.h>
#include <time.h>

#include <chrono>
using namespace std::chrono;
using nano_double = duration<double, std::nano>;

#ifdef _WIN32
#define EXPORT_API __declspec(dllexport)
#else
#define EXPORT_API
#endif

// #define DAT float
#define DAT double
// #define DAT uint32_t
// #define DAT uint64_t

#define source(i) _source[i]
#define indices(i, j) _indices[j * n + i]
#define target1(i) _target1[i]
#define target2(i) _target2[i]

__global__ void no_atomic_kernel(DAT *_target1, DAT *_target2, DAT *_source, int *_indices, const int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int i1 = indices(i, 0);
int i2 = indices(i, 1);
int i3 = indices(i, 2);
int i4 = indices(i, 3);
DAT v = source(i);
target1(i1) += v;
target1(i2) += v;
target1(i3) += v;
target1(i4) += v;
target2(i1) += v;
target2(i2) += v;
target2(i3) += v;
target2(i4) += v;
}

__global__ void atomic_kernel(DAT *_target1, DAT *_target2, DAT *_source, int *_indices, const int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int i1 = indices(i, 0);
int i2 = indices(i, 1);
int i3 = indices(i, 2);
int i4 = indices(i, 3);
DAT v = source(i);
atomicAdd(&target1(i1), v);
atomicAdd(&target1(i2), v);
atomicAdd(&target1(i3), v);
atomicAdd(&target1(i4), v);
atomicAdd(&target2(i1), v);
atomicAdd(&target2(i2), v);
atomicAdd(&target2(i3), v);
atomicAdd(&target2(i4), v);
}

extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples) {
int i;
const int n = 1024;
const int bins = 64;
DAT *target1, *target2, *source;
int *indices;

srand((unsigned) time(NULL));

DAT *target1_h = (DAT *)malloc(bins * sizeof(DAT));
for (i = 0; i < bins; i++) {
target1_h[i] = (DAT)0.0;
}
DAT *target2_h = (DAT *)malloc(bins * sizeof(DAT));
for (i = 0; i < bins; i++) {
target2_h[i] = (DAT)0.0;
}
DAT *source_h = (DAT *)malloc(n * sizeof(DAT));
for (i = 0; i < n; i++) {
source_h[i] = static_cast<DAT>(rand()) / static_cast<DAT>(RAND_MAX);
}
int *indices_h = (int *)malloc(n * 4 * sizeof(int));
for (i = 0; i < (n * 4); i++) {
indices_h[i] = std::rand() % bins;
}

hipMalloc(&target1, bins * sizeof(DAT));
hipMalloc(&target2, bins * sizeof(DAT));
hipMalloc(&source, n * sizeof(DAT));
hipMalloc(&indices, n * 4 * sizeof(int));

hipMemcpy(target1, target1_h, bins * sizeof(DAT), hipMemcpyHostToDevice);
hipMemcpy(target2, target2_h, bins * sizeof(DAT), hipMemcpyHostToDevice);
hipMemcpy(source, source_h, n * sizeof(DAT), hipMemcpyHostToDevice);
hipMemcpy(indices, indices_h, n * 4 * sizeof(int), hipMemcpyHostToDevice);

hipStream_t stream;
hipStreamCreate(&stream);

dim3 nthreads(256);
dim3 nblocks((n + nthreads.x - 1) / nthreads.x);

// for (int isample = 0; isample < nsamples; ++isample) {
// auto timer = high_resolution_clock::now();
// hipLaunchKernelGGL(no_atomic_kernel, nblocks, nthreads, 0, stream, target1, target2, source, indices, n);
// hipStreamSynchronize(stream);
// auto elapsed = high_resolution_clock::now() - timer;
// auto time_total = duration_cast<nano_double>(elapsed).count();
// times[isample] = time_total;
// }

for (int isample = 0; isample < nsamples; ++isample) {
auto timer = high_resolution_clock::now();
hipLaunchKernelGGL(atomic_kernel, nblocks, nthreads, 0, stream, target1, target2, source, indices, n);
hipStreamSynchronize(stream);
auto elapsed = high_resolution_clock::now() - timer;
auto time_total = duration_cast<nano_double>(elapsed).count();
times[isample] = time_total;
}

free(target1_h);
free(target2_h);
free(source_h);
free(indices_h);
hipFree(target1);
hipFree(target2);
hipFree(source);
hipFree(indices);

hipStreamDestroy(stream);
}
100 changes: 100 additions & 0 deletions AMDGPU/atomic.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
using AMDGPU
using KernelAbstractions
using BenchmarkTools
using Libdl

function make_c_trial(nsamples)
c_times = zeros(Float64, nsamples)
c_gctimes = zeros(Float64, nsamples)
c_memory = 0::Int64
c_allocs = 0::Int64
c_params = BenchmarkTools.DEFAULT_PARAMETERS
c_params.samples = nsamples
return BenchmarkTools.Trial(c_params, c_times, c_gctimes, c_memory, c_allocs)
end

INPUTS = Dict()

INPUTS["atomic"] = (
c_samples=2000,
)

function amd_atomic_add!(target1, target2, source, indices)
i = workitemIdx().x + (workgroupIdx().x - 0x1) * workgroupDim().x
i1, i2, i3, i4 = indices[i, 1], indices[i, 2], indices[i, 3], indices[i, 4]
v = source[i]
AMDGPU.@atomic target1[i1] += v
AMDGPU.@atomic target1[i2] += v
AMDGPU.@atomic target1[i3] += v
AMDGPU.@atomic target1[i4] += v
AMDGPU.@atomic target2[i1] += v
AMDGPU.@atomic target2[i2] += v
AMDGPU.@atomic target2[i3] += v
AMDGPU.@atomic target2[i4] += v
return
end

@kernel function ka_atomic_add!(target1, target2, source, indices)
i = @index(Global, Linear)
i1, i2, i3, i4 = indices[i, 1], indices[i, 2], indices[i, 3], indices[i, 4]
v = source[i]
KernelAbstractions.@atomic target1[i1] += v
KernelAbstractions.@atomic target1[i2] += v
KernelAbstractions.@atomic target1[i3] += v
KernelAbstractions.@atomic target1[i4] += v
KernelAbstractions.@atomic target2[i1] += v
KernelAbstractions.@atomic target2[i2] += v
KernelAbstractions.@atomic target2[i3] += v
KernelAbstractions.@atomic target2[i4] += v
end

function run_julia_benchmarks(::Type{DAT}) where DAT
n, bins = 1024, 64
target1 = ROCArray(zeros(DAT, bins))
target2 = ROCArray(zeros(DAT, bins))
source = ROCArray(rand(DAT, n))
indices = ROCArray(rand(1:bins, n, 4))

nthreads = 256
nblocks = cld.(n, nthreads)

bm = @benchmark begin
@roc groupsize=$nthreads gridsize=$nblocks amd_atomic_add!($target1, $target2, $source, $indices)
AMDGPU.synchronize()
end

bm_ka = @benchmark begin
ka_atomic_add!(ROCBackend(), 256, $n)($target1, $target2, $source, $indices)
KernelAbstractions.synchronize(ROCBackend())
end

AMDGPU.unsafe_free!(source)
AMDGPU.unsafe_free!(indices)
AMDGPU.unsafe_free!(target1)
AMDGPU.unsafe_free!(target2)

return (bm, bm_ka)
end

function run_c_benchmarks(lib, nsamples)
trial = make_c_trial(nsamples)

sym = Libdl.dlsym(lib, :run_benchmark)
@ccall $sym(trial.times::Ptr{Cdouble}, nsamples::Cint)::Cvoid

return trial
end

# Compile C benchmark
libext = Sys.iswindows() ? "dll" : "so"
libname = "atomic." * libext
run(`hipcc -xhip -munsafe-fp-atomics -O3 -o $libname --shared -fPIC atomic.cu`)

Libdl.dlopen("./$libname") do lib
group_n = BenchmarkGroup()
jb = run_julia_benchmarks(Float64)
group_n["julia"] = jb[1]
group_n["julia-ka"] = jb[2]
group_n["reference"] = run_c_benchmarks(lib, INPUTS["atomic"].c_samples)
display(group_n)
end