PTsolvers · luraess · Dec 15, 2023
diff --git a/AMDGPU/atomic.cu b/AMDGPU/atomic.cu
@@ -0,0 +1,131 @@
+#include "hip/hip_runtime.h"
+#include <cstdlib>
+#include <hip/hip_runtime.h>
+#include <stdint.h>
+#include <time.h>
+
+#include <chrono>
+using namespace std::chrono;
+using nano_double = duration<double, std::nano>;
+
+#ifdef _WIN32
+#define EXPORT_API __declspec(dllexport)
+#else
+#define EXPORT_API
+#endif
+
+// #define DAT float
+#define DAT double
+// #define DAT uint32_t
+// #define DAT uint64_t
+
+#define source(i) _source[i]
+#define indices(i, j) _indices[j * n + i]
+#define target1(i) _target1[i]
+#define target2(i) _target2[i]
+
+__global__ void no_atomic_kernel(DAT *_target1, DAT *_target2, DAT *_source, int *_indices, const int n) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int i1 = indices(i, 0);
+  int i2 = indices(i, 1);
+  int i3 = indices(i, 2);
+  int i4 = indices(i, 3);
+  DAT v = source(i);
+  target1(i1) += v;
+  target1(i2) += v;
+  target1(i3) += v;
+  target1(i4) += v;
+  target2(i1) += v;
+  target2(i2) += v;
+  target2(i3) += v;
+  target2(i4) += v;
+}
+
+__global__ void atomic_kernel(DAT *_target1, DAT *_target2, DAT *_source, int *_indices, const int n) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;
+  int i1 = indices(i, 0);
+  int i2 = indices(i, 1);
+  int i3 = indices(i, 2);
+  int i4 = indices(i, 3);
+  DAT v = source(i);
+  atomicAdd(&target1(i1), v);
+  atomicAdd(&target1(i2), v);
+  atomicAdd(&target1(i3), v);
+  atomicAdd(&target1(i4), v);
+  atomicAdd(&target2(i1), v);
+  atomicAdd(&target2(i2), v);
+  atomicAdd(&target2(i3), v);
+  atomicAdd(&target2(i4), v);
+}
+
+extern "C" EXPORT_API void run_benchmark(double *times, const int nsamples) {
+  int i;
+  const int n = 1024;
+  const int bins = 64;
+  DAT *target1, *target2, *source;
+  int *indices;
+
+  srand((unsigned) time(NULL));
+
+  DAT *target1_h = (DAT *)malloc(bins * sizeof(DAT));
+  for (i = 0; i < bins; i++) {
+    target1_h[i] = (DAT)0.0;
+  }
+  DAT *target2_h = (DAT *)malloc(bins * sizeof(DAT));
+  for (i = 0; i < bins; i++) {
+    target2_h[i] = (DAT)0.0;
+  }
+  DAT *source_h = (DAT *)malloc(n * sizeof(DAT));
+  for (i = 0; i < n; i++) {
+    source_h[i] = static_cast<DAT>(rand()) / static_cast<DAT>(RAND_MAX);
+  }
+  int *indices_h = (int *)malloc(n * 4 * sizeof(int));
+  for (i = 0; i < (n * 4); i++) {
+    indices_h[i] = std::rand() % bins;
+  }
+
+  hipMalloc(&target1, bins * sizeof(DAT));
+  hipMalloc(&target2, bins * sizeof(DAT));
+  hipMalloc(&source, n * sizeof(DAT));
+  hipMalloc(&indices, n * 4 * sizeof(int));
+
+  hipMemcpy(target1, target1_h, bins * sizeof(DAT), hipMemcpyHostToDevice);
+  hipMemcpy(target2, target2_h, bins * sizeof(DAT), hipMemcpyHostToDevice);
+  hipMemcpy(source, source_h, n * sizeof(DAT), hipMemcpyHostToDevice);
+  hipMemcpy(indices, indices_h, n * 4 * sizeof(int), hipMemcpyHostToDevice);
+
+  hipStream_t stream;
+  hipStreamCreate(&stream);
+
+  dim3 nthreads(256);
+  dim3 nblocks((n + nthreads.x - 1) / nthreads.x);
+
+  // for (int isample = 0; isample < nsamples; ++isample) {
+  //   auto timer = high_resolution_clock::now();
+  //   hipLaunchKernelGGL(no_atomic_kernel, nblocks, nthreads, 0, stream, target1, target2, source, indices, n);
+  //   hipStreamSynchronize(stream);
+  //   auto elapsed = high_resolution_clock::now() - timer;
+  //   auto time_total = duration_cast<nano_double>(elapsed).count();
+  //   times[isample] = time_total;
+  // }
+
+  for (int isample = 0; isample < nsamples; ++isample) {
+    auto timer = high_resolution_clock::now();
+    hipLaunchKernelGGL(atomic_kernel, nblocks, nthreads, 0, stream, target1, target2, source, indices, n);
+    hipStreamSynchronize(stream);
+    auto elapsed = high_resolution_clock::now() - timer;
+    auto time_total = duration_cast<nano_double>(elapsed).count();
+    times[isample] = time_total;
+  }
+
+  free(target1_h);
+  free(target2_h);
+  free(source_h);
+  free(indices_h);
+  hipFree(target1);
+  hipFree(target2);
+  hipFree(source);
+  hipFree(indices);
+
+  hipStreamDestroy(stream);
+}
diff --git a/AMDGPU/atomic.jl b/AMDGPU/atomic.jl
@@ -0,0 +1,100 @@
+using AMDGPU
+using KernelAbstractions
+using BenchmarkTools
+using Libdl
+
+function make_c_trial(nsamples)
+    c_times = zeros(Float64, nsamples)
+    c_gctimes = zeros(Float64, nsamples)
+    c_memory = 0::Int64
+    c_allocs = 0::Int64
+    c_params = BenchmarkTools.DEFAULT_PARAMETERS
+    c_params.samples = nsamples
+    return BenchmarkTools.Trial(c_params, c_times, c_gctimes, c_memory, c_allocs)
+end
+
+INPUTS = Dict()
+
+INPUTS["atomic"] = (
+    c_samples=2000,
+)
+
+function amd_atomic_add!(target1, target2, source, indices)
+    i = workitemIdx().x + (workgroupIdx().x - 0x1) * workgroupDim().x
+    i1, i2, i3, i4 = indices[i, 1], indices[i, 2], indices[i, 3], indices[i, 4]
+    v = source[i]
+    AMDGPU.@atomic target1[i1] += v
+    AMDGPU.@atomic target1[i2] += v
+    AMDGPU.@atomic target1[i3] += v
+    AMDGPU.@atomic target1[i4] += v
+    AMDGPU.@atomic target2[i1] += v
+    AMDGPU.@atomic target2[i2] += v
+    AMDGPU.@atomic target2[i3] += v
+    AMDGPU.@atomic target2[i4] += v
+    return
+end
+
+@kernel function ka_atomic_add!(target1, target2, source, indices)
+    i = @index(Global, Linear)
+    i1, i2, i3, i4 = indices[i, 1], indices[i, 2], indices[i, 3], indices[i, 4]
+    v = source[i]
+    KernelAbstractions.@atomic target1[i1] += v
+    KernelAbstractions.@atomic target1[i2] += v
+    KernelAbstractions.@atomic target1[i3] += v
+    KernelAbstractions.@atomic target1[i4] += v
+    KernelAbstractions.@atomic target2[i1] += v
+    KernelAbstractions.@atomic target2[i2] += v
+    KernelAbstractions.@atomic target2[i3] += v
+    KernelAbstractions.@atomic target2[i4] += v
+end
+
+function run_julia_benchmarks(::Type{DAT}) where DAT
+    n, bins = 1024, 64
+    target1 = ROCArray(zeros(DAT, bins))
+    target2 = ROCArray(zeros(DAT, bins))
+    source  = ROCArray(rand(DAT, n))
+    indices = ROCArray(rand(1:bins, n, 4))
+
+    nthreads = 256
+    nblocks = cld.(n, nthreads)
+
+    bm = @benchmark begin
+        @roc groupsize=$nthreads gridsize=$nblocks amd_atomic_add!($target1, $target2, $source, $indices)
+        AMDGPU.synchronize()
+    end
+
+    bm_ka = @benchmark begin
+        ka_atomic_add!(ROCBackend(), 256, $n)($target1, $target2, $source, $indices)
+        KernelAbstractions.synchronize(ROCBackend())
+    end
+
+    AMDGPU.unsafe_free!(source)
+    AMDGPU.unsafe_free!(indices)
+    AMDGPU.unsafe_free!(target1)
+    AMDGPU.unsafe_free!(target2)
+
+    return (bm, bm_ka)
+end
+
+function run_c_benchmarks(lib, nsamples)
+    trial = make_c_trial(nsamples)
+
+    sym = Libdl.dlsym(lib, :run_benchmark)
+    @ccall $sym(trial.times::Ptr{Cdouble}, nsamples::Cint)::Cvoid
+
+    return trial
+end
+
+# Compile C benchmark
+libext = Sys.iswindows() ? "dll" : "so"
+libname = "atomic." * libext
+run(`hipcc -xhip -munsafe-fp-atomics -O3 -o $libname --shared -fPIC atomic.cu`)
+
+Libdl.dlopen("./$libname") do lib
+    group_n = BenchmarkGroup()
+    jb = run_julia_benchmarks(Float64)
+    group_n["julia"] = jb[1]
+    group_n["julia-ka"] = jb[2]
+    group_n["reference"] = run_c_benchmarks(lib, INPUTS["atomic"].c_samples)
+    display(group_n)
+end