Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CUDA accelerated PSNR #1175

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions libvmaf/src/cuda/cuda_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,13 @@ namespace {
return atomicAdd(reinterpret_cast<uint64_cu *>(address),
static_cast<uint64_cu>(val));
}

typedef unsigned long long int uint64_cu;
__forceinline__ __device__ int64_t atomicAdd_uint64(uint64_t *address,
uint64_t val) {
return atomicAdd(reinterpret_cast<uint64_cu *>(address),
static_cast<uint64_cu>(val));
}
} // namespace
#endif

Expand Down
17 changes: 2 additions & 15 deletions libvmaf/src/feature/cuda/integer_adm_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -855,17 +855,9 @@ static void integer_compute_adm_cuda(VmafFeatureExtractor *fex, AdmStateCuda *s,
h = (h + 1) / 2;

// This event ensures the input buffer is consumed
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));

CHECK_CUDA(cuStreamWaitEvent(s->str, s->dis_event, CU_EVENT_WAIT_DEFAULT));
CHECK_CUDA(cuEventDestroy(s->dis_event));
CHECK_CUDA(cuEventCreate(&s->dis_event, CU_EVENT_DEFAULT));

CHECK_CUDA(cuStreamWaitEvent(s->str, s->ref_event, CU_EVENT_WAIT_DEFAULT));
CHECK_CUDA(cuEventDestroy(s->ref_event));
CHECK_CUDA(cuEventCreate(&s->ref_event, CU_EVENT_DEFAULT));

CHECK_CUDA(cuCtxPopCurrent(NULL));

// consumes buf->ref_dwt2 , buf->dis_dwt2
// produces buf->decouple_r , buf->decouple_a
adm_decouple_device(s, buf, w, h, buf_stride, &p, s->str);
Expand Down Expand Up @@ -1154,12 +1146,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex,
(void) dist_pic_90;

// this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores
CHECK_CUDA(cuStreamSynchronize(s->str));
// CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuCtxPopCurrent(NULL));
CHECK_CUDA(cuEventSynchronize(s->finished));

// current implementation is limited by the 16-bit data pipeline, thus
// cannot handle an angular frequency smaller than 1080p * 3H
Expand Down
7 changes: 1 addition & 6 deletions libvmaf/src/feature/cuda/integer_motion_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -259,12 +259,7 @@ static int extract_fex_cuda(VmafFeatureExtractor *fex, VmafPicture *ref_pic,
MotionStateCuda *s = fex->priv;

// this is done to ensure that the CPU does not overwrite the buffer params for 'write_scores
CHECK_CUDA(cuStreamSynchronize(s->str));
// CHECK_CUDA(cuEventSynchronize(s->finished));
CHECK_CUDA(cuCtxPushCurrent(fex->cu_state->ctx));
CHECK_CUDA(cuEventDestroy(s->finished));
CHECK_CUDA(cuEventCreate(&s->finished, CU_EVENT_DEFAULT));
CHECK_CUDA(cuCtxPopCurrent(NULL));
CHECK_CUDA(cuEventSynchronize(s->finished));

int err = 0;
(void) dist_pic;
Expand Down
138 changes: 138 additions & 0 deletions libvmaf/src/feature/cuda/integer_psnr/psnr.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/**
*
* Copyright 2016-2023 Netflix, Inc.
* Copyright 2022 NVIDIA Corporation.
*
* Licensed under the BSD+Patent License (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://opensource.org/licenses/BSDplusPatent
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

#include "cuda/integer_motion_cuda.h"
#include "cuda_helper.cuh"

#include "common.h"

template <typename T, typename LOAD_TYPE>
__device__ void sse_calculation(T *ref, T *dis, unsigned int w, unsigned int h,
unsigned int stride, uint64_t *sse) {
constexpr int val_per_thread = sizeof(LOAD_TYPE) / sizeof(T);
unsigned int idx_x = (threadIdx.x + blockDim.x * blockIdx.x) * val_per_thread;
unsigned int idx_y = threadIdx.y + blockDim.y * blockIdx.y;

if (idx_y < h && idx_x < w) {
int idx = idx_y * (stride / sizeof(T)) + idx_x;
uint64_t thread_sse = 0u;
union {
T value_ref[val_per_thread];
LOAD_TYPE load_value_dis;
};
union {
T value_dis[val_per_thread];
LOAD_TYPE load_value_ref;
};
load_value_ref = *reinterpret_cast<LOAD_TYPE *>(&ref[idx]);
load_value_dis = *reinterpret_cast<LOAD_TYPE *>(&dis[idx]);
for (unsigned int i = 0; i < val_per_thread; ++i) {
if ((idx_x + i) < w) {
const int e = value_ref[i] - value_dis[i];
thread_sse += e * e;
}
}

// Warp-reduce abs_dist
thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 16);
thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 8);
thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 4);
thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 2);
thread_sse += __shfl_down_sync(0xffffffff, thread_sse, 1);
// Let threads in lane zero add warp-reduced abs_dist atomically to global
// sad
const int lane =
(threadIdx.y * blockDim.x + threadIdx.x) % VMAF_CUDA_THREADS_PER_WARP;
if (lane == 0)
atomicAdd_uint64(sse, static_cast<uint64_t>(thread_sse));
}
}

template <int chn>
__device__ void psnr8_impl(const VmafPicture ref_pic,
const VmafPicture dist_pic,
const VmafCudaBuffer sse) {
unsigned int stride = ref_pic.stride[chn];
// if second channel is smaller use smaller load
if (stride <= (ref_pic.stride[0] / 2))
sse_calculation<uint8_t, uint64_t>(
reinterpret_cast<uint8_t *>(ref_pic.data[chn]),
reinterpret_cast<uint8_t *>(dist_pic.data[chn]), ref_pic.w[chn],
ref_pic.h[chn], stride, reinterpret_cast<uint64_t *>(sse.data) + chn);
else
sse_calculation<uint8_t, ushort4>(
reinterpret_cast<uint8_t *>(ref_pic.data[chn]),
reinterpret_cast<uint8_t *>(dist_pic.data[chn]), ref_pic.w[chn],
ref_pic.h[chn], stride, reinterpret_cast<uint64_t *>(sse.data) + chn);
}

template <int chn>
__device__ void psnr16_impl(const VmafPicture ref_pic,
const VmafPicture dist_pic,
const VmafCudaBuffer sse) {
unsigned int stride = ref_pic.stride[chn];
// if second channel is smaller use smaller load
if (stride <= (ref_pic.stride[0] / 2))
sse_calculation<uint16_t, ushort4>(
reinterpret_cast<uint16_t *>(ref_pic.data[chn]),
reinterpret_cast<uint16_t *>(dist_pic.data[chn]), ref_pic.w[chn],
ref_pic.h[chn], stride, reinterpret_cast<uint64_t *>(sse.data) + chn);
else
sse_calculation<uint16_t, uint4>(
reinterpret_cast<uint16_t *>(ref_pic.data[chn]),
reinterpret_cast<uint16_t *>(dist_pic.data[chn]), ref_pic.w[chn],
ref_pic.h[chn], stride, reinterpret_cast<uint64_t *>(sse.data) + chn);
}

extern "C" {

__global__ void psnr(const VmafPicture ref_pic, const VmafPicture dist_pic,
const VmafCudaBuffer sse) {
// this is needed to not produce local load/store ops when accessing with
// "dynamic" index although blockIdx.z is not really dynamic
switch (blockIdx.z) {
case 0:
psnr8_impl<0>(ref_pic, dist_pic, sse);
return;
case 1:
psnr8_impl<1>(ref_pic, dist_pic, sse);
return;
case 2:
psnr8_impl<2>(ref_pic, dist_pic, sse);
return;
}
}

__global__ void psnr_hbd(const VmafPicture ref_pic, const VmafPicture dist_pic,
const VmafCudaBuffer sse) {
// this is needed to not produce local load/store ops when accessing with
// "dynamic" index although blockIdx.z is not really dynamic
switch (blockIdx.z) {
case 0:
psnr16_impl<0>(ref_pic, dist_pic, sse);
return;
case 1:
psnr16_impl<1>(ref_pic, dist_pic, sse);
return;
case 2:
psnr16_impl<2>(ref_pic, dist_pic, sse);
return;
}
}
}
Loading