From 63f5f9306385054e53541e1a8cb249156dda012b Mon Sep 17 00:00:00 2001 From: Tomoaki Teshima Date: Tue, 28 Apr 2020 21:46:31 +0900 Subject: [PATCH] fix test failure on ODROID-N2 --- modules/objdetect/src/hog.cpp | 55 +------ modules/objdetect/src/opencl/objdetect_hog.cl | 145 ++++-------------- 2 files changed, 32 insertions(+), 168 deletions(-) diff --git a/modules/objdetect/src/hog.cpp b/modules/objdetect/src/hog.cpp index e98b9c2e23fc..43771a8bda0f 100644 --- a/modules/objdetect/src/hog.cpp +++ b/modules/objdetect/src/hog.cpp @@ -1216,15 +1216,6 @@ static bool ocl_compute_hists(int nbins, int block_stride_x, int block_stride_y, UMat grad, UMat qangle, UMat gauss_w_lut, UMat block_hists, size_t block_hist_size) { ocl::Kernel k("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc); - if(k.empty()) - return false; - bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU; - cv::String opts; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple()); - k.create("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); if(k.empty()) return false; @@ -1285,19 +1276,10 @@ static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_ size_t localThreads[3] = { 1, 1, 1 }; int idx = 0; - bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU; - cv::String opts; ocl::Kernel k; if ( nbins == 9 ) { k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); - if(k.empty()) - return false; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple()); - k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); if(k.empty()) return false; @@ -1309,14 +1291,7 @@ static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_ } else { - k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32"); - if(k.empty()) - return false; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple()); - k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); if(k.empty()) return false; @@ -1733,7 +1708,6 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y float free_coef, float threshold, UMat& labels, Size descr_size, int block_hist_size) { int nthreads; - bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU; cv::String opts; ocl::Kernel k; @@ -1742,14 +1716,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y { case 180: nthreads = 180; - k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32"); - if(k.empty()) - return false; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple()); - k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); if(k.empty()) return false; idx = k.set(idx, descr_size.width); @@ -1758,14 +1725,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y case 252: nthreads = 256; - k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32"); - if(k.empty()) - return false; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple()); - k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); if(k.empty()) return false; idx = k.set(idx, descr_size.width); @@ -1774,14 +1734,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y default: nthreads = 256; - k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32"); - if(k.empty()) - return false; - if(is_cpu) - opts = "-D CPU "; - else - opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple()); - k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts); + k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, ""); if(k.empty()) return false; idx = k.set(idx, descr_size.area()); diff --git a/modules/objdetect/src/opencl/objdetect_hog.cl b/modules/objdetect/src/opencl/objdetect_hog.cl index 4fae320d02e6..156c8eb6a02a 100644 --- a/modules/objdetect/src/opencl/objdetect_hog.cl +++ b/modules/objdetect/src/opencl/objdetect_hog.cl @@ -134,9 +134,7 @@ __kernel void compute_hists_lut_kernel( barrier(CLK_LOCAL_MEM_FENCE); if (cell_thread_x < 3) hist_[0] += hist_[3]; -#ifdef CPU barrier(CLK_LOCAL_MEM_FENCE); -#endif if (cell_thread_x == 0) final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] = hist_[0] + hist_[1] + hist_[2]; @@ -218,7 +216,6 @@ inline float reduce_smem(volatile __local float* smem, int size) barrier(CLK_LOCAL_MEM_FENCE); } if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; barrier(CLK_LOCAL_MEM_FENCE); } -#ifdef CPU if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32]; barrier(CLK_LOCAL_MEM_FENCE); } if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16]; @@ -231,21 +228,6 @@ inline float reduce_smem(volatile __local float* smem, int size) barrier(CLK_LOCAL_MEM_FENCE); } if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1]; barrier(CLK_LOCAL_MEM_FENCE); } -#else - if (tid < 32) - { - if (size >= 64) smem[tid] = sum = sum + smem[tid + 32]; -#if WAVE_SIZE < 32 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) { -#endif - if (size >= 32) smem[tid] = sum = sum + smem[tid + 16]; - if (size >= 16) smem[tid] = sum = sum + smem[tid + 8]; - if (size >= 8) smem[tid] = sum = sum + smem[tid + 4]; - if (size >= 4) smem[tid] = sum = sum + smem[tid + 2]; - if (size >= 2) smem[tid] = sum = sum + smem[tid + 1]; - } -#endif return sum; } @@ -284,6 +266,10 @@ __kernel void normalize_hists_kernel( hist[0] = elem * scale; } +#define reduce_with_sync(target, sharedMemory, localMemory, tid, offset) \ + if (tid < target) sharedMemory[tid] = localMemory = localMemory + sharedMemory[tid + offset]; \ + barrier(CLK_LOCAL_MEM_FENCE); + //--------------------------------------------------------------------- // Linear SVM based classification // 48x96 window, 9 bins and default parameters @@ -316,43 +302,16 @@ __kernel void classify_hists_180_kernel( barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 90) products[tid] = product = product + products[tid + 90]; - barrier(CLK_LOCAL_MEM_FENCE); - - if (tid < 45) products[tid] = product = product + products[tid + 45]; - barrier(CLK_LOCAL_MEM_FENCE); - - volatile __local float* smem = products; -#ifdef CPU - if (tid < 13) smem[tid] = product = product + smem[tid + 32]; - barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) smem[tid] = product = product + smem[tid + 16]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<8) smem[tid] = product = product + smem[tid + 8]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<4) smem[tid] = product = product + smem[tid + 4]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<2) smem[tid] = product = product + smem[tid + 2]; - barrier(CLK_LOCAL_MEM_FENCE); -#else - if (tid < 13) - { - smem[tid] = product = product + smem[tid + 32]; - } -#if WAVE_SIZE < 32 - barrier(CLK_LOCAL_MEM_FENCE); -#endif - if (tid < 16) - { - smem[tid] = product = product + smem[tid + 16]; - smem[tid] = product = product + smem[tid + 8]; - smem[tid] = product = product + smem[tid + 4]; - smem[tid] = product = product + smem[tid + 2]; - } -#endif + reduce_with_sync(90, products, product, tid, 90); + reduce_with_sync(45, products, product, tid, 45); + reduce_with_sync(13, products, product, tid, 32); // 13 is not typo + reduce_with_sync(16, products, product, tid, 16); + reduce_with_sync(8, products, product, tid, 8); + reduce_with_sync(4, products, product, tid, 4); + reduce_with_sync(2, products, product, tid, 2); if (tid == 0){ - product = product + smem[tid + 1]; + product = product + products[tid + 1]; labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); } } @@ -389,40 +348,16 @@ __kernel void classify_hists_252_kernel( barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 128) products[tid] = product = product + products[tid + 128]; - barrier(CLK_LOCAL_MEM_FENCE); + reduce_with_sync(128, products, product, tid, 128); + reduce_with_sync(64, products, product, tid, 64); + reduce_with_sync(32, products, product, tid, 32); + reduce_with_sync(16, products, product, tid, 16); + reduce_with_sync(8, products, product, tid, 8); + reduce_with_sync(4, products, product, tid, 4); + reduce_with_sync(2, products, product, tid, 2); - if (tid < 64) products[tid] = product = product + products[tid + 64]; - barrier(CLK_LOCAL_MEM_FENCE); - - volatile __local float* smem = products; -#ifdef CPU - if(tid<32) smem[tid] = product = product + smem[tid + 32]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<16) smem[tid] = product = product + smem[tid + 16]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<8) smem[tid] = product = product + smem[tid + 8]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<4) smem[tid] = product = product + smem[tid + 4]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<2) smem[tid] = product = product + smem[tid + 2]; - barrier(CLK_LOCAL_MEM_FENCE); -#else - if (tid < 32) - { - smem[tid] = product = product + smem[tid + 32]; -#if WAVE_SIZE < 32 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) { -#endif - smem[tid] = product = product + smem[tid + 16]; - smem[tid] = product = product + smem[tid + 8]; - smem[tid] = product = product + smem[tid + 4]; - smem[tid] = product = product + smem[tid + 2]; - } -#endif if (tid == 0){ - product = product + smem[tid + 1]; + product = product + products[tid + 1]; labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); } } @@ -459,40 +394,16 @@ __kernel void classify_hists_kernel( barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 128) products[tid] = product = product + products[tid + 128]; - barrier(CLK_LOCAL_MEM_FENCE); + reduce_with_sync(128, products, product, tid, 128); + reduce_with_sync(64, products, product, tid, 64); + reduce_with_sync(32, products, product, tid, 32); + reduce_with_sync(16, products, product, tid, 16); + reduce_with_sync(8, products, product, tid, 8); + reduce_with_sync(4, products, product, tid, 4); + reduce_with_sync(2, products, product, tid, 2); - if (tid < 64) products[tid] = product = product + products[tid + 64]; - barrier(CLK_LOCAL_MEM_FENCE); - - volatile __local float* smem = products; -#ifdef CPU - if(tid<32) smem[tid] = product = product + smem[tid + 32]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<16) smem[tid] = product = product + smem[tid + 16]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<8) smem[tid] = product = product + smem[tid + 8]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<4) smem[tid] = product = product + smem[tid + 4]; - barrier(CLK_LOCAL_MEM_FENCE); - if(tid<2) smem[tid] = product = product + smem[tid + 2]; - barrier(CLK_LOCAL_MEM_FENCE); -#else - if (tid < 32) - { - smem[tid] = product = product + smem[tid + 32]; -#if WAVE_SIZE < 32 - } barrier(CLK_LOCAL_MEM_FENCE); - if (tid < 16) { -#endif - smem[tid] = product = product + smem[tid + 16]; - smem[tid] = product = product + smem[tid + 8]; - smem[tid] = product = product + smem[tid + 4]; - smem[tid] = product = product + smem[tid + 2]; - } -#endif if (tid == 0){ - smem[tid] = product = product + smem[tid + 1]; + products[tid] = product = product + products[tid + 1]; labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold); } }