From c5dabb74a694ec12a8a0aaf0bffe69462f759abc Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Tue, 7 Jan 2025 15:15:01 -0800 Subject: [PATCH 1/4] Add OpenMP acceleration for normal quicksort --- src/avx512-16bit-qsort.hpp | 5 ++- src/avx512-64bit-common.h | 3 +- src/xss-common-includes.h | 5 +++ src/xss-common-keyvaluesort.hpp | 5 --- src/xss-common-qsort.h | 76 ++++++++++++++++++++++++++++++--- tests/test-qsort.cpp | 4 ++ 6 files changed, 85 insertions(+), 13 deletions(-) diff --git a/src/avx512-16bit-qsort.hpp b/src/avx512-16bit-qsort.hpp index 65a64fb..e5eb145 100644 --- a/src/avx512-16bit-qsort.hpp +++ b/src/avx512-16bit-qsort.hpp @@ -556,6 +556,7 @@ avx512_qsort_fp16(uint16_t *arr, { using vtype = zmm_vector; + // TODO multithreading support here if (arrsize > 1) { arrsize_t nan_count = 0; if (UNLIKELY(hasnan)) { @@ -564,11 +565,11 @@ avx512_qsort_fp16(uint16_t *arr, } if (descending) { qsort_, uint16_t>( - arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0); } else { qsort_, uint16_t>( - arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0); } replace_inf_with_nan(arr, arrsize, nan_count, descending); } diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index f27a31f..5d55196 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -968,7 +968,8 @@ struct zmm_vector { static_assert(sizeof(size_t) == sizeof(uint64_t), "Size of size_t and uint64_t are not the same"); template <> -struct zmm_vector : public zmm_vector {}; +struct zmm_vector : public zmm_vector { +}; #endif template <> diff --git a/src/xss-common-includes.h b/src/xss-common-includes.h index 386ca86..27d6c36 100644 --- a/src/xss-common-includes.h +++ b/src/xss-common-includes.h @@ -82,6 +82,11 @@ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, \ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 +#if defined(XSS_USE_OPENMP) && defined(_OPENMP) +#define XSS_COMPILE_OPENMP +#include +#endif + template constexpr bool always_false = false; diff --git a/src/xss-common-keyvaluesort.hpp b/src/xss-common-keyvaluesort.hpp index 43f64ec..0fb621d 100644 --- a/src/xss-common-keyvaluesort.hpp +++ b/src/xss-common-keyvaluesort.hpp @@ -11,11 +11,6 @@ #include "xss-common-qsort.h" #include "xss-network-keyvaluesort.hpp" -#if defined(XSS_USE_OPENMP) && defined(_OPENMP) -#define XSS_COMPILE_OPENMP -#include -#endif - /* * Sort all the NAN's to end of the array and return the index of the last elem * in the array which is not a nan diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index 709fadc..f91ea3f 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -521,8 +521,11 @@ template void sort_n(typename vtype::type_t *arr, int N); template -static void -qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters) +static void qsort_(type_t *arr, + arrsize_t left, + arrsize_t right, + arrsize_t max_iters, + arrsize_t task_threshold) { /* * Resort to std::sort if quicksort isnt making any progress @@ -559,10 +562,40 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters) type_t leftmostValue = comparator::leftmost(smallest, biggest); type_t rightmostValue = comparator::rightmost(smallest, biggest); +#ifdef XSS_COMPILE_OPENMP + if (pivot != leftmostValue) { + bool parallel_left = (pivot_index - left) > task_threshold; + if (parallel_left) { +#pragma omp task + qsort_( + arr, left, pivot_index - 1, max_iters - 1, task_threshold); + } + else { + qsort_( + arr, left, pivot_index - 1, max_iters - 1, task_threshold); + } + } + if (pivot != rightmostValue) { + bool parallel_right = (right - pivot_index) > task_threshold; + + if (parallel_right) { +#pragma omp task + qsort_( + arr, pivot_index, right, max_iters - 1, task_threshold); + } + else { + qsort_( + arr, pivot_index, right, max_iters - 1, task_threshold); + } + } +#else + UNUSED(task_threshold); + if (pivot != leftmostValue) - qsort_(arr, left, pivot_index - 1, max_iters - 1); + qsort_(arr, left, pivot_index - 1, max_iters - 1, 0); if (pivot != rightmostValue) - qsort_(arr, pivot_index, right, max_iters - 1); + qsort_(arr, pivot_index, right, max_iters - 1, 0); +#endif } template @@ -627,8 +660,41 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan) } UNUSED(hasnan); + +#ifdef XSS_COMPILE_OPENMP + + bool use_parallel = arrsize > 10000; + + if (use_parallel) { + // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system + constexpr int thread_limit = 8; + int thread_count = std::min(thread_limit, omp_get_max_threads()); + arrsize_t task_threshold + = std::max((arrsize_t)10000, arrsize / 100); + + // We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_ + // The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems + // Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays +#pragma omp parallel num_threads(thread_count) +#pragma omp single + qsort_(arr, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize), + task_threshold); + } + else { + qsort_(arr, + 0, + arrsize - 1, + 2 * (arrsize_t)log2(arrsize), + std::numeric_limits::max()); + } +#pragma omp taskwait +#else qsort_( - arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize)); + arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0); +#endif replace_inf_with_nan(arr, arrsize, nan_count, descending); } diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp index 7eef83e..c404967 100644 --- a/tests/test-qsort.cpp +++ b/tests/test-qsort.cpp @@ -11,6 +11,10 @@ class simdsort : public ::testing::Test { simdsort() { std::iota(arrsize.begin(), arrsize.end(), 1); + arrsize.push_back(10'000); + arrsize.push_back(100'000); + arrsize.push_back(1'000'000); + arrtype = {"random", "constant", "sorted", From fe145648b67051337855c5b152007458f291b48c Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Thu, 9 Jan 2025 15:31:20 -0800 Subject: [PATCH 2/4] Speed up tests, in particular when OpenMP is disabled --- tests/meson.build | 7 +++++++ tests/test-keyvalue.cpp | 15 ++++++++++----- tests/test-qsort.cpp | 15 ++++++++++----- 3 files changed, 27 insertions(+), 10 deletions(-) diff --git a/tests/meson.build b/tests/meson.build index 0583c55..92c689b 100644 --- a/tests/meson.build +++ b/tests/meson.build @@ -1,19 +1,26 @@ libtests = [] +if get_option('use_openmp') + openmpflags = ['-DXSS_USE_OPENMP=true'] +endif + libtests += static_library('tests_qsort', files('test-qsort.cpp', ), dependencies: gtest_dep, include_directories : [src, lib, utils], + cpp_args : [openmpflags], ) libtests += static_library('tests_kvsort', files('test-keyvalue.cpp', ), dependencies: gtest_dep, include_directories : [src, lib, utils], + cpp_args : [openmpflags], ) libtests += static_library('tests_objsort', files('test-objqsort.cpp', ), dependencies: gtest_dep, include_directories : [src, lib, utils], + cpp_args : [openmpflags], ) diff --git a/tests/test-keyvalue.cpp b/tests/test-keyvalue.cpp index c0e683c..c1386af 100644 --- a/tests/test-keyvalue.cpp +++ b/tests/test-keyvalue.cpp @@ -15,9 +15,13 @@ class simdkvsort : public ::testing::Test { simdkvsort() { std::iota(arrsize.begin(), arrsize.end(), 1); - arrsize.push_back(10'000); - arrsize.push_back(100'000); - arrsize.push_back(1'000'000); + std::iota(arrsize_long.begin(), arrsize_long.end(), 1); +#ifdef XSS_USE_OPENMP + // These extended tests are only needed for the OpenMP logic + arrsize_long.push_back(10'000); + arrsize_long.push_back(100'000); + arrsize_long.push_back(1'000'000); +#endif arrtype = {"random", "constant", @@ -32,6 +36,7 @@ class simdkvsort : public ::testing::Test { } std::vector arrtype; std::vector arrsize = std::vector(1024); + std::vector arrsize_long = std::vector(1024); }; TYPED_TEST_SUITE_P(simdkvsort); @@ -168,7 +173,7 @@ TYPED_TEST_P(simdkvsort, test_kvsort_ascending) using T2 = typename std::tuple_element<1, decltype(TypeParam())>::type; for (auto type : this->arrtype) { bool hasnan = is_nan_test(type); - for (auto size : this->arrsize) { + for (auto size : this->arrsize_long) { std::vector key = get_array(type, size); std::vector val = get_array(type, size); std::vector key_bckp = key; @@ -199,7 +204,7 @@ TYPED_TEST_P(simdkvsort, test_kvsort_descending) using T2 = typename std::tuple_element<1, decltype(TypeParam())>::type; for (auto type : this->arrtype) { bool hasnan = is_nan_test(type); - for (auto size : this->arrsize) { + for (auto size : this->arrsize_long) { std::vector key = get_array(type, size); std::vector val = get_array(type, size); std::vector key_bckp = key; diff --git a/tests/test-qsort.cpp b/tests/test-qsort.cpp index c404967..8a48207 100644 --- a/tests/test-qsort.cpp +++ b/tests/test-qsort.cpp @@ -11,9 +11,13 @@ class simdsort : public ::testing::Test { simdsort() { std::iota(arrsize.begin(), arrsize.end(), 1); - arrsize.push_back(10'000); - arrsize.push_back(100'000); - arrsize.push_back(1'000'000); + std::iota(arrsize_long.begin(), arrsize_long.end(), 1); +#ifdef XSS_USE_OPENMP + // These extended tests are only needed for the OpenMP logic + arrsize_long.push_back(10'000); + arrsize_long.push_back(100'000); + arrsize_long.push_back(1'000'000); +#endif arrtype = {"random", "constant", @@ -28,6 +32,7 @@ class simdsort : public ::testing::Test { } std::vector arrtype; std::vector arrsize = std::vector(1024); + std::vector arrsize_long = std::vector(1024); }; TYPED_TEST_SUITE_P(simdsort); @@ -36,7 +41,7 @@ TYPED_TEST_P(simdsort, test_qsort_ascending) { for (auto type : this->arrtype) { bool hasnan = is_nan_test(type); - for (auto size : this->arrsize) { + for (auto size : this->arrsize_long) { std::vector basearr = get_array(type, size); // Ascending order @@ -58,7 +63,7 @@ TYPED_TEST_P(simdsort, test_qsort_descending) { for (auto type : this->arrtype) { bool hasnan = is_nan_test(type); - for (auto size : this->arrsize) { + for (auto size : this->arrsize_long) { std::vector basearr = get_array(type, size); // Descending order From 98e2da5d3bd70e2414e78f467a7335a227466f4e Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Thu, 16 Jan 2025 14:57:17 -0800 Subject: [PATCH 3/4] Change threshold for OpenMP/tasks from 10k to 100k --- src/xss-common-qsort.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/xss-common-qsort.h b/src/xss-common-qsort.h index f91ea3f..9aa5911 100644 --- a/src/xss-common-qsort.h +++ b/src/xss-common-qsort.h @@ -663,14 +663,14 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan) #ifdef XSS_COMPILE_OPENMP - bool use_parallel = arrsize > 10000; + bool use_parallel = arrsize > 100000; if (use_parallel) { // This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system constexpr int thread_limit = 8; int thread_count = std::min(thread_limit, omp_get_max_threads()); arrsize_t task_threshold - = std::max((arrsize_t)10000, arrsize / 100); + = std::max((arrsize_t)100000, arrsize / 100); // We use omp parallel and then omp single to setup the threads that will run the omp task calls in qsort_ // The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems From 8ddc73a06aa5779347d36ead89dca7e8cbb1aad4 Mon Sep 17 00:00:00 2001 From: Matthew Sterrett Date: Tue, 21 Jan 2025 10:52:04 -0800 Subject: [PATCH 4/4] Fix that weird formatting issue --- src/avx512-64bit-common.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h index 5d55196..f27a31f 100644 --- a/src/avx512-64bit-common.h +++ b/src/avx512-64bit-common.h @@ -968,8 +968,7 @@ struct zmm_vector { static_assert(sizeof(size_t) == sizeof(uint64_t), "Size of size_t and uint64_t are not the same"); template <> -struct zmm_vector : public zmm_vector { -}; +struct zmm_vector : public zmm_vector {}; #endif template <>