Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds OpenMP to qsort, should also improve test speed a bit #179

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/avx512-16bit-qsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,7 @@ avx512_qsort_fp16(uint16_t *arr,
{
using vtype = zmm_vector<float16>;

// TODO multithreading support here
if (arrsize > 1) {
arrsize_t nan_count = 0;
if (UNLIKELY(hasnan)) {
Expand All @@ -564,11 +565,11 @@ avx512_qsort_fp16(uint16_t *arr,
}
if (descending) {
qsort_<vtype, Comparator<vtype, true>, uint16_t>(
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize));
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
}
else {
qsort_<vtype, Comparator<vtype, false>, uint16_t>(
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize));
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
}
replace_inf_with_nan(arr, arrsize, nan_count, descending);
}
Expand Down
5 changes: 5 additions & 0 deletions src/xss-common-includes.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, \
21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31

#if defined(XSS_USE_OPENMP) && defined(_OPENMP)
#define XSS_COMPILE_OPENMP
#include <omp.h>
#endif

template <class... T>
constexpr bool always_false = false;

Expand Down
5 changes: 0 additions & 5 deletions src/xss-common-keyvaluesort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,6 @@
#include "xss-common-qsort.h"
#include "xss-network-keyvaluesort.hpp"

#if defined(XSS_USE_OPENMP) && defined(_OPENMP)
#define XSS_COMPILE_OPENMP
#include <omp.h>
#endif

/*
* Sort all the NAN's to end of the array and return the index of the last elem
* in the array which is not a nan
Expand Down
76 changes: 71 additions & 5 deletions src/xss-common-qsort.h
Original file line number Diff line number Diff line change
Expand Up @@ -521,8 +521,11 @@ template <typename vtype, int maxN>
void sort_n(typename vtype::type_t *arr, int N);

template <typename vtype, typename comparator, typename type_t>
static void
qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
static void qsort_(type_t *arr,
arrsize_t left,
arrsize_t right,
arrsize_t max_iters,
arrsize_t task_threshold)
{
/*
* Resort to std::sort if quicksort isn't making any progress
Expand Down Expand Up @@ -559,10 +562,40 @@ qsort_(type_t *arr, arrsize_t left, arrsize_t right, arrsize_t max_iters)
type_t leftmostValue = comparator::leftmost(smallest, biggest);
type_t rightmostValue = comparator::rightmost(smallest, biggest);

#ifdef XSS_COMPILE_OPENMP
if (pivot != leftmostValue) {
bool parallel_left = (pivot_index - left) > task_threshold;
if (parallel_left) {
#pragma omp task
qsort_<vtype, comparator>(
arr, left, pivot_index - 1, max_iters - 1, task_threshold);
}
else {
qsort_<vtype, comparator>(
arr, left, pivot_index - 1, max_iters - 1, task_threshold);
}
}
if (pivot != rightmostValue) {
bool parallel_right = (right - pivot_index) > task_threshold;

if (parallel_right) {
#pragma omp task
qsort_<vtype, comparator>(
arr, pivot_index, right, max_iters - 1, task_threshold);
}
else {
qsort_<vtype, comparator>(
arr, pivot_index, right, max_iters - 1, task_threshold);
}
}
#else
UNUSED(task_threshold);

if (pivot != leftmostValue)
qsort_<vtype, comparator>(arr, left, pivot_index - 1, max_iters - 1);
qsort_<vtype, comparator>(arr, left, pivot_index - 1, max_iters - 1, 0);
if (pivot != rightmostValue)
qsort_<vtype, comparator>(arr, pivot_index, right, max_iters - 1);
qsort_<vtype, comparator>(arr, pivot_index, right, max_iters - 1, 0);
#endif
}

template <typename vtype, typename comparator, typename type_t>
Expand Down Expand Up @@ -627,8 +660,41 @@ X86_SIMD_SORT_INLINE void xss_qsort(T *arr, arrsize_t arrsize, bool hasnan)
}

UNUSED(hasnan);

#ifdef XSS_COMPILE_OPENMP

bool use_parallel = arrsize > 100000;

if (use_parallel) {
// This thread limit was determined experimentally; it may be better for it to be the number of physical cores on the system
constexpr int thread_limit = 8;
int thread_count = std::min(thread_limit, omp_get_max_threads());
arrsize_t task_threshold
= std::max((arrsize_t)100000, arrsize / 100);

// We use omp parallel and then omp single to set up the threads that will run the omp task calls in qsort_
// The omp single prevents multiple threads from running the initial qsort_ simultaneously and causing problems
// Note that we do not use the if(...) clause built into OpenMP, because it causes a performance regression for small arrays
#pragma omp parallel num_threads(thread_count)
#pragma omp single
qsort_<vtype, comparator, T>(arr,
0,
arrsize - 1,
2 * (arrsize_t)log2(arrsize),
task_threshold);
}
else {
qsort_<vtype, comparator, T>(arr,
0,
arrsize - 1,
2 * (arrsize_t)log2(arrsize),
std::numeric_limits<arrsize_t>::max());
}
#pragma omp taskwait
#else
qsort_<vtype, comparator, T>(
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize));
arr, 0, arrsize - 1, 2 * (arrsize_t)log2(arrsize), 0);
#endif

replace_inf_with_nan(arr, arrsize, nan_count, descending);
}
Expand Down
7 changes: 7 additions & 0 deletions tests/meson.build
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
# One static library is built per test source file and appended to `libtests`.
libtests = []

# When the `use_openmp` option is enabled, pass -DXSS_USE_OPENMP to the test
# sources. Otherwise make sure `openmpflags` still exists, because every
# target below references it unconditionally; an unset variable is a hard
# Meson error. The is_variable() guard keeps any value already assigned
# elsewhere in the build (NOTE(review): confirm whether a parent meson.build
# defines `openmpflags`).
if get_option('use_openmp')
  openmpflags = ['-DXSS_USE_OPENMP=true']
elif not is_variable('openmpflags')
  openmpflags = []
endif

libtests += static_library('tests_qsort',
  files('test-qsort.cpp', ),
  dependencies: gtest_dep,
  include_directories : [src, lib, utils],
  cpp_args : [openmpflags],
  )

libtests += static_library('tests_kvsort',
  files('test-keyvalue.cpp', ),
  dependencies: gtest_dep,
  include_directories : [src, lib, utils],
  cpp_args : [openmpflags],
  )

libtests += static_library('tests_objsort',
  files('test-objqsort.cpp', ),
  dependencies: gtest_dep,
  include_directories : [src, lib, utils],
  cpp_args : [openmpflags],
  )
15 changes: 10 additions & 5 deletions tests/test-keyvalue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,13 @@ class simdkvsort : public ::testing::Test {
simdkvsort()
{
std::iota(arrsize.begin(), arrsize.end(), 1);
arrsize.push_back(10'000);
arrsize.push_back(100'000);
arrsize.push_back(1'000'000);
std::iota(arrsize_long.begin(), arrsize_long.end(), 1);
#ifdef XSS_USE_OPENMP
// These extended tests are only needed for the OpenMP logic
arrsize_long.push_back(10'000);
arrsize_long.push_back(100'000);
arrsize_long.push_back(1'000'000);
#endif

arrtype = {"random",
"constant",
Expand All @@ -32,6 +36,7 @@ class simdkvsort : public ::testing::Test {
}
std::vector<std::string> arrtype;
std::vector<size_t> arrsize = std::vector<size_t>(1024);
std::vector<size_t> arrsize_long = std::vector<size_t>(1024);
};

TYPED_TEST_SUITE_P(simdkvsort);
Expand Down Expand Up @@ -168,7 +173,7 @@ TYPED_TEST_P(simdkvsort, test_kvsort_ascending)
using T2 = typename std::tuple_element<1, decltype(TypeParam())>::type;
for (auto type : this->arrtype) {
bool hasnan = is_nan_test(type);
for (auto size : this->arrsize) {
for (auto size : this->arrsize_long) {
std::vector<T1> key = get_array<T1>(type, size);
std::vector<T2> val = get_array<T2>(type, size);
std::vector<T1> key_bckp = key;
Expand Down Expand Up @@ -199,7 +204,7 @@ TYPED_TEST_P(simdkvsort, test_kvsort_descending)
using T2 = typename std::tuple_element<1, decltype(TypeParam())>::type;
for (auto type : this->arrtype) {
bool hasnan = is_nan_test(type);
for (auto size : this->arrsize) {
for (auto size : this->arrsize_long) {
std::vector<T1> key = get_array<T1>(type, size);
std::vector<T2> val = get_array<T2>(type, size);
std::vector<T1> key_bckp = key;
Expand Down
13 changes: 11 additions & 2 deletions tests/test-qsort.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ class simdsort : public ::testing::Test {
simdsort()
{
std::iota(arrsize.begin(), arrsize.end(), 1);
std::iota(arrsize_long.begin(), arrsize_long.end(), 1);
#ifdef XSS_USE_OPENMP
// These extended tests are only needed for the OpenMP logic
arrsize_long.push_back(10'000);
arrsize_long.push_back(100'000);
arrsize_long.push_back(1'000'000);
#endif

arrtype = {"random",
"constant",
"sorted",
Expand All @@ -24,6 +32,7 @@ class simdsort : public ::testing::Test {
}
std::vector<std::string> arrtype;
std::vector<size_t> arrsize = std::vector<size_t>(1024);
std::vector<size_t> arrsize_long = std::vector<size_t>(1024);
};

TYPED_TEST_SUITE_P(simdsort);
Expand All @@ -32,7 +41,7 @@ TYPED_TEST_P(simdsort, test_qsort_ascending)
{
for (auto type : this->arrtype) {
bool hasnan = is_nan_test(type);
for (auto size : this->arrsize) {
for (auto size : this->arrsize_long) {
std::vector<TypeParam> basearr = get_array<TypeParam>(type, size);

// Ascending order
Expand All @@ -54,7 +63,7 @@ TYPED_TEST_P(simdsort, test_qsort_descending)
{
for (auto type : this->arrtype) {
bool hasnan = is_nan_test(type);
for (auto size : this->arrsize) {
for (auto size : this->arrsize_long) {
std::vector<TypeParam> basearr = get_array<TypeParam>(type, size);

// Descending order
Expand Down