Commit 7d7591c

Merge pull request #4 from r-devulap/cygwin-bug

Force inline on cygwin only

2 parents: 0f1023b + c3be276
4 files changed: +54 -44 lines
src/avx512-16bit-qsort.hpp (+13 -13)

```diff
@@ -374,7 +374,7 @@ struct zmm_vector<uint16_t> {
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
  */
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_16bit(zmm_t zmm)
 {
     // Level 1
     zmm = cmp_merge<vtype>(
@@ -434,7 +434,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_16bit(zmm_t zmm)
 
 // Assumes zmm is bitonic and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)
 {
     // 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc ..
     zmm = cmp_merge<vtype>(
@@ -460,7 +460,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_16bit(zmm_t zmm)
 
 // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)
 {
     // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
     zmm2 = vtype::permutexvar(vtype::get_network(4), zmm2);
@@ -474,7 +474,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_16bit(zmm_t &zmm1, zmm_t &zmm2)
 // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
 // half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)
 {
     zmm_t zmm2r = vtype::permutexvar(vtype::get_network(4), zmm[2]);
     zmm_t zmm3r = vtype::permutexvar(vtype::get_network(4), zmm[3]);
@@ -495,7 +495,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_16bit(zmm_t *zmm)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_32_16bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_32_16bit(type_t *arr, int32_t N)
 {
     typename vtype::opmask_t load_mask = ((0x1ull << N) - 0x1ull) & 0xFFFFFFFF;
     typename vtype::zmm_t zmm
```
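The sort_32_16bit hunk just above also shows the partial-load trick these small sorts rely on: the low N bits of a 32-bit mask select which uint16_t lanes a masked load touches. A standalone sketch of just the mask arithmetic (partial_load_mask and the printout are illustrative, not from the repo):

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the load_mask expression in sort_32_16bit: for N in [0, 32],
// set the low N bits. The 64-bit shift keeps N == 32 well defined
// (0x1ull << 32 is fine; 1u << 32 would be UB), and the final AND
// truncates back to the 32-lane mask width.
static uint32_t partial_load_mask(int32_t N)
{
    return static_cast<uint32_t>(((0x1ull << N) - 0x1ull) & 0xFFFFFFFF);
}

int main()
{
    std::printf("%08x\n", partial_load_mask(5));  // 0000001f
    std::printf("%08x\n", partial_load_mask(32)); // ffffffff
    return 0;
}
```

The remaining hunks in this file are the same mechanical FINLINE-to-INLINE rename: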
```diff
@@ -504,7 +504,7 @@ X86_SIMD_SORT_FINLINE void sort_32_16bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_64_16bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_64_16bit(type_t *arr, int32_t N)
 {
     if (N <= 32) {
         sort_32_16bit<vtype>(arr, N);
@@ -523,7 +523,7 @@ X86_SIMD_SORT_FINLINE void sort_64_16bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_128_16bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_128_16bit(type_t *arr, int32_t N)
 {
     if (N <= 64) {
         sort_64_16bit<vtype>(arr, N);
@@ -556,9 +556,9 @@ X86_SIMD_SORT_FINLINE void sort_128_16bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE type_t get_pivot_16bit(type_t *arr,
-                                             const int64_t left,
-                                             const int64_t right)
+X86_SIMD_SORT_INLINE type_t get_pivot_16bit(type_t *arr,
+                                            const int64_t left,
+                                            const int64_t right)
 {
     // median of 32
     int64_t size = (right - left) / 32;
@@ -657,8 +657,8 @@ qsort_16bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
     qsort_16bit_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(uint16_t *arr,
-                                                   int64_t arrsize)
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(uint16_t *arr,
+                                                  int64_t arrsize)
 {
     int64_t nan_count = 0;
     __mmask16 loadmask = 0xFFFF;
@@ -676,7 +676,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(uint16_t *arr,
     return nan_count;
 }
 
-X86_SIMD_SORT_FINLINE void
+X86_SIMD_SORT_INLINE void
 replace_inf_with_nan(uint16_t *arr, int64_t arrsize, int64_t nan_count)
 {
     for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
```
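Most of the hunks renamed above are bitonic-merge helpers whose comments speak of a "recursive half cleaner". For orientation, a scalar sketch of what that means (hypothetical reference code; the zmm versions express the same compare-exchanges with permutes plus min/max):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

// One "half cleaner" pass over a bitonic sequence of length n (a power
// of two): compare-exchange a[i] with a[i + n/2]. Afterwards each half
// is bitonic and no element of the first half exceeds any of the second.
template <typename T>
void half_cleaner(T *a, std::size_t n)
{
    for (std::size_t i = 0; i < n / 2; ++i)
        if (a[i] > a[i + n / 2]) std::swap(a[i], a[i + n / 2]);
}

// Recursing on both halves sorts the bitonic input; this is the scalar
// analogue of what bitonic_merge_zmm_16bit does across 32 lanes.
template <typename T>
void bitonic_merge(T *a, std::size_t n)
{
    if (n < 2) return;
    half_cleaner(a, n);
    bitonic_merge(a, n / 2);
    bitonic_merge(a + n / 2, n / 2);
}

int main()
{
    int a[] = {1, 3, 5, 7, 8, 6, 4, 2}; // ascending then descending: bitonic
    bitonic_merge(a, 8);
    for (int v : a)
        std::printf("%d ", v); // 1 2 3 4 5 6 7 8
    std::printf("\n");
    return 0;
}
```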

src/avx512-32bit-qsort.hpp (+14 -14)

```diff
@@ -336,7 +336,7 @@ struct zmm_vector<float> {
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
  */
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_32bit(zmm_t zmm)
 {
     zmm = cmp_merge<vtype>(
             zmm,
@@ -383,7 +383,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_32bit(zmm_t zmm)
 
 // Assumes zmm is bitonic and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)
 {
     // 1) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
     zmm = cmp_merge<vtype>(
@@ -410,7 +410,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_32bit(zmm_t zmm)
 
 // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)
 {
     // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
     *zmm2 = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), *zmm2);
@@ -424,7 +424,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_32bit(zmm_t *zmm1, zmm_t *zmm2)
 // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
 // half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)
 {
     zmm_t zmm2r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[2]);
     zmm_t zmm3r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[3]);
@@ -445,7 +445,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_32bit(zmm_t *zmm)
 }
 
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)
 {
     zmm_t zmm4r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[4]);
     zmm_t zmm5r = vtype::permutexvar(_mm512_set_epi32(NETWORK_32BIT_5), zmm[5]);
@@ -482,7 +482,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_32bit(zmm_t *zmm)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_16_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_16_32bit(type_t *arr, int32_t N)
 {
     typename vtype::opmask_t load_mask = (0x0001 << N) - 0x0001;
     typename vtype::zmm_t zmm
@@ -491,7 +491,7 @@ X86_SIMD_SORT_FINLINE void sort_16_32bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_32_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_32_32bit(type_t *arr, int32_t N)
 {
     if (N <= 16) {
         sort_16_32bit<vtype>(arr, N);
@@ -509,7 +509,7 @@ X86_SIMD_SORT_FINLINE void sort_32_32bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_64_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_64_32bit(type_t *arr, int32_t N)
 {
     if (N <= 32) {
         sort_32_32bit<vtype>(arr, N);
@@ -540,7 +540,7 @@ X86_SIMD_SORT_FINLINE void sort_64_32bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_128_32bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_128_32bit(type_t *arr, int32_t N)
 {
     if (N <= 64) {
         sort_64_32bit<vtype>(arr, N);
@@ -592,9 +592,9 @@ X86_SIMD_SORT_FINLINE void sort_128_32bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE type_t get_pivot_32bit(type_t *arr,
-                                             const int64_t left,
-                                             const int64_t right)
+X86_SIMD_SORT_INLINE type_t get_pivot_32bit(type_t *arr,
+                                            const int64_t left,
+                                            const int64_t right)
 {
     // median of 16
     int64_t size = (right - left) / 16;
@@ -656,7 +656,7 @@ qsort_32bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
     qsort_32bit_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
 {
     int64_t nan_count = 0;
     __mmask16 loadmask = 0xFFFF;
@@ -672,7 +672,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(float *arr, int64_t arrsize)
     return nan_count;
 }
 
-X86_SIMD_SORT_FINLINE void
+X86_SIMD_SORT_INLINE void
 replace_inf_with_nan(float *arr, int64_t arrsize, int64_t nan_count)
 {
     for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
```
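The final two hunks carry the float NaN policy used around the qsort kernels: NaNs are rewritten as +inf so they sort to the back, counted, and restored afterwards. A scalar sketch of that round trip, assuming the sort leaves all infs at the tail (the _scalar helpers are hypothetical; the real versions stream through the array with masked AVX-512 loads):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

// Hypothetical scalar stand-in for replace_nan_with_inf above.
static int64_t replace_nan_with_inf_scalar(float *arr, int64_t arrsize)
{
    int64_t nan_count = 0;
    for (int64_t i = 0; i < arrsize; ++i) {
        if (std::isnan(arr[i])) {
            arr[i] = std::numeric_limits<float>::infinity();
            ++nan_count;
        }
    }
    return nan_count;
}

// Hypothetical scalar stand-in for replace_inf_with_nan above. The sort
// parks every inf (injected or genuine) at the tail, so restoring exactly
// nan_count slots preserves any genuine +inf.
static void replace_inf_with_nan_scalar(float *arr, int64_t arrsize, int64_t nan_count)
{
    for (int64_t ii = arrsize - 1; nan_count > 0; --ii, --nan_count)
        arr[ii] = std::numeric_limits<float>::quiet_NaN();
}

int main()
{
    float a[] = {2.0f, std::nanf(""), 1.0f};
    int64_t n = replace_nan_with_inf_scalar(a, 3);
    std::sort(a, a + 3); // the injected inf lands at the back
    replace_inf_with_nan_scalar(a, 3, n);
    std::printf("%g %g %g\n", a[0], a[1], a[2]); // 1 2 nan
    return 0;
}
```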

src/avx512-64bit-qsort.hpp (+16 -16)

```diff
@@ -330,7 +330,7 @@ struct zmm_vector<double> {
  * https://en.wikipedia.org/wiki/Bitonic_sorter#/media/File:BitonicSort.svg
  */
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t sort_zmm_64bit(zmm_t zmm)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     zmm = cmp_merge<vtype>(
@@ -353,7 +353,7 @@ X86_SIMD_SORT_FINLINE zmm_t sort_zmm_64bit(zmm_t zmm)
 
 // Assumes zmm is bitonic and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)
+X86_SIMD_SORT_INLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)
 {
 
     // 1) half_cleaner[8]: compare 0-4, 1-5, 2-6, 3-7
@@ -374,7 +374,7 @@ X86_SIMD_SORT_FINLINE zmm_t bitonic_merge_zmm_64bit(zmm_t zmm)
 
 // Assumes zmm1 and zmm2 are sorted and performs a recursive half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)
+X86_SIMD_SORT_INLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     // 1) First step of a merging network: coex of zmm1 and zmm2 reversed
@@ -389,7 +389,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_two_zmm_64bit(zmm_t &zmm1, zmm_t &zmm2)
 // Assumes [zmm0, zmm1] and [zmm2, zmm3] are sorted and performs a recursive
 // half cleaner
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     // 1) First step of a merging network
@@ -411,7 +411,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_four_zmm_64bit(zmm_t *zmm)
 }
 
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     zmm_t zmm4r = vtype::permutexvar(rev_index, zmm[4]);
@@ -445,7 +445,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_eight_zmm_64bit(zmm_t *zmm)
 }
 
 template <typename vtype, typename zmm_t = typename vtype::zmm_t>
-X86_SIMD_SORT_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)
+X86_SIMD_SORT_INLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)
 {
     const __m512i rev_index = _mm512_set_epi64(NETWORK_64BIT_2);
     zmm_t zmm8r = vtype::permutexvar(rev_index, zmm[8]);
@@ -519,7 +519,7 @@ X86_SIMD_SORT_FINLINE void bitonic_merge_sixteen_zmm_64bit(zmm_t *zmm)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_8_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_8_64bit(type_t *arr, int32_t N)
 {
     typename vtype::opmask_t load_mask = (0x01 << N) - 0x01;
     typename vtype::zmm_t zmm
@@ -528,7 +528,7 @@ X86_SIMD_SORT_FINLINE void sort_8_64bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_16_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_16_64bit(type_t *arr, int32_t N)
 {
     if (N <= 8) {
         sort_8_64bit<vtype>(arr, N);
@@ -546,7 +546,7 @@ X86_SIMD_SORT_FINLINE void sort_16_64bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_32_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_32_64bit(type_t *arr, int32_t N)
 {
     if (N <= 16) {
         sort_16_64bit<vtype>(arr, N);
@@ -577,7 +577,7 @@ X86_SIMD_SORT_FINLINE void sort_32_64bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_64_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_64_64bit(type_t *arr, int32_t N)
 {
     if (N <= 32) {
         sort_32_64bit<vtype>(arr, N);
@@ -628,7 +628,7 @@ X86_SIMD_SORT_FINLINE void sort_64_64bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE void sort_128_64bit(type_t *arr, int32_t N)
+X86_SIMD_SORT_INLINE void sort_128_64bit(type_t *arr, int32_t N)
 {
     if (N <= 64) {
         sort_64_64bit<vtype>(arr, N);
@@ -718,9 +718,9 @@ X86_SIMD_SORT_FINLINE void sort_128_64bit(type_t *arr, int32_t N)
 }
 
 template <typename vtype, typename type_t>
-X86_SIMD_SORT_FINLINE type_t get_pivot_64bit(type_t *arr,
-                                             const int64_t left,
-                                             const int64_t right)
+X86_SIMD_SORT_INLINE type_t get_pivot_64bit(type_t *arr,
+                                            const int64_t left,
+                                            const int64_t right)
 {
     // median of 8
     int64_t size = (right - left) / 8;
@@ -769,7 +769,7 @@ qsort_64bit_(type_t *arr, int64_t left, int64_t right, int64_t max_iters)
     qsort_64bit_<vtype>(arr, pivot_index, right, max_iters - 1);
 }
 
-X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)
+X86_SIMD_SORT_INLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)
 {
     int64_t nan_count = 0;
     __mmask8 loadmask = 0xFF;
@@ -785,7 +785,7 @@ X86_SIMD_SORT_FINLINE int64_t replace_nan_with_inf(double *arr, int64_t arrsize)
     return nan_count;
 }
 
-X86_SIMD_SORT_FINLINE void
+X86_SIMD_SORT_INLINE void
 replace_inf_with_nan(double *arr, int64_t arrsize, int64_t nan_count)
 {
     for (int64_t ii = arrsize - 1; nan_count > 0; --ii) {
```
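get_pivot_64bit above comments itself as "median of 8", matching the "median of 32" and "median of 16" variants in the other files: sample evenly spaced elements of [left, right] and take their median as the quicksort pivot. A scalar sketch of that idea (get_pivot_scalar is hypothetical; the real helpers gather the samples into zmm registers and reuse the sorting networks), assuming right - left >= n:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Median-of-n pivot selection: take n evenly spaced samples from
// arr[left..right] and return their median.
template <typename T>
T get_pivot_scalar(const T *arr, int64_t left, int64_t right, int64_t n)
{
    int64_t stride = (right - left) / n; // cf. "int64_t size = (right - left) / 8;"
    std::vector<T> samples(n);
    for (int64_t i = 0; i < n; ++i)
        samples[i] = arr[left + i * stride];
    std::nth_element(samples.begin(), samples.begin() + n / 2, samples.end());
    return samples[n / 2];
}

int main()
{
    double a[16];
    for (int i = 0; i < 16; ++i)
        a[i] = 15 - i; // reversed input
    std::printf("%g\n", get_pivot_scalar(a, int64_t{0}, int64_t{15}, int64_t{8})); // 12
    return 0;
}
```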

src/avx512-common-qsort.h (+11 -1)

```diff
@@ -64,10 +64,20 @@
 #define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d
 
 #ifdef _MSC_VER
+#define X86_SIMD_SORT_INLINE static inline
 #define X86_SIMD_SORT_FINLINE static __forceinline
+#elif defined(__CYGWIN__)
+/*
+ * Force inline in cygwin to work around a compiler bug. See
+ * https://github.com/numpy/numpy/pull/22315#issuecomment-1267757584
+ */
+#define X86_SIMD_SORT_INLINE static __attribute__((always_inline))
+#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
 #elif defined(__GNUC__)
-#define X86_SIMD_SORT_FINLINE static inline //__attribute__((always_inline))
+#define X86_SIMD_SORT_INLINE static inline
+#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
 #else
+#define X86_SIMD_SORT_INLINE static
 #define X86_SIMD_SORT_FINLINE static
 #endif
 
```
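This header change carries the commit's intent: the kernels above move from FINLINE to INLINE, and only the Cygwin branch expands INLINE to always_inline, sidestepping the miscompilation linked in the comment; elsewhere the compiler decides. A self-contained sketch of how the dispatch behaves (the add_one demo function is hypothetical):

```cpp
#include <cstdio>

// Same compiler dispatch as src/avx512-common-qsort.h after this commit.
#ifdef _MSC_VER
#define X86_SIMD_SORT_INLINE static inline
#define X86_SIMD_SORT_FINLINE static __forceinline
#elif defined(__CYGWIN__)
/* Inlining is forced on Cygwin to dodge the compiler bug linked above. */
#define X86_SIMD_SORT_INLINE static __attribute__((always_inline))
#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
#elif defined(__GNUC__)
#define X86_SIMD_SORT_INLINE static inline
#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
#else
#define X86_SIMD_SORT_INLINE static
#define X86_SIMD_SORT_FINLINE static
#endif

// Hypothetical demo: declared with the softer macro, so the compiler
// chooses whether to inline it everywhere except Cygwin, where the
// always_inline attribute forces the decision.
X86_SIMD_SORT_INLINE int add_one(int x)
{
    return x + 1;
}

int main()
{
    std::printf("%d\n", add_one(41)); // 42
    return 0;
}
```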