// SIMDSupport.hpp
// Forked from AlexHarker/HISSTools_Library
#ifndef SIMDSUPPORT_HPP
#define SIMDSUPPORT_HPP
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <emmintrin.h>
#include <immintrin.h>
#ifdef __APPLE__
// Allocates uninitialised storage for `size` objects of type T using malloc.
// No alignment is requested beyond whatever malloc itself provides.
// Release the result with deallocate_aligned().
template <class T> T *allocate_aligned(size_t size)
{
    const size_t num_bytes = size * sizeof(T);
    void *raw = malloc(num_bytes);

    return static_cast<T *>(raw);
}
// Releases storage obtained from allocate_aligned() by passing it to free().
template <class T> void deallocate_aligned(T *ptr)
{
    void *raw = ptr;
    free(raw);
}
#elif defined(__linux__)
#include <stdlib.h>
// Forward declaration
template <class T> struct SIMDLimits;
// Allocates uninitialised storage for `size` objects of type T, aligned to the byte width that
// SIMDLimits<T> reports for T.
// Returns a null pointer if the allocation fails.
// Release the result with deallocate_aligned().
template <class T> T *allocate_aligned(size_t size)
{
    // posix_memalign rejects alignments smaller than sizeof(void *). The generic SIMDLimits<T>
    // reports sizeof(T), which can fall below that, so raise the request to the minimum.
    size_t alignment = static_cast<size_t>(SIMDLimits<T>::byte_width);
    if (alignment < sizeof(void *))
        alignment = sizeof(void *);

    // On failure posix_memalign does not set the output pointer, so check its return value
    // rather than handing back whatever mem happens to contain.
    void *mem = nullptr;
    if (posix_memalign(&mem, alignment, size * sizeof(T)) != 0)
        return nullptr;

    return static_cast<T *>(mem);
}
// Releases storage obtained from allocate_aligned() by passing it to free().
template <class T> void deallocate_aligned(T *ptr)
{
    void *raw = ptr;
    free(raw);
}
#else
#include <malloc.h>
// Allocates uninitialised storage for `size` objects of type T through _aligned_malloc, always
// requesting 16-byte alignment. Release the result with deallocate_aligned().
template <class T> T *allocate_aligned(size_t size)
{
    const size_t alignment = 16;
    const size_t num_bytes = size * sizeof(T);

    return static_cast<T *>(_aligned_malloc(num_bytes, alignment));
}
template <class T> void deallocate_aligned(T *ptr)
{
_aligned_free(ptr);
}
#endif
#include <algorithm>
#include <functional>
// Identifiers for the levels of SIMD support that can be targeted. The values increase with the
// register width so that levels can be compared numerically in #if expressions.
#define SIMD_COMPILER_SUPPORT_SCALAR 0
#define SIMD_COMPILER_SUPPORT_SSE128 1
#define SIMD_COMPILER_SUPPORT_AVX256 2
#define SIMD_COMPILER_SUPPORT_AVX512 3
// Microsoft Visual Studio doesn't ever define __SSE__ so if necessary we derive it from other defines
// (_M_X64, or _M_IX86_FP with a value above zero)
#ifndef __SSE__
#if defined _M_X64 || (defined _M_IX86_FP && _M_IX86_FP > 0)
#define __SSE__ 1
#endif
#endif
// Describes the widest vector available for scalar type T.
// max_size is the number of T elements in that vector and byte_width its size in bytes.
// This generic version describes a single scalar; double and float are specialised below when a
// SIMD instruction set is available.
template<class T> struct SIMDLimits
{
static const int max_size = 1;
static const int byte_width = sizeof(T);
};
// Choose SIMD_COMPILER_SUPPORT_LEVEL from the compiler's predefined macros (widest instruction
// set first) and specialise SIMDLimits for double and float to match.
#if defined(__AVX512F__)
#define SIMD_COMPILER_SUPPORT_LEVEL SIMD_COMPILER_SUPPORT_AVX512
// AVX-512: 64 byte vectors (8 doubles / 16 floats)
template<> struct SIMDLimits<double>
{
static const int max_size = 8;
static const int byte_width = 64;
};
template<> struct SIMDLimits<float>
{
static const int max_size = 16;
static const int byte_width = 64;
};
#elif defined(__AVX__)
#define SIMD_COMPILER_SUPPORT_LEVEL SIMD_COMPILER_SUPPORT_AVX256
// AVX: 32 byte vectors (4 doubles / 8 floats)
template<> struct SIMDLimits<double>
{
static const int max_size = 4;
static const int byte_width = 32;
};
template<> struct SIMDLimits<float>
{
static const int max_size = 8;
static const int byte_width = 32;
};
#elif defined(__SSE__)
#define SIMD_COMPILER_SUPPORT_LEVEL SIMD_COMPILER_SUPPORT_SSE128
// SSE: 16 byte vectors (2 doubles / 4 floats)
template<> struct SIMDLimits<double>
{
static const int max_size = 2;
static const int byte_width = 16;
};
template<> struct SIMDLimits<float>
{
static const int max_size = 4;
static const int byte_width = 16;
};
#else
// No SIMD support detected: only the generic (scalar) SIMDLimits applies
#define SIMD_COMPILER_SUPPORT_LEVEL SIMD_COMPILER_SUPPORT_SCALAR
#endif
// Select functionality for all types.
// Combines the bits of b that lie under mask with the result of and_not(mask, a).
// T must supply operator &, operator | and an and_not() function.
template <class T> T select(const T& a, const T& b, const T& mask)
{
    const auto from_b = b & mask;
    const auto from_a = and_not(mask, a);

    return from_b | from_a;
}
// Data Type Definitions
// ******************** A Vector of Given Size (Made of Vectors) ******************** //
// A vector of final_size scalars, stored as an array of smaller vectors of type T.
//
// T must provide scalar_type and size, be constructible from a scalar and from a pointer to
// scalars, and supply whichever operators / functions are used through this wrapper. Each
// forwarding member is only instantiated when it is called.
template <int final_size, class T> struct SizedVector
{
    typedef SizedVector SV;
    typedef typename T::scalar_type scalar_type;
    static const int size = final_size;
    static const int array_size = final_size / T::size;

    // Leaves the contents unassigned
    SizedVector() {}

    // Broadcasts one scalar to every sub-vector
    SizedVector(const typename T::scalar_type& a)
    {
        for (int i = 0; i < array_size; i++)
            mData[i] = a;
    }

    // Copies the vector that ptr points to
    SizedVector(const SizedVector *ptr) { *this = *ptr; }

    // Loads consecutive scalars from array, T::size at a time. Each sub-vector is built with T's
    // own pointer constructor instead of reinterpreting the scalar array as a SizedVector, so
    // the source needs no special alignment and is never accessed through an unrelated type.
    SizedVector(const typename T::scalar_type *array)
    {
        for (int i = 0; i < array_size; i++)
            mData[i] = T(array + i * T::size);
    }

    // This template allows a static loop
    template <int First, int Last>
    struct static_for
    {
        template <typename Fn>
        void operator()(SizedVector &result, const SizedVector &a, const SizedVector &b, Fn const& fn) const
        {
            if (First < Last)
            {
                result.mData[First] = fn(a.mData[First], b.mData[First]);
                static_for<First + 1, Last>()(result, a, b, fn);
            }
        }
    };

    // This specialisation avoids infinite recursion
    template <int N>
    struct static_for<N, N>
    {
        template <typename Fn>
        void operator()(SV &result, const SV &a, const SV &b, Fn const& fn) const {}
    };

    // Applies fn to each pair of corresponding sub-vectors of a and b
    template <typename Op> friend SizedVector op(const SV& a, const SV& b, Op fn)
    {
        SV result;
        static_for<0, array_size>()(result, a, b, fn);
        return result;
    }

    // Arithmetic
    friend SV operator + (const SV& a, const SV& b) { return op(a, b, std::plus<T>()); }
    friend SV operator - (const SV& a, const SV& b) { return op(a, b, std::minus<T>()); }
    friend SV operator * (const SV& a, const SV& b) { return op(a, b, std::multiplies<T>()); }
    friend SV operator / (const SV& a, const SV& b) { return op(a, b, std::divides<T>()); }

    SV& operator += (const SV& b) { return (*this = *this + b); }
    SV& operator -= (const SV& b) { return (*this = *this - b); }
    SV& operator *= (const SV& b) { return (*this = *this * b); }
    SV& operator /= (const SV& b) { return (*this = *this / b); }

    // min / max forward to T's own min / max (found by argument-dependent lookup). std::min<T>
    // and std::max<T> are function templates, not functors, so they cannot be passed to op().
    friend SV min(const SV& a, const SV& b) { return op(a, b, [](const T& x, const T& y) -> T { return min(x, y); }); }
    friend SV max(const SV& a, const SV& b) { return op(a, b, [](const T& x, const T& y) -> T { return max(x, y); }); }

    // Comparisons forward to T's operators and keep T's result. The std comparison functors
    // return bool, which would discard the per-lane result that the vector types produce.
    friend SV operator == (const SV& a, const SV& b) { return op(a, b, [](const T& x, const T& y) -> T { return x == y; }); }
    friend SV operator != (const SV& a, const SV& b) { return op(a, b, [](const T& x, const T& y) -> T { return x != y; }); }
    friend SV operator > (const SV& a, const SV& b) { return op(a, b, [](const T& x, const T& y) -> T { return x > y; }); }
    friend SV operator < (const SV& a, const SV& b) { return op(a, b, [](const T& x, const T& y) -> T { return x < y; }); }
    friend SV operator >= (const SV& a, const SV& b) { return op(a, b, [](const T& x, const T& y) -> T { return x >= y; }); }
    friend SV operator <= (const SV& a, const SV& b) { return op(a, b, [](const T& x, const T& y) -> T { return x <= y; }); }

    T mData[array_size];
};
// ******************** Basic Data Type Definitions ******************** //
// Common base for the register-backed vector types: holds one value of the underlying type U
// and records the scalar type T and the element count vec_size.
template <class T, class U, int vec_size> struct SIMDVector
{
    typedef T scalar_type;
    static const int size = vec_size;

    U mVal;

    // Assigns nothing to mVal
    SIMDVector() {}

    // Wraps an existing value of the underlying type
    SIMDVector(U a) : mVal(a) {}
};
// Primary template for a vector of vec_size elements of type T. It is empty: the usable types
// are the explicit specialisations defined below.
template <class T, int vec_size> struct SIMDType {};
// Scalar double presented through the same interface as the vector types
template<>
struct SIMDType<double, 1>
{
    static const int size = 1;
    typedef double scalar_type;

    // Construction: unassigned, from a value, or loaded through a pointer
    SIMDType() {}
    SIMDType(double a) : mVal(a) {}
    SIMDType(const double* a) { mVal = *a; }

    // Writes the value to *a
    void store(double *a) const { *a = mVal; }

    // Arithmetic
    friend SIMDType operator + (const SIMDType& a, const SIMDType& b) { return a.mVal + b.mVal; }
    friend SIMDType operator - (const SIMDType& a, const SIMDType& b) { return a.mVal - b.mVal; }
    friend SIMDType operator * (const SIMDType& a, const SIMDType& b) { return a.mVal * b.mVal; }
    friend SIMDType operator / (const SIMDType& a, const SIMDType& b) { return a.mVal / b.mVal; }

    SIMDType& operator += (const SIMDType& b) { return (*this = *this + b); }
    SIMDType& operator -= (const SIMDType& b) { return (*this = *this - b); }
    SIMDType& operator *= (const SIMDType& b) { return (*this = *this * b); }
    SIMDType& operator /= (const SIMDType& b) { return (*this = *this / b); }

    // Maths functions. The std:: versions are named explicitly: <cmath> only guarantees the
    // declarations in namespace std, whereas an unqualified call depends on the global C
    // functions also being visible.
    friend SIMDType sqrt(const SIMDType& a) { return std::sqrt(a.mVal); }
    friend SIMDType round(const SIMDType& a) { return std::round(a.mVal); }
    friend SIMDType trunc(const SIMDType& a) { return std::trunc(a.mVal); }

    friend SIMDType min(const SIMDType& a, const SIMDType& b) { return std::min(a.mVal, b.mVal); }
    friend SIMDType max(const SIMDType& a, const SIMDType& b) { return std::max(a.mVal, b.mVal); }

    // Returns b when c is non-zero and a otherwise
    friend SIMDType sel(const SIMDType& a, const SIMDType& b, const SIMDType& c) { return c.mVal ? b.mVal : a.mVal; }

    // Comparisons: the bool result converts to double, giving 1.0 for true and 0.0 for false
    friend SIMDType operator == (const SIMDType& a, const SIMDType& b) { return a.mVal == b.mVal; }
    friend SIMDType operator != (const SIMDType& a, const SIMDType& b) { return a.mVal != b.mVal; }
    friend SIMDType operator > (const SIMDType& a, const SIMDType& b) { return a.mVal > b.mVal; }
    friend SIMDType operator < (const SIMDType& a, const SIMDType& b) { return a.mVal < b.mVal; }
    friend SIMDType operator >= (const SIMDType& a, const SIMDType& b) { return a.mVal >= b.mVal; }
    friend SIMDType operator <= (const SIMDType& a, const SIMDType& b) { return a.mVal <= b.mVal; }

    double mVal;
};
// Scalar float presented through the same interface as the vector types
template<>
struct SIMDType<float, 1>
{
    static const int size = 1;
    typedef float scalar_type;

    // Construction: unassigned, from a value, loaded through a pointer, or narrowed from the
    // scalar double type
    SIMDType() {}
    SIMDType(float a) : mVal(a) {}
    SIMDType(const float* a) { mVal = *a; }
    SIMDType(const SIMDType<double, 1>& a) : mVal(static_cast<float>(a.mVal)) {}

    // Writes the value to *a
    void store(float *a) const { *a = mVal; }

    // Arithmetic
    friend SIMDType operator + (const SIMDType& a, const SIMDType& b) { return a.mVal + b.mVal; }
    friend SIMDType operator - (const SIMDType& a, const SIMDType& b) { return a.mVal - b.mVal; }
    friend SIMDType operator * (const SIMDType& a, const SIMDType& b) { return a.mVal * b.mVal; }
    friend SIMDType operator / (const SIMDType& a, const SIMDType& b) { return a.mVal / b.mVal; }

    SIMDType& operator += (const SIMDType& b) { return (*this = *this + b); }
    SIMDType& operator -= (const SIMDType& b) { return (*this = *this - b); }
    SIMDType& operator *= (const SIMDType& b) { return (*this = *this * b); }
    SIMDType& operator /= (const SIMDType& b) { return (*this = *this / b); }

    // Maths functions. The std:: float overloads are named explicitly: <cmath> only guarantees
    // the declarations in namespace std, whereas sqrtf / roundf / truncf depend on the global C
    // functions also being visible.
    friend SIMDType sqrt(const SIMDType& a) { return std::sqrt(a.mVal); }
    friend SIMDType round(const SIMDType& a) { return std::round(a.mVal); }
    friend SIMDType trunc(const SIMDType& a) { return std::trunc(a.mVal); }

    friend SIMDType min(const SIMDType& a, const SIMDType& b) { return std::min(a.mVal, b.mVal); }
    friend SIMDType max(const SIMDType& a, const SIMDType& b) { return std::max(a.mVal, b.mVal); }

    // Returns b when c is non-zero and a otherwise
    friend SIMDType sel(const SIMDType& a, const SIMDType& b, const SIMDType& c) { return c.mVal ? b.mVal : a.mVal; }

    // Comparisons: the bool result converts to float, giving 1.0f for true and 0.0f for false
    friend SIMDType operator == (const SIMDType& a, const SIMDType& b) { return a.mVal == b.mVal; }
    friend SIMDType operator != (const SIMDType& a, const SIMDType& b) { return a.mVal != b.mVal; }
    friend SIMDType operator > (const SIMDType& a, const SIMDType& b) { return a.mVal > b.mVal; }
    friend SIMDType operator < (const SIMDType& a, const SIMDType& b) { return a.mVal < b.mVal; }
    friend SIMDType operator >= (const SIMDType& a, const SIMDType& b) { return a.mVal >= b.mVal; }
    friend SIMDType operator <= (const SIMDType& a, const SIMDType& b) { return a.mVal <= b.mVal; }

    // Widens to the scalar double type
    operator SIMDType<double, 1>() { return static_cast<double>(mVal); }

    float mVal;
};
// A pair of floats held in a plain array. It offers storage, loading and storing only, and is
// the float counterpart that SIMDType<double, 2> converts to and from.
template<>
struct SIMDType<float, 2>
{
    // Two elements: this must match the element count so that code sizing itself from T::size
    // (such as SizedVector) sees the correct width
    static const int size = 2;
    typedef float scalar_type;

    // Leaves both elements unassigned
    SIMDType() {}

    // Broadcasts a to both elements
    SIMDType(float a)
    {
        mVals[0] = a;
        mVals[1] = a;
    }

    // Sets the two elements individually
    SIMDType(float a, float b)
    {
        mVals[0] = a;
        mVals[1] = b;
    }

    // Loads two consecutive floats starting at a
    SIMDType(const float* a)
    {
        mVals[0] = a[0];
        mVals[1] = a[1];
    }

    // Stores both elements to two consecutive floats starting at a
    void store(float *a) const
    {
        a[0] = mVals[0];
        a[1] = mVals[1];
    }

    // N.B. - no ops

    float mVals[2];
};
// SSE types. The guard has to name SIMD_COMPILER_SUPPORT_SSE128: an identifier that is not defined
// evaluates to 0 inside #if, which would make the condition true at every support level.
#if (SIMD_COMPILER_SUPPORT_LEVEL >= SIMD_COMPILER_SUPPORT_SSE128)
// Two doubles held in one SSE register (__m128d)
template<>
struct SIMDType<double, 2> : public SIMDVector<double, __m128d, 2>
{
    // Construction: unassigned, scalar broadcast, unaligned load, raw register, or widened from
    // a pair of floats
    SIMDType() {}
    SIMDType(const double& a) { mVal = _mm_set1_pd(a); }
    SIMDType(const double* a) { mVal = _mm_loadu_pd(a); }
    SIMDType(__m128d a) : SIMDVector(a) {}

    SIMDType(const SIMDType<float, 2> &a)
    {
        double vals[2];
        vals[0] = a.mVals[0];
        vals[1] = a.mVals[1];
        mVal = _mm_loadu_pd(vals);
    }

    // Unaligned store of both elements
    void store(double *a) const { _mm_storeu_pd(a, mVal); }

    // Arithmetic
    friend SIMDType operator + (const SIMDType &a, const SIMDType& b) { return _mm_add_pd(a.mVal, b.mVal); }
    friend SIMDType operator - (const SIMDType &a, const SIMDType& b) { return _mm_sub_pd(a.mVal, b.mVal); }
    friend SIMDType operator * (const SIMDType &a, const SIMDType& b) { return _mm_mul_pd(a.mVal, b.mVal); }
    friend SIMDType operator / (const SIMDType &a, const SIMDType& b) { return _mm_div_pd(a.mVal, b.mVal); }

    SIMDType& operator += (const SIMDType& b) { return (*this = *this + b); }
    SIMDType& operator -= (const SIMDType& b) { return (*this = *this - b); }
    SIMDType& operator *= (const SIMDType& b) { return (*this = *this * b); }
    SIMDType& operator /= (const SIMDType& b) { return (*this = *this / b); }

    // Maths functions
    // NOTE(review): _mm_round_pd is an SSE4.1 intrinsic - confirm SSE4.1 is enabled wherever
    // round / trunc are used
    friend SIMDType sqrt(const SIMDType& a) { return _mm_sqrt_pd(a.mVal); }
    friend SIMDType round(const SIMDType& a) { return _mm_round_pd(a.mVal, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
    friend SIMDType trunc(const SIMDType& a) { return _mm_round_pd(a.mVal, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }

    friend SIMDType min(const SIMDType& a, const SIMDType& b) { return _mm_min_pd(a.mVal, b.mVal); }
    friend SIMDType max(const SIMDType& a, const SIMDType& b) { return _mm_max_pd(a.mVal, b.mVal); }

    // Bitwise selection: takes the bits of b where c is set and the bits of a elsewhere
    friend SIMDType sel(const SIMDType& a, const SIMDType& b, const SIMDType& c) { return and_not(c, a) | (b & c); }

    // Bitwise operations (and_not computes (NOT a) AND b)
    friend SIMDType and_not(const SIMDType& a, const SIMDType& b) { return _mm_andnot_pd(a.mVal, b.mVal); }
    friend SIMDType operator & (const SIMDType& a, const SIMDType& b) { return _mm_and_pd(a.mVal, b.mVal); }
    friend SIMDType operator | (const SIMDType& a, const SIMDType& b) { return _mm_or_pd(a.mVal, b.mVal); }
    friend SIMDType operator ^ (const SIMDType& a, const SIMDType& b) { return _mm_xor_pd(a.mVal, b.mVal); }

    // Comparisons: each element of the result is all ones where the comparison holds and all
    // zeros where it does not. Each operator uses the intrinsic of the same sense (> is cmpgt,
    // < is cmplt, >= is cmpge, <= is cmple).
    friend SIMDType operator == (const SIMDType& a, const SIMDType& b) { return _mm_cmpeq_pd(a.mVal, b.mVal); }
    friend SIMDType operator != (const SIMDType& a, const SIMDType& b) { return _mm_cmpneq_pd(a.mVal, b.mVal); }
    friend SIMDType operator > (const SIMDType& a, const SIMDType& b) { return _mm_cmpgt_pd(a.mVal, b.mVal); }
    friend SIMDType operator < (const SIMDType& a, const SIMDType& b) { return _mm_cmplt_pd(a.mVal, b.mVal); }
    friend SIMDType operator >= (const SIMDType& a, const SIMDType& b) { return _mm_cmpge_pd(a.mVal, b.mVal); }
    friend SIMDType operator <= (const SIMDType& a, const SIMDType& b) { return _mm_cmple_pd(a.mVal, b.mVal); }

    // Shuffles elements of a and b with _mm_shuffle_pd, building the immediate as (y << 1) | x
    template <int y, int x> static SIMDType shuffle(const SIMDType& a, const SIMDType& b)
    {
        return _mm_shuffle_pd(a.mVal, b.mVal, (y<<1)|x);
    }

    // Narrows to a pair of floats
    operator SIMDType<float, 2>()
    {
        double vals[2];
        store(vals);
        return SIMDType<float, 2>(static_cast<float>(vals[0]), static_cast<float>(vals[1]));
    }
};
// Four floats held in one SSE register (__m128)
template<>
struct SIMDType<float, 4> : public SIMDVector<float, __m128, 4>
{
    // Construction: unassigned, scalar broadcast, unaligned load, or raw register
    SIMDType() {}
    SIMDType(const float& a) { mVal = _mm_set1_ps(a); }
    SIMDType(const float* a) { mVal = _mm_loadu_ps(a); }
    SIMDType(__m128 a) : SIMDVector(a) {}

    // Unaligned store of all four elements
    void store(float *a) const { _mm_storeu_ps(a, mVal); }

    // Arithmetic
    friend SIMDType operator + (const SIMDType& a, const SIMDType& b) { return _mm_add_ps(a.mVal, b.mVal); }
    friend SIMDType operator - (const SIMDType& a, const SIMDType& b) { return _mm_sub_ps(a.mVal, b.mVal); }
    friend SIMDType operator * (const SIMDType& a, const SIMDType& b) { return _mm_mul_ps(a.mVal, b.mVal); }
    friend SIMDType operator / (const SIMDType& a, const SIMDType& b) { return _mm_div_ps(a.mVal, b.mVal); }

    SIMDType& operator += (const SIMDType& b) { return (*this = *this + b); }
    SIMDType& operator -= (const SIMDType& b) { return (*this = *this - b); }
    SIMDType& operator *= (const SIMDType& b) { return (*this = *this * b); }
    SIMDType& operator /= (const SIMDType& b) { return (*this = *this / b); }

    // Maths functions
    // NOTE(review): _mm_round_ps is an SSE4.1 intrinsic - confirm SSE4.1 is enabled wherever
    // round / trunc are used
    friend SIMDType sqrt(const SIMDType& a) { return _mm_sqrt_ps(a.mVal); }
    friend SIMDType round(const SIMDType& a) { return _mm_round_ps(a.mVal, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC); }
    friend SIMDType trunc(const SIMDType& a) { return _mm_round_ps(a.mVal, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC); }

    friend SIMDType min(const SIMDType& a, const SIMDType& b) { return _mm_min_ps(a.mVal, b.mVal); }
    friend SIMDType max(const SIMDType& a, const SIMDType& b) { return _mm_max_ps(a.mVal, b.mVal); }

    // Bitwise selection: takes the bits of b where c is set and the bits of a elsewhere
    friend SIMDType sel(const SIMDType& a, const SIMDType& b, const SIMDType& c) { return and_not(c, a) | (b & c); }

    // Bitwise operations (and_not computes (NOT a) AND b)
    friend SIMDType and_not(const SIMDType& a, const SIMDType& b) { return _mm_andnot_ps(a.mVal, b.mVal); }
    friend SIMDType operator & (const SIMDType& a, const SIMDType& b) { return _mm_and_ps(a.mVal, b.mVal); }
    friend SIMDType operator | (const SIMDType& a, const SIMDType& b) { return _mm_or_ps(a.mVal, b.mVal); }
    friend SIMDType operator ^ (const SIMDType& a, const SIMDType& b) { return _mm_xor_ps(a.mVal, b.mVal); }

    // Comparisons: each element of the result is all ones where the comparison holds and all
    // zeros where it does not. Each operator uses the intrinsic of the same sense (> is cmpgt,
    // < is cmplt, >= is cmpge, <= is cmple).
    friend SIMDType operator == (const SIMDType& a, const SIMDType& b) { return _mm_cmpeq_ps(a.mVal, b.mVal); }
    friend SIMDType operator != (const SIMDType& a, const SIMDType& b) { return _mm_cmpneq_ps(a.mVal, b.mVal); }
    friend SIMDType operator > (const SIMDType& a, const SIMDType& b) { return _mm_cmpgt_ps(a.mVal, b.mVal); }
    friend SIMDType operator < (const SIMDType& a, const SIMDType& b) { return _mm_cmplt_ps(a.mVal, b.mVal); }
    friend SIMDType operator >= (const SIMDType& a, const SIMDType& b) { return _mm_cmpge_ps(a.mVal, b.mVal); }
    friend SIMDType operator <= (const SIMDType& a, const SIMDType& b) { return _mm_cmple_ps(a.mVal, b.mVal); }

    // Shuffles elements of a and b with _mm_shuffle_ps, building the immediate as
    // (z << 6) | (y << 4) | (x << 2) | w
    template <int z, int y, int x, int w> static SIMDType shuffle(const SIMDType& a, const SIMDType& b)
    {
        return _mm_shuffle_ps(a.mVal, b.mVal, ((z<<6)|(y<<4)|(x<<2)|w));
    }

    // Widens to four doubles held as two SIMDType<double, 2> (low pair first)
    operator SizedVector<4, SIMDType<double, 2>>()
    {
        SizedVector<4, SIMDType<double, 2>> vec;

        vec.mData[0] = _mm_cvtps_pd(mVal);
        vec.mData[1] = _mm_cvtps_pd(_mm_movehl_ps(mVal, mVal));

        return vec;
    }
};
// Four 32-bit signed integers held in one SSE register (__m128i)
// NOTE(review): _mm_mullo_epi32, _mm_min_epi32 and _mm_max_epi32 are SSE4.1 intrinsics - confirm
// SSE4.1 is enabled wherever multiplication / min / max are used
template<>
struct SIMDType<int32_t, 4> : public SIMDVector<int32_t, __m128i, 4>
{
    // Construction: unassigned, scalar broadcast, unaligned load, or raw register
    SIMDType() {}
    SIMDType(const int32_t& a) { mVal = _mm_set1_epi32(a); }
    SIMDType(const int32_t* a) { mVal = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a)); }
    SIMDType(__m128i a) : SIMDVector(a) {}

    // Unaligned store of all four elements
    void store(int32_t *a) const { _mm_storeu_si128(reinterpret_cast<__m128i *>(a), mVal); }

    // Arithmetic. Multiplication uses _mm_mullo_epi32, which multiplies all four pairs of
    // elements and keeps the low 32 bits of each product. _mm_mul_epi32 is not element-wise: it
    // multiplies only elements 0 and 2 and produces two 64-bit results.
    friend SIMDType operator + (const SIMDType& a, const SIMDType& b) { return _mm_add_epi32(a.mVal, b.mVal); }
    friend SIMDType operator - (const SIMDType& a, const SIMDType& b) { return _mm_sub_epi32(a.mVal, b.mVal); }
    friend SIMDType operator * (const SIMDType& a, const SIMDType& b) { return _mm_mullo_epi32(a.mVal, b.mVal); }

    SIMDType& operator += (const SIMDType& b) { return (*this = *this + b); }
    SIMDType& operator -= (const SIMDType& b) { return (*this = *this - b); }
    SIMDType& operator *= (const SIMDType& b) { return (*this = *this * b); }

    friend SIMDType min(const SIMDType& a, const SIMDType& b) { return _mm_min_epi32(a.mVal, b.mVal); }
    friend SIMDType max(const SIMDType& a, const SIMDType& b) { return _mm_max_epi32(a.mVal, b.mVal); }

    // Converts to four floats
    operator SIMDType<float, 4>() { return SIMDType<float, 4>( _mm_cvtepi32_ps(mVal)); }

    // Converts to four doubles held as two SIMDType<double, 2> (low pair first). The 0xE
    // shuffle moves elements 2 and 3 into the low half ready for conversion.
    operator SizedVector<4, SIMDType<double, 2>>()
    {
        SizedVector<4, SIMDType<double, 2>> vec;

        vec.mData[0] = _mm_cvtepi32_pd(mVal);
        vec.mData[1] = _mm_cvtepi32_pd(_mm_shuffle_epi32(mVal, 0xE));

        return vec;
    }
};
#endif
#if (SIMD_COMPILER_SUPPORT_LEVEL >= SIMD_COMPILER_SUPPORT_AVX256)
// Four doubles held in one AVX register (__m256d)
template<>
struct SIMDType<double, 4> : public SIMDVector<double, __m256d, 4>
{
    // Construction: unassigned, scalar broadcast, unaligned load, raw register, or widened from
    // four floats / four 32-bit integers
    SIMDType() {}
    SIMDType(const double& a) : SIMDVector(_mm256_set1_pd(a)) {}
    SIMDType(const double* a) : SIMDVector(_mm256_loadu_pd(a)) {}
    SIMDType(__m256d a) : SIMDVector(a) {}
    SIMDType(const SIMDType<float, 4> &a) : SIMDVector(_mm256_cvtps_pd(a.mVal)) {}
    SIMDType(const SIMDType<int32_t, 4> &a) : SIMDVector(_mm256_cvtepi32_pd(a.mVal)) {}

    // Unaligned store of all four elements
    void store(double *a) const { _mm256_storeu_pd(a, mVal); }

    // Arithmetic
    friend SIMDType operator + (const SIMDType &a, const SIMDType &b) { return SIMDType(_mm256_add_pd(a.mVal, b.mVal)); }
    friend SIMDType operator - (const SIMDType &a, const SIMDType &b) { return SIMDType(_mm256_sub_pd(a.mVal, b.mVal)); }
    friend SIMDType operator * (const SIMDType &a, const SIMDType &b) { return SIMDType(_mm256_mul_pd(a.mVal, b.mVal)); }
    friend SIMDType operator / (const SIMDType &a, const SIMDType &b) { return SIMDType(_mm256_div_pd(a.mVal, b.mVal)); }

    SIMDType& operator += (const SIMDType& b) { mVal = _mm256_add_pd(mVal, b.mVal); return *this; }
    SIMDType& operator -= (const SIMDType& b) { mVal = _mm256_sub_pd(mVal, b.mVal); return *this; }
    SIMDType& operator *= (const SIMDType& b) { mVal = _mm256_mul_pd(mVal, b.mVal); return *this; }
    SIMDType& operator /= (const SIMDType& b) { mVal = _mm256_div_pd(mVal, b.mVal); return *this; }

    // Maths functions
    friend SIMDType sqrt(const SIMDType& a) { return SIMDType(_mm256_sqrt_pd(a.mVal)); }
    friend SIMDType round(const SIMDType& a) { return SIMDType(_mm256_round_pd(a.mVal, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); }
    friend SIMDType trunc(const SIMDType& a) { return SIMDType(_mm256_round_pd(a.mVal, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)); }

    friend SIMDType min(const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_min_pd(a.mVal, b.mVal)); }
    friend SIMDType max(const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_max_pd(a.mVal, b.mVal)); }

    // Bitwise selection: takes the bits of b where c is set and the bits of a elsewhere
    friend SIMDType sel(const SIMDType& a, const SIMDType& b, const SIMDType& c) { return (b & c) | and_not(c, a); }

    // Bitwise operations (and_not computes (NOT a) AND b)
    friend SIMDType and_not(const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_andnot_pd(a.mVal, b.mVal)); }
    friend SIMDType operator & (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_and_pd(a.mVal, b.mVal)); }
    friend SIMDType operator | (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_or_pd(a.mVal, b.mVal)); }
    friend SIMDType operator ^ (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_xor_pd(a.mVal, b.mVal)); }

    // Comparisons, each producing a per-element mask through _mm256_cmp_pd
    friend SIMDType operator == (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_cmp_pd(a.mVal, b.mVal, _CMP_EQ_OQ)); }
    friend SIMDType operator != (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_cmp_pd(a.mVal, b.mVal, _CMP_NEQ_UQ)); }
    friend SIMDType operator > (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_cmp_pd(a.mVal, b.mVal, _CMP_GT_OQ)); }
    friend SIMDType operator < (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_cmp_pd(a.mVal, b.mVal, _CMP_LT_OQ)); }
    friend SIMDType operator >= (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_cmp_pd(a.mVal, b.mVal, _CMP_GE_OQ)); }
    friend SIMDType operator <= (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_cmp_pd(a.mVal, b.mVal, _CMP_LE_OQ)); }

    // Conversions to four floats / four 32-bit integers
    operator SIMDType<float, 4>() { return SIMDType<float, 4>(_mm256_cvtpd_ps(mVal)); }
    operator SIMDType<int32_t, 4>() { return SIMDType<int32_t, 4>(_mm256_cvtpd_epi32(mVal)); }
};
// Eight floats held in one AVX register (__m256)
template<>
struct SIMDType<float, 8> : public SIMDVector<float, __m256, 8>
{
    // Construction: unassigned, scalar broadcast, unaligned load, or raw register
    SIMDType() {}
    SIMDType(const float& a) : SIMDVector(_mm256_set1_ps(a)) {}
    SIMDType(const float* a) : SIMDVector(_mm256_loadu_ps(a)) {}
    SIMDType(__m256 a) : SIMDVector(a) {}

    // Unaligned store of all eight elements
    void store(float *a) const { _mm256_storeu_ps(a, mVal); }

    // Arithmetic
    friend SIMDType operator + (const SIMDType &a, const SIMDType &b) { return SIMDType(_mm256_add_ps(a.mVal, b.mVal)); }
    friend SIMDType operator - (const SIMDType &a, const SIMDType &b) { return SIMDType(_mm256_sub_ps(a.mVal, b.mVal)); }
    friend SIMDType operator * (const SIMDType &a, const SIMDType &b) { return SIMDType(_mm256_mul_ps(a.mVal, b.mVal)); }
    friend SIMDType operator / (const SIMDType &a, const SIMDType &b) { return SIMDType(_mm256_div_ps(a.mVal, b.mVal)); }

    SIMDType& operator += (const SIMDType& b) { mVal = _mm256_add_ps(mVal, b.mVal); return *this; }
    SIMDType& operator -= (const SIMDType& b) { mVal = _mm256_sub_ps(mVal, b.mVal); return *this; }
    SIMDType& operator *= (const SIMDType& b) { mVal = _mm256_mul_ps(mVal, b.mVal); return *this; }
    SIMDType& operator /= (const SIMDType& b) { mVal = _mm256_div_ps(mVal, b.mVal); return *this; }

    // Maths functions
    friend SIMDType sqrt(const SIMDType& a) { return SIMDType(_mm256_sqrt_ps(a.mVal)); }
    friend SIMDType round(const SIMDType& a) { return SIMDType(_mm256_round_ps(a.mVal, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)); }
    friend SIMDType trunc(const SIMDType& a) { return SIMDType(_mm256_round_ps(a.mVal, _MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC)); }

    friend SIMDType min(const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_min_ps(a.mVal, b.mVal)); }
    friend SIMDType max(const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_max_ps(a.mVal, b.mVal)); }

    // Bitwise selection: takes the bits of b where c is set and the bits of a elsewhere
    friend SIMDType sel(const SIMDType& a, const SIMDType& b, const SIMDType& c) { return (b & c) | and_not(c, a); }

    // Bitwise operations (and_not computes (NOT a) AND b)
    friend SIMDType and_not(const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_andnot_ps(a.mVal, b.mVal)); }
    friend SIMDType operator & (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_and_ps(a.mVal, b.mVal)); }
    friend SIMDType operator | (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_or_ps(a.mVal, b.mVal)); }
    friend SIMDType operator ^ (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_xor_ps(a.mVal, b.mVal)); }

    // Comparisons, each producing a per-element mask through _mm256_cmp_ps
    friend SIMDType operator == (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_cmp_ps(a.mVal, b.mVal, _CMP_EQ_OQ)); }
    friend SIMDType operator != (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_cmp_ps(a.mVal, b.mVal, _CMP_NEQ_UQ)); }
    friend SIMDType operator > (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_cmp_ps(a.mVal, b.mVal, _CMP_GT_OQ)); }
    friend SIMDType operator < (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_cmp_ps(a.mVal, b.mVal, _CMP_LT_OQ)); }
    friend SIMDType operator >= (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_cmp_ps(a.mVal, b.mVal, _CMP_GE_OQ)); }
    friend SIMDType operator <= (const SIMDType& a, const SIMDType& b) { return SIMDType(_mm256_cmp_ps(a.mVal, b.mVal, _CMP_LE_OQ)); }

    // Widens to eight doubles held as two SIMDType<double, 4> (low 128-bit half first)
    operator SizedVector<8, SIMDType<double, 4>>()
    {
        typedef SizedVector<8, SIMDType<double, 4>> Widened;

        Widened widened;
        widened.mData[0] = _mm256_cvtps_pd(_mm256_extractf128_ps(mVal, 0));
        widened.mData[1] = _mm256_cvtps_pd(_mm256_extractf128_ps(mVal, 1));

        return widened;
    }
};
#endif
#if (SIMD_COMPILER_SUPPORT_LEVEL >= SIMD_COMPILER_SUPPORT_AVX512)
// Eight doubles held in one AVX-512 register (__m512d).
// Only AVX-512F intrinsics are used, matching the __AVX512F__ check that enables this type.
template<>
struct SIMDType<double, 8> : public SIMDVector<double, __m512d, 8>
{
    // Construction: unassigned, scalar broadcast, unaligned load, raw register, or widened from
    // eight floats
    SIMDType() {}
    SIMDType(const double& a) { mVal = _mm512_set1_pd(a); }
    SIMDType(const double* a) { mVal = _mm512_loadu_pd(a); }
    SIMDType(__m512d a) : SIMDVector(a) {}
    SIMDType(const SIMDType<float, 8> &a) { mVal = _mm512_cvtps_pd(a.mVal); }

    // Unaligned store of all eight elements
    void store(double *a) const { _mm512_storeu_pd(a, mVal); }

    // Arithmetic
    friend SIMDType operator + (const SIMDType &a, const SIMDType &b) { return _mm512_add_pd(a.mVal, b.mVal); }
    friend SIMDType operator - (const SIMDType &a, const SIMDType &b) { return _mm512_sub_pd(a.mVal, b.mVal); }
    friend SIMDType operator * (const SIMDType &a, const SIMDType &b) { return _mm512_mul_pd(a.mVal, b.mVal); }
    friend SIMDType operator / (const SIMDType &a, const SIMDType &b) { return _mm512_div_pd(a.mVal, b.mVal); }

    SIMDType& operator += (const SIMDType& b) { return (*this = *this + b); }
    SIMDType& operator -= (const SIMDType& b) { return (*this = *this - b); }
    SIMDType& operator *= (const SIMDType& b) { return (*this = *this * b); }
    SIMDType& operator /= (const SIMDType& b) { return (*this = *this / b); }

    // Maths functions
    friend SIMDType sqrt(const SIMDType& a) { return _mm512_sqrt_pd(a.mVal); }

    friend SIMDType min(const SIMDType& a, const SIMDType& b) { return _mm512_min_pd(a.mVal, b.mVal); }
    friend SIMDType max(const SIMDType& a, const SIMDType& b) { return _mm512_max_pd(a.mVal, b.mVal); }

    // Bitwise selection: takes the bits of b where c is set and the bits of a elsewhere
    friend SIMDType sel(const SIMDType& a, const SIMDType& b, const SIMDType& c) { return and_not(c, a) | (b & c); }

    // Bitwise operations (and_not computes (NOT a) AND b). These go through the integer forms
    // because the _pd bitwise intrinsics belong to AVX-512DQ rather than AVX-512F.
    friend SIMDType and_not(const SIMDType& a, const SIMDType& b) { return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(a.mVal), _mm512_castpd_si512(b.mVal))); }
    friend SIMDType operator & (const SIMDType& a, const SIMDType& b) { return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a.mVal), _mm512_castpd_si512(b.mVal))); }
    friend SIMDType operator | (const SIMDType& a, const SIMDType& b) { return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(a.mVal), _mm512_castpd_si512(b.mVal))); }
    friend SIMDType operator ^ (const SIMDType& a, const SIMDType& b) { return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(a.mVal), _mm512_castpd_si512(b.mVal))); }

    // Comparisons. _mm512_cmp_pd_mask yields a bit mask (__mmask8), which is expanded into a
    // vector whose elements are all ones or all zeros so that the result works with sel() and
    // the bitwise operators in the same way as the narrower vector types.
    friend SIMDType operator == (const SIMDType& a, const SIMDType& b) { return from_mask(_mm512_cmp_pd_mask(a.mVal, b.mVal, _CMP_EQ_OQ)); }
    friend SIMDType operator != (const SIMDType& a, const SIMDType& b) { return from_mask(_mm512_cmp_pd_mask(a.mVal, b.mVal, _CMP_NEQ_UQ)); }
    friend SIMDType operator > (const SIMDType& a, const SIMDType& b) { return from_mask(_mm512_cmp_pd_mask(a.mVal, b.mVal, _CMP_GT_OQ)); }
    friend SIMDType operator < (const SIMDType& a, const SIMDType& b) { return from_mask(_mm512_cmp_pd_mask(a.mVal, b.mVal, _CMP_LT_OQ)); }
    friend SIMDType operator >= (const SIMDType& a, const SIMDType& b) { return from_mask(_mm512_cmp_pd_mask(a.mVal, b.mVal, _CMP_GE_OQ)); }
    friend SIMDType operator <= (const SIMDType& a, const SIMDType& b) { return from_mask(_mm512_cmp_pd_mask(a.mVal, b.mVal, _CMP_LE_OQ)); }

    // Narrows to eight floats
    operator SIMDType<float, 8>() { return _mm512_cvtpd_ps(mVal); }

private:

    // Expands a comparison bit mask into a vector: elements whose mask bit is set become all
    // ones and the remainder all zeros
    static __m512d from_mask(__mmask8 mask)
    {
        return _mm512_castsi512_pd(_mm512_maskz_set1_epi64(mask, -1LL));
    }
};
// Sixteen floats held in one AVX-512 register (__m512).
// Only AVX-512F intrinsics are used, matching the __AVX512F__ check that enables this type.
template<>
struct SIMDType<float, 16> : public SIMDVector<float, __m512, 16>
{
    // Construction: unassigned, scalar broadcast, unaligned load, or raw register
    SIMDType() {}
    SIMDType(const float& a) { mVal = _mm512_set1_ps(a); }
    SIMDType(const float* a) { mVal = _mm512_loadu_ps(a); }
    SIMDType(__m512 a) : SIMDVector(a) {}

    // Unaligned store of all sixteen elements
    void store(float *a) const { _mm512_storeu_ps(a, mVal); }

    // Arithmetic
    friend SIMDType operator + (const SIMDType &a, const SIMDType &b) { return _mm512_add_ps(a.mVal, b.mVal); }
    friend SIMDType operator - (const SIMDType &a, const SIMDType &b) { return _mm512_sub_ps(a.mVal, b.mVal); }
    friend SIMDType operator * (const SIMDType &a, const SIMDType &b) { return _mm512_mul_ps(a.mVal, b.mVal); }
    friend SIMDType operator / (const SIMDType &a, const SIMDType &b) { return _mm512_div_ps(a.mVal, b.mVal); }

    SIMDType& operator += (const SIMDType& b) { return (*this = *this + b); }
    SIMDType& operator -= (const SIMDType& b) { return (*this = *this - b); }
    SIMDType& operator *= (const SIMDType& b) { return (*this = *this * b); }
    SIMDType& operator /= (const SIMDType& b) { return (*this = *this / b); }

    // Maths functions
    friend SIMDType sqrt(const SIMDType& a) { return _mm512_sqrt_ps(a.mVal); }

    friend SIMDType min(const SIMDType& a, const SIMDType& b) { return _mm512_min_ps(a.mVal, b.mVal); }
    friend SIMDType max(const SIMDType& a, const SIMDType& b) { return _mm512_max_ps(a.mVal, b.mVal); }

    // Bitwise selection: takes the bits of b where c is set and the bits of a elsewhere
    friend SIMDType sel(const SIMDType& a, const SIMDType& b, const SIMDType& c) { return and_not(c, a) | (b & c); }

    // Bitwise operations (and_not computes (NOT a) AND b). These go through the integer forms
    // because the _ps bitwise intrinsics belong to AVX-512DQ rather than AVX-512F.
    friend SIMDType and_not(const SIMDType& a, const SIMDType& b) { return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(a.mVal), _mm512_castps_si512(b.mVal))); }
    friend SIMDType operator & (const SIMDType& a, const SIMDType& b) { return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a.mVal), _mm512_castps_si512(b.mVal))); }
    friend SIMDType operator | (const SIMDType& a, const SIMDType& b) { return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(a.mVal), _mm512_castps_si512(b.mVal))); }
    friend SIMDType operator ^ (const SIMDType& a, const SIMDType& b) { return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(a.mVal), _mm512_castps_si512(b.mVal))); }

    // Comparisons. _mm512_cmp_ps_mask yields a bit mask (__mmask16), which is expanded into a
    // vector whose elements are all ones or all zeros so that the result works with sel() and
    // the bitwise operators in the same way as the narrower vector types.
    friend SIMDType operator == (const SIMDType& a, const SIMDType& b) { return from_mask(_mm512_cmp_ps_mask(a.mVal, b.mVal, _CMP_EQ_OQ)); }
    friend SIMDType operator != (const SIMDType& a, const SIMDType& b) { return from_mask(_mm512_cmp_ps_mask(a.mVal, b.mVal, _CMP_NEQ_UQ)); }
    friend SIMDType operator > (const SIMDType& a, const SIMDType& b) { return from_mask(_mm512_cmp_ps_mask(a.mVal, b.mVal, _CMP_GT_OQ)); }
    friend SIMDType operator < (const SIMDType& a, const SIMDType& b) { return from_mask(_mm512_cmp_ps_mask(a.mVal, b.mVal, _CMP_LT_OQ)); }
    friend SIMDType operator >= (const SIMDType& a, const SIMDType& b) { return from_mask(_mm512_cmp_ps_mask(a.mVal, b.mVal, _CMP_GE_OQ)); }
    friend SIMDType operator <= (const SIMDType& a, const SIMDType& b) { return from_mask(_mm512_cmp_ps_mask(a.mVal, b.mVal, _CMP_LE_OQ)); }

private:

    // Expands a comparison bit mask into a vector: elements whose mask bit is set become all
    // ones and the remainder all zeros
    static __m512 from_mask(__mmask16 mask)
    {
        return _mm512_castsi512_ps(_mm512_maskz_set1_epi32(mask, -1));
    }
};
#endif
// abs functions
// Absolute value of a scalar double.
// std::fabs clears the sign, as the sign-bit mask did, without reading the object through a
// pointer to an unrelated integer type.
static inline SIMDType<double, 1> abs(const SIMDType<double, 1> a)
{
    return std::fabs(a.mVal);
}
// Absolute value of a scalar float.
// std::fabs clears the sign, as the sign-bit mask did, without reading the object through a
// pointer to an unrelated integer type.
static inline SIMDType<float, 1> abs(const SIMDType<float, 1> a)
{
    return std::fabs(a.mVal);
}
// Absolute value of each element of a vector of doubles, computed by masking off the sign bit.
template <int N> SIMDType<double, N> abs(const SIMDType<double, N> a)
{
    // Every bit set except the sign bit
    const uint64_t bit_mask_64 = 0x7FFFFFFFFFFFFFFFU;

    // Copy the bit pattern into a double with memcpy, which is the well-defined way to
    // reinterpret the bytes of one type as another
    double bit_mask_64d;
    std::memcpy(&bit_mask_64d, &bit_mask_64, sizeof(bit_mask_64d));

    return a & SIMDType<double, N>(bit_mask_64d);
}
// Absolute value of each element of a vector of floats, computed by masking off the sign bit.
template <int N> SIMDType<float, N> abs(const SIMDType<float, N> a)
{
    // Every bit set except the sign bit
    const uint32_t bit_mask_32 = 0x7FFFFFFFU;

    // Copy exactly the four bytes of the pattern into a float. Reading the 32-bit mask through
    // a pointer to double would access eight bytes and produce an unrelated value.
    float bit_mask_32f;
    std::memcpy(&bit_mask_32f, &bit_mask_32, sizeof(bit_mask_32f));

    return a & SIMDType<float, N>(bit_mask_32f);
}
#endif