Skip to content


Added proper bounds checking to SIMD wide-exp function, as it was blo…
Browse files Browse the repository at this point in the history
…wing up in the exp(-hnu/kT) term for continua at very short wavelengths (< 1 nm). This is very unlikely to have affected results, as it produces nans as soon as it explodes
  • Loading branch information
Goobley committed Nov 2, 2022
1 parent bdafe69 commit 91ba365
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 2 deletions.
37 changes: 36 additions & 1 deletion Source/SimdImpl_AVX2FMA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,24 @@ inline __m256d pow2n(const __m256d n) {
__m256d d = _mm256_castsi256_pd(c); // bit-cast back to double
return d;

// Lane-wise absolute value of four packed doubles.
// Clears bit 63 (the IEEE-754 sign bit) of every 64-bit lane and leaves the
// remaining bits untouched, so NaN payloads and infinities keep their
// magnitude bits.
// The mask is materialised directly in a vector register instead of
// type-punning an integer array through a `double*`, which would violate
// strict aliasing.
inline __m256d abs_pd_avx(__m256d xIn)
{
    const __m256d signCleared = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FFFFFFFFFFFFFFFLL));
    return _mm256_and_pd(xIn, signCleared);
}

// Per-lane finiteness test for four packed doubles.
// Each 64-bit lane of the result is all-ones when the matching input lane is
// finite, and all-zeros when its exponent field is entirely set (+/-inf or NaN).
inline __m256i finite_mask_avx(__m256d x)
{
    // The 11 exponent bits, positioned where they sit once the sign bit has
    // been shifted out of the top of the lane.
    const __m256i expAllOnes = _mm256_set1_epi64x(0xFFE0000000000000);

    const __m256i bits = _mm256_castpd_si256(x);
    const __m256i noSign = _mm256_slli_epi64(bits, 1); // drop the sign bit
    const __m256i expField = _mm256_and_si256(noSign, expAllOnes);

    // Lanes whose exponent is all ones are the non-finite ones; invert that.
    const __m256i nonFinite = _mm256_cmpeq_epi64(expField, expAllOnes);
    return _mm256_xor_si256(nonFinite, _mm256_set1_epi64x(-1));
}

// NOTE(cmo): AVX impl of exp_pd, based on Agner Fog's vector class
// The implementation here, based on a classic Taylor series, rather than a
Expand Down Expand Up @@ -98,7 +116,24 @@ inline __m256d exp_pd_avx(__m256d xIn)
__m256d n2 = pow2n(r);
z = _mm256_mul_pd(_mm256_add_pd(z, _mm256_set1_pd(1.0)), n2);

// TODO(cmo): Probably should have some of the nan/inf error handling code.
// NOTE(cmo): Error/edge-case handling code. The previous warning was prophetic.
// abs(xIn) < xMax
__m256i mask1 = _mm256_castpd_si256(_mm256_cmp_pd(abs_pd_avx(xIn), _mm256_set1_pd(xMax), 1));
__m256i mask2 = finite_mask_avx(xIn);
__m256i mask = mask1 & mask2;
// if all mask is set, then exit normally.
if (_mm256_testc_si256(mask, _mm256_set1_epi64x(-1)) != 0)
return z;

__m256d maskd = _mm256_castsi256_pd(mask);
__m256d inputSign = _mm256_and_pd(xIn, _mm256_set1_pd(-0.0));
__m256d inf = _mm256_castsi256_pd(_mm256_set1_epi64x(0x7FF0000000000000));
r = _mm256_blendv_pd(inf, _mm256_set1_pd(0.0), inputSign); // values for over/underflow/inf
z = _mm256_blendv_pd(r, z, maskd); // +/- underflow

__m256d nan_mask = _mm256_cmp_pd(xIn, xIn, 3); // check for unordered comparison, i.e. a value is nan
z = _mm256_blendv_pd(z, xIn, nan_mask); // set output to nan if input is nan

return z;

Expand Down
35 changes: 34 additions & 1 deletion Source/SimdImpl_AVX512.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,23 @@ inline __m512d pow2n(const __m512d n) {
__m512d d = _mm512_castsi512_pd(c); // bit-cast back to double
return d;

// Lane-wise absolute value of eight packed doubles.
// Clears bit 63 (the IEEE-754 sign bit) of every 64-bit lane and leaves the
// remaining bits untouched, so NaN payloads and infinities keep their
// magnitude bits.
// The mask is materialised directly in a vector register instead of
// type-punning an integer array through a `double*`, which would violate
// strict aliasing.
inline __m512d abs_pd_avx512(__m512d xIn)
{
    const __m512d signCleared = _mm512_castsi512_pd(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFFLL));
    return _mm512_and_pd(xIn, signCleared);
}

// Per-lane finiteness test for eight packed doubles.
// Bit k of the returned mask is 1 when lane k is finite, and 0 when its
// exponent field is entirely set (+/-inf or NaN).
inline __mmask8 finite_mask_avx512(__m512d x)
{
    // The 11 exponent bits, positioned where they sit once the sign bit has
    // been shifted out of the top of the lane.
    const __m512i expAllOnes = _mm512_set1_epi64(0xFFE0000000000000);

    const __m512i bits = _mm512_castpd_si512(x);
    const __m512i noSign = _mm512_slli_epi64(bits, 1); // drop the sign bit
    const __m512i expField = _mm512_and_si512(noSign, expAllOnes);

    // A lane is finite exactly when its exponent field is not all ones.
    return _mm512_cmpneq_epi64_mask(expField, expAllOnes);
}

// NOTE(cmo): AVX impl of exp_pd, based on Agner Fog's vector class
// The implementation here, based on a classic Taylor series, rather than a
Expand Down Expand Up @@ -98,7 +115,23 @@ inline __m512d exp_pd_avx512(__m512d xIn)
__m512d n2 = pow2n(r);
z = _mm512_mul_pd(_mm512_add_pd(z, _mm512_set1_pd(1.0)), n2);

// TODO(cmo): Probably should have some of the nan/inf error handling code.
// NOTE(cmo): Error/edge-case handling code. The previous warning was prophetic.
// abs(xIn) < xMax
__mmask8 mask1 = _mm512_cmp_pd_mask(abs_pd_avx512(xIn), _mm512_set1_pd(xMax), 1);
__mmask8 mask2 = finite_mask_avx512(xIn);
__mmask8 mask = mask1 & mask2;
// if all mask is set, then exit normally.
if (mask == 255)
return z;

// __m256d maskd = _mm256_castsi256_pd(mask);
__mmask8 inputNegative = _mm512_cmp_pd_mask(xIn, _mm512_set1_pd(0.0), 1);
__m512d inf = _mm512_castsi512_pd(_mm512_set1_epi64(0x7FF0000000000000));
r = _mm512_mask_blend_pd(inputNegative, inf, _mm512_set1_pd(0.0)); // values for over/underflow/inf
z = _mm512_mask_blend_pd(mask, r, z); // +/- underflow

__mmask8 nan_mask = _mm512_cmp_pd_mask(xIn, xIn, 3); // check for unordered comparison, i.e. a value is nan
z = _mm512_mask_blend_pd(nan_mask, z, xIn); // set output to nan if input is nan
return z;

Expand Down

0 comments on commit 91ba365

Please sign in to comment.