From a3110a1fe1220c98be5d82e6b8e5a3e7496ae610 Mon Sep 17 00:00:00 2001 From: KdotJPG Date: Tue, 13 Feb 2024 23:16:55 -0500 Subject: [PATCH] Simplex Rework + Domain Warp --- .../FastNoise/Generators/DomainWarpSimplex.h | 60 +- .../Generators/DomainWarpSimplex.inl | 1216 +++++++++++++-- include/FastNoise/Generators/Generator.h | 24 + include/FastNoise/Generators/Perlin.inl | 28 +- include/FastNoise/Generators/Simplex.h | 54 +- include/FastNoise/Generators/Simplex.inl | 1363 +++++++++++------ include/FastNoise/Generators/Utils.inl | 936 +++++++++-- src/FastNoise/FastSIMD_Build.inl | 4 +- 8 files changed, 2866 insertions(+), 819 deletions(-) diff --git a/include/FastNoise/Generators/DomainWarpSimplex.h b/include/FastNoise/Generators/DomainWarpSimplex.h index 5c1f72f..6f1dfb8 100644 --- a/include/FastNoise/Generators/DomainWarpSimplex.h +++ b/include/FastNoise/Generators/DomainWarpSimplex.h @@ -1,19 +1,41 @@ -#pragma once -#include "Generator.h" -#include "DomainWarp.h" - -namespace FastNoise -{ - class DomainWarpOpenSimplex : public virtual DomainWarp - { - public: const Metadata& GetMetadata() const override; - }; - -#ifdef FASTNOISE_METADATA - template<> - struct MetadataT : MetadataT - { - SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override; - }; -#endif -} +#pragma once +#include "Generator.h" +#include "DomainWarp.h" + +namespace FastNoise +{ + class DomainWarpSimplex : public virtual DomainWarp + { + public: + const Metadata& GetMetadata() const override; + + void SetType( SimplexType value ) { mType = value; } + void SetVectorizationScheme( VectorizationScheme value ) { mVectorizationScheme = value; } + + protected: + SimplexType mType = SimplexType::Standard; + VectorizationScheme mVectorizationScheme = VectorizationScheme::OrthogonalGradientMatrix; + }; + +#ifdef FASTNOISE_METADATA + template<> + struct MetadataT : MetadataT + { + SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override; + + MetadataT() + { + this->AddVariableEnum( + { "Type", 
"Noise character style" }, + SimplexType::Standard, &DomainWarpSimplex::SetType, + kSimplexType_Strings + ); + this->AddVariableEnum( + { "Vectorization Scheme", "Construction used by the noise to produce a vector output" }, + VectorizationScheme::OrthogonalGradientMatrix, &DomainWarpSimplex::SetVectorizationScheme, + kVectorizationScheme_Strings + ); + } + }; +#endif +} diff --git a/include/FastNoise/Generators/DomainWarpSimplex.inl b/include/FastNoise/Generators/DomainWarpSimplex.inl index 8e6e361..d40c2d0 100644 --- a/include/FastNoise/Generators/DomainWarpSimplex.inl +++ b/include/FastNoise/Generators/DomainWarpSimplex.inl @@ -2,176 +2,1084 @@ #include "Utils.inl" template -class FastSIMD::DispatchClass final : public virtual FastNoise::DomainWarpOpenSimplex, public FastSIMD::DispatchClass +class FastSIMD::DispatchClass final : public virtual FastNoise::DomainWarpSimplex, public FastSIMD::DispatchClass { public: - float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v& xOut, float32v& yOut ) const + float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v& xOut, float32v& yOut ) const final { - float32v xs = FS::Floor( x ); - float32v ys = FS::Floor( y ); + switch( mType ) { + case SimplexType::Standard: + switch( mVectorizationScheme ) { + case VectorizationScheme::OrthogonalGradientMatrix: + return Warp_Standard( seed, warpAmp, x, y, xOut, yOut ); + case VectorizationScheme::GradientOuterProduct: + return Warp_Standard( seed, warpAmp, x, y, xOut, yOut ); + } + case SimplexType::Smooth: + switch( mVectorizationScheme ) { + case VectorizationScheme::OrthogonalGradientMatrix: + return Warp_Smooth( seed, warpAmp, x, y, xOut, yOut ); + case VectorizationScheme::GradientOuterProduct: + return Warp_Smooth( seed, warpAmp, x, y, xOut, yOut ); + } + } + } + + float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v& xOut, float32v& yOut, float32v& 
zOut ) const final + { + switch( mType ) { + case SimplexType::Standard: + switch( mVectorizationScheme ) { + case VectorizationScheme::OrthogonalGradientMatrix: + return Warp_Standard( seed, warpAmp, x, y, z, xOut, yOut, zOut ); + case VectorizationScheme::GradientOuterProduct: + return Warp_Standard( seed, warpAmp, x, y, z, xOut, yOut, zOut ); + } + case SimplexType::Smooth: + switch( mVectorizationScheme ) { + case VectorizationScheme::OrthogonalGradientMatrix: + return Warp_Smooth( seed, warpAmp, x, y, z, xOut, yOut, zOut ); + case VectorizationScheme::GradientOuterProduct: + return Warp_Smooth( seed, warpAmp, x, y, z, xOut, yOut, zOut ); + } + } + } + + float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v w, float32v& xOut, float32v& yOut, float32v& zOut, float32v& wOut ) const final + { + switch( mType ) { + case SimplexType::Standard: + switch( mVectorizationScheme ) { + case VectorizationScheme::OrthogonalGradientMatrix: + return Warp_Standard( seed, warpAmp, x, y, z, w, xOut, yOut, zOut, wOut ); + case VectorizationScheme::GradientOuterProduct: + return Warp_Standard( seed, warpAmp, x, y, z, w, xOut, yOut, zOut, wOut ); + } + case SimplexType::Smooth: + switch( mVectorizationScheme ) { + case VectorizationScheme::OrthogonalGradientMatrix: + return Warp_Smooth( seed, warpAmp, x, y, z, w, xOut, yOut, zOut, wOut ); + case VectorizationScheme::GradientOuterProduct: + return Warp_Smooth( seed, warpAmp, x, y, z, w, xOut, yOut, zOut, wOut ); + } + } + } - int32v x0 = FS::Convert( xs ) * int32v( Primes::X ); - int32v y0 = FS::Convert( ys ) * int32v( Primes::Y ); - int32v x1 = x0 + int32v( Primes::X ); - int32v y1 = y0 + int32v( Primes::Y ); +protected: + template + float32v FS_VECTORCALL Warp_Standard( int32v seed, float32v warpAmp, float32v x, float32v y, float32v& xOut, float32v& yOut ) const + { + constexpr double kRoot3 = 1.7320508075688772935274463415059; + constexpr double kSkew2 = 1.0 / ( kRoot3 + 1.0 ); + 
constexpr double kUnskew2 = -1.0 / ( kRoot3 + 3.0 ); + constexpr double kFalloffRadiusSquared = 0.5; + + float32v skewDelta = float32v( kSkew2 ) * ( x + y ); + float32v xSkewed = x + skewDelta; + float32v ySkewed = y + skewDelta; + + float32v xSkewedBase = FS::Floor( xSkewed ); + float32v ySkewedBase = FS::Floor( ySkewed ); + float32v dxSkewed = xSkewed - xSkewedBase; + float32v dySkewed = ySkewed - ySkewedBase; + + int32v xPrimedBase = FS::Convert( xSkewedBase ) * int32v( Primes::X ); + int32v yPrimedBase = FS::Convert( ySkewedBase ) * int32v( Primes::Y ); - xs = InterpHermite( x - xs ); - ys = InterpHermite( y - ys ); + mask32v xGreaterEqualY = dxSkewed >= dySkewed; - #define GRADIENT_COORD( _x, _y )\ - int32v hash##_x##_y = HashPrimesHB(seed, x##_x, y##_y );\ - float32v x##_x##_y = FS::Convert( hash##_x##_y & int32v( 0xffff ) );\ - float32v y##_x##_y = FS::Convert( (hash##_x##_y >> 16) & int32v( 0xffff ) ); + float32v unskewDelta = float32v( kUnskew2 ) * ( dxSkewed + dySkewed ); + float32v dx0 = dxSkewed + unskewDelta; + float32v dy0 = dySkewed + unskewDelta; - GRADIENT_COORD( 0, 0 ); - GRADIENT_COORD( 1, 0 ); - GRADIENT_COORD( 0, 1 ); - GRADIENT_COORD( 1, 1 ); + float32v dx1 = FS::MaskedIncrement( ~xGreaterEqualY, dx0 ) - float32v( kUnskew2 + 1 ); + float32v dy1 = FS::MaskedIncrement( xGreaterEqualY, dy0 ) - float32v( kUnskew2 + 1 ); + float32v dx2 = dx0 - float32v( kUnskew2 * 2 + 1 ); + float32v dy2 = dy0 - float32v( kUnskew2 * 2 + 1 ); - #undef GRADIENT_COORD + float32v falloff0 = FS::FNMulAdd( dx0, dx0, FS::FNMulAdd( dy0, dy0, float32v( kFalloffRadiusSquared ) ) ); + float32v falloff1 = FS::FNMulAdd( dx1, dx1, FS::FNMulAdd( dy1, dy1, float32v( kFalloffRadiusSquared ) ) ); + float32v falloff2 = falloff0 + FS::FMulAdd( unskewDelta, + float32v( -4.0 * ( kRoot3 + 2.0 ) / ( kRoot3 + 3.0 ) ), + float32v( -2.0 / 3.0 ) ); - float32v normalise = float32v( 1.0f / (0xffff / 2.0f) ); + falloff0 = FS::Max( falloff0, float32v( 0 ) ); + falloff1 = FS::Max( falloff1, 
float32v( 0 ) ); + falloff2 = FS::Max( falloff2, float32v( 0 ) ); - float32v xWarp = (Lerp( Lerp( x00, x10, xs ), Lerp( x01, x11, xs ), ys ) - float32v( 0xffff / 2.0f )) * normalise; - float32v yWarp = (Lerp( Lerp( y00, y10, xs ), Lerp( y01, y11, xs ), ys ) - float32v( 0xffff / 2.0f )) * normalise; + falloff0 *= falloff0; falloff0 *= falloff0; + falloff1 *= falloff1; falloff1 *= falloff1; + falloff2 *= falloff2; falloff2 *= falloff2; - xOut = FS::FMulAdd( xWarp, warpAmp, xOut ); - yOut = FS::FMulAdd( yWarp, warpAmp, yOut ); + float32v valueX( 0 ); + float32v valueY( 0 ); - float32v warpLengthSq = FS::FMulAdd( xWarp, xWarp, yWarp * yWarp ); + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimedBase, yPrimedBase ), dx0, dy0, falloff0, valueX, valueY ); + ApplyVectorContributionSimplex( HashPrimes( seed, FS::MaskedAdd( xGreaterEqualY, xPrimedBase, int32v( Primes::X ) ), FS::InvMaskedAdd( xGreaterEqualY, yPrimedBase, int32v( Primes::Y ) ) ), dx1, dy1, falloff1, valueX, valueY ); + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ) ), dx2, dy2, falloff2, valueX, valueY ); - return warpLengthSq * FS::InvSqrt( warpLengthSq ); + // Scheme is a compile-time value, so this ternary is already a constant expression; 'constexpr' is a specifier and must not prefix the parenthesised condition. + constexpr double kBounding = ( Scheme == VectorizationScheme::GradientOuterProduct ) ? 
+ 49.918426513671875 / 2.0 : + 70.1480577066486; + + warpAmp *= float32v( kBounding ); + xOut = FS::FMulAdd( valueX, warpAmp, xOut ); + yOut = FS::FMulAdd( valueY, warpAmp, yOut ); + + float32v warpLengthSq = FS::FMulAdd( valueY, valueY, valueX * valueX ); + return warpLengthSq * FS::InvSqrt( warpLengthSq ) * warpAmp; } - - float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v& xOut, float32v& yOut, float32v& zOut ) const + + template + float32v FS_VECTORCALL Warp_Standard( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v& xOut, float32v& yOut, float32v& zOut ) const { - float32v xs = FS::Floor( x ); - float32v ys = FS::Floor( y ); - float32v zs = FS::Floor( z ); - - int32v x0 = FS::Convert( xs ) * int32v( Primes::X ); - int32v y0 = FS::Convert( ys ) * int32v( Primes::Y ); - int32v z0 = FS::Convert( zs ) * int32v( Primes::Z ); - int32v x1 = x0 + int32v( Primes::X ); - int32v y1 = y0 + int32v( Primes::Y ); - int32v z1 = z0 + int32v( Primes::Z ); - - xs = InterpHermite( x - xs ); - ys = InterpHermite( y - ys ); - zs = InterpHermite( z - zs ); - - #define GRADIENT_COORD( _x, _y, _z )\ - int32v hash##_x##_y##_z = HashPrimesHB( seed, x##_x, y##_y, z##_z );\ - float32v x##_x##_y##_z = FS::Convert( hash##_x##_y##_z & int32v( 0x3ff ) );\ - float32v y##_x##_y##_z = FS::Convert( (hash##_x##_y##_z >> 10) & int32v( 0x3ff ) );\ - float32v z##_x##_y##_z = FS::Convert( (hash##_x##_y##_z >> 20) & int32v( 0x3ff ) ); - - GRADIENT_COORD( 0, 0, 0 ); - GRADIENT_COORD( 1, 0, 0 ); - GRADIENT_COORD( 0, 1, 0 ); - GRADIENT_COORD( 1, 1, 0 ); - GRADIENT_COORD( 0, 0, 1 ); - GRADIENT_COORD( 1, 0, 1 ); - GRADIENT_COORD( 0, 1, 1 ); - GRADIENT_COORD( 1, 1, 1 ); - - #undef GRADIENT_COORD - - float32v x0z = Lerp( Lerp( x000, x100, xs ), Lerp( x010, x110, xs ), ys ); - float32v y0z = Lerp( Lerp( y000, y100, xs ), Lerp( y010, y110, xs ), ys ); - float32v z0z = Lerp( Lerp( z000, z100, xs ), Lerp( z010, z110, xs ), ys ); - - 
float32v x1z = Lerp( Lerp( x001, x101, xs ), Lerp( x011, x111, xs ), ys ); - float32v y1z = Lerp( Lerp( y001, y101, xs ), Lerp( y011, y111, xs ), ys ); - float32v z1z = Lerp( Lerp( z001, z101, xs ), Lerp( z011, z111, xs ), ys ); - - float32v normalise = float32v( 1.0f / (0x3ff / 2.0f) ); - - float32v xWarp = (Lerp( x0z, x1z, zs ) - float32v( 0x3ff / 2.0f )) * normalise; - float32v yWarp = (Lerp( y0z, y1z, zs ) - float32v( 0x3ff / 2.0f )) * normalise; - float32v zWarp = (Lerp( z0z, z1z, zs ) - float32v( 0x3ff / 2.0f )) * normalise; - - xOut = FS::FMulAdd( xWarp, warpAmp, xOut ); - yOut = FS::FMulAdd( yWarp, warpAmp, yOut ); - zOut = FS::FMulAdd( zWarp, warpAmp, zOut ); - - float32v warpLengthSq = FS::FMulAdd( xWarp, xWarp, FS::FMulAdd( yWarp, yWarp, zWarp * zWarp ) ); - - return warpLengthSq * FS::InvSqrt( warpLengthSq ); + constexpr double kSkew3 = 1.0 / 3.0; + constexpr double kReflectUnskew3 = -1.0 / 2.0; + constexpr double kFalloffRadiusSquared = 0.6; + + float32v skewDelta = float32v( kSkew3 ) * ( x + y + z ); + float32v xSkewed = x + skewDelta; + float32v ySkewed = y + skewDelta; + float32v zSkewed = z + skewDelta; + + float32v xSkewedBase = FS::Floor( xSkewed ); + float32v ySkewedBase = FS::Floor( ySkewed ); + float32v zSkewedBase = FS::Floor( zSkewed ); + float32v dxSkewed = xSkewed - xSkewedBase; + float32v dySkewed = ySkewed - ySkewedBase; + float32v dzSkewed = zSkewed - zSkewedBase; + + int32v xPrimedBase = FS::Convert( xSkewedBase ) * int32v( Primes::X ); + int32v yPrimedBase = FS::Convert( ySkewedBase ) * int32v( Primes::Y ); + int32v zPrimedBase = FS::Convert( zSkewedBase ) * int32v( Primes::Z ); + + mask32v xGreaterEqualY = dxSkewed >= dySkewed; + mask32v yGreaterEqualZ = dySkewed >= dzSkewed; + mask32v xGreaterEqualZ = dxSkewed >= dzSkewed; + + float32v unskewDelta = float32v( kReflectUnskew3 ) * ( dxSkewed + dySkewed + dzSkewed ); + float32v dx0 = dxSkewed + unskewDelta; + float32v dy0 = dySkewed + unskewDelta; + float32v dz0 = dzSkewed + 
unskewDelta;
+
+        // Vertex 1 moves one step along the axis whose skewed offset is the largest
+        // (assuming FS::BitwiseAndNot( a, b ) computes a & ~b -- TODO confirm).
+        mask32v maskX1 = xGreaterEqualY & xGreaterEqualZ;
+        mask32v maskY1 = FS::BitwiseAndNot( yGreaterEqualZ, xGreaterEqualY );
+        mask32v maskZ1 = FS::BitwiseAndNot( ~xGreaterEqualZ, yGreaterEqualZ );
+
+        // nMask*2 flags the axis with the smallest skewed offset; these masks are handed to
+        // FS::InvMaskedAdd when hashing vertex 2 below.
+        mask32v nMaskX2 = ~( xGreaterEqualY | xGreaterEqualZ );
+        mask32v nMaskY2 = xGreaterEqualY & ~yGreaterEqualZ;
+        mask32v nMaskZ2 = xGreaterEqualZ & yGreaterEqualZ;
+
+        float32v dx3 = dx0 - float32v( kReflectUnskew3 * 3 + 1 );
+        float32v dy3 = dy0 - float32v( kReflectUnskew3 * 3 + 1 );
+        float32v dz3 = dz0 - float32v( kReflectUnskew3 * 3 + 1 );
+        float32v dx1 = FS::MaskedSub( maskX1, dx3, float32v( 1 ) ); // kReflectUnskew3 * 3 + 1 = kReflectUnskew3, so dx0 - kReflectUnskew3 = dx3
+        float32v dy1 = FS::MaskedSub( maskY1, dy3, float32v( 1 ) );
+        float32v dz1 = FS::MaskedSub( maskZ1, dz3, float32v( 1 ) );
+        float32v dx2 = FS::MaskedIncrement( nMaskX2, dx0 ); // kReflectUnskew3 * 2 + 1 = 0, so dx0 - ( kReflectUnskew3 * 2 + 1 ) = dx0
+        float32v dy2 = FS::MaskedIncrement( nMaskY2, dy0 );
+        float32v dz2 = FS::MaskedIncrement( nMaskZ2, dz0 );
+
+        float32v falloff0 = FS::FNMulAdd( dz0, dz0, FS::FNMulAdd( dy0, dy0, FS::FNMulAdd( dx0, dx0, float32v( kFalloffRadiusSquared ) ) ) );
+        float32v falloff1 = FS::FNMulAdd( dz1, dz1, FS::FNMulAdd( dy1, dy1, FS::FNMulAdd( dx1, dx1, float32v( kFalloffRadiusSquared ) ) ) );
+        float32v falloff2 = FS::FNMulAdd( dz2, dz2, FS::FNMulAdd( dy2, dy2, FS::FNMulAdd( dx2, dx2, float32v( kFalloffRadiusSquared ) ) ) );
+        // Vertex 3 sits at d0 + 1/2 on every axis, so its falloff is
+        // falloff0 - ( dx0 + dy0 + dz0 ) - 3/4, and dx0 + dy0 + dz0 equals unskewDelta here.
+        float32v falloff3 = falloff0 - ( unskewDelta + float32v( 3.0 / 4.0 ) );
+
+        falloff0 = FS::Max( falloff0, float32v( 0 ) );
+        falloff1 = FS::Max( falloff1, float32v( 0 ) );
+        falloff2 = FS::Max( falloff2, float32v( 0 ) );
+        falloff3 = FS::Max( falloff3, float32v( 0 ) );
+
+        // Raise each clamped falloff to the fourth power.
+        falloff0 *= falloff0; falloff0 *= falloff0;
+        falloff1 *= falloff1; falloff1 *= falloff1;
+        falloff2 *= falloff2; falloff2 *= falloff2;
+        falloff3 *= falloff3; falloff3 *= falloff3;
+
+        float32v valueX( 0 );
+        float32v valueY( 0 );
+        float32v valueZ( 0 );
+
+        ApplyVectorContributionCommon( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase ), dx0, dy0, dz0, falloff0, valueX, valueY, valueZ );
+        ApplyVectorContributionCommon( HashPrimes( seed, FS::MaskedAdd( maskX1, xPrimedBase, int32v( Primes::X ) ), FS::MaskedAdd( maskY1, yPrimedBase, int32v( Primes::Y ) ), FS::MaskedAdd( maskZ1, zPrimedBase, int32v( Primes::Z ) ) ), dx1, dy1, dz1, falloff1, valueX, valueY, valueZ );
+        ApplyVectorContributionCommon( HashPrimes( seed, FS::InvMaskedAdd( nMaskX2, xPrimedBase, int32v( Primes::X ) ), FS::InvMaskedAdd( nMaskY2, yPrimedBase, int32v( Primes::Y ) ), FS::InvMaskedAdd( nMaskZ2, zPrimedBase, int32v( Primes::Z ) ) ), dx2, dy2, dz2, falloff2, valueX, valueY, valueZ );
+        ApplyVectorContributionCommon( HashPrimes( seed, xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ), zPrimedBase + int32v( Primes::Z ) ), dx3, dy3, dz3, falloff3, valueX, valueY, valueZ );
+
+        if constexpr( Scheme != VectorizationScheme::OrthogonalGradientMatrix )
+        {
+            // Match gradient orientation.
+            // The offsets above live in a frame reflected through the (1,1,1) axis: skewing by 1/3
+            // and then unskewing by -1/2 composes to v - 2/3 * sum( v ). That reflection is its own
+            // inverse, so the same -2/3 factor maps the accumulated vector back.
+            constexpr double kReflect3D = -2.0 / 3.0;
+            float32v valueTransformDelta = float32v( kReflect3D ) * ( valueX + valueY + valueZ );
+            valueX += valueTransformDelta;
+            valueY += valueTransformDelta;
+            valueZ += valueTransformDelta;
+        }
+
+        // Scheme is a compile-time value, so this ternary is already a constant expression;
+        // 'constexpr' is a specifier and must not prefix the parenthesised condition.
+        constexpr double kBounding = ( Scheme == VectorizationScheme::GradientOuterProduct ) ?
+ 32.69428253173828125 / 1.4142135623730951 : + 16.281631889139874; + + warpAmp *= float32v( kBounding ); + xOut = FS::FMulAdd( valueX, warpAmp, xOut ); + yOut = FS::FMulAdd( valueY, warpAmp, yOut ); + zOut = FS::FMulAdd( valueZ, warpAmp, zOut ); + + float32v warpLengthSq = FS::FMulAdd( valueZ, valueZ, FS::FMulAdd( valueY, valueY, valueX * valueX ) ); + return warpLengthSq * FS::InvSqrt( warpLengthSq ) * warpAmp; } - - float32v FS_VECTORCALL Warp( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v w, float32v& xOut, float32v& yOut, float32v& zOut, float32v& wOut ) const + + template + float32v FS_VECTORCALL Warp_Standard( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v w, float32v& xOut, float32v& yOut, float32v& zOut, float32v& wOut ) const { - float32v xs = FS::Floor( x ); - float32v ys = FS::Floor( y ); - float32v zs = FS::Floor( z ); - float32v ws = FS::Floor( w ); - - int32v x0 = FS::Convert( xs ) * int32v( Primes::X ); - int32v y0 = FS::Convert( ys ) * int32v( Primes::Y ); - int32v z0 = FS::Convert( zs ) * int32v( Primes::Z ); - int32v w0 = FS::Convert( ws ) * int32v( Primes::W ); - int32v x1 = x0 + int32v( Primes::X ); - int32v y1 = y0 + int32v( Primes::Y ); - int32v z1 = z0 + int32v( Primes::Z ); - int32v w1 = w0 + int32v( Primes::W ); - - xs = InterpHermite( x - xs ); - ys = InterpHermite( y - ys ); - zs = InterpHermite( z - zs ); - ws = InterpHermite( w - ws ); - - #define GRADIENT_COORD( _x, _y, _z, _w )\ - int32v hash##_x##_y##_z##_w = HashPrimesHB( seed, x##_x, y##_y, z##_z, w##_w );\ - float32v x##_x##_y##_z##_w = FS::Convert( hash##_x##_y##_z##_w & int32v( 0xff ) );\ - float32v y##_x##_y##_z##_w = FS::Convert( (hash##_x##_y##_z##_w >> 8) & int32v( 0xff ) );\ - float32v z##_x##_y##_z##_w = FS::Convert( (hash##_x##_y##_z##_w >> 16) & int32v( 0xff ) );\ - float32v w##_x##_y##_z##_w = FS::Convert( (hash##_x##_y##_z##_w >> 24) & int32v( 0xff ) ); - - GRADIENT_COORD( 0, 0, 0, 0 ); - GRADIENT_COORD( 1, 0, 
0, 0 ); - GRADIENT_COORD( 0, 1, 0, 0 ); - GRADIENT_COORD( 1, 1, 0, 0 ); - GRADIENT_COORD( 0, 0, 1, 0 ); - GRADIENT_COORD( 1, 0, 1, 0 ); - GRADIENT_COORD( 0, 1, 1, 0 ); - GRADIENT_COORD( 1, 1, 1, 0 ); - GRADIENT_COORD( 0, 0, 0, 1 ); - GRADIENT_COORD( 1, 0, 0, 1 ); - GRADIENT_COORD( 0, 1, 0, 1 ); - GRADIENT_COORD( 1, 1, 0, 1 ); - GRADIENT_COORD( 0, 0, 1, 1 ); - GRADIENT_COORD( 1, 0, 1, 1 ); - GRADIENT_COORD( 0, 1, 1, 1 ); - GRADIENT_COORD( 1, 1, 1, 1 ); - - #undef GRADIENT_COORD - - float32v x0w = Lerp( Lerp( Lerp( x0000, x1000, xs ), Lerp( x0100, x1100, xs ), ys ), Lerp( Lerp( x0010, x1010, xs ), Lerp( x0110, x1110, xs ), ys ), zs ); - float32v y0w = Lerp( Lerp( Lerp( y0000, y1000, xs ), Lerp( y0100, y1100, xs ), ys ), Lerp( Lerp( y0010, y1010, xs ), Lerp( y0110, y1110, xs ), ys ), zs ); - float32v z0w = Lerp( Lerp( Lerp( z0000, z1000, xs ), Lerp( z0100, z1100, xs ), ys ), Lerp( Lerp( z0010, z1010, xs ), Lerp( z0110, z1110, xs ), ys ), zs ); - float32v w0w = Lerp( Lerp( Lerp( w0000, w1000, xs ), Lerp( w0100, w1100, xs ), ys ), Lerp( Lerp( w0010, w1010, xs ), Lerp( w0110, w1110, xs ), ys ), zs ); - - float32v x1w = Lerp( Lerp( Lerp( x0001, x1001, xs ), Lerp( x0101, x1101, xs ), ys ), Lerp( Lerp( x0011, x1011, xs ), Lerp( x0111, x1111, xs ), ys ), zs ); - float32v y1w = Lerp( Lerp( Lerp( y0001, y1001, xs ), Lerp( y0101, y1101, xs ), ys ), Lerp( Lerp( y0011, y1011, xs ), Lerp( y0111, y1111, xs ), ys ), zs ); - float32v z1w = Lerp( Lerp( Lerp( z0001, z1001, xs ), Lerp( z0101, z1101, xs ), ys ), Lerp( Lerp( z0011, z1011, xs ), Lerp( z0111, z1111, xs ), ys ), zs ); - float32v w1w = Lerp( Lerp( Lerp( w0001, w1001, xs ), Lerp( w0101, w1101, xs ), ys ), Lerp( Lerp( w0011, w1011, xs ), Lerp( w0111, w1111, xs ), ys ), zs ); - - float32v normalise = float32v( 1.0f / (0xff / 2.0f) ); - - float32v xWarp = (Lerp( x0w, x1w, ws ) - float32v( 0xff / 2.0f )) * normalise; - float32v yWarp = (Lerp( y0w, y1w, ws ) - float32v( 0xff / 2.0f )) * normalise; - float32v zWarp = (Lerp( z0w, 
z1w, ws ) - float32v( 0xff / 2.0f )) * normalise; - float32v wWarp = (Lerp( w0w, w1w, ws ) - float32v( 0xff / 2.0f )) * normalise; - - xOut = FS::FMulAdd( xWarp, warpAmp, xOut ); - yOut = FS::FMulAdd( yWarp, warpAmp, yOut ); - zOut = FS::FMulAdd( zWarp, warpAmp, zOut ); - wOut = FS::FMulAdd( wWarp, warpAmp, wOut ); - - float32v warpLengthSq = FS::FMulAdd( xWarp, xWarp, FS::FMulAdd( yWarp, yWarp, FS::FMulAdd( zWarp, zWarp, wWarp * wWarp ) ) ); - - return warpLengthSq * FS::InvSqrt( warpLengthSq ); + constexpr double kRoot5 = 2.2360679774997896964091736687313; + constexpr double kSkew4 = 1.0 / ( kRoot5 + 1.0 ); + constexpr double kUnskew4 = -1.0 / ( kRoot5 + 5.0 ); + constexpr double kFalloffRadiusSquared = 0.6; + + float32v skewDelta = float32v( kSkew4 ) * ( x + y + z + w ); + float32v xSkewed = x + skewDelta; + float32v ySkewed = y + skewDelta; + float32v zSkewed = z + skewDelta; + float32v wSkewed = w + skewDelta; + + float32v xSkewedBase = FS::Floor( xSkewed ); + float32v ySkewedBase = FS::Floor( ySkewed ); + float32v zSkewedBase = FS::Floor( zSkewed ); + float32v wSkewedBase = FS::Floor( wSkewed ); + float32v dxSkewed = xSkewed - xSkewedBase; + float32v dySkewed = ySkewed - ySkewedBase; + float32v dzSkewed = zSkewed - zSkewedBase; + float32v dwSkewed = wSkewed - wSkewedBase; + + int32v xPrimedBase = FS::Convert( xSkewedBase ) * int32v( Primes::X ); + int32v yPrimedBase = FS::Convert( ySkewedBase ) * int32v( Primes::Y ); + int32v zPrimedBase = FS::Convert( zSkewedBase ) * int32v( Primes::Z ); + int32v wPrimedBase = FS::Convert( wSkewedBase ) * int32v( Primes::W ); + + float32v unskewDelta = float32v( kUnskew4 ) * ( dxSkewed + dySkewed + dzSkewed + dwSkewed ); + float32v dx0 = dxSkewed + unskewDelta; + float32v dy0 = dySkewed + unskewDelta; + float32v dz0 = dzSkewed + unskewDelta; + float32v dw0 = dwSkewed + unskewDelta; + + int32v rankX( 0 ); + int32v rankY( 0 ); + int32v rankZ( 0 ); + int32v rankW( 0 ); + + mask32v xGreaterEqualY = dx0 >= dy0; + rankX = 
FS::MaskedIncrement( xGreaterEqualY, rankX ); + rankY = FS::MaskedIncrement( ~xGreaterEqualY, rankY ); + + mask32v xGreaterEqualZ = dx0 >= dz0; + rankX = FS::MaskedIncrement( xGreaterEqualZ, rankX ); + rankZ = FS::MaskedIncrement( ~xGreaterEqualZ, rankZ ); + + mask32v xGreaterEqualW = dx0 >= dw0; + rankX = FS::MaskedIncrement( xGreaterEqualW, rankX ); + rankW = FS::MaskedIncrement( ~xGreaterEqualW, rankW ); + + mask32v yGreaterEqualZ = dy0 >= dz0; + rankY = FS::MaskedIncrement( yGreaterEqualZ, rankY ); + rankZ = FS::MaskedIncrement( ~yGreaterEqualZ, rankZ ); + + mask32v yGreaterEqualW = dy0 >= dw0; + rankY = FS::MaskedIncrement( yGreaterEqualW, rankY ); + rankW = FS::MaskedIncrement( ~yGreaterEqualW, rankW ); + + mask32v zGreaterEqualW = dz0 >= dw0; + rankZ = FS::MaskedIncrement( zGreaterEqualW, rankZ ); + rankW = FS::MaskedIncrement( ~zGreaterEqualW, rankW ); + + mask32v maskX1 = rankX > int32v( 2 ); + mask32v maskY1 = rankY > int32v( 2 ); + mask32v maskZ1 = rankZ > int32v( 2 ); + mask32v maskW1 = rankW > int32v( 2 ); + + mask32v maskX2 = rankX > int32v( 1 ); + mask32v maskY2 = rankY > int32v( 1 ); + mask32v maskZ2 = rankZ > int32v( 1 ); + mask32v maskW2 = rankW > int32v( 1 ); + + mask32v maskX3 = rankX > int32v( 0 ); + mask32v maskY3 = rankY > int32v( 0 ); + mask32v maskZ3 = rankZ > int32v( 0 ); + mask32v maskW3 = rankW > int32v( 0 ); + + float32v dx1 = FS::MaskedSub( maskX1, dx0, float32v( 1 ) ) - float32v( kUnskew4 ); + float32v dy1 = FS::MaskedSub( maskY1, dy0, float32v( 1 ) ) - float32v( kUnskew4 ); + float32v dz1 = FS::MaskedSub( maskZ1, dz0, float32v( 1 ) ) - float32v( kUnskew4 ); + float32v dw1 = FS::MaskedSub( maskW1, dw0, float32v( 1 ) ) - float32v( kUnskew4 ); + float32v dx2 = FS::MaskedSub( maskX2, dx0, float32v( 1 ) ) - float32v( kUnskew4 * 2 ); + float32v dy2 = FS::MaskedSub( maskY2, dy0, float32v( 1 ) ) - float32v( kUnskew4 * 2 ); + float32v dz2 = FS::MaskedSub( maskZ2, dz0, float32v( 1 ) ) - float32v( kUnskew4 * 2 ); + float32v dw2 = FS::MaskedSub( 
maskW2, dw0, float32v( 1 ) ) - float32v( kUnskew4 * 2 ); + float32v dx3 = FS::MaskedSub( maskX3, dx0, float32v( 1 ) ) - float32v( kUnskew4 * 3 ); + float32v dy3 = FS::MaskedSub( maskY3, dy0, float32v( 1 ) ) - float32v( kUnskew4 * 3 ); + float32v dz3 = FS::MaskedSub( maskZ3, dz0, float32v( 1 ) ) - float32v( kUnskew4 * 3 ); + float32v dw3 = FS::MaskedSub( maskW3, dw0, float32v( 1 ) ) - float32v( kUnskew4 * 3 ); + float32v dx4 = dx0 - float32v( kUnskew4 * 4 + 1 ); + float32v dy4 = dy0 - float32v( kUnskew4 * 4 + 1 ); + float32v dz4 = dz0 - float32v( kUnskew4 * 4 + 1 ); + float32v dw4 = dw0 - float32v( kUnskew4 * 4 + 1 ); + + float32v falloff0 = FS::FNMulAdd( dw0, dw0, FS::FNMulAdd( dz0, dz0, FS::FNMulAdd( dy0, dy0, FS::FNMulAdd( dx0, dx0, float32v( kFalloffRadiusSquared ) ) ) ) ); + float32v falloff1 = FS::FNMulAdd( dw1, dw1, FS::FNMulAdd( dz1, dz1, FS::FNMulAdd( dy1, dy1, FS::FNMulAdd( dx1, dx1, float32v( kFalloffRadiusSquared ) ) ) ) ); + float32v falloff2 = FS::FNMulAdd( dw2, dw2, FS::FNMulAdd( dz2, dz2, FS::FNMulAdd( dy2, dy2, FS::FNMulAdd( dx2, dx2, float32v( kFalloffRadiusSquared ) ) ) ) ); + float32v falloff3 = FS::FNMulAdd( dw3, dw3, FS::FNMulAdd( dz3, dz3, FS::FNMulAdd( dy3, dy3, FS::FNMulAdd( dx3, dx3, float32v( kFalloffRadiusSquared ) ) ) ) ); + float32v falloff4 = falloff0 + FS::FMulAdd( unskewDelta, + float32v( -4.0 * ( kRoot5 + 3.0 ) / ( kRoot5 + 5.0 ) ), + float32v( -4.0 / 5.0 ) ); + + falloff0 = FS::Max( falloff0, float32v( 0 ) ); + falloff1 = FS::Max( falloff1, float32v( 0 ) ); + falloff2 = FS::Max( falloff2, float32v( 0 ) ); + falloff3 = FS::Max( falloff3, float32v( 0 ) ); + falloff4 = FS::Max( falloff4, float32v( 0 ) ); + + falloff0 *= falloff0; falloff0 *= falloff0; + falloff1 *= falloff1; falloff1 *= falloff1; + falloff2 *= falloff2; falloff2 *= falloff2; + falloff3 *= falloff3; falloff3 *= falloff3; + falloff4 *= falloff4; falloff4 *= falloff4; + + float32v valueX( 0 ); + float32v valueY( 0 ); + float32v valueZ( 0 ); + float32v valueW( 0 ); + + 
ApplyVectorContributionSimplex( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase, wPrimedBase ), dx0, dy0, dz0, dw0, falloff0, valueX, valueY, valueZ, valueW );
+        ApplyVectorContributionSimplex( HashPrimes( seed,
+            FS::MaskedAdd( maskX1, xPrimedBase, int32v( Primes::X ) ),
+            FS::MaskedAdd( maskY1, yPrimedBase, int32v( Primes::Y ) ),
+            FS::MaskedAdd( maskZ1, zPrimedBase, int32v( Primes::Z ) ),
+            FS::MaskedAdd( maskW1, wPrimedBase, int32v( Primes::W ) ) ), dx1, dy1, dz1, dw1, falloff1, valueX, valueY, valueZ, valueW );
+        ApplyVectorContributionSimplex( HashPrimes( seed,
+            FS::MaskedAdd( maskX2, xPrimedBase, int32v( Primes::X ) ),
+            FS::MaskedAdd( maskY2, yPrimedBase, int32v( Primes::Y ) ),
+            FS::MaskedAdd( maskZ2, zPrimedBase, int32v( Primes::Z ) ),
+            FS::MaskedAdd( maskW2, wPrimedBase, int32v( Primes::W ) ) ), dx2, dy2, dz2, dw2, falloff2, valueX, valueY, valueZ, valueW );
+        ApplyVectorContributionSimplex( HashPrimes( seed,
+            FS::MaskedAdd( maskX3, xPrimedBase, int32v( Primes::X ) ),
+            FS::MaskedAdd( maskY3, yPrimedBase, int32v( Primes::Y ) ),
+            FS::MaskedAdd( maskZ3, zPrimedBase, int32v( Primes::Z ) ),
+            FS::MaskedAdd( maskW3, wPrimedBase, int32v( Primes::W ) ) ), dx3, dy3, dz3, dw3, falloff3, valueX, valueY, valueZ, valueW );
+        ApplyVectorContributionSimplex( HashPrimes( seed,
+            xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ), zPrimedBase + int32v( Primes::Z ), wPrimedBase + int32v( Primes::W ) ),
+            dx4, dy4, dz4, dw4, falloff4, valueX, valueY, valueZ, valueW );
+
+        // Scheme is a compile-time value, so this ternary is already a constant expression;
+        // 'constexpr' is a specifier and must not prefix the parenthesised condition.
+        constexpr double kBounding = ( Scheme == VectorizationScheme::GradientOuterProduct ) ?
+            33.653125584827855 / 1.4142135623730951 :
+            30.88161777516092;
+
+        warpAmp *= float32v( kBounding );
+        xOut = FS::FMulAdd( valueX, warpAmp, xOut );
+        yOut = FS::FMulAdd( valueY, warpAmp, yOut );
+        zOut = FS::FMulAdd( valueZ, warpAmp, zOut );
+        // The fourth coordinate must be displaced as well: valueW is accumulated above and is
+        // part of the returned warp length, so leaving wOut untouched would drop that component.
+        wOut = FS::FMulAdd( valueW, warpAmp, wOut );
+
+        float32v warpLengthSq = FS::FMulAdd( valueW, valueW, FS::FMulAdd( valueZ, valueZ, FS::FMulAdd( valueY, valueY, valueX * valueX ) ) );
+        return warpLengthSq * FS::InvSqrt( warpLengthSq ) * warpAmp;
     }
-};
+    template
+    float32v FS_VECTORCALL Warp_Smooth( int32v seed, float32v warpAmp, float32v x, float32v y, float32v& xOut, float32v& yOut ) const
+    {
+        constexpr double kRoot3 = 1.7320508075688772935274463415059;
+        constexpr double kSkew2 = 1.0 / ( kRoot3 + 1.0 );
+        constexpr double kUnskew2 = -1.0 / ( kRoot3 + 3.0 );
+        constexpr double kFalloffRadiusSquared = 2.0 / 3.0;
+
+        float32v skewDelta = float32v( kSkew2 ) * ( x + y );
+        float32v xSkewed = x + skewDelta;
+        float32v ySkewed = y + skewDelta;
+        float32v xSkewedBase = FS::Floor( xSkewed );
+        float32v ySkewedBase = FS::Floor( ySkewed );
+        float32v dxSkewed = xSkewed - xSkewedBase;
+        float32v dySkewed = ySkewed - ySkewedBase;
+        int32v xPrimedBase = FS::Convert( xSkewedBase ) * int32v( Primes::X );
+        int32v yPrimedBase = FS::Convert( ySkewedBase ) * int32v( Primes::Y );
+
+        mask32v forwardXY = dxSkewed + dySkewed > float32v( 1.0f );
+        float32v boundaryXY = FS::Masked( forwardXY, float32v( -1.0f ) );
+        mask32v forwardX = FS::FMulAdd( dxSkewed, float32v( -2.0f ), dySkewed ) < boundaryXY;
+        mask32v forwardY = FS::FMulAdd( dySkewed, float32v( -2.0f ), dxSkewed ) < boundaryXY;
+
+        float32v unskewDelta = float32v( kUnskew2 ) * ( dxSkewed + dySkewed );
+        float32v dxBase = dxSkewed + unskewDelta;
+        float32v dyBase = dySkewed + unskewDelta;
+
+        float32v falloffBase0;
+        float32v valueX( 0 );
+        float32v valueY( 0 );
+
+        // Vertex <0, 0>
+        {
+            int32v hash = HashPrimes( seed, xPrimedBase, yPrimedBase );
+            falloffBase0 = FS::FNMulAdd( dxBase, dxBase, FS::FNMulAdd( dyBase, dyBase,
float32v( kFalloffRadiusSquared ) ) ); + float32v falloff = falloffBase0; falloff *= falloff; falloff *= falloff; + ApplyVectorContributionSimplex( hash, dxBase, dyBase, falloff, valueX, valueY ); + } + + // Vertex <1, 1> + { + int32v hash = HashPrimes( seed, xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ) ); + float32v falloff = FS::FMulAdd( unskewDelta, + float32v( -4.0 * ( kRoot3 + 2.0 ) / ( kRoot3 + 3.0 ) ), + falloffBase0 - float32v( kFalloffRadiusSquared ) ); + falloff *= falloff; falloff *= falloff; + ApplyVectorContributionSimplex( hash, dxBase - float32v( 2 * kUnskew2 + 1 ), dyBase - float32v( 2 * kUnskew2 + 1 ), falloff, valueX, valueY ); + } + + float32v xyDelta = FS::Select( forwardXY, float32v( kUnskew2 + 1 ), float32v( -kUnskew2 ) ); + dxBase -= xyDelta; + dyBase -= xyDelta; + + // Vertex <1, 0> or <-1, 0> or <1, 2> + { + int32v hash = HashPrimes( seed, + FS::InvMaskedSub( forwardXY, FS::MaskedAdd( forwardX, xPrimedBase, int32v( Primes::X * 2 ) ), int32v( Primes::X ) ), + FS::MaskedAdd( forwardXY, yPrimedBase, int32v( Primes::Y ) ) ); + float32v dx = dxBase - FS::Select( forwardX, float32v( 1 + 2 * kUnskew2 ), float32v( -1 ) ); + float32v dy = FS::MaskedSub( forwardX, dyBase, float32v( 2 * kUnskew2 ) ); + float32v falloff = FS::Max( FS::FNMulAdd( dx, dx, FS::FNMulAdd( dy, dy, float32v( kFalloffRadiusSquared ) ) ), float32v( 0 ) ); + falloff *= falloff; falloff *= falloff; + ApplyVectorContributionSimplex( hash, dx, dy, falloff, valueX, valueY ); + } + + // Vertex <0, 1> or <0, -1> or <2, 1> + { + int32v hash = HashPrimes( seed, + FS::MaskedAdd( forwardXY, xPrimedBase, int32v( Primes::X ) ), + FS::InvMaskedSub( forwardXY, FS::MaskedAdd( forwardY, yPrimedBase, int32v( (int32_t)( Primes::Y * 2LL ) ) ), int32v( Primes::Y ) ) ); + float32v dx = FS::MaskedSub( forwardY, dxBase, float32v( 2 * kUnskew2 ) ); + float32v dy = dyBase - FS::Select( forwardY, float32v( 1 + 2 * kUnskew2 ), float32v( -1 ) ); + float32v falloff = FS::Max( 
FS::FNMulAdd( dx, dx, FS::FNMulAdd( dy, dy, float32v( kFalloffRadiusSquared ) ) ), float32v( 0 ) ); + falloff *= falloff; falloff *= falloff; + ApplyVectorContributionSimplex( hash, dx, dy, falloff, valueX, valueY ); + } + + // Output scale for the selected vectorization scheme; both branches are constant expressions, so a plain conditional suffices. + constexpr double kBounding = ( Scheme == VectorizationScheme::GradientOuterProduct ) ? + 9.28993664146183 / 2.0 : + 12.814453124999995; + + warpAmp *= float32v( kBounding ); + xOut = FS::FMulAdd( valueX, warpAmp, xOut ); + yOut = FS::FMulAdd( valueY, warpAmp, yOut ); + + float32v warpLengthSq = FS::FMulAdd( valueY, valueY, valueX * valueX ); + return warpLengthSq * FS::InvSqrt( warpLengthSq ) * warpAmp; + } + + template + float32v FS_VECTORCALL Warp_Smooth( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v& xOut, float32v& yOut, float32v& zOut ) const + { + constexpr double kSkew3 = 1.0 / 3.0; + constexpr double kReflectUnskew3 = -1.0 / 2.0; + constexpr double kTwiceUnskew3 = -1.0 / 4.0; + + constexpr double kDistanceSquaredA = 3.0 / 4.0; + constexpr double kDistanceSquaredB = 1.0; + constexpr double kFalloffRadiusSquared = kDistanceSquaredA; + + float32v skewDelta = float32v( kSkew3 ) * ( x + y + z ); + + float32v xSkewed = x + skewDelta; + float32v ySkewed = y + skewDelta; + float32v zSkewed = z + skewDelta; + float32v xSkewedBase = FS::Floor( xSkewed ); + float32v ySkewedBase = FS::Floor( ySkewed ); + float32v zSkewedBase = FS::Floor( zSkewed ); + float32v dxSkewed = xSkewed - xSkewedBase; + float32v dySkewed = ySkewed - ySkewedBase; + float32v dzSkewed = zSkewed - zSkewedBase; + + // From unit cell base, find closest vertex + { + // Perform a double unskew to get the vector whose dot product with skewed vectors produces the unskewed result. 
+ float32v twiceUnskewDelta = float32v( kTwiceUnskew3 ) * ( dxSkewed + dySkewed + dzSkewed ); + float32v xNormal = dxSkewed + twiceUnskewDelta; + float32v yNormal = dySkewed + twiceUnskewDelta; + float32v zNormal = dzSkewed + twiceUnskewDelta; + float32v xyzNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal + + // Using those, compare scores to determine which vertex is closest. + constexpr auto considerVertex = [] ( float32v& maxScore, int32v& moveMaskBits, float32v score, int32v bits ) constexpr + { + moveMaskBits = FS::Select( score > maxScore, bits, moveMaskBits ); + maxScore = FS::Max( maxScore, score ); + }; + float32v maxScore = float32v( 0.375f ); + int32v moveMaskBits = FS::Masked( xyzNormal > maxScore, int32v( -1 ) ); + maxScore = FS::Max( maxScore, xyzNormal ); + considerVertex( maxScore, moveMaskBits, xNormal, 0b001 ); + considerVertex( maxScore, moveMaskBits, yNormal, 0b010 ); + considerVertex( maxScore, moveMaskBits, zNormal, 0b100 ); + maxScore += float32v( 0.125f ) - xyzNormal; + considerVertex( maxScore, moveMaskBits, -zNormal, 0b011 ); + considerVertex( maxScore, moveMaskBits, -yNormal, 0b101 ); + considerVertex( maxScore, moveMaskBits, -xNormal, 0b110 ); + + mask32v moveX = ( moveMaskBits & int32v( 0b001 ) ) != int32v( 0 ); + mask32v moveY = ( moveMaskBits & int32v( 0b010 ) ) != int32v( 0 ); + mask32v moveZ = ( moveMaskBits & int32v( 0b100 ) ) != int32v( 0 ); + + xSkewedBase = FS::MaskedIncrement( moveX, xSkewedBase ); + ySkewedBase = FS::MaskedIncrement( moveY, ySkewedBase ); + zSkewedBase = FS::MaskedIncrement( moveZ, zSkewedBase ); + + dxSkewed = FS::MaskedDecrement( moveX, dxSkewed ); + dySkewed = FS::MaskedDecrement( moveY, dySkewed ); + dzSkewed = FS::MaskedDecrement( moveZ, dzSkewed ); + } + + int32v xPrimedBase = FS::Convert( xSkewedBase ) * int32v( Primes::X ); + int32v yPrimedBase = FS::Convert( ySkewedBase ) * int32v( Primes::Y ); + int32v zPrimedBase = FS::Convert( zSkewedBase ) * int32v( Primes::Z ); + + float32v 
skewedCoordinateSum = dxSkewed + dySkewed + dzSkewed; + float32v twiceUnskewDelta = float32v( kTwiceUnskew3 ) * skewedCoordinateSum; + float32v xNormal = dxSkewed + twiceUnskewDelta; + float32v yNormal = dySkewed + twiceUnskewDelta; + float32v zNormal = dzSkewed + twiceUnskewDelta; + float32v xyzNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal + + float32v unskewDelta = float32v( kReflectUnskew3 ) * skewedCoordinateSum; + float32v dxBase = dxSkewed + unskewDelta; + float32v dyBase = dySkewed + unskewDelta; + float32v dzBase = dzSkewed + unskewDelta; + + float32v coordinateSum = float32v( 1 + 3 * kReflectUnskew3 ) * skewedCoordinateSum; // dxBase + dyBase + dzBase + + float32v valueX( 0 ); + float32v valueY( 0 ); + float32v valueZ( 0 ); + float32v falloffBaseStemA, falloffBaseStemB; + + // Vertex <0, 0, 0> + { + float32v falloffBase = FS::FNMulAdd( dzBase, dzBase, FS::FNMulAdd( dyBase, dyBase, FS::FNMulAdd( dxBase, dxBase, float32v( kFalloffRadiusSquared ) ) ) ) * float32v( 0.5f ); + falloffBaseStemA = falloffBase - float32v( kDistanceSquaredA * 0.5 ); + falloffBaseStemB = falloffBase - float32v( kDistanceSquaredB * 0.5 ); + ApplyVectorContributionCommon( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase ), dxBase, dyBase, dzBase, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ ); + } + + // Vertex <1, 1, 1> or <-1, -1, -1> + { + mask32v signMask = xyzNormal < float32v( 0 ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset = float32v( 3 * kReflectUnskew3 + 1 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset, coordinateSum, falloffBaseStemA ), 
float32v( 0.0f ) ); + + ApplyVectorContributionCommon( HashPrimes( seed, xPrimed, yPrimed, zPrimed ), dxBase - offset, dyBase - offset, dzBase - offset, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ ); + } + + // Vertex <1, 1, 0> or <-1, -1, 0> + { + mask32v signMask = xyzNormal < zNormal; + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset0 = float32v( 2 * kReflectUnskew3 ) ^ sign; + + float32v falloffBase = FS::Min( ( sign ^ dzBase ) - falloffBaseStemB, float32v( 0.0f ) ); + + ApplyVectorContributionCommon( HashPrimes( seed, xPrimed, yPrimed, zPrimedBase ), dxBase, dyBase, dzBase - offset0, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ ); + } + + // Vertex <1, 0, 1> or <-1, 0, -1> + { + mask32v signMask = xyzNormal < yNormal; + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset0 = float32v( 2 * kReflectUnskew3 ) ^ sign; + + float32v falloffBase = FS::Min( ( sign ^ dyBase ) - falloffBaseStemB, float32v( 0.0f ) ); + + ApplyVectorContributionCommon( HashPrimes( seed, xPrimed, yPrimedBase, zPrimed ), dxBase, dyBase - offset0, dzBase, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ ); + } + + // Vertex <0, 1, 1> or <0, -1, -1> + { + mask32v signMask = xyzNormal < xNormal; + + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), 
int32v( Primes::Z ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset0 = float32v( 2 * kReflectUnskew3 ) ^ sign; + + // Clamped with Min rather than Max: the value is raised to the fourth power below, so only its magnitude matters. + float32v falloffBase = FS::Min( ( sign ^ dxBase ) - falloffBaseStemB, float32v( 0.0f ) ); + + ApplyVectorContributionCommon( HashPrimes( seed, xPrimedBase, yPrimed, zPrimed ), dxBase - offset0, dyBase, dzBase, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ ); + } + + // Vertex <1, 0, 0> or <-1, 0, 0> + { + mask32v signMask = xNormal < float32v( 0 ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset0 = float32v( kReflectUnskew3 ) ^ sign; // offset1 = -offset0 because kReflectUnskew3 + 1 = -kReflectUnskew3 + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dxBase ), float32v( 0.0f ) ); + + ApplyVectorContributionCommon( HashPrimes( seed, xPrimed, yPrimedBase, zPrimedBase ), dxBase + offset0, dyBase - offset0, dzBase - offset0, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ ); + } + + // Vertex <0, 1, 0> or <0, -1, 0> + { + mask32v signMask = yNormal < float32v( 0 ); + + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset0 = float32v( kReflectUnskew3 ) ^ sign; // offset1 = -offset0 because kReflectUnskew3 + 1 = -kReflectUnskew3 + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dyBase ), float32v( 0.0f ) ); + + ApplyVectorContributionCommon( HashPrimes( seed, xPrimedBase, 
yPrimed, zPrimedBase ), dxBase - offset0, dyBase + offset0, dzBase - offset0, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ ); + } + + // Vertex <0, 0, 1> or <0, 0, -1> + { + mask32v signMask = zNormal < float32v( 0 ); + + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset0 = float32v( kReflectUnskew3 ) ^ sign; // offset1 = -offset0 because kReflectUnskew3 + 1 = -kReflectUnskew3 + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dzBase ), float32v( 0.0f ) ); + + ApplyVectorContributionCommon( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimed ), dxBase - offset0, dyBase - offset0, dzBase + offset0, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ ); + } + + if constexpr( Scheme != VectorizationScheme::OrthogonalGradientMatrix ) + { + // Match gradient orientation. + constexpr double kReflect3D = -2.0 / 3.0; + float32v valueTransformDelta = float32v( kReflect3D ) * ( valueX + valueY + valueZ ); + valueX += valueTransformDelta; + valueY += valueTransformDelta; + valueZ += valueTransformDelta; + } + + // Output scale for the selected vectorization scheme; both branches are constant expressions, so a plain conditional suffices. + constexpr double kBounding = ( Scheme == VectorizationScheme::GradientOuterProduct ) ? 
+ 144.736422163332608 / 1.4142135623730951 : + 37.63698669623629; + + warpAmp *= float32v( kBounding ); + xOut = FS::FMulAdd( valueX, warpAmp, xOut ); + yOut = FS::FMulAdd( valueY, warpAmp, yOut ); + zOut = FS::FMulAdd( valueZ, warpAmp, zOut ); + + float32v warpLengthSq = FS::FMulAdd( valueZ, valueZ, FS::FMulAdd( valueY, valueY, valueX * valueX ) ); + return warpLengthSq * FS::InvSqrt( warpLengthSq ) * warpAmp; + } + + template + float32v FS_VECTORCALL Warp_Smooth( int32v seed, float32v warpAmp, float32v x, float32v y, float32v z, float32v w, float32v& xOut, float32v& yOut, float32v& zOut, float32v& wOut ) const + { + constexpr double kRoot5 = 2.2360679774997896964091736687313; + constexpr double kSkew4 = 1.0 / ( kRoot5 + 1.0 ); + constexpr double kUnskew4 = -1.0 / ( kRoot5 + 5.0 ); + constexpr double kTwiceUnskew4 = -1.0 / 5.0; + + constexpr double kDistanceSquaredA = 4.0 / 5.0; + constexpr double kDistanceSquaredB = 6.0 / 5.0; + constexpr double kFalloffRadiusSquared = kDistanceSquaredA; + + float32v skewDelta = float32v( kSkew4 ) * ( x + y + z + w ); + + float32v xSkewed = x + skewDelta; + float32v ySkewed = y + skewDelta; + float32v zSkewed = z + skewDelta; + float32v wSkewed = w + skewDelta; + float32v xSkewedBase = FS::Floor( xSkewed ); + float32v ySkewedBase = FS::Floor( ySkewed ); + float32v zSkewedBase = FS::Floor( zSkewed ); + float32v wSkewedBase = FS::Floor( wSkewed ); + float32v dxSkewed = xSkewed - xSkewedBase; + float32v dySkewed = ySkewed - ySkewedBase; + float32v dzSkewed = zSkewed - zSkewedBase; + float32v dwSkewed = wSkewed - wSkewedBase; + + // From unit cell base, find closest vertex + { + // Perform a double unskew to get the vector whose dot product with skewed vectors produces the unskewed result. 
+ float32v twiceUnskewDelta = float32v( kTwiceUnskew4 ) * ( dxSkewed + dySkewed + dzSkewed + dwSkewed ); + float32v xNormal = dxSkewed + twiceUnskewDelta; + float32v yNormal = dySkewed + twiceUnskewDelta; + float32v zNormal = dzSkewed + twiceUnskewDelta; + float32v wNormal = dwSkewed + twiceUnskewDelta; + float32v xyzwNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal + wNormal + + // Using those, compare scores to determine which vertex is closest. + constexpr auto considerVertex = [] ( float32v& maxScore, int32v& moveMaskBits, float32v score, int32v bits ) constexpr + { + moveMaskBits = FS::Select( score > maxScore, bits, moveMaskBits ); + maxScore = FS::Max( maxScore, score ); + }; + float32v maxScore = float32v( 0.6f ) - xyzwNormal; + int32v moveMaskBits = FS::Masked( float32v( 0.2f ) > maxScore, int32v( -1 ) ); + maxScore = FS::Max( maxScore, float32v( 0.2f ) ); + considerVertex( maxScore, moveMaskBits, -wNormal, 0b0111 ); + considerVertex( maxScore, moveMaskBits, -zNormal, 0b1011 ); + considerVertex( maxScore, moveMaskBits, -yNormal, 0b1101 ); + considerVertex( maxScore, moveMaskBits, -xNormal, 0b1110 ); + maxScore += xyzwNormal - float32v( 0.2f ); + considerVertex( maxScore, moveMaskBits, xNormal, 0b0001 ); + considerVertex( maxScore, moveMaskBits, yNormal, 0b0010 ); + considerVertex( maxScore, moveMaskBits, zNormal, 0b0100 ); + considerVertex( maxScore, moveMaskBits, wNormal, 0b1000 ); + maxScore += float32v( 0.2f ) - xNormal; + considerVertex( maxScore, moveMaskBits, yNormal, 0b0011 ); + considerVertex( maxScore, moveMaskBits, zNormal, 0b0101 ); + considerVertex( maxScore, moveMaskBits, wNormal, 0b1001 ); + maxScore += xNormal; + considerVertex( maxScore, moveMaskBits, yNormal + zNormal, 0b0110 ); + maxScore -= wNormal; + considerVertex( maxScore, moveMaskBits, yNormal, 0b1010 ); + considerVertex( maxScore, moveMaskBits, zNormal, 0b1100 ); + + mask32v moveX = ( moveMaskBits & int32v( 0b0001 ) ) != int32v( 0 ); + mask32v moveY = ( moveMaskBits & 
int32v( 0b0010 ) ) != int32v( 0 ); + mask32v moveZ = ( moveMaskBits & int32v( 0b0100 ) ) != int32v( 0 ); + mask32v moveW = ( moveMaskBits & int32v( 0b1000 ) ) != int32v( 0 ); + + xSkewedBase = FS::MaskedIncrement( moveX, xSkewedBase ); + ySkewedBase = FS::MaskedIncrement( moveY, ySkewedBase ); + zSkewedBase = FS::MaskedIncrement( moveZ, zSkewedBase ); + wSkewedBase = FS::MaskedIncrement( moveW, wSkewedBase ); + + dxSkewed = FS::MaskedDecrement( moveX, dxSkewed ); + dySkewed = FS::MaskedDecrement( moveY, dySkewed ); + dzSkewed = FS::MaskedDecrement( moveZ, dzSkewed ); + dwSkewed = FS::MaskedDecrement( moveW, dwSkewed ); + } + + int32v xPrimedBase = FS::Convert( xSkewedBase ) * int32v( Primes::X ); + int32v yPrimedBase = FS::Convert( ySkewedBase ) * int32v( Primes::Y ); + int32v zPrimedBase = FS::Convert( zSkewedBase ) * int32v( Primes::Z ); + int32v wPrimedBase = FS::Convert( wSkewedBase ) * int32v( Primes::W ); + + float32v skewedCoordinateSum = dxSkewed + dySkewed + dzSkewed + dwSkewed; + float32v twiceUnskewDelta = float32v( kTwiceUnskew4 ) * skewedCoordinateSum; + float32v xNormal = dxSkewed + twiceUnskewDelta; + float32v yNormal = dySkewed + twiceUnskewDelta; + float32v zNormal = dzSkewed + twiceUnskewDelta; + float32v wNormal = dwSkewed + twiceUnskewDelta; + float32v xyzwNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal + wNormal + + float32v unskewDelta = float32v( kUnskew4 ) * skewedCoordinateSum; + float32v dxBase = dxSkewed + unskewDelta; + float32v dyBase = dySkewed + unskewDelta; + float32v dzBase = dzSkewed + unskewDelta; + float32v dwBase = dwSkewed + unskewDelta; + + float32v coordinateSum = float32v( 1 + 4 * kUnskew4 ) * skewedCoordinateSum; // dxBase + dyBase + dzBase + dwBase + + float32v valueX( 0 ); + float32v valueY( 0 ); + float32v valueZ( 0 ); + float32v valueW( 0 ); + float32v falloffBaseStemA, falloffBaseStemB; + + // Vertex <0, 0, 0, 0> + { + float32v falloffBase = FS::FNMulAdd( dwBase, dwBase, FS::FNMulAdd( dzBase, dzBase, 
FS::FNMulAdd( dyBase, dyBase, FS::FNMulAdd( dxBase, dxBase, float32v( kFalloffRadiusSquared ) ) ) ) ) * float32v( 0.5f ); + falloffBaseStemA = falloffBase - float32v( kDistanceSquaredA * 0.5 ); + falloffBaseStemB = falloffBase - float32v( kDistanceSquaredB * 0.5 ); + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase, wPrimedBase ), dxBase, dyBase, dzBase, dwBase, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <1, 1, 1, 1> or <-1, -1, -1, -1> + { + mask32v signMask = xyzwNormal < float32v( 0 ); + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset = float32v( 4 * kUnskew4 + 1 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset, coordinateSum, falloffBaseStemA ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimed, yPrimed, zPrimed, wPrimed ), dxBase - offset, dyBase - offset, dzBase - offset, dwBase - offset, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <1, 1, 1, 0> or <-1, -1, -1, 0> + { + mask32v signMask = xyzwNormal < wNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( 
Primes::Z ) ); + + float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dwBase ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimed, yPrimed, zPrimed, wPrimedBase ), dxBase - offset1, dyBase - offset1, dzBase - offset1, dwBase - offset0, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <1, 1, 0, 1> or <-1, -1, 0, -1> + { + mask32v signMask = xyzwNormal < zNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dzBase ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimed, yPrimed, zPrimedBase, wPrimed ), dxBase - offset1, dyBase - offset1, dzBase - offset0, dwBase - offset1, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <1, 0, 1, 1> or <-1, 0, -1, -1> + { + mask32v signMask = xyzwNormal < yNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( 
Primes::W ) ); + + float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dyBase ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimed, yPrimedBase, zPrimed, wPrimed ), dxBase - offset1, dyBase - offset0, dzBase - offset1, dwBase - offset1, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <0, 1, 1, 1> or <0, -1, -1, -1> + { + mask32v signMask = xyzwNormal < xNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dxBase ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimedBase, yPrimed, zPrimed, wPrimed ), dxBase - offset0, dyBase - offset1, dzBase - offset1, dwBase - offset1, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <1, 0, 0, 0> or <-1, 0, 0, 0> + { + mask32v signMask = xNormal < float32v( 0 ); + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + + float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, 
falloffBaseStemA ) + ( sign ^ dxBase ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimed, yPrimedBase, zPrimedBase, wPrimedBase ), dxBase - offset1, dyBase - offset0, dzBase - offset0, dwBase - offset0, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <1, 1, 0, 0> or <-1, -1, 0, 0> + { + mask32v signMask = xNormal < -yNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + + float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dxBase + dyBase ) ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimed, yPrimed, zPrimedBase, wPrimedBase ), dxBase - offset1, dyBase - offset1, dzBase - offset0, dwBase - offset0, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <1, 0, 1, 0> or <-1, 0, -1, 0> + { + mask32v signMask = xNormal < -zNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dxBase + dzBase ) ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimed, yPrimedBase, zPrimed, wPrimedBase ), dxBase - 
offset1, dyBase - offset0, dzBase - offset1, dwBase - offset0, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <1, 0, 0, 1> or <-1, 0, 0, -1> + { + mask32v signMask = xNormal < -wNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dxBase + dwBase ) ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimed, yPrimedBase, zPrimedBase, wPrimed ), dxBase - offset1, dyBase - offset0, dzBase - offset0, dwBase - offset1, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <0, 1, 0, 0> or <0, -1, 0, 0> + { + mask32v signMask = yNormal < float32v( 0 ); + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + + float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dyBase ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimedBase, yPrimed, zPrimedBase, wPrimedBase ), dxBase - offset0, dyBase - offset1, dzBase - offset0, dwBase - offset0, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <0, 1, 1, 0> or <0, -1, -1, 0> + { + mask32v signMask = yNormal < -zNormal; + float32v sign = FS::Masked( 
signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dyBase + dzBase ) ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimedBase, yPrimed, zPrimed, wPrimedBase ), dxBase - offset0, dyBase - offset1, dzBase - offset1, dwBase - offset0, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <0, 1, 0, 1> or <0, -1, 0, -1> + { + mask32v signMask = yNormal < -wNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dyBase + dwBase ) ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimedBase, yPrimed, zPrimedBase, wPrimed ), dxBase - offset0, dyBase - offset1, dzBase - offset0, dwBase - offset1, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <0, 0, 1, 0> or <0, 0, -1, 0> + { + mask32v signMask = zNormal < float32v( 0 ); + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v 
offset1 = float32v( kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dzBase ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimed, wPrimedBase ), dxBase - offset0, dyBase - offset0, dzBase - offset1, dwBase - offset0, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <0, 0, 1, 1> or <0, 0, -1, -1> + { + mask32v signMask = zNormal < -wNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dzBase + dwBase ) ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimed, wPrimed ), dxBase - offset0, dyBase - offset0, dzBase - offset1, dwBase - offset1, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + // Vertex <0, 0, 0, 1> or <0, 0, 0, -1> + { + mask32v signMask = wNormal < float32v( 0 ); + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( kUnskew4 ) ^ sign; + + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dwBase ), float32v( 0.0f ) ); + + ApplyVectorContributionSimplex( HashPrimes( seed, 
xPrimedBase, yPrimedBase, zPrimedBase, wPrimed ), dxBase - offset0, dyBase - offset0, dzBase - offset0, dwBase - offset1, + ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), valueX, valueY, valueZ, valueW ); + } + + constexpr double kBounding = constexpr( Scheme == VectorizationScheme::GradientOuterProduct ) ? + 115.21625311930542 / 1.4142135623730951 : + 48.80058117543753; + + warpAmp *= float32v( kBounding ); + xOut = FS::FMulAdd( valueX, warpAmp, xOut ); + yOut = FS::FMulAdd( valueY, warpAmp, yOut ); + zOut = FS::FMulAdd( valueZ, warpAmp, zOut ); + + float32v warpLengthSq = FS::FMulAdd( valueW, valueW, FS::FMulAdd( valueZ, valueZ, FS::FMulAdd( valueY, valueY, valueX * valueX ) ) ); + return warpLengthSq * FS::InvSqrt( warpLengthSq ) * warpAmp; + } +}; diff --git a/include/FastNoise/Generators/Generator.h b/include/FastNoise/Generators/Generator.h index 19a9331..354d1cd 100644 --- a/include/FastNoise/Generators/Generator.h +++ b/include/FastNoise/Generators/Generator.h @@ -48,6 +48,30 @@ namespace FastNoise "Minkowski", }; + enum class SimplexType + { + Standard, + Smooth + }; + + constexpr static const char* kSimplexType_Strings[] = + { + "Standard", + "Smooth", + }; + + enum class VectorizationScheme + { + OrthogonalGradientMatrix, + GradientOuterProduct + }; + + constexpr static const char* kVectorizationScheme_Strings[] = + { + "Orthogonal Gradient Matrix", + "Gradient Outer Product", + }; + struct OutputMinMax { float min = INFINITY; diff --git a/include/FastNoise/Generators/Perlin.inl b/include/FastNoise/Generators/Perlin.inl index 6f19b47..edaa6f4 100644 --- a/include/FastNoise/Generators/Perlin.inl +++ b/include/FastNoise/Generators/Perlin.inl @@ -26,8 +26,8 @@ class FastSIMD::DispatchClass final : public virtual Fa constexpr float kBounding = 0.579106986522674560546875f; return this->ScaleOutput( Lerp( - Lerp( GetGradientDot( HashPrimes( seed, x0, y0 ), xf0, yf0 ), GetGradientDot( HashPrimes( seed, x1, y0 ), xf1, yf0 ), xs ), - Lerp( 
GetGradientDot( HashPrimes( seed, x0, y1 ), xf0, yf1 ), GetGradientDot( HashPrimes( seed, x1, y1 ), xf1, yf1 ), xs ), ys ), + Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y0 ), xf0, yf0 ), GetGradientDotPerlin( HashPrimes( seed, x1, y0 ), xf1, yf0 ), xs ), + Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y1 ), xf0, yf1 ), GetGradientDotPerlin( HashPrimes( seed, x1, y1 ), xf1, yf1 ), xs ), ys ), -1 / kBounding, 1 / kBounding ); } @@ -60,11 +60,11 @@ class FastSIMD::DispatchClass final : public virtual Fa constexpr float kBounding = 0.964921414852142333984375f; return this->ScaleOutput( Lerp( Lerp( - Lerp( GetGradientDot( HashPrimes( seed, x0, y0, z0 ), xf0, yf0, zf0 ), GetGradientDot( HashPrimes( seed, x1, y0, z0 ), xf1, yf0, zf0 ), xs ), - Lerp( GetGradientDot( HashPrimes( seed, x0, y1, z0 ), xf0, yf1, zf0 ), GetGradientDot( HashPrimes( seed, x1, y1, z0 ), xf1, yf1, zf0 ), xs ), ys ), + Lerp( GetGradientDotCommon( HashPrimes( seed, x0, y0, z0 ), xf0, yf0, zf0 ), GetGradientDotCommon( HashPrimes( seed, x1, y0, z0 ), xf1, yf0, zf0 ), xs ), + Lerp( GetGradientDotCommon( HashPrimes( seed, x0, y1, z0 ), xf0, yf1, zf0 ), GetGradientDotCommon( HashPrimes( seed, x1, y1, z0 ), xf1, yf1, zf0 ), xs ), ys ), Lerp( - Lerp( GetGradientDot( HashPrimes( seed, x0, y0, z1 ), xf0, yf0, zf1 ), GetGradientDot( HashPrimes( seed, x1, y0, z1 ), xf1, yf0, zf1 ), xs ), - Lerp( GetGradientDot( HashPrimes( seed, x0, y1, z1 ), xf0, yf1, zf1 ), GetGradientDot( HashPrimes( seed, x1, y1, z1 ), xf1, yf1, zf1 ), xs ), ys ), zs ), + Lerp( GetGradientDotCommon( HashPrimes( seed, x0, y0, z1 ), xf0, yf0, zf1 ), GetGradientDotCommon( HashPrimes( seed, x1, y0, z1 ), xf1, yf0, zf1 ), xs ), + Lerp( GetGradientDotCommon( HashPrimes( seed, x0, y1, z1 ), xf0, yf1, zf1 ), GetGradientDotCommon( HashPrimes( seed, x1, y1, z1 ), xf1, yf1, zf1 ), xs ), ys ), zs ), -1 / kBounding, 1 / kBounding ); } @@ -103,17 +103,17 @@ class FastSIMD::DispatchClass final : public virtual Fa constexpr float kBounding = 
0.964921414852142333984375f; return this->ScaleOutput( Lerp( Lerp( Lerp( - Lerp( GetGradientDot( HashPrimes( seed, x0, y0, z0, w0 ), xf0, yf0, zf0, wf0 ), GetGradientDot( HashPrimes( seed, x1, y0, z0, w0 ), xf1, yf0, zf0, wf0 ), xs ), - Lerp( GetGradientDot( HashPrimes( seed, x0, y1, z0, w0 ), xf0, yf1, zf0, wf0 ), GetGradientDot( HashPrimes( seed, x1, y1, z0, w0 ), xf1, yf1, zf0, wf0 ), xs ), ys ), + Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y0, z0, w0 ), xf0, yf0, zf0, wf0 ), GetGradientDotPerlin( HashPrimes( seed, x1, y0, z0, w0 ), xf1, yf0, zf0, wf0 ), xs ), + Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y1, z0, w0 ), xf0, yf1, zf0, wf0 ), GetGradientDotPerlin( HashPrimes( seed, x1, y1, z0, w0 ), xf1, yf1, zf0, wf0 ), xs ), ys ), Lerp( - Lerp( GetGradientDot( HashPrimes( seed, x0, y0, z1, w0 ), xf0, yf0, zf1, wf0 ), GetGradientDot( HashPrimes( seed, x1, y0, z1, w0 ), xf1, yf0, zf1, wf0 ), xs ), - Lerp( GetGradientDot( HashPrimes( seed, x0, y1, z1, w0 ), xf0, yf1, zf1, wf0 ), GetGradientDot( HashPrimes( seed, x1, y1, z1, w0 ), xf1, yf1, zf1, wf0 ), xs ), ys ), zs ), + Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y0, z1, w0 ), xf0, yf0, zf1, wf0 ), GetGradientDotPerlin( HashPrimes( seed, x1, y0, z1, w0 ), xf1, yf0, zf1, wf0 ), xs ), + Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y1, z1, w0 ), xf0, yf1, zf1, wf0 ), GetGradientDotPerlin( HashPrimes( seed, x1, y1, z1, w0 ), xf1, yf1, zf1, wf0 ), xs ), ys ), zs ), Lerp( Lerp( - Lerp( GetGradientDot( HashPrimes( seed, x0, y0, z0, w1 ), xf0, yf0, zf0, wf1 ), GetGradientDot( HashPrimes( seed, x1, y0, z0, w1 ), xf1, yf0, zf0, wf1 ), xs ), - Lerp( GetGradientDot( HashPrimes( seed, x0, y1, z0, w1 ), xf0, yf1, zf0, wf1 ), GetGradientDot( HashPrimes( seed, x1, y1, z0, w1 ), xf1, yf1, zf0, wf1 ), xs ), ys ), + Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y0, z0, w1 ), xf0, yf0, zf0, wf1 ), GetGradientDotPerlin( HashPrimes( seed, x1, y0, z0, w1 ), xf1, yf0, zf0, wf1 ), xs ), + Lerp( 
GetGradientDotPerlin( HashPrimes( seed, x0, y1, z0, w1 ), xf0, yf1, zf0, wf1 ), GetGradientDotPerlin( HashPrimes( seed, x1, y1, z0, w1 ), xf1, yf1, zf0, wf1 ), xs ), ys ), Lerp( - Lerp( GetGradientDot( HashPrimes( seed, x0, y0, z1, w1 ), xf0, yf0, zf1, wf1 ), GetGradientDot( HashPrimes( seed, x1, y0, z1, w1 ), xf1, yf0, zf1, wf1 ), xs ), - Lerp( GetGradientDot( HashPrimes( seed, x0, y1, z1, w1 ), xf0, yf1, zf1, wf1 ), GetGradientDot( HashPrimes( seed, x1, y1, z1, w1 ), xf1, yf1, zf1, wf1 ), xs ), ys ), zs ), ws ), + Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y0, z1, w1 ), xf0, yf0, zf1, wf1 ), GetGradientDotPerlin( HashPrimes( seed, x1, y0, z1, w1 ), xf1, yf0, zf1, wf1 ), xs ), + Lerp( GetGradientDotPerlin( HashPrimes( seed, x0, y1, z1, w1 ), xf0, yf1, zf1, wf1 ), GetGradientDotPerlin( HashPrimes( seed, x1, y1, z1, w1 ), xf1, yf1, zf1, wf1 ), xs ), ys ), zs ), ws ), -1 / kBounding, 1 / kBounding ); } }; diff --git a/include/FastNoise/Generators/Simplex.h b/include/FastNoise/Generators/Simplex.h index cbed810..f56949b 100644 --- a/include/FastNoise/Generators/Simplex.h +++ b/include/FastNoise/Generators/Simplex.h @@ -6,7 +6,11 @@ namespace FastNoise class Simplex : public virtual VariableRange { public: + void SetType( SimplexType value ) { mType = value; } const Metadata& GetMetadata() const override; + + protected: + SimplexType mType = SimplexType::Standard; }; #ifdef FASTNOISE_METADATA @@ -22,52 +26,12 @@ namespace FastNoise description = "Smooth gradient noise from an N dimensional simplex grid\n" "Developed by Ken Perlin in 2001"; - } - }; -#endif - - class OpenSimplex2 : public virtual VariableRange - { - public: - const Metadata& GetMetadata() const override; - }; - -#ifdef FASTNOISE_METADATA - template<> - struct MetadataT : MetadataT> - { - SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override; - - MetadataT() - { - groups.push_back( "Coherent Noise" ); - - description = - "Smooth gradient noise from an N dimensional simplex grid, alternate 
implementation\n" - "Developed by K.jpg in 2019"; - } - }; -#endif - - class OpenSimplex2S : public virtual VariableRange - { - public: - const Metadata& GetMetadata() const override; - }; - -#ifdef FASTNOISE_METADATA - template<> - struct MetadataT : MetadataT> - { - SmartNode<> CreateNode( FastSIMD::FeatureSet ) const override; - - MetadataT() - { - groups.push_back( "Coherent Noise" ); - description = - "Smoother gradient noise from an N dimensional simplex grid\n" - "Developed by K.jpg in 2017"; + this->AddVariableEnum( + { "Type", "Noise character style" }, + SimplexType::Standard, &Simplex::SetType, + kSimplexType_Strings + ); } }; #endif diff --git a/include/FastNoise/Generators/Simplex.inl b/include/FastNoise/Generators/Simplex.inl index 3209d60..d35fc15 100644 --- a/include/FastNoise/Generators/Simplex.inl +++ b/include/FastNoise/Generators/Simplex.inl @@ -4,542 +4,951 @@ template class FastSIMD::DispatchClass final : public virtual FastNoise::Simplex, public FastSIMD::DispatchClass, SIMD> { - float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const + float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const final { - this->ScalePositions( x, y ); - - const float SQRT3 = 1.7320508075688772935274463415059f; - const float F2 = 0.5f * (SQRT3 - 1.0f); - const float G2 = (3.0f - SQRT3) / 6.0f; - - float32v f = float32v( F2 ) * (x + y); - float32v x0 = FS::Floor( x + f ); - float32v y0 = FS::Floor( y + f ); - - int32v i = FS::Convert( x0 ) * int32v( Primes::X ); - int32v j = FS::Convert( y0 ) * int32v( Primes::Y ); - - float32v g = float32v( G2 ) * (x0 + y0); - x0 = x - (x0 - g); - y0 = y - (y0 - g); - - mask32v i1 = x0 > y0; - //mask32v j1 = ~i1; //InvMasked funcs - - float32v x1 = FS::MaskedSub( i1, x0, float32v( 1.f ) ) + float32v( G2 ); - float32v y1 = FS::InvMaskedSub( i1, y0, float32v( 1.f ) ) + float32v( G2 ); - - float32v x2 = x0 + float32v( G2 * 2 - 1 ); - float32v y2 = y0 + float32v( G2 * 2 - 1 ); - - float32v t0 = 
FS::FNMulAdd( x0, x0, FS::FNMulAdd( y0, y0, float32v( 0.5f ) ) ); - float32v t1 = FS::FNMulAdd( x1, x1, FS::FNMulAdd( y1, y1, float32v( 0.5f ) ) ); - float32v t2 = FS::FNMulAdd( x2, x2, FS::FNMulAdd( y2, y2, float32v( 0.5f ) ) ); - - t0 = FS::Max( t0, float32v( 0 ) ); - t1 = FS::Max( t1, float32v( 0 ) ); - t2 = FS::Max( t2, float32v( 0 ) ); + switch( mType ) { + case SimplexType::Standard: + return Gen_Standard( seed, x, y ); + case SimplexType::Smooth: + return Gen_Smooth( seed, x, y ); + } + } - t0 *= t0; t0 *= t0; - t1 *= t1; t1 *= t1; - t2 *= t2; t2 *= t2; + float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const final + { + switch( mType ) { + case SimplexType::Standard: + return Gen_Standard( seed, x, y, z ); + case SimplexType::Smooth: + return Gen_Smooth( seed, x, y, z ); + } + } - float32v n0 = GetGradientDot( HashPrimes( seed, i, j ), x0, y0 ); - float32v n1 = GetGradientDot( HashPrimes( seed, FS::MaskedAdd( i1, i, int32v( Primes::X ) ), FS::InvMaskedAdd( i1, j, int32v( Primes::Y ) ) ), x1, y1 ); - float32v n2 = GetGradientDot( HashPrimes( seed, i + int32v( Primes::X ), j + int32v( Primes::Y ) ), x2, y2 ); + float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const final + { + switch( mType ) { + case SimplexType::Standard: + return Gen_Standard( seed, x, y, z, w ); + case SimplexType::Smooth: + return Gen_Smooth( seed, x, y, z, w ); + } + } - constexpr float kBounding = 38.283687591552734375f; + float32v FS_VECTORCALL Gen_Standard( int32v seed, float32v x, float32v y ) const + { + this->ScalePositions( x, y ); - return this->ScaleOutput( FS::FMulAdd( n0, t0, FS::FMulAdd( n1, t1, n2 * t2 ) ), + constexpr double kRoot3 = 1.7320508075688772935274463415059; + constexpr double kSkew2 = 1.0 / ( kRoot3 + 1.0 ); + constexpr double kUnskew2 = -1.0 / ( kRoot3 + 3.0 ); + constexpr double kFalloffRadiusSquared = 0.5; + + float32v skewDelta = float32v( kSkew2 ) * ( x + y ); + float32v xSkewed = x + 
skewDelta; + float32v ySkewed = y + skewDelta; + + float32v xSkewedBase = FS::Floor( xSkewed ); + float32v ySkewedBase = FS::Floor( ySkewed ); + float32v dxSkewed = xSkewed - xSkewedBase; + float32v dySkewed = ySkewed - ySkewedBase; + + int32v xPrimedBase = FS::Convert( xSkewedBase ) * int32v( Primes::X ); + int32v yPrimedBase = FS::Convert( ySkewedBase ) * int32v( Primes::Y ); + + mask32v xGreaterEqualY = dxSkewed >= dySkewed; + + float32v unskewDelta = float32v( kUnskew2 ) * ( dxSkewed + dySkewed ); + float32v dx0 = dxSkewed + unskewDelta; + float32v dy0 = dySkewed + unskewDelta; + + float32v dx1 = FS::MaskedIncrement( ~xGreaterEqualY, dx0 ) - float32v( kUnskew2 + 1 ); + float32v dy1 = FS::MaskedIncrement( xGreaterEqualY, dy0 ) - float32v( kUnskew2 + 1 ); + float32v dx2 = dx0 - float32v( kUnskew2 * 2 + 1 ); + float32v dy2 = dy0 - float32v( kUnskew2 * 2 + 1 ); + + float32v falloff0 = FS::FNMulAdd( dx0, dx0, FS::FNMulAdd( dy0, dy0, float32v( kFalloffRadiusSquared ) ) ); + float32v falloff1 = FS::FNMulAdd( dx1, dx1, FS::FNMulAdd( dy1, dy1, float32v( kFalloffRadiusSquared ) ) ); + float32v falloff2 = falloff0 + FS::FMulAdd( unskewDelta, + float32v( -4.0 * ( kRoot3 + 2.0 ) / ( kRoot3 + 3.0 ) ), + float32v( -2.0 / 3.0 ) ); + + falloff0 = FS::Max( falloff0, float32v( 0 ) ); + falloff1 = FS::Max( falloff1, float32v( 0 ) ); + falloff2 = FS::Max( falloff2, float32v( 0 ) ); + + falloff0 *= falloff0; falloff0 *= falloff0; + falloff1 *= falloff1; falloff1 *= falloff1; + falloff2 *= falloff2; falloff2 *= falloff2; + + float32v gradientRampValue0 = GetGradientDotSimplex( HashPrimes( seed, xPrimedBase, yPrimedBase ), dx0, dy0 ); + float32v gradientRampValue1 = GetGradientDotSimplex( HashPrimes( seed, FS::MaskedAdd( xGreaterEqualY, xPrimedBase, int32v( Primes::X ) ), FS::InvMaskedAdd( xGreaterEqualY, yPrimedBase, int32v( Primes::Y ) ) ), dx1, dy1 ); + float32v gradientRampValue2 = GetGradientDotSimplex( HashPrimes( seed, xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( 
Primes::Y ) ), dx2, dy2 ); + + constexpr double kBounding = 49.918426513671875; + + return this->ScaleOutput( FS::FMulAdd( gradientRampValue0, falloff0, FS::FMulAdd( gradientRampValue1, falloff1, gradientRampValue2 * falloff2 ) ), -1 / kBounding, 1 / kBounding ); } - float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const + float32v FS_VECTORCALL Gen_Standard( int32v seed, float32v x, float32v y, float32v z ) const { this->ScalePositions( x, y, z ); - const float F3 = 1.0f / 3.0f; - const float G3 = 1.0f / 2.0f; - - float32v s = float32v( F3 ) * (x + y + z); - x += s; - y += s; - z += s; - - float32v x0 = FS::Floor( x ); - float32v y0 = FS::Floor( y ); - float32v z0 = FS::Floor( z ); - float32v xi = x - x0; - float32v yi = y - y0; - float32v zi = z - z0; - - int32v i = FS::Convert( x0 ) * int32v( Primes::X ); - int32v j = FS::Convert( y0 ) * int32v( Primes::Y ); - int32v k = FS::Convert( z0 ) * int32v( Primes::Z ); - - mask32v x_ge_y = xi >= yi; - mask32v y_ge_z = yi >= zi; - mask32v x_ge_z = xi >= zi; - - float32v g = float32v( G3 ) * (xi + yi + zi); - x0 = xi - g; - y0 = yi - g; - z0 = zi - g; - - mask32v i1 = x_ge_y & x_ge_z; - mask32v j1 = FS::BitwiseAndNot( y_ge_z, x_ge_y ); - mask32v k1 = FS::BitwiseAndNot( ~x_ge_z, y_ge_z ); - - mask32v i2 = x_ge_y | x_ge_z; - mask32v j2 = ~x_ge_y | y_ge_z; - mask32v k2 = x_ge_z & y_ge_z; //InvMasked - - float32v x1 = FS::MaskedSub( i1, x0, float32v( 1 ) ) + float32v( G3 ); - float32v y1 = FS::MaskedSub( j1, y0, float32v( 1 ) ) + float32v( G3 ); - float32v z1 = FS::MaskedSub( k1, z0, float32v( 1 ) ) + float32v( G3 ); - float32v x2 = FS::MaskedSub( i2, x0, float32v( 1 ) ) + float32v( G3 * 2 ); - float32v y2 = FS::MaskedSub( j2, y0, float32v( 1 ) ) + float32v( G3 * 2 ); - float32v z2 = FS::InvMaskedSub( k2, z0, float32v( 1 ) ) + float32v( G3 * 2 ); - float32v x3 = x0 + float32v( G3 * 3 - 1 ); - float32v y3 = y0 + float32v( G3 * 3 - 1 ); - float32v z3 = z0 + float32v( G3 * 3 - 1 ); - - float32v t0 = 
FS::FNMulAdd( x0, x0, FS::FNMulAdd( y0, y0, FS::FNMulAdd( z0, z0, float32v( 0.6f ) ) ) ); - float32v t1 = FS::FNMulAdd( x1, x1, FS::FNMulAdd( y1, y1, FS::FNMulAdd( z1, z1, float32v( 0.6f ) ) ) ); - float32v t2 = FS::FNMulAdd( x2, x2, FS::FNMulAdd( y2, y2, FS::FNMulAdd( z2, z2, float32v( 0.6f ) ) ) ); - float32v t3 = FS::FNMulAdd( x3, x3, FS::FNMulAdd( y3, y3, FS::FNMulAdd( z3, z3, float32v( 0.6f ) ) ) ); - - t0 = FS::Max( t0, float32v( 0 ) ); - t1 = FS::Max( t1, float32v( 0 ) ); - t2 = FS::Max( t2, float32v( 0 ) ); - t3 = FS::Max( t3, float32v( 0 ) ); - - t0 *= t0; t0 *= t0; - t1 *= t1; t1 *= t1; - t2 *= t2; t2 *= t2; - t3 *= t3; t3 *= t3; - - float32v n0 = GetGradientDot( HashPrimes( seed, i, j, k ), x0, y0, z0 ); - float32v n1 = GetGradientDot( HashPrimes( seed, FS::MaskedAdd( i1, i, int32v( Primes::X ) ), FS::MaskedAdd( j1, j, int32v( Primes::Y ) ), FS::MaskedAdd( k1, k, int32v( Primes::Z ) ) ), x1, y1, z1 ); - float32v n2 = GetGradientDot( HashPrimes( seed, FS::MaskedAdd( i2, i, int32v( Primes::X ) ), FS::MaskedAdd( j2, j, int32v( Primes::Y ) ), FS::InvMaskedAdd( k2, k, int32v( Primes::Z ) ) ), x2, y2, z2 ); - float32v n3 = GetGradientDot( HashPrimes( seed, i + int32v( Primes::X ), j + int32v( Primes::Y ), k + int32v( Primes::Z ) ), x3, y3, z3 ); - - constexpr float kBounding = 32.69428253173828125f; - - return this->ScaleOutput( FS::FMulAdd( n0, t0, FS::FMulAdd( n1, t1, FS::FMulAdd( n2, t2, n3 * t3 ) ) ), + constexpr double kSkew3 = 1.0 / 3.0; + constexpr double kReflectUnskew3 = -1.0 / 2.0; + constexpr double kFalloffRadiusSquared = 0.6; + + float32v skewDelta = float32v( kSkew3 ) * ( x + y + z ); + float32v xSkewed = x + skewDelta; + float32v ySkewed = y + skewDelta; + float32v zSkewed = z + skewDelta; + + float32v xSkewedBase = FS::Floor( xSkewed ); + float32v ySkewedBase = FS::Floor( ySkewed ); + float32v zSkewedBase = FS::Floor( zSkewed ); + float32v dxSkewed = xSkewed - xSkewedBase; + float32v dySkewed = ySkewed - ySkewedBase; + float32v dzSkewed = 
zSkewed - zSkewedBase; + + int32v xPrimedBase = FS::Convert( xSkewedBase ) * int32v( Primes::X ); + int32v yPrimedBase = FS::Convert( ySkewedBase ) * int32v( Primes::Y ); + int32v zPrimedBase = FS::Convert( zSkewedBase ) * int32v( Primes::Z ); + + mask32v xGreaterEqualY = dxSkewed >= dySkewed; + mask32v yGreaterEqualZ = dySkewed >= dzSkewed; + mask32v xGreaterEqualZ = dxSkewed >= dzSkewed; + + float32v unskewDelta = float32v( kReflectUnskew3 ) * ( dxSkewed + dySkewed + dzSkewed ); + float32v dx0 = dxSkewed + unskewDelta; + float32v dy0 = dySkewed + unskewDelta; + float32v dz0 = dzSkewed + unskewDelta; + + mask32v maskX1 = xGreaterEqualY & xGreaterEqualZ; + mask32v maskY1 = FS::BitwiseAndNot( yGreaterEqualZ, xGreaterEqualY ); + mask32v maskZ1 = FS::BitwiseAndNot( ~xGreaterEqualZ, yGreaterEqualZ ); + + mask32v nMaskX2 = ~( xGreaterEqualY | xGreaterEqualZ ); + mask32v nMaskY2 = xGreaterEqualY & ~yGreaterEqualZ; + mask32v nMaskZ2 = xGreaterEqualZ & yGreaterEqualZ; + + float32v dx3 = dx0 - float32v( kReflectUnskew3 * 3 + 1 ); + float32v dy3 = dy0 - float32v( kReflectUnskew3 * 3 + 1 ); + float32v dz3 = dz0 - float32v( kReflectUnskew3 * 3 + 1 ); + float32v dx1 = FS::MaskedSub( maskX1, dx3, float32v( 1 ) ); // kReflectUnskew3 * 3 + 1 = kReflectUnskew3, so dx0 - kReflectUnskew3 = dx3 + float32v dy1 = FS::MaskedSub( maskY1, dy3, float32v( 1 ) ); + float32v dz1 = FS::MaskedSub( maskZ1, dz3, float32v( 1 ) ); + float32v dx2 = FS::MaskedIncrement( nMaskX2, dx0 ); // kReflectUnskew3 * 2 + 1 = 0, so dx0 - ( kReflectUnskew3 * 2 + 1 ) = dx0 + float32v dy2 = FS::MaskedIncrement( nMaskY2, dy0 ); + float32v dz2 = FS::MaskedIncrement( nMaskZ2, dz0 ); + + float32v falloff0 = FS::FNMulAdd( dz0, dz0, FS::FNMulAdd( dy0, dy0, FS::FNMulAdd( dx0, dx0, float32v( kFalloffRadiusSquared ) ) ) ); + float32v falloff1 = FS::FNMulAdd( dz1, dz1, FS::FNMulAdd( dy1, dy1, FS::FNMulAdd( dx1, dx1, float32v( kFalloffRadiusSquared ) ) ) ); + float32v falloff2 = FS::FNMulAdd( dz2, dz2, FS::FNMulAdd( dy2, dy2, 
FS::FNMulAdd( dx2, dx2, float32v( kFalloffRadiusSquared ) ) ) ); + float32v falloff3 = falloff0 - ( unskewDelta + float32v( 3.0 / 4.0 ) ); + + falloff0 = FS::Max( falloff0, float32v( 0 ) ); + falloff1 = FS::Max( falloff1, float32v( 0 ) ); + falloff2 = FS::Max( falloff2, float32v( 0 ) ); + falloff3 = FS::Max( falloff3, float32v( 0 ) ); + + falloff0 *= falloff0; falloff0 *= falloff0; + falloff1 *= falloff1; falloff1 *= falloff1; + falloff2 *= falloff2; falloff2 *= falloff2; + falloff3 *= falloff3; falloff3 *= falloff3; + + float32v gradientRampValue0 = GetGradientDotCommon( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase ), dx0, dy0, dz0 ); + float32v gradientRampValue1 = GetGradientDotCommon( HashPrimes( seed, FS::MaskedAdd( maskX1, xPrimedBase, int32v( Primes::X ) ), FS::MaskedAdd( maskY1, yPrimedBase, int32v( Primes::Y ) ), FS::MaskedAdd( maskZ1, zPrimedBase, int32v( Primes::Z ) ) ), dx1, dy1, dz1 ); + float32v gradientRampValue2 = GetGradientDotCommon( HashPrimes( seed, FS::InvMaskedAdd( nMaskX2, xPrimedBase, int32v( Primes::X ) ), FS::InvMaskedAdd( nMaskY2, yPrimedBase, int32v( Primes::Y ) ), FS::InvMaskedAdd( nMaskZ2, zPrimedBase, int32v( Primes::Z ) ) ), dx2, dy2, dz2 ); + float32v gradientRampValue3 = GetGradientDotCommon( HashPrimes( seed, xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ), zPrimedBase + int32v( Primes::Z ) ), dx3, dy3, dz3 ); + + constexpr double kBounding = 32.69428253173828125; + + return this->ScaleOutput( FS::FMulAdd( gradientRampValue3, falloff3, FS::FMulAdd( gradientRampValue2, falloff2, FS::FMulAdd( gradientRampValue1, falloff1, gradientRampValue0 * falloff0 ) ) ), -1 / kBounding, 1 / kBounding ); } - float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z, float32v w ) const + float32v FS_VECTORCALL Gen_Standard( int32v seed, float32v x, float32v y, float32v z, float32v w ) const { this->ScalePositions( x, y, z, w ); - const float SQRT5 = 2.236067977499f; - const float F4 = (SQRT5 - 
1.0f) / 4.0f; - const float G4 = (5.0f - SQRT5) / 20.0f; - - float32v s = float32v( F4 ) * (x + y + z + w); - x += s; - y += s; - z += s; - w += s; - - float32v x0 = FS::Floor( x ); - float32v y0 = FS::Floor( y ); - float32v z0 = FS::Floor( z ); - float32v w0 = FS::Floor( w ); - float32v xi = x - x0; - float32v yi = y - y0; - float32v zi = z - z0; - float32v wi = w - w0; - - int32v i = FS::Convert( x0 ) * int32v( Primes::X ); - int32v j = FS::Convert( y0 ) * int32v( Primes::Y ); - int32v k = FS::Convert( z0 ) * int32v( Primes::Z ); - int32v l = FS::Convert( w0 ) * int32v( Primes::W ); - - float32v g = float32v( G4 ) * (xi + yi + zi + wi); - x0 = xi - g; - y0 = yi - g; - z0 = zi - g; - w0 = wi - g; - - int32v rankx( 0 ); - int32v ranky( 0 ); - int32v rankz( 0 ); - int32v rankw( 0 ); - - mask32v x_ge_y = x0 >= y0; - rankx = FS::MaskedIncrement( x_ge_y, rankx ); - ranky = FS::MaskedIncrement( ~x_ge_y, ranky ); - - mask32v x_ge_z = x0 >= z0; - rankx = FS::MaskedIncrement( x_ge_z, rankx ); - rankz = FS::MaskedIncrement( ~x_ge_z, rankz ); - - mask32v x_ge_w = x0 >= w0; - rankx = FS::MaskedIncrement( x_ge_w, rankx ); - rankw = FS::MaskedIncrement( ~x_ge_w, rankw ); - - mask32v y_ge_z = y0 >= z0; - ranky = FS::MaskedIncrement( y_ge_z, ranky ); - rankz = FS::MaskedIncrement( ~y_ge_z, rankz ); - - mask32v y_ge_w = y0 >= w0; - ranky = FS::MaskedIncrement( y_ge_w, ranky ); - rankw = FS::MaskedIncrement( ~y_ge_w, rankw ); - - mask32v z_ge_w = z0 >= w0; - rankz = FS::MaskedIncrement( z_ge_w, rankz ); - rankw = FS::MaskedIncrement( ~z_ge_w, rankw ); - - mask32v i1 = rankx > int32v( 2 ); - mask32v j1 = ranky > int32v( 2 ); - mask32v k1 = rankz > int32v( 2 ); - mask32v l1 = rankw > int32v( 2 ); - - mask32v i2 = rankx > int32v( 1 ); - mask32v j2 = ranky > int32v( 1 ); - mask32v k2 = rankz > int32v( 1 ); - mask32v l2 = rankw > int32v( 1 ); - - mask32v i3 = rankx > int32v( 0 ); - mask32v j3 = ranky > int32v( 0 ); - mask32v k3 = rankz > int32v( 0 ); - mask32v l3 = rankw > int32v( 0 ); 
- - float32v x1 = FS::MaskedSub( i1, x0, float32v( 1 ) ) + float32v( G4 ); - float32v y1 = FS::MaskedSub( j1, y0, float32v( 1 ) ) + float32v( G4 ); - float32v z1 = FS::MaskedSub( k1, z0, float32v( 1 ) ) + float32v( G4 ); - float32v w1 = FS::MaskedSub( l1, w0, float32v( 1 ) ) + float32v( G4 ); - float32v x2 = FS::MaskedSub( i2, x0, float32v( 1 ) ) + float32v( G4 * 2 ); - float32v y2 = FS::MaskedSub( j2, y0, float32v( 1 ) ) + float32v( G4 * 2 ); - float32v z2 = FS::MaskedSub( k2, z0, float32v( 1 ) ) + float32v( G4 * 2 ); - float32v w2 = FS::MaskedSub( l2, w0, float32v( 1 ) ) + float32v( G4 * 2 ); - float32v x3 = FS::MaskedSub( i3, x0, float32v( 1 ) ) + float32v( G4 * 3 ); - float32v y3 = FS::MaskedSub( j3, y0, float32v( 1 ) ) + float32v( G4 * 3 ); - float32v z3 = FS::MaskedSub( k3, z0, float32v( 1 ) ) + float32v( G4 * 3 ); - float32v w3 = FS::MaskedSub( l3, w0, float32v( 1 ) ) + float32v( G4 * 3 ); - float32v x4 = x0 + float32v( G4 * 4 - 1 ); - float32v y4 = y0 + float32v( G4 * 4 - 1 ); - float32v z4 = z0 + float32v( G4 * 4 - 1 ); - float32v w4 = w0 + float32v( G4 * 4 - 1 ); - - float32v t0 = FS::FNMulAdd( x0, x0, FS::FNMulAdd( y0, y0, FS::FNMulAdd( z0, z0, FS::FNMulAdd( w0, w0, float32v( 0.6f ) ) ) ) ); - float32v t1 = FS::FNMulAdd( x1, x1, FS::FNMulAdd( y1, y1, FS::FNMulAdd( z1, z1, FS::FNMulAdd( w1, w1, float32v( 0.6f ) ) ) ) ); - float32v t2 = FS::FNMulAdd( x2, x2, FS::FNMulAdd( y2, y2, FS::FNMulAdd( z2, z2, FS::FNMulAdd( w2, w2, float32v( 0.6f ) ) ) ) ); - float32v t3 = FS::FNMulAdd( x3, x3, FS::FNMulAdd( y3, y3, FS::FNMulAdd( z3, z3, FS::FNMulAdd( w3, w3, float32v( 0.6f ) ) ) ) ); - float32v t4 = FS::FNMulAdd( x4, x4, FS::FNMulAdd( y4, y4, FS::FNMulAdd( z4, z4, FS::FNMulAdd( w4, w4, float32v( 0.6f ) ) ) ) ); - - t0 = FS::Max( t0, float32v( 0 ) ); - t1 = FS::Max( t1, float32v( 0 ) ); - t2 = FS::Max( t2, float32v( 0 ) ); - t3 = FS::Max( t3, float32v( 0 ) ); - t4 = FS::Max( t4, float32v( 0 ) ); - - t0 *= t0; t0 *= t0; - t1 *= t1; t1 *= t1; - t2 *= t2; t2 *= t2; - 
t3 *= t3; t3 *= t3; - t4 *= t4; t4 *= t4; - - float32v n0 = GetGradientDot( HashPrimes( seed, i, j, k, l ), x0, y0, z0, w0 ); - float32v n1 = GetGradientDot( HashPrimes( seed, - FS::MaskedAdd( i1, i, int32v( Primes::X ) ), - FS::MaskedAdd( j1, j, int32v( Primes::Y ) ), - FS::MaskedAdd( k1, k, int32v( Primes::Z ) ), - FS::MaskedAdd( l1, l, int32v( Primes::W ) ) ), x1, y1, z1, w1 ); - float32v n2 = GetGradientDot( HashPrimes( seed, - FS::MaskedAdd( i2, i, int32v( Primes::X ) ), - FS::MaskedAdd( j2, j, int32v( Primes::Y ) ), - FS::MaskedAdd( k2, k, int32v( Primes::Z ) ), - FS::MaskedAdd( l2, l, int32v( Primes::W ) ) ), x2, y2, z2, w2 ); - float32v n3 = GetGradientDot( HashPrimes( seed, - FS::MaskedAdd( i3, i, int32v( Primes::X ) ), - FS::MaskedAdd( j3, j, int32v( Primes::Y ) ), - FS::MaskedAdd( k3, k, int32v( Primes::Z ) ), - FS::MaskedAdd( l3, l, int32v( Primes::W ) ) ), x3, y3, z3, w3 ); - float32v n4 = GetGradientDot( HashPrimes( seed, i + int32v( Primes::X ), j + int32v( Primes::Y ), k + int32v( Primes::Z ), l + int32v( Primes::W ) ), x4, y4, z4, w4 ); - - constexpr float kBounding = 27.f; - - return this->ScaleOutput( FS::FMulAdd( n0, t0, FS::FMulAdd( n1, t1, FS::FMulAdd( n2, t2, FS::FMulAdd( n3, t3, n4 * t4 ) ) ) ), + constexpr double kRoot5 = 2.2360679774997896964091736687313; + constexpr double kSkew4 = 1.0 / ( kRoot5 + 1.0 ); + constexpr double kUnskew4 = -1.0 / ( kRoot5 + 5.0 ); + constexpr double kFalloffRadiusSquared = 0.6; + + float32v skewDelta = float32v( kSkew4 ) * ( x + y + z + w ); + float32v xSkewed = x + skewDelta; + float32v ySkewed = y + skewDelta; + float32v zSkewed = z + skewDelta; + float32v wSkewed = w + skewDelta; + + float32v xSkewedBase = FS::Floor( xSkewed ); + float32v ySkewedBase = FS::Floor( ySkewed ); + float32v zSkewedBase = FS::Floor( zSkewed ); + float32v wSkewedBase = FS::Floor( wSkewed ); + float32v dxSkewed = xSkewed - xSkewedBase; + float32v dySkewed = ySkewed - ySkewedBase; + float32v dzSkewed = zSkewed - zSkewedBase; + 
float32v dwSkewed = wSkewed - wSkewedBase; + + int32v xPrimedBase = FS::Convert( xSkewedBase ) * int32v( Primes::X ); + int32v yPrimedBase = FS::Convert( ySkewedBase ) * int32v( Primes::Y ); + int32v zPrimedBase = FS::Convert( zSkewedBase ) * int32v( Primes::Z ); + int32v wPrimedBase = FS::Convert( wSkewedBase ) * int32v( Primes::W ); + + float32v unskewDelta = float32v( kUnskew4 ) * ( dxSkewed + dySkewed + dzSkewed + dwSkewed ); + float32v dx0 = dxSkewed + unskewDelta; + float32v dy0 = dySkewed + unskewDelta; + float32v dz0 = dzSkewed + unskewDelta; + float32v dw0 = dwSkewed + unskewDelta; + + int32v rankX( 0 ); + int32v rankY( 0 ); + int32v rankZ( 0 ); + int32v rankW( 0 ); + + mask32v xGreaterEqualY = dx0 >= dy0; + rankX = FS::MaskedIncrement( xGreaterEqualY, rankX ); + rankY = FS::MaskedIncrement( ~xGreaterEqualY, rankY ); + + mask32v xGreaterEqualZ = dx0 >= dz0; + rankX = FS::MaskedIncrement( xGreaterEqualZ, rankX ); + rankZ = FS::MaskedIncrement( ~xGreaterEqualZ, rankZ ); + + mask32v xGreaterEqualW = dx0 >= dw0; + rankX = FS::MaskedIncrement( xGreaterEqualW, rankX ); + rankW = FS::MaskedIncrement( ~xGreaterEqualW, rankW ); + + mask32v yGreaterEqualZ = dy0 >= dz0; + rankY = FS::MaskedIncrement( yGreaterEqualZ, rankY ); + rankZ = FS::MaskedIncrement( ~yGreaterEqualZ, rankZ ); + + mask32v yGreaterEqualW = dy0 >= dw0; + rankY = FS::MaskedIncrement( yGreaterEqualW, rankY ); + rankW = FS::MaskedIncrement( ~yGreaterEqualW, rankW ); + + mask32v zGreaterEqualW = dz0 >= dw0; + rankZ = FS::MaskedIncrement( zGreaterEqualW, rankZ ); + rankW = FS::MaskedIncrement( ~zGreaterEqualW, rankW ); + + mask32v maskX1 = rankX > int32v( 2 ); + mask32v maskY1 = rankY > int32v( 2 ); + mask32v maskZ1 = rankZ > int32v( 2 ); + mask32v maskW1 = rankW > int32v( 2 ); + + mask32v maskX2 = rankX > int32v( 1 ); + mask32v maskY2 = rankY > int32v( 1 ); + mask32v maskZ2 = rankZ > int32v( 1 ); + mask32v maskW2 = rankW > int32v( 1 ); + + mask32v maskX3 = rankX > int32v( 0 ); + mask32v maskY3 = rankY 
> int32v( 0 ); + mask32v maskZ3 = rankZ > int32v( 0 ); + mask32v maskW3 = rankW > int32v( 0 ); + + float32v dx1 = FS::MaskedSub( maskX1, dx0, float32v( 1 ) ) - float32v( kUnskew4 ); + float32v dy1 = FS::MaskedSub( maskY1, dy0, float32v( 1 ) ) - float32v( kUnskew4 ); + float32v dz1 = FS::MaskedSub( maskZ1, dz0, float32v( 1 ) ) - float32v( kUnskew4 ); + float32v dw1 = FS::MaskedSub( maskW1, dw0, float32v( 1 ) ) - float32v( kUnskew4 ); + float32v dx2 = FS::MaskedSub( maskX2, dx0, float32v( 1 ) ) - float32v( kUnskew4 * 2 ); + float32v dy2 = FS::MaskedSub( maskY2, dy0, float32v( 1 ) ) - float32v( kUnskew4 * 2 ); + float32v dz2 = FS::MaskedSub( maskZ2, dz0, float32v( 1 ) ) - float32v( kUnskew4 * 2 ); + float32v dw2 = FS::MaskedSub( maskW2, dw0, float32v( 1 ) ) - float32v( kUnskew4 * 2 ); + float32v dx3 = FS::MaskedSub( maskX3, dx0, float32v( 1 ) ) - float32v( kUnskew4 * 3 ); + float32v dy3 = FS::MaskedSub( maskY3, dy0, float32v( 1 ) ) - float32v( kUnskew4 * 3 ); + float32v dz3 = FS::MaskedSub( maskZ3, dz0, float32v( 1 ) ) - float32v( kUnskew4 * 3 ); + float32v dw3 = FS::MaskedSub( maskW3, dw0, float32v( 1 ) ) - float32v( kUnskew4 * 3 ); + float32v dx4 = dx0 - float32v( kUnskew4 * 4 + 1 ); + float32v dy4 = dy0 - float32v( kUnskew4 * 4 + 1 ); + float32v dz4 = dz0 - float32v( kUnskew4 * 4 + 1 ); + float32v dw4 = dw0 - float32v( kUnskew4 * 4 + 1 ); + + float32v falloff0 = FS::FNMulAdd( dw0, dw0, FS::FNMulAdd( dz0, dz0, FS::FNMulAdd( dy0, dy0, FS::FNMulAdd( dx0, dx0, float32v( kFalloffRadiusSquared ) ) ) ) ); + float32v falloff1 = FS::FNMulAdd( dw1, dw1, FS::FNMulAdd( dz1, dz1, FS::FNMulAdd( dy1, dy1, FS::FNMulAdd( dx1, dx1, float32v( kFalloffRadiusSquared ) ) ) ) ); + float32v falloff2 = FS::FNMulAdd( dw2, dw2, FS::FNMulAdd( dz2, dz2, FS::FNMulAdd( dy2, dy2, FS::FNMulAdd( dx2, dx2, float32v( kFalloffRadiusSquared ) ) ) ) ); + float32v falloff3 = FS::FNMulAdd( dw3, dw3, FS::FNMulAdd( dz3, dz3, FS::FNMulAdd( dy3, dy3, FS::FNMulAdd( dx3, dx3, float32v( kFalloffRadiusSquared ) ) 
) ) ); + float32v falloff4 = falloff0 + FS::FMulAdd( unskewDelta, + float32v( -4.0 * ( kRoot5 + 3.0 ) / ( kRoot5 + 5.0 ) ), + float32v( -4.0 / 5.0 ) ); + + falloff0 = FS::Max( falloff0, float32v( 0 ) ); + falloff1 = FS::Max( falloff1, float32v( 0 ) ); + falloff2 = FS::Max( falloff2, float32v( 0 ) ); + falloff3 = FS::Max( falloff3, float32v( 0 ) ); + falloff4 = FS::Max( falloff4, float32v( 0 ) ); + + falloff0 *= falloff0; falloff0 *= falloff0; + falloff1 *= falloff1; falloff1 *= falloff1; + falloff2 *= falloff2; falloff2 *= falloff2; + falloff3 *= falloff3; falloff3 *= falloff3; + falloff4 *= falloff4; falloff4 *= falloff4; + + float32v gradientRampValue0 = GetGradientDotSimplex( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase, wPrimedBase ), dx0, dy0, dz0, dw0 ); + float32v gradientRampValue1 = GetGradientDotSimplex( HashPrimes( seed, + FS::MaskedAdd( maskX1, xPrimedBase, int32v( Primes::X ) ), + FS::MaskedAdd( maskY1, yPrimedBase, int32v( Primes::Y ) ), + FS::MaskedAdd( maskZ1, zPrimedBase, int32v( Primes::Z ) ), + FS::MaskedAdd( maskW1, wPrimedBase, int32v( Primes::W ) ) ), dx1, dy1, dz1, dw1 ); + float32v gradientRampValue2 = GetGradientDotSimplex( HashPrimes( seed, + FS::MaskedAdd( maskX2, xPrimedBase, int32v( Primes::X ) ), + FS::MaskedAdd( maskY2, yPrimedBase, int32v( Primes::Y ) ), + FS::MaskedAdd( maskZ2, zPrimedBase, int32v( Primes::Z ) ), + FS::MaskedAdd( maskW2, wPrimedBase, int32v( Primes::W ) ) ), dx2, dy2, dz2, dw2 ); + float32v gradientRampValue3 = GetGradientDotSimplex( HashPrimes( seed, + FS::MaskedAdd( maskX3, xPrimedBase, int32v( Primes::X ) ), + FS::MaskedAdd( maskY3, yPrimedBase, int32v( Primes::Y ) ), + FS::MaskedAdd( maskZ3, zPrimedBase, int32v( Primes::Z ) ), + FS::MaskedAdd( maskW3, wPrimedBase, int32v( Primes::W ) ) ), dx3, dy3, dz3, dw3 ); + float32v gradientRampValue4 = GetGradientDotSimplex( HashPrimes( seed, + xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ), zPrimedBase + int32v( Primes::Z ), wPrimedBase + 
int32v( Primes::W ) ), + dx4, dy4, dz4, dw4 ); + + constexpr double kBounding = 33.653125584827855; + + return this->ScaleOutput( FS::FMulAdd( gradientRampValue0, falloff0, FS::FMulAdd( gradientRampValue1, falloff1, FS::FMulAdd( gradientRampValue2, falloff2, FS::FMulAdd( gradientRampValue3, falloff3, gradientRampValue4 * falloff4 ) ) ) ), -1 / kBounding, 1 / kBounding ); } -}; -template -class FastSIMD::DispatchClass final : public virtual FastNoise::OpenSimplex2, public FastSIMD::DispatchClass, SIMD> -{ - float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const + float32v FS_VECTORCALL Gen_Smooth( int32v seed, float32v x, float32v y ) const { this->ScalePositions( x, y ); - const float SQRT3 = 1.7320508075f; - const float F2 = 0.5f * (SQRT3 - 1.0f); - const float G2 = (3.0f - SQRT3) / 6.0f; - - float32v f = float32v( F2 ) * (x + y); - float32v x0 = FS::Floor( x + f ); - float32v y0 = FS::Floor( y + f ); - - int32v i = FS::Convert( x0 ) * int32v( Primes::X ); - int32v j = FS::Convert( y0 ) * int32v( Primes::Y ); + constexpr double kRoot3 = 1.7320508075688772935274463415059; + constexpr double kSkew2 = 1.0 / ( kRoot3 + 1.0 ); + constexpr double kUnskew2 = -1.0 / ( kRoot3 + 3.0 ); + constexpr double kFalloffRadiusSquared = 2.0 / 3.0; + + float32v skewDelta = float32v( kSkew2 ) * ( x + y ); + float32v xSkewed = x + skewDelta; + float32v ySkewed = y + skewDelta; + float32v xSkewedBase = FS::Floor( xSkewed ); + float32v ySkewedBase = FS::Floor( ySkewed ); + float32v dxSkewed = xSkewed - xSkewedBase; + float32v dySkewed = ySkewed - ySkewedBase; + int32v xPrimedBase = FS::Convert( xSkewedBase ) * int32v( Primes::X ); + int32v yPrimedBase = FS::Convert( ySkewedBase ) * int32v( Primes::Y ); + + mask32v forwardXY = dxSkewed + dySkewed > float32v( 1.0f ); + float32v boundaryXY = FS::Masked( forwardXY, float32v( -1.0f ) ); + mask32v forwardX = FS::FMulAdd( dxSkewed, float32v( -2.0f ), dySkewed ) < boundaryXY; + mask32v forwardY = FS::FMulAdd( dySkewed, float32v( 
-2.0f ), dxSkewed ) < boundaryXY; - float32v g = float32v( G2 ) * (x0 + y0); - x0 = x - (x0 - g); - y0 = y - (y0 - g); + float32v unskewDelta = float32v( kUnskew2 ) * ( dxSkewed + dySkewed ); + float32v dxBase = dxSkewed + unskewDelta; + float32v dyBase = dySkewed + unskewDelta; - mask32v i1 = x0 > y0; - //mask32v j1 = ~i1; //InvMasked funcs + float32v falloffBase0, value; - float32v x1 = FS::MaskedSub( i1, x0, float32v( 1.f ) ) + float32v( G2 ); - float32v y1 = FS::InvMaskedSub( i1, y0, float32v( 1.f ) ) + float32v( G2 ); - float32v x2 = x0 + float32v( (G2 * 2) - 1 ); - float32v y2 = y0 + float32v( (G2 * 2) - 1 ); + // Vertex <0, 0> + { + int32v hash = HashPrimes( seed, xPrimedBase, yPrimedBase ); + float32v gradientRampValue = GetGradientDotSimplex( hash, dxBase, dyBase ); + falloffBase0 = FS::FNMulAdd( dxBase, dxBase, FS::FNMulAdd( dyBase, dyBase, float32v( kFalloffRadiusSquared ) ) ); + float32v falloff = falloffBase0; falloff *= falloff; falloff *= falloff; + value = falloff * gradientRampValue; + } - float32v t0 = float32v( 0.5f ) - (x0 * x0) - (y0 * y0); - float32v t1 = float32v( 0.5f ) - (x1 * x1) - (y1 * y1); - float32v t2 = float32v( 0.5f ) - (x2 * x2) - (y2 * y2); + // Vertex <1, 1> + { + int32v hash = HashPrimes( seed, xPrimedBase + int32v( Primes::X ), yPrimedBase + int32v( Primes::Y ) ); + float32v gradientRampValue = GetGradientDotSimplex( hash, dxBase - float32v( 2 * kUnskew2 + 1 ), dyBase - float32v( 2 * kUnskew2 + 1 ) ); + float32v falloff = FS::FMulAdd( unskewDelta, + float32v( -4.0 * ( kRoot3 + 2.0 ) / ( kRoot3 + 3.0 ) ), + falloffBase0 - float32v( kFalloffRadiusSquared ) ); + falloff *= falloff; falloff *= falloff; + value = FS::FMulAdd( falloff, gradientRampValue, value ); + } - t0 = FS::Max( t0, float32v( 0 ) ); - t1 = FS::Max( t1, float32v( 0 ) ); - t2 = FS::Max( t2, float32v( 0 ) ); + float32v xyDelta = FS::Select( forwardXY, float32v( kUnskew2 + 1 ), float32v( -kUnskew2 ) ); + dxBase -= xyDelta; + dyBase -= xyDelta; - t0 *= t0; t0 *= t0; - 
t1 *= t1; t1 *= t1; - t2 *= t2; t2 *= t2; + // Vertex <1, 0> or <-1, 0> or <1, 2> + { + int32v hash = HashPrimes( seed, + FS::InvMaskedSub( forwardXY, FS::MaskedAdd( forwardX, xPrimedBase, int32v( Primes::X * 2 ) ), int32v( Primes::X ) ), + FS::MaskedAdd( forwardXY, yPrimedBase, int32v( Primes::Y ) ) ); + float32v dx = dxBase - FS::Select( forwardX, float32v( 1 + 2 * kUnskew2 ), float32v( -1 ) ); + float32v dy = FS::MaskedSub( forwardX, dyBase, float32v( 2 * kUnskew2 ) ); + float32v gradientRampValue = GetGradientDotSimplex( hash, dx, dy ); + float32v falloff = FS::Max( FS::FNMulAdd( dx, dx, FS::FNMulAdd( dy, dy, float32v( kFalloffRadiusSquared ) ) ), float32v( 0 ) ); + falloff *= falloff; falloff *= falloff; + value = FS::FMulAdd( falloff, gradientRampValue, value ); + } - float32v n0 = GetGradientDotFancy( HashPrimes( seed, i, j ), x0, y0 ); - float32v n1 = GetGradientDotFancy( HashPrimes( seed, FS::MaskedAdd( i1, i, int32v( Primes::X ) ), FS::InvMaskedAdd( i1, j, int32v( Primes::Y ) ) ), x1, y1 ); - float32v n2 = GetGradientDotFancy( HashPrimes( seed, i + int32v( Primes::X ), j + int32v( Primes::Y ) ), x2, y2 ); + // Vertex <0, 1> or <0, -1> or <2, 1> + { + int32v hash = HashPrimes( seed, + FS::MaskedAdd( forwardXY, xPrimedBase, int32v( Primes::X ) ), + FS::InvMaskedSub( forwardXY, FS::MaskedAdd( forwardY, yPrimedBase, int32v( (int32_t)( Primes::Y * 2LL ) ) ), int32v( Primes::Y ) ) ); + float32v dx = FS::MaskedSub( forwardY, dxBase, float32v( 2 * kUnskew2 ) ); + float32v dy = dyBase - FS::Select( forwardY, float32v( 1 + 2 * kUnskew2 ), float32v( -1 ) ); + float32v gradientRampValue = GetGradientDotSimplex( hash, dx, dy ); + float32v falloff = FS::Max( FS::FNMulAdd( dx, dx, FS::FNMulAdd( dy, dy, float32v( kFalloffRadiusSquared ) ) ), float32v( 0 ) ); + falloff *= falloff; falloff *= falloff; + value = FS::FMulAdd( falloff, gradientRampValue, value ); + } - constexpr float kBounding = 49.918426513671875f; + constexpr double kBounding = 9.28993664146183; - return 
this->ScaleOutput( FS::FMulAdd( n0, t0, FS::FMulAdd( n1, t1, n2 * t2 ) ), - -1 / kBounding, 1 / kBounding ); + return this->ScaleOutput( value, -1 / kBounding, 1 / kBounding ); } - float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const + float32v FS_VECTORCALL Gen_Smooth( int32v seed, float32v x, float32v y, float32v z ) const { this->ScalePositions( x, y, z ); - float32v f = float32v( 2.0f / 3.0f ) * (x + y + z); - float32v xr = f - x; - float32v yr = f - y; - float32v zr = f - z; + constexpr double kSkew3 = 1.0 / 3.0; + constexpr double kReflectUnskew3 = -1.0 / 2.0; + constexpr double kTwiceUnskew3 = -1.0 / 4.0; + + constexpr double kDistanceSquaredA = 3.0 / 4.0; + constexpr double kDistanceSquaredB = 1.0; + constexpr double kFalloffRadiusSquared = kDistanceSquaredA; + + float32v skewDelta = float32v( kSkew3 ) * ( x + y + z ); - float32v val( 0 ); - for( size_t i = 0; ; i++ ) + float32v xSkewed = x + skewDelta; + float32v ySkewed = y + skewDelta; + float32v zSkewed = z + skewDelta; + float32v xSkewedBase = FS::Floor( xSkewed ); + float32v ySkewedBase = FS::Floor( ySkewed ); + float32v zSkewedBase = FS::Floor( zSkewed ); + float32v dxSkewed = xSkewed - xSkewedBase; + float32v dySkewed = ySkewed - ySkewedBase; + float32v dzSkewed = zSkewed - zSkewedBase; + + // From unit cell base, find closest vertex { - float32v v0xr = FS::Round( xr ); - float32v v0yr = FS::Round( yr ); - float32v v0zr = FS::Round( zr ); - float32v d0xr = xr - v0xr; - float32v d0yr = yr - v0yr; - float32v d0zr = zr - v0zr; - - float32v score0xr = FS::Abs( d0xr ); - float32v score0yr = FS::Abs( d0yr ); - float32v score0zr = FS::Abs( d0zr ); - mask32v dir0xr = FS::Max( score0yr, score0zr ) <= score0xr; - mask32v dir0yr = FS::BitwiseAndNot( FS::Max( score0zr, score0xr ) <= score0yr, dir0xr ); - mask32v dir0zr = ~(dir0xr | dir0yr); - float32v v1xr = FS::MaskedAdd( dir0xr, v0xr, float32v( 1.0f ) | ( float32v( -1.0f ) & d0xr ) ); - float32v v1yr = FS::MaskedAdd( dir0yr, 
v0yr, float32v( 1.0f ) | ( float32v( -1.0f ) & d0yr ) ); - float32v v1zr = FS::MaskedAdd( dir0zr, v0zr, float32v( 1.0f ) | ( float32v( -1.0f ) & d0zr ) ); - float32v d1xr = xr - v1xr; - float32v d1yr = yr - v1yr; - float32v d1zr = zr - v1zr; - - int32v hv0xr = FS::Convert( v0xr ) * int32v( Primes::X ); - int32v hv0yr = FS::Convert( v0yr ) * int32v( Primes::Y ); - int32v hv0zr = FS::Convert( v0zr ) * int32v( Primes::Z ); - - int32v hv1xr = FS::Convert( v1xr ) * int32v( Primes::X ); - int32v hv1yr = FS::Convert( v1yr ) * int32v( Primes::Y ); - int32v hv1zr = FS::Convert( v1zr ) * int32v( Primes::Z ); - - float32v t0 = FS::FNMulAdd( d0zr, d0zr, FS::FNMulAdd( d0yr, d0yr, FS::FNMulAdd( d0xr, d0xr, float32v( 0.6f ) ) ) ); - float32v t1 = FS::FNMulAdd( d1zr, d1zr, FS::FNMulAdd( d1yr, d1yr, FS::FNMulAdd( d1xr, d1xr, float32v( 0.6f ) ) ) ); - t0 = FS::Max( t0, float32v( 0 ) ); - t1 = FS::Max( t1, float32v( 0 ) ); - t0 *= t0; t0 *= t0; - t1 *= t1; t1 *= t1; - - float32v v0 = GetGradientDot( HashPrimes( seed, hv0xr, hv0yr, hv0zr ), d0xr, d0yr, d0zr ); - float32v v1 = GetGradientDot( HashPrimes( seed, hv1xr, hv1yr, hv1zr ), d1xr, d1yr, d1zr ); - - val = FS::FMulAdd( v0, t0, FS::FMulAdd( v1, t1, val ) ); - - if( i == 1 ) + // Perform a double unskew to get the vector whose dot product with skewed vectors produces the unskewed result. + float32v twiceUnskewDelta = float32v( kTwiceUnskew3 ) * ( dxSkewed + dySkewed + dzSkewed ); + float32v xNormal = dxSkewed + twiceUnskewDelta; + float32v yNormal = dySkewed + twiceUnskewDelta; + float32v zNormal = dzSkewed + twiceUnskewDelta; + float32v xyzNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal + + // Using those, compare scores to determine which vertex is closest. 
+ constexpr auto considerVertex = [] ( float32v& maxScore, int32v& moveMaskBits, float32v score, int32v bits ) constexpr { - break; - } + moveMaskBits = FS::Select( score > maxScore, bits, moveMaskBits ); + maxScore = FS::Max( maxScore, score ); + }; + float32v maxScore = float32v( 0.375f ); + int32v moveMaskBits = FS::Masked( xyzNormal > maxScore, int32v( -1 ) ); + maxScore = FS::Max( maxScore, xyzNormal ); + considerVertex( maxScore, moveMaskBits, xNormal, 0b001 ); + considerVertex( maxScore, moveMaskBits, yNormal, 0b010 ); + considerVertex( maxScore, moveMaskBits, zNormal, 0b100 ); + maxScore += float32v( 0.125f ) - xyzNormal; + considerVertex( maxScore, moveMaskBits, -zNormal, 0b011 ); + considerVertex( maxScore, moveMaskBits, -yNormal, 0b101 ); + considerVertex( maxScore, moveMaskBits, -xNormal, 0b110 ); + + mask32v moveX = ( moveMaskBits & int32v( 0b001 ) ) != int32v( 0 ); + mask32v moveY = ( moveMaskBits & int32v( 0b010 ) ) != int32v( 0 ); + mask32v moveZ = ( moveMaskBits & int32v( 0b100 ) ) != int32v( 0 ); + + xSkewedBase = FS::MaskedIncrement( moveX, xSkewedBase ); + ySkewedBase = FS::MaskedIncrement( moveY, ySkewedBase ); + zSkewedBase = FS::MaskedIncrement( moveZ, zSkewedBase ); + + dxSkewed = FS::MaskedDecrement( moveX, dxSkewed ); + dySkewed = FS::MaskedDecrement( moveY, dySkewed ); + dzSkewed = FS::MaskedDecrement( moveZ, dzSkewed ); + } + + int32v xPrimedBase = FS::Convert( xSkewedBase ) * int32v( Primes::X ); + int32v yPrimedBase = FS::Convert( ySkewedBase ) * int32v( Primes::Y ); + int32v zPrimedBase = FS::Convert( zSkewedBase ) * int32v( Primes::Z ); + + float32v skewedCoordinateSum = dxSkewed + dySkewed + dzSkewed; + float32v twiceUnskewDelta = float32v( kTwiceUnskew3 ) * skewedCoordinateSum; + float32v xNormal = dxSkewed + twiceUnskewDelta; + float32v yNormal = dySkewed + twiceUnskewDelta; + float32v zNormal = dzSkewed + twiceUnskewDelta; + float32v xyzNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal + + float32v unskewDelta = 
float32v( kReflectUnskew3 ) * skewedCoordinateSum; + float32v dxBase = dxSkewed + unskewDelta; + float32v dyBase = dySkewed + unskewDelta; + float32v dzBase = dzSkewed + unskewDelta; + + float32v coordinateSum = float32v( 1 + 3 * kReflectUnskew3 ) * skewedCoordinateSum; // dxBase + dyBase + dzBase + + // Vertex <0, 0, 0> + float32v value, falloffBaseStemA, falloffBaseStemB; + { + float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase ), dxBase, dyBase, dzBase ); + float32v falloffBase = FS::FNMulAdd( dzBase, dzBase, FS::FNMulAdd( dyBase, dyBase, FS::FNMulAdd( dxBase, dxBase, float32v( kFalloffRadiusSquared ) ) ) ) * float32v( 0.5f ); + falloffBaseStemA = falloffBase - float32v( kDistanceSquaredA * 0.5 ); + falloffBaseStemB = falloffBase - float32v( kDistanceSquaredB * 0.5 ); + value = ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ) * gradientRampValue; + } + + // Vertex <1, 1, 1> or <-1, -1, -1> + { + mask32v signMask = xyzNormal < float32v( 0 ); - xr += float32v( 0.5f ); - yr += float32v( 0.5f ); - zr += float32v( 0.5f ); - seed = ~seed; + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset = float32v( 3 * kReflectUnskew3 + 1 ) ^ sign; + + float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimed, yPrimed, zPrimed ), dxBase - offset, dyBase - offset, dzBase - offset ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset, coordinateSum, falloffBaseStemA ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); } - constexpr float kBounding = 
32.69428253173828125f; + // Vertex <1, 1, 0> or <-1, -1, 0> + { + mask32v signMask = xyzNormal < zNormal; - return this->ScaleOutput( val, -1 / kBounding, 1 / kBounding ); - } -}; + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); -template -class FastSIMD::DispatchClass final : public virtual FastNoise::OpenSimplex2S, public FastSIMD::DispatchClass, SIMD> -{ - float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y ) const - { - this->ScalePositions( x, y ); + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset0 = float32v( 2 * kReflectUnskew3 ) ^ sign; - const float SQRT3 = 1.7320508075688772935274463415059f; - const float F2 = 0.5f * ( SQRT3 - 1.0f ); - const float G2 = ( SQRT3 - 3.0f ) / 6.0f; - - float32v s = float32v( F2 ) * ( x + y ); - float32v xs = x + s; - float32v ys = y + s; - float32v xsb = FS::Floor( xs ); - float32v ysb = FS::Floor( ys ); - float32v xsi = xs - xsb; - float32v ysi = ys - ysb; - int32v xsbp = FS::Convert( xsb ) * int32v( Primes::X ); - int32v ysbp = FS::Convert( ysb ) * int32v( Primes::Y ); - - mask32v forwardXY = xsi + ysi > float32v( 1.0f ); - float32v boundaryXY = FS::Masked( forwardXY, float32v( -1.0f ) ); - mask32v forwardX = FS::FMulAdd( xsi, float32v( -2.0f ), ysi ) < boundaryXY; - mask32v forwardY = FS::FMulAdd( ysi, float32v( -2.0f ), xsi ) < boundaryXY; - - float32v t = float32v( G2 ) * ( xsi + ysi ); - float32v xi = xsi + t; - float32v yi = ysi + t; - - int32v h0 = HashPrimes( seed, xsbp, ysbp ); - float32v v0 = GetGradientDotFancy( h0, xi, yi ); - float32v a = FS::FNMulAdd( xi, xi, FS::FNMulAdd( yi, yi, float32v( 2.0f / 3.0f ) ) ); - float32v a0 = a; a0 *= a0; a0 *= a0; - float32v value = a0 * v0; - - int32v h1 = HashPrimes( seed, xsbp + int32v( Primes::X ), ysbp + int32v( Primes::Y ) ); - float32v v1 = 
GetGradientDotFancy( h1, xi - float32v( 2 * G2 + 1 ), yi - float32v( 2 * G2 + 1 ) ); - float32v a1 = FS::FMulAdd( float32v( 2 * ( 1 + 2 * G2 ) * ( 1 / G2 + 2 ) ), t, a + float32v( -2 * ( 1 + 2 * G2 ) * ( 1 + 2 * G2 ) ) ); - a1 *= a1; a1 *= a1; - value = FS::FMulAdd( a1, v1, value ); - - float32v xyDelta = FS::Select( forwardXY, float32v( G2 + 1 ), float32v( -G2 ) ); - xi -= xyDelta; - yi -= xyDelta; - - int32v h2 = HashPrimes( seed, - FS::InvMaskedSub( forwardXY, FS::MaskedAdd( forwardX, xsbp, int32v( Primes::X * 2 ) ), int32v( Primes::X ) ), - FS::MaskedAdd( forwardXY, ysbp, int32v( Primes::Y ) ) ); - float32v xi2 = xi - FS::Select( forwardX, float32v( 1 + 2 * G2 ), float32v( -1 ) ); - float32v yi2 = FS::MaskedSub( forwardX, yi, float32v( 2 * G2 ) ); - float32v v2 = GetGradientDotFancy( h2, xi2, yi2 ); - float32v a2 = FS::Max( FS::FNMulAdd( xi2, xi2, FS::FNMulAdd( yi2, yi2, float32v( 2.0f / 3.0f ) ) ), float32v( 0 ) ); - a2 *= a2; a2 *= a2; - value = FS::FMulAdd( a2, v2, value ); - - int32v h3 = HashPrimes( seed, - FS::MaskedAdd( forwardXY, xsbp, int32v( Primes::X ) ), - FS::InvMaskedSub( forwardXY, FS::MaskedAdd( forwardY, ysbp, int32v( (int32_t)( Primes::Y * 2LL ) ) ), int32v( Primes::Y ) ) ); - float32v xi3 = FS::MaskedSub( forwardY, xi, float32v( 2 * G2 ) ); - float32v yi3 = yi - FS::Select( forwardY, float32v( 1 + 2 * G2 ), float32v( -1 ) ); - float32v v3 = GetGradientDotFancy( h3, xi3, yi3 ); - float32v a3 = FS::Max( FS::FNMulAdd( xi3, xi3, FS::FNMulAdd( yi3, yi3, float32v( 2.0f / 3.0f ) ) ), float32v( 0 ) ); - a3 *= a3; a3 *= a3; - value = FS::FMulAdd( a3, v3, value ); - - constexpr float kBounding = 9.28993664146183f; + float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimed, yPrimed, zPrimedBase ), dxBase, dyBase, dzBase - offset0 ); + float32v falloffBase = FS::Min( ( sign ^ dzBase ) - falloffBaseStemB, float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value 
); + } + + // Vertex <1, 0, 1> or <-1, 0, -1> + { + mask32v signMask = xyzNormal < yNormal; + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset0 = float32v( 2 * kReflectUnskew3 ) ^ sign; + + float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimed, yPrimedBase, zPrimed ), dxBase, dyBase - offset0, dzBase ); + float32v falloffBase = FS::Min( ( sign ^ dyBase ) - falloffBaseStemB, float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <0, 1, 1> or <0, -1, -1> + { + mask32v signMask = xyzNormal < xNormal; + + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset0 = float32v( 2 * kReflectUnskew3 ) ^ sign; + + float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimedBase, yPrimed, zPrimed ), dxBase - offset0, dyBase, dzBase ); + float32v falloffBase = FS::Min( ( sign ^ dxBase ) - falloffBaseStemB, float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <1, 0, 0> or <-1, 0, 0> + { + mask32v signMask = xNormal < float32v( 0 ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset0 = float32v( kReflectUnskew3 ) ^ sign; // offset1 = -offset0 because kReflectUnskew3 + 1 = -kReflectUnskew3 + + float32v 
gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimed, yPrimedBase, zPrimedBase ), dxBase + offset0, dyBase - offset0, dzBase - offset0 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dxBase ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <0, 1, 0> or <0, -1, 0> + { + mask32v signMask = yNormal < float32v( 0 ); + + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset0 = float32v( kReflectUnskew3 ) ^ sign; // offset1 = -offset0 because kReflectUnskew3 + 1 = -kReflectUnskew3 + + float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimedBase, yPrimed, zPrimedBase ), dxBase - offset0, dyBase + offset0, dzBase - offset0 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dyBase ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <0, 0, 1> or <0, 0, -1> + { + mask32v signMask = zNormal < float32v( 0 ); + + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + float32v offset0 = float32v( kReflectUnskew3 ) ^ sign; // offset1 = -offset0 because kReflectUnskew3 + 1 = -kReflectUnskew3 + + float32v gradientRampValue = GetGradientDotCommon( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimed ), dxBase - offset0, dyBase - offset0, dzBase + offset0 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dzBase ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * 
falloffBase ), gradientRampValue, value ); + } + + constexpr double kBounding = 144.736422163332608; return this->ScaleOutput( value, -1 / kBounding, 1 / kBounding ); } - float32v FS_VECTORCALL Gen( int32v seed, float32v x, float32v y, float32v z ) const + float32v FS_VECTORCALL Gen_Smooth( int32v seed, float32v x, float32v y, float32v z, float32v w ) const { - this->ScalePositions( x, y, z ); - - float32v f = float32v( 2.0f / 3.0f ) * ( x + y + z ); - float32v xr = f - x; - float32v yr = f - y; - float32v zr = f - z; - - float32v xrb = FS::Floor( xr ); - float32v yrb = FS::Floor( yr ); - float32v zrb = FS::Floor( zr ); - float32v xri = xr - xrb; - float32v yri = yr - yrb; - float32v zri = zr - zrb; - int32v xrbp = FS::Convert( xrb ) * int32v( Primes::X ); - int32v yrbp = FS::Convert( yrb ) * int32v( Primes::Y ); - int32v zrbp = FS::Convert( zrb ) * int32v( Primes::Z ); - - float32v value( 0 ); - for( size_t i = 0; ; i++ ) + this->ScalePositions( x, y, z, w ); + + constexpr double kRoot5 = 2.2360679774997896964091736687313; + constexpr double kSkew4 = 1.0 / ( kRoot5 + 1.0 ); + constexpr double kUnskew4 = -1.0 / ( kRoot5 + 5.0 ); + constexpr double kTwiceUnskew4 = -1.0 / 5.0; + + constexpr double kDistanceSquaredA = 4.0 / 5.0; + constexpr double kDistanceSquaredB = 6.0 / 5.0; + constexpr double kFalloffRadiusSquared = kDistanceSquaredA; + + float32v skewDelta = float32v( kSkew4 ) * ( x + y + z + w ); + + float32v xSkewed = x + skewDelta; + float32v ySkewed = y + skewDelta; + float32v zSkewed = z + skewDelta; + float32v wSkewed = w + skewDelta; + float32v xSkewedBase = FS::Floor( xSkewed ); + float32v ySkewedBase = FS::Floor( ySkewed ); + float32v zSkewedBase = FS::Floor( zSkewed ); + float32v wSkewedBase = FS::Floor( wSkewed ); + float32v dxSkewed = xSkewed - xSkewedBase; + float32v dySkewed = ySkewed - ySkewedBase; + float32v dzSkewed = zSkewed - zSkewedBase; + float32v dwSkewed = wSkewed - wSkewedBase; + + // From unit cell base, find closest vertex { - float32v a 
= FS::FNMulAdd( xri, xri, FS::FNMulAdd( yri, yri, FS::FNMulAdd( zri, zri, float32v( 0.75f ) ) ) ) * float32v( 0.5f ); - - float32v p0 = zri + yri + xri - float32v( 1.5f ); - mask32v flip0 = p0 >= float32v( 0.0f ); - float32v a0 = FS::Max( FS::MaskedAdd( flip0, a, p0 ), float32v( 0 ) ); - a0 *= a0; a0 *= a0; - int32v h0 = HashPrimes( seed, FS::MaskedAdd( flip0, xrbp, int32v( Primes::X ) ), FS::MaskedAdd( flip0, yrbp, int32v( Primes::Y )), FS::MaskedAdd( flip0, zrbp, int32v( Primes::Z ))); - float32v v0 = GetGradientDot( h0, FS::MaskedDecrement( flip0, xri ), FS::MaskedDecrement( flip0, yri ), FS::MaskedDecrement( flip0, zri ) ); - value = FS::FMulAdd( a0, v0, value ); - a -= float32v( 0.5f ); - - float32v p1 = zri + yri - xri + float32v( -0.5f ); - mask32v flip1 = p1 >= float32v( 0.0f ); - float32v a1 = FS::Max( FS::MaskedAdd( flip1, a + xri, p1 ), float32v( 0 ) ); - a1 *= a1; a1 *= a1; - int32v h1 = HashPrimes( seed, FS::InvMaskedAdd( flip1, xrbp, int32v( Primes::X )), FS::MaskedAdd( flip1, yrbp, int32v( Primes::Y ) ), FS::MaskedAdd( flip1, zrbp, int32v( Primes::Z ))); - float32v v1 = GetGradientDot( h1, FS::InvMaskedSub( flip1, xri, float32v( 1.0f ) ), FS::MaskedDecrement( flip1, yri ), FS::MaskedDecrement( flip1, zri ) ); - value = FS::FMulAdd( a1, v1, value ); - - float32v p2 = xri + float32v( -0.5f ) + ( zri - yri ); - mask32v flip2 = p2 >= float32v( 0.0f ); - float32v a2 = FS::Max( FS::MaskedAdd( flip2, a + yri, p2 ), float32v( 0 ) ); - a2 *= a2; a2 *= a2; - int32v h2 = HashPrimes( seed, FS::MaskedAdd( flip2, xrbp, int32v( Primes::X )), FS::InvMaskedAdd( flip2, yrbp, int32v( Primes::Y )), FS::MaskedAdd( flip2, zrbp, int32v( Primes::Z ))); - float32v v2 = GetGradientDot( h2, FS::MaskedDecrement( flip2, xri ), FS::InvMaskedSub( flip2, yri, float32v( 1.0f ) ), FS::MaskedDecrement( flip2, zri ) ); - value = FS::FMulAdd( a2, v2, value ); - - float32v p3 = xri + float32v( -0.5f ) - ( zri - yri ); - mask32v flip3 = p3 >= float32v( 0.0f ); - float32v a3 = FS::Max( 
FS::MaskedAdd( flip3, a + zri, p3 ), float32v( 0 ) ); - a3 *= a3; a3 *= a3; - int32v h3 = HashPrimes( seed, FS::MaskedAdd( flip3, xrbp, int32v( Primes::X )), FS::MaskedAdd( flip3, yrbp, int32v( Primes::Y )), FS::InvMaskedAdd( flip3, zrbp, int32v( Primes::Z ))); - float32v v3 = GetGradientDot( h3, FS::MaskedDecrement( flip3, xri ), FS::MaskedDecrement( flip3, yri ), FS::InvMaskedSub( flip3, zri, float32v( 1.0f ) ) ); - value = FS::FMulAdd( a3, v3, value ); - - if( i == 1 ) + // Perform a double unskew to get the vector whose dot product with skewed vectors produces the unskewed result. + float32v twiceUnskewDelta = float32v( kTwiceUnskew4 ) * ( dxSkewed + dySkewed + dzSkewed + dwSkewed ); + float32v xNormal = dxSkewed + twiceUnskewDelta; + float32v yNormal = dySkewed + twiceUnskewDelta; + float32v zNormal = dzSkewed + twiceUnskewDelta; + float32v wNormal = dwSkewed + twiceUnskewDelta; + float32v xyzwNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal + wNormal + + // Using those, compare scores to determine which vertex is closest. 
+ constexpr auto considerVertex = [] ( float32v& maxScore, int32v& moveMaskBits, float32v score, int32v bits ) constexpr { - break; - } + moveMaskBits = FS::Select( score > maxScore, bits, moveMaskBits ); + maxScore = FS::Max( maxScore, score ); + }; + float32v maxScore = float32v( 0.6f ) - xyzwNormal; + int32v moveMaskBits = FS::Masked( float32v( 0.2f ) > maxScore, int32v( -1 ) ); + maxScore = FS::Max( maxScore, float32v( 0.2f ) ); + considerVertex( maxScore, moveMaskBits, -wNormal, 0b0111 ); + considerVertex( maxScore, moveMaskBits, -zNormal, 0b1011 ); + considerVertex( maxScore, moveMaskBits, -yNormal, 0b1101 ); + considerVertex( maxScore, moveMaskBits, -xNormal, 0b1110 ); + maxScore += xyzwNormal - float32v( 0.2f ); + considerVertex( maxScore, moveMaskBits, xNormal, 0b0001 ); + considerVertex( maxScore, moveMaskBits, yNormal, 0b0010 ); + considerVertex( maxScore, moveMaskBits, zNormal, 0b0100 ); + considerVertex( maxScore, moveMaskBits, wNormal, 0b1000 ); + maxScore += float32v( 0.2f ) - xNormal; + considerVertex( maxScore, moveMaskBits, yNormal, 0b0011 ); + considerVertex( maxScore, moveMaskBits, zNormal, 0b0101 ); + considerVertex( maxScore, moveMaskBits, wNormal, 0b1001 ); + maxScore += xNormal; + considerVertex( maxScore, moveMaskBits, yNormal + zNormal, 0b0110 ); + maxScore -= wNormal; + considerVertex( maxScore, moveMaskBits, yNormal, 0b1010 ); + considerVertex( maxScore, moveMaskBits, zNormal, 0b1100 ); + + mask32v moveX = ( moveMaskBits & int32v( 0b0001 ) ) != int32v( 0 ); + mask32v moveY = ( moveMaskBits & int32v( 0b0010 ) ) != int32v( 0 ); + mask32v moveZ = ( moveMaskBits & int32v( 0b0100 ) ) != int32v( 0 ); + mask32v moveW = ( moveMaskBits & int32v( 0b1000 ) ) != int32v( 0 ); + + xSkewedBase = FS::MaskedIncrement( moveX, xSkewedBase ); + ySkewedBase = FS::MaskedIncrement( moveY, ySkewedBase ); + zSkewedBase = FS::MaskedIncrement( moveZ, zSkewedBase ); + wSkewedBase = FS::MaskedIncrement( moveW, wSkewedBase ); + + dxSkewed = FS::MaskedDecrement( 
moveX, dxSkewed ); + dySkewed = FS::MaskedDecrement( moveY, dySkewed ); + dzSkewed = FS::MaskedDecrement( moveZ, dzSkewed ); + dwSkewed = FS::MaskedDecrement( moveW, dwSkewed ); + } + + int32v xPrimedBase = FS::Convert( xSkewedBase ) * int32v( Primes::X ); + int32v yPrimedBase = FS::Convert( ySkewedBase ) * int32v( Primes::Y ); + int32v zPrimedBase = FS::Convert( zSkewedBase ) * int32v( Primes::Z ); + int32v wPrimedBase = FS::Convert( wSkewedBase ) * int32v( Primes::W ); + + float32v skewedCoordinateSum = dxSkewed + dySkewed + dzSkewed + dwSkewed; + float32v twiceUnskewDelta = float32v( kTwiceUnskew4 ) * skewedCoordinateSum; + float32v xNormal = dxSkewed + twiceUnskewDelta; + float32v yNormal = dySkewed + twiceUnskewDelta; + float32v zNormal = dzSkewed + twiceUnskewDelta; + float32v wNormal = dwSkewed + twiceUnskewDelta; + float32v xyzwNormal = -twiceUnskewDelta; // xNormal + yNormal + zNormal + wNormal + + float32v unskewDelta = float32v( kUnskew4 ) * skewedCoordinateSum; + float32v dxBase = dxSkewed + unskewDelta; + float32v dyBase = dySkewed + unskewDelta; + float32v dzBase = dzSkewed + unskewDelta; + float32v dwBase = dwSkewed + unskewDelta; + + float32v coordinateSum = float32v( 1 + 4 * kUnskew4 ) * skewedCoordinateSum; // dxBase + dyBase + dzBase + dwBase + + // Vertex <0, 0, 0, 0> + float32v value, falloffBaseStemA, falloffBaseStemB; + { + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase, wPrimedBase ), dxBase, dyBase, dzBase, dwBase ); + float32v falloffBase = FS::FNMulAdd( dwBase, dwBase, FS::FNMulAdd( dzBase, dzBase, FS::FNMulAdd( dyBase, dyBase, FS::FNMulAdd( dxBase, dxBase, float32v( kFalloffRadiusSquared ) ) ) ) ) * float32v( 0.5f ); + falloffBaseStemA = falloffBase - float32v( kDistanceSquaredA * 0.5 ); + falloffBaseStemB = falloffBase - float32v( kDistanceSquaredB * 0.5 ); + value = ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ) * gradientRampValue; + } + + // Vertex <1, 1, 1, 
1> or <-1, -1, -1, -1> + { + mask32v signMask = xyzwNormal < float32v( 0 ); + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset = float32v( 4 * kUnskew4 + 1 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimed, yPrimed, zPrimed, wPrimed ), dxBase - offset, dyBase - offset, dzBase - offset, dwBase - offset ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset, coordinateSum, falloffBaseStemA ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <1, 1, 1, 0> or <-1, -1, -1, 0> + { + mask32v signMask = xyzwNormal < wNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimed, yPrimed, zPrimed, wPrimedBase ), dxBase - offset1, dyBase - offset1, dzBase - offset1, dwBase - offset0 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dwBase ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) 
* ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <1, 1, 0, 1> or <-1, -1, 0, -1> + { + mask32v signMask = xyzwNormal < zNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimed, yPrimed, zPrimedBase, wPrimed ), dxBase - offset1, dyBase - offset1, dzBase - offset0, dwBase - offset1 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dzBase ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <1, 0, 1, 1> or <-1, 0, -1, -1> + { + mask32v signMask = xyzwNormal < yNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimed, yPrimedBase, zPrimed, wPrimed ), dxBase - offset1, dyBase - offset0, dzBase - offset1, dwBase - offset1 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dyBase ), 
float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <0, 1, 1, 1> or <0, -1, -1, -1> + { + mask32v signMask = xyzwNormal < xNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( 3 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 3 * kUnskew4 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimedBase, yPrimed, zPrimed, wPrimed ), dxBase - offset0, dyBase - offset1, dzBase - offset1, dwBase - offset1 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset1, coordinateSum, falloffBaseStemB ) - ( sign ^ dxBase ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <1, 0, 0, 0> or <-1, 0, 0, 0> + { + mask32v signMask = xNormal < float32v( 0 ); + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + + float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( kUnskew4 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimed, yPrimedBase, zPrimedBase, wPrimedBase ), dxBase - offset1, dyBase - offset0, dzBase - offset0, dwBase - offset0 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dxBase ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value 
); + } + + // Vertex <1, 1, 0, 0> or <-1, -1, 0, 0> + { + mask32v signMask = xNormal < -yNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + + float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimed, yPrimed, zPrimedBase, wPrimedBase ), dxBase - offset1, dyBase - offset1, dzBase - offset0, dwBase - offset0 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dxBase + dyBase ) ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <1, 0, 1, 0> or <-1, 0, -1, 0> + { + mask32v signMask = xNormal < -zNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimed, yPrimedBase, zPrimed, wPrimedBase ), dxBase - offset1, dyBase - offset0, dzBase - offset1, dwBase - offset0 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dxBase + dzBase ) ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <1, 0, 0, 1> or <-1, 0, 0, -1> + { + mask32v signMask = xNormal < -wNormal; + 
float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v xPrimed = xPrimedBase + FS::Select( signMask, int32v( -Primes::X ), int32v( Primes::X ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimed, yPrimedBase, zPrimedBase, wPrimed ), dxBase - offset1, dyBase - offset0, dzBase - offset0, dwBase - offset1 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dxBase + dwBase ) ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <0, 1, 0, 0> or <0, -1, 0, 0> + { + mask32v signMask = yNormal < float32v( 0 ); + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + + float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( kUnskew4 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimedBase, yPrimed, zPrimedBase, wPrimedBase ), dxBase - offset0, dyBase - offset1, dzBase - offset0, dwBase - offset0 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dyBase ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <0, 1, 1, 0> or <0, -1, -1, 0> + { + mask32v signMask = yNormal < -zNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v zPrimed = zPrimedBase + 
FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + + float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign; - mask32v sideX = xri >= float32v( 0.5f ); - mask32v sideY = yri >= float32v( 0.5f ); - mask32v sideZ = zri >= float32v( 0.5f ); + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimedBase, yPrimed, zPrimed, wPrimedBase ), dxBase - offset0, dyBase - offset1, dzBase - offset1, dwBase - offset0 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dyBase + dzBase ) ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <0, 1, 0, 1> or <0, -1, 0, -1> + { + mask32v signMask = yNormal < -wNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v yPrimed = yPrimedBase + FS::Select( signMask, int32v( -Primes::Y ), int32v( Primes::Y ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimedBase, yPrimed, zPrimedBase, wPrimed ), dxBase - offset0, dyBase - offset1, dzBase - offset0, dwBase - offset1 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dyBase + dwBase ) ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <0, 0, 1, 0> or <0, 0, -1, 0> + { + mask32v signMask = zNormal < float32v( 0 ); + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); - xrbp = FS::MaskedAdd( sideX, xrbp, int32v( Primes::X ) ); - yrbp = FS::MaskedAdd( sideY, yrbp, int32v( 
Primes::Y ) ); - zrbp = FS::MaskedAdd( sideZ, zrbp, int32v( Primes::Z ) ); + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); - xri += FS::Select( sideX, float32v( -0.5f ), float32v( 0.5f ) ); - yri += FS::Select( sideY, float32v( -0.5f ), float32v( 0.5f ) ); - zri += FS::Select( sideZ, float32v( -0.5f ), float32v( 0.5f ) ); + float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( kUnskew4 ) ^ sign; - seed = ~seed; + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimed, wPrimedBase ), dxBase - offset0, dyBase - offset0, dzBase - offset1, dwBase - offset0 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dzBase ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); } - - constexpr float kBounding = 144.736422163332608f; + + // Vertex <0, 0, 1, 1> or <0, 0, -1, -1> + { + mask32v signMask = zNormal < -wNormal; + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v zPrimed = zPrimedBase + FS::Select( signMask, int32v( -Primes::Z ), int32v( Primes::Z ) ); + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( 2 * kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( 2 * kUnskew4 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimed, wPrimed ), dxBase - offset0, dyBase - offset0, dzBase - offset1, dwBase - offset1 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemB ) + ( sign ^ ( dzBase + dwBase ) ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + // Vertex <0, 0, 0, 1> or <0, 0, 0, -1> + { + mask32v 
signMask = wNormal < float32v( 0 ); + float32v sign = FS::Masked( signMask, float32v( FS::Cast( int32v( 1 << 31 ) ) ) ); + + int32v wPrimed = wPrimedBase + FS::Select( signMask, int32v( -Primes::W ), int32v( Primes::W ) ); + + float32v offset1 = float32v( kUnskew4 + 1 ) ^ sign; + float32v offset0 = float32v( kUnskew4 ) ^ sign; + + float32v gradientRampValue = GetGradientDotSimplex( HashPrimes( seed, xPrimedBase, yPrimedBase, zPrimedBase, wPrimed ), dxBase - offset0, dyBase - offset0, dzBase - offset0, dwBase - offset1 ); + float32v falloffBase = FS::Max( FS::FMulAdd( offset0, coordinateSum, falloffBaseStemA ) + ( sign ^ dwBase ), float32v( 0.0f ) ); + value = FS::FMulAdd( ( falloffBase * falloffBase ) * ( falloffBase * falloffBase ), gradientRampValue, value ); + } + + constexpr double kBounding = 115.21625311930542; return this->ScaleOutput( value, -1 / kBounding, 1 / kBounding ); } }; - diff --git a/include/FastNoise/Generators/Utils.inl b/include/FastNoise/Generators/Utils.inl index e706e75..62acc44 100644 --- a/include/FastNoise/Generators/Utils.inl +++ b/include/FastNoise/Generators/Utils.inl @@ -13,210 +13,932 @@ namespace FastNoise static constexpr int Lookup[] = { X,Y,Z,W }; } + static constexpr double kRoot2 = 1.4142135623730950488016887242097; + static constexpr double kRoot3 = 1.7320508075688772935274463415059; + static constexpr double kRoot5 = 2.2360679774997896964091736687313; + static constexpr double kSkew2 = 1.0 / ( kRoot3 + 1.0 ); + static constexpr double kSkew4 = 1.0 / ( kRoot5 + 1.0 ); + static constexpr float kValueBounds = 2147483648.f; - static constexpr float kRoot2 = 1.4142135623730950488f; - static constexpr float kRoot3 = 1.7320508075688772935f; + static constexpr float kRoot2f = kRoot2; + static constexpr float kRoot3f = kRoot3; + static constexpr float kSkew2f = kSkew2; + static constexpr float kSkew4f = kSkew4; template - FS_FORCEINLINE static float32v GetGradientDotFancy( int32v hash, float32v fX, float32v fY ) + FS_FORCEINLINE 
static float32v GetGradientDotSimplex( int32v hash31, float32v fX, float32v fY ) { - int32v index = FS::Convert( FS::Convert( hash & int32v( 0x3FFFFF ) ) * float32v( 1.3333333333333333f ) ); + int32v index = FS::BitShiftRightZeroExtend( hash31, 1 ) * int32v( 12 >> 2 ); // [0,12) in the upper four bits if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) { - float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant( kRoot3, kRoot3, 2, 2, 1, -1, 0, 0, -kRoot3, -kRoot3, -2, -2, -1, 1, 0, 0 ) ); - float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant( 1, -1, 0, 0, kRoot3, kRoot3, 2, 2, -1, 1, 0, 0, -kRoot3, -kRoot3, -2, -2 ) ); + index >>= 28; + + float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant( kRoot3f, -kRoot3f, 1, -1, kRoot3f, -kRoot3f, -1, 1, 2, -2, 0, 0, 0, 0, 0, 0 ) ); + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant( 1, -1, kRoot3f, -kRoot3f, -1, 1, kRoot3f, -kRoot3f, 0, 0, 2, -2, 0, 0, 0, 0 ) ); return FS::FMulAdd( gX, fX, fY * gY ); } else if constexpr( SIMD & FastSIMD::FeatureFlag::AVX2 ) { - float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( kRoot3, kRoot3, 2, 2, 1, -1, 0, 0 ), index ); - float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( 1, -1, 0, 0, kRoot3, kRoot3, 2, 2 ), index ); + float32v finalSign = FS::Cast( ( index >> 28 ) << 31 ); + index >>= 29; + + float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( kRoot3f, 1, kRoot3f, -1, 2, 0, 0, 0 ), index ); + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( 1, kRoot3f, -1, kRoot3f, 0, 2, 0, 0 ), index ); - // Bit-8 = Flip sign of a + b - return FS::FMulAdd( gX, fX, fY * gY ) ^ FS::Cast( ( index >> 3 ) << 31 ); + return FS::FMulAdd( gX, fX, fY * gY ) ^ finalSign; } else { - // Bit-3 
= Choose X Y ordering - mask32v bit3; + float32v u = FS::SelectHighBit( index << 2, fY, fX ); + float32v v = FS::SelectHighBit( index << 2, fX, fY ); - if constexpr( SIMD & FastSIMD::FeatureFlag::SSE2 ) - { - if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) - { - bit3 = FS::Cast>( index << 29 ); - } - else - { - bit3 = FS::Cast>( ( index << 29 ) >> 31 ); - } - } - else - { - bit3 = ( index & int32v( 1 << 2 ) ) != int32v( 0 ); - } + float32v a = u * FS::SelectHighBit( index, float32v( 2 ), float32v( kRoot3f ) ); + float32v b = v ^ FS::Cast( ( index >> 30 ) << 31 ); - float32v a = FS::Select( bit3, fY, fX ); - float32v b = FS::Select( bit3, fX, fY ); + return FS::MaskedAdd( index >= int32v( 0 ), a, b ) ^ FS::Cast( ( index >> 28 ) << 31 ); + } + } - // Bit-1 = b flip sign - b ^= FS::Cast( index << 31 ); + template + FS_FORCEINLINE static float32v GetGradientDotSimplex( int32v hash31, float32v fX, float32v fY, float32v fZ, float32v fW ) + { + int32v hashShifted = FS::BitShiftRightZeroExtend( hash31, 2 ); + int32v index = hashShifted * int32v( 20 >> 2 ); // [0,20) in the upper five bits + + if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) + { + index = FS::BitShiftRightZeroExtend( index, 27 ); - // Bit-2 = Mul a by 2 or Root3 - mask32v bit2 = ( index & int32v( 2 ) ) == int32v( 0 ); + const auto tableX = FS::Constant( kSkew4f + 1, kSkew4f, kSkew4f, kSkew4f, -1, 1, 0, 0, -1, 0, 1, 0, -1, 0, 0, 1 ); + const auto tableY = FS::Constant( kSkew4f, kSkew4f + 1, kSkew4f, kSkew4f, 1, -1, 0, 0, 0, -1, 0, 1, 0, -1, 1, 0 ); + const auto tableZ = FS::Constant( kSkew4f, kSkew4f, kSkew4f + 1, kSkew4f, 0, 0, -1, 1, 1, 0, -1, 0, 0, 1, -1, 0 ); + const auto tableW = FS::Constant( kSkew4f, kSkew4f, kSkew4f, kSkew4f + 1, 0, 0, 1, -1, 0, 1, 0, -1, 1, 0, 0, -1 ); - a *= FS::Select( bit2, float32v( 2 ), float32v( kRoot3 ) ); - // b zero value if a mul 2 - float32v c = FS::MaskedAdd( bit2, a, b ); + float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableX, 
index, -tableX ); + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableY, index, -tableY ); + float32v gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableZ, index, -tableZ ); + float32v gW = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableW, index, -tableW ); - // Bit-4 = Flip sign of a + b - return c ^ FS::Cast( ( index >> 3 ) << 31 ); + return FS::FMulAdd( gW, fW, FS::FMulAdd( gZ, fZ, FS::FMulAdd( gY, fY, gX * fX ) ) ); + } + else + { + int32v indexA = index & int32v( 0x03 << 27 ); + int32v indexB = ( index >> 2 ) & int32v( 0x07 << 27 ); + indexB ^= indexA; // Simplifies the AVX512_F case. + + mask32v extra = indexB >= int32v( 0x04 << 27 ); + mask32v equal = ( indexA == indexB ); + indexA |= FS::Cast( equal ); // Forces decrement conditions to fail. + + float32v neutral = FS::Masked( equal | extra, FS::MaskedMul( extra, float32v( kSkew4f ), float32v( -1.0f ) ) ); + + float32v gX = FS::MaskedIncrement( indexB == int32v( 0 << 27 ), FS::MaskedDecrement( indexA == int32v( 0 << 27 ), neutral ) ); + float32v gY = FS::MaskedIncrement( indexB == int32v( 1 << 27 ), FS::MaskedDecrement( indexA == int32v( 1 << 27 ), neutral ) ); + float32v gZ = FS::MaskedIncrement( indexB == int32v( 2 << 27 ), FS::MaskedDecrement( indexA == int32v( 2 << 27 ), neutral ) ); + float32v gW = FS::MaskedIncrement( indexB == int32v( 3 << 27 ), FS::MaskedDecrement( indexA == int32v( 3 << 27 ), neutral ) ); + + return FS::FMulAdd( gW, fW, FS::FMulAdd( gZ, fZ, FS::FMulAdd( gY, fY, gX * fX ) ) ); } } template - FS_FORCEINLINE static float32v GetGradientDot( int32v hash, float32v fX, float32v fY ) + FS_FORCEINLINE static float32v GetGradientDotCommon( int32v hash31, float32v fX, float32v fY, float32v fZ ) + { + int32v hashShifted = FS::BitShiftRightZeroExtend( hash31, 1 ); + int32v index = FS::BitShiftRightZeroExtend( hashShifted * int32v( 12 >> 2 ), 28 ); // [0,12) + + if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) + { + 
float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant( 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ) ); + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant( 1, 1, -1, -1, 0, 0, 0, 0, 1, -1, 1, -1, 0, 0, 0, 0 ) ); + float32v gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant( 0, 0, 0, 0, 1, 1, -1, -1, 1, 1, -1, -1, 0, 0, 0, 0 ) ); + + return FS::FMulAdd( gZ, fZ, FS::FMulAdd( fY, gY, fX * gX ) ); + } + else + { + float32v sign0 = FS::Cast( index << 31 ); + float32v sign1 = FS::Cast( ( index >> 1 ) << 31 ); + + mask32v thirdCombo = constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) ? + FS::Cast>( index << ( 31 - 3 ) ) : + index >= int32v( 8 ); + + float32v u = FS::Select( thirdCombo, fY, fX ); + float32v v = FS::Select( index >= int32v( 4 ), fZ, fY ); + + return ( u ^ sign0 ) + ( v ^ sign1 ); + } + } + + template + FS_FORCEINLINE static float32v GetGradientDotPerlin( int32v hash, float32v fX, float32v fY ) { // ( 1+R2, 1 ) ( -1-R2, 1 ) ( 1+R2, -1 ) ( -1-R2, -1 ) // ( 1, 1+R2 ) ( 1, -1-R2 ) ( -1, 1+R2 ) ( -1, -1-R2 ) if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) { - float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), hash, FS::Constant( 1 + kRoot2, -1 - kRoot2, 1 + kRoot2, -1 - kRoot2, 1, -1, 1, -1, 1 + kRoot2, -1 - kRoot2, 1 + kRoot2, -1 - kRoot2, 1, -1, 1, -1 ) ); - float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), hash, FS::Constant( 1, 1, -1, -1, 1 + kRoot2, 1 + kRoot2, -1 - kRoot2, -1 - kRoot2, 1, 1, -1, -1, 1 + kRoot2, 1 + kRoot2, -1 - kRoot2, -1 - kRoot2 ) ); + float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), hash, FS::Constant( 1 + kRoot2f, -1 - kRoot2f, 1 + kRoot2f, -1 - kRoot2f, 1, -1, 1, -1, 1 + kRoot2f, -1 - kRoot2f, 1 + kRoot2f, -1 - kRoot2f, 1, -1, 1, -1 ) ); + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), hash, FS::Constant( 1, 
1, -1, -1, 1 + kRoot2f, 1 + kRoot2f, -1 - kRoot2f, -1 - kRoot2f, 1, 1, -1, -1, 1 + kRoot2f, 1 + kRoot2f, -1 - kRoot2f, -1 - kRoot2f ) ); return FS::FMulAdd( gX, fX, fY * gY ); } else if constexpr( SIMD & FastSIMD::FeatureFlag::AVX2 ) { - float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( 1 + kRoot2, -1 - kRoot2, 1 + kRoot2, -1 - kRoot2, 1, -1, 1, -1 ), hash ); - float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( 1, 1, -1, -1, 1 + kRoot2, 1 + kRoot2, -1 - kRoot2, -1 - kRoot2 ), hash ); + float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( 1 + kRoot2f, -1 - kRoot2f, 1 + kRoot2f, -1 - kRoot2f, 1, -1, 1, -1 ), hash ); + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( 1, 1, -1, -1, 1 + kRoot2f, 1 + kRoot2f, -1 - kRoot2f, -1 - kRoot2f ), hash ); return FS::FMulAdd( gX, fX, fY * gY ); } else { - int32v bit1 = hash << 31; - int32v bit2 = ( hash >> 1 ) << 31; - int32v bit4 = hash << 29; + fX ^= FS::Cast( hash << 31 ); + fY ^= FS::Cast( ( hash >> 1 ) << 31 ); + + float32v u = FS::SelectHighBit( hash << 29, fY, fX ); + float32v v = FS::SelectHighBit( hash << 29, fX, fY ); + + return FS::FMulAdd( float32v( 1.0f + kRoot2f ), u, v ); + } + } + + template + FS_FORCEINLINE static float32v GetGradientDotPerlin( int32v hash, float32v fX, float32v fY, float32v fZ, float32v fW ) + { + if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) + { + float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), FS::Constant( 0, 0, 0, 0, 0, 0, 0, 0, 1, -1, 1, -1, 1, -1, 1, -1 ), hash, FS::Constant( 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 ) ); + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), FS::Constant( 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ), hash, FS::Constant( 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1 ) ); + float32v gZ = FS::NativeExec( 
FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), FS::Constant( 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1 ), hash, FS::Constant( 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 ) ); + float32v gW = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), FS::Constant( 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1 ), hash, FS::Constant( 1, 1, 1, 1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ) ); + + return FS::FMulAdd( gX, fX, FS::FMulAdd( fY, gY, FS::FMulAdd( fZ, gZ, fW * gW ) )); + } + else + { + int32v p = hash & int32v( 3 << 3 ); + + float32v a = FS::Select( p > int32v( 0 ), fX, fY ); + float32v b = FS::SelectHighBit( hash << 27, fY, fZ ); + float32v c = FS::Select( p > int32v( 2 << 3 ), fZ, fW ); + + float32v aSign = FS::Cast( hash << 31 ); + float32v bSign = FS::Cast( ( hash >> 1 ) << 31 ); + float32v cSign = FS::Cast( ( hash >> 2 ) << 31 ); + + return ( a ^ aSign ) + ( b ^ bSign ) + ( c ^ cSign ); + } + } + + template + FS_FORCEINLINE static void ApplyGradientOuterProductVectorProductSimplex( int32v hash31, float32v fX, float32v fY, float32v multiplier, float32v& valueX, float32v& valueY ) + { + int32v hashShifted = FS::BitShiftRightZeroExtend( hash31, 1 ); + int32v indexGradient = hashShifted * int32v( 12 >> 2 ); // [0,12) in the upper four bits + int32v indexOuterVector = ( hashShifted * int32v( ( -4LL << 30 ) / 3 ) ) & int32v( 0xC0000003 ); // [0,12) in bits 0,1,30,31 + + if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) + { + indexGradient >>= 28; + indexOuterVector |= indexOuterVector >> 28; + + float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexGradient, FS::Constant( kRoot3f, -kRoot3f, 1, -1, kRoot3f, -kRoot3f, -1, 1, 2, -2, 0, 0, 0, 0, 0, 0 ) ); + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexGradient, FS::Constant( 1, -1, kRoot3f, -kRoot3f, -1, 1, kRoot3f, -kRoot3f, 0, 0, 2, -2, 0, 0, 0, 0 ) ); + + multiplier *= FS::FMulAdd( fY, gY, fX * gX ); + + valueX = 
FS::FMulAdd( multiplier, FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexOuterVector, FS::Constant( kRoot3f, -kRoot3f, 1, -1, kRoot3f, -kRoot3f, -1, 1, 2, -2, 0, 0, 0, 0, 0, 0 ) ), valueX ); + valueY = FS::FMulAdd( multiplier, FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexOuterVector, FS::Constant( 1, -1, kRoot3f, -kRoot3f, -1, 1, kRoot3f, -kRoot3f, 0, 0, 2, -2, 0, 0, 0, 0 ) ), valueY ); + } + else if constexpr( SIMD & FastSIMD::FeatureFlag::AVX2 ) + { + float32v finalSign = FS::Cast( ( ( indexGradient >> 28 ) ^ indexOuterVector ) << 31 ); + indexGradient >>= 29; + indexOuterVector = ( indexOuterVector >> 1 ) | ( indexOuterVector >> 29 ); + + float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( kRoot3f, 1, kRoot3f, -1, 2, 0, 0, 0 ), indexGradient ); + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( 1, kRoot3f, -1, kRoot3f, 0, 2, 0, 0 ), indexGradient ); + + multiplier *= FS::FMulAdd( fY, gY, fX * gX ) ^ finalSign; + + valueX = FS::FMulAdd( multiplier, FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( kRoot3f, 1, kRoot3f, -1, 2, 0, 0, 0 ), indexOuterVector ), valueX ); + valueY = FS::FMulAdd( multiplier, FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( 1, kRoot3f, -1, kRoot3f, 0, 2, 0, 0 ), indexOuterVector ), valueY ); + } + else + { + { + float32v u = FS::SelectHighBit( indexGradient << 2, fY, fX ); + float32v v = FS::SelectHighBit( indexGradient << 2, fX, fY ); + + float32v a = u * FS::SelectHighBit( indexGradient, float32v( 2 ), float32v( kRoot3f ) ); + float32v b = v ^ FS::Cast( ( indexGradient >> 30 ) << 31 ); + + multiplier *= FS::MaskedAdd( indexGradient >= int32v( 0 ), a, b ) ^ FS::Cast( ( ( indexGradient >> 28 ) ^ indexOuterVector ) << 31 ); + } - if constexpr( !( SIMD & FastSIMD::FeatureFlag::SSE41 ) ) { - bit4 >>= 31; + float32v a = multiplier * FS::SelectHighBit( indexOuterVector, 
float32v( 2 ), float32v( kRoot3f ) ); + float32v b = FS::Masked( indexOuterVector >= int32v( 0 ), multiplier ) ^ FS::Cast( ( indexOuterVector >> 30 ) << 31 ); + + valueX += FS::SelectHighBit( indexOuterVector << 30, b, a ); + valueY += FS::SelectHighBit( indexOuterVector << 30, a, b ); } + } + } - auto bit4Mask = FS::Cast>( bit4 ); + template + FS_FORCEINLINE static void ApplyGradientOuterProductVectorProductCommon( int32v hash31, float32v fX, float32v fY, float32v fZ, float32v multiplier, float32v& valueX, float32v& valueY, float32v& valueZ ) + { + int32v hashShifted = FS::BitShiftRightZeroExtend( hash31, 1 ); + int32v indexGradient = FS::BitShiftRightZeroExtend( hashShifted * int32v( 12 >> 2 ), 28 ); // [0,12) + int32v indexOuterVector = ( hashShifted * int32v( ( -4LL << 30 ) / 3 ) ) & int32v( 0xC0000003 ); // [0,12) in bits 0,1,30,31 + indexOuterVector |= indexOuterVector >> 28; - fX ^= FS::Cast( bit1 ); - fY ^= FS::Cast( bit2 ); + if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) + { + float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexGradient, FS::Constant( 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ) ); + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexGradient, FS::Constant( 1, 1, -1, -1, 0, 0, 0, 0, 1, -1, 1, -1, 0, 0, 0, 0 ) ); + float32v gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexGradient, FS::Constant( 0, 0, 0, 0, 1, 1, -1, -1, 1, 1, -1, -1, 0, 0, 0, 0 ) ); - float32v a = FS::Select( bit4Mask, fY, fX ); - float32v b = FS::Select( bit4Mask, fX, fY ); + multiplier *= FS::FMulAdd( gZ, fZ, FS::FMulAdd( fY, gY, fX * gX ) ); - return FS::FMulAdd( float32v( 1.0f + kRoot2 ), a, b ); + valueX = FS::FMulAdd( multiplier, FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexOuterVector, FS::Constant( 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ) ), valueX ); + valueY = FS::FMulAdd( multiplier, FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps 
), indexOuterVector, FS::Constant( 1, 1, -1, -1, 0, 0, 0, 0, 1, -1, 1, -1, 0, 0, 0, 0 ) ), valueY ); + valueZ = FS::FMulAdd( multiplier, FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexOuterVector, FS::Constant( 0, 0, 0, 0, 1, 1, -1, -1, 1, 1, -1, -1, 0, 0, 0, 0 ) ), valueZ ); + } + else + { + { + float32v sign0 = FS::Cast( indexGradient << 31 ); + float32v sign1 = FS::Cast( ( indexGradient >> 1 ) << 31 ); + + mask32v thirdCombo = constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) ? + FS::Cast>( indexGradient << ( 31 - 3 ) ) : + indexGradient >= int32v( 8 ); + + float32v u = FS::Select( thirdCombo, fY, fX ); + float32v v = FS::Select( indexGradient >= int32v( 4 ), fZ, fY ); + + multiplier *= ( u ^ sign0 ) + ( v ^ sign1 ); + } + + { + indexOuterVector &= int32v( 0xF ); + + float32v signed0 = multiplier ^ FS::Cast( indexOuterVector << 31 ); + float32v signed1 = multiplier ^ FS::Cast( ( indexOuterVector >> 1 ) << 31 ); + + mask32v notYZ = indexOuterVector < int32v( 8 ); + mask32v notXY = indexOuterVector >= int32v( 4 ); + + valueX = FS::MaskedAdd( notYZ, valueX, signed0 ); + valueZ = FS::MaskedAdd( notXY, valueZ, signed1 ); + valueY = FS::InvMaskedAdd( notYZ & notXY, valueY, FS::Select( notXY, signed0, signed1 ) ); + } } } - + template - FS_FORCEINLINE static float32v GetGradientDot( int32v hash, float32v fX, float32v fY, float32v fZ ) - { + FS_FORCEINLINE static void ApplyGradientOuterProductVectorProductSimplex( int32v hash, float32v fX, float32v fY, float32v fZ, float32v fW, float32v multiplier, float32v& valueX, float32v& valueY, float32v& valueZ, float32v& valueW ) + { + int32v hashShifted = FS::BitShiftRightZeroExtend( hash, 2 ); + int32v indexGradient = hashShifted * int32v( 20 >> 2 ); // [0,20) in the upper five bits + int32v indexOuterVector = hashShifted * int32v( ( -8LL << 29 ) / 5 ); + indexOuterVector = ( indexOuterVector & int32v( 0xE0000003 ) ) * int32v( 3 | ( 1 << 27 ) ); // [0,20) in the upper five bits, independently of the above + if 
constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) { - float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), hash, FS::Constant( 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0, 1, 0, -1, 0 ) ); - float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), hash, FS::Constant( 1, 1, -1, -1, 0, 0, 0, 0, 1, -1, 1, -1, 1, -1, 1, -1 ) ); - float32v gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), hash, FS::Constant( 0, 0, 0, 0, 1, 1, -1, -1, 1, 1, -1, -1, 0, 1, 0, -1 ) ); + indexGradient = FS::BitShiftRightZeroExtend( indexGradient, 27 ); + indexOuterVector = FS::BitShiftRightZeroExtend( indexOuterVector, 27 ); + + const auto tableX = FS::Constant( kSkew4f + 1, kSkew4f, kSkew4f, kSkew4f, -1, 1, 0, 0, -1, 0, 1, 0, -1, 0, 0, 1 ); + const auto tableY = FS::Constant( kSkew4f, kSkew4f + 1, kSkew4f, kSkew4f, 1, -1, 0, 0, 0, -1, 0, 1, 0, -1, 1, 0 ); + const auto tableZ = FS::Constant( kSkew4f, kSkew4f, kSkew4f + 1, kSkew4f, 0, 0, -1, 1, 1, 0, -1, 0, 0, 1, -1, 0 ); + const auto tableW = FS::Constant( kSkew4f, kSkew4f, kSkew4f, kSkew4f + 1, 0, 0, 1, -1, 0, 1, 0, -1, 1, 0, 0, -1 ); - return FS::FMulAdd( gX, fX, FS::FMulAdd( fY, gY, fZ * gZ )); + float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableX, indexGradient, -tableX ); + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableY, indexGradient, -tableY ); + float32v gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableZ, indexGradient, -tableZ ); + float32v gW = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableW, indexGradient, -tableW ); + + multiplier *= FS::FMulAdd( gW, fW, FS::FMulAdd( gZ, fZ, FS::FMulAdd( gY, fY, gX * fX ) ) ); + + valueX = FS::FMulAdd( multiplier, FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableX, indexOuterVector, -tableX ), valueX ); + valueY = FS::FMulAdd( multiplier, FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableY, indexOuterVector, 
-tableY ), valueY ); + valueZ = FS::FMulAdd( multiplier, FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableZ, indexOuterVector, -tableZ ), valueZ ); + valueW = FS::FMulAdd( multiplier, FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableW, indexOuterVector, -tableW ), valueW ); } else { - int32v hasha13 = hash & int32v( 13 ); - - // if h > 7 then y, else x - mask32v gt7; - if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) { - gt7 = FS::Cast>( hash << 28 ); + int32v indexA = indexGradient & int32v( 0x03 << 27 ); + int32v indexB = ( indexGradient >> 2 ) & int32v( 0x07 << 27 ); + indexB ^= indexA; // Simplifies the AVX512_F case. + + mask32v extra = indexB >= int32v( 0x04 << 27 ); + mask32v equal = ( indexA == indexB ); + indexA |= FS::Cast( equal ); // Forces decrement conditions to fail. + + float32v neutral = FS::Masked( equal | extra, FS::MaskedMul( extra, float32v( kSkew4f ), float32v( -1.0f ) ) ); + + float32v gX = FS::MaskedIncrement( indexB == int32v( 0 << 27 ), FS::MaskedDecrement( indexA == int32v( 0 << 27 ), neutral ) ); + float32v gY = FS::MaskedIncrement( indexB == int32v( 1 << 27 ), FS::MaskedDecrement( indexA == int32v( 1 << 27 ), neutral ) ); + float32v gZ = FS::MaskedIncrement( indexB == int32v( 2 << 27 ), FS::MaskedDecrement( indexA == int32v( 2 << 27 ), neutral ) ); + float32v gW = FS::MaskedIncrement( indexB == int32v( 3 << 27 ), FS::MaskedDecrement( indexA == int32v( 3 << 27 ), neutral ) ); + + multiplier *= FS::FMulAdd( gW, fW, FS::FMulAdd( gZ, fZ, FS::FMulAdd( gY, fY, gX * fX ) ) ); } - else + { - gt7 = hasha13 > int32v( 7 ); + int32v indexA = indexOuterVector & int32v( 0x03 << 27 ); + int32v indexB = ( indexOuterVector >> 2 ) & int32v( 0x07 << 27 ); + indexB ^= indexA; // Simplifies the AVX512_F case. + + mask32v extra = indexB >= int32v( 0x04 << 27 ); + mask32v equal = ( indexA == indexB ); + indexA |= FS::Cast( equal ); // Forces decrement conditions to fail. 
+ + float32v neutral = FS::Masked( equal | extra, FS::MaskedMul( extra, float32v( kSkew4f ), float32v( -1.0f ) ) ); + + float32v gX = FS::MaskedIncrement( indexB == int32v( 0 << 27 ), FS::MaskedDecrement( indexA == int32v( 0 << 27 ), neutral ) ); + float32v gY = FS::MaskedIncrement( indexB == int32v( 1 << 27 ), FS::MaskedDecrement( indexA == int32v( 1 << 27 ), neutral ) ); + float32v gZ = FS::MaskedIncrement( indexB == int32v( 2 << 27 ), FS::MaskedDecrement( indexA == int32v( 2 << 27 ), neutral ) ); + float32v gW = FS::MaskedIncrement( indexB == int32v( 3 << 27 ), FS::MaskedDecrement( indexA == int32v( 3 << 27 ), neutral ) ); + + valueX = FS::FMulAdd( multiplier, gX, valueX ); + valueY = FS::FMulAdd( multiplier, gY, valueY ); + valueZ = FS::FMulAdd( multiplier, gZ, valueZ ); + valueW = FS::FMulAdd( multiplier, gW, valueW ); } - float32v u = FS::Select( gt7, fY, fX ); - - // if h < 4 then y else if h is 12 or 14 then x else z - float32v v = FS::Select( hasha13 == int32v( 12 ), fX, fZ ); - v = FS::Select( hasha13 < int32v( 2 ), fY, v ); - - // if h1 then -u else u - // if h2 then -v else v - float32v h1 = FS::Cast( hash << 31 ); - float32v h2 = FS::Cast( ( hash >> 1 ) << 31 ); - // then add them - return ( u ^ h1 ) + ( v ^ h2 ); } } - + template - FS_FORCEINLINE static float32v GetGradientDot( int32v hash, float32v fX, float32v fY, float32v fZ, float32v fW ) + FS_FORCEINLINE static void ApplyOrthogonalGradientMatrixVectorProductSimplex( int32v hash31, float32v fX, float32v fY, float32v multiplier, float32v& valueX, float32v& valueY ) { + int32v index = FS::BitShiftRightZeroExtend( hash31, 1 ) * int32v( 12 >> 2 ); // [0,12) in the upper four bits + if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) { - float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), FS::Constant( 0, 0, 0, 0, 0, 0, 0, 0, 1, -1, 1, -1, 1, -1, 1, -1 ), hash, FS::Constant( 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 ) ); - float32v gY = FS::NativeExec( 
FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), FS::Constant( 1, -1, 1, -1, 1, -1, 1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ), hash, FS::Constant( 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1 ) ); - float32v gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), FS::Constant( 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1 ), hash, FS::Constant( 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, -1, -1, -1, -1 ) ); - float32v gW = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), FS::Constant( 1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1 ), hash, FS::Constant( 1, 1, 1, 1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 ) ); + index = FS::BitShiftRightZeroExtend( index, 28 ); - return FS::FMulAdd( gX, fX, FS::FMulAdd( fY, gY, FS::FMulAdd( fZ, gZ, fW * gW ) )); + float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant( kSkew2f, -kSkew2f, kSkew2f, -kSkew2f, kSkew2f + 1, -kSkew2f - 1, kSkew2f + 1, -kSkew2f - 1, 1, -1, 1, -1, 0, 0, 0, 0 ) ); + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), index, FS::Constant( kSkew2f + 1, kSkew2f + 1, -kSkew2f - 1, -kSkew2f - 1, kSkew2f, kSkew2f, -kSkew2f, -kSkew2f, 1, 1, -1, -1, 0, 0, 0, 0 ) ); + + valueX = FS::FMulAdd( multiplier, FS::FMulAdd( fY, gY, fX * gX ), valueX ); + multiplier ^= FS::Cast( hash31 << 31 ); + valueY = FS::FMulAdd( multiplier, FS::FMulSub( fY, gX, fX * gY ), valueY ); + } + else if constexpr( SIMD & FastSIMD::FeatureFlag::AVX2 ) + { + float32v signX = FS::Cast( ( index >> 28 ) << 31 ); + index = FS::BitShiftRightZeroExtend( index, 29 ); + + float32v gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( kSkew2f, kSkew2f, kSkew2f + 1, kSkew2f + 1, 1, 1, 0, 0 ), index ) ^ signX; + float32v gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( kSkew2f + 1, -kSkew2f - 1, kSkew2f, -kSkew2f, 1, -1, 0, 0 ), index ); + + valueX = FS::FMulAdd( multiplier, FS::FMulAdd( fY, gY, fX * gX ), 
valueX ); + multiplier ^= FS::Cast( hash31 << 31 ); + valueY = FS::FMulAdd( multiplier, FS::FMulSub( fY, gX, fX * gY ), valueY ); } else { - int32v p = hash & int32v( 3 << 3 ); + int32v ofThree = FS::BitShiftRightZeroExtend( index, 30 ); + float32v signX = FS::Cast( ( index >> 28 ) << 31 ); + float32v signY = FS::Cast( ( index >> 29 ) << 31 ); + + float32v masked = FS::Masked( index >= int32v( 0 ), float32v( kSkew2f ) ); + float32v gX = FS::MaskedIncrement( ofThree != int32v( 0 ), masked ) ^ signX; + float32v gY = FS::MaskedIncrement( ofThree != int32v( 1 ), masked ) ^ signY; + + valueX = FS::FMulAdd( multiplier, FS::FMulAdd( fY, gY, fX * gX ), valueX ); + multiplier ^= FS::Cast( hash31 << 31 ); + valueY = FS::FMulAdd( multiplier, FS::FMulSub( fY, gX, fX * gY ), valueY ); + } + } + + template + FS_FORCEINLINE static void ApplyOrthogonalGradientMatrixVectorProductCommon( int32v hash31, float32v fX, float32v fY, float32v fZ, float32v multiplier, float32v& valueX, float32v& valueY, float32v& valueZ ) + { + const float kComponentA = 2.224744871391589f; + const float kComponentB = -0.224744871391589f; + const float kComponentC = -1.0f; + const float kComponentsDE = 1.0f; + const float kComponentF = 2.0f; + + int32v hashShifted = FS::BitShiftRightZeroExtend( hash31, 1 ); + int32v indexFacetBasisWithPermute2 = hashShifted * int32v( ( -4LL << 30 ) / 3 ); // [0,3) in the highest two bits, [0,8) in the lowest three bits + int32v indexPermutation2HighBit = ( indexFacetBasisWithPermute2 << 29 ); // & int32v( 1 << 31 ); // [0,1) in the most significant bit + int32v indexPermutation3 = FS::BitShiftRightZeroExtend( hashShifted * int32v( 3 ), 30 ); // [0,3) + float32v finalSign = FS::Cast( hash31 << 31 ); + + float32v valueAB, valueBA, valueC; + + if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) + { + //indexFacetBasisWithPermute2 = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_rol_epi32 ), indexFacetBasisWithPermute2, 2 ); + indexFacetBasisWithPermute2 = FS::NativeExec( 
FS_BIND_INTRINSIC( _mm512_rolv_epi32 ), indexFacetBasisWithPermute2, int32v( 2 ) ); + + const auto tableA_gX = FS::Constant( kComponentA, kComponentA, kComponentC, kComponentC, -kComponentA, -kComponentA, kComponentC, kComponentC, kComponentA, kComponentA, kComponentC, kComponentC, -kComponentA, -kComponentA, kComponentC, kComponentC ); + const auto tableA_gY = FS::Constant( kComponentC, kComponentB, kComponentA, kComponentA, kComponentC, kComponentB, -kComponentA, -kComponentA, kComponentC, -kComponentB, kComponentA, kComponentA, kComponentC, -kComponentB, -kComponentA, -kComponentA ); + const auto tableA_gZ = FS::Constant( kComponentB, kComponentC, kComponentB, kComponentB, kComponentB, kComponentC, kComponentB, kComponentB, -kComponentB, kComponentC, -kComponentB, -kComponentB, -kComponentB, kComponentC, -kComponentB, -kComponentB ); + + const auto tableB_gX = FS::Constant( kComponentB, kComponentB, kComponentC, kComponentC, -kComponentB, -kComponentB, kComponentC, kComponentC, kComponentB, kComponentB, kComponentC, kComponentC, -kComponentB, -kComponentB, kComponentC, kComponentC ); + const auto tableB_gY = FS::Constant( kComponentC, kComponentA, kComponentB, kComponentB, kComponentC, kComponentA, -kComponentB, -kComponentB, kComponentC, -kComponentA, kComponentB, kComponentB, kComponentC, -kComponentA, -kComponentB, -kComponentB ); + const auto tableB_gZ = FS::Constant( kComponentA, kComponentC, kComponentA, kComponentA, kComponentA, kComponentC, kComponentA, kComponentA, -kComponentA, kComponentC, -kComponentA, -kComponentA, -kComponentA, kComponentC, -kComponentA, -kComponentA ); + + const auto tableC_gX = FS::Constant( kComponentsDE, kComponentsDE, kComponentF, kComponentF, kComponentC, kComponentC, kComponentF, kComponentF, kComponentsDE, kComponentsDE, kComponentF, kComponentF, kComponentC, kComponentC, kComponentF, kComponentF ); + const auto tableC_gY = FS::Constant( kComponentF, kComponentsDE, kComponentsDE, kComponentsDE, kComponentF, kComponentsDE, 
kComponentC, kComponentC, kComponentF, kComponentC, kComponentsDE, kComponentsDE, kComponentF, kComponentC, kComponentC, kComponentC ); + const auto tableC_gZ = FS::Constant( kComponentsDE, kComponentF, kComponentsDE, kComponentsDE, kComponentsDE, kComponentF, kComponentsDE, kComponentsDE, kComponentC, kComponentF, kComponentC, kComponentC, kComponentC, kComponentF, kComponentC, kComponentC ); + + float32v valueAB_gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableA_gX, indexFacetBasisWithPermute2, tableB_gX ); + float32v valueAB_gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableA_gY, indexFacetBasisWithPermute2, tableB_gY ); + float32v valueAB_gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableA_gZ, indexFacetBasisWithPermute2, tableB_gZ ); + valueAB = FS::FMulAdd( valueAB_gZ, fZ, FS::FMulAdd( fY, valueAB_gY, fX * valueAB_gX ) ); + + float32v valueBA_gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableB_gX, indexFacetBasisWithPermute2, tableA_gX ); + float32v valueBA_gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableB_gY, indexFacetBasisWithPermute2, tableA_gY ); + float32v valueBA_gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), tableB_gZ, indexFacetBasisWithPermute2, tableA_gZ ); + valueBA = FS::FMulAdd( valueBA_gZ, fZ, FS::FMulAdd( fY, valueBA_gY, fX * valueBA_gX ) ); + + float32v valueC_gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexFacetBasisWithPermute2, tableC_gX ); + float32v valueC_gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexFacetBasisWithPermute2, tableC_gY ); + float32v valueC_gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutexvar_ps ), indexFacetBasisWithPermute2, tableC_gZ ); + valueC = FS::FMulAdd( valueC_gZ, fZ, FS::FMulAdd( fY, valueC_gY, fX * valueC_gX ) ); + } + else + { + float32v sign0 = FS::Cast( indexFacetBasisWithPermute2 << 31 ); + float32v sign1 = FS::Cast( ( 
indexFacetBasisWithPermute2 << 30 ) & int32v( 1 << 31 ) ); + + auto notYZ = indexFacetBasisWithPermute2 >= int32v( 0 ); + auto notXY = ( indexFacetBasisWithPermute2 << 1 ) >= int32v( 0 ); + + float32v valueA_gX = FS::Select( notYZ, float32v( kComponentA ) ^ sign0, float32v( kComponentC ) ); + float32v valueA_gY = FS::Select( notYZ & notXY, float32v( kComponentC ), FS::Select( notXY, float32v( kComponentA ) ^ sign0, float32v( kComponentB ) ^ sign1 ) ); + float32v valueA_gZ = FS::Select( notXY, float32v( kComponentB ) ^ sign1, float32v( kComponentC ) ); + float32v valueA = FS::FMulAdd( valueA_gZ, fZ, FS::FMulAdd( fY, valueA_gY, fX * valueA_gX ) ); + + float32v valueB_gX = FS::Select( notYZ, float32v( kComponentB ) ^ sign0, float32v( kComponentC ) ); + float32v valueB_gY = FS::Select( notYZ & notXY, float32v( kComponentC ), FS::Select( notXY, float32v( kComponentB ) ^ sign0, float32v( kComponentA ) ^ sign1 ) ); + float32v valueB_gZ = FS::Select( notXY, float32v( kComponentA ) ^ sign1, float32v( kComponentC ) ); + float32v valueB = FS::FMulAdd( valueB_gZ, fZ, FS::FMulAdd( fY, valueB_gY, fX * valueB_gX ) ); + + float32v valueC_gX = FS::Select( notYZ, float32v( kComponentsDE ) ^ sign0, float32v( kComponentF ) ); + float32v valueC_gY = FS::Select( notYZ & notXY, float32v( kComponentF ), FS::Select( notXY, float32v( kComponentsDE ) ^ sign0, float32v( kComponentsDE ) ^ sign1 ) ); + float32v valueC_gZ = FS::Select( notXY, float32v( kComponentsDE ) ^ sign1, float32v( kComponentF ) ); + valueC = FS::FMulAdd( valueC_gZ, fZ, FS::FMulAdd( fY, valueC_gY, fX * valueC_gX ) ); + + valueAB = FS::SelectHighBit( indexPermutation2HighBit, valueB, valueA ); + valueBA = FS::SelectHighBit( indexPermutation2HighBit, valueA, valueB ); + } + + multiplier ^= finalSign; + valueX = FS::FMulAdd( multiplier, FS::Select( indexPermutation3 == int32v( 0 ), valueC, valueAB ), valueX ); + valueY = FS::FMulAdd( multiplier, FS::Select( indexPermutation3 == int32v( 1 ), valueC, FS::Select( 
indexPermutation3 == int32v( 2 ), valueBA, valueAB ) ), valueY ); + valueZ = FS::FMulAdd( multiplier, FS::Select( indexPermutation3 == int32v( 2 ), valueC, valueBA ), valueZ ); + } + + template + static void FS_VECTORCALL ApplyOrthogonalGradientMatrixVectorProductSimplex( int32v hash31, float32v fX, float32v fY, float32v fZ, float32v fW, float32v multiplier, float32v& valueX, float32v& valueY, float32v& valueZ, float32v& valueW ) + { + const float kComponentPairwiseIndexedNegativeAB = -0.375999676691291f; + const float kComponentPairwiseUnindexedFillerAB = 0.222726847849776f; + const float kComponentPairwiseIndexedPositiveD = -kSkew4f; + const float kComponentPairwiseUnindexedD = kSkew4f; + + const float kDeltaPairwiseToSingleAB = -0.124000323308709f; + const float kDeltaPairwiseToSingleD = 0.190983005625053f; + const float kDeltaSingleToExtra = kSkew4f; + const float kDeltaPairwiseABToC = 0.437016024448821f; + const float kDeltaUnindexedFillerToDiagonal = -kRoot2f; + + const float kDeltaPairwiseToSingleExtraAB = kDeltaPairwiseToSingleAB + kDeltaSingleToExtra; + const float kDeltaPairwiseToSingleExtraD = kDeltaPairwiseToSingleD + kDeltaSingleToExtra; + + const float sIdxABC = kComponentPairwiseIndexedNegativeAB + kDeltaPairwiseToSingleAB; + const float sDiagABC = kComponentPairwiseUnindexedFillerAB + kDeltaPairwiseToSingleAB + kDeltaUnindexedFillerToDiagonal; + const float sFillABC = kComponentPairwiseUnindexedFillerAB + kDeltaPairwiseToSingleAB; + const float sIdxD = kComponentPairwiseIndexedPositiveD + kDeltaPairwiseToSingleD - 1; + const float sFillD = kComponentPairwiseUnindexedD + kDeltaPairwiseToSingleD; + + const float pIdxPosAB = kComponentPairwiseIndexedNegativeAB + 1; + const float pIdxNegAB = kComponentPairwiseIndexedNegativeAB; + const float pFillAB = kComponentPairwiseUnindexedFillerAB; + const float pDiagAB = kComponentPairwiseUnindexedFillerAB + kDeltaUnindexedFillerToDiagonal; + const float pIdxPosC = kComponentPairwiseIndexedNegativeAB + 
kDeltaPairwiseABToC + 1; + const float pIdxNegC = kComponentPairwiseIndexedNegativeAB + kDeltaPairwiseABToC; + const float pFillC = kComponentPairwiseUnindexedFillerAB + kDeltaPairwiseABToC; + const float pIdxPosD = kComponentPairwiseIndexedPositiveD; + const float pIdxNegD = kComponentPairwiseIndexedPositiveD - 1; + const float pFillD = kComponentPairwiseUnindexedD; + + const float eIdxABC = kComponentPairwiseIndexedNegativeAB + kDeltaPairwiseToSingleExtraAB + 1; + const float eDiagABC = kComponentPairwiseUnindexedFillerAB + kDeltaPairwiseToSingleExtraAB + kDeltaUnindexedFillerToDiagonal; + const float eFillABC = kComponentPairwiseUnindexedFillerAB + kDeltaPairwiseToSingleExtraAB; + const float eIdxD = kComponentPairwiseIndexedPositiveD + kDeltaPairwiseToSingleExtraD; + const float eFillD = kComponentPairwiseUnindexedD + kDeltaPairwiseToSingleExtraD; + + int32v hashShifted = FS::BitShiftRightZeroExtend( hash31, 2 ); + int32v indexBasis = hashShifted * int32v( 20 >> 2 ); // [0,20) << 27 + int32v indexPermutation3 = ( hashShifted * int32v( ( -4LL << 29 ) / 3 ) ) >> 29; // [0,3) + int32v indexPermutation8 = indexBasis >> 24; // & int32v( 0x07 ); + float32v finalSign = FS::Cast( hash31 << 31 ); + + float32v valueA, valueB, valueC, valueD; + float32v valueA_gX, valueB_gX, valueC_gX, valueD_gX; + float32v valueA_gY, valueB_gY, valueC_gY, valueD_gY; + float32v valueA_gZ, valueB_gZ, valueC_gZ, valueD_gZ; + float32v valueA_gW, valueB_gW, valueC_gW, valueD_gW; + + if constexpr( SIMD & FastSIMD::FeatureFlag::AVX512_F ) + { + indexBasis >>= 27; + + valueA_gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sIdxABC, sDiagABC, sDiagABC, sDiagABC, pIdxPosAB, pIdxNegAB, pDiagAB, pDiagAB, pIdxPosAB, pDiagAB, pIdxNegAB, pDiagAB, pIdxPosAB, pDiagAB, pDiagAB, pIdxNegAB ), indexBasis, + FS::Constant( eIdxABC, eDiagABC, eDiagABC, eDiagABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + valueB_gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + 
FS::Constant( sIdxABC, sFillABC, sFillABC, sFillABC, pIdxPosAB, pIdxNegAB, pFillAB, pFillAB, pIdxPosAB, pFillAB, pIdxNegAB, pFillAB, pIdxPosAB, pFillAB, pFillAB, pIdxNegAB ), indexBasis, + FS::Constant( eIdxABC, eFillABC, eFillABC, eFillABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + valueC_gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sIdxABC, sFillABC, sFillABC, sFillABC, pIdxPosC, pIdxNegC, pFillC, pFillC, pIdxPosC, pFillC, pIdxNegC, pFillC, pIdxPosC, pFillC, pFillC, pIdxNegC ), indexBasis, + FS::Constant( eIdxABC, eFillABC, eFillABC, eFillABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + valueD_gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sIdxD, sFillD, sFillD, sFillD, pIdxPosD, pIdxNegD, pFillD, pFillD, pIdxPosD, pFillD, pIdxNegD, pFillD, pIdxPosD, pFillD, pFillD, pIdxNegD ), indexBasis, + FS::Constant( eIdxD, eFillD, eFillD, eFillD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + + valueA = valueA_gX * fX; + valueB = valueB_gX * fX; + valueC = valueC_gX * fX; + valueD = valueD_gX * fX; + + valueA_gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sDiagABC, sIdxABC, sFillABC, sFillABC, pIdxNegAB, pIdxPosAB, pFillAB, pFillAB, pDiagAB, pIdxPosAB, pDiagAB, pIdxNegAB, pDiagAB, pIdxPosAB, pIdxNegAB, pDiagAB ), indexBasis, + FS::Constant( eDiagABC, eIdxABC, eFillABC, eFillABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + valueB_gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sFillABC, sIdxABC, sDiagABC, sDiagABC, pIdxNegAB, pIdxPosAB, pDiagAB, pDiagAB, pFillAB, pIdxPosAB, pFillAB, pIdxNegAB, pFillAB, pIdxPosAB, pIdxNegAB, pFillAB ), indexBasis, + FS::Constant( eFillABC, eIdxABC, eDiagABC, eDiagABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + valueC_gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sFillABC, sIdxABC, sFillABC, sFillABC, pIdxNegC, pIdxPosC, pFillC, pFillC, pFillC, pIdxPosC, pFillC, 
pIdxNegC, pFillC, pIdxPosC, pIdxNegC, pFillC ), indexBasis, + FS::Constant( eFillABC, eIdxABC, eFillABC, eFillABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + valueD_gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sFillD, sIdxD, sFillD, sFillD, pIdxNegD, pIdxPosD, pFillD, pFillD, pFillD, pIdxPosD, pFillD, pIdxNegD, pFillD, pIdxPosD, pIdxNegD, pFillD ), indexBasis, + FS::Constant( eFillD, eIdxD, eFillD, eFillD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + + valueA = FS::FMulAdd( valueA_gY, fY, valueA ); + valueB = FS::FMulAdd( valueB_gY, fY, valueB ); + valueC = FS::FMulAdd( valueC_gY, fY, valueC ); + valueD = FS::FMulAdd( valueD_gY, fY, valueD ); + + valueA_gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sFillABC, sFillABC, sIdxABC, sFillABC, pDiagAB, pDiagAB, pIdxPosAB, pIdxNegAB, pIdxNegAB, pFillAB, pIdxPosAB, pFillAB, pFillAB, pIdxNegAB, pIdxPosAB, pFillAB ), indexBasis, + FS::Constant( eFillABC, eFillABC, eIdxABC, eFillABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + valueB_gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sDiagABC, sDiagABC, sIdxABC, sFillABC, pFillAB, pFillAB, pIdxPosAB, pIdxNegAB, pIdxNegAB, pDiagAB, pIdxPosAB, pDiagAB, pDiagAB, pIdxNegAB, pIdxPosAB, pDiagAB ), indexBasis, + FS::Constant( eDiagABC, eDiagABC, eIdxABC, eFillABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + valueC_gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sFillABC, sFillABC, sIdxABC, sDiagABC, pFillC, pFillC, pIdxPosC, pIdxNegC, pIdxNegC, pFillC, pIdxPosC, pFillC, pFillC, pIdxNegC, pIdxPosC, pFillC ), indexBasis, + FS::Constant( eFillABC, eFillABC, eIdxABC, eDiagABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + valueD_gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sFillD, sFillD, sIdxD, sFillD, pFillD, pFillD, pIdxPosD, pIdxNegD, pIdxNegD, pFillD, pIdxPosD, pFillD, pFillD, pIdxNegD, pIdxPosD, pFillD ), 
indexBasis, + FS::Constant( eFillD, eFillD, eIdxD, eFillD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + + valueA = FS::FMulAdd( valueA_gZ, fZ, valueA ); + valueB = FS::FMulAdd( valueB_gZ, fZ, valueB ); + valueC = FS::FMulAdd( valueC_gZ, fZ, valueC ); + valueD = FS::FMulAdd( valueD_gZ, fZ, valueD ); + + valueA_gW = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sFillABC, sFillABC, sFillABC, sIdxABC, pFillAB, pFillAB, pIdxNegAB, pIdxPosAB, pFillAB, pIdxNegAB, pFillAB, pIdxPosAB, pIdxNegAB, pFillAB, pFillAB, pIdxPosAB ), indexBasis, + FS::Constant( eFillABC, eFillABC, eFillABC, eIdxABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + valueB_gW = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sFillABC, sFillABC, sFillABC, sIdxABC, pDiagAB, pDiagAB, pIdxNegAB, pIdxPosAB, pDiagAB, pIdxNegAB, pDiagAB, pIdxPosAB, pIdxNegAB, pDiagAB, pDiagAB, pIdxPosAB ), indexBasis, + FS::Constant( eFillABC, eFillABC, eFillABC, eIdxABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + valueC_gW = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sDiagABC, sDiagABC, sDiagABC, sIdxABC, pFillC, pFillC, pIdxNegC, pIdxPosC, pFillC, pIdxNegC, pFillC, pIdxPosC, pIdxNegC, pFillC, pFillC, pIdxPosC ), indexBasis, + FS::Constant( eDiagABC, eDiagABC, eDiagABC, eIdxABC, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + valueD_gW = FS::NativeExec( FS_BIND_INTRINSIC( _mm512_permutex2var_ps ), + FS::Constant( sFillD, sFillD, sFillD, sIdxD, pFillD, pFillD, pIdxNegD, pIdxPosD, pFillD, pIdxNegD, pFillD, pIdxPosD, pIdxNegD, pFillD, pFillD, pIdxPosD ), indexBasis, + FS::Constant( eFillD, eFillD, eFillD, eIdxD, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ) + ); + + valueA = FS::FMulAdd( valueA_gW, fW, valueA ); + valueB = FS::FMulAdd( valueB_gW, fW, valueB ); + valueC = FS::FMulAdd( valueC_gW, fW, valueC ); + valueD = FS::FMulAdd( valueD_gW, fW, valueD ); + } + else if constexpr( SIMD & FastSIMD::FeatureFlag::AVX2 ) + { + const auto tableAB = 
FS::Constant( pFillAB, pIdxNegAB, pFillAB, pIdxPosAB, sFillABC, sIdxABC, eFillABC, eIdxABC ); + const auto tableC = FS::Constant( pFillC, pIdxNegC, pFillC, pIdxPosC, sFillABC, sIdxABC, eFillABC, eIdxABC ); + const auto tableD = FS::Constant( pFillD, pIdxNegD, pFillD, pIdxPosD, sFillD, sIdxD, eFillD, eIdxD ); + + int32v indexPositive = indexBasis & int32v( 0x03 << 27 ); + int32v indexNegative = ( indexBasis >> 2 ) & int32v( 0x03 << 27 ); + indexNegative ^= indexPositive; + + auto extraCase = ( indexBasis >= int32v( 0x10 << 27 ) ); + auto singleCase = ( indexPositive == indexNegative ); + indexPositive |= FS::Cast( singleCase ); // Force indexPositive checks to fail + + int32v indexSelectBase = FS::Masked( singleCase, int32v( 4 ) ) | FS::Masked( extraCase, int32v( 2 ) ); + + int32v indexedCounter( -1 ); - float32v a = FS::Select( p > int32v( 0 ), fX, fY ); - float32v b; - if constexpr( SIMD & FastSIMD::FeatureFlag::SSE41 ) { - b = FS::Select( FS::Cast>( hash << 27 ), fY, fZ ); + auto indexedPositive = ( indexPositive == int32v( 0 << 27 ) ); + auto indexed = indexedPositive | ( indexNegative == int32v( 0 << 27 ) ); + int32v indexSelect = FS::MaskedIncrement( indexed, indexSelectBase | FS::Masked( indexedPositive, int32v( 2 ) ) ); + + valueA_gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), FS::Constant( pDiagAB, pIdxNegAB, pDiagAB, pIdxPosAB, sDiagABC, sIdxABC, eDiagABC, eIdxABC ), indexSelect ); + valueB_gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableAB, indexSelect ); + valueC_gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableC, indexSelect ); + valueD_gX = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableD, indexSelect ); + + indexedCounter = FS::MaskedDecrement( indexed, indexedCounter ); } - else + + valueA = valueA_gX * fX; + valueB = valueB_gX * fX; + valueC = valueC_gX * fX; + valueD = valueD_gX * fX; + { - b = FS::Select( p > int32v( 1 << 3 ), fY, fZ ); + auto 
indexedPositive = ( indexPositive == int32v( 1 << 27 ) ); + auto indexed = indexedPositive | ( indexNegative == int32v( 1 << 27 ) ); + int32v indexSelect = FS::MaskedIncrement( indexed, indexSelectBase | FS::Masked( indexedPositive, int32v( 2 ) ) ); + + valueA_gY = valueB_gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableAB, indexSelect ); + valueC_gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableC, indexSelect ); + valueD_gY = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableD, indexSelect ); + + int32v maskedIndexedCounter = FS::InvMasked( indexed, indexedCounter ); + valueA_gY = FS::MaskedAdd( maskedIndexedCounter == int32v( -2 ), valueA_gY, float32v( kDeltaUnindexedFillerToDiagonal ) ); + valueB_gY = FS::MaskedAdd( maskedIndexedCounter == int32v( -1 ), valueB_gY, float32v( kDeltaUnindexedFillerToDiagonal ) ); + + indexedCounter = FS::MaskedDecrement( indexed, indexedCounter ); } - float32v c = FS::Select( p > int32v( 2 << 3 ), fZ, fW ); - float32v aSign = FS::Cast( hash << 31 ); - float32v bSign = FS::Cast( ( hash >> 1 ) << 31 ); - float32v cSign = FS::Cast( ( hash >> 2 ) << 31 ); + valueA = FS::FMulAdd( valueA_gY, fY, valueA ); + valueB = FS::FMulAdd( valueB_gY, fY, valueB ); + valueC = FS::FMulAdd( valueC_gY, fY, valueC ); + valueD = FS::FMulAdd( valueD_gY, fY, valueD ); - return ( a ^ aSign ) + ( b ^ bSign ) + ( c ^ cSign ); + { + auto indexedPositive = ( indexPositive == int32v( 2 << 27 ) ); + auto indexed = indexedPositive | ( indexNegative == int32v( 2 << 27 ) ); + int32v indexSelect = FS::MaskedIncrement( indexed, indexSelectBase | FS::Masked( indexedPositive, int32v( 2 ) ) ); + + valueA_gZ = valueB_gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableAB, indexSelect ); + valueC_gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableC, indexSelect ); + valueD_gZ = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableD, indexSelect ); + + 
int32v maskedIndexedCounter = FS::InvMasked( indexed, indexedCounter ); + valueA_gZ = FS::MaskedAdd( maskedIndexedCounter == int32v( -3 ), valueA_gZ, float32v( kDeltaUnindexedFillerToDiagonal ) ); + valueB_gZ = FS::MaskedAdd( maskedIndexedCounter == int32v( -2 ), valueB_gZ, float32v( kDeltaUnindexedFillerToDiagonal ) ); + valueC_gZ = FS::MaskedAdd( maskedIndexedCounter == int32v( -1 ), valueC_gZ, float32v( kDeltaUnindexedFillerToDiagonal ) ); + + indexedCounter = FS::MaskedDecrement( indexed, indexedCounter ); + } + + valueA = FS::FMulAdd( valueA_gZ, fZ, valueA ); + valueB = FS::FMulAdd( valueB_gZ, fZ, valueB ); + valueC = FS::FMulAdd( valueC_gZ, fZ, valueC ); + valueD = FS::FMulAdd( valueD_gZ, fZ, valueD ); + + { + auto indexedPositive = ( indexPositive == int32v( 3 << 27 ) ); + auto indexed = indexedPositive | ( indexNegative == int32v( 3 << 27 ) ); + int32v indexSelect = FS::MaskedIncrement( indexed, indexSelectBase | FS::Masked( indexedPositive, int32v( 2 ) ) ); + + valueA_gW = valueB_gW = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableAB, indexSelect ); + valueC_gW = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableC, indexSelect ); + valueD_gW = FS::NativeExec( FS_BIND_INTRINSIC( _mm256_permutevar8x32_ps ), tableD, indexSelect ); + + int32v maskedIndexedCounter = FS::InvMasked( indexed, indexedCounter ); + valueB_gW = FS::MaskedAdd( maskedIndexedCounter == int32v( -3 ), valueB_gW, float32v( kDeltaUnindexedFillerToDiagonal ) ); + valueC_gW = FS::MaskedAdd( maskedIndexedCounter == int32v( -2 ), valueC_gW, float32v( kDeltaUnindexedFillerToDiagonal ) ); + } + + valueA = FS::FMulAdd( valueA_gW, fW, valueA ); + valueB = FS::FMulAdd( valueB_gW, fW, valueB ); + valueC = FS::FMulAdd( valueC_gW, fW, valueC ); + valueD = FS::FMulAdd( valueD_gW, fW, valueD ); + } + else + { + int32v indexPositive = indexBasis & int32v( 0x03 << 27 ); + int32v indexNegative = ( indexBasis >> 2 ) & int32v( 0x03 << 27 ); + indexNegative ^= 
indexPositive; + + auto extraCase = ( indexBasis >= int32v( 0x10 << 27 ) ); + auto singleCase = ( indexPositive == indexNegative ); + auto singleNonExtraCase = indexBasis < int32v( 0x04 << 27 ); + indexPositive |= FS::Cast( singleNonExtraCase ); // Force indexPositive checks to fail + + float32v singleOffsetAB = FS::MaskedAdd( extraCase, float32v( kDeltaPairwiseToSingleAB ), float32v( kDeltaSingleToExtra ) ); + float32v componentIndexedNegativeAB = FS::MaskedAdd( singleCase, float32v( kComponentPairwiseIndexedNegativeAB ), singleOffsetAB ); + float32v componentUnindexedFillerAB = FS::MaskedAdd( singleCase, float32v( kComponentPairwiseUnindexedFillerAB ), singleOffsetAB ); + + float32v componentIndexedNegativeC = FS::InvMaskedAdd( singleCase, componentIndexedNegativeAB, float32v( kDeltaPairwiseABToC ) ); + float32v componentUnindexedFillerC = FS::InvMaskedAdd( singleCase, componentUnindexedFillerAB, float32v( kDeltaPairwiseABToC ) ); + + float32v singleOffsetD = FS::MaskedAdd( extraCase, float32v( kDeltaPairwiseToSingleD ), float32v( kDeltaSingleToExtra ) ); + float32v componentIndexedPositiveD = FS::MaskedAdd( singleCase, float32v( kComponentPairwiseIndexedPositiveD ), singleOffsetD ); + float32v componentUnindexedD = FS::MaskedAdd( singleCase, float32v( kComponentPairwiseUnindexedD ), singleOffsetD ); + + int32v indexedCounter( -1 ); + + { + auto indexedPositive = ( indexPositive == int32v( 0 << 27 ) ); + auto indexed = indexedPositive | ( indexNegative == int32v( 0 << 27 ) ); + + float32v indexedComponentAB = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeAB ); + float32v indexedComponentC = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeC ); + float32v indexedComponentD = FS::MaskedDecrement( ~indexedPositive, componentIndexedPositiveD ); + + float32v unindexedComponentA = componentUnindexedFillerAB + float32v( kDeltaUnindexedFillerToDiagonal ); + float32v unindexedComponentB = componentUnindexedFillerAB; + float32v 
unindexedComponentC = componentUnindexedFillerC; + + valueA_gX = FS::Select( indexed, indexedComponentAB, unindexedComponentA ); + valueB_gX = FS::Select( indexed, indexedComponentAB, unindexedComponentB ); + valueC_gX = FS::Select( indexed, indexedComponentC, unindexedComponentC ); + valueD_gX = FS::Select( indexed, indexedComponentD, componentUnindexedD ); + + indexedCounter = FS::MaskedDecrement( indexed, indexedCounter ); + } + + valueA = valueA_gX * fX; + valueB = valueB_gX * fX; + valueC = valueC_gX * fX; + valueD = valueD_gX * fX; + + { + auto indexedPositive = ( indexPositive == int32v( 1 << 27 ) ); + auto indexed = indexedPositive | ( indexNegative == int32v( 1 << 27 ) ); + + float32v indexedComponentAB = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeAB ); + float32v indexedComponentC = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeC ); + float32v indexedComponentD = FS::MaskedDecrement( ~indexedPositive, componentIndexedPositiveD ); + + float32v unindexedComponentA = FS::MaskedAdd( indexedCounter == int32v( -2 ), componentUnindexedFillerAB, float32v( kDeltaUnindexedFillerToDiagonal ) ); + float32v unindexedComponentB = FS::MaskedAdd( indexedCounter == int32v( -1 ), componentUnindexedFillerAB, float32v( kDeltaUnindexedFillerToDiagonal ) ); + float32v unindexedComponentC = componentUnindexedFillerC; + + valueA_gY = FS::Select( indexed, indexedComponentAB, unindexedComponentA ); + valueB_gY = FS::Select( indexed, indexedComponentAB, unindexedComponentB ); + valueC_gY = FS::Select( indexed, indexedComponentC, unindexedComponentC ); + valueD_gY = FS::Select( indexed, indexedComponentD, componentUnindexedD ); + + indexedCounter = FS::MaskedDecrement( indexed, indexedCounter ); + } + + valueA = FS::FMulAdd( valueA_gY, fY, valueA ); + valueB = FS::FMulAdd( valueB_gY, fY, valueB ); + valueC = FS::FMulAdd( valueC_gY, fY, valueC ); + valueD = FS::FMulAdd( valueD_gY, fY, valueD ); + + { + auto indexedPositive = ( indexPositive == 
int32v( 2 << 27 ) ); + auto indexed = indexedPositive | ( indexNegative == int32v( 2 << 27 ) ); + + float32v indexedComponentAB = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeAB ); + float32v indexedComponentC = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeC ); + float32v indexedComponentD = FS::MaskedDecrement( ~indexedPositive, componentIndexedPositiveD ); + + float32v unindexedComponentA = FS::MaskedAdd( indexedCounter == int32v( -3 ), componentUnindexedFillerAB, float32v( kDeltaUnindexedFillerToDiagonal ) ); + float32v unindexedComponentB = FS::MaskedAdd( indexedCounter == int32v( -2 ), componentUnindexedFillerAB, float32v( kDeltaUnindexedFillerToDiagonal ) ); + float32v unindexedComponentC = FS::MaskedAdd( indexedCounter == int32v( -1 ), componentUnindexedFillerC, float32v( kDeltaUnindexedFillerToDiagonal ) ); + + valueA_gZ = FS::Select( indexed, indexedComponentAB, unindexedComponentA ); + valueB_gZ = FS::Select( indexed, indexedComponentAB, unindexedComponentB ); + valueC_gZ = FS::Select( indexed, indexedComponentC, unindexedComponentC ); + valueD_gZ = FS::Select( indexed, indexedComponentD, componentUnindexedD ); + + indexedCounter = FS::MaskedDecrement( indexed, indexedCounter ); + } + + valueA = FS::FMulAdd( valueA_gZ, fZ, valueA ); + valueB = FS::FMulAdd( valueB_gZ, fZ, valueB ); + valueC = FS::FMulAdd( valueC_gZ, fZ, valueC ); + valueD = FS::FMulAdd( valueD_gZ, fZ, valueD ); + + { + auto indexedPositive = ( indexPositive == int32v( 3 << 27 ) ); + auto indexed = indexedPositive | ( indexNegative == int32v( 3 << 27 ) ); + + float32v indexedComponentAB = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeAB ); + float32v indexedComponentC = FS::MaskedIncrement( indexedPositive, componentIndexedNegativeC ); + float32v indexedComponentD = FS::MaskedDecrement( ~indexedPositive, componentIndexedPositiveD ); + + float32v unindexedComponentA = componentUnindexedFillerAB; + float32v unindexedComponentB = FS::MaskedAdd( 
indexedCounter == int32v( -3 ), componentUnindexedFillerAB, float32v( kDeltaUnindexedFillerToDiagonal ) ); + float32v unindexedComponentC = FS::MaskedAdd( indexedCounter == int32v( -2 ), componentUnindexedFillerC, float32v( kDeltaUnindexedFillerToDiagonal ) ); + + valueA_gW = FS::Select( indexed, indexedComponentAB, unindexedComponentA ); + valueB_gW = FS::Select( indexed, indexedComponentAB, unindexedComponentB ); + valueC_gW = FS::Select( indexed, indexedComponentC, unindexedComponentC ); + valueD_gW = FS::Select( indexed, indexedComponentD, componentUnindexedD ); + } + + valueA = FS::FMulAdd( valueA_gW, fW, valueA ); + valueB = FS::FMulAdd( valueB_gW, fW, valueB ); + valueC = FS::FMulAdd( valueC_gW, fW, valueC ); + valueD = FS::FMulAdd( valueD_gW, fW, valueD ); } + + int32v valueIndexX = ( indexPermutation8 >> 1 ); // & int32v( 0x3 ); + int32v valueIndexY = ( FS::Increment( valueIndexX ) + indexPermutation3 ); // & int32v( 0x3 ); + int32v valueIndexZ = indexPermutation8 & int32v( 0x1 ); + valueIndexZ = ( FS::Increment( valueIndexX ) + FS::MaskedIncrement( valueIndexZ >= indexPermutation3, valueIndexZ ) ); // & int32v( 0x3 ); + int32v valueIndexSumXYZ = valueIndexX + valueIndexY + valueIndexZ; + + multiplier ^= finalSign; + valueX = FS::FMulAdd( multiplier, FS::SelectHighBit( valueIndexX << 31, FS::SelectHighBit( valueIndexX << 30, valueD, valueB ), FS::SelectHighBit( valueIndexX << 30, valueC, valueA ) ), valueX ); + valueY = FS::FMulAdd( multiplier, FS::SelectHighBit( valueIndexY << 31, FS::SelectHighBit( valueIndexY << 30, valueD, valueB ), FS::SelectHighBit( valueIndexY << 30, valueC, valueA ) ), valueY ); + valueZ = FS::FMulAdd( multiplier, FS::SelectHighBit( valueIndexZ << 31, FS::SelectHighBit( valueIndexZ << 30, valueD, valueB ), FS::SelectHighBit( valueIndexZ << 30, valueC, valueA ) ), valueZ ); + valueW = FS::FMulAdd( multiplier, FS::SelectHighBit( valueIndexSumXYZ << 31, FS::SelectHighBit( valueIndexSumXYZ << 30, valueD, valueB ), FS::SelectHighBit( 
valueIndexSumXYZ << 30, valueA, valueC ) ), valueW ); } - template + /* Forwards a two-component contribution (fX, fY into valueX, valueY) to the helper matching Scheme: OrthogonalGradientMatrix selects ApplyOrthogonalGradientMatrixVectorProductSimplex, GradientOuterProduct selects ApplyGradientOuterProductVectorProductSimplex. All arguments are passed through unchanged. NOTE(review): Scheme is presumably this template's parameter; the parameter list is not present in this text. */ template + FS_FORCEINLINE static void ApplyVectorContributionSimplex( int32v hash, float32v fX, float32v fY, float32v multiplier, float32v& valueX, float32v& valueY ) { + switch( Scheme ) { + case VectorizationScheme::OrthogonalGradientMatrix: + return ApplyOrthogonalGradientMatrixVectorProductSimplex( hash, fX, fY, multiplier, valueX, valueY ); + case VectorizationScheme::GradientOuterProduct: + return ApplyGradientOuterProductVectorProductSimplex( hash, fX, fY, multiplier, valueX, valueY ); + } + } + + /* Three-component counterpart (fX, fY, fZ into valueX, valueY, valueZ): dispatches on Scheme to ApplyOrthogonalGradientMatrixVectorProductCommon or ApplyGradientOuterProductVectorProductCommon, passing all arguments through unchanged. */ template + FS_FORCEINLINE static void ApplyVectorContributionCommon( int32v hash, float32v fX, float32v fY, float32v fZ, float32v multiplier, float32v& valueX, float32v& valueY, float32v& valueZ ) { + switch( Scheme ) { + case VectorizationScheme::OrthogonalGradientMatrix: + return ApplyOrthogonalGradientMatrixVectorProductCommon( hash, fX, fY, fZ, multiplier, valueX, valueY, valueZ ); + case VectorizationScheme::GradientOuterProduct: + return ApplyGradientOuterProductVectorProductCommon( hash, fX, fY, fZ, multiplier, valueX, valueY, valueZ ); + } + } + + /* Four-component overload (fX, fY, fZ, fW into valueX, valueY, valueZ, valueW): dispatches on Scheme to the four-component ApplyOrthogonalGradientMatrixVectorProductSimplex or ApplyGradientOuterProductVectorProductSimplex, passing all arguments through unchanged. */ template + FS_FORCEINLINE static void ApplyVectorContributionSimplex( int32v hash, float32v fX, float32v fY, float32v fZ, float32v fW, float32v multiplier, float32v& valueX, float32v& valueY, float32v& valueZ, float32v& valueW ) { + switch( Scheme ) { + case VectorizationScheme::OrthogonalGradientMatrix: + return ApplyOrthogonalGradientMatrixVectorProductSimplex( hash, fX, fY, fZ, fW, multiplier, valueX, valueY, valueZ, valueW ); + case VectorizationScheme::GradientOuterProduct: + return ApplyGradientOuterProductVectorProductSimplex( hash, fX, fY, fZ, fW, multiplier, valueX, valueY, valueZ, valueW ); + } + } + + /* Multiplier constant for hashing. A has the same value as the 0x27d4eb2d literal that HashPrimes hard-coded before this change and that HashPrimesHB still uses. NOTE(review): presumably the type of HashPrimes' Multiplier; confirm against the full template parameter list. */ enum HashMultiplier + { + A = 0x27D4EB2D + }; + + /* HashPrimes: XORs seed with every primedPos value, multiplies the result by Multiplier, then returns ( hash >> 15 ) ^ hash. NOTE(review): Multiplier is presumably a template parameter; the parameter list is not present in this text. */ template FS_FORCEINLINE static int32v HashPrimes( int32v seed, P... primedPos ) { int32v hash = seed; - hash ^= (primedPos ^ ...); + hash ^= ( primedPos ^ ...
); - hash *= int32v( 0x27d4eb2d ); - return (hash >> 15) ^ hash; + hash *= int32v( Multiplier ); + + return ( hash >> 15 ) ^ hash; } template FS_FORCEINLINE static int32v HashPrimesHB( int32v seed, P... primedPos ) { int32v hash = seed; - hash ^= (primedPos ^ ...); + /* HashPrimesHB: XORs seed with every primedPos value and multiplies by 0x27d4eb2d, then returns the product directly, without the ( hash >> 15 ) ^ hash step that HashPrimes applies. */ hash ^= ( primedPos ^ ... ); hash *= int32v( 0x27d4eb2d ); return hash; - } + } template - FS_FORCEINLINE static float32v GetValueCoord( int32v seed, P... primedPos ) + FS_FORCEINLINE static float32v GetValueCoord( int32v seed, P... primedPos ) { int32v hash = seed; hash ^= (primedPos ^ ...); diff --git a/src/FastNoise/FastSIMD_Build.inl b/src/FastNoise/FastSIMD_Build.inl index ec7a8a2..301ea55 100644 --- a/src/FastNoise/FastSIMD_Build.inl +++ b/src/FastNoise/FastSIMD_Build.inl @@ -91,7 +91,6 @@ FASTNOISE_REGISTER_NODE( PositionOutput ); FASTNOISE_REGISTER_NODE( DistanceToPoint ); FASTNOISE_REGISTER_NODE( Simplex ); -FASTNOISE_REGISTER_NODE( OpenSimplex2 ); FASTNOISE_REGISTER_NODE( Perlin ); FASTNOISE_REGISTER_NODE( Value ); @@ -103,8 +102,7 @@ FASTNOISE_REGISTER_NODE( FractalFBm ); FASTNOISE_REGISTER_NODE( FractalPingPong ); FASTNOISE_REGISTER_NODE( FractalRidged ); -FASTNOISE_REGISTER_NODE( DomainWarpOpenSimplex ); -FASTNOISE_REGISTER_NODE( OpenSimplex2S ); +FASTNOISE_REGISTER_NODE( DomainWarpSimplex ); FASTNOISE_REGISTER_NODE( DomainWarpGradient ); FASTNOISE_REGISTER_NODE( DomainWarpFractalProgressive );