From e1805422e86ff8b551e4d3d4ba52e109f62854cd Mon Sep 17 00:00:00 2001 From: Jacco Bikker Date: Thu, 14 Nov 2024 12:04:06 +0100 Subject: [PATCH] Bumped version to 0.8.0 for ARM/NEON support. --- tiny_bvh.h | 542 +++++++++++++++++++++++++++-------------------------- 1 file changed, 272 insertions(+), 270 deletions(-) diff --git a/tiny_bvh.h b/tiny_bvh.h index 1d6be29..2a5d918 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -85,8 +85,8 @@ THE SOFTWARE. // library version #define TINY_BVH_VERSION_MAJOR 0 -#define TINY_BVH_VERSION_MINOR 7 -#define TINY_BVH_VERSION_SUB 6 +#define TINY_BVH_VERSION_MINOR 8 +#define TINY_BVH_VERSION_SUB 0 // ============================================================================ // @@ -286,15 +286,15 @@ typedef __m128 SIMDVEC4; #define SIMD_SETRVEC(a,b,c,d) _mm_set_ps( d, c, b, a ) #elif defined(BVH_USENEON) typedef float32x4_t SIMDVEC4; -inline float32x4_t SIMD_SETVEC(float w, float z, float y, float x) +inline float32x4_t SIMD_SETVEC( float w, float z, float y, float x ) { - ALIGNED(64) float data[4] = {x, y, z, w}; - return vld1q_f32(data); + ALIGNED( 64 ) float data[4] = { x, y, z, w }; + return vld1q_f32( data ); } -inline float32x4_t SIMD_SETRVEC(float x, float y, float z, float w) +inline float32x4_t SIMD_SETRVEC( float x, float y, float z, float w ) { - ALIGNED(64) float data[4] = {x, y, z, w}; - return vld1q_f32(data); + ALIGNED( 64 ) float data[4] = { x, y, z, w }; + return vld1q_f32( data ); } #else typedef bvhvec4 SIMDVEC4; @@ -481,9 +481,8 @@ class BVH void BuildHQ( const bvhvec4* vertices, const unsigned primCount ); #ifdef BVH_USEAVX void BuildAVX( const bvhvec4* vertices, const unsigned primCount ); -#endif -#ifdef BVH_USENEON - void BuildNEON( const bvhvec4* vertices, const unsigned primCount ); +#elif defined BVH_USENEON + void BuildNEON( const bvhvec4* vertices, const unsigned primCount ); #endif void Convert( BVHLayout from, BVHLayout to, const bool deleteOriginal = false ); void SplitLeafs(); // operates on VERBOSE layout @@ -1898,19 +1897,16 @@ int BVH::Intersect( Ray& ray, BVHLayout layout ) const case BASIC_BVH8: return Intersect_BasicBVH8( ray ); break; - #ifdef BVH_USEAVX + #if defined BVH_USEAVX || defined BVH_USENEON case ALT_SOA: return Intersect_AltSoA( ray ); break; + #endif + #if defined BVH_USEAVX case CWBVH: return Intersect_CWBVH( ray ); break; #endif - #ifdef BVH_USENEON - case ALT_SOA: - return Intersect_AltSoA( ray ); - break; - #endif default: assert( false ); }; @@ -2364,7 +2360,7 @@ float BVH::IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& // ============================================================================ // -// I M P L E M E N T A T I O N - S I M D C O D E +// I M P L E M E N T A T I O N - A V X / S S E C O D E // // ============================================================================ @@ -3037,31 +3033,37 @@ int BVH::Intersect_CWBVH( Ray& ray ) const #endif // BVH_USEAVX +// ============================================================================ +// +// I M P L E M E N T A T I O N - A R M / N E O N C O D E +// +// ============================================================================ + #ifdef BVH_USENEON #define ILANE(a,b) vgetq_lane_s32(a, b) -inline float32x4x2_t vmaxq_f32x2(float32x4x2_t a, float32x4x2_t b) +inline float32x4x2_t vmaxq_f32x2( float32x4x2_t a, float32x4x2_t b ) { - float32x4x2_t ret; - ret.val[0] = vmaxq_f32(a.val[0], b.val[0]); - ret.val[1] = vmaxq_f32(a.val[1], b.val[1]); - return ret; + float32x4x2_t ret; + ret.val[0] = vmaxq_f32( a.val[0], 
b.val[0] ); + ret.val[1] = vmaxq_f32( a.val[1], b.val[1] ); + return ret; } inline float halfArea( const float32x4_t a /* a contains extent of aabb */ ) { - ALIGNED(64) float v[4]; - vst1q_f32(v, a); - return v[0] * v[1] + v[1] * v[2] + v[2] * v[3]; + ALIGNED( 64 ) float v[4]; + vst1q_f32( v, a ); + return v[0] * v[1] + v[1] * v[2] + v[2] * v[3]; } inline float halfArea( const float32x4x2_t& a /* a contains aabb itself, with min.xyz negated */ ) { - ALIGNED(64) float c[8]; - vst1q_f32(c, a.val[0]); - vst1q_f32(c + 4, a.val[1]); - - float ex = c[4] + c[0], ey = c[5] + c[1], ez = c[6] + c[2]; - return ex * ey + ey * ez + ez * ex; + ALIGNED( 64 ) float c[8]; + vst1q_f32( c, a.val[0] ); + vst1q_f32( c + 4, a.val[1] ); + + float ex = c[4] + c[0], ey = c[5] + c[1], ez = c[6] + c[2]; + return ex * ey + ey * ez + ez * ex; } #define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \ ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; if (ANLR < splitCost) \ @@ -3069,249 +3071,249 @@ inline float halfArea( const float32x4x2_t& a /* a contains aabb itself, with mi void BVH::BuildNEON( const bvhvec4* vertices, const unsigned primCount ) { - int test = BVHBINS; - if (test != 8) assert( false ); // AVX builders require BVHBINS == 8. - // aligned data - ALIGNED( 64 ) float32x4x2_t binbox[3 * BVHBINS]; // 768 bytes - ALIGNED( 64 ) float32x4x2_t binboxOrig[3 * BVHBINS]; // 768 bytes - ALIGNED( 64 ) unsigned count[3][BVHBINS]{}; // 96 bytes - ALIGNED( 64 ) float32x4x2_t bestLBox, bestRBox; // 64 bytes - // some constants - static const float32x4_t max4 = vdupq_n_f32( -1e30f ), half4 = vdupq_n_f32( 0.5f ); - static const float32x4_t two4 = vdupq_n_f32( 2.0f ), min1 = vdupq_n_f32( -1 ); - static const float32x4x2_t max8 = {max4, max4}; - static const float32x4_t signFlip4 = SIMD_SETRVEC( -0.0f, -0.0f, -0.0f, 0.0f ); - static const float32x4x2_t signFlip8 = {signFlip4, vdupq_n_f32(0)}; // TODO: Check me - static const float32x4_t mask3 = vceqq_f32( SIMD_SETRVEC( 0, 0, 0, 1 ), vdupq_n_f32(0) ); - static const float32x4_t binmul3 = vdupq_n_f32( BVHBINS * 0.49999f ); - for (unsigned i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template - // reset node pool - const unsigned spaceNeeded = primCount * 2; - if (allocatedBVHNodes < spaceNeeded) - { - ALIGNED_FREE( bvhNode ); - ALIGNED_FREE( triIdx ); - ALIGNED_FREE( fragment ); - verts = (bvhvec4*)vertices; - triIdx = (unsigned*)ALIGNED_MALLOC( primCount * sizeof( unsigned ) ); - bvhNode = (BVHNode*)ALIGNED_MALLOC( spaceNeeded * sizeof( BVHNode ) ); - allocatedBVHNodes = spaceNeeded; - memset( &bvhNode[1], 0, 32 ); // avoid crash in refit. 
- fragment = (Fragment*)ALIGNED_MALLOC( primCount * sizeof( Fragment ) ); - } - else assert( rebuildable == true ); - triCount = idxCount = primCount; - unsigned newNodePtr = 2; - struct FragSSE { float32x4_t bmin4, bmax4; }; - FragSSE* frag4 = (FragSSE*)fragment; - float32x4x2_t* frag8 = (float32x4x2_t*)fragment; - const float32x4_t* verts4 = (float32x4_t*)verts; - // assign all triangles to the root node - BVHNode& root = bvhNode[0]; - root.leftFirst = 0, root.triCount = triCount; - // initialize fragments and update root bounds - float32x4_t rootMin = max4, rootMax = max4; - for (unsigned i = 0; i < triCount; i++) - { - const float32x4_t v1 = veorq_s32( signFlip4, vminq_f32( vminq_f32( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ) ); - const float32x4_t v2 = vmaxq_f32( vmaxq_f32( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ); - frag4[i].bmin4 = v1, frag4[i].bmax4 = v2, rootMin = vmaxq_f32( rootMin, v1 ), rootMax = vmaxq_f32( rootMax, v2 ), triIdx[i] = i; - } - rootMin = veorq_s32( rootMin, signFlip4 ); - root.aabbMin = *(bvhvec3*)&rootMin, root.aabbMax = *(bvhvec3*)&rootMax; - // subdivide recursively - ALIGNED( 64 ) unsigned task[128], taskCount = 0, nodeIdx = 0; - const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-7f; - while (1) - { - while (1) - { - BVHNode& node = bvhNode[nodeIdx]; - float32x4_t* node4 = (float32x4_t*) & bvhNode[nodeIdx]; - // find optimal object split - const float32x4_t d4 = vbslq_f32(vshrq_n_s32(mask3, 31), vsubq_f32( node4[1], node4[0] ), min1 ); - const float32x4_t nmin4 = vmulq_f32( vandq_s32( node4[0], mask3 ), two4 ); - const float32x4_t rpd4 = vandq_s32( vdivq_f32( binmul3, d4 ), vmvnq_u32(vceqq_f32( d4, vdupq_n_f32(0) ) ) ); - // implementation of Section 4.1 of "Parallel Spatial Splits in Bounding Volume Hierarchies": - // main loop operates on two fragments to minimize dependencies and maximize ILP. 
- unsigned fi = triIdx[node.leftFirst]; - memset( count, 0, sizeof( count ) ); - float32x4x2_t r0, r1, r2, f = frag8[fi]; - int32x4_t bi4 = vcvtq_s32_f32(vrnd32xq_f32( vsubq_f32( vmulq_f32( vsubq_f32( vsubq_f32( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) ) ); - memcpy( binbox, binboxOrig, sizeof( binbox ) ); - unsigned i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1; - for (unsigned i = 0; i < node.triCount - 1; i++) - { - unsigned fid = *ti++; - const float32x4x2_t b0 = binbox[i0]; - const float32x4x2_t b1 = binbox[BVHBINS + i1]; - const float32x4x2_t b2 = binbox[2 * BVHBINS + i2]; - const float32x4_t fmin = frag4[fid].bmin4, fmax = frag4[fid].bmax4; - r0 = vmaxq_f32x2(b0, f); - r1 = vmaxq_f32x2(b1, f); - r2 = vmaxq_f32x2(b2, f); - const int32x4_t b4 = vcvtq_s32_f32( vrnd32xq_f32( vsubq_f32( vmulq_f32( vsubq_f32( vsubq_f32( fmax, fmin ), nmin4 ), rpd4 ), half4 ) ) ); - - f = frag8[fid], count[0][i0]++, count[1][i1]++, count[2][i2]++; - binbox[i0] = r0, i0 = ILANE( b4, 0 ); - binbox[BVHBINS + i1] = r1, i1 = ILANE( b4, 1 ); - binbox[2 * BVHBINS + i2] = r2, i2 = ILANE( b4, 2 ); - } - // final business for final fragment - const float32x4x2_t b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2]; - count[0][i0]++, count[1][i1]++, count[2][i2]++; - r0 = vmaxq_f32x2(b0, f); - r1 = vmaxq_f32x2(b1, f); - r2 = vmaxq_f32x2(b2, f); - binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2; - // calculate per-split totals - float splitCost = 1e30f; - unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; - const float32x4x2_t* bb = binbox; - for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) - { - // hardcoded bin processing for BVHBINS == 8 - assert( BVHBINS == 8 ); - const unsigned lN0 = count[a][0], rN0 = count[a][7]; - const float32x4x2_t lb0 = bb[0], rb0 = bb[7]; - const unsigned lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2]; - const unsigned rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4]; - const float32x4x2_t lb1 = vmaxq_f32x2( lb0, bb[1] ), rb1 = vmaxq_f32x2( rb0, bb[6] ); - const float32x4x2_t lb2 = vmaxq_f32x2( lb1, bb[2] ), rb2 = vmaxq_f32x2( rb1, bb[5] ); - const float32x4x2_t lb3 = vmaxq_f32x2( lb2, bb[3] ), rb3 = vmaxq_f32x2( rb2, bb[4] ); - const unsigned lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5]; - const unsigned rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1]; - const float32x4x2_t lb4 = vmaxq_f32x2( lb3, bb[4] ), rb4 = vmaxq_f32x2( rb3, bb[3] ); - const float32x4x2_t lb5 = vmaxq_f32x2( lb4, bb[5] ), rb5 = vmaxq_f32x2( rb4, bb[2] ); - const float32x4x2_t lb6 = vmaxq_f32x2( lb5, bb[6] ), rb6 = vmaxq_f32x2( rb5, bb[1] ); - float ANLR3 = 1e30f; PROCESS_PLANE( a, 3, ANLR3, lN3, rN3, lb3, rb3 ); // most likely split - float ANLR2 = 1e30f; PROCESS_PLANE( a, 2, ANLR2, lN2, rN4, lb2, rb4 ); - float ANLR4 = 1e30f; PROCESS_PLANE( a, 4, ANLR4, lN4, rN2, lb4, rb2 ); - float ANLR5 = 1e30f; PROCESS_PLANE( a, 5, ANLR5, lN5, rN1, lb5, rb1 ); - float ANLR1 = 1e30f; PROCESS_PLANE( a, 1, ANLR1, lN1, rN5, lb1, rb5 ); - float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 ); - float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split - } - if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better. 
- // in-place partition - const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis]; - unsigned t, fr = triIdx[src]; - for (unsigned i = 0; i < node.triCount; i++) - { - const unsigned bi = (unsigned)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd); - if (bi <= bestPos) fr = triIdx[++src]; else t = fr, fr = triIdx[src] = triIdx[--j], triIdx[j] = t; - } - // create child nodes and recurse - const unsigned leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; - if (leftCount == 0 || rightCount == 0) break; // should not happen. - (*(float32x4x2_t*)& bvhNode[n]).val[0] = veorq_s32( bestLBox.val[0], signFlip8.val[0] ); - (*(float32x4x2_t*)& bvhNode[n]).val[1] = veorq_s32( bestLBox.val[1], signFlip8.val[1] ); - bvhNode[n].leftFirst = node.leftFirst, bvhNode[n].triCount = leftCount; - node.leftFirst = n++, node.triCount = 0, newNodePtr += 2; - (*(float32x4x2_t*)& bvhNode[n]).val[0] = veorq_s32( bestRBox.val[0], signFlip8.val[0] ); - (*(float32x4x2_t*)& bvhNode[n]).val[1] = veorq_s32( bestRBox.val[1], signFlip8.val[1] ); - bvhNode[n].leftFirst = j, bvhNode[n].triCount = rightCount; - task[taskCount++] = n, nodeIdx = n - 1; - } - // fetch subdivision task from stack - if (taskCount == 0) break; else nodeIdx = task[--taskCount]; - } - // all done. - refittable = true; // not using spatial splits: can refit this BVH - frag_min_flipped = true; // NEON was used for binning; fragment.min flipped - may_have_holes = false; // the NEON builder produces a continuous list of nodes - usedBVHNodes = newNodePtr; + int test = BVHBINS; + if (test != 8) assert( false ); // AVX builders require BVHBINS == 8. + // aligned data + ALIGNED( 64 ) float32x4x2_t binbox[3 * BVHBINS]; // 768 bytes + ALIGNED( 64 ) float32x4x2_t binboxOrig[3 * BVHBINS]; // 768 bytes + ALIGNED( 64 ) unsigned count[3][BVHBINS]{}; // 96 bytes + ALIGNED( 64 ) float32x4x2_t bestLBox, bestRBox; // 64 bytes + // some constants + static const float32x4_t max4 = vdupq_n_f32( -1e30f ), half4 = vdupq_n_f32( 0.5f ); + static const float32x4_t two4 = vdupq_n_f32( 2.0f ), min1 = vdupq_n_f32( -1 ); + static const float32x4x2_t max8 = { max4, max4 }; + static const float32x4_t signFlip4 = SIMD_SETRVEC( -0.0f, -0.0f, -0.0f, 0.0f ); + static const float32x4x2_t signFlip8 = { signFlip4, vdupq_n_f32( 0 ) }; // TODO: Check me + static const float32x4_t mask3 = vceqq_f32( SIMD_SETRVEC( 0, 0, 0, 1 ), vdupq_n_f32( 0 ) ); + static const float32x4_t binmul3 = vdupq_n_f32( BVHBINS * 0.49999f ); + for (unsigned i = 0; i < 3 * BVHBINS; i++) binboxOrig[i] = max8; // binbox initialization template + // reset node pool + const unsigned spaceNeeded = primCount * 2; + if (allocatedBVHNodes < spaceNeeded) + { + ALIGNED_FREE( bvhNode ); + ALIGNED_FREE( triIdx ); + ALIGNED_FREE( fragment ); + verts = (bvhvec4*)vertices; + triIdx = (unsigned*)ALIGNED_MALLOC( primCount * sizeof( unsigned ) ); + bvhNode = (BVHNode*)ALIGNED_MALLOC( spaceNeeded * sizeof( BVHNode ) ); + allocatedBVHNodes = spaceNeeded; + memset( &bvhNode[1], 0, 32 ); // avoid crash in refit. 
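		// Sizing note (informal): a binary BVH over primCount primitives has at most
		// 2 * primCount - 1 nodes (primCount leaves plus primCount - 1 interior nodes),
		// so the primCount * 2 pool allocated above always suffices. Slot 1 is left
		// unused (newNodePtr starts at 2), presumably so every sibling pair starts at
		// an even index, putting two 32-byte BVHNodes in one 64-byte cache line;
		// clearing bvhNode[1] keeps a later Refit() from touching uninitialized data.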
+ fragment = (Fragment*)ALIGNED_MALLOC( primCount * sizeof( Fragment ) ); + } + else assert( rebuildable == true ); + triCount = idxCount = primCount; + unsigned newNodePtr = 2; + struct FragSSE { float32x4_t bmin4, bmax4; }; + FragSSE* frag4 = (FragSSE*)fragment; + float32x4x2_t* frag8 = (float32x4x2_t*)fragment; + const float32x4_t* verts4 = (float32x4_t*)verts; + // assign all triangles to the root node + BVHNode& root = bvhNode[0]; + root.leftFirst = 0, root.triCount = triCount; + // initialize fragments and update root bounds + float32x4_t rootMin = max4, rootMax = max4; + for (unsigned i = 0; i < triCount; i++) + { + const float32x4_t v1 = veorq_s32( signFlip4, vminq_f32( vminq_f32( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ) ); + const float32x4_t v2 = vmaxq_f32( vmaxq_f32( verts4[i * 3], verts4[i * 3 + 1] ), verts4[i * 3 + 2] ); + frag4[i].bmin4 = v1, frag4[i].bmax4 = v2, rootMin = vmaxq_f32( rootMin, v1 ), rootMax = vmaxq_f32( rootMax, v2 ), triIdx[i] = i; + } + rootMin = veorq_s32( rootMin, signFlip4 ); + root.aabbMin = *(bvhvec3*)&rootMin, root.aabbMax = *(bvhvec3*)&rootMax; + // subdivide recursively + ALIGNED( 64 ) unsigned task[128], taskCount = 0, nodeIdx = 0; + const bvhvec3 minDim = (root.aabbMax - root.aabbMin) * 1e-7f; + while (1) + { + while (1) + { + BVHNode& node = bvhNode[nodeIdx]; + float32x4_t* node4 = (float32x4_t*)&bvhNode[nodeIdx]; + // find optimal object split + const float32x4_t d4 = vbslq_f32( vshrq_n_s32( mask3, 31 ), vsubq_f32( node4[1], node4[0] ), min1 ); + const float32x4_t nmin4 = vmulq_f32( vandq_s32( node4[0], mask3 ), two4 ); + const float32x4_t rpd4 = vandq_s32( vdivq_f32( binmul3, d4 ), vmvnq_u32( vceqq_f32( d4, vdupq_n_f32( 0 ) ) ) ); + // implementation of Section 4.1 of "Parallel Spatial Splits in Bounding Volume Hierarchies": + // main loop operates on two fragments to minimize dependencies and maximize ILP. 
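			// For reference, a scalar sketch of the bin index the SIMD statements below
			// derive per axis a (illustrative only; fragment bmin.xyz is stored sign-flipped,
			// so bmax - bmin yields bmax + min, i.e. twice the centroid):
			//   const float c2   = fragment[fi].bmax[a] - fragment[fi].bmin[a]; // 2 * centroid
			//   const float nmin = 2.0f * node.aabbMin[a];                      // nmin4, lane a
			//   const float rpd  = (BVHBINS * 0.49999f) / (node.aabbMax[a] - node.aabbMin[a]); // rpd4, lane a
			//   const int   bin  = (int)((c2 - nmin) * rpd);                    // in [0, BVHBINS - 1]
			// i.e. bin ~ BVHBINS * (centroid - node.aabbMin[a]) / extent[a]; the 0.49999 factor
			// keeps a centroid lying exactly on node.aabbMax[a] inside the last bin, and the
			// SIMD form's round( x - 0.5 ) matches the truncating cast for these values.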
+ unsigned fi = triIdx[node.leftFirst]; + memset( count, 0, sizeof( count ) ); + float32x4x2_t r0, r1, r2, f = frag8[fi]; + int32x4_t bi4 = vcvtq_s32_f32( vrnd32xq_f32( vsubq_f32( vmulq_f32( vsubq_f32( vsubq_f32( frag4[fi].bmax4, frag4[fi].bmin4 ), nmin4 ), rpd4 ), half4 ) ) ); + memcpy( binbox, binboxOrig, sizeof( binbox ) ); + unsigned i0 = ILANE( bi4, 0 ), i1 = ILANE( bi4, 1 ), i2 = ILANE( bi4, 2 ), * ti = triIdx + node.leftFirst + 1; + for (unsigned i = 0; i < node.triCount - 1; i++) + { + unsigned fid = *ti++; + const float32x4x2_t b0 = binbox[i0]; + const float32x4x2_t b1 = binbox[BVHBINS + i1]; + const float32x4x2_t b2 = binbox[2 * BVHBINS + i2]; + const float32x4_t fmin = frag4[fid].bmin4, fmax = frag4[fid].bmax4; + r0 = vmaxq_f32x2( b0, f ); + r1 = vmaxq_f32x2( b1, f ); + r2 = vmaxq_f32x2( b2, f ); + const int32x4_t b4 = vcvtq_s32_f32( vrnd32xq_f32( vsubq_f32( vmulq_f32( vsubq_f32( vsubq_f32( fmax, fmin ), nmin4 ), rpd4 ), half4 ) ) ); + + f = frag8[fid], count[0][i0]++, count[1][i1]++, count[2][i2]++; + binbox[i0] = r0, i0 = ILANE( b4, 0 ); + binbox[BVHBINS + i1] = r1, i1 = ILANE( b4, 1 ); + binbox[2 * BVHBINS + i2] = r2, i2 = ILANE( b4, 2 ); + } + // final business for final fragment + const float32x4x2_t b0 = binbox[i0], b1 = binbox[BVHBINS + i1], b2 = binbox[2 * BVHBINS + i2]; + count[0][i0]++, count[1][i1]++, count[2][i2]++; + r0 = vmaxq_f32x2( b0, f ); + r1 = vmaxq_f32x2( b1, f ); + r2 = vmaxq_f32x2( b2, f ); + binbox[i0] = r0, binbox[BVHBINS + i1] = r1, binbox[2 * BVHBINS + i2] = r2; + // calculate per-split totals + float splitCost = 1e30f; + unsigned bestAxis = 0, bestPos = 0, n = newNodePtr, j = node.leftFirst + node.triCount, src = node.leftFirst; + const float32x4x2_t* bb = binbox; + for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) + { + // hardcoded bin processing for BVHBINS == 8 + assert( BVHBINS == 8 ); + const unsigned lN0 = count[a][0], rN0 = count[a][7]; + const float32x4x2_t lb0 = bb[0], rb0 = bb[7]; + const unsigned lN1 = lN0 + count[a][1], rN1 = rN0 + count[a][6], lN2 = lN1 + count[a][2]; + const unsigned rN2 = rN1 + count[a][5], lN3 = lN2 + count[a][3], rN3 = rN2 + count[a][4]; + const float32x4x2_t lb1 = vmaxq_f32x2( lb0, bb[1] ), rb1 = vmaxq_f32x2( rb0, bb[6] ); + const float32x4x2_t lb2 = vmaxq_f32x2( lb1, bb[2] ), rb2 = vmaxq_f32x2( rb1, bb[5] ); + const float32x4x2_t lb3 = vmaxq_f32x2( lb2, bb[3] ), rb3 = vmaxq_f32x2( rb2, bb[4] ); + const unsigned lN4 = lN3 + count[a][4], rN4 = rN3 + count[a][3], lN5 = lN4 + count[a][5]; + const unsigned rN5 = rN4 + count[a][2], lN6 = lN5 + count[a][6], rN6 = rN5 + count[a][1]; + const float32x4x2_t lb4 = vmaxq_f32x2( lb3, bb[4] ), rb4 = vmaxq_f32x2( rb3, bb[3] ); + const float32x4x2_t lb5 = vmaxq_f32x2( lb4, bb[5] ), rb5 = vmaxq_f32x2( rb4, bb[2] ); + const float32x4x2_t lb6 = vmaxq_f32x2( lb5, bb[6] ), rb6 = vmaxq_f32x2( rb5, bb[1] ); + float ANLR3 = 1e30f; PROCESS_PLANE( a, 3, ANLR3, lN3, rN3, lb3, rb3 ); // most likely split + float ANLR2 = 1e30f; PROCESS_PLANE( a, 2, ANLR2, lN2, rN4, lb2, rb4 ); + float ANLR4 = 1e30f; PROCESS_PLANE( a, 4, ANLR4, lN4, rN2, lb4, rb2 ); + float ANLR5 = 1e30f; PROCESS_PLANE( a, 5, ANLR5, lN5, rN1, lb5, rb1 ); + float ANLR1 = 1e30f; PROCESS_PLANE( a, 1, ANLR1, lN1, rN5, lb1, rb5 ); + float ANLR0 = 1e30f; PROCESS_PLANE( a, 0, ANLR0, lN0, rN6, lb0, rb6 ); + float ANLR6 = 1e30f; PROCESS_PLANE( a, 6, ANLR6, lN6, rN0, lb6, rb0 ); // least likely split + } + if (splitCost >= node.CalculateNodeCost()) break; // not splitting is better. 
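			// For reference (scalar form of the plane evaluation above): PROCESS_PLANE only
			// scores a plane when both sides receive primitives, using
			//   ANLR = halfArea( leftBounds ) * leftCount + halfArea( rightBounds ) * rightCount,
			// where halfArea( box ) = ex * ey + ey * ez + ez * ex and, because the bin boxes
			// keep min.xyz sign-flipped, each extent is simply max + storedMin. The cheapest
			// plane so far is remembered in splitCost / bestAxis / bestPos / bestLBox / bestRBox,
			// and the split is applied only if it beats CalculateNodeCost(), i.e. the estimated
			// cost of leaving this node a leaf.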
+ // in-place partition + const float rpd = (*(bvhvec3*)&rpd4)[bestAxis], nmin = (*(bvhvec3*)&nmin4)[bestAxis]; + unsigned t, fr = triIdx[src]; + for (unsigned i = 0; i < node.triCount; i++) + { + const unsigned bi = (unsigned)((fragment[fr].bmax[bestAxis] - fragment[fr].bmin[bestAxis] - nmin) * rpd); + if (bi <= bestPos) fr = triIdx[++src]; else t = fr, fr = triIdx[src] = triIdx[--j], triIdx[j] = t; + } + // create child nodes and recurse + const unsigned leftCount = src - node.leftFirst, rightCount = node.triCount - leftCount; + if (leftCount == 0 || rightCount == 0) break; // should not happen. + (*(float32x4x2_t*)&bvhNode[n]).val[0] = veorq_s32( bestLBox.val[0], signFlip8.val[0] ); + (*(float32x4x2_t*)&bvhNode[n]).val[1] = veorq_s32( bestLBox.val[1], signFlip8.val[1] ); + bvhNode[n].leftFirst = node.leftFirst, bvhNode[n].triCount = leftCount; + node.leftFirst = n++, node.triCount = 0, newNodePtr += 2; + (*(float32x4x2_t*)&bvhNode[n]).val[0] = veorq_s32( bestRBox.val[0], signFlip8.val[0] ); + (*(float32x4x2_t*)&bvhNode[n]).val[1] = veorq_s32( bestRBox.val[1], signFlip8.val[1] ); + bvhNode[n].leftFirst = j, bvhNode[n].triCount = rightCount; + task[taskCount++] = n, nodeIdx = n - 1; + } + // fetch subdivision task from stack + if (taskCount == 0) break; else nodeIdx = task[--taskCount]; + } + // all done. + refittable = true; // not using spatial splits: can refit this BVH + frag_min_flipped = true; // NEON was used for binning; fragment.min flipped + may_have_holes = false; // the NEON builder produces a continuous list of nodes + usedBVHNodes = newNodePtr; } // Traverse the second alternative BVH layout (ALT_SOA). int BVH::Intersect_AltSoA( Ray& ray ) const { - assert( alt2Node != 0 ); - BVHNodeAlt2* node = &alt2Node[0], * stack[64]; - unsigned stackPtr = 0, steps = 0; - const float32x4_t Ox4 = vdupq_n_f32( ray.O.x ), rDx4 = vdupq_n_f32( ray.rD.x ); - const float32x4_t Oy4 = vdupq_n_f32( ray.O.y ), rDy4 = vdupq_n_f32( ray.rD.y ); - const float32x4_t Oz4 = vdupq_n_f32( ray.O.z ), rDz4 = vdupq_n_f32( ray.rD.z ); - // const float32x4_t inf4 = vdupq_n_f32( 1e30f ); - while (1) - { - steps++; - if (node->isLeaf()) - { - for (unsigned i = 0; i < node->triCount; i++) - { - const unsigned tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; - const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; - const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; - const bvhvec3 h = cross( ray.D, edge2 ); - const float a = dot( edge1, h ); - if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle - const float f = 1 / a; - const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); - const float u = f * dot( s, h ); - if (u < 0 || u > 1) continue; - const bvhvec3 q = cross( s, edge1 ); - const float v = f * dot( ray.D, q ); - if (v < 0 || u + v > 1) continue; - const float t = f * dot( edge2, q ); - if (t < 0 || t > ray.hit.t) continue; - ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = tidx; - } - if (stackPtr == 0) break; else node = stack[--stackPtr]; - continue; - } - float32x4_t x4 = vmulq_f32( vsubq_f32( node->xxxx, Ox4 ), rDx4 ); - float32x4_t y4 = vmulq_f32( vsubq_f32( node->yyyy, Oy4 ), rDy4 ); - float32x4_t z4 = vmulq_f32( vsubq_f32( node->zzzz, Oz4 ), rDz4 ); - // transpose - float32x4_t t0 = vzip1q_f32( x4, y4 ), t2 = vzip1q_f32( z4, z4 ); - float32x4_t t1 = vzip2q_f32( x4, y4 ), t3 = vzip2q_f32( z4, z4 ); - float32x4_t xyzw1a = vcombine_f32( vget_low_f32(t0), vget_low_f32(t2) ); - float32x4_t xyzw2a = vcombine_f32( vget_high_f32(t0), vget_high_f32(t2) ); - float32x4_t 
xyzw1b = vcombine_f32( vget_low_f32(t1), vget_low_f32(t3) ); - float32x4_t xyzw2b = vcombine_f32( vget_high_f32(t1), vget_high_f32(t3) ); - // process - float32x4_t tmina4 = vminq_f32( xyzw1a, xyzw2a ), tmaxa4 = vmaxq_f32( xyzw1a, xyzw2a ); - float32x4_t tminb4 = vminq_f32( xyzw1b, xyzw2b ), tmaxb4 = vmaxq_f32( xyzw1b, xyzw2b ); - // transpose back - t0 = vzip1q_f32( tmina4, tmaxa4 ), t2 = vzip1q_f32( tminb4, tmaxb4 ); - t1 = vzip2q_f32( tmina4, tmaxa4 ), t3 = vzip2q_f32( tminb4, tmaxb4 ); - x4 = vcombine_f32( vget_low_f32(t0), vget_low_f32(t2) ); - y4 = vcombine_f32( vget_high_f32(t0), vget_high_f32(t2) ); - z4 = vcombine_f32( vget_low_f32(t1), vget_low_f32(t3) ); - unsigned lidx = node->left, ridx = node->right; - const float32x4_t min4 = vmaxq_f32( vmaxq_f32( vmaxq_f32( x4, y4 ), z4 ), vdupq_n_f32(0) ); - const float32x4_t max4 = vminq_f32( vminq_f32( vminq_f32( x4, y4 ), z4 ), vdupq_n_f32( ray.hit.t ) ); - #if 0 - // TODO: why is this slower on gen14? - const float tmina_0 = vgetq_lane_f32( min4, 0 ), tmaxa_1 = vgetq_lane_f32( max4, 1 ); - const float tminb_2 = vgetq_lane_f32( min4, 2 ), tmaxb_3 = vgetq_lane_f32( max4, 3 ); - t0 = __builtin_shufflevector( max4, max4, 3, 1, 3, 1); - t1 = __builtin_shufflevector( min4, min4, 2, 0, 2, 0 ); - t0 = vbslq_f32( vcgeq_f32( t0, t1 ), t1, inf4 ); - float dist1 = vgetq_lane_f32( t0, 1 ), dist2 = vgetq_lane_f32( t0, 0 ); - #else - const float tmina_0 = vgetq_lane_f32( min4, 0 ), tmaxa_1 = vgetq_lane_f32( max4, 1 ); - const float tminb_2 = vgetq_lane_f32( min4, 2 ), tmaxb_3 = vgetq_lane_f32( max4, 3 ); - float dist1 = tmaxa_1 >= tmina_0 ? tmina_0 : 1e30f; - float dist2 = tmaxb_3 >= tminb_2 ? tminb_2 : 1e30f; - #endif - if (dist1 > dist2) - { - float t = dist1; dist1 = dist2; dist2 = t; - unsigned i = lidx; lidx = ridx; ridx = i; - } - if (dist1 == 1e30f) - { - if (stackPtr == 0) break; else node = stack[--stackPtr]; - } - else - { - node = alt2Node + lidx; - if (dist2 != 1e30f) stack[stackPtr++] = alt2Node + ridx; - } - } - return steps; + assert( alt2Node != 0 ); + BVHNodeAlt2* node = &alt2Node[0], * stack[64]; + unsigned stackPtr = 0, steps = 0; + const float32x4_t Ox4 = vdupq_n_f32( ray.O.x ), rDx4 = vdupq_n_f32( ray.rD.x ); + const float32x4_t Oy4 = vdupq_n_f32( ray.O.y ), rDy4 = vdupq_n_f32( ray.rD.y ); + const float32x4_t Oz4 = vdupq_n_f32( ray.O.z ), rDz4 = vdupq_n_f32( ray.rD.z ); + // const float32x4_t inf4 = vdupq_n_f32( 1e30f ); + while (1) + { + steps++; + if (node->isLeaf()) + { + for (unsigned i = 0; i < node->triCount; i++) + { + const unsigned tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; + const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; + const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; + const bvhvec3 h = cross( ray.D, edge2 ); + const float a = dot( edge1, h ); + if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle + const float f = 1 / a; + const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); + const float u = f * dot( s, h ); + if (u < 0 || u > 1) continue; + const bvhvec3 q = cross( s, edge1 ); + const float v = f * dot( ray.D, q ); + if (v < 0 || u + v > 1) continue; + const float t = f * dot( edge2, q ); + if (t < 0 || t > ray.hit.t) continue; + ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = tidx; + } + if (stackPtr == 0) break; else node = stack[--stackPtr]; + continue; + } + float32x4_t x4 = vmulq_f32( vsubq_f32( node->xxxx, Ox4 ), rDx4 ); + float32x4_t y4 = vmulq_f32( vsubq_f32( node->yyyy, Oy4 ), rDy4 ); + float32x4_t z4 = vmulq_f32( vsubq_f32( node->zzzz, Oz4 ), 
rDz4 ); + // transpose + float32x4_t t0 = vzip1q_f32( x4, y4 ), t2 = vzip1q_f32( z4, z4 ); + float32x4_t t1 = vzip2q_f32( x4, y4 ), t3 = vzip2q_f32( z4, z4 ); + float32x4_t xyzw1a = vcombine_f32( vget_low_f32( t0 ), vget_low_f32( t2 ) ); + float32x4_t xyzw2a = vcombine_f32( vget_high_f32( t0 ), vget_high_f32( t2 ) ); + float32x4_t xyzw1b = vcombine_f32( vget_low_f32( t1 ), vget_low_f32( t3 ) ); + float32x4_t xyzw2b = vcombine_f32( vget_high_f32( t1 ), vget_high_f32( t3 ) ); + // process + float32x4_t tmina4 = vminq_f32( xyzw1a, xyzw2a ), tmaxa4 = vmaxq_f32( xyzw1a, xyzw2a ); + float32x4_t tminb4 = vminq_f32( xyzw1b, xyzw2b ), tmaxb4 = vmaxq_f32( xyzw1b, xyzw2b ); + // transpose back + t0 = vzip1q_f32( tmina4, tmaxa4 ), t2 = vzip1q_f32( tminb4, tmaxb4 ); + t1 = vzip2q_f32( tmina4, tmaxa4 ), t3 = vzip2q_f32( tminb4, tmaxb4 ); + x4 = vcombine_f32( vget_low_f32( t0 ), vget_low_f32( t2 ) ); + y4 = vcombine_f32( vget_high_f32( t0 ), vget_high_f32( t2 ) ); + z4 = vcombine_f32( vget_low_f32( t1 ), vget_low_f32( t3 ) ); + unsigned lidx = node->left, ridx = node->right; + const float32x4_t min4 = vmaxq_f32( vmaxq_f32( vmaxq_f32( x4, y4 ), z4 ), vdupq_n_f32( 0 ) ); + const float32x4_t max4 = vminq_f32( vminq_f32( vminq_f32( x4, y4 ), z4 ), vdupq_n_f32( ray.hit.t ) ); + #if 0 + // TODO: why is this slower on gen14? + const float tmina_0 = vgetq_lane_f32( min4, 0 ), tmaxa_1 = vgetq_lane_f32( max4, 1 ); + const float tminb_2 = vgetq_lane_f32( min4, 2 ), tmaxb_3 = vgetq_lane_f32( max4, 3 ); + t0 = __builtin_shufflevector( max4, max4, 3, 1, 3, 1 ); + t1 = __builtin_shufflevector( min4, min4, 2, 0, 2, 0 ); + t0 = vbslq_f32( vcgeq_f32( t0, t1 ), t1, inf4 ); + float dist1 = vgetq_lane_f32( t0, 1 ), dist2 = vgetq_lane_f32( t0, 0 ); + #else + const float tmina_0 = vgetq_lane_f32( min4, 0 ), tmaxa_1 = vgetq_lane_f32( max4, 1 ); + const float tminb_2 = vgetq_lane_f32( min4, 2 ), tmaxb_3 = vgetq_lane_f32( max4, 3 ); + float dist1 = tmaxa_1 >= tmina_0 ? tmina_0 : 1e30f; + float dist2 = tmaxb_3 >= tminb_2 ? tminb_2 : 1e30f; + #endif + if (dist1 > dist2) + { + float t = dist1; dist1 = dist2; dist2 = t; + unsigned i = lidx; lidx = ridx; ridx = i; + } + if (dist1 == 1e30f) + { + if (stackPtr == 0) break; else node = stack[--stackPtr]; + } + else + { + node = alt2Node + lidx; + if (dist2 != 1e30f) stack[stackPtr++] = alt2Node + ridx; + } + } + return steps; } #endif // BVH_USENEON
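// For reference, a minimal scalar sketch of the per-node work in Intersect_AltSoA:
// the NEON code above performs both child slab tests in one transposed batch, then
// visits the nearer child first and pushes the farther one only when it was hit.
// The helper name slabTest and the separate per-child bounds are illustrative; the
// real BVHNodeAlt2 keeps both children interleaved in its xxxx / yyyy / zzzz vectors.
static inline float slabTest( const bvhvec3& bmin, const bvhvec3& bmax, const Ray& ray )
{
	// entry distance of the ray against one child AABB, using the precomputed
	// reciprocal direction; 1e30f means "missed or outside [0, ray.hit.t]"
	const float tx1 = (bmin.x - ray.O.x) * ray.rD.x, tx2 = (bmax.x - ray.O.x) * ray.rD.x;
	const float ty1 = (bmin.y - ray.O.y) * ray.rD.y, ty2 = (bmax.y - ray.O.y) * ray.rD.y;
	const float tz1 = (bmin.z - ray.O.z) * ray.rD.z, tz2 = (bmax.z - ray.O.z) * ray.rD.z;
	const float tmin = fmaxf( fmaxf( fminf( tx1, tx2 ), fminf( ty1, ty2 ) ), fmaxf( fminf( tz1, tz2 ), 0.0f ) );
	const float tmax = fminf( fminf( fmaxf( tx1, tx2 ), fmaxf( ty1, ty2 ) ), fminf( fmaxf( tz1, tz2 ), ray.hit.t ) );
	return tmax >= tmin ? tmin : 1e30f;
}
// Traversal order per interior node, mirroring the dist1 / dist2 logic above:
//   float dist1 = slabTest( leftMin, leftMax, ray ), dist2 = slabTest( rightMin, rightMax, ray );
//   if (dist1 > dist2) swap dist1 with dist2 and lidx with ridx;
//   if (dist1 == 1e30f) pop the stack (or terminate); otherwise descend into lidx,
//   pushing ridx only when dist2 != 1e30f.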