From 251e8d243a18952095bf42de32cb2d0aa13a047f Mon Sep 17 00:00:00 2001
From: Jacco Bikker <bikker.j@gmail.com>
Date: Wed, 27 Nov 2024 11:15:49 +0100
Subject: [PATCH] Triangle layout optimizations.

---
 tiny_bvh.h             | 86 +++++++++++++++++++++---------------------
 tiny_bvh_fenster.cpp   |  9 ++---
 tiny_bvh_speedtest.cpp | 64 +++++++++++++++++++------------
 traverse.cl            |  5 +--
 4 files changed, 87 insertions(+), 77 deletions(-)

diff --git a/tiny_bvh.h b/tiny_bvh.h
index ba89447..fe3edc3 100644
--- a/tiny_bvh.h
+++ b/tiny_bvh.h
@@ -1306,10 +1306,11 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr
 				{
 					unsigned t = triIdx[childNode[i]->firstTri + j];
 					bvhvec4 v0 = verts[t * 3 + 0];
+					bvh4Alt[newAlt4Ptr + 1] = verts[t * 3 + 1] - v0;
+					bvh4Alt[newAlt4Ptr + 2] = verts[t * 3 + 2] - v0;
 					v0.w = *(float*)&t; // as_float
-					bvh4Alt[newAlt4Ptr++] = v0;
-					bvh4Alt[newAlt4Ptr++] = verts[t * 3 + 1];
-					bvh4Alt[newAlt4Ptr++] = verts[t * 3 + 2];
+					bvh4Alt[newAlt4Ptr + 0] = v0;
+					newAlt4Ptr += 3;
 				}
 			}
 			// process interior nodes
@@ -1431,10 +1432,6 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr
 		while (1)
 		{
 			BVHNode4Alt2& node = bvh4Alt2[nodeIdx];
-			if (nodeIdx == 58663)
-			{
-				int w = 0;
-			}
 			for (int i = 0; i < 4; i++) if (node.triCount[i] + node.childFirst[i] > 0)
 			{
 				if (!node.triCount[i]) stack[stackPtr++] = node.childFirst[i]; else
@@ -1442,18 +1439,16 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr
 					unsigned first = node.childFirst[i];
 					unsigned count = node.triCount[i];
 					node.childFirst[i] = triPtr;
+					// assign vertex data
 					for (unsigned j = 0; j < count; j++)
 					{
 						unsigned fi = triIdx[first + j];
-						if (fi > triCount)
-						{
-							int w = 0;
-						}
 						bvhvec4 v0 = verts[fi * 3 + 0];
+						bvh4Tris[triPtr + 1] = verts[fi * 3 + 1] - v0;
+						bvh4Tris[triPtr + 2] = verts[fi * 3 + 2] - v0;
 						v0.w = *(float*)&fi; // so we know the original tri idx
-						bvh4Tris[triPtr++] = v0;
-						bvh4Tris[triPtr++] = verts[fi * 3 + 1];
-						bvh4Tris[triPtr++] = verts[fi * 3 + 2];
+						bvh4Tris[triPtr + 0] = v0;
+						triPtr += 3;
 					}
 				}
 			}
@@ -2466,9 +2461,9 @@ int BVH::Intersect_Alt4BVH( Ray& ray ) const
 			unsigned triStart = offset + (leaf[i] & 0xffff);
 			for (unsigned j = 0; j < N; j++, triStart += 3)
 			{
+				const bvhvec3 edge2 = bvhvec3( bvh4Alt[triStart + 2] );
+				const bvhvec3 edge1 = bvhvec3( bvh4Alt[triStart + 1] );
 				const bvhvec3 v0 = bvh4Alt[triStart + 0];
-				const bvhvec3 edge1 = bvhvec3( bvh4Alt[triStart + 1] ) - v0;
-				const bvhvec3 edge2 = bvhvec3( bvh4Alt[triStart + 2] ) - v0;
 				const bvhvec3 h = cross( ray.D, edge2 );
 				const float a = dot( edge1, h );
 				if (fabs( a ) < 0.0000001f) continue;
@@ -3377,19 +3372,18 @@ int BVH::Intersect_CWBVH( Ray& ray ) const
 inline void IntersectTri4( Ray& r, unsigned idx, __m128& t4, const bvhvec4* verts )
 {
 	bvhvec4 v0 = verts[idx];
+	bvhvec3 edge1 = verts[idx + 1], edge2 = verts[idx + 2];
 	const unsigned triIdx = *(unsigned*)&v0.w;
 	v0.w = 0;
-	const bvhvec3 edge1 = verts[idx + 1] - v0, edge2 = verts[idx + 2] - v0;
 	const bvhvec3 h = cross( r.D, edge2 );
 	const float a = dot( edge1, h );
 	if (fabs( a ) < 0.0000001f) return; // ray parallel to triangle
 	const float f = 1 / a;
-	const bvhvec3 s = r.O - bvhvec3( v0 );
+	const bvhvec3 s = r.O - bvhvec3( v0 ), q = cross( s, edge1 );
 	const float u = f * dot( s, h );
-	if (u < 0 || u > 1) return;
-	const bvhvec3 q = cross( s, edge1 );
 	const float v = f * dot( r.D, q );
-	if (v < 0 || u + v > 1) return;
+	const bool out = u < 0 || u > 1 || v < 0 || u + v > 1;
+	if (out) return;
 	const float t = f * dot( edge2, q );
 	if (t > 0 && t < r.hit.t) r.hit.u = u, r.hit.v = v, r.hit.prim = triIdx, r.hit.t = t, t4 = _mm_set1_ps( t );
 }
@@ -3407,27 +3401,25 @@ int BVH::Intersect_Afra( Ray& ray ) const
 	{
 		const BVHNode4Alt2& node = bvh4Alt2[nodeIdx];
 		// intersect the ray with four AABBs
-		const __m128 x0 = _mm_sub_ps( node.xmin4, ox4 ), x1 = _mm_sub_ps( node.xmax4, ox4 );
-		const __m128 y0 = _mm_sub_ps( node.ymin4, oy4 ), y1 = _mm_sub_ps( node.ymax4, oy4 );
-		const __m128 z0 = _mm_sub_ps( node.zmin4, oz4 ), z1 = _mm_sub_ps( node.zmax4, oz4 );
+		const __m128 xmin4 = node.xmin4, xmax4 = node.xmax4;
+		const __m128 ymin4 = node.ymin4, ymax4 = node.ymax4;
+		const __m128 zmin4 = node.zmin4, zmax4 = node.zmax4;
+		const __m128 x0 = _mm_sub_ps( xmin4, ox4 ), x1 = _mm_sub_ps( xmax4, ox4 );
+		const __m128 y0 = _mm_sub_ps( ymin4, oy4 ), y1 = _mm_sub_ps( ymax4, oy4 );
+		const __m128 z0 = _mm_sub_ps( zmin4, oz4 ), z1 = _mm_sub_ps( zmax4, oz4 );
 		const __m128 tx1 = _mm_mul_ps( x0, rdx4 ), tx2 = _mm_mul_ps( x1, rdx4 );
 		const __m128 ty1 = _mm_mul_ps( y0, rdy4 ), ty2 = _mm_mul_ps( y1, rdy4 );
 		const __m128 tz1 = _mm_mul_ps( z0, rdz4 ), tz2 = _mm_mul_ps( z1, rdz4 );
-		__m128 tmin = _mm_max_ps( _mm_max_ps( _mm_min_ps( tx1, tx2 ), _mm_min_ps( ty1, ty2 ) ), _mm_min_ps( tz1, tz2 ) );
-		const __m128 tmax = _mm_min_ps( _mm_min_ps( _mm_max_ps( tx1, tx2 ), _mm_max_ps( ty1, ty2 ) ), _mm_max_ps( tz1, tz2 ) );
+		const __m128 txmin = _mm_min_ps( tx1, tx2 ), tymin = _mm_min_ps( ty1, ty2 ), tzmin = _mm_min_ps( tz1, tz2 );
+		const __m128 txmax = _mm_max_ps( tx1, tx2 ), tymax = _mm_max_ps( ty1, ty2 ), tzmax = _mm_max_ps( tz1, tz2 );
+		const __m128 tmin = _mm_max_ps( _mm_max_ps( txmin, tymin ), tzmin );
+		const __m128 tmax = _mm_min_ps( _mm_min_ps( txmax, tymax ), tzmax );
 		const __m128 hit = _mm_and_ps( _mm_and_ps( _mm_cmpge_ps( tmax, tmin ), _mm_cmplt_ps( tmin, t4 ) ), _mm_cmpge_ps( tmax, zero4 ) );
-		const int hitBits = _mm_movemask_ps( hit );
-		const int hits = __popc( hitBits );
-		if (hits == 0)
-		{
-			if (stackPtr == 0) break; else nodeIdx = stack[--stackPtr];
-		}
-		else if (hits == 1)
+		const int hitBits = _mm_movemask_ps( hit ), hits = __popc( hitBits );
+		if (hits == 1 /* 43% */)
 		{
 			// just one node was hit - no sorting needed.
-			const unsigned lane = __bfind( hitBits );
-			const unsigned count = node.triCount[lane];
-			// if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens?
+			const unsigned lane = __bfind( hitBits ), count = node.triCount[lane];
 			if (count == 0) nodeIdx = node.childFirst[lane]; else
 			{
 				const unsigned first = node.childFirst[lane];
@@ -3438,7 +3430,13 @@ int BVH::Intersect_Afra( Ray& ray ) const
 			}
 			continue;
 		}
-		else if (hits == 2)
+		if (hits == 0 /* 29% */)
+		{
+			if (stackPtr == 0) break;
+			nodeIdx = stack[--stackPtr];
+			continue;
+		}
+		if (hits == 2 /* 16% */)
 		{
 			// two nodes hit
 			unsigned lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) );
@@ -3470,12 +3468,12 @@ int BVH::Intersect_Afra( Ray& ray ) const
 					IntersectTri4( ray, first + j * 3, t4, bvh4Tris );
 			}
 		}
-		else if (hits == 3)
+		else if (hits == 3 /* 8% */)
 		{
 			// blend in lane indices
-			tmin = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 );
+			__m128 tm = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 );
 			// sort
-			float tmp, d0 = LANE( tmin, 0 ), d1 = LANE( tmin, 1 ), d2 = LANE( tmin, 2 ), d3 = LANE( tmin, 3 );
+			float tmp, d0 = LANE( tm, 0 ), d1 = LANE( tm, 1 ), d2 = LANE( tm, 2 ), d3 = LANE( tm, 3 );
 			if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp;
 			if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp;
 			if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp;
@@ -3499,12 +3497,12 @@ int BVH::Intersect_Afra( Ray& ray ) const
 					IntersectTri4( ray, first + j * 3, t4, bvh4Tris );
 			}
 		}
-		else
+		else /* hits == 4, 2%: rare */
 		{
 			// blend in lane indices
-			tmin = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 );
+			__m128 tm = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 );
 			// sort
-			float tmp, d0 = LANE( tmin, 0 ), d1 = LANE( tmin, 1 ), d2 = LANE( tmin, 2 ), d3 = LANE( tmin, 3 );
+			float tmp, d0 = LANE( tm, 0 ), d1 = LANE( tm, 1 ), d2 = LANE( tm, 2 ), d3 = LANE( tm, 3 );
 			if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp;
 			if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp;
 			if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp;
@@ -3825,7 +3823,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const
 
 #endif // BVH_USENEON
 
-} // namespace tinybvh
+	} // namespace tinybvh
 
 #ifdef __GNUC__
 #pragma GCC diagnostic pop
diff --git a/tiny_bvh_fenster.cpp b/tiny_bvh_fenster.cpp
index d40914d..563b19c 100644
--- a/tiny_bvh_fenster.cpp
+++ b/tiny_bvh_fenster.cpp
@@ -31,9 +31,8 @@ int verts = 0;
 // setup view pyramid for a pinhole camera: 
 // eye, p1 (top-left), p2 (top-right) and p3 (bottom-left)
 #ifdef LOADSCENE
-// bvhvec3 eye( 0, 30, 0 ), view = normalize( bvhvec3( -8, 2, -1.7f ) );
-static bvhvec3 eye( 0, 13, 30 ), p1, p2, p3;
-static bvhvec3 view = normalize( bvhvec3( 0, 0.01f, -1 ) );
+static bvhvec3 eye( 0, 30, 0 ), p1, p2, p3;
+static bvhvec3 view = normalize( bvhvec3( -8, 2, -1.7f ) );
 #else
 static bvhvec3 eye( -3.5f, -1.5f, -6.5f ), p1, p2, p3;
 static bvhvec3 view = normalize( bvhvec3( 3, 1.5f, 5 ) );
@@ -110,8 +109,6 @@ void Init()
 	// build a BVH over the scene
 #if defined(BVH_USEAVX)
 	bvh.BuildHQ( triangles, verts / 3 );
-	bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 );
-	bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_AFRA );
 #elif defined(BVH_USENEON)
 	bvh.BuildNEON( triangles, verts / 3 );
 #else
@@ -173,7 +170,7 @@ void Tick( uint32_t* buf )
 
 	// trace primary rays
 #if !defined USE_EMBREE
-	for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::BVH4_AFRA );
+	for (int i = 0; i < N; i++) bvh.Intersect( rays[i] );
 #else
 	struct RTCRayHit rayhit;
 	for (int i = 0; i < N; i++)
diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp
index c330a0b..d0bdb57 100644
--- a/tiny_bvh_speedtest.cpp
+++ b/tiny_bvh_speedtest.cpp
@@ -16,7 +16,7 @@
 #define BUILD_REFERENCE
 #define BUILD_AVX
 #define BUILD_NEON
-// #define BUILD_SBVH
+#define BUILD_SBVH
 #define TRAVERSE_2WAY_ST
 #define TRAVERSE_ALT2WAY_ST
 #define TRAVERSE_SOA2WAY_ST
@@ -323,9 +323,11 @@ int main()
 	// trace all rays three times to estimate average performance
 	// - single core version
 	printf( "- CPU, coherent,   basic 2-way layout, ST: " );
-	t.reset();
-	for (int pass = 0; pass < 3; pass++)
+	for (int pass = 0; pass < 4; pass++)
+	{
+		if (pass == 1) t.reset(); // first pass is cache warming
 		for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i] );
+	}
 	float traceTimeST = t.elapsed() / 3.0f;
 	mrays = (float)Nsmall / traceTimeST;
 	printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeST * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f );
@@ -338,9 +340,11 @@ int main()
 	// - single core version, alternative bvh layout
 	printf( "- CPU, coherent,   alt 2-way layout,   ST: " );
 	bvh.Convert( BVH::WALD_32BYTE, BVH::AILA_LAINE );
-	t.reset();
-	for (int pass = 0; pass < 3; pass++)
+	for (int pass = 0; pass < 4; pass++)
+	{
+		if (pass == 1) t.reset(); // first pass is cache warming
 		for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::AILA_LAINE );
+	}
 	float traceTimeAlt = t.elapsed() / 3.0f;
 	mrays = (float)Nsmall / traceTimeAlt;
 	printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeAlt * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f );
@@ -372,12 +376,13 @@ int main()
 	t.reset();
 	float traceTimeGPU = 0;
 	ailalaine_kernel.SetArguments( &gpuNodes, &idxData, &triData, &rayData );
-	for (int pass = 0; pass < 8; pass++)
+	for (int pass = 0; pass < 9; pass++)
 	{
 		ailalaine_kernel.Run( Nfull, 64, 0, &event ); // for now, todo.
 		clWaitForEvents( 1, &event ); // OpenCL kernsl run asynchronously
 		clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
 		clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
+		if (pass == 0) continue; // first pass is for cache warming
 		traceTimeGPU += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds
 	}
 	// get results from GPU - this also syncs the queue.
@@ -411,12 +416,13 @@ int main()
 	t.reset();
 	float traceTimeGPU4 = 0;
 	gpu4way_kernel.SetArguments( &gpu4Nodes, &rayData );
-	for (int pass = 0; pass < 8; pass++)
+	for (int pass = 0; pass < 9; pass++)
 	{
 		gpu4way_kernel.Run( Nfull, 64, 0, &event ); // for now, todo.
 		clWaitForEvents( 1, &event ); // OpenCL kernsl run asynchronously
 		clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
 		clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
+		if (pass == 0) continue; // first pass is for cache warming
 		traceTimeGPU4 += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds
 	}
 	// get results from GPU - this also syncs the queue.
@@ -453,12 +459,13 @@ int main()
 	t.reset();
 	float traceTimeGPU8 = 0;
 	cwbvh_kernel.SetArguments( &cwbvhNodes, &cwbvhTris, &rayData );
-	for (int pass = 0; pass < 8; pass++)
+	for (int pass = 0; pass < 9; pass++)
 	{
 		cwbvh_kernel.Run( Nfull, 64, 0, &event ); // for now, todo.
 		clWaitForEvents( 1, &event ); // OpenCL kernsl run asynchronously
 		clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
 		clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
+		if (pass == 0) continue; // first pass is for cache warming
 		traceTimeGPU8 += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds
 	}
 	// get results from GPU - this also syncs the queue.
@@ -478,9 +485,11 @@ int main()
 	// - single core version, alternative bvh layout 2
 	printf( "- CPU, coherent,   soa 2-way layout,   ST: " );
 	bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA );
-	t.reset();
-	for (int pass = 0; pass < 3; pass++)
+	for (int pass = 0; pass < 4; pass++)
+	{
+		if (pass == 1) t.reset(); // first pass is cache warming
 		for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::ALT_SOA );
+	}
 	float traceTimeAlt2 = t.elapsed() / 3.0f;
 	mrays = (float)Nsmall / traceTimeAlt2;
 	printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeAlt2 * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f );
@@ -492,9 +501,9 @@ int main()
 	// trace all rays three times to estimate average performance
 	// - multi-core version (using OpenMP and batches of 10,000 rays)
 	printf( "- CPU, coherent,   basic 2-way layout, MT: " );
-	t.reset();
-	for (int j = 0; j < 3; j++)
+	for (int pass = 0; pass < 4; pass++)
 	{
+		if (pass == 1) t.reset(); // first pass is cache warming
 		const int batchCount = Nfull / 10000;
 	#pragma omp parallel for schedule(dynamic)
 		for (int batch = 0; batch < batchCount; batch++)
@@ -514,9 +523,9 @@ int main()
 	// trace all rays three times to estimate average performance
 	// - coherent distribution, multi-core, packet traversal
 	printf( "- CPU, coherent,   2-way, packets,     MT: " );
-	t.reset();
-	for (int j = 0; j < 3; j++)
+	for (int pass = 0; pass < 4; pass++)
 	{
+		if (pass == 1) t.reset(); // first pass is cache warming
 		const int batchCount = Nfull / (30 * 256); // batches of 30 packets of 256 rays
 	#pragma omp parallel for schedule(dynamic)
 		for (int batch = 0; batch < batchCount; batch++)
@@ -534,9 +543,9 @@ int main()
 	// trace all rays three times to estimate average performance
 	// - coherent distribution, multi-core, packet traversal, SSE version
 	printf( "- CPU, coherent,   2-way, packets/SSE, MT: " );
-	t.reset();
-	for (int j = 0; j < 3; j++)
+	for (int pass = 0; pass < 4; pass++)
 	{
+		if (pass == 1) t.reset(); // first pass is cache warming
 		const int batchCount = Nfull / (30 * 256); // batches of 30 packets of 256 rays
 	#pragma omp parallel for schedule(dynamic)
 		for (int batch = 0; batch < batchCount; batch++)
@@ -565,9 +574,11 @@ int main()
 	printf( "done (%.2fs). New: %i nodes, SAH=%.2f\n", t.elapsed(), bvh.NodeCount( BVH::WALD_32BYTE ), bvh.SAHCost() );
 	bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA );
 	printf( "- CPU, coherent,   2-way optimized,    ST: " );
-	t.reset();
-	for (int pass = 0; pass < 3; pass++)
+	for (int pass = 0; pass < 4; pass++)
+	{
+		if (pass == 1) t.reset(); // first pass is cache warming
 		for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::ALT_SOA );
+	}
 	float traceTimeOpt = t.elapsed() / 3.0f;
 	mrays = (float)Nsmall / traceTimeOpt;
 	printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeOpt * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f );
@@ -589,9 +600,11 @@ int main()
 	bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 );
 	bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_AFRA );
 	printf( "- CPU, coherent,   4-way optimized,    ST: " );
-	t.reset();
-	for (int pass = 0; pass < 3; pass++)
+	for (int pass = 0; pass < 4; pass++)
+	{
+		if (pass == 1) t.reset(); // first pass is cache warming
 		for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::BVH4_AFRA );
+	}
 	float traceTimeAfra = t.elapsed() / 3.0f;
 	mrays = (float)Nsmall / traceTimeAfra;
 	printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeAfra * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f );
@@ -614,10 +627,12 @@ int main()
 		rayhits[i].hit.geomID = RTC_INVALID_GEOMETRY_ID;
 		rayhits[i].hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
 	}
-	t.reset();
-	for (int pass = 0; pass < 6; pass++)
+	for (int pass = 0; pass < 4; pass++)
+	{
+		if (pass == 1) t.reset(); // first pass is cache warming
 		for (int i = 0; i < Nsmall; i++) rtcIntersect1( embreeScene, rayhits + i );
-	float traceTimeEmbree = t.elapsed() / 6.0f;
+	}
+	float traceTimeEmbree = t.elapsed() / 3.0f;
 	// retrieve intersection results
 	for (int i = 0; i < Nsmall; i++)
 	{
@@ -646,8 +661,9 @@ int main()
 	// - divergent distribution, multi-core
 	printf( "- CPU, incoherent, basic 2-way layout, MT: " );
 	t.reset();
-	for (int j = 0; j < 3; j++)
+	for (int pass = 0; pass < 4; pass++)
 	{
+		if (pass == 1) t.reset(); // first pass is cache warming
 		const int batchCount = Nfull / 10000;
 	#pragma omp parallel for schedule(dynamic)
 		for (int batch = 0; batch < batchCount; batch++)
diff --git a/traverse.cl b/traverse.cl
index 3ce7041..bbce89b 100644
--- a/traverse.cl
+++ b/traverse.cl
@@ -244,12 +244,11 @@ void kernel traverse_gpu4way( global float4* alt4Node, global struct Ray* rayDat
 		{
 			const unsigned leafInfo = smem[smBase + leaf];
 			unsigned thisTri = (leafInfo & 0xffff) + offset + prim * 3;
+			const float4 edge2 = alt4Node[thisTri + 2];
+			const float4 edge1 = alt4Node[thisTri + 1];
 			const float4 v0 = alt4Node[thisTri];
-			const float4 v1 = alt4Node[thisTri + 1];
-			const float4 v2 = alt4Node[thisTri + 2];
 			const unsigned triCount = (leafInfo >> 16) & 0x7fff;
 			if (++prim == triCount) prim = 0, leaf++;
-			const float4 edge1 = v1 - v0, edge2 = v2 - v0;
 			const float3 h = cross( D, edge2.xyz );
 			const float a = dot( edge1.xyz, h );
 			if (fabs( a ) < 0.0000001f) continue;