From 23727561bda7f08a58a3164052d894cdbcb19f22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?An=C4=B1lcan=20G=C3=BClkaya?= <anilcangulkaya7@gmail.com>
Date: Fri, 20 Dec 2024 15:27:37 +0300
Subject: [PATCH] native_recip and fast_normalize

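Use the OpenCL built-in functions native_recip and fast_normalize, which
might speed up the code a bit, especially in the hemisphere sampling
functions. Finalize now also converts the colour with a single vector
convert_int3 call instead of three scalar conversions.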
---
 wavefront.cl | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/wavefront.cl b/wavefront.cl
index bcbb693..81ac1a2 100644
--- a/wavefront.cl
+++ b/wavefront.cl
@@ -37,7 +37,7 @@ float3 DiffuseReflection( float3 N, uint* seed )
 	{
 		R = (float3)( RandomFloat( seed ) * 2 - 1, RandomFloat( seed ) * 2 - 1, RandomFloat( seed ) * 2 - 1 );
 	} while (dot( R, R ) > 1);
-	return normalize( dot( R, N ) > 0 ? R : -R );
+	return fast_normalize( dot( R, N ) > 0 ? R : -R );
 }
 
 // CosWeightedDiffReflection: Cosine-weighted random bounce in the hemisphere
@@ -48,7 +48,7 @@ float3 CosWeightedDiffReflection( const float3 N, uint* seed )
 	{
 		R = (float3)( RandomFloat( seed ) * 2 - 1, RandomFloat( seed ) * 2 - 1, RandomFloat( seed ) * 2 - 1 );
 	} while (dot( R, R ) > 1);
-	return normalize( N + normalize( R ) );
+	return fast_normalize( N + fast_normalize( R ) );
 }
 
 // PathState: path throughput, current extension ray, pixel index
@@ -106,7 +106,7 @@ void kernel Generate( global struct PathState* raysOut, uint frameSeed  )
 	const float4 P = rd.p0 + u * (rd.p1 - rd.p0) + v * (rd.p2 - rd.p0);
 	raysOut[id].T = (float4)( 1, 1, 1, 1 /* pdf */ );
 	raysOut[id].O = (float4)( rd.eye.xyz, as_float( id << 4 /* low bits: depth */ ) );
-	raysOut[id].D = (float4)( normalize( P.xyz - rd.eye.xyz ), 1e30f );
+	raysOut[id].D = (float4)( fast_normalize( P.xyz - rd.eye.xyz ), 1e30f );
 	raysOut[id].hit = (float4)( 1e30f, 0, 0, as_float( 0 ) );
 }
 
@@ -121,7 +121,7 @@ void kernel Extend( global struct PathState* raysIn )
 		if (pathId < 0) break;
 		const float4 O4 = raysIn[pathId].O;
 		const float4 D4 = raysIn[pathId].D;
-		const float3 rD = (float3)( 1.0f / D4.x, 1.0f / D4.y, 1.0f / D4.z );
+		const float3 rD = native_recip( D4.xyz ); 
 		raysIn[pathId].hit = traverse_cwbvh( rd.cwbvhNodes, rd.cwbvhTris, O4.xyz, D4.xyz, rD, 1e30f );
 	}
 }
@@ -177,7 +177,7 @@ void kernel Shade( global float4* accumulator,
 			continue;
 		}
 		float3 vert0 = v0.xyz, vert1 = verts[vertIdx + 1].xyz, vert2 = verts[vertIdx + 2].xyz;
-		float3 N = normalize( cross( vert1 - vert0, vert2 - vert0 ) );
+		float3 N = fast_normalize( cross( vert1 - vert0, vert2 - vert0 ) );
 		float3 D = D4.xyz;
 		if (dot( N, D ) > 0) N *= -1;
 		float3 I = O4.xyz + t * D;
@@ -191,9 +191,9 @@ void kernel Shade( global float4* accumulator,
 		{
 			uint newShadowIdx = atomic_inc( &connectTasks );
 			float dist2 = dot( L, L ), dist = sqrt( dist2 );
-			L *= 1.0f / dist;
+			L *= native_recip(dist);
 			float NLdotL = fabs( L.y ); // actually, fabs( dot( L, LN ) )
-			shadowOut[newShadowIdx].T = (float4)( lightColor * BRDF * T * NdotL * NLdotL * (1.0f / dist2), 0 );
+			shadowOut[newShadowIdx].T = (float4)( lightColor * BRDF * T * NdotL * NLdotL * native_recip( dist2 ), 0 );
 			shadowOut[newShadowIdx].O = (float4)( I + L * 0.001f, as_float( pixelIdx ) );
 			shadowOut[newShadowIdx].D = (float4)( L, dist - 0.002f );
 		}
@@ -203,7 +203,7 @@ void kernel Shade( global float4* accumulator,
 			uint newRayIdx = atomic_inc( &extendTasks );
 			float3 R = CosWeightedDiffReflection( N, &seed );
 			float PDF = dot( N, R ) * INVPI;
-			T *= dot( N, R ) * BRDF * (1.0f / PDF);
+			T *= dot( N, R ) * BRDF * native_recip( PDF );
 			raysOut[newRayIdx].T = (float4)( T, 1 );
 			raysOut[newRayIdx].O = (float4)( I + R * 0.001f, as_float( (pixelIdx << 4) + depth + 1 ) );
 			raysOut[newRayIdx].D = (float4)( R, 1e30f );
@@ -231,7 +231,7 @@ void kernel Connect( global float4* accumulator, global struct Potential* shadow
 		const float4 T4 = shadowIn[rayId].T;
 		const float4 O4 = shadowIn[rayId].O;
 		const float4 D4 = shadowIn[rayId].D;
-		const float3 rD = (float3)( 1.0f / D4.x, 1.0f / D4.y, 1.0f / D4.z );
+		const float3 rD = native_recip( D4.xyz ); 
 		if (!isoccluded_cwbvh( rd.cwbvhNodes, rd.cwbvhTris, O4.xyz, D4.xyz, rD, D4.w ))
 		{
 			uint pixelIdx = as_uint( O4.w );
@@ -249,8 +249,6 @@ void kernel Finalize( global float4* accumulator, const float scale, global uint
 	const uint x = get_global_id( 0 ), y = get_global_id( 1 );
 	const uint pixelIdx = x + y * get_global_size( 0 );
 	const float4 p = accumulator[pixelIdx] * scale;
-	const int r = (int)(255.0f * min( 1.0f, sqrt( p.x ) ));
-	const int g = (int)(255.0f * min( 1.0f, sqrt( p.y ) ));
-	const int b = (int)(255.0f * min( 1.0f, sqrt( p.z ) ));
-	pixels[pixelIdx] = (r << 16) + (g << 8) + b;
+	int3 rgb = convert_int3( min( sqrt( p.xyz ) , (float3)( 1.0f, 1.0f, 1.0f ) ) * 255.0f ) ;
+	pixels[pixelIdx] = (rgb.x << 16) + (rgb.y << 8) + rgb.z;
 }
\ No newline at end of file