diff --git a/tiny_bvh_gpu.cpp b/tiny_bvh_gpu.cpp
new file mode 100644
index 0000000..92aba55
--- /dev/null
+++ b/tiny_bvh_gpu.cpp
@@ -0,0 +1,125 @@
+// This example shows how to build a basic GPU path tracer using
+// tinybvh. TinyOCL is used to render on the GPU using OpenCL.
+
+#define FENSTER_APP_IMPLEMENTATION
+#define SCRWIDTH 1280
+#define SCRHEIGHT 800
+#include "external/fenster.h" // https://github.com/zserge/fenster
+
+// This application uses tinybvh - And this file will include the implementation.
+#define TINYBVH_IMPLEMENTATION
+#include "tiny_bvh.h"
+using namespace tinybvh;
+
+// This application uses tinyocl - And this file will include the implementation.
+#define TINY_OCL_IMPLEMENTATION
+#include "tiny_ocl.h"
+
+// Other includes
+#include <fstream>
+
+// Application variables
+
+static BVH4_GPU bvh;
+static bvhvec4* tris = 0;
+static int triCount = 0, frameIdx = 0, spp = 0;
+static Kernel* reset, * generate, * extend, * shade, * traceShadows, * finalize;
+static Buffer* pixels, * accumulator, * raysIn, * raysOut, * connections;
+
+// View pyramid for a pinhole camera
+struct RenderData
+{
+	bvhvec4 eye = bvhvec4( 0, 30, 0, 0 );
+	bvhvec4 view = bvhvec4( -1, 0, 0, 0 );
+	bvhvec4 C, p0, p1, p2;
+	uint32_t frameIdx, dummy1, dummy2, dummy3;
+} rd;
+
+// Scene management - Append a file, with optional position, scale and color override, tinyfied
+void AddMesh( const char* file, float scale = 1, bvhvec3 pos = {}, int c = 0, int N = 0 )
+{
+	std::fstream s{ file, s.binary | s.in }; s.read( (char*)&N, 4 );
+	bvhvec4* data = (bvhvec4*)tinybvh::malloc64( (N + triCount) * 48 );
+	if (tris) memcpy( data, tris, triCount * 48 ), tinybvh::free64( tris );
+	tris = data, s.read( (char*)tris + triCount * 48, N * 48 ), triCount += N;
+	for (int* b = (int*)tris + (triCount - N) * 12, i = 0; i < N * 3; i++)
+		*(bvhvec3*)b = *(bvhvec3*)b * scale + pos, b[3] = c ? c : b[3], b += 4;
+}
+
+// Application init
+void Init()
+{
+	// create OpenCL kernels
+	reset = new Kernel( "wavefront.cl", "ResetCounters" );
+	generate = new Kernel( "wavefront.cl", "Generate" );
+	extend = new Kernel( "wavefront.cl", "Extend" );
+	shade = new Kernel( "wavefront.cl", "Shade" );
+	traceShadows = new Kernel( "wavefront.cl", "Connect" );
+	finalize = new Kernel( "wavefront.cl", "Finalize" );
+	// create OpenCL buffers
+	int N = SCRWIDTH * SCRHEIGHT;
+	pixels = new Buffer( N * sizeof( uint32_t ) );
+	accumulator = new Buffer( N * sizeof( bvhvec4 ) );
+	raysIn = new Buffer( N * sizeof( bvhvec4 ) * 4 );
+	raysOut = new Buffer( N * sizeof( bvhvec4 ) * 4 );
+	connections = new Buffer( N * sizeof( bvhvec4 ) * 3 );
+	// set kernel arguments
+	reset->SetArguments( N );
+	// load raw vertex data
+	AddMesh( "./testdata/cryteksponza.bin", 1, bvhvec3( 0 ), 0xffffff );
+	AddMesh( "./testdata/lucy.bin", 1.1f, bvhvec3( -2, 4.1f, -3 ), 0xaaaaff );
+	// build bvh (here: 'compressed wide bvh', for efficient GPU rendering)
+	bvh.Build( tris, triCount );
+	// load camera position / direction from file
+	std::fstream t = std::fstream{ "camera_gpu.bin", t.binary | t.in };
+	if (!t.is_open()) return;
+	t.read( (char*)&rd, sizeof( rd ) );
+}
+
+// Keyboard handling
+bool UpdateCamera( float delta_time_s, fenster& f )
+{
+	bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), rd.view ) ), up = 0.8f * cross( rd.view, right );
+	// get camera controls.
+	bool moved = false;
+	if (f.keys['A']) rd.eye += right * -1.0f * delta_time_s * 10, moved = true;
+	if (f.keys['D']) rd.eye += right * delta_time_s * 10, moved = true;
+	if (f.keys['W']) rd.eye += rd.view * delta_time_s * 10, moved = true;
+	if (f.keys['S']) rd.eye += rd.view * -1.0f * delta_time_s * 10, moved = true;
+	if (f.keys['R']) rd.eye += up * delta_time_s * 20, moved = true;
+	if (f.keys['F']) rd.eye += up * -1.0f * delta_time_s * 20, moved = true;
+	if (f.keys[20]) rd.view = normalize( rd.view + right * -1.0f * delta_time_s ), moved = true;
+	if (f.keys[19]) rd.view = normalize( rd.view + right * delta_time_s ), moved = true;
+	if (f.keys[17]) rd.view = normalize( rd.view + up * -1.0f * delta_time_s ), moved = true;
+	if (f.keys[18]) rd.view = normalize( rd.view + up * delta_time_s ), moved = true;
+	// recalculate right, up
+	right = normalize( cross( bvhvec3( 0, 1, 0 ), rd.view ) ), up = 0.8f * cross( rd.view, right );
+	bvhvec3 C = rd.eye + 1.2f * rd.view;
+	rd.p0 = C - right + up, rd.p1 = C + right + up, rd.p2 = C - right - up;
+	return moved;
+}
+
+// Application Tick
+void Tick( float delta_time_s, fenster& f, uint32_t* buf )
+{
+	// handle user input and update camera
+	if (UpdateCamera( delta_time_s, f ) || frameIdx++ == 0)
+	{
+		// memset( accumulator, 0, SCRWIDTH * SCRHEIGHT * sizeof( bvhvec3 ) );
+		spp = 1;
+	}
+	// render on the GPU
+	// ..
+	// print frame time / rate in window title
+	char title[50];
+	sprintf( title, "tiny_bvh %.2f s %.2f Hz", delta_time_s, 1.0f / delta_time_s );
+	fenster_update_title( &f, title );
+}
+
+// Application Shutdown
+void Shutdown()
+{
+	// save camera position / direction to file
+	std::fstream s = std::fstream{ "camera.bin", s.binary | s.out };
+	s.write( (char*)&rd, sizeof( rd ) );
+}
\ No newline at end of file
diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp
index e5a3140..38d2db2 100644
--- a/tiny_bvh_speedtest.cpp
+++ b/tiny_bvh_speedtest.cpp
@@ -687,7 +687,11 @@ int main()
 #if defined TRAVERSE_OPTIMIZED_ST || defined TRAVERSE_4WAY_OPTIMIZED
 
 	printf( "Optimized BVH performance - Optimizing... " );
-	if (!bvh_verbose) bvh_verbose = new BVH_Verbose( *bvh );
+	if (!bvh_verbose) 
+	{
+		bvh_verbose = new BVH_Verbose();
+		bvh_verbose->ConvertFrom( *bvh );
+	}
 	t.reset();
 	bvh_verbose->Optimize( 1000000 ); // optimize the raw SBVH
 	bvh->ConvertFrom( *bvh_verbose );
diff --git a/tiny_bvh_test.sln b/tiny_bvh_test.sln
index 90d589f..cdea067 100644
--- a/tiny_bvh_test.sln
+++ b/tiny_bvh_test.sln
@@ -15,6 +15,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_pt", "vcproj\tiny_
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_anim", "vcproj\tiny_bvh_anim.vcxproj", "{2DC14EA1-D6E9-45F1-A6CC-6E0A0BAD8A9B}"
 EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_gpu", "vcproj\tiny_bvh_gpu.vcxproj", "{4B0A219D-68D0-41EA-A0C4-8EB9E6171218}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|x64 = Debug|x64
@@ -71,6 +73,14 @@ Global
 		{2DC14EA1-D6E9-45F1-A6CC-6E0A0BAD8A9B}.Release|x64.Build.0 = Release|x64
 		{2DC14EA1-D6E9-45F1-A6CC-6E0A0BAD8A9B}.Release|x86.ActiveCfg = Release|Win32
 		{2DC14EA1-D6E9-45F1-A6CC-6E0A0BAD8A9B}.Release|x86.Build.0 = Release|Win32
+		{4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Debug|x64.ActiveCfg = Debug|x64
+		{4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Debug|x64.Build.0 = Debug|x64
+		{4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Debug|x86.ActiveCfg = Debug|Win32
+		{4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Debug|x86.Build.0 = Debug|Win32
+		{4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Release|x64.ActiveCfg = Release|x64
+		{4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Release|x64.Build.0 = Release|x64
+		{4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Release|x86.ActiveCfg = Release|Win32
+		{4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Release|x86.Build.0 = Release|Win32
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE
diff --git a/traverse.cl b/traverse.cl
index 133e6af..f9257a5 100644
--- a/traverse.cl
+++ b/traverse.cl
@@ -93,16 +93,11 @@ inline float fmax_fmax( const float a, const float b, const float c )
 // 
 // ============================================================================
 
-void kernel traverse_ailalaine( global struct BVHNodeAlt* altNode, global unsigned* idx, global float4* verts, global struct Ray* rayData )
+float4 traverse_ailalaine( const float3 O, const float3 D, const float3 rD, const float tmax )
 {
-	// fetch ray
-	const unsigned threadId = get_global_id( 0 );
-	const float3 O = rayData[threadId].O.xyz;
-	const float3 D = rayData[threadId].D.xyz;
-	const float3 rD = rayData[threadId].rD.xyz;
-	float4 hit;
-	hit.x = 1e30f; // ignoring value set in ray to spare one memory transaction.
 	// traverse BVH
+	const float4 hit;
+	hit.x = tmax;
 	unsigned node = 0, stack[64], stackPtr = 0;
 	while (1)
 	{
@@ -169,6 +164,88 @@ void kernel traverse_ailalaine( global struct BVHNodeAlt* altNode, global unsign
 	rayData[threadId].hit = hit;
 }
 
+bool isoccluded_ailalaine( const float3 O, const float3 D, const float3 rD, const float tmax )
+{
+	// traverse BVH
+	const float4 hit;
+	hit.x = tmax;
+	unsigned node = 0, stack[64], stackPtr = 0;
+	while (1)
+	{
+		// fetch the node
+		const float4 lmin = altNode[node].lmin, lmax = altNode[node].lmax;
+		const float4 rmin = altNode[node].rmin, rmax = altNode[node].rmax;
+		const unsigned triCount = as_uint( rmin.w );
+		if (triCount > 0)
+		{
+			// process leaf node
+			const unsigned firstTri = as_uint( rmax.w );
+			for (unsigned i = 0; i < triCount; i++)
+			{
+				const unsigned triIdx = idx[firstTri + i];
+				const float4* tri = verts + 3 * triIdx;
+				// triangle intersection - Möller-Trumbore
+				const float4 edge1 = tri[1] - tri[0], edge2 = tri[2] - tri[0];
+				const float3 h = cross( D, edge2.xyz );
+				const float a = dot( edge1.xyz, h );
+				if (fabs( a ) < 0.0000001f) continue;
+				const float f = 1 / a;
+				const float3 s = O - tri[0].xyz;
+				const float u = f * dot( s, h );
+				if (u < 0 || u > 1) continue;
+				const float3 q = cross( s, edge1.xyz );
+				const float v = f * dot( D, q );
+				if (v < 0 || u + v > 1) continue;
+				const float d = f * dot( edge2.xyz, q );
+				if (d > 0.0f && d < hit.x) return true;
+			}
+			if (stackPtr == 0) break;
+			node = stack[--stackPtr];
+			continue;
+		}
+		unsigned left = as_uint( lmin.w ), right = as_uint( lmax.w );
+		// child AABB intersection tests
+		const float3 t1a = (lmin.xyz - O) * rD, t2a = (lmax.xyz - O) * rD;
+		const float3 t1b = (rmin.xyz - O) * rD, t2b = (rmax.xyz - O) * rD;
+		const float3 minta = fmin( t1a, t2a ), maxta = fmax( t1a, t2a );
+		const float3 mintb = fmin( t1b, t2b ), maxtb = fmax( t1b, t2b );
+		const float tmina = fmax( fmax( fmax( minta.x, minta.y ), minta.z ), 0 );
+		const float tminb = fmax( fmax( fmax( mintb.x, mintb.y ), mintb.z ), 0 );
+		const float tmaxa = fmin( fmin( fmin( maxta.x, maxta.y ), maxta.z ), hit.x );
+		const float tmaxb = fmin( fmin( fmin( maxtb.x, maxtb.y ), maxtb.z ), hit.x );
+		float dist1 = tmina > tmaxa ? 1e30f : tmina;
+		float dist2 = tminb > tmaxb ? 1e30f : tminb;
+		// traverse nearest child first
+		if (dist1 > dist2)
+		{
+			float h = dist1; dist1 = dist2; dist2 = h;
+			unsigned t = left; left = right; right = t;
+		}
+		if (dist1 == 1e30f)
+		{
+			if (stackPtr == 0) break; else node = stack[--stackPtr];
+		}
+		else
+		{
+			node = left;
+			if (dist2 != 1e30f) stack[stackPtr++] = right;
+		}
+	}
+	// no hit found
+	return false;
+}
+
+void kernel batch_ailalaine( global struct BVHNodeAlt* altNode, global unsigned* idx, global float4* verts, global struct Ray* rayData )
+{
+	// fetch ray
+	const unsigned threadId = get_global_id( 0 );
+	const float3 O = rayData[threadId].O.xyz;
+	const float3 D = rayData[threadId].D.xyz;
+	const float3 rD = rayData[threadId].rD.xyz;
+	float4 hit = traverse_ailalaine( O, D, rD, 1e30f );
+	rayData[threadId].hit = hit;
+}
+
 // ============================================================================
 //
 //        T R A V E R S E _ G P U 4 W A Y
diff --git a/vcproj/tiny_bvh_anim.vcxproj b/vcproj/tiny_bvh_anim.vcxproj
index 52528a2..90d4d9a 100644
--- a/vcproj/tiny_bvh_anim.vcxproj
+++ b/vcproj/tiny_bvh_anim.vcxproj
@@ -119,7 +119,8 @@
       <PreprocessorDefinitions>_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <ConformanceMode>true</ConformanceMode>
       <LanguageStandard>stdcpp20</LanguageStandard>
-      <AdditionalIncludeDirectories>..\external\embree\include</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>
+      </AdditionalIncludeDirectories>
     </ClCompile>
     <Link>
       <SubSystem>Windows</SubSystem>
@@ -137,7 +138,8 @@
       <PreprocessorDefinitions>NDEBUG;_WINDOWS;%(PreprocessorDefinitions);	_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
       <ConformanceMode>true</ConformanceMode>
       <LanguageStandard>stdcpp20</LanguageStandard>
-      <AdditionalIncludeDirectories>..\external\embree\include</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>
+      </AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
       <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
diff --git a/vcproj/tiny_bvh_fenster.vcxproj b/vcproj/tiny_bvh_fenster.vcxproj
index d0a4e69..2095c56 100644
--- a/vcproj/tiny_bvh_fenster.vcxproj
+++ b/vcproj/tiny_bvh_fenster.vcxproj
@@ -119,7 +119,8 @@
       <PreprocessorDefinitions>_DEBUG;_WINDOWS;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
       <ConformanceMode>true</ConformanceMode>
       <LanguageStandard>stdcpp20</LanguageStandard>
-      <AdditionalIncludeDirectories>..\external\embree\include</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>
+      </AdditionalIncludeDirectories>
     </ClCompile>
     <Link>
       <SubSystem>Windows</SubSystem>
@@ -137,7 +138,8 @@
       <PreprocessorDefinitions>NDEBUG;_WINDOWS;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
       <ConformanceMode>true</ConformanceMode>
       <LanguageStandard>stdcpp20</LanguageStandard>
-      <AdditionalIncludeDirectories>..\external\embree\include</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>
+      </AdditionalIncludeDirectories>
     </ClCompile>
     <Link>
       <SubSystem>Windows</SubSystem>
diff --git a/vcproj/tiny_bvh_gpu.vcxproj b/vcproj/tiny_bvh_gpu.vcxproj
new file mode 100644
index 0000000..fcb157f
--- /dev/null
+++ b/vcproj/tiny_bvh_gpu.vcxproj
@@ -0,0 +1,163 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup Label="ProjectConfigurations">
+    <ProjectConfiguration Include="Debug|Win32">
+      <Configuration>Debug</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|Win32">
+      <Configuration>Release</Configuration>
+      <Platform>Win32</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Debug|x64">
+      <Configuration>Debug</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+    <ProjectConfiguration Include="Release|x64">
+      <Configuration>Release</Configuration>
+      <Platform>x64</Platform>
+    </ProjectConfiguration>
+  </ItemGroup>
+  <ItemGroup>
+    <ClCompile Include="..\tiny_bvh_gpu.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\tiny_bvh.h" />
+    <ClInclude Include="..\tiny_ocl.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\traverse.cl" />
+    <None Include="..\wavefront.cl" />
+  </ItemGroup>
+  <PropertyGroup Label="Globals">
+    <VCProjectVersion>17.0</VCProjectVersion>
+    <Keyword>Win32Proj</Keyword>
+    <ProjectGuid>{4B0A219D-68D0-41EA-A0C4-8EB9E6171218}</ProjectGuid>
+    <RootNamespace>tinybvhgpu</RootNamespace>
+    <WindowsTargetPlatformVersion>10.0</WindowsTargetPlatformVersion>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>Unicode</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>true</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <CharacterSet>NotSet</CharacterSet>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
+    <ConfigurationType>Application</ConfigurationType>
+    <UseDebugLibraries>false</UseDebugLibraries>
+    <PlatformToolset>v143</PlatformToolset>
+    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <CharacterSet>NotSet</CharacterSet>
+  </PropertyGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+  <ImportGroup Label="ExtensionSettings">
+  </ImportGroup>
+  <ImportGroup Label="Shared">
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+  </ImportGroup>
+  <PropertyGroup Label="UserMacros" />
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <OutDir>$(SolutionDir)</OutDir>
+    <IntDir>$(Platform)\$(ProjectName)\$(Configuration)\</IntDir>
+  </PropertyGroup>
+  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <OutDir>$(SolutionDir)</OutDir>
+    <IntDir>$(Platform)\$(ProjectName)\$(Configuration)\</IntDir>
+  </PropertyGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>_DEBUG;_WINDOWS;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <LanguageStandard>stdcpp20</LanguageStandard>
+      <AdditionalIncludeDirectories>../external/OpenCL/inc/</AdditionalIncludeDirectories>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalLibraryDirectories>../external/embree/lib</AdditionalLibraryDirectories>
+      <AdditionalDependencies>$(CoreLibraryDependencies);%(AdditionalDependencies);embree4.lib;tbb12.lib</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
+    <ClCompile>
+      <WarningLevel>Level3</WarningLevel>
+      <FunctionLevelLinking>true</FunctionLevelLinking>
+      <IntrinsicFunctions>true</IntrinsicFunctions>
+      <SDLCheck>true</SDLCheck>
+      <PreprocessorDefinitions>NDEBUG;_WINDOWS;%(PreprocessorDefinitions);	_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
+      <ConformanceMode>true</ConformanceMode>
+      <LanguageStandard>stdcpp20</LanguageStandard>
+      <AdditionalIncludeDirectories>../external/OpenCL/inc/</AdditionalIncludeDirectories>
+      <Optimization>Full</Optimization>
+      <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
+      <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
+      <FloatingPointModel>Fast</FloatingPointModel>
+    </ClCompile>
+    <Link>
+      <SubSystem>Windows</SubSystem>
+      <EnableCOMDATFolding>true</EnableCOMDATFolding>
+      <OptimizeReferences>true</OptimizeReferences>
+      <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalLibraryDirectories>../external/embree/lib</AdditionalLibraryDirectories>
+      <AdditionalDependencies>$(CoreLibraryDependencies);%(AdditionalDependencies);embree4.lib;tbb12.lib</AdditionalDependencies>
+    </Link>
+  </ItemDefinitionGroup>
+  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+  <ImportGroup Label="ExtensionTargets">
+  </ImportGroup>
+</Project>
\ No newline at end of file
diff --git a/vcproj/tiny_bvh_gpu.vcxproj.filters b/vcproj/tiny_bvh_gpu.vcxproj.filters
new file mode 100644
index 0000000..84ca75d
--- /dev/null
+++ b/vcproj/tiny_bvh_gpu.vcxproj.filters
@@ -0,0 +1,13 @@
+ï»¿<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <ItemGroup>
+    <ClCompile Include="..\tiny_bvh_gpu.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="..\tiny_bvh.h" />
+  </ItemGroup>
+  <ItemGroup>
+    <None Include="..\traverse.cl" />
+    <None Include="..\wavefront.cl" />
+  </ItemGroup>
+</Project>
\ No newline at end of file
diff --git a/vcproj/tiny_bvh_pt.vcxproj b/vcproj/tiny_bvh_pt.vcxproj
index 7b67f5e..3271fad 100644
--- a/vcproj/tiny_bvh_pt.vcxproj
+++ b/vcproj/tiny_bvh_pt.vcxproj
@@ -119,7 +119,8 @@
       <PreprocessorDefinitions>_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <ConformanceMode>true</ConformanceMode>
       <LanguageStandard>stdcpp20</LanguageStandard>
-      <AdditionalIncludeDirectories>..\external\embree\include</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>
+      </AdditionalIncludeDirectories>
     </ClCompile>
     <Link>
       <SubSystem>Windows</SubSystem>
@@ -137,7 +138,8 @@
       <PreprocessorDefinitions>NDEBUG;_WINDOWS;%(PreprocessorDefinitions);	_CRT_SECURE_NO_WARNINGS</PreprocessorDefinitions>
       <ConformanceMode>true</ConformanceMode>
       <LanguageStandard>stdcpp20</LanguageStandard>
-      <AdditionalIncludeDirectories>..\external\embree\include</AdditionalIncludeDirectories>
+      <AdditionalIncludeDirectories>
+      </AdditionalIncludeDirectories>
       <Optimization>Full</Optimization>
       <FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
       <EnableEnhancedInstructionSet>AdvancedVectorExtensions</EnableEnhancedInstructionSet>
diff --git a/wavefront.cl b/wavefront.cl
new file mode 100644
index 0000000..361ca31
--- /dev/null
+++ b/wavefront.cl
@@ -0,0 +1,100 @@
+// gpu-side path tracing (wavefront)
+
+#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
+
+__global int pathCount, shadowRays;
+
+// struct for rendering parameters - keep in sync with CPU side.
+struct RenderData
+{
+	float4 eye, C, p0, p1, p2;
+	uint frameIdx, scrWidth, scrHeight;
+	uint dummy;
+};
+
+// PathState: path throughput, current extension ray, pixel index
+struct PathState
+{
+	float4 T; // xyz = rgb, postponed pdf in w
+	float4 O; // pixel index in O.w
+	float4 D; // t in D.w
+	float4 hit;
+};
+
+// Potential contribution: shadoww ray origin & dir, throughput
+struct Potential 
+{ 
+	float4 T;
+	float4 O; // pixel index in O.w
+	float4 D; // t in D.w
+};
+
+// atomic counter management - prepare for primary ray wavefront
+void kernel ResetCounters( primaryRayCount )
+{
+	if (get_global_id( 0 ) != 0) return;
+	pathCount = primaryRayCount;
+	shadowRays = 0;
+}
+
+// primary ray generation
+void kernel Generate( struct RenderData rd, global PathState* raysOut  )
+{
+	const unsigned x = get_global_id( 0 ), y = get_global_id( 1 );
+	const float u = (float)x / (float)rd.scrWidth;
+	const float v = (float)y / (float)rd.scrHeight;
+	const float3 P = rd.p0 + u * (rd.p1 - rd.p0) + v * (rd.p2 - rd.p0);
+	raysOut[threadId].T = (float4)( 1, 1, 1, 1 /* pdf */ );
+	raysOut[threadId].O = (float4)( eye, as_float( x + y * rd * scrWidth ) );
+	raysOut[threadId].D = (float4)( normalize( P - eye ), 1e30f );
+}
+
+// extend: trace the generated rays to find the nearest intersection point.
+void kernel Extend( global PathState* raysIn )
+{
+	// obtain task
+	const int pathId = atomic_dec( &pathCount ) - 1;
+	if (pathId < 0) return;
+	const O4 = raysIn[pathId].O;
+	const D4 = raysIn[pathId].D;
+	const float3 rD = (float3)( 1.0f / D4.x, 1.0f / D4.y, 1.0f / D4.z, 1 );
+	raysIn[pathId].hit = traverse_ailalaine( O4.xyz, D4.xyz, rD, 1e30f );
+}
+
+// shade: process intersection results; this evaluates the BRDF and creates extension
+// rays and shadow rays.
+void kernel Shade( global float4* accumulator, global float4* raysIn, global float4* raysOut, global float4* shadowOut )
+{
+}
+
+// connect: trace shadow rays and deposit their potential contribution to the pixels
+// if not occluded.
+void kernel Connect( global float4* accumulator, global Potential* shadowIn )
+{
+	// obtain task
+	const int rayId = atomic_dec( &shadowRays ) - 1;
+	if (rayId < 0) return;
+	const T4 = shadowIn[rayId].T;
+	const O4 = shadowIn[rayId].O;
+	const D4 = shadowIn[rayId].D;
+	const float3 rD = (float3)( 1.0f / D4.x, 1.0f / D4.y, 1.0f / D4.z, 1 );
+	bool occluded = isoccluded_ailalaine( O4.xyz, D4.xyz, rD, D4.w );
+	if (occluded) return;
+	unsigned pixelIdx = as_uint( O4.w );
+	accumulator[pixelIdx] += T4;
+}
+
+// finalize: convert the accumulated values into final pixel values.
+// NOTE: rendering result is emitted to global uint array, which needs to be copied back 
+// to the host. This is not efficient. A proper scheme should use OpenGL / D3D / Vulkan 
+// interop do write directly to a texture.
+void kernel Finalize( global float4* accumulator, const float scale, global uint* pixels )
+{
+	const unsigned x = get_global_id( 0 ), y = get_global_id( 1 );
+	const unsigned pixelIdx = x + y * get_global_size( 0 );
+	const float4 p = accumulator[pixelIdx] * scale;
+	const int r = (int)(255.0f * min( 1.0f, sqrtf( p.x ) ));
+	const int g = (int)(255.0f * min( 1.0f, sqrtf( p.y ) ));
+	const int b = (int)(255.0f * min( 1.0f, sqrtf( p.z ) ));
+	pixels[pixelIdx] = (r << 16) + (g << 8) + b;
+}
\ No newline at end of file