diff --git a/tiny_bvh_gpu.cpp b/tiny_bvh_gpu.cpp new file mode 100644 index 0000000..92aba55 --- /dev/null +++ b/tiny_bvh_gpu.cpp @@ -0,0 +1,125 @@ +// This example shows how to build a basic GPU path tracer using +// tinybvh. TinyOCL is used to render on the GPU using OpenCL. + +#define FENSTER_APP_IMPLEMENTATION +#define SCRWIDTH 1280 +#define SCRHEIGHT 800 +#include "external/fenster.h" // https://github.com/zserge/fenster + +// This application uses tinybvh - And this file will include the implementation. +#define TINYBVH_IMPLEMENTATION +#include "tiny_bvh.h" +using namespace tinybvh; + +// This application uses tinyocl - And this file will include the implementation. +#define TINY_OCL_IMPLEMENTATION +#include "tiny_ocl.h" + +// Other includes +#include + +// Application variables + +static BVH4_GPU bvh; +static bvhvec4* tris = 0; +static int triCount = 0, frameIdx = 0, spp = 0; +static Kernel* reset, * generate, * extend, * shade, * traceShadows, * finalize; +static Buffer* pixels, * accumulator, * raysIn, * raysOut, * connections; + +// View pyramid for a pinhole camera +struct RenderData +{ + bvhvec4 eye = bvhvec4( 0, 30, 0, 0 ); + bvhvec4 view = bvhvec4( -1, 0, 0, 0 ); + bvhvec4 C, p0, p1, p2; + uint32_t frameIdx, dummy1, dummy2, dummy3; +} rd; + +// Scene management - Append a file, with optional position, scale and color override, tinyfied +void AddMesh( const char* file, float scale = 1, bvhvec3 pos = {}, int c = 0, int N = 0 ) +{ + std::fstream s{ file, s.binary | s.in }; s.read( (char*)&N, 4 ); + bvhvec4* data = (bvhvec4*)tinybvh::malloc64( (N + triCount) * 48 ); + if (tris) memcpy( data, tris, triCount * 48 ), tinybvh::free64( tris ); + tris = data, s.read( (char*)tris + triCount * 48, N * 48 ), triCount += N; + for (int* b = (int*)tris + (triCount - N) * 12, i = 0; i < N * 3; i++) + *(bvhvec3*)b = *(bvhvec3*)b * scale + pos, b[3] = c ? c : b[3], b += 4; +} + +// Application init +void Init() +{ + // create OpenCL kernels + reset = new Kernel( "wavefront.cl", "ResetCounters" ); + generate = new Kernel( "wavefront.cl", "Generate" ); + extend = new Kernel( "wavefront.cl", "Extend" ); + shade = new Kernel( "wavefront.cl", "Shade" ); + traceShadows = new Kernel( "wavefront.cl", "Connect" ); + finalize = new Kernel( "wavefront.cl", "Finalize" ); + // create OpenCL buffers + int N = SCRWIDTH * SCRHEIGHT; + pixels = new Buffer( N * sizeof( uint32_t ) ); + accumulator = new Buffer( N * sizeof( bvhvec4 ) ); + raysIn = new Buffer( N * sizeof( bvhvec4 ) * 4 ); + raysOut = new Buffer( N * sizeof( bvhvec4 ) * 4 ); + connections = new Buffer( N * sizeof( bvhvec4 ) * 3 ); + // set kernel arguments + reset->SetArguments( N ); + // load raw vertex data + AddMesh( "./testdata/cryteksponza.bin", 1, bvhvec3( 0 ), 0xffffff ); + AddMesh( "./testdata/lucy.bin", 1.1f, bvhvec3( -2, 4.1f, -3 ), 0xaaaaff ); + // build bvh (here: 'compressed wide bvh', for efficient GPU rendering) + bvh.Build( tris, triCount ); + // load camera position / direction from file + std::fstream t = std::fstream{ "camera_gpu.bin", t.binary | t.in }; + if (!t.is_open()) return; + t.read( (char*)&rd, sizeof( rd ) ); +} + +// Keyboard handling +bool UpdateCamera( float delta_time_s, fenster& f ) +{ + bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), rd.view ) ), up = 0.8f * cross( rd.view, right ); + // get camera controls. + bool moved = false; + if (f.keys['A']) rd.eye += right * -1.0f * delta_time_s * 10, moved = true; + if (f.keys['D']) rd.eye += right * delta_time_s * 10, moved = true; + if (f.keys['W']) rd.eye += rd.view * delta_time_s * 10, moved = true; + if (f.keys['S']) rd.eye += rd.view * -1.0f * delta_time_s * 10, moved = true; + if (f.keys['R']) rd.eye += up * delta_time_s * 20, moved = true; + if (f.keys['F']) rd.eye += up * -1.0f * delta_time_s * 20, moved = true; + if (f.keys[20]) rd.view = normalize( rd.view + right * -1.0f * delta_time_s ), moved = true; + if (f.keys[19]) rd.view = normalize( rd.view + right * delta_time_s ), moved = true; + if (f.keys[17]) rd.view = normalize( rd.view + up * -1.0f * delta_time_s ), moved = true; + if (f.keys[18]) rd.view = normalize( rd.view + up * delta_time_s ), moved = true; + // recalculate right, up + right = normalize( cross( bvhvec3( 0, 1, 0 ), rd.view ) ), up = 0.8f * cross( rd.view, right ); + bvhvec3 C = rd.eye + 1.2f * rd.view; + rd.p0 = C - right + up, rd.p1 = C + right + up, rd.p2 = C - right - up; + return moved; +} + +// Application Tick +void Tick( float delta_time_s, fenster& f, uint32_t* buf ) +{ + // handle user input and update camera + if (UpdateCamera( delta_time_s, f ) || frameIdx++ == 0) + { + // memset( accumulator, 0, SCRWIDTH * SCRHEIGHT * sizeof( bvhvec3 ) ); + spp = 1; + } + // render on the GPU + // .. + // print frame time / rate in window title + char title[50]; + sprintf( title, "tiny_bvh %.2f s %.2f Hz", delta_time_s, 1.0f / delta_time_s ); + fenster_update_title( &f, title ); +} + +// Application Shutdown +void Shutdown() +{ + // save camera position / direction to file + std::fstream s = std::fstream{ "camera.bin", s.binary | s.out }; + s.write( (char*)&rd, sizeof( rd ) ); +} \ No newline at end of file diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index e5a3140..38d2db2 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -687,7 +687,11 @@ int main() #if defined TRAVERSE_OPTIMIZED_ST || defined TRAVERSE_4WAY_OPTIMIZED printf( "Optimized BVH performance - Optimizing... " ); - if (!bvh_verbose) bvh_verbose = new BVH_Verbose( *bvh ); + if (!bvh_verbose) + { + bvh_verbose = new BVH_Verbose(); + bvh_verbose->ConvertFrom( *bvh ); + } t.reset(); bvh_verbose->Optimize( 1000000 ); // optimize the raw SBVH bvh->ConvertFrom( *bvh_verbose ); diff --git a/tiny_bvh_test.sln b/tiny_bvh_test.sln index 90d589f..cdea067 100644 --- a/tiny_bvh_test.sln +++ b/tiny_bvh_test.sln @@ -15,6 +15,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_pt", "vcproj\tiny_ EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_anim", "vcproj\tiny_bvh_anim.vcxproj", "{2DC14EA1-D6E9-45F1-A6CC-6E0A0BAD8A9B}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tiny_bvh_gpu", "vcproj\tiny_bvh_gpu.vcxproj", "{4B0A219D-68D0-41EA-A0C4-8EB9E6171218}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -71,6 +73,14 @@ Global {2DC14EA1-D6E9-45F1-A6CC-6E0A0BAD8A9B}.Release|x64.Build.0 = Release|x64 {2DC14EA1-D6E9-45F1-A6CC-6E0A0BAD8A9B}.Release|x86.ActiveCfg = Release|Win32 {2DC14EA1-D6E9-45F1-A6CC-6E0A0BAD8A9B}.Release|x86.Build.0 = Release|Win32 + {4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Debug|x64.ActiveCfg = Debug|x64 + {4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Debug|x64.Build.0 = Debug|x64 + {4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Debug|x86.ActiveCfg = Debug|Win32 + {4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Debug|x86.Build.0 = Debug|Win32 + {4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Release|x64.ActiveCfg = Release|x64 + {4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Release|x64.Build.0 = Release|x64 + {4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Release|x86.ActiveCfg = Release|Win32 + {4B0A219D-68D0-41EA-A0C4-8EB9E6171218}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/traverse.cl b/traverse.cl index 133e6af..f9257a5 100644 --- a/traverse.cl +++ b/traverse.cl @@ -93,16 +93,11 @@ inline float fmax_fmax( const float a, const float b, const float c ) // // ============================================================================ -void kernel traverse_ailalaine( global struct BVHNodeAlt* altNode, global unsigned* idx, global float4* verts, global struct Ray* rayData ) +float4 traverse_ailalaine( const float3 O, const float3 D, const float3 rD, const float tmax ) { - // fetch ray - const unsigned threadId = get_global_id( 0 ); - const float3 O = rayData[threadId].O.xyz; - const float3 D = rayData[threadId].D.xyz; - const float3 rD = rayData[threadId].rD.xyz; - float4 hit; - hit.x = 1e30f; // ignoring value set in ray to spare one memory transaction. // traverse BVH + const float4 hit; + hit.x = tmax; unsigned node = 0, stack[64], stackPtr = 0; while (1) { @@ -169,6 +164,88 @@ void kernel traverse_ailalaine( global struct BVHNodeAlt* altNode, global unsign rayData[threadId].hit = hit; } +bool isoccluded_ailalaine( const float3 O, const float3 D, const float3 rD, const float tmax ) +{ + // traverse BVH + const float4 hit; + hit.x = tmax; + unsigned node = 0, stack[64], stackPtr = 0; + while (1) + { + // fetch the node + const float4 lmin = altNode[node].lmin, lmax = altNode[node].lmax; + const float4 rmin = altNode[node].rmin, rmax = altNode[node].rmax; + const unsigned triCount = as_uint( rmin.w ); + if (triCount > 0) + { + // process leaf node + const unsigned firstTri = as_uint( rmax.w ); + for (unsigned i = 0; i < triCount; i++) + { + const unsigned triIdx = idx[firstTri + i]; + const float4* tri = verts + 3 * triIdx; + // triangle intersection - Möller-Trumbore + const float4 edge1 = tri[1] - tri[0], edge2 = tri[2] - tri[0]; + const float3 h = cross( D, edge2.xyz ); + const float a = dot( edge1.xyz, h ); + if (fabs( a ) < 0.0000001f) continue; + const float f = 1 / a; + const float3 s = O - tri[0].xyz; + const float u = f * dot( s, h ); + if (u < 0 || u > 1) continue; + const float3 q = cross( s, edge1.xyz ); + const float v = f * dot( D, q ); + if (v < 0 || u + v > 1) continue; + const float d = f * dot( edge2.xyz, q ); + if (d > 0.0f && d < hit.x) return true; + } + if (stackPtr == 0) break; + node = stack[--stackPtr]; + continue; + } + unsigned left = as_uint( lmin.w ), right = as_uint( lmax.w ); + // child AABB intersection tests + const float3 t1a = (lmin.xyz - O) * rD, t2a = (lmax.xyz - O) * rD; + const float3 t1b = (rmin.xyz - O) * rD, t2b = (rmax.xyz - O) * rD; + const float3 minta = fmin( t1a, t2a ), maxta = fmax( t1a, t2a ); + const float3 mintb = fmin( t1b, t2b ), maxtb = fmax( t1b, t2b ); + const float tmina = fmax( fmax( fmax( minta.x, minta.y ), minta.z ), 0 ); + const float tminb = fmax( fmax( fmax( mintb.x, mintb.y ), mintb.z ), 0 ); + const float tmaxa = fmin( fmin( fmin( maxta.x, maxta.y ), maxta.z ), hit.x ); + const float tmaxb = fmin( fmin( fmin( maxtb.x, maxtb.y ), maxtb.z ), hit.x ); + float dist1 = tmina > tmaxa ? 1e30f : tmina; + float dist2 = tminb > tmaxb ? 1e30f : tminb; + // traverse nearest child first + if (dist1 > dist2) + { + float h = dist1; dist1 = dist2; dist2 = h; + unsigned t = left; left = right; right = t; + } + if (dist1 == 1e30f) + { + if (stackPtr == 0) break; else node = stack[--stackPtr]; + } + else + { + node = left; + if (dist2 != 1e30f) stack[stackPtr++] = right; + } + } + // no hit found + return false; +} + +void kernel batch_ailalaine( global struct BVHNodeAlt* altNode, global unsigned* idx, global float4* verts, global struct Ray* rayData ) +{ + // fetch ray + const unsigned threadId = get_global_id( 0 ); + const float3 O = rayData[threadId].O.xyz; + const float3 D = rayData[threadId].D.xyz; + const float3 rD = rayData[threadId].rD.xyz; + float4 hit = traverse_ailalaine( O, D, rD, 1e30f ); + rayData[threadId].hit = hit; +} + // ============================================================================ // // T R A V E R S E _ G P U 4 W A Y diff --git a/vcproj/tiny_bvh_anim.vcxproj b/vcproj/tiny_bvh_anim.vcxproj index 52528a2..90d4d9a 100644 --- a/vcproj/tiny_bvh_anim.vcxproj +++ b/vcproj/tiny_bvh_anim.vcxproj @@ -119,7 +119,8 @@ _DEBUG;_WINDOWS;%(PreprocessorDefinitions) true stdcpp20 - ..\external\embree\include + + Windows @@ -137,7 +138,8 @@ NDEBUG;_WINDOWS;%(PreprocessorDefinitions); _CRT_SECURE_NO_WARNINGS true stdcpp20 - ..\external\embree\include + + Full Speed AdvancedVectorExtensions diff --git a/vcproj/tiny_bvh_fenster.vcxproj b/vcproj/tiny_bvh_fenster.vcxproj index d0a4e69..2095c56 100644 --- a/vcproj/tiny_bvh_fenster.vcxproj +++ b/vcproj/tiny_bvh_fenster.vcxproj @@ -119,7 +119,8 @@ _DEBUG;_WINDOWS;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS true stdcpp20 - ..\external\embree\include + + Windows @@ -137,7 +138,8 @@ NDEBUG;_WINDOWS;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS true stdcpp20 - ..\external\embree\include + + Windows diff --git a/vcproj/tiny_bvh_gpu.vcxproj b/vcproj/tiny_bvh_gpu.vcxproj new file mode 100644 index 0000000..fcb157f --- /dev/null +++ b/vcproj/tiny_bvh_gpu.vcxproj @@ -0,0 +1,163 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + + + + + + + + + + + + 17.0 + Win32Proj + {4B0A219D-68D0-41EA-A0C4-8EB9E6171218} + tinybvhgpu + 10.0 + + + + Application + true + v143 + Unicode + + + Application + false + v143 + true + Unicode + + + Application + true + v143 + NotSet + + + Application + false + v143 + true + NotSet + + + + + + + + + + + + + + + + + + + + + $(SolutionDir) + $(Platform)\$(ProjectName)\$(Configuration)\ + + + $(SolutionDir) + $(Platform)\$(ProjectName)\$(Configuration)\ + + + + Level3 + true + WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions) + true + + + Windows + true + + + + + Level3 + true + true + true + WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions) + true + + + Windows + true + true + true + + + + + Level3 + true + _DEBUG;_WINDOWS;%(PreprocessorDefinitions);_CRT_SECURE_NO_WARNINGS + true + stdcpp20 + ../external/OpenCL/inc/ + + + Windows + true + ../external/embree/lib + $(CoreLibraryDependencies);%(AdditionalDependencies);embree4.lib;tbb12.lib + + + + + Level3 + true + true + true + NDEBUG;_WINDOWS;%(PreprocessorDefinitions); _CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_WARNINGS + true + stdcpp20 + ../external/OpenCL/inc/ + Full + Speed + AdvancedVectorExtensions + Fast + + + Windows + true + true + true + ../external/embree/lib + $(CoreLibraryDependencies);%(AdditionalDependencies);embree4.lib;tbb12.lib + + + + + + \ No newline at end of file diff --git a/vcproj/tiny_bvh_gpu.vcxproj.filters b/vcproj/tiny_bvh_gpu.vcxproj.filters new file mode 100644 index 0000000..84ca75d --- /dev/null +++ b/vcproj/tiny_bvh_gpu.vcxproj.filters @@ -0,0 +1,13 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/vcproj/tiny_bvh_pt.vcxproj b/vcproj/tiny_bvh_pt.vcxproj index 7b67f5e..3271fad 100644 --- a/vcproj/tiny_bvh_pt.vcxproj +++ b/vcproj/tiny_bvh_pt.vcxproj @@ -119,7 +119,8 @@ _DEBUG;_WINDOWS;%(PreprocessorDefinitions) true stdcpp20 - ..\external\embree\include + + Windows @@ -137,7 +138,8 @@ NDEBUG;_WINDOWS;%(PreprocessorDefinitions); _CRT_SECURE_NO_WARNINGS true stdcpp20 - ..\external\embree\include + + Full Speed AdvancedVectorExtensions diff --git a/wavefront.cl b/wavefront.cl new file mode 100644 index 0000000..361ca31 --- /dev/null +++ b/wavefront.cl @@ -0,0 +1,100 @@ +// gpu-side path tracing (wavefront) + +#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable + +__global int pathCount, shadowRays; + +// struct for rendering parameters - keep in sync with CPU side. +struct RenderData +{ + float4 eye, C, p0, p1, p2; + uint frameIdx, scrWidth, scrHeight; + uint dummy; +}; + +// PathState: path throughput, current extension ray, pixel index +struct PathState +{ + float4 T; // xyz = rgb, postponed pdf in w + float4 O; // pixel index in O.w + float4 D; // t in D.w + float4 hit; +}; + +// Potential contribution: shadoww ray origin & dir, throughput +struct Potential +{ + float4 T; + float4 O; // pixel index in O.w + float4 D; // t in D.w +}; + +// atomic counter management - prepare for primary ray wavefront +void kernel ResetCounters( primaryRayCount ) +{ + if (get_global_id( 0 ) != 0) return; + pathCount = primaryRayCount; + shadowRays = 0; +} + +// primary ray generation +void kernel Generate( struct RenderData rd, global PathState* raysOut ) +{ + const unsigned x = get_global_id( 0 ), y = get_global_id( 1 ); + const float u = (float)x / (float)rd.scrWidth; + const float v = (float)y / (float)rd.scrHeight; + const float3 P = rd.p0 + u * (rd.p1 - rd.p0) + v * (rd.p2 - rd.p0); + raysOut[threadId].T = (float4)( 1, 1, 1, 1 /* pdf */ ); + raysOut[threadId].O = (float4)( eye, as_float( x + y * rd * scrWidth ) ); + raysOut[threadId].D = (float4)( normalize( P - eye ), 1e30f ); +} + +// extend: trace the generated rays to find the nearest intersection point. +void kernel Extend( global PathState* raysIn ) +{ + // obtain task + const int pathId = atomic_dec( &pathCount ) - 1; + if (pathId < 0) return; + const O4 = raysIn[pathId].O; + const D4 = raysIn[pathId].D; + const float3 rD = (float3)( 1.0f / D4.x, 1.0f / D4.y, 1.0f / D4.z, 1 ); + raysIn[pathId].hit = traverse_ailalaine( O4.xyz, D4.xyz, rD, 1e30f ); +} + +// shade: process intersection results; this evaluates the BRDF and creates extension +// rays and shadow rays. +void kernel Shade( global float4* accumulator, global float4* raysIn, global float4* raysOut, global float4* shadowOut ) +{ +} + +// connect: trace shadow rays and deposit their potential contribution to the pixels +// if not occluded. +void kernel Connect( global float4* accumulator, global Potential* shadowIn ) +{ + // obtain task + const int rayId = atomic_dec( &shadowRays ) - 1; + if (rayId < 0) return; + const T4 = shadowIn[rayId].T; + const O4 = shadowIn[rayId].O; + const D4 = shadowIn[rayId].D; + const float3 rD = (float3)( 1.0f / D4.x, 1.0f / D4.y, 1.0f / D4.z, 1 ); + bool occluded = isoccluded_ailalaine( O4.xyz, D4.xyz, rD, D4.w ); + if (occluded) return; + unsigned pixelIdx = as_uint( O4.w ); + accumulator[pixelIdx] += T4; +} + +// finalize: convert the accumulated values into final pixel values. +// NOTE: rendering result is emitted to global uint array, which needs to be copied back +// to the host. This is not efficient. A proper scheme should use OpenGL / D3D / Vulkan +// interop do write directly to a texture. +void kernel Finalize( global float4* accumulator, const float scale, global uint* pixels ) +{ + const unsigned x = get_global_id( 0 ), y = get_global_id( 1 ); + const unsigned pixelIdx = x + y * get_global_size( 0 ); + const float4 p = accumulator[pixelIdx] * scale; + const int r = (int)(255.0f * min( 1.0f, sqrtf( p.x ) )); + const int g = (int)(255.0f * min( 1.0f, sqrtf( p.y ) )); + const int b = (int)(255.0f * min( 1.0f, sqrtf( p.z ) )); + pixels[pixelIdx] = (r << 16) + (g << 8) + b; +} \ No newline at end of file