diff --git a/README.md b/README.md index ded2a24..aa552bf 100644 --- a/README.md +++ b/README.md @@ -48,6 +48,8 @@ Apart from the default BVH layout (simply named ````BVH````), several other layo A BVH in the ````BVH```` format may be _refitted_, in case the triangles moved, using ````BVH::Refit````. Refitting is substantially faster than rebuilding and works well if the animation is subtle. Refitting does not work if polygon counts change. +New in version 1.1.3: 'Self-contained' formats may be serialized and de-serialized via ````::Save```` and ````::Load````. Currently this is supported for ````BVH8_CWBVH````, which stores vertex data in a custom format and thus does not rely on the input vertices for traversal. + # How To Use The library ````tiny_bvh.h```` is designed to be easy to use. Please have a look at tiny_bvh_minimal.cpp for an example. A Visual Studio 'solution' (.sln/.vcxproj) is included, as well as a CMake file. That being said: The examples consists of only a single source file, which can be compiled with clang or g++, e.g.: @@ -71,7 +73,7 @@ The **performance measurement tool** can be compiled with: ````g++ -std=c++20 -mavx -Ofast tiny_bvh_speedtest.cpp -o tiny_bvh_speedtest```` -# Version 1.1.2 +# Version 1.1.3 Version 1.1.0 introduced a change to the API. The single BVH class with multiple layouts has been replaced with a BVH class per layout. You can simply instantiate the desired layout; conversion (and data ownership) is then handled properly by the library. Examples: diff --git a/tiny_bvh.h b/tiny_bvh.h index a9085fe..72d751d 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -116,7 +116,7 @@ THE SOFTWARE. // library version #define TINY_BVH_VERSION_MAJOR 1 #define TINY_BVH_VERSION_MINOR 1 -#define TINY_BVH_VERSION_SUB 2 +#define TINY_BVH_VERSION_SUB 3 // ============================================================================ // @@ -907,7 +907,7 @@ class BVH8_CWBVH : public BVHBase uint32_t allocatedBlocks = 0; // node data is stored in blocks of 16 byte. uint32_t usedBlocks = 0; // actually used blocks. BVH8 bvh8; // BVH8_CWBVH is created from BVH8 and uses its data. - bool ownBVH8 = true; // False when ConvertFrom receives an external bvh8. + bool ownBVH8 = true; // false when ConvertFrom receives an external bvh8. }; // BLASInstance: A TLAS is built over BLAS instances, where a single BLAS can be @@ -2176,18 +2176,18 @@ void BVH_Verbose::MergeLeafs() // BVH_GPU implementation // ---------------------------------------------------------------------------- -BVH_GPU::~BVH_GPU() +BVH_GPU::~BVH_GPU() { if (!ownBVH) bvh = BVH(); // clear out pointers we don't own. - AlignedFree( bvhNode ); + AlignedFree( bvhNode ); } -void BVH_GPU::Build( const bvhvec4* vertices, const uint32_t primCount ) -{ - Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +void BVH_GPU::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); } -void BVH_GPU::Build( const bvhvec4slice& vertices ) -{ +void BVH_GPU::Build( const bvhvec4slice& vertices ) +{ bvh.BuildDefault( vertices ); ConvertFrom( bvh ); } @@ -2196,7 +2196,7 @@ void BVH_GPU::ConvertFrom( const BVH& original ) { // get a copy of the original bvh if (&original != &bvh) ownBVH = false; // bvh isn't ours; don't delete in destructor. - bvh = original; + bvh = original; // allocate space const uint32_t spaceNeeded = original.usedNodes; if (allocatedNodes < spaceNeeded) @@ -2285,18 +2285,18 @@ int32_t BVH_GPU::Intersect( Ray& ray ) const // BVH_SoA implementation // ---------------------------------------------------------------------------- -BVH_SoA::~BVH_SoA() +BVH_SoA::~BVH_SoA() { if (!ownBVH) bvh = BVH(); // clear out pointers we don't own. - AlignedFree( bvhNode ); + AlignedFree( bvhNode ); } -void BVH_SoA::Build( const bvhvec4* vertices, const uint32_t primCount ) -{ - Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +void BVH_SoA::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); } -void BVH_SoA::Build( const bvhvec4slice& vertices ) -{ +void BVH_SoA::Build( const bvhvec4slice& vertices ) +{ bvh.context = context; // properly propagate context to fix issue #66. bvh.BuildDefault( vertices ); ConvertFrom( bvh ); @@ -2306,7 +2306,7 @@ void BVH_SoA::ConvertFrom( const BVH& original ) { // get a copy of the original bvh if (&original != &bvh) ownBVH = false; // bvh isn't ours; don't delete in destructor. - bvh = original; + bvh = original; // allocate space const uint32_t spaceNeeded = bvh.usedNodes; if (allocatedNodes < spaceNeeded) @@ -2355,15 +2355,15 @@ void BVH_SoA::ConvertFrom( const BVH& original ) // BVH4 implementation // ---------------------------------------------------------------------------- -BVH4::~BVH4() +BVH4::~BVH4() { if (!ownBVH) bvh = BVH(); // clear out pointers we don't own. - AlignedFree( bvh4Node ); + AlignedFree( bvh4Node ); } void BVH4::Build( const bvhvec4* vertices, const uint32_t primCount ) { - Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); } void BVH4::Build( const bvhvec4slice& vertices ) { @@ -2376,7 +2376,7 @@ void BVH4::ConvertFrom( const BVH& original ) { // get a copy of the original bvh if (&original != &bvh) ownBVH = false; // bvh isn't ours; don't delete in destructor. - bvh = original; + bvh = original; // allocate space const uint32_t spaceNeeded = original.usedNodes; if (allocatedNodes < spaceNeeded) @@ -2458,19 +2458,19 @@ int32_t BVH4::Intersect( Ray& ray ) const // BVH4_CPU implementation // ---------------------------------------------------------------------------- -BVH4_CPU::~BVH4_CPU() +BVH4_CPU::~BVH4_CPU() { if (!ownBVH4) bvh4 = BVH4(); // clear out pointers we don't own. - AlignedFree( bvh4Node ); + AlignedFree( bvh4Node ); AlignedFree( bvh4Tris ); } -void BVH4_CPU::Build( const bvhvec4* vertices, const uint32_t primCount ) +void BVH4_CPU::Build( const bvhvec4* vertices, const uint32_t primCount ) { - Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); } -void BVH4_CPU::Build( const bvhvec4slice& vertices ) -{ +void BVH4_CPU::Build( const bvhvec4slice& vertices ) +{ bvh4.context = context; // properly propagate context to fix issue #66. bvh4.Build( vertices ); ConvertFrom( bvh4 ); @@ -2480,7 +2480,7 @@ void BVH4_CPU::ConvertFrom( const BVH4& original ) { // get a copy of the original bvh4 if (&original != &bvh4) ownBVH4 = false; // bvh isn't ours; don't delete in destructor. - bvh4 = original; + bvh4 = original; // Convert a 4-wide BVH to a format suitable for CPU traversal. // See Faster Incoherent Ray Traversal Using 8-Wide AVX InstructionsLayout, // Atilla T. Áfra, 2013. @@ -2564,18 +2564,18 @@ void BVH4_CPU::ConvertFrom( const BVH4& original ) // BVH4_GPU implementation // ---------------------------------------------------------------------------- -BVH4_GPU::~BVH4_GPU() +BVH4_GPU::~BVH4_GPU() { if (!ownBVH4) bvh4 = BVH4(); // clear out pointers we don't own. - AlignedFree( bvh4Data ); + AlignedFree( bvh4Data ); } -void BVH4_GPU::Build( const bvhvec4* vertices, const uint32_t primCount ) -{ - Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +void BVH4_GPU::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); } -void BVH4_GPU::Build( const bvhvec4slice& vertices ) -{ +void BVH4_GPU::Build( const bvhvec4slice& vertices ) +{ bvh4.context = context; // properly propagate context to fix issue #66. bvh4.Build( vertices ); ConvertFrom( bvh4 ); @@ -2816,18 +2816,18 @@ int32_t BVH4_GPU::Intersect( Ray& ray ) const // BVH8 implementation // ---------------------------------------------------------------------------- -BVH8::~BVH8() +BVH8::~BVH8() { if (!ownBVH) bvh = BVH(); // clear out pointers we don't own. AlignedFree( bvh8Node ); } -void BVH8::Build( const bvhvec4* vertices, const uint32_t primCount ) -{ - Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +void BVH8::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); } -void BVH8::Build( const bvhvec4slice& vertices ) -{ +void BVH8::Build( const bvhvec4slice& vertices ) +{ bvh.context = context; // properly propagate context to fix issue #66. bvh.BuildDefault( vertices ); ConvertFrom( bvh ); @@ -2837,7 +2837,7 @@ void BVH8::ConvertFrom( const BVH& original ) { // get a copy of the original if (&original != &bvh) ownBVH = false; // bvh isn't ours; don't delete in destructor. - bvh = original; + bvh = original; // allocate space // Note: The safe upper bound here is usedNodes when converting an existing // BVH2, but we need triCount * 2 to be safe in later conversions, e.g. to @@ -2963,7 +2963,7 @@ int32_t BVH8::Intersect( Ray& ray ) const // BVH8_CWBVH implementation // ---------------------------------------------------------------------------- -BVH8_CWBVH::~BVH8_CWBVH() +BVH8_CWBVH::~BVH8_CWBVH() { if (!ownBVH8) bvh8 = BVH8(); // clear out pointers we don't own. AlignedFree( bvh8Data ); @@ -2994,12 +2994,12 @@ bool BVH8_CWBVH::Load( const char* fileName ) return true; } -void BVH8_CWBVH::Build( const bvhvec4* vertices, const uint32_t primCount ) -{ - Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); +void BVH8_CWBVH::Build( const bvhvec4* vertices, const uint32_t primCount ) +{ + Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); } -void BVH8_CWBVH::Build( const bvhvec4slice& vertices ) -{ +void BVH8_CWBVH::Build( const bvhvec4slice& vertices ) +{ bvh8.context = context; // properly propagate context to fix issue #66. bvh8.Build( vertices ); ConvertFrom( bvh8 ); @@ -3009,7 +3009,7 @@ void BVH8_CWBVH::ConvertFrom( BVH8& original ) { // get a copy of the original bvh8 if (&original != &bvh8) ownBVH8 = false; // bvh isn't ours; don't delete in destructor. - bvh8 = original; + bvh8 = original; // Convert a BVH8 to the format specified in: "Efficient Incoherent Ray // Traversal on GPUs Through Compressed Wide BVHs", Ylitie et al. 2017. // Adapted from code by "AlanWBFT". diff --git a/tiny_bvh_gpu.cpp b/tiny_bvh_gpu.cpp index 3a7dd36..f38936e 100644 --- a/tiny_bvh_gpu.cpp +++ b/tiny_bvh_gpu.cpp @@ -26,7 +26,7 @@ static int triCount = 0, frameIdx = 0, spp = 0; static Kernel* init, * clear, * generate, * extend, * shade; static Kernel* updateCounters1, * updateCounters2, * traceShadows, * finalize; static Buffer* pixels, * accumulator, * raysIn, * raysOut, * connections, * triData; -static Buffer* cwbvhNodes = 0, * cwbvhTris = 0, *noise = 0; +static Buffer* cwbvhNodes = 0, * cwbvhTris = 0, * noise = 0; static size_t computeUnits; static uint32_t* blueNoise; @@ -60,7 +60,7 @@ void AddQuad( const bvhvec3 pos, const float w, const float d, int c ) // Blue noise from file void LoadBlueNoise() { - std::fstream s{ "blue_noise_128x128x8_2d.raw", s.binary | s.in }; + std::fstream s{ "blue_noise_128x128x8_2d.raw", s.binary | s.in }; s.read( (char*)blueNoise, 128 * 128 * 4 ); } diff --git a/wavefront.cl b/wavefront.cl index f67068d..dacaa06 100644 --- a/wavefront.cl +++ b/wavefront.cl @@ -25,7 +25,6 @@ struct RenderData __global volatile int extendTasks, shadeTasks, connectTasks; __global struct RenderData rd; - // Xor32 RNG uint WangHash( uint s ) { s = (s ^ 61) ^ (s >> 16), s *= 9, s = s ^ (s >> 4), s *= 0x27d4eb2d; return s ^ (s >> 15); } uint RandomUInt( uint* seed ) { *seed ^= *seed << 13, * seed ^= *seed >> 17, * seed ^= *seed << 5; return *seed; } @@ -63,10 +62,12 @@ float3 CosWeightedDiffReflection( const float3 N, uint* seed ) } // PathState: path throughput, current extension ray, pixel index +#define PATH_LAST_SPECULAR 1 +#define PATH_VIA_DIFFUSE 2 struct PathState { - float4 T; // xyz = rgb, postponed pdf in w - float4 O; // pixel index and path depth in O.w + float4 T; // xyz = rgb, postponed MIS pdf in w + float4 O; // O.w: 24-bit pixel index, 4-bit path depth, 4-bit path flags float4 D; // t in D.w float4 hit; }; @@ -117,8 +118,8 @@ void kernel Generate( global struct PathState* raysOut, uint frameSeed ) const float u = ((float)x + RandomFloat( &seed )) / (float)get_global_size( 0 ); const float v = ((float)y + RandomFloat( &seed )) / (float)get_global_size( 1 ); const float4 P = rd.p0 + u * (rd.p1 - rd.p0) + v * (rd.p2 - rd.p0); - raysOut[id].T = (float4)(1, 1, 1, -1 /* pdf, or -1 for specular vertex */); - raysOut[id].O = (float4)(rd.eye.xyz, as_float( id << 4 /* low bits: depth */ )); + raysOut[id].T = (float4)(1, 1, 1, 1 ); + raysOut[id].O = (float4)(rd.eye.xyz, as_float( (id << 8) + PATH_LAST_SPECULAR )); raysOut[id].D = (float4)(fast_normalize( P.xyz - rd.eye.xyz ), 1e30f); raysOut[id].hit = (float4)(1e30f, 0, 0, as_float( 0 )); } @@ -161,13 +162,14 @@ void kernel Shade( global float4* accumulator, const int pathId = atomic_dec( &shadeTasks ) - 1; if (pathId < 0) break; // fetch path data - float4 T4 = raysIn[pathId].T; // xyz = rgb, postponed pdf in w - float4 O4 = raysIn[pathId].O; // pixel index in O.w - float4 D4 = raysIn[pathId].D; // t in D.w + float4 T4 = raysIn[pathId].T; // xyz = rgb, postponed pdf in w + float4 O4 = raysIn[pathId].O; // pixel index in O.w + float4 D4 = raysIn[pathId].D; // t in D.w float4 hit = raysIn[pathId].hit; // dist, u, v, prim // prepare for shading - uint depth = as_uint( O4.w ) & 15; - uint pixelIdx = as_uint( O4.w ) >> 4; + uint pathState = as_uint( O4.w ); + uint pixelIdx = pathState >> 8; + uint depth = (pathState >> 4) & 15; uint seed = WangHash( as_uint( O4.w ) + rd.frameIdx * 17117 ); float3 T = T4.xyz; float t = hit.x; @@ -186,7 +188,7 @@ void kernel Shade( global float4* accumulator, float3 lightColor = (float3)(20); if (mat == 1 /* light source */) { - if (T4.w == -1) accumulator[pixelIdx] += (float4)(T * lightColor, 1); + if (pathState & PATH_LAST_SPECULAR) accumulator[pixelIdx] += (float4)(T * lightColor, 1); continue; } float3 vert0 = v0.xyz, vert1 = verts[vertIdx + 1].xyz, vert2 = verts[vertIdx + 2].xyz; @@ -201,7 +203,7 @@ void kernel Shade( global float4* accumulator, uint newRayIdx = atomic_inc( &extendTasks ); float3 R = Reflect( D, N ); raysOut[newRayIdx].T = (float4)(T * diff, -1 /* mark vertex as specular */); - raysOut[newRayIdx].O = (float4)(I + R * EPSILON, as_float( (pixelIdx << 4) + depth + 1 )); + raysOut[newRayIdx].O = (float4)(I + R * EPSILON, as_float( (pixelIdx << 8) + ((depth + 1) << 4) + PATH_LAST_SPECULAR )); raysOut[newRayIdx].D = (float4)(R, 1e30f); continue; } @@ -221,14 +223,14 @@ void kernel Shade( global float4* accumulator, shadowOut[newShadowIdx].D = (float4)(L, dist - 2 * EPSILON); } // indirect illumination: diffuse bounce - if (depth < 3) + if (depth < 3 && (pathState & PATH_VIA_DIFFUSE) == 0 ) { uint newRayIdx = atomic_inc( &extendTasks ); float3 R = CosWeightedDiffReflection( N, &seed ); float PDF = dot( N, R ) * INVPI; T *= dot( N, R ) * BRDF * native_recip( PDF ); raysOut[newRayIdx].T = (float4)(T, 1); - raysOut[newRayIdx].O = (float4)(I + R * EPSILON, as_float( (pixelIdx << 4) + depth + 1 )); + raysOut[newRayIdx].O = (float4)(I + R * EPSILON, as_float( (pixelIdx << 8) + ((depth + 1) << 4) + PATH_VIA_DIFFUSE )); raysOut[newRayIdx].D = (float4)(R, 1e30f); } }