From 637189a4139a305d6bc18c94da158e9371722e83 Mon Sep 17 00:00:00 2001 From: Jacco Bikker Date: Mon, 11 Nov 2024 14:08:51 +0100 Subject: [PATCH] Add traversal for BVH4_GPU. --- tiny_bvh.h | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 139 insertions(+), 7 deletions(-) diff --git a/tiny_bvh.h b/tiny_bvh.h index 545ef12..9660d62 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -22,7 +22,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -// Nov 11, '24: version 0.5.0 : SBVH builder. +// Nov 11, '24: version 0.5.1 : SBVH builder, BVH4_GPU traversal. // Nov 10, '24: version 0.4.2 : BVH4/8, gpu-friendly BVH4. // Nov 09, '24: version 0.4.0 : Layouts, BVH optimizer. // Nov 08, '24: version 0.3.0 @@ -77,7 +77,7 @@ THE SOFTWARE. // library version #define TINY_BVH_VERSION_MAJOR 0 #define TINY_BVH_VERSION_MINOR 5 -#define TINY_BVH_VERSION_SUB 0 +#define TINY_BVH_VERSION_SUB 1 // ============================================================================ // @@ -292,7 +292,16 @@ struct Ray class BVH { public: - enum BVHLayout { WALD_32BYTE = 1, AILA_LAINE, ALT_SOA, VERBOSE, BASIC_BVH4, BVH4_GPU, BASIC_BVH8 }; + enum BVHLayout { + WALD_32BYTE = 1, // Default format, obtained using BVH::Build variants. + AILA_LAINE, // For GPU rendering. Obtained by converting WALD_32BYTE. + ALT_SOA, // For faster CPU rendering. Obtained by converting WALD_32BYTE. + VERBOSE, // For BVH optimizing. Obtained by converting WALD_32BYTE. + BASIC_BVH4, // Input for BVH4_GPU conversion. Obtained by converting WALD_32BYTE. + BVH4_GPU, // For fast GPU rendering. Obtained by converting BASIC_BVH4. + BASIC_BVH8, // Input for CWBVH. Obtained by converting WALD_32BYTE. + CWBVH // Fastest GPU rendering. Obtained by converting BASIC_BVH8. + }; struct BVHNode { // 'Traditional' 32-byte BVH node layout, as proposed by Ingo Wald. 
@@ -434,8 +443,9 @@ class BVH private: int Intersect_Wald32Byte( Ray& ray ) const; int Intersect_AilaLaine( Ray& ray ) const; - int Intersect_BasicBVH4( Ray& ray ) const; - int Intersect_BasicBVH8( Ray& ray ) const; + int Intersect_BasicBVH4( Ray& ray ) const; // only for testing, not efficient. + int Intersect_BasicBVH8( Ray& ray ) const; // only for testing, not efficient. + int Intersect_Alt4BVH( Ray& ray ) const; // only for testing, not efficient. int Intersect_AltSoA( Ray& ray ) const; // requires BVH_USEAVX void IntersectTri( Ray& ray, const unsigned int triIdx ) const; static float IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax ); @@ -461,6 +471,8 @@ class BVH BVHNode4* bvh4Node = 0; // BVH node for 4-wide BVH. bvhvec4* bvh4Alt = 0; // 64-byte 4-wide BVH node for efficient GPU rendering. BVHNode8* bvh8Node = 0; // BVH node for 8-wide BVH. + bvhvec4* bvh8Compact = 0; // Nodes in CWBVH format. + bvhvec4* bvh8Tris = 0; // Triangle data for CWBVH nodes. bool rebuildable = true; // rebuilds are safe only if a tree has not been converted. bool refittable = true; // refits are safe only if the tree has no spatial splits. // keep track of allocated buffer size to avoid @@ -472,6 +484,7 @@ class BVH unsigned allocatedBVH4Nodes = 0; unsigned allocatedAlt4Blocks = 0; unsigned allocatedBVH8Nodes = 0; + unsigned allocatedCWBVHBlocks = 0; unsigned usedBVHNodes = 0; unsigned usedAltNodes = 0; unsigned usedAlt2Nodes = 0; @@ -479,6 +492,7 @@ class BVH unsigned usedBVH4Nodes = 0; unsigned usedAlt4Blocks = 0; unsigned usedBVH8Nodes = 0; + unsigned usedCWBVHBlocks = 0; }; } // namespace tinybvh @@ -1449,6 +1463,9 @@ int BVH::Intersect( Ray& ray, BVHLayout layout ) const case BASIC_BVH4: return Intersect_BasicBVH4( ray ); break; + case BVH4_GPU: + return Intersect_Alt4BVH( ray ); + break; case BASIC_BVH8: return Intersect_BasicBVH8( ray ); break; @@ -1535,7 +1552,7 @@ int BVH::Intersect_AilaLaine( Ray& ray ) const return steps; } -// Intersect4. 
For testing the converted data only; not efficient. +// Intersect_BasicBVH4. For testing the converted data only; not efficient. int BVH::Intersect_BasicBVH4( Ray& ray ) const { BVHNode4* node = &bvh4Node[0], * stack[64]; @@ -1556,7 +1573,7 @@ int BVH::Intersect_BasicBVH4( Ray& ray ) const return steps; } -// Intersect4. For testing the converted data only; not efficient. +// Intersect_BasicBVH8. For testing the converted data only; not efficient. int BVH::Intersect_BasicBVH8( Ray& ray ) const { BVHNode8* node = &bvh8Node[0], * stack[128]; @@ -1577,6 +1594,121 @@ int BVH::Intersect_BasicBVH8( Ray& ray ) const return steps; } +// Intersect_Alt4BVH. Traverses the bvh4Alt data (BVH4_GPU layout). For testing the converted data only; not efficient. +// This code replicates how traversal on GPU happens. Returns the number of nodes fetched; hits are written to ray.hit. +#define SWAP(A,B,C,D) t=A,A=B,B=t,t2=C,C=D,D=t2; +struct uchar4 { unsigned char x, y, z, w; }; +static uchar4 as_uchar4( const float v ) { union { float t; uchar4 t4; }; t = v; return t4; } +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif +static unsigned as_uint( const float v ) { return *(unsigned int*)&v; } +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif +int BVH::Intersect_Alt4BVH( Ray& ray ) const +{ + // traverse a blas (bottom-level acceleration structure), starting with the node at offset 0 + unsigned int offset = 0, stack[128], stackPtr = 0, t2 /* for SWAP macro */; + unsigned int steps = 0; + while (1) + { + steps++; + // fetch the node: four consecutive bvhvec4 values starting at 'offset' + const bvhvec4 data0 = bvh4Alt[offset + 0], data1 = bvh4Alt[offset + 1]; + const bvhvec4 data2 = bvh4Alt[offset + 2], data3 = bvh4Alt[offset + 3]; + // extract aabb of the node; data0.w and data1.w carry packed child bounds (decoded below) + const bvhvec3 bmin = data0, extent = data1; // pre-scaled by 1/255 + // reconstruct conservative child aabbs from bytes: d0/d1 = x min/max, d2/d3 = y min/max, d4/d5 = z min/max, one byte per child + const uchar4 d0 = as_uchar4( data0.w ), d1 = as_uchar4( data1.w ), d2 = as_uchar4( data2.x ); + const uchar4 d3 = as_uchar4( data2.y ), d4 = as_uchar4( data2.z ), d5 = as_uchar4( data2.w ); + const bvhvec3 c0min = bmin + extent * bvhvec3(d0.x, d2.x, d4.x), c0max = bmin + extent * bvhvec3(d1.x, d3.x, d5.x); +
const bvhvec3 c1min = bmin + extent * bvhvec3(d0.y, d2.y, d4.y), c1max = bmin + extent * bvhvec3(d1.y, d3.y, d5.y); + const bvhvec3 c2min = bmin + extent * bvhvec3(d0.z, d2.z, d4.z), c2max = bmin + extent * bvhvec3(d1.z, d3.z, d5.z); + const bvhvec3 c3min = bmin + extent * bvhvec3(d0.w, d2.w, d4.w), c3max = bmin + extent * bvhvec3(d1.w, d3.w, d5.w); + // intersect child aabbs; a child that is missed gets distance 1e30f + const bvhvec3 t1a = (c0min - ray.O) * ray.rD, t2a = (c0max - ray.O) * ray.rD; + const bvhvec3 t1b = (c1min - ray.O) * ray.rD, t2b = (c1max - ray.O) * ray.rD; + const bvhvec3 t1c = (c2min - ray.O) * ray.rD, t2c = (c2max - ray.O) * ray.rD; + const bvhvec3 t1d = (c3min - ray.O) * ray.rD, t2d = (c3max - ray.O) * ray.rD; + const bvhvec3 minta = tinybvh_min( t1a, t2a ), maxta = tinybvh_max( t1a, t2a ); + const bvhvec3 mintb = tinybvh_min( t1b, t2b ), maxtb = tinybvh_max( t1b, t2b ); + const bvhvec3 mintc = tinybvh_min( t1c, t2c ), maxtc = tinybvh_max( t1c, t2c ); + const bvhvec3 mintd = tinybvh_min( t1d, t2d ), maxtd = tinybvh_max( t1d, t2d ); + const float tmina = tinybvh_max( tinybvh_max( tinybvh_max( minta.x, minta.y ), minta.z ), 0.0f ); + const float tminb = tinybvh_max( tinybvh_max( tinybvh_max( mintb.x, mintb.y ), mintb.z ), 0.0f ); + const float tminc = tinybvh_max( tinybvh_max( tinybvh_max( mintc.x, mintc.y ), mintc.z ), 0.0f ); + const float tmind = tinybvh_max( tinybvh_max( tinybvh_max( mintd.x, mintd.y ), mintd.z ), 0.0f ); + const float tmaxa = tinybvh_min( tinybvh_min( tinybvh_min( maxta.x, maxta.y ), maxta.z ), ray.hit.t ); + const float tmaxb = tinybvh_min( tinybvh_min( tinybvh_min( maxtb.x, maxtb.y ), maxtb.z ), ray.hit.t ); + const float tmaxc = tinybvh_min( tinybvh_min( tinybvh_min( maxtc.x, maxtc.y ), maxtc.z ), ray.hit.t ); + const float tmaxd = tinybvh_min( tinybvh_min( tinybvh_min( maxtd.x, maxtd.y ), maxtd.z ), ray.hit.t ); + float dist0 = tmina > tmaxa ? 1e30f : tmina, dist1 = tminb > tmaxb ? 1e30f : tminb; + float dist2 = tminc > tmaxc ?
1e30f : tminc, dist3 = tmind > tmaxd ? 1e30f : tmind, t; + // get child node info fields (bit patterns of data3.xyzw), then sort the four children by descending distance so dist0 is the farthest + unsigned int c0info = as_uint( data3.x ), c1info = as_uint( data3.y ); + unsigned int c2info = as_uint( data3.z ), c3info = as_uint( data3.w ); + if (dist0 < dist2) SWAP( dist0, dist2, c0info, c2info ); + if (dist1 < dist3) SWAP( dist1, dist3, c1info, c3info ); + if (dist0 < dist1) SWAP( dist0, dist1, c0info, c1info ); + if (dist2 < dist3) SWAP( dist2, dist3, c2info, c3info ); + if (dist1 < dist2) SWAP( dist1, dist2, c1info, c2info ); + // process results, starting with farthest child, so nearest ends on top of stack; bit 31 of an info field marks a leaf, interior children with info 0 are skipped + unsigned int nextNode = 0; + unsigned int leaf[4] = { 0, 0, 0, 0 }, leafs = 0; + if (dist0 < 1e30f) + { + if (c0info & 0x80000000) leaf[leafs++] = c0info; else if (c0info) stack[stackPtr++] = c0info; + } + if (dist1 < 1e30f) + { + if (c1info & 0x80000000) leaf[leafs++] = c1info; else if (c1info) stack[stackPtr++] = c1info; + } + if (dist2 < 1e30f) + { + if (c2info & 0x80000000) leaf[leafs++] = c2info; else if (c2info) stack[stackPtr++] = c2info; + } + if (dist3 < 1e30f) + { + if (c3info & 0x80000000) leaf[leafs++] = c3info; else if (c3info) stack[stackPtr++] = c3info; + } + // process encountered leaves, if any: bits 16..30 hold the triangle count, bits 0..15 the offset of the first triangle relative to this node; each triangle takes 3 bvhvec4 values + for (unsigned int i = 0; i < leafs; i++) + { + const unsigned int N = (leaf[i] >> 16) & 0x7fff; + unsigned int triStart = offset + (leaf[i] & 0xffff); + for (unsigned int j = 0; j < N; j++, triStart += 3) + { + const bvhvec3 v0 = bvh4Alt[triStart + 0]; + const bvhvec3 edge1 = bvhvec3( bvh4Alt[triStart + 1] ) - v0; + const bvhvec3 edge2 = bvhvec3( bvh4Alt[triStart + 2] ) - v0; + const bvhvec3 h = cross( ray.D, edge2 ); + const float a = dot( edge1, h ); + if (fabs( a ) < 0.0000001f) continue; + const float f = 1 / a; + const bvhvec3 s = ray.O - v0; + const float u = f * dot( s, h ); + if (u < 0 || u > 1) continue; + const bvhvec3 q = cross( s, edge1 ); + const float v = f * dot( ray.D, q ); + if (v < 0 || u + v > 1) continue; + const float d = f * dot(
edge2, q ); + if (d <= 0.0f || d >= ray.hit.t /* i.e., t */) continue; + ray.hit.t = d, ray.hit.u = u, ray.hit.v = v; + ray.hit.prim = as_uint( bvh4Alt[triStart + 0].w ); + } + } + // continue with nearest node or first node on the stack (NOTE: nextNode is never assigned above, so this always pops the stack) + if (nextNode) offset = nextNode; else + { + if (!stackPtr) break; + offset = stack[--stackPtr]; + } + } + return steps; +} + // Intersect a WALD_32BYTE BVH with a ray packet. // The 256 rays travel together to better utilize the caches and to amortize the cost // of memory transfers over the rays in the bundle.