From ca5f7dcb12ef80595c93e8935b03bb12b5e4bc1a Mon Sep 17 00:00:00 2001 From: Jacco Bikker Date: Thu, 28 Nov 2024 16:50:26 +0100 Subject: [PATCH] New speedtest tool, with validation. --- tiny_bvh.h | 787 +++++++++++++++++++++++++++++------------ tiny_bvh_speedtest.cpp | 359 ++++++++++--------- 2 files changed, 733 insertions(+), 413 deletions(-) diff --git a/tiny_bvh.h b/tiny_bvh.h index 3fe6502..b5934f8 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -98,9 +98,9 @@ THE SOFTWARE. #endif // library version -#define TINY_BVH_VERSION_MAJOR 0 -#define TINY_BVH_VERSION_MINOR 9 -#define TINY_BVH_VERSION_SUB 9 +#define TINY_BVH_VERSION_MAJOR 1 +#define TINY_BVH_VERSION_MINOR 0 +#define TINY_BVH_VERSION_SUB 0 // ============================================================================ // @@ -564,6 +564,9 @@ class BVH int Intersect_Alt4BVH( Ray& ray ) const; // only for testing, not efficient. int Intersect_CWBVH( Ray& ray ) const; // only for testing, not efficient. bool IsOccluded_Wald32Byte( const Ray& ray ) const; + bool IsOccluded_AilaLaine( const Ray& ray ) const; + bool IsOccluded_AltSoA( const Ray& ray ) const; + bool IsOccluded_Afra( const Ray& ray ) const; void IntersectTri( Ray& ray, const unsigned triIdx ) const; static float IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax ); static float SA( const bvhvec3& aabbMin, const bvhvec3& aabbMax ) @@ -571,6 +574,7 @@ class BVH bvhvec3 e = aabbMax - aabbMin; // extent of the node return e.x * e.y + e.y * e.z + e.z * e.x; } + void PrecomputeTriangle( const bvhvec4* const vert, float* T ); bool ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhvec3 bmax, bvhvec3 minDim ); void RefitUpVerbose( unsigned nodeIdx ); unsigned FindBestNewPosition( const unsigned Lid ); @@ -1025,86 +1029,6 @@ void BVH::BuildHQ( const bvhvec4* vertices, const unsigned primCount ) usedBVHNodes = newNodePtr; } -bool BVH::ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhvec3 bmax, bvhvec3 minDim ) -{ - // find intersection of bmin/bmax and orig bmin/bmax - bmin = tinybvh_max( bmin, orig.bmin ); - bmax = tinybvh_min( bmax, orig.bmax ); - const bvhvec3 extent = bmax - bmin; - // Sutherland-Hodgeman against six bounding planes - unsigned Nin = 3, vidx = orig.primIdx * 3; - bvhvec3 vin[10] = { verts[vidx], verts[vidx + 1], verts[vidx + 2] }, vout[10]; - for (unsigned a = 0; a < 3; a++) - { - const float eps = minDim.cell[a]; - if (extent.cell[a] > eps) - { - unsigned Nout = 0; - const float l = bmin[a], r = bmax[a]; - for (unsigned v = 0; v < Nin; v++) - { - bvhvec3 v0 = vin[v], v1 = vin[(v + 1) % Nin]; - const bool v0in = v0[a] >= l - eps, v1in = v1[a] >= l - eps; - if (!(v0in || v1in)) continue; else if (v0in != v1in) - { - bvhvec3 C = v0 + (l - v0[a]) / (v1[a] - v0[a]) * (v1 - v0); - C[a] = l /* accurate */, vout[Nout++] = C; - } - if (v1in) vout[Nout++] = v1; - } - Nin = 0; - for (unsigned v = 0; v < Nout; v++) - { - bvhvec3 v0 = vout[v], v1 = vout[(v + 1) % Nout]; - const bool v0in = v0[a] <= r + eps, v1in = v1[a] <= r + eps; - if (!(v0in || v1in)) continue; else if (v0in != v1in) - { - bvhvec3 C = v0 + (r - v0[a]) / (v1[a] - v0[a]) * (v1 - v0); - C[a] = r /* accurate */, vin[Nin++] = C; - } - if (v1in) vin[Nin++] = v1; - } - } - } - bvhvec3 mn( 1e30f ), mx( -1e30f ); - for (unsigned i = 0; i < Nin; i++) mn = tinybvh_min( mn, vin[i] ), mx = tinybvh_max( mx, vin[i] ); - newFrag.primIdx = orig.primIdx; - newFrag.bmin = tinybvh_max( mn, bmin ), newFrag.bmax = tinybvh_min( mx, bmax ); - newFrag.clipped = 1; - return Nin > 0; -} - -// PrecomputeTriangle (helper), transforms a triangle to the format used in: -// Fast Ray-Triangle Intersections by Coordinate Transformation. Baldwin & Weber, 2016. -static void PrecomputeTriangle( const bvhvec4* const vert, float* T ) -{ - bvhvec3 v0 = vert[0], v1 = vert[1], v2 = vert[2]; - bvhvec3 e1 = v1 - v0, e2 = v2 - v0, N = cross( e1, e2 ); - float x1, x2, n = dot( v0, N ), rN; - if (fabs( N[0] ) > fabs( N[1] ) && fabs( N[0] ) > fabs( N[2] )) - { - x1 = v1.y * v0.z - v1.z * v0.y, x2 = v2.y * v0.z - v2.z * v0.y, rN = 1.0f / N.x; - T[0] = 0, T[1] = e2.z * rN, T[2] = -e2.y * rN, T[3] = x2 * rN; - T[4] = 0, T[5] = -e1.z * rN, T[6] = e1.y * rN, T[7] = -x1 * rN; - T[8] = 1, T[9] = N.y * rN, T[10] = N.z * rN, T[11] = -n * rN; - } - else if (fabs( N.y ) > fabs( N.z )) - { - x1 = v1.z * v0.x - v1.x * v0.z, x2 = v2.z * v0.x - v2.x * v0.z, rN = 1.0f / N.y; - T[0] = -e2.z * rN, T[1] = 0, T[2] = e2.x * rN, T[3] = x2 * rN; - T[4] = e1.z * rN, T[5] = 0, T[6] = -e1.x * rN, T[7] = -x1 * rN; - T[8] = N.x * rN, T[9] = 1, T[10] = N.z * rN, T[11] = -n * rN; - } - else if (fabs( N.z ) > 0) - { - x1 = v1.x * v0.y - v1.y * v0.x, x2 = v2.x * v0.y - v2.y * v0.x, rN = 1.0f / N.z; - T[0] = e2.y * rN, T[1] = -e2.x * rN, T[2] = 0, T[3] = x2 * rN; - T[4] = -e1.y * rN, T[5] = e1.x * rN, T[6] = 0, T[7] = -x1 * rN; - T[8] = N.x * rN, T[9] = N.y * rN, T[10] = 1, T[11] = -n * rN; - } - else memset( T, 0, 12 * 4 ); // cerr << "degenerate source " << endl; -} - // Convert: Change the BVH layout from one format into another. void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOriginal ) { @@ -2025,35 +1949,6 @@ void BVH::MergeLeafs() may_have_holes = true; // all over the place, in fact } -// Determine for each node in the tree the number of primitives -// stored in that subtree. Helper function for MergeLeafs. -unsigned BVH::CountSubtreeTris( const unsigned nodeIdx, unsigned* counters ) -{ - BVHNodeVerbose& node = verbose[nodeIdx]; - unsigned result = node.triCount; - if (!result) - result = CountSubtreeTris( node.left, counters ) + CountSubtreeTris( node.right, counters ); - counters[nodeIdx] = result; - return result; -} - -// Write the triangle indices stored in a subtree to a continuous -// slice in the 'newIdx' array. Helper function for MergeLeafs. -void BVH::MergeSubtree( const unsigned nodeIdx, unsigned* newIdx, unsigned& newIdxPtr ) -{ - BVHNodeVerbose& node = verbose[nodeIdx]; - if (node.isLeaf()) - { - memcpy( newIdx + newIdxPtr, triIdx + node.firstTri, node.triCount * 4 ); - newIdxPtr += node.triCount; - } - else - { - MergeSubtree( node.left, newIdx, newIdxPtr ); - MergeSubtree( node.right, newIdx, newIdxPtr ); - } -} - // Optimizing a BVH: BVH must be in 'verbose' format. // Implements "Fast Insertion-Based Optimization of Bounding Volume Hierarchies", void BVH::Optimize( const unsigned iterations, const bool convertBack ) @@ -2088,72 +1983,6 @@ void BVH::Optimize( const unsigned iterations, const bool convertBack ) } } -// RefitUpVerbose: Update bounding boxes of ancestors of the specified node. -void BVH::RefitUpVerbose( unsigned nodeIdx ) -{ - while (nodeIdx != 0xffffffff) - { - BVHNodeVerbose& node = verbose[nodeIdx]; - BVHNodeVerbose& left = verbose[node.left]; - BVHNodeVerbose& right = verbose[node.right]; - node.aabbMin = tinybvh_min( left.aabbMin, right.aabbMin ); - node.aabbMax = tinybvh_max( left.aabbMax, right.aabbMax ); - nodeIdx = node.parent; - } -} - -// FindBestNewPosition -// Part of "Fast Insertion-Based Optimization of Bounding Volume Hierarchies" -unsigned BVH::FindBestNewPosition( const unsigned Lid ) -{ - const BVHNodeVerbose& L = verbose[Lid]; - const float SA_L = SA( L.aabbMin, L.aabbMax ); - // reinsert L into BVH - unsigned taskNode[512], tasks = 1, Xbest = 0; - float taskCi[512], taskInvCi[512], Cbest = 1e30f, epsilon = 1e-10f; - taskNode[0] = 0 /* root */, taskCi[0] = 0, taskInvCi[0] = 1 / epsilon; - while (tasks > 0) - { - // 'pop' task with createst taskInvCi - float maxInvCi = 0; - unsigned bestTask = 0; - for (unsigned j = 0; j < tasks; j++) if (taskInvCi[j] > maxInvCi) maxInvCi = taskInvCi[j], bestTask = j; - const unsigned Xid = taskNode[bestTask]; - const float CiLX = taskCi[bestTask]; - taskNode[bestTask] = taskNode[--tasks], taskCi[bestTask] = taskCi[tasks], taskInvCi[bestTask] = taskInvCi[tasks]; - // execute task - const BVHNodeVerbose& X = verbose[Xid]; - if (CiLX + SA_L >= Cbest) break; - const float CdLX = SA( tinybvh_min( L.aabbMin, X.aabbMin ), tinybvh_max( L.aabbMax, X.aabbMax ) ); - const float CLX = CiLX + CdLX; - if (CLX < Cbest) Cbest = CLX, Xbest = Xid; - const float Ci = CLX - SA( X.aabbMin, X.aabbMax ); - if (Ci + SA_L < Cbest) if (!X.isLeaf()) - { - taskNode[tasks] = X.left, taskCi[tasks] = Ci, taskInvCi[tasks++] = 1.0f / (Ci + epsilon); - taskNode[tasks] = X.right, taskCi[tasks] = Ci, taskInvCi[tasks++] = 1.0f / (Ci + epsilon); - } - } - return Xbest; -} - -// ReinsertNodeVerbose -// Part of "Fast Insertion-Based Optimization of Bounding Volume Hierarchies" -void BVH::ReinsertNodeVerbose( const unsigned Lid, const unsigned Nid, const unsigned origin ) -{ - unsigned Xbest = FindBestNewPosition( Lid ); - if (Xbest == 0 || verbose[Xbest].parent == 0) Xbest = origin; - const unsigned X1 = verbose[Xbest].parent; - BVHNodeVerbose& N = verbose[Nid]; - N.left = Xbest, N.right = Lid; - N.aabbMin = tinybvh_min( verbose[Xbest].aabbMin, verbose[Lid].aabbMin ); - N.aabbMax = tinybvh_max( verbose[Xbest].aabbMax, verbose[Lid].aabbMax ); - verbose[Nid].parent = X1; - if (verbose[X1].left == Xbest) verbose[X1].left = Nid; else verbose[X1].right = Nid; - verbose[Xbest].parent = verbose[Lid].parent = Nid; - RefitUpVerbose( Nid ); -} - // Intersect a BVH with a ray. // This function returns the intersection details in Ray::hit. Additionally, // the number of steps through the BVH is returned. Visualize this to get a @@ -2218,9 +2047,10 @@ bool BVH::IsOccluded( const Ray& ray, const BVHLayout layout ) const { switch (layout) { - case WALD_32BYTE: - return IsOccluded_Wald32Byte( ray ); - break; + case WALD_32BYTE: return IsOccluded_Wald32Byte( ray ); + case AILA_LAINE: return IsOccluded_AilaLaine( ray ); + case ALT_SOA: return IsOccluded_AltSoA( ray ); + case BVH4_AFRA: return IsOccluded_Afra( ray ); default: // For now, implemented using Intersect, so we have the interface in // place. TODO: make specialized code, which will be faster. @@ -2368,6 +2198,67 @@ int BVH::Intersect_AilaLaine( Ray& ray ) const return steps; } +// Find occlusions in the alternative BVH layout (AILA_LAINE). +bool BVH::IsOccluded_AilaLaine( const Ray& ray ) const +{ + BVHNodeAlt* node = &altNode[0], * stack[64]; + unsigned stackPtr = 0, steps = 0; + while (1) + { + steps++; + if (node->isLeaf()) + { + for (unsigned i = 0; i < node->triCount; i++) + { + const unsigned vertIdx = triIdx[node->firstTri + i] * 3; + const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; + const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; + const bvhvec3 h = cross( ray.D, edge2 ); + const float a = dot( edge1, h ); + if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle + const float f = 1 / a; + const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); + const float u = f * dot( s, h ); + if (u < 0 || u > 1) continue; + const bvhvec3 q = cross( s, edge1 ); + const float v = f * dot( ray.D, q ); + if (v < 0 || u + v > 1) continue; + const float t = f * dot( edge2, q ); + if (t > 0 && t < ray.hit.t) return true; + } + if (stackPtr == 0) break; else node = stack[--stackPtr]; + continue; + } + const bvhvec3 lmin = node->lmin - ray.O, lmax = node->lmax - ray.O; + const bvhvec3 rmin = node->rmin - ray.O, rmax = node->rmax - ray.O; + float dist1 = 1e30f, dist2 = 1e30f; + const bvhvec3 t1a = lmin * ray.rD, t2a = lmax * ray.rD; + const bvhvec3 t1b = rmin * ray.rD, t2b = rmax * ray.rD; + const float tmina = tinybvh_max( tinybvh_max( tinybvh_min( t1a.x, t2a.x ), tinybvh_min( t1a.y, t2a.y ) ), tinybvh_min( t1a.z, t2a.z ) ); + const float tmaxa = tinybvh_min( tinybvh_min( tinybvh_max( t1a.x, t2a.x ), tinybvh_max( t1a.y, t2a.y ) ), tinybvh_max( t1a.z, t2a.z ) ); + const float tminb = tinybvh_max( tinybvh_max( tinybvh_min( t1b.x, t2b.x ), tinybvh_min( t1b.y, t2b.y ) ), tinybvh_min( t1b.z, t2b.z ) ); + const float tmaxb = tinybvh_min( tinybvh_min( tinybvh_max( t1b.x, t2b.x ), tinybvh_max( t1b.y, t2b.y ) ), tinybvh_max( t1b.z, t2b.z ) ); + if (tmaxa >= tmina && tmina < ray.hit.t && tmaxa >= 0) dist1 = tmina; + if (tmaxb >= tminb && tminb < ray.hit.t && tmaxb >= 0) dist2 = tminb; + unsigned lidx = node->left, ridx = node->right; + if (dist1 > dist2) + { + float t = dist1; dist1 = dist2; dist2 = t; + unsigned i = lidx; lidx = ridx; ridx = i; + } + if (dist1 == 1e30f) + { + if (stackPtr == 0) break; else node = stack[--stackPtr]; + } + else + { + node = altNode + lidx; + if (dist2 != 1e30f) stack[stackPtr++] = altNode + ridx; + } + } + return steps; +} + // Intersect_BasicBVH4. For testing the converted data only; not efficient. int BVH::Intersect_BasicBVH4( Ray& ray ) const { @@ -2689,46 +2580,6 @@ void BVH::Intersect256Rays( Ray* packet ) const } } -// IntersectTri -void BVH::IntersectTri( Ray& ray, const unsigned idx ) const -{ - // Moeller-Trumbore ray/triangle intersection algorithm - const unsigned vertIdx = idx * 3; - const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; - const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; - const bvhvec3 h = cross( ray.D, edge2 ); - const float a = dot( edge1, h ); - if (fabs( a ) < 0.0000001f) return; // ray parallel to triangle - const float f = 1 / a; - const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); - const float u = f * dot( s, h ); - if (u < 0 || u > 1) return; - const bvhvec3 q = cross( s, edge1 ); - const float v = f * dot( ray.D, q ); - if (v < 0 || u + v > 1) return; - const float t = f * dot( edge2, q ); - if (t > 0 && t < ray.hit.t) - { - // register a hit: ray is shortened to t - ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = idx; - } -} - -// IntersectAABB -float BVH::IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax ) -{ - // "slab test" ray/AABB intersection - float tx1 = (aabbMin.x - ray.O.x) * ray.rD.x, tx2 = (aabbMax.x - ray.O.x) * ray.rD.x; - float tmin = tinybvh_min( tx1, tx2 ), tmax = tinybvh_max( tx1, tx2 ); - float ty1 = (aabbMin.y - ray.O.y) * ray.rD.y, ty2 = (aabbMax.y - ray.O.y) * ray.rD.y; - tmin = tinybvh_max( tmin, tinybvh_min( ty1, ty2 ) ); - tmax = tinybvh_min( tmax, tinybvh_max( ty1, ty2 ) ); - float tz1 = (aabbMin.z - ray.O.z) * ray.rD.z, tz2 = (aabbMax.z - ray.O.z) * ray.rD.z; - tmin = tinybvh_max( tmin, tinybvh_min( tz1, tz2 ) ); - tmax = tinybvh_min( tmax, tinybvh_max( tz1, tz2 ) ); - if (tmax >= tmin && tmin < ray.hit.t && tmax >= 0) return tmin; else return 1e30f; -} - // ============================================================================ // // I M P L E M E N T A T I O N - A V X / S S E C O D E @@ -3212,13 +3063,102 @@ int BVH::Intersect_AltSoA( Ray& ray ) const if (dist1 == 1e30f) { if (stackPtr == 0) break; else node = stack[--stackPtr]; - } + } else { node = alt2Node + lidx; if (dist2 != 1e30f) stack[stackPtr++] = alt2Node + ridx; } +} + return steps; +} + +// Find occlusions in the second alternative BVH layout (ALT_SOA). +bool BVH::IsOccluded_AltSoA( const Ray& ray ) const +{ + BVHNodeAlt2* node = &alt2Node[0], * stack[64]; + unsigned stackPtr = 0, steps = 0; + const __m128 Ox4 = _mm_set1_ps( ray.O.x ), rDx4 = _mm_set1_ps( ray.rD.x ); + const __m128 Oy4 = _mm_set1_ps( ray.O.y ), rDy4 = _mm_set1_ps( ray.rD.y ); + const __m128 Oz4 = _mm_set1_ps( ray.O.z ), rDz4 = _mm_set1_ps( ray.rD.z ); + // const __m128 inf4 = _mm_set1_ps( 1e30f ); + while (1) + { + steps++; + if (node->isLeaf()) + { + for (unsigned i = 0; i < node->triCount; i++) + { + const unsigned tidx = triIdx[node->firstTri + i], vertIdx = tidx * 3; + const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; + const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; + const bvhvec3 h = cross( ray.D, edge2 ); + const float a = dot( edge1, h ); + if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle + const float f = 1 / a; + const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); + const float u = f * dot( s, h ); + if (u < 0 || u > 1) continue; + const bvhvec3 q = cross( s, edge1 ); + const float v = f * dot( ray.D, q ); + if (v < 0 || u + v > 1) continue; + const float t = f * dot( edge2, q ); + if (t >= 0 && t <= ray.hit.t) return true; + } + if (stackPtr == 0) break; else node = stack[--stackPtr]; + continue; + } + __m128 x4 = _mm_mul_ps( _mm_sub_ps( node->xxxx, Ox4 ), rDx4 ); + __m128 y4 = _mm_mul_ps( _mm_sub_ps( node->yyyy, Oy4 ), rDy4 ); + __m128 z4 = _mm_mul_ps( _mm_sub_ps( node->zzzz, Oz4 ), rDz4 ); + // transpose + __m128 t0 = _mm_unpacklo_ps( x4, y4 ), t2 = _mm_unpacklo_ps( z4, z4 ); + __m128 t1 = _mm_unpackhi_ps( x4, y4 ), t3 = _mm_unpackhi_ps( z4, z4 ); + __m128 xyzw1a = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + __m128 xyzw2a = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + __m128 xyzw1b = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + __m128 xyzw2b = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + // process + __m128 tmina4 = _mm_min_ps( xyzw1a, xyzw2a ), tmaxa4 = _mm_max_ps( xyzw1a, xyzw2a ); + __m128 tminb4 = _mm_min_ps( xyzw1b, xyzw2b ), tmaxb4 = _mm_max_ps( xyzw1b, xyzw2b ); + // transpose back + t0 = _mm_unpacklo_ps( tmina4, tmaxa4 ), t2 = _mm_unpacklo_ps( tminb4, tmaxb4 ); + t1 = _mm_unpackhi_ps( tmina4, tmaxa4 ), t3 = _mm_unpackhi_ps( tminb4, tmaxb4 ); + x4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + y4 = _mm_shuffle_ps( t0, t2, _MM_SHUFFLE( 3, 2, 3, 2 ) ); + z4 = _mm_shuffle_ps( t1, t3, _MM_SHUFFLE( 1, 0, 1, 0 ) ); + unsigned lidx = node->left, ridx = node->right; + const __m128 min4 = _mm_max_ps( _mm_max_ps( _mm_max_ps( x4, y4 ), z4 ), _mm_setzero_ps() ); + const __m128 max4 = _mm_min_ps( _mm_min_ps( _mm_min_ps( x4, y4 ), z4 ), _mm_set1_ps( ray.hit.t ) ); + #if 0 + // TODO: why is this slower on gen14? + const float tmina_0 = LANE( min4, 0 ), tmaxa_1 = LANE( max4, 1 ); + const float tminb_2 = LANE( min4, 2 ), tmaxb_3 = LANE( max4, 3 ); + t0 = _mm_shuffle_ps( max4, max4, _MM_SHUFFLE( 1, 3, 1, 3 ) ); + t1 = _mm_shuffle_ps( min4, min4, _MM_SHUFFLE( 0, 2, 0, 2 ) ); + t0 = _mm_blendv_ps( inf4, t1, _mm_cmpge_ps( t0, t1 ) ); + float dist1 = LANE( t0, 1 ), dist2 = LANE( t0, 0 ); + #else + const float tmina_0 = LANE( min4, 0 ), tmaxa_1 = LANE( max4, 1 ); + const float tminb_2 = LANE( min4, 2 ), tmaxb_3 = LANE( max4, 3 ); + float dist1 = tmaxa_1 >= tmina_0 ? tmina_0 : 1e30f; + float dist2 = tmaxb_3 >= tminb_2 ? tminb_2 : 1e30f; + #endif + if (dist1 > dist2) + { + float t = dist1; dist1 = dist2; dist2 = t; + unsigned i = lidx; lidx = ridx; ridx = i; + } + if (dist1 == 1e30f) + { + if (stackPtr == 0) break; else node = stack[--stackPtr]; } + else + { + node = alt2Node + lidx; + if (dist2 != 1e30f) stack[stackPtr++] = alt2Node + ridx; + } +} return steps; } @@ -3397,7 +3337,7 @@ int BVH::Intersect_CWBVH( Ray& ray ) const } // Traverse a 4-way BVH stored in 'Atilla Áfra' layout. -inline void IntersectTri4( Ray& r, __m128& t4, const float* T ) +inline void IntersectCompactTri( Ray& r, __m128& t4, const float* T ) { const float transS = T[8] * r.O.x + T[9] * r.O.y + T[10] * r.O.z + T[11]; const float transD = T[8] * r.D.x + T[9] * r.D.y + T[10] * r.D.z; @@ -3407,14 +3347,7 @@ inline void IntersectTri4( Ray& r, __m128& t4, const float* T ) const float u = T[0] * wr.x + T[1] * wr.y + T[2] * wr.z + T[3]; const float v = T[4] * wr.x + T[5] * wr.y + T[6] * wr.z + T[7]; const bool hit = u >= 0 && v >= 0 && u + v < 1; - if (hit) - { - r.hit.u = u; - r.hit.v = v; - r.hit.prim = *(unsigned*)&T[15]; - r.hit.t = ta; - t4 = _mm_set1_ps( ta ); - } + if (hit) r.hit = { ta, u, v, *(unsigned*)&T[15] }, t4 = _mm_set1_ps( ta ); } int BVH::Intersect_Afra( Ray& ray ) const { @@ -3453,7 +3386,7 @@ int BVH::Intersect_Afra( Ray& ray ) const { const unsigned first = node.childFirst[lane]; for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf - IntersectTri4( ray, t4, (float*)(bvh4Tris + first + j * 4 ) ); + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); if (stackPtr == 0) break; nodeIdx = stack[--stackPtr]; } @@ -3481,7 +3414,7 @@ int BVH::Intersect_Afra( Ray& ray ) const { const unsigned first = node.childFirst[lane0]; for (unsigned j = 0; j < triCount0; j++) // TODO: aim for 4 prims per leaf - IntersectTri4( ray, t4, (float*)(bvh4Tris + first + j * 4 ) ); + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); nodeIdx = 0; } // process second lane @@ -3494,7 +3427,7 @@ int BVH::Intersect_Afra( Ray& ray ) const { const unsigned first = node.childFirst[lane1]; for (unsigned j = 0; j < triCount1; j++) // TODO: aim for 4 prims per leaf - IntersectTri4( ray, t4, (float*)(bvh4Tris + first + j * 4 ) ); + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); } } else if (hits == 3 /* 8% */) @@ -3523,7 +3456,7 @@ int BVH::Intersect_Afra( Ray& ray ) const } const unsigned first = node.childFirst[lane], count = node.triCount[lane]; for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf - IntersectTri4( ray, t4, (float*)(bvh4Tris + first + j * 4 ) ); + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); } } else /* hits == 4, 2%: rare */ @@ -3553,7 +3486,7 @@ int BVH::Intersect_Afra( Ray& ray ) const } const unsigned first = node.childFirst[lane], count = node.triCount[lane]; for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf - IntersectTri4( ray, t4, (float*)(bvh4Tris + first + j * 4 ) ); + IntersectCompactTri( ray, t4, (float*)(bvh4Tris + first + j * 4) ); } } // get next task @@ -3563,6 +3496,165 @@ int BVH::Intersect_Afra( Ray& ray ) const return steps; } +// Find occlusions in a 4-way BVH stored in 'Atilla Áfra' layout. +inline bool OccludedCompactTri( const Ray& r, const float* T ) +{ + const float transS = T[8] * r.O.x + T[9] * r.O.y + T[10] * r.O.z + T[11]; + const float transD = T[8] * r.D.x + T[9] * r.D.y + T[10] * r.D.z; + const float ta = -transS / transD; + if (ta <= 0 || ta >= r.hit.t) return false; + const bvhvec3 wr = r.O + ta * r.D; + const float u = T[0] * wr.x + T[1] * wr.y + T[2] * wr.z + T[3]; + const float v = T[4] * wr.x + T[5] * wr.y + T[6] * wr.z + T[7]; + return u >= 0 && v >= 0 && u + v < 1; +} +bool BVH::IsOccluded_Afra( const Ray& ray ) const +{ + unsigned nodeIdx = 0, stack[1024], stackPtr = 0; + const __m128 ox4 = _mm_set1_ps( ray.O.x ), rdx4 = _mm_set1_ps( ray.rD.x ); + const __m128 oy4 = _mm_set1_ps( ray.O.y ), rdy4 = _mm_set1_ps( ray.rD.y ); + const __m128 oz4 = _mm_set1_ps( ray.O.z ), rdz4 = _mm_set1_ps( ray.rD.z ); + __m128 t4 = _mm_set1_ps( ray.hit.t ), zero4 = _mm_setzero_ps(); + const __m128 idx4 = _mm_castsi128_ps( _mm_setr_epi32( 0, 1, 2, 3 ) ); + const __m128 idxMask = _mm_castsi128_ps( _mm_set1_epi32( 0xfffffffc ) ); + const __m128 inf4 = _mm_set1_ps( 1e30f ); + while (1) + { + const BVHNode4Alt2& node = bvh4Alt2[nodeIdx]; + // intersect the ray with four AABBs + const __m128 xmin4 = node.xmin4, xmax4 = node.xmax4; + const __m128 ymin4 = node.ymin4, ymax4 = node.ymax4; + const __m128 zmin4 = node.zmin4, zmax4 = node.zmax4; + const __m128 x0 = _mm_sub_ps( xmin4, ox4 ), x1 = _mm_sub_ps( xmax4, ox4 ); + const __m128 y0 = _mm_sub_ps( ymin4, oy4 ), y1 = _mm_sub_ps( ymax4, oy4 ); + const __m128 z0 = _mm_sub_ps( zmin4, oz4 ), z1 = _mm_sub_ps( zmax4, oz4 ); + const __m128 tx1 = _mm_mul_ps( x0, rdx4 ), tx2 = _mm_mul_ps( x1, rdx4 ); + const __m128 ty1 = _mm_mul_ps( y0, rdy4 ), ty2 = _mm_mul_ps( y1, rdy4 ); + const __m128 tz1 = _mm_mul_ps( z0, rdz4 ), tz2 = _mm_mul_ps( z1, rdz4 ); + const __m128 txmin = _mm_min_ps( tx1, tx2 ), tymin = _mm_min_ps( ty1, ty2 ), tzmin = _mm_min_ps( tz1, tz2 ); + const __m128 txmax = _mm_max_ps( tx1, tx2 ), tymax = _mm_max_ps( ty1, ty2 ), tzmax = _mm_max_ps( tz1, tz2 ); + const __m128 tmin = _mm_max_ps( _mm_max_ps( txmin, tymin ), tzmin ); + const __m128 tmax = _mm_min_ps( _mm_min_ps( txmax, tymax ), tzmax ); + const __m128 hit = _mm_and_ps( _mm_and_ps( _mm_cmpge_ps( tmax, tmin ), _mm_cmplt_ps( tmin, t4 ) ), _mm_cmpge_ps( tmax, zero4 ) ); + const int hitBits = _mm_movemask_ps( hit ), hits = __popc( hitBits ); + if (hits == 1 /* 43% */) + { + // just one node was hit - no sorting needed. + const unsigned lane = __bfind( hitBits ), count = node.triCount[lane]; + if (count == 0) nodeIdx = node.childFirst[lane]; else + { + const unsigned first = node.childFirst[lane]; + for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + if (stackPtr == 0) break; + nodeIdx = stack[--stackPtr]; + } + continue; + } + if (hits == 0 /* 29% */) + { + if (stackPtr == 0) break; + nodeIdx = stack[--stackPtr]; + continue; + } + if (hits == 2 /* 16% */) + { + // two nodes hit + unsigned lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); + float dist0 = ((float*)&tmin)[lane0], dist1 = ((float*)&tmin)[lane1]; + if (dist1 < dist0) + { + unsigned t = lane0; lane0 = lane1; lane1 = t; + float ft = dist0; dist0 = dist1; dist1 = ft; + } + const unsigned triCount0 = node.triCount[lane0], triCount1 = node.triCount[lane1]; + // process first lane + if (triCount0 == 0) nodeIdx = node.childFirst[lane0]; else + { + const unsigned first = node.childFirst[lane0]; + for (unsigned j = 0; j < triCount0; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + nodeIdx = 0; + } + // process second lane + if (triCount1 == 0) + { + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = node.childFirst[lane1]; + } + else + { + const unsigned first = node.childFirst[lane1]; + for (unsigned j = 0; j < triCount1; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + } + } + else if (hits == 3 /* 8% */) + { + // blend in lane indices + __m128 tm = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 ); + // sort + float tmp, d0 = LANE( tm, 0 ), d1 = LANE( tm, 1 ), d2 = LANE( tm, 2 ), d3 = LANE( tm, 3 ); + if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; + if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; + if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; + if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; + if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; + // process hits + float d[4] = { d0, d1, d2, d3 }; + nodeIdx = 0; + for (int i = 1; i < 4; i++) + { + unsigned lane = *(unsigned*)&d[i] & 3; + if (node.triCount[lane] == 0) + { + const unsigned childIdx = node.childFirst[lane]; + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = childIdx; + continue; + } + const unsigned first = node.childFirst[lane], count = node.triCount[lane]; + for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + } + } + else /* hits == 4, 2%: rare */ + { + // blend in lane indices + __m128 tm = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 ); + // sort + float tmp, d0 = LANE( tm, 0 ), d1 = LANE( tm, 1 ), d2 = LANE( tm, 2 ), d3 = LANE( tm, 3 ); + if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; + if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; + if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; + if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; + if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; + // process hits + float d[4] = { d0, d1, d2, d3 }; + nodeIdx = 0; + for (int i = 0; i < 4; i++) + { + unsigned lane = *(unsigned*)&d[i] & 3; + if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? + if (node.triCount[lane] == 0) + { + const unsigned childIdx = node.childFirst[lane]; + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = childIdx; + continue; + } + const unsigned first = node.childFirst[lane], count = node.triCount[lane]; + for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + if (OccludedCompactTri( ray, (float*)(bvh4Tris + first + j * 4) )) return true; + } + } + // get next task + if (nodeIdx) continue; + if (stackPtr == 0) break; else nodeIdx = stack[--stackPtr]; + } + return false; +} + #endif // BVH_USEAVX // ============================================================================ @@ -3852,6 +3944,229 @@ int BVH::Intersect_AltSoA( Ray& ray ) const #endif // BVH_USENEON +// ============================================================================ +// +// H E L P E R S +// +// ============================================================================ + +// IntersectTri +void BVH::IntersectTri( Ray& ray, const unsigned idx ) const +{ + // Moeller-Trumbore ray/triangle intersection algorithm + const unsigned vertIdx = idx * 3; + const bvhvec3 edge1 = verts[vertIdx + 1] - verts[vertIdx]; + const bvhvec3 edge2 = verts[vertIdx + 2] - verts[vertIdx]; + const bvhvec3 h = cross( ray.D, edge2 ); + const float a = dot( edge1, h ); + if (fabs( a ) < 0.0000001f) return; // ray parallel to triangle + const float f = 1 / a; + const bvhvec3 s = ray.O - bvhvec3( verts[vertIdx] ); + const float u = f * dot( s, h ); + if (u < 0 || u > 1) return; + const bvhvec3 q = cross( s, edge1 ); + const float v = f * dot( ray.D, q ); + if (v < 0 || u + v > 1) return; + const float t = f * dot( edge2, q ); + if (t > 0 && t < ray.hit.t) + { + // register a hit: ray is shortened to t + ray.hit.t = t, ray.hit.u = u, ray.hit.v = v, ray.hit.prim = idx; + } +} + +// IntersectAABB +float BVH::IntersectAABB( const Ray& ray, const bvhvec3& aabbMin, const bvhvec3& aabbMax ) +{ + // "slab test" ray/AABB intersection + float tx1 = (aabbMin.x - ray.O.x) * ray.rD.x, tx2 = (aabbMax.x - ray.O.x) * ray.rD.x; + float tmin = tinybvh_min( tx1, tx2 ), tmax = tinybvh_max( tx1, tx2 ); + float ty1 = (aabbMin.y - ray.O.y) * ray.rD.y, ty2 = (aabbMax.y - ray.O.y) * ray.rD.y; + tmin = tinybvh_max( tmin, tinybvh_min( ty1, ty2 ) ); + tmax = tinybvh_min( tmax, tinybvh_max( ty1, ty2 ) ); + float tz1 = (aabbMin.z - ray.O.z) * ray.rD.z, tz2 = (aabbMax.z - ray.O.z) * ray.rD.z; + tmin = tinybvh_max( tmin, tinybvh_min( tz1, tz2 ) ); + tmax = tinybvh_min( tmax, tinybvh_max( tz1, tz2 ) ); + if (tmax >= tmin && tmin < ray.hit.t && tmax >= 0) return tmin; else return 1e30f; +} + +// PrecomputeTriangle (helper), transforms a triangle to the format used in: +// Fast Ray-Triangle Intersections by Coordinate Transformation. Baldwin & Weber, 2016. +void BVH::PrecomputeTriangle( const bvhvec4* const vert, float* T ) +{ + bvhvec3 v0 = vert[0], v1 = vert[1], v2 = vert[2]; + bvhvec3 e1 = v1 - v0, e2 = v2 - v0, N = cross( e1, e2 ); + float x1, x2, n = dot( v0, N ), rN; + if (fabs( N[0] ) > fabs( N[1] ) && fabs( N[0] ) > fabs( N[2] )) + { + x1 = v1.y * v0.z - v1.z * v0.y, x2 = v2.y * v0.z - v2.z * v0.y, rN = 1.0f / N.x; + T[0] = 0, T[1] = e2.z * rN, T[2] = -e2.y * rN, T[3] = x2 * rN; + T[4] = 0, T[5] = -e1.z * rN, T[6] = e1.y * rN, T[7] = -x1 * rN; + T[8] = 1, T[9] = N.y * rN, T[10] = N.z * rN, T[11] = -n * rN; + } + else if (fabs( N.y ) > fabs( N.z )) + { + x1 = v1.z * v0.x - v1.x * v0.z, x2 = v2.z * v0.x - v2.x * v0.z, rN = 1.0f / N.y; + T[0] = -e2.z * rN, T[1] = 0, T[2] = e2.x * rN, T[3] = x2 * rN; + T[4] = e1.z * rN, T[5] = 0, T[6] = -e1.x * rN, T[7] = -x1 * rN; + T[8] = N.x * rN, T[9] = 1, T[10] = N.z * rN, T[11] = -n * rN; + } + else if (fabs( N.z ) > 0) + { + x1 = v1.x * v0.y - v1.y * v0.x, x2 = v2.x * v0.y - v2.y * v0.x, rN = 1.0f / N.z; + T[0] = e2.y * rN, T[1] = -e2.x * rN, T[2] = 0, T[3] = x2 * rN; + T[4] = -e1.y * rN, T[5] = e1.x * rN, T[6] = 0, T[7] = -x1 * rN; + T[8] = N.x * rN, T[9] = N.y * rN, T[10] = 1, T[11] = -n * rN; + } + else memset( T, 0, 12 * 4 ); // cerr << "degenerate source " << endl; +} + +// ClipFrag (helper), clip a triangle against an AABB. +// Can probably be done a lot more efficiently. Used in SBVH construction. +bool BVH::ClipFrag( const Fragment& orig, Fragment& newFrag, bvhvec3 bmin, bvhvec3 bmax, bvhvec3 minDim ) +{ + // find intersection of bmin/bmax and orig bmin/bmax + bmin = tinybvh_max( bmin, orig.bmin ); + bmax = tinybvh_min( bmax, orig.bmax ); + const bvhvec3 extent = bmax - bmin; + // Sutherland-Hodgeman against six bounding planes + unsigned Nin = 3, vidx = orig.primIdx * 3; + bvhvec3 vin[10] = { verts[vidx], verts[vidx + 1], verts[vidx + 2] }, vout[10]; + for (unsigned a = 0; a < 3; a++) + { + const float eps = minDim.cell[a]; + if (extent.cell[a] > eps) + { + unsigned Nout = 0; + const float l = bmin[a], r = bmax[a]; + for (unsigned v = 0; v < Nin; v++) + { + bvhvec3 v0 = vin[v], v1 = vin[(v + 1) % Nin]; + const bool v0in = v0[a] >= l - eps, v1in = v1[a] >= l - eps; + if (!(v0in || v1in)) continue; else if (v0in != v1in) + { + bvhvec3 C = v0 + (l - v0[a]) / (v1[a] - v0[a]) * (v1 - v0); + C[a] = l /* accurate */, vout[Nout++] = C; + } + if (v1in) vout[Nout++] = v1; + } + Nin = 0; + for (unsigned v = 0; v < Nout; v++) + { + bvhvec3 v0 = vout[v], v1 = vout[(v + 1) % Nout]; + const bool v0in = v0[a] <= r + eps, v1in = v1[a] <= r + eps; + if (!(v0in || v1in)) continue; else if (v0in != v1in) + { + bvhvec3 C = v0 + (r - v0[a]) / (v1[a] - v0[a]) * (v1 - v0); + C[a] = r /* accurate */, vin[Nin++] = C; + } + if (v1in) vin[Nin++] = v1; + } + } + } + bvhvec3 mn( 1e30f ), mx( -1e30f ); + for (unsigned i = 0; i < Nin; i++) mn = tinybvh_min( mn, vin[i] ), mx = tinybvh_max( mx, vin[i] ); + newFrag.primIdx = orig.primIdx; + newFrag.bmin = tinybvh_max( mn, bmin ), newFrag.bmax = tinybvh_min( mx, bmax ); + newFrag.clipped = 1; + return Nin > 0; +} + +// RefitUpVerbose: Update bounding boxes of ancestors of the specified node. +void BVH::RefitUpVerbose( unsigned nodeIdx ) +{ + while (nodeIdx != 0xffffffff) + { + BVHNodeVerbose& node = verbose[nodeIdx]; + BVHNodeVerbose& left = verbose[node.left]; + BVHNodeVerbose& right = verbose[node.right]; + node.aabbMin = tinybvh_min( left.aabbMin, right.aabbMin ); + node.aabbMax = tinybvh_max( left.aabbMax, right.aabbMax ); + nodeIdx = node.parent; + } +} + +// FindBestNewPosition +// Part of "Fast Insertion-Based Optimization of Bounding Volume Hierarchies" +unsigned BVH::FindBestNewPosition( const unsigned Lid ) +{ + const BVHNodeVerbose& L = verbose[Lid]; + const float SA_L = SA( L.aabbMin, L.aabbMax ); + // reinsert L into BVH + unsigned taskNode[512], tasks = 1, Xbest = 0; + float taskCi[512], taskInvCi[512], Cbest = 1e30f, epsilon = 1e-10f; + taskNode[0] = 0 /* root */, taskCi[0] = 0, taskInvCi[0] = 1 / epsilon; + while (tasks > 0) + { + // 'pop' task with createst taskInvCi + float maxInvCi = 0; + unsigned bestTask = 0; + for (unsigned j = 0; j < tasks; j++) if (taskInvCi[j] > maxInvCi) maxInvCi = taskInvCi[j], bestTask = j; + const unsigned Xid = taskNode[bestTask]; + const float CiLX = taskCi[bestTask]; + taskNode[bestTask] = taskNode[--tasks], taskCi[bestTask] = taskCi[tasks], taskInvCi[bestTask] = taskInvCi[tasks]; + // execute task + const BVHNodeVerbose& X = verbose[Xid]; + if (CiLX + SA_L >= Cbest) break; + const float CdLX = SA( tinybvh_min( L.aabbMin, X.aabbMin ), tinybvh_max( L.aabbMax, X.aabbMax ) ); + const float CLX = CiLX + CdLX; + if (CLX < Cbest) Cbest = CLX, Xbest = Xid; + const float Ci = CLX - SA( X.aabbMin, X.aabbMax ); + if (Ci + SA_L < Cbest) if (!X.isLeaf()) + { + taskNode[tasks] = X.left, taskCi[tasks] = Ci, taskInvCi[tasks++] = 1.0f / (Ci + epsilon); + taskNode[tasks] = X.right, taskCi[tasks] = Ci, taskInvCi[tasks++] = 1.0f / (Ci + epsilon); + } + } + return Xbest; +} + +// ReinsertNodeVerbose +// Part of "Fast Insertion-Based Optimization of Bounding Volume Hierarchies" +void BVH::ReinsertNodeVerbose( const unsigned Lid, const unsigned Nid, const unsigned origin ) +{ + unsigned Xbest = FindBestNewPosition( Lid ); + if (Xbest == 0 || verbose[Xbest].parent == 0) Xbest = origin; + const unsigned X1 = verbose[Xbest].parent; + BVHNodeVerbose& N = verbose[Nid]; + N.left = Xbest, N.right = Lid; + N.aabbMin = tinybvh_min( verbose[Xbest].aabbMin, verbose[Lid].aabbMin ); + N.aabbMax = tinybvh_max( verbose[Xbest].aabbMax, verbose[Lid].aabbMax ); + verbose[Nid].parent = X1; + if (verbose[X1].left == Xbest) verbose[X1].left = Nid; else verbose[X1].right = Nid; + verbose[Xbest].parent = verbose[Lid].parent = Nid; + RefitUpVerbose( Nid ); +} + +// Determine for each node in the tree the number of primitives +// stored in that subtree. Helper function for MergeLeafs. +unsigned BVH::CountSubtreeTris( const unsigned nodeIdx, unsigned* counters ) +{ + BVHNodeVerbose& node = verbose[nodeIdx]; + unsigned result = node.triCount; + if (!result) + result = CountSubtreeTris( node.left, counters ) + CountSubtreeTris( node.right, counters ); + counters[nodeIdx] = result; + return result; +} + +// Write the triangle indices stored in a subtree to a continuous +// slice in the 'newIdx' array. Helper function for MergeLeafs. +void BVH::MergeSubtree( const unsigned nodeIdx, unsigned* newIdx, unsigned& newIdxPtr ) +{ + BVHNodeVerbose& node = verbose[nodeIdx]; + if (node.isLeaf()) + { + memcpy( newIdx + newIdxPtr, triIdx + node.firstTri, node.triCount * 4 ); + newIdxPtr += node.triCount; + } + else + { + MergeSubtree( node.left, newIdx, newIdxPtr ); + MergeSubtree( node.right, newIdx, newIdxPtr ); + } +} + } // namespace tinybvh #ifdef __GNUC__ diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index 659ce41..9335481 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -16,17 +16,17 @@ #define BUILD_REFERENCE #define BUILD_AVX #define BUILD_NEON -#define BUILD_SBVH +// #define BUILD_SBVH #define TRAVERSE_2WAY_ST #define TRAVERSE_ALT2WAY_ST #define TRAVERSE_SOA2WAY_ST +#define TRAVERSE_4WAY #define TRAVERSE_2WAY_MT #define TRAVERSE_2WAY_MT_PACKET #define TRAVERSE_OPTIMIZED_ST #define TRAVERSE_4WAY_OPTIMIZED -// #define TRAVERSE_2WAY_MT_DIVERGENT // skipping; needs improvement. -#define EMBREE_BUILD // win64-only for now. -#define EMBREE_TRAVERSE // win64-only for now. +// #define EMBREE_BUILD // win64-only for now. +// #define EMBREE_TRAVERSE // win64-only for now. // GPU rays: only if ENABLE_OPENCL is defined. #define GPU_2WAY @@ -61,6 +61,7 @@ ALIGNED( 64 ) bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{}; #endif int verts = 0; BVH bvh; +float traceTime, *refDist = 0; #if defined EMBREE_BUILD || defined EMBREE_TRAVERSE #include "embree4/rtcore.h" @@ -91,21 +92,6 @@ struct Timer std::chrono::high_resolution_clock::time_point start; }; -#if 0 -void dump_bitmap( Ray* rays ) -{ - unsigned char pixel[SCRWIDTH * SCRHEIGHT]; float sum; - for (int s, x, y, i = 0, ty = 0; ty < SCRHEIGHT / 4; ty++) for (int tx = 0; tx < SCRWIDTH / 4; tx++) - for (y = 0; y < 4; y++) for (s = 0, x = 0; x < 4; x++) - { - for (s = 0, sum = 0; s < 16; s++, i++) sum += rays[i].hit.t == 1e30f ? 0 : rays[i].hit.t; - pixel[tx * 4 + x + (ty * 4 + y) * SCRWIDTH] = (unsigned char)((int)(sum * 0.01f) & 255); - } - FILE* f = fopen( "testimage.raw", "wb" ); - fwrite( pixel, 1, SCRWIDTH * SCRHEIGHT, f ); // for debugging, forget fclose -} -#endif - void sphere_flake( float x, float y, float z, float s, int d = 0 ) { // procedural tesselated sphere flake object @@ -128,6 +114,64 @@ void sphere_flake( float x, float y, float z, float s, int d = 0 ) if (d < 3) sphere_flake( x, y, z - s * 1.5f, s * 0.5f, d + 1 ); } +float TestPrimaryRays( BVH::BVHLayout layout, Ray* batch, unsigned N, unsigned passes ) +{ + // Primary rays: coherent batch of rays from a pinhole camera. One ray per + // pixel, organized in tiles to further increase coherence. + Timer t; + for( unsigned i = 0; i < N; i++ ) batch[i].hit.t = 1e30f; + for (unsigned pass = 0; pass < passes + 1; pass++) + { + if (pass == 1) t.reset(); // first pass is cache warming + for (unsigned i = 0; i < N; i++) bvh.Intersect( batch[i], layout ); + } + return t.elapsed() / passes; +} + +float TestShadowRays( BVH::BVHLayout layout, Ray* batch, unsigned N, unsigned passes ) +{ + // Shadow rays: coherent batch of rays from a single point to 'far away'. Shadow + // rays terminate on the first hit, and don't need sorted order. They also don't + // store intersection information, and are therefore expected to be faster than + // primary rays. + Timer t; + unsigned occluded = 0; // avoid dead code elimitation + for( unsigned i = 0; i < N; i++ ) batch[i].hit.t = 1000.0f; // shadow ray length + for (unsigned pass = 0; pass < passes + 1; pass++) + { + if (pass == 1) t.reset(); // first pass is cache warming + for (unsigned i = 0; i < N; i++) occluded += bvh.IsOccluded( batch[i], layout ) ? 1 : 2; + } + return occluded == 0 ? 0 : (t.elapsed() / passes); +} + +void ValidateTraceResult( Ray* batch, unsigned line ) +{ + float refSum = 0, batchSum = 0; + for( unsigned i = 0; i < SCRWIDTH * SCRHEIGHT; i += 4 ) + refSum += refDist[i] == 1e30f ? 100 : refDist[i], + batchSum += batch[i].hit.t == 1e30f ? 100 : batch[i].hit.t; + float diff = fabs( refSum - batchSum); + if (diff / refSum > 0.0001f) + { + fprintf( stderr, "Validation failed on line %i - dumping img.raw.\n", line ); + unsigned char pixel[SCRWIDTH * SCRHEIGHT]; + for ( unsigned i = 0, ty = 0; ty < SCRHEIGHT / 4; ty++) for (unsigned tx = 0; tx < SCRWIDTH / 4; tx++) + { + for (unsigned y = 0; y < 4; y++) for (unsigned x = 0; x < 4; x++, i++) + { + float col = batch[i].hit.t == 1e30f ? 0 : batch[i].hit.t; + pixel[tx * 4 + x + (ty * 4 + y) * SCRWIDTH] = (unsigned char)((int)(col * 0.1f) & 255); + } + } + std::fstream s{ "img.raw", s.binary | s.out }; + s.seekp( 0 ); + s.write( (char*)&pixel, SCRWIDTH * SCRHEIGHT ); + s.close(); + exit( 1 ); + } +} + int main() { int minor = TINY_BVH_VERSION_MINOR; @@ -229,7 +273,6 @@ int main() // T I N Y _ B V H P E R F O R M A N C E M E A S U R E M E N T S Timer t; - float mrays; // measure single-core bvh construction time - warming caches printf( "BVH construction speed\n" ); @@ -315,48 +358,112 @@ int main() #endif - // trace all rays once to warm the caches - printf( "BVH traversal speed\n" ); + // report CPU single ray, single-core performance + printf( "BVH traversal speed - single-threaded\n" ); #ifdef TRAVERSE_2WAY_ST - // trace all rays three times to estimate average performance - // - single core version - printf( "- CPU, coherent, basic 2-way layout, ST: " ); - for (int pass = 0; pass < 4; pass++) - { - if (pass == 1) t.reset(); // first pass is cache warming - for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i] ); - } - float traceTimeST = t.elapsed() / 3.0f; - mrays = (float)Nsmall / traceTimeST; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeST * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f ); + // WALD_32BYTE - Have this enabled at all times if validation is desired. + printf( "- WALD_32BYTE - primary: " ); + traceTime = TestPrimaryRays( BVH::WALD_32BYTE, smallBatch, Nsmall, 3 ); + refDist = new float[Nsmall]; + for( int i = 0; i < Nsmall; i++ ) refDist[i] = smallBatch[i].hit.t; + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); + traceTime = TestShadowRays( BVH::WALD_32BYTE, smallBatch, Nsmall, 3 ); + printf( "shadow: %5.1fms (%7.2fMRays/s)\n", traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); #endif #ifdef TRAVERSE_ALT2WAY_ST - // trace all rays three times to estimate average performance - // - single core version, alternative bvh layout - printf( "- CPU, coherent, alt 2-way layout, ST: " ); + // AILA_LAINE bvh.Convert( BVH::WALD_32BYTE, BVH::AILA_LAINE ); - for (int pass = 0; pass < 4; pass++) + printf( "- AILA_LAINE - primary: " ); + traceTime = TestPrimaryRays( BVH::AILA_LAINE, smallBatch, Nsmall, 3 ); + ValidateTraceResult( smallBatch, __LINE__ ); + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); + traceTime = TestShadowRays( BVH::AILA_LAINE, smallBatch, Nsmall, 3 ); + printf( "shadow: %5.1fms (%7.2fMRays/s)\n", traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); + +#endif + +#ifdef TRAVERSE_SOA2WAY_ST + + // AILA_LAINE + bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA ); + printf( "- ALT_SOA - primary: " ); + traceTime = TestPrimaryRays( BVH::ALT_SOA, smallBatch, Nsmall, 3 ); + ValidateTraceResult( smallBatch, __LINE__ ); + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); + traceTime = TestShadowRays( BVH::ALT_SOA, smallBatch, Nsmall, 3 ); + printf( "shadow: %5.1fms (%7.2fMRays/s)\n", traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); + +#endif + +#ifdef TRAVERSE_4WAY + + // AILA_LAINE + bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 ); + bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_AFRA ); + printf( "- BVH4_AFRA - primary: " ); + traceTime = TestPrimaryRays( BVH::BVH4_AFRA, smallBatch, Nsmall, 3 ); + ValidateTraceResult( smallBatch, __LINE__ ); + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); + traceTime = TestShadowRays( BVH::BVH4_AFRA, smallBatch, Nsmall, 3 ); + printf( "shadow: %5.1fms (%7.2fMRays/s)\n", traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); + +#endif + +#if defined TRAVERSE_OPTIMIZED_ST || defined TRAVERSE_4WAY_OPTIMIZED + + printf( "Optimized BVH performance - Optimizing... " ); + bvh.Convert( BVH::WALD_32BYTE, BVH::VERBOSE ); + t.reset(); + bvh.Optimize( 1000000 ); // optimize the raw SBVH + bvh.Convert( BVH::VERBOSE, BVH::WALD_32BYTE ); + printf( "done (%.2fs). New: %i nodes, SAH=%.2f\n", t.elapsed(), bvh.NodeCount( BVH::WALD_32BYTE ), bvh.SAHCost() ); + +#endif + +#ifdef TRAVERSE_OPTIMIZED_ST + + // ALT_SOA + if (bvh.alt2Node) bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA ); + printf( "- ALT_SOA - primary: " ); + traceTime = TestPrimaryRays( BVH::ALT_SOA, smallBatch, Nsmall, 3 ); + ValidateTraceResult( smallBatch, __LINE__ ); + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); + traceTime = TestShadowRays( BVH::ALT_SOA, smallBatch, Nsmall, 3 ); + printf( "shadow: %5.1fms (%7.2fMRays/s)\n", traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); + +#endif + +#ifdef TRAVERSE_4WAY_OPTIMIZED + + // ALT_SOA + if (bvh.bvh4Alt2) { - if (pass == 1) t.reset(); // first pass is cache warming - for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::AILA_LAINE ); + bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 ); + bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_AFRA ); } - float traceTimeAlt = t.elapsed() / 3.0f; - mrays = (float)Nsmall / traceTimeAlt; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeAlt * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f ); + printf( "- BVH4_AFRA - primary: " ); + traceTime = TestPrimaryRays( BVH::BVH4_AFRA, smallBatch, Nsmall, 3 ); + ValidateTraceResult( smallBatch, __LINE__ ); + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s), ", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); + traceTime = TestShadowRays( BVH::BVH4_AFRA, smallBatch, Nsmall, 3 ); + printf( "shadow: %5.1fms (%7.2fMRays/s)\n", traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); #endif #ifdef ENABLE_OPENCL + // report GPU performance + printf( "BVH traversal speed - GPU (OpenCL)\n" ); + #ifdef GPU_2WAY // trace the rays on GPU using OpenCL - printf( "- GPU, coherent, alt 2-way layout, ocl: " ); + printf( "- AILA_LAINE - primary: " ); bvh.Convert( BVH::WALD_32BYTE, BVH::AILA_LAINE ); // create OpenCL buffers for the BVH data calculated by tiny_bvh.h tinyocl::Buffer gpuNodes( bvh.usedAltNodes * sizeof( BVH::BVHNodeAlt ), bvh.altNode ); @@ -374,7 +481,7 @@ int main() cl_ulong startTime, endTime; // start timer and start kernel on gpu t.reset(); - float traceTimeGPU = 0; + traceTime = 0; ailalaine_kernel.SetArguments( &gpuNodes, &idxData, &triData, &rayData ); for (int pass = 0; pass < 9; pass++) { @@ -383,21 +490,20 @@ int main() clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 ); clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 ); if (pass == 0) continue; // first pass is for cache warming - traceTimeGPU += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds + traceTime += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds } // get results from GPU - this also syncs the queue. rayData.CopyFromDevice(); // report on timing - traceTimeGPU /= 8.0f; - mrays = (float)Nfull / traceTimeGPU; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeGPU * 1000, (float)Nfull * 1e-6f, mrays * 1e-6f ); + traceTime /= 8.0f; + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nfull * 1e-6f, traceTime * 1000, (float)Nfull / traceTime * 1e-6f ); #endif #ifdef GPU_4WAY // trace the rays on GPU using OpenCL - printf( "- GPU, coherent, alt 4-way layout, ocl: " ); + printf( "- BVH4_GPU - primary: " ); bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 ); bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_GPU ); // create OpenCL buffers for the BVH data calculated by tiny_bvh.h @@ -414,7 +520,7 @@ int main() #endif // start timer and start kernel on gpu t.reset(); - float traceTimeGPU4 = 0; + traceTime = 0; gpu4way_kernel.SetArguments( &gpu4Nodes, &rayData ); for (int pass = 0; pass < 9; pass++) { @@ -423,24 +529,22 @@ int main() clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 ); clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 ); if (pass == 0) continue; // first pass is for cache warming - traceTimeGPU4 += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds + traceTime += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds } // get results from GPU - this also syncs the queue. rayData.CopyFromDevice(); // report on timing - traceTimeGPU4 /= 8.0f; - mrays = (float)Nfull / traceTimeGPU4; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeGPU4 * 1000, (float)Nfull * 1e-6f, mrays * 1e-6f ); + traceTime /= 8.0f; + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nfull * 1e-6f, traceTime * 1000, (float)Nfull / traceTime * 1e-6f ); #endif #ifdef GPU_CWBVH // trace the rays on GPU using OpenCL - printf( "- GPU, coherent, CWBVH layout, " ); + printf( "- BVH8/CWBVH - primary: " ); bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH8 ); bvh.Convert( BVH::BASIC_BVH8, BVH::CWBVH ); - printf( "ocl: " ); // create OpenCL buffers for the BVH data calculated by tiny_bvh.h tinyocl::Buffer cwbvhNodes( bvh.usedCWBVHBlocks * sizeof( tinybvh::bvhvec4 ), bvh.bvh8Compact ); tinyocl::Buffer cwbvhTris( bvh.idxCount * 3 * sizeof( tinybvh::bvhvec4 ), bvh.bvh8Tris ); @@ -457,7 +561,7 @@ int main() #endif // start timer and start kernel on gpu t.reset(); - float traceTimeGPU8 = 0; + traceTime = 0; cwbvh_kernel.SetArguments( &cwbvhNodes, &cwbvhTris, &rayData ); for (int pass = 0; pass < 9; pass++) { @@ -466,41 +570,25 @@ int main() clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 ); clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 ); if (pass == 0) continue; // first pass is for cache warming - traceTimeGPU8 += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds + traceTime += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds } // get results from GPU - this also syncs the queue. rayData.CopyFromDevice(); // report on timing - traceTimeGPU8 /= 8.0f; - mrays = (float)Nfull / traceTimeGPU8; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeGPU8 * 1000, (float)Nfull * 1e-6f, mrays * 1e-6f ); + traceTime /= 8.0f; + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nfull * 1e-6f, traceTime * 1000, (float)Nfull / traceTime * 1e-6f ); #endif #endif -#ifdef TRAVERSE_SOA2WAY_ST - - // trace all rays three times to estimate average performance - // - single core version, alternative bvh layout 2 - printf( "- CPU, coherent, soa 2-way layout, ST: " ); - bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA ); - for (int pass = 0; pass < 4; pass++) - { - if (pass == 1) t.reset(); // first pass is cache warming - for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::ALT_SOA ); - } - float traceTimeAlt2 = t.elapsed() / 3.0f; - mrays = (float)Nsmall / traceTimeAlt2; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeAlt2 * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f ); - -#endif + // report threaded CPU performance + printf( "BVH traversal speed - CPU multi-core\n" ); #ifdef TRAVERSE_2WAY_MT - // trace all rays three times to estimate average performance - // - multi-core version (using OpenMP and batches of 10,000 rays) - printf( "- CPU, coherent, basic 2-way layout, MT: " ); + // using OpenMP and batches of 10,000 rays + printf( "- WALD_32BYTE - primary: " ); for (int pass = 0; pass < 4; pass++) { if (pass == 1) t.reset(); // first pass is cache warming @@ -512,17 +600,15 @@ int main() for (int i = 0; i < 10000; i++) bvh.Intersect( fullBatch[batchStart + i] ); } } - float traceTimeMT = t.elapsed() / 3.0f; - mrays = (float)Nfull / traceTimeMT; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeMT * 1000, (float)Nfull * 1e-6f, mrays * 1e-6f ); + traceTime = t.elapsed() / 3.0f; + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nfull * 1e-6f, traceTime * 1000, (float)Nfull / traceTime * 1e-6f ); #endif #ifdef TRAVERSE_2WAY_MT_PACKET - // trace all rays three times to estimate average performance - // - coherent distribution, multi-core, packet traversal - printf( "- CPU, coherent, 2-way, packets, MT: " ); + // multi-core packet traversal + printf( "- RayPacket - primary: " ); for (int pass = 0; pass < 4; pass++) { if (pass == 1) t.reset(); // first pass is cache warming @@ -534,15 +620,14 @@ int main() for (int i = 0; i < 30; i++) bvh.Intersect256Rays( fullBatch + batchStart + i * 256 ); } } - float traceTimeMTP = t.elapsed() / 3.0f; - mrays = (float)Nfull / traceTimeMTP; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeMTP * 1000, (float)Nfull * 1e-6f, mrays * 1e-6f ); + traceTime = t.elapsed() / 3.0f; + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nfull * 1e-6f, traceTime * 1000, (float)Nfull / traceTime * 1e-6f ); #ifdef BVH_USEAVX // trace all rays three times to estimate average performance // - coherent distribution, multi-core, packet traversal, SSE version - printf( "- CPU, coherent, 2-way, packets/SSE, MT: " ); + printf( "- Packet,SSE - primary: " ); for (int pass = 0; pass < 4; pass++) { if (pass == 1) t.reset(); // first pass is cache warming @@ -554,68 +639,21 @@ int main() for (int i = 0; i < 30; i++) bvh.Intersect256RaysSSE( fullBatch + batchStart + i * 256 ); } } - float traceTimeMTPS = t.elapsed() / 3.0f; - mrays = (float)Nfull / traceTimeMTPS; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeMTPS * 1000, (float)Nfull * 1e-6f, mrays * 1e-6f ); + traceTime = t.elapsed() / 3.0f; + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nfull * 1e-6f, traceTime * 1000, (float)Nfull / traceTime * 1e-6f ); #endif #endif -#ifdef TRAVERSE_OPTIMIZED_ST - - // trace all rays three times to estimate average performance - // - single core version, alternative bvh layout - printf( "Optimizing BVH, regular... " ); - bvh.Convert( BVH::WALD_32BYTE, BVH::VERBOSE ); - t.reset(); - bvh.Optimize( 1000000 ); // optimize the raw SBVH - bvh.Convert( BVH::VERBOSE, BVH::WALD_32BYTE ); - printf( "done (%.2fs). New: %i nodes, SAH=%.2f\n", t.elapsed(), bvh.NodeCount( BVH::WALD_32BYTE ), bvh.SAHCost() ); - bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA ); - printf( "- CPU, coherent, 2-way optimized, ST: " ); - for (int pass = 0; pass < 4; pass++) - { - if (pass == 1) t.reset(); // first pass is cache warming - for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::ALT_SOA ); - } - float traceTimeOpt = t.elapsed() / 3.0f; - mrays = (float)Nsmall / traceTimeOpt; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeOpt * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f ); - -#endif - -#ifdef TRAVERSE_4WAY_OPTIMIZED - - // trace all rays three times to estimate average performance - // - single core version, BVH4 in SIMD-friendly layout -#ifndef TRAVERSE_OPTIMIZED_ST - printf( "Optimizing BVH, regular... " ); - bvh.Convert( BVH::WALD_32BYTE, BVH::VERBOSE ); - t.reset(); - bvh.Optimize( 1000000 ); // optimize the raw SBVH - bvh.Convert( BVH::VERBOSE, BVH::WALD_32BYTE ); - printf( "done (%.2fs). New: %i nodes, SAH=%.2f\n", t.elapsed(), bvh.NodeCount( BVH::WALD_32BYTE ), bvh.SAHCost() ); -#endif - bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 ); - bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_AFRA ); - printf( "- CPU, coherent, 4-way optimized, ST: " ); - for (int pass = 0; pass < 4; pass++) - { - if (pass == 1) t.reset(); // first pass is cache warming - for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::BVH4_AFRA ); - } - float traceTimeAfra = t.elapsed() / 3.0f; - mrays = (float)Nsmall / traceTimeAfra; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeAfra * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f ); - -#endif + // report threaded CPU performance + printf( "BVH traversal speed - EMBREE reference\n" ); #if defined EMBREE_TRAVERSE && defined EMBREE_BUILD // trace all rays three times to estimate average performance // - coherent, Embree, single-threaded - printf( "- CPU, coherent, Embree BVH, Embree ST: " ); + printf( "- Default BVH - primary: " ); struct RTCRayHit* rayhits = (RTCRayHit*)tinybvh::malloc64( SCRWIDTH * SCRHEIGHT * 16 * sizeof( RTCRayHit ) ); // copy our rays to Embree format for (int i = 0; i < Nfull; i++) @@ -632,7 +670,7 @@ int main() if (pass == 1) t.reset(); // first pass is cache warming for (int i = 0; i < Nsmall; i++) rtcIntersect1( embreeScene, rayhits + i ); } - float traceTimeEmbree = t.elapsed() / 3.0f; + traceTime = t.elapsed() / 3.0f; // retrieve intersection results for (int i = 0; i < Nsmall; i++) { @@ -640,42 +678,9 @@ int main() fullBatch[i].hit.u = rayhits[i].hit.u, fullBatch[i].hit.u = rayhits[i].hit.v; fullBatch[i].hit.prim = rayhits[i].hit.primID; } - mrays = (float)Nsmall / traceTimeEmbree; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeEmbree * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f ); + printf( "%4.2fM rays in %5.1fms (%7.2fMRays/s)\n", (float)Nsmall * 1e-6f, traceTime * 1000, (float)Nsmall / traceTime * 1e-6f ); tinybvh::free64( rayhits ); -#endif - -#ifdef TRAVERSE_2WAY_MT_DIVERGENT - - // shuffle rays for the next experiment - TODO: replace by random bounce - for (int i = 0; i < Nfull; i++) - { - int j = (unsigned)(i + 17 * rand()) % Nfull; - Ray t = fullBatch[i]; - fullBatch[i] = fullBatch[j]; - fullBatch[j] = t; - } - - // trace all rays three times to estimate average performance - // - divergent distribution, multi-core - printf( "- CPU, incoherent, basic 2-way layout, MT: " ); - t.reset(); - for (int pass = 0; pass < 4; pass++) - { - if (pass == 1) t.reset(); // first pass is cache warming - const int batchCount = Nfull / 10000; - #pragma omp parallel for schedule(dynamic) - for (int batch = 0; batch < batchCount; batch++) - { - const int batchStart = batch * 10000; - for (int i = 0; i < 10000; i++) bvh.Intersect( fullBatch[batchStart + i] ); - } - } - float traceTimeMTI = t.elapsed() / 3.0f; - mrays = (float)Nfull / traceTimeMTI; - printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeMTI * 1000, (float)Nfull * 1e-6f, mrays * 1e-6f ); - #endif printf( "all done." );