From 27cb047703f911f7d36e4e0ecb83429a49e49af5 Mon Sep 17 00:00:00 2001 From: Jacco Bikker Date: Tue, 26 Nov 2024 15:42:20 +0100 Subject: [PATCH] Improved speed for BVH4 / Afra layout. --- tiny_bvh.h | 215 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 126 insertions(+), 89 deletions(-) diff --git a/tiny_bvh.h b/tiny_bvh.h index fedb19a..a8b6982 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -3183,95 +3183,6 @@ int BVH::Intersect_AltSoA( Ray& ray ) const return steps; } -// Traverse a 4-way BVH stored in 'Atilla Áfra' layout. -int BVH::Intersect_Afra( Ray& ray ) const -{ - unsigned nodeIdx = 0, stack[1024], stackPtr = 0, steps = 0; - const __m128 ox4 = _mm_set1_ps( ray.O.x ), rdx4 = _mm_set1_ps( ray.rD.x ); - const __m128 oy4 = _mm_set1_ps( ray.O.y ), rdy4 = _mm_set1_ps( ray.rD.y ); - const __m128 oz4 = _mm_set1_ps( ray.O.z ), rdz4 = _mm_set1_ps( ray.rD.z ); - __m128 t4 = _mm_set1_ps( ray.hit.t ), zero4 = _mm_setzero_ps(); - __m128 idx4 = _mm_castsi128_ps( _mm_setr_epi32( 0, 1, 2, 3 ) ); - __m128 idxMask = _mm_castsi128_ps( _mm_set1_epi32( 0xfffffffc ) ); - __m128 inf4 = _mm_set1_ps( 1e30f ); - while (1) - { - const BVHNode4Alt2& node = bvh4Alt2[nodeIdx]; - steps++; - // intersect the ray with four AABBs - const __m128 x0 = _mm_sub_ps( node.xmin4, ox4 ), x1 = _mm_sub_ps( node.xmax4, ox4 ); - const __m128 y0 = _mm_sub_ps( node.ymin4, oy4 ), y1 = _mm_sub_ps( node.ymax4, oy4 ); - const __m128 z0 = _mm_sub_ps( node.zmin4, oz4 ), z1 = _mm_sub_ps( node.zmax4, oz4 ); - const __m128 tx1 = _mm_mul_ps( x0, rdx4 ), tx2 = _mm_mul_ps( x1, rdx4 ); - const __m128 ty1 = _mm_mul_ps( y0, rdy4 ), ty2 = _mm_mul_ps( y1, rdy4 ); - const __m128 tz1 = _mm_mul_ps( z0, rdz4 ), tz2 = _mm_mul_ps( z1, rdz4 ); - __m128 tmin = _mm_max_ps( _mm_max_ps( _mm_min_ps( tx1, tx2 ), _mm_min_ps( ty1, ty2 ) ), _mm_min_ps( tz1, tz2 ) ); - const __m128 tmax = _mm_min_ps( _mm_min_ps( _mm_max_ps( tx1, tx2 ), _mm_max_ps( ty1, ty2 ) ), _mm_max_ps( tz1, tz2 ) ); - const __m128 hit = _mm_and_ps( _mm_and_ps( _mm_cmpge_ps( tmax, tmin ), _mm_cmplt_ps( tmin, t4 ) ), _mm_cmpge_ps( tmax, zero4 ) ); - const int hits = _mm_movemask_ps( hit ); - nodeIdx = 0; - if (hits) - { - // blend in lane indices - tmin = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 ); - // sort - float tmp, d0 = LANE( tmin, 0 ), d1 = LANE( tmin, 1 ), d2 = LANE( tmin, 2 ), d3 = LANE( tmin, 3 ); - if (d0 > d2) tmp = d0, d0 = d2, d2 = tmp; - if (d1 > d3) tmp = d1, d1 = d3, d3 = tmp; - if (d0 > d1) tmp = d0, d0 = d1, d1 = tmp; - if (d2 > d3) tmp = d2, d2 = d3, d3 = tmp; - if (d1 > d2) tmp = d1, d1 = d2, d2 = tmp; - // process hits - float d[4] = { d0, d1, d2, d3 }; - for (int i = 0; i < 4; i++) - { - if (d[i] > 1e29f) break; - #ifdef __GNUC__ - #pragma GCC diagnostic push - #pragma GCC diagnostic ignored "-Wstrict-aliasing" - #endif - unsigned lane = *(unsigned*)&d[i] & 3; - #ifdef __GNUC__ - #pragma GCC diagnostic pop - #endif - if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? - if (node.triCount[lane] == 0) - { - const unsigned childIdx = node.childFirst[lane]; - if (!nodeIdx) nodeIdx = childIdx; else stack[stackPtr++] = childIdx; - continue; - } - const unsigned first = node.childFirst[lane], count = node.triCount[lane]; - for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf - { - const unsigned idx = triIdx[first + j], vertIdx = idx * 3; - const bvhvec4 v0 = verts[vertIdx]; - const bvhvec3 edge1 = verts[vertIdx + 1] - v0; - const bvhvec3 edge2 = verts[vertIdx + 2] - v0; - const bvhvec3 h = cross( ray.D, edge2 ); - const float a = dot( edge1, h ); - if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle - const float f = 1 / a; - const bvhvec3 s = ray.O - bvhvec3( v0 ); - const float u = f * dot( s, h ); - if (u < 0 || u > 1) continue; - const bvhvec3 q = cross( s, edge1 ); - const float v = f * dot( ray.D, q ); - if (v < 0 || u + v > 1) continue; - const float t = f * dot( edge2, q ); - if (t > 0 && t < ray.hit.t) - ray.hit.u = u, ray.hit.v = v, ray.hit.prim = idx, - ray.hit.t = t, t4 = _mm_set1_ps( t ); - } - } - } - // get next task - if (nodeIdx) continue; - if (stackPtr == 0) break; else nodeIdx = stack[--stackPtr]; - } - return steps; -} - // Intersect_CWBVH: // Intersect a compressed 8-wide BVH with a ray. For debugging only, not efficient. // Not technically limited to BVH_USEAVX, but __lzcnt and __popcnt will require @@ -3453,6 +3364,132 @@ int BVH::Intersect_CWBVH( Ray& ray ) const return 0; } +// Traverse a 4-way BVH stored in 'Atilla Áfra' layout. +int BVH::Intersect_Afra( Ray& ray ) const +{ + unsigned nodeIdx = 0, stack[1024], stackPtr = 0, steps = 0; + const __m128 ox4 = _mm_set1_ps( ray.O.x ), rdx4 = _mm_set1_ps( ray.rD.x ); + const __m128 oy4 = _mm_set1_ps( ray.O.y ), rdy4 = _mm_set1_ps( ray.rD.y ); + const __m128 oz4 = _mm_set1_ps( ray.O.z ), rdz4 = _mm_set1_ps( ray.rD.z ); + __m128 t4 = _mm_set1_ps( ray.hit.t ), zero4 = _mm_setzero_ps(); + __m128 idx4 = _mm_castsi128_ps( _mm_setr_epi32( 0, 1, 2, 3 ) ); + __m128 idxMask = _mm_castsi128_ps( _mm_set1_epi32( 0xfffffffc ) ); + __m128 inf4 = _mm_set1_ps( 1e30f ); + while (1) + { + const BVHNode4Alt2& node = bvh4Alt2[nodeIdx]; + steps++; + // intersect the ray with four AABBs + const __m128 x0 = _mm_sub_ps( node.xmin4, ox4 ), x1 = _mm_sub_ps( node.xmax4, ox4 ); + const __m128 y0 = _mm_sub_ps( node.ymin4, oy4 ), y1 = _mm_sub_ps( node.ymax4, oy4 ); + const __m128 z0 = _mm_sub_ps( node.zmin4, oz4 ), z1 = _mm_sub_ps( node.zmax4, oz4 ); + const __m128 tx1 = _mm_mul_ps( x0, rdx4 ), tx2 = _mm_mul_ps( x1, rdx4 ); + const __m128 ty1 = _mm_mul_ps( y0, rdy4 ), ty2 = _mm_mul_ps( y1, rdy4 ); + const __m128 tz1 = _mm_mul_ps( z0, rdz4 ), tz2 = _mm_mul_ps( z1, rdz4 ); + __m128 tmin = _mm_max_ps( _mm_max_ps( _mm_min_ps( tx1, tx2 ), _mm_min_ps( ty1, ty2 ) ), _mm_min_ps( tz1, tz2 ) ); + const __m128 tmax = _mm_min_ps( _mm_min_ps( _mm_max_ps( tx1, tx2 ), _mm_max_ps( ty1, ty2 ) ), _mm_max_ps( tz1, tz2 ) ); + const __m128 hit = _mm_and_ps( _mm_and_ps( _mm_cmpge_ps( tmax, tmin ), _mm_cmplt_ps( tmin, t4 ) ), _mm_cmpge_ps( tmax, zero4 ) ); + const int hitBits = _mm_movemask_ps( hit ); + const int hits = __popc( hitBits ); + nodeIdx = 0; + if (hits == 1) + { + // just one node was hit - no sorting needed. + unsigned lane = __bfind( hitBits ); + // if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? + if (node.triCount[lane] == 0) + { + nodeIdx = node.childFirst[lane]; + continue; + } + const unsigned first = node.childFirst[lane], count = node.triCount[lane]; + for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + { + const unsigned idx = triIdx[first + j], vertIdx = idx * 3; + const bvhvec4 v0 = verts[vertIdx]; + const bvhvec3 edge1 = verts[vertIdx + 1] - v0; + const bvhvec3 edge2 = verts[vertIdx + 2] - v0; + const bvhvec3 h = cross( ray.D, edge2 ); + const float a = dot( edge1, h ); + if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle + const float f = 1 / a; + const bvhvec3 s = ray.O - bvhvec3( v0 ); + const float u = f * dot( s, h ); + if (u < 0 || u > 1) continue; + const bvhvec3 q = cross( s, edge1 ); + const float v = f * dot( ray.D, q ); + if (v < 0 || u + v > 1) continue; + const float t = f * dot( edge2, q ); + if (t > 0 && t < ray.hit.t) + ray.hit.u = u, ray.hit.v = v, ray.hit.prim = idx, + ray.hit.t = t, t4 = _mm_set1_ps( t ); + } + if (stackPtr == 0) break; + nodeIdx = stack[--stackPtr]; + continue; + } + else if (hits) + { + // blend in lane indices + tmin = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 ); + // sort + float tmp, d0 = LANE( tmin, 0 ), d1 = LANE( tmin, 1 ), d2 = LANE( tmin, 2 ), d3 = LANE( tmin, 3 ); + if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; + if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; + if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; + if (d2 < d3) tmp = d2, d2 = d3, d3 = tmp; + if (d1 < d2) tmp = d1, d1 = d2, d2 = tmp; + // process hits + float d[4] = { d0, d1, d2, d3 }; + for (int i = 0; i < 4; i++) if (d[i] < 1e29f) + { + #ifdef __GNUC__ + #pragma GCC diagnostic push + #pragma GCC diagnostic ignored "-Wstrict-aliasing" + #endif + unsigned lane = *(unsigned*)&d[i] & 3; + #ifdef __GNUC__ + #pragma GCC diagnostic pop + #endif + if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? + if (node.triCount[lane] == 0) + { + const unsigned childIdx = node.childFirst[lane]; + if (nodeIdx) stack[stackPtr++] = nodeIdx; + nodeIdx = childIdx; + continue; + } + const unsigned first = node.childFirst[lane], count = node.triCount[lane]; + for (unsigned j = 0; j < count; j++) // TODO: aim for 4 prims per leaf + { + const unsigned idx = triIdx[first + j], vertIdx = idx * 3; + const bvhvec4 v0 = verts[vertIdx]; + const bvhvec3 edge1 = verts[vertIdx + 1] - v0; + const bvhvec3 edge2 = verts[vertIdx + 2] - v0; + const bvhvec3 h = cross( ray.D, edge2 ); + const float a = dot( edge1, h ); + if (fabs( a ) < 0.0000001f) continue; // ray parallel to triangle + const float f = 1 / a; + const bvhvec3 s = ray.O - bvhvec3( v0 ); + const float u = f * dot( s, h ); + if (u < 0 || u > 1) continue; + const bvhvec3 q = cross( s, edge1 ); + const float v = f * dot( ray.D, q ); + if (v < 0 || u + v > 1) continue; + const float t = f * dot( edge2, q ); + if (t > 0 && t < ray.hit.t) + ray.hit.u = u, ray.hit.v = v, ray.hit.prim = idx, + ray.hit.t = t, t4 = _mm_set1_ps( t ); + } + } + } + // get next task + if (nodeIdx) continue; + if (stackPtr == 0) break; else nodeIdx = stack[--stackPtr]; + } + return steps; +} + #endif // BVH_USEAVX // ============================================================================