Skip to content

Commit

Permalink
Triangle layout optimizations.
Browse files Browse the repository at this point in the history
  • Loading branch information
jbikker committed Nov 27, 2024
1 parent 78c4b90 commit 251e8d2
Show file tree
Hide file tree
Showing 4 changed files with 87 additions and 77 deletions.
86 changes: 42 additions & 44 deletions tiny_bvh.h
Original file line number Diff line number Diff line change
Expand Up @@ -1306,10 +1306,11 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr
{
unsigned t = triIdx[childNode[i]->firstTri + j];
bvhvec4 v0 = verts[t * 3 + 0];
bvh4Alt[newAlt4Ptr + 1] = verts[t * 3 + 1] - v0;
bvh4Alt[newAlt4Ptr + 2] = verts[t * 3 + 2] - v0;
v0.w = *(float*)&t; // as_float
bvh4Alt[newAlt4Ptr++] = v0;
bvh4Alt[newAlt4Ptr++] = verts[t * 3 + 1];
bvh4Alt[newAlt4Ptr++] = verts[t * 3 + 2];
bvh4Alt[newAlt4Ptr + 0] = v0;
newAlt4Ptr += 3;
}
}
// process interior nodes
Expand Down Expand Up @@ -1431,29 +1432,23 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr
while (1)
{
BVHNode4Alt2& node = bvh4Alt2[nodeIdx];
if (nodeIdx == 58663)
{
int w = 0;
}
for (int i = 0; i < 4; i++) if (node.triCount[i] + node.childFirst[i] > 0)
{
if (!node.triCount[i]) stack[stackPtr++] = node.childFirst[i]; else
{
unsigned first = node.childFirst[i];
unsigned count = node.triCount[i];
node.childFirst[i] = triPtr;
// assign vertex data
for (unsigned j = 0; j < count; j++)
{
unsigned fi = triIdx[first + j];
if (fi > triCount)
{
int w = 0;
}
bvhvec4 v0 = verts[fi * 3 + 0];
bvh4Tris[triPtr + 1] = verts[fi * 3 + 1] - v0;
bvh4Tris[triPtr + 2] = verts[fi * 3 + 2] - v0;
v0.w = *(float*)&fi; // so we know the original tri idx
bvh4Tris[triPtr++] = v0;
bvh4Tris[triPtr++] = verts[fi * 3 + 1];
bvh4Tris[triPtr++] = verts[fi * 3 + 2];
bvh4Tris[triPtr + 0] = v0;
triPtr += 3;
}
}
}
Expand Down Expand Up @@ -2466,9 +2461,9 @@ int BVH::Intersect_Alt4BVH( Ray& ray ) const
unsigned triStart = offset + (leaf[i] & 0xffff);
for (unsigned j = 0; j < N; j++, triStart += 3)
{
const bvhvec3 edge2 = bvhvec3( bvh4Alt[triStart + 2] );
const bvhvec3 edge1 = bvhvec3( bvh4Alt[triStart + 1] );
const bvhvec3 v0 = bvh4Alt[triStart + 0];
const bvhvec3 edge1 = bvhvec3( bvh4Alt[triStart + 1] ) - v0;
const bvhvec3 edge2 = bvhvec3( bvh4Alt[triStart + 2] ) - v0;
const bvhvec3 h = cross( ray.D, edge2 );
const float a = dot( edge1, h );
if (fabs( a ) < 0.0000001f) continue;
Expand Down Expand Up @@ -3377,19 +3372,18 @@ int BVH::Intersect_CWBVH( Ray& ray ) const
inline void IntersectTri4( Ray& r, unsigned idx, __m128& t4, const bvhvec4* verts )
{
bvhvec4 v0 = verts[idx];
bvhvec3 edge1 = verts[idx + 1], edge2 = verts[idx + 2];
const unsigned triIdx = *(unsigned*)&v0.w;
v0.w = 0;
const bvhvec3 edge1 = verts[idx + 1] - v0, edge2 = verts[idx + 2] - v0;
const bvhvec3 h = cross( r.D, edge2 );
const float a = dot( edge1, h );
if (fabs( a ) < 0.0000001f) return; // ray parallel to triangle
const float f = 1 / a;
const bvhvec3 s = r.O - bvhvec3( v0 );
const bvhvec3 s = r.O - bvhvec3( v0 ), q = cross( s, edge1 );
const float u = f * dot( s, h );
if (u < 0 || u > 1) return;
const bvhvec3 q = cross( s, edge1 );
const float v = f * dot( r.D, q );
if (v < 0 || u + v > 1) return;
const bool out = u < 0 || u > 1 || v < 0 || u + v > 1;
if (out) return;
const float t = f * dot( edge2, q );
if (t > 0 && t < r.hit.t) r.hit.u = u, r.hit.v = v, r.hit.prim = triIdx, r.hit.t = t, t4 = _mm_set1_ps( t );
}
Expand All @@ -3407,27 +3401,25 @@ int BVH::Intersect_Afra( Ray& ray ) const
{
const BVHNode4Alt2& node = bvh4Alt2[nodeIdx];
// intersect the ray with four AABBs
const __m128 x0 = _mm_sub_ps( node.xmin4, ox4 ), x1 = _mm_sub_ps( node.xmax4, ox4 );
const __m128 y0 = _mm_sub_ps( node.ymin4, oy4 ), y1 = _mm_sub_ps( node.ymax4, oy4 );
const __m128 z0 = _mm_sub_ps( node.zmin4, oz4 ), z1 = _mm_sub_ps( node.zmax4, oz4 );
const __m128 xmin4 = node.xmin4, xmax4 = node.xmax4;
const __m128 ymin4 = node.ymin4, ymax4 = node.ymax4;
const __m128 zmin4 = node.zmin4, zmax4 = node.zmax4;
const __m128 x0 = _mm_sub_ps( xmin4, ox4 ), x1 = _mm_sub_ps( xmax4, ox4 );
const __m128 y0 = _mm_sub_ps( ymin4, oy4 ), y1 = _mm_sub_ps( ymax4, oy4 );
const __m128 z0 = _mm_sub_ps( zmin4, oz4 ), z1 = _mm_sub_ps( zmax4, oz4 );
const __m128 tx1 = _mm_mul_ps( x0, rdx4 ), tx2 = _mm_mul_ps( x1, rdx4 );
const __m128 ty1 = _mm_mul_ps( y0, rdy4 ), ty2 = _mm_mul_ps( y1, rdy4 );
const __m128 tz1 = _mm_mul_ps( z0, rdz4 ), tz2 = _mm_mul_ps( z1, rdz4 );
__m128 tmin = _mm_max_ps( _mm_max_ps( _mm_min_ps( tx1, tx2 ), _mm_min_ps( ty1, ty2 ) ), _mm_min_ps( tz1, tz2 ) );
const __m128 tmax = _mm_min_ps( _mm_min_ps( _mm_max_ps( tx1, tx2 ), _mm_max_ps( ty1, ty2 ) ), _mm_max_ps( tz1, tz2 ) );
const __m128 txmin = _mm_min_ps( tx1, tx2 ), tymin = _mm_min_ps( ty1, ty2 ), tzmin = _mm_min_ps( tz1, tz2 );
const __m128 txmax = _mm_max_ps( tx1, tx2 ), tymax = _mm_max_ps( ty1, ty2 ), tzmax = _mm_max_ps( tz1, tz2 );
const __m128 tmin = _mm_max_ps( _mm_max_ps( txmin, tymin ), tzmin );
const __m128 tmax = _mm_min_ps( _mm_min_ps( txmax, tymax ), tzmax );
const __m128 hit = _mm_and_ps( _mm_and_ps( _mm_cmpge_ps( tmax, tmin ), _mm_cmplt_ps( tmin, t4 ) ), _mm_cmpge_ps( tmax, zero4 ) );
const int hitBits = _mm_movemask_ps( hit );
const int hits = __popc( hitBits );
if (hits == 0)
{
if (stackPtr == 0) break; else nodeIdx = stack[--stackPtr];
}
else if (hits == 1)
const int hitBits = _mm_movemask_ps( hit ), hits = __popc( hitBits );
if (hits == 1 /* 43% */)
{
// just one node was hit - no sorting needed.
const unsigned lane = __bfind( hitBits );
const unsigned count = node.triCount[lane];
// if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens?
const unsigned lane = __bfind( hitBits ), count = node.triCount[lane];
if (count == 0) nodeIdx = node.childFirst[lane]; else
{
const unsigned first = node.childFirst[lane];
Expand All @@ -3438,7 +3430,13 @@ int BVH::Intersect_Afra( Ray& ray ) const
}
continue;
}
else if (hits == 2)
if (hits == 0 /* 29% */)
{
if (stackPtr == 0) break;
nodeIdx = stack[--stackPtr];
continue;
}
if (hits == 2 /* 16% */)
{
// two nodes hit
unsigned lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) );
Expand Down Expand Up @@ -3470,12 +3468,12 @@ int BVH::Intersect_Afra( Ray& ray ) const
IntersectTri4( ray, first + j * 3, t4, bvh4Tris );
}
}
else if (hits == 3)
else if (hits == 3 /* 8% */)
{
// blend in lane indices
tmin = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 );
__m128 tm = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 );
// sort
float tmp, d0 = LANE( tmin, 0 ), d1 = LANE( tmin, 1 ), d2 = LANE( tmin, 2 ), d3 = LANE( tmin, 3 );
float tmp, d0 = LANE( tm, 0 ), d1 = LANE( tm, 1 ), d2 = LANE( tm, 2 ), d3 = LANE( tm, 3 );
if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp;
if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp;
if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp;
Expand All @@ -3499,12 +3497,12 @@ int BVH::Intersect_Afra( Ray& ray ) const
IntersectTri4( ray, first + j * 3, t4, bvh4Tris );
}
}
else
else /* hits == 4, 2%: rare */
{
// blend in lane indices
tmin = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 );
__m128 tm = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 );
// sort
float tmp, d0 = LANE( tmin, 0 ), d1 = LANE( tmin, 1 ), d2 = LANE( tmin, 2 ), d3 = LANE( tmin, 3 );
float tmp, d0 = LANE( tm, 0 ), d1 = LANE( tm, 1 ), d2 = LANE( tm, 2 ), d3 = LANE( tm, 3 );
if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp;
if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp;
if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp;
Expand Down Expand Up @@ -3825,7 +3823,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const

#endif // BVH_USENEON

} // namespace tinybvh
} // namespace tinybvh

#ifdef __GNUC__
#pragma GCC diagnostic pop
Expand Down
9 changes: 3 additions & 6 deletions tiny_bvh_fenster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,8 @@ int verts = 0;
// setup view pyramid for a pinhole camera:
// eye, p1 (top-left), p2 (top-right) and p3 (bottom-left)
#ifdef LOADSCENE
// bvhvec3 eye( 0, 30, 0 ), view = normalize( bvhvec3( -8, 2, -1.7f ) );
static bvhvec3 eye( 0, 13, 30 ), p1, p2, p3;
static bvhvec3 view = normalize( bvhvec3( 0, 0.01f, -1 ) );
static bvhvec3 eye( 0, 30, 0 ), p1, p2, p3;
static bvhvec3 view = normalize( bvhvec3( -8, 2, -1.7f ) );
#else
static bvhvec3 eye( -3.5f, -1.5f, -6.5f ), p1, p2, p3;
static bvhvec3 view = normalize( bvhvec3( 3, 1.5f, 5 ) );
Expand Down Expand Up @@ -110,8 +109,6 @@ void Init()
// build a BVH over the scene
#if defined(BVH_USEAVX)
bvh.BuildHQ( triangles, verts / 3 );
bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 );
bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_AFRA );
#elif defined(BVH_USENEON)
bvh.BuildNEON( triangles, verts / 3 );
#else
Expand Down Expand Up @@ -173,7 +170,7 @@ void Tick( uint32_t* buf )

// trace primary rays
#if !defined USE_EMBREE
for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::BVH4_AFRA );
for (int i = 0; i < N; i++) bvh.Intersect( rays[i] );
#else
struct RTCRayHit rayhit;
for (int i = 0; i < N; i++)
Expand Down
64 changes: 40 additions & 24 deletions tiny_bvh_speedtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#define BUILD_REFERENCE
#define BUILD_AVX
#define BUILD_NEON
// #define BUILD_SBVH
#define BUILD_SBVH
#define TRAVERSE_2WAY_ST
#define TRAVERSE_ALT2WAY_ST
#define TRAVERSE_SOA2WAY_ST
Expand Down Expand Up @@ -323,9 +323,11 @@ int main()
// trace all rays three times to estimate average performance
// - single core version
printf( "- CPU, coherent, basic 2-way layout, ST: " );
t.reset();
for (int pass = 0; pass < 3; pass++)
for (int pass = 0; pass < 4; pass++)
{
if (pass == 1) t.reset(); // first pass is cache warming
for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i] );
}
float traceTimeST = t.elapsed() / 3.0f;
mrays = (float)Nsmall / traceTimeST;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeST * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f );
Expand All @@ -338,9 +340,11 @@ int main()
// - single core version, alternative bvh layout
printf( "- CPU, coherent, alt 2-way layout, ST: " );
bvh.Convert( BVH::WALD_32BYTE, BVH::AILA_LAINE );
t.reset();
for (int pass = 0; pass < 3; pass++)
for (int pass = 0; pass < 4; pass++)
{
if (pass == 1) t.reset(); // first pass is cache warming
for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::AILA_LAINE );
}
float traceTimeAlt = t.elapsed() / 3.0f;
mrays = (float)Nsmall / traceTimeAlt;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeAlt * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f );
Expand Down Expand Up @@ -372,12 +376,13 @@ int main()
t.reset();
float traceTimeGPU = 0;
ailalaine_kernel.SetArguments( &gpuNodes, &idxData, &triData, &rayData );
for (int pass = 0; pass < 8; pass++)
for (int pass = 0; pass < 9; pass++)
{
ailalaine_kernel.Run( Nfull, 64, 0, &event ); // for now, todo.
clWaitForEvents( 1, &event ); // OpenCL kernsl run asynchronously
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
if (pass == 0) continue; // first pass is for cache warming
traceTimeGPU += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds
}
// get results from GPU - this also syncs the queue.
Expand Down Expand Up @@ -411,12 +416,13 @@ int main()
t.reset();
float traceTimeGPU4 = 0;
gpu4way_kernel.SetArguments( &gpu4Nodes, &rayData );
for (int pass = 0; pass < 8; pass++)
for (int pass = 0; pass < 9; pass++)
{
gpu4way_kernel.Run( Nfull, 64, 0, &event ); // for now, todo.
clWaitForEvents( 1, &event ); // OpenCL kernsl run asynchronously
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
if (pass == 0) continue; // first pass is for cache warming
traceTimeGPU4 += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds
}
// get results from GPU - this also syncs the queue.
Expand Down Expand Up @@ -453,12 +459,13 @@ int main()
t.reset();
float traceTimeGPU8 = 0;
cwbvh_kernel.SetArguments( &cwbvhNodes, &cwbvhTris, &rayData );
for (int pass = 0; pass < 8; pass++)
for (int pass = 0; pass < 9; pass++)
{
cwbvh_kernel.Run( Nfull, 64, 0, &event ); // for now, todo.
clWaitForEvents( 1, &event ); // OpenCL kernsl run asynchronously
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 );
clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 );
if (pass == 0) continue; // first pass is for cache warming
traceTimeGPU8 += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds
}
// get results from GPU - this also syncs the queue.
Expand All @@ -478,9 +485,11 @@ int main()
// - single core version, alternative bvh layout 2
printf( "- CPU, coherent, soa 2-way layout, ST: " );
bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA );
t.reset();
for (int pass = 0; pass < 3; pass++)
for (int pass = 0; pass < 4; pass++)
{
if (pass == 1) t.reset(); // first pass is cache warming
for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::ALT_SOA );
}
float traceTimeAlt2 = t.elapsed() / 3.0f;
mrays = (float)Nsmall / traceTimeAlt2;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeAlt2 * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f );
Expand All @@ -492,9 +501,9 @@ int main()
// trace all rays three times to estimate average performance
// - multi-core version (using OpenMP and batches of 10,000 rays)
printf( "- CPU, coherent, basic 2-way layout, MT: " );
t.reset();
for (int j = 0; j < 3; j++)
for (int pass = 0; pass < 4; pass++)
{
if (pass == 1) t.reset(); // first pass is cache warming
const int batchCount = Nfull / 10000;
#pragma omp parallel for schedule(dynamic)
for (int batch = 0; batch < batchCount; batch++)
Expand All @@ -514,9 +523,9 @@ int main()
// trace all rays three times to estimate average performance
// - coherent distribution, multi-core, packet traversal
printf( "- CPU, coherent, 2-way, packets, MT: " );
t.reset();
for (int j = 0; j < 3; j++)
for (int pass = 0; pass < 4; pass++)
{
if (pass == 1) t.reset(); // first pass is cache warming
const int batchCount = Nfull / (30 * 256); // batches of 30 packets of 256 rays
#pragma omp parallel for schedule(dynamic)
for (int batch = 0; batch < batchCount; batch++)
Expand All @@ -534,9 +543,9 @@ int main()
// trace all rays three times to estimate average performance
// - coherent distribution, multi-core, packet traversal, SSE version
printf( "- CPU, coherent, 2-way, packets/SSE, MT: " );
t.reset();
for (int j = 0; j < 3; j++)
for (int pass = 0; pass < 4; pass++)
{
if (pass == 1) t.reset(); // first pass is cache warming
const int batchCount = Nfull / (30 * 256); // batches of 30 packets of 256 rays
#pragma omp parallel for schedule(dynamic)
for (int batch = 0; batch < batchCount; batch++)
Expand Down Expand Up @@ -565,9 +574,11 @@ int main()
printf( "done (%.2fs). New: %i nodes, SAH=%.2f\n", t.elapsed(), bvh.NodeCount( BVH::WALD_32BYTE ), bvh.SAHCost() );
bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA );
printf( "- CPU, coherent, 2-way optimized, ST: " );
t.reset();
for (int pass = 0; pass < 3; pass++)
for (int pass = 0; pass < 4; pass++)
{
if (pass == 1) t.reset(); // first pass is cache warming
for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::ALT_SOA );
}
float traceTimeOpt = t.elapsed() / 3.0f;
mrays = (float)Nsmall / traceTimeOpt;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeOpt * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f );
Expand All @@ -589,9 +600,11 @@ int main()
bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 );
bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_AFRA );
printf( "- CPU, coherent, 4-way optimized, ST: " );
t.reset();
for (int pass = 0; pass < 3; pass++)
for (int pass = 0; pass < 4; pass++)
{
if (pass == 1) t.reset(); // first pass is cache warming
for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::BVH4_AFRA );
}
float traceTimeAfra = t.elapsed() / 3.0f;
mrays = (float)Nsmall / traceTimeAfra;
printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeAfra * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f );
Expand All @@ -614,10 +627,12 @@ int main()
rayhits[i].hit.geomID = RTC_INVALID_GEOMETRY_ID;
rayhits[i].hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
}
t.reset();
for (int pass = 0; pass < 6; pass++)
for (int pass = 0; pass < 4; pass++)
{
if (pass == 1) t.reset(); // first pass is cache warming
for (int i = 0; i < Nsmall; i++) rtcIntersect1( embreeScene, rayhits + i );
float traceTimeEmbree = t.elapsed() / 6.0f;
}
float traceTimeEmbree = t.elapsed() / 3.0f;
// retrieve intersection results
for (int i = 0; i < Nsmall; i++)
{
Expand Down Expand Up @@ -646,8 +661,9 @@ int main()
// - divergent distribution, multi-core
printf( "- CPU, incoherent, basic 2-way layout, MT: " );
t.reset();
for (int j = 0; j < 3; j++)
for (int pass = 0; pass < 4; pass++)
{
if (pass == 1) t.reset(); // first pass is cache warming
const int batchCount = Nfull / 10000;
#pragma omp parallel for schedule(dynamic)
for (int batch = 0; batch < batchCount; batch++)
Expand Down
Loading

0 comments on commit 251e8d2

Please sign in to comment.