From 251e8d243a18952095bf42de32cb2d0aa13a047f Mon Sep 17 00:00:00 2001 From: Jacco Bikker Date: Wed, 27 Nov 2024 11:15:49 +0100 Subject: [PATCH] Triangle layout optimizations. --- tiny_bvh.h | 86 +++++++++++++++++++++--------------------- tiny_bvh_fenster.cpp | 9 ++--- tiny_bvh_speedtest.cpp | 64 +++++++++++++++++++------------ traverse.cl | 5 +-- 4 files changed, 87 insertions(+), 77 deletions(-) diff --git a/tiny_bvh.h b/tiny_bvh.h index ba89447..fe3edc3 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -1306,10 +1306,11 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr { unsigned t = triIdx[childNode[i]->firstTri + j]; bvhvec4 v0 = verts[t * 3 + 0]; + bvh4Alt[newAlt4Ptr + 1] = verts[t * 3 + 1] - v0; + bvh4Alt[newAlt4Ptr + 2] = verts[t * 3 + 2] - v0; v0.w = *(float*)&t; // as_float - bvh4Alt[newAlt4Ptr++] = v0; - bvh4Alt[newAlt4Ptr++] = verts[t * 3 + 1]; - bvh4Alt[newAlt4Ptr++] = verts[t * 3 + 2]; + bvh4Alt[newAlt4Ptr + 0] = v0; + newAlt4Ptr += 3; } } // process interior nodes @@ -1431,10 +1432,6 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr while (1) { BVHNode4Alt2& node = bvh4Alt2[nodeIdx]; - if (nodeIdx == 58663) - { - int w = 0; - } for (int i = 0; i < 4; i++) if (node.triCount[i] + node.childFirst[i] > 0) { if (!node.triCount[i]) stack[stackPtr++] = node.childFirst[i]; else @@ -1442,18 +1439,16 @@ void BVH::Convert( const BVHLayout from, const BVHLayout to, const bool deleteOr unsigned first = node.childFirst[i]; unsigned count = node.triCount[i]; node.childFirst[i] = triPtr; + // assign vertex data for (unsigned j = 0; j < count; j++) { unsigned fi = triIdx[first + j]; - if (fi > triCount) - { - int w = 0; - } bvhvec4 v0 = verts[fi * 3 + 0]; + bvh4Tris[triPtr + 1] = verts[fi * 3 + 1] - v0; + bvh4Tris[triPtr + 2] = verts[fi * 3 + 2] - v0; v0.w = *(float*)&fi; // so we know the original tri idx - bvh4Tris[triPtr++] = v0; - bvh4Tris[triPtr++] = verts[fi * 3 + 1]; - bvh4Tris[triPtr++] = verts[fi * 3 + 2]; + bvh4Tris[triPtr + 0] = v0; + triPtr += 3; } } } @@ -2466,9 +2461,9 @@ int BVH::Intersect_Alt4BVH( Ray& ray ) const unsigned triStart = offset + (leaf[i] & 0xffff); for (unsigned j = 0; j < N; j++, triStart += 3) { + const bvhvec3 edge2 = bvhvec3( bvh4Alt[triStart + 2] ); + const bvhvec3 edge1 = bvhvec3( bvh4Alt[triStart + 1] ); const bvhvec3 v0 = bvh4Alt[triStart + 0]; - const bvhvec3 edge1 = bvhvec3( bvh4Alt[triStart + 1] ) - v0; - const bvhvec3 edge2 = bvhvec3( bvh4Alt[triStart + 2] ) - v0; const bvhvec3 h = cross( ray.D, edge2 ); const float a = dot( edge1, h ); if (fabs( a ) < 0.0000001f) continue; @@ -3377,19 +3372,18 @@ int BVH::Intersect_CWBVH( Ray& ray ) const inline void IntersectTri4( Ray& r, unsigned idx, __m128& t4, const bvhvec4* verts ) { bvhvec4 v0 = verts[idx]; + bvhvec3 edge1 = verts[idx + 1], edge2 = verts[idx + 2]; const unsigned triIdx = *(unsigned*)&v0.w; v0.w = 0; - const bvhvec3 edge1 = verts[idx + 1] - v0, edge2 = verts[idx + 2] - v0; const bvhvec3 h = cross( r.D, edge2 ); const float a = dot( edge1, h ); if (fabs( a ) < 0.0000001f) return; // ray parallel to triangle const float f = 1 / a; - const bvhvec3 s = r.O - bvhvec3( v0 ); + const bvhvec3 s = r.O - bvhvec3( v0 ), q = cross( s, edge1 ); const float u = f * dot( s, h ); - if (u < 0 || u > 1) return; - const bvhvec3 q = cross( s, edge1 ); const float v = f * dot( r.D, q ); - if (v < 0 || u + v > 1) return; + const bool out = u < 0 || u > 1 || v < 0 || u + v > 1; + if (out) return; const float t = f * dot( edge2, q ); if (t > 0 && t < r.hit.t) r.hit.u = u, r.hit.v = v, r.hit.prim = triIdx, r.hit.t = t, t4 = _mm_set1_ps( t ); } @@ -3407,27 +3401,25 @@ int BVH::Intersect_Afra( Ray& ray ) const { const BVHNode4Alt2& node = bvh4Alt2[nodeIdx]; // intersect the ray with four AABBs - const __m128 x0 = _mm_sub_ps( node.xmin4, ox4 ), x1 = _mm_sub_ps( node.xmax4, ox4 ); - const __m128 y0 = _mm_sub_ps( node.ymin4, oy4 ), y1 = _mm_sub_ps( node.ymax4, oy4 ); - const __m128 z0 = _mm_sub_ps( node.zmin4, oz4 ), z1 = _mm_sub_ps( node.zmax4, oz4 ); + const __m128 xmin4 = node.xmin4, xmax4 = node.xmax4; + const __m128 ymin4 = node.ymin4, ymax4 = node.ymax4; + const __m128 zmin4 = node.zmin4, zmax4 = node.zmax4; + const __m128 x0 = _mm_sub_ps( xmin4, ox4 ), x1 = _mm_sub_ps( xmax4, ox4 ); + const __m128 y0 = _mm_sub_ps( ymin4, oy4 ), y1 = _mm_sub_ps( ymax4, oy4 ); + const __m128 z0 = _mm_sub_ps( zmin4, oz4 ), z1 = _mm_sub_ps( zmax4, oz4 ); const __m128 tx1 = _mm_mul_ps( x0, rdx4 ), tx2 = _mm_mul_ps( x1, rdx4 ); const __m128 ty1 = _mm_mul_ps( y0, rdy4 ), ty2 = _mm_mul_ps( y1, rdy4 ); const __m128 tz1 = _mm_mul_ps( z0, rdz4 ), tz2 = _mm_mul_ps( z1, rdz4 ); - __m128 tmin = _mm_max_ps( _mm_max_ps( _mm_min_ps( tx1, tx2 ), _mm_min_ps( ty1, ty2 ) ), _mm_min_ps( tz1, tz2 ) ); - const __m128 tmax = _mm_min_ps( _mm_min_ps( _mm_max_ps( tx1, tx2 ), _mm_max_ps( ty1, ty2 ) ), _mm_max_ps( tz1, tz2 ) ); + const __m128 txmin = _mm_min_ps( tx1, tx2 ), tymin = _mm_min_ps( ty1, ty2 ), tzmin = _mm_min_ps( tz1, tz2 ); + const __m128 txmax = _mm_max_ps( tx1, tx2 ), tymax = _mm_max_ps( ty1, ty2 ), tzmax = _mm_max_ps( tz1, tz2 ); + const __m128 tmin = _mm_max_ps( _mm_max_ps( txmin, tymin ), tzmin ); + const __m128 tmax = _mm_min_ps( _mm_min_ps( txmax, tymax ), tzmax ); const __m128 hit = _mm_and_ps( _mm_and_ps( _mm_cmpge_ps( tmax, tmin ), _mm_cmplt_ps( tmin, t4 ) ), _mm_cmpge_ps( tmax, zero4 ) ); - const int hitBits = _mm_movemask_ps( hit ); - const int hits = __popc( hitBits ); - if (hits == 0) - { - if (stackPtr == 0) break; else nodeIdx = stack[--stackPtr]; - } - else if (hits == 1) + const int hitBits = _mm_movemask_ps( hit ), hits = __popc( hitBits ); + if (hits == 1 /* 43% */) { // just one node was hit - no sorting needed. - const unsigned lane = __bfind( hitBits ); - const unsigned count = node.triCount[lane]; - // if (node.triCount[lane] + node.childFirst[lane] == 0) continue; // TODO - never happens? + const unsigned lane = __bfind( hitBits ), count = node.triCount[lane]; if (count == 0) nodeIdx = node.childFirst[lane]; else { const unsigned first = node.childFirst[lane]; @@ -3438,7 +3430,13 @@ int BVH::Intersect_Afra( Ray& ray ) const } continue; } - else if (hits == 2) + if (hits == 0 /* 29% */) + { + if (stackPtr == 0) break; + nodeIdx = stack[--stackPtr]; + continue; + } + if (hits == 2 /* 16% */) { // two nodes hit unsigned lane0 = __bfind( hitBits ), lane1 = __bfind( hitBits - (1 << lane0) ); @@ -3470,12 +3468,12 @@ int BVH::Intersect_Afra( Ray& ray ) const IntersectTri4( ray, first + j * 3, t4, bvh4Tris ); } } - else if (hits == 3) + else if (hits == 3 /* 8% */) { // blend in lane indices - tmin = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 ); + __m128 tm = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 ); // sort - float tmp, d0 = LANE( tmin, 0 ), d1 = LANE( tmin, 1 ), d2 = LANE( tmin, 2 ), d3 = LANE( tmin, 3 ); + float tmp, d0 = LANE( tm, 0 ), d1 = LANE( tm, 1 ), d2 = LANE( tm, 2 ), d3 = LANE( tm, 3 ); if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; @@ -3499,12 +3497,12 @@ int BVH::Intersect_Afra( Ray& ray ) const IntersectTri4( ray, first + j * 3, t4, bvh4Tris ); } } - else + else /* hits == 4, 2%: rare */ { // blend in lane indices - tmin = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 ); + __m128 tm = _mm_or_ps( _mm_and_ps( _mm_blendv_ps( inf4, tmin, hit ), idxMask ), idx4 ); // sort - float tmp, d0 = LANE( tmin, 0 ), d1 = LANE( tmin, 1 ), d2 = LANE( tmin, 2 ), d3 = LANE( tmin, 3 ); + float tmp, d0 = LANE( tm, 0 ), d1 = LANE( tm, 1 ), d2 = LANE( tm, 2 ), d3 = LANE( tm, 3 ); if (d0 < d2) tmp = d0, d0 = d2, d2 = tmp; if (d1 < d3) tmp = d1, d1 = d3, d3 = tmp; if (d0 < d1) tmp = d0, d0 = d1, d1 = tmp; @@ -3825,7 +3823,7 @@ int BVH::Intersect_AltSoA( Ray& ray ) const #endif // BVH_USENEON -} // namespace tinybvh + } // namespace tinybvh #ifdef __GNUC__ #pragma GCC diagnostic pop diff --git a/tiny_bvh_fenster.cpp b/tiny_bvh_fenster.cpp index d40914d..563b19c 100644 --- a/tiny_bvh_fenster.cpp +++ b/tiny_bvh_fenster.cpp @@ -31,9 +31,8 @@ int verts = 0; // setup view pyramid for a pinhole camera: // eye, p1 (top-left), p2 (top-right) and p3 (bottom-left) #ifdef LOADSCENE -// bvhvec3 eye( 0, 30, 0 ), view = normalize( bvhvec3( -8, 2, -1.7f ) ); -static bvhvec3 eye( 0, 13, 30 ), p1, p2, p3; -static bvhvec3 view = normalize( bvhvec3( 0, 0.01f, -1 ) ); +static bvhvec3 eye( 0, 30, 0 ), p1, p2, p3; +static bvhvec3 view = normalize( bvhvec3( -8, 2, -1.7f ) ); #else static bvhvec3 eye( -3.5f, -1.5f, -6.5f ), p1, p2, p3; static bvhvec3 view = normalize( bvhvec3( 3, 1.5f, 5 ) ); @@ -110,8 +109,6 @@ void Init() // build a BVH over the scene #if defined(BVH_USEAVX) bvh.BuildHQ( triangles, verts / 3 ); - bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 ); - bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_AFRA ); #elif defined(BVH_USENEON) bvh.BuildNEON( triangles, verts / 3 ); #else @@ -173,7 +170,7 @@ void Tick( uint32_t* buf ) // trace primary rays #if !defined USE_EMBREE - for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::BVH4_AFRA ); + for (int i = 0; i < N; i++) bvh.Intersect( rays[i] ); #else struct RTCRayHit rayhit; for (int i = 0; i < N; i++) diff --git a/tiny_bvh_speedtest.cpp b/tiny_bvh_speedtest.cpp index c330a0b..d0bdb57 100644 --- a/tiny_bvh_speedtest.cpp +++ b/tiny_bvh_speedtest.cpp @@ -16,7 +16,7 @@ #define BUILD_REFERENCE #define BUILD_AVX #define BUILD_NEON -// #define BUILD_SBVH +#define BUILD_SBVH #define TRAVERSE_2WAY_ST #define TRAVERSE_ALT2WAY_ST #define TRAVERSE_SOA2WAY_ST @@ -323,9 +323,11 @@ int main() // trace all rays three times to estimate average performance // - single core version printf( "- CPU, coherent, basic 2-way layout, ST: " ); - t.reset(); - for (int pass = 0; pass < 3; pass++) + for (int pass = 0; pass < 4; pass++) + { + if (pass == 1) t.reset(); // first pass is cache warming for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i] ); + } float traceTimeST = t.elapsed() / 3.0f; mrays = (float)Nsmall / traceTimeST; printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeST * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f ); @@ -338,9 +340,11 @@ int main() // - single core version, alternative bvh layout printf( "- CPU, coherent, alt 2-way layout, ST: " ); bvh.Convert( BVH::WALD_32BYTE, BVH::AILA_LAINE ); - t.reset(); - for (int pass = 0; pass < 3; pass++) + for (int pass = 0; pass < 4; pass++) + { + if (pass == 1) t.reset(); // first pass is cache warming for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::AILA_LAINE ); + } float traceTimeAlt = t.elapsed() / 3.0f; mrays = (float)Nsmall / traceTimeAlt; printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeAlt * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f ); @@ -372,12 +376,13 @@ int main() t.reset(); float traceTimeGPU = 0; ailalaine_kernel.SetArguments( &gpuNodes, &idxData, &triData, &rayData ); - for (int pass = 0; pass < 8; pass++) + for (int pass = 0; pass < 9; pass++) { ailalaine_kernel.Run( Nfull, 64, 0, &event ); // for now, todo. clWaitForEvents( 1, &event ); // OpenCL kernsl run asynchronously clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 ); clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 ); + if (pass == 0) continue; // first pass is for cache warming traceTimeGPU += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds } // get results from GPU - this also syncs the queue. @@ -411,12 +416,13 @@ int main() t.reset(); float traceTimeGPU4 = 0; gpu4way_kernel.SetArguments( &gpu4Nodes, &rayData ); - for (int pass = 0; pass < 8; pass++) + for (int pass = 0; pass < 9; pass++) { gpu4way_kernel.Run( Nfull, 64, 0, &event ); // for now, todo. clWaitForEvents( 1, &event ); // OpenCL kernsl run asynchronously clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 ); clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 ); + if (pass == 0) continue; // first pass is for cache warming traceTimeGPU4 += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds } // get results from GPU - this also syncs the queue. @@ -453,12 +459,13 @@ int main() t.reset(); float traceTimeGPU8 = 0; cwbvh_kernel.SetArguments( &cwbvhNodes, &cwbvhTris, &rayData ); - for (int pass = 0; pass < 8; pass++) + for (int pass = 0; pass < 9; pass++) { cwbvh_kernel.Run( Nfull, 64, 0, &event ); // for now, todo. clWaitForEvents( 1, &event ); // OpenCL kernsl run asynchronously clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_START, sizeof( cl_ulong ), &startTime, 0 ); clGetEventProfilingInfo( event, CL_PROFILING_COMMAND_END, sizeof( cl_ulong ), &endTime, 0 ); + if (pass == 0) continue; // first pass is for cache warming traceTimeGPU8 += (endTime - startTime) * 1e-9f; // event timing is in nanoseconds } // get results from GPU - this also syncs the queue. @@ -478,9 +485,11 @@ int main() // - single core version, alternative bvh layout 2 printf( "- CPU, coherent, soa 2-way layout, ST: " ); bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA ); - t.reset(); - for (int pass = 0; pass < 3; pass++) + for (int pass = 0; pass < 4; pass++) + { + if (pass == 1) t.reset(); // first pass is cache warming for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::ALT_SOA ); + } float traceTimeAlt2 = t.elapsed() / 3.0f; mrays = (float)Nsmall / traceTimeAlt2; printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeAlt2 * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f ); @@ -492,9 +501,9 @@ int main() // trace all rays three times to estimate average performance // - multi-core version (using OpenMP and batches of 10,000 rays) printf( "- CPU, coherent, basic 2-way layout, MT: " ); - t.reset(); - for (int j = 0; j < 3; j++) + for (int pass = 0; pass < 4; pass++) { + if (pass == 1) t.reset(); // first pass is cache warming const int batchCount = Nfull / 10000; #pragma omp parallel for schedule(dynamic) for (int batch = 0; batch < batchCount; batch++) @@ -514,9 +523,9 @@ int main() // trace all rays three times to estimate average performance // - coherent distribution, multi-core, packet traversal printf( "- CPU, coherent, 2-way, packets, MT: " ); - t.reset(); - for (int j = 0; j < 3; j++) + for (int pass = 0; pass < 4; pass++) { + if (pass == 1) t.reset(); // first pass is cache warming const int batchCount = Nfull / (30 * 256); // batches of 30 packets of 256 rays #pragma omp parallel for schedule(dynamic) for (int batch = 0; batch < batchCount; batch++) @@ -534,9 +543,9 @@ int main() // trace all rays three times to estimate average performance // - coherent distribution, multi-core, packet traversal, SSE version printf( "- CPU, coherent, 2-way, packets/SSE, MT: " ); - t.reset(); - for (int j = 0; j < 3; j++) + for (int pass = 0; pass < 4; pass++) { + if (pass == 1) t.reset(); // first pass is cache warming const int batchCount = Nfull / (30 * 256); // batches of 30 packets of 256 rays #pragma omp parallel for schedule(dynamic) for (int batch = 0; batch < batchCount; batch++) @@ -565,9 +574,11 @@ int main() printf( "done (%.2fs). New: %i nodes, SAH=%.2f\n", t.elapsed(), bvh.NodeCount( BVH::WALD_32BYTE ), bvh.SAHCost() ); bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA ); printf( "- CPU, coherent, 2-way optimized, ST: " ); - t.reset(); - for (int pass = 0; pass < 3; pass++) + for (int pass = 0; pass < 4; pass++) + { + if (pass == 1) t.reset(); // first pass is cache warming for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::ALT_SOA ); + } float traceTimeOpt = t.elapsed() / 3.0f; mrays = (float)Nsmall / traceTimeOpt; printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeOpt * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f ); @@ -589,9 +600,11 @@ int main() bvh.Convert( BVH::WALD_32BYTE, BVH::BASIC_BVH4 ); bvh.Convert( BVH::BASIC_BVH4, BVH::BVH4_AFRA ); printf( "- CPU, coherent, 4-way optimized, ST: " ); - t.reset(); - for (int pass = 0; pass < 3; pass++) + for (int pass = 0; pass < 4; pass++) + { + if (pass == 1) t.reset(); // first pass is cache warming for (int i = 0; i < Nsmall; i++) bvh.Intersect( smallBatch[i], BVH::BVH4_AFRA ); + } float traceTimeAfra = t.elapsed() / 3.0f; mrays = (float)Nsmall / traceTimeAfra; printf( "%8.1fms for %6.2fM rays => %6.2fMRay/s\n", traceTimeAfra * 1000, (float)Nsmall * 1e-6f, mrays * 1e-6f ); @@ -614,10 +627,12 @@ int main() rayhits[i].hit.geomID = RTC_INVALID_GEOMETRY_ID; rayhits[i].hit.instID[0] = RTC_INVALID_GEOMETRY_ID; } - t.reset(); - for (int pass = 0; pass < 6; pass++) + for (int pass = 0; pass < 4; pass++) + { + if (pass == 1) t.reset(); // first pass is cache warming for (int i = 0; i < Nsmall; i++) rtcIntersect1( embreeScene, rayhits + i ); - float traceTimeEmbree = t.elapsed() / 6.0f; + } + float traceTimeEmbree = t.elapsed() / 3.0f; // retrieve intersection results for (int i = 0; i < Nsmall; i++) { @@ -646,8 +661,9 @@ int main() // - divergent distribution, multi-core printf( "- CPU, incoherent, basic 2-way layout, MT: " ); t.reset(); - for (int j = 0; j < 3; j++) + for (int pass = 0; pass < 4; pass++) { + if (pass == 1) t.reset(); // first pass is cache warming const int batchCount = Nfull / 10000; #pragma omp parallel for schedule(dynamic) for (int batch = 0; batch < batchCount; batch++) diff --git a/traverse.cl b/traverse.cl index 3ce7041..bbce89b 100644 --- a/traverse.cl +++ b/traverse.cl @@ -244,12 +244,11 @@ void kernel traverse_gpu4way( global float4* alt4Node, global struct Ray* rayDat { const unsigned leafInfo = smem[smBase + leaf]; unsigned thisTri = (leafInfo & 0xffff) + offset + prim * 3; + const float4 edge2 = alt4Node[thisTri + 2]; + const float4 edge1 = alt4Node[thisTri + 1]; const float4 v0 = alt4Node[thisTri]; - const float4 v1 = alt4Node[thisTri + 1]; - const float4 v2 = alt4Node[thisTri + 2]; const unsigned triCount = (leafInfo >> 16) & 0x7fff; if (++prim == triCount) prim = 0, leaf++; - const float4 edge1 = v1 - v0, edge2 = v2 - v0; const float3 h = cross( D, edge2.xyz ); const float a = dot( edge1.xyz, h ); if (fabs( a ) < 0.0000001f) continue;