diff --git a/testdata/cryteksponza.bin b/testdata/cryteksponza.bin new file mode 100644 index 0000000..7fa6e98 Binary files /dev/null and b/testdata/cryteksponza.bin differ diff --git a/tiny_bvh.h b/tiny_bvh.h index e13878e..97791ce 100644 --- a/tiny_bvh.h +++ b/tiny_bvh.h @@ -962,11 +962,18 @@ inline float halfArea( const __m128 a /* a contains extent of aabb */ ) { return LANE( a, 0 ) * LANE( a, 1 ) + LANE( a, 1 ) * LANE( a, 2 ) + LANE( a, 2 ) * LANE( a, 3 ); } -inline float halfArea( const __m256 a /* a contains aabb itself, with min.xyz negated */ ) +inline float halfArea( const __m256& a /* a contains aabb itself, with min.xyz negated */ ) { +#ifndef _MSC_VER + // g++ doesn't seem to like the faster construct + float* c = (float*)&a; + float ex = c[4] + c[0], ey = c[5] + c[1], ez = c[6] + c[2]; + return ex * ey + ey * ez + ez * ex; +#else const __m128 q = _mm256_castps256_ps128( _mm256_add_ps( _mm256_permute2f128_ps( a, a, 5 ), a ) ); const __m128 v = _mm_mul_ps( q, _mm_shuffle_ps( q, q, 9 ) ); return LANE( v, 0 ) + LANE( v, 1 ) + LANE( v, 2 ); +#endif } #define PROCESS_PLANE( a, pos, ANLR, lN, rN, lb, rb ) if (lN * rN != 0) { \ ANLR = halfArea( lb ) * (float)lN + halfArea( rb ) * (float)rN; if (ANLR < splitCost) \ @@ -1070,7 +1077,7 @@ void BVH::BuildAVX( const bvhvec4* vertices, const unsigned int primCount ) const __m256* bb = binbox; for (int a = 0; a < 3; a++, bb += BVHBINS) if ((node.aabbMax[a] - node.aabbMin[a]) > minDim.cell[a]) { - // hardcoded bin processing for BVHBINS == 8, see end of file for generic code. + // hardcoded bin processing for BVHBINS == 8 assert( BVHBINS == 8 ); const unsigned int lN0 = count[a][0], rN0 = count[a][7]; const __m256 lb0 = bb[0], rb0 = bb[7]; diff --git a/tiny_bvh_fenster.cpp b/tiny_bvh_fenster.cpp index 7c2ef3b..24d6d4e 100644 --- a/tiny_bvh_fenster.cpp +++ b/tiny_bvh_fenster.cpp @@ -2,6 +2,7 @@ // #define USE_NANORT // enable to verify correct implementation // #define USE_EMBREE // enable to verify correct implementation, win64 only for now. +// #define LOADSPONZA #define TINYBVH_IMPLEMENTATION #include "tiny_bvh.h" @@ -24,7 +25,11 @@ void embreeError( void* userPtr, enum RTCError error, const char* str ) BVH bvh; #endif +#ifdef LOADSPONZA +bvhvec4* triangles; +#else bvhvec4 triangles[259 /* level 3 */ * 6 * 2 * 49 * 3]{}; +#endif int verts = 0; void sphere_flake( float x, float y, float z, float s, int d = 0 ) @@ -51,8 +56,17 @@ void sphere_flake( float x, float y, float z, float s, int d = 0 ) void Init() { +#ifdef LOADSPONZA + // load raw vertex data for Crytek's Sponza + FILE* f = fopen( "../testdata/cryteksponza.bin", "rb" ); + fread( &verts, 1, 4, f ); + verts *= 3, triangles = new bvhvec4[verts]; + fread( triangles, sizeof( bvhvec4 ), verts, f ); + fclose( f ); +#else // generate a sphere flake scene sphere_flake( 0, 0, 0, 1.5f ); +#endif #if defined USE_NANORT @@ -89,7 +103,7 @@ void Init() #else // build a BVH over the scene - bvh.Build( (bvhvec4*)triangles, verts / 3 ); + bvh.BuildAVX( triangles, verts / 3 ); bvh.Convert( BVH::WALD_32BYTE, BVH::ALT_SOA ); #endif @@ -100,7 +114,11 @@ void Tick( uint32_t* buf ) { // setup view pyramid for a pinhole camera: // eye, p1 (top-left), p2 (top-right) and p3 (bottom-left) +#ifdef LOADSPONZA + bvhvec3 eye( 0, 30, 0 ), view = normalize( bvhvec3( -8, 2, -1.7f ) ); +#else bvhvec3 eye( -3.5f, -1.5f, -6.5f ), view = normalize( bvhvec3( 3, 1.5f, 5 ) ); +#endif bvhvec3 right = normalize( cross( bvhvec3( 0, 1, 0 ), view ) ); bvhvec3 up = 0.8f * cross( view, right ), C = eye + 2 * view; bvhvec3 p1 = C - right + up, p2 = C + right + up, p3 = C - right - up; @@ -136,11 +154,9 @@ void Tick( uint32_t* buf ) ray.dir[0] = rays[i].D.x, ray.dir[1] = rays[i].D.y, ray.dir[2] = rays[i].D.z; ray.min_t = 0, ray.max_t = rays[i].hit.t; nanort::TriangleIntersection isect; - bool hit = accel.Traverse( ray, triangle_intersector, &isect, trace_options ); - if (hit) - rays[i].hit.t = isect.t, - rays[i].hit.u = isect.u, rays[i].hit.v = isect.v, - rays[i].hit.prim = isect.prim_id; + if (accel.Traverse( ray, triangle_intersector, &isect, trace_options )) + rays[i].hit.t = isect.t, rays[i].hit.prim = isect.prim_id, + rays[i].hit.u = isect.u, rays[i].hit.v = isect.v; } #elif defined(USE_EMBREE) struct RTCRayHit rayhit; @@ -148,17 +164,14 @@ void Tick( uint32_t* buf ) { rayhit.ray.org_x = rays[i].O.x, rayhit.ray.org_y = rays[i].O.y, rayhit.ray.org_z = rays[i].O.z; rayhit.ray.dir_x = rays[i].D.x, rayhit.ray.dir_y = rays[i].D.y, rayhit.ray.dir_z = rays[i].D.z; - rayhit.ray.tnear = 0, rayhit.ray.tfar = rays[i].hit.t; - rayhit.ray.mask = -1, rayhit.ray.flags = 0; - rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID; - rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID; + rayhit.ray.tnear = 0, rayhit.ray.tfar = rays[i].hit.t, rayhit.ray.mask = -1, rayhit.ray.flags = 0; + rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID, rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID; rtcIntersect1( embreeScene, &rayhit ); - rays[i].hit.t = rayhit.ray.tfar; rays[i].hit.u = rayhit.hit.u, rays[i].hit.u = rayhit.hit.v; - rays[i].hit.prim = rayhit.hit.primID; + rays[i].hit.prim = rayhit.hit.primID, rays[i].hit.t = rayhit.ray.tfar; } #else - for (int i = 0; i < N; i++) bvh.Intersect( rays[i], BVH::ALT_SOA ); + for (int i = 0; i < N; i+=16) bvh.Intersect( rays[i], BVH::ALT_SOA ); #endif // visualize result @@ -169,7 +182,7 @@ void Tick( uint32_t* buf ) int pixel_x = tx * 4 + x; int pixel_y = ty * 4 + y; float avg = 0; - for (int s = 0; s < 16; s++, i++) if (rays[i].hit.t < 1000) + for (int s = 0; s < 16; s++, i++) if (rays[i].hit.t < 10000) { int primIdx = rays[i].hit.prim; bvhvec3 v0 = triangles[primIdx * 3 + 0]; @@ -178,7 +191,7 @@ void Tick( uint32_t* buf ) bvhvec3 N = normalize( cross( v1 - v0, v2 - v0 ) ); avg += fabs( dot( N, normalize( bvhvec3( 1, 2, 3 ) ) ) ); } - #if defined USE_NANORT + #if !defined USE_NANORT int c = (int)(255.9f * avg); // we trace only every 16th ray with NanoRT #else int c = (int)(15.9f * avg);