diff --git a/README.md b/README.md
index ded2a24..aa552bf 100644
--- a/README.md
+++ b/README.md
@@ -48,6 +48,8 @@ Apart from the default BVH layout (simply named ````BVH````), several other layo
 
 A BVH in the ````BVH```` format may be _refitted_, in case the triangles moved, using ````BVH::Refit````. Refitting is substantially faster than rebuilding and works well if the animation is subtle. Refitting does not work if polygon counts change.
 
+New in version 1.1.3: 'self-contained' formats can be serialized and deserialized via ````::Save```` and ````::Load````. Currently this is supported for ````BVH8_CWBVH````, which stores vertex data in a custom format and therefore does not rely on the input vertices for traversal.
+
 # How To Use
 The library ````tiny_bvh.h```` is designed to be easy to use. Please have a look at tiny_bvh_minimal.cpp for an example. A Visual Studio 'solution' (.sln/.vcxproj) is included, as well as a CMake file. That being said: The examples consists of only a single source file, which can be compiled with clang or g++, e.g.:
 
@@ -71,7 +73,7 @@ The **performance measurement tool** can be compiled with:
 
 ````g++ -std=c++20 -mavx -Ofast tiny_bvh_speedtest.cpp -o tiny_bvh_speedtest````
 
-# Version 1.1.2
+# Version 1.1.3
 
 Version 1.1.0 introduced a <ins>change to the API</ins>. The single BVH class with multiple layouts has been replaced with a BVH class per layout. You can simply instantiate the desired layout; conversion (and data ownership) is then handled properly by the library. Examples:
 
diff --git a/tiny_bvh.h b/tiny_bvh.h
index a9085fe..72d751d 100644
--- a/tiny_bvh.h
+++ b/tiny_bvh.h
@@ -116,7 +116,7 @@ THE SOFTWARE.
 // library version
 #define TINY_BVH_VERSION_MAJOR	1
 #define TINY_BVH_VERSION_MINOR	1
-#define TINY_BVH_VERSION_SUB	2
+#define TINY_BVH_VERSION_SUB	3
 
 // ============================================================================
 //
@@ -907,7 +907,7 @@ class BVH8_CWBVH : public BVHBase
 	uint32_t allocatedBlocks = 0;	// node data is stored in blocks of 16 byte.
 	uint32_t usedBlocks = 0;		// actually used blocks.
 	BVH8 bvh8;						// BVH8_CWBVH is created from BVH8 and uses its data.
-	bool ownBVH8 = true;			// False when ConvertFrom receives an external bvh8.
+	bool ownBVH8 = true;			// false when ConvertFrom receives an external bvh8.
 };
 
 // BLASInstance: A TLAS is built over BLAS instances, where a single BLAS can be
@@ -2176,18 +2176,18 @@ void BVH_Verbose::MergeLeafs()
 // BVH_GPU implementation
 // ----------------------------------------------------------------------------
 
-BVH_GPU::~BVH_GPU() 
+BVH_GPU::~BVH_GPU()
 {
 	if (!ownBVH) bvh = BVH(); // clear out pointers we don't own.
-	AlignedFree( bvhNode ); 
+	AlignedFree( bvhNode );
 }
 
-void BVH_GPU::Build( const bvhvec4* vertices, const uint32_t primCount ) 
-{ 
-	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); 
+void BVH_GPU::Build( const bvhvec4* vertices, const uint32_t primCount )
+{
+	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) );
 }
-void BVH_GPU::Build( const bvhvec4slice& vertices ) 
-{ 
+void BVH_GPU::Build( const bvhvec4slice& vertices )
+{
 	bvh.BuildDefault( vertices );
 	ConvertFrom( bvh );
 }
@@ -2196,7 +2196,7 @@ void BVH_GPU::ConvertFrom( const BVH& original )
 {
 	// get a copy of the original bvh
 	if (&original != &bvh) ownBVH = false; // bvh isn't ours; don't delete in destructor. 
-	bvh = original; 
+	bvh = original;
 	// allocate space
 	const uint32_t spaceNeeded = original.usedNodes;
 	if (allocatedNodes < spaceNeeded)
@@ -2285,18 +2285,18 @@ int32_t BVH_GPU::Intersect( Ray& ray ) const
 // BVH_SoA implementation
 // ----------------------------------------------------------------------------
 
-BVH_SoA::~BVH_SoA() 
+BVH_SoA::~BVH_SoA()
 {
 	if (!ownBVH) bvh = BVH(); // clear out pointers we don't own.
-	AlignedFree( bvhNode ); 
+	AlignedFree( bvhNode );
 }
 
-void BVH_SoA::Build( const bvhvec4* vertices, const uint32_t primCount ) 
-{ 
-	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); 
+void BVH_SoA::Build( const bvhvec4* vertices, const uint32_t primCount )
+{
+	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) );
 }
-void BVH_SoA::Build( const bvhvec4slice& vertices ) 
-{ 
+void BVH_SoA::Build( const bvhvec4slice& vertices )
+{
 	bvh.context = context; // properly propagate context to fix issue #66.
 	bvh.BuildDefault( vertices );
 	ConvertFrom( bvh );
@@ -2306,7 +2306,7 @@ void BVH_SoA::ConvertFrom( const BVH& original )
 {
 	// get a copy of the original bvh
 	if (&original != &bvh) ownBVH = false; // bvh isn't ours; don't delete in destructor. 
-	bvh = original; 
+	bvh = original;
 	// allocate space
 	const uint32_t spaceNeeded = bvh.usedNodes;
 	if (allocatedNodes < spaceNeeded)
@@ -2355,15 +2355,15 @@ void BVH_SoA::ConvertFrom( const BVH& original )
 // BVH4 implementation
 // ----------------------------------------------------------------------------
 
-BVH4::~BVH4() 
+BVH4::~BVH4()
 {
 	if (!ownBVH) bvh = BVH(); // clear out pointers we don't own.
-	AlignedFree( bvh4Node ); 
+	AlignedFree( bvh4Node );
 }
 
 void BVH4::Build( const bvhvec4* vertices, const uint32_t primCount )
 {
-	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); 
+	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) );
 }
 void BVH4::Build( const bvhvec4slice& vertices )
 {
@@ -2376,7 +2376,7 @@ void BVH4::ConvertFrom( const BVH& original )
 {
 	// get a copy of the original bvh
 	if (&original != &bvh) ownBVH = false; // bvh isn't ours; don't delete in destructor. 
-	bvh = original; 
+	bvh = original;
 	// allocate space
 	const uint32_t spaceNeeded = original.usedNodes;
 	if (allocatedNodes < spaceNeeded)
@@ -2458,19 +2458,19 @@ int32_t BVH4::Intersect( Ray& ray ) const
 // BVH4_CPU implementation
 // ----------------------------------------------------------------------------
 
-BVH4_CPU::~BVH4_CPU() 
+BVH4_CPU::~BVH4_CPU()
 {
 	if (!ownBVH4) bvh4 = BVH4(); // clear out pointers we don't own.
-	AlignedFree( bvh4Node ); 
+	AlignedFree( bvh4Node );
 	AlignedFree( bvh4Tris );
 }
 
-void BVH4_CPU::Build( const bvhvec4* vertices, const uint32_t primCount ) 
+void BVH4_CPU::Build( const bvhvec4* vertices, const uint32_t primCount )
 {
-	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); 
+	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) );
 }
-void BVH4_CPU::Build( const bvhvec4slice& vertices ) 
-{ 
+void BVH4_CPU::Build( const bvhvec4slice& vertices )
+{
 	bvh4.context = context; // properly propagate context to fix issue #66.
 	bvh4.Build( vertices );
 	ConvertFrom( bvh4 );
@@ -2480,7 +2480,7 @@ void BVH4_CPU::ConvertFrom( const BVH4& original )
 {
 	// get a copy of the original bvh4
 	if (&original != &bvh4) ownBVH4 = false; // bvh isn't ours; don't delete in destructor. 
-	bvh4 = original; 
+	bvh4 = original;
 	// Convert a 4-wide BVH to a format suitable for CPU traversal.
 	// See Faster Incoherent Ray Traversal Using 8-Wide AVX InstructionsLayout,
 	// Atilla T. Áfra, 2013.
@@ -2564,18 +2564,18 @@ void BVH4_CPU::ConvertFrom( const BVH4& original )
 // BVH4_GPU implementation
 // ----------------------------------------------------------------------------
 
-BVH4_GPU::~BVH4_GPU() 
+BVH4_GPU::~BVH4_GPU()
 {
 	if (!ownBVH4) bvh4 = BVH4(); // clear out pointers we don't own.
-	AlignedFree( bvh4Data ); 
+	AlignedFree( bvh4Data );
 }
 
-void BVH4_GPU::Build( const bvhvec4* vertices, const uint32_t primCount ) 
-{ 
-	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); 
+void BVH4_GPU::Build( const bvhvec4* vertices, const uint32_t primCount )
+{
+	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) );
 }
-void BVH4_GPU::Build( const bvhvec4slice& vertices ) 
-{ 
+void BVH4_GPU::Build( const bvhvec4slice& vertices )
+{
 	bvh4.context = context; // properly propagate context to fix issue #66.
 	bvh4.Build( vertices );
 	ConvertFrom( bvh4 );
@@ -2816,18 +2816,18 @@ int32_t BVH4_GPU::Intersect( Ray& ray ) const
 // BVH8 implementation
 // ----------------------------------------------------------------------------
 
-BVH8::~BVH8() 
+BVH8::~BVH8()
 {
 	if (!ownBVH) bvh = BVH(); // clear out pointers we don't own.
 	AlignedFree( bvh8Node );
 }
 
-void BVH8::Build( const bvhvec4* vertices, const uint32_t primCount ) 
-{ 
-	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); 
+void BVH8::Build( const bvhvec4* vertices, const uint32_t primCount )
+{
+	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) );
 }
-void BVH8::Build( const bvhvec4slice& vertices ) 
-{ 
+void BVH8::Build( const bvhvec4slice& vertices )
+{
 	bvh.context = context; // properly propagate context to fix issue #66.
 	bvh.BuildDefault( vertices );
 	ConvertFrom( bvh );
@@ -2837,7 +2837,7 @@ void BVH8::ConvertFrom( const BVH& original )
 {
 	// get a copy of the original
 	if (&original != &bvh) ownBVH = false; // bvh isn't ours; don't delete in destructor. 
-	bvh = original; 
+	bvh = original;
 	// allocate space
 	// Note: The safe upper bound here is usedNodes when converting an existing
 	// BVH2, but we need triCount * 2 to be safe in later conversions, e.g. to
@@ -2963,7 +2963,7 @@ int32_t BVH8::Intersect( Ray& ray ) const
 // BVH8_CWBVH implementation
 // ----------------------------------------------------------------------------
 
-BVH8_CWBVH::~BVH8_CWBVH() 
+BVH8_CWBVH::~BVH8_CWBVH()
 {
 	if (!ownBVH8) bvh8 = BVH8(); // clear out pointers we don't own.
 	AlignedFree( bvh8Data );
@@ -2994,12 +2994,12 @@ bool BVH8_CWBVH::Load( const char* fileName )
 	return true;
 }
 
-void BVH8_CWBVH::Build( const bvhvec4* vertices, const uint32_t primCount ) 
-{ 
-	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) ); 
+void BVH8_CWBVH::Build( const bvhvec4* vertices, const uint32_t primCount )
+{
+	Build( bvhvec4slice( vertices, primCount * 3, sizeof( bvhvec4 ) ) );
 }
-void BVH8_CWBVH::Build( const bvhvec4slice& vertices ) 
-{ 
+void BVH8_CWBVH::Build( const bvhvec4slice& vertices )
+{
 	bvh8.context = context; // properly propagate context to fix issue #66.
 	bvh8.Build( vertices );
 	ConvertFrom( bvh8 );
@@ -3009,7 +3009,7 @@ void BVH8_CWBVH::ConvertFrom( BVH8& original )
 {
 	// get a copy of the original bvh8
 	if (&original != &bvh8) ownBVH8 = false; // bvh isn't ours; don't delete in destructor. 
-	bvh8 = original; 
+	bvh8 = original;
 	// Convert a BVH8 to the format specified in: "Efficient Incoherent Ray
 	// Traversal on GPUs Through Compressed Wide BVHs", Ylitie et al. 2017.
 	// Adapted from code by "AlanWBFT".
diff --git a/tiny_bvh_gpu.cpp b/tiny_bvh_gpu.cpp
index 3a7dd36..f38936e 100644
--- a/tiny_bvh_gpu.cpp
+++ b/tiny_bvh_gpu.cpp
@@ -26,7 +26,7 @@ static int triCount = 0, frameIdx = 0, spp = 0;
 static Kernel* init, * clear, * generate, * extend, * shade;
 static Kernel* updateCounters1, * updateCounters2, * traceShadows, * finalize;
 static Buffer* pixels, * accumulator, * raysIn, * raysOut, * connections, * triData;
-static Buffer* cwbvhNodes = 0, * cwbvhTris = 0, *noise = 0;
+static Buffer* cwbvhNodes = 0, * cwbvhTris = 0, * noise = 0;
 static size_t computeUnits;
 static uint32_t* blueNoise;
 
@@ -60,7 +60,7 @@ void AddQuad( const bvhvec3 pos, const float w, const float d, int c )
 // Blue noise from file
 void LoadBlueNoise()
 {
-	std::fstream s{ "blue_noise_128x128x8_2d.raw", s.binary | s.in }; 
+	std::fstream s{ "blue_noise_128x128x8_2d.raw", s.binary | s.in };
 	s.read( (char*)blueNoise, 128 * 128 * 4 );
 }
 
diff --git a/wavefront.cl b/wavefront.cl
index f67068d..dacaa06 100644
--- a/wavefront.cl
+++ b/wavefront.cl
@@ -25,7 +25,6 @@ struct RenderData
 __global volatile int extendTasks, shadeTasks, connectTasks;
 __global struct RenderData rd;
 
-
 // Xor32 RNG
 uint WangHash( uint s ) { s = (s ^ 61) ^ (s >> 16), s *= 9, s = s ^ (s >> 4), s *= 0x27d4eb2d; return s ^ (s >> 15); }
 uint RandomUInt( uint* seed ) { *seed ^= *seed << 13, * seed ^= *seed >> 17, * seed ^= *seed << 5; return *seed; }
@@ -63,10 +62,12 @@ float3 CosWeightedDiffReflection( const float3 N, uint* seed )
 }
 
 // PathState: path throughput, current extension ray, pixel index
+#define PATH_LAST_SPECULAR 1
+#define PATH_VIA_DIFFUSE 2
 struct PathState
 {
-	float4 T; // xyz = rgb, postponed pdf in w
-	float4 O; // pixel index and path depth in O.w
+	float4 T; // xyz = rgb, postponed MIS pdf in w
+	float4 O; // O.w: 24-bit pixel index, 4-bit path depth, 4-bit path flags
 	float4 D; // t in D.w
 	float4 hit;
 };
@@ -117,8 +118,8 @@ void kernel Generate( global struct PathState* raysOut, uint frameSeed )
 	const float u = ((float)x + RandomFloat( &seed )) / (float)get_global_size( 0 );
 	const float v = ((float)y + RandomFloat( &seed )) / (float)get_global_size( 1 );
 	const float4 P = rd.p0 + u * (rd.p1 - rd.p0) + v * (rd.p2 - rd.p0);
-	raysOut[id].T = (float4)(1, 1, 1, -1 /* pdf, or -1 for specular vertex */);
-	raysOut[id].O = (float4)(rd.eye.xyz, as_float( id << 4 /* low bits: depth */ ));
+	raysOut[id].T = (float4)(1, 1, 1, 1 );
+	raysOut[id].O = (float4)(rd.eye.xyz, as_float( (id << 8) + PATH_LAST_SPECULAR ));
 	raysOut[id].D = (float4)(fast_normalize( P.xyz - rd.eye.xyz ), 1e30f);
 	raysOut[id].hit = (float4)(1e30f, 0, 0, as_float( 0 ));
 }
@@ -161,13 +162,14 @@ void kernel Shade( global float4* accumulator,
 		const int pathId = atomic_dec( &shadeTasks ) - 1;
 		if (pathId < 0) break;
 		// fetch path data
-		float4 T4 = raysIn[pathId].T;	// xyz = rgb, postponed pdf in w
-		float4 O4 = raysIn[pathId].O;	// pixel index in O.w
-		float4 D4 = raysIn[pathId].D;	// t in D.w
+		float4 T4 = raysIn[pathId].T;		// xyz = rgb, postponed MIS pdf in w
+		float4 O4 = raysIn[pathId].O;		// O.w: pixel index, path depth, path flags
+		float4 D4 = raysIn[pathId].D;		// t in D.w
 		float4 hit = raysIn[pathId].hit;	// dist, u, v, prim
 		// prepare for shading
-		uint depth = as_uint( O4.w ) & 15;
-		uint pixelIdx = as_uint( O4.w ) >> 4;
+		uint pathState = as_uint( O4.w );
+		uint pixelIdx = pathState >> 8;
+		uint depth = (pathState >> 4) & 15;
 		uint seed = WangHash( as_uint( O4.w ) + rd.frameIdx * 17117 );
 		float3 T = T4.xyz;
 		float t = hit.x;
@@ -186,7 +188,7 @@ void kernel Shade( global float4* accumulator,
 		float3 lightColor = (float3)(20);
 		if (mat == 1 /* light source */)
 		{
-			if (T4.w == -1) accumulator[pixelIdx] += (float4)(T * lightColor, 1);
+			if (pathState & PATH_LAST_SPECULAR) accumulator[pixelIdx] += (float4)(T * lightColor, 1);
 			continue;
 		}
 		float3 vert0 = v0.xyz, vert1 = verts[vertIdx + 1].xyz, vert2 = verts[vertIdx + 2].xyz;
@@ -201,7 +203,7 @@ void kernel Shade( global float4* accumulator,
 			uint newRayIdx = atomic_inc( &extendTasks );
 			float3 R = Reflect( D, N );
 			raysOut[newRayIdx].T = (float4)(T * diff, -1 /* mark vertex as specular */);
-			raysOut[newRayIdx].O = (float4)(I + R * EPSILON, as_float( (pixelIdx << 4) + depth + 1 ));
+			raysOut[newRayIdx].O = (float4)(I + R * EPSILON, as_float( (pixelIdx << 8) + ((depth + 1) << 4) + PATH_LAST_SPECULAR ));
 			raysOut[newRayIdx].D = (float4)(R, 1e30f);
 			continue;
 		}
@@ -221,14 +223,14 @@ void kernel Shade( global float4* accumulator,
 			shadowOut[newShadowIdx].D = (float4)(L, dist - 2 * EPSILON);
 		}
 		// indirect illumination: diffuse bounce
-		if (depth < 3)
+		if (depth < 3 && (pathState & PATH_VIA_DIFFUSE) == 0 )
 		{
 			uint newRayIdx = atomic_inc( &extendTasks );
 			float3 R = CosWeightedDiffReflection( N, &seed );
 			float PDF = dot( N, R ) * INVPI;
 			T *= dot( N, R ) * BRDF * native_recip( PDF );
 			raysOut[newRayIdx].T = (float4)(T, 1);
-			raysOut[newRayIdx].O = (float4)(I + R * EPSILON, as_float( (pixelIdx << 4) + depth + 1 ));
+			raysOut[newRayIdx].O = (float4)(I + R * EPSILON, as_float( (pixelIdx << 8) + ((depth + 1) << 4) + PATH_VIA_DIFFUSE ));
 			raysOut[newRayIdx].D = (float4)(R, 1e30f);
 		}
 	}