From f91e57e88c627dd2fe69041adce7488d353b28d6 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Mon, 14 Aug 2023 20:42:01 -0400
Subject: [PATCH 01/26] Ensuring plotting correctness on windows

---
 .vscode/launch.json    |  2 +-
 .vscode/settings.json  |  3 +-
 cuda/CudaPlotPhase2.cu |  2 +-
 cuda/CudaPlotPhase3.cu | 47 +++++++-------------
 cuda/CudaPlotter.cu    | 98 ++++++++++++++++++++++--------------------
 src/util/Util.h        | 10 ++---
 6 files changed, 77 insertions(+), 85 deletions(-)
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 36ae6c2f..7ae86bf7 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -131,7 +131,7 @@
             "preLaunchTask" : "build_cuda_debug",
 
             "program": "${workspaceFolder}/build/bladebit_cuda",
-            
+
             //                 "-c", "xch1uf48n3f50xrs7zds0uek9wp9wmyza6crnex6rw8kwm3jnm39y82q5mvps6",
             // "-i", "7a709594087cca18cffa37be61bdecf9b6b465de91acb06ecb6dbe0f4a536f73",    // Yes overflow
             // "--memo", "80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef207d52406afa2b6d7d92ea778f407205bd9dca40816c1b1cacfca2a6612b93eb",
diff --git a/.vscode/settings.json b/.vscode/settings.json
index fafad2e2..6c2da21b 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -138,7 +138,8 @@
     ],
     // "cmake.buildArgs": [],
     "cmake.configureSettings": {
-        "BB_ENABLE_TESTS": "ON"
+        "BB_ENABLE_TESTS": "ON",
+        "BB_CUDA_USE_NATIVE": "ON"
     },
     "C_Cpp.dimInactiveRegions": false,
     // "cmake.generator": "Unix Makefiles"
diff --git a/cuda/CudaPlotPhase2.cu b/cuda/CudaPlotPhase2.cu
index 0e1f6480..8d2d5094 100644
--- a/cuda/CudaPlotPhase2.cu
+++ b/cuda/CudaPlotPhase2.cu
@@ -419,7 +419,7 @@ void CudaK32PlotPhase2AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocConte
         desc.sliceAlignment  = cx.diskContext->temp1Queue->BlockSize();
     }
 
-    if( cx.cfg.disableDirectDownloads )
+    if( !cx.downloadDirect )
         desc.pinnedAllocator = acx.pinnedAllocator;
 
     CudaK32Phase2& p2 = *cx.phase2;
diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu
index a9b34c62..101e8cd9 100644
--- a/cuda/CudaPlotPhase3.cu
+++ b/cuda/CudaPlotPhase3.cu
@@ -615,24 +615,24 @@ void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocConte
     auto& p3 = *cx.phase3;
 
     // Shared allocations
-    p3.devBucketCounts      = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_COUNT, acx.alignment );
-    p3.devPrunedEntryCount  = acx.devAllocator->CAlloc<uint32>( 1, acx.alignment );
+    p3.devBucketCounts     = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_COUNT, acx.alignment );
+    p3.devPrunedEntryCount = acx.devAllocator->CAlloc<uint32>( 1, acx.alignment );
 
     // Host allocations
-    p3.hostRMap             = acx.hostTempAllocator->CAlloc<RMap>( BBCU_TABLE_ALLOC_ENTRY_COUNT );     // Used for rMap and index
-    p3.hostLinePoints       = acx.hostTempAllocator->CAlloc<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );   // Used for lMap and LPs
+    p3.hostRMap            = acx.hostTempAllocator->CAlloc<RMap>( BBCU_TABLE_ALLOC_ENTRY_COUNT );     // Used for rMap and index
+    p3.hostLinePoints      = acx.hostTempAllocator->CAlloc<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );   // Used for lMap and LPs
 
     if( cx.cfg.hybrid64Mode )
     {
         Panic( "Unimplemented for 64G mode. Need to offload LMap/Line Points to disk." );
     }
 
-    if( !acx.dryRun )
-    {
-        // ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL );
-        // ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) < (uintptr_t)cx.hostTableSortedL );
-    }
-    // p3.hostBucketCounts     = acx.pinnedAllocator->CAlloc<uint32>( BBCU_BUCKET_COUNT, acx.alignment );
+    #if _DEBUG
+        if( !acx.dryRun && !cx.cfg.hybrid128Mode )
+        {
+            ASSERT( (uintptr_t)(p3.hostLinePoints + BBCU_TABLE_ALLOC_ENTRY_COUNT ) <= (uintptr_t)cx.hostTableL );
+        }
+    #endif
 
     if( acx.dryRun )
     {
@@ -704,7 +704,7 @@ void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
     desc.sliceAlignment  = acx.alignment;
     desc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
     desc.deviceAllocator = acx.devAllocator;
-    desc.pinnedAllocator = cx.cfg.disableDirectDownloads ? acx.pinnedAllocator : nullptr;
+    desc.pinnedAllocator = cx.downloadDirect ? nullptr : acx.pinnedAllocator;
 
     GpuStreamDescriptor uploadDesc = desc;
     if( cx.cfg.hybrid128Mode )
@@ -728,7 +728,7 @@ void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContex
     desc.sliceAlignment  = acx.alignment;
     desc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
     desc.deviceAllocator = acx.devAllocator;
-    desc.pinnedAllocator = cx.cfg.disableDirectDownloads ? acx.pinnedAllocator : nullptr;
+    desc.pinnedAllocator = cx.downloadDirect ? nullptr : acx.pinnedAllocator;
 
     GpuStreamDescriptor uploadDesc = desc;
     if( cx.cfg.hybrid128Mode )
@@ -738,13 +738,8 @@ void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContex
     const size_t alignment = acx.alignment;
 
     s1.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint32>( uploadDesc,  acx.dryRun );
-                    // sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
-    
     s1.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint16>( uploadDesc,  acx.dryRun );
-                    // sizeof( uint16 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
-
     s1.rMapOut  = cx.gpuDownloadStream[0]->CreateDownloadBufferT<RMap>( desc, acx.dryRun );
-                    // sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
 
     s1.rTableMarks = (uint64*)acx.devAllocator->AllocT<uint64>( GetMarkingTableBitFieldSize(), acx.alignment );
 }
@@ -758,23 +753,17 @@ void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContex
     desc.sliceAlignment  = acx.alignment;
     desc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
     desc.deviceAllocator = acx.devAllocator;
-    desc.pinnedAllocator = cx.cfg.disableDirectDownloads ? acx.pinnedAllocator : nullptr;
+    desc.pinnedAllocator = cx.downloadDirect ? nullptr : acx.pinnedAllocator;
 
     auto&        s2        = cx.phase3->step2;
     const size_t alignment = acx.alignment;
 
     s2.rMapIn = cx.gpuUploadStream[0]->CreateUploadBufferT<RMap>( desc, acx.dryRun );
-        // sizeof( RMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
-
     s2.lMapIn = cx.gpuUploadStream[0]->CreateUploadBufferT<LMap>( desc, acx.dryRun );
-        // sizeof( LMap ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
-
-    s2.lpOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint64>( desc, acx.dryRun );
-        // sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
 
+    s2.lpOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint64>( desc, acx.dryRun );
     s2.indexOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32> (desc, acx.dryRun );
-        // sizeof( uint32 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT, *acx.devAllocator, alignment, acx.dryRun );
-    
+
     s2.devLTable[0] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
     s2.devLTable[1] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
 }
@@ -788,7 +777,7 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex
     desc.sliceAlignment  = acx.alignment;
     desc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
     desc.deviceAllocator = acx.devAllocator;
-    desc.pinnedAllocator = cx.cfg.disableDirectDownloads ? acx.pinnedAllocator : nullptr;
+    desc.pinnedAllocator = cx.downloadDirect ? nullptr : acx.pinnedAllocator;
 
     auto&        s3        = cx.phase3->step3;
     const size_t alignment = acx.alignment;
@@ -808,7 +797,6 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex
     parksDesc.sliceAlignment  = RoundUpToNextBoundaryT<size_t>( DEV_MAX_PARK_SIZE, sizeof( uint64 ) );
 
     s3.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<byte>( parksDesc, acx.dryRun );
-    // cx.gpuDownloadStream[0]->CreateDownloadBuffer( devParkAllocSize, *acx.devAllocator, *acx.pinnedAllocator, alignment, acx.dryRun );
 
     if( acx.dryRun )
     {
@@ -828,9 +816,6 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex
     s3.devDeltaLinePoints = acx.devAllocator->CAlloc<uint64>( linePointAllocCount, alignment );
     s3.devIndices         = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
 
-    // s3.devParks  = acx.devAllocator->AllocT<uint64>( parkAllocSize, alignment );
-    // s3.hostParks = acx.devAllocator->AllocT<byte>  ( maxParkSize  , alignment );
-
     s3.devCTable           = acx.devAllocator->AllocT<FSE_CTable>( P3_MAX_CTABLE_SIZE, alignment );
     s3.devParkOverrunCount = acx.devAllocator->CAlloc<uint32>( 1 );
 }
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index cc9619d3..a7e25d53 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -65,6 +65,12 @@ GPU-based (CUDA) plotter
  -t2, --temp2         : Temporary directory 2. Used for temporary, shorted-lived read and writes.
                          NOTE: If only one of -t1 or -t2 is specified, both will be
                                set to the same directory.
+  
+  --no-direct-buffers : Disable using direct downloads and uploads from/to GPU and host.
+                        If this is set, intermediate buffers are used between the GPU and host,
+                        which will means slower plotting times.
+                        This is forcefully enabled on Windows to avoid limited pinnable memory.
+
 )";
 
 ///
@@ -105,6 +111,8 @@ void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
             continue;
         if( cli.ReadUnswitch( cfg.temp2DirectIO, "--no-t2-direct" ) )
             continue;
+        if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-buffers" ) )
+            continue;
         if( cli.ArgMatch( "--help", "-h" ) )
         {
             Log::Line( USAGE );
@@ -144,6 +152,9 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     cx.firstStoredTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
 
     Log::Line( "[Bladebit CUDA Plotter]" );
+    Log::Line( " Host RAM: %llu GiB", SysHost::GetTotalSystemMemory() BtoGB );
+    Log::NewLine();
+   
     CudaInit( cx );
 
     CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) );
@@ -165,11 +176,12 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     cx.plotFence  = new Fence();
     cx.parkFence  = new Fence();
 
-    #if __linux__
-        cx.downloadDirect = cfg.disableDirectDownloads ? false : true;
+    #if __WIN32
+        // #MAYBE: Add a configurable option to enable direct downloads on windows?
+        // On windows always default to using intermediate pinned buffers
+        cx.downloadDirect = false;
     #else
-        // #TODO: One windows, check if we have enough memory, if so, default to true.
-        cx.downloadDirect = true ;//false;
+        cx.downloadDirect = cfg.disableDirectDownloads ? false : true;
     #endif
 
     // cx.plotWriter = new PlotWriter( !cfg.gCfg->disableOutputDirectIO );
@@ -177,9 +189,10 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     //     cx.plotWriter->EnableDummyMode();
 
     // Need to do allocations for park serialization differently under the following conditions
-    if( cx.cfg.disableDirectDownloads || cx.cfg.hybrid128Mode ) //cx.cfg.hybrid64Mode )
+    if( cx.downloadDirect || cx.cfg.hybrid128Mode )
     {
-        cx.parkContext = new CudaK32ParkContext{};
+        cx.parkContext    = new CudaK32ParkContext{};
+        cx.useParkContext = true;
     }
 
     // Check for hybrid mode
@@ -1198,11 +1211,11 @@ void AllocBuffers( CudaK32PlotContext& cx )
     cx.hostTempAllocSize  = 0;
     cx.devAllocSize       = 0;
 
-    size_t parksPinnedSize = 0;
-
-    // If on <= 64G mode or not using direct downloads, 
+    // If on <= 128G mode or not using direct downloads, 
     // we need to use a separate buffer for downloading parks, instead of re-using exisintg ones.
-    const bool allocateParkBuffers = cx.cfg.disableDirectDownloads || cx.cfg.hybrid128Mode; //cx.cfg.hybrid64Mode;
+    // If on <= 64G mode or not using direct downloads, 
+    const bool allocateParkBuffers = cx.downloadDirect || cx.cfg.hybrid128Mode;
+    size_t parksPinnedSize = 0;
 
     // Gather the size needed first
     {
@@ -1280,40 +1293,34 @@ void AllocBuffers( CudaK32PlotContext& cx )
     Log::Line( "GPU RAM required          : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", cx.devAllocSize,
                    (double)cx.devAllocSize BtoMB, (double)cx.devAllocSize BtoGB );
 
-    Log::Line( "Allocating buffers" );
     // Now actually allocate the buffers
+    Log::Line( "Allocating buffers..." );
     CudaErrCheck( cudaMallocHost( &cx.pinnedBuffer, cx.pinnedAllocSize, cudaHostAllocDefault ) );
 
     #if _DEBUG
         cx.hostBufferTables = bbvirtallocboundednuma<byte>( cx.hostTableAllocSize );
     #else
         #if !_WIN32
-        // if( cx.downloadDirect )
             CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
-        // else
-        // {
-        //     // #TODO: On windows, first check if we have enough shared memory (512G)? 
-        //     //        and attempt to alloc that way first. Otherwise, use intermediate pinned buffers.
         #else
+            // On windows we always force the use of intermediate buffers, so we allocate on the host
             cx.hostBufferTables = bbvirtallocboundednuma<byte>( cx.hostTableAllocSize );
         #endif
-        // }
     #endif
 
-    //CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
-
     cx.hostBufferTemp = nullptr;
-#if _DEBUG
-    if( cx.hostTempAllocSize )
-        cx.hostBufferTemp   = bbvirtallocboundednuma<byte>( cx.hostTempAllocSize );
-#endif
+    #if _DEBUG
+        if( cx.hostTempAllocSize )
+            cx.hostBufferTemp = bbvirtallocboundednuma<byte>( cx.hostTempAllocSize );
+    #endif
+
     if( cx.hostBufferTemp == nullptr && cx.hostTempAllocSize )
         CudaErrCheck( cudaMallocHost( &cx.hostBufferTemp, cx.hostTempAllocSize, cudaHostAllocDefault ) );
 
     CudaErrCheck( cudaMalloc( &cx.deviceBuffer, cx.devAllocSize ) );
 
     // Warm start
-    if( true )
+    if( true )// cx.gCfg->warmStart )
     {
         FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.pinnedBuffer    , cx.pinnedAllocSize    );
         FaultMemoryPages::RunJob( *cx.threadPool, cx.threadPool->ThreadCount(), cx.hostBufferTables, cx.hostTableAllocSize );
@@ -1493,36 +1500,35 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
 
         // In disk-backed mode, we always have pinned buffers,
         // which are the same buffers used to write and read from disk.
-        GpuStreamDescriptor diskDescTables = directDesc;
-        GpuStreamDescriptor diskDescXPair  = directDesc;
-        GpuStreamDescriptor diskDescMeta   = directDesc;
+        GpuStreamDescriptor descSortedTables = directDesc;
+        GpuStreamDescriptor descXPair        = directDesc;
+        GpuStreamDescriptor descMeta         = directDesc;
 
         if( cx.cfg.hybrid128Mode )
         {
             // Temp 1 Queue
-            diskDescTables.pinnedAllocator = acx.pinnedAllocator;
-            diskDescTables.sliceAlignment  = cx.diskContext->temp1Queue->BlockSize();
+            descSortedTables.pinnedAllocator = acx.pinnedAllocator;
+            descSortedTables.sliceAlignment  = cx.diskContext->temp1Queue->BlockSize();
 
             // Temp 2 Queue
-            diskDescXPair.pinnedAllocator  = acx.pinnedAllocator;
-            diskDescXPair.sliceAlignment   = cx.diskContext->temp2Queue->BlockSize();
+            descXPair.pinnedAllocator  = acx.pinnedAllocator;
+            descXPair.sliceAlignment   = cx.diskContext->temp2Queue->BlockSize();
 
             if( cx.cfg.hybrid64Mode )
             {
-                diskDescMeta.pinnedAllocator = acx.pinnedAllocator;
-                diskDescMeta.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
+                descMeta.pinnedAllocator = acx.pinnedAllocator;
+                descMeta.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
             }
         }
 
         // In direct mode, we don't have any intermediate pinned buffers,
         // but our destination buffer is already a pinned buffer.
-        if( cx.cfg.disableDirectDownloads )
+        if( !cx.downloadDirect )
         {
-            directDesc.pinnedAllocator = acx.pinnedAllocator;
-
-            // Assign these here too in case we're not in disk-backed mode
-            diskDescTables.pinnedAllocator = acx.pinnedAllocator;
-            diskDescMeta  .pinnedAllocator = acx.pinnedAllocator;
+            directDesc      .pinnedAllocator = acx.pinnedAllocator;
+            descSortedTables.pinnedAllocator = acx.pinnedAllocator;
+            descMeta        .pinnedAllocator = acx.pinnedAllocator;
+            descXPair       .pinnedAllocator = acx.pinnedAllocator;
         }
 
 
@@ -1531,7 +1537,7 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
         /// Downloads
         ///
         cx.yOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( directDesc, acx.dryRun );
-        cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<K32Meta4>( diskDescMeta, acx.dryRun );
+        cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<K32Meta4>( descMeta, acx.dryRun );
 
         {
             // These download buffers share the same backing buffers
@@ -1545,7 +1551,7 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
             acx.pinnedAllocator->PopToMarker( pinnedMarker );
 
             // Allocate Pair at the end, to ensure we grab the highest value
-            cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<Pair>( diskDescXPair, acx.dryRun );
+            cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<Pair>( descXPair, acx.dryRun );
         }
 
         {
@@ -1553,21 +1559,21 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
             const size_t devMarker    = acx.devAllocator->Size();
             const size_t pinnedMarker = acx.pinnedAllocator->Size();
 
-            cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( diskDescTables, acx.dryRun );
-            cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint16>( diskDescTables, acx.dryRun );
+            cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( descSortedTables, acx.dryRun );
+            cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint16>( descSortedTables, acx.dryRun );
 
             acx.devAllocator->PopToMarker( devMarker );
             acx.pinnedAllocator->PopToMarker( pinnedMarker );
 
             // Allocate Pair at the end, to ensure we grab the highest value
-            cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<Pair>( diskDescTables, acx.dryRun );
+            cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<Pair>( descSortedTables, acx.dryRun );
         }
 
         ///
         /// Uploads
         ///
         cx.yIn    = cx.gpuUploadStream[0]->CreateUploadBufferT<uint32>( directDesc, acx.dryRun );
-        cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT<K32Meta4>( diskDescMeta, acx.dryRun );
+        cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT<K32Meta4>( descMeta, acx.dryRun );
 
         // These uploaded buffers share the same backing buffers
         {
@@ -1581,7 +1587,7 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
             acx.pinnedAllocator->PopToMarker( pinnedMarker );
 
             // Allocate Pair at the end, to ensure we grab the highest value
-            cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT<Pair>( diskDescXPair, acx.dryRun );
+            cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT<Pair>( descXPair, acx.dryRun );
         }
 
         /// Device-only allocations
diff --git a/src/util/Util.h b/src/util/Util.h
index 7d38cdde..5cf4753d 100644
--- a/src/util/Util.h
+++ b/src/util/Util.h
@@ -68,11 +68,11 @@
 ///
 /// Assorted utility functions
 /// 
-void Exit( int code );
-void FatalExit();
-void PanicExit();
-void FatalErrorMsg( const char* message, ... );
-void PanicErrorMsg( const char* message, ... );
+[[noreturn]] void Exit( int code );
+[[noreturn]] void FatalExit();
+[[noreturn]] void PanicExit();
+[[noreturn]] void FatalErrorMsg( const char* message, ... );
+[[noreturn]] void PanicErrorMsg( const char* message, ... );
 
 // Fatal: Post a message and exit with error
 // Panic: Same as panic, but the error is unexpected,

From e8d0ec2d74eaadbd3af705521953c6a99eec5280 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Tue, 15 Aug 2023 19:59:57 -0400
Subject: [PATCH 02/26] Fixed issue with indirect uploads.

Incorrect implementation after adding disk support.
---
 .vscode/launch.json       |   2 +-
 cuda/CudaPlotter.cu       |   5 +-
 cuda/GpuDownloadStream.cu |   2 +-
 cuda/GpuStreams.cu        | 104 ++++++++++++++------------------------
 src/main.cpp              |   5 +-
 src/util/Util.h           |   4 +-
 6 files changed, 48 insertions(+), 74 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 7ae86bf7..a5520a00 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -140,7 +140,7 @@
             // "-w -z 3 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot/tmp",
 
             // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot /home/harold/plot",
-            "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot",
+            "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk --no-direct-buffers /home/harold/plot",
             // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot",
             
             
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index a7e25d53..edec81a7 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -66,7 +66,7 @@ GPU-based (CUDA) plotter
                          NOTE: If only one of -t1 or -t2 is specified, both will be
                                set to the same directory.
   
-  --no-direct-buffers : Disable using direct downloads and uploads from/to GPU and host.
+ --no-direct-buffers : Disable using direct downloads and uploads from/to GPU and host.
                         If this is set, intermediate buffers are used between the GPU and host,
                         which will means slower plotting times.
                         This is forcefully enabled on Windows to avoid limited pinnable memory.
@@ -152,7 +152,8 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     cx.firstStoredTable = TableId::Table2 + (TableId)cx.gCfg->numDroppedTables;
 
     Log::Line( "[Bladebit CUDA Plotter]" );
-    Log::Line( " Host RAM: %llu GiB", SysHost::GetTotalSystemMemory() BtoGB );
+    Log::Line( " Host RAM        : %llu GiB", SysHost::GetTotalSystemMemory() BtoGB );
+    Log::Line( " Direct transfers: %s", cfg.disableDirectDownloads ? "false" : "true" );
     Log::NewLine();
    
     CudaInit( cx );
diff --git a/cuda/GpuDownloadStream.cu b/cuda/GpuDownloadStream.cu
index c1c2e875..4baca8fc 100644
--- a/cuda/GpuDownloadStream.cu
+++ b/cuda/GpuDownloadStream.cu
@@ -41,7 +41,7 @@ void GpuDownloadBuffer::Download( void* hostBuffer, const size_t size, cudaStrea
 
 void GpuDownloadBuffer::DownloadAndCopy( void* hostBuffer, void* finalBuffer, const size_t size, cudaStream_t workStream  )
 {
-    ASSERT( 0 );
+    Panic( "Unavailable" );
     // ASSERT( self->outgoingSequence < BBCU_BUCKET_COUNT );
     // ASSERT( hostBuffer );
     // ASSERT( workStream );
diff --git a/cuda/GpuStreams.cu b/cuda/GpuStreams.cu
index 57ece050..5c40f49a 100644
--- a/cuda/GpuStreams.cu
+++ b/cuda/GpuStreams.cu
@@ -56,6 +56,10 @@ void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t
             (void)diskBuffer->GetNextReadBuffer();
         });
     }
+    else if( !isDirect )
+    {
+        Panic( "Unimplemented!" );
+    }
 
     // Ensure the device buffer is ready for use
     CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
@@ -143,89 +147,57 @@ void GpuUploadBuffer::UploadArray( const void* hostBuffer, uint32 length, uint32
             (void)diskBuffer->GetNextReadBuffer();
         });
     }
-
-    // Upload to device buffer
-    if( !isDirect )
-    {
-        for( uint32 i = 0; i < length; i++ )
-        {
-            ASSERT( *counts );
-            totalBufferSize += *counts * (size_t)elementSize;
-            counts += countStride;
-        }
-
-        // #TODO: These should be done in a copy stream to perform the copies in the background
-        if( diskBuffer )
-        {
-            CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
-            CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );
-        }
-        else
-        {
-            CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToHost, uploadStream ) );
-            CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
-            CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], self->pinnedBuffer[index], totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );
-        }
-    }
     else
     {
         // Perform fragmented uploads
-        CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
+        const auto waitEvent = isDirect ? self->deviceEvents[index] : self->pinnedEvent[index];
+        const auto copyMode  = isDirect ? cudaMemcpyHostToDevice : cudaMemcpyHostToHost;
 
-        const byte* src = (byte*)hostBuffer;
-              byte* dst = (byte*)self->deviceBuffer[index];
+        // Wait on device or pinned buffer to be ready (depending if a direct copy or not)
+        CudaErrCheck( cudaStreamWaitEvent( uploadStream, waitEvent ) );
+
+        const byte*   src   = (byte*)hostBuffer;
+              byte*   dst   = (byte*)( isDirect ? self->deviceBuffer[index] : self->pinnedBuffer[index] );
+        const uint32* sizes = counts;
 
         for( uint32 i = 0; i < length; i++ )
         {
-            const size_t size = *counts * (size_t)elementSize;
+            const size_t size = *sizes * (size_t)elementSize;
 
-            CudaErrCheck( cudaMemcpyAsync( dst, src, size, cudaMemcpyHostToDevice, uploadStream ) );
+            CudaErrCheck( cudaMemcpyAsync( dst, src, size, copyMode, uploadStream ) );
 
             dst    += size;
             src    += srcStride;
-            counts += countStride;
+            sizes += countStride;
         }
-    }
 
-    // Signal work stream that the device buffer is ready to be used
-    CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
-
-
-
-    ///
-    /// Old pre-disk Impl
-    ///
-    // ASSERT( hostBuffer );
-    // const uint32 index = SynchronizeOutgoingSequence();
-
-    // auto stream = self->queue->GetStream();
-
-    // // Ensure the device buffer is ready for use
-    // CudaErrCheck( cudaStreamWaitEvent( stream, self->events[index] ) );
-
-    // // Perform uploads
-    // //size_t deviceCopySize = 0;
-    // const byte* src = (byte*)hostBuffer;
-    //       byte* dst = (byte*)self->deviceBuffer[index];
-
-    // for( uint32 i = 0; i < length; i++ )
-    // {
-    //     const size_t size = *counts * (size_t)elementSize;
-    //     //memcpy( dst, src, size );
-    //     CudaErrCheck( cudaMemcpyAsync( dst, src, size, cudaMemcpyHostToDevice, stream ) );
+        if( !isDirect )
+        {
+            // Set the pinned buffer as the host buffer so that we can do a sequential copy to the device now
+            hostBuffer = self->pinnedBuffer[index];
+        }
+    }
 
-    //     //deviceCopySize += size;
+    // Upload to device buffer if in non-direct mode
+    if( !isDirect )
+    {
+        for( uint32 i = 0; i < length; i++ )
+        {
+            ASSERT( *counts );
+            totalBufferSize += *counts * (size_t)elementSize;
+            counts += countStride;
+        }
 
-    //     dst    += size;
-    //     src    += srcStride;
-    //     counts += countStride;
-    // }
+        // #TODO: This should be done in a copy stream to perform the copies in the background
+        CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->deviceEvents[index] ) );
+        CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, totalBufferSize, cudaMemcpyHostToDevice, uploadStream ) );
 
-    // // Copy to device buffer
-    // //CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], cpy.dstBuffer, deviceCopySize, cudaMemcpyHostToDevice, _stream ) );
+        if( !self->diskBuffer )
+            CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
+    }
 
-    // // Signal work stream that the device buffer is ready to be used
-    // CudaErrCheck( cudaEventRecord( self->readyEvents[index], stream ) );
+    // Signal work stream that the device buffer is ready to be used
+    CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
 }
 
 void GpuUploadBuffer::UploadArrayForIndex( const uint32 index, const void* hostBuffer, uint32 length, 
diff --git a/src/main.cpp b/src/main.cpp
index ae568d1c..3450aa79 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -477,7 +477,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
     Log::Line( " Benchmark mode        : %s", cfg.benchmarkMode ? "enabled" : "disabled" );
     // Log::Line( " Output path           : %s", cfg.outputFolder );
     // Log::Line( "" );
-    
+
 
     FatalIf( plotter == nullptr, "No plotter type chosen." );
 
@@ -486,7 +486,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
 
     // Parse plotter-specific CLI
     plotter->ParseCLI( cfg, cli );
-    
+
     // Parse remaining args as output directories
     cfg.outputFolderCount = (uint32)cli.RemainingArgCount();
     FatalIf( cfg.outputFolderCount < 1, "At least one output folder must be specified." );
@@ -498,6 +498,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
     while( cli.HasArgs() )
     {
         outPath = cli.Arg();
+        FatalIf( outPath[0] == '-', "Unrecognized argument '%s'.", outPath );
 
         // Add trailing slash?
         const char endChar = outPath.back();
diff --git a/src/util/Util.h b/src/util/Util.h
index 5cf4753d..e4477e84 100644
--- a/src/util/Util.h
+++ b/src/util/Util.h
@@ -71,8 +71,8 @@
 [[noreturn]] void Exit( int code );
 [[noreturn]] void FatalExit();
 [[noreturn]] void PanicExit();
-[[noreturn]] void FatalErrorMsg( const char* message, ... );
-[[noreturn]] void PanicErrorMsg( const char* message, ... );
+void FatalErrorMsg( const char* message, ... );
+void PanicErrorMsg( const char* message, ... );
 
 // Fatal: Post a message and exit with error
 // Panic: Same as panic, but the error is unexpected,

From 58d6cd42d4e58344a31c6efc32c4e1b95190182a Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 16 Aug 2023 21:44:23 -0400
Subject: [PATCH 03/26] Fixing buffer transfers for windows disk gpu

---
 .vscode/launch.json        |  3 +-
 cuda/CudaPlotPhase3.cu     |  8 ++--
 cuda/CudaPlotter.cu        | 84 +++++++++++++++++++-------------------
 cuda/GpuStreams.cu         | 15 ++++++-
 src/tools/PlotComparer.cpp |  4 +-
 5 files changed, 62 insertions(+), 52 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index a5520a00..06dea950 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -140,7 +140,8 @@
             // "-w -z 3 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot ~/plot/tmp",
 
             // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot /home/harold/plot",
-            "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk --no-direct-buffers /home/harold/plot",
+            // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk --no-direct-buffers /home/harold/plot",
+            "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot",
             // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot",
             
             
diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu
index 101e8cd9..60e5b1d6 100644
--- a/cuda/CudaPlotPhase3.cu
+++ b/cuda/CudaPlotPhase3.cu
@@ -704,7 +704,7 @@ void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
     desc.sliceAlignment  = acx.alignment;
     desc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
     desc.deviceAllocator = acx.devAllocator;
-    desc.pinnedAllocator = cx.downloadDirect ? nullptr : acx.pinnedAllocator;
+    desc.pinnedAllocator = nullptr;
 
     GpuStreamDescriptor uploadDesc = desc;
     if( cx.cfg.hybrid128Mode )
@@ -728,7 +728,7 @@ void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContex
     desc.sliceAlignment  = acx.alignment;
     desc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
     desc.deviceAllocator = acx.devAllocator;
-    desc.pinnedAllocator = cx.downloadDirect ? nullptr : acx.pinnedAllocator;
+    desc.pinnedAllocator = nullptr;
 
     GpuStreamDescriptor uploadDesc = desc;
     if( cx.cfg.hybrid128Mode )
@@ -753,7 +753,7 @@ void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContex
     desc.sliceAlignment  = acx.alignment;
     desc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
     desc.deviceAllocator = acx.devAllocator;
-    desc.pinnedAllocator = cx.downloadDirect ? nullptr : acx.pinnedAllocator;
+    desc.pinnedAllocator = nullptr;
 
     auto&        s2        = cx.phase3->step2;
     const size_t alignment = acx.alignment;
@@ -777,7 +777,7 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex
     desc.sliceAlignment  = acx.alignment;
     desc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
     desc.deviceAllocator = acx.devAllocator;
-    desc.pinnedAllocator = cx.downloadDirect ? nullptr : acx.pinnedAllocator;
+    desc.pinnedAllocator = nullptr;
 
     auto&        s3        = cx.phase3->step3;
     const size_t alignment = acx.alignment;
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index edec81a7..31d90f3c 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -193,7 +193,9 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     if( cx.downloadDirect || cx.cfg.hybrid128Mode )
     {
         cx.parkContext    = new CudaK32ParkContext{};
-        cx.useParkContext = true;
+
+        if( cx.cfg.hybrid64Mode )
+            cx.useParkContext = true;
     }
 
     // Check for hybrid mode
@@ -1140,8 +1142,8 @@ void UploadBucketForTable( CudaK32PlotContext& cx, const uint64 bucket )
 
     const uint32* hostY        = cx.hostY;
     const uint32* hostMeta     = cx.hostMeta;
-    const uint32* hostPairsL   = cx.hostTableL; //cx.hostBackPointers[6].left;
-    const uint16* hostPairsR   = cx.hostTableR; //cx.hostBackPointers[6].right;
+    const uint32* hostPairsL   = cx.hostTableL;
+    const uint16* hostPairsR   = cx.hostTableR;
 
     const bool   uploadCompressed   = cx.table > TableId::Table2 && (uint32)cx.table-1 <= cx.gCfg->numDroppedTables;
     const bool   uploadInlinedPairs = !uploadCompressed && (uint32)cx.table == cx.gCfg->numDroppedTables+2;
@@ -1271,8 +1273,9 @@ void AllocBuffers( CudaK32PlotContext& cx )
         // May need to allocate extra pinned buffers for park buffers
         if( allocateParkBuffers )
         {
+            pinnedAllocator = {};
             AllocateParkSerializationBuffers( cx, *acx.pinnedAllocator, acx.dryRun );
-            parksPinnedSize = acx.pinnedAllocator->Size();
+            parksPinnedSize = pinnedAllocator.Size();
         }
     }
 
@@ -1301,12 +1304,18 @@ void AllocBuffers( CudaK32PlotContext& cx )
     #if _DEBUG
         cx.hostBufferTables = bbvirtallocboundednuma<byte>( cx.hostTableAllocSize );
     #else
-        #if !_WIN32
-            CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
-        #else
+
+        bool allocateHostTablesPinned = cx.downloadDirect;
+        #if _WIN32
             // On windows we always force the use of intermediate buffers, so we allocate on the host
-            cx.hostBufferTables = bbvirtallocboundednuma<byte>( cx.hostTableAllocSize );
+            allocateHostTablesPinned = true;
         #endif
+        
+        Log::Line( "Table pairs allocated as pinned: %s", allocateHostTablesPinned ? "true" : "false" );
+        if( allocateHostTablesPinned )
+            CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
+        else
+            cx.hostBufferTables = bbvirtallocboundednuma<byte>( cx.hostTableAllocSize );
     #endif
 
     cx.hostBufferTemp = nullptr;
@@ -1491,29 +1500,23 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
 
     /// Device & Pinned allocations
     {
-        GpuStreamDescriptor directDesc{};
-        directDesc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
-        directDesc.sliceCount      = BBCU_BUCKET_COUNT;
-        directDesc.sliceAlignment  = alignment;
-        directDesc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
-        directDesc.deviceAllocator = acx.devAllocator;
-        directDesc.pinnedAllocator = nullptr;             // Start in direct mode (no intermediate pinined buffers)
+        GpuStreamDescriptor yDesc{};
+        yDesc.entriesPerSlice = BBCU_MAX_SLICE_ENTRY_COUNT;
+        yDesc.sliceCount      = BBCU_BUCKET_COUNT;
+        yDesc.sliceAlignment  = alignment;
+        yDesc.bufferCount     = BBCU_DEFAULT_GPU_BUFFER_COUNT;
+        yDesc.deviceAllocator = acx.devAllocator;
+        yDesc.pinnedAllocator = nullptr;             // Start in direct mode (no intermediate pinined buffers)
 
         // In disk-backed mode, we always have pinned buffers,
         // which are the same buffers used to write and read from disk.
-        GpuStreamDescriptor descSortedTables = directDesc;
-        GpuStreamDescriptor descXPair        = directDesc;
-        GpuStreamDescriptor descMeta         = directDesc;
+        GpuStreamDescriptor descTablePairs   = yDesc;
+        GpuStreamDescriptor descMeta         = yDesc;
 
         if( cx.cfg.hybrid128Mode )
         {
-            // Temp 1 Queue
-            descSortedTables.pinnedAllocator = acx.pinnedAllocator;
-            descSortedTables.sliceAlignment  = cx.diskContext->temp1Queue->BlockSize();
-
-            // Temp 2 Queue
-            descXPair.pinnedAllocator  = acx.pinnedAllocator;
-            descXPair.sliceAlignment   = cx.diskContext->temp2Queue->BlockSize();
+            descTablePairs.pinnedAllocator = acx.pinnedAllocator;
+            descTablePairs.sliceAlignment  = cx.diskContext->temp1Queue->BlockSize();
 
             if( cx.cfg.hybrid64Mode )
             {
@@ -1522,22 +1525,17 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
             }
         }
 
-        // In direct mode, we don't have any intermediate pinned buffers,
-        // but our destination buffer is already a pinned buffer.
         if( !cx.downloadDirect )
         {
-            directDesc      .pinnedAllocator = acx.pinnedAllocator;
-            descSortedTables.pinnedAllocator = acx.pinnedAllocator;
-            descMeta        .pinnedAllocator = acx.pinnedAllocator;
-            descXPair       .pinnedAllocator = acx.pinnedAllocator;
+            // Use intermediate pinned buffer for transfers to non-pinned destinations
+            descTablePairs.pinnedAllocator = acx.pinnedAllocator;
         }
 
 
-
         ///
         /// Downloads
         ///
-        cx.yOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( directDesc, acx.dryRun );
+        cx.yOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( yDesc, acx.dryRun );
         cx.metaOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<K32Meta4>( descMeta, acx.dryRun );
 
         {
@@ -1545,14 +1543,14 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
             const size_t devMarker    = acx.devAllocator->Size();
             const size_t pinnedMarker = acx.pinnedAllocator->Size();
 
-            cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( directDesc, acx.dryRun );
-            cx.pairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint16>( directDesc, acx.dryRun );
+            cx.pairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( descTablePairs, acx.dryRun );
+            cx.pairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint16>( descTablePairs, acx.dryRun );
 
             acx.devAllocator->PopToMarker( devMarker );
             acx.pinnedAllocator->PopToMarker( pinnedMarker );
 
             // Allocate Pair at the end, to ensure we grab the highest value
-            cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<Pair>( descXPair, acx.dryRun );
+            cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<Pair>( descTablePairs, acx.dryRun );
         }
 
         {
@@ -1560,20 +1558,20 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
             const size_t devMarker    = acx.devAllocator->Size();
             const size_t pinnedMarker = acx.pinnedAllocator->Size();
 
-            cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( descSortedTables, acx.dryRun );
-            cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint16>( descSortedTables, acx.dryRun );
+            cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( descTablePairs, acx.dryRun );
+            cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint16>( descTablePairs, acx.dryRun );
 
             acx.devAllocator->PopToMarker( devMarker );
             acx.pinnedAllocator->PopToMarker( pinnedMarker );
 
             // Allocate Pair at the end, to ensure we grab the highest value
-            cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<Pair>( descSortedTables, acx.dryRun );
+            cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<Pair>( descTablePairs, acx.dryRun );
         }
 
         ///
         /// Uploads
         ///
-        cx.yIn    = cx.gpuUploadStream[0]->CreateUploadBufferT<uint32>( directDesc, acx.dryRun );
+        cx.yIn    = cx.gpuUploadStream[0]->CreateUploadBufferT<uint32>( yDesc, acx.dryRun );
         cx.metaIn = cx.gpuUploadStream[0]->CreateUploadBufferT<K32Meta4>( descMeta, acx.dryRun );
 
         // These uploaded buffers share the same backing buffers
@@ -1581,14 +1579,14 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
             const size_t devMarker    = acx.devAllocator->Size();
             const size_t pinnedMarker = acx.pinnedAllocator->Size();
 
-            cx.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint32>( directDesc, acx.dryRun );
-            cx.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint16>( directDesc, acx.dryRun );
+            cx.pairsLIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint32>( descTablePairs, acx.dryRun );
+            cx.pairsRIn = cx.gpuUploadStream[0]->CreateUploadBufferT<uint16>( descTablePairs, acx.dryRun );
 
             acx.devAllocator->PopToMarker( devMarker );
             acx.pinnedAllocator->PopToMarker( pinnedMarker );
 
             // Allocate Pair at the end, to ensure we grab the highest value
-            cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT<Pair>( descXPair, acx.dryRun );
+            cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT<Pair>( descTablePairs, acx.dryRun );
         }
 
         /// Device-only allocations
diff --git a/cuda/GpuStreams.cu b/cuda/GpuStreams.cu
index 5c40f49a..63700c9c 100644
--- a/cuda/GpuStreams.cu
+++ b/cuda/GpuStreams.cu
@@ -50,7 +50,7 @@ void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t
             diskBuffer->ReadNextBucket();
 
             // Block until the buffer is fully read from disk
-            // #TODO: Also not do this here, but in a disk stream,
+            // #TODO: Also should not do this here, but in a host-to-host background stream,
             //        so that the next I/O read can happen in the background while
             //        the previous upload to disk is happening, if needed.
             (void)diskBuffer->GetNextReadBuffer();
@@ -58,7 +58,12 @@ void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t
     }
     else if( !isDirect )
     {
-        Panic( "Unimplemented!" );
+        // Copy from unpinned to pinned first
+        // #TODO: This should be done in a different backgrund host-to-host copy stream
+        CudaErrCheck( cudaStreamWaitEvent( uploadStream, self->pinnedEvent[index] ) );
+        CudaErrCheck( cudaMemcpyAsync( self->pinnedBuffer[index], hostBuffer, size, cudaMemcpyHostToHost, uploadStream ) );
+
+        hostBuffer = self->pinnedBuffer[index];
     }
 
     // Ensure the device buffer is ready for use
@@ -67,6 +72,12 @@ void GpuUploadBuffer::Upload( const void* hostBuffer, size_t size, cudaStream_t
     // Upload to the device buffer
     CudaErrCheck( cudaMemcpyAsync( self->deviceBuffer[index], hostBuffer, size, cudaMemcpyHostToDevice, uploadStream ) );
 
+    if( !isDirect )
+    {
+        // Signal that the pinned buffer is ready for re-use
+        CudaErrCheck( cudaEventRecord( self->pinnedEvent[index], uploadStream ) );
+    }
+
     // Signal work stream that the device buffer is ready to be used
     CudaErrCheck( cudaEventRecord( self->readyEvents[index], uploadStream ) );
 }
diff --git a/src/tools/PlotComparer.cpp b/src/tools/PlotComparer.cpp
index 625074b6..f275d980 100644
--- a/src/tools/PlotComparer.cpp
+++ b/src/tools/PlotComparer.cpp
@@ -105,12 +105,12 @@ void PlotCompareMain( GlobalPlotConfig& gCfg, CliParser& cli )
     // TestTable( refPlot, tgtPlot, TableId::Table7 );
     // TestTable( refPlot, tgtPlot, TableId::Table3 );
 
-    // TestC3Table( refPlot, tgtPlot );
+    TestC3Table( refPlot, tgtPlot );
 
     for( TableId table = TableId::Table1; table <= TableId::Table7; table++ )
         TestTable( refPlot, tgtPlot, table );
 
-    TestC3Table( refPlot, tgtPlot );
+    // TestC3Table( refPlot, tgtPlot );
 }
 
 //-----------------------------------------------------------

From b18862403ccfc92b7bd3bf46dde377310526e132 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 16 Aug 2023 22:49:14 -0400
Subject: [PATCH 04/26] preprocessor def fix

---
 cuda/CudaPlotter.cu | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index 31d90f3c..40d36324 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -177,7 +177,7 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     cx.plotFence  = new Fence();
     cx.parkFence  = new Fence();
 
-    #if __WIN32
+    #if _WIN32
         // #MAYBE: Add a configurable option to enable direct downloads on windows?
         // On windows always default to using intermediate pinned buffers
         cx.downloadDirect = false;
@@ -1510,13 +1510,20 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
 
         // In disk-backed mode, we always have pinned buffers,
         // which are the same buffers used to write and read from disk.
-        GpuStreamDescriptor descTablePairs   = yDesc;
-        GpuStreamDescriptor descMeta         = yDesc;
+        GpuStreamDescriptor descTablePairs       = yDesc;
+        GpuStreamDescriptor descTableSortedPairs = yDesc;
+        GpuStreamDescriptor descXPairs           = yDesc;
+        GpuStreamDescriptor descMeta             = yDesc;
 
         if( cx.cfg.hybrid128Mode )
         {
-            descTablePairs.pinnedAllocator = acx.pinnedAllocator;
-            descTablePairs.sliceAlignment  = cx.diskContext->temp1Queue->BlockSize();
+            // Temp 1 Queue
+            descTableSortedPairs.pinnedAllocator = acx.pinnedAllocator;
+            descTableSortedPairs.sliceAlignment  = cx.diskContext->temp1Queue->BlockSize();
+
+            // Temp 2 Queue
+            descXPairs.pinnedAllocator   = acx.pinnedAllocator;
+            descXPairs.sliceAlignment    = cx.diskContext->temp2Queue->BlockSize();
 
             if( cx.cfg.hybrid64Mode )
             {

From c27d1e648dbd80ced693d2488bcd7e2115090a5f Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 16 Aug 2023 23:25:01 -0400
Subject: [PATCH 05/26] Set correct descriptors during transfer setup

---
 cuda/CudaPlotter.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index 40d36324..1bc89671 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -1557,7 +1557,7 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
             acx.pinnedAllocator->PopToMarker( pinnedMarker );
 
             // Allocate Pair at the end, to ensure we grab the highest value
-            cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<Pair>( descTablePairs, acx.dryRun );
+            cx.xPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<Pair>( descXPairs, acx.dryRun );
         }
 
         {
@@ -1565,14 +1565,14 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
             const size_t devMarker    = acx.devAllocator->Size();
             const size_t pinnedMarker = acx.pinnedAllocator->Size();
 
-            cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( descTablePairs, acx.dryRun );
-            cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint16>( descTablePairs, acx.dryRun );
+            cx.sortedPairsLOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( descTableSortedPairs, acx.dryRun );
+            cx.sortedPairsROut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint16>( descTableSortedPairs, acx.dryRun );
 
             acx.devAllocator->PopToMarker( devMarker );
             acx.pinnedAllocator->PopToMarker( pinnedMarker );
 
             // Allocate Pair at the end, to ensure we grab the highest value
-            cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<Pair>( descTablePairs, acx.dryRun );
+            cx.sortedXPairsOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<Pair>( descXPairs, acx.dryRun );
         }
 
         ///
@@ -1593,7 +1593,7 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
             acx.pinnedAllocator->PopToMarker( pinnedMarker );
 
             // Allocate Pair at the end, to ensure we grab the highest value
-            cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT<Pair>( descTablePairs, acx.dryRun );
+            cx.xPairsIn = cx.gpuUploadStream[0]->CreateUploadBufferT<Pair>( descXPairs, acx.dryRun );
         }
 
         /// Device-only allocations

From a55bfd153c81ac76b41ce63f659185204b794930 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Thu, 17 Aug 2023 19:45:32 -0400
Subject: [PATCH 06/26] Trivial fixes

---
 cuda/CudaPlotter.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index 1bc89671..b2175e28 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -56,7 +56,7 @@ GPU-based (CUDA) plotter
 [OPTIONS]:
  -h, --help           : Shows this help message and exits.
  -d, --device         : Select the CUDA device index. (default=0)
- 
+
  --disk-128           : Enable hybrid disk plotting for 128G system RAM. 
                          Requires a --temp1 and --temp2 to be set.
  --disk-64            : Enable hybrid disk plotting for 64G system RAM. 
@@ -65,7 +65,7 @@ GPU-based (CUDA) plotter
  -t2, --temp2         : Temporary directory 2. Used for temporary, shorted-lived read and writes.
                          NOTE: If only one of -t1 or -t2 is specified, both will be
                                set to the same directory.
-  
+
  --no-direct-buffers : Disable using direct downloads and uploads from/to GPU and host.
                         If this is set, intermediate buffers are used between the GPU and host,
                         which will means slower plotting times.

From 2eeee84d90503b6429ba4aead860055da69498c5 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Tue, 22 Aug 2023 14:08:20 -0400
Subject: [PATCH 07/26] Supporting 16G instead of 64G - phase 1

---
 .vscode/launch.json    |  4 +--
 cuda/CudaPlotContext.h |  6 ++--
 cuda/CudaPlotPhase3.cu |  3 +-
 cuda/CudaPlotter.cu    | 62 +++++++++++++++++++++++++++++++++---------
 cuda/chacha8.cu        |  3 ++
 5 files changed, 60 insertions(+), 18 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 06dea950..19e72437 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -141,8 +141,8 @@
 
             // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot /home/harold/plot",
             // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk --no-direct-buffers /home/harold/plot",
-            "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot",
-            // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot",
+            // "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-128 -t1 /home/harold/plotdisk /home/harold/plot",
+            "-w -z 1 -f ade0cc43610ce7540ab96a524d0ab17f5df7866ef13d1221a7203e5d10ad2a4ae37f7b73f6cdfd6ddf4122e8a1c2f8ef -p 80a836a74b077cabaca7a76d1c3c9f269f7f3a8f2fa196a65ee8953eb81274eb8b7328d474982617af5a0fe71b47e9b8 -i c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835 cudaplot --disk-64 -t1 /home/harold/plotdisk /home/harold/plot",
             
             
             "windows": {
diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h
index c313b696..7fc29777 100644
--- a/cuda/CudaPlotContext.h
+++ b/cuda/CudaPlotContext.h
@@ -47,8 +47,10 @@ struct CudaK32HybridMode
     DiskQueue*  temp1Queue;  // Tables Queue
     DiskQueue*  temp2Queue;  // Metadata Queue (could be the same as temp1Queue)
 
-    DiskBucketBuffer* metaBuffer;   // Enabled in 64G mode
-    DiskBucketBuffer* unsortedXs;   // Unsorted Xs are written to disk (uint64 entries)
+    DiskBucketBuffer* metaBuffer;   // Enabled in < 128G mode
+    DiskBucketBuffer* yBuffer;      // Enabled in < 128G mode
+    DiskBucketBuffer* unsortedL;    // Unsorted Xs (or L pairs in < 128G) are written to disk (uint64 entries)
+    DiskBucketBuffer* unsortedR;    // Unsorted R pairs in < 128G mode
 
     DiskBuffer*       tablesL[7];
     DiskBuffer*       tablesR[7];
diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu
index 60e5b1d6..9df0b09a 100644
--- a/cuda/CudaPlotPhase3.cu
+++ b/cuda/CudaPlotPhase3.cu
@@ -203,7 +203,7 @@ __global__ void PruneAndWriteRMap(
 */
 //-----------------------------------------------------------
 void CudaK32PlotPhase3( CudaK32PlotContext& cx )
-{
+{Log::Line("That's all for now");Exit(0);
     // Set-up our context
     memset( cx.phase3->prunedBucketCounts    , 0, sizeof( cx.phase3->prunedBucketCounts ) );
     memset( cx.phase3->prunedTableEntryCounts, 0, sizeof( cx.phase3->prunedTableEntryCounts ) );
@@ -612,6 +612,7 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
 //-----------------------------------------------------------
 void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
 {
+return;
     auto& p3 = *cx.phase3;
 
     // Shared allocations
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index b2175e28..3e6700b7 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -321,7 +321,7 @@ void CudaK32Plotter::Run( const PlotRequest& req )
     if( cx.plotRequest.IsFinalPlot && cx.cfg.hybrid128Mode )
     {
         if( cx.diskContext->metaBuffer ) delete cx.diskContext->metaBuffer;
-        if( cx.diskContext->unsortedXs ) delete cx.diskContext->unsortedXs;
+        if( cx.diskContext->unsortedL ) delete cx.diskContext->unsortedL;
 
         for( TableId t = TableId::Table1; t <= TableId::Table7; t++ )
         {
@@ -506,13 +506,17 @@ void FpTable( CudaK32PlotContext& cx )
 
     if( cx.cfg.hybrid128Mode )
     {
-        if( cx.table == cx.firstStoredTable || cx.table == cx.firstStoredTable + 1 )
+        if( cx.cfg.hybrid64Mode || cx.table == cx.firstStoredTable || cx.table == cx.firstStoredTable + 1 )
         {
-            cx.diskContext->unsortedXs->Swap();
+            cx.diskContext->unsortedL->Swap();
         }
 
         if( cx.cfg.hybrid64Mode )
+        {
+            cx.diskContext->yBuffer->Swap();
             cx.diskContext->metaBuffer->Swap();
+            cx.diskContext->unsortedR->Swap();
+        }
     }
 
     cx.yIn     .Reset();
@@ -1003,6 +1007,9 @@ void FinalizeTable7( CudaK32PlotContext& cx )
     {
         cx.diskContext->tablesL[(int)TableId::Table7]->Swap();
         cx.diskContext->tablesR[(int)TableId::Table7]->Swap();
+
+        if( cx.cfg.hybrid64Mode )
+            cx.diskContext->yBuffer->Swap();
     }
 
     auto elapsed = TimerEnd( timer );
@@ -1394,20 +1401,24 @@ void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
         // Temp allocations are pinned host buffers that can be re-used for other means in different phases.
         // This is roughly equivalent to temp2 dir during disk plotting.
 
-        cx.hostY = acx.hostTempAllocator->CAlloc<uint32>( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
 
         if( !cx.cfg.hybrid64Mode )
         {
+            cx.hostY = acx.hostTempAllocator->CAlloc<uint32>( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
             cx.hostMeta = acx.hostTempAllocator->CAlloc<uint32>( BBCU_TABLE_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER, alignment );
         }
         else if( !cx.diskContext->metaBuffer )
         {
+            const size_t ySliceSize    = sizeof( uint32 ) * BBCU_MAX_SLICE_ENTRY_COUNT;
             const size_t metaSliceSize = sizeof( uint32 ) * BBCU_META_SLICE_ENTRY_COUNT;
 
+            cx.diskContext->yBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "y.tmp", 
+                                            BBCU_BUCKET_COUNT, ySliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags );
+            FatalIf( !cx.diskContext->yBuffer, "Failed to create y.tmp disk buffer." );
+
             cx.diskContext->metaBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "metadata.tmp", 
                                             BBCU_BUCKET_COUNT, metaSliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags );
-
-            FatalIf( !cx.diskContext->metaBuffer, "Failed to create metadata disk buffer." );
+            FatalIf( !cx.diskContext->metaBuffer, "Failed to create metadata.tmp disk buffer." );
         }
 Log::Line( "Host Temp @ %llu GiB", (llu)acx.hostTempAllocator->Size() BtoGB );
 Log::Line( "Host Tables B @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB );
@@ -1486,14 +1497,24 @@ Log::Line( "Host Tables B @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
                 multiplier = 1;
             }
 
-            cx.hostTableL = acx.hostTableAllocator->CAlloc<uint32>( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
-            cx.hostTableR = acx.hostTableAllocator->CAlloc<uint16>( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
-
             // When storing unsorted inlined x's, we don't have enough space in RAM, store i disk instead.
             const size_t xSliceSize = BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( Pair );
-            cx.diskContext->unsortedXs = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "unsorted_x.tmp", 
+            cx.diskContext->unsortedL = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "unsorted_l.tmp", 
                                                                    BBCU_BUCKET_COUNT, xSliceSize, fileMode, FileAccess::ReadWrite, tmp2FileFlags );
-            FatalIf( !cx.diskContext->unsortedXs, "Failed to create unsorted_x.tmp disk buffer." );
+            FatalIf( !cx.diskContext->unsortedL, "Failed to create unsorted_l.tmp disk buffer." );
+
+            if( cx.cfg.hybrid64Mode )
+            {
+                cx.diskContext->unsortedR = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "unsorted_r.tmp", 
+                                                                    BBCU_BUCKET_COUNT, BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( uint16 ), fileMode, FileAccess::ReadWrite, tmp2FileFlags );
+                FatalIf( !cx.diskContext->unsortedR, "Failed to create unsorted_r.tmp disk buffer." );
+            }
+            else
+            {
+                // In 128G mode we can store intermediate pairs in the host
+                cx.hostTableL = acx.hostTableAllocator->CAlloc<uint32>( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+                cx.hostTableR = acx.hostTableAllocator->CAlloc<uint16>( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
+            }
         }
     }
 Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB );
@@ -1527,8 +1548,14 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
 
             if( cx.cfg.hybrid64Mode )
             {
+                yDesc.pinnedAllocator = acx.pinnedAllocator;
+                yDesc.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
+
                 descMeta.pinnedAllocator = acx.pinnedAllocator;
                 descMeta.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
+
+                descTablePairs.pinnedAllocator = acx.pinnedAllocator;
+                descTablePairs.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
             }
         }
 
@@ -1629,11 +1656,20 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
     /// In disk-backed mode, assign disk buffers to gpu buffers
     if( cx.cfg.hybrid128Mode && !acx.dryRun )
     {
-        cx.xPairsOut.AssignDiskBuffer( cx.diskContext->unsortedXs );
-        cx.xPairsIn .AssignDiskBuffer( cx.diskContext->unsortedXs );
+        cx.xPairsOut.AssignDiskBuffer( cx.diskContext->unsortedL );
+        cx.xPairsIn .AssignDiskBuffer( cx.diskContext->unsortedL );
 
         if( cx.cfg.hybrid64Mode )
         {
+            cx.pairsLOut.AssignDiskBuffer( cx.diskContext->unsortedL );
+            cx.pairsLIn .AssignDiskBuffer( cx.diskContext->unsortedL );
+
+            cx.pairsROut.AssignDiskBuffer( cx.diskContext->unsortedR );
+            cx.pairsRIn .AssignDiskBuffer( cx.diskContext->unsortedR );
+
+            cx.yOut.AssignDiskBuffer( cx.diskContext->yBuffer );
+            cx.yIn .AssignDiskBuffer( cx.diskContext->yBuffer );
+
             cx.metaOut.AssignDiskBuffer( cx.diskContext->metaBuffer );
             cx.metaIn .AssignDiskBuffer( cx.diskContext->metaBuffer );
         }
diff --git a/cuda/chacha8.cu b/cuda/chacha8.cu
index 2aca03e5..ead1f67c 100644
--- a/cuda/chacha8.cu
+++ b/cuda/chacha8.cu
@@ -250,7 +250,10 @@ void GenF1Cuda( CudaK32PlotContext& cx )
     cx.metaOut.Reset();
 
     if( cx.cfg.hybrid64Mode )
+    {
+        cx.diskContext->yBuffer->Swap();
         cx.diskContext->metaBuffer->Swap();
+    }
 }
 
 ///

From 4fc3d977910ec78ab47e9d7e119d8fafcf3f5db1 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Tue, 22 Aug 2023 18:00:47 -0400
Subject: [PATCH 08/26] Phase 3 updates Getting phase 3 updated for lower RAM
 reqs.

Not currently working - 0 values coming from the first
disk buffer.
---
 .vscode/launch.json         |  4 +-
 cuda/CudaPlotConfig.h       |  5 ++-
 cuda/CudaPlotContext.h      | 17 +++++--
 cuda/CudaPlotPhase3.cu      | 88 ++++++++++++++++++++++++++++++++++---
 cuda/CudaPlotPhase3Step3.cu |  4 ++
 cuda/CudaPlotter.cu         |  2 +
 src/tools/PlotComparer.cpp  |  2 +-
 7 files changed, 109 insertions(+), 13 deletions(-)

diff --git a/.vscode/launch.json b/.vscode/launch.json
index 19e72437..bceef625 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -358,8 +358,8 @@
 
                 /// Compare
                 "plotcmp",
-                "/home/harold/plot/plot-k32-c01-2023-08-09-20-50-0a1b7c85644fcb9c274c5b75060ffd2a718c3c246fa24cba4399e1106d042172.plot.ref",
-                "/home/harold/plot/plot-k32-c01-2023-08-09-21-33-0a1b7c85644fcb9c274c5b75060ffd2a718c3c246fa24cba4399e1106d042172.plot",
+                "/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
+                "/home/harold/plot/plot-k32-c01-2023-08-22-16-21-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot",
 
                 // "/home/harold/plot/plot-k32-c01-2023-08-03-22-59-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
                 // "/home/harold/plot/jmplot-c01-c6b84729c23dc6d60c92f22c17083f47845c1179227c5509f07a5d2804a7b835.plot"
diff --git a/cuda/CudaPlotConfig.h b/cuda/CudaPlotConfig.h
index 8fa77588..5861d64f 100644
--- a/cuda/CudaPlotConfig.h
+++ b/cuda/CudaPlotConfig.h
@@ -48,8 +48,8 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
     // #define DBG_BBCU_REF_DIR       "/home/harold/plots/ref/"
 
 
-    // #define BBCU_DBG_SKIP_PHASE_1   1   // Skip phase 1 and load pairs from disk
-    // #define BBCU_DBG_SKIP_PHASE_2   1   // Skip phase 1 and 2 and load pairs and marks from disk
+    #define BBCU_DBG_SKIP_PHASE_1   1   // Skip phase 1 and load pairs from disk
+    #define BBCU_DBG_SKIP_PHASE_2   1   // Skip phase 1 and 2 and load pairs and marks from disk
 
     #if (defined( BBCU_DBG_SKIP_PHASE_2 ) && !defined( BBCU_DBG_SKIP_PHASE_1 ) )
         #define BBCU_DBG_SKIP_PHASE_1 1
@@ -60,6 +60,7 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
     // #define DBG_BBCU_P2_WRITE_MARKS   1
 
     // #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1
+    #define DBG_BBCU_KEEP_TEMP_FILES 1
 
 
     #define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) ASSERT( (b1+size) <= b0 || b1 >= (b0+size) )
diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h
index 7fc29777..eadd5fca 100644
--- a/cuda/CudaPlotContext.h
+++ b/cuda/CudaPlotContext.h
@@ -60,8 +60,19 @@ struct CudaK32HybridMode
 
     struct
     {
-        DiskBucketBuffer* lpOut;
-        DiskBucketBuffer* indexOut;
+        // #NOTE: These are an alias to the unsortedL buffer from phase 1.
+        //        The same file & disk buffer is repurposed for this usage.
+        union {
+            DiskBucketBuffer* lpBuffer;
+            DiskBucketBuffer* lMapBuffer;
+        };
+
+        // #NOTE: These are an alias to metaBuffer from phase 1.
+        //        The same file & disk buffer is repurposed for phase 3.
+        union {
+            DiskBucketBuffer* indexBuffer;
+            DiskBucketBuffer* rMapBuffer;
+        };
 
     } phase3;
 };
@@ -153,7 +164,7 @@ struct CudaK32Phase3
     struct {
         GpuUploadBuffer   lpIn;         // Line points from step 2
         GpuUploadBuffer   indexIn;      // Indices from step 2
-        GpuDownloadBuffer mapOut;       // lTable for next step 1
+        GpuDownloadBuffer mapOut;       // lTable for next step 2
         GpuDownloadBuffer parksOut;     // Downloads park buffers to host
 
         uint32*           hostParkOverrunCount;
diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu
index 9df0b09a..19c2c592 100644
--- a/cuda/CudaPlotPhase3.cu
+++ b/cuda/CudaPlotPhase3.cu
@@ -203,7 +203,7 @@ __global__ void PruneAndWriteRMap(
 */
 //-----------------------------------------------------------
 void CudaK32PlotPhase3( CudaK32PlotContext& cx )
-{Log::Line("That's all for now");Exit(0);
+{
     // Set-up our context
     memset( cx.phase3->prunedBucketCounts    , 0, sizeof( cx.phase3->prunedBucketCounts ) );
     memset( cx.phase3->prunedTableEntryCounts, 0, sizeof( cx.phase3->prunedTableEntryCounts ) );
@@ -235,6 +235,13 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx )
     }
     #endif
 
+    if( cx.cfg.hybrid64Mode )
+    {
+        cx.diskContext->phase3.lMapBuffer->Swap();
+        cx.diskContext->phase3.rMapBuffer->Swap();
+    }
+
+
     const uint32 compressionLevel = cx.gCfg->compressionLevel;
 
     // Special case with the starting table, since it has the values inlined already
@@ -250,6 +257,7 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx )
         CompressInlinedTable( cx );
         auto elapsed = TimerEnd( timer );
         Log::Line( " Step 1 completed step in %.2lf seconds.", elapsed );
+
         timer = TimerBegin();
         CudaK32PlotPhase3Step3( cx );
 
@@ -598,6 +606,12 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
             p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i];
     }
 
+    if( cx.cfg.hybrid64Mode )
+    {
+        cx.diskContext->phase3.lpBuffer->Swap();
+        cx.diskContext->phase3.indexBuffer->Swap();
+    }
+
 // #if _DEBUG
 //     // DbgValidateIndices( cx );
 //     DbgValidateStep2Output( cx );
@@ -612,7 +626,6 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
 //-----------------------------------------------------------
 void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
 {
-return;
     auto& p3 = *cx.phase3;
 
     // Shared allocations
@@ -623,9 +636,12 @@ return;
     p3.hostRMap            = acx.hostTempAllocator->CAlloc<RMap>( BBCU_TABLE_ALLOC_ENTRY_COUNT );     // Used for rMap and index
     p3.hostLinePoints      = acx.hostTempAllocator->CAlloc<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );   // Used for lMap and LPs
 
+    
     if( cx.cfg.hybrid64Mode )
     {
-        Panic( "Unimplemented for 64G mode. Need to offload LMap/Line Points to disk." );
+        // Re-purpose these disk buffers for our use
+        cx.diskContext->phase3.rMapBuffer = cx.diskContext->metaBuffer;
+        cx.diskContext->phase3.lMapBuffer = cx.diskContext->unsortedL;
     }
 
     #if _DEBUG
@@ -709,8 +725,13 @@ void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
 
     GpuStreamDescriptor uploadDesc = desc;
     if( cx.cfg.hybrid128Mode )
+    {
         uploadDesc.pinnedAllocator = acx.pinnedAllocator;
 
+        if( cx.cfg.hybrid64Mode )
+            desc.pinnedAllocator = acx.pinnedAllocator;
+    }
+
     auto& tx = cx.phase3->xTable;
 
     tx.devRMarks = (uint64*)acx.devAllocator->AllocT<uint64>( GetMarkingTableBitFieldSize(), acx.alignment );
@@ -718,6 +739,12 @@ void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
     tx.xIn       = cx.gpuUploadStream[0]->CreateUploadBufferT<Pair>( uploadDesc, acx.dryRun );
     tx.lpOut     = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint64>( desc, acx.dryRun );
     tx.indexOut  = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( desc, acx.dryRun );
+
+    if( !acx.dryRun && cx.cfg.hybrid64Mode )
+    {
+        tx.lpOut   .AssignDiskBuffer( cx.diskContext->phase3.lpBuffer );
+        tx.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
+    }
 }
 
 //-----------------------------------------------------------
@@ -733,8 +760,13 @@ void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContex
 
     GpuStreamDescriptor uploadDesc = desc;
     if( cx.cfg.hybrid128Mode )
+    {
         uploadDesc.pinnedAllocator = acx.pinnedAllocator;
 
+        if( cx.cfg.hybrid64Mode )
+            desc.pinnedAllocator = acx.pinnedAllocator;
+    }
+
     auto&        s1        = cx.phase3->step1;
     const size_t alignment = acx.alignment;
 
@@ -743,6 +775,11 @@ void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContex
     s1.rMapOut  = cx.gpuDownloadStream[0]->CreateDownloadBufferT<RMap>( desc, acx.dryRun );
 
     s1.rTableMarks = (uint64*)acx.devAllocator->AllocT<uint64>( GetMarkingTableBitFieldSize(), acx.alignment );
+
+    if( !acx.dryRun && cx.cfg.hybrid64Mode )
+    {
+        s1.rMapOut.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer );
+    }
 }
 
 //-----------------------------------------------------------
@@ -756,6 +793,12 @@ void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContex
     desc.deviceAllocator = acx.devAllocator;
     desc.pinnedAllocator = nullptr;
 
+    GpuStreamDescriptor uploadDesc = desc;
+    if( cx.cfg.hybrid64Mode )
+    {
+        desc.pinnedAllocator = acx.pinnedAllocator;
+    }
+
     auto&        s2        = cx.phase3->step2;
     const size_t alignment = acx.alignment;
 
@@ -767,6 +810,15 @@ void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContex
 
     s2.devLTable[0] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
     s2.devLTable[1] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
+
+    if( !acx.dryRun && cx.cfg.hybrid64Mode )
+    {
+        s2.rMapIn.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer );
+        s2.lMapIn.AssignDiskBuffer( cx.diskContext->phase3.lMapBuffer );
+
+        s2.lpOut   .AssignDiskBuffer( cx.diskContext->phase3.lpBuffer );
+        s2.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
+    }
 }
 
 //-----------------------------------------------------------
@@ -780,6 +832,11 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex
     desc.deviceAllocator = acx.devAllocator;
     desc.pinnedAllocator = nullptr;
 
+    if( cx.cfg.hybrid64Mode )
+    {
+        desc.pinnedAllocator = acx.pinnedAllocator;
+    }
+
     auto&        s3        = cx.phase3->step3;
     const size_t alignment = acx.alignment;
 
@@ -819,6 +876,14 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex
 
     s3.devCTable           = acx.devAllocator->AllocT<FSE_CTable>( P3_MAX_CTABLE_SIZE, alignment );
     s3.devParkOverrunCount = acx.devAllocator->CAlloc<uint32>( 1 );
+
+    if( !acx.dryRun && cx.cfg.hybrid64Mode )
+    {
+        s3.lpIn   .AssignDiskBuffer( cx.diskContext->phase3.lpBuffer );
+        s3.indexIn.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
+
+        s3.mapOut.AssignDiskBuffer( cx.diskContext->phase3.lMapBuffer );
+    }
 }
 
 
@@ -972,7 +1037,6 @@ void DbgValidateIndices( CudaK32PlotContext& cx )
 
     const uint32* reader       = p3.hostIndices;
     const size_t  readerStride = P3_PRUNED_SLICE_MAX * 3;
-
     uint64 entryCount = 0;
 
     for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
@@ -981,7 +1045,15 @@ void DbgValidateIndices( CudaK32PlotContext& cx )
         {
             const uint32 copyCount = s2.prunedBucketSlices[bucket][slice];
 
-            bbmemcpy_t( idxWriter, reader, copyCount );
+            if( cx.cfg.hybrid64Mode )
+            {
+                cx.diskContext->phase3.indexBuffer->ReadNextBucket();
+                const auto readBucket = cx.diskContext->phase3.indexBuffer->GetNextReadBufferAs<uint32>();
+
+                bbmemcpy_t( idxWriter, readBucket.Ptr(), readBucket.Length() );
+            }
+            else
+                bbmemcpy_t( idxWriter, reader, copyCount );
 
             idxWriter += copyCount;
             entryCount += copyCount;
@@ -989,6 +1061,12 @@ void DbgValidateIndices( CudaK32PlotContext& cx )
         }
     }
 
+    if( cx.cfg.hybrid64Mode )
+    {
+        cx.diskContext->phase3.indexBuffer->Swap();
+        cx.diskContext->phase3.indexBuffer->Swap();
+    }
+
     ASSERT( entryCount == p3.prunedTableEntryCounts[(int)cx.table] );
 
     RadixSort256::Sort<BB_MAX_JOBS>( pool, indices, idxTmp, entryCount );
diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu
index 3f12dd05..fae090d7 100644
--- a/cuda/CudaPlotPhase3Step3.cu
+++ b/cuda/CudaPlotPhase3Step3.cu
@@ -318,6 +318,10 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
     s3.lpIn   .Reset();
     s3.indexIn.Reset();
 
+    if( cx.cfg.hybrid64Mode )
+    {
+        cx.diskContext->phase3.lMapBuffer->Swap();
+    }
 
 
     // #if _DEBUG
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index 3e6700b7..7cc3e2e2 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -318,6 +318,7 @@ void CudaK32Plotter::Run( const PlotRequest& req )
     cx.plotWriter = nullptr;
 
     // Delete any temporary files
+    #if !(DBG_BBCU_KEEP_TEMP_FILES)
     if( cx.plotRequest.IsFinalPlot && cx.cfg.hybrid128Mode )
     {
         if( cx.diskContext->metaBuffer ) delete cx.diskContext->metaBuffer;
@@ -329,6 +330,7 @@ void CudaK32Plotter::Run( const PlotRequest& req )
             if( cx.diskContext->tablesR[(int)t] ) delete cx.diskContext->tablesR[(int)t];
         }
     }
+    #endif
 }
 
 //-----------------------------------------------------------
diff --git a/src/tools/PlotComparer.cpp b/src/tools/PlotComparer.cpp
index f275d980..8b8ea453 100644
--- a/src/tools/PlotComparer.cpp
+++ b/src/tools/PlotComparer.cpp
@@ -105,7 +105,7 @@ void PlotCompareMain( GlobalPlotConfig& gCfg, CliParser& cli )
     // TestTable( refPlot, tgtPlot, TableId::Table7 );
     // TestTable( refPlot, tgtPlot, TableId::Table3 );
 
-    TestC3Table( refPlot, tgtPlot );
+    // TestC3Table( refPlot, tgtPlot );
 
     for( TableId table = TableId::Table1; table <= TableId::Table7; table++ )
         TestTable( refPlot, tgtPlot, table );

From 16e1310a3bf510eca0ba20a3b15290ab2352808d Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 23 Aug 2023 23:14:17 -0400
Subject: [PATCH 09/26] Fixed Phase3 disk issues for 16G

---
 cuda/CudaPlotContext.h          | 29 +++++-----
 cuda/CudaPlotPhase3.cu          | 93 ++++++++++++++++++++++-----------
 cuda/CudaPlotPhase3Step2.cu     | 12 ++++-
 cuda/CudaPlotPhase3Step3.cu     |  3 +-
 cuda/CudaPlotter.cu             | 32 ++++++------
 src/plotting/DiskBucketBuffer.h |  2 +-
 src/tools/PlotComparer.cpp      |  2 +-
 7 files changed, 109 insertions(+), 64 deletions(-)

diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h
index eadd5fca..2e8323f9 100644
--- a/cuda/CudaPlotContext.h
+++ b/cuda/CudaPlotContext.h
@@ -44,6 +44,17 @@ struct CudaK32ParkContext
 
 struct CudaK32HybridMode
 {
+    // For clarity, these are the file names for the disk buffers
+    // whose disk space will be shared for temp data in both phase 1 and phase 3.
+    // The name indicates their usage and in which phase.
+    static constexpr std::string_view Y_DISK_BUFFER_FILE_NAME      = "p1y-p3index.tmp";
+    static constexpr std::string_view META_DISK_BUFFER_FILE_NAME   = "p1meta-p3rmap.tmp";
+    static constexpr std::string_view LPAIRS_DISK_BUFFER_FILE_NAME = "p1unsortedx-p1lpairs-p3lp-p3-lmap.tmp";
+
+    static constexpr std::string_view P3_RMAP_DISK_BUFFER_FILE_NAME        = META_DISK_BUFFER_FILE_NAME;
+    static constexpr std::string_view P3_INDEX_DISK_BUFFER_FILE_NAME       = Y_DISK_BUFFER_FILE_NAME;
+    static constexpr std::string_view P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME = LPAIRS_DISK_BUFFER_FILE_NAME;
+
     DiskQueue*  temp1Queue;  // Tables Queue
     DiskQueue*  temp2Queue;  // Metadata Queue (could be the same as temp1Queue)
 
@@ -60,19 +71,11 @@ struct CudaK32HybridMode
 
     struct
     {
-        // #NOTE: These are an alias to the unsortedL buffer from phase 1.
-        //        The same file & disk buffer is repurposed for this usage.
-        union {
-            DiskBucketBuffer* lpBuffer;
-            DiskBucketBuffer* lMapBuffer;
-        };
-
-        // #NOTE: These are an alias to metaBuffer from phase 1.
-        //        The same file & disk buffer is repurposed for phase 3.
-        union {
-            DiskBucketBuffer* indexBuffer;
-            DiskBucketBuffer* rMapBuffer;
-        };
+        // #NOTE: These buffers shared the same file-backed storage as
+        //        with other buffers in phase 1.
+        DiskBucketBuffer* rMapBuffer;           // Step 1
+        DiskBucketBuffer* indexBuffer;          // X-step/Step 2
+        DiskBucketBuffer* lpAndLMapBuffer;      // X-step/Step 2 (LP) | Step 3 (LMap)
 
     } phase3;
 };
diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu
index 19c2c592..e08e6c1c 100644
--- a/cuda/CudaPlotPhase3.cu
+++ b/cuda/CudaPlotPhase3.cu
@@ -237,8 +237,9 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx )
 
     if( cx.cfg.hybrid64Mode )
     {
-        cx.diskContext->phase3.lMapBuffer->Swap();
         cx.diskContext->phase3.rMapBuffer->Swap();
+        cx.diskContext->phase3.indexBuffer->Swap();
+        cx.diskContext->phase3.lpAndLMapBuffer->Swap();
     }
 
 
@@ -292,7 +293,7 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx )
         Log::Line( "Compressing tables %u and %u...", (uint)rTable, (uint)rTable+1 );
 
         cx.table = rTable;
-        
+
         #if BBCU_DBG_SKIP_PHASE_2
             if( rTable < TableId::Table7 )
                 DbgLoadTablePairs( cx, rTable+1, false );
@@ -386,7 +387,6 @@ void Step1( CudaK32PlotContext& cx )
     p3.pairsLoadOffset = 0;
     LoadBucket( cx, 0 );
 
-
     ///
     /// Process buckets
     ///
@@ -432,7 +432,7 @@ void Step1( CudaK32PlotContext& cx )
         s1.rMapOut.Download2DT<RMap>( p3.hostRMap + (size_t)bucket * P3_PRUNED_SLICE_MAX,
             P3_PRUNED_SLICE_MAX, BBCU_BUCKET_COUNT, P3_PRUNED_BUCKET_MAX, P3_PRUNED_SLICE_MAX, cx.computeStream );
     }
-    
+
     // Download slice counts
     cudaStream_t downloadStream = s1.rMapOut.GetQueue()->GetStream();
 
@@ -467,6 +467,11 @@ void Step1( CudaK32PlotContext& cx )
             p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i];
     }
 
+    if( cx.cfg.hybrid64Mode )
+    {
+        cx.diskContext->phase3.rMapBuffer->Swap();
+    }
+
     // #if _DEBUG
     //     DbgValidateRMap( cx );
     // #endif
@@ -608,13 +613,13 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
 
     if( cx.cfg.hybrid64Mode )
     {
-        cx.diskContext->phase3.lpBuffer->Swap();
+        cx.diskContext->phase3.lpAndLMapBuffer->Swap();
         cx.diskContext->phase3.indexBuffer->Swap();
     }
 
 // #if _DEBUG
-//     // DbgValidateIndices( cx );
-//     DbgValidateStep2Output( cx );
+//     DbgValidateIndices( cx );
+//     // DbgValidateStep2Output( cx );
 //     // DbgDumpSortedLinePoints( cx );
 // #endif
 }
@@ -626,6 +631,8 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
 //-----------------------------------------------------------
 void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
 {
+    static_assert( sizeof( LMap ) == sizeof( uint64 ) );
+
     auto& p3 = *cx.phase3;
 
     // Shared allocations
@@ -633,15 +640,30 @@ void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocConte
     p3.devPrunedEntryCount = acx.devAllocator->CAlloc<uint32>( 1, acx.alignment );
 
     // Host allocations
-    p3.hostRMap            = acx.hostTempAllocator->CAlloc<RMap>( BBCU_TABLE_ALLOC_ENTRY_COUNT );     // Used for rMap and index
-    p3.hostLinePoints      = acx.hostTempAllocator->CAlloc<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );   // Used for lMap and LPs
-
-    
-    if( cx.cfg.hybrid64Mode )
+    if( !cx.cfg.hybrid64Mode )
     {
-        // Re-purpose these disk buffers for our use
-        cx.diskContext->phase3.rMapBuffer = cx.diskContext->metaBuffer;
-        cx.diskContext->phase3.lMapBuffer = cx.diskContext->unsortedL;
+        p3.hostRMap       = acx.hostTempAllocator->CAlloc<RMap>( BBCU_TABLE_ALLOC_ENTRY_COUNT );     // Used for rMap and index
+        p3.hostLinePoints = acx.hostTempAllocator->CAlloc<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );   // Used for lMap and LPs
+    }
+    else if( !cx.diskContext->phase3.rMapBuffer )
+    {
+        const size_t RMAP_SLICE_SIZE        = sizeof( RMap )   * P3_PRUNED_SLICE_MAX;
+        const size_t INDEX_SLICE_SIZE       = sizeof( uint32 ) * P3_PRUNED_SLICE_MAX;
+        const size_t LP_AND_LMAP_SLICE_SIZE = sizeof( uint64 ) * P3_PRUNED_SLICE_MAX;
+
+        const FileFlags TMP2_QUEUE_FILE_FLAGS = cx.cfg.temp2DirectIO ? FileFlags::NoBuffering | FileFlags::LargeFile : FileFlags::LargeFile;
+
+        cx.diskContext->phase3.rMapBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_RMAP_DISK_BUFFER_FILE_NAME.data(), 
+                                            BBCU_BUCKET_COUNT, RMAP_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS );
+        FatalIf( !cx.diskContext->phase3.rMapBuffer, "Failed to create R Map disk buffer." );
+
+        cx.diskContext->phase3.indexBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_INDEX_DISK_BUFFER_FILE_NAME.data(), 
+                                            BBCU_BUCKET_COUNT, INDEX_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS );
+        FatalIf( !cx.diskContext->phase3.indexBuffer, "Failed to create index disk buffer." );
+
+        cx.diskContext->phase3.lpAndLMapBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::P3_LP_AND_LMAP_DISK_BUFFER_FILE_NAME.data(), 
+                                            BBCU_BUCKET_COUNT, RMAP_SLICE_SIZE, FileMode::OpenOrCreate, FileAccess::ReadWrite, TMP2_QUEUE_FILE_FLAGS );
+        FatalIf( !cx.diskContext->phase3.lpAndLMapBuffer, "Failed to create LP/LMap disk buffer." );
     }
 
     #if _DEBUG
@@ -742,7 +764,7 @@ void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
 
     if( !acx.dryRun && cx.cfg.hybrid64Mode )
     {
-        tx.lpOut   .AssignDiskBuffer( cx.diskContext->phase3.lpBuffer );
+        tx.lpOut   .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
         tx.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
     }
 }
@@ -814,9 +836,9 @@ void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContex
     if( !acx.dryRun && cx.cfg.hybrid64Mode )
     {
         s2.rMapIn.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer );
-        s2.lMapIn.AssignDiskBuffer( cx.diskContext->phase3.lMapBuffer );
+        s2.lMapIn.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
 
-        s2.lpOut   .AssignDiskBuffer( cx.diskContext->phase3.lpBuffer );
+        s2.lpOut   .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
         s2.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
     }
 }
@@ -879,10 +901,10 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex
 
     if( !acx.dryRun && cx.cfg.hybrid64Mode )
     {
-        s3.lpIn   .AssignDiskBuffer( cx.diskContext->phase3.lpBuffer );
+        s3.lpIn   .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
         s3.indexIn.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
 
-        s3.mapOut.AssignDiskBuffer( cx.diskContext->phase3.lMapBuffer );
+        s3.mapOut.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
     }
 }
 
@@ -1041,23 +1063,32 @@ void DbgValidateIndices( CudaK32PlotContext& cx )
 
     for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
     {
-        for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
+        if( cx.cfg.hybrid64Mode )
         {
-            const uint32 copyCount = s2.prunedBucketSlices[bucket][slice];
+            const uint32* sizeSlices = &s2.prunedBucketSlices[0][bucket];
+
+            cx.diskContext->phase3.indexBuffer->OverrideReadSlices( bucket, sizeof( uint32 ), sizeSlices, BBCU_BUCKET_COUNT );
+            cx.diskContext->phase3.indexBuffer->ReadNextBucket();
+            const auto readBucket = cx.diskContext->phase3.indexBuffer->GetNextReadBufferAs<uint32>();
+            ASSERT( readBucket.Length() == p3.prunedBucketCounts[(int)cx.table][bucket] );
+
+            bbmemcpy_t( idxWriter, readBucket.Ptr(), readBucket.Length() );
 
-            if( cx.cfg.hybrid64Mode )
+            idxWriter  += readBucket.Length();
+            entryCount += readBucket.Length();
+        }
+        else
+        {
+            for( uint32 slice = 0; slice < BBCU_BUCKET_COUNT; slice++ )
             {
-                cx.diskContext->phase3.indexBuffer->ReadNextBucket();
-                const auto readBucket = cx.diskContext->phase3.indexBuffer->GetNextReadBufferAs<uint32>();
+                const uint32 copyCount = s2.prunedBucketSlices[slice][bucket];
 
-                bbmemcpy_t( idxWriter, readBucket.Ptr(), readBucket.Length() );
-            }
-            else
                 bbmemcpy_t( idxWriter, reader, copyCount );
 
-            idxWriter += copyCount;
-            entryCount += copyCount;
-            reader += readerStride;
+                idxWriter  += copyCount;
+                entryCount += copyCount;
+                reader     += readerStride;
+            }
         }
     }
 
diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu
index 0a7bc0f1..4c201606 100644
--- a/cuda/CudaPlotPhase3Step2.cu
+++ b/cuda/CudaPlotPhase3Step2.cu
@@ -369,6 +369,13 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
         ASSERT( p3.prunedBucketCounts[(int)rTable][bucket] <= P3_PRUNED_BUCKET_MAX );
     }
 
+    if( cx.cfg.hybrid64Mode )
+    {
+        cx.diskContext->phase3.rMapBuffer->Swap();
+        cx.diskContext->phase3.lpAndLMapBuffer->Swap();
+        cx.diskContext->phase3.indexBuffer->Swap();
+    }
+
     // #if _DEBUG
     // // if( cx.table > TableId::Table3 )
     // {
@@ -418,7 +425,10 @@ void WritePark7( CudaK32PlotContext& cx )
     constexpr size_t maxParksPerBucket = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
     static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= maxParksPerBucket * parkSize );
 
-
+if( cx.cfg.hybrid64Mode )
+{
+    Fatal( "Park 7 Serialization still unimplemented for 16G Mode." );
+}
     // Host stuff
     constexpr size_t hostMetaTableSize = sizeof( RMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
     StackAllocator hostAllocator( p3.hostRMap, hostMetaTableSize );
diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu
index fae090d7..5e1fc46d 100644
--- a/cuda/CudaPlotPhase3Step3.cu
+++ b/cuda/CudaPlotPhase3Step3.cu
@@ -320,7 +320,8 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
 
     if( cx.cfg.hybrid64Mode )
     {
-        cx.diskContext->phase3.lMapBuffer->Swap();
+        cx.diskContext->phase3.lpAndLMapBuffer->Swap();
+        cx.diskContext->phase3.indexBuffer->Swap();
     }
 
 
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index 7cc3e2e2..7e3f34d1 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -1247,7 +1247,7 @@ void AllocBuffers( CudaK32PlotContext& cx )
         acx.devAllocator       = &devAllocator;
 
         AllocateP1Buffers( cx, acx );
-
+Log::Line( "P1: p: %llu MiB | t: %llu MiB", pinnedAllocator.Size() BtoMB, hostTempAllocator.Size() BtoMB );
         cx.pinnedAllocSize    = pinnedAllocator   .Size();
         cx.hostTableAllocSize = hostTableAllocator.Size();
         cx.hostTempAllocSize  = hostTempAllocator .Size();
@@ -1260,7 +1260,7 @@ void AllocBuffers( CudaK32PlotContext& cx )
         devAllocator       = {};
 
         CudaK32PlotPhase2AllocateBuffers( cx, acx );
-
+Log::Line( "P2: p: %llu MiB | t: %llu MiB", pinnedAllocator.Size() BtoMB, hostTempAllocator.Size() BtoMB );
         cx.pinnedAllocSize    = std::max( cx.pinnedAllocSize   , pinnedAllocator   .Size() );
         cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
         cx.hostTempAllocSize  = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
@@ -1273,7 +1273,7 @@ void AllocBuffers( CudaK32PlotContext& cx )
         devAllocator       = {};
 
         CudaK32PlotPhase3AllocateBuffers( cx, acx );
-
+Log::Line( "P3: p: %llu MiB | t: %llu MiB", pinnedAllocator.Size() BtoMB, hostTempAllocator.Size() BtoMB );
         cx.pinnedAllocSize    = std::max( cx.pinnedAllocSize   , pinnedAllocator   .Size() );
         cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
         cx.hostTempAllocSize  = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
@@ -1289,8 +1289,8 @@ void AllocBuffers( CudaK32PlotContext& cx )
     }
 
 
-    size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize + parksPinnedSize;
-    size_t totalHostSize   = cx.hostTableAllocSize + totalPinnedSize;
+    const size_t totalPinnedSize = cx.pinnedAllocSize + cx.hostTempAllocSize + parksPinnedSize;
+    const size_t totalHostSize   = cx.hostTableAllocSize + totalPinnedSize;
     Log::Line( "Kernel RAM required       : %-12llu bytes ( %-9.2lf MiB or %-6.2lf GiB )", totalPinnedSize,
                    (double)totalPinnedSize BtoMB, (double)totalPinnedSize BtoGB );
 
@@ -1319,7 +1319,7 @@ void AllocBuffers( CudaK32PlotContext& cx )
             // On windows we always force the use of intermediate buffers, so we allocate on the host
             allocateHostTablesPinned = true;
         #endif
-        
+
         Log::Line( "Table pairs allocated as pinned: %s", allocateHostTablesPinned ? "true" : "false" );
         if( allocateHostTablesPinned )
             CudaErrCheck( cudaMallocHost( &cx.hostBufferTables, cx.hostTableAllocSize, cudaHostAllocDefault ) );
@@ -1414,13 +1414,13 @@ void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
             const size_t ySliceSize    = sizeof( uint32 ) * BBCU_MAX_SLICE_ENTRY_COUNT;
             const size_t metaSliceSize = sizeof( uint32 ) * BBCU_META_SLICE_ENTRY_COUNT;
 
-            cx.diskContext->yBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "y.tmp", 
+            cx.diskContext->yBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::Y_DISK_BUFFER_FILE_NAME.data(), 
                                             BBCU_BUCKET_COUNT, ySliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags );
-            FatalIf( !cx.diskContext->yBuffer, "Failed to create y.tmp disk buffer." );
+            FatalIf( !cx.diskContext->yBuffer, "Failed to create y disk buffer." );
 
-            cx.diskContext->metaBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "metadata.tmp", 
+            cx.diskContext->metaBuffer = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::META_DISK_BUFFER_FILE_NAME.data(), 
                                             BBCU_BUCKET_COUNT, metaSliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags );
-            FatalIf( !cx.diskContext->metaBuffer, "Failed to create metadata.tmp disk buffer." );
+            FatalIf( !cx.diskContext->metaBuffer, "Failed to create metadata disk buffer." );
         }
 Log::Line( "Host Temp @ %llu GiB", (llu)acx.hostTempAllocator->Size() BtoGB );
 Log::Line( "Host Tables B @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB );
@@ -1501,15 +1501,15 @@ Log::Line( "Host Tables B @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
 
             // When storing unsorted inlined x's, we don't have enough space in RAM, store i disk instead.
             const size_t xSliceSize = BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( Pair );
-            cx.diskContext->unsortedL = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "unsorted_l.tmp", 
-                                                                   BBCU_BUCKET_COUNT, xSliceSize, fileMode, FileAccess::ReadWrite, tmp2FileFlags );
-            FatalIf( !cx.diskContext->unsortedL, "Failed to create unsorted_l.tmp disk buffer." );
+            cx.diskContext->unsortedL = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, CudaK32HybridMode::LPAIRS_DISK_BUFFER_FILE_NAME.data(), 
+                                                                   BBCU_BUCKET_COUNT, xSliceSize, FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags );
+            FatalIf( !cx.diskContext->unsortedL, "Failed to create unsorted L disk buffer." );
 
             if( cx.cfg.hybrid64Mode )
             {
-                cx.diskContext->unsortedR = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "unsorted_r.tmp", 
-                                                                    BBCU_BUCKET_COUNT, BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( uint16 ), fileMode, FileAccess::ReadWrite, tmp2FileFlags );
-                FatalIf( !cx.diskContext->unsortedR, "Failed to create unsorted_r.tmp disk buffer." );
+                cx.diskContext->unsortedR = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "p1unsorted_r.tmp", 
+                                                                    BBCU_BUCKET_COUNT, BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( uint16 ), FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags );
+                FatalIf( !cx.diskContext->unsortedR, "Failed to create unsorted R disk buffer." );
             }
             else
             {
diff --git a/src/plotting/DiskBucketBuffer.h b/src/plotting/DiskBucketBuffer.h
index 763ad75b..ec50cc8b 100644
--- a/src/plotting/DiskBucketBuffer.h
+++ b/src/plotting/DiskBucketBuffer.h
@@ -87,7 +87,7 @@ class DiskBucketBuffer : public DiskBufferBase
 
 private:
     size_t _sliceCapacity;         // Maximum size of each slice
-    
+
     bool   _verticalWrite = false;
     // size_t _writeSliceStride;      // Offset to the start of the next slices when writing
     // size_t _readSliceStride;       // Offset to the start of the next slice when reading (these are swapped between tables).
diff --git a/src/tools/PlotComparer.cpp b/src/tools/PlotComparer.cpp
index 8b8ea453..f275d980 100644
--- a/src/tools/PlotComparer.cpp
+++ b/src/tools/PlotComparer.cpp
@@ -105,7 +105,7 @@ void PlotCompareMain( GlobalPlotConfig& gCfg, CliParser& cli )
     // TestTable( refPlot, tgtPlot, TableId::Table7 );
     // TestTable( refPlot, tgtPlot, TableId::Table3 );
 
-    // TestC3Table( refPlot, tgtPlot );
+    TestC3Table( refPlot, tgtPlot );
 
     for( TableId table = TableId::Table1; table <= TableId::Table7; table++ )
         TestTable( refPlot, tgtPlot, table );

From e9efc94bb78a43e38aadbb67550ed0dc1c415f59 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 23 Aug 2023 23:18:08 -0400
Subject: [PATCH 10/26] Fixed correct buffer usage in GPU stream with disk

---
 cuda/GpuDownloadStream.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cuda/GpuDownloadStream.cu b/cuda/GpuDownloadStream.cu
index 4baca8fc..3d06973c 100644
--- a/cuda/GpuDownloadStream.cu
+++ b/cuda/GpuDownloadStream.cu
@@ -148,6 +148,8 @@ void GpuDownloadBuffer::PerformDownload2D( void* hostBuffer, size_t width, size_
         CallHostFunctionOnStream( downloadStream, [this](){
             self->diskBuffer->GetNextWriteBuffer();
         });
+
+        pinnedBuffer = self->diskBuffer->PeekWriteBufferForBucket( self->outgoingSequence-1 ); 
     }
 
     if( !isDirect )

From 4d30e1da1d75af0191887b3ce00bdb9ad039bf6e Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 23 Aug 2023 23:21:52 -0400
Subject: [PATCH 11/26] Some extra guards for disk buffers

---
 src/plotting/DiskBufferBase.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/plotting/DiskBufferBase.cpp b/src/plotting/DiskBufferBase.cpp
index 3faed0fe..38a26270 100644
--- a/src/plotting/DiskBufferBase.cpp
+++ b/src/plotting/DiskBufferBase.cpp
@@ -114,10 +114,11 @@ void DiskBufferBase::Swap()
 
 void* DiskBufferBase::GetNextWriteBuffer()
 {
-    FatalIf( (int64)_nextWriteLock - (int64)_nextWriteBucket >= 2, "Invalid write buffer lock for '%s'.", _name.c_str() );
+    PanicIf( _nextWriteLock >= _bucketCount, "Write bucket overflow." );
+    PanicIf( (int64)_nextWriteLock - (int64)_nextWriteBucket >= 2, "Invalid write buffer lock for '%s'.", _name.c_str() );
 
     void* buf = _writeBuffers[_nextWriteLock % 2];
-    FatalIf( !buf, "No write buffer reserved for '%s'.", _name.c_str() );
+    PanicIf( !buf, "No write buffer reserved for '%s'.", _name.c_str() );
 
     if( _nextWriteLock++ >= 2 )
         WaitForWriteToComplete( _nextWriteLock-2 );
@@ -127,11 +128,13 @@ void* DiskBufferBase::GetNextWriteBuffer()
 
 void* DiskBufferBase::PeekReadBufferForBucket( uint32 bucket )
 {
+    PanicIf( _nextReadLock >= _bucketCount, "Read bucket overflow." );
     return _readBuffers[bucket % 2];
 }
 
 void* DiskBufferBase::PeekWriteBufferForBucket( const uint32 bucket )
 {
+    PanicIf( _nextWriteLock >= _bucketCount, "Write bucket overflow." );
     return _writeBuffers[bucket % 2];
 }
 
@@ -150,10 +153,11 @@ void DiskBufferBase::WaitForLastWriteToComplete()
 
 void* DiskBufferBase::GetNextReadBuffer()
 {
-    FatalIf( _nextReadLock >= _nextReadBucket, "Invalid read buffer lock for '%s'.", _name.c_str() );
+    PanicIf( _nextReadLock >= _bucketCount, "Read bucket overflow." );
+    PanicIf( _nextReadLock >= _nextReadBucket, "Invalid read buffer lock for '%s'.", _name.c_str() );
 
     void* buf = _readBuffers[_nextReadLock % 2];
-    FatalIf( !buf, "No read buffer reserved for '%s'.", _name.c_str() );
+    PanicIf( !buf, "No read buffer reserved for '%s'.", _name.c_str() );
 
     WaitForReadToComplete( _nextReadLock++ );
     return buf;

From 74c5640caee0fed1e59262fac129cc98ddcb12d4 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Thu, 24 Aug 2023 01:23:41 -0400
Subject: [PATCH 12/26] Re-configure buffers for 128G windows

---
 cuda/CudaPlotter.cu | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index 7e3f34d1..5058d132 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -1317,7 +1317,7 @@ Log::Line( "P3: p: %llu MiB | t: %llu MiB", pinnedAllocator.Size() BtoMB, hostTe
         bool allocateHostTablesPinned = cx.downloadDirect;
         #if _WIN32
             // On windows we always force the use of intermediate buffers, so we allocate on the host
-            allocateHostTablesPinned = true;
+            allocateHostTablesPinned = false;
         #endif
 
         Log::Line( "Table pairs allocated as pinned: %s", allocateHostTablesPinned ? "true" : "false" );
@@ -1328,7 +1328,7 @@ Log::Line( "P3: p: %llu MiB | t: %llu MiB", pinnedAllocator.Size() BtoMB, hostTe
     #endif
 
     cx.hostBufferTemp = nullptr;
-    #if _DEBUG
+    #if _DEBUG || _WIN32
         if( cx.hostTempAllocSize )
             cx.hostBufferTemp = bbvirtallocboundednuma<byte>( cx.hostTempAllocSize );
     #endif
@@ -1564,7 +1564,11 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
         if( !cx.downloadDirect )
         {
             // Use intermediate pinned buffer for transfers to non-pinned destinations
-            descTablePairs.pinnedAllocator = acx.pinnedAllocator;
+            yDesc.pinnedAllocator                = acx.pinnedAllocator;
+            descTablePairs.pinnedAllocator       = acx.pinnedAllocator;
+            descTableSortedPairs.pinnedAllocator = acx.pinnedAllocator;
+            descXPairs.pinnedAllocator           = acx.pinnedAllocator;
+            descMeta.pinnedAllocator             = acx.pinnedAllocator;
         }
 
 

From 20af47e45675b0ce7dc59a1bf455091d69a9763d Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Thu, 24 Aug 2023 23:22:04 -0400
Subject: [PATCH 13/26] Add P7 seriealization for 16G mode Fix BufferChain bug
 where it did not wait for the last buffer properly.

---
 cuda/CudaPlotContext.h        |  1 +
 cuda/CudaPlotPhase3.cu        | 10 ++++++
 cuda/CudaPlotPhase3Internal.h | 19 +++-------
 cuda/CudaPlotPhase3Step2.cu   | 67 ++++++++++++++++++++++++++---------
 src/plotting/BufferChain.cpp  | 11 ++++--
 src/plotting/BufferChain.h    |  2 +-
 6 files changed, 75 insertions(+), 35 deletions(-)

diff --git a/cuda/CudaPlotContext.h b/cuda/CudaPlotContext.h
index 2e8323f9..4d546480 100644
--- a/cuda/CudaPlotContext.h
+++ b/cuda/CudaPlotContext.h
@@ -158,6 +158,7 @@ struct CudaK32Phase3
         GpuUploadBuffer   lMapIn;       // Output map (uint64) from the previous table run. Or, when L table is the first stored table, it is inlined x values
         GpuDownloadBuffer lpOut;        // Output line points (uint64)
         GpuDownloadBuffer indexOut;     // Output source line point index (uint32) (taken from the rMap source value)
+        GpuDownloadBuffer parksOut;     // Output P7 parks on the last table
         uint32*           devLTable[2]; // Unpacked L table bucket
 
         uint32 prunedBucketSlices[BBCU_BUCKET_COUNT][BBCU_BUCKET_COUNT];
diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu
index e08e6c1c..ea8840b8 100644
--- a/cuda/CudaPlotPhase3.cu
+++ b/cuda/CudaPlotPhase3.cu
@@ -830,6 +830,16 @@ void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContex
     s2.lpOut    = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint64>( desc, acx.dryRun );
     s2.indexOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32> (desc, acx.dryRun );
 
+
+    const size_t devParkAllocSize = P3_PARK_7_SIZE * P3_MAX_P7_PARKS_PER_BUCKET;
+
+    GpuStreamDescriptor parksDesc = desc;
+    parksDesc.sliceCount      = 1;
+    parksDesc.entriesPerSlice = devParkAllocSize;
+    parksDesc.sliceAlignment  = RoundUpToNextBoundaryT<size_t>( P3_PARK_7_SIZE, sizeof( uint64 ) );
+
+    s2.parksOut = cx.gpuDownloadStream[0]->CreateDownloadBufferT<byte>( parksDesc, acx.dryRun );
+
     s2.devLTable[0] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
     s2.devLTable[1] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
 
diff --git a/cuda/CudaPlotPhase3Internal.h b/cuda/CudaPlotPhase3Internal.h
index feb93a6f..34909123 100644
--- a/cuda/CudaPlotPhase3Internal.h
+++ b/cuda/CudaPlotPhase3Internal.h
@@ -37,22 +37,11 @@ static_assert( alignof( LMap ) == sizeof( uint32 ) );
 #define P3_PRUNED_TABLE_MAX_ENTRIES     BBCU_TABLE_ALLOC_ENTRY_COUNT   //(P3_PRUNED_BUCKET_MAX*BBCU_BUCKET_COUNT)
 #define P3_PRUNED_MAX_PARKS_PER_BUCKET  ((P3_PRUNED_BUCKET_MAX/kEntriesPerPark)+2)
 
-static constexpr size_t P3_MAX_CTABLE_SIZE = 38u * 1024u;  // Should be more than enough
 
-//static constexpr size_t P3_LP_BUCKET_COUNT       = BBCU_BUCKET_COUNT;// << 1;
-//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT  = BBCU_MAX_SLICE_ENTRY_COUNT;
-//static constexpr uint32 P3_LP_BUCKET_BITS        = BBC_BUCKET_BITS;
-
-// static constexpr uint32 P3_LP_BUCKET_BITS        = (uint32)(CuBBLog2( P3_LP_BUCKET_COUNT ));
-//static constexpr size_t P3_LP_SLICE_ENTRY_COUNT  = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ),
-                                                     //BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE );
-// static constexpr size_t P3_LP_BUCKET_ENTRY_COUNT = P3_LP_SLICE_ENTRY_COUNT * P3_LP_BUCKET_COUNT;
-
-//static constexpr size_t P3_LP_BUCKET_STRIDE      = BBCU_BUCKET_ALLOC_ENTRY_COUNT;
-
-// static constexpr size_t P3_LP_BUCKET_ALLOC_COUNT = ( CuCDiv( (size_t)( ( BBCU_TABLE_ENTRY_COUNT / P3_LP_BUCKET_COUNT / P3_LP_BUCKET_COUNT ) * P3_LP_BUCKET_MULTIPLER ),
-//                                                     BBCU_XTRA_ENTRIES_PER_SLICE ) * BBCU_XTRA_ENTRIES_PER_SLICE + BBCU_XTRA_ENTRIES_PER_SLICE );
-// //static constexpr size_t P3_LP_TABLE_ALLOC_COUNT  = P3_LP_BUCKET_STRIDE * BBCU_BUCKET_COUNT;
+static constexpr size_t P3_MAX_CTABLE_SIZE         = 38u * 1024u;  // Should be more than enough
+static constexpr size_t P3_MAX_P7_PARKS_PER_BUCKET = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
+static constexpr size_t P3_PARK_7_SIZE             = CalculatePark7Size( BBCU_K );
+static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= P3_MAX_P7_PARKS_PER_BUCKET * P3_PARK_7_SIZE );
 
 static constexpr size_t MAX_PARK_SIZE            = CalculateParkSize( TableId::Table1 );
 static constexpr size_t DEV_MAX_PARK_SIZE        = CuCDiv( MAX_PARK_SIZE, sizeof( uint64 ) ) * sizeof( uint64 );   // Align parks to 64 bits, for easier writing of stubs
diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu
index 4c201606..9d2d8991 100644
--- a/cuda/CudaPlotPhase3Step2.cu
+++ b/cuda/CudaPlotPhase3Step2.cu
@@ -408,27 +408,27 @@ void WritePark7( CudaK32PlotContext& cx )
     auto& p3 = *cx.phase3;
     auto& s2 = p3.step2;
 
-    
+
     // Load initial bucket
     LoadBucket( cx, 0 );
 
     // Begin park 7 table in plot
     cx.plotWriter->BeginTable( PlotTable::Table7 );
 
-    constexpr size_t parkSize       = CalculatePark7Size( BBCU_K );
+    constexpr size_t parkSize       = P3_PARK_7_SIZE;
     constexpr size_t parkFieldCount = parkSize / sizeof( uint64 );
     static_assert( parkFieldCount * sizeof( uint64 ) == parkSize );
 
+    GpuDownloadBuffer& parkDownloader = cx.useParkContext ? s2.parksOut : s2.lpOut;
 
-    GpuDownloadBuffer& parkDownloader = s2.lpOut;
-
-    constexpr size_t maxParksPerBucket = CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
+    constexpr size_t maxParksPerBucket = P3_MAX_P7_PARKS_PER_BUCKET;
     static_assert( sizeof( uint64 ) * BBCU_BUCKET_ALLOC_ENTRY_COUNT >= maxParksPerBucket * parkSize );
 
-if( cx.cfg.hybrid64Mode )
-{
-    Fatal( "Park 7 Serialization still unimplemented for 16G Mode." );
-}
+    if( cx.useParkContext )
+    {
+        cx.parkContext->parkBufferChain->Reset();
+    }
+
     // Host stuff
     constexpr size_t hostMetaTableSize = sizeof( RMap ) * BBCU_TABLE_ALLOC_ENTRY_COUNT;
     StackAllocator hostAllocator( p3.hostRMap, hostMetaTableSize );
@@ -436,9 +436,10 @@ if( cx.cfg.hybrid64Mode )
     const uint64 tableEntryCount = cx.tableEntryCounts[(int)cx.table];
     const size_t totalParkCount  = CDiv( (size_t)tableEntryCount, kEntriesPerPark );
 
-    byte*   hostParks           = hostAllocator.AllocT<byte>( totalParkCount * parkSize );
-    byte*   hostParkWriter      = hostParks;
-    uint32* hostLastParkEntries = hostAllocator.CAlloc<uint32>( kEntriesPerPark );
+    byte*   hostParks           = cx.useParkContext ? nullptr : hostAllocator.AllocT<byte>( totalParkCount * parkSize );
+    byte*   hostParksWriter     = cx.useParkContext ? nullptr : hostParks;
+    uint32* hostLastParkEntries = cx.useParkContext ? (uint32*)cx.parkContext->hostRetainedLinePoints : 
+                                                      hostAllocator.CAlloc<uint32>( kEntriesPerPark );
 
     static_assert( kEntriesPerPark * maxParksPerBucket <= BBCU_BUCKET_ALLOC_ENTRY_COUNT * 2 );
     uint32* devIndexBuffer     = s2.devLTable[0] + kEntriesPerPark;
@@ -488,14 +489,38 @@ if( cx.cfg.hybrid64Mode )
         // Download parks & write to plot
         const size_t downloadSize = parkCount * parkSize;
 
-        parkDownloader.DownloadWithCallback( hostParkWriter, downloadSize,
+        if( cx.useParkContext )
+        {
+            ASSERT( downloadSize <= cx.parkContext->parkBufferChain->BufferSize() );
+
+            // Override the park buffer to be used when using a park context
+            hostParksWriter = cx.parkContext->parkBufferChain->PeekBuffer( bucket );
+
+            // Wait for the next park buffer to be available
+            parkDownloader.HostCallback([&cx]{
+               (void)cx.parkContext->parkBufferChain->GetNextBuffer();
+            });
+        }
+
+        parkDownloader.DownloadWithCallback( hostParksWriter, downloadSize,
               []( void* parksBuffer, size_t size, void* userData ) {
 
                 auto& cx = *reinterpret_cast<CudaK32PlotContext*>( userData );
                 cx.plotWriter->WriteTableData( parksBuffer, size );
+
+                // Release the buffer after the plot writer is done with it.
+                if( cx.useParkContext )
+                {
+                    cx.plotWriter->CallBack([&cx](){
+                        cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+                    });
+                }
+                
             }, &cx, cx.computeStream );
 
-        hostParkWriter += downloadSize;
+        hostParksWriter += downloadSize;
+        if( cx.useParkContext )
+            hostParksWriter = nullptr;
     }
 
     // Wait for parks to complete downloading
@@ -508,9 +533,19 @@ if( cx.cfg.hybrid64Mode )
     // Was there a left-over park?
     if( retainedEntryCount > 0 )
     {
+        if( cx.useParkContext )
+            hostParksWriter = cx.parkContext->parkBufferChain->GetNextBuffer();
+
         // Submit last park to plot
-        TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParkWriter );
-        cx.plotWriter->WriteTableData( hostParkWriter, parkSize );
+        TableWriter::WriteP7Parks( 1, hostLastParkEntries, hostParksWriter );
+        cx.plotWriter->WriteTableData( hostParksWriter, parkSize );
+
+        if( cx.useParkContext )
+        {
+            cx.plotWriter->CallBack([&cx](){
+                cx.parkContext->parkBufferChain->ReleaseNextBuffer();
+            });
+        }
     }
     cx.plotWriter->EndTable();
 
diff --git a/src/plotting/BufferChain.cpp b/src/plotting/BufferChain.cpp
index ca11dd59..43a7e47b 100644
--- a/src/plotting/BufferChain.cpp
+++ b/src/plotting/BufferChain.cpp
@@ -41,8 +41,8 @@ byte* BufferChain::GetNextBuffer()
 {
     const uint32 bufferCount = (uint32)_buffers.Length();
 
-    ASSERT( _nextBufferToRelease <= _nextBufferToLock );
-    ASSERT( _nextBufferToLock - _nextBufferToRelease <= bufferCount );
+    PanicIf( _nextBufferToRelease > _nextBufferToLock, "" );
+    PanicIf( _nextBufferToLock - _nextBufferToRelease > bufferCount, "" );
 
     if( _nextBufferToLock >= bufferCount )
     {
@@ -55,12 +55,17 @@ byte* BufferChain::GetNextBuffer()
 void BufferChain::ReleaseNextBuffer()
 {
     PanicIf( _nextBufferToRelease >= _nextBufferToLock, "" );
+    PanicIf(_nextBufferToLock - _nextBufferToRelease > (uint32)_buffers.Length(), "" );
+
     _fence.Signal( ++_nextBufferToRelease );
 }
 
 void BufferChain::Reset()
 {
-    GetNextBuffer();
+    // Wait for the last buffer to be released
+    _fence.Wait( _nextBufferToLock );
+
+    // Reset state
     _fence.Reset( 0 );
     _nextBufferToRelease = 0;
     _nextBufferToLock    = 0;
diff --git a/src/plotting/BufferChain.h b/src/plotting/BufferChain.h
index dabacdad..edb934a7 100644
--- a/src/plotting/BufferChain.h
+++ b/src/plotting/BufferChain.h
@@ -37,7 +37,7 @@ class BufferChain
 private:
     Fence        _fence;
     Span<byte*>  _buffers;
-    IAllocator*  _allocator = nullptr;
+    IAllocator*  _allocator           = nullptr;
     size_t       _bufferSize          = 0;  // Size of each individual buffer
     uint32       _nextBufferToLock    = 0;
     uint32       _nextBufferToRelease = 0;

From 46029fbc4508e762e1fed8db7534a6f1d06236ff Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Tue, 29 Aug 2023 01:36:58 +0000
Subject: [PATCH 14/26] Fixed overflow on park buffers with -z > 1

---
 cuda/CudaPlotter.cu          |  2 +-
 src/plotting/Compression.cpp | 16 ++++++++++++++++
 src/plotting/Compression.h   |  1 +
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index 5058d132..b3f03622 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -1693,7 +1693,7 @@ void AllocateParkSerializationBuffers( CudaK32PlotContext& cx, IAllocator& pinne
     // Get the largest park size
     const size_t maxParkSize = cx.cfg.gCfg->compressionLevel == 0 ?
                                 CalculateParkSize( TableId::Table1 ) :
-                                GetCompressionInfoForLevel( cx.cfg.gCfg->compressionLevel ).tableParkSize;
+                                GetLargestCompressedParkSize();
 
     const size_t parksPerBuffer       = CDivT<size_t>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kEntriesPerPark ) + 2;
     // CDiv( BBCU_BUCKET_ALLOC_ENTRY_COUNT, kCheckpoint1Interval ) + 1; // Need an extra park for left-over entries
diff --git a/src/plotting/Compression.cpp b/src/plotting/Compression.cpp
index 59c099d3..bde4313b 100644
--- a/src/plotting/Compression.cpp
+++ b/src/plotting/Compression.cpp
@@ -2,6 +2,7 @@
 #include "plotting/FSETableGenerator.h"
 #include "util/Util.h"
 #include <mutex>
+#include <algorithm>
 
 // Caches for C and D tables
 static std::atomic<FSE_CTable*> _cTableCache[32] = {};
@@ -140,4 +141,19 @@ uint32 GetCompressedLPBitCount( const uint32 compressionLevel )
     //     lpBitSize = lpBitSize * 2 - 1;
 
     return lpBitSize * 2 - 1;
+}
+
+size_t GetLargestCompressedParkSize()
+{
+    return std::max( {
+        GetCompressionInfoForLevel( 1 ).tableParkSize,
+        GetCompressionInfoForLevel( 2 ).tableParkSize,
+        GetCompressionInfoForLevel( 3 ).tableParkSize,
+        GetCompressionInfoForLevel( 4 ).tableParkSize,
+        GetCompressionInfoForLevel( 5 ).tableParkSize,
+        GetCompressionInfoForLevel( 6 ).tableParkSize,
+        GetCompressionInfoForLevel( 7 ).tableParkSize,
+        GetCompressionInfoForLevel( 8 ).tableParkSize,
+        GetCompressionInfoForLevel( 9 ).tableParkSize }
+    );
 }
\ No newline at end of file
diff --git a/src/plotting/Compression.h b/src/plotting/Compression.h
index c1967ed4..dbb01228 100644
--- a/src/plotting/Compression.h
+++ b/src/plotting/Compression.h
@@ -16,6 +16,7 @@ FSE_CTable*     CreateCompressionCTable( const uint32_t compressionLevel, size_t
 FSE_DTable*     CreateCompressionDTable( const uint32_t compressionLevel, size_t* outTableSize = nullptr );
 CompressionInfo GetCompressionInfoForLevel( const uint32_t compressionLevel );
 uint32_t        GetCompressedLPBitCount( const uint32_t compressionLevel );
+size_t          GetLargestCompressedParkSize();
 
 template<uint32_t level>
 struct CompressionLevelInfo 

From a7a81f329348a2f365c96cc049b9917e839d1e3b Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Tue, 29 Aug 2023 20:15:21 -0400
Subject: [PATCH 15/26] Fix macos build issue. Fix version embed not working on
 local builds. Fix raw std::string passed to formatter.

---
 CMakeLists.txt | 7 +++----
 src/main.cpp   | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5660cef0..8f72155c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,6 @@ cmake_minimum_required(VERSION 3.19 FATAL_ERROR)
 set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CUDA_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
-set(CMAKE_OSX_DEPLOYMENT_TARGET 10.16)
 
 set(CMAKE_CONFIGURATION_TYPES Release Debug)
 
@@ -19,7 +18,7 @@ if(POLICY CMP0091)
   cmake_policy(SET CMP0091 NEW) 
 endif()
 
-set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14" CACHE STRING "macOS minimum supported version.")
+set(CMAKE_OSX_DEPLOYMENT_TARGET "10.16" CACHE STRING "macOS minimum supported version.")
 set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$<CONFIG:Debug>:Debug>" CACHE STRING "MSVC Runtime Library")
 
 project(bladebit LANGUAGES C CXX ASM)
@@ -85,7 +84,7 @@ endif()
 # NOTE: These are mostly sandbox test environment, not proper tests
 option(BB_ENABLE_TESTS "Enable tests." OFF)
 option(NO_CUDA_HARVESTER "Explicitly disable CUDA in the bladebit_harvester target." OFF)
-option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." ON)
+option(BB_NO_EMBED_VERSION "Disable embedding the version when building locally (non-CI)." OFF)
 option(BB_HARVESTER_ONLY "Enable only the harvester target." OFF)
 option(BB_HARVESTER_STATIC "Build the harvester target as a static library." OFF)
 option(BB_CUDA_USE_NATIVE "Only build the native CUDA architecture when in release mode." OFF)
@@ -146,7 +145,7 @@ endif()
 include(Config.cmake)
 
 if(NOT ${BB_HARVESTER_ONLY})
-    if(NOT BB_IS_DEPENDENCY AND (NOT BB_NO_EMBED_VERSION))
+    if((NOT BB_IS_DEPENDENCY) AND (NOT BB_NO_EMBED_VERSION))
         include(cmake_modules/EmbedVersion.cmake)
     endif()
 
diff --git a/src/main.cpp b/src/main.cpp
index 3450aa79..48beebb6 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -498,7 +498,7 @@ void ParseCommandLine( GlobalPlotConfig& cfg, IPlotter*& outPlotter, int argc, c
     while( cli.HasArgs() )
     {
         outPath = cli.Arg();
-        FatalIf( outPath[0] == '-', "Unrecognized argument '%s'.", outPath );
+        FatalIf( outPath[0] == '-', "Unrecognized argument '%s'.", outPath.c_str() );
 
         // Add trailing slash?
         const char endChar = outPath.back();

From c924aef9d3a91315434b7dc3982f2cf8eeab1585 Mon Sep 17 00:00:00 2001
From: William Allen <wallentx@users.noreply.github.com>
Date: Wed, 30 Aug 2023 11:52:04 -0500
Subject: [PATCH 16/26] Adding windows build support

---
 cmake_modules/EmbedVersion.cmake | 14 ++++----
 extract-version.ps1              | 60 ++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+), 6 deletions(-)
 create mode 100644 extract-version.ps1

diff --git a/cmake_modules/EmbedVersion.cmake b/cmake_modules/EmbedVersion.cmake
index 6ec042c0..50b269fb 100644
--- a/cmake_modules/EmbedVersion.cmake
+++ b/cmake_modules/EmbedVersion.cmake
@@ -5,15 +5,17 @@ if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
     set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
 
     set(cmd_ver bash)
+    set(ext_ver sh)
     if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
-        set(cmd_ver bash.exe)
+        set(cmd_ver pwsh)
+        set(ext_ver ps1)
     endif()
 
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh major    OUTPUT_VARIABLE bb_ver_maj    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh minor    OUTPUT_VARIABLE bb_ver_min    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh revision OUTPUT_VARIABLE bb_ver_rev    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh suffix   OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.sh commit   OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} major    OUTPUT_VARIABLE bb_ver_maj    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} minor    OUTPUT_VARIABLE bb_ver_min    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} revision OUTPUT_VARIABLE bb_ver_rev    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} suffix   OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} commit   OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
 
     # Remove trailing whitespace incurred in windows gitbash
     string(STRIP "${bb_ver_maj}"    bb_ver_maj)
diff --git a/extract-version.ps1 b/extract-version.ps1
new file mode 100644
index 00000000..c26d1c70
--- /dev/null
+++ b/extract-version.ps1
@@ -0,0 +1,60 @@
+# Navigate to the script's directory
+$scriptPath = Split-Path -Path $MyInvocation.MyCommand.Definition -Parent
+Set-Location -Path $scriptPath
+
+# Arguments
+$ver_component = $args[0]  # The user-specified component from the full version
+
+# Read the version from the file
+$version_str = (Get-Content 'VERSION' | Select-Object -First 1 | Out-String).Trim()
+$bb_version_suffix = (Get-Content 'VERSION' | Select-Object -Last 1 | Out-String).Trim()
+$version_header = 'src\Version.h'
+
+if ($version_str -eq $bb_version_suffix) {
+    $bb_version_suffix = ""
+}
+
+# Prepend a '-' to the suffix, if necessary
+if (-Not [string]::IsNullOrEmpty($bb_version_suffix) -and $bb_version_suffix[0] -ne '-') {
+    $bb_version_suffix = "-$bb_version_suffix"
+}
+
+# Parse the major, minor, and revision numbers
+$bb_ver_maj, $bb_ver_min, $bb_ver_rev = $version_str -split '\.' | ForEach-Object { $_.Trim() }
+
+# Get the Git commit hash
+$bb_git_commit = $env:GITHUB_SHA
+if ([string]::IsNullOrEmpty($bb_git_commit)) {
+    $bb_git_commit = & git rev-parse HEAD
+}
+
+if ([string]::IsNullOrEmpty($bb_git_commit)) {
+    $bb_git_commit = "unknown"
+}
+
+# Check if the user wants a specific component
+if (-Not [string]::IsNullOrEmpty($ver_component)) {
+    switch ($ver_component) {
+        "major" {
+            Write-Host -NoNewline $bb_ver_maj
+        }
+        "minor" {
+            Write-Host -NoNewline $bb_ver_min
+        }
+        "revision" {
+            Write-Host -NoNewline $bb_ver_rev
+        }
+        "suffix" {
+            Write-Host -NoNewline $bb_version_suffix
+        }
+        "commit" {
+            Write-Host -NoNewline $bb_git_commit
+        }
+        default {
+            Write-Error "Invalid version component '$ver_component'"
+            exit 1
+        }
+    }
+    exit 0
+}
+

From 7079519846f40c9bee938279e5695b57a79494ec Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 30 Aug 2023 14:06:48 -0400
Subject: [PATCH 17/26] Fix harvester build

---
 Harvester.cmake | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Harvester.cmake b/Harvester.cmake
index ece2c457..c225c809 100644
--- a/Harvester.cmake
+++ b/Harvester.cmake
@@ -83,8 +83,9 @@ target_sources(bladebit_harvester PRIVATE
         cuda/CudaMatch.cu
         cuda/CudaPlotUtil.cu
 
-        # TODO: Remove this, ought not be needed in harvester
+        # TODO: Does this have to be here?
         cuda/GpuStreams.cu
+        cuda/GpuDownloadStream.cu
     >
 
     $<$<NOT:${have_cuda}>:

From cf7ed4a53be21576ab6968d2c98a55118def0a89 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 30 Aug 2023 14:21:41 -0400
Subject: [PATCH 18/26] Code cleanup

---
 cuda/CudaPlotConfig.h       |  6 ++--
 cuda/CudaPlotPhase3.cu      | 28 +++++++++----------
 cuda/CudaPlotPhase3Step2.cu |  2 +-
 cuda/CudaPlotPhase3Step3.cu |  2 +-
 cuda/CudaPlotter.cu         | 56 ++++++++++++++-----------------------
 cuda/CudaPlotter.h          |  2 +-
 cuda/chacha8.cu             |  2 +-
 7 files changed, 42 insertions(+), 56 deletions(-)

diff --git a/cuda/CudaPlotConfig.h b/cuda/CudaPlotConfig.h
index 5861d64f..b42a5d8a 100644
--- a/cuda/CudaPlotConfig.h
+++ b/cuda/CudaPlotConfig.h
@@ -48,8 +48,8 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
     // #define DBG_BBCU_REF_DIR       "/home/harold/plots/ref/"
 
 
-    #define BBCU_DBG_SKIP_PHASE_1   1   // Skip phase 1 and load pairs from disk
-    #define BBCU_DBG_SKIP_PHASE_2   1   // Skip phase 1 and 2 and load pairs and marks from disk
+    // #define BBCU_DBG_SKIP_PHASE_1   1   // Skip phase 1 and load pairs from disk
+    // #define BBCU_DBG_SKIP_PHASE_2   1   // Skip phase 1 and 2 and load pairs and marks from disk
 
     #if (defined( BBCU_DBG_SKIP_PHASE_2 ) && !defined( BBCU_DBG_SKIP_PHASE_1 ) )
         #define BBCU_DBG_SKIP_PHASE_1 1
@@ -60,7 +60,7 @@ static_assert( BBCU_BUCKET_ALLOC_ENTRY_COUNT / BBCU_BUCKET_COUNT == BBCU_MAX_SLI
     // #define DBG_BBCU_P2_WRITE_MARKS   1
 
     // #define DBG_BBCU_P2_COUNT_PRUNED_ENTRIES 1
-    #define DBG_BBCU_KEEP_TEMP_FILES 1
+    // #define DBG_BBCU_KEEP_TEMP_FILES 1
 
 
     #define _ASSERT_DOES_NOT_OVERLAP( b0, b1, size ) ASSERT( (b1+size) <= b0 || b1 >= (b0+size) )
diff --git a/cuda/CudaPlotPhase3.cu b/cuda/CudaPlotPhase3.cu
index ea8840b8..8fcdfe2a 100644
--- a/cuda/CudaPlotPhase3.cu
+++ b/cuda/CudaPlotPhase3.cu
@@ -235,7 +235,7 @@ void CudaK32PlotPhase3( CudaK32PlotContext& cx )
     }
     #endif
 
-    if( cx.cfg.hybrid64Mode )
+    if( cx.cfg.hybrid16Mode )
     {
         cx.diskContext->phase3.rMapBuffer->Swap();
         cx.diskContext->phase3.indexBuffer->Swap();
@@ -467,7 +467,7 @@ void Step1( CudaK32PlotContext& cx )
             p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i];
     }
 
-    if( cx.cfg.hybrid64Mode )
+    if( cx.cfg.hybrid16Mode )
     {
         cx.diskContext->phase3.rMapBuffer->Swap();
     }
@@ -611,7 +611,7 @@ void CompressInlinedTable( CudaK32PlotContext& cx )
             p3.prunedTableEntryCounts[(int)rTable] += p3.prunedBucketCounts[(int)rTable][i];
     }
 
-    if( cx.cfg.hybrid64Mode )
+    if( cx.cfg.hybrid16Mode )
     {
         cx.diskContext->phase3.lpAndLMapBuffer->Swap();
         cx.diskContext->phase3.indexBuffer->Swap();
@@ -640,7 +640,7 @@ void CudaK32PlotPhase3AllocateBuffers( CudaK32PlotContext& cx, CudaK32AllocConte
     p3.devPrunedEntryCount = acx.devAllocator->CAlloc<uint32>( 1, acx.alignment );
 
     // Host allocations
-    if( !cx.cfg.hybrid64Mode )
+    if( !cx.cfg.hybrid16Mode )
     {
         p3.hostRMap       = acx.hostTempAllocator->CAlloc<RMap>( BBCU_TABLE_ALLOC_ENTRY_COUNT );     // Used for rMap and index
         p3.hostLinePoints = acx.hostTempAllocator->CAlloc<uint64>( BBCU_TABLE_ALLOC_ENTRY_COUNT );   // Used for lMap and LPs
@@ -750,7 +750,7 @@ void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
     {
         uploadDesc.pinnedAllocator = acx.pinnedAllocator;
 
-        if( cx.cfg.hybrid64Mode )
+        if( cx.cfg.hybrid16Mode )
             desc.pinnedAllocator = acx.pinnedAllocator;
     }
 
@@ -762,7 +762,7 @@ void AllocXTableStep( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
     tx.lpOut     = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint64>( desc, acx.dryRun );
     tx.indexOut  = cx.gpuDownloadStream[0]->CreateDownloadBufferT<uint32>( desc, acx.dryRun );
 
-    if( !acx.dryRun && cx.cfg.hybrid64Mode )
+    if( !acx.dryRun && cx.cfg.hybrid16Mode )
     {
         tx.lpOut   .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
         tx.indexOut.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
@@ -785,7 +785,7 @@ void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContex
     {
         uploadDesc.pinnedAllocator = acx.pinnedAllocator;
 
-        if( cx.cfg.hybrid64Mode )
+        if( cx.cfg.hybrid16Mode )
             desc.pinnedAllocator = acx.pinnedAllocator;
     }
 
@@ -798,7 +798,7 @@ void CudaK32PlotAllocateBuffersStep1( CudaK32PlotContext& cx, CudaK32AllocContex
 
     s1.rTableMarks = (uint64*)acx.devAllocator->AllocT<uint64>( GetMarkingTableBitFieldSize(), acx.alignment );
 
-    if( !acx.dryRun && cx.cfg.hybrid64Mode )
+    if( !acx.dryRun && cx.cfg.hybrid16Mode )
     {
         s1.rMapOut.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer );
     }
@@ -816,7 +816,7 @@ void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContex
     desc.pinnedAllocator = nullptr;
 
     GpuStreamDescriptor uploadDesc = desc;
-    if( cx.cfg.hybrid64Mode )
+    if( cx.cfg.hybrid16Mode )
     {
         desc.pinnedAllocator = acx.pinnedAllocator;
     }
@@ -843,7 +843,7 @@ void CudaK32PlotAllocateBuffersStep2( CudaK32PlotContext& cx, CudaK32AllocContex
     s2.devLTable[0] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
     s2.devLTable[1] = acx.devAllocator->CAlloc<uint32>( BBCU_BUCKET_ALLOC_ENTRY_COUNT, alignment );
 
-    if( !acx.dryRun && cx.cfg.hybrid64Mode )
+    if( !acx.dryRun && cx.cfg.hybrid16Mode )
     {
         s2.rMapIn.AssignDiskBuffer( cx.diskContext->phase3.rMapBuffer );
         s2.lMapIn.AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
@@ -864,7 +864,7 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex
     desc.deviceAllocator = acx.devAllocator;
     desc.pinnedAllocator = nullptr;
 
-    if( cx.cfg.hybrid64Mode )
+    if( cx.cfg.hybrid16Mode )
     {
         desc.pinnedAllocator = acx.pinnedAllocator;
     }
@@ -909,7 +909,7 @@ void CudaK32PlotAllocateBuffersStep3( CudaK32PlotContext& cx, CudaK32AllocContex
     s3.devCTable           = acx.devAllocator->AllocT<FSE_CTable>( P3_MAX_CTABLE_SIZE, alignment );
     s3.devParkOverrunCount = acx.devAllocator->CAlloc<uint32>( 1 );
 
-    if( !acx.dryRun && cx.cfg.hybrid64Mode )
+    if( !acx.dryRun && cx.cfg.hybrid16Mode )
     {
         s3.lpIn   .AssignDiskBuffer( cx.diskContext->phase3.lpAndLMapBuffer );
         s3.indexIn.AssignDiskBuffer( cx.diskContext->phase3.indexBuffer );
@@ -1073,7 +1073,7 @@ void DbgValidateIndices( CudaK32PlotContext& cx )
 
     for( uint32 bucket = 0; bucket < BBCU_BUCKET_COUNT; bucket++ )
     {
-        if( cx.cfg.hybrid64Mode )
+        if( cx.cfg.hybrid16Mode )
         {
             const uint32* sizeSlices = &s2.prunedBucketSlices[0][bucket];
 
@@ -1102,7 +1102,7 @@ void DbgValidateIndices( CudaK32PlotContext& cx )
         }
     }
 
-    if( cx.cfg.hybrid64Mode )
+    if( cx.cfg.hybrid16Mode )
     {
         cx.diskContext->phase3.indexBuffer->Swap();
         cx.diskContext->phase3.indexBuffer->Swap();
diff --git a/cuda/CudaPlotPhase3Step2.cu b/cuda/CudaPlotPhase3Step2.cu
index 9d2d8991..3a7a6449 100644
--- a/cuda/CudaPlotPhase3Step2.cu
+++ b/cuda/CudaPlotPhase3Step2.cu
@@ -369,7 +369,7 @@ void CudaK32PlotPhase3Step2( CudaK32PlotContext& cx )
         ASSERT( p3.prunedBucketCounts[(int)rTable][bucket] <= P3_PRUNED_BUCKET_MAX );
     }
 
-    if( cx.cfg.hybrid64Mode )
+    if( cx.cfg.hybrid16Mode )
     {
         cx.diskContext->phase3.rMapBuffer->Swap();
         cx.diskContext->phase3.lpAndLMapBuffer->Swap();
diff --git a/cuda/CudaPlotPhase3Step3.cu b/cuda/CudaPlotPhase3Step3.cu
index 5e1fc46d..c8f9337b 100644
--- a/cuda/CudaPlotPhase3Step3.cu
+++ b/cuda/CudaPlotPhase3Step3.cu
@@ -318,7 +318,7 @@ void CudaK32PlotPhase3Step3( CudaK32PlotContext& cx )
     s3.lpIn   .Reset();
     s3.indexIn.Reset();
 
-    if( cx.cfg.hybrid64Mode )
+    if( cx.cfg.hybrid16Mode )
     {
         cx.diskContext->phase3.lpAndLMapBuffer->Swap();
         cx.diskContext->phase3.indexBuffer->Swap();
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index b3f03622..b5eb700e 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -59,18 +59,12 @@ GPU-based (CUDA) plotter
 
  --disk-128           : Enable hybrid disk plotting for 128G system RAM. 
                          Requires a --temp1 and --temp2 to be set.
- --disk-64            : Enable hybrid disk plotting for 64G system RAM. 
+ --disk-16            : Enable hybrid disk plotting for 16G system RAM. 
                          Requires a --temp1 and --temp2 to be set.
  -t1, --temp1         : Temporary directory 1. Used for longer-lived, sequential writes.
  -t2, --temp2         : Temporary directory 2. Used for temporary, shorted-lived read and writes.
                          NOTE: If only one of -t1 or -t2 is specified, both will be
                                set to the same directory.
-
- --no-direct-buffers : Disable using direct downloads and uploads from/to GPU and host.
-                        If this is set, intermediate buffers are used between the GPU and host,
-                        which will means slower plotting times.
-                        This is forcefully enabled on Windows to avoid limited pinnable memory.
-
 )";
 
 ///
@@ -86,11 +80,9 @@ void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
     {
         if( cli.ReadU32( cfg.deviceIndex, "-d", "--device" ) )
             continue;
-        if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-downloads" ) )
-            continue;
         if( cli.ReadSwitch( cfg.hybrid128Mode, "--disk-128" ) )
             continue;
-        if( cli.ReadSwitch( cfg.hybrid64Mode, "--disk-64" ) )
+        if( cli.ReadSwitch( cfg.hybrid16Mode, "--disk-16" ) )
         {
             cfg.hybrid128Mode = true;
             continue;
@@ -111,8 +103,8 @@ void CudaK32Plotter::ParseCLI( const GlobalPlotConfig& gCfg, CliParser& cli )
             continue;
         if( cli.ReadUnswitch( cfg.temp2DirectIO, "--no-t2-direct" ) )
             continue;
-        if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-buffers" ) )
-            continue;
+        // if( cli.ReadSwitch( cfg.disableDirectDownloads, "--no-direct-buffers" ) )
+        //     continue;
         if( cli.ArgMatch( "--help", "-h" ) )
         {
             Log::Line( USAGE );
@@ -194,7 +186,7 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     {
         cx.parkContext    = new CudaK32ParkContext{};
 
-        if( cx.cfg.hybrid64Mode )
+        if( cx.cfg.hybrid16Mode )
             cx.useParkContext = true;
     }
 
@@ -319,17 +311,17 @@ void CudaK32Plotter::Run( const PlotRequest& req )
 
     // Delete any temporary files
     #if !(DBG_BBCU_KEEP_TEMP_FILES)
-    if( cx.plotRequest.IsFinalPlot && cx.cfg.hybrid128Mode )
-    {
-        if( cx.diskContext->metaBuffer ) delete cx.diskContext->metaBuffer;
-        if( cx.diskContext->unsortedL ) delete cx.diskContext->unsortedL;
-
-        for( TableId t = TableId::Table1; t <= TableId::Table7; t++ )
+        if( cx.plotRequest.IsFinalPlot && cx.cfg.hybrid128Mode )
         {
-            if( cx.diskContext->tablesL[(int)t] ) delete cx.diskContext->tablesL[(int)t];
-            if( cx.diskContext->tablesR[(int)t] ) delete cx.diskContext->tablesR[(int)t];
+            if( cx.diskContext->metaBuffer ) delete cx.diskContext->metaBuffer;
+            if( cx.diskContext->unsortedL ) delete cx.diskContext->unsortedL;
+
+            for( TableId t = TableId::Table1; t <= TableId::Table7; t++ )
+            {
+                if( cx.diskContext->tablesL[(int)t] ) delete cx.diskContext->tablesL[(int)t];
+                if( cx.diskContext->tablesR[(int)t] ) delete cx.diskContext->tablesR[(int)t];
+            }
         }
-    }
     #endif
 }
 
@@ -508,12 +500,12 @@ void FpTable( CudaK32PlotContext& cx )
 
     if( cx.cfg.hybrid128Mode )
     {
-        if( cx.cfg.hybrid64Mode || cx.table == cx.firstStoredTable || cx.table == cx.firstStoredTable + 1 )
+        if( cx.cfg.hybrid16Mode || cx.table == cx.firstStoredTable || cx.table == cx.firstStoredTable + 1 )
         {
             cx.diskContext->unsortedL->Swap();
         }
 
-        if( cx.cfg.hybrid64Mode )
+        if( cx.cfg.hybrid16Mode )
         {
             cx.diskContext->yBuffer->Swap();
             cx.diskContext->metaBuffer->Swap();
@@ -1010,7 +1002,7 @@ void FinalizeTable7( CudaK32PlotContext& cx )
         cx.diskContext->tablesL[(int)TableId::Table7]->Swap();
         cx.diskContext->tablesR[(int)TableId::Table7]->Swap();
 
-        if( cx.cfg.hybrid64Mode )
+        if( cx.cfg.hybrid16Mode )
             cx.diskContext->yBuffer->Swap();
     }
 
@@ -1247,7 +1239,6 @@ void AllocBuffers( CudaK32PlotContext& cx )
         acx.devAllocator       = &devAllocator;
 
         AllocateP1Buffers( cx, acx );
-Log::Line( "P1: p: %llu MiB | t: %llu MiB", pinnedAllocator.Size() BtoMB, hostTempAllocator.Size() BtoMB );
         cx.pinnedAllocSize    = pinnedAllocator   .Size();
         cx.hostTableAllocSize = hostTableAllocator.Size();
         cx.hostTempAllocSize  = hostTempAllocator .Size();
@@ -1260,7 +1251,6 @@ Log::Line( "P1: p: %llu MiB | t: %llu MiB", pinnedAllocator.Size() BtoMB, hostTe
         devAllocator       = {};
 
         CudaK32PlotPhase2AllocateBuffers( cx, acx );
-Log::Line( "P2: p: %llu MiB | t: %llu MiB", pinnedAllocator.Size() BtoMB, hostTempAllocator.Size() BtoMB );
         cx.pinnedAllocSize    = std::max( cx.pinnedAllocSize   , pinnedAllocator   .Size() );
         cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
         cx.hostTempAllocSize  = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
@@ -1273,7 +1263,6 @@ Log::Line( "P2: p: %llu MiB | t: %llu MiB", pinnedAllocator.Size() BtoMB, hostTe
         devAllocator       = {};
 
         CudaK32PlotPhase3AllocateBuffers( cx, acx );
-Log::Line( "P3: p: %llu MiB | t: %llu MiB", pinnedAllocator.Size() BtoMB, hostTempAllocator.Size() BtoMB );
         cx.pinnedAllocSize    = std::max( cx.pinnedAllocSize   , pinnedAllocator   .Size() );
         cx.hostTableAllocSize = std::max( cx.hostTableAllocSize, hostTableAllocator.Size() );
         cx.hostTempAllocSize  = std::max( cx.hostTempAllocSize , hostTempAllocator .Size() );
@@ -1404,7 +1393,7 @@ void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
         // This is roughly equivalent to temp2 dir during disk plotting.
 
 
-        if( !cx.cfg.hybrid64Mode )
+        if( !cx.cfg.hybrid16Mode )
         {
             cx.hostY = acx.hostTempAllocator->CAlloc<uint32>( BBCU_TABLE_ALLOC_ENTRY_COUNT, alignment );
             cx.hostMeta = acx.hostTempAllocator->CAlloc<uint32>( BBCU_TABLE_ALLOC_ENTRY_COUNT * BBCU_HOST_META_MULTIPLIER, alignment );
@@ -1422,8 +1411,6 @@ void AllocateP1Buffers( CudaK32PlotContext& cx, CudaK32AllocContext& acx )
                                             BBCU_BUCKET_COUNT, metaSliceSize, FileMode::Create, FileAccess::ReadWrite, tmp2FileFlags );
             FatalIf( !cx.diskContext->metaBuffer, "Failed to create metadata disk buffer." );
         }
-Log::Line( "Host Temp @ %llu GiB", (llu)acx.hostTempAllocator->Size() BtoGB );
-Log::Line( "Host Tables B @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB );
 
         // Marking tables used to prune back pointers
         {
@@ -1505,7 +1492,7 @@ Log::Line( "Host Tables B @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
                                                                    BBCU_BUCKET_COUNT, xSliceSize, FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags );
             FatalIf( !cx.diskContext->unsortedL, "Failed to create unsorted L disk buffer." );
 
-            if( cx.cfg.hybrid64Mode )
+            if( cx.cfg.hybrid16Mode )
             {
                 cx.diskContext->unsortedR = DiskBucketBuffer::Create( *cx.diskContext->temp2Queue, "p1unsorted_r.tmp", 
                                                                     BBCU_BUCKET_COUNT, BBCU_MAX_SLICE_ENTRY_COUNT * sizeof( uint16 ), FileMode::OpenOrCreate, FileAccess::ReadWrite, tmp2FileFlags );
@@ -1519,7 +1506,6 @@ Log::Line( "Host Tables B @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
             }
         }
     }
-Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB );
 
     /// Device & Pinned allocations
     {
@@ -1548,7 +1534,7 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
             descXPairs.pinnedAllocator   = acx.pinnedAllocator;
             descXPairs.sliceAlignment    = cx.diskContext->temp2Queue->BlockSize();
 
-            if( cx.cfg.hybrid64Mode )
+            if( cx.cfg.hybrid16Mode )
             {
                 yDesc.pinnedAllocator = acx.pinnedAllocator;
                 yDesc.sliceAlignment  = cx.diskContext->temp2Queue->BlockSize();
@@ -1665,7 +1651,7 @@ Log::Line( "Host Tables A @ %llu GiB", (llu)acx.hostTableAllocator->Size() BtoGB
         cx.xPairsOut.AssignDiskBuffer( cx.diskContext->unsortedL );
         cx.xPairsIn .AssignDiskBuffer( cx.diskContext->unsortedL );
 
-        if( cx.cfg.hybrid64Mode )
+        if( cx.cfg.hybrid16Mode )
         {
             cx.pairsLOut.AssignDiskBuffer( cx.diskContext->unsortedL );
             cx.pairsLIn .AssignDiskBuffer( cx.diskContext->unsortedL );
diff --git a/cuda/CudaPlotter.h b/cuda/CudaPlotter.h
index 25a40bac..48e95a34 100644
--- a/cuda/CudaPlotter.h
+++ b/cuda/CudaPlotter.h
@@ -15,7 +15,7 @@ struct CudaK32PlotConfig
                                             // May be necessarry on Windows because of shared memory limitations (usual 50% of system memory)
 
     bool hybrid128Mode            = false;  // Enable hybrid disk-offload w/ 128G of RAM.
-    bool hybrid64Mode             = false;  // Enable hybrid disk-offload w/ 64G of RAM.
+    bool hybrid16Mode             = false;  // Enable hybrid disk-offload w/ 64G of RAM.
 
     const char* temp1Path         = nullptr; // For 128G RAM mode
     const char* temp2Path         = nullptr; // For 64G RAM mode
diff --git a/cuda/chacha8.cu b/cuda/chacha8.cu
index ead1f67c..7fb7c5d0 100644
--- a/cuda/chacha8.cu
+++ b/cuda/chacha8.cu
@@ -249,7 +249,7 @@ void GenF1Cuda( CudaK32PlotContext& cx )
     cx.yOut   .Reset();
     cx.metaOut.Reset();
 
-    if( cx.cfg.hybrid64Mode )
+    if( cx.cfg.hybrid16Mode )
     {
         cx.diskContext->yBuffer->Swap();
         cx.diskContext->metaBuffer->Swap();

From f4e346f75e73f41086f4c0f1c86763bd06991965 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 30 Aug 2023 15:21:50 -0400
Subject: [PATCH 19/26] Attempt to fix Harvester on windows again

---
 Bladebit.cmake      | 6 +++++-
 Harvester.cmake     | 3 +++
 cuda/CudaPlotter.cu | 2 +-
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/Bladebit.cmake b/Bladebit.cmake
index df45dc3b..9f1166dd 100644
--- a/Bladebit.cmake
+++ b/Bladebit.cmake
@@ -1,4 +1,4 @@
-add_library(bladebit_core src/plotting/DiskBuffer.h src/plotting/DiskBufferBase.cpp src/plotting/DiskBufferBase.h)
+add_library(bladebit_core)
 target_link_libraries(bladebit_core PUBLIC bladebit_config)
 
 target_include_directories(bladebit_core PUBLIC
@@ -294,9 +294,13 @@ set(src_bladebit
 
     src/plotting/DiskQueue.h
     src/plotting/DiskQueue.cpp
+    src/plotting/DiskBuffer.h
     src/plotting/DiskBuffer.cpp
     src/plotting/DiskBucketBuffer.h
     src/plotting/DiskBucketBuffer.cpp
+    src/plotting/DiskBufferBase.h
+    src/plotting/DiskBufferBase.cpp
+
     src/util/MPMCQueue.h
     src/util/CommandQueue.h
 )
diff --git a/Harvester.cmake b/Harvester.cmake
index c225c809..6063c22b 100644
--- a/Harvester.cmake
+++ b/Harvester.cmake
@@ -86,6 +86,9 @@ target_sources(bladebit_harvester PRIVATE
         # TODO: Does this have to be here?
         cuda/GpuStreams.cu
         cuda/GpuDownloadStream.cu
+        src/plotting/DiskBuffer.cpp
+        src/plotting/DiskBucketBuffer.cpp
+        src/plotting/DiskBufferBase.cpp
     >
 
     $<$<NOT:${have_cuda}>:
diff --git a/cuda/CudaPlotter.cu b/cuda/CudaPlotter.cu
index b5eb700e..6b5dc586 100644
--- a/cuda/CudaPlotter.cu
+++ b/cuda/CudaPlotter.cu
@@ -147,7 +147,7 @@ void InitContext( CudaK32PlotConfig& cfg, CudaK32PlotContext*& outContext )
     Log::Line( " Host RAM        : %llu GiB", SysHost::GetTotalSystemMemory() BtoGB );
     Log::Line( " Direct transfers: %s", cfg.disableDirectDownloads ? "false" : "true" );
     Log::NewLine();
-   
+
     CudaInit( cx );
 
     CudaErrCheck( cudaStreamCreateWithFlags( &cx.computeStream , cudaStreamNonBlocking ) );

From 7c4d43cabb429b466b9817abaef834e3f1cd4cec Mon Sep 17 00:00:00 2001
From: wallentx <william.allentx@gmail.com>
Date: Wed, 30 Aug 2023 17:24:56 -0500
Subject: [PATCH 20/26] Fixing EmbedVersion cmd execution

---
 cmake_modules/EmbedVersion.cmake | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/cmake_modules/EmbedVersion.cmake b/cmake_modules/EmbedVersion.cmake
index 50b269fb..deb849c8 100644
--- a/cmake_modules/EmbedVersion.cmake
+++ b/cmake_modules/EmbedVersion.cmake
@@ -4,18 +4,16 @@ if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
 
     set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
 
-    set(cmd_ver bash)
     set(ext_ver sh)
     if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
-        set(cmd_ver pwsh)
         set(ext_ver ps1)
     endif()
 
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} major    OUTPUT_VARIABLE bb_ver_maj    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} minor    OUTPUT_VARIABLE bb_ver_min    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} revision OUTPUT_VARIABLE bb_ver_rev    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} suffix   OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} commit   OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} major    OUTPUT_VARIABLE bb_ver_maj    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} minor    OUTPUT_VARIABLE bb_ver_min    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} revision OUTPUT_VARIABLE bb_ver_rev    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} suffix   OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} commit   OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
 
     # Remove trailing whitespace incurred in windows gitbash
     string(STRIP "${bb_ver_maj}"    bb_ver_maj)

From ce07925a8f2e823c174248f5e90ce15e55c2969e Mon Sep 17 00:00:00 2001
From: wallentx <william.allentx@gmail.com>
Date: Wed, 30 Aug 2023 17:45:15 -0500
Subject: [PATCH 21/26] Trying full powershell cmd name

---
 cmake_modules/EmbedVersion.cmake | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/cmake_modules/EmbedVersion.cmake b/cmake_modules/EmbedVersion.cmake
index deb849c8..9929fa41 100644
--- a/cmake_modules/EmbedVersion.cmake
+++ b/cmake_modules/EmbedVersion.cmake
@@ -4,16 +4,18 @@ if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
 
     set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
 
+    set(cmd_ver bash)
     set(ext_ver sh)
     if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
+        set(cmd_ver powershell)
         set(ext_ver ps1)
     endif()
 
-    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} major    OUTPUT_VARIABLE bb_ver_maj    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} minor    OUTPUT_VARIABLE bb_ver_min    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} revision OUTPUT_VARIABLE bb_ver_rev    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} suffix   OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} commit   OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} major    OUTPUT_VARIABLE bb_ver_maj    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} minor    OUTPUT_VARIABLE bb_ver_min    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} revision OUTPUT_VARIABLE bb_ver_rev    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} suffix   OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} commit   OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
 
     # Remove trailing whitespace incurred in windows gitbash
     string(STRIP "${bb_ver_maj}"    bb_ver_maj)

From 225b11d3ed83f4b64333877c89f895b1865d3f29 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 30 Aug 2023 18:50:50 -0400
Subject: [PATCH 22/26] Revert to allow git bash on windows, if found

---
 cmake_modules/EmbedVersion.cmake | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/cmake_modules/EmbedVersion.cmake b/cmake_modules/EmbedVersion.cmake
index deb849c8..f547f53d 100644
--- a/cmake_modules/EmbedVersion.cmake
+++ b/cmake_modules/EmbedVersion.cmake
@@ -2,18 +2,26 @@
 if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
     message("Embedding local build version")
 
-    set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
-
+    set(cmd_ver bash)
     set(ext_ver sh)
     if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
-        set(ext_ver ps1)
+
+        find_program(bash_path NAMES bash.exe NO_CACHE)
+
+        if(${bash_path} MATCHES "-NOTFOUND")
+            message("Bash was not found")
+            set(cmd_ver pwsh)
+            set(ext_ver ps1)
+        else()
+            set(cmd_ver "${bash_path}")
+        endif()
     endif()
 
-    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} major    OUTPUT_VARIABLE bb_ver_maj    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} minor    OUTPUT_VARIABLE bb_ver_min    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} revision OUTPUT_VARIABLE bb_ver_rev    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} suffix   OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} commit   OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} major    OUTPUT_VARIABLE bb_ver_maj    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} minor    OUTPUT_VARIABLE bb_ver_min    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} revision OUTPUT_VARIABLE bb_ver_rev    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} suffix   OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} commit   OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
 
     # Remove trailing whitespace incurred in windows gitbash
     string(STRIP "${bb_ver_maj}"    bb_ver_maj)
@@ -39,3 +47,5 @@ if(NOT DEFINED ENV{CI})
     add_compile_definitions(BLADEBIT_VERSION_SUFFIX="${bb_ver_suffix}")
     add_compile_definitions(BLADEBIT_GIT_COMMIT="${bb_ver_commit}")
 endif()
+
+set(bb_version_embedded on CACHE BOOL "Version embedding has already happened.")
\ No newline at end of file

From 8eb8523270886bc050c4d1c23f84f5ad8fa74b94 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 30 Aug 2023 18:57:33 -0400
Subject: [PATCH 23/26] Check for git bash on windows

---
 cmake_modules/EmbedVersion.cmake | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/cmake_modules/EmbedVersion.cmake b/cmake_modules/EmbedVersion.cmake
index 4a2507b6..1c346632 100644
--- a/cmake_modules/EmbedVersion.cmake
+++ b/cmake_modules/EmbedVersion.cmake
@@ -2,26 +2,25 @@
 if((NOT DEFINED ENV{CI}) AND (NOT DEFINED CACHE{bb_version_embedded}))
     message("Embedding local build version")
 
-    set(cmd_ver bash)
-    set(ext_ver sh)
+    set(cmd_shell bash)
+    set(cmd_ext sh)
     if(${CMAKE_SYSTEM_NAME} MATCHES "Windows")
 
         find_program(bash_path NAMES bash.exe NO_CACHE)
 
         if(${bash_path} MATCHES "-NOTFOUND")
-            message("Bash was not found")
-            set(cmd_ver powershell)
-            set(ext_ver ps1)
+            set(cmd_shell powershell)
+            set(cmd_ext ps1)
         else()
-            set(cmd_ver "${bash_path}")
+            set(cmd_shell "${bash_path}")
         endif()
     endif()
 
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} major    OUTPUT_VARIABLE bb_ver_maj    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} minor    OUTPUT_VARIABLE bb_ver_min    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} revision OUTPUT_VARIABLE bb_ver_rev    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} suffix   OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
-    execute_process(COMMAND ${cmd_ver} ${CMAKE_SOURCE_DIR}/extract-version.${ext_ver} commit   OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} major    OUTPUT_VARIABLE bb_ver_maj    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} minor    OUTPUT_VARIABLE bb_ver_min    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} revision OUTPUT_VARIABLE bb_ver_rev    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} suffix   OUTPUT_VARIABLE bb_ver_suffix WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
+    execute_process(COMMAND ${cmd_shell} ${CMAKE_SOURCE_DIR}/extract-version.${cmd_ext} commit   OUTPUT_VARIABLE bb_ver_commit WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY)
 
     # Remove trailing whitespace incurred in windows gitbash
     string(STRIP "${bb_ver_maj}"    bb_ver_maj)

From 5db9f94b767f1199d691933a9c9c6179c7a7af85 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 30 Aug 2023 19:06:13 -0400
Subject: [PATCH 24/26] Fis harvester CI on windows

---
 Harvester.cmake                   | 4 +++-
 src/harvesting/HarvesterDummy.cpp | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)
 create mode 100644 src/harvesting/HarvesterDummy.cpp

diff --git a/Harvester.cmake b/Harvester.cmake
index 6063c22b..208799ce 100644
--- a/Harvester.cmake
+++ b/Harvester.cmake
@@ -1,5 +1,5 @@
 if(NOT ${BB_HARVESTER_STATIC})
-    add_library(bladebit_harvester SHARED)
+    add_library(bladebit_harvester SHARED src/harvesting/HarvesterDummy.cpp)
 else()
     add_library(bladebit_harvester STATIC)
 endif()
@@ -82,6 +82,7 @@ target_sources(bladebit_harvester PRIVATE
         cuda/CudaF1.cu
         cuda/CudaMatch.cu
         cuda/CudaPlotUtil.cu
+        cuda/CudaQueue.cu
 
         # TODO: Does this have to be here?
         cuda/GpuStreams.cu
@@ -89,6 +90,7 @@ target_sources(bladebit_harvester PRIVATE
         src/plotting/DiskBuffer.cpp
         src/plotting/DiskBucketBuffer.cpp
         src/plotting/DiskBufferBase.cpp
+        src/plotting/DiskQueue.cpp
     >
 
     $<$<NOT:${have_cuda}>:
diff --git a/src/harvesting/HarvesterDummy.cpp b/src/harvesting/HarvesterDummy.cpp
new file mode 100644
index 00000000..e2d8f69e
--- /dev/null
+++ b/src/harvesting/HarvesterDummy.cpp
@@ -0,0 +1 @@
+// Only here to make CMake happy
\ No newline at end of file

From 9aa55e4e78f61f31aa0bc7f427502655c0c3e5bd Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 30 Aug 2023 19:11:46 -0400
Subject: [PATCH 25/26] Fix incorrect file name on Harvester sources

---
 Harvester.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Harvester.cmake b/Harvester.cmake
index 208799ce..692daa80 100644
--- a/Harvester.cmake
+++ b/Harvester.cmake
@@ -82,7 +82,7 @@ target_sources(bladebit_harvester PRIVATE
         cuda/CudaF1.cu
         cuda/CudaMatch.cu
         cuda/CudaPlotUtil.cu
-        cuda/CudaQueue.cu
+        cuda/GpuQueue.cu
 
         # TODO: Does this have to be here?
         cuda/GpuStreams.cu

From 42baec21de5ce694957445f788bb846ee3ed6084 Mon Sep 17 00:00:00 2001
From: Harold Brenes <h.brenes@chia.net>
Date: Wed, 30 Aug 2023 20:01:14 -0400
Subject: [PATCH 26/26] Reverting PlotWriter to previous implementation.
 Increasing PlotWriter buffer

---
 src/plotting/PlotWriter.cpp | 378 +++++++++++++++++++-----------------
 src/plotting/PlotWriter.h   |  26 ++-
 2 files changed, 219 insertions(+), 185 deletions(-)

diff --git a/src/plotting/PlotWriter.cpp b/src/plotting/PlotWriter.cpp
index 8c78934c..239fcbeb 100644
--- a/src/plotting/PlotWriter.cpp
+++ b/src/plotting/PlotWriter.cpp
@@ -10,7 +10,6 @@ PlotWriter::PlotWriter() : PlotWriter( true ) {}
 PlotWriter::PlotWriter( bool useDirectIO )
     : _writerThread( new Thread( 4 MiB ) )
     , _directIO    ( useDirectIO )
-    , _queue()
 {
     _readyToPlotSignal.Signal();    // Start ready to plot
 
@@ -276,16 +275,18 @@ void PlotWriter::EndPlot( const bool rename )
 
     ASSERT( _stream.IsOpen() );
 
-    // auto& cmd = GetCommand( CommandType::EndPlot );
-    // cmd.endPlot.fence    = &_completedFence;
-    // cmd.endPlot.rename   = rename;
-    // SubmitCommands();
-
-    SubmitCommand({ .type = CommandType::EndPlot,
-        .endPlot{ .fence    = &_completedFence,
-                  .rename   = rename
-        }
-    });
+    #if BB_LOCKFREE_BASED_QUEUE
+        auto& cmd = GetCommand( CommandType::EndPlot );
+        cmd.endPlot.fence    = &_completedFence;
+        cmd.endPlot.rename   = rename;
+        SubmitCommands();
+    #else
+        SubmitCommand({ .type = CommandType::EndPlot,
+            .endPlot{ .fence    = &_completedFence,
+                      .rename   = rename
+            }
+        });
+    #endif
 }
 
 
@@ -337,14 +338,16 @@ void PlotWriter::BeginTable( const PlotTable table )
 {
     if( _dummyMode ) return;
 
-    SubmitCommand({
-        .type = CommandType::BeginTable,
-        .beginTable{ .table = table }
-    });
-    // auto& cmd = GetCommand( CommandType::BeginTable );
-    // auto cmd = GetCommand( CommandType::BeginTable );
-    // cmd.beginTable.table = table;
-    // SubmitCommands();
+    #if BB_LOCKFREE_BASED_QUEUE
+        auto& cmd = GetCommand( CommandType::BeginTable );
+        cmd.beginTable.table = table;
+        SubmitCommands();
+    #else
+        SubmitCommand({
+            .type = CommandType::BeginTable,
+            .beginTable{ .table = table }
+        });
+    #endif
 }
 
 //-----------------------------------------------------------
@@ -352,18 +355,20 @@ void PlotWriter::ReserveTableSize( const PlotTable table, const size_t size )
 {
     if( _dummyMode ) return;
 
-    // auto& cmd = GetCommand( CommandType::ReserveTable );
-    // cmd.reserveTable.table = table;
-    // cmd.reserveTable.size  = size;
-    // SubmitCommands();
-
-     SubmitCommand({
-        .type = CommandType::ReserveTable,
-        .reserveTable { 
-            .table = table,
-            .size  = size 
-        }
-    });
+    #if BB_LOCKFREE_BASED_QUEUE
+        auto& cmd = GetCommand( CommandType::ReserveTable );
+        cmd.reserveTable.table = table;
+        cmd.reserveTable.size  = size;
+        SubmitCommands();
+    #else
+         SubmitCommand({
+            .type = CommandType::ReserveTable,
+            .reserveTable { 
+                .table = table,
+                .size  = size 
+            }
+        });
+    #endif
 }
 
 //-----------------------------------------------------------
@@ -371,9 +376,12 @@ void PlotWriter::EndTable()
 {
     if( _dummyMode ) return;
 
-    // auto& cmd = GetCommand( CommandType::EndTable );
-    // SubmitCommands();
-    SubmitCommand({ .type = CommandType::EndTable });
+    #if BB_LOCKFREE_BASED_QUEUE
+        auto& cmd = GetCommand( CommandType::EndTable );
+        SubmitCommands();
+    #else
+        SubmitCommand({ .type = CommandType::EndTable });
+    #endif
 }
 
 //-----------------------------------------------------------
@@ -381,16 +389,18 @@ void PlotWriter::WriteTableData( const void* data, const size_t size )
 {
     if( _dummyMode ) return;
 
-    // auto& cmd = GetCommand( CommandType::WriteTable );
-    // cmd.writeTable.buffer = (byte*)data;
-    // cmd.writeTable.size   = size;
-    // SubmitCommands();
-
-    SubmitCommand({ .type = CommandType::WriteTable,
-        .writeTable{ .buffer = (byte*)data,
-                     .size   = size,
-        }
-    });
+    #if BB_LOCKFREE_BASED_QUEUE
+        auto& cmd = GetCommand( CommandType::WriteTable );
+        cmd.writeTable.buffer = (byte*)data;
+        cmd.writeTable.size   = size;
+        SubmitCommands();
+    #else
+        SubmitCommand({ .type = CommandType::WriteTable,
+            .writeTable{ .buffer = (byte*)data,
+                         .size   = size,
+            }
+        });
+    #endif
 }
 
 //-----------------------------------------------------------
@@ -398,17 +408,19 @@ void PlotWriter::WriteReservedTable( const PlotTable table, const void* data )
 {
     if( _dummyMode ) return;
 
-    // auto& cmd = GetCommand( CommandType::WriteReservedTable );
-    // cmd.writeReservedTable.table  = table;
-    // cmd.writeReservedTable.buffer = (byte*)data;
-    // SubmitCommands();
-
-    SubmitCommand({ .type = CommandType::WriteReservedTable,
-        .writeReservedTable{ 
-            .table  = table,
-            .buffer = (byte*)data
-        }
-    });
+    #if BB_LOCKFREE_BASED_QUEUE
+        auto& cmd = GetCommand( CommandType::WriteReservedTable );
+        cmd.writeReservedTable.table  = table;
+        cmd.writeReservedTable.buffer = (byte*)data;
+        SubmitCommands();
+    #else
+        SubmitCommand({ .type = CommandType::WriteReservedTable,
+            .writeReservedTable{ 
+                .table  = table,
+                .buffer = (byte*)data
+            }
+        });
+    #endif
 }
 
 //-----------------------------------------------------------
@@ -420,16 +432,18 @@ void PlotWriter::SignalFence( Fence& fence )
         return;
     }
 
-    // auto& cmd = GetCommand( CommandType::SignalFence );
-    // cmd.signalFence.fence    = &fence;
-    // cmd.signalFence.sequence = -1;
-    // SubmitCommands();
-
-    SubmitCommand({ .type = CommandType::SignalFence,
-        .signalFence{ .fence    = &fence,
-                      .sequence = -1
-        }
-    });
+    #if BB_LOCKFREE_BASED_QUEUE
+        auto& cmd = GetCommand( CommandType::SignalFence );
+        cmd.signalFence.fence    = &fence;
+        cmd.signalFence.sequence = -1;
+        SubmitCommands();
+    #else
+        SubmitCommand({ .type = CommandType::SignalFence,
+            .signalFence{ .fence    = &fence,
+                        .sequence = -1
+            }
+        });
+    #endif
 }
 
 //-----------------------------------------------------------
@@ -441,16 +455,18 @@ void PlotWriter::SignalFence( Fence& fence, uint32 sequence )
         return;
     }
 
-    // auto& cmd = GetCommand( CommandType::SignalFence );
-    // cmd.signalFence.fence    = &fence;
-    // cmd.signalFence.sequence = (int64)sequence;
-    // SubmitCommands();
-    
-    SubmitCommand({ .type = CommandType::SignalFence,
-        .signalFence{ .fence    = &fence,
-                      .sequence = (int64)sequence
-        }
-    });
+    #if BB_LOCKFREE_BASED_QUEUE
+        auto& cmd = GetCommand( CommandType::SignalFence );
+        cmd.signalFence.fence    = &fence;
+        cmd.signalFence.sequence = (int64)sequence;
+        SubmitCommands();
+    #else
+        SubmitCommand({ .type = CommandType::SignalFence,
+            .signalFence{ .fence    = &fence,
+                        .sequence = (int64)sequence
+            }
+        });
+    #endif
 }
 
 //-----------------------------------------------------------
@@ -462,69 +478,87 @@ void PlotWriter::CallBack( std::function<void()> func )
         return;
     }
 
-    // auto& cmd = GetCommand( CommandType::CallBack );
-    // cmd.callback.func = new std::function<void()>( std::move( func ) );
-    // SubmitCommands();
-
-    SubmitCommand({ .type =  CommandType::CallBack,
-        .callback{ .func = new std::function<void()>( std::move( func ) ) }
-    });
+    #if BB_LOCKFREE_BASED_QUEUE
+        auto& cmd = GetCommand( CommandType::CallBack );
+        cmd.callback.func = new std::function<void()>( std::move( func ) );
+        SubmitCommands();
+    #else
+        SubmitCommand({ .type =  CommandType::CallBack,
+            .callback{ .func = new std::function<void()>( std::move( func ) ) }
+        });
+    #endif
 }
 
 //-----------------------------------------------------------
 void PlotWriter::ExitWriterThread()
 {
-    // Signal writer thread to exit after it finishes its commands
-    // auto& cmd = GetCommand( CommandType::Exit );
-    // cmd.signalFence.fence = &_completedFence;
-    // SubmitCommands();
-
-    SubmitCommand({ .type = CommandType::Exit,
-        .signalFence{ .fence = &_completedFence }
-    });
-
+    #if BB_LOCKFREE_BASED_QUEUE
+        // Signal writer thread to exit after it finishes its commands
+        auto& cmd = GetCommand( CommandType::Exit );
+        cmd.signalFence.fence = &_completedFence;
+        SubmitCommands();
+    #else
+        SubmitCommand({ .type = CommandType::Exit,
+            .signalFence{ .fence = &_completedFence }
+        });
+    #endif
     // Wait for writer thread to exit
     _completedFence.Wait();
     ASSERT( _writerThread->HasExited() );
 }
 
+#if BB_LOCKFREE_BASED_QUEUE
 //-----------------------------------------------------------
 PlotWriter::Command& PlotWriter::GetCommand( CommandType type )
 {
-    Panic( "Don't use me!" );
-
-    // if( _owner != nullptr )
-    // {
-    //     auto* cmd = _owner->GetCommandObject( DiskBufferQueue::Command::CommandType::PlotWriterCommand );
-    //     ASSERT( cmd );
-
-    //     ZeroMem( &cmd->plotWriterCmd );
-    //     cmd->plotWriterCmd.writer   = this;
-    //     cmd->plotWriterCmd.cmd.type = type;
-    //     return cmd->plotWriterCmd.cmd;
-    // }
-    // else
-    // {
-    //     Command* cmd = nullptr;
-    //     while( !_queue.Write( cmd ) )
-    //     {
-    //         Log::Line( "[PlotWriter] Command buffer full. Waiting for commands." );
-    //         auto waitTimer = TimerBegin();
-
-    //         // Block and wait until we have commands free in the buffer
-    //         _cmdConsumedSignal.Wait();
+    if( _owner != nullptr )
+    {
+        auto* cmd = _owner->GetCommandObject( DiskBufferQueue::Command::CommandType::PlotWriterCommand );
+        ASSERT( cmd );
+
+        ZeroMem( &cmd->plotWriterCmd );
+        cmd->plotWriterCmd.writer   = this;
+        cmd->plotWriterCmd.cmd.type = type;
+        return cmd->plotWriterCmd.cmd;
+    }
+    else
+    {
+        Command* cmd = nullptr;
+        while( !_queue.Write( cmd ) )
+        {
+            Log::Line( "[PlotWriter] Command buffer full. Waiting for commands." );
+            auto waitTimer = TimerBegin();
+
+            // Block and wait until we have commands free in the buffer
+            _cmdConsumedSignal.Wait();
             
-    //         Log::Line( "[PlotWriter] Waited %.6lf seconds for a Command to be available.", TimerEnd( waitTimer ) );
-    //     }
+            Log::Line( "[PlotWriter] Waited %.6lf seconds for a Command to be available.", TimerEnd( waitTimer ) );
+        }
         
-    //     ASSERT( cmd );
-    //     ZeroMem( cmd );
-    //     cmd->type = type;
+        ASSERT( cmd );
+        ZeroMem( cmd );
+        cmd->type = type;
+
+        return *cmd;
+    }
+}
 
-    //     return *cmd;
-    // }
+//-----------------------------------------------------------
+void PlotWriter::SubmitCommands()
+{
+    // Panic( "" );
+    if( _owner != nullptr )
+    {
+        _owner->CommitCommands();
+    }
+    else
+    {
+        _queue.Commit();
+        _cmdReadySignal.Signal();
+    }
 }
 
+#else
 //-----------------------------------------------------------
 void PlotWriter::SubmitCommand( const Command cmd )
 {
@@ -532,20 +566,8 @@ void PlotWriter::SubmitCommand( const Command cmd )
     _queue.push( cmd );
     _cmdReadySignal.Signal();
 }
+#endif // BB_LOCKFREE_BASED_QUEUE
 
-//-----------------------------------------------------------
-void PlotWriter::SubmitCommands()
-{Panic( "" );
-    // if( _owner != nullptr )
-    // {
-    //     _owner->CommitCommands();
-    // }
-    // else
-    // {
-    //     _queue.Commit();
-    //     _cmdReadySignal.Signal();
-    // }
-}
 
 
 ///
@@ -569,60 +591,62 @@ void PlotWriter::WriterThreadMain()
         _cmdReadySignal.Wait();
 
         // Load commands from the queue
-        // int32 cmdCount;
-        // while( ( ( cmdCount = _queue.Dequeue( commands, MAX_COMMANDS ) ) ) )
-        // {
-        //     // Notify we consumed commands
-        //     _cmdConsumedSignal.Signal();
-
-        //     for( int32 i = 0; i < cmdCount; i++ )
-        //     {
-        //         if( commands[i].type == CommandType::Exit )
-        //         {
-        //             commands[i].signalFence.fence->Signal();
-        //             return;
-        //         }
-
-        //         ExecuteCommand( commands[i] );
-        //     }
-        // }
-
-        // Consume commands from the queue and execute them
-        // until there are none more found in the queue
-        size_t cmdCount = 0;
-        for( ;; )
-        {
-            // Consume commands from queue
+        #if BB_LOCKFREE_BASED_QUEUE
+            int32 cmdCount = 0;
+            while( ( cmdCount = _queue.Dequeue( commands, MAX_COMMANDS ) ) > 0 )
             {
-                std::unique_lock lock( _queueLock );
-                cmdCount = std::min<size_t>( _queue.size(), MAX_COMMANDS );
-                
-                for( size_t i = 0; i < cmdCount; i++ )
+                // Notify we consumed commands
+                _cmdConsumedSignal.Signal();
+
+                for( int32 i = 0; i < cmdCount; i++ )
                 {
-                    commands[i] = _queue.front();
-                    _queue.pop();
+                    if( commands[i].type == CommandType::Exit )
+                    {
+                        commands[i].signalFence.fence->Signal();
+                        return;
+                    }
+
+                    ExecuteCommand( commands[i] );
                 }
             }
+        #else
+            // Consume commands from the queue and execute them
+            // until there are none more found in the queue
+            size_t cmdCount = 0;
+            for( ;; )
+            {
+                // Consume commands from queue
+                {
+                    std::unique_lock lock( _queueLock );
+                    cmdCount = std::min<size_t>( _queue.size(), MAX_COMMANDS );
+                    
+                    for( size_t i = 0; i < cmdCount; i++ )
+                    {
+                        commands[i] = _queue.front();
+                        _queue.pop();
+                    }
+                }
 
-            // Notify we consumed commands
-            _cmdConsumedSignal.Signal();
+                // Notify we consumed commands
+                _cmdConsumedSignal.Signal();
 
-            if( cmdCount < 1 )
-                break;
+                if( cmdCount < 1 )
+                    break;
 
-            // Execute commands
-            for( int32 i = 0; i < cmdCount; i++ )
-            {
-                if( commands[i].type == CommandType::Exit )
+                // Execute commands
+                for( int32 i = 0; i < cmdCount; i++ )
                 {
-                    commands[i].signalFence.fence->Signal();
-                    return;
-                }
+                    if( commands[i].type == CommandType::Exit )
+                    {
+                        commands[i].signalFence.fence->Signal();
+                        return;
+                    }
 
-                ExecuteCommand( commands[i] );
+                    ExecuteCommand( commands[i] );
+                }
             }
-        }
-    }
+        #endif
+    } // End for( ;; )
 }
 
 //-----------------------------------------------------------
diff --git a/src/plotting/PlotWriter.h b/src/plotting/PlotWriter.h
index ef6a55ed..a359f721 100644
--- a/src/plotting/PlotWriter.h
+++ b/src/plotting/PlotWriter.h
@@ -8,8 +8,12 @@
 #include "threading/AutoResetSignal.h"
 #include "threading/Fence.h"
 #include <functional>
-#include <mutex>
-#include <queue>
+
+#define BB_LOCKFREE_BASED_QUEUE 1
+#if !BB_LOCKFREE_BASED_QUEUE
+    #include <mutex>
+    #include <queue>
+#endif
 
 /**
  * Handles writing the final plot data to disk asynchronously.
@@ -161,8 +165,12 @@ class PlotWriter
         int32 compressionLevel );
 
     Command& GetCommand( CommandType type );
-    void SubmitCommands();
-    void SubmitCommand( const Command cmd );
+
+    #if BB_LOCKFREE_BASED_QUEUE
+        void SubmitCommands();
+    #else
+        void SubmitCommand( const Command cmd );
+    #endif
     
     void SeekToLocation( size_t location );
 
@@ -286,10 +294,12 @@ class PlotWriter
     size_t                 _tableStart          = 0;    // Current table start location
     uint64                 _tablePointers[10]   = {};
     uint64                 _tableSizes   [10]   = {};
-    // SPCQueue<Command, 512> _queue;
 
-    std::queue<Command>     _queue;
-    std::mutex              _queueLock;
-    // std::mutex              _pushLock;
+    #if BB_LOCKFREE_BASED_QUEUE
+        SPCQueue<Command,4096> _queue;
+    #else
+        std::queue<Command>     _queue;
+        std::mutex              _queueLock;
+    #endif
 };