diff --git a/attack.hpp b/attack.hpp
deleted file mode 100644
index 136e1ad..0000000
--- a/attack.hpp
+++ /dev/null
@@ -1,1468 +0,0 @@
-/*
- * attack.hpp
- *
- *  Created on: Oct 26, 2021
- *      Author: nick
- */
-
-#ifndef ATTACK_HPP_
-#define ATTACK_HPP_
-
-#include "nick_blake3.hpp"
-//#include
-//#include
-//#include
-
-#define ATTACK_KBCFILTER(chacha_y,i) \
-{ \
-    uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-    uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-    if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
-        uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
-        int slot = atomicAdd(&kbc_local_num_entries[local_kbc_bucket_id],1); \
-        Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \
-        if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-        uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-        kbc_local_entries[entries_address] = entry; \
-    } \
-}
-
-__global__
-void gpu_chacha8_k32_kbc_ranges(const uint32_t N,
-        const __restrict__ uint32_t *input, Tx_Bucketed_Meta1 *kbc_local_entries, int *kbc_local_num_entries,
-        uint32_t KBC_START, uint32_t KBC_END)
-{
-    uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
-    int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-    int stride = blockDim.x * gridDim.x;
-    const uint32_t end_n = N / 16; // 16 x's in each group
-
-    for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-        uint32_t x = x_group << 4; // *16;
-        uint32_t pos = x_group;
-
-        x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
-        x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
-        x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
-        x14 = input[14];x15 = input[15];
-
-        #pragma unroll
-        for (int i = 0; i < 4; i++) {
-            QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
-            QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
-        }
-
-        x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
-        x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
-        x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0;
-        x14 += input[14];x15 += input[15];
-
-        // convert to little endian/big endian whatever, chia needs it like this
-        BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
-        BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
-        BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
-
-        //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-        //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-        ATTACK_KBCFILTER(x0,0);ATTACK_KBCFILTER(x1,1);ATTACK_KBCFILTER(x2,2);ATTACK_KBCFILTER(x3,3);
-        ATTACK_KBCFILTER(x4,4);ATTACK_KBCFILTER(x5,5);ATTACK_KBCFILTER(x6,6);ATTACK_KBCFILTER(x7,7);
-        ATTACK_KBCFILTER(x8,8);ATTACK_KBCFILTER(x9,9);ATTACK_KBCFILTER(x10,10);ATTACK_KBCFILTER(x11,11);
-        ATTACK_KBCFILTER(x12,12);ATTACK_KBCFILTER(x13,13);ATTACK_KBCFILTER(x14,14);ATTACK_KBCFILTER(x15,15);
-    }
-}
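The ATTACK_KBCFILTER macro above builds the 38-bit f1 output y by appending the top kExtraBits = 6 bits of x to the byte-swapped 32-bit chacha word, then buckets by kBC. A minimal host-side sketch of the same arithmetic, handy for unit-testing the KBC_START/KBC_END ranges (constants follow chiapos: kB = 119, kC = 127, kBC = kB*kC = 15113; the chacha word is taken as given):

static const uint64_t kBC_host = 15113; // kB = 119, kC = 127

// f1(x): the 32-bit chacha stream word plus 6 extra bits taken from the top of x
uint64_t f1_y(uint32_t chacha_y, uint32_t x) {
    return (((uint64_t) chacha_y) << 6) + (x >> 26);
}

uint32_t kbc_bucket_of(uint64_t y) { return (uint32_t)(y / kBC_host); }
uint32_t kbc_offset_of(uint64_t y) { return (uint32_t)(y % kBC_host); } // what the kernel stores as entry.y

An entry is kept iff KBC_START <= kbc_bucket_of(y) <= KBC_END, which is exactly the macro's filter condition.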
-
-__device__ int gpu_xs_L_count = 0;
-__device__ int gpu_xs_R_count = 0;
-
-#define ATTACK_WRITEXS_LR(chacha_y,i) \
-{ \
-    uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-    uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-    if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \
-        int slot = atomicAdd(&local_num_xs_L,1); \
-        local_xs_L[slot] = x+i; \
-        local_ys_L[slot] = chacha_y; \
-    } \
-    if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \
-        int slot = atomicAdd(&local_num_xs_R,1); \
-        local_xs_R[slot] = x+i; \
-        local_ys_R[slot] = chacha_y; \
-    } \
-}
-
-__global__
-void gpu_chacha8_k32_kbc_ranges_LR_write_xy(const uint32_t N,
-        const __restrict__ uint32_t *input,
-        uint32_t *xs_L, uint32_t *ys_L, uint32_t *xs_L_count, uint32_t KBC_START_L, uint32_t KBC_END_L,
-        uint32_t *xs_R, uint32_t *ys_R, uint32_t *xs_R_count, uint32_t KBC_START_R, uint32_t KBC_END_R)
-{
-    __shared__ uint32_t local_xs_L[256]; // assumes a block never collects more than 256 L (or R) hits
-    __shared__ uint32_t local_ys_L[256];
-    __shared__ uint32_t local_xs_R[256];
-    __shared__ uint32_t local_ys_R[256];
-    __shared__ int local_num_xs_L;
-    __shared__ int local_num_xs_R;
-    __shared__ int global_L_slot;
-    __shared__ int global_R_slot;
-
-    uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local
-    //__shared__ uint32_t datax[33*256]; // each thread (256 max) gets its own shared access starting at a 32-byte boundary.
-
-    int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-    int stride = blockDim.x * gridDim.x;
-    const uint32_t end_n = N / 16; // 16 x's in each group
-
-    if (threadIdx.x == 0) {
-        local_num_xs_L = 0;
-        local_num_xs_R = 0;
-    }
-    __syncthreads();
-    const int j = 0; // the per-thread local datax[16] needs no offset; the shared variant above would use j = 33*threadIdx.x
-    for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-        uint32_t x = x_group << 4; // *16;
-        uint32_t pos = x_group;
-
-        datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7];
-        datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11];
-        datax[j+12] = pos; datax[j+13] = 0; // pos never bigger than 32 bit pos >> 32;
-        datax[j+14] = input[14];datax[j+15] = input[15];
-
-        #pragma unroll
-        for (int i = 0; i < 4; i++) {
-            QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]);
-            QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]);
-            QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]);
-            QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]);
-        }
-
-        datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4];
-        datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9];
-        datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0;
-        datax[j+14] += input[14];datax[j+15] += input[15];
-
-        // convert to little endian/big endian whatever, chia needs it like this
-        BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]);
-        BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]);
-        BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]);
-
-        //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-        //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023
-        ATTACK_WRITEXS_LR(datax[j+0],0);ATTACK_WRITEXS_LR(datax[j+1],1);ATTACK_WRITEXS_LR(datax[j+2],2);ATTACK_WRITEXS_LR(datax[j+3],3);
-        ATTACK_WRITEXS_LR(datax[j+4],4);ATTACK_WRITEXS_LR(datax[j+5],5);ATTACK_WRITEXS_LR(datax[j+6],6);ATTACK_WRITEXS_LR(datax[j+7],7);
-        ATTACK_WRITEXS_LR(datax[j+8],8);ATTACK_WRITEXS_LR(datax[j+9],9);ATTACK_WRITEXS_LR(datax[j+10],10);ATTACK_WRITEXS_LR(datax[j+11],11);
-        ATTACK_WRITEXS_LR(datax[j+12],12);ATTACK_WRITEXS_LR(datax[j+13],13);ATTACK_WRITEXS_LR(datax[j+14],14);ATTACK_WRITEXS_LR(datax[j+15],15);
-    }
-    // without global writes this has a maximum speed of 21ms;
-    // these global writes push it up to 26ms.
-    // hope here is that sorting won't take long, so that sorted entries come in under 35ms total,
-    // and then the matching *should* be quicker than when it's bucketed
-    __syncthreads();
-    if (threadIdx.x == 0) {
-        //printf("finished with %u %u counts\n", local_num_xs_L, local_num_xs_R);
-        global_L_slot = atomicAdd(&xs_L_count[0],local_num_xs_L);
-        global_R_slot = atomicAdd(&xs_R_count[0],local_num_xs_R);
-    }
-    __syncthreads();
-    for (int i = threadIdx.x; i < local_num_xs_L; i+=blockDim.x) {
-        xs_L[i+global_L_slot] = local_xs_L[i];
-    }
-    for (int i = threadIdx.x; i < local_num_xs_L; i+=blockDim.x) {
-        ys_L[i+global_L_slot] = local_ys_L[i];
-    }
-    for (int i = threadIdx.x; i < local_num_xs_R; i+=blockDim.x) {
-        xs_R[i+global_R_slot] = local_xs_R[i];
-    }
-    for (int i = threadIdx.x; i < local_num_xs_R; i+=blockDim.x) {
-        ys_R[i+global_R_slot] = local_ys_R[i];
-    }
-}
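The flush at the end of that kernel is a general trick: threads accumulate hits in shared memory, one thread reserves a contiguous global range with a single atomicAdd, and the block then copies out cooperatively. A stripped-down sketch of just that pattern (all names here are illustrative, not from this file; assumes blockDim.x <= 256 so the staging buffer cannot overflow):

__global__ void staged_collect(const uint32_t *in, uint32_t N, uint32_t *out, uint32_t *out_count) {
    __shared__ uint32_t buf[256];
    __shared__ int buf_n;
    __shared__ int base;
    if (threadIdx.x == 0) buf_n = 0;
    __syncthreads();
    uint32_t i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < N && (in[i] & 1)) {              // stand-in filter predicate
        int slot = atomicAdd(&buf_n, 1);     // cheap shared-memory atomic
        buf[slot] = in[i];
    }
    __syncthreads();
    if (threadIdx.x == 0) base = atomicAdd(out_count, buf_n); // one global atomic per block
    __syncthreads();
    for (int s = threadIdx.x; s < buf_n; s += blockDim.x) out[base + s] = buf[s];
}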
-
-__global__
-void gpu_merge_f1xypairs_into_kbc_buckets(
-        const uint32_t KBC_START_ID, // determined by batch_id
-        const uint64_t *in, const uint32_t N,
-        Tx_Bucketed_Meta1 *local_kbc_entries, int *local_kbc_counts)
-{
-    uint32_t i = blockIdx.x*blockDim.x+threadIdx.x;
-    if (i < N) {
-        uint64_t value = in[i];
-        uint32_t x = value >> 32;
-        uint32_t chacha_y = value; // low 32 bits
-        uint64_t calc_y = (((uint64_t) chacha_y) << 6) + (x >> 26);
-        uint32_t kbc_id = calc_y / kBC;
-        uint32_t KBC_END_ID = KBC_START_ID + KBC_LOCAL_NUM_BUCKETS / 256;
-        if ((kbc_id >= KBC_START_ID) && (kbc_id < KBC_END_ID)) { // both bounds must hold for the entry to belong to this range
-
-            uint32_t local_kbc_id = kbc_id - KBC_START_ID;
-            int slot = atomicAdd(&local_kbc_counts[local_kbc_id],1);
-            uint32_t destination_address = local_kbc_id * KBC_MAX_ENTRIES_PER_BUCKET + slot;
-
-            //printf("block_id:%u [i: %u] entry.y:%u kbc_id:%u local_kbc:%u slot:%u dest:%u\n",
-            //        block_id, i, block_entry.y, kbc_id, local_kbc_id, slot, destination_address);
-
-            if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) {
-                printf("OVERFLOW: slot > MAX ENTRIES PER BUCKET\n");
-            }
-            if (destination_address > DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-                printf("OVERFLOW: destination_address overflow > DEVICE_BUFFER_ALLOCATED_ENTRIES %u\n", destination_address);
-            }
-            Tx_Bucketed_Meta1 kbc_entry = {};
-            kbc_entry.y = calc_y % kBC;
-            kbc_entry.meta[0] = x;
-            local_kbc_entries[destination_address] = kbc_entry;
-        }
-    }
-}
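The merge kernel above expects each 64-bit word to carry x in the high half and the raw chacha word in the low half, matching the packing done by the xypairs writer below. A tiny sketch of that round trip:

// pack: x in bits 63..32, chacha_y in bits 31..0
uint64_t pack_xy(uint32_t x, uint32_t chacha_y) {
    return (((uint64_t) x) << 32) | chacha_y;
}

void unpack_xy(uint64_t value, uint32_t &x, uint32_t &chacha_y) {
    x = (uint32_t)(value >> 32);
    chacha_y = (uint32_t) value; // low 32 bits
}

Keeping the pair in one word means a single 8-byte store per entry and halves the number of global transactions compared with separate x and y arrays.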
-
-__global__
-void gpu_chacha8_k32_kbc_ranges_LR_write_xypairs(const uint32_t N,
-        const __restrict__ uint32_t *input,
-        uint64_t *xys_L, uint32_t *xs_L_count, uint32_t KBC_START_L, uint32_t KBC_END_L,
-        uint64_t *xys_R, uint32_t *xs_R_count, uint32_t KBC_START_R, uint32_t KBC_END_R)
-{
-    __shared__ uint32_t local_xs_L[256];
-    __shared__ uint32_t local_ys_L[256];
-    __shared__ uint32_t local_xs_R[256];
-    __shared__ uint32_t local_ys_R[256];
-    __shared__ int local_num_xs_L;
-    __shared__ int local_num_xs_R;
-    __shared__ int global_L_slot;
-    __shared__ int global_R_slot;
-
-    uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local
-    //__shared__ uint32_t datax[256*17]; // each thread (256 max) gets its own shared access starting at a 32-byte boundary.
-
-    int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-    int stride = blockDim.x * gridDim.x;
-    const uint32_t end_n = N / 16; // 16 x's in each group
-
-    if (threadIdx.x == 0) {
-        local_num_xs_L = 0;
-        local_num_xs_R = 0;
-    }
-    __syncthreads();
-    const int j = 0; // the per-thread local datax[16] needs no offset; the shared variant above would use j = 17*threadIdx.x
-    for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-        uint32_t x = x_group << 4; // *16;
-        uint32_t pos = x_group;
-
-        datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7];
-        datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11];
-        datax[j+12] = pos; datax[j+13] = 0; // pos never bigger than 32 bit pos >> 32;
-        datax[j+14] = input[14];datax[j+15] = input[15];
-
-        #pragma unroll
-        for (int i = 0; i < 4; i++) {
-            QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]);
-            QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]);
-            QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]);
-            QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]);
-        }
-
-        datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4];
-        datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9];
-        datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0;
-        datax[j+14] += input[14];datax[j+15] += input[15];
-
-        // convert to little endian/big endian whatever, chia needs it like this
-        BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]);
-        BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]);
-        BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]);
-
-        //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-        //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023
-        ATTACK_WRITEXS_LR(datax[j+0],0);ATTACK_WRITEXS_LR(datax[j+1],1);ATTACK_WRITEXS_LR(datax[j+2],2);ATTACK_WRITEXS_LR(datax[j+3],3);
-        ATTACK_WRITEXS_LR(datax[j+4],4);ATTACK_WRITEXS_LR(datax[j+5],5);ATTACK_WRITEXS_LR(datax[j+6],6);ATTACK_WRITEXS_LR(datax[j+7],7);
-        ATTACK_WRITEXS_LR(datax[j+8],8);ATTACK_WRITEXS_LR(datax[j+9],9);ATTACK_WRITEXS_LR(datax[j+10],10);ATTACK_WRITEXS_LR(datax[j+11],11);
-        ATTACK_WRITEXS_LR(datax[j+12],12);ATTACK_WRITEXS_LR(datax[j+13],13);ATTACK_WRITEXS_LR(datax[j+14],14);ATTACK_WRITEXS_LR(datax[j+15],15);
-    }
-    // without global writes this has a maximum speed of 21ms;
-    // these global writes push it up to 26ms.
-    // hope here is that sorting won't take long, so that sorted entries come in under 35ms total,
-    // and then the matching *should* be quicker than when it's bucketed
-    __syncthreads();
-    if (threadIdx.x == 0) {
-        //printf("finished with %u %u counts\n", local_num_xs_L, local_num_xs_R);
-        global_L_slot = atomicAdd(&xs_L_count[0],local_num_xs_L);
-        global_R_slot = atomicAdd(&xs_R_count[0],local_num_xs_R);
-    }
-    __syncthreads();
-    for (int i = threadIdx.x; i < local_num_xs_L; i+=blockDim.x) {
-        xys_L[i+global_L_slot] = (((uint64_t) local_xs_L[i]) << 32) + local_ys_L[i];
-    }
-    for (int i = threadIdx.x; i < local_num_xs_R; i+=blockDim.x) {
-        xys_R[i+global_R_slot] = (((uint64_t) local_xs_R[i]) << 32) + local_ys_R[i];
-    }
-}
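A possible host-side launch for the writer above, sized so every group of 16 x-values is covered; everything outside the kernel call is a hypothetical sketch, and a real caller would zero the counters exactly as shown:

void launch_write_xypairs(uint32_t N, const uint32_t *d_input,
        uint64_t *d_xys_L, uint32_t *d_count_L, uint32_t start_L, uint32_t end_L,
        uint64_t *d_xys_R, uint32_t *d_count_R, uint32_t start_R, uint32_t end_R) {
    const int blockSize = 256;                      // matches the 256-entry shared staging buffers
    const uint32_t groups = N / 16;                 // one thread handles 16 x's
    const int numBlocks = (groups + blockSize - 1) / blockSize;
    cudaMemset(d_count_L, 0, sizeof(uint32_t));
    cudaMemset(d_count_R, 0, sizeof(uint32_t));
    gpu_chacha8_k32_kbc_ranges_LR_write_xypairs<<<numBlocks, blockSize>>>(N, d_input,
            d_xys_L, d_count_L, start_L, end_L,
            d_xys_R, d_count_R, start_R, end_R);
}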
-
-#define ATTACK_KBCFILTER_LR(chacha_y,i) \
-{ \
-    uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-    uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-    if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \
-        uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L; \
-        int slot = atomicAdd(&kbc_local_num_entries_L[local_kbc_bucket_id],1); \
-        Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \
-        if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-        uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-        kbc_local_entries_L[entries_address] = entry; \
-    } \
-    if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \
-        uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R; \
-        int slot = atomicAdd(&kbc_local_num_entries_R[local_kbc_bucket_id],1); \
-        Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \
-        if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-        uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-        kbc_local_entries_R[entries_address] = entry; \
-    } \
-}
-
-__global__
-void gpu_chacha8_k32_kbc_ranges_LR(const uint32_t N,
-        const __restrict__ uint32_t *input,
-        Tx_Bucketed_Meta1 *kbc_local_entries_L, int *kbc_local_num_entries_L, uint32_t KBC_START_L, uint32_t KBC_END_L,
-        Tx_Bucketed_Meta1 *kbc_local_entries_R, int *kbc_local_num_entries_R, uint32_t KBC_START_R, uint32_t KBC_END_R)
-{
-    uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
-    int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-    int stride = blockDim.x * gridDim.x;
-    const uint32_t end_n = N / 16; // 16 x's in each group
-
-    for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-        uint32_t x = x_group << 4; // *16;
-        uint32_t pos = x_group;
-
-        x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
-        x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
-        x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
-        x14 = input[14];x15 = input[15];
-
-        #pragma unroll
-        for (int i = 0; i < 4; i++) {
-            QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
-            QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
-        }
-
-        x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
-        x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
-        x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0;
-        x14 += input[14];x15 += input[15];
-
-        // convert to little endian/big endian whatever, chia needs it like this
-        BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
-        BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
-        BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
-
-        //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-        //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-        ATTACK_KBCFILTER_LR(x0,0);ATTACK_KBCFILTER_LR(x1,1);ATTACK_KBCFILTER_LR(x2,2);ATTACK_KBCFILTER_LR(x3,3);
-        ATTACK_KBCFILTER_LR(x4,4);ATTACK_KBCFILTER_LR(x5,5);ATTACK_KBCFILTER_LR(x6,6);ATTACK_KBCFILTER_LR(x7,7);
-        ATTACK_KBCFILTER_LR(x8,8);ATTACK_KBCFILTER_LR(x9,9);ATTACK_KBCFILTER_LR(x10,10);ATTACK_KBCFILTER_LR(x11,11);
-        ATTACK_KBCFILTER_LR(x12,12);ATTACK_KBCFILTER_LR(x13,13);ATTACK_KBCFILTER_LR(x14,14);ATTACK_KBCFILTER_LR(x15,15);
-    }
-}
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_attack_find_t1_matches(uint16_t table, uint32_t start_kbc_L, uint32_t end_kbc_R,
-        const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries,
-        BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) {
-    // T1 match: 1714 ms -> with delaying extras: 1630
-    // Total tables time: 73726 ms
-    // match: 10015 ms -> 9705ms with delaying extras
-    const uint16_t NUM_RMAPS = (kBC/2)+1;
-    __shared__ int nick_rmap[NUM_RMAPS]; // positions and counts. Uses 30 bits: two 15-bit entries, each with the lower 9 bits for pos and counts from 1024 up
-    __shared__ uint32_t nick_rmap_extras_rl[32];
-    __shared__ uint16_t nick_rmap_extras_ry[32];
-    __shared__ uint16_t nick_rmap_extras_pos[32];
-    __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET];
-    __shared__ int total_matches;
-    __shared__ int num_extras;
-    __shared__ int y_duplicate_counts;
-
-    int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L;
-    uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L;
-
-    const uint8_t doPrint = 0;
-
-    if (gridDim.x != (end_kbc_R - start_kbc_L)) {
-        printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L);
-    }
-    int numThreadsInBlock = blockDim.x;
-    int threadId = threadIdx.x;
-    int threadStartScan = threadId;
-    int threadSkipScan = numThreadsInBlock;
-
-    const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET;
-    const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET;
-    const int num_L = kbc_local_num_entries[kbc_L_bucket_id];
-    const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)];
-    const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L];
-    const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R];
-
-    if (threadIdx.x == 0) {
-        total_matches = 0;
-        num_extras = 0;
-        y_duplicate_counts = 0;
-        if (doPrint > 1) {
-            printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R);
-            if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) {
-                printf("ERROR numL or numR > max entries\n");
-                return;
-            }
-            if ((num_L == 0) || (num_R == 0)) {
-                printf("ERROR: numL or numR is 0\n");
-                return;
-            }
-        }
-    }
-    // unfortunately to clear we have to do this
-    for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) {
-        nick_rmap[i] = 0;
-    }
-    __syncthreads(); // all written initialize data should sync
-
-    uint16_t parity = global_kbc_L_bucket_id % 2;
-
-    for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-        //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-        BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-        uint16_t r_y = R_entry.y;
-
-        // r_y's share a box across two adjacent values, so kbc_map just works out which half it's in.
-        int kbc_map = r_y / 2;
-        const int kbc_box_shift = (r_y % 2) * 15;
-        int add = 1024 << kbc_box_shift; // we add from the 10th bit up (shifted by the box it's in)
-
-        int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above)
-        rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-        if (rmap_value == 0) {
-            // if we added to an empty spot, we also add the pos_R here in the lower 9 bits of the box,
-            // and ONLY for this one.
-            atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift));
-            //if (printandquit) {
-            //    printf("r_y: %u  pos:%u\n", r_y, pos_R);
-            //}
-        } else {
-            // we hit a duplicate entry...add this to a row
-            int slot = atomicAdd(&num_extras, 1);
-            nick_rmap_extras_ry[slot] = r_y;
-            nick_rmap_extras_pos[slot] = pos_R;
-        }
-    }
-
-    __syncthreads(); // wait for all threads to write r_bid entries
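nick_rmap packs two 15-bit "boxes" per int: for r-side value v, box v%2 of word v/2 holds a 9-bit first-position in bits 0-8 and a count starting at bit 10 (hence the add of 1024). A host-side sketch that just makes the bit layout explicit (helper names are illustrative):

// one 32-bit word holds boxes for r_y = 2*w (low 15 bits) and r_y = 2*w+1 (bits 15-29)
int box_shift(uint16_t r_y)              { return (r_y % 2) * 15; }
int count_of(int word, uint16_t r_y)     { return ((word >> box_shift(r_y)) & 0x7FFF) >> 10; }
int first_pos_of(int word, uint16_t r_y) { return (word >> box_shift(r_y)) & 0x1FF; }
// adding (1024 << box_shift) bumps the count; adding (pos << box_shift) stores the
// first pos -- valid only while the box was empty, which is why duplicate r_y's
// spill into the nick_rmap_extras_* arrays instead.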
-
-    for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) {
-        //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L];
-        BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-        uint16_t l_y = L_entry.y;
-        //printf("scanning for pos_L: %u\n", pos_L);
-
-        for (int m=0;m<64;m++) {
-
-            //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup
-            // is super-inefficient.
-
-            uint16_t indJ = l_y / kC;
-            uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC);
-
-            // find which box our r_target is in, extract the 15-bit value from that box
-            int kbc_map = r_target / 2;
-            const int kbc_box_shift = (r_target % 2) * 15;
-            int rmap_value = nick_rmap[kbc_map];
-            rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-
-            if (rmap_value > 0) {
-                // the pos_R is the lower 9 bits of that 15-bit boxed value
-                uint16_t pos_R = rmap_value & 0b0111111111;
-                uint16_t count = rmap_value / 1024;
-
-                int num_matches = atomicAdd(&total_matches,1); //count);
-                if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-                    printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-                } else {
-                    Index_Match match = { };
-                    match.idxL = pos_L;
-                    match.idxR = pos_R;
-                    matches[num_matches] = match;
-
-                    // handle edge cases
-                    // TODO: let's push these into a separate array
-                    // then test them later.
-                    if (count > 1) {
-                        int slot = atomicAdd(&y_duplicate_counts, 1);
-                        nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L;
-                    }
-                }
-            }
-        }
-    }
-
-    __syncthreads();
-
-    // do the extras
-
-    //int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add!
-    for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) {
-        for (int i=0; i<y_duplicate_counts; i++) {
-            uint32_t value = nick_rmap_extras_rl[i];
-            uint16_t r_target = value >> 16;
-            uint16_t pos_L = value & 0x0FFFF;
-            if (nick_rmap_extras_ry[slot] == r_target) {
-                uint16_t extra_pos_R = nick_rmap_extras_pos[slot];
-                Index_Match match = { };
-                match.idxL = pos_L;
-                match.idxR = extra_pos_R;
-                int num_matches = atomicAdd(&total_matches,1);
-                matches[num_matches] = match;
-                //matches[total_matches+slot] = match;
-                //if (doPrint > 1) {
-                //    printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot);
-                //}
-            }
-        }
-    }
-
-    __syncthreads();
-
-    if (threadIdx.x == 0) {
-        if (doPrint>1) {
-            // only do this once, should be in constant memory
-            //if (doPrint>2) {
-            //    printf("match list\n");
-            //    for (int i=0;i<total_matches;i++) printf("  L:%u R:%u\n", matches[i].idxL, matches[i].idxR);
-            //}
-        }
-        if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-            printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-            total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-        }
-    }
-
-    __syncthreads();
-
-    // now we go through all our matches and output to next round.
-    for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) {
-        Index_Match match = matches[i];
-        BUCKETED_ENTRY_OUT pair = {};
-        BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL];
-        BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR];
-        uint64_t blake_result;
-        uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id);
-
-        pair.meta[0] = L_Entry.meta[0];
-        pair.meta[1] = R_Entry.meta[0];
-        //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms
-        nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL);
-        //if (global_kbc_L_bucket_id == 1) {
-        //    printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result);
-        //}
-
-        uint64_t batch_bucket = blake_result >> (38-6); // setting this to 0 (seq.) changes from 57ms to 48ms.
-        const uint64_t block_mod = (uint64_t) 1 << (38-6);
-        pair.y = (uint32_t) (blake_result % block_mod);
-        int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1);
-        uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES + block_slot;
-        if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-            printf("ERROR: results address overflow\n");
-        } else {
-            bucketed_out[pair_address] = pair;
-        }
-    }
-}
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_attack_find_t1_matches_out_kbc(uint16_t table, uint32_t start_kbc_L, uint32_t end_kbc_R,
-        const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries,
-        BUCKETED_ENTRY_OUT *kbc_out, unsigned int *out_kbc_counts, const uint32_t MAX_KBC_ENTRIES) {
-    // T1 match: 1714 ms -> with delaying extras: 1630
-    // Total tables time: 73726 ms
-    // match: 10015 ms -> 9705ms with delaying extras
-    const uint16_t NUM_RMAPS = (kBC/2)+1;
-    __shared__ int nick_rmap[NUM_RMAPS]; // positions and counts. Uses 30 bits: two 15-bit entries, each with the lower 9 bits for pos and counts from 1024 up
-    __shared__ uint32_t nick_rmap_extras_rl[32];
-    __shared__ uint16_t nick_rmap_extras_ry[32];
-    __shared__ uint16_t nick_rmap_extras_pos[32];
-    __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET];
-    __shared__ BUCKETED_ENTRY_IN kbc_L_entries[KBC_MAX_ENTRIES_PER_BUCKET];
-    __shared__ BUCKETED_ENTRY_IN kbc_R_entries[KBC_MAX_ENTRIES_PER_BUCKET];
-    __shared__ int total_matches;
-    __shared__ int num_extras;
-    __shared__ int y_duplicate_counts;
-
-    int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L;
-    uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L;
-
-    const uint8_t doPrint = 0;
-
-    if (gridDim.x != (end_kbc_R - start_kbc_L)) {
-        printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L);
-    }
-    int numThreadsInBlock = blockDim.x;
-    int threadId = threadIdx.x;
-    int threadStartScan = threadId;
-    int threadSkipScan = numThreadsInBlock;
-
-    const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET;
-    const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET;
-    const int num_L = kbc_local_num_entries[kbc_L_bucket_id];
-    const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)];
-
-    for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-        kbc_R_entries[pos_R] = kbc_local_entries[start_R+pos_R];
-    }
-    for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) {
-        kbc_L_entries[pos_L] = kbc_local_entries[start_L+pos_L];
-    }
-
-    if (threadIdx.x == 0) {
-        total_matches = 0;
-        num_extras = 0;
-        y_duplicate_counts = 0;
-        if (doPrint > 1) {
-            printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R);
-            if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) {
-                printf("ERROR numL or numR > max entries\n");
-                return;
-            }
-            if ((num_L == 0) || (num_R == 0)) {
-                printf("ERROR: numL or numR is 0\n");
-                return;
-            }
-        }
-    }
-    // unfortunately to clear we have to do this
-    for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) {
-        nick_rmap[i] = 0;
-    }
-    __syncthreads(); // all written initialize data should sync
-
-    uint16_t parity = global_kbc_L_bucket_id % 2;
-
-    for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-        //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-        BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-        uint16_t r_y = R_entry.y;
-
-        // r_y's share a box across two adjacent values, so kbc_map just works out which half it's in.
-        int kbc_map = r_y / 2;
-        const int kbc_box_shift = (r_y % 2) * 15;
-        int add = 1024 << kbc_box_shift; // we add from the 10th bit up (shifted by the box it's in)
-
-        int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above)
-        rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-        if (rmap_value == 0) {
-            // if we added to an empty spot, we also add the pos_R here in the lower 9 bits of the box,
-            // and ONLY for this one.
-            atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift));
-            //if (printandquit) {
-            //    printf("r_y: %u  pos:%u\n", r_y, pos_R);
-            //}
-        } else {
-            // we hit a duplicate entry...add this to a row
-            int slot = atomicAdd(&num_extras, 1);
-            nick_rmap_extras_ry[slot] = r_y;
-            nick_rmap_extras_pos[slot] = pos_R;
-        }
-    }
-
-    __syncthreads(); // wait for all threads to write r_bid entries
-
-    for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) {
-        //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L];
-        BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-        uint16_t l_y = L_entry.y;
-        //printf("scanning for pos_L: %u\n", pos_L);
-
-        for (int m=0;m<64;m++) {
-
-            //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup
-            // is super-inefficient.
-
-            uint16_t indJ = l_y / kC;
-            uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC);
-
-            // find which box our r_target is in, extract the 15-bit value from that box
-            int kbc_map = r_target / 2;
-            const int kbc_box_shift = (r_target % 2) * 15;
-            int rmap_value = nick_rmap[kbc_map];
-            rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-
-            if (rmap_value > 0) {
-                // the pos_R is the lower 9 bits of that 15-bit boxed value
-                uint16_t pos_R = rmap_value & 0b0111111111;
-                uint16_t count = rmap_value / 1024;
-
-                int num_matches = atomicAdd(&total_matches,1); //count);
-                if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-                    printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-                } else {
-                    Index_Match match = { };
-                    match.idxL = pos_L;
-                    match.idxR = pos_R;
-                    matches[num_matches] = match;
-
-                    // handle edge cases
-                    // TODO: let's push these into a separate array
-                    // then test them later.
-                    if (count > 1) {
-                        int slot = atomicAdd(&y_duplicate_counts, 1);
-                        nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L;
-                    }
-                }
-            }
-        }
-    }
-
-    __syncthreads();
-
-    // do the extras
-
-    //int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add!
-    for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) {
-        for (int i=0; i<y_duplicate_counts; i++) {
-            uint32_t value = nick_rmap_extras_rl[i];
-            uint16_t r_target = value >> 16;
-            uint16_t pos_L = value & 0x0FFFF;
-            if (nick_rmap_extras_ry[slot] == r_target) {
-                uint16_t extra_pos_R = nick_rmap_extras_pos[slot];
-                Index_Match match = { };
-                match.idxL = pos_L;
-                match.idxR = extra_pos_R;
-                int num_matches = atomicAdd(&total_matches,1);
-                matches[num_matches] = match;
-                //matches[total_matches+slot] = match;
-                //if (doPrint > 1) {
-                //    printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot);
-                //}
-            }
-        }
-    }
-
-    __syncthreads();
-
-    if (threadIdx.x == 0) {
-        if (doPrint>1) {
-            // only do this once, should be in constant memory
-            //if (doPrint>2) {
-            //    printf("match list\n");
-            //    for (int i=0;i<total_matches;i++) printf("  L:%u R:%u\n", matches[i].idxL, matches[i].idxR);
-            //}
-        }
-        if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-            printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-            total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-        }
-    }
-
-    __syncthreads();
-
-    // now we go through all our matches and output to next round.
-    for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) {
-        Index_Match match = matches[i];
-        BUCKETED_ENTRY_OUT pair = {};
-        BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL];
-        BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR];
-        uint64_t blake_result;
-        uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id);
-
-        pair.meta[0] = L_Entry.meta[0];
-        pair.meta[1] = R_Entry.meta[0];
-        //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms
-        nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL);
-
-        //uint32_t batch_bucket = blake_result >> (38-6); // setting this to 0 (seq.) changes from 57ms to 48ms.
-
-        //if ((pair.meta[0] == 1320788535) || (pair.meta[0] == 2131394289)) {
-        //    printf("Got y %llu batch:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, batch_bucket, L_Entry.meta[0], R_Entry.meta[0], blake_result);
-        //}
-
-        uint32_t kbc_bucket = blake_result / kBC;
-        pair.y = (uint32_t) (blake_result % kBC);
-        //if (batch_bucket == 49) {
-        //int block_slot = atomicAdd(&out_kbc_counts[kbc_bucket],1);
-
-        // slightly faster and more memory efficient anyway:
-        // eight 4-bit counters live in each word, so one atomicAdd of a shifted 1 reserves this bucket's next slot
-        uint32_t kbc_bitmask_bucket = kbc_bucket / 8;
-        uint32_t kbc_bitmask_shift = 4*(kbc_bucket % 8);
-        unsigned int kbc_bitmask_add = 1 << (kbc_bitmask_shift);
-        unsigned int bitadd = atomicAdd(&out_kbc_counts[kbc_bitmask_bucket],kbc_bitmask_add);
-        uint32_t block_slot = (bitadd >> (kbc_bitmask_shift)) & 0b01111;
-
-/*
- * Doing T1
- *   chacha L1 time: 35 ms
- *   match T1 L time: 18 ms
- *   match T1 R time: 18 ms
- *   match T2 L time: 22 ms
- * Freeing memory...
- * GPU DISPLAY T2 MATCH RESULTS:
- *   block 22 entry 3140 x1:1320788535 x2:3465356684 x3:2131394289 x4:606438761
- *   TOTAL: 262341
- *
- * Doing T1
- *   chacha L1 time: 36 ms
- *   match T1 L time: 19 ms
- *   match T1 R time: 19 ms
- *   match T2 L time: 22 ms
- * Freeing memory...
- * GPU DISPLAY T2 MATCH RESULTS:
- *   block 22 entry 3140 x1:1320788535 x2:3465356684 x3:2131394289 x4:606438761
- *   TOTAL: 262341
- */
-
-        if (block_slot >= MAX_KBC_ENTRIES) {
-            printf("block_slot > MAX %u\n", block_slot);
-        } else {
-            uint32_t pair_address = kbc_bucket * MAX_KBC_ENTRIES + block_slot;
-            //if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-            //    printf("ERROR: results address overflow\n");
-            //} else {
-                kbc_out[pair_address] = pair;
-            //}
-        }
-        //} // TOKENPOD
-    }
-}
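The out_kbc path above squeezes eight bucket counters into each unsigned int: 4 bits per bucket, reserved via one atomicAdd of a shifted 1. A compact sketch of the reserve/read pair (names here are illustrative; valid only while every count stays below 16):

// reserve a slot in bucket b; counts[] holds 8 nibble counters per 32-bit word
__device__ uint32_t reserve_slot(unsigned int *counts, uint32_t b) {
    uint32_t word = b / 8;
    uint32_t shift = 4 * (b % 8);
    unsigned int old = atomicAdd(&counts[word], 1u << shift); // bump only our nibble
    return (old >> shift) & 0xF; // previous count == our slot
}

__device__ uint32_t read_count(const unsigned int *counts, uint32_t b) {
    return (counts[b / 8] >> (4 * (b % 8))) & 0xF;
}

The caveat: a nibble that reaches 15 and is bumped again carries into its neighbor, so this only works when per-bucket occupancy is provably small, as it is for the sparse kbc output here.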
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_attack_find_tx_LR_matches(uint16_t table, uint32_t start_kbc_L, uint32_t end_kbc_R,
-        const BUCKETED_ENTRY_IN *kbc_local_entries_L, const int *kbc_local_num_entries_L,
-        const BUCKETED_ENTRY_IN *kbc_local_entries_R, const int *kbc_local_num_entries_R,
-        BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) {
-
-    __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; // TODO: this could be smaller
-    __shared__ int total_matches;
-
-    int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L;
-    uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L;
-
-    const uint8_t doPrint = 0;
-
-    if (gridDim.x != (end_kbc_R - start_kbc_L)) {
-        printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L);
-    }
-    int numThreadsInBlock = blockDim.x;
-    int threadId = threadIdx.x;
-    int threadStartScan = threadId;
-    int threadSkipScan = numThreadsInBlock;
-
-    const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET;
-    const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET;
-    const int num_L = kbc_local_num_entries_L[kbc_L_bucket_id];
-    const int num_R = kbc_local_num_entries_R[(kbc_L_bucket_id+1)];
-    const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries_L[start_L];
-    const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries_R[start_R];
-
-    if (threadIdx.x == 0) {
-        total_matches = 0;
-        if (doPrint > 1) {
-            printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R);
-            if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) {
-                printf("ERROR numL or numR > max entries\n");
-                return;
-            }
-            if ((num_L == 0) || (num_R == 0)) {
-                printf("ERROR: numL or numR is 0\n");
-                return;
-            }
-        }
-    }
-    if ((num_L == 0) || (num_R == 0)) {
-        return;
-    }
-
-    __syncthreads(); // all written initialize data should sync
-
-    // For any 0 <= m < kExtraBitsPow:
-    //   yl / kBC + 1 = yR / kBC   AND
-    //   (yr % kBC) / kC - (yl % kBC) / kC = m (mod kB)   AND
-    //   (yr % kBC) % kC - (yl % kBC) % kC = (2m + (yl/kBC) % 2)^2 (mod kC)
-
-    for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-        //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-        BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-        int16_t yr_kbc = R_entry.y;
-        int16_t yr_bid = yr_kbc / kC; // values [0..kB]
-        for (uint16_t pos_L = 0; pos_L < num_L; pos_L++) {
-            // do L_entry and R_entry match?
-            BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-            int16_t yl_kbc = L_entry.y;
-            int16_t yl_bid = yl_kbc / kC; // values [0..kB]
-            int16_t formula_one = yr_bid - yl_bid; // this should actually give m
-            if (formula_one < 0) {
-                formula_one += kB;
-            }
-            int16_t m = formula_one;
-            if (m >= kB) {
-                m -= kB;
-            }
-            if (m < 64) {
-                // passed first test
-                int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC is perfectly divisible by kC
-                int16_t yr_cid = yr_kbc % kC;
-                int16_t parity = (global_kbc_L_bucket_id) % 2;
-                int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127]
-                int16_t formula_two = yr_cid - yl_cid;
-                if (formula_two < 0) {
-                    formula_two += kC;
-                }
-                if (formula_two == m2_parity_squared) {
-                    // we have a match.
-                    int num_matches = atomicAdd(&total_matches,1);
-                    if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-                        printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-                    } else {
-                        Index_Match match = { };
-                        match.idxL = pos_L;
-                        match.idxR = pos_R; //value >> 4;
-                        matches[num_matches] = match;
-                    }
-                }
-            }
-        }
-    }
-
-    __syncthreads();
-
-    if (threadIdx.x == 0) {
-        if (doPrint>1) {
-            // only do this once, should be in constant memory
-            //if (doPrint>2) {
-            //    printf("match list\n");
-            //    for (int i=0;i<total_matches;i++) printf("  L:%u R:%u\n", matches[i].idxL, matches[i].idxR);
-            //}
-        }
-        if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-            printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-            total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-        }
-    }
-
-    __syncthreads();
-
-    // now we go through all our matches and output to next round.
-    for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) {
-        Index_Match match = matches[i];
-        BUCKETED_ENTRY_OUT pair = {};
-        BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL];
-        BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR];
-        uint64_t blake_result;
-        uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id);
-        if (table == 1) {
-            pair.meta[0] = L_Entry.meta[0];
-            pair.meta[1] = R_Entry.meta[0];
-            //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms
-            nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL);
-            //if (global_kbc_L_bucket_id == 1) {
-            //    printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result);
-            //}
-        } else if (table == 2) {
-            pair.meta[0] = L_Entry.meta[0];
-            pair.meta[1] = L_Entry.meta[1];
-            pair.meta[2] = R_Entry.meta[0];
-            pair.meta[3] = R_Entry.meta[1];
-            nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL);
-            //if (global_kbc_L_bucket_id == 1) {
-            //    uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1];
-            //    uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3];
-            //    printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result);
-            //}
-        } else if (table == 3) {
-            const uint32_t meta[8] = {
-                L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3],
-                R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3]
-            };
-            nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta);
-        } else if (table == 4) {
-            const uint32_t meta[8] = {
-                L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3],
-                R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3]
-            };
-            nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta);
-        } else if (table == 5) {
-            const uint32_t meta[6] = {
-                L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2],
-                R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2],
-            };
-            nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta);
-        } else if (table == 6) {
-            const uint32_t meta[4] = {
-                L_Entry.meta[0], L_Entry.meta[1],
-                R_Entry.meta[0], R_Entry.meta[1]
-            };
-            nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL);
-        }
-        if (table < 6) {
-            uint64_t batch_bucket = blake_result >> (38-6);
-            const uint64_t block_mod = (uint64_t) 1 << (38-6);
-            pair.y = (uint32_t) (blake_result % block_mod);
-            int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1);
-            uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES + block_slot;
-            if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-                printf("ERROR: results address overflow\n");
-            } else {
-                //bucketed_out[pair_address] = pair;
-            }
-        }
-    }
-}
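The LR kernels test Chia's matching condition directly, per the three-line comment above, instead of going through the rmap. Restated for adjacent kbc buckets (bucket_id = yl/kBC, parity = bucket_id % 2), a scalar host-side reference looks like this (kB = 119, kC = 127 as in chiapos, kExtraBitsPow = 64; the local constants shadow the project's):

static const int kB_ref = 119, kC_ref = 127;

bool is_match(uint32_t bucket_id_L, uint16_t yl_kbc, uint16_t yr_kbc) {
    int m = yr_kbc / kC_ref - yl_kbc / kC_ref;   // difference of b-ids
    if (m < 0) m += kB_ref;                      // mod kB
    if (m >= 64) return false;                   // only m in [0, kExtraBitsPow)
    int parity = bucket_id_L % 2;
    int rhs = ((2 * m + parity) * (2 * m + parity)) % kC_ref;
    int diff = yr_kbc % kC_ref - yl_kbc % kC_ref; // difference of c-ids
    if (diff < 0) diff += kC_ref;                 // mod kC
    return diff == rhs;
}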
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_attack_find_tx_LR_matches_global(uint16_t table, uint32_t start_kbc_L, uint32_t end_kbc_R,
-        const BUCKETED_ENTRY_IN *kbc_global_entries_L, const unsigned int *kbc_global_num_entries_L,
-        const BUCKETED_ENTRY_IN *kbc_global_entries_R, const unsigned int *kbc_global_num_entries_R,
-        BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts,
-        uint32_t KBC_MAX_ENTRIES, uint32_t BLOCK_MAX_ENTRIES) {
-
-    __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; // TODO: this could be smaller
-    __shared__ int total_matches;
-    //__shared__ int num_L;
-    //__shared__ int num_R;
-
-    int global_kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L;
-
-    const uint8_t doPrint = 0;
-
-    if (gridDim.x != kBC_NUM_BUCKETS) {
-        printf("ERROR: GRIDDIM %u MUST EQUAL KBC NUM BUCKETS %u\n", gridDim.x, kBC_NUM_BUCKETS);
-    }
-    int numThreadsInBlock = blockDim.x;
-    int threadId = threadIdx.x;
-    int threadStartScan = threadId;
-    int threadSkipScan = numThreadsInBlock;
-
-    const uint32_t start_L = global_kbc_L_bucket_id*KBC_MAX_ENTRIES;
-    const uint32_t start_R = (global_kbc_L_bucket_id+1)*KBC_MAX_ENTRIES;
-
-    //if (threadIdx.x == 0) {
-    uint32_t kbc_bitmask_bucket = global_kbc_L_bucket_id / 8;
-    uint32_t kbc_bitmask_shift = 4*(global_kbc_L_bucket_id % 8);
-    uint32_t bitvalue = kbc_global_num_entries_L[kbc_bitmask_bucket];
-    const unsigned int num_L = (bitvalue >> (kbc_bitmask_shift)) & 0b01111;
-    //}
-    //if (threadIdx.x == 1) {
-    kbc_bitmask_bucket = (global_kbc_L_bucket_id + 1) / 8;
-    kbc_bitmask_shift = 4*((global_kbc_L_bucket_id + 1) % 8);
-    bitvalue = kbc_global_num_entries_R[kbc_bitmask_bucket];
-    const unsigned int num_R = (bitvalue >> (kbc_bitmask_shift)) & 0b01111;
-    //}
-    //__syncthreads();
-    //const int num_L = kbc_global_num_entries_L[global_kbc_L_bucket_id];
-    //const int num_R = kbc_global_num_entries_R[(global_kbc_L_bucket_id+1)];
-    if ((num_L == 0) || (num_R == 0)) {
-        return;
-    }
-
-    const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_global_entries_L[start_L];
-    const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_global_entries_R[start_R];
-
-    if (threadIdx.x == 0) {
-        total_matches = 0;
-    }
-    __syncthreads(); // all written initialize data should sync
-
-    // For any 0 <= m < kExtraBitsPow:
-    //   yl / kBC + 1 = yR / kBC   AND
-    //   (yr % kBC) / kC - (yl % kBC) / kC = m (mod kB)   AND
-    //   (yr % kBC) % kC - (yl % kBC) % kC = (2m + (yl/kBC) % 2)^2 (mod kC)
-
-    for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-        //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-        BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-        int16_t yr_kbc = R_entry.y;
-        int16_t yr_bid = yr_kbc / kC; // values [0..kB]
-        for (uint16_t pos_L = 0; pos_L < num_L; pos_L++) {
-            // do L_entry and R_entry match?
-            BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-            int16_t yl_kbc = L_entry.y;
-            int16_t yl_bid = yl_kbc / kC; // values [0..kB]
-            int16_t formula_one = yr_bid - yl_bid; // this should actually give m
-            if (formula_one < 0) {
-                formula_one += kB;
-            }
-            int16_t m = formula_one;
-            if (m >= kB) {
-                m -= kB;
-            }
-            if (m < 64) {
-                // passed first test
-                int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC is perfectly divisible by kC
-                int16_t yr_cid = yr_kbc % kC;
-                int16_t parity = (global_kbc_L_bucket_id) % 2;
-                int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127]
-                int16_t formula_two = yr_cid - yl_cid;
-                if (formula_two < 0) {
-                    formula_two += kC;
-                }
-                if (formula_two == m2_parity_squared) {
-                    // we have a match.
-                    int num_matches = atomicAdd(&total_matches,1);
-                    if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-                        printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-                    } else {
-                        Index_Match match = { };
-                        match.idxL = pos_L;
-                        match.idxR = pos_R; //value >> 4;
-                        matches[num_matches] = match;
-                    }
-                }
-            }
-        }
-    }
-
-    __syncthreads();
-
-    if (threadIdx.x == 0) {
-        if (doPrint>1) {
-            // only do this once, should be in constant memory
-            //if (doPrint>2) {
-            //    printf("match list\n");
-            //    for (int i=0;i<total_matches;i++) printf("  L:%u R:%u\n", matches[i].idxL, matches[i].idxR);
-            //}
-        }
-        if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-            printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-            total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-        }
-    }
-
-    __syncthreads();
-
-    // now we go through all our matches and output to next round.
-    for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) {
-        Index_Match match = matches[i];
-        BUCKETED_ENTRY_OUT pair = {};
-        BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL];
-        BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR];
-        uint64_t blake_result;
-        uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id);
-        if (table == 1) {
-            pair.meta[0] = L_Entry.meta[0];
-            pair.meta[1] = R_Entry.meta[0];
-            //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms
-            nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL);
-            //if (global_kbc_L_bucket_id == 1) {
-            //    printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result);
-            //}
-        } else if (table == 2) {
-            pair.meta[0] = L_Entry.meta[0];
-            pair.meta[1] = L_Entry.meta[1];
-            pair.meta[2] = R_Entry.meta[0];
-            pair.meta[3] = R_Entry.meta[1];
-            //printf("Got t2 match x1: %u x2: %u x3: %u x4: %u\n", L_Entry.meta[0], L_Entry.meta[1], R_Entry.meta[0], R_Entry.meta[1]);
-
-            nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL);
-            //if (global_kbc_L_bucket_id == 1) {
-            //    uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1];
-            //    uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3];
-            //}
-        } else if (table == 3) {
-            const uint32_t meta[8] = {
-                L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3],
-                R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3]
-            };
-            nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta);
-        } else if (table == 4) {
-            const uint32_t meta[8] = {
-                L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3],
-                R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3]
-            };
-            nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta);
-        } else if (table == 5) {
-            const uint32_t meta[6] = {
-                L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2],
-                R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2],
-            };
-            nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta);
-        } else if (table == 6) {
-            const uint32_t meta[4] = {
-                L_Entry.meta[0], L_Entry.meta[1],
-                R_Entry.meta[0], R_Entry.meta[1]
-            };
-            nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL);
-        }
-        if (table < 6) {
-            uint64_t batch_bucket = blake_result >> (38-6);
-            const uint64_t block_mod = (uint64_t) 1 << (38-6);
-            pair.y = (uint32_t) (blake_result % block_mod);
-            int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1);
-            uint32_t pair_address = batch_bucket * BLOCK_MAX_ENTRIES + block_slot;
-            //if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-            //    printf("ERROR: results address overflow\n");
-            //} else {
-                bucketed_out[pair_address] = pair;
-            //}
-        }
-    }
-}
-
-template <typename BUCKETED_ENTRY>
-__global__
-void gpu_attack_merge_block_buckets_into_kbc_buckets(
-        const uint32_t KBC_START_ID, // determined by batch_id
-        const BUCKETED_ENTRY *in, uint64_t batch_bucket_add_Y, const uint32_t N,
-        BUCKETED_ENTRY *local_kbc_entries, int *local_kbc_counts)
-{
-    uint32_t i = blockIdx.x*blockDim.x+threadIdx.x;
-    if (i < N) {
-        // TODO: try just reading out entries and see if they match when going in
-
-        BUCKETED_ENTRY block_entry = in[i];
-        uint64_t calc_y = (uint64_t) block_entry.y + batch_bucket_add_Y;
-        uint32_t kbc_id = calc_y / kBC;
-        //uint32_t KBC_END_ID = KBC_START_ID + KBC_LOCAL_NUM_BUCKETS;
-        //if ((kbc_id < KBC_START_ID) || (kbc_id > KBC_END_ID)) {
-        //    printf(" i:%u entry.y:%u add_Y:%llu calc_y:%llu OUT OF RANGE: kbc id: %u KBC_LOCAL_NUM_BUCKETS:%u START:%u END:%u\n", i, block_entry.y, batch_bucket_add_Y, calc_y, kbc_id, KBC_LOCAL_NUM_BUCKETS, KBC_START_ID, KBC_END_ID);
-        //}
-
-        uint32_t local_kbc_id = kbc_id - KBC_START_ID;
-        int slot = atomicAdd(&local_kbc_counts[local_kbc_id],1);
-        uint32_t destination_address = local_kbc_id * KBC_MAX_ENTRIES_PER_BUCKET + slot;
-
-        //printf("block_id:%u [i: %u] entry.y:%u kbc_id:%u local_kbc:%u slot:%u dest:%u\n",
-        //        block_id, i, block_entry.y, kbc_id, local_kbc_id, slot, destination_address);
-
-        if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) {
-            printf("OVERFLOW: slot > MAX ENTRIES PER BUCKET\n");
-        }
-        if (destination_address > DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-            printf("OVERFLOW: destination_address overflow > DEVICE_BUFFER_ALLOCATED_ENTRIES %u\n", destination_address);
-        }
-        block_entry.y = calc_y % kBC; // hah! Don't forget to map it to kbc bucket form.
-        local_kbc_entries[destination_address] = block_entry;
-    }
-}
-
-__global__
-void gpu_list_local_kbc_entries(int *kbc_num_entries, int from, int to, int skip) {
-    for (int i=from; i<to; i+=skip) {
-        uint32_t kbc_bitmask_bucket = i / 8;
-        uint32_t kbc_bitmask_shift = 4*(i % 8);
-        uint32_t bitvalue = kbc_num_entries[kbc_bitmask_bucket];
-        int num = (bitvalue >> (kbc_bitmask_shift)) & 0b01111;
-
-        printf("kbc %u : %u\n", i, num);
-    }
-}
-
-//#include "attack_method_kbc_list.hpp"
-#include "attack_method_lxs.hpp"
-//#include "attack_method_2.hpp" // this is current working one
-//#include "attack_method_xpairbits.hpp"
-
-void attack_it() {
-    std::cout << "Attack it!" << std::endl;
-
-    //uint32_t bits = 10;
-    //attack_method_2(bits);
-
-    //attack_method_xpairbits();
-    attack_method_lxs(6000000);
-    return;
-
-    //auto sort_start = std::chrono::high_resolution_clock::now();
-    //thrust::device_ptr<uint32_t> device_xs_L_ptr(device_xs_L);
-    //thrust::device_ptr<uint32_t> device_ys_L_ptr(device_ys_L);
-    //thrust::sort_by_key(device_ys_L_ptr, device_ys_L_ptr + xs_count_L[0], device_xs_L_ptr);
-    //CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-    //auto sort_finish = std::chrono::high_resolution_clock::now();
-    //std::cout << "   sort time: " << std::chrono::duration_cast<std::chrono::milliseconds>(sort_finish - sort_start).count() << " ms\n";
-    // why is the 2nd sort 31ms and the first sort 8ms!?!?
-    //sort_start = std::chrono::high_resolution_clock::now();
-    //thrust::device_ptr<uint32_t> device_xs_R_ptr(device_xs_R);
-    //thrust::device_ptr<uint32_t> device_ys_R_ptr(device_ys_R);
-    //thrust::sort_by_key(device_ys_R_ptr, device_ys_R_ptr + xs_count_R[0], device_xs_R_ptr);
-    //CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-    //sort_finish = std::chrono::high_resolution_clock::now();
-    //std::cout << "   sort time: " << std::chrono::duration_cast<std::chrono::milliseconds>(sort_finish - sort_start).count() << " ms\n";
-
-    /*auto matchT1_start = std::chrono::high_resolution_clock::now();
-    CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts_L, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599
-    gpu_attack_find_t1_matches<<<(KBC_END_L - KBC_START_L), 256>>>(1, batch_id_L, KBC_START_L, KBC_END_L,
-            T0_local_kbc_entries_L, device_local_kbc_num_entries_L,
-            T1_L_batch_match_results, device_block_entry_counts_L);
-    CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-    auto matchT1_finish = std::chrono::high_resolution_clock::now();
-    std::cout << "   match T1 L time: " << std::chrono::duration_cast<std::chrono::milliseconds>(matchT1_finish - matchT1_start).count() << " ms\n";
-
-    matchT1_start = std::chrono::high_resolution_clock::now();
-    CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts_R, 0, (BATCHES)*sizeof(int)));
-    gpu_attack_find_t1_matches<<<(KBC_END_R - KBC_START_R), 256>>>(1, batch_id_R, KBC_START_R, KBC_END_R,
-            T0_local_kbc_entries_R, device_local_kbc_num_entries_R,
-            T1_R_batch_match_results, device_block_entry_counts_R);
-    CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-    matchT1_finish = std::chrono::high_resolution_clock::now();
-    std::cout << "   match T1 R time: " << std::chrono::duration_cast<std::chrono::milliseconds>(matchT1_finish - matchT1_start).count() << " ms\n";
-
-    auto t1_finish = std::chrono::high_resolution_clock::now();
-    std::cout << "   T1 total time: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1_finish - t1_start).count() << " ms\n";
-
-    auto mergekbcs_start = std::chrono::high_resolution_clock::now();
-    // clear our local kbc num entries as these will be written with new data
-
-    Tx_Bucketed_Meta2 *T1_local_kbc_entries_L = (Tx_Bucketed_Meta2 *) &device_local_kbc_entries_L[0]; // will replace...
-    Tx_Bucketed_Meta2 *T1_local_kbc_entries_R = (Tx_Bucketed_Meta2 *) &device_local_kbc_entries_R[0];
-    // clump block-0-batch_id_L block-0-batch_id_R into same group and solve.
-    auto matchTx_start = std::chrono::high_resolution_clock::now();
-    auto matchTx_finish = std::chrono::high_resolution_clock::now();
-    auto mergeTx_start = std::chrono::high_resolution_clock::now();
-    auto mergeTx_finish = std::chrono::high_resolution_clock::now();
-    uint64_t total_match_time_micros = 0;
-    uint64_t total_merge_time_micros = 0;
-    uint32_t global_block_counts[BATCHES] = {0};
-    for (uint32_t block_id = 0; block_id < BATCHES; block_id++) {
-        CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_L, 0, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-        CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_R, 0, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-        uint32_t KBC_MERGE_BUCKET_START = MIN_KBC_BUCKET_FOR_BATCH(block_id);
-        uint32_t num_entries_to_copy = device_block_entry_counts_L[block_id];
-        int blockSize = 256;
-        int numBlocks = (num_entries_to_copy + blockSize - 1) / (blockSize);
-        uint64_t batch_bucket_add_Y = CALC_BATCH_BUCKET_ADD_Y(block_id); //(((uint64_t) 1) << (38-6)) * ((uint64_t) batch_id);
-
-        uint32_t block_address = block_id * HOST_MAX_BLOCK_ENTRIES;
-        Tx_Bucketed_Meta2 *in = &T1_L_batch_match_results[block_address];
-
-        //std::cout << "batch " << batch_id << " num_entries: " << num_entries_to_copy << std::endl;
-        mergeTx_start = std::chrono::high_resolution_clock::now();
-        gpu_attack_merge_block_buckets_into_kbc_buckets<<<numBlocks, blockSize>>>(
-                KBC_MERGE_BUCKET_START,
-                in, batch_bucket_add_Y, num_entries_to_copy,
-                T1_local_kbc_entries_L, device_local_kbc_num_entries_L);
-
-        num_entries_to_copy = device_block_entry_counts_R[block_id];
-        numBlocks = (num_entries_to_copy + blockSize - 1) / (blockSize);
-        in = &T1_R_batch_match_results[block_address];
-
-        //std::cout << "batch " << batch_id << " num_entries: " << num_entries_to_copy << std::endl;
-        gpu_attack_merge_block_buckets_into_kbc_buckets<<<numBlocks, blockSize>>>(
-                KBC_MERGE_BUCKET_START,
-                in, batch_bucket_add_Y, num_entries_to_copy,
-                T1_local_kbc_entries_R, device_local_kbc_num_entries_R);
-
-        // TODO: find matches in entries_L against entries_R...should be <16, avg around 3-4
-        // only have 2m entries...so...could sort 1m L's against 1m R's?
-        //auto matchTx_start = std::chrono::high_resolution_clock::now();
-        CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-        mergeTx_finish = std::chrono::high_resolution_clock::now();
-        total_merge_time_micros += std::chrono::duration_cast< std::chrono::microseconds >( mergeTx_finish - mergeTx_start ).count();
-
-        CUDA_CHECK_RETURN(cudaMemset(device_T2_block_entry_counts, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599
-
-        // yes this can be ram optimized to constrain MAX_ENTRIES to a fraction (at least 1/16th the size)
-        // yikes...577ms...terrible...CPU WOULD BE FASTER!!!
- matchTx_start = std::chrono::high_resolution_clock::now(); - gpu_attack_find_tx_LR_matches<<<(KBC_END_L - KBC_START_L), 8>>>(1, batch_id_L, KBC_START_L, KBC_END_L, - T1_local_kbc_entries_L, device_local_kbc_num_entries_L, - T1_local_kbc_entries_R, device_local_kbc_num_entries_R, - T2_batch_match_results, device_T2_block_entry_counts); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - matchTx_finish = std::chrono::high_resolution_clock::now(); - total_match_time_micros += std::chrono::duration_cast< std::chrono::microseconds >( matchTx_finish - matchTx_start ).count(); - - //total_match_time_ms += std::chrono::duration_cast(matchTx_finish - matchTx_start).count(); - for (int i = 0; i < BATCHES; i++) { - global_block_counts[i] += device_T2_block_entry_counts[i]; - } - - } - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - std::cout << " match t2 LR sum time: " << (total_match_time_micros/1000) << "ms" << std::endl; - std::cout << " merge t2 LR sum time: " << (total_merge_time_micros/1000) << "ms" << std::endl; - auto mergekbcs_finish = std::chrono::high_resolution_clock::now(); - std::cout << " T2 total time: " << std::chrono::duration_cast(mergekbcs_finish - mergekbcs_start).count() << " ms\n"; - //gpu_list_local_kbc_entries<<<1,1>>>(device_local_kbc_num_entries_L); -*/ - - /*{ - auto matchT2_start = std::chrono::high_resolution_clock::now(); - Tx_Bucketed_Meta2 *t2bucketed_kbc_entries_in = (Tx_Bucketed_Meta2 *) device_buffer_A; - Tx_Bucketed_Meta4 *t2bucketed_out = (Tx_Bucketed_Meta4 *) device_buffer_B; - - CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599 - - gpu_attack_find_t1_matches<<<(KBC_END - KBC_START), 256>>>(2, batch_id, KBC_START, KBC_END, - t2bucketed_kbc_entries_in, device_local_kbc_num_entries, - t2bucketed_out, device_block_entry_counts); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto matchT2_finish = std::chrono::high_resolution_clock::now(); - - std::cout << " match T2 time: " << std::chrono::duration_cast(matchT2_finish - matchT2_start).count() << " ms\n"; - //gpu_list_local_kbc_entries<<<1,1>>>(device_local_kbc_num_entries); - } -*/ - -} - - -#endif /* ATTACK_HPP_ */ diff --git a/attack_method_1.hpp b/attack_method_1.hpp deleted file mode 100644 index da79016..0000000 --- a/attack_method_1.hpp +++ /dev/null @@ -1,493 +0,0 @@ -/* - * attack_method_1.hpp - * - * Created on: Nov 2, 2021 - * Author: nick - */ - -#ifndef ATTACK_METHOD_1_HPP_ -#define ATTACK_METHOD_1_HPP_ - - - - -#define ATTACK_KBCFILTER_LR1LR2(chacha_y,i) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - uint32_t kbc_bucket_id = uint32_t (y / kBC); \ - if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L; \ - int slot = atomicAdd(&kbc_local_num_entries_L[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R; \ - int slot = atomicAdd(&kbc_local_num_entries_R[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC 
OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_L2) && (kbc_bucket_id <= KBC_END_L2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L2; \ - int slot = atomicAdd(&kbc_local_num_entries_L2[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L2[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R2) && (kbc_bucket_id <= KBC_END_R2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R2; \ - int slot = atomicAdd(&kbc_local_num_entries_R2[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R2[entries_address] = entry; \ - } \ -} - -__global__ -void gpu_chacha8_k32_kbc_ranges_LR1LR2(const uint32_t N, - const __restrict__ uint32_t *input, - Tx_Bucketed_Meta1 *kbc_local_entries_L, int *kbc_local_num_entries_L, uint32_t KBC_START_L, uint32_t KBC_END_L, - Tx_Bucketed_Meta1 *kbc_local_entries_R, int *kbc_local_num_entries_R, uint32_t KBC_START_R, uint32_t KBC_END_R, - Tx_Bucketed_Meta1 *kbc_local_entries_L2, int *kbc_local_num_entries_L2, uint32_t KBC_START_L2, uint32_t KBC_END_L2, - Tx_Bucketed_Meta1 *kbc_local_entries_R2, int *kbc_local_num_entries_R2, uint32_t KBC_START_R2, uint32_t KBC_END_R2) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - - for (uint32_t x_group = index; x_group <= end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 
28 -> x >> 22
-		//int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-		ATTACK_KBCFILTER_LR1LR2(x0,0);ATTACK_KBCFILTER_LR1LR2(x1,1);ATTACK_KBCFILTER_LR1LR2(x2,2);ATTACK_KBCFILTER_LR1LR2(x3,3);
-		ATTACK_KBCFILTER_LR1LR2(x4,4);ATTACK_KBCFILTER_LR1LR2(x5,5);ATTACK_KBCFILTER_LR1LR2(x6,6);ATTACK_KBCFILTER_LR1LR2(x7,7);
-		ATTACK_KBCFILTER_LR1LR2(x8,8);ATTACK_KBCFILTER_LR1LR2(x9,9);ATTACK_KBCFILTER_LR1LR2(x10,10);ATTACK_KBCFILTER_LR1LR2(x11,11);
-		ATTACK_KBCFILTER_LR1LR2(x12,12);ATTACK_KBCFILTER_LR1LR2(x13,13);ATTACK_KBCFILTER_LR1LR2(x14,14);ATTACK_KBCFILTER_LR1LR2(x15,15);
-	}
-}
-
-template <typename BUCKETED_ENTRY>
-__global__
-void gpu_attack_merge_block_buckets_into_kbc_buckets_with_kbc_count_limit(
-	const uint32_t KBC_START_ID, // determined by batch_id
-	const BUCKETED_ENTRY *in, uint64_t batch_bucket_add_Y, const uint32_t N,
-	BUCKETED_ENTRY *local_kbc_entries, int *local_kbc_counts,
-	const uint32_t MAX_KBC_ENTRIES)
-{
-	uint32_t i = blockIdx.x*blockDim.x+threadIdx.x;
-	//for (int i = 0; i < N; i++) {
-
-	if (i < N) {
-		// TODO: try just reading out entries and see if they match when going in
-
-		BUCKETED_ENTRY block_entry = in[i];
-		uint64_t calc_y = (uint64_t) block_entry.y + batch_bucket_add_Y;
-		uint32_t kbc_id = calc_y / kBC;
-		uint32_t KBC_END_ID = KBC_START_ID + KBC_LOCAL_NUM_BUCKETS;
-		if ((kbc_id < KBC_START_ID) || (kbc_id > KBC_END_ID)) {
-			printf(" i:%u entry.y:%u add_Y:%llu calc_y:%llu OUT OF RANGE: kbc id: %u KBC_LOCAL_NUM_BUCKETS:%u START:%u END:%u\n", i, block_entry.y, batch_bucket_add_Y, calc_y, kbc_id, KBC_LOCAL_NUM_BUCKETS, KBC_START_ID, KBC_END_ID);
-		}
-
-		uint32_t local_kbc_id = kbc_id - KBC_START_ID;
-		int slot = atomicAdd(&local_kbc_counts[local_kbc_id],1);
-		uint32_t destination_address = local_kbc_id * MAX_KBC_ENTRIES + slot;
-
-		//printf("block_id:%u [i: %u] entry.y:%u kbc_id:%u local_kbc:%u slot:%u dest:%u\n",
-		//		block_id, i, block_entry.y, kbc_id, local_kbc_id, slot, destination_address);
-
-		if (slot >= MAX_KBC_ENTRIES) {
-			printf("OVERFLOW: slot >= MAX ENTRIES PER BUCKET\n");
-		}
-		//if (destination_address > DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-		//	printf("OVERFLOW: destination_address overflow > DEVICE_BUFFER_ALLOCATED_ENTRIES %u\n", destination_address);
-		//}
-		block_entry.y = calc_y % kBC; // hah! Don't forget to map it to kbc bucket form.
-		local_kbc_entries[destination_address] = block_entry;
-	}
-}
-
-__global__
-void gpu_display_t2_match_results(Tx_Bucketed_Meta4 *T2_batch_match_results, int *device_T2_block_entry_counts, uint32_t MAX_ENTRIES_PER_BLOCK) {
-	printf("GPU DISPLAY T2 MATCH RESULTS:\n");
-	int total_counts = 0;
-	for (int i=0;i<BATCHES;i++) {
-		total_counts += device_T2_block_entry_counts[i];
-	}
-	printf("TOTAL: %u\n", total_counts);
-}
-
-std::vector<uint32_t> solution_xs = {1320788535,3465356684,2131394289,606438761,434033488,2479909174,3785038649,1942582046,438483300,2306941967,2327418650,184663264,3396904066,3057226705,2120150435,441715922,10459628,1281656413,88943898,810187686,112052271,2540716951,3073359813,4019528057,504026248,1706169436,2772410422,1772771468,607317630,4168020964,4286528917,2472944651,3546546119,1799281226,1202952199,1278165962,4062613743,2747217422,1182029562,1339760739,613483600,3661736730,1251588944,3140803170,2503085418,2541929248,4159128725,2325034733,4257771109,2804935474,2997421030,150533389,709945445,4159463930,714122558,1939000200,3291628318,1878268201,2874051942,2826426895,2146970589,4276159281,3509962078,2808839331};
-/*
- * Pair 0 x:1320788535 y:76835538515 kBC:5084069
-   Pair 1 x:3465356684 y:76835558195 kBC:5084070
-
-   Pair 2 x:2131394289 y:227752410271 kBC:15069966
-   Pair 3 x:606438761 y:227752417481 kBC:15069967
-
-   Pair 4 x:434033488 y:274225910406 kBC:18145034
-   Pair 5 x:2479909174 y:274225916708 kBC:18145035
-
-   Pair 6 x:3785038649 y:213830149496 kBC:14148756
-   Pair 7 x:1942582046 y:213830170524 kBC:14148757
-
-   Pair 8 x:438483300 y:248522697030 kBC:16444299
-   Pair 9 x:2306941967 y:248522719906 kBC:16444300
-   Pair 10 x:2327418650 y:23832869730 kBC:1576978
-   Pair 11 x:184663264 y:23832892290 kBC:1576979
-   Pair 12 x:3396904066 y:31837336818 kBC:2106619
-   Pair 13 x:3057226705 y:31837353261 kBC:2106620
-   Pair 14 x:2120150435 y:22313127263 kBC:1476419
-   Pair 15 x:441715922 y:22313149126 kBC:1476420
- */
-void attack_method_1(uint32_t bits) {
-
-	using milli = std::chrono::milliseconds;
-	auto attack_start = std::chrono::high_resolution_clock::now();
-
-	uint64_t BITS_DIVISOR = 1 << bits;
-
-	uint64_t target_kbc_L1 = 5084069;
-	uint64_t target_kbc_R1 = 15069966;
-	uint64_t bucket_L1 = ((target_kbc_L1 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t bucket_R1 = ((target_kbc_R1 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t KBC_START_L1 = (bucket_L1*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_L1 = ((bucket_L1+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_START_R1 = (bucket_R1*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_R1 = ((bucket_R1+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-
-	uint64_t target_kbc_L2 = 18145034;
-	uint64_t target_kbc_R2 = 14148756;
-	uint64_t bucket_L2 = ((target_kbc_L2 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t bucket_R2 = ((target_kbc_R2 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t KBC_START_L2 = (bucket_L2*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_L2 = ((bucket_L2+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_START_R2 = (bucket_R2*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_R2 = ((bucket_R2+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-
-	//Pair 0 x:1320788535 y:76835538515 kBC:5084069
-	//	Pair 1 x:3465356684 y:76835558195 kBC:5084070
-	//	Pair 2 x:2131394289 y:227752410271 kBC:15069966
-	//	Pair 3 x:606438761 y:227752417481 kBC:15069967
-
-	uint64_t KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS; // +1 is for including last R bucket space
-
-	uint64_t MAX_KBCS_POST_T1 = 16; // reduce if smaller selection based on initial t0 range.
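// The range arithmetic above picks, for each target kbc bucket, the enclosing 1/2^bits
// slice of the full kbc space: target -> coarse bucket id in [0, 2^bits), coarse id ->
// inclusive [KBC_START, KBC_END) range that brackets the target. The same computation
// as a standalone helper (helper name hypothetical):
void kbc_slice_for_target(uint64_t target_kbc, uint64_t bits_divisor,
                          uint64_t &kbc_start, uint64_t &kbc_end) {
	uint64_t coarse = ((target_kbc + 1) * bits_divisor) / kBC_NUM_BUCKETS;
	kbc_start = (coarse * kBC_NUM_BUCKETS) / bits_divisor;
	kbc_end = ((coarse + 1) * kBC_NUM_BUCKETS) / bits_divisor;
	// e.g. bits = 6 gives 64 slices, so each chacha pass only keeps roughly
	// 1/64th of all kbc buckets per L/R range.
}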
- uint32_t BLOCK_MAX_ENTRIES_T2 = HOST_MAX_BLOCK_ENTRIES / 16; - //uint32_t NUM_EXPECTED_ENTRIES_T1_MATCHES = 67108864; - uint32_t NUM_EXPECTED_ENTRIES_T2_MATCHES = 1048576; - if (bits == 6) { - KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS; - //NUM_EXPECTED_ENTRIES_T1_MATCHES = 67108864; - MAX_KBCS_POST_T1 = 16; - NUM_EXPECTED_ENTRIES_T2_MATCHES = 1048576; - BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32; - } else if (bits == 7) { - KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 2; - //NUM_EXPECTED_ENTRIES_T1_MATCHES = 33554432; - MAX_KBCS_POST_T1 = 14; - NUM_EXPECTED_ENTRIES_T2_MATCHES = 262144; - BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32; - } else if (bits == 8) { - KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 4; - //NUM_EXPECTED_ENTRIES_T1_MATCHES = 16777216; - MAX_KBCS_POST_T1 = 12; - NUM_EXPECTED_ENTRIES_T2_MATCHES = 65536; - BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32; - } else if (bits == 9) { - KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 8; - //NUM_EXPECTED_ENTRIES_T1_MATCHES = 8388608; - MAX_KBCS_POST_T1 = 10; - NUM_EXPECTED_ENTRIES_T2_MATCHES = 16384; - BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32; - } else if (bits == 10) { - KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 16; - //NUM_EXPECTED_ENTRIES_T1_MATCHES = 4194304; - MAX_KBCS_POST_T1 = 8; - NUM_EXPECTED_ENTRIES_T2_MATCHES = 4096; - BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32; - } - - std::cout << "Attack Method 1 " << std::endl - << " L0 kbc range " << KBC_START_L1 << " to " << KBC_END_L1 << " = " << (KBC_END_L1-KBC_START_L1) << "kbcs " << (100.0*(double)(KBC_END_L1-KBC_START_L1)/(double)kBC_LAST_BUCKET_ID) << "%" << std::endl - << " R0 kbc range " << KBC_START_R1 << " to " << KBC_END_R1 << " = " << (KBC_END_R1-KBC_START_R1) << "kbcs " << (100.0*(double)(KBC_END_R1-KBC_START_R1)/(double)kBC_LAST_BUCKET_ID) << "%" << std::endl - << " KBC_ATTACK_NUM_BUCKETS: " << KBC_ATTACK_NUM_BUCKETS << std::endl - << " MAX BCS POST T1: " << MAX_KBCS_POST_T1 << std::endl - << " BLOCK_MAX_ENTRIES_T2: " << BLOCK_MAX_ENTRIES_T2 << std::endl; - - - char *device_buffer; - - int* device_local_kbc_num_entries_L1; - int* device_local_kbc_num_entries_R1; - int* device_local_kbc_num_entries_L2; - int* device_local_kbc_num_entries_R2; - Tx_Bucketed_Meta1 *T0_local_kbc_entries_L1; - Tx_Bucketed_Meta1 *T0_local_kbc_entries_R1; - Tx_Bucketed_Meta1 *T0_local_kbc_entries_L2; - Tx_Bucketed_Meta1 *T0_local_kbc_entries_R2; - - int* device_block_entry_counts_L; - int* device_block_entry_counts_R; - Tx_Bucketed_Meta2 *T1_L_batch_match_results; - Tx_Bucketed_Meta2 *T1_R_batch_match_results; - - int* device_T2_block_entry_counts; - Tx_Bucketed_Meta4 *T2_batch_match_results; - - - const uint64_t T0_KBC_DEVICE_BUFFER_ALLOCATED_ENTRIES = KBC_ATTACK_NUM_BUCKETS * KBC_MAX_ENTRIES_PER_BUCKET; - - const uint64_t CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED = T0_KBC_DEVICE_BUFFER_ALLOCATED_ENTRIES * sizeof(Tx_Bucketed_Meta2); - std::cout << " CHACHA BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED: " << CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED << std::endl; - std::cout << " * 4 = " << (CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED * 4) << std::endl; - const uint64_t T1_BATCH_MATCH_RESULTS_BYTES_NEEDED = DEVICE_BUFFER_ALLOCATED_ENTRIES * sizeof(Tx_Bucketed_Meta2); - std::cout << "KBC RESULTS T1 L NEEDED: " << T1_BATCH_MATCH_RESULTS_BYTES_NEEDED << std::endl; - const uint64_t T2_BATCH_MATCH_RESULTS_BYTES_NEEDED = (BLOCK_MAX_ENTRIES_T2 * BATCHES) * sizeof(Tx_Bucketed_Meta4); - std::cout << " 
T2_BATCH_MATCH_RESULTS_BYTES_NEEDED: " << T2_BATCH_MATCH_RESULTS_BYTES_NEEDED << std::endl; - - - const uint64_t TOTAL_BYTES_NEEDED = - 4 * CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED - + 2 * T1_BATCH_MATCH_RESULTS_BYTES_NEEDED - + T2_BATCH_MATCH_RESULTS_BYTES_NEEDED; - - std::cout << " device_buffer TOTAL BYTES: " << TOTAL_BYTES_NEEDED << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_buffer, TOTAL_BYTES_NEEDED)); - uint64_t MEM_POS = 0; - - T0_local_kbc_entries_L1 = (Tx_Bucketed_Meta1 *) &device_buffer[MEM_POS]; - T0_local_kbc_entries_R1 = (Tx_Bucketed_Meta1 *) &device_buffer[MEM_POS + CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED]; - T0_local_kbc_entries_L2 = (Tx_Bucketed_Meta1 *) &device_buffer[MEM_POS + CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED*2]; - T0_local_kbc_entries_R2 = (Tx_Bucketed_Meta1 *) &device_buffer[MEM_POS + CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED*3]; - MEM_POS += 4 * CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED; - - T1_L_batch_match_results = (Tx_Bucketed_Meta2 *) &device_buffer[MEM_POS]; - T1_R_batch_match_results = (Tx_Bucketed_Meta2 *) &device_buffer[MEM_POS + T1_BATCH_MATCH_RESULTS_BYTES_NEEDED]; - MEM_POS += 2 * T1_BATCH_MATCH_RESULTS_BYTES_NEEDED; - T2_batch_match_results = (Tx_Bucketed_Meta4 *) &device_buffer[MEM_POS]; - MEM_POS += T2_BATCH_MATCH_RESULTS_BYTES_NEEDED; - - std::cout << " device_block_entry_counts_L (" << BATCHES << "): " << BATCHES << " size:" << (sizeof(int)*BATCHES) << std::endl; - CUDA_CHECK_RETURN(cudaMallocManaged(&device_block_entry_counts_L, BATCHES*sizeof(int))); - std::cout << " device_block_entry_counts_R (" << BATCHES << "): " << BATCHES << " size:" << (sizeof(int)*BATCHES) << std::endl; - CUDA_CHECK_RETURN(cudaMallocManaged(&device_block_entry_counts_R, BATCHES*sizeof(int))); - std::cout << " device_T2_block_entry_counts (" << BATCHES << "): " << BATCHES << " size:" << (sizeof(int)*BATCHES) << std::endl; - CUDA_CHECK_RETURN(cudaMallocManaged(&device_T2_block_entry_counts, BATCHES*sizeof(int))); - - - auto alloc_finish = std::chrono::high_resolution_clock::now(); - std::cout << " alloc time: " << std::chrono::duration_cast(alloc_finish - attack_start).count() << " ms\n"; - - auto compute_only_start = std::chrono::high_resolution_clock::now(); - std::cout << "Doing chacha\n"; - - - int blockSize = 128; // # of threads per block, maximum is 1024. - const uint64_t calc_N = UINT_MAX; - const uint64_t calc_blockSize = blockSize; - const uint64_t calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - int numBlocks = calc_numBlocks; - - - - // don't forget to clear counter...will only use a portion of this memory so should be fast access. 
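// Worked arithmetic for the launch setup above: each thread derives one 16-x chacha
// group, so covering the full k=32 domain of 2^32 x values needs 2^32 / 16 threads;
// with blockSize = 128 that is (2^32 / 16) / 128 = 2,097,152 blocks:
static_assert(((1ull << 32) / 16) / 128 == 2097152, "grid blocks for 16 x's per thread at blockSize 128");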
-	std::cout << " device_local_kbc_num_entries_L1 " << KBC_LOCAL_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_LOCAL_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_L1, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-	std::cout << " device_local_kbc_num_entries_R1 " << KBC_LOCAL_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_LOCAL_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_R1, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-	std::cout << " device_local_kbc_num_entries_L2 " << KBC_LOCAL_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_LOCAL_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_L2, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-	std::cout << " device_local_kbc_num_entries_R2 " << KBC_LOCAL_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_LOCAL_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_R2, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-
-	std::cout << "Doing T1" << std::endl;
-
-	// we use only attack range for local num buckets
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_L1, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_R1, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_L2, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_R2, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-
-	auto t1_start = std::chrono::high_resolution_clock::now();
-	auto chacha_start = std::chrono::high_resolution_clock::now();
-	gpu_chacha8_k32_kbc_ranges_LR1LR2<<<numBlocks,blockSize>>>(calc_N, chacha_input,
-			T0_local_kbc_entries_L1, device_local_kbc_num_entries_L1, KBC_START_L1, KBC_END_L1,
-			T0_local_kbc_entries_R1, device_local_kbc_num_entries_R1, KBC_START_R1, KBC_END_R1,
-			T0_local_kbc_entries_L2, device_local_kbc_num_entries_L2, KBC_START_L2, KBC_END_L2,
-			T0_local_kbc_entries_R2, device_local_kbc_num_entries_R2, KBC_START_R2, KBC_END_R2);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto chacha_finish = std::chrono::high_resolution_clock::now();
-	std::cout << " chacha L1 time: " << std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count() << " ms\n";
-
-	auto matchT1_start = std::chrono::high_resolution_clock::now();
-	CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts_L, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599
-	gpu_attack_find_t1_matches<<<(KBC_END_L1 - KBC_START_L1), 256>>>(1, KBC_START_L1, KBC_END_L1,
-			T0_local_kbc_entries_L1, device_local_kbc_num_entries_L1,
-			T1_L_batch_match_results, device_block_entry_counts_L);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto matchT1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << " match T1 L time: " << std::chrono::duration_cast<milli>(matchT1_finish - matchT1_start).count() << " ms\n";
-
-	matchT1_start = std::chrono::high_resolution_clock::now();
-	CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts_R, 0, (BATCHES)*sizeof(int)));
-	gpu_attack_find_t1_matches<<<(KBC_END_R1 - KBC_START_R1), 256>>>(1, KBC_START_R1, KBC_END_R1,
-			T0_local_kbc_entries_R1, device_local_kbc_num_entries_R1,
-			T1_R_batch_match_results, device_block_entry_counts_R);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	matchT1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << " match T1 R time: " << std::chrono::duration_cast<milli>(matchT1_finish - matchT1_start).count() << " ms\n";
-
-	auto t1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << " T1 total time: " << std::chrono::duration_cast<milli>(t1_finish - t1_start).count() << " ms\n";
-
-	auto mergekbcs_start = std::chrono::high_resolution_clock::now();
-	// clear our local kbc num entries as these will be written with new data
-
-	// don't use T0 buckets anymore, so overwrite/reuse their memory space.
-	Tx_Bucketed_Meta2 *T1_local_kbc_entries_L = (Tx_Bucketed_Meta2 *) &T0_local_kbc_entries_L1[0];
-	Tx_Bucketed_Meta2 *T1_local_kbc_entries_R = (Tx_Bucketed_Meta2 *) &T0_local_kbc_entries_R1[0];
-
-	// clump block-0-batch_id_L block-0-batch_id_R into same group and solve.
-
-	auto matchTx_start = std::chrono::high_resolution_clock::now();
-	auto matchTx_finish = std::chrono::high_resolution_clock::now();
-	auto mergeTx_start = std::chrono::high_resolution_clock::now();
-	auto mergeTx_finish = std::chrono::high_resolution_clock::now();
-	uint64_t total_match_time_micros = 0;
-	uint64_t total_merge_time_micros = 0;
-	uint32_t global_block_counts[BATCHES] = {0};
-	for (uint32_t block_id = 0; block_id < BATCHES; block_id++) {
-		CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_L1, 0, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-		CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_R1, 0, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-		uint32_t KBC_MERGE_BUCKET_START = MIN_KBC_BUCKET_FOR_BATCH(block_id);
-		const uint32_t KBC_START = MIN_KBC_BUCKET_FOR_BATCH(block_id);
-		const uint32_t KBC_END = MIN_KBC_BUCKET_FOR_BATCH(block_id+1);
-
-		uint32_t num_entries_to_copy = device_block_entry_counts_L[block_id];
-		int blockSize = 256;
-		int numBlocks = (num_entries_to_copy + blockSize - 1) / (blockSize);
-		uint64_t batch_bucket_add_Y = CALC_BATCH_BUCKET_ADD_Y(block_id);//(((uint64_t) 1) << (38-6)) * ((uint64_t) batch_id);
-
-		uint32_t block_address = block_id * HOST_MAX_BLOCK_ENTRIES;
-		Tx_Bucketed_Meta2 *in = &T1_L_batch_match_results[block_address];
-
-		//std::cout << "batch " << batch_id << " num_entries: " << num_entries_to_copy << std::endl;
-		mergeTx_start = std::chrono::high_resolution_clock::now();
-		gpu_attack_merge_block_buckets_into_kbc_buckets_with_kbc_count_limit<<<numBlocks,blockSize>>>(
-				KBC_MERGE_BUCKET_START,
-				in, batch_bucket_add_Y, num_entries_to_copy,
-				T1_local_kbc_entries_L, device_local_kbc_num_entries_L1,
-				MAX_KBCS_POST_T1);
-
-		num_entries_to_copy = device_block_entry_counts_R[block_id];
-		numBlocks = (num_entries_to_copy + blockSize - 1) / (blockSize);
-		in = &T1_R_batch_match_results[block_address];
-
-		//std::cout << "batch " << batch_id << " num_entries: " << num_entries_to_copy << std::endl;
-		gpu_attack_merge_block_buckets_into_kbc_buckets_with_kbc_count_limit<<<numBlocks,blockSize>>>(
-				KBC_MERGE_BUCKET_START,
-				in, batch_bucket_add_Y, num_entries_to_copy,
-				T1_local_kbc_entries_R, device_local_kbc_num_entries_R1,
-				MAX_KBCS_POST_T1);
-
-		// TODO: find matches in entries_L against entries_R...should be <16, avg around 3-4
-		// only have 2m entries...so...could sort 1mL's against 1mR's?
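// Why batch_bucket_add_Y: T1 match results store y relative to their batch, so
// re-bucketing must first add back block_id * 2^(38-6) before dividing by kBC (the
// inline comment above suggests a 38-bit y space split across 2^6 = 64 batches; that
// split factor is an assumption here). The reconstruction in isolation:
__host__ __device__ uint64_t restore_full_y(uint32_t stored_y, uint32_t block_id) {
	uint64_t batch_bucket_add_Y = (((uint64_t) 1) << (38 - 6)) * (uint64_t) block_id;
	uint64_t calc_y = (uint64_t) stored_y + batch_bucket_add_Y;
	return calc_y; // then kbc_id = calc_y / kBC, and the stored slot y is calc_y % kBC
}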
-		//auto matchTx_start = std::chrono::high_resolution_clock::now();
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		mergeTx_finish = std::chrono::high_resolution_clock::now();
-		total_merge_time_micros += std::chrono::duration_cast< std::chrono::microseconds >( mergeTx_finish - mergeTx_start ).count();
-
-		/*CUDA_CHECK_RETURN(cudaMemset(device_T2_block_entry_counts, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599
-
-		matchTx_start = std::chrono::high_resolution_clock::now();
-		gpu_attack_find_tx_LR_matches<<<(KBC_END - KBC_START), 8>>>(1, KBC_START, KBC_END,
-				T1_local_kbc_entries_L, device_local_kbc_num_entries_L1,
-				T1_local_kbc_entries_R, device_local_kbc_num_entries_R1,
-				T2_batch_match_results, device_T2_block_entry_counts);
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		matchTx_finish = std::chrono::high_resolution_clock::now();
-		total_match_time_micros += std::chrono::duration_cast< std::chrono::microseconds >( matchTx_finish - matchTx_start ).count();
-		*/
-		//total_match_time_ms += std::chrono::duration_cast<milli>(matchTx_finish - matchTx_start).count();
-		//for (int i = 0; i < BATCHES; i++) {
-		//	global_block_counts[i] += device_T2_block_entry_counts[i];
-		//}
-
-	}
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	std::cout << " match t2 LR sum time: " << (total_match_time_micros/1000) << "ms" << std::endl;
-	std::cout << " merge t2 LR sum time: " << (total_merge_time_micros/1000) << "ms" << std::endl;
-	auto mergekbcs_finish = std::chrono::high_resolution_clock::now();
-	std::cout << " T2 total time: " << std::chrono::duration_cast<milli>(mergekbcs_finish - mergekbcs_start).count() << " ms\n";
-	//gpu_list_local_kbc_entries<<<1,1>>>(device_local_kbc_num_entries_L);
-
-	auto compute_only_finish = std::chrono::high_resolution_clock::now();
-
-	uint32_t total_counts = 0;
-	for (int i=0;i<BATCHES;i++) {
-		total_counts += global_block_counts[i];
-	}
-	std::cout << "total block entry counts: " << total_counts << std::endl;
-
-	auto attack_finish = std::chrono::high_resolution_clock::now();
-	std::cout << " compute only time: " << std::chrono::duration_cast<milli>(compute_only_finish - compute_only_start).count() << " ms\n";
-	std::cout << " attack total time: " << std::chrono::duration_cast<milli>(attack_finish - attack_start).count() << " ms\n";
-	std::cout << "end."
<< std::endl; -} - - -#endif /* ATTACK_METHOD_1_HPP_ */ diff --git a/attack_method_2.hpp b/attack_method_2.hpp deleted file mode 100644 index 40eb07f..0000000 --- a/attack_method_2.hpp +++ /dev/null @@ -1,1460 +0,0 @@ -/* - * attack_method_2.hpp - * - * Created on: Nov 4, 2021 - * Author: nick - */ - -#ifndef ATTACK_METHOD_2_HPP_ -#define ATTACK_METHOD_2_HPP_ - -#define ATTACK_KBCFILTER_LR1LR2slower(chacha_y,i) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - uint32_t kbc_bucket_id = uint32_t (y / kBC); \ - uint32_t local_kbc_bucket_id = 30000000; \ - int slot = -1; \ - int *num_list; \ - Tx_Bucketed_Meta1 *entries_list; \ - if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \ - local_kbc_bucket_id = kbc_bucket_id - KBC_START_L; \ - num_list = kbc_local_num_entries_L; \ - entries_list = kbc_local_entries_L; \ - } \ - if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \ - local_kbc_bucket_id = kbc_bucket_id - KBC_START_R; \ - num_list = kbc_local_num_entries_R; \ - entries_list = kbc_local_entries_R; \ - } \ - if ((kbc_bucket_id >= KBC_START_L2) && (kbc_bucket_id <= KBC_END_L2)) { \ - local_kbc_bucket_id = kbc_bucket_id - KBC_START_L2; \ - num_list = kbc_local_num_entries_L2; \ - entries_list = kbc_local_entries_L2; \ - } \ - if ((kbc_bucket_id >= KBC_START_R2) && (kbc_bucket_id <= KBC_END_R2)) { \ - local_kbc_bucket_id = kbc_bucket_id - KBC_START_R2; \ - num_list = kbc_local_num_entries_R2; \ - entries_list = kbc_local_entries_R2; \ - } \ - if (local_kbc_bucket_id < 30000000) { \ - slot = atomicAdd(&num_list[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - entries_list[entries_address] = entry; \ - } \ -} - -/* - * uint32_t kbc_bitmask_bucket = local_kbc_bucket_id / 3; \ - int kbc_bitmask_add = 1 << (kbc_bitmask_bucket*9); \ - int bitadd = atomicAdd(&kbc_local_num_entries_L[kbc_bitmask_bucket],kbc_bitmask_add); \ - uint32_t slot = bitadd; \ - slot = (slot >> (kbc_bitmask_bucket*9)) & 0b0111111111; \ - - TOTAL: 262341 - */ -//with bitmask kbcs -#define ATTACK_KBCFILTER_LR1LR2bitmask(chacha_y,i) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - uint32_t kbc_bucket_id = uint32_t (y / kBC); \ - if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L; \ - uint32_t kbc_bitmask_bucket = local_kbc_bucket_id / 3; \ - uint32_t kbc_bitmask_shift = 9*(local_kbc_bucket_id % 3); \ - int kbc_bitmask_add = 1 << (kbc_bitmask_shift); \ - int bitadd = atomicAdd(&kbc_local_num_entries_L[kbc_bitmask_bucket],kbc_bitmask_add); \ - uint32_t slot = bitadd; \ - slot = (slot >> (kbc_bitmask_shift)) & 0b0111111111; \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R; \ - uint32_t kbc_bitmask_bucket = local_kbc_bucket_id / 3; \ - uint32_t kbc_bitmask_shift = 9*(local_kbc_bucket_id % 3); \ - int 
kbc_bitmask_add = 1 << (kbc_bitmask_shift); \ - int bitadd = atomicAdd(&kbc_local_num_entries_R[kbc_bitmask_bucket],kbc_bitmask_add); \ - uint32_t slot = bitadd; \ - slot = (slot >> (kbc_bitmask_shift)) & 0b0111111111; \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_L2) && (kbc_bucket_id <= KBC_END_L2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L2; \ - uint32_t kbc_bitmask_bucket = local_kbc_bucket_id / 3; \ - uint32_t kbc_bitmask_shift = 9*(local_kbc_bucket_id % 3); \ - int kbc_bitmask_add = 1 << (kbc_bitmask_shift); \ - int bitadd = atomicAdd(&kbc_local_num_entries_L2[kbc_bitmask_bucket],kbc_bitmask_add); \ - uint32_t slot = bitadd; \ - slot = (slot >> (kbc_bitmask_shift)) & 0b0111111111; \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L2[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R2) && (kbc_bucket_id <= KBC_END_R2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R2; \ - uint32_t kbc_bitmask_bucket = local_kbc_bucket_id / 3; \ - uint32_t kbc_bitmask_shift = 9*(local_kbc_bucket_id % 3); \ - int kbc_bitmask_add = 1 << (kbc_bitmask_shift); \ - int bitadd = atomicAdd(&kbc_local_num_entries_R2[kbc_bitmask_bucket],kbc_bitmask_add); \ - uint32_t slot = bitadd; \ - slot = (slot >> (kbc_bitmask_shift)) & 0b0111111111; \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R2[entries_address] = entry; \ - } \ -} - -#define ATTACK_KBCFILTER_LR1LR2(chacha_y,i) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - uint32_t kbc_bucket_id = uint32_t (y / kBC); \ - if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L; \ - int slot = atomicAdd(&kbc_local_num_entries_L[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R; \ - int slot = atomicAdd(&kbc_local_num_entries_R[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_L2) && (kbc_bucket_id <= KBC_END_L2)) { \ - 
uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L2; \ - int slot = atomicAdd(&kbc_local_num_entries_L2[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L2[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R2) && (kbc_bucket_id <= KBC_END_R2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R2; \ - int slot = atomicAdd(&kbc_local_num_entries_R2[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R2[entries_address] = entry; \ - } \ -} - -#define ATTACK_KBCFILTER_LR1LR2_CHACHA(chacha_y,x) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - uint32_t kbc_bucket_id = uint32_t (y / kBC); \ - if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L; \ - int slot = atomicAdd(&kbc_local_num_entries_L[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { x, (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R; \ - int slot = atomicAdd(&kbc_local_num_entries_R[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { x, (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_L2) && (kbc_bucket_id <= KBC_END_L2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L2; \ - int slot = atomicAdd(&kbc_local_num_entries_L2[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { x, (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L2[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R2) && (kbc_bucket_id <= KBC_END_R2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R2; \ - int slot = atomicAdd(&kbc_local_num_entries_R2[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { x, (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R2[entries_address] = entry; \ - } \ -} - -#define ATTACK_KBCSTREAM_LR1LR2(chacha_y,i) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + ((base_x + i) >> 26); \ - uint32_t kbc_bucket_id = uint32_t 
(y / kBC); \ - if (((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) \ - || ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) \ - || ((kbc_bucket_id >= KBC_START_L2) && (kbc_bucket_id <= KBC_END_L2)) \ - || ((kbc_bucket_id >= KBC_START_R2) && (kbc_bucket_id <= KBC_END_R2))) { \ - xchacha_pair pair = { base_x + i, chacha_y }; \ - int slot = atomicAdd(&local_filter_count,1); \ - if (slot > MAX_SHARED_CHACHAS) printf("MAX_SHARED_CHACHAS %u OVERFLOW %u\n", MAX_SHARED_CHACHAS, slot); \ - shared_chachas[slot] = pair; \ - } \ -} -struct xchacha_pair { - uint32_t x; - uint32_t chacha; -}; - -__global__ -void gpu_chacha8_k32_kbc_ranges_LR1LR2(const uint32_t N, - const __restrict__ uint32_t *input, - Tx_Bucketed_Meta1 *kbc_local_entries_L, int *kbc_local_num_entries_L, uint32_t KBC_START_L, uint32_t KBC_END_L, - Tx_Bucketed_Meta1 *kbc_local_entries_R, int *kbc_local_num_entries_R, uint32_t KBC_START_R, uint32_t KBC_END_R, - Tx_Bucketed_Meta1 *kbc_local_entries_L2, int *kbc_local_num_entries_L2, uint32_t KBC_START_L2, uint32_t KBC_END_L2, - Tx_Bucketed_Meta1 *kbc_local_entries_R2, int *kbc_local_num_entries_R2, uint32_t KBC_START_R2, uint32_t KBC_END_R2) -{ - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - //__shared__ uint32_t datax[33*256]; // each thread (256 max) gets its own shared access starting at 32 byte boundary. - //uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - const uint32_t MAX_SHARED_CHACHAS = 128*8; // try to bring down as much as can - __shared__ xchacha_pair shared_chachas[MAX_SHARED_CHACHAS]; // *possibly* using 32 to prevent some bank conflicts can help, but don't thing so. - __shared__ uint local_filter_count; - - //if (blockDim.x > 128) printf("MUST HAVE BLOCKSIZE 128 (RECOMMENDED) OR LESS, OR INCREASED SHARED MEM TO MORE\n"); - - uint32_t base_group = blockIdx.x * blockDim.x; - uint32_t base_x = base_group * 32; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 32; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - if (threadIdx.x == 0) { - local_filter_count = 0; - } - __syncthreads(); - - const int j = 0; - if (x_group < end_n) { - uint32_t pos = x_group * 2;// + X_START/16; - //printf("x group pos = %u\n", pos); - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += 
input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_KBCSTREAM_LR1LR2(datax[j+0],0);ATTACK_KBCSTREAM_LR1LR2(datax[j+1],1);ATTACK_KBCSTREAM_LR1LR2(datax[j+2],2);ATTACK_KBCSTREAM_LR1LR2(datax[j+3],3); - ATTACK_KBCSTREAM_LR1LR2(datax[j+4],4);ATTACK_KBCSTREAM_LR1LR2(datax[j+5],5);ATTACK_KBCSTREAM_LR1LR2(datax[j+6],6);ATTACK_KBCSTREAM_LR1LR2(datax[j+7],7); - ATTACK_KBCSTREAM_LR1LR2(datax[j+8],8);ATTACK_KBCSTREAM_LR1LR2(datax[j+9],9);ATTACK_KBCSTREAM_LR1LR2(datax[j+10],10);ATTACK_KBCSTREAM_LR1LR2(datax[j+11],11); - ATTACK_KBCSTREAM_LR1LR2(datax[j+12],12);ATTACK_KBCSTREAM_LR1LR2(datax[j+13],13);ATTACK_KBCSTREAM_LR1LR2(datax[j+14],14);ATTACK_KBCSTREAM_LR1LR2(datax[j+15],15); - - pos += 1; - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 
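// The ATTACK_KBCSTREAM_LR1LR2 macro used in this kernel stages filtered hits in shared
// memory (one counter plus one xchacha_pair buffer per block) instead of writing each
// hit straight to its global kbc bucket. The staging pattern in isolation, with a
// stand-in predicate and a hypothetical flush destination:
__global__ void shared_filter_sketch(const uint32_t *ys, uint32_t n,
                                     xchacha_pair *out, unsigned int *out_count) {
	__shared__ xchacha_pair staged[1024];
	__shared__ unsigned int staged_count;
	__shared__ unsigned int global_base;
	if (threadIdx.x == 0) staged_count = 0;
	__syncthreads();
	uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n && (ys[i] & 1)) {                 // stand-in for the kbc-range filter
		unsigned int slot = atomicAdd(&staged_count, 1);
		if (slot < 1024) staged[slot] = { i, ys[i] };
	}
	__syncthreads();
	if (threadIdx.x == 0) {                     // one global reservation per block...
		if (staged_count > 1024) staged_count = 1024;
		global_base = atomicAdd(out_count, staged_count);
	}
	__syncthreads();
	for (unsigned int j = threadIdx.x; j < staged_count; j += blockDim.x)
		out[global_base + j] = staged[j];       // ...then a cooperative flush
}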
- ATTACK_KBCSTREAM_LR1LR2(datax[j+0],16+0);ATTACK_KBCSTREAM_LR1LR2(datax[j+1],16+1);ATTACK_KBCSTREAM_LR1LR2(datax[j+2],16+2);ATTACK_KBCSTREAM_LR1LR2(datax[j+3],16+3); - ATTACK_KBCSTREAM_LR1LR2(datax[j+4],16+4);ATTACK_KBCSTREAM_LR1LR2(datax[j+5],16+5);ATTACK_KBCSTREAM_LR1LR2(datax[j+6],16+6);ATTACK_KBCSTREAM_LR1LR2(datax[j+7],16+7); - ATTACK_KBCSTREAM_LR1LR2(datax[j+8],16+8);ATTACK_KBCSTREAM_LR1LR2(datax[j+9],16+9);ATTACK_KBCSTREAM_LR1LR2(datax[j+10],16+10);ATTACK_KBCSTREAM_LR1LR2(datax[j+11],16+11); - ATTACK_KBCSTREAM_LR1LR2(datax[j+12],16+12);ATTACK_KBCSTREAM_LR1LR2(datax[j+13],16+13);ATTACK_KBCSTREAM_LR1LR2(datax[j+14],16+14);ATTACK_KBCSTREAM_LR1LR2(datax[j+15],16+15); - } - // at this point we have 128*32 = 4096 entries - // now we have to sort them into the buckets - // we already have the shared counts set from the ATTACK macro - // now just scan our filtered entries and bucket them - __syncthreads(); - for (int i=threadIdx.x;i> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - ATTACK_KBCFILTER_LR1LR2(x0,0);ATTACK_KBCFILTER_LR1LR2(x1,1);ATTACK_KBCFILTER_LR1LR2(x2,2);ATTACK_KBCFILTER_LR1LR2(x3,3); - ATTACK_KBCFILTER_LR1LR2(x4,4);ATTACK_KBCFILTER_LR1LR2(x5,5);ATTACK_KBCFILTER_LR1LR2(x6,6);ATTACK_KBCFILTER_LR1LR2(x7,7); - ATTACK_KBCFILTER_LR1LR2(x8,8);ATTACK_KBCFILTER_LR1LR2(x9,9);ATTACK_KBCFILTER_LR1LR2(x10,10);ATTACK_KBCFILTER_LR1LR2(x11,11); - ATTACK_KBCFILTER_LR1LR2(x12,12);ATTACK_KBCFILTER_LR1LR2(x13,13);ATTACK_KBCFILTER_LR1LR2(x14,14);ATTACK_KBCFILTER_LR1LR2(x15,15); - } -} - -__global__ -void gpu_display_t2_match_results(Tx_Bucketed_Meta4 *T2_batch_match_results, int *device_T2_block_entry_counts, uint32_t MAX_ENTRIES_PER_BLOCK) { - printf("GPU DISPLAY T2 MATCH RESULTS:\n"); - int total_counts = 0; - for (int i=0;i solution_xs = {1320788535,3465356684,2131394289,606438761,434033488,2479909174,3785038649,1942582046,438483300,2306941967,2327418650,184663264,3396904066,3057226705,2120150435,441715922,10459628,1281656413,88943898,810187686,112052271,2540716951,3073359813,4019528057,504026248,1706169436,2772410422,1772771468,607317630,4168020964,4286528917,2472944651,3546546119,1799281226,1202952199,1278165962,4062613743,2747217422,1182029562,1339760739,613483600,3661736730,1251588944,3140803170,2503085418,2541929248,4159128725,2325034733,4257771109,2804935474,2997421030,150533389,709945445,4159463930,714122558,1939000200,3291628318,1878268201,2874051942,2826426895,2146970589,4276159281,3509962078,2808839331}; -/* - * Pair 0 x:1320788535 
y:76835538515 kBC:5084069 - Pair 1 x:3465356684 y:76835558195 kBC:5084070 - - Pair 2 x:2131394289 y:227752410271 kBC:15069966 - Pair 3 x:606438761 y:227752417481 kBC:15069967 - - Pair 4 x:434033488 y:274225910406 kBC:18145034 - Pair 5 x:2479909174 y:274225916708 kBC:18145035 - - Pair 6 x:3785038649 y:213830149496 kBC:14148756 - Pair 7 x:1942582046 y:213830170524 kBC:14148757 - - Pair 8 x:438483300 y:248522697030 kBC:16444299 - Pair 9 x:2306941967 y:248522719906 kBC:16444300 - Pair 10 x:2327418650 y:23832869730 kBC:1576978 - Pair 11 x:184663264 y:23832892290 kBC:1576979 - Pair 12 x:3396904066 y:31837336818 kBC:2106619 - Pair 13 x:3057226705 y:31837353261 kBC:2106620 - Pair 14 x:2120150435 y:22313127263 kBC:1476419 - Pair 15 x:441715922 y:22313149126 kBC:1476420 - */ - - - -__global__ -void gpu_attack_get_kbcs_with_pairs_from_global_kbcs( - const unsigned int *kbc_global_num_entries_L, - const unsigned int *kbc_global_num_entries_R, - unsigned int *kbc_pairs_list_L_bucket_ids, int *pairs_count) { - - uint32_t global_kbc_L_bucket_id = blockIdx.x*blockDim.x+threadIdx.x; - - if (global_kbc_L_bucket_id < (kBC_NUM_BUCKETS-1)) { - - uint32_t kbc_bitmask_bucket = global_kbc_L_bucket_id / 8; - uint32_t kbc_bitmask_shift = 4*(global_kbc_L_bucket_id % 8); - uint32_t bitvalue = kbc_global_num_entries_L[kbc_bitmask_bucket]; - const unsigned int num_L = (bitvalue >> (kbc_bitmask_shift)) & 0b01111; - - kbc_bitmask_bucket = (global_kbc_L_bucket_id + 1) / 8; - kbc_bitmask_shift = 4*((global_kbc_L_bucket_id + 1) % 8); - bitvalue = kbc_global_num_entries_R[kbc_bitmask_bucket]; - const unsigned int num_R = (bitvalue >> (kbc_bitmask_shift)) & 0b01111; - - if ((num_L > 0) && (num_R > 0)) { - - int slot = atomicAdd(&pairs_count[0], 1); - //printf("found kbc %u with two blocks > 0 slot %u \n", global_kbc_L_bucket_id,slot); - kbc_pairs_list_L_bucket_ids[slot] = global_kbc_L_bucket_id; - } - } -} - -struct Match_Attack_Pair_Index { - uint32_t bucket_L_id; // could compress this to fit in 32 bit - uint16_t idx_L; - uint16_t idx_R; -}; - -template -__global__ -void gpu_attack_process_t1_pairs(uint16_t table, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - Match_Attack_Pair_Index *match_list, int *match_counts) { - // testmatch count: 33532242 - // testmatch T1 L time: 9 ms - const uint16_t NUM_RMAPS = (kBC/2)+1; - __shared__ unsigned int nick_rmap[NUM_RMAPS]; // positions and counts. Use 30 bits, 15 bits each entry with lower 9 bits for pos, 1024+ for count - __shared__ uint32_t nick_rmap_extras_rl[32]; - __shared__ uint16_t nick_rmap_extras_ry[32]; - __shared__ uint16_t nick_rmap_extras_pos[32]; - __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ int total_matches; - __shared__ int global_match_slot; - __shared__ int num_extras; - __shared__ int y_duplicate_counts; - - int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... 
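// gpu_attack_get_kbcs_with_pairs_from_global_kbcs above reads per-kbc entry counts
// packed 8 to a 32-bit word, 4 bits each. The pack/unpack arithmetic on its own
// (helper names hypothetical):
__device__ unsigned int read_packed_count4(const unsigned int *packed, uint32_t bucket_id) {
	uint32_t word = bucket_id / 8;             // 8 four-bit counters per word
	uint32_t shift = 4 * (bucket_id % 8);
	return (packed[word] >> shift) & 0b01111;  // counts limited to 0..15
}
__device__ void increment_packed_count4(unsigned int *packed, uint32_t bucket_id) {
	atomicAdd(&packed[bucket_id / 8], 1u << (4 * (bucket_id % 8))); // no overflow guard here
}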
// + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - const uint8_t doPrint = 0; - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - int numThreadsInBlock = blockDim.x; - int threadId = threadIdx.x; - int threadStartScan = threadId; - int threadSkipScan = numThreadsInBlock; - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - - if (threadIdx.x == 0) { - total_matches = 0; - num_extras = 0; - y_duplicate_counts = 0; - if (doPrint > 1) { - printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R); - if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) { - printf("ERROR numL or numR > max entries\n"); - return; - } - if ((num_L == 0) || (num_R == 0) ) { - printf("ERROR: numL and numR are 0\n"); - return; - } - } - } - // unfortunately to clear we have to do this - for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) { - nick_rmap[i] = 0; - } - __syncthreads(); // all written initialize data should sync - - // bucket sort the r positions! - for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - BUCKETED_ENTRY_IN R_entry = kbc_local_entries[start_R+pos_R]; - uint16_t r_y = R_entry.y; - - // r_y's share a block across two adjacent values, so kbc_map just works out which part it's in. - unsigned int kbc_map = r_y / 2; - const unsigned int kbc_box_shift = (r_y % 2) * 15; - int add = 1024 << kbc_box_shift; // we add from 10th bit up (shifted by the box it's in) - - unsigned int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above) - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - if (rmap_value == 0) { - // if we added to an empty spot, what we do is add the pos_R here in the lower 9 bits of the box - // and ONLY for this one. 
- atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift)); - //if (printandquit) { - // printf("r_y: %u pos:%u\n", r_y, pos_R); - //} - } else { - // we hit duplicate entry...add this to a row - int slot = atomicAdd(&num_extras, 1); - nick_rmap_extras_ry[slot] = r_y; - nick_rmap_extras_pos[slot] = pos_R; - } - - - } - //for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - // kbc_R_entries[pos_R] = kbc_local_entries[start_R+pos_R]; - //} - //for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - // kbc_L_entries[pos_L] = kbc_local_entries[start_L+pos_L]; - //} - - - - - uint16_t parity = global_kbc_L_bucket_id % 2; - - __syncthreads(); // wait for all threads to write r_bid entries - - //testmatch count: 33271871 - // testmatch T1 L time: 9 ms - - for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L]; - BUCKETED_ENTRY_IN L_entry = kbc_local_entries[start_L+pos_L]; - uint16_t l_y = L_entry.y; - - //bool doPrint = (L_entry.meta[0] == 601683299); - - //uint16_t base_indJ = l_y / kC; - //uint16_t indJ_plus_m_mod_kB = base_indJ % kB; - //uint16_t indJ_plus_m_mod_kB_times_kC = indJ_plus_m_mod_kB * kC; - //uint16_t m_2_plus_parity_squared_iter = (parity + l_y) % kC; - //uint16_t m_2_plus_parity_start_add = parity == 0 ? 4 : 8; // this increments by 8 each time - //if (doPrint) { - // printf("Starting values:\n"); - // printf(" l_y: %u\n",l_y); - // printf(" parity: %u\n",parity); - /// printf(" indJ_plus_m_mod_kB: %u\n",indJ_plus_m_mod_kB); - // printf(" indJ_plus_m_mod_kB_times_kC: %u\n",indJ_plus_m_mod_kB_times_kC); - // printf(" m_2_plus_parity_squared_iter: %u\n",m_2_plus_parity_squared_iter); - // printf(" m_2_plus_parity_start_add: %u\n",m_2_plus_parity_start_add); - //} - for (int m=0;m<64;m++) { - - - /* - * sadly these no division optimations turned out to be slower than a single calculation line - * uint16_t r_target = indJ_plus_m_mod_kB_times_kC + m_2_plus_parity_squared_iter; - - // this gets updated at end of loop. - indJ_plus_m_mod_kB += 1; - if (indJ_plus_m_mod_kB >= kB) { - indJ_plus_m_mod_kB = 0; - indJ_plus_m_mod_kB_times_kC = 0; - } else { - indJ_plus_m_mod_kB_times_kC += kC; - } - - m_2_plus_parity_squared_iter += m_2_plus_parity_start_add; - m_2_plus_parity_start_add += 8; // adds 8 extra each round compounding - if (m_2_plus_parity_squared_iter >= kC) m_2_plus_parity_squared_iter -= kC; - if (m_2_plus_parity_start_add >= kC) m_2_plus_parity_start_add -= kC; -*/ - uint16_t indJ = l_y / kC; - uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC); - - //if (!(test_target == r_target)) printf("fail: meta[0] %u\n",L_entry.meta[0]); - //if (doPrint) { - - // printf(" Test target result : %u ",test_target); - // if (r_target == test_target) printf(" SUCCESS!\n"); else printf(" FAIL.\n"); - // printf(" Desired target result: %u\n",r_target); - - // printf("\nNext values m:%u\n",m+1); - - // printf(" indJ_plus_m_mod_kB: %u\n",indJ_plus_m_mod_kB); - // printf(" indJ_plus_m_mod_kB_times_kC: %u\n",indJ_plus_m_mod_kB_times_kC); - // printf(" m_2_plus_parity_squared_iter: %u\n",m_2_plus_parity_squared_iter); - // printf(" m_2_plus_parity_start_add: %u\n",m_2_plus_parity_start_add); - //} - - - //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup - // is super-inefficient. 
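// The single-line r_target above is Chia's T1 match condition evaluated directly: for
// each offset m in 0..63, the right bucket must contain
//   r_target = ((l_y / kC + m) % kB) * kC + (((2m + parity)^2 + l_y) % kC)
// with parity = global kbc bucket id % 2. The same computation as a standalone helper:
__device__ uint16_t match_r_target(uint16_t l_y, uint16_t m, uint16_t parity) {
	uint16_t indJ = l_y / kC;
	return ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC);
}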
-
-
-			// find which box our r_target is in, extract the 15-bit value from that box
-			unsigned int kbc_map = r_target / 2;
-			const unsigned int kbc_box_shift = (r_target % 2) * 15;
-			unsigned int rmap_value = nick_rmap[kbc_map];
-			rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-
-			if (rmap_value > 0) {
-				// the pos_R is the lower 9 bits of that 15bit boxed value
-				uint16_t pos_R = rmap_value & 0b0111111111;
-				uint16_t count = rmap_value / 1024;
-
-				int num_matches = atomicAdd(&total_matches,1);//count);
-				if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-					printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-				} else {
-					Index_Match match = { };
-					match.idxL = pos_L;
-					match.idxR = pos_R;
-					matches[num_matches] = match;
-
-					// handle edge cases
-					// TODO: let's push these into separate array
-					// then test them later.
-					if (count > 1) {
-						int slot = atomicAdd(&y_duplicate_counts, 1);
-						nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L;
-					}
-				}
-			}
-		}
-	}
-
-	__syncthreads();
-
-	// do the extras
-
-	//int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add!
-	for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) {
-		for (int i=0; i<y_duplicate_counts; i++) {
-			uint32_t value = nick_rmap_extras_rl[i];
-			uint16_t r_target = value >> 16;
-			uint16_t pos_L = value & 0x0FFFF;
-			if (nick_rmap_extras_ry[slot] == r_target) {
-				uint16_t extra_pos_R = nick_rmap_extras_pos[slot];
-				Index_Match match = { };
-				match.idxL = pos_L;
-				match.idxR = extra_pos_R;
-				int num_matches = atomicAdd(&total_matches,1);
-				matches[num_matches] = match;
-				//matches[total_matches+slot] = match;
-				//if (doPrint > 1) {
-				//	printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot);
-				//}
-			}
-		}
-	}
-
-	__syncthreads();
-
-	if (threadIdx.x == 0) {
-		if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-			printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-			total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-		}
-		global_match_slot = atomicAdd(&match_counts[0],total_matches);
-	}
-
-	__syncthreads();
-
-
-	// now we go through all our matches and output to next round.
-	for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) {
-		Index_Match shared_match = matches[i];
-		Match_Attack_Pair_Index match = { };
-		match.bucket_L_id = global_kbc_L_bucket_id;
-		match.idx_L = shared_match.idxL;
-		match.idx_R = shared_match.idxR;
-		// *could* coalesce pair.meta[0..4] values here and y, instead of splitting y list.
-		// suspect splitting y list would be faster.
-		match_list[global_match_slot + i] = match;
-	}
-}
-
-
-template <typename BUCKETED_ENTRY_IN>
-__global__
-void gpu_attack_process_t1_pairs_orig(uint16_t table, uint32_t start_kbc_L, uint32_t end_kbc_R,
-		const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries,
-		Match_Attack_Pair_Index *match_list, int *match_counts) {
-	// testmatch count: 33532242
-	// testmatch T1 L time: 12 ms
-	const uint16_t NUM_RMAPS = (kBC/2)+1;
-	__shared__ unsigned int nick_rmap[NUM_RMAPS]; // positions and counts. Use 30 bits, 15 bits each entry with lower 9 bits for pos, 1024+ for count
-	__shared__ uint32_t nick_rmap_extras_rl[32];
-	__shared__ uint16_t nick_rmap_extras_ry[32];
-	__shared__ uint16_t nick_rmap_extras_pos[32];
-	__shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET];
-	__shared__ BUCKETED_ENTRY_IN kbc_L_entries[KBC_MAX_ENTRIES_PER_BUCKET];
-	__shared__ BUCKETED_ENTRY_IN kbc_R_entries[KBC_MAX_ENTRIES_PER_BUCKET];
-	__shared__ int total_matches;
-	__shared__ int global_match_slot;
-	__shared__ int num_extras;
-	__shared__ int y_duplicate_counts;
-
-	int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L;
-	uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L;
-
-	const uint8_t doPrint = 0;
-
-	if (gridDim.x != (end_kbc_R - start_kbc_L)) {
-		printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L);
-	}
-	int numThreadsInBlock = blockDim.x;
-	int threadId = threadIdx.x;
-	int threadStartScan = threadId;
-	int threadSkipScan = numThreadsInBlock;
-
-	const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET;
-	const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET;
-	const int num_L = kbc_local_num_entries[kbc_L_bucket_id];
-	const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)];
-
-	for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-		kbc_R_entries[pos_R] = kbc_local_entries[start_R+pos_R];
-	}
-	for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) {
-		kbc_L_entries[pos_L] = kbc_local_entries[start_L+pos_L];
-	}
-
-
-	if (threadIdx.x == 0) {
-		total_matches = 0;
-		num_extras = 0;
-		y_duplicate_counts = 0;
-		if (doPrint > 1) {
-			printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R);
-			if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) {
-				printf("ERROR numL or numR > max entries\n");
-				return;
-			}
-			if ((num_L == 0) || (num_R == 0) ) {
-				printf("ERROR: numL or numR is 0\n");
-				return;
-			}
-		}
-	}
-	// unfortunately to clear we have to do this
-	for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) {
-		nick_rmap[i] = 0;
-	}
-	__syncthreads(); // all written initialize data should sync
-
-	uint16_t parity = global_kbc_L_bucket_id % 2;
-
-	for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-		//Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-		BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-		uint16_t r_y = R_entry.y;
-
-		// r_y's share a block across two adjacent values, so kbc_map just works out which part it's in.
-		unsigned int kbc_map = r_y / 2;
-		const unsigned int kbc_box_shift = (r_y % 2) * 15;
-		unsigned int add = 1024 << kbc_box_shift; // we add from 10th bit up (shifted by the box it's in)
-
-		unsigned int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above)
-		rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-		if (rmap_value == 0) {
-			// if we added to an empty spot, what we do is add the pos_R here in the lower 9 bits of the box
-			// and ONLY for this one.
-			atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift));
-			//if (printandquit) {
-			//	printf("r_y: %u  pos:%u\n", r_y, pos_R);
-			//}
-		} else {
-			// we hit duplicate entry...add this to a row
-			int slot = atomicAdd(&num_extras, 1);
-			nick_rmap_extras_ry[slot] = r_y;
-			nick_rmap_extras_pos[slot] = pos_R;
-		}
-
-	}
-
-	__syncthreads(); // wait for all threads to write r_bid entries
-
-	for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) {
-		//Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L];
-		BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-		uint16_t l_y = L_entry.y;
-		//printf("scanning for pos_L: %u\n", pos_L);
-
-		for (int m=0;m<64;m++) {
-
-			//uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup
-			// is super-inefficient.
-
-			uint16_t indJ = l_y / kC;
-			uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC);
-
-			// find which box our r_target is in, extract the 15-bit value from that box
-			unsigned int kbc_map = r_target / 2;
-			const unsigned int kbc_box_shift = (r_target % 2) * 15;
-			int add = 1024 << kbc_box_shift; // we add from 10th bit up (shifted by the box it's in)
-			unsigned int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above)
-			rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-
-			if (rmap_value > 0) {
-				// the pos_R is the lower 9 bits of that 15bit boxed value
-				uint16_t pos_R = rmap_value & 0b0111111111;
-				uint16_t count = rmap_value / 1024;
-
-				int num_matches = atomicAdd(&total_matches,1);//count);
-				if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-					printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-				} else {
-					Index_Match match = { };
-					match.idxL = pos_L;
-					match.idxR = pos_R;
-					matches[num_matches] = match;
-
-					// handle edge cases
-					// TODO: let's push these into separate array
-					// then test them later.
-					if (count > 1) {
-						int slot = atomicAdd(&y_duplicate_counts, 1);
-						nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L;
-					}
-				}
-			}
-		}
-	}
-
-	__syncthreads();
-
-	// do the extras
-
-	//int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add!
-	for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) {
-		for (int i=0; i<y_duplicate_counts; i++) {
-			uint32_t value = nick_rmap_extras_rl[i];
-			uint16_t r_target = value >> 16;
-			uint16_t pos_L = value & 0x0FFFF;
-			if (nick_rmap_extras_ry[slot] == r_target) {
-				uint16_t extra_pos_R = nick_rmap_extras_pos[slot];
-				Index_Match match = { };
-				match.idxL = pos_L;
-				match.idxR = extra_pos_R;
-				int num_matches = atomicAdd(&total_matches,1);
-				matches[num_matches] = match;
-				//matches[total_matches+slot] = match;
-				//if (doPrint > 1) {
-				//	printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot);
-				//}
-			}
-		}
-	}
-
-	__syncthreads();
-
-	if (threadIdx.x == 0) {
-		if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-			printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-			total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-		}
-		global_match_slot = atomicAdd(&match_counts[0],total_matches);
-	}
-
-	__syncthreads();
-
-
-	// now we go through all our matches and output to next round.
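-	// Output pattern below: thread 0 reserved a contiguous region of match_list
-	// with a single atomicAdd on match_counts[0] (global_match_slot), so every
-	// thread can now write its matches[i] entries with plain strided stores,
-	// e.g. with blockDim.x=256 a thread writes i=threadIdx.x, threadIdx.x+256, ...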
-	for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) {
-		Index_Match shared_match = matches[i];
-		Match_Attack_Pair_Index match = { };
-		match.bucket_L_id = global_kbc_L_bucket_id;
-		match.idx_L = shared_match.idxL;
-		match.idx_R = shared_match.idxR;
-		// *could* coalesce pair.meta[0..4] values here and y, instead of splitting y list.
-		// suspect splitting y list would be faster.
-		match_list[global_match_slot + i] = match;
-	}
-}
-
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_attack_process_t1_matches_list(
-		const int MATCHES_COUNT, Match_Attack_Pair_Index *match_list,
-		const BUCKETED_ENTRY_IN *kbc_local_entries,
-		BUCKETED_ENTRY_OUT *kbc_out, unsigned int *out_kbc_counts,
-		const uint32_t KBC_START_L1, const uint32_t KBC_MAX_ENTRIES) {
-
-	int i = blockIdx.x*blockDim.x+threadIdx.x;
-
-	if (i < MATCHES_COUNT) {
-		Match_Attack_Pair_Index match = match_list[i];
-		BUCKETED_ENTRY_OUT pair = {};
-		uint32_t local_bucket_id = match.bucket_L_id - KBC_START_L1;
-		//printf("reading match %u : bucketL %u idx_L %u idx_R %u\n", i, local_bucket_id, match.idx_L, match.idx_R);
-		BUCKETED_ENTRY_IN L_Entry = kbc_local_entries[local_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + match.idx_L];
-		BUCKETED_ENTRY_IN R_Entry = kbc_local_entries[(local_bucket_id+1) * KBC_MAX_ENTRIES_PER_BUCKET + match.idx_R];
-
-		uint64_t blake_result;
-		uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, match.bucket_L_id); // make sure this is global bucket id
-
-		pair.meta[0] = L_Entry.meta[0];
-		pair.meta[1] = R_Entry.meta[0];
-		nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL);
-
-		uint32_t kbc_bucket = blake_result / kBC;
-
-		pair.y = (uint32_t) (blake_result % kBC);
-
-		uint32_t kbc_bitmask_bucket = kbc_bucket / 8;
-		uint32_t kbc_bitmask_shift = 4*(kbc_bucket % 8);
-		unsigned int kbc_bitmask_add = 1 << (kbc_bitmask_shift);
-		unsigned int bitadd = atomicAdd(&out_kbc_counts[kbc_bitmask_bucket],kbc_bitmask_add);
-		uint32_t block_slot = bitadd;
-		block_slot = (block_slot >> (kbc_bitmask_shift)) & 0b01111;
-
-		if (block_slot >= KBC_MAX_ENTRIES) { // >= : slot KBC_MAX_ENTRIES itself would write past the bucket
-			printf("block_slot > MAX %u\n", block_slot);
-		} else {
-			uint32_t pair_address = kbc_bucket * KBC_MAX_ENTRIES + block_slot;
-			//if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-			//	printf("ERROR: results address overflow\n");
-			//} else {
-				kbc_out[pair_address] = pair;
-			//}
-		}
-		//}
-
-
-	}
-}
-
-template <typename BUCKETED_ENTRY_IN>
-__global__
-void gpu_attack_process_global_kbc_pairs_list(
-		const int PAIRS_COUNT, unsigned int *kbc_pairs_list_L_bucket_ids,
-		const BUCKETED_ENTRY_IN *kbc_global_entries_L, const unsigned int *kbc_global_num_entries_L,
-		const BUCKETED_ENTRY_IN *kbc_global_entries_R, const unsigned int *kbc_global_num_entries_R,
-		Match_Attack_Pair_Index *match_list, int *match_counts,
-		const uint32_t KBC_MAX_ENTRIES) {
-
-	// NOTE: possible optimization is to only get y elements of a list instead of ALL the meta...
-	// requires splitting the meta and y fields into two separate lists. Alternatively we copy
-	// all the meta chunk in this round.
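-	// Packed-count layout assumed below: kbc_global_num_entries_* stores eight
-	// 4-bit counters per 32-bit word, so bucket b lives in word b/8 at shift
-	// 4*(b%8) and num = (word >> shift) & 0b1111 (max 15 entries per bucket).
-	// e.g. bucket 21 -> word 2, shift 20; a word value of 0x00300000 means num=3.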
-
-	int i = blockIdx.x*blockDim.x+threadIdx.x;
-
-	if (i < PAIRS_COUNT) {
-		unsigned int global_kbc_L_bucket_id = kbc_pairs_list_L_bucket_ids[i];
-
-		uint32_t kbc_bitmask_bucket = global_kbc_L_bucket_id / 8;
-		uint32_t kbc_bitmask_shift = 4*(global_kbc_L_bucket_id % 8);
-		uint32_t bitvalue = kbc_global_num_entries_L[kbc_bitmask_bucket];
-		const unsigned int num_L = (bitvalue >> (kbc_bitmask_shift)) & 0b01111;
-
-		kbc_bitmask_bucket = (global_kbc_L_bucket_id + 1) / 8;
-		kbc_bitmask_shift = 4*((global_kbc_L_bucket_id + 1) % 8);
-		bitvalue = kbc_global_num_entries_R[kbc_bitmask_bucket];
-		const unsigned int num_R = (bitvalue >> (kbc_bitmask_shift)) & 0b01111;
-
-		if ((num_L == 0) || (num_R == 0)) {
-			printf("ERROR: PAIRS LIST SHOULD NOT HAVE 0 COUNTS\n");
-			return; // shouldn't ever happen with a pairs list...
-		}
-
-		const uint32_t start_L = global_kbc_L_bucket_id*KBC_MAX_ENTRIES;
-		const uint32_t start_R = (global_kbc_L_bucket_id+1)*KBC_MAX_ENTRIES;
-
-		const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_global_entries_L[start_L];
-		const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_global_entries_R[start_R];
-
-		// For any 0 <= m < kExtraBitsPow:
-		// yl / kBC + 1 = yR / kBC   AND
-		// (yr % kBC) / kC - (yl % kBC) / kC = m   (mod kB)   AND
-		// (yr % kBC) % kC - (yl % kBC) % kC = (2m + (yl/kBC) % 2)^2   (mod kC)
-
-		for (int pos_R = 0; pos_R < num_R; pos_R+=1) {
-			//Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-			BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-			int16_t yr_kbc = R_entry.y;
-			int16_t yr_bid = yr_kbc / kC; // values [0..kB]
-			for (uint16_t pos_L = 0; pos_L < num_L; pos_L++) {
-				// do L_entry and R_entry match?
-				BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-				int16_t yl_kbc = L_entry.y;
-				int16_t yl_bid = yl_kbc / kC; // values [0..kB]
-				int16_t formula_one = yr_bid - yl_bid; // this should actually give m
-				if (formula_one < 0) {
-					formula_one += kB;
-				}
-				int16_t m = formula_one;
-				if (m >= kB) {
-					m -= kB;
-				}
-				if (m < 64) {
-					// passed first test
-					int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC perfectly divisible by kC
-					int16_t yr_cid = yr_kbc % kC;
-					int16_t parity = (global_kbc_L_bucket_id) % 2;
-					int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127]
-					int16_t formula_two = yr_cid - yl_cid;
-					if (formula_two < 0) {
-						formula_two += kC;
-					}
-					if (formula_two == m2_parity_squared) {
-						// we have a match.
-						int slot = atomicAdd(&match_counts[0],1);
-						Match_Attack_Pair_Index match = { };
-						match.bucket_L_id = global_kbc_L_bucket_id;
-						match.idx_L = pos_L;
-						match.idx_R = pos_R;
-						// *could* coalesce pair.meta[0..4] values here and y, instead of splitting y list.
-						// suspect splitting y list would be faster.
-						match_list[slot] = match;
-					}
-				}
-			}
-		}
-	}
-}
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_attack_process_matches_list(
-		uint16_t table,
-		const int MATCHES_COUNT, Match_Attack_Pair_Index *match_list,
-		const BUCKETED_ENTRY_IN *kbc_global_entries_L,
-		const BUCKETED_ENTRY_IN *kbc_global_entries_R,
-		BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts,
-		const uint32_t KBC_MAX_ENTRIES, const uint32_t BLOCK_MAX_ENTRIES) {
-
-	// NOTE: possible optimization is to only get y elements of a list instead of ALL the meta...
-	// requires splitting the meta and y fields into two separate lists. Alternatively we copy
-	// all the meta chunk in this round.
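-	// Meta widths per table in the switch below: T1 pairs two 1-word metas
-	// (blake over 2 words), T2 pairs two 2-word metas (blake over 4), while
-	// T3/T4 hash the full 8 words and keep 4 (T3) or 3 (T4) words of output
-	// meta for the next round.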
-
-	int i = blockIdx.x*blockDim.x+threadIdx.x;
-
-	if (i < MATCHES_COUNT) {
-		Match_Attack_Pair_Index match = match_list[i];
-		BUCKETED_ENTRY_OUT pair = {};
-		BUCKETED_ENTRY_IN L_Entry = kbc_global_entries_L[match.bucket_L_id * KBC_MAX_ENTRIES + match.idx_L];
-		BUCKETED_ENTRY_IN R_Entry = kbc_global_entries_R[(match.bucket_L_id+1) * KBC_MAX_ENTRIES + match.idx_R];
-		uint64_t blake_result;
-		uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, match.bucket_L_id);
-		if (table == 1) {
-			pair.meta[0] = L_Entry.meta[0];
-			pair.meta[1] = R_Entry.meta[0];
-			nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL);
-		} else if (table == 2) {
-			pair.meta[0] = L_Entry.meta[0];
-			pair.meta[1] = L_Entry.meta[1];
-			pair.meta[2] = R_Entry.meta[0];
-			pair.meta[3] = R_Entry.meta[1];
-			nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL);
-		} else if (table == 3) {
-			const uint32_t meta[8] = {
-				L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3],
-				R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3]
-			};
-			nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta);
-		} else if (table == 4) {
-			const uint32_t meta[8] = {
-				L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3],
-				R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3]
-			};
-			nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta);
-		}
-		uint64_t batch_bucket = blake_result >> (38-6);
-		const uint64_t block_mod = (uint64_t) 1 << (38-6);
-		pair.y = (uint32_t) (blake_result % block_mod);
-		int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1);
-		uint32_t pair_address = batch_bucket * BLOCK_MAX_ENTRIES + block_slot;
-		bucketed_out[pair_address] = pair;
-	}
-}
-
-
-void attack_method_2(uint32_t bits) {
-
-	// attack method 2 does: chacha the xs into four target kbc ranges (L1/R1 and L2/R2),
-	// T1-match each range, then cross-match the T1 L/R results into T2 pairs.
-
-	uint64_t BITS_DIVISOR = 1 << bits;
-
-	uint64_t target_kbc_L1 = 5084069;
-	uint64_t target_kbc_R1 = 15069966;
-	uint64_t bucket_L1 = ((target_kbc_L1 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t bucket_R1 = ((target_kbc_R1 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t KBC_START_L1 = (bucket_L1*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_L1 = ((bucket_L1+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_START_R1 = (bucket_R1*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_R1 = ((bucket_R1+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-
-	uint64_t target_kbc_L2 = 18145034;
-	uint64_t target_kbc_R2 = 14148756;
-	uint64_t bucket_L2 = ((target_kbc_L2 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t bucket_R2 = ((target_kbc_R2 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t KBC_START_L2 = (bucket_L2*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_L2 = ((bucket_L2+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_START_R2 = (bucket_R2*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_R2 = ((bucket_R2+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-
-	// kbc bucket bitmask: e.g. if 10 bits = 1024 buckets
-	// set [64][64] with appropriate bit
-	// when chacha, do kbc_bucket and translate to kbc_bit
-	// then kbc_bit & [64] for the check, to get true/false
-	// then need to find which array to write to. Oh.
-	// maybe easier to make array [0..1024] of (Array *), where NULL is in ones not used
-	// and just do kbc_array = array[kbc_bucket]
-	// if !NULL DO....
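-	// (sketch of the idea above, illustrative names only: a uint64_t mask[64]
-	//  covers 64*64 = 4096 coarse buckets; for a kbc_bucket mapped to bit b,
-	//  membership is (mask[b >> 6] >> (b & 63)) & 1, and the per-bucket output
-	//  array would come from an Array* table indexed by kbc_bucket, NULL when unused.)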
-
-	std::cout << "ATTACK METHOD 2" << std::endl;
-	std::cout << "   BITS: " << bits << " DIVISOR:" << BITS_DIVISOR
-			<< " target_kbc_L1 " << target_kbc_L1 << " -> bucket L1 " << bucket_L1
-			<< " kbc range: " << KBC_START_L1 << " - " << KBC_END_L1 << " kbcs" << std::endl;
-
-	//Pair 0 x:1320788535 y:76835538515 kBC:5084069
-	//  Pair 1 x:3465356684 y:76835558195 kBC:5084070
-	//  Pair 2 x:2131394289 y:227752410271 kBC:15069966
-	//  Pair 3 x:606438761 y:227752417481 kBC:15069967
-
-	uint64_t KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS; // +1 is for including last R bucket space
-
-	uint64_t MAX_KBCS_POST_T1 = 16; // reduce if smaller selection based on initial t0 range.
-	uint32_t BLOCK_MAX_ENTRIES_T2 = HOST_MAX_BLOCK_ENTRIES / 16;
-	//uint32_t NUM_EXPECTED_ENTRIES_T1_MATCHES = 67108864;
-	uint32_t NUM_EXPECTED_ENTRIES_T2_MATCHES = 1048576;
-	if (bits == 6) {
-		KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS;
-		//NUM_EXPECTED_ENTRIES_T1_MATCHES = 67108864;
-		MAX_KBCS_POST_T1 = 16;
-		NUM_EXPECTED_ENTRIES_T2_MATCHES = 1048576;
-		BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32;
-	} else if (bits == 7) {
-		KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 2;
-		//NUM_EXPECTED_ENTRIES_T1_MATCHES = 33554432;
-		MAX_KBCS_POST_T1 = 12;
-		NUM_EXPECTED_ENTRIES_T2_MATCHES = 262144;
-		BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32;
-	} else if (bits == 8) {
-		KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 4;
-		//NUM_EXPECTED_ENTRIES_T1_MATCHES = 16777216;
-		MAX_KBCS_POST_T1 = 12;
-		NUM_EXPECTED_ENTRIES_T2_MATCHES = 65536;
-		BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32;
-	} else if (bits == 9) {
-		KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 8;
-		//NUM_EXPECTED_ENTRIES_T1_MATCHES = 8388608;
-		MAX_KBCS_POST_T1 = 10;
-		NUM_EXPECTED_ENTRIES_T2_MATCHES = 16384;
-		BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32;
-	} else if (bits == 10) {
-		KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 16;
-		//NUM_EXPECTED_ENTRIES_T1_MATCHES = 4194304;
-		MAX_KBCS_POST_T1 = 8;
-		NUM_EXPECTED_ENTRIES_T2_MATCHES = 4096;
-		BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32;
-	}
-	uint64_t T0_KBC_DEVICE_BUFFER_ALLOCATED_ENTRIES = KBC_ATTACK_NUM_BUCKETS * KBC_MAX_ENTRIES_PER_BUCKET;
-	std::cout << "   L0 kbc range " << KBC_START_L1 << " to " << KBC_END_L1 << " = " << (KBC_END_L1-KBC_START_L1) << " kbcs " << (100.0*(double)(KBC_END_L1-KBC_START_L1)/(double)kBC_LAST_BUCKET_ID) << "%" << std::endl
-			<< "   R0 kbc range " << KBC_START_R1 << " to " << KBC_END_R1 << " = " << (KBC_END_R1-KBC_START_R1) << " kbcs " << (100.0*(double)(KBC_END_R1-KBC_START_R1)/(double)kBC_LAST_BUCKET_ID) << "%" << std::endl
-			<< "   KBC_ATTACK_NUM_BUCKETS: " << KBC_ATTACK_NUM_BUCKETS << std::endl
-			<< "   MAX KBCS POST T1: " << MAX_KBCS_POST_T1 << std::endl
-			<< "   BLOCK_MAX_ENTRIES_T2: " << BLOCK_MAX_ENTRIES_T2 << std::endl;
-
-
-	using milli = std::chrono::milliseconds;
-	auto attack_start = std::chrono::high_resolution_clock::now();
-
-	char *device_buffer;
-	int* device_local_kbc_num_entries_L;
-	int* device_local_kbc_num_entries_R;
-	int* device_local_kbc_num_entries_L2;
-	int* device_local_kbc_num_entries_R2;
-	int* device_T2_block_entry_counts;
-
-	const uint64_t T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED = kBC_NUM_BUCKETS * MAX_KBCS_POST_T1 * sizeof(Tx_Bucketed_Meta2);
-	std::cout << "   T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED: " << T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED << std::endl;
-	std::cout << "      * 2 = " << (T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED * 2) << std::endl;
-
-	const uint64_t T2_BATCH_MATCH_RESULTS_BYTES_NEEDED = (BLOCK_MAX_ENTRIES_T2 * BATCHES) * sizeof(Tx_Bucketed_Meta4);
-	std::cout << "   T2_BATCH_MATCH_RESULTS_BYTES_NEEDED: " << T2_BATCH_MATCH_RESULTS_BYTES_NEEDED << std::endl;
-	const uint64_t BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED = T0_KBC_DEVICE_BUFFER_ALLOCATED_ENTRIES * sizeof(Tx_Bucketed_Meta2);
-	std::cout << "   CHACHA BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED: " << BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED << std::endl;
-	std::cout << "      * 4 = " << (BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED * 4) << std::endl;
-
-	const uint64_t TOTAL_BYTES_NEEDED =
-			  4 * BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED
-			+ 2 * T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED
-			+ T2_BATCH_MATCH_RESULTS_BYTES_NEEDED;
-
-	Tx_Bucketed_Meta4 *T2_batch_match_results;
-	char *device_local_kbc_entries_L;
-	char *device_local_kbc_entries_R;
-	char *device_local_kbc_entries_L2;
-	char *device_local_kbc_entries_R2;
-
-	Tx_Bucketed_Meta2 *T1_L_kbc_match_results;
-	Tx_Bucketed_Meta2 *T1_R_kbc_match_results;
-	unsigned int *device_global_kbc_num_entries_L;
-	unsigned int *device_global_kbc_num_entries_R;
-
-	//std::cout << "      T1_L_batch_match_results " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << sizeof(Tx_Bucketed_Meta2) << ") = " << (DEVICE_BUFFER_ALLOCATED_ENTRIES * sizeof(Tx_Bucketed_Meta2)) << std::endl;
-	//CUDA_CHECK_RETURN(cudaMalloc(&device_buffer, DEVICE_BUFFER_ALLOCATED_ENTRIES * sizeof(Tx_Bucketed_Meta2)));
-
-	std::cout << "      device_local_kbc_num_entries_L " << KBC_ATTACK_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_ATTACK_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_L, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	std::cout << "      device_local_kbc_num_entries_R " << KBC_ATTACK_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_ATTACK_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_R, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	std::cout << "      device_local_kbc_num_entries_L2 " << KBC_ATTACK_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_ATTACK_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_L2, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	std::cout << "      device_local_kbc_num_entries_R2 " << KBC_ATTACK_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_ATTACK_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_R2, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-
-	// 32 bit...limit to 4 bits = 16 max entries per kbc, 8 kbcs per word
-	std::cout << "      device_global_kbc_num_entries_L " << (kBC_NUM_BUCKETS/8) << " = " << ((kBC_NUM_BUCKETS/8)*sizeof(int)) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_global_kbc_num_entries_L, (kBC_NUM_BUCKETS/8)*sizeof(int)));
-	std::cout << "      device_global_kbc_num_entries_R " << (kBC_NUM_BUCKETS/8) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_global_kbc_num_entries_R, (kBC_NUM_BUCKETS/8)*sizeof(int)));
-
-
-	std::cout << "      device_buffer TOTAL BYTES: " << TOTAL_BYTES_NEEDED << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_buffer, TOTAL_BYTES_NEEDED));
-	uint64_t MEM_POS = 0;
-	device_local_kbc_entries_L  = &device_buffer[MEM_POS];
-	device_local_kbc_entries_R  = &device_buffer[MEM_POS + BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED];
-	device_local_kbc_entries_L2 = &device_buffer[MEM_POS + BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED*2];
-	device_local_kbc_entries_R2 = &device_buffer[MEM_POS + BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED*3];
-	MEM_POS += 4 * BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED;
-	T1_L_kbc_match_results = (Tx_Bucketed_Meta2 *) &device_buffer[MEM_POS];
-	T1_R_kbc_match_results = (Tx_Bucketed_Meta2 *) &device_buffer[MEM_POS + T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED];
-	MEM_POS += 2 * T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED;
-	T2_batch_match_results = (Tx_Bucketed_Meta4 *) &device_buffer[MEM_POS];
-	MEM_POS += T2_BATCH_MATCH_RESULTS_BYTES_NEEDED;
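-	// Layout note: all of the regions above are carved from the single device_buffer
-	// allocation by advancing MEM_POS, i.e. the four chacha kbc regions sit at byte
-	// offsets 0, 1x, 2x and 3x of BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED, followed by
-	// the two T1 kbc result regions and then the T2 batch result region.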
-
-	std::cout << "      device_T2_block_entry_counts (" << BATCHES << "): " << BATCHES << " size:" << (sizeof(int)*BATCHES) << std::endl;
-	CUDA_CHECK_RETURN(cudaMallocManaged(&device_T2_block_entry_counts, BATCHES*sizeof(int)));
-
-	auto alloc_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   alloc time: " << std::chrono::duration_cast<milli>(alloc_finish - attack_start).count() << " ms\n";
-
-	auto compute_only_start = std::chrono::high_resolution_clock::now();
-	std::cout << "Doing chacha\n";
-
-
-	int blockSize = 128; // # of threads per block, maximum is 1024.
-	const uint64_t calc_N = UINT_MAX;
-	const uint64_t calc_blockSize = blockSize;
-	const uint64_t calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16);
-	int numBlocks = calc_numBlocks;
-
-
-
-
-
-	Tx_Bucketed_Meta1 *T0_local_kbc_entries_L = (Tx_Bucketed_Meta1 *) &device_local_kbc_entries_L[0]; // will replace...
-	Tx_Bucketed_Meta1 *T0_local_kbc_entries_R = (Tx_Bucketed_Meta1 *) &device_local_kbc_entries_R[0];
-	Tx_Bucketed_Meta1 *T0_local_kbc_entries_L2 = (Tx_Bucketed_Meta1 *) &device_local_kbc_entries_L2[0]; // will replace...
-	Tx_Bucketed_Meta1 *T0_local_kbc_entries_R2 = (Tx_Bucketed_Meta1 *) &device_local_kbc_entries_R2[0];
-
-	std::cout << "Note: sizeof(Tx_Bucketed_Meta1) is " << sizeof(Tx_Bucketed_Meta1)*8 << " bits, when it should be 96 bits" << std::endl;
-
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_L, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_R, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_L2, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_R2, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-
-	std::cout << "Doing T1" << std::endl;
-	auto t1_start = std::chrono::high_resolution_clock::now();
-	auto chacha_start = std::chrono::high_resolution_clock::now();
-	//gpu_chacha8_k32_kbc_ranges_LR<<<numBlocks, blockSize>>>(calc_N, chacha_input,
-	//		T0_local_kbc_entries_L, device_local_kbc_num_entries_L, KBC_START_L1, KBC_END_L1,
-	//		T0_local_kbc_entries_R, device_local_kbc_num_entries_R, KBC_START_R1, KBC_END_R1);
-	gpu_chacha8_k32_kbc_ranges_LR1LR2<<<numBlocks, blockSize>>>(calc_N, chacha_input,
-			T0_local_kbc_entries_L, device_local_kbc_num_entries_L, KBC_START_L1, KBC_END_L1,
-			T0_local_kbc_entries_R, device_local_kbc_num_entries_R, KBC_START_R1, KBC_END_R1,
-			T0_local_kbc_entries_L2, device_local_kbc_num_entries_L2, KBC_START_L2, KBC_END_L2,
-			T0_local_kbc_entries_R2, device_local_kbc_num_entries_R2, KBC_START_R2, KBC_END_R2);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto chacha_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   chacha L1 time: " << std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count() << " ms\n";
-	//gpu_list_local_kbc_entries<<<1,1>>>(device_local_kbc_num_entries_L2, 0, 100, 1);
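-
-	// match_list below is a fixed 2^26-entry scratch list; assuming the three-uint32
-	// Match_Attack_Pair_Index layout (bucket_L_id, idx_L, idx_R) that is 12 bytes per
-	// entry, roughly 768MB, well above the ~33.5M T1 matches noted in the timings above.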
-	Match_Attack_Pair_Index *match_list;
-	int *match_counts;
-	CUDA_CHECK_RETURN(cudaMalloc(&match_list, 67108864*sizeof(Match_Attack_Pair_Index)));
-	CUDA_CHECK_RETURN(cudaMallocManaged(&match_counts, sizeof(unsigned int)));
-	match_counts[0] = 0;
-	auto testmatchT1_start = std::chrono::high_resolution_clock::now();
-	gpu_attack_process_t1_pairs<<<(KBC_END_L1 - KBC_START_L1), 256>>>(1, KBC_START_L1, KBC_END_L1,
-			T0_local_kbc_entries_L, device_local_kbc_num_entries_L,
-			match_list,match_counts);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto testmatchT1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   testmatch count: " << match_counts[0] << std::endl;
-	std::cout << "   testmatch T1 L time: " << std::chrono::duration_cast<milli>(testmatchT1_finish - testmatchT1_start).count() << " ms\n";
-
-	// CODE BELOW CRASHES
-
-	int matchT1_count = match_counts[0];
-	const int matchT1_blockSize = 256;
-	const int matchT1_numBlocks = (matchT1_count + matchT1_blockSize - 1) / matchT1_blockSize;
-	auto bestmatchT1_start = std::chrono::high_resolution_clock::now();
-	CUDA_CHECK_RETURN(cudaMemset(device_global_kbc_num_entries_L, 0, (kBC_NUM_BUCKETS/8)*sizeof(int)));
-	gpu_attack_process_t1_matches_list<<<matchT1_numBlocks, matchT1_blockSize>>>(
-	//gpu_attack_process_t1_matches_list<<<matchT1_numBlocks, matchT1_blockSize>>>(
-			matchT1_count, match_list,
-			T0_local_kbc_entries_L,
-			T1_L_kbc_match_results, device_global_kbc_num_entries_L,
-			KBC_START_L1, MAX_KBCS_POST_T1);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto bestmatchT1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   blake match T1 L time: " << std::chrono::duration_cast<milli>(bestmatchT1_finish - bestmatchT1_start).count() << " ms\n";
-	std::cout << "   FINAL match T1 L time: " << std::chrono::duration_cast<milli>(bestmatchT1_finish - testmatchT1_start).count() << " ms\n";
-
-	//gpu_list_local_kbc_entries_bitmask<<<1,1>>>(device_global_kbc_num_entries_L, 0, 100, 1);
-
-	auto matchT1_start = std::chrono::high_resolution_clock::now();
-	CUDA_CHECK_RETURN(cudaMemset(device_global_kbc_num_entries_L, 0, (kBC_NUM_BUCKETS/8)*sizeof(int)));
-	gpu_attack_find_t1_matches_out_kbc<<<(KBC_END_L1 - KBC_START_L1), 256>>>(1, KBC_START_L1, KBC_END_L1,
-			T0_local_kbc_entries_L, device_local_kbc_num_entries_L,
-			T1_L_kbc_match_results, device_global_kbc_num_entries_L, MAX_KBCS_POST_T1);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto matchT1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   match T1 L time: " << std::chrono::duration_cast<milli>(matchT1_finish - matchT1_start).count() << " ms\n";
-	//gpu_list_local_kbc_entries_bitmask<<<1,1>>>(device_global_kbc_num_entries_L, 0, 100, 1);
-
-	matchT1_start = std::chrono::high_resolution_clock::now();
-	CUDA_CHECK_RETURN(cudaMemset(device_global_kbc_num_entries_R, 0, (kBC_NUM_BUCKETS/8)*sizeof(int)));
-	gpu_attack_find_t1_matches_out_kbc<<<(KBC_END_R1 - KBC_START_R1), 256>>>(1, KBC_START_R1, KBC_END_R1,
-			T0_local_kbc_entries_R, device_local_kbc_num_entries_R,
-			T1_R_kbc_match_results, device_global_kbc_num_entries_R, MAX_KBCS_POST_T1);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	matchT1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   match T1 R time: " << std::chrono::duration_cast<milli>(matchT1_finish - matchT1_start).count() << " ms\n";
-
-	//gpu_list_local_kbc_entries_bitmask<<<1,1>>>(device_global_kbc_num_entries_R, 0, 100, 1);
-
-	// TODO: need to do a "pairing" pass, where we just scan through each bucket and spit out a list of kbc pairs
-	// then, on a second pass, process the pairs with compute method. This way all threads are going to be working
-	// and it should be near instant.
-	// NOTE: will have to handle pairing pass having more than one entry
-	// ALSO TRY: single pass where we compute on the fly, but probably it will store all the 0 entries
-	// e.g. T2 9 bit, expect 16000 matches from 18188177 buckets = 1 in 1100 buckets
-
-	// after t1 pairs output to kbc list, for t2 pairing we first filter all eligible bucket ids.
-	unsigned int *kbc_pairs_list_L_bucket_ids;
-	int *pairs_count;
-	CUDA_CHECK_RETURN(cudaMalloc(&kbc_pairs_list_L_bucket_ids, kBC_NUM_BUCKETS*sizeof(unsigned int)));
-	CUDA_CHECK_RETURN(cudaMallocManaged(&pairs_count, sizeof(unsigned int)));
-	pairs_count[0] = 0;
-	//CUDA_CHECK_RETURN(cudaMemset(pairs_count, 0, sizeof(int)));
-
-	auto pairingT2_start = std::chrono::high_resolution_clock::now();
-	const int pair_blockSize = 256; // # of threads per block, maximum is 1024.
-	const uint32_t pair_numBlocks = (kBC_NUM_BUCKETS + pair_blockSize - 1) / pair_blockSize;
-	gpu_attack_get_kbcs_with_pairs_from_global_kbcs<<<pair_numBlocks, pair_blockSize>>>(
-			device_global_kbc_num_entries_L,device_global_kbc_num_entries_R,
-			kbc_pairs_list_L_bucket_ids, pairs_count);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto pairingT2_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   pairs count: " << pairs_count[0] << std::endl;
-	std::cout << "   pairing T2 L time: " << std::chrono::duration_cast<milli>(pairingT2_finish - pairingT2_start).count() << " ms\n";
-
-	//Match_Attack_Pair_Index *match_list;
-	//int *match_counts;
-	//CUDA_CHECK_RETURN(cudaMalloc(&match_list, 2*NUM_EXPECTED_ENTRIES_T2_MATCHES*sizeof(Match_Attack_Pair_Index)));
-	//CUDA_CHECK_RETURN(cudaMallocManaged(&match_counts, sizeof(unsigned int)));
-	match_counts[0] = 0;
-
-
-	auto processT2_start = std::chrono::high_resolution_clock::now();
-	int process_count = pairs_count[0];
-	const int process_blockSize = 256;
-	const int process_numBlocks = (process_count + process_blockSize - 1) / process_blockSize;
-	gpu_attack_process_global_kbc_pairs_list<<<process_numBlocks, process_blockSize>>>(
-			process_count, kbc_pairs_list_L_bucket_ids,
-			T1_L_kbc_match_results, device_global_kbc_num_entries_L,
-			T1_R_kbc_match_results, device_global_kbc_num_entries_R,
-			match_list, match_counts,
-			MAX_KBCS_POST_T1);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto processT2_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   t2 match_counts: " << match_counts[0] << std::endl;
-	std::cout << "   process T2 L time: " << std::chrono::duration_cast<milli>(processT2_finish - processT2_start).count() << " ms\n";
-
-	CUDA_CHECK_RETURN(cudaMemset(device_T2_block_entry_counts, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599
-
-	auto matchT2_start = std::chrono::high_resolution_clock::now();
-	int matches_count = match_counts[0];
-	const int match_blockSize = 256;
-	const int match_numBlocks = (matches_count + match_blockSize - 1) / match_blockSize;
-	gpu_attack_process_matches_list<<<match_numBlocks, match_blockSize>>>(
-			2,
-			matches_count, match_list,
-			T1_L_kbc_match_results,
-			T1_R_kbc_match_results,
-			T2_batch_match_results, device_T2_block_entry_counts,
-			MAX_KBCS_POST_T1, BLOCK_MAX_ENTRIES_T2);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto matchT2_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   match T2 L time: " << std::chrono::duration_cast<milli>(matchT2_finish - matchT2_start).count() << " ms\n";
-
-	/*
-	 *   process T2 L time: 0 ms
-	 *   match T2 L time: 12 ms
-	 * Freeing memory...
-	 * GPU DISPLAY T2 MATCH RESULTS:
-	 *   block 22 entry 198 x1:1320788535 x2:3465356684 x3:2131394289 x4:606438761
-	 * TOTAL: 16498
-	 */
-
-	/*auto matchT2_start = std::chrono::high_resolution_clock::now();
-	gpu_attack_find_tx_LR_matches_global<<<kBC_NUM_BUCKETS, 256>>>(2, 0, kBC_NUM_BUCKETS,
-			T1_L_kbc_match_results, device_global_kbc_num_entries_L,
-			T1_R_kbc_match_results, device_global_kbc_num_entries_R,
-			T2_batch_match_results, device_T2_block_entry_counts,
-			MAX_KBCS_POST_T1, BLOCK_MAX_ENTRIES_T2);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto matchT2_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   match T2 L time: " << std::chrono::duration_cast<milli>(matchT2_finish - matchT2_start).count() << " ms\n";
-	*/
-	auto compute_only_finish = std::chrono::high_resolution_clock::now();
-
-	gpu_display_t2_match_results<<<1,1>>>(T2_batch_match_results, device_T2_block_entry_counts, BLOCK_MAX_ENTRIES_T2);
-
-	std::cout << "Freeing memory..." << std::endl;
-	CUDA_CHECK_RETURN(cudaFree(device_local_kbc_num_entries_L));
-	CUDA_CHECK_RETURN(cudaFree(device_local_kbc_num_entries_R));
-	//CUDA_CHECK_RETURN(cudaFree(device_block_entry_counts));
-	CUDA_CHECK_RETURN(cudaFree(device_buffer));
-
-	auto attack_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   compute only time: " << std::chrono::duration_cast<milli>(compute_only_finish - compute_only_start).count() << " ms\n";
-	std::cout << "   attack total time: " << std::chrono::duration_cast<milli>(attack_finish - attack_start).count() << " ms\n";
-	std::cout << "end." << std::endl;
-}
-
-
-
-
-#endif /* ATTACK_HPP_ */
diff --git a/attack_method_kbc_list.hpp b/attack_method_kbc_list.hpp
deleted file mode 100644
index dcd6f5a..0000000
--- a/attack_method_kbc_list.hpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * attack_method_kbc_list.hpp
- *
- *  Created on: Nov 7, 2021
- *      Author: nick
- */
-
-#ifndef ATTACK_METHOD_KBC_LIST_HPP_
-#define ATTACK_METHOD_KBC_LIST_HPP_
-
-#define ATTACK_FILTER_BITMASK(chacha_y,i) \
-{ \
-	uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \
-	if (kbc_bucket_id_L > 0) { \
-		int kbc_bitmask_bucket = kbc_bucket_id_L / 32; \
-		unsigned int kbc_bit_slot = kbc_bucket_id_L % 32; \
-		unsigned int kbc_mask = 1 << kbc_bit_slot; \
-		unsigned int kbc_value = kbc_global_bitmask[kbc_bitmask_bucket]; \
-		if ((kbc_mask & kbc_value) > 0) { \
-			int slot = atomicAdd(&count[0],1); \
-			xs[slot] = (x+i); \
-			chachas[slot] = chacha_y; \
-		} \
-	} \
-}
-
-__global__
-void gpu_chacha8_filter_rxs_by_kbc_bitmask(const uint32_t N,
-		const __restrict__ uint32_t *input,
-		const unsigned int* __restrict__ kbc_global_bitmask,
-		uint32_t * __restrict__ xs, uint32_t * __restrict__ chachas, int *count)
-{
-	uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
-	int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-	int stride = blockDim.x * gridDim.x;
-	const uint32_t end_n = N / 16; // 16 x's in each group
-
-	for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-		uint32_t x = x_group << 4;// *16;
-		uint32_t pos = x_group;
-
-		x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
-		x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
-		x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
-		x14 = input[14];x15 = input[15];
-
-		#pragma unroll
-		for (int i = 0; i < 4; i++) {
-			QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
-			QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
-		}
-
-		x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
-		x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
-		x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0;
-		x14 += input[14];x15 += input[15];
-
-		// convert to little endian/big endian whatever, chia needs it like this
-		BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
-		BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
-		BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
-
-		//uint64_t y = x0 << 6 + x >> 26;  for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-		//int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-		ATTACK_FILTER_BITMASK(x0,0);ATTACK_FILTER_BITMASK(x1,1);ATTACK_FILTER_BITMASK(x2,2);ATTACK_FILTER_BITMASK(x3,3);
-		ATTACK_FILTER_BITMASK(x4,4);ATTACK_FILTER_BITMASK(x5,5);ATTACK_FILTER_BITMASK(x6,6);ATTACK_FILTER_BITMASK(x7,7);
-		ATTACK_FILTER_BITMASK(x8,8);ATTACK_FILTER_BITMASK(x9,9);ATTACK_FILTER_BITMASK(x10,10);ATTACK_FILTER_BITMASK(x11,11);
-		ATTACK_FILTER_BITMASK(x12,12);ATTACK_FILTER_BITMASK(x13,13);ATTACK_FILTER_BITMASK(x14,14);ATTACK_FILTER_BITMASK(x15,15);
-	}
-}
-
-__global__
-void gpu_set_kbc_bitmask_from_kbc_list(const uint32_t N,
-		uint32_t *kbc_list, unsigned int* kbc_bitmask)
-{
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i < N) {
-		uint32_t kbc_bucket_id = kbc_list[i];
-		int kbc_bitmask_bucket = kbc_bucket_id / 32;
-		int kbc_bit_slot = kbc_bucket_id % 32;
-		unsigned int kbc_mask = 1 << kbc_bit_slot;
-		atomicOr(&kbc_bitmask[kbc_bitmask_bucket],kbc_mask);
-		//printf("kbc slot %u value %u SET mask bucket: %u bitslot:%u\n",i, kbc_bucket_id, kbc_bitmask_bucket, kbc_bit_slot);
-		// don't forget buckets needed for rx's.
-		kbc_bitmask_bucket = (kbc_bucket_id+1) / 32;
-		kbc_bit_slot = (kbc_bucket_id+1) % 32;
-		kbc_mask = 1 << kbc_bit_slot;
-		atomicOr(&kbc_bitmask[kbc_bitmask_bucket],kbc_mask);
-		//printf("kbc %u SET mask bucket: %u bitslot:%u\n",kbc_bucket_id+1, kbc_bitmask_bucket, kbc_bit_slot);
-	}
-}
-
-__global__
-void gpu_count_kbc_mask_bits(unsigned int* kbc_bitmask)
-{
-	int count = 0;
-	for (int kbc_bucket_id_L=0;kbc_bucket_id_L<kBC_NUM_BUCKETS;kbc_bucket_id_L++) {
-		int kbc_bitmask_bucket = kbc_bucket_id_L / 32;
-		int kbc_bit_slot = kbc_bucket_id_L % 32;
-		unsigned int kbc_mask = 1 << kbc_bit_slot;
-		if ((kbc_bitmask[kbc_bitmask_bucket] & kbc_mask) > 0) {
-			count++;
-		}
-	}
-	printf("Counted kbc masks: %u\n",count);
-}
-
-#include <set>
-
-void attack_method_kbc_list(uint32_t bits) {
-
-	const uint32_t NUM_L_KBCS = 208147; // T4 16-bit entry list size
-	std::cout << "ATTACK METHOD KBC LIST NUM: " << NUM_L_KBCS << std::endl;
-
-	/* Tried, really tried, but the bitmask slows it down too much: all those x's checking 4 billion times against
-	 * ram and then doing a simple xs/ys add. Even so it's 109ms just to filter the xs, compared to the kbc bit scan method
-	 * that's done with that phase and sorted into buckets at 40ms tops.
-	 * DrPlotter v0.1d
-	 * Attack it!
-	 * ATTACK METHOD KBC LIST NUM: 208147
-	 *   kbc list bytes size:832588
-	 *   kbc_bitmask:832588
-	 *   expected xs:106571264 size: 426285056
-	 *   chachas:106571264 size: 426285056
-	 * Generating kbc list (step:87)
-	 *   num uniques:208146 duplicates: 0
-	 * setting kbc mask
-	 *   gpu_chacha8_set_Lxs_into_kbc_bitmask results: 1 ms
-	 * Counted kbc masks: 411613
-	 * getting filtered xs/chachas list
-	 *   gpu_chacha8_filter_rxs_by_kbc_bitmask time: 109 ms
-	 *   xs count: 97190536
-	 * Freeing memory...
-	 *   compute only time: 287 ms
-	 * end.
-	 *
-	 */
-
-	using milli = std::chrono::milliseconds;
-	auto attack_start = std::chrono::high_resolution_clock::now();
-
-	// first we "read" the kbc list on host
-
-	const uint32_t EXPECTED_XS = NUM_L_KBCS*2*256;
-	uint32_t *kbc_list;
-	unsigned int *kbc_bitmask;
-	int *xs_count;
-	uint32_t *xs;
-	uint32_t *chachas;
-
-	std::cout << "   kbc list bytes size:" << (sizeof(uint32_t)*NUM_L_KBCS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMallocManaged(&kbc_list, sizeof(uint32_t)*NUM_L_KBCS));
-	std::cout << "   kbc_bitmask:" << (sizeof(unsigned int)*kBC_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&kbc_bitmask, kBC_NUM_BUCKETS*sizeof(unsigned int)));
-	CUDA_CHECK_RETURN(cudaMemset(kbc_bitmask, 0, kBC_NUM_BUCKETS*sizeof(unsigned int)));
-	std::cout << "   expected xs:" << EXPECTED_XS << " size: " << (sizeof(uint32_t)*EXPECTED_XS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&xs, EXPECTED_XS*sizeof(uint32_t)));
-	std::cout << "   chachas:" << EXPECTED_XS << " size: " << (sizeof(uint32_t)*EXPECTED_XS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&chachas, EXPECTED_XS*sizeof(uint32_t)));
-	CUDA_CHECK_RETURN(cudaMallocManaged(&xs_count, 1024)); // 1024 blocks maybe?
-
-	auto compute_only_start = std::chrono::high_resolution_clock::now();
-
-	int step = kBC_NUM_BUCKETS / NUM_L_KBCS;
-	std::cout << "Generating kbc list (step:" << step << ")" << std::endl;
-	for (int i=0;i<NUM_L_KBCS;i++) {
-		kbc_list[i] = i * step;
-	}
-	std::set<uint32_t> unique_kbcs(kbc_list, kbc_list + NUM_L_KBCS);
-	std::cout << "   num uniques:" << unique_kbcs.size() << " duplicates: " << (NUM_L_KBCS - unique_kbcs.size()) << std::endl;
-
-	std::cout << "setting kbc mask" << std::endl;
-	int blockSize = 256; // # of threads per block, maximum is 1024.
-	uint64_t calc_N = NUM_L_KBCS;
-	uint64_t calc_blockSize = blockSize;
-	uint64_t calc_numBlocks = (calc_N + calc_blockSize - 1) / calc_blockSize;
-	int numBlocks = calc_numBlocks;
-	auto time_start = std::chrono::high_resolution_clock::now();
-	gpu_set_kbc_bitmask_from_kbc_list<<<numBlocks, blockSize>>>(calc_N, kbc_list, kbc_bitmask);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto time_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   gpu_chacha8_set_Lxs_into_kbc_bitmask results: " << std::chrono::duration_cast<milli>(time_finish - time_start).count() << " ms\n";
-
-	gpu_count_kbc_mask_bits<<<1,1>>>(kbc_bitmask);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-	std::cout << "getting filtered xs/chachas list" << std::endl;
-	blockSize = 256; // # of threads per block, maximum is 1024.
-	calc_N = UINT_MAX;
-	calc_blockSize = blockSize;
-	calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16);
-	numBlocks = calc_numBlocks;
-	xs_count[0] = 0;
-	time_start = std::chrono::high_resolution_clock::now();
-	gpu_chacha8_filter_rxs_by_kbc_bitmask<<<numBlocks, blockSize>>>(calc_N,chacha_input,
-			kbc_bitmask, xs, chachas, &xs_count[0]);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	time_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   gpu_chacha8_filter_rxs_by_kbc_bitmask time: " << std::chrono::duration_cast<milli>(time_finish - time_start).count() << " ms\n";
-	std::cout << "   xs count: " << xs_count[0] << "\n";
-
-
-	auto compute_only_finish = std::chrono::high_resolution_clock::now();
-
-	std::cout << "Freeing memory..." << std::endl;
-	CUDA_CHECK_RETURN(cudaFree(kbc_bitmask));
-	CUDA_CHECK_RETURN(cudaFree(xs));
-	CUDA_CHECK_RETURN(cudaFree(chachas));
-
-	std::cout << "   compute only time: " << std::chrono::duration_cast<milli>(compute_only_finish - compute_only_start).count() << " ms\n";
-	std::cout << "end." << std::endl;
-
-}
-
-#endif /* ATTACK_METHOD_KBC_LIST_HPP_ */
diff --git a/attack_method_lxs.hpp b/attack_method_lxs.hpp
deleted file mode 100644
index ce2eb86..0000000
--- a/attack_method_lxs.hpp
+++ /dev/null
@@ -1,1268 +0,0 @@
-/*
- * attack_method_lxs.hpp
- *
- *  Created on: Nov 6, 2021
- *      Author: nick
- */
-
-#ifndef ATTACK_METHOD_LXS_HPP_
-#define ATTACK_METHOD_LXS_HPP_
-
-#include <cooperative_groups/memcpy_async.h> // memcpy_async
-
-const uint32_t CHACHA_NUM_BATCHES_BITS = 3;
-const uint32_t CHACHA_NUM_BATCHES = 1 << CHACHA_NUM_BATCHES_BITS;
-const uint32_t CHACHA_TOTAL_ENTRIES_PER_BATCH = UINT_MAX / CHACHA_NUM_BATCHES;
-const uint32_t CHACHA_BUCKET_BITS = 4; // ACROSS ALL BATCHES
-const uint32_t CHACHA_NUM_BUCKETS = (1 << CHACHA_BUCKET_BITS);
-const uint32_t CHACHA_BUCKET_DIVISOR = (1 << (32 - CHACHA_BUCKET_BITS));
-const uint32_t CHACHA_SPLIT_BUCKET_DIVISOR = (1 << (32 - CHACHA_BUCKET_BITS - CHACHA_NUM_BATCHES_BITS));
-const uint32_t CHACHA_MAX_ENTRIES_PER_BUCKET = (11 * (CHACHA_TOTAL_ENTRIES_PER_BATCH / CHACHA_NUM_BUCKETS)) / 10;
-const uint64_t CHACHA_OUT_MAX_ENTRIES_NEEDED = (CHACHA_NUM_BUCKETS * CHACHA_MAX_ENTRIES_PER_BUCKET);
-
-struct xchacha_pair {
-	uint32_t x;
-	uint32_t chacha;
-};
-
-#define CHECK_MATCH() \
-{ \
-	int16_t yr_kbc = Ry % kBC; \
-	int16_t yr_bid = yr_kbc / kC; \
-	int16_t yl_bid = yl_kbc / kC; \
-	int16_t formula_one = yr_bid - yl_bid; \
-	if (formula_one < 0) { \
-		formula_one += kB; \
-	} \
-	int16_t m = formula_one; \
-	if (m >= kB) { \
-		m -= kB; \
-	} \
-	if (m < 64) { \
-		int16_t yl_cid = yl_kbc % kC; \
-		int16_t yr_cid = yr_kbc % kC; \
-		int16_t parity = (kbc_bucket_id_L) % 2; \
-		int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; \
-		int16_t formula_two = yr_cid - yl_cid; \
-		if (formula_two < 0) { \
-			formula_two += kC; \
-		} \
-		if (formula_two == m2_parity_squared) { \
-			isMatch = true; \
-		} \
-	} \
-}
-
-// MASKED method for counter 10 bits, should help cache by 3x
-// (three 10-bit counters packed per 32-bit word: word = id/3, shift = 10*(id%3))
-#define KBCFILTER_mask(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
-		uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
-		int kbc_bitmask_bucket = local_kbc_bucket_id / 3; \
-		int kbc_bit_slot = 10 * (local_kbc_bucket_id % 3); \
-		unsigned int kbc_mask = 1 << kbc_bit_slot; \
-		unsigned int add = atomicAdd(&kbc_local_num_entries[kbc_bitmask_bucket],kbc_mask); \
-		unsigned int slot = (add >> kbc_bit_slot) & 0b01111111111; \
-		F1_Bucketed_kBC_Entry entry = { (x+i), (uint32_t) (y % kBC) }; \
-		if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-		uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-		kbc_local_entries[entries_address] = entry; \
-	} \
-}
-
-#define KBCFILTER(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
-		uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
-		int slot = atomicAdd(&kbc_local_num_entries[local_kbc_bucket_id],1); \
-		F1_Bucketed_kBC_Entry entry = { (x+i), (uint32_t) (y % kBC) }; \
-		if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-		uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-		kbc_local_entries[entries_address] = entry; \
-	} \
-}
-
-__global__
-void gpu_chacha8_get_k32_keystream_into_local_kbc_entries(const uint32_t N,
-		const __restrict__ uint32_t *input, F1_Bucketed_kBC_Entry *kbc_local_entries, unsigned int *kbc_local_num_entries,
-		uint32_t KBC_START, uint32_t KBC_END)
-{
-	uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
-	int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-	int stride = blockDim.x * gridDim.x;
-	const uint32_t end_n = N / 16; // 16 x's in each group
-	/*const uint32_t include_xs[64] = {602009779,2127221679,3186459061,443532047,1234434947,1652736830,396228306,464118917,
-			3981993340,3878862024,1730679522,3234011360,521197720,2635193875,2251292298,608281027,
-			1468569780,2075860307,2880258779,999340005,1240438978,4293399624,4226635802,1031429862,
-			2391120891,3533658526,3823422504,3983813271,4180778279,2403148863,2441456056,319558395,
-			2338010591,196206622,1637393731,853158574,2704638588,2368357012,1703808356,451208700,
-			2145291166,2741727812,3305809226,1748168268,415625277,3051905493,4257489502,1429077635,
-			2438113590,3028543211,3993396297,2678430597,458920999,889121073,3577485087,1822568056,
-			2222781147,1942400192,195608354,1460166215,2544813525,3231425778,2958837604,2710532969};*/
-
-	for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-		uint32_t x = x_group << 4;// *16;
-		uint32_t pos = x_group;
-
-		x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
-		x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
-		x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
-		x14 = input[14];x15 = input[15];
-
-		#pragma unroll
-		for (int i = 0; i < 4; i++) {
-			QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
-			QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
-		}
-
-		x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
-		x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
-		x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0;
-		x14 += input[14];x15 += input[15];
-
-		// convert to little endian/big endian whatever, chia needs it like this
-		BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
-		BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
-		BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
-
-		//uint64_t y = x0 << 6 + x >> 26;  for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-		//int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-		KBCFILTER(x0,0);KBCFILTER(x1,1);KBCFILTER(x2,2);KBCFILTER(x3,3);
-		KBCFILTER(x4,4);KBCFILTER(x5,5);KBCFILTER(x6,6);KBCFILTER(x7,7);
-		KBCFILTER(x8,8);KBCFILTER(x9,9);KBCFILTER(x10,10);KBCFILTER(x11,11);
-		KBCFILTER(x12,12);KBCFILTER(x13,13);KBCFILTER(x14,14);KBCFILTER(x15,15);
-	}
-}
-
-#define ATTACK_INTO_KBC_YS(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	int slot = atomicAdd(&kbc_global_num_entries_L[kbc_bucket_id],1); \
-	if (slot >= MAX_LXS_PER_KBC_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u\n", MAX_LXS_PER_KBC_BUCKET, slot); } \
-	uint32_t entries_address = kbc_bucket_id * MAX_LXS_PER_KBC_BUCKET + slot; \
-	kbc_global_Ly_entries_L[entries_address] = y; \
-	kbc_x_entries[entries_address] = (x + i); \
-}
-
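-// Hedged sketch (not part of the original pipeline): the packed-counter trick used
-// by the *_BITMASK style macros in this file, factored into a standalone device
-// helper. All names are illustrative only; a lane must stay below its max count
-// or the carry overflows into the neighbouring lane.
-__device__ __forceinline__ uint32_t packed_counter_reserve_slot(
-		unsigned int *counters,      // words of packed per-bucket counters
-		uint32_t bucket_id,          // logical bucket being appended to
-		uint32_t bits_per_counter,   // e.g. 5 bits -> counts 0..31
-		uint32_t counters_per_word,  // e.g. 6 five-bit counters per 32-bit word
-		uint32_t counter_mask)       // e.g. 0b011111 for 5 bits
-{
-	uint32_t word  = bucket_id / counters_per_word;
-	uint32_t shift = bits_per_counter * (bucket_id % counters_per_word);
-	// one atomicAdd of (1 << shift) bumps only this bucket's lane; the returned
-	// pre-add value contains our reserved slot for that lane
-	unsigned int old = atomicAdd(&counters[word], 1u << shift);
-	return (old >> shift) & counter_mask;
-}
-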
-// can hold 6 entries of 5 bits each = 5*6 = 30 bits.
-#define KBC_MASK_SHIFT 5
-#define KBC_MASK_MOD 6
-#define KBC_MASK_BITS 0b011111
-#define ATTACK_INTO_KBC_YS_BITMASK(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	uint32_t kbc_bitmask_bucket = kbc_bucket_id / KBC_MASK_MOD; \
-	uint32_t kbc_bitmask_shift = KBC_MASK_SHIFT * (kbc_bucket_id % KBC_MASK_MOD); \
-	uint32_t add = 1 << kbc_bitmask_shift; \
-	uint slot_value = atomicAdd(&kbc_global_num_entries_L[kbc_bitmask_bucket],add); \
-	uint slot = (slot_value >> kbc_bitmask_shift) & KBC_MASK_BITS; \
-	if (slot >= MAX_LXS_PER_KBC_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u\n", MAX_LXS_PER_KBC_BUCKET, slot); } \
-	uint32_t entries_address = kbc_bucket_id * MAX_LXS_PER_KBC_BUCKET + slot; \
-	kbc_global_Ly_entries_L[entries_address] = y; \
-	kbc_x_entries[entries_address] = (x + i); \
-}
-
-#define CHACHA_OUT(chacha_y,i) \
-{ \
-	chachas[x+i] = chacha_y; \
-}
-
-// uint16_t indJ = l_y / kC;
-// uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC);
-// OK, so we get all our Lx's and get their Ly's, and then compute their target Rys,
-// but then we have to write this to huge data of global_target_rys which is 38 bits.
-// even with 1 bit per entry it's too much data, unless we remove bottom bits and get some false positives.
-// 2^38 bits of bitmap is too much; dropping 3 bottom bits leaves 2^35 bits = 2^32 bytes...means we can do 4 Lx passes and 4 Rx passes...interesting...
-// will have to do binary tree search for rxs...fuck.
-__global__
-void gpu_chacha8_only_chacha_results(const uint32_t N,
-		const __restrict__ uint32_t *input,
-		uint32_t *chachas)
-{
-	uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
-	int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-	int stride = blockDim.x * gridDim.x;
-	const uint32_t end_n = N / 16; // 16 x's in each group
-
-	for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-		uint32_t x = x_group << 4;// *16;
-		uint32_t pos = x_group;
-
-		x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
-		x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
-		x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
-		x14 = input[14];x15 = input[15];
-
-		#pragma unroll
-		for (int i = 0; i < 4; i++) {
-			QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
-			QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
-		}
-
-		x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
-		x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
-		x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0;
-		x14 += input[14];x15 += input[15];
-
-		// convert to little endian/big endian whatever, chia needs it like this
-		BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
-		BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
-		BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
-
-		//uint64_t y = x0 << 6 + x >> 26;  for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-		//int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-		CHACHA_OUT(x0,0);CHACHA_OUT(x1,1);CHACHA_OUT(x2,2);CHACHA_OUT(x3,3);
CHACHA_OUT(x4,4);CHACHA_OUT(x5,5);CHACHA_OUT(x6,6);CHACHA_OUT(x7,7); - CHACHA_OUT(x8,8);CHACHA_OUT(x9,9);CHACHA_OUT(x10,10);CHACHA_OUT(x11,11); - CHACHA_OUT(x12,12);CHACHA_OUT(x13,13);CHACHA_OUT(x14,14);CHACHA_OUT(x15,15); - } -} - -#define CHACHA_BUCKET_OUT(chacha_y,i) \ -{ \ - uint32_t rx_bucket = chacha_y / CHACHA_BUCKET_DIVISOR; \ - if ((rx_bucket > CHACHA_BUCKET_RANGE_MIN) && (rx_bucket <= CHACHA_BUCKET_RANGE_MAX)) { \ - rx_bucket = rx_bucket - CHACHA_BUCKET_RANGE_MIN; \ - uint slot = atomicAdd(&shared_rx_counts[rx_bucket],1); \ - if (slot > MAX_ENTRIES_PER_LOCAL_BUCKET) printf("CHACHA BUCKET OUT SLOT OVERFLOW %u\n", slot); \ - chachas_buffer[rx_bucket * NUM_LOCAL_BUCKETS + slot] = chacha_y; \ - xs_buffer[rx_bucket * NUM_LOCAL_BUCKETS + slot] = (x+i); \ - } \ -} -//printf("PASSED FILTER local rx bucket %u slot %u\n", chacha_y, rx_bucket+CHACHA_BUCKET_MIN, rx_bucket, slot); \ - printf("chacha y: %u rx_bucket %u \n", chacha_y, rx_bucket); \ chachas[address] = chacha_y; \ - //rxs[address] = (x+i); \ - -#define ATTACK_WRITE_CHACHAS32_PAIR(chacha_y,i) \ -{ \ - xchacha_pair pair = { base_x + i, chacha_y }; \ - shared_chachas[threadIdx.x*32+i] = pair; \ - const uint32_t bucket_id = pair.chacha >> (32 - CHACHA_BUCKET_BITS); \ - atomicAdd(&shared_counts[bucket_id],1); \ -} - -// run with 128 blocksize, more doesn't matter. -template -__global__ -void gpu_chacha8_k32_write_chachas32_buckets(const uint32_t N, const uint32_t X_START, - const uint32_t CHACHA_MAX_PER_BUCKET, - const __restrict__ uint32_t *input, - xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - //__shared__ uint32_t datax[33*256]; // each thread (256 max) gets its own shared access starting at 32 byte boundary. - //uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - __shared__ xchacha_pair shared_chachas[128*32]; // *possibly* using 32 to prevent some bank conflicts can help, but don't thing so. 
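-	// Note added for clarity: shared_chachas is 128*32 = 4096 xchacha_pair entries
-	// at 8 bytes each = 32KB of shared memory, which is why blockDim.x is capped
-	// at 128 below. Also worth flagging: ATTACK_WRITE_CHACHAS32_PAIR stores
-	// pair.x = base_x + i with a block-uniform base_x, so every thread appears to
-	// record the same 32 x values; the per-thread x would be base_x + threadIdx.x*32 + i.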
- __shared__ uint shared_counts[NUM_BUCKETS]; - __shared__ uint global_counts[NUM_BUCKETS]; - - if (blockDim.x > 128) printf("MUST HAVE BLOCKSIZE 128 (RECOMMENDED) OR LESS, OR INCREASED SHARED MEM TO MORE\n"); - - uint32_t base_group = blockIdx.x * blockDim.x; - uint32_t base_x = base_group * 32; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 32; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - for (int i=threadIdx.x;i> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+0],0);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+1],1);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+2],2);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+3],3); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+4],4);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+5],5);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+6],6);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+7],7); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+8],8);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+9],9);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+10],10);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+11],11); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+12],12);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+13],13);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+14],14);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+15],15); - - pos += 1; - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - 
QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+0],16+0);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+1],16+1);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+2],16+2);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+3],16+3); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+4],16+4);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+5],16+5);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+6],16+6);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+7],16+7); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+8],16+8);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+9],16+9);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+10],16+10);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+11],16+11); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+12],16+12);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+13],16+13);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+14],16+14);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+15],16+15); - - } - __syncthreads(); - for (int i=threadIdx.x;i> (32 - CHACHA_BUCKET_BITS); // 16 buckets - uint slot = global_counts[bucket_id] + atomicAdd(&shared_counts[bucket_id],1); - if (slot > CHACHA_MAX_PER_BUCKET) printf("Overflow CHACHA_MAX_PER_BUCKET %u SLOT %u\n", CHACHA_MAX_PER_BUCKET, slot); - else xchachas_buckets[CHACHA_MAX_ENTRIES_PER_BUCKET * bucket_id + slot] = shared_chachas[i]; - } -} - -#define ATTACK_BUCKETBATCH_CHACHAS32_PAIR(chacha_y,i) \ -{ \ - if ((chacha_y >= BATCH_CHACHA_RANGE_MIN) && (chacha_y <= BATCH_CHACHA_RANGE_MAX)) { \ - xchacha_pair pair = { base_x + i, chacha_y }; \ - int slot = atomicAdd(&local_filter_count,1); \ - if (slot > MAX_SHARED_CHACHAS) printf("MAX_SHARED_CHACHAS %u OVERFLOW %u\n", MAX_SHARED_CHACHAS, slot); \ - shared_chachas[slot] = pair; \ - uint32_t split_bucket_id = (chacha_y - BATCH_CHACHA_RANGE_MIN) / CHACHA_SPLIT_BUCKET_DIVISOR; \ - atomicAdd(&shared_counts[split_bucket_id],1); \ - } \ -} - -// run with 128 blocksize, more doesn't matter. 
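-// Added sketch (not in the original): the macro above keeps only chacha values
-// inside the current batch window [BATCH_CHACHA_RANGE_MIN, BATCH_CHACHA_RANGE_MAX],
-// then fans them into equal-width split buckets. The split-bucket arithmetic,
-// restated as a hypothetical helper:
-__device__ __forceinline__ uint32_t chacha_split_bucket(uint32_t chacha_y,
-		uint32_t batch_range_min, uint32_t split_bucket_divisor)
-{
-	// caller has already range-checked chacha_y against the batch window
-	return (chacha_y - batch_range_min) / split_bucket_divisor;
-}
-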
-template <uint32_t NUM_SPLIT_BUCKETS>
-__global__
-void gpu_chacha8_k32_compute_chachas32_filter_buckets_bychachabatchrange(const uint32_t N,
-		const uint32_t BATCH_CHACHA_RANGE_MIN, const uint32_t BATCH_CHACHA_RANGE_MAX,
-		const uint32_t CHACHA_MAX_PER_SPLIT_BUCKET, const uint32_t CHACHA_SPLIT_BUCKET_DIVISOR,
-		const __restrict__ uint32_t *input,
-		xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts)
-{
-	uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local
-	//__shared__ uint32_t datax[33*256]; // each thread (256 max) gets its own shared access starting at 32 byte boundary.
-	//uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-	const uint32_t MAX_SHARED_CHACHAS = 128*8; // try to bring down as much as can
-	__shared__ xchacha_pair shared_chachas[MAX_SHARED_CHACHAS]; // *possibly* using 32 to prevent some bank conflicts can help, but don't think so.
-	__shared__ uint shared_counts[NUM_SPLIT_BUCKETS];
-	__shared__ uint global_counts[NUM_SPLIT_BUCKETS];
-	__shared__ uint local_filter_count;
-
-	//if (blockDim.x > 128) printf("MUST HAVE BLOCKSIZE 128 (RECOMMENDED) OR LESS, OR INCREASED SHARED MEM TO MORE\n");
-
-	uint32_t base_group = blockIdx.x * blockDim.x;
-	uint32_t base_x = base_group * 32;
-	int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-	const uint32_t end_n = N / 32; // 32 x's per thread
-	//printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group);
-
-	for (int i=threadIdx.x;i<NUM_SPLIT_BUCKETS;i+=blockDim.x) shared_counts[i] = 0;
-	if (threadIdx.x == 0) local_filter_count = 0;
-	__syncthreads();
-
-	if (x_group <= end_n) {
-		uint32_t pos = x_group * 2;
-		const int j = 0;
-
-		datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7];
-		datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11];
-		datax[j+12] = pos; datax[j+13] = 0; // pos never bigger than 32 bit pos >> 32;
-		datax[j+14] = input[14];datax[j+15] = input[15];
-
-	#pragma unroll
-		for (int i = 0; i < 4; i++) {
-			QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]);
-			QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]);
-			QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]);
-			QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]);
-		}
-
-		datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4];
-		datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9];
-		datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0;
-		datax[j+14] += input[14];datax[j+15] += input[15];
-
-		// convert to little endian/big endian whatever, chia needs it like this
-		BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]);
-		BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]);
-		BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]);
-
-		//uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-		//int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023
-		ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+0],0);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+1],1);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+2],2);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+3],3); -
ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+4],4);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+5],5);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+6],6);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+7],7); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+8],8);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+9],9);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+10],10);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+11],11); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+12],12);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+13],13);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+14],14);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+15],15); - - pos += 1; - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+0],16+0);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+1],16+1);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+2],16+2);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+3],16+3); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+4],16+4);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+5],16+5);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+6],16+6);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+7],16+7); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+8],16+8);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+9],16+9);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+10],16+10);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+11],16+11); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+12],16+12);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+13],16+13);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+14],16+14);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+15],16+15); - } - // at this point we have 128*32 = 4096 entries - // now we have to sort them into the 
buckets - // we already have the shared counts set from the ATTACK macro - __syncthreads(); - for (int i=threadIdx.x;i CHACHA_MAX_PER_SPLIT_BUCKET) printf("Overflow CHACHA_MAX_PER_BUCKET %u SLOT %u\n", CHACHA_MAX_PER_SPLIT_BUCKET, slot); - else xchachas_buckets[CHACHA_MAX_PER_SPLIT_BUCKET * split_bucket_id + slot] = shared_chachas[i]; - } -} - - -__global__ -void gpu_chacha8_tag_rxs_from_chacha(const uint32_t N, - const __restrict__ uint32_t *input, - const uint16_t *kbc_global_Ly_entries_L, const unsigned int *kbc_global_num_entries_L, const uint32_t MAX_LXS_PER_KBC_BUCKET, - uint32_t *chachas) -{ - int x = blockIdx.x * blockDim.x + threadIdx.x; - if (x < N) { - uint32_t chacha_y = chachas[x]; - uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); - int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; - if (kbc_bucket_id_L > 0) { - int num = kbc_global_num_entries_L[kbc_bucket_id_L]; - for (int nm=0;nm= kB) { - m -= kB; - } - if (m < 64) { - int16_t yl_cid = yl_kbc % kC; - int16_t yr_cid = yr_kbc % kC; - int16_t parity = (kbc_bucket_id_L) % 2; - int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; - int16_t formula_two = yr_cid - yl_cid; - if (formula_two < 0) { - formula_two += kC; - } - if (formula_two == m2_parity_squared) { - isMatch = true; - } - } - if (isMatch) { - chachas[x] = 0; - } - } - } - } - -} - -__global__ -void gpu_chacha8_filter_rxs_from_chacha(const uint32_t N, const uint32_t *chachas, uint32_t *rxs, int *rx_count) -{ - int x = blockIdx.x * blockDim.x + threadIdx.x; - if (x < N) { - uint32_t chacha_y = chachas[x]; - if (chacha_y == 0) { - int slot = atomicAdd(&rx_count[0], 1); - rxs[slot] = x; - } - } - -} - -__global__ -void gpu_chacha8_set_Lxs_into_kbc_ys(const uint32_t N, - const __restrict__ uint32_t *input, - uint16_t *kbc_global_Ly_entries_L, uint32_t *kbc_x_entries, unsigned int *kbc_global_num_entries_L, uint32_t MAX_LXS_PER_KBC_BUCKET) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - - for (uint32_t x_group = index; x_group < end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is 
>> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - ATTACK_INTO_KBC_YS(x0,0);ATTACK_INTO_KBC_YS(x1,1);ATTACK_INTO_KBC_YS(x2,2);ATTACK_INTO_KBC_YS(x3,3); - ATTACK_INTO_KBC_YS(x4,4);ATTACK_INTO_KBC_YS(x5,5);ATTACK_INTO_KBC_YS(x6,6);ATTACK_INTO_KBC_YS(x7,7); - ATTACK_INTO_KBC_YS(x8,8);ATTACK_INTO_KBC_YS(x9,9);ATTACK_INTO_KBC_YS(x10,10);ATTACK_INTO_KBC_YS(x11,11); - ATTACK_INTO_KBC_YS(x12,12);ATTACK_INTO_KBC_YS(x13,13);ATTACK_INTO_KBC_YS(x14,14);ATTACK_INTO_KBC_YS(x15,15); - } -} - -__global__ -void gpu_chacha8_set_Lxs_into_kbc_ys_mask(const uint32_t N, - const __restrict__ uint32_t *input, - uint16_t *kbc_global_Ly_entries_L, uint32_t *kbc_x_entries, unsigned int *kbc_global_num_entries_L, uint32_t MAX_LXS_PER_KBC_BUCKET) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - - for (uint32_t x_group = index; x_group < end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - ATTACK_INTO_KBC_YS_BITMASK(x0,0);ATTACK_INTO_KBC_YS_BITMASK(x1,1);ATTACK_INTO_KBC_YS_BITMASK(x2,2);ATTACK_INTO_KBC_YS_BITMASK(x3,3); - ATTACK_INTO_KBC_YS_BITMASK(x4,4);ATTACK_INTO_KBC_YS_BITMASK(x5,5);ATTACK_INTO_KBC_YS_BITMASK(x6,6);ATTACK_INTO_KBC_YS_BITMASK(x7,7); - ATTACK_INTO_KBC_YS_BITMASK(x8,8);ATTACK_INTO_KBC_YS_BITMASK(x9,9);ATTACK_INTO_KBC_YS_BITMASK(x10,10);ATTACK_INTO_KBC_YS_BITMASK(x11,11); - ATTACK_INTO_KBC_YS_BITMASK(x12,12);ATTACK_INTO_KBC_YS_BITMASK(x13,13);ATTACK_INTO_KBC_YS_BITMASK(x14,14);ATTACK_INTO_KBC_YS_BITMASK(x15,15); - } -} - - - -#define ATTACK_FILTER_RXS(chacha_y,i) \ -{ \ - uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \ - if ((kbc_bucket_id_L > KBC_MIN_RANGE) && (kbc_bucket_id_L <= KBC_MAX_RANGE)) { \ - int num = kbc_global_num_entries_L[kbc_bucket_id_L]; \ - for (int nm=0;nm> 26); \ - int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \ - isMatch = false; \ - if (kbc_bucket_id_L > 0) { \ - uint64_t Ly = 
kbc_global_Ly_entries_L[kbc_bucket_id_L * MAX_LXS_PER_KBC_BUCKET]; \ - if (Ly > 0) { \ - CHECK_MATCH(); \ - } \ - } \ - if (isMatch) { \ - int slot = atomicAdd(&rx_count[0],1); \ - rxs[slot] = (x+i); \ - } \ -} - - -__global__ -void gpu_chacha8_filter_rxs(const uint32_t N, - const __restrict__ uint32_t *input, - const uint16_t* __restrict__ kbc_global_Ly_entries_L, const unsigned int* __restrict__ kbc_global_num_entries_L, uint32_t MAX_LXS_PER_KBC_BUCKET, - uint32_t * __restrict__ rxs, int *rx_count, - const uint32_t KBC_MIN_RANGE, const uint32_t KBC_MAX_RANGE) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - - - for (uint32_t x_group = index; x_group <= end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - bool isMatch = false; - ATTACK_FILTER_RXS(x0,0);ATTACK_FILTER_RXS(x1,1);ATTACK_FILTER_RXS(x2,2);ATTACK_FILTER_RXS(x3,3); - ATTACK_FILTER_RXS(x4,4);ATTACK_FILTER_RXS(x5,5);ATTACK_FILTER_RXS(x6,6);ATTACK_FILTER_RXS(x7,7); - ATTACK_FILTER_RXS(x8,8);ATTACK_FILTER_RXS(x9,9);ATTACK_FILTER_RXS(x10,10);ATTACK_FILTER_RXS(x11,11); - ATTACK_FILTER_RXS(x12,12);ATTACK_FILTER_RXS(x13,13);ATTACK_FILTER_RXS(x14,14);ATTACK_FILTER_RXS(x15,15); - } -} - -__global__ -void gpu_chacha8_filter_rxs_from_bucket_batch_async( - const uint32_t N, - const xchacha_pair* __restrict__ xchachas, - const uint16_t* __restrict__ kbc_global_Ly_entries_L, - const unsigned int* __restrict__ kbc_global_num_entries_L, - uint32_t MAX_LXS_PER_KBC_BUCKET, - uint32_t * __restrict__ rxs, - int *rx_count) -{ - __shared__ uint16_t copy_Ly_entries[64]; - - cuda::barrier bar; - init(&bar, 1); - - int num; - int i = blockIdx.x*blockDim.x+threadIdx.x; - if (i < N) { - xchacha_pair entry = xchachas[i]; - uint64_t Ry = (((uint64_t) entry.chacha) << 6) + (entry.x >> 26); - int kbc_bucket_id_R = (uint32_t (Ry / kBC)); - if (kbc_bucket_id_R > 0) { - int kbc_bucket_id_L = kbc_bucket_id_R - 1; - //printf("entry x:%u chacha:%u\n", entry.x, entry.chacha, kbc_bucket_id_L); - num = 
kbc_global_num_entries_L[kbc_bucket_id_L]; - cuda::memcpy_async(©_Ly_entries[0], - &kbc_global_Ly_entries_L[kbc_bucket_id_L * MAX_LXS_PER_KBC_BUCKET], sizeof(uint16_t)*num, bar); - bar.arrive_and_wait(); - for (int nm=0;nm> 26); - int kbc_bucket_id_R = (uint32_t (Ry / kBC)); - if (kbc_bucket_id_R > 0) { - int kbc_bucket_id_L = kbc_bucket_id_R - 1; - //printf("entry x:%u chacha:%u\n", entry.x, entry.chacha, kbc_bucket_id_L); - //int num = kbc_global_num_entries_L[kbc_bucket_id_L]; - - //uint num = kbc_global_num_entries_L[kbc_bucket_id_L]; - uint32_t kbc_bitmask_bucket = kbc_bucket_id_L / KBC_MASK_MOD; - uint32_t kbc_bitmask_shift = KBC_MASK_SHIFT * (kbc_bucket_id_L % KBC_MASK_MOD); - uint slot_value =kbc_global_num_entries_L[kbc_bitmask_bucket]; - uint num = (slot_value >> kbc_bitmask_shift) & KBC_MASK_BITS; - for (int nm=0;nm> 26); \ - int kbc_bucket_id = (uint32_t (y / kBC)); \ - int kbc_bitmask_bucket = kbc_bucket_id / 32; \ - int kbc_bit_slot = kbc_bucket_id % 32; \ - unsigned int kbc_mask = 1 << kbc_bit_slot; \ - atomicOr(&kbc_global_bitmask[kbc_bitmask_bucket],kbc_mask); \ -} - -#define ATTACK_FILTER_BITMASK_batch64(chacha_y,i) \ -{ \ - uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \ - if (kbc_bucket_id_L > 0) { \ - int kbc_bitmask_bucket = kbc_bucket_id_L / 32; \ - int kbc_bit_slot = kbc_bucket_id_L % 32; \ - unsigned int kbc_mask = 1 << kbc_bit_slot; \ - unsigned int kbc_value = kbc_global_bitmask[kbc_bitmask_bucket]; \ - if ((kbc_mask & kbc_value) > 0) { \ - uint32_t batch_id = kbc_bucket_id_L >> (32-6); \ - int slot = atomicAdd(&rx_count[batch_id],1); \ - rxs[batch_id * RX_MAX_ENTRIES_PER_BATCH + slot] = (x+i); \ - } \ - } \ -} - -#define ATTACK_FILTER_BITMASK(chacha_y,i) \ -{ \ - uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \ - if (kbc_bucket_id_L > 0) { \ - int kbc_bitmask_bucket = kbc_bucket_id_L / 32; \ - int kbc_bit_slot = kbc_bucket_id_L % 32; \ - unsigned int kbc_mask = 1 << kbc_bit_slot; \ - unsigned int kbc_value = kbc_global_bitmask[kbc_bitmask_bucket]; \ - if ((kbc_mask & kbc_value) > 0) { \ - int slot = atomicAdd(&rx_local_count,1); \ - shared_rxs[slot] = (x+i); \ - } \ - } \ -} - -#define ATTACK_FILTER_BITMASK_origbeforeaddingshared(chacha_y,i) \ -{ \ - uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \ - if (kbc_bucket_id_L > 0) { \ - int kbc_bitmask_bucket = kbc_bucket_id_L / 32; \ - int kbc_bit_slot = kbc_bucket_id_L % 32; \ - unsigned int kbc_mask = 1 << kbc_bit_slot; \ - unsigned int kbc_value = kbc_global_bitmask[kbc_bitmask_bucket]; \ - if ((kbc_mask & kbc_value) > 0) { \ - int slot = atomicAdd(&rx_count[0],1); \ - rxs[slot] = (x+i); \ - } \ - } \ -} - -__global__ -void gpu_chacha8_set_Lxs_into_kbc_bitmask(const uint32_t N, - const __restrict__ uint32_t *input, - unsigned int* kbc_global_bitmask) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - - for (uint32_t x_group = index; x_group <= end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; 
// pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - ATTACK_SET_BITMASK(x0,0);ATTACK_SET_BITMASK(x1,1);ATTACK_SET_BITMASK(x2,2);ATTACK_SET_BITMASK(x3,3); - ATTACK_SET_BITMASK(x4,4);ATTACK_SET_BITMASK(x5,5);ATTACK_SET_BITMASK(x6,6);ATTACK_SET_BITMASK(x7,7); - ATTACK_SET_BITMASK(x8,8);ATTACK_SET_BITMASK(x9,9);ATTACK_SET_BITMASK(x10,10);ATTACK_SET_BITMASK(x11,11); - ATTACK_SET_BITMASK(x12,12);ATTACK_SET_BITMASK(x13,13);ATTACK_SET_BITMASK(x14,14);ATTACK_SET_BITMASK(x15,15); - } -} - - - -__global__ -void gpu_chacha8_filter_rxs_by_kbc_bitmask(const uint32_t N, - const __restrict__ uint32_t *input, - const unsigned int* __restrict__ kbc_global_bitmask, - uint32_t * __restrict__ rxs, int *rx_count, - const uint32_t RX_BATCHES, const uint32_t RX_MAX_ENTRIES_PER_BATCH) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - __shared__ uint32_t shared_rxs[1024]; - __shared__ int rx_local_count; - __shared__ int global_slot; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - if (threadIdx.x == 0) { - rx_local_count = 0; - } - for (uint32_t x_group = index; x_group <= end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - __syncthreads(); - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - 
BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - ATTACK_FILTER_BITMASK(x0,0);ATTACK_FILTER_BITMASK(x1,1);ATTACK_FILTER_BITMASK(x2,2);ATTACK_FILTER_BITMASK(x3,3); - ATTACK_FILTER_BITMASK(x4,4);ATTACK_FILTER_BITMASK(x5,5);ATTACK_FILTER_BITMASK(x6,6);ATTACK_FILTER_BITMASK(x7,7); - ATTACK_FILTER_BITMASK(x8,8);ATTACK_FILTER_BITMASK(x9,9);ATTACK_FILTER_BITMASK(x10,10);ATTACK_FILTER_BITMASK(x11,11); - ATTACK_FILTER_BITMASK(x12,12);ATTACK_FILTER_BITMASK(x13,13);ATTACK_FILTER_BITMASK(x14,14);ATTACK_FILTER_BITMASK(x15,15); - - __syncthreads(); - if (threadIdx.x == 0) { - global_slot = atomicAdd(&rx_count[0],rx_local_count); - rx_local_count = 0; - } - __syncthreads(); - for (int i=threadIdx.x;i(alloc_finish - attack_start).count() << " ms\n"; - - auto compute_only_start = std::chrono::high_resolution_clock::now(); - - int blockSize; // # of threads per block, maximum is 1024. - uint64_t calc_N; - uint64_t calc_blockSize; - uint64_t calc_numBlocks; - int numBlocks; - -/* std::cout << " gpu_chacha8_set_Lxs_into_kbc_bitmask \n"; - int blockSize = 16; // # of threads per block, maximum is 1024. - uint64_t calc_N = num_lxs; - uint64_t calc_blockSize = blockSize; - uint64_t calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - int numBlocks = calc_numBlocks; - - auto chacha_start = std::chrono::high_resolution_clock::now(); - gpu_chacha8_set_Lxs_into_kbc_bitmask<<>>(calc_N, chacha_input, - device_global_kbc_num_entries_L); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto chacha_finish = std::chrono::high_resolution_clock::now(); - std::cout << " - gpu_chacha8_set_Lxs_into_kbc_bitmask results: " << std::chrono::duration_cast(chacha_finish - chacha_start).count() << " ms\n"; - - F1_Bucketed_kBC_Entry *local_kbc_entries = (F1_Bucketed_kBC_Entry *) rx_match_list; - chacha_start = std::chrono::high_resolution_clock::now(); - // 1) gpu scan kbs into (F1_Bucketed_kBC_Entry *) bufferA - //std::cout << " Generating F1 results into kbc buckets..."; - blockSize = 128; // # of threads per block, maximum is 1024. - calc_N = UINT_MAX; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - numBlocks = calc_numBlocks; - //std::cout << " Block configuration: [blockSize:" << blockSize << " numBlocks:" << numBlocks << "]" << std::endl; - // don't forget to clear counter...will only use a portion of this memory so should be fast access. - - CUDA_CHECK_RETURN(cudaMemset(device_global_kbc_num_entries_L, 0, 10000000*sizeof(int))); - gpu_chacha8_get_k32_keystream_into_local_kbc_entries<<>>(calc_N, chacha_input, - local_kbc_entries, device_global_kbc_num_entries_L, 0, 2000000); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - chacha_finish = std::chrono::high_resolution_clock::now(); - std::cout << " - gpu_chacha8_get_k32_keystream_into_local_kbc_entries results: " << std::chrono::duration_cast(chacha_finish - chacha_start).count() << " ms\n"; - - - std::cout << " gpu_chacha8_filter_rxs_by_kbc_bitmask \n"; - blockSize = 256; // # of threads per block, maximum is 1024. 
- calc_N = UINT_MAX;
- calc_blockSize = blockSize;
- calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16);
- numBlocks = calc_numBlocks;
-
- chacha_start = std::chrono::high_resolution_clock::now();
- gpu_chacha8_filter_rxs_by_kbc_bitmask<<<numBlocks, blockSize>>>(calc_N, chacha_input,
-		device_global_kbc_num_entries_L,
-		rx_match_list, rx_match_count,
-		RX_BATCHES, RX_MAX_ENTRIES_PER_BATCH);
- CUDA_CHECK_RETURN(cudaDeviceSynchronize());
- chacha_finish = std::chrono::high_resolution_clock::now();
- std::cout << " gpu_chacha8_filter_rxs_by_kbc_bitmask results: " << std::chrono::duration_cast<std::chrono::milliseconds>(chacha_finish - chacha_start).count() << " ms\n";
- std::cout << " found " << rx_match_count[0] << " RXS" << std::endl;
-
-*/
-
- // FIRST SET LXS into global memory, these stay put for each chacha round
- blockSize = 128; // # of threads per block, maximum is 1024.
- calc_N = num_lxs;
- calc_blockSize = blockSize;
- calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16);
- numBlocks = calc_numBlocks;
-
- std::cout << " gpu_chacha8_set_Lxs_into_kbc_ys num:" << calc_N << std::endl;
- auto lxintokbc_start = std::chrono::high_resolution_clock::now();
- gpu_chacha8_set_Lxs_into_kbc_ys_mask<<<numBlocks, blockSize>>>(calc_N, chacha_input,
-		kbc_Ly_entries, kbc_x_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET);
-
- /* Doing chacha batch 7
-    gpu_chacha8_k32_write_chachas32_buckets results: 32 ms
-    chacha Rxs time: 37 ms
-    found 90582467 matches
-    Freeing memory...
-    total chachas time: 248 ms
-    total Rxs time: 302 ms
-    compute only time: 654 ms attack total time: 692 ms */
- CUDA_CHECK_RETURN(cudaDeviceSynchronize());
- auto lxintokbc_finish = std::chrono::high_resolution_clock::now();
- std::cout << " gpu_chacha8_set_Lxs_into_kbc_ys time: " << std::chrono::duration_cast<std::chrono::milliseconds>(lxintokbc_finish - lxintokbc_start).count() << " ms\n";
- gpu_get_max_counts_from_counter_list<<<1,1024>>>(device_global_kbc_num_entries_L, kBC_NUM_BUCKETS, false);
-
- int64_t total_chacha_ms = 0;
- int64_t total_rx_ms = 0;
- for (uint64_t chacha_batch_id = 0; chacha_batch_id < CHACHA_NUM_BATCHES; chacha_batch_id++) {
-	std::cout << "Doing chacha batch " << chacha_batch_id << std::endl;
-	uint64_t BATCH_CHACHA_DIVISOR = (1ull << (32 - CHACHA_NUM_BATCHES_BITS));
-	uint64_t BATCH_CHACHA_RANGE_MIN = ((uint64_t) (chacha_batch_id + 0)) * BATCH_CHACHA_DIVISOR;
-	uint64_t BATCH_CHACHA_RANGE_MAX = ((uint64_t) (chacha_batch_id + 1)) * BATCH_CHACHA_DIVISOR - 1; // use -1 since range is inclusive, also helps stay in 32-bit range rather than wrap to 0 for last batch
-	//if (chacha_batch_id == CHACHA_NUM_BATCHES - 1) BATCH_CHACHA_RANGE_MAX = UINT_MAX;
-
-	//std::cout << " BATCH_CHACHA_DIVISOR : " << BATCH_CHACHA_DIVISOR << std::endl;
-	//std::cout << " BATCH_CHACHA_RANGE : " << BATCH_CHACHA_RANGE_MIN << " <-> " << BATCH_CHACHA_RANGE_MAX << std::endl;
-	//std::cout << " BATCH_CHACHA_TOTAL_ENTRIES : " << CHACHA_TOTAL_ENTRIES_PER_BATCH << std::endl;
-	//std::cout << " CHACHA_MAX_ENTRIES_PER_BUCKET : " << CHACHA_MAX_ENTRIES_PER_BUCKET << std::endl;
-	//std::cout << " CHACHA_SPLIT_BUCKET_DIVISOR : " << CHACHA_SPLIT_BUCKET_DIVISOR << std::endl;
-
-
-	blockSize = 128; // # of threads per block, maximum is 1024.
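-	// Note added for clarity: each thread of the kernel launched below emits 32
-	// x values (two 16-word chacha blocks), hence the grid size of
-	// ceil(N / (blockSize * 32)) computed next; with N = UINT_MAX and
-	// blockSize = 128 that is about 2^20 blocks.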
-	calc_N = UINT_MAX;//CHACHA_TOTAL_ENTRIES_PER_BATCH;
-	uint32_t CHACHA_X_START = 0;//chacha_batch_id * calc_N;
-	calc_blockSize = blockSize;
-	calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 32);
-	numBlocks = calc_numBlocks;
-	CUDA_CHECK_RETURN(cudaMemset(xchachas_bucket_counts, 0, CHACHA_NUM_BUCKETS*sizeof(int)));
-	auto chacha_start = std::chrono::high_resolution_clock::now();
-	//std::cout << " calc_N : " << calc_N << " numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl;
-	gpu_chacha8_k32_compute_chachas32_filter_buckets_bychachabatchrange<CHACHA_NUM_BUCKETS><<<numBlocks, blockSize>>>(calc_N,
-			BATCH_CHACHA_RANGE_MIN, BATCH_CHACHA_RANGE_MAX,
-			CHACHA_MAX_ENTRIES_PER_BUCKET, CHACHA_SPLIT_BUCKET_DIVISOR,
-			chacha_input,
-			xchachas, xchachas_bucket_counts);
-
-
-	//gpu_chacha8_only_chacha_results<<<numBlocks, blockSize>>>(calc_N, chacha_input,
-	//		chachas);
-	//gpu_chacha8_k32_write_chachas32_buckets<<<numBlocks, blockSize>>>(calc_N, CHACHA_X_START,
-	//		CHACHA_MAX_ENTRIES_PER_BUCKET,
-	//		chacha_input,
-	//		xchachas, xchachas_bucket_counts);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto chacha_finish = std::chrono::high_resolution_clock::now();
-	total_chacha_ms += std::chrono::duration_cast<std::chrono::milliseconds>(chacha_finish - chacha_start).count();
-	std::cout << " gpu_chacha8_k32_compute_chachas32_filter_buckets_bychachabatchrange results: " << std::chrono::duration_cast<std::chrono::milliseconds>(chacha_finish - chacha_start).count() << " ms\n";
-	//gpu_get_max_counts_from_counter_list<<<1,1>>>(xchachas_bucket_counts, CHACHA_NUM_BUCKETS, true);
-	auto chacha_rs_start = std::chrono::high_resolution_clock::now();
-	for (uint chacha_bucket_id=0;chacha_bucket_id<CHACHA_NUM_BUCKETS;chacha_bucket_id++) {
-		gpu_chacha8_filter_rxs_from_bucket_batch<<<numBlocks, blockSize>>>(
-			calc_N,
-			&xchachas[chacha_bucket_id],
-			kbc_Ly_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET,
-			rx_match_list, rx_match_count);
-		//CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	}
-
-	/*
-	blockSize = 128; // # of threads per block, maximum is 1024.
- calc_N = UINT_MAX/CHACHA_NUM_BATCHES; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - numBlocks = calc_numBlocks; - - - std::cout << "Doing " << NUM_KBC_RANGE_BATCHES << " range batches of gpu_chacha_filter_rxs" << std::endl; - for (int kbc_range_batch=0;kbc_range_batch < NUM_KBC_RANGE_BATCHES; kbc_range_batch++) { - const uint32_t KBC_MIN_RANGE = ((kbc_range_batch+0) * 18188177) / (NUM_KBC_RANGE_BATCHES); - const uint32_t KBC_MAX_RANGE = ((kbc_range_batch+1) * 18188177) / (NUM_KBC_RANGE_BATCHES); - std::cout << "range KBC_MIN: " << KBC_MIN_RANGE << " - " << KBC_MAX_RANGE << std::endl; - gpu_chacha8_filter_rxs<<>>(calc_N, chacha_input, - kbc_Ly_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET, - rx_match_list, rx_match_count, - KBC_MIN_RANGE, KBC_MAX_RANGE); - } -*/ - - //calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize); numBlocks = calc_numBlocks; - //gpu_chacha8_tag_rxs_from_chacha<<>>(calc_N, chacha_input, - // kbc_Ly_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET, - // chachas); - //gpu_chacha8_filter_rxs_from_chacha<<>>(calc_N,chachas,rx_match_list,rx_match_count); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto chacha_rs_finish = std::chrono::high_resolution_clock::now(); - total_rx_ms += std::chrono::duration_cast(chacha_rs_finish - chacha_rs_start).count(); - std::cout << " chacha Rxs time: " << std::chrono::duration_cast(chacha_rs_finish - chacha_rs_start).count() << " ms\n"; - std::cout << " found " << rx_match_count[0] << " matches" << std::endl; - - - } - - - - - - - auto compute_only_finish = std::chrono::high_resolution_clock::now(); - - std::cout << "Freeing memory..." << std::endl; - CUDA_CHECK_RETURN(cudaFree(kbc_Ly_entries)); - CUDA_CHECK_RETURN(cudaFree(device_global_kbc_num_entries_L)); - - auto attack_finish = std::chrono::high_resolution_clock::now(); - std::cout << " total chachas time: " << total_chacha_ms << " ms\n"; - std::cout << " total Rxs time: " << total_rx_ms << " ms\n"; - std::cout << " compute only time: " << std::chrono::duration_cast(compute_only_finish - compute_only_start).count() << " ms\n"; - std::cout << " attack total time: " << std::chrono::duration_cast(attack_finish - attack_start).count() << " ms\n"; - std::cout << "end." << std::endl; -} - - - - - -#endif /* ATTACK_METHOD_LXS_HPP_ */ diff --git a/attack_method_lxs2.hpp b/attack_method_lxs2.hpp deleted file mode 100644 index a8529b5..0000000 --- a/attack_method_lxs2.hpp +++ /dev/null @@ -1,1766 +0,0 @@ -/* - * attack_method_lxs2.hpp - * - * Created on: Nov 8, 2021 - * Author: nick - */ - -#ifndef ATTACK_METHOD_LXS2_HPP_ -#define ATTACK_METHOD_LXS2_HPP_ - -//#include -//#include -//#include - - - -struct xchacha_pair { - uint32_t x; - uint32_t chacha; -}; - -// TODO: try increasing the buckets as we go down the iterations -// suspect we can benefit more from cache when flipping back and forth vs the chacha generation -// which likely eats a lot of the cache? Or I had a huge bug somewhere. 
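-// Added sanity check (not in the original source): the sort kernels below move
-// these pairs around constantly, so it matters that they pack to exactly 8 bytes,
-// and that a phase-1 bucket holds 16x a phase-2 bucket (and phase-2 16x phase-3),
-// since each pass fans 4 more chacha bits out into 16 sub-buckets.
-static_assert(sizeof(xchacha_pair) == 8, "xchacha_pair should pack to 8 bytes");
-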
- -const uint32_t DUMBSORT_BUCKET_BITS = 4; -const uint32_t DUMBSORT_NUM_BUCKETS = 1 << DUMBSORT_BUCKET_BITS; -const uint32_t PHASE_3_DUMBSORT_MAX_PER_BUCKET = 42;//32; -const uint32_t PHASE_2_DUMBSORT_MAX_PER_BUCKET = 42*16;//512; -const uint32_t PHASE_1_DUMBSORT_MAX_PER_BUCKET = 42*16*16;//8192; // 8601 was largest found, using a multiple of 256 so going for 8704 -const uint32_t DUMBSORT_BATCHES_TILE_SPACE = PHASE_1_DUMBSORT_MAX_PER_BUCKET * DUMBSORT_NUM_BUCKETS; -const uint32_t GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK = 65536; -const uint32_t DUMBSORT_SPACE_NEEDED_FOR_SCRATCH = ((1 << (32-6)) / GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK) * DUMBSORT_BATCHES_TILE_SPACE; - - -//////////////////////////////////////////////////////////////////////////////// -// Monolithic bitonic sort kernel for short arrays fitting into shared memory -//////////////////////////////////////////////////////////////////////////////// -#include - -namespace cg = cooperative_groups; -#define SHARED_SIZE_LIMIT 1024U - -__device__ inline void Comparator( - uint &keyA, - uint &valA, - uint &keyB, - uint &valB, - uint dir -) -{ - uint t; - - if ((keyA > keyB) == dir) - { - t = keyA; - keyA = keyB; - keyB = t; - t = valA; - valA = valB; - valB = t; - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Monolithic Bacther's sort kernel for short arrays fitting into shared memory -//////////////////////////////////////////////////////////////////////////////// -__global__ void oddEvenMergeSortShared(uint32_t *chachas, uint32_t *out_chachas, uint32_t *out_xs) -{ - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - //Shared memory storage for one or more small vectors - __shared__ uint s_key[SHARED_SIZE_LIMIT]; - __shared__ uint s_val[SHARED_SIZE_LIMIT]; - - uint dir = 1; - uint arrayLength = 1024; - - //Offset to the beginning of subbatch and load data - chachas += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - out_chachas += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - out_xs += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + 0] = chachas[ 0]; - s_val[threadIdx.x + 0] = blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = chachas[(SHARED_SIZE_LIMIT / 2)]; - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2); - - for (uint size = 2; size <= arrayLength; size <<= 1) - { - uint stride = size / 2; - uint offset = threadIdx.x & (stride - 1); - - { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator( - s_key[pos + 0], s_val[pos + 0], - s_key[pos + stride], s_val[pos + stride], - dir - ); - stride >>= 1; - } - - for (; stride > 0; stride >>= 1) - { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - - if (offset >= stride) - Comparator( - s_key[pos - stride], s_val[pos - stride], - s_key[pos + 0], s_val[pos + 0], - dir - ); - } - } - - cg::sync(cta); - out_chachas[ 0] = s_key[threadIdx.x + 0]; - out_xs[ 0] = s_val[threadIdx.x + 0]; - out_chachas[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - out_xs[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - -} - -// threads must be SHARED_SIZE_LIMIT/2 -__global__ void nickSortShared(uint32_t *chachas, uint32_t *out_chachas, uint32_t *out_xs) -{ - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - //Shared memory storage for one or more short 
vectors - __shared__ uint order[SHARED_SIZE_LIMIT*2]; // we're going to use top 16 and bottom 16 to store indexes - __shared__ uint bucket_counts[1024]; - __shared__ uint s_key[SHARED_SIZE_LIMIT]; // the sort values - __shared__ uint s_val[SHARED_SIZE_LIMIT]; // stores the xs - __shared__ uint sorted_val[SHARED_SIZE_LIMIT]; - __shared__ uint sorted_key[SHARED_SIZE_LIMIT]; - - uint dir = 1; - uint arrayLength = 1024; - - //Offset to the beginning of subbatch and load data - chachas += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - out_chachas += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - out_xs += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - uint32_t chacha = chachas[0]; - uint16_t index = threadIdx.x; - bucket_counts[threadIdx.x] = 0; - order[threadIdx.x] = 0; - order[threadIdx.x + 1024] = 0; - //order[threadIdx.x*2+1] = 0; - s_key[threadIdx.x + 0] = chachas[ 0]; - s_val[threadIdx.x + 0] = blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - - cg::sync(cta); - uint16_t bucket_id = chacha >> (32 - 10); - int add = atomicAdd(&bucket_counts[bucket_id],1); - if (add < 4) { - uint pos = bucket_id * 2 + add; - uint value = index << ((pos & 0b01)*16); - atomicAdd(&order[pos], value); - } - // from [ 1 3 2 0 0 1 0 2 ] - // to> [ 0 1 4 6 6 6 7 7 ] - // then each thread, reads its scan offset, and that's the shared start + the counts to copy into global memory - // [ 1 3 2 0 0 1 0 2 ] - // [ 0 1 3 5 5 0 1 0 ] - // [ 0 1 4 6 - - - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - - - if (threadIdx.x == 0) { - printf("buckets counts:\n"); - for (int i=0;i>1; d > 0; d >>= 1) // build sum in place up the tree - { - __syncthreads(); - if (thid < d) { - int ai = offset*(2*thid+1)-1; - int bi = offset*(2*thid+2)-1; - temp[bi] += temp[ai]; - } - offset *= 2; - } - - if (thid == 0) { temp[n - 1] = 0; } // clear the last element - for (int d = 1; d < n; d *= 2) // traverse down tree & build scan - { - offset >>= 1; - __syncthreads(); - if (thid < d) { - int ai = offset*(2*thid+1)-1; - int bi = offset*(2*thid+2)-1; - float t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; - } - } - __syncthreads(); - - g_odata[2*thid] = temp[2*thid]; - // write results to device memory - g_odata[2*thid+1] = temp[2*thid+1]; -} - -// threads must be SHARED_SIZE_LIMIT/2 -__global__ void bitonicSortShared(uint32_t *chachas, uint32_t *out_chachas, uint32_t *out_xs) -{ - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - //Shared memory storage for one or more short vectors - __shared__ uint s_key[SHARED_SIZE_LIMIT]; // the sort values - __shared__ uint s_val[SHARED_SIZE_LIMIT]; // stores the xs - - - uint dir = 1; - uint arrayLength = 1024; - - //Offset to the beginning of subbatch and load data - chachas += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - out_chachas += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - out_xs += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + 0] = chachas[ 0]; - s_val[threadIdx.x + 0] = blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = chachas[(SHARED_SIZE_LIMIT / 2)]; - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2); - - //__syncthreads(); - //if (threadIdx.x == 0) { - // printf("doing bitonic sort, start list: \n"); - // for (int i=0;i 0; stride >>= 1) - { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator( - s_key[pos + 0], 
s_val[pos + 0], - s_key[pos + stride], s_val[pos + stride], - ddd - ); - } - } - - //ddd == dir for the last bitonic merge step - { - for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) - { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator( - s_key[pos + 0], s_val[pos + 0], - s_key[pos + stride], s_val[pos + stride], - dir - ); - } - } - - cg::sync(cta); - - // should be sorted now. - out_chachas[ 0] = s_key[threadIdx.x + 0]; - out_xs[ 0] = s_val[threadIdx.x + 0]; - out_chachas[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - out_xs[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - - //__syncthreads(); - //if (threadIdx.x == 0) { - // printf("results sort:\n"); - // for (int i=0;i= NUM_BUCKETS) printf("BUCKET OUT OF RANGE ERROR: %u", bucket_id); - - int slot = atomicAdd(&buffer_counts[bucket_id],1); - if (slot > PHASE_1_DUMBSORT_MAX_PER_BUCKET) printf("PHASE 1 DUMBSORT OVERFLOW: %u\n", slot); - - uint32_t results_address = GLOBAL_TILE_START + bucket_id * PHASE_1_DUMBSORT_MAX_PER_BUCKET + slot; - if (results_address < DUMBSORT_SPACE_NEEDED_FOR_SCRATCH) { - results[results_address] = entry; - } else { - printf("results address overflow %u - global start pos: %u bucket %u slot %u DUMBSORT_SPACE_NEEDED_FOR_SCRATCH: %u\n", - results_address, GLOBAL_TILE_START, bucket_id, slot, DUMBSORT_SPACE_NEEDED_FOR_SCRATCH); - } - } - - __syncthreads(); - //if (threadIdx.x == 0) { - // printf("end phase 1, buffer counts:\n"); - // for (int i=0;i PHASE_2_DUMBSORT_MAX_PER_BUCKET) printf("PHASE 2 DUMBSORT OVERFLOW: %u\n", slot); - - uint32_t results_address2 = SUB_2_TILE_START + local_bucket_id * PHASE_2_DUMBSORT_MAX_PER_BUCKET + slot; - results2[results_address2] = entry; - } - } - - __syncthreads(); - //if (threadIdx.x == 0) { - // printf("end phase 2-%u, buffer counts:\n",read_bucket_id); - // for (int i=0;i PHASE_3_DUMBSORT_MAX_PER_BUCKET) printf("PHASE 3 DUMBSORT OVERFLOW: %u\n", slot); - - uint32_t results_address3 = SUB_3_TILE_START + local_bucket_id_3 * PHASE_3_DUMBSORT_MAX_PER_BUCKET + slot; - results[results_address3] = entry; - } - } - - __syncthreads(); - - //if (threadIdx.x == 0) { - // printf("end phase 3-2:[%u]-1[%u], buffer counts:\n",read_bucket_id_phase2,read_bucket_id); - // for (int i=0;i= NUM_BUCKETS) printf("BUCKET OUT OF RANGE ERROR: %u", bucket_id); - - int slot = atomicAdd(&buffer_counts[bucket_id],1); - uint32_t results_address = global_bucket_start_pos + bucket_id * GLOBAL_BUCKET_MAX_ENTRIES + slot; - if (results_address < 134217728) { - results[results_address] = entry; - } else { - printf("results address overflow %u - global start pos: %u bucket %u slot %u globalmaxentries: %u\n", - results_address, global_bucket_start_pos, bucket_id, slot, GLOBAL_BUCKET_MAX_ENTRIES); - } - //__syncthreads(); // holy fuck a sync threads increases from 50ms to 85!!!!! That's why! 
- //for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) { - // atomicAdd(&results_counts[i], buffer_counts[i]); - // buffer_counts[i] = 0; - //} - //__syncthreads(); - } - __syncthreads(); - for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) { - atomicAdd(&results_counts[i], buffer_counts[i]+1); - if (buffer_counts[i] > PHASE_1_DUMBSORT_MAX_PER_BUCKET) - printf("BUFFER OVERFLOW: %u was over max per bucket\n",buffer_counts[i], PHASE_1_DUMBSORT_MAX_PER_BUCKET); - } - } -} - - -__global__ -void gpu_write_chachas_into_buckets_with_single_row_depthflush( - const uint32_t NUM_PER_BLOCK, const uint32_t N, uint32_t *chachas, - uint32_t const MAX_TOTAL_GROUPED_ENTRIES, xchacha_pair *results, unsigned int *results_counts) -{ - // note num threads should be equal or higher than NUM_BUCKETS - // 256 has max depth of 23, 512 has max depth of 11. need keep some space for other variables. - - // good settings: NUM_BUCKETS 512, BUCKET DEPTH 11, FLUSH DEPTH 6 (15ms) - // 256, 22, 12 (11ms) - // the bigger the span betwen flush depth and bucket depth, the less likely hashes will overflow before the rest can fill up. - const uint32_t BUCKET_BITS = 5; - const uint32_t FLUSH_DEPTH = 128; - const uint32_t BUCKET_DEPTH = FLUSH_DEPTH+32; // give some room for overflow - careful too much and it slows down! - // I tried with a buffer overflow instead of padding...but...it performed slightly slower and that's without - // moving the buckets back in. seems like loops on threads not being perfect multiples when writing is more - // forgiving than though? Can try again. - - const uint32_t NUM_BUCKETS = 1 << BUCKET_BITS; - const uint32_t BUCKET_DIVISOR = 1 << (32-BUCKET_BITS); // 32bit chacha into 8 bit NUM buckets - const uint32_t GLOBAL_BUCKET_MAX_ENTRIES = MAX_TOTAL_GROUPED_ENTRIES / NUM_BUCKETS; - - __shared__ int buffer_counts[NUM_BUCKETS]; - __shared__ int global_counts[NUM_BUCKETS]; - __shared__ uint32_t chachas_buffer[NUM_BUCKETS*BUCKET_DEPTH]; - __shared__ uint16_t xs_buffer[NUM_BUCKETS*BUCKET_DEPTH]; // 4 entries per bucket - __shared__ int num_ready; - __shared__ int batch_id; - __shared__ int bucket_to_flush; - - // 49152 bytes total shared memory = 384 chunks of 128 bytes. Means we can use 384 buckets to fill shared memory. - // let's try first with 256 buckets. - //__shared__ int flush; - - const uint32_t NUM_THREADS = blockDim.x; - const uint32_t NUM_BATCHES_OF_THREADS = NUM_PER_BLOCK / NUM_THREADS; // note num per block must be multiple of num threads - //if ((NUM_PER_BLOCK % NUM_THREADS) > 0) printf("CONFIG ERROR: NUM PER BLOCK MUST BE MULTIPLE OF NUM THREADS\n"); - - uint32_t x_group = blockIdx.x; - uint32_t x_start = x_group * NUM_PER_BLOCK; - - if (x_start < N) { - if (threadIdx.x == 0) { - num_ready = 0; - batch_id = 0; - } - // make sure all values start right! 
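-		// (added) buffer_counts/global_counts accumulate via atomicAdd across the
-		// batches below, so the whole block zeroes them before the first batch: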
- for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) {
- buffer_counts[i] = 0;
- global_counts[i] = 0;
- }
- __syncthreads();
-
- // go through each batch of data
- while (batch_id < NUM_BATCHES_OF_THREADS) {
- while ((num_ready == 0) && (batch_id < NUM_BATCHES_OF_THREADS)) {
- // thread is of course threadIdx.x
- uint32_t x = x_start + batch_id * NUM_THREADS + threadIdx.x;
- uint32_t chacha = chachas[x];
-
- //if (threadIdx.x == 0) {
- // printf("BATCH_ID %u of %u - x starts: %u num_ready: %u\n",batch_id, NUM_BATCHES_OF_THREADS, x, num_ready);
- //}
- __syncthreads();
-
- uint32_t bucket_id = chacha / BUCKET_DIVISOR;
- uint32_t slot = atomicAdd(&buffer_counts[bucket_id], 1);
- uint32_t address = bucket_id * BUCKET_DEPTH + slot;
-
- //printf(" xchacha pair x:%u chacha:%u into bucket:%u slot:%u \n", x, chachas[x], bucket_id, slot);
- if (address >= NUM_BUCKETS*BUCKET_DEPTH) {
- printf("ERROR ADDRESS %u -- batch: %u bucket_id: %u slot: %u\n", address, batch_id, bucket_id, slot);
- } else {
- //xchacha_pair entry = { x, chacha };
- chachas_buffer[address] = chacha;
- xs_buffer[address] = x;
- }
-
- if (slot == (FLUSH_DEPTH-1)) {
- atomicAdd(&num_ready, 1);
- bucket_to_flush = bucket_id; // doesn't matter if this gets overwritten by another thread
- // point is we want to get the first bucket, and if there are more we fetch them from the list.
- //printf("-> bucket %u slot is FLUSH ready, incremented num_ready counter to %u\n", bucket_id, num_ready);
- }
-
- __syncthreads();
- if (threadIdx.x == 0) {
- //for (int i=0;i<NUM_BUCKETS;i++) printf("bucket %u count: %u\n", i, buffer_counts[i]);
- batch_id += 1;
- }
- __syncthreads();
- }
- if (num_ready > 0) {
- // flush those ready
- const int num_to_flush = buffer_counts[bucket_to_flush];
- if (threadIdx.x == 0) {
- global_counts[bucket_to_flush] += num_to_flush;
- //global_counts[bucket_to_flush] = atomicAdd(&results_counts[bucket_to_flush],num_to_flush);
- // printf("FLUSHING! %u buckets are ready, flushing bucket %u\n", num_ready, bucket_to_flush);
- }
-
- __syncthreads();
-
- for (int i=threadIdx.x;i 0) {
- // find next bucket to flush! doesn't matter if multiple threads overwrite,
- // just want one of them
- for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) {
- if (buffer_counts[i] >= FLUSH_DEPTH)
- bucket_to_flush = i;
- }
- }
- }
-
- __syncthreads();
-
- }
- if (batch_id == NUM_BATCHES_OF_THREADS) {
- // we finished entering all our data, now check left-over buckets.
- // TODO: check each bucket count and write out data to global.
- //if (threadIdx.x == 0) {
- // printf("BATCHES COMPLETED: todo finish flushing rest of buffers\n");
- //}
- for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) {
- if (buffer_counts[i] > 0) atomicAdd(&results_counts[i], buffer_counts[i]);
- }
-
- }
- }
-}
-
-__global__
-void gpu_write_chachas_into_buckets_with_single_row_depthflush_ORIG(
- const uint32_t NUM_PER_BLOCK, const uint32_t N, uint32_t *chachas,
- uint32_t const MAX_TOTAL_GROUPED_ENTRIES, xchacha_pair *results, unsigned int *results_counts)
-{
- // note num threads should be equal or higher than NUM_BUCKETS
- // 256 has max depth of 23, 512 has max depth of 11. need to keep some space for other variables.
-
- // good settings: NUM_BUCKETS 512, BUCKET DEPTH 11, FLUSH DEPTH 6 (15ms)
- // 256, 22, 12 (11ms)
- // the bigger the span between flush depth and bucket depth, the less likely hashes will overflow before the rest can fill up.
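- // Compile-time sanity check for these depth settings (a sketch with hypothetical
- // k-prefixed names, not original code): the staging buffer of NUM_BUCKETS * BUCKET_DEPTH
- // xchacha_pairs has to fit the default 48KB of shared memory, and the headroom past
- // FLUSH_DEPTH has to absorb writes that land between a bucket tripping the flush and
- // the block synchronizing.
- constexpr uint32_t kFlushDepth = 128;
- constexpr uint32_t kHeadroom = 32;
- constexpr uint32_t kNumBuckets = 1u << 5;
- static_assert(kNumBuckets * (kFlushDepth + kHeadroom) * sizeof(xchacha_pair) <= 49152,
- "per-block staging buffer must fit in 48KB of shared memory");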
- const uint32_t BUCKET_BITS = 5; - const uint32_t FLUSH_DEPTH = 128; - const uint32_t BUCKET_DEPTH = FLUSH_DEPTH+32; // give some room for overflow - - const uint32_t NUM_BUCKETS = 1 << BUCKET_BITS; - const uint32_t BUCKET_DIVISOR = 1 << (32-BUCKET_BITS); // 32bit chacha into 8 bit NUM buckets - const uint32_t GLOBAL_BUCKET_MAX_ENTRIES = MAX_TOTAL_GROUPED_ENTRIES / NUM_BUCKETS; - - __shared__ int buffer_counts[NUM_BUCKETS]; - __shared__ int global_counts[NUM_BUCKETS]; - __shared__ xchacha_pair buffer[NUM_BUCKETS*BUCKET_DEPTH]; // 4 entries per bucket - __shared__ int num_ready; - __shared__ int batch_id; - __shared__ int bucket_to_flush; - - // 49152 bytes total shared memory = 384 chunks of 128 bytes. Means we can use 384 buckets to fill shared memory. - // let's try first with 256 buckets. - //__shared__ int flush; - - const uint32_t NUM_THREADS = blockDim.x; - const uint32_t NUM_BATCHES_OF_THREADS = NUM_PER_BLOCK / NUM_THREADS; // note num per block must be multiple of num threads - //if ((NUM_PER_BLOCK % NUM_THREADS) > 0) printf("CONFIG ERROR: NUM PER BLOCK MUST BE MULTIPLE OF NUM THREADS\n"); - - uint32_t x_group = blockIdx.x; - uint32_t x_start = x_group * NUM_PER_BLOCK; - - if (x_start < N) { - if (threadIdx.x == 0) { - num_ready = 0; - batch_id = 0; - } - // make sure all values start right! - for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) { - buffer_counts[i] = 0; - global_counts[i] = 0; - } - __syncthreads(); - - // go through each batch of data - while (batch_id < NUM_BATCHES_OF_THREADS) { - while ((num_ready == 0) && (batch_id < NUM_BATCHES_OF_THREADS)) { - // thread is of course threadIdx.x - uint32_t x = x_start + batch_id * NUM_THREADS + threadIdx.x; - uint32_t chacha = chachas[x]; - - //if (threadIdx.x == 0) { - // printf("BATCH_ID %u of %u - x starts: %u num_ready: %u\n",batch_id, NUM_BATCHES_OF_THREADS, x, num_ready); - //} - __syncthreads(); - - uint32_t bucket_id = chacha / BUCKET_DIVISOR; - uint32_t slot = atomicAdd(&buffer_counts[bucket_id], 1); - uint32_t address = bucket_id * BUCKET_DEPTH + slot; - - //printf(" xchacha pair x:%u chacha:%u into bucket:%u slot:%u \n", x, chachas[x], bucket_id, slot); - if (address > NUM_BUCKETS*BUCKET_DEPTH) { - printf("ERROR ADDRESS %u -- batch: %u bucket_id: %u slot: %u\n", address, batch_id, bucket_id, slot); - } else { - xchacha_pair entry = { x, chacha }; - buffer[address] = entry; - } - - if (slot == (FLUSH_DEPTH-1)) { - atomicAdd(&num_ready, 1); - bucket_to_flush = bucket_id; // doesn't matter if this gets overwritten by another thread - // point is we want to get first bucket and if there is more we fetch it from list. - //printf("-> bucket %u slot is FLUSH ready, incremented num_ready counter to %u\n", bucket_id, num_ready); - } - - __syncthreads(); - if (threadIdx.x == 0) { - //for (int i=0;i 0) { - // flush those ready - const int num_to_flush = buffer_counts[bucket_to_flush]; - if (threadIdx.x == 0) { - global_counts[bucket_to_flush] += num_to_flush; - //global_counts[bucket_to_flush] = atomicAdd(&results_counts[bucket_to_flush],num_to_flush); - // printf("FLUSHING! %u buckets are ready, flushing bucket %u\n", num_ready, bucket_to_flush); - } - - __syncthreads(); - - for (int i=threadIdx.x;i 0) { - // find next bucket to flush! doesn't matter if multiple threads overwrite, - // just want one of them - for (int i=threadIdx.x;i= FLUSH_DEPTH) - bucket_to_flush = i; - } - } - } - - __syncthreads(); - - } - if (batch_id == NUM_BATCHES_OF_THREADS) { - // we finished entering all our data, now check left-over buckets. 
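- // One possible shape for that leftover drain (a sketch under this kernel's names,
- // not the author's implementation - see also the TODO just below): every bucket
- // still holding entries below FLUSH_DEPTH goes straight out to its global region.
- //
- // for (int b = 0; b < NUM_BUCKETS; b++) {
- //     const int remaining = buffer_counts[b];
- //     for (int i = threadIdx.x; i < remaining; i += blockDim.x) {
- //         uint32_t dst = b * GLOBAL_BUCKET_MAX_ENTRIES + global_counts[b] + i;
- //         results[dst] = buffer[b * BUCKET_DEPTH + i];
- //     }
- // }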
- // TODO: check each bucket count and write out data to global. - //if (threadIdx.x == 0) { - // printf("BATCHES COMPLETED: todo finish flushing rest of buffers\n"); - //} - for (int i=threadIdx.x;i 0) atomicAdd(&results_counts[i], buffer_counts[i]); - } - - } - } -} - - - -__global__ -void gpu_write_chachas_into_buckets_with_buffer_batches( - const uint32_t NUM_PER_BLOCK, const uint32_t N, uint32_t *chachas, - uint32_t const MAX_PER_RESULTS_BUCKET, xchacha_pair *results, unsigned int *results_counts) -{ - // note num threads should be equal or higher than NUM_BUCKETS - // 256 has max depth of 23, 512 has max depth of 11. need keep some space for other variables. - - // good settings: NUM_BUCKETS 512, BUCKET DEPTH 11, FLUSH DEPTH 6 (15ms) - // 256, 22, 12 (11ms) - // the bigger the span betwen flush depth and bucket depth, the less likely hashes will overflow before the rest can fill up. - const uint32_t NUM_BUCKETS = 32; - const uint32_t BUCKET_DIVISOR = 1 << (32-5); // 32bit chacha into 8 bit NUM buckets - const uint32_t BUCKET_DEPTH = 128; // *should* be able to set this freely, as the active window should modulo effectively. - const uint32_t FLUSH_DEPTH = 32; // cache does best with a flush depth of 8, but even 6 is ok, 4 is 1st benefit jump. - - __shared__ int buffer_counts[NUM_BUCKETS]; - __shared__ int global_counts[NUM_BUCKETS]; - __shared__ xchacha_pair buffer[NUM_BUCKETS*BUCKET_DEPTH]; // 4 entries per bucket - __shared__ int num_ready; - __shared__ int active_buffer_pos; // this is the moving position/window in the buffer - __shared__ int eviction_needed; - __shared__ uint32_t batch_id; - - // 49152 bytes total shared memory = 384 chunks of 128 bytes. Means we can use 384 buckets to fill shared memory. - // let's try first with 256 buckets. - //__shared__ int flush; - - const uint32_t NUM_THREADS = blockDim.x; - const uint32_t NUM_BATCHES_OF_THREADS = NUM_PER_BLOCK / NUM_THREADS; // note num per block must be multiple of num threads - //if ((NUM_PER_BLOCK % NUM_THREADS) > 0) printf("CONFIG ERROR: NUM PER BLOCK MUST BE MULTIPLE OF NUM THREADS\n"); - - uint32_t x_group = blockIdx.x; - uint32_t x_start = x_group * NUM_PER_BLOCK; - - if (x_start < N) { - if (threadIdx.x == 0) { - num_ready = 0; - active_buffer_pos = 0; - eviction_needed = 0; - batch_id = 0; - } - // make sure all values start right! 
- for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) { - buffer_counts[i] = 0; - global_counts[i] = 0; - } - __syncthreads(); - - // go through each batch of data - while (batch_id < NUM_BATCHES_OF_THREADS) { - while ((num_ready < NUM_BUCKETS) && (batch_id < NUM_BATCHES_OF_THREADS) && (eviction_needed == 0)) { - // thread is of course threadIdx.x - uint32_t x = x_start + batch_id * NUM_THREADS + threadIdx.x; - uint32_t chacha = chachas[x]; - - //if (threadIdx.x == 0) { - // printf("BATCH_ID %u of %u - x starts: %u num_ready: %u\n",batch_id, NUM_BATCHES_OF_THREADS, x, num_ready); - //} - //__syncthreads(); - - uint32_t bucket_id = chacha / BUCKET_DIVISOR; - uint32_t slot = atomicAdd(&buffer_counts[bucket_id], 1); - uint32_t address = bucket_id * BUCKET_DEPTH + (slot + active_buffer_pos) % BUCKET_DEPTH; - - //printf(" xchacha pair x:%u chacha:%u into bucket:%u slot:%u \n", x, chachas[x], bucket_id, slot); - - if (address > NUM_BUCKETS*BUCKET_DEPTH) { - printf("ERROR ADDRESS %u -- batch: %u bucket_id: %u slot: %u\n", address, batch_id, bucket_id, slot); - } else { - xchacha_pair entry = { x, chacha }; - buffer[address] = entry; - } - - if (slot == (FLUSH_DEPTH-1)) { - atomicAdd(&num_ready, 1); - //printf("-> bucket %u slot is FLUSH ready, incremented num_ready counter to %u\n", bucket_id, num_ready); - } else if (slot == (BUCKET_DEPTH-1)) { - // one bucket got full, so it's time to evict all - atomicAdd(&eviction_needed, 1); // atomic not really necessary - //printf("-> bucket %u slot reached max bucket depth %u, set eviction needed to %u\n", bucket_id, BUCKET_DEPTH-1, eviction_needed); - } - - __syncthreads(); - if (threadIdx.x == 0) { - //for (int i=0;i 256 * MAX_PER_RESULTS_BUCKET) { - printf("global address out of bounds bucket_id: %u global_pos:%u\n", bucket_id_for_thread, global_pos); - } else { - //printf("global address bucket_id: %u global_pos:%u\n", bucket_id_for_thread, global_pos); - results[global_address] = entry; - } - } - - __syncthreads(); - - if (threadIdx.x == 0) { - // switch active buffer position now - active_buffer_pos = (active_buffer_pos + FLUSH_DEPTH) % BUCKET_DEPTH; - //printf(" - active_buffer_pos now set to %u\n", active_buffer_pos); - //for (int i=0;i 0) { - if (threadIdx.x == 0) { - //printf("HANDLE EVICTION CASE\n"); - for (int i=0;i= BUCKET_DEPTH) { - eviction_needed = i; - num_ready = num_ready - 1; - buffer_counts[i] = 0; // okay, kind of a bug b/c if more than one eviction then we lose entries - // but for now it's just to test performance. - } - } - } - __syncthreads(); - - for (int i=threadIdx.x;i> 26); - uint16_t kbc_y = y % kBC; - uint32_t kbc_bucket_id = y / kBC; - //printf("x: %u kbc: %u\n", x, kbc_bucket_id); - unsigned int kbc_shift = kbc_bucket_id % 32; - unsigned int kbc_add_slot = 1 << kbc_shift; - unsigned int value = atomicAdd(&kbc_counts[kbc_bucket_id/32], kbc_add_slot); - unsigned int slot = (value >> kbc_shift) & 31; - //kbc_counts[kbc_bucket_id/32] = slot+1; - // THE ATOMIC ADDS ARE THE PROBLEM! 
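- // Why the packing above misbehaves: with kbc_shift = kbc_bucket_id % 32 each bucket
- // only owns one bit of the counter word, so the "& 31" reads five bits that belong
- // to neighbouring buckets, and a second increment carries into the next bucket's bit.
- // A packed counter needs a fixed-width field per bucket. Sketch of a 4-bit-per-bucket
- // version (hypothetical helper, not in the original; the KBC_MASK_* macros in
- // attack_method_xpairbits.hpp below implement this same layout):
- //
- // __device__ unsigned int packed_counter_add(unsigned int *counts, uint32_t bucket) {
- //     const unsigned int word = bucket / 8;        // 8 four-bit fields per 32-bit word
- //     const unsigned int shift = 4 * (bucket % 8); // offset of this bucket's field
- //     unsigned int old = atomicAdd(&counts[word], 1u << shift);
- //     return (old >> shift) & 0xF;                 // previous value of the field = slot
- // }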
- //unsigned int slot = atomicAdd(&kbc_counts[kbc_bucket_id % (32768*32)],1);// = slot+1; - out_kbc_ys[kbc_bucket_id * 32 + slot] = kbc_y; - out_kbc_xs[kbc_bucket_id * 32 + slot] = x; - } -} - -__global__ void gpu_get_max_counts_from_counter_list(unsigned int *kbc_counts, const int NUM) { - __shared__ unsigned int max_kbc_count; - __shared__ unsigned int sum_kbc_count; - if (threadIdx.x == 0) { - max_kbc_count = 0; - sum_kbc_count = 0; - } - __syncthreads(); - for (int i=threadIdx.x;i 150) printf("kbc: %u count: %u\n", i, kbc_count); - atomicMax(&max_kbc_count, kbc_count); - atomicAdd(&sum_kbc_count, kbc_count); - } - if (threadIdx.x == 0) printf("counter list counts SUM:%u MAX:%u\n", sum_kbc_count, max_kbc_count); -} - -__global__ void gpu_show_chachas(const uint32_t N, const uint32_t step, uint32_t *chachas) { - for (int i=0;i> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS(datax[j+0],0);ATTACK_WRITE_CHACHAS(datax[j+1],1);ATTACK_WRITE_CHACHAS(datax[j+2],2);ATTACK_WRITE_CHACHAS(datax[j+3],3); - ATTACK_WRITE_CHACHAS(datax[j+4],4);ATTACK_WRITE_CHACHAS(datax[j+5],5);ATTACK_WRITE_CHACHAS(datax[j+6],6);ATTACK_WRITE_CHACHAS(datax[j+7],7); - ATTACK_WRITE_CHACHAS(datax[j+8],8);ATTACK_WRITE_CHACHAS(datax[j+9],9);ATTACK_WRITE_CHACHAS(datax[j+10],10);ATTACK_WRITE_CHACHAS(datax[j+11],11); - ATTACK_WRITE_CHACHAS(datax[j+12],12);ATTACK_WRITE_CHACHAS(datax[j+13],13);ATTACK_WRITE_CHACHAS(datax[j+14],14);ATTACK_WRITE_CHACHAS(datax[j+15],15); - } - - __syncthreads(); - for (int i=threadIdx.x;i> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], 
datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS_COALESCED(datax[j+0],0);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+1],1);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+2],2);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+3],3); - ATTACK_WRITE_CHACHAS_COALESCED(datax[j+4],4);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+5],5);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+6],6);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+7],7); - ATTACK_WRITE_CHACHAS_COALESCED(datax[j+8],8);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+9],9);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+10],10);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+11],11); - ATTACK_WRITE_CHACHAS_COALESCED(datax[j+12],12);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+13],13);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+14],14);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+15],15); - } -} - -// run with 128 blocksize, more doesn't matter. -__global__ -void gpu_chacha8_k32_write_chachas32(const uint32_t N, const uint32_t X_START, - const __restrict__ uint32_t *input, - uint32_t *chachas) -{ - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - //__shared__ uint32_t datax[33*256]; // each thread (256 max) gets its own shared access starting at 32 byte boundary. - //uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - __shared__ uint32_t shared_chachas[128*32]; // *possibly* using 32 to prevent some bank conflicts can help, but don't thing so. 
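- // Why stage through shared memory at all: each thread here produces 32 chacha words
- // for a strided x-range, so direct global stores would scatter badly. 128 threads
- // * 32 words * 4 bytes = 16KB staged in shared, then written back out with a
- // "for (int i = threadIdx.x; ...; i += blockDim.x)" copy so that consecutive threads
- // hit consecutive global addresses. A minimal sketch of that write-back, assuming
- // the copy is a straight linear dump of the staged block (illustrative, not the
- // original loop body):
- //
- // __syncthreads(); // make all 4096 staged words visible to the whole block
- // for (int i = threadIdx.x; i < 128 * 32; i += blockDim.x) {
- //     chachas[base_x + i] = shared_chachas[i];
- // }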
- - if (blockDim.x > 128) printf("MUST HAVE BLOCKSIZE 128 (RECOMMENDED) OR LESS, OR INCREASED SHARED MEM TO MORE\n"); - - uint32_t base_group = blockIdx.x * blockDim.x; - uint32_t base_x = base_group * 32; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 32; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - const int j = 0; - if (x_group < end_n) { - uint32_t pos = x_group * 2 + X_START/16; - //printf("x group pos = %u\n", pos); - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS32(datax[j+0],0);ATTACK_WRITE_CHACHAS32(datax[j+1],1);ATTACK_WRITE_CHACHAS32(datax[j+2],2);ATTACK_WRITE_CHACHAS32(datax[j+3],3); - ATTACK_WRITE_CHACHAS32(datax[j+4],4);ATTACK_WRITE_CHACHAS32(datax[j+5],5);ATTACK_WRITE_CHACHAS32(datax[j+6],6);ATTACK_WRITE_CHACHAS32(datax[j+7],7); - ATTACK_WRITE_CHACHAS32(datax[j+8],8);ATTACK_WRITE_CHACHAS32(datax[j+9],9);ATTACK_WRITE_CHACHAS32(datax[j+10],10);ATTACK_WRITE_CHACHAS32(datax[j+11],11); - ATTACK_WRITE_CHACHAS32(datax[j+12],12);ATTACK_WRITE_CHACHAS32(datax[j+13],13);ATTACK_WRITE_CHACHAS32(datax[j+14],14);ATTACK_WRITE_CHACHAS32(datax[j+15],15); - - pos += 1; - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // 
pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS32(datax[j+0],16+0);ATTACK_WRITE_CHACHAS32(datax[j+1],16+1);ATTACK_WRITE_CHACHAS32(datax[j+2],16+2);ATTACK_WRITE_CHACHAS32(datax[j+3],16+3); - ATTACK_WRITE_CHACHAS32(datax[j+4],16+4);ATTACK_WRITE_CHACHAS32(datax[j+5],16+5);ATTACK_WRITE_CHACHAS32(datax[j+6],16+6);ATTACK_WRITE_CHACHAS32(datax[j+7],16+7); - ATTACK_WRITE_CHACHAS32(datax[j+8],16+8);ATTACK_WRITE_CHACHAS32(datax[j+9],16+9);ATTACK_WRITE_CHACHAS32(datax[j+10],16+10);ATTACK_WRITE_CHACHAS32(datax[j+11],16+11); - ATTACK_WRITE_CHACHAS32(datax[j+12],16+12);ATTACK_WRITE_CHACHAS32(datax[j+13],16+13);ATTACK_WRITE_CHACHAS32(datax[j+14],16+14);ATTACK_WRITE_CHACHAS32(datax[j+15],16+15); - - } - - __syncthreads(); - for (int i=threadIdx.x;i 128) printf("MUST HAVE BLOCKSIZE 128 (RECOMMENDED) OR LESS, OR INCREASED SHARED MEM TO MORE\n"); - - uint32_t base_group = blockIdx.x * blockDim.x; - uint32_t base_x = base_group * 32; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 32; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - const int j = 0; - if (x_group < end_n) { - for (int i=threadIdx.x;i<32;i+=blockDim.x) { - counts[i] = 0; - } - - uint32_t pos = x_group * 2 + X_START/16; - //printf("x group pos = %u\n", pos); - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], 
datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+0],0);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+1],1);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+2],2);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+3],3); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+4],4);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+5],5);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+6],6);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+7],7); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+8],8);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+9],9);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+10],10);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+11],11); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+12],12);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+13],13);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+14],14);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+15],15); - - pos += 1; - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += 
input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+0],16+0);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+1],16+1);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+2],16+2);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+3],16+3); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+4],16+4);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+5],16+5);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+6],16+6);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+7],16+7); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+8],16+8);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+9],16+9);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+10],16+10);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+11],16+11); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+12],16+12);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+13],16+13);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+14],16+14);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+15],16+15); - - } - - __syncthreads(); - const uint32_t TEST_BUCKET_BITS = 5; - const uint32_t TEST_MAX_PER_BUCKET = (1 << (32-TEST_BUCKET_BITS-6))*2; - for (int i=threadIdx.x;i> (32 - TEST_BUCKET_BITS); // 16 buckets - int slot = atomicAdd(&counts[bucket_id],1); - chachas_buckets[TEST_MAX_PER_BUCKET * bucket_id + base_x + slot] = shared_chachas[i]; - } -} - -__global__ -void gpu_filter_chachas( - const uint32_t NUM_PER_BLOCK, const uint32_t N, uint32_t *chachas, - xchacha_pair *results, xchacha_pair *results2) -{ - // highest performance bucket bits 4, with 1024 threads, num per block 65536. Then all blocks work with L2 cache? 
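- // Rough numbers behind that L2 guess (sketch arithmetic, not measured here): with
- // bucket bits 4, only 1/16th of entries pass the bucket_id == 0 filter below, so a
- // 65536-entry block writes about 65536/16 * sizeof(xchacha_pair) = 32KB while reading
- // a straight 65536 * 4 = 256KB sequential stream - small enough per block that many
- // blocks' working sets can plausibly coexist in L2.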
- const uint32_t NUM_BUCKETS = DUMBSORT_NUM_BUCKETS; - const uint32_t BUCKET_DIVISOR = 1 << (32-DUMBSORT_BUCKET_BITS); // 32bit chacha into 8 bit NUM buckets - const uint32_t NUM_THREADS = blockDim.x; - uint32_t NUM_BATCHES_OF_THREADS = NUM_PER_BLOCK / NUM_THREADS; // note num per block must be multiple of num threads - uint32_t x_group = blockIdx.x; - uint32_t x_start = x_group * NUM_PER_BLOCK; - const uint32_t GLOBAL_TILE_START = x_group * DUMBSORT_BATCHES_TILE_SPACE; - - __shared__ int filter_count; - - - if (x_start < N) { - //if (threadIdx.x == 0) { - // printf("x start: %u global_bucket_start_pos: %u vs before %u\n", x_start, global_bucket_start_pos, x_start / blockDim.x); - //} - if (threadIdx.x == 0) filter_count = 0; - __syncthreads(); - - uint32_t batch_id = 0; - for (batch_id = 0; batch_id < NUM_BATCHES_OF_THREADS; batch_id++) { - uint32_t x = x_start + batch_id * NUM_THREADS + threadIdx.x; - uint32_t chacha = chachas[x]; - xchacha_pair entry = { x, chacha }; - - uint32_t bucket_id = chacha / BUCKET_DIVISOR; - //printf("chacha %u - bucket id: %u\n", chacha, bucket_id); - if (bucket_id >= NUM_BUCKETS) printf("BUCKET OUT OF RANGE ERROR: %u", bucket_id); - if (bucket_id == 0) { - int slot = atomicAdd(&filter_count,1); - uint32_t results_address = GLOBAL_TILE_START + bucket_id * PHASE_1_DUMBSORT_MAX_PER_BUCKET + slot; - if (results_address < DUMBSORT_SPACE_NEEDED_FOR_SCRATCH) { - results[results_address] = entry; - } else { - printf("results address overflow %u - global start pos: %u bucket %u slot %u DUMBSORT_SPACE_NEEDED_FOR_SCRATCH: %u\n", - results_address, GLOBAL_TILE_START, bucket_id, slot, DUMBSORT_SPACE_NEEDED_FOR_SCRATCH); - } - } - } - } -} - -/*template -__global__ -void gpu_attack_process_global_kbc_pairs_list( - const int PAIRS_COUNT, unsigned int *kbc_pairs_list_L_bucket_ids, - const BUCKETED_ENTRY_IN *kbc_global_entries_L, const unsigned int *kbc_global_num_entries_L, - const uint32_t *rx_list, const uint RX_START, const uint RX_END, - Match_Attack_Pair_Index *match_list, int *match_counts, - const uint32_t KBC_MAX_ENTRIES) { - - // NOTE: possible optimization is to only get y elements of a list instead of ALL the meta... - // requires splitting the meta and y fields into two separate lists. Alternatively we copy - // all the meta chunk in this round. - - int i = blockIdx.x*blockDim.x+threadIdx.x; - - if (i < PAIRS_COUNT) { - unsigned int global_kbc_L_bucket_id = kbc_pairs_list_L_bucket_ids[i]; - - uint32_t kbc_bitmask_bucket = global_kbc_L_bucket_id / 8; - uint32_t kbc_bitmask_shift = 4*(global_kbc_L_bucket_id % 8); - uint32_t bitvalue = kbc_global_num_entries_L[kbc_bitmask_bucket]; - const unsigned int num_L = (bitvalue >> (kbc_bitmask_shift)) & 0b01111; - - kbc_bitmask_bucket = (global_kbc_L_bucket_id + 1) / 8; - kbc_bitmask_shift = 4*((global_kbc_L_bucket_id + 1) % 8); - bitvalue = kbc_global_num_entries_R[kbc_bitmask_bucket]; - const unsigned int num_R = (bitvalue >> (kbc_bitmask_shift)) & 0b01111; - - if ((num_L == 0) || (num_R == 0)) { - printf("ERROR: PAIRS LIST SHOULD NOT HAVE 0 COUNTS\n"); - return; // shouldn't ever happen with a pairs list... 
- } - - const uint32_t start_L = global_kbc_L_bucket_id*KBC_MAX_ENTRIES; - const uint32_t start_R = (global_kbc_L_bucket_id+1)*KBC_MAX_ENTRIES; - - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_global_entries_L[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_global_entries_R[start_R]; - - // For any 0 <= m < kExtraBitsPow: - // yl / kBC + 1 = yR / kBC AND - // (yr % kBC) / kC - (yl % kBC) / kC = m (mod kB) AND - // (yr % kBC) % kC - (yl % kBC) % kC = (2m + (yl/kBC) % 2)^2 (mod kC) - - for (int pos_R = 0; pos_R < num_R; pos_R+=1) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - int16_t yr_kbc = R_entry.y; - int16_t yr_bid = yr_kbc / kC; // values [0..kB] - for (uint16_t pos_L = 0; pos_L < num_L; pos_L++) { - // do L_entry and R_entry match? - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - int16_t yl_kbc = L_entry.y; - int16_t yl_bid = yl_kbc / kC; // values [0..kB] - int16_t formula_one = yr_bid - yl_bid; // this should actually give m - if (formula_one < 0) { - formula_one += kB; - } - int16_t m = formula_one; - if (m >= kB) { - m -= kB; - } - if (m < 64) { - // passed first test - int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC perfectly divisible by kC - int16_t yr_cid = yr_kbc % kC; - int16_t parity = (global_kbc_L_bucket_id) % 2; - int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127] - int16_t formula_two = yr_cid - yl_cid; - if (formula_two < 0) { - formula_two += kC; - } - if (formula_two == m2_parity_squared) { - // we have a match. - int slot = atomicAdd(&match_counts[0],1); - Match_Attack_Pair_Index match = { }; - match.bucket_L_id = global_kbc_L_bucket_id; - match.idx_L = pos_L; - match.idx_R = pos_R; - // *could* coelesce pair.meta[0..4] values here and y, instead of splitting y list. - // suspect splitting y list would be faster. - match_list[slot] = match; - } - } - } - } - } -}*/ - - -void attack_method_lxs(uint32_t num_lxs) { - - std::cout << "ATTACK METHOD LXS - SORT XS/YS! 
" << num_lxs << std::endl; - - using milli = std::chrono::milliseconds; - auto attack_start = std::chrono::high_resolution_clock::now(); - - - const uint32_t NUM_LXS = 20000000; - const uint32_t BATCHES = 64; - const uint32_t NUM_PER_BATCH = UINT_MAX / BATCHES; - const uint32_t KBC_MAX_BUCKET_SIZE = 32; // SHOULD BE MAX 19 FOR BATCHES 64 - // for our bucketing sort, we have a total number of grouped entries and divvy that up into 256 stripes to get - // our max per entry - const uint32_t MAX_TOTAL_GROUPED_ENTRIES = DUMBSORT_BATCHES_TILE_SPACE; - //const uint32_t MAX_ENTRIES_PER_GROUPING = MAX_TOTAL_GROUPED_ENTRIES / 256; - - - auto alloc_start = std::chrono::high_resolution_clock::now(); - int blockSize; uint64_t calc_N;uint64_t calc_blockSize;uint64_t calc_numBlocks;int numBlocks; - - uint32_t *chachas; - xchacha_pair *xchachas_buffer_1; - xchacha_pair *xchachas_buffer_2; - uint32_t *batched_chachas; - uint32_t *batched_xs; - unsigned int *xchachas_counts; - uint16_t *out_kbc_ys; - uint32_t *out_kbc_xs; - unsigned int *global_kbc_counts; - - std::cout << " NUM BATCHES: " << BATCHES << std::endl; - std::cout << " NUM PER BATCH: " << NUM_PER_BATCH << std::endl; - std::cout << " KBC MAX BUCKET SIZE:" << KBC_MAX_BUCKET_SIZE << std::endl; - std::cout << " MAX_TOTAL_GROUPED_ENTRIES: " << MAX_TOTAL_GROUPED_ENTRIES << std::endl; - - std::cout << " chachas size:" << (sizeof(uint32_t)*NUM_PER_BATCH) << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&chachas, sizeof(uint32_t)*NUM_PER_BATCH)); - CUDA_CHECK_RETURN(cudaMemset(chachas, 0, sizeof(uint32_t)*NUM_PER_BATCH)); - - std::cout << " xchachas_grouped size: " << (sizeof(xchacha_pair)*DUMBSORT_SPACE_NEEDED_FOR_SCRATCH) << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&xchachas_buffer_1, sizeof(xchacha_pair)*DUMBSORT_SPACE_NEEDED_FOR_SCRATCH)); - CUDA_CHECK_RETURN(cudaMalloc(&xchachas_buffer_2, sizeof(xchacha_pair)*DUMBSORT_SPACE_NEEDED_FOR_SCRATCH)); - CUDA_CHECK_RETURN(cudaMalloc(&xchachas_counts, sizeof(int)*1024)); // can be tuned to less, for now this is general - CUDA_CHECK_RETURN(cudaMemset(xchachas_counts, 0, 1024)); - batched_chachas = (uint32_t *) &xchachas_buffer_1[0]; - batched_xs = (uint32_t *) &xchachas_buffer_2[0]; - - - //std::cout << " out_kbc_ys size:" << (sizeof(uint16_t)*KBC_MAX_BUCKET_SIZE*kBC_NUM_BUCKETS) << std::endl; - //CUDA_CHECK_RETURN(cudaMalloc(&out_kbc_ys, sizeof(uint16_t)*KBC_MAX_BUCKET_SIZE*kBC_NUM_BUCKETS)); - //std::cout << " out_kbc_xs size:" << (sizeof(uint32_t)*KBC_MAX_BUCKET_SIZE*kBC_NUM_BUCKETS) << std::endl; - //CUDA_CHECK_RETURN(cudaMalloc(&out_kbc_xs, sizeof(uint32_t)*KBC_MAX_BUCKET_SIZE*kBC_NUM_BUCKETS)); - - std::cout << " global_kbc_counts size:" << (sizeof(int)*kBC_NUM_BUCKETS) << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&global_kbc_counts, sizeof(int)*kBC_NUM_BUCKETS)); - CUDA_CHECK_RETURN(cudaMemset(global_kbc_counts, 0, kBC_NUM_BUCKETS*sizeof(int))); - - int deviceCount = 0; - cudaError_t error_id = cudaGetDeviceCount(&deviceCount); - - if (error_id != cudaSuccess) { - printf("cudaGetDeviceCount returned %d\n-> %s\n", - static_cast(error_id), cudaGetErrorString(error_id)); - printf("Result = FAIL\n"); - exit(EXIT_FAILURE); - } - - // This function call returns 0 if there are no CUDA capable devices. 
- if (deviceCount == 0) { - printf("There are no available device(s) that support CUDA\n"); - } else { - printf("Detected %d CUDA Capable device(s)\n", deviceCount); - } - - //int device_id = 0; - //cudaSetDevice(device_id); - //cudaDeviceProp deviceProp; - //cudaGetDeviceProperties(&deviceProp, device_id); - //printf("\nDevice %d: \"%s\"\n", device_id, deviceProp.name); - //cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize); - //std::cout << " persisting cache size: " << deviceProp.persistingL2CacheMaxSize << std::endl; - //std::cout << " accessPolicyMaxWindowSize: " << deviceProp.accessPolicyMaxWindowSize << std::endl; - //cudaStream_t stream; - //cudaStreamCreate(&stream); - //cudaStreamAttrValue attr; - //attr.accessPolicyWindow.base_ptr = global_kbc_counts; - //attr.accessPolicyWindow.num_bytes = kBC_NUM_BUCKETS*sizeof(int) / 32; - //attr.accessPolicyWindow.hitRatio = 1.0; - //attr.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting; - //attr.accessPolicyWindow.missProp = cudaAccessPropertyStreaming; - //cudaStreamSetAttribute(stream,cudaStreamAttributeAccessPolicyWindow,&attr); - - auto alloc_finish = std::chrono::high_resolution_clock::now(); - std::cout << " alloc time: " << std::chrono::duration_cast(alloc_finish - attack_start).count() << " ms\n"; - - auto compute_only_start = std::chrono::high_resolution_clock::now(); - - - auto chacha_start = std::chrono::high_resolution_clock::now(); - blockSize = 128; // # of threads per block, maximum is 1024. - calc_N = UINT_MAX / BATCHES; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 32); - numBlocks = calc_numBlocks; - - // NEW ALGORITHM!!!! - // 1) LOAD ALL LX'S INTO GLOBAL_KBC_L1_BUCKETED_YS - // 2) GO THROUGH EACH RX IN ORDER (NO SORTING!) AND FIND L VALUES IN BUCKETS AND CHECK FOR MATCHES. THAT'S IT. - // will the cache be fast enough???????? or will sorting be better!?!?!? 
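- // A sketch of what step 2) could look like as a kernel (hypothetical - lx_counts /
- // lx_ys are placeholder names, and the matching test is the same kBC check used by
- // CHECK_MATCH / gpu_chacha8_filter_rxs_from_bucket_batch elsewhere in this codebase):
- //
- // __global__ void gpu_match_rxs_against_lx_buckets(uint32_t N, const uint32_t *chachas, ...) {
- //     uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
- //     if (x >= N) return;
- //     uint64_t Ry = (((uint64_t) chachas[x]) << 6) + (x >> 26); // same y derivation as the filters
- //     uint32_t kbc_bucket_id_R = (uint32_t)(Ry / kBC);
- //     if (kbc_bucket_id_R == 0) return;
- //     uint32_t kbc_bucket_id_L = kbc_bucket_id_R - 1; // an R only pairs with the bucket below it
- //     for (uint32_t n = 0; n < lx_counts[kbc_bucket_id_L]; n++) {
- //         // random-access read of the preloaded L entries - this is the cache bet
- //         /* CHECK_MATCH() against lx_ys[kbc_bucket_id_L * MAX + n] ... */
- //     }
- // }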
- // can experiment with different local_kbc sizes to see if the lx's fit in cache and we get sufficient performance
-
-
- const int groupingBlockSize = 1024;
-
- //const uint32_t GROUPING_BATCH_MAX_ENTRIES_PER_BUCKET = 65536 / 8;
- int groupingNumBlocks = (NUM_PER_BATCH + groupingBlockSize - 1) / (GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK);
-
- int bitonicThreads = 512;
- int bitonicBlocks = NUM_PER_BATCH / 1024; // should be 65536
- std::cout << "GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK: " << GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK << " NUM BLOCKS: " << groupingNumBlocks << std::endl;
- uint32_t X_START = 0;
- for (uint32_t batch_id=0;batch_id < 1; batch_id++) {
- X_START = batch_id * (1 << (32-6));
-
- gpu_chacha8_k32_write_chachas32<<<numBlocks,blockSize>>>(calc_N, X_START, chacha_input, chachas); // 24ms
-
- //bitonicSortShared<<<bitonicBlocks,bitonicThreads>>>(chachas, batched_chachas, batched_xs);
- nickSortShared<<<1,SHARED_SIZE_LIMIT>>>(chachas, batched_chachas, batched_xs);
- //gpu_show_chacha_xs_lists<<<1,1>>>(0,10,batched_chachas, batched_xs);
- //gpu_show_chacha_xs_lists<<<1,1>>>(1024,10,batched_chachas, batched_xs);
- CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
- //CUDA_CHECK_RETURN(cudaMemset(xchachas_counts, 0, sizeof(int)*1024));
- //gpu_filter_chachas<<<groupingNumBlocks,groupingBlockSize>>>(
- // GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK, NUM_PER_BATCH, chachas,
- // xchachas_buffer_1, xchachas_buffer_2);
-
- //gpu_write_chachas_into_buckets_dumb_batches<<<groupingNumBlocks,groupingBlockSize>>>(
- // GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK, NUM_PER_BATCH, chachas,
- // xchachas_buffer_1, xchachas_buffer_2);
-
- //gpu_write_chachas_into_buckets_with_single_row_depthflush<<<groupingNumBlocks,groupingBlockSize>>>(
- // GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK, NUM_PER_BATCH, chachas,
- // MAX_TOTAL_GROUPED_ENTRIES, xchachas_grouped, xchachas_counts);
-
- /*
- * GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK: 65536
-
- gpu_chacha8_k32_write_chachas 4294967232 in 64 BATCHES results: 1158 ms
-Freeing memory...
-counter list counts SUM:67108864 MAX:263782
- */
-
- //gpu_write_chachas_into_buckets_with_buffer_batches<<<groupingNumBlocks,groupingBlockSize>>>(
- // GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK, NUM_PER_BATCH, chachas,
- // MAX_ENTRIES_PER_GROUPING, xchachas_grouped, xchachas_counts);
-
- // stupid thrust, not a 100% deal breaker but close to being too slow
- //thrust::device_ptr<uint32_t> device_xs_R_ptr(out_kbc_xs);
- //thrust::device_ptr<uint32_t> device_ys_R_ptr(chachas);
- //thrust::sort_by_key(device_ys_R_ptr, device_ys_R_ptr + calc_N, device_xs_R_ptr);
- //thrust::sort(device_ys_R_ptr, device_ys_R_ptr + calc_N);
-
- //CUDA_CHECK_RETURN(cudaMemset(global_kbc_counts, 0, kBC_NUM_BUCKETS*sizeof(int))); // 30ms
- //gpu_filter_chachas_into_global_kbc_bucket<<<numBlocks,blockSize>>>(calc_N, X_START, chachas,
- // out_kbc_ys, out_kbc_xs, global_kbc_counts); // 56ms
- //gpu_get_max_count_in_global_kbc_bucket<<<1,256>>>(global_kbc_counts);
-
- }
-
-
- //CUDA_CHECK_RETURN(cudaMemset(device_global_kbc_num_entries_L, 0, 10000000*sizeof(int)));
- //gpu_chacha8_get_k32_keystream_into_local_kbc_entries<<<numBlocks,blockSize>>>(calc_N, chacha_input,
- // local_kbc_entries, device_global_kbc_num_entries_L, 0, 2000000);
- CUDA_CHECK_RETURN(cudaDeviceSynchronize());
- auto chacha_finish = std::chrono::high_resolution_clock::now();
- std::cout << " - gpu_chacha8_k32_write_chachas " << (calc_N*BATCHES) << " in " << BATCHES << " BATCHES results: " << std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count() << " ms\n";
- gpu_get_max_counts_from_counter_list<<<1,1>>>(xchachas_counts, 256);
- //gpu_show_chachas<<<1,1>>>(NUM_PER_BATCH, 10000, chachas);
- //CUDA_CHECK_RETURN(cudaDeviceSynchronize());
- //gpu_get_max_counts_from_counter_list<<<1,256>>>(global_kbc_counts, kBC_NUM_BUCKETS);
-
- auto compute_only_finish = std::chrono::high_resolution_clock::now();
-
- std::cout << "Freeing memory..." << std::endl;
- CUDA_CHECK_RETURN(cudaFree(chachas));
- //CUDA_CHECK_RETURN(cudaFree(out_kbc_ys)); // allocation above is commented out, so don't free
- //CUDA_CHECK_RETURN(cudaFree(out_kbc_xs)); // allocation above is commented out, so don't free
- CUDA_CHECK_RETURN(cudaFree(global_kbc_counts));
-
-
-
- auto attack_finish = std::chrono::high_resolution_clock::now();
- std::cout << " compute only time: " << std::chrono::duration_cast<milli>(compute_only_finish - compute_only_start).count() << " ms\n";
- std::cout << " attack total time: " << std::chrono::duration_cast<milli>(attack_finish - attack_start).count() << " ms\n";
- std::cout << "end."
<< std::endl; -} - - - - - - - -#endif /* ATTACK_METHOD_LXS2_HPP_ */ diff --git a/attack_method_xpairbits.hpp b/attack_method_xpairbits.hpp deleted file mode 100644 index 34f2cae..0000000 --- a/attack_method_xpairbits.hpp +++ /dev/null @@ -1,557 +0,0 @@ -/* - * attack_method_xpairbits.hpp - * - * Created on: Dec 5, 2021 - * Author: nick - */ - -#ifndef ATTACK_METHOD_XPAIRBITS_HPP_ -#define ATTACK_METHOD_XPAIRBITS_HPP_ - -const uint32_t MAX_LXS_PER_KBC_BUCKET = 16; // 24 for 110,000,000 - -const uint32_t XPAIR_BITS = 8; -const uint32_t MAX_RX_MATCHES = (1 << (32 - XPAIR_BITS))*2; -const uint32_t CHACHA_NUM_BATCHES_BITS = 3; -const uint32_t CHACHA_NUM_BATCHES = 1 << CHACHA_NUM_BATCHES_BITS; -const uint32_t CHACHA_TOTAL_ENTRIES_PER_BATCH = (1 << (32 - XPAIR_BITS - CHACHA_NUM_BATCHES_BITS)); -const uint32_t CHACHA_BUCKET_BITS = 4; // ACROSS ALL BATCHES -const uint32_t CHACHA_NUM_BUCKETS = (1 << CHACHA_BUCKET_BITS); -const uint32_t CHACHA_BUCKET_DIVISOR = (1 << (32 - CHACHA_BUCKET_BITS)); -const uint32_t CHACHA_SPLIT_BUCKET_DIVISOR = (1 << (32 - CHACHA_BUCKET_BITS - CHACHA_NUM_BATCHES_BITS)); -const uint32_t CHACHA_MAX_ENTRIES_PER_BUCKET = (11 * (CHACHA_TOTAL_ENTRIES_PER_BATCH / CHACHA_NUM_BUCKETS)) / 10; -const uint64_t CHACHA_OUT_MAX_ENTRIES_NEEDED = (CHACHA_NUM_BUCKETS * CHACHA_MAX_ENTRIES_PER_BUCKET); - -struct xchacha_pair { - uint32_t x; - uint32_t chacha; -}; - -#define KBC_MASK_SHIFT 4 -#define KBC_MASK_MOD 8 -#define KBC_MASK_BITS 0b001111 -#define ATTACK_INTO_KBC_YS_BITMASK(chacha_y,i) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - uint32_t kbc_bucket_id = uint32_t (y / kBC); \ - uint32_t kbc_bitmask_bucket = kbc_bucket_id / KBC_MASK_MOD; \ - uint32_t kbc_bitmask_shift = KBC_MASK_SHIFT * (kbc_bucket_id % KBC_MASK_MOD); \ - uint32_t add = 1 << kbc_bitmask_shift; \ - uint slot_value = atomicAdd(&kbc_global_num_entries_L[kbc_bitmask_bucket],add); \ - uint slot = (slot_value >> kbc_bitmask_shift) & KBC_MASK_BITS; \ - if (slot > MAX_LXS_PER_KBC_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u\n", MAX_LXS_PER_KBC_BUCKET, slot); } \ - uint32_t entries_address = kbc_bucket_id * MAX_LXS_PER_KBC_BUCKET + slot; \ - kbc_global_Ly_entries_L[entries_address] = y; \ - kbc_x_entries[entries_address] = (x + i); \ -} - -__global__ -void gpu_chacha8_set_Lxs_into_kbc_ys_mask(const uint32_t N, - const __restrict__ uint32_t *input, - uint16_t *kbc_global_Ly_entries_L, uint32_t *kbc_x_entries, unsigned int *kbc_global_num_entries_L, uint32_t MAX_LXS_PER_KBC_BUCKET) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - - for (uint32_t x_group = index; x_group < end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += 
input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - ATTACK_INTO_KBC_YS_BITMASK(x0,0);ATTACK_INTO_KBC_YS_BITMASK(x1,1);ATTACK_INTO_KBC_YS_BITMASK(x2,2);ATTACK_INTO_KBC_YS_BITMASK(x3,3); - ATTACK_INTO_KBC_YS_BITMASK(x4,4);ATTACK_INTO_KBC_YS_BITMASK(x5,5);ATTACK_INTO_KBC_YS_BITMASK(x6,6);ATTACK_INTO_KBC_YS_BITMASK(x7,7); - ATTACK_INTO_KBC_YS_BITMASK(x8,8);ATTACK_INTO_KBC_YS_BITMASK(x9,9);ATTACK_INTO_KBC_YS_BITMASK(x10,10);ATTACK_INTO_KBC_YS_BITMASK(x11,11); - ATTACK_INTO_KBC_YS_BITMASK(x12,12);ATTACK_INTO_KBC_YS_BITMASK(x13,13);ATTACK_INTO_KBC_YS_BITMASK(x14,14);ATTACK_INTO_KBC_YS_BITMASK(x15,15); - } -} - -__global__ -void gpu_list_xchachas(const uint32_t N, const xchacha_pair *xchachas) -{ - uint index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < N) { - xchacha_pair pair = xchachas[index]; - uint64_t y = (((uint64_t) pair.chacha) << 6) + (pair.x >> 26); - uint32_t kbc_bucket_id = uint32_t (y / kBC); - printf("set xchachas kbc mask index: %u x: %u chacha: %u y: %llu kbc_bucket_id: %u\n", - index, pair.x, pair.chacha, y, kbc_bucket_id); - } -} - -__global__ -void gpu_chacha8_set_xchachas_into_kbc_ys_mask(const uint32_t N, - const xchacha_pair *xchachas, - uint16_t *kbc_global_Ly_entries_L, uint32_t *kbc_x_entries, unsigned int *kbc_global_num_entries_L, uint32_t MAX_LXS_PER_KBC_BUCKET) -{ - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < N) { - xchacha_pair pair = xchachas[index]; - uint64_t y = (((uint64_t) pair.chacha) << 6) + (pair.x >> 26); - uint32_t kbc_bucket_id = uint32_t (y / kBC); - if (index < 10) - printf("set xchachas kbc mask index: %u x: %u chacha: %u y: %llu kbc_bucket_id: %u\n", - index, pair.x, pair.chacha, y, kbc_bucket_id); - //uint32_t kbc_bitmask_bucket = kbc_bucket_id / KBC_MASK_MOD; - //uint32_t kbc_bitmask_shift = KBC_MASK_SHIFT * (kbc_bucket_id % KBC_MASK_MOD); - //uint32_t add = 1 << kbc_bitmask_shift; - //uint slot_value = atomicAdd(&kbc_global_num_entries_L[kbc_bitmask_bucket],add); - //uint slot = (slot_value >> kbc_bitmask_shift) & KBC_MASK_BITS; - - uint slot = atomicAdd(&kbc_global_num_entries_L[kbc_bucket_id],1); - - if (index < 10) { - printf("set xchachas kbc mask index: %u x: %u chacha: %u y: %llu kbc_bucket_id: %u\n", - index, pair.x, pair.chacha, y, kbc_bucket_id); - } - - //if (slot > MAX_LXS_PER_KBC_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u\n", MAX_LXS_PER_KBC_BUCKET, slot); } - //uint32_t entries_address = kbc_bucket_id * MAX_LXS_PER_KBC_BUCKET + slot; - //kbc_global_Ly_entries_L[entries_address] = y % kBC; - //kbc_x_entries[entries_address] = pair.x; - } - -} - -__global__ void gpu_get_max_counts_from_counter_list(unsigned int *kbc_counts, const int NUM, const bool printAll) { - __shared__ unsigned int max_kbc_count; - __shared__ unsigned int sum_kbc_count; - if (threadIdx.x == 0) { - max_kbc_count = 0; - sum_kbc_count = 0; - } - __syncthreads(); - for (uint32_t 
i=threadIdx.x;i> kbc_bitmask_shift) & KBC_MASK_BITS; - unsigned int kbc_count = kbc_counts[i]; - if (printAll) printf("id: %u count: %u\n", i, kbc_count); - atomicMax(&max_kbc_count, kbc_count); - atomicAdd(&sum_kbc_count, kbc_count); - } - __syncthreads(); - if (threadIdx.x == 0) printf("counter list counts SUM:%u MAX:%u\n", sum_kbc_count, max_kbc_count); -} - -#define ATTACK_BUCKETBATCH_CHACHAS32_PAIR(chacha_y,i) \ -{ \ - if ((chacha_y >= BATCH_CHACHA_RANGE_MIN) && (chacha_y <= BATCH_CHACHA_RANGE_MAX)) { \ - xchacha_pair pair = { base_x + i, chacha_y }; \ - int slot = atomicAdd(&local_filter_count,1); \ - if (slot > MAX_SHARED_CHACHAS) printf("MAX_SHARED_CHACHAS %u OVERFLOW %u\n", MAX_SHARED_CHACHAS, slot); \ - shared_chachas[slot] = pair; \ - uint32_t split_bucket_id = (chacha_y - BATCH_CHACHA_RANGE_MIN) / CHACHA_SPLIT_BUCKET_DIVISOR; \ - atomicAdd(&shared_counts[split_bucket_id],1); \ - } \ -} - -// run with 128 blocksize, more doesn't matter. -template -__global__ -void gpu_chacha8_k32_compute_chachas32_filter_buckets_bychachabatchrange(const uint32_t N, - const uint32_t BATCH_CHACHA_RANGE_MIN, const uint32_t BATCH_CHACHA_RANGE_MAX, - const uint32_t CHACHA_MAX_PER_SPLIT_BUCKET, const uint32_t CHACHA_SPLIT_BUCKET_DIVISOR, - const __restrict__ uint32_t *input, - xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - //__shared__ uint32_t datax[33*256]; // each thread (256 max) gets its own shared access starting at 32 byte boundary. - //uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - const uint32_t MAX_SHARED_CHACHAS = 128*8; // try to bring down as much as can - __shared__ xchacha_pair shared_chachas[MAX_SHARED_CHACHAS]; // *possibly* using 32 to prevent some bank conflicts can help, but don't thing so. 
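- // Shared budget check (sketch arithmetic): 128*8 pairs * sizeof(xchacha_pair) = 8KB
- // for the staging array, plus the two small count arrays below - far under the 48KB
- // default, so MAX_SHARED_CHACHAS has headroom to grow if the overflow printf fires.
- static_assert(MAX_SHARED_CHACHAS * sizeof(xchacha_pair) <= 49152,
- "staged xchacha pairs must fit in default shared memory");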
- __shared__ uint shared_counts[NUM_SPLIT_BUCKETS]; - __shared__ uint global_counts[NUM_SPLIT_BUCKETS]; - __shared__ uint local_filter_count; - - //if (blockDim.x > 128) printf("MUST HAVE BLOCKSIZE 128 (RECOMMENDED) OR LESS, OR INCREASED SHARED MEM TO MORE\n"); - - //uint32_t base_group = blockIdx.x * blockDim.x; - - uint32_t x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - uint32_t base_x = x_group * 32; - const uint32_t end_n = N / 32; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - for (int i=threadIdx.x;i> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+0],0);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+1],1);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+2],2);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+3],3); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+4],4);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+5],5);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+6],6);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+7],7); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+8],8);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+9],9);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+10],10);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+11],11); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+12],12);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+13],13);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+14],14);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+15],15); - - pos += 1; - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 
 - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+0],16+0);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+1],16+1);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+2],16+2);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+3],16+3); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+4],16+4);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+5],16+5);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+6],16+6);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+7],16+7); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+8],16+8);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+9],16+9);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+10],16+10);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+11],16+11); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+12],16+12);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+13],16+13);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+14],16+14);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+15],16+15); - } - // at this point we have 128*32 = 4096 entries - // now we have to sort them into the buckets - // we already have the shared counts set from the ATTACK macro - __syncthreads(); - for (int i=threadIdx.x;i<local_filter_count;i+=blockDim.x) { // (reconstructed: the original loop body was lost in extraction; bucket id and slot derivation inferred from the write below) - const xchacha_pair pair = shared_chachas[i]; - const uint32_t split_bucket_id = pair.x / CHACHA_SPLIT_BUCKET_DIVISOR; - int slot = atomicAdd(&xchachas_bucket_counts[split_bucket_id],1); - if (slot > CHACHA_MAX_PER_SPLIT_BUCKET) printf("Overflow CHACHA_MAX_PER_BUCKET %u SLOT %u\n", CHACHA_MAX_PER_SPLIT_BUCKET, slot); - else xchachas_buckets[CHACHA_MAX_PER_SPLIT_BUCKET * split_bucket_id + slot] = shared_chachas[i]; - } -} - -#define CHECK_MATCH() \ -{ \ - int16_t yr_kbc = Ry % kBC; \ - int16_t yr_bid = yr_kbc / kC; \ - int16_t yl_bid = yl_kbc / kC; \ - int16_t formula_one = yr_bid - yl_bid; \ - if (formula_one < 0) { \ - formula_one += kB; \ - } \ - int16_t m = formula_one; \ - if (m >= kB) { \ - m -= kB; \ - } \ - if (m < 64) { \ - int16_t yl_cid = yl_kbc % kC; \ - int16_t yr_cid = yr_kbc % kC;\ - int16_t parity = (kbc_bucket_id_L) % 2; \ - int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; \ - int16_t formula_two = yr_cid - yl_cid; \ - if (formula_two < 0) { \ - formula_two += kC; \ - } \ - if (formula_two == m2_parity_squared) { \ - isMatch = true; \ - } \ - } \ -}
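// --- Editor's note: illustrative sketch, not part of the original file. ---
// CHECK_MATCH() above encodes Chia's kBC matching condition. A standalone,
// host-side version of the same predicate (assuming the chiapos constants
// kB = 119, kC = 127, so kBC = 15113, and kExtraBits = 6 so m ranges over
// [0, 64)) would look like this:
static inline bool kbc_is_match(int16_t yl_kbc, int16_t yr_kbc, int16_t parity)
{
    const int16_t kB_ = 119, kC_ = 127;           // chiapos constants (assumed)
    int16_t m = (yr_kbc / kC_) - (yl_kbc / kC_);  // distance between B-ids
    if (m < 0) m += kB_;
    if (m >= kB_) m -= kB_;
    if (m >= 64) return false;                    // only 64 candidate slots match
    int16_t m2 = (int16_t)((((2 * m) + parity) * ((2 * m) + parity)) % kC_);
    int16_t diff_c = (yr_kbc % kC_) - (yl_kbc % kC_);
    if (diff_c < 0) diff_c += kC_;
    return diff_c == m2;
}
// The left bucket's parity ((kbc_bucket_id_L) % 2) selects which quadratic
// residue table applies, matching the macro above.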
-__global__ -void gpu_chacha8_filter_rxs_from_bucket_batch( - const uint32_t N, - const xchacha_pair* __restrict__ xchachas, - const uint16_t* __restrict__ kbc_global_Ly_entries_L, - const unsigned int* __restrict__ kbc_global_num_entries_L, - uint32_t MAX_LXS_PER_KBC_BUCKET, - uint32_t * __restrict__ rxs, - int *rx_count) -{ - int i = blockIdx.x*blockDim.x+threadIdx.x; - if (i < N) { - xchacha_pair entry = xchachas[i]; - uint64_t Ry = (((uint64_t) entry.chacha) << 6) + (entry.x >> 26); - int kbc_bucket_id_R = (uint32_t (Ry / kBC)); - if (kbc_bucket_id_R > 0) { - int kbc_bucket_id_L = kbc_bucket_id_R - 1; - //printf("entry x:%u chacha:%u\n", entry.x, entry.chacha, kbc_bucket_id_L); - //int num = kbc_global_num_entries_L[kbc_bucket_id_L]; - - //uint num = kbc_global_num_entries_L[kbc_bucket_id_L]; - uint32_t kbc_bitmask_bucket = kbc_bucket_id_L / KBC_MASK_MOD; - uint32_t kbc_bitmask_shift = KBC_MASK_SHIFT * (kbc_bucket_id_L % KBC_MASK_MOD); - uint slot_value = kbc_global_num_entries_L[kbc_bitmask_bucket]; - uint num = (slot_value >> kbc_bitmask_shift) & KBC_MASK_BITS; - for (int nm=0;nm<num;nm++) { // (loop body reconstructed -- the original span from here through the host-side setup below was lost in extraction) - bool isMatch = false; - int16_t yl_kbc = kbc_global_Ly_entries_L[kbc_bucket_id_L * MAX_LXS_PER_KBC_BUCKET + nm]; - CHECK_MATCH(); - if (isMatch) { - int slot = atomicAdd(&rx_count[0],1); - rxs[slot] = entry.x; - } - } - } - } -} - -// [host-side attack function opening, device allocations and start-of-attack timing lost in extraction] - std::cout << " alloc time: " << std::chrono::duration_cast<milli>(alloc_finish - attack_start).count() << " ms\n"; - - auto compute_only_start = std::chrono::high_resolution_clock::now(); - - int blockSize; // # of threads per block, maximum is 1024. - uint64_t calc_N; - uint64_t calc_blockSize; - uint64_t calc_numBlocks; - int numBlocks; - - // FIRST SET LXS into global memory, these stay put for each chacha round - /*blockSize = 256; // # of threads per block, maximum is 1024. - calc_N = 1 << (32 - XPAIR_BITS); - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - numBlocks = calc_numBlocks; - - std::cout << " gpu_chacha8_set_Lxs_into_kbc_ys num:" << calc_N << std::endl; - auto lxintokbc_start = std::chrono::high_resolution_clock::now(); - gpu_chacha8_set_Lxs_into_kbc_ys_mask<<<numBlocks, blockSize>>>(calc_N, chacha_input, - kbc_Ly_entries, kbc_x_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto lxintokbc_finish = std::chrono::high_resolution_clock::now(); - std::cout << " gpu_chacha8_set_Lxs_into_kbc_ys time: " << std::chrono::duration_cast<milli>(lxintokbc_finish - lxintokbc_start).count() << " ms\n"; - gpu_get_max_counts_from_counter_list<<<1,1024>>>(device_global_kbc_num_entries_L, kBC_NUM_BUCKETS, false); - CUDA_CHECK_RETURN(cudaDeviceSynchronize());*/ - - auto chacha_batches_start = std::chrono::high_resolution_clock::now(); - int64_t total_chacha_ms = 0; - uint32_t sum_counts = 0; - for (uint64_t chacha_batch_id = 0; chacha_batch_id < 1/*CHACHA_NUM_BATCHES*/; chacha_batch_id++) { - //std::cout << "Doing chacha batch " << chacha_batch_id << std::endl; - uint64_t BATCH_CHACHA_DIVISOR = (1 << (32 - CHACHA_NUM_BATCHES_BITS)); - uint64_t BATCH_CHACHA_RANGE_MIN = ((uint64_t) (chacha_batch_id + 0)) * BATCH_CHACHA_DIVISOR; - uint64_t BATCH_CHACHA_RANGE_MAX = ((uint64_t) (chacha_batch_id + 1)) * BATCH_CHACHA_DIVISOR - 1; // use -1 since range is inclusive, also helps stay in 32-bit range rather than wrap to 0 for last batch - - //std::cout << " BATCH_CHACHA_DIVISOR : " << BATCH_CHACHA_DIVISOR << std::endl; - //std::cout << " BATCH_CHACHA_RANGE : " << BATCH_CHACHA_RANGE_MIN << " <-> " << BATCH_CHACHA_RANGE_MAX << std::endl; - //std::cout << " BATCH_CHACHA_TOTAL_ENTRIES : " << CHACHA_TOTAL_ENTRIES_PER_BATCH << std::endl; - //std::cout << " CHACHA_MAX_ENTRIES_PER_BUCKET : " << CHACHA_MAX_ENTRIES_PER_BUCKET << std::endl; - //std::cout << " CHACHA_SPLIT_BUCKET_DIVISOR : " << CHACHA_SPLIT_BUCKET_DIVISOR << std::endl;
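 - // (Editor's annotation, not original code.) Sizing note for the launch below:
 - // each thread of the chachas32 kernel consumes 32 x values, hence
 - // numBlocks = ceil(calc_N / (blockSize * 32)). For example, assuming
 - // XPAIR_BITS = 4: calc_N = 1 << 28, and with blockSize = 128 that gives
 - // numBlocks = 2^28 / (128 * 32) = 65536.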
 - - blockSize = 128; // # of threads per block, maximum is 1024. - calc_N = 1 << (32 - XPAIR_BITS); //CHACHA_TOTAL_ENTRIES_PER_BATCH; - uint32_t CHACHA_X_START = chacha_batch_id * calc_N; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 32); - numBlocks = calc_numBlocks; - CUDA_CHECK_RETURN(cudaMemset(lxchachas_bucket_counts, 0, CHACHA_NUM_BUCKETS*sizeof(int))); - auto chacha_start = std::chrono::high_resolution_clock::now(); - //std::cout << " calc_N : " << calc_N << " numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl; - gpu_chacha8_k32_compute_chachas32_filter_buckets_bychachabatchrange<<<numBlocks, blockSize>>>(calc_N, - BATCH_CHACHA_RANGE_MIN, BATCH_CHACHA_RANGE_MAX, - CHACHA_MAX_ENTRIES_PER_BUCKET, CHACHA_SPLIT_BUCKET_DIVISOR, - chacha_input, - lxchachas, lxchachas_bucket_counts); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto chacha_finish = std::chrono::high_resolution_clock::now(); - total_chacha_ms += std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count(); - gpu_get_max_counts_from_counter_list<<<1,1>>>(lxchachas_bucket_counts, CHACHA_NUM_BUCKETS, true); - //auto chacha_rs_start = std::chrono::high_resolution_clock::now(); - - - for (uint chacha_bucket_id=0;chacha_bucket_id<CHACHA_NUM_BUCKETS;chacha_bucket_id++) { // (loop bound reconstructed -- lost in extraction) - gpu_show_xchachas<<<1,1>>>(10, &lxchachas[chacha_bucket_id]); // (kernel name reconstructed -- original lost in extraction) - blockSize = 256; // # of threads per block, maximum is 1024. - calc_N = lxchachas_bucket_counts[chacha_bucket_id]; - sum_counts += calc_N; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize); - numBlocks = calc_numBlocks; - std::cout << "Setting kbcs calc_N: " << calc_N << " numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl; - //gpu_chacha8_set_xchachas_into_kbc_ys_mask<<<numBlocks, blockSize>>>(calc_N, &lxchachas[chacha_bucket_id], - // kbc_Ly_entries, kbc_x_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET); - } - std::cout << "sum counts: " << sum_counts << std::endl; - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - //std::cout << " gpu_chacha8_k32_write_chachas32_buckets results: " << std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count() << " ms\n"; - } - - gpu_get_max_counts_from_counter_list<<<1,1>>>(device_global_kbc_num_entries_L, 100, true);//kBC_NUM_BUCKETS, false); - - - /*for (uint64_t chacha_batch_id = 0; chacha_batch_id < CHACHA_NUM_BATCHES; chacha_batch_id++) { - //std::cout << "Doing chacha batch " << chacha_batch_id << std::endl; - uint64_t BATCH_CHACHA_DIVISOR = (1 << (32 - CHACHA_NUM_BATCHES_BITS)); - uint64_t BATCH_CHACHA_RANGE_MIN = ((uint64_t) (chacha_batch_id + 0)) * BATCH_CHACHA_DIVISOR; - uint64_t BATCH_CHACHA_RANGE_MAX = ((uint64_t) (chacha_batch_id + 1)) * BATCH_CHACHA_DIVISOR - 1; // use -1 since range is inclusive, also helps stay in 32-bit range rather than wrap to 0 for last batch - - //std::cout << " BATCH_CHACHA_DIVISOR : " << BATCH_CHACHA_DIVISOR << std::endl; - //std::cout << " BATCH_CHACHA_RANGE : " << BATCH_CHACHA_RANGE_MIN << " <-> " << BATCH_CHACHA_RANGE_MAX << std::endl; - //std::cout << " BATCH_CHACHA_TOTAL_ENTRIES : " << CHACHA_TOTAL_ENTRIES_PER_BATCH << std::endl; - //std::cout << " CHACHA_MAX_ENTRIES_PER_BUCKET : " << CHACHA_MAX_ENTRIES_PER_BUCKET << std::endl; - //std::cout << " CHACHA_SPLIT_BUCKET_DIVISOR : " << CHACHA_SPLIT_BUCKET_DIVISOR << std::endl; - - blockSize = 128; // # of threads per block, maximum is 1024. 
- calc_N = 1 << (32 - XPAIR_BITS); //CHACHA_TOTAL_ENTRIES_PER_BATCH; - uint32_t CHACHA_X_START = 0;//chacha_batch_id * calc_N; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 32); - numBlocks = calc_numBlocks; - CUDA_CHECK_RETURN(cudaMemset(rxchachas_bucket_counts, 0, CHACHA_NUM_BUCKETS*sizeof(int))); - auto chacha_start = std::chrono::high_resolution_clock::now(); - //std::cout << " calc_N : " << calc_N << " numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl; - gpu_chacha8_k32_compute_chachas32_filter_buckets_bychachabatchrange<<<numBlocks, blockSize>>>(calc_N, - BATCH_CHACHA_RANGE_MIN, BATCH_CHACHA_RANGE_MAX, - CHACHA_MAX_ENTRIES_PER_BUCKET, CHACHA_SPLIT_BUCKET_DIVISOR, - chacha_input, - rxchachas, rxchachas_bucket_counts); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto chacha_finish = std::chrono::high_resolution_clock::now(); - total_chacha_ms += std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count(); - //gpu_get_max_counts_from_counter_list<<<1,1>>>(xchachas_bucket_counts, CHACHA_NUM_BUCKETS, true); - //auto chacha_rs_start = std::chrono::high_resolution_clock::now(); - - for (uint chacha_bucket_id=0;chacha_bucket_id<CHACHA_NUM_BUCKETS;chacha_bucket_id++) { (loop bound and kernel launch reconstructed -- lost in extraction; arguments match the kernel defined above) - gpu_chacha8_filter_rxs_from_bucket_batch<<<numBlocks, blockSize>>>( - calc_N, - &rxchachas[chacha_bucket_id], - kbc_Ly_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET, - rx_match_list, rx_match_count); - } - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - //std::cout << " gpu_chacha8_k32_write_chachas32_buckets results: " << std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count() << " ms\n"; - }*/ - - - - auto compute_only_finish = std::chrono::high_resolution_clock::now(); - - std::cout << "Freeing memory..." << std::endl; - CUDA_CHECK_RETURN(cudaFree(kbc_Ly_entries)); - CUDA_CHECK_RETURN(cudaFree(kbc_x_entries)); - CUDA_CHECK_RETURN(cudaFree(device_global_kbc_num_entries_L)); - - auto attack_finish = std::chrono::high_resolution_clock::now(); - std::cout << " found " << rx_match_count[0] << " matches" << std::endl; - std::cout << " total chachas time: " << total_chacha_ms << " ms\n"; - std::cout << " compute only time: " << std::chrono::duration_cast<milli>(compute_only_finish - compute_only_start).count() << " ms\n"; - std::cout << " attack total time: " << std::chrono::duration_cast<milli>(attack_finish - attack_start).count() << " ms\n"; - std::cout << "end." 
<< std::endl; -} - - -#endif /* ATTACK_METHOD_XPAIRBITS_HPP_ */ diff --git a/chia/chacha8.c b/chia/chacha8.c deleted file mode 100644 index dc707a7..0000000 --- a/chia/chacha8.c +++ /dev/null @@ -1,355 +0,0 @@ -#include "chacha8.h" - - - -static const char sigma[16] = "expand 32-byte k"; -static const char tau[16] = "expand 16-byte k"; - -void chacha8_keysetup_data(uint32_t *input, const uint8_t *k, uint32_t kbits, const uint8_t *iv) -{ - const char *constants; - - input[4] = U8TO32_LITTLE(k + 0); - input[5] = U8TO32_LITTLE(k + 4); - input[6] = U8TO32_LITTLE(k + 8); - input[7] = U8TO32_LITTLE(k + 12); - if (kbits == 256) { /* recommended */ - k += 16; - constants = sigma; - } else { /* kbits == 128 */ - constants = tau; - } - input[8] = U8TO32_LITTLE(k + 0); - input[9] = U8TO32_LITTLE(k + 4); - input[10] = U8TO32_LITTLE(k + 8); - input[11] = U8TO32_LITTLE(k + 12); - input[0] = U8TO32_LITTLE(constants + 0); - input[1] = U8TO32_LITTLE(constants + 4); - input[2] = U8TO32_LITTLE(constants + 8); - input[3] = U8TO32_LITTLE(constants + 12); - if (iv) { - input[14] = U8TO32_LITTLE(iv + 0); - input[15] = U8TO32_LITTLE(iv + 4); - } else { - input[14] = 0; - input[15] = 0; - } - - //for (int i=0;i<=15;i++) { - // printf("\ninput%d:%u", i, input[i]); - //} - //exit(0); -} - -void chacha8_get_keystream_data(const uint32_t *input, uint64_t pos, uint32_t n_blocks, uint8_t *c) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; - int i; - - j0 = input[0]; - j1 = input[1]; - j2 = input[2]; - j3 = input[3]; - j4 = input[4]; - j5 = input[5]; - j6 = input[6]; - j7 = input[7]; - j8 = input[8]; - j9 = input[9]; - j10 = input[10]; - j11 = input[11]; - j12 = pos; - j13 = pos >> 32; - j14 = input[14]; - j15 = input[15]; - - while (n_blocks--) { - x0 = j0; - x1 = j1; - x2 = j2; - x3 = j3; - x4 = j4; - x5 = j5; - x6 = j6; - x7 = j7; - x8 = j8; - x9 = j9; - x10 = j10; - x11 = j11; - x12 = j12; - x13 = j13; - x14 = j14; - x15 = j15; - for (i = 8; i > 0; i -= 2) { - QUARTERROUND(x0, x4, x8, x12); - QUARTERROUND(x1, x5, x9, x13); - QUARTERROUND(x2, x6, x10, x14); - QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15); - QUARTERROUND(x1, x6, x11, x12); - QUARTERROUND(x2, x7, x8, x13); - QUARTERROUND(x3, x4, x9, x14); - } - x0 = PLUS(x0, j0); - x1 = PLUS(x1, j1); - x2 = PLUS(x2, j2); - x3 = PLUS(x3, j3); - x4 = PLUS(x4, j4); - x5 = PLUS(x5, j5); - x6 = PLUS(x6, j6); - x7 = PLUS(x7, j7); - x8 = PLUS(x8, j8); - x9 = PLUS(x9, j9); - x10 = PLUS(x10, j10); - x11 = PLUS(x11, j11); - x12 = PLUS(x12, j12); - x13 = PLUS(x13, j13); - x14 = PLUS(x14, j14); - x15 = PLUS(x15, j15); - - j12 = PLUSONE(j12); - if (!j12) { - j13 = PLUSONE(j13); - /* stopping at 2^70 bytes per nonce is user's responsibility */ - } - - U32TO8_LITTLE(c + 0, x0); - U32TO8_LITTLE(c + 4, x1); - U32TO8_LITTLE(c + 8, x2); - U32TO8_LITTLE(c + 12, x3); - U32TO8_LITTLE(c + 16, x4); - U32TO8_LITTLE(c + 20, x5); - U32TO8_LITTLE(c + 24, x6); - U32TO8_LITTLE(c + 28, x7); - U32TO8_LITTLE(c + 32, x8); - U32TO8_LITTLE(c + 36, x9); - U32TO8_LITTLE(c + 40, x10); - U32TO8_LITTLE(c + 44, x11); - U32TO8_LITTLE(c + 48, x12); - U32TO8_LITTLE(c + 52, x13); - U32TO8_LITTLE(c + 56, x14); - U32TO8_LITTLE(c + 60, x15); - - c += 64; - } -} - -void chacha8_get_k32_keystream_data(const uint32_t *input, uint64_t pos, uint32_t n_blocks, uint32_t *c) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - uint32_t j0, j1, j2, j3, j4, j5, 
j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; - int i; - - j0 = input[0]; - j1 = input[1]; - j2 = input[2]; - j3 = input[3]; - j4 = input[4]; - j5 = input[5]; - j6 = input[6]; - j7 = input[7]; - j8 = input[8]; - j9 = input[9]; - j10 = input[10]; - j11 = input[11]; - j12 = pos; - j13 = pos >> 32; - j14 = input[14]; - j15 = input[15]; - - while (n_blocks--) { - x0 = j0; - x1 = j1; - x2 = j2; - x3 = j3; - x4 = j4; - x5 = j5; - x6 = j6; - x7 = j7; - x8 = j8; - x9 = j9; - x10 = j10; - x11 = j11; - x12 = j12; - x13 = j13; - x14 = j14; - x15 = j15; - for (i = 8; i > 0; i -= 2) { - QUARTERROUND(x0, x4, x8, x12); - QUARTERROUND(x1, x5, x9, x13); - QUARTERROUND(x2, x6, x10, x14); - QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15); - QUARTERROUND(x1, x6, x11, x12); - QUARTERROUND(x2, x7, x8, x13); - QUARTERROUND(x3, x4, x9, x14); - } - x0 += j0; - x1 += j1; - x2 += j2; - x3 += j3; - x4 += j4; - x5 += j5; - x6 += j6; - x7 += j7; - x8 += j8; - x9 += j9; - x10 += j10; - x11 += j11; - x12 += j12; - x13 += j13; - x14 += j14; - x15 += j15; - - j12 = PLUSONE(j12); - if (!j12) { - j13 = PLUSONE(j13); - /* stopping at 2^70 bytes per nonce is user's responsibility */ - } - c[0] = __builtin_bswap32(x0); - c[1] = __builtin_bswap32(x1); - c[2] = __builtin_bswap32(x2); - c[3] = __builtin_bswap32(x3); - c[4] = __builtin_bswap32(x4); - c[5] = __builtin_bswap32(x5); - c[6] = __builtin_bswap32(x6); - c[7] = __builtin_bswap32(x7); - c[8] = __builtin_bswap32(x8); - c[9] = __builtin_bswap32(x9); - c[10] = __builtin_bswap32(x10); - c[11] = __builtin_bswap32(x11); - c[12] = __builtin_bswap32(x12); - c[13] = __builtin_bswap32(x13); - c[14] = __builtin_bswap32(x14); - c[15] = __builtin_bswap32(x15); - - c += 16; - } -} - - -void chacha8_keysetup(struct chacha8_ctx *x, const uint8_t *k, uint32_t kbits, const uint8_t *iv) -{ - const char *constants; - - x->input[4] = U8TO32_LITTLE(k + 0); - x->input[5] = U8TO32_LITTLE(k + 4); - x->input[6] = U8TO32_LITTLE(k + 8); - x->input[7] = U8TO32_LITTLE(k + 12); - if (kbits == 256) { /* recommended */ - k += 16; - constants = sigma; - } else { /* kbits == 128 */ - constants = tau; - } - x->input[8] = U8TO32_LITTLE(k + 0); - x->input[9] = U8TO32_LITTLE(k + 4); - x->input[10] = U8TO32_LITTLE(k + 8); - x->input[11] = U8TO32_LITTLE(k + 12); - x->input[0] = U8TO32_LITTLE(constants + 0); - x->input[1] = U8TO32_LITTLE(constants + 4); - x->input[2] = U8TO32_LITTLE(constants + 8); - x->input[3] = U8TO32_LITTLE(constants + 12); - if (iv) { - x->input[14] = U8TO32_LITTLE(iv + 0); - x->input[15] = U8TO32_LITTLE(iv + 4); - } else { - x->input[14] = 0; - x->input[15] = 0; - } -} - -void chacha8_get_keystream(const struct chacha8_ctx *x, uint64_t pos, uint32_t n_blocks, uint8_t *c) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; - int i; - - j0 = x->input[0]; - j1 = x->input[1]; - j2 = x->input[2]; - j3 = x->input[3]; - j4 = x->input[4]; - j5 = x->input[5]; - j6 = x->input[6]; - j7 = x->input[7]; - j8 = x->input[8]; - j9 = x->input[9]; - j10 = x->input[10]; - j11 = x->input[11]; - j12 = pos; - j13 = pos >> 32; - j14 = x->input[14]; - j15 = x->input[15]; - - while (n_blocks--) { - x0 = j0; - x1 = j1; - x2 = j2; - x3 = j3; - x4 = j4; - x5 = j5; - x6 = j6; - x7 = j7; - x8 = j8; - x9 = j9; - x10 = j10; - x11 = j11; - x12 = j12; - x13 = j13; - x14 = j14; - x15 = j15; - for (i = 8; i > 0; i -= 2) { - QUARTERROUND(x0, x4, x8, x12); - QUARTERROUND(x1, x5, x9, x13); - 
QUARTERROUND(x2, x6, x10, x14); - QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15); - QUARTERROUND(x1, x6, x11, x12); - QUARTERROUND(x2, x7, x8, x13); - QUARTERROUND(x3, x4, x9, x14); - } - x0 = PLUS(x0, j0); - x1 = PLUS(x1, j1); - x2 = PLUS(x2, j2); - x3 = PLUS(x3, j3); - x4 = PLUS(x4, j4); - x5 = PLUS(x5, j5); - x6 = PLUS(x6, j6); - x7 = PLUS(x7, j7); - x8 = PLUS(x8, j8); - x9 = PLUS(x9, j9); - x10 = PLUS(x10, j10); - x11 = PLUS(x11, j11); - x12 = PLUS(x12, j12); - x13 = PLUS(x13, j13); - x14 = PLUS(x14, j14); - x15 = PLUS(x15, j15); - - j12 = PLUSONE(j12); - if (!j12) { - j13 = PLUSONE(j13); - /* stopping at 2^70 bytes per nonce is user's responsibility */ - } - - U32TO8_LITTLE(c + 0, x0); - U32TO8_LITTLE(c + 4, x1); - U32TO8_LITTLE(c + 8, x2); - U32TO8_LITTLE(c + 12, x3); - U32TO8_LITTLE(c + 16, x4); - U32TO8_LITTLE(c + 20, x5); - U32TO8_LITTLE(c + 24, x6); - U32TO8_LITTLE(c + 28, x7); - U32TO8_LITTLE(c + 32, x8); - U32TO8_LITTLE(c + 36, x9); - U32TO8_LITTLE(c + 40, x10); - U32TO8_LITTLE(c + 44, x11); - U32TO8_LITTLE(c + 48, x12); - U32TO8_LITTLE(c + 52, x13); - U32TO8_LITTLE(c + 56, x14); - U32TO8_LITTLE(c + 60, x15); - - c += 64; - } -} diff --git a/chia/chacha8.h b/chia/chacha8.h deleted file mode 100644 index c0d844d..0000000 --- a/chia/chacha8.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef SRC_CHACHA8_H_ -#define SRC_CHACHA8_H_ - -#include <stdint.h> /* (header name lost in extraction; stdint.h assumed for the uintN_t types below) */ - -struct chacha8_ctx { - uint32_t input[16]; -}; - -#ifdef __cplusplus -extern "C" { -#endif - -// blake... -/*#define NICK_ROTR32(w,c) \ - (((w) >> (c)) | ((w) << (32 - (c)))) - -#define NICK_G(a,b,c,d,x,y) \ - state[a] = state[a] + state[b] + x; \ - state[d] = NICK_ROTR32(state[d] ^ state[a], 16); \ - state[c] = state[c] + state[d]; \ - state[b] = NICK_ROTR32(state[b] ^ state[c], 12); \ - state[a] = state[a] + state[b] + y; \ - state[d] = NICK_ROTR32(state[d] ^ state[a], 8); \ - state[c] = state[c] + state[d]; \ - state[b] = NICK_ROTR32(state[b] ^ state[c], 7); \ - -#define NICK_LOAD32(block,i) \ - ((uint32_t)(block[i+0]) << 0) | ((uint32_t)(block[i+1]) << 8) | ((uint32_t)(block[i+2]) << 16) | ((uint32_t)(block[i+3]) << 24) -// end blake*/ - -#define U32TO32_LITTLE(v) (v) -#define U8TO32_LITTLE(p) (*(const uint32_t *)(p)) -#define U32TO8_LITTLE(p, v) (((uint32_t *)(p))[0] = U32TO32_LITTLE(v)) -#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n)))) - -#define ROTATE(v, c) (ROTL32(v, c)) -#define XOR(v, w) ((v) ^ (w)) -#define PLUS(v, w) ((v) + (w)) -#define PLUSONE(v) (PLUS((v), 1)) - -#define QUARTERROUND(a, b, c, d) \ - a = PLUS(a, b); \ - d = ROTATE(XOR(d, a), 16); \ - c = PLUS(c, d); \ - b = ROTATE(XOR(b, c), 12); \ - a = PLUS(a, b); \ - d = ROTATE(XOR(d, a), 8); \ - c = PLUS(c, d); \ - b = ROTATE(XOR(b, c), 7) - -#define BYTESWAP32(x) \ - x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16; \ - x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8 - -void chacha8_keysetup_data(uint32_t *input, const uint8_t *k, uint32_t kbits, const uint8_t *iv); -void chacha8_keysetup(struct chacha8_ctx *x, const uint8_t *k, uint32_t kbits, const uint8_t *iv); -void chacha8_get_k32_keystream_data(const uint32_t *input, uint64_t pos, uint32_t n_blocks, uint32_t *c); -void chacha8_get_keystream_data(const uint32_t *input,uint64_t pos,uint32_t n_blocks,uint8_t *c); -void chacha8_get_keystream( - const struct chacha8_ctx *x, - uint64_t pos, - uint32_t n_blocks, - uint8_t *c); - -#ifdef __cplusplus -} -#endif - -#endif // SRC_CHACHA8_H_ diff --git a/chia/util.hpp b/chia/util.hpp deleted file mode 100644 index f114e1f..0000000 --- a/chia/util.hpp +++ 
/dev/null @@ -1,378 +0,0 @@ -// Copyright 2018 Chia Network Inc - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SRC_CPP_UTIL_HPP_ -#define SRC_CPP_UTIL_HPP_ - -#include <algorithm> // (the 15 header names here were lost in extraction; reconstructed from usage in this file) -#include <chrono> -#include <cmath> -#include <cstring> -#include <fstream> -#include <iomanip> -#include <iostream> -#include <map> -#include <queue> -#include <random> -#include <set> -#include <sstream> -#include <string> -#include <utility> -#include <vector> - -template <typename Int> -constexpr inline Int cdiv(Int a, int b) { return (a + b - 1) / b; } - -#ifdef _WIN32 -#define NOMINMAX -#include <windows.h> // (header names reconstructed -- lost in extraction) -#include <processthreadsapi.h> -#include "uint128_t.h" -#else -// __uint__128_t is only available in 64 bit architectures and on certain -// compilers. -typedef __uint128_t uint128_t; - -// Allows printing of uint128_t -std::ostream &operator<<(std::ostream &strm, uint128_t const &v) -{ - strm << "uint128(" << (uint64_t)(v >> 64) << "," << (uint64_t)(v & (((uint128_t)1 << 64) - 1)) - << ")"; - return strm; -} - -#endif - -// compiler-specific byte swap macros. -#if defined(_MSC_VER) - -#include <stdlib.h> // (header name reconstructed -- lost in extraction; the _byteswap intrinsics live in the MSVC CRT) - -// https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/byteswap-uint64-byteswap-ulong-byteswap-ushort?view=msvc-160 -inline uint16_t bswap_16(uint16_t x) { return _byteswap_ushort(x); } -inline uint32_t bswap_32(uint32_t x) { return _byteswap_ulong(x); } -inline uint64_t bswap_64(uint64_t x) { return _byteswap_uint64(x); } - -#elif defined(__clang__) || defined(__GNUC__) - -inline uint16_t bswap_16(uint16_t x) { return __builtin_bswap16(x); } -inline uint32_t bswap_32(uint32_t x) { return __builtin_bswap32(x); } -inline uint64_t bswap_64(uint64_t x) { return __builtin_bswap64(x); } - -#else -#error "unknown compiler, don't know how to swap bytes" -#endif - -/* Platform-specific cpuid include. 
 */ -#if defined(_WIN32) -#include <intrin.h> -#elif defined(__x86_64__) -#include <cpuid.h> -#endif - -class Timer { -public: - Timer() - { - wall_clock_time_start_ = std::chrono::steady_clock::now(); -#if _WIN32 - ::GetProcessTimes(::GetCurrentProcess(), &ft_[3], &ft_[2], &ft_[1], &ft_[0]); -#else - cpu_time_start_ = clock(); -#endif - } - - static char *GetNow() - { - auto now = std::chrono::system_clock::now(); - auto tt = std::chrono::system_clock::to_time_t(now); - return ctime(&tt); // ctime includes newline - } - - void PrintElapsed(const std::string &name) const - { - auto end = std::chrono::steady_clock::now(); - auto wall_clock_ms = std::chrono::duration_cast<std::chrono::milliseconds>( - end - this->wall_clock_time_start_) - .count(); - -#if _WIN32 - FILETIME nowft_[6]; - nowft_[0] = ft_[0]; - nowft_[1] = ft_[1]; - - ::GetProcessTimes(::GetCurrentProcess(), &nowft_[5], &nowft_[4], &nowft_[3], &nowft_[2]); - ULARGE_INTEGER u[4]; - for (size_t i = 0; i < 4; ++i) { - u[i].LowPart = nowft_[i].dwLowDateTime; - u[i].HighPart = nowft_[i].dwHighDateTime; - } - double user = (u[2].QuadPart - u[0].QuadPart) / 10000.0; - double kernel = (u[3].QuadPart - u[1].QuadPart) / 10000.0; - double cpu_time_ms = user + kernel; -#else - double cpu_time_ms = - 1000.0 * (static_cast<double>(clock()) - this->cpu_time_start_) / CLOCKS_PER_SEC; -#endif - - double cpu_ratio = static_cast<int>(10000 * (cpu_time_ms / wall_clock_ms)) / 100.0; - - std::cout << name << " " << (wall_clock_ms / 1000.0) << " seconds. CPU (" << cpu_ratio - << "%) " << Timer::GetNow(); - } - -private: - std::chrono::time_point<std::chrono::steady_clock> wall_clock_time_start_; -#if _WIN32 - FILETIME ft_[4]; -#else - clock_t cpu_time_start_; -#endif - -}; - -namespace Util { - - template <typename X> - inline X Mod(X i, X n) - { - return (i % n + n) % n; - } - - inline uint32_t ByteAlign(uint32_t num_bits) { return (num_bits + (8 - ((num_bits) % 8)) % 8); } - - inline std::string HexStr(const uint8_t *data, size_t len) - { - std::stringstream s; - s << std::hex; - for (size_t i = 0; i < len; ++i) - s << std::setw(2) << std::setfill('0') << static_cast<int>(data[i]); - s << std::dec; - return s.str(); - } - - inline void IntToTwoBytes(uint8_t *result, const uint16_t input) - { - uint16_t r = bswap_16(input); - memcpy(result, &r, sizeof(r)); - } - - // Used to encode deltas object size - inline void IntToTwoBytesLE(uint8_t *result, const uint16_t input) - { - result[0] = input & 0xff; - result[1] = input >> 8; - } - - inline uint16_t TwoBytesToInt(const uint8_t *bytes) - { - uint16_t i; - memcpy(&i, bytes, sizeof(i)); - return bswap_16(i); - } - - /* - * Converts a 64 bit int to bytes. - */ - inline void IntToEightBytes(uint8_t *result, const uint64_t input) - { - uint64_t r = bswap_64(input); - memcpy(result, &r, sizeof(r)); - } - - /* - * Converts a byte array to a 64 bit int. - */ - inline uint64_t EightBytesToInt(const uint8_t *bytes) - { - uint64_t i; - memcpy(&i, bytes, sizeof(i)); - return bswap_64(i); - } - - static void IntTo16Bytes(uint8_t *result, const uint128_t input) - { - uint64_t r = bswap_64(input >> 64); - memcpy(result, &r, sizeof(r)); - r = bswap_64((uint64_t)input); - memcpy(result + 8, &r, sizeof(r)); - } - - /* - * Retrieves the size of an integer, in Bits. - */ - inline uint8_t GetSizeBits(uint128_t value) - { - uint8_t count = 0; - while (value) { - count++; - value >>= 1; - } - return count; - } - - // 'bytes' points to a big-endian 64 bit value (possibly truncated, if - // (start_bit % 8 + num_bits > 64)). Returns the integer that starts at - // 'start_bit' that is 'num_bits' long (as a native-endian integer). 
- // - // Note: requires that 8 bytes after the first sliced byte are addressable - // (regardless of 'num_bits'). In practice it can be ensured by allocating - // extra 7 bytes to all memory buffers passed to this function. - inline uint64_t SliceInt64FromBytes( - const uint8_t *bytes, - uint32_t start_bit, - const uint32_t num_bits) - { - uint64_t tmp; - - if (start_bit + num_bits > 64) { - bytes += start_bit / 8; - start_bit %= 8; - } - - tmp = Util::EightBytesToInt(bytes); - tmp <<= start_bit; - tmp >>= 64 - num_bits; - return tmp; - } - - inline uint64_t SliceInt64FromBytesFull( - const uint8_t *bytes, - uint32_t start_bit, - uint32_t num_bits) - { - uint32_t last_bit = start_bit + num_bits; - uint64_t r = SliceInt64FromBytes(bytes, start_bit, num_bits); - if (start_bit % 8 + num_bits > 64) - r |= bytes[last_bit / 8] >> (8 - last_bit % 8); - return r; - } - - inline uint128_t SliceInt128FromBytes( - const uint8_t *bytes, - const uint32_t start_bit, - const uint32_t num_bits) - { - if (num_bits <= 64) - return SliceInt64FromBytesFull(bytes, start_bit, num_bits); - - uint32_t num_bits_high = num_bits - 64; - uint64_t high = SliceInt64FromBytesFull(bytes, start_bit, num_bits_high); - uint64_t low = SliceInt64FromBytesFull(bytes, start_bit + num_bits_high, 64); - return ((uint128_t)high << 64) | low; - } - - inline void GetRandomBytes(uint8_t *buf, uint32_t num_bytes) - { - std::random_device rd; - std::mt19937 mt(rd()); - std::uniform_int_distribution<int> dist(0, 255); // (template argument lost in extraction; int assumed) - for (uint32_t i = 0; i < num_bytes; i++) { - buf[i] = dist(mt); - } - } - - inline uint64_t ExtractNum( - const uint8_t *bytes, - uint32_t len_bytes, - uint32_t begin_bits, - uint32_t take_bits) - { - if ((begin_bits + take_bits) / 8 > len_bytes - 1) { - take_bits = len_bytes * 8 - begin_bits; - } - return Util::SliceInt64FromBytes(bytes, begin_bits, take_bits); - } - - // The number of memory entries required to do the custom SortInMemory algorithm, given the - // total number of entries to be sorted. - inline uint64_t RoundSize(uint64_t size) - { - size *= 2; - uint64_t result = 1; - while (result < size) result *= 2; - return result + 50; - }
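 - // (Editor's annotation, not original code.) Worked example of SliceInt64FromBytes
 - // above: for bytes = { 0xAB, 0xCD, ... }, start_bit = 4, num_bits = 8,
 - // EightBytesToInt loads the 8 bytes big-endian, tmp <<= 4 drops the leading
 - // nibble, and tmp >>= (64 - 8) keeps the top byte, returning 0xBC.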
- /* - * Like memcmp, but only compares starting at a certain bit. - */ - inline int MemCmpBits( - uint8_t *left_arr, - uint8_t *right_arr, - uint32_t len, - uint32_t bits_begin) - { - uint32_t start_byte = bits_begin / 8; - uint8_t mask = ((1 << (8 - (bits_begin % 8))) - 1); - if ((left_arr[start_byte] & mask) != (right_arr[start_byte] & mask)) { - return (left_arr[start_byte] & mask) - (right_arr[start_byte] & mask); - } - - for (uint32_t i = start_byte + 1; i < len; i++) { - if (left_arr[i] != right_arr[i]) - return left_arr[i] - right_arr[i]; - } - return 0; - } - - inline double RoundPow2(double a) - { - // https://stackoverflow.com/questions/54611562/truncate-float-to-nearest-power-of-2-in-c-performance - int exp; - double frac = frexp(a, &exp); - if (frac > 0.0) - frac = 0.5; - else if (frac < 0.0) - frac = -0.5; - double b = ldexp(frac, exp); - return b; - } - -#if defined(_WIN32) || defined(__x86_64__) - void CpuID(uint32_t leaf, uint32_t *regs) - { -#if defined(_WIN32) - __cpuid((int *)regs, (int)leaf); -#else - __get_cpuid(leaf, &regs[0], &regs[1], &regs[2], &regs[3]); -#endif /* defined(_WIN32) */ - } - - bool HavePopcnt(void) - { - // EAX, EBX, ECX, EDX - uint32_t regs[4] = {0}; - - CpuID(1, regs); - // Bit 23 of ECX indicates POPCNT instruction support - return (regs[2] >> 23) & 1; - } -#endif /* defined(_WIN32) || defined(__x86_64__) */ - - inline uint64_t PopCount(uint64_t n) - { -#if defined(_WIN32) - return __popcnt64(n); -#elif defined(__x86_64__) - uint64_t r; - __asm__("popcnt %1, %0" : "=r"(r) : "r"(n)); - return r; -#else - return __builtin_popcountl(n); -#endif /* defined(_WIN32) ... defined(__x86_64__) */ - } -} - -#endif // SRC_CPP_UTIL_HPP_ diff --git a/drplotter.cu b/drplotter.cu deleted file mode 100644 index 4d455c8..0000000 --- a/drplotter.cu +++ /dev/null @@ -1,3662 +0,0 @@ -/* - ============================================================================ - Name : drplotter.cu - Author : NH - Version : - Copyright : Your copyright notice - Description : CUDA compute reciprocals - ============================================================================ - */ - -#include <cstdio> // (these 12 header names were lost in extraction; reconstructed guesses based on usage) -#include <cstdlib> -#include <cstring> -#include <cstdint> -#include <iostream> -#include <chrono> -#include <vector> - -// for mmap -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <sys/mman.h> /* mmap() is defined in this header */ -#include <unistd.h> - -#include "chia/util.hpp" -#include "chia/chacha8.h" -#include "nick_globals.hpp" -#include "attack.hpp" -#include "phase2.hpp" - - - -const uint16_t THREADS_FOR_MATCHING = 256; // 386 is 10600ms matching. 256 is 9761ms matching. 
237 is...10109 - -int cmd_read = 0; - -using milli = std::chrono::milliseconds; -int64_t total_gpu_time_ms = 0; -int64_t total_transfer_in_time_ms = 0; -int64_t total_transfer_out_time_ms = 0; -int64_t total_chacha_time_ms = 0; -int64_t total_match_time_ms = 0; -uint64_t total_transfer_in_bytes = 0; -uint64_t total_transfer_out_bytes = 0; -int64_t table_gpu_time_ms = 0; -int64_t table_transfer_in_time_ms = 0; -int64_t table_transfer_out_time_ms = 0; -int64_t table_match_time_ms = 0; -uint64_t table_transfer_in_bytes = 0; -uint64_t table_transfer_out_bytes = 0; - -// global memory -char *host_criss_cross_blocks; // aka host_meta_blocks -char *host_refdata_blocks; -char *device_buffer_A; -char *device_buffer_B; - -char *device_buffer_C; -char *device_buffer_T3_base; -char *device_buffer_refdata; - -int* device_block_entry_counts; // [BATCHES]; -int* device_local_kbc_num_entries; -uint32_t host_criss_cross_entry_counts[BATCHES * BATCHES]; // kbc counts for each block - - -#include "nick_blake3.hpp" - - -template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT> // (template parameter list reconstructed -- lost in extraction) -__global__ -void gpu_find_tx_matches_calc_only(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) { - // match: 25804 ms - // phase 1: 4366ms - __shared__ uint Lys[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ uint Rys[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ Index_Match matches[512];//KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ int total_matches; - __shared__ int yr_yl_bid_m_results[kB*2]; - __shared__ int yr_yl_cid_mod_kC[kC*2]; - - - uint32_t kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - const uint8_t doPrint = 1;//(global_kbc_L_bucket_id < 10) ? 1 : 0; // start_kbc_L > 0 ? 1: 0; // 0 is none, 1 is basic, 2 is detailed - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - int numThreadsInBlock = blockDim.x; - int threadId = threadIdx.x; - int threadStartScan = threadId; - int threadSkipScan = numThreadsInBlock; - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R]; - - if (num_L == 0) { - return; - } - - for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - Rys[pos_R] = (R_entry.y / kC) + ((R_entry.y % kC) << 8); // do mod and div entries too in bitmask. - }
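 - // (Editor's annotation, not original code.) The packing above stores each
 - // y (already reduced mod kBC, so in [0, 15113)) as bid + (cid << 8), where
 - // bid = y / kC fits in 8 bits (kB = 119) and cid = y % kC is in [0, 127).
 - // Example: y = 1000 -> bid = 7, cid = 111, packed value = 7 + (111 << 8) = 28423.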
- for (int pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - Lys[pos_L] = (L_entry.y / kC) + ((L_entry.y % kC) << 8); - } - const int16_t parity = (global_kbc_L_bucket_id) % 2; - for (int i=threadIdx.x;i<kB*2;i+=blockDim.x) { // (precompute reconstructed -- original lost in extraction; the packing is inferred from the reads below) - int16_t m = i % kB; - yr_yl_bid_m_results[i] = (m << 8) + ((((2 * m) + parity) * ((2 * m) + parity)) % kC); - } - for (int i=threadIdx.x;i<kC*2;i+=blockDim.x) { - yr_yl_cid_mod_kC[i] = i % kC; - } - __syncthreads(); - - // For any 0 <= m < kExtraBitsPow: - // yl / kBC + 1 = yR / kBC AND - // (yr % kBC) / kC - (yl % kBC) / kC = m (mod kB) -> MEANS yr/kC can only match with the 64 slots including and to the right of yl/kC - // (yr % kBC) % kC - (yl % kBC) % kC = (2m + (yl/kBC) % 2)^2 (mod kC) - - for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - const uint yr_data = Rys[pos_R]; - //int16_t yr_kbc = yr_data;// & 0b01111111111111111; - const int16_t yr_bid = yr_data & 0b011111111; // yr_kbc / kC; // values [0..kB] - const int16_t yr_cid = (yr_data >> 8);//yr_kbc % kC;//(yr_data >> 24); - for (int pos_L = 0; pos_L < num_L; pos_L++) { - // do L_entry and R_entry match? - const uint yl_data = Lys[pos_L]; - //int16_t yl_kbc = yl_data;// & 0b01111111111111111; - const int8_t yl_bid = yl_data & 0b011111111; //yl_kbc / kC; values [0..kB] - const int8_t yl_cid = yl_data >> 8;//yl_kbc % kC;//(yl_data >> 24); - - int16_t m_results = yr_yl_bid_m_results[yr_bid-yl_bid+kB]; - int16_t m = m_results >> 8;//& 0b011111111; - int16_t m2_parity_squared = (m_results & 0b011111111); - int16_t formula_two = yr_yl_cid_mod_kC[yr_cid - yl_cid + kC]; - - //int16_t formula_one = yr_bid - yl_bid; // this should actually give m - //if (formula_one < 0) { - // formula_one += kB; - //} - //int16_t m = formula_one; - //if (m >= kB) { - // m -= kB; - //} - - //int16_t m = (yr_bid - yl_bid); - //if (m < 0) m+=kB; - //if (m >= kB) m-=kB; - - //if (m < 64) { - // passed first test - //const int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127] - //int16_t formula_two = yr_cid - yl_cid; - //if (formula_two < 0) formula_two += kC; - - if ((m < 64) && (formula_two == m2_parity_squared)) { - // we have a match. - int num_matches = atomicAdd(&total_matches,1); - //if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - // printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - //} else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R;//value >> 4; - matches[num_matches] = match; - //} - } - //} - /* - - - - - uint16_t m = (yr_bid - yl_bid) % kB; // 77ms w/o branch mod test, big jump w/ mod. 
- 158ms - uint16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; - uint16_t formula_two = (yr_cid - yl_cid) % kC; - //if (m < 0) { - // m += kB; - //}// else if (m >= kB) m-=kB; - if ((m < 64) && (m2_parity_squared == formula_two)) { - //uint16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; - //uint16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; - int num_matches = atomicAdd(&total_matches,1); - if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - } else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R;//value >> 4; - matches[num_matches] = match; - } - }*/ - } - } - - /*for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - int16_t yr_kbc = R_entry.y; - int16_t yr_bid = yr_kbc / kC; // values [0..kB] - for (uint16_t pos_L = 0; pos_L < num_L; pos_L++) { - // do L_entry and R_entry match? - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - int16_t yl_kbc = L_entry.y; - int16_t yl_bid = yl_kbc / kC; // values [0..kB] - int16_t formula_one = yr_bid - yl_bid; // this should actually give m - if (formula_one < 0) { - formula_one += kB; - } - int16_t m = formula_one; - if (m >= kB) { - m -= kB; - } - if (m < 64) { - // passed first test - int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC perfectly divisible by kC - int16_t yr_cid = yr_kbc % kC; - int16_t parity = (global_kbc_L_bucket_id) % 2; - int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127] - int16_t formula_two = yr_cid - yl_cid; - if (formula_two < 0) { - formula_two += kC; - } - if (formula_two == m2_parity_squared) { - // we have a match. - int num_matches = atomicAdd(&total_matches,1); - if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - } else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R;//value >> 4; - matches[num_matches] = match; - } - } - } - } - } - - __syncthreads();*/ - - - if (threadIdx.x == 0) { - if (doPrint>1) { - // only do this once, should be in constant memory - //if (doPrint>2) { - // printf("match list\n"); - // for (int i=0;i<total_matches;i++) printf("[L:%u R:%u] ", matches[i].idxL, matches[i].idxR); // (debug loop reconstructed -- original lost in extraction) - //} - } - if (total_matches > KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET); - total_matches = KBC_MAX_ENTRIES_PER_BUCKET; - } - } - - __syncthreads(); - - // now we go through all our matches and output to next round. 
- for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) { - Index_Match match = matches[i]; - BUCKETED_ENTRY_OUT pair = {}; - BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL]; - BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR]; - uint64_t blake_result; - uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id); - if (table == 1) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = R_Entry.meta[0]; - //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms - nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL); - //if (global_kbc_L_bucket_id == 1) { - //if ((calc_y == 21557) && (L_Entry.meta[0] == 3620724289) && (R_Entry.meta[0] == 2663198278)) { - // printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result); - //Ly is:[20932] Lx: [322482289] Rx: [3382886636] f result:[273114646565] - //if (blake_result == 56477140042) { - // printf(" ---** BLAKE CORRECT **\n"); - //} else { - // printf(" ---** BLAKE WRONG :(((( \n"); - //} - // Ly is:[21557] Lx: [3620724289] Rx: [2663198278] f result:[56477140042] - //} - //} - - } else if (table == 2) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = L_Entry.meta[1]; - pair.meta[2] = R_Entry.meta[0]; - pair.meta[3] = R_Entry.meta[1]; - nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL); - if (global_kbc_L_bucket_id == 1) { - uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1]; - uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3]; - printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result); - } - } else if (table == 3) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta); - } else if (table == 4) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta); - } else if (table == 5) { - const uint32_t meta[6] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], - }; - nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta); - } else if (table == 6) { - const uint32_t meta[4] = { - L_Entry.meta[0], L_Entry.meta[1], - R_Entry.meta[0], R_Entry.meta[1] - }; - nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL); - } - if (table < 6) { - uint64_t batch_bucket = blake_result >> (38-6); - const uint64_t block_mod = (uint64_t) 1 << (38-6); - pair.y = (uint32_t) (blake_result % block_mod); - int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1); - uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES + block_slot; - if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) { - printf("ERROR: results address overflow\n"); - } else { - bucketed_out[pair_address] = pair; - } - } - - // do we have a double bucket to write into? 
- //uint32_t double_bucket_id = 0; - //uint32_t kbc_bucket_id = blake_result / kBC; - //uint64_t batch_bucket_min_kbc = (batch_bucket << 32) / kBC; - //uint64_t batch_bucket_max_kbc = ((batch_bucket+1) << 32) / kBC; - //if (kbc_bucket_id == batch_bucket_min_kbc) { - // double_bucket_id = batch_bucket - 1; - //} else if (kbc_bucket_id == batch_bucket_max_kbc) { - // double_bucket_id = batch_bucket + 1; - //} - } - - //if (threadIdx.x == 0) { - //if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - //if ((global_kbc_L_bucket_id % 25000 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - //} - /* - kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219 - kBC bucket id: 1 L entries: 242 R entries: 257 matches: 248 - kBC bucket id: 2 L entries: 257 R entries: 204 matches: 222 - kBC bucket id: 3 L entries: 204 R entries: 243 matches: 185 - Total matches: 4294859632 - - Computing table 3 - Bucket 0 uniform sort. Ram: 7.678GiB, u_sort min: 2.250GiB, qs min: 0.563GiB. - kBC bucket id: 0 L entries: 228 R entries: 253 matches: 276 - kBC bucket id: 1 L entries: 253 R entries: 230 matches: 227 - kBC bucket id: 2 L entries: 230 R entries: 232 matches: 212 - kBC bucket id: 3 L entries: 232 R entries: 237 matches: 221 - Total matches: 4294848520 - */ - if (threadIdx.x == 0) { - if (table == 1) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==222) && (num_R==242) && (total_matches==219)) { - printf("- TABLE 1 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 1 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 240 R entries: 233 matches: 232 - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==240) && (num_R==233) && (total_matches==232)) { - printf("- TABLE 1 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 1 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - if (table == 2) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==228) && (num_R==253) && (total_matches==276)) { - printf("- TABLE 2 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 2 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 241 R entries: 238 matches: 224 - - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==241) && (num_R==238) && (total_matches==224)) { - printf("- TABLE 2 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 2 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - } - // table 1 4865ms match time to beat. - // with shared mem for pos_L/R is 3942 - win! - // formula improvement (one branch) - 3810ms - // removal of max kbc test in m loop - 3639ms +33% faster. - // shared compute buffers to prevent % and division - 2280ms! 
- // -- now getting dangerously close to best algo time of 1606ms :) -} - - -template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT> // (template parameter list reconstructed -- lost in extraction) -__global__ -void gpu_find_tx_matches_test(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) { - // T1 match: 1714 ms -> with delaying extras: 1630 - //Total tables time: 73726 ms - // match: 10015 ms -> 9705ms with delaying extras - __shared__ int total_matches; - - int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - uint8_t doPrint = 2; - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R]; - - if (threadIdx.x == 0) { - total_matches = 0; - if (doPrint > 1) { - printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R); - if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) { - printf("ERROR numL or numR > max entries\n"); - return; - } - if ((num_L == 0) || (num_R == 0) ) { - printf("ERROR: numL or numR is 0\n"); - return; - } - } - } - - // For any 0 <= m < kExtraBitsPow: - // yl / kBC + 1 = yR / kBC AND - // (yr % kBC) / kC - (yl % kBC) / kC = m (mod kB) -> MEANS (1) yr/kC can only match with the 64 slots including and to the right of yl/kC - // (yr % kBC) % kC - (yl % kBC) % kC = (2m + (yl/kBC) % 2)^2 (mod kC) - - // yr_kc's : [0..127] -> contains what? Either y-value, then compute matching m, or contains %kC - // if /kC distance yr to yl is 5, m = 5, then diff %kC must be (20^2)%kC = 400 % kC = - - // 000001111111111000000 yl1 - // 000111111111100000000 y12 - // 000000011111111111000 yl3 - - const uint16_t parity = global_kbc_L_bucket_id % 2; - for (int16_t Ry = threadIdx.x; Ry < kBC; Ry+=blockDim.x) { - int16_t yr_kbc = Ry; - int16_t yr_bid = yr_kbc / kC; // values [0..kB] - for (int16_t Ly = 0; Ly < kBC; Ly++) { - int16_t yl_kbc = Ly; - int16_t yl_bid = yl_kbc / kC; // values [0..kB] - int16_t formula_one = yr_bid - yl_bid; // this should actually give m - if (formula_one < 0) { - formula_one += kB; - } - int16_t m = formula_one; - if (m >= kB) { - m -= kB; - } - if (m < 64) { - // passed first test - int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC perfectly divisible by kC - int16_t yr_cid = yr_kbc % kC; - int16_t parity = (global_kbc_L_bucket_id) % 2; - int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127] - int16_t formula_two = yr_cid - yl_cid; - if (formula_two < 0) { - formula_two += kC; - } - if (formula_two == m2_parity_squared) { - // we have a match. - printf("match Ly:%u Ry:%u\n", Ly, Ry); - 
atomicAdd(&total_matches,1); - } - } - } - } - if (threadIdx.x == 0) { - printf("Done. Total matches: %u\n", total_matches); - } - -} - - -template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT> // (template parameter list reconstructed -- lost in extraction) -__global__ -void gpu_find_tx_matches(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) { - // T1 match: 1714 ms -> with delaying extras: 1630 - //Total tables time: 73726 ms - // match: 10015 ms -> 9705ms with delaying extras - const uint16_t NUM_RMAPS = (kBC/2)+1; - __shared__ int nick_rmap[NUM_RMAPS]; // positions and counts. Use 30 bits, 15 bits each entry with lower 9 bits for pos, 1024+ for count - __shared__ uint32_t nick_rmap_extras_rl[32]; - __shared__ uint16_t nick_rmap_extras_ry[32]; - __shared__ uint16_t nick_rmap_extras_pos[32]; - __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ int total_matches; - __shared__ int num_extras; - __shared__ int y_duplicate_counts; - - int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - uint8_t doPrint = 1; - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - int numThreadsInBlock = blockDim.x; - int threadId = threadIdx.x; - int threadStartScan = threadId; - int threadSkipScan = numThreadsInBlock; - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R]; - - if (threadIdx.x == 0) { - total_matches = 0; - num_extras = 0; - y_duplicate_counts = 0; - if (doPrint > 1) { - printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R); - if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) { - printf("ERROR numL or numR > max entries\n"); - return; - } - if ((num_L == 0) || (num_R == 0) ) { - printf("ERROR: numL or numR is 0\n"); - return; - } - } - } - // unfortunately to clear we have to do this - for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) { - nick_rmap[i] = 0; - } - __syncthreads(); // all written initialize data should sync - - - /*bool printandquit = ((global_kbc_L_bucket_id == 0)); - if (printandquit) { - if (threadIdx.x == 0) { - - printf("R_y list:\n"); - for (size_t pos_R = 0; pos_R < num_R; pos_R++) { - uint16_t r_y = kbc_R_entries[pos_R].y; - printf("[x:%u y:%u]\n",kbc_R_entries[pos_R].meta[0], r_y); - } - printf("L_y list num %u:\n", num_L); - for (size_t pos_L = 0; pos_L < num_L; pos_L++) { - uint16_t l_y = kbc_L_entries[pos_L].y; - printf("[x:%u y:%u]\n",kbc_L_entries[pos_L].meta[0], l_y); - } - } - }*/ - //__syncthreads(); - uint16_t parity = global_kbc_L_bucket_id % 2; - - - for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - uint16_t r_y = R_entry.y; - - // r_y's share a block across two adjacent values, so kbc_map just works out which part it's in. 
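 - // (Editor's annotation, not original code.) Concretely: each 32-bit nick_rmap
 - // word holds two 15-bit boxes, one per adjacent r_y value; within a box the
 - // low 9 bits hold the first pos_R seen and bits 10+ hold the count (hence
 - // add = 1024 << kbc_box_shift below). Example: r_y = 7 -> kbc_map = 3, upper
 - // box (shift 15); the first writer contributes 1024 (count = 1) plus its pos_R.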
- int kbc_map = r_y / 2; - const int kbc_box_shift = (r_y % 2) * 15; - int add = 1024 << kbc_box_shift; // we add from 10th bit up (shifted by the box it's in) - - int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above) - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - if (rmap_value == 0) { - // if we added to an empty spot, what we do is add the pos_R here in the lower 9 bits of the box - // and ONLY for this one. - atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift)); - //if (printandquit) { - // printf("r_y: %u pos:%u\n", r_y, pos_R); - //} - } else { - // we hit duplicate entry...add this to a row - int slot = atomicAdd(&num_extras, 1); - nick_rmap_extras_ry[slot] = r_y; - nick_rmap_extras_pos[slot] = pos_R; - } - - } - - __syncthreads(); // wait for all threads to write r_bid entries - - // benchmark: 66ms at this point - //if ((nick_rmap_extras_ry[threadIdx.x % 32] + nick_rmap_extras_pos[threadIdx.x % 32]) == 2334534423) printf("bogus"); - //return; - - // load parity tables into shared - /*if (printandquit) { - if (threadIdx.x == 0) { - printf("num extras bucket %u : %u parity: %u \n", global_kbc_L_bucket_id, num_extras, parity); - - for (int i=0;i<kBC;i++) { (loop header and box extraction reconstructed -- original lost in extraction) - int kbc_map = i / 2; - const int kbc_box_shift = (i % 2) * 15; - int rmap_value = nick_rmap[kbc_map]; - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - - //uint16_t rmap_value = nick_rmap[i]; - uint16_t pos = (rmap_value & 0b0111111111); - if (rmap_value > 0) { - printf("kbc:%i value:%u pos:%u\n", i, rmap_value, pos); - } - } - - } - - } - __syncthreads();*/ - - - for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L]; - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - uint16_t l_y = L_entry.y; - uint16_t indJ = l_y / kC; - //printf("scanning for pos_L: %u\n", pos_L); - - // this part is killer, this does add bulk of time. - // weird simplifying the math doesn't help much unless you pragma unroll it - // might be too much branching inside too. - // setup code for loop increment "optimization" - //uint16_t indJ_mod_kB_times_kC = ((indJ + 0) % kB) * kC; - //uint16_t start_parity_add = 4 + parity * 4; - //uint16_t parity_base = (parity + l_y) % kC; - //const uint16_t m_switch_kb = kB - indJ; // calculate point at which indJ + m is %kb! - for (int m=0;m<64;m++) { - - //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup - // is super-inefficient. 
- - // 27.58ms - uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC); - - - - - - // a cute "optimization" but saves no time whatsoever...27.7ms instead of 27.58ms :/ - //if (m_switch_kb == m) indJ_mod_kB_times_kC = ((indJ + m) % kB) * kC; // 323ms // 490 - //uint16_t r_target = indJ_mod_kB_times_kC + parity_base; - //indJ_mod_kB_times_kC += kC; // 256ms - //parity_base += start_parity_add; - //if (parity_base >= kC) parity_base -= kC; - //start_parity_add += 8; - //if (start_parity_add >= kC) start_parity_add -= kC; - //if (test_target != r_target) { - // printf("Ly: %u m: %u target: %u test_target: %u \n", l_y, m, r_target, test_target); - //} - - - //if (r_target + indJ == m) bogus_match_counter++; - //if (bogus_match_counter >= KBC_MAX_ENTRIES_PER_BUCKET) { - // printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, bogus_match_counter); - //} - - // find which box our r_target is in, extract the 15-bit value from that box - int kbc_map = r_target / 2; - const int kbc_box_shift = (r_target % 2) * 15; - int rmap_value = nick_rmap[kbc_map]; - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - - if (rmap_value > 0) { - // the pos_R is the lower 9 bits of that 15-bit boxed value - uint16_t pos_R = rmap_value & 0b0111111111; - uint16_t count = rmap_value / 1024; - - //if (printandquit) { - // printf("L_y: %u r_target hit: %u pos_R:%u\n", l_y, r_target, pos_R); - //} - int num_matches = atomicAdd(&total_matches,1);//count); - if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - } else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R; - matches[num_matches] = match; - - - // handle edge cases - // TODO: let's push these into separate array - // then test them later. - if (count > 1) { - int slot = atomicAdd(&y_duplicate_counts, 1); - nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L; - // add the extras - /*int extra_match = 0; - for (int slot = 0; slot < num_extras; slot++) { - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - match.idxR = extra_pos_R;//value >> 4; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //extra_match++; - //matches[num_matches+extra_match] = match; - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - }*/ - - //if (global_kbc_L_bucket_id < 10) { - // if (extra_match != count-1) { - // printf("ERRORRRR! EXTRA MATCHES %u DOES NOT MATCH COUNT-1 %u\n", extra_match, count); - // } else { - // printf("BUCKET L %u SUCCESSFULLY ADDED EXTRA COUNTS %u\n", global_kbc_L_bucket_id, count); - // } - //} - } - } - } - } - } - - __syncthreads(); - - // up until this point matching takes 976ms total for k32 - // it's 936ms with only the total matches counter (so about 40ms for appending match data) - // 745ms with a bogus counter (so no shared atomic conflict) - // it's 586ms with only m computations and bogus counter (no lookups) - so rmap lookups add 140ms - // it's 128ms with only 1m -- so calculations are adding 460ms!!! - // in summary: - // -- 460ms : m loop calculations - more so the actual m loop than the math inside!
- // -- 140ms : rmap lookups (bank conflict improvements possible) - // -- 128ms : data reads - // - 66ms rmap setup - // - 62ms reading y values back in - // -- 40ms : match atomic shared counter (vs non atomic shared counter) - //if (threadIdx.x == 0) { - // if (total_matches == 1342343) printf("bogus"); - //} - //return; - - // do the extras - - //int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add! - for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) { - for (int dup=0; dup<y_duplicate_counts; dup++) { - uint32_t value = nick_rmap_extras_rl[dup]; - uint16_t r_target = value >> 16; - uint16_t pos_L = value & 0x0FFFF; - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = extra_pos_R; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //matches[total_matches+slot] = match; - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - } - } - - __syncthreads(); - - if (threadIdx.x == 0) { - if (doPrint>1) { - // only do this once, should be in constant memory - //if (doPrint>2) { - // printf("match list\n"); - // for (int i=0;i<total_matches;i++) { - // printf("[L:%u R:%u]\n", matches[i].idxL, matches[i].idxR); - // } - //} - printf("Total matches: %u\n", total_matches); - } - if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) { - printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1); - total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1); - } - } - - __syncthreads(); - - // now we go through all our matches and output to next round. - for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) { - Index_Match match = matches[i]; - BUCKETED_ENTRY_OUT pair = {}; - BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL]; - BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR]; - uint64_t blake_result; - uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id); - if (table == 1) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = R_Entry.meta[0]; - //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms - - blake_result = 23; - nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL); - - //if (global_kbc_L_bucket_id == 1) { - //printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result); - //} - - } else if (table == 2) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = L_Entry.meta[1]; - pair.meta[2] = R_Entry.meta[0]; - pair.meta[3] = R_Entry.meta[1]; - nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL); - //if (global_kbc_L_bucket_id == 1) { - // uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1]; - // uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3]; - // printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result); - //} - } else if (table == 3) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta); - } else if (table == 4) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta); - } else if (table == 5) { - const uint32_t meta[6] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], - }; - nick_blake3(meta, 6, calc_y, &blake_result, 2,
pair.meta); - } else if (table == 6) { - const uint32_t meta[4] = { - L_Entry.meta[0], L_Entry.meta[1], - R_Entry.meta[0], R_Entry.meta[1] - }; - nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL); - } - if (table < 6) { - uint64_t batch_bucket = blake_result >> (38-6); // 27.52ms for 1/64 of kbcs - //uint64_t batch_bucket = threadIdx.x % 64; // 25.3ms with blake computation, 20ms without. So blake adds 5ms for 1/64 of values; - //uint64_t batch_bucket = 0; // 18ms per 1/64 of values, and our block counts aren't even optimized since global locking on atomic adds - // so...in theory could reduce from 27ms time down to sub 18ms, and then do blake pass on seperate scan, which *should* be faster. - // since we write less blocks/data in here - const uint64_t block_mod = (uint64_t) 1 << (38-6); - pair.y = (uint32_t) (blake_result % block_mod); - int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1); - uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES + block_slot; - //if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) { - // printf("ERROR: results address overflow\n"); - //} else { - // up to here takes 1508ms. Seems 1508-976 = 532ms for blake results - // quite substantial! - bucketed_out[pair_address] = pair; - // including the write-out is 1696ms - //} - } - - // do we have a double bucket to write into? - //uint32_t double_bucket_id = 0; - //uint32_t kbc_bucket_id = blake_result / kBC; - //uint64_t batch_bucket_min_kbc = (batch_bucket << 32) / kBC; - //uint64_t batch_bucket_max_kbc = ((batch_bucket+1) << 32) / kBC; - //if (kbc_bucket_id == batch_bucket_min_kbc) { - // double_bucket_id = batch_bucket - 1; - //} else if (kbc_bucket_id == batch_bucket_max_kbc) { - // double_bucket_id = batch_bucket + 1; - //} - } - - if ((doPrint >=1) && (threadIdx.x == 0)) { - //if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - if ((global_kbc_L_bucket_id % 1000000 == 0) || (global_kbc_L_bucket_id < 10)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - - } - /* - kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219 - kBC bucket id: 1 L entries: 242 R entries: 257 matches: 248 - kBC bucket id: 2 L entries: 257 R entries: 204 matches: 222 - kBC bucket id: 3 L entries: 204 R entries: 243 matches: 185 - Total matches: 4294859632 - - Computing table 3 - Bucket 0 uniform sort. Ram: 7.678GiB, u_sort min: 2.250GiB, qs min: 0.563GiB. - kBC bucket id: 0 L entries: 228 R entries: 253 matches: 276 - kBC bucket id: 1 L entries: 253 R entries: 230 matches: 227 - kBC bucket id: 2 L entries: 230 R entries: 232 matches: 212 - kBC bucket id: 3 L entries: 232 R entries: 237 matches: 221 - Total matches: 4294848520 - */ - if ((doPrint >= 1) && (threadIdx.x == 0)) { - if (table == 1) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==222) && (num_R==242) && (total_matches==219)) { - printf("- TABLE 1 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 1 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 240 R entries: 233 matches: 232 - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==240) && (num_R==233) && (total_matches==232)) { - printf("- TABLE 1 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 1 bucket 4000000 MATCHES WRONG! 
num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - if (table == 2) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==228) && (num_R==253) && (total_matches==276)) { - printf("- TABLE 2 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 2 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 241 R entries: 238 matches: 224 - - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==241) && (num_R==238) && (total_matches==224)) { - printf("- TABLE 2 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 2 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } -} - -template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT> -__global__ -void gpu_find_tx_matches_direct_to_host(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - char *host_criss_cross, int *out_bucket_counts) { - // T1 match: 1714 ms -> with delaying extras: 1630 - //Total tables time: 73726 ms - // match: 10015 ms -> 9705ms with delaying extras - const uint16_t NUM_RMAPS = (kBC/2)+1; - __shared__ int nick_rmap[NUM_RMAPS]; // positions and counts. Use 30 bits, 15 bits each entry with lower 9 bits for pos, 1024+ for count - __shared__ uint32_t nick_rmap_extras_rl[32]; - __shared__ uint16_t nick_rmap_extras_ry[32]; - __shared__ uint16_t nick_rmap_extras_pos[32]; - __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ int total_matches; - __shared__ int num_extras; - __shared__ int y_duplicate_counts; - - int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - uint8_t doPrint = 1; - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - int numThreadsInBlock = blockDim.x; - int threadId = threadIdx.x; - int threadStartScan = threadId; - int threadSkipScan = numThreadsInBlock; - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R]; - - if (threadIdx.x == 0) { - total_matches = 0; - num_extras = 0; - y_duplicate_counts = 0; - if (doPrint > 1) { - printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R); - if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) { - printf("ERROR numL or numR > max entries\n"); - return; - } - if ((num_L == 0) || (num_R == 0) ) { - printf("ERROR: numL and numR are 0\n"); - return; - } - } - } - // unfortunately to clear we have to do this - for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) { - nick_rmap[i] = 0; - } - __syncthreads(); // all written initialize data should sync - - //bool printandquit = ((global_kbc_L_bucket_id == 75000)); - - - - - // if (printandquit) { - //printf("R_y list:\n"); - //for (size_t pos_R = 0; pos_R < num_R; pos_R++) { - // uint16_t r_y = kbc_R_entries[pos_R].y; - // printf("%u\n",r_y); - //} - //if (threadIdx.x == 0) { - // printf("L_y list num %u:\n",
num_L); - // for (size_t pos_L = 0; pos_L < num_L; pos_L++) { - // uint16_t l_y = kbc_L_entries[pos_L].y; - // printf("%u\n",l_y); - // } - //} - // } - //__syncthreads(); - uint16_t parity = global_kbc_L_bucket_id % 2; - - for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - uint16_t r_y = R_entry.y; - - // r_y's share a block across two adjacent values, so kbc_map just works out which part it's in. - int kbc_map = r_y / 2; - const int kbc_box_shift = (r_y % 2) * 15; - int add = 1024 << kbc_box_shift; // we add from 10th bit up (shifted by the box it's in) - - int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above) - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - if (rmap_value == 0) { - // if we added to an empty spot, what we do is add the pos_R here in the lower 9 bits of the box - // and ONLY for this one. - atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift)); - //if (printandquit) { - // printf("r_y: %u pos:%u\n", r_y, pos_R); - //} - } else { - // we hit duplicate entry...add this to a row - int slot = atomicAdd(&num_extras, 1); - nick_rmap_extras_ry[slot] = r_y; - nick_rmap_extras_pos[slot] = pos_R; - } - - } - - __syncthreads(); // wait for all threads to write r_bid entries - - // load parity tables into shared - /*if (printandquit) { - if (threadIdx.x == 0) { - printf("num extras bucket %u : %u parity: %u \n", global_kbc_L_bucket_id, num_extras, parity); - - for (int i=0;i> kbc_box_shift) & 0b0111111111111111; - - //uint16_t rmap_value = nick_rmap[i]; - uint16_t pos = (rmap_value & 0b0111111111); - if (rmap_value > 0) { - printf("kbc:%i value:%u pos:%u\n", i, rmap_value, pos); - } - } - - } - - } - __syncthreads();*/ - - for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L]; - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - uint16_t l_y = L_entry.y; - //printf("scanning for pos_L: %u\n", pos_L); - - for (int m=0;m<64;m++) { - - //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup - // is super-inefficient. - - uint16_t indJ = l_y / kC; - uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC); - - // find which box our r_target is in, extra the 15bit value from that box - int kbc_map = r_target / 2; - const int kbc_box_shift = (r_target % 2) * 15; - int rmap_value = nick_rmap[kbc_map]; - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - - if (rmap_value > 0) { - // the pos_R is the lower 9 bits of that 15bit boxed value - uint16_t pos_R = rmap_value & 0b0111111111; - uint16_t count = rmap_value / 1024; - - //if (printandquit) { - // printf("L_y: %u r_target hit: %u pos_R:%u\n", l_y, r_target, pos_R); - //} - int num_matches = atomicAdd(&total_matches,1);//count); - if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - } else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R; - matches[num_matches] = match; - - - // handle edge cases - // TODO: let's push these into separate array - // then test them later. 
- if (count > 1) { - int slot = atomicAdd(&y_duplicate_counts, 1); - nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L; - // add the extras - /*int extra_match = 0; - for (int slot = 0; slot < num_extras; slot++) { - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - match.idxR = extra_pos_R;//value >> 4; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //extra_match++; - //matches[num_matches+extra_match] = match; - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - }*/ - //if (global_kbc_L_bucket_id < 10) { - // if (extra_match != count-1) { - // printf("ERRORRRR! EXTRA MATCHES %u DOES NOT MATCH COUNT-1 %u\n", extra_match, count); - // } else { - // printf("BUCKET L %u SUCCESSFULLY ADDED EXTRA COUNTS %u\n", global_kbc_L_bucket_id, count); - // } - //} - } - } - } - } - } - - __syncthreads(); - - // do the extras - - //int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add! - for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) { - for (int dup=0; dup<y_duplicate_counts; dup++) { - uint32_t value = nick_rmap_extras_rl[dup]; - uint16_t r_target = value >> 16; - uint16_t pos_L = value & 0x0FFFF; - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = extra_pos_R; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //matches[total_matches+slot] = match; - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - } - } - - __syncthreads(); - - if (threadIdx.x == 0) { - if (doPrint>1) { - // only do this once, should be in constant memory - //if (doPrint>2) { - // printf("match list\n"); - // for (int i=0;i<total_matches;i++) { - // printf("[L:%u R:%u]\n", matches[i].idxL, matches[i].idxR); - // } - //} - printf("Total matches: %u\n", total_matches); - } - if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) { - printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1); - total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1); - } - } - - __syncthreads(); - - // now we go through all our matches and output to next round.
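Before following that output loop, it helps to summarize the nick_blake3 calls it makes: how many 32-bit meta words are hashed per table and how many are carried forward (tables 1 and 2 pass 0/NULL because pair.meta is already the carried metadata). A sketch of that summary, not code from attack.hpp:

```cpp
// Per-table metadata widths implied by the nick_blake3 calls in the output
// loops of these kernels (32-bit words in, words collated out for next table).
#include <cstdio>

struct MetaWidths { int words_in; int words_carried; };

static const MetaWidths kTableMeta[7] = {
    {0, 0},   // unused, tables are numbered 1..6
    {2, 2},   // T1: Lx + Rx hashed, pair.meta already holds both
    {4, 4},   // T2
    {8, 4},   // T3: 8 words hashed, 4 collated out
    {8, 3},   // T4
    {6, 2},   // T5
    {4, 0},   // T6: only y survives
};

int main() {
    for (int t = 1; t <= 6; t++)
        printf("table %d: in=%d carried=%d\n",
               t, kTableMeta[t].words_in, kTableMeta[t].words_carried);
    return 0;
}
```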
- for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) { - Index_Match match = matches[i]; - BUCKETED_ENTRY_OUT pair = {}; - BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL]; - BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR]; - uint64_t blake_result; - uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id); - if (table == 1) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = R_Entry.meta[0]; - //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms - nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL); - //if (global_kbc_L_bucket_id == 1) { - //printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result); - //} - - } else if (table == 2) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = L_Entry.meta[1]; - pair.meta[2] = R_Entry.meta[0]; - pair.meta[3] = R_Entry.meta[1]; - nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL); - //if (global_kbc_L_bucket_id == 1) { - // uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1]; - // uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3]; - // printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result); - //} - } else if (table == 3) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta); - } else if (table == 4) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta); - } else if (table == 5) { - const uint32_t meta[6] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], - }; - nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta); - } else if (table == 6) { - const uint32_t meta[4] = { - L_Entry.meta[0], L_Entry.meta[1], - R_Entry.meta[0], R_Entry.meta[1] - }; - nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL); - } - if (table < 6) { - uint64_t batch_bucket = blake_result >> (38-6); - const uint64_t block_mod = (uint64_t) 1 << (38-6); - pair.y = (uint32_t) (blake_result % block_mod); - int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1); - - uint64_t criss_cross_id; - uint64_t cross_row_id = batch_id; - uint64_t cross_column_id = batch_bucket; - if ((table % 2) == 1) { - criss_cross_id = (cross_row_id * BATCHES + cross_column_id); - } else { - criss_cross_id = (cross_column_id * BATCHES + cross_row_id); - } - uint64_t host_block_entry_start_position = criss_cross_id * HOST_MAX_BLOCK_ENTRIES; - uint64_t host_bytes_start = host_block_entry_start_position * HOST_UNIT_BYTES; - - BUCKETED_ENTRY_OUT *host_block = (BUCKETED_ENTRY_OUT *) &host_criss_cross[host_bytes_start]; - host_block[block_slot] = pair; - } - - // do we have a double bucket to write into? 
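The criss-cross addressing above transposes blocks by table parity so the next table can stream its input the other way. A host-side sketch with placeholder constants (the real BATCHES, HOST_MAX_BLOCK_ENTRIES and HOST_UNIT_BYTES are defined elsewhere in this project):

```cpp
#include <cstdint>
#include <cstdio>

// Odd tables write blocks row-major (row = batch_id), even tables write the
// transpose, exactly as in the kernel above. Constants are placeholders.
static const uint64_t BATCHES = 64, HOST_MAX_BLOCK_ENTRIES = 1 << 20, HOST_UNIT_BYTES = 32;

uint64_t criss_cross_offset_bytes(uint16_t table, uint64_t batch_id, uint64_t batch_bucket) {
    uint64_t criss_cross_id = ((table % 2) == 1)
        ? batch_id * BATCHES + batch_bucket      // odd table: (row, column)
        : batch_bucket * BATCHES + batch_id;     // even table: (column, row)
    return criss_cross_id * HOST_MAX_BLOCK_ENTRIES * HOST_UNIT_BYTES;
}

int main() {
    printf("T3 block at %llu, T4 block at %llu\n",
           (unsigned long long) criss_cross_offset_bytes(3, 2, 5),
           (unsigned long long) criss_cross_offset_bytes(4, 2, 5));
    return 0;
}
```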
- //uint32_t double_bucket_id = 0; - //uint32_t kbc_bucket_id = blake_result / kBC; - //uint64_t batch_bucket_min_kbc = (batch_bucket << 32) / kBC; - //uint64_t batch_bucket_max_kbc = ((batch_bucket+1) << 32) / kBC; - //if (kbc_bucket_id == batch_bucket_min_kbc) { - // double_bucket_id = batch_bucket - 1; - //} else if (kbc_bucket_id == batch_bucket_max_kbc) { - // double_bucket_id = batch_bucket + 1; - //} - } - - if (threadIdx.x == 0) { - //if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - if ((global_kbc_L_bucket_id % 1000000 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - } - /* - kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219 - kBC bucket id: 1 L entries: 242 R entries: 257 matches: 248 - kBC bucket id: 2 L entries: 257 R entries: 204 matches: 222 - kBC bucket id: 3 L entries: 204 R entries: 243 matches: 185 - Total matches: 4294859632 - - Computing table 3 - Bucket 0 uniform sort. Ram: 7.678GiB, u_sort min: 2.250GiB, qs min: 0.563GiB. - kBC bucket id: 0 L entries: 228 R entries: 253 matches: 276 - kBC bucket id: 1 L entries: 253 R entries: 230 matches: 227 - kBC bucket id: 2 L entries: 230 R entries: 232 matches: 212 - kBC bucket id: 3 L entries: 232 R entries: 237 matches: 221 - Total matches: 4294848520 - */ - if (threadIdx.x == 0) { - if (table == 1) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==222) && (num_R==242) && (total_matches==219)) { - printf("- TABLE 1 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 1 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 240 R entries: 233 matches: 232 - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==240) && (num_R==233) && (total_matches==232)) { - printf("- TABLE 1 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 1 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - if (table == 2) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==228) && (num_R==253) && (total_matches==276)) { - printf("- TABLE 2 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 2 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 241 R entries: 238 matches: 224 - - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==241) && (num_R==238) && (total_matches==224)) { - printf("- TABLE 2 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 2 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } -} - -template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT> -__global__ -void gpu_find_tx_matches_with_backref(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - BUCKETED_ENTRY_OUT *bucketed_out, - char *bucketed_ref_out, int *out_bucket_counts) { - // T1 match: 1714 ms -> with delaying extras: 1630 - //Total tables time: 73726 ms - // match: 10015 ms -> 9705ms with delaying extras - const uint16_t NUM_RMAPS = (kBC/2)+1; - __shared__ int nick_rmap[NUM_RMAPS]; // positions and counts.
Use 30 bits, 15 bits each entry with lower 9 bits for pos, 1024+ for count - __shared__ uint32_t nick_rmap_extras_rl[32]; - __shared__ uint16_t nick_rmap_extras_ry[32]; - __shared__ uint16_t nick_rmap_extras_pos[32]; - __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ int total_matches; - __shared__ int num_extras; - __shared__ int y_duplicate_counts; - - int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - uint8_t doPrint = 1; - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - int numThreadsInBlock = blockDim.x; - int threadId = threadIdx.x; - int threadStartScan = threadId; - int threadSkipScan = numThreadsInBlock; - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R]; - - if (threadIdx.x == 0) { - total_matches = 0; - num_extras = 0; - y_duplicate_counts = 0; - if (doPrint > 1) { - printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R); - if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) { - printf("ERROR numL or numR > max entries\n"); - return; - } - if ((num_L == 0) || (num_R == 0) ) { - printf("ERROR: numL and numR are 0\n"); - return; - } - } - } - // unfortunately to clear we have to do this - for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) { - nick_rmap[i] = 0; - } - __syncthreads(); // all written initialize data should sync - - //bool printandquit = ((global_kbc_L_bucket_id == 75000)); - - - - - // if (printandquit) { - //printf("R_y list:\n"); - //for (size_t pos_R = 0; pos_R < num_R; pos_R++) { - // uint16_t r_y = kbc_R_entries[pos_R].y; - // printf("%u\n",r_y); - //} - //if (threadIdx.x == 0) { - // printf("L_y list num %u:\n", num_L); - // for (size_t pos_L = 0; pos_L < num_L; pos_L++) { - // uint16_t l_y = kbc_L_entries[pos_L].y; - // printf("%u\n",l_y); - // } - //} - // } - //__syncthreads(); - uint16_t parity = global_kbc_L_bucket_id % 2; - - for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - uint16_t r_y = R_entry.y; - - // r_y's share a block across two adjacent values, so kbc_map just works out which part it's in. - int kbc_map = r_y / 2; - const int kbc_box_shift = (r_y % 2) * 15; - int add = 1024 << kbc_box_shift; // we add from 10th bit up (shifted by the box it's in) - - int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above) - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - if (rmap_value == 0) { - // if we added to an empty spot, what we do is add the pos_R here in the lower 9 bits of the box - // and ONLY for this one. 
- atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift)); - //if (printandquit) { - // printf("r_y: %u pos:%u\n", r_y, pos_R); - //} - } else { - // we hit duplicate entry...add this to a row - int slot = atomicAdd(&num_extras, 1); - nick_rmap_extras_ry[slot] = r_y; - nick_rmap_extras_pos[slot] = pos_R; - } - - } - - __syncthreads(); // wait for all threads to write r_bid entries - - // load parity tables into shared - /*if (printandquit) { - if (threadIdx.x == 0) { - printf("num extras bucket %u : %u parity: %u \n", global_kbc_L_bucket_id, num_extras, parity); - - for (int i=0;i> kbc_box_shift) & 0b0111111111111111; - - //uint16_t rmap_value = nick_rmap[i]; - uint16_t pos = (rmap_value & 0b0111111111); - if (rmap_value > 0) { - printf("kbc:%i value:%u pos:%u\n", i, rmap_value, pos); - } - } - - } - - } - __syncthreads();*/ - - for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L]; - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - uint16_t l_y = L_entry.y; - //printf("scanning for pos_L: %u\n", pos_L); - - for (int m=0;m<64;m++) { - - //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup - // is super-inefficient. - - uint16_t indJ = l_y / kC; - uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC); - - // find which box our r_target is in, extra the 15bit value from that box - int kbc_map = r_target / 2; - const int kbc_box_shift = (r_target % 2) * 15; - int rmap_value = nick_rmap[kbc_map]; - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - - if (rmap_value > 0) { - // the pos_R is the lower 9 bits of that 15bit boxed value - uint16_t pos_R = rmap_value & 0b0111111111; - uint16_t count = rmap_value / 1024; - - //if (printandquit) { - // printf("L_y: %u r_target hit: %u pos_R:%u\n", l_y, r_target, pos_R); - //} - int num_matches = atomicAdd(&total_matches,1);//count); - if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - } else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R; - matches[num_matches] = match; - - - // handle edge cases - // TODO: let's push these into separate array - // then test them later. - if (count > 1) { - int slot = atomicAdd(&y_duplicate_counts, 1); - nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L; - // add the extras - /*int extra_match = 0; - for (int slot = 0; slot < num_extras; slot++) { - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - match.idxR = extra_pos_R;//value >> 4; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //extra_match++; - //matches[num_matches+extra_match] = match; - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - }*/ - //if (global_kbc_L_bucket_id < 10) { - // if (extra_match != count-1) { - // printf("ERRORRRR! EXTRA MATCHES %u DOES NOT MATCH COUNT-1 %u\n", extra_match, count); - // } else { - // printf("BUCKET L %u SUCCESSFULLY ADDED EXTRA COUNTS %u\n", global_kbc_L_bucket_id, count); - // } - //} - } - } - } - } - } - - __syncthreads(); - - // do the extras - - //int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add! 
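The pass that follows resolves the deferred work parked in shared memory: L-side hits on boxes with count > 1 were saved as (r_target << 16) | pos_L, and R-side entries that lost the first-writer race were saved as (r_y, pos_R) extras. A host-side restatement of that cross-check (hypothetical function, std::vector stands in for the shared arrays):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>
#include <utility>

// Pair every parked duplicate-L record with every extra-R record that shares
// its r_target; these are exactly the matches the packed rmap dropped.
std::vector<std::pair<uint16_t, uint16_t>> resolve_extras(
        const std::vector<uint32_t> &duplicates_rl,   // (r_target<<16)|pos_L
        const std::vector<uint16_t> &extras_ry,       // parked r_y values
        const std::vector<uint16_t> &extras_pos) {    // their pos_R values
    std::vector<std::pair<uint16_t, uint16_t>> out;   // (idxL, idxR)
    for (size_t e = 0; e < extras_ry.size(); e++) {
        for (uint32_t value : duplicates_rl) {
            uint16_t r_target = value >> 16;
            uint16_t pos_L = value & 0xFFFF;
            if (extras_ry[e] == r_target) out.push_back({pos_L, extras_pos[e]});
        }
    }
    return out;
}

int main() {
    auto m = resolve_extras({(1725u << 16) | 7u}, {1725}, {12});
    printf("extra match idxL=%u idxR=%u\n", m[0].first, m[0].second); // 7, 12
    return 0;
}
```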
- for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) { - for (int dup=0; dup<y_duplicate_counts; dup++) { - uint32_t value = nick_rmap_extras_rl[dup]; - uint16_t r_target = value >> 16; - uint16_t pos_L = value & 0x0FFFF; - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = extra_pos_R; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //matches[total_matches+slot] = match; - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - } - } - - __syncthreads(); - - if (threadIdx.x == 0) { - if (doPrint>1) { - // only do this once, should be in constant memory - //if (doPrint>2) { - // printf("match list\n"); - // for (int i=0;i<total_matches;i++) { - // printf("[L:%u R:%u]\n", matches[i].idxL, matches[i].idxR); - // } - //} - printf("Total matches: %u\n", total_matches); - } - if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) { - printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1); - total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1); - } - } - - __syncthreads(); - - // now we go through all our matches and output to next round. - for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) { - Index_Match match = matches[i]; - BUCKETED_ENTRY_OUT pair = {}; - BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL]; - BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR]; - uint64_t blake_result; - uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id); - if (table == 1) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = R_Entry.meta[0]; - //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms - nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL); - //if (global_kbc_L_bucket_id == 1) { - //printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result); - //} - - } else if (table == 2) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = L_Entry.meta[1]; - pair.meta[2] = R_Entry.meta[0]; - pair.meta[3] = R_Entry.meta[1]; - nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL); - //if (global_kbc_L_bucket_id == 1) { - // uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1]; - // uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3]; - // printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result); - //} - } else if (table == 3) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta); - } else if (table == 4) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta); - } else if (table == 5) { - const uint32_t meta[6] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], - }; - nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta); - } else if (table == 6) { - const uint32_t meta[4] = { - L_Entry.meta[0], L_Entry.meta[1], - R_Entry.meta[0], R_Entry.meta[1] - }; - nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL); - } - //printf("table %u blake result: %llu\n", table, blake_result); - uint64_t batch_bucket = blake_result >> (38-6); - const uint64_t block_mod = (uint64_t) 1 << (38-6); - int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1); - uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES +
block_slot; - if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) { - printf("ERROR: results address overflow\n"); - } else { - if (table < 6) { - // our last table 6 doesn't write into hostmem criss cross, it just does backref with extra y instead. - pair.y = (uint32_t) (blake_result % block_mod); - bucketed_out[pair_address] = pair; - } - } - - //// TODO: export Lx's to save into table, these are x1,x3 denoting 2 pairs that can be compressed into kbc buckets - // we *could* do double the data in table 3, but then we need extra buffers and memory that we don't have - if (table == 2) { - // this task can be left to the CPU to deal with the batch buckets and write baseref to file. - } - if ((table == 3) || (table == 4) || (table == 5) || (table == 6)) { - - if (table == 6) { - // last table does backref with extra y truncated to most significant k bits. - T6BackRef ref = {}; - ref.prev_block_ref_L = L_Entry.blockposref; - ref.prev_block_ref_R = R_Entry.blockposref; - ref.y = (uint32_t) (blake_result >> kExtraBits); // get top 32 most significant bits, since calc_y is 38 bits. - //printf("blake y result table 6: %llu -> %u\n", blake_result, ref.y); - - T6BackRef *out = (T6BackRef *) bucketed_ref_out; - //if ((ref.prev_block_ref_L == 0) && (ref.prev_block_ref_R == 0)) { - // printf("Both refs are 0!\n"); - //} - out[pair_address] = ref; - } else if (table == 3) { - T3BaseRef ref = {}; - ref.Lx1 = L_Entry.meta[0]; - ref.Lx2 = L_Entry.meta[2]; - ref.Lx3 = R_Entry.meta[0]; - ref.Lx4 = R_Entry.meta[2]; - T3BaseRef *out = (T3BaseRef *) bucketed_ref_out; - out[pair_address] = ref; - } else if ((table == 3) || (table == 4) || (table == 5)) { - BackRef ref = {}; - ref.prev_block_ref_L = L_Entry.blockposref; - ref.prev_block_ref_R = R_Entry.blockposref; - BackRef *out = (BackRef *) bucketed_ref_out; - //if ((ref.prev_block_ref_L == 0) && (ref.prev_block_ref_R == 0)) { - // printf("Both refs are 0!\n"); - //} - out[pair_address] = ref; - } - } - - // do we have a double bucket to write into? - //uint32_t double_bucket_id = 0; - //uint32_t kbc_bucket_id = blake_result / kBC; - //uint64_t batch_bucket_min_kbc = (batch_bucket << 32) / kBC; - //uint64_t batch_bucket_max_kbc = ((batch_bucket+1) << 32) / kBC; - //if (kbc_bucket_id == batch_bucket_min_kbc) { - // double_bucket_id = batch_bucket - 1; - //} else if (kbc_bucket_id == batch_bucket_max_kbc) { - // double_bucket_id = batch_bucket + 1; - //} - } - - if (threadIdx.x == 0) { - //if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - if ((global_kbc_L_bucket_id % 1000000 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - } - /* - kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219 - kBC bucket id: 1 L entries: 242 R entries: 257 matches: 248 - kBC bucket id: 2 L entries: 257 R entries: 204 matches: 222 - kBC bucket id: 3 L entries: 204 R entries: 243 matches: 185 - Total matches: 4294859632 - - Computing table 3 - Bucket 0 uniform sort. Ram: 7.678GiB, u_sort min: 2.250GiB, qs min: 0.563GiB. 
- kBC bucket id: 0 L entries: 228 R entries: 253 matches: 276 - kBC bucket id: 1 L entries: 253 R entries: 230 matches: 227 - kBC bucket id: 2 L entries: 230 R entries: 232 matches: 212 - kBC bucket id: 3 L entries: 232 R entries: 237 matches: 221 - Total matches: 4294848520 - */ - if (threadIdx.x == 0) { - if (table == 1) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==222) && (num_R==242) && (total_matches==219)) { - printf("- TABLE 1 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 1 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 240 R entries: 233 matches: 232 - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==240) && (num_R==233) && (total_matches==232)) { - printf("- TABLE 1 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 1 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - if (table == 2) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==228) && (num_R==253) && (total_matches==276)) { - printf("- TABLE 2 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 2 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 241 R entries: 238 matches: 224 - - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==241) && (num_R==238) && (total_matches==224)) { - printf("- TABLE 2 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 2 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } -} - -template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT> -__global__ -void gpu_find_tx_matches_rmap_working(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) { - // match: 10000 ms - // table 1 match match: 1633 ms, potentially 2.5x faster than orig method - // with extras: 1841 ms - win! - // with extras hashed counters (working): 2144 ms - // Total tables time: 77112 ms - // match: 12505 ms - // TODO: TRY THIS AS GLOBAL MEMORY COVERING BATCH SIZE - //__shared__ __half nick_rmap_counts[kBC]; // 30226 bytes - const int RMAP_NUM_COUNTS_PER_BOX = 8; // whether 8 per box, 7, 4, bit counts 4 etc doesn't change result measurably I don't think. - const int RMAP_BITS_FOR_COUNTS = 4; - const int RMAP_COUNT_MASK = 0b01111; - const int NUM_RMAP_COUNTS = (15113 / RMAP_NUM_COUNTS_PER_BOX)+1; - __shared__ int nick_rmap_counts[NUM_RMAP_COUNTS]; // kBC / 2, sharing bits [12bits pos, 3 bits counter][12 bits pos, 3 bits counter] - //__shared__ int16_t nick_rmap_counts[kBC]; // 30226 bytes - __shared__ uint16_t nick_rmap_positions[kBC]; - __shared__ uint16_t nick_rmap_extras_ry[100]; - __shared__ uint16_t nick_rmap_extras_pos[100]; - __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ int total_matches; - __shared__ int num_extras; - - //__shared__ int non_duplicate_counts; - //__shared__ int duplicate_counts; - - int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - //if (global_kbc_L_bucket_id > 0) { - // return; - //} - - uint8_t doPrint = 1;//(global_kbc_L_bucket_id < 10) ? 1 : 0; // start_kbc_L > 0 ?
1: 0; // 0 is none, 1 is basic, 2 is detailed - //if (global_kbc_L_bucket_id == 75000) { - // doPrint = 100; - //} - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - int numThreadsInBlock = blockDim.x; - int threadId = threadIdx.x; - int threadStartScan = threadId; - int threadSkipScan = numThreadsInBlock; - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R]; - - if (threadIdx.x == 0) { - total_matches = 0; - num_extras = 0; - //non_duplicate_counts = 0; - //duplicate_counts = 0; - if (doPrint > 1) { - printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R); - if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) { - printf("ERROR numL or numR > max entries\n"); - return; - } - if ((num_L == 0) || (num_R == 0) ) { - printf("ERROR: numL and numR are 0\n"); - return; - } - } - } - // unfortunately to clear we have to do this 236 times for 64 threads - for (int i = threadIdx.x; i < NUM_RMAP_COUNTS; i += blockDim.x) { - nick_rmap_counts[i] = 0; - } - __syncthreads(); // all written initialize data should sync - - //bool printandquit = ((global_kbc_L_bucket_id == 75000)); - - - - - // if (printandquit) { - //printf("R_y list:\n"); - //for (size_t pos_R = 0; pos_R < num_R; pos_R++) { - // uint16_t r_y = kbc_R_entries[pos_R].y; - // printf("%u\n",r_y); - //} - //if (threadIdx.x == 0) { - // printf("L_y list num %u:\n", num_L); - // for (size_t pos_L = 0; pos_L < num_L; pos_L++) { - // uint16_t l_y = kbc_L_entries[pos_L].y; - // printf("%u\n",l_y); - // } - //} - // } - //__syncthreads(); - uint16_t parity = global_kbc_L_bucket_id % 2; - - for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - uint16_t r_y = R_entry.y; - //int16_t rmap_value = nick_rmap_counts[r_y]; - //uint8_t rmap_count = rmap_value & 0b0111; - - // TODO: ok, let's make it MUCH easier, and have the atomic adds on 3 bits only - // and cut kbc_map into 15 bit counts (5 counts) each. Gives us plenty of space now - // to have separate rmap_positions entries, and greaty simplifies code (hopefully). - // however...may be slower! 
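The active scheme below packs counts only: eight 4-bit counters per 32-bit word (RMAP_NUM_COUNTS_PER_BOX = 8, RMAP_BITS_FOR_COUNTS = 4), with positions kept unpacked in nick_rmap_positions. A minimal host sketch of the counter packing:

```cpp
// Host-side sketch of the count-only packing used by this kernel variant:
// 8 four-bit counters per 32-bit word; positions live in a separate array.
#include <cstdint>
#include <cstdio>

int main() {
    const int COUNTS_PER_BOX = 8, BITS = 4, MASK = 0xF;
    uint32_t box = 0;
    uint16_t r_y = 13;                       // counter 13 % 8 = 5 of this box
    int shift = (r_y % COUNTS_PER_BOX) * BITS;
    box += 1u << shift;                      // first occurrence
    box += 1u << shift;                      // duplicate occurrence
    printf("count=%u\n", (box >> shift) & MASK);   // count=2
    return 0;
}
```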
- //int kbc_map = r_y / 2; - //const int kbc_box_shift = (r_y % 2) * 12; - //int add = 1 << kbc_box_shift; - //int rmap_value = atomicAdd(&nick_rmap_counts[kbc_map],add); - //rmap_value = (rmap_value >> kbc_box_shift) & 0x0000FFFF; - //int rmap_count = rmap_value & 0b0111; - - int kbc_map = r_y / RMAP_NUM_COUNTS_PER_BOX; - const int kbc_box_shift = (r_y % RMAP_NUM_COUNTS_PER_BOX) * RMAP_BITS_FOR_COUNTS; // 3 bits each, gives up to 111 = 7 duplicates - - int add = 1 << kbc_box_shift; - int rmap_value = atomicAdd(&nick_rmap_counts[kbc_map],add); - int rmap_count = (rmap_value >> kbc_box_shift) & RMAP_COUNT_MASK; - - if (rmap_count == 0) { - nick_rmap_positions[r_y] = pos_R; - //int add_value = (pos_R << 3) << kbc_box_shift; - //atomicAdd(&nick_rmap_counts[kbc_map], add_value); - //int16_t new_value = atomicAdd(&nick_rmap_counts[r_y], add_value); // encode position - //if ((printandquit) && (r_y == 1725)) { - // nick_rmap_counts[r_y] = add + 1; - //unsigned short prev = atomicAdd(&nick_rmap_counts[r_y],add); - //printf("***** add value is: %u prev:%u\n", add, prev); - //prev = atomicAdd(&nick_rmap_counts[r_y],1); - //printf("***** add value is: %u prev:%u\n", add, prev); - //} - //nick_rmap_counts[r_y] = 1 + (pos_R << 3); - } else { - // we hit duplicate entry... - int slot = atomicAdd(&num_extras, 1); - nick_rmap_extras_ry[slot] = r_y; - nick_rmap_extras_pos[slot] = pos_R; - } - } - - __syncthreads(); // wait for all threads to write r_bid entries - - // load parity tables into shared - /*if (doPrint > 1) { - if (threadIdx.x == 0) { - printf("num extras bucket %u : %u parity: %u \n", global_kbc_L_bucket_id, num_extras, parity); - if (printandquit) { - for (int i=1700;i<1750;i++) { - //unsigned short value = nick_rmap_counts[i]; - //unsigned short count = value & 0b0111; - //printf("kbc:%u value:%u count:%u\n", i, value, count); - - int kbc_map = i / 2; - int kbc_box_shift = (i % 2) * 12; - int rmap_value = (nick_rmap_counts[kbc_map]) >> kbc_box_shift; - int rmap_count = rmap_value & (0b0111); - int pos = (rmap_value & 0b0111111111000) >> 3; - printf("kbc:%i value:%u count:%u pos:%u\n", i, rmap_value, rmap_count,pos); - } - } - } - - } - __syncthreads();*/ - - for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L]; - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - uint16_t l_y = L_entry.y; - //printf("scanning for pos_L: %u\n", pos_L); - - for (int m=0;m<64;m++) { - - //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup - // is super-inefficient. - - uint16_t indJ = l_y / kC; - uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC); - - //if (r_target != r_target_calc) { - // printf("CALC ERROR r_target calc %u does not match r_target %u\n", r_target_calc, r_target); - //} - - //uint16_t value = nick_rmap[r_target]; - //uint8_t count = value & 0x000F; - //__half value = nick_rmap_counts[r_target]; - //int16_t value = nick_rmap_counts[r_target]; - //unsigned short value = nick_rmap_counts[r_target]; - //unsigned short count = value & 0b0111; - - //int kbc_map = r_target / 2; - //int kbc_box_shift = (r_target % 2) * 12; - //int value = (nick_rmap_counts[kbc_map] >> kbc_box_shift) & 0x0000FFFF; - //int count = value & (0b0111); - - const int kbc_map = r_target / RMAP_NUM_COUNTS_PER_BOX; - const int kbc_box_shift = (r_target % RMAP_NUM_COUNTS_PER_BOX) * RMAP_BITS_FOR_COUNTS; // 3 bits each. 
- - int rmap_value = nick_rmap_counts[kbc_map]; - int count = (rmap_value >> kbc_box_shift) & RMAP_COUNT_MASK; - - //if ((printandquit) && (l_y == 13414)) { - // superdebug case: l_y == 13414, r_target hit: 1725 - // printf(" m: %u r_target: %u count:%u\n", m, r_target, count); - //} - if (count > 0) { - //uint16_t pos_R = value >> 3; - uint16_t pos_R = nick_rmap_positions[r_target]; - //if (printandquit) { - // printf("L_y: %u r_target hit: %u\n", l_y, r_target); - //} - //printf(" has match\n"); - int num_matches = atomicAdd(&total_matches,1); - if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - } else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R;//nick_rmap_positions[r_target];//value >> 4; - matches[num_matches] = match; - //atomicAdd(&non_duplicate_counts,1); - - // handle edge cases - // TODO: let's push these into separate array - // then test them later. - if (count > 1) { - // add the extras - //int extra_match = 0; - for (int slot = 0; slot < num_extras; slot++) { - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - match.idxR = extra_pos_R;//value >> 4; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //extra_match++; - //matches[num_matches+extra_match] = match; - //atomicAdd(&duplicate_counts,1); - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - } - //if (global_kbc_L_bucket_id < 10) { - // if (extra_match != count-1) { - // printf("ERRORRRR! EXTRA MATCHES %u DOES NOT MATCH COUNT-1 %u\n", extra_match, count); - // } else { - // printf("BUCKET L %u SUCCESSFULLY ADDED EXTRA COUNTS %u\n", global_kbc_L_bucket_id, count); - // } - //} - } - } - } - } - } - - __syncthreads(); - - - if (threadIdx.x == 0) { - if (doPrint>1) { - // only do this once, should be in constant memory - //if (doPrint>2) { - // printf("match list\n"); - // for (int i=0;i<total_matches;i++) { - // printf("[L:%u R:%u]\n", matches[i].idxL, matches[i].idxR); - // } - //} - printf("Total matches: %u\n", total_matches); - } - if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) { - printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1); - total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1); - } - } - - __syncthreads(); - - // now we go through all our matches and output to next round.
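The output loop below splits the 38-bit f(x) result (k = 32 plus kExtraBits = 6) into a 6-bit batch bucket from the top bits and a 32-bit local y from the bottom bits. A host sketch with an arbitrary sample value:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Same arithmetic as the kernel: shift (38-6)=32 selects 1 of 64 buckets,
    // modulo 2^32 keeps the local y that is stored in pair.y.
    uint64_t blake_result = 0x123456789aull & ((1ull << 38) - 1); // 38-bit y
    uint64_t batch_bucket = blake_result >> (38 - 6);             // 0..63
    uint32_t local_y = (uint32_t)(blake_result % (1ull << (38 - 6)));
    printf("bucket=%llu local_y=%u\n",
           (unsigned long long) batch_bucket, local_y);           // bucket=18
    return 0;
}
```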
- for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) { - Index_Match match = matches[i]; - BUCKETED_ENTRY_OUT pair = {}; - BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL]; - BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR]; - uint64_t blake_result; - uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id); - if (table == 1) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = R_Entry.meta[0]; - //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms - nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL); - if (global_kbc_L_bucket_id == 1) { - //if ((calc_y == 21557) && (L_Entry.meta[0] == 3620724289) && (R_Entry.meta[0] == 2663198278)) { - printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result); - //Ly is:[20932] Lx: [322482289] Rx: [3382886636] f result:[273114646565] - //if (blake_result == 56477140042) { - // printf(" ---** BLAKE CORRECT **\n"); - //} else { - // printf(" ---** BLAKE WRONG :(((( \n"); - //} - // Ly is:[21557] Lx: [3620724289] Rx: [2663198278] f result:[56477140042] - //} - } - - } else if (table == 2) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = L_Entry.meta[1]; - pair.meta[2] = R_Entry.meta[0]; - pair.meta[3] = R_Entry.meta[1]; - nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL); - if (global_kbc_L_bucket_id == 1) { - uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1]; - uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3]; - printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result); - } - } else if (table == 3) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta); - } else if (table == 4) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta); - } else if (table == 5) { - const uint32_t meta[6] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], - }; - nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta); - } else if (table == 6) { - const uint32_t meta[4] = { - L_Entry.meta[0], L_Entry.meta[1], - R_Entry.meta[0], R_Entry.meta[1] - }; - nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL); - } - uint64_t batch_bucket = blake_result >> (38-6); - const uint64_t block_mod = (uint64_t) 1 << (38-6); - pair.y = (uint32_t) (blake_result % block_mod); - int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1); - uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES + block_slot; - if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) { - printf("ERROR: results address overflow\n"); - } else { - bucketed_out[pair_address] = pair; - } - - // do we have a double bucket to write into? 
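The commented-out block below is probing a real edge case: batch buckets are 2^32 y-values wide while kBC buckets are 15113 wide, and since 2^32 is not a multiple of 15113 the kBC bucket at each batch boundary straddles two batches. A sketch that prints where the first few boundaries fall:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t kBC = 15113;
    for (uint64_t batch_bucket = 1; batch_bucket < 4; batch_bucket++) {
        uint64_t boundary_y = batch_bucket << 32;    // first y of this batch
        printf("batch %llu starts %llu y-values into kbc bucket %llu\n",
               (unsigned long long) batch_bucket,
               (unsigned long long) (boundary_y % kBC),
               (unsigned long long) (boundary_y / kBC));
    }
    return 0;
}
```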
- //uint32_t double_bucket_id = 0; - //uint32_t kbc_bucket_id = blake_result / kBC; - //uint64_t batch_bucket_min_kbc = (batch_bucket << 32) / kBC; - //uint64_t batch_bucket_max_kbc = ((batch_bucket+1) << 32) / kBC; - //if (kbc_bucket_id == batch_bucket_min_kbc) { - // double_bucket_id = batch_bucket - 1; - //} else if (kbc_bucket_id == batch_bucket_max_kbc) { - // double_bucket_id = batch_bucket + 1; - //} - } - - if (threadIdx.x == 0) { - //if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - if ((global_kbc_L_bucket_id % 25000 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - } - /* - kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219 - kBC bucket id: 1 L entries: 242 R entries: 257 matches: 248 - kBC bucket id: 2 L entries: 257 R entries: 204 matches: 222 - kBC bucket id: 3 L entries: 204 R entries: 243 matches: 185 - Total matches: 4294859632 - - Computing table 3 - Bucket 0 uniform sort. Ram: 7.678GiB, u_sort min: 2.250GiB, qs min: 0.563GiB. - kBC bucket id: 0 L entries: 228 R entries: 253 matches: 276 - kBC bucket id: 1 L entries: 253 R entries: 230 matches: 227 - kBC bucket id: 2 L entries: 230 R entries: 232 matches: 212 - kBC bucket id: 3 L entries: 232 R entries: 237 matches: 221 - Total matches: 4294848520 - */ - if (threadIdx.x == 0) { - if (table == 1) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==222) && (num_R==242) && (total_matches==219)) { - printf("- TABLE 1 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 1 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 240 R entries: 233 matches: 232 - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==240) && (num_R==233) && (total_matches==232)) { - printf("- TABLE 1 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 1 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - if (table == 2) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==228) && (num_R==253) && (total_matches==276)) { - printf("- TABLE 2 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 2 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 241 R entries: 238 matches: 224 - - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==241) && (num_R==238) && (total_matches==224)) { - printf("- TABLE 2 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 2 bucket 4000000 MATCHES WRONG! 
num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches);
-					}
-				}
-			}
-		}
-}
-
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_find_tx_matches_orig(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R,
-		const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries,
-		BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) {
-	const uint16_t MAX_BIDS = 16;
-	__shared__ uint16_t R_bids[kC*MAX_BIDS]; // kC is 127, MAX_BIDS is 16: 127*16 * 2 bytes = ~4kb
-	__shared__ int R_bids_count[kC]; // 127 * 4 bytes
-	__shared__ int R_bid_positions[kC*MAX_BIDS];//RBid_Entry R_bid_entries[kC*MAX_BIDS]; // 127 * 16 * 4 bytes = ~8kb
-	__shared__ uint8_t matching_shifts_c[64]; // 64 bytes
-	__shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET];
-	__shared__ int total_matches;
-	//*********************
-	//Total tables time: 86822 ms
-	//        match: 22397 ms
-	//      phase 1: 3930ms
-	//__shared__ Bucketed_kBC_Entry kbc_L_entries[400]; // will copy global to here, unfortunately not faster :(
-	//__shared__ Bucketed_kBC_Entry kbc_R_entries[400];
-
-	//end_kbc_R = end_kbc_R - start_kbc_L;
-	//start_kbc_L = 0;
-	//if (threadIdx.x == 0) {
-	//	printf("doing block inside kernel %u\n", start_kbc_L);
-	//}
-
-	int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L;
-	uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L;
-
-	// doPrint 1 = end matches and bucket counts, 2 = a little debug, 3 = lots.
-	const uint8_t doPrint = 1;//(global_kbc_L_bucket_id < 10) ? 1 : 0; // start_kbc_L > 0 ? 1: 0; // 0 is none, 1 is basic, 2 is detailed
-
-	if (gridDim.x != (end_kbc_R - start_kbc_L)) {
-		printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L);
-	}
-	int numThreadsInBlock = blockDim.x;
-	int threadId = threadIdx.x;
-	int threadStartScan = threadId;
-	int threadSkipScan = numThreadsInBlock;
-
-	//printf("threadId: %u startScan: %u skipScan: %u", threadId, threadStartScan, threadSkipScan);
-	if (threadIdx.x == 0) {
-		// only do this once, should be in constant memory
-		/*for (uint16_t parity = 0; parity < 2; parity++) {
-			for (uint16_t r = 0; r < 64; r++) {
-				uint16_t v = ((2 * r + parity) * (2 * r + parity)) % kC;
-				matching_shifts_c[parity][r] = v;
-				//printf("matching shifts %u %u = %u\n", parity, r, v);
-			}
-		}*/
-		total_matches = 0;
-	}
-
-	uint16_t max_bids_found = 0;
-
-	//const uint32_t start_L = kbc_start_addresses[kbc_L_bucket_id];
-	//const uint32_t start_R = kbc_start_addresses[kbc_R_bucket_id];
-	//const int num_L = start_R - start_L;
-	//const int num_R = (start_R < kBC_NUM_BUCKETS) ? kbc_start_addresses[kbc_R_bucket_id+1] - start_R : total_entries_count - start_R;
-	const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET;
-	const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET;
-	const int num_L = kbc_local_num_entries[kbc_L_bucket_id];
-	const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)];
-
-	if (threadIdx.x == 0) {
-		if (doPrint > 1) printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R);
-		if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) {
-			printf("ERROR numL or numR > max entries\n");
-			return;
-		}
-		if ((num_L == 0) || (num_R == 0) ) {
-			printf("ERROR: numL or numR is 0\n");
-			return;
-		}
-	}
-
-	const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L];
-	const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R];
-
-	uint16_t parity = global_kbc_L_bucket_id % 2;
-	for (int r = threadIdx.x; r < 64; r += blockDim.x) {
-		uint16_t v = ((2 * r + parity) * (2 * r + parity)) % kC;
-		matching_shifts_c[r] = v; // this is a wash...doesn't save much if anything
-	}
-	for (int i = threadIdx.x; i < kC; i += blockDim.x) {
-		R_bids_count[i] = 0;
-	}
-
-	__syncthreads(); // all written initialize data should sync
-
-	//Bucketed_kBC_Entry L_entry = kbc_local_entries[0];
-	BUCKETED_ENTRY_IN temp_entry = kbc_L_entries[0];
-
-	//uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(temp_entry, global_kbc_L_bucket_id);
-	//uint16_t parity = (calc_y / kBC) % 2;
-
-	for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-		//Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-		BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-		//global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L;
-		//calc_y = CALC_Y_BUCKETED_KBC_ENTRY(R_entry, global_kbc_L_bucket_id+1);
-		uint16_t y_kC = R_entry.y % kC; // should be same as calc_y % kC ?
-		uint16_t y_mod_kBC_div_kC = R_entry.y / kC; // i.e. (y % kBC) / kC, since R_entry.y is already bucket-local
-
-		int num_bids = atomicAdd(&R_bids_count[y_kC],1);
-		if (num_bids >= MAX_BIDS) {
-			printf("ERROR KBC LOCAL MAX BIDS EXCEEDED %u in global bucket %u\n", num_bids, global_kbc_L_bucket_id);
-			//printf("\nR_entry y:%u meta[0]:%u y_kC:%u y_mod_kBC_div_kC: %u into slot: %u\n ", R_entry.y, R_entry.meta[0], y_kC, y_mod_kBC_div_kC, num_bids);
-		} else {
-			// uint8_t num_bids = R_bids_count[y_kC]++;
-			R_bids[y_kC*MAX_BIDS + num_bids] = y_mod_kBC_div_kC;
-			//R_bid_entries[y_kC*MAX_BIDS + num_bids].x = R_entry.x;
-			R_bid_positions[y_kC*MAX_BIDS + num_bids] = pos_R;
-		}
-
-		//if (doPrint>2) printf("R_entry x:%u y:%u y_kC:%u y_mod_kBC_div_kC: %u into slot: %u\n ", R_entry.x, R_entry.y, y_kC, y_mod_kBC_div_kC, num_bids);
-
-		if (num_bids > max_bids_found) {
-			max_bids_found = num_bids;
-		}
-	}
-
-	__syncthreads(); // wait for all threads to write r_bid entries
-
-	for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) {
-		//Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L];
-		BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-
-		//if (doPrint>2) printf("CHECKING pos_L:%u entry x:%u for match\n", pos_L, L_entry.x);
-		uint16_t yl_bid = L_entry.y / kC;
-		uint16_t yl_cid = L_entry.y % kC;
-
-		for (uint8_t m = 0; m < 64; m++) {
-			uint16_t target_bid = (yl_bid + m);
-			// TODO: benchmark if matching_shifts array is actually faster...doubt it.
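-			// The bid/cid scan here implements the chiapos matching condition: an L entry (bucket b) and an
-			// R entry (bucket b+1) match iff for some m in [0,64):
-			//     (yR/kC - yL/kC) mod kB == m                             (bucket-id test)
-			//     (yR%kC - yL%kC) mod kC == ((2m + parity)^2) mod kC      (char-id test, parity = (y/kBC) % 2)
-			// matching_shifts_c[m] caches ((2m+parity)^2) % kC for this bucket's parity, so the loop below only
-			// has to derive a (target_bid, target_cid) pair and compare it against the R-side tables built above.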
-			uint16_t target_cid = yl_cid + matching_shifts_c[m]; // turns out it's a wash
-			//uint16_t target_cid = yl_cid + ((2 * m + parity) * (2 * m + parity)) % kC;
-
-			// This is faster than %
-			if (target_bid >= kB) {
-				target_bid -= kB;
-			}
-			if (target_cid >= kC) { // check if rid of %k on = part above.
-				target_cid -= kC;
-			}
-
-			uint16_t num_bids = R_bids_count[target_cid];
-			if (num_bids > MAX_BIDS) {
-				printf("PRUNING NUM BIDS FROM %u TO %u", num_bids, MAX_BIDS);
-				num_bids = MAX_BIDS;
-			}
-			// this inner loop is inefficient as num bids can vary...maybe push into list?
-			for (uint32_t i = 0; i < num_bids; i++) {
-				uint16_t R_bid = R_bids[target_cid*MAX_BIDS + i];
-
-				if (target_bid == R_bid) {
-					int pos_R = R_bid_positions[target_cid*MAX_BIDS + i];
-					int num_matches = atomicAdd(&total_matches,1);
-					if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-						printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-					} else {
-						Index_Match match = { };
-						match.idxL = pos_L;
-						match.idxR = pos_R;
-						matches[num_matches] = match;
-					}
-					//if (doPrint>2) {
-					//	printf("Thread %u pos_L:%u Match #%u found Lx:%u, Rx:%u\n", threadId, pos_L, num_matches, L_entry.x, R_entry.x);
-					//}
-					//printf("   Match found Lx:%u, Rx:%u\n", match.Lx, match.Rx);
-				}
-			}
-		}
-
-	}
-
-	__syncthreads();
-
-	if (threadIdx.x == 0) {
-		if (doPrint>1) {
-			// only do this once, should be in constant memory
-			//if (doPrint>2) {
-			//	printf("match list\n");
-			//	for (int i=0;i<total_matches;i++) printf("  %d: idxL:%u idxR:%u\n", i, matches[i].idxL, matches[i].idxR);
-			//}
-		}
-		if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-			printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-			total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-		}
-	}
-
-	__syncthreads();
-
-	/*if ((global_kbc_L_bucket_id == 0) && (threadIdx.x == 0)) {
-
-		printf("Bucket match calc verification bucket %u num_matches: %u", global_kbc_L_bucket_id, total_matches);
-		for (int i=0;i < total_matches;i++) {
-			Index_Match match = matches[i];
-			BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL];
-			BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR];
-
-			printf("L_Entry y %u R_Entry y %u\n", L_Entry.y, R_Entry.y);
-			int16_t yr_kbc = R_Entry.y;
-			int16_t yr_bid = yr_kbc / kC; // values [0..kB]
-			int16_t yl_kbc = L_Entry.y;
-			int16_t yl_bid = yl_kbc / kC; // values [0..kB]
-			int16_t formula_one = yr_bid - yl_bid; // this should actually give m
-			if (formula_one < 0) {
-				formula_one += kB;
-			}
-			int16_t m = formula_one;
-			if (m >= kB) {
-				m -= kB;
-			}
-			printf("  m value calc: %u\n", m);
-			if (m < 64) {
-				// passed first test
-				int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC perfectly divisible by kC
-				int16_t yr_cid = yr_kbc % kC;
-				int16_t parity = (global_kbc_L_bucket_id) % 2;
-				int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127]
-				int16_t formula_two = yr_cid - yl_cid;
-				if (formula_two < 0) {
-					formula_two += kC;
-				}
-				printf("  formula two %u <-> m2_parity %u\n", formula_two, m2_parity_squared);
-				if (formula_two == m2_parity_squared) {
-					// we have a match.
-					printf("   MATCH OK\n");
-				} else {
-					printf("   FAILED TO MATCH\n");
-				}
-			}
-		}
-
-	}*/
-
-	// now we go through all our matches and output to next round.
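-	// (The f(x) results produced below are 38-bit values: k=32 bits plus kExtraBits=6. The 6 high bits select
-	// one of the 64 batch buckets for the criss-cross layout and the low 32 bits are kept as the entry's
-	// block-local y: batch_bucket = blake_result >> 32 and pair.y = blake_result mod 2^32, which is exactly
-	// what the (38-6) shift and block_mod in the loop compute.)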
- for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) { - Index_Match match = matches[i]; - BUCKETED_ENTRY_OUT pair = {}; - BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL]; - BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR]; - uint64_t blake_result; - uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id); - if (table == 1) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = R_Entry.meta[0]; - //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms - nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL); - if (global_kbc_L_bucket_id == 1) { - //if ((calc_y == 21557) && (L_Entry.meta[0] == 3620724289) && (R_Entry.meta[0] == 2663198278)) { - printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result); - //Ly is:[20932] Lx: [322482289] Rx: [3382886636] f result:[273114646565] - //if (blake_result == 56477140042) { - // printf(" ---** BLAKE CORRECT **\n"); - //} else { - // printf(" ---** BLAKE WRONG :(((( \n"); - //} - // Ly is:[21557] Lx: [3620724289] Rx: [2663198278] f result:[56477140042] - //} - } - - } else if (table == 2) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = L_Entry.meta[1]; - pair.meta[2] = R_Entry.meta[0]; - pair.meta[3] = R_Entry.meta[1]; - nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL); - if (global_kbc_L_bucket_id == 1) { - uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1]; - uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3]; - printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result); - } - } else if (table == 3) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta); - } else if (table == 4) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta); - } else if (table == 5) { - const uint32_t meta[6] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], - }; - nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta); - } else if (table == 6) { - const uint32_t meta[4] = { - L_Entry.meta[0], L_Entry.meta[1], - R_Entry.meta[0], R_Entry.meta[1] - }; - nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL); - } - uint64_t batch_bucket = blake_result >> (38-6); - const uint64_t block_mod = (uint64_t) 1 << (38-6); - pair.y = (uint32_t) (blake_result % block_mod); - int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1); - uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES + block_slot; - if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) { - printf("ERROR: results address overflow\n"); - } else { - bucketed_out[pair_address] = pair; - } - - // do we have a double bucket to write into? 
- //uint32_t double_bucket_id = 0; - //uint32_t kbc_bucket_id = blake_result / kBC; - //uint64_t batch_bucket_min_kbc = (batch_bucket << 32) / kBC; - //uint64_t batch_bucket_max_kbc = ((batch_bucket+1) << 32) / kBC; - //if (kbc_bucket_id == batch_bucket_min_kbc) { - // double_bucket_id = batch_bucket - 1; - //} else if (kbc_bucket_id == batch_bucket_max_kbc) { - // double_bucket_id = batch_bucket + 1; - //} - } - - if (threadIdx.x == 0) { - //if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - if ((global_kbc_L_bucket_id % 25000 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - } - /* - kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219 - kBC bucket id: 1 L entries: 242 R entries: 257 matches: 248 - kBC bucket id: 2 L entries: 257 R entries: 204 matches: 222 - kBC bucket id: 3 L entries: 204 R entries: 243 matches: 185 - Total matches: 4294859632 - - Computing table 3 - Bucket 0 uniform sort. Ram: 7.678GiB, u_sort min: 2.250GiB, qs min: 0.563GiB. - kBC bucket id: 0 L entries: 228 R entries: 253 matches: 276 - kBC bucket id: 1 L entries: 253 R entries: 230 matches: 227 - kBC bucket id: 2 L entries: 230 R entries: 232 matches: 212 - kBC bucket id: 3 L entries: 232 R entries: 237 matches: 221 - Total matches: 4294848520 - */ - if (threadIdx.x == 0) { - if (table == 1) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==222) && (num_R==242) && (total_matches==219)) { - printf("- TABLE 1 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 1 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 240 R entries: 233 matches: 232 - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==240) && (num_R==233) && (total_matches==232)) { - printf("- TABLE 1 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 1 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - if (table == 2) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==228) && (num_R==253) && (total_matches==276)) { - printf("- TABLE 2 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 2 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 241 R entries: 238 matches: 224 - - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==241) && (num_R==238) && (total_matches==224)) { - printf("- TABLE 2 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 2 bucket 4000000 MATCHES WRONG! 
num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches);
-				}
-			}
-		}
-	}
-}
-
-#define KBCFILTER_WITH_XINCLUDES(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	for (int j=0;j<64;j++) { \
-		if (include_xs[j] == (x+i)) { printf("including x %u\n", (x+i)); \
-			if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
-				uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
-				int slot = atomicAdd(&kbc_local_num_entries[local_kbc_bucket_id],1); \
-				F1_Bucketed_kBC_Entry entry = { (x+i), (uint32_t) (y % kBC) }; \
-				if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-				uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-				kbc_local_entries[entries_address] = entry; \
-			} \
-	} } \
-}
-
-//if ((x + i) < 256) { printf("x: %u y:%llu kbc:%u\n", (x+i), y, kbc_bucket_id); }
-#define KBCFILTER(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	for (int j=0;j<64;j++) { \
-		if (include_xs[j] == (x+i)) { printf("including x %u\n", (x+i)); \
-			if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
-				uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
-				int slot = atomicAdd(&kbc_local_num_entries[local_kbc_bucket_id],1); \
-				F1_Bucketed_kBC_Entry entry = { (x+i), (uint32_t) (y % kBC) }; \
-				if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-				uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-				kbc_local_entries[entries_address] = entry; \
-			} \
-	} } \
-}
-
-//if ((x + i) < 256) { printf("x: %u y:%llu kbc:%u\n", (x+i), y, kbc_bucket_id); }
-//if (((x+i) % (1024*1024)) == 0) { printf("x: %u chacha: %u y:%llu kbc:%u\n", (x+i), chacha_y, y, kbc_bucket_id); }
-//if (kbc_bucket_id == 0) { printf("x: %u chacha: %u y:%llu kbc:%u\n", (x+i), chacha_y, y, kbc_bucket_id); }
-
-#undef KBCFILTER // redefine the production filter without the include_xs debug scan
-#define KBCFILTER(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
-		uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
-		int slot = atomicAdd(&kbc_local_num_entries[local_kbc_bucket_id],1); \
-		F1_Bucketed_kBC_Entry entry = { (x+i), (uint32_t) (y % kBC) }; \
-		if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-		uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-		kbc_local_entries[entries_address] = entry; \
-	} \
-}
-
-__global__
-void gpu_chacha8_get_k32_keystream_into_local_kbc_entries(const uint32_t N,
-		const __restrict__ uint32_t *input, F1_Bucketed_kBC_Entry *kbc_local_entries, int *kbc_local_num_entries,
-		uint32_t KBC_START, uint32_t KBC_END)
-{
-	uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
-	int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-	int stride = blockDim.x * gridDim.x;
-	const uint32_t end_n = N / 16; // 16 x's in each group
-	/*const uint32_t include_xs[64] = {602009779,2127221679,3186459061,443532047,1234434947,1652736830,396228306,464118917,
-			3981993340,3878862024,1730679522,3234011360,521197720,2635193875,2251292298,608281027,
			1468569780,2075860307,2880258779,999340005,1240438978,4293399624,4226635802,1031429862,
-			2391120891,3533658526,3823422504,3983813271,4180778279,2403148863,2441456056,319558395,
-			2338010591,196206622,1637393731,853158574,2704638588,2368357012,1703808356,451208700,
-			2145291166,2741727812,3305809226,1748168268,415625277,3051905493,4257489502,1429077635,
-			2438113590,3028543211,3993396297,2678430597,458920999,889121073,3577485087,1822568056,
-			2222781147,1942400192,195608354,1460166215,2544813525,3231425778,2958837604,2710532969};*/
-
-	uint32_t x_group = index;
-	//for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-		uint32_t x = x_group << 4;// *16;
-		uint32_t pos = x_group;
-
-		x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
-		x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
-		x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
-		x14 = input[14];x15 = input[15];
-
-		#pragma unroll
-		for (int i = 0; i < 4; i++) {
-			QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
-			QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
-		}
-
-		x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
-		x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
-		x10 += input[10];x11 += input[11];x12 += pos; // j12;//x13 += 0;
-		x14 += input[14];x15 += input[15];
-
-		// convert to little endian/big endian whatever, chia needs it like this
-		BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
-		BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
-		BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
-
-		//uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-		//int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-		KBCFILTER(x0,0);KBCFILTER(x1,1);KBCFILTER(x2,2);KBCFILTER(x3,3);
-		KBCFILTER(x4,4);KBCFILTER(x5,5);KBCFILTER(x6,6);KBCFILTER(x7,7);
-		KBCFILTER(x8,8);KBCFILTER(x9,9);KBCFILTER(x10,10);KBCFILTER(x11,11);
-		KBCFILTER(x12,12);KBCFILTER(x13,13);KBCFILTER(x14,14);KBCFILTER(x15,15);
-	//}
-}
-
-__global__
-void gpu_print_kbc_counts(int *local_kbc_counts) {
-	for (int i = 0; i < 10/*KBC_LOCAL_NUM_BUCKETS*/; i++) {
-		printf("kbc bucket: %u num:%u\n", i, local_kbc_counts[i]);
-	}
-}
-
-template <typename BUCKETED_ENTRY>
-__global__
-void gpu_print_kbc_bucket_contents(BUCKETED_ENTRY *entries, int *local_kbc_counts) {
-	for (uint32_t kbc_bucket_id = 0; kbc_bucket_id < 4/*KBC_LOCAL_NUM_BUCKETS*/; kbc_bucket_id++) {
-		int num = local_kbc_counts[kbc_bucket_id];
-		uint64_t add_Y = CALC_KBC_BUCKET_ADD_Y(kbc_bucket_id);
-		printf("kbc bucket: %u num:%u\n", kbc_bucket_id, num);
-		for (int idxL=0;idxL<num;idxL++) {
-			BUCKETED_ENTRY entry = entries[kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + idxL];
-			printf("   [%d] y:%u global_y:%llu meta[0]:%u\n", idxL, entry.y, (uint64_t) entry.y + add_Y, entry.meta[0]);
-		}
-	}
-}
-
-template <typename BUCKETED_ENTRY>
-__global__
-void gpu_merge_block_buckets_into_kbc_buckets(
-		const uint32_t KBC_START_ID, // determined by batch_id
-		const BUCKETED_ENTRY *in, uint64_t batch_bucket_add_Y, const uint32_t N,
-		BUCKETED_ENTRY *local_kbc_entries, int *local_kbc_counts)
-{
-	uint32_t i = blockIdx.x*blockDim.x+threadIdx.x;
-	//for (int i = 0; i < N; i++) {
-	if (i < N) {
-		// TODO: try just reading out entries and see if they match when going in
-
-		BUCKETED_ENTRY block_entry = in[i];
-		uint64_t calc_y = (uint64_t) block_entry.y + batch_bucket_add_Y;
-		uint32_t kbc_id = calc_y / kBC;
-		uint32_t KBC_END_ID = KBC_START_ID + KBC_LOCAL_NUM_BUCKETS;
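-		// (calc_y rebuilds the global 38-bit y: entries were stored with only y mod 2^32 inside their batch
-		// bucket, and batch_bucket_add_Y = batch_id * 2^32 restores the top bits before re-bucketing by kBC.)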
-		if ((kbc_id < KBC_START_ID) || (kbc_id > KBC_END_ID)) {
-			printf(" i:%u entry.y:%u add_Y:%llu calc_y:%llu OUT OF RANGE: kbc id: %u KBC_LOCAL_NUM_BUCKETS:%u START:%u END:%u\n", i, block_entry.y, batch_bucket_add_Y, calc_y, kbc_id, KBC_LOCAL_NUM_BUCKETS, KBC_START_ID, KBC_END_ID);
-		}
-
-		uint32_t local_kbc_id = kbc_id - KBC_START_ID;
-		int slot = atomicAdd(&local_kbc_counts[local_kbc_id],1);
-		uint32_t destination_address = local_kbc_id * KBC_MAX_ENTRIES_PER_BUCKET + slot;
-
-		//printf("block_id:%u [i: %u] entry.y:%u kbc_id:%u local_kbc:%u slot:%u dest:%u\n",
-		//		block_id, i, block_entry.y, kbc_id, local_kbc_id, slot, destination_address);
-
-		if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) {
-			printf("OVERFLOW: slot >= MAX ENTRIES PER BUCKET\n");
-		}
-		if (destination_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-			printf("OVERFLOW: destination_address overflow >= DEVICE_BUFFER_ALLOCATED_ENTRIES %u\n", destination_address);
-		}
-		block_entry.y = calc_y % kBC; // hah! Don't forget to map it to kbc bucket form.
-		local_kbc_entries[destination_address] = block_entry;
-	}
-}
-
-template <typename BUCKETED_ENTRY, typename BUCKETED_ENTRY_BLOCKPOSREF>
-__global__
-void gpu_merge_block_buckets_into_kbc_buckets_with_blockposref(
-		const uint32_t KBC_START_ID, const uint32_t block_id, // determined by batch_id
-		const BUCKETED_ENTRY *in, uint64_t batch_bucket_add_Y, const uint32_t N,
-		BUCKETED_ENTRY_BLOCKPOSREF *local_kbc_entries, int *local_kbc_counts,
-		int metasize)
-{
-	uint32_t i = blockIdx.x*blockDim.x+threadIdx.x;
-	//for (int i = 0; i < N; i++) {
-	if (i < N) {
-		// TODO: try just reading out entries and see if they match when going in
-
-		BUCKETED_ENTRY block_entry = in[i];
-		BUCKETED_ENTRY_BLOCKPOSREF backref_entry = {};
-		//size_t n = sizeof(block_entry.meta)/sizeof(block_entry.meta[0]);
-		for (int s=0;s<metasize;s++) {
-			backref_entry.meta[s] = block_entry.meta[s];
-		}
-		uint64_t calc_y = (uint64_t) block_entry.y + batch_bucket_add_Y;
-		uint32_t kbc_id = calc_y / kBC;
-		uint32_t KBC_END_ID = KBC_START_ID + KBC_LOCAL_NUM_BUCKETS;
-		backref_entry.y = calc_y % kBC; // mapped to kbc bucket form, as in the kernel above
-		backref_entry.blockposref = (block_id << 26) + i; // assumption: block_id packed into the high bits, in-block position i into the low bits
-		if ((kbc_id < KBC_START_ID) || (kbc_id > KBC_END_ID)) {
-			printf(" i:%u entry.y:%u add_Y:%llu calc_y:%llu OUT OF RANGE: kbc id: %u KBC_LOCAL_NUM_BUCKETS:%u START:%u END:%u\n", i, block_entry.y, batch_bucket_add_Y, calc_y, kbc_id, KBC_LOCAL_NUM_BUCKETS, KBC_START_ID, KBC_END_ID);
-		}
-
-		uint32_t local_kbc_id = kbc_id - KBC_START_ID;
-		int slot = atomicAdd(&local_kbc_counts[local_kbc_id],1);
-		uint32_t destination_address = local_kbc_id * KBC_MAX_ENTRIES_PER_BUCKET + slot;
-
-		//printf("block_id:%u [i: %u] entry.y:%u kbc_id:%u local_kbc:%u slot:%u dest:%u\n",
-		//		block_id, i, block_entry.y, kbc_id, local_kbc_id, slot, destination_address);
-
-		if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) {
-			printf("OVERFLOW: slot >= MAX ENTRIES PER BUCKET\n");
-		}
-		if (destination_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-			printf("OVERFLOW: destination_address overflow >= DEVICE_BUFFER_ALLOCATED_ENTRIES %u\n", destination_address);
-		}
-		//block_entry.y = calc_y % kBC; // hah! Don't forget to map it to kbc bucket form.
-		local_kbc_entries[destination_address] = backref_entry;
-	}
-}
-
-void transferBlocksFromHostToDevice(const uint16_t table, const uint32_t batch_id,
-		char *device_buffer_in, char *device_buffer_kbc, const size_t DEVICE_ENTRY_SIZE) {
-	uint32_t KBC_START = MIN_KBC_BUCKET_FOR_BATCH(batch_id);
-
-	// consider compressing stream to cpu
-	// https://developer.nvidia.com/blog/optimizing-data-transfer-using-lossless-compression-with-nvcomp/
-
-	// clear local kbc's!
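-	// (This memset matters: the per-bucket counters are the allocation cursors for the atomicAdd slot
-	// assignment inside the merge kernels above, so stale counts from the previous batch would scatter this
-	// batch's entries past the real bucket ends.)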
- CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries, 0, KBC_LOCAL_NUM_BUCKETS*sizeof(int))); - - uint64_t device_bytes_start = 0; - uint32_t total_entries_copied = 0; - for (uint32_t block_id = 0; block_id < BATCHES; block_id++) { - //std::cout << "\n Preparing batch:" << batch_id << " block:" << block_id << " for host->device" << std::endl; - uint32_t criss_cross_id = getCrissCrossBlockId(table,batch_id,block_id); - //std::cout << " criss_cross_id:" << criss_cross_id << std::endl; - uint32_t num_entries_to_copy = host_criss_cross_entry_counts[criss_cross_id]; - //std::cout << " num_entries_to_copy: " << num_entries_to_copy << std::endl; - uint64_t host_block_entry_start_position = getCrissCrossBlockEntryStartPosition(criss_cross_id); - uint64_t host_bytes_start = host_block_entry_start_position * HOST_UNIT_BYTES; - //std::cout << " host_block_entry_start_position: " << host_block_entry_start_position << std::endl; - //std::cout << " host_bytes_start: " << host_bytes_start << std::endl; - total_entries_copied += num_entries_to_copy; - - if (num_entries_to_copy > HOST_MAX_BLOCK_ENTRIES) { - std::cout << "OVERFLOW: num_entries_to_copy " << num_entries_to_copy << " > HOST_MAX_BLOCK_ENTRIES " << HOST_MAX_BLOCK_ENTRIES << std::endl; - } - - size_t bytes_to_copy = num_entries_to_copy*DEVICE_ENTRY_SIZE; - if (device_bytes_start + bytes_to_copy > DEVICE_BUFFER_ALLOCATED_BYTES) { - std::cout << "ERROR: DEVICE BUFFER OVERFLOW\n size wanted: " << (device_bytes_start + bytes_to_copy) << " size available:" << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - } - if (host_bytes_start + bytes_to_copy > HOST_ALLOCATED_BYTES) { - std::cout << "ERROR: HOST MEM OVERFLOW\n size wanted: " << (host_bytes_start + bytes_to_copy) << " size available:" << HOST_ALLOCATED_BYTES << std::endl; - } - - /* - Total tables time: 73825 ms - match: 10377 ms - ---------- -transfer time: 61610 ms - bytes: 687109273160 (639GB) - - - ******- no pci transfer, do direct fro mhost...saved 7s or 10% (ok, we don't include writing to disk) *************** -Total tables time: 66989 ms - match: 10358 ms - ---------- -transfer time: 54805 ms - bytes: 687109273464 (639GB) -********************* - */ - - //std::cout << " Copying " << num_entries_to_copy - // << " entries from device_bytes_start: " << device_bytes_start - // << " to host_bytes_start: " << host_bytes_start - // << " bytes length: " << bytes_to_copy << std::endl; - //std::cout << " Block_id: " << block_id << " device->host bytes:" << bytes_to_copy << " entries:" << num_entries_to_copy << std::endl; - const bool use_direct_from_host = true; - if (!use_direct_from_host) { - CUDA_CHECK_RETURN(cudaMemcpy(&device_buffer_in[device_bytes_start], &host_criss_cross_blocks[host_bytes_start],bytes_to_copy,cudaMemcpyHostToDevice)); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - } - //std::cout << " done.\n"; - - // now for our block, determine what the kbc counts were, and merge entries ordered into global kbc's. 
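-		// (One thread per entry: numBlocks below rounds up so that numBlocks * blockSize >= num_entries_to_copy,
-		// and the merge kernels bounds-check i < N for the final partial block; e.g. 1,000,000 entries at
-		// blockSize 256 gives numBlocks = 3907.)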
-		// gpu_map_in_buffer_to_global_kbc_for_batch(device_buffer_in, device_buffer_out, num_entries_to_copy);
-		int blockSize = 256;
-		int numBlocks = (num_entries_to_copy + blockSize - 1) / (blockSize);
-		uint64_t batch_bucket_add_Y = CALC_BATCH_BUCKET_ADD_Y(batch_id);//(((uint64_t) 1) << (38-6)) * ((uint64_t) batch_id);
-		if (table == 2) {
-			Tx_Bucketed_Meta2 *in;
-			if (use_direct_from_host) in = (Tx_Bucketed_Meta2 *) &host_criss_cross_blocks[host_bytes_start];
-			else in = (Tx_Bucketed_Meta2 *) &device_buffer_in[device_bytes_start];
-			Tx_Bucketed_Meta2 *local_kbc_entries = (Tx_Bucketed_Meta2 *) &device_buffer_kbc[0];
-			gpu_merge_block_buckets_into_kbc_buckets<<<numBlocks, blockSize>>>(
-					KBC_START,
-					in, batch_bucket_add_Y, num_entries_to_copy,
-					local_kbc_entries, device_local_kbc_num_entries);
-		} else if ((table == 3) || (table == 4)) {
-			Tx_Bucketed_Meta4 *in;
-			if (use_direct_from_host) in = (Tx_Bucketed_Meta4 *) &host_criss_cross_blocks[host_bytes_start];
-			else in = (Tx_Bucketed_Meta4 *) &device_buffer_in[device_bytes_start];
-			//Tx_Bucketed_Meta4 *in = (Tx_Bucketed_Meta4 *) &device_buffer_in[device_bytes_start];
-			//Tx_Bucketed_Meta4 *local_kbc_entries = (Tx_Bucketed_Meta4 *) &device_buffer_kbc[0];
-			Tx_Bucketed_Meta4_Blockposref *local_kbc_entries = (Tx_Bucketed_Meta4_Blockposref *) &device_buffer_kbc[0];
-			gpu_merge_block_buckets_into_kbc_buckets_with_blockposref<<<numBlocks, blockSize>>>(
-					KBC_START,block_id,
-					in, batch_bucket_add_Y, num_entries_to_copy,
-					local_kbc_entries, device_local_kbc_num_entries,
-					4);
-		} else if (table == 5) {
-			Tx_Bucketed_Meta3 *in;
-			if (use_direct_from_host) in = (Tx_Bucketed_Meta3 *) &host_criss_cross_blocks[host_bytes_start];
-			else in = (Tx_Bucketed_Meta3 *) &device_buffer_in[device_bytes_start];
-			//Tx_Bucketed_Meta3 *in = (Tx_Bucketed_Meta3 *) &device_buffer_in[device_bytes_start];
-			Tx_Bucketed_Meta3_Blockposref *local_kbc_entries = (Tx_Bucketed_Meta3_Blockposref *) &device_buffer_kbc[0];
-			gpu_merge_block_buckets_into_kbc_buckets_with_blockposref<<<numBlocks, blockSize>>>(
-					KBC_START,block_id,
-					in, batch_bucket_add_Y, num_entries_to_copy,
-					local_kbc_entries, device_local_kbc_num_entries,
-					3);
-		} else if (table == 6) {
-			Tx_Bucketed_Meta2 *in;
-			if (use_direct_from_host) in = (Tx_Bucketed_Meta2 *) &host_criss_cross_blocks[host_bytes_start];
-			else in = (Tx_Bucketed_Meta2 *) &device_buffer_in[device_bytes_start];
-			//Tx_Bucketed_Meta2 *in = (Tx_Bucketed_Meta2 *) &device_buffer_in[device_bytes_start];
-			Tx_Bucketed_Meta2_Blockposref *local_kbc_entries = (Tx_Bucketed_Meta2_Blockposref *) &device_buffer_kbc[0];
-			gpu_merge_block_buckets_into_kbc_buckets_with_blockposref<<<numBlocks, blockSize>>>(
-					KBC_START,block_id,
-					in, batch_bucket_add_Y, num_entries_to_copy,
-					local_kbc_entries, device_local_kbc_num_entries,
-					2);
-		}
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-		device_bytes_start += bytes_to_copy;
-		table_transfer_in_bytes += bytes_to_copy;
-	}
-	//std::cout << "\nTotal entries copied in batch " << batch_id << ": " << total_entries_copied << std::endl;
-}
-
-
-
-int mmap_fdout;
-char *mmap_address;
-void setupMMap(size_t desired_size_bytes) {
-
-	int mode = 0x0777;
-
-	std::string filename = "/mnt/kioxia/tmp/test-mmap.tmp";
-
-	std::cout << "Setting up MMap with " << desired_size_bytes << " bytes in file: " << filename << std::endl;
-
-	if ((mmap_fdout = open (filename.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode )) < 0) {
-		std::cout << "can't create " << filename << " for writing" << std::endl;
-		return;
-	}
-
-	/* go to the location corresponding to the last byte */
-	if (lseek (mmap_fdout, desired_size_bytes,
SEEK_SET) == -1) { - printf ("lseek error"); - return; - } - - /* write a dummy byte at the last location */ - if (write (mmap_fdout, "", 1) != 1) { - printf ("write error"); - return; - } - - if ((mmap_address = (char *) mmap (0, desired_size_bytes, PROT_READ | PROT_WRITE, MAP_SHARED, mmap_fdout, 0)) == (caddr_t) -1) { - printf ("mmap error for output"); - return; - } - - std::cout << "MMap done." << std::endl; -} - -inline void writeHostMemToMMap(uint32_t address, char *host_mem, uint32_t bytes_to_copy) { - //std::string filename = "/mnt/kioxia/tmp/test" + std::to_string(table) + "-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - //std::cout << "Writing to file " << filename << std::endl; - //FILE* pFile; - //pFile = fopen(filename.c_str(), "wb"); // 41228ms for block level writing, 40912ms for batch writing?? - //fwrite(host_mem, 1, bytes_to_copy, pFile); - //fclose(pFile); - memcpy(mmap_address, host_mem, bytes_to_copy); -} - - - -void convertAndWriteT2HostMemToBlockFiles( - uint16_t batch_id, uint16_t block_id, - Tx_Bucketed_Meta4 *t2_data, // will take meta[0] and meta[2] for Lx1 and Lx2 - uint32_t num_entries_to_copy) { - - if (num_entries_to_copy == 0) { - return; - } - // first convert to memory - T2BaseRef *t2_base = (T2BaseRef *) host_refdata_blocks; - for (int i=0;i HOST_MAX_BLOCK_ENTRIES) { - std::cout << "OVERFLOW: num_entries_to_copy " << num_entries_to_copy << " > HOST_MAX_BLOCK_ENTRIES " << HOST_MAX_BLOCK_ENTRIES << std::endl; - } - if (max_block_entries_copied_device_to_host < num_entries_to_copy) { - max_block_entries_copied_device_to_host = num_entries_to_copy; // helps determine HOST_MAX_BLOCK_ENTRIES value. - } - - uint64_t device_bytes_start = device_entry_start * DEVICE_ENTRY_SIZE; - size_t bytes_to_copy = num_entries_to_copy*DEVICE_ENTRY_SIZE; - if (device_bytes_start + bytes_to_copy > DEVICE_BUFFER_ALLOCATED_BYTES) { - std::cout << "ERROR: DEVICE BUFFER OVERFLOW\n size wanted: " << (device_bytes_start + bytes_to_copy) << " size available:" << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - } - if (host_bytes_start + bytes_to_copy > HOST_ALLOCATED_BYTES) { - std::cout << "ERROR: HOST MEM OVERFLOW\n size wanted: " << (host_bytes_start + bytes_to_copy) << " size available:" << HOST_ALLOCATED_BYTES << std::endl; - } - //if (doPrint) std::cout << " Copying " << num_entries_to_copy - // << " entries from device_bytes_start: " << device_bytes_start - // << " to host_bytes_start: " << host_bytes_start - // << " bytes length: " << bytes_to_copy << std::endl; - //std::cout << " Block_id: " << block_id << " device->host bytes:" << bytes_to_copy << " entries:" << num_entries_to_copy << std::endl; - - if (table < 6) { - // we only copy criss cross memory if it's not the last table, since that only exports back ref data and no forward propagation. - CUDA_CHECK_RETURN(cudaMemcpy(&host_criss_cross_blocks[host_bytes_start],&device_buffer[device_bytes_start],bytes_to_copy,cudaMemcpyDeviceToHost)); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - batch_bytes_transfered += bytes_to_copy; - } - if (doPrint) std::cout << " done.\n"; - - //if (table == 1) { - // oof...mmap is 47000ms transfer for T1 - // memcpy(mmap_address + total_transfered_bytes, &host_criss_cross_blocks[host_bytes_start], bytes_to_copy); - //writeHostMemToMMap(total_transfered_bytes, &host_criss_cross_blocks[host_bytes_start], bytes_to_copy); - //} - - - - // for T2 we dump to file, since this becomes the baseline with 4 meta entries for x's. 
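-	// (Per the conversion helper above, only meta[0] and meta[2] of each T2 entry need to survive as a
-	// T2BaseRef; everything in later tables stays reachable through back references, which is why T2 is
-	// treated as the baseline table here.)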
- /*if (table == 2) { - // 42241 ms - a wash whether we write 4 x's in T2 or use 2'xs in T1 and write a 64bit ref here. - // BUT- our goals is to get kbc's, so base level T2 can just write 2 kbc entries (only 50 bits (25 kbc * 2) but we need - // CPU to process the entry and split into the proper reference buckets at this stage. 64 batches already splits 18m kbc's - // down into 285k kbc's so should help with mem buffer. OR...we could do this in GPU and just use CPU to dumb copy, ay? - // BUUUTTT - we need enough spare memory so have to do it at end of entire first phase process. - // tables was 56..yeesh - if (doWriteT2BaseData) { - Tx_Bucketed_Meta4 *t2_data = (Tx_Bucketed_Meta4 *) &host_criss_cross_blocks[host_bytes_start]; - convertAndWriteT2HostMemToBlockFiles(batch_id, block_id, t2_data, num_entries_to_copy); - } - }*/ - if (table == 3) { - if (doWriteT3BaseData) { - uint64_t refdata_bytes_start; - size_t refdata_bytes_to_copy; - refdata_bytes_start = device_entry_start * sizeof(T3BaseRef); - refdata_bytes_to_copy = num_entries_to_copy*sizeof(T3BaseRef); - - if (refdata_bytes_start + bytes_to_copy > DEVICE_BUFFER_ALLOCATED_BYTES) { - std::cout << "ERROR: DEVICE REFDATA OVERFLOW\n size wanted: " << (refdata_bytes_start + refdata_bytes_to_copy) << " size available:" << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - } - CUDA_CHECK_RETURN(cudaMemcpy(&host_refdata_blocks[refdata_bytes_start],&device_refdata[refdata_bytes_start],refdata_bytes_to_copy,cudaMemcpyDeviceToHost)); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - // now write to files - writeT3BaseDataToBlockFiles(batch_id, block_id, - &host_refdata_blocks[refdata_bytes_start], - num_entries_to_copy, refdata_bytes_to_copy); - } - } - if (table > 3) { - // transfer back ref - if (table == 6) doWriteRefData = doWriteT6Data; - if (doWriteRefData) { - uint64_t refdata_bytes_start; - size_t refdata_bytes_to_copy; - if (table == 6) { - refdata_bytes_start = device_entry_start * sizeof(T6BackRef); - refdata_bytes_to_copy = num_entries_to_copy*sizeof(T6BackRef); - } else { - refdata_bytes_start = device_entry_start * sizeof(BackRef); - refdata_bytes_to_copy = num_entries_to_copy*sizeof(BackRef); - } - if (refdata_bytes_start + bytes_to_copy > DEVICE_BUFFER_ALLOCATED_BYTES) { - std::cout << "ERROR: DEVICE REFDATA OVERFLOW\n size wanted: " << (refdata_bytes_start + refdata_bytes_to_copy) << " size available:" << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - } - CUDA_CHECK_RETURN(cudaMemcpy(&host_refdata_blocks[refdata_bytes_start],&device_refdata[refdata_bytes_start],refdata_bytes_to_copy,cudaMemcpyDeviceToHost)); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - // now write to files - writeHostRefdataToBlockFiles(table, batch_id, block_id, &host_refdata_blocks[refdata_bytes_start], num_entries_to_copy, refdata_bytes_to_copy); - } - } - - } - //fclose(pFile); - //std::cout << "Waiting for writes to finish..."; - //for(uint8_t i=0;i>>(calc_N, chacha_input, - local_kbc_entries, local_kbc_num_entries, KBC_START, KBC_END); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - finish = std::chrono::high_resolution_clock::now(); - total_chacha_time_ms += std::chrono::duration_cast(finish - start).count(); - //std::cout << " done. 
" << std::chrono::duration_cast(finish - start).count() << " ms\n"; - - // 2) gpu find_f1_matches from (F1_Bucketed_kBC_Entry *) bufferA to (T1_Pairing_Chunk *) bufferB - std::cout << " Finding matches..."; - cudaEvent_t mstart, mstop; - float milliseconds = 0; - cudaEventCreate(&mstart); - cudaEventCreate(&mstop); - - start = std::chrono::high_resolution_clock::now(); - - Tx_Bucketed_Meta1 *bucketed_kbc_entries_in = (Tx_Bucketed_Meta1 *) device_buffer_A; - Tx_Bucketed_Meta2 *bucketed_out = (Tx_Bucketed_Meta2 *) device_buffer_B; - - CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599 - cudaEventRecord(mstart); - gpu_find_tx_matches<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(1, batch_id, KBC_START, KBC_END, - bucketed_kbc_entries_in, local_kbc_num_entries, - bucketed_out, device_block_entry_counts); - cudaEventRecord(mstop); - cudaEventSynchronize(mstop); - cudaEventElapsedTime(&milliseconds, mstart, mstop); - std::cout << "gpu_find_tx_matches time: " << milliseconds << " ms\n"; - //gpu_find_tx_matches<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(1, batch_id, KBC_START, KBC_END, - // bucketed_kbc_entries_in, local_kbc_num_entries, - // host_criss_cross_blocks, device_block_entry_counts); - - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - finish = std::chrono::high_resolution_clock::now(); - total_match_time_ms += std::chrono::duration_cast(finish - start).count(); - std::cout << " done. " << std::chrono::duration_cast(finish - start).count() << " ms\n"; - - - // 4) gpu cp (T1_Pairing_Chunk *) bufferB into (T1_Bucketed_kBC_Entry *) bufferA - total_gpu_time_ms += std::chrono::duration_cast(finish - batch_start).count(); - //std::cout << " transferBucketedBlocksFromDeviceToHost\n"; - start = std::chrono::high_resolution_clock::now(); - transferBucketedBlocksFromDeviceToHost(1, batch_id, device_buffer_B, sizeof(Tx_Bucketed_Meta2), NULL, device_block_entry_counts); - finish = std::chrono::high_resolution_clock::now(); - table_transfer_out_time_ms += std::chrono::duration_cast(finish - start).count(); - //std::cout << " done. 
" << std::chrono::duration_cast(finish - start).count() << " ms\n"; -} - -void doTxBatch(uint16_t table, uint32_t batch_id) { - // 1) host to device transfer -> bufferB = (T1_Bucketed_kBC_Entry *) bufferB - // 2) gpu find_f1_matches from (T1_Bucketed_kBC_Entry *) bufferB to (T2_Pairing_Chunk *) bufferA - // 3) gpu exclusive scan kbc_counts to get kbc_memory_positions by blocks, and kbc_block_counts - // 4) gpu cp (T2_Pairing_Chunk *) bufferB into (T2_Bucketed_kBC_Entry *) bufferA - // 5) device to host transfer bufferA - auto batch_start = std::chrono::high_resolution_clock::now(); - auto start = std::chrono::high_resolution_clock::now(); - auto finish = std::chrono::high_resolution_clock::now(); - - size_t transfer_in_size = 0; - size_t transfer_out_size = 0; - if (table == 2) { - transfer_in_size = sizeof(Tx_Bucketed_Meta2); - transfer_out_size = sizeof(Tx_Bucketed_Meta4); - } - else if (table == 3) { - transfer_in_size = sizeof(Tx_Bucketed_Meta4); - transfer_out_size = sizeof(Tx_Bucketed_Meta4); - } - else if (table == 4) { - transfer_in_size = sizeof(Tx_Bucketed_Meta4); - transfer_out_size = sizeof(Tx_Bucketed_Meta3); - } - else if (table == 5) { - transfer_in_size = sizeof(Tx_Bucketed_Meta3); - transfer_out_size = sizeof(Tx_Bucketed_Meta2); - } - else if (table == 6) { - transfer_in_size = sizeof(Tx_Bucketed_Meta2); - transfer_out_size = 0; - // TODO: T6 could transfer to hostmem or to the backref blocks table - // since we will then read from backref blocks tables for all backrefs across tables. - } - - start = std::chrono::high_resolution_clock::now(); - transferBlocksFromHostToDevice(table, batch_id, device_buffer_B, device_buffer_A, transfer_in_size); - finish = std::chrono::high_resolution_clock::now(); - table_transfer_in_time_ms += std::chrono::duration_cast(finish - start).count(); - - //gpu_print_kbc_counts<<<1,1>>>(device_local_kbc_num_entries); - - - // 2) gpu find_f1_matches from (F1_Bucketed_kBC_Entry *) bufferA to (T1_Pairing_Chunk *) bufferB - //std::cout << " Finding matches..."; - start = std::chrono::high_resolution_clock::now(); - - //if (batch_id == 0) { - // gpu_print_kbc_bucket_contents<<<1,1>>>(bucketed_kbc_entries_in, device_local_kbc_num_entries); - //} - - const uint32_t KBC_START = MIN_KBC_BUCKET_FOR_BATCH(batch_id); - const uint32_t next_batch = batch_id + 1; - const uint32_t KBC_END = MIN_KBC_BUCKET_FOR_BATCH(next_batch); - - CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts, 0, (BATCHES)*sizeof(int))); - if (table == 2) { - Tx_Bucketed_Meta2 *bucketed_kbc_entries_in = (Tx_Bucketed_Meta2 *) device_buffer_A; - Tx_Bucketed_Meta4 *bucketed_out = (Tx_Bucketed_Meta4 *) device_buffer_B; - gpu_find_tx_matches<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(table, batch_id, KBC_START, KBC_END, - bucketed_kbc_entries_in, device_local_kbc_num_entries, - bucketed_out, device_block_entry_counts); - } else if (table == 3) { - // at table 3 we start pulling in backref to table 2 - //Tx_Bucketed_Meta4 *bucketed_kbc_entries_in = (Tx_Bucketed_Meta4 *) device_buffer_A; - Tx_Bucketed_Meta4_Blockposref *bucketed_kbc_entries_in = (Tx_Bucketed_Meta4_Blockposref *) device_buffer_A; - Tx_Bucketed_Meta4 *bucketed_out = (Tx_Bucketed_Meta4 *) device_buffer_B; - gpu_find_tx_matches_with_backref<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(table, batch_id, KBC_START, KBC_END, - bucketed_kbc_entries_in, device_local_kbc_num_entries, - bucketed_out, device_buffer_refdata, device_block_entry_counts); - } else if (table == 4) { - //Tx_Bucketed_Meta4 *bucketed_kbc_entries_in = 
-		Tx_Bucketed_Meta4_Blockposref *bucketed_kbc_entries_in = (Tx_Bucketed_Meta4_Blockposref *) device_buffer_A;
-		Tx_Bucketed_Meta3 *bucketed_out = (Tx_Bucketed_Meta3 *) device_buffer_B;
-		gpu_find_tx_matches_with_backref<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(table, batch_id, KBC_START, KBC_END,
-				bucketed_kbc_entries_in, device_local_kbc_num_entries,
-				bucketed_out, device_buffer_refdata, device_block_entry_counts);
-	} else if (table == 5) {
-		//Tx_Bucketed_Meta3 *bucketed_kbc_entries_in = (Tx_Bucketed_Meta3 *) device_buffer_A;
-		Tx_Bucketed_Meta3_Blockposref *bucketed_kbc_entries_in = (Tx_Bucketed_Meta3_Blockposref *) device_buffer_A;
-		Tx_Bucketed_Meta2 *bucketed_out = (Tx_Bucketed_Meta2 *) device_buffer_B;
-		gpu_find_tx_matches_with_backref<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(table, batch_id, KBC_START, KBC_END,
-				bucketed_kbc_entries_in, device_local_kbc_num_entries,
-				bucketed_out, device_buffer_refdata, device_block_entry_counts);
-	} else if (table == 6) {
-		//Tx_Bucketed_Meta2 *bucketed_kbc_entries_in = (Tx_Bucketed_Meta2 *) device_buffer_A;
-		Tx_Bucketed_Meta2_Blockposref *bucketed_kbc_entries_in = (Tx_Bucketed_Meta2_Blockposref *) device_buffer_A;
-		Tx_Bucketed_Meta2 *NOT_USED = (Tx_Bucketed_Meta2 *) device_buffer_B;
-		gpu_find_tx_matches_with_backref<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(table, batch_id, KBC_START, KBC_END,
-				bucketed_kbc_entries_in, device_local_kbc_num_entries,
-				NOT_USED, device_buffer_refdata, device_block_entry_counts);
-	}
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	finish = std::chrono::high_resolution_clock::now();
-	table_match_time_ms += std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count();
-	//std::cout << "   done. " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count() << " ms\n";
-
-
-	// 4) gpu cp (T1_Pairing_Chunk *) bufferB into (T1_Bucketed_kBC_Entry *) bufferA
-	//if (table < 6) {
-		//std::cout << "   transferBucketedBlocksFromDeviceToHost\n";
-		start = std::chrono::high_resolution_clock::now();
-		transferBucketedBlocksFromDeviceToHost(table, batch_id, device_buffer_B, transfer_out_size, device_buffer_refdata, device_block_entry_counts);
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		finish = std::chrono::high_resolution_clock::now();
-
-		table_transfer_out_time_ms += std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count();
-
-		//std::cout << "   done. " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count() << " ms\n";
-	//} else if (table == 6) {
-		// TODO: handle final T6 file...maybe this can write into hostmem instead of to file.
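-		// (For table 6, transfer_out_size was set to 0 above: no forward-propagated entries survive, only the
-		// T6BackRef data routed through device_buffer_refdata, which the later pruning/compression phases walk
-		// backwards from.)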
-	//}
-
-}
-
-void doT1() {
-
-	std::cout << "doT1 BATCHES:" << BATCHES << std::endl;
-
-	auto total_start = std::chrono::high_resolution_clock::now();
-	auto finish = std::chrono::high_resolution_clock::now(); // just to allocate
-
-	// what's faster, 0.4% of kbc's, or 0.63% of xs'
-	for (uint32_t batch_id = 0; batch_id < BATCHES; batch_id++) {
-
-		uint32_t KBC_START = MIN_KBC_BUCKET_FOR_BATCH(batch_id);
-		uint32_t KBC_END = MIN_KBC_BUCKET_FOR_BATCH(batch_id+1)-1;
-
-		auto batch_start = std::chrono::high_resolution_clock::now();
-		//if (batch_id < 2)
-		doT1Batch(batch_id, device_local_kbc_num_entries, KBC_START, KBC_END);
-		finish = std::chrono::high_resolution_clock::now();
-		//std::cout << "   ** T1 batch " << batch_id << " finished ** " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - batch_start).count() << " ms\n";
-	}
-
-	finish = std::chrono::high_resolution_clock::now();
-	std::cout << "*********************" << std::endl;
-	std::cout << "T1 Total time: " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - total_start).count() << " ms\n";
-	std::cout << "        gpu time: " << total_gpu_time_ms << " ms\n";
-	std::cout << "          chacha: " << total_chacha_time_ms << " ms\n";
-	std::cout << "           match: " << total_match_time_ms << " ms\n";
-	std::cout << "      ---------- " << std::endl;
-	std::cout << "transfer time: " << table_transfer_out_time_ms << " ms\n";
-	std::cout << "        bytes: " << table_transfer_out_bytes << " (" << (table_transfer_out_bytes/(1024*1024*1024)) << "GB)\n";
-	std::cout << "*********************" << std::endl;
-
-	total_transfer_in_time_ms += table_transfer_in_time_ms;
-	total_transfer_out_time_ms += table_transfer_out_time_ms;
-	total_transfer_in_bytes += table_transfer_in_bytes;
-	total_transfer_out_bytes += table_transfer_out_bytes;
-}
-
-void doTx(uint16_t table) {
-	std::cout << "do Table " << table <<" BATCHES:" << BATCHES << std::endl;
-
-	auto total_start = std::chrono::high_resolution_clock::now();
-	auto finish = std::chrono::high_resolution_clock::now(); // just to allocate
-
-	table_match_time_ms = 0;
-	table_transfer_in_time_ms = 0;
-	table_transfer_out_time_ms = 0;
-	table_transfer_in_bytes = 0;
-	table_transfer_out_bytes = 0;
-
-	for (uint32_t batch_id = 0; batch_id < BATCHES; batch_id++) {
-		auto batch_start = std::chrono::high_resolution_clock::now();
-		doTxBatch(table, batch_id);
-		finish = std::chrono::high_resolution_clock::now();
-		//std::cout << "   ** T" << table << " batch " << batch_id << " finished ** " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - batch_start).count() << " ms\n";
-	}
-
-	finish = std::chrono::high_resolution_clock::now();
-	std::cout << "*********************" << std::endl;
-	std::cout << "T" << table << " time: " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - total_start).count() << " ms\n";
-	std::cout << "           match: " << table_match_time_ms << " ms\n";
-	std::cout << "      ---------- " << std::endl;
-	std::cout << "transfer in time: " << table_transfer_in_time_ms << " ms\n";
-	std::cout << "        in bytes: " << table_transfer_in_bytes << " (" << (table_transfer_in_bytes/(1024*1024*1024)) << "GB)\n";
-	std::cout << "transfer out time: " << table_transfer_out_time_ms << " ms\n";
-	std::cout << "        out bytes: " << table_transfer_out_bytes << " (" << (table_transfer_out_bytes/(1024*1024*1024)) << "GB)\n";
-	std::cout << "*********************" << std::endl;
-	total_match_time_ms += table_match_time_ms;
-	total_transfer_in_time_ms += table_transfer_in_time_ms;
-	total_transfer_out_time_ms += table_transfer_out_time_ms;
-	total_transfer_in_bytes += table_transfer_in_bytes;
total_transfer_out_bytes += table_transfer_out_bytes; -} - - - - -void setupMemory() { - - //setupMMap(HOST_ALLOCATED_BYTES); // potentially useful if going to do random reads/writes to stored data - - std::cout << " device_block_entry_counts (" << BATCHES << "): " << BATCHES << " size:" << (sizeof(int)*BATCHES) << std::endl; - CUDA_CHECK_RETURN(cudaMallocManaged(&device_block_entry_counts, BATCHES*sizeof(int))); - - std::cout << " device_local_kbc_num_entries " << KBC_LOCAL_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_LOCAL_NUM_BUCKETS) << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries, KBC_LOCAL_NUM_BUCKETS*sizeof(int))); - - //Tx_Pairing_Chunk_Meta4 *device_buffer_A; - std::cout << " device_buffer_A " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << DEVICE_BUFFER_UNIT_BYTES << ") = " << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_A, DEVICE_BUFFER_ALLOCATED_BYTES)); - - //Tx_Pairing_Chunk_Meta4 *device_buffer_B; - std::cout << " device_buffer_B " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << DEVICE_BUFFER_UNIT_BYTES << ") = " << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_B, DEVICE_BUFFER_ALLOCATED_BYTES)); - - - std::cout << " device_buffer_C " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << DEVICE_BUFFER_UNIT_BYTES << ") = " << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_C, DEVICE_BUFFER_ALLOCATED_BYTES)); - - std::cout << " device_buffer_refdata " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << BACKREF_UNIT_BYTES << ") = " << BACKREF_ALLOCATED_BYTES << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_refdata, BACKREF_ALLOCATED_BYTES)); - - std::cout << " HOST host_refdata_blocks ENTRIES: " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " ALLOCATED ENTRIES: " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " UNIT BYTES: " << BACKREF_UNIT_BYTES << " = " << (BACKREF_ALLOCATED_BYTES) << std::endl; - CUDA_CHECK_RETURN(cudaMallocHost((void**)&host_refdata_blocks, BACKREF_ALLOCATED_BYTES)); // = new F2_Result_Pair[HOST_F2_RESULTS_SPACE](); - - std::cout << " HOST host_criss_cross_blocks MAX_ENTRIES: " << HOST_MAX_BLOCK_ENTRIES << " ALLOCATED ENTRIES: " << HOST_ALLOCATED_ENTRIES << " UNIT BYTES: " << HOST_UNIT_BYTES << " = " << (HOST_ALLOCATED_BYTES) << std::endl; - CUDA_CHECK_RETURN(cudaMallocHost((void**)&host_criss_cross_blocks, HOST_ALLOCATED_BYTES)); // = new F2_Result_Pair[HOST_F2_RESULTS_SPACE](); -} - - - -void freeMemory() { - std::cout << "Freeing memory..." << std::endl; - CUDA_CHECK_RETURN(cudaFree(device_buffer_A)); - CUDA_CHECK_RETURN(cudaFree(device_buffer_B)); - CUDA_CHECK_RETURN(cudaFree(device_buffer_C)); - - //CUDA_CHECK_RETURN(cudaFree(device_block_entry_counts)); - CUDA_CHECK_RETURN(cudaFree(device_local_kbc_num_entries)); - CUDA_CHECK_RETURN(cudaFreeHost(host_criss_cross_blocks)); - std::cout << " memory freed." << std::endl; -} - - - - -void doPhase3Compression() { - // our phase 3 compression then needs to take all pruned batches for T2, and write blocks of kbc's compressed with ANS. 
-	// it also needs to take T6_Backref table, load all into memory, and sort by y, and put into blocks with new backref into criss cross back ref to table 2 kbc sets.
-}
-
-#include "k29_plotter.hpp"
-
-int main(int argc, char *argv[])
-{
-	std::cout << "DrPlotter v0.1d" << std::endl;
-	chacha_setup();
-
-	cmd_read = 0;
-
-	if (cmd_read == 2) {
-		//attack_it();
-		doPhase2Pruning();
-		exit(EXIT_SUCCESS);
-	}
-	if (cmd_read == 3) {
-		do_k29();
-		exit(EXIT_SUCCESS);
-	}
-
-	doWriteT2BaseData = false;
-	doWriteT3BaseData = false;
-	doWriteRefData = false;
-	doWriteT6Data = false;
-	setupMemory();
-
-	auto total_start = std::chrono::high_resolution_clock::now();
-	doT1();
-	doTx(2);
-	doTx(3);
-	doTx(4);
-	doTx(5);
-	doTx(6);
-	auto total_end = std::chrono::high_resolution_clock::now();
-	std::cout << "*********************" << std::endl;
-	std::cout << "Total tables time: " << std::chrono::duration_cast<std::chrono::milliseconds>(total_end - total_start).count() << " ms\n";
-	std::cout << "           match: " << total_match_time_ms << " ms\n";
-	std::cout << "      ---------- " << std::endl;
-	std::cout << "transfer in time: " << total_transfer_in_time_ms << " ms\n";
-	std::cout << "        bytes: " << total_transfer_in_bytes << " (" << (total_transfer_in_bytes/(1024*1024*1024)) << "GB)\n";
-	std::cout << "transfer out time: " << total_transfer_out_time_ms << " ms\n";
-	std::cout << "        bytes: " << total_transfer_out_bytes << " (" << (total_transfer_out_bytes/(1024*1024*1024)) << "GB)\n";
-	std::cout << "*********************" << std::endl;
-	std::cout << "Max block entries used: " << max_block_entries_copied_device_to_host << " VS HOST_MAX_BLOCK_ENTRIES:" << HOST_MAX_BLOCK_ENTRIES << std::endl;
-	std::cout << " freeing memory...";
-	freeMemory();
-	std::cout << "end." << std::endl;
-	exit(EXIT_SUCCESS);
-}
diff --git a/k29_plotter.hpp b/k29_plotter.hpp
deleted file mode 100644
index d760a3b..0000000
--- a/k29_plotter.hpp
+++ /dev/null
@@ -1,972 +0,0 @@
-/*
- * k29_plotter.hpp
- *
- *  Created on: Mar 25, 2022
- *      Author: nick
- */
-
-#ifndef K29_PLOTTER_HPP_
-#define K29_PLOTTER_HPP_
-
-#include
-#include
-#include
-
-const uint32_t kXX_BITS = 29;
-
-const uint64_t k29_DEVICE_BUFFER_A_BYTES = 8589934592; // 8GB total buffer
-const uint32_t k29_MAX_X_VALUE = 1 << kXX_BITS;
-const uint64_t k29_MAX_Y_VALUE = 4294967296; // hack, set to 32 bit value of chacha
-
-const uint32_t k29_CHACHA_SPLIT_BUCKETS = 1024; // after 10 starts dropping
-const uint64_t k29_CHACHA_SPLIT_BUCKET_DIVISOR = k29_MAX_Y_VALUE / (k29_CHACHA_SPLIT_BUCKETS);
-const uint64_t k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET = 2 * k29_MAX_X_VALUE / k29_CHACHA_SPLIT_BUCKETS;
-
-uint *xchachas_bucket_counts;
-uint *global_kbc_counts;
-
-const uint32_t k29_BATCHES = 1;
-const uint32_t k29_BC_NUM_BUCKETS = 568381;//1136761;//2273523;
-const uint64_t k29_BC_BUCKET_DIVISOR = k29_MAX_Y_VALUE / k29_BC_NUM_BUCKETS;
-const uint32_t k29_BC_LAST_BUCKET_ID = 1136761 - 1;//2273522;
-const uint32_t k29_BCS_PER_BATCH = (k29_BC_NUM_BUCKETS / BATCHES)+1;
-const uint32_t k29_BC_LOCAL_NUM_BUCKETS = k29_BCS_PER_BATCH + 1; // +1 is for including last R bucket space
-
-const uint64_t k29_DEVICE_BUFFER_UNIT_BYTES = 32; // Tx_pairing_chunk_meta4 is 24 bytes, w/ backref is 32 bytes
-const uint64_t k29_DEVICE_BUFFER_ALLOCATED_ENTRIES = KBC_LOCAL_NUM_BUCKETS * KBC_MAX_ENTRIES_PER_BUCKET; // HOST_MAX_BLOCK_ENTRIES * BATCHES;// DEVICE_BUFFER_ALLOCATED_ENTRIES = 120 * ((uint64_t) 1 << 32) / (100*BATCHES);
-const uint64_t k29_DEVICE_BUFFER_ALLOCATED_BYTES = DEVICE_BUFFER_ALLOCATED_ENTRIES * DEVICE_BUFFER_UNIT_BYTES;
DEVICE_BUFFER_UNIT_BYTES; - - -#define ATTACK_CHACHAS_k29_YS_ONLY(datax_slot) \ -{ \ - int x_value = pos + datax_slot; \ - chacha_ys[x_value] = datax[datax_slot]; \ - chacha_xs[x_value] = x_value; \ -} - -#define ATTACK_CHACHAS_k29_TO_KBC(datax_slot) \ -{ \ - uint32_t x_value = pos + datax_slot; \ - uint32_t chacha_y = datax[datax_slot]; \ - uint32_t Ly = chacha_y; \ - uint32_t bucket_id = Ly / k29_BC_BUCKET_DIVISOR; \ - xchacha_pair pair = { x_value, chacha_y }; \ - int slot = atomicAdd(&xchachas_bucket_counts[bucket_id],1); \ - if (slot > k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET) printf("Overflow k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET %u SLOT %u\n", k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET, slot); \ - else { \ - xchachas_buckets[KBC_MAX_ENTRIES_PER_BUCKET * bucket_id + slot] = pair; \ - } \ -} - -#define ATTACK_CHACHAS_k29_BUCKETADD(datax_slot) \ -{ \ - uint32_t chacha_y = datax[datax_slot]; \ - uint32_t Ly = chacha_y; \ - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; \ - int slot = atomicAdd(&shared_counts[bucket_id],1); \ -} - -#define ATTACK_CHACHAS_k29_SORTEDADD(datax_slot) \ -{ \ - uint32_t x_value = pos + datax_slot; \ - uint32_t chacha_y = datax[datax_slot]; \ - uint32_t Ly = chacha_y; \ - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; \ - int slot = shared_counts_offsets[bucket_id] + atomicAdd(&shared_counts[bucket_id],1); \ - shared_sorted_xs[slot] = x_value; shared_sorted_chachas[slot] = chacha_y; \ -} - -#define ATTACK_CHACHAS_k29_SORTEDADD_FILTERED(datax_slot) \ -{ \ - uint32_t x_value = pos + datax_slot; \ - uint32_t chacha_y = datax[datax_slot]; \ - uint32_t Ly = chacha_y; \ - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; \ - if ((bucket_id >= filter_min) && (bucket_id < filter_max)) { \ - xchacha_pair pair = { x_value, chacha_y }; \ - int slot = shared_counts_offsets[bucket_id] + atomicAdd(&shared_counts[bucket_id],1); \ - shared_sorted_xchachas[slot] = pair; \ - } \ -} - -#define ATTACK_CHACHAS_k29_BUCKETSET(datax_slot) \ -{ \ - uint32_t x_value = pos + datax_slot; \ - uint32_t chacha_y = datax[datax_slot]; \ - uint32_t Ly = chacha_y; \ - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; \ - xchacha_pair pair = { x_value, chacha_y }; \ - int slot = global_counts[bucket_id] + atomicAdd(&shared_counts[bucket_id],1); \ - if (slot > k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET) printf("Overflow k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET %u SLOT %u\n", k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET, slot); \ - else { \ - xchachas_buckets[k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET * bucket_id + slot] = pair; \ - } \ -} - -__global__ -void gpu_chacha8_k29_bucketadd(const uint32_t N, - const __restrict__ uint32_t *input, xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - __shared__ uint shared_counts[k29_CHACHA_SPLIT_BUCKETS]; - __shared__ uint global_counts[k29_CHACHA_SPLIT_BUCKETS]; - - - - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - uint32_t base_group = blockIdx.x * blockDim.x; - //uint32_t base_x = base_group * 16; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 16; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - if (x_group < end_n) { - - for (int i=threadIdx.x;i> 32; - datax[14] = input[14];datax[15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], 
datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - //uint64_t y = datax[0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[0] >> 22; // gives bucket id 0..1023 - ATTACK_CHACHAS_k29_BUCKETADD(0);ATTACK_CHACHAS_k29_BUCKETADD(1);ATTACK_CHACHAS_k29_BUCKETADD(2);ATTACK_CHACHAS_k29_BUCKETADD(3); - ATTACK_CHACHAS_k29_BUCKETADD(4);ATTACK_CHACHAS_k29_BUCKETADD(5);ATTACK_CHACHAS_k29_BUCKETADD(6);ATTACK_CHACHAS_k29_BUCKETADD(7); - ATTACK_CHACHAS_k29_BUCKETADD(8);ATTACK_CHACHAS_k29_BUCKETADD(9);ATTACK_CHACHAS_k29_BUCKETADD(10);ATTACK_CHACHAS_k29_BUCKETADD(11); - ATTACK_CHACHAS_k29_BUCKETADD(12);ATTACK_CHACHAS_k29_BUCKETADD(13);ATTACK_CHACHAS_k29_BUCKETADD(14);ATTACK_CHACHAS_k29_BUCKETADD(15); - - __syncthreads(); - for (int i=threadIdx.x;i> 32; - datax[14] = input[14];datax[15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - ATTACK_CHACHAS_k29_BUCKETSET(0);ATTACK_CHACHAS_k29_BUCKETSET(1);ATTACK_CHACHAS_k29_BUCKETSET(2);ATTACK_CHACHAS_k29_BUCKETSET(3); - 
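/*
 * Context for the BUCKETSET runs in this kernel: they are the scatter half of a
 * two-pass scheme -- BUCKETADD tallies per-bucket counts in shared memory, one
 * atomicAdd per bucket then reserves a contiguous slot range in the global
 * counters, and the chachas are recomputed and written at base + local slot.
 * The same pattern as a standalone kernel (hypothetical name; reads ys from
 * global instead of recomputing them; sketch only):
 *
 * __global__ void count_then_scatter(const uint32_t N, const uint32_t *ys,
 *         xchacha_pair *out, uint *bucket_counts)
 * {
 *     __shared__ uint shared_counts[k29_CHACHA_SPLIT_BUCKETS];
 *     __shared__ uint global_base[k29_CHACHA_SPLIT_BUCKETS];
 *     for (int b = threadIdx.x; b < k29_CHACHA_SPLIT_BUCKETS; b += blockDim.x)
 *         shared_counts[b] = 0;
 *     __syncthreads();
 *     uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
 *     uint32_t bucket = (i < N) ? (uint32_t)(ys[i] / k29_CHACHA_SPLIT_BUCKET_DIVISOR) : 0;
 *     if (i < N) atomicAdd(&shared_counts[bucket], 1);                   // pass 1: count
 *     __syncthreads();
 *     for (int b = threadIdx.x; b < k29_CHACHA_SPLIT_BUCKETS; b += blockDim.x) {
 *         global_base[b] = atomicAdd(&bucket_counts[b], shared_counts[b]); // reserve range
 *         shared_counts[b] = 0;                                          // reuse as cursor
 *     }
 *     __syncthreads();
 *     if (i < N) {                                                       // pass 2: scatter
 *         uint slot = global_base[bucket] + atomicAdd(&shared_counts[bucket], 1);
 *         xchacha_pair pair = { i, ys[i] };
 *         out[k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET * bucket + slot] = pair;
 *     }
 * }
 */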
ATTACK_CHACHAS_k29_BUCKETSET(4);ATTACK_CHACHAS_k29_BUCKETSET(5);ATTACK_CHACHAS_k29_BUCKETSET(6);ATTACK_CHACHAS_k29_BUCKETSET(7); - ATTACK_CHACHAS_k29_BUCKETSET(8);ATTACK_CHACHAS_k29_BUCKETSET(9);ATTACK_CHACHAS_k29_BUCKETSET(10);ATTACK_CHACHAS_k29_BUCKETSET(11); - ATTACK_CHACHAS_k29_BUCKETSET(12);ATTACK_CHACHAS_k29_BUCKETSET(13);ATTACK_CHACHAS_k29_BUCKETSET(14);ATTACK_CHACHAS_k29_BUCKETSET(15); - - } -} - -// we do computes and tally up number in each bucket -// if number in a bucket exceeds the 128 bytes (i.e. 128/8 bytes = 16) -// and we have at least 2 buckets with said bytes, then write those out to global. -__global__ -void gpu_chacha8_k29_threshold_counters(const uint32_t N, - const __restrict__ uint32_t *input, xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - __shared__ uint shared_counts[k29_CHACHA_SPLIT_BUCKETS]; - __shared__ uint global_counts[k29_CHACHA_SPLIT_BUCKETS]; - - - - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - uint32_t base_group = blockIdx.x * blockDim.x; - //uint32_t base_x = base_group * 16; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 16; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - if (x_group < end_n) { - - for (int i=threadIdx.x;i> 32; - datax[14] = input[14];datax[15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - //uint64_t y = datax[0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[0] >> 22; // gives bucket id 0..1023 - ATTACK_CHACHAS_k29_BUCKETADD(0);ATTACK_CHACHAS_k29_BUCKETADD(1);ATTACK_CHACHAS_k29_BUCKETADD(2);ATTACK_CHACHAS_k29_BUCKETADD(3); - ATTACK_CHACHAS_k29_BUCKETADD(4);ATTACK_CHACHAS_k29_BUCKETADD(5);ATTACK_CHACHAS_k29_BUCKETADD(6);ATTACK_CHACHAS_k29_BUCKETADD(7); - ATTACK_CHACHAS_k29_BUCKETADD(8);ATTACK_CHACHAS_k29_BUCKETADD(9);ATTACK_CHACHAS_k29_BUCKETADD(10);ATTACK_CHACHAS_k29_BUCKETADD(11); - ATTACK_CHACHAS_k29_BUCKETADD(12);ATTACK_CHACHAS_k29_BUCKETADD(13);ATTACK_CHACHAS_k29_BUCKETADD(14);ATTACK_CHACHAS_k29_BUCKETADD(15); - - __syncthreads(); - for (int i=threadIdx.x;i> 32; - datax[14] = 
input[14];datax[15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - ATTACK_CHACHAS_k29_BUCKETSET(0);ATTACK_CHACHAS_k29_BUCKETSET(1);ATTACK_CHACHAS_k29_BUCKETSET(2);ATTACK_CHACHAS_k29_BUCKETSET(3); - ATTACK_CHACHAS_k29_BUCKETSET(4);ATTACK_CHACHAS_k29_BUCKETSET(5);ATTACK_CHACHAS_k29_BUCKETSET(6);ATTACK_CHACHAS_k29_BUCKETSET(7); - ATTACK_CHACHAS_k29_BUCKETSET(8);ATTACK_CHACHAS_k29_BUCKETSET(9);ATTACK_CHACHAS_k29_BUCKETSET(10);ATTACK_CHACHAS_k29_BUCKETSET(11); - ATTACK_CHACHAS_k29_BUCKETSET(12);ATTACK_CHACHAS_k29_BUCKETSET(13);ATTACK_CHACHAS_k29_BUCKETSET(14);ATTACK_CHACHAS_k29_BUCKETSET(15); - - } -} - -__global__ -void gpu_chacha8_k29_bucketadd_256threads_warp_buckets(const uint32_t N, - const __restrict__ uint32_t *input, xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - __shared__ uint32_t warp_bucket_ys[32]; - __shared__ uint32_t warp_bucket_xs[32]; - __shared__ int warp_bucket_counts[256/32]; // 8 different sets of warp buckets - // idea here is to process with warps - int warp_id = threadIdx.x % 32; - uint32_t chacha_result = 23; // computed... - if ((chacha_result % 32) == warp_id) { - // add result to our bucket - int count = atomicAdd(&warp_bucket_counts[warp_id],1); - if (count == 16) { - // 8 * (4x2) = 128 bytes, full bandwidth write - } - } - // 256 threads, one bucket add at a time = 256 entries each loop. - // we need 128 bytes to make a full bandwidth global write - // = 128/8 = 16 entries from a bucket. - -} - -__global__ -void gpu_chacha8_k29_bucketadd_256threads_upto1024buckets(const uint32_t N, - // 1024 buckets = 176GB/s (240GB/s possible), 512 buckets = 276GB/s, 256 buckets = 293GB/s, 8 buckets = 337GB/s - // note we lose ~1ms on innefficient prefix sums so this can improve +20% for 1024 buckets - // with only shared counters we get 400GB/s, so this does take significant time and could be optimized - // against having bank conflicts for instance. - // possibly by doing 32 passes(!) where each thread focuses on it's own bank for shared memory. Yikes. 
- const __restrict__ uint32_t *input, xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - __shared__ int shared_counts[k29_CHACHA_SPLIT_BUCKETS]; - __shared__ int global_counts[k29_CHACHA_SPLIT_BUCKETS]; - __shared__ int shared_counts_offsets[k29_CHACHA_SPLIT_BUCKETS]; - // a 256 thread x 16 pass gives 4096 values total. - // for 1024 buckets that's only 4 values average per bucket. We want to write 128 bytes = 128/8 = 16 entries minimum. - // so want minimum multiple of 4 so we average 16 entries - // our shared space only allows 32k / 8 = 4096 entries - // 1024 buckets = 176GB/s - // 512 buckets = 276GB/s - // 256 buckets = 293GB/s - // 8 buckets = 337GB/s - - //__shared__ xchacha_pair shared_sorted_xchachas[4096];// 32k - __shared__ uint32_t shared_sorted_xs[4096];// 16k <- tried to resolve bank conflicts but didn't do much - __shared__ uint32_t shared_sorted_chachas[4096];// 16k - - if (blockDim.x != 256) printf("ERROR BLOCKDIM MUST BE 256\n"); - if (k29_CHACHA_SPLIT_BUCKETS > 1024) printf("ERROR SPLIT BUCKETS MUST BE <1024\n"); - - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - uint32_t base_group = blockIdx.x * blockDim.x; - //uint32_t base_x = base_group * 16; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / (16); // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - if (x_group < end_n) { - - for (int i=threadIdx.x;i> 32; - datax[14] = input[14];datax[15] = input[15]; - - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - //uint64_t y = datax[0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[0] >> 22; // gives bucket id 0..1023 - ATTACK_CHACHAS_k29_BUCKETADD(0);ATTACK_CHACHAS_k29_BUCKETADD(1);ATTACK_CHACHAS_k29_BUCKETADD(2);ATTACK_CHACHAS_k29_BUCKETADD(3); - ATTACK_CHACHAS_k29_BUCKETADD(4);ATTACK_CHACHAS_k29_BUCKETADD(5);ATTACK_CHACHAS_k29_BUCKETADD(6);ATTACK_CHACHAS_k29_BUCKETADD(7); - ATTACK_CHACHAS_k29_BUCKETADD(8);ATTACK_CHACHAS_k29_BUCKETADD(9);ATTACK_CHACHAS_k29_BUCKETADD(10);ATTACK_CHACHAS_k29_BUCKETADD(11); - 
ATTACK_CHACHAS_k29_BUCKETADD(12);ATTACK_CHACHAS_k29_BUCKETADD(13);ATTACK_CHACHAS_k29_BUCKETADD(14);ATTACK_CHACHAS_k29_BUCKETADD(15); - - __syncthreads(); - - /* - * 1.6 - 2.06746 ms with only bucket adds and shared to global counts - * 2.43 - 2.64 with our single thread prefix sum = +0.8 to 0.64 - * then 5.94 total after writing out. = 180GB/s but minus 0.64 = 12% faster which is 200GB/s - */ - if (threadIdx.x == 0) { - // yes this can be sped up, it adds 1.6ms/multiple - i.e. mult = 1 = +1.6ms, 2 = +0.8ms etc. - shared_counts_offsets[0] = 0; - //int min = shared_counts[0]; int max = shared_counts[0]; int num_above_16 = 0; - for (int i=1;i shared_counts[i]) min = shared_counts[i]; - //if (max < shared_counts[i]) max = shared_counts[i]; - //if (shared_counts[i] >= 16) num_above_16++; - //printf(" %i ", shared_counts[i]); - shared_counts_offsets[i] = shared_counts[i-1] + shared_counts_offsets[i-1]; - } - //printf("min: %i max: %i above16: %i\n", min, max,num_above_16); - } - __syncthreads(); - - - /*if ((base_group == 0) && (threadIdx.x == 0)) { - printf("base group %u : ",base_group); - for (int i=0;i<1024;i++) printf("%u ",shared_counts[i]); - printf("\n"); - for (int i=0;i<1024;i++) printf("%u ",shared_counts_offsets[i]); - printf("\n"); - } - __syncthreads();*/ - - for (int i=threadIdx.x;i> 32; - datax[14] = input[14];datax[15] = input[15]; - - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - //uint64_t y = datax[0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[0] >> 22; // gives bucket id 0..1023 - // makes it 3.61 from 2.41 so yes did add a lot. 
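/*
 * The serial prefix sum a few lines up (single thread, +1.6ms per multiple as the
 * comments note) can be parallelized. A sketch under the assumptions this kernel
 * already enforces (blockDim.x == 256, k29_CHACHA_SPLIT_BUCKETS a power of two
 * >= 256); illustration only, not the code behind the quoted timings:
 *
 * const int T = 256;                               // threads per block
 * const int C = k29_CHACHA_SPLIT_BUCKETS / T;      // buckets per thread
 * __shared__ int partial[T];
 * int t = threadIdx.x, sum = 0;
 * for (int j = 0; j < C; j++) sum += shared_counts[t * C + j];
 * partial[t] = sum;
 * __syncthreads();
 * for (int offset = 1; offset < T; offset <<= 1) { // Hillis-Steele inclusive scan
 *     int v = (t >= offset) ? partial[t - offset] : 0;
 *     __syncthreads();
 *     partial[t] += v;
 *     __syncthreads();
 * }
 * int base = (t == 0) ? 0 : partial[t - 1];        // exclusive base of this chunk
 * for (int j = 0; j < C; j++) {
 *     shared_counts_offsets[t * C + j] = base;
 *     base += shared_counts[t * C + j];
 * }
 * __syncthreads();
 */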
- ATTACK_CHACHAS_k29_SORTEDADD(0);ATTACK_CHACHAS_k29_SORTEDADD(1);ATTACK_CHACHAS_k29_SORTEDADD(2);ATTACK_CHACHAS_k29_SORTEDADD(3); - ATTACK_CHACHAS_k29_SORTEDADD(4);ATTACK_CHACHAS_k29_SORTEDADD(5);ATTACK_CHACHAS_k29_SORTEDADD(6);ATTACK_CHACHAS_k29_SORTEDADD(7); - ATTACK_CHACHAS_k29_SORTEDADD(8);ATTACK_CHACHAS_k29_SORTEDADD(9);ATTACK_CHACHAS_k29_SORTEDADD(10);ATTACK_CHACHAS_k29_SORTEDADD(11); - ATTACK_CHACHAS_k29_SORTEDADD(12);ATTACK_CHACHAS_k29_SORTEDADD(13);ATTACK_CHACHAS_k29_SORTEDADD(14);ATTACK_CHACHAS_k29_SORTEDADD(15); - - // now push to global - __syncthreads(); - for (int i=threadIdx.x;i<4096;i+=blockDim.x) { - - uint32_t x = shared_sorted_xs[i]; - uint32_t Ly = shared_sorted_chachas[i];//pair.chacha; - xchacha_pair pair = {}; pair.x = x; pair.chacha = Ly; - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; - int slot = global_counts[bucket_id] + atomicAdd(&shared_counts[bucket_id],1); - if (slot > k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET) printf("Overflow k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET %u SLOT %u\n", k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET, slot); - else xchachas_buckets[k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET * bucket_id + slot] = pair; - } - - //gpu_chacha8_k29_bucketadd time: 10.9147 ms w/ 1024 buckets no multipasses, w/o writing is 3.6ms so writes take 7ms - //Effective Bandwidth (GB/s): 196.752304 - } -} - - - - - -__global__ -void gpu_chacha8_k29_linear(const uint32_t N, - const __restrict__ uint32_t *input, uint32_t *chacha_xs, uint32_t *chacha_ys) -{ - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - uint32_t base_group = blockIdx.x * blockDim.x; - uint32_t base_x = base_group * 16; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 16; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - if (x_group < end_n) { - uint32_t pos = x_group * 16;// + X_START/16; - //printf("x group pos = %u\n", pos); - - datax[0] = input[0];datax[1] = input[1];datax[2] = input[2];datax[3] = input[3];datax[4] = input[4];datax[5] = input[5];datax[6] = input[6];datax[7] = input[7]; - datax[8] = input[8];datax[9] = input[9];datax[10] = input[10];datax[11] = input[11]; - datax[12] = pos; datax[13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[14] = input[14];datax[15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - 
BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - //uint64_t y = datax[0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[0] >> 22; // gives bucket id 0..1023 - ATTACK_CHACHAS_k29_YS_ONLY(0);ATTACK_CHACHAS_k29_YS_ONLY(1);ATTACK_CHACHAS_k29_YS_ONLY(2);ATTACK_CHACHAS_k29_YS_ONLY(3); - ATTACK_CHACHAS_k29_YS_ONLY(4);ATTACK_CHACHAS_k29_YS_ONLY(5);ATTACK_CHACHAS_k29_YS_ONLY(6);ATTACK_CHACHAS_k29_YS_ONLY(7); - ATTACK_CHACHAS_k29_YS_ONLY(8);ATTACK_CHACHAS_k29_YS_ONLY(9);ATTACK_CHACHAS_k29_YS_ONLY(10);ATTACK_CHACHAS_k29_YS_ONLY(11); - ATTACK_CHACHAS_k29_YS_ONLY(12);ATTACK_CHACHAS_k29_YS_ONLY(13);ATTACK_CHACHAS_k29_YS_ONLY(14);ATTACK_CHACHAS_k29_YS_ONLY(15); - } -} - -__global__ -void gpu_chacha8_k29_to_kbc(const uint32_t N, - const __restrict__ uint32_t *input, xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - uint32_t base_group = blockIdx.x * blockDim.x; - uint32_t base_x = base_group * 16; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 16; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - if (x_group < end_n) { - uint32_t pos = x_group * 16;// + X_START/16; - //printf("x group pos = %u\n", pos); - - datax[0] = input[0];datax[1] = input[1];datax[2] = input[2];datax[3] = input[3];datax[4] = input[4];datax[5] = input[5];datax[6] = input[6];datax[7] = input[7]; - datax[8] = input[8];datax[9] = input[9];datax[10] = input[10];datax[11] = input[11]; - datax[12] = pos; datax[13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[14] = input[14];datax[15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - //uint64_t y = datax[0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[0] >> 22; // gives bucket id 0..1023 - 
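/*
 * For reference, the xchacha_pair layout these macros scatter (fields inferred
 * from their usage in this file -- "pair.x = x; pair.chacha = Ly;" -- the actual
 * definition lives elsewhere in the source tree):
 *
 * struct xchacha_pair {
 *     uint32_t x;       // input index, 0 .. k29_MAX_X_VALUE-1
 *     uint32_t chacha;  // raw 32-bit chacha output used as y
 * };
 *
 * At 8 bytes per pair, one 128-byte global write (a full-bandwidth transaction)
 * holds 16 pairs, which is where the "16 entries from a bucket" targets in the
 * warp/threshold kernels above come from.
 */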
ATTACK_CHACHAS_k29_TO_KBC(0);ATTACK_CHACHAS_k29_TO_KBC(1);ATTACK_CHACHAS_k29_TO_KBC(2);ATTACK_CHACHAS_k29_TO_KBC(3); - ATTACK_CHACHAS_k29_TO_KBC(4);ATTACK_CHACHAS_k29_TO_KBC(5);ATTACK_CHACHAS_k29_TO_KBC(6);ATTACK_CHACHAS_k29_TO_KBC(7); - ATTACK_CHACHAS_k29_TO_KBC(8);ATTACK_CHACHAS_k29_TO_KBC(9);ATTACK_CHACHAS_k29_TO_KBC(10);ATTACK_CHACHAS_k29_TO_KBC(11); - ATTACK_CHACHAS_k29_TO_KBC(12);ATTACK_CHACHAS_k29_TO_KBC(13);ATTACK_CHACHAS_k29_TO_KBC(14);ATTACK_CHACHAS_k29_TO_KBC(15); - } -} - -__global__ -void gpu_chacha_ys_bucket_direct(const uint32_t N, const __restrict__ uint32_t *chacha_ys, - xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - uint32_t x = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N; - - if (x < end_n) { - uint32_t chacha_y = chacha_ys[x]; - uint32_t Ly = chacha_y; // (((uint64_t) chacha_y) << 6) + (x >> 26); - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; - int slot = atomicAdd(&xchachas_bucket_counts[bucket_id],1); - xchacha_pair pair = { x, chacha_y }; - xchachas_buckets[bucket_id * k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET + slot] = pair; - } -} - -__global__ -void gpu_chacha_ys_bucket_shared_counts(const uint32_t N, const __restrict__ uint32_t *chacha_ys, - xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - __shared__ uint shared_counts[k29_CHACHA_SPLIT_BUCKETS]; - __shared__ uint global_counts[k29_CHACHA_SPLIT_BUCKETS]; - - uint32_t x = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N; - - if (x < end_n) { - - for (int i=threadIdx.x;i> 26); - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; - xchacha_pair pair = { x, chacha_y }; - atomicAdd(&shared_counts[bucket_id],1); - - __syncthreads(); - for (int i=threadIdx.x;i k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET) printf("Overflow k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET %u SLOT %u\n", k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET, slot); - else xchachas_buckets[k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET * bucket_id + slot] = pair; // shared_chachas[i]; - } - } - - -} - -__global__ -void gpu_test_cache_cp(const uint32_t N, const uint32_t cache_size_bytes, uint32_t *cache) -{ - uint32_t x = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N; - //if (threadIdx.x == 0) { - // printf("gridDim.x: %u blockIdx.x: %u our block id: %u total blocks: %u\n", gridDim.x, blockIdx.x, block_id, total_blocks); - // } - const uint32_t CACHE_SIZE = 1024*1024; - if (x < end_n) { - //uint32_t address = x*1; // x*1: 1 write - 862GB/s vs 1444G/s cached = 1.67x - //uint32_t address = x*4; // x*4: 1 write - 171GB/s in no cache zone, and 404GB/s in cache zone = 2.3x - uint32_t address = x*64 + 1; // Xt*8: 1 write - 85GB/s in no cache zone, and 204GB/s in cache zone = 2.4x (2.63ms) - // *8 with 2 writes (8 byte) : 85GB/s same as 1 write, so effective 170GB/s cache zone: 137GB/s effective 274GB/s in cache zone - // *8 with 4 writes (16 byte): 81GB/s..so x4 = 240GB/s - // *64 with 1 writes (4 byte): 38GB/s .. cache: 127GB/s in cache, - // *64 with 4 writes (16 byte): 32GB/s full random write effective x4 = 120GB/s, cache doesn't seem to help here. - // also writing at address*64+1 didn't affect write speed strangely. - // *64 with 6 writes (24 byte): 21GB/s - x6 = 120GB/s - // *64 with 8 writes (32 byte): 16GB/s - x8 = 128GB/s - // *8 with 32 bytes: 17.9ms - const int BOUNDS = false ? 
CACHE_SIZE : N; - cache[(address + 0) % BOUNDS] = x; - cache[(address + 1) % BOUNDS] = x; - cache[(address + 2) % BOUNDS] = x; - cache[(address + 3) % BOUNDS] = x; - - //float4 val; - //const float4* myinput = cache+address; - //asm("ld.global.cv.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(val.x), "=f"(val.y), "=f"(val.z), "=f"(val.w) : "l"(myinput)); - - //cache[(address + 4) % BOUNDS] = x; - //cache[(address + 5) % BOUNDS] = x; - //cache[(address + 6) % BOUNDS] = x; - //cache[(address + 7) % BOUNDS] = x; - - //cache[(x*8) % (N)] = x; - // x*1: 862GB/s vs 1444G/s cached = 1.67x - // x*4: 171GB/s in no cache zone, and 404GB/s in cache zone = 2.3x - // x*8: 85GB/s in no cache zone, and 204GB/s in cache zone = 2.4x - } -} - - - - -__global__ -void gpu_count_xpairs_kbc_buckets( - const xchacha_pair *xchachas_buckets, const uint *xchachas_block_counts, uint *global_kbc_counts) -{ - uint32_t block_id = blockIdx.x; - const uint32_t num_in_block = xchachas_block_counts[block_id]; - const uint32_t block_offset = k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET * block_id; - - for (int i=block_offset + threadIdx.x;i> 26); - //uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; - uint32_t kbc_id = pair.chacha / k29_BC_BUCKET_DIVISOR; // hack for k28 size kBC; - //printf("x: %u chacha: %u bucket: %u kbc_id:%u\n", pair.x, pair.chacha, bucket_id, kbc_id); - int slot = atomicAdd(&global_kbc_counts[kbc_id],1); - } -} - - -__global__ void gpu_check_xpairs(const xchacha_pair *xchachas_in, const uint32_t N) { - if (threadIdx.x == 0) { - for (int i=0;i> 26); - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; - printf("%u = %u bucket: %u\n", pair.x, pair.chacha, bucket_id); - } - } -} - -#include // or equivalently - - -void do_k29_T1() { - - std::cout << "do k29 T1 BATCHES:" << k29_BATCHES << std::endl; - - auto total_start = std::chrono::high_resolution_clock::now(); - auto finish = std::chrono::high_resolution_clock::now(); // just to allocate - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEvent_t begin, end; - cudaEventCreate(&begin); - cudaEventCreate(&end); - cudaEventRecord(begin); - - - int blockSize; // # of threads per block, maximum is 1024. - uint64_t calc_N; - uint64_t calc_blockSize; - uint64_t calc_numBlocks; - int numBlocks; - - // first phase is writing chacha results - uint32_t *chacha_ys = (uint32_t *) &device_buffer_A[0]; // set ys to beginning of device buffer A - uint32_t *chacha_xs = (uint32_t *) &device_buffer_A[k29_MAX_X_VALUE*4]; // set ys to beginning of device buffer A - xchacha_pair *xchachas_buckets = (xchacha_pair *) &device_buffer_A[k29_MAX_X_VALUE*4]; - float milliseconds = 0; - - std::cout << " gpu_chacha8_k29_bucketadd ys num:" << calc_N << std::endl; - blockSize = 256; // # of threads per block, maximum is 1024. 
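/*
 * NOTE: the "ys num:" line above prints calc_N before it is first assigned (the
 * assignment happens just below), so the value shown there is indeterminate.
 *
 * The record/sync/elapsed sequence repeated throughout do_k29_T1 could be
 * wrapped once; a hedged helper sketch (our naming, not from this codebase):
 *
 * template <typename Launch>
 * float time_launch_ms(cudaEvent_t start, cudaEvent_t stop, Launch launch) {
 *     cudaEventRecord(start);
 *     launch();                    // e.g. [&]{ kernel<<<numBlocks,blockSize>>>(...); }
 *     cudaEventRecord(stop);
 *     cudaEventSynchronize(stop);  // waits for the kernel and the stop event
 *     float ms = 0.0f;
 *     cudaEventElapsedTime(&ms, start, stop);
 *     return ms;
 * }
 */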
- calc_N = k29_MAX_X_VALUE; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - numBlocks = calc_numBlocks; - std::cout << "   numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl; - CUDA_CHECK_RETURN(cudaMemset(global_kbc_counts, 0, k29_BC_NUM_BUCKETS*sizeof(int))); - cudaEventRecord(start); - //gpu_chacha8_k29_to_kbc<<<numBlocks,blockSize>>>(calc_N, chacha_input,xchachas_buckets, global_kbc_counts); - // cuda event total time: 65.0044 ms - //gpu_chacha8_k29_bucketadd time: 23.0625 ms - //Effective Bandwidth (GB/s): 46.557852 - //gpu_chacha8_k29_bucketadd time: 23.0697 ms - //Effective Bandwidth (GB/s): 46.543388 - - //gpu_chacha8_k29_bucketadd<<<numBlocks,blockSize>>>(calc_N, chacha_input,xchachas_buckets, xchachas_bucket_counts); - //gpu_chacha8_k29_bucketadd_256threads_upto1024buckets<<<numBlocks,blockSize>>>(calc_N, chacha_input,xchachas_buckets, xchachas_bucket_counts); - gpu_chacha8_k29_bucketadd_256threads_upto1024buckets<<<numBlocks,blockSize>>>(calc_N, chacha_input,xchachas_buckets, xchachas_bucket_counts); - //gpu_chacha8_k29_bucketadd_256threads_upto1024buckets<<<numBlocks,blockSize>>>(calc_N, chacha_input,xchachas_buckets, xchachas_bucket_counts); - - // counter list counts SUM:134217728 MAX:132347 id: 0 count: 131044 6.06925 ms (GB/s): 176.706432 - - cudaEventRecord(stop); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - cudaEventSynchronize(stop); - - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "gpu_chacha8_k29_bucketadd_256threads_upto1024buckets time: " << milliseconds << " ms\n"; - printf("Effective Bandwidth (GB/s): %f\n", calc_N*8/milliseconds/1e6); - - //1024 buckets multiple 1 gpu_chacha8_k29_bucketadd time: 11.008 ms Effective Bandwidth (GB/s): 195.083904 -/* - cudaEventRecord(start); - // 1 block per split bucket, threads will have to work out how much to parse - gpu_count_xpairs_kbc_buckets<<<k29_CHACHA_SPLIT_BUCKETS,256>>>(xchachas_buckets, xchachas_bucket_counts, global_kbc_counts); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "gpu_count_xpairs_kbc_buckets time: " << milliseconds << " ms\n"; - printf("Effective Bandwidth (GB/s): %f\n", k29_MAX_X_VALUE*8/milliseconds/1e6); - - thrust::device_ptr<uint> device_kbc_counts(global_kbc_counts); - cudaEventRecord(start); - thrust::exclusive_scan(device_kbc_counts, device_kbc_counts + k29_BC_NUM_BUCKETS, device_kbc_counts); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "exclusive scan kbc_buckets time: " << milliseconds << " ms\n"; -*/ - - std::cout << "   gpu_test_cache ys num:" << calc_N << std::endl; - blockSize = 256; // # of threads per block, maximum is 1024. - calc_N = k29_MAX_X_VALUE; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize); - numBlocks = calc_numBlocks; - std::cout << "   numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl; - cudaEventRecord(start); - gpu_test_cache_cp<<<numBlocks,blockSize>>>(calc_N, calc_N, chacha_ys); - cudaEventRecord(stop); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - cudaEventSynchronize(stop);//auto sort_start = std::chrono::high_resolution_clock::now(); - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "cache linear test " << calc_N << " time: " << milliseconds << " ms\n"; - printf("Effective Bandwidth (GB/s): %f\n", calc_N*4/milliseconds/1e6); - - - { - // thrust linear then sort method - - std::cout << "   gpu_chacha8_k29_linear ys num:" << calc_N << std::endl; - - blockSize = 256; // # of threads per block, maximum is 1024.
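/*
 * Grid sizing in this file divides by (blockSize * 16) because each chacha thread
 * produces 16 x values per block counter. The same computation as a helper (our
 * naming; equivalent arithmetic for the power-of-two sizes used here):
 *
 * static inline int blocks_for(uint64_t n, int block_size, int xs_per_thread) {
 *     uint64_t threads = (n + xs_per_thread - 1) / xs_per_thread; // ceil(n/16)
 *     return (int)((threads + block_size - 1) / block_size);      // ceil(threads/bs)
 * }
 * // blocks_for(k29_MAX_X_VALUE, 256, 16) == 131072
 */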
- calc_N = k29_MAX_X_VALUE; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - numBlocks = calc_numBlocks; - std::cout << "   numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl; - cudaEventRecord(start); - gpu_chacha8_k29_linear<<<numBlocks,blockSize>>>(calc_N, chacha_input,chacha_xs,chacha_ys); - cudaEventRecord(stop); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - cudaEventSynchronize(stop);//auto sort_start = std::chrono::high_resolution_clock::now(); - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "write chachas time: " << milliseconds << " ms\n"; - printf("Effective Bandwidth (GB/s): %f\n", calc_N*8/milliseconds/1e6); - - /*auto sort_start = std::chrono::high_resolution_clock::now(); - cudaEventRecord(start); - thrust::device_ptr<uint32_t> device_xs_L_ptr(chacha_xs); - thrust::device_ptr<uint32_t> device_ys_L_ptr(chacha_ys); - thrust::sort_by_key(device_ys_L_ptr, device_ys_L_ptr + calc_N, device_xs_L_ptr); - cudaEventRecord(stop); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto sort_finish = std::chrono::high_resolution_clock::now(); - std::cout << "   sort time: " << std::chrono::duration_cast<std::chrono::milliseconds>(sort_finish - sort_start).count() << " ms\n"; - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "thrust sort " << calc_N << " time: " << milliseconds << " ms\n"; - printf("Effective Bandwidth (GB/s): %f\n", calc_N*8*2/milliseconds/1e6);*/ - } - - {// Declare, allocate, and initialize device-accessible pointers for sorting data - - // Determine temporary device storage requirements - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - chacha_ys, chacha_ys, chacha_xs, chacha_xs, k29_MAX_X_VALUE); - // Allocate temporary storage - cudaMalloc(&d_temp_storage, temp_storage_bytes); - // Run sorting operation - // NB: cub::DeviceRadixSort does not sort in place; the key/value outputs must not - // alias the inputs (or use a cub::DoubleBuffer), so as written this call only - // measures timing, not a usable sorted result. - cudaEventRecord(start); - cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - chacha_ys, chacha_ys, chacha_xs, chacha_xs, k29_MAX_X_VALUE); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - // thrust is 13ms - std::cout << "cuda sort " << calc_N << " time: " << milliseconds << " ms\n"; - } - - - /*std::cout << "   gpu_chacha split buckets (num: " << k29_CHACHA_SPLIT_BUCKETS << " divisor:" << k29_CHACHA_SPLIT_BUCKET_DIVISOR << ") ys num:" << calc_N << std::endl; - blockSize = 1024; // # of threads per block, maximum is 1024. - calc_N = k29_MAX_X_VALUE; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize); - numBlocks = calc_numBlocks; - cudaEventRecord(start); - // erm...thrust sort on 268435456 is 27ms...just saying, beats most other timings. - // and sort on 33million elements (1/8 of total) is 3ms. In other words...worth doing...
- //gpu_chacha_ys_bucket_shared_counts<<>>(calc_N, chacha_ys, xchachas_buckets, xchachas_bucket_counts); - //counter list counts SUM:268435456 MAX:33561699 - // gpu_chacha split buckets (num: 1024 divisor:4194304) ys num:268435456 time: 47.1419 ms (GB/s): 68.330432 - // gpu_chacha split buckets (num: 128 divisor:33554432) ys num:268435456 time: 38.0465 ms (GB/s): 84.665424 - // gpu_chacha_ys_bucket_shared (num: 32 divisor:134217728) ys num:268435456 time: 17.9118 ms (GB/s): 179.838096 - // gpu_chacha split buckets (num: 8 divisor:536870912) ys num:268435456 time: 6.79731 ms (GB/s): 473.896960 - // -> note 8*8*8*8 = 4096, and would take 27ms, which is less than 1024 @ 47ms - //gpu_chacha_ys_bucket_direct<<>>(calc_N, chacha_ys, xchachas_buckets, xchachas_bucket_counts); - // gpu_chacha_ys_bucket_direct (num: 1136761 divisor:3778) ys num:268435456 time: 102.703 ms (GB/s): 31.364442 - // gpu_chacha_ys_bucket_direct (num: 1024 divisor:4194304) ys num:268435456 time: 48.5682 ms (GB/s): 66.323720 - // gpu_chacha_ys_bucket_direct (num: 128 divisor:33554432) ys num:268435456 time: 73.6359 ms (GB/s): 43.745292 - // gpu_chacha_ys_bucket_direct (num: 32 divisor:134217728) ys num:268435456 time: 85.1845 ms (GB/s): 37.814688 - cudaEventRecord(stop); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "bucket chachas time: " << milliseconds << " ms\n"; - printf("Effective Bandwidth (GB/s): %f\n", (calc_N*(4+8))/milliseconds/1e6);*/ - - //gpu_get_max_counts_from_counter_list<<<1,1>>>(xchachas_bucket_counts, k29_CHACHA_SPLIT_BUCKETS, true); - //<<<1,1>>>(global_kbc_counts, 1024, false); - cudaEventRecord(end); - cudaEventSynchronize(end); - cudaEventElapsedTime(&milliseconds, begin, end); - std::cout << "cuda event total time: " << milliseconds << " ms\n"; -} - - -void setup_memory_k29() { - - //setupMMap(HOST_ALLOCATED_BYTES); // potentially useful if going to do random reads/writes to stored data - - //std::cout << " device_block_entry_counts (" << k29_BATCHES << "): " << k29_BATCHES << " size:" << (sizeof(int)*k29_BATCHES) << std::endl; - //CUDA_CHECK_RETURN(cudaMallocManaged(&device_block_entry_counts, k29_BATCHES*sizeof(int))); - - std::cout << " device_local_kbc_num_entries " << k29_BC_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*k29_BC_NUM_BUCKETS) << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&global_kbc_counts, k29_BC_NUM_BUCKETS*sizeof(int))); - CUDA_CHECK_RETURN(cudaMemset(global_kbc_counts, 0, k29_BC_NUM_BUCKETS*sizeof(int))); - - //Tx_Pairing_Chunk_Meta4 *device_buffer_A; - std::cout << " device_buffer_A " << k29_DEVICE_BUFFER_A_BYTES << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_A, k29_DEVICE_BUFFER_A_BYTES)); - - std::cout << " xchachas_bucket_counts k29_CHACHA_SPLIT_BUCKETS:" << k29_CHACHA_SPLIT_BUCKETS << std::endl; - CUDA_CHECK_RETURN(cudaMallocManaged(&xchachas_bucket_counts, k29_CHACHA_SPLIT_BUCKETS*sizeof(int))); - CUDA_CHECK_RETURN(cudaMemset(xchachas_bucket_counts, 0, k29_CHACHA_SPLIT_BUCKETS*sizeof(int))); - - - //Tx_Pairing_Chunk_Meta4 *device_buffer_B; - //std::cout << " device_buffer_B " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << DEVICE_BUFFER_UNIT_BYTES << ") = " << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - //CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_B, DEVICE_BUFFER_ALLOCATED_BYTES)); - - //std::cout << " device_buffer_refdata " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << 
BACKREF_UNIT_BYTES << ") = " << BACKREF_ALLOCATED_BYTES << std::endl; - //CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_refdata, BACKREF_ALLOCATED_BYTES)); - - //std::cout << " HOST host_refdata_blocks ENTRIES: " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " ALLOCATED ENTRIES: " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " UNIT BYTES: " << BACKREF_UNIT_BYTES << " = " << (BACKREF_ALLOCATED_BYTES) << std::endl; - //CUDA_CHECK_RETURN(cudaMallocHost((void**)&host_refdata_blocks, BACKREF_ALLOCATED_BYTES)); // = new F2_Result_Pair[HOST_F2_RESULTS_SPACE](); - - //std::cout << " HOST host_criss_cross_blocks MAX_ENTRIES: " << HOST_MAX_BLOCK_ENTRIES << " ALLOCATED ENTRIES: " << HOST_ALLOCATED_ENTRIES << " UNIT BYTES: " << HOST_UNIT_BYTES << " = " << (HOST_ALLOCATED_BYTES) << std::endl; - //CUDA_CHECK_RETURN(cudaMallocHost((void**)&host_criss_cross_blocks, HOST_ALLOCATED_BYTES)); // = new F2_Result_Pair[HOST_F2_RESULTS_SPACE](); -} - -void do_k29() { - std::cout << "****** PROGRAM START K29 V0.1 *********" << std::endl; - - setup_memory_k29(); - - - auto total_start = std::chrono::high_resolution_clock::now(); - do_k29_T1(); - std::cout << " freeing memory..."; - freeMemory(); - std::cout << "end." << std::endl; - exit(EXIT_SUCCESS); -} - - - - -#endif /* K29_PLOTTER_HPP_ */ diff --git a/nick_blake3.hpp b/nick_blake3.hpp deleted file mode 100644 index 930f50a..0000000 --- a/nick_blake3.hpp +++ /dev/null @@ -1,336 +0,0 @@ -/* - * nick_blake3.hpp - * - * Created on: Oct 26, 2021 - * Author: nick - */ - -#ifndef NICK_BLAKE3_HPP_ -#define NICK_BLAKE3_HPP_ - -#define CALC_Y_BUCKETED_KBC_ENTRY(entry, bucket_id) \ - (((uint64_t) bucket_id) * ((uint64_t) 15113) + (uint64_t) entry.y) - -#define BSWAP32(i) (__byte_perm ((i), 0, 0x0123)) - -#define NICK_ROTR32(w,c) \ - (((w) >> (c)) | ((w) << (32 - (c)))) - -// rotate32 by 8 * c bits (1 byte) -#define NICK_ROTR32_BYTE8(w,c) __byte_perm (w, w, 0x3210 + 0x1111 * c); - -// optimized for cuda instructions with rotate by multiples of 8 bites -#define NICK_G(a,b,c,d,x,y) \ - state[a] = state[a] + state[b] + x; \ - state[d] = NICK_ROTR32_BYTE8(state[d] ^ state[a], 2); \ - state[c] = state[c] + state[d]; \ - state[b] = NICK_ROTR32(state[b] ^ state[c], 12); \ - state[a] = state[a] + state[b] + y; \ - state[d] = NICK_ROTR32_BYTE8(state[d] ^ state[a], 1); \ - state[c] = state[c] + state[d]; \ - state[b] = NICK_ROTR32(state[b] ^ state[c], 7); \ - - - -__device__ -void nick_blake3(const uint32_t* meta, int meta_len, const uint64_t y, - uint64_t *y_result, uint8_t c_len, uint32_t *c_results) { - uint32_t state[16]; - uint32_t block_words[16];// = {0}; - size_t input_len = 21; - - block_words[0] = BSWAP32(y >> 6); - block_words[1] = BSWAP32(__funnelshift_l ( meta[0], y, 26)); - block_words[2] = BSWAP32(__funnelshift_l ( meta[1], meta[0], 26)); - if (meta_len == 2) { - // [32][6-26][6-26][6-] - block_words[3] = BSWAP32(meta[1] << 26); - input_len = 13; - } - else if (meta_len == 3) { - // [32][6-26][6-26][6-26][6-26][6-] - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(meta[2] << 26); - input_len = 17; - } - else if (meta_len == 4) { - // [32][6-26][6-26][6-26][6-26][6-26][6-] - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(__funnelshift_l ( meta[3], meta[2], 26)); - block_words[5] = BSWAP32(meta[3] << 26); - input_len = 21; - } - else if (meta_len == 6) { - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(__funnelshift_l ( meta[3], meta[2], 26)); - 
block_words[5] = BSWAP32(__funnelshift_l ( meta[4], meta[3], 26)); - block_words[6] = BSWAP32(__funnelshift_l ( meta[5], meta[4], 26)); - block_words[7] = BSWAP32(meta[5] << 26); - input_len = 29; - } - else if (meta_len == 8) { - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(__funnelshift_l ( meta[3], meta[2], 26)); - block_words[5] = BSWAP32(__funnelshift_l ( meta[4], meta[3], 26)); - block_words[6] = BSWAP32(__funnelshift_l ( meta[5], meta[4], 26)); - block_words[7] = BSWAP32(__funnelshift_l ( meta[6], meta[5], 26)); - block_words[8] = BSWAP32(__funnelshift_l ( meta[7], meta[6], 26)); - block_words[9] = BSWAP32(meta[7] << 26); - input_len = 37; - } - - for (int i=meta_len+2;i<16;i++) block_words[i]=0; - - - state[0] = 0x6A09E667UL; - state[1] = 0xBB67AE85UL; - state[2] = 0x3C6EF372UL; - state[3] = 0xA54FF53AUL; - state[4] = 0x510E527FUL; - state[5] = 0x9B05688CUL; - state[6] = 0x1F83D9ABUL; - state[7] = 0x5BE0CD19UL; - state[8] = 0x6A09E667UL; - state[9] = 0xBB67AE85UL; - state[10] = 0x3C6EF372UL; - state[11] = 0xA54FF53AUL; - state[12] = 0; // counter_low(0); - state[13] = 0; // counter_high(0); - state[14] = (uint32_t) input_len; // take;// (uint32_t)output.block_len; - state[15] = (uint32_t) (1 | 2 | 8);// (output.flags | ROOT); - - NICK_G(0,4,8,12,block_words[0],block_words[1]); - NICK_G(1,5,9,13,block_words[2],block_words[3]); - NICK_G(2,6,10,14,block_words[4],block_words[5]); - NICK_G(3,7,11,15,block_words[6],block_words[7]); - NICK_G(0,5,10,15,block_words[8],block_words[9]); - NICK_G(1,6,11,12,block_words[10],block_words[11]); - NICK_G(2,7,8,13,block_words[12],block_words[13]); - NICK_G(3,4,9,14,block_words[14],block_words[15]); - NICK_G(0,4,8,12,block_words[2],block_words[6]); - NICK_G(1,5,9,13,block_words[3],block_words[10]); - NICK_G(2,6,10,14,block_words[7],block_words[0]); - NICK_G(3,7,11,15,block_words[4],block_words[13]); - NICK_G(0,5,10,15,block_words[1],block_words[11]); - NICK_G(1,6,11,12,block_words[12],block_words[5]); - NICK_G(2,7,8,13,block_words[9],block_words[14]); - NICK_G(3,4,9,14,block_words[15],block_words[8]); - NICK_G(0,4,8,12,block_words[3],block_words[4]); - NICK_G(1,5,9,13,block_words[10],block_words[12]); - NICK_G(2,6,10,14,block_words[13],block_words[2]); - NICK_G(3,7,11,15,block_words[7],block_words[14]); - NICK_G(0,5,10,15,block_words[6],block_words[5]); - NICK_G(1,6,11,12,block_words[9],block_words[0]); - NICK_G(2,7,8,13,block_words[11],block_words[15]); - NICK_G(3,4,9,14,block_words[8],block_words[1]); - NICK_G(0,4,8,12,block_words[10],block_words[7]); - NICK_G(1,5,9,13,block_words[12],block_words[9]); - NICK_G(2,6,10,14,block_words[14],block_words[3]); - NICK_G(3,7,11,15,block_words[13],block_words[15]); - NICK_G(0,5,10,15,block_words[4],block_words[0]); - NICK_G(1,6,11,12,block_words[11],block_words[2]); - NICK_G(2,7,8,13,block_words[5],block_words[8]); - NICK_G(3,4,9,14,block_words[1],block_words[6]); - NICK_G(0,4,8,12,block_words[12],block_words[13]); - NICK_G(1,5,9,13,block_words[9],block_words[11]); - NICK_G(2,6,10,14,block_words[15],block_words[10]); - NICK_G(3,7,11,15,block_words[14],block_words[8]); - NICK_G(0,5,10,15,block_words[7],block_words[2]); - NICK_G(1,6,11,12,block_words[5],block_words[3]); - NICK_G(2,7,8,13,block_words[0],block_words[1]); - NICK_G(3,4,9,14,block_words[6],block_words[4]); - NICK_G(0,4,8,12,block_words[9],block_words[14]); - NICK_G(1,5,9,13,block_words[11],block_words[5]); - NICK_G(2,6,10,14,block_words[8],block_words[12]); - 
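/*
 * These unrolled NICK_G blocks are BLAKE3's seven rounds with the message
 * permutation pre-applied: the (x,y) word indices in each run of eight calls
 * match the reference BLAKE3 MSG_SCHEDULE row for that round (verifiable by
 * reading the indices off the calls themselves). Equivalent loop form, for
 * reference only -- the unrolled version avoids the table lookups:
 *
 * static __device__ const uint8_t MSG_SCHEDULE[7][16] = {
 *     { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
 *     { 2,  6,  3, 10,  7,  0,  4, 13,  1, 11, 12,  5,  9, 14, 15,  8},
 *     { 3,  4, 10, 12, 13,  2,  7, 14,  6,  5,  9,  0, 11, 15,  8,  1},
 *     {10,  7, 12,  9, 14,  3, 13, 15,  4,  0, 11,  2,  5,  8,  1,  6},
 *     {12, 13,  9, 11, 15, 10, 14,  8,  7,  2,  5,  3,  0,  1,  6,  4},
 *     { 9, 14, 11,  5,  8, 12, 15,  1, 13,  3,  0, 10,  2,  6,  4,  7},
 *     {11, 15,  5,  0,  1,  9,  8,  6, 14, 10,  2, 12,  3,  4,  7, 13},
 * };
 * for (int r = 0; r < 7; r++) {
 *     const uint8_t *s = MSG_SCHEDULE[r];
 *     NICK_G(0,4, 8,12, block_words[s[ 0]], block_words[s[ 1]]);
 *     NICK_G(1,5, 9,13, block_words[s[ 2]], block_words[s[ 3]]);
 *     NICK_G(2,6,10,14, block_words[s[ 4]], block_words[s[ 5]]);
 *     NICK_G(3,7,11,15, block_words[s[ 6]], block_words[s[ 7]]);
 *     NICK_G(0,5,10,15, block_words[s[ 8]], block_words[s[ 9]]);
 *     NICK_G(1,6,11,12, block_words[s[10]], block_words[s[11]]);
 *     NICK_G(2,7, 8,13, block_words[s[12]], block_words[s[13]]);
 *     NICK_G(3,4, 9,14, block_words[s[14]], block_words[s[15]]);
 * }
 */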
NICK_G(3,7,11,15,block_words[15],block_words[1]); - NICK_G(0,5,10,15,block_words[13],block_words[3]); - NICK_G(1,6,11,12,block_words[0],block_words[10]); - NICK_G(2,7,8,13,block_words[2],block_words[6]); - NICK_G(3,4,9,14,block_words[4],block_words[7]); - NICK_G(0,4,8,12,block_words[11],block_words[15]); - NICK_G(1,5,9,13,block_words[5],block_words[0]); - NICK_G(2,6,10,14,block_words[1],block_words[9]); - NICK_G(3,7,11,15,block_words[8],block_words[6]); - NICK_G(0,5,10,15,block_words[14],block_words[10]); - NICK_G(1,6,11,12,block_words[2],block_words[12]); - NICK_G(2,7,8,13,block_words[3],block_words[4]); - NICK_G(3,4,9,14,block_words[7],block_words[13]); - - - uint32_t r0 = BSWAP32(state[0] ^ state[8]); - uint32_t r1 = BSWAP32(state[1] ^ state[9]); // y_result is 38 bits of [a][6-] - uint32_t r2 = BSWAP32(state[2] ^ state[10]); - uint32_t r3 = BSWAP32(state[3] ^ state[11]); - uint32_t r4 = BSWAP32(state[4] ^ state[12]); - uint32_t r5 = BSWAP32(state[5] ^ state[13]); - - // MINOR OPTIMIZATION: on last table could just return top 32 bits instead of the 38 bits. - uint64_t y_hi = __funnelshift_l ( r0, 0, 6); // shift 6 of top bits of r0 into y_hi - uint32_t y_lo = __funnelshift_l ( r1, r0, 6); - if (c_len > 0) { - c_results[0] = __funnelshift_l ( r2, r1, 6); - c_results[1] = __funnelshift_l ( r3, r2, 6); - } - if (c_len > 2) { - c_results[2] = __funnelshift_l ( r4, r3, 6); - } - if (c_len > 3) { - c_results[3] = __funnelshift_l ( r5, r4, 6); - } - - (*y_result) = (y_hi << 32) + y_lo; - -} - -__device__ -void nick_blake_k29(const uint32_t* meta, int meta_len, const uint64_t y, - uint64_t *y_result, uint8_t c_len, uint32_t *c_results) { - uint32_t state[16]; - uint32_t block_words[16];// = {0}; - size_t input_len = 21; - - block_words[0] = BSWAP32(y >> 6); - block_words[1] = BSWAP32(__funnelshift_l ( meta[0], y, 26)); - block_words[2] = BSWAP32(__funnelshift_l ( meta[1], meta[0], 26)); - if (meta_len == 2) { - // [32][6-26][6-26][6-] - block_words[3] = BSWAP32(meta[1] << 26); - input_len = 13; - } - else if (meta_len == 3) { - // [32][6-26][6-26][6-26][6-26][6-] - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(meta[2] << 26); - input_len = 17; - } - else if (meta_len == 4) { - // [32][6-26][6-26][6-26][6-26][6-26][6-] - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(__funnelshift_l ( meta[3], meta[2], 26)); - block_words[5] = BSWAP32(meta[3] << 26); - input_len = 21; - } - else if (meta_len == 6) { - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(__funnelshift_l ( meta[3], meta[2], 26)); - block_words[5] = BSWAP32(__funnelshift_l ( meta[4], meta[3], 26)); - block_words[6] = BSWAP32(__funnelshift_l ( meta[5], meta[4], 26)); - block_words[7] = BSWAP32(meta[5] << 26); - input_len = 29; - } - else if (meta_len == 8) { - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(__funnelshift_l ( meta[3], meta[2], 26)); - block_words[5] = BSWAP32(__funnelshift_l ( meta[4], meta[3], 26)); - block_words[6] = BSWAP32(__funnelshift_l ( meta[5], meta[4], 26)); - block_words[7] = BSWAP32(__funnelshift_l ( meta[6], meta[5], 26)); - block_words[8] = BSWAP32(__funnelshift_l ( meta[7], meta[6], 26)); - block_words[9] = BSWAP32(meta[7] << 26); - input_len = 37; - } - - for (int i=meta_len+2;i<16;i++) block_words[i]=0; - - - state[0] = 0x6A09E667UL; - state[1] = 0xBB67AE85UL; - state[2] = 0x3C6EF372UL; - state[3] = 0xA54FF53AUL; - 
state[4] = 0x510E527FUL; - state[5] = 0x9B05688CUL; - state[6] = 0x1F83D9ABUL; - state[7] = 0x5BE0CD19UL; - state[8] = 0x6A09E667UL; - state[9] = 0xBB67AE85UL; - state[10] = 0x3C6EF372UL; - state[11] = 0xA54FF53AUL; - state[12] = 0; // counter_low(0); - state[13] = 0; // counter_high(0); - state[14] = (uint32_t) input_len; // take;// (uint32_t)output.block_len; - state[15] = (uint32_t) (1 | 2 | 8);// (output.flags | ROOT); - - NICK_G(0,4,8,12,block_words[0],block_words[1]); - NICK_G(1,5,9,13,block_words[2],block_words[3]); - NICK_G(2,6,10,14,block_words[4],block_words[5]); - NICK_G(3,7,11,15,block_words[6],block_words[7]); - NICK_G(0,5,10,15,block_words[8],block_words[9]); - NICK_G(1,6,11,12,block_words[10],block_words[11]); - NICK_G(2,7,8,13,block_words[12],block_words[13]); - NICK_G(3,4,9,14,block_words[14],block_words[15]); - NICK_G(0,4,8,12,block_words[2],block_words[6]); - NICK_G(1,5,9,13,block_words[3],block_words[10]); - NICK_G(2,6,10,14,block_words[7],block_words[0]); - NICK_G(3,7,11,15,block_words[4],block_words[13]); - NICK_G(0,5,10,15,block_words[1],block_words[11]); - NICK_G(1,6,11,12,block_words[12],block_words[5]); - NICK_G(2,7,8,13,block_words[9],block_words[14]); - NICK_G(3,4,9,14,block_words[15],block_words[8]); - NICK_G(0,4,8,12,block_words[3],block_words[4]); - NICK_G(1,5,9,13,block_words[10],block_words[12]); - NICK_G(2,6,10,14,block_words[13],block_words[2]); - NICK_G(3,7,11,15,block_words[7],block_words[14]); - NICK_G(0,5,10,15,block_words[6],block_words[5]); - NICK_G(1,6,11,12,block_words[9],block_words[0]); - NICK_G(2,7,8,13,block_words[11],block_words[15]); - NICK_G(3,4,9,14,block_words[8],block_words[1]); - NICK_G(0,4,8,12,block_words[10],block_words[7]); - NICK_G(1,5,9,13,block_words[12],block_words[9]); - NICK_G(2,6,10,14,block_words[14],block_words[3]); - NICK_G(3,7,11,15,block_words[13],block_words[15]); - NICK_G(0,5,10,15,block_words[4],block_words[0]); - NICK_G(1,6,11,12,block_words[11],block_words[2]); - NICK_G(2,7,8,13,block_words[5],block_words[8]); - NICK_G(3,4,9,14,block_words[1],block_words[6]); - NICK_G(0,4,8,12,block_words[12],block_words[13]); - NICK_G(1,5,9,13,block_words[9],block_words[11]); - NICK_G(2,6,10,14,block_words[15],block_words[10]); - NICK_G(3,7,11,15,block_words[14],block_words[8]); - NICK_G(0,5,10,15,block_words[7],block_words[2]); - NICK_G(1,6,11,12,block_words[5],block_words[3]); - NICK_G(2,7,8,13,block_words[0],block_words[1]); - NICK_G(3,4,9,14,block_words[6],block_words[4]); - NICK_G(0,4,8,12,block_words[9],block_words[14]); - NICK_G(1,5,9,13,block_words[11],block_words[5]); - NICK_G(2,6,10,14,block_words[8],block_words[12]); - NICK_G(3,7,11,15,block_words[15],block_words[1]); - NICK_G(0,5,10,15,block_words[13],block_words[3]); - NICK_G(1,6,11,12,block_words[0],block_words[10]); - NICK_G(2,7,8,13,block_words[2],block_words[6]); - NICK_G(3,4,9,14,block_words[4],block_words[7]); - NICK_G(0,4,8,12,block_words[11],block_words[15]); - NICK_G(1,5,9,13,block_words[5],block_words[0]); - NICK_G(2,6,10,14,block_words[1],block_words[9]); - NICK_G(3,7,11,15,block_words[8],block_words[6]); - NICK_G(0,5,10,15,block_words[14],block_words[10]); - NICK_G(1,6,11,12,block_words[2],block_words[12]); - NICK_G(2,7,8,13,block_words[3],block_words[4]); - NICK_G(3,4,9,14,block_words[7],block_words[13]); - - uint32_t r0 = BSWAP32(state[0] ^ state[8]); - uint32_t r1 = BSWAP32(state[1] ^ state[9]); // y_result is 38 bits of [a][6-] - uint32_t r2 = BSWAP32(state[2] ^ state[10]); - uint32_t r3 = BSWAP32(state[3] ^ state[11]); - uint32_t r4 = BSWAP32(state[4] 
^ state[12]);
-	uint32_t r5 = BSWAP32(state[5] ^ state[13]);
-
-	// MINOR OPTIMIZATION: on the last table we could return just the top 32 bits instead of all 38 bits.
-	uint64_t y_hi = __funnelshift_l ( r0, 0, 6); // the top 6 bits of r0 become the high word of y
-	uint32_t y_lo = __funnelshift_l ( r1, r0, 6); // (r0:r1) << 6 gives the low 32 bits of y
-	if (c_len > 0) {
-		c_results[0] = __funnelshift_l ( r2, r1, 6);
-		c_results[1] = __funnelshift_l ( r3, r2, 6);
-	}
-	if (c_len > 2) {
-		c_results[2] = __funnelshift_l ( r4, r3, 6);
-	}
-	if (c_len > 3) {
-		c_results[3] = __funnelshift_l ( r5, r4, 6);
-	}
-
-	(*y_result) = ((y_hi << 32) + y_lo) >> 3;
-
-}
-
-
-
-#endif /* NICK_BLAKE3_HPP_ */
diff --git a/nick_globals.hpp b/nick_globals.hpp
deleted file mode 100644
index badc637..0000000
--- a/nick_globals.hpp
+++ /dev/null
@@ -1,457 +0,0 @@
-#ifndef NICK_GLOBALS_HPP_
-#define NICK_GLOBALS_HPP_
-
-#include <iostream>
-#include <string>
-
-using std::string;
-
-const uint32_t BATCHES = 64;
-const uint64_t BATCHBC = (uint64_t) 1 << (38 - 6);
-
-const uint32_t KBC_MAX_ENTRIES_PER_BUCKET = 400;
-const uint32_t kBC_NUM_BUCKETS = 18188177;
-const uint32_t kBC_LAST_BUCKET_ID = 18188176;
-const uint16_t kBC = 15113;
-const uint32_t KBCS_PER_BATCH = (kBC_NUM_BUCKETS / BATCHES)+1;
-const uint32_t KBC_LOCAL_NUM_BUCKETS = KBCS_PER_BATCH + 1; // +1 includes space for the last R bucket
-
-#define CALC_BATCH_BUCKET_ADD_Y(batch_id) ((((uint64_t) 1) << (38-6)) * ((uint64_t) batch_id))
-#define CALC_KBC_BUCKET_ADD_Y(kbc_bucket_id) (((uint64_t) kBC) * ((uint64_t) kbc_bucket_id))
-
-#define MIN_KBC_BUCKET_FOR_BATCH(batch_id) \
-	( (uint32_t) ((((uint64_t) 1 << 32) * ((uint64_t) (batch_id))) / ((uint64_t) kBC) ));
-
-
-const uint64_t HOST_UNIT_BYTES = 20; //12// Bytes used for the biggest host entry.
-const uint64_t HOST_MAX_BLOCK_ENTRIES = 1114112; // MUST be a multiple of 32 so it works with bit masking // 1052614 (min calculated) // 1258291; // (120 * ((uint64_t) 1 << 32)) / (100*(BATCHES * BATCHES));
-const uint64_t HOST_ALLOCATED_ENTRIES = HOST_MAX_BLOCK_ENTRIES * BATCHES * BATCHES;
-const uint64_t HOST_ALLOCATED_BYTES = HOST_UNIT_BYTES * HOST_ALLOCATED_ENTRIES;
-
-
-const uint64_t DEVICE_BUFFER_UNIT_BYTES = 24;//32; // Tx_Pairing_Chunk_Meta4 is 24 bytes, w/ backref it is 32 bytes
-
-const uint64_t DEVICE_BUFFER_ALLOCATED_ENTRIES = KBC_LOCAL_NUM_BUCKETS * KBC_MAX_ENTRIES_PER_BUCKET; // HOST_MAX_BLOCK_ENTRIES * BATCHES; // = 120 * ((uint64_t) 1 << 32) / (100*BATCHES);
-const uint64_t DEVICE_BUFFER_ALLOCATED_BYTES = DEVICE_BUFFER_ALLOCATED_ENTRIES * DEVICE_BUFFER_UNIT_BYTES;
-const uint64_t BACKREF_UNIT_BYTES = 12; // backref w/y for the last table is 12 bytes
-const uint64_t BACKREF_ALLOCATED_BYTES = DEVICE_BUFFER_ALLOCATED_ENTRIES * BACKREF_UNIT_BYTES;
-
-
-const uint64_t CROSS_MATRIX_BC = (2097152 * 128) + kBC - ((2097152 * 128) % kBC);
-const uint64_t CROSS_MATRIX_NUM_BUCKETS = 1024; // each batch splits into buckets; the max per bucket depends on the batch size
-const uint64_t CROSS_MATRIX_BATCH_MAX_ENTRIES_PER_BUCKET = (119 * ((uint64_t)1 << 32)) / (100*(CROSS_MATRIX_NUM_BUCKETS * BATCHES));
-const uint64_t CROSS_MATRIX_ALLOCATED_SPACE_PER_BATCH = CROSS_MATRIX_BATCH_MAX_ENTRIES_PER_BUCKET * CROSS_MATRIX_NUM_BUCKETS;
-const uint64_t CROSS_MATRIX_ALLOCATED_SPACE = CROSS_MATRIX_ALLOCATED_SPACE_PER_BATCH * BATCHES;
-
-
-static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
-#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
-
-uint32_t *chacha_input;
-
-// output from F(x) -> chacha
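-// (Added note, not from the original file: F1 builds a 38-bit y by appending 6 extra
-// bits of x to the 32-bit chacha word, y = (chacha_word << 6) | (x >> 26). The bucket
-// split used everywhere below is then simply:
-//   uint32_t kbc_bucket_id = (uint32_t)(y / kBC); // one of the 18188177 kBC buckets
-//   uint32_t local_y       = (uint32_t)(y % kBC); // offset inside the bucket, < 15113
-// which is exactly what CALC_KBC_BUCKET_ADD_Y above inverts.)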
-struct F1_Bucketed_kBC_Entry {
-	uint32_t x;
-	uint32_t y;
-};
-
-struct T1_Match {
-	uint32_t Lx;
-	uint32_t Rx;
-	uint32_t y;
-};
-
-struct T1_Pairing_Chunk {
-	uint32_t Lx;
-	uint32_t Rx;
-	uint32_t y;
-};
-
-struct Tx_Bucketed_Final_Y {
-	uint32_t y;
-};
-
-struct Tx_Bucketed_Meta1 {
-	uint32_t meta[1];
-	uint32_t y;
-};
-
-struct Tx_Bucketed_Meta2 {
-	uint32_t meta[2];
-	uint32_t y;
-};
-
-struct Tx_Bucketed_Meta3 {
-	uint32_t meta[3];
-	uint32_t y;
-};
-
-struct Tx_Bucketed_Meta4 {
-	uint32_t meta[4];
-	uint32_t y;
-};
-
-struct Tx_Bucketed_Meta2_Blockposref {
-	uint32_t meta[2];
-	uint32_t y;
-	uint32_t blockposref;
-};
-
-struct Tx_Bucketed_Meta3_Blockposref {
-	uint32_t meta[3];
-	uint32_t y;
-	uint32_t blockposref;
-};
-
-struct Tx_Bucketed_Meta4_Blockposref {
-	uint32_t meta[4];
-	uint32_t y;
-	uint32_t blockposref;
-};
-
-
-struct Tx_Pairing_Chunk_Meta2 {
-	uint64_t y;
-	uint32_t meta[2];
-	//uint16_t idxL;
-	//uint16_t idxR;
-	//uint32_t p_b_id;
-};
-
-struct Tx_Pairing_Chunk_Meta3 {
-	uint64_t y;
-	uint32_t meta[3];
-	//uint16_t idxL;
-	//uint16_t idxR;
-	//uint32_t p_b_id;
-};
-
-struct Tx_Pairing_Chunk_Meta4 {
-	uint64_t y;
-	uint32_t meta[4];
-	//uint16_t idxL;
-	//uint16_t idxR;
-	//uint32_t p_b_id;
-};
-
-struct Index_Match {
-	uint16_t idxL;
-	uint16_t idxR;
-};
-
-// our base pairing structs that the higher tables resolve back to.
-struct T2BaseRef {
-	uint32_t Lx1;
-	uint32_t Lx2;
-};
-
-struct T3BaseRef {
-	uint32_t Lx1;
-	uint32_t Lx2;
-	uint32_t Lx3;
-	uint32_t Lx4;
-};
-
-struct T2BaseRefWithUsed {
-	uint32_t Lx1;
-	uint32_t Lx2;
-	bool used;
-};
-
-struct BackRef {
-	uint32_t prev_block_ref_L; // (block_id(L) << (32 - 6)) + block_pos
-	uint32_t prev_block_ref_R; // (block_id(R) << (32 - 6)) + block_pos
-};
-
-struct T6BackRef { // 12 bytes
-	uint32_t prev_block_ref_L; // (block_id(L) << (32 - 6)) + block_pos
-	uint32_t prev_block_ref_R; // (block_id(R) << (32 - 6)) + block_pos
-	uint32_t y;
-};
-
-struct T6FinalEntry {
-	uint32_t refL; // 6,6,6 = 24
-	uint32_t refR; // 6,6,6 = 24
-	uint32_t y;    // 32
-};
-
-struct T4FinalEntry {
-	uint32_t Lx1,Lx2,Lx3,Lx4,Lx5,Lx6,Lx7,Lx8;
-};
-
-
-struct RBid_Entry {
-	uint32_t x;
-	uint16_t pos;
-};
-
-// chia specific constants
-const uint32_t K_SIZE = 32;
-const uint64_t K_MAX = ((uint64_t) 1 << K_SIZE);
-const uint64_t K_MAX_Y = K_MAX << 6;
-const uint8_t kExtraBits = 6;
-const uint16_t kB = 119;
-const uint16_t kC = 127;
-const uint32_t nickBC = (2097152 * 128) + kBC - ((2097152 * 128) % kBC);
-const uint32_t NICK_BUCKET_MAX_ENTRIES = 34000 * 128;
-const uint32_t NICK_NUM_BUCKETS = 1024;
-
-// code below is WRONG! the 2nd clause only uses batch_id
-//#define CRISS_CROSS_BLOCK_ID(table, batch_id, block_id) \
-//(((table % 2) == 1) ? batch_id * BATCHES + block_id : batch_id * BATCHES + batch_id)
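-// (Added worked example for getCrissCrossBlockId below: with BATCHES = 64, an odd
-// table maps (batch_id=2, block_id=5) to 2*64+5 = 133, row-major, while an even
-// table maps the same pair to 5*64+2 = 322, column-major. One table's rows are
-// therefore read back as the next table's columns, which is the criss-cross.)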
-
-uint64_t getCrissCrossBlockId(uint8_t table, uint32_t batch_id, uint32_t block_id) {
-	uint64_t cross_row_id = batch_id;
-	uint64_t cross_column_id = block_id;
-	if ((table % 2) == 1) {
-		return (cross_row_id * BATCHES + cross_column_id);
-	} else {
-		return (cross_column_id * BATCHES + cross_row_id);
-	}
-}
-
-inline uint64_t getCrissCrossBlockEntryStartPosition(uint64_t criss_cross_id) {
-	return criss_cross_id * HOST_MAX_BLOCK_ENTRIES;
-}
-
-
-string Strip0x(const string &hex)
-{
-	if (hex.size() > 1 && (hex.substr(0, 2) == "0x" || hex.substr(0, 2) == "0X")) {
-		return hex.substr(2);
-	}
-	return hex;
-}
-
-void HexToBytes(const string &hex, uint8_t *result)
-{
-	for (uint32_t i = 0; i < hex.length(); i += 2) {
-		string byteString = hex.substr(i, 2);
-		uint8_t byte = (uint8_t)strtol(byteString.c_str(), NULL, 16);
-		result[i / 2] = byte;
-	}
-}
-
-void chacha_setup() {
-	string id = "022fb42c08c12de3a6af053880199806532e79515f94e83461612101f9412f9e";
-	//string id = "0000000000000000000000000000000000000000000000000000000000000000";
-
-	uint8_t enc_key[32];
-
-	id = Strip0x(id);
-	std::array<uint8_t, 32> id_bytes;
-	HexToBytes(id, id_bytes.data());
-	uint8_t* orig_key = id_bytes.data();
-
-	enc_key[0] = 1;
-	memcpy(enc_key + 1, orig_key, 31);
-
-	CUDA_CHECK_RETURN(cudaMallocManaged(&chacha_input, 16*sizeof(uint32_t)));
-	// Setup ChaCha8 context with zero-filled IV
-	chacha8_keysetup_data(chacha_input, enc_key, 256, NULL);
-}
-
-// chacha specific macros end
-
-/**
- * Check the return value of the CUDA runtime API call and exit
- * the application if the call has failed.
- */
-static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
-{
-	if (err == cudaSuccess)
-		return;
-	std::cerr << statement << " returned " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
-	exit(1);
-}
-
-	std::cout << "Total time: " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - total_start).count() << " ms\n";
-
-	auto total_start_without_memory = std::chrono::high_resolution_clock::now();
-
-	int blockSize = 64; // # of threads per block, maximum is 1024.
-	const uint64_t calc_N = N;
-	const uint64_t calc_blockSize = blockSize;
-	const uint64_t calc_numBlocks = (calc_N + calc_blockSize - 1) / (calc_blockSize * 16);
-	int numBlocks = calc_numBlocks;
-	std::cout << " Block configuration: [blockSize:" << blockSize << " numBlocks:" << numBlocks << "]" << std::endl;
-
-	//batches = 2;
-	int64_t total_compute_time_ms = 0;
-	int64_t total_transfer_time_ms = 0;
-	uint32_t total_f2_results_count = 0;
-
-	// map for table 1.
-	{
-		T1_Pairing_Chunk *t1_pairing_chunks = (T1_Pairing_Chunk *) device_pairing_chunks;
-		F1_Bucketed_kBC_Entry *local_kbc_entries = (F1_Bucketed_kBC_Entry *) device_bucketed_meta_entries;
-		T1_Pairing_Chunk *host_t1_pairing_chunks = (T1_Pairing_Chunk *) host_copy_buffer;
-		uint32_t batches_to_go = BATCHES;
-		while (batches_to_go > 0) {
-
-			std::cout << " gpuScanIntoKbcBuckets BATCHES to go: " << batches_to_go << std::endl <<
-					" SPANNING FOR BUCKETS count:" << (KBC_END - KBC_START + 1) << " KBC_START: " << KBC_START << " KBC_END: " << KBC_END << std::endl;
-			std::cout << " Generating F1 results into kbc buckets...";
-			auto batch_start = std::chrono::high_resolution_clock::now();
-			auto start = std::chrono::high_resolution_clock::now();
-
-			// don't forget to clear counter...
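-			// (Assumption, added: the chacha kernel claims bucket slots with an
-			// atomicAdd on kbc_local_num_entries, roughly
-			//   int slot = atomicAdd(&kbc_local_num_entries[bucket], 1);
-			//   kbc_local_entries[bucket * KBC_MAX_ENTRIES_PER_BUCKET + slot] = entry;
-			// so stale counts from the previous batch would overflow the 400-entry
-			// buckets. Hence the memset below before every batch.)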
-			CUDA_CHECK_RETURN(cudaMemset(local_kbc_num_entries, 0, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-
-			gpu_chacha8_get_k32_keystream_into_local_kbc_entries<<<numBlocks, blockSize>>>(N, chacha_input,
-					local_kbc_entries, local_kbc_num_entries, KBC_START, KBC_END);
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-			auto finish = std::chrono::high_resolution_clock::now();
-			std::cout << " done. " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count() << " ms\n";
-
-			std::cout << " Finding matches...";
-			(*pairing_chunks_count) = 0; // reset the device-visible result counter
-			CUDA_CHECK_RETURN(cudaMemset(global_kbc_num_entries, 0, (kBC_LAST_BUCKET_ID+1)*sizeof(int)));
-			gpu_find_f1_matches<<<(KBC_LOCAL_NUM_BUCKETS-1), 256>>>(KBC_START, KBC_END,
-				local_kbc_entries, local_kbc_num_entries,
-				t1_pairing_chunks, pairing_chunks_count, MAX_RESULTS);
-			//gpu_find_fx_matches<<<(KBC_LOCAL_NUM_BUCKETS-1), 256>>>(KBC_START, KBC_END,
-			//	local_kbc_entries, local_kbc_num_entries,
-			//	t1_pairing_chunks, t1_pairing_chunks_count, MAX_RESULTS);
-			//gpu_find_matches<<<1, 64>>>(1,2, KBC_MAX_ENTRIES_PER_BUCKET, local_kbc_entries, local_kbc_num_entries);
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-			finish = std::chrono::high_resolution_clock::now();
-
-			total_compute_time_ms += std::chrono::duration_cast<std::chrono::milliseconds>(finish - batch_start).count();
-			std::cout << " done. " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count() << " ms\n";
-
-			// now copy pair results to CPU memory.
-			int num_results = (*pairing_chunks_count);
-			total_f2_results_count += num_results;
-			std::cout << " Copying " << num_results << " T1 pairing chunks to CPU...";
-			start = std::chrono::high_resolution_clock::now();
-			CUDA_CHECK_RETURN(cudaMemcpy(host_t1_pairing_chunks,t1_pairing_chunks,num_results*sizeof(T1_Pairing_Chunk),cudaMemcpyDeviceToHost));
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-			finish = std::chrono::high_resolution_clock::now();
-			total_transfer_time_ms += std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count();
-			std::cout << " done. " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count() << " ms\n";
-
-			// and move from CPU memory into reserved criss cross buckets
-			std::cout << " Moving pairing chunks into CPU criss cross storage\n";
-			start = std::chrono::high_resolution_clock::now();
-			uint32_t batch_id = BATCHES-batches_to_go;
-			Tx_Bucketed_Meta2 *host_cast = (Tx_Bucketed_Meta2 *) host_criss_cross_store;
-			//cpuT1MoveCopyBufferToCrissCross(batch_id, host_t1_pairing_chunks, num_results, host_cast, &criss_cross_num_entries[batch_id]);
-			finish = std::chrono::high_resolution_clock::now();
-			total_transfer_time_ms += std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count();
-			std::cout << " done.
" << std::chrono::duration_cast(finish - start).count() << " ms\n"; - - - std::cout << " ** batch finish ** " << std::chrono::duration_cast(finish - batch_start).count() << " ms\n"; - batches_to_go--; - - KBC_START += KBCS_PER_BATCH; - //if (BATCHES == 0) { - // KBC_END = kBC_LAST_BUCKET_ID; - //} else { - KBC_END = KBC_START + KBCS_PER_BATCH; - //} - if ((KBC_END - KBC_START + 1) > KBC_LOCAL_NUM_BUCKETS) { - std::cout << "ERROR: kbc span is more than local buckets allocated!\n" << std::endl; - } - } - } - - - finish = std::chrono::high_resolution_clock::now(); - std::cout << "*********************" << std::endl; - std::cout << "Total time: " << std::chrono::duration_cast(finish - total_start).count() << " ms\n"; - std::cout << " w/o alloc: " << std::chrono::duration_cast(finish - total_start_without_memory).count() << " ms\n"; - std::cout << " gpu compute: " << total_compute_time_ms << " ms\n"; - std::cout << " transfer: " << total_transfer_time_ms << " ms\n"; - - std::cout << "*********************" << std::endl; - /*uint32_t total_entries = 0; - for (int bucket_id=0;bucket_id<2;bucket_id++) { // NICK_NUM_BUCKETS;i++) { - int num = local_kbc_num_entries[bucket_id]; - std::cout << "KBC LOCAL num entries bucket " << bucket_id << " : " << num << std::endl; - total_entries += num; - //for (int i=0;i -#include -#include - - -// bladebit -// phase 1: 209s -// phase 2: 25s -// phase 3: 102s -// phase 4: <1s - -const uint64_t PHASE_2_MAX_BYTES_PER_UNIT = 12; // enter max. bytes used per entry for any of the tables -const uint64_t PHASE_2_ALLOCATED_BYTES_PER_TABLE = PHASE_2_MAX_BYTES_PER_UNIT * DEVICE_BUFFER_ALLOCATED_ENTRIES; // enter max. bytes used per entry for any of the tables - -uint32_t num_set_t4 = 0; -uint32_t num_same_addresses = 0; -uint32_t num_set_t5 = 0; - -void readT2BlockFilesToHostMem(uint32_t batch_id, T2BaseRef *t2_data, uint32_t *num_entries) { - for (uint32_t block_id = 0; block_id < BATCHES; block_id++) { - std::string filename = "/mnt/kioxia/tmp/T2-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - //if (batch_id == 0) { - // std::cout << "Reading file [" << filename << "]"; - //} else { - // std::cout << " [" << filename << "]"; - //} - FILE* pFile; - pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing?? - if (fread(&num_entries[block_id], sizeof(uint32_t), 1, pFile)) { - //std::cout << " num_entries: " << num_entries[block_id] << std::endl; - if (fread(t2_data, sizeof(T2BaseRef), num_entries[block_id], pFile)) { - //std::cout << "success."; - } else { - std::cout << "failed."; - } - } - fclose(pFile); - //if (batch_id == BATCHES-1) { - // std::cout << " done." << std::endl; - //} - //for (int i = 0; i < 1; i++) { - // std::cout << "Value " << i << " is: " << t2_data[0].Lx1 << std::endl; - //} - } -} - -void readTxBackRefBlockFilesToHostMem(uint32_t table, uint32_t batch_id, BackRef *tx_data, uint32_t *num_entries) { - for (uint32_t block_id = 0; block_id < BATCHES; block_id++) { - std::string filename = "/mnt/kioxia/tmp/T"+std::to_string(table)+"BackRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - if (batch_id == 0) { - std::cout << "Reading file [" << filename << "]"; - } else { - std::cout << " [" << filename << "]"; - } - FILE* pFile; - pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing?? 
-		//uint32_t num_entries;
-		if (fread(&num_entries[block_id], sizeof(uint32_t), 1, pFile)) {
-			std::cout << " num_entries: " << num_entries[block_id] << std::endl;
-			if (fread(tx_data, sizeof(BackRef), num_entries[block_id], pFile)) {
-				std::cout << "success.";
-			} else {
-				std::cout << "failed.";
-			}
-		} else {
-			std::cout << "Failed to read count " << std::endl;
-		}
-		fclose(pFile);
-		if (batch_id == BATCHES-1) {
-			std::cout << " done." << std::endl;
-		}
-	}
-
-}
-
-void readT6BackRefBlockFilesToHostMem(uint32_t batch_id, uint32_t block_id, T6BackRef *tx_data, uint32_t &num_entries) {
-	std::string filename = "/mnt/kioxia/tmp/T6BackRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp";
-
-	FILE* pFile;
-	pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing??
-	if (fread(&num_entries, sizeof(uint32_t), 1, pFile)) {
-		std::cout << "reading..." << num_entries << std::endl;
-		if (!fread(tx_data, sizeof(T6BackRef), num_entries, pFile)) {
-			std::cout << "failed.";
-		}
-	}
-	fclose(pFile);
-}
-
-void readT2BlockFile(uint32_t batch_id, uint32_t block_id, T2BaseRef *t2_data, uint32_t &num_entries) {
-	std::string filename = "/mnt/kioxia/tmp/T2-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp";
-	//if (batch_id == 0) {
-	//	std::cout << "Reading file [" << filename << "]";
-	//} else {
-	//	std::cout << " [" << filename << "]";
-	//}
-	FILE* pFile;
-	pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing??
-	if (fread(&num_entries, sizeof(uint32_t), 1, pFile)) {
-		//std::cout << " num_entries: " << num_entries << std::endl;
-		if (fread(t2_data, sizeof(T2BaseRef), num_entries, pFile)) {
-			//std::cout << "success.";
-		} else {
-			std::cout << "failed.";
-		}
-	}
-	fclose(pFile);
-	//if (batch_id == BATCHES-1) {
-	//	std::cout << " done." << std::endl;
-	//}
-	//for (int i = 0; i < 1; i++) {
-	//	std::cout << "Value " << i << " is: " << t2_data[0].Lx1 << std::endl;
-	//}
-}
-
-void readBackRefBlockFile(uint32_t table, uint32_t batch_id, uint32_t block_id, BackRef *tx_data, uint32_t &num_entries) {
-	std::string filename = "/mnt/kioxia/tmp/T"+std::to_string(table)+"BackRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp";
-	FILE* pFile;
-	//std::cout << "reading " << filename << std::endl;
-	pFile = fopen(filename.c_str(), "rb");
-	if (fread(&num_entries, sizeof(uint32_t), 1, pFile)) {
-		if (!fread(tx_data, sizeof(BackRef), num_entries, pFile)) {
-			std::cout << "failed reading " << filename;
-		}
-	}
-	fclose(pFile);
-}
-
-void readT6BlockFile(uint32_t batch_id, uint32_t block_id, T6BackRef *t6_data, uint32_t &num_entries) {
-	std::string filename = "/mnt/kioxia/tmp/T6BackRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp";
-	FILE* pFile;
-	pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing??
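-	// (Added robustness note: fopen() results are never checked in these readers, so
-	// a missing block file crashes on the first fread. A defensive sketch would be:
-	//   if (pFile == NULL) { std::cout << "missing " << filename << std::endl; num_entries = 0; return; })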
- if (fread(&num_entries, sizeof(uint32_t), 1, pFile)) { - if (fread(t6_data, sizeof(T6BackRef), num_entries, pFile)) { - //std::cout << "success."; - } else { - std::cout << "failed."; - } - } - fclose(pFile); -} - -void readT3BaseRefBlockFile(uint32_t batch_id, uint32_t block_id, T3BaseRef *t3_data, uint32_t &num_entries) { - std::string filename = "/mnt/kioxia/tmp/T3BaseRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - FILE* pFile; - pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing?? - if (fread(&num_entries, sizeof(uint32_t), 1, pFile)) { - if (fread(t3_data, sizeof(T3BaseRef), num_entries, pFile)) { - //std::cout << "success."; - } else { - std::cout << "failed."; - } - } - fclose(pFile); -} - -// should total around 48GB...so maybe don't have to write to disk... -void writeT6FinalBlockFile(uint32_t batch_id, uint32_t block_id, T6FinalEntry *t6_final_data, uint32_t &num_entries) { - if (num_entries == 0) { - return; - } - std::string filename = "/mnt/kioxia/tmp/T6Final-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - //if (batch_id == 0) { - // std::cout << "Writing backref to file [" << filename << "]"; - //} else { - // std::cout << " [" << filename << "]"; - //} - FILE* pFile; - pFile = fopen(filename.c_str(), "wb"); // 41228ms for block level writing, 40912ms for batch writing?? - fwrite(&num_entries, sizeof(uint32_t), 1, pFile); // write the num entries first. - fwrite(t6_final_data, 1, num_entries * sizeof(T6FinalEntry), pFile); - fclose(pFile); - //if (batch_id == BATCHES-1) { - // std::cout << " done." << std::endl; - //} - -} - -void readT2BlockEntry(uint32_t batch_id, uint32_t block_id, uint32_t idx, T2BaseRef *t2_entry) { - std::string filename = "/mnt/kioxia/tmp/T2-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - uint32_t seekpos = idx * sizeof(T2BaseRef) + sizeof(uint32_t); - std::cout << "Reading single entry from " << filename << " pos: " << seekpos << std::endl; - FILE* pFile; - - pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing?? - fseek ( pFile , seekpos , SEEK_SET ); - fread(t2_entry, sizeof(T2BaseRef), 1, pFile); - fclose(pFile); -} - -void readT3BlockEntry(uint32_t batch_id, uint32_t block_id, uint32_t idx, T3BaseRef *t3_entry) { - std::string filename = "/mnt/kioxia/tmp/T3BaseRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - uint32_t seekpos = idx * sizeof(T3BaseRef) + sizeof(uint32_t); - std::cout << "Reading single entry from " << filename << " pos: " << seekpos << std::endl; - FILE* pFile; - - pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing?? - fseek ( pFile , seekpos , SEEK_SET ); - fread(t3_entry, sizeof(T3BaseRef), 1, pFile); - fclose(pFile); -} - -void readBackRefBlockEntry(uint32_t table, uint32_t batch_id, uint32_t block_id, uint32_t idx, BackRef *return_data) { - std::string filename = "/mnt/kioxia/tmp/T" + std::to_string(table) + "BackRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - uint32_t seekpos = idx * sizeof(BackRef) + sizeof(uint32_t); - std::cout << "Reading single entry from " << filename << " pos: " << seekpos << std::endl; - FILE* pFile; - - pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing?? 
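-	// (Added note: the fseek below jumps straight to entry idx. seekpos skips the
-	// uint32_t entry-count header, then offsets by idx fixed-size BackRef records.)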
- fseek ( pFile , seekpos , SEEK_SET ); - fread(return_data, sizeof(BackRef), 1, pFile); - fclose(pFile); -} - -void backPropagate(uint32_t table, uint32_t batch_id, uint32_t block_id, uint32_t idx) { - std::cout << "Back propagate to table: " << table << " batch_id:" << batch_id << " block_id:" << block_id << " idx:" << idx << std::endl; - BackRef entry; - readBackRefBlockEntry(table, batch_id, block_id, idx, &entry); - //std::cout << "Ready entry L:" << entry_data.prev_block_ref_L << " R:" << entry_data.prev_block_ref_R << std::endl; - uint32_t prev_block_id_L = entry.prev_block_ref_L >> (32 - 6); - uint32_t prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF; - uint32_t prev_block_id_R = entry.prev_block_ref_R >> (32 - 6); - uint32_t prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF; - printf("T%uBackRef batch_id:%u block_id:%u! L:%u R:%u L_block_id:%u L_idx:%u R_block_id:%u R_idx:%u y:%u\n", - table, batch_id, block_id, entry.prev_block_ref_L, entry.prev_block_ref_R, - prev_block_id_L, prev_idx_L, - prev_block_id_R, prev_idx_R); - /*if (table > 3) { - backPropagate(table-1, prev_block_id_L, batch_id, prev_idx_L); - backPropagate(table-1, prev_block_id_R, batch_id, prev_idx_R); - } else if (table == 3) { - // read T2 entries right? - T2BaseRef L, R; - readT2BlockEntry(prev_block_id_L, batch_id, prev_idx_L, &L); - readT2BlockEntry(prev_block_id_R, batch_id, prev_idx_R, &R); - printf("T2 L: %u %u\n", L.Lx1, L.Lx2); - printf("T2 R: %u %u\n", R.Lx1, R.Lx2); - }*/ - - if (table > 4) { - backPropagate(table-1, prev_block_id_L, batch_id, prev_idx_L); - backPropagate(table-1, prev_block_id_R, batch_id, prev_idx_R); - } else if (table == 4) { - // read T3 entries right? - T3BaseRef L, R; - readT3BlockEntry(prev_block_id_L, batch_id, prev_idx_L, &L); - readT3BlockEntry(prev_block_id_R, batch_id, prev_idx_R, &R); - printf("T3 pos: %u L: %u %u %u %u\n", prev_idx_L, L.Lx1, L.Lx2, L.Lx3, L.Lx4); - printf("T3 pos: %u R: %u %u %u %u\n", prev_idx_R, R.Lx1, R.Lx2, R.Lx3, R.Lx4); - } - -} - - - -// try to see if we have correct back propagation values stored. 
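-// (Added sketch of the packing that backPropagate() decodes; the helper name is
-// illustrative, not from the original source. The block id sits in the top 6 bits,
-// since BATCHES = 64 fits in 6 bits, and the in-block index in the low 26 bits:
-//   uint32_t pack_backref(uint32_t block_id, uint32_t idx) {
-//       return (block_id << 26) | (idx & 0x3FFFFFF);
-//   }
-// so block_id = ref >> (32 - 6) and idx = ref & 0x3FFFFFF, as used above and below.)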
-// y = 573855352 -// xs 602009779,2127221679, 3186459061,443532047, 1234434947,1652736830, 396228306,464118917,3981993340, -// 3878862024,1730679522,3234011360,521197720,2635193875,2251292298,608281027,1468569780,2075860307, -// 2880258779,999340005,1240438978,4293399624,4226635802,1031429862,2391120891,3533658526,3823422504, -// 3983813271,4180778279,2403148863,2441456056,319558395,2338010591,196206622,1637393731,853158574,2704638588, -// 2368357012,1703808356,451208700,2145291166,2741727812,3305809226,1748168268,415625277,3051905493,4257489502, -// 1429077635,2438113590,3028543211,3993396297,2678430597,458920999,889121073,3577485087,1822568056,2222781147, -// 1942400192,195608354,1460166215,2544813525,3231425778,2958837604,2710532969 - - -/* - * this is what a single solution looks like file-wise - * -rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-10-15.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-11-0.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-13-51.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-15-12.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-17-3.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-19-3.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-25-51.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-26-17.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-30-55.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-31-0.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-33-1.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-34-15.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-4-1.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-43-17.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-53-12.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-60-55.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-0-10.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-12-51.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-1-43.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-15-43.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-17-51.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-3-10.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-51-35.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-55-35.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-10-38.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-35-40.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-43-38.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-51-40.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T5BackRef-38-5.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T5BackRef-40-5.tmp --rw-rw-r-- 1 nick nick 16 Okt 19 11:16 T6BackRef-5-8.tmp - * - */ - -// y = 573855352 -// xs 602009779,2127221679,3186459061,443532047,1234434947,1652736830,396228306,464118917,3981993340,3878862024,1730679522,3234011360,521197720,2635193875,2251292298,608281027,1468569780,2075860307,2880258779,999340005,1240438978,4293399624,4226635802,1031429862,2391120891,3533658526,3823422504,3983813271,4180778279,2403148863,2441456056,319558395,2338010591,196206622,1637393731,853158574,2704638588,2368357012,1703808356,451208700,2145291166,2741727812,3305809226,1748168268,415625277,3051905493,4257489502,1429077635,2438113590,3028543211,3993396297,2678430597,458920999,889121073,3577485087,1822568056,2222781147,1942400192,195608354,1460166215,2544813525,3231425778,2958837604,2710532969 -void findYsolution(char *memstore) { - if (memstore == NULL) { - memstore = (char *) malloc(1738014720); - } - uint32_t y = 573855352; - std::cout << "findYsolution: " << y << std::endl; - T6BackRef *t6_data = (T6BackRef *) &memstore[0]; - - // how to back propagate all? 
-	// read batch. Sort by all blocks. Then read the batch related to the sorted blocks.
-	// loop
-	//uint32_t t6_num;
-	//readT6BlockFile(0,0,t6_data, t6_num);
-
-	//for (uint32_t batch_id = 0; batch_id < BATCHES; batch_id++) {
-		//std::cout << "Scanning T6 batch " << batch_id << std::endl;
-
-	//	for (uint32_t block_id = 0; block_id < BATCHES; block_id++) {
-	uint32_t batch_id = 5;
-	uint32_t block_id = 8;
-	uint32_t num_entries;
-	readT6BlockFile(batch_id,block_id,t6_data, num_entries);
-	std::cout << "Scanning T6 batch-block " << batch_id << "-" << block_id << " : " << num_entries << " entries" << std::endl;
-
-	for (int i=0;i<num_entries;i++) {
-		T6BackRef entry = t6_data[i];
-		if (entry.y == y) {
-			uint32_t prev_block_id_L = entry.prev_block_ref_L >> (32 - 6);
-			uint32_t prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF;
-			uint32_t prev_block_id_R = entry.prev_block_ref_R >> (32 - 6);
-			uint32_t prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF;
-			printf("T6BackRef Y FOUND! L:%u R:%u L_block_id:%u L_idx:%u R_block_id:%u R_idx:%u y:%u\n",
-				entry.prev_block_ref_L, entry.prev_block_ref_R,
-				prev_block_id_L, prev_idx_L,
-				prev_block_id_R, prev_idx_R,
-				entry.y);
-			backPropagate(5,prev_block_id_L, batch_id, prev_idx_L );
-			backPropagate(5,prev_block_id_R, batch_id, prev_idx_R );
-		}
-	}
-	//	}
-	//}
-
-}
-
-__global__
-void gpu_set_t6_final_data_and_t4_tags_directly(const uint32_t N, T6BackRef *t6_data, BackRef *t5_data, T6FinalEntry *t6_final_data, uint32_t *t4_tags) {
-	uint32_t i = blockIdx.x*blockDim.x+threadIdx.x;
-	if (i < N) {
-		T6BackRef entry = t6_data[i];
-		uint64_t t6_prev_block_id_L = entry.prev_block_ref_L >> (32 - 6);
-		uint64_t t6_prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF;
-		uint64_t t6_prev_block_id_R = entry.prev_block_ref_R >> (32 - 6);
-		uint64_t t6_prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF;
-
-		// now back-reference into t5...
-		BackRef t5_L, t5_R;
-		uint32_t t5_address_L = HOST_MAX_BLOCK_ENTRIES * t6_prev_block_id_L + t6_prev_idx_L;
-		uint32_t t5_address_R = HOST_MAX_BLOCK_ENTRIES * t6_prev_block_id_R + t6_prev_idx_R;
-		t5_L = t5_data[t5_address_L];
-		t5_R = t5_data[t5_address_R];
-		uint64_t t5_L_prev_block_id_L = t5_L.prev_block_ref_L >> (32 - 6);
-		uint64_t t5_L_prev_idx_L = t5_L.prev_block_ref_L & 0x3FFFFFF;
-		uint64_t t5_L_prev_block_id_R = t5_L.prev_block_ref_R >> (32 - 6);
-		uint64_t t5_L_prev_idx_R = t5_L.prev_block_ref_R & 0x3FFFFFF;
-		uint64_t t5_R_prev_block_id_L = t5_R.prev_block_ref_L >> (32 - 6);
-		uint64_t t5_R_prev_idx_L = t5_R.prev_block_ref_L & 0x3FFFFFF;
-		uint64_t t5_R_prev_block_id_R = t5_R.prev_block_ref_R >> (32 - 6);
-		uint64_t t5_R_prev_idx_R = t5_R.prev_block_ref_R & 0x3FFFFFF;
-
-		T6FinalEntry final_entry = {};
-		final_entry.refL = t5_L_prev_block_id_L + (t5_L_prev_block_id_R << 6) + (t6_prev_block_id_L << 12);
-		final_entry.refR = t5_R_prev_block_id_L + (t5_R_prev_block_id_R << 6) + (t6_prev_block_id_R << 12);
-		//std::cout << "T6 Final set: [" << t5_L_prev_block_id_L << " | " << t5_L_prev_block_id_R << "] - " << t6_prev_block_id_L << std::endl;
-		//std::cout << "              [" << t5_R_prev_block_id_L << " | " << t5_R_prev_block_id_R << "] - " << t6_prev_block_id_R << std::endl;
-		final_entry.y = entry.y;
-		t6_final_data[i] = final_entry;
-
-		// directly set t4 tags
-		if (true) { // w/ this it is 571ms, without it 440ms. Max optimization is 8 seconds over 64 batches.
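-			// (Added note: the t4 tag bitmap is addressed as one flat "virtual file",
-			//   address = file_batch_id * (BATCHES * HOST_MAX_BLOCK_ENTRIES)
-			//           + file_block_id * HOST_MAX_BLOCK_ENTRIES + file_idx
-			// one bit per potential T4 entry, packed 32 per uint32_t word, hence the
-			// /32 and %32 in what follows.)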
- uint32_t value; - uint64_t file_batch_id, file_block_id, file_idx; - uint64_t address; - uint32_t bits_to_set; - file_batch_id = t5_L_prev_block_id_L; file_block_id = t6_prev_block_id_L; file_idx = t5_L_prev_idx_L; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - atomicOr(&t4_tags[address / 32], bits_to_set); - - file_batch_id = t5_L_prev_block_id_R; file_block_id = t6_prev_block_id_L; file_idx = t5_L_prev_idx_R; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - atomicOr(&t4_tags[address / 32], bits_to_set); - - file_batch_id = t5_R_prev_block_id_L; file_block_id = t6_prev_block_id_R; file_idx = t5_R_prev_idx_L; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - atomicOr(&t4_tags[address / 32], bits_to_set); - - file_batch_id = t5_R_prev_block_id_R; file_block_id = t6_prev_block_id_R; file_idx = t5_R_prev_idx_R; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - atomicOr(&t4_tags[address / 32], bits_to_set); - } - - } -} - -__global__ -void gpu_backref_t5_tag(const uint32_t N, T6BackRef *t6_data, BackRef *t5_data, T6FinalEntry *t6_final_data, uint32_t *t5_tags) { - uint32_t i = blockIdx.x*blockDim.x+threadIdx.x; - if (i < N) { - T6BackRef entry = t6_data[i]; - uint64_t t6_prev_block_id_L = entry.prev_block_ref_L >> (32 - 6); - uint64_t t6_prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF; - uint64_t t6_prev_block_id_R = entry.prev_block_ref_R >> (32 - 6); - uint64_t t6_prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF; - - // now could back ref t5... - BackRef t5_L, t5_R; - uint32_t t5_address_L = HOST_MAX_BLOCK_ENTRIES * t6_prev_block_id_L + t6_prev_idx_L; - uint32_t t5_address_R = HOST_MAX_BLOCK_ENTRIES * t6_prev_block_id_R + t6_prev_idx_R; - t5_L = t5_data[t5_address_L]; - t5_R = t5_data[t5_address_R]; - uint64_t t5_L_prev_block_id_L = t5_L.prev_block_ref_L >> (32 - 6); - uint64_t t5_L_prev_idx_L = t5_L.prev_block_ref_L & 0x3FFFFFF; - uint64_t t5_L_prev_block_id_R = t5_L.prev_block_ref_R >> (32 - 6); - uint64_t t5_L_prev_idx_R = t5_L.prev_block_ref_R & 0x3FFFFFF; - uint64_t t5_R_prev_block_id_L = t5_R.prev_block_ref_L >> (32 - 6); - uint64_t t5_R_prev_idx_L = t5_R.prev_block_ref_L & 0x3FFFFFF; - uint64_t t5_R_prev_block_id_R = t5_R.prev_block_ref_R >> (32 - 6); - uint64_t t5_R_prev_idx_R = t5_R.prev_block_ref_R & 0x3FFFFFF; - - // tag addresses that were used here... 
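-		// (Added note: one bit per t5 entry, 32 tags per uint32_t word, i.e.
-		// word = address / 32, bit = 1 << (address % 32). atomicOr keeps concurrent
-		// tagging from many threads race-free, at the cost of scattered global writes.)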
- - uint32_t bits_to_set; - bits_to_set = 1 << (t5_address_L % 32); - atomicOr(&t5_tags[t5_address_L / 32], bits_to_set); - - bits_to_set = 1 << (t5_address_R % 32); - atomicOr(&t5_tags[t5_address_R / 32], bits_to_set); - - - T6FinalEntry final_entry = {}; - final_entry.refL = t5_L_prev_block_id_L + (t5_L_prev_block_id_R << 6) + (t6_prev_block_id_L << 12); - final_entry.refR = t5_R_prev_block_id_L + (t5_R_prev_block_id_R << 6) + (t6_prev_block_id_R << 12); - //std::cout << "T6 Final set: [" << t5_L_prev_block_id_L << " | " << t5_L_prev_block_id_R << "] - " << t6_prev_block_id_L << std::endl; - //std::cout << " [" << t5_R_prev_block_id_L << " | " << t5_R_prev_block_id_R << "] - " << t6_prev_block_id_R << std::endl; - final_entry.y = entry.y; - t6_final_data[i] = final_entry; - } -} - -// t6's map to t4's, t5's map to t3's -__global__ -void gpu_backref_t4_tag(const uint32_t N, BackRef *t4_data, T3BaseRef *t3_data, T4FinalEntry *t4_final_data, uint32_t *t4_tags) { - uint32_t i = blockIdx.x*blockDim.x+threadIdx.x; - if (i < N) { - BackRef entry = t4_data[i]; - uint64_t t4_prev_block_id_L = entry.prev_block_ref_L >> (32 - 6); - uint64_t t4_prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF; - uint64_t t4_prev_block_id_R = entry.prev_block_ref_R >> (32 - 6); - uint64_t t4_prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF; - - // now could back ref t5... - T3BaseRef t3_L, t3_R; - uint32_t t3_address_L = HOST_MAX_BLOCK_ENTRIES * t4_prev_block_id_L + t4_prev_idx_L; - uint32_t t3_address_R = HOST_MAX_BLOCK_ENTRIES * t4_prev_block_id_R + t4_prev_idx_R; - t3_L = t3_data[t3_address_L]; - t3_R = t3_data[t3_address_R]; - - T4FinalEntry finalEntry; - finalEntry.Lx1 = t3_L.Lx1; - finalEntry.Lx2 = t3_L.Lx2; - finalEntry.Lx3 = t3_L.Lx3; - finalEntry.Lx4 = t3_L.Lx4; - finalEntry.Lx5 = t3_R.Lx1; - finalEntry.Lx6 = t3_R.Lx2; - finalEntry.Lx7 = t3_R.Lx3; - finalEntry.Lx8 = t3_R.Lx4; - - t4_final_data[i] = finalEntry; - } -} - -__global__ -void gpu_backref_t4_lxlists(const uint32_t N, BackRef *t4_data, T3BaseRef *t3_data, uint32_t *t4_lx_list, uint32_t *t4_tags) { - uint32_t i = blockIdx.x*blockDim.x+threadIdx.x; - if (i < N) { - BackRef entry = t4_data[i]; - uint64_t t4_prev_block_id_L = entry.prev_block_ref_L >> (32 - 6); - uint64_t t4_prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF; - uint64_t t4_prev_block_id_R = entry.prev_block_ref_R >> (32 - 6); - uint64_t t4_prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF; - - // now could back ref t5... 
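-		// (Added clarification: despite the copied comment above, this kernel
-		// back-references into T3. Each T4 entry resolves its L and R sides to two
-		// T3BaseRef entries, yielding the 8 Lx values written out below.)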
- T3BaseRef t3_L, t3_R; - uint32_t t3_address_L = HOST_MAX_BLOCK_ENTRIES * t4_prev_block_id_L + t4_prev_idx_L; - uint32_t t3_address_R = HOST_MAX_BLOCK_ENTRIES * t4_prev_block_id_R + t4_prev_idx_R; - t3_L = t3_data[t3_address_L]; - t3_R = t3_data[t3_address_R]; - - uint32_t base_address = i*8; - t4_lx_list[base_address+0] = t3_L.Lx1; - t4_lx_list[base_address+1] = t3_L.Lx2; - t4_lx_list[base_address+2] = t3_L.Lx3; - t4_lx_list[base_address+3] = t3_L.Lx4; - t4_lx_list[base_address+4] = t3_R.Lx1; - t4_lx_list[base_address+5] = t3_R.Lx2; - t4_lx_list[base_address+6] = t3_R.Lx3; - t4_lx_list[base_address+7] = t3_R.Lx4; - - } -} - -__global__ -void gpu_t5_tag_to_t4(const uint32_t N, const uint32_t t5_block_id, BackRef *t5_data, uint32_t *t5_tags, uint32_t *t4_tags) { - uint32_t i = blockIdx.x*blockDim.x+threadIdx.x; - if (i < N) { - uint32_t t5_address = i; - uint32_t bits_to_set = 1 << (t5_address % 32); - uint32_t has_set = t5_tags[t5_address / 32] & bits_to_set; - if (has_set > 0) { - BackRef t5_entry = t5_data[t5_address]; - uint64_t t5_L_prev_block_id_L = t5_entry.prev_block_ref_L >> (32 - 6); - uint64_t t5_L_prev_idx_L = t5_entry.prev_block_ref_L & 0x3FFFFFF; - uint64_t t5_L_prev_block_id_R = t5_entry.prev_block_ref_R >> (32 - 6); - uint64_t t5_L_prev_idx_R = t5_entry.prev_block_ref_R & 0x3FFFFFF; - - uint64_t file_batch_id, file_block_id, file_idx; - uint64_t address; - uint32_t bits_to_set; - - file_batch_id = t5_L_prev_block_id_L; file_block_id = t5_block_id; file_idx = t5_L_prev_idx_L; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - atomicOr(&t4_tags[address / 32], bits_to_set); - - file_batch_id = t5_L_prev_block_id_R; file_block_id = t5_block_id; file_idx = t5_L_prev_idx_R; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - atomicOr(&t4_tags[address / 32], bits_to_set); - } - } -} - -void tagPreviousTable(uint32_t t5_block_id, BackRef *t5_data, uint32_t num_entries, uint32_t *t5_tags, uint32_t *t4_tags) { - // we have to read all T2 entries and merge into T3 table that then contains 4 Lx entries. 
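-	// (Added clarification: this is the CPU fallback of gpu_t5_tag_to_t4 above. It
-	// walks this block's t5 tag bitmap and sets the matching t4 tag bits host-side.)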
- //std::cout << " doing table block " << t5_block_id << std::endl; - for (int i=0;i 0) { - //std::cout << "WAS SET: t5 block_id: " << t5_block_id << " entry i: " << i << std::endl; - BackRef t5_entry = t5_data[t5_address]; - uint64_t t5_L_prev_block_id_L = t5_entry.prev_block_ref_L >> (32 - 6); - uint64_t t5_L_prev_idx_L = t5_entry.prev_block_ref_L & 0x3FFFFFF; - uint64_t t5_L_prev_block_id_R = t5_entry.prev_block_ref_R >> (32 - 6); - uint64_t t5_L_prev_idx_R = t5_entry.prev_block_ref_R & 0x3FFFFFF; - - uint64_t file_batch_id, file_block_id, file_idx; - uint64_t address; - uint32_t bits_to_set; - file_batch_id = t5_L_prev_block_id_L; file_block_id = t5_block_id; file_idx = t5_L_prev_idx_L; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - address = address / 32; - //uint32_t has_set = t4_tags[address] & bits_to_set; - //if (has_set == 0) printf("error did not set first time some address mistake\n"); - t4_tags[address] |= bits_to_set; - - file_batch_id = t5_L_prev_block_id_R; file_block_id = t5_block_id; file_idx = t5_L_prev_idx_R; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - address = address / 32; - //has_set = t4_tags[address] & bits_to_set; - //if (has_set == 0) printf("error did not set first time some address mistake\n"); - - t4_tags[address] |= bits_to_set; - - num_set_t4 += 2; - } - } - //std::cout << " done table block " << t5_block_id << std::endl; -} - -void createT6FinalEntries_oldbenchmarks(char *memstore) { - // 2) T6 must propagate down to T4 and tag all used entries, and then update T6 references to include T4. - // 3) T6 reads one block at a time for each batch, small memory print - // - then T5 one whole batch, since each block references 0..BATCHES - // - T4 tag list can be set (booleans) - // - update T6 data to include y, 6,6,6 and 6,6,6 references - const uint64_t T4_TAG_MEM_BYTES_NEEDED = (HOST_MAX_BLOCK_ENTRIES * ((uint64_t) (BATCHES * BATCHES)) * sizeof(uint32_t)) / 32; - const uint64_t T5_TAG_MEM_BYTES_NEEDED = T4_TAG_MEM_BYTES_NEEDED; // (HOST_MAX_BLOCK_ENTRIES * ((uint64_t) (BATCHES)) * sizeof(uint32_t)) / 32; - const uint64_t T6_MEM_BYTES_NEEDED = HOST_MAX_BLOCK_ENTRIES * sizeof(T6BackRef); - const uint64_t T6_FINAL_MEM_BYTES_NEEDED = HOST_MAX_BLOCK_ENTRIES * sizeof(T6FinalEntry); - const uint64_t T5_MEM_BYTES_NEEDED = HOST_MAX_BLOCK_ENTRIES * ((uint64_t) (BATCHES)) * sizeof(BackRef); - const uint64_t TOTAL_MEM_BYTES_NEEDED = T4_TAG_MEM_BYTES_NEEDED + T5_TAG_MEM_BYTES_NEEDED + T6_MEM_BYTES_NEEDED + T6_FINAL_MEM_BYTES_NEEDED + T5_MEM_BYTES_NEEDED; - - T6BackRef *device_t6_data; - BackRef *device_t5_data; - T6FinalEntry *device_t6_final_data; - uint32_t *device_t4_tags; - uint32_t *device_t5_tags; - - if (memstore==NULL) { - std::cout << "Allocating memory bytes: " << TOTAL_MEM_BYTES_NEEDED << std::endl; - //memstore = (char *) malloc(TOTAL_MEM_BYTES_NEEDED); - CUDA_CHECK_RETURN(cudaMallocHost((void**)&memstore, TOTAL_MEM_BYTES_NEEDED)); // = new F2_Result_Pair[HOST_F2_RESULTS_SPACE](); - std::cout << " host mem allocated..." 
<< std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_t6_data, T6_MEM_BYTES_NEEDED)); - CUDA_CHECK_RETURN(cudaMalloc(&device_t5_data, T5_MEM_BYTES_NEEDED)); - CUDA_CHECK_RETURN(cudaMalloc(&device_t6_final_data, T6_FINAL_MEM_BYTES_NEEDED)); - CUDA_CHECK_RETURN(cudaMalloc(&device_t4_tags, T4_TAG_MEM_BYTES_NEEDED)); - CUDA_CHECK_RETURN(cudaMalloc(&device_t5_tags, T5_TAG_MEM_BYTES_NEEDED)); - // clear bits... - CUDA_CHECK_RETURN(cudaMemset(device_t4_tags, 0, T4_TAG_MEM_BYTES_NEEDED)); - CUDA_CHECK_RETURN(cudaMemset(device_t5_tags, 0, T5_TAG_MEM_BYTES_NEEDED)); - - std::cout << " gpu mem allocated..." << std::endl; - - if (memstore == NULL) { - exit (1); - } - } - - // TODO: THIS IS SUPER SLOW ON HOST CPU! but it only needs 5GB so could load into GPU and set it all there... - - uint64_t NEXT_MEM_BYTES_START = 0; - - const uint64_t T5_DATA_START = NEXT_MEM_BYTES_START; - BackRef *t5_data = (BackRef *) &memstore[T5_DATA_START]; - uint32_t t5_num_entries[BATCHES]; - NEXT_MEM_BYTES_START += T5_MEM_BYTES_NEEDED; - - const uint64_t T6_DATA_START = NEXT_MEM_BYTES_START; - T6BackRef *t6_data = (T6BackRef *) &memstore[T6_DATA_START]; - NEXT_MEM_BYTES_START += T6_MEM_BYTES_NEEDED; - - const uint64_t T6_FINAL_DATA_START = NEXT_MEM_BYTES_START; - T6FinalEntry *t6_final_data = (T6FinalEntry *) &memstore[T6_FINAL_DATA_START]; - NEXT_MEM_BYTES_START += T6_FINAL_MEM_BYTES_NEEDED; - - uint32_t *t4_tags = (uint32_t *) &memstore[NEXT_MEM_BYTES_START]; // needs HOST_MAX_BLOCK_ENTRIES * 64 * 64 bytes - // will reference this as if file, like memstore[batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + block_id*HOST_MAX_BLOCK_ENTRIES] - memset(t4_tags, 0, T4_TAG_MEM_BYTES_NEEDED); - NEXT_MEM_BYTES_START += T4_TAG_MEM_BYTES_NEEDED; - - uint32_t *t5_tags = (uint32_t *) &memstore[NEXT_MEM_BYTES_START]; // needs HOST_MAX_BLOCK_ENTRIES * 64 * 64 bytes - // will reference this as if file, like memstore[batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + block_id*HOST_MAX_BLOCK_ENTRIES] - memset(t5_tags, 0, T5_TAG_MEM_BYTES_NEEDED); - NEXT_MEM_BYTES_START += T5_TAG_MEM_BYTES_NEEDED; - - using milli = std::chrono::milliseconds; - - std::cout << "Starting..." 
<< std::endl;
-	uint64_t total_t4_tagged = 0;
-	num_set_t4 = 0;
-	num_same_addresses = 0;
-	num_set_t5 = 0;
-
-	const int doCPUmethod = 0;
-	const int doGPUmethod = 2; // 1 is single-shot setting, 2 is 2-phase setting
-	/*
-	******* method 1 **************
-	All compute loop time: 30760 ms
-	*********************
-	 Tagged T4 entries: 3531979836 should be 114437654 out of max 4563402752
-	*********************
-	Total time: 33862 ms
-
-	******* method 2 **************
-	All compute loop time: 28907 ms
-	*********************
-	All compute loop time: 28753 ms
-	*********************
-	 Tagged T4 entries: 3531979836 should be 114437654 out of max 4563402752
-	*********************
-	Total time: 32010 ms
-	*/
-	int blockSize = 256;
-
-
-	auto compute_loop_start = std::chrono::high_resolution_clock::now();
-	//for (uint32_t t6_batch_id = 5; t6_batch_id < 6; t6_batch_id++) {
-	for (uint32_t t6_batch_id = 0; t6_batch_id < BATCHES; t6_batch_id++) {
-		auto batch_start = std::chrono::high_resolution_clock::now();
-		if (doCPUmethod > 0) {
-			memset(t5_tags, 0, T5_TAG_MEM_BYTES_NEEDED);
-		} else {
-			CUDA_CHECK_RETURN(cudaMemset(device_t5_tags, 0, T5_TAG_MEM_BYTES_NEEDED));
-		}
-		for (uint64_t t5_block_id = 0; t5_block_id < BATCHES; t5_block_id++) {
-			readBackRefBlockFile(5, t5_block_id, t6_batch_id,
-					&t5_data[HOST_MAX_BLOCK_ENTRIES*t5_block_id],
-					t5_num_entries[t5_block_id]);
-			//std::cout << "Loading T5 batch-block " << t5_block_id << "-" << t6_batch_id << " : " << t5_num_entries[t5_block_id] << " entries" << std::endl;
-			if (doGPUmethod > 0)
-				CUDA_CHECK_RETURN(cudaMemcpy(&device_t5_data[HOST_MAX_BLOCK_ENTRIES*t5_block_id],&t5_data[HOST_MAX_BLOCK_ENTRIES*t5_block_id],t5_num_entries[t5_block_id]*sizeof(BackRef),cudaMemcpyHostToDevice));
-		}
-		if (doGPUmethod > 0)
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-		// TODO: we need to make sure we are getting the correct values/tags set
-		// find the file for the single value y and follow that y back to see if we are doing it right....
-
-		//for (uint32_t t6_block_id = 8; t6_block_id < 9; t6_block_id++) { // BATCHES; t6_block_id++) {
-		for (uint32_t t6_block_id = 0; t6_block_id < BATCHES; t6_block_id++) {
-			uint32_t t6_num_entries;
-			readT6BlockFile(t6_batch_id,t6_block_id,t6_data, t6_num_entries);
-			if (doGPUmethod > 0) {
-				CUDA_CHECK_RETURN(cudaMemcpy(device_t6_data, t6_data,t6_num_entries*sizeof(T6BackRef),cudaMemcpyHostToDevice));
-				CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-				//std::cout << "Scanning T6 batch-block " << t6_batch_id << "-" << t6_block_id << " : " << t6_num_entries << " entries" << std::endl;
-				int numBlocks = (t6_num_entries + blockSize - 1) / (blockSize);
-				if (doGPUmethod == 1)
-					gpu_set_t6_final_data_and_t4_tags_directly<<<numBlocks, blockSize>>>(t6_num_entries,device_t6_data, device_t5_data, device_t6_final_data, device_t4_tags);
-				else
-					gpu_backref_t5_tag<<<numBlocks, blockSize>>>(t6_num_entries,device_t6_data, device_t5_data, device_t6_final_data, device_t5_tags);
-
-				CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-				// now write back results to hostmem
-				CUDA_CHECK_RETURN(cudaMemcpy(t6_final_data, device_t6_final_data,t6_num_entries*sizeof(T6FinalEntry),cudaMemcpyDeviceToHost));
-				CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-			}
-			if (doCPUmethod > 0)
-			for (int i=0;i<t6_num_entries;i++) {
-				T6BackRef entry = t6_data[i];
-				uint64_t t6_prev_block_id_L = entry.prev_block_ref_L >> (32 - 6);
-				uint64_t t6_prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF;
-				uint64_t t6_prev_block_id_R = entry.prev_block_ref_R >> (32 - 6);
-				uint64_t t6_prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF;
-
-				// now could back ref t5...
- BackRef t5_L, t5_R; - uint32_t t5_address_L = HOST_MAX_BLOCK_ENTRIES * t6_prev_block_id_L + t6_prev_idx_L; - uint32_t t5_address_R = HOST_MAX_BLOCK_ENTRIES * t6_prev_block_id_R + t6_prev_idx_R; - t5_L = t5_data[t5_address_L]; - t5_R = t5_data[t5_address_R]; - uint64_t t5_L_prev_block_id_L = t5_L.prev_block_ref_L >> (32 - 6); - uint64_t t5_L_prev_idx_L = t5_L.prev_block_ref_L & 0x3FFFFFF; - uint64_t t5_L_prev_block_id_R = t5_L.prev_block_ref_R >> (32 - 6); - uint64_t t5_L_prev_idx_R = t5_L.prev_block_ref_R & 0x3FFFFFF; - uint64_t t5_R_prev_block_id_L = t5_R.prev_block_ref_L >> (32 - 6); - uint64_t t5_R_prev_idx_L = t5_R.prev_block_ref_L & 0x3FFFFFF; - uint64_t t5_R_prev_block_id_R = t5_R.prev_block_ref_R >> (32 - 6); - uint64_t t5_R_prev_idx_R = t5_R.prev_block_ref_R & 0x3FFFFFF; - - // tag addresses that were used here... - if (doCPUmethod == 2) { - uint32_t bits_to_set; - bits_to_set = 1 << (t5_address_L % 32); - uint32_t value = t5_tags[t5_address_L / 32] & bits_to_set; - if (value > 1) { num_same_addresses++; } - t5_tags[t5_address_L / 32] |= bits_to_set; - - bits_to_set = 1 << (t5_address_R % 32); - value = t5_tags[t5_address_R / 32] & bits_to_set; - if (value > 1) { num_same_addresses++; } - t5_tags[t5_address_R / 32] |= bits_to_set; - - num_set_t5 += 2; - } - - T6FinalEntry final_entry = {}; - final_entry.refL = t5_L_prev_block_id_L + (t5_L_prev_block_id_R << 6) + (t6_prev_block_id_L << 12); - final_entry.refR = t5_R_prev_block_id_L + (t5_R_prev_block_id_R << 6) + (t6_prev_block_id_R << 12); - //std::cout << "T6 Final set: [" << t5_L_prev_block_id_L << " | " << t5_L_prev_block_id_R << "] - " << t6_prev_block_id_L << std::endl; - //std::cout << " [" << t5_R_prev_block_id_L << " | " << t5_R_prev_block_id_R << "] - " << t6_prev_block_id_R << std::endl; - final_entry.y = entry.y; - t6_final_data[i] = final_entry; - - - - // directly set t4 tags - if (doCPUmethod == 1) { - uint32_t value; - uint64_t file_batch_id, file_block_id, file_idx; - uint64_t address; - uint32_t bits_to_set; - file_batch_id = t5_L_prev_block_id_L; file_block_id = t6_prev_block_id_L; file_idx = t5_L_prev_idx_L; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - value = t4_tags[address / 32] & bits_to_set; - if (value > 1) { num_same_addresses++; } - t4_tags[address / 32] |= bits_to_set; - - file_batch_id = t5_L_prev_block_id_R; file_block_id = t6_prev_block_id_L; file_idx = t5_L_prev_idx_R; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - value = t4_tags[address / 32] & bits_to_set; - if (value > 1) { num_same_addresses++; } - t4_tags[address / 32] |= bits_to_set; - - file_batch_id = t5_R_prev_block_id_L; file_block_id = t6_prev_block_id_R; file_idx = t5_R_prev_idx_L; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - value = t4_tags[address / 32] & bits_to_set; - if (value > 1) { num_same_addresses++; } - t4_tags[address / 32] |= bits_to_set; - - file_batch_id = t5_R_prev_block_id_R; file_block_id = t6_prev_block_id_R; file_idx = t5_R_prev_idx_R; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - value = t4_tags[address / 32] & bits_to_set; - if (value > 1) { num_same_addresses++; } - t4_tags[address / 32] |= bits_to_set; - - 
num_set_t4 += 4; - } - - // just for benchmarks - // CPU: - // 10239 ms with / 32 per batch, writing to t4 tags directly - // 5057 ms with / 32 per batch but writing to t5 then scan t5 tags and write to t4 tags - // - // 1800ms without writing to tags, but fetch t5 data and setting t6 backrefs. -> still 1.9 minutes - // 1588ms with T6 writing tags for t5 instead of fetching t5, doesn't seem to save much huh? - // 1479ms without reading t5 at all -- so almost no gain (although to be fair t5 was cached reads). - - // Bladebit is 25s phase 2 - // read t5+t6 data only is 232ms, lowest bound, 15s min. - // read and transfer to gpu is 324ms - total 20s - // gpu setting data is 350ms...hallejuya - // - 26 seconds total but without tags written - // - 28.8 seconds writting back final data T6 - // - 41 seconds w/ tags written. - // even settings tags is 640ms hot god damn I love gpu, vs 6500ms = x10! - // but can this be improved so less random writes? - // total time is 41s - - //backPropagate(5,prev_block_id_L, batch_id, prev_idx_L ); - //backPropagate(5,prev_block_id_R, batch_id, prev_idx_R ); - //printf("%u %u %u %u\n", t5_L_prev_block_id_L, t5_L_prev_block_id_R, t5_R_prev_block_id_L, t5_R_prev_block_id_R); - - } - //}// entry.y - - //writeT6FinalBlockFile(t6_batch_id,t6_block_id,t6_data,t6_num_entries); - - } - - // 2067ms w/o any tagging - // 3865ms w/ tagging but not tagging t4 - // 6299ms w tag on 5 and t4 tags all set - // 10954ms tagging 4 directly (skipping 5) - if (doCPUmethod == 2) { - for (uint64_t t5_block_id = 0; t5_block_id < BATCHES; t5_block_id++) { - uint32_t num_entries = t5_num_entries[t5_block_id]; - //std::cout << "Doing previous table tag for t6_batch_id: " << t6_batch_id << std::endl; - tagPreviousTable(t5_block_id, - &t5_data[HOST_MAX_BLOCK_ENTRIES * t5_block_id], t5_num_entries[t5_block_id], - &t5_tags[(HOST_MAX_BLOCK_ENTRIES * t5_block_id) / 32], // note /32 since 32 bits - t4_tags); - } - } - - if (doGPUmethod == 2) { - for (uint64_t t5_block_id = 0; t5_block_id < BATCHES; t5_block_id++) { - uint32_t num_entries = t5_num_entries[t5_block_id]; - uint32_t t5_address = HOST_MAX_BLOCK_ENTRIES * t5_block_id; - int numBlocks = (num_entries + blockSize - 1) / (blockSize); - gpu_t5_tag_to_t4<<>>(num_entries, t5_block_id, - &device_t5_data[t5_address], - &device_t5_tags[t5_address/32], device_t4_tags); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - } - } - - auto batch_end = std::chrono::high_resolution_clock::now(); - //std::cout << "*********************" << std::endl; - std::cout << "*** Batch " << t6_batch_id << " time: " << std::chrono::duration_cast(batch_end - batch_start).count() << " ms ***\n"; - //std::cout << "*********************" << std::endl; - } - if (doGPUmethod > 0) { - // technically don't need to do this if stays in device memory...just for verfication purposes. - CUDA_CHECK_RETURN(cudaMemcpy(t4_tags, device_t4_tags,T4_TAG_MEM_BYTES_NEEDED,cudaMemcpyDeviceToHost)); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - } - auto compute_loop_end = std::chrono::high_resolution_clock::now(); - std::cout << "*********************" << std::endl; - std::cout << "All compute loop time: " << std::chrono::duration_cast(compute_loop_end - compute_loop_start).count() << " ms\n"; - std::cout << "*********************" << std::endl; - - - - /*std::cout << "setting tags..." 
<< std::endl; - for (uint32_t t4_batch_id = 0; t4_batch_id < BATCHES; t4_batch_id++) { - //std::cout << "setting batch " << t4_batch_id << std::endl; - for (uint64_t t4_block_id = 0; t4_block_id < BATCHES; t4_block_id++) { - //for (uint64_t i=0;i<1;i++) { - for (uint64_t i=0;i 0) { - total_t4_tagged++; - //std::cout << " Tagged entry t4 batch_id: " << t4_batch_id << " block:" << t4_block_id << std::endl; - }; - } - } - //std::cout << "partial result: " << total_t4_tagged << std::endl; - } - std::cout << " Num set t5: " << num_set_t5 << std::endl; - std::cout << " Num set t4: " << num_set_t4 << std::endl; - std::cout << " Num same addresses: " << num_same_addresses << std::endl; - std::cout << " Tagged T4 entries: " << total_t4_tagged << " should be 114437654 out of max 4563402752" << std::endl; - - std::cout << " -rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-10-38.tmp" << std::endl - << " -rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-35-40.tmp" << std::endl - << " -rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-43-38.tmp" << std::endl - << " -rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-51-40.tmp" << std::endl; -} - - -__global__ -void gpu_chacha8_xs_to_kbcs(const uint32_t N, - const __restrict__ uint32_t *input, - uint32_t *xs, uint32_t *kbcs) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - - if (index < N) { - uint32_t x = xs[index]; - uint32_t x_group = x / 16; - uint32_t x_selection = x % 16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - uint32_t result_x; - if (x_selection == 0) result_x = x0; - if (x_selection == 1) result_x = x1; - if (x_selection == 2) result_x = x2; - if (x_selection == 3) result_x = x3; - if (x_selection == 4) result_x = x4; - if (x_selection == 5) result_x = x5; - if (x_selection == 6) result_x = x6; - if (x_selection == 7) result_x = x7; - if (x_selection == 8) result_x = x8; - if (x_selection == 9) result_x = x9; - if (x_selection == 10) result_x = x10; - if (x_selection == 11) result_x = x11; - if (x_selection == 12) result_x = x12; - if (x_selection == 13) result_x = x13; - if (x_selection == 14) result_x = x14; - if (x_selection == 15) result_x = x15; - uint64_t y = (((uint64_t) result_x) << 6) + (x >> 26); - uint32_t kbc_bucket_id = uint32_t (y / kBC); - //printf("x: %u y:%llu kbc:%u\n", x, y, 
-
-
-__global__
-void showSorted(const uint32_t N, uint32_t *list) {
-	// single-threaded debug kernel: launched <<<1,1>>> so the output prints in order
-	for (int i=0;i<N;i++) {
-		printf("%u\n", list[i]);
-	}
-}
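-
-// Added sketch (not in the original source): the /32 indexing into device_t5_tags
-// below suggests the tag arrays pack one bit per entry into uint32_t words. A helper
-// pair like this would implement that layout; setTagBit uses atomicOr so concurrent
-// threads tagging entries that share a word cannot lose updates.
-__device__ __forceinline__ void setTagBit(uint32_t *tags, const uint32_t entry_idx) {
-	atomicOr(&tags[entry_idx >> 5], 1u << (entry_idx & 31)); // word = idx/32, bit = idx%32
-}
-__device__ __forceinline__ bool readTagBit(const uint32_t *tags, const uint32_t entry_idx) {
-	return (tags[entry_idx >> 5] >> (entry_idx & 31)) & 1u;
-}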
-
-// ... [start of createT6FinalEntriesGPU(char *memstore) lost in this listing: device
-// buffer setup and loading of the T6/T5 block files; the tail of its T6 batch loop
-// survives below, with the name of the T6 tag kernel missing]
-
-			auto tag_start = std::chrono::high_resolution_clock::now();
-			/*...*/<<<numBlocks,blockSize>>>(t6_num_entries,device_t6_data, device_t5_data, device_t6_final_data, device_t5_tags);
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-			auto tag_end = std::chrono::high_resolution_clock::now();
-			std::cout << "*** gpu tag ms: " << std::chrono::duration_cast<std::chrono::milliseconds>(tag_end - tag_start).count() << " ms ***\n";
-			// now write back results to hostmem
-			std::cout << "writing results to hostmem" << std::endl;
-
-			CUDA_CHECK_RETURN(cudaMemcpy(t6_final_data, device_t6_final_data,t6_num_entries*sizeof(T6FinalEntry),cudaMemcpyDeviceToHost));
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		}
-
-		auto tag_start = std::chrono::high_resolution_clock::now();
-		for (uint64_t t5_block_id = 0; t5_block_id < BATCHES; t5_block_id++) {
-			std::cout << " gpu t5 tag to t4 t5_block_id:" << t5_block_id << std::endl;
-			uint32_t num_entries = t5_num_entries[t5_block_id];
-			uint32_t t5_address = HOST_MAX_BLOCK_ENTRIES * t5_block_id;
-			int numBlocks = (num_entries + blockSize - 1) / (blockSize);
-			gpu_t5_tag_to_t4<<<numBlocks,blockSize>>>(num_entries, t5_block_id,
-					&device_t5_data[t5_address],
-					&device_t5_tags[t5_address/32], device_t4_tags);
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		}
-		auto tag_end = std::chrono::high_resolution_clock::now();
-		std::cout << "*** gpu tag ms: " << std::chrono::duration_cast<std::chrono::milliseconds>(tag_end - tag_start).count() << " ms ***\n";
-		auto batch_end = std::chrono::high_resolution_clock::now();
-		//std::cout << "*********************" << std::endl;
-		std::cout << "*** Batch " << t6_batch_id << " time: " << std::chrono::duration_cast<std::chrono::milliseconds>(batch_end - batch_start).count() << " ms ***\n";
-		//std::cout << "*********************" << std::endl;
-	}
-
-	auto compute_loop_end = std::chrono::high_resolution_clock::now();
-	std::cout << "*********************" << std::endl;
-	std::cout << "All compute loop time: " << std::chrono::duration_cast<std::chrono::milliseconds>(compute_loop_end - compute_loop_start).count() << " ms\n";
-	std::cout << "*********************" << std::endl;
-	auto t4_final_start = std::chrono::high_resolution_clock::now();
-	// TODO: free gpu mem and set up t3 and t4 mem
-	std::cout << "Doing T4->T3 tags" << std::endl;
-
-	uint32_t t3_num_entries[BATCHES];
-
-	return; // NOTE: early return -- the T4->T3 pass below is currently disabled
-
-	for (uint32_t t4_batch_id = 0; t4_batch_id < BATCHES; t4_batch_id++) {
-		auto batch_start = std::chrono::high_resolution_clock::now();
-		std::cout << "Loading T3BaseRef [0-63]-batch " << t4_batch_id << std::endl;
-
-		for (uint64_t t3_block_id = 0; t3_block_id < BATCHES; t3_block_id++) {
-			readT3BaseRefBlockFile(t3_block_id, t4_batch_id,
-					&t3_baseref_data[HOST_MAX_BLOCK_ENTRIES*t3_block_id],
-					t3_num_entries[t3_block_id]);
-
-			CUDA_CHECK_RETURN(cudaMemcpy(&device_t3_baseref_data[HOST_MAX_BLOCK_ENTRIES*t3_block_id],&t3_baseref_data[HOST_MAX_BLOCK_ENTRIES*t3_block_id],
-					t3_num_entries[t3_block_id]*sizeof(T3BaseRef), // note T3BaseRef
-					cudaMemcpyHostToDevice));
-		}
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		// now we have all t3 entries in a block row for back-referencing from the t4 blocks.
-		// each t4 block then just fetches its t3 entries to get Lx1,Lx2,Lx3,Lx4 * 2 = 8 Lx values.
-
-		for (uint32_t t4_block_id = 0; t4_block_id < BATCHES; t4_block_id++) {
-			uint32_t t4_num_entries;
-			readBackRefBlockFile(4, t4_batch_id,t4_block_id,t4_data, t4_num_entries);
-
-			CUDA_CHECK_RETURN(cudaMemcpy(device_t4_data, t4_data,t4_num_entries*sizeof(BackRef),cudaMemcpyHostToDevice));
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-			//std::cout << "Scanning T6 batch-block " << t6_batch_id << "-" << t6_block_id << " : " << t6_num_entries << " entries" << std::endl;
-			int numBlocks = (t4_num_entries + blockSize - 1) / (blockSize);
-			// gpu_backref_t4_tag<<<numBlocks,blockSize>>>(t4_num_entries,device_t4_data, device_t3_baseref_data, device_t4_final_data, device_t4_tags);
-			gpu_backref_t4_lxlists<<<numBlocks,blockSize>>>(t4_num_entries,device_t4_data, device_t3_baseref_data, device_t4_lx_list, device_t4_tags);
-			gpu_chacha8_xs_to_kbcs<<<(t4_num_entries*8 + blockSize - 1)/blockSize,blockSize>>>(t4_num_entries*8, chacha_input, device_t4_lx_list, kbcs);
-			// wrap the raw pointers with a device_ptr so thrust can sort in place on the device
-			thrust::device_ptr<uint32_t> device_t4_lx_list_ptr(device_t4_lx_list);
-			thrust::sort(device_t4_lx_list_ptr, device_t4_lx_list_ptr + t4_num_entries*8);
-			showSorted<<<1,1>>>(30,device_t4_lx_list);
-
-			thrust::device_ptr<uint32_t> device_kbcs_ptr(kbcs);
-			thrust::sort(device_kbcs_ptr, device_kbcs_ptr + t4_num_entries*8);
-			showSorted<<<1,1>>>(30,kbcs);
-
-			//thrust::sort(device_t4_lx_list_ptr.begin(), device_t4_lx_list_ptr.end() + t4_num_entries*4);
-			//uint32_t new_end = thrust::unique(device_t4_lx_list_ptr, device_t4_lx_list_ptr + t4_num_entries*4);
-			//std::cout << "Thrust sorted " << (t4_num_entries*4) << " down to " << new_end << std::endl;
-
-			// T4 final time: 61754 ms (57808ms without backref t4 tag)
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-			// now write back results to hostmem
-			// note: t4_num_entries can shrink here, since the output gets pruned somewhat.
-			CUDA_CHECK_RETURN(cudaMemcpy(t4_final_data, device_t4_final_data,t4_num_entries*sizeof(T4FinalEntry),cudaMemcpyDeviceToHost));
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-			// todo: get t4_final_data into a unique sorted list and compressed...could of course do this in GPU mem
-		}
-	}
-
-	auto t4_final_end = std::chrono::high_resolution_clock::now();
-	std::cout << "*********************" << std::endl;
-	std::cout << "T4 final time: " << std::chrono::duration_cast<std::chrono::milliseconds>(t4_final_end - t4_final_start).count() << " ms\n";
-	std::cout << "*********************" << std::endl;
-
-
-
-	if (verify_results) {
-		// technically don't need to copy t4_tags if it stays in device memory...just for verification purposes.
-		CUDA_CHECK_RETURN(cudaMemcpy(t4_tags, device_t4_tags,T4_TAG_MEM_BYTES_NEEDED,cudaMemcpyDeviceToHost));
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		std::cout << "Counting tags..." << std::endl;
-		for (uint32_t t4_batch_id = 0; t4_batch_id < BATCHES; t4_batch_id++) {
-			//std::cout << "Counting batch " << t4_batch_id << std::endl;
-			for (uint64_t t4_block_id = 0; t4_block_id < BATCHES; t4_block_id++) {
-				for (uint64_t i=0;i</*...*/;i++) {
-					if (t4_tags[/*...*/] > 0) {
-						total_t4_tagged++;
-						//std::cout << " Tagged entry t4 batch_id: " << t4_batch_id << " block:" << t4_block_id << std::endl;
-					};
-				}
-			}
-			//std::cout << "partial result: " << total_t4_tagged << std::endl;
-		}
-		std::cout << " Tagged T4 entries: " << total_t4_tagged << " should be 114437654 out of max 4563402752" << std::endl;
-	}
-}
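-
-// Added sketch (not in the original source): the correct shape of the commented-out
-// dedupe attempt above. thrust::unique returns an iterator, not a count, so the
-// surviving length is the iterator difference, and the list must be sorted first
-// since unique only removes *adjacent* duplicates. d_list and n are hypothetical
-// names for a raw device pointer and its entry count; assumes the thrust headers
-// already used above are included.
-inline uint32_t sortAndUniqueOnDevice(uint32_t *d_list, const uint32_t n) {
-	thrust::device_ptr<uint32_t> p(d_list);  // wrap raw pointer for thrust
-	thrust::sort(p, p + n);                  // sort in place on the device
-	thrust::device_ptr<uint32_t> new_end = thrust::unique(p, p + n);
-	return (uint32_t) (new_end - p);         // number of unique entries kept
-}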
-
-
-void doPhase2Pruning() {
-	char *memstore;
-
-	/*if (true) {
-		// test xs'...
-		uint32_t *xs;
-		uint32_t *kbcs;
-		CUDA_CHECK_RETURN(cudaMallocManaged(&xs, 256*sizeof(uint32_t)));   // size in bytes, not entries
-		CUDA_CHECK_RETURN(cudaMallocManaged(&kbcs, 256*sizeof(uint32_t)));
-		for (int i=0;i<256;i++) {
-			xs[i] = i;
-		}
-		std::cout << "Doing chacha single xs" << std::endl;
-		gpu_chacha8_xs_to_kbcs<<<1,256>>>(256, chacha_input, xs, kbcs);
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		for (int i=0;i<256;i++) {
-			std::cout << " kbc " << i << " = " << kbcs[i] << std::endl;
-		}
-	}*/
-
-	if (true) {
-		using milli = std::chrono::milliseconds;
-		auto total_start = std::chrono::high_resolution_clock::now();
-		createT6FinalEntriesGPU(memstore);
-		auto total_end = std::chrono::high_resolution_clock::now();
-		std::cout << "*********************" << std::endl;
-		std::cout << "Total time: " << std::chrono::duration_cast<milli>(total_end - total_start).count() << " ms\n";
-		std::cout << "*********************" << std::endl;
-	}
-
-	//batch_id:30 block_id: 55 dx:6169
-	//findYsolution(memstore);
-
-	//std::cout << "Phase 2 Pruning" << std::endl;
-	//for (uint32_t batch_id = 0; batch_id < 1; batch_id++) {
-	//	readPruneToT2(batch_id,memstore);
-	//	// TODO: now that we have all data in mem for a batch, test whether getting the y will get the actual lx pairs!
-	//}
-	std::cout << "Done doPhase2Pruning." << std::endl;
-}
-
-
-#endif /* PHASE2_HPP_ */
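-
-// Hypothetical usage (not part of the original file): compiled with nvcc alongside
-// the rest of the plotter sources, the phase-2 pruning pass above would be driven by
-// something like:
-//
-//   int main() {
-//       doPhase2Pruning();
-//       return 0;
-//   }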