diff --git a/attack.hpp b/attack.hpp
deleted file mode 100644
index 136e1ad..0000000
--- a/attack.hpp
+++ /dev/null
@@ -1,1468 +0,0 @@
-/*
- * attack.hpp
- *
- *  Created on: Oct 26, 2021
- *      Author: nick
- */
-
-#ifndef ATTACK_HPP_
-#define ATTACK_HPP_
-
-#include "nick_blake3.hpp"
-//#include
-//#include
-//#include
-
-#define ATTACK_KBCFILTER(chacha_y,i) \
-{ \
-    uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-    uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-    if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
-        uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
-        int slot = atomicAdd(&kbc_local_num_entries[local_kbc_bucket_id],1); \
-        Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \
-        if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-        uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-        kbc_local_entries[entries_address] = entry; \
-    } \
-}
-
-__global__
-void gpu_chacha8_k32_kbc_ranges(const uint32_t N,
-        const __restrict__ uint32_t *input, Tx_Bucketed_Meta1 *kbc_local_entries, int *kbc_local_num_entries,
-        uint32_t KBC_START, uint32_t KBC_END)
-{
-    uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
-    int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-    int stride = blockDim.x * gridDim.x;
-    const uint32_t end_n = N / 16; // 16 x's in each group
-
-    for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-        uint32_t x = x_group << 4; // *16;
-        uint32_t pos = x_group;
-
-        x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
-        x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
-        x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
-        x14 = input[14];x15 = input[15];
-
-        #pragma unroll
-        for (int i = 0; i < 4; i++) {
-            QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
-            QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
-        }
-
-        x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
-        x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
-        x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0;
-        x14 += input[14];x15 += input[15];
-
-        // convert to little endian/big endian whatever, chia needs it like this
-        BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
-        BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
-        BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
-
-        //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-        //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-        ATTACK_KBCFILTER(x0,0);ATTACK_KBCFILTER(x1,1);ATTACK_KBCFILTER(x2,2);ATTACK_KBCFILTER(x3,3);
-        ATTACK_KBCFILTER(x4,4);ATTACK_KBCFILTER(x5,5);ATTACK_KBCFILTER(x6,6);ATTACK_KBCFILTER(x7,7);
-        ATTACK_KBCFILTER(x8,8);ATTACK_KBCFILTER(x9,9);ATTACK_KBCFILTER(x10,10);ATTACK_KBCFILTER(x11,11);
-        ATTACK_KBCFILTER(x12,12);ATTACK_KBCFILTER(x13,13);ATTACK_KBCFILTER(x14,14);ATTACK_KBCFILTER(x15,15);
-    }
-}
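The ATTACK_KBCFILTER macro above builds the 38-bit f1 output y by appending the top kExtraBits = 6 bits of x to the byte-swapped 32-bit chacha word, then buckets by kBC. A minimal host-side sketch of the same arithmetic, handy for unit-testing the KBC_START/KBC_END ranges (constants follow chiapos: kB = 119, kC = 127, kBC = kB*kC = 15113; the chacha word is taken as given):

static const uint64_t kBC_host = 15113; // kB = 119, kC = 127

// f1(x): the 32-bit chacha stream word plus 6 extra bits taken from the top of x
uint64_t f1_y(uint32_t chacha_y, uint32_t x) {
    return (((uint64_t) chacha_y) << 6) + (x >> 26);
}

uint32_t kbc_bucket_of(uint64_t y) { return (uint32_t)(y / kBC_host); }
uint32_t kbc_offset_of(uint64_t y) { return (uint32_t)(y % kBC_host); } // what the kernel stores as entry.y

An entry is kept iff KBC_START <= kbc_bucket_of(y) <= KBC_END, which is exactly the macro's filter condition.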
-
-__device__ int gpu_xs_L_count = 0;
-__device__ int gpu_xs_R_count = 0;
-
-#define ATTACK_WRITEXS_LR(chacha_y,i) \
-{ \
-    uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-    uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-    if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \
-        int slot = atomicAdd(&local_num_xs_L,1); \
-        local_xs_L[slot] = x+i; \
-        local_ys_L[slot] = chacha_y; \
-    } \
-    if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \
-        int slot = atomicAdd(&local_num_xs_R,1); \
-        local_xs_R[slot] = x+i; \
-        local_ys_R[slot] = chacha_y; \
-    } \
-}
-
-__global__
-void gpu_chacha8_k32_kbc_ranges_LR_write_xy(const uint32_t N,
-        const __restrict__ uint32_t *input,
-        uint32_t *xs_L, uint32_t *ys_L, uint32_t *xs_L_count, uint32_t KBC_START_L, uint32_t KBC_END_L,
-        uint32_t *xs_R, uint32_t *ys_R, uint32_t *xs_R_count, uint32_t KBC_START_R, uint32_t KBC_END_R)
-{
-    __shared__ uint32_t local_xs_L[256]; // assumes a block never collects more than 256 L (or R) hits
-    __shared__ uint32_t local_ys_L[256];
-    __shared__ uint32_t local_xs_R[256];
-    __shared__ uint32_t local_ys_R[256];
-    __shared__ int local_num_xs_L;
-    __shared__ int local_num_xs_R;
-    __shared__ int global_L_slot;
-    __shared__ int global_R_slot;
-
-    uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local
-    //__shared__ uint32_t datax[33*256]; // each thread (256 max) gets its own shared access starting at a 32-byte boundary.
-
-    int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-    int stride = blockDim.x * gridDim.x;
-    const uint32_t end_n = N / 16; // 16 x's in each group
-
-    if (threadIdx.x == 0) {
-        local_num_xs_L = 0;
-        local_num_xs_R = 0;
-    }
-    __syncthreads();
-    const int j = 0; // the per-thread local datax[16] needs no offset; the shared variant above would use j = 33*threadIdx.x
-    for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-        uint32_t x = x_group << 4; // *16;
-        uint32_t pos = x_group;
-
-        datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7];
-        datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11];
-        datax[j+12] = pos; datax[j+13] = 0; // pos never bigger than 32 bit pos >> 32;
-        datax[j+14] = input[14];datax[j+15] = input[15];
-
-        #pragma unroll
-        for (int i = 0; i < 4; i++) {
-            QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]);
-            QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]);
-            QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]);
-            QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]);
-        }
-
-        datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4];
-        datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9];
-        datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0;
-        datax[j+14] += input[14];datax[j+15] += input[15];
-
-        // convert to little endian/big endian whatever, chia needs it like this
-        BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]);
-        BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]);
-        BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]);
-
-        //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-        //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023
-        ATTACK_WRITEXS_LR(datax[j+0],0);ATTACK_WRITEXS_LR(datax[j+1],1);ATTACK_WRITEXS_LR(datax[j+2],2);ATTACK_WRITEXS_LR(datax[j+3],3);
-        ATTACK_WRITEXS_LR(datax[j+4],4);ATTACK_WRITEXS_LR(datax[j+5],5);ATTACK_WRITEXS_LR(datax[j+6],6);ATTACK_WRITEXS_LR(datax[j+7],7);
-        ATTACK_WRITEXS_LR(datax[j+8],8);ATTACK_WRITEXS_LR(datax[j+9],9);ATTACK_WRITEXS_LR(datax[j+10],10);ATTACK_WRITEXS_LR(datax[j+11],11);
-        ATTACK_WRITEXS_LR(datax[j+12],12);ATTACK_WRITEXS_LR(datax[j+13],13);ATTACK_WRITEXS_LR(datax[j+14],14);ATTACK_WRITEXS_LR(datax[j+15],15);
-    }
-    // without global writes this has a maximum speed of 21ms;
-    // these global writes push it up to 26ms.
-    // hope here is that sorting won't take long, so that sorted entries come in under 35ms total,
-    // and then the matching *should* be quicker than when it's bucketed
-    __syncthreads();
-    if (threadIdx.x == 0) {
-        //printf("finished with %u %u counts\n", local_num_xs_L, local_num_xs_R);
-        global_L_slot = atomicAdd(&xs_L_count[0],local_num_xs_L);
-        global_R_slot = atomicAdd(&xs_R_count[0],local_num_xs_R);
-    }
-    __syncthreads();
-    for (int i = threadIdx.x; i < local_num_xs_L; i+=blockDim.x) {
-        xs_L[i+global_L_slot] = local_xs_L[i];
-    }
-    for (int i = threadIdx.x; i < local_num_xs_L; i+=blockDim.x) {
-        ys_L[i+global_L_slot] = local_ys_L[i];
-    }
-    for (int i = threadIdx.x; i < local_num_xs_R; i+=blockDim.x) {
-        xs_R[i+global_R_slot] = local_xs_R[i];
-    }
-    for (int i = threadIdx.x; i < local_num_xs_R; i+=blockDim.x) {
-        ys_R[i+global_R_slot] = local_ys_R[i];
-    }
-}
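The flush at the end of that kernel is a general trick: threads accumulate hits in shared memory, one thread reserves a contiguous global range with a single atomicAdd, and the block then copies out cooperatively. A stripped-down sketch of just that pattern (all names here are illustrative, not from this file; assumes blockDim.x <= 256 so the staging buffer cannot overflow):

__global__ void staged_collect(const uint32_t *in, uint32_t N, uint32_t *out, uint32_t *out_count) {
    __shared__ uint32_t buf[256];
    __shared__ int buf_n;
    __shared__ int base;
    if (threadIdx.x == 0) buf_n = 0;
    __syncthreads();
    uint32_t i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < N && (in[i] & 1)) {              // stand-in filter predicate
        int slot = atomicAdd(&buf_n, 1);     // cheap shared-memory atomic
        buf[slot] = in[i];
    }
    __syncthreads();
    if (threadIdx.x == 0) base = atomicAdd(out_count, buf_n); // one global atomic per block
    __syncthreads();
    for (int s = threadIdx.x; s < buf_n; s += blockDim.x) out[base + s] = buf[s];
}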
-
-__global__
-void gpu_merge_f1xypairs_into_kbc_buckets(
-        const uint32_t KBC_START_ID, // determined by batch_id
-        const uint64_t *in, const uint32_t N,
-        Tx_Bucketed_Meta1 *local_kbc_entries, int *local_kbc_counts)
-{
-    uint32_t i = blockIdx.x*blockDim.x+threadIdx.x;
-    if (i < N) {
-        uint64_t value = in[i];
-        uint32_t x = value >> 32;
-        uint32_t chacha_y = value; // low 32 bits
-        uint64_t calc_y = (((uint64_t) chacha_y) << 6) + (x >> 26);
-        uint32_t kbc_id = calc_y / kBC;
-        uint32_t KBC_END_ID = KBC_START_ID + KBC_LOCAL_NUM_BUCKETS / 256;
-        if ((kbc_id >= KBC_START_ID) && (kbc_id < KBC_END_ID)) { // both bounds must hold for the entry to belong to this range
-
-            uint32_t local_kbc_id = kbc_id - KBC_START_ID;
-            int slot = atomicAdd(&local_kbc_counts[local_kbc_id],1);
-            uint32_t destination_address = local_kbc_id * KBC_MAX_ENTRIES_PER_BUCKET + slot;
-
-            //printf("block_id:%u [i: %u] entry.y:%u kbc_id:%u local_kbc:%u slot:%u dest:%u\n",
-            //        block_id, i, block_entry.y, kbc_id, local_kbc_id, slot, destination_address);
-
-            if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) {
-                printf("OVERFLOW: slot > MAX ENTRIES PER BUCKET\n");
-            }
-            if (destination_address > DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-                printf("OVERFLOW: destination_address overflow > DEVICE_BUFFER_ALLOCATED_ENTRIES %u\n", destination_address);
-            }
-            Tx_Bucketed_Meta1 kbc_entry = {};
-            kbc_entry.y = calc_y % kBC;
-            kbc_entry.meta[0] = x;
-            local_kbc_entries[destination_address] = kbc_entry;
-        }
-    }
-}
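The merge kernel above expects each 64-bit word to carry x in the high half and the raw chacha word in the low half, matching the packing done by the xypairs writer below. A tiny sketch of that round trip:

// pack: x in bits 63..32, chacha_y in bits 31..0
uint64_t pack_xy(uint32_t x, uint32_t chacha_y) {
    return (((uint64_t) x) << 32) | chacha_y;
}

void unpack_xy(uint64_t value, uint32_t &x, uint32_t &chacha_y) {
    x = (uint32_t)(value >> 32);
    chacha_y = (uint32_t) value; // low 32 bits
}

Keeping the pair in one word means a single 8-byte store per entry and halves the number of global transactions compared with separate x and y arrays.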
-
-__global__
-void gpu_chacha8_k32_kbc_ranges_LR_write_xypairs(const uint32_t N,
-        const __restrict__ uint32_t *input,
-        uint64_t *xys_L, uint32_t *xs_L_count, uint32_t KBC_START_L, uint32_t KBC_END_L,
-        uint64_t *xys_R, uint32_t *xs_R_count, uint32_t KBC_START_R, uint32_t KBC_END_R)
-{
-    __shared__ uint32_t local_xs_L[256];
-    __shared__ uint32_t local_ys_L[256];
-    __shared__ uint32_t local_xs_R[256];
-    __shared__ uint32_t local_ys_R[256];
-    __shared__ int local_num_xs_L;
-    __shared__ int local_num_xs_R;
-    __shared__ int global_L_slot;
-    __shared__ int global_R_slot;
-
-    uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local
-    //__shared__ uint32_t datax[256*17]; // each thread (256 max) gets its own shared access starting at a 32-byte boundary.
-
-    int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-    int stride = blockDim.x * gridDim.x;
-    const uint32_t end_n = N / 16; // 16 x's in each group
-
-    if (threadIdx.x == 0) {
-        local_num_xs_L = 0;
-        local_num_xs_R = 0;
-    }
-    __syncthreads();
-    const int j = 0; // the per-thread local datax[16] needs no offset; the shared variant above would use j = 17*threadIdx.x
-    for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-        uint32_t x = x_group << 4; // *16;
-        uint32_t pos = x_group;
-
-        datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7];
-        datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11];
-        datax[j+12] = pos; datax[j+13] = 0; // pos never bigger than 32 bit pos >> 32;
-        datax[j+14] = input[14];datax[j+15] = input[15];
-
-        #pragma unroll
-        for (int i = 0; i < 4; i++) {
-            QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]);
-            QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]);
-            QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]);
-            QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]);
-        }
-
-        datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4];
-        datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9];
-        datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0;
-        datax[j+14] += input[14];datax[j+15] += input[15];
-
-        // convert to little endian/big endian whatever, chia needs it like this
-        BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]);
-        BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]);
-        BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]);
-
-        //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-        //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023
-        ATTACK_WRITEXS_LR(datax[j+0],0);ATTACK_WRITEXS_LR(datax[j+1],1);ATTACK_WRITEXS_LR(datax[j+2],2);ATTACK_WRITEXS_LR(datax[j+3],3);
-        ATTACK_WRITEXS_LR(datax[j+4],4);ATTACK_WRITEXS_LR(datax[j+5],5);ATTACK_WRITEXS_LR(datax[j+6],6);ATTACK_WRITEXS_LR(datax[j+7],7);
-        ATTACK_WRITEXS_LR(datax[j+8],8);ATTACK_WRITEXS_LR(datax[j+9],9);ATTACK_WRITEXS_LR(datax[j+10],10);ATTACK_WRITEXS_LR(datax[j+11],11);
-        ATTACK_WRITEXS_LR(datax[j+12],12);ATTACK_WRITEXS_LR(datax[j+13],13);ATTACK_WRITEXS_LR(datax[j+14],14);ATTACK_WRITEXS_LR(datax[j+15],15);
-    }
-    // without global writes this has a maximum speed of 21ms;
-    // these global writes push it up to 26ms.
-    // hope here is that sorting won't take long, so that sorted entries come in under 35ms total,
-    // and then the matching *should* be quicker than when it's bucketed
-    __syncthreads();
-    if (threadIdx.x == 0) {
-        //printf("finished with %u %u counts\n", local_num_xs_L, local_num_xs_R);
-        global_L_slot = atomicAdd(&xs_L_count[0],local_num_xs_L);
-        global_R_slot = atomicAdd(&xs_R_count[0],local_num_xs_R);
-    }
-    __syncthreads();
-    for (int i = threadIdx.x; i < local_num_xs_L; i+=blockDim.x) {
-        xys_L[i+global_L_slot] = (((uint64_t) local_xs_L[i]) << 32) + local_ys_L[i];
-    }
-    for (int i = threadIdx.x; i < local_num_xs_R; i+=blockDim.x) {
-        xys_R[i+global_R_slot] = (((uint64_t) local_xs_R[i]) << 32) + local_ys_R[i];
-    }
-}
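A possible host-side launch for the writer above, sized so every group of 16 x-values is covered; everything outside the kernel call is a hypothetical sketch, and a real caller would zero the counters exactly as shown:

void launch_write_xypairs(uint32_t N, const uint32_t *d_input,
        uint64_t *d_xys_L, uint32_t *d_count_L, uint32_t start_L, uint32_t end_L,
        uint64_t *d_xys_R, uint32_t *d_count_R, uint32_t start_R, uint32_t end_R) {
    const int blockSize = 256;                      // matches the 256-entry shared staging buffers
    const uint32_t groups = N / 16;                 // one thread handles 16 x's
    const int numBlocks = (groups + blockSize - 1) / blockSize;
    cudaMemset(d_count_L, 0, sizeof(uint32_t));
    cudaMemset(d_count_R, 0, sizeof(uint32_t));
    gpu_chacha8_k32_kbc_ranges_LR_write_xypairs<<<numBlocks, blockSize>>>(N, d_input,
            d_xys_L, d_count_L, start_L, end_L,
            d_xys_R, d_count_R, start_R, end_R);
}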
-
-#define ATTACK_KBCFILTER_LR(chacha_y,i) \
-{ \
-    uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-    uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-    if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \
-        uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L; \
-        int slot = atomicAdd(&kbc_local_num_entries_L[local_kbc_bucket_id],1); \
-        Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \
-        if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-        uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-        kbc_local_entries_L[entries_address] = entry; \
-    } \
-    if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \
-        uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R; \
-        int slot = atomicAdd(&kbc_local_num_entries_R[local_kbc_bucket_id],1); \
-        Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \
-        if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-        uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-        kbc_local_entries_R[entries_address] = entry; \
-    } \
-}
-
-__global__
-void gpu_chacha8_k32_kbc_ranges_LR(const uint32_t N,
-        const __restrict__ uint32_t *input,
-        Tx_Bucketed_Meta1 *kbc_local_entries_L, int *kbc_local_num_entries_L, uint32_t KBC_START_L, uint32_t KBC_END_L,
-        Tx_Bucketed_Meta1 *kbc_local_entries_R, int *kbc_local_num_entries_R, uint32_t KBC_START_R, uint32_t KBC_END_R)
-{
-    uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
-    int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-    int stride = blockDim.x * gridDim.x;
-    const uint32_t end_n = N / 16; // 16 x's in each group
-
-    for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-        uint32_t x = x_group << 4; // *16;
-        uint32_t pos = x_group;
-
-        x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
-        x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
-        x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
-        x14 = input[14];x15 = input[15];
-
-        #pragma unroll
-        for (int i = 0; i < 4; i++) {
-            QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
-            QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
-        }
-
-        x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
-        x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
-        x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0;
-        x14 += input[14];x15 += input[15];
-
-        // convert to little endian/big endian whatever, chia needs it like this
-        BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
-        BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
-        BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
-
-        //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-        //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-        ATTACK_KBCFILTER_LR(x0,0);ATTACK_KBCFILTER_LR(x1,1);ATTACK_KBCFILTER_LR(x2,2);ATTACK_KBCFILTER_LR(x3,3);
-        ATTACK_KBCFILTER_LR(x4,4);ATTACK_KBCFILTER_LR(x5,5);ATTACK_KBCFILTER_LR(x6,6);ATTACK_KBCFILTER_LR(x7,7);
-        ATTACK_KBCFILTER_LR(x8,8);ATTACK_KBCFILTER_LR(x9,9);ATTACK_KBCFILTER_LR(x10,10);ATTACK_KBCFILTER_LR(x11,11);
-        ATTACK_KBCFILTER_LR(x12,12);ATTACK_KBCFILTER_LR(x13,13);ATTACK_KBCFILTER_LR(x14,14);ATTACK_KBCFILTER_LR(x15,15);
-    }
-}
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_attack_find_t1_matches(uint16_t table, uint32_t start_kbc_L, uint32_t end_kbc_R,
-        const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries,
-        BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) {
-    // T1 match: 1714 ms -> with delaying extras: 1630
-    // Total tables time: 73726 ms
-    // match: 10015 ms -> 9705ms with delaying extras
-    const uint16_t NUM_RMAPS = (kBC/2)+1;
-    __shared__ int nick_rmap[NUM_RMAPS]; // positions and counts. Uses 30 bits: two 15-bit entries, each with the lower 9 bits for pos and counts from 1024 up
-    __shared__ uint32_t nick_rmap_extras_rl[32];
-    __shared__ uint16_t nick_rmap_extras_ry[32];
-    __shared__ uint16_t nick_rmap_extras_pos[32];
-    __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET];
-    __shared__ int total_matches;
-    __shared__ int num_extras;
-    __shared__ int y_duplicate_counts;
-
-    int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L;
-    uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L;
-
-    const uint8_t doPrint = 0;
-
-    if (gridDim.x != (end_kbc_R - start_kbc_L)) {
-        printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L);
-    }
-    int numThreadsInBlock = blockDim.x;
-    int threadId = threadIdx.x;
-    int threadStartScan = threadId;
-    int threadSkipScan = numThreadsInBlock;
-
-    const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET;
-    const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET;
-    const int num_L = kbc_local_num_entries[kbc_L_bucket_id];
-    const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)];
-    const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L];
-    const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R];
-
-    if (threadIdx.x == 0) {
-        total_matches = 0;
-        num_extras = 0;
-        y_duplicate_counts = 0;
-        if (doPrint > 1) {
-            printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R);
-            if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) {
-                printf("ERROR numL or numR > max entries\n");
-                return;
-            }
-            if ((num_L == 0) || (num_R == 0)) {
-                printf("ERROR: numL or numR is 0\n");
-                return;
-            }
-        }
-    }
-    // unfortunately to clear we have to do this
-    for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) {
-        nick_rmap[i] = 0;
-    }
-    __syncthreads(); // all written initialize data should sync
-
-    uint16_t parity = global_kbc_L_bucket_id % 2;
-
-    for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-        //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-        BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-        uint16_t r_y = R_entry.y;
-
-        // r_y's share a box across two adjacent values, so kbc_map just works out which half it's in.
-        int kbc_map = r_y / 2;
-        const int kbc_box_shift = (r_y % 2) * 15;
-        int add = 1024 << kbc_box_shift; // we add from the 10th bit up (shifted by the box it's in)
-
-        int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above)
-        rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-        if (rmap_value == 0) {
-            // if we added to an empty spot, we also add the pos_R here in the lower 9 bits of the box,
-            // and ONLY for this one.
-            atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift));
-            //if (printandquit) {
-            //    printf("r_y: %u  pos:%u\n", r_y, pos_R);
-            //}
-        } else {
-            // we hit a duplicate entry...add this to a row
-            int slot = atomicAdd(&num_extras, 1);
-            nick_rmap_extras_ry[slot] = r_y;
-            nick_rmap_extras_pos[slot] = pos_R;
-        }
-    }
-
-    __syncthreads(); // wait for all threads to write r_bid entries
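nick_rmap packs two 15-bit "boxes" per int: for r-side value v, box v%2 of word v/2 holds a 9-bit first-position in bits 0-8 and a count starting at bit 10 (hence the add of 1024). A host-side sketch that just makes the bit layout explicit (helper names are illustrative):

// one 32-bit word holds boxes for r_y = 2*w (low 15 bits) and r_y = 2*w+1 (bits 15-29)
int box_shift(uint16_t r_y)              { return (r_y % 2) * 15; }
int count_of(int word, uint16_t r_y)     { return ((word >> box_shift(r_y)) & 0x7FFF) >> 10; }
int first_pos_of(int word, uint16_t r_y) { return (word >> box_shift(r_y)) & 0x1FF; }
// adding (1024 << box_shift) bumps the count; adding (pos << box_shift) stores the
// first pos -- valid only while the box was empty, which is why duplicate r_y's
// spill into the nick_rmap_extras_* arrays instead.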
-
-    for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) {
-        //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L];
-        BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-        uint16_t l_y = L_entry.y;
-        //printf("scanning for pos_L: %u\n", pos_L);
-
-        for (int m=0;m<64;m++) {
-
-            //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup
-            // is super-inefficient.
-
-            uint16_t indJ = l_y / kC;
-            uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC);
-
-            // find which box our r_target is in, extract the 15-bit value from that box
-            int kbc_map = r_target / 2;
-            const int kbc_box_shift = (r_target % 2) * 15;
-            int rmap_value = nick_rmap[kbc_map];
-            rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-
-            if (rmap_value > 0) {
-                // the pos_R is the lower 9 bits of that 15-bit boxed value
-                uint16_t pos_R = rmap_value & 0b0111111111;
-                uint16_t count = rmap_value / 1024;
-
-                int num_matches = atomicAdd(&total_matches,1); //count);
-                if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-                    printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-                } else {
-                    Index_Match match = { };
-                    match.idxL = pos_L;
-                    match.idxR = pos_R;
-                    matches[num_matches] = match;
-
-                    // handle edge cases
-                    // TODO: let's push these into a separate array
-                    // then test them later.
-                    if (count > 1) {
-                        int slot = atomicAdd(&y_duplicate_counts, 1);
-                        nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L;
-                    }
-                }
-            }
-        }
-    }
-
-    __syncthreads();
-
-    // do the extras
-
-    //int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add!
-    for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) {
-        for (int i=0; i<y_duplicate_counts; i++) {
-            uint32_t value = nick_rmap_extras_rl[i];
-            uint16_t r_target = value >> 16;
-            uint16_t pos_L = value & 0x0FFFF;
-            if (nick_rmap_extras_ry[slot] == r_target) {
-                uint16_t extra_pos_R = nick_rmap_extras_pos[slot];
-                Index_Match match = { };
-                match.idxL = pos_L;
-                match.idxR = extra_pos_R;
-                int num_matches = atomicAdd(&total_matches,1);
-                matches[num_matches] = match;
-                //matches[total_matches+slot] = match;
-                //if (doPrint > 1) {
-                //    printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot);
-                //}
-            }
-        }
-    }
-
-    __syncthreads();
-
-    if (threadIdx.x == 0) {
-        if (doPrint>1) {
-            // only do this once, should be in constant memory
-            //if (doPrint>2) {
-            //    printf("match list\n");
-            //    for (int i=0;i<total_matches;i++) printf("  L:%u R:%u\n", matches[i].idxL, matches[i].idxR);
-            //}
-        }
-        if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-            printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-            total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-        }
-    }
-
-    __syncthreads();
-
-    // now we go through all our matches and output to next round.
-    for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) {
-        Index_Match match = matches[i];
-        BUCKETED_ENTRY_OUT pair = {};
-        BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL];
-        BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR];
-        uint64_t blake_result;
-        uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id);
-
-        pair.meta[0] = L_Entry.meta[0];
-        pair.meta[1] = R_Entry.meta[0];
-        //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms
-        nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL);
-        //if (global_kbc_L_bucket_id == 1) {
-        //    printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result);
-        //}
-
-        uint64_t batch_bucket = blake_result >> (38-6); // setting this to 0 (seq.) changes from 57ms to 48ms.
-        const uint64_t block_mod = (uint64_t) 1 << (38-6);
-        pair.y = (uint32_t) (blake_result % block_mod);
-        int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1);
-        uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES + block_slot;
-        if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-            printf("ERROR: results address overflow\n");
-        } else {
-            bucketed_out[pair_address] = pair;
-        }
-    }
-}
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_attack_find_t1_matches_out_kbc(uint16_t table, uint32_t start_kbc_L, uint32_t end_kbc_R,
-        const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries,
-        BUCKETED_ENTRY_OUT *kbc_out, unsigned int *out_kbc_counts, const uint32_t MAX_KBC_ENTRIES) {
-    // T1 match: 1714 ms -> with delaying extras: 1630
-    // Total tables time: 73726 ms
-    // match: 10015 ms -> 9705ms with delaying extras
-    const uint16_t NUM_RMAPS = (kBC/2)+1;
-    __shared__ int nick_rmap[NUM_RMAPS]; // positions and counts. Uses 30 bits: two 15-bit entries, each with the lower 9 bits for pos and counts from 1024 up
-    __shared__ uint32_t nick_rmap_extras_rl[32];
-    __shared__ uint16_t nick_rmap_extras_ry[32];
-    __shared__ uint16_t nick_rmap_extras_pos[32];
-    __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET];
-    __shared__ BUCKETED_ENTRY_IN kbc_L_entries[KBC_MAX_ENTRIES_PER_BUCKET];
-    __shared__ BUCKETED_ENTRY_IN kbc_R_entries[KBC_MAX_ENTRIES_PER_BUCKET];
-    __shared__ int total_matches;
-    __shared__ int num_extras;
-    __shared__ int y_duplicate_counts;
-
-    int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L;
-    uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L;
-
-    const uint8_t doPrint = 0;
-
-    if (gridDim.x != (end_kbc_R - start_kbc_L)) {
-        printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L);
-    }
-    int numThreadsInBlock = blockDim.x;
-    int threadId = threadIdx.x;
-    int threadStartScan = threadId;
-    int threadSkipScan = numThreadsInBlock;
-
-    const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET;
-    const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET;
-    const int num_L = kbc_local_num_entries[kbc_L_bucket_id];
-    const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)];
-
-    for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-        kbc_R_entries[pos_R] = kbc_local_entries[start_R+pos_R];
-    }
-    for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) {
-        kbc_L_entries[pos_L] = kbc_local_entries[start_L+pos_L];
-    }
-
-    if (threadIdx.x == 0) {
-        total_matches = 0;
-        num_extras = 0;
-        y_duplicate_counts = 0;
-        if (doPrint > 1) {
-            printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R);
-            if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) {
-                printf("ERROR numL or numR > max entries\n");
-                return;
-            }
-            if ((num_L == 0) || (num_R == 0)) {
-                printf("ERROR: numL or numR is 0\n");
-                return;
-            }
-        }
-    }
-    // unfortunately to clear we have to do this
-    for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) {
-        nick_rmap[i] = 0;
-    }
-    __syncthreads(); // all written initialize data should sync
-
-    uint16_t parity = global_kbc_L_bucket_id % 2;
-
-    for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-        //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-        BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-        uint16_t r_y = R_entry.y;
-
-        // r_y's share a box across two adjacent values, so kbc_map just works out which half it's in.
-        int kbc_map = r_y / 2;
-        const int kbc_box_shift = (r_y % 2) * 15;
-        int add = 1024 << kbc_box_shift; // we add from the 10th bit up (shifted by the box it's in)
-
-        int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above)
-        rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-        if (rmap_value == 0) {
-            // if we added to an empty spot, we also add the pos_R here in the lower 9 bits of the box,
-            // and ONLY for this one.
-            atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift));
-            //if (printandquit) {
-            //    printf("r_y: %u  pos:%u\n", r_y, pos_R);
-            //}
-        } else {
-            // we hit a duplicate entry...add this to a row
-            int slot = atomicAdd(&num_extras, 1);
-            nick_rmap_extras_ry[slot] = r_y;
-            nick_rmap_extras_pos[slot] = pos_R;
-        }
-    }
-
-    __syncthreads(); // wait for all threads to write r_bid entries
-
-    for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) {
-        //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L];
-        BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-        uint16_t l_y = L_entry.y;
-        //printf("scanning for pos_L: %u\n", pos_L);
-
-        for (int m=0;m<64;m++) {
-
-            //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup
-            // is super-inefficient.
-
-            uint16_t indJ = l_y / kC;
-            uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC);
-
-            // find which box our r_target is in, extract the 15-bit value from that box
-            int kbc_map = r_target / 2;
-            const int kbc_box_shift = (r_target % 2) * 15;
-            int rmap_value = nick_rmap[kbc_map];
-            rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-
-            if (rmap_value > 0) {
-                // the pos_R is the lower 9 bits of that 15-bit boxed value
-                uint16_t pos_R = rmap_value & 0b0111111111;
-                uint16_t count = rmap_value / 1024;
-
-                int num_matches = atomicAdd(&total_matches,1); //count);
-                if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-                    printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-                } else {
-                    Index_Match match = { };
-                    match.idxL = pos_L;
-                    match.idxR = pos_R;
-                    matches[num_matches] = match;
-
-                    // handle edge cases
-                    // TODO: let's push these into a separate array
-                    // then test them later.
-                    if (count > 1) {
-                        int slot = atomicAdd(&y_duplicate_counts, 1);
-                        nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L;
-                    }
-                }
-            }
-        }
-    }
-
-    __syncthreads();
-
-    // do the extras
-
-    //int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add!
-    for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) {
-        for (int i=0; i<y_duplicate_counts; i++) {
-            uint32_t value = nick_rmap_extras_rl[i];
-            uint16_t r_target = value >> 16;
-            uint16_t pos_L = value & 0x0FFFF;
-            if (nick_rmap_extras_ry[slot] == r_target) {
-                uint16_t extra_pos_R = nick_rmap_extras_pos[slot];
-                Index_Match match = { };
-                match.idxL = pos_L;
-                match.idxR = extra_pos_R;
-                int num_matches = atomicAdd(&total_matches,1);
-                matches[num_matches] = match;
-                //matches[total_matches+slot] = match;
-                //if (doPrint > 1) {
-                //    printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot);
-                //}
-            }
-        }
-    }
-
-    __syncthreads();
-
-    if (threadIdx.x == 0) {
-        if (doPrint>1) {
-            // only do this once, should be in constant memory
-            //if (doPrint>2) {
-            //    printf("match list\n");
-            //    for (int i=0;i<total_matches;i++) printf("  L:%u R:%u\n", matches[i].idxL, matches[i].idxR);
-            //}
-        }
-        if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-            printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-            total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-        }
-    }
-
-    __syncthreads();
-
-    // now we go through all our matches and output to next round.
-    for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) {
-        Index_Match match = matches[i];
-        BUCKETED_ENTRY_OUT pair = {};
-        BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL];
-        BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR];
-        uint64_t blake_result;
-        uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id);
-
-        pair.meta[0] = L_Entry.meta[0];
-        pair.meta[1] = R_Entry.meta[0];
-        //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms
-        nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL);
-
-        //uint32_t batch_bucket = blake_result >> (38-6); // setting this to 0 (seq.) changes from 57ms to 48ms.
-
-        //if ((pair.meta[0] == 1320788535) || (pair.meta[0] == 2131394289)) {
-        //    printf("Got y %llu batch:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, batch_bucket, L_Entry.meta[0], R_Entry.meta[0], blake_result);
-        //}
-
-        uint32_t kbc_bucket = blake_result / kBC;
-        pair.y = (uint32_t) (blake_result % kBC);
-        //if (batch_bucket == 49) {
-        //int block_slot = atomicAdd(&out_kbc_counts[kbc_bucket],1);
-
-        // slightly faster and more memory efficient anyway:
-        // eight 4-bit counters live in each word, so one atomicAdd of a shifted 1 reserves this bucket's next slot
-        uint32_t kbc_bitmask_bucket = kbc_bucket / 8;
-        uint32_t kbc_bitmask_shift = 4*(kbc_bucket % 8);
-        unsigned int kbc_bitmask_add = 1 << (kbc_bitmask_shift);
-        unsigned int bitadd = atomicAdd(&out_kbc_counts[kbc_bitmask_bucket],kbc_bitmask_add);
-        uint32_t block_slot = (bitadd >> (kbc_bitmask_shift)) & 0b01111;
-
-/*
- * Doing T1
- *   chacha L1 time: 35 ms
- *   match T1 L time: 18 ms
- *   match T1 R time: 18 ms
- *   match T2 L time: 22 ms
- * Freeing memory...
- * GPU DISPLAY T2 MATCH RESULTS:
- *   block 22 entry 3140 x1:1320788535 x2:3465356684 x3:2131394289 x4:606438761
- *   TOTAL: 262341
- *
- * Doing T1
- *   chacha L1 time: 36 ms
- *   match T1 L time: 19 ms
- *   match T1 R time: 19 ms
- *   match T2 L time: 22 ms
- * Freeing memory...
- * GPU DISPLAY T2 MATCH RESULTS:
- *   block 22 entry 3140 x1:1320788535 x2:3465356684 x3:2131394289 x4:606438761
- *   TOTAL: 262341
- */
-
-        if (block_slot >= MAX_KBC_ENTRIES) {
-            printf("block_slot > MAX %u\n", block_slot);
-        } else {
-            uint32_t pair_address = kbc_bucket * MAX_KBC_ENTRIES + block_slot;
-            //if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-            //    printf("ERROR: results address overflow\n");
-            //} else {
-                kbc_out[pair_address] = pair;
-            //}
-        }
-        //} // TOKENPOD
-    }
-}
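The out_kbc path above squeezes eight bucket counters into each unsigned int: 4 bits per bucket, reserved via one atomicAdd of a shifted 1. A compact sketch of the reserve/read pair (names here are illustrative; valid only while every count stays below 16):

// reserve a slot in bucket b; counts[] holds 8 nibble counters per 32-bit word
__device__ uint32_t reserve_slot(unsigned int *counts, uint32_t b) {
    uint32_t word = b / 8;
    uint32_t shift = 4 * (b % 8);
    unsigned int old = atomicAdd(&counts[word], 1u << shift); // bump only our nibble
    return (old >> shift) & 0xF; // previous count == our slot
}

__device__ uint32_t read_count(const unsigned int *counts, uint32_t b) {
    return (counts[b / 8] >> (4 * (b % 8))) & 0xF;
}

The caveat: a nibble that reaches 15 and is bumped again carries into its neighbor, so this only works when per-bucket occupancy is provably small, as it is for the sparse kbc output here.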
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_attack_find_tx_LR_matches(uint16_t table, uint32_t start_kbc_L, uint32_t end_kbc_R,
-        const BUCKETED_ENTRY_IN *kbc_local_entries_L, const int *kbc_local_num_entries_L,
-        const BUCKETED_ENTRY_IN *kbc_local_entries_R, const int *kbc_local_num_entries_R,
-        BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) {
-
-    __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; // TODO: this could be smaller
-    __shared__ int total_matches;
-
-    int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L;
-    uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L;
-
-    const uint8_t doPrint = 0;
-
-    if (gridDim.x != (end_kbc_R - start_kbc_L)) {
-        printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L);
-    }
-    int numThreadsInBlock = blockDim.x;
-    int threadId = threadIdx.x;
-    int threadStartScan = threadId;
-    int threadSkipScan = numThreadsInBlock;
-
-    const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET;
-    const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET;
-    const int num_L = kbc_local_num_entries_L[kbc_L_bucket_id];
-    const int num_R = kbc_local_num_entries_R[(kbc_L_bucket_id+1)];
-    const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries_L[start_L];
-    const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries_R[start_R];
-
-    if (threadIdx.x == 0) {
-        total_matches = 0;
-        if (doPrint > 1) {
-            printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R);
-            if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) {
-                printf("ERROR numL or numR > max entries\n");
-                return;
-            }
-            if ((num_L == 0) || (num_R == 0)) {
-                printf("ERROR: numL or numR is 0\n");
-                return;
-            }
-        }
-    }
-    if ((num_L == 0) || (num_R == 0)) {
-        return;
-    }
-
-    __syncthreads(); // all written initialize data should sync
-
-    // For any 0 <= m < kExtraBitsPow:
-    //   yl / kBC + 1 = yR / kBC   AND
-    //   (yr % kBC) / kC - (yl % kBC) / kC = m (mod kB)   AND
-    //   (yr % kBC) % kC - (yl % kBC) % kC = (2m + (yl/kBC) % 2)^2 (mod kC)
-
-    for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-        //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-        BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-        int16_t yr_kbc = R_entry.y;
-        int16_t yr_bid = yr_kbc / kC; // values [0..kB]
-        for (uint16_t pos_L = 0; pos_L < num_L; pos_L++) {
-            // do L_entry and R_entry match?
-            BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-            int16_t yl_kbc = L_entry.y;
-            int16_t yl_bid = yl_kbc / kC; // values [0..kB]
-            int16_t formula_one = yr_bid - yl_bid; // this should actually give m
-            if (formula_one < 0) {
-                formula_one += kB;
-            }
-            int16_t m = formula_one;
-            if (m >= kB) {
-                m -= kB;
-            }
-            if (m < 64) {
-                // passed first test
-                int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC is perfectly divisible by kC
-                int16_t yr_cid = yr_kbc % kC;
-                int16_t parity = (global_kbc_L_bucket_id) % 2;
-                int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127]
-                int16_t formula_two = yr_cid - yl_cid;
-                if (formula_two < 0) {
-                    formula_two += kC;
-                }
-                if (formula_two == m2_parity_squared) {
-                    // we have a match.
-                    int num_matches = atomicAdd(&total_matches,1);
-                    if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-                        printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-                    } else {
-                        Index_Match match = { };
-                        match.idxL = pos_L;
-                        match.idxR = pos_R; //value >> 4;
-                        matches[num_matches] = match;
-                    }
-                }
-            }
-        }
-    }
-
-    __syncthreads();
-
-    if (threadIdx.x == 0) {
-        if (doPrint>1) {
-            // only do this once, should be in constant memory
-            //if (doPrint>2) {
-            //    printf("match list\n");
-            //    for (int i=0;i<total_matches;i++) printf("  L:%u R:%u\n", matches[i].idxL, matches[i].idxR);
-            //}
-        }
-        if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-            printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-            total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-        }
-    }
-
-    __syncthreads();
-
-    // now we go through all our matches and output to next round.
-    for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) {
-        Index_Match match = matches[i];
-        BUCKETED_ENTRY_OUT pair = {};
-        BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL];
-        BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR];
-        uint64_t blake_result;
-        uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id);
-        if (table == 1) {
-            pair.meta[0] = L_Entry.meta[0];
-            pair.meta[1] = R_Entry.meta[0];
-            //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms
-            nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL);
-            //if (global_kbc_L_bucket_id == 1) {
-            //    printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result);
-            //}
-        } else if (table == 2) {
-            pair.meta[0] = L_Entry.meta[0];
-            pair.meta[1] = L_Entry.meta[1];
-            pair.meta[2] = R_Entry.meta[0];
-            pair.meta[3] = R_Entry.meta[1];
-            nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL);
-            //if (global_kbc_L_bucket_id == 1) {
-            //    uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1];
-            //    uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3];
-            //    printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result);
-            //}
-        } else if (table == 3) {
-            const uint32_t meta[8] = {
-                L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3],
-                R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3]
-            };
-            nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta);
-        } else if (table == 4) {
-            const uint32_t meta[8] = {
-                L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3],
-                R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3]
-            };
-            nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta);
-        } else if (table == 5) {
-            const uint32_t meta[6] = {
-                L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2],
-                R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2],
-            };
-            nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta);
-        } else if (table == 6) {
-            const uint32_t meta[4] = {
-                L_Entry.meta[0], L_Entry.meta[1],
-                R_Entry.meta[0], R_Entry.meta[1]
-            };
-            nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL);
-        }
-        if (table < 6) {
-            uint64_t batch_bucket = blake_result >> (38-6);
-            const uint64_t block_mod = (uint64_t) 1 << (38-6);
-            pair.y = (uint32_t) (blake_result % block_mod);
-            int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1);
-            uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES + block_slot;
-            if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-                printf("ERROR: results address overflow\n");
-            } else {
-                //bucketed_out[pair_address] = pair;
-            }
-        }
-    }
-}
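The LR kernels test Chia's matching condition directly, per the three-line comment above, instead of going through the rmap. Restated for adjacent kbc buckets (bucket_id = yl/kBC, parity = bucket_id % 2), a scalar host-side reference looks like this (kB = 119, kC = 127 as in chiapos, kExtraBitsPow = 64; the local constants shadow the project's):

static const int kB_ref = 119, kC_ref = 127;

bool is_match(uint32_t bucket_id_L, uint16_t yl_kbc, uint16_t yr_kbc) {
    int m = yr_kbc / kC_ref - yl_kbc / kC_ref;   // difference of b-ids
    if (m < 0) m += kB_ref;                      // mod kB
    if (m >= 64) return false;                   // only m in [0, kExtraBitsPow)
    int parity = bucket_id_L % 2;
    int rhs = ((2 * m + parity) * (2 * m + parity)) % kC_ref;
    int diff = yr_kbc % kC_ref - yl_kbc % kC_ref; // difference of c-ids
    if (diff < 0) diff += kC_ref;                 // mod kC
    return diff == rhs;
}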
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_attack_find_tx_LR_matches_global(uint16_t table, uint32_t start_kbc_L, uint32_t end_kbc_R,
-        const BUCKETED_ENTRY_IN *kbc_global_entries_L, const unsigned int *kbc_global_num_entries_L,
-        const BUCKETED_ENTRY_IN *kbc_global_entries_R, const unsigned int *kbc_global_num_entries_R,
-        BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts,
-        uint32_t KBC_MAX_ENTRIES, uint32_t BLOCK_MAX_ENTRIES) {
-
-    __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; // TODO: this could be smaller
-    __shared__ int total_matches;
-    //__shared__ int num_L;
-    //__shared__ int num_R;
-
-    int global_kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L;
-
-    const uint8_t doPrint = 0;
-
-    if (gridDim.x != kBC_NUM_BUCKETS) {
-        printf("ERROR: GRIDDIM %u MUST EQUAL KBC NUM BUCKETS %u\n", gridDim.x, kBC_NUM_BUCKETS);
-    }
-    int numThreadsInBlock = blockDim.x;
-    int threadId = threadIdx.x;
-    int threadStartScan = threadId;
-    int threadSkipScan = numThreadsInBlock;
-
-    const uint32_t start_L = global_kbc_L_bucket_id*KBC_MAX_ENTRIES;
-    const uint32_t start_R = (global_kbc_L_bucket_id+1)*KBC_MAX_ENTRIES;
-
-    //if (threadIdx.x == 0) {
-    uint32_t kbc_bitmask_bucket = global_kbc_L_bucket_id / 8;
-    uint32_t kbc_bitmask_shift = 4*(global_kbc_L_bucket_id % 8);
-    uint32_t bitvalue = kbc_global_num_entries_L[kbc_bitmask_bucket];
-    const unsigned int num_L = (bitvalue >> (kbc_bitmask_shift)) & 0b01111;
-    //}
-    //if (threadIdx.x == 1) {
-    kbc_bitmask_bucket = (global_kbc_L_bucket_id + 1) / 8;
-    kbc_bitmask_shift = 4*((global_kbc_L_bucket_id + 1) % 8);
-    bitvalue = kbc_global_num_entries_R[kbc_bitmask_bucket];
-    const unsigned int num_R = (bitvalue >> (kbc_bitmask_shift)) & 0b01111;
-    //}
-    //__syncthreads();
-    //const int num_L = kbc_global_num_entries_L[global_kbc_L_bucket_id];
-    //const int num_R = kbc_global_num_entries_R[(global_kbc_L_bucket_id+1)];
-    if ((num_L == 0) || (num_R == 0)) {
-        return;
-    }
-
-    const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_global_entries_L[start_L];
-    const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_global_entries_R[start_R];
-
-    if (threadIdx.x == 0) {
-        total_matches = 0;
-    }
-    __syncthreads(); // all written initialize data should sync
-
-    // For any 0 <= m < kExtraBitsPow:
-    //   yl / kBC + 1 = yR / kBC   AND
-    //   (yr % kBC) / kC - (yl % kBC) / kC = m (mod kB)   AND
-    //   (yr % kBC) % kC - (yl % kBC) % kC = (2m + (yl/kBC) % 2)^2 (mod kC)
-
-    for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-        //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-        BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-        int16_t yr_kbc = R_entry.y;
-        int16_t yr_bid = yr_kbc / kC; // values [0..kB]
-        for (uint16_t pos_L = 0; pos_L < num_L; pos_L++) {
-            // do L_entry and R_entry match?
-            BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-            int16_t yl_kbc = L_entry.y;
-            int16_t yl_bid = yl_kbc / kC; // values [0..kB]
-            int16_t formula_one = yr_bid - yl_bid; // this should actually give m
-            if (formula_one < 0) {
-                formula_one += kB;
-            }
-            int16_t m = formula_one;
-            if (m >= kB) {
-                m -= kB;
-            }
-            if (m < 64) {
-                // passed first test
-                int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC is perfectly divisible by kC
-                int16_t yr_cid = yr_kbc % kC;
-                int16_t parity = (global_kbc_L_bucket_id) % 2;
-                int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127]
-                int16_t formula_two = yr_cid - yl_cid;
-                if (formula_two < 0) {
-                    formula_two += kC;
-                }
-                if (formula_two == m2_parity_squared) {
-                    // we have a match.
-                    int num_matches = atomicAdd(&total_matches,1);
-                    if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-                        printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-                    } else {
-                        Index_Match match = { };
-                        match.idxL = pos_L;
-                        match.idxR = pos_R; //value >> 4;
-                        matches[num_matches] = match;
-                    }
-                }
-            }
-        }
-    }
-
-    __syncthreads();
-
-    if (threadIdx.x == 0) {
-        if (doPrint>1) {
-            // only do this once, should be in constant memory
-            //if (doPrint>2) {
-            //    printf("match list\n");
-            //    for (int i=0;i<total_matches;i++) printf("  L:%u R:%u\n", matches[i].idxL, matches[i].idxR);
-            //}
-        }
-        if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-            printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-            total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-        }
-    }
-
-    __syncthreads();
-
-    // now we go through all our matches and output to next round.
-    for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) {
-        Index_Match match = matches[i];
-        BUCKETED_ENTRY_OUT pair = {};
-        BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL];
-        BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR];
-        uint64_t blake_result;
-        uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id);
-        if (table == 1) {
-            pair.meta[0] = L_Entry.meta[0];
-            pair.meta[1] = R_Entry.meta[0];
-            //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms
-            nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL);
-            //if (global_kbc_L_bucket_id == 1) {
-            //    printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result);
-            //}
-        } else if (table == 2) {
-            pair.meta[0] = L_Entry.meta[0];
-            pair.meta[1] = L_Entry.meta[1];
-            pair.meta[2] = R_Entry.meta[0];
-            pair.meta[3] = R_Entry.meta[1];
-            //printf("Got t2 match x1: %u x2: %u x3: %u x4: %u\n", L_Entry.meta[0], L_Entry.meta[1], R_Entry.meta[0], R_Entry.meta[1]);
-
-            nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL);
-            //if (global_kbc_L_bucket_id == 1) {
-            //    uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1];
-            //    uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3];
-            //}
-        } else if (table == 3) {
-            const uint32_t meta[8] = {
-                L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3],
-                R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3]
-            };
-            nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta);
-        } else if (table == 4) {
-            const uint32_t meta[8] = {
-                L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3],
-                R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3]
-            };
-            nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta);
-        } else if (table == 5) {
-            const uint32_t meta[6] = {
-                L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2],
-                R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2],
-            };
-            nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta);
-        } else if (table == 6) {
-            const uint32_t meta[4] = {
-                L_Entry.meta[0], L_Entry.meta[1],
-                R_Entry.meta[0], R_Entry.meta[1]
-            };
-            nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL);
-        }
-        if (table < 6) {
-            uint64_t batch_bucket = blake_result >> (38-6);
-            const uint64_t block_mod = (uint64_t) 1 << (38-6);
-            pair.y = (uint32_t) (blake_result % block_mod);
-            int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1);
-            uint32_t pair_address = batch_bucket * BLOCK_MAX_ENTRIES + block_slot;
-            //if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-            //    printf("ERROR: results address overflow\n");
-            //} else {
-                bucketed_out[pair_address] = pair;
-            //}
-        }
-    }
-}
-
-template <typename BUCKETED_ENTRY>
-__global__
-void gpu_attack_merge_block_buckets_into_kbc_buckets(
-        const uint32_t KBC_START_ID, // determined by batch_id
-        const BUCKETED_ENTRY *in, uint64_t batch_bucket_add_Y, const uint32_t N,
-        BUCKETED_ENTRY *local_kbc_entries, int *local_kbc_counts)
-{
-    uint32_t i = blockIdx.x*blockDim.x+threadIdx.x;
-    if (i < N) {
-        // TODO: try just reading out entries and see if they match when going in
-
-        BUCKETED_ENTRY block_entry = in[i];
-        uint64_t calc_y = (uint64_t) block_entry.y + batch_bucket_add_Y;
-        uint32_t kbc_id = calc_y / kBC;
-        //uint32_t KBC_END_ID = KBC_START_ID + KBC_LOCAL_NUM_BUCKETS;
-        //if ((kbc_id < KBC_START_ID) || (kbc_id > KBC_END_ID)) {
-        //    printf(" i:%u entry.y:%u add_Y:%llu calc_y:%llu OUT OF RANGE: kbc id: %u KBC_LOCAL_NUM_BUCKETS:%u START:%u END:%u\n", i, block_entry.y, batch_bucket_add_Y, calc_y, kbc_id, KBC_LOCAL_NUM_BUCKETS, KBC_START_ID, KBC_END_ID);
-        //}
-
-        uint32_t local_kbc_id = kbc_id - KBC_START_ID;
-        int slot = atomicAdd(&local_kbc_counts[local_kbc_id],1);
-        uint32_t destination_address = local_kbc_id * KBC_MAX_ENTRIES_PER_BUCKET + slot;
-
-        //printf("block_id:%u [i: %u] entry.y:%u kbc_id:%u local_kbc:%u slot:%u dest:%u\n",
-        //        block_id, i, block_entry.y, kbc_id, local_kbc_id, slot, destination_address);
-
-        if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) {
-            printf("OVERFLOW: slot > MAX ENTRIES PER BUCKET\n");
-        }
-        if (destination_address > DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-            printf("OVERFLOW: destination_address overflow > DEVICE_BUFFER_ALLOCATED_ENTRIES %u\n", destination_address);
-        }
-        block_entry.y = calc_y % kBC; // hah! Don't forget to map it to kbc bucket form.
-        local_kbc_entries[destination_address] = block_entry;
-    }
-}
-
-__global__
-void gpu_list_local_kbc_entries(int *kbc_num_entries, int from, int to, int skip) {
-    for (int i=from; i<to; i+=skip) {
-        uint32_t kbc_bitmask_bucket = i / 8;
-        uint32_t kbc_bitmask_shift = 4*(i % 8);
-        uint32_t bitvalue = kbc_num_entries[kbc_bitmask_bucket];
-        int num = (bitvalue >> (kbc_bitmask_shift)) & 0b01111;
-
-        printf("kbc %u : %u\n", i, num);
-    }
-}
-
-//#include "attack_method_kbc_list.hpp"
-#include "attack_method_lxs.hpp"
-//#include "attack_method_2.hpp" // this is current working one
-//#include "attack_method_xpairbits.hpp"
-
-void attack_it() {
-    std::cout << "Attack it!" << std::endl;
-
-    //uint32_t bits = 10;
-    //attack_method_2(bits);
-
-    //attack_method_xpairbits();
-    attack_method_lxs(6000000);
-    return;
-
-    //auto sort_start = std::chrono::high_resolution_clock::now();
-    //thrust::device_ptr<uint32_t> device_xs_L_ptr(device_xs_L);
-    //thrust::device_ptr<uint32_t> device_ys_L_ptr(device_ys_L);
-    //thrust::sort_by_key(device_ys_L_ptr, device_ys_L_ptr + xs_count_L[0], device_xs_L_ptr);
-    //CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-    //auto sort_finish = std::chrono::high_resolution_clock::now();
-    //std::cout << "   sort time: " << std::chrono::duration_cast<std::chrono::milliseconds>(sort_finish - sort_start).count() << " ms\n";
-    // why is the 2nd sort 31ms and the first sort 8ms!?!?
-    //sort_start = std::chrono::high_resolution_clock::now();
-    //thrust::device_ptr<uint32_t> device_xs_R_ptr(device_xs_R);
-    //thrust::device_ptr<uint32_t> device_ys_R_ptr(device_ys_R);
-    //thrust::sort_by_key(device_ys_R_ptr, device_ys_R_ptr + xs_count_R[0], device_xs_R_ptr);
-    //CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-    //sort_finish = std::chrono::high_resolution_clock::now();
-    //std::cout << "   sort time: " << std::chrono::duration_cast<std::chrono::milliseconds>(sort_finish - sort_start).count() << " ms\n";
-
-    /*auto matchT1_start = std::chrono::high_resolution_clock::now();
-    CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts_L, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599
-    gpu_attack_find_t1_matches<<<(KBC_END_L - KBC_START_L), 256>>>(1, batch_id_L, KBC_START_L, KBC_END_L,
-            T0_local_kbc_entries_L, device_local_kbc_num_entries_L,
-            T1_L_batch_match_results, device_block_entry_counts_L);
-    CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-    auto matchT1_finish = std::chrono::high_resolution_clock::now();
-    std::cout << "   match T1 L time: " << std::chrono::duration_cast<std::chrono::milliseconds>(matchT1_finish - matchT1_start).count() << " ms\n";
-
-    matchT1_start = std::chrono::high_resolution_clock::now();
-    CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts_R, 0, (BATCHES)*sizeof(int)));
-    gpu_attack_find_t1_matches<<<(KBC_END_R - KBC_START_R), 256>>>(1, batch_id_R, KBC_START_R, KBC_END_R,
-            T0_local_kbc_entries_R, device_local_kbc_num_entries_R,
-            T1_R_batch_match_results, device_block_entry_counts_R);
-    CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-    matchT1_finish = std::chrono::high_resolution_clock::now();
-    std::cout << "   match T1 R time: " << std::chrono::duration_cast<std::chrono::milliseconds>(matchT1_finish - matchT1_start).count() << " ms\n";
-
-    auto t1_finish = std::chrono::high_resolution_clock::now();
-    std::cout << "   T1 total time: " << std::chrono::duration_cast<std::chrono::milliseconds>(t1_finish - t1_start).count() << " ms\n";
-
-    auto mergekbcs_start = std::chrono::high_resolution_clock::now();
-    // clear our local kbc num entries as these will be written with new data
-
-    Tx_Bucketed_Meta2 *T1_local_kbc_entries_L = (Tx_Bucketed_Meta2 *) &device_local_kbc_entries_L[0]; // will replace...
-    Tx_Bucketed_Meta2 *T1_local_kbc_entries_R = (Tx_Bucketed_Meta2 *) &device_local_kbc_entries_R[0];
-    // clump block-0-batch_id_L block-0-batch_id_R into same group and solve.
-    auto matchTx_start = std::chrono::high_resolution_clock::now();
-    auto matchTx_finish = std::chrono::high_resolution_clock::now();
-    auto mergeTx_start = std::chrono::high_resolution_clock::now();
-    auto mergeTx_finish = std::chrono::high_resolution_clock::now();
-    uint64_t total_match_time_micros = 0;
-    uint64_t total_merge_time_micros = 0;
-    uint32_t global_block_counts[BATCHES] = {0};
-    for (uint32_t block_id = 0; block_id < BATCHES; block_id++) {
-        CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_L, 0, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-        CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_R, 0, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-        uint32_t KBC_MERGE_BUCKET_START = MIN_KBC_BUCKET_FOR_BATCH(block_id);
-        uint32_t num_entries_to_copy = device_block_entry_counts_L[block_id];
-        int blockSize = 256;
-        int numBlocks = (num_entries_to_copy + blockSize - 1) / (blockSize);
-        uint64_t batch_bucket_add_Y = CALC_BATCH_BUCKET_ADD_Y(block_id); //(((uint64_t) 1) << (38-6)) * ((uint64_t) batch_id);
-
-        uint32_t block_address = block_id * HOST_MAX_BLOCK_ENTRIES;
-        Tx_Bucketed_Meta2 *in = &T1_L_batch_match_results[block_address];
-
-        //std::cout << "batch " << batch_id << " num_entries: " << num_entries_to_copy << std::endl;
-        mergeTx_start = std::chrono::high_resolution_clock::now();
-        gpu_attack_merge_block_buckets_into_kbc_buckets<<<numBlocks, blockSize>>>(
-                KBC_MERGE_BUCKET_START,
-                in, batch_bucket_add_Y, num_entries_to_copy,
-                T1_local_kbc_entries_L, device_local_kbc_num_entries_L);
-
-        num_entries_to_copy = device_block_entry_counts_R[block_id];
-        numBlocks = (num_entries_to_copy + blockSize - 1) / (blockSize);
-        in = &T1_R_batch_match_results[block_address];
-
-        //std::cout << "batch " << batch_id << " num_entries: " << num_entries_to_copy << std::endl;
-        gpu_attack_merge_block_buckets_into_kbc_buckets<<<numBlocks, blockSize>>>(
-                KBC_MERGE_BUCKET_START,
-                in, batch_bucket_add_Y, num_entries_to_copy,
-                T1_local_kbc_entries_R, device_local_kbc_num_entries_R);
-
-        // TODO: find matches in entries_L against entries_R...should be <16, avg around 3-4
-        // only have 2m entries...so...could sort 1m L's against 1m R's?
-        //auto matchTx_start = std::chrono::high_resolution_clock::now();
-        CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-        mergeTx_finish = std::chrono::high_resolution_clock::now();
-        total_merge_time_micros += std::chrono::duration_cast< std::chrono::microseconds >( mergeTx_finish - mergeTx_start ).count();
-
-        CUDA_CHECK_RETURN(cudaMemset(device_T2_block_entry_counts, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599
-
-        // yes this can be ram optimized to constrain MAX_ENTRIES to a fraction (at least 1/16th the size)
-        // yikes...577ms...terrible...CPU WOULD BE FASTER!!!
- matchTx_start = std::chrono::high_resolution_clock::now(); - gpu_attack_find_tx_LR_matches<<<(KBC_END_L - KBC_START_L), 8>>>(1, batch_id_L, KBC_START_L, KBC_END_L, - T1_local_kbc_entries_L, device_local_kbc_num_entries_L, - T1_local_kbc_entries_R, device_local_kbc_num_entries_R, - T2_batch_match_results, device_T2_block_entry_counts); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - matchTx_finish = std::chrono::high_resolution_clock::now(); - total_match_time_micros += std::chrono::duration_cast< std::chrono::microseconds >( matchTx_finish - matchTx_start ).count(); - - //total_match_time_ms += std::chrono::duration_cast(matchTx_finish - matchTx_start).count(); - for (int i = 0; i < BATCHES; i++) { - global_block_counts[i] += device_T2_block_entry_counts[i]; - } - - } - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - std::cout << " match t2 LR sum time: " << (total_match_time_micros/1000) << "ms" << std::endl; - std::cout << " merge t2 LR sum time: " << (total_merge_time_micros/1000) << "ms" << std::endl; - auto mergekbcs_finish = std::chrono::high_resolution_clock::now(); - std::cout << " T2 total time: " << std::chrono::duration_cast(mergekbcs_finish - mergekbcs_start).count() << " ms\n"; - //gpu_list_local_kbc_entries<<<1,1>>>(device_local_kbc_num_entries_L); -*/ - - /*{ - auto matchT2_start = std::chrono::high_resolution_clock::now(); - Tx_Bucketed_Meta2 *t2bucketed_kbc_entries_in = (Tx_Bucketed_Meta2 *) device_buffer_A; - Tx_Bucketed_Meta4 *t2bucketed_out = (Tx_Bucketed_Meta4 *) device_buffer_B; - - CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599 - - gpu_attack_find_t1_matches<<<(KBC_END - KBC_START), 256>>>(2, batch_id, KBC_START, KBC_END, - t2bucketed_kbc_entries_in, device_local_kbc_num_entries, - t2bucketed_out, device_block_entry_counts); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto matchT2_finish = std::chrono::high_resolution_clock::now(); - - std::cout << " match T2 time: " << std::chrono::duration_cast(matchT2_finish - matchT2_start).count() << " ms\n"; - //gpu_list_local_kbc_entries<<<1,1>>>(device_local_kbc_num_entries); - } -*/ - -} - - -#endif /* ATTACK_HPP_ */ diff --git a/attack_method_1.hpp b/attack_method_1.hpp deleted file mode 100644 index da79016..0000000 --- a/attack_method_1.hpp +++ /dev/null @@ -1,493 +0,0 @@ -/* - * attack_method_1.hpp - * - * Created on: Nov 2, 2021 - * Author: nick - */ - -#ifndef ATTACK_METHOD_1_HPP_ -#define ATTACK_METHOD_1_HPP_ - - - - -#define ATTACK_KBCFILTER_LR1LR2(chacha_y,i) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - uint32_t kbc_bucket_id = uint32_t (y / kBC); \ - if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L; \ - int slot = atomicAdd(&kbc_local_num_entries_L[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R; \ - int slot = atomicAdd(&kbc_local_num_entries_R[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC 
OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_L2) && (kbc_bucket_id <= KBC_END_L2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L2; \ - int slot = atomicAdd(&kbc_local_num_entries_L2[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L2[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R2) && (kbc_bucket_id <= KBC_END_R2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R2; \ - int slot = atomicAdd(&kbc_local_num_entries_R2[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R2[entries_address] = entry; \ - } \ -} - -__global__ -void gpu_chacha8_k32_kbc_ranges_LR1LR2(const uint32_t N, - const __restrict__ uint32_t *input, - Tx_Bucketed_Meta1 *kbc_local_entries_L, int *kbc_local_num_entries_L, uint32_t KBC_START_L, uint32_t KBC_END_L, - Tx_Bucketed_Meta1 *kbc_local_entries_R, int *kbc_local_num_entries_R, uint32_t KBC_START_R, uint32_t KBC_END_R, - Tx_Bucketed_Meta1 *kbc_local_entries_L2, int *kbc_local_num_entries_L2, uint32_t KBC_START_L2, uint32_t KBC_END_L2, - Tx_Bucketed_Meta1 *kbc_local_entries_R2, int *kbc_local_num_entries_R2, uint32_t KBC_START_R2, uint32_t KBC_END_R2) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - - for (uint32_t x_group = index; x_group <= end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 
28 -> x >> 22
-		//int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-		ATTACK_KBCFILTER_LR1LR2(x0,0);ATTACK_KBCFILTER_LR1LR2(x1,1);ATTACK_KBCFILTER_LR1LR2(x2,2);ATTACK_KBCFILTER_LR1LR2(x3,3);
-		ATTACK_KBCFILTER_LR1LR2(x4,4);ATTACK_KBCFILTER_LR1LR2(x5,5);ATTACK_KBCFILTER_LR1LR2(x6,6);ATTACK_KBCFILTER_LR1LR2(x7,7);
-		ATTACK_KBCFILTER_LR1LR2(x8,8);ATTACK_KBCFILTER_LR1LR2(x9,9);ATTACK_KBCFILTER_LR1LR2(x10,10);ATTACK_KBCFILTER_LR1LR2(x11,11);
-		ATTACK_KBCFILTER_LR1LR2(x12,12);ATTACK_KBCFILTER_LR1LR2(x13,13);ATTACK_KBCFILTER_LR1LR2(x14,14);ATTACK_KBCFILTER_LR1LR2(x15,15);
-	}
-}
-
-template <typename BUCKETED_ENTRY>
-__global__
-void gpu_attack_merge_block_buckets_into_kbc_buckets_with_kbc_count_limit(
-	const uint32_t KBC_START_ID, // determined by batch_id
-	const BUCKETED_ENTRY *in, uint64_t batch_bucket_add_Y, const uint32_t N,
-	BUCKETED_ENTRY *local_kbc_entries, int *local_kbc_counts,
-	const uint32_t MAX_KBC_ENTRIES)
-{
-	uint32_t i = blockIdx.x*blockDim.x+threadIdx.x;
-	//for (int i = 0; i < N; i++) {
-
-	if (i < N) {
-		// TODO: try just reading out entries and see if they match when going in
-
-		BUCKETED_ENTRY block_entry = in[i];
-		uint64_t calc_y = (uint64_t) block_entry.y + batch_bucket_add_Y;
-		uint32_t kbc_id = calc_y / kBC;
-		uint32_t KBC_END_ID = KBC_START_ID + KBC_LOCAL_NUM_BUCKETS;
-		if ((kbc_id < KBC_START_ID) || (kbc_id > KBC_END_ID)) {
-			printf(" i:%u entry.y:%u add_Y:%llu calc_y:%llu OUT OF RANGE: kbc id: %u KBC_LOCAL_NUM_BUCKETS:%u START:%u END:%u\n", i, block_entry.y, batch_bucket_add_Y, calc_y, kbc_id, KBC_LOCAL_NUM_BUCKETS, KBC_START_ID, KBC_END_ID);
-		}
-
-		uint32_t local_kbc_id = kbc_id - KBC_START_ID;
-		int slot = atomicAdd(&local_kbc_counts[local_kbc_id],1);
-		uint32_t destination_address = local_kbc_id * MAX_KBC_ENTRIES + slot;
-
-		//printf("block_id:%u [i: %u] entry.y:%u kbc_id:%u local_kbc:%u slot:%u dest:%u\n",
-		//		block_id, i, block_entry.y, kbc_id, local_kbc_id, slot, destination_address);
-
-		if (slot >= MAX_KBC_ENTRIES) {
-			printf("OVERFLOW: slot >= MAX ENTRIES PER BUCKET\n");
-		}
-		//if (destination_address > DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-		//	printf("OVERFLOW: destination_address overflow > DEVICE_BUFFER_ALLOCATED_ENTRIES %u\n", destination_address);
-		//}
-		block_entry.y = calc_y % kBC; // hah! Don't forget to map it to kbc bucket form.
-		local_kbc_entries[destination_address] = block_entry;
-	}
-}
-
-__global__
-void gpu_display_t2_match_results(Tx_Bucketed_Meta4 *T2_batch_match_results, int *device_T2_block_entry_counts, uint32_t MAX_ENTRIES_PER_BLOCK) {
-	printf("GPU DISPLAY T2 MATCH RESULTS:\n");
-	int total_counts = 0;
-	for (int i=0;i<BATCHES;i++) {
-		total_counts += device_T2_block_entry_counts[i];
-	}
-	printf("TOTAL: %u\n", total_counts);
-}
-
-std::vector<uint32_t> solution_xs = {1320788535,3465356684,2131394289,606438761,434033488,2479909174,3785038649,1942582046,438483300,2306941967,2327418650,184663264,3396904066,3057226705,2120150435,441715922,10459628,1281656413,88943898,810187686,112052271,2540716951,3073359813,4019528057,504026248,1706169436,2772410422,1772771468,607317630,4168020964,4286528917,2472944651,3546546119,1799281226,1202952199,1278165962,4062613743,2747217422,1182029562,1339760739,613483600,3661736730,1251588944,3140803170,2503085418,2541929248,4159128725,2325034733,4257771109,2804935474,2997421030,150533389,709945445,4159463930,714122558,1939000200,3291628318,1878268201,2874051942,2826426895,2146970589,4276159281,3509962078,2808839331};
-/*
- * Pair 0 x:1320788535 y:76835538515 kBC:5084069
-   Pair 1 x:3465356684 y:76835558195 kBC:5084070
-
-   Pair 2 x:2131394289 y:227752410271 kBC:15069966
-   Pair 3 x:606438761 y:227752417481 kBC:15069967
-
-   Pair 4 x:434033488 y:274225910406 kBC:18145034
-   Pair 5 x:2479909174 y:274225916708 kBC:18145035
-
-   Pair 6 x:3785038649 y:213830149496 kBC:14148756
-   Pair 7 x:1942582046 y:213830170524 kBC:14148757
-
-   Pair 8 x:438483300 y:248522697030 kBC:16444299
-   Pair 9 x:2306941967 y:248522719906 kBC:16444300
-   Pair 10 x:2327418650 y:23832869730 kBC:1576978
-   Pair 11 x:184663264 y:23832892290 kBC:1576979
-   Pair 12 x:3396904066 y:31837336818 kBC:2106619
-   Pair 13 x:3057226705 y:31837353261 kBC:2106620
-   Pair 14 x:2120150435 y:22313127263 kBC:1476419
-   Pair 15 x:441715922 y:22313149126 kBC:1476420
- */
-void attack_method_1(uint32_t bits) {
-
-	using milli = std::chrono::milliseconds;
-	auto attack_start = std::chrono::high_resolution_clock::now();
-
-	uint64_t BITS_DIVISOR = 1 << bits;
-
-	uint64_t target_kbc_L1 = 5084069;
-	uint64_t target_kbc_R1 = 15069966;
-	uint64_t bucket_L1 = ((target_kbc_L1 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t bucket_R1 = ((target_kbc_R1 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t KBC_START_L1 = (bucket_L1*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_L1 = ((bucket_L1+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_START_R1 = (bucket_R1*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_R1 = ((bucket_R1+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-
-	uint64_t target_kbc_L2 = 18145034;
-	uint64_t target_kbc_R2 = 14148756;
-	uint64_t bucket_L2 = ((target_kbc_L2 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t bucket_R2 = ((target_kbc_R2 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t KBC_START_L2 = (bucket_L2*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_L2 = ((bucket_L2+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_START_R2 = (bucket_R2*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_R2 = ((bucket_R2+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-
-	//Pair 0 x:1320788535 y:76835538515 kBC:5084069
-	//	Pair 1 x:3465356684 y:76835558195 kBC:5084070
-	//	Pair 2 x:2131394289 y:227752410271 kBC:15069966
-	//	Pair 3 x:606438761 y:227752417481 kBC:15069967
-
-	uint64_t KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS; // +1 is for including last R bucket space
-
-	uint64_t MAX_KBCS_POST_T1 = 16; // reduce if smaller selection based on initial t0 range.
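// The range arithmetic above picks, for each target kbc bucket, the enclosing 1/2^bits
// slice of the full kbc space: target -> coarse bucket id in [0, 2^bits), coarse id ->
// inclusive [KBC_START, KBC_END) range that brackets the target. The same computation
// as a standalone helper (helper name hypothetical):
void kbc_slice_for_target(uint64_t target_kbc, uint64_t bits_divisor,
                          uint64_t &kbc_start, uint64_t &kbc_end) {
	uint64_t coarse = ((target_kbc + 1) * bits_divisor) / kBC_NUM_BUCKETS;
	kbc_start = (coarse * kBC_NUM_BUCKETS) / bits_divisor;
	kbc_end = ((coarse + 1) * kBC_NUM_BUCKETS) / bits_divisor;
	// e.g. bits = 6 gives 64 slices, so each chacha pass only keeps roughly
	// 1/64th of all kbc buckets per L/R range.
}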
- uint32_t BLOCK_MAX_ENTRIES_T2 = HOST_MAX_BLOCK_ENTRIES / 16; - //uint32_t NUM_EXPECTED_ENTRIES_T1_MATCHES = 67108864; - uint32_t NUM_EXPECTED_ENTRIES_T2_MATCHES = 1048576; - if (bits == 6) { - KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS; - //NUM_EXPECTED_ENTRIES_T1_MATCHES = 67108864; - MAX_KBCS_POST_T1 = 16; - NUM_EXPECTED_ENTRIES_T2_MATCHES = 1048576; - BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32; - } else if (bits == 7) { - KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 2; - //NUM_EXPECTED_ENTRIES_T1_MATCHES = 33554432; - MAX_KBCS_POST_T1 = 14; - NUM_EXPECTED_ENTRIES_T2_MATCHES = 262144; - BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32; - } else if (bits == 8) { - KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 4; - //NUM_EXPECTED_ENTRIES_T1_MATCHES = 16777216; - MAX_KBCS_POST_T1 = 12; - NUM_EXPECTED_ENTRIES_T2_MATCHES = 65536; - BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32; - } else if (bits == 9) { - KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 8; - //NUM_EXPECTED_ENTRIES_T1_MATCHES = 8388608; - MAX_KBCS_POST_T1 = 10; - NUM_EXPECTED_ENTRIES_T2_MATCHES = 16384; - BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32; - } else if (bits == 10) { - KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 16; - //NUM_EXPECTED_ENTRIES_T1_MATCHES = 4194304; - MAX_KBCS_POST_T1 = 8; - NUM_EXPECTED_ENTRIES_T2_MATCHES = 4096; - BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32; - } - - std::cout << "Attack Method 1 " << std::endl - << " L0 kbc range " << KBC_START_L1 << " to " << KBC_END_L1 << " = " << (KBC_END_L1-KBC_START_L1) << "kbcs " << (100.0*(double)(KBC_END_L1-KBC_START_L1)/(double)kBC_LAST_BUCKET_ID) << "%" << std::endl - << " R0 kbc range " << KBC_START_R1 << " to " << KBC_END_R1 << " = " << (KBC_END_R1-KBC_START_R1) << "kbcs " << (100.0*(double)(KBC_END_R1-KBC_START_R1)/(double)kBC_LAST_BUCKET_ID) << "%" << std::endl - << " KBC_ATTACK_NUM_BUCKETS: " << KBC_ATTACK_NUM_BUCKETS << std::endl - << " MAX BCS POST T1: " << MAX_KBCS_POST_T1 << std::endl - << " BLOCK_MAX_ENTRIES_T2: " << BLOCK_MAX_ENTRIES_T2 << std::endl; - - - char *device_buffer; - - int* device_local_kbc_num_entries_L1; - int* device_local_kbc_num_entries_R1; - int* device_local_kbc_num_entries_L2; - int* device_local_kbc_num_entries_R2; - Tx_Bucketed_Meta1 *T0_local_kbc_entries_L1; - Tx_Bucketed_Meta1 *T0_local_kbc_entries_R1; - Tx_Bucketed_Meta1 *T0_local_kbc_entries_L2; - Tx_Bucketed_Meta1 *T0_local_kbc_entries_R2; - - int* device_block_entry_counts_L; - int* device_block_entry_counts_R; - Tx_Bucketed_Meta2 *T1_L_batch_match_results; - Tx_Bucketed_Meta2 *T1_R_batch_match_results; - - int* device_T2_block_entry_counts; - Tx_Bucketed_Meta4 *T2_batch_match_results; - - - const uint64_t T0_KBC_DEVICE_BUFFER_ALLOCATED_ENTRIES = KBC_ATTACK_NUM_BUCKETS * KBC_MAX_ENTRIES_PER_BUCKET; - - const uint64_t CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED = T0_KBC_DEVICE_BUFFER_ALLOCATED_ENTRIES * sizeof(Tx_Bucketed_Meta2); - std::cout << " CHACHA BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED: " << CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED << std::endl; - std::cout << " * 4 = " << (CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED * 4) << std::endl; - const uint64_t T1_BATCH_MATCH_RESULTS_BYTES_NEEDED = DEVICE_BUFFER_ALLOCATED_ENTRIES * sizeof(Tx_Bucketed_Meta2); - std::cout << "KBC RESULTS T1 L NEEDED: " << T1_BATCH_MATCH_RESULTS_BYTES_NEEDED << std::endl; - const uint64_t T2_BATCH_MATCH_RESULTS_BYTES_NEEDED = (BLOCK_MAX_ENTRIES_T2 * BATCHES) * sizeof(Tx_Bucketed_Meta4); - std::cout << " 
T2_BATCH_MATCH_RESULTS_BYTES_NEEDED: " << T2_BATCH_MATCH_RESULTS_BYTES_NEEDED << std::endl; - - - const uint64_t TOTAL_BYTES_NEEDED = - 4 * CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED - + 2 * T1_BATCH_MATCH_RESULTS_BYTES_NEEDED - + T2_BATCH_MATCH_RESULTS_BYTES_NEEDED; - - std::cout << " device_buffer TOTAL BYTES: " << TOTAL_BYTES_NEEDED << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_buffer, TOTAL_BYTES_NEEDED)); - uint64_t MEM_POS = 0; - - T0_local_kbc_entries_L1 = (Tx_Bucketed_Meta1 *) &device_buffer[MEM_POS]; - T0_local_kbc_entries_R1 = (Tx_Bucketed_Meta1 *) &device_buffer[MEM_POS + CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED]; - T0_local_kbc_entries_L2 = (Tx_Bucketed_Meta1 *) &device_buffer[MEM_POS + CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED*2]; - T0_local_kbc_entries_R2 = (Tx_Bucketed_Meta1 *) &device_buffer[MEM_POS + CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED*3]; - MEM_POS += 4 * CHACHA_LOCAL_KBC_ENTRIES_BYTES_NEEDED; - - T1_L_batch_match_results = (Tx_Bucketed_Meta2 *) &device_buffer[MEM_POS]; - T1_R_batch_match_results = (Tx_Bucketed_Meta2 *) &device_buffer[MEM_POS + T1_BATCH_MATCH_RESULTS_BYTES_NEEDED]; - MEM_POS += 2 * T1_BATCH_MATCH_RESULTS_BYTES_NEEDED; - T2_batch_match_results = (Tx_Bucketed_Meta4 *) &device_buffer[MEM_POS]; - MEM_POS += T2_BATCH_MATCH_RESULTS_BYTES_NEEDED; - - std::cout << " device_block_entry_counts_L (" << BATCHES << "): " << BATCHES << " size:" << (sizeof(int)*BATCHES) << std::endl; - CUDA_CHECK_RETURN(cudaMallocManaged(&device_block_entry_counts_L, BATCHES*sizeof(int))); - std::cout << " device_block_entry_counts_R (" << BATCHES << "): " << BATCHES << " size:" << (sizeof(int)*BATCHES) << std::endl; - CUDA_CHECK_RETURN(cudaMallocManaged(&device_block_entry_counts_R, BATCHES*sizeof(int))); - std::cout << " device_T2_block_entry_counts (" << BATCHES << "): " << BATCHES << " size:" << (sizeof(int)*BATCHES) << std::endl; - CUDA_CHECK_RETURN(cudaMallocManaged(&device_T2_block_entry_counts, BATCHES*sizeof(int))); - - - auto alloc_finish = std::chrono::high_resolution_clock::now(); - std::cout << " alloc time: " << std::chrono::duration_cast(alloc_finish - attack_start).count() << " ms\n"; - - auto compute_only_start = std::chrono::high_resolution_clock::now(); - std::cout << "Doing chacha\n"; - - - int blockSize = 128; // # of threads per block, maximum is 1024. - const uint64_t calc_N = UINT_MAX; - const uint64_t calc_blockSize = blockSize; - const uint64_t calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - int numBlocks = calc_numBlocks; - - - - // don't forget to clear counter...will only use a portion of this memory so should be fast access. 
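// Worked arithmetic for the launch setup above: each thread derives one 16-x chacha
// group, so covering the full k=32 domain of 2^32 x values needs 2^32 / 16 threads;
// with blockSize = 128 that is (2^32 / 16) / 128 = 2,097,152 blocks:
static_assert(((1ull << 32) / 16) / 128 == 2097152, "grid blocks for 16 x's per thread at blockSize 128");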
-	std::cout << " device_local_kbc_num_entries_L1 " << KBC_LOCAL_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_LOCAL_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_L1, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-	std::cout << " device_local_kbc_num_entries_R1 " << KBC_LOCAL_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_LOCAL_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_R1, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-	std::cout << " device_local_kbc_num_entries_L2 " << KBC_LOCAL_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_LOCAL_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_L2, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-	std::cout << " device_local_kbc_num_entries_R2 " << KBC_LOCAL_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_LOCAL_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_R2, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-
-	std::cout << "Doing T1" << std::endl;
-
-	// we use only attack range for local num buckets
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_L1, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_R1, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_L2, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_R2, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-
-	auto t1_start = std::chrono::high_resolution_clock::now();
-	auto chacha_start = std::chrono::high_resolution_clock::now();
-	gpu_chacha8_k32_kbc_ranges_LR1LR2<<<numBlocks,blockSize>>>(calc_N, chacha_input,
-			T0_local_kbc_entries_L1, device_local_kbc_num_entries_L1, KBC_START_L1, KBC_END_L1,
-			T0_local_kbc_entries_R1, device_local_kbc_num_entries_R1, KBC_START_R1, KBC_END_R1,
-			T0_local_kbc_entries_L2, device_local_kbc_num_entries_L2, KBC_START_L2, KBC_END_L2,
-			T0_local_kbc_entries_R2, device_local_kbc_num_entries_R2, KBC_START_R2, KBC_END_R2);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto chacha_finish = std::chrono::high_resolution_clock::now();
-	std::cout << " chacha L1 time: " << std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count() << " ms\n";
-
-	auto matchT1_start = std::chrono::high_resolution_clock::now();
-	CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts_L, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599
-	gpu_attack_find_t1_matches<<<(KBC_END_L1 - KBC_START_L1), 256>>>(1, KBC_START_L1, KBC_END_L1,
-			T0_local_kbc_entries_L1, device_local_kbc_num_entries_L1,
-			T1_L_batch_match_results, device_block_entry_counts_L);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto matchT1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << " match T1 L time: " << std::chrono::duration_cast<milli>(matchT1_finish - matchT1_start).count() << " ms\n";
-
-	matchT1_start = std::chrono::high_resolution_clock::now();
-	CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts_R, 0, (BATCHES)*sizeof(int)));
-	gpu_attack_find_t1_matches<<<(KBC_END_R1 - KBC_START_R1), 256>>>(1, KBC_START_R1, KBC_END_R1,
-			T0_local_kbc_entries_R1, device_local_kbc_num_entries_R1,
-			T1_R_batch_match_results, device_block_entry_counts_R);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	matchT1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << " match T1 R time: " << std::chrono::duration_cast<milli>(matchT1_finish - matchT1_start).count() << " ms\n";
-
-	auto t1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << " T1 total time: " << std::chrono::duration_cast<milli>(t1_finish - t1_start).count() << " ms\n";
-
-	auto mergekbcs_start = std::chrono::high_resolution_clock::now();
-	// clear our local kbc num entries as these will be written with new data
-
-	// don't use T0 buckets anymore, so overwrite/reuse their memory space.
-	Tx_Bucketed_Meta2 *T1_local_kbc_entries_L = (Tx_Bucketed_Meta2 *) &T0_local_kbc_entries_L1[0];
-	Tx_Bucketed_Meta2 *T1_local_kbc_entries_R = (Tx_Bucketed_Meta2 *) &T0_local_kbc_entries_R1[0];
-
-	// clump block-0-batch_id_L block-0-batch_id_R into same group and solve.
-
-	auto matchTx_start = std::chrono::high_resolution_clock::now();
-	auto matchTx_finish = std::chrono::high_resolution_clock::now();
-	auto mergeTx_start = std::chrono::high_resolution_clock::now();
-	auto mergeTx_finish = std::chrono::high_resolution_clock::now();
-	uint64_t total_match_time_micros = 0;
-	uint64_t total_merge_time_micros = 0;
-	uint32_t global_block_counts[BATCHES] = {0};
-	for (uint32_t block_id = 0; block_id < BATCHES; block_id++) {
-		CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_L1, 0, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-		CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_R1, 0, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-		uint32_t KBC_MERGE_BUCKET_START = MIN_KBC_BUCKET_FOR_BATCH(block_id);
-		const uint32_t KBC_START = MIN_KBC_BUCKET_FOR_BATCH(block_id);
-		const uint32_t KBC_END = MIN_KBC_BUCKET_FOR_BATCH(block_id+1);
-
-		uint32_t num_entries_to_copy = device_block_entry_counts_L[block_id];
-		int blockSize = 256;
-		int numBlocks = (num_entries_to_copy + blockSize - 1) / (blockSize);
-		uint64_t batch_bucket_add_Y = CALC_BATCH_BUCKET_ADD_Y(block_id);//(((uint64_t) 1) << (38-6)) * ((uint64_t) batch_id);
-
-		uint32_t block_address = block_id * HOST_MAX_BLOCK_ENTRIES;
-		Tx_Bucketed_Meta2 *in = &T1_L_batch_match_results[block_address];
-
-		//std::cout << "batch " << batch_id << " num_entries: " << num_entries_to_copy << std::endl;
-		mergeTx_start = std::chrono::high_resolution_clock::now();
-		gpu_attack_merge_block_buckets_into_kbc_buckets_with_kbc_count_limit<<<numBlocks,blockSize>>>(
-				KBC_MERGE_BUCKET_START,
-				in, batch_bucket_add_Y, num_entries_to_copy,
-				T1_local_kbc_entries_L, device_local_kbc_num_entries_L1,
-				MAX_KBCS_POST_T1);
-
-		num_entries_to_copy = device_block_entry_counts_R[block_id];
-		numBlocks = (num_entries_to_copy + blockSize - 1) / (blockSize);
-		in = &T1_R_batch_match_results[block_address];
-
-		//std::cout << "batch " << batch_id << " num_entries: " << num_entries_to_copy << std::endl;
-		gpu_attack_merge_block_buckets_into_kbc_buckets_with_kbc_count_limit<<<numBlocks,blockSize>>>(
-				KBC_MERGE_BUCKET_START,
-				in, batch_bucket_add_Y, num_entries_to_copy,
-				T1_local_kbc_entries_R, device_local_kbc_num_entries_R1,
-				MAX_KBCS_POST_T1);
-
-		// TODO: find matches in entries_L against entries_R...should be <16, avg around 3-4
-		// only have 2m entries...so...could sort 1mL's against 1mR's?
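// Why batch_bucket_add_Y: T1 match results store y relative to their batch, so
// re-bucketing must first add back block_id * 2^(38-6) before dividing by kBC (the
// inline comment above suggests a 38-bit y space split across 2^6 = 64 batches; that
// split factor is an assumption here). The reconstruction in isolation:
__host__ __device__ uint64_t restore_full_y(uint32_t stored_y, uint32_t block_id) {
	uint64_t batch_bucket_add_Y = (((uint64_t) 1) << (38 - 6)) * (uint64_t) block_id;
	uint64_t calc_y = (uint64_t) stored_y + batch_bucket_add_Y;
	return calc_y; // then kbc_id = calc_y / kBC, and the stored slot y is calc_y % kBC
}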
-		//auto matchTx_start = std::chrono::high_resolution_clock::now();
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		mergeTx_finish = std::chrono::high_resolution_clock::now();
-		total_merge_time_micros += std::chrono::duration_cast< std::chrono::microseconds >( mergeTx_finish - mergeTx_start ).count();
-
-		/*CUDA_CHECK_RETURN(cudaMemset(device_T2_block_entry_counts, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599
-
-		matchTx_start = std::chrono::high_resolution_clock::now();
-		gpu_attack_find_tx_LR_matches<<<(KBC_END - KBC_START), 8>>>(1, KBC_START, KBC_END,
-				T1_local_kbc_entries_L, device_local_kbc_num_entries_L1,
-				T1_local_kbc_entries_R, device_local_kbc_num_entries_R1,
-				T2_batch_match_results, device_T2_block_entry_counts);
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		matchTx_finish = std::chrono::high_resolution_clock::now();
-		total_match_time_micros += std::chrono::duration_cast< std::chrono::microseconds >( matchTx_finish - matchTx_start ).count();
-		*/
-		//total_match_time_ms += std::chrono::duration_cast<milli>(matchTx_finish - matchTx_start).count();
-		//for (int i = 0; i < BATCHES; i++) {
-		//	global_block_counts[i] += device_T2_block_entry_counts[i];
-		//}
-
-	}
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	std::cout << " match t2 LR sum time: " << (total_match_time_micros/1000) << "ms" << std::endl;
-	std::cout << " merge t2 LR sum time: " << (total_merge_time_micros/1000) << "ms" << std::endl;
-	auto mergekbcs_finish = std::chrono::high_resolution_clock::now();
-	std::cout << " T2 total time: " << std::chrono::duration_cast<milli>(mergekbcs_finish - mergekbcs_start).count() << " ms\n";
-	//gpu_list_local_kbc_entries<<<1,1>>>(device_local_kbc_num_entries_L);
-
-	auto compute_only_finish = std::chrono::high_resolution_clock::now();
-
-	uint32_t total_counts = 0;
-	for (int i=0;i<BATCHES;i++) {
-		total_counts += global_block_counts[i];
-	}
-	std::cout << "total block entry counts: " << total_counts << std::endl;
-
-	auto attack_finish = std::chrono::high_resolution_clock::now();
-	std::cout << " compute only time: " << std::chrono::duration_cast<milli>(compute_only_finish - compute_only_start).count() << " ms\n";
-	std::cout << " attack total time: " << std::chrono::duration_cast<milli>(attack_finish - attack_start).count() << " ms\n";
-	std::cout << "end."
<< std::endl; -} - - -#endif /* ATTACK_METHOD_1_HPP_ */ diff --git a/attack_method_2.hpp b/attack_method_2.hpp deleted file mode 100644 index 40eb07f..0000000 --- a/attack_method_2.hpp +++ /dev/null @@ -1,1460 +0,0 @@ -/* - * attack_method_2.hpp - * - * Created on: Nov 4, 2021 - * Author: nick - */ - -#ifndef ATTACK_METHOD_2_HPP_ -#define ATTACK_METHOD_2_HPP_ - -#define ATTACK_KBCFILTER_LR1LR2slower(chacha_y,i) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - uint32_t kbc_bucket_id = uint32_t (y / kBC); \ - uint32_t local_kbc_bucket_id = 30000000; \ - int slot = -1; \ - int *num_list; \ - Tx_Bucketed_Meta1 *entries_list; \ - if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \ - local_kbc_bucket_id = kbc_bucket_id - KBC_START_L; \ - num_list = kbc_local_num_entries_L; \ - entries_list = kbc_local_entries_L; \ - } \ - if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \ - local_kbc_bucket_id = kbc_bucket_id - KBC_START_R; \ - num_list = kbc_local_num_entries_R; \ - entries_list = kbc_local_entries_R; \ - } \ - if ((kbc_bucket_id >= KBC_START_L2) && (kbc_bucket_id <= KBC_END_L2)) { \ - local_kbc_bucket_id = kbc_bucket_id - KBC_START_L2; \ - num_list = kbc_local_num_entries_L2; \ - entries_list = kbc_local_entries_L2; \ - } \ - if ((kbc_bucket_id >= KBC_START_R2) && (kbc_bucket_id <= KBC_END_R2)) { \ - local_kbc_bucket_id = kbc_bucket_id - KBC_START_R2; \ - num_list = kbc_local_num_entries_R2; \ - entries_list = kbc_local_entries_R2; \ - } \ - if (local_kbc_bucket_id < 30000000) { \ - slot = atomicAdd(&num_list[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - entries_list[entries_address] = entry; \ - } \ -} - -/* - * uint32_t kbc_bitmask_bucket = local_kbc_bucket_id / 3; \ - int kbc_bitmask_add = 1 << (kbc_bitmask_bucket*9); \ - int bitadd = atomicAdd(&kbc_local_num_entries_L[kbc_bitmask_bucket],kbc_bitmask_add); \ - uint32_t slot = bitadd; \ - slot = (slot >> (kbc_bitmask_bucket*9)) & 0b0111111111; \ - - TOTAL: 262341 - */ -//with bitmask kbcs -#define ATTACK_KBCFILTER_LR1LR2bitmask(chacha_y,i) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - uint32_t kbc_bucket_id = uint32_t (y / kBC); \ - if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L; \ - uint32_t kbc_bitmask_bucket = local_kbc_bucket_id / 3; \ - uint32_t kbc_bitmask_shift = 9*(local_kbc_bucket_id % 3); \ - int kbc_bitmask_add = 1 << (kbc_bitmask_shift); \ - int bitadd = atomicAdd(&kbc_local_num_entries_L[kbc_bitmask_bucket],kbc_bitmask_add); \ - uint32_t slot = bitadd; \ - slot = (slot >> (kbc_bitmask_shift)) & 0b0111111111; \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R; \ - uint32_t kbc_bitmask_bucket = local_kbc_bucket_id / 3; \ - uint32_t kbc_bitmask_shift = 9*(local_kbc_bucket_id % 3); \ - int 
kbc_bitmask_add = 1 << (kbc_bitmask_shift); \ - int bitadd = atomicAdd(&kbc_local_num_entries_R[kbc_bitmask_bucket],kbc_bitmask_add); \ - uint32_t slot = bitadd; \ - slot = (slot >> (kbc_bitmask_shift)) & 0b0111111111; \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_L2) && (kbc_bucket_id <= KBC_END_L2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L2; \ - uint32_t kbc_bitmask_bucket = local_kbc_bucket_id / 3; \ - uint32_t kbc_bitmask_shift = 9*(local_kbc_bucket_id % 3); \ - int kbc_bitmask_add = 1 << (kbc_bitmask_shift); \ - int bitadd = atomicAdd(&kbc_local_num_entries_L2[kbc_bitmask_bucket],kbc_bitmask_add); \ - uint32_t slot = bitadd; \ - slot = (slot >> (kbc_bitmask_shift)) & 0b0111111111; \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L2[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R2) && (kbc_bucket_id <= KBC_END_R2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R2; \ - uint32_t kbc_bitmask_bucket = local_kbc_bucket_id / 3; \ - uint32_t kbc_bitmask_shift = 9*(local_kbc_bucket_id % 3); \ - int kbc_bitmask_add = 1 << (kbc_bitmask_shift); \ - int bitadd = atomicAdd(&kbc_local_num_entries_R2[kbc_bitmask_bucket],kbc_bitmask_add); \ - uint32_t slot = bitadd; \ - slot = (slot >> (kbc_bitmask_shift)) & 0b0111111111; \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R2[entries_address] = entry; \ - } \ -} - -#define ATTACK_KBCFILTER_LR1LR2(chacha_y,i) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - uint32_t kbc_bucket_id = uint32_t (y / kBC); \ - if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L; \ - int slot = atomicAdd(&kbc_local_num_entries_L[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R; \ - int slot = atomicAdd(&kbc_local_num_entries_R[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_L2) && (kbc_bucket_id <= KBC_END_L2)) { \ - 
uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L2; \ - int slot = atomicAdd(&kbc_local_num_entries_L2[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L2[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R2) && (kbc_bucket_id <= KBC_END_R2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R2; \ - int slot = atomicAdd(&kbc_local_num_entries_R2[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { (x+i), (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R2[entries_address] = entry; \ - } \ -} - -#define ATTACK_KBCFILTER_LR1LR2_CHACHA(chacha_y,x) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - uint32_t kbc_bucket_id = uint32_t (y / kBC); \ - if ((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L; \ - int slot = atomicAdd(&kbc_local_num_entries_L[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { x, (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R; \ - int slot = atomicAdd(&kbc_local_num_entries_R[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { x, (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_L2) && (kbc_bucket_id <= KBC_END_L2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_L2; \ - int slot = atomicAdd(&kbc_local_num_entries_L2[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { x, (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_L2[entries_address] = entry; \ - } \ - if ((kbc_bucket_id >= KBC_START_R2) && (kbc_bucket_id <= KBC_END_R2)) { \ - uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START_R2; \ - int slot = atomicAdd(&kbc_local_num_entries_R2[local_kbc_bucket_id],1); \ - Tx_Bucketed_Meta1 entry = { x, (uint32_t) (y % kBC) }; \ - if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \ - uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \ - kbc_local_entries_R2[entries_address] = entry; \ - } \ -} - -#define ATTACK_KBCSTREAM_LR1LR2(chacha_y,i) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + ((base_x + i) >> 26); \ - uint32_t kbc_bucket_id = uint32_t 
(y / kBC); \ - if (((kbc_bucket_id >= KBC_START_L) && (kbc_bucket_id <= KBC_END_L)) \ - || ((kbc_bucket_id >= KBC_START_R) && (kbc_bucket_id <= KBC_END_R)) \ - || ((kbc_bucket_id >= KBC_START_L2) && (kbc_bucket_id <= KBC_END_L2)) \ - || ((kbc_bucket_id >= KBC_START_R2) && (kbc_bucket_id <= KBC_END_R2))) { \ - xchacha_pair pair = { base_x + i, chacha_y }; \ - int slot = atomicAdd(&local_filter_count,1); \ - if (slot > MAX_SHARED_CHACHAS) printf("MAX_SHARED_CHACHAS %u OVERFLOW %u\n", MAX_SHARED_CHACHAS, slot); \ - shared_chachas[slot] = pair; \ - } \ -} -struct xchacha_pair { - uint32_t x; - uint32_t chacha; -}; - -__global__ -void gpu_chacha8_k32_kbc_ranges_LR1LR2(const uint32_t N, - const __restrict__ uint32_t *input, - Tx_Bucketed_Meta1 *kbc_local_entries_L, int *kbc_local_num_entries_L, uint32_t KBC_START_L, uint32_t KBC_END_L, - Tx_Bucketed_Meta1 *kbc_local_entries_R, int *kbc_local_num_entries_R, uint32_t KBC_START_R, uint32_t KBC_END_R, - Tx_Bucketed_Meta1 *kbc_local_entries_L2, int *kbc_local_num_entries_L2, uint32_t KBC_START_L2, uint32_t KBC_END_L2, - Tx_Bucketed_Meta1 *kbc_local_entries_R2, int *kbc_local_num_entries_R2, uint32_t KBC_START_R2, uint32_t KBC_END_R2) -{ - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - //__shared__ uint32_t datax[33*256]; // each thread (256 max) gets its own shared access starting at 32 byte boundary. - //uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - const uint32_t MAX_SHARED_CHACHAS = 128*8; // try to bring down as much as can - __shared__ xchacha_pair shared_chachas[MAX_SHARED_CHACHAS]; // *possibly* using 32 to prevent some bank conflicts can help, but don't thing so. - __shared__ uint local_filter_count; - - //if (blockDim.x > 128) printf("MUST HAVE BLOCKSIZE 128 (RECOMMENDED) OR LESS, OR INCREASED SHARED MEM TO MORE\n"); - - uint32_t base_group = blockIdx.x * blockDim.x; - uint32_t base_x = base_group * 32; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 32; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - if (threadIdx.x == 0) { - local_filter_count = 0; - } - __syncthreads(); - - const int j = 0; - if (x_group < end_n) { - uint32_t pos = x_group * 2;// + X_START/16; - //printf("x group pos = %u\n", pos); - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += 
input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_KBCSTREAM_LR1LR2(datax[j+0],0);ATTACK_KBCSTREAM_LR1LR2(datax[j+1],1);ATTACK_KBCSTREAM_LR1LR2(datax[j+2],2);ATTACK_KBCSTREAM_LR1LR2(datax[j+3],3); - ATTACK_KBCSTREAM_LR1LR2(datax[j+4],4);ATTACK_KBCSTREAM_LR1LR2(datax[j+5],5);ATTACK_KBCSTREAM_LR1LR2(datax[j+6],6);ATTACK_KBCSTREAM_LR1LR2(datax[j+7],7); - ATTACK_KBCSTREAM_LR1LR2(datax[j+8],8);ATTACK_KBCSTREAM_LR1LR2(datax[j+9],9);ATTACK_KBCSTREAM_LR1LR2(datax[j+10],10);ATTACK_KBCSTREAM_LR1LR2(datax[j+11],11); - ATTACK_KBCSTREAM_LR1LR2(datax[j+12],12);ATTACK_KBCSTREAM_LR1LR2(datax[j+13],13);ATTACK_KBCSTREAM_LR1LR2(datax[j+14],14);ATTACK_KBCSTREAM_LR1LR2(datax[j+15],15); - - pos += 1; - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 
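// The ATTACK_KBCSTREAM_LR1LR2 macro used in this kernel stages filtered hits in shared
// memory (one counter plus one xchacha_pair buffer per block) instead of writing each
// hit straight to its global kbc bucket. The staging pattern in isolation, with a
// stand-in predicate and a hypothetical flush destination:
__global__ void shared_filter_sketch(const uint32_t *ys, uint32_t n,
                                     xchacha_pair *out, unsigned int *out_count) {
	__shared__ xchacha_pair staged[1024];
	__shared__ unsigned int staged_count;
	__shared__ unsigned int global_base;
	if (threadIdx.x == 0) staged_count = 0;
	__syncthreads();
	uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i < n && (ys[i] & 1)) {                 // stand-in for the kbc-range filter
		unsigned int slot = atomicAdd(&staged_count, 1);
		if (slot < 1024) staged[slot] = { i, ys[i] };
	}
	__syncthreads();
	if (threadIdx.x == 0) {                     // one global reservation per block...
		if (staged_count > 1024) staged_count = 1024;
		global_base = atomicAdd(out_count, staged_count);
	}
	__syncthreads();
	for (unsigned int j = threadIdx.x; j < staged_count; j += blockDim.x)
		out[global_base + j] = staged[j];       // ...then a cooperative flush
}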
- ATTACK_KBCSTREAM_LR1LR2(datax[j+0],16+0);ATTACK_KBCSTREAM_LR1LR2(datax[j+1],16+1);ATTACK_KBCSTREAM_LR1LR2(datax[j+2],16+2);ATTACK_KBCSTREAM_LR1LR2(datax[j+3],16+3); - ATTACK_KBCSTREAM_LR1LR2(datax[j+4],16+4);ATTACK_KBCSTREAM_LR1LR2(datax[j+5],16+5);ATTACK_KBCSTREAM_LR1LR2(datax[j+6],16+6);ATTACK_KBCSTREAM_LR1LR2(datax[j+7],16+7); - ATTACK_KBCSTREAM_LR1LR2(datax[j+8],16+8);ATTACK_KBCSTREAM_LR1LR2(datax[j+9],16+9);ATTACK_KBCSTREAM_LR1LR2(datax[j+10],16+10);ATTACK_KBCSTREAM_LR1LR2(datax[j+11],16+11); - ATTACK_KBCSTREAM_LR1LR2(datax[j+12],16+12);ATTACK_KBCSTREAM_LR1LR2(datax[j+13],16+13);ATTACK_KBCSTREAM_LR1LR2(datax[j+14],16+14);ATTACK_KBCSTREAM_LR1LR2(datax[j+15],16+15); - } - // at this point we have 128*32 = 4096 entries - // now we have to sort them into the buckets - // we already have the shared counts set from the ATTACK macro - // now just scan our filtered entries and bucket them - __syncthreads(); - for (int i=threadIdx.x;i> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - ATTACK_KBCFILTER_LR1LR2(x0,0);ATTACK_KBCFILTER_LR1LR2(x1,1);ATTACK_KBCFILTER_LR1LR2(x2,2);ATTACK_KBCFILTER_LR1LR2(x3,3); - ATTACK_KBCFILTER_LR1LR2(x4,4);ATTACK_KBCFILTER_LR1LR2(x5,5);ATTACK_KBCFILTER_LR1LR2(x6,6);ATTACK_KBCFILTER_LR1LR2(x7,7); - ATTACK_KBCFILTER_LR1LR2(x8,8);ATTACK_KBCFILTER_LR1LR2(x9,9);ATTACK_KBCFILTER_LR1LR2(x10,10);ATTACK_KBCFILTER_LR1LR2(x11,11); - ATTACK_KBCFILTER_LR1LR2(x12,12);ATTACK_KBCFILTER_LR1LR2(x13,13);ATTACK_KBCFILTER_LR1LR2(x14,14);ATTACK_KBCFILTER_LR1LR2(x15,15); - } -} - -__global__ -void gpu_display_t2_match_results(Tx_Bucketed_Meta4 *T2_batch_match_results, int *device_T2_block_entry_counts, uint32_t MAX_ENTRIES_PER_BLOCK) { - printf("GPU DISPLAY T2 MATCH RESULTS:\n"); - int total_counts = 0; - for (int i=0;i solution_xs = {1320788535,3465356684,2131394289,606438761,434033488,2479909174,3785038649,1942582046,438483300,2306941967,2327418650,184663264,3396904066,3057226705,2120150435,441715922,10459628,1281656413,88943898,810187686,112052271,2540716951,3073359813,4019528057,504026248,1706169436,2772410422,1772771468,607317630,4168020964,4286528917,2472944651,3546546119,1799281226,1202952199,1278165962,4062613743,2747217422,1182029562,1339760739,613483600,3661736730,1251588944,3140803170,2503085418,2541929248,4159128725,2325034733,4257771109,2804935474,2997421030,150533389,709945445,4159463930,714122558,1939000200,3291628318,1878268201,2874051942,2826426895,2146970589,4276159281,3509962078,2808839331}; -/* - * Pair 0 x:1320788535 
y:76835538515 kBC:5084069 - Pair 1 x:3465356684 y:76835558195 kBC:5084070 - - Pair 2 x:2131394289 y:227752410271 kBC:15069966 - Pair 3 x:606438761 y:227752417481 kBC:15069967 - - Pair 4 x:434033488 y:274225910406 kBC:18145034 - Pair 5 x:2479909174 y:274225916708 kBC:18145035 - - Pair 6 x:3785038649 y:213830149496 kBC:14148756 - Pair 7 x:1942582046 y:213830170524 kBC:14148757 - - Pair 8 x:438483300 y:248522697030 kBC:16444299 - Pair 9 x:2306941967 y:248522719906 kBC:16444300 - Pair 10 x:2327418650 y:23832869730 kBC:1576978 - Pair 11 x:184663264 y:23832892290 kBC:1576979 - Pair 12 x:3396904066 y:31837336818 kBC:2106619 - Pair 13 x:3057226705 y:31837353261 kBC:2106620 - Pair 14 x:2120150435 y:22313127263 kBC:1476419 - Pair 15 x:441715922 y:22313149126 kBC:1476420 - */ - - - -__global__ -void gpu_attack_get_kbcs_with_pairs_from_global_kbcs( - const unsigned int *kbc_global_num_entries_L, - const unsigned int *kbc_global_num_entries_R, - unsigned int *kbc_pairs_list_L_bucket_ids, int *pairs_count) { - - uint32_t global_kbc_L_bucket_id = blockIdx.x*blockDim.x+threadIdx.x; - - if (global_kbc_L_bucket_id < (kBC_NUM_BUCKETS-1)) { - - uint32_t kbc_bitmask_bucket = global_kbc_L_bucket_id / 8; - uint32_t kbc_bitmask_shift = 4*(global_kbc_L_bucket_id % 8); - uint32_t bitvalue = kbc_global_num_entries_L[kbc_bitmask_bucket]; - const unsigned int num_L = (bitvalue >> (kbc_bitmask_shift)) & 0b01111; - - kbc_bitmask_bucket = (global_kbc_L_bucket_id + 1) / 8; - kbc_bitmask_shift = 4*((global_kbc_L_bucket_id + 1) % 8); - bitvalue = kbc_global_num_entries_R[kbc_bitmask_bucket]; - const unsigned int num_R = (bitvalue >> (kbc_bitmask_shift)) & 0b01111; - - if ((num_L > 0) && (num_R > 0)) { - - int slot = atomicAdd(&pairs_count[0], 1); - //printf("found kbc %u with two blocks > 0 slot %u \n", global_kbc_L_bucket_id,slot); - kbc_pairs_list_L_bucket_ids[slot] = global_kbc_L_bucket_id; - } - } -} - -struct Match_Attack_Pair_Index { - uint32_t bucket_L_id; // could compress this to fit in 32 bit - uint16_t idx_L; - uint16_t idx_R; -}; - -template -__global__ -void gpu_attack_process_t1_pairs(uint16_t table, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - Match_Attack_Pair_Index *match_list, int *match_counts) { - // testmatch count: 33532242 - // testmatch T1 L time: 9 ms - const uint16_t NUM_RMAPS = (kBC/2)+1; - __shared__ unsigned int nick_rmap[NUM_RMAPS]; // positions and counts. Use 30 bits, 15 bits each entry with lower 9 bits for pos, 1024+ for count - __shared__ uint32_t nick_rmap_extras_rl[32]; - __shared__ uint16_t nick_rmap_extras_ry[32]; - __shared__ uint16_t nick_rmap_extras_pos[32]; - __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ int total_matches; - __shared__ int global_match_slot; - __shared__ int num_extras; - __shared__ int y_duplicate_counts; - - int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... 
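// gpu_attack_get_kbcs_with_pairs_from_global_kbcs above reads per-kbc entry counts
// packed 8 to a 32-bit word, 4 bits each. The pack/unpack arithmetic on its own
// (helper names hypothetical):
__device__ unsigned int read_packed_count4(const unsigned int *packed, uint32_t bucket_id) {
	uint32_t word = bucket_id / 8;             // 8 four-bit counters per word
	uint32_t shift = 4 * (bucket_id % 8);
	return (packed[word] >> shift) & 0b01111;  // counts limited to 0..15
}
__device__ void increment_packed_count4(unsigned int *packed, uint32_t bucket_id) {
	atomicAdd(&packed[bucket_id / 8], 1u << (4 * (bucket_id % 8))); // no overflow guard here
}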
// + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - const uint8_t doPrint = 0; - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - int numThreadsInBlock = blockDim.x; - int threadId = threadIdx.x; - int threadStartScan = threadId; - int threadSkipScan = numThreadsInBlock; - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - - if (threadIdx.x == 0) { - total_matches = 0; - num_extras = 0; - y_duplicate_counts = 0; - if (doPrint > 1) { - printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R); - if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) { - printf("ERROR numL or numR > max entries\n"); - return; - } - if ((num_L == 0) || (num_R == 0) ) { - printf("ERROR: numL and numR are 0\n"); - return; - } - } - } - // unfortunately to clear we have to do this - for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) { - nick_rmap[i] = 0; - } - __syncthreads(); // all written initialize data should sync - - // bucket sort the r positions! - for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - BUCKETED_ENTRY_IN R_entry = kbc_local_entries[start_R+pos_R]; - uint16_t r_y = R_entry.y; - - // r_y's share a block across two adjacent values, so kbc_map just works out which part it's in. - unsigned int kbc_map = r_y / 2; - const unsigned int kbc_box_shift = (r_y % 2) * 15; - int add = 1024 << kbc_box_shift; // we add from 10th bit up (shifted by the box it's in) - - unsigned int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above) - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - if (rmap_value == 0) { - // if we added to an empty spot, what we do is add the pos_R here in the lower 9 bits of the box - // and ONLY for this one. 
- atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift)); - //if (printandquit) { - // printf("r_y: %u pos:%u\n", r_y, pos_R); - //} - } else { - // we hit duplicate entry...add this to a row - int slot = atomicAdd(&num_extras, 1); - nick_rmap_extras_ry[slot] = r_y; - nick_rmap_extras_pos[slot] = pos_R; - } - - - } - //for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - // kbc_R_entries[pos_R] = kbc_local_entries[start_R+pos_R]; - //} - //for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - // kbc_L_entries[pos_L] = kbc_local_entries[start_L+pos_L]; - //} - - - - - uint16_t parity = global_kbc_L_bucket_id % 2; - - __syncthreads(); // wait for all threads to write r_bid entries - - //testmatch count: 33271871 - // testmatch T1 L time: 9 ms - - for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L]; - BUCKETED_ENTRY_IN L_entry = kbc_local_entries[start_L+pos_L]; - uint16_t l_y = L_entry.y; - - //bool doPrint = (L_entry.meta[0] == 601683299); - - //uint16_t base_indJ = l_y / kC; - //uint16_t indJ_plus_m_mod_kB = base_indJ % kB; - //uint16_t indJ_plus_m_mod_kB_times_kC = indJ_plus_m_mod_kB * kC; - //uint16_t m_2_plus_parity_squared_iter = (parity + l_y) % kC; - //uint16_t m_2_plus_parity_start_add = parity == 0 ? 4 : 8; // this increments by 8 each time - //if (doPrint) { - // printf("Starting values:\n"); - // printf(" l_y: %u\n",l_y); - // printf(" parity: %u\n",parity); - /// printf(" indJ_plus_m_mod_kB: %u\n",indJ_plus_m_mod_kB); - // printf(" indJ_plus_m_mod_kB_times_kC: %u\n",indJ_plus_m_mod_kB_times_kC); - // printf(" m_2_plus_parity_squared_iter: %u\n",m_2_plus_parity_squared_iter); - // printf(" m_2_plus_parity_start_add: %u\n",m_2_plus_parity_start_add); - //} - for (int m=0;m<64;m++) { - - - /* - * sadly these no division optimations turned out to be slower than a single calculation line - * uint16_t r_target = indJ_plus_m_mod_kB_times_kC + m_2_plus_parity_squared_iter; - - // this gets updated at end of loop. - indJ_plus_m_mod_kB += 1; - if (indJ_plus_m_mod_kB >= kB) { - indJ_plus_m_mod_kB = 0; - indJ_plus_m_mod_kB_times_kC = 0; - } else { - indJ_plus_m_mod_kB_times_kC += kC; - } - - m_2_plus_parity_squared_iter += m_2_plus_parity_start_add; - m_2_plus_parity_start_add += 8; // adds 8 extra each round compounding - if (m_2_plus_parity_squared_iter >= kC) m_2_plus_parity_squared_iter -= kC; - if (m_2_plus_parity_start_add >= kC) m_2_plus_parity_start_add -= kC; -*/ - uint16_t indJ = l_y / kC; - uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC); - - //if (!(test_target == r_target)) printf("fail: meta[0] %u\n",L_entry.meta[0]); - //if (doPrint) { - - // printf(" Test target result : %u ",test_target); - // if (r_target == test_target) printf(" SUCCESS!\n"); else printf(" FAIL.\n"); - // printf(" Desired target result: %u\n",r_target); - - // printf("\nNext values m:%u\n",m+1); - - // printf(" indJ_plus_m_mod_kB: %u\n",indJ_plus_m_mod_kB); - // printf(" indJ_plus_m_mod_kB_times_kC: %u\n",indJ_plus_m_mod_kB_times_kC); - // printf(" m_2_plus_parity_squared_iter: %u\n",m_2_plus_parity_squared_iter); - // printf(" m_2_plus_parity_start_add: %u\n",m_2_plus_parity_start_add); - //} - - - //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup - // is super-inefficient. 
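// The single-line r_target above is Chia's T1 match condition evaluated directly: for
// each offset m in 0..63, the right bucket must contain
//   r_target = ((l_y / kC + m) % kB) * kC + (((2m + parity)^2 + l_y) % kC)
// with parity = global kbc bucket id % 2. The same computation as a standalone helper:
__device__ uint16_t match_r_target(uint16_t l_y, uint16_t m, uint16_t parity) {
	uint16_t indJ = l_y / kC;
	return ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC);
}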
-
-
-			// find which box our r_target is in, extract the 15-bit value from that box
-			unsigned int kbc_map = r_target / 2;
-			const unsigned int kbc_box_shift = (r_target % 2) * 15;
-			unsigned int rmap_value = nick_rmap[kbc_map];
-			rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-
-			if (rmap_value > 0) {
-				// the pos_R is the lower 9 bits of that 15bit boxed value
-				uint16_t pos_R = rmap_value & 0b0111111111;
-				uint16_t count = rmap_value / 1024;
-
-				int num_matches = atomicAdd(&total_matches,1);//count);
-				if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-					printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-				} else {
-					Index_Match match = { };
-					match.idxL = pos_L;
-					match.idxR = pos_R;
-					matches[num_matches] = match;
-
-					// handle edge cases
-					// TODO: let's push these into separate array
-					// then test them later.
-					if (count > 1) {
-						int slot = atomicAdd(&y_duplicate_counts, 1);
-						nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L;
-					}
-				}
-			}
-		}
-	}
-
-	__syncthreads();
-
-	// do the extras
-
-	//int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add!
-	for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) {
-		for (int i=0; i<y_duplicate_counts; i++) {
-			uint32_t value = nick_rmap_extras_rl[i];
-			uint16_t r_target = value >> 16;
-			uint16_t pos_L = value & 0x0FFFF;
-			if (nick_rmap_extras_ry[slot] == r_target) {
-				uint16_t extra_pos_R = nick_rmap_extras_pos[slot];
-				Index_Match match = { };
-				match.idxL = pos_L;
-				match.idxR = extra_pos_R;
-				int num_matches = atomicAdd(&total_matches,1);
-				matches[num_matches] = match;
-				//matches[total_matches+slot] = match;
-				//if (doPrint > 1) {
-				//	printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot);
-				//}
-			}
-		}
-	}
-
-	__syncthreads();
-
-	if (threadIdx.x == 0) {
-		if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-			printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-			total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-		}
-		global_match_slot = atomicAdd(&match_counts[0],total_matches);
-	}
-
-	__syncthreads();
-
-
-	// now we go through all our matches and output to next round.
-	for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) {
-		Index_Match shared_match = matches[i];
-		Match_Attack_Pair_Index match = { };
-		match.bucket_L_id = global_kbc_L_bucket_id;
-		match.idx_L = shared_match.idxL;
-		match.idx_R = shared_match.idxR;
-		// *could* coalesce pair.meta[0..4] values here and y, instead of splitting y list.
-		// suspect splitting y list would be faster.
-		match_list[global_match_slot + i] = match;
-	}
-}
-
-
-template <typename BUCKETED_ENTRY_IN>
-__global__
-void gpu_attack_process_t1_pairs_orig(uint16_t table, uint32_t start_kbc_L, uint32_t end_kbc_R,
-		const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries,
-		Match_Attack_Pair_Index *match_list, int *match_counts) {
-	// testmatch count: 33532242
-	// testmatch T1 L time: 12 ms
-	const uint16_t NUM_RMAPS = (kBC/2)+1;
-	__shared__ unsigned int nick_rmap[NUM_RMAPS]; // positions and counts. Use 30 bits, 15 bits each entry with lower 9 bits for pos, 1024+ for count
-	__shared__ uint32_t nick_rmap_extras_rl[32];
-	__shared__ uint16_t nick_rmap_extras_ry[32];
-	__shared__ uint16_t nick_rmap_extras_pos[32];
-	__shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET];
-	__shared__ BUCKETED_ENTRY_IN kbc_L_entries[KBC_MAX_ENTRIES_PER_BUCKET];
-	__shared__ BUCKETED_ENTRY_IN kbc_R_entries[KBC_MAX_ENTRIES_PER_BUCKET];
-	__shared__ int total_matches;
-	__shared__ int global_match_slot;
-	__shared__ int num_extras;
-	__shared__ int y_duplicate_counts;
-
-	int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L;
-	uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L;
-
-	const uint8_t doPrint = 0;
-
-	if (gridDim.x != (end_kbc_R - start_kbc_L)) {
-		printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L);
-	}
-	int numThreadsInBlock = blockDim.x;
-	int threadId = threadIdx.x;
-	int threadStartScan = threadId;
-	int threadSkipScan = numThreadsInBlock;
-
-	const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET;
-	const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET;
-	const int num_L = kbc_local_num_entries[kbc_L_bucket_id];
-	const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)];
-
-	for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-		kbc_R_entries[pos_R] = kbc_local_entries[start_R+pos_R];
-	}
-	for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) {
-		kbc_L_entries[pos_L] = kbc_local_entries[start_L+pos_L];
-	}
-
-
-	if (threadIdx.x == 0) {
-		total_matches = 0;
-		num_extras = 0;
-		y_duplicate_counts = 0;
-		if (doPrint > 1) {
-			printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R);
-			if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) {
-				printf("ERROR numL or numR > max entries\n");
-				return;
-			}
-			if ((num_L == 0) || (num_R == 0) ) {
-				printf("ERROR: numL or numR is 0\n");
-				return;
-			}
-		}
-	}
-	// unfortunately to clear we have to do this
-	for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) {
-		nick_rmap[i] = 0;
-	}
-	__syncthreads(); // all written initialize data should sync
-
-	uint16_t parity = global_kbc_L_bucket_id % 2;
-
-	for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-		//Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-		BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-		uint16_t r_y = R_entry.y;
-
-		// r_y's share a block across two adjacent values, so kbc_map just works out which part it's in.
-		unsigned int kbc_map = r_y / 2;
-		const unsigned int kbc_box_shift = (r_y % 2) * 15;
-		unsigned int add = 1024 << kbc_box_shift; // we add from 10th bit up (shifted by the box it's in)
-
-		unsigned int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above)
-		rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-		if (rmap_value == 0) {
-			// if we added to an empty spot, what we do is add the pos_R here in the lower 9 bits of the box
-			// and ONLY for this one.
-			atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift));
-			//if (printandquit) {
-			//	printf("r_y: %u  pos:%u\n", r_y, pos_R);
-			//}
-		} else {
-			// we hit duplicate entry...add this to a row
-			int slot = atomicAdd(&num_extras, 1);
-			nick_rmap_extras_ry[slot] = r_y;
-			nick_rmap_extras_pos[slot] = pos_R;
-		}
-
-	}
-
-	__syncthreads(); // wait for all threads to write r_bid entries
-
-	for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) {
-		//Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L];
-		BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-		uint16_t l_y = L_entry.y;
-		//printf("scanning for pos_L: %u\n", pos_L);
-
-		for (int m=0;m<64;m++) {
-
-			//uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup
-			// is super-inefficient.
-
-			uint16_t indJ = l_y / kC;
-			uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC);
-
-			// find which box our r_target is in, extract the 15-bit value from that box
-			unsigned int kbc_map = r_target / 2;
-			const unsigned int kbc_box_shift = (r_target % 2) * 15;
-			int add = 1024 << kbc_box_shift; // we add from 10th bit up (shifted by the box it's in)
-			unsigned int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above)
-			rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111;
-
-			if (rmap_value > 0) {
-				// the pos_R is the lower 9 bits of that 15bit boxed value
-				uint16_t pos_R = rmap_value & 0b0111111111;
-				uint16_t count = rmap_value / 1024;
-
-				int num_matches = atomicAdd(&total_matches,1);//count);
-				if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-					printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-				} else {
-					Index_Match match = { };
-					match.idxL = pos_L;
-					match.idxR = pos_R;
-					matches[num_matches] = match;
-
-					// handle edge cases
-					// TODO: let's push these into separate array
-					// then test them later.
-					if (count > 1) {
-						int slot = atomicAdd(&y_duplicate_counts, 1);
-						nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L;
-					}
-				}
-			}
-		}
-	}
-
-	__syncthreads();
-
-	// do the extras
-
-	//int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add!
-	for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) {
-		for (int i=0; i<y_duplicate_counts; i++) {
-			uint32_t value = nick_rmap_extras_rl[i];
-			uint16_t r_target = value >> 16;
-			uint16_t pos_L = value & 0x0FFFF;
-			if (nick_rmap_extras_ry[slot] == r_target) {
-				uint16_t extra_pos_R = nick_rmap_extras_pos[slot];
-				Index_Match match = { };
-				match.idxL = pos_L;
-				match.idxR = extra_pos_R;
-				int num_matches = atomicAdd(&total_matches,1);
-				matches[num_matches] = match;
-				//matches[total_matches+slot] = match;
-				//if (doPrint > 1) {
-				//	printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot);
-				//}
-			}
-		}
-	}
-
-	__syncthreads();
-
-	if (threadIdx.x == 0) {
-		if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-			printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-			total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-		}
-		global_match_slot = atomicAdd(&match_counts[0],total_matches);
-	}
-
-	__syncthreads();
-
-
-	// now we go through all our matches and output to next round.
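-	// Output pattern below: thread 0 reserved a contiguous region of match_list
-	// with a single atomicAdd on match_counts[0] (global_match_slot), so every
-	// thread can now write its matches[i] entries with plain strided stores,
-	// e.g. with blockDim.x=256 a thread writes i=threadIdx.x, threadIdx.x+256, ...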
-	for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) {
-		Index_Match shared_match = matches[i];
-		Match_Attack_Pair_Index match = { };
-		match.bucket_L_id = global_kbc_L_bucket_id;
-		match.idx_L = shared_match.idxL;
-		match.idx_R = shared_match.idxR;
-		// *could* coalesce pair.meta[0..4] values here and y, instead of splitting y list.
-		// suspect splitting y list would be faster.
-		match_list[global_match_slot + i] = match;
-	}
-}
-
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_attack_process_t1_matches_list(
-		const int MATCHES_COUNT, Match_Attack_Pair_Index *match_list,
-		const BUCKETED_ENTRY_IN *kbc_local_entries,
-		BUCKETED_ENTRY_OUT *kbc_out, unsigned int *out_kbc_counts,
-		const uint32_t KBC_START_L1, const uint32_t KBC_MAX_ENTRIES) {
-
-	int i = blockIdx.x*blockDim.x+threadIdx.x;
-
-	if (i < MATCHES_COUNT) {
-		Match_Attack_Pair_Index match = match_list[i];
-		BUCKETED_ENTRY_OUT pair = {};
-		uint32_t local_bucket_id = match.bucket_L_id - KBC_START_L1;
-		//printf("reading match %u : bucketL %u idx_L %u idx_R %u\n", i, local_bucket_id, match.idx_L, match.idx_R);
-		BUCKETED_ENTRY_IN L_Entry = kbc_local_entries[local_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + match.idx_L];
-		BUCKETED_ENTRY_IN R_Entry = kbc_local_entries[(local_bucket_id+1) * KBC_MAX_ENTRIES_PER_BUCKET + match.idx_R];
-
-		uint64_t blake_result;
-		uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, match.bucket_L_id); // make sure this is global bucket id
-
-		pair.meta[0] = L_Entry.meta[0];
-		pair.meta[1] = R_Entry.meta[0];
-		nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL);
-
-		uint32_t kbc_bucket = blake_result / kBC;
-
-		pair.y = (uint32_t) (blake_result % kBC);
-
-		uint32_t kbc_bitmask_bucket = kbc_bucket / 8;
-		uint32_t kbc_bitmask_shift = 4*(kbc_bucket % 8);
-		unsigned int kbc_bitmask_add = 1 << (kbc_bitmask_shift);
-		unsigned int bitadd = atomicAdd(&out_kbc_counts[kbc_bitmask_bucket],kbc_bitmask_add);
-		uint32_t block_slot = bitadd;
-		block_slot = (block_slot >> (kbc_bitmask_shift)) & 0b01111;
-
-		if (block_slot >= KBC_MAX_ENTRIES) { // >= : slot KBC_MAX_ENTRIES itself would write past the bucket
-			printf("block_slot > MAX %u\n", block_slot);
-		} else {
-			uint32_t pair_address = kbc_bucket * KBC_MAX_ENTRIES + block_slot;
-			//if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-			//	printf("ERROR: results address overflow\n");
-			//} else {
-				kbc_out[pair_address] = pair;
-			//}
-		}
-		//}
-
-
-	}
-}
-
-template <typename BUCKETED_ENTRY_IN>
-__global__
-void gpu_attack_process_global_kbc_pairs_list(
-		const int PAIRS_COUNT, unsigned int *kbc_pairs_list_L_bucket_ids,
-		const BUCKETED_ENTRY_IN *kbc_global_entries_L, const unsigned int *kbc_global_num_entries_L,
-		const BUCKETED_ENTRY_IN *kbc_global_entries_R, const unsigned int *kbc_global_num_entries_R,
-		Match_Attack_Pair_Index *match_list, int *match_counts,
-		const uint32_t KBC_MAX_ENTRIES) {
-
-	// NOTE: possible optimization is to only get y elements of a list instead of ALL the meta...
-	// requires splitting the meta and y fields into two separate lists. Alternatively we copy
-	// all the meta chunk in this round.
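-	// Packed-count layout assumed below: kbc_global_num_entries_* stores eight
-	// 4-bit counters per 32-bit word, so bucket b lives in word b/8 at shift
-	// 4*(b%8) and num = (word >> shift) & 0b1111 (max 15 entries per bucket).
-	// e.g. bucket 21 -> word 2, shift 20; a word value of 0x00300000 means num=3.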
-
-	int i = blockIdx.x*blockDim.x+threadIdx.x;
-
-	if (i < PAIRS_COUNT) {
-		unsigned int global_kbc_L_bucket_id = kbc_pairs_list_L_bucket_ids[i];
-
-		uint32_t kbc_bitmask_bucket = global_kbc_L_bucket_id / 8;
-		uint32_t kbc_bitmask_shift = 4*(global_kbc_L_bucket_id % 8);
-		uint32_t bitvalue = kbc_global_num_entries_L[kbc_bitmask_bucket];
-		const unsigned int num_L = (bitvalue >> (kbc_bitmask_shift)) & 0b01111;
-
-		kbc_bitmask_bucket = (global_kbc_L_bucket_id + 1) / 8;
-		kbc_bitmask_shift = 4*((global_kbc_L_bucket_id + 1) % 8);
-		bitvalue = kbc_global_num_entries_R[kbc_bitmask_bucket];
-		const unsigned int num_R = (bitvalue >> (kbc_bitmask_shift)) & 0b01111;
-
-		if ((num_L == 0) || (num_R == 0)) {
-			printf("ERROR: PAIRS LIST SHOULD NOT HAVE 0 COUNTS\n");
-			return; // shouldn't ever happen with a pairs list...
-		}
-
-		const uint32_t start_L = global_kbc_L_bucket_id*KBC_MAX_ENTRIES;
-		const uint32_t start_R = (global_kbc_L_bucket_id+1)*KBC_MAX_ENTRIES;
-
-		const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_global_entries_L[start_L];
-		const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_global_entries_R[start_R];
-
-		// For any 0 <= m < kExtraBitsPow:
-		// yl / kBC + 1 = yR / kBC   AND
-		// (yr % kBC) / kC - (yl % kBC) / kC = m   (mod kB)   AND
-		// (yr % kBC) % kC - (yl % kBC) % kC = (2m + (yl/kBC) % 2)^2   (mod kC)
-
-		for (int pos_R = 0; pos_R < num_R; pos_R+=1) {
-			//Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-			BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-			int16_t yr_kbc = R_entry.y;
-			int16_t yr_bid = yr_kbc / kC; // values [0..kB]
-			for (uint16_t pos_L = 0; pos_L < num_L; pos_L++) {
-				// do L_entry and R_entry match?
-				BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-				int16_t yl_kbc = L_entry.y;
-				int16_t yl_bid = yl_kbc / kC; // values [0..kB]
-				int16_t formula_one = yr_bid - yl_bid; // this should actually give m
-				if (formula_one < 0) {
-					formula_one += kB;
-				}
-				int16_t m = formula_one;
-				if (m >= kB) {
-					m -= kB;
-				}
-				if (m < 64) {
-					// passed first test
-					int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC perfectly divisible by kC
-					int16_t yr_cid = yr_kbc % kC;
-					int16_t parity = (global_kbc_L_bucket_id) % 2;
-					int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127]
-					int16_t formula_two = yr_cid - yl_cid;
-					if (formula_two < 0) {
-						formula_two += kC;
-					}
-					if (formula_two == m2_parity_squared) {
-						// we have a match.
-						int slot = atomicAdd(&match_counts[0],1);
-						Match_Attack_Pair_Index match = { };
-						match.bucket_L_id = global_kbc_L_bucket_id;
-						match.idx_L = pos_L;
-						match.idx_R = pos_R;
-						// *could* coalesce pair.meta[0..4] values here and y, instead of splitting y list.
-						// suspect splitting y list would be faster.
-						match_list[slot] = match;
-					}
-				}
-			}
-		}
-	}
-}
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_attack_process_matches_list(
-		uint16_t table,
-		const int MATCHES_COUNT, Match_Attack_Pair_Index *match_list,
-		const BUCKETED_ENTRY_IN *kbc_global_entries_L,
-		const BUCKETED_ENTRY_IN *kbc_global_entries_R,
-		BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts,
-		const uint32_t KBC_MAX_ENTRIES, const uint32_t BLOCK_MAX_ENTRIES) {
-
-	// NOTE: possible optimization is to only get y elements of a list instead of ALL the meta...
-	// requires splitting the meta and y fields into two separate lists. Alternatively we copy
-	// all the meta chunk in this round.
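-	// Meta widths per table in the switch below: T1 pairs two 1-word metas
-	// (blake over 2 words), T2 pairs two 2-word metas (blake over 4), while
-	// T3/T4 hash the full 8 words and keep 4 (T3) or 3 (T4) words of output
-	// meta for the next round.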
-
-	int i = blockIdx.x*blockDim.x+threadIdx.x;
-
-	if (i < MATCHES_COUNT) {
-		Match_Attack_Pair_Index match = match_list[i];
-		BUCKETED_ENTRY_OUT pair = {};
-		BUCKETED_ENTRY_IN L_Entry = kbc_global_entries_L[match.bucket_L_id * KBC_MAX_ENTRIES + match.idx_L];
-		BUCKETED_ENTRY_IN R_Entry = kbc_global_entries_R[(match.bucket_L_id+1) * KBC_MAX_ENTRIES + match.idx_R];
-		uint64_t blake_result;
-		uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, match.bucket_L_id);
-		if (table == 1) {
-			pair.meta[0] = L_Entry.meta[0];
-			pair.meta[1] = R_Entry.meta[0];
-			nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL);
-		} else if (table == 2) {
-			pair.meta[0] = L_Entry.meta[0];
-			pair.meta[1] = L_Entry.meta[1];
-			pair.meta[2] = R_Entry.meta[0];
-			pair.meta[3] = R_Entry.meta[1];
-			nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL);
-		} else if (table == 3) {
-			const uint32_t meta[8] = {
-				L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3],
-				R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3]
-			};
-			nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta);
-		} else if (table == 4) {
-			const uint32_t meta[8] = {
-				L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3],
-				R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3]
-			};
-			nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta);
-		}
-		uint64_t batch_bucket = blake_result >> (38-6);
-		const uint64_t block_mod = (uint64_t) 1 << (38-6);
-		pair.y = (uint32_t) (blake_result % block_mod);
-		int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1);
-		uint32_t pair_address = batch_bucket * BLOCK_MAX_ENTRIES + block_slot;
-		bucketed_out[pair_address] = pair;
-	}
-}
-
-
-void attack_method_2(uint32_t bits) {
-
-	// attack method 2 does: chacha the xs into four target kbc ranges (L1/R1 and L2/R2),
-	// T1-match each range, then cross-match the T1 L/R results into T2 pairs.
-
-	uint64_t BITS_DIVISOR = 1 << bits;
-
-	uint64_t target_kbc_L1 = 5084069;
-	uint64_t target_kbc_R1 = 15069966;
-	uint64_t bucket_L1 = ((target_kbc_L1 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t bucket_R1 = ((target_kbc_R1 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t KBC_START_L1 = (bucket_L1*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_L1 = ((bucket_L1+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_START_R1 = (bucket_R1*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_R1 = ((bucket_R1+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-
-	uint64_t target_kbc_L2 = 18145034;
-	uint64_t target_kbc_R2 = 14148756;
-	uint64_t bucket_L2 = ((target_kbc_L2 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t bucket_R2 = ((target_kbc_R2 + 1) * BITS_DIVISOR) / kBC_NUM_BUCKETS;
-	uint64_t KBC_START_L2 = (bucket_L2*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_L2 = ((bucket_L2+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_START_R2 = (bucket_R2*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-	uint64_t KBC_END_R2 = ((bucket_R2+1)*kBC_NUM_BUCKETS) / BITS_DIVISOR;
-
-	// kbc bucket bitmask: e.g. if 10 bits = 1024 buckets
-	// set [64][64] with appropriate bit
-	// when chacha, do kbc_bucket and translate to kbc_bit
-	// then kbc_bit & [64] for the check, to get true/false
-	// then need to find which array to write to. Oh.
-	// maybe easier to make array [0..1024] of (Array *), where NULL is in ones not used
-	// and just do kbc_array = array[kbc_bucket]
-	// if !NULL DO....
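-	// (sketch of the idea above, illustrative names only: a uint64_t mask[64]
-	//  covers 64*64 = 4096 coarse buckets; for a kbc_bucket mapped to bit b,
-	//  membership is (mask[b >> 6] >> (b & 63)) & 1, and the per-bucket output
-	//  array would come from an Array* table indexed by kbc_bucket, NULL when unused.)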
-
-	std::cout << "ATTACK METHOD 2" << std::endl;
-	std::cout << "   BITS: " << bits << " DIVISOR:" << BITS_DIVISOR
-			<< " target_kbc_L1 " << target_kbc_L1 << " -> bucket L1 " << bucket_L1
-			<< " kbc range: " << KBC_START_L1 << " - " << KBC_END_L1 << " kbcs" << std::endl;
-
-	//Pair 0 x:1320788535 y:76835538515 kBC:5084069
-	//  Pair 1 x:3465356684 y:76835558195 kBC:5084070
-	//  Pair 2 x:2131394289 y:227752410271 kBC:15069966
-	//  Pair 3 x:606438761 y:227752417481 kBC:15069967
-
-	uint64_t KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS; // +1 is for including last R bucket space
-
-	uint64_t MAX_KBCS_POST_T1 = 16; // reduce if smaller selection based on initial t0 range.
-	uint32_t BLOCK_MAX_ENTRIES_T2 = HOST_MAX_BLOCK_ENTRIES / 16;
-	//uint32_t NUM_EXPECTED_ENTRIES_T1_MATCHES = 67108864;
-	uint32_t NUM_EXPECTED_ENTRIES_T2_MATCHES = 1048576;
-	if (bits == 6) {
-		KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS;
-		//NUM_EXPECTED_ENTRIES_T1_MATCHES = 67108864;
-		MAX_KBCS_POST_T1 = 16;
-		NUM_EXPECTED_ENTRIES_T2_MATCHES = 1048576;
-		BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32;
-	} else if (bits == 7) {
-		KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 2;
-		//NUM_EXPECTED_ENTRIES_T1_MATCHES = 33554432;
-		MAX_KBCS_POST_T1 = 12;
-		NUM_EXPECTED_ENTRIES_T2_MATCHES = 262144;
-		BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32;
-	} else if (bits == 8) {
-		KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 4;
-		//NUM_EXPECTED_ENTRIES_T1_MATCHES = 16777216;
-		MAX_KBCS_POST_T1 = 12;
-		NUM_EXPECTED_ENTRIES_T2_MATCHES = 65536;
-		BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32;
-	} else if (bits == 9) {
-		KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 8;
-		//NUM_EXPECTED_ENTRIES_T1_MATCHES = 8388608;
-		MAX_KBCS_POST_T1 = 10;
-		NUM_EXPECTED_ENTRIES_T2_MATCHES = 16384;
-		BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32;
-	} else if (bits == 10) {
-		KBC_ATTACK_NUM_BUCKETS = KBC_LOCAL_NUM_BUCKETS / 16;
-		//NUM_EXPECTED_ENTRIES_T1_MATCHES = 4194304;
-		MAX_KBCS_POST_T1 = 8;
-		NUM_EXPECTED_ENTRIES_T2_MATCHES = 4096;
-		BLOCK_MAX_ENTRIES_T2 = NUM_EXPECTED_ENTRIES_T2_MATCHES / 32;
-	}
-	uint64_t T0_KBC_DEVICE_BUFFER_ALLOCATED_ENTRIES = KBC_ATTACK_NUM_BUCKETS * KBC_MAX_ENTRIES_PER_BUCKET;
-	std::cout << "   L0 kbc range " << KBC_START_L1 << " to " << KBC_END_L1 << " = " << (KBC_END_L1-KBC_START_L1) << " kbcs " << (100.0*(double)(KBC_END_L1-KBC_START_L1)/(double)kBC_LAST_BUCKET_ID) << "%" << std::endl
-			<< "   R0 kbc range " << KBC_START_R1 << " to " << KBC_END_R1 << " = " << (KBC_END_R1-KBC_START_R1) << " kbcs " << (100.0*(double)(KBC_END_R1-KBC_START_R1)/(double)kBC_LAST_BUCKET_ID) << "%" << std::endl
-			<< "   KBC_ATTACK_NUM_BUCKETS: " << KBC_ATTACK_NUM_BUCKETS << std::endl
-			<< "   MAX KBCS POST T1: " << MAX_KBCS_POST_T1 << std::endl
-			<< "   BLOCK_MAX_ENTRIES_T2: " << BLOCK_MAX_ENTRIES_T2 << std::endl;
-
-
-	using milli = std::chrono::milliseconds;
-	auto attack_start = std::chrono::high_resolution_clock::now();
-
-	char *device_buffer;
-	int* device_local_kbc_num_entries_L;
-	int* device_local_kbc_num_entries_R;
-	int* device_local_kbc_num_entries_L2;
-	int* device_local_kbc_num_entries_R2;
-	int* device_T2_block_entry_counts;
-
-	const uint64_t T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED = kBC_NUM_BUCKETS * MAX_KBCS_POST_T1 * sizeof(Tx_Bucketed_Meta2);
-	std::cout << "   T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED: " << T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED << std::endl;
-	std::cout << "      * 2 = " << (T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED * 2) << std::endl;
-
-	const uint64_t T2_BATCH_MATCH_RESULTS_BYTES_NEEDED = (BLOCK_MAX_ENTRIES_T2 * BATCHES) * sizeof(Tx_Bucketed_Meta4);
-	std::cout << "   T2_BATCH_MATCH_RESULTS_BYTES_NEEDED: " << T2_BATCH_MATCH_RESULTS_BYTES_NEEDED << std::endl;
-	const uint64_t BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED = T0_KBC_DEVICE_BUFFER_ALLOCATED_ENTRIES * sizeof(Tx_Bucketed_Meta2);
-	std::cout << "   CHACHA BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED: " << BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED << std::endl;
-	std::cout << "      * 4 = " << (BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED * 4) << std::endl;
-
-	const uint64_t TOTAL_BYTES_NEEDED =
-			  4 * BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED
-			+ 2 * T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED
-			+ T2_BATCH_MATCH_RESULTS_BYTES_NEEDED;
-
-	Tx_Bucketed_Meta4 *T2_batch_match_results;
-	char *device_local_kbc_entries_L;
-	char *device_local_kbc_entries_R;
-	char *device_local_kbc_entries_L2;
-	char *device_local_kbc_entries_R2;
-
-	Tx_Bucketed_Meta2 *T1_L_kbc_match_results;
-	Tx_Bucketed_Meta2 *T1_R_kbc_match_results;
-	unsigned int *device_global_kbc_num_entries_L;
-	unsigned int *device_global_kbc_num_entries_R;
-
-	//std::cout << "      T1_L_batch_match_results " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << sizeof(Tx_Bucketed_Meta2) << ") = " << (DEVICE_BUFFER_ALLOCATED_ENTRIES * sizeof(Tx_Bucketed_Meta2)) << std::endl;
-	//CUDA_CHECK_RETURN(cudaMalloc(&device_buffer, DEVICE_BUFFER_ALLOCATED_ENTRIES * sizeof(Tx_Bucketed_Meta2)));
-
-	std::cout << "      device_local_kbc_num_entries_L " << KBC_ATTACK_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_ATTACK_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_L, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	std::cout << "      device_local_kbc_num_entries_R " << KBC_ATTACK_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_ATTACK_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_R, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	std::cout << "      device_local_kbc_num_entries_L2 " << KBC_ATTACK_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_ATTACK_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_L2, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	std::cout << "      device_local_kbc_num_entries_R2 " << KBC_ATTACK_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_ATTACK_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries_R2, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-
-	// 32 bit...limit to 4 bits = 16 max entries per kbc, 8 kbcs per word
-	std::cout << "      device_global_kbc_num_entries_L " << (kBC_NUM_BUCKETS/8) << " = " << ((kBC_NUM_BUCKETS/8)*sizeof(int)) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_global_kbc_num_entries_L, (kBC_NUM_BUCKETS/8)*sizeof(int)));
-	std::cout << "      device_global_kbc_num_entries_R " << (kBC_NUM_BUCKETS/8) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_global_kbc_num_entries_R, (kBC_NUM_BUCKETS/8)*sizeof(int)));
-
-
-	std::cout << "      device_buffer TOTAL BYTES: " << TOTAL_BYTES_NEEDED << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&device_buffer, TOTAL_BYTES_NEEDED));
-	uint64_t MEM_POS = 0;
-	device_local_kbc_entries_L  = &device_buffer[MEM_POS];
-	device_local_kbc_entries_R  = &device_buffer[MEM_POS + BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED];
-	device_local_kbc_entries_L2 = &device_buffer[MEM_POS + BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED*2];
-	device_local_kbc_entries_R2 = &device_buffer[MEM_POS + BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED*3];
-	MEM_POS += 4 * BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED;
-	T1_L_kbc_match_results = (Tx_Bucketed_Meta2 *) &device_buffer[MEM_POS];
-	T1_R_kbc_match_results = (Tx_Bucketed_Meta2 *) &device_buffer[MEM_POS + T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED];
-	MEM_POS += 2 * T1_BATCH_MATCH_KBC_RESULTS_BYTES_NEEDED;
-	T2_batch_match_results = (Tx_Bucketed_Meta4 *) &device_buffer[MEM_POS];
-	MEM_POS += T2_BATCH_MATCH_RESULTS_BYTES_NEEDED;
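-	// Layout note: all of the regions above are carved from the single device_buffer
-	// allocation by advancing MEM_POS, i.e. the four chacha kbc regions sit at byte
-	// offsets 0, 1x, 2x and 3x of BATCH_LOCAL_KBC_ENTRIES_BYTES_NEEDED, followed by
-	// the two T1 kbc result regions and then the T2 batch result region.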
-
-	std::cout << "      device_T2_block_entry_counts (" << BATCHES << "): " << BATCHES << " size:" << (sizeof(int)*BATCHES) << std::endl;
-	CUDA_CHECK_RETURN(cudaMallocManaged(&device_T2_block_entry_counts, BATCHES*sizeof(int)));
-
-	auto alloc_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   alloc time: " << std::chrono::duration_cast<milli>(alloc_finish - attack_start).count() << " ms\n";
-
-	auto compute_only_start = std::chrono::high_resolution_clock::now();
-	std::cout << "Doing chacha\n";
-
-
-	int blockSize = 128; // # of threads per block, maximum is 1024.
-	const uint64_t calc_N = UINT_MAX;
-	const uint64_t calc_blockSize = blockSize;
-	const uint64_t calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16);
-	int numBlocks = calc_numBlocks;
-
-
-
-
-
-	Tx_Bucketed_Meta1 *T0_local_kbc_entries_L = (Tx_Bucketed_Meta1 *) &device_local_kbc_entries_L[0]; // will replace...
-	Tx_Bucketed_Meta1 *T0_local_kbc_entries_R = (Tx_Bucketed_Meta1 *) &device_local_kbc_entries_R[0];
-	Tx_Bucketed_Meta1 *T0_local_kbc_entries_L2 = (Tx_Bucketed_Meta1 *) &device_local_kbc_entries_L2[0]; // will replace...
-	Tx_Bucketed_Meta1 *T0_local_kbc_entries_R2 = (Tx_Bucketed_Meta1 *) &device_local_kbc_entries_R2[0];
-
-	std::cout << "Note: sizeof(Tx_Bucketed_Meta1) is " << sizeof(Tx_Bucketed_Meta1)*8 << " bits, when it should be 96 bits" << std::endl;
-
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_L, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_R, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_L2, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-	CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries_R2, 0, KBC_ATTACK_NUM_BUCKETS*sizeof(int)));
-
-	std::cout << "Doing T1" << std::endl;
-	auto t1_start = std::chrono::high_resolution_clock::now();
-	auto chacha_start = std::chrono::high_resolution_clock::now();
-	//gpu_chacha8_k32_kbc_ranges_LR<<<numBlocks, blockSize>>>(calc_N, chacha_input,
-	//		T0_local_kbc_entries_L, device_local_kbc_num_entries_L, KBC_START_L1, KBC_END_L1,
-	//		T0_local_kbc_entries_R, device_local_kbc_num_entries_R, KBC_START_R1, KBC_END_R1);
-	gpu_chacha8_k32_kbc_ranges_LR1LR2<<<numBlocks, blockSize>>>(calc_N, chacha_input,
-			T0_local_kbc_entries_L, device_local_kbc_num_entries_L, KBC_START_L1, KBC_END_L1,
-			T0_local_kbc_entries_R, device_local_kbc_num_entries_R, KBC_START_R1, KBC_END_R1,
-			T0_local_kbc_entries_L2, device_local_kbc_num_entries_L2, KBC_START_L2, KBC_END_L2,
-			T0_local_kbc_entries_R2, device_local_kbc_num_entries_R2, KBC_START_R2, KBC_END_R2);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto chacha_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   chacha L1 time: " << std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count() << " ms\n";
-	//gpu_list_local_kbc_entries<<<1,1>>>(device_local_kbc_num_entries_L2, 0, 100, 1);
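-
-	// match_list below is a fixed 2^26-entry scratch list; assuming the three-uint32
-	// Match_Attack_Pair_Index layout (bucket_L_id, idx_L, idx_R) that is 12 bytes per
-	// entry, roughly 768MB, well above the ~33.5M T1 matches noted in the timings above.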
-	Match_Attack_Pair_Index *match_list;
-	int *match_counts;
-	CUDA_CHECK_RETURN(cudaMalloc(&match_list, 67108864*sizeof(Match_Attack_Pair_Index)));
-	CUDA_CHECK_RETURN(cudaMallocManaged(&match_counts, sizeof(unsigned int)));
-	match_counts[0] = 0;
-	auto testmatchT1_start = std::chrono::high_resolution_clock::now();
-	gpu_attack_process_t1_pairs<<<(KBC_END_L1 - KBC_START_L1), 256>>>(1, KBC_START_L1, KBC_END_L1,
-			T0_local_kbc_entries_L, device_local_kbc_num_entries_L,
-			match_list,match_counts);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto testmatchT1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   testmatch count: " << match_counts[0] << std::endl;
-	std::cout << "   testmatch T1 L time: " << std::chrono::duration_cast<milli>(testmatchT1_finish - testmatchT1_start).count() << " ms\n";
-
-	// CODE BELOW CRASHES
-
-	int matchT1_count = match_counts[0];
-	const int matchT1_blockSize = 256;
-	const int matchT1_numBlocks = (matchT1_count + matchT1_blockSize - 1) / matchT1_blockSize;
-	auto bestmatchT1_start = std::chrono::high_resolution_clock::now();
-	CUDA_CHECK_RETURN(cudaMemset(device_global_kbc_num_entries_L, 0, (kBC_NUM_BUCKETS/8)*sizeof(int)));
-	gpu_attack_process_t1_matches_list<<<matchT1_numBlocks, matchT1_blockSize>>>(
-	//gpu_attack_process_t1_matches_list<<<matchT1_numBlocks, matchT1_blockSize>>>(
-			matchT1_count, match_list,
-			T0_local_kbc_entries_L,
-			T1_L_kbc_match_results, device_global_kbc_num_entries_L,
-			KBC_START_L1, MAX_KBCS_POST_T1);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto bestmatchT1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   blake match T1 L time: " << std::chrono::duration_cast<milli>(bestmatchT1_finish - bestmatchT1_start).count() << " ms\n";
-	std::cout << "   FINAL match T1 L time: " << std::chrono::duration_cast<milli>(bestmatchT1_finish - testmatchT1_start).count() << " ms\n";
-
-	//gpu_list_local_kbc_entries_bitmask<<<1,1>>>(device_global_kbc_num_entries_L, 0, 100, 1);
-
-	auto matchT1_start = std::chrono::high_resolution_clock::now();
-	CUDA_CHECK_RETURN(cudaMemset(device_global_kbc_num_entries_L, 0, (kBC_NUM_BUCKETS/8)*sizeof(int)));
-	gpu_attack_find_t1_matches_out_kbc<<<(KBC_END_L1 - KBC_START_L1), 256>>>(1, KBC_START_L1, KBC_END_L1,
-			T0_local_kbc_entries_L, device_local_kbc_num_entries_L,
-			T1_L_kbc_match_results, device_global_kbc_num_entries_L, MAX_KBCS_POST_T1);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto matchT1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   match T1 L time: " << std::chrono::duration_cast<milli>(matchT1_finish - matchT1_start).count() << " ms\n";
-	//gpu_list_local_kbc_entries_bitmask<<<1,1>>>(device_global_kbc_num_entries_L, 0, 100, 1);
-
-	matchT1_start = std::chrono::high_resolution_clock::now();
-	CUDA_CHECK_RETURN(cudaMemset(device_global_kbc_num_entries_R, 0, (kBC_NUM_BUCKETS/8)*sizeof(int)));
-	gpu_attack_find_t1_matches_out_kbc<<<(KBC_END_R1 - KBC_START_R1), 256>>>(1, KBC_START_R1, KBC_END_R1,
-			T0_local_kbc_entries_R, device_local_kbc_num_entries_R,
-			T1_R_kbc_match_results, device_global_kbc_num_entries_R, MAX_KBCS_POST_T1);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	matchT1_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   match T1 R time: " << std::chrono::duration_cast<milli>(matchT1_finish - matchT1_start).count() << " ms\n";
-
-	//gpu_list_local_kbc_entries_bitmask<<<1,1>>>(device_global_kbc_num_entries_R, 0, 100, 1);
-
-	// TODO: need to do a "pairing" pass, where we just scan through each bucket and spit out a list of kbc pairs
-	// then, on a second pass, process the pairs with compute method. This way all threads are going to be working
-	// and it should be near instant.
-	// NOTE: will have to handle pairing pass having more than one entry
-	// ALSO TRY: single pass where we compute on the fly, but probably it will store all the 0 entries
-	// e.g. T2 9 bit, expect 16000 matches from 18188177 buckets = 1 in 1100 buckets
-
-	// after t1 pairs output to kbc list, for t2 pairing we first filter all eligible bucket ids.
-	unsigned int *kbc_pairs_list_L_bucket_ids;
-	int *pairs_count;
-	CUDA_CHECK_RETURN(cudaMalloc(&kbc_pairs_list_L_bucket_ids, kBC_NUM_BUCKETS*sizeof(unsigned int)));
-	CUDA_CHECK_RETURN(cudaMallocManaged(&pairs_count, sizeof(unsigned int)));
-	pairs_count[0] = 0;
-	//CUDA_CHECK_RETURN(cudaMemset(pairs_count, 0, sizeof(int)));
-
-	auto pairingT2_start = std::chrono::high_resolution_clock::now();
-	const int pair_blockSize = 256; // # of threads per block, maximum is 1024.
-	const uint32_t pair_numBlocks = (kBC_NUM_BUCKETS + pair_blockSize - 1) / pair_blockSize;
-	gpu_attack_get_kbcs_with_pairs_from_global_kbcs<<<pair_numBlocks, pair_blockSize>>>(
-			device_global_kbc_num_entries_L,device_global_kbc_num_entries_R,
-			kbc_pairs_list_L_bucket_ids, pairs_count);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto pairingT2_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   pairs count: " << pairs_count[0] << std::endl;
-	std::cout << "   pairing T2 L time: " << std::chrono::duration_cast<milli>(pairingT2_finish - pairingT2_start).count() << " ms\n";
-
-	//Match_Attack_Pair_Index *match_list;
-	//int *match_counts;
-	//CUDA_CHECK_RETURN(cudaMalloc(&match_list, 2*NUM_EXPECTED_ENTRIES_T2_MATCHES*sizeof(Match_Attack_Pair_Index)));
-	//CUDA_CHECK_RETURN(cudaMallocManaged(&match_counts, sizeof(unsigned int)));
-	match_counts[0] = 0;
-
-
-	auto processT2_start = std::chrono::high_resolution_clock::now();
-	int process_count = pairs_count[0];
-	const int process_blockSize = 256;
-	const int process_numBlocks = (process_count + process_blockSize - 1) / process_blockSize;
-	gpu_attack_process_global_kbc_pairs_list<<<process_numBlocks, process_blockSize>>>(
-			process_count, kbc_pairs_list_L_bucket_ids,
-			T1_L_kbc_match_results, device_global_kbc_num_entries_L,
-			T1_R_kbc_match_results, device_global_kbc_num_entries_R,
-			match_list, match_counts,
-			MAX_KBCS_POST_T1);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto processT2_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   t2 match_counts: " << match_counts[0] << std::endl;
-	std::cout << "   process T2 L time: " << std::chrono::duration_cast<milli>(processT2_finish - processT2_start).count() << " ms\n";
-
-	CUDA_CHECK_RETURN(cudaMemset(device_T2_block_entry_counts, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599
-
-	auto matchT2_start = std::chrono::high_resolution_clock::now();
-	int matches_count = match_counts[0];
-	const int match_blockSize = 256;
-	const int match_numBlocks = (matches_count + match_blockSize - 1) / match_blockSize;
-	gpu_attack_process_matches_list<<<match_numBlocks, match_blockSize>>>(
-			2,
-			matches_count, match_list,
-			T1_L_kbc_match_results,
-			T1_R_kbc_match_results,
-			T2_batch_match_results, device_T2_block_entry_counts,
-			MAX_KBCS_POST_T1, BLOCK_MAX_ENTRIES_T2);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto matchT2_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   match T2 L time: " << std::chrono::duration_cast<milli>(matchT2_finish - matchT2_start).count() << " ms\n";
-
-	/*
-	 *   process T2 L time: 0 ms
-	 *   match T2 L time: 12 ms
-	 * Freeing memory...
-	 * GPU DISPLAY T2 MATCH RESULTS:
-	 *   block 22 entry 198 x1:1320788535 x2:3465356684 x3:2131394289 x4:606438761
-	 * TOTAL: 16498
-	 */
-
-	/*auto matchT2_start = std::chrono::high_resolution_clock::now();
-	gpu_attack_find_tx_LR_matches_global<<<kBC_NUM_BUCKETS, 256>>>(2, 0, kBC_NUM_BUCKETS,
-			T1_L_kbc_match_results, device_global_kbc_num_entries_L,
-			T1_R_kbc_match_results, device_global_kbc_num_entries_R,
-			T2_batch_match_results, device_T2_block_entry_counts,
-			MAX_KBCS_POST_T1, BLOCK_MAX_ENTRIES_T2);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto matchT2_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   match T2 L time: " << std::chrono::duration_cast<milli>(matchT2_finish - matchT2_start).count() << " ms\n";
-	*/
-	auto compute_only_finish = std::chrono::high_resolution_clock::now();
-
-	gpu_display_t2_match_results<<<1,1>>>(T2_batch_match_results, device_T2_block_entry_counts, BLOCK_MAX_ENTRIES_T2);
-
-	std::cout << "Freeing memory..." << std::endl;
-	CUDA_CHECK_RETURN(cudaFree(device_local_kbc_num_entries_L));
-	CUDA_CHECK_RETURN(cudaFree(device_local_kbc_num_entries_R));
-	//CUDA_CHECK_RETURN(cudaFree(device_block_entry_counts));
-	CUDA_CHECK_RETURN(cudaFree(device_buffer));
-
-	auto attack_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   compute only time: " << std::chrono::duration_cast<milli>(compute_only_finish - compute_only_start).count() << " ms\n";
-	std::cout << "   attack total time: " << std::chrono::duration_cast<milli>(attack_finish - attack_start).count() << " ms\n";
-	std::cout << "end." << std::endl;
-}
-
-
-
-
-#endif /* ATTACK_HPP_ */
diff --git a/attack_method_kbc_list.hpp b/attack_method_kbc_list.hpp
deleted file mode 100644
index dcd6f5a..0000000
--- a/attack_method_kbc_list.hpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
- * attack_method_kbc_list.hpp
- *
- *  Created on: Nov 7, 2021
- *      Author: nick
- */
-
-#ifndef ATTACK_METHOD_KBC_LIST_HPP_
-#define ATTACK_METHOD_KBC_LIST_HPP_
-
-#define ATTACK_FILTER_BITMASK(chacha_y,i) \
-{ \
-	uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \
-	if (kbc_bucket_id_L > 0) { \
-		int kbc_bitmask_bucket = kbc_bucket_id_L / 32; \
-		unsigned int kbc_bit_slot = kbc_bucket_id_L % 32; \
-		unsigned int kbc_mask = 1 << kbc_bit_slot; \
-		unsigned int kbc_value = kbc_global_bitmask[kbc_bitmask_bucket]; \
-		if ((kbc_mask & kbc_value) > 0) { \
-			int slot = atomicAdd(&count[0],1); \
-			xs[slot] = (x+i); \
-			chachas[slot] = chacha_y; \
-		} \
-	} \
-}
-
-__global__
-void gpu_chacha8_filter_rxs_by_kbc_bitmask(const uint32_t N,
-		const __restrict__ uint32_t *input,
-		const unsigned int* __restrict__ kbc_global_bitmask,
-		uint32_t * __restrict__ xs, uint32_t * __restrict__ chachas, int *count)
-{
-	uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
-	int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-	int stride = blockDim.x * gridDim.x;
-	const uint32_t end_n = N / 16; // 16 x's in each group
-
-	for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-		uint32_t x = x_group << 4;// *16;
-		uint32_t pos = x_group;
-
-		x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
-		x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
-		x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
-		x14 = input[14];x15 = input[15];
-
-		#pragma unroll
-		for (int i = 0; i < 4; i++) {
-			QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
-			QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
-		}
-
-		x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
-		x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
-		x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0;
-		x14 += input[14];x15 += input[15];
-
-		// convert to little endian/big endian whatever, chia needs it like this
-		BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
-		BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
-		BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
-
-		//uint64_t y = x0 << 6 + x >> 26;  for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-		//int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-		ATTACK_FILTER_BITMASK(x0,0);ATTACK_FILTER_BITMASK(x1,1);ATTACK_FILTER_BITMASK(x2,2);ATTACK_FILTER_BITMASK(x3,3);
-		ATTACK_FILTER_BITMASK(x4,4);ATTACK_FILTER_BITMASK(x5,5);ATTACK_FILTER_BITMASK(x6,6);ATTACK_FILTER_BITMASK(x7,7);
-		ATTACK_FILTER_BITMASK(x8,8);ATTACK_FILTER_BITMASK(x9,9);ATTACK_FILTER_BITMASK(x10,10);ATTACK_FILTER_BITMASK(x11,11);
-		ATTACK_FILTER_BITMASK(x12,12);ATTACK_FILTER_BITMASK(x13,13);ATTACK_FILTER_BITMASK(x14,14);ATTACK_FILTER_BITMASK(x15,15);
-	}
-}
-
-__global__
-void gpu_set_kbc_bitmask_from_kbc_list(const uint32_t N,
-		uint32_t *kbc_list, unsigned int* kbc_bitmask)
-{
-	int i = blockIdx.x * blockDim.x + threadIdx.x;
-	if (i < N) {
-		uint32_t kbc_bucket_id = kbc_list[i];
-		int kbc_bitmask_bucket = kbc_bucket_id / 32;
-		int kbc_bit_slot = kbc_bucket_id % 32;
-		unsigned int kbc_mask = 1 << kbc_bit_slot;
-		atomicOr(&kbc_bitmask[kbc_bitmask_bucket],kbc_mask);
-		//printf("kbc slot %u value %u SET mask bucket: %u bitslot:%u\n",i, kbc_bucket_id, kbc_bitmask_bucket, kbc_bit_slot);
-		// don't forget buckets needed for rx's.
-		kbc_bitmask_bucket = (kbc_bucket_id+1) / 32;
-		kbc_bit_slot = (kbc_bucket_id+1) % 32;
-		kbc_mask = 1 << kbc_bit_slot;
-		atomicOr(&kbc_bitmask[kbc_bitmask_bucket],kbc_mask);
-		//printf("kbc %u SET mask bucket: %u bitslot:%u\n",kbc_bucket_id+1, kbc_bitmask_bucket, kbc_bit_slot);
-	}
-}
-
-__global__
-void gpu_count_kbc_mask_bits(unsigned int* kbc_bitmask)
-{
-	int count = 0;
-	for (int kbc_bucket_id_L=0;kbc_bucket_id_L<kBC_NUM_BUCKETS;kbc_bucket_id_L++) {
-		int kbc_bitmask_bucket = kbc_bucket_id_L / 32;
-		int kbc_bit_slot = kbc_bucket_id_L % 32;
-		unsigned int kbc_mask = 1 << kbc_bit_slot;
-		if ((kbc_bitmask[kbc_bitmask_bucket] & kbc_mask) > 0) {
-			count++;
-		}
-	}
-	printf("Counted kbc masks: %u\n",count);
-}
-
-#include <set>
-
-void attack_method_kbc_list(uint32_t bits) {
-
-	const uint32_t NUM_L_KBCS = 208147; // T4 16-bit entry list size
-	std::cout << "ATTACK METHOD KBC LIST NUM: " << NUM_L_KBCS << std::endl;
-
-	/* Tried, really tried, but the bitmask slows it down too much: all those x's checking 4 billion times against
-	 * ram and then doing a simple xs/ys add. Even so it's 109ms just to filter the xs, compared to the kbc bit scan method
-	 * that's done with that phase and sorted into buckets at 40ms tops.
-	 * DrPlotter v0.1d
-	 * Attack it!
-	 * ATTACK METHOD KBC LIST NUM: 208147
-	 *   kbc list bytes size:832588
-	 *   kbc_bitmask:832588
-	 *   expected xs:106571264 size: 426285056
-	 *   chachas:106571264 size: 426285056
-	 * Generating kbc list (step:87)
-	 *   num uniques:208146 duplicates: 0
-	 * setting kbc mask
-	 *   gpu_chacha8_set_Lxs_into_kbc_bitmask results: 1 ms
-	 * Counted kbc masks: 411613
-	 * getting filtered xs/chachas list
-	 *   gpu_chacha8_filter_rxs_by_kbc_bitmask time: 109 ms
-	 *   xs count: 97190536
-	 * Freeing memory...
-	 *   compute only time: 287 ms
-	 * end.
-	 *
-	 */
-
-	using milli = std::chrono::milliseconds;
-	auto attack_start = std::chrono::high_resolution_clock::now();
-
-	// first we "read" the kbc list on host
-
-	const uint32_t EXPECTED_XS = NUM_L_KBCS*2*256;
-	uint32_t *kbc_list;
-	unsigned int *kbc_bitmask;
-	int *xs_count;
-	uint32_t *xs;
-	uint32_t *chachas;
-
-	std::cout << "   kbc list bytes size:" << (sizeof(uint32_t)*NUM_L_KBCS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMallocManaged(&kbc_list, sizeof(uint32_t)*NUM_L_KBCS));
-	std::cout << "   kbc_bitmask:" << (sizeof(unsigned int)*kBC_NUM_BUCKETS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&kbc_bitmask, kBC_NUM_BUCKETS*sizeof(unsigned int)));
-	CUDA_CHECK_RETURN(cudaMemset(kbc_bitmask, 0, kBC_NUM_BUCKETS*sizeof(unsigned int)));
-	std::cout << "   expected xs:" << EXPECTED_XS << " size: " << (sizeof(uint32_t)*EXPECTED_XS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&xs, EXPECTED_XS*sizeof(uint32_t)));
-	std::cout << "   chachas:" << EXPECTED_XS << " size: " << (sizeof(uint32_t)*EXPECTED_XS) << std::endl;
-	CUDA_CHECK_RETURN(cudaMalloc(&chachas, EXPECTED_XS*sizeof(uint32_t)));
-	CUDA_CHECK_RETURN(cudaMallocManaged(&xs_count, 1024)); // 1024 blocks maybe?
-
-	auto compute_only_start = std::chrono::high_resolution_clock::now();
-
-	int step = kBC_NUM_BUCKETS / NUM_L_KBCS;
-	std::cout << "Generating kbc list (step:" << step << ")" << std::endl;
-	for (int i=0;i<NUM_L_KBCS;i++) {
-		kbc_list[i] = i * step;
-	}
-	std::set<uint32_t> unique_kbcs(kbc_list, kbc_list + NUM_L_KBCS);
-	std::cout << "   num uniques:" << unique_kbcs.size() << " duplicates: " << (NUM_L_KBCS - unique_kbcs.size()) << std::endl;
-
-	std::cout << "setting kbc mask" << std::endl;
-	int blockSize = 256; // # of threads per block, maximum is 1024.
-	uint64_t calc_N = NUM_L_KBCS;
-	uint64_t calc_blockSize = blockSize;
-	uint64_t calc_numBlocks = (calc_N + calc_blockSize - 1) / calc_blockSize;
-	int numBlocks = calc_numBlocks;
-	auto time_start = std::chrono::high_resolution_clock::now();
-	gpu_set_kbc_bitmask_from_kbc_list<<<numBlocks, blockSize>>>(calc_N, kbc_list, kbc_bitmask);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto time_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   gpu_chacha8_set_Lxs_into_kbc_bitmask results: " << std::chrono::duration_cast<milli>(time_finish - time_start).count() << " ms\n";
-
-	gpu_count_kbc_mask_bits<<<1,1>>>(kbc_bitmask);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-	std::cout << "getting filtered xs/chachas list" << std::endl;
-	blockSize = 256; // # of threads per block, maximum is 1024.
-	calc_N = UINT_MAX;
-	calc_blockSize = blockSize;
-	calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16);
-	numBlocks = calc_numBlocks;
-	xs_count[0] = 0;
-	time_start = std::chrono::high_resolution_clock::now();
-	gpu_chacha8_filter_rxs_by_kbc_bitmask<<<numBlocks, blockSize>>>(calc_N,chacha_input,
-			kbc_bitmask, xs, chachas, &xs_count[0]);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	time_finish = std::chrono::high_resolution_clock::now();
-	std::cout << "   gpu_chacha8_filter_rxs_by_kbc_bitmask time: " << std::chrono::duration_cast<milli>(time_finish - time_start).count() << " ms\n";
-	std::cout << "   xs count: " << xs_count[0] << "\n";
-
-
-	auto compute_only_finish = std::chrono::high_resolution_clock::now();
-
-	std::cout << "Freeing memory..." << std::endl;
-	CUDA_CHECK_RETURN(cudaFree(kbc_bitmask));
-	CUDA_CHECK_RETURN(cudaFree(xs));
-	CUDA_CHECK_RETURN(cudaFree(chachas));
-
-	std::cout << "   compute only time: " << std::chrono::duration_cast<milli>(compute_only_finish - compute_only_start).count() << " ms\n";
-	std::cout << "end." << std::endl;
-
-}
-
-#endif /* ATTACK_METHOD_KBC_LIST_HPP_ */
diff --git a/attack_method_lxs.hpp b/attack_method_lxs.hpp
deleted file mode 100644
index ce2eb86..0000000
--- a/attack_method_lxs.hpp
+++ /dev/null
@@ -1,1268 +0,0 @@
-/*
- * attack_method_lxs.hpp
- *
- *  Created on: Nov 6, 2021
- *      Author: nick
- */
-
-#ifndef ATTACK_METHOD_LXS_HPP_
-#define ATTACK_METHOD_LXS_HPP_
-
-#include <cooperative_groups/memcpy_async.h> // memcpy_async
-
-const uint32_t CHACHA_NUM_BATCHES_BITS = 3;
-const uint32_t CHACHA_NUM_BATCHES = 1 << CHACHA_NUM_BATCHES_BITS;
-const uint32_t CHACHA_TOTAL_ENTRIES_PER_BATCH = UINT_MAX / CHACHA_NUM_BATCHES;
-const uint32_t CHACHA_BUCKET_BITS = 4; // ACROSS ALL BATCHES
-const uint32_t CHACHA_NUM_BUCKETS = (1 << CHACHA_BUCKET_BITS);
-const uint32_t CHACHA_BUCKET_DIVISOR = (1 << (32 - CHACHA_BUCKET_BITS));
-const uint32_t CHACHA_SPLIT_BUCKET_DIVISOR = (1 << (32 - CHACHA_BUCKET_BITS - CHACHA_NUM_BATCHES_BITS));
-const uint32_t CHACHA_MAX_ENTRIES_PER_BUCKET = (11 * (CHACHA_TOTAL_ENTRIES_PER_BATCH / CHACHA_NUM_BUCKETS)) / 10;
-const uint64_t CHACHA_OUT_MAX_ENTRIES_NEEDED = (CHACHA_NUM_BUCKETS * CHACHA_MAX_ENTRIES_PER_BUCKET);
-
-struct xchacha_pair {
-	uint32_t x;
-	uint32_t chacha;
-};
-
-#define CHECK_MATCH() \
-{ \
-	int16_t yr_kbc = Ry % kBC; \
-	int16_t yr_bid = yr_kbc / kC; \
-	int16_t yl_bid = yl_kbc / kC; \
-	int16_t formula_one = yr_bid - yl_bid; \
-	if (formula_one < 0) { \
-		formula_one += kB; \
-	} \
-	int16_t m = formula_one; \
-	if (m >= kB) { \
-		m -= kB; \
-	} \
-	if (m < 64) { \
-		int16_t yl_cid = yl_kbc % kC; \
-		int16_t yr_cid = yr_kbc % kC; \
-		int16_t parity = (kbc_bucket_id_L) % 2; \
-		int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; \
-		int16_t formula_two = yr_cid - yl_cid; \
-		if (formula_two < 0) { \
-			formula_two += kC; \
-		} \
-		if (formula_two == m2_parity_squared) { \
-			isMatch = true; \
-		} \
-	} \
-}
-
-// MASKED method for counter 10 bits, should help cache by 3x
-// (three 10-bit counters packed per 32-bit word: word = id/3, shift = 10*(id%3))
-#define KBCFILTER_mask(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
-		uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
-		int kbc_bitmask_bucket = local_kbc_bucket_id / 3; \
-		int kbc_bit_slot = 10 * (local_kbc_bucket_id % 3); \
-		unsigned int kbc_mask = 1 << kbc_bit_slot; \
-		unsigned int add = atomicAdd(&kbc_local_num_entries[kbc_bitmask_bucket],kbc_mask); \
-		unsigned int slot = (add >> kbc_bit_slot) & 0b01111111111; \
-		F1_Bucketed_kBC_Entry entry = { (x+i), (uint32_t) (y % kBC) }; \
-		if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-		uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-		kbc_local_entries[entries_address] = entry; \
-	} \
-}
-
-#define KBCFILTER(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
-		uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
-		int slot = atomicAdd(&kbc_local_num_entries[local_kbc_bucket_id],1); \
-		F1_Bucketed_kBC_Entry entry = { (x+i), (uint32_t) (y % kBC) }; \
-		if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-		uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-		kbc_local_entries[entries_address] = entry; \
-	} \
-}
-
-__global__
-void gpu_chacha8_get_k32_keystream_into_local_kbc_entries(const uint32_t N,
-		const __restrict__ uint32_t *input, F1_Bucketed_kBC_Entry *kbc_local_entries, unsigned int *kbc_local_num_entries,
-		uint32_t KBC_START, uint32_t KBC_END)
-{
-	uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
-	int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-	int stride = blockDim.x * gridDim.x;
-	const uint32_t end_n = N / 16; // 16 x's in each group
-	/*const uint32_t include_xs[64] = {602009779,2127221679,3186459061,443532047,1234434947,1652736830,396228306,464118917,
-			3981993340,3878862024,1730679522,3234011360,521197720,2635193875,2251292298,608281027,
-			1468569780,2075860307,2880258779,999340005,1240438978,4293399624,4226635802,1031429862,
-			2391120891,3533658526,3823422504,3983813271,4180778279,2403148863,2441456056,319558395,
-			2338010591,196206622,1637393731,853158574,2704638588,2368357012,1703808356,451208700,
-			2145291166,2741727812,3305809226,1748168268,415625277,3051905493,4257489502,1429077635,
-			2438113590,3028543211,3993396297,2678430597,458920999,889121073,3577485087,1822568056,
-			2222781147,1942400192,195608354,1460166215,2544813525,3231425778,2958837604,2710532969};*/
-
-	for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-		uint32_t x = x_group << 4;// *16;
-		uint32_t pos = x_group;
-
-		x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
-		x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
-		x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
-		x14 = input[14];x15 = input[15];
-
-		#pragma unroll
-		for (int i = 0; i < 4; i++) {
-			QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
-			QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
-		}
-
-		x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
-		x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
-		x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0;
-		x14 += input[14];x15 += input[15];
-
-		// convert to little endian/big endian whatever, chia needs it like this
-		BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
-		BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
-		BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
-
-		//uint64_t y = x0 << 6 + x >> 26;  for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-		//int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-		KBCFILTER(x0,0);KBCFILTER(x1,1);KBCFILTER(x2,2);KBCFILTER(x3,3);
-		KBCFILTER(x4,4);KBCFILTER(x5,5);KBCFILTER(x6,6);KBCFILTER(x7,7);
-		KBCFILTER(x8,8);KBCFILTER(x9,9);KBCFILTER(x10,10);KBCFILTER(x11,11);
-		KBCFILTER(x12,12);KBCFILTER(x13,13);KBCFILTER(x14,14);KBCFILTER(x15,15);
-	}
-}
-
-#define ATTACK_INTO_KBC_YS(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	int slot = atomicAdd(&kbc_global_num_entries_L[kbc_bucket_id],1); \
-	if (slot >= MAX_LXS_PER_KBC_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u\n", MAX_LXS_PER_KBC_BUCKET, slot); } \
-	uint32_t entries_address = kbc_bucket_id * MAX_LXS_PER_KBC_BUCKET + slot; \
-	kbc_global_Ly_entries_L[entries_address] = y; \
-	kbc_x_entries[entries_address] = (x + i); \
-}
-
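-// Hedged sketch (not part of the original pipeline): the packed-counter trick used
-// by the *_BITMASK style macros in this file, factored into a standalone device
-// helper. All names are illustrative only; a lane must stay below its max count
-// or the carry overflows into the neighbouring lane.
-__device__ __forceinline__ uint32_t packed_counter_reserve_slot(
-		unsigned int *counters,      // words of packed per-bucket counters
-		uint32_t bucket_id,          // logical bucket being appended to
-		uint32_t bits_per_counter,   // e.g. 5 bits -> counts 0..31
-		uint32_t counters_per_word,  // e.g. 6 five-bit counters per 32-bit word
-		uint32_t counter_mask)       // e.g. 0b011111 for 5 bits
-{
-	uint32_t word  = bucket_id / counters_per_word;
-	uint32_t shift = bits_per_counter * (bucket_id % counters_per_word);
-	// one atomicAdd of (1 << shift) bumps only this bucket's lane; the returned
-	// pre-add value contains our reserved slot for that lane
-	unsigned int old = atomicAdd(&counters[word], 1u << shift);
-	return (old >> shift) & counter_mask;
-}
-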
-// can hold 6 entries of 5 bits each = 5*6 = 30 bits.
-#define KBC_MASK_SHIFT 5
-#define KBC_MASK_MOD 6
-#define KBC_MASK_BITS 0b011111
-#define ATTACK_INTO_KBC_YS_BITMASK(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	uint32_t kbc_bitmask_bucket = kbc_bucket_id / KBC_MASK_MOD; \
-	uint32_t kbc_bitmask_shift = KBC_MASK_SHIFT * (kbc_bucket_id % KBC_MASK_MOD); \
-	uint32_t add = 1 << kbc_bitmask_shift; \
-	uint slot_value = atomicAdd(&kbc_global_num_entries_L[kbc_bitmask_bucket],add); \
-	uint slot = (slot_value >> kbc_bitmask_shift) & KBC_MASK_BITS; \
-	if (slot >= MAX_LXS_PER_KBC_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u\n", MAX_LXS_PER_KBC_BUCKET, slot); } \
-	uint32_t entries_address = kbc_bucket_id * MAX_LXS_PER_KBC_BUCKET + slot; \
-	kbc_global_Ly_entries_L[entries_address] = y; \
-	kbc_x_entries[entries_address] = (x + i); \
-}
-
-#define CHACHA_OUT(chacha_y,i) \
-{ \
-	chachas[x+i] = chacha_y; \
-}
-
-// uint16_t indJ = l_y / kC;
-// uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC);
-// OK, so we get all our Lx's and get their Ly's, and then compute their target Rys,
-// but then we have to write this to huge data of global_target_rys which is 38 bits.
-// even with 1 bit per entry it's too much data, unless we remove bottom bits and get some false positives.
-// 2^38 bits of bitmap is too much; dropping 3 bottom bits leaves 2^35 bits = 2^32 bytes...means we can do 4 Lx passes and 4 Rx passes...interesting...
-// will have to do binary tree search for rxs...fuck.
-__global__
-void gpu_chacha8_only_chacha_results(const uint32_t N,
-		const __restrict__ uint32_t *input,
-		uint32_t *chachas)
-{
-	uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
-	int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-	int stride = blockDim.x * gridDim.x;
-	const uint32_t end_n = N / 16; // 16 x's in each group
-
-	for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-		uint32_t x = x_group << 4;// *16;
-		uint32_t pos = x_group;
-
-		x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
-		x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
-		x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
-		x14 = input[14];x15 = input[15];
-
-		#pragma unroll
-		for (int i = 0; i < 4; i++) {
-			QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
-			QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
-		}
-
-		x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
-		x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
-		x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0;
-		x14 += input[14];x15 += input[15];
-
-		// convert to little endian/big endian whatever, chia needs it like this
-		BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
-		BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
-		BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
-
-		//uint64_t y = x0 << 6 + x >> 26;  for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-		//int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-		CHACHA_OUT(x0,0);CHACHA_OUT(x1,1);CHACHA_OUT(x2,2);CHACHA_OUT(x3,3);
CHACHA_OUT(x4,4);CHACHA_OUT(x5,5);CHACHA_OUT(x6,6);CHACHA_OUT(x7,7); - CHACHA_OUT(x8,8);CHACHA_OUT(x9,9);CHACHA_OUT(x10,10);CHACHA_OUT(x11,11); - CHACHA_OUT(x12,12);CHACHA_OUT(x13,13);CHACHA_OUT(x14,14);CHACHA_OUT(x15,15); - } -} - -#define CHACHA_BUCKET_OUT(chacha_y,i) \ -{ \ - uint32_t rx_bucket = chacha_y / CHACHA_BUCKET_DIVISOR; \ - if ((rx_bucket > CHACHA_BUCKET_RANGE_MIN) && (rx_bucket <= CHACHA_BUCKET_RANGE_MAX)) { \ - rx_bucket = rx_bucket - CHACHA_BUCKET_RANGE_MIN; \ - uint slot = atomicAdd(&shared_rx_counts[rx_bucket],1); \ - if (slot > MAX_ENTRIES_PER_LOCAL_BUCKET) printf("CHACHA BUCKET OUT SLOT OVERFLOW %u\n", slot); \ - chachas_buffer[rx_bucket * NUM_LOCAL_BUCKETS + slot] = chacha_y; \ - xs_buffer[rx_bucket * NUM_LOCAL_BUCKETS + slot] = (x+i); \ - } \ -} -//printf("PASSED FILTER local rx bucket %u slot %u\n", chacha_y, rx_bucket+CHACHA_BUCKET_MIN, rx_bucket, slot); \ - printf("chacha y: %u rx_bucket %u \n", chacha_y, rx_bucket); \ chachas[address] = chacha_y; \ - //rxs[address] = (x+i); \ - -#define ATTACK_WRITE_CHACHAS32_PAIR(chacha_y,i) \ -{ \ - xchacha_pair pair = { base_x + i, chacha_y }; \ - shared_chachas[threadIdx.x*32+i] = pair; \ - const uint32_t bucket_id = pair.chacha >> (32 - CHACHA_BUCKET_BITS); \ - atomicAdd(&shared_counts[bucket_id],1); \ -} - -// run with 128 blocksize, more doesn't matter. -template -__global__ -void gpu_chacha8_k32_write_chachas32_buckets(const uint32_t N, const uint32_t X_START, - const uint32_t CHACHA_MAX_PER_BUCKET, - const __restrict__ uint32_t *input, - xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - //__shared__ uint32_t datax[33*256]; // each thread (256 max) gets its own shared access starting at 32 byte boundary. - //uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - __shared__ xchacha_pair shared_chachas[128*32]; // *possibly* using 32 to prevent some bank conflicts can help, but don't thing so. 
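-	// Note added for clarity: shared_chachas is 128*32 = 4096 xchacha_pair entries
-	// at 8 bytes each = 32KB of shared memory, which is why blockDim.x is capped
-	// at 128 below. Also worth flagging: ATTACK_WRITE_CHACHAS32_PAIR stores
-	// pair.x = base_x + i with a block-uniform base_x, so every thread appears to
-	// record the same 32 x values; the per-thread x would be base_x + threadIdx.x*32 + i.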
- __shared__ uint shared_counts[NUM_BUCKETS]; - __shared__ uint global_counts[NUM_BUCKETS]; - - if (blockDim.x > 128) printf("MUST HAVE BLOCKSIZE 128 (RECOMMENDED) OR LESS, OR INCREASED SHARED MEM TO MORE\n"); - - uint32_t base_group = blockIdx.x * blockDim.x; - uint32_t base_x = base_group * 32; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 32; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - for (int i=threadIdx.x;i> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+0],0);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+1],1);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+2],2);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+3],3); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+4],4);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+5],5);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+6],6);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+7],7); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+8],8);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+9],9);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+10],10);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+11],11); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+12],12);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+13],13);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+14],14);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+15],15); - - pos += 1; - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - 
QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+0],16+0);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+1],16+1);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+2],16+2);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+3],16+3); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+4],16+4);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+5],16+5);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+6],16+6);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+7],16+7); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+8],16+8);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+9],16+9);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+10],16+10);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+11],16+11); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+12],16+12);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+13],16+13);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+14],16+14);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+15],16+15); - - } - __syncthreads(); - for (int i=threadIdx.x;i> (32 - CHACHA_BUCKET_BITS); // 16 buckets - uint slot = global_counts[bucket_id] + atomicAdd(&shared_counts[bucket_id],1); - if (slot > CHACHA_MAX_PER_BUCKET) printf("Overflow CHACHA_MAX_PER_BUCKET %u SLOT %u\n", CHACHA_MAX_PER_BUCKET, slot); - else xchachas_buckets[CHACHA_MAX_ENTRIES_PER_BUCKET * bucket_id + slot] = shared_chachas[i]; - } -} - -#define ATTACK_BUCKETBATCH_CHACHAS32_PAIR(chacha_y,i) \ -{ \ - if ((chacha_y >= BATCH_CHACHA_RANGE_MIN) && (chacha_y <= BATCH_CHACHA_RANGE_MAX)) { \ - xchacha_pair pair = { base_x + i, chacha_y }; \ - int slot = atomicAdd(&local_filter_count,1); \ - if (slot > MAX_SHARED_CHACHAS) printf("MAX_SHARED_CHACHAS %u OVERFLOW %u\n", MAX_SHARED_CHACHAS, slot); \ - shared_chachas[slot] = pair; \ - uint32_t split_bucket_id = (chacha_y - BATCH_CHACHA_RANGE_MIN) / CHACHA_SPLIT_BUCKET_DIVISOR; \ - atomicAdd(&shared_counts[split_bucket_id],1); \ - } \ -} - -// run with 128 blocksize, more doesn't matter. 
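-// Added sketch (not in the original): the macro above keeps only chacha values
-// inside the current batch window [BATCH_CHACHA_RANGE_MIN, BATCH_CHACHA_RANGE_MAX],
-// then fans them into equal-width split buckets. The split-bucket arithmetic,
-// restated as a hypothetical helper:
-__device__ __forceinline__ uint32_t chacha_split_bucket(uint32_t chacha_y,
-		uint32_t batch_range_min, uint32_t split_bucket_divisor)
-{
-	// caller has already range-checked chacha_y against the batch window
-	return (chacha_y - batch_range_min) / split_bucket_divisor;
-}
-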
-template <uint32_t NUM_SPLIT_BUCKETS>
-__global__
-void gpu_chacha8_k32_compute_chachas32_filter_buckets_bychachabatchrange(const uint32_t N,
-		const uint32_t BATCH_CHACHA_RANGE_MIN, const uint32_t BATCH_CHACHA_RANGE_MAX,
-		const uint32_t CHACHA_MAX_PER_SPLIT_BUCKET, const uint32_t CHACHA_SPLIT_BUCKET_DIVISOR,
-		const __restrict__ uint32_t *input,
-		xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts)
-{
-	uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local
-	//__shared__ uint32_t datax[33*256]; // each thread (256 max) gets its own shared access starting at 32 byte boundary.
-	//uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-	const uint32_t MAX_SHARED_CHACHAS = 128*8; // try to bring down as much as can
-	__shared__ xchacha_pair shared_chachas[MAX_SHARED_CHACHAS]; // *possibly* using 32 to prevent some bank conflicts can help, but don't think so.
-	__shared__ uint shared_counts[NUM_SPLIT_BUCKETS];
-	__shared__ uint global_counts[NUM_SPLIT_BUCKETS];
-	__shared__ uint local_filter_count;
-
-	//if (blockDim.x > 128) printf("MUST HAVE BLOCKSIZE 128 (RECOMMENDED) OR LESS, OR INCREASED SHARED MEM TO MORE\n");
-
-	uint32_t base_group = blockIdx.x * blockDim.x;
-	uint32_t base_x = base_group * 32;
-	int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-	const uint32_t end_n = N / 32; // 32 x's per thread
-	//printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group);
-
-	for (int i=threadIdx.x;i<NUM_SPLIT_BUCKETS;i+=blockDim.x) shared_counts[i] = 0;
-	if (threadIdx.x == 0) local_filter_count = 0;
-	__syncthreads();
-
-	if (x_group <= end_n) {
-		uint32_t pos = x_group * 2;
-		const int j = 0;
-
-		datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7];
-		datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11];
-		datax[j+12] = pos; datax[j+13] = 0; // pos never bigger than 32 bit pos >> 32;
-		datax[j+14] = input[14];datax[j+15] = input[15];
-
-	#pragma unroll
-		for (int i = 0; i < 4; i++) {
-			QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]);
-			QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]);
-			QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]);
-			QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]);
-		}
-
-		datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4];
-		datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9];
-		datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0;
-		datax[j+14] += input[14];datax[j+15] += input[15];
-
-		// convert to little endian/big endian whatever, chia needs it like this
-		BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]);
-		BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]);
-		BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]);
-
-		//uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-		//int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023
-		ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+0],0);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+1],1);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+2],2);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+3],3); -
ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+4],4);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+5],5);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+6],6);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+7],7); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+8],8);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+9],9);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+10],10);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+11],11); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+12],12);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+13],13);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+14],14);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+15],15); - - pos += 1; - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+0],16+0);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+1],16+1);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+2],16+2);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+3],16+3); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+4],16+4);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+5],16+5);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+6],16+6);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+7],16+7); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+8],16+8);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+9],16+9);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+10],16+10);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+11],16+11); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+12],16+12);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+13],16+13);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+14],16+14);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+15],16+15); - } - // at this point we have 128*32 = 4096 entries - // now we have to sort them into the 
buckets - // we already have the shared counts set from the ATTACK macro - __syncthreads(); - for (int i=threadIdx.x;i CHACHA_MAX_PER_SPLIT_BUCKET) printf("Overflow CHACHA_MAX_PER_BUCKET %u SLOT %u\n", CHACHA_MAX_PER_SPLIT_BUCKET, slot); - else xchachas_buckets[CHACHA_MAX_PER_SPLIT_BUCKET * split_bucket_id + slot] = shared_chachas[i]; - } -} - - -__global__ -void gpu_chacha8_tag_rxs_from_chacha(const uint32_t N, - const __restrict__ uint32_t *input, - const uint16_t *kbc_global_Ly_entries_L, const unsigned int *kbc_global_num_entries_L, const uint32_t MAX_LXS_PER_KBC_BUCKET, - uint32_t *chachas) -{ - int x = blockIdx.x * blockDim.x + threadIdx.x; - if (x < N) { - uint32_t chacha_y = chachas[x]; - uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); - int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; - if (kbc_bucket_id_L > 0) { - int num = kbc_global_num_entries_L[kbc_bucket_id_L]; - for (int nm=0;nm= kB) { - m -= kB; - } - if (m < 64) { - int16_t yl_cid = yl_kbc % kC; - int16_t yr_cid = yr_kbc % kC; - int16_t parity = (kbc_bucket_id_L) % 2; - int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; - int16_t formula_two = yr_cid - yl_cid; - if (formula_two < 0) { - formula_two += kC; - } - if (formula_two == m2_parity_squared) { - isMatch = true; - } - } - if (isMatch) { - chachas[x] = 0; - } - } - } - } - -} - -__global__ -void gpu_chacha8_filter_rxs_from_chacha(const uint32_t N, const uint32_t *chachas, uint32_t *rxs, int *rx_count) -{ - int x = blockIdx.x * blockDim.x + threadIdx.x; - if (x < N) { - uint32_t chacha_y = chachas[x]; - if (chacha_y == 0) { - int slot = atomicAdd(&rx_count[0], 1); - rxs[slot] = x; - } - } - -} - -__global__ -void gpu_chacha8_set_Lxs_into_kbc_ys(const uint32_t N, - const __restrict__ uint32_t *input, - uint16_t *kbc_global_Ly_entries_L, uint32_t *kbc_x_entries, unsigned int *kbc_global_num_entries_L, uint32_t MAX_LXS_PER_KBC_BUCKET) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - - for (uint32_t x_group = index; x_group < end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is 
>> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - ATTACK_INTO_KBC_YS(x0,0);ATTACK_INTO_KBC_YS(x1,1);ATTACK_INTO_KBC_YS(x2,2);ATTACK_INTO_KBC_YS(x3,3); - ATTACK_INTO_KBC_YS(x4,4);ATTACK_INTO_KBC_YS(x5,5);ATTACK_INTO_KBC_YS(x6,6);ATTACK_INTO_KBC_YS(x7,7); - ATTACK_INTO_KBC_YS(x8,8);ATTACK_INTO_KBC_YS(x9,9);ATTACK_INTO_KBC_YS(x10,10);ATTACK_INTO_KBC_YS(x11,11); - ATTACK_INTO_KBC_YS(x12,12);ATTACK_INTO_KBC_YS(x13,13);ATTACK_INTO_KBC_YS(x14,14);ATTACK_INTO_KBC_YS(x15,15); - } -} - -__global__ -void gpu_chacha8_set_Lxs_into_kbc_ys_mask(const uint32_t N, - const __restrict__ uint32_t *input, - uint16_t *kbc_global_Ly_entries_L, uint32_t *kbc_x_entries, unsigned int *kbc_global_num_entries_L, uint32_t MAX_LXS_PER_KBC_BUCKET) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - - for (uint32_t x_group = index; x_group < end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - ATTACK_INTO_KBC_YS_BITMASK(x0,0);ATTACK_INTO_KBC_YS_BITMASK(x1,1);ATTACK_INTO_KBC_YS_BITMASK(x2,2);ATTACK_INTO_KBC_YS_BITMASK(x3,3); - ATTACK_INTO_KBC_YS_BITMASK(x4,4);ATTACK_INTO_KBC_YS_BITMASK(x5,5);ATTACK_INTO_KBC_YS_BITMASK(x6,6);ATTACK_INTO_KBC_YS_BITMASK(x7,7); - ATTACK_INTO_KBC_YS_BITMASK(x8,8);ATTACK_INTO_KBC_YS_BITMASK(x9,9);ATTACK_INTO_KBC_YS_BITMASK(x10,10);ATTACK_INTO_KBC_YS_BITMASK(x11,11); - ATTACK_INTO_KBC_YS_BITMASK(x12,12);ATTACK_INTO_KBC_YS_BITMASK(x13,13);ATTACK_INTO_KBC_YS_BITMASK(x14,14);ATTACK_INTO_KBC_YS_BITMASK(x15,15); - } -} - - - -#define ATTACK_FILTER_RXS(chacha_y,i) \ -{ \ - uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \ - if ((kbc_bucket_id_L > KBC_MIN_RANGE) && (kbc_bucket_id_L <= KBC_MAX_RANGE)) { \ - int num = kbc_global_num_entries_L[kbc_bucket_id_L]; \ - for (int nm=0;nm> 26); \ - int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \ - isMatch = false; \ - if (kbc_bucket_id_L > 0) { \ - uint64_t Ly = 
kbc_global_Ly_entries_L[kbc_bucket_id_L * MAX_LXS_PER_KBC_BUCKET]; \ - if (Ly > 0) { \ - CHECK_MATCH(); \ - } \ - } \ - if (isMatch) { \ - int slot = atomicAdd(&rx_count[0],1); \ - rxs[slot] = (x+i); \ - } \ -} - - -__global__ -void gpu_chacha8_filter_rxs(const uint32_t N, - const __restrict__ uint32_t *input, - const uint16_t* __restrict__ kbc_global_Ly_entries_L, const unsigned int* __restrict__ kbc_global_num_entries_L, uint32_t MAX_LXS_PER_KBC_BUCKET, - uint32_t * __restrict__ rxs, int *rx_count, - const uint32_t KBC_MIN_RANGE, const uint32_t KBC_MAX_RANGE) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - - - for (uint32_t x_group = index; x_group <= end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - bool isMatch = false; - ATTACK_FILTER_RXS(x0,0);ATTACK_FILTER_RXS(x1,1);ATTACK_FILTER_RXS(x2,2);ATTACK_FILTER_RXS(x3,3); - ATTACK_FILTER_RXS(x4,4);ATTACK_FILTER_RXS(x5,5);ATTACK_FILTER_RXS(x6,6);ATTACK_FILTER_RXS(x7,7); - ATTACK_FILTER_RXS(x8,8);ATTACK_FILTER_RXS(x9,9);ATTACK_FILTER_RXS(x10,10);ATTACK_FILTER_RXS(x11,11); - ATTACK_FILTER_RXS(x12,12);ATTACK_FILTER_RXS(x13,13);ATTACK_FILTER_RXS(x14,14);ATTACK_FILTER_RXS(x15,15); - } -} - -__global__ -void gpu_chacha8_filter_rxs_from_bucket_batch_async( - const uint32_t N, - const xchacha_pair* __restrict__ xchachas, - const uint16_t* __restrict__ kbc_global_Ly_entries_L, - const unsigned int* __restrict__ kbc_global_num_entries_L, - uint32_t MAX_LXS_PER_KBC_BUCKET, - uint32_t * __restrict__ rxs, - int *rx_count) -{ - __shared__ uint16_t copy_Ly_entries[64]; - - cuda::barrier bar; - init(&bar, 1); - - int num; - int i = blockIdx.x*blockDim.x+threadIdx.x; - if (i < N) { - xchacha_pair entry = xchachas[i]; - uint64_t Ry = (((uint64_t) entry.chacha) << 6) + (entry.x >> 26); - int kbc_bucket_id_R = (uint32_t (Ry / kBC)); - if (kbc_bucket_id_R > 0) { - int kbc_bucket_id_L = kbc_bucket_id_R - 1; - //printf("entry x:%u chacha:%u\n", entry.x, entry.chacha, kbc_bucket_id_L); - num = 
kbc_global_num_entries_L[kbc_bucket_id_L]; - cuda::memcpy_async(©_Ly_entries[0], - &kbc_global_Ly_entries_L[kbc_bucket_id_L * MAX_LXS_PER_KBC_BUCKET], sizeof(uint16_t)*num, bar); - bar.arrive_and_wait(); - for (int nm=0;nm> 26); - int kbc_bucket_id_R = (uint32_t (Ry / kBC)); - if (kbc_bucket_id_R > 0) { - int kbc_bucket_id_L = kbc_bucket_id_R - 1; - //printf("entry x:%u chacha:%u\n", entry.x, entry.chacha, kbc_bucket_id_L); - //int num = kbc_global_num_entries_L[kbc_bucket_id_L]; - - //uint num = kbc_global_num_entries_L[kbc_bucket_id_L]; - uint32_t kbc_bitmask_bucket = kbc_bucket_id_L / KBC_MASK_MOD; - uint32_t kbc_bitmask_shift = KBC_MASK_SHIFT * (kbc_bucket_id_L % KBC_MASK_MOD); - uint slot_value =kbc_global_num_entries_L[kbc_bitmask_bucket]; - uint num = (slot_value >> kbc_bitmask_shift) & KBC_MASK_BITS; - for (int nm=0;nm> 26); \ - int kbc_bucket_id = (uint32_t (y / kBC)); \ - int kbc_bitmask_bucket = kbc_bucket_id / 32; \ - int kbc_bit_slot = kbc_bucket_id % 32; \ - unsigned int kbc_mask = 1 << kbc_bit_slot; \ - atomicOr(&kbc_global_bitmask[kbc_bitmask_bucket],kbc_mask); \ -} - -#define ATTACK_FILTER_BITMASK_batch64(chacha_y,i) \ -{ \ - uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \ - if (kbc_bucket_id_L > 0) { \ - int kbc_bitmask_bucket = kbc_bucket_id_L / 32; \ - int kbc_bit_slot = kbc_bucket_id_L % 32; \ - unsigned int kbc_mask = 1 << kbc_bit_slot; \ - unsigned int kbc_value = kbc_global_bitmask[kbc_bitmask_bucket]; \ - if ((kbc_mask & kbc_value) > 0) { \ - uint32_t batch_id = kbc_bucket_id_L >> (32-6); \ - int slot = atomicAdd(&rx_count[batch_id],1); \ - rxs[batch_id * RX_MAX_ENTRIES_PER_BATCH + slot] = (x+i); \ - } \ - } \ -} - -#define ATTACK_FILTER_BITMASK(chacha_y,i) \ -{ \ - uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \ - if (kbc_bucket_id_L > 0) { \ - int kbc_bitmask_bucket = kbc_bucket_id_L / 32; \ - int kbc_bit_slot = kbc_bucket_id_L % 32; \ - unsigned int kbc_mask = 1 << kbc_bit_slot; \ - unsigned int kbc_value = kbc_global_bitmask[kbc_bitmask_bucket]; \ - if ((kbc_mask & kbc_value) > 0) { \ - int slot = atomicAdd(&rx_local_count,1); \ - shared_rxs[slot] = (x+i); \ - } \ - } \ -} - -#define ATTACK_FILTER_BITMASK_origbeforeaddingshared(chacha_y,i) \ -{ \ - uint64_t Ry = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - int kbc_bucket_id_L = (uint32_t (Ry / kBC)) - 1; \ - if (kbc_bucket_id_L > 0) { \ - int kbc_bitmask_bucket = kbc_bucket_id_L / 32; \ - int kbc_bit_slot = kbc_bucket_id_L % 32; \ - unsigned int kbc_mask = 1 << kbc_bit_slot; \ - unsigned int kbc_value = kbc_global_bitmask[kbc_bitmask_bucket]; \ - if ((kbc_mask & kbc_value) > 0) { \ - int slot = atomicAdd(&rx_count[0],1); \ - rxs[slot] = (x+i); \ - } \ - } \ -} - -__global__ -void gpu_chacha8_set_Lxs_into_kbc_bitmask(const uint32_t N, - const __restrict__ uint32_t *input, - unsigned int* kbc_global_bitmask) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - - for (uint32_t x_group = index; x_group <= end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; 
// pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - ATTACK_SET_BITMASK(x0,0);ATTACK_SET_BITMASK(x1,1);ATTACK_SET_BITMASK(x2,2);ATTACK_SET_BITMASK(x3,3); - ATTACK_SET_BITMASK(x4,4);ATTACK_SET_BITMASK(x5,5);ATTACK_SET_BITMASK(x6,6);ATTACK_SET_BITMASK(x7,7); - ATTACK_SET_BITMASK(x8,8);ATTACK_SET_BITMASK(x9,9);ATTACK_SET_BITMASK(x10,10);ATTACK_SET_BITMASK(x11,11); - ATTACK_SET_BITMASK(x12,12);ATTACK_SET_BITMASK(x13,13);ATTACK_SET_BITMASK(x14,14);ATTACK_SET_BITMASK(x15,15); - } -} - - - -__global__ -void gpu_chacha8_filter_rxs_by_kbc_bitmask(const uint32_t N, - const __restrict__ uint32_t *input, - const unsigned int* __restrict__ kbc_global_bitmask, - uint32_t * __restrict__ rxs, int *rx_count, - const uint32_t RX_BATCHES, const uint32_t RX_MAX_ENTRIES_PER_BATCH) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - __shared__ uint32_t shared_rxs[1024]; - __shared__ int rx_local_count; - __shared__ int global_slot; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - if (threadIdx.x == 0) { - rx_local_count = 0; - } - for (uint32_t x_group = index; x_group <= end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - __syncthreads(); - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - 
BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - ATTACK_FILTER_BITMASK(x0,0);ATTACK_FILTER_BITMASK(x1,1);ATTACK_FILTER_BITMASK(x2,2);ATTACK_FILTER_BITMASK(x3,3); - ATTACK_FILTER_BITMASK(x4,4);ATTACK_FILTER_BITMASK(x5,5);ATTACK_FILTER_BITMASK(x6,6);ATTACK_FILTER_BITMASK(x7,7); - ATTACK_FILTER_BITMASK(x8,8);ATTACK_FILTER_BITMASK(x9,9);ATTACK_FILTER_BITMASK(x10,10);ATTACK_FILTER_BITMASK(x11,11); - ATTACK_FILTER_BITMASK(x12,12);ATTACK_FILTER_BITMASK(x13,13);ATTACK_FILTER_BITMASK(x14,14);ATTACK_FILTER_BITMASK(x15,15); - - __syncthreads(); - if (threadIdx.x == 0) { - global_slot = atomicAdd(&rx_count[0],rx_local_count); - rx_local_count = 0; - } - __syncthreads(); - for (int i=threadIdx.x;i(alloc_finish - attack_start).count() << " ms\n"; - - auto compute_only_start = std::chrono::high_resolution_clock::now(); - - int blockSize; // # of threads per block, maximum is 1024. - uint64_t calc_N; - uint64_t calc_blockSize; - uint64_t calc_numBlocks; - int numBlocks; - -/* std::cout << " gpu_chacha8_set_Lxs_into_kbc_bitmask \n"; - int blockSize = 16; // # of threads per block, maximum is 1024. - uint64_t calc_N = num_lxs; - uint64_t calc_blockSize = blockSize; - uint64_t calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - int numBlocks = calc_numBlocks; - - auto chacha_start = std::chrono::high_resolution_clock::now(); - gpu_chacha8_set_Lxs_into_kbc_bitmask<<>>(calc_N, chacha_input, - device_global_kbc_num_entries_L); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto chacha_finish = std::chrono::high_resolution_clock::now(); - std::cout << " - gpu_chacha8_set_Lxs_into_kbc_bitmask results: " << std::chrono::duration_cast(chacha_finish - chacha_start).count() << " ms\n"; - - F1_Bucketed_kBC_Entry *local_kbc_entries = (F1_Bucketed_kBC_Entry *) rx_match_list; - chacha_start = std::chrono::high_resolution_clock::now(); - // 1) gpu scan kbs into (F1_Bucketed_kBC_Entry *) bufferA - //std::cout << " Generating F1 results into kbc buckets..."; - blockSize = 128; // # of threads per block, maximum is 1024. - calc_N = UINT_MAX; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - numBlocks = calc_numBlocks; - //std::cout << " Block configuration: [blockSize:" << blockSize << " numBlocks:" << numBlocks << "]" << std::endl; - // don't forget to clear counter...will only use a portion of this memory so should be fast access. - - CUDA_CHECK_RETURN(cudaMemset(device_global_kbc_num_entries_L, 0, 10000000*sizeof(int))); - gpu_chacha8_get_k32_keystream_into_local_kbc_entries<<>>(calc_N, chacha_input, - local_kbc_entries, device_global_kbc_num_entries_L, 0, 2000000); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - chacha_finish = std::chrono::high_resolution_clock::now(); - std::cout << " - gpu_chacha8_get_k32_keystream_into_local_kbc_entries results: " << std::chrono::duration_cast(chacha_finish - chacha_start).count() << " ms\n"; - - - std::cout << " gpu_chacha8_filter_rxs_by_kbc_bitmask \n"; - blockSize = 256; // # of threads per block, maximum is 1024. 
- calc_N = UINT_MAX;
- calc_blockSize = blockSize;
- calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16);
- numBlocks = calc_numBlocks;
-
- chacha_start = std::chrono::high_resolution_clock::now();
- gpu_chacha8_filter_rxs_by_kbc_bitmask<<<numBlocks, blockSize>>>(calc_N, chacha_input,
-		device_global_kbc_num_entries_L,
-		rx_match_list, rx_match_count,
-		RX_BATCHES, RX_MAX_ENTRIES_PER_BATCH);
- CUDA_CHECK_RETURN(cudaDeviceSynchronize());
- chacha_finish = std::chrono::high_resolution_clock::now();
- std::cout << " gpu_chacha8_filter_rxs_by_kbc_bitmask results: " << std::chrono::duration_cast<std::chrono::milliseconds>(chacha_finish - chacha_start).count() << " ms\n";
- std::cout << " found " << rx_match_count[0] << " RXS" << std::endl;
-
-*/
-
- // FIRST SET LXS into global memory, these stay put for each chacha round
- blockSize = 128; // # of threads per block, maximum is 1024.
- calc_N = num_lxs;
- calc_blockSize = blockSize;
- calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16);
- numBlocks = calc_numBlocks;
-
- std::cout << " gpu_chacha8_set_Lxs_into_kbc_ys num:" << calc_N << std::endl;
- auto lxintokbc_start = std::chrono::high_resolution_clock::now();
- gpu_chacha8_set_Lxs_into_kbc_ys_mask<<<numBlocks, blockSize>>>(calc_N, chacha_input,
-		kbc_Ly_entries, kbc_x_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET);
-
- /* Doing chacha batch 7
-    gpu_chacha8_k32_write_chachas32_buckets results: 32 ms
-    chacha Rxs time: 37 ms
-    found 90582467 matches
-    Freeing memory...
-    total chachas time: 248 ms
-    total Rxs time: 302 ms
-    compute only time: 654 ms attack total time: 692 ms */
- CUDA_CHECK_RETURN(cudaDeviceSynchronize());
- auto lxintokbc_finish = std::chrono::high_resolution_clock::now();
- std::cout << " gpu_chacha8_set_Lxs_into_kbc_ys time: " << std::chrono::duration_cast<std::chrono::milliseconds>(lxintokbc_finish - lxintokbc_start).count() << " ms\n";
- gpu_get_max_counts_from_counter_list<<<1,1024>>>(device_global_kbc_num_entries_L, kBC_NUM_BUCKETS, false);
-
- int64_t total_chacha_ms = 0;
- int64_t total_rx_ms = 0;
- for (uint64_t chacha_batch_id = 0; chacha_batch_id < CHACHA_NUM_BATCHES; chacha_batch_id++) {
-	std::cout << "Doing chacha batch " << chacha_batch_id << std::endl;
-	uint64_t BATCH_CHACHA_DIVISOR = (1ull << (32 - CHACHA_NUM_BATCHES_BITS));
-	uint64_t BATCH_CHACHA_RANGE_MIN = ((uint64_t) (chacha_batch_id + 0)) * BATCH_CHACHA_DIVISOR;
-	uint64_t BATCH_CHACHA_RANGE_MAX = ((uint64_t) (chacha_batch_id + 1)) * BATCH_CHACHA_DIVISOR - 1; // use -1 since range is inclusive, also helps stay in 32-bit range rather than wrap to 0 for last batch
-	//if (chacha_batch_id == CHACHA_NUM_BATCHES - 1) BATCH_CHACHA_RANGE_MAX = UINT_MAX;
-
-	//std::cout << " BATCH_CHACHA_DIVISOR : " << BATCH_CHACHA_DIVISOR << std::endl;
-	//std::cout << " BATCH_CHACHA_RANGE : " << BATCH_CHACHA_RANGE_MIN << " <-> " << BATCH_CHACHA_RANGE_MAX << std::endl;
-	//std::cout << " BATCH_CHACHA_TOTAL_ENTRIES : " << CHACHA_TOTAL_ENTRIES_PER_BATCH << std::endl;
-	//std::cout << " CHACHA_MAX_ENTRIES_PER_BUCKET : " << CHACHA_MAX_ENTRIES_PER_BUCKET << std::endl;
-	//std::cout << " CHACHA_SPLIT_BUCKET_DIVISOR : " << CHACHA_SPLIT_BUCKET_DIVISOR << std::endl;
-
-
-	blockSize = 128; // # of threads per block, maximum is 1024.
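-	// Note added for clarity: each thread of the kernel launched below emits 32
-	// x values (two 16-word chacha blocks), hence the grid size of
-	// ceil(N / (blockSize * 32)) computed next; with N = UINT_MAX and
-	// blockSize = 128 that is about 2^20 blocks.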
-	calc_N = UINT_MAX;//CHACHA_TOTAL_ENTRIES_PER_BATCH;
-	uint32_t CHACHA_X_START = 0;//chacha_batch_id * calc_N;
-	calc_blockSize = blockSize;
-	calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 32);
-	numBlocks = calc_numBlocks;
-	CUDA_CHECK_RETURN(cudaMemset(xchachas_bucket_counts, 0, CHACHA_NUM_BUCKETS*sizeof(int)));
-	auto chacha_start = std::chrono::high_resolution_clock::now();
-	//std::cout << " calc_N : " << calc_N << " numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl;
-	gpu_chacha8_k32_compute_chachas32_filter_buckets_bychachabatchrange<CHACHA_NUM_BUCKETS><<<numBlocks, blockSize>>>(calc_N,
-			BATCH_CHACHA_RANGE_MIN, BATCH_CHACHA_RANGE_MAX,
-			CHACHA_MAX_ENTRIES_PER_BUCKET, CHACHA_SPLIT_BUCKET_DIVISOR,
-			chacha_input,
-			xchachas, xchachas_bucket_counts);
-
-
-	//gpu_chacha8_only_chacha_results<<<numBlocks, blockSize>>>(calc_N, chacha_input,
-	//		chachas);
-	//gpu_chacha8_k32_write_chachas32_buckets<<<numBlocks, blockSize>>>(calc_N, CHACHA_X_START,
-	//		CHACHA_MAX_ENTRIES_PER_BUCKET,
-	//		chacha_input,
-	//		xchachas, xchachas_bucket_counts);
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	auto chacha_finish = std::chrono::high_resolution_clock::now();
-	total_chacha_ms += std::chrono::duration_cast<std::chrono::milliseconds>(chacha_finish - chacha_start).count();
-	std::cout << " gpu_chacha8_k32_compute_chachas32_filter_buckets_bychachabatchrange results: " << std::chrono::duration_cast<std::chrono::milliseconds>(chacha_finish - chacha_start).count() << " ms\n";
-	//gpu_get_max_counts_from_counter_list<<<1,1>>>(xchachas_bucket_counts, CHACHA_NUM_BUCKETS, true);
-	auto chacha_rs_start = std::chrono::high_resolution_clock::now();
-	for (uint chacha_bucket_id=0;chacha_bucket_id<CHACHA_NUM_BUCKETS;chacha_bucket_id++) {
-		gpu_chacha8_filter_rxs_from_bucket_batch<<<numBlocks, blockSize>>>(
-			calc_N,
-			&xchachas[chacha_bucket_id],
-			kbc_Ly_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET,
-			rx_match_list, rx_match_count);
-		//CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	}
-
-	/*
-	blockSize = 128; // # of threads per block, maximum is 1024.
- calc_N = UINT_MAX/CHACHA_NUM_BATCHES; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - numBlocks = calc_numBlocks; - - - std::cout << "Doing " << NUM_KBC_RANGE_BATCHES << " range batches of gpu_chacha_filter_rxs" << std::endl; - for (int kbc_range_batch=0;kbc_range_batch < NUM_KBC_RANGE_BATCHES; kbc_range_batch++) { - const uint32_t KBC_MIN_RANGE = ((kbc_range_batch+0) * 18188177) / (NUM_KBC_RANGE_BATCHES); - const uint32_t KBC_MAX_RANGE = ((kbc_range_batch+1) * 18188177) / (NUM_KBC_RANGE_BATCHES); - std::cout << "range KBC_MIN: " << KBC_MIN_RANGE << " - " << KBC_MAX_RANGE << std::endl; - gpu_chacha8_filter_rxs<<>>(calc_N, chacha_input, - kbc_Ly_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET, - rx_match_list, rx_match_count, - KBC_MIN_RANGE, KBC_MAX_RANGE); - } -*/ - - //calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize); numBlocks = calc_numBlocks; - //gpu_chacha8_tag_rxs_from_chacha<<>>(calc_N, chacha_input, - // kbc_Ly_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET, - // chachas); - //gpu_chacha8_filter_rxs_from_chacha<<>>(calc_N,chachas,rx_match_list,rx_match_count); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto chacha_rs_finish = std::chrono::high_resolution_clock::now(); - total_rx_ms += std::chrono::duration_cast(chacha_rs_finish - chacha_rs_start).count(); - std::cout << " chacha Rxs time: " << std::chrono::duration_cast(chacha_rs_finish - chacha_rs_start).count() << " ms\n"; - std::cout << " found " << rx_match_count[0] << " matches" << std::endl; - - - } - - - - - - - auto compute_only_finish = std::chrono::high_resolution_clock::now(); - - std::cout << "Freeing memory..." << std::endl; - CUDA_CHECK_RETURN(cudaFree(kbc_Ly_entries)); - CUDA_CHECK_RETURN(cudaFree(device_global_kbc_num_entries_L)); - - auto attack_finish = std::chrono::high_resolution_clock::now(); - std::cout << " total chachas time: " << total_chacha_ms << " ms\n"; - std::cout << " total Rxs time: " << total_rx_ms << " ms\n"; - std::cout << " compute only time: " << std::chrono::duration_cast(compute_only_finish - compute_only_start).count() << " ms\n"; - std::cout << " attack total time: " << std::chrono::duration_cast(attack_finish - attack_start).count() << " ms\n"; - std::cout << "end." << std::endl; -} - - - - - -#endif /* ATTACK_METHOD_LXS_HPP_ */ diff --git a/attack_method_lxs2.hpp b/attack_method_lxs2.hpp deleted file mode 100644 index a8529b5..0000000 --- a/attack_method_lxs2.hpp +++ /dev/null @@ -1,1766 +0,0 @@ -/* - * attack_method_lxs2.hpp - * - * Created on: Nov 8, 2021 - * Author: nick - */ - -#ifndef ATTACK_METHOD_LXS2_HPP_ -#define ATTACK_METHOD_LXS2_HPP_ - -//#include -//#include -//#include - - - -struct xchacha_pair { - uint32_t x; - uint32_t chacha; -}; - -// TODO: try increasing the buckets as we go down the iterations -// suspect we can benefit more from cache when flipping back and forth vs the chacha generation -// which likely eats a lot of the cache? Or I had a huge bug somewhere. 
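-// Added sanity check (not in the original source): the sort kernels below move
-// these pairs around constantly, so it matters that they pack to exactly 8 bytes,
-// and that a phase-1 bucket holds 16x a phase-2 bucket (and phase-2 16x phase-3),
-// since each pass fans 4 more chacha bits out into 16 sub-buckets.
-static_assert(sizeof(xchacha_pair) == 8, "xchacha_pair should pack to 8 bytes");
-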
- -const uint32_t DUMBSORT_BUCKET_BITS = 4; -const uint32_t DUMBSORT_NUM_BUCKETS = 1 << DUMBSORT_BUCKET_BITS; -const uint32_t PHASE_3_DUMBSORT_MAX_PER_BUCKET = 42;//32; -const uint32_t PHASE_2_DUMBSORT_MAX_PER_BUCKET = 42*16;//512; -const uint32_t PHASE_1_DUMBSORT_MAX_PER_BUCKET = 42*16*16;//8192; // 8601 was largest found, using a multiple of 256 so going for 8704 -const uint32_t DUMBSORT_BATCHES_TILE_SPACE = PHASE_1_DUMBSORT_MAX_PER_BUCKET * DUMBSORT_NUM_BUCKETS; -const uint32_t GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK = 65536; -const uint32_t DUMBSORT_SPACE_NEEDED_FOR_SCRATCH = ((1 << (32-6)) / GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK) * DUMBSORT_BATCHES_TILE_SPACE; - - -//////////////////////////////////////////////////////////////////////////////// -// Monolithic bitonic sort kernel for short arrays fitting into shared memory -//////////////////////////////////////////////////////////////////////////////// -#include - -namespace cg = cooperative_groups; -#define SHARED_SIZE_LIMIT 1024U - -__device__ inline void Comparator( - uint &keyA, - uint &valA, - uint &keyB, - uint &valB, - uint dir -) -{ - uint t; - - if ((keyA > keyB) == dir) - { - t = keyA; - keyA = keyB; - keyB = t; - t = valA; - valA = valB; - valB = t; - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Monolithic Bacther's sort kernel for short arrays fitting into shared memory -//////////////////////////////////////////////////////////////////////////////// -__global__ void oddEvenMergeSortShared(uint32_t *chachas, uint32_t *out_chachas, uint32_t *out_xs) -{ - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - //Shared memory storage for one or more small vectors - __shared__ uint s_key[SHARED_SIZE_LIMIT]; - __shared__ uint s_val[SHARED_SIZE_LIMIT]; - - uint dir = 1; - uint arrayLength = 1024; - - //Offset to the beginning of subbatch and load data - chachas += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - out_chachas += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - out_xs += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + 0] = chachas[ 0]; - s_val[threadIdx.x + 0] = blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = chachas[(SHARED_SIZE_LIMIT / 2)]; - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2); - - for (uint size = 2; size <= arrayLength; size <<= 1) - { - uint stride = size / 2; - uint offset = threadIdx.x & (stride - 1); - - { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator( - s_key[pos + 0], s_val[pos + 0], - s_key[pos + stride], s_val[pos + stride], - dir - ); - stride >>= 1; - } - - for (; stride > 0; stride >>= 1) - { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - - if (offset >= stride) - Comparator( - s_key[pos - stride], s_val[pos - stride], - s_key[pos + 0], s_val[pos + 0], - dir - ); - } - } - - cg::sync(cta); - out_chachas[ 0] = s_key[threadIdx.x + 0]; - out_xs[ 0] = s_val[threadIdx.x + 0]; - out_chachas[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - out_xs[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - -} - -// threads must be SHARED_SIZE_LIMIT/2 -__global__ void nickSortShared(uint32_t *chachas, uint32_t *out_chachas, uint32_t *out_xs) -{ - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - //Shared memory storage for one or more short 
vectors - __shared__ uint order[SHARED_SIZE_LIMIT*2]; // we're going to use top 16 and bottom 16 to store indexes - __shared__ uint bucket_counts[1024]; - __shared__ uint s_key[SHARED_SIZE_LIMIT]; // the sort values - __shared__ uint s_val[SHARED_SIZE_LIMIT]; // stores the xs - __shared__ uint sorted_val[SHARED_SIZE_LIMIT]; - __shared__ uint sorted_key[SHARED_SIZE_LIMIT]; - - uint dir = 1; - uint arrayLength = 1024; - - //Offset to the beginning of subbatch and load data - chachas += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - out_chachas += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - out_xs += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - uint32_t chacha = chachas[0]; - uint16_t index = threadIdx.x; - bucket_counts[threadIdx.x] = 0; - order[threadIdx.x] = 0; - order[threadIdx.x + 1024] = 0; - //order[threadIdx.x*2+1] = 0; - s_key[threadIdx.x + 0] = chachas[ 0]; - s_val[threadIdx.x + 0] = blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - - cg::sync(cta); - uint16_t bucket_id = chacha >> (32 - 10); - int add = atomicAdd(&bucket_counts[bucket_id],1); - if (add < 4) { - uint pos = bucket_id * 2 + add; - uint value = index << ((pos & 0b01)*16); - atomicAdd(&order[pos], value); - } - // from [ 1 3 2 0 0 1 0 2 ] - // to> [ 0 1 4 6 6 6 7 7 ] - // then each thread, reads its scan offset, and that's the shared start + the counts to copy into global memory - // [ 1 3 2 0 0 1 0 2 ] - // [ 0 1 3 5 5 0 1 0 ] - // [ 0 1 4 6 - - - // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda - - - if (threadIdx.x == 0) { - printf("buckets counts:\n"); - for (int i=0;i>1; d > 0; d >>= 1) // build sum in place up the tree - { - __syncthreads(); - if (thid < d) { - int ai = offset*(2*thid+1)-1; - int bi = offset*(2*thid+2)-1; - temp[bi] += temp[ai]; - } - offset *= 2; - } - - if (thid == 0) { temp[n - 1] = 0; } // clear the last element - for (int d = 1; d < n; d *= 2) // traverse down tree & build scan - { - offset >>= 1; - __syncthreads(); - if (thid < d) { - int ai = offset*(2*thid+1)-1; - int bi = offset*(2*thid+2)-1; - float t = temp[ai]; temp[ai] = temp[bi]; temp[bi] += t; - } - } - __syncthreads(); - - g_odata[2*thid] = temp[2*thid]; - // write results to device memory - g_odata[2*thid+1] = temp[2*thid+1]; -} - -// threads must be SHARED_SIZE_LIMIT/2 -__global__ void bitonicSortShared(uint32_t *chachas, uint32_t *out_chachas, uint32_t *out_xs) -{ - // Handle to thread block group - cg::thread_block cta = cg::this_thread_block(); - //Shared memory storage for one or more short vectors - __shared__ uint s_key[SHARED_SIZE_LIMIT]; // the sort values - __shared__ uint s_val[SHARED_SIZE_LIMIT]; // stores the xs - - - uint dir = 1; - uint arrayLength = 1024; - - //Offset to the beginning of subbatch and load data - chachas += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - out_chachas += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - out_xs += blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + 0] = chachas[ 0]; - s_val[threadIdx.x + 0] = blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x; - s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = chachas[(SHARED_SIZE_LIMIT / 2)]; - s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = blockIdx.x * SHARED_SIZE_LIMIT + threadIdx.x + (SHARED_SIZE_LIMIT / 2); - - //__syncthreads(); - //if (threadIdx.x == 0) { - // printf("doing bitonic sort, start list: \n"); - // for (int i=0;i 0; stride >>= 1) - { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator( - s_key[pos + 0], 
s_val[pos + 0], - s_key[pos + stride], s_val[pos + stride], - ddd - ); - } - } - - //ddd == dir for the last bitonic merge step - { - for (uint stride = arrayLength / 2; stride > 0; stride >>= 1) - { - cg::sync(cta); - uint pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1)); - Comparator( - s_key[pos + 0], s_val[pos + 0], - s_key[pos + stride], s_val[pos + stride], - dir - ); - } - } - - cg::sync(cta); - - // should be sorted now. - out_chachas[ 0] = s_key[threadIdx.x + 0]; - out_xs[ 0] = s_val[threadIdx.x + 0]; - out_chachas[(SHARED_SIZE_LIMIT / 2)] = s_key[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - out_xs[(SHARED_SIZE_LIMIT / 2)] = s_val[threadIdx.x + (SHARED_SIZE_LIMIT / 2)]; - - //__syncthreads(); - //if (threadIdx.x == 0) { - // printf("results sort:\n"); - // for (int i=0;i= NUM_BUCKETS) printf("BUCKET OUT OF RANGE ERROR: %u", bucket_id); - - int slot = atomicAdd(&buffer_counts[bucket_id],1); - if (slot > PHASE_1_DUMBSORT_MAX_PER_BUCKET) printf("PHASE 1 DUMBSORT OVERFLOW: %u\n", slot); - - uint32_t results_address = GLOBAL_TILE_START + bucket_id * PHASE_1_DUMBSORT_MAX_PER_BUCKET + slot; - if (results_address < DUMBSORT_SPACE_NEEDED_FOR_SCRATCH) { - results[results_address] = entry; - } else { - printf("results address overflow %u - global start pos: %u bucket %u slot %u DUMBSORT_SPACE_NEEDED_FOR_SCRATCH: %u\n", - results_address, GLOBAL_TILE_START, bucket_id, slot, DUMBSORT_SPACE_NEEDED_FOR_SCRATCH); - } - } - - __syncthreads(); - //if (threadIdx.x == 0) { - // printf("end phase 1, buffer counts:\n"); - // for (int i=0;i PHASE_2_DUMBSORT_MAX_PER_BUCKET) printf("PHASE 2 DUMBSORT OVERFLOW: %u\n", slot); - - uint32_t results_address2 = SUB_2_TILE_START + local_bucket_id * PHASE_2_DUMBSORT_MAX_PER_BUCKET + slot; - results2[results_address2] = entry; - } - } - - __syncthreads(); - //if (threadIdx.x == 0) { - // printf("end phase 2-%u, buffer counts:\n",read_bucket_id); - // for (int i=0;i PHASE_3_DUMBSORT_MAX_PER_BUCKET) printf("PHASE 3 DUMBSORT OVERFLOW: %u\n", slot); - - uint32_t results_address3 = SUB_3_TILE_START + local_bucket_id_3 * PHASE_3_DUMBSORT_MAX_PER_BUCKET + slot; - results[results_address3] = entry; - } - } - - __syncthreads(); - - //if (threadIdx.x == 0) { - // printf("end phase 3-2:[%u]-1[%u], buffer counts:\n",read_bucket_id_phase2,read_bucket_id); - // for (int i=0;i= NUM_BUCKETS) printf("BUCKET OUT OF RANGE ERROR: %u", bucket_id); - - int slot = atomicAdd(&buffer_counts[bucket_id],1); - uint32_t results_address = global_bucket_start_pos + bucket_id * GLOBAL_BUCKET_MAX_ENTRIES + slot; - if (results_address < 134217728) { - results[results_address] = entry; - } else { - printf("results address overflow %u - global start pos: %u bucket %u slot %u globalmaxentries: %u\n", - results_address, global_bucket_start_pos, bucket_id, slot, GLOBAL_BUCKET_MAX_ENTRIES); - } - //__syncthreads(); // holy fuck a sync threads increases from 50ms to 85!!!!! That's why! 
- //for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) { - // atomicAdd(&results_counts[i], buffer_counts[i]); - // buffer_counts[i] = 0; - //} - //__syncthreads(); - } - __syncthreads(); - for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) { - atomicAdd(&results_counts[i], buffer_counts[i]+1); - if (buffer_counts[i] > PHASE_1_DUMBSORT_MAX_PER_BUCKET) - printf("BUFFER OVERFLOW: %u was over max per bucket\n",buffer_counts[i], PHASE_1_DUMBSORT_MAX_PER_BUCKET); - } - } -} - - -__global__ -void gpu_write_chachas_into_buckets_with_single_row_depthflush( - const uint32_t NUM_PER_BLOCK, const uint32_t N, uint32_t *chachas, - uint32_t const MAX_TOTAL_GROUPED_ENTRIES, xchacha_pair *results, unsigned int *results_counts) -{ - // note num threads should be equal or higher than NUM_BUCKETS - // 256 has max depth of 23, 512 has max depth of 11. need keep some space for other variables. - - // good settings: NUM_BUCKETS 512, BUCKET DEPTH 11, FLUSH DEPTH 6 (15ms) - // 256, 22, 12 (11ms) - // the bigger the span betwen flush depth and bucket depth, the less likely hashes will overflow before the rest can fill up. - const uint32_t BUCKET_BITS = 5; - const uint32_t FLUSH_DEPTH = 128; - const uint32_t BUCKET_DEPTH = FLUSH_DEPTH+32; // give some room for overflow - careful too much and it slows down! - // I tried with a buffer overflow instead of padding...but...it performed slightly slower and that's without - // moving the buckets back in. seems like loops on threads not being perfect multiples when writing is more - // forgiving than though? Can try again. - - const uint32_t NUM_BUCKETS = 1 << BUCKET_BITS; - const uint32_t BUCKET_DIVISOR = 1 << (32-BUCKET_BITS); // 32bit chacha into 8 bit NUM buckets - const uint32_t GLOBAL_BUCKET_MAX_ENTRIES = MAX_TOTAL_GROUPED_ENTRIES / NUM_BUCKETS; - - __shared__ int buffer_counts[NUM_BUCKETS]; - __shared__ int global_counts[NUM_BUCKETS]; - __shared__ uint32_t chachas_buffer[NUM_BUCKETS*BUCKET_DEPTH]; - __shared__ uint16_t xs_buffer[NUM_BUCKETS*BUCKET_DEPTH]; // 4 entries per bucket - __shared__ int num_ready; - __shared__ int batch_id; - __shared__ int bucket_to_flush; - - // 49152 bytes total shared memory = 384 chunks of 128 bytes. Means we can use 384 buckets to fill shared memory. - // let's try first with 256 buckets. - //__shared__ int flush; - - const uint32_t NUM_THREADS = blockDim.x; - const uint32_t NUM_BATCHES_OF_THREADS = NUM_PER_BLOCK / NUM_THREADS; // note num per block must be multiple of num threads - //if ((NUM_PER_BLOCK % NUM_THREADS) > 0) printf("CONFIG ERROR: NUM PER BLOCK MUST BE MULTIPLE OF NUM THREADS\n"); - - uint32_t x_group = blockIdx.x; - uint32_t x_start = x_group * NUM_PER_BLOCK; - - if (x_start < N) { - if (threadIdx.x == 0) { - num_ready = 0; - batch_id = 0; - } - // make sure all values start right! 
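-		// (added) buffer_counts/global_counts accumulate via atomicAdd across the
-		// batches below, so the whole block zeroes them before the first batch: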
- for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) {
- buffer_counts[i] = 0;
- global_counts[i] = 0;
- }
- __syncthreads();
-
- // go through each batch of data
- while (batch_id < NUM_BATCHES_OF_THREADS) {
- while ((num_ready == 0) && (batch_id < NUM_BATCHES_OF_THREADS)) {
- // thread is of course threadIdx.x
- uint32_t x = x_start + batch_id * NUM_THREADS + threadIdx.x;
- uint32_t chacha = chachas[x];
-
- //if (threadIdx.x == 0) {
- // printf("BATCH_ID %u of %u - x starts: %u num_ready: %u\n",batch_id, NUM_BATCHES_OF_THREADS, x, num_ready);
- //}
- __syncthreads();
-
- uint32_t bucket_id = chacha / BUCKET_DIVISOR;
- uint32_t slot = atomicAdd(&buffer_counts[bucket_id], 1);
- uint32_t address = bucket_id * BUCKET_DEPTH + slot;
-
- //printf(" xchacha pair x:%u chacha:%u into bucket:%u slot:%u \n", x, chachas[x], bucket_id, slot);
- if (address >= NUM_BUCKETS*BUCKET_DEPTH) {
- printf("ERROR ADDRESS %u -- batch: %u bucket_id: %u slot: %u\n", address, batch_id, bucket_id, slot);
- } else {
- //xchacha_pair entry = { x, chacha };
- chachas_buffer[address] = chacha;
- xs_buffer[address] = x;
- }
-
- if (slot == (FLUSH_DEPTH-1)) {
- atomicAdd(&num_ready, 1);
- bucket_to_flush = bucket_id; // doesn't matter if this gets overwritten by another thread
- // point is we want to get the first bucket, and if there are more we fetch them from the list.
- //printf("-> bucket %u slot is FLUSH ready, incremented num_ready counter to %u\n", bucket_id, num_ready);
- }
-
- __syncthreads();
- if (threadIdx.x == 0) {
- //for (int i=0;i<NUM_BUCKETS;i++) printf("bucket %u count: %u\n", i, buffer_counts[i]);
- batch_id += 1;
- }
- __syncthreads();
- }
- if (num_ready > 0) {
- // flush those ready
- const int num_to_flush = buffer_counts[bucket_to_flush];
- if (threadIdx.x == 0) {
- global_counts[bucket_to_flush] += num_to_flush;
- //global_counts[bucket_to_flush] = atomicAdd(&results_counts[bucket_to_flush],num_to_flush);
- // printf("FLUSHING! %u buckets are ready, flushing bucket %u\n", num_ready, bucket_to_flush);
- }
-
- __syncthreads();
-
- for (int i=threadIdx.x;i 0) {
- // find next bucket to flush! doesn't matter if multiple threads overwrite,
- // just want one of them
- for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) {
- if (buffer_counts[i] >= FLUSH_DEPTH)
- bucket_to_flush = i;
- }
- }
- }
-
- __syncthreads();
-
- }
- if (batch_id == NUM_BATCHES_OF_THREADS) {
- // we finished entering all our data, now check left-over buckets.
- // TODO: check each bucket count and write out data to global.
- //if (threadIdx.x == 0) {
- // printf("BATCHES COMPLETED: todo finish flushing rest of buffers\n");
- //}
- for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) {
- if (buffer_counts[i] > 0) atomicAdd(&results_counts[i], buffer_counts[i]);
- }
-
- }
- }
-}
-
-__global__
-void gpu_write_chachas_into_buckets_with_single_row_depthflush_ORIG(
- const uint32_t NUM_PER_BLOCK, const uint32_t N, uint32_t *chachas,
- uint32_t const MAX_TOTAL_GROUPED_ENTRIES, xchacha_pair *results, unsigned int *results_counts)
-{
- // note num threads should be equal or higher than NUM_BUCKETS
- // 256 has max depth of 23, 512 has max depth of 11. need to keep some space for other variables.
-
- // good settings: NUM_BUCKETS 512, BUCKET DEPTH 11, FLUSH DEPTH 6 (15ms)
- // 256, 22, 12 (11ms)
- // the bigger the span between flush depth and bucket depth, the less likely hashes will overflow before the rest can fill up.
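- // Compile-time sanity check for these depth settings (a sketch with hypothetical
- // k-prefixed names, not original code): the staging buffer of NUM_BUCKETS * BUCKET_DEPTH
- // xchacha_pairs has to fit the default 48KB of shared memory, and the headroom past
- // FLUSH_DEPTH has to absorb writes that land between a bucket tripping the flush and
- // the block synchronizing.
- constexpr uint32_t kFlushDepth = 128;
- constexpr uint32_t kHeadroom = 32;
- constexpr uint32_t kNumBuckets = 1u << 5;
- static_assert(kNumBuckets * (kFlushDepth + kHeadroom) * sizeof(xchacha_pair) <= 49152,
- "per-block staging buffer must fit in 48KB of shared memory");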
- const uint32_t BUCKET_BITS = 5; - const uint32_t FLUSH_DEPTH = 128; - const uint32_t BUCKET_DEPTH = FLUSH_DEPTH+32; // give some room for overflow - - const uint32_t NUM_BUCKETS = 1 << BUCKET_BITS; - const uint32_t BUCKET_DIVISOR = 1 << (32-BUCKET_BITS); // 32bit chacha into 8 bit NUM buckets - const uint32_t GLOBAL_BUCKET_MAX_ENTRIES = MAX_TOTAL_GROUPED_ENTRIES / NUM_BUCKETS; - - __shared__ int buffer_counts[NUM_BUCKETS]; - __shared__ int global_counts[NUM_BUCKETS]; - __shared__ xchacha_pair buffer[NUM_BUCKETS*BUCKET_DEPTH]; // 4 entries per bucket - __shared__ int num_ready; - __shared__ int batch_id; - __shared__ int bucket_to_flush; - - // 49152 bytes total shared memory = 384 chunks of 128 bytes. Means we can use 384 buckets to fill shared memory. - // let's try first with 256 buckets. - //__shared__ int flush; - - const uint32_t NUM_THREADS = blockDim.x; - const uint32_t NUM_BATCHES_OF_THREADS = NUM_PER_BLOCK / NUM_THREADS; // note num per block must be multiple of num threads - //if ((NUM_PER_BLOCK % NUM_THREADS) > 0) printf("CONFIG ERROR: NUM PER BLOCK MUST BE MULTIPLE OF NUM THREADS\n"); - - uint32_t x_group = blockIdx.x; - uint32_t x_start = x_group * NUM_PER_BLOCK; - - if (x_start < N) { - if (threadIdx.x == 0) { - num_ready = 0; - batch_id = 0; - } - // make sure all values start right! - for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) { - buffer_counts[i] = 0; - global_counts[i] = 0; - } - __syncthreads(); - - // go through each batch of data - while (batch_id < NUM_BATCHES_OF_THREADS) { - while ((num_ready == 0) && (batch_id < NUM_BATCHES_OF_THREADS)) { - // thread is of course threadIdx.x - uint32_t x = x_start + batch_id * NUM_THREADS + threadIdx.x; - uint32_t chacha = chachas[x]; - - //if (threadIdx.x == 0) { - // printf("BATCH_ID %u of %u - x starts: %u num_ready: %u\n",batch_id, NUM_BATCHES_OF_THREADS, x, num_ready); - //} - __syncthreads(); - - uint32_t bucket_id = chacha / BUCKET_DIVISOR; - uint32_t slot = atomicAdd(&buffer_counts[bucket_id], 1); - uint32_t address = bucket_id * BUCKET_DEPTH + slot; - - //printf(" xchacha pair x:%u chacha:%u into bucket:%u slot:%u \n", x, chachas[x], bucket_id, slot); - if (address > NUM_BUCKETS*BUCKET_DEPTH) { - printf("ERROR ADDRESS %u -- batch: %u bucket_id: %u slot: %u\n", address, batch_id, bucket_id, slot); - } else { - xchacha_pair entry = { x, chacha }; - buffer[address] = entry; - } - - if (slot == (FLUSH_DEPTH-1)) { - atomicAdd(&num_ready, 1); - bucket_to_flush = bucket_id; // doesn't matter if this gets overwritten by another thread - // point is we want to get first bucket and if there is more we fetch it from list. - //printf("-> bucket %u slot is FLUSH ready, incremented num_ready counter to %u\n", bucket_id, num_ready); - } - - __syncthreads(); - if (threadIdx.x == 0) { - //for (int i=0;i 0) { - // flush those ready - const int num_to_flush = buffer_counts[bucket_to_flush]; - if (threadIdx.x == 0) { - global_counts[bucket_to_flush] += num_to_flush; - //global_counts[bucket_to_flush] = atomicAdd(&results_counts[bucket_to_flush],num_to_flush); - // printf("FLUSHING! %u buckets are ready, flushing bucket %u\n", num_ready, bucket_to_flush); - } - - __syncthreads(); - - for (int i=threadIdx.x;i 0) { - // find next bucket to flush! doesn't matter if multiple threads overwrite, - // just want one of them - for (int i=threadIdx.x;i= FLUSH_DEPTH) - bucket_to_flush = i; - } - } - } - - __syncthreads(); - - } - if (batch_id == NUM_BATCHES_OF_THREADS) { - // we finished entering all our data, now check left-over buckets. 
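- // One possible shape for that leftover drain (a sketch under this kernel's names,
- // not the author's implementation - see also the TODO just below): every bucket
- // still holding entries below FLUSH_DEPTH goes straight out to its global region.
- //
- // for (int b = 0; b < NUM_BUCKETS; b++) {
- //     const int remaining = buffer_counts[b];
- //     for (int i = threadIdx.x; i < remaining; i += blockDim.x) {
- //         uint32_t dst = b * GLOBAL_BUCKET_MAX_ENTRIES + global_counts[b] + i;
- //         results[dst] = buffer[b * BUCKET_DEPTH + i];
- //     }
- // }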
- // TODO: check each bucket count and write out data to global. - //if (threadIdx.x == 0) { - // printf("BATCHES COMPLETED: todo finish flushing rest of buffers\n"); - //} - for (int i=threadIdx.x;i 0) atomicAdd(&results_counts[i], buffer_counts[i]); - } - - } - } -} - - - -__global__ -void gpu_write_chachas_into_buckets_with_buffer_batches( - const uint32_t NUM_PER_BLOCK, const uint32_t N, uint32_t *chachas, - uint32_t const MAX_PER_RESULTS_BUCKET, xchacha_pair *results, unsigned int *results_counts) -{ - // note num threads should be equal or higher than NUM_BUCKETS - // 256 has max depth of 23, 512 has max depth of 11. need keep some space for other variables. - - // good settings: NUM_BUCKETS 512, BUCKET DEPTH 11, FLUSH DEPTH 6 (15ms) - // 256, 22, 12 (11ms) - // the bigger the span betwen flush depth and bucket depth, the less likely hashes will overflow before the rest can fill up. - const uint32_t NUM_BUCKETS = 32; - const uint32_t BUCKET_DIVISOR = 1 << (32-5); // 32bit chacha into 8 bit NUM buckets - const uint32_t BUCKET_DEPTH = 128; // *should* be able to set this freely, as the active window should modulo effectively. - const uint32_t FLUSH_DEPTH = 32; // cache does best with a flush depth of 8, but even 6 is ok, 4 is 1st benefit jump. - - __shared__ int buffer_counts[NUM_BUCKETS]; - __shared__ int global_counts[NUM_BUCKETS]; - __shared__ xchacha_pair buffer[NUM_BUCKETS*BUCKET_DEPTH]; // 4 entries per bucket - __shared__ int num_ready; - __shared__ int active_buffer_pos; // this is the moving position/window in the buffer - __shared__ int eviction_needed; - __shared__ uint32_t batch_id; - - // 49152 bytes total shared memory = 384 chunks of 128 bytes. Means we can use 384 buckets to fill shared memory. - // let's try first with 256 buckets. - //__shared__ int flush; - - const uint32_t NUM_THREADS = blockDim.x; - const uint32_t NUM_BATCHES_OF_THREADS = NUM_PER_BLOCK / NUM_THREADS; // note num per block must be multiple of num threads - //if ((NUM_PER_BLOCK % NUM_THREADS) > 0) printf("CONFIG ERROR: NUM PER BLOCK MUST BE MULTIPLE OF NUM THREADS\n"); - - uint32_t x_group = blockIdx.x; - uint32_t x_start = x_group * NUM_PER_BLOCK; - - if (x_start < N) { - if (threadIdx.x == 0) { - num_ready = 0; - active_buffer_pos = 0; - eviction_needed = 0; - batch_id = 0; - } - // make sure all values start right! 
- for (int i=threadIdx.x;i < NUM_BUCKETS;i+=blockDim.x) { - buffer_counts[i] = 0; - global_counts[i] = 0; - } - __syncthreads(); - - // go through each batch of data - while (batch_id < NUM_BATCHES_OF_THREADS) { - while ((num_ready < NUM_BUCKETS) && (batch_id < NUM_BATCHES_OF_THREADS) && (eviction_needed == 0)) { - // thread is of course threadIdx.x - uint32_t x = x_start + batch_id * NUM_THREADS + threadIdx.x; - uint32_t chacha = chachas[x]; - - //if (threadIdx.x == 0) { - // printf("BATCH_ID %u of %u - x starts: %u num_ready: %u\n",batch_id, NUM_BATCHES_OF_THREADS, x, num_ready); - //} - //__syncthreads(); - - uint32_t bucket_id = chacha / BUCKET_DIVISOR; - uint32_t slot = atomicAdd(&buffer_counts[bucket_id], 1); - uint32_t address = bucket_id * BUCKET_DEPTH + (slot + active_buffer_pos) % BUCKET_DEPTH; - - //printf(" xchacha pair x:%u chacha:%u into bucket:%u slot:%u \n", x, chachas[x], bucket_id, slot); - - if (address > NUM_BUCKETS*BUCKET_DEPTH) { - printf("ERROR ADDRESS %u -- batch: %u bucket_id: %u slot: %u\n", address, batch_id, bucket_id, slot); - } else { - xchacha_pair entry = { x, chacha }; - buffer[address] = entry; - } - - if (slot == (FLUSH_DEPTH-1)) { - atomicAdd(&num_ready, 1); - //printf("-> bucket %u slot is FLUSH ready, incremented num_ready counter to %u\n", bucket_id, num_ready); - } else if (slot == (BUCKET_DEPTH-1)) { - // one bucket got full, so it's time to evict all - atomicAdd(&eviction_needed, 1); // atomic not really necessary - //printf("-> bucket %u slot reached max bucket depth %u, set eviction needed to %u\n", bucket_id, BUCKET_DEPTH-1, eviction_needed); - } - - __syncthreads(); - if (threadIdx.x == 0) { - //for (int i=0;i 256 * MAX_PER_RESULTS_BUCKET) { - printf("global address out of bounds bucket_id: %u global_pos:%u\n", bucket_id_for_thread, global_pos); - } else { - //printf("global address bucket_id: %u global_pos:%u\n", bucket_id_for_thread, global_pos); - results[global_address] = entry; - } - } - - __syncthreads(); - - if (threadIdx.x == 0) { - // switch active buffer position now - active_buffer_pos = (active_buffer_pos + FLUSH_DEPTH) % BUCKET_DEPTH; - //printf(" - active_buffer_pos now set to %u\n", active_buffer_pos); - //for (int i=0;i 0) { - if (threadIdx.x == 0) { - //printf("HANDLE EVICTION CASE\n"); - for (int i=0;i= BUCKET_DEPTH) { - eviction_needed = i; - num_ready = num_ready - 1; - buffer_counts[i] = 0; // okay, kind of a bug b/c if more than one eviction then we lose entries - // but for now it's just to test performance. - } - } - } - __syncthreads(); - - for (int i=threadIdx.x;i> 26); - uint16_t kbc_y = y % kBC; - uint32_t kbc_bucket_id = y / kBC; - //printf("x: %u kbc: %u\n", x, kbc_bucket_id); - unsigned int kbc_shift = kbc_bucket_id % 32; - unsigned int kbc_add_slot = 1 << kbc_shift; - unsigned int value = atomicAdd(&kbc_counts[kbc_bucket_id/32], kbc_add_slot); - unsigned int slot = (value >> kbc_shift) & 31; - //kbc_counts[kbc_bucket_id/32] = slot+1; - // THE ATOMIC ADDS ARE THE PROBLEM! 
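- // Why the packing above misbehaves: with kbc_shift = kbc_bucket_id % 32 each bucket
- // only owns one bit of the counter word, so the "& 31" reads five bits that belong
- // to neighbouring buckets, and a second increment carries into the next bucket's bit.
- // A packed counter needs a fixed-width field per bucket. Sketch of a 4-bit-per-bucket
- // version (hypothetical helper, not in the original; the KBC_MASK_* macros in
- // attack_method_xpairbits.hpp below implement this same layout):
- //
- // __device__ unsigned int packed_counter_add(unsigned int *counts, uint32_t bucket) {
- //     const unsigned int word = bucket / 8;        // 8 four-bit fields per 32-bit word
- //     const unsigned int shift = 4 * (bucket % 8); // offset of this bucket's field
- //     unsigned int old = atomicAdd(&counts[word], 1u << shift);
- //     return (old >> shift) & 0xF;                 // previous value of the field = slot
- // }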
- //unsigned int slot = atomicAdd(&kbc_counts[kbc_bucket_id % (32768*32)],1);// = slot+1; - out_kbc_ys[kbc_bucket_id * 32 + slot] = kbc_y; - out_kbc_xs[kbc_bucket_id * 32 + slot] = x; - } -} - -__global__ void gpu_get_max_counts_from_counter_list(unsigned int *kbc_counts, const int NUM) { - __shared__ unsigned int max_kbc_count; - __shared__ unsigned int sum_kbc_count; - if (threadIdx.x == 0) { - max_kbc_count = 0; - sum_kbc_count = 0; - } - __syncthreads(); - for (int i=threadIdx.x;i 150) printf("kbc: %u count: %u\n", i, kbc_count); - atomicMax(&max_kbc_count, kbc_count); - atomicAdd(&sum_kbc_count, kbc_count); - } - if (threadIdx.x == 0) printf("counter list counts SUM:%u MAX:%u\n", sum_kbc_count, max_kbc_count); -} - -__global__ void gpu_show_chachas(const uint32_t N, const uint32_t step, uint32_t *chachas) { - for (int i=0;i> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS(datax[j+0],0);ATTACK_WRITE_CHACHAS(datax[j+1],1);ATTACK_WRITE_CHACHAS(datax[j+2],2);ATTACK_WRITE_CHACHAS(datax[j+3],3); - ATTACK_WRITE_CHACHAS(datax[j+4],4);ATTACK_WRITE_CHACHAS(datax[j+5],5);ATTACK_WRITE_CHACHAS(datax[j+6],6);ATTACK_WRITE_CHACHAS(datax[j+7],7); - ATTACK_WRITE_CHACHAS(datax[j+8],8);ATTACK_WRITE_CHACHAS(datax[j+9],9);ATTACK_WRITE_CHACHAS(datax[j+10],10);ATTACK_WRITE_CHACHAS(datax[j+11],11); - ATTACK_WRITE_CHACHAS(datax[j+12],12);ATTACK_WRITE_CHACHAS(datax[j+13],13);ATTACK_WRITE_CHACHAS(datax[j+14],14);ATTACK_WRITE_CHACHAS(datax[j+15],15); - } - - __syncthreads(); - for (int i=threadIdx.x;i> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], 
datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS_COALESCED(datax[j+0],0);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+1],1);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+2],2);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+3],3); - ATTACK_WRITE_CHACHAS_COALESCED(datax[j+4],4);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+5],5);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+6],6);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+7],7); - ATTACK_WRITE_CHACHAS_COALESCED(datax[j+8],8);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+9],9);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+10],10);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+11],11); - ATTACK_WRITE_CHACHAS_COALESCED(datax[j+12],12);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+13],13);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+14],14);ATTACK_WRITE_CHACHAS_COALESCED(datax[j+15],15); - } -} - -// run with 128 blocksize, more doesn't matter. -__global__ -void gpu_chacha8_k32_write_chachas32(const uint32_t N, const uint32_t X_START, - const __restrict__ uint32_t *input, - uint32_t *chachas) -{ - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - //__shared__ uint32_t datax[33*256]; // each thread (256 max) gets its own shared access starting at 32 byte boundary. - //uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - __shared__ uint32_t shared_chachas[128*32]; // *possibly* using 32 to prevent some bank conflicts can help, but don't thing so. 
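- // Why stage through shared memory at all: each thread here produces 32 chacha words
- // for a strided x-range, so direct global stores would scatter badly. 128 threads
- // * 32 words * 4 bytes = 16KB staged in shared, then written back out with a
- // "for (int i = threadIdx.x; ...; i += blockDim.x)" copy so that consecutive threads
- // hit consecutive global addresses. A minimal sketch of that write-back, assuming
- // the copy is a straight linear dump of the staged block (illustrative, not the
- // original loop body):
- //
- // __syncthreads(); // make all 4096 staged words visible to the whole block
- // for (int i = threadIdx.x; i < 128 * 32; i += blockDim.x) {
- //     chachas[base_x + i] = shared_chachas[i];
- // }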
- - if (blockDim.x > 128) printf("MUST HAVE BLOCKSIZE 128 (RECOMMENDED) OR LESS, OR INCREASED SHARED MEM TO MORE\n"); - - uint32_t base_group = blockIdx.x * blockDim.x; - uint32_t base_x = base_group * 32; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 32; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - const int j = 0; - if (x_group < end_n) { - uint32_t pos = x_group * 2 + X_START/16; - //printf("x group pos = %u\n", pos); - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS32(datax[j+0],0);ATTACK_WRITE_CHACHAS32(datax[j+1],1);ATTACK_WRITE_CHACHAS32(datax[j+2],2);ATTACK_WRITE_CHACHAS32(datax[j+3],3); - ATTACK_WRITE_CHACHAS32(datax[j+4],4);ATTACK_WRITE_CHACHAS32(datax[j+5],5);ATTACK_WRITE_CHACHAS32(datax[j+6],6);ATTACK_WRITE_CHACHAS32(datax[j+7],7); - ATTACK_WRITE_CHACHAS32(datax[j+8],8);ATTACK_WRITE_CHACHAS32(datax[j+9],9);ATTACK_WRITE_CHACHAS32(datax[j+10],10);ATTACK_WRITE_CHACHAS32(datax[j+11],11); - ATTACK_WRITE_CHACHAS32(datax[j+12],12);ATTACK_WRITE_CHACHAS32(datax[j+13],13);ATTACK_WRITE_CHACHAS32(datax[j+14],14);ATTACK_WRITE_CHACHAS32(datax[j+15],15); - - pos += 1; - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // 
pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS32(datax[j+0],16+0);ATTACK_WRITE_CHACHAS32(datax[j+1],16+1);ATTACK_WRITE_CHACHAS32(datax[j+2],16+2);ATTACK_WRITE_CHACHAS32(datax[j+3],16+3); - ATTACK_WRITE_CHACHAS32(datax[j+4],16+4);ATTACK_WRITE_CHACHAS32(datax[j+5],16+5);ATTACK_WRITE_CHACHAS32(datax[j+6],16+6);ATTACK_WRITE_CHACHAS32(datax[j+7],16+7); - ATTACK_WRITE_CHACHAS32(datax[j+8],16+8);ATTACK_WRITE_CHACHAS32(datax[j+9],16+9);ATTACK_WRITE_CHACHAS32(datax[j+10],16+10);ATTACK_WRITE_CHACHAS32(datax[j+11],16+11); - ATTACK_WRITE_CHACHAS32(datax[j+12],16+12);ATTACK_WRITE_CHACHAS32(datax[j+13],16+13);ATTACK_WRITE_CHACHAS32(datax[j+14],16+14);ATTACK_WRITE_CHACHAS32(datax[j+15],16+15); - - } - - __syncthreads(); - for (int i=threadIdx.x;i 128) printf("MUST HAVE BLOCKSIZE 128 (RECOMMENDED) OR LESS, OR INCREASED SHARED MEM TO MORE\n"); - - uint32_t base_group = blockIdx.x * blockDim.x; - uint32_t base_x = base_group * 32; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 32; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - const int j = 0; - if (x_group < end_n) { - for (int i=threadIdx.x;i<32;i+=blockDim.x) { - counts[i] = 0; - } - - uint32_t pos = x_group * 2 + X_START/16; - //printf("x group pos = %u\n", pos); - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], 
datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+0],0);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+1],1);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+2],2);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+3],3); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+4],4);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+5],5);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+6],6);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+7],7); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+8],8);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+9],9);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+10],10);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+11],11); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+12],12);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+13],13);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+14],14);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+15],15); - - pos += 1; - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += 
input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+0],16+0);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+1],16+1);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+2],16+2);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+3],16+3); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+4],16+4);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+5],16+5);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+6],16+6);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+7],16+7); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+8],16+8);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+9],16+9);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+10],16+10);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+11],16+11); - ATTACK_WRITE_CHACHAS32_PAIR(datax[j+12],16+12);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+13],16+13);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+14],16+14);ATTACK_WRITE_CHACHAS32_PAIR(datax[j+15],16+15); - - } - - __syncthreads(); - const uint32_t TEST_BUCKET_BITS = 5; - const uint32_t TEST_MAX_PER_BUCKET = (1 << (32-TEST_BUCKET_BITS-6))*2; - for (int i=threadIdx.x;i> (32 - TEST_BUCKET_BITS); // 16 buckets - int slot = atomicAdd(&counts[bucket_id],1); - chachas_buckets[TEST_MAX_PER_BUCKET * bucket_id + base_x + slot] = shared_chachas[i]; - } -} - -__global__ -void gpu_filter_chachas( - const uint32_t NUM_PER_BLOCK, const uint32_t N, uint32_t *chachas, - xchacha_pair *results, xchacha_pair *results2) -{ - // highest performance bucket bits 4, with 1024 threads, num per block 65536. Then all blocks work with L2 cache? 
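- // Rough numbers behind that L2 guess (sketch arithmetic, not measured here): with
- // bucket bits 4, only 1/16th of entries pass the bucket_id == 0 filter below, so a
- // 65536-entry block writes about 65536/16 * sizeof(xchacha_pair) = 32KB while reading
- // a straight 65536 * 4 = 256KB sequential stream - small enough per block that many
- // blocks' working sets can plausibly coexist in L2.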
- const uint32_t NUM_BUCKETS = DUMBSORT_NUM_BUCKETS; - const uint32_t BUCKET_DIVISOR = 1 << (32-DUMBSORT_BUCKET_BITS); // 32bit chacha into 8 bit NUM buckets - const uint32_t NUM_THREADS = blockDim.x; - uint32_t NUM_BATCHES_OF_THREADS = NUM_PER_BLOCK / NUM_THREADS; // note num per block must be multiple of num threads - uint32_t x_group = blockIdx.x; - uint32_t x_start = x_group * NUM_PER_BLOCK; - const uint32_t GLOBAL_TILE_START = x_group * DUMBSORT_BATCHES_TILE_SPACE; - - __shared__ int filter_count; - - - if (x_start < N) { - //if (threadIdx.x == 0) { - // printf("x start: %u global_bucket_start_pos: %u vs before %u\n", x_start, global_bucket_start_pos, x_start / blockDim.x); - //} - if (threadIdx.x == 0) filter_count = 0; - __syncthreads(); - - uint32_t batch_id = 0; - for (batch_id = 0; batch_id < NUM_BATCHES_OF_THREADS; batch_id++) { - uint32_t x = x_start + batch_id * NUM_THREADS + threadIdx.x; - uint32_t chacha = chachas[x]; - xchacha_pair entry = { x, chacha }; - - uint32_t bucket_id = chacha / BUCKET_DIVISOR; - //printf("chacha %u - bucket id: %u\n", chacha, bucket_id); - if (bucket_id >= NUM_BUCKETS) printf("BUCKET OUT OF RANGE ERROR: %u", bucket_id); - if (bucket_id == 0) { - int slot = atomicAdd(&filter_count,1); - uint32_t results_address = GLOBAL_TILE_START + bucket_id * PHASE_1_DUMBSORT_MAX_PER_BUCKET + slot; - if (results_address < DUMBSORT_SPACE_NEEDED_FOR_SCRATCH) { - results[results_address] = entry; - } else { - printf("results address overflow %u - global start pos: %u bucket %u slot %u DUMBSORT_SPACE_NEEDED_FOR_SCRATCH: %u\n", - results_address, GLOBAL_TILE_START, bucket_id, slot, DUMBSORT_SPACE_NEEDED_FOR_SCRATCH); - } - } - } - } -} - -/*template -__global__ -void gpu_attack_process_global_kbc_pairs_list( - const int PAIRS_COUNT, unsigned int *kbc_pairs_list_L_bucket_ids, - const BUCKETED_ENTRY_IN *kbc_global_entries_L, const unsigned int *kbc_global_num_entries_L, - const uint32_t *rx_list, const uint RX_START, const uint RX_END, - Match_Attack_Pair_Index *match_list, int *match_counts, - const uint32_t KBC_MAX_ENTRIES) { - - // NOTE: possible optimization is to only get y elements of a list instead of ALL the meta... - // requires splitting the meta and y fields into two separate lists. Alternatively we copy - // all the meta chunk in this round. - - int i = blockIdx.x*blockDim.x+threadIdx.x; - - if (i < PAIRS_COUNT) { - unsigned int global_kbc_L_bucket_id = kbc_pairs_list_L_bucket_ids[i]; - - uint32_t kbc_bitmask_bucket = global_kbc_L_bucket_id / 8; - uint32_t kbc_bitmask_shift = 4*(global_kbc_L_bucket_id % 8); - uint32_t bitvalue = kbc_global_num_entries_L[kbc_bitmask_bucket]; - const unsigned int num_L = (bitvalue >> (kbc_bitmask_shift)) & 0b01111; - - kbc_bitmask_bucket = (global_kbc_L_bucket_id + 1) / 8; - kbc_bitmask_shift = 4*((global_kbc_L_bucket_id + 1) % 8); - bitvalue = kbc_global_num_entries_R[kbc_bitmask_bucket]; - const unsigned int num_R = (bitvalue >> (kbc_bitmask_shift)) & 0b01111; - - if ((num_L == 0) || (num_R == 0)) { - printf("ERROR: PAIRS LIST SHOULD NOT HAVE 0 COUNTS\n"); - return; // shouldn't ever happen with a pairs list... 
- } - - const uint32_t start_L = global_kbc_L_bucket_id*KBC_MAX_ENTRIES; - const uint32_t start_R = (global_kbc_L_bucket_id+1)*KBC_MAX_ENTRIES; - - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_global_entries_L[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_global_entries_R[start_R]; - - // For any 0 <= m < kExtraBitsPow: - // yl / kBC + 1 = yR / kBC AND - // (yr % kBC) / kC - (yl % kBC) / kC = m (mod kB) AND - // (yr % kBC) % kC - (yl % kBC) % kC = (2m + (yl/kBC) % 2)^2 (mod kC) - - for (int pos_R = 0; pos_R < num_R; pos_R+=1) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - int16_t yr_kbc = R_entry.y; - int16_t yr_bid = yr_kbc / kC; // values [0..kB] - for (uint16_t pos_L = 0; pos_L < num_L; pos_L++) { - // do L_entry and R_entry match? - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - int16_t yl_kbc = L_entry.y; - int16_t yl_bid = yl_kbc / kC; // values [0..kB] - int16_t formula_one = yr_bid - yl_bid; // this should actually give m - if (formula_one < 0) { - formula_one += kB; - } - int16_t m = formula_one; - if (m >= kB) { - m -= kB; - } - if (m < 64) { - // passed first test - int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC perfectly divisible by kC - int16_t yr_cid = yr_kbc % kC; - int16_t parity = (global_kbc_L_bucket_id) % 2; - int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127] - int16_t formula_two = yr_cid - yl_cid; - if (formula_two < 0) { - formula_two += kC; - } - if (formula_two == m2_parity_squared) { - // we have a match. - int slot = atomicAdd(&match_counts[0],1); - Match_Attack_Pair_Index match = { }; - match.bucket_L_id = global_kbc_L_bucket_id; - match.idx_L = pos_L; - match.idx_R = pos_R; - // *could* coelesce pair.meta[0..4] values here and y, instead of splitting y list. - // suspect splitting y list would be faster. - match_list[slot] = match; - } - } - } - } - } -}*/ - - -void attack_method_lxs(uint32_t num_lxs) { - - std::cout << "ATTACK METHOD LXS - SORT XS/YS! 
" << num_lxs << std::endl; - - using milli = std::chrono::milliseconds; - auto attack_start = std::chrono::high_resolution_clock::now(); - - - const uint32_t NUM_LXS = 20000000; - const uint32_t BATCHES = 64; - const uint32_t NUM_PER_BATCH = UINT_MAX / BATCHES; - const uint32_t KBC_MAX_BUCKET_SIZE = 32; // SHOULD BE MAX 19 FOR BATCHES 64 - // for our bucketing sort, we have a total number of grouped entries and divvy that up into 256 stripes to get - // our max per entry - const uint32_t MAX_TOTAL_GROUPED_ENTRIES = DUMBSORT_BATCHES_TILE_SPACE; - //const uint32_t MAX_ENTRIES_PER_GROUPING = MAX_TOTAL_GROUPED_ENTRIES / 256; - - - auto alloc_start = std::chrono::high_resolution_clock::now(); - int blockSize; uint64_t calc_N;uint64_t calc_blockSize;uint64_t calc_numBlocks;int numBlocks; - - uint32_t *chachas; - xchacha_pair *xchachas_buffer_1; - xchacha_pair *xchachas_buffer_2; - uint32_t *batched_chachas; - uint32_t *batched_xs; - unsigned int *xchachas_counts; - uint16_t *out_kbc_ys; - uint32_t *out_kbc_xs; - unsigned int *global_kbc_counts; - - std::cout << " NUM BATCHES: " << BATCHES << std::endl; - std::cout << " NUM PER BATCH: " << NUM_PER_BATCH << std::endl; - std::cout << " KBC MAX BUCKET SIZE:" << KBC_MAX_BUCKET_SIZE << std::endl; - std::cout << " MAX_TOTAL_GROUPED_ENTRIES: " << MAX_TOTAL_GROUPED_ENTRIES << std::endl; - - std::cout << " chachas size:" << (sizeof(uint32_t)*NUM_PER_BATCH) << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&chachas, sizeof(uint32_t)*NUM_PER_BATCH)); - CUDA_CHECK_RETURN(cudaMemset(chachas, 0, sizeof(uint32_t)*NUM_PER_BATCH)); - - std::cout << " xchachas_grouped size: " << (sizeof(xchacha_pair)*DUMBSORT_SPACE_NEEDED_FOR_SCRATCH) << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&xchachas_buffer_1, sizeof(xchacha_pair)*DUMBSORT_SPACE_NEEDED_FOR_SCRATCH)); - CUDA_CHECK_RETURN(cudaMalloc(&xchachas_buffer_2, sizeof(xchacha_pair)*DUMBSORT_SPACE_NEEDED_FOR_SCRATCH)); - CUDA_CHECK_RETURN(cudaMalloc(&xchachas_counts, sizeof(int)*1024)); // can be tuned to less, for now this is general - CUDA_CHECK_RETURN(cudaMemset(xchachas_counts, 0, 1024)); - batched_chachas = (uint32_t *) &xchachas_buffer_1[0]; - batched_xs = (uint32_t *) &xchachas_buffer_2[0]; - - - //std::cout << " out_kbc_ys size:" << (sizeof(uint16_t)*KBC_MAX_BUCKET_SIZE*kBC_NUM_BUCKETS) << std::endl; - //CUDA_CHECK_RETURN(cudaMalloc(&out_kbc_ys, sizeof(uint16_t)*KBC_MAX_BUCKET_SIZE*kBC_NUM_BUCKETS)); - //std::cout << " out_kbc_xs size:" << (sizeof(uint32_t)*KBC_MAX_BUCKET_SIZE*kBC_NUM_BUCKETS) << std::endl; - //CUDA_CHECK_RETURN(cudaMalloc(&out_kbc_xs, sizeof(uint32_t)*KBC_MAX_BUCKET_SIZE*kBC_NUM_BUCKETS)); - - std::cout << " global_kbc_counts size:" << (sizeof(int)*kBC_NUM_BUCKETS) << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&global_kbc_counts, sizeof(int)*kBC_NUM_BUCKETS)); - CUDA_CHECK_RETURN(cudaMemset(global_kbc_counts, 0, kBC_NUM_BUCKETS*sizeof(int))); - - int deviceCount = 0; - cudaError_t error_id = cudaGetDeviceCount(&deviceCount); - - if (error_id != cudaSuccess) { - printf("cudaGetDeviceCount returned %d\n-> %s\n", - static_cast(error_id), cudaGetErrorString(error_id)); - printf("Result = FAIL\n"); - exit(EXIT_FAILURE); - } - - // This function call returns 0 if there are no CUDA capable devices. 
- if (deviceCount == 0) { - printf("There are no available device(s) that support CUDA\n"); - } else { - printf("Detected %d CUDA Capable device(s)\n", deviceCount); - } - - //int device_id = 0; - //cudaSetDevice(device_id); - //cudaDeviceProp deviceProp; - //cudaGetDeviceProperties(&deviceProp, device_id); - //printf("\nDevice %d: \"%s\"\n", device_id, deviceProp.name); - //cudaDeviceSetLimit(cudaLimitPersistingL2CacheSize, deviceProp.persistingL2CacheMaxSize); - //std::cout << " persisting cache size: " << deviceProp.persistingL2CacheMaxSize << std::endl; - //std::cout << " accessPolicyMaxWindowSize: " << deviceProp.accessPolicyMaxWindowSize << std::endl; - //cudaStream_t stream; - //cudaStreamCreate(&stream); - //cudaStreamAttrValue attr; - //attr.accessPolicyWindow.base_ptr = global_kbc_counts; - //attr.accessPolicyWindow.num_bytes = kBC_NUM_BUCKETS*sizeof(int) / 32; - //attr.accessPolicyWindow.hitRatio = 1.0; - //attr.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting; - //attr.accessPolicyWindow.missProp = cudaAccessPropertyStreaming; - //cudaStreamSetAttribute(stream,cudaStreamAttributeAccessPolicyWindow,&attr); - - auto alloc_finish = std::chrono::high_resolution_clock::now(); - std::cout << " alloc time: " << std::chrono::duration_cast(alloc_finish - attack_start).count() << " ms\n"; - - auto compute_only_start = std::chrono::high_resolution_clock::now(); - - - auto chacha_start = std::chrono::high_resolution_clock::now(); - blockSize = 128; // # of threads per block, maximum is 1024. - calc_N = UINT_MAX / BATCHES; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 32); - numBlocks = calc_numBlocks; - - // NEW ALGORITHM!!!! - // 1) LOAD ALL LX'S INTO GLOBAL_KBC_L1_BUCKETED_YS - // 2) GO THROUGH EACH RX IN ORDER (NO SORTING!) AND FIND L VALUES IN BUCKETS AND CHECK FOR MATCHES. THAT'S IT. - // will the cache be fast enough???????? or will sorting be better!?!?!? 
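- // A sketch of what step 2) could look like as a kernel (hypothetical - lx_counts /
- // lx_ys are placeholder names, and the matching test is the same kBC check used by
- // CHECK_MATCH / gpu_chacha8_filter_rxs_from_bucket_batch elsewhere in this codebase):
- //
- // __global__ void gpu_match_rxs_against_lx_buckets(uint32_t N, const uint32_t *chachas, ...) {
- //     uint32_t x = blockIdx.x * blockDim.x + threadIdx.x;
- //     if (x >= N) return;
- //     uint64_t Ry = (((uint64_t) chachas[x]) << 6) + (x >> 26); // same y derivation as the filters
- //     uint32_t kbc_bucket_id_R = (uint32_t)(Ry / kBC);
- //     if (kbc_bucket_id_R == 0) return;
- //     uint32_t kbc_bucket_id_L = kbc_bucket_id_R - 1; // an R only pairs with the bucket below it
- //     for (uint32_t n = 0; n < lx_counts[kbc_bucket_id_L]; n++) {
- //         // random-access read of the preloaded L entries - this is the cache bet
- //         /* CHECK_MATCH() against lx_ys[kbc_bucket_id_L * MAX + n] ... */
- //     }
- // }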
- // can experiment with different local_kbc sizes to see if the lx's fit in cache and we get sufficient performance
-
-
- const int groupingBlockSize = 1024;
-
- //const uint32_t GROUPING_BATCH_MAX_ENTRIES_PER_BUCKET = 65536 / 8;
- int groupingNumBlocks = (NUM_PER_BATCH + groupingBlockSize - 1) / (GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK);
-
- int bitonicThreads = 512;
- int bitonicBlocks = NUM_PER_BATCH / 1024; // should be 65536
- std::cout << "GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK: " << GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK << " NUM BLOCKS: " << groupingNumBlocks << std::endl;
- uint32_t X_START = 0;
- for (uint32_t batch_id=0;batch_id < 1; batch_id++) {
- X_START = batch_id * (1 << (32-6));
-
- gpu_chacha8_k32_write_chachas32<<<numBlocks,blockSize>>>(calc_N, X_START, chacha_input, chachas); // 24ms
-
- //bitonicSortShared<<<bitonicBlocks,bitonicThreads>>>(chachas, batched_chachas, batched_xs);
- nickSortShared<<<1,SHARED_SIZE_LIMIT>>>(chachas, batched_chachas, batched_xs);
- //gpu_show_chacha_xs_lists<<<1,1>>>(0,10,batched_chachas, batched_xs);
- //gpu_show_chacha_xs_lists<<<1,1>>>(1024,10,batched_chachas, batched_xs);
- CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
- //CUDA_CHECK_RETURN(cudaMemset(xchachas_counts, 0, sizeof(int)*1024));
- //gpu_filter_chachas<<<groupingNumBlocks,groupingBlockSize>>>(
- // GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK, NUM_PER_BATCH, chachas,
- // xchachas_buffer_1, xchachas_buffer_2);
-
- //gpu_write_chachas_into_buckets_dumb_batches<<<groupingNumBlocks,groupingBlockSize>>>(
- // GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK, NUM_PER_BATCH, chachas,
- // xchachas_buffer_1, xchachas_buffer_2);
-
- //gpu_write_chachas_into_buckets_with_single_row_depthflush<<<groupingNumBlocks,groupingBlockSize>>>(
- // GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK, NUM_PER_BATCH, chachas,
- // MAX_TOTAL_GROUPED_ENTRIES, xchachas_grouped, xchachas_counts);
-
- /*
- * GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK: 65536
-
- gpu_chacha8_k32_write_chachas 4294967232 in 64 BATCHES results: 1158 ms
-Freeing memory...
-counter list counts SUM:67108864 MAX:263782
- */
-
- //gpu_write_chachas_into_buckets_with_buffer_batches<<<groupingNumBlocks,groupingBlockSize>>>(
- // GROUPING_BATCH_NUM_ENTRIES_PER_BLOCK, NUM_PER_BATCH, chachas,
- // MAX_ENTRIES_PER_GROUPING, xchachas_grouped, xchachas_counts);
-
- // stupid thrust, not a 100% deal breaker but close to being too slow
- //thrust::device_ptr<uint32_t> device_xs_R_ptr(out_kbc_xs);
- //thrust::device_ptr<uint32_t> device_ys_R_ptr(chachas);
- //thrust::sort_by_key(device_ys_R_ptr, device_ys_R_ptr + calc_N, device_xs_R_ptr);
- //thrust::sort(device_ys_R_ptr, device_ys_R_ptr + calc_N);
-
- //CUDA_CHECK_RETURN(cudaMemset(global_kbc_counts, 0, kBC_NUM_BUCKETS*sizeof(int))); // 30ms
- //gpu_filter_chachas_into_global_kbc_bucket<<<numBlocks,blockSize>>>(calc_N, X_START, chachas,
- // out_kbc_ys, out_kbc_xs, global_kbc_counts); // 56ms
- //gpu_get_max_count_in_global_kbc_bucket<<<1,256>>>(global_kbc_counts);
-
- }
-
-
- //CUDA_CHECK_RETURN(cudaMemset(device_global_kbc_num_entries_L, 0, 10000000*sizeof(int)));
- //gpu_chacha8_get_k32_keystream_into_local_kbc_entries<<<numBlocks,blockSize>>>(calc_N, chacha_input,
- // local_kbc_entries, device_global_kbc_num_entries_L, 0, 2000000);
- CUDA_CHECK_RETURN(cudaDeviceSynchronize());
- auto chacha_finish = std::chrono::high_resolution_clock::now();
- std::cout << " - gpu_chacha8_k32_write_chachas " << (calc_N*BATCHES) << " in " << BATCHES << " BATCHES results: " << std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count() << " ms\n";
- gpu_get_max_counts_from_counter_list<<<1,1>>>(xchachas_counts, 256);
- //gpu_show_chachas<<<1,1>>>(NUM_PER_BATCH, 10000, chachas);
- //CUDA_CHECK_RETURN(cudaDeviceSynchronize());
- //gpu_get_max_counts_from_counter_list<<<1,256>>>(global_kbc_counts, kBC_NUM_BUCKETS);
-
- auto compute_only_finish = std::chrono::high_resolution_clock::now();
-
- std::cout << "Freeing memory..." << std::endl;
- CUDA_CHECK_RETURN(cudaFree(chachas));
- //CUDA_CHECK_RETURN(cudaFree(out_kbc_ys)); // allocation above is commented out, so don't free
- //CUDA_CHECK_RETURN(cudaFree(out_kbc_xs)); // allocation above is commented out, so don't free
- CUDA_CHECK_RETURN(cudaFree(global_kbc_counts));
-
-
-
- auto attack_finish = std::chrono::high_resolution_clock::now();
- std::cout << " compute only time: " << std::chrono::duration_cast<milli>(compute_only_finish - compute_only_start).count() << " ms\n";
- std::cout << " attack total time: " << std::chrono::duration_cast<milli>(attack_finish - attack_start).count() << " ms\n";
- std::cout << "end."
<< std::endl; -} - - - - - - - -#endif /* ATTACK_METHOD_LXS2_HPP_ */ diff --git a/attack_method_xpairbits.hpp b/attack_method_xpairbits.hpp deleted file mode 100644 index 34f2cae..0000000 --- a/attack_method_xpairbits.hpp +++ /dev/null @@ -1,557 +0,0 @@ -/* - * attack_method_xpairbits.hpp - * - * Created on: Dec 5, 2021 - * Author: nick - */ - -#ifndef ATTACK_METHOD_XPAIRBITS_HPP_ -#define ATTACK_METHOD_XPAIRBITS_HPP_ - -const uint32_t MAX_LXS_PER_KBC_BUCKET = 16; // 24 for 110,000,000 - -const uint32_t XPAIR_BITS = 8; -const uint32_t MAX_RX_MATCHES = (1 << (32 - XPAIR_BITS))*2; -const uint32_t CHACHA_NUM_BATCHES_BITS = 3; -const uint32_t CHACHA_NUM_BATCHES = 1 << CHACHA_NUM_BATCHES_BITS; -const uint32_t CHACHA_TOTAL_ENTRIES_PER_BATCH = (1 << (32 - XPAIR_BITS - CHACHA_NUM_BATCHES_BITS)); -const uint32_t CHACHA_BUCKET_BITS = 4; // ACROSS ALL BATCHES -const uint32_t CHACHA_NUM_BUCKETS = (1 << CHACHA_BUCKET_BITS); -const uint32_t CHACHA_BUCKET_DIVISOR = (1 << (32 - CHACHA_BUCKET_BITS)); -const uint32_t CHACHA_SPLIT_BUCKET_DIVISOR = (1 << (32 - CHACHA_BUCKET_BITS - CHACHA_NUM_BATCHES_BITS)); -const uint32_t CHACHA_MAX_ENTRIES_PER_BUCKET = (11 * (CHACHA_TOTAL_ENTRIES_PER_BATCH / CHACHA_NUM_BUCKETS)) / 10; -const uint64_t CHACHA_OUT_MAX_ENTRIES_NEEDED = (CHACHA_NUM_BUCKETS * CHACHA_MAX_ENTRIES_PER_BUCKET); - -struct xchacha_pair { - uint32_t x; - uint32_t chacha; -}; - -#define KBC_MASK_SHIFT 4 -#define KBC_MASK_MOD 8 -#define KBC_MASK_BITS 0b001111 -#define ATTACK_INTO_KBC_YS_BITMASK(chacha_y,i) \ -{ \ - uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \ - uint32_t kbc_bucket_id = uint32_t (y / kBC); \ - uint32_t kbc_bitmask_bucket = kbc_bucket_id / KBC_MASK_MOD; \ - uint32_t kbc_bitmask_shift = KBC_MASK_SHIFT * (kbc_bucket_id % KBC_MASK_MOD); \ - uint32_t add = 1 << kbc_bitmask_shift; \ - uint slot_value = atomicAdd(&kbc_global_num_entries_L[kbc_bitmask_bucket],add); \ - uint slot = (slot_value >> kbc_bitmask_shift) & KBC_MASK_BITS; \ - if (slot > MAX_LXS_PER_KBC_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u\n", MAX_LXS_PER_KBC_BUCKET, slot); } \ - uint32_t entries_address = kbc_bucket_id * MAX_LXS_PER_KBC_BUCKET + slot; \ - kbc_global_Ly_entries_L[entries_address] = y; \ - kbc_x_entries[entries_address] = (x + i); \ -} - -__global__ -void gpu_chacha8_set_Lxs_into_kbc_ys_mask(const uint32_t N, - const __restrict__ uint32_t *input, - uint16_t *kbc_global_Ly_entries_L, uint32_t *kbc_x_entries, unsigned int *kbc_global_num_entries_L, uint32_t MAX_LXS_PER_KBC_BUCKET) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - int stride = blockDim.x * gridDim.x; - const uint32_t end_n = N / 16; // 16 x's in each group - - for (uint32_t x_group = index; x_group < end_n; x_group += stride) { - uint32_t x = x_group << 4;// *16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += 
input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - //uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023 - ATTACK_INTO_KBC_YS_BITMASK(x0,0);ATTACK_INTO_KBC_YS_BITMASK(x1,1);ATTACK_INTO_KBC_YS_BITMASK(x2,2);ATTACK_INTO_KBC_YS_BITMASK(x3,3); - ATTACK_INTO_KBC_YS_BITMASK(x4,4);ATTACK_INTO_KBC_YS_BITMASK(x5,5);ATTACK_INTO_KBC_YS_BITMASK(x6,6);ATTACK_INTO_KBC_YS_BITMASK(x7,7); - ATTACK_INTO_KBC_YS_BITMASK(x8,8);ATTACK_INTO_KBC_YS_BITMASK(x9,9);ATTACK_INTO_KBC_YS_BITMASK(x10,10);ATTACK_INTO_KBC_YS_BITMASK(x11,11); - ATTACK_INTO_KBC_YS_BITMASK(x12,12);ATTACK_INTO_KBC_YS_BITMASK(x13,13);ATTACK_INTO_KBC_YS_BITMASK(x14,14);ATTACK_INTO_KBC_YS_BITMASK(x15,15); - } -} - -__global__ -void gpu_list_xchachas(const uint32_t N, const xchacha_pair *xchachas) -{ - uint index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < N) { - xchacha_pair pair = xchachas[index]; - uint64_t y = (((uint64_t) pair.chacha) << 6) + (pair.x >> 26); - uint32_t kbc_bucket_id = uint32_t (y / kBC); - printf("set xchachas kbc mask index: %u x: %u chacha: %u y: %llu kbc_bucket_id: %u\n", - index, pair.x, pair.chacha, y, kbc_bucket_id); - } -} - -__global__ -void gpu_chacha8_set_xchachas_into_kbc_ys_mask(const uint32_t N, - const xchacha_pair *xchachas, - uint16_t *kbc_global_Ly_entries_L, uint32_t *kbc_x_entries, unsigned int *kbc_global_num_entries_L, uint32_t MAX_LXS_PER_KBC_BUCKET) -{ - int index = blockIdx.x * blockDim.x + threadIdx.x; - if (index < N) { - xchacha_pair pair = xchachas[index]; - uint64_t y = (((uint64_t) pair.chacha) << 6) + (pair.x >> 26); - uint32_t kbc_bucket_id = uint32_t (y / kBC); - if (index < 10) - printf("set xchachas kbc mask index: %u x: %u chacha: %u y: %llu kbc_bucket_id: %u\n", - index, pair.x, pair.chacha, y, kbc_bucket_id); - //uint32_t kbc_bitmask_bucket = kbc_bucket_id / KBC_MASK_MOD; - //uint32_t kbc_bitmask_shift = KBC_MASK_SHIFT * (kbc_bucket_id % KBC_MASK_MOD); - //uint32_t add = 1 << kbc_bitmask_shift; - //uint slot_value = atomicAdd(&kbc_global_num_entries_L[kbc_bitmask_bucket],add); - //uint slot = (slot_value >> kbc_bitmask_shift) & KBC_MASK_BITS; - - uint slot = atomicAdd(&kbc_global_num_entries_L[kbc_bucket_id],1); - - if (index < 10) { - printf("set xchachas kbc mask index: %u x: %u chacha: %u y: %llu kbc_bucket_id: %u\n", - index, pair.x, pair.chacha, y, kbc_bucket_id); - } - - //if (slot > MAX_LXS_PER_KBC_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u\n", MAX_LXS_PER_KBC_BUCKET, slot); } - //uint32_t entries_address = kbc_bucket_id * MAX_LXS_PER_KBC_BUCKET + slot; - //kbc_global_Ly_entries_L[entries_address] = y % kBC; - //kbc_x_entries[entries_address] = pair.x; - } - -} - -__global__ void gpu_get_max_counts_from_counter_list(unsigned int *kbc_counts, const int NUM, const bool printAll) { - __shared__ unsigned int max_kbc_count; - __shared__ unsigned int sum_kbc_count; - if (threadIdx.x == 0) { - max_kbc_count = 0; - sum_kbc_count = 0; - } - __syncthreads(); - for (uint32_t 
i=threadIdx.x;i> kbc_bitmask_shift) & KBC_MASK_BITS; - unsigned int kbc_count = kbc_counts[i]; - if (printAll) printf("id: %u count: %u\n", i, kbc_count); - atomicMax(&max_kbc_count, kbc_count); - atomicAdd(&sum_kbc_count, kbc_count); - } - __syncthreads(); - if (threadIdx.x == 0) printf("counter list counts SUM:%u MAX:%u\n", sum_kbc_count, max_kbc_count); -} - -#define ATTACK_BUCKETBATCH_CHACHAS32_PAIR(chacha_y,i) \ -{ \ - if ((chacha_y >= BATCH_CHACHA_RANGE_MIN) && (chacha_y <= BATCH_CHACHA_RANGE_MAX)) { \ - xchacha_pair pair = { base_x + i, chacha_y }; \ - int slot = atomicAdd(&local_filter_count,1); \ - if (slot > MAX_SHARED_CHACHAS) printf("MAX_SHARED_CHACHAS %u OVERFLOW %u\n", MAX_SHARED_CHACHAS, slot); \ - shared_chachas[slot] = pair; \ - uint32_t split_bucket_id = (chacha_y - BATCH_CHACHA_RANGE_MIN) / CHACHA_SPLIT_BUCKET_DIVISOR; \ - atomicAdd(&shared_counts[split_bucket_id],1); \ - } \ -} - -// run with 128 blocksize, more doesn't matter. -template -__global__ -void gpu_chacha8_k32_compute_chachas32_filter_buckets_bychachabatchrange(const uint32_t N, - const uint32_t BATCH_CHACHA_RANGE_MIN, const uint32_t BATCH_CHACHA_RANGE_MAX, - const uint32_t CHACHA_MAX_PER_SPLIT_BUCKET, const uint32_t CHACHA_SPLIT_BUCKET_DIVISOR, - const __restrict__ uint32_t *input, - xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - //__shared__ uint32_t datax[33*256]; // each thread (256 max) gets its own shared access starting at 32 byte boundary. - //uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - const uint32_t MAX_SHARED_CHACHAS = 128*8; // try to bring down as much as can - __shared__ xchacha_pair shared_chachas[MAX_SHARED_CHACHAS]; // *possibly* using 32 to prevent some bank conflicts can help, but don't thing so. 
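- // Shared budget check (sketch arithmetic): 128*8 pairs * sizeof(xchacha_pair) = 8KB
- // for the staging array, plus the two small count arrays below - far under the 48KB
- // default, so MAX_SHARED_CHACHAS has headroom to grow if the overflow printf fires.
- static_assert(MAX_SHARED_CHACHAS * sizeof(xchacha_pair) <= 49152,
- "staged xchacha pairs must fit in default shared memory");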
- __shared__ uint shared_counts[NUM_SPLIT_BUCKETS]; - __shared__ uint global_counts[NUM_SPLIT_BUCKETS]; - __shared__ uint local_filter_count; - - //if (blockDim.x > 128) printf("MUST HAVE BLOCKSIZE 128 (RECOMMENDED) OR LESS, OR INCREASED SHARED MEM TO MORE\n"); - - //uint32_t base_group = blockIdx.x * blockDim.x; - - uint32_t x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - uint32_t base_x = x_group * 32; - const uint32_t end_n = N / 32; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - for (int i=threadIdx.x;i> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+0],0);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+1],1);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+2],2);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+3],3); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+4],4);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+5],5);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+6],6);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+7],7); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+8],8);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+9],9);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+10],10);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+11],11); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+12],12);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+13],13);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+14],14);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+15],15); - - pos += 1; - - datax[j+0] = input[0];datax[j+1] = input[1];datax[j+2] = input[2];datax[j+3] = input[3];datax[j+4] = input[4];datax[j+5] = input[5];datax[j+6] = input[6];datax[j+7] = input[7]; - datax[j+8] = input[8];datax[j+9] = input[9];datax[j+10] = input[10];datax[j+11] = input[11]; - datax[j+12] = pos; datax[j+13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[j+14] = input[14];datax[j+15] = input[15]; - -#pragma unroll - for (int i = 
 - -#pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[j+0], datax[j+4], datax[j+8], datax[j+12]);QUARTERROUND(datax[j+1], datax[j+5], datax[j+9], datax[j+13]); - QUARTERROUND(datax[j+2], datax[j+6], datax[j+10], datax[j+14]);QUARTERROUND(datax[j+3], datax[j+7], datax[j+11], datax[j+15]); - QUARTERROUND(datax[j+0], datax[j+5], datax[j+10], datax[j+15]);QUARTERROUND(datax[j+1], datax[j+6], datax[j+11], datax[j+12]); - QUARTERROUND(datax[j+2], datax[j+7], datax[j+8], datax[j+13]);QUARTERROUND(datax[j+3], datax[j+4], datax[j+9], datax[j+14]); - } - - datax[j+0] += input[0];datax[j+1] += input[1];datax[j+2] += input[2];datax[j+3] += input[3];datax[j+4] += input[4]; - datax[j+5] += input[5];datax[j+6] += input[6];datax[j+7] += input[7];datax[j+8] += input[8];datax[j+9] += input[9]; - datax[j+10] += input[10];datax[j+11] += input[11];datax[j+12] += x_group; // j12;//datax[j+13] += 0; - datax[j+14] += input[14];datax[j+15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[j+0]);BYTESWAP32(datax[j+1]);BYTESWAP32(datax[j+2]);BYTESWAP32(datax[j+3]);BYTESWAP32(datax[j+4]);BYTESWAP32(datax[j+5]); - BYTESWAP32(datax[j+6]);BYTESWAP32(datax[j+7]);BYTESWAP32(datax[j+8]);BYTESWAP32(datax[j+9]);BYTESWAP32(datax[j+10]);BYTESWAP32(datax[j+11]); - BYTESWAP32(datax[j+12]);BYTESWAP32(datax[j+13]);BYTESWAP32(datax[j+14]);BYTESWAP32(datax[j+15]); - - //uint64_t y = datax[j+0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[j+0] >> 22; // gives bucket id 0..1023 - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+0],16+0);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+1],16+1);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+2],16+2);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+3],16+3); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+4],16+4);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+5],16+5);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+6],16+6);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+7],16+7); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+8],16+8);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+9],16+9);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+10],16+10);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+11],16+11); - ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+12],16+12);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+13],16+13);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+14],16+14);ATTACK_BUCKETBATCH_CHACHAS32_PAIR(datax[j+15],16+15); - } - // at this point we have 128*32 = 4096 entries - // now we have to sort them into the buckets - // we already have the shared counts set from the ATTACK macro - __syncthreads(); - for (int i=threadIdx.x;i<local_filter_count;i+=blockDim.x) { // (reconstructed: the original loop body was lost in extraction; bucket id and slot derivation inferred from the write below) - const xchacha_pair pair = shared_chachas[i]; - const uint32_t split_bucket_id = pair.x / CHACHA_SPLIT_BUCKET_DIVISOR; - int slot = atomicAdd(&xchachas_bucket_counts[split_bucket_id],1); - if (slot > CHACHA_MAX_PER_SPLIT_BUCKET) printf("Overflow CHACHA_MAX_PER_BUCKET %u SLOT %u\n", CHACHA_MAX_PER_SPLIT_BUCKET, slot); - else xchachas_buckets[CHACHA_MAX_PER_SPLIT_BUCKET * split_bucket_id + slot] = shared_chachas[i]; - } -} - -#define CHECK_MATCH() \ -{ \ - int16_t yr_kbc = Ry % kBC; \ - int16_t yr_bid = yr_kbc / kC; \ - int16_t yl_bid = yl_kbc / kC; \ - int16_t formula_one = yr_bid - yl_bid; \ - if (formula_one < 0) { \ - formula_one += kB; \ - } \ - int16_t m = formula_one; \ - if (m >= kB) { \ - m -= kB; \ - } \ - if (m < 64) { \ - int16_t yl_cid = yl_kbc % kC; \ - int16_t yr_cid = yr_kbc % kC;\ - int16_t parity = (kbc_bucket_id_L) % 2; \ - int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; \ - int16_t formula_two = yr_cid - yl_cid; \ - if (formula_two < 0) { \ - formula_two += kC; \ - } \ - if (formula_two == m2_parity_squared) { \ - isMatch = true; \ - } \ - } \ -}
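// --- Editor's note: illustrative sketch, not part of the original file. ---
// CHECK_MATCH() above encodes Chia's kBC matching condition. A standalone,
// host-side version of the same predicate (assuming the chiapos constants
// kB = 119, kC = 127, so kBC = 15113, and kExtraBits = 6 so m ranges over
// [0, 64)) would look like this:
static inline bool kbc_is_match(int16_t yl_kbc, int16_t yr_kbc, int16_t parity)
{
    const int16_t kB_ = 119, kC_ = 127;           // chiapos constants (assumed)
    int16_t m = (yr_kbc / kC_) - (yl_kbc / kC_);  // distance between B-ids
    if (m < 0) m += kB_;
    if (m >= kB_) m -= kB_;
    if (m >= 64) return false;                    // only 64 candidate slots match
    int16_t m2 = (int16_t)((((2 * m) + parity) * ((2 * m) + parity)) % kC_);
    int16_t diff_c = (yr_kbc % kC_) - (yl_kbc % kC_);
    if (diff_c < 0) diff_c += kC_;
    return diff_c == m2;
}
// The left bucket's parity ((kbc_bucket_id_L) % 2) selects which quadratic
// residue table applies, matching the macro above.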
-__global__ -void gpu_chacha8_filter_rxs_from_bucket_batch( - const uint32_t N, - const xchacha_pair* __restrict__ xchachas, - const uint16_t* __restrict__ kbc_global_Ly_entries_L, - const unsigned int* __restrict__ kbc_global_num_entries_L, - uint32_t MAX_LXS_PER_KBC_BUCKET, - uint32_t * __restrict__ rxs, - int *rx_count) -{ - int i = blockIdx.x*blockDim.x+threadIdx.x; - if (i < N) { - xchacha_pair entry = xchachas[i]; - uint64_t Ry = (((uint64_t) entry.chacha) << 6) + (entry.x >> 26); - int kbc_bucket_id_R = (uint32_t (Ry / kBC)); - if (kbc_bucket_id_R > 0) { - int kbc_bucket_id_L = kbc_bucket_id_R - 1; - //printf("entry x:%u chacha:%u\n", entry.x, entry.chacha, kbc_bucket_id_L); - //int num = kbc_global_num_entries_L[kbc_bucket_id_L]; - - //uint num = kbc_global_num_entries_L[kbc_bucket_id_L]; - uint32_t kbc_bitmask_bucket = kbc_bucket_id_L / KBC_MASK_MOD; - uint32_t kbc_bitmask_shift = KBC_MASK_SHIFT * (kbc_bucket_id_L % KBC_MASK_MOD); - uint slot_value = kbc_global_num_entries_L[kbc_bitmask_bucket]; - uint num = (slot_value >> kbc_bitmask_shift) & KBC_MASK_BITS; - for (int nm=0;nm<num;nm++) { // (loop body reconstructed -- the original span from here through the host-side setup below was lost in extraction) - bool isMatch = false; - int16_t yl_kbc = kbc_global_Ly_entries_L[kbc_bucket_id_L * MAX_LXS_PER_KBC_BUCKET + nm]; - CHECK_MATCH(); - if (isMatch) { - int slot = atomicAdd(&rx_count[0],1); - rxs[slot] = entry.x; - } - } - } - } -} - -// [host-side attack function opening, device allocations and start-of-attack timing lost in extraction] - std::cout << " alloc time: " << std::chrono::duration_cast<milli>(alloc_finish - attack_start).count() << " ms\n"; - - auto compute_only_start = std::chrono::high_resolution_clock::now(); - - int blockSize; // # of threads per block, maximum is 1024. - uint64_t calc_N; - uint64_t calc_blockSize; - uint64_t calc_numBlocks; - int numBlocks; - - // FIRST SET LXS into global memory, these stay put for each chacha round - /*blockSize = 256; // # of threads per block, maximum is 1024. - calc_N = 1 << (32 - XPAIR_BITS); - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - numBlocks = calc_numBlocks; - - std::cout << " gpu_chacha8_set_Lxs_into_kbc_ys num:" << calc_N << std::endl; - auto lxintokbc_start = std::chrono::high_resolution_clock::now(); - gpu_chacha8_set_Lxs_into_kbc_ys_mask<<<numBlocks, blockSize>>>(calc_N, chacha_input, - kbc_Ly_entries, kbc_x_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto lxintokbc_finish = std::chrono::high_resolution_clock::now(); - std::cout << " gpu_chacha8_set_Lxs_into_kbc_ys time: " << std::chrono::duration_cast<milli>(lxintokbc_finish - lxintokbc_start).count() << " ms\n"; - gpu_get_max_counts_from_counter_list<<<1,1024>>>(device_global_kbc_num_entries_L, kBC_NUM_BUCKETS, false); - CUDA_CHECK_RETURN(cudaDeviceSynchronize());*/ - - auto chacha_batches_start = std::chrono::high_resolution_clock::now(); - int64_t total_chacha_ms = 0; - uint32_t sum_counts = 0; - for (uint64_t chacha_batch_id = 0; chacha_batch_id < 1/*CHACHA_NUM_BATCHES*/; chacha_batch_id++) { - //std::cout << "Doing chacha batch " << chacha_batch_id << std::endl; - uint64_t BATCH_CHACHA_DIVISOR = (1 << (32 - CHACHA_NUM_BATCHES_BITS)); - uint64_t BATCH_CHACHA_RANGE_MIN = ((uint64_t) (chacha_batch_id + 0)) * BATCH_CHACHA_DIVISOR; - uint64_t BATCH_CHACHA_RANGE_MAX = ((uint64_t) (chacha_batch_id + 1)) * BATCH_CHACHA_DIVISOR - 1; // use -1 since range is inclusive, also helps stay in 32-bit range rather than wrap to 0 for last batch - - //std::cout << " BATCH_CHACHA_DIVISOR : " << BATCH_CHACHA_DIVISOR << std::endl; - //std::cout << " BATCH_CHACHA_RANGE : " << BATCH_CHACHA_RANGE_MIN << " <-> " << BATCH_CHACHA_RANGE_MAX << std::endl; - //std::cout << " BATCH_CHACHA_TOTAL_ENTRIES : " << CHACHA_TOTAL_ENTRIES_PER_BATCH << std::endl; - //std::cout << " CHACHA_MAX_ENTRIES_PER_BUCKET : " << CHACHA_MAX_ENTRIES_PER_BUCKET << std::endl; - //std::cout << " CHACHA_SPLIT_BUCKET_DIVISOR : " << CHACHA_SPLIT_BUCKET_DIVISOR << std::endl;
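 - // (Editor's annotation, not original code.) Sizing note for the launch below:
 - // each thread of the chachas32 kernel consumes 32 x values, hence
 - // numBlocks = ceil(calc_N / (blockSize * 32)). For example, assuming
 - // XPAIR_BITS = 4: calc_N = 1 << 28, and with blockSize = 128 that gives
 - // numBlocks = 2^28 / (128 * 32) = 65536.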
 - - blockSize = 128; // # of threads per block, maximum is 1024. - calc_N = 1 << (32 - XPAIR_BITS); //CHACHA_TOTAL_ENTRIES_PER_BATCH; - uint32_t CHACHA_X_START = chacha_batch_id * calc_N; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 32); - numBlocks = calc_numBlocks; - CUDA_CHECK_RETURN(cudaMemset(lxchachas_bucket_counts, 0, CHACHA_NUM_BUCKETS*sizeof(int))); - auto chacha_start = std::chrono::high_resolution_clock::now(); - //std::cout << " calc_N : " << calc_N << " numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl; - gpu_chacha8_k32_compute_chachas32_filter_buckets_bychachabatchrange<<<numBlocks, blockSize>>>(calc_N, - BATCH_CHACHA_RANGE_MIN, BATCH_CHACHA_RANGE_MAX, - CHACHA_MAX_ENTRIES_PER_BUCKET, CHACHA_SPLIT_BUCKET_DIVISOR, - chacha_input, - lxchachas, lxchachas_bucket_counts); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto chacha_finish = std::chrono::high_resolution_clock::now(); - total_chacha_ms += std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count(); - gpu_get_max_counts_from_counter_list<<<1,1>>>(lxchachas_bucket_counts, CHACHA_NUM_BUCKETS, true); - //auto chacha_rs_start = std::chrono::high_resolution_clock::now(); - - - for (uint chacha_bucket_id=0;chacha_bucket_id<CHACHA_NUM_BUCKETS;chacha_bucket_id++) { // (loop bound reconstructed -- lost in extraction) - gpu_show_xchachas<<<1,1>>>(10, &lxchachas[chacha_bucket_id]); // (kernel name reconstructed -- original lost in extraction) - blockSize = 256; // # of threads per block, maximum is 1024. - calc_N = lxchachas_bucket_counts[chacha_bucket_id]; - sum_counts += calc_N; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize); - numBlocks = calc_numBlocks; - std::cout << "Setting kbcs calc_N: " << calc_N << " numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl; - //gpu_chacha8_set_xchachas_into_kbc_ys_mask<<<numBlocks, blockSize>>>(calc_N, &lxchachas[chacha_bucket_id], - // kbc_Ly_entries, kbc_x_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET); - } - std::cout << "sum counts: " << sum_counts << std::endl; - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - //std::cout << " gpu_chacha8_k32_write_chachas32_buckets results: " << std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count() << " ms\n"; - } - - gpu_get_max_counts_from_counter_list<<<1,1>>>(device_global_kbc_num_entries_L, 100, true);//kBC_NUM_BUCKETS, false); - - - /*for (uint64_t chacha_batch_id = 0; chacha_batch_id < CHACHA_NUM_BATCHES; chacha_batch_id++) { - //std::cout << "Doing chacha batch " << chacha_batch_id << std::endl; - uint64_t BATCH_CHACHA_DIVISOR = (1 << (32 - CHACHA_NUM_BATCHES_BITS)); - uint64_t BATCH_CHACHA_RANGE_MIN = ((uint64_t) (chacha_batch_id + 0)) * BATCH_CHACHA_DIVISOR; - uint64_t BATCH_CHACHA_RANGE_MAX = ((uint64_t) (chacha_batch_id + 1)) * BATCH_CHACHA_DIVISOR - 1; // use -1 since range is inclusive, also helps stay in 32-bit range rather than wrap to 0 for last batch - - //std::cout << " BATCH_CHACHA_DIVISOR : " << BATCH_CHACHA_DIVISOR << std::endl; - //std::cout << " BATCH_CHACHA_RANGE : " << BATCH_CHACHA_RANGE_MIN << " <-> " << BATCH_CHACHA_RANGE_MAX << std::endl; - //std::cout << " BATCH_CHACHA_TOTAL_ENTRIES : " << CHACHA_TOTAL_ENTRIES_PER_BATCH << std::endl; - //std::cout << " CHACHA_MAX_ENTRIES_PER_BUCKET : " << CHACHA_MAX_ENTRIES_PER_BUCKET << std::endl; - //std::cout << " CHACHA_SPLIT_BUCKET_DIVISOR : " << CHACHA_SPLIT_BUCKET_DIVISOR << std::endl; - - blockSize = 128; // # of threads per block, maximum is 1024. 
- calc_N = 1 << (32 - XPAIR_BITS); //CHACHA_TOTAL_ENTRIES_PER_BATCH; - uint32_t CHACHA_X_START = 0;//chacha_batch_id * calc_N; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 32); - numBlocks = calc_numBlocks; - CUDA_CHECK_RETURN(cudaMemset(rxchachas_bucket_counts, 0, CHACHA_NUM_BUCKETS*sizeof(int))); - auto chacha_start = std::chrono::high_resolution_clock::now(); - //std::cout << " calc_N : " << calc_N << " numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl; - gpu_chacha8_k32_compute_chachas32_filter_buckets_bychachabatchrange<<<numBlocks, blockSize>>>(calc_N, - BATCH_CHACHA_RANGE_MIN, BATCH_CHACHA_RANGE_MAX, - CHACHA_MAX_ENTRIES_PER_BUCKET, CHACHA_SPLIT_BUCKET_DIVISOR, - chacha_input, - rxchachas, rxchachas_bucket_counts); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto chacha_finish = std::chrono::high_resolution_clock::now(); - total_chacha_ms += std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count(); - //gpu_get_max_counts_from_counter_list<<<1,1>>>(xchachas_bucket_counts, CHACHA_NUM_BUCKETS, true); - //auto chacha_rs_start = std::chrono::high_resolution_clock::now(); - - for (uint chacha_bucket_id=0;chacha_bucket_id<CHACHA_NUM_BUCKETS;chacha_bucket_id++) { (loop bound and kernel launch reconstructed -- lost in extraction; arguments match the kernel defined above) - gpu_chacha8_filter_rxs_from_bucket_batch<<<numBlocks, blockSize>>>( - calc_N, - &rxchachas[chacha_bucket_id], - kbc_Ly_entries, device_global_kbc_num_entries_L, MAX_LXS_PER_KBC_BUCKET, - rx_match_list, rx_match_count); - } - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - //std::cout << " gpu_chacha8_k32_write_chachas32_buckets results: " << std::chrono::duration_cast<milli>(chacha_finish - chacha_start).count() << " ms\n"; - }*/ - - - - auto compute_only_finish = std::chrono::high_resolution_clock::now(); - - std::cout << "Freeing memory..." << std::endl; - CUDA_CHECK_RETURN(cudaFree(kbc_Ly_entries)); - CUDA_CHECK_RETURN(cudaFree(kbc_x_entries)); - CUDA_CHECK_RETURN(cudaFree(device_global_kbc_num_entries_L)); - - auto attack_finish = std::chrono::high_resolution_clock::now(); - std::cout << " found " << rx_match_count[0] << " matches" << std::endl; - std::cout << " total chachas time: " << total_chacha_ms << " ms\n"; - std::cout << " compute only time: " << std::chrono::duration_cast<milli>(compute_only_finish - compute_only_start).count() << " ms\n"; - std::cout << " attack total time: " << std::chrono::duration_cast<milli>(attack_finish - attack_start).count() << " ms\n"; - std::cout << "end." 
<< std::endl; -} - - -#endif /* ATTACK_METHOD_XPAIRBITS_HPP_ */ diff --git a/chia/chacha8.c b/chia/chacha8.c deleted file mode 100644 index dc707a7..0000000 --- a/chia/chacha8.c +++ /dev/null @@ -1,355 +0,0 @@ -#include "chacha8.h" - - - -static const char sigma[16] = "expand 32-byte k"; -static const char tau[16] = "expand 16-byte k"; - -void chacha8_keysetup_data(uint32_t *input, const uint8_t *k, uint32_t kbits, const uint8_t *iv) -{ - const char *constants; - - input[4] = U8TO32_LITTLE(k + 0); - input[5] = U8TO32_LITTLE(k + 4); - input[6] = U8TO32_LITTLE(k + 8); - input[7] = U8TO32_LITTLE(k + 12); - if (kbits == 256) { /* recommended */ - k += 16; - constants = sigma; - } else { /* kbits == 128 */ - constants = tau; - } - input[8] = U8TO32_LITTLE(k + 0); - input[9] = U8TO32_LITTLE(k + 4); - input[10] = U8TO32_LITTLE(k + 8); - input[11] = U8TO32_LITTLE(k + 12); - input[0] = U8TO32_LITTLE(constants + 0); - input[1] = U8TO32_LITTLE(constants + 4); - input[2] = U8TO32_LITTLE(constants + 8); - input[3] = U8TO32_LITTLE(constants + 12); - if (iv) { - input[14] = U8TO32_LITTLE(iv + 0); - input[15] = U8TO32_LITTLE(iv + 4); - } else { - input[14] = 0; - input[15] = 0; - } - - //for (int i=0;i<=15;i++) { - // printf("\ninput%d:%u", i, input[i]); - //} - //exit(0); -} - -void chacha8_get_keystream_data(const uint32_t *input, uint64_t pos, uint32_t n_blocks, uint8_t *c) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; - int i; - - j0 = input[0]; - j1 = input[1]; - j2 = input[2]; - j3 = input[3]; - j4 = input[4]; - j5 = input[5]; - j6 = input[6]; - j7 = input[7]; - j8 = input[8]; - j9 = input[9]; - j10 = input[10]; - j11 = input[11]; - j12 = pos; - j13 = pos >> 32; - j14 = input[14]; - j15 = input[15]; - - while (n_blocks--) { - x0 = j0; - x1 = j1; - x2 = j2; - x3 = j3; - x4 = j4; - x5 = j5; - x6 = j6; - x7 = j7; - x8 = j8; - x9 = j9; - x10 = j10; - x11 = j11; - x12 = j12; - x13 = j13; - x14 = j14; - x15 = j15; - for (i = 8; i > 0; i -= 2) { - QUARTERROUND(x0, x4, x8, x12); - QUARTERROUND(x1, x5, x9, x13); - QUARTERROUND(x2, x6, x10, x14); - QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15); - QUARTERROUND(x1, x6, x11, x12); - QUARTERROUND(x2, x7, x8, x13); - QUARTERROUND(x3, x4, x9, x14); - } - x0 = PLUS(x0, j0); - x1 = PLUS(x1, j1); - x2 = PLUS(x2, j2); - x3 = PLUS(x3, j3); - x4 = PLUS(x4, j4); - x5 = PLUS(x5, j5); - x6 = PLUS(x6, j6); - x7 = PLUS(x7, j7); - x8 = PLUS(x8, j8); - x9 = PLUS(x9, j9); - x10 = PLUS(x10, j10); - x11 = PLUS(x11, j11); - x12 = PLUS(x12, j12); - x13 = PLUS(x13, j13); - x14 = PLUS(x14, j14); - x15 = PLUS(x15, j15); - - j12 = PLUSONE(j12); - if (!j12) { - j13 = PLUSONE(j13); - /* stopping at 2^70 bytes per nonce is user's responsibility */ - } - - U32TO8_LITTLE(c + 0, x0); - U32TO8_LITTLE(c + 4, x1); - U32TO8_LITTLE(c + 8, x2); - U32TO8_LITTLE(c + 12, x3); - U32TO8_LITTLE(c + 16, x4); - U32TO8_LITTLE(c + 20, x5); - U32TO8_LITTLE(c + 24, x6); - U32TO8_LITTLE(c + 28, x7); - U32TO8_LITTLE(c + 32, x8); - U32TO8_LITTLE(c + 36, x9); - U32TO8_LITTLE(c + 40, x10); - U32TO8_LITTLE(c + 44, x11); - U32TO8_LITTLE(c + 48, x12); - U32TO8_LITTLE(c + 52, x13); - U32TO8_LITTLE(c + 56, x14); - U32TO8_LITTLE(c + 60, x15); - - c += 64; - } -} - -void chacha8_get_k32_keystream_data(const uint32_t *input, uint64_t pos, uint32_t n_blocks, uint32_t *c) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - uint32_t j0, j1, j2, j3, j4, j5, 
j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; - int i; - - j0 = input[0]; - j1 = input[1]; - j2 = input[2]; - j3 = input[3]; - j4 = input[4]; - j5 = input[5]; - j6 = input[6]; - j7 = input[7]; - j8 = input[8]; - j9 = input[9]; - j10 = input[10]; - j11 = input[11]; - j12 = pos; - j13 = pos >> 32; - j14 = input[14]; - j15 = input[15]; - - while (n_blocks--) { - x0 = j0; - x1 = j1; - x2 = j2; - x3 = j3; - x4 = j4; - x5 = j5; - x6 = j6; - x7 = j7; - x8 = j8; - x9 = j9; - x10 = j10; - x11 = j11; - x12 = j12; - x13 = j13; - x14 = j14; - x15 = j15; - for (i = 8; i > 0; i -= 2) { - QUARTERROUND(x0, x4, x8, x12); - QUARTERROUND(x1, x5, x9, x13); - QUARTERROUND(x2, x6, x10, x14); - QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15); - QUARTERROUND(x1, x6, x11, x12); - QUARTERROUND(x2, x7, x8, x13); - QUARTERROUND(x3, x4, x9, x14); - } - x0 += j0; - x1 += j1; - x2 += j2; - x3 += j3; - x4 += j4; - x5 += j5; - x6 += j6; - x7 += j7; - x8 += j8; - x9 += j9; - x10 += j10; - x11 += j11; - x12 += j12; - x13 += j13; - x14 += j14; - x15 += j15; - - j12 = PLUSONE(j12); - if (!j12) { - j13 = PLUSONE(j13); - /* stopping at 2^70 bytes per nonce is user's responsibility */ - } - c[0] = __builtin_bswap32(x0); - c[1] = __builtin_bswap32(x1); - c[2] = __builtin_bswap32(x2); - c[3] = __builtin_bswap32(x3); - c[4] = __builtin_bswap32(x4); - c[5] = __builtin_bswap32(x5); - c[6] = __builtin_bswap32(x6); - c[7] = __builtin_bswap32(x7); - c[8] = __builtin_bswap32(x8); - c[9] = __builtin_bswap32(x9); - c[10] = __builtin_bswap32(x10); - c[11] = __builtin_bswap32(x11); - c[12] = __builtin_bswap32(x12); - c[13] = __builtin_bswap32(x13); - c[14] = __builtin_bswap32(x14); - c[15] = __builtin_bswap32(x15); - - c += 16; - } -} - - -void chacha8_keysetup(struct chacha8_ctx *x, const uint8_t *k, uint32_t kbits, const uint8_t *iv) -{ - const char *constants; - - x->input[4] = U8TO32_LITTLE(k + 0); - x->input[5] = U8TO32_LITTLE(k + 4); - x->input[6] = U8TO32_LITTLE(k + 8); - x->input[7] = U8TO32_LITTLE(k + 12); - if (kbits == 256) { /* recommended */ - k += 16; - constants = sigma; - } else { /* kbits == 128 */ - constants = tau; - } - x->input[8] = U8TO32_LITTLE(k + 0); - x->input[9] = U8TO32_LITTLE(k + 4); - x->input[10] = U8TO32_LITTLE(k + 8); - x->input[11] = U8TO32_LITTLE(k + 12); - x->input[0] = U8TO32_LITTLE(constants + 0); - x->input[1] = U8TO32_LITTLE(constants + 4); - x->input[2] = U8TO32_LITTLE(constants + 8); - x->input[3] = U8TO32_LITTLE(constants + 12); - if (iv) { - x->input[14] = U8TO32_LITTLE(iv + 0); - x->input[15] = U8TO32_LITTLE(iv + 4); - } else { - x->input[14] = 0; - x->input[15] = 0; - } -} - -void chacha8_get_keystream(const struct chacha8_ctx *x, uint64_t pos, uint32_t n_blocks, uint8_t *c) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - uint32_t j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; - int i; - - j0 = x->input[0]; - j1 = x->input[1]; - j2 = x->input[2]; - j3 = x->input[3]; - j4 = x->input[4]; - j5 = x->input[5]; - j6 = x->input[6]; - j7 = x->input[7]; - j8 = x->input[8]; - j9 = x->input[9]; - j10 = x->input[10]; - j11 = x->input[11]; - j12 = pos; - j13 = pos >> 32; - j14 = x->input[14]; - j15 = x->input[15]; - - while (n_blocks--) { - x0 = j0; - x1 = j1; - x2 = j2; - x3 = j3; - x4 = j4; - x5 = j5; - x6 = j6; - x7 = j7; - x8 = j8; - x9 = j9; - x10 = j10; - x11 = j11; - x12 = j12; - x13 = j13; - x14 = j14; - x15 = j15; - for (i = 8; i > 0; i -= 2) { - QUARTERROUND(x0, x4, x8, x12); - QUARTERROUND(x1, x5, x9, x13); - 
QUARTERROUND(x2, x6, x10, x14); - QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15); - QUARTERROUND(x1, x6, x11, x12); - QUARTERROUND(x2, x7, x8, x13); - QUARTERROUND(x3, x4, x9, x14); - } - x0 = PLUS(x0, j0); - x1 = PLUS(x1, j1); - x2 = PLUS(x2, j2); - x3 = PLUS(x3, j3); - x4 = PLUS(x4, j4); - x5 = PLUS(x5, j5); - x6 = PLUS(x6, j6); - x7 = PLUS(x7, j7); - x8 = PLUS(x8, j8); - x9 = PLUS(x9, j9); - x10 = PLUS(x10, j10); - x11 = PLUS(x11, j11); - x12 = PLUS(x12, j12); - x13 = PLUS(x13, j13); - x14 = PLUS(x14, j14); - x15 = PLUS(x15, j15); - - j12 = PLUSONE(j12); - if (!j12) { - j13 = PLUSONE(j13); - /* stopping at 2^70 bytes per nonce is user's responsibility */ - } - - U32TO8_LITTLE(c + 0, x0); - U32TO8_LITTLE(c + 4, x1); - U32TO8_LITTLE(c + 8, x2); - U32TO8_LITTLE(c + 12, x3); - U32TO8_LITTLE(c + 16, x4); - U32TO8_LITTLE(c + 20, x5); - U32TO8_LITTLE(c + 24, x6); - U32TO8_LITTLE(c + 28, x7); - U32TO8_LITTLE(c + 32, x8); - U32TO8_LITTLE(c + 36, x9); - U32TO8_LITTLE(c + 40, x10); - U32TO8_LITTLE(c + 44, x11); - U32TO8_LITTLE(c + 48, x12); - U32TO8_LITTLE(c + 52, x13); - U32TO8_LITTLE(c + 56, x14); - U32TO8_LITTLE(c + 60, x15); - - c += 64; - } -} diff --git a/chia/chacha8.h b/chia/chacha8.h deleted file mode 100644 index c0d844d..0000000 --- a/chia/chacha8.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef SRC_CHACHA8_H_ -#define SRC_CHACHA8_H_ - -#include <stdint.h> /* (header name lost in extraction; stdint.h assumed for the uintN_t types below) */ - -struct chacha8_ctx { - uint32_t input[16]; -}; - -#ifdef __cplusplus -extern "C" { -#endif - -// blake... -/*#define NICK_ROTR32(w,c) \ - (((w) >> (c)) | ((w) << (32 - (c)))) - -#define NICK_G(a,b,c,d,x,y) \ - state[a] = state[a] + state[b] + x; \ - state[d] = NICK_ROTR32(state[d] ^ state[a], 16); \ - state[c] = state[c] + state[d]; \ - state[b] = NICK_ROTR32(state[b] ^ state[c], 12); \ - state[a] = state[a] + state[b] + y; \ - state[d] = NICK_ROTR32(state[d] ^ state[a], 8); \ - state[c] = state[c] + state[d]; \ - state[b] = NICK_ROTR32(state[b] ^ state[c], 7); \ - -#define NICK_LOAD32(block,i) \ - ((uint32_t)(block[i+0]) << 0) | ((uint32_t)(block[i+1]) << 8) | ((uint32_t)(block[i+2]) << 16) | ((uint32_t)(block[i+3]) << 24) -// end blake*/ - -#define U32TO32_LITTLE(v) (v) -#define U8TO32_LITTLE(p) (*(const uint32_t *)(p)) -#define U32TO8_LITTLE(p, v) (((uint32_t *)(p))[0] = U32TO32_LITTLE(v)) -#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n)))) - -#define ROTATE(v, c) (ROTL32(v, c)) -#define XOR(v, w) ((v) ^ (w)) -#define PLUS(v, w) ((v) + (w)) -#define PLUSONE(v) (PLUS((v), 1)) - -#define QUARTERROUND(a, b, c, d) \ - a = PLUS(a, b); \ - d = ROTATE(XOR(d, a), 16); \ - c = PLUS(c, d); \ - b = ROTATE(XOR(b, c), 12); \ - a = PLUS(a, b); \ - d = ROTATE(XOR(d, a), 8); \ - c = PLUS(c, d); \ - b = ROTATE(XOR(b, c), 7) - -#define BYTESWAP32(x) \ - x = (x & 0x0000FFFF) << 16 | (x & 0xFFFF0000) >> 16; \ - x = (x & 0x00FF00FF) << 8 | (x & 0xFF00FF00) >> 8 - -void chacha8_keysetup_data(uint32_t *input, const uint8_t *k, uint32_t kbits, const uint8_t *iv); -void chacha8_keysetup(struct chacha8_ctx *x, const uint8_t *k, uint32_t kbits, const uint8_t *iv); -void chacha8_get_k32_keystream_data(const uint32_t *input, uint64_t pos, uint32_t n_blocks, uint32_t *c); -void chacha8_get_keystream_data(const uint32_t *input,uint64_t pos,uint32_t n_blocks,uint8_t *c); -void chacha8_get_keystream( - const struct chacha8_ctx *x, - uint64_t pos, - uint32_t n_blocks, - uint8_t *c); - -#ifdef __cplusplus -} -#endif - -#endif // SRC_CHACHA8_H_ diff --git a/chia/util.hpp b/chia/util.hpp deleted file mode 100644 index f114e1f..0000000 --- a/chia/util.hpp +++ 
/dev/null @@ -1,378 +0,0 @@ -// Copyright 2018 Chia Network Inc - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef SRC_CPP_UTIL_HPP_ -#define SRC_CPP_UTIL_HPP_ - -#include <algorithm> // (the 15 header names here were lost in extraction; reconstructed from usage in this file) -#include <chrono> -#include <cmath> -#include <cstring> -#include <fstream> -#include <iomanip> -#include <iostream> -#include <map> -#include <queue> -#include <random> -#include <set> -#include <sstream> -#include <string> -#include <utility> -#include <vector> - -template <typename Int> -constexpr inline Int cdiv(Int a, int b) { return (a + b - 1) / b; } - -#ifdef _WIN32 -#define NOMINMAX -#include <windows.h> // (header names reconstructed -- lost in extraction) -#include <processthreadsapi.h> -#include "uint128_t.h" -#else -// __uint__128_t is only available in 64 bit architectures and on certain -// compilers. -typedef __uint128_t uint128_t; - -// Allows printing of uint128_t -std::ostream &operator<<(std::ostream &strm, uint128_t const &v) -{ - strm << "uint128(" << (uint64_t)(v >> 64) << "," << (uint64_t)(v & (((uint128_t)1 << 64) - 1)) - << ")"; - return strm; -} - -#endif - -// compiler-specific byte swap macros. -#if defined(_MSC_VER) - -#include <stdlib.h> // (header name reconstructed -- lost in extraction; the _byteswap intrinsics live in the MSVC CRT) - -// https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/byteswap-uint64-byteswap-ulong-byteswap-ushort?view=msvc-160 -inline uint16_t bswap_16(uint16_t x) { return _byteswap_ushort(x); } -inline uint32_t bswap_32(uint32_t x) { return _byteswap_ulong(x); } -inline uint64_t bswap_64(uint64_t x) { return _byteswap_uint64(x); } - -#elif defined(__clang__) || defined(__GNUC__) - -inline uint16_t bswap_16(uint16_t x) { return __builtin_bswap16(x); } -inline uint32_t bswap_32(uint32_t x) { return __builtin_bswap32(x); } -inline uint64_t bswap_64(uint64_t x) { return __builtin_bswap64(x); } - -#else -#error "unknown compiler, don't know how to swap bytes" -#endif - -/* Platform-specific cpuid include. 
 */ -#if defined(_WIN32) -#include <intrin.h> -#elif defined(__x86_64__) -#include <cpuid.h> -#endif - -class Timer { -public: - Timer() - { - wall_clock_time_start_ = std::chrono::steady_clock::now(); -#if _WIN32 - ::GetProcessTimes(::GetCurrentProcess(), &ft_[3], &ft_[2], &ft_[1], &ft_[0]); -#else - cpu_time_start_ = clock(); -#endif - } - - static char *GetNow() - { - auto now = std::chrono::system_clock::now(); - auto tt = std::chrono::system_clock::to_time_t(now); - return ctime(&tt); // ctime includes newline - } - - void PrintElapsed(const std::string &name) const - { - auto end = std::chrono::steady_clock::now(); - auto wall_clock_ms = std::chrono::duration_cast<std::chrono::milliseconds>( - end - this->wall_clock_time_start_) - .count(); - -#if _WIN32 - FILETIME nowft_[6]; - nowft_[0] = ft_[0]; - nowft_[1] = ft_[1]; - - ::GetProcessTimes(::GetCurrentProcess(), &nowft_[5], &nowft_[4], &nowft_[3], &nowft_[2]); - ULARGE_INTEGER u[4]; - for (size_t i = 0; i < 4; ++i) { - u[i].LowPart = nowft_[i].dwLowDateTime; - u[i].HighPart = nowft_[i].dwHighDateTime; - } - double user = (u[2].QuadPart - u[0].QuadPart) / 10000.0; - double kernel = (u[3].QuadPart - u[1].QuadPart) / 10000.0; - double cpu_time_ms = user + kernel; -#else - double cpu_time_ms = - 1000.0 * (static_cast<double>(clock()) - this->cpu_time_start_) / CLOCKS_PER_SEC; -#endif - - double cpu_ratio = static_cast<int>(10000 * (cpu_time_ms / wall_clock_ms)) / 100.0; - - std::cout << name << " " << (wall_clock_ms / 1000.0) << " seconds. CPU (" << cpu_ratio - << "%) " << Timer::GetNow(); - } - -private: - std::chrono::time_point<std::chrono::steady_clock> wall_clock_time_start_; -#if _WIN32 - FILETIME ft_[4]; -#else - clock_t cpu_time_start_; -#endif - -}; - -namespace Util { - - template <typename X> - inline X Mod(X i, X n) - { - return (i % n + n) % n; - } - - inline uint32_t ByteAlign(uint32_t num_bits) { return (num_bits + (8 - ((num_bits) % 8)) % 8); } - - inline std::string HexStr(const uint8_t *data, size_t len) - { - std::stringstream s; - s << std::hex; - for (size_t i = 0; i < len; ++i) - s << std::setw(2) << std::setfill('0') << static_cast<int>(data[i]); - s << std::dec; - return s.str(); - } - - inline void IntToTwoBytes(uint8_t *result, const uint16_t input) - { - uint16_t r = bswap_16(input); - memcpy(result, &r, sizeof(r)); - } - - // Used to encode deltas object size - inline void IntToTwoBytesLE(uint8_t *result, const uint16_t input) - { - result[0] = input & 0xff; - result[1] = input >> 8; - } - - inline uint16_t TwoBytesToInt(const uint8_t *bytes) - { - uint16_t i; - memcpy(&i, bytes, sizeof(i)); - return bswap_16(i); - } - - /* - * Converts a 64 bit int to bytes. - */ - inline void IntToEightBytes(uint8_t *result, const uint64_t input) - { - uint64_t r = bswap_64(input); - memcpy(result, &r, sizeof(r)); - } - - /* - * Converts a byte array to a 64 bit int. - */ - inline uint64_t EightBytesToInt(const uint8_t *bytes) - { - uint64_t i; - memcpy(&i, bytes, sizeof(i)); - return bswap_64(i); - } - - static void IntTo16Bytes(uint8_t *result, const uint128_t input) - { - uint64_t r = bswap_64(input >> 64); - memcpy(result, &r, sizeof(r)); - r = bswap_64((uint64_t)input); - memcpy(result + 8, &r, sizeof(r)); - } - - /* - * Retrieves the size of an integer, in Bits. - */ - inline uint8_t GetSizeBits(uint128_t value) - { - uint8_t count = 0; - while (value) { - count++; - value >>= 1; - } - return count; - } - - // 'bytes' points to a big-endian 64 bit value (possibly truncated, if - // (start_bit % 8 + num_bits > 64)). Returns the integer that starts at - // 'start_bit' that is 'num_bits' long (as a native-endian integer). 
- // - // Note: requires that 8 bytes after the first sliced byte are addressable - // (regardless of 'num_bits'). In practice it can be ensured by allocating - // extra 7 bytes to all memory buffers passed to this function. - inline uint64_t SliceInt64FromBytes( - const uint8_t *bytes, - uint32_t start_bit, - const uint32_t num_bits) - { - uint64_t tmp; - - if (start_bit + num_bits > 64) { - bytes += start_bit / 8; - start_bit %= 8; - } - - tmp = Util::EightBytesToInt(bytes); - tmp <<= start_bit; - tmp >>= 64 - num_bits; - return tmp; - } - - inline uint64_t SliceInt64FromBytesFull( - const uint8_t *bytes, - uint32_t start_bit, - uint32_t num_bits) - { - uint32_t last_bit = start_bit + num_bits; - uint64_t r = SliceInt64FromBytes(bytes, start_bit, num_bits); - if (start_bit % 8 + num_bits > 64) - r |= bytes[last_bit / 8] >> (8 - last_bit % 8); - return r; - } - - inline uint128_t SliceInt128FromBytes( - const uint8_t *bytes, - const uint32_t start_bit, - const uint32_t num_bits) - { - if (num_bits <= 64) - return SliceInt64FromBytesFull(bytes, start_bit, num_bits); - - uint32_t num_bits_high = num_bits - 64; - uint64_t high = SliceInt64FromBytesFull(bytes, start_bit, num_bits_high); - uint64_t low = SliceInt64FromBytesFull(bytes, start_bit + num_bits_high, 64); - return ((uint128_t)high << 64) | low; - } - - inline void GetRandomBytes(uint8_t *buf, uint32_t num_bytes) - { - std::random_device rd; - std::mt19937 mt(rd()); - std::uniform_int_distribution<int> dist(0, 255); // (template argument lost in extraction; int assumed) - for (uint32_t i = 0; i < num_bytes; i++) { - buf[i] = dist(mt); - } - } - - inline uint64_t ExtractNum( - const uint8_t *bytes, - uint32_t len_bytes, - uint32_t begin_bits, - uint32_t take_bits) - { - if ((begin_bits + take_bits) / 8 > len_bytes - 1) { - take_bits = len_bytes * 8 - begin_bits; - } - return Util::SliceInt64FromBytes(bytes, begin_bits, take_bits); - } - - // The number of memory entries required to do the custom SortInMemory algorithm, given the - // total number of entries to be sorted. - inline uint64_t RoundSize(uint64_t size) - { - size *= 2; - uint64_t result = 1; - while (result < size) result *= 2; - return result + 50; - }
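 - // (Editor's annotation, not original code.) Worked example of SliceInt64FromBytes
 - // above: for bytes = { 0xAB, 0xCD, ... }, start_bit = 4, num_bits = 8,
 - // EightBytesToInt loads the 8 bytes big-endian, tmp <<= 4 drops the leading
 - // nibble, and tmp >>= (64 - 8) keeps the top byte, returning 0xBC.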
- /* - * Like memcmp, but only compares starting at a certain bit. - */ - inline int MemCmpBits( - uint8_t *left_arr, - uint8_t *right_arr, - uint32_t len, - uint32_t bits_begin) - { - uint32_t start_byte = bits_begin / 8; - uint8_t mask = ((1 << (8 - (bits_begin % 8))) - 1); - if ((left_arr[start_byte] & mask) != (right_arr[start_byte] & mask)) { - return (left_arr[start_byte] & mask) - (right_arr[start_byte] & mask); - } - - for (uint32_t i = start_byte + 1; i < len; i++) { - if (left_arr[i] != right_arr[i]) - return left_arr[i] - right_arr[i]; - } - return 0; - } - - inline double RoundPow2(double a) - { - // https://stackoverflow.com/questions/54611562/truncate-float-to-nearest-power-of-2-in-c-performance - int exp; - double frac = frexp(a, &exp); - if (frac > 0.0) - frac = 0.5; - else if (frac < 0.0) - frac = -0.5; - double b = ldexp(frac, exp); - return b; - } - -#if defined(_WIN32) || defined(__x86_64__) - void CpuID(uint32_t leaf, uint32_t *regs) - { -#if defined(_WIN32) - __cpuid((int *)regs, (int)leaf); -#else - __get_cpuid(leaf, &regs[0], &regs[1], &regs[2], &regs[3]); -#endif /* defined(_WIN32) */ - } - - bool HavePopcnt(void) - { - // EAX, EBX, ECX, EDX - uint32_t regs[4] = {0}; - - CpuID(1, regs); - // Bit 23 of ECX indicates POPCNT instruction support - return (regs[2] >> 23) & 1; - } -#endif /* defined(_WIN32) || defined(__x86_64__) */ - - inline uint64_t PopCount(uint64_t n) - { -#if defined(_WIN32) - return __popcnt64(n); -#elif defined(__x86_64__) - uint64_t r; - __asm__("popcnt %1, %0" : "=r"(r) : "r"(n)); - return r; -#else - return __builtin_popcountl(n); -#endif /* defined(_WIN32) ... defined(__x86_64__) */ - } -} - -#endif // SRC_CPP_UTIL_HPP_ diff --git a/drplotter.cu b/drplotter.cu deleted file mode 100644 index 4d455c8..0000000 --- a/drplotter.cu +++ /dev/null @@ -1,3662 +0,0 @@ -/* - ============================================================================ - Name : drplotter.cu - Author : NH - Version : - Copyright : Your copyright notice - Description : CUDA compute reciprocals - ============================================================================ - */ - -#include <cstdio> // (these 12 header names were lost in extraction; reconstructed guesses based on usage) -#include <cstdlib> -#include <cstring> -#include <cstdint> -#include <iostream> -#include <chrono> -#include <vector> - -// for mmap -#include <sys/types.h> -#include <sys/stat.h> -#include <fcntl.h> -#include <sys/mman.h> /* mmap() is defined in this header */ -#include <unistd.h> - -#include "chia/util.hpp" -#include "chia/chacha8.h" -#include "nick_globals.hpp" -#include "attack.hpp" -#include "phase2.hpp" - - - -const uint16_t THREADS_FOR_MATCHING = 256; // 386 is 10600ms matching. 256 is 9761ms matching. 
237 is...10109 - -int cmd_read = 0; - -using milli = std::chrono::milliseconds; -int64_t total_gpu_time_ms = 0; -int64_t total_transfer_in_time_ms = 0; -int64_t total_transfer_out_time_ms = 0; -int64_t total_chacha_time_ms = 0; -int64_t total_match_time_ms = 0; -uint64_t total_transfer_in_bytes = 0; -uint64_t total_transfer_out_bytes = 0; -int64_t table_gpu_time_ms = 0; -int64_t table_transfer_in_time_ms = 0; -int64_t table_transfer_out_time_ms = 0; -int64_t table_match_time_ms = 0; -uint64_t table_transfer_in_bytes = 0; -uint64_t table_transfer_out_bytes = 0; - -// global memory -char *host_criss_cross_blocks; // aka host_meta_blocks -char *host_refdata_blocks; -char *device_buffer_A; -char *device_buffer_B; - -char *device_buffer_C; -char *device_buffer_T3_base; -char *device_buffer_refdata; - -int* device_block_entry_counts; // [BATCHES]; -int* device_local_kbc_num_entries; -uint32_t host_criss_cross_entry_counts[BATCHES * BATCHES]; // kbc counts for each block - - -#include "nick_blake3.hpp" - - -template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT> // (template parameter list reconstructed -- lost in extraction) -__global__ -void gpu_find_tx_matches_calc_only(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) { - // match: 25804 ms - // phase 1: 4366ms - __shared__ uint Lys[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ uint Rys[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ Index_Match matches[512];//KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ int total_matches; - __shared__ int yr_yl_bid_m_results[kB*2]; - __shared__ int yr_yl_cid_mod_kC[kC*2]; - - - uint32_t kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - const uint8_t doPrint = 1;//(global_kbc_L_bucket_id < 10) ? 1 : 0; // start_kbc_L > 0 ? 1: 0; // 0 is none, 1 is basic, 2 is detailed - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - int numThreadsInBlock = blockDim.x; - int threadId = threadIdx.x; - int threadStartScan = threadId; - int threadSkipScan = numThreadsInBlock; - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R]; - - if (num_L == 0) { - return; - } - - for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - Rys[pos_R] = (R_entry.y / kC) + ((R_entry.y % kC) << 8); // do mod and div entries too in bitmask. - }
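 - // (Editor's annotation, not original code.) The packing above stores each
 - // y (already reduced mod kBC, so in [0, 15113)) as bid + (cid << 8), where
 - // bid = y / kC fits in 8 bits (kB = 119) and cid = y % kC is in [0, 127).
 - // Example: y = 1000 -> bid = 7, cid = 111, packed value = 7 + (111 << 8) = 28423.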
- for (int pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - Lys[pos_L] = (L_entry.y / kC) + ((L_entry.y % kC) << 8); - } - const int16_t parity = (global_kbc_L_bucket_id) % 2; - for (int i=threadIdx.x;i<kB*2;i+=blockDim.x) { // (precompute reconstructed -- original lost in extraction; the packing is inferred from the reads below) - int16_t m = i % kB; - yr_yl_bid_m_results[i] = (m << 8) + ((((2 * m) + parity) * ((2 * m) + parity)) % kC); - } - for (int i=threadIdx.x;i<kC*2;i+=blockDim.x) { - yr_yl_cid_mod_kC[i] = i % kC; - } - __syncthreads(); - - // For any 0 <= m < kExtraBitsPow: - // yl / kBC + 1 = yR / kBC AND - // (yr % kBC) / kC - (yl % kBC) / kC = m (mod kB) -> MEANS yr/kC can only match with the 64 slots including and to the right of yl/kC - // (yr % kBC) % kC - (yl % kBC) % kC = (2m + (yl/kBC) % 2)^2 (mod kC) - - for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - const uint yr_data = Rys[pos_R]; - //int16_t yr_kbc = yr_data;// & 0b01111111111111111; - const int16_t yr_bid = yr_data & 0b011111111; // yr_kbc / kC; // values [0..kB] - const int16_t yr_cid = (yr_data >> 8);//yr_kbc % kC;//(yr_data >> 24); - for (int pos_L = 0; pos_L < num_L; pos_L++) { - // do L_entry and R_entry match? - const uint yl_data = Lys[pos_L]; - //int16_t yl_kbc = yl_data;// & 0b01111111111111111; - const int8_t yl_bid = yl_data & 0b011111111; //yl_kbc / kC; values [0..kB] - const int8_t yl_cid = yl_data >> 8;//yl_kbc % kC;//(yl_data >> 24); - - int16_t m_results = yr_yl_bid_m_results[yr_bid-yl_bid+kB]; - int16_t m = m_results >> 8;//& 0b011111111; - int16_t m2_parity_squared = (m_results & 0b011111111); - int16_t formula_two = yr_yl_cid_mod_kC[yr_cid - yl_cid + kC]; - - //int16_t formula_one = yr_bid - yl_bid; // this should actually give m - //if (formula_one < 0) { - // formula_one += kB; - //} - //int16_t m = formula_one; - //if (m >= kB) { - // m -= kB; - //} - - //int16_t m = (yr_bid - yl_bid); - //if (m < 0) m+=kB; - //if (m >= kB) m-=kB; - - //if (m < 64) { - // passed first test - //const int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127] - //int16_t formula_two = yr_cid - yl_cid; - //if (formula_two < 0) formula_two += kC; - - if ((m < 64) && (formula_two == m2_parity_squared)) { - // we have a match. - int num_matches = atomicAdd(&total_matches,1); - //if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - // printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - //} else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R;//value >> 4; - matches[num_matches] = match; - //} - } - //} - /* - - - - - uint16_t m = (yr_bid - yl_bid) % kB; // 77ms w/o branch mod test, big jump w/ mod. 
- 158ms - uint16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; - uint16_t formula_two = (yr_cid - yl_cid) % kC; - //if (m < 0) { - // m += kB; - //}// else if (m >= kB) m-=kB; - if ((m < 64) && (m2_parity_squared == formula_two)) { - //uint16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; - //uint16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; - int num_matches = atomicAdd(&total_matches,1); - if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - } else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R;//value >> 4; - matches[num_matches] = match; - } - }*/ - } - } - - /*for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - int16_t yr_kbc = R_entry.y; - int16_t yr_bid = yr_kbc / kC; // values [0..kB] - for (uint16_t pos_L = 0; pos_L < num_L; pos_L++) { - // do L_entry and R_entry match? - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - int16_t yl_kbc = L_entry.y; - int16_t yl_bid = yl_kbc / kC; // values [0..kB] - int16_t formula_one = yr_bid - yl_bid; // this should actually give m - if (formula_one < 0) { - formula_one += kB; - } - int16_t m = formula_one; - if (m >= kB) { - m -= kB; - } - if (m < 64) { - // passed first test - int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC perfectly divisible by kC - int16_t yr_cid = yr_kbc % kC; - int16_t parity = (global_kbc_L_bucket_id) % 2; - int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127] - int16_t formula_two = yr_cid - yl_cid; - if (formula_two < 0) { - formula_two += kC; - } - if (formula_two == m2_parity_squared) { - // we have a match. - int num_matches = atomicAdd(&total_matches,1); - if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - } else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R;//value >> 4; - matches[num_matches] = match; - } - } - } - } - } - - __syncthreads();*/ - - - if (threadIdx.x == 0) { - if (doPrint>1) { - // only do this once, should be in constant memory - //if (doPrint>2) { - // printf("match list\n"); - // for (int i=0;i<total_matches;i++) printf("[L:%u R:%u] ", matches[i].idxL, matches[i].idxR); // (debug loop reconstructed -- original lost in extraction) - //} - } - if (total_matches > KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET); - total_matches = KBC_MAX_ENTRIES_PER_BUCKET; - } - } - - __syncthreads(); - - // now we go through all our matches and output to next round. 
- for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) { - Index_Match match = matches[i]; - BUCKETED_ENTRY_OUT pair = {}; - BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL]; - BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR]; - uint64_t blake_result; - uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id); - if (table == 1) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = R_Entry.meta[0]; - //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms - nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL); - //if (global_kbc_L_bucket_id == 1) { - //if ((calc_y == 21557) && (L_Entry.meta[0] == 3620724289) && (R_Entry.meta[0] == 2663198278)) { - // printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result); - //Ly is:[20932] Lx: [322482289] Rx: [3382886636] f result:[273114646565] - //if (blake_result == 56477140042) { - // printf(" ---** BLAKE CORRECT **\n"); - //} else { - // printf(" ---** BLAKE WRONG :(((( \n"); - //} - // Ly is:[21557] Lx: [3620724289] Rx: [2663198278] f result:[56477140042] - //} - //} - - } else if (table == 2) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = L_Entry.meta[1]; - pair.meta[2] = R_Entry.meta[0]; - pair.meta[3] = R_Entry.meta[1]; - nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL); - if (global_kbc_L_bucket_id == 1) { - uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1]; - uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3]; - printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result); - } - } else if (table == 3) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta); - } else if (table == 4) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta); - } else if (table == 5) { - const uint32_t meta[6] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], - }; - nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta); - } else if (table == 6) { - const uint32_t meta[4] = { - L_Entry.meta[0], L_Entry.meta[1], - R_Entry.meta[0], R_Entry.meta[1] - }; - nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL); - } - if (table < 6) { - uint64_t batch_bucket = blake_result >> (38-6); - const uint64_t block_mod = (uint64_t) 1 << (38-6); - pair.y = (uint32_t) (blake_result % block_mod); - int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1); - uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES + block_slot; - if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) { - printf("ERROR: results address overflow\n"); - } else { - bucketed_out[pair_address] = pair; - } - } - - // do we have a double bucket to write into? 
- //uint32_t double_bucket_id = 0; - //uint32_t kbc_bucket_id = blake_result / kBC; - //uint64_t batch_bucket_min_kbc = (batch_bucket << 32) / kBC; - //uint64_t batch_bucket_max_kbc = ((batch_bucket+1) << 32) / kBC; - //if (kbc_bucket_id == batch_bucket_min_kbc) { - // double_bucket_id = batch_bucket - 1; - //} else if (kbc_bucket_id == batch_bucket_max_kbc) { - // double_bucket_id = batch_bucket + 1; - //} - } - - //if (threadIdx.x == 0) { - //if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - //if ((global_kbc_L_bucket_id % 25000 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - //} - /* - kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219 - kBC bucket id: 1 L entries: 242 R entries: 257 matches: 248 - kBC bucket id: 2 L entries: 257 R entries: 204 matches: 222 - kBC bucket id: 3 L entries: 204 R entries: 243 matches: 185 - Total matches: 4294859632 - - Computing table 3 - Bucket 0 uniform sort. Ram: 7.678GiB, u_sort min: 2.250GiB, qs min: 0.563GiB. - kBC bucket id: 0 L entries: 228 R entries: 253 matches: 276 - kBC bucket id: 1 L entries: 253 R entries: 230 matches: 227 - kBC bucket id: 2 L entries: 230 R entries: 232 matches: 212 - kBC bucket id: 3 L entries: 232 R entries: 237 matches: 221 - Total matches: 4294848520 - */ - if (threadIdx.x == 0) { - if (table == 1) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==222) && (num_R==242) && (total_matches==219)) { - printf("- TABLE 1 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 1 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 240 R entries: 233 matches: 232 - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==240) && (num_R==233) && (total_matches==232)) { - printf("- TABLE 1 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 1 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - if (table == 2) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==228) && (num_R==253) && (total_matches==276)) { - printf("- TABLE 2 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 2 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 241 R entries: 238 matches: 224 - - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==241) && (num_R==238) && (total_matches==224)) { - printf("- TABLE 2 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 2 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - } - // table 1 4865ms match time to beat. - // with shared mem for pos_L/R is 3942 - win! - // formula improvement (one branch) - 3810ms - // removal of max kbc test in m loop - 3639ms +33% faster. - // shared compute buffers to prevent % and division - 2280ms! 
- // -- now getting dangerously close to best algo time of 1606ms :) -} - - -template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT> // (template parameter list reconstructed -- lost in extraction) -__global__ -void gpu_find_tx_matches_test(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) { - // T1 match: 1714 ms -> with delaying extras: 1630 - //Total tables time: 73726 ms - // match: 10015 ms -> 9705ms with delaying extras - __shared__ int total_matches; - - int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - uint8_t doPrint = 2; - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R]; - - if (threadIdx.x == 0) { - total_matches = 0; - if (doPrint > 1) { - printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R); - if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) { - printf("ERROR numL or numR > max entries\n"); - return; - } - if ((num_L == 0) || (num_R == 0) ) { - printf("ERROR: numL or numR is 0\n"); - return; - } - } - } - - // For any 0 <= m < kExtraBitsPow: - // yl / kBC + 1 = yR / kBC AND - // (yr % kBC) / kC - (yl % kBC) / kC = m (mod kB) -> MEANS (1) yr/kC can only match with the 64 slots including and to the right of yl/kC - // (yr % kBC) % kC - (yl % kBC) % kC = (2m + (yl/kBC) % 2)^2 (mod kC) - - // yr_kc's : [0..127] -> contains what? Either y-value, then compute matching m, or contains %kC - // if /kC distance yr to yl is 5, m = 5, then diff %kC must be (20^2)%kC = 400 % kC = - - // 000001111111111000000 yl1 - // 000111111111100000000 y12 - // 000000011111111111000 yl3 - - const uint16_t parity = global_kbc_L_bucket_id % 2; - for (int16_t Ry = threadIdx.x; Ry < kBC; Ry+=blockDim.x) { - int16_t yr_kbc = Ry; - int16_t yr_bid = yr_kbc / kC; // values [0..kB] - for (int16_t Ly = 0; Ly < kBC; Ly++) { - int16_t yl_kbc = Ly; - int16_t yl_bid = yl_kbc / kC; // values [0..kB] - int16_t formula_one = yr_bid - yl_bid; // this should actually give m - if (formula_one < 0) { - formula_one += kB; - } - int16_t m = formula_one; - if (m >= kB) { - m -= kB; - } - if (m < 64) { - // passed first test - int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC perfectly divisible by kC - int16_t yr_cid = yr_kbc % kC; - int16_t parity = (global_kbc_L_bucket_id) % 2; - int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127] - int16_t formula_two = yr_cid - yl_cid; - if (formula_two < 0) { - formula_two += kC; - } - if (formula_two == m2_parity_squared) { - // we have a match. - printf("match Ly:%u Ry:%u\n", Ly, Ry); - 
atomicAdd(&total_matches,1); - } - } - } - } - if (threadIdx.x == 0) { - printf("Done. Total matches: %u\n", total_matches); - } - -} - - -template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT> // (template parameter list reconstructed -- lost in extraction) -__global__ -void gpu_find_tx_matches(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) { - // T1 match: 1714 ms -> with delaying extras: 1630 - //Total tables time: 73726 ms - // match: 10015 ms -> 9705ms with delaying extras - const uint16_t NUM_RMAPS = (kBC/2)+1; - __shared__ int nick_rmap[NUM_RMAPS]; // positions and counts. Use 30 bits, 15 bits each entry with lower 9 bits for pos, 1024+ for count - __shared__ uint32_t nick_rmap_extras_rl[32]; - __shared__ uint16_t nick_rmap_extras_ry[32]; - __shared__ uint16_t nick_rmap_extras_pos[32]; - __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ int total_matches; - __shared__ int num_extras; - __shared__ int y_duplicate_counts; - - int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - uint8_t doPrint = 1; - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - int numThreadsInBlock = blockDim.x; - int threadId = threadIdx.x; - int threadStartScan = threadId; - int threadSkipScan = numThreadsInBlock; - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R]; - - if (threadIdx.x == 0) { - total_matches = 0; - num_extras = 0; - y_duplicate_counts = 0; - if (doPrint > 1) { - printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R); - if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) { - printf("ERROR numL or numR > max entries\n"); - return; - } - if ((num_L == 0) || (num_R == 0) ) { - printf("ERROR: numL or numR is 0\n"); - return; - } - } - } - // unfortunately to clear we have to do this - for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) { - nick_rmap[i] = 0; - } - __syncthreads(); // all written initialize data should sync - - - /*bool printandquit = ((global_kbc_L_bucket_id == 0)); - if (printandquit) { - if (threadIdx.x == 0) { - - printf("R_y list:\n"); - for (size_t pos_R = 0; pos_R < num_R; pos_R++) { - uint16_t r_y = kbc_R_entries[pos_R].y; - printf("[x:%u y:%u]\n",kbc_R_entries[pos_R].meta[0], r_y); - } - printf("L_y list num %u:\n", num_L); - for (size_t pos_L = 0; pos_L < num_L; pos_L++) { - uint16_t l_y = kbc_L_entries[pos_L].y; - printf("[x:%u y:%u]\n",kbc_L_entries[pos_L].meta[0], l_y); - } - } - }*/ - //__syncthreads(); - uint16_t parity = global_kbc_L_bucket_id % 2; - - - for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - uint16_t r_y = R_entry.y; - - // r_y's share a block across two adjacent values, so kbc_map just works out which part it's in. 
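 - // (Editor's annotation, not original code.) Concretely: each 32-bit nick_rmap
 - // word holds two 15-bit boxes, one per adjacent r_y value; within a box the
 - // low 9 bits hold the first pos_R seen and bits 10+ hold the count (hence
 - // add = 1024 << kbc_box_shift below). Example: r_y = 7 -> kbc_map = 3, upper
 - // box (shift 15); the first writer contributes 1024 (count = 1) plus its pos_R.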
- int kbc_map = r_y / 2; - const int kbc_box_shift = (r_y % 2) * 15; - int add = 1024 << kbc_box_shift; // we add from 10th bit up (shifted by the box it's in) - - int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above) - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - if (rmap_value == 0) { - // if we added to an empty spot, what we do is add the pos_R here in the lower 9 bits of the box - // and ONLY for this one. - atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift)); - //if (printandquit) { - // printf("r_y: %u pos:%u\n", r_y, pos_R); - //} - } else { - // we hit duplicate entry...add this to a row - int slot = atomicAdd(&num_extras, 1); - nick_rmap_extras_ry[slot] = r_y; - nick_rmap_extras_pos[slot] = pos_R; - } - - } - - __syncthreads(); // wait for all threads to write r_bid entries - - // benchmark: 66ms at this point - //if ((nick_rmap_extras_ry[threadIdx.x % 32] + nick_rmap_extras_pos[threadIdx.x % 32]) == 2334534423) printf("bogus"); - //return; - - // load parity tables into shared - /*if (printandquit) { - if (threadIdx.x == 0) { - printf("num extras bucket %u : %u parity: %u \n", global_kbc_L_bucket_id, num_extras, parity); - - for (int i=0;i<kBC;i++) { (loop header and box extraction reconstructed -- original lost in extraction) - int kbc_map = i / 2; - const int kbc_box_shift = (i % 2) * 15; - int rmap_value = nick_rmap[kbc_map]; - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - - //uint16_t rmap_value = nick_rmap[i]; - uint16_t pos = (rmap_value & 0b0111111111); - if (rmap_value > 0) { - printf("kbc:%i value:%u pos:%u\n", i, rmap_value, pos); - } - } - - } - - } - __syncthreads();*/ - - - for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L]; - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - uint16_t l_y = L_entry.y; - uint16_t indJ = l_y / kC; - //printf("scanning for pos_L: %u\n", pos_L); - - // this part is killer, this does add bulk of time. - // weird simplifying the math doesn't help much unless you pragma unroll it - // might be too much branching inside too. - // setup code for loop increment "optimization" - //uint16_t indJ_mod_kB_times_kC = ((indJ + 0) % kB) * kC; - //uint16_t start_parity_add = 4 + parity * 4; - //uint16_t parity_base = (parity + l_y) % kC; - //const uint16_t m_switch_kb = kB - indJ; // calculate point at which indJ + m is %kb! - for (int m=0;m<64;m++) { - - //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup - // is super-inefficient. 
- - // 27.58ms - uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC); - - - - - - // a cute "optimization" but saves no time whatsoever...27.7ms instead of 27.58ms :/ - //if (m_switch_kb == m) indJ_mod_kB_times_kC = ((indJ + m) % kB) * kC; // 323ms // 490 - //uint16_t r_target = indJ_mod_kB_times_kC + parity_base; - //indJ_mod_kB_times_kC += kC; // 256ms - //parity_base += start_parity_add; - //if (parity_base >= kC) parity_base -= kC; - //start_parity_add += 8; - //if (start_parity_add >= kC) start_parity_add -= kC; - //if (test_target != r_target) { - // printf("Ly: %u m: %u target: %u test_target: %u \n", l_y, m, r_target, test_target); - //} - - - //if (r_target + indJ == m) bogus_match_counter++; - //if (bogus_match_counter >= KBC_MAX_ENTRIES_PER_BUCKET) { - // printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, bogus_match_counter); - //} - - // find which box our r_target is in, extract the 15-bit value from that box - int kbc_map = r_target / 2; - const int kbc_box_shift = (r_target % 2) * 15; - int rmap_value = nick_rmap[kbc_map]; - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - - if (rmap_value > 0) { - // the pos_R is the lower 9 bits of that 15-bit boxed value - uint16_t pos_R = rmap_value & 0b0111111111; - uint16_t count = rmap_value / 1024; - - //if (printandquit) { - // printf("L_y: %u r_target hit: %u pos_R:%u\n", l_y, r_target, pos_R); - //} - int num_matches = atomicAdd(&total_matches,1);//count); - if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - } else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R; - matches[num_matches] = match; - - - // handle edge cases - // TODO: let's push these into separate array - // then test them later. - if (count > 1) { - int slot = atomicAdd(&y_duplicate_counts, 1); - nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L; - // add the extras - /*int extra_match = 0; - for (int slot = 0; slot < num_extras; slot++) { - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - match.idxR = extra_pos_R;//value >> 4; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //extra_match++; - //matches[num_matches+extra_match] = match; - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - }*/ - - //if (global_kbc_L_bucket_id < 10) { - // if (extra_match != count-1) { - // printf("ERRORRRR! EXTRA MATCHES %u DOES NOT MATCH COUNT-1 %u\n", extra_match, count); - // } else { - // printf("BUCKET L %u SUCCESSFULLY ADDED EXTRA COUNTS %u\n", global_kbc_L_bucket_id, count); - // } - //} - } - } - } - } - } - - __syncthreads(); - - // up until this point matching takes 976ms total for k32 - // it's 936ms with only the total matches counter (so about 40ms for appending match data) - // 745ms with a bogus counter (so no shared atomic conflict) - // it's 586ms with only m computations and bogus counter (no lookups) - so rmap lookups add 140ms - // it's 128ms with only 1m -- so calculations are adding 460ms!!! - // in summary: - // -- 460ms : m loop calculations - more so the actual m loop than the math inside!
- // -- 140ms : rmap lookups (bank conflict improvements possible) - // -- 128ms : data reads - // - 66ms rmap setup - // - 62ms reading y values back in - // -- 40ms : match atomic shared counter (vs non atomic shared counter) - //if (threadIdx.x == 0) { - // if (total_matches == 1342343) printf("bogus"); - //} - //return; - - // do the extras - - //int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add! - for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) { - for (int dup=0; dup<y_duplicate_counts; dup++) { - uint32_t value = nick_rmap_extras_rl[dup]; - uint16_t r_target = value >> 16; - uint16_t pos_L = value & 0x0FFFF; - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = extra_pos_R; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //matches[total_matches+slot] = match; - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - } - } - - __syncthreads(); - - if (threadIdx.x == 0) { - if (doPrint>1) { - // only do this once, should be in constant memory - //if (doPrint>2) { - // printf("match list\n"); - // for (int i=0;i<total_matches;i++) { - // printf("[L:%u R:%u]\n", matches[i].idxL, matches[i].idxR); - // } - //} - printf("Total matches: %u\n", total_matches); - } - if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) { - printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1); - total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1); - } - } - - __syncthreads(); - - // now we go through all our matches and output to next round. - for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) { - Index_Match match = matches[i]; - BUCKETED_ENTRY_OUT pair = {}; - BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL]; - BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR]; - uint64_t blake_result; - uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id); - if (table == 1) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = R_Entry.meta[0]; - //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms - - blake_result = 23; - nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL); - - //if (global_kbc_L_bucket_id == 1) { - //printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result); - //} - - } else if (table == 2) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = L_Entry.meta[1]; - pair.meta[2] = R_Entry.meta[0]; - pair.meta[3] = R_Entry.meta[1]; - nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL); - //if (global_kbc_L_bucket_id == 1) { - // uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1]; - // uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3]; - // printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result); - //} - } else if (table == 3) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta); - } else if (table == 4) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta); - } else if (table == 5) { - const uint32_t meta[6] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], - }; - nick_blake3(meta, 6, calc_y, &blake_result, 2,
pair.meta); - } else if (table == 6) { - const uint32_t meta[4] = { - L_Entry.meta[0], L_Entry.meta[1], - R_Entry.meta[0], R_Entry.meta[1] - }; - nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL); - } - if (table < 6) { - uint64_t batch_bucket = blake_result >> (38-6); // 27.52ms for 1/64 of kbcs - //uint64_t batch_bucket = threadIdx.x % 64; // 25.3ms with blake computation, 20ms without. So blake adds 5ms for 1/64 of values; - //uint64_t batch_bucket = 0; // 18ms per 1/64 of values, and our block counts aren't even optimized since global locking on atomic adds - // so...in theory could reduce from 27ms time down to sub 18ms, and then do blake pass on seperate scan, which *should* be faster. - // since we write less blocks/data in here - const uint64_t block_mod = (uint64_t) 1 << (38-6); - pair.y = (uint32_t) (blake_result % block_mod); - int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1); - uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES + block_slot; - //if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) { - // printf("ERROR: results address overflow\n"); - //} else { - // up to here takes 1508ms. Seems 1508-976 = 532ms for blake results - // quite substantial! - bucketed_out[pair_address] = pair; - // including the write-out is 1696ms - //} - } - - // do we have a double bucket to write into? - //uint32_t double_bucket_id = 0; - //uint32_t kbc_bucket_id = blake_result / kBC; - //uint64_t batch_bucket_min_kbc = (batch_bucket << 32) / kBC; - //uint64_t batch_bucket_max_kbc = ((batch_bucket+1) << 32) / kBC; - //if (kbc_bucket_id == batch_bucket_min_kbc) { - // double_bucket_id = batch_bucket - 1; - //} else if (kbc_bucket_id == batch_bucket_max_kbc) { - // double_bucket_id = batch_bucket + 1; - //} - } - - if ((doPrint >=1) && (threadIdx.x == 0)) { - //if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - if ((global_kbc_L_bucket_id % 1000000 == 0) || (global_kbc_L_bucket_id < 10)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - - } - /* - kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219 - kBC bucket id: 1 L entries: 242 R entries: 257 matches: 248 - kBC bucket id: 2 L entries: 257 R entries: 204 matches: 222 - kBC bucket id: 3 L entries: 204 R entries: 243 matches: 185 - Total matches: 4294859632 - - Computing table 3 - Bucket 0 uniform sort. Ram: 7.678GiB, u_sort min: 2.250GiB, qs min: 0.563GiB. - kBC bucket id: 0 L entries: 228 R entries: 253 matches: 276 - kBC bucket id: 1 L entries: 253 R entries: 230 matches: 227 - kBC bucket id: 2 L entries: 230 R entries: 232 matches: 212 - kBC bucket id: 3 L entries: 232 R entries: 237 matches: 221 - Total matches: 4294848520 - */ - if ((doPrint >= 1) && (threadIdx.x == 0)) { - if (table == 1) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==222) && (num_R==242) && (total_matches==219)) { - printf("- TABLE 1 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 1 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 240 R entries: 233 matches: 232 - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==240) && (num_R==233) && (total_matches==232)) { - printf("- TABLE 1 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 1 bucket 4000000 MATCHES WRONG! 
num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - if (table == 2) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==228) && (num_R==253) && (total_matches==276)) { - printf("- TABLE 2 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 2 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 241 R entries: 238 matches: 224 - - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==241) && (num_R==238) && (total_matches==224)) { - printf("- TABLE 2 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 2 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } -} - -template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT> -__global__ -void gpu_find_tx_matches_direct_to_host(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - char *host_criss_cross, int *out_bucket_counts) { - // T1 match: 1714 ms -> with delaying extras: 1630 - //Total tables time: 73726 ms - // match: 10015 ms -> 9705ms with delaying extras - const uint16_t NUM_RMAPS = (kBC/2)+1; - __shared__ int nick_rmap[NUM_RMAPS]; // positions and counts. Use 30 bits, 15 bits each entry with lower 9 bits for pos, 1024+ for count - __shared__ uint32_t nick_rmap_extras_rl[32]; - __shared__ uint16_t nick_rmap_extras_ry[32]; - __shared__ uint16_t nick_rmap_extras_pos[32]; - __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ int total_matches; - __shared__ int num_extras; - __shared__ int y_duplicate_counts; - - int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - uint8_t doPrint = 1; - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - int numThreadsInBlock = blockDim.x; - int threadId = threadIdx.x; - int threadStartScan = threadId; - int threadSkipScan = numThreadsInBlock; - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R]; - - if (threadIdx.x == 0) { - total_matches = 0; - num_extras = 0; - y_duplicate_counts = 0; - if (doPrint > 1) { - printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R); - if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) { - printf("ERROR numL or numR > max entries\n"); - return; - } - if ((num_L == 0) || (num_R == 0) ) { - printf("ERROR: numL and numR are 0\n"); - return; - } - } - } - // unfortunately to clear we have to do this - for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) { - nick_rmap[i] = 0; - } - __syncthreads(); // all written initialize data should sync - - //bool printandquit = ((global_kbc_L_bucket_id == 75000)); - - - - - // if (printandquit) { - //printf("R_y list:\n"); - //for (size_t pos_R = 0; pos_R < num_R; pos_R++) { - // uint16_t r_y = kbc_R_entries[pos_R].y; - // printf("%u\n",r_y); - //} - //if (threadIdx.x == 0) { - // printf("L_y list num %u:\n",
num_L); - // for (size_t pos_L = 0; pos_L < num_L; pos_L++) { - // uint16_t l_y = kbc_L_entries[pos_L].y; - // printf("%u\n",l_y); - // } - //} - // } - //__syncthreads(); - uint16_t parity = global_kbc_L_bucket_id % 2; - - for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - uint16_t r_y = R_entry.y; - - // r_y's share a block across two adjacent values, so kbc_map just works out which part it's in. - int kbc_map = r_y / 2; - const int kbc_box_shift = (r_y % 2) * 15; - int add = 1024 << kbc_box_shift; // we add from 10th bit up (shifted by the box it's in) - - int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above) - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - if (rmap_value == 0) { - // if we added to an empty spot, what we do is add the pos_R here in the lower 9 bits of the box - // and ONLY for this one. - atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift)); - //if (printandquit) { - // printf("r_y: %u pos:%u\n", r_y, pos_R); - //} - } else { - // we hit duplicate entry...add this to a row - int slot = atomicAdd(&num_extras, 1); - nick_rmap_extras_ry[slot] = r_y; - nick_rmap_extras_pos[slot] = pos_R; - } - - } - - __syncthreads(); // wait for all threads to write r_bid entries - - // load parity tables into shared - /*if (printandquit) { - if (threadIdx.x == 0) { - printf("num extras bucket %u : %u parity: %u \n", global_kbc_L_bucket_id, num_extras, parity); - - for (int i=0;i> kbc_box_shift) & 0b0111111111111111; - - //uint16_t rmap_value = nick_rmap[i]; - uint16_t pos = (rmap_value & 0b0111111111); - if (rmap_value > 0) { - printf("kbc:%i value:%u pos:%u\n", i, rmap_value, pos); - } - } - - } - - } - __syncthreads();*/ - - for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L]; - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - uint16_t l_y = L_entry.y; - //printf("scanning for pos_L: %u\n", pos_L); - - for (int m=0;m<64;m++) { - - //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup - // is super-inefficient. - - uint16_t indJ = l_y / kC; - uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC); - - // find which box our r_target is in, extra the 15bit value from that box - int kbc_map = r_target / 2; - const int kbc_box_shift = (r_target % 2) * 15; - int rmap_value = nick_rmap[kbc_map]; - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - - if (rmap_value > 0) { - // the pos_R is the lower 9 bits of that 15bit boxed value - uint16_t pos_R = rmap_value & 0b0111111111; - uint16_t count = rmap_value / 1024; - - //if (printandquit) { - // printf("L_y: %u r_target hit: %u pos_R:%u\n", l_y, r_target, pos_R); - //} - int num_matches = atomicAdd(&total_matches,1);//count); - if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - } else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R; - matches[num_matches] = match; - - - // handle edge cases - // TODO: let's push these into separate array - // then test them later. 
- if (count > 1) { - int slot = atomicAdd(&y_duplicate_counts, 1); - nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L; - // add the extras - /*int extra_match = 0; - for (int slot = 0; slot < num_extras; slot++) { - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - match.idxR = extra_pos_R;//value >> 4; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //extra_match++; - //matches[num_matches+extra_match] = match; - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - }*/ - //if (global_kbc_L_bucket_id < 10) { - // if (extra_match != count-1) { - // printf("ERRORRRR! EXTRA MATCHES %u DOES NOT MATCH COUNT-1 %u\n", extra_match, count); - // } else { - // printf("BUCKET L %u SUCCESSFULLY ADDED EXTRA COUNTS %u\n", global_kbc_L_bucket_id, count); - // } - //} - } - } - } - } - } - - __syncthreads(); - - // do the extras - - //int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add! - for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) { - for (int dup=0; dup<y_duplicate_counts; dup++) { - uint32_t value = nick_rmap_extras_rl[dup]; - uint16_t r_target = value >> 16; - uint16_t pos_L = value & 0x0FFFF; - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = extra_pos_R; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //matches[total_matches+slot] = match; - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - } - } - - __syncthreads(); - - if (threadIdx.x == 0) { - if (doPrint>1) { - // only do this once, should be in constant memory - //if (doPrint>2) { - // printf("match list\n"); - // for (int i=0;i<total_matches;i++) { - // printf("[L:%u R:%u]\n", matches[i].idxL, matches[i].idxR); - // } - //} - printf("Total matches: %u\n", total_matches); - } - if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) { - printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1); - total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1); - } - } - - __syncthreads(); - - // now we go through all our matches and output to next round.
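Before following that output loop, it helps to summarize the nick_blake3 calls it makes: how many 32-bit meta words are hashed per table and how many are carried forward (tables 1 and 2 pass 0/NULL because pair.meta is already the carried metadata). A sketch of that summary, not code from attack.hpp:

```cpp
// Per-table metadata widths implied by the nick_blake3 calls in the output
// loops of these kernels (32-bit words in, words collated out for next table).
#include <cstdio>

struct MetaWidths { int words_in; int words_carried; };

static const MetaWidths kTableMeta[7] = {
    {0, 0},   // unused, tables are numbered 1..6
    {2, 2},   // T1: Lx + Rx hashed, pair.meta already holds both
    {4, 4},   // T2
    {8, 4},   // T3: 8 words hashed, 4 collated out
    {8, 3},   // T4
    {6, 2},   // T5
    {4, 0},   // T6: only y survives
};

int main() {
    for (int t = 1; t <= 6; t++)
        printf("table %d: in=%d carried=%d\n",
               t, kTableMeta[t].words_in, kTableMeta[t].words_carried);
    return 0;
}
```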
- for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) { - Index_Match match = matches[i]; - BUCKETED_ENTRY_OUT pair = {}; - BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL]; - BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR]; - uint64_t blake_result; - uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id); - if (table == 1) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = R_Entry.meta[0]; - //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms - nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL); - //if (global_kbc_L_bucket_id == 1) { - //printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result); - //} - - } else if (table == 2) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = L_Entry.meta[1]; - pair.meta[2] = R_Entry.meta[0]; - pair.meta[3] = R_Entry.meta[1]; - nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL); - //if (global_kbc_L_bucket_id == 1) { - // uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1]; - // uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3]; - // printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result); - //} - } else if (table == 3) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta); - } else if (table == 4) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta); - } else if (table == 5) { - const uint32_t meta[6] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], - }; - nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta); - } else if (table == 6) { - const uint32_t meta[4] = { - L_Entry.meta[0], L_Entry.meta[1], - R_Entry.meta[0], R_Entry.meta[1] - }; - nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL); - } - if (table < 6) { - uint64_t batch_bucket = blake_result >> (38-6); - const uint64_t block_mod = (uint64_t) 1 << (38-6); - pair.y = (uint32_t) (blake_result % block_mod); - int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1); - - uint64_t criss_cross_id; - uint64_t cross_row_id = batch_id; - uint64_t cross_column_id = batch_bucket; - if ((table % 2) == 1) { - criss_cross_id = (cross_row_id * BATCHES + cross_column_id); - } else { - criss_cross_id = (cross_column_id * BATCHES + cross_row_id); - } - uint64_t host_block_entry_start_position = criss_cross_id * HOST_MAX_BLOCK_ENTRIES; - uint64_t host_bytes_start = host_block_entry_start_position * HOST_UNIT_BYTES; - - BUCKETED_ENTRY_OUT *host_block = (BUCKETED_ENTRY_OUT *) &host_criss_cross[host_bytes_start]; - host_block[block_slot] = pair; - } - - // do we have a double bucket to write into? 
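The criss-cross addressing above transposes blocks by table parity so the next table can stream its input the other way. A host-side sketch with placeholder constants (the real BATCHES, HOST_MAX_BLOCK_ENTRIES and HOST_UNIT_BYTES are defined elsewhere in this project):

```cpp
#include <cstdint>
#include <cstdio>

// Odd tables write blocks row-major (row = batch_id), even tables write the
// transpose, exactly as in the kernel above. Constants are placeholders.
static const uint64_t BATCHES = 64, HOST_MAX_BLOCK_ENTRIES = 1 << 20, HOST_UNIT_BYTES = 32;

uint64_t criss_cross_offset_bytes(uint16_t table, uint64_t batch_id, uint64_t batch_bucket) {
    uint64_t criss_cross_id = ((table % 2) == 1)
        ? batch_id * BATCHES + batch_bucket      // odd table: (row, column)
        : batch_bucket * BATCHES + batch_id;     // even table: (column, row)
    return criss_cross_id * HOST_MAX_BLOCK_ENTRIES * HOST_UNIT_BYTES;
}

int main() {
    printf("T3 block at %llu, T4 block at %llu\n",
           (unsigned long long) criss_cross_offset_bytes(3, 2, 5),
           (unsigned long long) criss_cross_offset_bytes(4, 2, 5));
    return 0;
}
```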
- //uint32_t double_bucket_id = 0; - //uint32_t kbc_bucket_id = blake_result / kBC; - //uint64_t batch_bucket_min_kbc = (batch_bucket << 32) / kBC; - //uint64_t batch_bucket_max_kbc = ((batch_bucket+1) << 32) / kBC; - //if (kbc_bucket_id == batch_bucket_min_kbc) { - // double_bucket_id = batch_bucket - 1; - //} else if (kbc_bucket_id == batch_bucket_max_kbc) { - // double_bucket_id = batch_bucket + 1; - //} - } - - if (threadIdx.x == 0) { - //if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - if ((global_kbc_L_bucket_id % 1000000 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - } - /* - kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219 - kBC bucket id: 1 L entries: 242 R entries: 257 matches: 248 - kBC bucket id: 2 L entries: 257 R entries: 204 matches: 222 - kBC bucket id: 3 L entries: 204 R entries: 243 matches: 185 - Total matches: 4294859632 - - Computing table 3 - Bucket 0 uniform sort. Ram: 7.678GiB, u_sort min: 2.250GiB, qs min: 0.563GiB. - kBC bucket id: 0 L entries: 228 R entries: 253 matches: 276 - kBC bucket id: 1 L entries: 253 R entries: 230 matches: 227 - kBC bucket id: 2 L entries: 230 R entries: 232 matches: 212 - kBC bucket id: 3 L entries: 232 R entries: 237 matches: 221 - Total matches: 4294848520 - */ - if (threadIdx.x == 0) { - if (table == 1) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==222) && (num_R==242) && (total_matches==219)) { - printf("- TABLE 1 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 1 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 240 R entries: 233 matches: 232 - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==240) && (num_R==233) && (total_matches==232)) { - printf("- TABLE 1 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 1 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - if (table == 2) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==228) && (num_R==253) && (total_matches==276)) { - printf("- TABLE 2 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 2 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 241 R entries: 238 matches: 224 - - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==241) && (num_R==238) && (total_matches==224)) { - printf("- TABLE 2 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 2 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } -} - -template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT> -__global__ -void gpu_find_tx_matches_with_backref(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - BUCKETED_ENTRY_OUT *bucketed_out, - char *bucketed_ref_out, int *out_bucket_counts) { - // T1 match: 1714 ms -> with delaying extras: 1630 - //Total tables time: 73726 ms - // match: 10015 ms -> 9705ms with delaying extras - const uint16_t NUM_RMAPS = (kBC/2)+1; - __shared__ int nick_rmap[NUM_RMAPS]; // positions and counts.
Use 30 bits, 15 bits each entry with lower 9 bits for pos, 1024+ for count - __shared__ uint32_t nick_rmap_extras_rl[32]; - __shared__ uint16_t nick_rmap_extras_ry[32]; - __shared__ uint16_t nick_rmap_extras_pos[32]; - __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ int total_matches; - __shared__ int num_extras; - __shared__ int y_duplicate_counts; - - int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - uint8_t doPrint = 1; - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - int numThreadsInBlock = blockDim.x; - int threadId = threadIdx.x; - int threadStartScan = threadId; - int threadSkipScan = numThreadsInBlock; - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R]; - - if (threadIdx.x == 0) { - total_matches = 0; - num_extras = 0; - y_duplicate_counts = 0; - if (doPrint > 1) { - printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R); - if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) { - printf("ERROR numL or numR > max entries\n"); - return; - } - if ((num_L == 0) || (num_R == 0) ) { - printf("ERROR: numL and numR are 0\n"); - return; - } - } - } - // unfortunately to clear we have to do this - for (int i = threadIdx.x; i < NUM_RMAPS; i += blockDim.x) { - nick_rmap[i] = 0; - } - __syncthreads(); // all written initialize data should sync - - //bool printandquit = ((global_kbc_L_bucket_id == 75000)); - - - - - // if (printandquit) { - //printf("R_y list:\n"); - //for (size_t pos_R = 0; pos_R < num_R; pos_R++) { - // uint16_t r_y = kbc_R_entries[pos_R].y; - // printf("%u\n",r_y); - //} - //if (threadIdx.x == 0) { - // printf("L_y list num %u:\n", num_L); - // for (size_t pos_L = 0; pos_L < num_L; pos_L++) { - // uint16_t l_y = kbc_L_entries[pos_L].y; - // printf("%u\n",l_y); - // } - //} - // } - //__syncthreads(); - uint16_t parity = global_kbc_L_bucket_id % 2; - - for (uint16_t pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - uint16_t r_y = R_entry.y; - - // r_y's share a block across two adjacent values, so kbc_map just works out which part it's in. - int kbc_map = r_y / 2; - const int kbc_box_shift = (r_y % 2) * 15; - int add = 1024 << kbc_box_shift; // we add from 10th bit up (shifted by the box it's in) - - int rmap_value = atomicAdd(&nick_rmap[kbc_map],add); // go ahead and add the counter (which will add in bits 10 and above) - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - if (rmap_value == 0) { - // if we added to an empty spot, what we do is add the pos_R here in the lower 9 bits of the box - // and ONLY for this one. 
- atomicAdd(&nick_rmap[kbc_map], (pos_R << kbc_box_shift)); - //if (printandquit) { - // printf("r_y: %u pos:%u\n", r_y, pos_R); - //} - } else { - // we hit duplicate entry...add this to a row - int slot = atomicAdd(&num_extras, 1); - nick_rmap_extras_ry[slot] = r_y; - nick_rmap_extras_pos[slot] = pos_R; - } - - } - - __syncthreads(); // wait for all threads to write r_bid entries - - // load parity tables into shared - /*if (printandquit) { - if (threadIdx.x == 0) { - printf("num extras bucket %u : %u parity: %u \n", global_kbc_L_bucket_id, num_extras, parity); - - for (int i=0;i> kbc_box_shift) & 0b0111111111111111; - - //uint16_t rmap_value = nick_rmap[i]; - uint16_t pos = (rmap_value & 0b0111111111); - if (rmap_value > 0) { - printf("kbc:%i value:%u pos:%u\n", i, rmap_value, pos); - } - } - - } - - } - __syncthreads();*/ - - for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L]; - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - uint16_t l_y = L_entry.y; - //printf("scanning for pos_L: %u\n", pos_L); - - for (int m=0;m<64;m++) { - - //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup - // is super-inefficient. - - uint16_t indJ = l_y / kC; - uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC); - - // find which box our r_target is in, extra the 15bit value from that box - int kbc_map = r_target / 2; - const int kbc_box_shift = (r_target % 2) * 15; - int rmap_value = nick_rmap[kbc_map]; - rmap_value = (rmap_value >> kbc_box_shift) & 0b0111111111111111; - - if (rmap_value > 0) { - // the pos_R is the lower 9 bits of that 15bit boxed value - uint16_t pos_R = rmap_value & 0b0111111111; - uint16_t count = rmap_value / 1024; - - //if (printandquit) { - // printf("L_y: %u r_target hit: %u pos_R:%u\n", l_y, r_target, pos_R); - //} - int num_matches = atomicAdd(&total_matches,1);//count); - if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - } else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R; - matches[num_matches] = match; - - - // handle edge cases - // TODO: let's push these into separate array - // then test them later. - if (count > 1) { - int slot = atomicAdd(&y_duplicate_counts, 1); - nick_rmap_extras_rl[slot] = (r_target << 16) + pos_L; - // add the extras - /*int extra_match = 0; - for (int slot = 0; slot < num_extras; slot++) { - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - match.idxR = extra_pos_R;//value >> 4; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //extra_match++; - //matches[num_matches+extra_match] = match; - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - }*/ - //if (global_kbc_L_bucket_id < 10) { - // if (extra_match != count-1) { - // printf("ERRORRRR! EXTRA MATCHES %u DOES NOT MATCH COUNT-1 %u\n", extra_match, count); - // } else { - // printf("BUCKET L %u SUCCESSFULLY ADDED EXTRA COUNTS %u\n", global_kbc_L_bucket_id, count); - // } - //} - } - } - } - } - } - - __syncthreads(); - - // do the extras - - //int num_matches = atomicAdd(&total_matches,num_extras); // warning can only let thread 0 do this otherwise all will add! 
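The pass that follows resolves the deferred work parked in shared memory: L-side hits on boxes with count > 1 were saved as (r_target << 16) | pos_L, and R-side entries that lost the first-writer race were saved as (r_y, pos_R) extras. A host-side restatement of that cross-check (hypothetical function, std::vector stands in for the shared arrays):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>
#include <utility>

// Pair every parked duplicate-L record with every extra-R record that shares
// its r_target; these are exactly the matches the packed rmap dropped.
std::vector<std::pair<uint16_t, uint16_t>> resolve_extras(
        const std::vector<uint32_t> &duplicates_rl,   // (r_target<<16)|pos_L
        const std::vector<uint16_t> &extras_ry,       // parked r_y values
        const std::vector<uint16_t> &extras_pos) {    // their pos_R values
    std::vector<std::pair<uint16_t, uint16_t>> out;   // (idxL, idxR)
    for (size_t e = 0; e < extras_ry.size(); e++) {
        for (uint32_t value : duplicates_rl) {
            uint16_t r_target = value >> 16;
            uint16_t pos_L = value & 0xFFFF;
            if (extras_ry[e] == r_target) out.push_back({pos_L, extras_pos[e]});
        }
    }
    return out;
}

int main() {
    auto m = resolve_extras({(1725u << 16) | 7u}, {1725}, {12});
    printf("extra match idxL=%u idxR=%u\n", m[0].first, m[0].second); // 7, 12
    return 0;
}
```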
- for (int slot=threadIdx.x; slot<num_extras; slot+=blockDim.x) { - for (int dup=0; dup<y_duplicate_counts; dup++) { - uint32_t value = nick_rmap_extras_rl[dup]; - uint16_t r_target = value >> 16; - uint16_t pos_L = value & 0x0FFFF; - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = extra_pos_R; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //matches[total_matches+slot] = match; - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - } - } - - __syncthreads(); - - if (threadIdx.x == 0) { - if (doPrint>1) { - // only do this once, should be in constant memory - //if (doPrint>2) { - // printf("match list\n"); - // for (int i=0;i<total_matches;i++) { - // printf("[L:%u R:%u]\n", matches[i].idxL, matches[i].idxR); - // } - //} - printf("Total matches: %u\n", total_matches); - } - if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) { - printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1); - total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1); - } - } - - __syncthreads(); - - // now we go through all our matches and output to next round. - for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) { - Index_Match match = matches[i]; - BUCKETED_ENTRY_OUT pair = {}; - BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL]; - BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR]; - uint64_t blake_result; - uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id); - if (table == 1) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = R_Entry.meta[0]; - //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms - nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL); - //if (global_kbc_L_bucket_id == 1) { - //printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result); - //} - - } else if (table == 2) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = L_Entry.meta[1]; - pair.meta[2] = R_Entry.meta[0]; - pair.meta[3] = R_Entry.meta[1]; - nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL); - //if (global_kbc_L_bucket_id == 1) { - // uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1]; - // uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3]; - // printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result); - //} - } else if (table == 3) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta); - } else if (table == 4) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta); - } else if (table == 5) { - const uint32_t meta[6] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], - }; - nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta); - } else if (table == 6) { - const uint32_t meta[4] = { - L_Entry.meta[0], L_Entry.meta[1], - R_Entry.meta[0], R_Entry.meta[1] - }; - nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL); - } - //printf("table %u blake result: %llu\n", table, blake_result); - uint64_t batch_bucket = blake_result >> (38-6); - const uint64_t block_mod = (uint64_t) 1 << (38-6); - int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1); - uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES +
block_slot; - if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) { - printf("ERROR: results address overflow\n"); - } else { - if (table < 6) { - // our last table 6 doesn't write into hostmem criss cross, it just does backref with extra y instead. - pair.y = (uint32_t) (blake_result % block_mod); - bucketed_out[pair_address] = pair; - } - } - - //// TODO: export Lx's to save into table, these are x1,x3 denoting 2 pairs that can be compressed into kbc buckets - // we *could* do double the data in table 3, but then we need extra buffers and memory that we don't have - if (table == 2) { - // this task can be left to the CPU to deal with the batch buckets and write baseref to file. - } - if ((table == 3) || (table == 4) || (table == 5) || (table == 6)) { - - if (table == 6) { - // last table does backref with extra y truncated to most significant k bits. - T6BackRef ref = {}; - ref.prev_block_ref_L = L_Entry.blockposref; - ref.prev_block_ref_R = R_Entry.blockposref; - ref.y = (uint32_t) (blake_result >> kExtraBits); // get top 32 most significant bits, since calc_y is 38 bits. - //printf("blake y result table 6: %llu -> %u\n", blake_result, ref.y); - - T6BackRef *out = (T6BackRef *) bucketed_ref_out; - //if ((ref.prev_block_ref_L == 0) && (ref.prev_block_ref_R == 0)) { - // printf("Both refs are 0!\n"); - //} - out[pair_address] = ref; - } else if (table == 3) { - T3BaseRef ref = {}; - ref.Lx1 = L_Entry.meta[0]; - ref.Lx2 = L_Entry.meta[2]; - ref.Lx3 = R_Entry.meta[0]; - ref.Lx4 = R_Entry.meta[2]; - T3BaseRef *out = (T3BaseRef *) bucketed_ref_out; - out[pair_address] = ref; - } else if ((table == 3) || (table == 4) || (table == 5)) { - BackRef ref = {}; - ref.prev_block_ref_L = L_Entry.blockposref; - ref.prev_block_ref_R = R_Entry.blockposref; - BackRef *out = (BackRef *) bucketed_ref_out; - //if ((ref.prev_block_ref_L == 0) && (ref.prev_block_ref_R == 0)) { - // printf("Both refs are 0!\n"); - //} - out[pair_address] = ref; - } - } - - // do we have a double bucket to write into? - //uint32_t double_bucket_id = 0; - //uint32_t kbc_bucket_id = blake_result / kBC; - //uint64_t batch_bucket_min_kbc = (batch_bucket << 32) / kBC; - //uint64_t batch_bucket_max_kbc = ((batch_bucket+1) << 32) / kBC; - //if (kbc_bucket_id == batch_bucket_min_kbc) { - // double_bucket_id = batch_bucket - 1; - //} else if (kbc_bucket_id == batch_bucket_max_kbc) { - // double_bucket_id = batch_bucket + 1; - //} - } - - if (threadIdx.x == 0) { - //if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - if ((global_kbc_L_bucket_id % 1000000 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - } - /* - kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219 - kBC bucket id: 1 L entries: 242 R entries: 257 matches: 248 - kBC bucket id: 2 L entries: 257 R entries: 204 matches: 222 - kBC bucket id: 3 L entries: 204 R entries: 243 matches: 185 - Total matches: 4294859632 - - Computing table 3 - Bucket 0 uniform sort. Ram: 7.678GiB, u_sort min: 2.250GiB, qs min: 0.563GiB. 
- kBC bucket id: 0 L entries: 228 R entries: 253 matches: 276 - kBC bucket id: 1 L entries: 253 R entries: 230 matches: 227 - kBC bucket id: 2 L entries: 230 R entries: 232 matches: 212 - kBC bucket id: 3 L entries: 232 R entries: 237 matches: 221 - Total matches: 4294848520 - */ - if (threadIdx.x == 0) { - if (table == 1) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==222) && (num_R==242) && (total_matches==219)) { - printf("- TABLE 1 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 1 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 240 R entries: 233 matches: 232 - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==240) && (num_R==233) && (total_matches==232)) { - printf("- TABLE 1 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 1 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - if (table == 2) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==228) && (num_R==253) && (total_matches==276)) { - printf("- TABLE 2 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 2 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 241 R entries: 238 matches: 224 - - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==241) && (num_R==238) && (total_matches==224)) { - printf("- TABLE 2 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 2 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } -} - -template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT> -__global__ -void gpu_find_tx_matches_rmap_working(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R, - const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries, - BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) { - // match: 10000 ms - // table 1 match match: 1633 ms, potentially 2.5x faster than orig method - // with extras: 1841 ms - win! - // with extras hashed counters (working): 2144 ms - // Total tables time: 77112 ms - // match: 12505 ms - // TODO: TRY THIS AS GLOBAL MEMORY COVERING BATCH SIZE - //__shared__ __half nick_rmap_counts[kBC]; // 30226 bytes - const int RMAP_NUM_COUNTS_PER_BOX = 8; // whether 8 per box, 7, 4, bit counts 4 etc doesn't change result measurably I don't think. - const int RMAP_BITS_FOR_COUNTS = 4; - const int RMAP_COUNT_MASK = 0b01111; - const int NUM_RMAP_COUNTS = (15113 / RMAP_NUM_COUNTS_PER_BOX)+1; - __shared__ int nick_rmap_counts[NUM_RMAP_COUNTS]; // kBC / 2, sharing bits [12bits pos, 3 bits counter][12 bits pos, 3 bits counter] - //__shared__ int16_t nick_rmap_counts[kBC]; // 30226 bytes - __shared__ uint16_t nick_rmap_positions[kBC]; - __shared__ uint16_t nick_rmap_extras_ry[100]; - __shared__ uint16_t nick_rmap_extras_pos[100]; - __shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET]; - __shared__ int total_matches; - __shared__ int num_extras; - - //__shared__ int non_duplicate_counts; - //__shared__ int duplicate_counts; - - int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L; - uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L; - - //if (global_kbc_L_bucket_id > 0) { - // return; - //} - - uint8_t doPrint = 1;//(global_kbc_L_bucket_id < 10) ? 1 : 0; // start_kbc_L > 0 ?
1: 0; // 0 is none, 1 is basic, 2 is detailed - //if (global_kbc_L_bucket_id == 75000) { - // doPrint = 100; - //} - - if (gridDim.x != (end_kbc_R - start_kbc_L)) { - printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L); - } - int numThreadsInBlock = blockDim.x; - int threadId = threadIdx.x; - int threadStartScan = threadId; - int threadSkipScan = numThreadsInBlock; - - const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET; - const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET; - const int num_L = kbc_local_num_entries[kbc_L_bucket_id]; - const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)]; - const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L]; - const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R]; - - if (threadIdx.x == 0) { - total_matches = 0; - num_extras = 0; - //non_duplicate_counts = 0; - //duplicate_counts = 0; - if (doPrint > 1) { - printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R); - if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) { - printf("ERROR numL or numR > max entries\n"); - return; - } - if ((num_L == 0) || (num_R == 0) ) { - printf("ERROR: numL and numR are 0\n"); - return; - } - } - } - // unfortunately to clear we have to do this 236 times for 64 threads - for (int i = threadIdx.x; i < NUM_RMAP_COUNTS; i += blockDim.x) { - nick_rmap_counts[i] = 0; - } - __syncthreads(); // all written initialize data should sync - - //bool printandquit = ((global_kbc_L_bucket_id == 75000)); - - - - - // if (printandquit) { - //printf("R_y list:\n"); - //for (size_t pos_R = 0; pos_R < num_R; pos_R++) { - // uint16_t r_y = kbc_R_entries[pos_R].y; - // printf("%u\n",r_y); - //} - //if (threadIdx.x == 0) { - // printf("L_y list num %u:\n", num_L); - // for (size_t pos_L = 0; pos_L < num_L; pos_L++) { - // uint16_t l_y = kbc_L_entries[pos_L].y; - // printf("%u\n",l_y); - // } - //} - // } - //__syncthreads(); - uint16_t parity = global_kbc_L_bucket_id % 2; - - for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) { - //Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R]; - BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R]; - uint16_t r_y = R_entry.y; - //int16_t rmap_value = nick_rmap_counts[r_y]; - //uint8_t rmap_count = rmap_value & 0b0111; - - // TODO: ok, let's make it MUCH easier, and have the atomic adds on 3 bits only - // and cut kbc_map into 15 bit counts (5 counts) each. Gives us plenty of space now - // to have separate rmap_positions entries, and greaty simplifies code (hopefully). - // however...may be slower! 
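The active scheme below packs counts only: eight 4-bit counters per 32-bit word (RMAP_NUM_COUNTS_PER_BOX = 8, RMAP_BITS_FOR_COUNTS = 4), with positions kept unpacked in nick_rmap_positions. A minimal host sketch of the counter packing:

```cpp
// Host-side sketch of the count-only packing used by this kernel variant:
// 8 four-bit counters per 32-bit word; positions live in a separate array.
#include <cstdint>
#include <cstdio>

int main() {
    const int COUNTS_PER_BOX = 8, BITS = 4, MASK = 0xF;
    uint32_t box = 0;
    uint16_t r_y = 13;                       // counter 13 % 8 = 5 of this box
    int shift = (r_y % COUNTS_PER_BOX) * BITS;
    box += 1u << shift;                      // first occurrence
    box += 1u << shift;                      // duplicate occurrence
    printf("count=%u\n", (box >> shift) & MASK);   // count=2
    return 0;
}
```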
- //int kbc_map = r_y / 2; - //const int kbc_box_shift = (r_y % 2) * 12; - //int add = 1 << kbc_box_shift; - //int rmap_value = atomicAdd(&nick_rmap_counts[kbc_map],add); - //rmap_value = (rmap_value >> kbc_box_shift) & 0x0000FFFF; - //int rmap_count = rmap_value & 0b0111; - - int kbc_map = r_y / RMAP_NUM_COUNTS_PER_BOX; - const int kbc_box_shift = (r_y % RMAP_NUM_COUNTS_PER_BOX) * RMAP_BITS_FOR_COUNTS; // 3 bits each, gives up to 111 = 7 duplicates - - int add = 1 << kbc_box_shift; - int rmap_value = atomicAdd(&nick_rmap_counts[kbc_map],add); - int rmap_count = (rmap_value >> kbc_box_shift) & RMAP_COUNT_MASK; - - if (rmap_count == 0) { - nick_rmap_positions[r_y] = pos_R; - //int add_value = (pos_R << 3) << kbc_box_shift; - //atomicAdd(&nick_rmap_counts[kbc_map], add_value); - //int16_t new_value = atomicAdd(&nick_rmap_counts[r_y], add_value); // encode position - //if ((printandquit) && (r_y == 1725)) { - // nick_rmap_counts[r_y] = add + 1; - //unsigned short prev = atomicAdd(&nick_rmap_counts[r_y],add); - //printf("***** add value is: %u prev:%u\n", add, prev); - //prev = atomicAdd(&nick_rmap_counts[r_y],1); - //printf("***** add value is: %u prev:%u\n", add, prev); - //} - //nick_rmap_counts[r_y] = 1 + (pos_R << 3); - } else { - // we hit duplicate entry... - int slot = atomicAdd(&num_extras, 1); - nick_rmap_extras_ry[slot] = r_y; - nick_rmap_extras_pos[slot] = pos_R; - } - } - - __syncthreads(); // wait for all threads to write r_bid entries - - // load parity tables into shared - /*if (doPrint > 1) { - if (threadIdx.x == 0) { - printf("num extras bucket %u : %u parity: %u \n", global_kbc_L_bucket_id, num_extras, parity); - if (printandquit) { - for (int i=1700;i<1750;i++) { - //unsigned short value = nick_rmap_counts[i]; - //unsigned short count = value & 0b0111; - //printf("kbc:%u value:%u count:%u\n", i, value, count); - - int kbc_map = i / 2; - int kbc_box_shift = (i % 2) * 12; - int rmap_value = (nick_rmap_counts[kbc_map]) >> kbc_box_shift; - int rmap_count = rmap_value & (0b0111); - int pos = (rmap_value & 0b0111111111000) >> 3; - printf("kbc:%i value:%u count:%u pos:%u\n", i, rmap_value, rmap_count,pos); - } - } - } - - } - __syncthreads();*/ - - for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) { - //Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L]; - BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L]; - uint16_t l_y = L_entry.y; - //printf("scanning for pos_L: %u\n", pos_L); - - for (int m=0;m<64;m++) { - - //uint16_t r_target = L_targets[parity][l_y][m]; // this performs so badly because this lookup - // is super-inefficient. - - uint16_t indJ = l_y / kC; - uint16_t r_target = ((indJ + m) % kB) * kC + (((2 * m + parity) * (2 * m + parity) + l_y) % kC); - - //if (r_target != r_target_calc) { - // printf("CALC ERROR r_target calc %u does not match r_target %u\n", r_target_calc, r_target); - //} - - //uint16_t value = nick_rmap[r_target]; - //uint8_t count = value & 0x000F; - //__half value = nick_rmap_counts[r_target]; - //int16_t value = nick_rmap_counts[r_target]; - //unsigned short value = nick_rmap_counts[r_target]; - //unsigned short count = value & 0b0111; - - //int kbc_map = r_target / 2; - //int kbc_box_shift = (r_target % 2) * 12; - //int value = (nick_rmap_counts[kbc_map] >> kbc_box_shift) & 0x0000FFFF; - //int count = value & (0b0111); - - const int kbc_map = r_target / RMAP_NUM_COUNTS_PER_BOX; - const int kbc_box_shift = (r_target % RMAP_NUM_COUNTS_PER_BOX) * RMAP_BITS_FOR_COUNTS; // 3 bits each. 
- - int rmap_value = nick_rmap_counts[kbc_map]; - int count = (rmap_value >> kbc_box_shift) & RMAP_COUNT_MASK; - - //if ((printandquit) && (l_y == 13414)) { - // superdebug case: l_y == 13414, r_target hit: 1725 - // printf(" m: %u r_target: %u count:%u\n", m, r_target, count); - //} - if (count > 0) { - //uint16_t pos_R = value >> 3; - uint16_t pos_R = nick_rmap_positions[r_target]; - //if (printandquit) { - // printf("L_y: %u r_target hit: %u\n", l_y, r_target); - //} - //printf(" has match\n"); - int num_matches = atomicAdd(&total_matches,1); - if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) { - printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches); - } else { - Index_Match match = { }; - match.idxL = pos_L; - match.idxR = pos_R;//nick_rmap_positions[r_target];//value >> 4; - matches[num_matches] = match; - //atomicAdd(&non_duplicate_counts,1); - - // handle edge cases - // TODO: let's push these into separate array - // then test them later. - if (count > 1) { - // add the extras - //int extra_match = 0; - for (int slot = 0; slot < num_extras; slot++) { - if (nick_rmap_extras_ry[slot] == r_target) { - uint16_t extra_pos_R = nick_rmap_extras_pos[slot]; - match.idxR = extra_pos_R;//value >> 4; - int num_matches = atomicAdd(&total_matches,1); - matches[num_matches] = match; - //extra_match++; - //matches[num_matches+extra_match] = match; - //atomicAdd(&duplicate_counts,1); - //if (doPrint > 1) { - // printf("Collected extra match pos_R: %u from r_y: %u in slot:%u \n", extra_pos_R, r_target, slot); - //} - } - } - //if (global_kbc_L_bucket_id < 10) { - // if (extra_match != count-1) { - // printf("ERRORRRR! EXTRA MATCHES %u DOES NOT MATCH COUNT-1 %u\n", extra_match, count); - // } else { - // printf("BUCKET L %u SUCCESSFULLY ADDED EXTRA COUNTS %u\n", global_kbc_L_bucket_id, count); - // } - //} - } - } - } - } - } - - __syncthreads(); - - - if (threadIdx.x == 0) { - if (doPrint>1) { - // only do this once, should be in constant memory - //if (doPrint>2) { - // printf("match list\n"); - // for (int i=0;i<total_matches;i++) { - // printf("[L:%u R:%u]\n", matches[i].idxL, matches[i].idxR); - // } - //} - printf("Total matches: %u\n", total_matches); - } - if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) { - printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1); - total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1); - } - } - - __syncthreads(); - - // now we go through all our matches and output to next round.
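The output loop below splits the 38-bit f(x) result (k = 32 plus kExtraBits = 6) into a 6-bit batch bucket from the top bits and a 32-bit local y from the bottom bits. A host sketch with an arbitrary sample value:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Same arithmetic as the kernel: shift (38-6)=32 selects 1 of 64 buckets,
    // modulo 2^32 keeps the local y that is stored in pair.y.
    uint64_t blake_result = 0x123456789aull & ((1ull << 38) - 1); // 38-bit y
    uint64_t batch_bucket = blake_result >> (38 - 6);             // 0..63
    uint32_t local_y = (uint32_t)(blake_result % (1ull << (38 - 6)));
    printf("bucket=%llu local_y=%u\n",
           (unsigned long long) batch_bucket, local_y);           // bucket=18
    return 0;
}
```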
- for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) { - Index_Match match = matches[i]; - BUCKETED_ENTRY_OUT pair = {}; - BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL]; - BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR]; - uint64_t blake_result; - uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id); - if (table == 1) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = R_Entry.meta[0]; - //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms - nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL); - if (global_kbc_L_bucket_id == 1) { - //if ((calc_y == 21557) && (L_Entry.meta[0] == 3620724289) && (R_Entry.meta[0] == 2663198278)) { - printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result); - //Ly is:[20932] Lx: [322482289] Rx: [3382886636] f result:[273114646565] - //if (blake_result == 56477140042) { - // printf(" ---** BLAKE CORRECT **\n"); - //} else { - // printf(" ---** BLAKE WRONG :(((( \n"); - //} - // Ly is:[21557] Lx: [3620724289] Rx: [2663198278] f result:[56477140042] - //} - } - - } else if (table == 2) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = L_Entry.meta[1]; - pair.meta[2] = R_Entry.meta[0]; - pair.meta[3] = R_Entry.meta[1]; - nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL); - if (global_kbc_L_bucket_id == 1) { - uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1]; - uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3]; - printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result); - } - } else if (table == 3) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta); - } else if (table == 4) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta); - } else if (table == 5) { - const uint32_t meta[6] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], - }; - nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta); - } else if (table == 6) { - const uint32_t meta[4] = { - L_Entry.meta[0], L_Entry.meta[1], - R_Entry.meta[0], R_Entry.meta[1] - }; - nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL); - } - uint64_t batch_bucket = blake_result >> (38-6); - const uint64_t block_mod = (uint64_t) 1 << (38-6); - pair.y = (uint32_t) (blake_result % block_mod); - int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1); - uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES + block_slot; - if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) { - printf("ERROR: results address overflow\n"); - } else { - bucketed_out[pair_address] = pair; - } - - // do we have a double bucket to write into? 
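The commented-out block below is probing a real edge case: batch buckets are 2^32 y-values wide while kBC buckets are 15113 wide, and since 2^32 is not a multiple of 15113 the kBC bucket at each batch boundary straddles two batches. A sketch that prints where the first few boundaries fall:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t kBC = 15113;
    for (uint64_t batch_bucket = 1; batch_bucket < 4; batch_bucket++) {
        uint64_t boundary_y = batch_bucket << 32;    // first y of this batch
        printf("batch %llu starts %llu y-values into kbc bucket %llu\n",
               (unsigned long long) batch_bucket,
               (unsigned long long) (boundary_y % kBC),
               (unsigned long long) (boundary_y / kBC));
    }
    return 0;
}
```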
- //uint32_t double_bucket_id = 0; - //uint32_t kbc_bucket_id = blake_result / kBC; - //uint64_t batch_bucket_min_kbc = (batch_bucket << 32) / kBC; - //uint64_t batch_bucket_max_kbc = ((batch_bucket+1) << 32) / kBC; - //if (kbc_bucket_id == batch_bucket_min_kbc) { - // double_bucket_id = batch_bucket - 1; - //} else if (kbc_bucket_id == batch_bucket_max_kbc) { - // double_bucket_id = batch_bucket + 1; - //} - } - - if (threadIdx.x == 0) { - //if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - if ((global_kbc_L_bucket_id % 25000 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - } - /* - kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219 - kBC bucket id: 1 L entries: 242 R entries: 257 matches: 248 - kBC bucket id: 2 L entries: 257 R entries: 204 matches: 222 - kBC bucket id: 3 L entries: 204 R entries: 243 matches: 185 - Total matches: 4294859632 - - Computing table 3 - Bucket 0 uniform sort. Ram: 7.678GiB, u_sort min: 2.250GiB, qs min: 0.563GiB. - kBC bucket id: 0 L entries: 228 R entries: 253 matches: 276 - kBC bucket id: 1 L entries: 253 R entries: 230 matches: 227 - kBC bucket id: 2 L entries: 230 R entries: 232 matches: 212 - kBC bucket id: 3 L entries: 232 R entries: 237 matches: 221 - Total matches: 4294848520 - */ - if (threadIdx.x == 0) { - if (table == 1) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==222) && (num_R==242) && (total_matches==219)) { - printf("- TABLE 1 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 1 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 240 R entries: 233 matches: 232 - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==240) && (num_R==233) && (total_matches==232)) { - printf("- TABLE 1 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 1 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - if (table == 2) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==228) && (num_R==253) && (total_matches==276)) { - printf("- TABLE 2 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 2 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 241 R entries: 238 matches: 224 - - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==241) && (num_R==238) && (total_matches==224)) { - printf("- TABLE 2 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 2 bucket 4000000 MATCHES WRONG! 
num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches);
-					}
-				}
-			}
-		}
-}
-
-
-template <typename BUCKETED_ENTRY_IN, typename BUCKETED_ENTRY_OUT>
-__global__
-void gpu_find_tx_matches_orig(uint16_t table, uint32_t batch_id, uint32_t start_kbc_L, uint32_t end_kbc_R,
-		const BUCKETED_ENTRY_IN *kbc_local_entries, const int *kbc_local_num_entries,
-		BUCKETED_ENTRY_OUT *bucketed_out, int *out_bucket_counts) {
-	const uint16_t MAX_BIDS = 16;
-	__shared__ uint16_t R_bids[kC*MAX_BIDS]; // kC is 127, MAX_BIDS is 16: 127*16 * 2 bytes = ~4kb
-	__shared__ int R_bids_count[kC]; // 127 * 4 bytes
-	__shared__ int R_bid_positions[kC*MAX_BIDS];//RBid_Entry R_bid_entries[kC*MAX_BIDS]; // 127 * 16 * 4 bytes = ~8kb
-	__shared__ uint8_t matching_shifts_c[64]; // 64 bytes
-	__shared__ Index_Match matches[KBC_MAX_ENTRIES_PER_BUCKET];
-	__shared__ int total_matches;
-	//*********************
-	//Total tables time: 86822 ms
-	//        match: 22397 ms
-	//      phase 1: 3930ms
-	//__shared__ Bucketed_kBC_Entry kbc_L_entries[400]; // will copy global to here, unfortunately not faster :(
-	//__shared__ Bucketed_kBC_Entry kbc_R_entries[400];
-
-	//end_kbc_R = end_kbc_R - start_kbc_L;
-	//start_kbc_L = 0;
-	//if (threadIdx.x == 0) {
-	//	printf("doing block inside kernel %u\n", start_kbc_L);
-	//}
-
-	int kbc_L_bucket_id = blockIdx.x; // NOTE: localized so starts at 0... // + start_kbc_L;
-	uint32_t global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L;
-
-	// doPrint 1 = end matches and bucket counts, 2 = a little debug, 3 = lots.
-	const uint8_t doPrint = 1;//(global_kbc_L_bucket_id < 10) ? 1 : 0; // start_kbc_L > 0 ? 1: 0; // 0 is none, 1 is basic, 2 is detailed
-
-	if (gridDim.x != (end_kbc_R - start_kbc_L)) {
-		printf("ERROR: GRIDDIM %u MUST EQUAL NUMBER OF KBCS TO SCAN %u\n", gridDim.x, end_kbc_R - start_kbc_L);
-	}
-	int numThreadsInBlock = blockDim.x;
-	int threadId = threadIdx.x;
-	int threadStartScan = threadId;
-	int threadSkipScan = numThreadsInBlock;
-
-	//printf("threadId: %u startScan: %u skipScan: %u", threadId, threadStartScan, threadSkipScan);
-	if (threadIdx.x == 0) {
-		// only do this once, should be in constant memory
-		/*for (uint16_t parity = 0; parity < 2; parity++) {
-			for (uint16_t r = 0; r < 64; r++) {
-				uint16_t v = ((2 * r + parity) * (2 * r + parity)) % kC;
-				matching_shifts_c[parity][r] = v;
-				//printf("matching shifts %u %u = %u\n", parity, r, v);
-			}
-		}*/
-		total_matches = 0;
-	}
-
-	uint16_t max_bids_found = 0;
-
-	//const uint32_t start_L = kbc_start_addresses[kbc_L_bucket_id];
-	//const uint32_t start_R = kbc_start_addresses[kbc_R_bucket_id];
-	//const int num_L = start_R - start_L;
-	//const int num_R = (start_R < kBC_NUM_BUCKETS) ? kbc_start_addresses[kbc_R_bucket_id+1] - start_R : total_entries_count - start_R;
-	const uint32_t start_L = kbc_L_bucket_id*KBC_MAX_ENTRIES_PER_BUCKET;
-	const uint32_t start_R = (kbc_L_bucket_id+1)*KBC_MAX_ENTRIES_PER_BUCKET;
-	const int num_L = kbc_local_num_entries[kbc_L_bucket_id];
-	const int num_R = kbc_local_num_entries[(kbc_L_bucket_id+1)];
-
-	if (threadIdx.x == 0) {
-		if (doPrint > 1) printf("find matches global kbc bucket L: %u local_b_id:%u num_L %u num_R %u\n", global_kbc_L_bucket_id, kbc_L_bucket_id, num_L, num_R);
-		if ((num_L >= KBC_MAX_ENTRIES_PER_BUCKET) || (num_R >= KBC_MAX_ENTRIES_PER_BUCKET)) {
-			printf("ERROR numL or numR > max entries\n");
-			return;
-		}
-		if ((num_L == 0) || (num_R == 0) ) {
-			printf("ERROR: numL or numR is 0\n");
-			return;
-		}
-	}
-
-	const BUCKETED_ENTRY_IN *kbc_L_entries = &kbc_local_entries[start_L];
-	const BUCKETED_ENTRY_IN *kbc_R_entries = &kbc_local_entries[start_R];
-
-	uint16_t parity = global_kbc_L_bucket_id % 2;
-	for (int r = threadIdx.x; r < 64; r += blockDim.x) {
-		uint16_t v = ((2 * r + parity) * (2 * r + parity)) % kC;
-		matching_shifts_c[r] = v; // this is a wash...doesn't save much if anything
-	}
-	for (int i = threadIdx.x; i < kC; i += blockDim.x) {
-		R_bids_count[i] = 0;
-	}
-
-	__syncthreads(); // all written initialize data should sync
-
-	//Bucketed_kBC_Entry L_entry = kbc_local_entries[0];
-	BUCKETED_ENTRY_IN temp_entry = kbc_L_entries[0];
-
-	//uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(temp_entry, global_kbc_L_bucket_id);
-	//uint16_t parity = (calc_y / kBC) % 2;
-
-	for (int pos_R = threadStartScan; pos_R < num_R; pos_R+=threadSkipScan) {
-		//Bucketed_kBC_Entry R_entry = kbc_local_entries[MAX_KBC_ENTRIES+pos_R];
-		BUCKETED_ENTRY_IN R_entry = kbc_R_entries[pos_R];
-		//global_kbc_L_bucket_id = kbc_L_bucket_id + start_kbc_L;
-		//calc_y = CALC_Y_BUCKETED_KBC_ENTRY(R_entry, global_kbc_L_bucket_id+1);
-		uint16_t y_kC = R_entry.y % kC; // should be same as calc_y % kC ?
-		uint16_t y_mod_kBC_div_kC = R_entry.y / kC; // i.e. (y % kBC) / kC, since R_entry.y is already bucket-local
-
-		int num_bids = atomicAdd(&R_bids_count[y_kC],1);
-		if (num_bids >= MAX_BIDS) {
-			printf("ERROR KBC LOCAL MAX BIDS EXCEEDED %u in global bucket %u\n", num_bids, global_kbc_L_bucket_id);
-			//printf("\nR_entry y:%u meta[0]:%u y_kC:%u y_mod_kBC_div_kC: %u into slot: %u\n ", R_entry.y, R_entry.meta[0], y_kC, y_mod_kBC_div_kC, num_bids);
-		} else {
-			// uint8_t num_bids = R_bids_count[y_kC]++;
-			R_bids[y_kC*MAX_BIDS + num_bids] = y_mod_kBC_div_kC;
-			//R_bid_entries[y_kC*MAX_BIDS + num_bids].x = R_entry.x;
-			R_bid_positions[y_kC*MAX_BIDS + num_bids] = pos_R;
-		}
-
-		//if (doPrint>2) printf("R_entry x:%u y:%u y_kC:%u y_mod_kBC_div_kC: %u into slot: %u\n ", R_entry.x, R_entry.y, y_kC, y_mod_kBC_div_kC, num_bids);
-
-		if (num_bids > max_bids_found) {
-			max_bids_found = num_bids;
-		}
-	}
-
-	__syncthreads(); // wait for all threads to write r_bid entries
-
-	for (uint16_t pos_L = threadStartScan; pos_L < num_L; pos_L+=threadSkipScan) {
-		//Bucketed_kBC_Entry L_entry = kbc_local_entries[pos_L];
-		BUCKETED_ENTRY_IN L_entry = kbc_L_entries[pos_L];
-
-		//if (doPrint>2) printf("CHECKING pos_L:%u entry x:%u for match\n", pos_L, L_entry.x);
-		uint16_t yl_bid = L_entry.y / kC;
-		uint16_t yl_cid = L_entry.y % kC;
-
-		for (uint8_t m = 0; m < 64; m++) {
-			uint16_t target_bid = (yl_bid + m);
-			// TODO: benchmark if matching_shifts array is actually faster...doubt it.
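-			// The bid/cid scan here implements the chiapos matching condition: an L entry (bucket b) and an
-			// R entry (bucket b+1) match iff for some m in [0,64):
-			//     (yR/kC - yL/kC) mod kB == m                             (bucket-id test)
-			//     (yR%kC - yL%kC) mod kC == ((2m + parity)^2) mod kC      (char-id test, parity = (y/kBC) % 2)
-			// matching_shifts_c[m] caches ((2m+parity)^2) % kC for this bucket's parity, so the loop below only
-			// has to derive a (target_bid, target_cid) pair and compare it against the R-side tables built above.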
-			uint16_t target_cid = yl_cid + matching_shifts_c[m]; // turns out it's a wash
-			//uint16_t target_cid = yl_cid + ((2 * m + parity) * (2 * m + parity)) % kC;
-
-			// This is faster than %
-			if (target_bid >= kB) {
-				target_bid -= kB;
-			}
-			if (target_cid >= kC) { // check if rid of %k on = part above.
-				target_cid -= kC;
-			}
-
-			uint16_t num_bids = R_bids_count[target_cid];
-			if (num_bids > MAX_BIDS) {
-				printf("PRUNING NUM BIDS FROM %u TO %u", num_bids, MAX_BIDS);
-				num_bids = MAX_BIDS;
-			}
-			// this inner loop is inefficient as num bids can vary...maybe push into list?
-			for (uint32_t i = 0; i < num_bids; i++) {
-				uint16_t R_bid = R_bids[target_cid*MAX_BIDS + i];
-
-				if (target_bid == R_bid) {
-					int pos_R = R_bid_positions[target_cid*MAX_BIDS + i];
-					int num_matches = atomicAdd(&total_matches,1);
-					if (num_matches >= KBC_MAX_ENTRIES_PER_BUCKET) {
-						printf("PRUNED: exceeded matches allowed per bucket MAX:%u current:%u\n", KBC_MAX_ENTRIES_PER_BUCKET, num_matches);
-					} else {
-						Index_Match match = { };
-						match.idxL = pos_L;
-						match.idxR = pos_R;
-						matches[num_matches] = match;
-					}
-					//if (doPrint>2) {
-					//	printf("Thread %u pos_L:%u Match #%u found Lx:%u, Rx:%u\n", threadId, pos_L, num_matches, L_entry.x, R_entry.x);
-					//}
-					//printf("   Match found Lx:%u, Rx:%u\n", match.Lx, match.Rx);
-				}
-			}
-		}
-
-	}
-
-	__syncthreads();
-
-	if (threadIdx.x == 0) {
-		if (doPrint>1) {
-			// only do this once, should be in constant memory
-			//if (doPrint>2) {
-			//	printf("match list\n");
-			//	for (int i=0;i<total_matches;i++) printf("  %d: idxL:%u idxR:%u\n", i, matches[i].idxL, matches[i].idxR);
-			//}
-		}
-		if (total_matches > (KBC_MAX_ENTRIES_PER_BUCKET-1)) {
-			printf("PRUNING MATCHES FROM %u to %u\n", total_matches, KBC_MAX_ENTRIES_PER_BUCKET-1);
-			total_matches = (KBC_MAX_ENTRIES_PER_BUCKET-1);
-		}
-	}
-
-	__syncthreads();
-
-	/*if ((global_kbc_L_bucket_id == 0) && (threadIdx.x == 0)) {
-
-		printf("Bucket match calc verification bucket %u num_matches: %u", global_kbc_L_bucket_id, total_matches);
-		for (int i=0;i < total_matches;i++) {
-			Index_Match match = matches[i];
-			BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL];
-			BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR];
-
-			printf("L_Entry y %u R_Entry y %u\n", L_Entry.y, R_Entry.y);
-			int16_t yr_kbc = R_Entry.y;
-			int16_t yr_bid = yr_kbc / kC; // values [0..kB]
-			int16_t yl_kbc = L_Entry.y;
-			int16_t yl_bid = yl_kbc / kC; // values [0..kB]
-			int16_t formula_one = yr_bid - yl_bid; // this should actually give m
-			if (formula_one < 0) {
-				formula_one += kB;
-			}
-			int16_t m = formula_one;
-			if (m >= kB) {
-				m -= kB;
-			}
-			printf("  m value calc: %u\n", m);
-			if (m < 64) {
-				// passed first test
-				int16_t yl_cid = yl_kbc % kC; // % kBC % kC = %kC since kBC perfectly divisible by kC
-				int16_t yr_cid = yr_kbc % kC;
-				int16_t parity = (global_kbc_L_bucket_id) % 2;
-				int16_t m2_parity_squared = (((2 * m) + parity) * ((2 * m) + parity)) % kC; // values [0..127]
-				int16_t formula_two = yr_cid - yl_cid;
-				if (formula_two < 0) {
-					formula_two += kC;
-				}
-				printf("  formula two %u <-> m2_parity %u\n", formula_two, m2_parity_squared);
-				if (formula_two == m2_parity_squared) {
-					// we have a match.
-					printf("   MATCH OK\n");
-				} else {
-					printf("   FAILED TO MATCH\n");
-				}
-			}
-		}
-
-	}*/
-
-	// now we go through all our matches and output to next round.
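-	// (The f(x) results produced below are 38-bit values: k=32 bits plus kExtraBits=6. The 6 high bits select
-	// one of the 64 batch buckets for the criss-cross layout and the low 32 bits are kept as the entry's
-	// block-local y: batch_bucket = blake_result >> 32 and pair.y = blake_result mod 2^32, which is exactly
-	// what the (38-6) shift and block_mod in the loop compute.)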
- for (int i=threadIdx.x;i < total_matches;i+=blockDim.x) { - Index_Match match = matches[i]; - BUCKETED_ENTRY_OUT pair = {}; - BUCKETED_ENTRY_IN L_Entry = kbc_L_entries[match.idxL]; - BUCKETED_ENTRY_IN R_Entry = kbc_R_entries[match.idxR]; - uint64_t blake_result; - uint64_t calc_y = CALC_Y_BUCKETED_KBC_ENTRY(L_Entry, global_kbc_L_bucket_id); - if (table == 1) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = R_Entry.meta[0]; - //nick_blake3_old(pair.meta[0], pair.meta[1], calc_y, &blake_result); // adds 500ms - nick_blake3(pair.meta, 2, calc_y, &blake_result, 0, NULL); - if (global_kbc_L_bucket_id == 1) { - //if ((calc_y == 21557) && (L_Entry.meta[0] == 3620724289) && (R_Entry.meta[0] == 2663198278)) { - printf("Got y %llu idxL:%u idxR:%u Lx: %u Rx: %u and f_result: %llu\n", calc_y, match.idxL, match.idxR, L_Entry.meta[0], R_Entry.meta[0], blake_result); - //Ly is:[20932] Lx: [322482289] Rx: [3382886636] f result:[273114646565] - //if (blake_result == 56477140042) { - // printf(" ---** BLAKE CORRECT **\n"); - //} else { - // printf(" ---** BLAKE WRONG :(((( \n"); - //} - // Ly is:[21557] Lx: [3620724289] Rx: [2663198278] f result:[56477140042] - //} - } - - } else if (table == 2) { - pair.meta[0] = L_Entry.meta[0]; - pair.meta[1] = L_Entry.meta[1]; - pair.meta[2] = R_Entry.meta[0]; - pair.meta[3] = R_Entry.meta[1]; - nick_blake3(pair.meta, 4, calc_y, &blake_result, 0, NULL); - if (global_kbc_L_bucket_id == 1) { - uint64_t Lx = (((uint64_t) pair.meta[0]) << 32) + pair.meta[1]; - uint64_t Rx = (((uint64_t) pair.meta[2]) << 32) + pair.meta[3]; - printf("Got y %llu idxL:%u idxR:%u Lx: %llu Rx: %llu and f_result: %llu\n", calc_y, match.idxL, match.idxR, Lx, Rx, blake_result); - } - } else if (table == 3) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 4, pair.meta); - } else if (table == 4) { - const uint32_t meta[8] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], L_Entry.meta[3], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], R_Entry.meta[3] - }; - nick_blake3(meta, 8, calc_y, &blake_result, 3, pair.meta); - } else if (table == 5) { - const uint32_t meta[6] = { - L_Entry.meta[0], L_Entry.meta[1], L_Entry.meta[2], - R_Entry.meta[0], R_Entry.meta[1], R_Entry.meta[2], - }; - nick_blake3(meta, 6, calc_y, &blake_result, 2, pair.meta); - } else if (table == 6) { - const uint32_t meta[4] = { - L_Entry.meta[0], L_Entry.meta[1], - R_Entry.meta[0], R_Entry.meta[1] - }; - nick_blake3(meta, 4, calc_y, &blake_result, 0, NULL); - } - uint64_t batch_bucket = blake_result >> (38-6); - const uint64_t block_mod = (uint64_t) 1 << (38-6); - pair.y = (uint32_t) (blake_result % block_mod); - int block_slot = atomicAdd(&out_bucket_counts[batch_bucket],1); - uint32_t pair_address = batch_bucket * HOST_MAX_BLOCK_ENTRIES + block_slot; - if (pair_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) { - printf("ERROR: results address overflow\n"); - } else { - bucketed_out[pair_address] = pair; - } - - // do we have a double bucket to write into? 
- //uint32_t double_bucket_id = 0; - //uint32_t kbc_bucket_id = blake_result / kBC; - //uint64_t batch_bucket_min_kbc = (batch_bucket << 32) / kBC; - //uint64_t batch_bucket_max_kbc = ((batch_bucket+1) << 32) / kBC; - //if (kbc_bucket_id == batch_bucket_min_kbc) { - // double_bucket_id = batch_bucket - 1; - //} else if (kbc_bucket_id == batch_bucket_max_kbc) { - // double_bucket_id = batch_bucket + 1; - //} - } - - if (threadIdx.x == 0) { - //if ((doPrint > 0) && (global_kbc_L_bucket_id < 10 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - if ((global_kbc_L_bucket_id % 25000 == 0)) printf(" matches kbc bucket: %u num_L:%u num_R:%u pairs:%u\n", global_kbc_L_bucket_id, num_L, num_R, total_matches); - - } - /* - kBC bucket id: 0 L entries: 222 R entries: 242 matches: 219 - kBC bucket id: 1 L entries: 242 R entries: 257 matches: 248 - kBC bucket id: 2 L entries: 257 R entries: 204 matches: 222 - kBC bucket id: 3 L entries: 204 R entries: 243 matches: 185 - Total matches: 4294859632 - - Computing table 3 - Bucket 0 uniform sort. Ram: 7.678GiB, u_sort min: 2.250GiB, qs min: 0.563GiB. - kBC bucket id: 0 L entries: 228 R entries: 253 matches: 276 - kBC bucket id: 1 L entries: 253 R entries: 230 matches: 227 - kBC bucket id: 2 L entries: 230 R entries: 232 matches: 212 - kBC bucket id: 3 L entries: 232 R entries: 237 matches: 221 - Total matches: 4294848520 - */ - if (threadIdx.x == 0) { - if (table == 1) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==222) && (num_R==242) && (total_matches==219)) { - printf("- TABLE 1 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 1 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 240 R entries: 233 matches: 232 - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==240) && (num_R==233) && (total_matches==232)) { - printf("- TABLE 1 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 1 bucket 4000000 MATCHES WRONG! num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } - } - } - if (table == 2) { - if (global_kbc_L_bucket_id == 0) { - if ((num_L==228) && (num_R==253) && (total_matches==276)) { - printf("- TABLE 2 MATCHES CORRECT -\n"); - } else { - printf("*** TABLE 2 MATCHES WRONG! ***\n"); - } - } - //kBC bucket id: 4000000 L entries: 241 R entries: 238 matches: 224 - - if (global_kbc_L_bucket_id == 4000000) { - if ((num_L==241) && (num_R==238) && (total_matches==224)) { - printf("- TABLE 2 bucket 4000000 MATCHES CORRECT num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches); - } else { - printf("*** TABLE 2 bucket 4000000 MATCHES WRONG! 
num_L:%u num_R:%u matches:%u-\n", num_L, num_R, total_matches);
-				}
-			}
-		}
-	}
-}
-
-#define KBCFILTER_WITH_XINCLUDES(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	for (int j=0;j<64;j++) { \
-		if (include_xs[j] == (x+i)) { printf("including x %u\n", (x+i)); \
-			if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
-				uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
-				int slot = atomicAdd(&kbc_local_num_entries[local_kbc_bucket_id],1); \
-				F1_Bucketed_kBC_Entry entry = { (x+i), (uint32_t) (y % kBC) }; \
-				if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-				uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-				kbc_local_entries[entries_address] = entry; \
-			} \
-	} } \
-}
-
-//if ((x + i) < 256) { printf("x: %u y:%llu kbc:%u\n", (x+i), y, kbc_bucket_id); }
-#define KBCFILTER(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	for (int j=0;j<64;j++) { \
-		if (include_xs[j] == (x+i)) { printf("including x %u\n", (x+i)); \
-			if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
-				uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
-				int slot = atomicAdd(&kbc_local_num_entries[local_kbc_bucket_id],1); \
-				F1_Bucketed_kBC_Entry entry = { (x+i), (uint32_t) (y % kBC) }; \
-				if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-				uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-				kbc_local_entries[entries_address] = entry; \
-			} \
-	} } \
-}
-
-//if ((x + i) < 256) { printf("x: %u y:%llu kbc:%u\n", (x+i), y, kbc_bucket_id); }
-//if (((x+i) % (1024*1024)) == 0) { printf("x: %u chacha: %u y:%llu kbc:%u\n", (x+i), chacha_y, y, kbc_bucket_id); }
-//if (kbc_bucket_id == 0) { printf("x: %u chacha: %u y:%llu kbc:%u\n", (x+i), chacha_y, y, kbc_bucket_id); }
-
-#undef KBCFILTER // redefine the production filter without the include_xs debug scan
-#define KBCFILTER(chacha_y,i) \
-{ \
-	uint64_t y = (((uint64_t) chacha_y) << 6) + (x >> 26); \
-	uint32_t kbc_bucket_id = uint32_t (y / kBC); \
-	if ((kbc_bucket_id >= KBC_START) && (kbc_bucket_id <= KBC_END)) { \
-		uint32_t local_kbc_bucket_id = kbc_bucket_id - KBC_START; \
-		int slot = atomicAdd(&kbc_local_num_entries[local_kbc_bucket_id],1); \
-		F1_Bucketed_kBC_Entry entry = { (x+i), (uint32_t) (y % kBC) }; \
-		if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) { printf("ERROR KBC OVERFLOW MAX:%u actual:%u", KBC_MAX_ENTRIES_PER_BUCKET, slot); } \
-		uint32_t entries_address = local_kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + slot; \
-		kbc_local_entries[entries_address] = entry; \
-	} \
-}
-
-__global__
-void gpu_chacha8_get_k32_keystream_into_local_kbc_entries(const uint32_t N,
-		const __restrict__ uint32_t *input, F1_Bucketed_kBC_Entry *kbc_local_entries, int *kbc_local_num_entries,
-		uint32_t KBC_START, uint32_t KBC_END)
-{
-	uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
-
-	int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16;
-	int stride = blockDim.x * gridDim.x;
-	const uint32_t end_n = N / 16; // 16 x's in each group
-	/*const uint32_t include_xs[64] = {602009779,2127221679,3186459061,443532047,1234434947,1652736830,396228306,464118917,
-			3981993340,3878862024,1730679522,3234011360,521197720,2635193875,2251292298,608281027,
			1468569780,2075860307,2880258779,999340005,1240438978,4293399624,4226635802,1031429862,
-			2391120891,3533658526,3823422504,3983813271,4180778279,2403148863,2441456056,319558395,
-			2338010591,196206622,1637393731,853158574,2704638588,2368357012,1703808356,451208700,
-			2145291166,2741727812,3305809226,1748168268,415625277,3051905493,4257489502,1429077635,
-			2438113590,3028543211,3993396297,2678430597,458920999,889121073,3577485087,1822568056,
-			2222781147,1942400192,195608354,1460166215,2544813525,3231425778,2958837604,2710532969};*/
-
-	uint32_t x_group = index;
-	//for (uint32_t x_group = index; x_group <= end_n; x_group += stride) {
-		uint32_t x = x_group << 4;// *16;
-		uint32_t pos = x_group;
-
-		x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7];
-		x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11];
-		x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32;
-		x14 = input[14];x15 = input[15];
-
-		#pragma unroll
-		for (int i = 0; i < 4; i++) {
-			QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15);
-			QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14);
-		}
-
-		x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4];
-		x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9];
-		x10 += input[10];x11 += input[11];x12 += pos; // j12;//x13 += 0;
-		x14 += input[14];x15 += input[15];
-
-		// convert to little endian/big endian whatever, chia needs it like this
-		BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5);
-		BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11);
-		BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15);
-
-		//uint64_t y = x0 << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22
-		//int nick_bucket_id; // = x0 >> 22; // gives bucket id 0..1023
-		KBCFILTER(x0,0);KBCFILTER(x1,1);KBCFILTER(x2,2);KBCFILTER(x3,3);
-		KBCFILTER(x4,4);KBCFILTER(x5,5);KBCFILTER(x6,6);KBCFILTER(x7,7);
-		KBCFILTER(x8,8);KBCFILTER(x9,9);KBCFILTER(x10,10);KBCFILTER(x11,11);
-		KBCFILTER(x12,12);KBCFILTER(x13,13);KBCFILTER(x14,14);KBCFILTER(x15,15);
-	//}
-}
-
-__global__
-void gpu_print_kbc_counts(int *local_kbc_counts) {
-	for (int i = 0; i < 10/*KBC_LOCAL_NUM_BUCKETS*/; i++) {
-		printf("kbc bucket: %u num:%u\n", i, local_kbc_counts[i]);
-	}
-}
-
-template <typename BUCKETED_ENTRY>
-__global__
-void gpu_print_kbc_bucket_contents(BUCKETED_ENTRY *entries, int *local_kbc_counts) {
-	for (uint32_t kbc_bucket_id = 0; kbc_bucket_id < 4/*KBC_LOCAL_NUM_BUCKETS*/; kbc_bucket_id++) {
-		int num = local_kbc_counts[kbc_bucket_id];
-		uint64_t add_Y = CALC_KBC_BUCKET_ADD_Y(kbc_bucket_id);
-		printf("kbc bucket: %u num:%u\n", kbc_bucket_id, num);
-		for (int idxL=0;idxL<num;idxL++) {
-			BUCKETED_ENTRY entry = entries[kbc_bucket_id * KBC_MAX_ENTRIES_PER_BUCKET + idxL];
-			printf("   [%d] y:%u global_y:%llu meta[0]:%u\n", idxL, entry.y, (uint64_t) entry.y + add_Y, entry.meta[0]);
-		}
-	}
-}
-
-template <typename BUCKETED_ENTRY>
-__global__
-void gpu_merge_block_buckets_into_kbc_buckets(
-		const uint32_t KBC_START_ID, // determined by batch_id
-		const BUCKETED_ENTRY *in, uint64_t batch_bucket_add_Y, const uint32_t N,
-		BUCKETED_ENTRY *local_kbc_entries, int *local_kbc_counts)
-{
-	uint32_t i = blockIdx.x*blockDim.x+threadIdx.x;
-	//for (int i = 0; i < N; i++) {
-	if (i < N) {
-		// TODO: try just reading out entries and see if they match when going in
-
-		BUCKETED_ENTRY block_entry = in[i];
-		uint64_t calc_y = (uint64_t) block_entry.y + batch_bucket_add_Y;
-		uint32_t kbc_id = calc_y / kBC;
-		uint32_t KBC_END_ID = KBC_START_ID + KBC_LOCAL_NUM_BUCKETS;
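-		// (calc_y rebuilds the global 38-bit y: entries were stored with only y mod 2^32 inside their batch
-		// bucket, and batch_bucket_add_Y = batch_id * 2^32 restores the top bits before re-bucketing by kBC.)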
-		if ((kbc_id < KBC_START_ID) || (kbc_id > KBC_END_ID)) {
-			printf(" i:%u entry.y:%u add_Y:%llu calc_y:%llu OUT OF RANGE: kbc id: %u KBC_LOCAL_NUM_BUCKETS:%u START:%u END:%u\n", i, block_entry.y, batch_bucket_add_Y, calc_y, kbc_id, KBC_LOCAL_NUM_BUCKETS, KBC_START_ID, KBC_END_ID);
-		}
-
-		uint32_t local_kbc_id = kbc_id - KBC_START_ID;
-		int slot = atomicAdd(&local_kbc_counts[local_kbc_id],1);
-		uint32_t destination_address = local_kbc_id * KBC_MAX_ENTRIES_PER_BUCKET + slot;
-
-		//printf("block_id:%u [i: %u] entry.y:%u kbc_id:%u local_kbc:%u slot:%u dest:%u\n",
-		//		block_id, i, block_entry.y, kbc_id, local_kbc_id, slot, destination_address);
-
-		if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) {
-			printf("OVERFLOW: slot >= MAX ENTRIES PER BUCKET\n");
-		}
-		if (destination_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-			printf("OVERFLOW: destination_address overflow >= DEVICE_BUFFER_ALLOCATED_ENTRIES %u\n", destination_address);
-		}
-		block_entry.y = calc_y % kBC; // hah! Don't forget to map it to kbc bucket form.
-		local_kbc_entries[destination_address] = block_entry;
-	}
-}
-
-template <typename BUCKETED_ENTRY, typename BUCKETED_ENTRY_BLOCKPOSREF>
-__global__
-void gpu_merge_block_buckets_into_kbc_buckets_with_blockposref(
-		const uint32_t KBC_START_ID, const uint32_t block_id, // determined by batch_id
-		const BUCKETED_ENTRY *in, uint64_t batch_bucket_add_Y, const uint32_t N,
-		BUCKETED_ENTRY_BLOCKPOSREF *local_kbc_entries, int *local_kbc_counts,
-		int metasize)
-{
-	uint32_t i = blockIdx.x*blockDim.x+threadIdx.x;
-	//for (int i = 0; i < N; i++) {
-	if (i < N) {
-		// TODO: try just reading out entries and see if they match when going in
-
-		BUCKETED_ENTRY block_entry = in[i];
-		BUCKETED_ENTRY_BLOCKPOSREF backref_entry = {};
-		//size_t n = sizeof(block_entry.meta)/sizeof(block_entry.meta[0]);
-		for (int s=0;s<metasize;s++) {
-			backref_entry.meta[s] = block_entry.meta[s];
-		}
-		uint64_t calc_y = (uint64_t) block_entry.y + batch_bucket_add_Y;
-		uint32_t kbc_id = calc_y / kBC;
-		uint32_t KBC_END_ID = KBC_START_ID + KBC_LOCAL_NUM_BUCKETS;
-		backref_entry.y = calc_y % kBC; // mapped to kbc bucket form, as in the kernel above
-		backref_entry.blockposref = (block_id << 26) + i; // assumption: block_id packed into the high bits, in-block position i into the low bits
-		if ((kbc_id < KBC_START_ID) || (kbc_id > KBC_END_ID)) {
-			printf(" i:%u entry.y:%u add_Y:%llu calc_y:%llu OUT OF RANGE: kbc id: %u KBC_LOCAL_NUM_BUCKETS:%u START:%u END:%u\n", i, block_entry.y, batch_bucket_add_Y, calc_y, kbc_id, KBC_LOCAL_NUM_BUCKETS, KBC_START_ID, KBC_END_ID);
-		}
-
-		uint32_t local_kbc_id = kbc_id - KBC_START_ID;
-		int slot = atomicAdd(&local_kbc_counts[local_kbc_id],1);
-		uint32_t destination_address = local_kbc_id * KBC_MAX_ENTRIES_PER_BUCKET + slot;
-
-		//printf("block_id:%u [i: %u] entry.y:%u kbc_id:%u local_kbc:%u slot:%u dest:%u\n",
-		//		block_id, i, block_entry.y, kbc_id, local_kbc_id, slot, destination_address);
-
-		if (slot >= KBC_MAX_ENTRIES_PER_BUCKET) {
-			printf("OVERFLOW: slot >= MAX ENTRIES PER BUCKET\n");
-		}
-		if (destination_address >= DEVICE_BUFFER_ALLOCATED_ENTRIES) {
-			printf("OVERFLOW: destination_address overflow >= DEVICE_BUFFER_ALLOCATED_ENTRIES %u\n", destination_address);
-		}
-		//block_entry.y = calc_y % kBC; // hah! Don't forget to map it to kbc bucket form.
-		local_kbc_entries[destination_address] = backref_entry;
-	}
-}
-
-void transferBlocksFromHostToDevice(const uint16_t table, const uint32_t batch_id,
-		char *device_buffer_in, char *device_buffer_kbc, const size_t DEVICE_ENTRY_SIZE) {
-	uint32_t KBC_START = MIN_KBC_BUCKET_FOR_BATCH(batch_id);
-
-	// consider compressing stream to cpu
-	// https://developer.nvidia.com/blog/optimizing-data-transfer-using-lossless-compression-with-nvcomp/
-
-	// clear local kbc's!
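-	// (This memset matters: the per-bucket counters are the allocation cursors for the atomicAdd slot
-	// assignment inside the merge kernels above, so stale counts from the previous batch would scatter this
-	// batch's entries past the real bucket ends.)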
- CUDA_CHECK_RETURN(cudaMemset(device_local_kbc_num_entries, 0, KBC_LOCAL_NUM_BUCKETS*sizeof(int))); - - uint64_t device_bytes_start = 0; - uint32_t total_entries_copied = 0; - for (uint32_t block_id = 0; block_id < BATCHES; block_id++) { - //std::cout << "\n Preparing batch:" << batch_id << " block:" << block_id << " for host->device" << std::endl; - uint32_t criss_cross_id = getCrissCrossBlockId(table,batch_id,block_id); - //std::cout << " criss_cross_id:" << criss_cross_id << std::endl; - uint32_t num_entries_to_copy = host_criss_cross_entry_counts[criss_cross_id]; - //std::cout << " num_entries_to_copy: " << num_entries_to_copy << std::endl; - uint64_t host_block_entry_start_position = getCrissCrossBlockEntryStartPosition(criss_cross_id); - uint64_t host_bytes_start = host_block_entry_start_position * HOST_UNIT_BYTES; - //std::cout << " host_block_entry_start_position: " << host_block_entry_start_position << std::endl; - //std::cout << " host_bytes_start: " << host_bytes_start << std::endl; - total_entries_copied += num_entries_to_copy; - - if (num_entries_to_copy > HOST_MAX_BLOCK_ENTRIES) { - std::cout << "OVERFLOW: num_entries_to_copy " << num_entries_to_copy << " > HOST_MAX_BLOCK_ENTRIES " << HOST_MAX_BLOCK_ENTRIES << std::endl; - } - - size_t bytes_to_copy = num_entries_to_copy*DEVICE_ENTRY_SIZE; - if (device_bytes_start + bytes_to_copy > DEVICE_BUFFER_ALLOCATED_BYTES) { - std::cout << "ERROR: DEVICE BUFFER OVERFLOW\n size wanted: " << (device_bytes_start + bytes_to_copy) << " size available:" << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - } - if (host_bytes_start + bytes_to_copy > HOST_ALLOCATED_BYTES) { - std::cout << "ERROR: HOST MEM OVERFLOW\n size wanted: " << (host_bytes_start + bytes_to_copy) << " size available:" << HOST_ALLOCATED_BYTES << std::endl; - } - - /* - Total tables time: 73825 ms - match: 10377 ms - ---------- -transfer time: 61610 ms - bytes: 687109273160 (639GB) - - - ******- no pci transfer, do direct fro mhost...saved 7s or 10% (ok, we don't include writing to disk) *************** -Total tables time: 66989 ms - match: 10358 ms - ---------- -transfer time: 54805 ms - bytes: 687109273464 (639GB) -********************* - */ - - //std::cout << " Copying " << num_entries_to_copy - // << " entries from device_bytes_start: " << device_bytes_start - // << " to host_bytes_start: " << host_bytes_start - // << " bytes length: " << bytes_to_copy << std::endl; - //std::cout << " Block_id: " << block_id << " device->host bytes:" << bytes_to_copy << " entries:" << num_entries_to_copy << std::endl; - const bool use_direct_from_host = true; - if (!use_direct_from_host) { - CUDA_CHECK_RETURN(cudaMemcpy(&device_buffer_in[device_bytes_start], &host_criss_cross_blocks[host_bytes_start],bytes_to_copy,cudaMemcpyHostToDevice)); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - } - //std::cout << " done.\n"; - - // now for our block, determine what the kbc counts were, and merge entries ordered into global kbc's. 
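-		// (One thread per entry: numBlocks below rounds up so that numBlocks * blockSize >= num_entries_to_copy,
-		// and the merge kernels bounds-check i < N for the final partial block; e.g. 1,000,000 entries at
-		// blockSize 256 gives numBlocks = 3907.)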
-		// gpu_map_in_buffer_to_global_kbc_for_batch(device_buffer_in, device_buffer_out, num_entries_to_copy);
-		int blockSize = 256;
-		int numBlocks = (num_entries_to_copy + blockSize - 1) / (blockSize);
-		uint64_t batch_bucket_add_Y = CALC_BATCH_BUCKET_ADD_Y(batch_id);//(((uint64_t) 1) << (38-6)) * ((uint64_t) batch_id);
-		if (table == 2) {
-			Tx_Bucketed_Meta2 *in;
-			if (use_direct_from_host) in = (Tx_Bucketed_Meta2 *) &host_criss_cross_blocks[host_bytes_start];
-			else in = (Tx_Bucketed_Meta2 *) &device_buffer_in[device_bytes_start];
-			Tx_Bucketed_Meta2 *local_kbc_entries = (Tx_Bucketed_Meta2 *) &device_buffer_kbc[0];
-			gpu_merge_block_buckets_into_kbc_buckets<<<numBlocks, blockSize>>>(
-					KBC_START,
-					in, batch_bucket_add_Y, num_entries_to_copy,
-					local_kbc_entries, device_local_kbc_num_entries);
-		} else if ((table == 3) || (table == 4)) {
-			Tx_Bucketed_Meta4 *in;
-			if (use_direct_from_host) in = (Tx_Bucketed_Meta4 *) &host_criss_cross_blocks[host_bytes_start];
-			else in = (Tx_Bucketed_Meta4 *) &device_buffer_in[device_bytes_start];
-			//Tx_Bucketed_Meta4 *in = (Tx_Bucketed_Meta4 *) &device_buffer_in[device_bytes_start];
-			//Tx_Bucketed_Meta4 *local_kbc_entries = (Tx_Bucketed_Meta4 *) &device_buffer_kbc[0];
-			Tx_Bucketed_Meta4_Blockposref *local_kbc_entries = (Tx_Bucketed_Meta4_Blockposref *) &device_buffer_kbc[0];
-			gpu_merge_block_buckets_into_kbc_buckets_with_blockposref<<<numBlocks, blockSize>>>(
-					KBC_START,block_id,
-					in, batch_bucket_add_Y, num_entries_to_copy,
-					local_kbc_entries, device_local_kbc_num_entries,
-					4);
-		} else if (table == 5) {
-			Tx_Bucketed_Meta3 *in;
-			if (use_direct_from_host) in = (Tx_Bucketed_Meta3 *) &host_criss_cross_blocks[host_bytes_start];
-			else in = (Tx_Bucketed_Meta3 *) &device_buffer_in[device_bytes_start];
-			//Tx_Bucketed_Meta3 *in = (Tx_Bucketed_Meta3 *) &device_buffer_in[device_bytes_start];
-			Tx_Bucketed_Meta3_Blockposref *local_kbc_entries = (Tx_Bucketed_Meta3_Blockposref *) &device_buffer_kbc[0];
-			gpu_merge_block_buckets_into_kbc_buckets_with_blockposref<<<numBlocks, blockSize>>>(
-					KBC_START,block_id,
-					in, batch_bucket_add_Y, num_entries_to_copy,
-					local_kbc_entries, device_local_kbc_num_entries,
-					3);
-		} else if (table == 6) {
-			Tx_Bucketed_Meta2 *in;
-			if (use_direct_from_host) in = (Tx_Bucketed_Meta2 *) &host_criss_cross_blocks[host_bytes_start];
-			else in = (Tx_Bucketed_Meta2 *) &device_buffer_in[device_bytes_start];
-			//Tx_Bucketed_Meta2 *in = (Tx_Bucketed_Meta2 *) &device_buffer_in[device_bytes_start];
-			Tx_Bucketed_Meta2_Blockposref *local_kbc_entries = (Tx_Bucketed_Meta2_Blockposref *) &device_buffer_kbc[0];
-			gpu_merge_block_buckets_into_kbc_buckets_with_blockposref<<<numBlocks, blockSize>>>(
-					KBC_START,block_id,
-					in, batch_bucket_add_Y, num_entries_to_copy,
-					local_kbc_entries, device_local_kbc_num_entries,
-					2);
-		}
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-		device_bytes_start += bytes_to_copy;
-		table_transfer_in_bytes += bytes_to_copy;
-	}
-	//std::cout << "\nTotal entries copied in batch " << batch_id << ": " << total_entries_copied << std::endl;
-}
-
-
-
-int mmap_fdout;
-char *mmap_address;
-void setupMMap(size_t desired_size_bytes) {
-
-	int mode = 0x0777;
-
-	std::string filename = "/mnt/kioxia/tmp/test-mmap.tmp";
-
-	std::cout << "Setting up MMap with " << desired_size_bytes << " bytes in file: " << filename << std::endl;
-
-	if ((mmap_fdout = open (filename.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode )) < 0) {
-		std::cout << "can't create " << filename << " for writing" << std::endl;
-		return;
-	}
-
-	/* go to the location corresponding to the last byte */
-	if (lseek (mmap_fdout, desired_size_bytes,
SEEK_SET) == -1) { - printf ("lseek error"); - return; - } - - /* write a dummy byte at the last location */ - if (write (mmap_fdout, "", 1) != 1) { - printf ("write error"); - return; - } - - if ((mmap_address = (char *) mmap (0, desired_size_bytes, PROT_READ | PROT_WRITE, MAP_SHARED, mmap_fdout, 0)) == (caddr_t) -1) { - printf ("mmap error for output"); - return; - } - - std::cout << "MMap done." << std::endl; -} - -inline void writeHostMemToMMap(uint32_t address, char *host_mem, uint32_t bytes_to_copy) { - //std::string filename = "/mnt/kioxia/tmp/test" + std::to_string(table) + "-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - //std::cout << "Writing to file " << filename << std::endl; - //FILE* pFile; - //pFile = fopen(filename.c_str(), "wb"); // 41228ms for block level writing, 40912ms for batch writing?? - //fwrite(host_mem, 1, bytes_to_copy, pFile); - //fclose(pFile); - memcpy(mmap_address, host_mem, bytes_to_copy); -} - - - -void convertAndWriteT2HostMemToBlockFiles( - uint16_t batch_id, uint16_t block_id, - Tx_Bucketed_Meta4 *t2_data, // will take meta[0] and meta[2] for Lx1 and Lx2 - uint32_t num_entries_to_copy) { - - if (num_entries_to_copy == 0) { - return; - } - // first convert to memory - T2BaseRef *t2_base = (T2BaseRef *) host_refdata_blocks; - for (int i=0;i HOST_MAX_BLOCK_ENTRIES) { - std::cout << "OVERFLOW: num_entries_to_copy " << num_entries_to_copy << " > HOST_MAX_BLOCK_ENTRIES " << HOST_MAX_BLOCK_ENTRIES << std::endl; - } - if (max_block_entries_copied_device_to_host < num_entries_to_copy) { - max_block_entries_copied_device_to_host = num_entries_to_copy; // helps determine HOST_MAX_BLOCK_ENTRIES value. - } - - uint64_t device_bytes_start = device_entry_start * DEVICE_ENTRY_SIZE; - size_t bytes_to_copy = num_entries_to_copy*DEVICE_ENTRY_SIZE; - if (device_bytes_start + bytes_to_copy > DEVICE_BUFFER_ALLOCATED_BYTES) { - std::cout << "ERROR: DEVICE BUFFER OVERFLOW\n size wanted: " << (device_bytes_start + bytes_to_copy) << " size available:" << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - } - if (host_bytes_start + bytes_to_copy > HOST_ALLOCATED_BYTES) { - std::cout << "ERROR: HOST MEM OVERFLOW\n size wanted: " << (host_bytes_start + bytes_to_copy) << " size available:" << HOST_ALLOCATED_BYTES << std::endl; - } - //if (doPrint) std::cout << " Copying " << num_entries_to_copy - // << " entries from device_bytes_start: " << device_bytes_start - // << " to host_bytes_start: " << host_bytes_start - // << " bytes length: " << bytes_to_copy << std::endl; - //std::cout << " Block_id: " << block_id << " device->host bytes:" << bytes_to_copy << " entries:" << num_entries_to_copy << std::endl; - - if (table < 6) { - // we only copy criss cross memory if it's not the last table, since that only exports back ref data and no forward propagation. - CUDA_CHECK_RETURN(cudaMemcpy(&host_criss_cross_blocks[host_bytes_start],&device_buffer[device_bytes_start],bytes_to_copy,cudaMemcpyDeviceToHost)); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - batch_bytes_transfered += bytes_to_copy; - } - if (doPrint) std::cout << " done.\n"; - - //if (table == 1) { - // oof...mmap is 47000ms transfer for T1 - // memcpy(mmap_address + total_transfered_bytes, &host_criss_cross_blocks[host_bytes_start], bytes_to_copy); - //writeHostMemToMMap(total_transfered_bytes, &host_criss_cross_blocks[host_bytes_start], bytes_to_copy); - //} - - - - // for T2 we dump to file, since this becomes the baseline with 4 meta entries for x's. 
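-	// (Per the conversion helper above, only meta[0] and meta[2] of each T2 entry need to survive as a
-	// T2BaseRef; everything in later tables stays reachable through back references, which is why T2 is
-	// treated as the baseline table here.)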
- /*if (table == 2) { - // 42241 ms - a wash whether we write 4 x's in T2 or use 2'xs in T1 and write a 64bit ref here. - // BUT- our goals is to get kbc's, so base level T2 can just write 2 kbc entries (only 50 bits (25 kbc * 2) but we need - // CPU to process the entry and split into the proper reference buckets at this stage. 64 batches already splits 18m kbc's - // down into 285k kbc's so should help with mem buffer. OR...we could do this in GPU and just use CPU to dumb copy, ay? - // BUUUTTT - we need enough spare memory so have to do it at end of entire first phase process. - // tables was 56..yeesh - if (doWriteT2BaseData) { - Tx_Bucketed_Meta4 *t2_data = (Tx_Bucketed_Meta4 *) &host_criss_cross_blocks[host_bytes_start]; - convertAndWriteT2HostMemToBlockFiles(batch_id, block_id, t2_data, num_entries_to_copy); - } - }*/ - if (table == 3) { - if (doWriteT3BaseData) { - uint64_t refdata_bytes_start; - size_t refdata_bytes_to_copy; - refdata_bytes_start = device_entry_start * sizeof(T3BaseRef); - refdata_bytes_to_copy = num_entries_to_copy*sizeof(T3BaseRef); - - if (refdata_bytes_start + bytes_to_copy > DEVICE_BUFFER_ALLOCATED_BYTES) { - std::cout << "ERROR: DEVICE REFDATA OVERFLOW\n size wanted: " << (refdata_bytes_start + refdata_bytes_to_copy) << " size available:" << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - } - CUDA_CHECK_RETURN(cudaMemcpy(&host_refdata_blocks[refdata_bytes_start],&device_refdata[refdata_bytes_start],refdata_bytes_to_copy,cudaMemcpyDeviceToHost)); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - // now write to files - writeT3BaseDataToBlockFiles(batch_id, block_id, - &host_refdata_blocks[refdata_bytes_start], - num_entries_to_copy, refdata_bytes_to_copy); - } - } - if (table > 3) { - // transfer back ref - if (table == 6) doWriteRefData = doWriteT6Data; - if (doWriteRefData) { - uint64_t refdata_bytes_start; - size_t refdata_bytes_to_copy; - if (table == 6) { - refdata_bytes_start = device_entry_start * sizeof(T6BackRef); - refdata_bytes_to_copy = num_entries_to_copy*sizeof(T6BackRef); - } else { - refdata_bytes_start = device_entry_start * sizeof(BackRef); - refdata_bytes_to_copy = num_entries_to_copy*sizeof(BackRef); - } - if (refdata_bytes_start + bytes_to_copy > DEVICE_BUFFER_ALLOCATED_BYTES) { - std::cout << "ERROR: DEVICE REFDATA OVERFLOW\n size wanted: " << (refdata_bytes_start + refdata_bytes_to_copy) << " size available:" << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - } - CUDA_CHECK_RETURN(cudaMemcpy(&host_refdata_blocks[refdata_bytes_start],&device_refdata[refdata_bytes_start],refdata_bytes_to_copy,cudaMemcpyDeviceToHost)); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - // now write to files - writeHostRefdataToBlockFiles(table, batch_id, block_id, &host_refdata_blocks[refdata_bytes_start], num_entries_to_copy, refdata_bytes_to_copy); - } - } - - } - //fclose(pFile); - //std::cout << "Waiting for writes to finish..."; - //for(uint8_t i=0;i>>(calc_N, chacha_input, - local_kbc_entries, local_kbc_num_entries, KBC_START, KBC_END); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - finish = std::chrono::high_resolution_clock::now(); - total_chacha_time_ms += std::chrono::duration_cast(finish - start).count(); - //std::cout << " done. 
" << std::chrono::duration_cast(finish - start).count() << " ms\n"; - - // 2) gpu find_f1_matches from (F1_Bucketed_kBC_Entry *) bufferA to (T1_Pairing_Chunk *) bufferB - std::cout << " Finding matches..."; - cudaEvent_t mstart, mstop; - float milliseconds = 0; - cudaEventCreate(&mstart); - cudaEventCreate(&mstop); - - start = std::chrono::high_resolution_clock::now(); - - Tx_Bucketed_Meta1 *bucketed_kbc_entries_in = (Tx_Bucketed_Meta1 *) device_buffer_A; - Tx_Bucketed_Meta2 *bucketed_out = (Tx_Bucketed_Meta2 *) device_buffer_B; - - CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts, 0, (BATCHES)*sizeof(int))); // 128 is 2046, 384 is 1599 - cudaEventRecord(mstart); - gpu_find_tx_matches<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(1, batch_id, KBC_START, KBC_END, - bucketed_kbc_entries_in, local_kbc_num_entries, - bucketed_out, device_block_entry_counts); - cudaEventRecord(mstop); - cudaEventSynchronize(mstop); - cudaEventElapsedTime(&milliseconds, mstart, mstop); - std::cout << "gpu_find_tx_matches time: " << milliseconds << " ms\n"; - //gpu_find_tx_matches<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(1, batch_id, KBC_START, KBC_END, - // bucketed_kbc_entries_in, local_kbc_num_entries, - // host_criss_cross_blocks, device_block_entry_counts); - - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - finish = std::chrono::high_resolution_clock::now(); - total_match_time_ms += std::chrono::duration_cast(finish - start).count(); - std::cout << " done. " << std::chrono::duration_cast(finish - start).count() << " ms\n"; - - - // 4) gpu cp (T1_Pairing_Chunk *) bufferB into (T1_Bucketed_kBC_Entry *) bufferA - total_gpu_time_ms += std::chrono::duration_cast(finish - batch_start).count(); - //std::cout << " transferBucketedBlocksFromDeviceToHost\n"; - start = std::chrono::high_resolution_clock::now(); - transferBucketedBlocksFromDeviceToHost(1, batch_id, device_buffer_B, sizeof(Tx_Bucketed_Meta2), NULL, device_block_entry_counts); - finish = std::chrono::high_resolution_clock::now(); - table_transfer_out_time_ms += std::chrono::duration_cast(finish - start).count(); - //std::cout << " done. 
" << std::chrono::duration_cast(finish - start).count() << " ms\n"; -} - -void doTxBatch(uint16_t table, uint32_t batch_id) { - // 1) host to device transfer -> bufferB = (T1_Bucketed_kBC_Entry *) bufferB - // 2) gpu find_f1_matches from (T1_Bucketed_kBC_Entry *) bufferB to (T2_Pairing_Chunk *) bufferA - // 3) gpu exclusive scan kbc_counts to get kbc_memory_positions by blocks, and kbc_block_counts - // 4) gpu cp (T2_Pairing_Chunk *) bufferB into (T2_Bucketed_kBC_Entry *) bufferA - // 5) device to host transfer bufferA - auto batch_start = std::chrono::high_resolution_clock::now(); - auto start = std::chrono::high_resolution_clock::now(); - auto finish = std::chrono::high_resolution_clock::now(); - - size_t transfer_in_size = 0; - size_t transfer_out_size = 0; - if (table == 2) { - transfer_in_size = sizeof(Tx_Bucketed_Meta2); - transfer_out_size = sizeof(Tx_Bucketed_Meta4); - } - else if (table == 3) { - transfer_in_size = sizeof(Tx_Bucketed_Meta4); - transfer_out_size = sizeof(Tx_Bucketed_Meta4); - } - else if (table == 4) { - transfer_in_size = sizeof(Tx_Bucketed_Meta4); - transfer_out_size = sizeof(Tx_Bucketed_Meta3); - } - else if (table == 5) { - transfer_in_size = sizeof(Tx_Bucketed_Meta3); - transfer_out_size = sizeof(Tx_Bucketed_Meta2); - } - else if (table == 6) { - transfer_in_size = sizeof(Tx_Bucketed_Meta2); - transfer_out_size = 0; - // TODO: T6 could transfer to hostmem or to the backref blocks table - // since we will then read from backref blocks tables for all backrefs across tables. - } - - start = std::chrono::high_resolution_clock::now(); - transferBlocksFromHostToDevice(table, batch_id, device_buffer_B, device_buffer_A, transfer_in_size); - finish = std::chrono::high_resolution_clock::now(); - table_transfer_in_time_ms += std::chrono::duration_cast(finish - start).count(); - - //gpu_print_kbc_counts<<<1,1>>>(device_local_kbc_num_entries); - - - // 2) gpu find_f1_matches from (F1_Bucketed_kBC_Entry *) bufferA to (T1_Pairing_Chunk *) bufferB - //std::cout << " Finding matches..."; - start = std::chrono::high_resolution_clock::now(); - - //if (batch_id == 0) { - // gpu_print_kbc_bucket_contents<<<1,1>>>(bucketed_kbc_entries_in, device_local_kbc_num_entries); - //} - - const uint32_t KBC_START = MIN_KBC_BUCKET_FOR_BATCH(batch_id); - const uint32_t next_batch = batch_id + 1; - const uint32_t KBC_END = MIN_KBC_BUCKET_FOR_BATCH(next_batch); - - CUDA_CHECK_RETURN(cudaMemset(device_block_entry_counts, 0, (BATCHES)*sizeof(int))); - if (table == 2) { - Tx_Bucketed_Meta2 *bucketed_kbc_entries_in = (Tx_Bucketed_Meta2 *) device_buffer_A; - Tx_Bucketed_Meta4 *bucketed_out = (Tx_Bucketed_Meta4 *) device_buffer_B; - gpu_find_tx_matches<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(table, batch_id, KBC_START, KBC_END, - bucketed_kbc_entries_in, device_local_kbc_num_entries, - bucketed_out, device_block_entry_counts); - } else if (table == 3) { - // at table 3 we start pulling in backref to table 2 - //Tx_Bucketed_Meta4 *bucketed_kbc_entries_in = (Tx_Bucketed_Meta4 *) device_buffer_A; - Tx_Bucketed_Meta4_Blockposref *bucketed_kbc_entries_in = (Tx_Bucketed_Meta4_Blockposref *) device_buffer_A; - Tx_Bucketed_Meta4 *bucketed_out = (Tx_Bucketed_Meta4 *) device_buffer_B; - gpu_find_tx_matches_with_backref<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(table, batch_id, KBC_START, KBC_END, - bucketed_kbc_entries_in, device_local_kbc_num_entries, - bucketed_out, device_buffer_refdata, device_block_entry_counts); - } else if (table == 4) { - //Tx_Bucketed_Meta4 *bucketed_kbc_entries_in = 
-		Tx_Bucketed_Meta4_Blockposref *bucketed_kbc_entries_in = (Tx_Bucketed_Meta4_Blockposref *) device_buffer_A;
-		Tx_Bucketed_Meta3 *bucketed_out = (Tx_Bucketed_Meta3 *) device_buffer_B;
-		gpu_find_tx_matches_with_backref<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(table, batch_id, KBC_START, KBC_END,
-				bucketed_kbc_entries_in, device_local_kbc_num_entries,
-				bucketed_out, device_buffer_refdata, device_block_entry_counts);
-	} else if (table == 5) {
-		//Tx_Bucketed_Meta3 *bucketed_kbc_entries_in = (Tx_Bucketed_Meta3 *) device_buffer_A;
-		Tx_Bucketed_Meta3_Blockposref *bucketed_kbc_entries_in = (Tx_Bucketed_Meta3_Blockposref *) device_buffer_A;
-		Tx_Bucketed_Meta2 *bucketed_out = (Tx_Bucketed_Meta2 *) device_buffer_B;
-		gpu_find_tx_matches_with_backref<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(table, batch_id, KBC_START, KBC_END,
-				bucketed_kbc_entries_in, device_local_kbc_num_entries,
-				bucketed_out, device_buffer_refdata, device_block_entry_counts);
-	} else if (table == 6) {
-		//Tx_Bucketed_Meta2 *bucketed_kbc_entries_in = (Tx_Bucketed_Meta2 *) device_buffer_A;
-		Tx_Bucketed_Meta2_Blockposref *bucketed_kbc_entries_in = (Tx_Bucketed_Meta2_Blockposref *) device_buffer_A;
-		Tx_Bucketed_Meta2 *NOT_USED = (Tx_Bucketed_Meta2 *) device_buffer_B;
-		gpu_find_tx_matches_with_backref<<<(KBC_END - KBC_START), THREADS_FOR_MATCHING>>>(table, batch_id, KBC_START, KBC_END,
-				bucketed_kbc_entries_in, device_local_kbc_num_entries,
-				NOT_USED, device_buffer_refdata, device_block_entry_counts);
-	}
-	CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-	finish = std::chrono::high_resolution_clock::now();
-	table_match_time_ms += std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count();
-	//std::cout << "   done. " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count() << " ms\n";
-
-
-	// 4) gpu cp (T1_Pairing_Chunk *) bufferB into (T1_Bucketed_kBC_Entry *) bufferA
-	//if (table < 6) {
-		//std::cout << "   transferBucketedBlocksFromDeviceToHost\n";
-		start = std::chrono::high_resolution_clock::now();
-		transferBucketedBlocksFromDeviceToHost(table, batch_id, device_buffer_B, transfer_out_size, device_buffer_refdata, device_block_entry_counts);
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		finish = std::chrono::high_resolution_clock::now();
-
-		table_transfer_out_time_ms += std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count();
-
-		//std::cout << "   done. " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count() << " ms\n";
-	//} else if (table == 6) {
-		// TODO: handle final T6 file...maybe this can write into hostmem instead of to file.
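-		// (For table 6, transfer_out_size was set to 0 above: no forward-propagated entries survive, only the
-		// T6BackRef data routed through device_buffer_refdata, which the later pruning/compression phases walk
-		// backwards from.)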
-	//}
-
-}
-
-void doT1() {
-
-	std::cout << "doT1 BATCHES:" << BATCHES << std::endl;
-
-	auto total_start = std::chrono::high_resolution_clock::now();
-	auto finish = std::chrono::high_resolution_clock::now(); // just to allocate
-
-	// what's faster, 0.4% of kbc's, or 0.63% of xs'
-	for (uint32_t batch_id = 0; batch_id < BATCHES; batch_id++) {
-
-		uint32_t KBC_START = MIN_KBC_BUCKET_FOR_BATCH(batch_id);
-		uint32_t KBC_END = MIN_KBC_BUCKET_FOR_BATCH(batch_id+1)-1;
-
-		auto batch_start = std::chrono::high_resolution_clock::now();
-		//if (batch_id < 2)
-		doT1Batch(batch_id, device_local_kbc_num_entries, KBC_START, KBC_END);
-		finish = std::chrono::high_resolution_clock::now();
-		//std::cout << "   ** T1 batch " << batch_id << " finished ** " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - batch_start).count() << " ms\n";
-	}
-
-	finish = std::chrono::high_resolution_clock::now();
-	std::cout << "*********************" << std::endl;
-	std::cout << "T1 Total time: " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - total_start).count() << " ms\n";
-	std::cout << "        gpu time: " << total_gpu_time_ms << " ms\n";
-	std::cout << "          chacha: " << total_chacha_time_ms << " ms\n";
-	std::cout << "           match: " << total_match_time_ms << " ms\n";
-	std::cout << "      ---------- " << std::endl;
-	std::cout << "transfer time: " << table_transfer_out_time_ms << " ms\n";
-	std::cout << "        bytes: " << table_transfer_out_bytes << " (" << (table_transfer_out_bytes/(1024*1024*1024)) << "GB)\n";
-	std::cout << "*********************" << std::endl;
-
-	total_transfer_in_time_ms += table_transfer_in_time_ms;
-	total_transfer_out_time_ms += table_transfer_out_time_ms;
-	total_transfer_in_bytes += table_transfer_in_bytes;
-	total_transfer_out_bytes += table_transfer_out_bytes;
-}
-
-void doTx(uint16_t table) {
-	std::cout << "do Table " << table <<" BATCHES:" << BATCHES << std::endl;
-
-	auto total_start = std::chrono::high_resolution_clock::now();
-	auto finish = std::chrono::high_resolution_clock::now(); // just to allocate
-
-	table_match_time_ms = 0;
-	table_transfer_in_time_ms = 0;
-	table_transfer_out_time_ms = 0;
-	table_transfer_in_bytes = 0;
-	table_transfer_out_bytes = 0;
-
-	for (uint32_t batch_id = 0; batch_id < BATCHES; batch_id++) {
-		auto batch_start = std::chrono::high_resolution_clock::now();
-		doTxBatch(table, batch_id);
-		finish = std::chrono::high_resolution_clock::now();
-		//std::cout << "   ** T" << table << " batch " << batch_id << " finished ** " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - batch_start).count() << " ms\n";
-	}
-
-	finish = std::chrono::high_resolution_clock::now();
-	std::cout << "*********************" << std::endl;
-	std::cout << "T" << table << " time: " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - total_start).count() << " ms\n";
-	std::cout << "           match: " << table_match_time_ms << " ms\n";
-	std::cout << "      ---------- " << std::endl;
-	std::cout << "transfer in time: " << table_transfer_in_time_ms << " ms\n";
-	std::cout << "        in bytes: " << table_transfer_in_bytes << " (" << (table_transfer_in_bytes/(1024*1024*1024)) << "GB)\n";
-	std::cout << "transfer out time: " << table_transfer_out_time_ms << " ms\n";
-	std::cout << "        out bytes: " << table_transfer_out_bytes << " (" << (table_transfer_out_bytes/(1024*1024*1024)) << "GB)\n";
-	std::cout << "*********************" << std::endl;
-	total_match_time_ms += table_match_time_ms;
-	total_transfer_in_time_ms += table_transfer_in_time_ms;
-	total_transfer_out_time_ms += table_transfer_out_time_ms;
-	total_transfer_in_bytes += table_transfer_in_bytes;
total_transfer_out_bytes += table_transfer_out_bytes; -} - - - - -void setupMemory() { - - //setupMMap(HOST_ALLOCATED_BYTES); // potentially useful if going to do random reads/writes to stored data - - std::cout << " device_block_entry_counts (" << BATCHES << "): " << BATCHES << " size:" << (sizeof(int)*BATCHES) << std::endl; - CUDA_CHECK_RETURN(cudaMallocManaged(&device_block_entry_counts, BATCHES*sizeof(int))); - - std::cout << " device_local_kbc_num_entries " << KBC_LOCAL_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*KBC_LOCAL_NUM_BUCKETS) << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_local_kbc_num_entries, KBC_LOCAL_NUM_BUCKETS*sizeof(int))); - - //Tx_Pairing_Chunk_Meta4 *device_buffer_A; - std::cout << " device_buffer_A " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << DEVICE_BUFFER_UNIT_BYTES << ") = " << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_A, DEVICE_BUFFER_ALLOCATED_BYTES)); - - //Tx_Pairing_Chunk_Meta4 *device_buffer_B; - std::cout << " device_buffer_B " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << DEVICE_BUFFER_UNIT_BYTES << ") = " << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_B, DEVICE_BUFFER_ALLOCATED_BYTES)); - - - std::cout << " device_buffer_C " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << DEVICE_BUFFER_UNIT_BYTES << ") = " << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_C, DEVICE_BUFFER_ALLOCATED_BYTES)); - - std::cout << " device_buffer_refdata " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << BACKREF_UNIT_BYTES << ") = " << BACKREF_ALLOCATED_BYTES << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_refdata, BACKREF_ALLOCATED_BYTES)); - - std::cout << " HOST host_refdata_blocks ENTRIES: " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " ALLOCATED ENTRIES: " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " UNIT BYTES: " << BACKREF_UNIT_BYTES << " = " << (BACKREF_ALLOCATED_BYTES) << std::endl; - CUDA_CHECK_RETURN(cudaMallocHost((void**)&host_refdata_blocks, BACKREF_ALLOCATED_BYTES)); // = new F2_Result_Pair[HOST_F2_RESULTS_SPACE](); - - std::cout << " HOST host_criss_cross_blocks MAX_ENTRIES: " << HOST_MAX_BLOCK_ENTRIES << " ALLOCATED ENTRIES: " << HOST_ALLOCATED_ENTRIES << " UNIT BYTES: " << HOST_UNIT_BYTES << " = " << (HOST_ALLOCATED_BYTES) << std::endl; - CUDA_CHECK_RETURN(cudaMallocHost((void**)&host_criss_cross_blocks, HOST_ALLOCATED_BYTES)); // = new F2_Result_Pair[HOST_F2_RESULTS_SPACE](); -} - - - -void freeMemory() { - std::cout << "Freeing memory..." << std::endl; - CUDA_CHECK_RETURN(cudaFree(device_buffer_A)); - CUDA_CHECK_RETURN(cudaFree(device_buffer_B)); - CUDA_CHECK_RETURN(cudaFree(device_buffer_C)); - - //CUDA_CHECK_RETURN(cudaFree(device_block_entry_counts)); - CUDA_CHECK_RETURN(cudaFree(device_local_kbc_num_entries)); - CUDA_CHECK_RETURN(cudaFreeHost(host_criss_cross_blocks)); - std::cout << " memory freed." << std::endl; -} - - - - -void doPhase3Compression() { - // our phase 3 compression then needs to take all pruned batches for T2, and write blocks of kbc's compressed with ANS. 
-	// it also needs to take T6_Backref table, load all into memory, and sort by y, and put into blocks with new backref into criss cross back ref to table 2 kbc sets.
-}
-
-#include "k29_plotter.hpp"
-
-int main(int argc, char *argv[])
-{
-	std::cout << "DrPlotter v0.1d" << std::endl;
-	chacha_setup();
-
-	cmd_read = 0;
-
-	if (cmd_read == 2) {
-		//attack_it();
-		doPhase2Pruning();
-		exit(EXIT_SUCCESS);
-	}
-	if (cmd_read == 3) {
-		do_k29();
-		exit(EXIT_SUCCESS);
-	}
-
-	doWriteT2BaseData = false;
-	doWriteT3BaseData = false;
-	doWriteRefData = false;
-	doWriteT6Data = false;
-	setupMemory();
-
-	auto total_start = std::chrono::high_resolution_clock::now();
-	doT1();
-	doTx(2);
-	doTx(3);
-	doTx(4);
-	doTx(5);
-	doTx(6);
-	auto total_end = std::chrono::high_resolution_clock::now();
-	std::cout << "*********************" << std::endl;
-	std::cout << "Total tables time: " << std::chrono::duration_cast<std::chrono::milliseconds>(total_end - total_start).count() << " ms\n";
-	std::cout << "           match: " << total_match_time_ms << " ms\n";
-	std::cout << "      ---------- " << std::endl;
-	std::cout << "transfer in time: " << total_transfer_in_time_ms << " ms\n";
-	std::cout << "        bytes: " << total_transfer_in_bytes << " (" << (total_transfer_in_bytes/(1024*1024*1024)) << "GB)\n";
-	std::cout << "transfer out time: " << total_transfer_out_time_ms << " ms\n";
-	std::cout << "        bytes: " << total_transfer_out_bytes << " (" << (total_transfer_out_bytes/(1024*1024*1024)) << "GB)\n";
-	std::cout << "*********************" << std::endl;
-	std::cout << "Max block entries used: " << max_block_entries_copied_device_to_host << " VS HOST_MAX_BLOCK_ENTRIES:" << HOST_MAX_BLOCK_ENTRIES << std::endl;
-	std::cout << " freeing memory...";
-	freeMemory();
-	std::cout << "end." << std::endl;
-	exit(EXIT_SUCCESS);
-}
diff --git a/k29_plotter.hpp b/k29_plotter.hpp
deleted file mode 100644
index d760a3b..0000000
--- a/k29_plotter.hpp
+++ /dev/null
@@ -1,972 +0,0 @@
-/*
- * k29_plotter.hpp
- *
- *  Created on: Mar 25, 2022
- *      Author: nick
- */
-
-#ifndef K29_PLOTTER_HPP_
-#define K29_PLOTTER_HPP_
-
-#include
-#include
-#include
-
-const uint32_t kXX_BITS = 29;
-
-const uint64_t k29_DEVICE_BUFFER_A_BYTES = 8589934592; // 8GB total buffer
-const uint32_t k29_MAX_X_VALUE = 1 << kXX_BITS;
-const uint64_t k29_MAX_Y_VALUE = 4294967296; // hack, set to 32 bit value of chacha
-
-const uint32_t k29_CHACHA_SPLIT_BUCKETS = 1024; // after 10 starts dropping
-const uint64_t k29_CHACHA_SPLIT_BUCKET_DIVISOR = k29_MAX_Y_VALUE / (k29_CHACHA_SPLIT_BUCKETS);
-const uint64_t k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET = 2 * k29_MAX_X_VALUE / k29_CHACHA_SPLIT_BUCKETS;
-
-uint *xchachas_bucket_counts;
-uint *global_kbc_counts;
-
-const uint32_t k29_BATCHES = 1;
-const uint32_t k29_BC_NUM_BUCKETS = 568381;//1136761;//2273523;
-const uint64_t k29_BC_BUCKET_DIVISOR = k29_MAX_Y_VALUE / k29_BC_NUM_BUCKETS;
-const uint32_t k29_BC_LAST_BUCKET_ID = 1136761 - 1;//2273522;
-const uint32_t k29_BCS_PER_BATCH = (k29_BC_NUM_BUCKETS / BATCHES)+1;
-const uint32_t k29_BC_LOCAL_NUM_BUCKETS = k29_BCS_PER_BATCH + 1; // +1 is for including last R bucket space
-
-const uint64_t k29_DEVICE_BUFFER_UNIT_BYTES = 32; // Tx_pairing_chunk_meta4 is 24 bytes, w/ backref is 32 bytes
-const uint64_t k29_DEVICE_BUFFER_ALLOCATED_ENTRIES = KBC_LOCAL_NUM_BUCKETS * KBC_MAX_ENTRIES_PER_BUCKET; // HOST_MAX_BLOCK_ENTRIES * BATCHES;// DEVICE_BUFFER_ALLOCATED_ENTRIES = 120 * ((uint64_t) 1 << 32) / (100*BATCHES);
-const uint64_t k29_DEVICE_BUFFER_ALLOCATED_BYTES = DEVICE_BUFFER_ALLOCATED_ENTRIES * DEVICE_BUFFER_UNIT_BYTES;
DEVICE_BUFFER_UNIT_BYTES; - - -#define ATTACK_CHACHAS_k29_YS_ONLY(datax_slot) \ -{ \ - int x_value = pos + datax_slot; \ - chacha_ys[x_value] = datax[datax_slot]; \ - chacha_xs[x_value] = x_value; \ -} - -#define ATTACK_CHACHAS_k29_TO_KBC(datax_slot) \ -{ \ - uint32_t x_value = pos + datax_slot; \ - uint32_t chacha_y = datax[datax_slot]; \ - uint32_t Ly = chacha_y; \ - uint32_t bucket_id = Ly / k29_BC_BUCKET_DIVISOR; \ - xchacha_pair pair = { x_value, chacha_y }; \ - int slot = atomicAdd(&xchachas_bucket_counts[bucket_id],1); \ - if (slot > k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET) printf("Overflow k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET %u SLOT %u\n", k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET, slot); \ - else { \ - xchachas_buckets[KBC_MAX_ENTRIES_PER_BUCKET * bucket_id + slot] = pair; \ - } \ -} - -#define ATTACK_CHACHAS_k29_BUCKETADD(datax_slot) \ -{ \ - uint32_t chacha_y = datax[datax_slot]; \ - uint32_t Ly = chacha_y; \ - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; \ - int slot = atomicAdd(&shared_counts[bucket_id],1); \ -} - -#define ATTACK_CHACHAS_k29_SORTEDADD(datax_slot) \ -{ \ - uint32_t x_value = pos + datax_slot; \ - uint32_t chacha_y = datax[datax_slot]; \ - uint32_t Ly = chacha_y; \ - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; \ - int slot = shared_counts_offsets[bucket_id] + atomicAdd(&shared_counts[bucket_id],1); \ - shared_sorted_xs[slot] = x_value; shared_sorted_chachas[slot] = chacha_y; \ -} - -#define ATTACK_CHACHAS_k29_SORTEDADD_FILTERED(datax_slot) \ -{ \ - uint32_t x_value = pos + datax_slot; \ - uint32_t chacha_y = datax[datax_slot]; \ - uint32_t Ly = chacha_y; \ - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; \ - if ((bucket_id >= filter_min) && (bucket_id < filter_max)) { \ - xchacha_pair pair = { x_value, chacha_y }; \ - int slot = shared_counts_offsets[bucket_id] + atomicAdd(&shared_counts[bucket_id],1); \ - shared_sorted_xchachas[slot] = pair; \ - } \ -} - -#define ATTACK_CHACHAS_k29_BUCKETSET(datax_slot) \ -{ \ - uint32_t x_value = pos + datax_slot; \ - uint32_t chacha_y = datax[datax_slot]; \ - uint32_t Ly = chacha_y; \ - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; \ - xchacha_pair pair = { x_value, chacha_y }; \ - int slot = global_counts[bucket_id] + atomicAdd(&shared_counts[bucket_id],1); \ - if (slot > k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET) printf("Overflow k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET %u SLOT %u\n", k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET, slot); \ - else { \ - xchachas_buckets[k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET * bucket_id + slot] = pair; \ - } \ -} - -__global__ -void gpu_chacha8_k29_bucketadd(const uint32_t N, - const __restrict__ uint32_t *input, xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - __shared__ uint shared_counts[k29_CHACHA_SPLIT_BUCKETS]; - __shared__ uint global_counts[k29_CHACHA_SPLIT_BUCKETS]; - - - - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - uint32_t base_group = blockIdx.x * blockDim.x; - //uint32_t base_x = base_group * 16; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 16; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - if (x_group < end_n) { - - for (int i=threadIdx.x;i> 32; - datax[14] = input[14];datax[15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], 
datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - //uint64_t y = datax[0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[0] >> 22; // gives bucket id 0..1023 - ATTACK_CHACHAS_k29_BUCKETADD(0);ATTACK_CHACHAS_k29_BUCKETADD(1);ATTACK_CHACHAS_k29_BUCKETADD(2);ATTACK_CHACHAS_k29_BUCKETADD(3); - ATTACK_CHACHAS_k29_BUCKETADD(4);ATTACK_CHACHAS_k29_BUCKETADD(5);ATTACK_CHACHAS_k29_BUCKETADD(6);ATTACK_CHACHAS_k29_BUCKETADD(7); - ATTACK_CHACHAS_k29_BUCKETADD(8);ATTACK_CHACHAS_k29_BUCKETADD(9);ATTACK_CHACHAS_k29_BUCKETADD(10);ATTACK_CHACHAS_k29_BUCKETADD(11); - ATTACK_CHACHAS_k29_BUCKETADD(12);ATTACK_CHACHAS_k29_BUCKETADD(13);ATTACK_CHACHAS_k29_BUCKETADD(14);ATTACK_CHACHAS_k29_BUCKETADD(15); - - __syncthreads(); - for (int i=threadIdx.x;i> 32; - datax[14] = input[14];datax[15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - ATTACK_CHACHAS_k29_BUCKETSET(0);ATTACK_CHACHAS_k29_BUCKETSET(1);ATTACK_CHACHAS_k29_BUCKETSET(2);ATTACK_CHACHAS_k29_BUCKETSET(3); - 
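/*
 * Context for the BUCKETSET runs in this kernel: they are the scatter half of a
 * two-pass scheme -- BUCKETADD tallies per-bucket counts in shared memory, one
 * atomicAdd per bucket then reserves a contiguous slot range in the global
 * counters, and the chachas are recomputed and written at base + local slot.
 * The same pattern as a standalone kernel (hypothetical name; reads ys from
 * global instead of recomputing them; sketch only):
 *
 * __global__ void count_then_scatter(const uint32_t N, const uint32_t *ys,
 *         xchacha_pair *out, uint *bucket_counts)
 * {
 *     __shared__ uint shared_counts[k29_CHACHA_SPLIT_BUCKETS];
 *     __shared__ uint global_base[k29_CHACHA_SPLIT_BUCKETS];
 *     for (int b = threadIdx.x; b < k29_CHACHA_SPLIT_BUCKETS; b += blockDim.x)
 *         shared_counts[b] = 0;
 *     __syncthreads();
 *     uint32_t i = blockIdx.x * blockDim.x + threadIdx.x;
 *     uint32_t bucket = (i < N) ? (uint32_t)(ys[i] / k29_CHACHA_SPLIT_BUCKET_DIVISOR) : 0;
 *     if (i < N) atomicAdd(&shared_counts[bucket], 1);                   // pass 1: count
 *     __syncthreads();
 *     for (int b = threadIdx.x; b < k29_CHACHA_SPLIT_BUCKETS; b += blockDim.x) {
 *         global_base[b] = atomicAdd(&bucket_counts[b], shared_counts[b]); // reserve range
 *         shared_counts[b] = 0;                                          // reuse as cursor
 *     }
 *     __syncthreads();
 *     if (i < N) {                                                       // pass 2: scatter
 *         uint slot = global_base[bucket] + atomicAdd(&shared_counts[bucket], 1);
 *         xchacha_pair pair = { i, ys[i] };
 *         out[k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET * bucket + slot] = pair;
 *     }
 * }
 */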
ATTACK_CHACHAS_k29_BUCKETSET(4);ATTACK_CHACHAS_k29_BUCKETSET(5);ATTACK_CHACHAS_k29_BUCKETSET(6);ATTACK_CHACHAS_k29_BUCKETSET(7); - ATTACK_CHACHAS_k29_BUCKETSET(8);ATTACK_CHACHAS_k29_BUCKETSET(9);ATTACK_CHACHAS_k29_BUCKETSET(10);ATTACK_CHACHAS_k29_BUCKETSET(11); - ATTACK_CHACHAS_k29_BUCKETSET(12);ATTACK_CHACHAS_k29_BUCKETSET(13);ATTACK_CHACHAS_k29_BUCKETSET(14);ATTACK_CHACHAS_k29_BUCKETSET(15); - - } -} - -// we do computes and tally up number in each bucket -// if number in a bucket exceeds the 128 bytes (i.e. 128/8 bytes = 16) -// and we have at least 2 buckets with said bytes, then write those out to global. -__global__ -void gpu_chacha8_k29_threshold_counters(const uint32_t N, - const __restrict__ uint32_t *input, xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - __shared__ uint shared_counts[k29_CHACHA_SPLIT_BUCKETS]; - __shared__ uint global_counts[k29_CHACHA_SPLIT_BUCKETS]; - - - - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - uint32_t base_group = blockIdx.x * blockDim.x; - //uint32_t base_x = base_group * 16; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 16; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - if (x_group < end_n) { - - for (int i=threadIdx.x;i> 32; - datax[14] = input[14];datax[15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - //uint64_t y = datax[0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[0] >> 22; // gives bucket id 0..1023 - ATTACK_CHACHAS_k29_BUCKETADD(0);ATTACK_CHACHAS_k29_BUCKETADD(1);ATTACK_CHACHAS_k29_BUCKETADD(2);ATTACK_CHACHAS_k29_BUCKETADD(3); - ATTACK_CHACHAS_k29_BUCKETADD(4);ATTACK_CHACHAS_k29_BUCKETADD(5);ATTACK_CHACHAS_k29_BUCKETADD(6);ATTACK_CHACHAS_k29_BUCKETADD(7); - ATTACK_CHACHAS_k29_BUCKETADD(8);ATTACK_CHACHAS_k29_BUCKETADD(9);ATTACK_CHACHAS_k29_BUCKETADD(10);ATTACK_CHACHAS_k29_BUCKETADD(11); - ATTACK_CHACHAS_k29_BUCKETADD(12);ATTACK_CHACHAS_k29_BUCKETADD(13);ATTACK_CHACHAS_k29_BUCKETADD(14);ATTACK_CHACHAS_k29_BUCKETADD(15); - - __syncthreads(); - for (int i=threadIdx.x;i> 32; - datax[14] = 
input[14];datax[15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - ATTACK_CHACHAS_k29_BUCKETSET(0);ATTACK_CHACHAS_k29_BUCKETSET(1);ATTACK_CHACHAS_k29_BUCKETSET(2);ATTACK_CHACHAS_k29_BUCKETSET(3); - ATTACK_CHACHAS_k29_BUCKETSET(4);ATTACK_CHACHAS_k29_BUCKETSET(5);ATTACK_CHACHAS_k29_BUCKETSET(6);ATTACK_CHACHAS_k29_BUCKETSET(7); - ATTACK_CHACHAS_k29_BUCKETSET(8);ATTACK_CHACHAS_k29_BUCKETSET(9);ATTACK_CHACHAS_k29_BUCKETSET(10);ATTACK_CHACHAS_k29_BUCKETSET(11); - ATTACK_CHACHAS_k29_BUCKETSET(12);ATTACK_CHACHAS_k29_BUCKETSET(13);ATTACK_CHACHAS_k29_BUCKETSET(14);ATTACK_CHACHAS_k29_BUCKETSET(15); - - } -} - -__global__ -void gpu_chacha8_k29_bucketadd_256threads_warp_buckets(const uint32_t N, - const __restrict__ uint32_t *input, xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - __shared__ uint32_t warp_bucket_ys[32]; - __shared__ uint32_t warp_bucket_xs[32]; - __shared__ int warp_bucket_counts[256/32]; // 8 different sets of warp buckets - // idea here is to process with warps - int warp_id = threadIdx.x % 32; - uint32_t chacha_result = 23; // computed... - if ((chacha_result % 32) == warp_id) { - // add result to our bucket - int count = atomicAdd(&warp_bucket_counts[warp_id],1); - if (count == 16) { - // 8 * (4x2) = 128 bytes, full bandwidth write - } - } - // 256 threads, one bucket add at a time = 256 entries each loop. - // we need 128 bytes to make a full bandwidth global write - // = 128/8 = 16 entries from a bucket. - -} - -__global__ -void gpu_chacha8_k29_bucketadd_256threads_upto1024buckets(const uint32_t N, - // 1024 buckets = 176GB/s (240GB/s possible), 512 buckets = 276GB/s, 256 buckets = 293GB/s, 8 buckets = 337GB/s - // note we lose ~1ms on innefficient prefix sums so this can improve +20% for 1024 buckets - // with only shared counters we get 400GB/s, so this does take significant time and could be optimized - // against having bank conflicts for instance. - // possibly by doing 32 passes(!) where each thread focuses on it's own bank for shared memory. Yikes. 
- const __restrict__ uint32_t *input, xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - __shared__ int shared_counts[k29_CHACHA_SPLIT_BUCKETS]; - __shared__ int global_counts[k29_CHACHA_SPLIT_BUCKETS]; - __shared__ int shared_counts_offsets[k29_CHACHA_SPLIT_BUCKETS]; - // a 256 thread x 16 pass gives 4096 values total. - // for 1024 buckets that's only 4 values average per bucket. We want to write 128 bytes = 128/8 = 16 entries minimum. - // so want minimum multiple of 4 so we average 16 entries - // our shared space only allows 32k / 8 = 4096 entries - // 1024 buckets = 176GB/s - // 512 buckets = 276GB/s - // 256 buckets = 293GB/s - // 8 buckets = 337GB/s - - //__shared__ xchacha_pair shared_sorted_xchachas[4096];// 32k - __shared__ uint32_t shared_sorted_xs[4096];// 16k <- tried to resolve bank conflicts but didn't do much - __shared__ uint32_t shared_sorted_chachas[4096];// 16k - - if (blockDim.x != 256) printf("ERROR BLOCKDIM MUST BE 256\n"); - if (k29_CHACHA_SPLIT_BUCKETS > 1024) printf("ERROR SPLIT BUCKETS MUST BE <1024\n"); - - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - uint32_t base_group = blockIdx.x * blockDim.x; - //uint32_t base_x = base_group * 16; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / (16); // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - if (x_group < end_n) { - - for (int i=threadIdx.x;i> 32; - datax[14] = input[14];datax[15] = input[15]; - - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - //uint64_t y = datax[0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[0] >> 22; // gives bucket id 0..1023 - ATTACK_CHACHAS_k29_BUCKETADD(0);ATTACK_CHACHAS_k29_BUCKETADD(1);ATTACK_CHACHAS_k29_BUCKETADD(2);ATTACK_CHACHAS_k29_BUCKETADD(3); - ATTACK_CHACHAS_k29_BUCKETADD(4);ATTACK_CHACHAS_k29_BUCKETADD(5);ATTACK_CHACHAS_k29_BUCKETADD(6);ATTACK_CHACHAS_k29_BUCKETADD(7); - ATTACK_CHACHAS_k29_BUCKETADD(8);ATTACK_CHACHAS_k29_BUCKETADD(9);ATTACK_CHACHAS_k29_BUCKETADD(10);ATTACK_CHACHAS_k29_BUCKETADD(11); - 
ATTACK_CHACHAS_k29_BUCKETADD(12);ATTACK_CHACHAS_k29_BUCKETADD(13);ATTACK_CHACHAS_k29_BUCKETADD(14);ATTACK_CHACHAS_k29_BUCKETADD(15); - - __syncthreads(); - - /* - * 1.6 - 2.06746 ms with only bucket adds and shared to global counts - * 2.43 - 2.64 with our single thread prefix sum = +0.8 to 0.64 - * then 5.94 total after writing out. = 180GB/s but minus 0.64 = 12% faster which is 200GB/s - */ - if (threadIdx.x == 0) { - // yes this can be sped up, it adds 1.6ms/multiple - i.e. mult = 1 = +1.6ms, 2 = +0.8ms etc. - shared_counts_offsets[0] = 0; - //int min = shared_counts[0]; int max = shared_counts[0]; int num_above_16 = 0; - for (int i=1;i shared_counts[i]) min = shared_counts[i]; - //if (max < shared_counts[i]) max = shared_counts[i]; - //if (shared_counts[i] >= 16) num_above_16++; - //printf(" %i ", shared_counts[i]); - shared_counts_offsets[i] = shared_counts[i-1] + shared_counts_offsets[i-1]; - } - //printf("min: %i max: %i above16: %i\n", min, max,num_above_16); - } - __syncthreads(); - - - /*if ((base_group == 0) && (threadIdx.x == 0)) { - printf("base group %u : ",base_group); - for (int i=0;i<1024;i++) printf("%u ",shared_counts[i]); - printf("\n"); - for (int i=0;i<1024;i++) printf("%u ",shared_counts_offsets[i]); - printf("\n"); - } - __syncthreads();*/ - - for (int i=threadIdx.x;i> 32; - datax[14] = input[14];datax[15] = input[15]; - - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - //uint64_t y = datax[0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[0] >> 22; // gives bucket id 0..1023 - // makes it 3.61 from 2.41 so yes did add a lot. 
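/*
 * The serial prefix sum a few lines up (single thread, +1.6ms per multiple as the
 * comments note) can be parallelized. A sketch under the assumptions this kernel
 * already enforces (blockDim.x == 256, k29_CHACHA_SPLIT_BUCKETS a power of two
 * >= 256); illustration only, not the code behind the quoted timings:
 *
 * const int T = 256;                               // threads per block
 * const int C = k29_CHACHA_SPLIT_BUCKETS / T;      // buckets per thread
 * __shared__ int partial[T];
 * int t = threadIdx.x, sum = 0;
 * for (int j = 0; j < C; j++) sum += shared_counts[t * C + j];
 * partial[t] = sum;
 * __syncthreads();
 * for (int offset = 1; offset < T; offset <<= 1) { // Hillis-Steele inclusive scan
 *     int v = (t >= offset) ? partial[t - offset] : 0;
 *     __syncthreads();
 *     partial[t] += v;
 *     __syncthreads();
 * }
 * int base = (t == 0) ? 0 : partial[t - 1];        // exclusive base of this chunk
 * for (int j = 0; j < C; j++) {
 *     shared_counts_offsets[t * C + j] = base;
 *     base += shared_counts[t * C + j];
 * }
 * __syncthreads();
 */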
- ATTACK_CHACHAS_k29_SORTEDADD(0);ATTACK_CHACHAS_k29_SORTEDADD(1);ATTACK_CHACHAS_k29_SORTEDADD(2);ATTACK_CHACHAS_k29_SORTEDADD(3); - ATTACK_CHACHAS_k29_SORTEDADD(4);ATTACK_CHACHAS_k29_SORTEDADD(5);ATTACK_CHACHAS_k29_SORTEDADD(6);ATTACK_CHACHAS_k29_SORTEDADD(7); - ATTACK_CHACHAS_k29_SORTEDADD(8);ATTACK_CHACHAS_k29_SORTEDADD(9);ATTACK_CHACHAS_k29_SORTEDADD(10);ATTACK_CHACHAS_k29_SORTEDADD(11); - ATTACK_CHACHAS_k29_SORTEDADD(12);ATTACK_CHACHAS_k29_SORTEDADD(13);ATTACK_CHACHAS_k29_SORTEDADD(14);ATTACK_CHACHAS_k29_SORTEDADD(15); - - // now push to global - __syncthreads(); - for (int i=threadIdx.x;i<4096;i+=blockDim.x) { - - uint32_t x = shared_sorted_xs[i]; - uint32_t Ly = shared_sorted_chachas[i];//pair.chacha; - xchacha_pair pair = {}; pair.x = x; pair.chacha = Ly; - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; - int slot = global_counts[bucket_id] + atomicAdd(&shared_counts[bucket_id],1); - if (slot > k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET) printf("Overflow k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET %u SLOT %u\n", k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET, slot); - else xchachas_buckets[k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET * bucket_id + slot] = pair; - } - - //gpu_chacha8_k29_bucketadd time: 10.9147 ms w/ 1024 buckets no multipasses, w/o writing is 3.6ms so writes take 7ms - //Effective Bandwidth (GB/s): 196.752304 - } -} - - - - - -__global__ -void gpu_chacha8_k29_linear(const uint32_t N, - const __restrict__ uint32_t *input, uint32_t *chacha_xs, uint32_t *chacha_ys) -{ - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - uint32_t base_group = blockIdx.x * blockDim.x; - uint32_t base_x = base_group * 16; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 16; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - if (x_group < end_n) { - uint32_t pos = x_group * 16;// + X_START/16; - //printf("x group pos = %u\n", pos); - - datax[0] = input[0];datax[1] = input[1];datax[2] = input[2];datax[3] = input[3];datax[4] = input[4];datax[5] = input[5];datax[6] = input[6];datax[7] = input[7]; - datax[8] = input[8];datax[9] = input[9];datax[10] = input[10];datax[11] = input[11]; - datax[12] = pos; datax[13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[14] = input[14];datax[15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - 
BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - //uint64_t y = datax[0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[0] >> 22; // gives bucket id 0..1023 - ATTACK_CHACHAS_k29_YS_ONLY(0);ATTACK_CHACHAS_k29_YS_ONLY(1);ATTACK_CHACHAS_k29_YS_ONLY(2);ATTACK_CHACHAS_k29_YS_ONLY(3); - ATTACK_CHACHAS_k29_YS_ONLY(4);ATTACK_CHACHAS_k29_YS_ONLY(5);ATTACK_CHACHAS_k29_YS_ONLY(6);ATTACK_CHACHAS_k29_YS_ONLY(7); - ATTACK_CHACHAS_k29_YS_ONLY(8);ATTACK_CHACHAS_k29_YS_ONLY(9);ATTACK_CHACHAS_k29_YS_ONLY(10);ATTACK_CHACHAS_k29_YS_ONLY(11); - ATTACK_CHACHAS_k29_YS_ONLY(12);ATTACK_CHACHAS_k29_YS_ONLY(13);ATTACK_CHACHAS_k29_YS_ONLY(14);ATTACK_CHACHAS_k29_YS_ONLY(15); - } -} - -__global__ -void gpu_chacha8_k29_to_kbc(const uint32_t N, - const __restrict__ uint32_t *input, xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - uint32_t datax[16]; // shared memory can go as fast as 32ms but still slower than 26ms with local - uint32_t base_group = blockIdx.x * blockDim.x; - uint32_t base_x = base_group * 16; - int x_group = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N / 16; // 16 x's in each group - //printf("blockIdx.x: %u blockDim.x: %u gridDim.x: %u base_x: %u x_group:%u\n", blockIdx.x, blockDim.x, gridDim.x, base_x, x_group); - - if (x_group < end_n) { - uint32_t pos = x_group * 16;// + X_START/16; - //printf("x group pos = %u\n", pos); - - datax[0] = input[0];datax[1] = input[1];datax[2] = input[2];datax[3] = input[3];datax[4] = input[4];datax[5] = input[5];datax[6] = input[6];datax[7] = input[7]; - datax[8] = input[8];datax[9] = input[9];datax[10] = input[10];datax[11] = input[11]; - datax[12] = pos; datax[13]= 0; // pos never bigger than 32 bit pos >> 32; - datax[14] = input[14];datax[15] = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(datax[0], datax[4], datax[8], datax[12]);QUARTERROUND(datax[1], datax[5], datax[9], datax[13]); - QUARTERROUND(datax[2], datax[6], datax[10], datax[14]);QUARTERROUND(datax[3], datax[7], datax[11], datax[15]); - QUARTERROUND(datax[0], datax[5], datax[10], datax[15]);QUARTERROUND(datax[1], datax[6], datax[11], datax[12]); - QUARTERROUND(datax[2], datax[7], datax[8], datax[13]);QUARTERROUND(datax[3], datax[4], datax[9], datax[14]); - } - - datax[0] += input[0];datax[1] += input[1];datax[2] += input[2];datax[3] += input[3];datax[4] += input[4]; - datax[5] += input[5];datax[6] += input[6];datax[7] += input[7];datax[8] += input[8];datax[9] += input[9]; - datax[10] += input[10];datax[11] += input[11];datax[12] += x_group; // j12;//datax[13] += 0; - datax[14] += input[14];datax[15] += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(datax[0]);BYTESWAP32(datax[1]);BYTESWAP32(datax[2]);BYTESWAP32(datax[3]);BYTESWAP32(datax[4]);BYTESWAP32(datax[5]); - BYTESWAP32(datax[6]);BYTESWAP32(datax[7]);BYTESWAP32(datax[8]);BYTESWAP32(datax[9]);BYTESWAP32(datax[10]);BYTESWAP32(datax[11]); - BYTESWAP32(datax[12]);BYTESWAP32(datax[13]);BYTESWAP32(datax[14]);BYTESWAP32(datax[15]); - - //uint64_t y = datax[0] << 6 + x >> 26; for 2^10 (1024 buckets) is >> (38-10) => 28, >> 28 -> x >> 22 - //int nick_bucket_id; // = datax[0] >> 22; // gives bucket id 0..1023 - 
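/*
 * For reference, the xchacha_pair layout these macros scatter (fields inferred
 * from their usage in this file -- "pair.x = x; pair.chacha = Ly;" -- the actual
 * definition lives elsewhere in the source tree):
 *
 * struct xchacha_pair {
 *     uint32_t x;       // input index, 0 .. k29_MAX_X_VALUE-1
 *     uint32_t chacha;  // raw 32-bit chacha output used as y
 * };
 *
 * At 8 bytes per pair, one 128-byte global write (a full-bandwidth transaction)
 * holds 16 pairs, which is where the "16 entries from a bucket" targets in the
 * warp/threshold kernels above come from.
 */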
ATTACK_CHACHAS_k29_TO_KBC(0);ATTACK_CHACHAS_k29_TO_KBC(1);ATTACK_CHACHAS_k29_TO_KBC(2);ATTACK_CHACHAS_k29_TO_KBC(3); - ATTACK_CHACHAS_k29_TO_KBC(4);ATTACK_CHACHAS_k29_TO_KBC(5);ATTACK_CHACHAS_k29_TO_KBC(6);ATTACK_CHACHAS_k29_TO_KBC(7); - ATTACK_CHACHAS_k29_TO_KBC(8);ATTACK_CHACHAS_k29_TO_KBC(9);ATTACK_CHACHAS_k29_TO_KBC(10);ATTACK_CHACHAS_k29_TO_KBC(11); - ATTACK_CHACHAS_k29_TO_KBC(12);ATTACK_CHACHAS_k29_TO_KBC(13);ATTACK_CHACHAS_k29_TO_KBC(14);ATTACK_CHACHAS_k29_TO_KBC(15); - } -} - -__global__ -void gpu_chacha_ys_bucket_direct(const uint32_t N, const __restrict__ uint32_t *chacha_ys, - xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - uint32_t x = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N; - - if (x < end_n) { - uint32_t chacha_y = chacha_ys[x]; - uint32_t Ly = chacha_y; // (((uint64_t) chacha_y) << 6) + (x >> 26); - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; - int slot = atomicAdd(&xchachas_bucket_counts[bucket_id],1); - xchacha_pair pair = { x, chacha_y }; - xchachas_buckets[bucket_id * k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET + slot] = pair; - } -} - -__global__ -void gpu_chacha_ys_bucket_shared_counts(const uint32_t N, const __restrict__ uint32_t *chacha_ys, - xchacha_pair *xchachas_buckets, uint *xchachas_bucket_counts) -{ - __shared__ uint shared_counts[k29_CHACHA_SPLIT_BUCKETS]; - __shared__ uint global_counts[k29_CHACHA_SPLIT_BUCKETS]; - - uint32_t x = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N; - - if (x < end_n) { - - for (int i=threadIdx.x;i> 26); - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; - xchacha_pair pair = { x, chacha_y }; - atomicAdd(&shared_counts[bucket_id],1); - - __syncthreads(); - for (int i=threadIdx.x;i k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET) printf("Overflow k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET %u SLOT %u\n", k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET, slot); - else xchachas_buckets[k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET * bucket_id + slot] = pair; // shared_chachas[i]; - } - } - - -} - -__global__ -void gpu_test_cache_cp(const uint32_t N, const uint32_t cache_size_bytes, uint32_t *cache) -{ - uint32_t x = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - const uint32_t end_n = N; - //if (threadIdx.x == 0) { - // printf("gridDim.x: %u blockIdx.x: %u our block id: %u total blocks: %u\n", gridDim.x, blockIdx.x, block_id, total_blocks); - // } - const uint32_t CACHE_SIZE = 1024*1024; - if (x < end_n) { - //uint32_t address = x*1; // x*1: 1 write - 862GB/s vs 1444G/s cached = 1.67x - //uint32_t address = x*4; // x*4: 1 write - 171GB/s in no cache zone, and 404GB/s in cache zone = 2.3x - uint32_t address = x*64 + 1; // Xt*8: 1 write - 85GB/s in no cache zone, and 204GB/s in cache zone = 2.4x (2.63ms) - // *8 with 2 writes (8 byte) : 85GB/s same as 1 write, so effective 170GB/s cache zone: 137GB/s effective 274GB/s in cache zone - // *8 with 4 writes (16 byte): 81GB/s..so x4 = 240GB/s - // *64 with 1 writes (4 byte): 38GB/s .. cache: 127GB/s in cache, - // *64 with 4 writes (16 byte): 32GB/s full random write effective x4 = 120GB/s, cache doesn't seem to help here. - // also writing at address*64+1 didn't affect write speed strangely. - // *64 with 6 writes (24 byte): 21GB/s - x6 = 120GB/s - // *64 with 8 writes (32 byte): 16GB/s - x8 = 128GB/s - // *8 with 32 bytes: 17.9ms - const int BOUNDS = false ? 
CACHE_SIZE : N; - cache[(address + 0) % BOUNDS] = x; - cache[(address + 1) % BOUNDS] = x; - cache[(address + 2) % BOUNDS] = x; - cache[(address + 3) % BOUNDS] = x; - - //float4 val; - //const float4* myinput = cache+address; - //asm("ld.global.cv.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(val.x), "=f"(val.y), "=f"(val.z), "=f"(val.w) : "l"(myinput)); - - //cache[(address + 4) % BOUNDS] = x; - //cache[(address + 5) % BOUNDS] = x; - //cache[(address + 6) % BOUNDS] = x; - //cache[(address + 7) % BOUNDS] = x; - - //cache[(x*8) % (N)] = x; - // x*1: 862GB/s vs 1444G/s cached = 1.67x - // x*4: 171GB/s in no cache zone, and 404GB/s in cache zone = 2.3x - // x*8: 85GB/s in no cache zone, and 204GB/s in cache zone = 2.4x - } -} - - - - -__global__ -void gpu_count_xpairs_kbc_buckets( - const xchacha_pair *xchachas_buckets, const uint *xchachas_block_counts, uint *global_kbc_counts) -{ - uint32_t block_id = blockIdx.x; - const uint32_t num_in_block = xchachas_block_counts[block_id]; - const uint32_t block_offset = k29_CHACHA_SPLIT_MAX_ENTRIES_PER_BUCKET * block_id; - - for (int i=block_offset + threadIdx.x;i> 26); - //uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; - uint32_t kbc_id = pair.chacha / k29_BC_BUCKET_DIVISOR; // hack for k28 size kBC; - //printf("x: %u chacha: %u bucket: %u kbc_id:%u\n", pair.x, pair.chacha, bucket_id, kbc_id); - int slot = atomicAdd(&global_kbc_counts[kbc_id],1); - } -} - - -__global__ void gpu_check_xpairs(const xchacha_pair *xchachas_in, const uint32_t N) { - if (threadIdx.x == 0) { - for (int i=0;i> 26); - uint32_t bucket_id = Ly / k29_CHACHA_SPLIT_BUCKET_DIVISOR; - printf("%u = %u bucket: %u\n", pair.x, pair.chacha, bucket_id); - } - } -} - -#include // or equivalently - - -void do_k29_T1() { - - std::cout << "do k29 T1 BATCHES:" << k29_BATCHES << std::endl; - - auto total_start = std::chrono::high_resolution_clock::now(); - auto finish = std::chrono::high_resolution_clock::now(); // just to allocate - - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); - cudaEvent_t begin, end; - cudaEventCreate(&begin); - cudaEventCreate(&end); - cudaEventRecord(begin); - - - int blockSize; // # of threads per block, maximum is 1024. - uint64_t calc_N; - uint64_t calc_blockSize; - uint64_t calc_numBlocks; - int numBlocks; - - // first phase is writing chacha results - uint32_t *chacha_ys = (uint32_t *) &device_buffer_A[0]; // set ys to beginning of device buffer A - uint32_t *chacha_xs = (uint32_t *) &device_buffer_A[k29_MAX_X_VALUE*4]; // set ys to beginning of device buffer A - xchacha_pair *xchachas_buckets = (xchacha_pair *) &device_buffer_A[k29_MAX_X_VALUE*4]; - float milliseconds = 0; - - std::cout << " gpu_chacha8_k29_bucketadd ys num:" << calc_N << std::endl; - blockSize = 256; // # of threads per block, maximum is 1024. 
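/*
 * NOTE: the "ys num:" line above prints calc_N before it is first assigned (the
 * assignment happens just below), so the value shown there is indeterminate.
 *
 * The record/sync/elapsed sequence repeated throughout do_k29_T1 could be
 * wrapped once; a hedged helper sketch (our naming, not from this codebase):
 *
 * template <typename Launch>
 * float time_launch_ms(cudaEvent_t start, cudaEvent_t stop, Launch launch) {
 *     cudaEventRecord(start);
 *     launch();                    // e.g. [&]{ kernel<<<numBlocks,blockSize>>>(...); }
 *     cudaEventRecord(stop);
 *     cudaEventSynchronize(stop);  // waits for the kernel and the stop event
 *     float ms = 0.0f;
 *     cudaEventElapsedTime(&ms, start, stop);
 *     return ms;
 * }
 */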
- calc_N = k29_MAX_X_VALUE; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - numBlocks = calc_numBlocks; - std::cout << "   numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl; - CUDA_CHECK_RETURN(cudaMemset(global_kbc_counts, 0, k29_BC_NUM_BUCKETS*sizeof(int))); - cudaEventRecord(start); - //gpu_chacha8_k29_to_kbc<<<numBlocks,blockSize>>>(calc_N, chacha_input,xchachas_buckets, global_kbc_counts); - // cuda event total time: 65.0044 ms - //gpu_chacha8_k29_bucketadd time: 23.0625 ms - //Effective Bandwidth (GB/s): 46.557852 - //gpu_chacha8_k29_bucketadd time: 23.0697 ms - //Effective Bandwidth (GB/s): 46.543388 - - //gpu_chacha8_k29_bucketadd<<<numBlocks,blockSize>>>(calc_N, chacha_input,xchachas_buckets, xchachas_bucket_counts); - //gpu_chacha8_k29_bucketadd_256threads_upto1024buckets<<<numBlocks,blockSize>>>(calc_N, chacha_input,xchachas_buckets, xchachas_bucket_counts); - gpu_chacha8_k29_bucketadd_256threads_upto1024buckets<<<numBlocks,blockSize>>>(calc_N, chacha_input,xchachas_buckets, xchachas_bucket_counts); - //gpu_chacha8_k29_bucketadd_256threads_upto1024buckets<<<numBlocks,blockSize>>>(calc_N, chacha_input,xchachas_buckets, xchachas_bucket_counts); - - // counter list counts SUM:134217728 MAX:132347 id: 0 count: 131044 6.06925 ms (GB/s): 176.706432 - - cudaEventRecord(stop); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - cudaEventSynchronize(stop); - - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "gpu_chacha8_k29_bucketadd_256threads_upto1024buckets time: " << milliseconds << " ms\n"; - printf("Effective Bandwidth (GB/s): %f\n", calc_N*8/milliseconds/1e6); - - //1024 buckets multiple 1 gpu_chacha8_k29_bucketadd time: 11.008 ms Effective Bandwidth (GB/s): 195.083904 -/* - cudaEventRecord(start); - // 1 block per split bucket, threads will have to work out how much to parse - gpu_count_xpairs_kbc_buckets<<<k29_CHACHA_SPLIT_BUCKETS,256>>>(xchachas_buckets, xchachas_bucket_counts, global_kbc_counts); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "gpu_count_xpairs_kbc_buckets time: " << milliseconds << " ms\n"; - printf("Effective Bandwidth (GB/s): %f\n", k29_MAX_X_VALUE*8/milliseconds/1e6); - - thrust::device_ptr<uint> device_kbc_counts(global_kbc_counts); - cudaEventRecord(start); - thrust::exclusive_scan(device_kbc_counts, device_kbc_counts + k29_BC_NUM_BUCKETS, device_kbc_counts); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "exclusive scan kbc_buckets time: " << milliseconds << " ms\n"; -*/ - - std::cout << "   gpu_test_cache ys num:" << calc_N << std::endl; - blockSize = 256; // # of threads per block, maximum is 1024. - calc_N = k29_MAX_X_VALUE; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize); - numBlocks = calc_numBlocks; - std::cout << "   numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl; - cudaEventRecord(start); - gpu_test_cache_cp<<<numBlocks,blockSize>>>(calc_N, calc_N, chacha_ys); - cudaEventRecord(stop); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - cudaEventSynchronize(stop);//auto sort_start = std::chrono::high_resolution_clock::now(); - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "cache linear test " << calc_N << " time: " << milliseconds << " ms\n"; - printf("Effective Bandwidth (GB/s): %f\n", calc_N*4/milliseconds/1e6); - - - { - // thrust linear then sort method - - std::cout << "   gpu_chacha8_k29_linear ys num:" << calc_N << std::endl; - - blockSize = 256; // # of threads per block, maximum is 1024.
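/*
 * Grid sizing in this file divides by (blockSize * 16) because each chacha thread
 * produces 16 x values per block counter. The same computation as a helper (our
 * naming; equivalent arithmetic for the power-of-two sizes used here):
 *
 * static inline int blocks_for(uint64_t n, int block_size, int xs_per_thread) {
 *     uint64_t threads = (n + xs_per_thread - 1) / xs_per_thread; // ceil(n/16)
 *     return (int)((threads + block_size - 1) / block_size);      // ceil(threads/bs)
 * }
 * // blocks_for(k29_MAX_X_VALUE, 256, 16) == 131072
 */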
- calc_N = k29_MAX_X_VALUE; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize * 16); - numBlocks = calc_numBlocks; - std::cout << "   numBlocks: " << numBlocks << " blockSize: " << blockSize << std::endl; - cudaEventRecord(start); - gpu_chacha8_k29_linear<<<numBlocks,blockSize>>>(calc_N, chacha_input,chacha_xs,chacha_ys); - cudaEventRecord(stop); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - cudaEventSynchronize(stop);//auto sort_start = std::chrono::high_resolution_clock::now(); - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "write chachas time: " << milliseconds << " ms\n"; - printf("Effective Bandwidth (GB/s): %f\n", calc_N*8/milliseconds/1e6); - - /*auto sort_start = std::chrono::high_resolution_clock::now(); - cudaEventRecord(start); - thrust::device_ptr<uint32_t> device_xs_L_ptr(chacha_xs); - thrust::device_ptr<uint32_t> device_ys_L_ptr(chacha_ys); - thrust::sort_by_key(device_ys_L_ptr, device_ys_L_ptr + calc_N, device_xs_L_ptr); - cudaEventRecord(stop); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - auto sort_finish = std::chrono::high_resolution_clock::now(); - std::cout << "   sort time: " << std::chrono::duration_cast<std::chrono::milliseconds>(sort_finish - sort_start).count() << " ms\n"; - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "thrust sort " << calc_N << " time: " << milliseconds << " ms\n"; - printf("Effective Bandwidth (GB/s): %f\n", calc_N*8*2/milliseconds/1e6);*/ - } - - {// Declare, allocate, and initialize device-accessible pointers for sorting data - - // Determine temporary device storage requirements - void *d_temp_storage = NULL; - size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - chacha_ys, chacha_ys, chacha_xs, chacha_xs, k29_MAX_X_VALUE); - // Allocate temporary storage - cudaMalloc(&d_temp_storage, temp_storage_bytes); - // Run sorting operation - // NB: cub::DeviceRadixSort does not sort in place; the key/value outputs must not - // alias the inputs (or use a cub::DoubleBuffer), so as written this call only - // measures timing, not a usable sorted result. - cudaEventRecord(start); - cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, - chacha_ys, chacha_ys, chacha_xs, chacha_xs, k29_MAX_X_VALUE); - cudaEventRecord(stop); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - // thrust is 13ms - std::cout << "cuda sort " << calc_N << " time: " << milliseconds << " ms\n"; - } - - - /*std::cout << "   gpu_chacha split buckets (num: " << k29_CHACHA_SPLIT_BUCKETS << " divisor:" << k29_CHACHA_SPLIT_BUCKET_DIVISOR << ") ys num:" << calc_N << std::endl; - blockSize = 1024; // # of threads per block, maximum is 1024. - calc_N = k29_MAX_X_VALUE; - calc_blockSize = blockSize; - calc_numBlocks = (calc_N + calc_blockSize - 1) / (blockSize); - numBlocks = calc_numBlocks; - cudaEventRecord(start); - // erm...thrust sort on 268435456 is 27ms...just saying, beats most other timings. - // and sort on 33million elements (1/8 of total) is 3ms. In other words...worth doing...
- //gpu_chacha_ys_bucket_shared_counts<<>>(calc_N, chacha_ys, xchachas_buckets, xchachas_bucket_counts); - //counter list counts SUM:268435456 MAX:33561699 - // gpu_chacha split buckets (num: 1024 divisor:4194304) ys num:268435456 time: 47.1419 ms (GB/s): 68.330432 - // gpu_chacha split buckets (num: 128 divisor:33554432) ys num:268435456 time: 38.0465 ms (GB/s): 84.665424 - // gpu_chacha_ys_bucket_shared (num: 32 divisor:134217728) ys num:268435456 time: 17.9118 ms (GB/s): 179.838096 - // gpu_chacha split buckets (num: 8 divisor:536870912) ys num:268435456 time: 6.79731 ms (GB/s): 473.896960 - // -> note 8*8*8*8 = 4096, and would take 27ms, which is less than 1024 @ 47ms - //gpu_chacha_ys_bucket_direct<<>>(calc_N, chacha_ys, xchachas_buckets, xchachas_bucket_counts); - // gpu_chacha_ys_bucket_direct (num: 1136761 divisor:3778) ys num:268435456 time: 102.703 ms (GB/s): 31.364442 - // gpu_chacha_ys_bucket_direct (num: 1024 divisor:4194304) ys num:268435456 time: 48.5682 ms (GB/s): 66.323720 - // gpu_chacha_ys_bucket_direct (num: 128 divisor:33554432) ys num:268435456 time: 73.6359 ms (GB/s): 43.745292 - // gpu_chacha_ys_bucket_direct (num: 32 divisor:134217728) ys num:268435456 time: 85.1845 ms (GB/s): 37.814688 - cudaEventRecord(stop); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - cudaEventSynchronize(stop); - cudaEventElapsedTime(&milliseconds, start, stop); - std::cout << "bucket chachas time: " << milliseconds << " ms\n"; - printf("Effective Bandwidth (GB/s): %f\n", (calc_N*(4+8))/milliseconds/1e6);*/ - - //gpu_get_max_counts_from_counter_list<<<1,1>>>(xchachas_bucket_counts, k29_CHACHA_SPLIT_BUCKETS, true); - //<<<1,1>>>(global_kbc_counts, 1024, false); - cudaEventRecord(end); - cudaEventSynchronize(end); - cudaEventElapsedTime(&milliseconds, begin, end); - std::cout << "cuda event total time: " << milliseconds << " ms\n"; -} - - -void setup_memory_k29() { - - //setupMMap(HOST_ALLOCATED_BYTES); // potentially useful if going to do random reads/writes to stored data - - //std::cout << " device_block_entry_counts (" << k29_BATCHES << "): " << k29_BATCHES << " size:" << (sizeof(int)*k29_BATCHES) << std::endl; - //CUDA_CHECK_RETURN(cudaMallocManaged(&device_block_entry_counts, k29_BATCHES*sizeof(int))); - - std::cout << " device_local_kbc_num_entries " << k29_BC_NUM_BUCKETS << " * (max per bucket: " << KBC_MAX_ENTRIES_PER_BUCKET << ") size:" << (sizeof(int)*k29_BC_NUM_BUCKETS) << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&global_kbc_counts, k29_BC_NUM_BUCKETS*sizeof(int))); - CUDA_CHECK_RETURN(cudaMemset(global_kbc_counts, 0, k29_BC_NUM_BUCKETS*sizeof(int))); - - //Tx_Pairing_Chunk_Meta4 *device_buffer_A; - std::cout << " device_buffer_A " << k29_DEVICE_BUFFER_A_BYTES << std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_A, k29_DEVICE_BUFFER_A_BYTES)); - - std::cout << " xchachas_bucket_counts k29_CHACHA_SPLIT_BUCKETS:" << k29_CHACHA_SPLIT_BUCKETS << std::endl; - CUDA_CHECK_RETURN(cudaMallocManaged(&xchachas_bucket_counts, k29_CHACHA_SPLIT_BUCKETS*sizeof(int))); - CUDA_CHECK_RETURN(cudaMemset(xchachas_bucket_counts, 0, k29_CHACHA_SPLIT_BUCKETS*sizeof(int))); - - - //Tx_Pairing_Chunk_Meta4 *device_buffer_B; - //std::cout << " device_buffer_B " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << DEVICE_BUFFER_UNIT_BYTES << ") = " << DEVICE_BUFFER_ALLOCATED_BYTES << std::endl; - //CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_B, DEVICE_BUFFER_ALLOCATED_BYTES)); - - //std::cout << " device_buffer_refdata " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " * (UNIT BYTES:" << 
BACKREF_UNIT_BYTES << ") = " << BACKREF_ALLOCATED_BYTES << std::endl; - //CUDA_CHECK_RETURN(cudaMalloc(&device_buffer_refdata, BACKREF_ALLOCATED_BYTES)); - - //std::cout << " HOST host_refdata_blocks ENTRIES: " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " ALLOCATED ENTRIES: " << DEVICE_BUFFER_ALLOCATED_ENTRIES << " UNIT BYTES: " << BACKREF_UNIT_BYTES << " = " << (BACKREF_ALLOCATED_BYTES) << std::endl; - //CUDA_CHECK_RETURN(cudaMallocHost((void**)&host_refdata_blocks, BACKREF_ALLOCATED_BYTES)); // = new F2_Result_Pair[HOST_F2_RESULTS_SPACE](); - - //std::cout << " HOST host_criss_cross_blocks MAX_ENTRIES: " << HOST_MAX_BLOCK_ENTRIES << " ALLOCATED ENTRIES: " << HOST_ALLOCATED_ENTRIES << " UNIT BYTES: " << HOST_UNIT_BYTES << " = " << (HOST_ALLOCATED_BYTES) << std::endl; - //CUDA_CHECK_RETURN(cudaMallocHost((void**)&host_criss_cross_blocks, HOST_ALLOCATED_BYTES)); // = new F2_Result_Pair[HOST_F2_RESULTS_SPACE](); -} - -void do_k29() { - std::cout << "****** PROGRAM START K29 V0.1 *********" << std::endl; - - setup_memory_k29(); - - - auto total_start = std::chrono::high_resolution_clock::now(); - do_k29_T1(); - std::cout << " freeing memory..."; - freeMemory(); - std::cout << "end." << std::endl; - exit(EXIT_SUCCESS); -} - - - - -#endif /* K29_PLOTTER_HPP_ */ diff --git a/nick_blake3.hpp b/nick_blake3.hpp deleted file mode 100644 index 930f50a..0000000 --- a/nick_blake3.hpp +++ /dev/null @@ -1,336 +0,0 @@ -/* - * nick_blake3.hpp - * - * Created on: Oct 26, 2021 - * Author: nick - */ - -#ifndef NICK_BLAKE3_HPP_ -#define NICK_BLAKE3_HPP_ - -#define CALC_Y_BUCKETED_KBC_ENTRY(entry, bucket_id) \ - (((uint64_t) bucket_id) * ((uint64_t) 15113) + (uint64_t) entry.y) - -#define BSWAP32(i) (__byte_perm ((i), 0, 0x0123)) - -#define NICK_ROTR32(w,c) \ - (((w) >> (c)) | ((w) << (32 - (c)))) - -// rotate32 by 8 * c bits (1 byte) -#define NICK_ROTR32_BYTE8(w,c) __byte_perm (w, w, 0x3210 + 0x1111 * c); - -// optimized for cuda instructions with rotate by multiples of 8 bites -#define NICK_G(a,b,c,d,x,y) \ - state[a] = state[a] + state[b] + x; \ - state[d] = NICK_ROTR32_BYTE8(state[d] ^ state[a], 2); \ - state[c] = state[c] + state[d]; \ - state[b] = NICK_ROTR32(state[b] ^ state[c], 12); \ - state[a] = state[a] + state[b] + y; \ - state[d] = NICK_ROTR32_BYTE8(state[d] ^ state[a], 1); \ - state[c] = state[c] + state[d]; \ - state[b] = NICK_ROTR32(state[b] ^ state[c], 7); \ - - - -__device__ -void nick_blake3(const uint32_t* meta, int meta_len, const uint64_t y, - uint64_t *y_result, uint8_t c_len, uint32_t *c_results) { - uint32_t state[16]; - uint32_t block_words[16];// = {0}; - size_t input_len = 21; - - block_words[0] = BSWAP32(y >> 6); - block_words[1] = BSWAP32(__funnelshift_l ( meta[0], y, 26)); - block_words[2] = BSWAP32(__funnelshift_l ( meta[1], meta[0], 26)); - if (meta_len == 2) { - // [32][6-26][6-26][6-] - block_words[3] = BSWAP32(meta[1] << 26); - input_len = 13; - } - else if (meta_len == 3) { - // [32][6-26][6-26][6-26][6-26][6-] - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(meta[2] << 26); - input_len = 17; - } - else if (meta_len == 4) { - // [32][6-26][6-26][6-26][6-26][6-26][6-] - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(__funnelshift_l ( meta[3], meta[2], 26)); - block_words[5] = BSWAP32(meta[3] << 26); - input_len = 21; - } - else if (meta_len == 6) { - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(__funnelshift_l ( meta[3], meta[2], 26)); - 
block_words[5] = BSWAP32(__funnelshift_l ( meta[4], meta[3], 26)); - block_words[6] = BSWAP32(__funnelshift_l ( meta[5], meta[4], 26)); - block_words[7] = BSWAP32(meta[5] << 26); - input_len = 29; - } - else if (meta_len == 8) { - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(__funnelshift_l ( meta[3], meta[2], 26)); - block_words[5] = BSWAP32(__funnelshift_l ( meta[4], meta[3], 26)); - block_words[6] = BSWAP32(__funnelshift_l ( meta[5], meta[4], 26)); - block_words[7] = BSWAP32(__funnelshift_l ( meta[6], meta[5], 26)); - block_words[8] = BSWAP32(__funnelshift_l ( meta[7], meta[6], 26)); - block_words[9] = BSWAP32(meta[7] << 26); - input_len = 37; - } - - for (int i=meta_len+2;i<16;i++) block_words[i]=0; - - - state[0] = 0x6A09E667UL; - state[1] = 0xBB67AE85UL; - state[2] = 0x3C6EF372UL; - state[3] = 0xA54FF53AUL; - state[4] = 0x510E527FUL; - state[5] = 0x9B05688CUL; - state[6] = 0x1F83D9ABUL; - state[7] = 0x5BE0CD19UL; - state[8] = 0x6A09E667UL; - state[9] = 0xBB67AE85UL; - state[10] = 0x3C6EF372UL; - state[11] = 0xA54FF53AUL; - state[12] = 0; // counter_low(0); - state[13] = 0; // counter_high(0); - state[14] = (uint32_t) input_len; // take;// (uint32_t)output.block_len; - state[15] = (uint32_t) (1 | 2 | 8);// (output.flags | ROOT); - - NICK_G(0,4,8,12,block_words[0],block_words[1]); - NICK_G(1,5,9,13,block_words[2],block_words[3]); - NICK_G(2,6,10,14,block_words[4],block_words[5]); - NICK_G(3,7,11,15,block_words[6],block_words[7]); - NICK_G(0,5,10,15,block_words[8],block_words[9]); - NICK_G(1,6,11,12,block_words[10],block_words[11]); - NICK_G(2,7,8,13,block_words[12],block_words[13]); - NICK_G(3,4,9,14,block_words[14],block_words[15]); - NICK_G(0,4,8,12,block_words[2],block_words[6]); - NICK_G(1,5,9,13,block_words[3],block_words[10]); - NICK_G(2,6,10,14,block_words[7],block_words[0]); - NICK_G(3,7,11,15,block_words[4],block_words[13]); - NICK_G(0,5,10,15,block_words[1],block_words[11]); - NICK_G(1,6,11,12,block_words[12],block_words[5]); - NICK_G(2,7,8,13,block_words[9],block_words[14]); - NICK_G(3,4,9,14,block_words[15],block_words[8]); - NICK_G(0,4,8,12,block_words[3],block_words[4]); - NICK_G(1,5,9,13,block_words[10],block_words[12]); - NICK_G(2,6,10,14,block_words[13],block_words[2]); - NICK_G(3,7,11,15,block_words[7],block_words[14]); - NICK_G(0,5,10,15,block_words[6],block_words[5]); - NICK_G(1,6,11,12,block_words[9],block_words[0]); - NICK_G(2,7,8,13,block_words[11],block_words[15]); - NICK_G(3,4,9,14,block_words[8],block_words[1]); - NICK_G(0,4,8,12,block_words[10],block_words[7]); - NICK_G(1,5,9,13,block_words[12],block_words[9]); - NICK_G(2,6,10,14,block_words[14],block_words[3]); - NICK_G(3,7,11,15,block_words[13],block_words[15]); - NICK_G(0,5,10,15,block_words[4],block_words[0]); - NICK_G(1,6,11,12,block_words[11],block_words[2]); - NICK_G(2,7,8,13,block_words[5],block_words[8]); - NICK_G(3,4,9,14,block_words[1],block_words[6]); - NICK_G(0,4,8,12,block_words[12],block_words[13]); - NICK_G(1,5,9,13,block_words[9],block_words[11]); - NICK_G(2,6,10,14,block_words[15],block_words[10]); - NICK_G(3,7,11,15,block_words[14],block_words[8]); - NICK_G(0,5,10,15,block_words[7],block_words[2]); - NICK_G(1,6,11,12,block_words[5],block_words[3]); - NICK_G(2,7,8,13,block_words[0],block_words[1]); - NICK_G(3,4,9,14,block_words[6],block_words[4]); - NICK_G(0,4,8,12,block_words[9],block_words[14]); - NICK_G(1,5,9,13,block_words[11],block_words[5]); - NICK_G(2,6,10,14,block_words[8],block_words[12]); - 
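/*
 * These unrolled NICK_G blocks are BLAKE3's seven rounds with the message
 * permutation pre-applied: the (x,y) word indices in each run of eight calls
 * match the reference BLAKE3 MSG_SCHEDULE row for that round (verifiable by
 * reading the indices off the calls themselves). Equivalent loop form, for
 * reference only -- the unrolled version avoids the table lookups:
 *
 * static __device__ const uint8_t MSG_SCHEDULE[7][16] = {
 *     { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15},
 *     { 2,  6,  3, 10,  7,  0,  4, 13,  1, 11, 12,  5,  9, 14, 15,  8},
 *     { 3,  4, 10, 12, 13,  2,  7, 14,  6,  5,  9,  0, 11, 15,  8,  1},
 *     {10,  7, 12,  9, 14,  3, 13, 15,  4,  0, 11,  2,  5,  8,  1,  6},
 *     {12, 13,  9, 11, 15, 10, 14,  8,  7,  2,  5,  3,  0,  1,  6,  4},
 *     { 9, 14, 11,  5,  8, 12, 15,  1, 13,  3,  0, 10,  2,  6,  4,  7},
 *     {11, 15,  5,  0,  1,  9,  8,  6, 14, 10,  2, 12,  3,  4,  7, 13},
 * };
 * for (int r = 0; r < 7; r++) {
 *     const uint8_t *s = MSG_SCHEDULE[r];
 *     NICK_G(0,4, 8,12, block_words[s[ 0]], block_words[s[ 1]]);
 *     NICK_G(1,5, 9,13, block_words[s[ 2]], block_words[s[ 3]]);
 *     NICK_G(2,6,10,14, block_words[s[ 4]], block_words[s[ 5]]);
 *     NICK_G(3,7,11,15, block_words[s[ 6]], block_words[s[ 7]]);
 *     NICK_G(0,5,10,15, block_words[s[ 8]], block_words[s[ 9]]);
 *     NICK_G(1,6,11,12, block_words[s[10]], block_words[s[11]]);
 *     NICK_G(2,7, 8,13, block_words[s[12]], block_words[s[13]]);
 *     NICK_G(3,4, 9,14, block_words[s[14]], block_words[s[15]]);
 * }
 */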
NICK_G(3,7,11,15,block_words[15],block_words[1]); - NICK_G(0,5,10,15,block_words[13],block_words[3]); - NICK_G(1,6,11,12,block_words[0],block_words[10]); - NICK_G(2,7,8,13,block_words[2],block_words[6]); - NICK_G(3,4,9,14,block_words[4],block_words[7]); - NICK_G(0,4,8,12,block_words[11],block_words[15]); - NICK_G(1,5,9,13,block_words[5],block_words[0]); - NICK_G(2,6,10,14,block_words[1],block_words[9]); - NICK_G(3,7,11,15,block_words[8],block_words[6]); - NICK_G(0,5,10,15,block_words[14],block_words[10]); - NICK_G(1,6,11,12,block_words[2],block_words[12]); - NICK_G(2,7,8,13,block_words[3],block_words[4]); - NICK_G(3,4,9,14,block_words[7],block_words[13]); - - - uint32_t r0 = BSWAP32(state[0] ^ state[8]); - uint32_t r1 = BSWAP32(state[1] ^ state[9]); // y_result is 38 bits of [a][6-] - uint32_t r2 = BSWAP32(state[2] ^ state[10]); - uint32_t r3 = BSWAP32(state[3] ^ state[11]); - uint32_t r4 = BSWAP32(state[4] ^ state[12]); - uint32_t r5 = BSWAP32(state[5] ^ state[13]); - - // MINOR OPTIMIZATION: on last table could just return top 32 bits instead of the 38 bits. - uint64_t y_hi = __funnelshift_l ( r0, 0, 6); // shift 6 of top bits of r0 into y_hi - uint32_t y_lo = __funnelshift_l ( r1, r0, 6); - if (c_len > 0) { - c_results[0] = __funnelshift_l ( r2, r1, 6); - c_results[1] = __funnelshift_l ( r3, r2, 6); - } - if (c_len > 2) { - c_results[2] = __funnelshift_l ( r4, r3, 6); - } - if (c_len > 3) { - c_results[3] = __funnelshift_l ( r5, r4, 6); - } - - (*y_result) = (y_hi << 32) + y_lo; - -} - -__device__ -void nick_blake_k29(const uint32_t* meta, int meta_len, const uint64_t y, - uint64_t *y_result, uint8_t c_len, uint32_t *c_results) { - uint32_t state[16]; - uint32_t block_words[16];// = {0}; - size_t input_len = 21; - - block_words[0] = BSWAP32(y >> 6); - block_words[1] = BSWAP32(__funnelshift_l ( meta[0], y, 26)); - block_words[2] = BSWAP32(__funnelshift_l ( meta[1], meta[0], 26)); - if (meta_len == 2) { - // [32][6-26][6-26][6-] - block_words[3] = BSWAP32(meta[1] << 26); - input_len = 13; - } - else if (meta_len == 3) { - // [32][6-26][6-26][6-26][6-26][6-] - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(meta[2] << 26); - input_len = 17; - } - else if (meta_len == 4) { - // [32][6-26][6-26][6-26][6-26][6-26][6-] - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(__funnelshift_l ( meta[3], meta[2], 26)); - block_words[5] = BSWAP32(meta[3] << 26); - input_len = 21; - } - else if (meta_len == 6) { - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(__funnelshift_l ( meta[3], meta[2], 26)); - block_words[5] = BSWAP32(__funnelshift_l ( meta[4], meta[3], 26)); - block_words[6] = BSWAP32(__funnelshift_l ( meta[5], meta[4], 26)); - block_words[7] = BSWAP32(meta[5] << 26); - input_len = 29; - } - else if (meta_len == 8) { - block_words[3] = BSWAP32(__funnelshift_l ( meta[2], meta[1], 26)); - block_words[4] = BSWAP32(__funnelshift_l ( meta[3], meta[2], 26)); - block_words[5] = BSWAP32(__funnelshift_l ( meta[4], meta[3], 26)); - block_words[6] = BSWAP32(__funnelshift_l ( meta[5], meta[4], 26)); - block_words[7] = BSWAP32(__funnelshift_l ( meta[6], meta[5], 26)); - block_words[8] = BSWAP32(__funnelshift_l ( meta[7], meta[6], 26)); - block_words[9] = BSWAP32(meta[7] << 26); - input_len = 37; - } - - for (int i=meta_len+2;i<16;i++) block_words[i]=0; - - - state[0] = 0x6A09E667UL; - state[1] = 0xBB67AE85UL; - state[2] = 0x3C6EF372UL; - state[3] = 0xA54FF53AUL; - 
state[4] = 0x510E527FUL; - state[5] = 0x9B05688CUL; - state[6] = 0x1F83D9ABUL; - state[7] = 0x5BE0CD19UL; - state[8] = 0x6A09E667UL; - state[9] = 0xBB67AE85UL; - state[10] = 0x3C6EF372UL; - state[11] = 0xA54FF53AUL; - state[12] = 0; // counter_low(0); - state[13] = 0; // counter_high(0); - state[14] = (uint32_t) input_len; // take;// (uint32_t)output.block_len; - state[15] = (uint32_t) (1 | 2 | 8);// (output.flags | ROOT); - - NICK_G(0,4,8,12,block_words[0],block_words[1]); - NICK_G(1,5,9,13,block_words[2],block_words[3]); - NICK_G(2,6,10,14,block_words[4],block_words[5]); - NICK_G(3,7,11,15,block_words[6],block_words[7]); - NICK_G(0,5,10,15,block_words[8],block_words[9]); - NICK_G(1,6,11,12,block_words[10],block_words[11]); - NICK_G(2,7,8,13,block_words[12],block_words[13]); - NICK_G(3,4,9,14,block_words[14],block_words[15]); - NICK_G(0,4,8,12,block_words[2],block_words[6]); - NICK_G(1,5,9,13,block_words[3],block_words[10]); - NICK_G(2,6,10,14,block_words[7],block_words[0]); - NICK_G(3,7,11,15,block_words[4],block_words[13]); - NICK_G(0,5,10,15,block_words[1],block_words[11]); - NICK_G(1,6,11,12,block_words[12],block_words[5]); - NICK_G(2,7,8,13,block_words[9],block_words[14]); - NICK_G(3,4,9,14,block_words[15],block_words[8]); - NICK_G(0,4,8,12,block_words[3],block_words[4]); - NICK_G(1,5,9,13,block_words[10],block_words[12]); - NICK_G(2,6,10,14,block_words[13],block_words[2]); - NICK_G(3,7,11,15,block_words[7],block_words[14]); - NICK_G(0,5,10,15,block_words[6],block_words[5]); - NICK_G(1,6,11,12,block_words[9],block_words[0]); - NICK_G(2,7,8,13,block_words[11],block_words[15]); - NICK_G(3,4,9,14,block_words[8],block_words[1]); - NICK_G(0,4,8,12,block_words[10],block_words[7]); - NICK_G(1,5,9,13,block_words[12],block_words[9]); - NICK_G(2,6,10,14,block_words[14],block_words[3]); - NICK_G(3,7,11,15,block_words[13],block_words[15]); - NICK_G(0,5,10,15,block_words[4],block_words[0]); - NICK_G(1,6,11,12,block_words[11],block_words[2]); - NICK_G(2,7,8,13,block_words[5],block_words[8]); - NICK_G(3,4,9,14,block_words[1],block_words[6]); - NICK_G(0,4,8,12,block_words[12],block_words[13]); - NICK_G(1,5,9,13,block_words[9],block_words[11]); - NICK_G(2,6,10,14,block_words[15],block_words[10]); - NICK_G(3,7,11,15,block_words[14],block_words[8]); - NICK_G(0,5,10,15,block_words[7],block_words[2]); - NICK_G(1,6,11,12,block_words[5],block_words[3]); - NICK_G(2,7,8,13,block_words[0],block_words[1]); - NICK_G(3,4,9,14,block_words[6],block_words[4]); - NICK_G(0,4,8,12,block_words[9],block_words[14]); - NICK_G(1,5,9,13,block_words[11],block_words[5]); - NICK_G(2,6,10,14,block_words[8],block_words[12]); - NICK_G(3,7,11,15,block_words[15],block_words[1]); - NICK_G(0,5,10,15,block_words[13],block_words[3]); - NICK_G(1,6,11,12,block_words[0],block_words[10]); - NICK_G(2,7,8,13,block_words[2],block_words[6]); - NICK_G(3,4,9,14,block_words[4],block_words[7]); - NICK_G(0,4,8,12,block_words[11],block_words[15]); - NICK_G(1,5,9,13,block_words[5],block_words[0]); - NICK_G(2,6,10,14,block_words[1],block_words[9]); - NICK_G(3,7,11,15,block_words[8],block_words[6]); - NICK_G(0,5,10,15,block_words[14],block_words[10]); - NICK_G(1,6,11,12,block_words[2],block_words[12]); - NICK_G(2,7,8,13,block_words[3],block_words[4]); - NICK_G(3,4,9,14,block_words[7],block_words[13]); - - uint32_t r0 = BSWAP32(state[0] ^ state[8]); - uint32_t r1 = BSWAP32(state[1] ^ state[9]); // y_result is 38 bits of [a][6-] - uint32_t r2 = BSWAP32(state[2] ^ state[10]); - uint32_t r3 = BSWAP32(state[3] ^ state[11]); - uint32_t r4 = BSWAP32(state[4] 
^ state[12]);
-	uint32_t r5 = BSWAP32(state[5] ^ state[13]);
-
-	// MINOR OPTIMIZATION: on the last table we could return just the top 32 bits instead of all 38 bits.
-	uint64_t y_hi = __funnelshift_l ( r0, 0, 6); // the top 6 bits of r0 become the high word of y
-	uint32_t y_lo = __funnelshift_l ( r1, r0, 6); // (r0:r1) << 6 gives the low 32 bits of y
-	if (c_len > 0) {
-		c_results[0] = __funnelshift_l ( r2, r1, 6);
-		c_results[1] = __funnelshift_l ( r3, r2, 6);
-	}
-	if (c_len > 2) {
-		c_results[2] = __funnelshift_l ( r4, r3, 6);
-	}
-	if (c_len > 3) {
-		c_results[3] = __funnelshift_l ( r5, r4, 6);
-	}
-
-	(*y_result) = ((y_hi << 32) + y_lo) >> 3;
-
-}
-
-
-
-#endif /* NICK_BLAKE3_HPP_ */
diff --git a/nick_globals.hpp b/nick_globals.hpp
deleted file mode 100644
index badc637..0000000
--- a/nick_globals.hpp
+++ /dev/null
@@ -1,457 +0,0 @@
-#ifndef NICK_GLOBALS_HPP_
-#define NICK_GLOBALS_HPP_
-
-#include <iostream>
-#include <string>
-
-using std::string;
-
-const uint32_t BATCHES = 64;
-const uint64_t BATCHBC = (uint64_t) 1 << (38 - 6);
-
-const uint32_t KBC_MAX_ENTRIES_PER_BUCKET = 400;
-const uint32_t kBC_NUM_BUCKETS = 18188177;
-const uint32_t kBC_LAST_BUCKET_ID = 18188176;
-const uint16_t kBC = 15113;
-const uint32_t KBCS_PER_BATCH = (kBC_NUM_BUCKETS / BATCHES)+1;
-const uint32_t KBC_LOCAL_NUM_BUCKETS = KBCS_PER_BATCH + 1; // +1 includes space for the last R bucket
-
-#define CALC_BATCH_BUCKET_ADD_Y(batch_id) ((((uint64_t) 1) << (38-6)) * ((uint64_t) batch_id))
-#define CALC_KBC_BUCKET_ADD_Y(kbc_bucket_id) (((uint64_t) kBC) * ((uint64_t) kbc_bucket_id))
-
-#define MIN_KBC_BUCKET_FOR_BATCH(batch_id) \
-	( (uint32_t) ((((uint64_t) 1 << 32) * ((uint64_t) (batch_id))) / ((uint64_t) kBC) ));
-
-
-const uint64_t HOST_UNIT_BYTES = 20; //12// Bytes used for the biggest host entry.
-const uint64_t HOST_MAX_BLOCK_ENTRIES = 1114112; // MUST be a multiple of 32 so it works with bit masking // 1052614 (min calculated) // 1258291; // (120 * ((uint64_t) 1 << 32)) / (100*(BATCHES * BATCHES));
-const uint64_t HOST_ALLOCATED_ENTRIES = HOST_MAX_BLOCK_ENTRIES * BATCHES * BATCHES;
-const uint64_t HOST_ALLOCATED_BYTES = HOST_UNIT_BYTES * HOST_ALLOCATED_ENTRIES;
-
-
-const uint64_t DEVICE_BUFFER_UNIT_BYTES = 24;//32; // Tx_Pairing_Chunk_Meta4 is 24 bytes, w/ backref it is 32 bytes
-
-const uint64_t DEVICE_BUFFER_ALLOCATED_ENTRIES = KBC_LOCAL_NUM_BUCKETS * KBC_MAX_ENTRIES_PER_BUCKET; // HOST_MAX_BLOCK_ENTRIES * BATCHES; // = 120 * ((uint64_t) 1 << 32) / (100*BATCHES);
-const uint64_t DEVICE_BUFFER_ALLOCATED_BYTES = DEVICE_BUFFER_ALLOCATED_ENTRIES * DEVICE_BUFFER_UNIT_BYTES;
-const uint64_t BACKREF_UNIT_BYTES = 12; // backref w/y for the last table is 12 bytes
-const uint64_t BACKREF_ALLOCATED_BYTES = DEVICE_BUFFER_ALLOCATED_ENTRIES * BACKREF_UNIT_BYTES;
-
-
-const uint64_t CROSS_MATRIX_BC = (2097152 * 128) + kBC - ((2097152 * 128) % kBC);
-const uint64_t CROSS_MATRIX_NUM_BUCKETS = 1024; // each batch splits into buckets; the max per bucket depends on the batch size
-const uint64_t CROSS_MATRIX_BATCH_MAX_ENTRIES_PER_BUCKET = (119 * ((uint64_t)1 << 32)) / (100*(CROSS_MATRIX_NUM_BUCKETS * BATCHES));
-const uint64_t CROSS_MATRIX_ALLOCATED_SPACE_PER_BATCH = CROSS_MATRIX_BATCH_MAX_ENTRIES_PER_BUCKET * CROSS_MATRIX_NUM_BUCKETS;
-const uint64_t CROSS_MATRIX_ALLOCATED_SPACE = CROSS_MATRIX_ALLOCATED_SPACE_PER_BATCH * BATCHES;
-
-
-static void CheckCudaErrorAux (const char *, unsigned, const char *, cudaError_t);
-#define CUDA_CHECK_RETURN(value) CheckCudaErrorAux(__FILE__,__LINE__, #value, value)
-
-uint32_t *chacha_input;
-
-// output from F(x) -> chacha
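-// (Added note, not from the original file: F1 builds a 38-bit y by appending 6 extra
-// bits of x to the 32-bit chacha word, y = (chacha_word << 6) | (x >> 26). The bucket
-// split used everywhere below is then simply:
-//   uint32_t kbc_bucket_id = (uint32_t)(y / kBC); // one of the 18188177 kBC buckets
-//   uint32_t local_y       = (uint32_t)(y % kBC); // offset inside the bucket, < 15113
-// which is exactly what CALC_KBC_BUCKET_ADD_Y above inverts.)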
-struct F1_Bucketed_kBC_Entry {
-	uint32_t x;
-	uint32_t y;
-};
-
-struct T1_Match {
-	uint32_t Lx;
-	uint32_t Rx;
-	uint32_t y;
-};
-
-struct T1_Pairing_Chunk {
-	uint32_t Lx;
-	uint32_t Rx;
-	uint32_t y;
-};
-
-struct Tx_Bucketed_Final_Y {
-	uint32_t y;
-};
-
-struct Tx_Bucketed_Meta1 {
-	uint32_t meta[1];
-	uint32_t y;
-};
-
-struct Tx_Bucketed_Meta2 {
-	uint32_t meta[2];
-	uint32_t y;
-};
-
-struct Tx_Bucketed_Meta3 {
-	uint32_t meta[3];
-	uint32_t y;
-};
-
-struct Tx_Bucketed_Meta4 {
-	uint32_t meta[4];
-	uint32_t y;
-};
-
-struct Tx_Bucketed_Meta2_Blockposref {
-	uint32_t meta[2];
-	uint32_t y;
-	uint32_t blockposref;
-};
-
-struct Tx_Bucketed_Meta3_Blockposref {
-	uint32_t meta[3];
-	uint32_t y;
-	uint32_t blockposref;
-};
-
-struct Tx_Bucketed_Meta4_Blockposref {
-	uint32_t meta[4];
-	uint32_t y;
-	uint32_t blockposref;
-};
-
-
-struct Tx_Pairing_Chunk_Meta2 {
-	uint64_t y;
-	uint32_t meta[2];
-	//uint16_t idxL;
-	//uint16_t idxR;
-	//uint32_t p_b_id;
-};
-
-struct Tx_Pairing_Chunk_Meta3 {
-	uint64_t y;
-	uint32_t meta[3];
-	//uint16_t idxL;
-	//uint16_t idxR;
-	//uint32_t p_b_id;
-};
-
-struct Tx_Pairing_Chunk_Meta4 {
-	uint64_t y;
-	uint32_t meta[4];
-	//uint16_t idxL;
-	//uint16_t idxR;
-	//uint32_t p_b_id;
-};
-
-struct Index_Match {
-	uint16_t idxL;
-	uint16_t idxR;
-};
-
-// our base pairing structs that the higher tables resolve back to.
-struct T2BaseRef {
-	uint32_t Lx1;
-	uint32_t Lx2;
-};
-
-struct T3BaseRef {
-	uint32_t Lx1;
-	uint32_t Lx2;
-	uint32_t Lx3;
-	uint32_t Lx4;
-};
-
-struct T2BaseRefWithUsed {
-	uint32_t Lx1;
-	uint32_t Lx2;
-	bool used;
-};
-
-struct BackRef {
-	uint32_t prev_block_ref_L; // (block_id(L) << (32 - 6)) + block_pos
-	uint32_t prev_block_ref_R; // (block_id(R) << (32 - 6)) + block_pos
-};
-
-struct T6BackRef { // 12 bytes
-	uint32_t prev_block_ref_L; // (block_id(L) << (32 - 6)) + block_pos
-	uint32_t prev_block_ref_R; // (block_id(R) << (32 - 6)) + block_pos
-	uint32_t y;
-};
-
-struct T6FinalEntry {
-	uint32_t refL; // 6,6,6 = 24
-	uint32_t refR; // 6,6,6 = 24
-	uint32_t y;    // 32
-};
-
-struct T4FinalEntry {
-	uint32_t Lx1,Lx2,Lx3,Lx4,Lx5,Lx6,Lx7,Lx8;
-};
-
-
-struct RBid_Entry {
-	uint32_t x;
-	uint16_t pos;
-};
-
-// chia specific constants
-const uint32_t K_SIZE = 32;
-const uint64_t K_MAX = ((uint64_t) 1 << K_SIZE);
-const uint64_t K_MAX_Y = K_MAX << 6;
-const uint8_t kExtraBits = 6;
-const uint16_t kB = 119;
-const uint16_t kC = 127;
-const uint32_t nickBC = (2097152 * 128) + kBC - ((2097152 * 128) % kBC);
-const uint32_t NICK_BUCKET_MAX_ENTRIES = 34000 * 128;
-const uint32_t NICK_NUM_BUCKETS = 1024;
-
-// code below is WRONG! the 2nd clause only uses batch_id
-//#define CRISS_CROSS_BLOCK_ID(table, batch_id, block_id) \
-//(((table % 2) == 1) ? batch_id * BATCHES + block_id : batch_id * BATCHES + batch_id)
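-// (Added worked example for getCrissCrossBlockId below: with BATCHES = 64, an odd
-// table maps (batch_id=2, block_id=5) to 2*64+5 = 133, row-major, while an even
-// table maps the same pair to 5*64+2 = 322, column-major. One table's rows are
-// therefore read back as the next table's columns, which is the criss-cross.)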
-
-uint64_t getCrissCrossBlockId(uint8_t table, uint32_t batch_id, uint32_t block_id) {
-	uint64_t cross_row_id = batch_id;
-	uint64_t cross_column_id = block_id;
-	if ((table % 2) == 1) {
-		return (cross_row_id * BATCHES + cross_column_id);
-	} else {
-		return (cross_column_id * BATCHES + cross_row_id);
-	}
-}
-
-inline uint64_t getCrissCrossBlockEntryStartPosition(uint64_t criss_cross_id) {
-	return criss_cross_id * HOST_MAX_BLOCK_ENTRIES;
-}
-
-
-string Strip0x(const string &hex)
-{
-	if (hex.size() > 1 && (hex.substr(0, 2) == "0x" || hex.substr(0, 2) == "0X")) {
-		return hex.substr(2);
-	}
-	return hex;
-}
-
-void HexToBytes(const string &hex, uint8_t *result)
-{
-	for (uint32_t i = 0; i < hex.length(); i += 2) {
-		string byteString = hex.substr(i, 2);
-		uint8_t byte = (uint8_t)strtol(byteString.c_str(), NULL, 16);
-		result[i / 2] = byte;
-	}
-}
-
-void chacha_setup() {
-	string id = "022fb42c08c12de3a6af053880199806532e79515f94e83461612101f9412f9e";
-	//string id = "0000000000000000000000000000000000000000000000000000000000000000";
-
-	uint8_t enc_key[32];
-
-	id = Strip0x(id);
-	std::array<uint8_t, 32> id_bytes;
-	HexToBytes(id, id_bytes.data());
-	uint8_t* orig_key = id_bytes.data();
-
-	enc_key[0] = 1;
-	memcpy(enc_key + 1, orig_key, 31);
-
-	CUDA_CHECK_RETURN(cudaMallocManaged(&chacha_input, 16*sizeof(uint32_t)));
-	// Setup ChaCha8 context with zero-filled IV
-	chacha8_keysetup_data(chacha_input, enc_key, 256, NULL);
-}
-
-// chacha specific macros end
-
-/**
- * Check the return value of the CUDA runtime API call and exit
- * the application if the call has failed.
- */
-static void CheckCudaErrorAux (const char *file, unsigned line, const char *statement, cudaError_t err)
-{
-	if (err == cudaSuccess)
-		return;
-	std::cerr << statement << " returned " << cudaGetErrorString(err) << "(" << err << ") at " << file << ":" << line << std::endl;
-	exit(1);
-}
-
-	std::cout << "Total time: " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - total_start).count() << " ms\n";
-
-	auto total_start_without_memory = std::chrono::high_resolution_clock::now();
-
-	int blockSize = 64; // # of threads per block, maximum is 1024.
-	const uint64_t calc_N = N;
-	const uint64_t calc_blockSize = blockSize;
-	const uint64_t calc_numBlocks = (calc_N + calc_blockSize - 1) / (calc_blockSize * 16);
-	int numBlocks = calc_numBlocks;
-	std::cout << " Block configuration: [blockSize:" << blockSize << " numBlocks:" << numBlocks << "]" << std::endl;
-
-	//batches = 2;
-	int64_t total_compute_time_ms = 0;
-	int64_t total_transfer_time_ms = 0;
-	uint32_t total_f2_results_count = 0;
-
-	// map for table 1.
-	{
-		T1_Pairing_Chunk *t1_pairing_chunks = (T1_Pairing_Chunk *) device_pairing_chunks;
-		F1_Bucketed_kBC_Entry *local_kbc_entries = (F1_Bucketed_kBC_Entry *) device_bucketed_meta_entries;
-		T1_Pairing_Chunk *host_t1_pairing_chunks = (T1_Pairing_Chunk *) host_copy_buffer;
-		uint32_t batches_to_go = BATCHES;
-		while (batches_to_go > 0) {
-
-			std::cout << " gpuScanIntoKbcBuckets BATCHES to go: " << batches_to_go << std::endl <<
-					" SPANNING FOR BUCKETS count:" << (KBC_END - KBC_START + 1) << " KBC_START: " << KBC_START << " KBC_END: " << KBC_END << std::endl;
-			std::cout << " Generating F1 results into kbc buckets...";
-			auto batch_start = std::chrono::high_resolution_clock::now();
-			auto start = std::chrono::high_resolution_clock::now();
-
-			// don't forget to clear counter...
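-			// (Assumption, added: the chacha kernel claims bucket slots with an
-			// atomicAdd on kbc_local_num_entries, roughly
-			//   int slot = atomicAdd(&kbc_local_num_entries[bucket], 1);
-			//   kbc_local_entries[bucket * KBC_MAX_ENTRIES_PER_BUCKET + slot] = entry;
-			// so stale counts from the previous batch would overflow the 400-entry
-			// buckets. Hence the memset below before every batch.)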
-			CUDA_CHECK_RETURN(cudaMemset(local_kbc_num_entries, 0, KBC_LOCAL_NUM_BUCKETS*sizeof(int)));
-
-			gpu_chacha8_get_k32_keystream_into_local_kbc_entries<<<numBlocks, blockSize>>>(N, chacha_input,
-					local_kbc_entries, local_kbc_num_entries, KBC_START, KBC_END);
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-			auto finish = std::chrono::high_resolution_clock::now();
-			std::cout << " done. " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count() << " ms\n";
-
-			std::cout << " Finding matches...";
-			(*pairing_chunks_count) = 0; // reset the device-visible result counter
-			CUDA_CHECK_RETURN(cudaMemset(global_kbc_num_entries, 0, (kBC_LAST_BUCKET_ID+1)*sizeof(int)));
-			gpu_find_f1_matches<<<(KBC_LOCAL_NUM_BUCKETS-1), 256>>>(KBC_START, KBC_END,
-				local_kbc_entries, local_kbc_num_entries,
-				t1_pairing_chunks, pairing_chunks_count, MAX_RESULTS);
-			//gpu_find_fx_matches<<<(KBC_LOCAL_NUM_BUCKETS-1), 256>>>(KBC_START, KBC_END,
-			//	local_kbc_entries, local_kbc_num_entries,
-			//	t1_pairing_chunks, t1_pairing_chunks_count, MAX_RESULTS);
-			//gpu_find_matches<<<1, 64>>>(1,2, KBC_MAX_ENTRIES_PER_BUCKET, local_kbc_entries, local_kbc_num_entries);
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-			finish = std::chrono::high_resolution_clock::now();
-
-			total_compute_time_ms += std::chrono::duration_cast<std::chrono::milliseconds>(finish - batch_start).count();
-			std::cout << " done. " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count() << " ms\n";
-
-			// now copy pair results to CPU memory.
-			int num_results = (*pairing_chunks_count);
-			total_f2_results_count += num_results;
-			std::cout << " Copying " << num_results << " T1 pairing chunks to CPU...";
-			start = std::chrono::high_resolution_clock::now();
-			CUDA_CHECK_RETURN(cudaMemcpy(host_t1_pairing_chunks,t1_pairing_chunks,num_results*sizeof(T1_Pairing_Chunk),cudaMemcpyDeviceToHost));
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-			finish = std::chrono::high_resolution_clock::now();
-			total_transfer_time_ms += std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count();
-			std::cout << " done. " << std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count() << " ms\n";
-
-			// and move from CPU memory into reserved criss cross buckets
-			std::cout << " Moving pairing chunks into CPU criss cross storage\n";
-			start = std::chrono::high_resolution_clock::now();
-			uint32_t batch_id = BATCHES-batches_to_go;
-			Tx_Bucketed_Meta2 *host_cast = (Tx_Bucketed_Meta2 *) host_criss_cross_store;
-			//cpuT1MoveCopyBufferToCrissCross(batch_id, host_t1_pairing_chunks, num_results, host_cast, &criss_cross_num_entries[batch_id]);
-			finish = std::chrono::high_resolution_clock::now();
-			total_transfer_time_ms += std::chrono::duration_cast<std::chrono::milliseconds>(finish - start).count();
-			std::cout << " done.
" << std::chrono::duration_cast(finish - start).count() << " ms\n"; - - - std::cout << " ** batch finish ** " << std::chrono::duration_cast(finish - batch_start).count() << " ms\n"; - batches_to_go--; - - KBC_START += KBCS_PER_BATCH; - //if (BATCHES == 0) { - // KBC_END = kBC_LAST_BUCKET_ID; - //} else { - KBC_END = KBC_START + KBCS_PER_BATCH; - //} - if ((KBC_END - KBC_START + 1) > KBC_LOCAL_NUM_BUCKETS) { - std::cout << "ERROR: kbc span is more than local buckets allocated!\n" << std::endl; - } - } - } - - - finish = std::chrono::high_resolution_clock::now(); - std::cout << "*********************" << std::endl; - std::cout << "Total time: " << std::chrono::duration_cast(finish - total_start).count() << " ms\n"; - std::cout << " w/o alloc: " << std::chrono::duration_cast(finish - total_start_without_memory).count() << " ms\n"; - std::cout << " gpu compute: " << total_compute_time_ms << " ms\n"; - std::cout << " transfer: " << total_transfer_time_ms << " ms\n"; - - std::cout << "*********************" << std::endl; - /*uint32_t total_entries = 0; - for (int bucket_id=0;bucket_id<2;bucket_id++) { // NICK_NUM_BUCKETS;i++) { - int num = local_kbc_num_entries[bucket_id]; - std::cout << "KBC LOCAL num entries bucket " << bucket_id << " : " << num << std::endl; - total_entries += num; - //for (int i=0;i -#include -#include - - -// bladebit -// phase 1: 209s -// phase 2: 25s -// phase 3: 102s -// phase 4: <1s - -const uint64_t PHASE_2_MAX_BYTES_PER_UNIT = 12; // enter max. bytes used per entry for any of the tables -const uint64_t PHASE_2_ALLOCATED_BYTES_PER_TABLE = PHASE_2_MAX_BYTES_PER_UNIT * DEVICE_BUFFER_ALLOCATED_ENTRIES; // enter max. bytes used per entry for any of the tables - -uint32_t num_set_t4 = 0; -uint32_t num_same_addresses = 0; -uint32_t num_set_t5 = 0; - -void readT2BlockFilesToHostMem(uint32_t batch_id, T2BaseRef *t2_data, uint32_t *num_entries) { - for (uint32_t block_id = 0; block_id < BATCHES; block_id++) { - std::string filename = "/mnt/kioxia/tmp/T2-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - //if (batch_id == 0) { - // std::cout << "Reading file [" << filename << "]"; - //} else { - // std::cout << " [" << filename << "]"; - //} - FILE* pFile; - pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing?? - if (fread(&num_entries[block_id], sizeof(uint32_t), 1, pFile)) { - //std::cout << " num_entries: " << num_entries[block_id] << std::endl; - if (fread(t2_data, sizeof(T2BaseRef), num_entries[block_id], pFile)) { - //std::cout << "success."; - } else { - std::cout << "failed."; - } - } - fclose(pFile); - //if (batch_id == BATCHES-1) { - // std::cout << " done." << std::endl; - //} - //for (int i = 0; i < 1; i++) { - // std::cout << "Value " << i << " is: " << t2_data[0].Lx1 << std::endl; - //} - } -} - -void readTxBackRefBlockFilesToHostMem(uint32_t table, uint32_t batch_id, BackRef *tx_data, uint32_t *num_entries) { - for (uint32_t block_id = 0; block_id < BATCHES; block_id++) { - std::string filename = "/mnt/kioxia/tmp/T"+std::to_string(table)+"BackRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - if (batch_id == 0) { - std::cout << "Reading file [" << filename << "]"; - } else { - std::cout << " [" << filename << "]"; - } - FILE* pFile; - pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing?? 
-		//uint32_t num_entries;
-		if (fread(&num_entries[block_id], sizeof(uint32_t), 1, pFile)) {
-			std::cout << " num_entries: " << num_entries[block_id] << std::endl;
-			if (fread(tx_data, sizeof(BackRef), num_entries[block_id], pFile)) {
-				std::cout << "success.";
-			} else {
-				std::cout << "failed.";
-			}
-		} else {
-			std::cout << "Failed to read count " << std::endl;
-		}
-		fclose(pFile);
-		if (batch_id == BATCHES-1) {
-			std::cout << " done." << std::endl;
-		}
-	}
-
-}
-
-void readT6BackRefBlockFilesToHostMem(uint32_t batch_id, uint32_t block_id, T6BackRef *tx_data, uint32_t &num_entries) {
-	std::string filename = "/mnt/kioxia/tmp/T6BackRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp";
-
-	FILE* pFile;
-	pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing??
-	if (fread(&num_entries, sizeof(uint32_t), 1, pFile)) {
-		std::cout << "reading..." << num_entries << std::endl;
-		if (!fread(tx_data, sizeof(T6BackRef), num_entries, pFile)) {
-			std::cout << "failed.";
-		}
-	}
-	fclose(pFile);
-}
-
-void readT2BlockFile(uint32_t batch_id, uint32_t block_id, T2BaseRef *t2_data, uint32_t &num_entries) {
-	std::string filename = "/mnt/kioxia/tmp/T2-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp";
-	//if (batch_id == 0) {
-	//	std::cout << "Reading file [" << filename << "]";
-	//} else {
-	//	std::cout << " [" << filename << "]";
-	//}
-	FILE* pFile;
-	pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing??
-	if (fread(&num_entries, sizeof(uint32_t), 1, pFile)) {
-		//std::cout << " num_entries: " << num_entries << std::endl;
-		if (fread(t2_data, sizeof(T2BaseRef), num_entries, pFile)) {
-			//std::cout << "success.";
-		} else {
-			std::cout << "failed.";
-		}
-	}
-	fclose(pFile);
-	//if (batch_id == BATCHES-1) {
-	//	std::cout << " done." << std::endl;
-	//}
-	//for (int i = 0; i < 1; i++) {
-	//	std::cout << "Value " << i << " is: " << t2_data[0].Lx1 << std::endl;
-	//}
-}
-
-void readBackRefBlockFile(uint32_t table, uint32_t batch_id, uint32_t block_id, BackRef *tx_data, uint32_t &num_entries) {
-	std::string filename = "/mnt/kioxia/tmp/T"+std::to_string(table)+"BackRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp";
-	FILE* pFile;
-	//std::cout << "reading " << filename << std::endl;
-	pFile = fopen(filename.c_str(), "rb");
-	if (fread(&num_entries, sizeof(uint32_t), 1, pFile)) {
-		if (!fread(tx_data, sizeof(BackRef), num_entries, pFile)) {
-			std::cout << "failed reading " << filename;
-		}
-	}
-	fclose(pFile);
-}
-
-void readT6BlockFile(uint32_t batch_id, uint32_t block_id, T6BackRef *t6_data, uint32_t &num_entries) {
-	std::string filename = "/mnt/kioxia/tmp/T6BackRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp";
-	FILE* pFile;
-	pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing??
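-	// (Added robustness note: fopen() results are never checked in these readers, so
-	// a missing block file crashes on the first fread. A defensive sketch would be:
-	//   if (pFile == NULL) { std::cout << "missing " << filename << std::endl; num_entries = 0; return; })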
- if (fread(&num_entries, sizeof(uint32_t), 1, pFile)) { - if (fread(t6_data, sizeof(T6BackRef), num_entries, pFile)) { - //std::cout << "success."; - } else { - std::cout << "failed."; - } - } - fclose(pFile); -} - -void readT3BaseRefBlockFile(uint32_t batch_id, uint32_t block_id, T3BaseRef *t3_data, uint32_t &num_entries) { - std::string filename = "/mnt/kioxia/tmp/T3BaseRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - FILE* pFile; - pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing?? - if (fread(&num_entries, sizeof(uint32_t), 1, pFile)) { - if (fread(t3_data, sizeof(T3BaseRef), num_entries, pFile)) { - //std::cout << "success."; - } else { - std::cout << "failed."; - } - } - fclose(pFile); -} - -// should total around 48GB...so maybe don't have to write to disk... -void writeT6FinalBlockFile(uint32_t batch_id, uint32_t block_id, T6FinalEntry *t6_final_data, uint32_t &num_entries) { - if (num_entries == 0) { - return; - } - std::string filename = "/mnt/kioxia/tmp/T6Final-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - //if (batch_id == 0) { - // std::cout << "Writing backref to file [" << filename << "]"; - //} else { - // std::cout << " [" << filename << "]"; - //} - FILE* pFile; - pFile = fopen(filename.c_str(), "wb"); // 41228ms for block level writing, 40912ms for batch writing?? - fwrite(&num_entries, sizeof(uint32_t), 1, pFile); // write the num entries first. - fwrite(t6_final_data, 1, num_entries * sizeof(T6FinalEntry), pFile); - fclose(pFile); - //if (batch_id == BATCHES-1) { - // std::cout << " done." << std::endl; - //} - -} - -void readT2BlockEntry(uint32_t batch_id, uint32_t block_id, uint32_t idx, T2BaseRef *t2_entry) { - std::string filename = "/mnt/kioxia/tmp/T2-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - uint32_t seekpos = idx * sizeof(T2BaseRef) + sizeof(uint32_t); - std::cout << "Reading single entry from " << filename << " pos: " << seekpos << std::endl; - FILE* pFile; - - pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing?? - fseek ( pFile , seekpos , SEEK_SET ); - fread(t2_entry, sizeof(T2BaseRef), 1, pFile); - fclose(pFile); -} - -void readT3BlockEntry(uint32_t batch_id, uint32_t block_id, uint32_t idx, T3BaseRef *t3_entry) { - std::string filename = "/mnt/kioxia/tmp/T3BaseRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - uint32_t seekpos = idx * sizeof(T3BaseRef) + sizeof(uint32_t); - std::cout << "Reading single entry from " << filename << " pos: " << seekpos << std::endl; - FILE* pFile; - - pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing?? - fseek ( pFile , seekpos , SEEK_SET ); - fread(t3_entry, sizeof(T3BaseRef), 1, pFile); - fclose(pFile); -} - -void readBackRefBlockEntry(uint32_t table, uint32_t batch_id, uint32_t block_id, uint32_t idx, BackRef *return_data) { - std::string filename = "/mnt/kioxia/tmp/T" + std::to_string(table) + "BackRef-" + std::to_string(batch_id) + "-" + std::to_string(block_id) + ".tmp"; - uint32_t seekpos = idx * sizeof(BackRef) + sizeof(uint32_t); - std::cout << "Reading single entry from " << filename << " pos: " << seekpos << std::endl; - FILE* pFile; - - pFile = fopen(filename.c_str(), "rb"); // 41228ms for block level writing, 40912ms for batch writing?? 
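-	// (Added note: the fseek below jumps straight to entry idx. seekpos skips the
-	// uint32_t entry-count header, then offsets by idx fixed-size BackRef records.)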
- fseek ( pFile , seekpos , SEEK_SET ); - fread(return_data, sizeof(BackRef), 1, pFile); - fclose(pFile); -} - -void backPropagate(uint32_t table, uint32_t batch_id, uint32_t block_id, uint32_t idx) { - std::cout << "Back propagate to table: " << table << " batch_id:" << batch_id << " block_id:" << block_id << " idx:" << idx << std::endl; - BackRef entry; - readBackRefBlockEntry(table, batch_id, block_id, idx, &entry); - //std::cout << "Ready entry L:" << entry_data.prev_block_ref_L << " R:" << entry_data.prev_block_ref_R << std::endl; - uint32_t prev_block_id_L = entry.prev_block_ref_L >> (32 - 6); - uint32_t prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF; - uint32_t prev_block_id_R = entry.prev_block_ref_R >> (32 - 6); - uint32_t prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF; - printf("T%uBackRef batch_id:%u block_id:%u! L:%u R:%u L_block_id:%u L_idx:%u R_block_id:%u R_idx:%u y:%u\n", - table, batch_id, block_id, entry.prev_block_ref_L, entry.prev_block_ref_R, - prev_block_id_L, prev_idx_L, - prev_block_id_R, prev_idx_R); - /*if (table > 3) { - backPropagate(table-1, prev_block_id_L, batch_id, prev_idx_L); - backPropagate(table-1, prev_block_id_R, batch_id, prev_idx_R); - } else if (table == 3) { - // read T2 entries right? - T2BaseRef L, R; - readT2BlockEntry(prev_block_id_L, batch_id, prev_idx_L, &L); - readT2BlockEntry(prev_block_id_R, batch_id, prev_idx_R, &R); - printf("T2 L: %u %u\n", L.Lx1, L.Lx2); - printf("T2 R: %u %u\n", R.Lx1, R.Lx2); - }*/ - - if (table > 4) { - backPropagate(table-1, prev_block_id_L, batch_id, prev_idx_L); - backPropagate(table-1, prev_block_id_R, batch_id, prev_idx_R); - } else if (table == 4) { - // read T3 entries right? - T3BaseRef L, R; - readT3BlockEntry(prev_block_id_L, batch_id, prev_idx_L, &L); - readT3BlockEntry(prev_block_id_R, batch_id, prev_idx_R, &R); - printf("T3 pos: %u L: %u %u %u %u\n", prev_idx_L, L.Lx1, L.Lx2, L.Lx3, L.Lx4); - printf("T3 pos: %u R: %u %u %u %u\n", prev_idx_R, R.Lx1, R.Lx2, R.Lx3, R.Lx4); - } - -} - - - -// try to see if we have correct back propagation values stored. 
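-// (Added sketch of the packing that backPropagate() decodes; the helper name is
-// illustrative, not from the original source. The block id sits in the top 6 bits,
-// since BATCHES = 64 fits in 6 bits, and the in-block index in the low 26 bits:
-//   uint32_t pack_backref(uint32_t block_id, uint32_t idx) {
-//       return (block_id << 26) | (idx & 0x3FFFFFF);
-//   }
-// so block_id = ref >> (32 - 6) and idx = ref & 0x3FFFFFF, as used above and below.)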
-// y = 573855352 -// xs 602009779,2127221679, 3186459061,443532047, 1234434947,1652736830, 396228306,464118917,3981993340, -// 3878862024,1730679522,3234011360,521197720,2635193875,2251292298,608281027,1468569780,2075860307, -// 2880258779,999340005,1240438978,4293399624,4226635802,1031429862,2391120891,3533658526,3823422504, -// 3983813271,4180778279,2403148863,2441456056,319558395,2338010591,196206622,1637393731,853158574,2704638588, -// 2368357012,1703808356,451208700,2145291166,2741727812,3305809226,1748168268,415625277,3051905493,4257489502, -// 1429077635,2438113590,3028543211,3993396297,2678430597,458920999,889121073,3577485087,1822568056,2222781147, -// 1942400192,195608354,1460166215,2544813525,3231425778,2958837604,2710532969 - - -/* - * this is what a single solution looks like file-wise - * -rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-10-15.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-11-0.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-13-51.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-15-12.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-17-3.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-19-3.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-25-51.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-26-17.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-30-55.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-31-0.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-33-1.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-34-15.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-4-1.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-43-17.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-53-12.tmp --rw-rw-r-- 1 nick nick 24 Okt 19 11:16 T2-60-55.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-0-10.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-12-51.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-1-43.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-15-43.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-17-51.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-3-10.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-51-35.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T3BackRef-55-35.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-10-38.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-35-40.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-43-38.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-51-40.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T5BackRef-38-5.tmp --rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T5BackRef-40-5.tmp --rw-rw-r-- 1 nick nick 16 Okt 19 11:16 T6BackRef-5-8.tmp - * - */ - -// y = 573855352 -// xs 602009779,2127221679,3186459061,443532047,1234434947,1652736830,396228306,464118917,3981993340,3878862024,1730679522,3234011360,521197720,2635193875,2251292298,608281027,1468569780,2075860307,2880258779,999340005,1240438978,4293399624,4226635802,1031429862,2391120891,3533658526,3823422504,3983813271,4180778279,2403148863,2441456056,319558395,2338010591,196206622,1637393731,853158574,2704638588,2368357012,1703808356,451208700,2145291166,2741727812,3305809226,1748168268,415625277,3051905493,4257489502,1429077635,2438113590,3028543211,3993396297,2678430597,458920999,889121073,3577485087,1822568056,2222781147,1942400192,195608354,1460166215,2544813525,3231425778,2958837604,2710532969 -void findYsolution(char *memstore) { - if (memstore == NULL) { - memstore = (char *) malloc(1738014720); - } - uint32_t y = 573855352; - std::cout << "findYsolution: " << y << std::endl; - T6BackRef *t6_data = (T6BackRef *) &memstore[0]; - - // how to back propagate all? 
-	// read batch. Sort by all blocks. Then read the batch related to the sorted blocks.
-	// loop
-	//uint32_t t6_num;
-	//readT6BlockFile(0,0,t6_data, t6_num);
-
-	//for (uint32_t batch_id = 0; batch_id < BATCHES; batch_id++) {
-		//std::cout << "Scanning T6 batch " << batch_id << std::endl;
-
-	//	for (uint32_t block_id = 0; block_id < BATCHES; block_id++) {
-	uint32_t batch_id = 5;
-	uint32_t block_id = 8;
-	uint32_t num_entries;
-	readT6BlockFile(batch_id,block_id,t6_data, num_entries);
-	std::cout << "Scanning T6 batch-block " << batch_id << "-" << block_id << " : " << num_entries << " entries" << std::endl;
-
-	for (int i=0;i<num_entries;i++) {
-		T6BackRef entry = t6_data[i];
-		if (entry.y == y) {
-			uint32_t prev_block_id_L = entry.prev_block_ref_L >> (32 - 6);
-			uint32_t prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF;
-			uint32_t prev_block_id_R = entry.prev_block_ref_R >> (32 - 6);
-			uint32_t prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF;
-			printf("T6BackRef Y FOUND! L:%u R:%u L_block_id:%u L_idx:%u R_block_id:%u R_idx:%u y:%u\n",
-				entry.prev_block_ref_L, entry.prev_block_ref_R,
-				prev_block_id_L, prev_idx_L,
-				prev_block_id_R, prev_idx_R,
-				entry.y);
-			backPropagate(5,prev_block_id_L, batch_id, prev_idx_L );
-			backPropagate(5,prev_block_id_R, batch_id, prev_idx_R );
-		}
-	}
-	//	}
-	//}
-
-}
-
-__global__
-void gpu_set_t6_final_data_and_t4_tags_directly(const uint32_t N, T6BackRef *t6_data, BackRef *t5_data, T6FinalEntry *t6_final_data, uint32_t *t4_tags) {
-	uint32_t i = blockIdx.x*blockDim.x+threadIdx.x;
-	if (i < N) {
-		T6BackRef entry = t6_data[i];
-		uint64_t t6_prev_block_id_L = entry.prev_block_ref_L >> (32 - 6);
-		uint64_t t6_prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF;
-		uint64_t t6_prev_block_id_R = entry.prev_block_ref_R >> (32 - 6);
-		uint64_t t6_prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF;
-
-		// now back-reference into t5...
-		BackRef t5_L, t5_R;
-		uint32_t t5_address_L = HOST_MAX_BLOCK_ENTRIES * t6_prev_block_id_L + t6_prev_idx_L;
-		uint32_t t5_address_R = HOST_MAX_BLOCK_ENTRIES * t6_prev_block_id_R + t6_prev_idx_R;
-		t5_L = t5_data[t5_address_L];
-		t5_R = t5_data[t5_address_R];
-		uint64_t t5_L_prev_block_id_L = t5_L.prev_block_ref_L >> (32 - 6);
-		uint64_t t5_L_prev_idx_L = t5_L.prev_block_ref_L & 0x3FFFFFF;
-		uint64_t t5_L_prev_block_id_R = t5_L.prev_block_ref_R >> (32 - 6);
-		uint64_t t5_L_prev_idx_R = t5_L.prev_block_ref_R & 0x3FFFFFF;
-		uint64_t t5_R_prev_block_id_L = t5_R.prev_block_ref_L >> (32 - 6);
-		uint64_t t5_R_prev_idx_L = t5_R.prev_block_ref_L & 0x3FFFFFF;
-		uint64_t t5_R_prev_block_id_R = t5_R.prev_block_ref_R >> (32 - 6);
-		uint64_t t5_R_prev_idx_R = t5_R.prev_block_ref_R & 0x3FFFFFF;
-
-		T6FinalEntry final_entry = {};
-		final_entry.refL = t5_L_prev_block_id_L + (t5_L_prev_block_id_R << 6) + (t6_prev_block_id_L << 12);
-		final_entry.refR = t5_R_prev_block_id_L + (t5_R_prev_block_id_R << 6) + (t6_prev_block_id_R << 12);
-		//std::cout << "T6 Final set: [" << t5_L_prev_block_id_L << " | " << t5_L_prev_block_id_R << "] - " << t6_prev_block_id_L << std::endl;
-		//std::cout << "              [" << t5_R_prev_block_id_L << " | " << t5_R_prev_block_id_R << "] - " << t6_prev_block_id_R << std::endl;
-		final_entry.y = entry.y;
-		t6_final_data[i] = final_entry;
-
-		// directly set t4 tags
-		if (true) { // w/ this it is 571ms, without it 440ms. Max optimization is 8 seconds over 64 batches.
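-			// (Added note: the t4 tag bitmap is addressed as one flat "virtual file",
-			//   address = file_batch_id * (BATCHES * HOST_MAX_BLOCK_ENTRIES)
-			//           + file_block_id * HOST_MAX_BLOCK_ENTRIES + file_idx
-			// one bit per potential T4 entry, packed 32 per uint32_t word, hence the
-			// /32 and %32 in what follows.)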
- uint32_t value; - uint64_t file_batch_id, file_block_id, file_idx; - uint64_t address; - uint32_t bits_to_set; - file_batch_id = t5_L_prev_block_id_L; file_block_id = t6_prev_block_id_L; file_idx = t5_L_prev_idx_L; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - atomicOr(&t4_tags[address / 32], bits_to_set); - - file_batch_id = t5_L_prev_block_id_R; file_block_id = t6_prev_block_id_L; file_idx = t5_L_prev_idx_R; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - atomicOr(&t4_tags[address / 32], bits_to_set); - - file_batch_id = t5_R_prev_block_id_L; file_block_id = t6_prev_block_id_R; file_idx = t5_R_prev_idx_L; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - atomicOr(&t4_tags[address / 32], bits_to_set); - - file_batch_id = t5_R_prev_block_id_R; file_block_id = t6_prev_block_id_R; file_idx = t5_R_prev_idx_R; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - atomicOr(&t4_tags[address / 32], bits_to_set); - } - - } -} - -__global__ -void gpu_backref_t5_tag(const uint32_t N, T6BackRef *t6_data, BackRef *t5_data, T6FinalEntry *t6_final_data, uint32_t *t5_tags) { - uint32_t i = blockIdx.x*blockDim.x+threadIdx.x; - if (i < N) { - T6BackRef entry = t6_data[i]; - uint64_t t6_prev_block_id_L = entry.prev_block_ref_L >> (32 - 6); - uint64_t t6_prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF; - uint64_t t6_prev_block_id_R = entry.prev_block_ref_R >> (32 - 6); - uint64_t t6_prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF; - - // now could back ref t5... - BackRef t5_L, t5_R; - uint32_t t5_address_L = HOST_MAX_BLOCK_ENTRIES * t6_prev_block_id_L + t6_prev_idx_L; - uint32_t t5_address_R = HOST_MAX_BLOCK_ENTRIES * t6_prev_block_id_R + t6_prev_idx_R; - t5_L = t5_data[t5_address_L]; - t5_R = t5_data[t5_address_R]; - uint64_t t5_L_prev_block_id_L = t5_L.prev_block_ref_L >> (32 - 6); - uint64_t t5_L_prev_idx_L = t5_L.prev_block_ref_L & 0x3FFFFFF; - uint64_t t5_L_prev_block_id_R = t5_L.prev_block_ref_R >> (32 - 6); - uint64_t t5_L_prev_idx_R = t5_L.prev_block_ref_R & 0x3FFFFFF; - uint64_t t5_R_prev_block_id_L = t5_R.prev_block_ref_L >> (32 - 6); - uint64_t t5_R_prev_idx_L = t5_R.prev_block_ref_L & 0x3FFFFFF; - uint64_t t5_R_prev_block_id_R = t5_R.prev_block_ref_R >> (32 - 6); - uint64_t t5_R_prev_idx_R = t5_R.prev_block_ref_R & 0x3FFFFFF; - - // tag addresses that were used here... 
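-		// (Added note: one bit per t5 entry, 32 tags per uint32_t word, i.e.
-		// word = address / 32, bit = 1 << (address % 32). atomicOr keeps concurrent
-		// tagging from many threads race-free, at the cost of scattered global writes.)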
- - uint32_t bits_to_set; - bits_to_set = 1 << (t5_address_L % 32); - atomicOr(&t5_tags[t5_address_L / 32], bits_to_set); - - bits_to_set = 1 << (t5_address_R % 32); - atomicOr(&t5_tags[t5_address_R / 32], bits_to_set); - - - T6FinalEntry final_entry = {}; - final_entry.refL = t5_L_prev_block_id_L + (t5_L_prev_block_id_R << 6) + (t6_prev_block_id_L << 12); - final_entry.refR = t5_R_prev_block_id_L + (t5_R_prev_block_id_R << 6) + (t6_prev_block_id_R << 12); - //std::cout << "T6 Final set: [" << t5_L_prev_block_id_L << " | " << t5_L_prev_block_id_R << "] - " << t6_prev_block_id_L << std::endl; - //std::cout << " [" << t5_R_prev_block_id_L << " | " << t5_R_prev_block_id_R << "] - " << t6_prev_block_id_R << std::endl; - final_entry.y = entry.y; - t6_final_data[i] = final_entry; - } -} - -// t6's map to t4's, t5's map to t3's -__global__ -void gpu_backref_t4_tag(const uint32_t N, BackRef *t4_data, T3BaseRef *t3_data, T4FinalEntry *t4_final_data, uint32_t *t4_tags) { - uint32_t i = blockIdx.x*blockDim.x+threadIdx.x; - if (i < N) { - BackRef entry = t4_data[i]; - uint64_t t4_prev_block_id_L = entry.prev_block_ref_L >> (32 - 6); - uint64_t t4_prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF; - uint64_t t4_prev_block_id_R = entry.prev_block_ref_R >> (32 - 6); - uint64_t t4_prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF; - - // now could back ref t5... - T3BaseRef t3_L, t3_R; - uint32_t t3_address_L = HOST_MAX_BLOCK_ENTRIES * t4_prev_block_id_L + t4_prev_idx_L; - uint32_t t3_address_R = HOST_MAX_BLOCK_ENTRIES * t4_prev_block_id_R + t4_prev_idx_R; - t3_L = t3_data[t3_address_L]; - t3_R = t3_data[t3_address_R]; - - T4FinalEntry finalEntry; - finalEntry.Lx1 = t3_L.Lx1; - finalEntry.Lx2 = t3_L.Lx2; - finalEntry.Lx3 = t3_L.Lx3; - finalEntry.Lx4 = t3_L.Lx4; - finalEntry.Lx5 = t3_R.Lx1; - finalEntry.Lx6 = t3_R.Lx2; - finalEntry.Lx7 = t3_R.Lx3; - finalEntry.Lx8 = t3_R.Lx4; - - t4_final_data[i] = finalEntry; - } -} - -__global__ -void gpu_backref_t4_lxlists(const uint32_t N, BackRef *t4_data, T3BaseRef *t3_data, uint32_t *t4_lx_list, uint32_t *t4_tags) { - uint32_t i = blockIdx.x*blockDim.x+threadIdx.x; - if (i < N) { - BackRef entry = t4_data[i]; - uint64_t t4_prev_block_id_L = entry.prev_block_ref_L >> (32 - 6); - uint64_t t4_prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF; - uint64_t t4_prev_block_id_R = entry.prev_block_ref_R >> (32 - 6); - uint64_t t4_prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF; - - // now could back ref t5... 
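-		// (Added clarification: despite the copied comment above, this kernel
-		// back-references into T3. Each T4 entry resolves its L and R sides to two
-		// T3BaseRef entries, yielding the 8 Lx values written out below.)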
- T3BaseRef t3_L, t3_R; - uint32_t t3_address_L = HOST_MAX_BLOCK_ENTRIES * t4_prev_block_id_L + t4_prev_idx_L; - uint32_t t3_address_R = HOST_MAX_BLOCK_ENTRIES * t4_prev_block_id_R + t4_prev_idx_R; - t3_L = t3_data[t3_address_L]; - t3_R = t3_data[t3_address_R]; - - uint32_t base_address = i*8; - t4_lx_list[base_address+0] = t3_L.Lx1; - t4_lx_list[base_address+1] = t3_L.Lx2; - t4_lx_list[base_address+2] = t3_L.Lx3; - t4_lx_list[base_address+3] = t3_L.Lx4; - t4_lx_list[base_address+4] = t3_R.Lx1; - t4_lx_list[base_address+5] = t3_R.Lx2; - t4_lx_list[base_address+6] = t3_R.Lx3; - t4_lx_list[base_address+7] = t3_R.Lx4; - - } -} - -__global__ -void gpu_t5_tag_to_t4(const uint32_t N, const uint32_t t5_block_id, BackRef *t5_data, uint32_t *t5_tags, uint32_t *t4_tags) { - uint32_t i = blockIdx.x*blockDim.x+threadIdx.x; - if (i < N) { - uint32_t t5_address = i; - uint32_t bits_to_set = 1 << (t5_address % 32); - uint32_t has_set = t5_tags[t5_address / 32] & bits_to_set; - if (has_set > 0) { - BackRef t5_entry = t5_data[t5_address]; - uint64_t t5_L_prev_block_id_L = t5_entry.prev_block_ref_L >> (32 - 6); - uint64_t t5_L_prev_idx_L = t5_entry.prev_block_ref_L & 0x3FFFFFF; - uint64_t t5_L_prev_block_id_R = t5_entry.prev_block_ref_R >> (32 - 6); - uint64_t t5_L_prev_idx_R = t5_entry.prev_block_ref_R & 0x3FFFFFF; - - uint64_t file_batch_id, file_block_id, file_idx; - uint64_t address; - uint32_t bits_to_set; - - file_batch_id = t5_L_prev_block_id_L; file_block_id = t5_block_id; file_idx = t5_L_prev_idx_L; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - atomicOr(&t4_tags[address / 32], bits_to_set); - - file_batch_id = t5_L_prev_block_id_R; file_block_id = t5_block_id; file_idx = t5_L_prev_idx_R; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - atomicOr(&t4_tags[address / 32], bits_to_set); - } - } -} - -void tagPreviousTable(uint32_t t5_block_id, BackRef *t5_data, uint32_t num_entries, uint32_t *t5_tags, uint32_t *t4_tags) { - // we have to read all T2 entries and merge into T3 table that then contains 4 Lx entries. 
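-	// (Added clarification: this is the CPU fallback of gpu_t5_tag_to_t4 above. It
-	// walks this block's t5 tag bitmap and sets the matching t4 tag bits host-side.)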
- //std::cout << " doing table block " << t5_block_id << std::endl; - for (int i=0;i 0) { - //std::cout << "WAS SET: t5 block_id: " << t5_block_id << " entry i: " << i << std::endl; - BackRef t5_entry = t5_data[t5_address]; - uint64_t t5_L_prev_block_id_L = t5_entry.prev_block_ref_L >> (32 - 6); - uint64_t t5_L_prev_idx_L = t5_entry.prev_block_ref_L & 0x3FFFFFF; - uint64_t t5_L_prev_block_id_R = t5_entry.prev_block_ref_R >> (32 - 6); - uint64_t t5_L_prev_idx_R = t5_entry.prev_block_ref_R & 0x3FFFFFF; - - uint64_t file_batch_id, file_block_id, file_idx; - uint64_t address; - uint32_t bits_to_set; - file_batch_id = t5_L_prev_block_id_L; file_block_id = t5_block_id; file_idx = t5_L_prev_idx_L; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - address = address / 32; - //uint32_t has_set = t4_tags[address] & bits_to_set; - //if (has_set == 0) printf("error did not set first time some address mistake\n"); - t4_tags[address] |= bits_to_set; - - file_batch_id = t5_L_prev_block_id_R; file_block_id = t5_block_id; file_idx = t5_L_prev_idx_R; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - address = address / 32; - //has_set = t4_tags[address] & bits_to_set; - //if (has_set == 0) printf("error did not set first time some address mistake\n"); - - t4_tags[address] |= bits_to_set; - - num_set_t4 += 2; - } - } - //std::cout << " done table block " << t5_block_id << std::endl; -} - -void createT6FinalEntries_oldbenchmarks(char *memstore) { - // 2) T6 must propagate down to T4 and tag all used entries, and then update T6 references to include T4. - // 3) T6 reads one block at a time for each batch, small memory print - // - then T5 one whole batch, since each block references 0..BATCHES - // - T4 tag list can be set (booleans) - // - update T6 data to include y, 6,6,6 and 6,6,6 references - const uint64_t T4_TAG_MEM_BYTES_NEEDED = (HOST_MAX_BLOCK_ENTRIES * ((uint64_t) (BATCHES * BATCHES)) * sizeof(uint32_t)) / 32; - const uint64_t T5_TAG_MEM_BYTES_NEEDED = T4_TAG_MEM_BYTES_NEEDED; // (HOST_MAX_BLOCK_ENTRIES * ((uint64_t) (BATCHES)) * sizeof(uint32_t)) / 32; - const uint64_t T6_MEM_BYTES_NEEDED = HOST_MAX_BLOCK_ENTRIES * sizeof(T6BackRef); - const uint64_t T6_FINAL_MEM_BYTES_NEEDED = HOST_MAX_BLOCK_ENTRIES * sizeof(T6FinalEntry); - const uint64_t T5_MEM_BYTES_NEEDED = HOST_MAX_BLOCK_ENTRIES * ((uint64_t) (BATCHES)) * sizeof(BackRef); - const uint64_t TOTAL_MEM_BYTES_NEEDED = T4_TAG_MEM_BYTES_NEEDED + T5_TAG_MEM_BYTES_NEEDED + T6_MEM_BYTES_NEEDED + T6_FINAL_MEM_BYTES_NEEDED + T5_MEM_BYTES_NEEDED; - - T6BackRef *device_t6_data; - BackRef *device_t5_data; - T6FinalEntry *device_t6_final_data; - uint32_t *device_t4_tags; - uint32_t *device_t5_tags; - - if (memstore==NULL) { - std::cout << "Allocating memory bytes: " << TOTAL_MEM_BYTES_NEEDED << std::endl; - //memstore = (char *) malloc(TOTAL_MEM_BYTES_NEEDED); - CUDA_CHECK_RETURN(cudaMallocHost((void**)&memstore, TOTAL_MEM_BYTES_NEEDED)); // = new F2_Result_Pair[HOST_F2_RESULTS_SPACE](); - std::cout << " host mem allocated..." 
<< std::endl; - CUDA_CHECK_RETURN(cudaMalloc(&device_t6_data, T6_MEM_BYTES_NEEDED)); - CUDA_CHECK_RETURN(cudaMalloc(&device_t5_data, T5_MEM_BYTES_NEEDED)); - CUDA_CHECK_RETURN(cudaMalloc(&device_t6_final_data, T6_FINAL_MEM_BYTES_NEEDED)); - CUDA_CHECK_RETURN(cudaMalloc(&device_t4_tags, T4_TAG_MEM_BYTES_NEEDED)); - CUDA_CHECK_RETURN(cudaMalloc(&device_t5_tags, T5_TAG_MEM_BYTES_NEEDED)); - // clear bits... - CUDA_CHECK_RETURN(cudaMemset(device_t4_tags, 0, T4_TAG_MEM_BYTES_NEEDED)); - CUDA_CHECK_RETURN(cudaMemset(device_t5_tags, 0, T5_TAG_MEM_BYTES_NEEDED)); - - std::cout << " gpu mem allocated..." << std::endl; - - if (memstore == NULL) { - exit (1); - } - } - - // TODO: THIS IS SUPER SLOW ON HOST CPU! but it only needs 5GB so could load into GPU and set it all there... - - uint64_t NEXT_MEM_BYTES_START = 0; - - const uint64_t T5_DATA_START = NEXT_MEM_BYTES_START; - BackRef *t5_data = (BackRef *) &memstore[T5_DATA_START]; - uint32_t t5_num_entries[BATCHES]; - NEXT_MEM_BYTES_START += T5_MEM_BYTES_NEEDED; - - const uint64_t T6_DATA_START = NEXT_MEM_BYTES_START; - T6BackRef *t6_data = (T6BackRef *) &memstore[T6_DATA_START]; - NEXT_MEM_BYTES_START += T6_MEM_BYTES_NEEDED; - - const uint64_t T6_FINAL_DATA_START = NEXT_MEM_BYTES_START; - T6FinalEntry *t6_final_data = (T6FinalEntry *) &memstore[T6_FINAL_DATA_START]; - NEXT_MEM_BYTES_START += T6_FINAL_MEM_BYTES_NEEDED; - - uint32_t *t4_tags = (uint32_t *) &memstore[NEXT_MEM_BYTES_START]; // needs HOST_MAX_BLOCK_ENTRIES * 64 * 64 bytes - // will reference this as if file, like memstore[batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + block_id*HOST_MAX_BLOCK_ENTRIES] - memset(t4_tags, 0, T4_TAG_MEM_BYTES_NEEDED); - NEXT_MEM_BYTES_START += T4_TAG_MEM_BYTES_NEEDED; - - uint32_t *t5_tags = (uint32_t *) &memstore[NEXT_MEM_BYTES_START]; // needs HOST_MAX_BLOCK_ENTRIES * 64 * 64 bytes - // will reference this as if file, like memstore[batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + block_id*HOST_MAX_BLOCK_ENTRIES] - memset(t5_tags, 0, T5_TAG_MEM_BYTES_NEEDED); - NEXT_MEM_BYTES_START += T5_TAG_MEM_BYTES_NEEDED; - - using milli = std::chrono::milliseconds; - - std::cout << "Starting..." 
<< std::endl;
-	uint64_t total_t4_tagged = 0;
-	num_set_t4 = 0;
-	num_same_addresses = 0;
-	num_set_t5 = 0;
-
-	const int doCPUmethod = 0;
-	const int doGPUmethod = 2; // 1 is single-shot setting, 2 is 2-phase setting
-	/*
-	******* method 1 **************
-	All compute loop time: 30760 ms
-	*********************
-	 Tagged T4 entries: 3531979836 should be 114437654 out of max 4563402752
-	*********************
-	Total time: 33862 ms
-
-	******* method 2 **************
-	All compute loop time: 28907 ms
-	*********************
-	All compute loop time: 28753 ms
-	*********************
-	 Tagged T4 entries: 3531979836 should be 114437654 out of max 4563402752
-	*********************
-	Total time: 32010 ms
-	*/
-	int blockSize = 256;
-
-
-	auto compute_loop_start = std::chrono::high_resolution_clock::now();
-	//for (uint32_t t6_batch_id = 5; t6_batch_id < 6; t6_batch_id++) {
-	for (uint32_t t6_batch_id = 0; t6_batch_id < BATCHES; t6_batch_id++) {
-		auto batch_start = std::chrono::high_resolution_clock::now();
-		if (doCPUmethod > 0) {
-			memset(t5_tags, 0, T5_TAG_MEM_BYTES_NEEDED);
-		} else {
-			CUDA_CHECK_RETURN(cudaMemset(device_t5_tags, 0, T5_TAG_MEM_BYTES_NEEDED));
-		}
-		for (uint64_t t5_block_id = 0; t5_block_id < BATCHES; t5_block_id++) {
-			readBackRefBlockFile(5, t5_block_id, t6_batch_id,
-					&t5_data[HOST_MAX_BLOCK_ENTRIES*t5_block_id],
-					t5_num_entries[t5_block_id]);
-			//std::cout << "Loading T5 batch-block " << t5_block_id << "-" << t6_batch_id << " : " << t5_num_entries[t5_block_id] << " entries" << std::endl;
-			if (doGPUmethod > 0)
-				CUDA_CHECK_RETURN(cudaMemcpy(&device_t5_data[HOST_MAX_BLOCK_ENTRIES*t5_block_id],&t5_data[HOST_MAX_BLOCK_ENTRIES*t5_block_id],t5_num_entries[t5_block_id]*sizeof(BackRef),cudaMemcpyHostToDevice));
-		}
-		if (doGPUmethod > 0)
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-		// TODO: we need to make sure we are getting the correct values/tags set
-		// find the file for the single value y and follow that y back to see if we are doing it right....
-
-		//for (uint32_t t6_block_id = 8; t6_block_id < 9; t6_block_id++) { // BATCHES; t6_block_id++) {
-		for (uint32_t t6_block_id = 0; t6_block_id < BATCHES; t6_block_id++) {
-			uint32_t t6_num_entries;
-			readT6BlockFile(t6_batch_id,t6_block_id,t6_data, t6_num_entries);
-			if (doGPUmethod > 0) {
-				CUDA_CHECK_RETURN(cudaMemcpy(device_t6_data, t6_data,t6_num_entries*sizeof(T6BackRef),cudaMemcpyHostToDevice));
-				CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-				//std::cout << "Scanning T6 batch-block " << t6_batch_id << "-" << t6_block_id << " : " << t6_num_entries << " entries" << std::endl;
-				int numBlocks = (t6_num_entries + blockSize - 1) / (blockSize);
-				if (doGPUmethod == 1)
-					gpu_set_t6_final_data_and_t4_tags_directly<<<numBlocks, blockSize>>>(t6_num_entries,device_t6_data, device_t5_data, device_t6_final_data, device_t4_tags);
-				else
-					gpu_backref_t5_tag<<<numBlocks, blockSize>>>(t6_num_entries,device_t6_data, device_t5_data, device_t6_final_data, device_t5_tags);
-
-				CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-				// now write back results to hostmem
-				CUDA_CHECK_RETURN(cudaMemcpy(t6_final_data, device_t6_final_data,t6_num_entries*sizeof(T6FinalEntry),cudaMemcpyDeviceToHost));
-				CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-			}
-			if (doCPUmethod > 0)
-			for (int i=0;i<t6_num_entries;i++) {
-				T6BackRef entry = t6_data[i];
-				uint64_t t6_prev_block_id_L = entry.prev_block_ref_L >> (32 - 6);
-				uint64_t t6_prev_idx_L = entry.prev_block_ref_L & 0x3FFFFFF;
-				uint64_t t6_prev_block_id_R = entry.prev_block_ref_R >> (32 - 6);
-				uint64_t t6_prev_idx_R = entry.prev_block_ref_R & 0x3FFFFFF;
-
-				// now could back ref t5...
- BackRef t5_L, t5_R; - uint32_t t5_address_L = HOST_MAX_BLOCK_ENTRIES * t6_prev_block_id_L + t6_prev_idx_L; - uint32_t t5_address_R = HOST_MAX_BLOCK_ENTRIES * t6_prev_block_id_R + t6_prev_idx_R; - t5_L = t5_data[t5_address_L]; - t5_R = t5_data[t5_address_R]; - uint64_t t5_L_prev_block_id_L = t5_L.prev_block_ref_L >> (32 - 6); - uint64_t t5_L_prev_idx_L = t5_L.prev_block_ref_L & 0x3FFFFFF; - uint64_t t5_L_prev_block_id_R = t5_L.prev_block_ref_R >> (32 - 6); - uint64_t t5_L_prev_idx_R = t5_L.prev_block_ref_R & 0x3FFFFFF; - uint64_t t5_R_prev_block_id_L = t5_R.prev_block_ref_L >> (32 - 6); - uint64_t t5_R_prev_idx_L = t5_R.prev_block_ref_L & 0x3FFFFFF; - uint64_t t5_R_prev_block_id_R = t5_R.prev_block_ref_R >> (32 - 6); - uint64_t t5_R_prev_idx_R = t5_R.prev_block_ref_R & 0x3FFFFFF; - - // tag addresses that were used here... - if (doCPUmethod == 2) { - uint32_t bits_to_set; - bits_to_set = 1 << (t5_address_L % 32); - uint32_t value = t5_tags[t5_address_L / 32] & bits_to_set; - if (value > 1) { num_same_addresses++; } - t5_tags[t5_address_L / 32] |= bits_to_set; - - bits_to_set = 1 << (t5_address_R % 32); - value = t5_tags[t5_address_R / 32] & bits_to_set; - if (value > 1) { num_same_addresses++; } - t5_tags[t5_address_R / 32] |= bits_to_set; - - num_set_t5 += 2; - } - - T6FinalEntry final_entry = {}; - final_entry.refL = t5_L_prev_block_id_L + (t5_L_prev_block_id_R << 6) + (t6_prev_block_id_L << 12); - final_entry.refR = t5_R_prev_block_id_L + (t5_R_prev_block_id_R << 6) + (t6_prev_block_id_R << 12); - //std::cout << "T6 Final set: [" << t5_L_prev_block_id_L << " | " << t5_L_prev_block_id_R << "] - " << t6_prev_block_id_L << std::endl; - //std::cout << " [" << t5_R_prev_block_id_L << " | " << t5_R_prev_block_id_R << "] - " << t6_prev_block_id_R << std::endl; - final_entry.y = entry.y; - t6_final_data[i] = final_entry; - - - - // directly set t4 tags - if (doCPUmethod == 1) { - uint32_t value; - uint64_t file_batch_id, file_block_id, file_idx; - uint64_t address; - uint32_t bits_to_set; - file_batch_id = t5_L_prev_block_id_L; file_block_id = t6_prev_block_id_L; file_idx = t5_L_prev_idx_L; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - value = t4_tags[address / 32] & bits_to_set; - if (value > 1) { num_same_addresses++; } - t4_tags[address / 32] |= bits_to_set; - - file_batch_id = t5_L_prev_block_id_R; file_block_id = t6_prev_block_id_L; file_idx = t5_L_prev_idx_R; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - value = t4_tags[address / 32] & bits_to_set; - if (value > 1) { num_same_addresses++; } - t4_tags[address / 32] |= bits_to_set; - - file_batch_id = t5_R_prev_block_id_L; file_block_id = t6_prev_block_id_R; file_idx = t5_R_prev_idx_L; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - value = t4_tags[address / 32] & bits_to_set; - if (value > 1) { num_same_addresses++; } - t4_tags[address / 32] |= bits_to_set; - - file_batch_id = t5_R_prev_block_id_R; file_block_id = t6_prev_block_id_R; file_idx = t5_R_prev_idx_R; - address = file_batch_id*HOST_MAX_BLOCK_ENTRIES*BATCHES + file_block_id*HOST_MAX_BLOCK_ENTRIES + file_idx; - bits_to_set = 1 << (address % 32); - value = t4_tags[address / 32] & bits_to_set; - if (value > 1) { num_same_addresses++; } - t4_tags[address / 32] |= bits_to_set; - - 
num_set_t4 += 4; - } - - // just for benchmarks - // CPU: - // 10239 ms with / 32 per batch, writing to t4 tags directly - // 5057 ms with / 32 per batch but writing to t5 then scan t5 tags and write to t4 tags - // - // 1800ms without writing to tags, but fetch t5 data and setting t6 backrefs. -> still 1.9 minutes - // 1588ms with T6 writing tags for t5 instead of fetching t5, doesn't seem to save much huh? - // 1479ms without reading t5 at all -- so almost no gain (although to be fair t5 was cached reads). - - // Bladebit is 25s phase 2 - // read t5+t6 data only is 232ms, lowest bound, 15s min. - // read and transfer to gpu is 324ms - total 20s - // gpu setting data is 350ms...hallejuya - // - 26 seconds total but without tags written - // - 28.8 seconds writting back final data T6 - // - 41 seconds w/ tags written. - // even settings tags is 640ms hot god damn I love gpu, vs 6500ms = x10! - // but can this be improved so less random writes? - // total time is 41s - - //backPropagate(5,prev_block_id_L, batch_id, prev_idx_L ); - //backPropagate(5,prev_block_id_R, batch_id, prev_idx_R ); - //printf("%u %u %u %u\n", t5_L_prev_block_id_L, t5_L_prev_block_id_R, t5_R_prev_block_id_L, t5_R_prev_block_id_R); - - } - //}// entry.y - - //writeT6FinalBlockFile(t6_batch_id,t6_block_id,t6_data,t6_num_entries); - - } - - // 2067ms w/o any tagging - // 3865ms w/ tagging but not tagging t4 - // 6299ms w tag on 5 and t4 tags all set - // 10954ms tagging 4 directly (skipping 5) - if (doCPUmethod == 2) { - for (uint64_t t5_block_id = 0; t5_block_id < BATCHES; t5_block_id++) { - uint32_t num_entries = t5_num_entries[t5_block_id]; - //std::cout << "Doing previous table tag for t6_batch_id: " << t6_batch_id << std::endl; - tagPreviousTable(t5_block_id, - &t5_data[HOST_MAX_BLOCK_ENTRIES * t5_block_id], t5_num_entries[t5_block_id], - &t5_tags[(HOST_MAX_BLOCK_ENTRIES * t5_block_id) / 32], // note /32 since 32 bits - t4_tags); - } - } - - if (doGPUmethod == 2) { - for (uint64_t t5_block_id = 0; t5_block_id < BATCHES; t5_block_id++) { - uint32_t num_entries = t5_num_entries[t5_block_id]; - uint32_t t5_address = HOST_MAX_BLOCK_ENTRIES * t5_block_id; - int numBlocks = (num_entries + blockSize - 1) / (blockSize); - gpu_t5_tag_to_t4<<>>(num_entries, t5_block_id, - &device_t5_data[t5_address], - &device_t5_tags[t5_address/32], device_t4_tags); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - } - } - - auto batch_end = std::chrono::high_resolution_clock::now(); - //std::cout << "*********************" << std::endl; - std::cout << "*** Batch " << t6_batch_id << " time: " << std::chrono::duration_cast(batch_end - batch_start).count() << " ms ***\n"; - //std::cout << "*********************" << std::endl; - } - if (doGPUmethod > 0) { - // technically don't need to do this if stays in device memory...just for verfication purposes. - CUDA_CHECK_RETURN(cudaMemcpy(t4_tags, device_t4_tags,T4_TAG_MEM_BYTES_NEEDED,cudaMemcpyDeviceToHost)); - CUDA_CHECK_RETURN(cudaDeviceSynchronize()); - } - auto compute_loop_end = std::chrono::high_resolution_clock::now(); - std::cout << "*********************" << std::endl; - std::cout << "All compute loop time: " << std::chrono::duration_cast(compute_loop_end - compute_loop_start).count() << " ms\n"; - std::cout << "*********************" << std::endl; - - - - /*std::cout << "setting tags..." 
<< std::endl; - for (uint32_t t4_batch_id = 0; t4_batch_id < BATCHES; t4_batch_id++) { - //std::cout << "setting batch " << t4_batch_id << std::endl; - for (uint64_t t4_block_id = 0; t4_block_id < BATCHES; t4_block_id++) { - //for (uint64_t i=0;i<1;i++) { - for (uint64_t i=0;i 0) { - total_t4_tagged++; - //std::cout << " Tagged entry t4 batch_id: " << t4_batch_id << " block:" << t4_block_id << std::endl; - }; - } - } - //std::cout << "partial result: " << total_t4_tagged << std::endl; - } - std::cout << " Num set t5: " << num_set_t5 << std::endl; - std::cout << " Num set t4: " << num_set_t4 << std::endl; - std::cout << " Num same addresses: " << num_same_addresses << std::endl; - std::cout << " Tagged T4 entries: " << total_t4_tagged << " should be 114437654 out of max 4563402752" << std::endl; - - std::cout << " -rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-10-38.tmp" << std::endl - << " -rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-35-40.tmp" << std::endl - << " -rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-43-38.tmp" << std::endl - << " -rw-rw-r-- 1 nick nick 12 Okt 19 11:16 T4BackRef-51-40.tmp" << std::endl; -} - - -__global__ -void gpu_chacha8_xs_to_kbcs(const uint32_t N, - const __restrict__ uint32_t *input, - uint32_t *xs, uint32_t *kbcs) -{ - uint32_t x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; - - int index = blockIdx.x * blockDim.x + threadIdx.x; // + x_start/16; - - if (index < N) { - uint32_t x = xs[index]; - uint32_t x_group = x / 16; - uint32_t x_selection = x % 16; - uint32_t pos = x_group; - - x0 = input[0];x1 = input[1];x2 = input[2];x3 = input[3];x4 = input[4];x5 = input[5];x6 = input[6];x7 = input[7]; - x8 = input[8];x9 = input[9];x10 = input[10];x11 = input[11]; - x12 = pos; x13 = 0; // pos never bigger than 32 bit pos >> 32; - x14 = input[14];x15 = input[15]; - - #pragma unroll - for (int i = 0; i < 4; i++) { - QUARTERROUND(x0, x4, x8, x12);QUARTERROUND(x1, x5, x9, x13);QUARTERROUND(x2, x6, x10, x14);QUARTERROUND(x3, x7, x11, x15); - QUARTERROUND(x0, x5, x10, x15);QUARTERROUND(x1, x6, x11, x12);QUARTERROUND(x2, x7, x8, x13);QUARTERROUND(x3, x4, x9, x14); - } - - x0 += input[0];x1 += input[1];x2 += input[2];x3 += input[3];x4 += input[4]; - x5 += input[5];x6 += input[6];x7 += input[7];x8 += input[8];x9 += input[9]; - x10 += input[10];x11 += input[11];x12 += x_group; // j12;//x13 += 0; - x14 += input[14];x15 += input[15]; - - // convert to little endian/big endian whatever, chia needs it like this - BYTESWAP32(x0);BYTESWAP32(x1);BYTESWAP32(x2);BYTESWAP32(x3);BYTESWAP32(x4);BYTESWAP32(x5); - BYTESWAP32(x6);BYTESWAP32(x7);BYTESWAP32(x8);BYTESWAP32(x9);BYTESWAP32(x10);BYTESWAP32(x11); - BYTESWAP32(x12);BYTESWAP32(x13);BYTESWAP32(x14);BYTESWAP32(x15); - - uint32_t result_x; - if (x_selection == 0) result_x = x0; - if (x_selection == 1) result_x = x1; - if (x_selection == 2) result_x = x2; - if (x_selection == 3) result_x = x3; - if (x_selection == 4) result_x = x4; - if (x_selection == 5) result_x = x5; - if (x_selection == 6) result_x = x6; - if (x_selection == 7) result_x = x7; - if (x_selection == 8) result_x = x8; - if (x_selection == 9) result_x = x9; - if (x_selection == 10) result_x = x10; - if (x_selection == 11) result_x = x11; - if (x_selection == 12) result_x = x12; - if (x_selection == 13) result_x = x13; - if (x_selection == 14) result_x = x14; - if (x_selection == 15) result_x = x15; - uint64_t y = (((uint64_t) result_x) << 6) + (x >> 26); - uint32_t kbc_bucket_id = uint32_t (y / kBC); - //printf("x: %u y:%llu kbc:%u\n", x, y, 
-
-
-__global__
-void showSorted(const uint32_t N, uint32_t *list) {
-	// single-threaded debug kernel: launched <<<1,1>>> so the output prints in order
-	for (int i=0;i<N;i++) {
-		printf("%u\n", list[i]);
-	}
-}
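-
-// Added sketch (not in the original source): the /32 indexing into device_t5_tags
-// below suggests the tag arrays pack one bit per entry into uint32_t words. A helper
-// pair like this would implement that layout; setTagBit uses atomicOr so concurrent
-// threads tagging entries that share a word cannot lose updates.
-__device__ __forceinline__ void setTagBit(uint32_t *tags, const uint32_t entry_idx) {
-	atomicOr(&tags[entry_idx >> 5], 1u << (entry_idx & 31)); // word = idx/32, bit = idx%32
-}
-__device__ __forceinline__ bool readTagBit(const uint32_t *tags, const uint32_t entry_idx) {
-	return (tags[entry_idx >> 5] >> (entry_idx & 31)) & 1u;
-}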
-
-// ... [start of createT6FinalEntriesGPU(char *memstore) lost in this listing: device
-// buffer setup and loading of the T6/T5 block files; the tail of its T6 batch loop
-// survives below, with the name of the T6 tag kernel missing]
-
-			auto tag_start = std::chrono::high_resolution_clock::now();
-			/*...*/<<<numBlocks,blockSize>>>(t6_num_entries,device_t6_data, device_t5_data, device_t6_final_data, device_t5_tags);
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-			auto tag_end = std::chrono::high_resolution_clock::now();
-			std::cout << "*** gpu tag ms: " << std::chrono::duration_cast<std::chrono::milliseconds>(tag_end - tag_start).count() << " ms ***\n";
-			// now write back results to hostmem
-			std::cout << "writing results to hostmem" << std::endl;
-
-			CUDA_CHECK_RETURN(cudaMemcpy(t6_final_data, device_t6_final_data,t6_num_entries*sizeof(T6FinalEntry),cudaMemcpyDeviceToHost));
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		}
-
-		auto tag_start = std::chrono::high_resolution_clock::now();
-		for (uint64_t t5_block_id = 0; t5_block_id < BATCHES; t5_block_id++) {
-			std::cout << " gpu t5 tag to t4 t5_block_id:" << t5_block_id << std::endl;
-			uint32_t num_entries = t5_num_entries[t5_block_id];
-			uint32_t t5_address = HOST_MAX_BLOCK_ENTRIES * t5_block_id;
-			int numBlocks = (num_entries + blockSize - 1) / (blockSize);
-			gpu_t5_tag_to_t4<<<numBlocks,blockSize>>>(num_entries, t5_block_id,
-					&device_t5_data[t5_address],
-					&device_t5_tags[t5_address/32], device_t4_tags);
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		}
-		auto tag_end = std::chrono::high_resolution_clock::now();
-		std::cout << "*** gpu tag ms: " << std::chrono::duration_cast<std::chrono::milliseconds>(tag_end - tag_start).count() << " ms ***\n";
-		auto batch_end = std::chrono::high_resolution_clock::now();
-		//std::cout << "*********************" << std::endl;
-		std::cout << "*** Batch " << t6_batch_id << " time: " << std::chrono::duration_cast<std::chrono::milliseconds>(batch_end - batch_start).count() << " ms ***\n";
-		//std::cout << "*********************" << std::endl;
-	}
-
-	auto compute_loop_end = std::chrono::high_resolution_clock::now();
-	std::cout << "*********************" << std::endl;
-	std::cout << "All compute loop time: " << std::chrono::duration_cast<std::chrono::milliseconds>(compute_loop_end - compute_loop_start).count() << " ms\n";
-	std::cout << "*********************" << std::endl;
-	auto t4_final_start = std::chrono::high_resolution_clock::now();
-	// TODO: free gpu mem and set up t3 and t4 mem
-	std::cout << "Doing T4->T3 tags" << std::endl;
-
-	uint32_t t3_num_entries[BATCHES];
-
-	return; // NOTE: early return -- the T4->T3 pass below is currently disabled
-
-	for (uint32_t t4_batch_id = 0; t4_batch_id < BATCHES; t4_batch_id++) {
-		auto batch_start = std::chrono::high_resolution_clock::now();
-		std::cout << "Loading T3BaseRef [0-63]-batch " << t4_batch_id << std::endl;
-
-		for (uint64_t t3_block_id = 0; t3_block_id < BATCHES; t3_block_id++) {
-			readT3BaseRefBlockFile(t3_block_id, t4_batch_id,
-					&t3_baseref_data[HOST_MAX_BLOCK_ENTRIES*t3_block_id],
-					t3_num_entries[t3_block_id]);
-
-			CUDA_CHECK_RETURN(cudaMemcpy(&device_t3_baseref_data[HOST_MAX_BLOCK_ENTRIES*t3_block_id],&t3_baseref_data[HOST_MAX_BLOCK_ENTRIES*t3_block_id],
-					t3_num_entries[t3_block_id]*sizeof(T3BaseRef), // note T3BaseRef
-					cudaMemcpyHostToDevice));
-		}
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		// now we have all t3 entries in a block row for back-referencing from the t4 blocks.
-		// each t4 block then just fetches its t3 entries to get Lx1,Lx2,Lx3,Lx4 * 2 = 8 Lx values.
-
-		for (uint32_t t4_block_id = 0; t4_block_id < BATCHES; t4_block_id++) {
-			uint32_t t4_num_entries;
-			readBackRefBlockFile(4, t4_batch_id,t4_block_id,t4_data, t4_num_entries);
-
-			CUDA_CHECK_RETURN(cudaMemcpy(device_t4_data, t4_data,t4_num_entries*sizeof(BackRef),cudaMemcpyHostToDevice));
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-			//std::cout << "Scanning T6 batch-block " << t6_batch_id << "-" << t6_block_id << " : " << t6_num_entries << " entries" << std::endl;
-			int numBlocks = (t4_num_entries + blockSize - 1) / (blockSize);
-			// gpu_backref_t4_tag<<<numBlocks,blockSize>>>(t4_num_entries,device_t4_data, device_t3_baseref_data, device_t4_final_data, device_t4_tags);
-			gpu_backref_t4_lxlists<<<numBlocks,blockSize>>>(t4_num_entries,device_t4_data, device_t3_baseref_data, device_t4_lx_list, device_t4_tags);
-			gpu_chacha8_xs_to_kbcs<<<(t4_num_entries*8 + blockSize - 1)/blockSize,blockSize>>>(t4_num_entries*8, chacha_input, device_t4_lx_list, kbcs);
-			// wrap the raw pointers with a device_ptr so thrust can sort in place on the device
-			thrust::device_ptr<uint32_t> device_t4_lx_list_ptr(device_t4_lx_list);
-			thrust::sort(device_t4_lx_list_ptr, device_t4_lx_list_ptr + t4_num_entries*8);
-			showSorted<<<1,1>>>(30,device_t4_lx_list);
-
-			thrust::device_ptr<uint32_t> device_kbcs_ptr(kbcs);
-			thrust::sort(device_kbcs_ptr, device_kbcs_ptr + t4_num_entries*8);
-			showSorted<<<1,1>>>(30,kbcs);
-
-			//thrust::sort(device_t4_lx_list_ptr.begin(), device_t4_lx_list_ptr.end() + t4_num_entries*4);
-			//uint32_t new_end = thrust::unique(device_t4_lx_list_ptr, device_t4_lx_list_ptr + t4_num_entries*4);
-			//std::cout << "Thrust sorted " << (t4_num_entries*4) << " down to " << new_end << std::endl;
-
-			// T4 final time: 61754 ms (57808ms without backref t4 tag)
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-			// now write back results to hostmem
-			// note: t4_num_entries can shrink here, since the output gets pruned somewhat.
-			CUDA_CHECK_RETURN(cudaMemcpy(t4_final_data, device_t4_final_data,t4_num_entries*sizeof(T4FinalEntry),cudaMemcpyDeviceToHost));
-			CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-
-			// todo: get t4_final_data into a unique sorted list and compressed...could of course do this in GPU mem
-		}
-	}
-
-	auto t4_final_end = std::chrono::high_resolution_clock::now();
-	std::cout << "*********************" << std::endl;
-	std::cout << "T4 final time: " << std::chrono::duration_cast<std::chrono::milliseconds>(t4_final_end - t4_final_start).count() << " ms\n";
-	std::cout << "*********************" << std::endl;
-
-
-
-	if (verify_results) {
-		// technically don't need to copy t4_tags if it stays in device memory...just for verification purposes.
-		CUDA_CHECK_RETURN(cudaMemcpy(t4_tags, device_t4_tags,T4_TAG_MEM_BYTES_NEEDED,cudaMemcpyDeviceToHost));
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		std::cout << "Counting tags..." << std::endl;
-		for (uint32_t t4_batch_id = 0; t4_batch_id < BATCHES; t4_batch_id++) {
-			//std::cout << "Counting batch " << t4_batch_id << std::endl;
-			for (uint64_t t4_block_id = 0; t4_block_id < BATCHES; t4_block_id++) {
-				for (uint64_t i=0;i</*...*/;i++) {
-					if (t4_tags[/*...*/] > 0) {
-						total_t4_tagged++;
-						//std::cout << " Tagged entry t4 batch_id: " << t4_batch_id << " block:" << t4_block_id << std::endl;
-					};
-				}
-			}
-			//std::cout << "partial result: " << total_t4_tagged << std::endl;
-		}
-		std::cout << " Tagged T4 entries: " << total_t4_tagged << " should be 114437654 out of max 4563402752" << std::endl;
-	}
-}
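-
-// Added sketch (not in the original source): the correct shape of the commented-out
-// dedupe attempt above. thrust::unique returns an iterator, not a count, so the
-// surviving length is the iterator difference, and the list must be sorted first
-// since unique only removes *adjacent* duplicates. d_list and n are hypothetical
-// names for a raw device pointer and its entry count; assumes the thrust headers
-// already used above are included.
-inline uint32_t sortAndUniqueOnDevice(uint32_t *d_list, const uint32_t n) {
-	thrust::device_ptr<uint32_t> p(d_list);  // wrap raw pointer for thrust
-	thrust::sort(p, p + n);                  // sort in place on the device
-	thrust::device_ptr<uint32_t> new_end = thrust::unique(p, p + n);
-	return (uint32_t) (new_end - p);         // number of unique entries kept
-}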
-
-
-void doPhase2Pruning() {
-	char *memstore;
-
-	/*if (true) {
-		// test xs'...
-		uint32_t *xs;
-		uint32_t *kbcs;
-		CUDA_CHECK_RETURN(cudaMallocManaged(&xs, 256*sizeof(uint32_t)));   // size in bytes, not entries
-		CUDA_CHECK_RETURN(cudaMallocManaged(&kbcs, 256*sizeof(uint32_t)));
-		for (int i=0;i<256;i++) {
-			xs[i] = i;
-		}
-		std::cout << "Doing chacha single xs" << std::endl;
-		gpu_chacha8_xs_to_kbcs<<<1,256>>>(256, chacha_input, xs, kbcs);
-		CUDA_CHECK_RETURN(cudaDeviceSynchronize());
-		for (int i=0;i<256;i++) {
-			std::cout << " kbc " << i << " = " << kbcs[i] << std::endl;
-		}
-	}*/
-
-	if (true) {
-		using milli = std::chrono::milliseconds;
-		auto total_start = std::chrono::high_resolution_clock::now();
-		createT6FinalEntriesGPU(memstore);
-		auto total_end = std::chrono::high_resolution_clock::now();
-		std::cout << "*********************" << std::endl;
-		std::cout << "Total time: " << std::chrono::duration_cast<milli>(total_end - total_start).count() << " ms\n";
-		std::cout << "*********************" << std::endl;
-	}
-
-	//batch_id:30 block_id: 55 dx:6169
-	//findYsolution(memstore);
-
-	//std::cout << "Phase 2 Pruning" << std::endl;
-	//for (uint32_t batch_id = 0; batch_id < 1; batch_id++) {
-	//	readPruneToT2(batch_id,memstore);
-	//	// TODO: now that we have all data in mem for a batch, test whether getting the y will get the actual lx pairs!
-	//}
-	std::cout << "Done doPhase2Pruning." << std::endl;
-}
-
-
-#endif /* PHASE2_HPP_ */
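-
-// Hypothetical usage (not part of the original file): compiled with nvcc alongside
-// the rest of the plotter sources, the phase-2 pruning pass above would be driven by
-// something like:
-//
-//   int main() {
-//       doPhase2Pruning();
-//       return 0;
-//   }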