Commit
chore(gpu): rename "test vector" -> "luts" and "tvi" -> "lut_indexes"
agnesLeroy committed Jan 23, 2024
1 parent: 16f457b, commit: bd26d0e
Showing 13 changed files with 171 additions and 171 deletions.
8 changes: 4 additions & 4 deletions backends/tfhe-cuda-backend/cuda/include/bootstrap.h
@@ -40,15 +40,15 @@ void cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
- uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
+ uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

void cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
- uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
+ uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

void cleanup_cuda_bootstrap_amortized(cuda_stream_t *stream,
int8_t **pbs_buffer);
@@ -71,15 +71,15 @@ void cuda_bootstrap_low_latency_lwe_ciphertext_vector_32(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
- uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
+ uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

void cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
cuda_stream_t *stream, void *lwe_array_out, void *lwe_output_indexes,
void *lut_vector, void *lut_vector_indexes, void *lwe_array_in,
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t num_samples,
- uint32_t num_lut_vectors, uint32_t lwe_idx, uint32_t max_shared_memory);
+ uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory);

void cleanup_cuda_bootstrap_low_latency(cuda_stream_t *stream,
int8_t **pbs_buffer);
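Editor's aside (not part of the commit): the bootstrap.h hunks above only rename the `num_lut_vectors` parameter to `num_luts`; argument order and types are unchanged. A minimal call-site sketch under that assumption follows; the numeric parameters are illustrative placeholders, and every buffer is assumed to be a valid, already-initialized device pointer.

```cpp
#include <cstdint>

#include "bootstrap.h" // declarations shown in the hunks above

// Hedged call-site sketch: only the renamed signature comes from the header;
// the buffer arguments are assumed to be set up elsewhere, and the numeric
// parameters are example values, not recommended ones.
void run_pbs_sketch(cuda_stream_t *stream, void *lwe_array_out,
                    void *lwe_output_indexes, void *lut_vector,
                    void *lut_vector_indexes, void *lwe_array_in,
                    void *lwe_input_indexes, void *bootstrapping_key,
                    int8_t *pbs_buffer) {
  const uint32_t lwe_dimension = 742, glwe_dimension = 1,
                 polynomial_size = 2048, base_log = 23, level_count = 1,
                 num_samples = 4, lwe_idx = 0, max_shared_memory = 49152;
  const uint32_t num_luts = 1; // was num_lut_vectors before this commit
  cuda_bootstrap_low_latency_lwe_ciphertext_vector_64(
      stream, lwe_array_out, lwe_output_indexes, lut_vector,
      lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key,
      pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log,
      level_count, num_samples, num_luts, lwe_idx, max_shared_memory);
}
```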
@@ -15,7 +15,7 @@ void cuda_multi_bit_pbs_lwe_ciphertext_vector_64(
void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
- uint32_t num_samples, uint32_t num_lut_vectors, uint32_t lwe_idx,
+ uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, uint32_t chunk_size = 0);

void scratch_cuda_multi_bit_pbs_64(
100 changes: 50 additions & 50 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -199,7 +199,7 @@ void cleanup_cuda_propagate_single_carry_low_latency(cuda_stream_t *stream,
}

/*
- * generate bivariate accumulator for device pointer
+ * generate bivariate accumulator (lut) for device pointer
* v_stream - cuda stream
* acc_bivariate - device pointer for bivariate accumulator
* ...
@@ -212,7 +212,7 @@ void generate_device_accumulator_bivariate(
std::function<Torus(Torus, Torus)> f);

/*
- * generate univariate accumulator for device pointer
+ * generate univariate accumulator (lut) for device pointer
* v_stream - cuda stream
* acc - device pointer for univariate accumulator
* ...
@@ -408,7 +408,7 @@ template <typename Torus> struct int_radix_lut {
return &lut[ind * (params.glwe_dimension + 1) * params.polynomial_size];
}

- Torus *get_tvi(size_t ind) { return &lut_indexes[ind]; }
+ Torus *get_lut_indexes(size_t ind) { return &lut_indexes[ind]; }
void release(cuda_stream_t *stream) {
cuda_drop_async(lut_indexes, stream);
cuda_drop_async(lwe_indexes, stream);
@@ -437,10 +437,10 @@ template <typename Torus> struct int_sc_prop_memory {
Torus *generates_or_propagates;
Torus *step_output;

- // test_vector_array[2] = {lut_does_block_generate_carry,
+ // luts_array[2] = {lut_does_block_generate_carry,
// lut_does_block_generate_or_propagate}
- int_radix_lut<Torus> *test_vector_array;
- int_radix_lut<Torus> *lut_carry_propagation_sum;
+ int_radix_lut<Torus> *luts_array;
+ int_radix_lut<Torus> *luts_carry_propagation_sum;
int_radix_lut<Torus> *message_acc;

int_radix_params params;
@@ -461,7 +461,7 @@ template <typename Torus> struct int_sc_prop_memory {
step_output = (Torus *)cuda_malloc_async(
num_radix_blocks * big_lwe_size_bytes, stream);

- // declare functions for test vector generation
+ // declare functions for lut generation
auto f_lut_does_block_generate_carry = [message_modulus](Torus x) -> Torus {
if (x >= message_modulus)
return OUTPUT_CARRY::GENERATED;
@@ -477,7 +477,7 @@ template <typename Torus> struct int_sc_prop_memory {
return OUTPUT_CARRY::NONE;
};

- auto f_lut_carry_propagation_sum = [](Torus msb, Torus lsb) -> Torus {
+ auto f_luts_carry_propagation_sum = [](Torus msb, Torus lsb) -> Torus {
if (msb == OUTPUT_CARRY::PROPAGATED)
return lsb;
return msb;
@@ -487,18 +487,18 @@ template <typename Torus> struct int_sc_prop_memory {
return x % message_modulus;
};

- // create test vector objects
- test_vector_array = new int_radix_lut<Torus>(
+ // create lut objects
+ luts_array = new int_radix_lut<Torus>(
stream, params, 2, num_radix_blocks, allocate_gpu_memory);
- lut_carry_propagation_sum = new struct int_radix_lut<Torus>(
+ luts_carry_propagation_sum = new struct int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);
message_acc = new struct int_radix_lut<Torus>(
stream, params, 1, num_radix_blocks, allocate_gpu_memory);

- auto lut_does_block_generate_carry = test_vector_array->get_lut(0);
- auto lut_does_block_generate_or_propagate = test_vector_array->get_lut(1);
+ auto lut_does_block_generate_carry = luts_array->get_lut(0);
+ auto lut_does_block_generate_or_propagate = luts_array->get_lut(1);

- // generate test vectors
+ // generate luts (aka accumulators)
generate_device_accumulator<Torus>(
stream, lut_does_block_generate_carry, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f_lut_does_block_generate_carry);
@@ -507,12 +507,12 @@ template <typename Torus> struct int_sc_prop_memory {
polynomial_size, message_modulus, carry_modulus,
f_lut_does_block_generate_or_propagate);
cuda_set_value_async<Torus>(&(stream->stream),
- test_vector_array->get_tvi(1), 1,
+ luts_array->get_lut_indexes(1), 1,
num_radix_blocks - 1);

generate_device_accumulator_bivariate<Torus>(
- stream, lut_carry_propagation_sum->lut, glwe_dimension, polynomial_size,
- message_modulus, carry_modulus, f_lut_carry_propagation_sum);
+ stream, luts_carry_propagation_sum->lut, glwe_dimension, polynomial_size,
+ message_modulus, carry_modulus, f_luts_carry_propagation_sum);

generate_device_accumulator<Torus>(stream, message_acc->lut, glwe_dimension,
polynomial_size, message_modulus,
@@ -523,12 +523,12 @@ template <typename Torus> struct int_sc_prop_memory {
cuda_drop_async(generates_or_propagates, stream);
cuda_drop_async(step_output, stream);

- test_vector_array->release(stream);
- lut_carry_propagation_sum->release(stream);
+ luts_array->release(stream);
+ luts_carry_propagation_sum->release(stream);
message_acc->release(stream);

- delete test_vector_array;
- delete lut_carry_propagation_sum;
+ delete luts_array;
+ delete luts_carry_propagation_sum;
delete message_acc;
}
};
@@ -538,9 +538,9 @@ template <typename Torus> struct int_mul_memory {
Torus *block_mul_res;
Torus *small_lwe_vector;
Torus *lwe_pbs_out_array;
- int_radix_lut<Torus> *test_vector_array; // lsb msb
- int_radix_lut<Torus> *test_vector_message;
- int_radix_lut<Torus> *test_vector_carry;
+ int_radix_lut<Torus> *luts_array; // lsb msb
+ int_radix_lut<Torus> *luts_message;
+ int_radix_lut<Torus> *luts_carry;
int_sc_prop_memory<Torus> *scp_mem;
int_radix_params params;

@@ -583,18 +583,18 @@ template <typename Torus> struct int_mul_memory {
stream);

// create int_radix_lut objects for lsb, msb, message, carry
- // test_vector_array -> lut = {lsb_acc, msb_acc}
- test_vector_array = new int_radix_lut<Torus>(
+ // luts_array -> lut = {lsb_acc, msb_acc}
+ luts_array = new int_radix_lut<Torus>(
stream, params, 2, total_block_count, allocate_gpu_memory);
- test_vector_message = new int_radix_lut<Torus>(
- stream, params, 1, total_block_count, test_vector_array);
- test_vector_carry = new int_radix_lut<Torus>(
- stream, params, 1, total_block_count, test_vector_array);
+ luts_message = new int_radix_lut<Torus>(
+ stream, params, 1, total_block_count, luts_array);
+ luts_carry = new int_radix_lut<Torus>(
+ stream, params, 1, total_block_count, luts_array);

- auto lsb_acc = test_vector_array->get_lut(0);
- auto msb_acc = test_vector_array->get_lut(1);
- auto message_acc = test_vector_message->get_lut(0);
- auto carry_acc = test_vector_carry->get_lut(0);
+ auto lsb_acc = luts_array->get_lut(0);
+ auto msb_acc = luts_array->get_lut(1);
+ auto message_acc = luts_message->get_lut(0);
+ auto carry_acc = luts_carry->get_lut(0);

// define functions for each accumulator
auto lut_f_lsb = [message_modulus](Torus x, Torus y) -> Torus {
@@ -624,12 +624,12 @@ template <typename Torus> struct int_mul_memory {
stream, msb_acc, glwe_dimension, polynomial_size, message_modulus,
carry_modulus, lut_f_msb);

- // tvi for test_vector_array should be reinitialized
+ // lut_indexes for luts_array should be reinitialized
// first lsb_vector_block_count value should reference to lsb_acc
// last msb_vector_block_count values should reference to msb_acc
- // for message and carry default tvi is fine
+ // for message and carry default lut_indexes is fine
cuda_set_value_async<Torus>(
- &(stream->stream), test_vector_array->get_tvi(lsb_vector_block_count),
+ &(stream->stream), luts_array->get_lut_indexes(lsb_vector_block_count),
1, msb_vector_block_count);
}

@@ -639,15 +639,15 @@ template <typename Torus> struct int_mul_memory {
cuda_drop_async(small_lwe_vector, stream);
cuda_drop_async(lwe_pbs_out_array, stream);

- test_vector_array->release(stream);
- test_vector_message->release(stream);
- test_vector_carry->release(stream);
+ luts_array->release(stream);
+ luts_message->release(stream);
+ luts_carry->release(stream);

scp_mem->release(stream);

- delete test_vector_array;
- delete test_vector_message;
- delete test_vector_carry;
+ delete luts_array;
+ delete luts_message;
+ delete luts_carry;

delete scp_mem;
}
@@ -681,12 +681,12 @@ template <typename Torus> struct int_shift_buffer {
// LUT
// pregenerate lut vector and indexes
// lut for left shift
- // here we generate 'num_bits_in_block' times test_vector
+ // here we generate 'num_bits_in_block' times lut
// one for each 'shift_within_block' = 'shift' % 'num_bits_in_block'
- // even though test_vector_left contains 'num_bits_in_block' lut
- // tvi will have indexes for single lut only and those indexes will be 0
+ // even though lut_left contains 'num_bits_in_block' lut
+ // lut_indexes will have indexes for single lut only and those indexes will be 0
// it means for pbs corresponding lut should be selected and pass along
- // tvi filled with zeros
+ // lut_indexes filled with zeros

// calculate bivariate lut for each 'shift_within_block'
for (int s_w_b = 1; s_w_b < num_bits_in_block; s_w_b++) {
@@ -738,11 +738,11 @@ template <typename Torus> struct int_shift_buffer {
lut_buffers_bivariate.push_back(cur_lut_bivariate);
}

- // here we generate 'message_modulus' times test_vector
+ // here we generate 'message_modulus' times lut
// one for each 'shift'
- // tvi will have indexes for single lut only and those indexes will be 0
+ // lut_indexes will have indexes for single lut only and those indexes will be 0
// it means for pbs corresponding lut should be selected and pass along
- // tvi filled with zeros
+ // lut_indexes filled with zeros

// calculate lut for each 'shift'
for (int shift = 0; shift < params.message_modulus; shift++) {
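Editor's aside (not part of the commit): the int_mul_memory hunks above explain that luts_array holds two LUTs (lsb_acc at index 0, msb_acc at index 1) and that cuda_set_value_async rewrites the tail of its lut_indexes buffer, while the message and carry LUTs keep the default all-zero indexes. A standalone host-side model of that index layout follows; the block counts are assumed example values.

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Host-side model (not the repository's device code): each radix block
// carries an index into luts_array, 0 selecting lsb_acc and 1 selecting
// msb_acc. The buffer starts all-zero; the commit's cuda_set_value_async
// call overwrites the tail so the last msb_vector_block_count blocks use
// the msb LUT.
int main() {
  const std::size_t lsb_vector_block_count = 6; // assumed example sizes
  const std::size_t msb_vector_block_count = 4;
  const std::size_t total = lsb_vector_block_count + msb_vector_block_count;

  // Default state: every index points at LUT 0 (lsb_acc); the message and
  // carry LUTs keep this default, as the diff comments note.
  std::vector<std::uint64_t> lut_indexes(total, 0);

  // Host equivalent of:
  //   cuda_set_value_async(..., luts_array->get_lut_indexes(lsb_vector_block_count),
  //                        1, msb_vector_block_count);
  std::fill(lut_indexes.begin() + lsb_vector_block_count, lut_indexes.end(), 1);

  for (std::uint64_t v : lut_indexes)
    std::cout << v << ' '; // prints: 0 0 0 0 0 0 1 1 1 1
  std::cout << '\n';
}
```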
26 changes: 13 additions & 13 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -22,7 +22,7 @@ void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
uint32_t lwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
- uint32_t num_lut_vectors, uint32_t lwe_idx,
+ uint32_t num_luts, uint32_t lwe_idx,
uint32_t max_shared_memory, PBS_TYPE pbs_type) {
if (sizeof(Torus) == sizeof(uint32_t)) {
// 32 bits
@@ -37,15 +37,15 @@ void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
- num_lut_vectors, lwe_idx, max_shared_memory);
+ num_luts, lwe_idx, max_shared_memory);
break;
case AMORTIZED:
cuda_bootstrap_amortized_lwe_ciphertext_vector_32(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
- num_lut_vectors, lwe_idx, max_shared_memory);
+ num_luts, lwe_idx, max_shared_memory);
break;
default:
break;
@@ -59,7 +59,7 @@ void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, grouping_factor, base_log, level_count,
- input_lwe_ciphertext_count, num_lut_vectors, lwe_idx,
+ input_lwe_ciphertext_count, num_luts, lwe_idx,
max_shared_memory);
break;
case LOW_LAT:
@@ -68,15 +68,15 @@ void execute_pbs(cuda_stream_t *stream, Torus *lwe_array_out,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
- num_lut_vectors, lwe_idx, max_shared_memory);
+ num_luts, lwe_idx, max_shared_memory);
break;
case AMORTIZED:
cuda_bootstrap_amortized_lwe_ciphertext_vector_64(
stream, lwe_array_out, lwe_output_indexes, lut_vector,
lut_vector_indexes, lwe_array_in, lwe_input_indexes,
bootstrapping_key, pbs_buffer, lwe_dimension, glwe_dimension,
polynomial_size, base_log, level_count, input_lwe_ciphertext_count,
- num_lut_vectors, lwe_idx, max_shared_memory);
+ num_luts, lwe_idx, max_shared_memory);
break;
default:
break;
@@ -303,7 +303,7 @@ void generate_device_accumulator_bivariate(
generate_lookup_table_bivariate<Torus>(h_lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f);

- // copy host lut and tvi to device
+ // copy host lut and lut_indexes to device
cuda_memcpy_async_to_gpu(
acc_bivariate, h_lut,
(glwe_dimension + 1) * polynomial_size * sizeof(Torus), stream);
@@ -335,7 +335,7 @@ void generate_device_accumulator(cuda_stream_t *stream, Torus *acc,
generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
message_modulus, carry_modulus, f);

- // copy host lut and tvi to device
+ // copy host lut and lut_indexes to device
cuda_memcpy_async_to_gpu(
acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),
stream);
@@ -370,13 +370,13 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream,
auto generates_or_propagates = mem->generates_or_propagates;
auto step_output = mem->step_output;

- auto test_vector_array = mem->test_vector_array;
- auto lut_carry_propagation_sum = mem->lut_carry_propagation_sum;
+ auto luts_array = mem->luts_array;
+ auto luts_carry_propagation_sum = mem->luts_carry_propagation_sum;
auto message_acc = mem->message_acc;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
stream, generates_or_propagates, lwe_array, bsk, ksk, num_blocks,
- test_vector_array);
+ luts_array);

// compute prefix sum with hillis&steele

@@ -392,7 +392,7 @@

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
stream, cur_blocks, cur_blocks, prev_blocks, bsk, ksk, cur_total_blocks,
- lut_carry_propagation_sum);
+ luts_carry_propagation_sum);

cuda_memcpy_async_gpu_to_gpu(&generates_or_propagates[space * big_lwe_size],
cur_blocks,
@@ -414,7 +414,7 @@
/*
* input_blocks: input radix ciphertext propagation will happen inplace
* acc_message_carry: list of two luts, [(message_acc), (carry_acc)]
- * tvi_message_carry: tvi for message and carry, should always be {0, 1}
+ * lut_indexes_message_carry: lut_indexes for message and carry, should always be {0, 1}
* small_lwe_vector: output of keyswitch should have
* size = 2 * (lwe_dimension + 1) * sizeof(Torus)
* big_lwe_vector: output of pbs should have
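Editor's aside (not part of the commit): host_propagate_single_carry_low_latency above computes a Hillis & Steele prefix scan over per-block carry states, using f_luts_carry_propagation_sum (from the integer.h hunks) as the scan operator: a PROPAGATED block takes the state of the block below it, anything else keeps its own state. A standalone host-side model of that scan follows; the device version applies the same operator homomorphically through PBS on ciphertext blocks, so only the scan structure is shown here.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Host-side model (not the CUDA code) of the carry-propagation prefix sum.
// The enum values are assumptions for illustration; the operator itself is
// f_luts_carry_propagation_sum from the integer.h hunks above.
enum State : std::uint64_t { NONE = 0, GENERATED = 1, PROPAGATED = 2 };

static std::uint64_t combine(std::uint64_t msb, std::uint64_t lsb) {
  return msb == PROPAGATED ? lsb : msb; // propagated blocks inherit the lower state
}

int main() {
  // Assumed example input: per-block generate/propagate states, block 0 first.
  std::vector<std::uint64_t> s = {GENERATED, PROPAGATED, PROPAGATED, NONE,
                                  PROPAGATED};
  // Hillis & Steele inclusive scan: the stride doubles each step, so the
  // whole scan takes ceil(log2(n)) passes, as in the stream loop above.
  for (std::size_t stride = 1; stride < s.size(); stride *= 2) {
    std::vector<std::uint64_t> next = s;
    for (std::size_t i = stride; i < s.size(); i++)
      next[i] = combine(s[i], s[i - stride]);
    s = next;
  }
  // s[i] now holds the resolved carry state coming out of block i.
  for (std::uint64_t v : s)
    std::cout << v << ' '; // prints: 1 1 1 0 0
  std::cout << '\n';
}
```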
(diff for the remaining files not shown)
