Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(gpu): track noise level/degree in apply lut #2007

Merged
merged 1 commit into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 6 additions & 7 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
uint64_t lut_degree, bool allocate_gpu_memory);
void scratch_cuda_apply_many_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
Expand All @@ -63,12 +63,11 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
uint32_t num_many_lut, bool allocate_gpu_memory);
void cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *output_radix_lwe,
void const *input_radix_lwe,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks);
void cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks);

void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
Expand Down
7 changes: 3 additions & 4 deletions backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ __host__ void zero_out_if(cudaStream_t const *streams,
predicate->lwe_indexes_in, params.big_lwe_dimension,
params.message_modulus, num_radix_blocks);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input, bsks,
ksks, num_radix_blocks, predicate);
}
Expand Down Expand Up @@ -68,7 +68,7 @@ __host__ void legacy_host_integer_radix_cmux_kb(
mem_false, params.big_lwe_dimension,
num_radix_blocks);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
num_radix_blocks, mem_ptr->message_extract_lut);
}
Expand Down Expand Up @@ -122,8 +122,7 @@ __host__ void host_integer_radix_cmux_kb(
mem_false);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)(lwe_array_out->ptr),
(Torus *)(added_cts->ptr), bsks, ksks, num_radix_blocks,
streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
mem_ptr->message_extract_lut);
delete mem_true;
delete mem_false;
Expand Down
22 changes: 11 additions & 11 deletions backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -148,12 +148,12 @@ __host__ void are_all_comparisons_block_true(
// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
ksks, 1, lut);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
num_chunks, lut);
}
Expand Down Expand Up @@ -219,12 +219,12 @@ __host__ void is_at_least_one_comparisons_block_true(
// Applies the LUT
if (remaining_blocks == 1) {
// In the last iteration we copy the output to the final address
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
ksks, 1, lut);
return;
} else {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
accumulator, bsks, ksks, num_chunks, lut);
}
Expand Down Expand Up @@ -305,7 +305,7 @@ __host__ void host_compare_with_zero_equality(
}
}

integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
zero_comparison);
are_all_comparisons_block_true<Torus>(streams, gpu_indexes, gpu_count,
Expand Down Expand Up @@ -371,7 +371,7 @@ __host__ void compare_radix_blocks_kb(

// Apply LUT to compare to 0
auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks, ksks,
num_radix_blocks, is_non_zero_lut);

Expand Down Expand Up @@ -422,7 +422,7 @@ __host__ void tree_sign_reduction(
pack_blocks<Torus>(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
partial_block_count, 4);

integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
partial_block_count >> 1, inner_tree_leaf);

Expand Down Expand Up @@ -468,7 +468,7 @@ __host__ void tree_sign_reduction(
last_lut->broadcast_lut(streams, gpu_indexes, 0);

// Last leaf
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
last_lut);
}
Expand Down Expand Up @@ -514,7 +514,7 @@ __host__ void host_integer_radix_difference_check_kb(

// Clean noise
auto identity_lut = mem_ptr->identity_lut;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
2 * packed_num_radix_blocks, identity_lut);

Expand Down Expand Up @@ -552,11 +552,11 @@ __host__ void host_integer_radix_difference_check_kb(
packed_left + packed_num_radix_blocks * big_lwe_size;
Torus *last_right_block_before_sign_block =
packed_right + packed_num_radix_blocks * big_lwe_size;
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
identity_lut);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks,
1, identity_lut);
Expand Down
10 changes: 5 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// Shift the mask so that we will only keep bits we should
uint32_t shifted_mask = full_message_mask >> shift_amount;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
interesting_divisor.last_block(), bsks, ksks, 1,
mem_ptr->masking_luts_1[shifted_mask]);
Expand Down Expand Up @@ -314,7 +314,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
// the estimated degree of the output is < msg_modulus
shifted_mask = shifted_mask & full_message_mask;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
divisor_ms_blocks.first_block(), bsks, ksks, 1,
mem_ptr->masking_luts_2[shifted_mask]);
Expand Down Expand Up @@ -481,7 +481,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
auto create_clean_version_of_merged_remainder =
[&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count) {
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count,
cleaned_merged_interesting_remainder.data,
cleaned_merged_interesting_remainder.data, bsks, ksks,
Expand Down Expand Up @@ -595,10 +595,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
integer_radix_apply_univariate_lookup_table_kb<Torus>(
legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
ksks, num_blocks, mem_ptr->message_extract_lut_2);
for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
Expand Down
23 changes: 10 additions & 13 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
uint64_t lut_degree, bool allocate_gpu_memory) {

int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
Expand All @@ -195,7 +195,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_radix_lut<uint64_t> **)mem_ptr,
static_cast<const uint64_t *>(input_lut), num_radix_blocks, params,
allocate_gpu_memory);
lut_degree, allocate_gpu_memory);
}

void scratch_cuda_apply_many_univariate_lut_kb_64(
Expand All @@ -219,19 +219,16 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
num_many_lut, allocate_gpu_memory);
}

void cuda_apply_univariate_lut_kb_64(void *const *streams,
uint32_t const *gpu_indexes,
uint32_t gpu_count, void *output_radix_lwe,
void const *input_radix_lwe,
int8_t *mem_ptr, void *const *ksks,
void *const *bsks, uint32_t num_blocks) {
void cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
CudaRadixCiphertextFFI *output_radix_lwe,
CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
void *const *ksks, void *const *bsks) {

host_apply_univariate_lut_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(output_radix_lwe),
static_cast<const uint64_t *>(input_radix_lwe),
(int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
num_blocks);
(cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
bsks);
}

void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
Expand Down
Loading
Loading