From 703b010a298c78cfec155b019e345f21f9a65852 Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Fri, 24 Jan 2025 11:55:01 +0100
Subject: [PATCH] chore(gpu): add degree information to cuda int_radix_lut

---
 .../compression/compression_utilities.h      |   2 +
 .../cuda/include/integer/integer_utilities.h | 387 ++++++++++++------
 .../cuda/src/integer/comparison.cuh          |   7 +-
 .../cuda/src/integer/integer.cuh             | 125 +++---
 .../cuda/src/integer/multiplication.cuh      |   9 +-
 .../cuda/src/integer/scalar_comparison.cuh   |  21 +-
 6 files changed, 358 insertions(+), 193 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
index b3ab0acefe..646d0bfde8 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/compression/compression_utilities.h
@@ -112,6 +112,8 @@ template <typename Torus> struct int_decompression {
     generate_device_accumulator_with_encoding<Torus>(
         streams[0], gpu_indexes[0], decompression_rescale_lut->get_lut(0, 0),
+        decompression_rescale_lut->get_degree(0),
+        decompression_rescale_lut->get_max_degree(0),
         encryption_params.glwe_dimension, encryption_params.polynomial_size,
         effective_compression_message_modulus,
         effective_compression_carry_modulus,
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
index 65d63c58fa..c6a938cf60 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -10,6 +10,14 @@
 #include
 #include
 
+class NoiseLevel {
+public:
+  // Constants equivalent to the Rust code
+  static const uint64_t NOMINAL = 1;
+  static const uint64_t ZERO = 0;
+  static const uint64_t UNKNOWN = std::numeric_limits<uint64_t>::max();
+};
+
 template <typename Torus>
 __global__ void radix_blocks_rotate_right(Torus *dst, Torus *src,
                                           uint32_t value, uint32_t blocks_count,
@@ -32,19 +40,21 @@ void generate_ids_update_degrees(int *terms_degree, size_t *h_lwe_idx_in,
 template <typename Torus>
 void generate_device_accumulator_bivariate(
     cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
-    uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f);
+    uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
+    std::function<Torus(Torus, Torus)> f);
 
 template <typename Torus>
 void generate_device_accumulator_bivariate_with_factor(
     cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
-    uint32_t carry_modulus, std::function<Torus(Torus, Torus)> f, int factor);
+    uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension,
+    uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus,
+    std::function<Torus(Torus, Torus)> f, int factor);
 
 template <typename Torus>
 void generate_device_accumulator_with_encoding(
-    cudaStream_t stream, uint32_t gpu_index, Torus *acc,
-    uint32_t glwe_dimension, uint32_t polynomial_size,
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree,
+    uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t input_message_modulus, uint32_t input_carry_modulus,
     uint32_t output_message_modulus, uint32_t output_carry_modulus,
     std::function<Torus(Torus)> f);
@@ -58,7 +68,8 @@ void generate_device_accumulator_with_encoding(
  */
 template <typename Torus>
 void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
-                                 Torus *acc, uint32_t glwe_dimension,
+                                 Torus *acc, uint64_t *degree,
+                                 uint64_t *max_degree, uint32_t glwe_dimension,
                                  uint32_t polynomial_size,
                                  uint32_t message_modulus,
                                  uint32_t carry_modulus,
@@ -66,9 +77,10 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
 
 template <typename Torus>
 void generate_many_lut_device_accumulator(
-    cudaStream_t stream, uint32_t gpu_index, Torus *acc,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus,
-    uint32_t carry_modulus, std::vector<std::function<Torus(Torus)>> &f);
+    cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degrees,
+    uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t message_modulus, uint32_t carry_modulus,
+    std::vector<std::function<Torus(Torus)>> &f);
 
 struct int_radix_params {
   PBS_TYPE pbs_type;
@@ -117,6 +129,12 @@ template <typename Torus> struct int_radix_lut {
   int_radix_params params;
   uint32_t num_blocks;
   uint32_t num_luts;
+  uint32_t num_many_lut = 1;
+  // Tracks the degree of each LUT and the max degree, on the CPU.
+  // The max degree is (message_modulus * carry_modulus - 1), except for
+  // many-LUTs, where it differs.
+  uint64_t *degrees;
+  uint64_t *max_degrees;
 
   int active_gpu_count;
   bool mem_reuse = false;
@@ -125,7 +143,7 @@ template <typename Torus> struct int_radix_lut {
   // (same for tmp lwe arrays)
   std::vector<int8_t *> buffer;
 
-  // These arrays will all reside on GPU 0
+  // These arrays will reside on all GPUs
   // lut could actually be allocated & initialized GPU per GPU but this is not
   // done at the moment
   std::vector<Torus *> lut_vec;
@@ -254,6 +272,8 @@ template <typename Torus> struct int_radix_lut {
       tmp_lwe_before_ks =
           (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
     }
+    degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
+    max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
   }
 
   // constructor to reuse memory
@@ -331,6 +351,8 @@ template <typename Torus> struct int_radix_lut {
                              gpu_indexes[0]);
     memcpy(h_lwe_indexes_out, h_lwe_indexes_in,
            num_radix_blocks * sizeof(Torus));
+    degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
+    max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
   }
 
   // Construction for many luts
@@ -339,6 +361,7 @@ template <typename Torus> struct int_radix_lut {
                 uint32_t num_radix_blocks, uint32_t num_many_lut,
                 bool allocate_gpu_memory) {
+    this->num_many_lut = num_many_lut;
     this->params = params;
     this->num_blocks = num_radix_blocks;
     this->num_luts = num_luts;
@@ -438,6 +461,8 @@ template <typename Torus> struct int_radix_lut {
       tmp_lwe_before_ks =
           (Torus *)cuda_malloc_async(big_size, streams[0], gpu_indexes[0]);
     }
+    degrees = (uint64_t *)malloc(num_many_lut * num_luts * sizeof(uint64_t));
+    max_degrees = (uint64_t *)malloc(num_luts * sizeof(uint64_t));
   }
 
   // Return a pointer to idx-ith lut at gpu_index's global memory
@@ -449,6 +474,12 @@ template <typename Torus> struct int_radix_lut {
     return &lut[idx * lut_size];
   }
 
+  // Return a pointer to the idx-ith degree, which lives on the CPU
+  uint64_t *get_degree(size_t idx) { return &degrees[num_many_lut * idx]; }
+
+  // Return a pointer to the idx-ith max degree, which lives on the CPU
+  uint64_t *get_max_degree(size_t idx) { return &max_degrees[idx]; }
+
   // Return a pointer to idx-ith lut indexes at gpu_index's global memory
   Torus *get_lut_indexes(uint32_t gpu_index, size_t ind) {
     auto lut_indexes = lut_indexes_vec[gpu_index];
@@ -542,6 +573,8 @@ template <typename Torus> struct int_radix_lut {
       lwe_after_pbs_vec.clear();
       lwe_trivial_indexes_vec.clear();
     }
+    free(degrees);
+    free(max_degrees);
   }
 };
 
 template <typename Torus> struct int_bit_extract_luts_buffer {
@@ -569,9 +602,10 @@ template <typename Torus>
struct int_bit_extract_luts_buffer { }; generate_device_accumulator( - streams[0], gpu_indexes[0], lut->get_lut(0, i), - params.glwe_dimension, params.polynomial_size, - params.message_modulus, params.carry_modulus, operator_f); + streams[0], gpu_indexes[0], lut->get_lut(0, i), lut->get_degree(i), + lut->get_max_degree(i), params.glwe_dimension, + params.polynomial_size, params.message_modulus, + params.carry_modulus, operator_f); } /** @@ -768,6 +802,7 @@ template struct int_shift_and_rotate_buffer { generate_device_accumulator( streams[0], gpu_indexes[0], mux_lut->get_lut(0, 0), + mux_lut->get_degree(0), mux_lut->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, mux_lut_f); mux_lut->broadcast_lut(streams, gpu_indexes, 0); @@ -775,6 +810,7 @@ template struct int_shift_and_rotate_buffer { auto cleaning_lut_f = [](Torus x) -> Torus { return x; }; generate_device_accumulator( streams[0], gpu_indexes[0], cleaning_lut->get_lut(0, 0), + cleaning_lut->get_degree(0), cleaning_lut->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, cleaning_lut_f); cleaning_lut->broadcast_lut(streams, gpu_indexes, 0); @@ -829,17 +865,21 @@ template struct int_fullprop_buffer { // Torus *lut_buffer_message = lut->get_lut(0, 0); + uint64_t *message_degree = lut->get_degree(0); + uint64_t *message_max_degree = lut->get_max_degree(0); Torus *lut_buffer_carry = lut->get_lut(0, 1); + uint64_t *carry_degree = lut->get_degree(1); + uint64_t *carry_max_degree = lut->get_max_degree(1); generate_device_accumulator( - streams[0], gpu_indexes[0], lut_buffer_message, params.glwe_dimension, - params.polynomial_size, params.message_modulus, params.carry_modulus, - lut_f_message); + streams[0], gpu_indexes[0], lut_buffer_message, message_degree, + message_max_degree, params.glwe_dimension, params.polynomial_size, + params.message_modulus, params.carry_modulus, lut_f_message); generate_device_accumulator( - streams[0], gpu_indexes[0], lut_buffer_carry, params.glwe_dimension, - params.polynomial_size, params.message_modulus, params.carry_modulus, - lut_f_carry); + streams[0], gpu_indexes[0], lut_buffer_carry, carry_degree, + carry_max_degree, params.glwe_dimension, params.polynomial_size, + params.message_modulus, params.carry_modulus, lut_f_carry); Torus lwe_indexes_size = 2 * sizeof(Torus); Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size); @@ -956,10 +996,12 @@ template struct int_legacy_sc_prop_memory { // generate luts (aka accumulators) generate_device_accumulator( streams[0], gpu_indexes[0], lut_does_block_generate_carry, + luts_array->get_degree(0), luts_array->get_max_degree(0), glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_lut_does_block_generate_carry); generate_device_accumulator( streams[0], gpu_indexes[0], lut_does_block_generate_or_propagate, + luts_array->get_degree(1), luts_array->get_max_degree(1), glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate); cuda_set_value_async(streams[0], gpu_indexes[0], @@ -968,12 +1010,16 @@ template struct int_legacy_sc_prop_memory { generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], luts_carry_propagation_sum->get_lut(0, 0), - glwe_dimension, polynomial_size, message_modulus, carry_modulus, + luts_carry_propagation_sum->get_degree(0), + luts_carry_propagation_sum->get_max_degree(0), glwe_dimension, + polynomial_size, message_modulus, carry_modulus, 
f_luts_carry_propagation_sum); generate_device_accumulator( - streams[0], gpu_indexes[0], message_acc->get_lut(0, 0), glwe_dimension, - polynomial_size, message_modulus, carry_modulus, f_message_acc); + streams[0], gpu_indexes[0], message_acc->get_lut(0, 0), + message_acc->get_degree(0), message_acc->get_max_degree(0), + glwe_dimension, polynomial_size, message_modulus, carry_modulus, + f_message_acc); luts_array->broadcast_lut(streams, gpu_indexes, 0); luts_carry_propagation_sum->broadcast_lut(streams, gpu_indexes, 0); @@ -1071,10 +1117,12 @@ template struct int_overflowing_sub_memory { // generate luts (aka accumulators) generate_device_accumulator( streams[0], gpu_indexes[0], lut_does_block_generate_carry, + luts_array->get_degree(0), luts_array->get_max_degree(0), glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_lut_does_block_generate_carry); generate_device_accumulator( streams[0], gpu_indexes[0], lut_does_block_generate_or_propagate, + luts_array->get_degree(1), luts_array->get_max_degree(1), glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_lut_does_block_generate_or_propagate); cuda_set_value_async(streams[0], gpu_indexes[0], @@ -1083,12 +1131,16 @@ template struct int_overflowing_sub_memory { generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], luts_borrow_propagation_sum->get_lut(0, 0), - glwe_dimension, polynomial_size, message_modulus, carry_modulus, + luts_borrow_propagation_sum->get_degree(0), + luts_borrow_propagation_sum->get_max_degree(0), glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_luts_borrow_propagation_sum); generate_device_accumulator( - streams[0], gpu_indexes[0], message_acc->get_lut(0, 0), glwe_dimension, - polynomial_size, message_modulus, carry_modulus, f_message_acc); + streams[0], gpu_indexes[0], message_acc->get_lut(0, 0), + message_acc->get_degree(0), message_acc->get_max_degree(0), + glwe_dimension, polynomial_size, message_modulus, carry_modulus, + f_message_acc); luts_array->broadcast_lut(streams, gpu_indexes, 0); luts_borrow_propagation_sum->broadcast_lut(streams, gpu_indexes, 0); @@ -1256,8 +1308,10 @@ template struct int_seq_group_prop_memory { }; auto seq_lut = lut_sequential_algorithm->get_lut(0, index); generate_device_accumulator( - streams[0], gpu_indexes[0], seq_lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f_lut_sequential); + streams[0], gpu_indexes[0], seq_lut, + lut_sequential_algorithm->get_degree(0), + lut_sequential_algorithm->get_max_degree(0), glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_lut_sequential); h_seq_lut_indexes[index] = index; } Torus *seq_lut_indexes = lut_sequential_algorithm->get_lut_indexes(0, 0); @@ -1312,10 +1366,11 @@ template struct int_hs_group_prop_memory { new int_radix_lut(streams, gpu_indexes, gpu_count, params, 1, num_groups, allocate_gpu_memory); - auto hillis_steele_lut = lut_hillis_steele->get_lut(0, 0); generate_device_accumulator_bivariate( - streams[0], gpu_indexes[0], hillis_steele_lut, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, f_lut_hillis_steele); + streams[0], gpu_indexes[0], lut_hillis_steele->get_lut(0, 0), + lut_hillis_steele->get_degree(0), lut_hillis_steele->get_max_degree(0), + glwe_dimension, polynomial_size, message_modulus, carry_modulus, + f_lut_hillis_steele); lut_hillis_steele->broadcast_lut(streams, gpu_indexes, 0); }; @@ -1383,10 +1438,12 @@ template struct int_shifted_blocks_and_states_memory { f_first_block_state, f_shift_block}; auto 
first_block_lut = luts_array_first_step->get_lut(0, 0); - + auto first_block_lut_degrees = luts_array_first_step->get_degree(0); + auto first_block_lut_max_degree = luts_array_first_step->get_max_degree(0); generate_many_lut_device_accumulator( - streams[0], gpu_indexes[0], first_block_lut, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts); + streams[0], gpu_indexes[0], first_block_lut, first_block_lut_degrees, + first_block_lut_max_degree, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_first_grouping_luts); // luts for other blocks of the first grouping for (int lut_id = 1; lut_id < grouping_size; lut_id++) { @@ -1404,9 +1461,12 @@ template struct int_shifted_blocks_and_states_memory { std::vector> f_grouping_luts = { f_state, f_shift_block}; auto lut = luts_array_first_step->get_lut(0, lut_id); + auto lut_degrees = luts_array_first_step->get_degree(lut_id); + auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id); generate_many_lut_device_accumulator( - streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f_grouping_luts); + streams[0], gpu_indexes[0], lut, lut_degrees, lut_max_degree, + glwe_dimension, polynomial_size, message_modulus, carry_modulus, + f_grouping_luts); } // luts for the rest of groupings (except for the last block) @@ -1427,10 +1487,12 @@ template struct int_shifted_blocks_and_states_memory { f_state, f_shift_block}; auto lut = luts_array_first_step->get_lut(0, lut_id); - + auto lut_degrees = luts_array_first_step->get_degree(lut_id); + auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id); generate_many_lut_device_accumulator( - streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f_grouping_luts); + streams[0], gpu_indexes[0], lut, lut_degrees, lut_max_degree, + glwe_dimension, polynomial_size, message_modulus, carry_modulus, + f_grouping_luts); } // For the last block we need to generate a new lut @@ -1444,13 +1506,17 @@ template struct int_shifted_blocks_and_states_memory { uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step auto last_block_lut = luts_array_first_step->get_lut(0, lut_id); + auto last_block_lut_degrees = luts_array_first_step->get_degree(lut_id); + auto last_block_lut_max_degree = + luts_array_first_step->get_max_degree(lut_id); std::vector> f_last_grouping_luts = { f_last_block_state, f_shift_block}; generate_many_lut_device_accumulator( - streams[0], gpu_indexes[0], last_block_lut, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts); + streams[0], gpu_indexes[0], last_block_lut, last_block_lut_degrees, + last_block_lut_max_degree, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_last_grouping_luts); // Generate the indexes to switch between luts within the pbs Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); @@ -1604,10 +1670,13 @@ template struct int_prop_simu_group_carries_memory { } }; - auto lut = luts_array_second_step->get_lut(0, lut_id); generate_device_accumulator( - streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f_first_grouping_inner_propagation); + streams[0], gpu_indexes[0], + luts_array_second_step->get_lut(0, lut_id), + luts_array_second_step->get_degree(0), + luts_array_second_step->get_max_degree(0), glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_first_grouping_inner_propagation); } 
auto f_first_grouping_outer_propagation = @@ -1616,9 +1685,10 @@ template struct int_prop_simu_group_carries_memory { }; int lut_id = grouping_size - 1; - auto lut_first_group_outer = luts_array_second_step->get_lut(0, lut_id); generate_device_accumulator( - streams[0], gpu_indexes[0], lut_first_group_outer, glwe_dimension, + streams[0], gpu_indexes[0], luts_array_second_step->get_lut(0, lut_id), + luts_array_second_step->get_degree(lut_id), + luts_array_second_step->get_max_degree(lut_id), glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_first_grouping_outer_propagation); @@ -1638,10 +1708,13 @@ template struct int_prop_simu_group_carries_memory { } }; - auto lut = luts_array_second_step->get_lut(0, lut_id); generate_device_accumulator( - streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f_other_groupings_inner_propagation); + streams[0], gpu_indexes[0], + luts_array_second_step->get_lut(0, lut_id), + luts_array_second_step->get_degree(lut_id), + luts_array_second_step->get_max_degree(lut_id), glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_other_groupings_inner_propagation); } if (use_sequential_algorithm_to_resolver_group_carries) { @@ -1657,10 +1730,13 @@ template struct int_prop_simu_group_carries_memory { } }; - auto lut = luts_array_second_step->get_lut(0, lut_id); generate_device_accumulator( - streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f_group_propagation); + streams[0], gpu_indexes[0], + luts_array_second_step->get_lut(0, lut_id), + luts_array_second_step->get_degree(lut_id), + luts_array_second_step->get_max_degree(lut_id), glwe_dimension, + polynomial_size, message_modulus, carry_modulus, + f_group_propagation); } } else { uint32_t lut_id = 2 * grouping_size; @@ -1672,10 +1748,12 @@ template struct int_prop_simu_group_carries_memory { } }; - auto lut = luts_array_second_step->get_lut(0, lut_id); generate_device_accumulator( - streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f_group_propagation); + streams[0], gpu_indexes[0], + luts_array_second_step->get_lut(0, lut_id), + luts_array_second_step->get_degree(lut_id), + luts_array_second_step->get_max_degree(lut_id), glwe_dimension, + polynomial_size, message_modulus, carry_modulus, f_group_propagation); } Torus *h_second_lut_indexes = (Torus *)malloc(lut_indexes_size); @@ -1848,11 +1926,11 @@ template struct int_sc_prop_memory { return (block >> 1) % message_modulus; }; - auto extract_lut = lut_message_extract->get_lut(0, 0); - generate_device_accumulator( - streams[0], gpu_indexes[0], extract_lut, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, f_message_extract); + streams[0], gpu_indexes[0], lut_message_extract->get_lut(0, 0), + lut_message_extract->get_degree(0), + lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_message_extract); lut_message_extract->broadcast_lut(streams, gpu_indexes, 0); @@ -1906,10 +1984,10 @@ template struct int_sc_prop_memory { return output1 << 3 | output2 << 2; }; - auto overflow_flag_prep_lut = lut_overflow_flag_prep->get_lut(0, 0); - generate_device_accumulator_bivariate( - streams[0], gpu_indexes[0], overflow_flag_prep_lut, glwe_dimension, + streams[0], gpu_indexes[0], lut_overflow_flag_prep->get_lut(0, 0), + lut_overflow_flag_prep->get_degree(0), + lut_overflow_flag_prep->get_max_degree(0), glwe_dimension, polynomial_size, 
message_modulus, carry_modulus, f_overflow_fp); lut_overflow_flag_prep->broadcast_lut(streams, gpu_indexes, 0); @@ -1934,10 +2012,11 @@ template struct int_sc_prop_memory { return does_overflow_if_carry_is_0; } }; - auto overflow_flag_last = lut_message_extract->get_lut(0, 1); generate_device_accumulator( - streams[0], gpu_indexes[0], overflow_flag_last, glwe_dimension, + streams[0], gpu_indexes[0], lut_message_extract->get_lut(0, 1), + lut_message_extract->get_degree(1), + lut_message_extract->get_max_degree(1), glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_overflow_last); Torus *h_lut_indexes = @@ -1961,10 +2040,11 @@ template struct int_sc_prop_memory { auto f_carry_last = [](Torus block) -> Torus { return ((block >> 2) & 1); }; - auto carry_flag_last = lut_message_extract->get_lut(0, 1); generate_device_accumulator( - streams[0], gpu_indexes[0], carry_flag_last, glwe_dimension, + streams[0], gpu_indexes[0], lut_message_extract->get_lut(0, 1), + lut_message_extract->get_degree(1), + lut_message_extract->get_max_degree(1), glwe_dimension, polynomial_size, message_modulus, carry_modulus, f_carry_last); Torus *h_lut_indexes = @@ -2100,10 +2180,13 @@ template struct int_shifted_blocks_and_borrow_states_memory { f_first_block_state, f_shift_block}; auto first_block_lut = luts_array_first_step->get_lut(0, 0); + auto first_block_lut_degrees = luts_array_first_step->get_degree(0); + auto first_block_lut_max_degree = luts_array_first_step->get_max_degree(0); generate_many_lut_device_accumulator( - streams[0], gpu_indexes[0], first_block_lut, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, f_first_grouping_luts); + streams[0], gpu_indexes[0], first_block_lut, first_block_lut_degrees, + first_block_lut_max_degree, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_first_grouping_luts); // luts for other blocks of the first grouping for (int lut_id = 1; lut_id < grouping_size; lut_id++) { @@ -2121,9 +2204,12 @@ template struct int_shifted_blocks_and_borrow_states_memory { std::vector> f_grouping_luts = { f_state, f_shift_block}; auto lut = luts_array_first_step->get_lut(0, lut_id); + auto lut_degrees = luts_array_first_step->get_degree(lut_id); + auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id); generate_many_lut_device_accumulator( - streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f_grouping_luts); + streams[0], gpu_indexes[0], lut, lut_degrees, lut_max_degree, + glwe_dimension, polynomial_size, message_modulus, carry_modulus, + f_grouping_luts); } // luts for the rest of groupings (except for the last block) @@ -2144,10 +2230,12 @@ template struct int_shifted_blocks_and_borrow_states_memory { f_state, f_shift_block}; auto lut = luts_array_first_step->get_lut(0, lut_id); - + auto lut_degrees = luts_array_first_step->get_degree(lut_id); + auto lut_max_degree = luts_array_first_step->get_max_degree(lut_id); generate_many_lut_device_accumulator( - streams[0], gpu_indexes[0], lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f_grouping_luts); + streams[0], gpu_indexes[0], lut, lut_degrees, lut_max_degree, + glwe_dimension, polynomial_size, message_modulus, carry_modulus, + f_grouping_luts); } auto f_last_block_state = [message_modulus](Torus block) -> Torus { @@ -2160,13 +2248,17 @@ template struct int_shifted_blocks_and_borrow_states_memory { uint32_t lut_id = num_luts_first_step - 1; // The last lut of the first step auto last_block_lut = 
luts_array_first_step->get_lut(0, lut_id); + auto last_block_lut_degrees = luts_array_first_step->get_degree(lut_id); + auto last_block_lut_max_degree = + luts_array_first_step->get_max_degree(lut_id); std::vector> f_last_grouping_luts = { f_last_block_state, f_shift_block}; generate_many_lut_device_accumulator( - streams[0], gpu_indexes[0], last_block_lut, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, f_last_grouping_luts); + streams[0], gpu_indexes[0], last_block_lut, last_block_lut_degrees, + last_block_lut_max_degree, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_last_grouping_luts); // Generate the indexes to switch between luts within the pbs Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); @@ -2295,11 +2387,11 @@ template struct int_borrow_prop_memory { return (block >> 1) % message_modulus; }; - auto extract_lut = lut_message_extract->get_lut(0, 0); - generate_device_accumulator( - streams[0], gpu_indexes[0], extract_lut, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, f_message_extract); + streams[0], gpu_indexes[0], lut_message_extract->get_lut(0, 0), + lut_message_extract->get_degree(0), + lut_message_extract->get_max_degree(0), glwe_dimension, polynomial_size, + message_modulus, carry_modulus, f_message_extract); lut_message_extract->broadcast_lut(streams, gpu_indexes, 0); @@ -2312,11 +2404,11 @@ template struct int_borrow_prop_memory { return ((block >> 2) & 1); }; - auto borrow_flag_lut = lut_borrow_flag->get_lut(0, 0); - generate_device_accumulator( - streams[0], gpu_indexes[0], borrow_flag_lut, glwe_dimension, - polynomial_size, message_modulus, carry_modulus, f_borrow_flag); + streams[0], gpu_indexes[0], lut_borrow_flag->get_lut(0, 0), + lut_borrow_flag->get_degree(0), lut_borrow_flag->get_max_degree(0), + glwe_dimension, polynomial_size, message_modulus, carry_modulus, + f_borrow_flag); lut_borrow_flag->broadcast_lut(streams, gpu_indexes, 0); } @@ -2472,8 +2564,10 @@ template struct int_mul_memory { num_radix_blocks, allocate_gpu_memory); generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], zero_out_predicate_lut->get_lut(0, 0), - params.glwe_dimension, params.polynomial_size, params.message_modulus, - params.carry_modulus, zero_out_predicate_lut_f); + zero_out_predicate_lut->get_degree(0), + zero_out_predicate_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, params.carry_modulus, + zero_out_predicate_lut_f); zero_out_predicate_lut->broadcast_lut(streams, gpu_indexes, 0); zero_out_mem = new int_zero_out_if_buffer( @@ -2530,10 +2624,12 @@ template struct int_mul_memory { // generate accumulators generate_device_accumulator_bivariate( - streams[0], gpu_indexes[0], lsb_acc, glwe_dimension, polynomial_size, + streams[0], gpu_indexes[0], lsb_acc, luts_array->get_degree(0), + luts_array->get_max_degree(0), glwe_dimension, polynomial_size, message_modulus, carry_modulus, lut_f_lsb); generate_device_accumulator_bivariate( - streams[0], gpu_indexes[0], msb_acc, glwe_dimension, polynomial_size, + streams[0], gpu_indexes[0], msb_acc, luts_array->get_degree(1), + luts_array->get_max_degree(1), glwe_dimension, polynomial_size, message_modulus, carry_modulus, lut_f_msb); // lut_indexes_vec for luts_array should be reinitialized @@ -2675,8 +2771,10 @@ template struct int_logical_scalar_shift_buffer { // right shift generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], cur_lut_bivariate->get_lut(0, 0), - params.glwe_dimension, 
params.polynomial_size, - params.message_modulus, params.carry_modulus, shift_lut_f); + cur_lut_bivariate->get_degree(0), + cur_lut_bivariate->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, + params.carry_modulus, shift_lut_f); cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, 0); lut_buffers_bivariate.push_back(cur_lut_bivariate); @@ -2761,8 +2859,10 @@ template struct int_logical_scalar_shift_buffer { // right shift generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], cur_lut_bivariate->get_lut(0, 0), - params.glwe_dimension, params.polynomial_size, - params.message_modulus, params.carry_modulus, shift_lut_f); + cur_lut_bivariate->get_degree(0), + cur_lut_bivariate->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, + params.carry_modulus, shift_lut_f); cur_lut_bivariate->broadcast_lut(streams, gpu_indexes, 0); lut_buffers_bivariate.push_back(cur_lut_bivariate); @@ -2866,6 +2966,8 @@ template struct int_arithmetic_scalar_shift_buffer { generate_device_accumulator( streams[0], gpu_indexes[0], shift_last_block_lut_univariate->get_lut(0, 0), + shift_last_block_lut_univariate->get_degree(0), + shift_last_block_lut_univariate->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, last_block_lut_f); shift_last_block_lut_univariate->broadcast_lut(streams, gpu_indexes, 0); @@ -2888,9 +2990,11 @@ template struct int_arithmetic_scalar_shift_buffer { generate_device_accumulator( streams[0], gpu_indexes[0], - padding_block_lut_univariate->get_lut(0, 0), params.glwe_dimension, - params.polynomial_size, params.message_modulus, params.carry_modulus, - padding_block_lut_f); + padding_block_lut_univariate->get_lut(0, 0), + padding_block_lut_univariate->get_degree(0), + padding_block_lut_univariate->get_max_degree(0), + params.glwe_dimension, params.polynomial_size, params.message_modulus, + params.carry_modulus, padding_block_lut_f); padding_block_lut_univariate->broadcast_lut(streams, gpu_indexes, 0); lut_buffers_univariate.push_back(padding_block_lut_univariate); @@ -2928,9 +3032,11 @@ template struct int_arithmetic_scalar_shift_buffer { generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], - shift_blocks_lut_bivariate->get_lut(0, 0), params.glwe_dimension, - params.polynomial_size, params.message_modulus, - params.carry_modulus, blocks_lut_f); + shift_blocks_lut_bivariate->get_lut(0, 0), + shift_blocks_lut_bivariate->get_degree(0), + shift_blocks_lut_bivariate->get_max_degree(0), + params.glwe_dimension, params.polynomial_size, + params.message_modulus, params.carry_modulus, blocks_lut_f); shift_blocks_lut_bivariate->broadcast_lut(streams, gpu_indexes, 0); lut_buffers_bivariate.push_back(shift_blocks_lut_bivariate); @@ -3011,18 +3117,22 @@ template struct int_cmux_buffer { generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], predicate_lut->get_lut(0, 0), + predicate_lut->get_degree(0), predicate_lut->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, inverted_lut_f); generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], predicate_lut->get_lut(0, 1), + predicate_lut->get_degree(1), predicate_lut->get_max_degree(1), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_f); generate_device_accumulator( streams[0], gpu_indexes[0], message_extract_lut->get_lut(0, 0), - params.glwe_dimension, 
params.polynomial_size, params.message_modulus, - params.carry_modulus, message_extract_lut_f); + message_extract_lut->get_degree(0), + message_extract_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, params.carry_modulus, + message_extract_lut_f); Torus *h_lut_indexes = (Torus *)malloc(2 * num_radix_blocks * sizeof(Torus)); for (int index = 0; index < 2 * num_radix_blocks; index++) { @@ -3098,6 +3208,7 @@ template struct int_are_all_block_true_buffer { generate_device_accumulator( streams[0], gpu_indexes[0], is_max_value->get_lut(0, 0), + is_max_value->get_degree(0), is_max_value->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, is_max_value_f); @@ -3155,6 +3266,7 @@ template struct int_comparison_eq_buffer { generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], operator_lut->get_lut(0, 0), + operator_lut->get_degree(0), operator_lut->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, operator_f); @@ -3172,6 +3284,7 @@ template struct int_comparison_eq_buffer { generate_device_accumulator( streams[0], gpu_indexes[0], is_non_zero_lut->get_lut(0, 0), + is_non_zero_lut->get_degree(0), is_non_zero_lut->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, is_non_zero_lut_f); @@ -3187,10 +3300,10 @@ template struct int_comparison_eq_buffer { return operator_f(i, x); }; - Torus *lut = scalar_comparison_luts->get_lut(0, i); - generate_device_accumulator( - streams[0], gpu_indexes[0], lut, params.glwe_dimension, + streams[0], gpu_indexes[0], scalar_comparison_luts->get_lut(0, i), + scalar_comparison_luts->get_degree(i), + scalar_comparison_luts->get_max_degree(i), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_f); } @@ -3260,8 +3373,10 @@ template struct int_tree_sign_reduction_buffer { streams, gpu_indexes, gpu_count, params, 1, 1, allocate_gpu_memory); generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], tree_inner_leaf_lut->get_lut(0, 0), - params.glwe_dimension, params.polynomial_size, params.message_modulus, - params.carry_modulus, block_selector_f); + tree_inner_leaf_lut->get_degree(0), + tree_inner_leaf_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, params.carry_modulus, + block_selector_f); tree_inner_leaf_lut->broadcast_lut(streams, gpu_indexes, 0); } @@ -3432,6 +3547,7 @@ template struct int_comparison_buffer { generate_device_accumulator( streams[0], gpu_indexes[0], identity_lut->get_lut(0, 0), + identity_lut->get_degree(0), identity_lut->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, identity_lut_f); @@ -3448,6 +3564,7 @@ template struct int_comparison_buffer { generate_device_accumulator( streams[0], gpu_indexes[0], is_zero_lut->get_lut(0, 0), + is_zero_lut->get_degree(0), is_zero_lut->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, is_zero_f); @@ -3526,6 +3643,7 @@ template struct int_comparison_buffer { generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], signed_lut->get_lut(0, 0), + signed_lut->get_degree(0), signed_lut->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, signed_lut_f); @@ -3704,6 +3822,7 @@ template struct 
unsigned_int_div_rem_memory { for (int j = 0; j < 2; j++) { generate_device_accumulator( streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0), + luts[j]->get_degree(0), luts[j]->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_f_masking); luts[j]->broadcast_lut(streams, gpu_indexes, 0); @@ -3728,6 +3847,7 @@ template struct unsigned_int_div_rem_memory { for (int j = 0; j < 2; j++) { generate_device_accumulator( streams[0], gpu_indexes[0], luts[j]->get_lut(0, 0), + luts[j]->get_degree(0), luts[j]->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_f_message_extract); luts[j]->broadcast_lut(streams, gpu_indexes, 0); @@ -3759,6 +3879,8 @@ template struct unsigned_int_div_rem_memory { generate_device_accumulator_bivariate_with_factor( streams[0], gpu_indexes[0], zero_out_if_overflow_did_not_happen[0]->get_lut(0, 0), + zero_out_if_overflow_did_not_happen[0]->get_degree(0), + zero_out_if_overflow_did_not_happen[0]->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, cur_lut_f, 2); zero_out_if_overflow_did_not_happen[0]->broadcast_lut(streams, gpu_indexes, @@ -3766,6 +3888,8 @@ template struct unsigned_int_div_rem_memory { generate_device_accumulator_bivariate_with_factor( streams[0], gpu_indexes[0], zero_out_if_overflow_did_not_happen[1]->get_lut(0, 0), + zero_out_if_overflow_did_not_happen[1]->get_degree(0), + zero_out_if_overflow_did_not_happen[1]->get_max_degree(0), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, cur_lut_f, 3); zero_out_if_overflow_did_not_happen[1]->broadcast_lut(streams, gpu_indexes, @@ -3788,15 +3912,19 @@ template struct unsigned_int_div_rem_memory { generate_device_accumulator_bivariate_with_factor( streams[0], gpu_indexes[0], - zero_out_if_overflow_happened[0]->get_lut(0, 0), params.glwe_dimension, - params.polynomial_size, params.message_modulus, params.carry_modulus, - overflow_happened_f, 2); + zero_out_if_overflow_happened[0]->get_lut(0, 0), + zero_out_if_overflow_happened[0]->get_degree(0), + zero_out_if_overflow_happened[0]->get_max_degree(0), + params.glwe_dimension, params.polynomial_size, params.message_modulus, + params.carry_modulus, overflow_happened_f, 2); zero_out_if_overflow_happened[0]->broadcast_lut(streams, gpu_indexes, 0); generate_device_accumulator_bivariate_with_factor( streams[0], gpu_indexes[0], - zero_out_if_overflow_happened[1]->get_lut(0, 0), params.glwe_dimension, - params.polynomial_size, params.message_modulus, params.carry_modulus, - overflow_happened_f, 3); + zero_out_if_overflow_happened[1]->get_lut(0, 0), + zero_out_if_overflow_happened[1]->get_degree(0), + zero_out_if_overflow_happened[1]->get_max_degree(0), + params.glwe_dimension, params.polynomial_size, params.message_modulus, + params.carry_modulus, overflow_happened_f, 3); zero_out_if_overflow_happened[1]->broadcast_lut(streams, gpu_indexes, 0); // merge_overflow_flags_luts @@ -3811,9 +3939,11 @@ template struct unsigned_int_div_rem_memory { generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], - merge_overflow_flags_luts[i]->get_lut(0, 0), params.glwe_dimension, - params.polynomial_size, params.message_modulus, params.carry_modulus, - lut_f_bit); + merge_overflow_flags_luts[i]->get_lut(0, 0), + merge_overflow_flags_luts[i]->get_degree(0), + merge_overflow_flags_luts[i]->get_max_degree(0), + params.glwe_dimension, params.polynomial_size, 
params.message_modulus, + params.carry_modulus, lut_f_bit); merge_overflow_flags_luts[i]->broadcast_lut(streams, gpu_indexes, 0); } } @@ -4128,9 +4258,11 @@ template struct int_last_block_inner_propagate_memory { generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], - last_block_inner_propagation_lut->get_lut(0, 0), params.glwe_dimension, - params.polynomial_size, message_modulus, params.carry_modulus, - f_last_block_inner_propagation_lut); + last_block_inner_propagation_lut->get_lut(0, 0), + last_block_inner_propagation_lut->get_degree(0), + last_block_inner_propagation_lut->get_max_degree(0), + params.glwe_dimension, params.polynomial_size, message_modulus, + params.carry_modulus, f_last_block_inner_propagation_lut); last_block_inner_propagation_lut->broadcast_lut(streams, gpu_indexes, 0); } @@ -4189,8 +4321,10 @@ template struct int_resolve_signed_overflow_memory { generate_device_accumulator( streams[0], gpu_indexes[0], resolve_overflow_lut->get_lut(0, 0), - params.glwe_dimension, params.polynomial_size, message_modulus, - params.carry_modulus, f_resolve_overflow_lut); + resolve_overflow_lut->get_degree(0), + resolve_overflow_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, message_modulus, params.carry_modulus, + f_resolve_overflow_lut); resolve_overflow_lut->broadcast_lut(streams, gpu_indexes, 0); } @@ -4234,9 +4368,10 @@ template struct int_bitop_buffer { }; generate_device_accumulator_bivariate( - streams[0], gpu_indexes[0], lut->get_lut(0, 0), - params.glwe_dimension, params.polynomial_size, - params.message_modulus, params.carry_modulus, lut_bivariate_f); + streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0), + lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, + params.carry_modulus, lut_bivariate_f); lut->broadcast_lut(streams, gpu_indexes, 0); } break; @@ -4247,7 +4382,6 @@ template struct int_bitop_buffer { allocate_gpu_memory); for (int i = 0; i < params.message_modulus; i++) { - auto lut_block = lut->get_lut(0, i); auto rhs = i; auto lut_univariate_scalar_f = [op, rhs](Torus x) -> Torus { @@ -4263,7 +4397,8 @@ template struct int_bitop_buffer { } }; generate_device_accumulator( - streams[0], gpu_indexes[0], lut_block, params.glwe_dimension, + streams[0], gpu_indexes[0], lut->get_lut(0, i), lut->get_degree(i), + lut->get_max_degree(i), params.glwe_dimension, params.polynomial_size, params.message_modulus, params.carry_modulus, lut_univariate_scalar_f); lut->broadcast_lut(streams, gpu_indexes, 0); @@ -4517,8 +4652,10 @@ template struct int_div_rem_memory { generate_device_accumulator_bivariate( streams[0], gpu_indexes[0], compare_signed_bits_lut->get_lut(0, 0), - params.glwe_dimension, params.polynomial_size, params.message_modulus, - params.carry_modulus, f_compare_extracted_signed_bits); + compare_signed_bits_lut->get_degree(0), + compare_signed_bits_lut->get_max_degree(0), params.glwe_dimension, + params.polynomial_size, params.message_modulus, params.carry_modulus, + f_compare_extracted_signed_bits); compare_signed_bits_lut->broadcast_lut(streams, gpu_indexes, 0); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index ed311f6389..bad342274c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ -122,7 +122,9 @@ __host__ void are_all_comparisons_block_true( }; generate_device_accumulator( streams[0], 
gpu_indexes[0], is_max_value_lut->get_lut(0, 1),
-            glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+            is_max_value_lut->get_degree(1),
+            is_max_value_lut->get_max_degree(1), glwe_dimension,
+            polynomial_size, message_modulus, carry_modulus,
             is_equal_to_num_blocks_lut_f);
 
         Torus *h_lut_indexes = (Torus *)malloc(num_chunks * sizeof(Torus));
@@ -460,7 +462,8 @@ __host__ void tree_sign_reduction(
     f = sign_handler_f;
   }
   generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0), glwe_dimension,
+      streams[0], gpu_indexes[0], last_lut->get_lut(0, 0),
+      last_lut->get_degree(0), last_lut->get_max_degree(0), glwe_dimension,
       polynomial_size, message_modulus, carry_modulus, f);
 
   last_lut->broadcast_lut(streams, gpu_indexes, 0);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index e85877211b..7140a683a2 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -644,13 +644,11 @@ void rotate_left(Torus *buffer, int mid, uint32_t array_length) {
 /// scaling is done in the output space, as there are more bits in the output
 /// space, the delta is smaller hence the apparent "division" happening.
 template <typename Torus>
-void generate_lookup_table_with_encoding(Torus *acc, uint32_t glwe_dimension,
-                                         uint32_t polynomial_size,
-                                         uint32_t input_message_modulus,
-                                         uint32_t input_carry_modulus,
-                                         uint32_t output_message_modulus,
-                                         uint32_t output_carry_modulus,
-                                         std::function<Torus(Torus)> f) {
+uint64_t generate_lookup_table_with_encoding(
+    Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t input_message_modulus, uint32_t input_carry_modulus,
+    uint32_t output_message_modulus, uint32_t output_carry_modulus,
+    std::function<Torus(Torus)> f) {
 
   uint32_t input_modulus_sup = input_message_modulus * input_carry_modulus;
   uint32_t output_modulus_sup = output_message_modulus * output_carry_modulus;
@@ -662,12 +660,14 @@ void generate_lookup_table_with_encoding(Torus *acc, uint32_t glwe_dimension,
   memset(acc, 0, glwe_dimension * polynomial_size * sizeof(Torus));
 
   auto body = &acc[glwe_dimension * polynomial_size];
+  uint64_t degree = 0;
 
   // This accumulator extracts the carry bits
   for (int i = 0; i < input_modulus_sup; i++) {
     int index = i * box_size;
     for (int j = index; j < index + box_size; j++) {
       auto f_eval = f(i);
+      degree = max(degree, f_eval);
       body[j] = f_eval * output_delta;
     }
   }
@@ -680,22 +680,23 @@ void generate_lookup_table_with_encoding(Torus *acc, uint32_t glwe_dimension,
   }
 
   rotate_left(body, half_box_size, polynomial_size);
+  return degree;
 }
 
 template <typename Torus>
-void generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
-                           uint32_t polynomial_size, uint32_t message_modulus,
-                           uint32_t carry_modulus,
-                           std::function<Torus(Torus)> f) {
-  generate_lookup_table_with_encoding(acc, glwe_dimension, polynomial_size,
-                                      message_modulus, carry_modulus,
-                                      message_modulus, carry_modulus, f);
+uint64_t generate_lookup_table(Torus *acc, uint32_t glwe_dimension,
+                               uint32_t polynomial_size,
+                               uint32_t message_modulus, uint32_t carry_modulus,
+                               std::function<Torus(Torus)> f) {
+  return generate_lookup_table_with_encoding(
+      acc, glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+      message_modulus, carry_modulus, f);
 }
 
 template <typename Torus>
-void generate_many_lookup_table(
-    Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t message_modulus, uint32_t carry_modulus,
+uint64_t generate_many_lookup_table(
+    Torus *acc, uint64_t *degrees, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus, std::vector> &functions) { uint32_t modulus_sup = message_modulus * carry_modulus; @@ -713,6 +714,10 @@ void generate_many_lookup_table( // Space used for each sub lut uint32_t single_function_sub_lut_size = (modulus_sup / fn_counts) * box_size; + uint64_t max_degree = (modulus_sup / fn_counts - 1); + for (int f = 0; f < fn_counts; f++) { + degrees[f] = 0; + } // This accumulator extracts the carry bits for (int f = 0; f < fn_counts; f++) { @@ -721,6 +726,7 @@ void generate_many_lookup_table( int index = i * box_size + lut_offset; for (int j = index; j < index + box_size; j++) { auto f_eval = functions[f](i); + degrees[f] = max(f_eval, degrees[f]); body[j] = f_eval * delta; } } @@ -733,14 +739,15 @@ void generate_many_lookup_table( } rotate_left(body, half_box_size, polynomial_size); + return max_degree; } template -void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension, - uint32_t polynomial_size, - uint32_t message_modulus, - uint32_t carry_modulus, - std::function f) { +uint64_t generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension, + uint32_t polynomial_size, + uint32_t message_modulus, + uint32_t carry_modulus, + std::function f) { Torus factor_u64 = message_modulus; auto wrapped_f = [factor_u64, message_modulus, f](Torus input) -> Torus { @@ -750,12 +757,13 @@ void generate_lookup_table_bivariate(Torus *acc, uint32_t glwe_dimension, return f(lhs, rhs); }; - generate_lookup_table(acc, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, wrapped_f); + return generate_lookup_table(acc, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, + wrapped_f); } template -void generate_lookup_table_bivariate_with_factor( +uint64_t generate_lookup_table_bivariate_with_factor( Torus *acc, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus, std::function f, int factor) { @@ -768,8 +776,9 @@ void generate_lookup_table_bivariate_with_factor( return f(lhs, rhs); }; - generate_lookup_table(acc, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, wrapped_f); + return generate_lookup_table(acc, glwe_dimension, polynomial_size, + message_modulus, carry_modulus, + wrapped_f); } /* @@ -782,16 +791,18 @@ void generate_lookup_table_bivariate_with_factor( template void generate_device_accumulator_bivariate( cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, - uint32_t carry_modulus, std::function f) { + uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus, + std::function f) { // host lut Torus *h_lut = (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); - + *max_degree = message_modulus * carry_modulus - 1; // fill bivariate accumulator - generate_lookup_table_bivariate(h_lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, f); + *degree = generate_lookup_table_bivariate( + h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, + f); // copy host lut and lut_indexes_vec to device cuda_memcpy_async_to_gpu(acc_bivariate, h_lut, @@ -813,15 +824,17 @@ void generate_device_accumulator_bivariate( template void generate_device_accumulator_bivariate_with_factor( cudaStream_t stream, uint32_t gpu_index, Torus *acc_bivariate, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t 
message_modulus, - uint32_t carry_modulus, std::function f, int factor) { + uint64_t *degree, uint64_t *max_degree, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus, + std::function f, int factor) { // host lut Torus *h_lut = (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); + *max_degree = message_modulus * carry_modulus - 1; // fill bivariate accumulator - generate_lookup_table_bivariate_with_factor( + *degree = generate_lookup_table_bivariate_with_factor( h_lut, glwe_dimension, polynomial_size, message_modulus, carry_modulus, f, factor); @@ -838,8 +851,8 @@ void generate_device_accumulator_bivariate_with_factor( template void generate_device_accumulator_with_encoding( - cudaStream_t stream, uint32_t gpu_index, Torus *acc, - uint32_t glwe_dimension, uint32_t polynomial_size, + cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degree, + uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t input_message_modulus, uint32_t input_carry_modulus, uint32_t output_message_modulus, uint32_t output_carry_modulus, std::function f) { @@ -848,8 +861,9 @@ void generate_device_accumulator_with_encoding( Torus *h_lut = (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); + *max_degree = input_message_modulus * input_carry_modulus - 1; // fill accumulator - generate_lookup_table_with_encoding( + *degree = generate_lookup_table_with_encoding( h_lut, glwe_dimension, polynomial_size, input_message_modulus, input_carry_modulus, output_message_modulus, output_carry_modulus, f); @@ -871,15 +885,17 @@ void generate_device_accumulator_with_encoding( */ template void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, - Torus *acc, uint32_t glwe_dimension, + Torus *acc, uint64_t *degree, + uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, uint32_t carry_modulus, std::function f) { generate_device_accumulator_with_encoding( - stream, gpu_index, acc, glwe_dimension, polynomial_size, message_modulus, - carry_modulus, message_modulus, carry_modulus, f); + stream, gpu_index, acc, degree, max_degree, glwe_dimension, + polynomial_size, message_modulus, carry_modulus, message_modulus, + carry_modulus, f); } /* @@ -891,9 +907,9 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index, */ template void generate_many_lut_device_accumulator( - cudaStream_t stream, uint32_t gpu_index, Torus *acc, - uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t message_modulus, - uint32_t carry_modulus, + cudaStream_t stream, uint32_t gpu_index, Torus *acc, uint64_t *degrees, + uint64_t *max_degree, uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t message_modulus, uint32_t carry_modulus, std::vector> &functions) { // host lut @@ -901,8 +917,9 @@ void generate_many_lut_device_accumulator( (Torus *)malloc((glwe_dimension + 1) * polynomial_size * sizeof(Torus)); // fill accumulator - generate_many_lookup_table(h_lut, glwe_dimension, polynomial_size, - message_modulus, carry_modulus, functions); + *max_degree = generate_many_lookup_table( + h_lut, degrees, glwe_dimension, polynomial_size, message_modulus, + carry_modulus, functions); // copy host lut and lut_indexes_vec to device cuda_memcpy_async_to_gpu( @@ -1455,9 +1472,9 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes, if (num_sign_blocks > 2) { auto lut = diff_buffer->reduce_signs_lut; generate_device_accumulator( - 
streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus,
-        reduce_two_orderings_function);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
+        lut->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, reduce_two_orderings_function);
 
     lut->broadcast_lut(streams, gpu_indexes, 0);
 
     while (num_sign_blocks > 2) {
@@ -1489,8 +1506,9 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
 
     auto lut = diff_buffer->reduce_signs_lut;
     generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, final_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
+        lut->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, final_lut_f);
     lut->broadcast_lut(streams, gpu_indexes, 0);
 
     pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
@@ -1508,8 +1526,9 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
 
     auto lut = mem_ptr->diff_buffer->reduce_signs_lut;
     generate_device_accumulator<Torus>(
-        streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-        polynomial_size, message_modulus, carry_modulus, final_lut_f);
+        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
+        lut->get_max_degree(0), glwe_dimension, polynomial_size,
+        message_modulus, carry_modulus, final_lut_f);
     lut->broadcast_lut(streams, gpu_indexes, 0);
 
     integer_radix_apply_univariate_lookup_table_kb<Torus>(
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index a828f70b62..1344625f28 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -280,10 +280,13 @@ __host__ void host_integer_partial_sum_ciphertexts_vec_kb(
 
   // generate accumulators
   generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], message_acc, glwe_dimension, polynomial_size,
-      message_modulus, carry_modulus, lut_f_message);
+      streams[0], gpu_indexes[0], message_acc,
+      luts_message_carry->get_degree(0), luts_message_carry->get_max_degree(0),
+      glwe_dimension, polynomial_size, message_modulus, carry_modulus,
+      lut_f_message);
   generate_device_accumulator<Torus>(
-      streams[0], gpu_indexes[0], carry_acc, glwe_dimension, polynomial_size,
+      streams[0], gpu_indexes[0], carry_acc, luts_message_carry->get_degree(1),
+      luts_message_carry->get_max_degree(1), glwe_dimension, polynomial_size,
       message_modulus, carry_modulus, lut_f_carry);
 
   luts_message_carry->broadcast_lut(streams, gpu_indexes, 0);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
index e70489bf2a..e8e2bbe3fe 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -110,10 +110,10 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
       };
 
       auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
-      generate_device_accumulator<Torus>(streams[0], gpu_indexes[0],
-                                         lut->get_lut(0, 0), glwe_dimension,
-                                         polynomial_size, message_modulus,
-                                         carry_modulus, scalar_last_leaf_lut_f);
+      generate_device_accumulator<Torus>(
+          streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
+          lut->get_max_degree(0), glwe_dimension, polynomial_size,
+          message_modulus, carry_modulus, scalar_last_leaf_lut_f);
       lut->broadcast_lut(streams, gpu_indexes, 0);
 
       integer_radix_apply_univariate_lookup_table_kb<Torus>(
@@ -195,9 +195,9 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
 
       auto lut = diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
       generate_device_accumulator_bivariate<Torus>(
-          streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-          polynomial_size, message_modulus, carry_modulus,
-          scalar_bivariate_last_leaf_lut_f);
+          streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
+          lut->get_max_degree(0), glwe_dimension, polynomial_size,
+          message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
       lut->broadcast_lut(streams, gpu_indexes, 0);
 
       integer_radix_apply_bivariate_lookup_table_kb<Torus>(
@@ -331,9 +331,9 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
 
       auto lut = mem_ptr->diff_buffer->tree_buffer->tree_last_leaf_scalar_lut;
       generate_device_accumulator_bivariate<Torus>(
-          streams[0], gpu_indexes[0], lut->get_lut(0, 0), glwe_dimension,
-          polynomial_size, message_modulus, carry_modulus,
-          scalar_bivariate_last_leaf_lut_f);
+          streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
+          lut->get_max_degree(0), glwe_dimension, polynomial_size,
+          message_modulus, carry_modulus, scalar_bivariate_last_leaf_lut_f);
       lut->broadcast_lut(streams, gpu_indexes, 0);
 
       integer_radix_apply_bivariate_lookup_table_kb<Torus>(
@@ -426,6 +426,7 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
       auto signed_msb_lut = mem_ptr->signed_msb_lut;
       generate_device_accumulator_bivariate<Torus>(
           msb_streams[0], gpu_indexes[0], signed_msb_lut->get_lut(0, 0),
+          signed_msb_lut->get_degree(0), signed_msb_lut->get_max_degree(0),
           params.glwe_dimension, params.polynomial_size, params.message_modulus,
           params.carry_modulus, lut_f);
       signed_msb_lut->broadcast_lut(streams, gpu_indexes, 0);
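
Editor's note (illustrative, not part of the patch): with this change, every LUT generated on the GPU also records, on the host, the degree actually reached by its function and the maximum degree its encoding allows. A minimal usage sketch follows, assuming the uint64_t instantiation used throughout the backend, an already-constructed int_radix_lut named `lut`, and the usual `streams`/`gpu_indexes`/`params` variables; the lambda `halve_msg` is hypothetical:

    // Fill lut 0 of `lut` and record its output degree on the host.
    auto halve_msg = [params](uint64_t x) -> uint64_t {
      return (x / 2) % params.message_modulus;
    };
    generate_device_accumulator<uint64_t>(
        streams[0], gpu_indexes[0], lut->get_lut(0, 0), lut->get_degree(0),
        lut->get_max_degree(0), params.glwe_dimension, params.polynomial_size,
        params.message_modulus, params.carry_modulus, halve_msg);

    // get_degree(idx) points at degrees[num_many_lut * idx], so for a
    // many-LUT the per-function degrees of lut idx are laid out contiguously
    // after this pointer.
    uint64_t out_degree = *lut->get_degree(0);      // here: message_modulus - 1
    uint64_t max_degree = *lut->get_max_degree(0);  // message_modulus * carry_modulus - 1

Downstream code can compare these host-side values (possibly together with the new NoiseLevel constants) to decide, for instance, whether a carry-propagation step can be skipped after applying a LUT.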