diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
index f101c381f8..93fcadaa0e 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -54,7 +54,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
     uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
     uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
+    uint64_t lut_degree, bool allocate_gpu_memory);
 
 void scratch_cuda_apply_many_univariate_lut_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
     int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
@@ -63,12 +63,11 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
     uint32_t grouping_factor, uint32_t num_radix_blocks,
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
     uint32_t num_many_lut, bool allocate_gpu_memory);
-void cuda_apply_univariate_lut_kb_64(void *const *streams,
-                                     uint32_t const *gpu_indexes,
-                                     uint32_t gpu_count, void *output_radix_lwe,
-                                     void const *input_radix_lwe,
-                                     int8_t *mem_ptr, void *const *ksks,
-                                     void *const *bsks, uint32_t num_blocks);
+void cuda_apply_univariate_lut_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *output_radix_lwe,
+    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks);
 
 void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
                                              uint32_t const *gpu_indexes,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
index b2783460c8..c5b7083528 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -24,7 +24,7 @@ __host__ void zero_out_if(cudaStream_t const *streams,
                             predicate->lwe_indexes_in, params.big_lwe_dimension,
                             params.message_modulus, num_radix_blocks);
 
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_out, tmp_lwe_array_input,
       bsks, ksks, num_radix_blocks, predicate);
 }
@@ -68,7 +68,7 @@ __host__ void legacy_host_integer_radix_cmux_kb(
                     mem_false, params.big_lwe_dimension, num_radix_blocks);
 
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
       num_radix_blocks, mem_ptr->message_extract_lut);
 }
@@ -122,8 +122,7 @@ __host__ void host_integer_radix_cmux_kb(
                                  mem_false);
 
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
-      streams, gpu_indexes, gpu_count, (Torus *)(lwe_array_out->ptr),
-      (Torus *)(added_cts->ptr), bsks, ksks, num_radix_blocks,
+      streams, gpu_indexes, gpu_count, lwe_array_out, added_cts, bsks, ksks,
       mem_ptr->message_extract_lut);
 
   delete mem_true;
   delete mem_false;
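The cmux changes above show the migration pattern used throughout this patch: call sites still working on raw `Torus *` buffers move to the renamed `legacy_` entry point, while the `CudaRadixCiphertextFFI`-based overload takes over for ported code. The sketch below is plain Rust with a hypothetical `RadixFfiView` stand-in (not the real generated bindings); it illustrates why a view carrying `num_radix_blocks` and per-block metadata makes the old separate `num_blocks` argument redundant and lets the backend validate shapes, as the new PANIC checks in integer.cuh do.

use std::ffi::c_void;

// Stand-in for the FFI view: device pointer plus per-block metadata.
#[repr(C)]
pub struct RadixFfiView {
    pub ptr: *mut c_void,       // device pointer to the packed LWE blocks
    pub degrees: *mut u64,      // one degree per radix block
    pub noise_levels: *mut u64, // one noise level per radix block
    pub num_radix_blocks: u32,
    pub lwe_dimension: u32,
}

// With the block count inside the view, shape checks need no extra argument.
fn check_same_shape(out: &RadixFfiView, input: &RadixFfiView) -> Result<(), String> {
    if out.num_radix_blocks != input.num_radix_blocks {
        return Err("input and output should have the same number of blocks".into());
    }
    if out.lwe_dimension != input.lwe_dimension {
        return Err("input and output should have the same lwe dimension".into());
    }
    Ok(())
}

fn main() {
    let mut degrees = vec![3_u64; 4];
    let mut noise_levels = vec![1_u64; 4];
    let view = RadixFfiView {
        ptr: std::ptr::null_mut(),
        degrees: degrees.as_mut_ptr(),
        noise_levels: noise_levels.as_mut_ptr(),
        num_radix_blocks: 4,
        lwe_dimension: 2048,
    };
    assert!(check_same_shape(&view, &view).is_ok());
}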
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
index bad342274c..b8151e4526 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh
@@ -148,12 +148,12 @@ __host__ void are_all_comparisons_block_true(
     // Applies the LUT
     if (remaining_blocks == 1) {
       // In the last iteration we copy the output to the final address
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
           streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
           ksks, 1, lut);
       return;
     } else {
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, tmp_out, accumulator, bsks, ksks,
          num_chunks, lut);
     }
@@ -219,12 +219,12 @@ __host__ void is_at_least_one_comparisons_block_true(
     // Applies the LUT
     if (remaining_blocks == 1) {
       // In the last iteration we copy the output to the final address
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, lwe_array_out, accumulator, bsks,
          ksks, 1, lut);
       return;
     } else {
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, mem_ptr->tmp_lwe_array_out,
          accumulator, bsks, ksks, num_chunks, lut);
     }
@@ -305,7 +305,7 @@ __host__ void host_compare_with_zero_equality(
       }
     }
 
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, sum, sum, bsks, ksks, num_sum_blocks,
       zero_comparison);
 
   are_all_comparisons_block_true(streams, gpu_indexes, gpu_count,
@@ -371,7 +371,7 @@ __host__ void compare_radix_blocks_kb(
   // Apply LUT to compare to 0
   auto is_non_zero_lut = mem_ptr->eq_buffer->is_non_zero_lut;
 
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_out, bsks,
       ksks, num_radix_blocks, is_non_zero_lut);
 
@@ -422,7 +422,7 @@ __host__ void tree_sign_reduction(
     pack_blocks(streams[0], gpu_indexes[0], y, x, big_lwe_dimension,
                 partial_block_count, 4);
 
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, x, y, bsks, ksks,
        partial_block_count >> 1, inner_tree_leaf);
 
@@ -468,7 +468,7 @@ __host__ void tree_sign_reduction(
   last_lut->broadcast_lut(streams, gpu_indexes, 0);
 
   // Last leaf
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, y, bsks, ksks, 1,
      last_lut);
 }
@@ -514,7 +514,7 @@ __host__ void host_integer_radix_difference_check_kb(
     // Clean noise
     auto identity_lut = mem_ptr->identity_lut;
 
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, packed_left, packed_left, bsks, ksks,
        2 * packed_num_radix_blocks, identity_lut);
 
@@ -552,11 +552,11 @@ __host__ void host_integer_radix_difference_check_kb(
         packed_left + packed_num_radix_blocks * big_lwe_size;
     Torus *last_right_block_before_sign_block =
         packed_right + packed_num_radix_blocks * big_lwe_size;
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, last_left_block_before_sign_block,
        lwe_array_left + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
        identity_lut);
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, last_right_block_before_sign_block,
        lwe_array_right + (num_radix_blocks - 2) * big_lwe_size, bsks, ksks, 1,
        identity_lut);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
index b080c5edf7..6f0c271a36 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -285,7 +285,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
       // Shift the mask so that we will only keep bits we should
       uint32_t shifted_mask = full_message_mask >> shift_amount;
 
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, interesting_divisor.last_block(),
          interesting_divisor.last_block(), bsks, ksks, 1,
          mem_ptr->masking_luts_1[shifted_mask]);
@@ -314,7 +314,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
       // the estimated degree of the output is < msg_modulus
       shifted_mask = shifted_mask & full_message_mask;
 
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, divisor_ms_blocks.first_block(),
          divisor_ms_blocks.first_block(), bsks, ksks, 1,
          mem_ptr->masking_luts_2[shifted_mask]);
@@ -481,7 +481,7 @@ __host__ void host_unsigned_integer_div_rem_kb(
     auto create_clean_version_of_merged_remainder =
         [&](cudaStream_t const *streams, uint32_t const *gpu_indexes,
             uint32_t gpu_count) {
-          integer_radix_apply_univariate_lookup_table_kb<Torus>(
+          legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
              streams, gpu_indexes, gpu_count,
              cleaned_merged_interesting_remainder.data,
              cleaned_merged_interesting_remainder.data, bsks, ksks,
@@ -595,10 +595,10 @@ __host__ void host_unsigned_integer_div_rem_kb(
   for (uint j = 0; j < gpu_count; j++) {
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
      bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
      mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient,
      bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2);
   for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
index 50de77a464..9157c64929 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -184,7 +184,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
     uint32_t ks_base_log, uint32_t pbs_level, uint32_t pbs_base_log,
     uint32_t grouping_factor, uint32_t num_radix_blocks,
     uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
+    uint64_t lut_degree, bool allocate_gpu_memory) {
 
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           glwe_dimension * polynomial_size, lwe_dimension,
@@ -195,7 +195,7 @@ void scratch_cuda_apply_univariate_lut_kb_64(
       (cudaStream_t *)(streams), gpu_indexes, gpu_count,
       (int_radix_lut<uint64_t> **)mem_ptr,
       static_cast<uint64_t const *>(input_lut), num_radix_blocks, params,
-      allocate_gpu_memory);
+      lut_degree, allocate_gpu_memory);
 }
 
 void scratch_cuda_apply_many_univariate_lut_kb_64(
@@ -219,19 +219,16 @@ void scratch_cuda_apply_many_univariate_lut_kb_64(
       num_many_lut, allocate_gpu_memory);
 }
 
-void cuda_apply_univariate_lut_kb_64(void *const *streams,
-                                     uint32_t const *gpu_indexes,
-                                     uint32_t gpu_count, void *output_radix_lwe,
-                                     void const *input_radix_lwe,
-                                     int8_t *mem_ptr, void *const *ksks,
-                                     void *const *bsks, uint32_t num_blocks) {
+void cuda_apply_univariate_lut_kb_64(
+    void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
+    CudaRadixCiphertextFFI *output_radix_lwe,
+    CudaRadixCiphertextFFI const *input_radix_lwe, int8_t *mem_ptr,
+    void *const *ksks, void *const *bsks) {
 
   host_apply_univariate_lut_kb<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(output_radix_lwe),
-      static_cast<uint64_t const *>(input_radix_lwe),
-      (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks), bsks,
-      num_blocks);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, output_radix_lwe,
+      input_radix_lwe, (int_radix_lut<uint64_t> *)mem_ptr, (uint64_t **)(ksks),
+      bsks);
 }
 
 void cleanup_cuda_apply_univariate_lut_kb_64(void *const *streams,
                                              uint32_t const *gpu_indexes,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 7140a683a2..4bf269acd2 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -355,6 +355,111 @@ __host__ void pack_bivariate_blocks_with_single_block(
 
 template <typename Torus>
 __host__ void integer_radix_apply_univariate_lookup_table_kb(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array_out,
+    CudaRadixCiphertextFFI const *lwe_array_in, void *const *bsks,
+    Torus *const *ksks, int_radix_lut<Torus> *lut) {
+  // apply_lookup_table
+  auto params = lut->params;
+  auto pbs_type = params.pbs_type;
+  auto big_lwe_dimension = params.big_lwe_dimension;
+  auto small_lwe_dimension = params.small_lwe_dimension;
+  auto ks_level = params.ks_level;
+  auto ks_base_log = params.ks_base_log;
+  auto pbs_level = params.pbs_level;
+  auto pbs_base_log = params.pbs_base_log;
+  auto glwe_dimension = params.glwe_dimension;
+  auto polynomial_size = params.polynomial_size;
+  auto grouping_factor = params.grouping_factor;
+
+  if (lwe_array_out->num_radix_blocks != lwe_array_in->num_radix_blocks)
+    PANIC("Cuda error: input and output radix ciphertexts should have the same "
+          "number of blocks")
+  if (lwe_array_out->lwe_dimension != lwe_array_in->lwe_dimension)
+    PANIC("Cuda error: input and output radix ciphertexts should have the same "
+          "lwe dimension")
+
+  // In the case of extracting a single LWE these parameters are dummy
+  uint32_t num_many_lut = 1;
+  uint32_t lut_stride = 0;
+  uint32_t num_radix_blocks = lwe_array_in->num_radix_blocks;
+  /// For multi GPU execution we create vectors of pointers for inputs and
+  /// outputs
+  std::vector<Torus *> lwe_array_in_vec = lut->lwe_array_in_vec;
+  std::vector<Torus *> lwe_after_ks_vec = lut->lwe_after_ks_vec;
+  std::vector<Torus *> lwe_after_pbs_vec = lut->lwe_after_pbs_vec;
+  std::vector<Torus *> lwe_trivial_indexes_vec = lut->lwe_trivial_indexes_vec;
+
+  auto active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
+  if (active_gpu_count == 1) {
+    execute_keyswitch_async<Torus>(
+        streams, gpu_indexes, 1, lwe_after_ks_vec[0],
+        lwe_trivial_indexes_vec[0], (Torus *)lwe_array_in->ptr,
+        lut->lwe_indexes_in, ksks, big_lwe_dimension, small_lwe_dimension,
+        ks_base_log, ks_level, num_radix_blocks);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, 1, (Torus *)lwe_array_out->ptr,
+        lut->lwe_indexes_out, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer,
+        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
+        lut_stride);
+  } else {
+    /// Make sure all data that should be on GPU 0 is indeed there
+    cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+
+    /// With multiple GPUs we push to the vectors on each GPU then when we
+    /// gather data to GPU 0 we can copy back to the original indexing
+    multi_gpu_scatter_lwe_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_array_in_vec,
+        (Torus *)lwe_array_in->ptr, lut->h_lwe_indexes_in,
+        lut->using_trivial_lwe_indexes, num_radix_blocks,
+        big_lwe_dimension + 1);
+
+    /// Apply KS to go from a big LWE dimension to a small LWE dimension
+    execute_keyswitch_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                   lwe_after_ks_vec, lwe_trivial_indexes_vec,
+                                   lwe_array_in_vec, lwe_trivial_indexes_vec,
+                                   ksks, big_lwe_dimension,
+                                   small_lwe_dimension, ks_base_log, ks_level,
+                                   num_radix_blocks);
+
+    /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE
+    /// dimension to a big LWE dimension
+    execute_pbs_async<Torus>(
+        streams, gpu_indexes, active_gpu_count, lwe_after_pbs_vec,
+        lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
+        lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
+        glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type, num_many_lut,
+        lut_stride);
+
+    /// Copy data back to GPU 0 and release vecs
+    multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
+                                      (Torus *)lwe_array_out->ptr,
+                                      lwe_after_pbs_vec,
+                                      lut->h_lwe_indexes_out,
+                                      lut->using_trivial_lwe_indexes,
+                                      num_radix_blocks, big_lwe_dimension + 1);
+
+    /// Synchronize all GPUs
+    for (uint i = 0; i < active_gpu_count; i++) {
+      cuda_synchronize_stream(streams[i], gpu_indexes[i]);
+    }
+  }
+  uint32_t lut_indexes[lut->num_blocks];
+  cuda_memcpy_async_to_cpu(&lut_indexes, lut->get_lut_indexes(0, 0),
+                           lut->num_blocks * sizeof(uint32_t), streams[0],
+                           gpu_indexes[0]);
+  for (uint i = 0; i < lwe_array_out->num_radix_blocks; i++) {
+    lwe_array_out->degrees[i] = lut->degrees[lut_indexes[i]];
+    lwe_array_out->noise_levels[i] = NoiseLevel::NOMINAL;
+  }
+}
+
+template <typename Torus>
+__host__ void legacy_integer_radix_apply_univariate_lookup_table_kb(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, Torus *lwe_array_out, Torus const *lwe_array_in,
     void *const *bsks, Torus *const *ksks, uint32_t num_radix_blocks,
@@ -1023,7 +1128,7 @@ void host_resolve_group_carries_sequentially(
       // Apply the lut
       auto luts_sequential = mem->lut_sequential_algorithm;
 
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
           streams, gpu_indexes, gpu_count,
           group_resolved_carries + big_lwe_size,
          group_resolved_carries + big_lwe_size, bsks, ksks, blocks_to_solve,
@@ -1105,7 +1210,7 @@ void host_compute_propagation_simulators_and_group_carries(
                 num_radix_blocks, big_lwe_size, group_size);
 
   auto luts_array_second_step = mem->luts_array_second_step;
 
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, propagation_cum_sums,
       propagation_cum_sums, bsks, ksks, num_radix_blocks,
       luts_array_second_step);
@@ -1214,7 +1319,7 @@ void host_propagate_single_sub_borrow(cudaStream_t const *streams,
   auto luts_carry_propagation_sum = mem->luts_borrow_propagation_sum;
   auto message_acc = mem->message_acc;
 
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array,
       bsks, ksks, num_blocks, luts_array);
 
@@ -1237,7 +1342,7 @@ void host_propagate_single_sub_borrow(cudaStream_t const *streams,
                 step_output, glwe_dimension * polynomial_size, num_blocks);
 
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, lwe_array, lwe_array, bsks, ksks,
       num_blocks, message_acc);
 }
@@ -1432,7 +1537,7 @@ __host__ void extract_n_bits(cudaStream_t const *streams,
                              uint32_t num_radix_blocks,
                              uint32_t bits_per_block,
                              int_bit_extract_luts_buffer<Torus> *bit_extract) {
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_in, bsks, ksks,
      num_radix_blocks * bits_per_block, bit_extract->lut);
 }
@@ -1480,7 +1585,7 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
   while (num_sign_blocks > 2) {
     pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
                 big_lwe_dimension, num_sign_blocks, 4);
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, signs_a, signs_b, bsks, ksks,
        num_sign_blocks / 2, lut);
 
@@ -1513,7 +1618,7 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
     pack_blocks(streams[0], gpu_indexes[0], signs_b, signs_a,
                 big_lwe_dimension, 2, 4);
 
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, signs_array_out, signs_b, bsks, ksks,
        1, lut);
 
@@ -1531,7 +1636,7 @@ reduce_signs(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                 message_modulus, carry_modulus, final_lut_f);
     lut->broadcast_lut(streams, gpu_indexes, 0);
 
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
        streams, gpu_indexes, gpu_count, signs_array_out, signs_a, bsks, ksks,
        1, lut);
   }
@@ -1541,7 +1646,7 @@ template <typename Torus>
 void scratch_cuda_apply_univariate_lut_kb(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, int_radix_lut<Torus> **mem_ptr, Torus const *input_lut,
-    uint32_t num_radix_blocks, int_radix_params params,
+    uint32_t num_radix_blocks, int_radix_params params, uint64_t lut_degree,
     bool allocate_gpu_memory) {
 
   *mem_ptr = new int_radix_lut<Torus>(streams, gpu_indexes, gpu_count, params,
@@ -1552,20 +1657,22 @@ void scratch_cuda_apply_univariate_lut_kb(
       (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus),
       streams[0], gpu_indexes[0]);
 
+  *(*mem_ptr)->get_degree(0) = lut_degree;
   (*mem_ptr)->broadcast_lut(streams, gpu_indexes, 0);
 }
 
 template <typename Torus>
 void host_apply_univariate_lut_kb(cudaStream_t const *streams,
                                   uint32_t const *gpu_indexes,
-                                  uint32_t gpu_count, Torus *radix_lwe_out,
-                                  Torus const *radix_lwe_in,
+                                  uint32_t gpu_count,
+                                  CudaRadixCiphertextFFI *radix_lwe_out,
+                                  CudaRadixCiphertextFFI const *radix_lwe_in,
                                   int_radix_lut<Torus> *mem, Torus *const *ksks,
-                                  void *const *bsks, uint32_t num_blocks) {
+                                  void *const *bsks) {
 
   integer_radix_apply_univariate_lookup_table_kb<Torus>(
       streams, gpu_indexes, gpu_count, radix_lwe_out, radix_lwe_in, bsks, ksks,
-      num_blocks, mem);
+      mem);
 }
 
 template <typename Torus>
@@ -1717,7 +1824,7 @@ void host_propagate_single_carry(cudaStream_t const *streams,
     cuda_memcpy_async_gpu_to_gpu(
         prepared_blocks + num_radix_blocks * big_lwe_size, output_flag,
         big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, mem->output_flag, prepared_blocks,
         bsks, ksks, num_radix_blocks + 1, mem->lut_message_extract);
 
@@ -1729,7 +1836,7 @@ void host_propagate_single_carry(cudaStream_t const *streams,
                       big_lwe_size_bytes, streams[0], gpu_indexes[0]);
   } else {
     auto message_extract = mem->lut_message_extract;
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array, prepared_blocks, bsks,
         ksks, num_radix_blocks, message_extract);
   }
@@ -1838,7 +1945,7 @@ void host_add_and_propagate_single_carry(
     cuda_memcpy_async_gpu_to_gpu(
         prepared_blocks + num_radix_blocks * big_lwe_size, output_flag,
         big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, mem->output_flag, prepared_blocks,
         bsks, ksks, num_radix_blocks + 1, mem->lut_message_extract);
 
@@ -1849,7 +1956,7 @@ void host_add_and_propagate_single_carry(
         carry_out, mem->output_flag + num_radix_blocks * big_lwe_size,
         big_lwe_size_bytes, streams[0], gpu_indexes[0]);
   } else {
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, lhs_array, prepared_blocks, bsks,
         ksks, num_radix_blocks, mem->lut_message_extract);
   }
@@ -1954,7 +2061,7 @@ void host_single_borrow_propagate(
   if (compute_overflow == outputFlag::FLAG_OVERFLOW) {
     auto borrow_flag = mem->lut_borrow_flag;
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
         mem->sub_streams_1, gpu_indexes, gpu_count, overflow_block,
         mem->overflow_block, bsks, ksks, 1, borrow_flag);
   }
@@ -1972,7 +2079,7 @@ void host_single_borrow_propagate(
                       resolved_borrows, num_radix_blocks, big_lwe_size,
                       mem->group_size);
 
   auto message_extract = mem->lut_message_extract;
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
       mem->sub_streams_2, gpu_indexes, gpu_count, lhsrhs_array,
       prepared_blocks, bsks, ksks, num_radix_blocks, message_extract);
 
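The tail of the new `integer_radix_apply_univariate_lookup_table_kb` is where degree tracking happens on the GPU path: the per-block `lut_indexes` are copied back to the host, each output block inherits the degree of the LUT that was applied to it, and the noise level is reset to `NOMINAL`. A self-contained Rust model of that bookkeeping (names and the `NOMINAL = 1` encoding are illustrative assumptions, not the library's values):

// One degree per LUT; `lut_indexes[i]` says which LUT block i went through.
const NOMINAL: u64 = 1;

struct Block {
    degree: u64,
    noise_level: u64,
}

fn update_block_metadata(blocks: &mut [Block], lut_indexes: &[u32], lut_degrees: &[u64]) {
    for (block, &lut_idx) in blocks.iter_mut().zip(lut_indexes) {
        // Same rule as the C++ loop: degree comes from the applied LUT,
        // noise is reset because a PBS just ran.
        block.degree = lut_degrees[lut_idx as usize];
        block.noise_level = NOMINAL;
    }
}

fn main() {
    // Two LUTs with degrees 3 and 1; blocks 0 and 2 use LUT 0, block 1 uses LUT 1.
    let lut_degrees = [3_u64, 1];
    let lut_indexes = [0_u32, 1, 0];
    let mut blocks: Vec<Block> = (0..3).map(|_| Block { degree: 15, noise_level: 7 }).collect();
    update_block_metadata(&mut blocks, &lut_indexes, &lut_degrees);
    assert_eq!(blocks.iter().map(|b| b.degree).collect::<Vec<_>>(), vec![3, 1, 3]);
}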
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
index 0303f42fb6..24673f4bde 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh
@@ -36,7 +36,7 @@ __host__ void host_integer_radix_scalar_bitop_kb(
                       gpu_indexes[0]);
     lut->broadcast_lut(streams, gpu_indexes, 0);
 
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
         streams, gpu_indexes, gpu_count, lwe_array_out, lwe_array_input, bsks,
         ksks, num_clear_blocks, lut);
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
index e8e2bbe3fe..64abddb79f 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -43,7 +43,7 @@ __host__ void scalar_compare_radix_blocks_kb(
   // Apply LUT to compare to 0
   auto sign_lut = mem_ptr->eq_buffer->is_non_zero_lut;
 
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_array_out, subtracted_blocks, bsks,
      ksks, num_radix_blocks, sign_lut);
 
@@ -116,7 +116,7 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
           message_modulus, carry_modulus, scalar_last_leaf_lut_f);
       lut->broadcast_lut(streams, gpu_indexes, 0);
 
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
           streams, gpu_indexes, gpu_count, lwe_array_out,
           mem_ptr->tmp_lwe_array_out, bsks, ksks, 1, lut);
 
@@ -686,7 +686,7 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
                              lsb_streams[0], gpu_indexes[0]);
     scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
 
-    integer_radix_apply_univariate_lookup_table_kb<Torus>(
+    legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
        bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
   }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
index 510ce0967a..b232c30d2c 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -213,7 +213,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
       }
       auto lut_univariate_padding_block =
           mem->lut_buffers_univariate[num_bits_in_block - 1];
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
          mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
          last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
       // Replace blocks 'pulled' from the left with the correct padding
@@ -227,7 +227,7 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
       if (shift_within_block != 0) {
         auto lut_univariate_shift_last_block =
             mem->lut_buffers_univariate[shift_within_block - 1];
-        integer_radix_apply_univariate_lookup_table_kb<Torus>(
+        legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
            mem->local_streams_2, gpu_indexes, gpu_count, last_block,
            last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
       }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
index 5f770ec2b4..f10937640f 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -151,7 +151,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
       // we have
       // control_bit|b|a
 
-      integer_radix_apply_univariate_lookup_table_kb<Torus>(
+      legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
          streams, gpu_indexes, gpu_count, input_bits_a, mux_inputs, bsks,
          ksks, total_nb_bits, mux_lut);
     }
@@ -189,7 +189,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
   // To give back a clean ciphertext
   auto cleaning_lut = mem->cleaning_lut;
 
-  integer_radix_apply_univariate_lookup_table_kb<Torus>(
+  legacy_integer_radix_apply_univariate_lookup_table_kb<Torus>(
      streams, gpu_indexes, gpu_count, lwe_last_out, lwe_last_out, bsks, ksks,
      num_radix_blocks, cleaning_lut);
 }
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
index 1a805c497f..dd58cf9afa 100644
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -185,6 +185,7 @@ unsafe extern "C" {
         message_modulus: u32,
         carry_modulus: u32,
         pbs_type: PBS_TYPE,
+        lut_degree: u64,
         allocate_gpu_memory: bool,
     );
 }
@@ -216,12 +217,11 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        output_radix_lwe: *mut ffi::c_void,
-        input_radix_lwe: *const ffi::c_void,
+        output_radix_lwe: *mut CudaRadixCiphertextFFI,
+        input_radix_lwe: *const CudaRadixCiphertextFFI,
         mem_ptr: *mut i8,
         ksks: *const *mut ffi::c_void,
         bsks: *const *mut ffi::c_void,
-        num_blocks: u32,
     );
 }
 unsafe extern "C" {
diff --git a/tfhe/src/core_crypto/gpu/slice.rs b/tfhe/src/core_crypto/gpu/slice.rs
index 668c608f29..98d388148c 100644
--- a/tfhe/src/core_crypto/gpu/slice.rs
+++ b/tfhe/src/core_crypto/gpu/slice.rs
@@ -7,18 +7,18 @@ use tfhe_cuda_backend::cuda_bind::{cuda_memcpy_async_gpu_to_gpu, cuda_memcpy_asy
 
 #[derive(Debug, Clone)]
 pub struct CudaSlice<'a, T: Numeric> {
-    ptrs: Vec<*const c_void>,
-    _lengths: Vec<usize>,
-    gpu_indexes: Vec<GpuIndex>,
+    pub(crate) ptrs: Vec<*const c_void>,
+    pub(crate) _lengths: Vec<usize>,
+    pub(crate) gpu_indexes: Vec<GpuIndex>,
     _phantom_1: PhantomData<T>,
     _phantom_2: PhantomData<&'a ()>,
 }
 
 #[derive(Debug)]
 pub struct CudaSliceMut<'a, T: Numeric> {
-    ptrs: Vec<*mut c_void>,
-    lengths: Vec<usize>,
-    gpu_indexes: Vec<GpuIndex>,
+    pub(crate) ptrs: Vec<*mut c_void>,
+    pub(crate) lengths: Vec<usize>,
+    pub(crate) gpu_indexes: Vec<GpuIndex>,
     _phantom_1: PhantomData<T>,
     _phantom_2: PhantomData<&'a mut ()>,
 }
diff --git a/tfhe/src/integer/gpu/ciphertext/info.rs b/tfhe/src/integer/gpu/ciphertext/info.rs
index 613d8d61ed..866abc1f64 100644
--- a/tfhe/src/integer/gpu/ciphertext/info.rs
+++ b/tfhe/src/integer/gpu/ciphertext/info.rs
@@ -529,22 +529,6 @@ impl CudaRadixCiphertextInfo {
         }
     }
 
-    pub(crate) fn after_aggregate_one_hot_vector(&self) -> Self {
-        Self {
-            blocks: self
-                .blocks
-                .iter()
-                .map(|left| CudaBlockInfo {
-                    degree: Degree::new(left.message_modulus.0 - 1),
-                    message_modulus: left.message_modulus,
-                    carry_modulus: left.carry_modulus,
-                    pbs_order: left.pbs_order,
-                    noise_level: NoiseLevel::NOMINAL,
-                })
-                .collect(),
-        }
-    }
-
     pub(crate) fn after_ne(&self) -> Self {
         Self {
             blocks: self
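slice.rs only relaxes field visibility to `pub(crate)`: the new `prepare_cuda_radix_ffi_from_slice{,_mut}` helpers in `integer/gpu/mod.rs` (next file) need direct access to `ptrs` to wrap a device slice in a `CudaRadixCiphertextFFI` view. A simplified sketch of that pattern, with a hypothetical `RadixFfiView` type standing in for the generated binding: the caller owns the `degrees`/`noise_levels` buffers and the view only lends out raw pointers into them, so the vectors must stay alive across the FFI call.

use std::ffi::c_void;

#[repr(C)]
struct RadixFfiView {
    ptr: *mut c_void,
    degrees: *mut u64,
    noise_levels: *mut u64,
    num_radix_blocks: u32,
    lwe_dimension: u32,
}

// Mirrors the helper shape: borrow caller-owned metadata buffers into a view.
fn view_from_parts(
    ptr: *mut c_void,
    degrees: &mut Vec<u64>,
    noise_levels: &mut Vec<u64>,
    num_radix_blocks: u32,
    lwe_dimension: u32,
) -> RadixFfiView {
    RadixFfiView {
        ptr,
        degrees: degrees.as_mut_ptr(),
        noise_levels: noise_levels.as_mut_ptr(),
        num_radix_blocks,
        lwe_dimension,
    }
}

fn main() {
    let num_blocks = 4_u32;
    // Caller-side allocation, as done before each apply_univariate_lut_kb_async call.
    let mut degrees = vec![0_u64; num_blocks as usize];
    let mut noise_levels = vec![0_u64; num_blocks as usize];
    let view =
        view_from_parts(std::ptr::null_mut(), &mut degrees, &mut noise_levels, num_blocks, 2048);
    assert_eq!(view.num_radix_blocks, 4);
}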
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index 9588c742b3..4e326f7e26 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -10,15 +10,14 @@ use crate::core_crypto::prelude::{
     DecompositionBaseLog, DecompositionLevelCount, GlweDimension, LweBskGroupingFactor,
     LweDimension, Numeric, PolynomialSize, UnsignedInteger,
 };
-use crate::integer::{ClientKey, RadixClientKey};
-use crate::shortint::{CarryModulus, MessageModulus};
-pub use server_key::CudaServerKey;
-use std::cmp::min;
-
 use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock;
 use crate::integer::gpu::ciphertext::CudaRadixCiphertext;
 use crate::integer::server_key::radix_parallel::OutputFlag;
+use crate::integer::{ClientKey, RadixClientKey};
 use crate::shortint::ciphertext::{Degree, NoiseLevel};
+use crate::shortint::{CarryModulus, MessageModulus};
+pub use server_key::CudaServerKey;
+use std::cmp::min;
 use tfhe_cuda_backend::bindings::*;
 use tfhe_cuda_backend::cuda_bind::*;
 
@@ -78,18 +77,50 @@ fn prepare_cuda_radix_ffi(
     }
 }
 
+fn prepare_cuda_radix_ffi_from_slice<T: Numeric>(
+    input: &CudaSlice<T>,
+    degrees_vec: &mut Vec<u64>,
+    noise_levels_vec: &mut Vec<u64>,
+    num_radix_blocks: u32,
+    lwe_dimension: u32,
+) -> CudaRadixCiphertextFFI {
+    CudaRadixCiphertextFFI {
+        ptr: input.ptrs[0].cast_mut(),
+        degrees: degrees_vec.as_mut_ptr(),
+        noise_levels: noise_levels_vec.as_mut_ptr(),
+        num_radix_blocks,
+        lwe_dimension,
+    }
+}
+
+fn prepare_cuda_radix_ffi_from_slice_mut<T: Numeric>(
+    input: &CudaSliceMut<T>,
+    degrees_vec: &mut Vec<u64>,
+    noise_levels_vec: &mut Vec<u64>,
+    num_radix_blocks: u32,
+    lwe_dimension: u32,
+) -> CudaRadixCiphertextFFI {
+    CudaRadixCiphertextFFI {
+        ptr: input.ptrs[0],
+        degrees: degrees_vec.as_mut_ptr(),
+        noise_levels: noise_levels_vec.as_mut_ptr(),
+        num_radix_blocks,
+        lwe_dimension,
+    }
+}
+
 unsafe fn update_noise_degree(
-    radix_lwe_left: &mut CudaRadixCiphertext,
-    radix_lwe_left_data: &CudaRadixCiphertextFFI,
+    radix_ct: &mut CudaRadixCiphertext,
+    cuda_ffi_radix_ct: &CudaRadixCiphertextFFI,
 ) {
-    radix_lwe_left
+    radix_ct
         .info
         .blocks
         .iter_mut()
         .enumerate()
         .for_each(|(i, b)| {
-            b.degree = Degree(*radix_lwe_left_data.degrees.wrapping_add(i));
-            b.noise_level = NoiseLevel(*radix_lwe_left_data.noise_levels.wrapping_add(i));
+            b.degree = Degree(*cuda_ffi_radix_ct.degrees.wrapping_add(i));
+            b.noise_level = NoiseLevel(*cuda_ffi_radix_ct.noise_levels.wrapping_add(i));
         });
 }
 
 pub fn gen_keys_gpu<P>(parameters_set: P, streams: &CudaStreams) -> (ClientKey, CudaServerKey)
@@ -552,8 +583,7 @@ pub unsafe fn unchecked_add_integer_radix_assign_async(
         .iter()
         .map(|b| b.noise_level.0)
         .collect();
-    // Remove prepare_data function
-    let mut radix_lwe_left_data = prepare_cuda_radix_ffi(
+    let mut cuda_ffi_radix_lwe_left = prepare_cuda_radix_ffi(
         radix_lwe_left,
         &mut radix_lwe_left_degrees,
         &mut radix_lwe_left_noise_levels,
@@ -576,7 +606,7 @@ pub unsafe fn unchecked_add_integer_radix_assign_async(
         .iter()
         .map(|b| b.noise_level.0)
         .collect();
-    let radix_lwe_right_data = prepare_cuda_radix_ffi(
+    let cuda_ffi_radix_lwe_right = prepare_cuda_radix_ffi(
         radix_lwe_right,
         &mut radix_lwe_right_degrees,
         &mut radix_lwe_right_noise_levels,
     );
@@ -584,11 +614,11 @@ pub unsafe fn unchecked_add_integer_radix_assign_async(
     cuda_add_lwe_ciphertext_vector_64(
         streams.ptr[0],
         streams.gpu_indexes[0].0,
-        &mut radix_lwe_left_data,
-        &radix_lwe_left_data,
-        &radix_lwe_right_data,
+        &mut cuda_ffi_radix_lwe_left,
+        &cuda_ffi_radix_lwe_left,
+        &cuda_ffi_radix_lwe_right,
     );
-    update_noise_degree(radix_lwe_left, &radix_lwe_left_data);
+    update_noise_degree(radix_lwe_left, &cuda_ffi_radix_lwe_left);
 }
 
 #[allow(clippy::too_many_arguments)]
@@ -2232,7 +2262,7 @@ pub unsafe fn unchecked_cmux_integer_radix_kb_async<
             .collect::<Vec<u32>>()
             .as_ptr(),
         streams.len() as u32,
-        &mut radix_lwe_out_data,
-        &condition_data,
-        &radix_lwe_true_data,
-        &radix_lwe_false_data,
+        &mut cuda_ffi_radix_lwe_out,
+        &cuda_ffi_condition,
+        &cuda_ffi_radix_lwe_true,
+        &cuda_ffi_radix_lwe_false,
         mem_ptr,
         bootstrapping_key.ptr.as_ptr(),
         keyswitch_key.ptr.as_ptr(),
@@ -2657,9 +2687,12 @@ pub unsafe fn unchecked_partial_sum_ciphertexts_integer_radix_kb_assign_async<
 /// is required
 pub unsafe fn apply_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
     streams: &CudaStreams,
-    radix_lwe_output: &mut CudaSliceMut<T>,
-    radix_lwe_input: &CudaSlice<T>,
+    output: &mut CudaSliceMut<T>,
+    output_degrees: &mut Vec<u64>,
+    output_noise_levels: &mut Vec<u64>,
+    input: &CudaSlice<T>,
     input_lut: &[T],
+    lut_degree: u64,
     bootstrapping_key: &CudaVec<B>,
     keyswitch_key: &CudaVec<T>,
     lwe_dimension: LweDimension,
@@ -2677,12 +2710,12 @@ pub unsafe fn apply_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
 ) {
     assert_eq!(
         streams.gpu_indexes[0],
-        radix_lwe_input.gpu_index(0),
+        input.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
         streams.gpu_indexes[0],
-        radix_lwe_output.gpu_index(0),
+        output.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
@@ -2696,6 +2729,20 @@ pub unsafe fn apply_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
         "GPU error: all data should reside on the same GPU."
     );
     let mut mem_ptr: *mut i8 = std::ptr::null_mut();
+    let mut cuda_ffi_output = prepare_cuda_radix_ffi_from_slice_mut(
+        output,
+        output_degrees,
+        output_noise_levels,
+        num_blocks,
+        (glwe_dimension.0 * polynomial_size.0) as u32,
+    );
+    let cuda_ffi_input = prepare_cuda_radix_ffi_from_slice(
+        input,
+        output_degrees,
+        output_noise_levels,
+        num_blocks,
+        (glwe_dimension.0 * polynomial_size.0) as u32,
+    );
     scratch_cuda_apply_univariate_lut_kb_64(
         streams.ptr.as_ptr(),
         streams
@@ -2719,6 +2766,7 @@ pub unsafe fn apply_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
         message_modulus.0 as u32,
         carry_modulus.0 as u32,
         pbs_type as u32,
+        lut_degree,
         true,
     );
     cuda_apply_univariate_lut_kb_64(
@@ -2730,12 +2778,11 @@ pub unsafe fn apply_univariate_lut_kb_async<T: UnsignedInteger, B: Numeric>(
             .collect::<Vec<u32>>()
             .as_ptr(),
         streams.len() as u32,
-        radix_lwe_output.as_mut_c_ptr(0),
-        radix_lwe_input.as_c_ptr(0),
+        &mut cuda_ffi_output,
+        &cuda_ffi_input,
         mem_ptr,
         keyswitch_key.ptr.as_ptr(),
         bootstrapping_key.ptr.as_ptr(),
-        num_blocks,
     );
     cleanup_cuda_apply_univariate_lut_kb_64(
         streams.ptr.as_ptr(),
@@ -3327,7 +3374,7 @@ pub unsafe fn unchecked_signed_abs_radix_kb_assign_async<
             .collect::<Vec<u32>>()
             .as_ptr(),
         streams.len() as u32,
-        &mut ct_data,
+        &mut cuda_ffi_ct,
         mem_ptr,
         true,
         bootstrapping_key.ptr.as_ptr(),
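The updated `apply_univariate_lut_kb_async` establishes a caller contract that the remaining files (ilog2.rs, radix/mod.rs, vector_find.rs) all follow: allocate one `u64` per block for degrees and noise levels, thread the buffers through the call together with `lut_degree`, then fold the results back into the ciphertext's block info. A minimal, self-contained model of that write-back step (mock stand-ins throughout; `1` plays the role of the nominal noise level):

#[derive(Clone, Copy, Debug, PartialEq)]
struct BlockInfo {
    degree: u64,
    noise_level: u64,
}

// Stand-in for the GPU LUT application: fills the caller's scratch buffers.
fn apply_lut_mock(lut_degree: u64, degrees: &mut [u64], noise_levels: &mut [u64]) {
    degrees.fill(lut_degree);
    noise_levels.fill(1); // nominal
}

fn main() {
    let num_blocks = 3;
    let mut degrees = vec![0_u64; num_blocks];
    let mut noise_levels = vec![0_u64; num_blocks];
    apply_lut_mock(3, &mut degrees, &mut noise_levels);

    // Write-back, as done for `new_item.ciphertext.info.blocks` in ilog2.rs.
    let mut blocks = vec![BlockInfo { degree: 0, noise_level: 0 }; num_blocks];
    for (b, (&d, &n)) in blocks.iter_mut().zip(degrees.iter().zip(&noise_levels)) {
        b.degree = d;
        b.noise_level = n;
    }
    assert_eq!(blocks[0], BlockInfo { degree: 3, noise_level: 1 });
}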
diff --git a/tfhe/src/integer/gpu/server_key/radix/ilog2.rs b/tfhe/src/integer/gpu/server_key/radix/ilog2.rs
index 43186f2310..6a858f3cd4 100644
--- a/tfhe/src/integer/gpu/server_key/radix/ilog2.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/ilog2.rs
@@ -12,6 +12,7 @@ use crate::integer::gpu::{
     reverse_blocks_inplace_async, PBSType,
 };
 use crate::integer::server_key::radix_parallel::ilog2::{BitValue, Direction};
+use crate::shortint::ciphertext::{Degree, NoiseLevel};
 
 impl CudaServerKey {
     /// This function takes a ciphertext in radix representation
@@ -73,6 +74,8 @@ impl CudaServerKey {
         let mut output_slice = tmp_radix
             .as_mut_slice(0..lwe_size * num_ct_blocks, 0)
             .unwrap();
+        let mut output_degrees = vec![0_u64; num_ct_blocks];
+        let mut output_noise_levels = vec![0_u64; num_ct_blocks];
 
         let input_slice = ct
             .as_ref()
@@ -89,8 +92,11 @@ impl CudaServerKey {
                 apply_univariate_lut_kb_async(
                     streams,
                     &mut output_slice,
+                    &mut output_degrees,
+                    &mut output_noise_levels,
                     &input_slice,
                     lut.acc.as_ref(),
+                    lut.degree.0,
                     &d_bsk.d_vec,
                     &self.key_switching_key.d_vec,
                     self.key_switching_key
@@ -113,8 +119,11 @@ impl CudaServerKey {
                 apply_univariate_lut_kb_async(
                     streams,
                     &mut output_slice,
+                    &mut output_degrees,
+                    &mut output_noise_levels,
                     &input_slice,
                     lut.acc.as_ref(),
+                    lut.degree.0,
                     &d_multibit_bsk.d_vec,
                     &self.key_switching_key.d_vec,
                     self.key_switching_key
@@ -573,6 +582,8 @@ impl CudaServerKey {
             .d_vec
             .as_mut_slice(0..lwe_size * counter_num_blocks, 0)
             .unwrap();
+        let mut message_blocks_degrees = vec![0_u64; counter_num_blocks];
+        let mut message_blocks_noise_levels = vec![0_u64; counter_num_blocks];
         let result_slice = result
             .as_mut()
             .d_blocks
@@ -586,8 +597,11 @@ impl CudaServerKey {
                 apply_univariate_lut_kb_async(
                     streams,
                     &mut message_blocks_slice,
+                    &mut message_blocks_degrees,
+                    &mut message_blocks_noise_levels,
                     &result_slice,
                     lut_a.acc.as_ref(),
+                    lut_a.degree.0,
                     &d_bsk.d_vec,
                     &self.key_switching_key.d_vec,
                     self.key_switching_key
@@ -610,8 +624,11 @@ impl CudaServerKey {
                 apply_univariate_lut_kb_async(
                     streams,
                     &mut message_blocks_slice,
+                    &mut message_blocks_degrees,
+                    &mut message_blocks_noise_levels,
                     &result_slice,
                     lut_a.acc.as_ref(),
+                    lut_a.degree.0,
                     &d_multibit_bsk.d_vec,
                     &self.key_switching_key.d_vec,
                     self.key_switching_key
@@ -674,14 +691,19 @@ impl CudaServerKey {
             .d_vec
             .as_mut_slice(0..lwe_size * counter_num_blocks, 0)
             .unwrap();
+        let mut carry_blocks_degrees = vec![0_u64; counter_num_blocks];
+        let mut carry_blocks_noise_levels = vec![0_u64; counter_num_blocks];
 
         unsafe {
             match &self.bootstrapping_key {
                 CudaBootstrappingKey::Classic(d_bsk) => {
                     apply_univariate_lut_kb_async(
                         streams,
                         &mut carry_blocks_slice,
+                        &mut carry_blocks_degrees,
+                        &mut carry_blocks_noise_levels,
                         &result_slice,
                         lut_b.acc.as_ref(),
+                        lut_b.degree.0,
                         &d_bsk.d_vec,
                         &self.key_switching_key.d_vec,
                         self.key_switching_key
@@ -704,8 +726,11 @@ impl CudaServerKey {
                     apply_univariate_lut_kb_async(
                         streams,
                         &mut carry_blocks_slice,
+                        &mut carry_blocks_degrees,
+                        &mut carry_blocks_noise_levels,
                         &result_slice,
                         lut_b.acc.as_ref(),
+                        lut_b.degree.0,
                         &d_multibit_bsk.d_vec,
                         &self.key_switching_key.d_vec,
                         self.key_switching_key
@@ -731,6 +756,10 @@ impl CudaServerKey {
 
         let mut new_item: CudaSignedRadixCiphertext =
             self.create_trivial_zero_radix_async(counter_num_blocks, streams);
+        for (i, b) in new_item.ciphertext.info.blocks.iter_mut().enumerate() {
+            b.degree = Degree(message_blocks_degrees[i]);
+            b.noise_level = NoiseLevel(message_blocks_noise_levels[i]);
+        }
         let mut dest_slice = new_item
             .as_mut()
             .d_blocks
@@ -751,6 +780,10 @@ impl CudaServerKey {
 
         let mut new_item: CudaSignedRadixCiphertext =
             self.create_trivial_zero_radix_async(counter_num_blocks, streams);
+        for (i, b) in new_item.ciphertext.info.blocks.iter_mut().enumerate() {
+            b.degree = Degree(carry_blocks_degrees[i]);
+            b.noise_level = NoiseLevel(carry_blocks_noise_levels[i]);
+        }
         let mut dest_slice = new_item
             .as_mut()
             .d_blocks
diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs
index ba0bed262c..db0ed05a7a 100644
--- a/tfhe/src/integer/gpu/server_key/radix/mod.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs
@@ -920,6 +920,7 @@ impl CudaServerKey {
 
         let lwe_dimension = input.d_blocks.lwe_dimension();
         let lwe_size = lwe_dimension.to_lwe_size().0;
+        let num_output_blocks = output.d_blocks.lwe_ciphertext_count().0;
 
         let input_slice = input
             .d_blocks
@@ -928,62 +929,72 @@ impl CudaServerKey {
             .as_slice(lwe_size * block_range.start..lwe_size * block_range.end, 0)
             .unwrap();
         let mut output_slice = output.d_blocks.0.d_vec.as_mut_slice(.., 0).unwrap();
+        let mut output_degrees = vec![0_u64; num_output_blocks];
+        let mut output_noise_levels = vec![0_u64; num_output_blocks];
         let num_ct_blocks = block_range.len() as u32;
 
-        match &self.bootstrapping_key {
-            CudaBootstrappingKey::Classic(d_bsk) => {
-                apply_univariate_lut_kb_async(
-                    streams,
-                    &mut output_slice,
-                    &input_slice,
-                    lut.acc.as_ref(),
-                    &d_bsk.d_vec,
-                    &self.key_switching_key.d_vec,
-                    self.key_switching_key
-                        .output_key_lwe_size()
-                        .to_lwe_dimension(),
-                    d_bsk.glwe_dimension,
-                    d_bsk.polynomial_size,
-                    self.key_switching_key.decomposition_level_count(),
-                    self.key_switching_key.decomposition_base_log(),
-                    d_bsk.decomp_level_count,
-                    d_bsk.decomp_base_log,
-                    num_ct_blocks,
-                    self.message_modulus,
-                    self.carry_modulus,
-                    PBSType::Classical,
-                    LweBskGroupingFactor(0),
-                );
-            }
-            CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
-                apply_univariate_lut_kb_async(
-                    streams,
-                    &mut output_slice,
-                    &input_slice,
-                    lut.acc.as_ref(),
-                    &d_multibit_bsk.d_vec,
-                    &self.key_switching_key.d_vec,
-                    self.key_switching_key
-                        .output_key_lwe_size()
-                        .to_lwe_dimension(),
-                    d_multibit_bsk.glwe_dimension,
-                    d_multibit_bsk.polynomial_size,
-                    self.key_switching_key.decomposition_level_count(),
-                    self.key_switching_key.decomposition_base_log(),
-                    d_multibit_bsk.decomp_level_count,
-                    d_multibit_bsk.decomp_base_log,
-                    num_ct_blocks,
-                    self.message_modulus,
-                    self.carry_modulus,
-                    PBSType::MultiBit,
-                    d_multibit_bsk.grouping_factor,
-                );
-            }
-        };
+        unsafe {
+            match &self.bootstrapping_key {
+                CudaBootstrappingKey::Classic(d_bsk) => {
+                    apply_univariate_lut_kb_async(
+                        streams,
+                        &mut output_slice,
+                        &mut output_degrees,
+                        &mut output_noise_levels,
+                        &input_slice,
+                        lut.acc.as_ref(),
+                        lut.degree.0,
+                        &d_bsk.d_vec,
+                        &self.key_switching_key.d_vec,
+                        self.key_switching_key
+                            .output_key_lwe_size()
+                            .to_lwe_dimension(),
+                        d_bsk.glwe_dimension,
+                        d_bsk.polynomial_size,
+                        self.key_switching_key.decomposition_level_count(),
+                        self.key_switching_key.decomposition_base_log(),
+                        d_bsk.decomp_level_count,
+                        d_bsk.decomp_base_log,
+                        num_ct_blocks,
+                        self.message_modulus,
+                        self.carry_modulus,
+                        PBSType::Classical,
+                        LweBskGroupingFactor(0),
+                    );
+                }
+                CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
+                    apply_univariate_lut_kb_async(
+                        streams,
+                        &mut output_slice,
+                        &mut output_degrees,
+                        &mut output_noise_levels,
+                        &input_slice,
+                        lut.acc.as_ref(),
+                        lut.degree.0,
+                        &d_multibit_bsk.d_vec,
+                        &self.key_switching_key.d_vec,
+                        self.key_switching_key
+                            .output_key_lwe_size()
+                            .to_lwe_dimension(),
+                        d_multibit_bsk.glwe_dimension,
+                        d_multibit_bsk.polynomial_size,
+                        self.key_switching_key.decomposition_level_count(),
+                        self.key_switching_key.decomposition_base_log(),
+                        d_multibit_bsk.decomp_level_count,
+                        d_multibit_bsk.decomp_base_log,
+                        num_ct_blocks,
+                        self.message_modulus,
+                        self.carry_modulus,
+                        PBSType::MultiBit,
+                        d_multibit_bsk.grouping_factor,
+                    );
+                }
+            };
+        }
 
-        for info in output.info.blocks[block_range].iter_mut() {
-            info.degree = lut.degree;
-            info.noise_level = NoiseLevel::NOMINAL;
+        for (i, info) in output.info.blocks[block_range].iter_mut().enumerate() {
+            info.degree = Degree(output_degrees[i]);
+            info.noise_level = NoiseLevel(output_noise_levels[i]);
         }
     }
 
     /// Applies many lookup tables on the range of ciphertexts
@@ -1218,6 +1229,8 @@ impl CudaServerKey {
             .unwrap();
         let (padding_block, new_blocks) = output_slice.split_at_mut(lwe_size, 0);
         let mut padding_block = padding_block.unwrap();
+        let mut padding_block_degree = vec![0_u64; 1];
+        let mut padding_block_noise_level = vec![0_u64; 1];
         let mut new_blocks = new_blocks.unwrap();
 
         match &self.bootstrapping_key {
@@ -1225,8 +1238,11 @@ impl CudaServerKey {
                 apply_univariate_lut_kb_async(
                     streams,
                     &mut padding_block,
+                    &mut padding_block_degree,
+                    &mut padding_block_noise_level,
                     &last_block,
                     padding_block_creator_lut.acc.as_ref(),
+                    padding_block_creator_lut.degree.0,
                     &d_bsk.d_vec,
                     &self.key_switching_key.d_vec,
                     self.key_switching_key
@@ -1249,8 +1265,11 @@ impl CudaServerKey {
                 apply_univariate_lut_kb_async(
                     streams,
                     &mut padding_block,
+                    &mut padding_block_degree,
+                    &mut padding_block_noise_level,
                     &last_block,
                     padding_block_creator_lut.acc.as_ref(),
+                    padding_block_creator_lut.degree.0,
                     &d_multibit_bsk.d_vec,
                     &self.key_switching_key.d_vec,
                     self.key_switching_key
@@ -1283,9 +1302,11 @@ impl CudaServerKey {
             ciphertext_modulus: self.ciphertext_modulus,
         });
         let mut info = ct.as_ref().info.clone();
-        let last_block_info = ct.as_ref().info.blocks.last().unwrap();
+        let mut last_block_info = *ct.as_ref().info.blocks.last().unwrap();
+        last_block_info.degree = Degree(padding_block_degree[0]);
+        last_block_info.noise_level = NoiseLevel(padding_block_noise_level[0]);
         for _ in num_ct_blocks..new_num_ct_blocks {
-            info.blocks.push(*last_block_info);
+            info.blocks.push(last_block_info);
         }
 
         T::from(CudaRadixCiphertext::new(output_lwe_list, info))
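vector_find.rs then drops its hand-rolled per-bootstrapping-key LUT dispatch in favour of `apply_lookup_table_async`, and the `after_aggregate_one_hot_vector` shortcut (deleted in info.rs above) is no longer needed: block metadata now comes from the LUT application itself and is copied, block by block, when the message and carry blocks are interleaved into the output. A toy version of that interleaving over plain vectors (illustrative `Block` type, not the library's):

#[derive(Clone, Copy, Debug, PartialEq)]
struct Block {
    value: u64,
    degree: u64,
}

// Message block i lands at output block 2*i, carry block i at 2*i + 1,
// and each output block inherits the matching metadata too.
fn interleave(message: &[Block], carry: &[Block]) -> Vec<Block> {
    assert_eq!(message.len(), carry.len());
    let mut out = vec![Block { value: 0, degree: 0 }; 2 * message.len()];
    for i in 0..message.len() {
        out[2 * i] = message[i];
        out[2 * i + 1] = carry[i];
    }
    out
}

fn main() {
    let message = [Block { value: 1, degree: 3 }, Block { value: 2, degree: 3 }];
    let carry = [Block { value: 0, degree: 1 }, Block { value: 1, degree: 1 }];
    let out = interleave(&message, &carry);
    assert_eq!(out[1], carry[0]);
    assert_eq!(out[2], message[1]);
}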
diff --git a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs
index 3f88e83f0c..98505b5036 100644
--- a/tfhe/src/integer/gpu/server_key/radix/vector_find.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/vector_find.rs
@@ -1614,74 +1614,17 @@ impl CudaServerKey {
                 )
             };
         }
-        let mut temp = aggregated_vector.duplicate(streams);
-        let mut aggregated_mut_slice = aggregated_vector
-            .as_mut()
-            .d_blocks
-            .0
-            .d_vec
-            .as_mut_slice(0..lwe_size * num_ct_blocks, 0)
-            .unwrap();
+        let temp = unsafe { aggregated_vector.duplicate_async(streams) };
         unsafe {
-            let aggregated_slice = temp
-                .as_mut()
-                .d_blocks
-                .0
-                .d_vec
-                .as_slice(0..lwe_size * num_ct_blocks, 0)
-                .unwrap();
-            match &self.bootstrapping_key {
-                CudaBootstrappingKey::Classic(d_bsk) => {
-                    apply_univariate_lut_kb_async(
-                        streams,
-                        &mut aggregated_mut_slice,
-                        &aggregated_slice,
-                        identity_lut.acc.as_ref(),
-                        &d_bsk.d_vec,
-                        &self.key_switching_key.d_vec,
-                        self.key_switching_key
-                            .output_key_lwe_size()
-                            .to_lwe_dimension(),
-                        d_bsk.glwe_dimension,
-                        d_bsk.polynomial_size,
-                        self.key_switching_key.decomposition_level_count(),
-                        self.key_switching_key.decomposition_base_log(),
-                        d_bsk.decomp_level_count,
-                        d_bsk.decomp_base_log,
-                        num_ct_blocks as u32,
-                        self.message_modulus,
-                        self.carry_modulus,
-                        PBSType::Classical,
-                        LweBskGroupingFactor(0),
-                    );
-                }
-                CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
-                    apply_univariate_lut_kb_async(
-                        streams,
-                        &mut aggregated_mut_slice,
-                        &aggregated_slice,
-                        identity_lut.acc.as_ref(),
-                        &d_multibit_bsk.d_vec,
-                        &self.key_switching_key.d_vec,
-                        self.key_switching_key
-                            .output_key_lwe_size()
-                            .to_lwe_dimension(),
-                        d_multibit_bsk.glwe_dimension,
-                        d_multibit_bsk.polynomial_size,
-                        self.key_switching_key.decomposition_level_count(),
-                        self.key_switching_key.decomposition_base_log(),
-                        d_multibit_bsk.decomp_level_count,
-                        d_multibit_bsk.decomp_base_log,
-                        num_ct_blocks as u32,
-                        self.message_modulus,
-                        self.carry_modulus,
-                        PBSType::MultiBit,
-                        d_multibit_bsk.grouping_factor,
-                    );
-                }
-            }
-        }
+            self.apply_lookup_table_async(
+                aggregated_vector.as_mut(),
+                temp.as_ref(),
+                &identity_lut,
+                0..num_ct_blocks,
+                streams,
+            )
+        };
     }
     let last_chunk_size = one_hot_vector.len() - (num_chunks - 1) * chunk_size;
     for ct_idx in 0..last_chunk_size {
@@ -1700,157 +1643,60 @@ impl CudaServerKey {
         let carry_extract_lut =
             self.generate_lookup_table(|x| (x / self.message_modulus.0));
         let mut message_ct: T =
            unsafe { self.create_trivial_zero_radix_async(num_ct_blocks, streams) };
-        let mut message_mut_slice = message_ct
-            .as_mut()
-            .d_blocks
-            .0
-            .d_vec
-            .as_mut_slice(0..lwe_size * num_ct_blocks, 0)
-            .unwrap();
 
         let mut carry_ct: T =
            unsafe { self.create_trivial_zero_radix_async(num_ct_blocks, streams) };
-        let mut carry_mut_slice = carry_ct
-            .as_mut()
-            .d_blocks
-            .0
-            .d_vec
-            .as_mut_slice(0..lwe_size * num_ct_blocks, 0)
-            .unwrap();
+        let temp = unsafe { aggregated_vector.duplicate_async(streams) };
         unsafe {
-            let mut temp = aggregated_vector.duplicate(streams);
-            let aggregated_slice = temp
-                .as_mut()
-                .d_blocks
-                .0
-                .d_vec
-                .as_slice(0..lwe_size * num_ct_blocks, 0)
-                .unwrap();
-            match &self.bootstrapping_key {
-                CudaBootstrappingKey::Classic(d_bsk) => {
-                    apply_univariate_lut_kb_async(
-                        streams,
-                        &mut carry_mut_slice,
-                        &aggregated_slice,
-                        carry_extract_lut.acc.as_ref(),
-                        &d_bsk.d_vec,
-                        &self.key_switching_key.d_vec,
-                        self.key_switching_key
-                            .output_key_lwe_size()
-                            .to_lwe_dimension(),
-                        d_bsk.glwe_dimension,
-                        d_bsk.polynomial_size,
-                        self.key_switching_key.decomposition_level_count(),
-                        self.key_switching_key.decomposition_base_log(),
-                        d_bsk.decomp_level_count,
-                        d_bsk.decomp_base_log,
-                        num_ct_blocks as u32,
-                        self.message_modulus,
-                        self.carry_modulus,
-                        PBSType::Classical,
-                        LweBskGroupingFactor(0),
-                    );
-                }
-                CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
-                    apply_univariate_lut_kb_async(
-                        streams,
-                        &mut carry_mut_slice,
-                        &aggregated_slice,
-                        carry_extract_lut.acc.as_ref(),
-                        &d_multibit_bsk.d_vec,
-                        &self.key_switching_key.d_vec,
-                        self.key_switching_key
-                            .output_key_lwe_size()
-                            .to_lwe_dimension(),
-                        d_multibit_bsk.glwe_dimension,
-                        d_multibit_bsk.polynomial_size,
-                        self.key_switching_key.decomposition_level_count(),
-                        self.key_switching_key.decomposition_base_log(),
-                        d_multibit_bsk.decomp_level_count,
-                        d_multibit_bsk.decomp_base_log,
-                        num_ct_blocks as u32,
-                        self.message_modulus,
-                        self.carry_modulus,
-                        PBSType::MultiBit,
-                        d_multibit_bsk.grouping_factor,
-                    );
-                }
-            }
-        }
+            self.apply_lookup_table_async(
+                carry_ct.as_mut(),
+                temp.as_ref(),
+                &carry_extract_lut,
+                0..num_ct_blocks,
+                streams,
+            )
+        };
         unsafe {
-            let mut temp = aggregated_vector.duplicate(streams);
-            let aggregated_slice = temp
-                .as_mut()
-                .d_blocks
-                .0
-                .d_vec
-                .as_slice(0..lwe_size * num_ct_blocks, 0)
-                .unwrap();
-            match &self.bootstrapping_key {
-                CudaBootstrappingKey::Classic(d_bsk) => {
-                    apply_univariate_lut_kb_async(
-                        streams,
-                        &mut message_mut_slice,
-                        &aggregated_slice,
-                        message_extract_lut.acc.as_ref(),
-                        &d_bsk.d_vec,
-                        &self.key_switching_key.d_vec,
-                        self.key_switching_key
-                            .output_key_lwe_size()
-                            .to_lwe_dimension(),
-                        d_bsk.glwe_dimension,
-                        d_bsk.polynomial_size,
-                        self.key_switching_key.decomposition_level_count(),
-                        self.key_switching_key.decomposition_base_log(),
-                        d_bsk.decomp_level_count,
-                        d_bsk.decomp_base_log,
-                        num_ct_blocks as u32,
-                        self.message_modulus,
-                        self.carry_modulus,
-                        PBSType::Classical,
-                        LweBskGroupingFactor(0),
-                    );
-                }
-                CudaBootstrappingKey::MultiBit(d_multibit_bsk) => {
-                    apply_univariate_lut_kb_async(
-                        streams,
-                        &mut message_mut_slice,
-                        &aggregated_slice,
-                        message_extract_lut.acc.as_ref(),
-                        &d_multibit_bsk.d_vec,
-                        &self.key_switching_key.d_vec,
-                        self.key_switching_key
-                            .output_key_lwe_size()
-                            .to_lwe_dimension(),
-                        d_multibit_bsk.glwe_dimension,
-                        d_multibit_bsk.polynomial_size,
-                        self.key_switching_key.decomposition_level_count(),
-                        self.key_switching_key.decomposition_base_log(),
-                        d_multibit_bsk.decomp_level_count,
-                        d_multibit_bsk.decomp_base_log,
-                        num_ct_blocks as u32,
-                        self.message_modulus,
-                        self.carry_modulus,
-                        PBSType::MultiBit,
-                        d_multibit_bsk.grouping_factor,
-                    );
-                }
-            }
-        }
+            self.apply_lookup_table_async(
+                message_ct.as_mut(),
+                temp.as_ref(),
+                &message_extract_lut,
+                0..num_ct_blocks,
+                streams,
+            )
+        };
 
         let mut output_ct: T =
             unsafe { self.create_trivial_zero_radix_async(num_ct_blocks * 2, streams) };
         // unpacked_blocks
         for index in 0..num_ct_blocks {
-            let mut output_mut_slice1 = output_ct
-                .as_mut()
+            let output_ct_inner = output_ct.as_mut();
+            let mut output_mut_slice1 = output_ct_inner
                 .d_blocks
                 .0
                 .d_vec
                 .as_mut_slice(2 * index * lwe_size..(2 * (index) * lwe_size + lwe_size), 0)
                 .unwrap();
+            output_ct_inner
+                .info
+                .blocks
+                .get_mut(2 * index)
+                .unwrap()
+                .degree = message_ct.as_mut().info.blocks.get(index).unwrap().degree;
+            output_ct_inner
+                .info
+                .blocks
+                .get_mut(2 * index)
+                .unwrap()
+                .noise_level = message_ct
+                .as_mut()
+                .info
+                .blocks
+                .get(index)
+                .unwrap()
+                .noise_level;
 
             let message_mut_slice = message_ct
                 .as_mut()
                 .d_blocks
                 .0
                 .d_vec
                 .as_mut_slice(index * lwe_size..(index + 1) * lwe_size, 0)
                 .unwrap();
 
             unsafe { output_mut_slice1.copy_from_gpu_async(&message_mut_slice, streams, 0) };
         }
         for index in 0..num_ct_blocks {
-            let mut output_mut_slice2 = output_ct
-                .as_mut()
+            let output_ct_inner = output_ct.as_mut();
+            let mut output_mut_slice2 = output_ct_inner
                 .d_blocks
                 .0
                 .d_vec
                 .as_mut_slice(
                     2 * index * lwe_size + lwe_size..2 * (index + 1) * lwe_size,
                     0,
                 )
                 .unwrap();
+            output_ct_inner
+                .info
+                .blocks
+                .get_mut(2 * index + 1)
+                .unwrap()
+                .degree = carry_ct.as_mut().info.blocks.get(index).unwrap().degree;
+            output_ct_inner
+                .info
+                .blocks
+                .get_mut(2 * index + 1)
+                .unwrap()
+                .noise_level = carry_ct
+                .as_mut()
+                .info
+                .blocks
+                .get(index)
+                .unwrap()
+                .noise_level;
 
             let carry_mut_slice = carry_ct
                 .as_mut()
                 .d_blocks
                 .0
                 .d_vec
                 .as_mut_slice(index * lwe_size..(index + 1) * lwe_size, 0)
                 .unwrap();
 
             unsafe { output_mut_slice2.copy_from_gpu_async(&carry_mut_slice, streams, 0) };
         }
         streams.synchronize();
-        output_ct.as_mut().info = output_ct.as_ref().info.after_aggregate_one_hot_vector();
 
         output_ct
     }
@@ -2011,14 +1874,19 @@ impl CudaServerKey {
             .d_vec
             .as_mut_slice(0..lwe_size * num_ct_blocks, 0)
             .unwrap();
+        let mut degrees_out = vec![0_u64; num_ct_blocks];
+        let mut noise_levels_out = vec![0_u64; num_ct_blocks];
 
         unsafe {
             match &self.bootstrapping_key {
                 CudaBootstrappingKey::Classic(d_bsk) => {
                     apply_univariate_lut_kb_async(
                         streams,
                         &mut slice_out,
+                        &mut degrees_out,
+                        &mut noise_levels_out,
                         &slice_in_final,
                         lut.acc.as_ref(),
+                        lut.degree.0,
                         &d_bsk.d_vec,
                         &self.key_switching_key.d_vec,
                         self.key_switching_key
@@ -2041,8 +1909,11 @@ impl CudaServerKey {
                     apply_univariate_lut_kb_async(
                         streams,
                         &mut slice_out,
+                        &mut degrees_out,
+                        &mut noise_levels_out,
                         &slice_in_final,
                         lut.acc.as_ref(),
+                        lut.degree.0,
                         &d_multibit_bsk.d_vec,
                         &self.key_switching_key.d_vec,
                         self.key_switching_key