diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h index 0e9144930d..bdfdabdd5d 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h @@ -41,8 +41,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory); + uint32_t num_samples, uint32_t max_shared_memory); void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -50,8 +49,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory); + uint32_t num_samples, uint32_t max_shared_memory); void cleanup_cuda_programmable_bootstrap_amortized(void *stream, uint32_t gpu_index, @@ -75,8 +73,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory); + uint32_t num_samples, uint32_t max_shared_memory); void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -84,8 +81,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory); + uint32_t num_samples, uint32_t max_shared_memory); void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, int8_t **pbs_buffer); @@ -353,8 +349,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -363,8 +358,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); #if (CUDA_ARCH >= 900) template @@ -374,8 +368,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); template void scratch_cuda_programmable_bootstrap_tbc( diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h index 7a636f404c..6d5d30781f 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h @@ -28,8 +28,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0); + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory, + uint32_t lwe_chunk_size = 0); void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, uint32_t gpu_index, @@ -63,8 +63,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size); + uint32_t max_shared_memory, uint32_t lwe_chunk_size); #endif template @@ -90,8 +89,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size = 0); + uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0); template void scratch_cuda_multi_bit_programmable_bootstrap( @@ -109,8 +107,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size = 0); + uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0); template __host__ __device__ uint64_t diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 71f9532a47..4469e71f3b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -177,7 +177,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, 1, 0, + grouping_factor, num_radix_blocks, cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); } else { /// Make sure all data that should be on GPU 0 is indeed there @@ -204,7 +204,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, - pbs_level, grouping_factor, num_radix_blocks, 1, 0, + pbs_level, grouping_factor, num_radix_blocks, cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); /// Copy data back to GPU 0 and release vecs @@ -270,7 +270,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, 1, 0, + grouping_factor, num_radix_blocks, cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); } else { cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -293,7 +293,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, - pbs_level, grouping_factor, num_radix_blocks, 1, 0, + pbs_level, grouping_factor, num_radix_blocks, cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); /// Copy data back to GPU 0 and release vecs @@ -696,8 +696,8 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer, params.glwe_dimension, params.small_lwe_dimension, params.polynomial_size, params.pbs_base_log, params.pbs_level, - params.grouping_factor, 2, 2, 0, - cuda_get_max_shared_memory(gpu_indexes[0]), params.pbs_type); + params.grouping_factor, 2, cuda_get_max_shared_memory(gpu_indexes[0]), + params.pbs_type); cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector, big_lwe_size * sizeof(Torus), streams[0], diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index a5feeb8102..d2caa89927 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -372,7 +372,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, - mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory, + mem_ptr->params.grouping_factor, total_count, max_shared_memory, mem_ptr->params.pbs_type); } else { cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -420,7 +420,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, - mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory, + mem_ptr->params.grouping_factor, total_count, max_shared_memory, mem_ptr->params.pbs_type); multi_gpu_gather_lwe_async( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index cc5bd54fa3..dacddb7bcc 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -127,8 +127,8 @@ void execute_pbs_async( std::vector pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory, PBS_TYPE pbs_type) { + uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, + PBS_TYPE pbs_type) { switch (sizeof(Torus)) { case sizeof(uint32_t): // 32 bits @@ -160,8 +160,8 @@ void execute_pbs_async( current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes, current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_inputs_on_gpu, + max_shared_memory); } break; default: @@ -200,7 +200,7 @@ void execute_pbs_async( current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory); + num_inputs_on_gpu, max_shared_memory); } break; case CLASSICAL: @@ -228,8 +228,8 @@ void execute_pbs_async( current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes, current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts, - lwe_idx, max_shared_memory); + polynomial_size, base_log, level_count, num_inputs_on_gpu, + max_shared_memory); } break; default: diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu index 0cf9b91498..084756a374 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu @@ -143,8 +143,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory) { + uint32_t num_samples, uint32_t max_shared_memory) { if (base_log > 32) PANIC("Cuda error (amortized PBS): base log should be > number of bits in " @@ -158,7 +157,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; case 512: host_programmable_bootstrap_amortized>( @@ -167,7 +166,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; case 1024: host_programmable_bootstrap_amortized>( @@ -176,7 +175,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; case 2048: host_programmable_bootstrap_amortized>( @@ -185,7 +184,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; case 4096: host_programmable_bootstrap_amortized>( @@ -194,7 +193,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; case 8192: host_programmable_bootstrap_amortized>( @@ -203,7 +202,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; case 16384: host_programmable_bootstrap_amortized>( @@ -212,7 +211,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " @@ -254,15 +253,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( * - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8) * - level_count: number of decomposition levels in the gadget matrix (~4) * - num_samples: number of encrypted input messages - * - num_luts: parameter to set the actual number of luts to be * used - * - lwe_idx: the index of the LWE input to consider for the GPU of index - * gpu_index. In case of multi-GPU computing, it is assumed that only a part of - * the input LWE array is copied to each GPU, but the whole LUT array is copied - * (because the case when the number of LUTs is smaller than the number of input - * LWEs is not trivial to take into account in the data repartition on the - * GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE - * input in the LUT array `lut_vector`. * - 'max_shared_memory' maximum amount of shared memory to be used inside * device functions * @@ -292,8 +283,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory) { + uint32_t num_samples, uint32_t max_shared_memory) { if (base_log > 64) PANIC("Cuda error (amortized PBS): base log should be > number of bits in " @@ -307,7 +297,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; case 512: host_programmable_bootstrap_amortized>( @@ -316,7 +306,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; case 1024: host_programmable_bootstrap_amortized>( @@ -325,7 +315,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; case 2048: host_programmable_bootstrap_amortized>( @@ -334,7 +324,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; case 4096: host_programmable_bootstrap_amortized>( @@ -343,7 +333,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; case 8192: host_programmable_bootstrap_amortized>( @@ -352,7 +342,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; case 16384: host_programmable_bootstrap_amortized>( @@ -361,7 +351,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory); + num_samples, max_shared_memory); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index be7cbf5b75..ee9fb4e1b9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -42,7 +42,6 @@ template * - base_log: log base used for the gadget matrix - B = 2^base_log (~8) * - level_count: number of decomposition levels in the gadget matrix (~4) * - gpu_num: index of the current GPU (useful for multi-GPU computations) - * - lwe_idx: equal to the number of samples per gpu x gpu_num * - device_memory_size_per_sample: amount of global memory to allocate if SMD * is not FULLSM */ @@ -54,7 +53,7 @@ __global__ void device_programmable_bootstrap_amortized( const Torus *__restrict__ lwe_input_indexes, const double2 *__restrict__ bootstrapping_key, int8_t *device_mem, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, uint32_t lwe_idx, + uint32_t base_log, uint32_t level_count, size_t device_memory_size_per_sample) { // We use shared memory for the polynomials that are used often during the // bootstrap, since shared memory is kept in L1 cache and accessing it is @@ -84,7 +83,7 @@ __global__ void device_programmable_bootstrap_amortized( auto block_lwe_array_in = &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)]; const Torus *block_lut_vector = - &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree * + &lut_vector[lut_vector_indexes[blockIdx.x] * params::degree * (glwe_dimension + 1)]; // Put "b", the body, in [0, 2N[ @@ -300,8 +299,7 @@ __host__ void host_programmable_bootstrap_amortized( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory) { + uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) { cudaSetDevice(gpu_index); uint64_t SM_FULL = @@ -335,14 +333,14 @@ __host__ void host_programmable_bootstrap_amortized( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, - level_count, lwe_idx, DM_FULL); + level_count, DM_FULL); } else if (max_shared_memory < SM_FULL) { device_programmable_bootstrap_amortized <<>>( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, - level_count, lwe_idx, DM_PART); + level_count, DM_PART); } else { // For devices with compute capability 7.x a single thread block can // address the full capacity of shared memory. Shared memory on the @@ -354,7 +352,7 @@ __host__ void host_programmable_bootstrap_amortized( lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, - level_count, lwe_idx, 0); + level_count, 0); } check_cuda_error(cudaGetLastError()); } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 32913f5f21..537eb5c66d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -204,7 +204,7 @@ __host__ void host_programmable_bootstrap_cg( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t max_shared_memory) { + uint32_t max_shared_memory) { cudaSetDevice(gpu_index); // With SM each block corresponds to either the mask or body, no need to diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index 92b7d560e6..2da2e4581c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -336,8 +336,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size = 0) { + uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); if (!lwe_chunk_size) diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index fed7b191b8..09a689405c 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -131,8 +131,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory) { + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: @@ -141,7 +140,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 512: host_programmable_bootstrap_tbc>( @@ -149,7 +148,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 1024: host_programmable_bootstrap_tbc>( @@ -157,7 +156,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 2048: host_programmable_bootstrap_tbc>( @@ -165,7 +164,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 4096: host_programmable_bootstrap_tbc>( @@ -173,7 +172,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 8192: host_programmable_bootstrap_tbc>( @@ -181,7 +180,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 16384: host_programmable_bootstrap_tbc>( @@ -189,7 +188,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " @@ -410,8 +409,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory) { + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: @@ -420,7 +418,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 512: host_programmable_bootstrap_cg>( @@ -428,7 +426,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 1024: host_programmable_bootstrap_cg>( @@ -436,7 +434,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 2048: host_programmable_bootstrap_cg>( @@ -444,7 +442,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 4096: host_programmable_bootstrap_cg>( @@ -452,7 +450,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 8192: host_programmable_bootstrap_cg>( @@ -460,7 +458,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 16384: host_programmable_bootstrap_cg>( @@ -468,7 +466,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " @@ -484,8 +482,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory) { + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { switch (polynomial_size) { case 256: @@ -494,7 +491,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 512: host_programmable_bootstrap>( @@ -502,7 +499,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 1024: host_programmable_bootstrap>( @@ -510,7 +507,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 2048: host_programmable_bootstrap>( @@ -518,7 +515,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 4096: host_programmable_bootstrap>( @@ -526,7 +523,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 8192: host_programmable_bootstrap>( @@ -534,7 +531,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; case 16384: host_programmable_bootstrap>( @@ -542,7 +539,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, max_shared_memory); + max_shared_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " @@ -559,15 +556,14 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory) { + uint32_t num_samples, uint32_t max_shared_memory) { if (base_log > 32) PANIC("Cuda error (classical PBS): base log should be > number of bits " "in the ciphertext representation (32)"); - pbs_buffer *buffer = - (pbs_buffer *)mem_ptr; + pbs_buffer *buffer = + (pbs_buffer *)mem_ptr; switch (buffer->pbs_variant) { case TBC: @@ -579,14 +575,13 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lut_vector_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), - static_cast(bootstrapping_key), - (pbs_buffer *)buffer, lwe_dimension, + static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory); + max_shared_memory); + break; #else PANIC("Cuda error (PBS): TBC pbs is not supported.") #endif - break; case CG: cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), @@ -595,10 +590,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lut_vector_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), - static_cast(bootstrapping_key), - (pbs_buffer *)buffer, lwe_dimension, + static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory); + max_shared_memory); break; case DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -608,10 +602,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lut_vector_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), - static_cast(bootstrapping_key), - (pbs_buffer *)buffer, lwe_dimension, + static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory); + max_shared_memory); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -654,15 +647,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( * - base_log: log base used for the gadget matrix - B = 2^base_log (~8) * - level_count: number of decomposition levels in the gadget matrix (~4) * - num_samples: number of encrypted input messages - * - num_luts: parameter to set the actual number of luts to be - * used - * - lwe_idx: the index of the LWE input to consider for the GPU of index - * gpu_index. In case of multi-GPU computing, it is assumed that only a part of - * the input LWE array is copied to each GPU, but the whole LUT array is copied - * (because the case when the number of LUTs is smaller than the number of input - * LWEs is not trivial to take into account in the data repartition on the - * GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE - * input in the LUT array `lut_vector`. * - 'max_shared_memory' maximum amount of shared memory to be used inside * device functions * @@ -696,8 +680,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx, - uint32_t max_shared_memory) { + uint32_t num_samples, uint32_t max_shared_memory) { if (base_log > 64) PANIC("Cuda error (classical PBS): base log should be > number of bits " "in the ciphertext representation (64)"); @@ -715,14 +698,13 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lut_vector_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), - static_cast(bootstrapping_key), - (pbs_buffer *)buffer, lwe_dimension, + static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory); + max_shared_memory); + break; #else PANIC("Cuda error (PBS): TBC pbs is not supported.") #endif - break; case PBS_VARIANT::CG: cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( stream, gpu_index, static_cast(lwe_array_out), @@ -731,10 +713,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lut_vector_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), - static_cast(bootstrapping_key), - (pbs_buffer *)buffer, lwe_dimension, + static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory); + max_shared_memory); break; case PBS_VARIANT::DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -744,10 +725,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lut_vector_indexes), static_cast(lwe_array_in), static_cast(lwe_input_indexes), - static_cast(bootstrapping_key), - (pbs_buffer *)buffer, lwe_dimension, + static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, base_log, level_count, num_samples, - num_luts, lwe_idx, max_shared_memory); + max_shared_memory); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -760,7 +740,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( */ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, int8_t **buffer) { - cudaSetDevice(gpu_index); auto x = (pbs_buffer *)(*buffer); x->release(static_cast(stream), gpu_index); } @@ -776,8 +755,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint64_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, @@ -786,8 +764,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint64_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, @@ -809,8 +786,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint32_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, @@ -819,8 +795,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint32_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, @@ -850,8 +825,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint32_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t *lwe_output_indexes, uint64_t *lut_vector, @@ -859,8 +833,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint64_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index ebce576c81..da33298a2f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -419,7 +419,7 @@ __host__ void host_programmable_bootstrap( pbs_buffer *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t max_shared_memory) { + uint32_t max_shared_memory) { cudaSetDevice(gpu_index); // With SM each block corresponds to either the mask or body, no need to diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index 3bff782749..48ad5fd8fe 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -74,8 +74,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size) { + uint32_t max_shared_memory, uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -88,7 +87,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 512: host_cg_multi_bit_programmable_bootstrap>( @@ -96,7 +95,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 1024: host_cg_multi_bit_programmable_bootstrap>( @@ -104,7 +103,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 2048: host_cg_multi_bit_programmable_bootstrap>( @@ -112,7 +111,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 4096: host_cg_multi_bit_programmable_bootstrap>( @@ -120,7 +119,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 8192: host_cg_multi_bit_programmable_bootstrap>( @@ -128,7 +127,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 16384: host_cg_multi_bit_programmable_bootstrap>( @@ -136,7 +135,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -153,8 +152,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size) { + uint32_t max_shared_memory, uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -167,7 +165,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 512: host_multi_bit_programmable_bootstrap>( @@ -175,7 +173,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 1024: host_multi_bit_programmable_bootstrap>( @@ -183,7 +181,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 2048: host_multi_bit_programmable_bootstrap>( @@ -191,7 +189,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 4096: host_multi_bit_programmable_bootstrap>( @@ -199,7 +197,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 8192: host_multi_bit_programmable_bootstrap>( @@ -207,7 +205,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 16384: host_multi_bit_programmable_bootstrap>( @@ -215,7 +213,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -230,8 +228,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t num_luts, - uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size) { + uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory, + uint32_t lwe_chunk_size) { pbs_buffer *buffer = (pbs_buffer *)mem_ptr; @@ -248,7 +246,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; #else PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.") @@ -263,7 +261,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case PBS_VARIANT::DEFAULT: cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -275,7 +273,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.") @@ -527,8 +525,7 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size); + uint32_t max_shared_memory, uint32_t lwe_chunk_size); template void scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, @@ -546,8 +543,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size); + uint32_t max_shared_memory, uint32_t lwe_chunk_size); template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( @@ -627,8 +623,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size) { + uint32_t max_shared_memory, uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -642,7 +637,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size); + num_samples, max_shared_memory, lwe_chunk_size); break; case 512: host_tbc_multi_bit_programmable_bootstrap( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size); + uint32_t max_shared_memory, uint32_t lwe_chunk_size); #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index da8c70bb05..b409f935ec 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -123,8 +123,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( synchronize_threads_in_block(); // Move from local memory back to shared memory but as complex - double2 *fft = (double2 *)selected_memory; tid = threadIdx.x; + double2 *fft = (double2 *)selected_memory; #pragma unroll for (int i = 0; i < params::opt / 2; i++) { fft[tid] = temp[i]; @@ -630,8 +630,7 @@ __host__ void host_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size = 0) { + uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); // If a chunk size is not passed to this function, select one. diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index a66473310d..f63f22748e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -227,7 +227,7 @@ __host__ void host_programmable_bootstrap_tbc( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t num_luts, uint32_t max_shared_memory) { + uint32_t max_shared_memory) { cudaSetDevice(gpu_index); auto supports_dsm = diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 656d4133bf..57378b5127 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -365,8 +365,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory, - uint32_t lwe_chunk_size = 0) { + uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); if (!lwe_chunk_size) diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp index 4f411ead83..f084244d04 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp @@ -128,8 +128,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) { (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, - pbs_level, number_of_inputs, 1, 0, - cuda_get_max_shared_memory(gpu_index)); + pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index)); // Copy result back cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, (glwe_dimension * polynomial_size + 1) * @@ -191,8 +190,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) { (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, - pbs_level, number_of_inputs, 1, 0, - cuda_get_max_shared_memory(gpu_index)); + pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index)); // Copy result back cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, (glwe_dimension * polynomial_size + 1) * diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 322be12f2a..8d84b14a63 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -168,13 +168,6 @@ extern "C" { /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8) /// - `level_count`: number of decomposition levels in the gadget matrix (~4) /// - `num_samples`: number of encrypted input messages - /// - `num_lut_vectors`: parameter to set the actual number of test vectors to be used - /// - `lwe_idx`: the index of the LWE input to consider for the GPU of index gpu_index. In case - /// of multi-GPU computing, it is assumed that only a part of the input LWE array is copied to - /// each GPU, but the whole LUT array is copied (because the case when the number of LUTs is - /// smaller than the number of input LWEs is not trivial to take into account in the data - /// repartition on the GPUs). `lwe_idx` is used to determine which LUT to consider for a given - /// LWE input in the LUT array `lut_vector`. /// - `max_shared_memory` maximum amount of shared memory to be used inside device functions /// /// This function calls a wrapper to a device kernel that performs the @@ -212,10 +205,7 @@ extern "C" { base_log: u32, level: u32, num_samples: u32, - num_lut_vectors: u32, - lwe_idx: u32, max_shared_memory: u32, - gpu_offset: u32, ); /// This cleanup function frees the data for the low latency PBS on GPU @@ -269,13 +259,6 @@ extern "C" { /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8) /// - `level_count`: number of decomposition levels in the gadget matrix (~4) /// - `num_samples`: number of encrypted input messages - /// - `num_lut_vectors`: parameter to set the actual number of test vectors to be used - /// - `lwe_idx`: the index of the LWE input to consider for the GPU of index gpu_index. In case - /// of multi-GPU computing, it is assumed that only a part of the input LWE array is copied to - /// each GPU, but the whole LUT array is copied (because the case when the number of LUTs is - /// smaller than the number of input LWEs is not trivial to take into account in the data - /// repartition on the GPUs). `lwe_idx` is used to determine which LUT to consider for a given - /// LWE input in the LUT array `lut_vector`. /// - `max_shared_memory` maximum amount of shared memory to be used inside device functions pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( stream: *mut c_void, @@ -295,10 +278,7 @@ extern "C" { base_log: u32, level: u32, num_samples: u32, - num_lut_vectors: u32, - lwe_idx: u32, max_shared_memory: u32, - gpu_offset: u32, lwe_chunk_size: u32, ); diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs index e949140503..ff4196e266 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs @@ -3,7 +3,7 @@ use crate::core_crypto::gpu::entities::lwe_ciphertext_list::CudaLweCiphertextLis use crate::core_crypto::gpu::entities::lwe_multi_bit_bootstrap_key::CudaLweMultiBitBootstrapKey; use crate::core_crypto::gpu::vec::CudaVec; use crate::core_crypto::gpu::{programmable_bootstrap_multi_bit_async, CudaStreams}; -use crate::core_crypto::prelude::{CastInto, LweCiphertextIndex, UnsignedTorus}; +use crate::core_crypto::prelude::{CastInto, UnsignedTorus}; /// # Safety /// @@ -91,7 +91,6 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async multi_bit_bsk.decomp_level_count(), multi_bit_bsk.grouping_factor(), input.lwe_ciphertext_count().0 as u32, - LweCiphertextIndex(0), ); } diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs index 8eca70a49a..5a27a56440 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs @@ -3,9 +3,7 @@ use crate::core_crypto::gpu::entities::lwe_bootstrap_key::CudaLweBootstrapKey; use crate::core_crypto::gpu::entities::lwe_ciphertext_list::CudaLweCiphertextList; use crate::core_crypto::gpu::vec::CudaVec; use crate::core_crypto::gpu::{programmable_bootstrap_async, CudaStreams}; -use crate::core_crypto::prelude::{ - CastInto, LweCiphertextCount, LweCiphertextIndex, UnsignedTorus, -}; +use crate::core_crypto::prelude::{CastInto, LweCiphertextCount, UnsignedTorus}; /// # Safety /// @@ -93,7 +91,6 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async( bsk.decomp_base_log(), bsk.decomp_level_count(), num_samples.0 as u32, - LweCiphertextIndex(0), ); } diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 5a54d2a569..a7e261104e 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -110,7 +110,6 @@ pub unsafe fn programmable_bootstrap_async( base_log: DecompositionBaseLog, level: DecompositionLevelCount, num_samples: u32, - lwe_idx: LweCiphertextIndex, ) { let mut pbs_buffer: *mut i8 = std::ptr::null_mut(); scratch_cuda_programmable_bootstrap_64( @@ -141,10 +140,7 @@ pub unsafe fn programmable_bootstrap_async( base_log.0 as u32, level.0 as u32, num_samples, - num_samples, - lwe_idx.0 as u32, get_max_shared_memory(streams.gpu_indexes[0]) as u32, - 0, ); cleanup_cuda_programmable_bootstrap( streams.ptr[0], @@ -176,7 +172,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async( level: DecompositionLevelCount, grouping_factor: LweBskGroupingFactor, num_samples: u32, - lwe_idx: LweCiphertextIndex, ) { let mut pbs_buffer: *mut i8 = std::ptr::null_mut(); scratch_cuda_multi_bit_programmable_bootstrap_64( @@ -211,10 +206,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async( base_log.0 as u32, level.0 as u32, num_samples, - num_samples, - lwe_idx.0 as u32, get_max_shared_memory(0) as u32, - 0u32, 0, ); cleanup_cuda_multi_bit_programmable_bootstrap(