From 821ce5a7703ff2f3e6f138d8cf96a94ce547186a Mon Sep 17 00:00:00 2001 From: Kelong Cong Date: Wed, 31 Jul 2024 10:09:50 +0200 Subject: [PATCH] refactor(gpu): remove max_shared_memory from pbs arguments Always use max shared memory from device 0 to configure the kernels, to avoid bugs with multi-GPU configurations --- .../tfhe-cuda-backend/cuda/include/integer.h | 3 +- .../cuda/include/programmable_bootstrap.h | 57 ++-- .../include/programmable_bootstrap_multibit.h | 38 ++- .../cuda/src/integer/integer.cuh | 15 +- .../cuda/src/integer/multiplication.cuh | 4 +- .../cuda/src/pbs/bootstrapping_key.cuh | 28 +- .../cuda/src/pbs/programmable_bootstrap.cuh | 22 +- .../pbs/programmable_bootstrap_amortized.cu | 73 +++--- .../pbs/programmable_bootstrap_amortized.cuh | 19 +- .../pbs/programmable_bootstrap_cg_classic.cuh | 41 ++- .../programmable_bootstrap_cg_multibit.cuh | 61 ++--- .../src/pbs/programmable_bootstrap_classic.cu | 248 +++++++----------- .../pbs/programmable_bootstrap_classic.cuh | 66 ++--- .../pbs/programmable_bootstrap_multibit.cu | 209 +++++++-------- .../pbs/programmable_bootstrap_multibit.cuh | 68 +++-- .../programmable_bootstrap_tbc_classic.cuh | 29 +- .../programmable_bootstrap_tbc_multibit.cuh | 44 ++-- .../benchmarks/benchmark_pbs.cpp | 43 ++- .../tests/test_classical_pbs.cpp | 12 +- .../tests/test_multibit_pbs.cpp | 6 +- backends/tfhe-cuda-backend/src/cuda_bind.rs | 10 - tfhe/src/core_crypto/gpu/mod.rs | 11 - tfhe/src/integer/gpu/mod.rs | 3 +- 23 files changed, 489 insertions(+), 621 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index 33d862cd29..2fc75c10b9 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -526,8 +526,7 @@ template struct int_radix_lut { execute_scratch_pbs( streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension, params.small_lwe_dimension, params.polynomial_size, params.pbs_level, - params.grouping_factor, num_blocks_on_gpu, - cuda_get_max_shared_memory(gpu_indexes[i]), params.pbs_type, + params.grouping_factor, num_blocks_on_gpu, params.pbs_type, allocate_gpu_memory); cuda_synchronize_stream(streams[i], gpu_indexes[i]); buffer.push_back(gpu_pbs_buffer); diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h index bdfdabdd5d..312a361a76 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h @@ -26,14 +26,12 @@ void cuda_convert_lwe_programmable_bootstrap_key_64( void scratch_cuda_programmable_bootstrap_amortized_32( void *stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); void scratch_cuda_programmable_bootstrap_amortized_64( void *stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -41,7 +39,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory); + uint32_t num_samples); void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -49,7 +47,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory); + uint32_t num_samples); void cleanup_cuda_programmable_bootstrap_amortized(void *stream, uint32_t gpu_index, @@ -58,14 +56,12 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream, void scratch_cuda_programmable_bootstrap_32( void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); void scratch_cuda_programmable_bootstrap_64( void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -73,7 +69,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory); + uint32_t num_samples); void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -81,18 +77,18 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory); + uint32_t num_samples); void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, int8_t **pbs_buffer); uint64_t get_buffer_size_programmable_bootstrap_amortized_64( uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory); + uint32_t input_lwe_ciphertext_count); uint64_t get_buffer_size_programmable_bootstrap_64( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory); + uint32_t input_lwe_ciphertext_count); } template @@ -155,7 +151,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) { template __host__ bool supports_distributed_shared_memory_on_classic_programmable_bootstrap( - uint32_t polynomial_size, uint32_t max_shared_memory); + uint32_t polynomial_size); template struct pbs_buffer; @@ -174,7 +170,7 @@ template struct pbs_buffer { this->pbs_variant = pbs_variant; - auto max_shared_memory = cuda_get_max_shared_memory(gpu_index); + auto max_shared_memory = cuda_get_max_shared_memory(0); if (allocate_gpu_memory) { switch (pbs_variant) { @@ -251,7 +247,7 @@ template struct pbs_buffer { bool supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size, max_shared_memory); + Torus>(polynomial_size); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc( @@ -310,10 +306,10 @@ template struct pbs_buffer { }; template -__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_cg( +__host__ uint64_t get_buffer_size_programmable_bootstrap_cg( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) { - + uint32_t input_lwe_ciphertext_count) { + int max_shared_memory = cuda_get_max_shared_memory(0); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_cg(polynomial_size); uint64_t partial_sm = @@ -339,8 +335,7 @@ template bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples, - uint32_t max_shared_memory); + uint32_t num_samples); template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( @@ -349,7 +344,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -358,7 +353,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); #if (CUDA_ARCH >= 900) template @@ -368,36 +363,32 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); #endif template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void scratch_cuda_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count, - uint32_t max_shared_memory); + uint32_t level_count); #ifdef __CUDACC__ __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size, diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h index 6d5d30781f..ce1617872c 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h @@ -8,7 +8,7 @@ extern "C" { bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory); + uint32_t num_samples); void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64( void *stream, uint32_t gpu_index, void *dest, void *src, @@ -19,8 +19,8 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64( void *stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t chunk_size = 0); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t chunk_size = 0); void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -28,8 +28,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory, - uint32_t lwe_chunk_size = 0); + uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size = 0); void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, uint32_t gpu_index, @@ -39,12 +38,12 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, template __host__ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( - uint32_t polynomial_size, uint32_t max_shared_memory); + uint32_t polynomial_size); template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count, uint32_t max_shared_memory); + uint32_t level_count); #if CUDA_ARCH >= 900 template @@ -52,8 +51,8 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size); template void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -63,7 +62,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); #endif template @@ -71,15 +70,15 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size = 0); template void scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size = 0); template void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -89,15 +88,15 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0); + uint32_t lwe_chunk_size = 0); template void scratch_cuda_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size = 0); template void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -107,7 +106,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0); + uint32_t lwe_chunk_size = 0); template __host__ __device__ uint64_t @@ -314,7 +313,6 @@ template struct pbs_buffer { template __host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, - uint32_t polynomial_size, - uint32_t max_shared_memory); + uint32_t polynomial_size); #endif // CUDA_MULTI_BIT_H diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 4469e71f3b..13642d9ee0 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -177,8 +177,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + grouping_factor, num_radix_blocks, pbs_type); } else { /// Make sure all data that should be on GPU 0 is indeed there cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -204,8 +203,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, - pbs_level, grouping_factor, num_radix_blocks, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + pbs_level, grouping_factor, num_radix_blocks, pbs_type); /// Copy data back to GPU 0 and release vecs multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count, @@ -270,8 +268,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + grouping_factor, num_radix_blocks, pbs_type); } else { cuda_synchronize_stream(streams[0], gpu_indexes[0]); multi_gpu_scatter_lwe_async( @@ -293,8 +290,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, - pbs_level, grouping_factor, num_radix_blocks, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + pbs_level, grouping_factor, num_radix_blocks, pbs_type); /// Copy data back to GPU 0 and release vecs multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count, @@ -696,8 +692,7 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer, params.glwe_dimension, params.small_lwe_dimension, params.polynomial_size, params.pbs_base_log, params.pbs_level, - params.grouping_factor, 2, cuda_get_max_shared_memory(gpu_indexes[0]), - params.pbs_type); + params.grouping_factor, 2, params.pbs_type); cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector, big_lwe_size * sizeof(Torus), streams[0], diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 00b5e23a8d..fc81a409c5 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -357,7 +357,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, - mem_ptr->params.grouping_factor, total_count, max_shared_memory, + mem_ptr->params.grouping_factor, total_count, mem_ptr->params.pbs_type); } else { cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -405,7 +405,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, - mem_ptr->params.grouping_factor, total_count, max_shared_memory, + mem_ptr->params.grouping_factor, total_count, mem_ptr->params.pbs_type); multi_gpu_gather_lwe_async( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh index 8ee59ff3a9..966be0464d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh @@ -115,7 +115,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); switch (polynomial_size) { case 256: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -133,7 +133,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 512: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -151,7 +151,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 1024: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -169,7 +169,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 2048: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -187,7 +187,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 4096: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -205,7 +205,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 8192: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -223,7 +223,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 16384: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -268,7 +268,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, double2 *buffer; switch (polynomial_size) { case 256: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -289,7 +289,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, } break; case 512: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -310,7 +310,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, } break; case 1024: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -331,7 +331,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, } break; case 2048: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -352,7 +352,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, } break; case 4096: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -373,7 +373,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, } break; case 8192: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -394,7 +394,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, } break; case 16384: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index dacddb7bcc..f8f12d4403 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -127,8 +127,7 @@ void execute_pbs_async( std::vector pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - PBS_TYPE pbs_type) { + uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type) { switch (sizeof(Torus)) { case sizeof(uint32_t): // 32 bits @@ -160,8 +159,7 @@ void execute_pbs_async( current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes, current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, num_inputs_on_gpu, - max_shared_memory); + polynomial_size, base_log, level_count, num_inputs_on_gpu); } break; default: @@ -200,7 +198,7 @@ void execute_pbs_async( current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_inputs_on_gpu, max_shared_memory); + num_inputs_on_gpu); } break; case CLASSICAL: @@ -228,8 +226,7 @@ void execute_pbs_async( current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes, current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, num_inputs_on_gpu, - max_shared_memory); + polynomial_size, base_log, level_count, num_inputs_on_gpu); } break; default: @@ -247,8 +244,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, - uint32_t max_shared_memory, PBS_TYPE pbs_type, + uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type, bool allocate_gpu_memory) { switch (sizeof(Torus)) { case sizeof(uint32_t): @@ -259,8 +255,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index, case CLASSICAL: scratch_cuda_programmable_bootstrap_32( stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size, - level_count, input_lwe_ciphertext_count, max_shared_memory, - allocate_gpu_memory); + level_count, input_lwe_ciphertext_count, allocate_gpu_memory); break; default: PANIC("Error: unsupported cuda PBS type.") @@ -275,13 +270,12 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index, scratch_cuda_multi_bit_programmable_bootstrap_64( stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, grouping_factor, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case CLASSICAL: scratch_cuda_programmable_bootstrap_64( stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size, - level_count, input_lwe_ciphertext_count, max_shared_memory, - allocate_gpu_memory); + level_count, input_lwe_ciphertext_count, allocate_gpu_memory); break; default: PANIC("Error: unsupported cuda PBS type.") diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu index 084756a374..5891732459 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu @@ -5,10 +5,9 @@ */ uint64_t get_buffer_size_programmable_bootstrap_amortized_64( uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) { + uint32_t input_lwe_ciphertext_count) { return get_buffer_size_programmable_bootstrap_amortized( - glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory); + glwe_dimension, polynomial_size, input_lwe_ciphertext_count); } /* @@ -20,51 +19,50 @@ uint64_t get_buffer_size_programmable_bootstrap_amortized_64( void scratch_cuda_programmable_bootstrap_amortized_32( void *stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { switch (polynomial_size) { case 256: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 512: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 1024: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 2048: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 4096: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 8192: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 16384: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " @@ -81,51 +79,50 @@ void scratch_cuda_programmable_bootstrap_amortized_32( void scratch_cuda_programmable_bootstrap_amortized_64( void *stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { switch (polynomial_size) { case 256: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 512: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 1024: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 2048: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 4096: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 8192: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 16384: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " @@ -143,7 +140,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t num_samples) { if (base_log > 32) PANIC("Cuda error (amortized PBS): base log should be > number of bits in " @@ -157,7 +154,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 512: host_programmable_bootstrap_amortized>( @@ -166,7 +163,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 1024: host_programmable_bootstrap_amortized>( @@ -175,7 +172,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 2048: host_programmable_bootstrap_amortized>( @@ -184,7 +181,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 4096: host_programmable_bootstrap_amortized>( @@ -193,7 +190,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 8192: host_programmable_bootstrap_amortized>( @@ -202,7 +199,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 16384: host_programmable_bootstrap_amortized>( @@ -211,7 +208,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " @@ -254,8 +251,6 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( * - level_count: number of decomposition levels in the gadget matrix (~4) * - num_samples: number of encrypted input messages * used - * - 'max_shared_memory' maximum amount of shared memory to be used inside - * device functions * * This function calls a wrapper to a device kernel that performs the * bootstrapping: @@ -283,7 +278,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t num_samples) { if (base_log > 64) PANIC("Cuda error (amortized PBS): base log should be > number of bits in " @@ -297,7 +292,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 512: host_programmable_bootstrap_amortized>( @@ -306,7 +301,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 1024: host_programmable_bootstrap_amortized>( @@ -315,7 +310,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 2048: host_programmable_bootstrap_amortized>( @@ -324,7 +319,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 4096: host_programmable_bootstrap_amortized>( @@ -333,7 +328,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 8192: host_programmable_bootstrap_amortized>( @@ -342,7 +337,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 16384: host_programmable_bootstrap_amortized>( @@ -351,7 +346,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index ee9fb4e1b9..7018f635ea 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -232,10 +232,11 @@ get_buffer_size_partial_sm_programmable_bootstrap_amortized( } template -__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_amortized( +__host__ uint64_t get_buffer_size_programmable_bootstrap_amortized( uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) { + uint32_t input_lwe_ciphertext_count) { + int max_shared_memory = cuda_get_max_shared_memory(0); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_amortized( polynomial_size, glwe_dimension); @@ -257,16 +258,15 @@ template __host__ void scratch_programmable_bootstrap_amortized( cudaStream_t stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_amortized( polynomial_size, glwe_dimension); uint64_t partial_sm = get_buffer_size_partial_sm_programmable_bootstrap_amortized( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) { cudaFuncSetAttribute( device_programmable_bootstrap_amortized, @@ -285,8 +285,7 @@ __host__ void scratch_programmable_bootstrap_amortized( if (allocate_gpu_memory) { uint64_t buffer_size = get_buffer_size_programmable_bootstrap_amortized( - glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory); + glwe_dimension, polynomial_size, input_lwe_ciphertext_count); *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index); check_cuda_error(cudaGetLastError()); } @@ -299,9 +298,8 @@ __host__ void host_programmable_bootstrap_amortized( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) { + uint32_t input_lwe_ciphertext_count) { - cudaSetDevice(gpu_index); uint64_t SM_FULL = get_buffer_size_full_sm_programmable_bootstrap_amortized( polynomial_size, glwe_dimension); @@ -314,6 +312,9 @@ __host__ void host_programmable_bootstrap_amortized( uint64_t DM_FULL = SM_FULL; + int max_shared_memory = cuda_get_max_shared_memory(0); + cudaSetDevice(gpu_index); + // Create a 1-dimensional grid of threads // where each block handles 1 sample and each thread // handles opt polynomial coefficients diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 537eb5c66d..94d383a2f3 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -161,15 +161,14 @@ __host__ void scratch_programmable_bootstrap_cg( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_cg(polynomial_size); uint64_t partial_sm = get_buffer_size_partial_sm_programmable_bootstrap_cg( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) { check_cuda_error(cudaFuncSetAttribute( device_programmable_bootstrap_cg, @@ -203,9 +202,7 @@ __host__ void host_programmable_bootstrap_cg( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t max_shared_memory) { - cudaSetDevice(gpu_index); + uint32_t level_count, uint32_t input_lwe_ciphertext_count) { // With SM each block corresponds to either the mask or body, no need to // duplicate data for each @@ -216,6 +213,9 @@ __host__ void host_programmable_bootstrap_cg( get_buffer_size_partial_sm_programmable_bootstrap_cg( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); + cudaSetDevice(gpu_index); + uint64_t full_dm = full_sm; uint64_t partial_dm = full_dm - partial_sm; @@ -265,8 +265,7 @@ __host__ void host_programmable_bootstrap_cg( // Verify if the grid size satisfies the cooperative group constraints template __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( - int glwe_dimension, int level_count, int num_samples, - uint32_t max_shared_memory) { + int glwe_dimension, int level_count, int num_samples) { // If Cooperative Groups is not supported, no need to check anything else if (!cuda_check_support_cooperative_groups()) @@ -286,6 +285,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples; int max_active_blocks_per_sm; + int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, @@ -311,37 +311,30 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size( // Verify if the grid size satisfies the cooperative group constraints template __host__ bool supports_cooperative_groups_on_programmable_bootstrap( - int glwe_dimension, int polynomial_size, int level_count, int num_samples, - uint32_t max_shared_memory) { + int glwe_dimension, int polynomial_size, int level_count, int num_samples) { switch (polynomial_size) { case 256: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples); case 512: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples); case 1024: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples); case 2048: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples); case 4096: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples); case 8192: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples); case 16384: return verify_cuda_programmable_bootstrap_cg_grid_size< - Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, + num_samples); default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " "Supported N's are powers of two" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index 2da2e4581c..4f9e96c4f4 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -156,8 +156,7 @@ template __host__ __device__ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap( uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t grouping_factor, uint32_t lwe_chunk_size, - uint32_t max_shared_memory) { + uint32_t grouping_factor, uint32_t lwe_chunk_size) { uint64_t buffer_size = 0; buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count * @@ -177,8 +176,8 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size = 0) { uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( @@ -190,6 +189,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap( get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_keybundle, @@ -243,9 +243,8 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap( } if (!lwe_chunk_size) - lwe_chunk_size = - get_lwe_chunk_size(gpu_index, input_lwe_ciphertext_count, - polynomial_size, max_shared_memory); + lwe_chunk_size = get_lwe_chunk_size( + gpu_index, input_lwe_ciphertext_count, polynomial_size); *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG, @@ -260,9 +259,8 @@ __host__ void execute_cg_external_product_loop( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) { + uint32_t lwe_chunk_size, int lwe_offset) { - cudaSetDevice(gpu_index); uint64_t full_dm = get_buffer_size_full_sm_cg_multibit_programmable_bootstrap( polynomial_size); @@ -271,6 +269,9 @@ __host__ void execute_cg_external_product_loop( polynomial_size); uint64_t no_dm = 0; + int max_shared_memory = cuda_get_max_shared_memory(0); + cudaSetDevice(gpu_index); + uint32_t keybundle_size_per_input = lwe_chunk_size * level_count * (glwe_dimension + 1) * (glwe_dimension + 1) * (polynomial_size / 2); @@ -336,12 +337,11 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) { - cudaSetDevice(gpu_index); + uint32_t lwe_chunk_size = 0) { if (!lwe_chunk_size) - lwe_chunk_size = get_lwe_chunk_size( - gpu_index, num_samples, polynomial_size, max_shared_memory); + lwe_chunk_size = get_lwe_chunk_size(gpu_index, num_samples, + polynomial_size); for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor); lwe_offset += lwe_chunk_size) { @@ -350,24 +350,21 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( execute_compute_keybundle( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset); + grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset); // Accumulate execute_cg_external_product_loop( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, lwe_chunk_size, - max_shared_memory, lwe_offset); + grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset); } } // Verify if the grid size satisfies the cooperative group constraints template __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( - int glwe_dimension, int level_count, int num_samples, - uint32_t max_shared_memory) { + int glwe_dimension, int level_count, int num_samples) { // If Cooperative Groups is not supported, no need to check anything else if (!cuda_check_support_cooperative_groups()) @@ -387,6 +384,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples; int max_active_blocks_per_sm; + int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm_cg_accumulate) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, @@ -417,37 +415,30 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size( // group constraints template __host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap( - int glwe_dimension, int polynomial_size, int level_count, int num_samples, - uint32_t max_shared_memory) { + int glwe_dimension, int polynomial_size, int level_count, int num_samples) { switch (polynomial_size) { case 256: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples); case 512: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples); case 1024: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples); case 2048: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples); case 4096: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples); case 8192: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples); case 16384: return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size< - Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples, - max_shared_memory); + Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, + num_samples); default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " "N's are powers of two" diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index 09a689405c..90403cbb81 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -8,54 +8,46 @@ template bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples, - uint32_t max_shared_memory) { + uint32_t num_samples) { return supports_cooperative_groups_on_programmable_bootstrap( - glwe_dimension, polynomial_size, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, level_count, num_samples); } template -bool has_support_to_cuda_programmable_bootstrap_tbc( - uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count, uint32_t max_shared_memory) { +bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples, + uint32_t glwe_dimension, + uint32_t polynomial_size, + uint32_t level_count) { #if CUDA_ARCH >= 900 switch (polynomial_size) { case 256: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); case 512: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); case 1024: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); case 2048: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); case 4096: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); case 8192: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); case 16384: return supports_thread_block_clusters_on_classic_programmable_bootstrap< Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); default: PANIC("Cuda error (classical PBS): unsupported polynomial size. Supported " "N's are powers of two" @@ -71,51 +63,50 @@ template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { switch (polynomial_size) { case 256: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 512: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 1024: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 2048: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 4096: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 8192: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 16384: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " @@ -131,7 +122,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t level_count, uint32_t num_samples) { switch (polynomial_size) { case 256: @@ -139,56 +130,49 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 512: host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 1024: host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 2048: host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 4096: host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 8192: host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 16384: host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " @@ -203,69 +187,68 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( */ uint64_t get_buffer_size_programmable_bootstrap_64( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) { + uint32_t input_lwe_ciphertext_count) { if (has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory)) + input_lwe_ciphertext_count)) return get_buffer_size_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory); + input_lwe_ciphertext_count); else return get_buffer_size_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory); + input_lwe_ciphertext_count); } template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { switch (polynomial_size) { case 256: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 512: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 1024: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 2048: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 4096: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 8192: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 16384: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " @@ -278,51 +261,50 @@ template void scratch_cuda_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { switch (polynomial_size) { case 256: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 512: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 1024: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 2048: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 4096: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 8192: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 16384: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " @@ -340,31 +322,30 @@ void scratch_cuda_programmable_bootstrap( void scratch_cuda_programmable_bootstrap_32( void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { #if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, - level_count, max_shared_memory)) + level_count)) scratch_cuda_programmable_bootstrap_tbc( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); else #endif if (has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory)) + input_lwe_ciphertext_count)) scratch_cuda_programmable_bootstrap_cg( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); else scratch_cuda_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); } /* @@ -375,31 +356,30 @@ void scratch_cuda_programmable_bootstrap_32( void scratch_cuda_programmable_bootstrap_64( void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { #if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, - level_count, max_shared_memory)) + level_count)) scratch_cuda_programmable_bootstrap_tbc( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); else #endif if (has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory)) + input_lwe_ciphertext_count)) scratch_cuda_programmable_bootstrap_cg( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); else scratch_cuda_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); } template @@ -409,7 +389,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t level_count, uint32_t num_samples) { switch (polynomial_size) { case 256: @@ -417,56 +397,49 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 512: host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 1024: host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 2048: host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 4096: host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 8192: host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 16384: host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " @@ -482,7 +455,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t level_count, uint32_t num_samples) { switch (polynomial_size) { case 256: @@ -490,56 +463,49 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 512: host_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 1024: host_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 2048: host_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 4096: host_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 8192: host_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 16384: host_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. " @@ -556,7 +522,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t num_samples) { if (base_log > 32) PANIC("Cuda error (classical PBS): base log should be > number of bits " @@ -576,8 +542,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, base_log, level_count, num_samples); break; #else PANIC("Cuda error (PBS): TBC pbs is not supported.") @@ -591,8 +556,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -603,8 +567,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, base_log, level_count, num_samples); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -647,8 +610,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( * - base_log: log base used for the gadget matrix - B = 2^base_log (~8) * - level_count: number of decomposition levels in the gadget matrix (~4) * - num_samples: number of encrypted input messages - * - 'max_shared_memory' maximum amount of shared memory to be used inside - * device functions * * This function calls a wrapper to a device kernel that performs the * bootstrapping: @@ -680,7 +641,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t num_samples) { if (base_log > 64) PANIC("Cuda error (classical PBS): base log should be > number of bits " "in the ciphertext representation (64)"); @@ -699,8 +660,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, base_log, level_count, num_samples); break; #else PANIC("Cuda error (PBS): TBC pbs is not supported.") @@ -714,8 +674,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case PBS_VARIANT::DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -726,8 +685,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, base_log, level_count, num_samples); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -746,7 +704,7 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, template bool has_support_to_cuda_programmable_bootstrap_cg( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory); + uint32_t num_samples); template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, @@ -755,7 +713,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint64_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, @@ -764,20 +722,18 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint64_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void scratch_cuda_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, @@ -786,7 +742,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint32_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, @@ -795,27 +751,25 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint32_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void scratch_cuda_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template bool has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count, uint32_t max_shared_memory); + uint32_t level_count); template bool has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count, uint32_t max_shared_memory); + uint32_t level_count); #if CUDA_ARCH >= 900 template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( @@ -825,7 +779,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint32_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t *lwe_output_indexes, uint64_t *lut_vector, @@ -833,17 +787,15 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint64_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index da33298a2f..3b00c87fd6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -230,7 +230,7 @@ __global__ void device_programmable_bootstrap_step_two( template __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) { + uint32_t input_lwe_ciphertext_count) { uint64_t full_sm_step_one = get_buffer_size_full_sm_programmable_bootstrap_step_one( @@ -246,6 +246,7 @@ __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap( uint64_t full_dm = full_sm_step_one; uint64_t device_mem = 0; + int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm) { device_mem = full_dm * input_lwe_ciphertext_count * level_count * (glwe_dimension + 1); @@ -273,10 +274,8 @@ __host__ void scratch_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); uint64_t full_sm_step_one = get_buffer_size_full_sm_programmable_bootstrap_step_one( polynomial_size); @@ -286,6 +285,8 @@ __host__ void scratch_programmable_bootstrap( uint64_t partial_sm = get_buffer_size_partial_sm_programmable_bootstrap(polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); + // Configure step one if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) { check_cuda_error(cudaFuncSetAttribute( @@ -330,16 +331,18 @@ __host__ void scratch_programmable_bootstrap( } template -__host__ void execute_step_one( - cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, - double2 *bootstrapping_key, Torus *global_accumulator, - double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, int8_t *d_mem, - uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm, - uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { - +__host__ void +execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, + Torus *lut_vector_indexes, Torus *lwe_array_in, + Torus *lwe_input_indexes, double2 *bootstrapping_key, + Torus *global_accumulator, double2 *global_accumulator_fft, + uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, int8_t *d_mem, + int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm, + uint64_t full_sm, uint64_t full_dm) { + + int max_shared_memory = cuda_get_max_shared_memory(0); cudaSetDevice(gpu_index); int thds = polynomial_size / params::opt; dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count); @@ -370,16 +373,18 @@ __host__ void execute_step_one( } template -__host__ void execute_step_two( - cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - double2 *bootstrapping_key, Torus *global_accumulator, - double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, int8_t *d_mem, - uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm, - uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { - +__host__ void +execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, + Torus *lwe_output_indexes, Torus *lut_vector, + Torus *lut_vector_indexes, double2 *bootstrapping_key, + Torus *global_accumulator, double2 *global_accumulator_fft, + uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, int8_t *d_mem, + int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm, + uint64_t full_sm, uint64_t full_dm) { + + int max_shared_memory = cuda_get_max_shared_memory(0); cudaSetDevice(gpu_index); int thds = polynomial_size / params::opt; dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1); @@ -418,8 +423,7 @@ __host__ void host_programmable_bootstrap( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t max_shared_memory) { + uint32_t level_count, uint32_t input_lwe_ciphertext_count) { cudaSetDevice(gpu_index); // With SM each block corresponds to either the mask or body, no need to @@ -448,16 +452,14 @@ __host__ void host_programmable_bootstrap( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, d_mem, - max_shared_memory, i, partial_sm, partial_dm_step_one, full_sm_step_one, - full_dm_step_one); + glwe_dimension, polynomial_size, base_log, level_count, d_mem, i, + partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one); execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, d_mem, - max_shared_memory, i, partial_sm, partial_dm_step_two, full_sm_step_two, - full_dm_step_two); + glwe_dimension, polynomial_size, base_log, level_count, d_mem, i, + partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index 4cc025c139..2e2e30141b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -9,53 +9,45 @@ bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t num_samples) { return supports_cooperative_groups_on_multibit_programmable_bootstrap< - uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples, - max_shared_memory); + uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples); } template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count, uint32_t max_shared_memory) { + uint32_t level_count) { #if CUDA_ARCH >= 900 switch (polynomial_size) { case 256: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<256>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); case 512: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<512>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); case 1024: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); case 2048: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); case 4096: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); case 8192: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); case 16384: return supports_thread_block_clusters_on_multibit_programmable_bootstrap< Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension, - polynomial_size, level_count, - max_shared_memory); + polynomial_size, level_count); default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " "N's are powers of two" @@ -74,7 +66,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size) { + uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -87,7 +79,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 512: host_cg_multi_bit_programmable_bootstrap>( @@ -95,7 +87,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 1024: host_cg_multi_bit_programmable_bootstrap>( @@ -103,7 +95,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 2048: host_cg_multi_bit_programmable_bootstrap>( @@ -111,7 +103,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 4096: host_cg_multi_bit_programmable_bootstrap>( @@ -119,7 +111,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 8192: host_cg_multi_bit_programmable_bootstrap>( @@ -127,7 +119,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 16384: host_cg_multi_bit_programmable_bootstrap>( @@ -135,7 +127,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -152,7 +144,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size) { + uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -165,7 +157,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 512: host_multi_bit_programmable_bootstrap>( @@ -173,7 +165,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 1024: host_multi_bit_programmable_bootstrap>( @@ -181,7 +173,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 2048: host_multi_bit_programmable_bootstrap>( @@ -189,7 +181,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 4096: host_multi_bit_programmable_bootstrap>( @@ -197,7 +189,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 8192: host_multi_bit_programmable_bootstrap>( @@ -205,7 +197,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 16384: host_multi_bit_programmable_bootstrap>( @@ -213,7 +205,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -228,8 +220,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory, - uint32_t lwe_chunk_size) { + uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size) { pbs_buffer *buffer = (pbs_buffer *)mem_ptr; @@ -246,7 +237,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; #else PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.") @@ -261,7 +252,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case PBS_VARIANT::DEFAULT: cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -273,7 +264,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.") @@ -284,51 +275,51 @@ template void scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size) { switch (polynomial_size) { case 256: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; case 512: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; case 1024: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; case 2048: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; case 4096: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; case 8192: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; case 16384: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -342,58 +333,58 @@ void scratch_cuda_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size) { switch (polynomial_size) { case 256: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 512: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 1024: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 2048: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 4096: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 8192: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 16384: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -406,34 +397,32 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64( void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, - uint32_t max_shared_memory, bool allocate_gpu_memory, - uint32_t lwe_chunk_size) { + bool allocate_gpu_memory, uint32_t lwe_chunk_size) { #if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, - level_count, max_shared_memory)) + level_count)) scratch_cuda_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - grouping_factor, input_lwe_ciphertext_count, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory, + lwe_chunk_size); else #endif if (supports_cooperative_groups_on_multibit_programmable_bootstrap< uint64_t>(glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory)) + input_lwe_ciphertext_count)) scratch_cuda_cg_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory, - lwe_chunk_size); + input_lwe_ciphertext_count, allocate_gpu_memory, lwe_chunk_size); else scratch_cuda_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - grouping_factor, input_lwe_ciphertext_count, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory, + lwe_chunk_size); } void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, @@ -456,14 +445,15 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, */ template __host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, - uint32_t polynomial_size, - uint32_t max_shared_memory) { + uint32_t polynomial_size) { uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( polynomial_size); int max_blocks_per_sm; + int max_shared_memory = cuda_get_max_shared_memory(0); + cudaSetDevice(gpu_index); if (max_shared_memory < full_sm_keybundle) cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_blocks_per_sm, @@ -513,8 +503,7 @@ template void scratch_cuda_multi_bit_programmable_bootstrap( pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, - uint32_t max_shared_memory, bool allocate_gpu_memory, - uint32_t lwe_chunk_size); + bool allocate_gpu_memory, uint32_t lwe_chunk_size); template void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -525,14 +514,14 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); template void scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size); template void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -543,12 +532,12 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count, uint32_t max_shared_memory); + uint32_t level_count); #if (CUDA_ARCH >= 900) template @@ -556,58 +545,58 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size) { switch (polynomial_size) { case 256: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 512: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 1024: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 2048: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 4096: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 8192: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 16384: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -623,7 +612,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size) { + uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -636,7 +625,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 512: host_tbc_multi_bit_programmable_bootstrap>( @@ -644,7 +633,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 1024: host_tbc_multi_bit_programmable_bootstrap>( @@ -652,7 +641,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 2048: host_tbc_multi_bit_programmable_bootstrap>( @@ -660,7 +649,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 4096: host_tbc_multi_bit_programmable_bootstrap>( @@ -668,7 +657,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 8192: host_tbc_multi_bit_programmable_bootstrap>( @@ -676,7 +665,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 16384: host_tbc_multi_bit_programmable_bootstrap>( @@ -684,7 +673,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -697,8 +686,8 @@ template void scratch_cuda_tbc_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size); template void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -709,5 +698,5 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index b409f935ec..f913280eb9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -376,9 +376,9 @@ __host__ void scratch_multi_bit_programmable_bootstrap( pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor, - uint32_t max_shared_memory, bool allocate_gpu_memory, - uint32_t lwe_chunk_size = 0) { + bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) { + int max_shared_memory = cuda_get_max_shared_memory(0); uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( polynomial_size); @@ -470,9 +470,8 @@ __host__ void scratch_multi_bit_programmable_bootstrap( } if (!lwe_chunk_size) - lwe_chunk_size = - get_lwe_chunk_size(gpu_index, input_lwe_ciphertext_count, - polynomial_size, max_shared_memory); + lwe_chunk_size = get_lwe_chunk_size( + gpu_index, input_lwe_ciphertext_count, polynomial_size); *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT, @@ -486,9 +485,8 @@ __host__ void execute_compute_keybundle( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t max_shared_memory, uint32_t lwe_chunk_size, int lwe_offset) { + uint32_t lwe_chunk_size, int lwe_offset) { - cudaSetDevice(gpu_index); uint32_t chunk_size = std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); @@ -499,6 +497,8 @@ __host__ void execute_compute_keybundle( uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); + cudaSetDevice(gpu_index); auto d_mem = buffer->d_mem_keybundle; auto keybundle_fft = buffer->keybundle_fft; @@ -526,22 +526,23 @@ __host__ void execute_compute_keybundle( } template -__host__ void -execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, - Torus *lwe_input_indexes, pbs_buffer *buffer, - uint32_t num_samples, uint32_t lwe_dimension, - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, - uint32_t max_shared_memory, int j, int lwe_offset) { +__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, + Torus *lut_vector, Torus *lut_vector_indexes, + Torus *lwe_array_in, Torus *lwe_input_indexes, + pbs_buffer *buffer, + uint32_t num_samples, uint32_t lwe_dimension, + uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, int j, int lwe_offset) { - cudaSetDevice(gpu_index); uint64_t full_sm_accumulate_step_one = get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one( polynomial_size); uint64_t partial_sm_accumulate_step_one = get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one< Torus>(polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); + cudaSetDevice(gpu_index); // auto d_mem = buffer->d_mem_acc_step_one; @@ -581,19 +582,20 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, } template -__host__ void -execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, - pbs_buffer *buffer, uint32_t num_samples, - uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, int32_t grouping_factor, - uint32_t level_count, uint32_t max_shared_memory, int j, - int lwe_offset, uint32_t lwe_chunk_size) { +__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index, + Torus *lwe_array_out, Torus *lwe_output_indexes, + pbs_buffer *buffer, + uint32_t num_samples, uint32_t lwe_dimension, + uint32_t glwe_dimension, + uint32_t polynomial_size, + int32_t grouping_factor, uint32_t level_count, + int j, int lwe_offset, uint32_t lwe_chunk_size) { - cudaSetDevice(gpu_index); uint64_t full_sm_accumulate_step_two = get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); + cudaSetDevice(gpu_index); auto d_mem = buffer->d_mem_acc_step_two; auto keybundle_fft = buffer->keybundle_fft; @@ -630,13 +632,12 @@ __host__ void host_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) { - cudaSetDevice(gpu_index); + uint32_t lwe_chunk_size = 0) { // If a chunk size is not passed to this function, select one. if (!lwe_chunk_size) - lwe_chunk_size = get_lwe_chunk_size( - gpu_index, num_samples, polynomial_size, max_shared_memory); + lwe_chunk_size = get_lwe_chunk_size(gpu_index, num_samples, + polynomial_size); for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor); lwe_offset += lwe_chunk_size) { @@ -645,8 +646,7 @@ __host__ void host_multi_bit_programmable_bootstrap( execute_compute_keybundle( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset); + grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset); // Accumulate uint32_t chunk_size = std::min( lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); @@ -654,14 +654,12 @@ __host__ void host_multi_bit_programmable_bootstrap( execute_step_one( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, max_shared_memory, j, - lwe_offset); + polynomial_size, base_log, level_count, j, lwe_offset); execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, level_count, max_shared_memory, j, lwe_offset, - lwe_chunk_size); + grouping_factor, level_count, j, lwe_offset, lwe_chunk_size); } } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index f63f22748e..a32a88c0e7 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -165,13 +165,11 @@ __host__ void scratch_programmable_bootstrap_tbc( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { - cudaSetDevice(gpu_index); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { bool supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size, max_shared_memory); + Torus>(polynomial_size); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc( polynomial_size); @@ -183,6 +181,7 @@ __host__ void scratch_programmable_bootstrap_tbc( minimum_sm_tbc = get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory >= full_sm + minimum_sm_tbc) { check_cuda_error(cudaFuncSetAttribute( @@ -226,13 +225,11 @@ __host__ void host_programmable_bootstrap_tbc( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t max_shared_memory) { - cudaSetDevice(gpu_index); + uint32_t level_count, uint32_t input_lwe_ciphertext_count) { auto supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size, max_shared_memory); + Torus>(polynomial_size); // With SM each block corresponds to either the mask or body, no need to // duplicate data for each @@ -247,6 +244,9 @@ __host__ void host_programmable_bootstrap_tbc( get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); + cudaSetDevice(gpu_index); + uint64_t full_dm = full_sm; uint64_t partial_dm = full_dm - partial_sm; @@ -306,8 +306,7 @@ __host__ void host_programmable_bootstrap_tbc( // Verify if the grid size satisfies the cooperative group constraints template __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( - int glwe_dimension, int level_count, int num_samples, - uint32_t max_shared_memory) { + int glwe_dimension, int level_count, int num_samples) { // If Cooperative Groups is not supported, no need to check anything else if (!cuda_check_support_cooperative_groups()) @@ -321,12 +320,12 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( get_buffer_size_partial_sm_programmable_bootstrap_tbc( params::degree); + int max_shared_memory = cuda_get_max_shared_memory(0); int thds = params::degree / params::opt; // Get the maximum number of active blocks per streaming multiprocessors int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples; int max_active_blocks_per_sm; - if (max_shared_memory < partial_sm) { cudaOccupancyMaxActiveBlocksPerMultiprocessor( &max_active_blocks_per_sm, @@ -353,11 +352,12 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size( template __host__ bool supports_distributed_shared_memory_on_classic_programmable_bootstrap( - uint32_t polynomial_size, uint32_t max_shared_memory) { + uint32_t polynomial_size) { uint64_t minimum_sm = get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < minimum_sm) { // If we cannot store a single polynomial in a block shared memory we cannot // use TBC @@ -370,7 +370,7 @@ supports_distributed_shared_memory_on_classic_programmable_bootstrap( template __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count, uint32_t max_shared_memory) { + uint32_t level_count) { if (!cuda_check_support_thread_block_clusters() || num_samples > 128) return false; @@ -382,7 +382,7 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( polynomial_size); uint64_t minimum_sm_tbc = 0; if (supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size, max_shared_memory)) + Torus>(polynomial_size)) minimum_sm_tbc = get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( polynomial_size); @@ -405,6 +405,7 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap( * case and it will fail if we try. Thus, since level_count * * (glwe_dimension+1) is usually smaller than 8 at this moment, we will * disable cudaFuncAttributeNonPortableClusterSizeAllowed */ + int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm + minimum_sm_tbc) { check_cuda_error(cudaFuncSetAttribute( device_programmable_bootstrap_tbc, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 57378b5127..7e13451fd1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -172,14 +172,11 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap( pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor, - uint32_t max_shared_memory, bool allocate_gpu_memory, - uint32_t lwe_chunk_size = 0) { - - cudaSetDevice(gpu_index); + bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) { bool supports_dsm = supports_distributed_shared_memory_on_multibit_programmable_bootstrap< - Torus>(polynomial_size, max_shared_memory); + Torus>(polynomial_size); uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( @@ -196,6 +193,8 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap( get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); + if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_keybundle, @@ -254,9 +253,8 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap( } if (!lwe_chunk_size) - lwe_chunk_size = - get_lwe_chunk_size(gpu_index, input_lwe_ciphertext_count, - polynomial_size, max_shared_memory); + lwe_chunk_size = get_lwe_chunk_size( + gpu_index, input_lwe_ciphertext_count, polynomial_size); *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC, @@ -271,12 +269,11 @@ __host__ void execute_tbc_external_product_loop( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) { + uint32_t lwe_chunk_size, int lwe_offset) { - cudaSetDevice(gpu_index); auto supports_dsm = supports_distributed_shared_memory_on_multibit_programmable_bootstrap< - Torus>(polynomial_size, max_shared_memory); + Torus>(polynomial_size); uint64_t full_dm = get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap( @@ -290,6 +287,9 @@ __host__ void execute_tbc_external_product_loop( get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); + cudaSetDevice(gpu_index); + uint32_t keybundle_size_per_input = lwe_chunk_size * level_count * (glwe_dimension + 1) * (glwe_dimension + 1) * (polynomial_size / 2); @@ -365,12 +365,12 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) { + uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); if (!lwe_chunk_size) - lwe_chunk_size = get_lwe_chunk_size( - gpu_index, num_samples, polynomial_size, max_shared_memory); + lwe_chunk_size = get_lwe_chunk_size(gpu_index, num_samples, + polynomial_size); for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor); lwe_offset += lwe_chunk_size) { @@ -379,27 +379,26 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( execute_compute_keybundle( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset); + grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset); // Accumulate execute_tbc_external_product_loop( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, lwe_chunk_size, - max_shared_memory, lwe_offset); + grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset); } } template __host__ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( - uint32_t polynomial_size, uint32_t max_shared_memory) { + uint32_t polynomial_size) { uint64_t minimum_sm = get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory <= minimum_sm) { // If we cannot store a single polynomial in a block shared memory we // cannot use TBC @@ -412,7 +411,7 @@ supports_distributed_shared_memory_on_multibit_programmable_bootstrap( template __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t level_count, uint32_t max_shared_memory) { + uint32_t level_count) { if (!cuda_check_support_thread_block_clusters()) return false; @@ -425,7 +424,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( polynomial_size); uint64_t minimum_sm_tbc_accumulate = 0; if (supports_distributed_shared_memory_on_multibit_programmable_bootstrap< - Torus>(polynomial_size, max_shared_memory)) + Torus>(polynomial_size)) minimum_sm_tbc_accumulate = get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( polynomial_size); @@ -448,6 +447,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( * case and it will fail if we try. Thus, since level_count * * (glwe_dimension+1) is usually smaller than 8 at this moment, we will * disable cudaFuncAttributeNonPortableClusterSizeAllowed */ + int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { check_cuda_error(cudaFuncSetAttribute( @@ -487,5 +487,5 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap( template __host__ bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap( - uint32_t polynomial_size, uint32_t max_shared_memory); + uint32_t polynomial_size); #endif // FASTMULTIBIT_PBS_H diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp index 6fc06098da..d926486e67 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp @@ -174,7 +174,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) (benchmark::State &st) { if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, - pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) { + pbs_level)) { st.SkipWithError("Configuration not supported for tbc operation"); return; } @@ -182,8 +182,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) scratch_cuda_tbc_multi_bit_programmable_bootstrap( stream, (pbs_buffer **)&buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_level, grouping_factor, - input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index), - true, chunk_size); + input_lwe_ciphertext_count, true, chunk_size); for (auto _ : st) { // Execute PBS @@ -192,8 +191,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, pbs_base_log, - pbs_level, input_lwe_ciphertext_count, - cuda_get_max_shared_memory(stream->gpu_index), chunk_size); + pbs_level, input_lwe_ciphertext_count, chunk_size); cuda_synchronize_stream(stream); } @@ -205,7 +203,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) (benchmark::State &st) { if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit( glwe_dimension, polynomial_size, pbs_level, - input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) { + input_lwe_ciphertext_count)) { st.SkipWithError("Configuration not supported for fast operation"); return; } @@ -213,7 +211,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) scratch_cuda_cg_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, - cuda_get_max_shared_memory(gpu_index), true, chunk_size); + true, chunk_size); for (auto _ : st) { // Execute PBS @@ -222,8 +220,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, - pbs_base_log, pbs_level, input_lwe_ciphertext_count, - cuda_get_max_shared_memory(gpu_index), chunk_size); + pbs_base_log, pbs_level, input_lwe_ciphertext_count, chunk_size); cuda_synchronize_stream(stream, gpu_index); } @@ -235,8 +232,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit) scratch_cuda_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)&buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_level, - grouping_factor, input_lwe_ciphertext_count, - cuda_get_max_shared_memory(gpu_index), true, chunk_size); + grouping_factor, input_lwe_ciphertext_count, true, chunk_size); for (auto _ : st) { // Execute PBS @@ -245,8 +241,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit) d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, - pbs_base_log, pbs_level, input_lwe_ciphertext_count, - cuda_get_max_shared_memory(gpu_index), chunk_size); + pbs_base_log, pbs_level, input_lwe_ciphertext_count, chunk_size); cuda_synchronize_stream(stream, gpu_index); } @@ -258,15 +253,14 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC) (benchmark::State &st) { if (!has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, - pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) { + pbs_level)) { st.SkipWithError("Configuration not supported for tbc operation"); return; } scratch_cuda_programmable_bootstrap_tbc( stream, (pbs_buffer **)&buffer, glwe_dimension, - polynomial_size, pbs_level, input_lwe_ciphertext_count, - cuda_get_max_shared_memory(stream->gpu_index), true); + polynomial_size, pbs_level, input_lwe_ciphertext_count, true); for (auto _ : st) { // Execute PBS @@ -277,8 +271,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC) (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, - input_lwe_ciphertext_count, - cuda_get_max_shared_memory(stream->gpu_index)); + input_lwe_ciphertext_count); cuda_synchronize_stream(stream); } @@ -290,7 +283,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS) (benchmark::State &st) { if (!has_support_to_cuda_programmable_bootstrap_cg( glwe_dimension, polynomial_size, pbs_level, - input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) { + input_lwe_ciphertext_count)) { st.SkipWithError("Configuration not supported for fast operation"); return; } @@ -298,7 +291,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS) scratch_cuda_programmable_bootstrap_cg( stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, - cuda_get_max_shared_memory(gpu_index), true); + true); for (auto _ : st) { // Execute PBS @@ -309,7 +302,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS) (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, - input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index)); + input_lwe_ciphertext_count); cuda_synchronize_stream(stream, gpu_index); } @@ -322,7 +315,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS) scratch_cuda_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, - cuda_get_max_shared_memory(gpu_index), true); + true); for (auto _ : st) { // Execute PBS @@ -333,7 +326,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS) (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, - input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index)); + input_lwe_ciphertext_count); cuda_synchronize_stream(stream, gpu_index); } @@ -345,7 +338,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, AmortizedPBS) scratch_cuda_programmable_bootstrap_amortized_64( stream, gpu_index, &buffer, glwe_dimension, polynomial_size, - input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index), true); + input_lwe_ciphertext_count, true); for (auto _ : st) { // Execute PBS @@ -355,7 +348,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, AmortizedPBS) (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in_array, (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level, - input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index)); + input_lwe_ciphertext_count); cuda_synchronize_stream(stream, gpu_index); } diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp index f084244d04..cc6b11ba37 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp @@ -107,7 +107,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) { int8_t *pbs_buffer; scratch_cuda_programmable_bootstrap_amortized_64( stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, - number_of_inputs, cuda_get_max_shared_memory(gpu_index), true); + number_of_inputs, true); int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level * polynomial_size * (lwe_dimension + 1); @@ -128,7 +128,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) { (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, - pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index)); + pbs_level, number_of_inputs); // Copy result back cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, (glwe_dimension * polynomial_size + 1) * @@ -165,9 +165,9 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) { TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) { int8_t *pbs_buffer; - scratch_cuda_programmable_bootstrap_64( - stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size, - pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true); + scratch_cuda_programmable_bootstrap_64(stream, gpu_index, &pbs_buffer, + glwe_dimension, polynomial_size, + pbs_level, number_of_inputs, true); int number_of_sm = 0; cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0); @@ -190,7 +190,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) { (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, - pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index)); + pbs_level, number_of_inputs); // Copy result back cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, (glwe_dimension * polynomial_size + 1) * diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp index b116a777e4..82e3bbb193 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp @@ -93,8 +93,7 @@ class MultiBitProgrammableBootstrapTestPrimitives_u64 scratch_cuda_multi_bit_programmable_bootstrap_64( stream, gpu_index, &pbs_buffer, lwe_dimension, glwe_dimension, - polynomial_size, pbs_level, grouping_factor, number_of_inputs, - cuda_get_max_shared_memory(gpu_index), true); + polynomial_size, pbs_level, grouping_factor, number_of_inputs, true); lwe_ct_out_array = (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) * @@ -136,8 +135,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64, (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in, (void *)d_lwe_input_indexes, (void *)d_bsk, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, pbs_base_log, - pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), - 0); + pbs_level, number_of_inputs, 0); // Copy result to the host memory cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array, diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 8d84b14a63..d028c1a554 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -67,9 +67,6 @@ extern "C" { /// Free memory for pointer `ptr` on GPU `gpu_index` synchronously pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32); - /// Get the maximum amount of shared memory on GPU `gpu_index` - pub fn cuda_get_max_shared_memory(gpu_index: u32) -> i32; - pub fn cuda_setup_multi_gpu() -> i32; /// Copy a bootstrap key `src` represented with 64 bits in the standard domain from the CPU to @@ -136,7 +133,6 @@ extern "C" { polynomial_size: u32, level_count: u32, input_lwe_ciphertext_count: u32, - max_shared_memory: u32, allocate_gpu_memory: bool, ); @@ -168,7 +164,6 @@ extern "C" { /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8) /// - `level_count`: number of decomposition levels in the gadget matrix (~4) /// - `num_samples`: number of encrypted input messages - /// - `max_shared_memory` maximum amount of shared memory to be used inside device functions /// /// This function calls a wrapper to a device kernel that performs the /// bootstrapping: @@ -205,7 +200,6 @@ extern "C" { base_log: u32, level: u32, num_samples: u32, - max_shared_memory: u32, ); /// This cleanup function frees the data for the low latency PBS on GPU @@ -228,7 +222,6 @@ extern "C" { level_count: u32, grouping_factor: u32, input_lwe_ciphertext_count: u32, - max_shared_memory: u32, allocate_gpu_memory: bool, lwe_chunk_size: u32, ); @@ -259,7 +252,6 @@ extern "C" { /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8) /// - `level_count`: number of decomposition levels in the gadget matrix (~4) /// - `num_samples`: number of encrypted input messages - /// - `max_shared_memory` maximum amount of shared memory to be used inside device functions pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( stream: *mut c_void, gpu_index: u32, @@ -278,7 +270,6 @@ extern "C" { base_log: u32, level: u32, num_samples: u32, - max_shared_memory: u32, lwe_chunk_size: u32, ); @@ -471,7 +462,6 @@ extern "C" { grouping_factor: u32, num_blocks: u32, pbs_type: u32, - max_shared_memory: u32, allocate_gpu_memory: bool, ); diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 709da6fca4..26ae612e8e 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -119,7 +119,6 @@ pub unsafe fn programmable_bootstrap_async( polynomial_size.0 as u32, level.0 as u32, num_samples, - get_max_shared_memory(0) as u32, true, ); cuda_programmable_bootstrap_lwe_ciphertext_vector_64( @@ -139,7 +138,6 @@ pub unsafe fn programmable_bootstrap_async( base_log.0 as u32, level.0 as u32, num_samples, - get_max_shared_memory(streams.gpu_indexes[0]) as u32, ); cleanup_cuda_programmable_bootstrap( streams.ptr[0], @@ -183,7 +181,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async( level.0 as u32, grouping_factor.0 as u32, num_samples, - get_max_shared_memory(0) as u32, true, 0u32, ); @@ -205,7 +202,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async( base_log.0 as u32, level.0 as u32, num_samples, - get_max_shared_memory(0) as u32, 0, ); cleanup_cuda_multi_bit_programmable_bootstrap( @@ -577,11 +573,6 @@ pub struct CudaGlweList { pub ciphertext_modulus: CiphertextModulus, } -/// Get the maximum amount of shared memory on a device -pub fn get_max_shared_memory(gpu_index: u32) -> i32 { - unsafe { cuda_get_max_shared_memory(gpu_index) } -} - /// Get the number of GPUs on the machine pub fn get_number_of_gpus() -> i32 { unsafe { cuda_get_number_of_gpus() } @@ -611,8 +602,6 @@ mod tests { #[test] fn print_gpu_info() { println!("Number of GPUs: {}", get_number_of_gpus()); - let gpu_index: u32 = 0; - println!("Max shared memory: {}", get_max_shared_memory(gpu_index)) } #[test] fn allocate_and_copy() { diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index 654ddce257..553b18d1a3 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -3,7 +3,7 @@ pub mod server_key; use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut}; use crate::core_crypto::gpu::vec::CudaVec; -use crate::core_crypto::gpu::{get_max_shared_memory, CudaStreams}; +use crate::core_crypto::gpu::CudaStreams; use crate::core_crypto::prelude::{ DecompositionBaseLog, DecompositionLevelCount, GlweDimension, LweBskGroupingFactor, LweDimension, Numeric, PolynomialSize, UnsignedInteger, @@ -368,7 +368,6 @@ pub unsafe fn unchecked_mul_integer_radix_kb_assign_async