diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index 3c520092f2..8119b5cc90 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -247,14 +247,5 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) { cudaDeviceGetAttribute(&max_shared_memory, cudaDevAttrMaxSharedMemoryPerBlock, gpu_index); check_cuda_error(cudaGetLastError()); -#if CUDA_ARCH == 900 - max_shared_memory = 226000; -#elif CUDA_ARCH == 890 - max_shared_memory = 127000; -#elif CUDA_ARCH == 800 - max_shared_memory = 163000; -#elif CUDA_ARCH == 700 - max_shared_memory = 95000; -#endif return max_shared_memory; } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh index 966be0464d..4cb70805bd 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh @@ -116,12 +116,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, switch (polynomial_size) { case 256: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { - check_cuda_error(cudaFuncSetAttribute( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> <<>>(d_bsk, dest, buffer); @@ -134,12 +128,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, break; case 512: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { - check_cuda_error(cudaFuncSetAttribute( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> <<>>(d_bsk, dest, buffer); @@ -152,12 +140,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, break; case 1024: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { - check_cuda_error(cudaFuncSetAttribute( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> <<>>(d_bsk, dest, buffer); @@ -170,12 +152,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, break; case 2048: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { - check_cuda_error(cudaFuncSetAttribute( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> <<>>(d_bsk, dest, buffer); @@ -188,12 +164,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, break; case 4096: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { - check_cuda_error(cudaFuncSetAttribute( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> <<>>(d_bsk, dest, buffer); @@ -206,12 +176,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, break; case 8192: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { - check_cuda_error(cudaFuncSetAttribute( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> <<>>(d_bsk, dest, buffer); @@ -224,12 +188,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, break; case 16384: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { - check_cuda_error(cudaFuncSetAttribute( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_NSMFFT, ForwardFFT>, FULLSM>, - cudaFuncCachePreferShared)); batch_NSMFFT, ForwardFFT>, FULLSM> <<>>(d_bsk, dest, buffer); @@ -270,14 +228,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, case 256: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - check_cuda_error(cudaFuncSetAttribute( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncCachePreferShared)); batch_polynomial_mul, ForwardFFT>, FULLSM> <<>>(input1, input2, output, buffer); @@ -291,14 +241,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, case 512: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - check_cuda_error(cudaFuncSetAttribute( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncCachePreferShared)); batch_polynomial_mul, ForwardFFT>, FULLSM> <<>>(input1, input2, output, buffer); @@ -312,14 +254,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, case 1024: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - check_cuda_error(cudaFuncSetAttribute( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncCachePreferShared)); batch_polynomial_mul, ForwardFFT>, FULLSM> <<>>(input1, input2, output, buffer); @@ -333,14 +267,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, case 2048: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - check_cuda_error(cudaFuncSetAttribute( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncCachePreferShared)); batch_polynomial_mul, ForwardFFT>, FULLSM> <<>>(input1, input2, output, buffer); @@ -354,14 +280,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, case 4096: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - check_cuda_error(cudaFuncSetAttribute( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncCachePreferShared)); batch_polynomial_mul, ForwardFFT>, FULLSM> <<>>(input1, input2, output, buffer); @@ -375,14 +293,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, case 8192: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - check_cuda_error(cudaFuncSetAttribute( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncCachePreferShared)); batch_polynomial_mul, ForwardFFT>, FULLSM> <<>>(input1, input2, output, buffer); @@ -396,14 +306,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, case 16384: if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); - check_cuda_error(cudaFuncSetAttribute( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); - check_cuda_error(cudaFuncSetCacheConfig( - batch_polynomial_mul, ForwardFFT>, - FULLSM>, - cudaFuncCachePreferShared)); batch_polynomial_mul, ForwardFFT>, FULLSM> <<>>(input1, input2, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index 6a60a0f6d1..250f147988 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -258,28 +258,6 @@ __host__ void scratch_programmable_bootstrap_amortized( uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { - uint64_t full_sm = - get_buffer_size_full_sm_programmable_bootstrap_amortized( - polynomial_size, glwe_dimension); - uint64_t partial_sm = - get_buffer_size_partial_sm_programmable_bootstrap_amortized( - polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); - if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) { - cudaFuncSetAttribute( - device_programmable_bootstrap_amortized, - cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm); - cudaFuncSetCacheConfig( - device_programmable_bootstrap_amortized, - cudaFuncCachePreferShared); - } else if (max_shared_memory >= partial_sm) { - check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_amortized, - cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm)); - check_cuda_error(cudaFuncSetCacheConfig( - device_programmable_bootstrap_amortized, - cudaFuncCachePreferShared)); - } if (allocate_gpu_memory) { uint64_t buffer_size = get_buffer_size_programmable_bootstrap_amortized( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 04ff5348c5..668cf2e0ae 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -194,30 +194,6 @@ __host__ void scratch_programmable_bootstrap_cg( uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { - uint64_t full_sm = - get_buffer_size_full_sm_programmable_bootstrap_cg(polynomial_size); - uint64_t partial_sm = - get_buffer_size_partial_sm_programmable_bootstrap_cg( - polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); - if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) { - check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_cg, - cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm)); - cudaFuncSetCacheConfig( - device_programmable_bootstrap_cg, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else if (max_shared_memory >= partial_sm) { - check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_cg, - cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm)); - cudaFuncSetCacheConfig( - device_programmable_bootstrap_cg, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } - *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, PBS_VARIANT::CG, allocate_gpu_memory); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index 9ad863708e..d17a953151 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -213,69 +213,6 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap( uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { - uint64_t full_sm_keybundle = - get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( - polynomial_size); - uint64_t full_sm_cg_accumulate = - get_buffer_size_full_sm_cg_multibit_programmable_bootstrap( - polynomial_size); - uint64_t partial_sm_cg_accumulate = - get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap( - polynomial_size); - - int max_shared_memory = cuda_get_max_shared_memory(0); - if (max_shared_memory < full_sm_keybundle) { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, - cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, - cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } - - if (max_shared_memory < partial_sm_cg_accumulate) { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, - cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else if (max_shared_memory < full_sm_cg_accumulate) { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, - cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm_cg_accumulate)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_cg_accumulate, - cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_cg_accumulate)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_cg_accumulate, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } - auto lwe_chunk_size = get_lwe_chunk_size( gpu_index, input_lwe_ciphertext_count, polynomial_size); *buffer = new pbs_buffer( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index b9dfdf415c..d4966f1cdc 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -309,55 +309,6 @@ __host__ void scratch_programmable_bootstrap( uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { - uint64_t full_sm_step_one = - get_buffer_size_full_sm_programmable_bootstrap_step_one( - polynomial_size); - uint64_t full_sm_step_two = - get_buffer_size_full_sm_programmable_bootstrap_step_two( - polynomial_size); - uint64_t partial_sm = - get_buffer_size_partial_sm_programmable_bootstrap(polynomial_size); - - int max_shared_memory = cuda_get_max_shared_memory(0); - - // Configure step one - if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) { - check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_step_one, - cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm)); - cudaFuncSetCacheConfig( - device_programmable_bootstrap_step_one, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else if (max_shared_memory >= partial_sm) { - check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_step_one, - cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_one)); - cudaFuncSetCacheConfig( - device_programmable_bootstrap_step_one, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } - - // Configure step two - if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_two) { - check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_step_two, - cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm)); - cudaFuncSetCacheConfig( - device_programmable_bootstrap_step_two, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else if (max_shared_memory >= partial_sm) { - check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_step_two, - cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_step_two)); - cudaFuncSetCacheConfig( - device_programmable_bootstrap_step_two, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } - *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, PBS_VARIANT::DEFAULT, allocate_gpu_memory); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index c39816e3c1..74b3669479 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -390,97 +390,6 @@ __host__ void scratch_multi_bit_programmable_bootstrap( uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor, bool allocate_gpu_memory) { - int max_shared_memory = cuda_get_max_shared_memory(0); - uint64_t full_sm_keybundle = - get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( - polynomial_size); - uint64_t full_sm_accumulate_step_one = - get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one( - polynomial_size); - uint64_t full_sm_accumulate_step_two = - get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two( - polynomial_size); - uint64_t partial_sm_accumulate_step_one = - get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one< - Torus>(polynomial_size); - - if (max_shared_memory < full_sm_keybundle) { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, - cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, - cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } - - if (max_shared_memory < partial_sm_accumulate_step_one) { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, NOSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, NOSM>, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else if (max_shared_memory < full_sm_accumulate_step_one) { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, PARTIALSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, - partial_sm_accumulate_step_one)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, PARTIALSM>, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, - full_sm_accumulate_step_one)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_accumulate_step_one< - Torus, params, FULLSM>, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } - - if (max_shared_memory < full_sm_accumulate_step_two) { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_accumulate_step_two< - Torus, params, NOSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_accumulate_step_two< - Torus, params, NOSM>, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_accumulate_step_two< - Torus, params, FULLSM>, - cudaFuncAttributeMaxDynamicSharedMemorySize, - full_sm_accumulate_step_two)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_accumulate_step_two< - Torus, params, FULLSM>, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } - auto lwe_chunk_size = get_lwe_chunk_size( gpu_index, input_lwe_ciphertext_count, polynomial_size); *buffer = new pbs_buffer( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index 5dccab3606..515527ade1 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -200,49 +200,6 @@ __host__ void scratch_programmable_bootstrap_tbc( uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { - bool supports_dsm = - supports_distributed_shared_memory_on_classic_programmable_bootstrap< - Torus>(polynomial_size); - - uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc( - polynomial_size); - uint64_t partial_sm = - get_buffer_size_partial_sm_programmable_bootstrap_tbc( - polynomial_size); - uint64_t minimum_sm_tbc = 0; - if (supports_dsm) - minimum_sm_tbc = - get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap( - polynomial_size); - int max_shared_memory = cuda_get_max_shared_memory(0); - - if (max_shared_memory >= full_sm + minimum_sm_tbc) { - check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc, - cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm + minimum_sm_tbc)); - cudaFuncSetCacheConfig( - device_programmable_bootstrap_tbc, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else if (max_shared_memory >= partial_sm + minimum_sm_tbc) { - check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc, - cudaFuncAttributeMaxDynamicSharedMemorySize, - partial_sm + minimum_sm_tbc)); - cudaFuncSetCacheConfig( - device_programmable_bootstrap_tbc, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else { - check_cuda_error(cudaFuncSetAttribute( - device_programmable_bootstrap_tbc, - cudaFuncAttributeMaxDynamicSharedMemorySize, minimum_sm_tbc)); - cudaFuncSetCacheConfig( - device_programmable_bootstrap_tbc, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } - *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, PBS_VARIANT::TBC, allocate_gpu_memory); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index d2cc68d6c7..b1fac308ac 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -204,84 +204,6 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap( uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor, bool allocate_gpu_memory) { - bool supports_dsm = - supports_distributed_shared_memory_on_multibit_programmable_bootstrap< - Torus>(polynomial_size); - - uint64_t full_sm_keybundle = - get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( - polynomial_size); - uint64_t full_sm_tbc_accumulate = - get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap( - polynomial_size); - uint64_t partial_sm_tbc_accumulate = - get_buffer_size_partial_sm_tbc_multibit_programmable_bootstrap( - polynomial_size); - uint64_t minimum_sm_tbc_accumulate = 0; - if (supports_dsm) - minimum_sm_tbc_accumulate = - get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap( - polynomial_size); - - int max_shared_memory = cuda_get_max_shared_memory(0); - - if (max_shared_memory < full_sm_keybundle) { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, - cudaFuncAttributeMaxDynamicSharedMemorySize, 0)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_keybundle, - cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm_keybundle)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_keybundle, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } - - if (max_shared_memory < - partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, - cudaFuncAttributeMaxDynamicSharedMemorySize, - minimum_sm_tbc_accumulate)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_tbc_accumulate, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else if (max_shared_memory < - full_sm_tbc_accumulate + minimum_sm_tbc_accumulate) { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, - cudaFuncAttributeMaxDynamicSharedMemorySize, - partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_tbc_accumulate, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } else { - check_cuda_error(cudaFuncSetAttribute( - device_multi_bit_programmable_bootstrap_tbc_accumulate, - cudaFuncAttributeMaxDynamicSharedMemorySize, - full_sm_tbc_accumulate + minimum_sm_tbc_accumulate)); - cudaFuncSetCacheConfig( - device_multi_bit_programmable_bootstrap_tbc_accumulate, - cudaFuncCachePreferShared); - check_cuda_error(cudaGetLastError()); - } - auto lwe_chunk_size = get_lwe_chunk_size( gpu_index, input_lwe_ciphertext_count, polynomial_size); *buffer = new pbs_buffer( diff --git a/tfhe/src/core_crypto/gpu/vec.rs b/tfhe/src/core_crypto/gpu/vec.rs index fc0e5d9d76..b18b791b3d 100644 --- a/tfhe/src/core_crypto/gpu/vec.rs +++ b/tfhe/src/core_crypto/gpu/vec.rs @@ -1,5 +1,5 @@ use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut}; -use crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::gpu::{synchronize_device, CudaStreams}; use crate::core_crypto::prelude::Numeric; use std::collections::Bound::{Excluded, Included, Unbounded}; use std::ffi::c_void; @@ -447,6 +447,8 @@ impl Drop for CudaVec { /// Free memory for pointer `ptr` synchronously fn drop(&mut self) { for &gpu_index in self.gpu_indexes.iter() { + // Synchronizes the device to be sure no stream is still using this pointer + synchronize_device(gpu_index); unsafe { cuda_drop(self.get_mut_c_ptr(gpu_index), gpu_index) }; } }