zama-ai · agnesLeroy · Jul 31, 2024 · Jul 31, 2024
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -187,21 +187,6 @@ __host__ void scratch_cuda_integer_sum_ciphertexts_vec_kb(
     uint32_t num_blocks_in_radix, uint32_t max_num_radix_in_vec,
     int_radix_params params, bool allocate_gpu_memory) {
 
-  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
-  if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
-    check_cuda_error(cudaFuncSetAttribute(
-        tree_add_chunks<Torus, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
-    cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
-                           cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else {
-    check_cuda_error(
-        cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
-                             cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
-    cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
-    check_cuda_error(cudaGetLastError());
-  }
   *mem_ptr = new int_sum_ciphertexts_vec_memory<Torus>(
       streams, gpu_indexes, gpu_count, params, num_blocks_in_radix,
       max_num_radix_in_vec, allocate_gpu_memory);
@@ -576,22 +561,6 @@ __host__ void scratch_cuda_integer_mult_radix_ciphertext_kb(
     cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
     int_mul_memory<Torus> **mem_ptr, uint32_t num_radix_blocks,
     int_radix_params params, bool allocate_gpu_memory) {
-  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(Torus);
-  if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
-    check_cuda_error(cudaFuncSetAttribute(
-        tree_add_chunks<Torus, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, sm_size));
-    cudaFuncSetCacheConfig(tree_add_chunks<Torus, FULLSM>,
-                           cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else {
-    check_cuda_error(
-        cudaFuncSetAttribute(tree_add_chunks<Torus, NOSM>,
-                             cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
-    cudaFuncSetCacheConfig(tree_add_chunks<Torus, NOSM>, cudaFuncCachePreferL1);
-    check_cuda_error(cudaGetLastError());
-  }
-
   *mem_ptr = new int_mul_memory<Torus>(streams, gpu_indexes, gpu_count, params,
                                        num_radix_blocks, allocate_gpu_memory);
 }

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_mul.cuh
@@ -33,22 +33,6 @@ __host__ void scratch_cuda_integer_radix_scalar_mul_kb(
     int_scalar_mul_buffer<T> **mem_ptr, uint32_t num_radix_blocks,
     int_radix_params params, bool allocate_gpu_memory) {
 
-  size_t sm_size = (params.big_lwe_dimension + 1) * sizeof(T);
-  if (sm_size < cuda_get_max_shared_memory(gpu_indexes[0])) {
-    check_cuda_error(cudaFuncSetAttribute(
-        tree_add_chunks<T, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize,
-        sm_size));
-    cudaFuncSetCacheConfig(tree_add_chunks<T, FULLSM>,
-                           cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else {
-    check_cuda_error(
-        cudaFuncSetAttribute(tree_add_chunks<T, NOSM>,
-                             cudaFuncAttributeMaxDynamicSharedMemorySize, 0));
-    cudaFuncSetCacheConfig(tree_add_chunks<T, NOSM>, cudaFuncCachePreferL1);
-    check_cuda_error(cudaGetLastError());
-  }
-
   *mem_ptr =
       new int_scalar_mul_buffer<T>(streams, gpu_indexes, gpu_count, params,
                                    num_radix_blocks, allocate_gpu_memory);

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -112,16 +112,10 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
 
   cuda_memcpy_async_to_gpu(d_bsk, h_bsk, buffer_size, stream, gpu_index);
 
-  double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
+  double2 *buffer = nullptr;
   switch (polynomial_size) {
   case 256:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -134,12 +128,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -152,12 +140,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 1024:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -170,12 +152,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 2048:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -188,12 +164,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 4096:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -206,12 +176,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 8192:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -224,12 +188,6 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     break;
   case 16384:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
-          cudaFuncCachePreferShared));
       batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(d_bsk, dest,
                                                                 buffer);
@@ -265,19 +223,10 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   int gridSize = total_polynomials;
   int blockSize = polynomial_size / choose_opt_amortized(polynomial_size);
 
-  double2 *buffer;
+  double2 *buffer = nullptr;
   switch (polynomial_size) {
   case 256:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -290,15 +239,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
     break;
   case 512:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<521>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -311,15 +251,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
     break;
   case 1024:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -332,15 +263,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
     break;
   case 2048:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -353,15 +275,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
     break;
   case 4096:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -374,15 +287,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
     break;
   case 8192:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,
                                                                 output, buffer);
@@ -395,15 +299,6 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
     break;
   case 16384:
     if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
-      buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
-      check_cuda_error(cudaFuncSetAttribute(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
-      check_cuda_error(cudaFuncSetCacheConfig(
-          batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
-                               FULLSM>,
-          cudaFuncCachePreferShared));
       batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
                            FULLSM>
           <<<gridSize, blockSize, shared_memory_size, stream>>>(input1, input2,

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -260,28 +260,6 @@ __host__ void scratch_programmable_bootstrap_amortized(
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
 
-  cudaSetDevice(gpu_index);
-  uint64_t full_sm =
-      get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
-          polynomial_size, glwe_dimension);
-  uint64_t partial_sm =
-      get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
-          polynomial_size);
-  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
-    cudaFuncSetAttribute(
-        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm);
-    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
-        cudaFuncCachePreferShared);
-  } else if (max_shared_memory >= partial_sm) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
-    check_cuda_error(cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_amortized<Torus, params, FULLSM>,
-        cudaFuncCachePreferShared));
-  }
   if (allocate_gpu_memory) {
     uint64_t buffer_size =
         get_buffer_size_programmable_bootstrap_amortized<Torus>(

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -164,30 +164,6 @@ __host__ void scratch_programmable_bootstrap_cg(
     uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
     bool allocate_gpu_memory) {
 
-  cudaSetDevice(gpu_index);
-  uint64_t full_sm =
-      get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
-  uint64_t partial_sm =
-      get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
-          polynomial_size);
-  if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, partial_sm));
-    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  } else if (max_shared_memory >= partial_sm) {
-    check_cuda_error(cudaFuncSetAttribute(
-        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
-        cudaFuncAttributeMaxDynamicSharedMemorySize, full_sm));
-    cudaFuncSetCacheConfig(
-        device_programmable_bootstrap_cg<Torus, params, FULLSM>,
-        cudaFuncCachePreferShared);
-    check_cuda_error(cudaGetLastError());
-  }
-
   *buffer = new pbs_buffer<Torus, CLASSICAL>(
       stream, gpu_index, glwe_dimension, polynomial_size, level_count,
       input_lwe_ciphertext_count, PBS_VARIANT::CG, allocate_gpu_memory);