From 821ce5a7703ff2f3e6f138d8cf96a94ce547186a Mon Sep 17 00:00:00 2001
From: Kelong Cong <kc1212@users.noreply.github.com>
Date: Wed, 31 Jul 2024 10:09:50 +0200
Subject: [PATCH] refactor(gpu): remove max_shared_memory from pbs arguments

Always use max shared memory from device 0 to configure the
kernels, to avoid bugs with multi-GPU configurations
---
 .../tfhe-cuda-backend/cuda/include/integer.h  |   3 +-
 .../cuda/include/programmable_bootstrap.h     |  57 ++--
 .../include/programmable_bootstrap_multibit.h |  38 ++-
 .../cuda/src/integer/integer.cuh              |  15 +-
 .../cuda/src/integer/multiplication.cuh       |   4 +-
 .../cuda/src/pbs/bootstrapping_key.cuh        |  28 +-
 .../cuda/src/pbs/programmable_bootstrap.cuh   |  22 +-
 .../pbs/programmable_bootstrap_amortized.cu   |  73 +++---
 .../pbs/programmable_bootstrap_amortized.cuh  |  19 +-
 .../pbs/programmable_bootstrap_cg_classic.cuh |  41 ++-
 .../programmable_bootstrap_cg_multibit.cuh    |  61 ++---
 .../src/pbs/programmable_bootstrap_classic.cu | 248 +++++++-----------
 .../pbs/programmable_bootstrap_classic.cuh    |  66 ++---
 .../pbs/programmable_bootstrap_multibit.cu    | 209 +++++++--------
 .../pbs/programmable_bootstrap_multibit.cuh   |  68 +++--
 .../programmable_bootstrap_tbc_classic.cuh    |  29 +-
 .../programmable_bootstrap_tbc_multibit.cuh   |  44 ++--
 .../benchmarks/benchmark_pbs.cpp              |  43 ++-
 .../tests/test_classical_pbs.cpp              |  12 +-
 .../tests/test_multibit_pbs.cpp               |   6 +-
 backends/tfhe-cuda-backend/src/cuda_bind.rs   |  10 -
 tfhe/src/core_crypto/gpu/mod.rs               |  11 -
 tfhe/src/integer/gpu/mod.rs                   |   3 +-
 23 files changed, 489 insertions(+), 621 deletions(-)
diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index 33d862cd29..2fc75c10b9 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -526,8 +526,7 @@ template <typename Torus> struct int_radix_lut {
       execute_scratch_pbs<Torus>(
           streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension,
           params.small_lwe_dimension, params.polynomial_size, params.pbs_level,
-          params.grouping_factor, num_blocks_on_gpu,
-          cuda_get_max_shared_memory(gpu_indexes[i]), params.pbs_type,
+          params.grouping_factor, num_blocks_on_gpu, params.pbs_type,
           allocate_gpu_memory);
       cuda_synchronize_stream(streams[i], gpu_indexes[i]);
       buffer.push_back(gpu_pbs_buffer);
diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
index bdfdabdd5d..312a361a76 100644
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -26,14 +26,12 @@ void cuda_convert_lwe_programmable_bootstrap_key_64(
 void scratch_cuda_programmable_bootstrap_amortized_32(
     void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 
 void scratch_cuda_programmable_bootstrap_amortized_64(
     void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 
 void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -41,7 +39,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t num_samples);
 
 void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -49,7 +47,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t num_samples);
 
 void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
                                                    uint32_t gpu_index,
@@ -58,14 +56,12 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
 void scratch_cuda_programmable_bootstrap_32(
     void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 
 void scratch_cuda_programmable_bootstrap_64(
     void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 
 void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -73,7 +69,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t num_samples);
 
 void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -81,18 +77,18 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t num_samples);
 
 void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                          int8_t **pbs_buffer);
 
 uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
     uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
+    uint32_t input_lwe_ciphertext_count);
 
 uint64_t get_buffer_size_programmable_bootstrap_64(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory);
+    uint32_t input_lwe_ciphertext_count);
 }
 
 template <typename Torus>
@@ -155,7 +151,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {
 template <typename Torus>
 __host__ bool
 supports_distributed_shared_memory_on_classic_programmable_bootstrap(
-    uint32_t polynomial_size, uint32_t max_shared_memory);
+    uint32_t polynomial_size);
 
 template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;
 
@@ -174,7 +170,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
 
     this->pbs_variant = pbs_variant;
 
-    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
+    auto max_shared_memory = cuda_get_max_shared_memory(0);
 
     if (allocate_gpu_memory) {
       switch (pbs_variant) {
@@ -251,7 +247,7 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
 
         bool supports_dsm =
             supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-                Torus>(polynomial_size, max_shared_memory);
+                Torus>(polynomial_size);
 
         uint64_t full_sm =
             get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
@@ -310,10 +306,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
 };
 
 template <typename Torus>
-__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_cg(
+__host__ uint64_t get_buffer_size_programmable_bootstrap_cg(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
-
+    uint32_t input_lwe_ciphertext_count) {
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   uint64_t full_sm =
       get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
   uint64_t partial_sm =
@@ -339,8 +335,7 @@ template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
                                                    uint32_t polynomial_size,
                                                    uint32_t level_count,
-                                                   uint32_t num_samples,
-                                                   uint32_t max_shared_memory);
+                                                   uint32_t num_samples);
 
 template <typename Torus>
 void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
@@ -349,7 +344,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples);
 
 template <typename Torus>
 void cuda_programmable_bootstrap_lwe_ciphertext_vector(
@@ -358,7 +353,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples);
 
 #if (CUDA_ARCH >= 900)
 template <typename Torus>
@@ -368,36 +363,32 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples);
 
 template <typename Torus>
 void scratch_cuda_programmable_bootstrap_tbc(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 #endif
 
 template <typename Torus>
 void scratch_cuda_programmable_bootstrap_cg(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 
 template <typename Torus>
 void scratch_cuda_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 
 template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
                                                     uint32_t glwe_dimension,
                                                     uint32_t polynomial_size,
-                                                    uint32_t level_count,
-                                                    uint32_t max_shared_memory);
+                                                    uint32_t level_count);
 
 #ifdef __CUDACC__
 __device__ inline int get_start_ith_ggsw(int i, uint32_t polynomial_size,
diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
index 6d5d30781f..ce1617872c 100644
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
@@ -8,7 +8,7 @@ extern "C" {
 
 bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t num_samples);
 
 void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
     void *stream, uint32_t gpu_index, void *dest, void *src,
@@ -19,8 +19,8 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
     void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t chunk_size = 0);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    uint32_t chunk_size = 0);
 
 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -28,8 +28,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size = 0);
 
 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                    uint32_t gpu_index,
@@ -39,12 +38,12 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
 template <typename Torus>
 __host__ bool
 supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
-    uint32_t polynomial_size, uint32_t max_shared_memory);
+    uint32_t polynomial_size);
 
 template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
     uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory);
+    uint32_t level_count);
 
 #if CUDA_ARCH >= 900
 template <typename Torus>
@@ -52,8 +51,8 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    uint32_t lwe_chunk_size);
 
 template <typename Torus>
 void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -63,7 +62,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size);
+    uint32_t lwe_chunk_size);
 #endif
 
 template <typename Torus>
@@ -71,15 +70,15 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    uint32_t lwe_chunk_size = 0);
 
 template <typename Torus>
 void scratch_cuda_cg_multi_bit_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    uint32_t lwe_chunk_size = 0);
 
 template <typename Torus>
 void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -89,15 +88,15 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
+    uint32_t lwe_chunk_size = 0);
 
 template <typename Torus>
 void scratch_cuda_multi_bit_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **pbs_buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    uint32_t lwe_chunk_size = 0);
 
 template <typename Torus>
 void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
@@ -107,7 +106,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
+    uint32_t lwe_chunk_size = 0);
 
 template <typename Torus>
 __host__ __device__ uint64_t
@@ -314,7 +313,6 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
 
 template <typename Torus, class params>
 __host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
-                                     uint32_t polynomial_size,
-                                     uint32_t max_shared_memory);
+                                     uint32_t polynomial_size);
 
 #endif // CUDA_MULTI_BIT_H
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 4469e71f3b..13642d9ee0 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -177,8 +177,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
         lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
         small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks,
-        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
+        grouping_factor, num_radix_blocks, pbs_type);
   } else {
     /// Make sure all data that should be on GPU 0 is indeed there
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -204,8 +203,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
         lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
         glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-        pbs_level, grouping_factor, num_radix_blocks,
-        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type);
 
     /// Copy data back to GPU 0 and release vecs
     multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
@@ -270,8 +268,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
         lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
         small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks,
-        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
+        grouping_factor, num_radix_blocks, pbs_type);
   } else {
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
     multi_gpu_scatter_lwe_async<Torus>(
@@ -293,8 +290,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
         lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
         glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-        pbs_level, grouping_factor, num_radix_blocks,
-        cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
+        pbs_level, grouping_factor, num_radix_blocks, pbs_type);
 
     /// Copy data back to GPU 0 and release vecs
     multi_gpu_gather_lwe_async<Torus>(streams, gpu_indexes, active_gpu_count,
@@ -696,8 +692,7 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
         mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
         params.glwe_dimension, params.small_lwe_dimension,
         params.polynomial_size, params.pbs_base_log, params.pbs_level,
-        params.grouping_factor, 2, cuda_get_max_shared_memory(gpu_indexes[0]),
-        params.pbs_type);
+        params.grouping_factor, 2, params.pbs_type);
 
     cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
                                  big_lwe_size * sizeof(Torus), streams[0],
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index 00b5e23a8d..fc81a409c5 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -357,7 +357,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
           small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer,
           glwe_dimension, small_lwe_dimension, polynomial_size,
           mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
-          mem_ptr->params.grouping_factor, total_count, max_shared_memory,
+          mem_ptr->params.grouping_factor, total_count,
           mem_ptr->params.pbs_type);
     } else {
       cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -405,7 +405,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
           lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer,
           glwe_dimension, small_lwe_dimension, polynomial_size,
           mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
-          mem_ptr->params.grouping_factor, total_count, max_shared_memory,
+          mem_ptr->params.grouping_factor, total_count,
           mem_ptr->params.pbs_type);
 
       multi_gpu_gather_lwe_async<Torus>(
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
index 8ee59ff3a9..966be0464d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh
@@ -115,7 +115,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
   double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
   switch (polynomial_size) {
   case 256:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       check_cuda_error(cudaFuncSetAttribute(
           batch_NSMFFT<FFTDegree<AmortizedDegree<256>, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -133,7 +133,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     }
     break;
   case 512:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       check_cuda_error(cudaFuncSetAttribute(
           batch_NSMFFT<FFTDegree<AmortizedDegree<512>, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -151,7 +151,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     }
     break;
   case 1024:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       check_cuda_error(cudaFuncSetAttribute(
           batch_NSMFFT<FFTDegree<AmortizedDegree<1024>, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -169,7 +169,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     }
     break;
   case 2048:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       check_cuda_error(cudaFuncSetAttribute(
           batch_NSMFFT<FFTDegree<AmortizedDegree<2048>, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -187,7 +187,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     }
     break;
   case 4096:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       check_cuda_error(cudaFuncSetAttribute(
           batch_NSMFFT<FFTDegree<AmortizedDegree<4096>, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -205,7 +205,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     }
     break;
   case 8192:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       check_cuda_error(cudaFuncSetAttribute(
           batch_NSMFFT<FFTDegree<AmortizedDegree<8192>, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -223,7 +223,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream,
     }
     break;
   case 16384:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       check_cuda_error(cudaFuncSetAttribute(
           batch_NSMFFT<FFTDegree<AmortizedDegree<16384>, ForwardFFT>, FULLSM>,
           cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size));
@@ -268,7 +268,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
   double2 *buffer;
   switch (polynomial_size) {
   case 256:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       check_cuda_error(cudaFuncSetAttribute(
           batch_polynomial_mul<FFTDegree<AmortizedDegree<256>, ForwardFFT>,
@@ -289,7 +289,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
     }
     break;
   case 512:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       check_cuda_error(cudaFuncSetAttribute(
           batch_polynomial_mul<FFTDegree<AmortizedDegree<521>, ForwardFFT>,
@@ -310,7 +310,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
     }
     break;
   case 1024:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       check_cuda_error(cudaFuncSetAttribute(
           batch_polynomial_mul<FFTDegree<AmortizedDegree<1024>, ForwardFFT>,
@@ -331,7 +331,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
     }
     break;
   case 2048:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       check_cuda_error(cudaFuncSetAttribute(
           batch_polynomial_mul<FFTDegree<AmortizedDegree<2048>, ForwardFFT>,
@@ -352,7 +352,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
     }
     break;
   case 4096:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       check_cuda_error(cudaFuncSetAttribute(
           batch_polynomial_mul<FFTDegree<AmortizedDegree<4096>, ForwardFFT>,
@@ -373,7 +373,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
     }
     break;
   case 8192:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       check_cuda_error(cudaFuncSetAttribute(
           batch_polynomial_mul<FFTDegree<AmortizedDegree<8192>, ForwardFFT>,
@@ -394,7 +394,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index,
     }
     break;
   case 16384:
-    if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) {
+    if (shared_memory_size <= cuda_get_max_shared_memory(0)) {
       buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index);
       check_cuda_error(cudaFuncSetAttribute(
           batch_polynomial_mul<FFTDegree<AmortizedDegree<16384>, ForwardFFT>,
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
index dacddb7bcc..f8f12d4403 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -127,8 +127,7 @@ void execute_pbs_async(
     std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    PBS_TYPE pbs_type) {
+    uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type) {
   switch (sizeof(Torus)) {
   case sizeof(uint32_t):
     // 32 bits
@@ -160,8 +159,7 @@ void execute_pbs_async(
             current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
             current_lwe_array_in, current_lwe_input_indexes,
             bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu,
-            max_shared_memory);
+            polynomial_size, base_log, level_count, num_inputs_on_gpu);
       }
       break;
     default:
@@ -200,7 +198,7 @@ void execute_pbs_async(
             current_lwe_array_in, current_lwe_input_indexes,
             bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
             polynomial_size, grouping_factor, base_log, level_count,
-            num_inputs_on_gpu, max_shared_memory);
+            num_inputs_on_gpu);
       }
       break;
     case CLASSICAL:
@@ -228,8 +226,7 @@ void execute_pbs_async(
             current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
             current_lwe_array_in, current_lwe_input_indexes,
             bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu,
-            max_shared_memory);
+            polynomial_size, base_log, level_count, num_inputs_on_gpu);
       }
       break;
     default:
@@ -247,8 +244,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
                          int8_t **pbs_buffer, uint32_t glwe_dimension,
                          uint32_t lwe_dimension, uint32_t polynomial_size,
                          uint32_t level_count, uint32_t grouping_factor,
-                         uint32_t input_lwe_ciphertext_count,
-                         uint32_t max_shared_memory, PBS_TYPE pbs_type,
+                         uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type,
                          bool allocate_gpu_memory) {
   switch (sizeof(Torus)) {
   case sizeof(uint32_t):
@@ -259,8 +255,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
     case CLASSICAL:
       scratch_cuda_programmable_bootstrap_32(
           stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
-          level_count, input_lwe_ciphertext_count, max_shared_memory,
-          allocate_gpu_memory);
+          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
       break;
     default:
       PANIC("Error: unsupported cuda PBS type.")
@@ -275,13 +270,12 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index,
       scratch_cuda_multi_bit_programmable_bootstrap_64(
           stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension,
           polynomial_size, level_count, grouping_factor,
-          input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+          input_lwe_ciphertext_count, allocate_gpu_memory);
       break;
     case CLASSICAL:
       scratch_cuda_programmable_bootstrap_64(
           stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size,
-          level_count, input_lwe_ciphertext_count, max_shared_memory,
-          allocate_gpu_memory);
+          level_count, input_lwe_ciphertext_count, allocate_gpu_memory);
       break;
     default:
       PANIC("Error: unsupported cuda PBS type.")
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
index 084756a374..5891732459 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
@@ -5,10 +5,9 @@
  */
 uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
     uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count) {
   return get_buffer_size_programmable_bootstrap_amortized<uint64_t>(
-      glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-      max_shared_memory);
+      glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
 }
 
 /*
@@ -20,51 +19,50 @@ uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
 void scratch_cuda_programmable_bootstrap_amortized_32(
     void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
   case 256:
     scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<256>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 512:
     scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 1024:
     scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 2048:
     scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 4096:
     scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 8192:
     scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 16384:
     scratch_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   default:
     PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -81,51 +79,50 @@ void scratch_cuda_programmable_bootstrap_amortized_32(
 void scratch_cuda_programmable_bootstrap_amortized_64(
     void *stream, uint32_t gpu_index, int8_t **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
   case 256:
     scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<256>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 512:
     scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 1024:
     scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 2048:
     scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 4096:
     scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 8192:
     scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 16384:
     scratch_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   default:
     PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -143,7 +140,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory) {
+    uint32_t num_samples) {
 
   if (base_log > 32)
     PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
@@ -157,7 +154,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   case 512:
     host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
@@ -166,7 +163,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   case 1024:
     host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
@@ -175,7 +172,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   case 2048:
     host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
@@ -184,7 +181,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   case 4096:
     host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
@@ -193,7 +190,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   case 8192:
     host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
@@ -202,7 +199,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   case 16384:
     host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
@@ -211,7 +208,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   default:
     PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -254,8 +251,6 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
  * - level_count: number of decomposition levels in the gadget matrix (~4)
  * - num_samples: number of encrypted input messages
  * used
- *  - 'max_shared_memory' maximum amount of shared memory to be used inside
- * device functions
  *
  * This function calls a wrapper to a device kernel that performs the
  * bootstrapping:
@@ -283,7 +278,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory) {
+    uint32_t num_samples) {
 
   if (base_log > 64)
     PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
@@ -297,7 +292,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   case 512:
     host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
@@ -306,7 +301,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   case 1024:
     host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
@@ -315,7 +310,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   case 2048:
     host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
@@ -324,7 +319,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   case 4096:
     host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
@@ -333,7 +328,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   case 8192:
     host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
@@ -342,7 +337,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   case 16384:
     host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
@@ -351,7 +346,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, max_shared_memory);
+        num_samples);
     break;
   default:
     PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
index ee9fb4e1b9..7018f635ea 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -232,10 +232,11 @@ get_buffer_size_partial_sm_programmable_bootstrap_amortized(
 }
 
 template <typename Torus>
-__host__ __device__ uint64_t get_buffer_size_programmable_bootstrap_amortized(
+__host__ uint64_t get_buffer_size_programmable_bootstrap_amortized(
     uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count) {
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   uint64_t full_sm =
       get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
           polynomial_size, glwe_dimension);
@@ -257,16 +258,15 @@ template <typename Torus, typename params>
 __host__ void scratch_programmable_bootstrap_amortized(
     cudaStream_t stream, uint32_t gpu_index, int8_t **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
 
-  cudaSetDevice(gpu_index);
   uint64_t full_sm =
       get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
           polynomial_size, glwe_dimension);
   uint64_t partial_sm =
       get_buffer_size_partial_sm_programmable_bootstrap_amortized<Torus>(
           polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
     cudaFuncSetAttribute(
         device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>,
@@ -285,8 +285,7 @@ __host__ void scratch_programmable_bootstrap_amortized(
   if (allocate_gpu_memory) {
     uint64_t buffer_size =
         get_buffer_size_programmable_bootstrap_amortized<Torus>(
-            glwe_dimension, polynomial_size, input_lwe_ciphertext_count,
-            max_shared_memory);
+            glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
     *pbs_buffer = (int8_t *)cuda_malloc_async(buffer_size, stream, gpu_index);
     check_cuda_error(cudaGetLastError());
   }
@@ -299,9 +298,8 @@ __host__ void host_programmable_bootstrap_amortized(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count) {
 
-  cudaSetDevice(gpu_index);
   uint64_t SM_FULL =
       get_buffer_size_full_sm_programmable_bootstrap_amortized<Torus>(
           polynomial_size, glwe_dimension);
@@ -314,6 +312,9 @@ __host__ void host_programmable_bootstrap_amortized(
 
   uint64_t DM_FULL = SM_FULL;
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
+
   // Create a 1-dimensional grid of threads
   // where each block handles 1 sample and each thread
   // handles opt polynomial coefficients
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
index 537eb5c66d..94d383a2f3 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -161,15 +161,14 @@ __host__ void scratch_programmable_bootstrap_cg(
     cudaStream_t stream, uint32_t gpu_index,
     pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
 
-  cudaSetDevice(gpu_index);
   uint64_t full_sm =
       get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
   uint64_t partial_sm =
       get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
           polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   if (max_shared_memory >= partial_sm && max_shared_memory < full_sm) {
     check_cuda_error(cudaFuncSetAttribute(
         device_programmable_bootstrap_cg<Torus, params, PARTIALSM>,
@@ -203,9 +202,7 @@ __host__ void host_programmable_bootstrap_cg(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory) {
-  cudaSetDevice(gpu_index);
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
 
   // With SM each block corresponds to either the mask or body, no need to
   // duplicate data for each
@@ -216,6 +213,9 @@ __host__ void host_programmable_bootstrap_cg(
       get_buffer_size_partial_sm_programmable_bootstrap_cg<Torus>(
           polynomial_size);
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
+
   uint64_t full_dm = full_sm;
 
   uint64_t partial_dm = full_dm - partial_sm;
@@ -265,8 +265,7 @@ __host__ void host_programmable_bootstrap_cg(
 // Verify if the grid size satisfies the cooperative group constraints
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
-    int glwe_dimension, int level_count, int num_samples,
-    uint32_t max_shared_memory) {
+    int glwe_dimension, int level_count, int num_samples) {
 
   // If Cooperative Groups is not supported, no need to check anything else
   if (!cuda_check_support_cooperative_groups())
@@ -286,6 +285,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
   int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
   int max_active_blocks_per_sm;
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   if (max_shared_memory < partial_sm) {
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_active_blocks_per_sm,
@@ -311,37 +311,30 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_grid_size(
 // Verify if the grid size satisfies the cooperative group constraints
 template <typename Torus>
 __host__ bool supports_cooperative_groups_on_programmable_bootstrap(
-    int glwe_dimension, int polynomial_size, int level_count, int num_samples,
-    uint32_t max_shared_memory) {
+    int glwe_dimension, int polynomial_size, int level_count, int num_samples) {
   switch (polynomial_size) {
   case 256:
     return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
-                                     max_shared_memory);
+        Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples);
   case 512:
     return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
-                                     max_shared_memory);
+        Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples);
   case 1024:
     return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples);
   case 2048:
     return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples);
   case 4096:
     return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples);
   case 8192:
     return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples);
   case 16384:
     return verify_cuda_programmable_bootstrap_cg_grid_size<
-        Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
-                                       max_shared_memory);
+        Torus, AmortizedDegree<16384>>(glwe_dimension, level_count,
+                                       num_samples);
   default:
     PANIC("Cuda error (classical PBS): unsupported polynomial size. "
           "Supported N's are powers of two"
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
index 2da2e4581c..4f9e96c4f4 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -156,8 +156,7 @@ template <typename Torus>
 __host__ __device__ uint64_t get_buffer_size_cg_multibit_programmable_bootstrap(
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t grouping_factor, uint32_t lwe_chunk_size,
-    uint32_t max_shared_memory) {
+    uint32_t grouping_factor, uint32_t lwe_chunk_size) {
 
   uint64_t buffer_size = 0;
   buffer_size += input_lwe_ciphertext_count * lwe_chunk_size * level_count *
@@ -177,8 +176,8 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
     cudaStream_t stream, uint32_t gpu_index,
     pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    uint32_t lwe_chunk_size = 0) {
 
   uint64_t full_sm_keybundle =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
@@ -190,6 +189,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
       get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap<Torus>(
           polynomial_size);
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   if (max_shared_memory < full_sm_keybundle) {
     check_cuda_error(cudaFuncSetAttribute(
         device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
@@ -243,9 +243,8 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap(
   }
 
   if (!lwe_chunk_size)
-    lwe_chunk_size =
-        get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
-                                          polynomial_size, max_shared_memory);
+    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+        gpu_index, input_lwe_ciphertext_count, polynomial_size);
   *buffer = new pbs_buffer<Torus, MULTI_BIT>(
       stream, gpu_index, glwe_dimension, polynomial_size, level_count,
       input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG,
@@ -260,9 +259,8 @@ __host__ void execute_cg_external_product_loop(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) {
+    uint32_t lwe_chunk_size, int lwe_offset) {
 
-  cudaSetDevice(gpu_index);
   uint64_t full_dm =
       get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
           polynomial_size);
@@ -271,6 +269,9 @@ __host__ void execute_cg_external_product_loop(
           polynomial_size);
   uint64_t no_dm = 0;
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
+
   uint32_t keybundle_size_per_input =
       lwe_chunk_size * level_count * (glwe_dimension + 1) *
       (glwe_dimension + 1) * (polynomial_size / 2);
@@ -336,12 +337,11 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
-  cudaSetDevice(gpu_index);
+    uint32_t lwe_chunk_size = 0) {
 
   if (!lwe_chunk_size)
-    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
-        gpu_index, num_samples, polynomial_size, max_shared_memory);
+    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(gpu_index, num_samples,
+                                                       polynomial_size);
 
   for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
        lwe_offset += lwe_chunk_size) {
@@ -350,24 +350,21 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
     execute_compute_keybundle<Torus, params>(
         stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
         buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, max_shared_memory,
-        lwe_chunk_size, lwe_offset);
+        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
 
     // Accumulate
     execute_cg_external_product_loop<Torus, params>(
         stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
         num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_chunk_size,
-        max_shared_memory, lwe_offset);
+        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
   }
 }
 
 // Verify if the grid size satisfies the cooperative group constraints
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
-    int glwe_dimension, int level_count, int num_samples,
-    uint32_t max_shared_memory) {
+    int glwe_dimension, int level_count, int num_samples) {
 
   // If Cooperative Groups is not supported, no need to check anything else
   if (!cuda_check_support_cooperative_groups())
@@ -387,6 +384,7 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
   int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
   int max_active_blocks_per_sm;
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   if (max_shared_memory < partial_sm_cg_accumulate) {
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_active_blocks_per_sm,
@@ -417,37 +415,30 @@ __host__ bool verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size(
 // group constraints
 template <typename Torus>
 __host__ bool supports_cooperative_groups_on_multibit_programmable_bootstrap(
-    int glwe_dimension, int polynomial_size, int level_count, int num_samples,
-    uint32_t max_shared_memory) {
+    int glwe_dimension, int polynomial_size, int level_count, int num_samples) {
   switch (polynomial_size) {
   case 256:
     return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples,
-                                     max_shared_memory);
+        Torus, AmortizedDegree<256>>(glwe_dimension, level_count, num_samples);
   case 512:
     return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples,
-                                     max_shared_memory);
+        Torus, AmortizedDegree<512>>(glwe_dimension, level_count, num_samples);
   case 1024:
     return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<1024>>(glwe_dimension, level_count, num_samples);
   case 2048:
     return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<2048>>(glwe_dimension, level_count, num_samples);
   case 4096:
     return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<4096>>(glwe_dimension, level_count, num_samples);
   case 8192:
     return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples,
-                                      max_shared_memory);
+        Torus, AmortizedDegree<8192>>(glwe_dimension, level_count, num_samples);
   case 16384:
     return verify_cuda_programmable_bootstrap_cg_multi_bit_grid_size<
-        Torus, AmortizedDegree<16384>>(glwe_dimension, level_count, num_samples,
-                                       max_shared_memory);
+        Torus, AmortizedDegree<16384>>(glwe_dimension, level_count,
+                                       num_samples);
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
           "N's are powers of two"
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
index 09a689405c..90403cbb81 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -8,54 +8,46 @@ template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
                                                    uint32_t polynomial_size,
                                                    uint32_t level_count,
-                                                   uint32_t num_samples,
-                                                   uint32_t max_shared_memory) {
+                                                   uint32_t num_samples) {
   return supports_cooperative_groups_on_programmable_bootstrap<Torus>(
-      glwe_dimension, polynomial_size, level_count, num_samples,
-      max_shared_memory);
+      glwe_dimension, polynomial_size, level_count, num_samples);
 }
 
 template <typename Torus>
-bool has_support_to_cuda_programmable_bootstrap_tbc(
-    uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory) {
+bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples,
+                                                    uint32_t glwe_dimension,
+                                                    uint32_t polynomial_size,
+                                                    uint32_t level_count) {
 #if CUDA_ARCH >= 900
   switch (polynomial_size) {
   case 256:
     return supports_thread_block_clusters_on_classic_programmable_bootstrap<
         Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
-                                     polynomial_size, level_count,
-                                     max_shared_memory);
+                                     polynomial_size, level_count);
   case 512:
     return supports_thread_block_clusters_on_classic_programmable_bootstrap<
         Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
-                                     polynomial_size, level_count,
-                                     max_shared_memory);
+                                     polynomial_size, level_count);
   case 1024:
     return supports_thread_block_clusters_on_classic_programmable_bootstrap<
         Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
   case 2048:
     return supports_thread_block_clusters_on_classic_programmable_bootstrap<
         Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
   case 4096:
     return supports_thread_block_clusters_on_classic_programmable_bootstrap<
         Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
   case 8192:
     return supports_thread_block_clusters_on_classic_programmable_bootstrap<
         Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
   case 16384:
     return supports_thread_block_clusters_on_classic_programmable_bootstrap<
         Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
-                                       polynomial_size, level_count,
-                                       max_shared_memory);
+                                       polynomial_size, level_count);
   default:
     PANIC("Cuda error (classical PBS): unsupported polynomial size. Supported "
           "N's are powers of two"
@@ -71,51 +63,50 @@ template <typename Torus>
 void scratch_cuda_programmable_bootstrap_tbc(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
   case 256:
     scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<256>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   case 512:
     scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<512>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   case 1024:
     scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<1024>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   case 2048:
     scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   case 4096:
     scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   case 8192:
     scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   case 16384:
     scratch_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   default:
     PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -131,7 +122,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) {
+    uint32_t level_count, uint32_t num_samples) {
 
   switch (polynomial_size) {
   case 256:
@@ -139,56 +130,49 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 512:
     host_programmable_bootstrap_tbc<Torus, Degree<512>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 1024:
     host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 2048:
     host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 4096:
     host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 8192:
     host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 16384:
     host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   default:
     PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -203,69 +187,68 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
  */
 uint64_t get_buffer_size_programmable_bootstrap_64(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count) {
 
   if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
           glwe_dimension, polynomial_size, level_count,
-          input_lwe_ciphertext_count, max_shared_memory))
+          input_lwe_ciphertext_count))
     return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory);
+        input_lwe_ciphertext_count);
   else
     return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory);
+        input_lwe_ciphertext_count);
 }
 
 template <typename Torus>
 void scratch_cuda_programmable_bootstrap_cg(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
   case 256:
     scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<256>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   case 512:
     scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<512>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   case 1024:
     scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<1024>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   case 2048:
     scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   case 4096:
     scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   case 8192:
     scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   case 16384:
     scratch_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
         static_cast<cudaStream_t>(stream), gpu_index, pbs_buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
     break;
   default:
     PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -278,51 +261,50 @@ template <typename Torus>
 void scratch_cuda_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
 
   switch (polynomial_size) {
   case 256:
     scratch_programmable_bootstrap<Torus, AmortizedDegree<256>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 512:
     scratch_programmable_bootstrap<Torus, AmortizedDegree<512>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 1024:
     scratch_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 2048:
     scratch_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 4096:
     scratch_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 8192:
     scratch_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   case 16384:
     scratch_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory);
+        allocate_gpu_memory);
     break;
   default:
     PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -340,31 +322,30 @@ void scratch_cuda_programmable_bootstrap(
 void scratch_cuda_programmable_bootstrap_32(
     void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
 
 #if (CUDA_ARCH >= 900)
   if (has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
           input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
-          level_count, max_shared_memory))
+          level_count))
     scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
         stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
   else
 #endif
       if (has_support_to_cuda_programmable_bootstrap_cg<uint32_t>(
               glwe_dimension, polynomial_size, level_count,
-              input_lwe_ciphertext_count, max_shared_memory))
+              input_lwe_ciphertext_count))
     scratch_cuda_programmable_bootstrap_cg<uint32_t>(
         stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
   else
     scratch_cuda_programmable_bootstrap<uint32_t>(
         stream, gpu_index, (pbs_buffer<uint32_t, CLASSICAL> **)buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
 }
 
 /*
@@ -375,31 +356,30 @@ void scratch_cuda_programmable_bootstrap_32(
 void scratch_cuda_programmable_bootstrap_64(
     void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
 
 #if (CUDA_ARCH >= 900)
   if (has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
           input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
-          level_count, max_shared_memory))
+          level_count))
     scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
         stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
   else
 #endif
       if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
               glwe_dimension, polynomial_size, level_count,
-              input_lwe_ciphertext_count, max_shared_memory))
+              input_lwe_ciphertext_count))
     scratch_cuda_programmable_bootstrap_cg<uint64_t>(
         stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
   else
     scratch_cuda_programmable_bootstrap<uint64_t>(
         stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory);
+        input_lwe_ciphertext_count, allocate_gpu_memory);
 }
 
 template <typename Torus>
@@ -409,7 +389,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) {
+    uint32_t level_count, uint32_t num_samples) {
 
   switch (polynomial_size) {
   case 256:
@@ -417,56 +397,49 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 512:
     host_programmable_bootstrap_cg<Torus, Degree<512>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 1024:
     host_programmable_bootstrap_cg<Torus, Degree<1024>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 2048:
     host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 4096:
     host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 8192:
     host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 16384:
     host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   default:
     PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -482,7 +455,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) {
+    uint32_t level_count, uint32_t num_samples) {
 
   switch (polynomial_size) {
   case 256:
@@ -490,56 +463,49 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 512:
     host_programmable_bootstrap<Torus, Degree<512>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 1024:
     host_programmable_bootstrap<Torus, Degree<1024>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 2048:
     host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 4096:
     host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 8192:
     host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case 16384:
     host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
         static_cast<cudaStream_t>(stream), gpu_index, lwe_array_out,
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
-        lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        lwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   default:
     PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -556,7 +522,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory) {
+    uint32_t num_samples) {
 
   if (base_log > 32)
     PANIC("Cuda error (classical PBS): base log should be > number of bits "
@@ -576,8 +542,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
         static_cast<uint32_t *>(lwe_array_in),
         static_cast<uint32_t *>(lwe_input_indexes),
         static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
 #else
     PANIC("Cuda error (PBS): TBC pbs is not supported.")
@@ -591,8 +556,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
         static_cast<uint32_t *>(lwe_array_in),
         static_cast<uint32_t *>(lwe_input_indexes),
         static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case DEFAULT:
     cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
@@ -603,8 +567,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
         static_cast<uint32_t *>(lwe_array_in),
         static_cast<uint32_t *>(lwe_input_indexes),
         static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   default:
     PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -647,8 +610,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
  * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
  * - level_count: number of decomposition levels in the gadget matrix (~4)
  * - num_samples: number of encrypted input messages
- *  - 'max_shared_memory' maximum amount of shared memory to be used inside
- * device functions
  *
  * This function calls a wrapper to a device kernel that performs the
  * bootstrapping:
@@ -680,7 +641,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory) {
+    uint32_t num_samples) {
   if (base_log > 64)
     PANIC("Cuda error (classical PBS): base log should be > number of bits "
           "in the ciphertext representation (64)");
@@ -699,8 +660,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
         static_cast<uint64_t *>(lwe_array_in),
         static_cast<uint64_t *>(lwe_input_indexes),
         static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
 #else
     PANIC("Cuda error (PBS): TBC pbs is not supported.")
@@ -714,8 +674,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
         static_cast<uint64_t *>(lwe_array_in),
         static_cast<uint64_t *>(lwe_input_indexes),
         static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   case PBS_VARIANT::DEFAULT:
     cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -726,8 +685,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
         static_cast<uint64_t *>(lwe_array_in),
         static_cast<uint64_t *>(lwe_input_indexes),
         static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        max_shared_memory);
+        glwe_dimension, polynomial_size, base_log, level_count, num_samples);
     break;
   default:
     PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -746,7 +704,7 @@ void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
 
 template bool has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t num_samples);
 
 template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
     void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
@@ -755,7 +713,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
     uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples);
 
 template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
     void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
@@ -764,20 +722,18 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
     uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples);
 
 template void scratch_cuda_programmable_bootstrap_cg<uint64_t>(
     void *stream, uint32_t gpu_index,
     pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 
 template void scratch_cuda_programmable_bootstrap<uint64_t>(
     void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, CLASSICAL> **buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 
 template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
     void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
@@ -786,7 +742,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
     uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples);
 
 template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
     void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
@@ -795,27 +751,25 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
     uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples);
 
 template void scratch_cuda_programmable_bootstrap_cg<uint32_t>(
     void *stream, uint32_t gpu_index,
     pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 
 template void scratch_cuda_programmable_bootstrap<uint32_t>(
     void *stream, uint32_t gpu_index, pbs_buffer<uint32_t, CLASSICAL> **buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 
 template bool has_support_to_cuda_programmable_bootstrap_tbc<uint32_t>(
     uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory);
+    uint32_t level_count);
 template bool has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
     uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory);
+    uint32_t level_count);
 
 #if CUDA_ARCH >= 900
 template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
@@ -825,7 +779,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
     uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples);
 template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
     void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
     uint64_t *lwe_output_indexes, uint64_t *lut_vector,
@@ -833,17 +787,15 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
     uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples);
 template void scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
     void *stream, uint32_t gpu_index,
     pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 template void scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
     void *stream, uint32_t gpu_index,
     pbs_buffer<uint64_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory);
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
index da33298a2f..3b00c87fd6 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -230,7 +230,7 @@ __global__ void device_programmable_bootstrap_step_two(
 template <typename Torus>
 __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count) {
 
   uint64_t full_sm_step_one =
       get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
@@ -246,6 +246,7 @@ __host__ __device__ uint64_t get_buffer_size_programmable_bootstrap(
   uint64_t full_dm = full_sm_step_one;
 
   uint64_t device_mem = 0;
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   if (max_shared_memory < partial_sm) {
     device_mem = full_dm * input_lwe_ciphertext_count * level_count *
                  (glwe_dimension + 1);
@@ -273,10 +274,8 @@ __host__ void scratch_programmable_bootstrap(
     cudaStream_t stream, uint32_t gpu_index,
     pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
 
-  cudaSetDevice(gpu_index);
   uint64_t full_sm_step_one =
       get_buffer_size_full_sm_programmable_bootstrap_step_one<Torus>(
           polynomial_size);
@@ -286,6 +285,8 @@ __host__ void scratch_programmable_bootstrap(
   uint64_t partial_sm =
       get_buffer_size_partial_sm_programmable_bootstrap<Torus>(polynomial_size);
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+
   // Configure step one
   if (max_shared_memory >= partial_sm && max_shared_memory < full_sm_step_one) {
     check_cuda_error(cudaFuncSetAttribute(
@@ -330,16 +331,18 @@ __host__ void scratch_programmable_bootstrap(
 }
 
 template <typename Torus, class params>
-__host__ void execute_step_one(
-    cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
-    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
-    double2 *bootstrapping_key, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, int8_t *d_mem,
-    uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm,
-    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
-
+__host__ void
+execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
+                 Torus *lut_vector_indexes, Torus *lwe_array_in,
+                 Torus *lwe_input_indexes, double2 *bootstrapping_key,
+                 Torus *global_accumulator, double2 *global_accumulator_fft,
+                 uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
+                 uint32_t glwe_dimension, uint32_t polynomial_size,
+                 uint32_t base_log, uint32_t level_count, int8_t *d_mem,
+                 int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
+                 uint64_t full_sm, uint64_t full_dm) {
+
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   cudaSetDevice(gpu_index);
   int thds = polynomial_size / params::opt;
   dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count);
@@ -370,16 +373,18 @@ __host__ void execute_step_one(
 }
 
 template <typename Torus, class params>
-__host__ void execute_step_two(
-    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-    Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes,
-    double2 *bootstrapping_key, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count,
-    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, int8_t *d_mem,
-    uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm,
-    uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) {
-
+__host__ void
+execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+                 Torus *lwe_output_indexes, Torus *lut_vector,
+                 Torus *lut_vector_indexes, double2 *bootstrapping_key,
+                 Torus *global_accumulator, double2 *global_accumulator_fft,
+                 uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension,
+                 uint32_t glwe_dimension, uint32_t polynomial_size,
+                 uint32_t base_log, uint32_t level_count, int8_t *d_mem,
+                 int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm,
+                 uint64_t full_sm, uint64_t full_dm) {
+
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   cudaSetDevice(gpu_index);
   int thds = polynomial_size / params::opt;
   dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1);
@@ -418,8 +423,7 @@ __host__ void host_programmable_bootstrap(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory) {
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
   cudaSetDevice(gpu_index);
 
   // With SM each block corresponds to either the mask or body, no need to
@@ -448,16 +452,14 @@ __host__ void host_programmable_bootstrap(
         stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, global_accumulator,
         global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, d_mem,
-        max_shared_memory, i, partial_sm, partial_dm_step_one, full_sm_step_one,
-        full_dm_step_one);
+        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
+        partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one);
     execute_step_two<Torus, params>(
         stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector,
         lut_vector_indexes, bootstrapping_key, global_accumulator,
         global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension,
-        glwe_dimension, polynomial_size, base_log, level_count, d_mem,
-        max_shared_memory, i, partial_sm, partial_dm_step_two, full_sm_step_two,
-        full_dm_step_two);
+        glwe_dimension, polynomial_size, base_log, level_count, d_mem, i,
+        partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two);
   }
 }
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
index 4cc025c139..2e2e30141b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -9,53 +9,45 @@
 
 bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples, uint32_t max_shared_memory) {
+    uint32_t num_samples) {
   return supports_cooperative_groups_on_multibit_programmable_bootstrap<
-      uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples,
-                max_shared_memory);
+      uint64_t>(glwe_dimension, polynomial_size, level_count, num_samples);
 }
 
 template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
     uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory) {
+    uint32_t level_count) {
 #if CUDA_ARCH >= 900
   switch (polynomial_size) {
   case 256:
     return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
         Torus, AmortizedDegree<256>>(num_samples, glwe_dimension,
-                                     polynomial_size, level_count,
-                                     max_shared_memory);
+                                     polynomial_size, level_count);
   case 512:
     return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
         Torus, AmortizedDegree<512>>(num_samples, glwe_dimension,
-                                     polynomial_size, level_count,
-                                     max_shared_memory);
+                                     polynomial_size, level_count);
   case 1024:
     return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
         Torus, AmortizedDegree<1024>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
   case 2048:
     return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
         Torus, AmortizedDegree<2048>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
   case 4096:
     return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
         Torus, AmortizedDegree<4096>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
   case 8192:
     return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
         Torus, AmortizedDegree<8192>>(num_samples, glwe_dimension,
-                                      polynomial_size, level_count,
-                                      max_shared_memory);
+                                      polynomial_size, level_count);
   case 16384:
     return supports_thread_block_clusters_on_multibit_programmable_bootstrap<
         Torus, AmortizedDegree<16384>>(num_samples, glwe_dimension,
-                                       polynomial_size, level_count,
-                                       max_shared_memory);
+                                       polynomial_size, level_count);
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
           "N's are powers of two"
@@ -74,7 +66,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
+    uint32_t lwe_chunk_size) {
 
   if (base_log > 64)
     PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -87,7 +79,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 512:
     host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -95,7 +87,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 1024:
     host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -103,7 +95,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 2048:
     host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -111,7 +103,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 4096:
     host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -119,7 +111,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 8192:
     host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -127,7 +119,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 16384:
     host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -135,7 +127,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -152,7 +144,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
+    uint32_t lwe_chunk_size) {
 
   if (base_log > 64)
     PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -165,7 +157,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 512:
     host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -173,7 +165,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 1024:
     host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -181,7 +173,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 2048:
     host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -189,7 +181,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 4096:
     host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -197,7 +189,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 8192:
     host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -205,7 +197,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 16384:
     host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -213,7 +205,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -228,8 +220,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size) {
+    uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size) {
 
   pbs_buffer<uint64_t, MULTI_BIT> *buffer =
       (pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
@@ -246,7 +237,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
         static_cast<uint64_t *>(lwe_input_indexes),
         static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
 #else
     PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
@@ -261,7 +252,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
         static_cast<uint64_t *>(lwe_input_indexes),
         static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case PBS_VARIANT::DEFAULT:
     cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -273,7 +264,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
         static_cast<uint64_t *>(lwe_input_indexes),
         static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
@@ -284,51 +275,51 @@ template <typename Torus>
 void scratch_cuda_cg_multi_bit_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    uint32_t lwe_chunk_size) {
 
   switch (polynomial_size) {
   case 256:
     scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory, lwe_chunk_size);
     break;
   case 512:
     scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory, lwe_chunk_size);
     break;
   case 1024:
     scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory, lwe_chunk_size);
     break;
   case 2048:
     scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory, lwe_chunk_size);
     break;
   case 4096:
     scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory, lwe_chunk_size);
     break;
   case 8192:
     scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory, lwe_chunk_size);
     break;
   case 16384:
     scratch_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, glwe_dimension,
         polynomial_size, level_count, input_lwe_ciphertext_count,
-        max_shared_memory, allocate_gpu_memory, lwe_chunk_size);
+        allocate_gpu_memory, lwe_chunk_size);
     break;
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -342,58 +333,58 @@ void scratch_cuda_multi_bit_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    uint32_t lwe_chunk_size) {
 
   switch (polynomial_size) {
   case 256:
     scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   case 512:
     scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   case 1024:
     scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   case 2048:
     scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   case 4096:
     scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   case 8192:
     scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   case 16384:
     scratch_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -406,34 +397,32 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64(
     void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size) {
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
 
 #if (CUDA_ARCH >= 900)
   if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
           input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
-          level_count, max_shared_memory))
+          level_count))
     scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
         stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
         lwe_dimension, glwe_dimension, polynomial_size, level_count,
-        grouping_factor, input_lwe_ciphertext_count, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory,
+        lwe_chunk_size);
   else
 #endif
       if (supports_cooperative_groups_on_multibit_programmable_bootstrap<
               uint64_t>(glwe_dimension, polynomial_size, level_count,
-                        input_lwe_ciphertext_count, max_shared_memory))
+                        input_lwe_ciphertext_count))
     scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
         stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory,
-        lwe_chunk_size);
+        input_lwe_ciphertext_count, allocate_gpu_memory, lwe_chunk_size);
   else
     scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
         stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)buffer,
         lwe_dimension, glwe_dimension, polynomial_size, level_count,
-        grouping_factor, input_lwe_ciphertext_count, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory,
+        lwe_chunk_size);
 }
 
 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
@@ -456,14 +445,15 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
  */
 template <typename Torus, class params>
 __host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
-                                     uint32_t polynomial_size,
-                                     uint32_t max_shared_memory) {
+                                     uint32_t polynomial_size) {
 
   uint64_t full_sm_keybundle =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
           polynomial_size);
 
   int max_blocks_per_sm;
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
   if (max_shared_memory < full_sm_keybundle)
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_blocks_per_sm,
@@ -513,8 +503,7 @@ template void scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
     pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size);
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size);
 
 template void
 cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -525,14 +514,14 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
     pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size);
+    uint32_t lwe_chunk_size);
 
 template void scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
     void *stream, uint32_t gpu_index,
     pbs_buffer<uint64_t, MULTI_BIT> **pbs_buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    uint32_t lwe_chunk_size);
 
 template void
 cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -543,12 +532,12 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
     pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size);
+    uint32_t lwe_chunk_size);
 
 template bool
 has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
     uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory);
+    uint32_t level_count);
 
 #if (CUDA_ARCH >= 900)
 template <typename Torus>
@@ -556,58 +545,58 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, MULTI_BIT> **buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size) {
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    uint32_t lwe_chunk_size) {
 
   switch (polynomial_size) {
   case 256:
     scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<256>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   case 512:
     scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   case 1024:
     scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   case 2048:
     scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   case 4096:
     scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   case 8192:
     scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   case 16384:
     scratch_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
         static_cast<cudaStream_t>(stream), gpu_index, buffer, lwe_dimension,
         glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count, grouping_factor, max_shared_memory,
-        allocate_gpu_memory, lwe_chunk_size);
+        input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory,
+        lwe_chunk_size);
     break;
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -623,7 +612,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
+    uint32_t lwe_chunk_size) {
 
   if (base_log > 64)
     PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -636,7 +625,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 512:
     host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -644,7 +633,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 1024:
     host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -652,7 +641,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 2048:
     host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -660,7 +649,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 4096:
     host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -668,7 +657,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 8192:
     host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -676,7 +665,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   case 16384:
     host_tbc_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -684,7 +673,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, max_shared_memory, lwe_chunk_size);
+        num_samples, lwe_chunk_size);
     break;
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -697,8 +686,8 @@ template void scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
     void *stream, uint32_t gpu_index, pbs_buffer<uint64_t, MULTI_BIT> **buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory, uint32_t lwe_chunk_size);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory,
+    uint32_t lwe_chunk_size);
 
 template void
 cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -709,5 +698,5 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
     pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size);
+    uint32_t lwe_chunk_size);
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
index b409f935ec..f913280eb9 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -376,9 +376,9 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
     pbs_buffer<Torus, MULTI_BIT> **buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size = 0) {
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   uint64_t full_sm_keybundle =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
           polynomial_size);
@@ -470,9 +470,8 @@ __host__ void scratch_multi_bit_programmable_bootstrap(
   }
 
   if (!lwe_chunk_size)
-    lwe_chunk_size =
-        get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
-                                          polynomial_size, max_shared_memory);
+    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+        gpu_index, input_lwe_ciphertext_count, polynomial_size);
   *buffer = new pbs_buffer<Torus, MULTI_BIT>(
       stream, gpu_index, glwe_dimension, polynomial_size, level_count,
       input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT,
@@ -486,9 +485,8 @@ __host__ void execute_compute_keybundle(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size, int lwe_offset) {
+    uint32_t lwe_chunk_size, int lwe_offset) {
 
-  cudaSetDevice(gpu_index);
   uint32_t chunk_size =
       std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
 
@@ -499,6 +497,8 @@ __host__ void execute_compute_keybundle(
   uint64_t full_sm_keybundle =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
           polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
 
   auto d_mem = buffer->d_mem_keybundle;
   auto keybundle_fft = buffer->keybundle_fft;
@@ -526,22 +526,23 @@ __host__ void execute_compute_keybundle(
 }
 
 template <typename Torus, class params>
-__host__ void
-execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
-                 Torus *lut_vector_indexes, Torus *lwe_array_in,
-                 Torus *lwe_input_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
-                 uint32_t num_samples, uint32_t lwe_dimension,
-                 uint32_t glwe_dimension, uint32_t polynomial_size,
-                 uint32_t base_log, uint32_t level_count,
-                 uint32_t max_shared_memory, int j, int lwe_offset) {
+__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
+                               Torus *lut_vector, Torus *lut_vector_indexes,
+                               Torus *lwe_array_in, Torus *lwe_input_indexes,
+                               pbs_buffer<Torus, MULTI_BIT> *buffer,
+                               uint32_t num_samples, uint32_t lwe_dimension,
+                               uint32_t glwe_dimension,
+                               uint32_t polynomial_size, uint32_t base_log,
+                               uint32_t level_count, int j, int lwe_offset) {
 
-  cudaSetDevice(gpu_index);
   uint64_t full_sm_accumulate_step_one =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
           polynomial_size);
   uint64_t partial_sm_accumulate_step_one =
       get_buffer_size_partial_sm_multibit_programmable_bootstrap_step_one<
           Torus>(polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
 
   //
   auto d_mem = buffer->d_mem_acc_step_one;
@@ -581,19 +582,20 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
 }
 
 template <typename Torus, class params>
-__host__ void
-execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
-                 Torus *lwe_output_indexes,
-                 pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
-                 uint32_t lwe_dimension, uint32_t glwe_dimension,
-                 uint32_t polynomial_size, int32_t grouping_factor,
-                 uint32_t level_count, uint32_t max_shared_memory, int j,
-                 int lwe_offset, uint32_t lwe_chunk_size) {
+__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
+                               Torus *lwe_array_out, Torus *lwe_output_indexes,
+                               pbs_buffer<Torus, MULTI_BIT> *buffer,
+                               uint32_t num_samples, uint32_t lwe_dimension,
+                               uint32_t glwe_dimension,
+                               uint32_t polynomial_size,
+                               int32_t grouping_factor, uint32_t level_count,
+                               int j, int lwe_offset, uint32_t lwe_chunk_size) {
 
-  cudaSetDevice(gpu_index);
   uint64_t full_sm_accumulate_step_two =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
           polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
 
   auto d_mem = buffer->d_mem_acc_step_two;
   auto keybundle_fft = buffer->keybundle_fft;
@@ -630,13 +632,12 @@ __host__ void host_multi_bit_programmable_bootstrap(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
-  cudaSetDevice(gpu_index);
+    uint32_t lwe_chunk_size = 0) {
 
   // If a chunk size is not passed to this function, select one.
   if (!lwe_chunk_size)
-    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
-        gpu_index, num_samples, polynomial_size, max_shared_memory);
+    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(gpu_index, num_samples,
+                                                       polynomial_size);
 
   for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
        lwe_offset += lwe_chunk_size) {
@@ -645,8 +646,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
     execute_compute_keybundle<Torus, params>(
         stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
         buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, max_shared_memory,
-        lwe_chunk_size, lwe_offset);
+        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
     // Accumulate
     uint32_t chunk_size = std::min(
         lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
@@ -654,14 +654,12 @@ __host__ void host_multi_bit_programmable_bootstrap(
       execute_step_one<Torus, params>(
           stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
           lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension,
-          polynomial_size, base_log, level_count, max_shared_memory, j,
-          lwe_offset);
+          polynomial_size, base_log, level_count, j, lwe_offset);
 
       execute_step_two<Torus, params>(
           stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer,
           num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-          grouping_factor, level_count, max_shared_memory, j, lwe_offset,
-          lwe_chunk_size);
+          grouping_factor, level_count, j, lwe_offset, lwe_chunk_size);
     }
   }
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
index f63f22748e..a32a88c0e7 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -165,13 +165,11 @@ __host__ void scratch_programmable_bootstrap_tbc(
     cudaStream_t stream, uint32_t gpu_index,
     pbs_buffer<Torus, CLASSICAL> **buffer, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
-    bool allocate_gpu_memory) {
-  cudaSetDevice(gpu_index);
+    uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) {
 
   bool supports_dsm =
       supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-          Torus>(polynomial_size, max_shared_memory);
+          Torus>(polynomial_size);
 
   uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
       polynomial_size);
@@ -183,6 +181,7 @@ __host__ void scratch_programmable_bootstrap_tbc(
     minimum_sm_tbc =
         get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
             polynomial_size);
+  int max_shared_memory = cuda_get_max_shared_memory(0);
 
   if (max_shared_memory >= full_sm + minimum_sm_tbc) {
     check_cuda_error(cudaFuncSetAttribute(
@@ -226,13 +225,11 @@ __host__ void host_programmable_bootstrap_tbc(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t max_shared_memory) {
-  cudaSetDevice(gpu_index);
+    uint32_t level_count, uint32_t input_lwe_ciphertext_count) {
 
   auto supports_dsm =
       supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-          Torus>(polynomial_size, max_shared_memory);
+          Torus>(polynomial_size);
 
   // With SM each block corresponds to either the mask or body, no need to
   // duplicate data for each
@@ -247,6 +244,9 @@ __host__ void host_programmable_bootstrap_tbc(
         get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
             polynomial_size);
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
+
   uint64_t full_dm = full_sm;
 
   uint64_t partial_dm = full_dm - partial_sm;
@@ -306,8 +306,7 @@ __host__ void host_programmable_bootstrap_tbc(
 // Verify if the grid size satisfies the cooperative group constraints
 template <typename Torus, class params>
 __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
-    int glwe_dimension, int level_count, int num_samples,
-    uint32_t max_shared_memory) {
+    int glwe_dimension, int level_count, int num_samples) {
 
   // If Cooperative Groups is not supported, no need to check anything else
   if (!cuda_check_support_cooperative_groups())
@@ -321,12 +320,12 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
       get_buffer_size_partial_sm_programmable_bootstrap_tbc<Torus>(
           params::degree);
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   int thds = params::degree / params::opt;
 
   // Get the maximum number of active blocks per streaming multiprocessors
   int number_of_blocks = level_count * (glwe_dimension + 1) * num_samples;
   int max_active_blocks_per_sm;
-
   if (max_shared_memory < partial_sm) {
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_active_blocks_per_sm,
@@ -353,11 +352,12 @@ __host__ bool verify_cuda_programmable_bootstrap_tbc_grid_size(
 template <typename Torus>
 __host__ bool
 supports_distributed_shared_memory_on_classic_programmable_bootstrap(
-    uint32_t polynomial_size, uint32_t max_shared_memory) {
+    uint32_t polynomial_size) {
   uint64_t minimum_sm =
       get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
           polynomial_size);
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   if (max_shared_memory < minimum_sm) {
     // If we cannot store a single polynomial in a block shared memory we cannot
     // use TBC
@@ -370,7 +370,7 @@ supports_distributed_shared_memory_on_classic_programmable_bootstrap(
 template <typename Torus, class params>
 __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
     uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory) {
+    uint32_t level_count) {
 
   if (!cuda_check_support_thread_block_clusters() || num_samples > 128)
     return false;
@@ -382,7 +382,7 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
           polynomial_size);
   uint64_t minimum_sm_tbc = 0;
   if (supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-          Torus>(polynomial_size, max_shared_memory))
+          Torus>(polynomial_size))
     minimum_sm_tbc =
         get_buffer_size_sm_dsm_plus_tbc_classic_programmable_bootstrap<Torus>(
             polynomial_size);
@@ -405,6 +405,7 @@ __host__ bool supports_thread_block_clusters_on_classic_programmable_bootstrap(
    * case and it will fail if we try. Thus, since level_count *
    * (glwe_dimension+1) is usually smaller than 8 at this moment, we will
    * disable cudaFuncAttributeNonPortableClusterSizeAllowed */
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   if (max_shared_memory < partial_sm + minimum_sm_tbc) {
     check_cuda_error(cudaFuncSetAttribute(
         device_programmable_bootstrap_tbc<Torus, params, NOSM>,
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
index 57378b5127..7e13451fd1 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -172,14 +172,11 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
     pbs_buffer<uint64_t, MULTI_BIT> **buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
     uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor,
-    uint32_t max_shared_memory, bool allocate_gpu_memory,
-    uint32_t lwe_chunk_size = 0) {
-
-  cudaSetDevice(gpu_index);
+    bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) {
 
   bool supports_dsm =
       supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
-          Torus>(polynomial_size, max_shared_memory);
+          Torus>(polynomial_size);
 
   uint64_t full_sm_keybundle =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle<Torus>(
@@ -196,6 +193,8 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
         get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
             polynomial_size);
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+
   if (max_shared_memory < full_sm_keybundle) {
     check_cuda_error(cudaFuncSetAttribute(
         device_multi_bit_programmable_bootstrap_keybundle<Torus, params, NOSM>,
@@ -254,9 +253,8 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap(
   }
 
   if (!lwe_chunk_size)
-    lwe_chunk_size =
-        get_lwe_chunk_size<Torus, params>(gpu_index, input_lwe_ciphertext_count,
-                                          polynomial_size, max_shared_memory);
+    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
+        gpu_index, input_lwe_ciphertext_count, polynomial_size);
   *buffer = new pbs_buffer<uint64_t, MULTI_BIT>(
       stream, gpu_index, glwe_dimension, polynomial_size, level_count,
       input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC,
@@ -271,12 +269,11 @@ __host__ void execute_tbc_external_product_loop(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) {
+    uint32_t lwe_chunk_size, int lwe_offset) {
 
-  cudaSetDevice(gpu_index);
   auto supports_dsm =
       supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
-          Torus>(polynomial_size, max_shared_memory);
+          Torus>(polynomial_size);
 
   uint64_t full_dm =
       get_buffer_size_full_sm_tbc_multibit_programmable_bootstrap<Torus>(
@@ -290,6 +287,9 @@ __host__ void execute_tbc_external_product_loop(
         get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
             polynomial_size);
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
+
   uint32_t keybundle_size_per_input =
       lwe_chunk_size * level_count * (glwe_dimension + 1) *
       (glwe_dimension + 1) * (polynomial_size / 2);
@@ -365,12 +365,12 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
+    uint32_t lwe_chunk_size = 0) {
   cudaSetDevice(gpu_index);
 
   if (!lwe_chunk_size)
-    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(
-        gpu_index, num_samples, polynomial_size, max_shared_memory);
+    lwe_chunk_size = get_lwe_chunk_size<Torus, params>(gpu_index, num_samples,
+                                                       polynomial_size);
 
   for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor);
        lwe_offset += lwe_chunk_size) {
@@ -379,27 +379,26 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
     execute_compute_keybundle<Torus, params>(
         stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key,
         buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, max_shared_memory,
-        lwe_chunk_size, lwe_offset);
+        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
 
     // Accumulate
     execute_tbc_external_product_loop<Torus, params>(
         stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer,
         num_samples, lwe_dimension, glwe_dimension, polynomial_size,
-        grouping_factor, base_log, level_count, lwe_chunk_size,
-        max_shared_memory, lwe_offset);
+        grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset);
   }
 }
 
 template <typename Torus>
 __host__ bool
 supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
-    uint32_t polynomial_size, uint32_t max_shared_memory) {
+    uint32_t polynomial_size) {
   uint64_t minimum_sm =
       get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
           polynomial_size);
 
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   if (max_shared_memory <= minimum_sm) {
     // If we cannot store a single polynomial in a block shared memory we
     // cannot use TBC
@@ -412,7 +411,7 @@ supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
 template <typename Torus, class params>
 __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
     uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count, uint32_t max_shared_memory) {
+    uint32_t level_count) {
 
   if (!cuda_check_support_thread_block_clusters())
     return false;
@@ -425,7 +424,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
           polynomial_size);
   uint64_t minimum_sm_tbc_accumulate = 0;
   if (supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
-          Torus>(polynomial_size, max_shared_memory))
+          Torus>(polynomial_size))
     minimum_sm_tbc_accumulate =
         get_buffer_size_sm_dsm_plus_tbc_multibit_programmable_bootstrap<Torus>(
             polynomial_size);
@@ -448,6 +447,7 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
    * case and it will fail if we try. Thus, since level_count *
    * (glwe_dimension+1) is usually smaller than 8 at this moment, we will
    * disable cudaFuncAttributeNonPortableClusterSizeAllowed */
+  int max_shared_memory = cuda_get_max_shared_memory(0);
   if (max_shared_memory <
       partial_sm_tbc_accumulate + minimum_sm_tbc_accumulate) {
     check_cuda_error(cudaFuncSetAttribute(
@@ -487,5 +487,5 @@ __host__ bool supports_thread_block_clusters_on_multibit_programmable_bootstrap(
 
 template __host__ bool
 supports_distributed_shared_memory_on_multibit_programmable_bootstrap<uint64_t>(
-    uint32_t polynomial_size, uint32_t max_shared_memory);
+    uint32_t polynomial_size);
 #endif // FASTMULTIBIT_PBS_H
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp
index 6fc06098da..d926486e67 100644
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp
@@ -174,7 +174,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
 (benchmark::State &st) {
   if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
           input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
-          pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) {
+          pbs_level)) {
     st.SkipWithError("Configuration not supported for tbc operation");
     return;
   }
@@ -182,8 +182,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
   scratch_cuda_tbc_multi_bit_programmable_bootstrap<uint64_t>(
       stream, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer, lwe_dimension,
       glwe_dimension, polynomial_size, pbs_level, grouping_factor,
-      input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index),
-      true, chunk_size);
+      input_lwe_ciphertext_count, true, chunk_size);
 
   for (auto _ : st) {
     // Execute PBS
@@ -192,8 +191,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
         d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
         (pbs_buffer<uint64_t, MULTI_BIT> *)buffer, lwe_dimension,
         glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
-        pbs_level, input_lwe_ciphertext_count,
-        cuda_get_max_shared_memory(stream->gpu_index), chunk_size);
+        pbs_level, input_lwe_ciphertext_count, chunk_size);
     cuda_synchronize_stream(stream);
   }
 
@@ -205,7 +203,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
 (benchmark::State &st) {
   if (!has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
           glwe_dimension, polynomial_size, pbs_level,
-          input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) {
+          input_lwe_ciphertext_count)) {
     st.SkipWithError("Configuration not supported for fast operation");
     return;
   }
@@ -213,7 +211,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
   scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
       stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
       glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
-      cuda_get_max_shared_memory(gpu_index), true, chunk_size);
+      true, chunk_size);
 
   for (auto _ : st) {
     // Execute PBS
@@ -222,8 +220,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
         d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
         d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
         lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-        pbs_base_log, pbs_level, input_lwe_ciphertext_count,
-        cuda_get_max_shared_memory(gpu_index), chunk_size);
+        pbs_base_log, pbs_level, input_lwe_ciphertext_count, chunk_size);
     cuda_synchronize_stream(stream, gpu_index);
   }
 
@@ -235,8 +232,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
   scratch_cuda_multi_bit_programmable_bootstrap<uint64_t>(
       stream, gpu_index, (pbs_buffer<uint64_t, MULTI_BIT> **)&buffer,
       lwe_dimension, glwe_dimension, polynomial_size, pbs_level,
-      grouping_factor, input_lwe_ciphertext_count,
-      cuda_get_max_shared_memory(gpu_index), true, chunk_size);
+      grouping_factor, input_lwe_ciphertext_count, true, chunk_size);
 
   for (auto _ : st) {
     // Execute PBS
@@ -245,8 +241,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
         d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
         d_lwe_input_indexes, d_bsk, (pbs_buffer<uint64_t, MULTI_BIT> *)buffer,
         lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-        pbs_base_log, pbs_level, input_lwe_ciphertext_count,
-        cuda_get_max_shared_memory(gpu_index), chunk_size);
+        pbs_base_log, pbs_level, input_lwe_ciphertext_count, chunk_size);
     cuda_synchronize_stream(stream, gpu_index);
   }
 
@@ -258,15 +253,14 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
 (benchmark::State &st) {
   if (!has_support_to_cuda_programmable_bootstrap_tbc<uint64_t>(
           input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
-          pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) {
+          pbs_level)) {
     st.SkipWithError("Configuration not supported for tbc operation");
     return;
   }
 
   scratch_cuda_programmable_bootstrap_tbc<uint64_t>(
       stream, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer, glwe_dimension,
-      polynomial_size, pbs_level, input_lwe_ciphertext_count,
-      cuda_get_max_shared_memory(stream->gpu_index), true);
+      polynomial_size, pbs_level, input_lwe_ciphertext_count, true);
 
   for (auto _ : st) {
     // Execute PBS
@@ -277,8 +271,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
         (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
         (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
         glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        input_lwe_ciphertext_count,
-        cuda_get_max_shared_memory(stream->gpu_index));
+        input_lwe_ciphertext_count);
     cuda_synchronize_stream(stream);
   }
 
@@ -290,7 +283,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
 (benchmark::State &st) {
   if (!has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
           glwe_dimension, polynomial_size, pbs_level,
-          input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index))) {
+          input_lwe_ciphertext_count)) {
     st.SkipWithError("Configuration not supported for fast operation");
     return;
   }
@@ -298,7 +291,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
   scratch_cuda_programmable_bootstrap_cg<uint64_t>(
       stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
       glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
-      cuda_get_max_shared_memory(gpu_index), true);
+      true);
 
   for (auto _ : st) {
     // Execute PBS
@@ -309,7 +302,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
         (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
         (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
         glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
+        input_lwe_ciphertext_count);
     cuda_synchronize_stream(stream, gpu_index);
   }
 
@@ -322,7 +315,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS)
   scratch_cuda_programmable_bootstrap<uint64_t>(
       stream, gpu_index, (pbs_buffer<uint64_t, CLASSICAL> **)&buffer,
       glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count,
-      cuda_get_max_shared_memory(gpu_index), true);
+      true);
 
   for (auto _ : st) {
     // Execute PBS
@@ -333,7 +326,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS)
         (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
         (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
         glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
+        input_lwe_ciphertext_count);
     cuda_synchronize_stream(stream, gpu_index);
   }
 
@@ -345,7 +338,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, AmortizedPBS)
 
   scratch_cuda_programmable_bootstrap_amortized_64(
       stream, gpu_index, &buffer, glwe_dimension, polynomial_size,
-      input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index), true);
+      input_lwe_ciphertext_count, true);
 
   for (auto _ : st) {
     // Execute PBS
@@ -355,7 +348,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, AmortizedPBS)
         (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in_array,
         (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, buffer,
         lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
+        input_lwe_ciphertext_count);
     cuda_synchronize_stream(stream, gpu_index);
   }
 
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
index f084244d04..cc6b11ba37 100644
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
@@ -107,7 +107,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) {
   int8_t *pbs_buffer;
   scratch_cuda_programmable_bootstrap_amortized_64(
       stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
-      number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
+      number_of_inputs, true);
 
   int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                  polynomial_size * (lwe_dimension + 1);
@@ -128,7 +128,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) {
           (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
           (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
           lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
-          pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index));
+          pbs_level, number_of_inputs);
       // Copy result back
       cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                                (glwe_dimension * polynomial_size + 1) *
@@ -165,9 +165,9 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) {
 
 TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
   int8_t *pbs_buffer;
-  scratch_cuda_programmable_bootstrap_64(
-      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
-      pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
+  scratch_cuda_programmable_bootstrap_64(stream, gpu_index, &pbs_buffer,
+                                         glwe_dimension, polynomial_size,
+                                         pbs_level, number_of_inputs, true);
 
   int number_of_sm = 0;
   cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
@@ -190,7 +190,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
           (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
           (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
           lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
-          pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index));
+          pbs_level, number_of_inputs);
       // Copy result back
       cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                                (glwe_dimension * polynomial_size + 1) *
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
index b116a777e4..82e3bbb193 100644
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
@@ -93,8 +93,7 @@ class MultiBitProgrammableBootstrapTestPrimitives_u64
 
     scratch_cuda_multi_bit_programmable_bootstrap_64(
         stream, gpu_index, &pbs_buffer, lwe_dimension, glwe_dimension,
-        polynomial_size, pbs_level, grouping_factor, number_of_inputs,
-        cuda_get_max_shared_memory(gpu_index), true);
+        polynomial_size, pbs_level, grouping_factor, number_of_inputs, true);
 
     lwe_ct_out_array =
         (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
@@ -136,8 +135,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
           (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
           (void *)d_lwe_input_indexes, (void *)d_bsk, pbs_buffer, lwe_dimension,
           glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
-          pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index),
-          0);
+          pbs_level, number_of_inputs, 0);
 
       // Copy result to the host memory
       cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index 8d84b14a63..d028c1a554 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -67,9 +67,6 @@ extern "C" {
     /// Free memory for pointer `ptr` on GPU `gpu_index` synchronously
     pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32);
 
-    /// Get the maximum amount of shared memory on GPU `gpu_index`
-    pub fn cuda_get_max_shared_memory(gpu_index: u32) -> i32;
-
     pub fn cuda_setup_multi_gpu() -> i32;
 
     /// Copy a bootstrap key `src` represented with 64 bits in the standard domain from the CPU to
@@ -136,7 +133,6 @@ extern "C" {
         polynomial_size: u32,
         level_count: u32,
         input_lwe_ciphertext_count: u32,
-        max_shared_memory: u32,
         allocate_gpu_memory: bool,
     );
 
@@ -168,7 +164,6 @@ extern "C" {
     /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8)
     /// - `level_count`: number of decomposition levels in the gadget matrix (~4)
     /// - `num_samples`: number of encrypted input messages
-    ///  - `max_shared_memory` maximum amount of shared memory to be used inside device functions
     ///
     /// This function calls a wrapper to a device kernel that performs the
     /// bootstrapping:
@@ -205,7 +200,6 @@ extern "C" {
         base_log: u32,
         level: u32,
         num_samples: u32,
-        max_shared_memory: u32,
     );
 
     /// This cleanup function frees the data for the low latency PBS on GPU
@@ -228,7 +222,6 @@ extern "C" {
         level_count: u32,
         grouping_factor: u32,
         input_lwe_ciphertext_count: u32,
-        max_shared_memory: u32,
         allocate_gpu_memory: bool,
         lwe_chunk_size: u32,
     );
@@ -259,7 +252,6 @@ extern "C" {
     /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8)
     /// - `level_count`: number of decomposition levels in the gadget matrix (~4)
     /// - `num_samples`: number of encrypted input messages
-    ///  - `max_shared_memory` maximum amount of shared memory to be used inside device functions
     pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
         stream: *mut c_void,
         gpu_index: u32,
@@ -278,7 +270,6 @@ extern "C" {
         base_log: u32,
         level: u32,
         num_samples: u32,
-        max_shared_memory: u32,
         lwe_chunk_size: u32,
     );
 
@@ -471,7 +462,6 @@ extern "C" {
         grouping_factor: u32,
         num_blocks: u32,
         pbs_type: u32,
-        max_shared_memory: u32,
         allocate_gpu_memory: bool,
     );
 
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 709da6fca4..26ae612e8e 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -119,7 +119,6 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
         polynomial_size.0 as u32,
         level.0 as u32,
         num_samples,
-        get_max_shared_memory(0) as u32,
         true,
     );
     cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
@@ -139,7 +138,6 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
         base_log.0 as u32,
         level.0 as u32,
         num_samples,
-        get_max_shared_memory(streams.gpu_indexes[0]) as u32,
     );
     cleanup_cuda_programmable_bootstrap(
         streams.ptr[0],
@@ -183,7 +181,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
         level.0 as u32,
         grouping_factor.0 as u32,
         num_samples,
-        get_max_shared_memory(0) as u32,
         true,
         0u32,
     );
@@ -205,7 +202,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
         base_log.0 as u32,
         level.0 as u32,
         num_samples,
-        get_max_shared_memory(0) as u32,
         0,
     );
     cleanup_cuda_multi_bit_programmable_bootstrap(
@@ -577,11 +573,6 @@ pub struct CudaGlweList<T: UnsignedInteger> {
     pub ciphertext_modulus: CiphertextModulus<T>,
 }
 
-/// Get the maximum amount of shared memory on a device
-pub fn get_max_shared_memory(gpu_index: u32) -> i32 {
-    unsafe { cuda_get_max_shared_memory(gpu_index) }
-}
-
 /// Get the number of GPUs on the machine
 pub fn get_number_of_gpus() -> i32 {
     unsafe { cuda_get_number_of_gpus() }
@@ -611,8 +602,6 @@ mod tests {
     #[test]
     fn print_gpu_info() {
         println!("Number of GPUs: {}", get_number_of_gpus());
-        let gpu_index: u32 = 0;
-        println!("Max shared memory: {}", get_max_shared_memory(gpu_index))
     }
     #[test]
     fn allocate_and_copy() {
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index 654ddce257..553b18d1a3 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -3,7 +3,7 @@ pub mod server_key;
 
 use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut};
 use crate::core_crypto::gpu::vec::CudaVec;
-use crate::core_crypto::gpu::{get_max_shared_memory, CudaStreams};
+use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::{
     DecompositionBaseLog, DecompositionLevelCount, GlweDimension, LweBskGroupingFactor,
     LweDimension, Numeric, PolynomialSize, UnsignedInteger,
@@ -368,7 +368,6 @@ pub unsafe fn unchecked_mul_integer_radix_kb_assign_async<T: UnsignedInteger, B:
         grouping_factor.0 as u32,
         num_blocks,
         pbs_type as u32,
-        get_max_shared_memory(streams.gpu_indexes[0]) as u32,
         true,
     );
     cuda_integer_mult_radix_ciphertext_kb_64(