diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
index 0e9144930d..bdfdabdd5d 100644
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h
@@ -41,8 +41,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory);
+    uint32_t num_samples, uint32_t max_shared_memory);
 
 void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -50,8 +49,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory);
+    uint32_t num_samples, uint32_t max_shared_memory);
 
 void cleanup_cuda_programmable_bootstrap_amortized(void *stream,
                                                    uint32_t gpu_index,
@@ -75,8 +73,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory);
+    uint32_t num_samples, uint32_t max_shared_memory);
 
 void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *stream, uint32_t gpu_index, void *lwe_array_out,
@@ -84,8 +81,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory);
+    uint32_t num_samples, uint32_t max_shared_memory);
 
 void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                          int8_t **pbs_buffer);
@@ -353,8 +349,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
 
 template <typename Torus>
 void cuda_programmable_bootstrap_lwe_ciphertext_vector(
@@ -363,8 +358,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
 
 #if (CUDA_ARCH >= 900)
 template <typename Torus>
@@ -374,8 +368,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
 
 template <typename Torus>
 void scratch_cuda_programmable_bootstrap_tbc(
diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
index 7a636f404c..6d5d30781f 100644
--- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
+++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h
@@ -28,8 +28,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory,
+    uint32_t lwe_chunk_size = 0);
 
 void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream,
                                                    uint32_t gpu_index,
@@ -63,8 +63,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size);
+    uint32_t max_shared_memory, uint32_t lwe_chunk_size);
 #endif
 
 template <typename Torus>
@@ -90,8 +89,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
 
 template <typename Torus>
 void scratch_cuda_multi_bit_programmable_bootstrap(
@@ -109,8 +107,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size = 0);
+    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0);
 
 template <typename Torus>
 __host__ __device__ uint64_t
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 71f9532a47..4469e71f3b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -177,7 +177,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
         lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
         small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, 1, 0,
+        grouping_factor, num_radix_blocks,
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
   } else {
     /// Make sure all data that should be on GPU 0 is indeed there
@@ -204,7 +204,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb(
         lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
         lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
         glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-        pbs_level, grouping_factor, num_radix_blocks, 1, 0,
+        pbs_level, grouping_factor, num_radix_blocks,
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
 
     /// Copy data back to GPU 0 and release vecs
@@ -270,7 +270,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0],
         lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension,
         small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        grouping_factor, num_radix_blocks, 1, 0,
+        grouping_factor, num_radix_blocks,
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
   } else {
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -293,7 +293,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb(
         lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec,
         lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer,
         glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log,
-        pbs_level, grouping_factor, num_radix_blocks, 1, 0,
+        pbs_level, grouping_factor, num_radix_blocks,
         cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type);
 
     /// Copy data back to GPU 0 and release vecs
@@ -696,8 +696,8 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
         mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer,
         params.glwe_dimension, params.small_lwe_dimension,
         params.polynomial_size, params.pbs_base_log, params.pbs_level,
-        params.grouping_factor, 2, 2, 0,
-        cuda_get_max_shared_memory(gpu_indexes[0]), params.pbs_type);
+        params.grouping_factor, 2, cuda_get_max_shared_memory(gpu_indexes[0]),
+        params.pbs_type);
 
     cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector,
                                  big_lwe_size * sizeof(Torus), streams[0],
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index a5feeb8102..d2caa89927 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -372,7 +372,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
           small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer,
           glwe_dimension, small_lwe_dimension, polynomial_size,
           mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
-          mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
+          mem_ptr->params.grouping_factor, total_count, max_shared_memory,
           mem_ptr->params.pbs_type);
     } else {
       cuda_synchronize_stream(streams[0], gpu_indexes[0]);
@@ -420,7 +420,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
           lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer,
           glwe_dimension, small_lwe_dimension, polynomial_size,
           mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level,
-          mem_ptr->params.grouping_factor, total_count, 2, 0, max_shared_memory,
+          mem_ptr->params.grouping_factor, total_count, max_shared_memory,
           mem_ptr->params.pbs_type);
 
       multi_gpu_gather_lwe_async<Torus>(
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
index cc5bd54fa3..dacddb7bcc 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh
@@ -127,8 +127,8 @@ void execute_pbs_async(
     std::vector<int8_t *> pbs_buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t grouping_factor,
-    uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory, PBS_TYPE pbs_type) {
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory,
+    PBS_TYPE pbs_type) {
   switch (sizeof(Torus)) {
   case sizeof(uint32_t):
     // 32 bits
@@ -160,8 +160,8 @@ void execute_pbs_async(
             current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
             current_lwe_array_in, current_lwe_input_indexes,
             bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
-            lwe_idx, max_shared_memory);
+            polynomial_size, base_log, level_count, num_inputs_on_gpu,
+            max_shared_memory);
       }
       break;
     default:
@@ -200,7 +200,7 @@ void execute_pbs_async(
             current_lwe_array_in, current_lwe_input_indexes,
             bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
             polynomial_size, grouping_factor, base_log, level_count,
-            num_inputs_on_gpu, num_luts, lwe_idx, max_shared_memory);
+            num_inputs_on_gpu, max_shared_memory);
       }
       break;
     case CLASSICAL:
@@ -228,8 +228,8 @@ void execute_pbs_async(
             current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes,
             current_lwe_array_in, current_lwe_input_indexes,
             bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension,
-            polynomial_size, base_log, level_count, num_inputs_on_gpu, num_luts,
-            lwe_idx, max_shared_memory);
+            polynomial_size, base_log, level_count, num_inputs_on_gpu,
+            max_shared_memory);
       }
       break;
     default:
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
index 0cf9b91498..084756a374 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu
@@ -143,8 +143,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory) {
+    uint32_t num_samples, uint32_t max_shared_memory) {
 
   if (base_log > 32)
     PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
@@ -158,7 +157,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   case 512:
     host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<512>>(
@@ -167,7 +166,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   case 1024:
     host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<1024>>(
@@ -176,7 +175,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   case 2048:
     host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<2048>>(
@@ -185,7 +184,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   case 4096:
     host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<4096>>(
@@ -194,7 +193,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   case 8192:
     host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<8192>>(
@@ -203,7 +202,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   case 16384:
     host_programmable_bootstrap_amortized<uint32_t, AmortizedDegree<16384>>(
@@ -212,7 +211,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
         (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in,
         (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   default:
     PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
@@ -254,15 +253,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32(
  * - base_log: log of the base used for the gadget matrix - B = 2^base_log (~8)
  * - level_count: number of decomposition levels in the gadget matrix (~4)
  * - num_samples: number of encrypted input messages
- * - num_luts: parameter to set the actual number of luts to be
  * used
- * - lwe_idx: the index of the LWE input to consider for the GPU of index
- * gpu_index. In case of multi-GPU computing, it is assumed that only a part of
- * the input LWE array is copied to each GPU, but the whole LUT array is copied
- * (because the case when the number of LUTs is smaller than the number of input
- * LWEs is not trivial to take into account in the data repartition on the
- * GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
- * input in the LUT array `lut_vector`.
  *  - 'max_shared_memory' maximum amount of shared memory to be used inside
  * device functions
  *
@@ -292,8 +283,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory) {
+    uint32_t num_samples, uint32_t max_shared_memory) {
 
   if (base_log > 64)
     PANIC("Cuda error (amortized PBS): base log should be > number of bits in "
@@ -307,7 +297,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   case 512:
     host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<512>>(
@@ -316,7 +306,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   case 1024:
     host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<1024>>(
@@ -325,7 +315,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   case 2048:
     host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<2048>>(
@@ -334,7 +324,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   case 4096:
     host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<4096>>(
@@ -343,7 +333,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   case 8192:
     host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<8192>>(
@@ -352,7 +342,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   case 16384:
     host_programmable_bootstrap_amortized<uint64_t, AmortizedDegree<16384>>(
@@ -361,7 +351,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64(
         (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in,
         (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer,
         glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory);
+        num_samples, max_shared_memory);
     break;
   default:
     PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported "
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
index be7cbf5b75..ee9fb4e1b9 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh
@@ -42,7 +42,6 @@ template <typename Torus, class params, sharedMemDegree SMD>
  *  - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
  *  - level_count: number of decomposition levels in the gadget matrix (~4)
  *  - gpu_num: index of the current GPU (useful for multi-GPU computations)
- *  - lwe_idx: equal to the number of samples per gpu x gpu_num
  *  - device_memory_size_per_sample: amount of global memory to allocate if SMD
  * is not FULLSM
  */
@@ -54,7 +53,7 @@ __global__ void device_programmable_bootstrap_amortized(
     const Torus *__restrict__ lwe_input_indexes,
     const double2 *__restrict__ bootstrapping_key, int8_t *device_mem,
     uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size,
-    uint32_t base_log, uint32_t level_count, uint32_t lwe_idx,
+    uint32_t base_log, uint32_t level_count,
     size_t device_memory_size_per_sample) {
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -84,7 +83,7 @@ __global__ void device_programmable_bootstrap_amortized(
   auto block_lwe_array_in =
       &lwe_array_in[lwe_input_indexes[blockIdx.x] * (lwe_dimension + 1)];
   const Torus *block_lut_vector =
-      &lut_vector[lut_vector_indexes[lwe_idx + blockIdx.x] * params::degree *
+      &lut_vector[lut_vector_indexes[blockIdx.x] * params::degree *
                   (glwe_dimension + 1)];
 
   // Put "b", the body, in [0, 2N[
@@ -300,8 +299,7 @@ __host__ void host_programmable_bootstrap_amortized(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory) {
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
 
   cudaSetDevice(gpu_index);
   uint64_t SM_FULL =
@@ -335,14 +333,14 @@ __host__ void host_programmable_bootstrap_amortized(
             lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
             lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
             glwe_dimension, lwe_dimension, polynomial_size, base_log,
-            level_count, lwe_idx, DM_FULL);
+            level_count, DM_FULL);
   } else if (max_shared_memory < SM_FULL) {
     device_programmable_bootstrap_amortized<Torus, params, PARTIALSM>
         <<<grid, thds, SM_PART, stream>>>(
             lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
             lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
             glwe_dimension, lwe_dimension, polynomial_size, base_log,
-            level_count, lwe_idx, DM_PART);
+            level_count, DM_PART);
   } else {
     // For devices with compute capability 7.x a single thread block can
     // address the full capacity of shared memory. Shared memory on the
@@ -354,7 +352,7 @@ __host__ void host_programmable_bootstrap_amortized(
             lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes,
             lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer,
             glwe_dimension, lwe_dimension, polynomial_size, base_log,
-            level_count, lwe_idx, 0);
+            level_count, 0);
   }
   check_cuda_error(cudaGetLastError());
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
index 32913f5f21..537eb5c66d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh
@@ -204,7 +204,7 @@ __host__ void host_programmable_bootstrap_cg(
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t num_luts, uint32_t max_shared_memory) {
+    uint32_t max_shared_memory) {
   cudaSetDevice(gpu_index);
 
   // With SM each block corresponds to either the mask or body, no need to
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
index 92b7d560e6..2da2e4581c 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -336,8 +336,7 @@ __host__ void host_cg_multi_bit_programmable_bootstrap(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size = 0) {
+    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
   cudaSetDevice(gpu_index);
 
   if (!lwe_chunk_size)
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
index fed7b191b8..09a689405c 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu
@@ -131,8 +131,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory) {
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) {
 
   switch (polynomial_size) {
   case 256:
@@ -141,7 +140,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 512:
     host_programmable_bootstrap_tbc<Torus, Degree<512>>(
@@ -149,7 +148,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 1024:
     host_programmable_bootstrap_tbc<Torus, Degree<1024>>(
@@ -157,7 +156,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 2048:
     host_programmable_bootstrap_tbc<Torus, AmortizedDegree<2048>>(
@@ -165,7 +164,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 4096:
     host_programmable_bootstrap_tbc<Torus, AmortizedDegree<4096>>(
@@ -173,7 +172,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 8192:
     host_programmable_bootstrap_tbc<Torus, AmortizedDegree<8192>>(
@@ -181,7 +180,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 16384:
     host_programmable_bootstrap_tbc<Torus, AmortizedDegree<16384>>(
@@ -189,7 +188,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   default:
     PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -410,8 +409,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory) {
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) {
 
   switch (polynomial_size) {
   case 256:
@@ -420,7 +418,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 512:
     host_programmable_bootstrap_cg<Torus, Degree<512>>(
@@ -428,7 +426,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 1024:
     host_programmable_bootstrap_cg<Torus, Degree<1024>>(
@@ -436,7 +434,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 2048:
     host_programmable_bootstrap_cg<Torus, AmortizedDegree<2048>>(
@@ -444,7 +442,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 4096:
     host_programmable_bootstrap_cg<Torus, AmortizedDegree<4096>>(
@@ -452,7 +450,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 8192:
     host_programmable_bootstrap_cg<Torus, AmortizedDegree<8192>>(
@@ -460,7 +458,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 16384:
     host_programmable_bootstrap_cg<Torus, AmortizedDegree<16384>>(
@@ -468,7 +466,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   default:
     PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -484,8 +482,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
     Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory) {
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) {
 
   switch (polynomial_size) {
   case 256:
@@ -494,7 +491,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 512:
     host_programmable_bootstrap<Torus, Degree<512>>(
@@ -502,7 +499,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 1024:
     host_programmable_bootstrap<Torus, Degree<1024>>(
@@ -510,7 +507,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 2048:
     host_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -518,7 +515,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 4096:
     host_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -526,7 +523,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 8192:
     host_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -534,7 +531,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   case 16384:
     host_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -542,7 +539,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension,
         lwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, max_shared_memory);
+        max_shared_memory);
     break;
   default:
     PANIC("Cuda error (classical PBS): unsupported polynomial size. "
@@ -559,15 +556,14 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory) {
+    uint32_t num_samples, uint32_t max_shared_memory) {
 
   if (base_log > 32)
     PANIC("Cuda error (classical PBS): base log should be > number of bits "
           "in the ciphertext representation (32)");
 
-  pbs_buffer<uint64_t, CLASSICAL> *buffer =
-      (pbs_buffer<uint64_t, CLASSICAL> *)mem_ptr;
+  pbs_buffer<uint32_t, CLASSICAL> *buffer =
+      (pbs_buffer<uint32_t, CLASSICAL> *)mem_ptr;
 
   switch (buffer->pbs_variant) {
   case TBC:
@@ -579,14 +575,13 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
         static_cast<uint32_t *>(lut_vector_indexes),
         static_cast<uint32_t *>(lwe_array_in),
         static_cast<uint32_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key),
-        (pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
+        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, lwe_idx, max_shared_memory);
+        max_shared_memory);
+    break;
 #else
     PANIC("Cuda error (PBS): TBC pbs is not supported.")
 #endif
-    break;
   case CG:
     cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
         stream, gpu_index, static_cast<uint32_t *>(lwe_array_out),
@@ -595,10 +590,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
         static_cast<uint32_t *>(lut_vector_indexes),
         static_cast<uint32_t *>(lwe_array_in),
         static_cast<uint32_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key),
-        (pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
+        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, lwe_idx, max_shared_memory);
+        max_shared_memory);
     break;
   case DEFAULT:
     cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
@@ -608,10 +602,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
         static_cast<uint32_t *>(lut_vector_indexes),
         static_cast<uint32_t *>(lwe_array_in),
         static_cast<uint32_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key),
-        (pbs_buffer<uint32_t, CLASSICAL> *)buffer, lwe_dimension,
+        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, lwe_idx, max_shared_memory);
+        max_shared_memory);
     break;
   default:
     PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -654,15 +647,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32(
  * - base_log: log base used for the gadget matrix - B = 2^base_log (~8)
  * - level_count: number of decomposition levels in the gadget matrix (~4)
  * - num_samples: number of encrypted input messages
- * - num_luts: parameter to set the actual number of luts to be
- * used
- * - lwe_idx: the index of the LWE input to consider for the GPU of index
- * gpu_index. In case of multi-GPU computing, it is assumed that only a part of
- * the input LWE array is copied to each GPU, but the whole LUT array is copied
- * (because the case when the number of LUTs is smaller than the number of input
- * LWEs is not trivial to take into account in the data repartition on the
- * GPUs). `lwe_idx` is used to determine which LUT to consider for a given LWE
- * input in the LUT array `lut_vector`.
  *  - 'max_shared_memory' maximum amount of shared memory to be used inside
  * device functions
  *
@@ -696,8 +680,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t num_samples, uint32_t num_luts, uint32_t lwe_idx,
-    uint32_t max_shared_memory) {
+    uint32_t num_samples, uint32_t max_shared_memory) {
   if (base_log > 64)
     PANIC("Cuda error (classical PBS): base log should be > number of bits "
           "in the ciphertext representation (64)");
@@ -715,14 +698,13 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
         static_cast<uint64_t *>(lut_vector_indexes),
         static_cast<uint64_t *>(lwe_array_in),
         static_cast<uint64_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key),
-        (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
+        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, lwe_idx, max_shared_memory);
+        max_shared_memory);
+    break;
 #else
     PANIC("Cuda error (PBS): TBC pbs is not supported.")
 #endif
-    break;
   case PBS_VARIANT::CG:
     cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
         stream, gpu_index, static_cast<uint64_t *>(lwe_array_out),
@@ -731,10 +713,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
         static_cast<uint64_t *>(lut_vector_indexes),
         static_cast<uint64_t *>(lwe_array_in),
         static_cast<uint64_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key),
-        (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
+        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, lwe_idx, max_shared_memory);
+        max_shared_memory);
     break;
   case PBS_VARIANT::DEFAULT:
     cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -744,10 +725,9 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
         static_cast<uint64_t *>(lut_vector_indexes),
         static_cast<uint64_t *>(lwe_array_in),
         static_cast<uint64_t *>(lwe_input_indexes),
-        static_cast<double2 *>(bootstrapping_key),
-        (pbs_buffer<uint64_t, CLASSICAL> *)buffer, lwe_dimension,
+        static_cast<double2 *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, base_log, level_count, num_samples,
-        num_luts, lwe_idx, max_shared_memory);
+        max_shared_memory);
     break;
   default:
     PANIC("Cuda error (PBS): unknown pbs variant.")
@@ -760,7 +740,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
  */
 void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                          int8_t **buffer) {
-  cudaSetDevice(gpu_index);
   auto x = (pbs_buffer<uint64_t, CLASSICAL> *)(*buffer);
   x->release(static_cast<cudaStream_t>(stream), gpu_index);
 }
@@ -776,8 +755,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint64_t>(
     uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
 
 template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
     void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
@@ -786,8 +764,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
     uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<uint64_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
 
 template void scratch_cuda_programmable_bootstrap_cg<uint64_t>(
     void *stream, uint32_t gpu_index,
@@ -809,8 +786,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector<uint32_t>(
     uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
 
 template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
     void *stream, uint32_t gpu_index, uint32_t *lwe_array_out,
@@ -819,8 +795,7 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector<uint32_t>(
     uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<uint32_t, CLASSICAL> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
 
 template void scratch_cuda_programmable_bootstrap_cg<uint32_t>(
     void *stream, uint32_t gpu_index,
@@ -850,8 +825,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint32_t>(
     uint32_t *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<uint32_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
 template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
     void *stream, uint32_t gpu_index, uint64_t *lwe_array_out,
     uint64_t *lwe_output_indexes, uint64_t *lut_vector,
@@ -859,8 +833,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector<uint64_t>(
     uint64_t *lwe_input_indexes, double2 *bootstrapping_key,
     pbs_buffer<uint64_t, CLASSICAL> *buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory);
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory);
 template void scratch_cuda_programmable_bootstrap_tbc<uint32_t>(
     void *stream, uint32_t gpu_index,
     pbs_buffer<uint32_t, CLASSICAL> **pbs_buffer, uint32_t glwe_dimension,
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
index ebce576c81..da33298a2f 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -419,7 +419,7 @@ __host__ void host_programmable_bootstrap(
     pbs_buffer<Torus, CLASSICAL> *pbs_buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t num_luts, uint32_t max_shared_memory) {
+    uint32_t max_shared_memory) {
   cudaSetDevice(gpu_index);
 
   // With SM each block corresponds to either the mask or body, no need to
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
index 3bff782749..48ad5fd8fe 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -74,8 +74,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size) {
+    uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
 
   if (base_log > 64)
     PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -88,7 +87,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 512:
     host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -96,7 +95,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 1024:
     host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -104,7 +103,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 2048:
     host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -112,7 +111,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 4096:
     host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -120,7 +119,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 8192:
     host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -128,7 +127,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 16384:
     host_cg_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -136,7 +135,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -153,8 +152,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size) {
+    uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
 
   if (base_log > 64)
     PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -167,7 +165,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 512:
     host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<512>>(
@@ -175,7 +173,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 1024:
     host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<1024>>(
@@ -183,7 +181,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 2048:
     host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<2048>>(
@@ -191,7 +189,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 4096:
     host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<4096>>(
@@ -199,7 +197,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 8192:
     host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<8192>>(
@@ -207,7 +205,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 16384:
     host_multi_bit_programmable_bootstrap<Torus, AmortizedDegree<16384>>(
@@ -215,7 +213,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -230,8 +228,8 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
     void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key,
     int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension,
     uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log,
-    uint32_t level_count, uint32_t num_samples, uint32_t num_luts,
-    uint32_t lwe_idx, uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
+    uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory,
+    uint32_t lwe_chunk_size) {
 
   pbs_buffer<uint64_t, MULTI_BIT> *buffer =
       (pbs_buffer<uint64_t, MULTI_BIT> *)mem_ptr;
@@ -248,7 +246,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
         static_cast<uint64_t *>(lwe_input_indexes),
         static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
 #else
     PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.")
@@ -263,7 +261,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
         static_cast<uint64_t *>(lwe_input_indexes),
         static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case PBS_VARIANT::DEFAULT:
     cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
@@ -275,7 +273,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
         static_cast<uint64_t *>(lwe_input_indexes),
         static_cast<uint64_t *>(bootstrapping_key), buffer, lwe_dimension,
         glwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.")
@@ -527,8 +525,7 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
     pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size);
+    uint32_t max_shared_memory, uint32_t lwe_chunk_size);
 
 template void scratch_cuda_cg_multi_bit_programmable_bootstrap<uint64_t>(
     void *stream, uint32_t gpu_index,
@@ -546,8 +543,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
     pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size);
+    uint32_t max_shared_memory, uint32_t lwe_chunk_size);
 
 template bool
 has_support_to_cuda_programmable_bootstrap_tbc_multi_bit<uint64_t>(
@@ -627,8 +623,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
     pbs_buffer<Torus, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size) {
+    uint32_t max_shared_memory, uint32_t lwe_chunk_size) {
 
   if (base_log > 64)
     PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in "
@@ -642,7 +637,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 512:
     host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
@@ -651,7 +646,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 1024:
     host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
@@ -660,7 +655,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 2048:
     host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
@@ -669,7 +664,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 4096:
     host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
@@ -678,7 +673,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 8192:
     host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
@@ -687,7 +682,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   case 16384:
     host_tbc_multi_bit_programmable_bootstrap<Torus, int64_t,
@@ -696,7 +691,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector(
         lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in,
         lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension,
         lwe_dimension, polynomial_size, grouping_factor, base_log, level_count,
-        num_samples, num_luts, lwe_idx, max_shared_memory, lwe_chunk_size);
+        num_samples, max_shared_memory, lwe_chunk_size);
     break;
   default:
     PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported "
@@ -721,6 +716,5 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector<uint64_t>(
     pbs_buffer<uint64_t, MULTI_BIT> *pbs_buffer, uint32_t lwe_dimension,
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size);
+    uint32_t max_shared_memory, uint32_t lwe_chunk_size);
 #endif
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
index da8c70bb05..b409f935ec 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -123,8 +123,8 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 
     synchronize_threads_in_block();
     // Move from local memory back to shared memory but as complex
-    double2 *fft = (double2 *)selected_memory;
     tid = threadIdx.x;
+    double2 *fft = (double2 *)selected_memory;
 #pragma unroll
     for (int i = 0; i < params::opt / 2; i++) {
       fft[tid] = temp[i];
@@ -630,8 +630,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size = 0) {
+    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
   cudaSetDevice(gpu_index);
 
   // If a chunk size is not passed to this function, select one.
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
index a66473310d..f63f22748e 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh
@@ -227,7 +227,7 @@ __host__ void host_programmable_bootstrap_tbc(
     pbs_buffer<Torus, CLASSICAL> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
     uint32_t level_count, uint32_t input_lwe_ciphertext_count,
-    uint32_t num_luts, uint32_t max_shared_memory) {
+    uint32_t max_shared_memory) {
   cudaSetDevice(gpu_index);
 
   auto supports_dsm =
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
index 656d4133bf..57378b5127 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -365,8 +365,7 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t glwe_dimension,
     uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor,
     uint32_t base_log, uint32_t level_count, uint32_t num_samples,
-    uint32_t num_luts, uint32_t lwe_idx, uint32_t max_shared_memory,
-    uint32_t lwe_chunk_size = 0) {
+    uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) {
   cudaSetDevice(gpu_index);
 
   if (!lwe_chunk_size)
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
index 4f411ead83..f084244d04 100644
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
@@ -128,8 +128,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) {
           (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
           (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
           lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
-          pbs_level, number_of_inputs, 1, 0,
-          cuda_get_max_shared_memory(gpu_index));
+          pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index));
       // Copy result back
       cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                                (glwe_dimension * polynomial_size + 1) *
@@ -191,8 +190,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
           (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
           (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
           lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
-          pbs_level, number_of_inputs, 1, 0,
-          cuda_get_max_shared_memory(gpu_index));
+          pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index));
       // Copy result back
       cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                                (glwe_dimension * polynomial_size + 1) *
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index 322be12f2a..8d84b14a63 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -168,13 +168,6 @@ extern "C" {
     /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8)
     /// - `level_count`: number of decomposition levels in the gadget matrix (~4)
     /// - `num_samples`: number of encrypted input messages
-    /// - `num_lut_vectors`: parameter to set the actual number of test vectors to be used
-    /// - `lwe_idx`: the index of the LWE input to consider for the GPU of index gpu_index. In case
-    ///   of multi-GPU computing, it is assumed that only a part of the input LWE array is copied to
-    ///   each GPU, but the whole LUT array is copied (because the case when the number of LUTs is
-    ///   smaller than the number of input LWEs is not trivial to take into account in the data
-    ///   repartition on the GPUs). `lwe_idx` is used to determine which LUT to consider for a given
-    ///   LWE input in the LUT array `lut_vector`.
     ///  - `max_shared_memory` maximum amount of shared memory to be used inside device functions
     ///
     /// This function calls a wrapper to a device kernel that performs the
@@ -212,10 +205,7 @@ extern "C" {
         base_log: u32,
         level: u32,
         num_samples: u32,
-        num_lut_vectors: u32,
-        lwe_idx: u32,
         max_shared_memory: u32,
-        gpu_offset: u32,
     );
 
     /// This cleanup function frees the data for the low latency PBS on GPU
@@ -269,13 +259,6 @@ extern "C" {
     /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8)
     /// - `level_count`: number of decomposition levels in the gadget matrix (~4)
     /// - `num_samples`: number of encrypted input messages
-    /// - `num_lut_vectors`: parameter to set the actual number of test vectors to be used
-    /// - `lwe_idx`: the index of the LWE input to consider for the GPU of index gpu_index. In case
-    ///   of multi-GPU computing, it is assumed that only a part of the input LWE array is copied to
-    ///   each GPU, but the whole LUT array is copied (because the case when the number of LUTs is
-    ///   smaller than the number of input LWEs is not trivial to take into account in the data
-    ///   repartition on the GPUs). `lwe_idx` is used to determine which LUT to consider for a given
-    ///   LWE input in the LUT array `lut_vector`.
     ///  - `max_shared_memory` maximum amount of shared memory to be used inside device functions
     pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
         stream: *mut c_void,
@@ -295,10 +278,7 @@ extern "C" {
         base_log: u32,
         level: u32,
         num_samples: u32,
-        num_lut_vectors: u32,
-        lwe_idx: u32,
         max_shared_memory: u32,
-        gpu_offset: u32,
         lwe_chunk_size: u32,
     );
 
diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs
index e949140503..ff4196e266 100644
--- a/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_multi_bit_programmable_bootstrapping.rs
@@ -3,7 +3,7 @@ use crate::core_crypto::gpu::entities::lwe_ciphertext_list::CudaLweCiphertextLis
 use crate::core_crypto::gpu::entities::lwe_multi_bit_bootstrap_key::CudaLweMultiBitBootstrapKey;
 use crate::core_crypto::gpu::vec::CudaVec;
 use crate::core_crypto::gpu::{programmable_bootstrap_multi_bit_async, CudaStreams};
-use crate::core_crypto::prelude::{CastInto, LweCiphertextIndex, UnsignedTorus};
+use crate::core_crypto::prelude::{CastInto, UnsignedTorus};
 
 /// # Safety
 ///
@@ -91,7 +91,6 @@ pub unsafe fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_async<Scalar>
         multi_bit_bsk.decomp_level_count(),
         multi_bit_bsk.grouping_factor(),
         input.lwe_ciphertext_count().0 as u32,
-        LweCiphertextIndex(0),
     );
 }
 
diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs
index 8eca70a49a..5a27a56440 100644
--- a/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs
+++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_programmable_bootstrapping.rs
@@ -3,9 +3,7 @@ use crate::core_crypto::gpu::entities::lwe_bootstrap_key::CudaLweBootstrapKey;
 use crate::core_crypto::gpu::entities::lwe_ciphertext_list::CudaLweCiphertextList;
 use crate::core_crypto::gpu::vec::CudaVec;
 use crate::core_crypto::gpu::{programmable_bootstrap_async, CudaStreams};
-use crate::core_crypto::prelude::{
-    CastInto, LweCiphertextCount, LweCiphertextIndex, UnsignedTorus,
-};
+use crate::core_crypto::prelude::{CastInto, LweCiphertextCount, UnsignedTorus};
 
 /// # Safety
 ///
@@ -93,7 +91,6 @@ pub unsafe fn cuda_programmable_bootstrap_lwe_ciphertext_async<Scalar>(
         bsk.decomp_base_log(),
         bsk.decomp_level_count(),
         num_samples.0 as u32,
-        LweCiphertextIndex(0),
     );
 }
 
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 5a54d2a569..a7e261104e 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -110,7 +110,6 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
     base_log: DecompositionBaseLog,
     level: DecompositionLevelCount,
     num_samples: u32,
-    lwe_idx: LweCiphertextIndex,
 ) {
     let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
     scratch_cuda_programmable_bootstrap_64(
@@ -141,10 +140,7 @@ pub unsafe fn programmable_bootstrap_async<T: UnsignedInteger>(
         base_log.0 as u32,
         level.0 as u32,
         num_samples,
-        num_samples,
-        lwe_idx.0 as u32,
         get_max_shared_memory(streams.gpu_indexes[0]) as u32,
-        0,
     );
     cleanup_cuda_programmable_bootstrap(
         streams.ptr[0],
@@ -176,7 +172,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
     level: DecompositionLevelCount,
     grouping_factor: LweBskGroupingFactor,
     num_samples: u32,
-    lwe_idx: LweCiphertextIndex,
 ) {
     let mut pbs_buffer: *mut i8 = std::ptr::null_mut();
     scratch_cuda_multi_bit_programmable_bootstrap_64(
@@ -211,10 +206,7 @@ pub unsafe fn programmable_bootstrap_multi_bit_async<T: UnsignedInteger>(
         base_log.0 as u32,
         level.0 as u32,
         num_samples,
-        num_samples,
-        lwe_idx.0 as u32,
         get_max_shared_memory(0) as u32,
-        0u32,
         0,
     );
     cleanup_cuda_multi_bit_programmable_bootstrap(