zama-ai · agnesLeroy · Sep 19, 2024 · Sep 17, 2024 · Sep 19, 2024
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -443,7 +443,6 @@ void generate_device_accumulator_bivariate(
                                          message_modulus, carry_modulus, f);
 
   // copy host lut and lut_indexes_vec to device
-  cuda_synchronize_stream(stream, gpu_index);
   cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
                            (glwe_dimension + 1) * polynomial_size *
                                sizeof(Torus),
@@ -509,7 +508,6 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
   generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
                                message_modulus, carry_modulus, f);
 
-  cuda_synchronize_stream(stream, gpu_index);
   // copy host lut and lut_indexes_vec to device
   cuda_memcpy_async_to_gpu(
       acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -440,6 +440,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
 
   int max_blocks_per_sm;
   int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
   if (max_shared_memory < full_sm_keybundle)
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_blocks_per_sm,