From 804b52663b07309d670945967fa8b488b6923769 Mon Sep 17 00:00:00 2001
From: Agnes Leroy <agnes.leroy@zama.ai>
Date: Tue, 17 Sep 2024 17:14:42 +0200
Subject: [PATCH 1/2] chore(gpu): remove useless syncs

---
 backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh | 2 --
 1 file changed, 2 deletions(-)
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index b23f6e5ab2..ca0e29ea23 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -443,7 +443,6 @@ void generate_device_accumulator_bivariate(
                                          message_modulus, carry_modulus, f);
 
   // copy host lut and lut_indexes_vec to device
-  cuda_synchronize_stream(stream, gpu_index);
   cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
                            (glwe_dimension + 1) * polynomial_size *
                                sizeof(Torus),
@@ -509,7 +508,6 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
   generate_lookup_table<Torus>(h_lut, glwe_dimension, polynomial_size,
                                message_modulus, carry_modulus, f);
 
-  cuda_synchronize_stream(stream, gpu_index);
   // copy host lut and lut_indexes_vec to device
   cuda_memcpy_async_to_gpu(
       acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),

From 8f98df94e8fb77a6b16a92193531cb778b9912b2 Mon Sep 17 00:00:00 2001
From: Agnes Leroy <agnes.leroy@zama.ai>
Date: Thu, 19 Sep 2024 11:32:33 +0200
Subject: [PATCH 2/2] chore(gpu): fix multi-gpu div performance

---
 .../cuda/src/pbs/programmable_bootstrap_multibit.cu              | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
index ec5e39eb41..fdd2ab415b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -440,6 +440,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
 
   int max_blocks_per_sm;
   int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
   if (max_shared_memory < full_sm_keybundle)
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_blocks_per_sm,