From 804b52663b07309d670945967fa8b488b6923769 Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Tue, 17 Sep 2024 17:14:42 +0200
Subject: [PATCH 1/2] chore(gpu): remove useless syncs

---
 backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index b23f6e5ab2..ca0e29ea23 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -443,7 +443,6 @@ void generate_device_accumulator_bivariate(
                                          message_modulus, carry_modulus, f);
 
   // copy host lut and lut_indexes_vec to device
-  cuda_synchronize_stream(stream, gpu_index);
   cuda_memcpy_async_to_gpu(acc_bivariate, h_lut,
                            (glwe_dimension + 1) * polynomial_size *
                                sizeof(Torus),
@@ -509,7 +508,6 @@ void generate_device_accumulator(cudaStream_t stream, uint32_t gpu_index,
   generate_lookup_table(h_lut, glwe_dimension, polynomial_size,
                         message_modulus, carry_modulus, f);
 
-  cuda_synchronize_stream(stream, gpu_index);
   // copy host lut and lut_indexes_vec to device
   cuda_memcpy_async_to_gpu(
       acc, h_lut, (glwe_dimension + 1) * polynomial_size * sizeof(Torus),

From 8f98df94e8fb77a6b16a92193531cb778b9912b2 Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Thu, 19 Sep 2024 11:32:33 +0200
Subject: [PATCH 2/2] chore(gpu): fix multi-gpu div performance

---
 .../cuda/src/pbs/programmable_bootstrap_multibit.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
index ec5e39eb41..fdd2ab415b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu
@@ -440,6 +440,7 @@ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs,
 
   int max_blocks_per_sm;
   int max_shared_memory = cuda_get_max_shared_memory(0);
+  cudaSetDevice(gpu_index);
   if (max_shared_memory < full_sm_keybundle)
     cudaOccupancyMaxActiveBlocksPerMultiprocessor(
         &max_blocks_per_sm,