From bc4cd08e7a69f59c5462c859e3663a26c3b4af26 Mon Sep 17 00:00:00 2001
From: Guillermo Oyarzun
Date: Fri, 2 Aug 2024 18:31:07 +0200
Subject: [PATCH] refactor(gpu): Specify launch bounds on kernels

---
 .../programmable_bootstrap_cg_multibit.cuh    | 25 ++++++------
 .../pbs/programmable_bootstrap_classic.cuh    | 40 ++++++++++---------
 .../pbs/programmable_bootstrap_multibit.cuh   | 36 +++++++++--------
 .../programmable_bootstrap_tbc_multibit.cuh   | 26 ++++++------
 4 files changed, 67 insertions(+), 60 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
index 53c1348915..f905edf483 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh
@@ -18,18 +18,19 @@
 #include <vector>
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
-    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes,
-    const Torus *__restrict__ lwe_array_in,
-    const Torus *__restrict__ lwe_input_indexes,
-    const double2 *__restrict__ keybundle_array, double2 *join_buffer,
-    Torus *global_accumulator, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size,
-    uint32_t keybundle_size_per_input, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_cg_accumulate(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const double2 *__restrict__ keybundle_array, double2 *join_buffer,
+        Torus *global_accumulator, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+        uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
+        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
+        int8_t *device_mem, uint64_t device_memory_size_per_block) {
 
   grid_group grid = this_grid();
 
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
index 3b00c87fd6..548fcb970f 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh
@@ -17,16 +17,17 @@
 #include "types/complex/operations.cuh"
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_programmable_bootstrap_step_one(
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes,
-    const Torus *__restrict__ lwe_array_in,
-    const Torus *__restrict__ lwe_input_indexes,
-    const double2 *__restrict__ bootstrapping_key, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t lwe_iteration,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_programmable_bootstrap_step_one(
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const double2 *__restrict__ bootstrapping_key,
+        Torus *global_accumulator, double2 *global_accumulator_fft,
+        uint32_t lwe_iteration, uint32_t lwe_dimension,
+        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+        int8_t *device_mem, uint64_t device_memory_size_per_block) {
 
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -131,15 +132,16 @@ __global__ void device_programmable_bootstrap_step_one(
 }
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_programmable_bootstrap_step_two(
-    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes,
-    const double2 *__restrict__ bootstrapping_key, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t lwe_iteration,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_programmable_bootstrap_step_two(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const double2 *__restrict__ bootstrapping_key,
+        Torus *global_accumulator, double2 *global_accumulator_fft,
+        uint32_t lwe_iteration, uint32_t lwe_dimension,
+        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+        int8_t *device_mem, uint64_t device_memory_size_per_block) {
 
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
index 43a6537c28..fe3819a09e 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh
@@ -145,15 +145,16 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 }
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
-    const Torus *__restrict__ lwe_array_in,
-    const Torus *__restrict__ lwe_input_indexes,
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_accumulate_step_one(
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
+        double2 *global_accumulator_fft, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+        uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem,
+        uint64_t device_memory_size_per_block) {
 
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -242,14 +243,15 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
 }
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
-    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
-    const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
-    uint32_t lwe_chunk_size, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_accumulate_step_two(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
+        double2 *global_accumulator_fft, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+        uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
+        uint32_t lwe_chunk_size, int8_t *device_mem,
+        uint64_t device_memory_size_per_block) {
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
   // much faster than global memory
diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
index 7ef43e6294..58b96bbce3 100644
--- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh
@@ -18,18 +18,20 @@
 #include <vector>
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
-    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes,
-    const Torus *__restrict__ lwe_array_in,
-    const Torus *__restrict__ lwe_input_indexes,
-    const double2 *__restrict__ keybundle_array, double2 *join_buffer,
-    Torus *global_accumulator, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size,
-    uint32_t keybundle_size_per_input, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, bool support_dsm) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_tbc_accumulate(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const double2 *__restrict__ keybundle_array, double2 *join_buffer,
+        Torus *global_accumulator, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+        uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
+        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
+        int8_t *device_mem, uint64_t device_memory_size_per_block,
+        bool support_dsm) {
 
   cluster_group cluster = this_cluster();
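
Note on the change: __launch_bounds__(N) is the standard CUDA attribute that promises the compiler a kernel is never launched with more than N threads per block, letting nvcc budget registers for that block size instead of the 1024-thread architectural worst case. The PBS kernels above are launched with params::degree / params::opt threads per block (each thread handling params::opt polynomial coefficients), so that expression is the tight bound. Below is a minimal standalone sketch of the same pattern; it is not code from this patch, and Degree512Opt4 and scale_poly are hypothetical stand-ins for the backend's params classes and kernels.

// Standalone sketch (illustration only, not from this patch).
// "Degree512Opt4" and "scale_poly" are hypothetical stand-ins.
#include <cstdio>

struct Degree512Opt4 {
  static constexpr unsigned degree = 512; // polynomial size (assumed value)
  static constexpr unsigned opt = 4;      // coefficients per thread (assumed)
};

// The attribute promises nvcc this kernel never runs with more than
// degree / opt = 128 threads per block, so registers are allocated for a
// 128-thread block rather than a conservative maximum.
template <class params>
__global__ void __launch_bounds__(params::degree / params::opt)
    scale_poly(double *poly, double factor) {
  constexpr unsigned threads = params::degree / params::opt;
  // Each thread owns params::opt strided coefficients of the polynomial.
  for (unsigned i = 0; i < params::opt; ++i)
    poly[threadIdx.x + i * threads] *= factor;
}

int main() {
  double *poly;
  cudaMalloc(&poly, Degree512Opt4::degree * sizeof(double));
  cudaMemset(poly, 0, Degree512Opt4::degree * sizeof(double));
  // The host-side launch must respect the bound: degree / opt threads.
  scale_poly<Degree512Opt4>
      <<<1, Degree512Opt4::degree / Degree512Opt4::opt>>>(poly, 2.0);
  printf("sync: %s\n", cudaGetErrorString(cudaDeviceSynchronize()));
  cudaFree(poly);
  return 0;
}

Launching a kernel with more threads than its declared bound fails at launch time, so the bound has to stay in sync with the host-side launch configuration; expressing it as params::degree / params::opt keeps the two tied to the same template parameters.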