diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index 2fc75c10b9..e61e6aa047 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -84,9 +84,8 @@ void scratch_cuda_full_propagation_64(
     void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory);
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);
 
 void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
                                       uint32_t gpu_count, void *input_blocks,
@@ -1035,10 +1034,10 @@ template <typename Torus> struct int_fullprop_buffer {
 
   int_fullprop_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
                       uint32_t gpu_count, int_radix_params params,
-                      uint32_t num_radix_blocks, bool allocate_gpu_memory) {
+                      bool allocate_gpu_memory) {
     this->params = params;
 
-    lut = new int_radix_lut<Torus>(streams, gpu_indexes, 1, params, 2,
-                                   num_radix_blocks, allocate_gpu_memory);
+    lut = new int_radix_lut<Torus>(streams, gpu_indexes, 1, params, 2, 2,
+                                   allocate_gpu_memory);
 
     if (allocate_gpu_memory) {
@@ -1064,9 +1063,9 @@ template <typename Torus> struct int_fullprop_buffer {
         params.polynomial_size, params.message_modulus, params.carry_modulus,
         lut_f_carry);
 
-    Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);
+    Torus lwe_indexes_size = 2 * sizeof(Torus);
     Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
-    for (int i = 0; i < num_radix_blocks; i++)
+    for (int i = 0; i < 2; i++)
       h_lwe_indexes[i] = i;
     Torus *lwe_indexes = lut->get_lut_indexes(gpu_indexes[0], 0);
     cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
index abaf9a7514..9152b0fd39 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
@@ -19,9 +19,8 @@ void scratch_cuda_full_propagation_64(
     void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
-    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
-    uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
-    bool allocate_gpu_memory) {
+    uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
+    uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
   int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                           glwe_dimension * polynomial_size, lwe_dimension,
                           ks_level, ks_base_log, pbs_level, pbs_base_log,
@@ -29,8 +28,7 @@ void scratch_cuda_full_propagation_64(
 
   scratch_cuda_full_propagation(
       (cudaStream_t *)streams, gpu_indexes, gpu_count,
-      (int_fullprop_buffer<uint64_t> **)mem_ptr, params, num_radix_blocks,
-      allocate_gpu_memory);
+      (int_fullprop_buffer<uint64_t> **)mem_ptr, params, allocate_gpu_memory);
 }
 
 void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index 13642d9ee0..b65e097f31 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -672,7 +672,6 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
   for (int i = 0; i < num_blocks; i++) {
     auto cur_input_block = &input_blocks[i * big_lwe_size];
 
-    cudaSetDevice(gpu_indexes[0]);
     /// Since the keyswitch is done on one input only, use only 1 GPU
     execute_keyswitch_async(
         streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector,
@@ -713,12 +712,10 @@ void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
                                    uint32_t gpu_count,
                                    int_fullprop_buffer<Torus> **mem_ptr,
                                    int_radix_params params,
-                                   uint32_t num_radix_blocks,
                                    bool allocate_gpu_memory) {
 
-  *mem_ptr =
-      new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
-                                     num_radix_blocks, allocate_gpu_memory);
+  *mem_ptr = new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count,
+                                            params, allocate_gpu_memory);
 }
 
 // (lwe_dimension+1) threads
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index d028c1a554..217efeb0ec 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -669,7 +669,6 @@ extern "C" {
         pbs_level: u32,
         pbs_base_log: u32,
         grouping_factor: u32,
-        num_blocks: u32,
         message_modulus: u32,
         carry_modulus: u32,
         pbs_type: u32,
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index 553b18d1a3..bb2151caa2 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -828,7 +828,6 @@ pub unsafe fn full_propagate_assign_async(
         pbs_level.0 as u32,
         pbs_base_log.0 as u32,
         grouping_factor.0 as u32,
-        num_blocks,
         message_modulus.0 as u32,
         carry_modulus.0 as u32,
         pbs_type as u32,