Skip to content

Commit

Permalink
fix(gpu): fix full prop with 1 radix block
Browse files: browse the repository at this point in the history
  • Loading branch information
agnesLeroy committed Aug 2, 2024
1 parent 0e71ca6 commit b88f561
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 20 deletions.
15 changes: 7 additions & 8 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,8 @@ void scratch_cuda_full_propagation_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory);

void cuda_full_propagation_64_inplace(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count, void *input_blocks,
Expand Down Expand Up @@ -1035,10 +1034,10 @@ template <typename Torus> struct int_fullprop_buffer {

int_fullprop_buffer(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
bool allocate_gpu_memory) {
this->params = params;
lut = new int_radix_lut<Torus>(streams, gpu_indexes, 1, params, 2,
num_radix_blocks, allocate_gpu_memory);
lut = new int_radix_lut<Torus>(streams, gpu_indexes, 1, params, 2, 2,
allocate_gpu_memory);

if (allocate_gpu_memory) {

Expand All @@ -1064,9 +1063,9 @@ template <typename Torus> struct int_fullprop_buffer {
params.polynomial_size, params.message_modulus, params.carry_modulus,
lut_f_carry);

Torus lwe_indexes_size = num_radix_blocks * sizeof(Torus);
Torus lwe_indexes_size = 2 * sizeof(Torus);
Torus *h_lwe_indexes = (Torus *)malloc(lwe_indexes_size);
for (int i = 0; i < num_radix_blocks; i++)
for (int i = 0; i < 2; i++)
h_lwe_indexes[i] = i;
Torus *lwe_indexes = lut->get_lut_indexes(gpu_indexes[0], 0);
cuda_memcpy_async_to_gpu(lwe_indexes, h_lwe_indexes, lwe_indexes_size,
Expand Down
8 changes: 3 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,16 @@ void scratch_cuda_full_propagation_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t num_radix_blocks,
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory) {
uint32_t pbs_base_log, uint32_t grouping_factor, uint32_t message_modulus,
uint32_t carry_modulus, PBS_TYPE pbs_type, bool allocate_gpu_memory) {
int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);

scratch_cuda_full_propagation<uint64_t>(
(cudaStream_t *)streams, gpu_indexes, gpu_count,
(int_fullprop_buffer<uint64_t> **)mem_ptr, params, num_radix_blocks,
allocate_gpu_memory);
(int_fullprop_buffer<uint64_t> **)mem_ptr, params, allocate_gpu_memory);
}

void cleanup_cuda_full_propagation(void **streams, uint32_t *gpu_indexes,
Expand Down
7 changes: 2 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -672,7 +672,6 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
for (int i = 0; i < num_blocks; i++) {
auto cur_input_block = &input_blocks[i * big_lwe_size];

cudaSetDevice(gpu_indexes[0]);
/// Since the keyswitch is done on one input only, use only 1 GPU
execute_keyswitch_async<Torus>(
streams, gpu_indexes, 1, mem_ptr->tmp_small_lwe_vector,
Expand Down Expand Up @@ -713,12 +712,10 @@ void scratch_cuda_full_propagation(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count,
int_fullprop_buffer<Torus> **mem_ptr,
int_radix_params params,
uint32_t num_radix_blocks,
bool allocate_gpu_memory) {

*mem_ptr =
new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count, params,
num_radix_blocks, allocate_gpu_memory);
*mem_ptr = new int_fullprop_buffer<Torus>(streams, gpu_indexes, gpu_count,
params, allocate_gpu_memory);
}

// (lwe_dimension+1) threads
Expand Down
1 change: 0 additions & 1 deletion backends/tfhe-cuda-backend/src/cuda_bind.rs
Original file line number Diff line number Diff line change
Expand Up @@ -677,7 +677,6 @@ extern "C" {
pbs_level: u32,
pbs_base_log: u32,
grouping_factor: u32,
num_blocks: u32,
message_modulus: u32,
carry_modulus: u32,
pbs_type: u32,
Expand Down
1 change: 0 additions & 1 deletion tfhe/src/integer/gpu/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -828,7 +828,6 @@ pub unsafe fn full_propagate_assign_async<T: UnsignedInteger, B: Numeric>(
pbs_level.0 as u32,
pbs_base_log.0 as u32,
grouping_factor.0 as u32,
num_blocks,
message_modulus.0 as u32,
carry_modulus.0 as u32,
pbs_type as u32,
Expand Down

0 comments on commit b88f561

Please sign in to comment.