From 5e4972709e918a1c700ba795e781d9e92d4674e8 Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Thu, 25 Jul 2024 15:08:36 +0200
Subject: [PATCH 1/2] fix(gpu): fix scalar shifts

---
 .../tfhe-cuda-backend/cuda/include/integer.h  |   4 +-
 .../cuda/src/integer/scalar_shifts.cuh        | 117 +++++++++---------
 2 files changed, 62 insertions(+), 59 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h
index df4eb39df0..33d862cd29 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer.h
@@ -1755,12 +1755,12 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
     uint32_t big_lwe_size = params.big_lwe_dimension + 1;
     uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
 
-    tmp_rotated = (Torus *)cuda_malloc_async((num_radix_blocks + 2) *
+    tmp_rotated = (Torus *)cuda_malloc_async((num_radix_blocks + 3) *
                                                  big_lwe_size_bytes,
                                              streams[0], gpu_indexes[0]);
 
     cuda_memset_async(tmp_rotated, 0,
-                      (num_radix_blocks + 2) * big_lwe_size_bytes, streams[0],
+                      (num_radix_blocks + 3) * big_lwe_size_bytes, streams[0],
                       gpu_indexes[0]);
 
     uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
index e612c9ab2f..47eea7b426 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -52,8 +52,6 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
   Torus *full_rotated_buffer = mem->tmp_rotated;
   Torus *rotated_buffer = &full_rotated_buffer[big_lwe_size];
 
-  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
-
   // rotate right all the blocks in radix ciphertext
   // copy result in new buffer
   // 1024 threads are used in every block
@@ -76,6 +74,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
     return;
   }
 
+  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
   auto partial_current_blocks = &lwe_array[rotations * big_lwe_size];
   auto partial_previous_blocks =
       &full_rotated_buffer[rotations * big_lwe_size];
@@ -109,6 +108,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
 
     auto partial_current_blocks = lwe_array;
     auto partial_next_blocks = &rotated_buffer[big_lwe_size];
+    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
 
     size_t partial_block_count = num_blocks - rotations;
@@ -139,8 +139,6 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
     int_arithmetic_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
     uint32_t num_blocks) {
 
-  cudaSetDevice(gpu_indexes[0]);
-
   auto params = mem->params;
   auto glwe_dimension = params.glwe_dimension;
   auto polynomial_size = params.polynomial_size;
@@ -160,15 +158,9 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
   size_t shift_within_block = shift % num_bits_in_block;
 
   Torus *rotated_buffer = mem->tmp_rotated;
-  Torus *padding_block = &rotated_buffer[num_blocks * big_lwe_size];
+  Torus *padding_block = &rotated_buffer[(num_blocks + 1) * big_lwe_size];
   Torus *last_block_copy = &padding_block[big_lwe_size];
 
-  auto lut_univariate_shift_last_block =
-      mem->lut_buffers_univariate[shift_within_block - 1];
-  auto lut_univariate_padding_block =
-      mem->lut_buffers_univariate[num_bits_in_block - 1];
-  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
-
   if (mem->shift_type == RIGHT_SHIFT) {
     host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
                                   rotated_buffer, lwe_array, rotations,
                                   num_blocks, big_lwe_size);
@@ -197,59 +189,70 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
       return;
     }
 
-    // In the arithmetic shift case we have to pad with the value of the sign
-    // bit. This creates the need for a different shifting lut than in the
-    // logical shift case. We also need another PBS to create the padding block.
-    Torus *last_block = lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
-    cuda_memcpy_async_gpu_to_gpu(
-        last_block_copy,
-        rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
-        big_lwe_size_bytes, streams[0], gpu_indexes[0]);
-    auto partial_current_blocks = lwe_array;
-    auto partial_next_blocks = &rotated_buffer[big_lwe_size];
-    size_t partial_block_count = num_blocks - rotations;
-    if (shift_within_block != 0 && rotations != num_blocks) {
-      integer_radix_apply_bivariate_lookup_table_kb(
-          streams, gpu_indexes, gpu_count, partial_current_blocks,
-          partial_current_blocks, partial_next_blocks, bsks, ksks,
-          partial_block_count, lut_bivariate,
-          lut_bivariate->params.message_modulus);
-    }
-    // Since our CPU threads will be working on different streams we shall
-    // assert the work in the main stream is completed
-    for (uint j = 0; j < gpu_count; j++) {
-      cuda_synchronize_stream(streams[j], gpu_indexes[j]);
-    }
+    if (num_blocks != rotations) {
+      // In the arithmetic shift case we have to pad with the value of the sign
+      // bit. This creates the need for a different shifting lut than in the
+      // logical shift case. We also need another PBS to create the padding
+      // block.
+      Torus *last_block =
+          lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
+      cuda_memcpy_async_gpu_to_gpu(
+          last_block_copy,
+          rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
+          big_lwe_size_bytes, streams[0], gpu_indexes[0]);
+      if (shift_within_block != 0) {
+        auto partial_current_blocks = lwe_array;
+        auto partial_next_blocks = &rotated_buffer[big_lwe_size];
+        size_t partial_block_count = num_blocks - rotations;
+        auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
+
+        integer_radix_apply_bivariate_lookup_table_kb(
+            streams, gpu_indexes, gpu_count, partial_current_blocks,
+            partial_current_blocks, partial_next_blocks, bsks, ksks,
+            partial_block_count, lut_bivariate,
+            lut_bivariate->params.message_modulus);
+      }
+      // Since our CPU threads will be working on different streams we shall
+      // assert the work in the main stream is completed
+      for (uint j = 0; j < gpu_count; j++) {
+        cuda_synchronize_stream(streams[j], gpu_indexes[j]);
+      }
 #pragma omp parallel sections
-    {
-      // All sections may be executed in parallel
-#pragma omp section
       {
-        integer_radix_apply_univariate_lookup_table_kb(
-            mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
-            last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
-        // Replace blocks 'pulled' from the left with the correct padding block
-        for (uint i = 0; i < rotations; i++) {
-          cuda_memcpy_async_gpu_to_gpu(
-              lwe_array + (num_blocks - rotations + i) * big_lwe_size,
-              padding_block, big_lwe_size_bytes, mem->local_streams_1[0],
-              gpu_indexes[0]);
-        }
-      }
+        // All sections may be executed in parallel
 #pragma omp section
-      {
-        if (shift_within_block != 0 && rotations != num_blocks) {
+        {
+          auto lut_univariate_padding_block =
+              mem->lut_buffers_univariate[num_bits_in_block - 1];
           integer_radix_apply_univariate_lookup_table_kb(
-              mem->local_streams_2, gpu_indexes, gpu_count, last_block,
-              last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
+              mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
+              last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
+          // Replace blocks 'pulled' from the left with the correct padding
+          // block
+          for (uint i = 0; i < rotations; i++) {
+            cuda_memcpy_async_gpu_to_gpu(
+                lwe_array + (num_blocks - rotations + i) * big_lwe_size,
+                padding_block, big_lwe_size_bytes, mem->local_streams_1[0],
+                gpu_indexes[0]);
+          }
+        }
+#pragma omp section
+        {
+          if (shift_within_block != 0) {
+            auto lut_univariate_shift_last_block =
+                mem->lut_buffers_univariate[shift_within_block - 1];
+            integer_radix_apply_univariate_lookup_table_kb(
+                mem->local_streams_2, gpu_indexes, gpu_count, last_block,
+                last_block_copy, bsks, ksks, 1,
+                lut_univariate_shift_last_block);
+          }
         }
       }
+      for (uint j = 0; j < mem->active_gpu_count; j++) {
+        cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
+        cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
+      }
     }
-    for (uint j = 0; j < mem->active_gpu_count; j++) {
-      cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
-      cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
-    }
-
   } else {
     PANIC("Cuda error (scalar shift): left scalar shift is never of the "
           "arithmetic type")

From 2674d177c98301a2aeedf94d211c571e0144e909 Mon Sep 17 00:00:00 2001
From: Agnes Leroy
Date: Fri, 26 Jul 2024 10:51:22 +0200
Subject: [PATCH 2/2] fix(gpu): fix scalar rotate and add some checks

---
 backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh    | 3 ++-
 .../tfhe-cuda-backend/cuda/src/integer/multiplication.cuh  | 7 ++-----
 .../cuda/src/integer/scalar_comparison.cuh                 | 2 ++
 .../tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh   | 6 ++++--
 .../cuda/src/integer/shift_and_rotate.cuh                  | 2 --
 5 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index b0e8d4e7de..023b5bfadc 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -762,8 +762,9 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
                           Torus *lwe_array_out, Torus *lwe_array_in,
                           uint32_t lwe_dimension, uint32_t num_radix_blocks,
                           uint32_t factor) {
+  if (num_radix_blocks == 0)
+    return;
   cudaSetDevice(gpu_index);
-
   int num_blocks = 0, num_threads = 0;
   int num_entries = (lwe_dimension + 1);
   getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
index 96a410408d..fdfab13cdc 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh
@@ -138,7 +138,6 @@ template <typename Torus, class params>
 __global__ void fill_radix_from_lsb_msb(Torus *result_blocks,
                                         Torus *lsb_blocks, Torus *msb_blocks,
                                         uint32_t glwe_dimension,
-                                        uint32_t lsb_count, uint32_t msb_count,
                                         uint32_t num_blocks) {
   size_t big_lwe_dimension = glwe_dimension * params::degree + 1;
   size_t big_lwe_id = blockIdx.x;
@@ -321,8 +320,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
   luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
                                       h_lwe_idx_in, h_lwe_idx_out);
-  size_t copy_size = total_count * sizeof(Torus);
-  copy_size = sm_copy_count * sizeof(int32_t);
+  size_t copy_size = sm_copy_count * sizeof(int32_t);
   cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
                            streams[0], gpu_indexes[0]);
   cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
                            streams[0], gpu_indexes[0]);
@@ -548,8 +546,7 @@ __host__ void host_integer_mult_radix_kb(
   fill_radix_from_lsb_msb<Torus, params>
       <<<...>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
-                glwe_dimension, lsb_vector_block_count,
-                msb_vector_block_count, num_blocks);
+                glwe_dimension, num_blocks);
   check_cuda_error(cudaGetLastError());
 
   int terms_degree[2 * num_blocks * num_blocks];
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
index a8b6882394..6d065eb79e 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -565,6 +565,8 @@ __host__ void scalar_compare_radix_blocks_kb(
     int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
     uint32_t num_radix_blocks) {
 
+  if (num_radix_blocks == 0)
+    return;
   auto params = mem_ptr->params;
   auto big_lwe_dimension = params.big_lwe_dimension;
   auto message_modulus = params.message_modulus;
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
index 2666951a3c..e951d2b2a6 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh
@@ -49,8 +49,6 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
 
   Torus *rotated_buffer = mem->tmp_rotated;
 
-  auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
-
   // rotate right all the blocks in radix ciphertext
   // copy result in new buffer
   // 256 threads are used in every block
@@ -76,6 +74,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
                                   giver_blocks, lwe_array, 1, num_blocks,
                                   big_lwe_size);
 
+    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
+
     integer_radix_apply_bivariate_lookup_table_kb(
         streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
         giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
@@ -100,6 +100,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
     host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
                                   giver_blocks, lwe_array, 1, num_blocks,
                                   big_lwe_size);
 
+    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
+
     integer_radix_apply_bivariate_lookup_table_kb(
         streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
         giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
index 2106cd6295..e595781256 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh
@@ -37,8 +37,6 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
   auto big_lwe_size = big_lwe_dimension + 1;
   auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);
 
-  cudaSetDevice(gpu_indexes[0]);
-
   // Extract all bits
   auto bits = mem->tmp_bits;
   extract_n_bits(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,
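
Note on patch 1: the underlying bug in both shift functions is an index
underflow. The lookup

    auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

used to run before the early return that handles shift_within_block == 0.
Since shift_within_block is unsigned, 0 - 1 wraps to the maximum value and
the indexing reads far out of bounds. The patch defers each LUT lookup to
the branch that actually uses it, after the guards have established
shift_within_block != 0 (and, for the arithmetic shift, num_blocks !=
rotations). The arithmetic-shift scratch buffer also grows from
num_radix_blocks + 2 to num_radix_blocks + 3 big-LWE blocks, matching the
new offsets of padding_block at (num_blocks + 1) * big_lwe_size with
last_block_copy one block after it.

A minimal standalone sketch of the hazard, with toy values rather than
code from the backend:

    #include <cstddef>
    #include <cstdio>

    int main() {
      const size_t num_bits_in_block = 2; // log2(message_modulus)
      size_t shift = 4;                   // a multiple of num_bits_in_block
      size_t shift_within_block = shift % num_bits_in_block; // == 0
      // Old order: the index is computed unconditionally.
      size_t bad_index = shift_within_block - 1; // wraps to SIZE_MAX
      std::printf("unguarded index: %zu\n", bad_index);
      // New order: branch first, index only when it is provably valid,
      // which is what moving the lookups below the early returns achieves.
      if (shift_within_block != 0)
        std::printf("guarded index: %zu\n", shift_within_block - 1);
      return 0;
    }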
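
Note on patch 2: besides applying the same lookup reordering to
scalar_rotate.cuh and removing dead code in multiplication.cuh (the
immediately overwritten first assignment to copy_size and two unused
fill_radix_from_lsb_msb parameters), it adds num_radix_blocks == 0 early
returns to pack_blocks and scalar_compare_radix_blocks_kb. Without such a
guard, a zero-block call would reach a kernel launch with an empty grid,
which CUDA rejects with an invalid-configuration error. A sketch of the
guard pattern, using a toy kernel and hypothetical names rather than code
from the backend:

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void noop_kernel(int *out) {
      if (out != nullptr)
        out[threadIdx.x] = 0;
    }

    // Same shape as the new pack_blocks guard: return before computing
    // a launch configuration when there is no work to do.
    void guarded_launch(int *d_out, unsigned num_blocks) {
      if (num_blocks == 0)
        return;
      noop_kernel<<<num_blocks, 32>>>(d_out);
    }

    int main() {
      guarded_launch(nullptr, 0);      // safe no-op
      noop_kernel<<<0, 32>>>(nullptr); // the unguarded failure mode
      std::printf("unguarded launch: %s\n",
                  cudaGetErrorString(cudaGetLastError()));
      return 0;
    }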