diff --git a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
index 8bec68a6b8..c77956729b 100644
--- a/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
+++ b/backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -148,9 +148,8 @@ void cuda_negate_integer_radix_ciphertext_64(
 
 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus);
+    CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
+    uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus);
 
 void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
index ff01505874..a37ce04d80 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
@@ -1441,7 +1441,7 @@ void host_compute_propagation_simulators_and_group_carries(
   auto scalar_array_cum_sum = mem->scalar_array_cum_sum;
   auto big_lwe_dimension = big_lwe_size - 1;
 
-  host_integer_radix_scalar_addition_inplace(
+  legacy_host_integer_radix_scalar_addition_inplace(
       streams, gpu_indexes, gpu_count, propagation_cum_sums,
       scalar_array_cum_sum, big_lwe_dimension, num_radix_blocks,
       message_modulus, carry_modulus);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
index cae83b55bd..72e3d2513b 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
@@ -2,13 +2,11 @@
 
 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus) {
+    CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
+    uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus) {
 
   host_integer_radix_scalar_addition_inplace(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array),
-      static_cast<uint64_t const *>(scalar_input), lwe_dimension,
-      lwe_ciphertext_count, message_modulus, carry_modulus);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+      static_cast<uint64_t const *>(scalar_input), num_scalars, message_modulus,
+      carry_modulus);
 }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
index 32b0443db4..a05dab4a3d 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
@@ -24,7 +24,7 @@ __global__ void device_integer_radix_scalar_addition_inplace(
 }
 
 template <typename Torus>
-__host__ void host_integer_radix_scalar_addition_inplace(
+__host__ void legacy_host_integer_radix_scalar_addition_inplace(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, Torus *lwe_array, Torus const *scalar_input,
     uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
@@ -49,6 +49,44 @@ __host__ void host_integer_radix_scalar_addition_inplace(
       delta);
   check_cuda_error(cudaGetLastError());
 }
+template <typename Torus>
+__host__ void host_integer_radix_scalar_addition_inplace(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
+    Torus const *scalar_input, uint32_t num_scalars, uint32_t message_modulus,
+    uint32_t carry_modulus) {
+  if (lwe_array->num_radix_blocks < num_scalars)
+    PANIC("Cuda error: num scalars should be smaller or equal to input num "
+          "radix blocks")
+  cuda_set_device(gpu_indexes[0]);
+
+  // Create a 1-dimensional grid of threads
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = num_scalars;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  // Value of the shift we multiply our messages by
+  // If message_modulus and carry_modulus are always powers of 2 we can
+  // simplify this
+  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
+
+  device_integer_radix_scalar_addition_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>((Torus *)lwe_array->ptr, scalar_input,
+                                      num_scalars, lwe_array->lwe_dimension,
+                                      delta);
+  check_cuda_error(cudaGetLastError());
+  Torus scalar_input_cpu[num_scalars];
+  cuda_memcpy_async_to_cpu(&scalar_input_cpu, scalar_input,
+                           num_scalars * sizeof(Torus), streams[0],
+                           gpu_indexes[0]);
+  // The copy above is asynchronous: wait for it to land before reading the
+  // scalars on the host
+  cuda_synchronize_stream(streams[0], gpu_indexes[0]);
+  for (uint i = 0; i < num_scalars; i++) {
+    lwe_array->degrees[i] = lwe_array->degrees[i] + scalar_input_cpu[i];
+  }
+}
 
 template <typename Torus>
 __global__ void device_integer_radix_add_scalar_one_inplace(
diff --git a/backends/tfhe-cuda-backend/src/bindings.rs b/backends/tfhe-cuda-backend/src/bindings.rs
index 4ffb772ba3..035ea0e848 100644
--- a/backends/tfhe-cuda-backend/src/bindings.rs
+++ b/backends/tfhe-cuda-backend/src/bindings.rs
@@ -398,10 +398,9 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        lwe_array: *mut ffi::c_void,
+        lwe_array: *mut CudaRadixCiphertextFFI,
        scalar_input: *const ffi::c_void,
-        lwe_dimension: u32,
-        lwe_ciphertext_count: u32,
+        num_scalars: u32,
         message_modulus: u32,
         carry_modulus: u32,
     );
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index 13ff2665e2..b4d686cb52 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -228,16 +228,15 @@ where
 /// is required
 pub unsafe fn scalar_addition_integer_radix_assign_async<T: UnsignedInteger>(
     streams: &CudaStreams,
-    lwe_array: &mut CudaVec<T>,
+    lwe_array: &mut CudaRadixCiphertext,
     scalar_input: &CudaVec<T>,
-    lwe_dimension: LweDimension,
-    num_samples: u32,
+    num_scalars: u32,
     message_modulus: u32,
     carry_modulus: u32,
 ) {
     assert_eq!(
         streams.gpu_indexes[0],
-        lwe_array.gpu_index(0),
+        lwe_array.d_blocks.0.d_vec.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
@@ -245,6 +244,18 @@ pub unsafe fn scalar_addition_integer_radix_assign_async<T: UnsignedInteger>(
         scalar_input.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
+    let mut lwe_array_degrees = lwe_array.info.blocks.iter().map(|b| b.degree.0).collect();
+    let mut lwe_array_noise_levels = lwe_array
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let mut cuda_ffi_lwe_array = prepare_cuda_radix_ffi(
+        lwe_array,
+        &mut lwe_array_degrees,
+        &mut lwe_array_noise_levels,
+    );
     cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
         streams.ptr.as_ptr(),
         streams
@@ -254,13 +265,13 @@ pub unsafe fn scalar_addition_integer_radix_assign_async<T: UnsignedInteger>(
             .collect::<Vec<u32>>()
             .as_ptr(),
         streams.len() as u32,
-        lwe_array.as_mut_c_ptr(0),
+        &mut cuda_ffi_lwe_array,
         scalar_input.as_c_ptr(0),
-        lwe_dimension.0 as u32,
-        num_samples,
+        num_scalars,
         message_modulus,
         carry_modulus,
     );
+    update_noise_degree(lwe_array, &cuda_ffi_lwe_array);
 }
 
 #[allow(clippy::too_many_arguments)]
diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
index 1a93403e7b..e04776ae8a 100644
--- a/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
+++ b/tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
@@ -92,14 +92,12 @@ impl CudaServerKey {
             .collect::<Vec<_>>();
         d_decomposed_scalar.copy_from_cpu_async(decomposed_scalar.as_slice(), streams, 0);
 
-        let lwe_dimension = ct.as_ref().d_blocks.lwe_dimension();
         // If the scalar is decomposed using less than the number of blocks our ciphertext
         // has, we just don't touch ciphertext's last blocks
         scalar_addition_integer_radix_assign_async(
             streams,
-            &mut ct.as_mut().d_blocks.0.d_vec,
+            ct.as_mut(),
             &d_decomposed_scalar,
-            lwe_dimension,
             decomposed_scalar.len() as u32,
             self.message_modulus.0 as u32,
             self.carry_modulus.0 as u32,
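
For context on the degree bookkeeping introduced above: delta places each message block in the top bits of the 64-bit torus (one bit is reserved for padding), and a block's degree (the largest plaintext value the block can currently hold) grows by the raw decomposed scalar block, not by the encoded value. A minimal standalone sketch of that arithmetic, assuming message_modulus = carry_modulus = 4; the parameter values and variable names below are illustrative only and not taken from the patch:

// Sketch only: mirrors the delta computation and the host-side degree update
// from the new host_integer_radix_scalar_addition_inplace. Plain C++, no CUDA
// needed, since only the host arithmetic is shown.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t message_modulus = 4; // assumed 2_2-style parameters
  uint32_t carry_modulus = 4;

  // Same formula as the patch: delta = 2^63 / (message_modulus * carry_modulus),
  // i.e. 2^59 here, so a block value m is encoded as m * delta in the top bits.
  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);

  uint64_t scalar_block = 3;               // one decomposed scalar block
  uint64_t encoded = scalar_block * delta; // what the kernel adds to an LWE body

  // Degree bookkeeping mirrors the host-side loop: the degree grows by the
  // raw (unencoded) scalar block.
  uint64_t degree = 1; // assumed current degree of the block
  degree += scalar_block;

  printf("delta=%llu encoded=%llu new_degree=%llu\n", (unsigned long long)delta,
         (unsigned long long)encoded, (unsigned long long)degree);
  return 0;
}

This is also why the new host function copies the decomposed scalar back to the CPU: the degrees live in host-side metadata (CudaRadixCiphertextFFI), while the encoded addition itself happens on the device.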