chore(gpu): track noise/degree through scalar add
agnesLeroy committed Feb 4, 2025
1 parent e05a40a commit 67aada8
Showing 7 changed files with 66 additions and 25 deletions.
5 changes: 2 additions & 3 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
```diff
@@ -148,9 +148,8 @@ void cuda_negate_integer_radix_ciphertext_64(
 
 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus);
+    CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
+    uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus);
 
 void scratch_cuda_integer_radix_logical_scalar_shift_kb_64(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
```
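This signature change is the heart of the commit: the entry point now receives a `CudaRadixCiphertextFFI *` instead of a bare `void *` plus explicit `lwe_dimension`/`lwe_ciphertext_count`, so per-block metadata can cross the FFI boundary together with the device data. As a reading aid, here is a hedged sketch of what that struct plausibly carries, inferred from the fields this diff dereferences (`ptr`, `degrees`, `num_radix_blocks`, `lwe_dimension`) plus the noise tracking the commit title implies; the authoritative definition lives in `backends/tfhe-cuda-backend/src/bindings.rs` and may differ.

```rust
// Sketch only, not the real definition: fields inferred from the usages
// in this commit (lwe_array->ptr, lwe_array->degrees,
// lwe_array->num_radix_blocks, lwe_array->lwe_dimension).
#[repr(C)]
pub struct CudaRadixCiphertextFFI {
    pub ptr: *mut std::ffi::c_void, // device pointer to the LWE blocks
    pub degrees: *mut u64,          // one degree per radix block
    pub noise_levels: *mut u64,     // one noise level per radix block
    pub num_radix_blocks: u32,
    pub lwe_dimension: u32,
}
```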
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
```diff
@@ -1441,7 +1441,7 @@ void host_compute_propagation_simulators_and_group_carries(
   auto scalar_array_cum_sum = mem->scalar_array_cum_sum;
   auto big_lwe_dimension = big_lwe_size - 1;
 
-  host_integer_radix_scalar_addition_inplace<Torus>(
+  legacy_host_integer_radix_scalar_addition_inplace<Torus>(
       streams, gpu_indexes, gpu_count, propagation_cum_sums,
       scalar_array_cum_sum, big_lwe_dimension, num_radix_blocks,
      message_modulus, carry_modulus);
```
12 changes: 5 additions & 7 deletions backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cu
```diff
@@ -2,13 +2,11 @@
 
 void cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
     void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
-    void *lwe_array, void const *scalar_input, uint32_t lwe_dimension,
-    uint32_t lwe_ciphertext_count, uint32_t message_modulus,
-    uint32_t carry_modulus) {
+    CudaRadixCiphertextFFI *lwe_array, void const *scalar_input,
+    uint32_t num_scalars, uint32_t message_modulus, uint32_t carry_modulus) {
 
   host_integer_radix_scalar_addition_inplace<uint64_t>(
-      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
-      static_cast<uint64_t *>(lwe_array),
-      static_cast<const uint64_t *>(scalar_input), lwe_dimension,
-      lwe_ciphertext_count, message_modulus, carry_modulus);
+      (cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array,
+      static_cast<const uint64_t *>(scalar_input), num_scalars, message_modulus,
+      carry_modulus);
 }
```
38 changes: 37 additions & 1 deletion backends/tfhe-cuda-backend/cuda/src/integer/scalar_addition.cuh
```diff
@@ -24,7 +24,7 @@ __global__ void device_integer_radix_scalar_addition_inplace(
 }
 
 template <typename Torus>
-__host__ void host_integer_radix_scalar_addition_inplace(
+__host__ void legacy_host_integer_radix_scalar_addition_inplace(
     cudaStream_t const *streams, uint32_t const *gpu_indexes,
     uint32_t gpu_count, Torus *lwe_array, Torus const *scalar_input,
     uint32_t lwe_dimension, uint32_t input_lwe_ciphertext_count,
@@ -49,6 +49,42 @@ __host__ void host_integer_radix_scalar_addition_inplace(
       delta);
   check_cuda_error(cudaGetLastError());
 }
+template <typename Torus>
+__host__ void host_integer_radix_scalar_addition_inplace(
+    cudaStream_t const *streams, uint32_t const *gpu_indexes,
+    uint32_t gpu_count, CudaRadixCiphertextFFI *lwe_array,
+    Torus const *scalar_input, uint32_t num_scalars, uint32_t message_modulus,
+    uint32_t carry_modulus) {
+  if (lwe_array->num_radix_blocks < num_scalars)
+    PANIC("Cuda error: num scalars should be smaller or equal to input num "
+          "radix blocks")
+  cuda_set_device(gpu_indexes[0]);
+
+  // Create a 1-dimensional grid of threads
+  int num_blocks = 0, num_threads = 0;
+  int num_entries = num_scalars;
+  getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads);
+  dim3 grid(num_blocks, 1, 1);
+  dim3 thds(num_threads, 1, 1);
+
+  // Value of the shift we multiply our messages by
+  // If message_modulus and carry_modulus are always powers of 2 we can simplify
+  // this
+  uint64_t delta = ((uint64_t)1 << 63) / (message_modulus * carry_modulus);
+
+  device_integer_radix_scalar_addition_inplace<Torus>
+      <<<grid, thds, 0, streams[0]>>>((Torus *)lwe_array->ptr, scalar_input,
+                                      num_scalars, lwe_array->lwe_dimension,
+                                      delta);
+  check_cuda_error(cudaGetLastError());
+  Torus scalar_input_cpu[num_scalars];
+  cuda_memcpy_async_to_cpu(&scalar_input_cpu, scalar_input,
+                           num_scalars * sizeof(Torus), streams[0],
+                           gpu_indexes[0]);
+  for (uint i = 0; i < num_scalars; i++) {
+    lwe_array->degrees[i] = lwe_array->degrees[i] + scalar_input_cpu[i];
+  }
+}
 
 template <typename Torus>
 __global__ void device_integer_radix_add_scalar_one_inplace(
```
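Two details of the new overload are worth spelling out. First, the encoding: a scalar block m is added to a ciphertext by adding m * delta to the LWE body, where delta = 2^63 / (message_modulus * carry_modulus) places the message in the top bits of the 64-bit torus. Second, the new bookkeeping: scalar addition adds no noise (there is no second ciphertext involved), so only the degrees change, each growing by the cleartext block value. A minimal Rust re-statement of that arithmetic, for illustration only (the names mirror the CUDA code above but none of this is the actual implementation):

```rust
// Illustrative re-statement of the encoding and degree arithmetic above.
fn scalar_add_bookkeeping(
    degrees: &mut [u64],   // per-block degrees, as in CudaRadixCiphertextFFI
    scalar_blocks: &[u64], // decomposed scalar, one value per radix block
    message_modulus: u64,
    carry_modulus: u64,
) -> u64 {
    // Same shift as the CUDA code: messages live in the top bits of a u64.
    let delta = (1u64 << 63) / (message_modulus * carry_modulus);
    for (degree, &scalar) in degrees.iter_mut().zip(scalar_blocks) {
        // The kernel adds scalar * delta to each LWE body; host-side, the
        // known upper bound on the encrypted message grows by the same scalar.
        *degree += scalar;
    }
    delta
}

fn main() {
    // 2_2 parameters: message_modulus = 4, carry_modulus = 4.
    let mut degrees = vec![3, 3]; // blocks encrypting at most 3
    let delta = scalar_add_bookkeeping(&mut degrees, &[2, 1], 4, 4);
    assert_eq!(delta, 1u64 << 59); // 2^63 / 16
    assert_eq!(degrees, vec![5, 4]); // degrees now exceed message_modulus - 1,
                                     // so a carry propagation will be needed
}
```

The copy of `scalar_input` back to the CPU is what makes the host-side degree update possible, since the decomposed scalar lives in device memory at that point.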
5 changes: 2 additions & 3 deletions backends/tfhe-cuda-backend/src/bindings.rs
```diff
@@ -398,10 +398,9 @@ unsafe extern "C" {
         streams: *const *mut ffi::c_void,
         gpu_indexes: *const u32,
         gpu_count: u32,
-        lwe_array: *mut ffi::c_void,
+        lwe_array: *mut CudaRadixCiphertextFFI,
         scalar_input: *const ffi::c_void,
-        lwe_dimension: u32,
-        lwe_ciphertext_count: u32,
+        num_scalars: u32,
         message_modulus: u32,
         carry_modulus: u32,
     );
```
25 changes: 18 additions & 7 deletions tfhe/src/integer/gpu/mod.rs
```diff
@@ -228,23 +228,34 @@
 /// is required
 pub unsafe fn scalar_addition_integer_radix_assign_async<T: UnsignedInteger>(
     streams: &CudaStreams,
-    lwe_array: &mut CudaVec<T>,
+    lwe_array: &mut CudaRadixCiphertext,
     scalar_input: &CudaVec<T>,
-    lwe_dimension: LweDimension,
-    num_samples: u32,
+    num_scalars: u32,
     message_modulus: u32,
     carry_modulus: u32,
 ) {
     assert_eq!(
         streams.gpu_indexes[0],
-        lwe_array.gpu_index(0),
+        lwe_array.d_blocks.0.d_vec.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
     assert_eq!(
         streams.gpu_indexes[0],
         scalar_input.gpu_index(0),
         "GPU error: all data should reside on the same GPU."
     );
+    let mut lwe_array_degrees = lwe_array.info.blocks.iter().map(|b| b.degree.0).collect();
+    let mut lwe_array_noise_levels = lwe_array
+        .info
+        .blocks
+        .iter()
+        .map(|b| b.noise_level.0)
+        .collect();
+    let mut cuda_ffi_lwe_array = prepare_cuda_radix_ffi(
+        lwe_array,
+        &mut lwe_array_degrees,
+        &mut lwe_array_noise_levels,
+    );
     cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
         streams.ptr.as_ptr(),
         streams
@@ -254,13 +265,13 @@ pub unsafe fn scalar_addition_integer_radix_assign_async<T: UnsignedInteger>(
             .collect::<Vec<u32>>()
             .as_ptr(),
         streams.len() as u32,
-        lwe_array.as_mut_c_ptr(0),
+        &mut cuda_ffi_lwe_array,
         scalar_input.as_c_ptr(0),
-        lwe_dimension.0 as u32,
-        num_samples,
+        num_scalars,
         message_modulus,
         carry_modulus,
     );
+    update_noise_degree(lwe_array, &cuda_ffi_lwe_array);
 }
 
 #[allow(clippy::too_many_arguments)]
```
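On the Rust side, degrees and noise levels are flattened into two `u64` vectors, exposed to the backend through `prepare_cuda_radix_ffi`, and folded back into the ciphertext's block info by `update_noise_degree` once the call returns. Those helpers are not shown in this diff; the sketch below reconstructs the round-trip purely from the call sites above, so treat the names and signatures as assumptions:

```rust
// Reconstructed from the call sites in this diff; not the actual helpers.
unsafe fn scalar_add_with_tracking(lwe_array: &mut CudaRadixCiphertext) {
    // 1. Snapshot per-block metadata into C-friendly buffers.
    let mut degrees: Vec<u64> =
        lwe_array.info.blocks.iter().map(|b| b.degree.0).collect();
    let mut noise_levels: Vec<u64> =
        lwe_array.info.blocks.iter().map(|b| b.noise_level.0).collect();

    // 2. Build the FFI view over the ciphertext and the two buffers; the
    //    buffers must stay alive for the duration of the backend call.
    let mut cuda_ffi_lwe_array =
        prepare_cuda_radix_ffi(lwe_array, &mut degrees, &mut noise_levels);

    // 3. The backend mutates the device data and writes updated
    //    degrees/noise through the pointers in the FFI struct:
    // cuda_scalar_addition_integer_radix_ciphertext_64_inplace(
    //     ..., &mut cuda_ffi_lwe_array, ...);

    // 4. Fold the updated metadata back into lwe_array.info.blocks.
    update_noise_degree(lwe_array, &cuda_ffi_lwe_array);
}
```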
4 changes: 1 addition & 3 deletions tfhe/src/integer/gpu/server_key/radix/scalar_add.rs
```diff
@@ -92,14 +92,12 @@ impl CudaServerKey {
             .collect::<Vec<_>>();
         d_decomposed_scalar.copy_from_cpu_async(decomposed_scalar.as_slice(), streams, 0);
 
-        let lwe_dimension = ct.as_ref().d_blocks.lwe_dimension();
         // If the scalar is decomposed using less than the number of blocks our ciphertext
         // has, we just don't touch ciphertext's last blocks
         scalar_addition_integer_radix_assign_async(
             streams,
-            &mut ct.as_mut().d_blocks.0.d_vec,
+            ct.as_mut(),
             &d_decomposed_scalar,
-            lwe_dimension,
             decomposed_scalar.len() as u32,
             self.message_modulus.0 as u32,
             self.carry_modulus.0 as u32,
```
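End to end, nothing changes for users: GPU scalar addition is still driven through `CudaServerKey`, but after this commit the resulting ciphertext's per-block degrees and noise levels stay in sync with the data. A usage sketch along the lines of the tfhe-rs GPU examples (helper and parameter names such as `gen_keys_radix_gpu` and `PARAM_MESSAGE_2_CARRY_2_KS_PBS` are assumed from the version current at the time and may differ):

```rust
use tfhe::core_crypto::gpu::CudaStreams;
use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use tfhe::integer::gpu::gen_keys_radix_gpu;
use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS;

fn main() {
    let streams = CudaStreams::new_multi_gpu();

    // 4 blocks of 2-bit messages => 8-bit unsigned integers.
    let num_blocks = 4;
    let (cks, sks) =
        gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, num_blocks, &streams);

    let ct = cks.encrypt(13u64);
    let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &streams);

    // Scalar addition on GPU; per-block degrees now track the added scalar.
    let d_res = sks.unchecked_scalar_add(&d_ct, 29u64, &streams);

    let res = d_res.to_radix_ciphertext(&streams);
    let dec: u64 = cks.decrypt(&res);
    assert_eq!(dec, 13 + 29);
}
```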
