Skip to content
Permalink

Comparing changes

This is a direct comparison between two commits made in this repository or its related repositories. View the default comparison for this range or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: zama-ai/tfhe-rs
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: c3b85abcd04d59cf1092dcb2b00416d31f0a8c82
Choose a base ref
..
head repository: zama-ai/tfhe-rs
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: 1e453263afebdf8afaf1045aea87b6c25a7d943b
Choose a head ref
Original file line number Diff line number Diff line change
@@ -256,7 +256,7 @@ __host__ void execute_cg_external_product_loop(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_chunk_size, uint32_t lwe_offset) {
uint32_t lwe_chunk_size, int lwe_offset) {

uint64_t full_dm =
get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
@@ -275,8 +275,6 @@ __host__ void execute_cg_external_product_loop(

uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
if (chunk_size == 0)
return;

auto d_mem = buffer->d_mem_acc_cg;
auto keybundle_fft = buffer->keybundle_fft;
Original file line number Diff line number Diff line change
@@ -465,7 +465,7 @@ __host__ void execute_compute_keybundle(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_chunk_size, uint32_t lwe_offset) {
uint32_t lwe_chunk_size, int lwe_offset) {

uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
@@ -506,12 +506,14 @@ __host__ void execute_compute_keybundle(
}

template <typename Torus, class params>
__host__ void execute_step_one(
cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t base_log, uint32_t level_count, uint32_t j, uint32_t lwe_offset) {
__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
Torus *lut_vector, Torus *lut_vector_indexes,
Torus *lwe_array_in, Torus *lwe_input_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int j, int lwe_offset) {

uint64_t full_sm_accumulate_step_one =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
@@ -560,12 +562,14 @@ __host__ void execute_step_one(
}

template <typename Torus, class params>
__host__ void execute_step_two(
cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count,
uint32_t j, uint32_t lwe_offset, uint32_t lwe_chunk_size) {
__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *lwe_output_indexes,
pbs_buffer<Torus, MULTI_BIT> *buffer,
uint32_t num_samples, uint32_t lwe_dimension,
uint32_t glwe_dimension,
uint32_t polynomial_size,
int32_t grouping_factor, uint32_t level_count,
int j, int lwe_offset, uint32_t lwe_chunk_size) {

uint64_t full_sm_accumulate_step_two =
get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
@@ -623,7 +627,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
// Accumulate
uint32_t chunk_size = std::min(
lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
for (uint32_t j = 0; j < chunk_size; j++) {
for (int j = 0; j < chunk_size; j++) {
execute_step_one<Torus, params>(
stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension,
Original file line number Diff line number Diff line change
@@ -267,7 +267,7 @@ __host__ void execute_tbc_external_product_loop(
pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
uint32_t lwe_chunk_size, uint32_t lwe_offset) {
uint32_t lwe_chunk_size, int lwe_offset) {

auto supports_dsm =
supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
@@ -294,8 +294,6 @@ __host__ void execute_tbc_external_product_loop(

uint32_t chunk_size =
std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
if (chunk_size == 0)
return;

auto d_mem = buffer->d_mem_acc_tbc;
auto keybundle_fft = buffer->keybundle_fft;