diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
index 28b02a57f8..e44cd0ad60 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/addition.cuh
@@ -14,7 +14,6 @@
 #include "utils/kernel_dimensions.cuh"
 #include
 #include
-#include
 #include
 #include
 #include
@@ -110,26 +109,14 @@ __host__ void host_integer_signed_overflowing_add_or_sub_kb(
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
 
-#pragma omp parallel sections
-  {
-    // generate input_carries and output_carry
-#pragma omp section
-    {
-      host_propagate_single_carry(
-          mem_ptr->sub_streams_1, gpu_indexes, gpu_count, result, output_carry,
-          input_carries, mem_ptr->scp_mem, bsks, ksks, num_blocks);
-    }
-
-    // generate generate_last_block_inner_propagation
-#pragma omp section
-    {
-      host_generate_last_block_inner_propagation(
-          mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
-          last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
-          &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem,
-          bsks, ksks);
-    }
-  }
+  host_propagate_single_carry(mem_ptr->sub_streams_1, gpu_indexes, gpu_count,
+                              result, output_carry, input_carries,
+                              mem_ptr->scp_mem, bsks, ksks, num_blocks);
+  host_generate_last_block_inner_propagation(
+      mem_ptr->sub_streams_2, gpu_indexes, gpu_count,
+      last_block_inner_propagation, &lhs[(num_blocks - 1) * big_lwe_size],
+      &rhs[(num_blocks - 1) * big_lwe_size], mem_ptr->las_block_prop_mem, bsks,
+      ksks);
 
   for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
index a1057a443b..3082d5998c 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/cmux.cuh
@@ -2,7 +2,6 @@
 #define CUDA_INTEGER_CMUX_CUH
 
 #include "integer.cuh"
-#include
 
 template
 __host__ void zero_out_if(cudaStream_t *streams, uint32_t *gpu_indexes,
@@ -57,25 +56,14 @@ __host__ void host_integer_radix_cmux_kb(
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
 
-#pragma omp parallel sections
-  {
-    // Both sections may be executed in parallel
-#pragma omp section
-    {
-      auto mem_true = mem_ptr->zero_if_true_buffer;
-      zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
-                  lwe_array_true, lwe_condition, mem_true,
-                  mem_ptr->inverted_predicate_lut, bsks, ksks,
-                  num_radix_blocks);
-    }
-#pragma omp section
-    {
-      auto mem_false = mem_ptr->zero_if_false_buffer;
-      zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
-                  lwe_array_false, lwe_condition, mem_false,
-                  mem_ptr->predicate_lut, bsks, ksks, num_radix_blocks);
-    }
-  }
+  auto mem_true = mem_ptr->zero_if_true_buffer;
+  zero_out_if(true_streams, gpu_indexes, gpu_count, mem_ptr->tmp_true_ct,
+              lwe_array_true, lwe_condition, mem_true,
+              mem_ptr->inverted_predicate_lut, bsks, ksks, num_radix_blocks);
+  auto mem_false = mem_ptr->zero_if_false_buffer;
+  zero_out_if(false_streams, gpu_indexes, gpu_count, mem_ptr->tmp_false_ct,
+              lwe_array_false, lwe_condition, mem_false, mem_ptr->predicate_lut,
+              bsks, ksks, num_radix_blocks);
   for (uint j = 0; j < mem_ptr->zero_if_true_buffer->active_gpu_count; j++) {
     cuda_synchronize_stream(true_streams[j], gpu_indexes[j]);
   }
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
index f38c591678..e2b95a0203 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/div_rem.cuh
@@ -14,7 +14,6 @@
 #include "utils/kernel_dimensions.cuh"
 #include
 #include
-#include
 #include
 #include
 #include
@@ -372,34 +371,18 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
   for (uint j = 0; j < gpu_count; j++) {
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
-#pragma omp parallel sections
-  {
-#pragma omp section
-    {
-      // interesting_divisor
-      trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
-                                         gpu_count);
-    }
-#pragma omp section
-    {
-      // divisor_ms_blocks
-      trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes,
-                                 gpu_count);
-    }
-#pragma omp section
-    {
-      // interesting_remainder1
-      // numerator_block_stack
-      left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
-                                        gpu_count);
-    }
-#pragma omp section
-    {
-      // interesting_remainder2
-      left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
-                                        gpu_count);
-    }
-  }
+  // interesting_divisor
+  trim_last_interesting_divisor_bits(mem_ptr->sub_streams_1, gpu_indexes,
+                                     gpu_count);
+  // divisor_ms_blocks
+  trim_first_divisor_ms_bits(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
+  // interesting_remainder1
+  // numerator_block_stack
+  left_shift_interesting_remainder1(mem_ptr->sub_streams_3, gpu_indexes,
+                                    gpu_count);
+  // interesting_remainder2
+  left_shift_interesting_remainder2(mem_ptr->sub_streams_4, gpu_indexes,
+                                    gpu_count);
   for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
     cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -489,27 +472,14 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
   for (uint j = 0; j < gpu_count; j++) {
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
-#pragma omp parallel sections
-  {
-#pragma omp section
-    {
-      // new_remainder
-      // subtraction_overflowed
-      do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
-    }
-#pragma omp section
-    {
-      // at_least_one_upper_block_is_non_zero
-      check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes,
-                                 gpu_count);
-    }
-#pragma omp section
-    {
-      // cleaned_merged_interesting_remainder
-      create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
-                                               gpu_indexes, gpu_count);
-    }
-  }
+  // new_remainder
+  // subtraction_overflowed
+  do_overflowing_sub(mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
+  // at_least_one_upper_block_is_non_zero
+  check_divisor_upper_blocks(mem_ptr->sub_streams_2, gpu_indexes, gpu_count);
+  // cleaned_merged_interesting_remainder
+  create_clean_version_of_merged_remainder(mem_ptr->sub_streams_3,
+                                           gpu_indexes, gpu_count);
   for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
     cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -567,26 +537,14 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
   for (uint j = 0; j < gpu_count; j++) {
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
-#pragma omp parallel sections
-  {
-#pragma omp section
-    {
-      // cleaned_merged_interesting_remainder
-      conditionally_zero_out_merged_interesting_remainder(
-          mem_ptr->sub_streams_1, gpu_indexes, gpu_count);
-    }
-#pragma omp section
-    {
-      // new_remainder
-      conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
-                                                  gpu_indexes, gpu_count);
-    }
-#pragma omp section
-    {
-      // quotient
-      set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
-    }
-  }
+  // cleaned_merged_interesting_remainder
+  conditionally_zero_out_merged_interesting_remainder(mem_ptr->sub_streams_1,
+                                                      gpu_indexes, gpu_count);
+  // new_remainder
+  conditionally_zero_out_merged_new_remainder(mem_ptr->sub_streams_2,
+                                              gpu_indexes, gpu_count);
+  // quotient
+  set_quotient_bit(mem_ptr->sub_streams_3, gpu_indexes, gpu_count);
   for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
     cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
@@ -613,21 +571,12 @@ host_integer_div_rem_kb(cudaStream_t *streams, uint32_t *gpu_indexes,
   for (uint j = 0; j < gpu_count; j++) {
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
-#pragma omp parallel sections
-  {
-#pragma omp section
-    {
-      integer_radix_apply_univariate_lookup_table_kb(
-          mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
-          bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
-    }
-#pragma omp section
-    {
-      integer_radix_apply_univariate_lookup_table_kb(
-          mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient,
-          bsks, ksks, num_blocks, mem_ptr->message_extract_lut_2);
-    }
-  }
+  integer_radix_apply_univariate_lookup_table_kb(
+      mem_ptr->sub_streams_1, gpu_indexes, gpu_count, remainder, remainder,
+      bsks, ksks, num_blocks, mem_ptr->message_extract_lut_1);
+  integer_radix_apply_univariate_lookup_table_kb(
+      mem_ptr->sub_streams_2, gpu_indexes, gpu_count, quotient, quotient, bsks,
+      ksks, num_blocks, mem_ptr->message_extract_lut_2);
   for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(mem_ptr->sub_streams_1[j], gpu_indexes[j]);
     cuda_synchronize_stream(mem_ptr->sub_streams_2[j], gpu_indexes[j]);
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
index 6d065eb79e..7cc3e6cec0 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh
@@ -2,7 +2,6 @@
 #define CUDA_INTEGER_SCALAR_COMPARISON_OPS_CUH
 
 #include "integer/comparison.cuh"
-#include
 
 template
 __host__ void integer_radix_unsigned_scalar_difference_check_kb(
@@ -87,52 +86,42 @@ __host__ void integer_radix_unsigned_scalar_difference_check_kb(
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
 
-#pragma omp parallel sections
-  {
-    // Both sections may be executed in parallel
-#pragma omp section
-    {
-      //////////////
-      // lsb
-      Torus *lhs = diff_buffer->tmp_packed_left;
-      Torus *rhs = diff_buffer->tmp_packed_right;
-
-      pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                  big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-      pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                  total_num_scalar_blocks, message_modulus);
-
-      // From this point we have half number of blocks
-      num_lsb_radix_blocks /= 2;
-      num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
-
-      // comparisons will be assigned
-      // - 0 if lhs < rhs
-      // - 1 if lhs == rhs
-      // - 2 if lhs > rhs
-
-      auto comparisons = mem_ptr->tmp_block_comparisons;
-      scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                     comparisons, lhs, rhs, mem_ptr, bsks,
-                                     ksks, num_lsb_radix_blocks);
-
-      // Reduces a vec containing radix blocks that encrypts a sign
-      // (inferior, equal, superior) to one single radix block containing the
-      // final sign
-      tree_sign_reduction(
-          lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
-          mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
-          ksks, num_lsb_radix_blocks);
-    }
-#pragma omp section
-    {
-      //////////////
-      // msb
-      host_compare_with_zero_equality(
-          msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, msb,
-          mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
-    }
-  }
+  //////////////
+  // lsb
+  Torus *lhs = diff_buffer->tmp_packed_left;
+  Torus *rhs = diff_buffer->tmp_packed_right;
+
+  pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+              big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
+  pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+              total_num_scalar_blocks, message_modulus);
+
+  // From this point we have half number of blocks
+  num_lsb_radix_blocks /= 2;
+  num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
+
+  // comparisons will be assigned
+  // - 0 if lhs < rhs
+  // - 1 if lhs == rhs
+  // - 2 if lhs > rhs
+
+  auto comparisons = mem_ptr->tmp_block_comparisons;
+  scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
+                                 comparisons, lhs, rhs, mem_ptr, bsks, ksks,
+                                 num_lsb_radix_blocks);
+
+  // Reduces a vec containing radix blocks that encrypts a sign
+  // (inferior, equal, superior) to one single radix block containing the
+  // final sign
+  tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
+                      comparisons, mem_ptr->diff_buffer->tree_buffer,
+                      mem_ptr->identity_lut_f, bsks, ksks,
+                      num_lsb_radix_blocks);
+  //////////////
+  // msb
+  host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
+                                  lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
+                                  num_msb_radix_blocks, mem_ptr->is_zero_lut);
   for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
     cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
@@ -310,92 +299,82 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
 
-#pragma omp parallel sections
-  {
-    // Both sections may be executed in parallel
-#pragma omp section
-    {
-      //////////////
-      // lsb
-      Torus *lhs = diff_buffer->tmp_packed_left;
-      Torus *rhs = diff_buffer->tmp_packed_right;
-
-      pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                  big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-      pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                  total_num_scalar_blocks, message_modulus);
-
-      // From this point we have half number of blocks
-      num_lsb_radix_blocks /= 2;
-      num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
-
-      // comparisons will be assigned
-      // - 0 if lhs < rhs
-      // - 1 if lhs == rhs
-      // - 2 if lhs > rhs
-
-      auto comparisons = mem_ptr->tmp_block_comparisons;
-      scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                     comparisons, lhs, rhs, mem_ptr, bsks,
-                                     ksks, num_lsb_radix_blocks);
-
-      // Reduces a vec containing radix blocks that encrypts a sign
-      // (inferior, equal, superior) to one single radix block containing the
-      // final sign
-      tree_sign_reduction(
-          lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, comparisons,
-          mem_ptr->diff_buffer->tree_buffer, mem_ptr->identity_lut_f, bsks,
-          ksks, num_lsb_radix_blocks);
-    }
-#pragma omp section
-    {
-      //////////////
-      // msb
-      // We remove the last block (which is the sign)
-      Torus *are_all_msb_zeros = lwe_array_msb_out;
-      host_compare_with_zero_equality(
-          msb_streams, gpu_indexes, gpu_count, are_all_msb_zeros, msb,
-          mem_ptr, bsks, ksks, num_msb_radix_blocks, mem_ptr->is_zero_lut);
-
-      auto sign_bit_pos = (int)log2(message_modulus) - 1;
-
-      auto lut_f = [mem_ptr, sign_bit_pos](Torus sign_block,
-                                           Torus msb_are_zeros) {
-        bool sign_bit_is_set = (sign_block >> sign_bit_pos) == 1;
-        CMP_ORDERING sign_block_ordering;
-        if (sign_bit_is_set) {
-          sign_block_ordering = CMP_ORDERING::IS_INFERIOR;
-        } else if (sign_block != 0) {
-          sign_block_ordering = CMP_ORDERING::IS_SUPERIOR;
-        } else {
-          sign_block_ordering = CMP_ORDERING::IS_EQUAL;
-        }
-
-        CMP_ORDERING msb_ordering;
-        if (msb_are_zeros == 1)
-          msb_ordering = CMP_ORDERING::IS_EQUAL;
-        else
-          msb_ordering = CMP_ORDERING::IS_SUPERIOR;
-
-        return mem_ptr->diff_buffer->tree_buffer->block_selector_f(
-            sign_block_ordering, msb_ordering);
-      };
-
-      auto signed_msb_lut = mem_ptr->signed_msb_lut;
-      generate_device_accumulator_bivariate(
-          msb_streams[0], gpu_indexes[0],
-          signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
-          params.polynomial_size, params.message_modulus,
-          params.carry_modulus, lut_f);
-      signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
-
-      Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
-      integer_radix_apply_bivariate_lookup_table_kb(
-          msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
-          are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
-          signed_msb_lut->params.message_modulus);
+  //////////////
+  // lsb
+  Torus *lhs = diff_buffer->tmp_packed_left;
+  Torus *rhs = diff_buffer->tmp_packed_right;
+
+  pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+              big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
+  pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+              total_num_scalar_blocks, message_modulus);
+
+  // From this point we have half number of blocks
+  num_lsb_radix_blocks /= 2;
+  num_lsb_radix_blocks += (total_num_scalar_blocks % 2);
+
+  // comparisons will be assigned
+  // - 0 if lhs < rhs
+  // - 1 if lhs == rhs
+  // - 2 if lhs > rhs
+
+  auto comparisons = mem_ptr->tmp_block_comparisons;
+  scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
+                                 comparisons, lhs, rhs, mem_ptr, bsks, ksks,
+                                 num_lsb_radix_blocks);
+
+  // Reduces a vec containing radix blocks that encrypts a sign
+  // (inferior, equal, superior) to one single radix block containing the
+  // final sign
+  tree_sign_reduction(lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
+                      comparisons, mem_ptr->diff_buffer->tree_buffer,
+                      mem_ptr->identity_lut_f, bsks, ksks,
+                      num_lsb_radix_blocks);
+  //////////////
+  // msb
+  // We remove the last block (which is the sign)
+  Torus *are_all_msb_zeros = lwe_array_msb_out;
+  host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
+                                  are_all_msb_zeros, msb, mem_ptr, bsks, ksks,
+                                  num_msb_radix_blocks, mem_ptr->is_zero_lut);
+
+  auto sign_bit_pos = (int)log2(message_modulus) - 1;
+
+  auto lut_f = [mem_ptr, sign_bit_pos](Torus sign_block,
+                                       Torus msb_are_zeros) {
+    bool sign_bit_is_set = (sign_block >> sign_bit_pos) == 1;
+    CMP_ORDERING sign_block_ordering;
+    if (sign_bit_is_set) {
+      sign_block_ordering = CMP_ORDERING::IS_INFERIOR;
+    } else if (sign_block != 0) {
+      sign_block_ordering = CMP_ORDERING::IS_SUPERIOR;
+    } else {
+      sign_block_ordering = CMP_ORDERING::IS_EQUAL;
     }
-  }
+
+    CMP_ORDERING msb_ordering;
+    if (msb_are_zeros == 1)
+      msb_ordering = CMP_ORDERING::IS_EQUAL;
+    else
+      msb_ordering = CMP_ORDERING::IS_SUPERIOR;
+
+    return mem_ptr->diff_buffer->tree_buffer->block_selector_f(
+        sign_block_ordering, msb_ordering);
+  };
+
+  auto signed_msb_lut = mem_ptr->signed_msb_lut;
+  generate_device_accumulator_bivariate(
+      msb_streams[0], gpu_indexes[0],
+      signed_msb_lut->get_lut(gpu_indexes[0], 0), params.glwe_dimension,
+      params.polynomial_size, params.message_modulus, params.carry_modulus,
+      lut_f);
+  signed_msb_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]);
+
+  Torus *sign_block = msb + (num_msb_radix_blocks - 1) * big_lwe_size;
+  integer_radix_apply_bivariate_lookup_table_kb(
+      msb_streams, gpu_indexes, gpu_count, lwe_array_msb_out, sign_block,
+      are_all_msb_zeros, bsks, ksks, 1, signed_msb_lut,
+      signed_msb_lut->params.message_modulus);
   for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
     cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
@@ -421,49 +400,37 @@ __host__ void integer_radix_signed_scalar_difference_check_kb(
   auto lwe_array_ct_out = mem_ptr->tmp_lwe_array_out;
   auto lwe_array_sign_out =
       lwe_array_ct_out + (num_lsb_radix_blocks / 2) * big_lwe_size;
-#pragma omp parallel sections
-  {
-    // Both sections may be executed in parallel
-#pragma omp section
-    {
-      Torus *lhs = diff_buffer->tmp_packed_left;
-      Torus *rhs = diff_buffer->tmp_packed_right;
-
-      pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
-                  big_lwe_dimension, num_lsb_radix_blocks - 1,
-                  message_modulus);
-      pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
-                  num_lsb_radix_blocks - 1, message_modulus);
-
-      // From this point we have half number of blocks
-      num_lsb_radix_blocks /= 2;
-
-      // comparisons will be assigned
-      // - 0 if lhs < rhs
-      // - 1 if lhs == rhs
-      // - 2 if lhs > rhs
-      scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
-                                     lwe_array_ct_out, lhs, rhs, mem_ptr,
-                                     bsks, ksks, num_lsb_radix_blocks);
-    }
-#pragma omp section
-    {
-      Torus *encrypted_sign_block =
-          lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
-      Torus *scalar_sign_block =
-          scalar_blocks + (total_num_scalar_blocks - 1);
-
-      auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
-      create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
-                           scalar_sign_block, big_lwe_dimension, 1, 1,
-                           message_modulus, carry_modulus);
-
-      integer_radix_apply_bivariate_lookup_table_kb(
-          msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
-          encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
-          mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
-    }
-  }
+  Torus *lhs = diff_buffer->tmp_packed_left;
+  Torus *rhs = diff_buffer->tmp_packed_right;
+
+  pack_blocks(lsb_streams[0], gpu_indexes[0], lhs, lwe_array_in,
+              big_lwe_dimension, num_lsb_radix_blocks - 1, message_modulus);
+  pack_blocks(lsb_streams[0], gpu_indexes[0], rhs, scalar_blocks, 0,
+              num_lsb_radix_blocks - 1, message_modulus);
+
+  // From this point we have half number of blocks
+  num_lsb_radix_blocks /= 2;
+
+  // comparisons will be assigned
+  // - 0 if lhs < rhs
+  // - 1 if lhs == rhs
+  // - 2 if lhs > rhs
+  scalar_compare_radix_blocks_kb(lsb_streams, gpu_indexes, gpu_count,
+                                 lwe_array_ct_out, lhs, rhs, mem_ptr, bsks,
+                                 ksks, num_lsb_radix_blocks);
+  Torus *encrypted_sign_block =
+      lwe_array_in + (total_num_radix_blocks - 1) * big_lwe_size;
+  Torus *scalar_sign_block = scalar_blocks + (total_num_scalar_blocks - 1);
+
+  auto trivial_sign_block = mem_ptr->tmp_trivial_sign_block;
+  create_trivial_radix(msb_streams[0], gpu_indexes[0], trivial_sign_block,
+                       scalar_sign_block, big_lwe_dimension, 1, 1,
+                       message_modulus, carry_modulus);
+
+  integer_radix_apply_bivariate_lookup_table_kb(
+      msb_streams, gpu_indexes, gpu_count, lwe_array_sign_out,
+      encrypted_sign_block, trivial_sign_block, bsks, ksks, 1,
+      mem_ptr->signed_lut, mem_ptr->signed_lut->params.message_modulus);
   for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
     cuda_synchronize_stream(lsb_streams[j], gpu_indexes[j]);
     cuda_synchronize_stream(msb_streams[j], gpu_indexes[j]);
@@ -687,55 +654,44 @@ __host__ void host_integer_radix_scalar_equality_check_kb(
 
   auto lsb_streams = mem_ptr->lsb_streams;
   auto msb_streams = mem_ptr->msb_streams;
-#pragma omp parallel sections
-  {
-    // Both sections may be executed in parallel
-#pragma omp section
-    {
-      if (num_halved_scalar_blocks > 0) {
-        auto packed_blocks = mem_ptr->tmp_packed_input;
-        auto packed_scalar =
-            packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
-
-        pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
-                    big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
-        pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar,
-                    scalar_blocks, 0, num_scalar_blocks, message_modulus);
-
-        cuda_memcpy_async_gpu_to_gpu(
-            scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
-            packed_scalar, num_halved_scalar_blocks * sizeof(Torus),
-            lsb_streams[0], gpu_indexes[0]);
-        scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
-
-        integer_radix_apply_univariate_lookup_table_kb(
-            lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out,
-            packed_blocks, bsks, ksks, num_halved_lsb_radix_blocks,
-            scalar_comparison_luts);
-      }
-    }
-#pragma omp section
-    {
-      //////////////
-      // msb
-      if (num_msb_radix_blocks > 0) {
-        int_radix_lut *msb_lut;
-        switch (mem_ptr->op) {
-        case COMPARISON_TYPE::EQ:
-          msb_lut = mem_ptr->is_zero_lut;
-          break;
-        case COMPARISON_TYPE::NE:
-          msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
-          break;
-        default:
-          PANIC("Cuda error: integer operation not supported")
-        }
-
-        host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
-                                        lwe_array_msb_out, msb, mem_ptr, bsks,
-                                        ksks, num_msb_radix_blocks, msb_lut);
-      }
+  if (num_halved_scalar_blocks > 0) {
+    auto packed_blocks = mem_ptr->tmp_packed_input;
+    auto packed_scalar =
+        packed_blocks + big_lwe_size * num_halved_lsb_radix_blocks;
+
+    pack_blocks(lsb_streams[0], gpu_indexes[0], packed_blocks, lsb,
+                big_lwe_dimension, num_lsb_radix_blocks, message_modulus);
+    pack_blocks(lsb_streams[0], gpu_indexes[0], packed_scalar, scalar_blocks, 0,
+                num_scalar_blocks, message_modulus);
+
+    cuda_memcpy_async_gpu_to_gpu(
+        scalar_comparison_luts->get_lut_indexes(gpu_indexes[0], 0),
+        packed_scalar, num_halved_scalar_blocks * sizeof(Torus), lsb_streams[0],
+        gpu_indexes[0]);
+    scalar_comparison_luts->broadcast_lut(lsb_streams, gpu_indexes, 0);
+
+    integer_radix_apply_univariate_lookup_table_kb(
+        lsb_streams, gpu_indexes, gpu_count, lwe_array_lsb_out, packed_blocks,
+        bsks, ksks, num_halved_lsb_radix_blocks, scalar_comparison_luts);
+  }
+  //////////////
+  // msb
+  if (num_msb_radix_blocks > 0) {
+    int_radix_lut *msb_lut;
+    switch (mem_ptr->op) {
+    case COMPARISON_TYPE::EQ:
+      msb_lut = mem_ptr->is_zero_lut;
+      break;
+    case COMPARISON_TYPE::NE:
+      msb_lut = mem_ptr->eq_buffer->is_non_zero_lut;
+      break;
+    default:
+      PANIC("Cuda error: integer operation not supported")
     }
-  }
+
+    host_compare_with_zero_equality(msb_streams, gpu_indexes, gpu_count,
+                                    lwe_array_msb_out, msb, mem_ptr, bsks, ksks,
+                                    num_msb_radix_blocks, msb_lut);
   }
 
   for (uint j = 0; j < mem_ptr->active_gpu_count; j++) {
diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
index 1d5fa20a15..348655d1db 100644
--- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
+++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh
@@ -10,7 +10,6 @@
 #include "types/complex/operations.cuh"
 #include "utils/helper.cuh"
 #include "utils/kernel_dimensions.cuh"
-#include
 
 template
 __host__ void scratch_cuda_integer_radix_logical_scalar_shift_kb(
@@ -212,36 +211,25 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
   for (uint j = 0; j < gpu_count; j++) {
     cuda_synchronize_stream(streams[j], gpu_indexes[j]);
   }
-#pragma omp parallel sections
-  {
-    // All sections may be executed in parallel
-#pragma omp section
-    {
-      auto lut_univariate_padding_block =
-          mem->lut_buffers_univariate[num_bits_in_block - 1];
-      integer_radix_apply_univariate_lookup_table_kb(
-          mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
-          last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
-      // Replace blocks 'pulled' from the left with the correct padding
-      // block
-      for (uint i = 0; i < rotations; i++) {
-        cuda_memcpy_async_gpu_to_gpu(
-            lwe_array + (num_blocks - rotations + i) * big_lwe_size,
-            padding_block, big_lwe_size_bytes, mem->local_streams_1[0],
-            gpu_indexes[0]);
-      }
-    }
-#pragma omp section
-    {
-      if (shift_within_block != 0) {
-        auto lut_univariate_shift_last_block =
-            mem->lut_buffers_univariate[shift_within_block - 1];
-        integer_radix_apply_univariate_lookup_table_kb(
-            mem->local_streams_2, gpu_indexes, gpu_count, last_block,
-            last_block_copy, bsks, ksks, 1,
-            lut_univariate_shift_last_block);
-      }
-    }
+  auto lut_univariate_padding_block =
+      mem->lut_buffers_univariate[num_bits_in_block - 1];
+  integer_radix_apply_univariate_lookup_table_kb(
+      mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
+      last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
+  // Replace blocks 'pulled' from the left with the correct padding
+  // block
+  for (uint i = 0; i < rotations; i++) {
+    cuda_memcpy_async_gpu_to_gpu(lwe_array + (num_blocks - rotations + i) *
+                                                 big_lwe_size,
+                                 padding_block, big_lwe_size_bytes,
+                                 mem->local_streams_1[0], gpu_indexes[0]);
+  }
+  if (shift_within_block != 0) {
+    auto lut_univariate_shift_last_block =
+        mem->lut_buffers_univariate[shift_within_block - 1];
+    integer_radix_apply_univariate_lookup_table_kb(
+        mem->local_streams_2, gpu_indexes, gpu_count, last_block,
+        last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
   }
   for (uint j = 0; j < mem->active_gpu_count; j++) {
     cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
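
Note on the common pattern: every removed `#pragma omp parallel sections` block only wrapped host calls that enqueue work on distinct CUDA streams (`sub_streams_1`, `sub_streams_2`, `lsb_streams`/`msb_streams`, `local_streams_*`), so the patch issues those calls back to back from a single host thread and keeps the existing `cuda_synchronize_stream()` loops as the join point; any remaining overlap would come from the streams themselves rather than from OpenMP host threads. Below is a minimal, self-contained sketch of that stream pattern — it is not taken from the patch, and the `dummy_work` kernel, the buffers, and the `main` scaffolding are hypothetical placeholders used only to illustrate the idea:

// sketch.cu — two independent pieces of work enqueued on separate CUDA
// streams from one host thread, then both streams are synchronized.
#include <cuda_runtime.h>

__global__ void dummy_work(float *data, int n) {
  // Stand-in for the real per-stream work (PBS, keyswitch, ...).
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] += 1.f;
}

int main() {
  const int n = 1 << 20;
  float *a, *b;
  cudaMalloc(&a, n * sizeof(float));
  cudaMalloc(&b, n * sizeof(float));

  cudaStream_t sub_stream_1, sub_stream_2;
  cudaStreamCreate(&sub_stream_1);
  cudaStreamCreate(&sub_stream_2);

  // Kernel launches are asynchronous: the host thread returns right after
  // enqueueing, so no OpenMP sections are needed for the two calls to overlap.
  dummy_work<<<(n + 255) / 256, 256, 0, sub_stream_1>>>(a, n);
  dummy_work<<<(n + 255) / 256, 256, 0, sub_stream_2>>>(b, n);

  // Mirrors the cuda_synchronize_stream() loops that follow each region in
  // the patch: wait for both streams before reusing their results.
  cudaStreamSynchronize(sub_stream_1);
  cudaStreamSynchronize(sub_stream_2);

  cudaStreamDestroy(sub_stream_1);
  cudaStreamDestroy(sub_stream_2);
  cudaFree(a);
  cudaFree(b);
  return 0;
}

Because the enqueue calls return immediately on the host, dropping the OpenMP sections changes only who issues the work, not when the GPU may run it; the per-stream synchronization remains the point where the results become safe to reuse.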