fix(gpu): fix scalar shifts #1420

Merged (2 commits) on Jul 26, 2024
Changes from all commits
backends/tfhe-cuda-backend/cuda/include/integer.h (2 additions, 2 deletions)

@@ -1755,12 +1755,12 @@ template <typename Torus> struct int_arithmetic_scalar_shift_buffer {
uint32_t big_lwe_size = params.big_lwe_dimension + 1;
uint32_t big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

- tmp_rotated = (Torus *)cuda_malloc_async((num_radix_blocks + 2) *
+ tmp_rotated = (Torus *)cuda_malloc_async((num_radix_blocks + 3) *
big_lwe_size_bytes,
streams[0], gpu_indexes[0]);

cuda_memset_async(tmp_rotated, 0,
- (num_radix_blocks + 2) * big_lwe_size_bytes, streams[0],
+ (num_radix_blocks + 3) * big_lwe_size_bytes, streams[0],
gpu_indexes[0]);

uint32_t num_bits_in_block = (uint32_t)std::log2(params.message_modulus);
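Aside (not part of the diff): the extra slot matches the pointer layout the arithmetic scalar shift sets up further down in scalar_shifts.cuh. The three pointer lines below are copied from host_integer_radix_arithmetic_scalar_shift_kb_inplace as it reads after this PR; the slot comments are my reading of why num_radix_blocks + 3 big-LWE slots are now required.

Torus *rotated_buffer = mem->tmp_rotated;                                // rotated blocks start at slot 0
Torus *padding_block = &rotated_buffer[(num_blocks + 1) * big_lwe_size]; // slot num_blocks + 1
Torus *last_block_copy = &padding_block[big_lwe_size];                   // slot num_blocks + 2
// The highest slot touched is num_blocks + 2, so tmp_rotated must provide
// (num_radix_blocks + 3) big-LWE-sized slots, all zeroed by the memset above.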
backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh (2 additions, 1 deletion)

@@ -762,8 +762,9 @@ __host__ void pack_blocks(cudaStream_t stream, uint32_t gpu_index,
Torus *lwe_array_out, Torus *lwe_array_in,
uint32_t lwe_dimension, uint32_t num_radix_blocks,
uint32_t factor) {
+ if (num_radix_blocks == 0)
+   return;
cudaSetDevice(gpu_index);

int num_blocks = 0, num_threads = 0;
int num_entries = (lwe_dimension + 1);
getNumBlocksAndThreads(num_entries, 1024, num_blocks, num_threads);
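Aside (illustration only; dummy_kernel and launch_guarded are placeholders, not backend functions): the new early return makes pack_blocks a no-op on an empty radix ciphertext instead of going on to compute and use a launch configuration for zero blocks. A minimal sketch of the same guard pattern, assuming a grid dimension derived from the element count:

#include <cuda_runtime.h>

__global__ void dummy_kernel(int *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] = 0;
}

void launch_guarded(cudaStream_t stream, int *out, int n) {
  // Guard first: with n == 0 the grid dimension below would be 0 and the
  // launch would be rejected as an invalid configuration.
  if (n == 0)
    return;
  int threads = 256;
  int blocks = (n + threads - 1) / threads;
  dummy_kernel<<<blocks, threads, 0, stream>>>(out, n);
}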

@@ -138,7 +138,6 @@ template <typename Torus, class params>
__global__ void fill_radix_from_lsb_msb(Torus *result_blocks, Torus *lsb_blocks,
Torus *msb_blocks,
uint32_t glwe_dimension,
- uint32_t lsb_count, uint32_t msb_count,
uint32_t num_blocks) {
size_t big_lwe_dimension = glwe_dimension * params::degree + 1;
size_t big_lwe_id = blockIdx.x;
@@ -321,8 +320,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb(
luts_message_carry->set_lwe_indexes(streams[0], gpu_indexes[0],
h_lwe_idx_in, h_lwe_idx_out);

- size_t copy_size = total_count * sizeof(Torus);
- copy_size = sm_copy_count * sizeof(int32_t);
+ size_t copy_size = sm_copy_count * sizeof(int32_t);
cuda_memcpy_async_to_gpu(d_smart_copy_in, h_smart_copy_in, copy_size,
streams[0], gpu_indexes[0]);
cuda_memcpy_async_to_gpu(d_smart_copy_out, h_smart_copy_out, copy_size,
@@ -548,8 +546,7 @@ __host__ void host_integer_mult_radix_kb(
fill_radix_from_lsb_msb<Torus, params>
<<<num_blocks * num_blocks, params::degree / params::opt, 0,
streams[0]>>>(vector_result_sb, vector_result_lsb, vector_result_msb,
- glwe_dimension, lsb_vector_block_count,
- msb_vector_block_count, num_blocks);
+ glwe_dimension, num_blocks);
check_cuda_error(cudaGetLastError());

int terms_degree[2 * num_blocks * num_blocks];

@@ -565,6 +565,8 @@ __host__ void scalar_compare_radix_blocks_kb(
int_comparison_buffer<Torus> *mem_ptr, void **bsks, Torus **ksks,
uint32_t num_radix_blocks) {

+ if (num_radix_blocks == 0)
+   return;
auto params = mem_ptr->params;
auto big_lwe_dimension = params.big_lwe_dimension;
auto message_modulus = params.message_modulus;
backends/tfhe-cuda-backend/cuda/src/integer/scalar_rotate.cuh (4 additions, 2 deletions)

@@ -49,8 +49,6 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(

Torus *rotated_buffer = mem->tmp_rotated;

- auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
-
// rotate right all the blocks in radix ciphertext
// copy result in new buffer
// 256 threads are used in every block
@@ -76,6 +74,8 @@ __host__ void host_integer_radix_scalar_rotate_kb_inplace(
giver_blocks, lwe_array, 1, num_blocks,
big_lwe_size);

+ auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
+
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
@@ -100,6 +100,8 @@
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count, giver_blocks,
lwe_array, 1, num_blocks, big_lwe_size);

+ auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
+
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, lwe_array, receiver_blocks,
giver_blocks, bsks, ksks, num_blocks, lut_bivariate,
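Aside (my reading of the lut_bivariate moves here and in scalar_shifts.cuh below, with placeholder code): shift_within_block is unsigned, so evaluating lut_buffers_bivariate[shift_within_block - 1] before the block-aligned early return would index the LUT container with a wrapped-around value whenever shift_within_block == 0. Moving the lookup next to the call that uses it keeps the index in range. A small standalone sketch of the hazard:

#include <cstddef>
#include <vector>

// Placeholder types; only the index arithmetic matters here.
int *fetch_bivariate_lut(std::vector<int *> &lut_buffers_bivariate,
                         std::size_t shift_within_block) {
  if (shift_within_block == 0)
    return nullptr; // block-aligned shift: no bivariate LUT is needed
  // Only reached with shift_within_block >= 1, so the index cannot wrap.
  return lut_buffers_bivariate[shift_within_block - 1];
}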
backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh (60 additions, 57 deletions)

@@ -52,8 +52,6 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
Torus *full_rotated_buffer = mem->tmp_rotated;
Torus *rotated_buffer = &full_rotated_buffer[big_lwe_size];

- auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
-
// rotate right all the blocks in radix ciphertext
// copy result in new buffer
// 1024 threads are used in every block
@@ -76,6 +74,7 @@ __host__ void host_integer_radix_logical_scalar_shift_kb_inplace(
return;
}

+ auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
auto partial_current_blocks = &lwe_array[rotations * big_lwe_size];
auto partial_previous_blocks =
&full_rotated_buffer[rotations * big_lwe_size];
@@ -109,6 +108,7 @@

auto partial_current_blocks = lwe_array;
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
+ auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

size_t partial_block_count = num_blocks - rotations;

@@ -139,8 +139,6 @@ __host__ void host_integer_radix_arithmetic_scalar_shift_kb_inplace(
int_arithmetic_scalar_shift_buffer<Torus> *mem, void **bsks, Torus **ksks,
uint32_t num_blocks) {

- cudaSetDevice(gpu_indexes[0]);
-
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
@@ -160,15 +158,9 @@
size_t shift_within_block = shift % num_bits_in_block;

Torus *rotated_buffer = mem->tmp_rotated;
- Torus *padding_block = &rotated_buffer[num_blocks * big_lwe_size];
+ Torus *padding_block = &rotated_buffer[(num_blocks + 1) * big_lwe_size];
Torus *last_block_copy = &padding_block[big_lwe_size];

- auto lut_univariate_shift_last_block =
- mem->lut_buffers_univariate[shift_within_block - 1];
- auto lut_univariate_padding_block =
- mem->lut_buffers_univariate[num_bits_in_block - 1];
- auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];
-
if (mem->shift_type == RIGHT_SHIFT) {
host_radix_blocks_rotate_left(streams, gpu_indexes, gpu_count,
rotated_buffer, lwe_array, rotations,
@@ -197,59 +189,70 @@
return;
}

// In the arithmetic shift case we have to pad with the value of the sign
// bit. This creates the need for a different shifting lut than in the
// logical shift case. We also need another PBS to create the padding block.
Torus *last_block = lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
cuda_memcpy_async_gpu_to_gpu(
last_block_copy,
rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
auto partial_current_blocks = lwe_array;
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
size_t partial_block_count = num_blocks - rotations;
if (shift_within_block != 0 && rotations != num_blocks) {
integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, partial_current_blocks,
partial_current_blocks, partial_next_blocks, bsks, ksks,
partial_block_count, lut_bivariate,
lut_bivariate->params.message_modulus);
}
// Since our CPU threads will be working on different streams we shall
// assert the work in the main stream is completed
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
if (num_blocks != rotations) {
// In the arithmetic shift case we have to pad with the value of the sign
// bit. This creates the need for a different shifting lut than in the
// logical shift case. We also need another PBS to create the padding
// block.
Torus *last_block =
lwe_array + (num_blocks - rotations - 1) * big_lwe_size;
cuda_memcpy_async_gpu_to_gpu(
last_block_copy,
rotated_buffer + (num_blocks - rotations - 1) * big_lwe_size,
big_lwe_size_bytes, streams[0], gpu_indexes[0]);
if (shift_within_block != 0) {
auto partial_current_blocks = lwe_array;
auto partial_next_blocks = &rotated_buffer[big_lwe_size];
size_t partial_block_count = num_blocks - rotations;
auto lut_bivariate = mem->lut_buffers_bivariate[shift_within_block - 1];

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, partial_current_blocks,
partial_current_blocks, partial_next_blocks, bsks, ksks,
partial_block_count, lut_bivariate,
lut_bivariate->params.message_modulus);
}
// Since our CPU threads will be working on different streams we shall
// assert the work in the main stream is completed
for (uint j = 0; j < gpu_count; j++) {
cuda_synchronize_stream(streams[j], gpu_indexes[j]);
}
#pragma omp parallel sections
{
// All sections may be executed in parallel
#pragma omp section
{
integer_radix_apply_univariate_lookup_table_kb(
mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
// Replace blocks 'pulled' from the left with the correct padding block
for (uint i = 0; i < rotations; i++) {
cuda_memcpy_async_gpu_to_gpu(
lwe_array + (num_blocks - rotations + i) * big_lwe_size,
padding_block, big_lwe_size_bytes, mem->local_streams_1[0],
gpu_indexes[0]);
}
}
// All sections may be executed in parallel
#pragma omp section
{
if (shift_within_block != 0 && rotations != num_blocks) {
{
auto lut_univariate_padding_block =
mem->lut_buffers_univariate[num_bits_in_block - 1];
integer_radix_apply_univariate_lookup_table_kb(
mem->local_streams_2, gpu_indexes, gpu_count, last_block,
last_block_copy, bsks, ksks, 1, lut_univariate_shift_last_block);
mem->local_streams_1, gpu_indexes, gpu_count, padding_block,
last_block_copy, bsks, ksks, 1, lut_univariate_padding_block);
// Replace blocks 'pulled' from the left with the correct padding
// block
for (uint i = 0; i < rotations; i++) {
cuda_memcpy_async_gpu_to_gpu(
lwe_array + (num_blocks - rotations + i) * big_lwe_size,
padding_block, big_lwe_size_bytes, mem->local_streams_1[0],
gpu_indexes[0]);
}
}
#pragma omp section
{
if (shift_within_block != 0) {
auto lut_univariate_shift_last_block =
mem->lut_buffers_univariate[shift_within_block - 1];
integer_radix_apply_univariate_lookup_table_kb(
mem->local_streams_2, gpu_indexes, gpu_count, last_block,
last_block_copy, bsks, ksks, 1,
lut_univariate_shift_last_block);
}
}
}
for (uint j = 0; j < mem->active_gpu_count; j++) {
cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
}
}
for (uint j = 0; j < mem->active_gpu_count; j++) {
cuda_synchronize_stream(mem->local_streams_1[j], gpu_indexes[j]);
cuda_synchronize_stream(mem->local_streams_2[j], gpu_indexes[j]);
}

} else {
PANIC("Cuda error (scalar shift): left scalar shift is never of the "
"arithmetic type")
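Aside (a skeleton of the new control flow as I read the added lines; the names and elisions are mine, not the PR's code): the sign-propagation work is now wrapped in a num_blocks != rotations guard, and each LUT is looked up inside the branch that actually uses it, so the index shift_within_block - 1 is never formed for a block-aligned shift, and nothing runs when the shift rotates every block.

#include <cstddef>

// Skeleton only; the rotate, copy and PBS calls are reduced to comments.
void arithmetic_shift_tail(std::size_t num_blocks, std::size_t rotations,
                           std::size_t shift_within_block) {
  if (num_blocks == rotations)
    return; // every block was rotated away: skip the whole section

  // copy the sign-carrying block into last_block_copy (elided)

  if (shift_within_block != 0) {
    // only here is lut_buffers_bivariate[shift_within_block - 1] fetched and
    // applied to the partial blocks
  }

  // two OMP sections then run in parallel:
  //   - padding-block PBS plus the copies of the padding block (always)
  //   - last-block PBS with lut_buffers_univariate[shift_within_block - 1]
  //     (only when shift_within_block != 0)
}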

@@ -37,8 +37,6 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace(
auto big_lwe_size = big_lwe_dimension + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

- cudaSetDevice(gpu_indexes[0]);
-
// Extract all bits
auto bits = mem->tmp_bits;
extract_n_bits<Torus>(streams, gpu_indexes, gpu_count, bits, lwe_array, bsks,