chore(gpu): start using a struct to pass data across rust/c++
agnesLeroy committed Jan 20, 2025
1 parent b46affa commit c3fc1cb
Showing 46 changed files with 971 additions and 807 deletions.
3 changes: 3 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/device.h
@@ -42,6 +42,9 @@ void cuda_destroy_stream(cudaStream_t stream, uint32_t gpu_index);

void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index);

void synchronize_streams(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count);

uint32_t cuda_is_available();

void *cuda_malloc(uint64_t size, uint32_t gpu_index);
21 changes: 16 additions & 5 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer.h
@@ -38,6 +38,15 @@ enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 };
enum outputFlag { FLAG_NONE = 0, FLAG_OVERFLOW = 1, FLAG_CARRY = 2 };

extern "C" {

typedef struct {
void *ptr;
uint64_t *degrees;
uint64_t *noise_levels;
uint32_t num_radix_blocks;
uint32_t lwe_dimension;
} CudaRadixCiphertextData;
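
This struct is the heart of the change: it bundles the device pointer to the LWE blocks with per-block degree and noise-level metadata, plus the block count and LWE dimension, so a single pointer can cross the Rust/C++ boundary instead of a raw buffer accompanied by several scalar arguments (on the Rust side this corresponds to a #[repr(C)] struct with the same field layout). A minimal host-side sketch of wrapping an existing device buffer in the struct follows; the helper name and the use of new[] for the metadata arrays are illustrative assumptions, not part of this commit.

// Illustrative sketch only. Assumes `d_lwe` already points to
// num_blocks * (lwe_dimension + 1) Torus words on the GPU, and that the
// degrees / noise_levels arrays live on the host (the loops in
// radix_ciphertext.h write them with plain host code).
CudaRadixCiphertextData wrap_device_buffer(void *d_lwe, uint32_t num_blocks,
                                           uint32_t lwe_dimension) {
  CudaRadixCiphertextData radix;
  radix.ptr = d_lwe;
  radix.num_radix_blocks = num_blocks;
  radix.lwe_dimension = lwe_dimension;
  // Freshly wrapped trivial blocks: degree 0 and noise level 0 for every block.
  radix.degrees = new uint64_t[num_blocks]();
  radix.noise_levels = new uint64_t[num_blocks]();
  return radix;
}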

void scratch_cuda_apply_univariate_lut_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr, void const *input_lut, uint32_t lwe_dimension,
@@ -258,9 +267,11 @@ void scratch_cuda_integer_radix_cmux_kb_64(

void cuda_cmux_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count);
CudaRadixCiphertextData *lwe_array_out,
CudaRadixCiphertextData const *lwe_condition,
CudaRadixCiphertextData const *lwe_array_true,
CudaRadixCiphertextData const *lwe_array_false, int8_t *mem_ptr,
void *const *bsks, void *const *ksks);
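
The cmux entry point now takes four CudaRadixCiphertextData pointers and drops the trailing lwe_ciphertext_count argument, since each operand carries its own block count and LWE dimension. Below is a hedged caller sketch; the scratch buffer mem_ptr and the bsks/ksks key arrays are assumed to come from the existing scratch/key-loading path and are not set up here.

#include "integer/integer.h"

// Sketch of driving the struct-based cmux API; all parameters are assumed to
// have been prepared elsewhere (scratch_cuda_integer_radix_cmux_kb_64, key
// upload, ciphertext upload).
void cmux_call_sketch(void *const *streams, uint32_t const *gpu_indexes,
                      uint32_t gpu_count, CudaRadixCiphertextData *out,
                      CudaRadixCiphertextData const *condition,
                      CudaRadixCiphertextData const *ct_true,
                      CudaRadixCiphertextData const *ct_false, int8_t *mem_ptr,
                      void *const *bsks, void *const *ksks) {
  // No lwe_ciphertext_count here: the backend reads num_radix_blocks and
  // lwe_dimension from the operands themselves.
  cuda_cmux_integer_radix_ciphertext_kb_64(streams, gpu_indexes, gpu_count, out,
                                           condition, ct_true, ct_false,
                                           mem_ptr, bsks, ksks);
}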

void cleanup_cuda_integer_radix_cmux(void *const *streams,
uint32_t const *gpu_indexes,
@@ -439,8 +450,8 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
void *const *ksks, uint32_t num_blocks);
CudaRadixCiphertextData *ct, int8_t *mem_ptr, bool is_signed,
void *const *bsks, void *const *ksks, uint32_t num_blocks);

void cleanup_cuda_integer_abs_inplace(void *const *streams,
uint32_t const *gpu_indexes,
32 changes: 17 additions & 15 deletions backends/tfhe-cuda-backend/cuda/include/integer/integer_utilities.h
@@ -4,6 +4,7 @@
#include "integer.h"
#include "keyswitch.h"
#include "pbs/programmable_bootstrap.cuh"
#include "radix_ciphertext.h"
#include <cassert>
#include <cmath>
#include <functional>
@@ -2963,9 +2964,9 @@ template <typename Torus> struct int_cmux_buffer {
int_radix_lut<Torus> *predicate_lut;
int_radix_lut<Torus> *message_extract_lut;

Torus *buffer_in;
Torus *buffer_out;
Torus *condition_array;
CudaRadixCiphertextData *buffer_in;
CudaRadixCiphertextData *buffer_out;
CudaRadixCiphertextData *condition_array;

int_radix_params params;

@@ -2981,12 +2982,15 @@
Torus big_size =
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);

buffer_in =
(Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
buffer_out =
(Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
condition_array =
(Torus *)cuda_malloc_async(2 * big_size, streams[0], gpu_indexes[0]);
create_trivial_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
buffer_in, num_radix_blocks,
params.big_lwe_dimension);
create_trivial_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
buffer_out, num_radix_blocks,
params.big_lwe_dimension);
create_trivial_radix_ciphertext_async<Torus>(
streams[0], gpu_indexes[0], condition_array, num_radix_blocks,
params.big_lwe_dimension);

auto lut_f = [predicate_lut_f](Torus block, Torus condition) -> Torus {
return predicate_lut_f(condition) ? 0 : block;
@@ -4351,7 +4355,7 @@ template <typename Torus> struct int_abs_buffer {
int_sc_prop_memory<Torus> *scp_mem;
int_bitop_buffer<Torus> *bitxor_mem;

Torus *mask;
CudaRadixCiphertextData *mask;
int_abs_buffer(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, int_radix_params params,
uint32_t num_radix_blocks, bool allocate_gpu_memory) {
@@ -4372,11 +4376,9 @@
streams, gpu_indexes, gpu_count, BITOP_TYPE::BITXOR, params,
num_radix_blocks, allocate_gpu_memory);

uint32_t lwe_size = params.big_lwe_dimension + 1;
uint32_t lwe_size_bytes = lwe_size * sizeof(Torus);

mask = (Torus *)cuda_malloc_async(num_radix_blocks * lwe_size_bytes,
streams[0], gpu_indexes[0]);
create_trivial_radix_ciphertext_async<Torus>(streams[0], gpu_indexes[0],
mask, num_radix_blocks,
params.big_lwe_dimension);
}
}

80 changes: 80 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/integer/radix_ciphertext.h
@@ -0,0 +1,80 @@
#ifndef CUDA_RADIX_CIPHERTEXT_H
#define CUDA_RADIX_CIPHERTEXT_H

#include "device.h"
#include "integer.h"

template <typename Torus>
void create_trivial_radix_ciphertext_async(
cudaStream_t const stream, uint32_t const gpu_index,
CudaRadixCiphertextData *output_radix, uint32_t num_radix_blocks,
uint32_t lwe_dimension) {
uint32_t lwe_size_bytes = (lwe_dimension + 1) * sizeof(Torus);
output_radix->ptr = (void *)cuda_malloc_async(
num_radix_blocks * lwe_size_bytes, stream, gpu_index);
for (uint i = 0; i < output_radix->num_radix_blocks; i++) {
output_radix->degrees[i] = 0;
output_radix->noise_levels[i] = 0;
}
output_radix->lwe_dimension = lwe_dimension;
output_radix->num_radix_blocks = num_radix_blocks;
}

template <typename Torus>
void as_radix_ciphertext_slice(CudaRadixCiphertextData *output_radix,
const CudaRadixCiphertextData *input_radix,
uint32_t start_lwe_index,
uint32_t end_lwe_index) {
if (input_radix->num_radix_blocks < start_lwe_index - end_lwe_index + 1)
PANIC("Cuda error: input radix should have more blocks than the specified "
"range")
if (start_lwe_index <= end_lwe_index)
PANIC("Cuda error: slice range should be strictly positive")

auto lwe_size = input_radix->lwe_dimension + 1;
output_radix->num_radix_blocks = end_lwe_index - start_lwe_index + 1;
output_radix->lwe_dimension = input_radix->lwe_dimension;
Torus *in_ptr = (Torus *)input_radix->ptr;
output_radix->ptr = (void *)(&in_ptr[start_lwe_index * lwe_size]);
for (uint i = 0; i < output_radix->num_radix_blocks; i++) {
output_radix->degrees[i] =
input_radix->degrees[i + start_lwe_index * lwe_size];
output_radix->noise_levels[i] =
input_radix->noise_levels[i + start_lwe_index * lwe_size];
}
}

template <typename Torus>
void copy_radix_ciphertext_to_larger_output_slice_async(
cudaStream_t const stream, uint32_t const gpu_index,
CudaRadixCiphertextData *output_radix,
const CudaRadixCiphertextData *input_radix,
uint32_t output_start_lwe_index) {
if (output_radix->lwe_dimension != input_radix->lwe_dimension)
PANIC("Cuda error: input lwe dimension should be equal to output lwe "
"dimension")
if (output_radix->num_radix_blocks - output_start_lwe_index !=
input_radix->num_radix_blocks)
PANIC("Cuda error: input radix should have the same number of blocks as "
"the output range")
if (output_start_lwe_index >= output_radix->num_radix_blocks)
PANIC("Cuda error: output index should be strictly smaller than the number "
"of blocks")

auto lwe_size = input_radix->lwe_dimension + 1;
Torus *out_ptr = (Torus *)output_radix->ptr;
out_ptr = &out_ptr[output_start_lwe_index * lwe_size];

cuda_memcpy_async_gpu_to_gpu(out_ptr, input_radix->ptr,
input_radix->num_radix_blocks *
(input_radix->lwe_dimension + 1) *
sizeof(Torus),
stream, gpu_index);
for (uint i = 0; i < input_radix->num_radix_blocks; i++) {
output_radix->degrees[i + output_start_lwe_index] = input_radix->degrees[i];
output_radix->noise_levels[i + output_start_lwe_index] =
input_radix->noise_levels[i];
}
}

#endif // CUDA_RADIX_CIPHERTEXT_H
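
A short host-side usage sketch of the first helper plus the copy helper (the slice helper is analogous but builds a non-owning view into an existing buffer). It assumes the caller allocates the host-side degrees/noise_levels arrays up front, since create_trivial_radix_ciphertext_async only writes into them, and that the 2-block input has the same LWE dimension as the output; stream handling uses plain CUDA calls. This is an editorial sketch, not part of the commit.

#include "integer/radix_ciphertext.h"

// Sketch only: allocate a trivial 4-block radix ciphertext, then copy a
// 2-block input into its upper half (blocks 2 and 3).
void radix_helpers_sketch(cudaStream_t stream, uint32_t gpu_index,
                          const CudaRadixCiphertextData *two_block_input,
                          uint32_t lwe_dimension) {
  CudaRadixCiphertextData out;
  out.degrees = new uint64_t[4];      // host-side metadata, caller-allocated
  out.noise_levels = new uint64_t[4];
  out.num_radix_blocks = 4;           // set before the call: the metadata loop
                                      // in the helper reads this field
  create_trivial_radix_ciphertext_async<uint64_t>(stream, gpu_index, &out, 4,
                                                  lwe_dimension);

  // Copy the 2-block input into blocks 2..3 of the larger output; block
  // counts, LWE dimensions and offsets are checked by the helper itself.
  copy_radix_ciphertext_to_larger_output_slice_async<uint64_t>(
      stream, gpu_index, &out, two_block_input, 2);
  cuda_synchronize_stream(stream, gpu_index);
  // Cleanup of out.ptr and the metadata arrays is left out of this sketch.
}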
17 changes: 5 additions & 12 deletions backends/tfhe-cuda-backend/cuda/include/linear_algebra.h
@@ -1,6 +1,7 @@
#ifndef CUDA_LINALG_H_
#define CUDA_LINALG_H_

#include "integer/integer.h"
#include <stdint.h>

extern "C" {
@@ -15,18 +16,10 @@ void cuda_negate_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void const *lwe_array_in,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_32(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void const *lwe_array_in_1,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(void *stream, uint32_t gpu_index,
void *lwe_array_out,
void const *lwe_array_in_1,
void const *lwe_array_in_2,
uint32_t input_lwe_dimension,
uint32_t input_lwe_ciphertext_count);
void cuda_add_lwe_ciphertext_vector_64(
void *stream, uint32_t gpu_index, CudaRadixCiphertextData *lwe_array_out,
CudaRadixCiphertextData const *lwe_array_in_1,
CudaRadixCiphertextData const *lwe_array_in_2);

void cuda_add_lwe_ciphertext_vector_plaintext_vector_32(
void *stream, uint32_t gpu_index, void *lwe_array_out,
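
The 32-bit addition entry point is removed and the 64-bit one now operates on the struct, so the input LWE dimension and ciphertext count no longer travel as separate arguments. A thin caller sketch of the new declaration, assuming the operands are already uploaded and mutually compatible:

#include "linear_algebra.h"

// Sketch: componentwise addition through the struct-based API. Sizes are
// implied by the operands; the old lwe_dimension / ciphertext-count
// parameters are gone.
void add_radix_sketch(void *stream, uint32_t gpu_index,
                      CudaRadixCiphertextData *out,
                      CudaRadixCiphertextData const *lhs,
                      CudaRadixCiphertextData const *rhs) {
  cuda_add_lwe_ciphertext_vector_64(stream, gpu_index, out, lhs, rhs);
}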
7 changes: 7 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/device.cu
@@ -45,6 +45,13 @@ void cuda_synchronize_stream(cudaStream_t stream, uint32_t gpu_index) {
check_cuda_error(cudaStreamSynchronize(stream));
}

void synchronize_streams(cudaStream_t const *streams,
uint32_t const *gpu_indexes, uint32_t gpu_count) {
for (uint i = 0; i < gpu_count; i++) {
cuda_synchronize_stream(streams[i], gpu_indexes[i]);
}
}
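
A minimal multi-GPU sketch of the new helper, using plain CUDA stream creation; the two-GPU setup is an assumption for illustration and not part of the commit.

#include "device.h"

// Sketch: create one stream per GPU, enqueue work, then wait on all of them
// with a single call instead of a hand-written loop at every call site.
void sync_all_sketch() {
  uint32_t gpu_indexes[2] = {0, 1}; // assumes two visible CUDA devices
  cudaStream_t streams[2];
  for (uint32_t i = 0; i < 2; i++) {
    cudaSetDevice(gpu_indexes[i]);
    cudaStreamCreate(&streams[i]);
  }
  // ... enqueue kernels / async copies on streams[0] and streams[1] ...
  synchronize_streams(streams, gpu_indexes, 2);
  for (uint32_t i = 0; i < 2; i++)
    cuda_destroy_stream(streams[i], gpu_indexes[i]);
}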

// Determine if a CUDA device is available at runtime
uint32_t cuda_is_available() { return cudaSetDevice(0) == cudaSuccess; }

9 changes: 4 additions & 5 deletions backends/tfhe-cuda-backend/cuda/src/integer/abs.cu
@@ -22,15 +22,14 @@ void scratch_cuda_integer_abs_inplace_radix_ciphertext_kb_64(

void cuda_integer_abs_inplace_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *ct, int8_t *mem_ptr, bool is_signed, void *const *bsks,
void *const *ksks, uint32_t num_blocks) {
CudaRadixCiphertextData *ct, int8_t *mem_ptr, bool is_signed,
void *const *bsks, void *const *ksks) {

auto mem = (int_abs_buffer<uint64_t> *)mem_ptr;

host_integer_abs_kb<uint64_t>((cudaStream_t *)(streams), gpu_indexes,
gpu_count, static_cast<uint64_t *>(ct), bsks,
(uint64_t **)(ksks), mem, is_signed,
num_blocks);
gpu_count, ct, bsks, (uint64_t **)(ksks), mem,
is_signed);
}

void cleanup_cuda_integer_abs_inplace(void *const *streams,
60 changes: 49 additions & 11 deletions backends/tfhe-cuda-backend/cuda/src/integer/abs.cuh
@@ -8,6 +8,7 @@
#include "integer/integer.cuh"
#include "integer/integer_utilities.h"
#include "integer/negation.cuh"
#include "integer/radix_ciphertext.h"
#include "integer/scalar_shifts.cuh"
#include "linear_algebra.h"
#include "pbs/programmable_bootstrap.h"
@@ -32,16 +33,15 @@ __host__ void scratch_cuda_integer_abs_kb(
}

template <typename Torus>
__host__ void
host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *ct, void *const *bsks,
uint64_t *const *ksks, int_abs_buffer<uint64_t> *mem_ptr,
bool is_signed, uint32_t num_blocks) {
__host__ void legacy_host_integer_abs_kb_async(
cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, Torus *ct, void *const *bsks, uint64_t *const *ksks,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed, uint32_t num_blocks) {
if (!is_signed)
return;

auto radix_params = mem_ptr->params;
auto mask = mem_ptr->mask;
auto mask = (Torus *)(mem_ptr->mask->ptr);

auto big_lwe_dimension = radix_params.big_lwe_dimension;
auto big_lwe_size = big_lwe_dimension + 1;
@@ -52,20 +52,58 @@
cuda_memcpy_async_gpu_to_gpu(mask, ct, num_blocks * big_lwe_size_bytes,
streams[0], gpu_indexes[0]);

host_integer_radix_arithmetic_scalar_shift_kb_inplace(
host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, mask, num_bits_in_ciphertext - 1,
mem_ptr->arithmetic_scalar_shift_mem, bsks, ksks, num_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
radix_params.big_lwe_dimension, num_blocks);
legacy_host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct,
radix_params.big_lwe_dimension, num_blocks);

uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, ct, nullptr, nullptr, mem_ptr->scp_mem,
bsks, ksks, num_blocks, requested_flag, uses_carry);

host_integer_radix_bitop_kb(streams, gpu_indexes, gpu_count, ct, mask, ct,
mem_ptr->bitxor_mem, bsks, ksks, num_blocks);
host_integer_radix_bitop_kb<Torus>(streams, gpu_indexes, gpu_count, ct, mask,
ct, mem_ptr->bitxor_mem, bsks, ksks,
num_blocks);
}

template <typename Torus>
__host__ void
host_integer_abs_kb(cudaStream_t const *streams, uint32_t const *gpu_indexes,
uint32_t gpu_count, CudaRadixCiphertextData *ct,
void *const *bsks, uint64_t *const *ksks,
int_abs_buffer<uint64_t> *mem_ptr, bool is_signed) {
if (!is_signed)
return;

auto mask = mem_ptr->mask;

uint32_t num_bits_in_ciphertext =
(31 - __builtin_clz(mem_ptr->params.message_modulus)) *
ct->num_radix_blocks;

copy_radix_ciphertext_to_larger_output_slice_async<Torus>(
streams[0], gpu_indexes[0], mask, ct, 0);

host_integer_radix_arithmetic_scalar_shift_kb_inplace<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)(mask->ptr),
num_bits_in_ciphertext - 1, mem_ptr->arithmetic_scalar_shift_mem, bsks,
ksks, ct->num_radix_blocks);
host_addition<Torus>(streams[0], gpu_indexes[0], ct, mask, ct);

uint32_t requested_flag = outputFlag::FLAG_NONE;
uint32_t uses_carry = 0;
host_propagate_single_carry<Torus>(
streams, gpu_indexes, gpu_count, (Torus *)(ct->ptr), nullptr, nullptr,
mem_ptr->scp_mem, bsks, ksks, ct->num_radix_blocks, requested_flag,
uses_carry);

host_integer_radix_bitop_kb<Torus>(streams, gpu_indexes, gpu_count,
(Torus *)(ct->ptr), (Torus *)(mask->ptr),
(Torus *)(ct->ptr), mem_ptr->bitxor_mem,
bsks, ksks, ct->num_radix_blocks);
}

#endif // TFHE_RS_ABS_CUH
19 changes: 8 additions & 11 deletions backends/tfhe-cuda-backend/cuda/src/integer/cmux.cu
@@ -25,19 +25,16 @@ void scratch_cuda_integer_radix_cmux_kb_64(

void cuda_cmux_integer_radix_ciphertext_kb_64(
void *const *streams, uint32_t const *gpu_indexes, uint32_t gpu_count,
void *lwe_array_out, void const *lwe_condition, void const *lwe_array_true,
void const *lwe_array_false, int8_t *mem_ptr, void *const *bsks,
void *const *ksks, uint32_t lwe_ciphertext_count) {
CudaRadixCiphertextData *lwe_array_out,
CudaRadixCiphertextData const *lwe_condition,
CudaRadixCiphertextData const *lwe_array_true,
CudaRadixCiphertextData const *lwe_array_false, int8_t *mem_ptr,
void *const *bsks, void *const *ksks) {

host_integer_radix_cmux_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(lwe_array_out),
static_cast<const uint64_t *>(lwe_condition),
static_cast<const uint64_t *>(lwe_array_true),
static_cast<const uint64_t *>(lwe_array_false),
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks),

lwe_ciphertext_count);
(cudaStream_t *)(streams), gpu_indexes, gpu_count, lwe_array_out,
lwe_condition, lwe_array_true, lwe_array_false,
(int_cmux_buffer<uint64_t> *)mem_ptr, bsks, (uint64_t **)(ksks));
}

void cleanup_cuda_integer_radix_cmux(void *const *streams,
(Diffs for the remaining changed files are not shown here.)
