fix(gpu): general fixes on indexes used in multi-gpu context #1997

Merged: 4 commits, Jan 31, 2025
2 changes: 2 additions & 0 deletions backends/tfhe-cuda-backend/cuda/include/device.h
@@ -27,6 +27,8 @@ inline void cuda_error(cudaError_t code, const char *file, int line) {
     std::abort(); \
   }

+void cuda_set_device(uint32_t gpu_index);
+
 cudaEvent_t cuda_create_event(uint32_t gpu_index);

 void cuda_event_record(cudaEvent_t event, cudaStream_t stream,
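The diff adds only the declaration of cuda_set_device. A minimal sketch of what the wrapper plausibly does, assuming it forwards to cudaSetDevice and routes the result through the check_cuda_error helper that device.h already provides (the body below is an assumption, not shown in this PR):

// Hypothetical implementation, for illustration only: make gpu_index the
// calling thread's current device and abort on any CUDA runtime error.
void cuda_set_device(uint32_t gpu_index) {
  check_cuda_error(cudaSetDevice(gpu_index));
}

Funneling every device switch through one checked helper gives callers error handling for free; a bare cudaSetDevice call silently discards its error code unless each call site remembers to check it.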
@@ -151,7 +151,9 @@ template <typename Torus> struct int_radix_lut {
   std::vector<Torus *> lwe_after_pbs_vec;
   std::vector<Torus *> lwe_trivial_indexes_vec;

-  int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+  uint32_t *gpu_indexes;
+
+  int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
                 uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
                 uint32_t num_radix_blocks, bool allocate_gpu_memory) {

@@ -162,11 +164,14 @@
     Torus lut_buffer_size =
         (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);

+    gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
+    std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));
+
     ///////////////
     active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
     for (uint i = 0; i < active_gpu_count; i++) {
-      cudaSetDevice(i);
+      cuda_set_device(i);
       int8_t *gpu_pbs_buffer;
       auto num_blocks_on_gpu =
           get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
@@ -257,7 +262,7 @@
   }

   // constructor to reuse memory
-  int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+  int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
                 uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
                 uint32_t num_radix_blocks, int_radix_lut *base_lut_object) {

@@ -268,6 +273,9 @@
     Torus lut_buffer_size =
         (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);

+    gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
+    std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));
+
     // base lut object should have bigger or equal memory than current one
     assert(num_radix_blocks <= base_lut_object->num_blocks);
     // pbs
@@ -334,7 +342,7 @@
   }

   // Construction for many luts
-  int_radix_lut(cudaStream_t const *streams, uint32_t const *gpu_indexes,
+  int_radix_lut(cudaStream_t const *streams, uint32_t const *input_gpu_indexes,
                 uint32_t gpu_count, int_radix_params params, uint32_t num_luts,
                 uint32_t num_radix_blocks, uint32_t num_many_lut,
                 bool allocate_gpu_memory) {
@@ -346,11 +354,14 @@
     Torus lut_buffer_size =
         (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);

+    gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
+    std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));
+
     ///////////////
     active_gpu_count = get_active_gpu_count(num_radix_blocks, gpu_count);
     cuda_synchronize_stream(streams[0], gpu_indexes[0]);
     for (uint i = 0; i < active_gpu_count; i++) {
-      cudaSetDevice(i);
+      cuda_set_device(i);
       int8_t *gpu_pbs_buffer;
       auto num_blocks_on_gpu =
           get_num_inputs_on_gpu(num_radix_blocks, i, active_gpu_count);
@@ -496,6 +507,7 @@ template <typename Torus> struct int_radix_lut {

   void release(cudaStream_t const *streams, uint32_t const *gpu_indexes,
                uint32_t gpu_count) {
+    free(this->gpu_indexes);
     for (uint i = 0; i < active_gpu_count; i++) {
       cuda_drop_async(lut_vec[i], streams[i], gpu_indexes[i]);
       cuda_drop_async(lut_indexes_vec[i], streams[i], gpu_indexes[i]);
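All three int_radix_lut constructors above now follow the same pattern: deep-copy the caller's GPU-index array into an allocation the struct owns, and free that copy in release. A self-contained sketch of the ownership pattern, with illustrative names rather than the real int_radix_lut members:

#include <cstdint>
#include <cstdlib>
#include <cstring>

// Illustrative struct (hypothetical, not from the PR): copy the caller's
// GPU-index array on construction so the object never aliases memory it
// does not own; free the copy exactly once in release().
struct gpu_index_owner {
  uint32_t *gpu_indexes;

  gpu_index_owner(uint32_t const *input_gpu_indexes, uint32_t gpu_count) {
    gpu_indexes = (uint32_t *)malloc(gpu_count * sizeof(uint32_t));
    std::memcpy(gpu_indexes, input_gpu_indexes, gpu_count * sizeof(uint32_t));
  }

  void release() { free(gpu_indexes); }
};

Because of the copy, the caller's index array may be freed or reused as soon as a constructor returns, which matters when the LUT outlives the scope that built the index list.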
@@ -5,12 +5,12 @@

 template <typename Torus>
 bool supports_distributed_shared_memory_on_multibit_programmable_bootstrap(
-    uint32_t polynomial_size);
+    uint32_t polynomial_size, int max_shared_memory);

 template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
     uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t level_count);
+    uint32_t level_count, int max_shared_memory);

 #if CUDA_ARCH >= 900
 template <typename Torus>
@@ -114,6 +114,8 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::MULTI_BIT> {
              uint32_t polynomial_size, uint32_t level_count,
              uint32_t input_lwe_ciphertext_count, uint32_t lwe_chunk_size,
              PBS_VARIANT pbs_variant, bool allocate_gpu_memory) {
+    cuda_set_device(gpu_index);
+
     this->pbs_variant = pbs_variant;
     this->lwe_chunk_size = lwe_chunk_size;
     auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);
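The multi-bit pbs_buffer constructor now pins the device before doing anything else. CUDA runtime allocations land on whichever device is current for the calling thread, so without this call a scratch buffer intended for gpu_index could end up on whatever GPU happened to be current. A condensed sketch of the rule the change enforces (buffer_size and d_mem are illustrative names, not from the PR):

// Pin the intended device first; only then allocate scratch memory.
cuda_set_device(gpu_index);
int8_t *d_mem = nullptr;
check_cuda_error(cudaMalloc(&d_mem, buffer_size)); // lands on gpu_index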
14 changes: 7 additions & 7 deletions backends/tfhe-cuda-backend/cuda/include/pbs/pbs_utilities.h
@@ -61,7 +61,7 @@ get_buffer_size_partial_sm_programmable_bootstrap_cg(uint32_t polynomial_size) {

 template <typename Torus>
 bool supports_distributed_shared_memory_on_classic_programmable_bootstrap(
-    uint32_t polynomial_size);
+    uint32_t polynomial_size, int max_shared_memory);

 template <typename Torus, PBS_TYPE pbs_type> struct pbs_buffer;

@@ -77,10 +77,10 @@ template <typename Torus> struct pbs_buffer<Torus, PBS_TYPE::CLASSICAL> {
              uint32_t polynomial_size, uint32_t level_count,
              uint32_t input_lwe_ciphertext_count, PBS_VARIANT pbs_variant,
              bool allocate_gpu_memory) {
-
+    cuda_set_device(gpu_index);
     this->pbs_variant = pbs_variant;

-    auto max_shared_memory = cuda_get_max_shared_memory(0);
+    auto max_shared_memory = cuda_get_max_shared_memory(gpu_index);

     if (allocate_gpu_memory) {
       switch (pbs_variant) {
@@ -157,7 +157,7 @@

     bool supports_dsm =
         supports_distributed_shared_memory_on_classic_programmable_bootstrap<
-            Torus>(polynomial_size);
+            Torus>(polynomial_size, max_shared_memory);

     uint64_t full_sm =
         get_buffer_size_full_sm_programmable_bootstrap_tbc<Torus>(
@@ -218,8 +218,7 @@
 template <typename Torus>
 uint64_t get_buffer_size_programmable_bootstrap_cg(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count) {
-  int max_shared_memory = cuda_get_max_shared_memory(0);
+    uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) {
   uint64_t full_sm =
       get_buffer_size_full_sm_programmable_bootstrap_cg<Torus>(polynomial_size);
   uint64_t partial_sm =
@@ -245,7 +244,8 @@ template <typename Torus>
 bool has_support_to_cuda_programmable_bootstrap_cg(uint32_t glwe_dimension,
                                                    uint32_t polynomial_size,
                                                    uint32_t level_count,
-                                                   uint32_t num_samples);
+                                                   uint32_t num_samples,
+                                                   int max_shared_memory);

 template <typename Torus>
 void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector(
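The recurring change in this file replaces internal cuda_get_max_shared_memory(0) queries, which always reported device 0's shared-memory limit, with a max_shared_memory value supplied by the caller, so the support checks reflect the GPU actually in use. A hedged call-site sketch (the surrounding variables are hypothetical):

// Query the shared-memory limit once for the GPU in use, then thread it
// through the support checks instead of letting them assume device 0.
int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
bool cg_supported = has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
    glwe_dimension, polynomial_size, level_count, num_samples,
    max_shared_memory);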
@@ -8,7 +8,7 @@ extern "C" {

 bool has_support_to_cuda_programmable_bootstrap_cg_multi_bit(
     uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t num_samples);
+    uint32_t num_samples, int max_shared_memory);

 void cuda_convert_lwe_multi_bit_programmable_bootstrap_key_64(
     void *stream, uint32_t gpu_index, void *dest, void const *src,
6 changes: 3 additions & 3 deletions backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh
@@ -11,7 +11,7 @@ void cuda_convert_lwe_ciphertext_vector_to_gpu(cudaStream_t stream,
                                                uint32_t gpu_index, T *dest,
                                                T *src, uint32_t number_of_cts,
                                                uint32_t lwe_dimension) {
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
   uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
   cuda_memcpy_async_to_gpu(dest, src, size, stream, gpu_index);
 }
@@ -21,7 +21,7 @@ void cuda_convert_lwe_ciphertext_vector_to_cpu(cudaStream_t stream,
                                                uint32_t gpu_index, T *dest,
                                                T *src, uint32_t number_of_cts,
                                                uint32_t lwe_dimension) {
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
   uint64_t size = number_of_cts * (lwe_dimension + 1) * sizeof(T);
   cuda_memcpy_async_to_cpu(dest, src, size, stream, gpu_index);
 }
@@ -55,7 +55,7 @@ __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index,
                                   Torus const *glwe_array_in,
                                   uint32_t const *nth_array, uint32_t num_nths,
                                   uint32_t glwe_dimension) {
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);

   dim3 grid(num_nths);
   dim3 thds(params::degree / params::opt);
@@ -261,7 +261,7 @@ __host__ void host_fast_packing_keyswitch_lwe_list_to_glwe(

   // Optimization of packing keyswitch when packing many LWEs

-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);
   check_cuda_error(cudaGetLastError());

   int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;
2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/crypto/ggsw.cuh
@@ -57,7 +57,7 @@ void batch_fft_ggsw_vector(cudaStream_t *streams, uint32_t *gpu_indexes,
   if (gpu_count != 1)
     PANIC("GPU error (batch_fft_ggsw_vector): multi-GPU execution is not "
           "supported yet.")
-  cudaSetDevice(gpu_indexes[0]);
+  cuda_set_device(gpu_indexes[0]);

   int shared_memory_size = sizeof(double) * polynomial_size;

4 changes: 2 additions & 2 deletions backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh
@@ -105,7 +105,7 @@ __host__ void host_keyswitch_lwe_ciphertext_vector(
     uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count,
     uint32_t num_samples) {

-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);

   constexpr int num_threads_y = 32;
   int num_blocks, num_threads_x;
@@ -160,7 +160,7 @@ __host__ void scratch_packing_keyswitch_lwe_list_to_glwe(
     cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t num_lwes, bool allocate_gpu_memory) {
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);

   int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size;

2 changes: 1 addition & 1 deletion backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh
@@ -110,7 +110,7 @@ template <typename Torus>
 __host__ void host_modulus_switch_inplace(cudaStream_t stream,
                                           uint32_t gpu_index, Torus *array,
                                           int size, uint32_t log_modulus) {
-  cudaSetDevice(gpu_index);
+  cuda_set_device(gpu_index);

   int num_threads = 0, num_blocks = 0;
   getNumBlocksAndThreads(size, 1024, num_blocks, num_threads);