Skip to content

Commit

Permalink
chore(gpu): check for all cuda errors and abort in device.cu/.h
Browse files Browse the repository at this point in the history
Remove some legacy compilation warnings
  • Loading branch information
agnesLeroy committed Jan 31, 2024
1 parent eeaf45d commit 71bff09
Show file tree
Hide file tree
Showing 11 changed files with 197 additions and 290 deletions.
63 changes: 35 additions & 28 deletions backends/tfhe-cuda-backend/cuda/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,75 +11,82 @@

extern "C" {

// Wrap every CUDA runtime API call with this macro so that a failure is
// reported with the exact call site (file and line) before aborting.
// The do/while(0) wrapper makes the macro expand to a single statement,
// so it composes safely with if/else without braces.
#define check_cuda_error(ans)                                                  \
  do {                                                                         \
    cuda_error((ans), __FILE__, __LINE__);                                     \
  } while (0)

// Print the CUDA error string together with the call site and abort the
// process. `code` is the status returned by a CUDA runtime API call; a
// cudaSuccess status is a no-op.
inline void cuda_error(cudaError_t code, const char *file, int line) {
  if (code != cudaSuccess) {
    std::fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code),
                 file, line);
    std::abort();
  }
}
// Abort the process with a formatted panic message that includes the call
// site (file, line, enclosing function). Usage: PANIC("bad size %lu", n);
// The format string is concatenated at compile time, so `format` must be a
// string literal. ##__VA_ARGS__ swallows the comma when no argument is given
// (GNU/Clang extension, consistent with the rest of this backend).
// do/while(0) makes the macro a single statement, safe inside if/else.
#define PANIC(format, ...)                                                     \
  do {                                                                         \
    std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__,         \
                 __LINE__, __func__, ##__VA_ARGS__);                           \
    std::abort();                                                              \
  } while (0)

// Thin wrapper around a CUDA stream bound to one GPU. Every CUDA call is
// routed through check_cuda_error so failures abort with a precise call site.
// NOTE: there is no destructor — callers must invoke release() explicitly
// before the object goes away, or the stream leaks.
struct cuda_stream_t {
  cudaStream_t stream;
  uint32_t gpu_index;

  // Create the stream. NOTE(review): assumes the caller has already made
  // gpu_index the current device before construction — confirm at call sites,
  // since cudaStreamCreate binds the stream to the *current* device.
  cuda_stream_t(uint32_t gpu_index) {
    this->gpu_index = gpu_index;
    check_cuda_error(cudaStreamCreate(&stream));
  }

  // Destroy the stream on its GPU. Work still queued on the stream is not
  // waited for here; synchronize() first if completion matters.
  void release() {
    check_cuda_error(cudaSetDevice(gpu_index));
    check_cuda_error(cudaStreamDestroy(stream));
  }

  // Block the host until all work queued on this stream has completed.
  void synchronize() { check_cuda_error(cudaStreamSynchronize(stream)); }
};

// C API exported to the Rust side of the backend. All routines abort via
// check_cuda_error on any CUDA failure, so none of them report errors through
// their return value; the few non-void returns below carry real data.

// Allocate and return a new stream bound to gpu_index.
cuda_stream_t *cuda_create_stream(uint32_t gpu_index);

// Release the stream's CUDA resources and free the wrapper.
void cuda_destroy_stream(cuda_stream_t *stream);

// Synchronous device allocation of `size` bytes on gpu_index.
void *cuda_malloc(uint64_t size, uint32_t gpu_index);

// Stream-ordered device allocation of `size` bytes on the stream's GPU.
void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream);

// Abort if the device does not have `size` bytes of free memory.
void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index);

// True when the current device supports cooperative-group kernel launches.
bool cuda_check_support_cooperative_groups();

// Synchronous device-to-host copy of `size` bytes.
void cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size);

// Asynchronous host-to-device copy ordered on `stream`.
void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size,
                              cuda_stream_t *stream);

// Asynchronous device-to-device copy ordered on `stream`.
void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size,
                                  cuda_stream_t *stream);

// Synchronous host-to-device copy of `size` bytes.
void cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size);

// Asynchronous device-to-host copy ordered on `stream`.
void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size,
                              cuda_stream_t *stream);

// Asynchronous byte-wise memset ordered on `stream`. NOTE(review): cudaMemset
// semantics are byte-wise — presumably `val` is truncated to its low byte;
// confirm against the implementation in device.cu.
void cuda_memset_async(void *dest, uint64_t val, uint64_t size,
                       cuda_stream_t *stream);

// Number of CUDA devices visible to the process.
int cuda_get_number_of_gpus();

// Block the host until all work on gpu_index has completed.
void cuda_synchronize_device(uint32_t gpu_index);

// Synchronous free of device memory allocated on gpu_index.
void cuda_drop(void *ptr, uint32_t gpu_index);

// Stream-ordered free of device memory.
void cuda_drop_async(void *ptr, cuda_stream_t *stream);

// Maximum shared memory per block (bytes) available on gpu_index.
int cuda_get_max_shared_memory(uint32_t gpu_index);

// Block the host until all work queued on `stream` has completed.
void cuda_synchronize_stream(cuda_stream_t *stream);

// (Duplicate check_cuda_error/cuda_error definitions removed: the canonical
// versions, which call std::abort() unconditionally, are defined at the top
// of this extern "C" block. Keeping a second cuda_error overload with a
// defaulted `bool abort` parameter made every 3-argument call ambiguous.)
}

template <typename Torus>
Expand Down
19 changes: 0 additions & 19 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -361,10 +361,6 @@ template <typename Torus> struct int_radix_lut {
this->params = params;
this->num_blocks = num_radix_blocks;
Torus lut_indexes_size = num_radix_blocks * sizeof(Torus);
Torus big_size =
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
Torus small_size =
(params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
Torus lut_buffer_size =
(params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus);

Expand Down Expand Up @@ -537,7 +533,6 @@ template <typename Torus> struct int_mul_memory {
Torus *vector_result_sb;
Torus *block_mul_res;
Torus *small_lwe_vector;
Torus *lwe_pbs_out_array;
int_radix_lut<Torus> *luts_array; // lsb msb
int_radix_lut<Torus> *luts_message;
int_radix_lut<Torus> *luts_carry;
Expand Down Expand Up @@ -577,10 +572,6 @@ template <typename Torus> struct int_mul_memory {
stream);
small_lwe_vector = (Torus *)cuda_malloc_async(
total_block_count * (lwe_dimension + 1) * sizeof(Torus), stream);
lwe_pbs_out_array =
(Torus *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) *
total_block_count * sizeof(Torus),
stream);

// create int_radix_lut objects for lsb, msb, message, carry
// luts_array -> lut = {lsb_acc, msb_acc}
Expand Down Expand Up @@ -637,7 +628,6 @@ template <typename Torus> struct int_mul_memory {
cuda_drop_async(vector_result_sb, stream);
cuda_drop_async(block_mul_res, stream);
cuda_drop_async(small_lwe_vector, stream);
cuda_drop_async(lwe_pbs_out_array, stream);

luts_array->release(stream);
luts_message->release(stream);
Expand Down Expand Up @@ -834,8 +824,6 @@ template <typename Torus> struct int_cmux_buffer {
if (allocate_gpu_memory) {
Torus big_size =
(params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);
Torus small_size =
(params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus);

tmp_true_ct = (Torus *)cuda_malloc_async(big_size, stream);
tmp_false_ct = (Torus *)cuda_malloc_async(big_size, stream);
Expand Down Expand Up @@ -1048,13 +1036,6 @@ template <typename Torus> struct int_tree_sign_reduction_buffer {
return msb;
};

auto last_leaf_noop_lut_f = [this](Torus x) -> Torus {
int msb = (x >> 2) & 3;
int lsb = x & 3;

return this->block_selector_f(msb, lsb);
};

if (allocate_gpu_memory) {
tmp_x = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) *
num_radix_blocks * sizeof(Torus),
Expand Down
Loading

0 comments on commit 71bff09

Please sign in to comment.