From 71bff0963cb6a00500736a065643cbe6e4abb548 Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Thu, 25 Jan 2024 11:10:46 +0100 Subject: [PATCH] chore(gpu): check for all cuda errors and abort in device.cu/.h Remove some legacy compilation warnings --- .../tfhe-cuda-backend/cuda/include/device.h | 63 ++--- .../tfhe-cuda-backend/cuda/include/integer.h | 19 -- backends/tfhe-cuda-backend/cuda/src/device.cu | 233 +++++------------- .../cuda/src/integer/comparison.cuh | 6 +- .../cuda/src/integer/integer.cuh | 1 - .../cuda/src/integer/multiplication.cuh | 23 +- .../cuda/src/integer/scalar_bitops.cuh | 4 +- .../cuda/src/integer/scalar_comparison.cuh | 28 ++- .../cuda/src/integer/scalar_shifts.cuh | 15 +- backends/tfhe-cuda-backend/src/cuda_bind.rs | 25 +- tfhe/src/core_crypto/gpu/mod.rs | 70 +++--- 11 files changed, 197 insertions(+), 290 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h index bfe8c64f49..0dc28684b5 100644 --- a/backends/tfhe-cuda-backend/cuda/include/device.h +++ b/backends/tfhe-cuda-backend/cuda/include/device.h @@ -11,6 +11,22 @@ extern "C" { +#define check_cuda_error(ans) \ + { cuda_error((ans), __FILE__, __LINE__); } +inline void cuda_error(cudaError_t code, const char *file, int line) { + if (code != cudaSuccess) { + std::fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), + file, line); + std::abort(); + } +} +#define PANIC(format, ...) \ + { \ + std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__, \ + __LINE__, __func__, ##__VA_ARGS__); \ + std::abort(); \ + } + struct cuda_stream_t { cudaStream_t stream; uint32_t gpu_index; @@ -18,68 +34,59 @@ struct cuda_stream_t { cuda_stream_t(uint32_t gpu_index) { this->gpu_index = gpu_index; - cudaStreamCreate(&stream); + check_cuda_error(cudaStreamCreate(&stream)); } void release() { - cudaSetDevice(gpu_index); - cudaStreamDestroy(stream); + check_cuda_error(cudaSetDevice(gpu_index)); + check_cuda_error(cudaStreamDestroy(stream)); } - void synchronize() { cudaStreamSynchronize(stream); } + void synchronize() { + check_cuda_error(cudaStreamSynchronize(stream)); + } }; cuda_stream_t *cuda_create_stream(uint32_t gpu_index); -int cuda_destroy_stream(cuda_stream_t *stream); +void cuda_destroy_stream(cuda_stream_t *stream); void *cuda_malloc(uint64_t size, uint32_t gpu_index); void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream); -int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index); +void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index); -int cuda_check_support_cooperative_groups(); +bool cuda_check_support_cooperative_groups(); -int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size); +void cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size); -int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, +void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, cuda_stream_t *stream); -int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size, +void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size, cuda_stream_t *stream); -int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size); +void cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size); -int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, +void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, cuda_stream_t *stream); -int cuda_memset_async(void *dest, uint64_t val, uint64_t size, +void cuda_memset_async(void *dest, uint64_t val, uint64_t 
size, cuda_stream_t *stream); int cuda_get_number_of_gpus(); -int cuda_synchronize_device(uint32_t gpu_index); +void cuda_synchronize_device(uint32_t gpu_index); -int cuda_drop(void *ptr, uint32_t gpu_index); +void cuda_drop(void *ptr, uint32_t gpu_index); -int cuda_drop_async(void *ptr, cuda_stream_t *stream); +void cuda_drop_async(void *ptr, cuda_stream_t *stream); int cuda_get_max_shared_memory(uint32_t gpu_index); -int cuda_synchronize_stream(cuda_stream_t *stream); +void cuda_synchronize_stream(cuda_stream_t *stream); -#define check_cuda_error(ans) \ - { cuda_error((ans), __FILE__, __LINE__); } -inline void cuda_error(cudaError_t code, const char *file, int line, - bool abort = true) { - if (code != cudaSuccess) { - fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file, - line); - if (abort) - exit(code); - } -} } template diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index 7f7b9eb2c8..b822deaf6e 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -361,10 +361,6 @@ template struct int_radix_lut { this->params = params; this->num_blocks = num_radix_blocks; Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); - Torus big_size = - (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); - Torus small_size = - (params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); Torus lut_buffer_size = (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); @@ -537,7 +533,6 @@ template struct int_mul_memory { Torus *vector_result_sb; Torus *block_mul_res; Torus *small_lwe_vector; - Torus *lwe_pbs_out_array; int_radix_lut *luts_array; // lsb msb int_radix_lut *luts_message; int_radix_lut *luts_carry; @@ -577,10 +572,6 @@ template struct int_mul_memory { stream); small_lwe_vector = (Torus *)cuda_malloc_async( total_block_count * (lwe_dimension + 1) * sizeof(Torus), stream); - lwe_pbs_out_array = - (Torus *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) * - total_block_count * sizeof(Torus), - stream); // create int_radix_lut objects for lsb, msb, message, carry // luts_array -> lut = {lsb_acc, msb_acc} @@ -637,7 +628,6 @@ template struct int_mul_memory { cuda_drop_async(vector_result_sb, stream); cuda_drop_async(block_mul_res, stream); cuda_drop_async(small_lwe_vector, stream); - cuda_drop_async(lwe_pbs_out_array, stream); luts_array->release(stream); luts_message->release(stream); @@ -834,8 +824,6 @@ template struct int_cmux_buffer { if (allocate_gpu_memory) { Torus big_size = (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); - Torus small_size = - (params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); tmp_true_ct = (Torus *)cuda_malloc_async(big_size, stream); tmp_false_ct = (Torus *)cuda_malloc_async(big_size, stream); @@ -1048,13 +1036,6 @@ template struct int_tree_sign_reduction_buffer { return msb; }; - auto last_leaf_noop_lut_f = [this](Torus x) -> Torus { - int msb = (x >> 2) & 3; - int lsb = x & 3; - - return this->block_selector_f(msb, lsb); - }; - if (allocate_gpu_memory) { tmp_x = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus), diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index 7b811c540a..d9ec4c81c9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -4,25 +4,23 @@ /// Unsafe function to 
create a CUDA stream, must check first that GPU exists cuda_stream_t *cuda_create_stream(uint32_t gpu_index) { - cudaSetDevice(gpu_index); + check_cuda_error(cudaSetDevice(gpu_index)); cuda_stream_t *stream = new cuda_stream_t(gpu_index); return stream; } /// Unsafe function to destroy CUDA stream, must check first the GPU exists -int cuda_destroy_stream(cuda_stream_t *stream) { +void cuda_destroy_stream(cuda_stream_t *stream) { stream->release(); - return 0; } /// Unsafe function that will try to allocate even if gpu_index is invalid /// or if there's not enough memory. A safe wrapper around it must call /// cuda_check_valid_malloc() first void *cuda_malloc(uint64_t size, uint32_t gpu_index) { - cudaSetDevice(gpu_index); + check_cuda_error(cudaSetDevice(gpu_index)); void *ptr; - cudaMalloc((void **)&ptr, size); - check_cuda_error(cudaGetLastError()); + check_cuda_error(cudaMalloc((void **)&ptr, size)); return ptr; } @@ -30,7 +28,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) { /// Allocates a size-byte array at the device memory. Tries to do it /// asynchronously. void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) { - cudaSetDevice(stream->gpu_index); + check_cuda_error(cudaSetDevice(stream->gpu_index)); void *ptr; #ifndef CUDART_VERSION @@ -52,184 +50,104 @@ void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) { return ptr; } -/// Checks that allocation is valid -/// 0: valid -/// -1: invalid, not enough memory in device -/// -2: invalid, gpu index doesn't exist -int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) { - - if (gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; - } - cudaSetDevice(gpu_index); +/// Check that allocation is valid +void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); size_t total_mem, free_mem; - cudaMemGetInfo(&free_mem, &total_mem); + check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem)); if (size > free_mem) { - // error code: not enough memory - return -1; + PANIC("Cuda error: not enough memory on device. " + "Available: %zu vs Requested: %lu", free_mem, size); } - return 0; } /// Returns -/// -> 0 if Cooperative Groups is not supported. -/// -> 1 otherwise -int cuda_check_support_cooperative_groups() { +/// false if Cooperative Groups is not supported. 
+/// true otherwise +bool cuda_check_support_cooperative_groups() { int cooperative_groups_supported = 0; - cudaDeviceGetAttribute(&cooperative_groups_supported, - cudaDevAttrCooperativeLaunch, 0); + check_cuda_error(cudaDeviceGetAttribute(&cooperative_groups_supported, + cudaDevAttrCooperativeLaunch, 0)); return cooperative_groups_supported > 0; } -/// Tries to copy memory to the GPU asynchronously -/// 0: success -/// -1: error, invalid device pointer -/// -2: error, gpu index doesn't exist -/// -3: error, zero copy size -int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, +/// Copy memory to the GPU asynchronously +void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, cuda_stream_t *stream) { - if (size == 0) { - // error code: zero copy size - return -3; - } - - if (stream->gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; - } cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, dest); + check_cuda_error(cudaPointerGetAttributes(&attr, dest)); if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid device pointer in async copy to GPU."); } - cudaSetDevice(stream->gpu_index); + check_cuda_error(cudaSetDevice(stream->gpu_index)); check_cuda_error( cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream)); - return 0; } -/// Tries to copy memory to the GPU synchronously -/// 0: success -/// -1: error, invalid device pointer -/// -2: error, gpu index doesn't exist -/// -3: error, zero copy size -int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) { - if (size == 0) { - // error code: zero copy size - return -3; - } - +/// Copy memory to the GPU synchronously +void cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) { cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, dest); + check_cuda_error(cudaPointerGetAttributes(&attr, dest)); if (attr.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid device pointer in copy to GPU."); } - check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice)); - return 0; } -/// Tries to copy memory to the CPU synchronously -/// 0: success -/// -1: error, invalid device pointer -/// -2: error, gpu index doesn't exist -/// -3: error, zero copy size -int cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) { - if (size == 0) { - // error code: zero copy size - return -3; - } - +/// Copy memory to the CPU synchronously +void cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) { cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, src); + check_cuda_error(cudaPointerGetAttributes(&attr, src)); if (attr.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid device pointer in copy to CPU."); } - check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost)); - return 0; } -/// Tries to copy memory within a GPU asynchronously -/// 0: success -/// -1: error, invalid device pointer -/// -2: error, gpu index doesn't exist -/// -3: error, zero copy size -int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size, +/// Copy memory within a GPU asynchronously +void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size, cuda_stream_t *stream) { - if (size == 0) { - // error code: zero copy size - return -3; - } - - if (stream->gpu_index >= cuda_get_number_of_gpus()) { - // error 
code: invalid gpu_index - return -2; - } cudaPointerAttributes attr_dest; - cudaPointerGetAttributes(&attr_dest, dest); + check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest)); if (attr_dest.device != stream->gpu_index && attr_dest.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU."); } cudaPointerAttributes attr_src; - cudaPointerGetAttributes(&attr_src, src); + check_cuda_error(cudaPointerGetAttributes(&attr_src, src)); if (attr_src.device != stream->gpu_index && attr_src.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU."); } if (attr_src.device != attr_dest.device) { - // error code: different devices - return -1; + PANIC("Cuda error: different devices specified in copy from GPU to GPU."); } - cudaSetDevice(stream->gpu_index); + check_cuda_error(cudaSetDevice(stream->gpu_index)); check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream->stream)); - return 0; } /// Synchronizes device -/// 0: success -/// -2: error, gpu index doesn't exist -int cuda_synchronize_device(uint32_t gpu_index) { - if (gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; - } - cudaSetDevice(gpu_index); - cudaDeviceSynchronize(); - return 0; +void cuda_synchronize_device(uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); + check_cuda_error(cudaDeviceSynchronize()); } -int cuda_memset_async(void *dest, uint64_t val, uint64_t size, +void cuda_memset_async(void *dest, uint64_t val, uint64_t size, cuda_stream_t *stream) { + cudaPointerAttributes attr; + check_cuda_error(cudaPointerGetAttributes(&attr, dest)); if (size == 0) { - // error code: zero copy size - return -3; - } - - if (stream->gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; + PANIC("Cuda error: invalid zero size in cuda_memset_async."); } - cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, dest); if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid dest device pointer in cuda memset."); } - cudaSetDevice(stream->gpu_index); + check_cuda_error(cudaSetDevice(stream->gpu_index)); check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream)); - return 0; } template @@ -246,8 +164,9 @@ void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value, int num_blocks = (n + block_size - 1) / block_size; // Launch the kernel - cuda_set_value_kernel<<>>(d_array, value, - n); + cuda_set_value_kernel<<>> + (d_array, value, n); + check_cuda_error(cudaGetLastError()); } /// Explicitly instantiate cuda_set_value_async for 32 and 64 bits @@ -256,57 +175,37 @@ template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array, template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array, uint32_t value, uint32_t n); -/// Tries to copy memory to the GPU asynchronously -/// 0: success -/// -1: error, invalid device pointer -/// -2: error, gpu index doesn't exist -/// -3: error, zero copy size -int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, +/// Copy memory to the CPU asynchronously +void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, cuda_stream_t *stream) { - if (size == 0) { - // error code: zero copy size - return -3; - } - - if 
(stream->gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; - } cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, src); + check_cuda_error(cudaPointerGetAttributes(&attr, src)); if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid src device pointer in copy to CPU async."); } - cudaSetDevice(stream->gpu_index); + check_cuda_error(cudaSetDevice(stream->gpu_index)); check_cuda_error( cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream)); - return 0; } /// Return number of GPUs available int cuda_get_number_of_gpus() { int num_gpus; - cudaGetDeviceCount(&num_gpus); + check_cuda_error(cudaGetDeviceCount(&num_gpus)); return num_gpus; } /// Drop a cuda array -int cuda_drop(void *ptr, uint32_t gpu_index) { - if (gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; - } - cudaSetDevice(gpu_index); +void cuda_drop(void *ptr, uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); check_cuda_error(cudaFree(ptr)); - return 0; } -/// Drop a cuda array. Tries to do it asynchronously -int cuda_drop_async(void *ptr, cuda_stream_t *stream) { +/// Drop a cuda array asynchronously, if supported on the device +void cuda_drop_async(void *ptr, cuda_stream_t *stream) { - cudaSetDevice(stream->gpu_index); + check_cuda_error(cudaSetDevice(stream->gpu_index)); #ifndef CUDART_VERSION #error CUDART_VERSION Undefined! #elif (CUDART_VERSION >= 11020) @@ -323,18 +222,13 @@ int cuda_drop_async(void *ptr, cuda_stream_t *stream) { #else check_cuda_error(cudaFree(ptr)); #endif - return 0; } /// Get the maximum size for the shared memory int cuda_get_max_shared_memory(uint32_t gpu_index) { - if (gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; - } - cudaSetDevice(gpu_index); + check_cuda_error(cudaSetDevice(gpu_index)); cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, gpu_index); + check_cuda_error(cudaGetDeviceProperties(&prop, gpu_index)); int max_shared_memory = 0; if (prop.major >= 6) { max_shared_memory = prop.sharedMemPerMultiprocessor; @@ -344,7 +238,6 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) { return max_shared_memory; } -int cuda_synchronize_stream(cuda_stream_t *stream) { +void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); - return 0; } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index 3434c58c47..323a36cdf2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ -202,8 +202,10 @@ __host__ void host_compare_with_zero_equality( // The result will be in the two first block. Everything else is // garbage. 
- cuda_memset_async(lwe_array_out + big_lwe_size, 0, - big_lwe_size_bytes * (num_radix_blocks - 1), stream); + if (num_radix_blocks > 1) { + cuda_memset_async(lwe_array_out + big_lwe_size, 0, + big_lwe_size_bytes * (num_radix_blocks - 1), stream); + } } template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 0bf4994798..fcc7a82f7a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -363,7 +363,6 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream, auto params = mem->params; auto glwe_dimension = params.glwe_dimension; auto polynomial_size = params.polynomial_size; - auto message_modulus = params.message_modulus; auto big_lwe_size = glwe_dimension * polynomial_size + 1; auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 4a9c59c8f8..e6b9f18236 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -109,8 +109,10 @@ void extract_message_carry_to_full_radix(cuda_stream_t *stream, Torus *src, if (!is_message) { int zero_block_count = num_blocks - number_of_unit; - cuda_memset_async(cur_dst_radix, 0, - zero_block_count * unit_size * sizeof(Torus), stream); + if (zero_block_count != 0) { + cuda_memset_async(cur_dst_radix, 0, + zero_block_count * unit_size * sizeof(Torus), stream); + } s_index = zero_block_count; } @@ -277,11 +279,6 @@ __host__ void host_integer_mult_radix_kb( // lwe_dimension +1 coefficients auto small_lwe_vector = mem_ptr->small_lwe_vector; - // buffer to keep pbs result for num_blocks^2 lwe_ciphertext - // in total it has num_blocks^2 big lwe ciphertexts with - // glwe_dimension * polynomial_size + 1 coefficients - auto lwe_pbs_out_array = mem_ptr->lwe_pbs_out_array; - // it contains two lut, first for lsb extraction, // second for msb extraction, with total length = // 2 * (glwe_dimension + 1) * polynomial_size @@ -362,9 +359,13 @@ __host__ void host_integer_mult_radix_kb( ch_amount = r / chunk_size; dim3 add_grid(ch_amount, num_blocks, 1); size_t sm_size = big_lwe_size * sizeof(Torus); - cuda_memset_async(new_blocks, 0, - ch_amount * num_blocks * big_lwe_size * sizeof(Torus), - stream); + if (ch_amount != 0) { + // cuda_memset with size 0 is invalid, so avoid it + cuda_memset_async( + new_blocks, 0, + ch_amount * num_blocks * big_lwe_size * sizeof(Torus), + stream); + } tree_add_chunks<<stream>>>( new_blocks, old_blocks, chunk_size, num_blocks); @@ -448,7 +449,7 @@ __host__ void host_integer_mult_radix_kb( dim3 add_grid(1, num_blocks, 1); size_t sm_size = big_lwe_size * sizeof(Torus); - cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus), + cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus), stream); tree_add_chunks<<stream>>>( radix_lwe_out, old_blocks, r, num_blocks); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh index 96519d29df..5ad735be1f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh @@ -19,7 +19,6 @@ __host__ void host_integer_radix_scalar_bitop_kb( if (num_clear_blocks == 0) { if (op == SCALAR_BITAND) { - 
auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size; cuda_memset_async(lwe_array_out, 0, num_radix_blocks * lwe_size * sizeof(Torus), stream); } else { @@ -28,7 +27,6 @@ __host__ void host_integer_radix_scalar_bitop_kb( stream); } } else { - auto lut_buffer = lut->lut; // We have all possible LUTs pre-computed and we use the decomposed scalar // as index to recover the right one cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks, @@ -38,7 +36,7 @@ __host__ void host_integer_radix_scalar_bitop_kb( stream, lwe_array_out, lwe_array_input, bsk, ksk, num_clear_blocks, lut); - if (op == SCALAR_BITAND) { + if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) { auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size; cuda_memset_async(lwe_array_out_block, 0, (num_radix_blocks - num_clear_blocks) * lwe_size * diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index 3cf22f0ce9..5d4be87597 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -66,9 +66,13 @@ __host__ void host_integer_radix_scalar_difference_check_kb( // The result will be in the two first block. Everything else is // garbage. - cuda_memset_async(lwe_array_out + big_lwe_size, 0, - big_lwe_size_bytes * (total_num_radix_blocks - 1), - stream); + if (total_num_radix_blocks > 1) { + // cuda_memset with size 0 is invalid, so avoid it + cuda_memset_async( + lwe_array_out + big_lwe_size, 0, + big_lwe_size_bytes * (total_num_radix_blocks - 1), + stream); + } } else if (total_num_scalar_blocks < total_num_radix_blocks) { // We have to handle both part of the work described above @@ -156,9 +160,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb( 1, lut); // The result will be in the first block. Everything else is garbage. - cuda_memset_async(lwe_array_out + big_lwe_size, 0, - (total_num_radix_blocks - 1) * big_lwe_size_bytes, - stream); + if (total_num_radix_blocks > 1) { + // cuda_memset with size 0 is invalid, so avoid it + cuda_memset_async(lwe_array_out + big_lwe_size, 0, + (total_num_radix_blocks - 1) * big_lwe_size_bytes, + stream); + } } else { // We only have to do the regular comparison // And not the part where we compare most significant blocks with zeros @@ -196,9 +203,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb( ksk, num_lsb_radix_blocks); // The result will be in the first block. Everything else is garbage. 
- cuda_memset_async(lwe_array_out + big_lwe_size, 0, - (total_num_radix_blocks - 1) * big_lwe_size_bytes, - stream); + if (total_num_radix_blocks > 1) { + // cuda_memset with size 0 is invalid, so avoid it + cuda_memset_async(lwe_array_out + big_lwe_size, 0, + (total_num_radix_blocks - 1) * big_lwe_size_bytes, + stream); + } } } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh index 098aa901a5..e271310e95 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh @@ -59,8 +59,11 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace( rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size); // create trivial assign for value = 0 - cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes, - stream); + if (rotations > 0) { + // cuda_memset with size 0 is invalid, so avoid it + cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes, + stream); + } cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer, num_blocks * big_lwe_size_bytes, stream); @@ -93,8 +96,12 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace( // rotate left as the blocks are from LSB to MSB // create trivial assign for value = 0 - cuda_memset_async(rotated_buffer + (num_blocks - rotations) * big_lwe_size, - 0, rotations * big_lwe_size_bytes, stream); + if (rotations > 0) { + // cuda_memset with size 0 is invalid, so avoid it + cuda_memset_async( + rotated_buffer + (num_blocks - rotations) * big_lwe_size, + 0, rotations * big_lwe_size_bytes, stream); + } cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer, num_blocks * big_lwe_size_bytes, stream); diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 40d0404bcc..9a10f38396 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -6,8 +6,8 @@ extern "C" { /// Create a new Cuda stream on GPU `gpu_index` pub fn cuda_create_stream(gpu_index: u32) -> *mut c_void; - /// Destroy the Cuda stream `v_stream` on GPU `gpu_index` - pub fn cuda_destroy_stream(v_stream: *mut c_void) -> i32; + /// Destroy the Cuda stream `v_stream` + pub fn cuda_destroy_stream(v_stream: *mut c_void); /// Allocate `size` memory on GPU `gpu_index` asynchronously pub fn cuda_malloc_async(size: u64, v_stream: *const c_void) -> *mut c_void; @@ -19,7 +19,7 @@ extern "C" { src: *const c_void, size: u64, v_stream: *const c_void, - ) -> i32; + ); /// Copy `size` memory asynchronously from `src` on CPU to `dest` on GPU `gpu_index` using /// the Cuda stream `v_stream`. @@ -28,7 +28,7 @@ extern "C" { src: *const c_void, size: u64, v_stream: *const c_void, - ) -> i32; + ); /// Copy `size` memory asynchronously from `src` to `dest` on the same GPU `gpu_index` using /// the Cuda stream `v_stream`. @@ -37,31 +37,26 @@ extern "C" { src: *const c_void, size: u64, v_stream: *const c_void, - ) -> i32; + ); /// Copy `size` memory asynchronously from `src` on CPU to `dest` on GPU `gpu_index` using /// the Cuda stream `v_stream`. 
- pub fn cuda_memset_async( - dest: *mut c_void, - value: u64, - size: u64, - v_stream: *const c_void, - ) -> i32; + pub fn cuda_memset_async(dest: *mut c_void, value: u64, size: u64, v_stream: *const c_void); /// Get the total number of Nvidia GPUs detected on the platform pub fn cuda_get_number_of_gpus() -> i32; /// Synchronize all streams on GPU `gpu_index` - pub fn cuda_synchronize_device(gpu_index: u32) -> i32; + pub fn cuda_synchronize_device(gpu_index: u32); /// Synchronize Cuda stream - pub fn cuda_synchronize_stream(v_stream: *const c_void) -> i32; + pub fn cuda_synchronize_stream(v_stream: *const c_void); /// Free memory for pointer `ptr` on GPU `gpu_index` asynchronously, using stream `v_stream` - pub fn cuda_drop_async(ptr: *mut c_void, v_stream: *const c_void) -> i32; + pub fn cuda_drop_async(ptr: *mut c_void, v_stream: *const c_void); /// Free memory for pointer `ptr` on GPU `gpu_index` synchronously - pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32) -> i32; + pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32); /// Get the maximum amount of shared memory on GPU `gpu_index` pub fn cuda_get_max_shared_memory(gpu_index: u32) -> i32; diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 9cec947a59..5502c323f3 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -90,12 +90,15 @@ impl CudaStream { T: Numeric + Into, { let dest_size = dest.len() * std::mem::size_of::(); - cuda_memset_async( - dest.as_mut_c_ptr(), - value.into(), - dest_size as u64, - self.as_c_ptr(), - ); + // We have to check that dest is not empty, because cuda_memset with size 0 is invalid + if dest_size > 0 { + cuda_memset_async( + dest.as_mut_c_ptr(), + value.into(), + dest_size as u64, + self.as_c_ptr(), + ); + } } /// Copies data from slice into GPU pointer @@ -112,12 +115,16 @@ impl CudaStream { let src_size = std::mem::size_of_val(src); assert!(dest.len() * std::mem::size_of::() >= src_size); - cuda_memcpy_async_to_gpu( - dest.as_mut_c_ptr(), - src.as_ptr().cast(), - src_size as u64, - self.as_c_ptr(), - ); + // We have to check that src is not empty, because Rust slice with size 0 results in an + // invalid pointer being passed to copy_to_gpu_async + if src_size > 0 { + cuda_memcpy_async_to_gpu( + dest.as_mut_c_ptr(), + src.as_ptr().cast(), + src_size as u64, + self.as_c_ptr(), + ); + } } /// Copies data between different arrays in the GPU @@ -133,13 +140,16 @@ impl CudaStream { T: Numeric, { assert!(dest.len() >= src.len()); - let size = dest.len() * std::mem::size_of::(); - cuda_memcpy_async_gpu_to_gpu( - dest.as_mut_c_ptr(), - src.as_c_ptr(), - size as u64, - self.as_c_ptr(), - ); + let size = src.len() * std::mem::size_of::(); + // We check that src is not empty to avoid invalid pointers + if size > 0 { + cuda_memcpy_async_gpu_to_gpu( + dest.as_mut_c_ptr(), + src.as_c_ptr(), + size as u64, + self.as_c_ptr(), + ); + } } /// Copies data from GPU pointer into slice @@ -153,15 +163,19 @@ impl CudaStream { where T: Numeric, { - let dest_size = std::mem::size_of_val(dest); - assert!(dest_size >= src.len() * std::mem::size_of::()); - - cuda_memcpy_async_to_cpu( - dest.as_mut_ptr().cast(), - src.as_c_ptr(), - dest_size as u64, - self.as_c_ptr(), - ); + let src_size = src.len() * std::mem::size_of::(); + assert!(std::mem::size_of_val(dest) >= src_size); + + // We have to check that src is not empty, because Rust slice with size 0 results in an + // invalid pointer being passed to copy_to_cpu_async + if src_size > 0 { + cuda_memcpy_async_to_cpu( + 
dest.as_mut_ptr().cast(), + src.as_c_ptr(), + src_size as u64, + self.as_c_ptr(), + ); + } } /// Discarding bootstrap on a vector of LWE ciphertexts
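
Note on the error-handling convention this patch introduces (illustrative, not part of the diff): with the integer error codes removed, callers no longer branch on return values. Every CUDA runtime call in device.cu is wrapped in check_cuda_error, which prints cudaGetErrorString plus file/line and aborts, while PANIC is reserved for logical errors such as invalid device pointers or zero-sized memsets. A minimal caller sketch, assuming the device.h from this patch is on the include path; the helper name and buffer size below are hypothetical:

    #include <cstdint>
    #include "device.h"

    // Hypothetical helper: allocate a device buffer, clear it, copy it back to the
    // host, then release everything. No return codes are checked: the backend
    // aborts on any CUDA error or invalid argument.
    void roundtrip_example(uint32_t gpu_index) {
      cuda_stream_t *stream = cuda_create_stream(gpu_index);

      const uint64_t size = 1024 * sizeof(uint64_t);
      cuda_check_valid_malloc(size, gpu_index);       // panics if the device lacks memory
      uint64_t *d_buf = (uint64_t *)cuda_malloc_async(size, stream);

      cuda_memset_async(d_buf, 0, size, stream);      // panics on zero size or host pointer

      uint64_t h_buf[1024];
      cuda_memcpy_async_to_cpu(h_buf, d_buf, size, stream);
      cuda_synchronize_stream(stream);

      cuda_drop_async(d_buf, stream);
      cuda_destroy_stream(stream);
    }

On the Rust side the same convention holds: the bindings in cuda_bind.rs now return (), and the safe wrappers in core_crypto/gpu guard against empty slices before calling into the backend, since a zero-sized memset or an invalid pointer would otherwise abort the process.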