From 71bff0963cb6a00500736a065643cbe6e4abb548 Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Thu, 25 Jan 2024 11:10:46 +0100 Subject: [PATCH] chore(gpu): check for all cuda errors and abort in device.cu/.h Remove some legacy compilation warnings --- .../tfhe-cuda-backend/cuda/include/device.h | 63 ++--- .../tfhe-cuda-backend/cuda/include/integer.h | 19 -- backends/tfhe-cuda-backend/cuda/src/device.cu | 233 +++++------------- .../cuda/src/integer/comparison.cuh | 6 +- .../cuda/src/integer/integer.cuh | 1 - .../cuda/src/integer/multiplication.cuh | 23 +- .../cuda/src/integer/scalar_bitops.cuh | 4 +- .../cuda/src/integer/scalar_comparison.cuh | 28 ++- .../cuda/src/integer/scalar_shifts.cuh | 15 +- backends/tfhe-cuda-backend/src/cuda_bind.rs | 25 +- tfhe/src/core_crypto/gpu/mod.rs | 70 +++--- 11 files changed, 197 insertions(+), 290 deletions(-) diff --git a/backends/tfhe-cuda-backend/cuda/include/device.h b/backends/tfhe-cuda-backend/cuda/include/device.h index bfe8c64f49..0dc28684b5 100644 --- a/backends/tfhe-cuda-backend/cuda/include/device.h +++ b/backends/tfhe-cuda-backend/cuda/include/device.h @@ -11,6 +11,22 @@ extern "C" { +#define check_cuda_error(ans) \ + { cuda_error((ans), __FILE__, __LINE__); } +inline void cuda_error(cudaError_t code, const char *file, int line) { + if (code != cudaSuccess) { + std::fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), + file, line); + std::abort(); + } +} +#define PANIC(format, ...) \ + { \ + std::fprintf(stderr, "%s::%d::%s: panic.\n" format "\n", __FILE__, \ + __LINE__, __func__, ##__VA_ARGS__); \ + std::abort(); \ + } + struct cuda_stream_t { cudaStream_t stream; uint32_t gpu_index; @@ -18,68 +34,59 @@ struct cuda_stream_t { cuda_stream_t(uint32_t gpu_index) { this->gpu_index = gpu_index; - cudaStreamCreate(&stream); + check_cuda_error(cudaStreamCreate(&stream)); } void release() { - cudaSetDevice(gpu_index); - cudaStreamDestroy(stream); + check_cuda_error(cudaSetDevice(gpu_index)); + check_cuda_error(cudaStreamDestroy(stream)); } - void synchronize() { cudaStreamSynchronize(stream); } + void synchronize() { + check_cuda_error(cudaStreamSynchronize(stream)); + } }; cuda_stream_t *cuda_create_stream(uint32_t gpu_index); -int cuda_destroy_stream(cuda_stream_t *stream); +void cuda_destroy_stream(cuda_stream_t *stream); void *cuda_malloc(uint64_t size, uint32_t gpu_index); void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream); -int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index); +void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index); -int cuda_check_support_cooperative_groups(); +bool cuda_check_support_cooperative_groups(); -int cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size); +void cuda_memcpy_to_cpu(void *dest, const void *src, uint64_t size); -int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, +void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, cuda_stream_t *stream); -int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size, +void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size, cuda_stream_t *stream); -int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size); +void cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size); -int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, +void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, cuda_stream_t *stream); -int cuda_memset_async(void *dest, uint64_t val, uint64_t size, +void cuda_memset_async(void *dest, uint64_t val, uint64_t 
size, cuda_stream_t *stream); int cuda_get_number_of_gpus(); -int cuda_synchronize_device(uint32_t gpu_index); +void cuda_synchronize_device(uint32_t gpu_index); -int cuda_drop(void *ptr, uint32_t gpu_index); +void cuda_drop(void *ptr, uint32_t gpu_index); -int cuda_drop_async(void *ptr, cuda_stream_t *stream); +void cuda_drop_async(void *ptr, cuda_stream_t *stream); int cuda_get_max_shared_memory(uint32_t gpu_index); -int cuda_synchronize_stream(cuda_stream_t *stream); +void cuda_synchronize_stream(cuda_stream_t *stream); -#define check_cuda_error(ans) \ - { cuda_error((ans), __FILE__, __LINE__); } -inline void cuda_error(cudaError_t code, const char *file, int line, - bool abort = true) { - if (code != cudaSuccess) { - fprintf(stderr, "Cuda error: %s %s %d\n", cudaGetErrorString(code), file, - line); - if (abort) - exit(code); - } -} } template diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index 7f7b9eb2c8..b822deaf6e 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -361,10 +361,6 @@ template struct int_radix_lut { this->params = params; this->num_blocks = num_radix_blocks; Torus lut_indexes_size = num_radix_blocks * sizeof(Torus); - Torus big_size = - (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); - Torus small_size = - (params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); Torus lut_buffer_size = (params.glwe_dimension + 1) * params.polynomial_size * sizeof(Torus); @@ -537,7 +533,6 @@ template struct int_mul_memory { Torus *vector_result_sb; Torus *block_mul_res; Torus *small_lwe_vector; - Torus *lwe_pbs_out_array; int_radix_lut *luts_array; // lsb msb int_radix_lut *luts_message; int_radix_lut *luts_carry; @@ -577,10 +572,6 @@ template struct int_mul_memory { stream); small_lwe_vector = (Torus *)cuda_malloc_async( total_block_count * (lwe_dimension + 1) * sizeof(Torus), stream); - lwe_pbs_out_array = - (Torus *)cuda_malloc_async((glwe_dimension * polynomial_size + 1) * - total_block_count * sizeof(Torus), - stream); // create int_radix_lut objects for lsb, msb, message, carry // luts_array -> lut = {lsb_acc, msb_acc} @@ -637,7 +628,6 @@ template struct int_mul_memory { cuda_drop_async(vector_result_sb, stream); cuda_drop_async(block_mul_res, stream); cuda_drop_async(small_lwe_vector, stream); - cuda_drop_async(lwe_pbs_out_array, stream); luts_array->release(stream); luts_message->release(stream); @@ -834,8 +824,6 @@ template struct int_cmux_buffer { if (allocate_gpu_memory) { Torus big_size = (params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); - Torus small_size = - (params.small_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus); tmp_true_ct = (Torus *)cuda_malloc_async(big_size, stream); tmp_false_ct = (Torus *)cuda_malloc_async(big_size, stream); @@ -1048,13 +1036,6 @@ template struct int_tree_sign_reduction_buffer { return msb; }; - auto last_leaf_noop_lut_f = [this](Torus x) -> Torus { - int msb = (x >> 2) & 3; - int lsb = x & 3; - - return this->block_selector_f(msb, lsb); - }; - if (allocate_gpu_memory) { tmp_x = (Torus *)cuda_malloc_async((params.big_lwe_dimension + 1) * num_radix_blocks * sizeof(Torus), diff --git a/backends/tfhe-cuda-backend/cuda/src/device.cu b/backends/tfhe-cuda-backend/cuda/src/device.cu index 7b811c540a..d9ec4c81c9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/device.cu +++ b/backends/tfhe-cuda-backend/cuda/src/device.cu @@ -4,25 +4,23 @@ /// Unsafe function to 
create a CUDA stream, must check first that GPU exists cuda_stream_t *cuda_create_stream(uint32_t gpu_index) { - cudaSetDevice(gpu_index); + check_cuda_error(cudaSetDevice(gpu_index)); cuda_stream_t *stream = new cuda_stream_t(gpu_index); return stream; } /// Unsafe function to destroy CUDA stream, must check first the GPU exists -int cuda_destroy_stream(cuda_stream_t *stream) { +void cuda_destroy_stream(cuda_stream_t *stream) { stream->release(); - return 0; } /// Unsafe function that will try to allocate even if gpu_index is invalid /// or if there's not enough memory. A safe wrapper around it must call /// cuda_check_valid_malloc() first void *cuda_malloc(uint64_t size, uint32_t gpu_index) { - cudaSetDevice(gpu_index); + check_cuda_error(cudaSetDevice(gpu_index)); void *ptr; - cudaMalloc((void **)&ptr, size); - check_cuda_error(cudaGetLastError()); + check_cuda_error(cudaMalloc((void **)&ptr, size)); return ptr; } @@ -30,7 +28,7 @@ void *cuda_malloc(uint64_t size, uint32_t gpu_index) { /// Allocates a size-byte array at the device memory. Tries to do it /// asynchronously. void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) { - cudaSetDevice(stream->gpu_index); + check_cuda_error(cudaSetDevice(stream->gpu_index)); void *ptr; #ifndef CUDART_VERSION @@ -52,184 +50,104 @@ void *cuda_malloc_async(uint64_t size, cuda_stream_t *stream) { return ptr; } -/// Checks that allocation is valid -/// 0: valid -/// -1: invalid, not enough memory in device -/// -2: invalid, gpu index doesn't exist -int cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) { - - if (gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; - } - cudaSetDevice(gpu_index); +/// Check that allocation is valid +void cuda_check_valid_malloc(uint64_t size, uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); size_t total_mem, free_mem; - cudaMemGetInfo(&free_mem, &total_mem); + check_cuda_error(cudaMemGetInfo(&free_mem, &total_mem)); if (size > free_mem) { - // error code: not enough memory - return -1; + PANIC("Cuda error: not enough memory on device. " + "Available: %zu vs Requested: %lu", free_mem, size); } - return 0; } /// Returns -/// -> 0 if Cooperative Groups is not supported. -/// -> 1 otherwise -int cuda_check_support_cooperative_groups() { +/// false if Cooperative Groups is not supported. 
+/// true otherwise +bool cuda_check_support_cooperative_groups() { int cooperative_groups_supported = 0; - cudaDeviceGetAttribute(&cooperative_groups_supported, - cudaDevAttrCooperativeLaunch, 0); + check_cuda_error(cudaDeviceGetAttribute(&cooperative_groups_supported, + cudaDevAttrCooperativeLaunch, 0)); return cooperative_groups_supported > 0; } -/// Tries to copy memory to the GPU asynchronously -/// 0: success -/// -1: error, invalid device pointer -/// -2: error, gpu index doesn't exist -/// -3: error, zero copy size -int cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, +/// Copy memory to the GPU asynchronously +void cuda_memcpy_async_to_gpu(void *dest, void *src, uint64_t size, cuda_stream_t *stream) { - if (size == 0) { - // error code: zero copy size - return -3; - } - - if (stream->gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; - } cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, dest); + check_cuda_error(cudaPointerGetAttributes(&attr, dest)); if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid device pointer in async copy to GPU."); } - cudaSetDevice(stream->gpu_index); + check_cuda_error(cudaSetDevice(stream->gpu_index)); check_cuda_error( cudaMemcpyAsync(dest, src, size, cudaMemcpyHostToDevice, stream->stream)); - return 0; } -/// Tries to copy memory to the GPU synchronously -/// 0: success -/// -1: error, invalid device pointer -/// -2: error, gpu index doesn't exist -/// -3: error, zero copy size -int cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) { - if (size == 0) { - // error code: zero copy size - return -3; - } - +/// Copy memory to the GPU synchronously +void cuda_memcpy_to_gpu(void *dest, void *src, uint64_t size) { cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, dest); + check_cuda_error(cudaPointerGetAttributes(&attr, dest)); if (attr.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid device pointer in copy to GPU."); } - check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyHostToDevice)); - return 0; } -/// Tries to copy memory to the CPU synchronously -/// 0: success -/// -1: error, invalid device pointer -/// -2: error, gpu index doesn't exist -/// -3: error, zero copy size -int cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) { - if (size == 0) { - // error code: zero copy size - return -3; - } - +/// Copy memory to the CPU synchronously +void cuda_memcpy_to_cpu(void *dest, void *src, uint64_t size) { cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, src); + check_cuda_error(cudaPointerGetAttributes(&attr, src)); if (attr.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid device pointer in copy to CPU."); } - check_cuda_error(cudaMemcpy(dest, src, size, cudaMemcpyDeviceToHost)); - return 0; } -/// Tries to copy memory within a GPU asynchronously -/// 0: success -/// -1: error, invalid device pointer -/// -2: error, gpu index doesn't exist -/// -3: error, zero copy size -int cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size, +/// Copy memory within a GPU asynchronously +void cuda_memcpy_async_gpu_to_gpu(void *dest, void *src, uint64_t size, cuda_stream_t *stream) { - if (size == 0) { - // error code: zero copy size - return -3; - } - - if (stream->gpu_index >= cuda_get_number_of_gpus()) { - // error 
code: invalid gpu_index - return -2; - } cudaPointerAttributes attr_dest; - cudaPointerGetAttributes(&attr_dest, dest); + check_cuda_error(cudaPointerGetAttributes(&attr_dest, dest)); if (attr_dest.device != stream->gpu_index && attr_dest.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid dest device pointer in copy from GPU to GPU."); } cudaPointerAttributes attr_src; - cudaPointerGetAttributes(&attr_src, src); + check_cuda_error(cudaPointerGetAttributes(&attr_src, src)); if (attr_src.device != stream->gpu_index && attr_src.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid src device pointer in copy from GPU to GPU."); } if (attr_src.device != attr_dest.device) { - // error code: different devices - return -1; + PANIC("Cuda error: different devices specified in copy from GPU to GPU."); } - cudaSetDevice(stream->gpu_index); + check_cuda_error(cudaSetDevice(stream->gpu_index)); check_cuda_error(cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToDevice, stream->stream)); - return 0; } /// Synchronizes device -/// 0: success -/// -2: error, gpu index doesn't exist -int cuda_synchronize_device(uint32_t gpu_index) { - if (gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; - } - cudaSetDevice(gpu_index); - cudaDeviceSynchronize(); - return 0; +void cuda_synchronize_device(uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); + check_cuda_error(cudaDeviceSynchronize()); } -int cuda_memset_async(void *dest, uint64_t val, uint64_t size, +void cuda_memset_async(void *dest, uint64_t val, uint64_t size, cuda_stream_t *stream) { + cudaPointerAttributes attr; + check_cuda_error(cudaPointerGetAttributes(&attr, dest)); if (size == 0) { - // error code: zero copy size - return -3; - } - - if (stream->gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; + PANIC("Cuda error: invalid zero size in cuda_memset_async."); } - cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, dest); if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid dest device pointer in cuda memset."); } - cudaSetDevice(stream->gpu_index); + check_cuda_error(cudaSetDevice(stream->gpu_index)); check_cuda_error(cudaMemsetAsync(dest, val, size, stream->stream)); - return 0; } template @@ -246,8 +164,9 @@ void cuda_set_value_async(cudaStream_t *stream, Torus *d_array, Torus value, int num_blocks = (n + block_size - 1) / block_size; // Launch the kernel - cuda_set_value_kernel<<>>(d_array, value, - n); + cuda_set_value_kernel<<>> + (d_array, value, n); + check_cuda_error(cudaGetLastError()); } /// Explicitly instantiate cuda_set_value_async for 32 and 64 bits @@ -256,57 +175,37 @@ template void cuda_set_value_async(cudaStream_t *stream, uint64_t *d_array, template void cuda_set_value_async(cudaStream_t *stream, uint32_t *d_array, uint32_t value, uint32_t n); -/// Tries to copy memory to the GPU asynchronously -/// 0: success -/// -1: error, invalid device pointer -/// -2: error, gpu index doesn't exist -/// -3: error, zero copy size -int cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, +/// Copy memory to the CPU asynchronously +void cuda_memcpy_async_to_cpu(void *dest, const void *src, uint64_t size, cuda_stream_t *stream) { - if (size == 0) { - // error code: zero copy size - return -3; - } - - if 
(stream->gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; - } cudaPointerAttributes attr; - cudaPointerGetAttributes(&attr, src); + check_cuda_error(cudaPointerGetAttributes(&attr, src)); if (attr.device != stream->gpu_index && attr.type != cudaMemoryTypeDevice) { - // error code: invalid device pointer - return -1; + PANIC("Cuda error: invalid src device pointer in copy to CPU async."); } - cudaSetDevice(stream->gpu_index); + check_cuda_error(cudaSetDevice(stream->gpu_index)); check_cuda_error( cudaMemcpyAsync(dest, src, size, cudaMemcpyDeviceToHost, stream->stream)); - return 0; } /// Return number of GPUs available int cuda_get_number_of_gpus() { int num_gpus; - cudaGetDeviceCount(&num_gpus); + check_cuda_error(cudaGetDeviceCount(&num_gpus)); return num_gpus; } /// Drop a cuda array -int cuda_drop(void *ptr, uint32_t gpu_index) { - if (gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; - } - cudaSetDevice(gpu_index); +void cuda_drop(void *ptr, uint32_t gpu_index) { + check_cuda_error(cudaSetDevice(gpu_index)); check_cuda_error(cudaFree(ptr)); - return 0; } -/// Drop a cuda array. Tries to do it asynchronously -int cuda_drop_async(void *ptr, cuda_stream_t *stream) { +/// Drop a cuda array asynchronously, if supported on the device +void cuda_drop_async(void *ptr, cuda_stream_t *stream) { - cudaSetDevice(stream->gpu_index); + check_cuda_error(cudaSetDevice(stream->gpu_index)); #ifndef CUDART_VERSION #error CUDART_VERSION Undefined! #elif (CUDART_VERSION >= 11020) @@ -323,18 +222,13 @@ int cuda_drop_async(void *ptr, cuda_stream_t *stream) { #else check_cuda_error(cudaFree(ptr)); #endif - return 0; } /// Get the maximum size for the shared memory int cuda_get_max_shared_memory(uint32_t gpu_index) { - if (gpu_index >= cuda_get_number_of_gpus()) { - // error code: invalid gpu_index - return -2; - } - cudaSetDevice(gpu_index); + check_cuda_error(cudaSetDevice(gpu_index)); cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, gpu_index); + check_cuda_error(cudaGetDeviceProperties(&prop, gpu_index)); int max_shared_memory = 0; if (prop.major >= 6) { max_shared_memory = prop.sharedMemPerMultiprocessor; @@ -344,7 +238,6 @@ int cuda_get_max_shared_memory(uint32_t gpu_index) { return max_shared_memory; } -int cuda_synchronize_stream(cuda_stream_t *stream) { +void cuda_synchronize_stream(cuda_stream_t *stream) { stream->synchronize(); - return 0; } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh index 3434c58c47..323a36cdf2 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/comparison.cuh @@ -202,8 +202,10 @@ __host__ void host_compare_with_zero_equality( // The result will be in the two first block. Everything else is // garbage. 
- cuda_memset_async(lwe_array_out + big_lwe_size, 0, - big_lwe_size_bytes * (num_radix_blocks - 1), stream); + if (num_radix_blocks > 1) { + cuda_memset_async(lwe_array_out + big_lwe_size, 0, + big_lwe_size_bytes * (num_radix_blocks - 1), stream); + } } template diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 0bf4994798..fcc7a82f7a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -363,7 +363,6 @@ void host_propagate_single_carry_low_latency(cuda_stream_t *stream, auto params = mem->params; auto glwe_dimension = params.glwe_dimension; auto polynomial_size = params.polynomial_size; - auto message_modulus = params.message_modulus; auto big_lwe_size = glwe_dimension * polynomial_size + 1; auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 4a9c59c8f8..e6b9f18236 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -109,8 +109,10 @@ void extract_message_carry_to_full_radix(cuda_stream_t *stream, Torus *src, if (!is_message) { int zero_block_count = num_blocks - number_of_unit; - cuda_memset_async(cur_dst_radix, 0, - zero_block_count * unit_size * sizeof(Torus), stream); + if (zero_block_count != 0) { + cuda_memset_async(cur_dst_radix, 0, + zero_block_count * unit_size * sizeof(Torus), stream); + } s_index = zero_block_count; } @@ -277,11 +279,6 @@ __host__ void host_integer_mult_radix_kb( // lwe_dimension +1 coefficients auto small_lwe_vector = mem_ptr->small_lwe_vector; - // buffer to keep pbs result for num_blocks^2 lwe_ciphertext - // in total it has num_blocks^2 big lwe ciphertexts with - // glwe_dimension * polynomial_size + 1 coefficients - auto lwe_pbs_out_array = mem_ptr->lwe_pbs_out_array; - // it contains two lut, first for lsb extraction, // second for msb extraction, with total length = // 2 * (glwe_dimension + 1) * polynomial_size @@ -362,9 +359,13 @@ __host__ void host_integer_mult_radix_kb( ch_amount = r / chunk_size; dim3 add_grid(ch_amount, num_blocks, 1); size_t sm_size = big_lwe_size * sizeof(Torus); - cuda_memset_async(new_blocks, 0, - ch_amount * num_blocks * big_lwe_size * sizeof(Torus), - stream); + if (ch_amount != 0) { + // cuda_memset with size 0 is invalid, so avoid it + cuda_memset_async( + new_blocks, 0, + ch_amount * num_blocks * big_lwe_size * sizeof(Torus), + stream); + } tree_add_chunks<<stream>>>( new_blocks, old_blocks, chunk_size, num_blocks); @@ -448,7 +449,7 @@ __host__ void host_integer_mult_radix_kb( dim3 add_grid(1, num_blocks, 1); size_t sm_size = big_lwe_size * sizeof(Torus); - cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus), + cuda_memset_async(radix_lwe_out, 0, num_blocks * big_lwe_size * sizeof(Torus), stream); tree_add_chunks<<stream>>>( radix_lwe_out, old_blocks, r, num_blocks); diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh index 96519d29df..5ad735be1f 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_bitops.cuh @@ -19,7 +19,6 @@ __host__ void host_integer_radix_scalar_bitop_kb( if (num_clear_blocks == 0) { if (op == SCALAR_BITAND) { - 
auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size; cuda_memset_async(lwe_array_out, 0, num_radix_blocks * lwe_size * sizeof(Torus), stream); } else { @@ -28,7 +27,6 @@ __host__ void host_integer_radix_scalar_bitop_kb( stream); } } else { - auto lut_buffer = lut->lut; // We have all possible LUTs pre-computed and we use the decomposed scalar // as index to recover the right one cuda_memcpy_async_gpu_to_gpu(lut->lut_indexes, clear_blocks, @@ -38,7 +36,7 @@ __host__ void host_integer_radix_scalar_bitop_kb( stream, lwe_array_out, lwe_array_input, bsk, ksk, num_clear_blocks, lut); - if (op == SCALAR_BITAND) { + if (op == SCALAR_BITAND && num_clear_blocks < num_radix_blocks) { auto lwe_array_out_block = lwe_array_out + num_clear_blocks * lwe_size; cuda_memset_async(lwe_array_out_block, 0, (num_radix_blocks - num_clear_blocks) * lwe_size * diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh index 3cf22f0ce9..5d4be87597 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_comparison.cuh @@ -66,9 +66,13 @@ __host__ void host_integer_radix_scalar_difference_check_kb( // The result will be in the two first block. Everything else is // garbage. - cuda_memset_async(lwe_array_out + big_lwe_size, 0, - big_lwe_size_bytes * (total_num_radix_blocks - 1), - stream); + if (total_num_radix_blocks > 1) { + // cuda_memset with size 0 is invalid, so avoid it + cuda_memset_async( + lwe_array_out + big_lwe_size, 0, + big_lwe_size_bytes * (total_num_radix_blocks - 1), + stream); + } } else if (total_num_scalar_blocks < total_num_radix_blocks) { // We have to handle both part of the work described above @@ -156,9 +160,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb( 1, lut); // The result will be in the first block. Everything else is garbage. - cuda_memset_async(lwe_array_out + big_lwe_size, 0, - (total_num_radix_blocks - 1) * big_lwe_size_bytes, - stream); + if (total_num_radix_blocks > 1) { + // cuda_memset with size 0 is invalid, so avoid it + cuda_memset_async(lwe_array_out + big_lwe_size, 0, + (total_num_radix_blocks - 1) * big_lwe_size_bytes, + stream); + } } else { // We only have to do the regular comparison // And not the part where we compare most significant blocks with zeros @@ -196,9 +203,12 @@ __host__ void host_integer_radix_scalar_difference_check_kb( ksk, num_lsb_radix_blocks); // The result will be in the first block. Everything else is garbage. 
- cuda_memset_async(lwe_array_out + big_lwe_size, 0, - (total_num_radix_blocks - 1) * big_lwe_size_bytes, - stream); + if (total_num_radix_blocks > 1) { + // cuda_memset with size 0 is invalid, so avoid it + cuda_memset_async(lwe_array_out + big_lwe_size, 0, + (total_num_radix_blocks - 1) * big_lwe_size_bytes, + stream); + } } } diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh index 098aa901a5..e271310e95 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/scalar_shifts.cuh @@ -59,8 +59,11 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace( rotated_buffer, lwe_array, rotations, num_blocks, big_lwe_size); // create trivial assign for value = 0 - cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes, - stream); + if (rotations > 0) { + // cuda_memset with size 0 is invalid, so avoid it + cuda_memset_async(rotated_buffer, 0, rotations * big_lwe_size_bytes, + stream); + } cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer, num_blocks * big_lwe_size_bytes, stream); @@ -93,8 +96,12 @@ __host__ void host_integer_radix_scalar_shift_kb_inplace( // rotate left as the blocks are from LSB to MSB // create trivial assign for value = 0 - cuda_memset_async(rotated_buffer + (num_blocks - rotations) * big_lwe_size, - 0, rotations * big_lwe_size_bytes, stream); + if (rotations > 0) { + // cuda_memset with size 0 is invalid, so avoid it + cuda_memset_async( + rotated_buffer + (num_blocks - rotations) * big_lwe_size, + 0, rotations * big_lwe_size_bytes, stream); + } cuda_memcpy_async_gpu_to_gpu(lwe_array, rotated_buffer, num_blocks * big_lwe_size_bytes, stream); diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 40d0404bcc..9a10f38396 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -6,8 +6,8 @@ extern "C" { /// Create a new Cuda stream on GPU `gpu_index` pub fn cuda_create_stream(gpu_index: u32) -> *mut c_void; - /// Destroy the Cuda stream `v_stream` on GPU `gpu_index` - pub fn cuda_destroy_stream(v_stream: *mut c_void) -> i32; + /// Destroy the Cuda stream `v_stream` + pub fn cuda_destroy_stream(v_stream: *mut c_void); /// Allocate `size` memory on GPU `gpu_index` asynchronously pub fn cuda_malloc_async(size: u64, v_stream: *const c_void) -> *mut c_void; @@ -19,7 +19,7 @@ extern "C" { src: *const c_void, size: u64, v_stream: *const c_void, - ) -> i32; + ); /// Copy `size` memory asynchronously from `src` on CPU to `dest` on GPU `gpu_index` using /// the Cuda stream `v_stream`. @@ -28,7 +28,7 @@ extern "C" { src: *const c_void, size: u64, v_stream: *const c_void, - ) -> i32; + ); /// Copy `size` memory asynchronously from `src` to `dest` on the same GPU `gpu_index` using /// the Cuda stream `v_stream`. @@ -37,31 +37,26 @@ extern "C" { src: *const c_void, size: u64, v_stream: *const c_void, - ) -> i32; + ); /// Copy `size` memory asynchronously from `src` on CPU to `dest` on GPU `gpu_index` using /// the Cuda stream `v_stream`. 
- pub fn cuda_memset_async( - dest: *mut c_void, - value: u64, - size: u64, - v_stream: *const c_void, - ) -> i32; + pub fn cuda_memset_async(dest: *mut c_void, value: u64, size: u64, v_stream: *const c_void); /// Get the total number of Nvidia GPUs detected on the platform pub fn cuda_get_number_of_gpus() -> i32; /// Synchronize all streams on GPU `gpu_index` - pub fn cuda_synchronize_device(gpu_index: u32) -> i32; + pub fn cuda_synchronize_device(gpu_index: u32); /// Synchronize Cuda stream - pub fn cuda_synchronize_stream(v_stream: *const c_void) -> i32; + pub fn cuda_synchronize_stream(v_stream: *const c_void); /// Free memory for pointer `ptr` on GPU `gpu_index` asynchronously, using stream `v_stream` - pub fn cuda_drop_async(ptr: *mut c_void, v_stream: *const c_void) -> i32; + pub fn cuda_drop_async(ptr: *mut c_void, v_stream: *const c_void); /// Free memory for pointer `ptr` on GPU `gpu_index` synchronously - pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32) -> i32; + pub fn cuda_drop(ptr: *mut c_void, gpu_index: u32); /// Get the maximum amount of shared memory on GPU `gpu_index` pub fn cuda_get_max_shared_memory(gpu_index: u32) -> i32; diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 9cec947a59..5502c323f3 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -90,12 +90,15 @@ impl CudaStream { T: Numeric + Into, { let dest_size = dest.len() * std::mem::size_of::(); - cuda_memset_async( - dest.as_mut_c_ptr(), - value.into(), - dest_size as u64, - self.as_c_ptr(), - ); + // We have to check that dest is not empty, because cuda_memset with size 0 is invalid + if dest_size > 0 { + cuda_memset_async( + dest.as_mut_c_ptr(), + value.into(), + dest_size as u64, + self.as_c_ptr(), + ); + } } /// Copies data from slice into GPU pointer @@ -112,12 +115,16 @@ impl CudaStream { let src_size = std::mem::size_of_val(src); assert!(dest.len() * std::mem::size_of::() >= src_size); - cuda_memcpy_async_to_gpu( - dest.as_mut_c_ptr(), - src.as_ptr().cast(), - src_size as u64, - self.as_c_ptr(), - ); + // We have to check that src is not empty, because Rust slice with size 0 results in an + // invalid pointer being passed to copy_to_gpu_async + if src_size > 0 { + cuda_memcpy_async_to_gpu( + dest.as_mut_c_ptr(), + src.as_ptr().cast(), + src_size as u64, + self.as_c_ptr(), + ); + } } /// Copies data between different arrays in the GPU @@ -133,13 +140,16 @@ impl CudaStream { T: Numeric, { assert!(dest.len() >= src.len()); - let size = dest.len() * std::mem::size_of::(); - cuda_memcpy_async_gpu_to_gpu( - dest.as_mut_c_ptr(), - src.as_c_ptr(), - size as u64, - self.as_c_ptr(), - ); + let size = src.len() * std::mem::size_of::(); + // We check that src is not empty to avoid invalid pointers + if size > 0 { + cuda_memcpy_async_gpu_to_gpu( + dest.as_mut_c_ptr(), + src.as_c_ptr(), + size as u64, + self.as_c_ptr(), + ); + } } /// Copies data from GPU pointer into slice @@ -153,15 +163,19 @@ impl CudaStream { where T: Numeric, { - let dest_size = std::mem::size_of_val(dest); - assert!(dest_size >= src.len() * std::mem::size_of::()); - - cuda_memcpy_async_to_cpu( - dest.as_mut_ptr().cast(), - src.as_c_ptr(), - dest_size as u64, - self.as_c_ptr(), - ); + let src_size = src.len() * std::mem::size_of::(); + assert!(std::mem::size_of_val(dest) >= src_size); + + // We have to check that src is not empty, because Rust slice with size 0 results in an + // invalid pointer being passed to copy_to_cpu_async + if src_size > 0 { + cuda_memcpy_async_to_cpu( + 
dest.as_mut_ptr().cast(), + src.as_c_ptr(), + src_size as u64, + self.as_c_ptr(), + ); + } } /// Discarding bootstrap on a vector of LWE ciphertexts
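
Note on the error-handling convention this patch introduces (illustrative, not part of the diff): with the integer error codes removed, callers no longer branch on return values. Every CUDA runtime call in device.cu is wrapped in check_cuda_error, which prints cudaGetErrorString plus file/line and aborts, while PANIC is reserved for logical errors such as invalid device pointers or zero-sized memsets. A minimal caller sketch, assuming the device.h from this patch is on the include path; the helper name and buffer size below are hypothetical:

    #include <cstdint>
    #include "device.h"

    // Hypothetical helper: allocate a device buffer, clear it, copy it back to the
    // host, then release everything. No return codes are checked: the backend
    // aborts on any CUDA error or invalid argument.
    void roundtrip_example(uint32_t gpu_index) {
      cuda_stream_t *stream = cuda_create_stream(gpu_index);

      const uint64_t size = 1024 * sizeof(uint64_t);
      cuda_check_valid_malloc(size, gpu_index);       // panics if the device lacks memory
      uint64_t *d_buf = (uint64_t *)cuda_malloc_async(size, stream);

      cuda_memset_async(d_buf, 0, size, stream);      // panics on zero size or host pointer

      uint64_t h_buf[1024];
      cuda_memcpy_async_to_cpu(h_buf, d_buf, size, stream);
      cuda_synchronize_stream(stream);

      cuda_drop_async(d_buf, stream);
      cuda_destroy_stream(stream);
    }

On the Rust side the same convention holds: the bindings in cuda_bind.rs now return (), and the safe wrappers in core_crypto/gpu guard against empty slices before calling into the backend, since a zero-sized memset or an invalid pointer would otherwise abort the process.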