diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index c1874aed98..a5337a91d1 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -15,7 +15,6 @@ enum SHIFT_OR_ROTATE_TYPE { LEFT_ROTATE = 2, RIGHT_ROTATE = 3 }; -enum LUT_TYPE { OPERATOR = 0, MAXVALUE = 1, ISNONZERO = 2, BLOCKSLEN = 3 }; enum BITOP_TYPE { BITAND = 0, BITOR = 1, @@ -36,6 +35,11 @@ enum COMPARISON_TYPE { MIN = 7, }; +enum COMPRESSION_MODE { + COMPRESS = 0, + DECOMPRESS = 1, +}; + enum CMP_ORDERING { IS_INFERIOR = 0, IS_EQUAL = 1, IS_SUPERIOR = 2 }; enum SIGNED_OPERATION { ADDITION = 1, SUBTRACTION = -1 }; @@ -202,6 +206,30 @@ void cuda_scalar_comparison_integer_radix_ciphertext_kb_64( void cleanup_cuda_integer_comparison(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr_void); +void scratch_cuda_compression_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size, + uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, + uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus, + PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus, + COMPRESSION_MODE mode, bool allocate_gpu_memory); + +void cuda_compression_compress_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths, + int8_t *mem_ptr); + +void cuda_compression_decompress_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void *glwe_in, void *indexes_array, + uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr); + +void cleanup_cuda_compression_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr_void); + void scratch_cuda_integer_radix_bitop_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -452,7 +480,8 @@ struct int_radix_params { message_modulus(message_modulus), carry_modulus(carry_modulus){}; void print() { - printf("pbs_type: %u, glwe_dimension: %u, polynomial_size: %u, " + printf("pbs_type: %u, glwe_dimension: %u, " + "polynomial_size: %u, " "big_lwe_dimension: %u, " "small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: " "%u, pbs_base_log: " @@ -790,6 +819,91 @@ template struct int_radix_lut { } }; +template struct int_compression { + COMPRESSION_MODE mode; + int_radix_params encryption_params; + int_radix_params compression_params; + uint32_t storage_log_modulus; + uint32_t lwe_per_glwe; + + uint32_t body_count; + + // Compression + Torus *tmp_lwe; + Torus *tmp_glwe_array_out; + + // Decompression + Torus *tmp_extracted_glwe; + Torus *tmp_extracted_lwe; + int_radix_lut *carry_extract_lut; + + int_compression(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params encryption_params, + int_radix_params compression_params, + uint32_t num_radix_blocks, uint32_t lwe_per_glwe, + uint32_t storage_log_modulus, COMPRESSION_MODE mode, + bool allocate_gpu_memory) { + this->mode = mode; + this->encryption_params = encryption_params; + 
this->compression_params = compression_params; + this->lwe_per_glwe = lwe_per_glwe; + this->storage_log_modulus = storage_log_modulus; + this->body_count = num_radix_blocks; + + if (allocate_gpu_memory) { + Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) * + compression_params.polynomial_size; + + tmp_lwe = (Torus *)cuda_malloc_async( + num_radix_blocks * (compression_params.small_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + tmp_glwe_array_out = (Torus *)cuda_malloc_async( + glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]); + + if (mode == COMPRESSION_MODE::DECOMPRESS) { + carry_extract_lut = new int_radix_lut( + streams, gpu_indexes, gpu_count, encryption_params, 1, + num_radix_blocks, allocate_gpu_memory); + + tmp_extracted_glwe = (Torus *)cuda_malloc_async( + glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]); + tmp_extracted_lwe = (Torus *)cuda_malloc_async( + num_radix_blocks * + (compression_params.glwe_dimension * + compression_params.polynomial_size + + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + // Decompression + // Carry extract LUT + auto carry_extract_f = [encryption_params](Torus x) -> Torus { + return x / encryption_params.message_modulus; + }; + + generate_device_accumulator( + streams[0], gpu_indexes[0], + carry_extract_lut->get_lut(gpu_indexes[0], 0), + encryption_params.glwe_dimension, encryption_params.polynomial_size, + encryption_params.message_modulus, encryption_params.carry_modulus, + carry_extract_f); + + carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + } + } + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]); + if (mode == COMPRESSION_MODE::DECOMPRESS) { + carry_extract_lut->release(streams, gpu_indexes, gpu_count); + delete (carry_extract_lut); + cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]); + } + } +}; template struct int_bit_extract_luts_buffer { int_radix_params params; int_radix_lut *lut; diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h index 924ec131f9..202c3d6329 100644 --- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h +++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h @@ -16,6 +16,12 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples); + +void cuda_fp_keyswitch_lwe_list_to_glwe_64( + void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in, + void *fp_ksk_array, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t num_lwes); } #endif // CNCRT_KS_H_ diff --git a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt index 3190501fc5..5fec699fbc 100644 --- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt +++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt @@ -1,17 +1,3 @@ -set(SOURCES - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h - 
${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h) file(GLOB_RECURSE SOURCES "*.cu") add_library(tfhe_cuda_backend STATIC ${SOURCES}) set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh index b45353a616..9971b5c000 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh @@ -38,8 +38,8 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in, auto lwe_out = lwe_array_out + input_id * lwe_output_size; // We assume each GLWE will store the first polynomial_size inputs - uint32_t nth_per_glwe = params::degree; - auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size; + uint32_t lwe_per_glwe = params::degree; + auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size; auto nth = nth_array[input_id]; @@ -50,11 +50,11 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in, template __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *glwe_array_in, - uint32_t *nth_array, uint32_t num_glwes, + uint32_t *nth_array, uint32_t num_nths, uint32_t glwe_dimension) { cudaSetDevice(gpu_index); - dim3 grid(num_glwes); + dim3 grid(num_nths); dim3 thds(params::degree / params::opt); sample_extract<<>>( lwe_array_out, glwe_array_in, nth_array, glwe_dimension); diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu index 524a1fa45e..921aa01d7b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu @@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { - cuda_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { - cuda_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -48,3 +48,21 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(ksk), lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); } + +/* Perform functional packing keyswitch on a batch of 64 bits input LWE + * ciphertexts. 
+ */ +void cuda_fp_keyswitch_lwe_list_to_glwe_64( + void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in, + void *fp_ksk_array, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t num_lwes) { + + host_fp_keyswitch_lwe_list_to_glwe( + static_cast(stream), gpu_index, + static_cast(glwe_array_out), + static_cast(lwe_array_in), + static_cast(fp_ksk_array), input_lwe_dimension, + output_glwe_dimension, output_polynomial_size, base_log, level_count, + num_lwes); +} diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh index db78104a98..423375bccf 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh @@ -7,6 +7,7 @@ #include "polynomial/functions.cuh" #include "polynomial/polynomial_math.cuh" #include "torus.cuh" +#include "utils/helper.cuh" #include "utils/kernel_dimensions.cuh" #include #include @@ -98,7 +99,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, } template -__host__ void cuda_keyswitch_lwe_ciphertext_vector( +__host__ void host_keyswitch_lwe_ciphertext_vector( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, @@ -146,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes, GET_VARIANT_ELEMENT(lwe_input_indexes, i); // Compute Keyswitch - cuda_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( streams[i], gpu_indexes[i], current_lwe_array_out, current_lwe_output_indexes, current_lwe_array_in, current_lwe_input_indexes, ksks[i], lwe_dimension_in, lwe_dimension_out, @@ -154,4 +155,146 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes, } } +// chunk_count = glwe_size * polynomial_size / threads. 
+// each threads will responsible to process only lwe_size times multiplication +template +__device__ void keyswitch_lwe_ciphertext_into_glwe_ciphertext( + Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count) { + + extern __shared__ int8_t sharedmem[]; + + // result accumulator, shared memory is used because of frequent access + Torus *local_glwe_chunk = (Torus *)sharedmem; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + const int shmem_index = threadIdx.x; + // the output_glwe is split in chunks and each x-block takes one of them + size_t chunk_id = blockIdx.x; + size_t coef_per_block = blockDim.x; + + // dimensions + size_t glwe_size = (glwe_dimension + 1); + // number of coefficients inside fp-ksk block for each lwe_input coefficient + size_t ksk_block_size = glwe_size * polynomial_size * level_count; + + // initialize accumulator to 0 + local_glwe_chunk[shmem_index] = + SEL(0, lwe_in[lwe_dimension_in], tid == glwe_dimension * polynomial_size); + + // Iterate through all lwe elements + for (int i = 0; i < lwe_dimension_in; i++) { + // Round and prepare decomposition + Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count); + + Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count); + Torus mod_b_mask = (1ll << base_log) - 1ll; + + // block of key for current lwe coefficient (cur_input_lwe[i]) + auto ksk_block = &fp_ksk[i * ksk_block_size]; + for (int j = 0; j < level_count; j++) { + auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size]; + // Iterate through each level and multiply by the ksk piece + auto ksk_glwe_chunk = &ksk_glwe[chunk_id * coef_per_block]; + Torus decomposed = decompose_one(state, mod_b_mask, base_log); + local_glwe_chunk[shmem_index] -= decomposed * ksk_glwe_chunk[shmem_index]; + } + } + + // Persist + glwe_out[tid] = local_glwe_chunk[shmem_index]; +} + +// public functional packing keyswitch +// +// blockIdx.y - input +// chunk_count = glwe_size * polynomial_size / threads. 
+template +__global__ void +fp_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in, + Torus *fp_ksk, uint32_t lwe_dimension_in, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, + Torus *d_mem) { + const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; + const int lwe_size = (lwe_dimension_in + 1); + + const int input_id = blockIdx.y; + const int degree = input_id; + + // Select an input + auto lwe_in = lwe_array_in + input_id * lwe_size; + auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size; + auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size; + // KS LWE to GLWE + keyswitch_lwe_ciphertext_into_glwe_ciphertext( + ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension, + polynomial_size, base_log, level_count); + + // P * x ^degree + polynomial_accumulate_monic_monomial_mul( + glwe_out, ks_glwe_out, degree, polynomial_size, true); +} + +/// To-do: Rewrite this kernel for efficiency +template +__global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in, + uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t num_lwes) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + extern __shared__ int8_t sharedmem[]; + const int shmem_index = threadIdx.x; + + Torus *glwe_acc = (Torus *)sharedmem; + glwe_acc[shmem_index] = glwe_array_in[tid]; + + // Accumulate + for (int i = 1; i < num_lwes; i++) { + auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size; + glwe_acc[shmem_index] += glwe_in[tid]; + } + + glwe_out[tid] = glwe_acc[shmem_index]; +} + +template +__host__ void host_fp_keyswitch_lwe_list_to_glwe( + cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out, + Torus *lwe_array_in, Torus *fp_ksk_array, uint32_t lwe_dimension_in, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, uint32_t num_lwes) { + cudaSetDevice(gpu_index); + int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; + + int num_blocks = 0, num_threads = 0; + getNumBlocksAndThreads(glwe_accumulator_size, 512, num_blocks, num_threads); + + auto shared_mem = sizeof(Torus) * num_threads; + dim3 grid(num_blocks, num_lwes); + dim3 threads(num_threads); + + auto d_mem = (Torus *)cuda_malloc_async( + num_lwes * glwe_accumulator_size * sizeof(Torus), stream, gpu_index); + auto d_tmp_glwe_array_out = (Torus *)cuda_malloc_async( + num_lwes * glwe_accumulator_size * sizeof(Torus), stream, gpu_index); + + // individually keyswitch each lwe + fp_keyswitch_lwe_list_to_glwe<<>>( + d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in, + glwe_dimension, polynomial_size, base_log, level_count, d_mem); + check_cuda_error(cudaGetLastError()); + + // accumulate to a single glwe + accumulate_glwes<<>>( + glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size, + num_lwes); + check_cuda_error(cudaGetLastError()); + + cuda_drop_async(d_mem, stream, gpu_index); + cuda_drop_async(d_tmp_glwe_array_out, stream, gpu_index); +} + #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh index 177892e5a9..69b85c30af 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh @@ -2,6 +2,7 @@ #define CNCRT_TORUS_CUH #include "types/int128.cuh" +#include "utils/kernel_dimensions.cuh" #include template @@ -29,29 +30,51 @@ __device__ inline void typecast_double_to_torus(double x, template 
__device__ inline T round_to_closest_multiple(T x, uint32_t base_log, uint32_t level_count) { - T shift = sizeof(T) * 8 - level_count * base_log; - T mask = 1ll << (shift - 1); - T b = (x & mask) >> (shift - 1); + const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log; + const T shift = non_rep_bit_count - 1; T res = x >> shift; - res += b; - res <<= shift; - return res; + res += 1; + res &= (T)(-2); + return res << shift; } template -__device__ __forceinline__ void modulus_switch(T input, T &output, - uint32_t log_modulus) { +__device__ __forceinline__ void apply_modulus_switch(T input, T &output, + uint32_t log_modulus) { constexpr uint32_t BITS = sizeof(T) * 8; - output = input + (((T)1) << (BITS - log_modulus - 1)); output >>= (BITS - log_modulus); } template -__device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) { +__device__ __forceinline__ T apply_modulus_switch(T input, + uint32_t log_modulus) { T output; - modulus_switch(input, output, log_modulus); + apply_modulus_switch(input, output, log_modulus); return output; } +template +__global__ void apply_modulus_switch_inplace(Torus *array, int size, + uint32_t log_modulus) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < size) { + array[tid] = apply_modulus_switch(array[tid], log_modulus); + } +} + +template +__host__ void modulus_switch_inplace(cudaStream_t stream, uint32_t gpu_index, + Torus *array, int size, + uint32_t log_modulus) { + cudaSetDevice(gpu_index); + + int num_threads = 0, num_blocks = 0; + getNumBlocksAndThreads(size, 1024, num_blocks, num_threads); + + apply_modulus_switch_inplace<<>>( + array, size, log_modulus); + check_cuda_error(cudaGetLastError()); +} + #endif // CNCRT_TORUS_H diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu new file mode 100644 index 0000000000..22cd3c6161 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu @@ -0,0 +1,61 @@ +#include "compression.cuh" + +void scratch_cuda_compression_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size, + uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, + uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor, + uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus, + PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus, + COMPRESSION_MODE mode, bool allocate_gpu_memory) { + + int_radix_params encryption_params( + pbs_type, encryption_glwe_dimension, encryption_polynomial_size, + (encryption_glwe_dimension + 1) * encryption_polynomial_size, + lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log, + grouping_factor, message_modulus, carry_modulus); + + int_radix_params compression_params( + pbs_type, compression_glwe_dimension, compression_polynomial_size, + (compression_glwe_dimension + 1) * compression_polynomial_size, + lwe_dimension, ks_level, ks_base_log, pbs_level, pbs_base_log, + grouping_factor, message_modulus, carry_modulus); + + scratch_cuda_compression_integer_radix_ciphertext_64( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_compression **)mem_ptr, num_lwes, encryption_params, + compression_params, lwe_per_glwe, storage_log_modulus, mode, + allocate_gpu_memory); +} 
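Editor's note: a minimal, hypothetical host-side driver for the compression entry points declared in integer.h is sketched below. Every numeric parameter, the helper name example_compress, and the device pointers are illustrative placeholders, not values fixed by this patch; only the function names and argument order come from the declarations above.

// Illustrative sketch only: drive scratch -> compress -> cleanup.
// All handles (streams, gpu_indexes, device buffers, fp_ksk) and the numeric
// parameters are caller-supplied placeholders.
void example_compress(void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
                      void *d_glwe_array_out, void *d_lwe_array_in,
                      void **d_fp_ksk, uint32_t num_lwes, PBS_TYPE pbs_type) {
  int8_t *mem_ptr = nullptr;

  // Allocate scratch buffers for COMPRESS mode (example parameters).
  scratch_cuda_compression_integer_radix_ciphertext_64(
      streams, gpu_indexes, gpu_count, &mem_ptr,
      /*encryption_glwe_dimension=*/1, /*encryption_polynomial_size=*/2048,
      /*compression_glwe_dimension=*/1, /*compression_polynomial_size=*/1024,
      /*lwe_dimension=*/742, /*ks_level=*/5, /*ks_base_log=*/3,
      /*pbs_level=*/1, /*pbs_base_log=*/23, /*grouping_factor=*/0,
      num_lwes, /*message_modulus=*/4, /*carry_modulus=*/4, pbs_type,
      /*lwe_per_glwe=*/1024, /*storage_log_modulus=*/12,
      COMPRESSION_MODE::COMPRESS, /*allocate_gpu_memory=*/true);

  // Shift, packing-keyswitch, modulus-switch and bit-pack the LWE list into
  // the output GLWE array.
  cuda_compression_compress_integer_radix_ciphertext_64(
      streams, gpu_indexes, gpu_count, d_glwe_array_out, d_lwe_array_in,
      d_fp_ksk, num_lwes, mem_ptr);

  // Release the scratch buffers.
  cleanup_cuda_compression_integer_radix_ciphertext_64(streams, gpu_indexes,
                                                       gpu_count, &mem_ptr);
}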
+void cuda_compression_compress_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths, + int8_t *mem_ptr) { + + host_integer_compression_compress( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(glwe_array_out), + static_cast(lwe_array_in), (uint64_t **)(fp_ksk), num_nths, + (int_compression *)mem_ptr); +} +void cuda_compression_decompress_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void *glwe_in, void *indexes_array, + uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) { + + host_integer_compression_decompress( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lwe_array_out), static_cast(glwe_in), + static_cast(indexes_array), indexes_array_size, bsks, + (int_compression *)mem_ptr); +} + +void cleanup_cuda_compression_integer_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr_void) { + + int_compression *mem_ptr = + (int_compression *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh new file mode 100644 index 0000000000..0bf8b7c54a --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -0,0 +1,225 @@ +#ifndef CUDA_INTEGER_COMPRESSION_CUH +#define CUDA_INTEGER_COMPRESSION_CUH + +#include "ciphertext.h" +#include "crypto/keyswitch.cuh" +#include "device.h" +#include "integer.h" +#include "integer/integer.cuh" +#include "linearalgebra/multiplication.cuh" +#include "polynomial/functions.cuh" +#include "utils/kernel_dimensions.cuh" + +template +__global__ void kernel_pack(Torus *array_out, Torus *array_in, + uint32_t log_modulus, uint32_t in_len, + uint32_t len) { + auto nbits = sizeof(Torus) * 8; + + auto i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < len) { + auto k = nbits * i / log_modulus; + auto j = k; + + auto start_shift = i * nbits - j * log_modulus; + + auto value = array_in[j] >> start_shift; + j++; + + while (j * log_modulus < ((i + 1) * nbits) && j < in_len) { + auto shift = j * log_modulus - i * nbits; + value |= array_in[j] << shift; + j++; + } + + array_out[i] = value; + } +} + +template +__host__ void pack(cudaStream_t stream, Torus *array_out, Torus *array_in, + uint32_t num_inputs, uint32_t body_count, + int_compression *mem_ptr) { + + auto params = mem_ptr->compression_params; + + auto log_modulus = mem_ptr->storage_log_modulus; + auto in_len = params.glwe_dimension * params.polynomial_size + body_count; + auto number_bits_to_pack = in_len * log_modulus; + + auto nbits = sizeof(Torus) * 8; + // number_bits_to_pack.div_ceil(Scalar::BITS) + auto len = (number_bits_to_pack + nbits - 1) / nbits; + + int num_blocks = 0, num_threads = 0; + getNumBlocksAndThreads(len, 1024, num_blocks, num_threads); + + dim3 grid(num_blocks); + dim3 threads(num_threads); + kernel_pack<<>>(array_out, array_in, log_modulus, + in_len, len); +} + +template +__host__ void host_integer_compression_compress( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *glwe_array_out, Torus *lwe_array_in, Torus **fp_ksk, + uint32_t num_lwes, int_compression *mem_ptr) { + + auto compression_params = mem_ptr->compression_params; + auto input_lwe_dimension = 
compression_params.small_lwe_dimension; + + // Shift + auto lwe_shifted = mem_ptr->tmp_lwe; + host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted, + lwe_array_in, + (uint64_t)compression_params.message_modulus, + input_lwe_dimension, num_lwes); + + uint32_t lwe_in_size = input_lwe_dimension + 1; + uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) * + compression_params.polynomial_size; + uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1; + + // Keyswitch LWEs to GLWE + auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out; + for (int i = 0; i < num_glwes; i++) { + auto lwe_subset = lwe_shifted + i * lwe_in_size; + auto glwe_out = tmp_glwe_array_out + i * glwe_out_size; + + host_fp_keyswitch_lwe_list_to_glwe( + streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0], + input_lwe_dimension, compression_params.glwe_dimension, + compression_params.polynomial_size, compression_params.ks_base_log, + compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe)); + } + + auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe); + + // Modulus switch + modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out, + num_glwes * (compression_params.glwe_dimension * + compression_params.polynomial_size + + body_count), + mem_ptr->storage_log_modulus); + check_cuda_error(cudaGetLastError()); + + pack(streams[0], glwe_array_out, tmp_glwe_array_out, num_glwes, body_count, + mem_ptr); +} + +template +__global__ void kernel_extract(Torus *glwe_array_out, Torus *array_in, + uint32_t index, uint32_t log_modulus, + uint32_t initial_out_len) { + auto nbits = sizeof(Torus) * 8; + + auto i = threadIdx.x + blockIdx.x * blockDim.x; + + if (i < initial_out_len) { + // Unpack + Torus mask = ((Torus)1 << log_modulus) - 1; + auto start = i * log_modulus; + auto end = (i + 1) * log_modulus; + + auto start_block = start / nbits; + auto start_remainder = start % nbits; + + auto end_block_inclusive = (end - 1) / nbits; + + Torus unpacked_i; + if (start_block == end_block_inclusive) { + auto single_part = array_in[start_block] >> start_remainder; + unpacked_i = single_part & mask; + } else { + auto first_part = array_in[start_block] >> start_remainder; + auto second_part = array_in[start_block + 1] << (nbits - start_remainder); + + unpacked_i = (first_part | second_part) & mask; + } + + // Extract + glwe_array_out[i] = unpacked_i << (nbits - log_modulus); + } +} + +template +__host__ void extract(cudaStream_t stream, uint32_t gpu_index, + Torus *glwe_array_out, Torus *array_in, + uint32_t glwe_index, int_compression *mem_ptr) { + + auto params = mem_ptr->compression_params; + + auto log_modulus = mem_ptr->storage_log_modulus; + + uint32_t body_count = mem_ptr->body_count; + auto initial_out_len = + params.glwe_dimension * params.polynomial_size + body_count * body_count; + + // We assure the tail of the glwe is zeroed + auto zeroed_slice = glwe_array_out + initial_out_len; + cuda_memset_async(zeroed_slice, 0, + (params.polynomial_size - body_count * body_count) * + sizeof(Torus), + stream, gpu_index); + + int num_blocks = 0, num_threads = 0; + getNumBlocksAndThreads(initial_out_len, 512, num_blocks, num_threads); + dim3 grid(num_blocks); + dim3 threads(num_threads); + kernel_extract<<>>( + glwe_array_out, array_in, glwe_index, log_modulus, initial_out_len); + check_cuda_error(cudaGetLastError()); +} + +template +__host__ void host_integer_compression_decompress( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *lwe_array_out, Torus 
*packed_glwe_in, uint32_t *indexes_array, + uint32_t indexes_array_size, void **bsks, int_compression *mem_ptr) { + + auto extracted_glwe = mem_ptr->tmp_extracted_glwe; + auto compression_params = mem_ptr->compression_params; + extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0, + mem_ptr); + + auto num_lwes = mem_ptr->body_count; + + // Sample extract + auto extracted_lwe = mem_ptr->tmp_extracted_lwe; + cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe, + extracted_glwe, indexes_array, indexes_array_size, + compression_params.glwe_dimension, + compression_params.polynomial_size); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + auto encryption_params = mem_ptr->encryption_params; + auto carry_extract_lut = mem_ptr->carry_extract_lut; + execute_pbs_async( + streams, gpu_indexes, gpu_count, lwe_array_out, + carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec, + carry_extract_lut->lut_indexes_vec, extracted_lwe, + carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer, + encryption_params.glwe_dimension, + compression_params.glwe_dimension * compression_params.polynomial_size, + encryption_params.polynomial_size, encryption_params.pbs_base_log, + encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes, + encryption_params.pbs_type); + + cudaDeviceSynchronize(); +} + +template +__host__ void scratch_cuda_compression_integer_radix_ciphertext_64( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_compression **mem_ptr, uint32_t num_lwes, + int_radix_params encryption_params, int_radix_params compression_params, + uint32_t lwe_per_glwe, uint32_t storage_log_modulus, COMPRESSION_MODE mode, + bool allocate_gpu_memory) { + + *mem_ptr = new int_compression( + streams, gpu_indexes, gpu_count, encryption_params, compression_params, + num_lwes, lwe_per_glwe, storage_log_modulus, mode, allocate_gpu_memory); +} +#endif diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu index 2a936d7cdf..a64c15378d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu @@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_multiplication(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(cleartext_array_in), - input_lwe_dimension, - input_lwe_ciphertext_count); + host_cleartext_vec_multiplication( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(cleartext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } /* * Perform the multiplication of a u64 input LWE ciphertext vector with a u64 @@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_multiplication(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(cleartext_array_in), - input_lwe_dimension, - input_lwe_ciphertext_count); + host_cleartext_vec_multiplication( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + 
static_cast(cleartext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh index 9fd9be9419..b9864973a0 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh @@ -14,9 +14,10 @@ #include template -__global__ void -cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input, - uint32_t input_lwe_dimension, uint32_t num_entries) { +__global__ void cleartext_vec_multiplication(T *output, T *lwe_input, + T *cleartext_input, + uint32_t input_lwe_dimension, + uint32_t num_entries) { int tid = threadIdx.x; int index = blockIdx.x * blockDim.x + tid; @@ -27,10 +28,46 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input, } } +template +__host__ void +host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index, + T *output, T *lwe_input, T *cleartext_input, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = input_lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = input_lwe_ciphertext_count * lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + cleartext_vec_multiplication<<>>( + output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); + check_cuda_error(cudaGetLastError()); +} + +template +__global__ void +cleartext_multiplication(T *output, T *lwe_input, T cleartext_input, + uint32_t input_lwe_dimension, uint32_t num_entries) { + + int tid = threadIdx.x; + int index = blockIdx.x * blockDim.x + tid; + if (index < num_entries) { + // Here we take advantage of the wrapping behaviour of uint + output[index] = lwe_input[index] * cleartext_input; + } +} + template __host__ void host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index, - T *output, T *lwe_input, T *cleartext_input, + T *output, T *lwe_input, T cleartext_input, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index 6a60a0f6d1..ea95197ec9 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -88,8 +88,8 @@ __global__ void device_programmable_bootstrap_amortized( // Put "b", the body, in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -104,8 +104,8 @@ __global__ void device_programmable_bootstrap_amortized( // Put "a" in [0, 2N[ instead of Zq Torus a_hat = 0; - modulus_switch(block_lwe_array_in[iteration], a_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[iteration], a_hat, + params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git 
a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 94d383a2f3..9d0c50c78a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -93,8 +93,8 @@ __global__ void device_programmable_bootstrap_cg( // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( @@ -106,7 +106,7 @@ __global__ void device_programmable_bootstrap_cg( // Put "a" in [0, 2N[ Torus a_hat = 0; - modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[i], a_hat, params::log2_degree + 1); // Perform ACC * (X^ä - 1) multiply_by_monomial_negacyclic_and_sub_polynomial< diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index a40fdf8b9d..daf43bb097 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -80,8 +80,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) if (lwe_offset == 0) { // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index 47d6955cb1..64a73b3afc 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -75,8 +75,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // First iteration // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); // The y-dimension is used to select the element of the GLWE this block will // compute divide_by_monomial_negacyclic_inplace @@ -102,7 +102,7 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( synchronize_threads_in_block(); // Multiply by the bsk element - polynomial_product_accumulate_by_monomial( + polynomial_accumulate_monic_monomial_mul( accumulator, bsk_poly, monomial_degree, false); } @@ -203,8 +203,8 @@ __global__ void __launch_bounds__(params::degree / params::opt) // Initializes the accumulator with the body of LWE // Put "b" in [0, 2N[ Torus b_hat = 0; - modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, - params::log2_degree + 1); + apply_modulus_switch(block_lwe_array_in[lwe_dimension], b_hat, + params::log2_degree + 1); divide_by_monomial_negacyclic_inplace( diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh index a6d7901f20..9654e66b68 100644 --- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh +++ 
b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh @@ -55,14 +55,15 @@ __device__ void polynomial_product_accumulate_in_fourier_domain( } } -// If init_accumulator is set, assumes that result was not initialized and does -// that with the outcome of first * second +// This method expects to work with polynomial_size / compression_params::opt +// threads in the x-block If init_accumulator is set, assumes that result was +// not initialized and does that with the outcome of first * second template __device__ void -polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly, - uint64_t monomial_degree, - bool init_accumulator = false) { - // monomial_degree \in [0, 2 * params::degree) +polynomial_accumulate_monic_monomial_mul(T *result, const T *__restrict__ poly, + uint64_t monomial_degree, + bool init_accumulator = false) { + // monomial_degree \in [0, 2 * compression_params::degree) int full_cycles_count = monomial_degree / params::degree; int remainder_degrees = monomial_degree % params::degree; @@ -82,4 +83,37 @@ polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly, } } +// This method expects to work with num_poly * polynomial_size threads in the +// grid +template +__device__ void polynomial_accumulate_monic_monomial_mul( + T *result_array, T *poly_array, uint64_t monomial_degree, + uint32_t polynomial_size, + bool init_accumulator = false) { + // monomial_degree \in [0, 2 * compression_params::degree) + int full_cycles_count = monomial_degree / polynomial_size; + int remainder_degrees = monomial_degree % polynomial_size; + + auto tid = threadIdx.x + blockIdx.x * blockDim.x; + int pos = tid % polynomial_size; + + // Select a input + auto poly = poly_array + (tid / polynomial_size) * polynomial_size; + auto result = result_array + (tid / polynomial_size) * polynomial_size; + + // Calculate the rotation + T element = poly[pos]; + int new_pos = (pos + monomial_degree) % polynomial_size; + + // Calculate the new coefficient + T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient + x = SEL(-x, x, new_pos >= remainder_degrees); + + // Write result + if (init_accumulator) + result[new_pos] = x; + else + result[new_pos] += x; +} + #endif // CNCRT_POLYNOMIAL_MATH_H diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 70f9b2a607..2d2a609e09 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -310,6 +310,19 @@ extern "C" { level_count: u32, num_samples: u32, ); + pub fn cuda_fp_keyswitch_lwe_list_to_glwe_64( + stream: *mut c_void, + gpu_index: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk_array: *const c_void, + input_lwe_dimension: u32, + output_glwe_dimension: u32, + polynomial_size: u32, + base_log: u32, + level_count: u32, + num_lwes: u32, + ); /// Perform the negation of a u64 input LWE ciphertext vector. 
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch @@ -484,6 +497,61 @@ extern "C" { mem_ptr: *mut *mut i8, ); + pub fn scratch_cuda_compression_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + encryption_glwe_dimension: u32, + encryption_polynomial_size: u32, + compression_glwe_dimension: u32, + compression_polynomial_size: u32, + lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + pbs_level: u32, + pbs_base_log: u32, + grouping_factor: u32, + num_lwes: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + lwe_per_glwe: u32, + storage_log_modulus: u32, + mode: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_compression_compress_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk: *const *mut c_void, + num_lwes: u32, + mem_ptr: *mut i8, + ); + + pub fn cuda_compression_decompress_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_out: *mut c_void, + glwe_array_in: *const c_void, + indexes_array: *const c_void, + indexes_array_size: u32, + bsks: *const *mut c_void, + mem_ptr: *mut i8, + ); + + pub fn cleanup_cuda_compression_integer_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); + pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( streams: *const *mut c_void, gpu_indexes: *const u32, diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs new file mode 100644 index 0000000000..3fe1180952 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs @@ -0,0 +1,34 @@ +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; +use crate::core_crypto::gpu::{packing_keyswitch_list_async, CudaStreams}; +use crate::core_crypto::prelude::{CastInto, UnsignedTorus}; + +pub fn cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext( + lwe_pksk: &CudaLwePackingKeyswitchKey, + input_lwe_ciphertext_list: &CudaLweCiphertextList, + output_glwe_ciphertext: &mut CudaGlweCiphertextList, + streams: &CudaStreams, +) where + // CastInto required for PBS modulus switch which returns a usize + Scalar: UnsignedTorus + CastInto, +{ + let input_lwe_dimension = input_lwe_ciphertext_list.lwe_dimension(); + let output_glwe_dimension = output_glwe_ciphertext.glwe_dimension(); + let output_polynomial_size = output_glwe_ciphertext.polynomial_size(); + + unsafe { + packing_keyswitch_list_async( + streams, + &mut output_glwe_ciphertext.0.d_vec, + &input_lwe_ciphertext_list.0.d_vec, + input_lwe_dimension, + output_glwe_dimension, + output_polynomial_size, + &lwe_pksk.d_vec, + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + input_lwe_ciphertext_list.lwe_ciphertext_count(), + ); + } +} diff --git a/tfhe/src/core_crypto/gpu/algorithms/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/mod.rs index ee6ddd2b1d..489bbb1565 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/mod.rs @@ -1,13 +1,15 @@ +pub mod glwe_sample_extraction; +pub mod lwe_keyswitch; pub mod 
lwe_linear_algebra; pub mod lwe_multi_bit_programmable_bootstrapping; +pub mod lwe_packing_keyswitch; pub mod lwe_programmable_bootstrapping; -pub mod glwe_sample_extraction; -mod lwe_keyswitch; #[cfg(test)] mod test; pub use lwe_keyswitch::*; pub use lwe_linear_algebra::*; pub use lwe_multi_bit_programmable_bootstrapping::*; +pub use lwe_packing_keyswitch::*; pub use lwe_programmable_bootstrapping::*; diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs new file mode 100644 index 0000000000..0c782862c2 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs @@ -0,0 +1,227 @@ +use super::*; +use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::CudaStreams; +use serde::de::DeserializeOwned; +use serde::Serialize; + +const NB_TESTS: usize = 10; +fn generate_keys( + params: PackingKeySwitchTestParams, + streams: &CudaStreams, + rsc: &mut TestResources, +) -> CudaPackingKeySwitchKeys { + let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key( + params.lwe_dimension, + &mut rsc.secret_random_generator, + ); + + let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key( + params.glwe_dimension, + params.polynomial_size, + &mut rsc.secret_random_generator, + ); + + let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key( + &lwe_sk, + &glwe_sk, + params.pbs_base_log, + params.pbs_level, + params.glwe_noise_distribution, + params.ciphertext_modulus, + &mut rsc.encryption_random_generator, + ); + + assert!(check_encrypted_content_respects_mod( + &pksk, + params.ciphertext_modulus + )); + + let cuda_pksk = CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key(&pksk, streams); + + CudaPackingKeySwitchKeys { + lwe_sk, + glwe_sk, + pksk: cuda_pksk, + } +} + +fn lwe_encrypt_pks_to_glwe_decrypt_custom_mod(params: P) +where + Scalar: UnsignedTorus + CastInto + Serialize + DeserializeOwned, + P: Into>, + PackingKeySwitchTestParams: KeyCacheAccess>, +{ + let params = params.into(); + + let lwe_noise_distribution = params.lwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + + let mut rsc = TestResources::new(); + + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let mut msg = msg_modulus; + let delta: Scalar = encoding_with_padding / msg_modulus; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(gpu_index); + + while msg != Scalar::ZERO { + msg = msg.wrapping_sub(Scalar::ONE); + for _ in 0..NB_TESTS { + let keys = generate_keys(params, &stream, &mut rsc); + let (pksk, lwe_sk, glwe_sk) = (keys.pksk, keys.lwe_sk, keys.glwe_sk); + + let plaintext = Plaintext(msg * delta); + + let input_lwe = allocate_and_encrypt_new_lwe_ciphertext( + &lwe_sk, + plaintext, + lwe_noise_distribution, + ciphertext_modulus, + &mut rsc.encryption_random_generator, + ); + + let d_input_lwe = CudaLweCiphertextList::from_lwe_ciphertext(&input_lwe, &stream); + + assert!(check_encrypted_content_respects_mod( + &input_lwe, + ciphertext_modulus + )); + + let mut d_output_glwe = CudaGlweCiphertextList::new( + glwe_sk.glwe_dimension(), + glwe_sk.polynomial_size(), + 
GlweCiphertextCount(1), + ciphertext_modulus, + &stream, + ); + + cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext( + &pksk, + &d_input_lwe, + &mut d_output_glwe, + &stream, + ); + + let output_glwe_list = d_output_glwe.to_glwe_ciphertext_list(&stream); + let mut decrypted_plaintext_list = PlaintextList::new( + Scalar::ZERO, + PlaintextCount(output_glwe_list.polynomial_size().0), + ); + + decrypt_glwe_ciphertext_list( + &glwe_sk, + &output_glwe_list, + &mut decrypted_plaintext_list, + ); + let decoded = round_decode(*decrypted_plaintext_list.get(0).0, delta) % msg_modulus; + + assert_eq!(msg, decoded); + } + + // In coverage, we break after one while loop iteration, changing message values does not + // yield higher coverage + #[cfg(tarpaulin)] + break; + } +} + +fn lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod(params: P) +where + Scalar: UnsignedTorus + CastInto + Serialize + DeserializeOwned, + P: Into>, + PackingKeySwitchTestParams: KeyCacheAccess>, +{ + let params = params.into(); + + let lwe_noise_distribution = params.lwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + + let mut rsc = TestResources::new(); + + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let mut msg = msg_modulus; + let delta: Scalar = encoding_with_padding / msg_modulus; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(gpu_index); + + // while msg != Scalar::ZERO { + msg = msg.wrapping_sub(Scalar::ONE); + for _ in 0..NB_TESTS { + let keys = generate_keys(params, &stream, &mut rsc); + let (pksk, lwe_sk, glwe_sk) = (keys.pksk, keys.lwe_sk, keys.glwe_sk); + + let mut input_lwe_list = LweCiphertextList::new( + Scalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), + LweCiphertextCount(glwe_sk.polynomial_size().0), + ciphertext_modulus, + ); + + let mut input_plaintext_list = + PlaintextList::new(msg * delta, PlaintextCount(glwe_sk.polynomial_size().0)); + + encrypt_lwe_ciphertext_list( + &lwe_sk, + &mut input_lwe_list, + &input_plaintext_list, + lwe_noise_distribution, + &mut rsc.encryption_random_generator, + ); + + let d_input_lwe_list = + CudaLweCiphertextList::from_lwe_ciphertext_list(&input_lwe_list, &stream); + + assert!(check_encrypted_content_respects_mod( + &input_lwe_list, + ciphertext_modulus + )); + + let mut d_output_glwe = CudaGlweCiphertextList::new( + glwe_sk.glwe_dimension(), + glwe_sk.polynomial_size(), + GlweCiphertextCount(1), + ciphertext_modulus, + &stream, + ); + + cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext( + &pksk, + &d_input_lwe_list, + &mut d_output_glwe, + &stream, + ); + + let output_glwe_list = d_output_glwe.to_glwe_ciphertext_list(&stream); + + let mut decrypted_plaintext_list = PlaintextList::new( + Scalar::ZERO, + PlaintextCount(output_glwe_list.polynomial_size().0), + ); + + decrypt_glwe_ciphertext_list(&glwe_sk, &output_glwe_list, &mut decrypted_plaintext_list); + + decrypted_plaintext_list + .iter_mut() + .for_each(|x| *x.0 = round_decode(*x.0, delta) % msg_modulus); + input_plaintext_list.iter_mut().for_each(|x| *x.0 /= delta); + + assert_eq!(decrypted_plaintext_list, input_plaintext_list); + } + + // In coverage, we break after one while loop iteration, changing message values does not + // yield higher coverage + #[cfg(tarpaulin)] + break; + // } +} + +create_gpu_parametrized_test!(lwe_encrypt_pks_to_glwe_decrypt_custom_mod); 
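Editor's note: the safe wrapper exercised by these tests forwards to the asynchronous binding added in core_crypto/gpu/mod.rs; a hedged sketch of calling it directly is given below, mainly to make the synchronization contract explicit. The helper name and the assumption that the CudaVec buffers are already populated are illustrative; the function name, argument order and types come from this patch.

use crate::core_crypto::gpu::vec::CudaVec;
use crate::core_crypto::gpu::{packing_keyswitch_list_async, CudaStreams};
use crate::core_crypto::prelude::{
    DecompositionBaseLog, DecompositionLevelCount, GlweDimension, LweCiphertextCount,
    LweDimension, PolynomialSize,
};

/// Illustrative sketch only: run the raw packing keyswitch asynchronously and
/// synchronize before the output GLWE buffer is read back.
fn packing_keyswitch_example(
    streams: &CudaStreams,
    d_glwe_out: &mut CudaVec<u64>,
    d_lwe_in: &CudaVec<u64>,
    d_pksk: &CudaVec<u64>,
    input_lwe_dimension: LweDimension,
    output_glwe_dimension: GlweDimension,
    output_polynomial_size: PolynomialSize,
    base_log: DecompositionBaseLog,
    level_count: DecompositionLevelCount,
    num_lwes: LweCiphertextCount,
) {
    unsafe {
        packing_keyswitch_list_async(
            streams,
            d_glwe_out,
            d_lwe_in,
            input_lwe_dimension,
            output_glwe_dimension,
            output_polynomial_size,
            d_pksk,
            base_log,
            level_count,
            num_lwes,
        );
    }
    // The call is asynchronous; the result is only valid after synchronization.
    streams.synchronize();
}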
+create_gpu_parametrized_test!(lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod); diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs index 59ba20af18..779a419702 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs @@ -4,8 +4,15 @@ mod glwe_sample_extraction; mod lwe_keyswitch; mod lwe_linear_algebra; mod lwe_multi_bit_programmable_bootstrapping; +mod lwe_packing_keyswitch; mod lwe_programmable_bootstrapping; +pub struct CudaPackingKeySwitchKeys { + pub lwe_sk: LweSecretKey>, + pub glwe_sk: GlweSecretKey>, + pub pksk: CudaLwePackingKeyswitchKey, +} + // Macro to generate tests for all parameter sets macro_rules! create_gpu_parametrized_test{ ($name:ident { $($param:ident),* }) => { @@ -47,4 +54,5 @@ macro_rules! create_gpu_multi_bit_parametrized_test{ }; } +use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; use {create_gpu_multi_bit_parametrized_test, create_gpu_parametrized_test}; diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs index 9defa579f6..1fde0545bf 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs @@ -4,6 +4,7 @@ use crate::core_crypto::prelude::{ CiphertextModulus, Container, LweCiphertext, LweCiphertextCount, LweCiphertextList, LweDimension, LweSize, UnsignedInteger, }; +use std::cmp::min; use tfhe_cuda_backend::cuda_bind::cuda_memcpy_async_gpu_to_gpu; /// A structure representing a vector of LWE ciphertexts with 64 bits of precision on the GPU. @@ -199,49 +200,60 @@ impl CudaLweCiphertextList { LweCiphertext::from_container(container, self.ciphertext_modulus()) } - /// ```rust - /// use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; - /// use tfhe::core_crypto::gpu::CudaStreams; - /// use tfhe::core_crypto::prelude::{ - /// CiphertextModulus, LweCiphertextCount, LweCiphertextList, LweSize, - /// }; - /// - /// let mut streams = CudaStreams::new_single_gpu(0); - /// - /// let lwe_size = LweSize(743); - /// let ciphertext_modulus = CiphertextModulus::new_native(); - /// let lwe_ciphertext_count = LweCiphertextCount(2); - /// - /// // Create a new LweCiphertextList - /// let lwe_list = LweCiphertextList::new(0u64, lwe_size, lwe_ciphertext_count, ciphertext_modulus); - /// - /// // Copy to GPU - /// let d_lwe_list = CudaLweCiphertextList::from_lwe_ciphertext_list(&lwe_list, &mut streams); - /// let d_lwe_list_copied = d_lwe_list.duplicate(&mut streams); - /// - /// let lwe_list_copied = d_lwe_list_copied.to_lwe_ciphertext_list(&mut streams); - /// - /// assert_eq!(lwe_list, lwe_list_copied); - /// ``` - pub fn duplicate(&self, streams: &CudaStreams) -> Self { - let lwe_dimension = self.lwe_dimension(); - let lwe_ciphertext_count = self.lwe_ciphertext_count(); - let ciphertext_modulus = self.ciphertext_modulus(); + // Retrieve [start,end) lwe ciphertexts in the list + pub fn retrieve_subset( + &self, + start: usize, + end: usize, + streams: &CudaStreams, + gpu_index: u32, + ) -> Option { + if end < start { + None + } else { + let lwe_dimension = self.lwe_dimension(); + let lwe_ciphertext_count = + LweCiphertextCount(min(end - start, self.lwe_ciphertext_count().0)); + let ciphertext_modulus = self.ciphertext_modulus(); - // Copy to the GPU - let mut d_vec = CudaVec::new(self.0.d_vec.len(), streams, 0); - unsafe { - 
d_vec.copy_from_gpu_async(&self.0.d_vec, streams, 0); - } - streams.synchronize(); + // Copy to the GPU + let d_vec = unsafe { + let mut d_vec = CudaVec::new_async( + lwe_ciphertext_count.0 * lwe_dimension.to_lwe_size().0, + streams, + gpu_index, + ); + // Todo: We might use copy_src_range_gpu_to_gpu_async here + let src_ptr = self + .0 + .d_vec + .as_c_ptr(gpu_index) + .add(start * lwe_dimension.to_lwe_size().0 * std::mem::size_of::()); + let size = lwe_ciphertext_count.0 + * lwe_dimension.to_lwe_size().0 + * std::mem::size_of::(); + cuda_memcpy_async_gpu_to_gpu( + d_vec.as_mut_c_ptr(gpu_index), + src_ptr, + size as u64, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); - let cuda_lwe_list = CudaLweList { - d_vec, - lwe_ciphertext_count, - lwe_dimension, - ciphertext_modulus, - }; - Self(cuda_lwe_list) + d_vec + }; + + streams.synchronize(); + + let cuda_lwe_list = CudaLweList { + d_vec, + lwe_ciphertext_count, + lwe_dimension, + ciphertext_modulus, + }; + + Some(Self(cuda_lwe_list)) + } } pub(crate) fn lwe_dimension(&self) -> LweDimension { diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs b/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs new file mode 100644 index 0000000000..0c6664aa59 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs @@ -0,0 +1,84 @@ +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::gpu::{convert_lwe_keyswitch_key_async, CudaStreams}; +use crate::core_crypto::prelude::{ + lwe_packing_keyswitch_key_input_key_element_encrypted_size, CiphertextModulus, + DecompositionBaseLog, DecompositionLevelCount, GlweSize, LweDimension, + LwePackingKeyswitchKeyOwned, PolynomialSize, UnsignedInteger, +}; + +#[derive(Debug)] +pub struct CudaLwePackingKeyswitchKey { + pub(crate) d_vec: CudaVec, + decomp_base_log: DecompositionBaseLog, + decomp_level_count: DecompositionLevelCount, + output_glwe_size: GlweSize, + output_polynomial_size: PolynomialSize, + ciphertext_modulus: CiphertextModulus, +} + +impl CudaLwePackingKeyswitchKey { + pub fn from_lwe_packing_keyswitch_key( + h_ksk: &LwePackingKeyswitchKeyOwned, + streams: &CudaStreams, + ) -> Self { + let decomp_base_log = h_ksk.decomposition_base_log(); + let decomp_level_count = h_ksk.decomposition_level_count(); + let input_lwe_size = h_ksk.input_key_lwe_dimension().to_lwe_size(); + let output_glwe_size = h_ksk.output_key_glwe_dimension().to_glwe_size(); + let output_polynomial_size = h_ksk.output_polynomial_size(); + let ciphertext_modulus = h_ksk.ciphertext_modulus(); + + // Allocate memory + let mut d_vec = CudaVec::::new_multi_gpu( + input_lwe_size.to_lwe_dimension().0 + * lwe_packing_keyswitch_key_input_key_element_encrypted_size( + decomp_level_count, + output_glwe_size, + output_polynomial_size, + ), + streams, + ); + + unsafe { + convert_lwe_keyswitch_key_async(streams, &mut d_vec, h_ksk.as_ref()); + } + + streams.synchronize(); + + Self { + d_vec, + decomp_base_log, + decomp_level_count, + output_glwe_size, + output_polynomial_size, + ciphertext_modulus, + } + } + + pub(crate) fn decomposition_base_log(&self) -> DecompositionBaseLog { + self.decomp_base_log + } + pub(crate) fn decomposition_level_count(&self) -> DecompositionLevelCount { + self.decomp_level_count + } + + pub(crate) fn output_glwe_size(&self) -> GlweSize { + self.output_glwe_size + } + pub(crate) fn ciphertext_modulus(&self) -> CiphertextModulus { + self.ciphertext_modulus + } + pub(crate) fn output_polynomial_size(&self) -> 
PolynomialSize { + self.output_polynomial_size + } + pub fn input_key_lwe_dimension(&self) -> LweDimension { + LweDimension( + self.d_vec.len + / lwe_packing_keyswitch_key_input_key_element_encrypted_size( + self.decomp_level_count, + self.output_glwe_size, + self.output_polynomial_size, + ), + ) + } +} diff --git a/tfhe/src/core_crypto/gpu/entities/mod.rs b/tfhe/src/core_crypto/gpu/entities/mod.rs index 34dcb5f03e..623bf42497 100644 --- a/tfhe/src/core_crypto/gpu/entities/mod.rs +++ b/tfhe/src/core_crypto/gpu/entities/mod.rs @@ -3,3 +3,4 @@ pub mod lwe_bootstrap_key; pub mod lwe_ciphertext_list; pub mod lwe_keyswitch_key; pub mod lwe_multi_bit_bootstrap_key; +pub mod lwe_packing_keyswitch_key; diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 3341ba6e5e..40f9d11e0f 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -13,7 +13,6 @@ pub use algorithms::*; pub use entities::*; use std::ffi::c_void; pub(crate) use tfhe_cuda_backend::cuda_bind::*; - #[derive(Debug)] pub struct CudaStreams { pub ptr: Vec<*mut c_void>, @@ -260,6 +259,40 @@ pub unsafe fn convert_lwe_keyswitch_key_async( dest.copy_from_cpu_multi_gpu_async(src, streams); } +/// Discarding packing keyswitch on a vector of LWE ciphertexts +/// +/// # Safety +/// +/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is +/// required +#[allow(clippy::too_many_arguments)] +pub unsafe fn packing_keyswitch_list_async( + streams: &CudaStreams, + glwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, + input_lwe_dimension: LweDimension, + output_glwe_dimension: GlweDimension, + output_polynomial_size: PolynomialSize, + fp_keyswitch_key: &CudaVec, + base_log: DecompositionBaseLog, + l_gadget: DecompositionLevelCount, + num_lwes: LweCiphertextCount, +) { + cuda_fp_keyswitch_lwe_list_to_glwe_64( + streams.ptr[0], + streams.gpu_indexes[0], + glwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + fp_keyswitch_key.as_c_ptr(0), + input_lwe_dimension.0 as u32, + output_glwe_dimension.0 as u32, + output_polynomial_size.0 as u32, + base_log.0 as u32, + l_gadget.0 as u32, + num_lwes.0 as u32, + ); +} + /// Convert programmable bootstrap key /// /// # Safety diff --git a/tfhe/src/integer/client_key/radix.rs b/tfhe/src/integer/client_key/radix.rs index 36333ddbc2..fce78b2335 100644 --- a/tfhe/src/integer/client_key/radix.rs +++ b/tfhe/src/integer/client_key/radix.rs @@ -1,11 +1,22 @@ //! 
Definition of the client key for radix decomposition use super::{ClientKey, RecomposableSignedInteger, SecretEncryptionKeyView}; +#[cfg(feature = "gpu")] +use crate::core_crypto::gpu::CudaStreams; use crate::core_crypto::prelude::{SignedNumeric, UnsignedNumeric}; use crate::integer::backward_compatibility::client_key::RadixClientKeyVersions; use crate::integer::block_decomposition::{DecomposableInto, RecomposableFrom}; use crate::integer::ciphertext::{RadixCiphertext, SignedRadixCiphertext}; +#[cfg(feature = "gpu")] +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; use crate::integer::BooleanBlock; +use crate::shortint::list_compression::{ + CompressedCompressionKey, CompressedDecompressionKey, CompressionKey, CompressionPrivateKeys, + DecompressionKey, +}; +use crate::shortint::parameters::CompressionParameters; use crate::shortint::{Ciphertext as ShortintCiphertext, PBSParameters as ShortintParameters}; use serde::{Deserialize, Serialize}; use tfhe_versionable::Versionize; @@ -131,6 +142,40 @@ impl RadixClientKey { pub fn num_blocks(&self) -> usize { self.num_blocks } + + pub fn new_compression_private_key( + &self, + params: CompressionParameters, + ) -> CompressionPrivateKeys { + self.key.key.new_compression_private_key(params) + } + + pub fn new_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + ) -> (CompressionKey, DecompressionKey) { + self.key + .key + .new_compression_decompression_keys(private_compression_key) + } + pub fn new_compressed_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + ) -> (CompressedCompressionKey, CompressedDecompressionKey) { + self.key + .key + .new_compressed_compression_decompression_keys(private_compression_key) + } + #[cfg(feature = "gpu")] + pub fn new_cuda_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + streams: &CudaStreams, + ) -> (CudaCompressionKey, CudaDecompressionKey) { + self.key + .key + .new_cuda_compression_decompression_keys(private_compression_key, streams) + } } impl From<(ClientKey, usize)> for RadixClientKey { diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs new file mode 100644 index 0000000000..f734d59279 --- /dev/null +++ b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs @@ -0,0 +1,205 @@ +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::CiphertextModulusLog; +use crate::integer::ciphertext::DataKind; +use crate::integer::gpu::ciphertext::info::CudaBlockInfo; +use crate::integer::gpu::ciphertext::{ + CudaRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext, +}; +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; + +pub struct CudaCompressedCiphertextList { + pub(crate) packed_list: ( + CudaGlweCiphertextList, + Vec, + CiphertextModulusLog, + ), + info: Vec, +} +impl CudaCompressedCiphertextList { + pub fn len(&self) -> usize { + self.info.len() + } + + pub fn is_empty(&self) -> bool { + self.info.len() == 0 + } + + pub fn get( + &self, + index: usize, + decomp_key: &CudaDecompressionKey, + streams: &CudaStreams, + gpu_index: u32, + ) -> CudaRadixCiphertext +where { + let preceding_infos = self.info.get(..index).unwrap(); + let current_info = 
self.info.get(index).copied().unwrap(); + + let start_block_index: usize = preceding_infos + .iter() + .copied() + .map(DataKind::num_blocks) + .sum(); + + let end_block_index = start_block_index + current_info.num_blocks() - 1; + + decomp_key.unpack( + &self.packed_list, + start_block_index, + end_block_index, + streams, + gpu_index, + ) + } +} + +pub trait CudaCompressible { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind; +} + +// Todo: Can we combine these two impl using CudaIntegerRadixCiphertext? +impl CudaCompressible for CudaSignedRadixCiphertext { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.ciphertext.duplicate(streams); + + let copy = x.duplicate(streams); + messages.push(copy); + + let num_blocks = x.d_blocks.lwe_ciphertext_count().0; + DataKind::Signed(num_blocks) + } +} +impl CudaCompressible for CudaUnsignedRadixCiphertext { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.ciphertext.duplicate(streams); + + let copy = x.duplicate(streams); + messages.push(copy); + + let num_blocks = x.d_blocks.lwe_ciphertext_count().0; + + DataKind::Unsigned(num_blocks) + } +} + +pub struct CudaCompressedCiphertextListBuilder { + pub(crate) ciphertexts: Vec, + pub(crate) info: Vec, +} + +impl CudaCompressedCiphertextListBuilder { + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + Self { + ciphertexts: vec![], + info: vec![], + } + } + + pub fn push(&mut self, data: T, streams: &CudaStreams) -> &mut Self { + let kind = data.compress_into(&mut self.ciphertexts, streams); + + if kind.num_blocks() != 0 { + self.info.push(kind); + } + + self + } + + pub fn build( + &self, + comp_key: &CudaCompressionKey, + streams: &CudaStreams, + gpu_index: u32, + ) -> CudaCompressedCiphertextList { + let packed_list = + comp_key.compress_ciphertexts_into_list(&self.ciphertexts, streams, gpu_index); + + CudaCompressedCiphertextList { + packed_list, + info: self.info.clone(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::integer::gpu::gen_keys_radix_gpu; + use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + use crate::shortint::prelude::PARAM_MESSAGE_2_CARRY_2_KS_PBS; + + #[test] + fn test_gpu_ciphertext_compression() { + let gpu_index = 0; + let streams = CudaStreams::new_single_gpu(gpu_index); + + let num_blocks = 4; + let (cks, _) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, num_blocks, &streams); + + let private_compression_key = + cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64); + + let (compressed_compression_key, compressed_decompression_key) = + cks.new_compressed_compression_decompression_keys(&private_compression_key); + + let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams); + let cuda_decompression_key = + compressed_decompression_key.decompress_to_cuda(cks.parameters(), &streams); + + let ct1 = cks.encrypt(3_u32); + let ct2 = cks.encrypt(2_u32); + let ct3 = cks.encrypt_signed(-2); + + // Copy to GPU + let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams); + let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &streams); + let d_ct3 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct3, &streams); + + let cuda_compressed = CudaCompressedCiphertextListBuilder::new() + .push(d_ct1, &streams) + .push(d_ct2, &streams) + .push(d_ct3, 
&streams) + .build(&cuda_compression_key, &streams, gpu_index); + + let d_decompressed1 = CudaUnsignedRadixCiphertext { + ciphertext: cuda_compressed.get(0, &cuda_decompression_key, &streams, 0), + }; + + let decompressed1 = d_decompressed1.to_radix_ciphertext(&streams); + let decrypted: u32 = cks.decrypt(&decompressed1); + + assert_eq!(decrypted, 3_u32); + let d_decompressed2 = CudaUnsignedRadixCiphertext { + ciphertext: cuda_compressed.get(1, &cuda_decompression_key, &streams, 0), + }; + + let decompressed2 = d_decompressed2.to_radix_ciphertext(&streams); + let decrypted: u32 = cks.decrypt(&decompressed2); + + assert_eq!(decrypted, 2_u32); + let d_decompressed3 = CudaSignedRadixCiphertext { + ciphertext: cuda_compressed.get(2, &cuda_decompression_key, &streams, 0), + }; + + let decompressed3 = d_decompressed3.to_signed_radix_ciphertext(&streams); + let decrypted: i32 = cks.decrypt_signed(&decompressed3); + + assert_eq!(decrypted, -2); + } +} diff --git a/tfhe/src/integer/gpu/ciphertext/mod.rs b/tfhe/src/integer/gpu/ciphertext/mod.rs index 7f0408eddb..51e702d0c8 100644 --- a/tfhe/src/integer/gpu/ciphertext/mod.rs +++ b/tfhe/src/integer/gpu/ciphertext/mod.rs @@ -1,4 +1,5 @@ pub mod boolean_value; +pub mod compressed_ciphertext_list; pub mod info; use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; diff --git a/tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs b/tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs new file mode 100644 index 0000000000..0a89c6efba --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs @@ -0,0 +1,51 @@ +use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey; +use crate::core_crypto::gpu::CudaStreams; +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; +use crate::integer::gpu::server_key::CudaBootstrappingKey; +use crate::shortint::list_compression::{ + CompressedCompressionKey, CompressedDecompressionKey, CompressionKey, +}; +use crate::shortint::PBSParameters; + +impl CompressedDecompressionKey { + pub fn decompress_to_cuda( + &self, + parameters: PBSParameters, + streams: &CudaStreams, + ) -> CudaDecompressionKey { + let h_bootstrap_key = self + .blind_rotate_key + .as_view() + .par_decompress_into_lwe_bootstrap_key(); + + let d_bootstrap_key = + CudaLweBootstrapKey::from_lwe_bootstrap_key(&h_bootstrap_key, streams); + + let blind_rotate_key = CudaBootstrappingKey::Classic(d_bootstrap_key); + + CudaDecompressionKey { + blind_rotate_key, + lwe_per_glwe: self.lwe_per_glwe, + parameters, + } + } +} + +impl CompressedCompressionKey { + pub fn decompress_to_cuda(&self, streams: &CudaStreams) -> CudaCompressionKey { + let packing_key_switching_key = self + .packing_key_switching_key + .as_view() + .decompress_into_lwe_packing_keyswitch_key(); + + let glwe_compression_key = CompressionKey { + packing_key_switching_key, + lwe_per_glwe: self.lwe_per_glwe, + storage_log_modulus: self.storage_log_modulus, + }; + + CudaCompressionKey::from_compression_key(&glwe_compression_key, streams) + } +} diff --git a/tfhe/src/integer/gpu/list_compression/mod.rs b/tfhe/src/integer/gpu/list_compression/mod.rs new file mode 100644 index 0000000000..8cb3b2e38c --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/mod.rs @@ -0,0 +1,2 @@ +pub mod compressed_server_keys; +pub mod server_keys; diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs new file mode 100644 index 
0000000000..44193c3ee4 --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs @@ -0,0 +1,328 @@ +use crate::core_crypto::gpu::entities::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::{ + allocate_and_generate_new_lwe_packing_keyswitch_key, + par_allocate_and_generate_new_lwe_bootstrap_key, CiphertextModulusLog, GlweCiphertextCount, + LweBootstrapKeyOwned, LweCiphertextCount, LwePackingKeyswitchKey, +}; +use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; +use crate::integer::gpu::ciphertext::CudaRadixCiphertext; +use crate::integer::gpu::server_key::CudaBootstrappingKey; +use crate::integer::gpu::{ + compression_compress_integer_radix_async, compression_decompress_integer_radix_async, + cuda_memcpy_async_gpu_to_gpu, +}; +use crate::integer::parameters::{GlweDimension, PolynomialSize}; +use crate::shortint::client_key::ClientKey; +use crate::shortint::engine::ShortintEngine; +use crate::shortint::list_compression::{CompressionKey, CompressionPrivateKeys}; +use crate::shortint::{ClassicPBSParameters, EncryptionKeyChoice, PBSParameters}; +use itertools::Itertools; + +#[derive(Debug)] +pub struct CudaCompressionKey { + pub packing_key_switching_key: CudaLwePackingKeyswitchKey, + pub h_packing_key_switching_key: LwePackingKeyswitchKey>, + pub lwe_per_glwe: LweCiphertextCount, + pub storage_log_modulus: CiphertextModulusLog, +} + +pub struct CudaDecompressionKey { + pub blind_rotate_key: CudaBootstrappingKey, + pub lwe_per_glwe: LweCiphertextCount, + pub parameters: PBSParameters, +} + +impl CudaCompressionKey { + pub fn from_compression_key(compression_key: &CompressionKey, streams: &CudaStreams) -> Self { + Self { + packing_key_switching_key: CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key( + &compression_key.packing_key_switching_key, + streams, + ), + h_packing_key_switching_key: compression_key.clone().packing_key_switching_key, + lwe_per_glwe: compression_key.lwe_per_glwe, + storage_log_modulus: compression_key.storage_log_modulus, + } + } + + fn flatten( + vec_ciphertexts: &Vec, + streams: &CudaStreams, + gpu_index: u32, + ) -> (CudaLweCiphertextList, Vec) { + let first_ct = &vec_ciphertexts.first().unwrap().d_blocks; + + // We assume all ciphertexts will have the same lwe dimension + let lwe_dimension = first_ct.lwe_dimension(); + let ciphertext_modulus = first_ct.ciphertext_modulus(); + + // Compute total number of lwe ciphertexts we will be handling + let total_num_blocks: usize = vec_ciphertexts + .iter() + .map(|x| x.d_blocks.lwe_ciphertext_count().0) + .sum(); + + let lwe_ciphertext_count = LweCiphertextCount(total_num_blocks); + + let d_vec = unsafe { + let mut d_vec = CudaVec::new_async( + lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0, + streams, + gpu_index, + ); + let mut offset: usize = 0; + for ciphertext in vec_ciphertexts { + // Todo: We might use copy_self_range_gpu_to_gpu_async here + let dest_ptr = d_vec + .as_mut_c_ptr(gpu_index) + .add(offset * std::mem::size_of::()); + let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::(); + cuda_memcpy_async_gpu_to_gpu( + dest_ptr, + ciphertext.d_blocks.0.d_vec.as_c_ptr(gpu_index), + size as u64, + 
streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); + + offset += ciphertext.d_blocks.0.d_vec.len; + } + + streams.synchronize(); + d_vec + }; + + let flattened_ciphertexts = + CudaLweCiphertextList::from_cuda_vec(d_vec, lwe_ciphertext_count, ciphertext_modulus); + + let info = vec_ciphertexts + .iter() + .flat_map(|x| x.info.blocks.clone()) + .collect_vec(); + + (flattened_ciphertexts, info) + } + + pub fn compress_ciphertexts_into_list( + &self, + ciphertexts: &Vec, + streams: &CudaStreams, + gpu_index: u32, + ) -> ( + CudaGlweCiphertextList, + Vec, + CiphertextModulusLog, + ) { + let lwe_pksk = &self.packing_key_switching_key; + + let ciphertext_modulus = lwe_pksk.ciphertext_modulus(); + let compress_polynomial_size = lwe_pksk.output_polynomial_size(); + let compress_glwe_size = lwe_pksk.output_glwe_size(); + + let first_ct = ciphertexts.first().unwrap(); + let first_ct_info = first_ct.info.blocks.first().unwrap(); + let message_modulus = first_ct_info.message_modulus; + let carry_modulus = first_ct_info.carry_modulus; + + let lwe_dimension = first_ct.d_blocks.lwe_dimension(); + + let num_lwes: usize = ciphertexts + .iter() + .map(|x| x.d_blocks.lwe_ciphertext_count().0) + .sum(); + + let mut output_glwe = CudaGlweCiphertextList::new( + compress_glwe_size.to_glwe_dimension(), + compress_polynomial_size, + GlweCiphertextCount(ciphertexts.len()), + ciphertext_modulus, + streams, + ); + + let (input_lwes, info) = Self::flatten(ciphertexts, streams, gpu_index); + + unsafe { + compression_compress_integer_radix_async( + streams, + &mut output_glwe.0.d_vec, + &input_lwes.0.d_vec, + &self.packing_key_switching_key.d_vec, + message_modulus, + carry_modulus, + GlweDimension(0), + PolynomialSize(0), + compress_glwe_size.to_glwe_dimension(), + compress_polynomial_size, + lwe_dimension, + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + self.lwe_per_glwe.0 as u32, + self.storage_log_modulus.0 as u32, + num_lwes as u32, + ); + } + + (output_glwe, info, self.storage_log_modulus) + } +} + +impl CudaDecompressionKey { + pub fn unpack( + &self, + packed_list: &( + CudaGlweCiphertextList, + Vec, + CiphertextModulusLog, + ), + start_block_index: usize, + end_block_index: usize, + streams: &CudaStreams, + gpu_index: u32, + ) -> CudaRadixCiphertext { + let indexes_array = (start_block_index..=end_block_index) + .map(|x| x as u32) + .collect_vec(); + + let encryption_glwe_dimension = self.parameters.glwe_dimension(); + let encryption_polynomial_size = self.parameters.polynomial_size(); + let compression_glwe_dimension = packed_list.0.glwe_dimension(); + let compression_polynomial_size = packed_list.0.polynomial_size(); + let lwe_ciphertext_count = LweCiphertextCount(end_block_index - start_block_index + 1); + let message_modulus = self.parameters.message_modulus(); + let carry_modulus = self.parameters.carry_modulus(); + let ciphertext_modulus = self.parameters.ciphertext_modulus(); + let storage_log_modulus = packed_list.2; + + match &self.blind_rotate_key { + CudaBootstrappingKey::Classic(bsk) => { + let lwe_dimension = bsk.output_lwe_dimension(); + + let mut output_lwe = CudaLweCiphertextList::new( + lwe_dimension, + lwe_ciphertext_count, + ciphertext_modulus, + streams, + ); + + unsafe { + let d_indexes_array = + CudaVec::from_cpu_async(indexes_array.as_slice(), streams, gpu_index); + + compression_decompress_integer_radix_async( + streams, + &mut output_lwe.0.d_vec, + &packed_list.0 .0.d_vec, + &bsk.d_vec, + message_modulus, + carry_modulus, + 
encryption_glwe_dimension, + encryption_polynomial_size, + compression_glwe_dimension, + compression_polynomial_size, + lwe_dimension, + bsk.decomp_base_log(), + bsk.decomp_level_count(), + self.lwe_per_glwe.0 as u32, + storage_log_modulus.0 as u32, + &d_indexes_array, + lwe_ciphertext_count.0 as u32, + ); + } + + CudaRadixCiphertext { + d_blocks: output_lwe, + info: CudaRadixCiphertextInfo { + blocks: packed_list.1.clone(), + }, + } + } + CudaBootstrappingKey::MultiBit(_) => { + panic! {"Unsupported PBS"} + } + } + } +} + +impl ClientKey { + pub fn new_cuda_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + streams: &CudaStreams, + ) -> (CudaCompressionKey, CudaDecompressionKey) { + let params = &private_compression_key.params; + let cks_params: ClassicPBSParameters = match self.parameters.pbs_parameters().unwrap() { + PBSParameters::PBS(a) => a, + PBSParameters::MultiBitPBS(_) => { + panic!("Compression is currently not compatible with Multi Bit PBS") + } + }; + + assert_eq!( + cks_params.encryption_key_choice, + EncryptionKeyChoice::Big, + "Compression is only compatible with ciphertext in post PBS dimension" + ); + + let packing_key_switching_key = ShortintEngine::with_thread_local_mut(|engine| { + allocate_and_generate_new_lwe_packing_keyswitch_key( + &self.large_lwe_secret_key(), + &private_compression_key.post_packing_ks_key, + params.packing_ks_base_log, + params.packing_ks_level, + params.packing_ks_key_noise_distribution, + self.parameters.ciphertext_modulus(), + &mut engine.encryption_generator, + ) + }); + + assert!( + private_compression_key.params.storage_log_modulus.0 + <= cks_params + .polynomial_size + .to_blind_rotation_input_modulus_log() + .0, + "Compression parameters say to store more bits than useful" + ); + + let glwe_compression_key = CompressionKey { + packing_key_switching_key, + lwe_per_glwe: params.lwe_per_glwe, + storage_log_modulus: private_compression_key.params.storage_log_modulus, + }; + + let mut engine = ShortintEngine::new(); + let h_bootstrap_key: LweBootstrapKeyOwned = + par_allocate_and_generate_new_lwe_bootstrap_key( + &private_compression_key + .post_packing_ks_key + .as_lwe_secret_key(), + &self.glwe_secret_key, + private_compression_key.params.br_base_log, + private_compression_key.params.br_level, + self.parameters.glwe_noise_distribution(), + self.parameters.ciphertext_modulus(), + &mut engine.encryption_generator, + ); + + let d_bootstrap_key = + CudaLweBootstrapKey::from_lwe_bootstrap_key(&h_bootstrap_key, streams); + + let blind_rotate_key = CudaBootstrappingKey::Classic(d_bootstrap_key); + + let cuda_glwe_decompression_key = CudaDecompressionKey { + blind_rotate_key, + lwe_per_glwe: params.lwe_per_glwe, + parameters: self.parameters.pbs_parameters().unwrap(), + }; + + ( + CudaCompressionKey::from_compression_key(&glwe_compression_key, streams), + cuda_glwe_decompression_key, + ) + } +} diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index bb2151caa2..62669adfe1 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -1,6 +1,6 @@ pub mod ciphertext; +pub mod list_compression; pub mod server_key; - use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut}; use crate::core_crypto::gpu::vec::CudaVec; use crate::core_crypto::gpu::CudaStreams; @@ -12,6 +12,7 @@ use crate::integer::{ClientKey, RadixClientKey}; use crate::shortint::{CarryModulus, MessageModulus}; pub use server_key::CudaServerKey; use std::cmp::min; + use tfhe_cuda_backend::cuda_bind::*; 
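Taken together, the client-key entry points added above (`new_compression_private_key`, `new_cuda_compression_decompression_keys`), the builder, and `CudaCompressedCiphertextList::get` combine as in the minimal sketch below. It is written as if it lived in the crate's own test module, mirroring the unit test earlier in this patch; the module paths and the construction of `CudaUnsignedRadixCiphertext` from its `ciphertext` field are assumptions carried over from that test, not a stable public API.

```rust
use crate::core_crypto::gpu::CudaStreams;
use crate::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder;
use crate::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext;
use crate::integer::gpu::gen_keys_radix_gpu;
use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64;
use crate::shortint::prelude::PARAM_MESSAGE_2_CARRY_2_KS_PBS;

// Sketch (not part of the patch): compress one radix ciphertext on the GPU and get it back,
// using the direct (non-compressed) key-generation path added to RadixClientKey.
fn gpu_compression_round_trip() {
    let streams = CudaStreams::new_single_gpu(0);
    let (cks, _sks) = gen_keys_radix_gpu(PARAM_MESSAGE_2_CARRY_2_KS_PBS, 4, &streams);

    // Key generation: private packing key, then CUDA compression/decompression keys.
    let private_compression_key =
        cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64);
    let (compression_key, decompression_key) =
        cks.new_cuda_compression_decompression_keys(&private_compression_key, &streams);

    // Pack a ciphertext into the GLWE-backed list, then unpack entry 0.
    let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&cks.encrypt(3_u32), &streams);
    let mut builder = CudaCompressedCiphertextListBuilder::new();
    builder.push(d_ct, &streams);
    let compressed = builder.build(&compression_key, &streams, 0);

    let d_out = CudaUnsignedRadixCiphertext {
        ciphertext: compressed.get(0, &decompression_key, &streams, 0),
    };
    let out: u32 = cks.decrypt(&d_out.to_radix_ciphertext(&streams));
    assert_eq!(out, 3);
}
```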
#[repr(u32)] @@ -32,6 +33,13 @@ pub enum PBSType { Classical = 1, } +#[allow(dead_code)] +#[repr(u32)] +pub enum CompressionType { + Compress = 0, + Decompress = 1, +} + #[repr(u32)] pub enum ShiftRotateType { LeftShift = 0, @@ -273,6 +281,164 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async( + streams: &CudaStreams, + glwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, + fp_keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + encryption_glwe_dimension: GlweDimension, + encryption_polynomial_size: PolynomialSize, + compression_glwe_dimension: GlweDimension, + compression_polynomial_size: PolynomialSize, + lwe_dimension: LweDimension, + ks_base_log: DecompositionBaseLog, + ks_level: DecompositionLevelCount, + lwe_per_glwe: u32, + storage_log_modulus: u32, + num_blocks: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + lwe_array_in.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + fp_keyswitch_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + encryption_glwe_dimension.0 as u32, + encryption_polynomial_size.0 as u32, + compression_glwe_dimension.0 as u32, + compression_polynomial_size.0 as u32, + lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + 0u32, + 0u32, + 0u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + PBSType::Classical as u32, + lwe_per_glwe, + storage_log_modulus, + CompressionType::Compress as u32, + true, + ); + + cuda_compression_compress_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + glwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + fp_keyswitch_key.ptr.as_ptr(), + num_blocks, + mem_ptr, + ); + + cleanup_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub unsafe fn compression_decompress_integer_radix_async( + streams: &CudaStreams, + lwe_array_out: &mut CudaVec, + glwe_in: &CudaVec, + bootstrapping_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + encryption_glwe_dimension: GlweDimension, + encryption_polynomial_size: PolynomialSize, + compression_glwe_dimension: GlweDimension, + compression_polynomial_size: PolynomialSize, + lwe_dimension: LweDimension, + pbs_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + lwe_per_glwe: u32, + storage_log_modulus: u32, + vec_indexes: &CudaVec, + num_blocks: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + glwe_in.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + bootstrapping_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." 
+ ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + encryption_glwe_dimension.0 as u32, + encryption_polynomial_size.0 as u32, + compression_glwe_dimension.0 as u32, + compression_polynomial_size.0 as u32, + lwe_dimension.0 as u32, + 0u32, + 0u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + 0, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + PBSType::Classical as u32, + lwe_per_glwe, + storage_log_modulus, + CompressionType::Decompress as u32, + true, + ); + + cuda_compression_decompress_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + lwe_array_out.as_mut_c_ptr(0), + glwe_in.as_c_ptr(0), + vec_indexes.as_c_ptr(0), + vec_indexes.len as u32, + bootstrapping_key.ptr.as_ptr(), + mem_ptr, + ); + + cleanup_cuda_compression_integer_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + #[allow(clippy::too_many_arguments)] /// # Safety ///
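The decompression entry point above takes a device array of block indexes; on the Rust side those indexes come from the block-range arithmetic in `CudaCompressedCiphertextList::get` and `CudaDecompressionKey::unpack`: the start index is the sum of the block counts of all preceding entries, and the end index is inclusive. A standalone sketch of that arithmetic is shown below; `Entry` is a hypothetical stand-in for the patch's `DataKind`, and only the index computation mirrors the patch.

```rust
// Sketch (not part of the patch): block-range bookkeeping behind
// CudaCompressedCiphertextList::get / CudaDecompressionKey::unpack.
#[derive(Clone, Copy)]
enum Entry {
    Unsigned(usize), // number of radix blocks
    Signed(usize),
}

impl Entry {
    fn num_blocks(self) -> usize {
        match self {
            Entry::Unsigned(n) | Entry::Signed(n) => n,
        }
    }
}

/// Inclusive [start_block_index, end_block_index] range of entry `index` in the packed list;
/// `start..=end` is what `unpack` turns into the index array passed to the decompression kernel.
fn block_range(info: &[Entry], index: usize) -> (usize, usize) {
    let start: usize = info[..index].iter().map(|e| e.num_blocks()).sum();
    let end = start + info[index].num_blocks() - 1;
    (start, end)
}

fn main() {
    // Matches the unit test above: two 4-block unsigned values followed by a 4-block signed one.
    let info = [Entry::Unsigned(4), Entry::Unsigned(4), Entry::Signed(4)];
    assert_eq!(block_range(&info, 0), (0, 3));
    assert_eq!(block_range(&info, 1), (4, 7));
    assert_eq!(block_range(&info, 2), (8, 11));
}
```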