From de4674665c9745f947ad77f290de623442400976 Mon Sep 17 00:00:00 2001 From: Pedro Alves Date: Wed, 17 Jul 2024 17:14:54 -0300 Subject: [PATCH] feat(gpu): implement CUDA-based Radix Integer compression and public functional packing keyswitch --- .../cuda/include/compression.h | 156 +++++++++++ .../tfhe-cuda-backend/cuda/include/integer.h | 6 +- .../cuda/include/keyswitch.h | 15 ++ .../tfhe-cuda-backend/cuda/src/CMakeLists.txt | 14 - .../cuda/src/crypto/ciphertext.cuh | 8 +- .../cuda/src/crypto/keyswitch.cu | 36 ++- .../cuda/src/crypto/keyswitch.cuh | 155 ++++++++++- .../cuda/src/crypto/torus.cuh | 36 ++- .../src/integer/compression/compression.cu | 87 ++++++ .../src/integer/compression/compression.cuh | 238 +++++++++++++++++ .../cuda/src/integer/integer.cuh | 2 +- .../cuda/src/integer/shift_and_rotate.cuh | 2 +- .../cuda/src/linearalgebra/multiplication.cu | 24 +- .../cuda/src/linearalgebra/multiplication.cuh | 45 +++- .../pbs/programmable_bootstrap_multibit.cuh | 5 +- .../cuda/src/polynomial/polynomial_math.cuh | 29 +- backends/tfhe-cuda-backend/src/cuda_bind.rs | 108 ++++++++ tfhe/benches/integer/bench.rs | 131 ++++++++- .../integer/glwe_packing_compression.rs | 8 +- .../gpu/algorithms/lwe_packing_keyswitch.rs | 36 +++ tfhe/src/core_crypto/gpu/algorithms/mod.rs | 6 +- .../algorithms/test/lwe_packing_keyswitch.rs | 234 ++++++++++++++++ .../core_crypto/gpu/algorithms/test/mod.rs | 8 + .../gpu/entities/lwe_ciphertext_list.rs | 45 ---- .../gpu/entities/lwe_packing_keyswitch_key.rs | 75 ++++++ tfhe/src/core_crypto/gpu/entities/mod.rs | 1 + tfhe/src/core_crypto/gpu/mod.rs | 51 +++- tfhe/src/integer/client_key/radix.rs | 12 + .../ciphertext/compressed_ciphertext_list.rs | 216 +++++++++++++++ tfhe/src/integer/gpu/ciphertext/mod.rs | 1 + tfhe/src/integer/gpu/client_key/mod.rs | 1 + tfhe/src/integer/gpu/client_key/radix.rs | 95 +++++++ .../compressed_server_keys.rs | 51 ++++ tfhe/src/integer/gpu/list_compression/mod.rs | 2 + .../gpu/list_compression/server_keys.rs | 251 ++++++++++++++++++ tfhe/src/integer/gpu/mod.rs | 157 +++++++++++ 36 files changed, 2225 insertions(+), 122 deletions(-) create mode 100644 backends/tfhe-cuda-backend/cuda/include/compression.h create mode 100644 backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu create mode 100644 backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh create mode 100644 tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs create mode 100644 tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs create mode 100644 tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs create mode 100644 tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs create mode 100644 tfhe/src/integer/gpu/client_key/mod.rs create mode 100644 tfhe/src/integer/gpu/client_key/radix.rs create mode 100644 tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs create mode 100644 tfhe/src/integer/gpu/list_compression/mod.rs create mode 100644 tfhe/src/integer/gpu/list_compression/server_keys.rs diff --git a/backends/tfhe-cuda-backend/cuda/include/compression.h b/backends/tfhe-cuda-backend/cuda/include/compression.h new file mode 100644 index 0000000000..6c1beab5b4 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/include/compression.h @@ -0,0 +1,156 @@ +#ifndef CUDA_INTEGER_COMPRESSION_H +#define CUDA_INTEGER_COMPRESSION_H + +#include "integer.h" + +extern "C" { +void scratch_cuda_integer_compress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + 
uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, + uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus, + PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus, + bool allocate_gpu_memory); + +void scratch_cuda_integer_decompress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size, + uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, + uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus, + PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory); + +void cuda_integer_compress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths, + int8_t *mem_ptr); + +void cuda_integer_decompress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void *glwe_in, void *indexes_array, + uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr); + +void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); + +void cleanup_cuda_integer_decompress_radix_ciphertext_64(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void); +} + +template struct int_compression { + int_radix_params compression_params; + uint32_t storage_log_modulus; + uint32_t lwe_per_glwe; + + uint32_t body_count; + + // Compression + int8_t *fp_ks_buffer; + Torus *tmp_lwe; + Torus *tmp_glwe_array_out; + + int_compression(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params compression_params, + uint32_t num_radix_blocks, uint32_t lwe_per_glwe, + uint32_t storage_log_modulus, bool allocate_gpu_memory) { + this->compression_params = compression_params; + this->lwe_per_glwe = lwe_per_glwe; + this->storage_log_modulus = storage_log_modulus; + this->body_count = num_radix_blocks; + + if (allocate_gpu_memory) { + Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) * + compression_params.polynomial_size; + + tmp_lwe = (Torus *)cuda_malloc_async( + num_radix_blocks * (compression_params.small_lwe_dimension + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + tmp_glwe_array_out = (Torus *)cuda_malloc_async( + glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]); + + scratch_packing_keyswitch_lwe_list_to_glwe_64( + streams[0], gpu_indexes[0], &fp_ks_buffer, + compression_params.glwe_dimension, compression_params.polynomial_size, + num_radix_blocks, true); + } + } + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(tmp_lwe, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_glwe_array_out, streams[0], gpu_indexes[0]); + cleanup_packing_keyswitch_lwe_list_to_glwe(streams[0], gpu_indexes[0], + &fp_ks_buffer); + } +}; + +template struct int_decompression { + int_radix_params encryption_params; + int_radix_params compression_params; + + uint32_t storage_log_modulus; + + uint32_t body_count; + + Torus *tmp_extracted_glwe; + Torus *tmp_extracted_lwe; + + int_radix_lut *carry_extract_lut; + + int_decompression(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, int_radix_params 
encryption_params, + int_radix_params compression_params, + uint32_t num_radix_blocks, uint32_t storage_log_modulus, + bool allocate_gpu_memory) { + this->encryption_params = encryption_params; + this->compression_params = compression_params; + this->storage_log_modulus = storage_log_modulus; + this->body_count = num_radix_blocks; + + if (allocate_gpu_memory) { + Torus glwe_accumulator_size = (compression_params.glwe_dimension + 1) * + compression_params.polynomial_size; + + carry_extract_lut = new int_radix_lut( + streams, gpu_indexes, gpu_count, encryption_params, 1, + num_radix_blocks, allocate_gpu_memory); + + tmp_extracted_glwe = (Torus *)cuda_malloc_async( + glwe_accumulator_size * sizeof(Torus), streams[0], gpu_indexes[0]); + tmp_extracted_lwe = (Torus *)cuda_malloc_async( + num_radix_blocks * + (compression_params.glwe_dimension * + compression_params.polynomial_size + + 1) * + sizeof(Torus), + streams[0], gpu_indexes[0]); + // Decompression + // Carry extract LUT + auto carry_extract_f = [encryption_params](Torus x) -> Torus { + return x / encryption_params.message_modulus; + }; + + generate_device_accumulator( + streams[0], gpu_indexes[0], + carry_extract_lut->get_lut(gpu_indexes[0], 0), + encryption_params.glwe_dimension, encryption_params.polynomial_size, + encryption_params.message_modulus, encryption_params.carry_modulus, + carry_extract_f); + + carry_extract_lut->broadcast_lut(streams, gpu_indexes, gpu_indexes[0]); + } + } + void release(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count) { + cuda_drop_async(tmp_extracted_glwe, streams[0], gpu_indexes[0]); + cuda_drop_async(tmp_extracted_lwe, streams[0], gpu_indexes[0]); + + carry_extract_lut->release(streams, gpu_indexes, gpu_count); + delete (carry_extract_lut); + } +}; +#endif diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index 871829abeb..99860fd1c7 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -1,6 +1,7 @@ #ifndef CUDA_INTEGER_H #define CUDA_INTEGER_H +#include "keyswitch.h" #include "pbs/programmable_bootstrap.cuh" #include "programmable_bootstrap.h" #include "programmable_bootstrap_multibit.h" @@ -15,7 +16,6 @@ enum SHIFT_OR_ROTATE_TYPE { LEFT_ROTATE = 2, RIGHT_ROTATE = 3 }; -enum LUT_TYPE { OPERATOR = 0, MAXVALUE = 1, ISNONZERO = 2, BLOCKSLEN = 3 }; enum BITOP_TYPE { BITAND = 0, BITOR = 1, @@ -475,7 +475,8 @@ struct int_radix_params { message_modulus(message_modulus), carry_modulus(carry_modulus){}; void print() { - printf("pbs_type: %u, glwe_dimension: %u, polynomial_size: %u, " + printf("pbs_type: %u, glwe_dimension: %u, " + "polynomial_size: %u, " "big_lwe_dimension: %u, " "small_lwe_dimension: %u, ks_level: %u, ks_base_log: %u, pbs_level: " "%u, pbs_base_log: " @@ -812,7 +813,6 @@ template struct int_radix_lut { } } }; - template struct int_bit_extract_luts_buffer { int_radix_params params; int_radix_lut *lut; diff --git a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h index 924ec131f9..9de953d053 100644 --- a/backends/tfhe-cuda-backend/cuda/include/keyswitch.h +++ b/backends/tfhe-cuda-backend/cuda/include/keyswitch.h @@ -16,6 +16,21 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples); + +void 
scratch_packing_keyswitch_lwe_list_to_glwe_64( + void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes, + bool allocate_gpu_memory); + +void cuda_packing_keyswitch_lwe_list_to_glwe_64( + void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in, + void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t num_lwes); + +void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream, + uint32_t gpu_index, + int8_t **fp_ks_buffer); } #endif // CNCRT_KS_H_ diff --git a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt index 3190501fc5..5fec699fbc 100644 --- a/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt +++ b/backends/tfhe-cuda-backend/cuda/src/CMakeLists.txt @@ -1,17 +1,3 @@ -set(SOURCES - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bit_extraction.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bitwise_ops.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/bootstrap_multibit.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/ciphertext.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/circuit_bootstrap.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/device.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/integer.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/keyswitch.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/linear_algebra.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/shifts.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/vertical_packing.h - ${CMAKE_SOURCE_DIR}/${INCLUDE_DIR}/helper_multi_gpu.h) file(GLOB_RECURSE SOURCES "*.cu") add_library(tfhe_cuda_backend STATIC ${SOURCES}) set_target_properties(tfhe_cuda_backend PROPERTIES CUDA_SEPARABLE_COMPILATION ON CUDA_RESOLVE_DEVICE_SYMBOLS ON) diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh index b45353a616..9971b5c000 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/ciphertext.cuh @@ -38,8 +38,8 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in, auto lwe_out = lwe_array_out + input_id * lwe_output_size; // We assume each GLWE will store the first polynomial_size inputs - uint32_t nth_per_glwe = params::degree; - auto glwe_in = glwe_array_in + (input_id / nth_per_glwe) * glwe_input_size; + uint32_t lwe_per_glwe = params::degree; + auto glwe_in = glwe_array_in + (input_id / lwe_per_glwe) * glwe_input_size; auto nth = nth_array[input_id]; @@ -50,11 +50,11 @@ __global__ void sample_extract(Torus *lwe_array_out, Torus *glwe_array_in, template __host__ void host_sample_extract(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *glwe_array_in, - uint32_t *nth_array, uint32_t num_glwes, + uint32_t *nth_array, uint32_t num_nths, uint32_t glwe_dimension) { cudaSetDevice(gpu_index); - dim3 grid(num_glwes); + dim3 grid(num_nths); dim3 thds(params::degree / params::opt); sample_extract<<>>( lwe_array_out, glwe_array_in, nth_array, glwe_dimension); diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu index 524a1fa45e..4f6761d6c6 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cu @@ -10,7 +10,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_32( void *lwe_output_indexes, void *lwe_array_in, 
void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { - cuda_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -40,7 +40,7 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( void *lwe_output_indexes, void *lwe_array_in, void *lwe_input_indexes, void *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, uint32_t base_log, uint32_t level_count, uint32_t num_samples) { - cuda_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( static_cast(stream), gpu_index, static_cast(lwe_array_out), static_cast(lwe_output_indexes), @@ -48,3 +48,35 @@ void cuda_keyswitch_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(ksk), lwe_dimension_in, lwe_dimension_out, base_log, level_count, num_samples); } + +void scratch_packing_keyswitch_lwe_list_to_glwe_64( + void *stream, uint32_t gpu_index, int8_t **fp_ks_buffer, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes, + bool allocate_gpu_memory) { + scratch_packing_keyswitch_lwe_list_to_glwe( + static_cast(stream), gpu_index, fp_ks_buffer, + glwe_dimension, polynomial_size, num_lwes, allocate_gpu_memory); +} +/* Perform functional packing keyswitch on a batch of 64 bits input LWE + * ciphertexts. + */ +void cuda_packing_keyswitch_lwe_list_to_glwe_64( + void *stream, uint32_t gpu_index, void *glwe_array_out, void *lwe_array_in, + void *fp_ksk_array, int8_t *fp_ks_buffer, uint32_t input_lwe_dimension, + uint32_t output_glwe_dimension, uint32_t output_polynomial_size, + uint32_t base_log, uint32_t level_count, uint32_t num_lwes) { + + host_packing_keyswitch_lwe_list_to_glwe( + static_cast(stream), gpu_index, + static_cast(glwe_array_out), + static_cast(lwe_array_in), + static_cast(fp_ksk_array), fp_ks_buffer, input_lwe_dimension, + output_glwe_dimension, output_polynomial_size, base_log, level_count, + num_lwes); +} + +void cleanup_packing_keyswitch_lwe_list_to_glwe(void *stream, + uint32_t gpu_index, + int8_t **fp_ks_buffer) { + cuda_drop_async(*fp_ks_buffer, static_cast(stream), gpu_index); +} diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh index db78104a98..3d1618e413 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/keyswitch.cuh @@ -7,6 +7,7 @@ #include "polynomial/functions.cuh" #include "polynomial/polynomial_math.cuh" #include "torus.cuh" +#include "utils/helper.cuh" #include "utils/kernel_dimensions.cuh" #include #include @@ -98,7 +99,7 @@ keyswitch(Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes, } template -__host__ void cuda_keyswitch_lwe_ciphertext_vector( +__host__ void host_keyswitch_lwe_ciphertext_vector( cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, Torus *lwe_output_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, Torus *ksk, uint32_t lwe_dimension_in, uint32_t lwe_dimension_out, @@ -146,7 +147,7 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes, GET_VARIANT_ELEMENT(lwe_input_indexes, i); // Compute Keyswitch - cuda_keyswitch_lwe_ciphertext_vector( + host_keyswitch_lwe_ciphertext_vector( streams[i], gpu_indexes[i], current_lwe_array_out, current_lwe_output_indexes, current_lwe_array_in, current_lwe_input_indexes, ksks[i], lwe_dimension_in, 
lwe_dimension_out, @@ -154,4 +155,154 @@ void execute_keyswitch_async(cudaStream_t *streams, uint32_t *gpu_indexes, } } +template +__host__ void scratch_packing_keyswitch_lwe_list_to_glwe( + cudaStream_t stream, uint32_t gpu_index, int8_t **fp_ks_buffer, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t num_lwes, + bool allocate_gpu_memory) { + cudaSetDevice(gpu_index); + + int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; + + if (allocate_gpu_memory) + *fp_ks_buffer = (int8_t *)cuda_malloc_async( + 2 * num_lwes * glwe_accumulator_size * sizeof(Torus), stream, + gpu_index); +} + +// public functional packing keyswitch for a single LWE ciphertext +// +// Assumes there are (glwe_dimension+1) * polynomial_size threads split through +// different thread blocks at the x-axis to work on that input. +template +__device__ void packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext( + Torus *glwe_out, Torus *lwe_in, Torus *fp_ksk, uint32_t lwe_dimension_in, + uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count) { + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + size_t glwe_size = (glwe_dimension + 1); + + if (tid < glwe_size * polynomial_size) { + const int local_index = threadIdx.x; + // the output_glwe is split in polynomials and each x-block takes one of + // them + size_t poly_id = blockIdx.x; + size_t coef_per_block = blockDim.x; + + // number of coefficients inside fp-ksk block for each lwe_input coefficient + size_t ksk_block_size = glwe_size * polynomial_size * level_count; + + // initialize accumulator to 0 + glwe_out[tid] = SEL(0, lwe_in[lwe_dimension_in], + tid == glwe_dimension * polynomial_size); + + // Iterate through all lwe elements + for (int i = 0; i < lwe_dimension_in; i++) { + // Round and prepare decomposition + Torus a_i = round_to_closest_multiple(lwe_in[i], base_log, level_count); + + Torus state = a_i >> (sizeof(Torus) * 8 - base_log * level_count); + Torus mod_b_mask = (1ll << base_log) - 1ll; + + // block of key for current lwe coefficient (cur_input_lwe[i]) + auto ksk_block = &fp_ksk[i * ksk_block_size]; + for (int j = 0; j < level_count; j++) { + auto ksk_glwe = &ksk_block[j * glwe_size * polynomial_size]; + // Iterate through each level and multiply by the ksk piece + auto ksk_glwe_chunk = &ksk_glwe[poly_id * coef_per_block]; + Torus decomposed = decompose_one(state, mod_b_mask, base_log); + glwe_out[tid] -= decomposed * ksk_glwe_chunk[local_index]; + } + } + } +} + +// public functional packing keyswitch for a batch of LWE ciphertexts +// +// Selects the input each thread is working on using the y-block index. +// +// Assumes there are (glwe_dimension+1) * polynomial_size threads split through +// different thread blocks at the x-axis to work on that input. 
+template +__global__ void +packing_keyswitch_lwe_list_to_glwe(Torus *glwe_array_out, Torus *lwe_array_in, + Torus *fp_ksk, uint32_t lwe_dimension_in, + uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, Torus *d_mem) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + const int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; + const int lwe_size = (lwe_dimension_in + 1); + + const int input_id = blockIdx.y; + const int degree = input_id; + + // Select an input + auto lwe_in = lwe_array_in + input_id * lwe_size; + auto ks_glwe_out = d_mem + input_id * glwe_accumulator_size; + auto glwe_out = glwe_array_out + input_id * glwe_accumulator_size; + // KS LWE to GLWE + packing_keyswitch_lwe_ciphertext_into_glwe_ciphertext( + ks_glwe_out, lwe_in, fp_ksk, lwe_dimension_in, glwe_dimension, + polynomial_size, base_log, level_count); + + // P * x ^degree + auto in_poly = ks_glwe_out + (tid / polynomial_size) * polynomial_size; + auto out_result = glwe_out + (tid / polynomial_size) * polynomial_size; + polynomial_accumulate_monic_monomial_mul(out_result, in_poly, degree, + tid % polynomial_size, + polynomial_size, 1, true); +} + +/// To-do: Rewrite this kernel for efficiency +template +__global__ void accumulate_glwes(Torus *glwe_out, Torus *glwe_array_in, + uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t num_lwes) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < (glwe_dimension + 1) * polynomial_size) { + glwe_out[tid] = glwe_array_in[tid]; + + // Accumulate + for (int i = 1; i < num_lwes; i++) { + auto glwe_in = glwe_array_in + i * (glwe_dimension + 1) * polynomial_size; + glwe_out[tid] += glwe_in[tid]; + } + } +} + +template +__host__ void host_packing_keyswitch_lwe_list_to_glwe( + cudaStream_t stream, uint32_t gpu_index, Torus *glwe_out, + Torus *lwe_array_in, Torus *fp_ksk_array, int8_t *fp_ks_buffer, + uint32_t lwe_dimension_in, uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, + uint32_t num_lwes) { + cudaSetDevice(gpu_index); + int glwe_accumulator_size = (glwe_dimension + 1) * polynomial_size; + + int num_blocks = 0, num_threads = 0; + getNumBlocksAndThreads(glwe_accumulator_size, 128, num_blocks, num_threads); + + dim3 grid(num_blocks, num_lwes); + dim3 threads(num_threads); + + auto d_mem = (Torus *)fp_ks_buffer; + auto d_tmp_glwe_array_out = d_mem + num_lwes * glwe_accumulator_size; + + // individually keyswitch each lwe + packing_keyswitch_lwe_list_to_glwe<<>>( + d_tmp_glwe_array_out, lwe_array_in, fp_ksk_array, lwe_dimension_in, + glwe_dimension, polynomial_size, base_log, level_count, d_mem); + check_cuda_error(cudaGetLastError()); + + // accumulate to a single glwe + accumulate_glwes<<>>( + glwe_out, d_tmp_glwe_array_out, glwe_dimension, polynomial_size, + num_lwes); + check_cuda_error(cudaGetLastError()); +} + #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh index 177892e5a9..4a5cd8fb66 100644 --- a/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/crypto/torus.cuh @@ -2,6 +2,7 @@ #define CNCRT_TORUS_CUH #include "types/int128.cuh" +#include "utils/kernel_dimensions.cuh" #include template @@ -29,20 +30,18 @@ __device__ inline void typecast_double_to_torus(double x, template __device__ inline T round_to_closest_multiple(T x, uint32_t base_log, uint32_t level_count) { - T shift = sizeof(T) * 8 - level_count * 
base_log; - T mask = 1ll << (shift - 1); - T b = (x & mask) >> (shift - 1); + const T non_rep_bit_count = sizeof(T) * 8 - level_count * base_log; + const T shift = non_rep_bit_count - 1; T res = x >> shift; - res += b; - res <<= shift; - return res; + res += 1; + res &= (T)(-2); + return res << shift; } template __device__ __forceinline__ void modulus_switch(T input, T &output, uint32_t log_modulus) { constexpr uint32_t BITS = sizeof(T) * 8; - output = input + (((T)1) << (BITS - log_modulus - 1)); output >>= (BITS - log_modulus); } @@ -54,4 +53,27 @@ __device__ __forceinline__ T modulus_switch(T input, uint32_t log_modulus) { return output; } +template +__global__ void modulus_switch_inplace(Torus *array, int size, + uint32_t log_modulus) { + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + if (tid < size) { + array[tid] = modulus_switch(array[tid], log_modulus); + } +} + +template +__host__ void host_modulus_switch_inplace(cudaStream_t stream, + uint32_t gpu_index, Torus *array, + int size, uint32_t log_modulus) { + cudaSetDevice(gpu_index); + + int num_threads = 0, num_blocks = 0; + getNumBlocksAndThreads(size, 1024, num_blocks, num_threads); + + modulus_switch_inplace<<>>(array, size, + log_modulus); + check_cuda_error(cudaGetLastError()); +} + #endif // CNCRT_TORUS_H diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu new file mode 100644 index 0000000000..5e0da5a8c8 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cu @@ -0,0 +1,87 @@ +#include "compression.cuh" + +void scratch_cuda_integer_compress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, + uint32_t lwe_dimension, uint32_t ks_level, uint32_t ks_base_log, + uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus, + PBS_TYPE pbs_type, uint32_t lwe_per_glwe, uint32_t storage_log_modulus, + bool allocate_gpu_memory) { + + int_radix_params compression_params( + pbs_type, compression_glwe_dimension, compression_polynomial_size, + (compression_glwe_dimension + 1) * compression_polynomial_size, + lwe_dimension, ks_level, ks_base_log, 0, 0, 0, message_modulus, + carry_modulus); + + scratch_cuda_compress_integer_radix_ciphertext_64( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_compression **)mem_ptr, num_lwes, compression_params, + lwe_per_glwe, storage_log_modulus, allocate_gpu_memory); +} +void scratch_cuda_integer_decompress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr, + uint32_t encryption_glwe_dimension, uint32_t encryption_polynomial_size, + uint32_t compression_glwe_dimension, uint32_t compression_polynomial_size, + uint32_t lwe_dimension, uint32_t pbs_level, uint32_t pbs_base_log, + uint32_t num_lwes, uint32_t message_modulus, uint32_t carry_modulus, + PBS_TYPE pbs_type, uint32_t storage_log_modulus, bool allocate_gpu_memory) { + + int_radix_params encryption_params( + pbs_type, encryption_glwe_dimension, encryption_polynomial_size, + (encryption_glwe_dimension + 1) * encryption_polynomial_size, + lwe_dimension, 0, 0, pbs_level, pbs_base_log, 0, message_modulus, + carry_modulus); + + int_radix_params compression_params( + pbs_type, compression_glwe_dimension, compression_polynomial_size, + (compression_glwe_dimension + 1) * compression_polynomial_size, + lwe_dimension, 0, 
0, pbs_level, pbs_base_log, 0, message_modulus, + carry_modulus); + + scratch_cuda_integer_decompress_radix_ciphertext_64( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + (int_decompression **)mem_ptr, num_lwes, encryption_params, + compression_params, storage_log_modulus, allocate_gpu_memory); +} +void cuda_integer_compress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *glwe_array_out, void *lwe_array_in, void **fp_ksk, uint32_t num_nths, + int8_t *mem_ptr) { + + host_integer_compress( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(glwe_array_out), + static_cast(lwe_array_in), (uint64_t **)(fp_ksk), num_nths, + (int_compression *)mem_ptr); +} +void cuda_integer_decompress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + void *lwe_array_out, void *glwe_in, void *indexes_array, + uint32_t indexes_array_size, void **bsks, int8_t *mem_ptr) { + + host_integer_decompress( + (cudaStream_t *)(streams), gpu_indexes, gpu_count, + static_cast(lwe_array_out), static_cast(glwe_in), + static_cast(indexes_array), indexes_array_size, bsks, + (int_decompression *)mem_ptr); +} + +void cleanup_cuda_integer_compress_radix_ciphertext_64(void **streams, + uint32_t *gpu_indexes, + uint32_t gpu_count, + int8_t **mem_ptr_void) { + + int_compression *mem_ptr = + (int_compression *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} + +void cleanup_cuda_integer_decompress_radix_ciphertext_64( + void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int8_t **mem_ptr_void) { + + int_decompression *mem_ptr = + (int_decompression *)(*mem_ptr_void); + mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count); +} diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh new file mode 100644 index 0000000000..229495e324 --- /dev/null +++ b/backends/tfhe-cuda-backend/cuda/src/integer/compression/compression.cuh @@ -0,0 +1,238 @@ +#ifndef CUDA_INTEGER_COMPRESSION_CUH +#define CUDA_INTEGER_COMPRESSION_CUH + +#include "ciphertext.h" +#include "compression.h" +#include "crypto/keyswitch.cuh" +#include "device.h" +#include "integer/integer.cuh" +#include "linearalgebra/multiplication.cuh" +#include "polynomial/functions.cuh" +#include "utils/kernel_dimensions.cuh" + +template +__global__ void pack(Torus *array_out, Torus *array_in, uint32_t log_modulus, + uint32_t in_len, uint32_t len) { + auto nbits = sizeof(Torus) * 8; + + auto i = threadIdx.x + blockIdx.x * blockDim.x; + if (i < len) { + auto k = nbits * i / log_modulus; + auto j = k; + + auto start_shift = i * nbits - j * log_modulus; + + auto value = array_in[j] >> start_shift; + j++; + + while (j * log_modulus < ((i + 1) * nbits) && j < in_len) { + auto shift = j * log_modulus - i * nbits; + value |= array_in[j] << shift; + j++; + } + + array_out[i] = value; + } +} + +template +__host__ void host_pack(cudaStream_t stream, uint32_t gpu_index, + Torus *array_out, Torus *array_in, uint32_t num_inputs, + uint32_t body_count, int_compression *mem_ptr) { + cudaSetDevice(gpu_index); + auto params = mem_ptr->compression_params; + + auto log_modulus = mem_ptr->storage_log_modulus; + auto in_len = params.glwe_dimension * params.polynomial_size + body_count; + auto number_bits_to_pack = in_len * log_modulus; + + auto nbits = sizeof(Torus) * 8; + // number_bits_to_pack.div_ceil(Scalar::BITS) + auto len = (number_bits_to_pack + 
nbits - 1) / nbits; + + int num_blocks = 0, num_threads = 0; + getNumBlocksAndThreads(len, 128, num_blocks, num_threads); + + dim3 grid(num_blocks); + dim3 threads(num_threads); + pack<<>>(array_out, array_in, log_modulus, in_len, + len); +} + +template +__host__ void host_integer_compress(cudaStream_t *streams, + uint32_t *gpu_indexes, uint32_t gpu_count, + Torus *glwe_array_out, Torus *lwe_array_in, + Torus **fp_ksk, uint32_t num_lwes, + int_compression *mem_ptr) { + + auto compression_params = mem_ptr->compression_params; + auto input_lwe_dimension = compression_params.small_lwe_dimension; + + // Shift + auto lwe_shifted = mem_ptr->tmp_lwe; + host_cleartext_multiplication(streams[0], gpu_indexes[0], lwe_shifted, + lwe_array_in, + (uint64_t)compression_params.message_modulus, + input_lwe_dimension, num_lwes); + + uint32_t lwe_in_size = input_lwe_dimension + 1; + uint32_t glwe_out_size = (compression_params.glwe_dimension + 1) * + compression_params.polynomial_size; + uint32_t num_glwes = num_lwes / mem_ptr->lwe_per_glwe + 1; + + // Keyswitch LWEs to GLWE + auto tmp_glwe_array_out = mem_ptr->tmp_glwe_array_out; + auto fp_ks_buffer = mem_ptr->fp_ks_buffer; + for (int i = 0; i < num_glwes; i++) { + auto lwe_subset = lwe_shifted + i * lwe_in_size; + auto glwe_out = tmp_glwe_array_out + i * glwe_out_size; + + host_packing_keyswitch_lwe_list_to_glwe( + streams[0], gpu_indexes[0], glwe_out, lwe_subset, fp_ksk[0], + fp_ks_buffer, input_lwe_dimension, compression_params.glwe_dimension, + compression_params.polynomial_size, compression_params.ks_base_log, + compression_params.ks_level, min(num_lwes, mem_ptr->lwe_per_glwe)); + } + + auto body_count = min(num_lwes, mem_ptr->lwe_per_glwe); + + // Modulus switch + host_modulus_switch_inplace(streams[0], gpu_indexes[0], tmp_glwe_array_out, + num_glwes * + (compression_params.glwe_dimension * + compression_params.polynomial_size + + body_count), + mem_ptr->storage_log_modulus); + check_cuda_error(cudaGetLastError()); + + host_pack(streams[0], gpu_indexes[0], glwe_array_out, tmp_glwe_array_out, + num_glwes, body_count, mem_ptr); +} + +template +__global__ void extract(Torus *glwe_array_out, Torus *array_in, uint32_t index, + uint32_t log_modulus, uint32_t initial_out_len) { + auto nbits = sizeof(Torus) * 8; + + auto i = threadIdx.x + blockIdx.x * blockDim.x; + + if (i < initial_out_len) { + // Unpack + Torus mask = ((Torus)1 << log_modulus) - 1; + auto start = i * log_modulus; + auto end = (i + 1) * log_modulus; + + auto start_block = start / nbits; + auto start_remainder = start % nbits; + + auto end_block_inclusive = (end - 1) / nbits; + + Torus unpacked_i; + if (start_block == end_block_inclusive) { + auto single_part = array_in[start_block] >> start_remainder; + unpacked_i = single_part & mask; + } else { + auto first_part = array_in[start_block] >> start_remainder; + auto second_part = array_in[start_block + 1] << (nbits - start_remainder); + + unpacked_i = (first_part | second_part) & mask; + } + + // Extract + glwe_array_out[i] = unpacked_i << (nbits - log_modulus); + } +} + +template +__host__ void host_extract(cudaStream_t stream, uint32_t gpu_index, + Torus *glwe_array_out, Torus *array_in, + uint32_t glwe_index, + int_decompression *mem_ptr) { + cudaSetDevice(gpu_index); + + auto params = mem_ptr->compression_params; + + auto log_modulus = mem_ptr->storage_log_modulus; + + uint32_t body_count = mem_ptr->body_count; + auto initial_out_len = + params.glwe_dimension * params.polynomial_size + body_count * body_count; + + // We assure the tail of 
the glwe is zeroed + auto zeroed_slice = + glwe_array_out + params.glwe_dimension * params.polynomial_size; + cuda_memset_async(zeroed_slice, 0, params.polynomial_size * sizeof(Torus), + stream, gpu_index); + + int num_blocks = 0, num_threads = 0; + getNumBlocksAndThreads(initial_out_len, 128, num_blocks, num_threads); + dim3 grid(num_blocks); + dim3 threads(num_threads); + extract<<>>(glwe_array_out, array_in, glwe_index, + log_modulus, initial_out_len); + check_cuda_error(cudaGetLastError()); +} + +template +__host__ void +host_integer_decompress(cudaStream_t *streams, uint32_t *gpu_indexes, + uint32_t gpu_count, Torus *lwe_array_out, + Torus *packed_glwe_in, uint32_t *indexes_array, + uint32_t indexes_array_size, void **bsks, + int_decompression *mem_ptr) { + + auto extracted_glwe = mem_ptr->tmp_extracted_glwe; + auto compression_params = mem_ptr->compression_params; + host_extract(streams[0], gpu_indexes[0], extracted_glwe, packed_glwe_in, 0, + mem_ptr); + + auto num_lwes = mem_ptr->body_count; + + // Sample extract + auto extracted_lwe = mem_ptr->tmp_extracted_lwe; + cuda_glwe_sample_extract_64(streams[0], gpu_indexes[0], extracted_lwe, + extracted_glwe, indexes_array, indexes_array_size, + compression_params.glwe_dimension, + compression_params.polynomial_size); + + /// Apply PBS to apply a LUT, reduce the noise and go from a small LWE + /// dimension to a big LWE dimension + auto encryption_params = mem_ptr->encryption_params; + auto carry_extract_lut = mem_ptr->carry_extract_lut; + execute_pbs_async( + streams, gpu_indexes, gpu_count, lwe_array_out, + carry_extract_lut->lwe_indexes_out, carry_extract_lut->lut_vec, + carry_extract_lut->lut_indexes_vec, extracted_lwe, + carry_extract_lut->lwe_indexes_in, bsks, carry_extract_lut->buffer, + encryption_params.glwe_dimension, + compression_params.glwe_dimension * compression_params.polynomial_size, + encryption_params.polynomial_size, encryption_params.pbs_base_log, + encryption_params.pbs_level, encryption_params.grouping_factor, num_lwes, + encryption_params.pbs_type); +} + +template +__host__ void scratch_cuda_compress_integer_radix_ciphertext_64( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_compression **mem_ptr, uint32_t num_lwes, + int_radix_params compression_params, uint32_t lwe_per_glwe, + uint32_t storage_log_modulus, bool allocate_gpu_memory) { + + *mem_ptr = new int_compression( + streams, gpu_indexes, gpu_count, compression_params, num_lwes, + lwe_per_glwe, storage_log_modulus, allocate_gpu_memory); +} + +template +__host__ void scratch_cuda_integer_decompress_radix_ciphertext_64( + cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count, + int_decompression **mem_ptr, uint32_t num_lwes, + int_radix_params encryption_params, int_radix_params compression_params, + uint32_t storage_log_modulus, bool allocate_gpu_memory) { + + *mem_ptr = new int_decompression( + streams, gpu_indexes, gpu_count, encryption_params, compression_params, + num_lwes, storage_log_modulus, allocate_gpu_memory); +} +#endif diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index e935d8768d..cd7640d00b 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -765,7 +765,7 @@ __global__ void device_pack_blocks(Torus *lwe_array_out, Torus *lwe_array_in, } if (num_radix_blocks % 2 == 1) { - // We couldn't pack the last block, so we just copy it + // We couldn't 
host_pack the last block, so we just copy it Torus *lsb_block = lwe_array_in + (num_radix_blocks - 1) * (lwe_dimension + 1); Torus *last_block = diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh index e595781256..801c0f131a 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/shift_and_rotate.cuh @@ -133,7 +133,7 @@ __host__ void host_integer_radix_shift_and_rotate_kb_inplace( PANIC("Unknown operation") } - // pack bits into one block so that we have + // host_pack bits into one block so that we have // control_bit|b|a cuda_memset_async(mux_inputs, 0, total_nb_bits * big_lwe_size_bytes, streams[0], gpu_indexes[0]); // Do we need this? diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu index 2a936d7cdf..a64c15378d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cu @@ -9,12 +9,12 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_32( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_multiplication(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(cleartext_array_in), - input_lwe_dimension, - input_lwe_ciphertext_count); + host_cleartext_vec_multiplication( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(cleartext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } /* * Perform the multiplication of a u64 input LWE ciphertext vector with a u64 @@ -49,10 +49,10 @@ void cuda_mult_lwe_ciphertext_vector_cleartext_vector_64( void *cleartext_array_in, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { - host_cleartext_multiplication(static_cast(stream), gpu_index, - static_cast(lwe_array_out), - static_cast(lwe_array_in), - static_cast(cleartext_array_in), - input_lwe_dimension, - input_lwe_ciphertext_count); + host_cleartext_vec_multiplication( + static_cast(stream), gpu_index, + static_cast(lwe_array_out), + static_cast(lwe_array_in), + static_cast(cleartext_array_in), input_lwe_dimension, + input_lwe_ciphertext_count); } diff --git a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh index 9fd9be9419..b9864973a0 100644 --- a/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/linearalgebra/multiplication.cuh @@ -14,9 +14,10 @@ #include template -__global__ void -cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input, - uint32_t input_lwe_dimension, uint32_t num_entries) { +__global__ void cleartext_vec_multiplication(T *output, T *lwe_input, + T *cleartext_input, + uint32_t input_lwe_dimension, + uint32_t num_entries) { int tid = threadIdx.x; int index = blockIdx.x * blockDim.x + tid; @@ -27,10 +28,46 @@ cleartext_multiplication(T *output, T *lwe_input, T *cleartext_input, } } +template +__host__ void +host_cleartext_vec_multiplication(cudaStream_t stream, uint32_t gpu_index, + T *output, T *lwe_input, T *cleartext_input, + uint32_t input_lwe_dimension, + uint32_t input_lwe_ciphertext_count) { + + cudaSetDevice(gpu_index); + // lwe_size includes the presence of 
the body + // whereas lwe_dimension is the number of elements in the mask + int lwe_size = input_lwe_dimension + 1; + // Create a 1-dimensional grid of threads + int num_blocks = 0, num_threads = 0; + int num_entries = input_lwe_ciphertext_count * lwe_size; + getNumBlocksAndThreads(num_entries, 512, num_blocks, num_threads); + dim3 grid(num_blocks, 1, 1); + dim3 thds(num_threads, 1, 1); + + cleartext_vec_multiplication<<>>( + output, lwe_input, cleartext_input, input_lwe_dimension, num_entries); + check_cuda_error(cudaGetLastError()); +} + +template +__global__ void +cleartext_multiplication(T *output, T *lwe_input, T cleartext_input, + uint32_t input_lwe_dimension, uint32_t num_entries) { + + int tid = threadIdx.x; + int index = blockIdx.x * blockDim.x + tid; + if (index < num_entries) { + // Here we take advantage of the wrapping behaviour of uint + output[index] = lwe_input[index] * cleartext_input; + } +} + template __host__ void host_cleartext_multiplication(cudaStream_t stream, uint32_t gpu_index, - T *output, T *lwe_input, T *cleartext_input, + T *output, T *lwe_input, T cleartext_input, uint32_t input_lwe_dimension, uint32_t input_lwe_ciphertext_count) { diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index 776880a550..b5f5e43f1e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -102,8 +102,9 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle( synchronize_threads_in_block(); // Multiply by the bsk element - polynomial_product_accumulate_by_monomial( - accumulator, bsk_poly, monomial_degree, false); + polynomial_accumulate_monic_monomial_mul( + accumulator, bsk_poly, monomial_degree, threadIdx.x, params::degree, + params::opt, false); } synchronize_threads_in_block(); diff --git a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh index a6d7901f20..37a1135346 100644 --- a/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/polynomial/polynomial_math.cuh @@ -55,21 +55,22 @@ __device__ void polynomial_product_accumulate_in_fourier_domain( } } -// If init_accumulator is set, assumes that result was not initialized and does -// that with the outcome of first * second -template -__device__ void -polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly, - uint64_t monomial_degree, - bool init_accumulator = false) { - // monomial_degree \in [0, 2 * params::degree) - int full_cycles_count = monomial_degree / params::degree; - int remainder_degrees = monomial_degree % params::degree; +// This method expects to work with polynomial_size / compression_params::opt +// threads in the x-block If init_accumulator is set, assumes that result was +// not initialized and does that with the outcome of first * second +template +__device__ void polynomial_accumulate_monic_monomial_mul( + T *result, const T *__restrict__ poly, uint64_t monomial_degree, + uint32_t tid, uint32_t polynomial_size, int coeff_per_thread, + bool init_accumulator = false) { + // monomial_degree \in [0, 2 * compression_params::degree) + int full_cycles_count = monomial_degree / polynomial_size; + int remainder_degrees = monomial_degree % polynomial_size; - int pos = threadIdx.x; - for (int i = 0; i < 
params::opt; i++) { + int pos = tid; + for (int i = 0; i < coeff_per_thread; i++) { T element = poly[pos]; - int new_pos = (pos + monomial_degree) % params::degree; + int new_pos = (pos + monomial_degree) % polynomial_size; T x = SEL(element, -element, full_cycles_count % 2); // monomial coefficient x = SEL(-x, x, new_pos >= remainder_degrees); @@ -78,7 +79,7 @@ polynomial_product_accumulate_by_monomial(T *result, const T *__restrict__ poly, result[new_pos] = x; else result[new_pos] += x; - pos += params::degree / params::opt; + pos += polynomial_size / coeff_per_thread; } } diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs index 11f7c30632..17353d48af 100644 --- a/backends/tfhe-cuda-backend/src/cuda_bind.rs +++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs @@ -311,6 +311,40 @@ extern "C" { num_samples: u32, ); + /// This scratch function allocates the necessary amount of data on the GPU for + /// the public function packing keyswitch implementation on 64-bit + pub fn scratch_packing_keyswitch_lwe_list_to_glwe_64( + stream: *mut c_void, + gpu_index: u32, + fp_ks_buffer: *mut *mut i8, + glwe_dimension: u32, + polynomial_size: u32, + input_lwe_ciphertext_count: u32, + allocate_gpu_memory: bool, + ); + + /// Perform public functional packing keyswitch on a vector of 64-bit LWE ciphertexts + pub fn cuda_packing_keyswitch_lwe_list_to_glwe_64( + stream: *mut c_void, + gpu_index: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk_array: *const c_void, + fp_ks_buffer: *mut i8, + input_lwe_dimension: u32, + output_glwe_dimension: u32, + polynomial_size: u32, + base_log: u32, + level_count: u32, + num_lwes: u32, + ); + + pub fn cleanup_packing_keyswitch_lwe_list_to_glwe( + stream: *mut c_void, + gpu_index: u32, + fp_ks_buffer: *mut *mut i8, + ); + /// Perform the negation of a u64 input LWE ciphertext vector. 
/// - `v_stream` is a void pointer to the Cuda stream to be used in the kernel launch /// - `gpu_index` is the index of the GPU to be used in the kernel launch @@ -484,6 +518,80 @@ extern "C" { mem_ptr: *mut *mut i8, ); + pub fn scratch_cuda_integer_compress_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + compression_glwe_dimension: u32, + compression_polynomial_size: u32, + lwe_dimension: u32, + ks_level: u32, + ks_base_log: u32, + num_lwes: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + lwe_per_glwe: u32, + storage_log_modulus: u32, + allocate_gpu_memory: bool, + ); + pub fn scratch_cuda_integer_decompress_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + encryption_glwe_dimension: u32, + encryption_polynomial_size: u32, + compression_glwe_dimension: u32, + compression_polynomial_size: u32, + lwe_dimension: u32, + pbs_level: u32, + pbs_base_log: u32, + num_lwes: u32, + message_modulus: u32, + carry_modulus: u32, + pbs_type: u32, + storage_log_modulus: u32, + allocate_gpu_memory: bool, + ); + + pub fn cuda_integer_compress_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + glwe_array_out: *mut c_void, + lwe_array_in: *const c_void, + fp_ksk: *const *mut c_void, + num_lwes: u32, + mem_ptr: *mut i8, + ); + + pub fn cuda_integer_decompress_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + lwe_out: *mut c_void, + glwe_array_in: *const c_void, + indexes_array: *const c_void, + indexes_array_size: u32, + bsks: *const *mut c_void, + mem_ptr: *mut i8, + ); + + pub fn cleanup_cuda_integer_compress_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); + pub fn cleanup_cuda_integer_decompress_radix_ciphertext_64( + streams: *const *mut c_void, + gpu_indexes: *const u32, + gpu_count: u32, + mem_ptr: *mut *mut i8, + ); + pub fn cuda_scalar_addition_integer_radix_ciphertext_64_inplace( streams: *const *mut c_void, gpu_indexes: *const u32, diff --git a/tfhe/benches/integer/bench.rs b/tfhe/benches/integer/bench.rs index 6b8b7c1799..c238ad7126 100644 --- a/tfhe/benches/integer/bench.rs +++ b/tfhe/benches/integer/bench.rs @@ -1126,11 +1126,15 @@ define_server_key_bench_default_fn!( #[cfg(feature = "gpu")] mod cuda { use super::*; - use criterion::criterion_group; + use criterion::{black_box, criterion_group}; use tfhe::core_crypto::gpu::CudaStreams; use tfhe::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; - use tfhe::integer::gpu::ciphertext::CudaUnsignedRadixCiphertext; + use tfhe::integer::gpu::ciphertext::compressed_ciphertext_list::CudaCompressedCiphertextListBuilder; + use tfhe::integer::gpu::ciphertext::{CudaRadixCiphertext, CudaUnsignedRadixCiphertext}; + use tfhe::integer::gpu::gen_keys_radix_gpu; use tfhe::integer::gpu::server_key::CudaServerKey; + use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; fn bench_cuda_server_key_unary_function_clean_inputs( c: &mut Criterion, @@ -1400,6 +1404,121 @@ mod cuda { bench_group.finish() } + fn cuda_compress(c: &mut Criterion) { + let bench_name = "integer::cuda::compression"; + let mut bench_group = c.benchmark_group(bench_name); + bench_group + .sample_size(15) + 
.measurement_time(std::time::Duration::from_secs(30)); + + let stream = CudaStreams::new_multi_gpu(); + + let param = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + let comp_param = COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + + let log_message_modulus = param.message_modulus.0.ilog2() as usize; + + for num_bits in [ + 8, + 16, + 32, + 64, + 128, + 256, + comp_param.lwe_per_glwe.0 * log_message_modulus, + ] { + assert_eq!(num_bits % log_message_modulus, 0); + let num_blocks = num_bits / log_message_modulus; + + // Generate private compression key + let (cks, _) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let private_compression_key = cks.new_compression_private_key(comp_param); + + // Generate and convert compression keys + let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); + let (compressed_compression_key, _) = + radix_cks.new_compressed_compression_decompression_keys(&private_compression_key); + let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream); + + // Encrypt + let ct = cks.encrypt_radix(0_u32, num_blocks); + let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); + + // Benchmark + let mut builder = CudaCompressedCiphertextListBuilder::new(); + + builder.push(d_ct, &stream); + + bench_group.bench_function(format!("compress_u{num_bits}"), |b| { + b.iter(|| { + let compressed = builder.build(&cuda_compression_key, &stream); + + _ = black_box(compressed); + }) + }); + } + } + + fn cuda_decompress(c: &mut Criterion) { + let bench_name = "integer::cuda::compression"; + let mut bench_group = c.benchmark_group(bench_name); + bench_group + .sample_size(15) + .measurement_time(std::time::Duration::from_secs(30)); + + let stream = CudaStreams::new_multi_gpu(); + + let param = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + let comp_param = COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + + let log_message_modulus = param.message_modulus.0.ilog2() as usize; + + for num_bits in [ + 8, + 16, + 32, + 64, + 128, + 256, + comp_param.lwe_per_glwe.0 * log_message_modulus, + ] { + assert_eq!(num_bits % log_message_modulus, 0); + let num_blocks = num_bits / log_message_modulus; + + // Generate private compression key + let (cks, _) = KEY_CACHE.get_from_params(param, IntegerKeyKind::Radix); + let private_compression_key = cks.new_compression_private_key(comp_param); + + // Generate and convert compression keys + let (radix_cks, _) = gen_keys_radix_gpu(param, num_blocks, &stream); + let (compressed_compression_key, compressed_decompression_key) = + radix_cks.new_compressed_compression_decompression_keys(&private_compression_key); + let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&stream); + let cuda_decompression_key = + compressed_decompression_key.decompress_to_cuda(radix_cks.parameters(), &stream); + + // Encrypt + let ct = cks.encrypt_radix(0_u32, num_blocks); + let d_ct = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct, &stream); + + // Benchmark + let mut builder = CudaCompressedCiphertextListBuilder::new(); + + builder.push(d_ct, &stream); + + let compressed = builder.build(&cuda_compression_key, &stream); + + bench_group.bench_function(format!("decompress_u{num_bits}"), |b| { + b.iter(|| { + let unpacked: CudaRadixCiphertext = + compressed.get(0, &cuda_decompression_key, &stream); + + _ = black_box(unpacked); + }) + }); + } + } + macro_rules! 
define_cuda_server_key_bench_clean_input_unary_fn ( (method_name: $server_key_method:ident, display_name:$name:ident) => { ::paste::paste!{ @@ -2052,6 +2171,8 @@ mod cuda { cuda_unsigned_overflowing_scalar_add, ); + criterion_group!(cuda_compress_ops, cuda_compress, cuda_decompress); + fn cuda_bench_server_key_cast_function( c: &mut Criterion, bench_name: &str, @@ -2142,8 +2263,8 @@ mod cuda { #[cfg(feature = "gpu")] use cuda::{ - cuda_cast_ops, default_cuda_dedup_ops, default_cuda_ops, default_scalar_cuda_ops, - unchecked_cuda_ops, unchecked_scalar_cuda_ops, + cuda_cast_ops, cuda_compress_ops, default_cuda_dedup_ops, default_cuda_ops, + default_scalar_cuda_ops, unchecked_cuda_ops, unchecked_scalar_cuda_ops, }; criterion_group!( @@ -2495,11 +2616,13 @@ criterion_group!(oprf, oprf::unsigned_oprf); fn go_through_gpu_bench_groups(val: &str) { match val.to_lowercase().as_str() { "default" => { + cuda_compress_ops(); default_cuda_ops(); default_scalar_cuda_ops(); cuda_cast_ops(); } "fast_default" => { + cuda_compress_ops(); default_cuda_dedup_ops(); } "unchecked" => { diff --git a/tfhe/benches/integer/glwe_packing_compression.rs b/tfhe/benches/integer/glwe_packing_compression.rs index ee3ffec935..16673ef1c7 100644 --- a/tfhe/benches/integer/glwe_packing_compression.rs +++ b/tfhe/benches/integer/glwe_packing_compression.rs @@ -4,7 +4,7 @@ use tfhe::integer::{ClientKey, RadixCiphertext}; use tfhe::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; use tfhe::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; -fn glwe_packing(c: &mut Criterion) { +fn cpu_glwe_packing(c: &mut Criterion) { let param = PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; let comp_param = COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; @@ -29,7 +29,6 @@ fn glwe_packing(c: &mut Criterion) { 64, 128, 256, - 256, comp_param.lwe_per_glwe.0 * log_message_modulus, ] { assert_eq!(num_bits % log_message_modulus, 0); @@ -73,9 +72,10 @@ fn glwe_packing(c: &mut Criterion) { } } -criterion_group!(glwe_packing2, glwe_packing); +criterion_group!(cpu_glwe_packing2, cpu_glwe_packing); fn main() { - glwe_packing2(); + cpu_glwe_packing2(); + Criterion::default().configure_from_args().final_summary(); } diff --git a/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs new file mode 100644 index 0000000000..29cc3d68b1 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/lwe_packing_keyswitch.rs @@ -0,0 +1,36 @@ +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; +use crate::core_crypto::gpu::{packing_keyswitch_list_async, CudaStreams}; +use crate::core_crypto::prelude::{CastInto, UnsignedTorus}; + +/// # Safety +/// +/// - `stream` __must__ be synchronized to guarantee computation has finished, and inputs must not +/// be dropped until stream is synchronised +pub unsafe fn cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async( + lwe_pksk: &CudaLwePackingKeyswitchKey, + input_lwe_ciphertext_list: &CudaLweCiphertextList, + output_glwe_ciphertext: &mut CudaGlweCiphertextList, + streams: &CudaStreams, +) where + // CastInto required for PBS modulus switch which returns a usize + Scalar: UnsignedTorus + CastInto, +{ + let input_lwe_dimension = input_lwe_ciphertext_list.lwe_dimension(); + let output_glwe_dimension = 
output_glwe_ciphertext.glwe_dimension(); + let output_polynomial_size = output_glwe_ciphertext.polynomial_size(); + + packing_keyswitch_list_async( + streams, + &mut output_glwe_ciphertext.0.d_vec, + &input_lwe_ciphertext_list.0.d_vec, + input_lwe_dimension, + output_glwe_dimension, + output_polynomial_size, + &lwe_pksk.d_vec, + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + input_lwe_ciphertext_list.lwe_ciphertext_count(), + ); +} diff --git a/tfhe/src/core_crypto/gpu/algorithms/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/mod.rs index ee6ddd2b1d..489bbb1565 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/mod.rs @@ -1,13 +1,15 @@ +pub mod glwe_sample_extraction; +pub mod lwe_keyswitch; pub mod lwe_linear_algebra; pub mod lwe_multi_bit_programmable_bootstrapping; +pub mod lwe_packing_keyswitch; pub mod lwe_programmable_bootstrapping; -pub mod glwe_sample_extraction; -mod lwe_keyswitch; #[cfg(test)] mod test; pub use lwe_keyswitch::*; pub use lwe_linear_algebra::*; pub use lwe_multi_bit_programmable_bootstrapping::*; +pub use lwe_packing_keyswitch::*; pub use lwe_programmable_bootstrapping::*; diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs new file mode 100644 index 0000000000..e95f4ed281 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/algorithms/test/lwe_packing_keyswitch.rs @@ -0,0 +1,234 @@ +use super::*; +use crate::core_crypto::gpu::algorithms::lwe_packing_keyswitch::cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::CudaStreams; +use serde::de::DeserializeOwned; +use serde::Serialize; + +const NB_TESTS: usize = 10; +fn generate_keys( + params: PackingKeySwitchTestParams, + streams: &CudaStreams, + rsc: &mut TestResources, +) -> CudaPackingKeySwitchKeys { + let lwe_sk = allocate_and_generate_new_binary_lwe_secret_key( + params.lwe_dimension, + &mut rsc.secret_random_generator, + ); + + let glwe_sk = allocate_and_generate_new_binary_glwe_secret_key( + params.glwe_dimension, + params.polynomial_size, + &mut rsc.secret_random_generator, + ); + + let pksk = allocate_and_generate_new_lwe_packing_keyswitch_key( + &lwe_sk, + &glwe_sk, + params.pbs_base_log, + params.pbs_level, + params.glwe_noise_distribution, + params.ciphertext_modulus, + &mut rsc.encryption_random_generator, + ); + + assert!(check_encrypted_content_respects_mod( + &pksk, + params.ciphertext_modulus + )); + + let cuda_pksk = CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key(&pksk, streams); + + CudaPackingKeySwitchKeys { + lwe_sk, + glwe_sk, + pksk: cuda_pksk, + } +} + +fn lwe_encrypt_pks_to_glwe_decrypt_custom_mod(params: P) +where + Scalar: UnsignedTorus + CastInto + Serialize + DeserializeOwned, + P: Into>, + PackingKeySwitchTestParams: KeyCacheAccess>, +{ + let params = params.into(); + + let lwe_noise_distribution = params.lwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + + let mut rsc = TestResources::new(); + + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let mut msg = msg_modulus; + let delta: Scalar = encoding_with_padding / msg_modulus; + + let gpu_index = 0; + let 
stream = CudaStreams::new_single_gpu(gpu_index); + + while msg != Scalar::ZERO { + msg = msg.wrapping_sub(Scalar::ONE); + for _ in 0..NB_TESTS { + let keys = generate_keys(params, &stream, &mut rsc); + let (pksk, lwe_sk, glwe_sk) = (keys.pksk, keys.lwe_sk, keys.glwe_sk); + + let plaintext = Plaintext(msg * delta); + + let input_lwe = allocate_and_encrypt_new_lwe_ciphertext( + &lwe_sk, + plaintext, + lwe_noise_distribution, + ciphertext_modulus, + &mut rsc.encryption_random_generator, + ); + + let d_input_lwe = CudaLweCiphertextList::from_lwe_ciphertext(&input_lwe, &stream); + + assert!(check_encrypted_content_respects_mod( + &input_lwe, + ciphertext_modulus + )); + + let mut d_output_glwe = CudaGlweCiphertextList::new( + glwe_sk.glwe_dimension(), + glwe_sk.polynomial_size(), + GlweCiphertextCount(1), + ciphertext_modulus, + &stream, + ); + + unsafe { + cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async( + &pksk, + &d_input_lwe, + &mut d_output_glwe, + &stream, + ); + } + let output_glwe_list = d_output_glwe.to_glwe_ciphertext_list(&stream); + let mut decrypted_plaintext_list = PlaintextList::new( + Scalar::ZERO, + PlaintextCount(output_glwe_list.polynomial_size().0), + ); + + decrypt_glwe_ciphertext_list( + &glwe_sk, + &output_glwe_list, + &mut decrypted_plaintext_list, + ); + let decoded = round_decode(*decrypted_plaintext_list.get(0).0, delta) % msg_modulus; + + assert_eq!(msg, decoded); + } + + // In coverage, we break after one while loop iteration, changing message values does not + // yield higher coverage + #[cfg(tarpaulin)] + break; + } +} + +fn lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod(params: P) +where + Scalar: UnsignedTorus + CastInto + Serialize + DeserializeOwned, + P: Into>, + PackingKeySwitchTestParams: KeyCacheAccess>, +{ + let params = params.into(); + + let lwe_noise_distribution = params.lwe_noise_distribution; + let ciphertext_modulus = params.ciphertext_modulus; + let message_modulus_log = params.message_modulus_log; + let encoding_with_padding = get_encoding_with_padding(ciphertext_modulus); + + let mut rsc = TestResources::new(); + + let msg_modulus = Scalar::ONE.shl(message_modulus_log.0); + let mut msg = msg_modulus; + let delta: Scalar = encoding_with_padding / msg_modulus; + + let gpu_index = 0; + let stream = CudaStreams::new_single_gpu(gpu_index); + + while msg != Scalar::ZERO { + msg = msg.wrapping_sub(Scalar::ONE); + for _ in 0..NB_TESTS { + let keys = generate_keys(params, &stream, &mut rsc); + let (pksk, lwe_sk, glwe_sk) = (keys.pksk, keys.lwe_sk, keys.glwe_sk); + + let mut input_lwe_list = LweCiphertextList::new( + Scalar::ZERO, + lwe_sk.lwe_dimension().to_lwe_size(), + LweCiphertextCount(glwe_sk.polynomial_size().0), + ciphertext_modulus, + ); + + let mut input_plaintext_list = + PlaintextList::new(msg * delta, PlaintextCount(glwe_sk.polynomial_size().0)); + + encrypt_lwe_ciphertext_list( + &lwe_sk, + &mut input_lwe_list, + &input_plaintext_list, + lwe_noise_distribution, + &mut rsc.encryption_random_generator, + ); + + let d_input_lwe_list = + CudaLweCiphertextList::from_lwe_ciphertext_list(&input_lwe_list, &stream); + + assert!(check_encrypted_content_respects_mod( + &input_lwe_list, + ciphertext_modulus + )); + + let mut d_output_glwe = CudaGlweCiphertextList::new( + glwe_sk.glwe_dimension(), + glwe_sk.polynomial_size(), + GlweCiphertextCount(1), + ciphertext_modulus, + &stream, + ); + + unsafe { + cuda_keyswitch_lwe_ciphertext_list_into_glwe_ciphertext_async( + &pksk, + &d_input_lwe_list, + &mut d_output_glwe, + &stream, + ); + } + + 
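+            // Copy the packed GLWE back to the host and decrypt it. The device-to-host
+            // copy below is issued on the same stream as the async packing keyswitch
+            // above, which (we assume, as in the other GPU tests) orders it after the
+            // keyswitch and synchronizes before the data is read on the CPU.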
let output_glwe_list = d_output_glwe.to_glwe_ciphertext_list(&stream); + + let mut decrypted_plaintext_list = PlaintextList::new( + Scalar::ZERO, + PlaintextCount(output_glwe_list.polynomial_size().0), + ); + + decrypt_glwe_ciphertext_list( + &glwe_sk, + &output_glwe_list, + &mut decrypted_plaintext_list, + ); + + decrypted_plaintext_list + .iter_mut() + .for_each(|x| *x.0 = round_decode(*x.0, delta) % msg_modulus); + input_plaintext_list.iter_mut().for_each(|x| *x.0 /= delta); + + assert_eq!(decrypted_plaintext_list, input_plaintext_list); + } + + // In coverage, we break after one while loop iteration, changing message values does not + // yield higher coverage + #[cfg(tarpaulin)] + break; + } +} + +create_gpu_parametrized_test!(lwe_encrypt_pks_to_glwe_decrypt_custom_mod); +create_gpu_parametrized_test!(lwe_list_encrypt_pks_to_glwe_decrypt_custom_mod); diff --git a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs index 59ba20af18..779a419702 100644 --- a/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs +++ b/tfhe/src/core_crypto/gpu/algorithms/test/mod.rs @@ -4,8 +4,15 @@ mod glwe_sample_extraction; mod lwe_keyswitch; mod lwe_linear_algebra; mod lwe_multi_bit_programmable_bootstrapping; +mod lwe_packing_keyswitch; mod lwe_programmable_bootstrapping; +pub struct CudaPackingKeySwitchKeys { + pub lwe_sk: LweSecretKey>, + pub glwe_sk: GlweSecretKey>, + pub pksk: CudaLwePackingKeyswitchKey, +} + // Macro to generate tests for all parameter sets macro_rules! create_gpu_parametrized_test{ ($name:ident { $($param:ident),* }) => { @@ -47,4 +54,5 @@ macro_rules! create_gpu_multi_bit_parametrized_test{ }; } +use crate::core_crypto::gpu::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; use {create_gpu_multi_bit_parametrized_test, create_gpu_parametrized_test}; diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs index 9defa579f6..536ab06a98 100644 --- a/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs +++ b/tfhe/src/core_crypto/gpu/entities/lwe_ciphertext_list.rs @@ -199,51 +199,6 @@ impl CudaLweCiphertextList { LweCiphertext::from_container(container, self.ciphertext_modulus()) } - /// ```rust - /// use tfhe::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; - /// use tfhe::core_crypto::gpu::CudaStreams; - /// use tfhe::core_crypto::prelude::{ - /// CiphertextModulus, LweCiphertextCount, LweCiphertextList, LweSize, - /// }; - /// - /// let mut streams = CudaStreams::new_single_gpu(0); - /// - /// let lwe_size = LweSize(743); - /// let ciphertext_modulus = CiphertextModulus::new_native(); - /// let lwe_ciphertext_count = LweCiphertextCount(2); - /// - /// // Create a new LweCiphertextList - /// let lwe_list = LweCiphertextList::new(0u64, lwe_size, lwe_ciphertext_count, ciphertext_modulus); - /// - /// // Copy to GPU - /// let d_lwe_list = CudaLweCiphertextList::from_lwe_ciphertext_list(&lwe_list, &mut streams); - /// let d_lwe_list_copied = d_lwe_list.duplicate(&mut streams); - /// - /// let lwe_list_copied = d_lwe_list_copied.to_lwe_ciphertext_list(&mut streams); - /// - /// assert_eq!(lwe_list, lwe_list_copied); - /// ``` - pub fn duplicate(&self, streams: &CudaStreams) -> Self { - let lwe_dimension = self.lwe_dimension(); - let lwe_ciphertext_count = self.lwe_ciphertext_count(); - let ciphertext_modulus = self.ciphertext_modulus(); - - // Copy to the GPU - let mut d_vec = CudaVec::new(self.0.d_vec.len(), streams, 0); - unsafe { - 
d_vec.copy_from_gpu_async(&self.0.d_vec, streams, 0); - } - streams.synchronize(); - - let cuda_lwe_list = CudaLweList { - d_vec, - lwe_ciphertext_count, - lwe_dimension, - ciphertext_modulus, - }; - Self(cuda_lwe_list) - } - pub(crate) fn lwe_dimension(&self) -> LweDimension { self.0.lwe_dimension } diff --git a/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs b/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs new file mode 100644 index 0000000000..01d9783727 --- /dev/null +++ b/tfhe/src/core_crypto/gpu/entities/lwe_packing_keyswitch_key.rs @@ -0,0 +1,75 @@ +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::gpu::{convert_lwe_keyswitch_key_async, CudaStreams}; +use crate::core_crypto::prelude::{ + lwe_packing_keyswitch_key_input_key_element_encrypted_size, CiphertextModulus, + DecompositionBaseLog, DecompositionLevelCount, GlweSize, LweDimension, + LwePackingKeyswitchKeyOwned, PolynomialSize, UnsignedInteger, +}; + +#[derive(Debug)] +pub struct CudaLwePackingKeyswitchKey { + pub(crate) d_vec: CudaVec, + decomp_base_log: DecompositionBaseLog, + decomp_level_count: DecompositionLevelCount, + output_glwe_size: GlweSize, + output_polynomial_size: PolynomialSize, + ciphertext_modulus: CiphertextModulus, +} + +impl CudaLwePackingKeyswitchKey { + pub fn from_lwe_packing_keyswitch_key( + h_ksk: &LwePackingKeyswitchKeyOwned, + streams: &CudaStreams, + ) -> Self { + let decomp_base_log = h_ksk.decomposition_base_log(); + let decomp_level_count = h_ksk.decomposition_level_count(); + let output_glwe_size = h_ksk.output_key_glwe_dimension().to_glwe_size(); + let output_polynomial_size = h_ksk.output_polynomial_size(); + let ciphertext_modulus = h_ksk.ciphertext_modulus(); + + // Allocate memory + let mut d_vec = CudaVec::::new_multi_gpu(h_ksk.as_ref().len(), streams); + + unsafe { + convert_lwe_keyswitch_key_async(streams, &mut d_vec, h_ksk.as_ref()); + } + + streams.synchronize(); + + Self { + d_vec, + decomp_base_log, + decomp_level_count, + output_glwe_size, + output_polynomial_size, + ciphertext_modulus, + } + } + + pub(crate) fn decomposition_base_log(&self) -> DecompositionBaseLog { + self.decomp_base_log + } + pub(crate) fn decomposition_level_count(&self) -> DecompositionLevelCount { + self.decomp_level_count + } + + pub(crate) fn output_glwe_size(&self) -> GlweSize { + self.output_glwe_size + } + pub(crate) fn ciphertext_modulus(&self) -> CiphertextModulus { + self.ciphertext_modulus + } + pub(crate) fn output_polynomial_size(&self) -> PolynomialSize { + self.output_polynomial_size + } + pub fn input_key_lwe_dimension(&self) -> LweDimension { + LweDimension( + self.d_vec.len + / lwe_packing_keyswitch_key_input_key_element_encrypted_size( + self.decomp_level_count, + self.output_glwe_size, + self.output_polynomial_size, + ), + ) + } +} diff --git a/tfhe/src/core_crypto/gpu/entities/mod.rs b/tfhe/src/core_crypto/gpu/entities/mod.rs index 34dcb5f03e..623bf42497 100644 --- a/tfhe/src/core_crypto/gpu/entities/mod.rs +++ b/tfhe/src/core_crypto/gpu/entities/mod.rs @@ -3,3 +3,4 @@ pub mod lwe_bootstrap_key; pub mod lwe_ciphertext_list; pub mod lwe_keyswitch_key; pub mod lwe_multi_bit_bootstrap_key; +pub mod lwe_packing_keyswitch_key; diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index 3341ba6e5e..d95cfa2bac 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -13,7 +13,6 @@ pub use algorithms::*; pub use entities::*; use std::ffi::c_void; pub(crate) use tfhe_cuda_backend::cuda_bind::*; - 
#[derive(Debug)] pub struct CudaStreams { pub ptr: Vec<*mut c_void>, @@ -260,6 +259,56 @@ pub unsafe fn convert_lwe_keyswitch_key_async( dest.copy_from_cpu_multi_gpu_async(src, streams); } +/// Applies packing keyswitch on a vector of LWE ciphertexts +/// +/// # Safety +/// +/// [CudaStreams::synchronize] __must__ be called as soon as synchronization is +/// required +#[allow(clippy::too_many_arguments)] +pub unsafe fn packing_keyswitch_list_async( + streams: &CudaStreams, + glwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, + input_lwe_dimension: LweDimension, + output_glwe_dimension: GlweDimension, + output_polynomial_size: PolynomialSize, + packing_keyswitch_key: &CudaVec, + base_log: DecompositionBaseLog, + l_gadget: DecompositionLevelCount, + num_lwes: LweCiphertextCount, +) { + let mut fp_ks_buffer: *mut i8 = std::ptr::null_mut(); + scratch_packing_keyswitch_lwe_list_to_glwe_64( + streams.ptr[0], + streams.gpu_indexes[0], + std::ptr::addr_of_mut!(fp_ks_buffer), + output_glwe_dimension.0 as u32, + output_polynomial_size.0 as u32, + num_lwes.0 as u32, + true, + ); + cuda_packing_keyswitch_lwe_list_to_glwe_64( + streams.ptr[0], + streams.gpu_indexes[0], + glwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + packing_keyswitch_key.as_c_ptr(0), + fp_ks_buffer, + input_lwe_dimension.0 as u32, + output_glwe_dimension.0 as u32, + output_polynomial_size.0 as u32, + base_log.0 as u32, + l_gadget.0 as u32, + num_lwes.0 as u32, + ); + cleanup_packing_keyswitch_lwe_list_to_glwe( + streams.ptr[0], + streams.gpu_indexes[0], + std::ptr::addr_of_mut!(fp_ks_buffer), + ); +} + /// Convert programmable bootstrap key /// /// # Safety diff --git a/tfhe/src/integer/client_key/radix.rs b/tfhe/src/integer/client_key/radix.rs index 36333ddbc2..baa1f0a4f9 100644 --- a/tfhe/src/integer/client_key/radix.rs +++ b/tfhe/src/integer/client_key/radix.rs @@ -6,6 +6,9 @@ use crate::integer::backward_compatibility::client_key::RadixClientKeyVersions; use crate::integer::block_decomposition::{DecomposableInto, RecomposableFrom}; use crate::integer::ciphertext::{RadixCiphertext, SignedRadixCiphertext}; use crate::integer::BooleanBlock; +use crate::shortint::list_compression::{ + CompressedCompressionKey, CompressedDecompressionKey, CompressionPrivateKeys, +}; use crate::shortint::{Ciphertext as ShortintCiphertext, PBSParameters as ShortintParameters}; use serde::{Deserialize, Serialize}; use tfhe_versionable::Versionize; @@ -131,6 +134,15 @@ impl RadixClientKey { pub fn num_blocks(&self) -> usize { self.num_blocks } + + pub fn new_compressed_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + ) -> (CompressedCompressionKey, CompressedDecompressionKey) { + self.key + .key + .new_compressed_compression_decompression_keys(private_compression_key) + } } impl From<(ClientKey, usize)> for RadixClientKey { diff --git a/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs new file mode 100644 index 0000000000..e83157a7ba --- /dev/null +++ b/tfhe/src/integer/gpu/ciphertext/compressed_ciphertext_list.rs @@ -0,0 +1,216 @@ +use crate::core_crypto::gpu::CudaStreams; +use crate::integer::ciphertext::DataKind; +use crate::integer::gpu::ciphertext::boolean_value::CudaBooleanBlock; +use crate::integer::gpu::ciphertext::{ + CudaRadixCiphertext, CudaSignedRadixCiphertext, CudaUnsignedRadixCiphertext, +}; +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, 
CudaPackedGlweCiphertext, +}; + +pub struct CudaCompressedCiphertextList { + pub(crate) packed_list: CudaPackedGlweCiphertext, + info: Vec, +} +impl CudaCompressedCiphertextList { + pub fn len(&self) -> usize { + self.info.len() + } + + pub fn is_empty(&self) -> bool { + self.info.len() == 0 + } + + pub fn get( + &self, + index: usize, + decomp_key: &CudaDecompressionKey, + streams: &CudaStreams, + ) -> CudaRadixCiphertext { + let preceding_infos = self.info.get(..index).unwrap(); + let current_info = self.info.get(index).copied().unwrap(); + + let start_block_index: usize = preceding_infos + .iter() + .copied() + .map(DataKind::num_blocks) + .sum(); + + let end_block_index = start_block_index + current_info.num_blocks() - 1; + + decomp_key.unpack( + &self.packed_list, + start_block_index, + end_block_index, + streams, + ) + } +} + +pub trait CudaCompressible { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind; +} + +impl CudaCompressible for CudaSignedRadixCiphertext { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.ciphertext.duplicate(streams); + let num_blocks = x.d_blocks.lwe_ciphertext_count().0; + + messages.push(x); + DataKind::Signed(num_blocks) + } +} + +impl CudaCompressible for CudaBooleanBlock { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.0.ciphertext.duplicate(streams); + + messages.push(x); + DataKind::Boolean + } +} +impl CudaCompressible for CudaUnsignedRadixCiphertext { + fn compress_into( + self, + messages: &mut Vec, + streams: &CudaStreams, + ) -> DataKind { + let x = self.ciphertext.duplicate(streams); + let num_blocks = x.d_blocks.lwe_ciphertext_count().0; + + messages.push(x); + DataKind::Unsigned(num_blocks) + } +} + +pub struct CudaCompressedCiphertextListBuilder { + pub(crate) ciphertexts: Vec, + pub(crate) info: Vec, +} + +impl CudaCompressedCiphertextListBuilder { + #[allow(clippy::new_without_default)] + pub fn new() -> Self { + Self { + ciphertexts: vec![], + info: vec![], + } + } + + pub fn push(&mut self, data: T, streams: &CudaStreams) -> &mut Self { + let kind = data.compress_into(&mut self.ciphertexts, streams); + + if kind.num_blocks() != 0 { + self.info.push(kind); + } + + self + } + + pub fn build( + &self, + comp_key: &CudaCompressionKey, + streams: &CudaStreams, + ) -> CudaCompressedCiphertextList { + let packed_list = comp_key.compress_ciphertexts_into_list(&self.ciphertexts, streams); + + CudaCompressedCiphertextList { + packed_list, + info: self.info.clone(), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::integer::gpu::gen_keys_radix_gpu; + use crate::integer::ClientKey; + use crate::shortint::parameters::list_compression::COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + use crate::shortint::parameters::PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64; + + #[test] + fn test_gpu_ciphertext_compression() { + let cks = ClientKey::new(PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64); + + let private_compression_key = + cks.new_compression_private_key(COMP_PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64); + + let streams = CudaStreams::new_multi_gpu(); + + let num_blocks = 4; + let (radix_cks, _) = gen_keys_radix_gpu( + PARAM_MESSAGE_2_CARRY_2_KS_PBS_TUNIFORM_2M64, + num_blocks, + &streams, + ); + + let (cuda_compression_key, cuda_decompression_key) = + radix_cks.new_cuda_compression_decompression_keys(&private_compression_key, &streams); + + let ct1 = 
radix_cks.encrypt(3_u32); + let ct2 = radix_cks.encrypt(2_u32); + let ct3 = radix_cks.encrypt_signed(-2); + let ct4 = cks.encrypt_bool(true); + + // Copy to GPU + let d_ct1 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct1, &streams); + let d_ct2 = CudaUnsignedRadixCiphertext::from_radix_ciphertext(&ct2, &streams); + let d_ct3 = CudaSignedRadixCiphertext::from_signed_radix_ciphertext(&ct3, &streams); + let d_ct4 = CudaBooleanBlock::from_boolean_block(&ct4, &streams); + + let cuda_compressed = CudaCompressedCiphertextListBuilder::new() + .push(d_ct1, &streams) + .push(d_ct2, &streams) + .push(d_ct3, &streams) + .push(d_ct4, &streams) + .build(&cuda_compression_key, &streams); + + let d_decompressed1 = CudaUnsignedRadixCiphertext { + ciphertext: cuda_compressed.get(0, &cuda_decompression_key, &streams), + }; + + let decompressed1 = d_decompressed1.to_radix_ciphertext(&streams); + let decrypted: u32 = radix_cks.decrypt(&decompressed1); + + assert_eq!(decrypted, 3_u32); + let d_decompressed2 = CudaUnsignedRadixCiphertext { + ciphertext: cuda_compressed.get(1, &cuda_decompression_key, &streams), + }; + + let decompressed2 = d_decompressed2.to_radix_ciphertext(&streams); + let decrypted: u32 = radix_cks.decrypt(&decompressed2); + + assert_eq!(decrypted, 2_u32); + let d_decompressed3 = CudaSignedRadixCiphertext { + ciphertext: cuda_compressed.get(2, &cuda_decompression_key, &streams), + }; + + let decompressed3 = d_decompressed3.to_signed_radix_ciphertext(&streams); + let decrypted: i32 = radix_cks.decrypt_signed(&decompressed3); + + assert_eq!(decrypted, -2); + let d_decompressed4 = CudaBooleanBlock::from_cuda_radix_ciphertext(cuda_compressed.get( + 3, + &cuda_decompression_key, + &streams, + )); + + let decompressed4 = d_decompressed4.to_boolean_block(&streams); + let decrypted = radix_cks.decrypt_bool(&decompressed4); + + assert!(decrypted); + } +} diff --git a/tfhe/src/integer/gpu/ciphertext/mod.rs b/tfhe/src/integer/gpu/ciphertext/mod.rs index 7f0408eddb..51e702d0c8 100644 --- a/tfhe/src/integer/gpu/ciphertext/mod.rs +++ b/tfhe/src/integer/gpu/ciphertext/mod.rs @@ -1,4 +1,5 @@ pub mod boolean_value; +pub mod compressed_ciphertext_list; pub mod info; use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; diff --git a/tfhe/src/integer/gpu/client_key/mod.rs b/tfhe/src/integer/gpu/client_key/mod.rs new file mode 100644 index 0000000000..e7244e3719 --- /dev/null +++ b/tfhe/src/integer/gpu/client_key/mod.rs @@ -0,0 +1 @@ +pub mod radix; diff --git a/tfhe/src/integer/gpu/client_key/radix.rs b/tfhe/src/integer/gpu/client_key/radix.rs new file mode 100644 index 0000000000..96a5ef6310 --- /dev/null +++ b/tfhe/src/integer/gpu/client_key/radix.rs @@ -0,0 +1,95 @@ +use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey; +use crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::{ + allocate_and_generate_new_lwe_packing_keyswitch_key, par_generate_lwe_bootstrap_key, + LweBootstrapKey, +}; +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; +use crate::integer::gpu::server_key::CudaBootstrappingKey; +use crate::integer::RadixClientKey; +use crate::shortint::engine::ShortintEngine; +use crate::shortint::list_compression::{CompressionKey, CompressionPrivateKeys}; +use crate::shortint::{ClassicPBSParameters, EncryptionKeyChoice, PBSParameters}; + +impl RadixClientKey { + pub fn new_cuda_compression_decompression_keys( + &self, + private_compression_key: &CompressionPrivateKeys, + streams: 
&CudaStreams, + ) -> (CudaCompressionKey, CudaDecompressionKey) { + let cks_params: ClassicPBSParameters = match self.parameters() { + PBSParameters::PBS(a) => a, + PBSParameters::MultiBitPBS(_) => { + panic!("Compression is currently not compatible with Multi Bit PBS") + } + }; + let params = &private_compression_key.params; + + assert_eq!( + cks_params.encryption_key_choice, + EncryptionKeyChoice::Big, + "Compression is only compatible with ciphertext in post PBS dimension" + ); + + // Compression key + let packing_key_switching_key = ShortintEngine::with_thread_local_mut(|engine| { + allocate_and_generate_new_lwe_packing_keyswitch_key( + &self.as_ref().key.large_lwe_secret_key(), + &private_compression_key.post_packing_ks_key, + params.packing_ks_base_log, + params.packing_ks_level, + params.packing_ks_key_noise_distribution, + self.parameters().ciphertext_modulus(), + &mut engine.encryption_generator, + ) + }); + + let glwe_compression_key = CompressionKey { + packing_key_switching_key, + lwe_per_glwe: params.lwe_per_glwe, + storage_log_modulus: private_compression_key.params.storage_log_modulus, + }; + + let cuda_compression_key = + CudaCompressionKey::from_compression_key(&glwe_compression_key, streams); + + // Decompression key + let mut bsk = LweBootstrapKey::new( + 0u64, + self.parameters().glwe_dimension().to_glwe_size(), + self.parameters().polynomial_size(), + private_compression_key.params.br_base_log, + private_compression_key.params.br_level, + params + .packing_ks_glwe_dimension + .to_equivalent_lwe_dimension(params.packing_ks_polynomial_size), + self.parameters().ciphertext_modulus(), + ); + + ShortintEngine::with_thread_local_mut(|engine| { + par_generate_lwe_bootstrap_key( + &private_compression_key + .post_packing_ks_key + .as_lwe_secret_key(), + &self.as_ref().key.glwe_secret_key, + &mut bsk, + self.parameters().glwe_noise_distribution(), + &mut engine.encryption_generator, + ); + }); + + let blind_rotate_key = CudaBootstrappingKey::Classic( + CudaLweBootstrapKey::from_lwe_bootstrap_key(&bsk, streams), + ); + + let cuda_decompression_key = CudaDecompressionKey { + blind_rotate_key, + lwe_per_glwe: params.lwe_per_glwe, + parameters: self.parameters(), + }; + + (cuda_compression_key, cuda_decompression_key) + } +} diff --git a/tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs b/tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs new file mode 100644 index 0000000000..0a89c6efba --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/compressed_server_keys.rs @@ -0,0 +1,51 @@ +use crate::core_crypto::gpu::lwe_bootstrap_key::CudaLweBootstrapKey; +use crate::core_crypto::gpu::CudaStreams; +use crate::integer::gpu::list_compression::server_keys::{ + CudaCompressionKey, CudaDecompressionKey, +}; +use crate::integer::gpu::server_key::CudaBootstrappingKey; +use crate::shortint::list_compression::{ + CompressedCompressionKey, CompressedDecompressionKey, CompressionKey, +}; +use crate::shortint::PBSParameters; + +impl CompressedDecompressionKey { + pub fn decompress_to_cuda( + &self, + parameters: PBSParameters, + streams: &CudaStreams, + ) -> CudaDecompressionKey { + let h_bootstrap_key = self + .blind_rotate_key + .as_view() + .par_decompress_into_lwe_bootstrap_key(); + + let d_bootstrap_key = + CudaLweBootstrapKey::from_lwe_bootstrap_key(&h_bootstrap_key, streams); + + let blind_rotate_key = CudaBootstrappingKey::Classic(d_bootstrap_key); + + CudaDecompressionKey { + blind_rotate_key, + lwe_per_glwe: self.lwe_per_glwe, + parameters, + } + } +} + 
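+// A minimal usage sketch for the two conversions in this file (mirroring the GPU
+// compression benchmarks; `radix_cks`, `compressed_compression_key`,
+// `compressed_decompression_key` and `streams` are assumed to come from the usual
+// key generation shown there):
+//
+//     let cuda_compression_key = compressed_compression_key.decompress_to_cuda(&streams);
+//     let cuda_decompression_key =
+//         compressed_decompression_key.decompress_to_cuda(radix_cks.parameters(), &streams);
+//
+// Both calls decompress the keys on the CPU and upload the result to the GPU, so they
+// are intended to be done once and the resulting keys reused across compress/decompress
+// calls on the same streams.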
+impl CompressedCompressionKey { + pub fn decompress_to_cuda(&self, streams: &CudaStreams) -> CudaCompressionKey { + let packing_key_switching_key = self + .packing_key_switching_key + .as_view() + .decompress_into_lwe_packing_keyswitch_key(); + + let glwe_compression_key = CompressionKey { + packing_key_switching_key, + lwe_per_glwe: self.lwe_per_glwe, + storage_log_modulus: self.storage_log_modulus, + }; + + CudaCompressionKey::from_compression_key(&glwe_compression_key, streams) + } +} diff --git a/tfhe/src/integer/gpu/list_compression/mod.rs b/tfhe/src/integer/gpu/list_compression/mod.rs new file mode 100644 index 0000000000..8cb3b2e38c --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/mod.rs @@ -0,0 +1,2 @@ +pub mod compressed_server_keys; +pub mod server_keys; diff --git a/tfhe/src/integer/gpu/list_compression/server_keys.rs b/tfhe/src/integer/gpu/list_compression/server_keys.rs new file mode 100644 index 0000000000..6b34256446 --- /dev/null +++ b/tfhe/src/integer/gpu/list_compression/server_keys.rs @@ -0,0 +1,251 @@ +use crate::core_crypto::gpu::entities::lwe_packing_keyswitch_key::CudaLwePackingKeyswitchKey; +use crate::core_crypto::gpu::glwe_ciphertext_list::CudaGlweCiphertextList; +use crate::core_crypto::gpu::lwe_ciphertext_list::CudaLweCiphertextList; +use crate::core_crypto::gpu::vec::CudaVec; +use crate::core_crypto::gpu::CudaStreams; +use crate::core_crypto::prelude::{CiphertextModulusLog, GlweCiphertextCount, LweCiphertextCount}; +use crate::integer::gpu::ciphertext::info::{CudaBlockInfo, CudaRadixCiphertextInfo}; +use crate::integer::gpu::ciphertext::CudaRadixCiphertext; +use crate::integer::gpu::server_key::CudaBootstrappingKey; +use crate::integer::gpu::{ + compress_integer_radix_async, cuda_memcpy_async_gpu_to_gpu, decompress_integer_radix_async, +}; +use crate::shortint::list_compression::CompressionKey; +use crate::shortint::PBSParameters; +use itertools::Itertools; + +#[derive(Debug)] +pub struct CudaCompressionKey { + pub packing_key_switching_key: CudaLwePackingKeyswitchKey, + pub lwe_per_glwe: LweCiphertextCount, + pub storage_log_modulus: CiphertextModulusLog, +} + +pub struct CudaDecompressionKey { + pub blind_rotate_key: CudaBootstrappingKey, + pub lwe_per_glwe: LweCiphertextCount, + pub parameters: PBSParameters, +} + +pub struct CudaPackedGlweCiphertext { + pub glwe_ciphertext_list: CudaGlweCiphertextList, + pub block_info: Vec, + pub storage_log_modulus: CiphertextModulusLog, +} + +impl CudaCompressionKey { + pub fn from_compression_key(compression_key: &CompressionKey, streams: &CudaStreams) -> Self { + Self { + packing_key_switching_key: CudaLwePackingKeyswitchKey::from_lwe_packing_keyswitch_key( + &compression_key.packing_key_switching_key, + streams, + ), + lwe_per_glwe: compression_key.lwe_per_glwe, + storage_log_modulus: compression_key.storage_log_modulus, + } + } + + unsafe fn flatten_async( + vec_ciphertexts: &[CudaRadixCiphertext], + streams: &CudaStreams, + ) -> (CudaLweCiphertextList, Vec) { + let first_ct = &vec_ciphertexts.first().unwrap().d_blocks; + + // We assume all ciphertexts will have the same lwe dimension + let lwe_dimension = first_ct.lwe_dimension(); + let ciphertext_modulus = first_ct.ciphertext_modulus(); + + // Compute total number of lwe ciphertexts we will be handling + let total_num_blocks: usize = vec_ciphertexts + .iter() + .map(|x| x.d_blocks.lwe_ciphertext_count().0) + .sum(); + + let lwe_ciphertext_count = LweCiphertextCount(total_num_blocks); + + let gpu_index = streams.gpu_indexes[0]; + let d_vec = unsafe { + let 
mut d_vec = CudaVec::new_async( + lwe_dimension.to_lwe_size().0 * lwe_ciphertext_count.0, + streams, + gpu_index, + ); + let mut offset: usize = 0; + for ciphertext in vec_ciphertexts { + let dest_ptr = d_vec + .as_mut_c_ptr(gpu_index) + .add(offset * std::mem::size_of::()); + let size = ciphertext.d_blocks.0.d_vec.len * std::mem::size_of::(); + cuda_memcpy_async_gpu_to_gpu( + dest_ptr, + ciphertext.d_blocks.0.d_vec.as_c_ptr(gpu_index), + size as u64, + streams.ptr[gpu_index as usize], + streams.gpu_indexes[gpu_index as usize], + ); + + offset += ciphertext.d_blocks.0.d_vec.len; + } + + streams.synchronize(); + d_vec + }; + + let flattened_ciphertexts = + CudaLweCiphertextList::from_cuda_vec(d_vec, lwe_ciphertext_count, ciphertext_modulus); + + let info = vec_ciphertexts + .iter() + .flat_map(|x| x.info.blocks.clone()) + .collect_vec(); + + (flattened_ciphertexts, info) + } + + pub fn compress_ciphertexts_into_list( + &self, + ciphertexts: &[CudaRadixCiphertext], + streams: &CudaStreams, + ) -> CudaPackedGlweCiphertext { + let lwe_pksk = &self.packing_key_switching_key; + + let ciphertext_modulus = lwe_pksk.ciphertext_modulus(); + let compress_polynomial_size = lwe_pksk.output_polynomial_size(); + let compress_glwe_size = lwe_pksk.output_glwe_size(); + + let first_ct = ciphertexts.first().unwrap(); + let first_ct_info = first_ct.info.blocks.first().unwrap(); + let message_modulus = first_ct_info.message_modulus; + let carry_modulus = first_ct_info.carry_modulus; + + let lwe_dimension = first_ct.d_blocks.lwe_dimension(); + + let num_lwes: usize = ciphertexts + .iter() + .map(|x| x.d_blocks.lwe_ciphertext_count().0) + .sum(); + + let mut output_glwe = CudaGlweCiphertextList::new( + compress_glwe_size.to_glwe_dimension(), + compress_polynomial_size, + GlweCiphertextCount(ciphertexts.len()), + ciphertext_modulus, + streams, + ); + + let info = unsafe { + let (input_lwes, info) = Self::flatten_async(ciphertexts, streams); + + compress_integer_radix_async( + streams, + &mut output_glwe.0.d_vec, + &input_lwes.0.d_vec, + &self.packing_key_switching_key.d_vec, + message_modulus, + carry_modulus, + compress_glwe_size.to_glwe_dimension(), + compress_polynomial_size, + lwe_dimension, + lwe_pksk.decomposition_base_log(), + lwe_pksk.decomposition_level_count(), + self.lwe_per_glwe.0 as u32, + self.storage_log_modulus.0 as u32, + num_lwes as u32, + ); + + streams.synchronize(); + + info + }; + + CudaPackedGlweCiphertext { + glwe_ciphertext_list: output_glwe, + block_info: info, + storage_log_modulus: self.storage_log_modulus, + } + } +} + +impl CudaDecompressionKey { + pub fn unpack( + &self, + packed_list: &CudaPackedGlweCiphertext, + start_block_index: usize, + end_block_index: usize, + streams: &CudaStreams, + ) -> CudaRadixCiphertext { + let indexes_array = (start_block_index..=end_block_index) + .map(|x| x as u32) + .collect_vec(); + + let encryption_glwe_dimension = self.parameters.glwe_dimension(); + let encryption_polynomial_size = self.parameters.polynomial_size(); + let glwe_ciphertext_list = &packed_list.glwe_ciphertext_list; + let compression_glwe_dimension = glwe_ciphertext_list.glwe_dimension(); + let compression_polynomial_size = glwe_ciphertext_list.polynomial_size(); + let lwe_ciphertext_count = LweCiphertextCount(indexes_array.len()); + let message_modulus = self.parameters.message_modulus(); + let carry_modulus = self.parameters.carry_modulus(); + let ciphertext_modulus = self.parameters.ciphertext_modulus(); + let storage_log_modulus = packed_list.storage_log_modulus; + + match 
&self.blind_rotate_key { + CudaBootstrappingKey::Classic(bsk) => { + let lwe_dimension = bsk.output_lwe_dimension(); + + let mut output_lwe = CudaLweCiphertextList::new( + lwe_dimension, + lwe_ciphertext_count, + ciphertext_modulus, + streams, + ); + + let gpu_index = streams.gpu_indexes[0]; + unsafe { + let d_indexes_array = + CudaVec::from_cpu_async(indexes_array.as_slice(), streams, gpu_index); + + decompress_integer_radix_async( + streams, + &mut output_lwe.0.d_vec, + &glwe_ciphertext_list.0.d_vec, + &bsk.d_vec, + message_modulus, + carry_modulus, + encryption_glwe_dimension, + encryption_polynomial_size, + compression_glwe_dimension, + compression_polynomial_size, + lwe_dimension, + bsk.decomp_base_log(), + bsk.decomp_level_count(), + storage_log_modulus.0 as u32, + &d_indexes_array, + lwe_ciphertext_count.0 as u32, + ); + } + + streams.synchronize(); + + let blocks = packed_list.block_info[start_block_index..=end_block_index].to_vec(); + + assert_eq!( + blocks.len(), + output_lwe.lwe_ciphertext_count().0, + "Mismatch between \ + the number of output LWEs ({:?}) and number of info blocks ({:?})", + output_lwe.lwe_ciphertext_count().0, + blocks.len(), + ); + + CudaRadixCiphertext { + d_blocks: output_lwe, + info: CudaRadixCiphertextInfo { blocks }, + } + } + CudaBootstrappingKey::MultiBit(_) => { + panic! {"Compression is currently not compatible with Multi Bit PBS"} + } + } + } +} diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs index e4545da735..7aa2a705b1 100644 --- a/tfhe/src/integer/gpu/mod.rs +++ b/tfhe/src/integer/gpu/mod.rs @@ -1,4 +1,6 @@ pub mod ciphertext; +pub mod client_key; +pub mod list_compression; pub mod server_key; use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut}; @@ -12,6 +14,7 @@ use crate::integer::{ClientKey, RadixClientKey}; use crate::shortint::{CarryModulus, MessageModulus}; pub use server_key::CudaServerKey; use std::cmp::min; + use tfhe_cuda_backend::cuda_bind::*; #[repr(u32)] @@ -273,6 +276,160 @@ pub unsafe fn unchecked_scalar_mul_integer_radix_kb_async( + streams: &CudaStreams, + glwe_array_out: &mut CudaVec, + lwe_array_in: &CudaVec, + fp_keyswitch_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + compression_glwe_dimension: GlweDimension, + compression_polynomial_size: PolynomialSize, + lwe_dimension: LweDimension, + ks_base_log: DecompositionBaseLog, + ks_level: DecompositionLevelCount, + lwe_per_glwe: u32, + storage_log_modulus: u32, + num_blocks: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + glwe_array_out.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + lwe_array_in.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + fp_keyswitch_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." 
+ ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_integer_compress_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + compression_glwe_dimension.0 as u32, + compression_polynomial_size.0 as u32, + lwe_dimension.0 as u32, + ks_level.0 as u32, + ks_base_log.0 as u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + PBSType::Classical as u32, + lwe_per_glwe, + storage_log_modulus, + true, + ); + + cuda_integer_compress_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + glwe_array_out.as_mut_c_ptr(0), + lwe_array_in.as_c_ptr(0), + fp_keyswitch_key.ptr.as_ptr(), + num_blocks, + mem_ptr, + ); + + cleanup_cuda_integer_compress_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + +#[allow(clippy::too_many_arguments)] +/// # Safety +/// +/// - [CudaStreams::synchronize] __must__ be called after this function as soon as synchronization +/// is required +pub unsafe fn decompress_integer_radix_async( + streams: &CudaStreams, + lwe_array_out: &mut CudaVec, + glwe_in: &CudaVec, + bootstrapping_key: &CudaVec, + message_modulus: MessageModulus, + carry_modulus: CarryModulus, + encryption_glwe_dimension: GlweDimension, + encryption_polynomial_size: PolynomialSize, + compression_glwe_dimension: GlweDimension, + compression_polynomial_size: PolynomialSize, + lwe_dimension: LweDimension, + pbs_base_log: DecompositionBaseLog, + pbs_level: DecompositionLevelCount, + storage_log_modulus: u32, + vec_indexes: &CudaVec, + num_blocks: u32, +) { + assert_eq!( + streams.gpu_indexes[0], + lwe_array_out.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + glwe_in.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + assert_eq!( + streams.gpu_indexes[0], + bootstrapping_key.gpu_index(0), + "GPU error: all data should reside on the same GPU." + ); + let mut mem_ptr: *mut i8 = std::ptr::null_mut(); + scratch_cuda_integer_decompress_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + encryption_glwe_dimension.0 as u32, + encryption_polynomial_size.0 as u32, + compression_glwe_dimension.0 as u32, + compression_polynomial_size.0 as u32, + lwe_dimension.0 as u32, + pbs_level.0 as u32, + pbs_base_log.0 as u32, + num_blocks, + message_modulus.0 as u32, + carry_modulus.0 as u32, + PBSType::Classical as u32, + storage_log_modulus, + true, + ); + + cuda_integer_decompress_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + lwe_array_out.as_mut_c_ptr(0), + glwe_in.as_c_ptr(0), + vec_indexes.as_c_ptr(0), + vec_indexes.len as u32, + bootstrapping_key.ptr.as_ptr(), + mem_ptr, + ); + + cleanup_cuda_integer_decompress_radix_ciphertext_64( + streams.ptr.as_ptr(), + streams.gpu_indexes.as_ptr(), + streams.len() as u32, + std::ptr::addr_of_mut!(mem_ptr), + ); +} + #[allow(clippy::too_many_arguments)] /// # Safety ///